Example #1
def word_cloud(words):
    all_words = []
    for line in list(words):
        # Avoid shadowing the 'words' parameter inside the loop
        line_words = line.split()
        for word in line_words:
            all_words.append(word.lower())
    # for line in words:
    # all_words.extend(line)
    # creates a word frequency dictionary
    word_freq = nltk.Counter(all_words)
    # draw a Word Cloud with word frequencies
    wordcloud = WordCloud(
        width=900,
        height=500,
        max_words=500,
        max_font_size=100,
        relative_scaling=0.5,
        colormap='Blues',
        normalize_plurals=True).generate_from_frequencies(word_freq)
    plt.figure(figsize=(17, 14))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.savefig('graphs/frequent_word_cloud.png')
    plt.show()
    plt.close('all')
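A minimal usage sketch for this example, assuming the imports below, an existing graphs/ output directory, and a hypothetical corpus.txt file:

import nltk
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Hypothetical call: any iterable of text lines works, e.g. an open file handle.
with open('corpus.txt', encoding='utf-8') as fh:
    word_cloud(fh)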
Example #2
    def __count_parts_of_speech(self, docs):
        # https://medium.freecodecamp.org/an-introduction-to-part-of-speech-tagging-and-the-hidden-markov-model-953d45338f24
        tag_types = ['NN', 'VB', 'JJ', 'RB']

        pos_counts = []
        for text in docs:
            tokens = nltk.word_tokenize(str(text).lower())
            text = nltk.Text(tokens)
            tagged = nltk.pos_tag(text)

            counts = nltk.Counter(tag for word, tag in tagged)
            total = sum(counts.values())
            # Convert raw tag counts to relative frequencies for this document
            tag_freqs = dict(
                (tag, float(count) / total) for tag, count in counts.items())

            tag_counts = []
            for x in tag_types:
                # Append the tag's relative frequency, or 0.0 if the tag is absent
                tag_counts.append(tag_freqs.get(x, 0.0))
            pos_counts.append(tag_counts)

        pos_counts_transpose = list(map(list, zip(*pos_counts)))

        self.__num_pos_count = len(pos_counts_transpose)
        return pos_counts_transpose
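Note that nltk.pos_tag returns fine-grained Penn Treebank tags (e.g. NNS, VBD, JJR), so exact matches against 'NN', 'VB', 'JJ', 'RB' only catch the base forms. A hedged variant that aggregates by tag prefix instead (an assumption about the intended behaviour, not taken from the original):

import nltk

def pos_prefix_ratios(text, prefixes=('NN', 'VB', 'JJ', 'RB')):
    # Tag the text, then sum the relative frequencies of all tags sharing each prefix.
    tokens = nltk.word_tokenize(str(text).lower())
    tagged = nltk.pos_tag(tokens)
    counts = nltk.Counter(tag for _, tag in tagged)
    total = sum(counts.values()) or 1  # guard against empty input
    return [sum(c for t, c in counts.items() if t.startswith(p)) / total
            for p in prefixes]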
Example #3
def get_TTR(post):
    post = post.lower()
    post = re.sub(r'\W', ' ', post)
    tokens = word_tokenize(post)
    types = nltk.Counter(tokens)
    ttr = len(types) / len(tokens)
    return ttr
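A minimal sketch of the imports and a call this type-token ratio helper assumes (the sample sentence is hypothetical); note it raises ZeroDivisionError on an empty post:

import re
import nltk
from nltk.tokenize import word_tokenize

# TTR = number of distinct token types / total number of tokens
print(get_TTR("The cat sat on the mat, and the cat slept."))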
Example #4
def stem_tokens(tokens):
    # Despite its name, this lemmatizes with WordNet rather than stemming;
    # create the lemmatizer once instead of once per token.
    lemmatizer = nltk.WordNetLemmatizer()
    stemmed = []
    for item in tokens:
        # stemmed.append(nltk.PorterStemmer().stem(item))
        stemmed.append(lemmatizer.lemmatize(item))
    count = nltk.Counter(stemmed)
    # print(count.most_common(10))
    return stemmed
Example #5
def TTR(chunklist):
    score = 0
    for i in range(len(chunklist)):
        tokens = nltk.word_tokenize(chunklist[i])
        types = nltk.Counter(tokens)
        score = score + (len(types) / len(tokens)) * 100
    score = score / len(chunklist)
    return score
Example #6
def ttr_calc(paragraph):
    A = preprocess_ttr(paragraph)
    types = nltk.Counter(A)
    try:
        TTR = (len(types) / len(A)) * 100
    except ZeroDivisionError:
        TTR = 0
    # TTR = (len(types)/len(A))*100
    return TTR
Example #7
def feature_extractor(words):
    features = {
        'highest': 0,
        'scored': 0,
        'runs': 0,
        'scorer': 0,
        'score': 0,
        'match': 0,
        'wickets': 0,
        'boundary': 0,
        'fours': 0,
        '4s': 0,
        'six': 0,
        'sixes': 0,
        '6s': 0,
        '6': 0,
        'hit': 0,
        'four': 0,
        'aggregate': 0,
        'total': 0,
        'team': 0,
        'lead': 0,
        'leading': 0,
        'maximum': 0,
        'max': 0,
        'minimum': 0,
        'min': 0,
        'least': 0,
        'less': 0,
        '1st': 0,
        '2nd': 0,
        '3rd': 0,
        '4th': 0,
        '5th': 0,
        '6th': 0,
        '7th': 0,
        '8th': 0,
        '9th': 0,
        '10th': 0,
        'dot': 0,
        'dots': 0,
        'faced': 0,
        'entire': 0,
        'whole': 0,
        'season': 0,
        'strike': 0,
        'rate': 0,
        'strikerate': 0
    }
    #tokenized_sentence = nltk.word_tokenize(sentence)
    word_counts = nltk.Counter(words)
    for word in word_counts:
        if word in features:
            features[word] = word_counts[word]
    return features
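A hedged usage sketch with a hypothetical sentence; the extractor simply copies the Counter values for the keywords it knows about and leaves everything else at 0:

import nltk

tokens = nltk.word_tokenize("who scored the most runs and hit the most sixes this season")
features = feature_extractor(tokens)
print({k: v for k, v in features.items() if v})  # only the non-zero features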
Example #8
    def run(self):
        with open(self.xml, encoding="utf-8") as fd:
            tree = xmltodict.parse(fd.read(), xml_attribs=False, force_list=True)
            document = getFullText(tree)
            doc_id = search(tree, "id")[0]
            text = clean_text(document)
            words = text_to_word_sequence(text)
            filtered_doc = [
                w.lower() for w in words
                if w not in self.stop_words and w.isalpha() and len(w) > 1
            ]

            self.corpus[doc_id] = dict(nltk.Counter(filtered_doc))
            self.corpusWcount[doc_id] = filtered_doc
Example #9
def count_noun(file_name):
    # Read the whole file and count the part-of-speech tags that start with 'NN'
    with open(file_name, 'r', encoding='utf-8') as txt_file:
        long_line = txt_file.read()
    long_token = nltk.tokenize.word_tokenize(long_line)
    #print(long_token)
    tags = nltk.pos_tag(long_token)
    count = nltk.Counter([j for i, j in tags if j.startswith('NN')])
    #total_count = sum(j for i, j in tags if j.startswith('NN'))
    print(count)
Example #10
def get_TTR(data):
    # Remove all special characters using regex
    data = re.sub(r'[^\w]', ' ', data)
    # Convert data to lowercase
    data = data.lower()
    # Tokenize the data to get word list
    tokens = nlp.word_tokenize(data)
    # Count all token and store in dictionary
    types = nlp.Counter(tokens)

    # Return Type-Token Ratio
    return (len(types) / len(tokens)) * 100
Example #11
def lemmatisation(word):
    w_syn = wordnet.synsets(word)

    position = nltk.Counter()
    position["n"] = len([item for item in w_syn if item.pos() == "n"])
    position["v"] = len([item for item in w_syn if item.pos() == "v"])
    position["a"] = len([item for item in w_syn if item.pos() == "a"])
    position["r"] = len([item for item in w_syn if item.pos() == "r"])

    result = position.most_common(3)

    return result[0][0]
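The returned letter matches WordNet's POS constants ('n', 'v', 'a', 'r'), so one plausible use, sketched here as an assumption rather than taken from the original, is to pass it straight to WordNetLemmatizer:

import nltk
from nltk.corpus import wordnet

lemmatizer = nltk.WordNetLemmatizer()
word = "running"
pos = lemmatisation(word)  # dominant POS among the word's synsets
print(pos, lemmatizer.lemmatize(word, pos=pos))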
Example #12
def analyze_text(text):
    adj_list = []
    sentences = nltk.sent_tokenize(text)

    for sentence in sentences:
        for word, pos in nltk.pos_tag(nltk.word_tokenize(str(sentence))):
            if pos == 'JJ':
                adj_list.append(word)

    count = nltk.Counter(adj_list)

    return adj_list, count
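A short usage sketch with a hypothetical input text:

import nltk

adjectives, counts = analyze_text("The quick brown fox is clever. The lazy dog is slow.")
print(counts.most_common(3))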
Example #13
def etape2_get_freq_word_and_stop_word(data, nb_mots_max=100):
    '''Function that returns a counter of the most frequent words
    INPUT:
    ------
        - data
    OUTPUT:
    -------
        ###- counter : holds the most frequently cited words
        - freq_words
        - stopwords (English + specific ones)
    PREREQUISITE:
    ----------
        - data must have the 'Counter_WORD1' column created in step 1
    FUNCTION CALL:
    ---------------
        freq_words, sw = etape2(df2, nb_mots_max=10)
    '''
    freq_totale = nltk.Counter()

    import datetime
    debut = datetime.datetime.now()
    print('Etape2 : DEB {}'.format(debut))

    data_text = data['Counter_WORD1']

    for m_id, token in data_text.items():
        freq_totale += nltk.FreqDist(token)

    most_freq = list(zip(*freq_totale.most_common(nb_mots_max)))[0]
    # #####return most_freq

    # Build our final set of stopwords, combining the 100 most frequent
    # words of the corpus with the default stopword list provided
    # by the NLTK library
    # ##sw.update(tuple(nltk.corpus.stopwords.words('french')))

    # ## Now we can build our STOPWORD set sw
    from nltk.corpus import stopwords
    nltk.data.path.append(
            r"/Users/seb/Workspace/Dev/Formation-OC/LIBRAIRIES/nltk_data")

    sw = set()
    # ######sw.update(tuple(nltk.corpus.stopwords.words('english')))
    # sw1.update(stopwords.words('english'))
    sw.update(stopwords.words('english'))
    sw.update(most_freq)
    fin = datetime.datetime.now()
    print('Etape2 : FIN : {}'.format(fin-debut))

    return freq_totale, sw
    # return most_freq, sw
Example #14
def count_ne(tagging):
    """
    Count the number of occurrences of each named entity in a given tagging.
    :param tagging: Given tagging of sentence.
    :return: Dictionary structure of counts.
    """
    if tagging:
        ne_counts = nltk.Counter(tagging)
        # Convert the Counter into a plain dict of counts
        return dict(ne_counts)
    else:
        return None
Example #15
def get_tokens():
    corpus = ""
    with open("out/nyt_articles.json") as data_file:
        data = json.load(data_file)
    for article in data:
        corpus += article['text']
    lowers = corpus.lower()
    no_punctuation = lowers.translate(str.maketrans("", "",
                                                    string.punctuation))
    toker = nltk.RegexpTokenizer(r'\w+')
    tokens = toker.tokenize(no_punctuation)
    count = nltk.Counter(tokens)
    print(count.most_common(10))
    return tokens
Example #16
def index_pages():
    sites = [
        'e-prostor.gov.si', 'e-uprava.gov.si', 'evem.gov.si', 'podatki.gov.si'
    ]
    data = []
    allTokens = []

    for site in sites:
        root = '../input-indexing/' + site + "/"
        (_, _, filenames) = next(walk(root))
        html_files = []
        for file in filenames:
            if file.endswith(".html"):
                html_files.append(file)

        for file in html_files:
            print("file = " + file)

            text = get_text(get_html_content(site, file))

            tokens = retrieve_tokens(text)

            allTokens += tokens
            #print(tokens)
            #print(dist_words)

            pre = time.time()

            freq_table = {}
            indices = {}
            for i, token in enumerate(tokens):

                #https://towardsdatascience.com/text-summarization-using-tf-idf-e64a0644ace3
                if token in freq_table:
                    freq_table[token] += 1
                    indices[token].append(str(i))
                else:
                    freq_table[token] = 1
                    indices[token] = [str(i)]

            #print("tokens = " + str(time.time() - pre))

            for word, frequency in freq_table.items():
                data.append((word, site + '/' + file, frequency,
                             ','.join(indices[word])))

    text_counts = nltk.Counter(allTokens)
    dist_words = set(text_counts)

    return [(token, ) for token in dist_words], data
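The per-file frequency and position pass above could equivalently be written with collections.defaultdict; a hedged sketch of that variant (the helper name is hypothetical, output shape matches freq_table and indices):

from collections import defaultdict

def frequencies_and_positions(tokens):
    # Map each token to its count and to the list of positions where it occurs.
    freq_table = defaultdict(int)
    indices = defaultdict(list)
    for i, token in enumerate(tokens):
        freq_table[token] += 1
        indices[token].append(str(i))
    return dict(freq_table), dict(indices)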
Example #17
def count_ne_normalized(tagging):
    """
    Count the number of occurrences of each named entity in a given tagging.
    :param tagging: Given tagging of sentence.
    :return: Dictionary structure of counts.
    """
    if tagging:
        length = len(tagging)
        ne_counts = nltk.Counter(tagging)
        normalized = {}
        for key in ne_counts:
            normalized[key] = float(ne_counts[key]) / length
        return normalized
    else:
        return None
Example #18
def get_tokens(text):
    lowers = text.lower()
    # remove the punctuation using the character deletion step of translate
    remove_punctuation_map = dict(
        (ord(char), None) for char in string.punctuation)
    no_punctuation = lowers.translate(remove_punctuation_map)
    tokens = nltk.word_tokenize(no_punctuation)
    stop_words = set(stopwords.words('english'))
    filtered = [w for w in tokens if w not in stop_words]
    postagged = nltk.pos_tag(filtered)
    lemmatizer = nltk.WordNetLemmatizer()
    word_lemmatized = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in postagged
    ]
    count = nltk.Counter(word_lemmatized)
    return word_lemmatized, count
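The snippet relies on a get_wordnet_pos helper that is not shown; a common hedged sketch, assuming it maps Penn Treebank tags to WordNet POS constants:

from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    # Map a Penn Treebank tag to the corresponding WordNet POS constant.
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN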
Example #19
def main():
    input_url = input("Enter the URL which you would like to analyze:")
    input_url += '/'
    check_url = validate_url(input_url)
    if check_url:
        web_text = crawl_web(input_url)
        web_text = re.sub('[!@#$():]', ' ', web_text)
        nlp = spacy.load('en_core_web_sm')
        parse_list = []
        for sent in sent_tokenize(web_text):
            doc = nlp(sent)
            for chunk in doc.noun_chunks:
                parse_list.append(chunk.text)

        # Removal of stopwords
        updated_parse = remove_stopwords(parse_list)

        # Lematization
        updated_parse_new = lematize(updated_parse)

        dic = {}
        for word in updated_parse_new:
            if word in dic:
                dic[word] += 1
            else:
                dic[word] = 1

        counter_dic = nltk.Counter(dic)
        # print(counter_dic.most_common())
        if len(counter_dic) < 500:
            for k, v in counter_dic.most_common(10):
                print(k)
        elif 500 <= len(counter_dic) < 750:
            for k, v in counter_dic.most_common(15):
                print(k)
        else:
            for k, v in counter_dic.most_common(20):
                print(k)
    else:
        print('Entered URL is not valid, please enter URL again')
        main()
Example #20
    prob_distArray.append(prob_dist_uni.prob(s))
i = 0
for lim in freq_dist_uni.most_common(10):
    print(lim, prob_distArray[i])
    i += 1

elep = ELEProbDist(freq_dist_uni)
for s in elep.samples():
    prob_distArray.append(elep.prob(s))
i = 0
for lim in freq_dist_uni.most_common(10):
    print(lim, prob_distArray[i], "\n")
    i += 1

uniqueWords = len(set(tokenized_text))
print("Unique Words: ", uniqueWords, "\n")

bigram_count = bigrams(tokenized_text)
counts = nltk.Counter(bigram_count)
print("Bigram Count: ", counts, "\n", "Most Common 10 bigram: ",
      counts.most_common(10), "\n", "Least Common 3 words: ",
      counts.most_common()[-3:], "\n")

word_mapping = dict(
    (w, w) if freq_dist_uni[w] > 1 else (w, 'UNK') for w in tokenized_text)
print("less frequent unigrams to UNK: ", word_mapping)

word_mapping = dict(
    (bg, bg) if counts[bg] > 1 else (bg, 'UNK') for bg in counts)
print("less frequent bigrams to UNK: ", word_mapping)
Example #21
'''
Obtain words within all titles and bodytexts
'''
words = ""


titles = dfs['title'].values
bodies = dfs['body_text'].values

for idx, title in enumerate(titles):
    if str(title) != 'nan':
        words += title
for idx, body in enumerate(bodies):
    if str(body) != 'nan':
        words += body

## remove stop words

text_tokens = word_tokenize(words.strip().lower())
tokens_without_sw = [word for word in text_tokens if not word in stopWords]
counter_words = nltk.Counter(tokens_without_sw)
# Join the filtered tokens back into one string for WordCloud.generate()
tokens_without_sw_str = ' '.join(tokens_without_sw)


'''
Word Cloud
'''
data = WordCloud(background_color="white", max_words=200)
data.generate(tokens_without_sw_str)
data.to_file(CSV_FILE_DIR_HEAD + 'figure/wordcloud.eps')
Example #22
# Cleaning the corpus
# def clean_data(text):
#     text = str(text).lower()
#     text = re.sub('\[.*?\]', '', text)
#     text = re.sub('https?://\S+|www\.\S+', '', text)
#     text = re.sub('<.*?>+', '', text)
#     text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
#     text = re.sub('\n', '', text)
#     text = re.sub('\w*\d\w*', '', text)
#     return text
#
#
# train['text'] = train['text'].apply(lambda x: clean_data(x))
# train['selected_text'] = train['selected_text'].apply(lambda x: clean_data(x))
# print(train.head())

# Trying to visualise the most common words in the selected text


def remove_stopword(text):
    stop_words = set(stopwords.words('english'))
    return [y for y in text if y not in stop_words]


train['temp_list'] = train['selected_text'].apply(lambda x: str(x).split())
train['temp_list'] = train['temp_list'].apply(lambda x: remove_stopword(x))
top = nltk.Counter(
    [item for sublist in train['temp_list'] for item in sublist])
temp = pd.DataFrame(top.most_common(20))
temp.columns = ['Common_words', 'count']
temp.style.background_gradient(cmap='Blues')
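The styled DataFrame typically only renders in a notebook; a hedged alternative sketch that plots the same counts with matplotlib (not part of the original):

import matplotlib.pyplot as plt

# Horizontal bar chart of the 20 most common words in the selected text
temp.sort_values('count').plot.barh(x='Common_words', y='count', legend=False)
plt.xlabel('count')
plt.tight_layout()
plt.show()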
Example #23
columns_sel = [
    'abv', 'colour', 'country', 'description', 'grape_variety', 'name', 'Body'
]

a_concat = pd.concat([a0[columns_sel], a1[columns_sel]]).reset_index()
grapes = a_concat['grape_variety'].tolist()

# in case we do not want to blend grapes
grape = [
    grape for x in grapes if len(x.split(',')) == 1 for grape in x.split(', ')
]
fdist = nltk.FreqDist(grape)

result = a_concat['grape_variety']

## removing varieties that have only one member in the database
counts = nltk.Counter(result)
varieties = [key for key in counts if counts[key] > 40]
data_input = a_concat[a_concat['grape_variety'].isin(varieties)].reset_index()

# split the data into train and test
combined_features = ['Body', 'description', 'grape_variety']
target = 'grape_variety'

X_train, X_test, y_train, y_test = train_test_split(
    data_input[combined_features],
    data_input[target],
    test_size=0.33,
    random_state=42)

# aggregate description by grape type
grouped = X_train[['grape_variety',
Example #24
"""
import nltk as nlp
import re
import matplotlib.pyplot as plt
#open the plaintext file
file = open("The_Gift_of_The_Magi.txt")
# Read the file into a string type variable called 'text'
text = file.read()
#Using Regex (Regular Expressions) remove all the special characters in the text.
#This is done by selecting all special characters and substituting them with a space character.
text = re.sub(r'[^\w]', ' ', text)
# Tokenize the text and store it in a list called tokens
tokens = nlp.word_tokenize(text)
# Using the Counter function, obtain the frequency of each token and store it as a 'key - value'
# pair in a dictionary called types
types = nlp.Counter(tokens)
# For Debugging purposes
print(tokens)
print(types)
# Extract all the 'keys' from the dictionary types and store it as a list in X
X = list(types.keys())
# Declare an empty list Y
Y = []
#For Debugging purposes
print(X)
# For each key in X, find the corresponding value in the dictionary types and append it to list Y
for key in X:
    Y.append(types[key])
# For Debugging purposes
print(Y)
Example #25
def get_news_features(headline, text):

    nlp = es_core_news_md.load()

    ## headline ##
    headline = re.sub(r"http\S+", "", headline)
    headline = re.sub(r"http", "", headline)
    headline = re.sub(r"@\S+", "", headline)
    headline = re.sub("\n", " ", headline)
    headline = re.sub(r"(?<!\n)\n(?!\n)", " ", headline)
    headline = headline.replace(r"*NUMBER*", "número")
    headline = headline.replace(r"*PHONE*", "número")
    headline = headline.replace(r"*EMAIL*", "email")
    headline = headline.replace(r"*URL*", "url")
    headline_lower = headline.lower()
    doc_h = nlp(headline_lower)

    list_tokens_h = []
    list_tags_h = []

    for sentence_h in doc_h.sents:
        for token in sentence_h:
            list_tokens_h.append(token.text)

    fdist_h = FreqDist(list_tokens_h)
    syllables_h = get_nsyllables(headline)
    words_h = len(list_tokens_h)

    # headline complexity features
    avg_word_size_h = round(
        sum(len(word) for word in list_tokens_h) / words_h, 2)
    avg_syllables_word_h = round(syllables_h / words_h, 2)
    unique_words_h = round((len(fdist_h.hapaxes()) / words_h) * 100, 2)
    mltd_h = round(ld.mtld(list_tokens_h), 2)
    ttr_h = round(ld.ttr(list_tokens_h) * 100, 2)

    ## text content##
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"http", "", text)
    text = re.sub("\n", " ", text)
    text = text.replace(r"*NUMBER*", "número")
    text = text.replace(r"*PHONE*", "número")
    text = text.replace(r"*EMAIL*", "email")
    text = text.replace(r"*URL*", "url")

    # to later calculate upper case letters ratio
    alph = list(filter(str.isalpha, text))
    text_lower = text.lower()
    doc = nlp(text_lower)

    list_tokens = []
    list_pos = []
    list_tag = []
    list_entities = []
    sents = 0

    for entity in doc.ents:
        list_entities.append(entity.label_)

    for sentence in doc.sents:
        sents += 1
        for token in sentence:
            list_tokens.append(token.text)
            list_pos.append(token.pos_)
            list_tag.append(token.tag_)

    # Calculate entities, pos, tag, freq, syllables, words and quotes
    entities = len(list_entities)
    n_pos = nltk.Counter(list_pos)
    n_tag = nltk.Counter(list_tag)
    fdist = FreqDist(list_tokens)
    syllables = get_nsyllables(text)
    words = len(list_tokens)
    quotes = n_tag['PUNCT__PunctType=Quot']

    # complexity features
    avg_word_sentence = round(words / sents, 2)
    avg_word_size = round(sum(len(word) for word in list_tokens) / words, 2)
    avg_syllables_word = round(syllables / words, 2)
    unique_words = round((len(fdist.hapaxes()) / words) * 100, 2)
    ttr = round(ld.ttr(list_tokens) * 100, 2)

    # readability spanish test
    huerta_score = round(
        206.84 - (60 * avg_syllables_word) - (1.02 * avg_word_sentence), 2)
    szigriszt_score = round(
        206.835 - ((62.3 * syllables) / words) - (words / sents), 2)

    # stylometric features
    mltd = round(ld.mtld(list_tokens), 2)
    upper_case_ratio = round(sum(map(str.isupper, alph)) / len(alph) * 100, 2)
    entity_ratio = round((entities / words) * 100, 2)
    quotes_ratio = round((quotes / words) * 100, 2)
    propn_ratio = round((n_pos['PROPN'] / words) * 100, 2)
    noun_ratio = round((n_pos['NOUN'] / words) * 100, 2)
    pron_ratio = round((n_pos['PRON'] / words) * 100, 2)
    adp_ratio = round((n_pos['ADP'] / words) * 100, 2)
    det_ratio = round((n_pos['DET'] / words) * 100, 2)
    punct_ratio = round((n_pos['PUNCT'] / words) * 100, 2)
    verb_ratio = round((n_pos['VERB'] / words) * 100, 2)
    adv_ratio = round((n_pos['ADV'] / words) * 100, 2)
    sym_ratio = round((n_tag['SYM'] / words) * 100, 2)

    # create df_features
    df_features = pd.DataFrame({
        'text': text_lower,
        'headline': headline_lower,
        'words_h': words_h,
        'word_size_h': [avg_word_size_h],
        'avg_syllables_word_h': [avg_syllables_word_h],
        'unique_words_h': [unique_words_h],
        'ttr_h': ttr_h,
        'mltd_h': [mltd_h],
        'sents': sents,
        'words': words,
        'avg_words_sent': [avg_word_sentence],
        'avg_word_size': [avg_word_size],
        'avg_syllables_word': avg_syllables_word,
        'unique_words': [unique_words],
        'ttr': [ttr],
        'huerta_score': [huerta_score],
        'szigriszt_score': [szigriszt_score],
        'mltd': [mltd],
        'upper_case_ratio': [upper_case_ratio],
        'entity_ratio': [entity_ratio],
        'quotes': quotes,
        'quotes_ratio': [quotes_ratio],
        'propn_ratio': [propn_ratio],
        'noun_ratio': [noun_ratio],
        'pron_ratio': [pron_ratio],
        'adp_ratio': [adp_ratio],
        'det_ratio': [det_ratio],
        'punct_ratio': [punct_ratio],
        'verb_ratio': [verb_ratio],
        'adv_ratio': [adv_ratio],
        'sym_ratio': [sym_ratio]
    })

    return df_features
Example #26
def get_ngrams(tokens):
    n_grams = []
    for i in gram_target:
        items = nltk.ngrams(tokens, i)
        n_grams.append(nltk.Counter(items))
    return n_grams
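gram_target is assumed to be defined at module level as the list of n-gram orders to extract; a hedged usage sketch under that assumption:

import nltk

gram_target = [1, 2, 3]  # hypothetical: unigrams, bigrams and trigrams

tokens = nltk.word_tokenize("the quick brown fox jumps over the lazy brown dog")
unigram_counts, bigram_counts, trigram_counts = get_ngrams(tokens)
print(bigram_counts.most_common(2))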
Example #27
def caltf_idf_Word_Counter(Sentences):
    word_Counts = []
    for s in Sentences:
        word_Counts.append(nltk.Counter(nltk.word_tokenize(s)))
    return word_Counts
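A hedged sketch of how these per-sentence Counters might feed a TF-IDF weight; the smoothing and log base are assumptions, not taken from the original:

import math
import nltk

def tf_idf(word_counts, term, doc_index):
    # Term frequency within the chosen sentence
    doc = word_counts[doc_index]
    tf = doc[term] / max(sum(doc.values()), 1)
    # Document frequency: number of sentences containing the term
    df = sum(1 for counts in word_counts if term in counts)
    idf = math.log(len(word_counts) / (1 + df))
    return tf * idf

sentences = ["the cat sat on the mat", "the dog sat", "a cat ran"]
word_counts = caltf_idf_Word_Counter(sentences)
print(tf_idf(word_counts, "mat", 0))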
Example #28
    def analyze(self, corpus, mode):
        """A 'corpus' here means a text file in either the keyword or country directory"""
        # Read the subcorpus, determining keywords and most common collocates
        with open(corpus, 'r', encoding='utf-8', errors='ignore') as f:
            subcorpus = json.load(f)
        name = os.path.basename(corpus)
        name = name.split('.')[0]
        subcorpus_keywords = defaultdict(int)
        subcorpus_adjectives = defaultdict(int)
        subcorpus_verbs = defaultdict(int)
        subcorpus_collocates = defaultdict(int)
        subcorpus_bigrams = defaultdict(int)
        subcorpus_hashtags = defaultdict(int)
        subcorpus_sentiment = defaultdict(int)
        clean_text = []

        for tweet in subcorpus:
            self.total_tweets['total'] += 1
            self.total_tweets[name] += 1
            tweet_text = tweet['text']

            for hashtag in tweet['hashtags']:
                subcorpus_hashtags[hashtag] += 1
                tweet_text = tweet_text.replace(f'#{hashtag}', '')

            keywords, adjectives, verbs = self.find_keywords(tweet_text, name)
            for keyword, occurrences in keywords.items():
                subcorpus_keywords[keyword] += occurrences

            for adj, occurrences in adjectives.items():
                subcorpus_adjectives[adj] += occurrences

            for verb, occurrences in verbs.items():
                subcorpus_verbs[verb] += occurrences

            if mode == 'country':
                bigrams = self.find_bigrams(tweet_text)
                for bigram, occurrences in bigrams.items():
                    subcorpus_bigrams[bigram] += occurrences
            elif mode == 'term':
                collocates = self.find_collocates(tweet_text, name)
                for collocate, occurrences in collocates.items():
                    subcorpus_collocates[collocate] += occurrences

            subcorpus_sentiment[tweet['sentiment']] += 1

            clean_text.append(tweet_text)
            print(self.total_tweets['total'])
        """Create a bar chart of the keywords"""
        keyword_counter = nltk.Counter(subcorpus_keywords)
        labels = []
        values = []
        for keyword, count in keyword_counter.most_common(20):
            labels.append(keyword)
            values.append(count)
        graph_data_bar(labels, values,
                       f'Most common keywords in the {name} subcorpus',
                       'Occurrences', 'Keyword')

        adj_counter = nltk.Counter(subcorpus_adjectives)
        labels = []
        values = []
        for adj, count in adj_counter.most_common(20):
            labels.append(adj)
            values.append(count)
        graph_data_bar(labels, values,
                       f'Most common adjectives in the {name} subcorpus',
                       'Occurrences', 'Keyword')

        verb_counter = nltk.Counter(subcorpus_verbs)
        labels = []
        values = []
        for verb, count in verb_counter.most_common(20):
            labels.append(verb)
            values.append(count)
        graph_data_bar(labels, values,
                       f'Most common verbs in the {name} subcorpus',
                       'Occurrences', 'Keyword')

        if mode == 'country':
            """Create a list/csv of the most common bigrams"""
            bigram_counter = nltk.Counter(subcorpus_bigrams)
            outputpath = os.path.join('..', 'data', 'csv_files')
            os.makedirs(outputpath, exist_ok=True)
            file = os.path.join(outputpath, f'{name}_bigrams.csv')
            if os.path.exists(file):
                os.remove(file)
            print(f'\nMost common bigrams in the {name} corpus:')
            with open(file, mode='w') as bigram_list:
                writer = csv.writer(bigram_list,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
                for n, (bigram, occurrences) in enumerate(
                        bigram_counter.most_common(20)):
                    print(f'    {n + 1}. {bigram} ({occurrences} samples)')
                    writer.writerow([bigram, occurrences])
            input('\n')
        elif mode == 'term':
            """Create a bar graph of the most common collocates"""
            collocate_counter = nltk.Counter(subcorpus_collocates)
            labels = []
            values = []
            pre_labels = []
            post_labels = []
            pre_values = []
            post_values = []
            for (w1, w2), count in collocate_counter.most_common(20):
                if len(name.split()) == 2:
                    # This collocate is for 'asylum seeker'
                    if name.split()[1] == w1:
                        # this is a post-word collocate
                        labels.append(w2)
                        values.append(count)
                        post_labels.append(w2)
                        post_values.append(count)
                    elif name.split()[0] == w2:
                        # this is a pre-word collocate
                        labels.append(w1)
                        values.append(count)
                        pre_labels.append(w1)
                        pre_values.append(count)
                elif name == w1:
                    # this is a post-word collocate, as the search term is the first word in the bigram
                    labels.append(w2)
                    values.append(count)
                    post_labels.append(w2)
                    post_values.append(count)
                else:
                    # this is a pre-word collocate, as the search term is the second word in the bigram
                    labels.append(w1)
                    values.append(count)
                    pre_labels.append(w1)
                    pre_values.append(count)

            graph_data_bar(labels, values, f'Most common collocates of {name}',
                           'Occurrences', 'Collocate')
            graph_data_bar(pre_labels, pre_values,
                           f'Most common pre-word collocates of {name}',
                           'Occurrences', 'Collocate')
            graph_data_bar(post_labels, post_values,
                           f'Most common post-word collocates of {name}',
                           'Occurrences', 'Collocate')
        """Create a list/csv of the hashtags"""
        hashtag_counter = nltk.Counter(subcorpus_hashtags)
        print(f'\nMost common hashtags in the {name} corpus:')
        outputpath = os.path.join('..', 'data', 'csv_files')
        os.makedirs(outputpath, exist_ok=True)
        file = os.path.join(outputpath, f'{name}_hashtags.csv')
        if os.path.exists(file):
            os.remove(file)
        with open(file, mode='w') as hashtag_list:
            writer = csv.writer(hashtag_list,
                                delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
            for n, (key, value) in enumerate(hashtag_counter.most_common(20)):
                print(f'    {n + 1}. {key} ({value} samples)')
                writer.writerow([key, value])
        input('\n')
        """"Create a pie chart of the sentiment distribution"""
        labels = ['positive', 'negative', 'neutral']
        values = [
            subcorpus_sentiment['positive'], subcorpus_sentiment['negative'],
            subcorpus_sentiment['neutral']
        ]

        graph_data_pie(labels, values,
                       f'Sentiment distribution in the {name} subcorpus')
        """Create clean text file for AntConc"""
        # Write the clean text into a 'clean' directory next to the corpus file
        path = os.path.join(os.path.dirname(corpus), 'clean')
        os.makedirs(path, exist_ok=True)
        outname = os.path.join(path, f'{name}.txt')
        with open(outname, 'w', encoding='utf-8') as outfile:
            for clean_tweet in clean_text:
                outfile.write(clean_tweet)
Example #29
variety = {}
for ii in range(len(result)):
    tmp = result.iloc[ii].lower()
    tmp = tmp.split(',')
    tmp = [re.sub(r'^ ', '', x) for x in tmp]
    tmp = [re.sub(r' $', '', x) for x in tmp]
    tmp = [shiraz_filter(samp) for samp in tmp]
    tmp = str(set(tmp)).replace("'", '').replace('{', '').replace('}', '')
    variety[ii] = tmp

result = pd.Series(variety)
#a['Variety']=result

## removing varieties that have only one member in the database
counts = nltk.Counter(result)
varieties = [key for key in counts if counts[key] > 30]

#data_input = a[a['Variety'].isin(varieties)]
data_input = a[a['grape_variety'].isin(varieties)].reset_index()
############################################

#defTags = ['CC', '.', ',', 'IN', ';', 'PRP', 'DT', 'MD', 'PDT', 'POS', 'TO', 'WDT', 'WP', 'WRB', 'NNP', 'RP']
#defTags = ['CC', '.', ',', 'IN', ';', 'DT', 'TO', 'CD']
defTags = ['NNS', 'NN', 'JJ', 'JJS',
           'JJR']  #, 'RB', 'RBS', 'RBR']#, 'VBD', 'VBZ']
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+'


def clean_function(ll):
    list_to_remove = ["'", 's', '-']
Example #30
    def calculateTermFrequency(self):
        print("Calculating term frequency for all words. Please wait.")
        self.tf_list = nltk.Counter(self.word_list)