# Assumed imports for this snippet: numpy and nltk's ngrams helper.
import numpy as np
from nltk.util import ngrams


def get_bigram_review_vector(text, model, average=True, kernel=(1, 1)):
    """Build a review vector by summing kernel-weighted word vectors over bigrams."""
    bigrams = ngrams(text, 2)
    vector = np.zeros(model.vector_size)
    count = 0
    for bigram in bigrams:
        bigram_vector = np.zeros(model.vector_size)
        if bigram[0] in model:
            bigram_vector += model[bigram[0]] * kernel[0]
        if bigram[1] in model:
            bigram_vector += model[bigram[1]] * kernel[1]
        count += 1
        vector += bigram_vector
    if average and count > 0:
        vector /= count
    return vector
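# Usage sketch (not from the original source): `model` only needs to support
# `word in model`, `model[word]` and a `vector_size` attribute, which matches the
# interface of a gensim KeyedVectors object. A tiny stand-in is used here so the
# example runs without a trained model; the tokens and kernel weights are illustrative only.
class _ToyVectors:
    vector_size = 3

    def __init__(self, table):
        self._table = table

    def __contains__(self, word):
        return word in self._table

    def __getitem__(self, word):
        return np.array(self._table[word], dtype=float)


toy = _ToyVectors({"great": [1, 0, 0], "battery": [0, 1, 0], "life": [0, 0, 1]})
print(get_bigram_review_vector(["great", "battery", "life"], toy, kernel=(0.5, 1.5)))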
# (contentsoffile, lem, word_tokenize, ngrams and Counter are assumed to be
# defined/imported earlier in the original script.)
# Tokenizing the contents of the file.
words = word_tokenize(contentsoffile)

print("Lemmatization with verb form of the words:")
for word in words:
    print(lem.lemmatize(word, pos='v'))
print("-----------------------------------------------------")
print("-----------------------------------------------------")

print("Performing bi-gram on the text:")
cp = word_tokenize(contentsoffile)
li = []
# Calling the ngrams function to find bigrams in the given text.
bigramfinder = ngrams(cp, 2)
for a in bigramfinder:
    li.append(a)
print(li)
print("-----------------------------------------------------")
print("-----------------------------------------------------")

# Using Counter to count the number of occurrences of each bigram.
word_count = Counter(li)
print("Calculating the word frequency of Bi-Gram:")
print(word_count)
print("-----------------------------------------------------")
print("-----------------------------------------------------")

print("Finding the top 5 bigrams:")
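# A plausible continuation (assumption, not in the original snippet): Counter
# exposes most_common(), which returns the five highest-frequency bigrams here.
print(word_count.most_common(5))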
def word_grams(words, min=1, max=4):
    """Return all n-grams of the token list as space-joined strings.

    Note that range(min, max) excludes max, so the defaults produce
    unigrams, bigrams and trigrams.
    """
    s = []
    for n in range(min, max):
        for ngram in ngrams(words, n):
            s.append(' '.join(str(i) for i in ngram))
    return s
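# Usage sketch (assumption, not from the original source): word_grams expects an
# already tokenized sequence; the sentence below is illustrative only and assumes
# nltk's ngrams has been imported as above.
sample_tokens = "the quick brown fox".split()
print(word_grams(sample_tokens))
# With the defaults this prints unigrams, bigrams and trigrams, e.g.
# 'the', 'the quick', 'the quick brown', ...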
def trigrams(self):
    """
    :return: trigrams in the corpus
    """
    return ngrams(self.tokens, 3)
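# Minimal sketch (assumption, not from the original source): the surrounding
# corpus class only needs a `tokens` attribute for the method above to work,
# and nltk.util.ngrams returns a lazy generator, so the result is usually
# wrapped in list() before printing or counting.
from nltk.util import ngrams


class ToyCorpus:
    def __init__(self, tokens):
        self.tokens = tokens

    def trigrams(self):
        return ngrams(self.tokens, 3)


print(list(ToyCorpus("a b c d".split()).trigrams()))
# [('a', 'b', 'c'), ('b', 'c', 'd')]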
# Needed libraries
from nltk.util import ngrams
from nltk.tokenize import word_tokenize

# Reading the file.
infile = open('input.txt', 'r', encoding="utf-8")
# Giving the file contents a name.
text = infile.read()
# Using word_tokenize to split the text into words.
words = word_tokenize(text)
# Here we ask for trigrams (n = 3).
X = ngrams(words, 3)
# This loop is needed to iterate the generator and print all the trigrams.
for a in X:
    print(a)
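# Equivalent sketch (assumption: the same input.txt): a with-block closes the
# file automatically, and Counter summarises trigram frequencies instead of
# printing every tuple.
from collections import Counter

with open('input.txt', 'r', encoding="utf-8") as handle:
    trigram_counts = Counter(ngrams(word_tokenize(handle.read()), 3))
print(trigram_counts.most_common(5))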
# Assumed imports for this snippet; emotion_lexicon, NEGATIONS_SET, INTENSIFIER_DICT,
# STOP_WORDS, emotion_name and trivialTokenizer are expected to be defined elsewhere
# in the original module.
import nltk
from nltk import bigrams, ngrams, FreqDist


def eval_text(path):
    # Reading the file while replacing new lines, then tokenizing into separate sentences.
    file = open(path).read().replace('\n', ' ')
    file = file.lower()
    sentences = nltk.sent_tokenize(file)

    # emotion_type => (unnegated_count, negated_count)
    emotion_count = {
        1: (0, 0),
        2: (0, 0),
        3: (0, 0),
        4: (0, 0),
        5: (0, 0),
        6: (0, 0),
        7: (0, 0),
        8: (0, 0)
    }
    expression_count = {
        'total': 0,
        'unigram': 0,
        # Bigrams
        'n': 0,
        'i': 0,
        'di': 0,
        # Trigrams
        ('n', 'x'): 0,
        ('n', 'i'): 0,
        ('n', 'n'): 0,
        ('n', 'di'): 0,
        ('i', 'x'): 0,
        ('i', 'i'): 0,
        ('i', 'n'): 0,
        ('i', 'di'): 0,
        ('di', 'x'): 0,
        ('di', 'i'): 0,
        ('di', 'n'): 0,
        ('di', 'di'): 0
    }
    example_expressions = {
        'unigram': [],
        'n': [],
        'i': [],
        'di': [],
        ('n', 'x'): [],
        ('n', 'i'): [],
        ('n', 'n'): [],
        ('n', 'di'): [],
        ('i', 'x'): [],
        ('i', 'i'): [],
        ('i', 'n'): [],
        ('i', 'di'): [],
        ('di', 'x'): [],
        ('di', 'i'): [],
        ('di', 'n'): [],
        ('di', 'di'): []
    }

    # Main loop for extracting unigrams, bigrams and trigrams from each sentence.
    token_count = 0
    sent_count = 0
    for sent in sentences:
        sent_count += 1
        text_unigrams = sent_list = trivialTokenizer(sent)
        token_count += len(text_unigrams)  # Get a total count of the number of tokens in the text.
        text_bigrams = list(bigrams(sent_list))
        text_trigrams = list(ngrams(sent_list, 3))

        for unigram in text_unigrams:
            if unigram in emotion_lexicon:
                expression_count['unigram'] += 1
                example_expressions['unigram'].append(unigram)
                emotion_types = emotion_lexicon[unigram]
                for e in emotion_types:
                    (unnegated_count, negated_count) = emotion_count[e]
                    emotion_count[e] = (unnegated_count + 1, negated_count)

        # Save the current count of the number of tokens that are in the emotion lexicon.
        expression_count['total'] = expression_count['unigram']

        for bigram in text_bigrams:
            if bigram[1] in emotion_lexicon:
                emotion_types = emotion_lexicon[bigram[1]]
                if bigram[0] in NEGATIONS_SET:
                    # Subtract from unigram since it was counted previously as a unigram.
                    expression_count['unigram'] -= 1
                    expression_count['n'] += 1
                    example_expressions['n'].append(' '.join(bigram))
                    for e in emotion_types:
                        (unnegated_count, negated_count) = emotion_count[e]
                        emotion_count[e] = (unnegated_count - 1, negated_count + 1)
                elif bigram[0] in INTENSIFIER_DICT:
                    # Subtract from unigram since it was counted previously as a unigram.
                    expression_count['unigram'] -= 1
                    multiplier = INTENSIFIER_DICT[bigram[0]]
                    if multiplier > 1:
                        expression_count['i'] += 1
                        example_expressions['i'].append(' '.join(bigram))
                    else:
                        # Count deintensifiers separately from intensifiers.
                        expression_count['di'] += 1
                        example_expressions['di'].append(' '.join(bigram))
                    for e in emotion_types:
                        (unnegated_count, negated_count) = emotion_count[e]
                        emotion_count[e] = (unnegated_count - 1 + multiplier, negated_count)

        for trigram in text_trigrams:
            if trigram[2] in emotion_lexicon:
                emotion_types = emotion_lexicon[trigram[2]]

                word_1 = ''
                if trigram[0] in NEGATIONS_SET:
                    word_1 = 'n'
                elif trigram[0] in INTENSIFIER_DICT:
                    word_1 = 'i'
                elif trigram[0] in STOP_WORDS:
                    word_1 = 'x'

                word_2 = ''
                if trigram[1] in NEGATIONS_SET:
                    word_2 = 'n'
                elif trigram[1] in INTENSIFIER_DICT:
                    word_2 = 'i'
                elif trigram[1] in STOP_WORDS:
                    word_2 = 'x'

                if word_1 == 'n' and word_2 == 'x':
                    expression_count[('n', 'x')] += 1
                    example_expressions[('n', 'x')].append(' '.join(trigram))
                    expression_count['unigram'] -= 1
                    for e in emotion_types:
                        (unnegated_count, negated_count) = emotion_count[e]
                        emotion_count[e] = (unnegated_count - 1, negated_count + 1)

                if word_1 == 'i' and word_2 == 'x':
                    expression_count['unigram'] -= 1
                    multiplier = INTENSIFIER_DICT[trigram[0]]
                    if multiplier > 1:
                        expression_count[('i', 'x')] += 1
                        example_expressions[('i', 'x')].append(' '.join(trigram))
                    else:
                        expression_count[('di', 'x')] += 1
                        example_expressions[('di', 'x')].append(' '.join(trigram))
                    for e in emotion_types:
                        # Subtract the previously counted unnegated unigram, then add the weighted value.
                        # The bigram was not counted since the second word is neither a negation nor an
                        # intensifier, so the unigram count was not adjusted in the bigram loop.
                        (unnegated_count, negated_count) = emotion_count[e]
                        emotion_count[e] = (unnegated_count - 1 + multiplier, negated_count)

                if word_1 == 'n' and word_2 == 'n':
                    expression_count[('n', 'n')] += 1
                    example_expressions[('n', 'n')].append(' '.join(trigram))
                    expression_count['n'] -= 1
                    for e in emotion_types:
                        # Subtract the previous negated count as a bigram.
                        (unnegated_count, negated_count) = emotion_count[e]
                        emotion_count[e] = (unnegated_count + 1, negated_count - 1)

                if word_1 == 'i' and word_2 == 'n':
                    expression_count['n'] -= 1
                    multiplier = INTENSIFIER_DICT[trigram[0]]
                    if multiplier > 1:
                        expression_count[('i', 'n')] += 1
                        example_expressions[('i', 'n')].append(' '.join(trigram))
                    else:
                        expression_count[('di', 'n')] += 1
                        example_expressions[('di', 'n')].append(' '.join(trigram))
                    for e in emotion_types:
                        (unnegated_count, negated_count) = emotion_count[e]
                        emotion_count[e] = (unnegated_count, negated_count - 1 + multiplier)

                if word_1 == 'n' and word_2 == 'i':
                    expression_count['i'] -= 1
                    multiplier = INTENSIFIER_DICT[trigram[1]]
                    if multiplier > 1:
                        expression_count[('n', 'i')] += 1
                        example_expressions[('n', 'i')].append(' '.join(trigram))
                    else:
                        expression_count[('n', 'di')] += 1
                        example_expressions[('n', 'di')].append(' '.join(trigram))
                    for e in emotion_types:
                        # Undo the previous count as a bigram with an intensifier, so subtract the multiplier.
                        (unnegated_count, negated_count) = emotion_count[e]
                        emotion_count[e] = (unnegated_count - multiplier, negated_count + multiplier)

                if word_1 == 'i' and word_2 == 'i':
                    expression_count['i'] -= 1
                    multiplier_1 = INTENSIFIER_DICT[trigram[0]]
                    multiplier_2 = INTENSIFIER_DICT[trigram[1]]
                    type_1 = 'i' if multiplier_1 > 1 else 'di'
                    type_2 = 'i' if multiplier_2 > 1 else 'di'
                    expression_count[(type_1, type_2)] += 1
                    example_expressions[(type_1, type_2)].append(' '.join(trigram))
                    # Since a multiplier of 1 is considered neutral, we need to determine the
                    # 'direction' of the intensifier, then multiply by the first intensifier
                    # to get the new 'distance' for the second intensifier (the first intensifier
                    # affects the second intensifier, not the emotion word). The new 'vector' is
                    # then shifted back so that 1 is neutral.
                    new_multiplier = multiplier_1 * (multiplier_2 - 1) + 1
                    for e in emotion_types:
                        # Undo the previous count by subtracting the original multiplier.
                        (unnegated_count, negated_count) = emotion_count[e]
                        emotion_count[e] = (unnegated_count - multiplier_2 + new_multiplier, negated_count)

    print('-----------------------------------------------------------------------------')
    print('Summary for', path)
    print('')
    print('Sentence count:', sent_count)
    print('Total number of tokens:', token_count)
    print('-----------------------------------------------------------------------------')
    print('')
    print('Expression types:')
    print('')

    def get_examples(type, num=2):
        freq = FreqDist(example_expressions[type])
        most_common = freq.most_common(num)
        strings = []
        for (word, word_c) in most_common:
            strings.append('{} ({})'.format(word, word_c))
        return ', '.join(strings)

    print('Total emotion expressions:', expression_count['total'])
    print('')
    print('Unigram count:', expression_count['unigram'])
    print('Most common: {}'.format(get_examples('unigram', 5)))
    print('')
    print('Bigram total count:',
          expression_count['n'] + expression_count['i'] + expression_count['di'])
    print('Negation + emotion: {:5.0f}\t{:}'.format(
        expression_count['n'], get_examples('n', 3)))
    print('Intensifier + emotion: {:5.0f}\t{:}'.format(
        expression_count['i'], get_examples('i', 3)))
    print('Deintensifier + emotion: {:5.0f}\t{:}'.format(
        expression_count['di'], get_examples('di', 3)))
    print('')
    trigram_total = 0
    for k in [('n', 'x'), ('n', 'i'), ('n', 'n'), ('n', 'di'),
              ('i', 'x'), ('i', 'i'), ('i', 'n'), ('i', 'di'),
              ('di', 'x'), ('di', 'i'), ('di', 'n'), ('di', 'di')]:
        trigram_total += expression_count[k]
    print('Trigram total count:', trigram_total)
    print('Negation + stop word + emotion: {:4.0f}\t {:}'.format(
        expression_count[('n', 'x')], get_examples(('n', 'x'))))
    print('Negation + negation + emotion: {:4.0f}\t {:}'.format(
        expression_count[('n', 'n')], get_examples(('n', 'n'))))
    print('Negation + intensifier + emotion: {:4.0f}\t {:}'.format(
        expression_count[('n', 'i')], get_examples(('n', 'i'))))
    print('Negation + deintensifier + emotion: {:4.0f}\t {:}'.format(
        expression_count[('n', 'di')], get_examples(('n', 'di'))))
    print('Intensifier + stop word + emotion: {:4.0f}\t {:}'.format(
        expression_count[('i', 'x')], get_examples(('i', 'x'))))
    print('Intensifier + negation + emotion: {:4.0f}\t {:}'.format(
        expression_count[('i', 'n')], get_examples(('i', 'n'))))
    print('Intensifier + intensifier + emotion: {:4.0f}\t {:}'.format(
        expression_count[('i', 'i')], get_examples(('i', 'i'))))
    print('Intensifier + deintensifier + emotion: {:4.0f}\t {:}'.format(
        expression_count[('i', 'di')], get_examples(('i', 'di'))))
    print('Deintensifier + stop word + emotion: {:4.0f}\t {:}'.format(
        expression_count[('di', 'x')], get_examples(('di', 'x'))))
    print('Deintensifier + negation + emotion: {:4.0f}\t {:}'.format(
        expression_count[('di', 'n')], get_examples(('di', 'n'))))
    print('Deintensifier + intensifier + emotion: {:4.0f}\t {:}'.format(
        expression_count[('di', 'i')], get_examples(('di', 'i'))))
    print('Deintensifier + deintensifier + emotion: {:4.0f}\t {:}'.format(
        expression_count[('di', 'di')], get_examples(('di', 'di'))))
    print('-----------------------------------------------------------------------------')
    print('')
    print('Emotion percentages:')
    print('')
    total = 0
    for (k, v) in emotion_count.items():
        total += v[0] + v[1]
    print("Total weighted sum:", total)
    print('')
    print('emotion (count): % not emotion (count): %')
    print('')
    percentage_dict = {}
    for (k, v) in emotion_count.items():
        percentage_dict[k] = (v[0] / total * 100, v[1] / total * 100)
        print("{:12} ({:.0f}): {:.1f}% \t not {:12} ({:.0f}): {:.1f}%".format(
            emotion_name[k], v[0], v[0] / total * 100,
            emotion_name[k], v[1], v[1] / total * 100))
    print('')
    print('Emotion percentages grouped by positive/neutral/negative')
    print('')
    # Positive side.
    pos_total = (0, 0)
    for k in [5, 8]:
        (i, j) = emotion_count[k]
        pos_total = (pos_total[0] + i, pos_total[1] + j)
    # Neutral side.
    neutral_total = (0, 0)
    for k in [2, 7]:
        (i, j) = emotion_count[k]
        neutral_total = (neutral_total[0] + i, neutral_total[1] + j)
    # Negative side.
    neg_total = (0, 0)
    for k in [1, 3, 4, 6]:
        (i, j) = emotion_count[k]
        neg_total = (neg_total[0] + i, neg_total[1] + j)
    print("{:12} ({:.0f}): {:.1f}% \t negated {:12} ({:.0f}): {:.1f}%".format(
        "positive", pos_total[0], pos_total[0] / total * 100,
        "positive", pos_total[1], pos_total[1] / total * 100))
    print("{:12} ({:.0f}): {:.1f}% \t negated {:12} ({:.0f}): {:.1f}%".format(
        "neutral", neutral_total[0], neutral_total[0] / total * 100,
        "neutral", neutral_total[1], neutral_total[1] / total * 100))
    print("{:12} ({:.0f}): {:.1f}% \t negated {:12} ({:.0f}): {:.1f}%".format(
        "negative", neg_total[0], neg_total[0] / total * 100,
        "negative", neg_total[1], neg_total[1] / total * 100))

    return percentage_dict
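# Usage sketch (assumption, not from the original source): eval_text expects a
# path to a plain-text file, and the module-level resources (emotion_lexicon,
# NEGATIONS_SET, INTENSIFIER_DICT, STOP_WORDS, emotion_name, trivialTokenizer)
# must be loaded beforehand. It prints a summary report and returns a dict
# mapping each emotion id to (unnegated %, negated %).
# percentages = eval_text('reviews/sample_review.txt')   # hypothetical path
# for emotion_id, (pct, negated_pct) in percentages.items():
#     print(emotion_id, pct, negated_pct)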