def get_bigram_review_vector(text, model, average=True, kernel=(1, 1)):
    """Build one vector for ``text`` by summing weighted bigram vectors.

    Every bigram produced by ``ngrams(text, 2)`` contributes the sum of its
    two word vectors, the first scaled by ``kernel[0]`` and the second by
    ``kernel[1]``. Words that are not in ``model`` contribute nothing, but
    the bigram is still counted.

    Args:
        text: Input handed to ``ngrams``; presumably a sequence of tokens
            (TODO confirm against callers).
        model: Object exposing ``vector_size``, ``in`` membership and
            ``model[word]`` lookup.
        average: When true, divide the sum by the number of bigrams seen.
        kernel: Pair of weights for the first and second word of a bigram.

    Returns:
        A numpy array of length ``model.vector_size``; all zeros when
        ``text`` yields no bigrams.
    """
    total = np.zeros(model.vector_size)
    seen = 0
    for pair in ngrams(text, 2):
        # Accumulate the pair separately first so the summation order (and
        # therefore the floating-point result) stays per-bigram.
        pair_sum = np.zeros(model.vector_size)
        for position in (0, 1):
            token = pair[position]
            if token in model:
                pair_sum += model[token] * kernel[position]
        seen += 1
        total += pair_sum
    if average and seen > 0:
        total /= seen
    return total
# Split the file contents into word tokens.
words = word_tokenize(contentsoffile)

print("Lemmatization with verb form of the words:")
for word in words:
    # pos='v' asks the lemmatizer to treat every token as a verb.
    verb_lemma = lem.lemmatize(word, pos='v')
    print(verb_lemma)

print("-----------------------------------------------------")
print("-----------------------------------------------------")
print("Performing bi-gram on the text:")
cp = word_tokenize(contentsoffile)

# ngrams() is consumed once here; the resulting tuples are kept in `li`.
bigramfinder = ngrams(cp, 2)
li = []
li.extend(bigramfinder)
print(li)

print("-----------------------------------------------------")
print("-----------------------------------------------------")

# Use Counter to count the number of occurrences of each bigram.
word_count = Counter(li)
print(" Calculating the word frequency of Bi-Gram:")
print(word_count)

print("-----------------------------------------------------")
print("-----------------------------------------------------")
print("Finding the top 5 bigrams:")
示例#3
0
def word_grams(words, min=1, max=4):
    """Return every n-gram of ``words`` as a space-joined string.

    N-gram sizes run from ``min`` up to, but not including, ``max``
    (``range(min, max)``), so the defaults produce 1-, 2- and 3-grams.
    Results are ordered by size, then by position in ``words``.

    Args:
        words: Iterable of tokens; each item is converted with ``str``.
        min: Smallest n-gram size to generate.
        max: Exclusive upper bound on the n-gram size.

    Returns:
        list[str]: The joined n-grams.
    """
    # NOTE: ``min``/``max`` shadow the builtins inside this function; the
    # names are kept because callers may pass them as keywords.

    # Materialise once: ``words`` is traversed once per size, so a one-shot
    # iterator would otherwise be exhausted after the first size.
    tokens = list(words)

    joined = []
    for size in range(min, max):
        joined.extend(
            ' '.join(str(item) for item in gram)
            for gram in ngrams(tokens, size)
        )
    return joined
示例#4
0
 def trigrams(self):
     """
     Build the 3-grams of ``self.tokens``.

     :return: the result of ``ngrams(self.tokens, 3)``
     """
     window = 3
     return ngrams(self.tokens, window)
示例#5
0
# Needed library
from nltk.collocations import ngrams
from nltk.tokenize import word_tokenize
# Read the whole input file; the context manager closes the handle as soon
# as the contents are in memory (the original left it open).
with open('input.txt', 'r', encoding="utf-8") as infile:
    # Keep the file contents under the name `text`.
    text = infile.read()

# Split the text into word tokens.
words = word_tokenize(text)
# n=3 asks ngrams() for trigrams.
X = ngrams(words, 3)

# Iterate over X to print each trigram on its own line.
for a in X:
    print(a)
    def eval_text(path):
        """Print an emotion-expression summary for a text file.

        The file at ``path`` is read, newlines are replaced by spaces, the
        text is lower-cased and split into sentences with
        ``nltk.sent_tokenize``. Each sentence is tokenized with
        ``trivialTokenizer`` and scanned in three passes:

        * unigrams: every token found in ``emotion_lexicon`` counts once as
          an unnegated occurrence of each of its emotion types;
        * bigrams: ``negation + emotion`` moves that count to the negated
          side, ``(de)intensifier + emotion`` replaces it with the
          multiplier from ``INTENSIFIER_DICT``;
        * trigrams: a modifier two words before the emotion word corrects
          whatever the unigram/bigram passes recorded.

        Args:
            path: Path of the text file to analyse (opened with the default
                encoding).

        Returns:
            dict: emotion type -> ``(unnegated %, negated %)``, each value
            being that count's share of the weighted sum over all emotion
            types. Every percentage is ``0.0`` when that sum is zero.
        """
        # Read and normalise the text; the context manager closes the file.
        with open(path) as handle:
            contents = handle.read().replace('\n', ' ').lower()
        sentences = nltk.sent_tokenize(contents)

        bigram_keys = ['n', 'i', 'di']
        trigram_keys = [(first, second)
                        for first in ('n', 'i', 'di')
                        for second in ('x', 'i', 'n', 'di')]

        # emotion_type => (unnegated_count, negated_count)
        emotion_count = {emotion: (0, 0) for emotion in range(1, 9)}

        # Number of expressions per pattern: 'n' negation, 'i' intensifier,
        # 'di' deintensifier, 'x' stop word; tuple keys are trigram patterns.
        expression_count = {'total': 0, 'unigram': 0}
        for key in bigram_keys + trigram_keys:
            expression_count[key] = 0

        # Matched expressions per pattern, used for the "most common" output.
        example_expressions = {'unigram': []}
        for key in bigram_keys + trigram_keys:
            example_expressions[key] = []

        def modifier_class(word):
            """Return 'n', 'i' or 'x' for a negation/intensifier/stop word, else ''."""
            if word in NEGATIONS_SET:
                return 'n'
            if word in INTENSIFIER_DICT:
                return 'i'
            if word in STOP_WORDS:
                return 'x'
            return ''

        # main loop for extracting unigrams, bigrams and trigrams from each sentence
        token_count = 0
        sent_count = 0

        for sent in sentences:
            sent_count += 1
            tokens = trivialTokenizer(sent)

            # Get a total count of number of tokens in the text.
            token_count += len(tokens)

            for unigram in tokens:
                if unigram not in emotion_lexicon:
                    continue

                # 'total' is a running count of every emotion token seen.
                # The original copied expression_count['unigram'] here once
                # per sentence, but that counter is decremented by the
                # bigram/trigram passes of earlier sentences, so the total
                # came out too low for multi-sentence input.
                expression_count['total'] += 1
                expression_count['unigram'] += 1
                example_expressions['unigram'].append(unigram)

                for e in emotion_lexicon[unigram]:
                    (unnegated_count, negated_count) = emotion_count[e]
                    emotion_count[e] = (unnegated_count + 1, negated_count)

            for bigram in bigrams(tokens):
                if bigram[1] not in emotion_lexicon:
                    continue
                emotion_types = emotion_lexicon[bigram[1]]

                if bigram[0] in NEGATIONS_SET:
                    # Subtract from unigram since it was counted previously as unigram
                    expression_count['unigram'] -= 1
                    expression_count['n'] += 1
                    example_expressions['n'].append(' '.join(bigram))

                    for e in emotion_types:
                        (unnegated_count, negated_count) = emotion_count[e]
                        emotion_count[e] = (unnegated_count - 1,
                                            negated_count + 1)

                elif bigram[0] in INTENSIFIER_DICT:
                    # Subtract from unigram since it was counted previously as unigram
                    expression_count['unigram'] -= 1

                    multiplier = INTENSIFIER_DICT[bigram[0]]

                    # Count deintensifiers separately from intensifiers
                    kind = 'i' if multiplier > 1 else 'di'
                    expression_count[kind] += 1
                    example_expressions[kind].append(' '.join(bigram))

                    for e in emotion_types:
                        (unnegated_count, negated_count) = emotion_count[e]
                        emotion_count[e] = (unnegated_count - 1 +
                                            multiplier, negated_count)

            for trigram in ngrams(tokens, 3):
                if trigram[2] not in emotion_lexicon:
                    continue
                emotion_types = emotion_lexicon[trigram[2]]

                word_1 = modifier_class(trigram[0])
                word_2 = modifier_class(trigram[1])

                if word_1 == 'n' and word_2 == 'x':
                    expression_count[('n', 'x')] += 1
                    example_expressions[('n', 'x')].append(' '.join(trigram))
                    expression_count['unigram'] -= 1

                    for e in emotion_types:
                        (unnegated_count, negated_count) = emotion_count[e]
                        emotion_count[e] = (unnegated_count - 1,
                                            negated_count + 1)

                elif word_1 == 'i' and word_2 == 'x':
                    expression_count['unigram'] -= 1

                    multiplier = INTENSIFIER_DICT[trigram[0]]
                    key = ('i', 'x') if multiplier > 1 else ('di', 'x')
                    expression_count[key] += 1
                    example_expressions[key].append(' '.join(trigram))

                    for e in emotion_types:
                        # Subtract the previously counted unnegated unigram, then add the weighted value.
                        # The bigram was not counted since the second word is not a negation nor intensifier,
                        # so the unigram count was not adjusted in the bigram loop.
                        (unnegated_count, negated_count) = emotion_count[e]
                        emotion_count[e] = (unnegated_count - 1 +
                                            multiplier, negated_count)

                elif word_1 == 'n' and word_2 == 'n':
                    expression_count[('n', 'n')] += 1
                    example_expressions[('n', 'n')].append(' '.join(trigram))

                    expression_count['n'] -= 1

                    for e in emotion_types:
                        # Subtract the previous negated count as bigram.
                        (unnegated_count, negated_count) = emotion_count[e]
                        emotion_count[e] = (unnegated_count + 1,
                                            negated_count - 1)

                elif word_1 == 'i' and word_2 == 'n':
                    expression_count['n'] -= 1

                    multiplier = INTENSIFIER_DICT[trigram[0]]
                    key = ('i', 'n') if multiplier > 1 else ('di', 'n')
                    expression_count[key] += 1
                    example_expressions[key].append(' '.join(trigram))

                    for e in emotion_types:
                        (unnegated_count, negated_count) = emotion_count[e]
                        emotion_count[e] = (unnegated_count,
                                            negated_count - 1 + multiplier)

                elif word_1 == 'n' and word_2 == 'i':
                    multiplier = INTENSIFIER_DICT[trigram[1]]
                    kind_2 = 'i' if multiplier > 1 else 'di'

                    # Undo the bigram count under the same key the bigram
                    # pass incremented ('i' or 'di'); the original always
                    # decremented 'i'.
                    expression_count[kind_2] -= 1

                    expression_count[('n', kind_2)] += 1
                    example_expressions[('n', kind_2)].append(
                        ' '.join(trigram))

                    for e in emotion_types:
                        # Undo the previous count as a bigram with an intensifier, so subtract multiplier.
                        (unnegated_count, negated_count) = emotion_count[e]
                        emotion_count[e] = (unnegated_count - multiplier,
                                            negated_count + multiplier)

                elif word_1 == 'i' and word_2 == 'i':
                    multiplier_1 = INTENSIFIER_DICT[trigram[0]]
                    multiplier_2 = INTENSIFIER_DICT[trigram[1]]

                    type_1 = 'i' if multiplier_1 > 1 else 'di'
                    type_2 = 'i' if multiplier_2 > 1 else 'di'

                    # Undo the bigram count under the key the bigram pass
                    # actually used for the second word.
                    expression_count[type_2] -= 1

                    expression_count[(type_1, type_2)] += 1
                    example_expressions[(type_1,
                                         type_2)].append(' '.join(trigram))

                    # Since a multiplier of 1 is considered neutral, we need to determine the
                    # 'direction' of the intensifier, then multiply by the first intensifier
                    # to get the new 'distance' for the second intensifier (the first intensifier affects
                    # the second intensifier not the emotion word). The new 'vector' is then changed back
                    # to where 1 is neutral.
                    new_multiplier = multiplier_1 * (multiplier_2 - 1) + 1

                    for e in emotion_types:
                        # Undo the previous count by subtracting the original multiplier.
                        (unnegated_count, negated_count) = emotion_count[e]
                        emotion_count[e] = (unnegated_count -
                                            multiplier_2 + new_multiplier,
                                            negated_count)

        rule = '-----------------------------------------------------------------------------'

        print(rule)
        print('Summary for', path)
        print('')
        print('Sentence count:', sent_count)
        print('Total number of tokens:', token_count)

        print(rule)
        print('')
        print('Expression types:')
        print('')

        def get_examples(kind, num=2):
            """Format the ``num`` most frequent examples of ``kind`` as 'text (count)'."""
            freq = FreqDist(example_expressions[kind])
            return ', '.join('{} ({})'.format(word, word_c)
                             for (word, word_c) in freq.most_common(num))

        print('Total emotion expressions:', expression_count['total'])
        print('')

        print('Unigram count:', expression_count['unigram'])
        print('Most common: {}'.format(get_examples('unigram', 5)))
        print('')

        print(
            'Bigram total count:', expression_count['n'] +
            expression_count['i'] + expression_count['di'])
        print('Negation      + emotion: {:5.0f}\t{:}'.format(
            expression_count['n'], get_examples('n', 3)))
        print('Intensifier   + emotion: {:5.0f}\t{:}'.format(
            expression_count['i'], get_examples('i', 3)))
        print('Deintensifier + emotion: {:5.0f}\t{:}'.format(
            expression_count['di'], get_examples('di', 3)))
        print('')

        trigram_total = sum(expression_count[k] for k in trigram_keys)
        print('Trigram total count:', trigram_total)

        # (pattern key, label) in the order the report lists them.
        trigram_rows = [
            (('n', 'x'), 'Negation      + stop word     + emotion'),
            (('n', 'n'), 'Negation      + negation      + emotion'),
            (('n', 'i'), 'Negation      + intensifier   + emotion'),
            (('n', 'di'), 'Negation      + deintensifier + emotion'),
            (('i', 'x'), 'Intensifier   + stop word     + emotion'),
            (('i', 'n'), 'Intensifier   + negation      + emotion'),
            (('i', 'i'), 'Intensifier   + intensifier   + emotion'),
            (('i', 'di'), 'Intensifier   + deintensifier + emotion'),
            (('di', 'x'), 'Deintensifier + stop word     + emotion'),
            (('di', 'n'), 'Deintensifier + negation      + emotion'),
            (('di', 'i'), 'Deintensifier + intensifier   + emotion'),
            (('di', 'di'), 'Deintensifier + deintensifier + emotion'),
        ]
        for (key, label) in trigram_rows:
            print('{}: {:4.0f}\t {:}'.format(label, expression_count[key],
                                             get_examples(key)))

        print(rule)
        print('')
        print('Emotion percentages:')
        print('')

        total = sum(v[0] + v[1] for v in emotion_count.values())

        def percent(value):
            """Share of ``value`` in ``total`` as a percentage; 0.0 when total is zero."""
            # Guard: a text without any emotion word has total == 0, which
            # previously raised ZeroDivisionError.
            return value / total * 100 if total else 0.0

        print("Total weighted sum:", total)

        print('')
        print('emotion (count): %               not emotion (count): %')
        print('')
        percentage_dict = {}
        for (k, v) in emotion_count.items():
            percentage_dict[k] = (percent(v[0]), percent(v[1]))
            print("{:12} ({:.0f}): {:.1f}% \t not {:12} ({:.0f}): {:.1f}%".
                  format(emotion_name[k], v[0], percent(v[0]),
                         emotion_name[k], v[1], percent(v[1])))

        print('')

        print('Emotion percentages grouped by positive/neutral/negative')
        print('')

        # Emotion type keys that make up each side of the grouped report.
        groups = [
            ("positive", [5, 8]),
            ("neutral", [2, 7]),
            ("negative", [1, 3, 4, 6]),
        ]
        for (label, keys) in groups:
            unnegated = sum(emotion_count[k][0] for k in keys)
            negated = sum(emotion_count[k][1] for k in keys)
            print("{:12} ({:.0f}): {:.1f}% \t negated {:12} ({:.0f}): {:.1f}%".
                  format(label, unnegated, percent(unnegated),
                         label, negated, percent(negated)))

        return percentage_dict