Пример #1
0
def ngram_collocation(words, sents, n, support=10, topK=200):
    """Return the ``topK`` best n-gram collocations of ``words`` ranked by PMI.

    For ``n == 2`` a bigram finder is used and for ``n == 3`` a trigram
    finder.  For ``n >= 4`` the best trigrams are computed first and then
    handed to ``NgramCollocationExtender`` together with ``sents``.

    Args:
        words: Token sequence the collocation finder is built from.
        sents: Sentences forwarded to ``NgramCollocationExtender``; only used
            when ``n >= 4``.
        n: Requested n-gram order; must be 2, 3, or at least 4.
        support: Minimum frequency passed to ``apply_freq_filter``.
        topK: Number of best-scoring n-grams requested from ``nbest``.

    Returns:
        The list returned by ``finder.nbest`` for ``n`` of 2 or 3, otherwise
        the value returned by ``NgramCollocationExtender``.  The result is
        also passed to ``print_ngrams`` before being returned.

    Raises:
        ValueError: If ``n`` is not a supported order (for example 0 or 1).
    """
    if n == 2:
        finder = BigramCollocationFinder.from_words(words)
        ngram_measures = BigramAssocMeasures()
    elif n == 3 or n >= 4:
        # Orders above three start from trigrams and are extended below.
        finder = TrigramCollocationFinder.from_words(words)
        ngram_measures = TrigramAssocMeasures()
    else:
        # Previously this case fell through with ``finder`` unbound.
        raise ValueError(
            'ngram_collocation requires n >= 2, got {!r}'.format(n))

    finder.apply_freq_filter(support)
    # The collocation measure is PMI.
    ngrams = finder.nbest(ngram_measures.pmi, topK)

    if n >= 4:
        # NOTE(review): ``support / 3`` is a float under Python 3 and an int
        # under Python 2 -- confirm which one the extender expects.
        ngrams = NgramCollocationExtender(ngrams, sents, support / 3, 0.3)

    print_ngrams(ngrams)
    return ngrams
Пример #2
0
def getNameScore(name, data):
    """Score ``name`` from character-trigram frequencies and its length.

    A newline terminator is appended to ``name``; every window of three
    consecutive characters is then scored with ``raw_freq`` through
    ``data.colloc_finder.score_ngram`` and the scores are multiplied.

    Args:
        name: The string to score.
        data: Object providing ``colloc_finder``, ``base_frequency``,
            ``name_len_probabilities`` (a mapping keyed by length),
            ``base_name_len_probability`` and ``name_probability``.

    Returns:
        The product of the trigram scores, the length probability and
        ``data.name_probability``.
    """
    name = name + '\n'
    trigram_measures = TrigramAssocMeasures()

    score = 1
    for i in range(len(name) - 2):
        trigram_score = data.colloc_finder.score_ngram(
            trigram_measures.raw_freq, char_with_type(name[i]),
            char_with_type(name[i + 1]), char_with_type(name[i + 2]))
        if trigram_score is None:
            # Trigram not scored by the finder: fall back to the base
            # frequency.
            trigram_score = data.base_frequency
        score = score * trigram_score

    # ``dict.has_key`` was removed in Python 3; ``get`` works everywhere.
    # The length used here includes the appended terminator.
    name_len_score = data.name_len_probabilities.get(
        len(name), data.base_name_len_probability)

    return score * name_len_score * data.name_probability
Пример #3
0
def collocation(inp, outp, freq_filter, results, coll_type, pos):
    """Write the best PMI-ranked collocations of a text file to ``outp``.

    Args:
        inp: Path of the input text file.
        outp: Path of the output file; one tab-separated n-gram per line.
        freq_filter: Minimum n-gram frequency (converted with ``int``).
        results: Number of collocations to write (converted with ``int``).
        coll_type: ``'bigram'`` selects bigrams; any other value selects
            trigrams.
        pos: The string ``'true'`` marks the input as space-separated
            ``word/TAG`` items; anything else means plain text that is
            sentence- and word-tokenized with NLTK.

    Raises:
        ValueError: In ``pos`` mode, if an item other than ``'\\n'`` has no
            ``'/'`` separator.
    """
    pos = bool(pos == 'true')
    with open(inp, 'r') as fd:
        raw_text = fd.read()

    if pos:
        # Drop the piece after the final space, then strip the "/TAG" part.
        text = raw_text.split(' ')[:-1]
        all_words = [x[0:x.index('/')] if x != '\n' else x for x in text]
        all_words = [x.strip(' ').strip('\n') for x in all_words]
    else:
        all_words = []
        for sent in nltk.sent_tokenize(raw_text):
            all_words += nltk.word_tokenize(sent)

    if coll_type == 'bigram':
        measures = BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(all_words)
    else:
        measures = TrigramAssocMeasures()
        finder = TrigramCollocationFinder.from_words(all_words)
    finder.apply_freq_filter(int(freq_filter))

    # Score the n-grams and keep the first N.
    colls = finder.score_ngrams(measures.pmi)[:int(results)]
    with open(outp, 'w') as output:
        for ngram, _score in colls:
            # Join whatever length the n-gram has; the former two-element
            # unpacking failed for every trigram.
            output.write('\t'.join(ngram) + '\n')
 def extract_trigrams(self, sent):
     """Return ``{trigram: True}`` for each known trigram found in ``sent``.

     The sentence is run through ``self._preprocess_sent``, up to 10000
     PMI-ranked trigrams are taken from it, each is joined with single
     spaces, and only those also present in ``self._trigrams_set`` are kept.
     """
     tokens = self._preprocess_sent(sent)
     finder = TrigramCollocationFinder.from_words(tokens)
     best = finder.nbest(TrigramAssocMeasures().pmi, 10000)
     known = {' '.join(gram) for gram in best} & self._trigrams_set
     return dict.fromkeys(known, True)
Пример #5
0
def get_top_trigrams(corpus, top_n=100):
    """Return the ``top_n`` trigrams of ``corpus`` ranked by raw frequency.

    Args:
        corpus: Iterable of strings; each one is split on whitespace and
            treated as a separate document.
        top_n: Number of trigrams requested from ``nbest``.
    """
    documents = [item.split() for item in corpus]
    collocation_finder = TrigramCollocationFinder.from_documents(documents)
    frequency_score = TrigramAssocMeasures().raw_freq
    return collocation_finder.nbest(frequency_score, top_n)
Пример #6
0
def retrieve_top_trigrams_collocations(corpus, top=5, measure='pmi'):
    """Return the ``top`` trigram collocations of ``corpus``.

    Args:
        corpus: Iterable of strings; each one is split on whitespace and
            treated as a separate document.
        top: Number of trigrams requested from ``nbest``.
        measure: ``'pmi'`` or ``'frequency'`` (raw frequency).

    Raises:
        ValueError: If ``measure`` is neither ``'pmi'`` nor ``'frequency'``.
    """
    documents = [item.split() for item in corpus]
    finder = TrigramCollocationFinder.from_documents(documents)
    trigram_measures = TrigramAssocMeasures()

    if measure not in ('pmi', 'frequency'):
        raise ValueError('Type of measure is unknown!')

    if measure == 'pmi':
        score_fn = trigram_measures.pmi
    else:
        score_fn = trigram_measures.raw_freq
    return finder.nbest(score_fn, top)
Пример #7
0
def common_collocations(text, occurences=20):
    """Return the best bigrams followed by the best trigrams of ``text``.

    The text is tokenized with ``word_tokenize``.  For both orders, tokens
    shorter than two characters are filtered out and the n-grams are ranked
    with the Student's t measure.

    Args:
        text: Raw text to analyse.
        occurences: Number of n-grams requested per order.

    Returns:
        A list of space-joined n-gram strings, bigrams first.
    """
    tokens = word_tokenize(text)
    configurations = (
        (BigramAssocMeasures(), BigramCollocationFinder, 2),
        (TrigramAssocMeasures(), TrigramCollocationFinder, 3),
    )

    collected = []
    for assoc_measures, finder_cls, width in configurations:
        finder = finder_cls.from_words(tokens, window_size=width)
        finder.apply_word_filter(lambda w: len(w) < 2)
        finder.apply_freq_filter(1)
        for gram in finder.nbest(assoc_measures.student_t, occurences):
            collected.append(" ".join(gram))
    return collected
Пример #8
0
def save_trigrams(tokenized_docs, shouldWriteToFile=False):
    """Compute trigram counts and likelihood-ratio scores, optionally saving them.

    Args:
        tokenized_docs: Input forwarded to ``create_trigram_finder``.
        shouldWriteToFile: When true, the counts are written to
            ``./output/trigrams_counts.csv`` and the scores to
            ``./output/trigrams_lr_scores.csv`` via ``lines_to_file``.

    Returns:
        None.
    """
    trigrams_finder = create_trigram_finder(tokenized_docs)
    trigram_measures = TrigramAssocMeasures()
    trigrams_scores = trigrams_finder.score_ngrams(
        trigram_measures.likelihood_ratio)

    # Counts are integers, so ``%d`` is exact here.
    trigrams_counts = [
        '%s_%s_%s,%d\n' % (w1, w2, w3, count)
        for (w1, w2, w3), count in trigrams_finder.ngram_fd.most_common()]

    # Likelihood-ratio scores are floats; the former ``%d`` truncated them to
    # whole numbers, so write the full value instead.
    trigrams_scores_as_str = [
        '%s_%s_%s,%s\n' % (w1, w2, w3, score)
        for (w1, w2, w3), score in trigrams_scores]

    if shouldWriteToFile:
        with open('./output/trigrams_counts.csv', "w", encoding="utf8") as fout:
            lines_to_file(trigrams_counts, fout)
        with open('./output/trigrams_lr_scores.csv', "w", encoding="utf8") as fout:
            lines_to_file(trigrams_scores_as_str, fout)
def create_wordCloud_dict_trigrams(text_content, bad_trigrams):
    """Map each space-joined trigram of ``text_content`` to its raw frequency.

    Args:
        text_content: Token sequence the trigram finder is built from.
        bad_trigrams: Iterable of space-joined trigram strings to leave out
            of the result; entries that do not occur are ignored.

    Returns:
        dict mapping ``'w1 w2 w3'`` to the trigram's ``raw_freq`` score.
    """
    finder = TrigramCollocationFinder.from_words(text_content)
    trigram_measures = TrigramAssocMeasures()
    scored = finder.score_ngrams(trigram_measures.raw_freq)

    word_dict = {' '.join(trigram): score for trigram, score in scored}

    # The loop variable used to be named ``bad_bigram`` while the body read
    # ``bad_trigram``, which raised NameError for any non-empty filter list.
    for bad_trigram in bad_trigrams:
        word_dict.pop(bad_trigram, None)
    return word_dict
def compute_collocation(corpora_dir: str, session: int, party: str,
                        num_chunks: int, bigram_out_path: str,
                        trigram_out_path: str, discard_tokens: Set[str],
                        stop_words: Set[str], min_frequency: int) -> None:
    """
    Count frequent bigrams and trigrams over a chunked corpus and write them
    to two tab-separated files (``<count>`` then the space-joined n-gram).

    Chunks are read from ``<corpora_dir>/<session>_<party><chunk_index>.txt``
    for ``chunk_index`` in ``range(num_chunks)``.  Tokens that are in
    ``discard_tokens`` or consist only of digits are dropped and the rest are
    lowercased.  With ``num_chunks == 0`` both output files are written empty.

    discard_tokens should be a subset of stop_words. This is used for
    a heuristic to filter trigrams, where the second word is permitted
    to be a stop word (e.g. "freedom of speech") but not a discarded token
    (e.g. "I yield to"). The first and third words can never be a stop word.
    """
    tokenized_corpus: List[str] = []
    for chunk_index in range(num_chunks):
        corpus_path = os.path.join(corpora_dir,
                                   f'{session}_{party}{chunk_index}.txt')
        with open(corpus_path) as corpus_file:
            raw_text = corpus_file.read()
        # Extending from a generator avoids keeping a per-chunk list alive,
        # replacing the former ``del tokens`` that failed when no chunk was
        # read.
        # NOTE(review): membership in discard_tokens is tested before
        # lowercasing -- confirm that is the intended casing.
        tokenized_corpus.extend(
            t.lower() for t in nltk.tokenize.word_tokenize(raw_text)
            if t not in discard_tokens and not t.isdigit())

    bigram_finder = BigramCollocationFinder.from_words(tokenized_corpus)
    bigram_finder.apply_freq_filter(min_frequency)
    bigram_finder.apply_word_filter(lambda word: word in stop_words)
    bigrams = bigram_finder.score_ngrams(BigramAssocMeasures().raw_freq)

    trigram_finder = TrigramCollocationFinder.from_words(tokenized_corpus)
    trigram_finder.apply_freq_filter(min_frequency)
    trigram_finder.apply_ngram_filter(lambda w1, w2, w3: (
        w1 in stop_words) or (w3 in stop_words) or (w2 in discard_tokens))
    trigrams = trigram_finder.score_ngrams(TrigramAssocMeasures().raw_freq)

    num_tokens = len(tokenized_corpus)
    _write_ngram_counts(bigram_out_path, bigrams, num_tokens)
    _write_ngram_counts(trigram_out_path, trigrams, num_tokens)


def _write_ngram_counts(out_path: str, scored_ngrams, num_tokens: int) -> None:
    """Write one ``count<TAB>ngram`` line per scored n-gram to ``out_path``."""
    with open(out_path, 'w') as out_file:
        for ngram, relative_freq in scored_ngrams:
            # Scale the relative frequency back to an absolute count.
            absolute_freq = relative_freq * num_tokens
            ngram_str = ' '.join(ngram)
            out_file.write(f'{absolute_freq:.0f}\t{ngram_str}\n')
Пример #11
0
    def ngram_analyze(self, lst, model="student_t"):
        """
        Score the trigrams of a word list and return the three best.

        Documentation for the association measures:
        http://www.nltk.org/_modules/nltk/metrics/association.html

        The input is passed through ``self.nlp``, its items are joined with
        spaces and re-tokenized, and every trigram, e.g. (a, b, c),
        (b, c, d), ..., is scored with the chosen measure.

        Args:
        -----
        lst : a list of words
        model : the chosen measure (student_t, chi_sq, mi_like, pmi, jaccard)

        Returns:
        --------
        Up to three (trigram, score) pairs with the highest scores.  An
        unknown ``model`` prints "Not valid model!" and yields an empty list.
        """
        processed = self.nlp(lst)
        words = nltk.word_tokenize(" ".join(map(str, processed)))

        measures = TrigramAssocMeasures()
        finder = TrigramCollocationFinder.from_words(words)

        available = (
            ("student_t", measures.student_t),
            ("chi_sq", measures.chi_sq),
            ("mi_like", measures.mi_like),
            ("pmi", measures.pmi),
            ("jaccard", measures.jaccard),
        )
        score_fn = next((fn for name, fn in available if model == name), None)

        if score_fn is None:
            print("Not valid model!")
            scores = []
        else:
            scores = finder.score_ngrams(score_fn)

        ranked = sorted(scores, key=lambda pair: pair[1], reverse=True)
        return ranked[:3]
Пример #12
0
def Collocation(contents, n):
    """Score every n-gram of ``contents`` by raw frequency.

    Args:
        contents: Token sequence the collocation finder is built from.
        n: N-gram order; 2, 3 and 4 are supported.

    Returns:
        The list produced by ``finder.score_ngrams`` with the ``raw_freq``
        measure of the matching order.

    Raises:
        ValueError: If ``n`` is not 2, 3 or 4.  Previously a message was
            printed and the function then failed on an unbound variable.
    """
    if n not in (2, 3, 4):
        raise ValueError("Collocation is only available for n=2, 3, or 4.")

    from nltk.collocations import (
        BigramAssocMeasures,
        BigramCollocationFinder,
        QuadgramAssocMeasures,
        QuadgramCollocationFinder,
        TrigramAssocMeasures,
        TrigramCollocationFinder,
    )

    if n == 2:
        measures = BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(contents)
    elif n == 3:
        measures = TrigramAssocMeasures()
        finder = TrigramCollocationFinder.from_words(contents)
    else:
        measures = QuadgramAssocMeasures()
        finder = QuadgramCollocationFinder.from_words(contents)

    return finder.score_ngrams(measures.raw_freq)
Пример #13
0
def x_trigrams(tokens, x):
    '''
    Return the x best trigrams of tokens, ranked by PMI.

    Parameters
    ----------
    tokens: A list of strings
    x: An integer, the number of trigrams to return


    Returns
    -------
    A list of tuples, with the tuples being of the
    form (str, str, str).

    '''
    # Build the trigram finder first, then ask it for the x best by PMI.
    collocation_finder = TrigramCollocationFinder.from_words(tokens)
    pmi_score = TrigramAssocMeasures().pmi
    return collocation_finder.nbest(pmi_score, x)
def trigram_collocation_finder(tokens,window_size = 3):
    '''Return trigram collocations scored by raw frequency.

    Parameters
    ----------
    tokens: a list of tokens, or a list of sentences that are lists of tokens
    window_size: the window size of the collocation, by default 3; it is
        only applied to a flat token list, not to a list of sentences

    Returns
    -------
    list of (trigram, raw frequency) tuples as produced by ``score_ngrams``

    '''
    trigram_measures = TrigramAssocMeasures()

    # Check the length before peeking at tokens[0]; an empty input used to
    # raise IndexError here.
    if len(tokens) > 0 and isinstance(tokens[0], list):
        # todo how to measure the window size here
        finder = TrigramCollocationFinder.from_documents(tokens)
    else:
        finder = TrigramCollocationFinder.from_words(
            tokens, window_size=window_size)

    return finder.score_ngrams(trigram_measures.raw_freq)
Пример #15
0
 def top_trigrams(self, tokens):
     """Return the ``self.num_trigrams`` best trigrams of ``tokens`` by PMI.

     Trigrams occurring fewer than
     ``int(self.trigrams_pct_words * len(tokens))`` times are filtered out
     before ranking.
     """
     finder = TrigramCollocationFinder.from_words(tokens)
     pmi_score = TrigramAssocMeasures().pmi
     min_count = int(self.trigrams_pct_words * len(tokens))
     finder.apply_freq_filter(min_count)
     return finder.nbest(pmi_score, self.num_trigrams)
Пример #16
0
 def collocations(self, top, freq=None):
     """Return ``(best_bigrams, best_trigrams)``, each ranked by PMI.

     Args:
         top: Number of n-grams requested from each finder.
         freq: Optional minimum frequency; when truthy it is applied to
             both ``self._bigram_finder`` and ``self._trigram_finder``
             before ranking.
     """
     if freq:
         for finder in (self._bigram_finder, self._trigram_finder):
             finder.apply_freq_filter(freq)
     best_bigrams = self._bigram_finder.nbest(BigramAssocMeasures().pmi, top)
     best_trigrams = self._trigram_finder.nbest(TrigramAssocMeasures().pmi, top)
     return (best_bigrams, best_trigrams)
Пример #17
0
# Tasks 1-5: append the trigram frequency table to the results file.
# The context manager closes the file even if a write fails.
with open('res/task_1-5.tsv', 'a') as file_1:
    file_1.write('3-грамма\tчастота\n')
    for trigram, freq in count_frequency(trigrams=tri_grams):
        file_1.write(' '.join(trigram) + '\t' + str(freq) + '\n')

# Task 6: keep the first 30 entries of the hand-written association ranking.
my_evaluation = evaluate_association(
    trigrams=list(tri_grams),
    num_of_tokens=len(tokens_without_stop_words))[:30]

# Task 7: rank trigrams with NLTK's Student's t measure for comparison.
trigram_measures = TrigramAssocMeasures()

with open('Text.txt', 'r') as text_file:
    tokens_2 = word_tokenize(text_file.read(), 'russian', True)

text_2 = Text(tokens_2)
finder_thr_1 = TrigramCollocationFinder.from_words(text_2)
evaluation_with_punctuation = finder_thr_1.nbest(
    trigram_measures.student_t, 30)

with open('res/task_7_with_p.tsv', 'a') as file_2:
    file_2.write('Мои 3-граммы\tNLTK 3-граммы\n')
    # ``nbest`` yields bare trigram tuples (no score), so each one is joined
    # directly; indexing it with [0] would join the characters of its first
    # word.  ``zip`` stops at the shorter list instead of assuming 30 rows.
    # Assumes ``evaluate_association`` yields (trigram, score) pairs, as the
    # original ``[0]`` indexing implies -- TODO confirm.
    for my_item, nltk_trigram in zip(my_evaluation,
                                     evaluation_with_punctuation):
        file_2.write(' '.join(my_item[0]) + '\t' +
                     ' '.join(nltk_trigram) + '\n')
# Bigram collocations: every item of ``norm_alice`` is split on whitespace
# and treated as one document.
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures

finder = BigramCollocationFinder.from_documents(
    [item.split() for item in norm_alice])
bigram_measures = BigramAssocMeasures()
# The two rankings below (raw frequency, then PMI) are evaluated as bare
# expressions; their results are not stored.
finder.nbest(bigram_measures.raw_freq, 10)
finder.nbest(bigram_measures.pmi, 10)

# Trigram collocations over the same documents; ``finder`` is rebound here.
from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import TrigramAssocMeasures

finder = TrigramCollocationFinder.from_documents(
    [item.split() for item in norm_alice])
trigram_measures = TrigramAssocMeasures()
# As above, the ten best trigrams by raw frequency and by PMI are computed
# but not stored.
finder.nbest(trigram_measures.raw_freq, 10)
finder.nbest(trigram_measures.pmi, 10)

toy_text = """
Elephants are large mammals of the family Elephantidae 
and the order Proboscidea. Two species are traditionally recognised, 
the African elephant and the Asian elephant. Elephants are scattered 
throughout sub-Saharan Africa, South Asia, and Southeast Asia. Male 
African elephants are the largest extant terrestrial animals. All 
elephants have a long trunk used for many purposes, 
particularly breathing, lifting water and grasping objects. Their 
incisors grow into tusks, which can serve as weapons and as tools 
for moving objects and digging. Elephants' large ear flaps help 
to control their body temperature. Their pillar-like legs can 
carry their great weight. African elephants have larger ears 
Пример #19
0
def get_topic_data(product, df, final_results, input_text, load_path, encoding_type):
    """Build a per-topic summary table for one product's saved LDA model.

    Args:
        product: Value matched against ``df['ProductId']`` and used as the
            row label in ``final_results``.
        df: DataFrame of reviews.  The code reads the columns ``ProductId``,
            ``input_text``, ``clean_review``, ``clean_vanilla_x`` and the
            ``'<encoding_type> Topic'``, ``'<encoding_type> Subtopic'``,
            ``'<encoding_type> Fit'`` and ``'<encoding_type> Subtopic Fit'``
            columns.
        final_results: DataFrame indexed by product with the columns
            ``num_topics``, ``passes``, ``top_n removed`` and
            ``n_above threshold``; their values select the model file.
        input_text: Name of the column whose whitespace-split text forms the
            corpus.
        load_path: Directory component inserted into the model path.
        encoding_type: Prefix of the topic and fit column names.

    Returns:
        pandas.DataFrame with one row per topic and the columns ``product``,
        ``topic``, ``as_main_topic``, ``as_primary_subtopic``,
        ``as_secondary_subtopic``, ``topic_coherence``, ``top_words``,
        ``best_review``, ``best_review_fit``, ``top_bigrams_pmi``,
        ``top_bigrams_freq``, ``top_trigrams_pmi`` and ``top_trigrams_freq``.
    """
    data = df[df['ProductId']==product]
    # prepare the corpus
    texts = data[input_text].str.split()
    dictionary = corpora.Dictionary(texts)
    remove_freq(dictionary, 10)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # Load in the tuned LDA model for the product
    t = final_results.loc[product, 'num_topics']
    p = final_results.loc[product, 'passes']
    tn = final_results.loc[product, 'top_n removed'].astype(int)
    na = final_results.loc[product, 'n_above threshold']

    lda = gensim.models.ldamodel.LdaModel.load(
        '../models/{}/final_models/{}_{}_{}_{}_{}'.format(load_path,
                                                          product,
                                                          t, p,
                                                          tn, na))
    topic_data=[]
    # be sure to set the appropriate coherence measure
    topics = lda.top_topics(texts=texts, corpus=corpus, coherence='c_v')

    # Column names depend only on encoding_type, so build them once.
    topic_col = '{} Topic'.format(encoding_type)
    subtopic_col = '{} Subtopic'.format(encoding_type)
    fit_col = '{} Fit'.format(encoding_type)
    subtopic_fit_col = '{} Subtopic Fit'.format(encoding_type)

    # Tokens that must never take part in a reported n-gram.
    unwanted_words = ('GOODREVIEW', 'BADREVIEW',
                      'VGOODREVIEW', 'VBADREVIEW',
                      's', 'b', 'c', 'oz', 'be')

    # iterate through the topics to get coherence, top review, key words, and bigrams
    for topic in range(0,t):
        # sub dataframe where this is the main topic
        main_topic_df = data[data[topic_col]==topic]
        # sub dataframe where this is a subtopic
        sub_topic_df = data[data[subtopic_col]==topic]
        # grab the coherence measure
        # NOTE(review): gensim's top_topics appears to return topics ordered
        # by coherence rather than by topic id, so this index may pair the
        # topic with another topic's score -- confirm.
        coherence = (topics[topic][-1])
        # Make a list of the top words from the topic
        word_weights = lda.show_topic(topic, topn=10)
        # And then reformat this into a usable list
        top_words = [x[0] for x in word_weights]
        # Get the number of reviews fitting into the topic ...
        # as the main topic, with a fit value above 0.7
        as_main = len(main_topic_df.loc[main_topic_df[fit_col]>=0.7])
        # as the primary subtopic
        as_primary_sub = len(main_topic_df.loc[(main_topic_df[fit_col]<0.7)&
                                               (main_topic_df[fit_col]>=0.3)])
        as_secondary_sub = len(sub_topic_df.loc[sub_topic_df[subtopic_fit_col]>=0.3])
        try:
            # Get an index locator for the best fitting review
            ix = main_topic_df[fit_col].idxmax(axis=0)
            # Find the review that best matches the topic
            top_review = main_topic_df.loc[ix, 'clean_review']
            # Get that best review's fit value (probability review comes from topic)
            fit = main_topic_df[fit_col].max(axis=0)
            # Getting the bigrams
            bigram_measures = BigramAssocMeasures()
            trigram_measures = TrigramAssocMeasures()

            # Flatten the tokens of every review tagged to this topic.  A
            # plain comprehension replaces np.concatenate(np.array(...)):
            # reviews differ in length, and recent NumPy raises ValueError
            # for such ragged input, which the handler below would silently
            # report as "no reviews".
            words = [word
                     for review in main_topic_df['clean_vanilla_x'].values
                     for word in word_tokenize(review)]

            bfinder = BigramCollocationFinder.from_words(words, window_size=3)
            tfinder = TrigramCollocationFinder.from_words(words, window_size=4)
            for finder in [bfinder, tfinder]:
                # Get rid of words we don't want
                finder.apply_word_filter(lambda w: w in unwanted_words)

                # Filter out n-grams that don't appear at least 2 times
                finder.apply_freq_filter(2)

            # Filter out some common n-grams
            bfinder.apply_ngram_filter(lambda w1, w2: (w1, w2) in bigrams_filter)
            tfinder.apply_ngram_filter(lambda w1, w2, w3: (w1, w2, w3) in trigrams_filter)
            # Get the top 10 bigrams and trigrams by raw frequency and by PMI value
            bgrams_pmi = bfinder.nbest(bigram_measures.pmi, 10)
            bgrams_freq = bfinder.nbest(bigram_measures.raw_freq, 10)
            tgrams_pmi = tfinder.nbest(trigram_measures.pmi, 10)
            tgrams_freq = tfinder.nbest(trigram_measures.raw_freq, 10)
            # Format a bit more nicely for readability
            # NOTE(review): the frequency lists skip their first two entries;
            # the reason is not visible here -- confirm it is intended.
            top_bigrams_pmi = [a[0]+" "+a[1] for a in bgrams_pmi]
            top_bigrams_freq = [a[0]+" "+a[1] for a in bgrams_freq[2:]]
            top_trigrams_pmi = [a[0]+" "+a[1]+" "+a[2] for a in tgrams_pmi]
            top_trigrams_freq = [a[0]+" "+a[1]+" "+a[2] for a in tgrams_freq[2:]]

        except ValueError:
            # ValueError in this case indicates there were no reviews that were matched to the topic
            # hence the results will be blank for that
            top_review = 'none'
            fit = ''
            top_bigrams_pmi = []
            top_trigrams_pmi = []
            top_bigrams_freq = []
            top_trigrams_freq = []

        topic_data.append([product, topic, as_main,
                           as_primary_sub, as_secondary_sub,
                           coherence, top_words,
                           top_review, fit, top_bigrams_pmi,
                           top_bigrams_freq, top_trigrams_pmi,
                           top_trigrams_freq])

    topic_data=pd.DataFrame(data=topic_data,
                            columns=['product', 'topic', 'as_main_topic',
                                     'as_primary_subtopic', 'as_secondary_subtopic',
                                    'topic_coherence', 'top_words',
                                    'best_review', 'best_review_fit',
                                    'top_bigrams_pmi', 'top_bigrams_freq',
                                    'top_trigrams_pmi', 'top_trigrams_freq'])
    return topic_data