Example #1
 def _chiSq(self, depGraphList, word1, word2):
     bigram_measures = BigramAssocMeasures()

     firstTuple = word1
     secondTuple = word2
     depGraphList = depGraphList[0]
     depLength = len(depGraphList)

     # n11: dependency pairs containing both words
     cnt1 = sum(1 for dep in depGraphList
                if firstTuple in (dep[0], dep[1])
                and secondTuple in (dep[0], dep[1]))
     # n12: pairs containing word1 but not word2
     cnt2 = sum(1 for dep in depGraphList
                if firstTuple in (dep[0], dep[1])) - cnt1
     # n21: pairs containing word2 but not word1
     cnt3 = sum(1 for dep in depGraphList
                if secondTuple in (dep[0], dep[1])) - cnt1
     # n22: pairs containing neither word
     cnt4 = depLength - cnt1 - cnt2 - cnt3
     # marginals of the 2x2 contingency table
     n1p = cnt1 + cnt2  # n11 + n12
     n2p = cnt3 + cnt4  # n21 + n22
     np1 = cnt1 + cnt3  # n11 + n21
     np2 = cnt2 + cnt4  # n12 + n22
     # chi-square test: X^2 = N(n11*n22 - n12*n21)^2 / (n1. * n2. * n.1 * n.2)
     x2 = float(bigram_measures.chi_sq(cnt1, (np1, n1p), depLength))
     return abs(x2)
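For reference, a minimal standalone sketch of the same idea: the contingency counts are built from a made-up dependency-pair list and fed to NLTK's chi_sq, which takes the joint count, the two marginals, and the grand total.

from nltk.metrics import BigramAssocMeasures

# hypothetical (head, dependent) pairs
deps = [('eat', 'pizza'), ('eat', 'pasta'), ('cook', 'pizza'),
        ('eat', 'pizza'), ('drink', 'water')]

n11 = sum(1 for h, d in deps if 'eat' in (h, d) and 'pizza' in (h, d))  # both words
n1p = sum(1 for h, d in deps if 'eat' in (h, d))    # marginal for word1
np1 = sum(1 for h, d in deps if 'pizza' in (h, d))  # marginal for word2

# chi_sq(n_ii, (n_ix, n_xi), n_xx)
print(BigramAssocMeasures.chi_sq(n11, (n1p, np1), len(deps)))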
Example #2
def generate_least_frequent_words_wordcloud(title, text_content):
    finder = BigramCollocationFinder.from_words(text_content)
    bigram_measures = BigramAssocMeasures()
    scored = finder.score_ngrams(bigram_measures.raw_freq)
    scoredList = sorted(scored, key=itemgetter(1))
    scoredListLen = len(scoredList) - 1
    maxLenCnt = 0
    MINSCORE = 0.000265
    indx = 0
    while (indx < scoredListLen) and (scoredList[indx][1] < MINSCORE):
        indx += 1
    word_dict2 = {}
    while (indx < scoredListLen) and (maxLenCnt < WC_max_words):
        word_dict2['_'.join(scoredList[indx][0])] = scoredList[indx][1]
        indx += 1
        maxLenCnt += 1

    wordCloud = WordCloud(max_words=WC_max_words,
                          height=WC_height,
                          width=WC_width)
    if len(word_dict2) > 0:
        wordCloud.generate_from_frequencies(word_dict2)
        plt.title(slugify(title))
        plt.imshow(wordCloud, interpolation='bilinear')
        plt.axis("off")
        wordCloud.to_file("bigram/least_frequent_words/" + slugify(title) +
                          ".png")
Example #3
def ngram_collocation(words, sents, n, support=10, topK=200):

    if n >= 4:
        finder = TrigramCollocationFinder.from_words(words)
        ngram_measures = TrigramAssocMeasures()
        finder.apply_freq_filter(support)
        pmi_ngrams = finder.nbest(ngram_measures.pmi, topK)
        ext_ngrams = NgramCollocationExtender(pmi_ngrams, sents, support / 3,
                                              0.3)
        print_ngrams(ext_ngrams)
        return ext_ngrams
        #pmi_ngrams = NgramCollocationFinder(words, 2, lowFreq, topK)
        #the current collocation measure is PMI
    else:
        if n == 2:
            finder = BigramCollocationFinder.from_words(words)
            ngram_measures = BigramAssocMeasures()
        if n == 3:
            finder = TrigramCollocationFinder.from_words(words)
            ngram_measures = TrigramAssocMeasures()

        finder.apply_freq_filter(support)
        pmi_ngrams = finder.nbest(ngram_measures.pmi, topK)

    print_ngrams(pmi_ngrams)
    return pmi_ngrams
Example #4
File: bllb_nltk.py Project: brl0/bripy
def bigrams(corpus):
    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(corpus)
    # only bigrams that appear 3+ times
    finder.apply_freq_filter(3)
    # return the 5 n-grams with the highest PMI
    return finder.nbest(bigram_measures.pmi, 5)
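A quick usage sketch with made-up tokens (the function accepts any iterable of words):

corpus = "new york is in new york state and new york is large".split()
print(bigrams(corpus))  # [('new', 'york')]: only this pair appears 3+ times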
Example #5
def bigram_cloud(toks):
    finder = BigramCollocationFinder.from_words(toks)
    bigram_measures = BigramAssocMeasures()
    scored = finder.score_ngrams(bigram_measures.raw_freq)

    scoredList = sorted(scored, key=itemgetter(1), reverse=True)

    word_dict = {}
 
    listLen = len(scoredList)
 
    for i in range(listLen):
        word_dict['_'.join(scoredList[i][0])] = scoredList[i][1]
 
    WC_height = 500
    WC_width = 1000
    WC_max_words = 100
     
    wordCloud = WordCloud(max_words=WC_max_words, height=WC_height, width=WC_width)
     
    wordCloud.generate_from_frequencies(word_dict)
     
    plt.title('Most frequently occurring bigrams connected with an underscore_')
    plt.imshow(wordCloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
Example #6
def N_collocations_in_text(text, N, min_freq):
    # finds <N> most significant two-word collocations which occur at
    # least <min_freq> times
    text_lower = [w.lower() for w in text]
    finder = BigramCollocationFinder.from_words(text_lower)
    finder.apply_freq_filter(min_freq)
    return finder.nbest(BigramAssocMeasures().pmi, N)
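For example, assuming a simple token list:

tokens = "New York is big ; New York is busy ; New York never sleeps".split()
print(N_collocations_in_text(tokens, 2, 3))  # [('new', 'york')]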
Example #7
def bigram_collocation_finder_with_log_likelihood_ratio(tokens, window_size=2):
    '''Returns bigram collocations and their log-likelihood ratio scores,
    using a list of tokens, or a list of sentences that are lists of tokens,
    as input.

    Parameters
    ----------
    tokens: a list of tokens, or a list of sentences that are lists of tokens
    window_size: the window size of the collocation, 2 by default

    Returns
    -------
    bigram_collocations: list of (bigram, score) tuples
    '''

    bigram_measures = BigramAssocMeasures()
    if isinstance(tokens[0], list):
        # pad each sentence on the right so bigrams never span two sentences
        finder = BigramCollocationFinder.from_words(
            BigramCollocationFinder._build_new_documents(tokens, window_size,
                                                         pad_right=True),
            window_size=window_size)
        # this is the original code:
        # finder = BigramCollocationFinder.from_documents(tokens)
    else:
        finder = BigramCollocationFinder.from_words(tokens,
                                                    window_size=window_size)

    result = finder.score_ngrams(bigram_measures.likelihood_ratio)
    return result
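A minimal sketch of calling it both ways, on made-up data; the second call keeps bigrams from crossing sentence boundaries.

sents = [['machine', 'learning', 'is', 'fun'],
         ['machine', 'learning', 'is', 'hard']]
flat = [t for s in sents for t in s]

print(bigram_collocation_finder_with_log_likelihood_ratio(flat)[:3])
print(bigram_collocation_finder_with_log_likelihood_ratio(sents)[:3])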
Example #8
def collocation(inp, outp, freq_filter, results, coll_type, pos):
    pos = bool(pos == 'true')
    with open(inp, 'r') as fd:
        i = fd.read()

    all_words = []
    if pos:
        text = i.split(' ')[:-1]
        all_words = [x[0:x.index('/')] if x != '\n' else x for x in text]
        all_words = [x.strip(' ').strip('\n') for x in all_words]
    else:
        sents = nltk.sent_tokenize(i)
        for sent in sents:
            all_words += nltk.word_tokenize(sent)
    if coll_type == 'bigram':
        measures = BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(all_words)
    else:
        measures = TrigramAssocMeasures()
        finder = TrigramCollocationFinder.from_words(all_words)
    finder.apply_freq_filter(int(freq_filter))
    # score the ngrams and get the first N
    colls = finder.score_ngrams(measures.pmi)[:int(results)]
    with open(outp, 'w') as output:
        for coll in colls:
            (a, b), score = coll
            output.write("%s\t%s\n" % (a, b))
Example #9
def cal(finder, _TOP_NUM, total_bigrams, annotations):

    #print('total bigram num: %s' % (len(total_bigrams)))

    truth_no_related = []
    for x, y in total_bigrams:
        if x + ' ' + y not in annotations:
            truth_no_related.append(x + ' ' + y)

    bigram_measures = BigramAssocMeasures()
    #print('TOP WORDS : %s' % (_TOP_NUM))

    TP = 0
    TN = 0
    system_bigrams = finder.nbest(bigram_measures.likelihood_ratio, _TOP_NUM)
    system_no_related = []
    for x, y in total_bigrams:
        # system_bigrams is a list of (w1, w2) tuples, so compare tuples
        if (x, y) not in system_bigrams:
            system_no_related.append(x + ' ' + y)

    for w in truth_no_related:
        if w in system_no_related:
            TN += 1

    for x, y in system_bigrams:
        if x + ' ' + y in annotations:
            TP += 1
    #print('likelihood_ratio precision: %s' % (TP / _TOP_NUM))
    #print('likelihood_ratio accuracy: %s' % ((TP + TN) / (_TOP_NUM + len(system_no_related))))
    print('%s' % ((TP + TN) / (_TOP_NUM + len(system_no_related))))
Example #10
def test():
    
    text = """
    LTE single-card dual-standby multi-mode terminal and method for processing concurrency of its CS service and PS service 

    The present invention is applicable to the field of communications technologies, and provides an method, the method includes: when a CS service and PS service of a local LTE single-card dual-standby multi-mode terminal are concurrent, detecting, by a local LTE single-card dual-standby multi-mode terminal, whether a peer communication terminal that is performing voice communication with it is in a voice silent period; when detecting that the peer communication terminal is not in the voice silent period, receiving, by the local LTE single-card dual-standby multi-mode terminal, downlink data in an LTE system, and suspending, by the local LTE single-card dual-standby multi-mode terminal, sending of uplink data in the LTE system at the same time; and when detecting that the peer communication terminal is in the voice silent period, sending the uplink data and receiving the downlink data, by the local LTE single-card dual-standby multi-mode terminal, in the LTE system.
    
    """
     
    bigram_measures = BigramAssocMeasures()
    #trigram_measures = TrigramAssocMeasures()
      
    # change this to read in your data
      
    finder = BigramCollocationFinder.from_words(preprocessing(text))
     
     
      
    # only bigrams that appear 3+ times
    #finder.apply_freq_filter(2)
      
    # return the 10 n-grams with the highest PMI
    #print(finder.nbest(bigram_measures.pmi,50))
    #print(finder.nbest(bigram_measures.likelihood_ratio, 20))
    #print(finder.nbest(bigram_measures.poisson_stirling, 20))
    for x,y in finder.nbest(bigram_measures.likelihood_ratio,50):
        print(x+' '+y)
Example #11
def get_keyword_collocations(corpus, keyword, windowsize=10, numresults=10):
    '''This function uses the Natural Language Toolkit to find collocations
    for a specific keyword in a corpus. It takes as an argument a string that
    contains the corpus you want to find collocations from. It prints the top
    collocations it finds for the keyword.
    '''
    # convert the corpus (a string) into a list of words
    tokens = word_tokenize(corpus)
    # initialize the bigram association measures object to score each collocation
    bigram_measures = BigramAssocMeasures()
    # initialize the bigram collocation finder object to find and rank collocations
    finder = BigramCollocationFinder.from_words(tokens, window_size=windowsize)
    # initialize a function that will narrow down collocates that don't contain the keyword
    keyword_filter = lambda *w: keyword not in w
    # apply a series of filters to narrow down the collocation results
    ignored_words = stopwords.words('english')
    finder.apply_word_filter(
        lambda w: len(w) < 2 or w.lower() in ignored_words)
    finder.apply_freq_filter(1)
    finder.apply_ngram_filter(keyword_filter)
    # calculate the top results by T-score
    # list of all possible measures: .raw_freq, .pmi, .likelihood_ratio, .chi_sq, .phi_sq, .fisher, .student_t, .mi_like, .poisson_stirling, .jaccard, .dice
    results = finder.nbest(bigram_measures.student_t, numresults)
    # print the results
    print("Top collocations for ", str(keyword), ":")
    collocations = ''
    for k, v in results:
        if k != keyword:
            collocations += k + ' '
        else:
            collocations += v + ' '
    print(collocations, '\n')
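A usage sketch, assuming NLTK's punkt and stopwords data are available and using a made-up corpus string:

corpus = ("new york has great pizza . pizza in new york is famous . "
          "york pizza beats chicago pizza .")
get_keyword_collocations(corpus, 'pizza', windowsize=3, numresults=5)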
Example #12
def nlp_process(text):
    # Tokenize the string, remove all the punctuations, and make them all lower case
    tokenizer = RegexpTokenizer(r'\w+')
    tokenized_string = tokenizer.tokenize(text)
    tokenized_string = list(map(lambda x: x.lower(), tokenized_string))
    stop_words = set(stopwords.words('english')) 

    # Stopword Removal
    stop_removed = []
    for word in tokenized_string:
        if word not in stop_words:
            stop_removed.append(word)

    # Count the frequency for words
    fdist1 = FreqDist(stop_removed)
    common_word = fdist1.most_common(1)
    top_word, top_freq = common_word[0]
    print("the most common word '{top_word}' occours {top_freq} times in the sampled text.".format(top_word = top_word, top_freq = top_freq))
    print(f'The most common 10 words are: {fdist1.most_common(10)}')

    # Get the association between two words. See: http://www.nltk.org/howto/collocations.html
    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(stop_removed)
    finder.apply_freq_filter(3)
    finder.apply_word_filter(lambda x: x in stopwords.words('english'))
    print(f'The most correlated words are: {finder.nbest(bigram_measures.pmi, 10)}')
Example #13
def get_collocation(keywords, source):
    """
    Filtering NLTK BigramFinder Object to get the score of maximum likelyhood
    of words (length >= 2) co-occuring with the keywords.
    Returns a dataframe with the scores and the associated words.
    """
    bgm = BigramAssocMeasures()

    word_filter = lambda w1, w2: keywords not in w1 or len(w2) < 2

    filename = f"finder_{source}_trimmed.sav"
    finder = pickle.load(open(filename, 'rb'))

    try:
        scorelist = bidirection_score_ngrams(finder, bgm.likelihood_ratio,
                                             word_filter)
        word_pairs, scores = zip(*scorelist)
        key, asso = zip(*word_pairs)
        df = pd.DataFrame(np.array([asso, scores]).transpose(),
                          columns=['Collocation', 'Score'])
    except Exception:
        # for cases where no word collocations are found
        df = pd.DataFrame(np.array([[np.NaN, np.NaN]]),
                          columns=['Collocation', 'Score'])

    df['Keyword'] = keywords
    df['Source'] = source
    return df.drop_duplicates('Collocation').reset_index(drop=True)[:80]
Example #14
def jieba_feature(number):
    pos_words = []
    neg_words = []
    for items in pickle.load(open('./data/pos_cut.pkl', 'rb')):  # flatten the nested lists into one list
        for item in items:
            pos_words.append(item)
    for items in pickle.load(open('./data/neg_cut.pkl', 'rb')):
        for item in items:
            neg_words.append(item)

    word_fd = FreqDist()  # frequency of every word

    cond_word_fd = ConditionalFreqDist()  # word frequencies within the positive and the negative texts

    for word in pos_words:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1

    for word in neg_words:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()  # number of words in the positive texts

    neg_word_count = cond_word_fd['neg'].N()  # number of words in the negative texts

    total_word_count = pos_word_count + neg_word_count

    word_scores = {}  # maps each word to its information score

    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count),
            total_word_count)  # chi-square score for the positive class; mutual information or other measures would also work
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)  # likewise for the negative class

        word_scores[word] = pos_score + neg_score  # a word's information score is its positive plus its negative chi-square score

    best_vals = sorted(
        word_scores.items(), key=lambda item: item[1],
        reverse=True)[:number]  # sort words by information score, descending; number is the feature dimensionality and can be tuned

    best_words = set([w for w, s in best_vals])

    return dict([(word, True) for word in best_words])
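The chi_sq calls above treat each (word, class) pairing as a 2x2 contingency table. A standalone sketch with invented counts:

from nltk.metrics import BigramAssocMeasures

freq = 60              # total occurrences of the word in both corpora
pos_count = 50         # occurrences inside the positive corpus
pos_word_count = 1000  # tokens in the positive corpus
total_word_count = 1500

# chi_sq(n_ii, (n_ix, n_xi), n_xx): joint count, marginals, grand total
print(BigramAssocMeasures.chi_sq(pos_count, (freq, pos_word_count),
                                 total_word_count))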
Example #15
 def extract_bigrams(self, sent):
     sent = self._preprocess_sent(sent)
     bigram_measures = BigramAssocMeasures()
     BiFinder = BigramCollocationFinder.from_words(sent)
     bigrams = BiFinder.nbest(bigram_measures.pmi, 10000)
     bigrams = set([' '.join(i) for i in bigrams])
     bigrams = bigrams & self._bigrams_set
     return {i: True for i in bigrams}
Example #16
def get_top_bigrams(text, n):
    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(clean_and_tokenize_text(text))
    finder.apply_freq_filter(2)
    return [
        ' '.join(list(words))
        for words in finder.nbest(bigram_measures.raw_freq, n)
    ]
Example #17
def collocs(text):
    bigrams = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_documents(
        [nltk.word_tokenize(" ".join(text))])
    finder.apply_freq_filter(2)
    topk = finder.nbest(bigrams.pmi, 15)
    for tk in topk:
        print(tk)
Example #18
def get_top_bigrams(corpus, top_n=100):
    '''
    Most frequent bigram detection
    '''

    finder = BigramCollocationFinder.from_documents(
        [item.split() for item in corpus])
    bigram_measures = BigramAssocMeasures()
    return finder.nbest(bigram_measures.raw_freq, top_n)
Example #19
def get_collocations(tokens, n_collocations=None):
    """This functions returns the collocations for a given set of tokens"""
    from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(tokens)
    scored = finder.score_ngrams(bigram_measures.raw_freq)
    colls = sorted(bigram for bigram, score in scored)[:100]

    return colls
Example #20
def collocations(words):
	bigrams =  defaultdict(int)
	bg_meas = BigramAssocMeasures()

	bi_finder = BigramCollocationFinder.from_words(words)
	bi_collocs = bi_finder.nbest(bg_meas.likelihood_ratio, 10)

	for colloc in bi_collocs:
		bigrams[colloc] += 1

	return bigrams # returns defaultdict, not dict!!!
Example #21
 def bigram_colwise(col):
     tokens = word_tokenize(col)
     tokens = [s for s in tokens if len(s) >= 4]
     finder = BigramCollocationFinder.from_words(tokens)
     bigram_measures = BigramAssocMeasures()
     scored = finder.score_ngrams(bigram_measures.raw_freq)
     scoredList = sorted(scored, key=itemgetter(1), reverse=True)
     word_dict = {}
     listLen = len(scoredList)
     for i in range(listLen):
         word_dict['_'.join(scoredList[i][0])] = scoredList[i][1]
     return list(word_dict.keys())[:4]
Example #22
def retrieve_top_bigrams_collocations(corpus, top=5, measure='pmi'):
    finder = BigramCollocationFinder.from_documents(
        [item.split() for item in corpus])
    bigram_measures = BigramAssocMeasures()

    if measure == 'pmi':
        top_bigrams = finder.nbest(bigram_measures.pmi, top)
    elif measure == 'frequency':
        top_bigrams = finder.nbest(bigram_measures.raw_freq, top)
    else:
        raise ValueError('Type of measure is unknown!')

    return top_bigrams
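For instance, with a hypothetical corpus of three pre-tokenizable strings:

corpus = ["new york city hall", "new york state fair", "visit new york city"]
print(retrieve_top_bigrams_collocations(corpus, top=2, measure='frequency'))
# [('new', 'york'), ('york', 'city')]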
Example #23
def common_collocations(text, occurences=20):
    tokens = word_tokenize(text)
    final_results = []
    for measures, collocationFinder, min_size in [
        (BigramAssocMeasures(), BigramCollocationFinder, 2),
        (TrigramAssocMeasures(), TrigramCollocationFinder, 3)
    ]:
        m = measures
        finder = collocationFinder.from_words(tokens, window_size=min_size)
        finder.apply_word_filter(lambda w: len(w) < 2)
        finder.apply_freq_filter(1)
        results = finder.nbest(m.student_t, occurences)
        final_results += [" ".join(gram) for gram in results]
    return final_results
Example #24
 def getBigramFeatures(documents, stopwords):
     # lower-case conversion of all tokens across the documents
     all_words_list = [
         word.lower() for (email, cat) in documents for word in email
     ]
     # Top 1000 bigram feature extraction
     measures = BigramAssocMeasures()
     finder = BigramCollocationFinder.from_words(all_words_list)  # scorer
     finder.apply_word_filter(alpha_filter)  # exclude non-alphabetic words
     finder.apply_word_filter(
         lambda w: w in stopwords)  # exclude stop words
     scored = finder.score_ngrams(measures.raw_freq)
     bigram_features = [s[0] for s in scored[:1000]]
     return bigram_features
Example #25
def bigrams(unigram_stats, bigram_stats, measure="pmi", freq_filter=20):
    """Produce a list of scored bigrams. 
    
    Args:
        unigram_stats (FreqDist)
        bigram_stats (FreqDist)
        measure (str): a measure like "pmi" or "student_t". Should be an attribute of BigramAssocMeasures
        freq_filter (int): minimum number of occurences to consider a bigram
    """

    finder = BigramCollocationFinder(unigram_stats, bigram_stats)
    finder.apply_freq_filter(freq_filter)
    measures = BigramAssocMeasures()
    return finder.score_ngrams(getattr(measures, measure))
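A usage sketch, building the two FreqDists by hand from a token stream (the import alias avoids clashing with the bigrams function above):

from nltk import FreqDist, bigrams as nltk_bigrams

tokens = "new york is not new mexico but new york is new york".split()
unigram_stats = FreqDist(tokens)
bigram_stats = FreqDist(nltk_bigrams(tokens))

print(bigrams(unigram_stats, bigram_stats, measure="raw_freq", freq_filter=2))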
Example #26
def extract_bigrams(titles, stopwords):
    bigram_measures = BigramAssocMeasures()
    # split all titles into a single list of one-word terms
    words = [word for title in titles for word in title.split(' ')]
    # create a bigram collocation finder based on the list of words
    finder = BigramCollocationFinder.from_words(words)
    # Remove bigrams that occur fewer than five times
    finder.apply_freq_filter(5)
    # select all bigrams that do not include stopwords
    bigrams = []
    for bigram in finder.nbest(bigram_measures.pmi, 1000):
        if bigram[0] in stopwords or bigram[1] in stopwords:
            continue
        bigrams.append(bigram)
    return bigrams
Example #27
        def collocations(self, words):
            '''
            Returns frequency distribution of collocations

            NOT CURRENTLY IN USE
            '''
            bigrams = defaultdict(int)
            bg_meas = BigramAssocMeasures()

            bi_finder = BigramCollocationFinder.from_words(words)
            bi_collocs = bi_finder.nbest(bg_meas.likelihood_ratio, 10)

            for colloc in bi_collocs:
                bigrams[colloc] += 1

            return bigrams
Example #28
def create_wordCloud_dict_bigrams(text_content, bad_bigrams=[]):
    finder = BigramCollocationFinder.from_words(text_content)
    bigram_measures = BigramAssocMeasures()
    scored = finder.score_ngrams(bigram_measures.raw_freq)
    # Sorting highest to lowest by score is disabled here; the finder's order is kept.
    #scoredList = sorted(scored, key=itemgetter(1), reverse=True)
    scoredList = scored
    word_dict = {}
    listLen = len(scoredList)
    # Set the key to the scored value. 
    for i in range(listLen):
        word_dict[' '.join(scoredList[i][0])] = scoredList[i][1]
    for bad_bigram in bad_bigrams:
        if bad_bigram in word_dict:
            del word_dict[bad_bigram]
    return word_dict
Example #29
def pmi_jumlah(text1, text2):
    stopwords_ = set(stopwords.words('english'))
    words1 = [
        word.lower() for word in text1.split()
        if len(word) > 2 and word not in stopwords_
    ]
    words2 = [
        word.lower() for word in text2.split()
        if len(word) > 2 and word not in stopwords_
    ]
    finder = BigramCollocationFinder.from_words(words1 + words2)
    bgm = BigramAssocMeasures()
    score = bgm.mi_like  # note: mi_like is the measure scored here, not true PMI
    total_pmi = sum(
        [math.log(pmi) for bigram, pmi in finder.score_ngrams(score)])
    return total_pmi
Example #30
def compute_collocation(corpora_dir: str, session: int, party: str,
                        num_chunks: int, bigram_out_path: str,
                        trigram_out_path: str, discard_tokens: Set[str],
                        stop_words: Set[str], min_frequency: int) -> None:
    """
    discard_tokens should be a subset of stop_words. This is used for
    a heuristic to filter trigrams, where the second word is permitted
    to be a stop word (e.g. "freedom of speech") but not a discarded token
    (e.g. "I yield to"). The first and third words can never be a stop word.
    """
    tokenized_corpus: List[str] = []
    for chunk_index in range(num_chunks):
        corpus_path = os.path.join(corpora_dir,
                                   f'{session}_{party}{chunk_index}.txt')
        with open(corpus_path) as corpus_file:
            raw_text = corpus_file.read()
        tokens: List[str] = nltk.tokenize.word_tokenize(raw_text)
        tokens = [
            t.lower() for t in tokens
            if t not in discard_tokens and not t.isdigit()
        ]
        tokenized_corpus.extend(tokens)
    del tokens

    bigram_finder = BigramCollocationFinder.from_words(tokenized_corpus)
    bigram_finder.apply_freq_filter(min_frequency)
    bigram_finder.apply_word_filter(lambda word: word in stop_words)
    bigrams = bigram_finder.score_ngrams(BigramAssocMeasures().raw_freq)

    trigram_finder = TrigramCollocationFinder.from_words(tokenized_corpus)
    trigram_finder.apply_freq_filter(min_frequency)
    trigram_finder.apply_ngram_filter(lambda w1, w2, w3: (
        w1 in stop_words) or (w3 in stop_words) or (w2 in discard_tokens))
    trigrams = trigram_finder.score_ngrams(TrigramAssocMeasures().raw_freq)

    num_tokens = len(tokenized_corpus)
    with open(bigram_out_path, 'w') as bigram_file:
        for bigram, relative_freq in bigrams:
            absolute_freq = relative_freq * num_tokens
            bigram_str = ' '.join(bigram)
            bigram_file.write(f'{absolute_freq:.0f}\t{bigram_str}\n')
    with open(trigram_out_path, 'w') as trigram_file:
        for trigram, relative_freq in trigrams:
            absolute_freq = relative_freq * num_tokens
            trigram_str = ' '.join(trigram)
            trigram_file.write(f'{absolute_freq:.0f}\t{trigram_str}\n')
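The docstring's trigram heuristic can be seen in isolation with toy word sets (made up here):

from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures

stop_words = {'of', 'to', 'i'}
discard_tokens = {'to', 'i'}  # subset of stop_words
tokens = 'freedom of speech i yield to freedom of speech'.split()

finder = TrigramCollocationFinder.from_words(tokens)
finder.apply_ngram_filter(lambda w1, w2, w3: (
    w1 in stop_words) or (w3 in stop_words) or (w2 in discard_tokens))
# ('freedom', 'of', 'speech') survives; ('i', 'yield', 'to') is dropped
print(finder.score_ngrams(TrigramAssocMeasures().raw_freq))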
Example #31
def get_keyword_collocations(tokens, keyword, windowsize=10, numresults=35):
    '''This function uses the Natural Language Toolkit to find collocations
    for a specific keyword in a corpus. It takes as arguments a list of
    tokens from the corpus you want to find collocations in, plus the
    keyword. It prints the top collocations it finds for the keyword.
    '''

    # initialize the bigram association measures object to score each collocation
    bigram_measures = BigramAssocMeasures()
    # initialize the bigram collocation finder object to find and rank collocations
    finder = BigramCollocationFinder.from_words(tokens, window_size=windowsize)
    # initialize a function that will narrow down collocates that don't contain the keyword
    keyword_filter = lambda *w: keyword not in w
    # apply a series of filters to narrow down the collocation results
    ignored_words = stopwords.words('english')
    finder.apply_word_filter(
        lambda w: len(w) < 2 or w.lower() in ignored_words)
    finder.apply_freq_filter(1)
    finder.apply_ngram_filter(keyword_filter)
    # calculate the top results by T-score
    # list of all possible measures: .raw_freq, .pmi, .likelihood_ratio, .chi_sq, .phi_sq, .fisher, .student_t, .mi_like, .poisson_stirling, .jaccard, .dice
    results = finder.score_ngrams(bigram_measures.student_t)
    results = results[:numresults]

    # freq, r, score, and collocate below are presumably module-level lists in the source project
    t = sorted(finder.ngram_fd.items(), key=lambda t: (-t[1], t[0]))
    for p in range(0, len(results)):
        for n in range(0, len(t)):
            if t[n][0] == results[p][0]:
                freq.append(t[n][1])
    # print the results
    for n in range(0, len(results)):
        r.append(results[n][0])
    print("Top collocations for ", str(keyword), ":")
    print('total occurrences of' + ' ' + keyword + ':' + ' ',
          tokens.count(keyword))
    for n in range(0, len(results)):
        score.append(results[n][1])

    for k, v in r:
        collocations = ''
        if k != keyword:
            collocations = k
        else:
            collocations = v
        collocate.append(collocations)