    def generate_trigrams(self):

        finder = TrigramCollocationFinder.from_words(self.corpus_tokens)
        resto_len = finder.N

        finder_contrast = TrigramCollocationFinder.from_words(
            self.contrast_tokens)
        contrast_len = finder_contrast.N

        corpus = self.generate_corpus(finder.ngram_fd,
                                      finder_contrast.ngram_fd)

        finder.apply_freq_filter(3)
        finder_contrast.apply_freq_filter(3)

        trigrams_resto = finder.ngram_fd
        trigrams_contrast = finder_contrast.ngram_fd

        scores = self.compute_tf_idf(trigrams_resto, resto_len, corpus)

        # scores maps a tf-idf score to the trigrams that received it;
        # keep every trigram with a non-zero score, joined into a string
        for score in scores:
            if score != 0.0:
                for tg in scores[score]:
                    self.trigrams.append(' '.join(tg))
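For context on the finder attributes used above, here is a minimal, self-contained sketch (the token list is invented for illustration): ngram_fd is a frequency distribution keyed by trigram tuples, and N is the total number of words the finder has seen.

from nltk.collocations import TrigramCollocationFinder

# toy token stream, purely illustrative
tokens = "the quick fox and the quick fox and the quick fox".split()

finder = TrigramCollocationFinder.from_words(tokens)
print(finder.N)                                   # 12: total words seen
print(finder.ngram_fd[("the", "quick", "fox")])   # 3: frequency of this trigram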
Example No. 2
def ngram_collocation(words, sents, n, support=10, topK=200):

    if n >= 4:
        finder = TrigramCollocationFinder.from_words(words)
        ngram_measures = TrigramAssocMeasures()
        finder.apply_freq_filter(support)
        pmi_ngrams = finder.nbest(ngram_measures.pmi, topK)
        ext_ngrams = NgramCollocationExtender(pmi_ngrams, sents, support / 3,
                                              0.3)
        print_ngrams(ext_ngrams)
        return ext_ngrams
        #pmi_ngrams = NgramCollocationFinder(words, 2, lowFreq, topK)
        #the current collocation measure is PMI
    else:
        if n == 2:
            finder = BigramCollocationFinder.from_words(words)
            ngram_measures = BigramAssocMeasures()
        elif n == 3:
            finder = TrigramCollocationFinder.from_words(words)
            ngram_measures = TrigramAssocMeasures()
        else:
            raise ValueError("n must be 2 or 3 in this branch")

        finder.apply_freq_filter(support)
        pmi_ngrams = finder.nbest(ngram_measures.pmi, topK)

    print_ngrams(pmi_ngrams)
    return pmi_ngrams
Example No. 4
def best_ngrams(words, top_n=1000, min_freq=100):
    """
    Extract `top_n` most salient collocations (bigrams and trigrams),
    from a stream of words. Ignore collocations with frequency
    lower than `min_freq`.

    This function uses NLTK for the collocation detection itself -- not very scalable!

    Return the detected ngrams as compiled regular expressions, for their faster
    detection later on.

    """
    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_freq_filter(min_freq)
    trigrams = [
        ' '.join(w) for w in tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)
    ]
    logging.info("%i trigrams found: %s..." % (len(trigrams), trigrams[:20]))

    bcf = tcf.bigram_finder()
    bcf.apply_freq_filter(min_freq)
    bigrams = [' '.join(w) for w in bcf.nbest(BigramAssocMeasures.pmi, top_n)]
    logging.info("%i bigrams found: %s..." % (len(bigrams), bigrams[:20]))

    pat_gram2 = re.compile('(%s)' % '|'.join(bigrams), re.UNICODE)
    pat_gram3 = re.compile('(%s)' % '|'.join(trigrams), re.UNICODE)

    return pat_gram2, pat_gram3
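A minimal usage sketch for the returned patterns (the pattern and sentence below are invented stand-ins for learned collocations): scanning new text with the compiled regexes is the "faster detection later on" the docstring refers to.

import re

pat_gram2 = re.compile('(new york|machine learning)', re.UNICODE)  # stand-in for a learned pattern
text = "machine learning jobs in new york"
print(pat_gram2.findall(text))  # ['machine learning', 'new york']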
Example No. 5
    def __init__(self, words, sentences, language):
        self.num_words = len(words)
        self.unique_words = len(set(words))
        self.num_sentences = len(sentences)
        self.average_sentence_length = round(self.num_words / self.num_sentences)
        self.lexical_diversity = round(self.num_words / self.unique_words)

        fdist = FreqDist(words)
        stop_words = stopwords.words(language)
        not_stopwords = [w for w in words if w not in stop_words]
        fdist2 = FreqDist(not_stopwords)
        self.fifty_first_words = fdist.most_common(50)
        self.hundreds_nsw = fdist2.most_common(300)

        bigram_measures = BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(words)
        finder.apply_freq_filter(10)
        self.fifty_collocations = finder.nbest(bigram_measures.pmi, 50)

        trigram_measures = TrigramAssocMeasures()
        finder3 = TrigramCollocationFinder.from_words(words)
        finder3.apply_freq_filter(10)
        self.fifty_collocations3 = finder3.nbest(trigram_measures.pmi, 50)

        self.stcs_width_words = [' '.join(sent) for sent in sentences
                                 if "malheureusement" in ' '.join(sent).lower()]
Example No. 6
def best_ngrams(words, top_n=10, min_freq=5):
    """
    Extract `top_n` most salient collocations (bigrams and trigrams),
    from a stream of words. Ignore collocations with frequency
    lower than `min_freq`.

    This function uses NLTK for the collocation detection itself -- not very scalable!

    Return the detected ngrams as compiled regular expressions, for their faster
    detection later on.

    """
    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_freq_filter(min_freq)
    trigrams = [' '.join(w) for w in tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)]
    logging.info("%i trigrams found: %s..." % (len(trigrams), trigrams[:20]))

    bcf = tcf.bigram_finder()
    bcf.apply_freq_filter(min_freq)
    bigrams = [' '.join(w) for w in bcf.nbest(BigramAssocMeasures.pmi, top_n)]
    logging.info("%i bigrams found: %s..." % (len(bigrams), bigrams[:20]))

    pat_gram2 = re.compile('(%s)' % '|'.join(bigrams), re.UNICODE)
    pat_gram3 = re.compile('(%s)' % '|'.join(trigrams), re.UNICODE)

    print(pat_gram2)

    return pat_gram2, pat_gram3
Example No. 7
def find_ngrams(data, PATH_SW=None, ntopbg=10, ntoptg=10):
    '''Find top occurring bigrams and trigrams in a corpus
        Parameters
            data: list of strings (each string in list is a document in corpus)
            PATH_SW: path to stop words file
            ntopbg: how many bigrams to return
            ntoptg: how many trigrams to return
        Returns
            topbg: list of tuples containing top bigrams
            toptg: list of tuples containing top trigrams

    '''
    long_string = ' '.join(data)
    tokenizer = RegexpTokenizer(r'[\w]+')
    words = tokenizer.tokenize(long_string)
    # english_stemmer = SnowballStemmer('english')
    # stemmed = [english_stemmer.stem(item)
    # for item in filter_stops]
    # print(stemmed)
    bef = BigramCollocationFinder.from_words(words)
    tcf = TrigramCollocationFinder.from_words(words)
    with open(PATH_SW, 'r') as f:
        stops = [re.sub(r'\s', '', line) for line in f]
    stopset = set(stops)
    filter_stops = lambda w: w in stopset
    bef.apply_word_filter(filter_stops)
    tcf.apply_word_filter(filter_stops)
    tcf.apply_freq_filter(3)
    topbg = bef.nbest(BigramAssocMeasures.likelihood_ratio, ntopbg)
    toptg = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, ntoptg)
    return topbg, toptg
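A hypothetical call, assuming a one-stopword-per-line file at stopwords.txt (both the documents and the path are invented):

docs = [
    "data science and machine learning",
    "machine learning for data science",
    "applied machine learning for data science teams",
]
topbg, toptg = find_ngrams(docs, PATH_SW='stopwords.txt', ntopbg=5, ntoptg=5)
print(topbg)  # e.g. [('machine', 'learning'), ('data', 'science'), ...]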
Example No. 8
    def get_frequencies(self, desc):

        stopset = set(stopwords.words('english'))
        filter_stops = lambda w: len(w) < 3 or w in stopset
        words = word_tokenize(desc)

        print('------gram--------')
        words_to_count = [word for word in words if word not in stopset]
        words_to_count = [word for word in words_to_count if not len(word) < 3]
        c = Counter(words_to_count)
        single = c.most_common(20)
        print(single)

        print('------bigram--------')
        bcf = BigramCollocationFinder.from_words(words)
        bcf.apply_word_filter(filter_stops)
        bigrm = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 15)
        print(bigrm)

        print('------trigram--------')
        tcf = TrigramCollocationFinder.from_words(words)
        tcf.apply_word_filter(filter_stops)
        tcf.apply_freq_filter(3)  # only keep trigrams that appear 3 or more times
        trigrm = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 10)
        print(trigrm)

        matches = [single, bigrm, trigrm]
        return matches
Example No. 9
def create_trigram_finder(tokenized_docs, should_filter=False):
    if should_filter:
        trigrams_data_samples = [trigram_prep(doc) for doc in tokenized_docs]
    else:
        trigrams_data_samples = tokenized_docs
    trigrams_finder = TrigramCollocationFinder.from_documents(trigrams_data_samples)
    return trigrams_finder
Example No. 10
def collocation(inp, outp, freq_filter, results, coll_type, pos):
    pos = (pos == 'true')
    with open(inp, 'r') as fd:
        i = fd.read()

    all_words = []
    if pos:
        text = i.split(' ')[:-1]
        all_words = [x[0:x.index('/')] if x != '\n' else x for x in text]
        all_words = [x.strip(' ').strip('\n') for x in all_words]
    else:
        sents = nltk.sent_tokenize(i)
        for sent in sents:
            all_words += nltk.word_tokenize(sent)
    if coll_type == 'bigram':
        measures = BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(all_words)
    else:
        measures = TrigramAssocMeasures()
        finder = TrigramCollocationFinder.from_words(all_words)
    finder.apply_freq_filter(int(freq_filter))
    # score_ngrams returns (ngram, score) pairs sorted best-first; keep the first N
    colls = finder.score_ngrams(measures.pmi)[:int(results)]
    with open(outp, 'w') as output:
        for ngram, score in colls:
            output.write('\t'.join(ngram) + '\n')
Example No. 11
def generate_trigrams(tokens):
    trigram_measures = nltk.collocations.TrigramAssocMeasures()
    finder = TrigramCollocationFinder.from_words(tokens, window_size = 3)
    finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in stoplist)
    finder.apply_freq_filter(1)
    colls = finder.nbest(trigram_measures.likelihood_ratio, 10)
    return colls 
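For reference, apply_word_filter removes every candidate trigram containing any word for which the predicate returns True; a small sketch with invented tokens:

from nltk.collocations import TrigramCollocationFinder

tokens = "we go to the old town hall to the old town hall".split()
finder = TrigramCollocationFinder.from_words(tokens)
finder.apply_word_filter(lambda w: len(w) < 3)  # drops trigrams containing 'we', 'go', 'to'
print(sorted(finder.ngram_fd.items()))  # only trigrams made entirely of words with len >= 3 remain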
Example No. 12
def collocations(stream, top_n=10000, min_bigram_freq=50, min_trigram_freq=20):
    """Extract text collocations (bigrams and trigrams), from a stream of words.

    Parameters
    ----------
    stream: iterable object
        An iterable of words

    top_n: int
        Number of collocations to retrieve from the stream of words (ordered by decreasing score). Default is 10000.

    min_bigram_freq: int
        Minimum frequency of a bigram in order to retrieve it. Default is 50.

    min_trigram_freq: int
        Minimum frequency of a trigram in order to retrieve it. Default is 20.

    """
    tcf = TrigramCollocationFinder.from_words(stream)

    tcf.apply_freq_filter(min_trigram_freq)
    trigrams = [' '.join(w) for w in tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)]
    logging.info("%i trigrams found: %s..." % (len(trigrams), trigrams[:20]))

    bcf = tcf.bigram_finder()
    bcf.apply_freq_filter(min_bigram_freq)
    bigrams = [' '.join(w) for w in bcf.nbest(BigramAssocMeasures.pmi, top_n)]
    logging.info("%i bigrams found: %s..." % (len(bigrams), bigrams[:20]))

    bigrams_patterns = re.compile('(%s)' % '|'.join(bigrams), re.UNICODE)
    trigrams_patterns = re.compile('(%s)' % '|'.join(trigrams), re.UNICODE)

    return bigrams_patterns, trigrams_patterns
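When the returned patterns are used to merge collocations into single tokens, applying the trigram pattern before the bigram pattern keeps the bigram regex from splitting a longer match; a minimal sketch with invented stand-in patterns:

import re

trigrams_patterns = re.compile('(new york city)', re.UNICODE)  # stand-ins for learned patterns
bigrams_patterns = re.compile('(new york)', re.UNICODE)

def merge_collocations(text):
    # join matched collocations with underscores, longest n-grams first
    text = trigrams_patterns.sub(lambda m: m.group(1).replace(' ', '_'), text)
    return bigrams_patterns.sub(lambda m: m.group(1).replace(' ', '_'), text)

print(merge_collocations("i love new york city"))  # i love new_york_city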
Example No. 13
    def get_trigrams(self, size):

        file_name = self.disease_type + '-trigram-freq-' + str(size)
        if 'training' in file_name:
            self.full_training_trigram_filename = file_name + '.csv'
            file_trigrams = csv.writer(
                open(self.full_training_trigram_filename, 'w'))
        else:
            self.full_test_trigram_filename = file_name + '.csv'
            file_trigrams = csv.writer(
                open(self.full_test_trigram_filename, 'w'))

        finder = TrigramCollocationFinder.from_words(self.word_set)
        #scored = finder.score_ngrams(bigram_measures.raw_freq)

        # sort trigrams by descending frequency, then alphabetically, and keep the top `size`
        sortedTriGrams = sorted(
            finder.ngram_fd.items(), key=lambda t: (-t[1], t[0]))[:size]

        # Store the top trigrams and their counts in the CSV file
        for trigram_tuple, count in sortedTriGrams:
            file_trigrams.writerow([' '.join(trigram_tuple), count])

        return self.full_training_trigram_filename, self.full_test_trigram_filename
Example No. 15
def predict(test_string, models):
    # clean string
    test_string = pre_processing(test_string)

    bi_test = BigramCollocationFinder.from_words(test_string)
    tri_test = TrigramCollocationFinder.from_words(test_string)
    quad_test = QuadgramCollocationFinder.from_words(test_string)
    final_test = list(bi_test.ngram_fd.items()) + list(
        tri_test.ngram_fd.items()) + list(quad_test.ngram_fd.items())

    model_name = []

    for model in models:
        model_name.append(model[0])

    freq_sum = np.zeros(len(models))
    for ngram, freq in final_test:
        exists = 0

        for i, lang_model in enumerate(models):
            lang = lang_model[0]
            model = lang_model[1]
            total_ngram = lang_model[2]

            if ngram in model:
                if DEBUG:
                    print("Found", ngram, model[ngram], lang, total_ngram)
                # scale up so that freq / total_ngram does not vanish toward zero
                freq_sum[i] = freq_sum[i] + (freq * 10000) / total_ngram
                exists = 1

            if not exists:
                freq_sum[i] += 1

    if not max(freq_sum):
        if DEBUG:
            print("[ERROR] Invalid string. String: {}".format(test_string))
        return 0, "Hmm, I do not know this word. Please try other words."

    # get the highest score and normalize scores into [0, 1]
    freq_to_model = list(zip(freq_sum, model_name))
    scores = [x for x, y in freq_to_model]
    normalized_scores_name = [(normalize_score(f, scores), m)
                              for f, m in freq_to_model]
    sorted_score_model = sorted(normalized_scores_name, reverse=True)

    if DEBUG: print("[DEBUG] Frequency to model: {}".format(freq_to_model))
    if DEBUG: print("[DEBUG] Scores: {}".format(scores))
    if DEBUG:
        print("[DEBUG] Normalized scores name: {}".format(
            normalized_scores_name))
    if DEBUG:
        print("[DEBUG] Reverse sorted score model: {}".format(
            sorted_score_model))

    return 1, sorted_score_model
Example No. 16
def create_tri_collocations(features_words, document_preprocess):
    # collocations are learned over the whole movie_reviews corpus,
    # then each preprocessing function is applied to the candidate trigrams
    finder = TrigramCollocationFinder.from_words(movie_reviews.words())
    finder.apply_freq_filter(3)
    tricoll = finder.nbest(trigram_measures.pmi, 1000)
    for f in document_preprocess:
        tricoll = [(f(a), f(b), f(c)) for (a, b, c) in tricoll
                   if (f(a) and f(b) and f(c))]
    return tricoll
Example No. 18
def train_language(language, training_path):
    words = []
    filter_words(training_path, words)
    seq = ' ' + ''.join(words)

    # Bigram
    bigram_finder = BigramCollocationFinder.from_words(seq)
    bigram_finder.apply_freq_filter(FREQ_FILTER)

    # Trigram
    trigram_finder = TrigramCollocationFinder.from_words(seq)
    trigram_finder.apply_freq_filter(FREQ_FILTER)

    # Quad
    quadgram_finder = QuadgramCollocationFinder.from_words(seq)
    quadgram_finder.apply_freq_filter(FREQ_FILTER)

    bigram_model = sorted(bigram_finder.ngram_fd.items(),
                          key=lambda item: item[1],
                          reverse=True)
    trigram_model = sorted(trigram_finder.ngram_fd.items(),
                           key=lambda item: item[1],
                           reverse=True)
    quadgram_model = sorted(quadgram_finder.ngram_fd.items(),
                            key=lambda item: item[1],
                            reverse=True)

    final_model = bigram_model + trigram_model + quadgram_model
    #print(final_model)
    np.save(MODELS_PATH + language + '.npy', final_model)
    print("Language model for {} stored at {}".format(
        language, MODELS_PATH + language + '.npy'))
Example No. 19
def _get_trigrams(words, top_n, min_freq):
    tcf = TrigramCollocationFinder.from_words(iter(words))
    tcf.apply_freq_filter(min_freq)
    trigrams = [
        ' '.join(w) for w in tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)
    ]
    return re.compile('(%s)' % '|'.join(trigrams), re.UNICODE)
Example No. 20
def tagged_trigram_collocation(document):
#     content = re.sub('[,;.!-:\(\)“”"\'’‘]','',document)
    
    tag_filter = (
        ('CC','NN','NNS','NNP','NNPS','IN','JJ','JJR','JJS'),
        ('CC','NN','NNS','NNP','NNPS','IN','JJ','JJR','JJS'),
        ('NN','NNS','NNP','NNPS')
    )
    tag_func = lambda key1, key2, key3: (key1[1] not in tag_filter[0]
                                         or key2[1] not in tag_filter[1]
                                         or key3[1] not in tag_filter[2])

    # tokenize into words
    words = nltk.word_tokenize(document)

    # POS-tag each word
    tagged_words = nltk.pos_tag(words)

    # lowercase everything
    tagged_words = ((tw[0].lower(), tw[1]) for tw in tagged_words)

    # English stopwords
    sw = stopwords.words("english")
    words = [w for w in tagged_words
             if w[0].strip() and w[0] not in sw and len(w[0]) > 3]

    trigram_finder = TrigramCollocationFinder.from_words(words)
    trigram_finder.apply_ngram_filter(tag_func)

    for (key1, key2, key3), freq in trigram_finder.ngram_fd.items():
        print(key1, key2, key3, freq)
Example No. 21
def trigramFeats(thesewords, n=100):
    si = iter(thesewords)
    words = [c + " " + next(si, '') + " " + next(si, '') for c in si]
    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_freq_filter(n)
    trigram = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, trigram)])
Example No. 22
def trigram(words, score_fn=TrigramAssocMeasures.likelihood_ratio, n=1500, freq=1):
    """
    tmp_words=[]
    for w in words:
        tmp_words.append(w)
    words=tmp_words
    """
    if len(words) <= 0:
        return {}

    tmp_dict = {}

    for w in words:
        tmp_dict[w] = 1

    if len(tmp_dict.keys()) < 3:
        return {}

    trigram_finder = TrigramCollocationFinder.from_words(words)  # build trigram candidates from the word sequence
    trigram_finder.apply_freq_filter(freq)
    trigrams = trigram_finder.nbest(score_fn, n)  # select the top-n trigrams under score_fn

    # print type(words)

    res = {}

    for s in trigrams:
        key = s[0] + s[1] + s[2]
        if key in res:
            res[key] += 1
        else:
            res[key] = 1

    return res
Example No. 24
    def extract_trigrams(self, sent):
        sent = self._preprocess_sent(sent)
        trigram_measures = TrigramAssocMeasures()
        TriFinder = TrigramCollocationFinder.from_words(sent)
        trigrams = TriFinder.nbest(trigram_measures.pmi, 10000)
        trigrams = set([' '.join(i) for i in trigrams])
        trigrams = trigrams & self._trigrams_set
        return {i: True for i in trigrams}
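The {trigram: True} dictionaries returned here have the shape NLTK classifiers expect; a sketch of how such features could be consumed (the training pairs and labels are invented):

import nltk

train = [
    ({'not bad at': True, 'bad at all': True}, 'pos'),
    ({'waste of time': True, 'of time and': True}, 'neg'),
]
classifier = nltk.NaiveBayesClassifier.train(train)
print(classifier.classify({'waste of time': True}))  # expected: 'neg'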
Example No. 25
def calc_trigrams(text, min_freq=50):
    """Returns the frequency of trigrams in a text input."""
    words = [w.lower() for w in text]
    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_freq_filter(min_freq)
    # ngram_fd.items() yields (trigram, frequency) pairs
    return list(tcf.ngram_fd.items())
Example No. 26
    def set_trigramas(self, freq=2, best=20):
        tcf = TrigramCollocationFinder.from_words(self.palavras)
        stopset = set(stopwords.words('portuguese'))
        filter_stops = lambda w: len(w) < 3 or w in stopset
        tcf.apply_word_filter(filter_stops)
        tcf.apply_freq_filter(freq)
        a = tcf.nbest(TrigramAssocMeasures.pmi, best)
        self.trigramas = a
Example No. 27
def best_trigram_word_feats(words,
                            score_fn=TrigramAssocMeasures.chi_sq,
                            n=trigram_feature_number):
    trigram_finder = TrigramCollocationFinder.from_words(words)
    trigrams = trigram_finder.nbest(score_fn, n)
    d = dict([(trigram, True) for trigram in trigrams])
    #d.update(best_word_feats(words))
    return d
Example No. 28
def trigram_word_feats(words, score_fn=TrigramAssocMeasures.chi_sq, n=100):
    """Splits each review into a list of trigrams, e.g. "The book is good" -> (the book is), (book is good).
    Filters out the top 100 most relevant trigrams with a chi-squared association measure."""
    words = text_process(words)
    trigram_finder = TrigramCollocationFinder.from_words(words)
    trigrams = trigram_finder.nbest(score_fn, n)
    bigrams = bigram_word_feats(words)
    return ([ngram for ngram in trigrams] + bigrams)
Example No. 30
def tri(text):
    trigram_measures = nltk.collocations.TrigramAssocMeasures()
    finder = TrigramCollocationFinder.from_words(word_tokenize(text))
    finder.apply_freq_filter(30)
    finder.nbest(trigram_measures.pmi, 200)
    print(finder.ngram_fd.items())
    print(len(finder.ngram_fd.items()))
    return finder.ngram_fd.items()
Example No. 31
def _collect_bigrams_and_trigrams(raw_corpus,
                                  top_n=10000,
                                  min_length=1,
                                  min_freqs=None,
                                  stopwords=None):
    """collects bigrams and trigrams from collection of documents.  Input to collocation tokenizer.

    bigrams are pairs of words that recur in the collection; trigrams are triplets.

    Parameters
    ----------
    raw_corpus : iterable of tuple of (doc_id(str/int), doc_text(str))
        body of documents to examine
    top_n : int
        limit results to this many entries
    min_length : int
        Minimum length of any single word
    min_freqs : iterable of int
        threshold of when to consider a pair of words as a recognized n-gram,
        starting with bigrams.
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> patterns = _collect_bigrams_and_trigrams(sample_corpus, min_freqs=[2, 2])
    >>> patterns[0].pattern
    u'(frank swank|swank tank|sassy unicorns)'
    >>> patterns[1].pattern
    u'(frank swank tank)'
    """

    from nltk.collocations import TrigramCollocationFinder
    from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures

    # generator of documents, turn each element to its list of words
    doc_texts = (_simple_document(doc_text,
                                  min_length=min_length,
                                  stopwords=stopwords)
                 for doc_id, doc_text in raw_corpus)
    # generator, concatenate (chain) all words into a single sequence, lazily
    words = itertools.chain.from_iterable(doc_texts)
    tcf = TrigramCollocationFinder.from_words(iter(words))

    bcf = tcf.bigram_finder()
    bcf.apply_freq_filter(min_freqs[0])
    bigrams = [' '.join(w) for w in bcf.nbest(BigramAssocMeasures.pmi, top_n)]

    tcf.apply_freq_filter(min_freqs[1])
    trigrams = [
        ' '.join(w) for w in tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)
    ]

    bigrams_patterns = re.compile('(%s)' % '|'.join(bigrams), re.UNICODE)
    trigrams_patterns = re.compile('(%s)' % '|'.join(trigrams), re.UNICODE)

    return bigrams_patterns, trigrams_patterns
Example No. 32
def create_word_features(words):
    trigram_measures = nltk.collocations.TrigramAssocMeasures()

    finder = TrigramCollocationFinder.from_words(words)
    # score_ngrams returns (trigram, score) pairs
    trigrams = finder.score_ngrams(trigram_measures.raw_freq)

    return dict([(word, True) for word in itertools.chain(words, trigrams)])
Example No. 33
    def best_n_trigrams(self, n, method="pmi"):
        trigram_measures = TrigramAssocMeasures()
        tokens = self.get_word_lst()
        finder = TrigramCollocationFinder.from_words(tokens)

        if method == "pmi":
            return finder.nbest(trigram_measures.pmi, n)
        if method == "raw_freq":
            return finder.nbest(trigram_measures.raw_freq, n)
Example No. 34
def get_top_trigrams(corpus, top_n=100):
    '''
    Most frequent tri-gram detection
    '''

    finder = TrigramCollocationFinder.from_documents(
        [item.split() for item in corpus])
    trigram_measures = TrigramAssocMeasures()
    return finder.nbest(trigram_measures.raw_freq, top_n)
Example No. 35
    def best_trigram_word_feats(words,
                                score_fn=TrigramAssocMeasures.chi_sq,
                                n=200):
        tcf = TrigramCollocationFinder.from_words(words)
        trigrams = tcf.nbest(score_fn, n)
        d = dict([(trigram, True) for trigram in trigrams])
        d.update(best_bigram_word_feats(words))
        d.update(best_word_feats(words))
        return d
Example No. 36
def bag_of_ngram_words(words,
                       bscore_fn=BigramAssocMeasures.chi_sq,
                       tscore_fn=TrigramAssocMeasures.chi_sq,
                       n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(bscore_fn, n)
    trigram_finder = TrigramCollocationFinder.from_words(words)
    trigrams = trigram_finder.nbest(tscore_fn, n)
    return bag_of_words(words + bigrams + trigrams)
Example No. 37
File: seo.py  Project: blorenz/cms
def getTrigram(haystack):
    tokenizer = WordPunctTokenizer()
    words = tokenizer.tokenize(haystack)
    tcf = TrigramCollocationFinder.from_words(words)
    stopset = set(stopwords.words('english'))
    filter_stops = lambda w: len(w) < 3 or w in stopset
    tcf.apply_word_filter(filter_stops)

    return tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 4)
Example No. 38
    def getTrigrams(self):

        words = [w.lower() for w in nltk.word_tokenize(self.text)]
        tcf = TrigramCollocationFinder.from_words(words)
        stopset = set(stopwords.words('english'))
        filter_stops = lambda w: len(w) < 3 or w in stopset
        tcf.apply_word_filter(filter_stops)
        tcf.apply_freq_filter(1)
        return tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 6)
Example No. 39
def trigram_word_feats(words, score_fn=TrigramAssocMeasures.chi_sq, n=50):
    trigram_finder = TrigramCollocationFinder.from_words(words)
    try:
        trigrams = trigram_finder.nbest(score_fn, n)
    except Exception:
        print("lost trigrams", words)
        return dict([(ngram, True) for ngram in itertools.chain(words)])

    return dict([(ngram, True) for ngram in itertools.chain(words, trigrams)])
Example No. 40
    def setup_mwes(self, trigram_nbest=100, bigram_nbest=2000):
        """Create multi-word expressions by learning a corpus located in a corpus directory.

        Testing setting up mwes with custom path and setting it up twice (correct when no exception):
        >>> corpus_dir = os.path.join(base_path, 'test', 'corpus')
        >>> clusterer = DumbClusterer(corpus_dir=corpus_dir, mwes=['custom mwe'])
        >>> mwes = clusterer.setup_mwes(trigram_nbest=1000, bigram_nbest=15000)
        >>> 'custom mwe' not in mwes
        True

        >>> 'custom mwe' in clusterer.mwes
        True

        Args:
            trigram_nbest(int): Number of highest ranked trigrams to acquire.
            bigram_nbest(int): Number of highest ranked bigrams to acquire.
        Returns:
            list: List of multi-word expressions.
        """
        if self.corpus is None:
            raise Exception("Corpus not found. Run method `setup_corpus` with given corpus directory first.")

        bigram_measures = BigramAssocMeasures()
        trigram_measures = TrigramAssocMeasures()

        # Following are not used since ne chunk takes too much time.
        # Text processing before bigrams and trigrams calculated
        # words = []
        # for sent in self.corpus.sents():
        #     for chunk in nltk.ne_chunk(nltk.pos_tag(sent)):
        #         if not isinstance(chunk, nltk.Tree):
        #             w = chunk[0]
        #             # - Removal of words containing numbers or punctuations
        #             if not any((ch.isdigit() or ch in string.punctuation) for ch in w):
        #                 # - Lowercasing all words
        #                 words.append(w.lower())
        #                 print(w.lower().encode("utf-8")),

        # Text processing before bigrams and trigrams calculated
        words = []
        for w in self.corpus.words():
            # - Removal of words containing numbers or punctuations
            if not any((ch.isdigit() or ch in string.punctuation) for ch in w):
                # - Lowercasing all words
                words.append(w.lower())

        bigram_finder = BigramCollocationFinder.from_words(words)
        trigram_finder = TrigramCollocationFinder.from_words(words)
        mwes = trigram_finder.nbest(trigram_measures.pmi, trigram_nbest) + bigram_finder.nbest(bigram_measures.pmi, bigram_nbest)
        # Combine the two lists via set union (set1 | set2). Entries in self.mwes may be
        # strings or lists; lists are converted to tuples first so they are hashable.
        set1 = {(tuple(mwe) if isinstance(mwe,list) else mwe) for mwe in self.mwes}
        set2 = set(mwes)
        self.mwes = list(set1 | set2)
        return mwes
Example No. 41
def print_top_trig_collocs(word, pd_series, tokenizer, frac_corpus = 0.1, stopwords = gen_stop_words):
    corpus = [tokenizer.tokenize(x) for x in pd_series.to_list()]
    finder = TrigramCollocationFinder.from_documents(corpus)
    finder.apply_freq_filter(round(frac_corpus*len(pd_series)))
    main_trigrams = finder.nbest(trigram_measures.likelihood_ratio, 100000)
    for trigram in main_trigrams:
        if word in trigram:
            print(trigram)

    return
Example No. 42
def collocations(data, col='text', n_gram='bigram'):
    fulltext = ' '.join(data[col].tolist()).lower()
    tokens = fulltext.split()

    if n_gram == 'bigram':
        collocation = BigramCollocationFinder.from_words(tokens)
        n_grams = collocation.nbest(BigramAssocMeasures.likelihood_ratio, 10)
    elif n_gram == 'trigram':
        collocation = TrigramCollocationFinder.from_words(tokens)
        n_grams = collocation.nbest(TrigramAssocMeasures.likelihood_ratio, 10)
    return n_grams
Example No. 43
    def collocation_finder(self, n_gram_total, n_gram_filter_word):
        cf = TrigramCollocationFinder.from_words(
            word_tokenize(self.filtered_desc))
        # check which words frequently appear with the filter word (e.g. 'work')
        n_filter = lambda *words: n_gram_filter_word not in words
        cf.apply_ngram_filter(n_filter)
        # apply_freq_filter would remove occurrences seen fewer than x times
        self.collocation_scores = cf.nbest(
            TrigramAssocMeasures.likelihood_ratio, n_gram_total)
        return self.collocation_scores
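Note the filter convention used above: apply_ngram_filter drops every candidate for which the function returns True, so n_filter keeps only trigrams that contain the target word. A small sketch with invented tokens:

from nltk.collocations import TrigramCollocationFinder

tokens = "hard work pays off and hard work matters".split()
cf = TrigramCollocationFinder.from_words(tokens)
cf.apply_ngram_filter(lambda *w: 'work' not in w)  # True means remove, so only trigrams with 'work' survive
print(sorted(cf.ngram_fd.keys()))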
Example No. 44
def get_trigrams(filelocation, ratio):
    '''In addition to BigramCollocationFinder, there's also TrigramCollocationFinder, which 
    finds triplets instead of pairs.'''
    words = [w.lower() for w in webtext.words(filelocation)]
    stopset = set(stopwords.words('english'))
    filter_stops = lambda w: len(w) < 3 or w in stopset
    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_word_filter(filter_stops)
    tcf.apply_freq_filter(3)
    return tcf.nbest(TrigramAssocMeasures.likelihood_ratio, ratio)
Example No. 45
def trigrama(tokens):
    trigram_medidas = TrigramAssocMeasures()
    for i in range(len(tokens)):
        for j in tokens[i]:
            finder = TrigramCollocationFinder.from_words(j)
            # filter out the trigrams that have appeared once
            finder.apply_freq_filter(1)
            print(finder.nbest(trigram_medidas.pmi, 30))
            # wait three seconds before reading the next trigram
            time.sleep(3)
Example No. 46
def trigram_word_feats(words, score_fn=TrigramAssocMeasures.chi_sq, n=50):
    trigram_finder = TrigramCollocationFinder.from_words(words)
    trigrams = trigram_finder.nbest(score_fn, n)
    """
    print words
    for ngram in itertools.chain(words, bigrams): 
        if ngram not in stopset: 
            print ngram
    exit()
    """
    return dict([(ngram, True) for ngram in itertools.chain(words, trigrams)])
Example No. 47
def trigrams(words, max_trigrams=100):
    print "Extracting trigrams"
    trigram_finder = TrigramCollocationFinder.from_words(words)

    for trigram, score in trigram_finder.score_ngrams(trigram_measures.raw_freq)[:max_trigrams]:
        l_trigram = [lmtzr.lemmatize(p) for p in trigram]
        if l_trigram in tg:
            print "Common trigram", trigram
            continue

        #print trigram, score
        yield trigram
Example No. 50
    def best_ngrams(words, top_n=1000, min_freq=100):
        tcf = TrigramCollocationFinder.from_words(words)
        tcf.apply_freq_filter(min_freq)
        trigrams = [' '.join(w) for w in tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)]
        logging.info('%i trigrams found: %s...' % (len(trigrams), trigrams[:10]))

        bcf = tcf.bigram_finder()
        bcf.apply_freq_filter(min_freq)
        bigrams = [' '.join(w) for w in bcf.nbest(BigramAssocMeasures.pmi, top_n)]
        logging.info('%i bigrams found: %s...' % (len(bigrams), bigrams[:10]))

        pat_gram2 = re.compile('(%s)' % '|'.join(bigrams), re.UNICODE)
        pat_gram3 = re.compile('(%s)' % '|'.join(trigrams), re.UNICODE)

        return pat_gram2, pat_gram3
Example No. 51
def tri_collocations(tokens, num=20):
    from nltk.corpus import stopwords
    ignored_words = stopwords.words('english')

    word_list = [word for sent in tokens for word in sent]
    finder = TrigramCollocationFinder.from_words(word_list)
    finder.apply_freq_filter(3)
    finder.apply_ngram_filter(lambda w1, w2, w3:
                              len(w1) < 3
                              or len(w3) < 3
                              or (len(w1) + len(w2) + len(w3)) < 11
                              or w1.lower() in ignored_words
                              or w3.lower() in ignored_words)
    trigram_measures = TrigramAssocMeasures()
    collocations = finder.nbest(trigram_measures.likelihood_ratio, num)
    return collocations
Example No. 52
def get_trigrams(sentences, freq_filter):
    '''
    Method to parse the corpus into trigrams, then filter to include
    only those that occur at least `freq_filter` times.
    '''
    # Initialize trigram utils
    trigram_measures = TrigramAssocMeasures()
    trigram_finder = TrigramCollocationFinder.from_words(
        word_tokenize(" ".join(sentences).lower()))

    # Filter trigrams by frequency to reduce pmi pollution
    trigram_finder.apply_freq_filter(freq_filter)
    # Generate pmi ranked set of trigrams for sorting
    scored = trigram_finder.score_ngrams(trigram_measures.pmi)

    return sorted(trigram for trigram, score in scored)
Example No. 53
def find_collocations(text_series):
    #use stemmed collocations to tokenizer
    #text_series= text_series.map(custom_tokenizer)
    #use nltk.collocations to find the most commonly occurring bigrams and trigrams
    bigram_measures = BigramAssocMeasures()
    trigram_measures = TrigramAssocMeasures()
    tokens = [token for token_list in text_series for token in token_list]
    bigrams = BigramCollocationFinder.from_words(tokens)
    trigrams  = TrigramCollocationFinder.from_words(tokens)
    scored_bigrams = bigrams.score_ngrams(bigram_measures.likelihood_ratio)
    scored_trigrams = trigrams.score_ngrams(trigram_measures.likelihood_ratio)
    #save to pickle
    with open('bigrams.pkl', 'wb') as fid:
        cPickle.dump(scored_bigrams,fid)
    with open('trigrams.pkl', 'wb') as fid:
        cPickle.dump(scored_trigrams, fid)
Example No. 54
def nGrams(string, corpus, number, clean=True):
    words = WordPunctTokenizer().tokenize(string)
    stopset = set(stopwords.words('english'))
    # both branches of the original 'clean' flag lowercased the words
    words = [word.lower() for word in words]
    word_filter = lambda w: len(w) < 2 or w.isdigit()

    bcf = BigramCollocationFinder.from_words(words)
    bcf.apply_word_filter(word_filter)
    biResult = bcf.nbest(BigramAssocMeasures.likelihood_ratio, number)

    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_word_filter(word_filter)
    triResult = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, number)

    biList = [" ".join(pair) for pair in biResult]
    csv = open(r'db\cyttron-keywords.csv', 'a')
    csv.write('"' + ','.join(biList) + '";')
    csv.close()
    
    triList = [" ".join(triple) for triple in triResult]
    csv = open(r'db\cyttron-keywords.csv', 'a')
    csv.write('"' + ','.join(triList) + '"\n')
    csv.close()
    print(biList)
    print(triList)
Example No. 55
    def process(self, document):
        trigram_measures = nltk.collocations.TrigramAssocMeasures()
        metrics = ['chi_sq',
                   'jaccard',
                   'likelihood_ratio',
                   'mi_like',
                   'pmi',
                   'poisson_stirling',
                   'raw_freq',
                   'student_t']
        trigram_finder = TrigramCollocationFinder.from_words(document['tokens'])
        tr = defaultdict(list)
        for m in metrics:
            for res in trigram_finder.score_ngrams(getattr(trigram_measures, m)):
                tr[res[0]].append(res[1])

        return {'trigram_rank': tr, 'metrics': metrics}
Example No. 56
    def calc_trigram_collocation_set(string, regexp, boolStem):
        tokens = nltk.regexp_tokenize(string, regexp)

        if boolStem:
            tokens = Util.applyStem(tokens)

        trigram_collocation_finder = TrigramCollocationFinder.from_words(tokens)
        #trigram_collocation_finder.apply_freq_filter(5)
        # nbest expects an integer count, hence floor division
        trigrams = trigram_collocation_finder.nbest(
            TrigramAssocMeasures.chi_sq, len(tokens) // 10)
        #trigram_collocation_finder.apply_freq_filter(2)

        final_tokens = []
        for trigram in trigrams:
            final_tokens += ['~'.join(list(trigram))]

        return final_tokens
Example No. 57
def find_collocations(words):
  """
  Find trigram and bigram collocations in text.
  
  Args:
    words - an array of tokenized words.

  Returns:
    A list of collocations, sorted by score.  
  """
  ignore_words = lambda w: len(w) < 3 or w.lower() in _stopset
  trigram_finder = TrigramCollocationFinder.from_words(words)
  trigram_finder.apply_word_filter(ignore_words)
  collocations = trigram_finder.nbest(TrigramAssocMeasures.raw_freq, 10)
  bigram_finder = BigramCollocationFinder.from_words(words)
  bigram_finder.apply_word_filter(ignore_words)
  collocations += bigram_finder.nbest(BigramAssocMeasures.raw_freq, 25)
  return [' '.join(w) for w in collocations]
Example No. 58
def extract_top_collocations(json_cleaned, return_top_n=10, use_trigrams=False):
    if use_trigrams:
        measures = nltk.collocations.TrigramAssocMeasures()
    else:
        measures = nltk.collocations.BigramAssocMeasures()

    items = json_cleaned
    tweets = "\n".join(item["tweet"] for item in items)
    tweets_split = tweet_as_terms(tweets)

    # change this to read in your data
    if use_trigrams:
        finder = TrigramCollocationFinder.from_words(tweets_split)
    else:
        finder = BigramCollocationFinder.from_words(tweets_split)

    # only keep n-grams that appear 3+ times
    finder.apply_freq_filter(3)

    # return the `return_top_n` n-grams with the highest PMI
    top_collocations = finder.nbest(measures.pmi, return_top_n)
    return top_collocations
Example No. 59
def best_ngrams(words, top_n, min_freq):

    """
    This function has been extracted from an Europython 2014 tutorial about
    topic modelling given by Radim Rehurek and modified for this particular project.

    Extract `top_n` most salient collocations (bigrams and trigrams),
    from a stream of words. Ignore collocations with frequency
    lower than `min_freq`.
    This function uses NLTK for the collocation detection itself -- not very scalable!
    Return the detected ngrams as compiled regular expressions, for their faster
    detection later on.

    """
    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_freq_filter(min_freq)
    trigrams = [' '.join(w) for w in tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)]
    logging.info("%i trigrams found: %s..." % (len(trigrams), trigrams[:20]))

    bcf = tcf.bigram_finder()
    bcf.apply_freq_filter(min_freq)
    bigrams = [' '.join(w) for w in bcf.nbest(BigramAssocMeasures.pmi, top_n)]
    logging.info("%i bigrams found: %s..." % (len(bigrams), bigrams[:20]))

    # Write collocations to two files to be read by the preprocess program
    with open('bigrams.txt', 'w') as f1:
        f1.writelines(["{0}\n".format(item) for item in bigrams])

    with open('trigrams.txt', 'w') as f2:
        f2.writelines(["{0}\n".format(item) for item in trigrams])

    pat_gram2 = re.compile('(%s)' % '|'.join(bigrams), re.UNICODE)
    pat_gram3 = re.compile('(%s)' % '|'.join(trigrams), re.UNICODE)

    return pat_gram2, pat_gram3
Example No. 60
    def TriGram(self, text):
        '''
        @param text: POS-tagged text from which collocations are extracted.

        This uses the NLTK collocation methods to find the relevant
        collocations that have a frequency of 1 or more.

        @return: A set of trigram collocations.
        '''
        words = []
        for s in text:
            for w in s:
                words.append(w[0])

        tri = TrigramCollocationFinder.from_words(words)
        tri.apply_word_filter(self.filter_stop)
        tri.apply_freq_filter(1)
        tmp = tri.nbest(TrigramAssocMeasures.chi_sq, 20)

        tmp1 = []
        for word in tmp:
            tmp1.append(self.pos.POSTag(word, s=True))

        return tmp1
     return tmp1