def find(self, docs, top_n, strip_tags=True):
    """
    Parameter:
    ---------------
    docs: list of tokenized documents
    top_n: int
        how many labels to return
    strip_tags: bool
        whether to return the bigrams without their POS tags

    Return:
    ---------------
    list of tuple of str: the bigrams
    """
    score_func = self.score_func
    finder = BigramCollocationFinder.from_documents(docs)
    finder.apply_freq_filter(self._min_freq)
    bigrams = finder.nbest(score_func, top_n)
    return bigrams
def create_bigram_finder(tokenized_docs, should_filter=False):
    if should_filter:
        bigrams_data_samples = [bigram_prep(doc) for doc in tokenized_docs]
    else:
        bigrams_data_samples = tokenized_docs
    bigrams_finder = BigramCollocationFinder.from_documents(bigrams_data_samples)
    return bigrams_finder
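# Usage sketch for create_bigram_finder, assuming the imports below; the
# sample documents are made up for illustration, and bigram_prep is the
# project's own preprocessing helper (only used when should_filter=True):
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

sample_docs = [["new", "york", "is", "large"],
               ["she", "moved", "to", "new", "york"]]
finder = create_bigram_finder(sample_docs)  # should_filter=False: tokens used as-is
print(finder.nbest(BigramAssocMeasures.raw_freq, 3))  # e.g. [('new', 'york'), ...]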
def collocs(text):
    bigrams = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_documents(
        [nltk.word_tokenize(" ".join(text))])
    finder.apply_freq_filter(2)
    topk = finder.nbest(bigrams.pmi, 15)
    for tk in topk:
        print(tk)
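# Usage sketch for collocs, assuming nltk and the collocation classes are
# in scope as in the snippet above (and nltk's 'punkt' tokenizer data is
# installed). The word list is made up; note that collocs joins the input
# back into one string before tokenizing:
words = ["the", "big", "apple", "the", "big", "apple", "is", "new", "york"]
collocs(words)  # prints up to 15 PMI-ranked bigrams occurring at least twice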
def fit(self, X, **fit_params):
    """
    Procedure to iteratively contract bigrams (up to
    max_collocation_iterations times) that score higher on the
    collocation_function than the min_collocation_score (and satisfy the
    other criteria set out by the optional parameters).
    """
    self.tokenization_ = X
    n_tokens = sum(len(x) for x in X)
    for i in range(self.max_iterations):
        bigramer = BigramCollocationFinder.from_documents(self.tokenization_)

        if self.ignored_tokens is not None:
            ignore_fn = lambda w: w in self.ignored_tokens
            bigramer.apply_word_filter(ignore_fn)

        if self.excluded_token_regex is not None:
            exclude_fn = (
                lambda w: re.fullmatch(self.excluded_token_regex, w) is not None
            )
            bigramer.apply_word_filter(exclude_fn)

        if self.min_token_occurrences is not None:
            minocc_fn = lambda w: bigramer.word_fd[w] < self.min_token_occurrences
            bigramer.apply_word_filter(minocc_fn)

        if self.max_token_occurrences is not None:
            maxocc_fn = lambda w: bigramer.word_fd[w] > self.max_token_occurrences
            bigramer.apply_word_filter(maxocc_fn)

        if self.min_token_frequency is not None:
            minfreq_fn = (
                lambda w: bigramer.word_fd[w] < self.min_token_frequency * n_tokens
            )
            bigramer.apply_word_filter(minfreq_fn)

        if self.max_token_frequency is not None:
            maxfreq_fn = (
                lambda w: bigramer.word_fd[w] > self.max_token_frequency * n_tokens
            )
            bigramer.apply_word_filter(maxfreq_fn)

        if self.min_ngram_occurrences is not None:
            bigramer.apply_freq_filter(self.min_ngram_occurrences)

        new_grams = list(bigramer.above_score(self.score_function, self.min_score))

        if len(new_grams) == 0:
            break

        self.mtes_.append(new_grams)

        contracter = MWETokenizer(new_grams)
        self.tokenization_ = tuple(
            tuple(contracter.tokenize(doc)) for doc in self.tokenization_
        )

    return self
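# Minimal standalone sketch of the contraction step used in fit above:
# find bigrams scoring above a threshold, then merge them into single
# tokens with MWETokenizer. The corpus and threshold are made up:
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.tokenize import MWETokenizer

docs = [["machine", "learning", "is", "fun"],
        ["machine", "learning", "needs", "data"]]
finder = BigramCollocationFinder.from_documents(docs)
grams = list(finder.above_score(BigramAssocMeasures.likelihood_ratio, 1.0))
contracter = MWETokenizer(grams)
print([contracter.tokenize(doc) for doc in docs])
# e.g. [['machine_learning', 'is', 'fun'], ['machine_learning', 'needs', 'data']]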
def get_top_bigrams(corpus, top_n=100):
    '''
    Most frequent bigram detection
    '''
    finder = BigramCollocationFinder.from_documents(
        [item.split() for item in corpus])
    bigram_measures = BigramAssocMeasures()
    return finder.nbest(bigram_measures.raw_freq, top_n)
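# Usage sketch for get_top_bigrams; it expects whitespace-delimited
# strings, one per document (the corpus below is made up):
corpus = ["new york is large", "she moved to new york"]
print(get_top_bigrams(corpus, top_n=5))  # e.g. [('new', 'york'), ...]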
def find(self, docs, top_n, strip_tags=True):
    """
    Parameter:
    ---------------
    docs: list of tokenized documents
    top_n: int
        how many labels to return
    strip_tags: bool
        whether to return the bigrams without their POS tags

    Return:
    ---------------
    list of tuple of str: the bigrams
    """
    # if POS constraints are given, check that they are well-formed
    if self._pos:
        assert isinstance(self._pos, list)
        for pair in self._pos:
            assert isinstance(pair, (tuple, list))
            assert len(pair) == 2  # because it's a bigram

    score_func = getattr(self.bigram_measures, self._measure_method)
    finder = BigramCollocationFinder.from_documents(docs)
    finder.apply_freq_filter(self._min_freq)
    finder.apply_word_filter(lambda w: len(w) < 3)

    if self._pos:
        # pairs may be lists, so convert to tuples before hashing
        valid_pos_tags = set(tuple(pair) for pair in self._pos)
        valid_bigrams = []
        bigrams = map(
            partial(get, 0),  # get the bigram
            finder.score_ngrams(score_func))
        cnt = 0
        for bigram in bigrams:
            if tuple(map(partial(get, 1), bigram)) in valid_pos_tags:
                valid_bigrams.append(bigram)
                cnt += 1
                if cnt == top_n:  # enough
                    break
        if strip_tags:
            valid_bigrams = [
                tuple(map(partial(get, 0), bigram))
                for bigram in valid_bigrams
            ]
        return valid_bigrams
    else:
        bigrams = finder.nbest(score_func, top_n)
        return bigrams
def retrieve_top_bigrams_collocations(corpus, top=5, measure='pmi'):
    finder = BigramCollocationFinder.from_documents(
        [item.split() for item in corpus])
    bigram_measures = BigramAssocMeasures()
    if measure == 'pmi':
        top_bigrams = finder.nbest(bigram_measures.pmi, top)
    elif measure == 'frequency':
        top_bigrams = finder.nbest(bigram_measures.raw_freq, top)
    else:
        raise ValueError('Type of measure is unknown!')
    return top_bigrams
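# Usage sketch for retrieve_top_bigrams_collocations showing both
# supported measures (the corpus is made up):
corpus = ["the quick brown fox", "the quick brown dog"]
print(retrieve_top_bigrams_collocations(corpus, top=3, measure='pmi'))
print(retrieve_top_bigrams_collocations(corpus, top=3, measure='frequency'))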
def compute_ngrams_count(text_corpus, out_p, n=20):
    print("Compute ngrams count...")
    list_of_tokens = []
    for document in text_corpus:
        for sentence in document:
            list_of_tokens.append(word_tokenize(sentence))

    # Unigram
    tokens = util.flatten_one_level(list_of_tokens)
    custom_sw = [".", "[", "]", ","]
    sw = stopwords.words("english") + custom_sw
    tokens = [w for w in tokens if w not in sw]
    word_fd = FreqDist(tokens)
    uni_mc = word_fd.most_common(n)

    # Bigram
    bi = BigramCollocationFinder.from_documents(list_of_tokens)
    #bi.apply_freq_filter(2)
    #print(bi.ngram_fd.items())
    bi_mc = bi.ngram_fd.most_common(n)

    # Trigram
    tri = TrigramCollocationFinder.from_documents(list_of_tokens)
    tri_mc = tri.ngram_fd.most_common(n)

    # Quadgram
    quad = QuadgramCollocationFinder.from_documents(list_of_tokens)
    quad_mc = quad.ngram_fd.most_common(n)

    # Plot
    data = [uni_mc, bi_mc, tri_mc, quad_mc]
    x = []
    y = []
    for i in range(len(data)):
        x_ng = []
        y_ng = []
        for d in data[i]:
            if i == 0:
                x_ng.append(d[0])
            else:
                x_ng.append(" ".join(d[0]))
            y_ng.append(d[1])
        x.append(x_ng[::-1])
        y.append(y_ng[::-1])
    title = ["Unigram", "Bigram", "Trigram", "Quadgram"]
    sup_title = "ngrams count"
    util.plot_bar_chart_grid(x, y, 1, len(data), title, sup_title, out_p,
                             sup_title_font_size=16, tick_font_size=14,
                             title_font_size=14, h_size=5, w_size=5,
                             rotate=True)
def DEPRECATED_save_bigrams(tokenized_docs, shouldWriteToFile=False):
    bigrams_data_samples = [bigram_prep(doc) for doc in tokenized_docs]
    bigram_measures = BigramAssocMeasures()
    bigrams_finder = BigramCollocationFinder.from_documents(bigrams_data_samples)
    bigrams_scores = bigrams_finder.score_ngrams(bigram_measures.likelihood_ratio)
    bigrams_counts = ['%s_%s,%d\n' % (most_common[0][0], most_common[0][1],
                                      most_common[1])
                      for most_common in bigrams_finder.ngram_fd.most_common()]
    # likelihood-ratio scores are floats, so format with %f rather than %d
    bigrams_scores_as_str = ['%s_%s,%f\n' % (scored[0][0], scored[0][1],
                                             scored[1])
                             for scored in bigrams_scores]
    if shouldWriteToFile:
        with open('./output/bigrams_counts.csv', "w", encoding="utf8") as fout:
            lines_to_file(bigrams_counts, fout)
        with open('./output/bigrams_lr_scores.csv', "w", encoding="utf8") as fout:
            lines_to_file(bigrams_scores_as_str, fout)
def iteratively_contract_bigrams(self):
    """
    Procedure to iteratively contract bigrams (up to
    max_collocation_iterations times) that score higher on the
    collocation_score_function than the min_collocation_score.
    """
    for i in range(self.max_collocation_iterations):
        bigramer = BigramCollocationFinder.from_documents(self.tokens_by_sent())
        mwes = list(
            bigramer.above_score(
                self.collocation_score_function, self.min_collocation_score
            )
        )
        if len(mwes) == 0:
            break
        contracter = MWETokenizer(mwes)
        self.tokens_by_sent_by_doc_ = [
            contracter.tokenize_sents(doc)
            for doc in self.tokens_by_sent_by_doc()
        ]
def bigram_collocation_feats(self, documents, top_n=None, min_freq=3,
                             assoc_measure=BigramAssocMeasures.pmi):
    """
    Return `top_n` bigram features (using `assoc_measure`).
    Note that this method is based on bigram collocation measures, and
    not on simple bigram frequency.

    :param documents: a list (or iterable) of tokens.
    :param top_n: number of best words/tokens to use, sorted by
        association measure.
    :param assoc_measure: bigram association measure to use as score
        function.
    :param min_freq: the minimum number of occurrences of bigrams to take
        into consideration.

    :return: `top_n` ngrams scored by the given association measure.
    """
    finder = BigramCollocationFinder.from_documents(documents)
    finder.apply_freq_filter(min_freq)
    return finder.nbest(assoc_measure, top_n)
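# Usage sketch: this snippet matches the bigram_collocation_feats method
# of NLTK's SentimentAnalyzer, so the instance below assumes that class;
# the documents are made up for illustration:
from nltk.sentiment import SentimentAnalyzer

analyzer = SentimentAnalyzer()
docs = [["nice", "movie"], ["nice", "movie"], ["nice", "movie"]]
print(analyzer.bigram_collocation_feats(docs, top_n=5, min_freq=3))
# e.g. [('nice', 'movie')]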
def main() -> None:
    """Application entry point."""
    corpus_root = Path('corpus')

    # Set up result logging
    global _logger
    setup_logger(_logger, corpus_root / 'collocations.log')

    # Load stopwords
    nltk.download('stopwords', '.env/share/nltk_data')
    stop_words = set(stopwords.words('russian'))

    # Import the corpus
    tags_root = corpus_root / 'pos_tagging'
    reader = ConllCorpusReader(
        str(tags_root), [f.name for f in tags_root.glob('*.tags')],
        columntypes=['words', 'ignore', 'ignore', 'ignore', 'pos'],
        separator='\t')
    _logger.info('Documents: %d', len(reader.fileids()))
    _logger.info('Tokens in the first document (%s): %d',
                 reader.fileids()[0],
                 len(reader.words(reader.fileids()[0])))

    _logger.info('Loading sentences')
    sentences = reader.sents()

    # Build contingency tables for all words in the corpus
    _logger.info('Computing the contingency table over all words')
    bigram_finder = BigramCollocationFinder.from_documents(
        [w.lower() for w in sent] for sent in tqdm(sentences))
    _logger.info('Total bigrams: %d', bigram_finder.N)
    print_samples(bigram_finder)

    # Now filter by frequency and drop punctuation and stopwords
    _logger.info(
        'Filtering punctuation and stopwords; applying a frequency threshold')
    bigram_finder.apply_freq_filter(5)
    bigram_finder.apply_word_filter(lambda w: len(w) < 3 or w in stop_words)
    _logger.info('Total bigrams: %d', bigram_finder.N)
    print_samples(bigram_finder)
def fit_bigrams(self, text_data=None, show_top_bigrams=True, top_n=20):
    """If text_data is None, use self.corpus"""
    import pandas as pd
    from nltk.collocations import (BigramAssocMeasures, BigramCollocationFinder,
                                   TrigramAssocMeasures, TrigramCollocationFinder)

    if text_data is None:
        text_data = self.corpus

    ## Instantiate and fit bigram functions
    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_documents(text_data)
    scored = finder.score_ngrams(bigram_measures.raw_freq)
    self.bigrams = scored

    if show_top_bigrams:
        from IPython.display import display
        bigrams_to_show = scored[:top_n]
        col_names = ['Bigram', 'Frequency']
        caption = f'Top {top_n} Bigrams'
        df = pd.DataFrame.from_records(bigrams_to_show, columns=col_names)
        dfs = df.style.set_caption(caption)
        display(dfs)
def fit(self, X: Iterable[str]):
    """Fit the ngram model and the vocabulary from the training data.

    :param X: Iterable over strings containing the corpus used to train
        the spellchecker.
    """
    from nltk.collocations import BigramCollocationFinder
    from editdistance import eval as edit_distance

    self.tokenize_func = self._build_tokenizer()
    X_tokenized = [
        self.tokenize_func(self.string_preprocessor_func(x)) for x in X
    ]
    self.unigram_freq_dict = dict(Counter(itertools.chain(*X_tokenized)))
    bigram_finder = BigramCollocationFinder.from_documents(X_tokenized)
    self.bigram_freq_dict = dict(bigram_finder.ngram_fd.items())
    self.vocabulary = set(itertools.chain(*self.bigram_freq_dict.keys()))
    if self.min_freq > 0:
        self._filter_vocabulary(min_freq=self.min_freq)
    if self.use_bktree:
        self.bktree = BKTree(edit_distance, self.vocabulary,
                             sort_candidates=self.sort_candidates)
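# Minimal standalone sketch of the bigram-frequency extraction used in
# fit above (the corpus is made up):
import itertools
from nltk.collocations import BigramCollocationFinder

docs = [["spell", "checking", "is", "hard"], ["spell", "checking", "works"]]
finder = BigramCollocationFinder.from_documents(docs)
bigram_freq = dict(finder.ngram_fd.items())
print(bigram_freq.get(("spell", "checking")))  # 2
vocabulary = set(itertools.chain(*bigram_freq.keys()))
print(sorted(vocabulary))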
def process(text: str,
            num_1_grams: int = 100,
            num_2_grams: int = 100,
            num_3_grams: int = 100,
            num_4_grams: int = 100,
            min_chars: int = 3,
            max_chars: int = 30):
    """
    Extract keywords from text sources
    """
    # Find all sentences in the text
    sents = get_sentences(text)

    # Filter out any sentences which occur identically more than once
    sent_counter = collections.Counter(sents)
    sents = [sent for sent in sents if sent_counter[sent] == 1]

    # Tokenize each sentence
    sents = [RE_TOKEN.split(sent) for sent in sents]  # and len(word) > 1

    # Filter out non-alphabetic tokens and convert to lowercase
    sents = [[token.lower() for token in sent if is_alpha(token)]
             for sent in sents]

    # We look at two variants of the input sentences
    # a. For 1-grams, we remove all stopwords, short tokens, and possessives
    # b. For 2-grams and longer, we want to keep stopwords and short tokens as
    #    these might provide some information in relation to other words
    sents_a = [[RE_POSS.sub('', t) for t in sent if filter_token(t)]
               for sent in sents]
    sents_b = sents

    assert len(sents_a) > 0 and len(sents_b) > 0, 'Not enough words'

    counter = collections.Counter()
    for sent in sents_a:
        for token in sent:
            counter[token] += 1

    res = [[], [], [], []]

    if num_1_grams:
        # Represent tokens using a tuple with only one element to match the
        # format of the other ngrams with n > 1
        tuples = [((token, ), count) for token, count in counter.items()]
        df_1 = df_top(tuples=tuples, num=num_1_grams,
                      token_filter=filter_1_grams,
                      min_char=min_chars, max_char=max_chars)
        if df_1 is not None:
            res[0] = df_1['entry'].tolist()

    if num_2_grams:
        bigrams = BigramCollocationFinder.from_documents(sents_b)
        tuples = bigrams.score_ngrams(BigramAssocMeasures.raw_freq)
        df_2 = df_top(tuples=tuples, num=num_2_grams,
                      token_filter=filter_2_grams,
                      min_char=min_chars, max_char=max_chars)
        if df_2 is not None:
            res[1] = df_2['entry'].tolist()

    if num_3_grams:
        trigrams = TrigramCollocationFinder.from_documents(sents_b)
        tuples = trigrams.score_ngrams(TrigramAssocMeasures.raw_freq)
        df_3 = df_top(tuples=tuples, num=num_3_grams,
                      token_filter=filter_3_grams,
                      min_char=min_chars, max_char=max_chars)
        if df_3 is not None:
            res[2] = df_3['entry'].tolist()

    if num_4_grams:
        quadgrams = QuadgramCollocationFinder.from_documents(sents_b)
        tuples = quadgrams.score_ngrams(QuadgramAssocMeasures.raw_freq)
        df_4 = df_top(tuples=tuples, num=num_4_grams,
                      token_filter=filter_4_grams,
                      min_char=min_chars, max_char=max_chars)
        if df_4 is not None:
            res[3] = df_4['entry'].tolist()

    return res
print(biden_tweets)

####################################################
# Bigram Analysis

dnc_tokens_by_document = list()
rnc_tokens_by_document = list()

for tweet in dnc_doclist:
    dnc_tokens_by_document.append(
        tweet_to_tokens(tweet, ["dnc", "dncconvention"]))
for tweet in rnc_doclist:
    rnc_tokens_by_document.append(
        tweet_to_tokens(tweet, ["rnc", "rncconvention"]))

dnc_finder = BigramCollocationFinder.from_documents(dnc_tokens_by_document)
dnc_finder.nbest(BigramAssocMeasures.raw_freq, 30)  # top 30 DNC bigrams
dnc_finder.score_ngrams(
    BigramAssocMeasures.raw_freq)[:30]  # bigrams with scores

# horizontal bar chart
plot_word_freqs(
    dnc_finder.score_ngrams(BigramAssocMeasures.raw_freq)[:30], 'b',
    "Top 30 bigrams in @DNCConvention2020", "Frequency Score")

rnc_finder = BigramCollocationFinder.from_documents(rnc_tokens_by_document)
rnc_finder.nbest(BigramAssocMeasures.raw_freq, 30)  # top 30 RNC bigrams
rnc_finder.score_ngrams(
    BigramAssocMeasures.raw_freq)[:30]  # bigrams with scores

plot_word_freqs(
    rnc_finder.score_ngrams(BigramAssocMeasures.raw_freq)[:30], 'r',
    "Top 30 bigrams in @RNCConvention2020", "Frequency Score")
    sorted_ngrams_fd = sorted(ngram_freq_dist.items(),
                              key=itemgetter(1), reverse=True)
    sorted_ngrams = sorted_ngrams_fd[0:limit]
    sorted_ngrams = [(' '.join(text), freq)
                     for text, freq in sorted_ngrams]
    return sorted_ngrams

get_top_ngrams(corpus=norm_alice, ngram_val=2, limit=10)
get_top_ngrams(corpus=norm_alice, ngram_val=3, limit=10)

from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures

finder = BigramCollocationFinder.from_documents(
    [item.split() for item in norm_alice])
bigram_measures = BigramAssocMeasures()
finder.nbest(bigram_measures.raw_freq, 10)
finder.nbest(bigram_measures.pmi, 10)

from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import TrigramAssocMeasures

finder = TrigramCollocationFinder.from_documents(
    [item.split() for item in norm_alice])
trigram_measures = TrigramAssocMeasures()
finder.nbest(trigram_measures.raw_freq, 10)
finder.nbest(trigram_measures.pmi, 10)

toy_text = """
Elephants are large mammals of the family Elephantidae
def statictic(sentens):
    sw = splitSentence(sentences=sentens)
    return nltk.TextCollection(sw), BigramCollocationFinder.from_documents(sw)
def compute_collocation_bigram(corpus):
    finder = BigramCollocationFinder.from_documents(
        [item.split() for item in corpus])
    bigram_measures = BigramAssocMeasures()
    return finder, bigram_measures
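# Usage sketch for compute_collocation_bigram: the caller picks the
# association measure after the finder is built (the corpus is made up):
corpus = ["strong tea please", "strong tea and biscuits"]
finder, measures = compute_collocation_bigram(corpus)
print(finder.nbest(measures.likelihood_ratio, 5))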
right_wing_train = read_csv('../data/train/right_wing_train.csv')

left_wing_corpus = merge_to_corpus(left_wing_train, 'Tokenizing left-wing:')
right_wing_corpus = merge_to_corpus(right_wing_train, 'Tokenizing right-wing:')

# Ignoring too common or unwanted words
ignored_words = nltk.corpus.stopwords.words('german')
ignored_words.extend(["junge", "freiheit", "www.jungefreiheit.de", "co."])

# Filter bigrams: ignored words and words that are too short are dropped
bigram_measures = BigramAssocMeasures()
word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words

# Calculate most common bigrams
print("\nCalculating left-wing bigrams")
left_finder = BigramCollocationFinder.from_documents(left_wing_corpus)
left_finder.apply_freq_filter(MIN_FREQ)
left_finder.apply_word_filter(word_filter)
print(left_finder.nbest(bigram_measures.likelihood_ratio, 10), '\n')

print("Calculating right-wing bigrams")
right_finder = BigramCollocationFinder.from_documents(right_wing_corpus)
right_finder.apply_freq_filter(MIN_FREQ)
right_finder.apply_word_filter(word_filter)
print(right_finder.nbest(bigram_measures.likelihood_ratio, 10), '\n')

# Calculate most common unigrams
print("Calculating left-wing unigrams")
left_wing_tokens = [item for sublist in left_wing_corpus for item in sublist]
left_wing_fdist = nltk.FreqDist(left_wing_tokens)
print(left_wing_fdist.most_common()[:10], '\n')
def find_bigrams(sentences, n_ngrams):
    cf = BigramCollocationFinder.from_documents(sentences)
    fng = cf.nbest(BigramAssocMeasures.likelihood_ratio, n_ngrams)
    return fng
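# Usage sketch for find_bigrams (the sentences are made up; note it
# expects pre-tokenized sentences, not raw strings):
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

sentences = [["hong", "kong", "is", "busy"],
             ["we", "flew", "to", "hong", "kong"]]
print(find_bigrams(sentences, 5))  # e.g. [('hong', 'kong'), ...]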
def bigram_coll_score(text, n=500):
    bigram_measure = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_documents([text])
    finder.apply_freq_filter(2)
    scored = finder.score_ngrams(bigram_measure.likelihood_ratio)
    return scored[:n]
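# Usage sketch for bigram_coll_score; it wraps a single token list in a
# one-document corpus (the tokens are made up):
tokens = ["ice", "cream", "and", "ice", "cream", "again"]
print(bigram_coll_score(tokens, n=3))  # e.g. [(('ice', 'cream'), score), ...]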
def probable_occur(bi_gram):
    bi_gram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_documents(bi_gram)
    return sorted(finder.nbest(bi_gram_measures.pmi, 10))
def score_bi_gram(bi_gram):
    bi_gram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_documents(bi_gram)
    finder.apply_freq_filter(2)
    scored = finder.score_ngrams(bi_gram_measures.pmi)
    return scored
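# Usage sketch for score_bi_gram, assuming nltk is imported as in the
# snippet above; score_ngrams returns ((w1, w2), score) pairs sorted by
# descending PMI (the documents are made up):
import nltk
from nltk.collocations import BigramCollocationFinder

docs = [["data", "science", "rocks"], ["data", "science", "rules"]]
print(score_bi_gram(docs))  # e.g. [(('data', 'science'), pmi_value)]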
    ngram_freq_dist = nltk.FreqDist(ngrams)
    sorted_ngrams_fd = sorted(ngram_freq_dist.items(),
                              key=itemgetter(1), reverse=True)
    sorted_ngrams = sorted_ngrams_fd[0:limit]
    sorted_ngrams = [(' '.join(text), freq)
                     for text, freq in sorted_ngrams]
    return sorted_ngrams

corpus, category = get_data()

from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures

finder = BigramCollocationFinder.from_documents(
    [item.split() for item in corpus])
bigram_measures = BigramAssocMeasures()
print(finder.nbest(bigram_measures.raw_freq, 10))

from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import TrigramAssocMeasures

finder = TrigramCollocationFinder.from_documents(
    [item.split() for item in corpus])
trigram_measures = TrigramAssocMeasures()
print(finder.nbest(trigram_measures.raw_freq, 10))
print(finder.nbest(trigram_measures.pmi, 10))

# print(get_top_ngrams(corpus, ngram_val=2, limit=10))