Example #1
def create_word_bigram_scores():
    posdata = pickle.load(open('pos_review.pkl', 'rb'))
    negdata = pickle.load(open('neg_review.pkl', 'rb'))

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    bigram_finder_pos = BigramCollocationFinder.from_words(posWords)
    bigram_finder_neg = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finder_pos.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = bigram_finder_neg.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Example #2
File: ngram.py Project: dragoon/kilogram
    def association(self, measure='pmi'):
        if measure in self._association_dict:
            return self._association_dict[measure]

        ngrams = [self.ngram]
        collocs = {}

        for ngram in ngrams:
            self.ngram = ngram
            dist = self._get_freq_distributions()

            if len(self.ngram) == 2:
                finder = BigramCollocationFinder(*dist)
                measures = getattr(bigram_measures, measure)
            else:
                finder = TrigramCollocationFinder(*dist)
                measures = getattr(trigram_measures, measure)

            try:
                collocs = finder.score_ngrams(measures)
                collocs = dict((x[0][self.edit_pos], (i, x[1]))
                               for i, x in enumerate(collocs))
            except Exception as e:
                print('Exception in pmi_preps', e)
                print(self)
                print(dist)
                collocs = {}
            self._association_dict[measure] = collocs
            if collocs:
                return collocs
        return collocs
Example #3
def N_collocations_in_text(text, N, min_freq):
    # finds <N> most significant two word collocations which occur at
    # least <min_freq> times
    text_lower = [w.lower() for w in text]
    finder = BigramCollocationFinder.from_words(text_lower)
    finder.apply_freq_filter(min_freq)
    return finder.nbest(BigramAssocMeasures().pmi, N)
Example #4
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    newwords = []
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    for ngram in itertools.chain(words, bigrams):
        newwords.append(ngram)
    return newwords
Example #5
File: task1.py Project: ypandit/exercises
def get_types_of_guitar_bigrams(tokens):
    """
    Returns bigrams by co-occurrence for guitar types

    :param tokens:
    :return:defaultdict()
    """
    T = []
    for token in tokens:
        for tok in token:
            if not any(t.isdigit() for t in tok):
                T.append(tok)
    tokens = T

    bgm = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(tokens)
    guitar_filter = lambda *w: 'guitar' not in w
    finder.apply_freq_filter(2)
    finder.apply_ngram_filter(guitar_filter)

    scored = finder.score_ngrams(bgm.pmi)
    prefix_keys = collections.defaultdict(list)
    for key, scores in scored:
        prefix_keys[key[1]].append((key[0], scores))
    for key in prefix_keys:
        prefix_keys[key].sort(key=lambda x: -x[1])
    return prefix_keys
Example #6
def createCollocationGraph(data, title):

    print('Create Collocation Graph')

    words = extractTokens(data)
    words = removeCommonWords(words)

    #find word pairs
    finder = BigramCollocationFinder.from_words(words, window_size=5)
    pairs = sorted(finder.ngram_fd.items(), key=lambda t: (-t[1], t[0]))

    collocation = nx.DiGraph()
    for key, value in pairs:
        left = key[0].lower()
        right = key[1].lower()
        if left not in collocation:
            collocation.add_node(left)
        if right not in collocation:
            collocation.add_node(right)
        if collocation.has_edge(left, right):
            collocation[left][right]['weight'] += value
        else:
            collocation.add_weighted_edges_from([(left, right, value)])

    nx.write_gexf(collocation, title)
Example #7
def BiGramFeature(word_list, method=BigramAssocMeasures.chi_sq, n=2000):
	# Guard against the error nbest raises when word_list contains only one distinct word
	if len(set(word_list)) != 1:
		bigram_list = BigramCollocationFinder.from_words(word_list)
		top_bigram = bigram_list.nbest(method, n)
		return UniGramFeature(top_bigram)
	else:
		return UniGramFeature([])
Example #8
def get_bigrams(words):
    bigram_measures = nltk.collocations.BigramAssocMeasures()

    finder = BigramCollocationFinder.from_words(words)

    finder.apply_freq_filter(4)  # Restrict bigrams to those that appear at least four times

    return sorted(finder.ngram_fd.items())
Example #9
def most_common_bigrams(all_words, num_bigrams):
    bigram_finder = BigramCollocationFinder.from_words(all_words)
    bigram_freq = dict(bigram_finder.ngram_fd.items())
    # Keep only bigrams whose words are both relevant features
    bigram_freq = {k: v for k, v in bigram_freq.items()
                   if is_feature_relevant(k[0]) and is_feature_relevant(k[1])}

    fd = FreqDist(bigram_freq)
    return dict(fd.most_common(num_bigrams)).keys()
Example #10
 def bigrams(self) -> List[str]:
     """
     Returns a list of bigrams; the words inside each bigram are joined with an underscore.
     Returns up to 50 bigrams.
     """
     finder = BigramCollocationFinder.from_words(self.tokens)
     return [
         f"{word_0}_{word_1}"
         for word_0, word_1 in finder.nbest(BigramAssocMeasures().pmi, 50)
     ]
Example #11
def Mixup2Feature(word_list, bi_method=BigramAssocMeasures.chi_sq,\
 bi_n=2000):
	# Guard against the error nbest raises when word_list contains only one distinct word
	if len(set(word_list)) != 1:
		bigram_list = BigramCollocationFinder.from_words(word_list)
		# print json.dumps(word_list, ensure_ascii=False)
		top_bigram = bigram_list.nbest(bi_method, bi_n)
		return UniGramFeature(word_list + top_bigram)
	else:
		return UniGramFeature(word_list)
Example #12
File: ngram.py Project: dedcode/kilogram
    def association(self, measure='pmi'):
        if measure in self._association_dict:
            return self._association_dict[measure]
        dist = self._get_freq_distributions()

        if len(self.ngram) == 2:
            finder = BigramCollocationFinder(*dist)
            measures = getattr(bigram_measures, measure)
        else:
            finder = TrigramCollocationFinder(*dist)
            measures = getattr(trigram_measures, measure)

        try:
            collocs = finder.score_ngrams(measures)
        except Exception as e:
            print('Exception in pmi_preps')
            print(e)
            print(self)
            print(dist)
            collocs = []
Example #13
def Mixup3Feature(word_list, bi_method=BigramAssocMeasures.chi_sq,\
 bi_n=2000, tri_method=TrigramAssocMeasures.chi_sq, tri_n=1000):
	# Guard against the error nbest raises when word_list contains only one distinct word
	if len(set(word_list)) != 1:
		bigram_list = BigramCollocationFinder.from_words(word_list)
		top_bigram = bigram_list.nbest(bi_method, bi_n)
		trigram_list = TrigramCollocationFinder.from_words(word_list)
		top_trigram = trigram_list.nbest(tri_method, tri_n)
		return UniGramFeature(word_list + top_bigram + top_trigram)
	else:
		trigram_list = TrigramCollocationFinder.from_words(word_list)
		top_trigram = trigram_list.nbest(tri_method, tri_n)
		return UniGramFeature(word_list + top_trigram)
Example #14
def create_word_bigram_scores():
    posdata = pickle.load(open('pos_review.pkl', 'rb'))
    negdata = pickle.load(open('neg_review.pkl', 'rb'))

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    bigram_finder_pos = BigramCollocationFinder.from_words(posWords)
    bigram_finder_neg = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finder_pos.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = bigram_finder_neg.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Example #15
def find_freq_of_features(local_features, imp_feature_set):
    """
    Finds the frequency of most common features in this
    data
    """
    local_bigram_features = dict(BigramCollocationFinder.from_words(local_features).ngram_fd.items())
    local_freq_features = dict.fromkeys(imp_feature_set, 0)
    for word in local_features:
        if word in imp_feature_set:
            local_freq_features[word] += 1

    for bigram in local_bigram_features:
        if bigram in imp_feature_set:
            local_freq_features[bigram] = local_bigram_features[bigram]

    return local_freq_features
Example #16
def remove_non_price_numbers(tokens):
    """
    Remove numbers which are not prices
    """
    finalized_tokens = []
    token_bigrams = BigramCollocationFinder.from_words(tokens)
    num_pattern = r"\d+(?:\.\d+)?"
    pattern = re.compile(f"{num_pattern}(-{num_pattern})?"
                         )  # include tokens like ranges of numbers/ numbers
    for token in tokens:
        if not pattern.fullmatch(token):
            finalized_tokens.append(token)
            continue
        if (token_bigrams.ngram_fd.get(('$', token)) or token_bigrams.ngram_fd.get((token, '$'))) \
                or (token_bigrams.ngram_fd.get(('usd', token)) or token_bigrams.ngram_fd.get((token, 'usd'))):
            finalized_tokens.append(token)
    return finalized_tokens
Example #17
def grams():
    from nltk import BigramCollocationFinder, BigramAssocMeasures, TrigramAssocMeasures, TrigramCollocationFinder
    
    
    words = get_words()
    
    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(words)
    finder.apply_freq_filter(40)
    bigrams = finder.nbest(bigram_measures.pmi, 500)
    
    trigram_measures = TrigramAssocMeasures()
    finder3 = TrigramCollocationFinder.from_words(words)
    finder3.apply_freq_filter(100)
    trigrams = finder3.nbest(trigram_measures.pmi, 300)
    combos2 = [combo2[0]+ " " + combo2[1] for combo2 in bigrams]
    combos3 = [combo3[0]+ " " + combo3[1] + " " + combo3[2] for combo3 in trigrams]
    return combos2, combos3
Example #18
def strip_and_tokenize(msg):
    msg = msg.strip()
    msg = msg.lower()
    msg = re.sub(b'\xe2\x80\x99'.decode(), "'", msg)
    # remove stop words
    # remove non-ascii characters
    tokens = word_tokenize(msg)
    # tokens_ = tokens.copy()
    ret_tokens = []
    token_bigrams = BigramCollocationFinder.from_words(tokens)
    for token in tokens:
        try:
            token.encode("ascii")
        except UnicodeEncodeError:
            # tokens.remove(token)  # remove non-ascii tokens
            continue
        #
        if token in punctuations:  # remove punctuations
            # tokens.remove(token)
            continue
        #
        if is_date(token):
            ret_tokens.append("<DATE>")
            continue
        if is_phone_number(token):
            # tokens.remove(token)
            ret_tokens.append("<PHONE>")
            continue
        #
        if bool(is_url(token)):
            ret_tokens.append(retrieve_url_tokens(token))
            continue
        new_token, is_resolved = resolve_clitics(token)
        if is_resolved:
            ret_tokens.append(new_token)
            continue
        p_token, is_tokenized = tokenize_price_numbers(token, token_bigrams)
        if is_tokenized:
            ret_tokens.append(p_token)
            continue
        ret_tokens.extend(regex_tokenizer.tokenize(token))
        #
    return ret_tokens
Example #19
def tokenize_age_tokens(tokens) -> list:
    age_pattern = re.compile(r"\d{1,2}")
    finalized_tokens = []
    token_bigrams = BigramCollocationFinder.from_words(tokens)
    for token in tokens:
        if short_age_pattern.fullmatch(token):
            finalized_tokens.append("<AGE>")
            continue
        if not age_pattern.fullmatch(token):
            finalized_tokens.append(token)
            continue
        if token_bigrams.ngram_fd.get((token, 'year')) or token_bigrams.ngram_fd.get((token, 'years')):
            finalized_tokens.append("<AGE>")
            continue
        elif token_bigrams.ngram_fd.get((token, 'month')) or token_bigrams.ngram_fd.get((token, 'months')):
            finalized_tokens.append("<AGE>")
            continue
        else:
            finalized_tokens.append(token)
    return finalized_tokens
Example #20
def top_phrases_nltk(revs):
    """
    Find top phrases by finding collocations using nltk
    :param revs:
    :return:
    """
    revs = '.\n'.join(revs)
    bigram_measures = BigramAssocMeasures()
    tokens = word_tokenize(revs)
    finder = BigramCollocationFinder.from_words(tokens)
    finder.apply_freq_filter(3)
    finder.apply_word_filter(
        lambda w: len(w) < 2 or w in stopwords.words("english"))
    colloc = []
    for tup in finder.nbest(bigram_measures.pmi, 50):
        pos = pos_tag(tup)
        if pos[0][1].startswith(("JJ", "RB")) or pos[1][1].startswith(
            ("JJ", "RB")):
            colloc.append(tup)
    return colloc
Example #21
def collocations(text, language='english', num=0, window_size=2):  #num=20
    """
    A reimplementation of the basic workings of the collocations method of the ``Text`` class of NLTK.

    :param text: raw text
    :param language: language to use to eliminate stopwords
    :param num: number of collocations
    :param window_size: window for collocations
    :return: a list of collocations for the text
    """

    from nltk.corpus import stopwords

    ignored_words = stopwords.words(language)
    finder = BigramCollocationFinder.from_words(word_tokenize(text),
                                                window_size)
    finder.apply_freq_filter(2)
    finder.apply_word_filter(
        lambda w: len(w) < 3 or w.lower() in ignored_words)
    bigram_measures = BigramAssocMeasures()
    c = finder.nbest(bigram_measures.likelihood_ratio, num)

    collocation_strings = [w1 + ' ' + w2 for w1, w2 in c]
    return collocation_strings
Example #22
    def collect(self, input_file, limit=None, save=None):
        """
        Collect a corpus from Jodel JSON data
        :param input_file:
        :param limit:
        :param save:
        :return:
        """
        corpus = []
        words = []
        labels = []

        identities = {
            'Basel-Stadt': 'Basel',
            'Brunswick': 'Braunschweig',
            'Cologne': 'Köln',
            'Frankfurt': 'Frankfurt am Main',
            'Freiburg': 'Freiburg im Breisgau',
            'Fribourg-en-Brisgau': 'Freiburg im Breisgau',
            'Geneva': 'Genf',
            'Genève': 'Genf',
            'Hanover': 'Hannover',
            'Klagenfurt am Wörthersee': 'Klagenfurt',
            'Munich': 'München',
            'Nuremberg': 'Nürnberg',
            'Ouest lausannois': 'Lausanne',
            'Sankt Pölten': 'St. Pölten',
            'Sankt Gallen': 'St. Gallen',
            'Salzburg-Umgebung': 'Salzburg',
            'Vienna': 'Wien',
            'Zurich': 'Zürich'
        }

        self.city_frequency = defaultdict(int)

        # iterate over the data
        with open(input_file, encoding='utf-8', errors='ignore') as f:
            for line_no, line in enumerate(islice(f, None)):
                if line_no > 0:
                    if line_no % 10000 == 0:
                        print("%s" % (line_no), file=sys.stderr, flush=True)
                    elif line_no % 500 == 0:
                        print('.', file=sys.stderr, end=' ', flush=True)

                try:
                    jodel = json.loads(line)
                except ValueError:
                    print(line)
                    continue
                msg = jodel.get('message', None)
                location = jodel.get('location', None)

                # skip empty jodels
                if msg is None or location is None:
                    continue

                city = location.get('name', None)

                if city == 'Jodel Team' or city is None:
                    continue

                # correct city names
                city = identities.get(city, city)

                self.city_frequency[city] += 1

                # collect all the data and transform it
                data = [self.clean(msg)]
                data.extend([
                    self.clean(child.get('message', []))
                    for child in jodel.get('children', [])
                ])

                # one instance for each jodel
                # corpus.extend(data)
                # labels.extend([city] * len(data))

                # one instance for each conversation
                corpus.append([word for message in data for word in message])
                labels.append(city)

                words.extend([word for message in data for word in message])

                if limit is not None and line_no == limit:
                    break

        assert len(labels) == len(
            corpus
        ), "umm, the number of labels (%s) and the number of instances (%s) is not the same" % (
            len(labels), len(corpus))

        self.int2word = {
            i: max(self.stems[w].keys(), key=(lambda key: self.stems[w][key]))
            for w, i in self.word2int.items()
        }
        # find collocations
        print('\nlooking for collocations', file=sys.stderr, flush=True)
        finder = BigramCollocationFinder.from_words(words)
        bgm = BigramAssocMeasures()
        collocations = [
            b for b, f in finder.score_ngrams(bgm.mi_like) if f > 1.0
        ]
        self.collocations = set(collocations)

        print('\ncreating corpus', file=sys.stderr, flush=True)
        if save is not None:
            self.corpus = []
            with open('%s.corpus' % save, 'w',
                      encoding='utf-8') as save_corpus:
                for doc, tag in zip(corpus, labels):
                    words = self.join_collocations(doc)
                    tags = [tag]
                    self.corpus.append(TaggedDocument(words, tags=tags))
                    save_corpus.write('%s\n' % json.dumps({
                        'words': words,
                        'tags': tags
                    }))
            print('\ncorpus saved as %s' % save, file=sys.stderr, flush=True)

            with open('%s.citycounts' % save, 'w',
                      encoding='utf-8') as save_counts:
                json.dump(dict(self.city_frequency), save_counts)

        else:
            self.corpus = [
                TaggedDocument(self.join_collocations(doc), tags=[tag])
                for doc, tag in zip(corpus, labels)
            ]

        print('\n%s instances' % len(self.corpus), file=sys.stderr, flush=True)

        # update mappings
        self.int2word = {
            i: max(self.stems[w].keys(), key=(lambda key: self.stems[w][key]))
            for w, i in self.word2int.items()
        }
        print("Found %s collocations" % (len(collocations)),
              file=sys.stderr,
              flush=True)
        for (w1, w2) in collocations[:10]:
            print('\t',
                  self.int2word[w1],
                  self.int2word[w2],
                  file=sys.stderr,
                  flush=True)
Example #23
def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bigrams
Example #24
File: lab2.py Project: chMatvey/itmo
one_grams = calculate_n_grams(words, 1)
one_grams_freq = calculate_n_grams_frequency(one_grams, words, 1)

# Calculate MI

be_mi = calculate_mi(be_grams_freq, 2, one_grams_freq, len(words), 30)
print()
print(be_mi)

three_mi = calculate_mi(three_grams_freq, 3, one_grams_freq, len(words), 30)
print()
print(three_mi)

# Check

bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

text = nltk.Text(words)

finder_bi = BigramCollocationFinder.from_words(text)
finder_thr = TrigramCollocationFinder.from_words(text)

print()
be_best = finder_bi.nbest(bigram_measures.pmi, 30)
print(be_best)

print()
tri_best = finder_thr.nbest(trigram_measures.pmi, 30)
print(tri_best)
Example #25
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
Example #26
"""计算句子中某种语言模式出现概率的统计模型 把自然语言作为模型进行统计分析"""
import nltk
from nltk import ngrams, BigramCollocationFinder, BigramAssocMeasures, unique_list, KneserNeyProbDist
from nltk.corpus import alpino, webtext, stopwords
"""单词分组 util.py"""
n = 4
grams = ngrams(alpino.words(), n)
# for i in grams:
# print(i)
out = list(ngrams([1, 2, 3, 4, 5], 3))
print(out)  # [(1, 2, 3), (2, 3, 4), (3, 4, 5)]

stops = set(stopwords.words('english'))  # avoid shadowing the built-in set()
stops_filter = lambda w: len(w) < 3 or w in stops
tokens = [t.lower() for t in webtext.words('grail.txt')]
words = BigramCollocationFinder.from_words(tokens)
words.apply_word_filter(stops_filter)
out = words.nbest(BigramAssocMeasures.likelihood_ratio, 10)
print(out)
"""最大似然估计的目的就是:利用已知的样本结果,反推最有可能(最大概率)导致这样结果的参数值。"""
"""最大似然估计wiki https://zh.wikipedia.org/zh-cn/%E6%9C%80%E5%A4%A7%E4%BC%BC%E7%84%B6%E4%BC%B0%E8%AE%A1"""
"""隐马尔科夫模型估计 HMM"""
corpus = nltk.corpus.brown.tagged_sents(categories='adventure')[:700]
print(len(corpus))
tag_set = unique_list(tag for sent in corpus for (word, tag) in sent)
print(len(tag_set))
"""平滑"""
# gt = lambda fd, bins:SimpleGoodTuringProbDist(fd, bins=1e5)
# train_and_test(gt)
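# A hedged sketch of the smoothing step stubbed out above: SimpleGoodTuringProbDist
# discounts observed counts and reserves probability mass for unseen events,
# unlike the plain maximum likelihood estimate (bins=1e5 mirrors the stub above).
from nltk.corpus import webtext
from nltk.probability import FreqDist, SimpleGoodTuringProbDist
fd = FreqDist(w.lower() for w in webtext.words('grail.txt'))
sgt = SimpleGoodTuringProbDist(fd, bins=int(1e5))
print(sgt.prob('the'))      # smoothed probability of a frequent word
print(sgt.prob('unseen#'))  # small but non-zero probability for an unseen word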
corpus = [[((x[0], y[0], z[0]), (x[1], y[1], z[1]))
           for x, y, z in nltk.trigrams(sent)]
Example #27
 def __init__(self, limit=1000):
     # TODO: Read all of shakespeare into words?
     fileid = shakespeare.fileids()[0]
     words = remove_punctuation(shakespeare.words(fileid))
     self.finder = BigramCollocationFinder.from_words(words)
     self.bigrams = self.finder.nbest(bigram_measures.raw_freq, limit)
Example #28
def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    d = dict([(bigram, True) for bigram in bigrams])
    d.update(best_word_feats(words))
    return d
Example #29
###############################################################################
# Generate a list of word tuples (bigram) to examine collocations
from nltk import bigrams
bigram_tuples = list(bigrams(brown.words()))
len(bigram_tuples)

# http://www.nltk.org/howto/collocations.html
# collocations are essentially just frequent bigrams,
# except that we want to pay more attention to the cases that involve rare words.
# In particular, we want to find bigrams that occur more often than we would expect
# based on the frequency of the individual words. The collocations() function does this for us.
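# Quick illustration of that collocations() helper (a sketch using the same Brown
# corpus words; it prints the top collocations directly):
nltk.Text(brown.words()).collocations(num=20)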
from nltk import collocations
from nltk import BigramCollocationFinder
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
finder = BigramCollocationFinder.from_words(brown.words())
finder.nbest(bigram_measures.pmi, 10)

# apply filters to collocations, such as ignoring all bigrams which occur less than three times in the corpus
finder.apply_freq_filter(3)
finder.nbest(bigram_measures.pmi, 10)

# find collocations among tagged words
finder = BigramCollocationFinder.from_words(brown.tagged_words('ca01', tagset='universal'))
finder.nbest(bigram_measures.pmi, 5)

# tags alone
finder = BigramCollocationFinder.from_words(t for w, t in  brown.tagged_words('ca01', tagset='universal'))
finder.nbest(bigram_measures.pmi, 5)

# Spanning intervening words
Example #30
ret_axes.get_legend().remove()
ret_axes.set_axisbelow(True)
ret_axes.grid(True)
plt.show()

ret_axes: Axes = pivot_by_length_frequency[3:].plot(kind='barh')
ret_axes.plot(pivot_by_length_frequency['frequency'].iloc[3:],
              list(ret_axes.get_yticks()))
ret_axes.set_xlabel("Bin wise cumulative frequencies by word length")
ret_axes.set_title("Whole corpus")
ret_axes.get_legend().remove()
ret_axes.set_axisbelow(True)
ret_axes.grid(True)
plt.show()

bigrams = BigramCollocationFinder.from_words(corpus)
trigrams = TrigramCollocationFinder.from_words(corpus)

phone_nums = {}


def is_phone_number(phone_str: str) -> bool:
    phone_regex_1 = r'(?:\+1(?P<delim>(\.|-)?))([0-9]{3})(?P=delim)([0-9]{3})(?P=delim)([0-9]{4})'
    phone_regex_2 = r'([0-9]{3})(?P<delim>(\.|-)?)([0-9]{3})(?P=delim)([0-9]{4})'
    if phone_str.startswith('+1'):
        match = re.fullmatch(phone_regex_1, phone_str)
        if not match:
            return False
        num = '-'.join(match.groups()[2:])
        phone_nums[num] = phone_nums.get(num, 0) + 1
    else:
Example #31
def extract_keywords(string, tokenizer, sent_tokenizer, tagger, extractor, proper_noun_tag='SUBST_PROP'):
    """
    Implements KERA keyword extraction algorithm.

    See: https://www.ida.org/~/media/Corporate/Files/Publications/IDA_Documents/ITSD/ida-document-ns-d-4931.pdf

    Basic implementation of the procedure described in the paper.
    Probably needs some refinements in order to be more broadly effective.

    :param string: Document to analyze.
    :type string: str|unicode
    :param tokenizer: Function that returns a token segmentation as an iterable of strings given a string.
    :type tokenizer: (str|unicode) -> list[str|unicode]
    :param sent_tokenizer: Function that returns a sentence segmentation as an iterable of strings given a string.
    :type sent_tokenizer: (str|unicode) -> list[str|unicode]
    :param tagger: TextBlob compatible POS tagger. Must accept untokenized sentences.
    :type tagger: textblob.base.BaseTagger
    :param extractor: TextBlob compatible noun phrase extractor. Must accept untokenized sentences and use the same
      POS tagger which is passed as the tagger parameter.
    :type extractor: textblob.base.BaseNPExtractor
    :param proper_noun_tag: POS tag indicating proper nouns.
    :type proper_noun_tag: str|unicode
    :return: List of keyword/score tuples. Keyword may be a string or tuple of strings.
    :rtype : list[(str|unicode|(str|unicode)), float]
    """
    # find bigram collocations
    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(tokenizer(string))
    collocations = finder.score_ngrams(bigram_measures.likelihood_ratio)[0:50]

    # find noun phrases
    phrases = [extractor.extract(s) for s in sent_tokenizer(string)]
    phrases = [item for sublist in phrases for item in sublist]

    # find proper noun tokens, collect total/frequency for weighting/normalization
    sents = [tagger.tag(s) for s in sent_tokenizer(string)]
    sents = [item for sublist in sents for item in sublist]

    proper_nouns = []

    np_doc_len = 0

    for i, (token, tag) in enumerate(sents):
        np_doc_len += 1

        if tag == proper_noun_tag:
            proper_nouns.append((token, i))

    # find noun phrase/collocation overlap
    phrase_strings = [' '.join(x[0]).lower() for x in phrases if isinstance(x[0], list)]
    collocations = [c for c in collocations if ' '.join(c[0]) in phrase_strings]

    ranks = []

    # calculate combined index score and normalized collocation score for collocations
    coll_score_total = sum([x[1] for x in collocations])
    coll_doc_len = len(tokenizer(string))

    for coll, coll_score in collocations:
        idx = phrases[phrase_strings.index(' '.join(coll))][1]

        alpha = coll_score / coll_score_total
        beta = 1 - (float(idx) / coll_doc_len)

        score = 2 * alpha * beta / (alpha + beta)

        ranks.append((coll, score))

    # calculate combined index score and normalized term frequency score for proper nouns
    np_strings = [x[0] for x in proper_nouns]
    np_counts = Counter(np_strings)
    np_total = len(proper_nouns)

    # only normalize over the same number of proper nouns as collocations in order to keep
    # the scores roughly comparable.
    # TODO There are rarely more proper names than collocations. Handle this too.
    for np, count in sorted(np_counts.items(), key=itemgetter(1), reverse=True)[0:len(collocations)]:
        idx = proper_nouns[np_strings.index(np)][1]

        alpha = float(count) / np_total
        beta = 1 - (float(idx) / np_doc_len)

        score = 2 * alpha * beta / (alpha + beta)

        ranks.append((np, score))

    # return list of keywords and scores sorted by score
    return sorted(ranks, key=itemgetter(1), reverse=True)
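
A hypothetical invocation of extract_keywords (a sketch: PatternTagger, ConllExtractor, the 'NNP' proper-noun tag and the input file are illustrative choices for English text, not part of the original code):

from nltk import word_tokenize, sent_tokenize
from textblob.taggers import PatternTagger
from textblob.np_extractors import ConllExtractor

document_text = open('report.txt', encoding='utf-8').read()  # any plain-text document
keywords = extract_keywords(document_text,
                            tokenizer=word_tokenize,
                            sent_tokenizer=sent_tokenize,
                            tagger=PatternTagger(),
                            extractor=ConllExtractor(),
                            proper_noun_tag='NNP')
print(keywords[:10])  # top keyword/score pairs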
Example #32
    ' ', '?', '!', ',', ';', ':', '-', '--', '---', '(', ')', '{', '}', '[',
    ']', "'", '"', '.', '`', '·', '``', '~', "''"
]

# remove punctuations
BNC_filtered_words = [w for w in BNC_words if w not in filter_words]
CS_filtered_words = [w for w in CS_words if w not in filter_words]

# remove stop words
BNC_Stop_filtered_words = [
    w for w in BNC_filtered_words if w not in stop_words
]
CS_Stop_filtered_words = [w for w in CS_filtered_words if w not in stop_words]

bigram_measures = collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(BNC_Stop_filtered_words)

f = open('G_bnc_output.txt', 'w')
text = '\nChi-square Measure - '
s1 = ''.join(str(text))
f.write(str(s1))

bigrams = finder.nbest(bigram_measures.chi_sq, 20)
write_file(bigrams)

text = ' \nPMI Measure - '
s1 = ''.join(str(text))
f.write(str(s1))
bigrams = finder.nbest(bigram_measures.pmi, 20)
write_file(bigrams)
Example #33
    tokens = tokens + lineTokens
    #print(tokens, file=log_file)

# Exclude non-english words (%%%), punctuation & stopwords
for token in tokens:
    # Exclude non-word tokens
    if re.search(r'\W', token) is None:
        # Exclude stopwords
        if token not in stopwords.words('english'):
            words.append(token)  # keep all tokens except stopwords and punctuation

# Find bigram collocations
# http://www.nltk.org/howto/collocations.html
finder = BigramCollocationFinder.from_words(words)
scored = finder.score_ngrams(BigramAssocMeasures.raw_freq)

# Write two-word expressions to log file
for item in scored:
    log_file.write(str(item[1]) + "\t" + str(item[0]) + "\n")

#sorted(bigram for bigram, score in scored)
mostFrequentBigramsList = finder.nbest(BigramAssocMeasures.raw_freq, 150)
# To sort the list alphabetically, just use the sorted() function in Python.

# Write to file
for item in mostFrequentBigramsList:
    output_file1.write(str(item) + "\n")

# Find trigram collocations
Example #34
def run_wordcloud_model(
    entry_id, mode
):  # this will extract paragraph and header text from given json file and extract the topics from that

    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    print("wordcloud model started", str(data[0]['_id']), data[0]['link'])
    try:
        h_p_data = data[0]["paragraph_text"] + data[0][
            "header_text"]  # do topic extraction on paragraph and header text

        wordcloud = WordCloud(background_color="white",
                              max_words=100,
                              contour_width=3,
                              contour_color='steelblue')
        # Generate a word cloud
        data_words = list(sent_to_words(h_p_data))
        # data_words_nostops = remove_stopwords(data_words)
        data_lemmatized = lemmatization(data_words,
                                        allowed_postags=['NOUN', 'ADJ'])
        # data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ'])
        # print(data_lemmatized)
        all_tokens = [j for i in data_lemmatized for j in i]
        # print('all', all_tokens)
        all_tokens = [
            value for value in all_tokens
            if (value != 'other' and value != 'day' and value != 'thing'
                and value != 'last')
        ]

        if mode == 'single':
            combined_text = " ".join(all_tokens)
        else:
            if mode == 'bi':
                finder = BigramCollocationFinder.from_words(all_tokens)
                bigram_measures = BigramAssocMeasures()
                scored = finder.score_ngrams(bigram_measures.raw_freq)
            if mode == 'tri':
                # print(combined_text)
                # setup and score the bigrams using the raw frequency.
                finder = TrigramCollocationFinder.from_words(all_tokens)
                trigram_measures = TrigramAssocMeasures()
                scored = finder.score_ngrams(trigram_measures.raw_freq)

            # print(scored)

            # By default finder.score_ngrams is sorted, however don't rely on this default behavior.
            # Sort highest to lowest based on the score.
            scoredList = sorted(scored, key=itemgetter(1), reverse=True)
            # print('sclist',scoredList)
            # word_dict is the dictionary we'll use for the word cloud.
            # Load dictionary with the FOR loop below.
            # The dictionary will look like this with the bigram and the score from above.
            # word_dict = {'bigram A': 0.000697411,
            #             'bigram B': 0.000524882}

            word_dict = {}
            listLen = len(scoredList)
            # Get the bigram and make a contiguous string for the dictionary key.
            # Set the key to the scored value.
            for i in range(listLen - 1):
                word_dict['_'.join(scoredList[i][0])] = scoredList[i][1]
            # print('dic',word_dict)

        if mode == 'single':
            wordcloud.generate(combined_text)
        else:
            wordcloud.generate_from_frequencies(word_dict)
    except Exception:
        print("cannot make word cloud for empty text")
        mycol.update_one({'_id': entry_id},
                         {'$set': {
                             'wordcloud_results_' + mode: []
                         }})
        print("vocabulary is empty")
        return "Vocabulary is empty"

    # Visualize the word cloud
    wordcloud.to_image()
    wordcloud_words = []

    word_cloud_results = []
    for each_res in wordcloud.words_:

        word_cloud_results.append([each_res, wordcloud.words_[each_res]])
        wordcloud_words.append(each_res)
    # print('words', wordcloud_words)
    # plt.imshow(wordcloud, interpolation='bilinear')
    # plt.axis("off")
    # plt.savefig(name_j[:-4]+"png")
    # plt.show()
    print(word_cloud_results)
    # return wordcloud_words

    mycol.update_one(
        {'_id': entry_id},
        {'$set': {
            'wordcloud_results_' + mode: word_cloud_results
        }})
    print("Successfully extended the data entry with wordcloud results",
          entry_id)


# run_wordcloud_model("F://Armitage_project//crawl_n_depth//extracted_json_files//3_www.hydroterra.com.au_data.json")
Example #35
def get_wc_results(text,mode):
    try:
        h_p_data = text  # do topic extraction on paragraph and header text

        wordcloud = WordCloud(background_color="white", max_words=100, contour_width=3, contour_color='steelblue')
        # Generate a word cloud
        data_words = list(sent_to_words(h_p_data))
        # data_words_nostops = remove_stopwords(data_words)
        data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ'])
        # data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ'])
        # print(data_lemmatized)
        all_tokens = [j for i in data_lemmatized for j in i]
        # print('all', all_tokens)
        all_tokens = [value for value in all_tokens if
                      (value != 'other' and value != 'day' and value != 'thing' and value != 'last')]


        if mode == 'single':
            combined_text = " ".join(all_tokens)
        else:
            if mode=='bi':

                finder = BigramCollocationFinder.from_words(all_tokens)
                bigram_measures = BigramAssocMeasures()
                scored = finder.score_ngrams(bigram_measures.raw_freq)
            if mode =='tri':
                # print(combined_text)
                # setup and score the bigrams using the raw frequency.
                finder = TrigramCollocationFinder.from_words(all_tokens)
                trigram_measures = TrigramAssocMeasures()
                scored = finder.score_ngrams(trigram_measures.raw_freq)

            # print(scored)

            # By default finder.score_ngrams is sorted, however don't rely on this default behavior.
            # Sort highest to lowest based on the score.
            scoredList = sorted(scored, key=itemgetter(1), reverse=True)
            # print('sclist',scoredList)
            # word_dict is the dictionary we'll use for the word cloud.
            # Load dictionary with the FOR loop below.
            # The dictionary will look like this with the bigram and the score from above.
            # word_dict = {'bigram A': 0.000697411,
            #             'bigram B': 0.000524882}

            word_dict = {}
            listLen = len(scoredList)
            # Get the bigram and make a contiguous string for the dictionary key.
            # Set the key to the scored value.
            for i in range(listLen-1):
                word_dict['_'.join(scoredList[i][0])] = scoredList[i][1]
            print('dic',word_dict)

        if mode=='single':
            wordcloud.generate(combined_text)
        else:
            wordcloud.generate_from_frequencies(word_dict)
    except Exception:
        print("cannot make word cloud for empty text")
        return []



    # Visualize the word cloud
    wordcloud.to_image()
    wordcloud_words = []

    word_cloud_results = []
    for each_res in wordcloud.words_:

        word_cloud_results.append([each_res,wordcloud.words_[each_res]])
        wordcloud_words.append(each_res)
    # print('words', wordcloud_words)
    # plt.imshow(wordcloud, interpolation='bilinear')
    # plt.axis("off")
    # plt.savefig(name_j[:-4]+"png")
    # plt.show()
    print(word_cloud_results)
    return word_cloud_results