Example #1
def create_word_scores():
    posWords = pickle.load(open('pos_review.pkl', 'rb'))
    negWords = pickle.load(open('neg_review.pkl', 'rb'))

    posWords = list(itertools.chain(*posWords))  # flatten the nested list of reviews into one list of words
    negWords = list(itertools.chain(*negWords))  # same for the negative reviews

    word_fd = FreqDist()  # frequency distribution over all words
    cond_word_fd = ConditionalFreqDist()  # word frequencies conditioned on positive vs. negative texts
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()  # total number of word tokens in positive texts
    neg_word_count = cond_word_fd['neg'].N()  # total number of word tokens in negative texts
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count),
                                               total_word_count)  # chi-square score of the word for the positive class; other measures (e.g. PMI) could be used instead
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count),
                                               total_word_count)  # same for the negative class
        word_scores[word] = pos_score + neg_score  # a word's informativeness is the sum of its positive and negative chi-square scores

    return word_scores  # maps each word to its informativeness score
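The scores returned above are typically used to keep only the top-N most informative words before building classifier features. A minimal usage sketch (the helper names and the cutoff of 1000 are illustrative assumptions, not part of the example):

def find_best_words(word_scores, number):
    # keep the N words with the highest chi-square informativeness
    best_vals = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:number]
    return set(w for w, s in best_vals)

def best_word_features(words, best_words):
    # NLTK-style feature dict: mark only the informative words present in this document
    return {word: True for word in words if word in best_words}

# word_scores = create_word_scores()
# best_words = find_best_words(word_scores, 1000)
# features = best_word_features(['good', 'worth', 'recommend'], best_words)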
Example #2
def create_word_scores():
    posWords = pickle.load(open('pos_review.pkl', 'rb'))
    negWords = pickle.load(open('neg_review.pkl', 'rb'))

    posWords = list(itertools.chain(*posWords))  # flatten the nested list of reviews into one list of words
    negWords = list(itertools.chain(*negWords))  # same for the negative reviews

    word_fd = FreqDist()  # frequency distribution over all words
    cond_word_fd = ConditionalFreqDist()  # word frequencies conditioned on positive vs. negative texts
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()  # total number of word tokens in positive texts
    neg_word_count = cond_word_fd['neg'].N()  # total number of word tokens in negative texts
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count),
            total_word_count)  # chi-square score of the word for the positive class; other measures (e.g. PMI) could be used instead
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)  # same for the negative class
        word_scores[word] = pos_score + neg_score  # a word's informativeness is the sum of its two chi-square scores

    return word_scores  # maps each word to its informativeness score
Example #3
def create_word_bigram_scores():
    posdata = pickle.load(open('pos_review.pkl', 'rb'))
    negdata = pickle.load(open('neg_review.pkl', 'rb'))

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    bigram_finder_pos = BigramCollocationFinder.from_words(posWords)
    bigram_finder_neg = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finder_pos.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = bigram_finder_neg.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams  # unigrams plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
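Since the scored keys here are a mix of single words and bigram tuples, a downstream feature extractor has to look items up in both forms. A hedged sketch (best_words is assumed to be the top-scoring keys of word_scores, selected as in the earlier examples):

from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

def best_word_bigram_features(words, best_words, n_bigrams=200):
    # illustrative helper: unigrams plus the document's own top bigrams, filtered by best_words
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, n_bigrams)
    return {item: True for item in words + bigrams if item in best_words}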
Example #4
def bestWords():
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    reviews = product_reviews_1.reviews()
    reviewlines = []
    for review in reviews:
        for line in review.review_lines:
            reviewlines.append(line)

    featlines = [line for line in reviewlines if len(line.features) > 0]
    pluswords = []
    minuswords = []
    for line in featlines:
        plus = False
        minus = False
        for feat in line.features:
            if feat[1][0] == "+":
                plus = True
            elif feat[1][0] == "-":
                minus = True
        if plus:
            for word in line.sent:
                pluswords.append(word)
        if minus:
            for word in line.sent:
                minuswords.append(word)

    for word in pluswords:
        word_fd[word.lower()] += 1
        label_word_fd['+'][word.lower()] += 1

    for word in minuswords:
        word_fd[word.lower()] += 1
        label_word_fd['-'][word.lower()] += 1

    pos_word_count = label_word_fd['+'].N()
    neg_word_count = label_word_fd['-'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}

    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['+'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['-'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=(lambda s: s[1]), reverse=True)[:515]
    return set([w for w, s in best])
Example #5
def find_1000_best_words(pos_tweet_words, neg_tweet_words, stop_words_filter, bigram_collocation_check):
    # initialise the frequency distributions (left implicit/global in the original snippet)
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for tweet in pos_tweet_words:
        tweet_words = tweet[0]
        all_words = []
        words = tweet_words  # fallback so 'words' is defined even when the stop-word filter is off
        if bigram_collocation_check:
            bigrams = best_bigram_word_feats(tweet_words)  # computed but not used further in this snippet
        if stop_words_filter:
            words = SentiUtil.stopword_filtered_word_feats(tweet_words)
        all_words.extend(words)
        for word in all_words:
            word_fd[word.lower()] += 1
            label_word_fd['positive'][word.lower()] += 1

    for tweet in neg_tweet_words:
        tweet_words = tweet[0]
        all_words = []
        words = tweet_words
        if bigram_collocation_check:
            bigrams = best_bigram_word_feats(tweet_words)
        if stop_words_filter:
            words = SentiUtil.stopword_filtered_word_feats(tweet_words)
        all_words.extend(words)
        for word in all_words:
            word_fd[word.lower()] += 1
            label_word_fd['negative'][word.lower()] += 1

    pos_word_count = label_word_fd['positive'].N()
    neg_word_count = label_word_fd['negative'].N()
    total_word_count = pos_word_count + neg_word_count

#    print("Pos word count  : ", pos_word_count)
#    print("Neg word count  : ", neg_word_count)
#    print("Total word count : ", total_word_count)

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['positive'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['negative'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    # note: despite the function name, this keeps the top 10,000 scoring words
    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:10000]
    bestwords = set(w for w, s in best)
    return bestwords
Example #6
    def bigrams(self) -> List[str]:
        """
        Returns a list of bigrams; the two words inside each bigram are joined
        with an underscore. Returns up to 50 bigrams ranked by PMI.
        """
        finder = BigramCollocationFinder.from_words(self.tokens)
        return [
            f"{word_0}_{word_1}"
            for word_0, word_1 in finder.nbest(BigramAssocMeasures().pmi, 50)
        ]
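A standalone sketch of the same call pattern, assuming a plain token list instead of the self.tokens attribute used above:

from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

tokens = "the quick brown fox jumps over the lazy dog".split()
finder = BigramCollocationFinder.from_words(tokens)
# up to 50 bigrams ranked by PMI, joined with an underscore as in the method above
top_bigrams = [f"{w0}_{w1}" for w0, w1 in finder.nbest(BigramAssocMeasures().pmi, 50)]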
Example #7
def create_word_bigram_scores():
    posdata = pickle.load(open('pos_review.pkl', 'rb'))
    negdata = pickle.load(open('neg_review.pkl', 'rb'))

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    bigram_finder_pos = BigramCollocationFinder.from_words(posWords)
    bigram_finder_neg = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finder_pos.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = bigram_finder_neg.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams  # unigrams plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Example #8
def create_word_scores():
    tweets = get_tweets_from_db()
    postweets = tweets[800001:]
    negtweets = tweets[:800001]
 
    posWords = []
    negWords = []
    for tweet in postweets:
        posWords.append(tweet[0])
    for tweet in negtweets:
        negWords.append(tweet[0])

    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()

    for word in posWords:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1
    for word in negWords:
        word_fd[word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Example #9
def grams():
    from nltk import BigramCollocationFinder, BigramAssocMeasures, TrigramAssocMeasures, TrigramCollocationFinder
    
    
    words = get_words()
    
    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(words)
    finder.apply_freq_filter(40)
    bigrams = finder.nbest(bigram_measures.pmi, 500)
    
    trigram_measures = TrigramAssocMeasures()
    finder3 = TrigramCollocationFinder.from_words(words)
    finder3.apply_freq_filter(100)
    trigrams = finder3.nbest(trigram_measures.pmi, 300)
    combos2 = [combo2[0]+ " " + combo2[1] for combo2 in bigrams]
    combos3 = [combo3[0]+ " " + combo3[1] + " " + combo3[2] for combo3 in trigrams]
    return combos2, combos3
Example #10
def top_phrases_nltk(revs):
    """
    Find top phrases by finding collocations using nltk
    :param revs:
    :return:
    """
    revs = '.\n'.join(revs)
    bigram_measures = BigramAssocMeasures()
    tokens = word_tokenize(revs)
    finder = BigramCollocationFinder.from_words(tokens)
    finder.apply_freq_filter(3)
    finder.apply_word_filter(
        lambda w: len(w) < 2 or w in stopwords.words("english"))
    colloc = []
    for tup in finder.nbest(bigram_measures.pmi, 50):
        pos = pos_tag(tup)
        if pos[0][1].startswith(("JJ", "RB")) or pos[1][1].startswith(
            ("JJ", "RB")):
            colloc.append(tup)
    return colloc
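A possible invocation, assuming the NLTK 'punkt', 'stopwords' and 'averaged_perceptron_tagger' data have been downloaded; the sample reviews are illustrative:

reviews = [
    "The bright screen makes reading very easy",
    "A bright screen and solid battery life",
    "I love the bright screen on this phone",
]
# 'bright screen' occurs three times and contains an adjective,
# so it should survive the frequency, stop-word and POS filters
print(top_phrases_nltk(reviews))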
Example #11
def select_most_informative_features(feature_set, top_informative_features_percentile):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for feature_list, groupKey in feature_set:
        for feature in feature_list:
            word_fd[feature] += 1
            label_word_fd[groupKey][feature] += 1
    label_feature_count = dict((label, label_word_fd[label].N()) for label in label_word_fd)
    total_feature_count = sum([label_word_fd[label].N() for label in label_word_fd])
    feature_scores = {}
    for feature in word_fd:
        label_scores = {(label, BigramAssocMeasures.chi_sq(label_word_fd[label][feature],
                                                           (word_fd[feature], label_feature_count[label]),
                                                           total_feature_count)) for label in label_word_fd}
        feature_scores[feature] = sum([score for label, score in label_scores])
    top_features_count = int(round(top_informative_features_percentile * total_feature_count))
    best = sorted(feature_scores, key=lambda x: feature_scores[x], reverse=True)[:top_features_count]
    bestwords = set([w for w in best])
    filtered_feature_set = []
    for feature_list, groupKey in feature_set:
        filtered_feature_set.append((dict([(x, True) for x in feature_list if x in bestwords]), groupKey))
    return filtered_feature_set
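An illustrative call with a toy feature set; each item is a (list_of_features, label) pair and the second argument is a fraction of the total feature count to keep:

feature_set = [
    (['good', 'great', 'plot'], 'pos'),
    (['great', 'acting', 'fun'], 'pos'),
    (['bad', 'boring', 'plot'], 'neg'),
    (['awful', 'boring', 'acting'], 'neg'),
]
filtered = select_most_informative_features(feature_set, 0.5)
# each element of filtered is ({feature: True, ...}, label), restricted to the kept features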
Example #12
def collocations(text, language='english', num=0, window_size=2):  #num=20
    """
    A reimplementation of the basic workings of the collocations method of the ``Text`` class of NLTK.

    :param text: raw text
    :param language: language to use to eliminate stopwords
    :param num: number of collocations
    :param window_size: window for collocations
    :return: a list of collocations for the text
    """

    from nltk.corpus import stopwords

    ignored_words = stopwords.words(language)
    finder = BigramCollocationFinder.from_words(word_tokenize(text),
                                                window_size)
    finder.apply_freq_filter(2)
    finder.apply_word_filter(
        lambda w: len(w) < 3 or w.lower() in ignored_words)
    bigram_measures = BigramAssocMeasures()
    c = finder.nbest(bigram_measures.likelihood_ratio, num)

    collocation_strings = [w1 + ' ' + w2 for w1, w2 in c]
    return collocation_strings
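A possible call, assuming the NLTK 'punkt' and 'stopwords' data are available; note that the default num=0 would return no collocations, so a positive num must be passed:

text = ("New York is a big city. New York has many museums. "
        "People visit New York every year.")
print(collocations(text, language='english', num=5))
# likely ['New York'], since that pair occurs at least twice and neither word is a stop word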
Example #13
def get_wc_results(text,mode):
    try:
        h_p_data = text  # do topic extraction on paragraph and header text

        wordcloud = WordCloud(background_color="white", max_words=100, contour_width=3, contour_color='steelblue')
        # Generate a word cloud
        data_words = list(sent_to_words(h_p_data))
        # data_words_nostops = remove_stopwords(data_words)
        data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ'])
        # data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ'])
        # print(data_lemmatized)
        all_tokens = [j for i in data_lemmatized for j in i]
        # print('all', all_tokens)
        all_tokens = [value for value in all_tokens if
                      (value != 'other' and value != 'day' and value != 'thing' and value != 'last')]


        if mode == 'single':
            combined_text = " ".join(all_tokens)
        else:
            if mode == 'bi':
                finder = BigramCollocationFinder.from_words(all_tokens)
                bigram_measures = BigramAssocMeasures()
                scored = finder.score_ngrams(bigram_measures.raw_freq)
            if mode == 'tri':
                # print(combined_text)
                # setup and score the bigrams using the raw frequency.
                finder = TrigramCollocationFinder.from_words(all_tokens)
                trigram_measures = TrigramAssocMeasures()
                scored = finder.score_ngrams(trigram_measures.raw_freq)

            # print(scored)

            # By default finder.score_ngrams is sorted, however don't rely on this default behavior.
            # Sort highest to lowest based on the score.
            scoredList = sorted(scored, key=itemgetter(1), reverse=True)
            # print('sclist',scoredList)
            # word_dict is the dictionary we'll use for the word cloud.
            # Load dictionary with the FOR loop below.
            # The dictionary will look like this with the bigram and the score from above.
            # word_dict = {'bigram A': 0.000697411,
            #             'bigram B': 0.000524882}

            word_dict = {}
            listLen = len(scoredList)
            # Get the bigram and make a contiguous string for the dictionary key.
            # Set the key to the scored value.
            for i in range(listLen-1):
                word_dict['_'.join(scoredList[i][0])] = scoredList[i][1]
            print('dic',word_dict)

        if mode=='single':
            wordcloud.generate(combined_text)
        else:
            wordcloud.generate_from_frequencies(word_dict)
    except Exception:
        print("cannot make word cloud for empty text")
        return []



    # Visualize the word cloud
    wordcloud.to_image()
    wordcloud_words = []

    word_cloud_results = []
    for each_res in wordcloud.words_:

        word_cloud_results.append([each_res,wordcloud.words_[each_res]])
        wordcloud_words.append(each_res)
    # print('words', wordcloud_words)
    # plt.imshow(wordcloud, interpolation='bilinear')
    # plt.axis("off")
    # plt.savefig(name_j[:-4]+"png")
    # plt.show()
    print(word_cloud_results)
    return word_cloud_results
Example #14
    def collect(self, input_file, limit=None, save=None):
        """
        corllect corpus from Jodel JSON data
        :param input_file:
        :param limit:
        :param save:
        :return:
        """
        corpus = []
        words = []
        labels = []

        identities = {
            'Basel-Stadt': 'Basel',
            'Brunswick': 'Braunschweig',
            'Cologne': 'Köln',
            'Frankfurt': 'Frankfurt am Main',
            'Freiburg': 'Freiburg im Breisgau',
            'Fribourg-en-Brisgau': 'Freiburg im Breisgau',
            'Geneva': 'Genf',
            'Genève': 'Genf',
            'Hanover': 'Hannover',
            'Klagenfurt am Wörthersee': 'Klagenfurt',
            'Munich': 'München',
            'Nuremberg': 'Nürnberg',
            'Ouest lausannois': 'Lausanne',
            'Sankt Pölten': 'St. Pölten',
            'Sankt Gallen': 'St. Gallen',
            'Salzburg-Umgebung': 'Salzburg',
            'Vienna': 'Wien',
            'Zurich': 'Zürich'
        }

        self.city_frequency = defaultdict(int)

        # iterate over the data
        with open(input_file, encoding='utf-8', errors='ignore') as f:
            for line_no, line in enumerate(islice(f, None)):
                if line_no > 0:
                    if line_no % 10000 == 0:
                        print("%s" % (line_no), file=sys.stderr, flush=True)
                    elif line_no % 500 == 0:
                        print('.', file=sys.stderr, end=' ', flush=True)

                try:
                    jodel = json.loads(line)
                except ValueError:
                    print(line)
                    continue  # skip lines that are not valid JSON instead of reusing a stale 'jodel'
                msg = jodel.get('message', None)
                location = jodel.get('location', None)

                # skip empty jodels
                if msg is None or location is None:
                    continue

                city = location.get('name', None)

                if city == 'Jodel Team' or city is None:
                    continue

                # correct city names
                city = identities.get(city, city)

                self.city_frequency[city] += 1

                # collect all the data and transform it
                data = [self.clean(msg)]
                data.extend([
                    self.clean(child.get('message', []))
                    for child in jodel.get('children', [])
                ])

                # one instance for each jodel
                # corpus.extend(data)
                # labels.extend([city] * len(data))

                # one instance for each conversation
                corpus.append([word for message in data for word in message])
                labels.append(city)

                words.extend([word for message in data for word in message])

                if limit is not None and line_no == limit:
                    break

        assert len(labels) == len(
            corpus
        ), "umm, the number of labels (%s) and the number of instances (%s) is not the same" % (
            len(labels), len(corpus))

        self.int2word = {
            i: max(self.stems[w].keys(), key=(lambda key: self.stems[w][key]))
            for w, i in self.word2int.items()
        }
        # find collocations
        print('\nlooking for collocations', file=sys.stderr, flush=True)
        finder = BigramCollocationFinder.from_words(words)
        bgm = BigramAssocMeasures()
        collocations = [
            b for b, f in finder.score_ngrams(bgm.mi_like) if f > 1.0
        ]
        self.collocations = set(collocations)

        print('\ncreating corpus', file=sys.stderr, flush=True)
        if save is not None:
            self.corpus = []
            with open('%s.corpus' % save, 'w',
                      encoding='utf-8') as save_corpus:
                for doc, tag in zip(corpus, labels):
                    words = self.join_collocations(doc)
                    tags = [tag]
                    self.corpus.append(TaggedDocument(words, tags=tags))
                    save_corpus.write('%s\n' % json.dumps({
                        'words': words,
                        'tags': tags
                    }))
            print('\ncorpus saved as %s' % save, file=sys.stderr, flush=True)

            with open('%s.citycounts' % save, 'w',
                      encoding='utf-8') as save_counts:
                json.dump(dict(self.city_frequency), save_counts)

        else:
            self.corpus = [
                TaggedDocument(self.join_collocations(doc), tags=[tag])
                for doc, tag in zip(corpus, labels)
            ]

        print('\n%s instances' % len(self.corpus), file=sys.stderr, flush=True)

        # update mappings
        self.int2word = {
            i: max(self.stems[w].keys(), key=(lambda key: self.stems[w][key]))
            for w, i in self.word2int.items()
        }
        print("Found %s collocations" % (len(collocations)),
              file=sys.stderr,
              flush=True)
        for (w1, w2) in collocations[:10]:
            print('\t',
                  self.int2word[w1],
                  self.int2word[w2],
                  file=sys.stderr,
                  flush=True)
Example #15
def FeatureChoose(pos_wordlist, neg_wordlist, method=BigramAssocMeasures.chi_sq, featuregram='one', n=6000):
	pos_feature = list()
	neg_feature = list()
	pos_all_words = list()
	neg_all_words = list()
	# pos_all_feature = dict()
	# neg_all_feature = dict()
	if featuregram == 'one':
		for each in pos_wordlist:
			cur = UniGramFeature(each)
			pos_feature.append(cur)
			# pos_all_feature.update(cur)
			pos_all_words.extend(cur)
		for each in neg_wordlist:
			cur = UniGramFeature(each)
			neg_feature.append(cur)
			# neg_all_feature.update(cur)
			neg_all_words.extend(cur)
	elif featuregram == 'two':
		for each in pos_wordlist:
			cur = Mixup2Feature(each)
			pos_feature.append(cur)
			# pos_all_feature.update(cur)
			pos_all_words.extend(cur)
		for each in neg_wordlist:
			cur = Mixup2Feature(each)
			neg_feature.append(cur)
			# neg_all_feature.update(cur)
			neg_all_words.extend(cur)
	elif featuregram == 'three':
		for each in pos_wordlist:
			cur = Mixup3Feature(each)
			pos_feature.append(cur)
			# pos_all_feature.update(cur)
			pos_all_words.extend(cur)
		for each in neg_wordlist:
			cur = Mixup3Feature(each)
			neg_feature.append(cur)
			# neg_all_feature.update(cur)
			neg_all_words.extend(cur)
	else:
		return []

	fd = FreqDist()
	cfd = ConditionalFreqDist()
	for word in pos_all_words:
		fd[word] += 1
		cfd['pos'][word] += 1
	for word in neg_all_words:
		fd[word] += 1
		cfd['neg'][word] += 1
	pos_N = cfd['pos'].N()
	neg_N = cfd['neg'].N()
	N = fd.N()
	score_list = dict()
	for word, freq in fd.items():
		pos_score = BigramAssocMeasures.chi_sq(cfd['pos'][word], (freq, pos_N), N)
		neg_score = BigramAssocMeasures.chi_sq(cfd['neg'][word], (freq, neg_N), N)
		score_list[word] = pos_score + neg_score

	best_topwords = sorted(score_list.items(), key=lambda kk: kk[1], reverse=True)
	# print json.dumps(best_topwords[-100:-1], ensure_ascii=False)
	best_topwords = best_topwords[:n]
	# print json.dumps(best_topwords[:100], ensure_ascii=False)
	best_topwords = set(word for word, freq in best_topwords)
	return pos_feature, neg_feature, best_topwords
Example #16
def extract_keywords(string, tokenizer, sent_tokenizer, tagger, extractor, proper_noun_tag='SUBST_PROP'):
    """
    Implements KERA keyword extraction algorithm.

    See: https://www.ida.org/~/media/Corporate/Files/Publications/IDA_Documents/ITSD/ida-document-ns-d-4931.pdf

    Basic implementation of the procedure described in the paper.
    Probably needs some refinements in order to be more broadly effective.

    :param string: Document to analyze.
    :type string: str|unicode
    :param tokenizer: Function that returns a token segmentation as an iterable of strings given a string..
    :type tokenizer: (str|unicode) -> list[str|unicode]
    :param sent_tokenizer: Function that returns a sentence segmentation as an iterable of strings given a string.
    :type sent_tokenizer: (str|unicode) -> list[str|unicode]
    :param tagger: TextBlob compatible POS tagger. Must accept untokenized sentences.
    :type tagger: textblob.base.BaseTagger
    :param extractor: TextBlob compatible noun phrase extractor. Must accept untokenized sentences and use the same
      POS tagger which is passed as the tagger parameter.
    :type extractor: textblob.base.BaseNPExtractor
    :param proper_noun_tag: POS tag indicating proper nouns.
    :type proper_noun_tag: str|unicode
    :return: List of keyword/score tuples. Keyword may be a string or tuple of strings.
    :rtype : list[(str|unicode|(str|unicode)), float]
    """
    # find bigram collocations
    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(tokenizer(string))
    collocations = finder.score_ngrams(bigram_measures.likelihood_ratio)[0:50]

    # find noun phrases
    phrases = [extractor.extract(s) for s in sent_tokenizer(string)]
    phrases = [item for sublist in phrases for item in sublist]

    # find proper noun tokens, collect total/frequency for weighting/normalization
    sents = [tagger.tag(s) for s in sent_tokenizer(string)]
    sents = [item for sublist in sents for item in sublist]

    proper_nouns = []

    np_doc_len = 0

    for i, (token, tag) in enumerate(sents):
        np_doc_len += 1

        if tag == proper_noun_tag:
            proper_nouns.append((token, i))

    # find noun phrase/collocation overlap
    phrase_strings = [' '.join(x[0]).lower() for x in phrases if isinstance(x[0], list)]
    collocations = [c for c in collocations if ' '.join(c[0]) in phrase_strings]

    ranks = []

    # calculate combined index score and normalized collocation score for collocations
    coll_score_total = sum([x[1] for x in collocations])
    coll_doc_len = len(tokenizer(string))

    for coll, coll_score in collocations:
        idx = phrases[phrase_strings.index(' '.join(coll))][1]

        alpha = coll_score / coll_score_total
        beta = 1 - (float(idx) / coll_doc_len)

        score = 2 * alpha * beta / (alpha + beta)

        ranks.append((coll, score))

    # calculate combined index score and normalized term frequency score for proper nouns
    np_strings = [x[0] for x in proper_nouns]
    np_counts = Counter(np_strings)
    np_total = len(proper_nouns)

    # only use normalize over the same number of proper nouns as collocations in order to keep
    # the scores roughly comparable.
    # TODO There are rarely more proper names than collocations. Handle this too.
    for np, count in sorted(np_counts.items(), key=itemgetter(1), reverse=True)[0:len(collocations)]:
        idx = proper_nouns[np_strings.index(np)][1]

        alpha = float(count) / np_total
        beta = 1 - (float(idx) / np_doc_len)

        score = 2 * alpha * beta / (alpha + beta)

        ranks.append((np, score))

    # return list of keywords and scores sorted by score
    return sorted(ranks, key=itemgetter(1), reverse=True)
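The score used for both collocations and proper nouns above is the harmonic mean of alpha (frequency weight) and beta (positional weight), so it is dominated by the smaller of the two; a quick numeric check with made-up values:

alpha, beta = 0.04, 0.9
score = 2 * alpha * beta / (alpha + beta)  # ~0.0766, close to the smaller weight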
Example #17
    K_FOLDS = 10  # 10-fold crossvalidation
    CLF = LinearSVC()  # the default, non-parameter optimized linear-kernel SVM

    # Loading dataset and featurised simple Tfidf-BoW model
    corpus, y = parse_dataset(DATASET_FP)
    X, vectorizer = featurize(corpus)

    class_counts = np.asarray(np.unique(y, return_counts=True)).T.tolist()
    print(class_counts)

    print(corpus)
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True).tokenize
    tokens = tokenizer('\n'.join(corpus))
    finder = BigramCollocationFinder.from_words(tokens)
    bigram_measures = BigramAssocMeasures()
    scored = finder.score_ngrams(bigram_measures.student_t)
    sorted(bigram for bigram, score in scored)
    for bigram, score in scored[:10]:
        print(' '.join(bigram), score)  # map() is lazy in Python 3, so print explicitly

    CLF.fit(X, y)

    # Returns an array of the same size as 'y' where each entry is a prediction obtained by cross validated
    predicted = cross_val_predict(CLF, X, y, cv=K_FOLDS)

    most_informative_feature_for_binary_classification(vectorizer, CLF, n=10)

    # Modify F1-score calculation depending on the task
    if TASK.lower() == 'a':
        score = metrics.f1_score(y, predicted, pos_label=1)
    elif TASK.lower() == 'b':
Example #18
for word in pluswords:
    word_fd[word.lower()]+=1
    label_word_fd['+'][word.lower()]+=1

for word in minuswords:
    word_fd[word.lower()]+=1
    label_word_fd['-'][word.lower()]+=1

pos_word_count = label_word_fd['+'].N()
neg_word_count = label_word_fd['-'].N()
total_word_count = pos_word_count + neg_word_count

word_scores = {}

for word, freq in word_fd.items():
    pos_score = BigramAssocMeasures.chi_sq(label_word_fd['+'][word],
                                           (freq, pos_word_count), total_word_count)
    neg_score = BigramAssocMeasures.chi_sq(label_word_fd['-'][word],
                                           (freq, neg_word_count), total_word_count)
    word_scores[word] = pos_score + neg_score

# accs = []
# for x in range(0,2000,5):
#print(x)
best = sorted(word_scores.items(), key=(lambda s: s[1]), reverse=True)[:515]
bestwords = set([w for w, s in best])
# 1500: accuracy: 0.8091397849462365
# pos precision: 0.891156462585034
# pos recall: 0.7043010752688172
# neg precision: 0.7555555555555555
# neg recall: 0.9139784946236559
# 2000: accuracy: 0.803763440860215
Example #19
def run_wordcloud_model(
    entry_id, mode
):  # this will extract paragraph and header text from given json file and extract the topics from that

    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    print("wordcloud model started", str(data[0]['_id']), data[0]['link'])
    try:
        h_p_data = data[0]["paragraph_text"] + data[0][
            "header_text"]  # do topic extraction on paragraph and header text

        wordcloud = WordCloud(background_color="white",
                              max_words=100,
                              contour_width=3,
                              contour_color='steelblue')
        # Generate a word cloud
        data_words = list(sent_to_words(h_p_data))
        # data_words_nostops = remove_stopwords(data_words)
        data_lemmatized = lemmatization(data_words,
                                        allowed_postags=['NOUN', 'ADJ'])
        # data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ'])
        # print(data_lemmatized)
        all_tokens = [j for i in data_lemmatized for j in i]
        # print('all', all_tokens)
        all_tokens = [
            value for value in all_tokens
            if (value != 'other' and value != 'day' and value != 'thing'
                and value != 'last')
        ]

        if mode == 'single':
            combined_text = " ".join(all_tokens)
        else:
            if mode == 'bi':
                finder = BigramCollocationFinder.from_words(all_tokens)
                bigram_measures = BigramAssocMeasures()
                scored = finder.score_ngrams(bigram_measures.raw_freq)
            if mode == 'tri':
                # print(combined_text)
                # setup and score the bigrams using the raw frequency.
                finder = TrigramCollocationFinder.from_words(all_tokens)
                trigram_measures = TrigramAssocMeasures()
                scored = finder.score_ngrams(trigram_measures.raw_freq)

            # print(scored)

            # By default finder.score_ngrams is sorted, however don't rely on this default behavior.
            # Sort highest to lowest based on the score.
            scoredList = sorted(scored, key=itemgetter(1), reverse=True)
            # print('sclist',scoredList)
            # word_dict is the dictionary we'll use for the word cloud.
            # Load dictionary with the FOR loop below.
            # The dictionary will look like this with the bigram and the score from above.
            # word_dict = {'bigram A': 0.000697411,
            #             'bigram B': 0.000524882}

            word_dict = {}
            listLen = len(scoredList)
            # Get the bigram and make a contiguous string for the dictionary key.
            # Set the key to the scored value.
            for i in range(listLen - 1):
                word_dict['_'.join(scoredList[i][0])] = scoredList[i][1]
            # print('dic',word_dict)

        if mode == 'single':
            wordcloud.generate(combined_text)
        else:
            wordcloud.generate_from_frequencies(word_dict)
    except Exception:
        print("cannot make word cloud for empty text")
        mycol.update_one({'_id': entry_id},
                         {'$set': {
                             'wordcloud_results_' + mode: []
                         }})
        print("vocabulary is empty")
        return "Vocabulary is empty"

    # Visualize the word cloud
    wordcloud.to_image()
    wordcloud_words = []

    word_cloud_results = []
    for each_res in wordcloud.words_:

        word_cloud_results.append([each_res, wordcloud.words_[each_res]])
        wordcloud_words.append(each_res)
    # print('words', wordcloud_words)
    # plt.imshow(wordcloud, interpolation='bilinear')
    # plt.axis("off")
    # plt.savefig(name_j[:-4]+"png")
    # plt.show()
    print(word_cloud_results)
    # return wordcloud_words

    mycol.update_one(
        {'_id': entry_id},
        {'$set': {
            'wordcloud_results_' + mode: word_cloud_results
        }})
    print("Successfully extended the data entry with wordcloud results",
          entry_id)


# run_wordcloud_model("F://Armitage_project//crawl_n_depth//extracted_json_files//3_www.hydroterra.com.au_data.json")