Example #1
    def __freqs_dict(self, raw_text):

        t_start = time()
        print('Making filtered text...')

        stopset = set(stopwords.words('russian'))
        ad = AlphabetDetector()

        tokens = word_tokenize(raw_text)
        tokens_filtered = [w.lower() for w in tokens
                           if w not in stopset
                           and w not in self.__custom_stopwords
                           and w.isalpha()
                           and len(w) >= self.__min_word_len
                           and ad.is_cyrillic(w)]


        freqs_tokenized_text = FreqDist(tokens_filtered)
        freqs_most_common = OrderedDict(freqs_tokenized_text.most_common(self.__max_words))

        res_text = ''
        for item in freqs_most_common.items():
            word = item[0]
            freq = item[1]
            for i in range(freq):
                res_text += word + ' '

        t_end = time()
        print("TIME = %.2f s" % (t_end - t_start))

        return res_text
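
A more compact way to rebuild the repeated-word string from the truncated frequency dict (a sketch, assuming the same freqs_most_common as above; the result matches the loop apart from the trailing space):

res_text = ' '.join(word for word, freq in freqs_most_common.items() for _ in range(freq))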
Example #2
def answer_six():

    from nltk.book import FreqDist
    dictionary = FreqDist(moby_tokens)
    sorted_dict = sorted(dictionary.items(), key=lambda x: x[1], reverse=True)
    result = [(v, k) for k, v in sorted_dict if v > 2000 and k.isalpha()]
    return result
Example #3
def answer_six():
    words = [word for word in moby_tokens if word.isalpha()]
    dist = FreqDist(words)
    s = pd.Series(data=dist)
    s = s.sort_values(ascending=False)
    s = s.where(s > 2000).dropna()
    return list(zip(s, s.index))[:20]
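
A slightly tighter variant of the same answer (a sketch, assuming moby_tokens, pandas as pd and FreqDist are in scope); boolean indexing keeps the integer counts, whereas where(...).dropna() casts them to float:

def answer_six():
    words = [w for w in moby_tokens if w.isalpha()]
    s = pd.Series(FreqDist(words)).sort_values(ascending=False)
    s = s[s > 2000]
    return list(zip(s, s.index))[:20]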
Example #4
def extract_monthly_user_CV_and_num_tweets(user_list):

    user_monthly_tweets = defaultdict(int)
    cnt_all_tweets = 0

    with codecs.open(F_IN, 'r', encoding='utf8') as input_file:
        # the code loops through the input, collects tweets text for each user into a dict
        for line in input_file:
            cnt_all_tweets += 1
            line = line.split()
            user = line[0]
            #if not user in user_list:
            #	continue
            if user not in user_monthly_tweets:
                user_monthly_tweets[user] = defaultdict(list)
            UTS = long(line[4])
            month = datetime.datetime.utcfromtimestamp(UTS).month
            tweet = line[5:]
            user_monthly_tweets[user][month] += clean(tweet)
            if cnt_all_tweets % 100000 == 0:
                print tweet, clean(tweet)
    print "Processed %d tweets" % cnt_all_tweets

    for user in user_monthly_tweets:
        for MO in user_monthly_tweets[user]:
            output_file = f_out_list[MO]
            usr_tweets_json = {}
            usr_tweets_json['_id'] = str(user)
            usr_tweets_json['txt'] = [{
                el[0]: el[1]
            } for el in FreqDist(user_monthly_tweets[user][MO]).iteritems()]
            output_file.write(
                unicode(json.dumps(usr_tweets_json, ensure_ascii=False)) +
                '\n')
Example #5
def answer_five():
    from nltk.book import FreqDist
    dictionary = FreqDist(moby_tokens)
    sorted_dict = sorted(dictionary.items(),
                         key=lambda x: len(x[0]),
                         reverse=True)
    w, _ = sorted_dict[0]
    return (w, len(w))
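
The frequency counts are not actually needed for this one; a sketch that takes the longest token directly from the vocabulary (assuming the same moby_tokens):

def answer_five():
    # max over the unique tokens, compared by length
    longest = max(set(moby_tokens), key=len)
    return (longest, len(longest))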
Example #6
def answer_eight():
    from nltk.book import FreqDist

    part_of_speech = nltk.pos_tag(moby_tokens)
    tags = [t for _, t in part_of_speech]
    dictionary = FreqDist(tags)
    sorted_dict = sorted(dictionary.items(), key=lambda x: x[1], reverse=True)
    return sorted_dict[:5]
Example #7
def word_frenquency_static(data, rate):  # count the word distribution for each rate value
    texts = list(data[data['rate'] == rate]['review'])  # select the reviews with the given rate
    wordlist = []
    for text in texts:
        text = text.lower().strip().split()
        wordlist.extend(text)
    fdist = FreqDist(wordlist)
    return fdist
Example #8
def answer_four():
    from nltk.book import FreqDist
    token_dict = FreqDist(moby_tokens)
    new_list = []
    for val in token_dict.keys():
        if len(val) > 5 and token_dict[val] > 150:
            new_list.append(val)
    new_list.sort()
    return new_list  # Your answer here
Example #9
def answer_three():

    from nltk.book import FreqDist
    import operator
    token_dict = FreqDist(moby_tokens)
    sorted_token_dict = sorted(token_dict.items(), key=operator.itemgetter(1))
    lst = sorted_token_dict[-20:]
    lst.reverse()
    return lst
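
FreqDist.most_common(n) already returns the n (token, count) pairs sorted by descending count, so the manual sort can be dropped; a sketch assuming the same moby_tokens:

def answer_three():
    from nltk.book import FreqDist
    # the 20 most frequent tokens, already in descending-frequency order
    return FreqDist(moby_tokens).most_common(20)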
Example #10
def answer_four():

    from nltk.book import FreqDist
    token_dict = FreqDist(moby_tokens)
    res_lis = []
    for w in token_dict.keys():
        if len(w) > 5 and token_dict[w] > 150:
            res_lis.append(w)
    res_lis.sort()
    return res_lis
Example #11
def get_most_common_words(string,
                          common_words_number=settings.COMMON_WORDS_NUMBER):
    """
    * Sanitizes and removes all stop words
    * Returns a list with the most common words
    """
    cleaned_data = remove_stop_words(sanitize_string(string))
    common_words = FreqDist(cleaned_data).most_common(common_words_number)

    return [i[0] for i in common_words]
Example #12
def get_most_common_words(string, common_words_number=3):
    """ 
    * Sanitizes and removes all stop words
    * Returns a list with the most common words
    """
    cleaned_data = sanitize_string(string)
    cleaned_data = remove_stop_words(cleaned_data)
    common_words = FreqDist(cleaned_data).most_common(common_words_number)

    return [i[0] for i in common_words]
Example #13
def answer_three():

    from nltk.book import FreqDist
    import operator

    token_dict = FreqDist(moby_tokens)
    sorted_token_dict = sorted(token_dict.items(),
                               key=operator.itemgetter(1),
                               reverse=True)

    return sorted_token_dict[0:20]  # Your answer here
Example #14
def answer_five():
    from nltk.book import FreqDist
    token_dict = FreqDist(moby_tokens)
    max_len = 0
    for w in token_dict.keys():
        if len(w) > max_len:
            max_word = w
            max_len = len(w)
    tups = (max_word, max_len)

    return tups  # Your answer here
Example #15
def answer_six():
    import operator
    from nltk.book import FreqDist
    token_dict = FreqDist(moby_tokens)
    res_lis = {}
    for w in token_dict.keys():
        if w.isalpha() and token_dict[w] > 2000:
            res_lis[w] = token_dict[w]
    sorted_res_list = sorted(res_lis.items(), key=operator.itemgetter(1))
    sorted_res_list.reverse()
    result = [(f, w) for w, f in sorted_res_list]
    return result
Example #16
def answer_six():
    import operator
    from nltk.book import FreqDist
    token_dict = FreqDist(moby_tokens)
    liss = {}
    for w in token_dict.keys():
        if w.isalpha() and token_dict[w] > 2000:
            liss[w] = token_dict[w]
    sortedlist = sorted(liss.items(), key=operator.itemgetter(1), reverse=True)
    finale = [(f, w) for w, f in sortedlist]

    return finale  # Your answer here
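
The same idea without the intermediate dict, again relying on most_common() preserving descending-frequency order (a sketch, assuming moby_tokens and FreqDist are available):

def answer_six():
    fd = FreqDist(moby_tokens)
    # keep alphabetic tokens with more than 2000 occurrences, as (count, word) pairs
    return [(count, word) for word, count in fd.most_common()
            if word.isalpha() and count > 2000]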
Example #17
def extract_phrases(corpus,
                    requested_terms=None,
                    term_freq_threshold=1,
                    spell_correction=False):
    bigram_docs = extract_bigrams(corpus, spell_correction)

    # consolidate the bigrams of the whole corpus; the per-document bigram lists are
    # kept as well, since they are needed in further processing
    corpus_bigram = list(chain(*bigram_docs))

    # freq of all corpus
    corpus_freq = dict(FreqDist(corpus_bigram).items())
    corpus_bigram = None

    # stitch related phrases in the corpus together
    corpus_freq = stitch_related_phrases(bigram_docs, corpus_freq,
                                         term_freq_threshold)
    bigram_docs = None

    # sort the corpus in descending order of the frequency
    corpus_freq = sorted(corpus_freq.items(), key=lambda x: x[1], reverse=True)
    corpus_freq = dict(corpus_freq)

    # initialize stop words
    stop_words = nltk.corpus.stopwords.words('english')
    stop_words.extend(" ".join(stop_words).title().split())
    stop_words.extend(string.punctuation)
    stop_words = set(stop_words)

    phrases = list(corpus_freq)
    corpus_freq = None

    # remove stop words
    phrases = list(map(lambda x: set(x).difference(stop_words), phrases))
    phrases = set(map(lambda x: tuple(x), phrases))

    if () in phrases:
        phrases.remove(())

    phrases = list(phrases)

    if requested_terms is not None:
        phrases = phrases[:requested_terms]

    # find nouns
    annotated_data = list(
        map(lambda x: nltk.pos_tag([*x]) if len(x) > 1 else 'NA', phrases))
    noun_terms = list(
        filter(lambda x: x is not None and len(x) > 1,
               map(lambda x: filter_nouns(x), annotated_data)))

    #.translate(translator)
    return phrases, noun_terms
Example #18
def exercise1():
    print("Part a")
    answer = {}
    plural_nouns = [
        w.lower() for (w, t) in brown.tagged_words() if t.startswith('NNS')
    ]  #Find the Plural Nouns
    lemmatizer = nltk.WordNetLemmatizer()
    lemmatized_singular_nouns = [
        lemmatizer.lemmatize(w) for w in set(plural_nouns)
    ]
    singular_nouns = [
        w.lower() for (w, t) in brown.tagged_words() if t.startswith('NN')
        if w in lemmatized_singular_nouns
    ]
    fdist_pnouns = (FreqDist(plural_nouns))
    fdist_lsnouns = (FreqDist(singular_nouns))
    for (singular_word, singular_count) in fdist_lsnouns.most_common(100):
        for (word, count) in fdist_pnouns.most_common(100):
            if (singular_word == lemmatizer.lemmatize(word)
                    and count > singular_count):
                answer[word] = count
                #print("Singular form:%s,Plural Form:%s,Count_Singular:%s,Count_Plural:%s"%(singular_word, word, singular_count, count))
    print(
        sorted(answer.items(), key=operator.itemgetter(1), reverse=True)[0:5])
    print("Part b")
    tags = [t for (w, t) in brown.tagged_words()]
    fdist_tags = (FreqDist(tags))
    print(fdist_tags.most_common(5))
    print("Part c")
    for genre in ['humor', 'romance', 'government']:
        print("The genre is:", genre)
        preceeding_tags = []
        four_grams = list(ngrams(brown.tagged_words(categories=genre), 4))
        for my_list in four_grams:
            if len(my_list) == 4 and my_list[3][1].startswith('NN'):
                preceeding_tags.append(my_list[0][1] + " " + my_list[1][1] +
                                       " " + my_list[2][1])
        print("The most common tags which preceed NN are:")
        fdist_mctags = (FreqDist(preceeding_tags))
        print(fdist_mctags.most_common(5))
Example #19
def user_tweet_text_2_mongo(f_in, f_out):

    user_tweets = collect_tweet_text_per_user(f_in)
    output_file = codecs.open(f_out, 'w', encoding='utf8')
    for usr in user_tweets.iterkeys():
        usr_tweets_json = {}
        usr_tweets_json['_id'] = str(usr)
        usr_tweets_json['txt'] = [{
            el[0]: el[1]
        } for el in FreqDist(user_tweets[usr]).iteritems()]
        output_file.write(
            unicode(json.dumps(usr_tweets_json, ensure_ascii=False)) + '\n')
        #print usr_tweets_json

    print "User with most words had: ", len(max(user_tweets.values(), key=len))
Example #20
def answer_four():
    from nltk.book import FreqDist
    dictionary = FreqDist(moby_tokens)
    freqwords = [w for w in dictionary if len(w) > 5 and dictionary[w] > 150]

    return sorted(freqwords)
Example #21
def answer_three():
    from nltk.book import FreqDist
    dictionary = FreqDist(moby_tokens)
    sorted_dict = sorted(dictionary.items(), key=lambda x: x[1], reverse=True)
    return sorted_dict[:20]
Example #22
import re
import collections

import nltk
from nltk import FreqDist
from nltk.tag import pos_tag

# t is assumed to hold the raw input text read in earlier
s = pos_tag(nltk.word_tokenize(t))
print(s)

# removing verbs from the input text
file_without_verbs = [word for word, tag in s if tag != 'VBG' and tag != 'VBZ' and tag != 'VBN']
z = ' '.join(file_without_verbs)       # z is the text without those verb forms
print(z)
s1 = pos_tag(nltk.word_tokenize(z))
print(s1)                              # the tagged output shows the verbs have been removed

# frequency distribution of the remaining words
# (FreqDist on the raw string z would count characters, so tokenize first)
fdist = FreqDist(nltk.word_tokenize(z))
print(fdist)
q=fdist.most_common(5)
print(q)

#word frequency of remaining words
def tokens(text):
    """
    Get all words from the corpus
    """
    return re.findall('[a-z]+', text.lower())
WORD_COUNTS = collections.Counter(tokens(z))
print (WORD_COUNTS)
print (WORD_COUNTS.most_common(5))
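
Since FreqDist subclasses collections.Counter, the two counts above are interchangeable on the same token list; a small sketch using the tokens() helper defined above:

fdist_words = FreqDist(tokens(z))
print(fdist_words.most_common(5))   # the same counts as WORD_COUNTS.most_common(5)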

Example #23
def main():

    # Parsing user input
    parser = ap.ArgumentParser()
    parser.add_argument('-i',
                        '--input',
                        nargs='?',
                        type=str,
                        required=True,
                        help='Input filename.')
    parser.add_argument('-c',
                        '--concordance',
                        nargs='?',
                        type=str,
                        default=None,
                        help='Word concordance.')
    parser.add_argument('-d',
                        '--dispersion',
                        nargs='*',
                        type=str,
                        default=None,
                        help='Word dispersion.')
    parser.add_argument('-f',
                        '--frequency',
                        nargs='?',
                        type=int,
                        default=None,
                        help='Word frequency.')
    parser.add_argument('-a',
                        '--acro',
                        action='store_true',
                        help='Acronyms only.')
    args = parser.parse_args()

    with open(args.input, 'r') as f:
        plain = f.read()

    plain = remove_comments(plain)

    words = nltk.word_tokenize(plain)

    if args.acro:
        words = [w for w in words if is_acro(w)]

    print '%d unique words out of %d total words.' % (len(
        set(words)), len(words))

    text = nltk.Text(words)

    if args.concordance is not None:
        text.concordance(args.concordance)
        return

    if args.dispersion is not None:
        text.dispersion_plot(args.dispersion)
        return

    if args.frequency is not None:
        freq = FreqDist(text)
        for i, f in enumerate(freq.most_common(args.frequency)):
            print '%9d%9d %s' % (i, f[1], f[0])
        freq.plot(args.frequency)
Example #24
     for classifier in label:
         g.write(str(classifier) + '\n')
 else:
     for line in g:
         label.append(int(line.strip('\n')))
 g.close()
 big_feature_list = []
 for feat in features:
     big_feature_list.append(' '.join(word for word in feat))
 big_feature_list, label = shuffle(big_feature_list, label, random_state=5)
 label = label.tolist()
 #data = [features, opening_prices]
 corpus = []
 for document in big_feature_list.tolist():
     corpus += document.split()
 fdist = FreqDist(corpus)
 #print(fdist.most_common(200))
 #s = open('stopwords.txt' , 'w')
 #for pair in fdist.most_common(500):
 #    s.write(str(pair[0])+'\n')
 #s.close()
 xform = count_vect.fit_transform(big_feature_list)
 xform = preprocessing.scale(xform, with_mean=False)
 #tform = count_vect.fit_transform(opening_prices)
 #xform = hstack([xform, tform])
 #xform = coo_matrix.tocsr(xform)
 f.close()
 e.close()
 #print(xform)
 #        V = 12
 M = count // V
Example #25
def frequency_distribution(text):
    # TODO: is there a better way to get a freqdist that doesn't involve import nltk.book?
    from nltk.book import FreqDist

    return FreqDist(text)
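
Regarding the TODO above: FreqDist lives in nltk.probability and is re-exported at the package top level, so the heavyweight nltk.book import (which loads all the example texts) can be avoided. A minimal sketch:

from nltk import FreqDist   # or: from nltk.probability import FreqDist

def frequency_distribution(text):
    return FreqDist(text)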
Example #26
	# Word Tokenization

	word_lists = []
	for sentences in df['sentence_tokenize']:
	    # collect the words of every sentence in the row, not just the last one
	    words = []
	    for sentence in sentences:
	        words.extend(word_tokenize(sentence))
	    word_lists.append(words)
	df['word_tokenize'] = word_lists

	# Saving word tokenization to file
	df.to_csv(path+str(j)+'_word.csv', index=False)
	print('Saved file to disk.')

	for word_list in word_lists:
	    freq = FreqDist(word_list)
	    # freq.plot(10)

	# aggregate frequency distribution over the words of all documents
	freq = FreqDist(word for word_list in word_lists for word in word_list)
	freq_dict = dict(freq)
	freq_words = list(freq_dict.keys())
	frequencies = list(freq_dict.values())
	freq_df = pd.DataFrame(list(zip(freq_words, frequencies)), columns=['Word','Freq'])
	freq_df.to_csv(path+str(j)+'_frequency.csv', index=False)
	print('Saved file to disk.')

	# Stopwords and Non Stopwords

	stop = []
	non_stopwords = []
	for word_list in word_lists:
Example #27
    if (data['Abstract'][i] != 'N'):
        s = s + ' ' + data['Abstract'][i]
    else:
        data['Abstract'][i] = ''

# In[415]:

s = re.sub('[^A-Za-z0-9 ]+', '', s)
#calculate frequency of word using nltk library
from nltk.book import FreqDist
#newlist is list of all words
newlist = s.split()
#normalize capitals- The and the are same
for i in range(len(newlist)):
    newlist[i] = newlist[i].lower()
fdist = FreqDist(newlist)
#extracting most common i.e. most frequent words
remlist = fdist.most_common(10)
# print(remlist)
rem = remlist
for i in range(0, 10):
    rem[i] = remlist[i][0]
# print(rem)

# TASK 2-

# In[416]:

#this will remove frequent 10 words from whole corpus
newlist = [w for w in newlist if w.lower() not in rem]
Example #28
    def most_freq_words(self, text, number):
        word_freq = FreqDist(text)
        words_counts = word_freq.most_common(number)
        words = [pair[0] for pair in words_counts]
        return words
Example #29
# ◑ Write a function that takes a list of words (containing duplicates) and returns a list of words (with no duplicates) sorted by decreasing frequency. E.g. if the input list contained 10 instances of the word table and 9 instances of the word chair, then table would appear before chair in the output list.

from nltk.book import FreqDist


words = ['this', 'is', 'my', 'list', 'of', 'list', 'of', 'list', 'is', 'this', 'of', 'list', 'of', 'list', 'of', 'list', 'of', 'list', 'of', 'words']

fdist = FreqDist(words)
# most_common() with no argument returns every (word, count) pair already sorted
# by decreasing frequency, which is exactly the order the exercise asks for
answer = [word for word, count in fdist.most_common()]
print(answer)
Example #30
rate_unique.sort()
rate_unique = list(rate_unique)

# for each rate, count how often every word occurs, plus an "all" version
for i in rate_unique:
    path = "../data/wordFrequency/word_" + str(i) + ".json"
    with open(path, "w", encoding="utf-8") as f:
        json.dump(word_frenquency_static(data, i), f)

# word_all: frequencies over every review, regardless of rate
texts = list(data['review'])  # all reviews
wordlist = []
for text in texts:
    text = text.lower().strip().split()
    wordlist.extend(text)
fdist_all = FreqDist(wordlist)
path = "../data/wordFrequency/word_all.json"
with open(path, "w", encoding="utf-8") as f:
    json.dump(fdist_all, f)

# open the saved file and try reading it back
f = open(path, 'r')
a = f.read()
dict_fdist_all = json.loads(a)
print('ok', dict_fdist_all['ok'])


'''
2. Compute the probability distribution of each word within each class of text, and save the results
'''
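
A minimal sketch of that second step, assuming the fdist_all built above: FreqDist.freq(w) returns the relative frequency count(w) / N, where N is the total token count. The output path here is only illustrative.

prob_all = {word: fdist_all.freq(word) for word in fdist_all}
with open("../data/wordFrequency/word_all_prob.json", "w", encoding="utf-8") as f:  # hypothetical path
    json.dump(prob_all, f)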