Example #1
    def __init__(self, words, sentences, language):
        self.num_words = len(words)
        self.unique_words = len(set(words))
        self.num_sentences = len(sentences)
        self.average_sentence_length = round(self.num_words / self.num_sentences)
        self.lexical_diversity = round(self.num_words / self.unique_words)

        fdist = FreqDist(words)
        stop_words = stopwords.words(language)
        not_stopwords = [w for w in words if w not in stop_words]
        fdist2 = FreqDist(not_stopwords)
        self.fifty_first_words = fdist.most_common(50)
        self.hundreds_nsw = fdist2.most_common(300)

        bigram_measures = BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(words)
        finder.apply_freq_filter(10)
        self.fifty_collocations = finder.nbest(bigram_measures.pmi, 50)

        trigram_measures = TrigramAssocMeasures()
        finder3 = TrigramCollocationFinder.from_words(words)
        finder3.apply_freq_filter(10)
        self.fifty_collocations3 = finder3.nbest(trigram_measures.pmi, 50)

        self.stcs_width_words = [' '.join(sent) for sent in sentences
                                 if "malheureusement" in sent.lower()]
def freqSingle(list):
    global nouns
    global adjectives
    
    #fig, axs = plt.subplots(1,2)

    varPOS = [nltk.pos_tag(list)] 
    
    ##################################################
    #ALL LANGUAGE - FREQUENCY
    varAll = FreqDist(list)
    
    ##USE THIS TO PRINT THE TOP WORDS
    print "TOP TERMS"
    varAll_common = varAll.most_common(25)
    print varAll_common
    print ""
    
    #PLOT TOP TERMS
    #varAll.plot(25, cumulative=False, title='All Language')
    #plt.show()
    
    ##################################################
    #NOUNS - FREQUENCY
    nouns = []
    for word,pos in varPOS[0]:
        if pos in ['NN', 'NNP']:
            nouns.append(word)
    varNouns = FreqDist(nouns)

    ##USE THIS TO PRINT THE TOP NOUNS
    print "TOP NOUNS"
    varNouns_common = varNouns.most_common(25)
    print varNouns_common
    print ""
    
    #PLOT TOP NOUNS
    #varNouns.plot(25, cumulative=False, title='Nouns')
    #plt.show()
    
    ##################################################
    #ADJECTIVES - FREQUENCY
    adjectives = []
    for word,pos in varPOS[0]:
        if pos in ['JJ', 'JJR', 'JJS']:
            adjectives.append(word)
    varAdjectives = FreqDist(adjectives)
    
    
    ##USE THIS TO PRINT THE TOP ADJECTIVES
    print "TOP ADJECTIVES"
    varAdjectives_common = varAdjectives.most_common(25)
    print varAdjectives_common
    print ""
Example #3
 def _count(self, words):
     """
     >>> wordCounter()._count(['plain', 'word1', 'word2', 'word2', 'word3', 'word3', 'word3'])
     [('word3', 3), ('word2', 2), ('plain', 1), ('word1', 1)]
     >>> wordCounter()._count([])
     []
     >>> wordCounter(words_per_message=-1)._count(['plain', 'word1', 'word2', 'word2', 'word3', 'word3', 'word3'])
     [('word3', 3), ('word2', 2), ('plain', 1), ('word1', 1)]
     """
     fdist1 = FreqDist(words)
     if (self.words_per_message > 0):
         return fdist1.most_common(self.words_per_message)
     else:
         return fdist1.most_common()
Example #4
def BigramAll():
    to_save_folder = "./#Bigram[.]/"
    folder_list = os.listdir("./");
    for folder in folder_list:
        if folder.find(".") != -1 :
            continue;
        folder_name = "./" + folder + "/"
        data_path = folder_name+"data.doc";
        fw = open(data_path,"r",encoding="utf8");
        text = fw.read();
        words = word_tokenize(text);

        big = list(bigrams(w for w in words if len(w) > 1 and w != "``"));
        myBig = []
        for bi in big:
            myBig.append(bi[0]+" "+bi[1]);

        fdist = FreqDist(str(w) for w in myBig);

        keys = fdist.most_common(len(fdist.keys()))
        dataFreq = "";
        for key in keys:
            dataFreq+= str(key[0]).strip()+","+str(key[1]).strip()+"\n";

        make_sure_path_exists(to_save_folder+folder)
        writer = open(to_save_folder+folder+"/"+folder+"[bigram_Freq].csv","w+",encoding="utf8");
        writer.write(dataFreq);
        fw.close();
        writer.close();
def sentanceLenFrequency():
    to_save_folder = "./#SentanceLenFreq[.]/"
    folder_list = os.listdir("./");
    for folder in folder_list:
        if folder.find(".") != -1:
            continue;
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc";
        fw = open(data_path, "r", encoding="utf8");
        text = fw.read();
        words = word_tokenize(text);
        sents = getSentancesTokens(text);
        freq=[]
        for sent in sents:
            sent_len = getSentanceLen(sent);
            if sent_len==147:
                print(sent);
            if sent_len>0:
                freq.append(sent_len)

        fdist = FreqDist(freq);
        keys = fdist.most_common();
        dataFreq = "Sentance Len,Freqency\n"
        for key in sorted(keys):
            dataFreq+= str(key[0])+","+str(key[1]) +"\n";
        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder +"[data]"+ "[SentanceLen_Freq].csv", "w+", encoding="utf8");
        writer.write(dataFreq);
        fw.close();
        writer.close();
Example #6
def ngram4All():
    to_save_folder = "./#Ngram_4[.]/"
    folder_list = os.listdir("./");
    for folder in folder_list:
        if folder.find(".") != -1:
            continue;
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc";
        fw = open(data_path, "r", encoding="utf8")
        text = fw.read();
        words = word_tokenize(text);
        valid_word = [w for w in words if len(w) > 1 and w != "``"]
        nlist4 = []
        vlen = len(valid_word);
        for i in range(0,vlen-3):
            nlist4.append(valid_word[i]+" "+valid_word[i+1]+" "+valid_word[i+2] + " " +valid_word[i+3])

        fdist = FreqDist(w for w in nlist4)
        keys = fdist.most_common(len(fdist.keys()))
        dataFreq = ""
        for key in keys:
            dataFreq += str(key[0])+ "," + str(key[1]) + "\n"
        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[Ngram_4_Freq].csv", "w+", encoding="utf8")
        writer.write(dataFreq)
        fw.close()
        writer.close()
Example #7
def get_frequency(data_file,all_vocab):
    
    input_file = open(data_file, "r")
    input_file_contents = input_file.read()

    words = nltk.tokenize.word_tokenize(input_file_contents, 'english')
    fdist = FreqDist(words)
    print(fdist)

    output_file = open("../Training/vocab_freq.txt", "w")
    

    for word, frequency in fdist.most_common(4000):        
        if word in all_vocab  and word!='+' and word!='-':
            output_file.write(word + " : " + str(frequency) + "\n")        
            
    output_file.close()
    return 1

#data = "data.txt"
#stop_words = "stopwords.txt"

#accuracy= multinomial_naive_bayes_unigram(data, data, stop_words)
#print(accuracy)
#print("Separating Done!!")
def wordLenFrequency():
    to_save_folder = "./#WordLenFreq[.]/"
    folder_list = os.listdir("./");
    for folder in folder_list:
        if folder.find(".") != -1:
            continue;
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc";
        fw = open(data_path, "r", encoding="utf8");
        text = fw.read();
        words = word_tokenize(text);

        freq=[]
        for word in words:
            word_len = getWordLen(word);
            if word_len==20:
                for char in word:
                    print(char,end=' ');
                print(word);
            if word_len>0:
                freq.append(word_len)

        fdist = FreqDist(freq);
        keys = fdist.most_common();
        dataFreq = "Word Len,Freqency\n"
        for key in sorted(keys):
            dataFreq+= str(key[0])+","+str(key[1]) +"\n";
        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder+ "[WordLen_Freq].csv", "w+", encoding="utf8");
        writer.write(dataFreq);
        fw.close();
        writer.close();
def most_frequent_words(path,top):
    root_path = "./"+path;
    writers = os.listdir(root_path);
    word_set = set();
    for writer in writers:
        if writer.find(".") != -1:
            continue;
        inside_folder = root_path + "//" +writer;
        files = os.listdir(inside_folder);
        formated_text = "";
        for file in files:
            file_path = root_path + "//" +writer+"//"+ file;
            fw = open(file_path,"r",encoding="utf8");
            article = fw.read();
            #print(article);
            formated_text+=" ";
            formated_text += formatText(article);
            fw.close();

        words = get_bigrams(formated_text);
        fdist = FreqDist(w for w in words if
                         len(w) > 1 and isEnglish(w) == False and w != "``");
        keys = fdist.most_common(top);
        for key in keys:
            #print(str(key[0]) + " , " + str(key[1]) + "\n");
            word_set.add(key[0]);
    print(word_set);
    fw = open("./Features/Bigrams.csv","w",encoding="utf8");
    for word in word_set:
        fw.write(word);
        fw.write("\n");
    fw.close();
def label_clusters(business_id,K,clusters):
    '''
        Label the clusters of a particular run specified by business_id 
        as the most common noun in that cluster
    '''
    base='../Models/%s/Clusters/'%business_id
    sentence_count=FreqDist(clusters)
    total_sentences=len(clusters)
    labels=[]
    for i in range(0,K):
        f=open(base+'Cluster_%d'%i,'r',encoding='utf-8')
        text=f.read()
        f.close()
        tokens=nltk.word_tokenize(text)
        tokens = [w for w in tokens if w.isalpha() and len(w) > 3 and w not in stopwords.words()]
        fd=FreqDist(tokens)
        frequent=fd.most_common(5)
        label="None"
        label_freq=0
        for f in frequent:
            if is_noun(f[0]):
                label,label_freq=f
                break
        
        relative_score=float(label_freq)/len(tokens)
        cluster_score=float(sentence_count[i])/total_sentences
        print "test label:",i,label
        labels.append((i,label,label_freq,len(tokens),sentence_count[i],total_sentences,relative_score*cluster_score))
    return labels
Example #11
def word_tag_model(words, tagged_words, limit=200):
    fd = FreqDist(words)
    cfd = ConditionalFreqDist(tagged_words)

    most_freq = (word for word, count in fd.most_common(limit))

    return dict((word, cfd[word].max()) for word in most_freq)
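A rough usage sketch for word_tag_model, assuming NLTK and its treebank sample corpus are installed; the corpus choice is only illustrative:

from nltk.corpus import treebank
from nltk.probability import FreqDist, ConditionalFreqDist

# Build a baseline word -> most-likely-tag lookup from the treebank sample.
words = treebank.words()
tagged_words = treebank.tagged_words()
model = word_tag_model(words, tagged_words, limit=50)
print(model.get('the'))   # expected to print a determiner tag such as 'DT'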
Example #12
def trigramAll():
    to_save_folder = "./#Trigram[.]/"
    folder_list = os.listdir("./");
    for folder in folder_list:
        if folder.find(".") != -1:
            continue;
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc";
        fw = open(data_path, "r", encoding="utf8");
        text = fw.read();
        words = word_tokenize(text);
        valid_word = [w for w in words if len(w) > 1 and w != "``"];
        tri_list = [];
        vlen = len(valid_word);
        for i in range(0,vlen-2):
            tri_list.append(valid_word[i]+" "+valid_word[i+1]+" "+valid_word[i+2]);

        fdist = FreqDist(w for w in tri_list);

        keys = fdist.most_common(len(fdist.keys()))
        dataFreq = "";
        for key in keys:
            dataFreq += str(key[0]).strip()+ "," + str(key[1]).strip() + "\n";

        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[Trigram_Freq].csv", "w+", encoding="utf8");
        writer.write(dataFreq);
        fw.close();
        writer.close();
Example #13
def experiments():
    f = open('classEvent_NEs.txt','r')
    # Displaying top occurring K NEs
    text= nltk.Text(f.read().split('\n'))
    freqd = FreqDist(text)
    most_common = freqd.most_common(15)
    # Sorting according to Type of entity
    for el in most_common:
        print(el)
Example #14
def most_common_bigrams(all_words, num_bigrams):
    bigram_finder = BigramCollocationFinder.from_words(all_words)
    bigram_freq = dict(bigram_finder.ngram_fd.items())
    for k, v in list(bigram_freq.items()):
        if not is_feature_relevant(k[0]) or not is_feature_relevant(k[1]):
            del bigram_freq[k]

    fd = FreqDist(bigram_freq)
    return dict(fd.most_common(num_bigrams)).keys()
def one_by_four(path,top=50):
    root_path = "./"+path;
    writers = os.listdir(root_path);
    word_set = set();
    temp_set = set();
    writer_table = dict();
    for writer in writers:
        if writer.find(".") != -1:
            continue;
        inside_folder = root_path + "//" +writer;
        files = os.listdir(inside_folder);
        formated_text = "";
        for file in files:
            file_path = root_path + "//" +writer+"//"+ file;
            fw = open(file_path,"r",encoding="utf8");
            article = fw.read();
            #print(article);
            formated_text+=" ";
            formated_text += formatText(article);
            fw.close();

        words = getWordList(formated_text);
        fdist = FreqDist(w for w in words if
                         len(w) > 1 and isEnglish(w) == False and w != "``");
        keys = fdist.most_common(top);
        print(keys);
        writer_table[writer] = dict(keys);
        for key in keys:
            temp_set.add(key[0]);

    writers = writer_table.keys();
    for word in temp_set:
        for writer1 in writers:
            freq1 = writer_table[writer1].get(word,0);
            for writer2 in writers:
                if writer2 == writer1:
                    continue;
                freq2 = writer_table[writer2].get(word,0);
                if freq1  >= freq2*4:
                    print(writer1+" "+writer2+" "+str(freq1)+" " + str(freq2)+ " "+word);
                    word_set.add(word);





    print(word_set);
    fw = open("./Features/Modified word frequency.csv","w",encoding="utf8");
    for word in word_set:
        fw.write(word);
        fw.write("\n");
    fw.close();
def lemmatised_words(debate):	
	
	lemmatised_words = debate

	fdist = FreqDist(lemmatised_words)

	list_mc = fdist.most_common(20)

	most_common_lemmatised_list = []
	
	for list_tuple in list_mc:
		if list_tuple[1] > 5:           
			most_common_lemmatised_list.append(list_tuple[0].encode('utf-8'))

	return most_common_lemmatised_list
Example #18
def most_common_single_features(all_words, num_words):
    """
    Gets the :param num_words: most frequent words
    from the given :param all_words: list.
    Does not include stop words.
    :param all_words: the list of words/tokens to count.
    :param num_words: the number of most frequent words to return.
    :return: list of most common features
    """
    words_in_x = []
    for word in all_words:
        if is_feature_relevant(word):
            words_in_x.append(word)
    fd = FreqDist(words_in_x)
    return dict(fd.most_common(num_words)).keys()
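A quick usage sketch; is_feature_relevant comes from elsewhere in the source project, so a simple stand-in filter is assumed here purely for illustration:

from nltk.probability import FreqDist

def is_feature_relevant(word):
    # Hypothetical stand-in for the project's real relevance filter.
    return word.isalpha() and len(word) > 2

tokens = ['the', 'cat', 'sat', 'on', 'the', 'mat', 'cat', '!!', 'cat']
print(most_common_single_features(tokens, 2))   # e.g. dict_keys(['cat', 'the'])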
Example #19
def get_most_frequent_words(indirectory, outfile, count, **args):

	print (indirectory, outfile, count)

	corpus=PlaintextCorpusReader(indirectory,'.*\.txt')
	tokens=corpus.words()
	fdist=FreqDist(tokens)	
	most_freq=fdist.most_common(count)

	with open(outfile,'wb') as of:
		of.write(bytes('<s>\n','UTF-8'))
		of.write(bytes('</s>\n','UTF-8'))
		for word,frequency in most_freq:
			if word.isalpha():
				of.write(bytes(word+"\n",'UTF-8'))
Example #20
def tagsTrend(json_data, number):
    taglist = []
    for item in json_data:
        if item != "statistics":
            for t in json_data[item]:
                for tag in t['tags']:
                    taglist.append(tag)

    #TODO : USE similarity.findTags instead
    fdist = FreqDist(taglist)
    out = []
    for x in fdist.most_common(number):
        if len(x[0]) > 1:
            out.append(x[0])

    return out
Example #21
 def animal_frequency(self, input_list, raw_text):
     animals = input_list
     animals = [word.lower() for word in animals]
     #http://www.nltk.org/howto/stem.html
     stemmer = nltk.PorterStemmer()
     raw_text = raw_text
     singles = [stemmer.stem(word) for word in raw_text]
     animal_text = []
     for word in singles:
         if word in animals:
             animal_text.append(word)
         else:
             continue
     fdist1 = FreqDist(animal_text)
     top_50 = fdist1.most_common(50)
     return top_50
Example #22
def get_words(tweets):
    cleaned_listings = []
    words1 = []
    for tweet in tweets:
        for i, listing in enumerate(tweets):
            tokens = word_tokenize(listing)
            lowercase_tokens = [w.lower() for w in tokens]
            no_punctuation = [x.translate(table) for x in lowercase_tokens]
            alphabetic = [word for word in no_punctuation if word.isalpha()]
            words = [w for w in alphabetic if not w in stop_words]
            cleaned_listings.extend(words)
            fdist = FreqDist(cleaned_listings)
        for word, frequency in fdist.most_common(5):
            result = ('{} - {}'.format(word, frequency))
            words1.append(result)
        return (words1)
    def get_label_preferences(self, cluster_key):
        def exists(label_tree, key):
            return key in label_tree.nodes

        n_grams = self.cluster_processed_df[self.cluster_processed_df.labels_tree.apply(lambda tree: exists(tree, cluster_key))].n_grams_emb

        if len(n_grams) == 0:
            logger.info('Error: ' + cluster_key + ' does not exist in label graph.')
            return

        label_name_distr = FreqDist()
        for n_gram in n_grams:
            label = self._apply_positions(cluster_key, n_gram)
            label_name_distr[label] += 1

        return label_name_distr.most_common()
def stemmed_snowball(debate):

	snowball = SnowballStemmer("english")

	stemmed_words_snowball = [snowball.stem(w) for w in debate]

	fdist2 = FreqDist(stemmed_words_snowball)
	list_mc2 = fdist2.most_common(20)             #adjust this too!!!

	most_common_snowball = []
	
	for list_tuple in list_mc2:
		if list_tuple[1] > 5:
			most_common_snowball.append(list_tuple[0].encode('utf-8'))

	return most_common_snowball
Example #25
 def train(self, bookset):
     
     self.agg = AgglomerativeClustering(n_clusters = len(bookset))
     bookX = []
     
     for b in bookset:
         databook = ngrams(b, self.gramn)
         fdist = FreqDist(databook)
         common = fdist.most_common(100)
         
         inputlist = []
         for c in common:
             inputlist.append(c[0])
             inputlist.append(c[1])
         bookX.append(inputlist)
     self.agg.fit(bookX)
Example #26
    def metrics(self):
        fdist = FreqDist(self.token_clean)
        top_word = fdist.most_common(10)
        idx = text.ContextIndex(self.token)
        list_similarity = []
        for word in self.token:
            list_similarity.append(idx.similar_words(word))

        print(
            '\nMetricas\n' + '# Palabras Originales: ' +
            str(len(self.newcorpus)) + '\n' + '# Palabras Limpias: ' +
            str(len(self.newcorpus_clean_token)) + '\n' + 'Dif Palabras: ' +
            str(len(self.token) - len(self.token_clean)) + '\n' +
            'Concordance: ' + str(Text(self.token).concordance('sancho')) +
            '\n' + 'Similarity:', list_similarity, '\nSet de palabras: ',
            set(self.token), '\nTop 10 Freq Plabras: ', top_word)
Example #27
    def unigram_word_feats(self, words, top_n=None, min_freq=0):
        """
        Return most common top_n word features.

        :param words: a list of words/tokens.
        :param top_n: number of best words/tokens to use, sorted by frequency.
        :rtype: list(str)
        :return: A list of `top_n` words/tokens (with no duplicates) sorted by
            frequency.
        """
        # Stopwords are not removed
        unigram_feats_freqs = FreqDist(word for word in words)
        return [
            w for w, f in unigram_feats_freqs.most_common(top_n)
            if unigram_feats_freqs[w] > min_freq
        ]
Example #28
def create_vectorizers(data_dict):
    topic_list = list(data_dict.keys())
    vectorizer_dict = {}
    for topic in topic_list:
        text_array = data_dict[topic]
        text = " ".join(text_array)
        word_list = tokenize_nltk(text)
        word_list = [word for word in word_list if word not in stopwords]
        freq_dist = FreqDist(word_list)
        top_200 = freq_dist.most_common(200)
        vocab = [
            wtuple[0] for wtuple in top_200 if wtuple[0] not in stopwords
            and wtuple[0] not in string.punctuation
        ]
        vectorizer_dict[topic] = CountVectorizer(vocabulary=vocab)
    return vectorizer_dict
Example #29
def extract_most_frequent_words(fileids, num_most_frequent):
    ''' Function to extract the most frequent words from a corpus.
        Args:
              fileids: file ids for the documents in the reuters corpus.
              num_most_frequent: Number of most frequent words the user
                                 wish to compute.
    '''
    fdist = FreqDist()
    for fileid in fileids:
        for word in reuters.words(fileid):
            fdist[word.lower()] += 1

    most_frequent_words = [k for k, v in fdist.most_common(num_most_frequent)
                           if k.isalpha() and len(k) > 2 
                           and k not in MOST_COMMON_WORDS_ENGLISH]
    return most_frequent_words
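A short usage sketch, assuming the NLTK reuters corpus is downloaded; MOST_COMMON_WORDS_ENGLISH is defined elsewhere in the source project, so an empty stand-in set is used here:

from nltk.corpus import reuters
from nltk.probability import FreqDist

MOST_COMMON_WORDS_ENGLISH = set()          # stand-in for the project's real word list
fileids = reuters.fileids()[:10]           # a handful of documents is enough for a demo
print(extract_most_frequent_words(fileids, 20))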
Example #30
def get_stop_words_1(data, num_stop_words):
    total_words = []
    for d in data:
       total_words.extend(d["ques"])
       total_words.extend(d["answer1"])
       for d_i in d["summary"]:
           total_words.extend(d_i)
    fdist = FreqDist(total_words)
    stop_words = fdist.most_common(num_stop_words)
    stop_words = [t[0] for t in stop_words]
    pronoun_list = ["he", "she", "him", "her", "his", "them", "their", "they"] 
    filtered_stop_words = []
    for p in stop_words:
       if p not in pronoun_list:
           filtered_stop_words.append(p)
    return filtered_stop_words 
Example #31
def correct_spell_word(incorrect_word, dict_byword):
    main_word = incorrect_word
    size = len(main_word)
    all_word_condid = []
    for i in range(size - 1):
        bigram = main_word[0] + main_word[1]
        if dict_byword.__contains__(bigram):
            temp = dict_byword.get(bigram)
            for x in temp:
                all_word_condid.append(x)
        main_word = main_word[1:]

    fdist = FreqDist(all_word_condid)
    most_common = dict(fdist.most_common(30))

    contain_jacard = {}
    for x in most_common:
        number_bigram = len(x) - 1
        jacard = most_common.get(x) / (number_bigram + size -
                                       most_common.get(x))
        contain_jacard.update({x: jacard})

    sort_jacard = {
        k: v
        for k, v in sorted(
            contain_jacard.items(), key=lambda item: item[1], reverse=True)
    }

    # print("jaccard : ",incorrect_word,sort_jacard)

    min1 = math.inf
    editDistanceDict = {}
    correct_word = ""
    for x in sort_jacard:
        ED = editDistance(x, incorrect_word, len(x), len(incorrect_word))
        editDistanceDict.update({x: ED})

    editDistanceDict_sort = {
        k: v
        for k, v in sorted(editDistanceDict.items(), key=lambda item: item[1])
    }
    minval = min(editDistanceDict_sort.values())
    correct_words = [
        k for k, v in editDistanceDict_sort.items() if v == minval
    ]

    return correct_words
Example #32
def sentiment_ana_exwords():
	sen=''
	sid = SentimentIntensityAnalyzer()
	df=pd.read_csv("pkm-19-clean.csv")
	for i in range(20,27):
		df=df.append(pd.read_csv(f'pkm-{str(i)}-clean.csv'),ignore_index=True)
	for i in df['text']:
		ss=sid.polarity_scores(i)
		if ss['compound']>-0.05 and ss['compound']<0.05:
			sen+=i
	sen=sen.lower()
	toker=RegexpTokenizer(r'\w+')
	words=toker.tokenize(sen)
	stop_words = set(stopwords.words('english'))
	filtered_sentence = [w for w in words if not w in stop_words]
	fdist=FreqDist(filtered_sentence)
	print(fdist.most_common(50))
Example #33
File: termFreq.py  Project: nbeirne/IFS
def termFreq(text, options):
    # Stop Words
    swords = set(stopwords.words(options['language']))

    table = str.maketrans(dict.fromkeys(string.punctuation))
    cleanText = text.translate(table)

    # Change to lower case and tokenize, exclude punctuation and stopwords
    words = [
        i.lower() for i in wordpunct_tokenize(cleanText)
        if i.lower() not in swords
    ]

    fdist = FreqDist(words)
    result = [list(i) for i in fdist.most_common(options['termLimit'])]

    return result
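A minimal usage sketch for termFreq, assuming the NLTK stopwords corpus is available; the options keys match the ones read inside the function:

import string
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import wordpunct_tokenize

options = {'language': 'english', 'termLimit': 5}
sample = "The quick brown fox jumps over the lazy dog. The dog barks."
print(termFreq(sample, options))   # e.g. [['dog', 2], ['quick', 1], ...]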
Example #34
def evaluate(row):
    record, stop_words = row
    text = nltk.word_tokenize(record['text'])

    #stemming
    stemmer = SnowballStemmer("english")
    for indx, word in enumerate(text):
        word = stemmer.stem(word)
        text[indx] = word

    fdist = FreqDist(text)
    unfiltered_frequencies = fdist.most_common(fdist.B())
    frequencies = [
        t for t in unfiltered_frequencies
        if t[0] not in stop_words and t[1] > 5 and len(t[0]) > 2
    ]
    return frequencies
Example #35
    def summarize(self):
        tokens = tokenize(self.text)

        entities = self.ner.extract_entities(tokens)

        entities_text = list()

        for e in entities:
            range = e[0]
            tag = e[1]
            score = e[2]
            score_text = "{:0.3f}".format(score)
            entity_text = " ".join(tokens[i].decode() for i in range)
            # print("   Score: " + score_text + ": " + tag + ": " + entity_text)
            entities_text.append(entity_text)
        frequency_distribution = FreqDist(entities_text)
        return frequency_distribution.most_common(5)
Example #36
def my_unigram(words):
    freq = FreqDist(words)
    freqDistr = freq.most_common(50)
    for i in range(len(freqDistr)):
        freqDistr[i] = list(freqDistr[i])
        freqDistr[i][1] = freqDistr[i][1] + 1

    rowSum = 0
    for i in range(len(freqDistr)):
        rowSum = rowSum + freqDistr[i][1]

    uni = []
    for i in range(len(freqDistr)):
        uni.append(
            [freqDistr[i][0], (-1 * numpy.log(freqDistr[i][1] / rowSum))])
    # print(uni)
    return uni
Example #37
def plot_iz():
    pd_data = pd.read_csv(DATA_PATH + 'iz.csv')
    tokens = pd_data['text'].apply(str.lower).apply(
        nltk.tokenize.word_tokenize)
    all_tokens = [token for text in tokens for token in text]
    fd = FreqDist(all_tokens)
    data = {x[0]: x[1] for x in fd.most_common(10)}
    plt.xticks(rotation='vertical')
    print(list(data.keys()))
    print(list(data.values()))
    plt.bar(list(data.keys()),
            list(data.values()),
            1,
            edgecolor='w',
            color='g')
    plt.savefig('static/plot.png')
    return render_template('iz.html')
def AccessTwitter(search_string):

    key = configurations.consumer_key
    secret = configurations.consumer_secret
    access_token = configurations.access_token
    access_secret = configurations.access_secret

    auth = tweepy.OAuthHandler(consumer_key=key, consumer_secret=secret)
    auth.set_access_token(access_token, access_secret)

    api = tweepy.API(auth)
    tweets = []

    for tweet in api.search(q=search_string, count=100, lang="en"):
        tweets.append(tweet)
    data = pd.DataFrame(data=[(tweet.text) for tweet in tweets],
                        columns=['Tweets'])
    data['clean_tweets'] = data['Tweets'].apply(lambda x: cleanTweet(x))
    data['subjectivity'] = data['clean_tweets'].apply(getsubjectivity)
    data['polarity'] = data['clean_tweets'].apply(getpolarity)
    data['analysis'] = data['polarity'].apply(getanalysis)
    all_words = ' '.join(tweet for tweet in data['clean_tweets'])
    allwords = word_tokenize(all_words)
    stop_words = set(stopwords.words("english"))
    filtered_sent = []
    for w in allwords:
        if w not in stop_words:
            filtered_sent.append(w)
    fdist = FreqDist(filtered_sent)
    fd = pd.DataFrame(fdist.most_common(10),
                      columns=["Word", "Frequency"]).drop([0]).reindex()

    # sentiment bar
    sentiment_bar = data['analysis']
    # sentiment pie
    sentiment_pie = data['analysis']
    # word cloud
    word_cloud = WordCloud(width=800,
                           height=400,
                           random_state=1,
                           max_font_size=120).generate(all_words)
    # frequently used words
    n = fd['Word']
    s = fd['Frequency']

    return (sentiment_bar, sentiment_pie, word_cloud, n, s)
Example #39
File: tw2.py  Project: candybains/Tweets
def get_trends(tag):
	print(tag)
	tkns_CE=''
	with open('tweets.txt','r', encoding='UTF8') as file:
		for line in file:
			pat = re.compile(tag)
			if pat.search(line.lower()) != None:
				#print(line)
				seg = re.sub(r'[^\x00-\x7f]+',r' ',line)
				seg = re.sub(r'[^a-zA-Z0-9_\s]+', '', seg)
				#print(seg)
				#tokens = word_tokenize(seg)
				#stopw = [i for i in tokens if not i in stop_words]
				result = TextBlob(seg.lower())
				#print(result.tags)
				for word, pos in result.tags:
					if pos == 'JJ':
						if (sid.polarity_scores(word)['compound']) >= 0.5:
						#print(word)
							tkns_CE += ' ' + word

	tkns_CE = tkns_CE.split()
	#print(tkns)
	for i in range(len(tkns_CE)):
	#print(tkns_CE[i])
		for syns in wordnet.synsets(tkns_CE[i]):
			if syns:
				lemmas = syns.lemmas()
				for l in lemmas:
				#syns = wordnet.synsets(tkns_CE1[i], pos = 'a')
				#print(tkns_CE1[i])
					#if syns:
					for j in range(i,len(tkns_CE)):
						#print(tkns_CE1[j])
						syns1 = wordnet.synsets(tkns_CE[j])
						syns2 = wordnet.synsets(l.name())
						if syns1:
							sim = syns2[0].wup_similarity(syns1[0])
							if sim:
								if sim > 0.4:
									#print(tkns_CE[j],tkns_CE[i])
									tkns_CE[j] = tkns_CE[i]
	fdist = FreqDist(tkns_CE)
	#print(fdist)
	print(fdist.most_common(10))
Example #40
def frequencyDis(str1):

    words = word_tokenize(str1)

    words_no_punc = []
    for w in words:
        if w.isalpha():
            words_no_punc.append(w.lower())

    notwords = stopwords.words("english")

    clean_words = []
    for w in words_no_punc:
        if w not in notwords:
            clean_words.append(w)

    freqDist3 = FreqDist(clean_words)
    return freqDist3.most_common(10)
Example #41
def assignment3():
    drop_words = []

    word_list = nltk.corpus.gutenberg.words('melville-moby_dick.txt')
    #print(word_list)
    fd = FreqDist(word_list)

    most_common = sorted(fd.most_common(100))
    # the need for this extra step is because one would like to have dropwords that have a high frequency. If you have a dropword that only occurs once, then it is not worth it to drop it
    # Without this however you could simply create the dropwords from word_list instead of creating an fd of the most_common
    for word in most_common:
        if len(word[0]) <= 3:
            drop_words.append(word[0])
    clean = [word for word in word_list if word not in drop_words]

    new_fd = FreqDist(clean)

    return new_fd
Example #42
def build_vocabulary(data_file):
	
	input_file = open(data_file, "r")
	input_file_contents = input_file.read()

	words = nltk.tokenize.word_tokenize(input_file_contents, 'english')
	fdist = FreqDist(words)
	print(fdist)
	# print(fdist.most_common(2000))

	output_file = open("../Vocabulary/vocabulary.txt", "w")

	for word, frequency in fdist.most_common(4000):		
		if frequency >= 2 and word!='+' and word!='-':
			output_file.write(word + "\n")
			
	output_file.close()
	return 1
Example #43
def filter_sentences(sentences, fraction_words_to_use=1):
    stops = set(stopwords.words("english")).union(string.punctuation)
    stemmer = PorterStemmer()

    for i in range(len(sentences)):
        sentences[i] = word_tokenize(sentences[i])
        sentences[i] = [stemmer.stem(word) for word in sentences[i] if word not in stops]

    words = list(itertools.chain.from_iterable(sentences))
    freq_dist = FreqDist(words)
    target_num = int(freq_dist.B() * fraction_words_to_use)
    targets = freq_dist.most_common(target_num)
    targets = [target[0] for target in targets]

    for i in range(len(sentences)):
        sentences[i] = [word for word in sentences[i] if word in targets]

    return sentences
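A brief usage sketch, assuming the NLTK punkt and stopwords data are installed; note that the function tokenizes and stems the sentences in place:

import itertools
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist

sents = ["the cats are sleeping on the mat.", "a cat sleeps near the mat."]
print(filter_sentences(sents, fraction_words_to_use=0.5))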
Example #44
def CalculateBestWords(corpus):
    # Create frequency distributions for later
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    # For each document in the corpus
    for document in corpus:
        # Split out of the words from the label
        words = document[0]
        label = document[1]
        # For each word in the document
        for word in words:
            # Split off the word and frequency
            token, frequency = word.split(":")
            # Add the word to the distribution equal to the number of times it
            # occurs in the document
            for i in range(int(frequency)):
                word_fd[token.lower()] += 1
                label_word_fd[label][token.lower()] += 1

    # Figures out the number of words that apply to each label
    pos_word_count = label_word_fd['positive'].N()
    neg_word_count = label_word_fd['negative'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}

    # This computes the probability that a word is in a given class, for each class
    for word, freq in word_fd.most_common(word_fd.N()):
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['positive'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['negative'][word],
                                               (freq, neg_word_count),
                                               total_word_count)

        word_scores[word] = pos_score + neg_score

    # This sorts the list of words by their score and retrieves the 5000 best words
    best = sorted(word_scores.items(),
                  key=operator.itemgetter(1),
                  reverse=True)[:5000]
    best_words = set([w for w, s in best])
    return best_words
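A toy usage sketch showing the expected corpus shape (each document is a list of "token:count" strings paired with a 'positive' or 'negative' label); the data here is made up for illustration only:

import operator
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

corpus = [
    (["good:2", "great:1", "plot:1"], "positive"),
    (["bad:3", "awful:1", "plot:1"], "negative"),
]
print(CalculateBestWords(corpus))   # a set of the highest-scoring words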
Example #45
File: app.py  Project: mrticia/firstrepo
def home():
    if (request.method == 'POST'):
        textName = request.form['textName']
        text = request.form['text']
        devideSentences = request.form.get('devideSentences')
        freqDist = request.form.get('freqDist')
        number = 0
        if freqDist != None:
            number = request.form['number']

        sentences = sent_tokenize(text)
        clean_sentences = []

        if (devideSentences == "on"):

            for num, sent in enumerate(sentences):
                clear_sentence = tokenizer.tokenize(sent)
                clear_sentence_str = ' '.join(
                    [str(elem) for elem in clear_sentence])
                clean_sentences.append(clear_sentence_str)

        freq_dist = ''
        if (freqDist == "on"):
            clean = []
            filtered_words = []
            for num, sent in enumerate(sentences):
                clear_sentence = tokenizer.tokenize(sent)
                clean.append(clear_sentence)

            for sent in clean:
                for word in sent:
                    words = lem.lemmatize(word, "v")
                    if words not in stopWords:
                        filtered_words.append(word)
            dist = FreqDist(filtered_words)
            freq_dist = dist.most_common(int(number))

        return render_template('analyse.html',
                               textName=textName,
                               text=text,
                               clean_sentences=clean_sentences,
                               freq_dist=freq_dist)

    return render_template('home.html')
Example #46
def mostFrequentWords(filePath):
    import re
    import json
    import nltk
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    
    clean(filePath)
    stop_words = set(stopwords.words('english'))
    stop_words_new = ['haha', 'https', 'hahaha', 'u', 'btw','dr', 'pm', 'am', 'like','lol','one','na','yeah', "a","a's","able","about","above","according","accordingly","across","actually","after","afterwards","again","against","ain't","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","aside","ask","asking","associated","at","available","away","awfully","b","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by","c","c'mon","c's","came","can","can't","cannot","cant","cause","causes","certain","certainly","changes","clearly","co","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","currently","d","definitely","described","despite","did","didn't","different","do","does","doesn't","doing","don't","done","down","downwards","during","e","each","edu","eg","eight","either","else","elsewhere","enough","entirely","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","f","far","few","fifth","first","five","followed","following","follows","for","former","formerly","forth","four","from","further","furthermore","g","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't","have","haven't","having","he","he's","hello","help","hence","her","here","here's","hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither","hopefully","how","howbeit","however","i","i'd","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate","indicated","indicates","inner","insofar","instead","into","inward","is","isn't","it","it'd","it'll","it's","its","itself","j","just","k","keep","keeps","kept","know","knows","known","l","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","little","look","looking","looks","ltd","m","mainly","many","may","maybe","me","mean","meanwhile","merely","might","more","moreover","most","mostly","much","must","my","myself","n","name","namely","nd","near","nearly","necessary","need","needs","neither","never","nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing","novel","now","nowhere","o","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside","over","overall","own","p","particular","particularly","per","perhaps","placed","please","plus","possible","presumably","probably","provides","q","que","quite","qv","r","rather","rd","re","really","reasonably","regarding","regardless","regards","relatively","respectively","right","s","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","she","should","shouldn't","since","six","so","some","somebody","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","sp
ecifying","still","sub","such","sup","sure","t","t's","take","taken","tell","tends","th","than","thank","thanks","thanx","that","that's","thats","the","their","theirs","them","themselves","then","thence","there","there's","thereafter","thereby","therefore","therein","theres","thereupon","these","they","they'd","they'll","they're","they've","think","third","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","to","together","too","took","toward","towards","tried","tries","truly","try","trying","twice","two","u","un","under","unfortunately","unless","unlikely","until","unto","up","upon","us","use","used","useful","uses","using","usually","uucp","v","value","various","very","via","viz","vs","w","want","wants","was","wasn't","way","we","we'd","we'll","we're","we've","welcome","well","went","were","weren't","what","what's","whatever","when","whence","whenever","where","where's","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","who's","whoever","whole","whom","whose","why","will","willing","wish","with","within","without","won't","wonder","would","would","wouldn't","x","y","yes","yet","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","z","zero","a",",",".","?","!","|",":","'",";","<NUM>","?","$","km","s","u","&","#","'s","/","dr."]
    for s in stop_words_new:
        stop_words.add(s)

    with open('new.json') as f:
        data = json.load(f)

    allText = ""

    for key in data:
        text = data[key]['text']
        text = re.sub(r'\W+', ' ', text)
        line = re.sub(r"(^|\W)\d+", "", text)
        allText = allText + " "+ text

    word_tokens = word_tokenize(allText)
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    
    filtered_sentence = []
    
    for w in word_tokens:
        w = w.lower()
        if w not in stop_words:
            filtered_sentence.append(w)
    
    # print(word_tokens)
    # print(filtered_sentence)
    from nltk.probability import FreqDist
    freq = FreqDist(filtered_sentence)
    for w in freq.most_common(10):
        print ("word: " + w[0] + " frequency: " + str(w[1]))
        print("***")

    freq.plot(10,cumulative=False)
Example #47
def _make_capped_word_index(stanford, dataset, vocab_size=20000):
    len_stanford = len(stanford.vocab)
    print("Stanford vocab original length:", len_stanford)
    print("Capped vocab (20k) fraction:", vocab_size / len_stanford)

    # Combine all tweets into one big array of words
    flatten = lambda l: [item for sublist in l for item in sublist]
    dataset_corpus = flatten(dataset["train_tweets"])

    # Count most frequent words
    fd = FreqDist(dataset_corpus)
    top20k = fd.most_common(vocab_size)

    # Make new word index (vocab) with PAD and UNK as the first two tokens (ordering is important)
    top20k_words = [PAD, UNK]
    top20k_words += [x[0] for x in top20k]
    word_index_20k = {word: idx for idx, word in enumerate(top20k_words)}

    return word_index_20k
Example #48
    def __init__(self, file):

        self.tokenized_sentences = []

        #Opening file and replacing carriage return by space
        brexit_text = file.read().replace('\n', ' ')

        #Initializing tokenizer
        tokenizer = RegexpTokenizer(r'\w+')
        #Initializing Stemmer (not supported yet)
        ps = PorterStemmer()

        #Tokenizing sentences
        tokenized_words = tokenizer.tokenize(brexit_text.lower())
        self.sentences = nltk.sent_tokenize(brexit_text.lower())

        filtered_tokenized_words = []
        filtered_tokenized_sentences = []

        #Removing stopwords from text
        for word in tokenized_words:
            if word not in stopwords.words('english'):
                filtered_tokenized_words.append(ps.stem(word))

        for b_sentence in self.sentences:
            filtered_tokenized_sentences.append([
                ps.stem(word) for word in tokenizer.tokenize(b_sentence)
                if word not in stopwords.words('english')
            ])

        # Creating the fdist dictionary
        fdist_words = FreqDist(filtered_tokenized_words)
        self.fdist_dict = dict(fdist_words.most_common(fdist_words.N()))
        for k in self.fdist_dict.keys():
            print(k + "," + str(self.fdist_dict[k]))
        i = 0
        for filtered_sentence in filtered_tokenized_sentences:
            self.tokenized_sentences.append(
                sentence(
                    sorted(filtered_sentence,
                           key=lambda x: self.fdist_dict.get(x),
                           reverse=True), i, file))
            i += 1
Example #49
    def get_word_frequency(self, size):
        file_name = self.disease_type + '-word-freq-' + str(size)
        if 'training' in file_name:
            full_training_word_freq_filename = file_name + '.csv'
            file = csv.writer(open(full_training_word_freq_filename, 'w'))
        else:
            full_test_word_freq_filename = file_name + '.csv'
            file = csv.writer(open(full_test_word_freq_filename, 'w'))

        fd = FreqDist(self.word_set)
        #print fd.most_common(200)
        #print fd.hapaxes()
        # fd.plot(50,cumulative=False)

        # Print word counts to a CSV file
        for key, count in fd.most_common(size):
            file.writerow([key.encode('utf-8'), count])  # encode

        return self.full_training_word_freq_filename, self.full_test_word_freq_filename
Example #50
def parseDataSet():
    file1 = open("text.txt", "r")  # Open file
    print("... file opened")

    data = file1.read()  # Read & assign entire dataset
    # data = file1.read(2000) # Read & assign first 2000 chars of dataset
    print("... file read")

    # --TOKENIZE--
    tokenized_word = word_tokenize(data)  # Split data into word tokens
    print("... file tokenized")

    # --REMOVE STOPWORDS--
    stopWords = set(stopwords.words('english'))
    addWords = ['I', 'Im', 'Its', 'bc', 'www', 'http', 'com',
                ',']  # Additional meaningless words
    filteredData = []

    forbiddenBar = re.compile('\\|\\|\\|')  # regex for |||

    for word in tokenized_word:  # Filter stop words from token set
        if not word in stopWords and not word in addWords:
            if not word.startswith("http") and not forbiddenBar.search(word):
                filteredData.append(word)
    print("... stopwords removed")

    # --STEMMING--
    ps = PorterStemmer()
    stemmedData = []
    for word in filteredData:
        stemmedData.append(ps.stem(word))
    print("... data stemmed")

    # TODO: --LEMMATIZATION--
    # TODO: --POS TAGGING--

    fdist = FreqDist(filteredData)  # Pass token set and return distribution
    print(fdist.most_common(50))

    file1.close()  # Close file
    print("... file closed")
    commonWords(fdist)
Example #51
File: lda.py  Project: jihwangk/NLP
class VocabBuilder:
    """
    Creates a vocabulary after scanning a corpus.
    """
    def __init__(self, lang="english", min_length=3, cut_first=100):
        """
        Set the minimum length of words and which stopword list (by language) to
        use.
        """
        self._counts = FreqDist()
        self._stop = set(stopwords.words(lang))
        self._min_length = min_length
        self._cut_first = cut_first

        print(("Using stopwords: %s ... " % " ".join(list(self._stop)[:10])))

    def scan(self, words):
        """
        Add a list of words as observed.
        """

        for ii in [x.lower() for x in words if x.lower() not in self._stop \
                       and len(x) >= self._min_length]:
            self._counts[ii] += 1

    def vocab(self, size=5000):
        """
        Return a list of the top words sorted by frequency.
        """
        sorted_list = self._counts.most_common(len(self._counts))
        count_list = []
        for i in range(size):
            if i == len(self._counts):
                break

            if len(self._counts) > self._cut_first + size:
                count_list.append(sorted_list[self._cut_first + i][0])

            else:
                count_list.append(sorted_list[i][0])

        return count_list
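A small usage sketch for VocabBuilder, assuming the NLTK stopwords corpus is downloaded; cut_first is set to 0 here so the toy vocabulary is not truncated:

from nltk.corpus import stopwords
from nltk.probability import FreqDist

builder = VocabBuilder(lang="english", min_length=3, cut_first=0)
builder.scan("the quick brown fox jumps over the lazy dog".split())
builder.scan("the fox chased the dog across the field".split())
print(builder.vocab(size=10))   # most frequent non-stopword tokens, e.g. ['fox', 'dog', ...]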
def pnl_module(df):
    content = ' '.join(df["text"])
    content = re.sub(r"http\S+", "", content)
    content = content.replace('RT ', ' ').replace('&amp;', 'and')
    content = re.sub('[^A-Za-z0-9]+', ' ', content)
    content = content.lower()

    tokenized_word = word_tokenize(content)
    #Extra  fine filter
    tokenized_word = [word for word in tokenized_word if len(word) > 3]
    stop_words = set(stopwords.words("spanish"))
    filtered_sent = []
    for w in tokenized_word:
        if w not in stop_words:
            filtered_sent.append(w)
    fdist = FreqDist(filtered_sent)
    fd = pd.DataFrame(fdist.most_common(15),
                      columns=["Word", "Frequency"]).drop([0]).reindex()

    return fd
Example #53
    def get_features(self, topk):

        # Good vocab size for just twitter data is 2800
        fdist = FreqDist()
        for line in self.input:
            for word in word_tokenize(line):
                fdist[word] += 1

        print(len(fdist))

        common = fdist.most_common(topk)
        vocab = {}
        i = 0
        for pair in common:
            vocab[pair[0]] = i
            i += 1

        vocab["UNK"] = i

        return vocab
Example #54
def unigramAGivenFolder(folder):
    to_save_folder = "./#Unigram[.]/"
    folder_name = "./" + folder + "/"
    data_path = folder_name + "data.doc";
    fw = open(data_path, "r", encoding="utf8");
    text = fw.read();
    print(len(text))
    #text = text[:10000];
    words = word_tokenize(text);
    fdist = FreqDist(w for w in words if len(w) > 1 and w != "``");

    keys = fdist.most_common(len(fdist.keys()))
    dataFreq = "";
    for key in keys:
        dataFreq += str(key[0]) + " , " + str(key[1]) + "\n";

    make_sure_path_exists(to_save_folder + folder)
    writer = open(to_save_folder + folder + "/" + folder + "[unigram].csv", "w+", encoding="utf8");
    writer.write(dataFreq);
    fw.close();
    writer.close();
Example #55
	def get_summarized(self, input, num_sentences ):
		# TODO: allow the caller to specify the tokenizer they want
		# TODO: allow the user to specify the sentence tokenizer they want

		tokenizer = RegexpTokenizer('\w+')

		# get the frequency of each word in the input
		base_words = [word.lower()
			for word in tokenizer.tokenize(input)]
		words = [word for word in base_words if word not in stopwords.words()]
		word_frequencies = FreqDist(words)

		# now create a set of the most frequent words
		most_frequent_words = [pair[0] for pair in
			word_frequencies.most_common(100)]

		# break the input up into sentences.  working_sentences is used
		# for the analysis, but actual_sentences is used in the results
		# so capitalization will be correct.

		sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
		actual_sentences = sent_detector.tokenize(input)
		working_sentences = [sentence.lower()
			for sentence in actual_sentences]

		# iterate over the most frequent words, and add the first sentence
		# that includes each word to the result.
		output_sentences = []

		for word in most_frequent_words:
			for i in range(0, len(working_sentences)):
				if (word in working_sentences[i]
				  and actual_sentences[i] not in output_sentences):
					output_sentences.append(actual_sentences[i])
					break
				if len(output_sentences) >= num_sentences: break
			if len(output_sentences) >= num_sentences: break

		# sort the output sentences back to their original order
		return self.reorder_sentences(output_sentences, input)
    meaningful_words = [w for w in token_noun if not w in stops]
    pat_bigrams=bigrams(meaningful_words)
    pat_bigrams=[' '.join(x) for x in pat_bigrams]
    counter_final_words=Counter(pat_bigrams)
    string_words=' '.join(meaningful_words)
    return counter_final_words, string_words
    
#Counter function creates a TDM for each corpora
counter_1, words_1=parse_abstracts(name1+'_uspc_2012.csv')
counter_2, words_2=parse_abstracts(name2+'_uspc_2012.csv')


#collac_1=words_1.collocations()
#print(collac_1)
fdist1=FreqDist(counter_1)
fdist1_top50=fdist1.most_common(50)
print('\nTop 50 Bigram List for',name1)
print(fdist1_top50)
#fdist1.plot(25, cumulative=False)
fdist2=FreqDist(counter_2)
fdist2_top50=fdist2.most_common(50)
print('\nTop 50 BigramList for',name2)
print(fdist2_top50)


#all_items creates a set of all unique words used in both counters
all_items=set()
all_items=set(counter_1.keys()).union( set(counter_2.keys()) )

#Create a vector of the counts of all words in each corpora
vector_1=[counter_1[k] for k in all_items]
# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred) * 100

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Frequency of words
from nltk.probability import FreqDist
import string  
words = dataset.iloc[:, 0].values
fdist = {}
a = ''.join(words)
b = ''.join(['' if ch in string.punctuation else ch for ch in a])
tokens = nltk.tokenize.word_tokenize(b)  
fdist = FreqDist(tokens)             
mostWords = fdist.most_common(100)


#tokens = nltk.word_tokenize(words)
#fdist=FreqDist(tokens)
#for sentence in nltk.tokenize.sent_tokenize(tokenized_sents):
#    for i in nltk.tokenize.word_tokenize(sentence):
#         fdist[i] += 1   
#for i in tokenized_sents:
#    fdist[i].append(i)
# tokenized_sents = [nltk.word_tokenize(i) for i in words]
Example #58
def frequencyDistribution():
	t = word_tokenize(text)
	freq = FreqDist(t)
	print(freq.most_common(50))
Example #59
	def GET(self, jsoninput):
		wordlist = word_tokenize(jsoninput)
		wordlist = filter(lambda a: a != '.' and a != ','  and a != '?' and a !='!', wordlist)
		fd = FreqDist(wordlist)
		mostcom = fd.most_common(10)
		return jsonify('mostfreq', mostcom)
stop_words = nltk.corpus.stopwords.words('spanish') 
non_alphabetic = re.compile(r"\W|\d")
words = []
tags = []

# Using TreeTagger 
# 1) pip install treetaggerwrapper
# 2) put treetragger in %PYHOME%\Lib\site-packages\TreeTagger
# 3) put spanish-utf8.par and spanish-chunker.par in \TreeTagger\lib
# See http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/spanish-tagset.txt for tag meanings
tagger = treetaggerwrapper.TreeTagger(TAGLANG='es')
for sentence in article_corpus.sents():
	tagged_sentence = tagger.tag_text(sentence) 
	tags.extend(treetaggerwrapper.make_tags(tagged_sentence))

#TODO: create a tagger script, save the tagged files
#TODO: look at alternate taggers, compare

#TODO: profile this and see which part is taking so long
for tag in tags:
	lemma = tag[2].lower()
	if lemma not in stop_words and not non_alphabetic.search(lemma):
		words.append(lemma)

freq_dist = FreqDist(words)

with open('./frequency_distribution.txt', 'w', encoding='utf-8') as f:
	f.write("word, number of occurences\n")
	for word in freq_dist.most_common():
		f.write(word[0] + ", " + str(word[1]) + "\n")