Example #1
    def alpha(self):
        """Krippendorff 1980

        """
        # check for degenerate cases
        if len(self.K) == 0:
            raise ValueError("Cannot calculate alpha, no data present!")
        if len(self.K) == 1:
            log.debug("Only one annotation value, allpha returning 1.")
            return 1
        if len(self.C) == 1 and len(self.I) == 1:
            raise ValueError(
                "Cannot calculate alpha, only one coder and item present!")

        total_disagreement = 0.0
        total_ratings = 0
        all_valid_labels_freq = FreqDist([])

        total_do = 0.0  # Total observed disagreement for all items.
        for i, itemdata in self._grouped_data("item"):
            label_freqs = FreqDist(x["labels"] for x in itemdata)
            labels_count = sum(label_freqs.values())
            if labels_count < 2:
                # Ignore the item.
                continue
            all_valid_labels_freq += label_freqs
            total_do += self.Disagreement(label_freqs) * labels_count

        do = total_do / sum(all_valid_labels_freq.values())

        de = self.Disagreement(all_valid_labels_freq)  # Expected disagreement.
        k_alpha = 1.0 - do / de

        return k_alpha
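This matches the alpha() method of NLTK's AnnotationTask (nltk.metrics.agreement). A minimal usage sketch, assuming that class and its (coder, item, label) input format:

from nltk.metrics.agreement import AnnotationTask

# Each record is a (coder, item, label) triple.
data = [
    ("c1", "item1", "yes"), ("c2", "item1", "yes"), ("c3", "item1", "no"),
    ("c1", "item2", "no"),  ("c2", "item2", "no"),  ("c3", "item2", "no"),
]
task = AnnotationTask(data=data)
print(task.alpha())  # Krippendorff's alpha for these toy annotations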
Example #2
    def alpha(self):
        """Krippendorff 1980

        """
        # check for degenerate cases
        if len(self.K) == 0:
            raise ValueError("Cannot calculate alpha, no data present!")
        if len(self.K) == 1:
            log.debug("Only one annotation value, allpha returning 1.")
            return 1
        if len(self.C) == 1 and len(self.I) == 1:
            raise ValueError("Cannot calculate alpha, only one coder and item present!")

        total_disagreement = 0.0
        total_ratings = 0
        all_valid_labels_freq = FreqDist([])

        total_do = 0.0 # Total observed disagreement for all items.
        for i, itemdata in self._grouped_data('item'):
            label_freqs = FreqDist(x['labels'] for x in itemdata)
            labels_count = sum(label_freqs.values())
            if labels_count < 2:
                # Ignore the item.
                continue
            all_valid_labels_freq += label_freqs
            total_do += self.Disagreement(label_freqs) * labels_count

        do = total_do / sum(all_valid_labels_freq.values())

        de = self.Disagreement(all_valid_labels_freq) # Expected disagreement.
        k_alpha = 1.0 - do / de

        return k_alpha
def char_freq(lines):
    """ 返回 DataFrame,按字符频率倒序排列 """
    corpus = nltk.Text(chain.from_iterable(lines))  # 需要一个长字符串,而不是字符串列表
    wc = FreqDist(corpus)
    df = pd.DataFrame({'word': wc.keys(), 'freq': wc.values()})
    df.sort('freq', ascending=False, inplace=True)
    df['idx'] = np.arange(len(wc.values()))
    return df
Example #4
def char_freq(lines):
    """ 返回 DataFrame,按字符频率倒序排列 """
    corpus = nltk.Text(chain.from_iterable(lines))  # 需要一个长字符串,而不是字符串列表
    wc = FreqDist(corpus)
    df = pd.DataFrame({'word': wc.keys(), 'freq': wc.values()})
    df.sort('freq', ascending=False, inplace=True)
    df['idx'] = np.arange(len(wc.values()))
    return df
Example #5
 def getFreq(self, text, normalize=True):
     stop_words = stopwords.words(self.detectLanguage(text))
     words = self.getTokens(text)
     clean_words = filter(
         lambda word: not word in stop_words and not word in punctuation,
         words)
     fdist = FreqDist(clean_words)
     #==============================================================================
     #         # same result
     #         fdist = FreqDist()
     #         for word in word_tokenize(text):
     #             word = word.lower()
     #             if not word in stop_words and not word in punctuation:
     #                 fdist[word] += 1
     #==============================================================================
     # normalization by dividing by the max frequency
     if normalize:
         norm = float(max(fdist.values()))
         for word in list(fdist.keys()):
             fdist[word] = fdist[word] / norm
             # remove too frequent and too rare words
             if fdist[word] >= self._upper_bound or fdist[
                     word] <= self._lower_bound:
                 del fdist[word]
     return fdist
Example #6
def entropy(alist):
    f = FreqDist(alist)
    ent = (-1) * sum([
        float(i) / len(alist) * math.log(float(i) / len(alist))
        for i in f.values()
    ])
    return ent
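A quick usage sketch, assuming the entropy helper above and the imports it relies on:

import math
from nltk import FreqDist  # the snippet above assumes these imports

labels = ["noun", "noun", "verb", "adj"]
print(entropy(labels))  # about 1.04 nats for this 2/1/1 label split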
Example #7
def get_top_words(directory, n, file):
	num_docs = 0.0
	flist = {}
	result = {}
	for f in os.listdir(directory):
		#stop = "/Users/oliverfengpet/Dropbox/TwitterAffect/stoplist.txt"
		
		num_docs+=1
		rawContents = load_file_tokens(directory+'/'+f)
		fdist = FreqDist( rawContents )
		normalF = max(fdist.values())
		
		for key in fdist.keys():
			fdist[key]=float(float(fdist[key])/normalF)
	
		flist[directory+'/'+f] = fdist
		
		
	for key in flist[file].keys():
		num_appear=0
		for key_file in flist.keys():
			if key in flist[key_file].keys():
				num_appear+=1
		
		result[key] = flist[file][key]*math.log(num_docs/(num_appear))
	
	sorted_x = sorted(result.items(), key=operator.itemgetter(1), reverse=True)
	
	top_x = sorted_x[:n]
	result = []
	
	for item in top_x:
		result.append(item[0])
	
	return result
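The same idea (max-normalized term frequency times log(N/df)), sketched for in-memory documents; the directory walk and the load_file_tokens helper used above are not shown, so this stand-in works on a plain dict of token lists:

import math
from nltk import FreqDist

def top_tfidf_terms(docs, target, n):
    """docs: {name: token list}; return the n highest tf-idf terms of docs[target]."""
    tf = {}
    for name, tokens in docs.items():
        fdist = FreqDist(tokens)
        max_freq = max(fdist.values())
        tf[name] = {term: count / max_freq for term, count in fdist.items()}

    num_docs = len(docs)
    scores = {}
    for term, weight in tf[target].items():
        doc_freq = sum(1 for name in docs if term in tf[name])
        scores[term] = weight * math.log(num_docs / doc_freq)
    ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    return [term for term, _ in ranked[:n]]

docs = {
    "a.txt": "the cat sat on the mat".split(),
    "b.txt": "the dog sat on the log".split(),
}
print(top_tfidf_terms(docs, "a.txt", 3))  # terms that distinguish a.txt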
Example #8
    def main(self):
        TABLE = "article"
        cats = [
            'Economy', 'Art', 'Climate', 'Crime', 'Health', 'Politics',
            'Religion', 'Science', 'Sport', 'Tech'
        ]

        config = {
            "host": 'localhost',
            "user": '******',
            "password": '******',
            "db": 'mdac'
        }
        self.db_connect(config)
        print(u"processing records")
        self.corpora = dict()
        for cat in cats:
            bag = dict()
            for article in self.select(
                    TABLE, cat):  # returns a tuple; [0] is the article text
                article = purifier.purify(article[0])
                words = classic_tokenizer.tokenize(article)
                bag = FreqDist(words)
                self.corpora.update({cat: bag})
            print(u"Words of Cat:[{}] are: ({})".format(
                cat, str(len(bag.values()))))
Example #9
def freq_func(input_text):  # nltk: takes text, returns word frequencies
    corpus = nltk.Text(input_text)
    fdist = FreqDist(corpus)
    w = list(fdist.keys())
    v = list(fdist.values())
    freqpd = pd.DataFrame({'word':w,'freq':v})
    freqpd.sort_values(by='freq',ascending=False,inplace=True)
    freqpd['idx'] = np.arange(len(v))
    return freqpd
Example #10
def freq_func(input_txt):
    corpus = nltk.Text(input_txt)
    fdist = FreqDist(corpus)
    w = list(fdist.keys())
    v = list(fdist.values())
    freqdf = pd.DataFrame({'word': w, 'freq': v})
    freqdf.sort_values(by='freq', ascending=False, inplace=True)
    freqdf['idx'] = np.arange(len(v))
    return freqdf
def word_count_func(data):
    #     #using the process_tweet function to tokenize, lower and remove stopwords
    process_data = list(map(process_tweet, data))

    #     #lemmatizing words
    from nltk.stem import WordNetLemmatizer

    #     #instantiating
    lemmatizer = WordNetLemmatizer()

    #     #process_data is a list of lists - here looping over the lists and then the words in the list
    lemmatizer_tweets = []
    for l in process_data:
        new_row = []
        for w in l:
            new_row.append(lemmatizer.lemmatize(w))
        lemmatizer_tweets.append(new_row)

    #This is more descriptive info - calculating the unique vocab of the subset
    overall_lem_vocab = set()
    for tweet in lemmatizer_tweets:
        overall_lem_vocab.update(tweet)
    print(f'Overall vocab of subset: {len(overall_lem_vocab)}')

    #Flattening (going from a list of lists to one single list) the lemmatized tweets for freq
    flat_lemmatizer_tweets = [
        item for sublist in lemmatizer_tweets for item in sublist
    ]

    #applying nltk freqdist function to the flat list
    lem_freq = FreqDist(flat_lemmatizer_tweets)
    print('30 most common words in subset:')
    print(lem_freq.most_common(30))

    #returning normalized word freq because there are different N's
    total_words = sum(lem_freq.values())
    top_30 = lem_freq.most_common(30)
    print("Word \t\t Normalized Frequency")
    print()
    for word in top_30:
        normalized_frequency = word[1] / total_words
        print("{} \t\t {:.4}".format(word[0], normalized_frequency))

    #Creating word clouds - input is a dict with key word value num of occurences
    word_dict = dict(top_30)
    from wordcloud import WordCloud
    wordcloud = WordCloud(
        colormap='Spectral').generate_from_frequencies(word_dict)

    # Display the generated image w/ matplotlib:

    plt.figure(figsize=(10, 10), facecolor='k')
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
Example #12
def yules_k(text):
    """Returns the yules_k of the text
        Keyword arguments:
            text: text
    """
    word_list = nltk.tokenize.word_tokenize(text)
    s1 = len(word_list)
    word_freq_dist = FreqDist(nltk.tokenize.word_tokenize(text))
    s2 = sum([freq ** 2 for freq in word_freq_dist.values()])
    K = 10000 * (s2-s1)/(s1**2)
    return K
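A quick check of the formula, inlined so it runs on its own (assumes the NLTK 'punkt' tokenizer data is installed):

import nltk
from nltk import FreqDist

# nltk.download("punkt")  # uncomment on first use
text = "the cat sat on the mat and the dog sat on the log"
tokens = nltk.tokenize.word_tokenize(text)
s1 = len(tokens)
s2 = sum(freq ** 2 for freq in FreqDist(tokens).values())
print(10000 * (s2 - s1) / (s1 ** 2))  # Yule's K; lower values suggest a richer vocabulary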
def createPDwithTeleport(readerWordlist,mergedWordList):
    ### teleportation parameter with a value of 1 percent
    
    corpusPD = {}
    readerPD = {}
    
    unigramReaderWordList = FreqDist(readerWordlist)
    unigramCorpusWordList = FreqDist(mergedWordList)
    
    for word in unigramCorpusWordList.keys():
        
        corpusPD[word] = unigramCorpusWordList[word]/float(sum(unigramCorpusWordList.values()))
        
        if word in unigramReaderWordList:
            readerPD[word] = unigramReaderWordList[word]/float(sum(unigramReaderWordList.values()))
        else:
            readerPD[word] = 0
            
        readerPD[word] = 0.99*readerPD[word] + 0.01*corpusPD[word]
        
    return readerPD
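A usage sketch for createPDwithTeleport as defined above (the function itself assumes FreqDist is imported):

from nltk import FreqDist  # required by the function above

reader_words = "cat cat dog".split()
corpus_words = "cat dog dog bird bird bird".split()

smoothed = createPDwithTeleport(reader_words, corpus_words)
print(smoothed["bird"])        # nonzero even though the reader never used "bird"
print(sum(smoothed.values()))  # 1.0 here, since every reader word is also in the corpus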
Example #14
def wordprefixsuffixsubstringsprobdist():
	for w in englishdicttxt:
		wtok=w.split()
		if len(wtok) > 0:		
			computeprefixessuffixessubstrings(wtok[0])
			wordlist.append(wtok[0])
	#prefixf=open("WordPrefixesProbabilities.txt","w")
	#suffixf=open("WordSuffixesProbabilities.txt","w")
	prefixdict=FreqDist(prefixes)
	suffixdict=FreqDist(suffixes)
	substringsdict=FreqDist(substrings)
	totalprefixes=sum(prefixdict.values())
	totalsuffixes=sum(suffixdict.values())
	totalsubstrings=sum(substringsdict.values())
	for pk,pv in zip(prefixdict.keys(), prefixdict.values()):
		prefixprobdict[pk] = float(pv)/float(totalprefixes)
	for pk,pv in zip(suffixdict.keys(), suffixdict.values()):
		suffixprobdict[pk] = float(pv)/float(totalsuffixes)
	for pk,pv in zip(substringsdict.keys(), substringsdict.values()):
		substringsprobdict[pk] = float(pv)/float(totalsubstrings)
	#json.dump(prefixprobdict,prefixf)
	#json.dump(suffixprobdict,suffixf)
	#print "prefix probabilities:",prefixprobdict
	#print "suffix probabilities:",suffixprobdict
	return (prefixprobdict, suffixprobdict, substringsprobdict)
Example #15
def freq_words(x, terms = 30):
  all_words = ' '.join([text for text in x])
  all_words = all_words.split()

  fdist = FreqDist(all_words)
  words_df = pd.DataFrame({'word':list(fdist.keys()), 'count':list(fdist.values())})

  # selecting the top `terms` most frequent words
  d = words_df.nlargest(columns="count", n = terms) 
  plt.figure(figsize=(20,5))
  ax = sns.barplot(data=d, x= "word", y = "count")
  ax.set(ylabel = 'Count')
  plt.show()
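A usage sketch for the freq_words function above, with the imports it assumes:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk import FreqDist

docs = [
    "the cat sat on the mat",
    "the dog chased the cat",
    "the bird watched the dog and the cat",
]
freq_words(docs, terms=5)  # bar chart of the 5 most frequent words across all documents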
def remove_low_frequent_words(texts):
    """
    Function to remove low frequent words from texts
  """
    utils.log("Doc2Vec", "Remove low frequent words...")
    dictionary = FreqDist([item for sublist in texts for item in sublist])
    word_frequencies = list(dictionary.values())
    low_word_frequency_quantile = np.percentile(np.array(word_frequencies),
                                                LOW_WORD_FREQUENCY_QUANTILE)
    return [[
        word for word in text
        if dictionary[word] >= low_word_frequency_quantile
    ] for text in tqdm(texts)]
Example #17
def plot_dist_productions_by_frequency(productions):
    f= FreqDist(productions)
    fdd = FreqDist(f.values())
    x = []
    y = []
    for k in fdd.keys():
        x.append(k)
        y.append(fdd[k])
    plt.plot(x,y,lw=2,color= 'b')
    plt.title('Productions by frequency' )
    plt.xlabel('frequency')
    plt.ylabel('number of rules with frequency')
    plt.show()
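A usage sketch, assuming `productions` is a list of nltk grammar productions collected from parse trees (here two small bracketed trees):

import matplotlib.pyplot as plt
from nltk import Tree, FreqDist  # FreqDist is used inside the function above

trees = [
    Tree.fromstring("(S (NP I) (VP (V saw) (NP him)))"),
    Tree.fromstring("(S (NP She) (VP (V saw) (NP me)))"),
]
productions = [p for t in trees for p in t.productions()]
plot_dist_productions_by_frequency(productions)  # how many rules occur once, twice, ...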
Example #18
def plot_dist_productions_by_frequency(productions):
    f = FreqDist(productions)
    fdd = FreqDist(f.values())
    x = []
    y = []
    for k in fdd.keys():
        x.append(k)
        y.append(fdd[k])
    plt.plot(x, y, lw=2, color='b')
    plt.title('Productions by frequency')
    plt.xlabel('frequency')
    plt.ylabel('number of rules with frequency')
    plt.show()
Example #19
    def _termInfo(self):
        info = []
        rawInfo = []
        # lemma frec in referencedLemmas
        try:
            for bData in self.referencedLemmas:
                if self.lemma in bData:
                    fdist = FreqDist(bData)
                    freq = fdist[self.lemma]
                    rawInfo.append(freq)
                    lenTokenList = len(bData)
                    if self.useWdfIdf:
                        metric = math.log(freq * 1.0 + 1.0, 2) / math.log(
                            lenTokenList + 1, 2)
                    else:
                        metric = freq
                    if DISPLAY:
                        app_logger.info(
                            u'[%s] Occurrences: %s Len: %s Max: %s Metric: %s'
                            % (self.lemma, fdist[self.lemma], len(bData),
                               max(fdist.values()), metric))
                    info.append(metric)

            # lemma frec in textLemmas
            freq = self.fdistLemmas[self.lemma]

            _lowerLimit, _median, upperLimit = getMedianDistributionInfo(
                rawInfo)

            self.rawScore = int(freq)
            self.upperLimit = max(settings.MANDATORY_TOKEN_MIN_QUANTITY,
                                  int(upperLimit))

            lenTokenList = len(self.textLemmas)
            if self.useWdfIdf:
                termFreq = math.log(freq * 1.0 + 1.0, 2) / math.log(
                    lenTokenList + 1, 2)
            else:
                termFreq = freq
            # referencedLemmas mean/sigma of lemma
            lowerLimit, _median, upperLimit = getMedianDistributionInfo(info)

            if not self.useWdfIdf:
                lowerLimit = math.ceil(lowerLimit)
                upperLimit = math.ceil(upperLimit)

            return termFreq, lowerLimit, upperLimit
        except Exception as ex:
            raise ex
Example #20
def _train(self, tagged_corpus, cutoff=0, verbose=False): 
    token_count = hit_count = 0 
    useful_contexts = set() 
    fd = ConditionalFreqDist() 
    tag_prob = FreqDist()
    for sentence in tagged_corpus: 
        tokens, tags = zip(*sentence) 
        for index, (token, tag) in enumerate(sentence): 
            # Record the event. 
            token_count += 1 
            tag_prob.inc(tag)
            context = self.context(tokens, index, tags[:index])
            if context is None: continue 
            fd[context].inc(tag) 
            # If the backoff got it wrong, this context is useful: 
            if (self.backoff is None or 
                tag != self.backoff.tag_one(tokens, index, tags[:index])): 
                useful_contexts.add(context) 
    # Build the context_to_tag table -- for each context,
    # calculate the entropy.  Only include contexts whose
    # entropy is lower than `cutoff`.
    total_tags = float(sum(tag_prob.values()))
    tags_probs = [(t,tag_prob[t]/total_tags) for t in tag_prob.keys()]
    useful_contexts_after_filter = useful_contexts.copy()
    most_high = FreqDist()
    for context in useful_contexts:
        dd = fd[context]
#        total_tags = float(sum(dd.values()))
#        tags_probs = [(t,dd[t]/total_tags) for t in dd.keys()]
        h = self.H(dd.keys(),tags_probs)
        if h > cutoff:
            useful_contexts_after_filter.remove(context)
            continue
        most_high[context] = h
    print most_high.keys()
    # Build the context_to_tag table -- for each context, figure
    # out what the most likely tag is.  
    for context in useful_contexts_after_filter:
        best_tag = fd[context].max()
        hits = fd[context][best_tag]
        self._context_to_tag[context] = best_tag
        hit_count += hits
    # Display some stats, if requested. 
    if verbose: 
        size = len(self._context_to_tag) 
        backoff = 100 - (hit_count * 100.0)/ token_count 
        pruning = 100 - (size * 100.0) / len(fd.conditions()) 
        print "[Trained Unigram tagger:", 
        print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (size, backoff, pruning)
Example #21
def add_description_text_analysis(data):
    print "Adding description text analysis..."

    d = data.description

    d_words = d.apply(word_tokenize)
    d_words_count = pd.Series(d_words.apply(len))
    d_words_count.reset_index(d.index)
    d_words_count.rename("word_count", inplace=True)

    content = " ".join(d)
    distr = FreqDist(word_tokenize(content))
    distr_len = float(len(distr.values()))
    word_freqs = d_words.apply(lambda x: [distr[z] / distr_len for z in x])

    data['description_diversity'] = word_freqs.apply(
        np.mean)  # this introduces nans

    return data.join(d_words_count)
Example #22
def get_pos_entropy(all_tokens):
    """Get part-of-speech entropy."""

    # Get all pos tags
    pos = [t.sim_pos for t in all_tokens]

    # Get frequencies
    pos_dist = FreqDist(pos)
    values = list(pos_dist.values())

    # Get probability array
    prob_array = np.array(values)
    prob_array_norm = prob_array / sum(prob_array)

    # Compute entropy
    entropy = np.sum(-1 * (prob_array_norm) *
                     np.nan_to_num(np.log2(prob_array_norm)))

    return entropy
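The function expects token objects carrying a sim_pos attribute (a simplified POS tag). A minimal sketch with a hypothetical stand-in Token type, just to exercise it:

from collections import namedtuple
import numpy as np
from nltk import FreqDist  # np and FreqDist are used inside the function above

Token = namedtuple("Token", ["sim_pos"])  # hypothetical stand-in for the project's token objects
tokens = [Token("NOUN"), Token("NOUN"), Token("VERB"), Token("ADJ")]
print(get_pos_entropy(tokens))  # 1.5 bits for this 2/1/1 tag split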
Example #23
def filterTokens(tokens, typefeatures=None):
    all_terms = FreqDist(tokens)

    
    if typefeatures == 'unigrams':
        minimal = 2
    elif typefeatures == 'bigrams':    
        minimal = 2
    else:
        minimal = 1
   
    other = FreqDist()
    for freq,term in zip(all_terms.values(),all_terms.keys()):
        if freq >= minimal:
            other.inc(term, freq)
        else:
            break

    return other
Example #24
def maxTF(text, normalize=True):
    
    lang = detectLanguage(text)
    stop_words = stopwords.words(lang) + [i for i in punctuation]    

    words = simple_preprocess(text)
    clean_words = filter(lambda word: not word in stop_words, words)
    fdist = FreqDist(clean_words)

    # Maximum tf normalization source: 
    # http://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html
    if normalize:
        norm = float(max(fdist.values()))
        a = 0.5
        for word in list(fdist.keys()):
            fdist[word] = a + (1-a) * (fdist[word] / norm)
            # remove too frequent and too rare words
            if fdist[word] >= 0.9 or fdist[word] <= 0.1:
                del fdist[word]
    return fdist
Example #25
    def common_words(self, wfilter, n_words):

        self.filter = wfilter
        self.n_words = n_words
        all_words = ' '.join([text for text in wfilter])
        all_words = all_words.split()

        #get word frequency
        fdist = FreqDist(all_words)
        words_df = pd.DataFrame({
            'word': list(fdist.keys()),
            'count': list(fdist.values())
        })  #converts to df

        # selecting top #terms most frequent words and plot
        d = words_df.nlargest(columns="count", n=self.n_words)
        # plt.figure(figsize=(20, 5))
        # ax = sns.barplot(data=d, x="word", y="count")
        # ax.set(ylabel='Count')
        # plt.show()
        return d
Example #26
def getFreq(text, normalize=True):
    try:
        stop_words = stopwords.words(detectLanguage(text))
    except LookupError:
        import nltk
        nltk.download('stopwords')
        stop_words = stopwords.words(detectLanguage(text))
    words = getTokens(text)
    clean_words = filter(
        lambda word: not word in stop_words and not word in punctuation, words)
    fdist = FreqDist(clean_words)

    # normalization by dividing by the max frequency
    if normalize:
        norm = float(max(fdist.values()))
        for word in list(fdist.keys()):
            fdist[word] = fdist[word] / norm
            # remove too frequent and too rare words
            if fdist[word] >= upper_bound or fdist[word] <= lower_bound:
                del fdist[word]
    return fdist
def get_buzzwords(docs):
	buzzwords = []
	for doc in docs:
		freqdist = FreqDist(docs[doc])
		vocab = freqdist.keys()
		freqs = freqdist.values()
		buzzwords = buzzwords + vocab[:50]

	buzzwords = set(buzzwords)

	freq_counts = {}
	for buzzword in buzzwords:
		print buzzword
		l = []
		for doc in docs:
			freqdist = FreqDist(docs[doc])
			t = (doc, freqdist[buzzword])
			l.append(t)
		freq_counts[buzzword] = l
	dump_content('freqs', freq_counts)
	return freq_counts
Example #28
    def getFreq(self, text, normalize=True):
        stop_words = stopwords.words(self.detectLanguage(text))
        words = self.getTokens(text)
        clean_words = filter(lambda word: not word in stop_words and not word in punctuation, words)
        fdist = FreqDist(clean_words)
#==============================================================================
#         # same result        
#         fdist = FreqDist()
#         for word in word_tokenize(text):
#             word = word.lower()
#             if not word in stop_words and not word in punctuation:
#                 fdist[word] += 1
#==============================================================================
        # normalization by dividing by the max frequency
        if normalize:
            norm = float(max(fdist.values()))
            for word in list(fdist.keys()):
                fdist[word] = fdist[word] / norm
                # remove too frequent and too rare words
                if fdist[word] >= self._upper_bound or fdist[word] <= self._lower_bound:
                    del fdist[word]
        return fdist
Example #29
def txt_summry(text_data):
    text_data = text_data.lower()
    tokens = nltk.word_tokenize(text_data)
    fdist = FreqDist(tokens)
    maxfreq = max(fdist.values())
    for word in fdist:
        fdist[word] = (fdist[word] / maxfreq)
    sentence_list = nltk.sent_tokenize(text_data)
    sentence_scores = {}
    for sent in sentence_list:
        #considering each word in the sentence, in lowercase
        for word in nltk.word_tokenize(sent.lower()):
            """checking if the word exists in the word_frequencies dictionary.
            This check is performed since we created the sentence_list list from the wikiarticle_text object but the word frequencies were calculated 
            using the formatted_wikiarticle object(which doesn't contain any stop words, numbers, etc.)"""
            if word in fdist.keys():
                #considering only those sentences which have less than 30 words
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        #for first word of sentence, setting frequency to frequency of the first word
                        sentence_scores[sent] = fdist[word]
                    else:
                        #for other words (not first word) in same sentence, increasing frequency by frequency of the word
                        sentence_scores[sent] += fdist[word]

    #gathering the 3 sentences which have the largest scores into a list
    summary_sentences = heapq.nlargest(3,
                                       sentence_scores,
                                       key=sentence_scores.get)

    #making the sentences into a printable format
    summary = ' '.join(summary_sentences)

    #generating the summary
    print("Summarised version of the article: ")
    print()
    print(summary)
    return summary
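A usage sketch for txt_summry, assuming heapq and nltk are imported as the function requires and the 'punkt' data is installed:

import heapq
import nltk
from nltk import FreqDist

# nltk.download("punkt")  # uncomment on first use
article = ("Python is a popular programming language. "
           "It is widely used for data analysis. "
           "Libraries such as NLTK make text processing in Python straightforward. "
           "Readable syntax is one reason for its popularity.")
summary = txt_summry(article)  # prints and returns the 3 highest-scoring sentences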
Example #30
word_tokenize(text)
re.sub('\W', '', text)
re.sub('[^\w ]', '', text)
re.sub('[^\w ']', '', text)
re.sub('[^\w \']', '', text)
nltk.bigrams(text)
big = nltk.bigrams(text)
next(big)
nltk.word_tokenize(text)
text.similar
fdist
fdist['delicious']
dir(fdist)
fdist.max
fdist.values
fdist.values()
fdist.values().sum()
sum(fdist.values())
fdist['delicious'] / sum(fdist.values())
fdist['disgusting'] / sum(fdist.values())
fdist['disgusting']
fdist['vegetarian']
fdist['old-timey']
fdist['healthy']
fdist['expensive']
print text
print(text)
fdist.freq('delicious')
fdist.freq('delicnotehu')
fdist.N()
fdist ?
Example #31
for r_list in tokenized:
    for word in r_list:
        freq_string = freq_string + word + " "

oneString_tokenize = nltk.word_tokenize(freq_string)
oneString_distribution = FreqDist(oneString_tokenize)

mostcommon_words = oneString_distribution.most_common(10)

keys = []
vals = []

for key in oneString_distribution.keys():
    keys.append(key)

for val in oneString_distribution.values():
    vals.append(val)

#Plotting Charts
plt.figure(figsize=(80, 3))
plt.bar(keys, vals)
plt.title("Distribution of a sentence")
plt.xticks(rotation='vertical')
plt.ylabel("Counts ")
plt.xlabel("words")
plt.show()

#higher than 10 letters
print("Words which have more than 10 letters")
print("*************************************")
for i in tokenized:
    for word in i:
        if len(word) > 10:
            print(word)
class StylometryExtractor:
    DALE_CHALL_WORDS = _load_dale_chall_words()
    TOKENIZER = RegexpTokenizer(r"\w+'\w+|\w+")
    SPECIAL_CHAR = '@<:@'

    def __init__(self, text):
        self.raw_text = text
        self.raw_text_length = len(text)
        self.number_of_letters = len(
            [x for x in self.raw_text if x.isalpha() or x.isdigit()])
        self.words = StylometryExtractor.TOKENIZER.tokenize(self.raw_text)
        self.tokens = word_tokenize(self.raw_text)
        self.number_of_words = len(self.words)
        self.number_of_tokens = len(self.tokens)
        #         self.text = Text(word_tokenize(self.raw_text))
        self.words_frequency = FreqDist(Text(self.words))
        self.tokens_frequency = FreqDist(Text(self.tokens))
        self.chars_counter = FreqDist(self.raw_text)
        self.lemmatizer = WordNetLemmatizer()
        self.lemmatized_words_frequency = FreqDist(
            Text([self.lemmatizer.lemmatize(word) for word in self.words]))
        self.sentences = sent_tokenize(self.raw_text)
        self.number_of_sentences = len(self.sentences)
        self.sentence_chars = [len(sent) for sent in self.sentences]
        self.sentence_word_length = [
            len(sent.split()) for sent in self.sentences
        ]
        self.paragraphs = [
            p for p in self.raw_text.split("\n\n")
            if len(p) > 0 and not p.isspace()
        ]
        self.paragraph_word_length = [len(p.split()) for p in self.paragraphs]
        self.all_trigrams = self._all_trigrams()
        self.all_fourgrams = self._all_fourgrams()
        self.ngram_string = self._to_ngram_string()
        self.features = self._to_dict()
        self.feature_names = list(self.features.keys())

    def _to_ngram_string(self):
        cleared_text = ' '.join([
            word for word in self.words
            if word not in stopwords.words('english')
        ])
        return StylometryExtractor.SPECIAL_CHAR.join(
            ''.join(ngram) for ngram in ngrams(cleared_text, 4)
            if ' ' not in ngram and '\n' not in ngram)

    def word_per_thousand(self, word):
        return self.words_frequency[word] * 1000 / self.words_frequency.N()

    def token_per_thousand(self, token):
        return self.tokens_frequency[token] * 1000 / self.tokens_frequency.N()

    def char_per_thousand(self, char):
        return self.chars_counter.freq(char) * 1000

    def chars_per_thousand(self, chars):
        return sum([self.char_per_thousand(char) for char in chars])

    def special_chars_per_thousand(self, special_chars):
        count = self.chars_counter.N()
        for char in special_chars:
            count -= self.chars_counter[char]
        return count / self.chars_counter.N() * 1000

    def upper_chars_per_thousand(self):
        return len(re.findall(r'[A-Z]',
                              self.raw_text)) / self.raw_text_length * 1000

    def spaces_per_thousand(self):
        return len([x for x in self.raw_text if x.isspace()
                    ]) / self.raw_text_length * 1000

    def has_urls(self):
        return int(
            bool(
                re.search('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+',
                          self.raw_text)))

    def syllables_per_thousand(self):
        return self.get_number_syllables() / self.raw_text_length * 1000

    def get_number_syllables(self):
        dic = pyphen.Pyphen(lang='en')
        return sum([len(dic.inserted(word).split("-")) for word in self.words])

    def get_number_pollisyllable_words(self):
        dic = pyphen.Pyphen(lang='en')
        return len([
            word for word in self.words
            if len(dic.inserted(word).split("-")) >= 3
        ])

    def get_words_longer_than_X(self, x):
        return len([word for word in self.words if len(word) >= x])

    def mean_of_syllables_per_word(self):
        return self.get_number_syllables() / self.number_of_words

    def num_of_words_with_more_than_three_syllables_per_thousand(self):
        return self.get_number_pollisyllable_words(
        ) / self.number_of_words * 1000

    def get_flesch_reading_ease(self):
        # http://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
        """
        90.0 - 100.0 - easily understood by an average 11-year-old student
        60.0 - 70.0 - easily understood by 13- to 15-year-old students
        0.00 - 30.0 -  best understood by university graduates
        """
        return 206.835 - 1.015 * self.number_of_words / self.number_of_sentences - 84.6 * self.get_number_syllables(
        ) / self.number_of_words

    def flesch_kincaid_grade_level(self):
        # http://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
        """
            It is more or less the number of years of education generally required to understand this text.
            The lowest grade level score in theory is -3.40.
        """
        return 0.39 * self.number_of_words / self.number_of_sentences + 11.8 * self.get_number_syllables(
        ) / self.number_of_words - 15.59

    def get_coleman_liau_index(self):
        # http://en.wikipedia.org/wiki/Coleman%E2%80%93Liau_index
        """
             It approximates the U.S. grade level thought necessary to comprehend the text.
        """
        return 5.89 * self.number_of_letters / self.number_of_words - 29.6 * self.number_of_sentences / self.number_of_words - 15.8

    def get_gunning_fog_index(self):
        # http://en.wikipedia.org/wiki/Gunning_fog_index
        """
        The index estimates the years of formal education needed to understand the text on a first reading
        """
        return 0.4 * (self.number_of_words / self.number_of_sentences +
                      100.0 * self.get_number_pollisyllable_words() /
                      self.number_of_words)

    def get_smog_index(self):
        # http://en.wikipedia.org/wiki/SMOG
        """
            Simple Measure of Gobbledygook (SMOG) is a simplification of Gunning Fog, also estimating the years of formal education needed
            to understand a text
        """
        return 1.043 * math.sqrt(self.get_number_pollisyllable_words() * 30.0 /
                                 self.number_of_sentences) + 3.1291

    def get_ari_index(self):
        # http://en.wikipedia.org/wiki/Automated_Readability_Index
        """
            It produces an approximate representation of the US grade level needed to comprehend the text.
        """
        return 4.71 * self.number_of_letters / self.number_of_words + 0.5 * self.number_of_words / self.number_of_sentences - 21.43

    def get_lix_index(self):
        # http://en.wikipedia.org/wiki/LIX
        # http://www.readabilityformulas.com/the-LIX-readability-formula.php
        """
            Value interpretation:
            Very Easy      - 20, 25
            Easy           - 30, 35
            Medium         - 40, 45
            Difficult      - 50, 55
            Very Difficult - 60+
        """
        long_words = self.get_words_longer_than_X(6)
        number_of_periods = self.number_of_sentences + self.tokens_frequency[
            ':'] + self.tokens_frequency[';']
        return self.number_of_words / number_of_periods + 100.0 * long_words / self.number_of_words

    def number_of_dale_chall_difficult_words(self):
        return len([
            word for word in self.words
            if word not in StylometryExtractor.DALE_CHALL_WORDS
        ])

    def get_dale_chall_score(self):
        # http://en.wikipedia.org/wiki/Dale%E2%80%93Chall_readability_formula
        """
            4.9 or lower    ---  easily understood by an average 4th-grade student or lower
            5.0–5.9         ---  easily understood by an average 5th or 6th-grade student
            6.0–6.9         ---  easily understood by an average 7th or 8th-grade student
            7.0–7.9         ---  easily understood by an average 9th or 10th-grade student
            8.0–8.9         ---  easily understood by an average 11th or 12th-grade student
            9.0–9.9         ---  easily understood by an average 13th to 15th-grade (college) student
            10.0 or higher  ---  easily understood by an average college graduate
        """
        return 15.79 * self.number_of_dale_chall_difficult_words(
        ) / self.number_of_words + 0.0496 * self.number_of_words / self.number_of_sentences

    def get_dale_chall_known_fraction(self):
        """
            Computes the fraction of easy words in the text, i.e., the fraction of words that could be found in the
            dale chall list of 3.000 easy words.
        """
        return 1.0 - self.number_of_dale_chall_difficult_words(
        ) / self.number_of_words

    def yule_vocabulary_richness(self):
        M2 = sum([
            len(list(g)) * (freq**2) for freq, g in groupby(
                sorted(self.lemmatized_words_frequency.values()))
        ])
        M1 = float(sum(self.lemmatized_words_frequency.values()))
        return ((M2 - M1) / (M1 * M1)) * 10000

    def simpson_vocabulary_richness(self):
        result = 0
        for freq, g in groupby(sorted(
                self.lemmatized_words_frequency.values())):
            result += (len(list(g))) * freq * (freq - 1)
        n = sum(self.lemmatized_words_frequency.values())
        return (result / n / (n - 1))

    def mean_sentence_len(self):
        return np.mean(self.sentence_word_length)

    def std_sentence_len(self):
        return np.std(self.sentence_word_length)

    def mean_paragraph_len(self):
        return np.mean(self.paragraph_word_length)

    def std_paragraph_len(self):
        return np.std(self.paragraph_word_length)

    def mean_word_len(self):
        word_chars = [len(word) for word in self.words]
        return sum(word_chars) / len(word_chars)

    def unique_words_ratio(self):
        return len(set(self.words)) / self.number_of_words * 100

#     def get_byte_ngrams(self, number_of_bytes):

    @classmethod
    def to_pos_tags(cls, sentence):
        tokens = StylometryExtractor.TOKENIZER.tokenize(sentence)
        pos_tags = list(map(lambda x: x[1], pos_tag(tokens)))
        return ['__START__'] + pos_tags + ['__END__']

    @classmethod
    def pos_tag_trigrams(cls, sentence):
        pos_tags = StylometryExtractor.to_pos_tags(sentence)
        return [(x, y, z)
                for x, y, z in zip(pos_tags, pos_tags[1:], pos_tags[2:])]

    @classmethod
    def pos_tag_fourgrams(cls, sentence):
        pos_tags = StylometryExtractor.to_pos_tags(sentence)
        return [(p, l, m, n) for p, l, m, n in zip(pos_tags, pos_tags[1:],
                                                   pos_tags[2:], pos_tags[3:])]

    def _all_trigrams(self):
        return Counter(
            trigram for sentence in self.sentences
            for trigram in StylometryExtractor.pos_tag_trigrams(sentence))

    def _all_fourgrams(self):
        return Counter(
            fourgram for sentence in self.sentences
            for fourgram in StylometryExtractor.pos_tag_fourgrams(sentence))

    def pos_tag_trigrams_percents(self):
        number_of_trigrams = sum(self.all_trigrams.values())
        return {
            '_'.join(trigram):
            self.all_trigrams[trigram] / number_of_trigrams * 1000
            for trigram in MOST_COMMON_POS_TAG_TRIGRAMS
        }

    def pos_tag_fourgrams_percents(self):
        number_of_fourgrams = sum(self.all_fourgrams.values())
        return {
            '_'.join(fourgram):
            self.all_fourgrams[fourgram] / number_of_fourgrams * 1000
            for fourgram in MOST_COMMON_POS_TAG_FOURGRAMS
        }

    def char_ngrams_tf_idf(self):
        return dict(
            zip(VECTORIZER.get_feature_names(),
                VECTORIZER.transform([self.ngram_string]).toarray()[0]))

    def to_dict(self):
        return self.features

    def to_vector(self):
        return list(self.features.values())

    def _to_dict(self):
        features = {
            'Lexical diversity':
            self.unique_words_ratio(),
            'Mean Word Length':
            self.mean_word_len(),
            'Mean Sentence Length':
            self.mean_sentence_len(),
            'STDEV Sentence Length':
            self.std_sentence_len(),
            'Mean paragraph Length':
            self.mean_paragraph_len(),
            'Flesch Reading Ease':
            self.get_flesch_reading_ease(),
            'Flesch Kincaid Grade':
            self.flesch_kincaid_grade_level(),
            'Coleman Liau Index':
            self.get_coleman_liau_index(),
            'Gunning Fog Index':
            self.get_gunning_fog_index(),
            'Smog Index':
            self.get_smog_index(),
            'Ari Index':
            self.get_ari_index(),
            'Lix Index':
            self.get_lix_index(),
            'Dale Chall Score':
            self.get_dale_chall_score(),
            'Dale Chall Known Fraction':
            self.get_dale_chall_known_fraction(),
            'Yule Vocabulary Richness':
            self.yule_vocabulary_richness(),
            'Simpson Vocabulary Richness':
            self.simpson_vocabulary_richness(),
            'Punctuation':
            self.chars_per_thousand(['.', ',', '!', ';', '?']),
            'Special characters':
            self.chars_per_thousand([
                '%', '#', ')', '(', '@', '$', '^', '&', '>', '<', '*', '_',
                '-', '=', '-', '+', '/', '\\', '\'', '`'
            ]),
            'Even more special characters':
            self.special_chars_per_thousand([
                "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "A", "B",
                "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N",
                "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
                "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l",
                "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x",
                "y", "z", '%', '#', ')', '(', '@', '$', '^', '&', '>', '<',
                '*', '_', '-', '=', '-', '+', '/', '\\', "'", '`', '"', '\n',
                '\r', ' ', '.', ',', '!', ';', '?', '[', ']', '{', '}', '\t',
                ':'
            ]),
            'Commas':
            self.token_per_thousand(','),
            'Semicolons':
            self.token_per_thousand(';'),
            'Quotations':
            self.token_per_thousand('"'),
            'Exclamations':
            self.token_per_thousand('!'),
            'Colons':
            self.token_per_thousand(':'),
            'Hyphens':
            self.token_per_thousand('-'),
            'Double Hyphens':
            self.token_per_thousand('--'),
            'Spaces':
            self.spaces_per_thousand(),
            'UpperCase Letters':
            self.upper_chars_per_thousand(),
            'Has URLs':
            self.has_urls(),
            'A':
            self.chars_per_thousand(['a', 'A']),
            'B':
            self.chars_per_thousand(['b', 'B']),
            'C':
            self.chars_per_thousand(['c', 'C']),
            'D':
            self.chars_per_thousand(['d', 'D']),
            'E':
            self.chars_per_thousand(['e', 'E']),
            'F':
            self.chars_per_thousand(['f', 'F']),
            'G':
            self.chars_per_thousand(['g', 'G']),
            'H':
            self.chars_per_thousand(['h', 'H']),
            'I':
            self.chars_per_thousand(['i', 'I']),
            'J':
            self.chars_per_thousand(['j', 'J']),
            'K':
            self.chars_per_thousand(['k', 'K']),
            'L':
            self.chars_per_thousand(['l', 'L']),
            'M':
            self.chars_per_thousand(['m', 'M']),
            'N':
            self.chars_per_thousand(['n', 'N']),
            'O':
            self.chars_per_thousand(['o', 'O']),
            'P':
            self.chars_per_thousand(['p', 'P']),
            'Q':
            self.chars_per_thousand(['q', 'Q']),
            'R':
            self.chars_per_thousand(['r', 'R']),
            'S':
            self.chars_per_thousand(['s', 'S']),
            'T':
            self.chars_per_thousand(['t', 'T']),
            'U':
            self.chars_per_thousand(['u', 'U']),
            'V':
            self.chars_per_thousand(['v', 'V']),
            'W':
            self.chars_per_thousand(['w', 'W']),
            'X':
            self.chars_per_thousand(['x', 'X']),
            'Y':
            self.chars_per_thousand(['y', 'Y']),
            'Z':
            self.chars_per_thousand(['z', 'Z']),
            'Numbers':
            self.chars_per_thousand(
                ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']),
            'Syllables':
            self.syllables_per_thousand(),
            'Mean syllables per word':
            self.mean_of_syllables_per_word(),
            'Words with >= 3 syllables':
            self.num_of_words_with_more_than_three_syllables_per_thousand(),
        }

        for stopword in stopwords.words('english'):
            features[stopword] = self.word_per_thousand(stopword)

        features.update(self.pos_tag_trigrams_percents())
        features.update(self.pos_tag_fourgrams_percents())
        features.update(self.char_ngrams_tf_idf())

        return OrderedDict(sorted(features.items(), key=lambda t: t[0]))
counts = data_vect.sum(axis=0).A1
top_idxs = (-counts).argsort()[:50]
top_idxs

import matplotlib.pyplot as plt
fdist.plot(30,cumulative=False)
plt.show()

ranks = range(1, len(fdist) + 1)
# range of 1-2-3-4-5-...-N 
#freqs = list()
#for token in fdist.keys():
#  freqs.append(fdist[token])
# unsorted list of frequencies per word
#ranks = range(1, fdist.B() + 1)
freqs = list(fdist.values())
# sorted (=ranked!) list of frequencies 
freqs.sort(reverse = True)
plt.plot(ranks, freqs, '-')
plt.xscale('log')
plt.yscale('log')
plt.show()

import random
random.sample(unique_tokens, 20)


all_words = list()
for text in sections:
  words = text.split()
  all_words.extend(words)
Example #34
def getDomainRanking(siteDomain, seoLibrary, queries):
    from nltk.probability import FreqDist

    queriesRankingInfo = {}
    domainList = []
    '''
    Collect the ranking information for each query
    '''

    for query in queries:
        try:
            queriesRankingInfo[query] = getQueryRanking(
                query, seoLibrary.language, seoLibrary.country)
            domainList.extend(queriesRankingInfo[query].keys())
        except Exception as ex:
            print(ex)
            continue

    domainFreq = FreqDist(domainList)

    #lowerLimit = domainFreq[siteDomain]
    import numpy as np
    lowerLimit = np.percentile(domainFreq.values(), 25)
    '''
    Merge the results by domain
    '''

    domainsInfo = {}

    for domain in domainFreq.keys():
        if domainFreq[domain] >= lowerLimit or domain == siteDomain:
            appearIn = {}
            notAppearIn = []
            for query, data in queriesRankingInfo.items():
                if domain in data:
                    appearIn[query] = data[domain]
                else:
                    notAppearIn.append(query)

            domainsInfo[domain] = DomainGoogleRankingInfo(
                domain, appearIn, notAppearIn)

    # Analysis of the site based on the queries. If it is missing .. bad sign
    # we may have been given the domain without www and it redirects to www
    try:
        siteDomainInfo = domainsInfo[siteDomain]
    except:
        # add or remove the www prefix
        p = urlparse.urlparse(siteDomain)
        netloc = p.netloc or p.path

        if not netloc.startswith('www.'):
            siteDomain = 'www.' + netloc
        else:
            siteDomain = netloc[4:]
        siteDomainInfo = domainsInfo[siteDomain]

    # Competitors detected in the analysed queries
    del domainsInfo[siteDomain]
    domainCompetence = domainsInfo.values()

    # most appearances first
    ##domainCompetence.sort(key=lambda x: x.avgPos, reverse=False)
    domainCompetence.sort(key=lambda x: len(x.appearIn), reverse=True)
    domainCompetence = domainCompetence[:MAX_COMPETENCE_URLS]
    domainCompetence.sort(key=lambda x: x.avgPos,
                          reverse=False)  # by position

    return siteDomainInfo, domainCompetence
     ngrams_most_common.append([k for (k,_) in fdist_ngrams.most_common(params.m)])
     outputname = "output_for_" + f.name.rsplit(os.sep, 2)[1]
     
     # Write out the distribution of words in the document
     with codecs.open("distributions-data/output/words_" + outputname, "w", encoding=my_encoding) as out:
         for k,v in fdist_words.most_common():
             prozent = fdist_words.freq(k)
             out.write("{},{},{}\n".format(k,v, prozent))
     # Write out the distribution of ngrams in the document
     with codecs.open("distributions-data/output/letters_" + outputname, "w", encoding=my_encoding) as out:
         for k,v in fdist_ngrams.most_common():
             prozent = v / (len(unigrams) if len(k) == 1 else len(bigrams))
             out.write("{},{},{}\n".format(k,v, prozent))  
     # Write the size of bins of words that appear with the same frequency               
     with codecs.open("distributions-data/bins/" + outputname, "w", encoding=my_encoding) as out:
         for i in sorted(set(fdist_words.values())):
             bin_size = fdist_words.Nr(i)
             out.write("{},{}\n".format(i,bin_size))     
 print('Output distributions saved in \'output\' folder.')
 print('Output bins saved in \'bins\' folder.')
 # If there are many documents -> compare their most common words and ngrams
 if len(params.files) > 1:
     print("Pairwise overlap between {} most frequent words:".format(params.n))
     short_names = [f.name[-15:] for f in params.files]
     for i, list1 in enumerate(words_most_common):
         for j, list2 in enumerate(words_most_common[i+1:]):
             print("{} | {} | ".format(short_names[i], short_names[i+j+1]), end="")
             overlap = len([w for w in list1 if w in list2])
             print(overlap)
     print("Pairwise overlap between {} most frequent letters and letter pairs:".format(params.m))
     short_names = [f.name[-15:] for f in params.files]
Example #36
def zipfs(data):
    # create empty summary_tokens list to hold tokens from every summary
    summary_tokens = []
    # boolean check to see if we have already gone through and tokenized everything
    files_exist = os.path.isfile("data/summary_tokens.txt")

    # if the data files don't exist, lets go through the process of creating them (takes ~3 minutes)
    if not files_exist:
        print(
            "\nThe summary_tokens file doesn't exist, beginning tokenization process, grab some coffee..."
        )

        # store the start time so we can keep track of how long this process takes
        t1 = time.time()

        # for zipfs law we will take out punctuation, but not stop words
        noiseWords = [
            "{{Expand section}}", ",", ".", "(", "[", "{", ")", "]", "}", ":",
            ";", "&", "'", '"', "'s", "``", "''", "n't", "`", '’'
        ]

        # iterate through the dataset, this is largely the same structure as in top_genres so I won't repeat comments
        for row in data.itertuples(index=True):
            # grab the summary string for tokenization
            summary_str = str(getattr(row, 'summary'))

            # tokenize the summary string
            tokens = word_tokenize(summary_str)

            # check to see if any of the tokens are in our noiseTokens list...
            for token in tokens:
                # if its not a noise word...
                if token not in noiseWords:
                    # then add the token to our summary_tokens list
                    summary_tokens.append(token)

        # grab the stop time and alert the user of progress
        t2 = time.time()
        print("Tokenization completed in " + str(t2 - t1) + " seconds.\n")

        # next lets write our summary tokens to the "summary_tokens.txt" file so we don't have to do this again
        summary_file = open("data/summary_tokens.txt", "w")
        # go through each token in the summary_tokens list
        for token in summary_tokens:
            # write each token on a newline
            summary_file.write("%s\n" % token)

        # close our file for memory
        summary_file.close()

    # the summary_tokens file already exists and we don't need to do any tokenization, this should be the normal case
    else:
        print("\nThe summary_tokens file exists, beginning token loading...")
        # open the summary_tokens.txt file
        summary_file = open("data/summary_tokens.txt", "r")

        # iterate over each line in the file
        for index, line in enumerate(summary_file):
            # trim the new line characters from the line
            trimmed_line = line.replace("\n", "")
            # append the line (token) to the summary_tokens list
            summary_tokens.append(trimmed_line)

        # close the summary file for memory
        summary_file.close()

        # we are now done loading the summary file and can proceed with addressing zipfs law
        print("Done loading!\n")

    print("Creating frequency distribution from " + str(len(summary_tokens)) +
          " summary tokens...")
    # create the frequency distribution of our summary tokens
    summary_fdist = FreqDist(summary_tokens)
    print("Frequency distribution computed!")

    # calculate the frequencies from our summary_fdist by grabbing the values and sorting descending (plot trends downwards)
    # this will be our y axis on our zipfs plot
    freqs = sorted(summary_fdist.values(), reverse=True)
    # calculate the ranks via our frequencies
    # this will be our x axis
    ranks = range(1, len(freqs) + 1)

    print(
        "\nNow plotting the ranks/frequencies of the summary frequency distribution to demonstrate zipfs law..."
    )
    print(
        "A plot should be displayed shortly, close the plot to finish script execution."
    )

    # use a loglog plot from matplotlib/pyplot (log of both axis) from our ranks by freqs
    plt.loglog(ranks, freqs)
    # label our x axis
    plt.xlabel('rank (r)', fontsize=12, fontweight='bold')
    # label our y axis
    plt.ylabel('frequency (f)', fontsize=12, fontweight='bold')
    # add a grid for visibility
    plt.grid(True)
    # display the plot
    plt.show()
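The rank-frequency plot at the end of zipfs() can be reproduced from any FreqDist; a standalone sketch using NLTK's Gutenberg sample (assuming that corpus has been downloaded), without the summary-token files:

import matplotlib.pyplot as plt
from nltk import FreqDist
from nltk.corpus import gutenberg

# import nltk; nltk.download("gutenberg")  # uncomment on first use
fdist = FreqDist(gutenberg.words("austen-emma.txt"))
freqs = sorted(fdist.values(), reverse=True)  # y axis: frequency, highest first
ranks = range(1, len(freqs) + 1)              # x axis: rank 1..V

plt.loglog(ranks, freqs)  # a roughly straight line is the Zipf signature
plt.xlabel('rank (r)')
plt.ylabel('frequency (f)')
plt.grid(True)
plt.show()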
Example #37
#print(numpa)
termino = []  # words in the current document
termifrecdoc = []  # term-frequency-document matrix
m = 0  # counter
k = 1  # counter
for i in range(len(numpa)):
    n = numpa[i]  # number of words at position i of numpa
    for j in range(n):
        termino.append(palabras[m + j])
    # list of word frequencies
    termino.sort()
    #print(termino)
    #print(len(termino))
    fd = FreqDist(termino)  # frequency of each word in one document
    frecpalab = list(
        fd.values())  # list of frequencies for each word in one document
    #print(frecpalab)
    #print(len(frecpalab))
    # list of unique words
    p = 0  # counter
    listpalabras = []  # unique words
    for i in range(len(frecpalab)):
        listpalabras.append(termino[p])
        p = p + frecpalab[i]
    #print(listpalabras)
    # build the term-frequency-document matrix
    for i in range(len(listpalabras)):
        termifrecdoc.append([listpalabras[i], frecpalab[i], k])
    k = k + 1
    listpalabras = []
    termino = []
def summarize(text, n):
    sents = sent_tokenize(text)
    
#    assert n <= len(sents)
    wordSent = word_tokenize(text.lower())
    stopWords = set(stopwords.words('english')+list(punctuation))
    
    wordSent= [word for word in wordSent if word not in stopWords]
    freq = FreqDist(wordSent)
#    print(freq.items())             # (word,frequency)
#    print(list(freq.keys()))              # (words)
    words = list(freq.keys())

#    print(list(freq.values()))            # (frequency)
    frequency = list(freq.values())
    #print(frequency)
#    freq.plot(20,cumulative=False)  # graph plot of the word and frquency

    dictlist = []

    for i in range(len(words)):
        dict1 = {'word':words[i],'freq':frequency[i]}
        dictlist.append(dict1)
#    df = pd.DataFrame(dict)
#    print(df.head())

#    dataFrame = df
   

    # ====================================================== Feed data into MySql database ================================================================
    '''
    tableName = "project"

    sqlEngine = create_engine('mysql+pymysql://root:[email protected]/project', pool_recycle=3600)

    dbConnection = sqlEngine.connect()

    '''

    mydb = mysql.connector.connect(
      host="localhost",
      user="******",
      passwd="password",
      auth_plugin='mysql_native_password',
      database='project'
    )
    
    mycursor = mydb.cursor()
    
    for i in dictlist:
        word,count = i.values()
        sql = "INSERT INTO project (word, count) VALUES (%s, %s)"
        val = (word,count)
        mycursor.execute(sql, val)

    mydb.commit()


 
    '''
    try:

        frame = dataFrame.to_sql(tableName, dbConnection);

    except ValueError as vx:

        print(vx)

    except Exception as ex:   

        print(ex)

    else:

        print("Table %s created successfully."%tableName);   

    finally:

        dbConnection.close()
    '''
    # ======================================================== End of Database connection ================================================================

    ranking = defaultdict(int)
    
    for i, sent in enumerate(sents):
        for w in word_tokenize(sent.lower()):
            if w in freq:
                ranking[i] += freq[w]
    sentsIDX = nlargest(n, ranking, key=ranking.get)
    return [sents[j] for j in sorted(sentsIDX)]
Example #39
def entropy(alist):
    f = FreqDist(alist)
    ent = (-1) * sum(
        [i / len(alist) * math.log(i / len(alist), 2) for i in f.values()])
    return ent