def evaluate(row):
    record, stop_words = row
    text = nltk.word_tokenize(record['text'])
    sentList = nltk.sent_tokenize(record['text'])
    wordsInSentsPos = [nltk.pos_tag(nltk.word_tokenize(s)) for s in sentList]
    wordsInSentsWnPos = [[(w[0], penn2morphy(w[1])) for w in s if w[0].lower() not in stop_words]
                         for s in wordsInSentsPos]
    # The above returns a list of sentences where each sentence is a list of
    # (word-as-string, pos tag) tuples. Stop words are removed here because pos_tag
    # uses grammatical structure but lesk does not.

    # lemmatization
    lmtzr = WordNetLemmatizer()
    for sent in wordsInSentsWnPos:
        for indx, tup in enumerate(sent):
            if tup[1] != '':
                # if pos exists, reset word to lemma
                word = lmtzr.lemmatize(tup[0], tup[1])
                text[indx] = word
            # if pos tagger failed, keep original word

    fdist = FreqDist(text)
    unfiltered_frequencies = fdist.most_common(fdist.B())
    frequencies = [
        t for t in unfiltered_frequencies
        if t[0] not in stop_words and t[1] > 5 and len(t[0]) > 2
    ]
    return frequencies
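# The snippet above relies on a penn2morphy() helper that is not shown in this
# excerpt. A minimal sketch of such a helper, assuming it maps Penn Treebank
# tags to WordNet POS constants and returns '' when no mapping exists (which
# is how the lemmatization loop above treats a "failed" tag):
from nltk.corpus import wordnet

def penn2morphy(penn_tag):
    # Map the first two characters of a Penn Treebank tag to a WordNet POS.
    morphy_map = {'NN': wordnet.NOUN, 'JJ': wordnet.ADJ,
                  'VB': wordnet.VERB, 'RB': wordnet.ADV}
    return morphy_map.get(penn_tag[:2], '')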
def show_stats_for_text(text):
    words = tokenize(text, clean_filter=FILTER_ALL)
    fd = FreqDist(words)
    logger.info('Total words: %s', len(words))
    logger.info('Distinct words: %s', fd.B())
    logger.info('Most common words')
    for word, count in fd.most_common(20):
        logger.info('%s\t%s', word, count)
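# tokenize() and FILTER_ALL above are project-specific and not defined in this
# excerpt. A hypothetical stand-in, assuming FILTER_ALL means "lowercase and
# drop punctuation, digits and English stop words":
import string
from nltk import word_tokenize
from nltk.corpus import stopwords

FILTER_ALL = 'all'  # assumed flag name; the real project may define several levels

def tokenize(text, clean_filter=FILTER_ALL):
    tokens = [t.lower() for t in word_tokenize(text)]
    if clean_filter == FILTER_ALL:
        stop = set(stopwords.words('english'))
        tokens = [t for t in tokens
                  if t not in stop and t not in string.punctuation and not t.isdigit()]
    return tokens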
def evaluate(row):
    record, stop_words = row
    text = nltk.word_tokenize(record['text'])
    fdist = FreqDist(text)
    unfiltered_frequencies = fdist.most_common(fdist.B())
    frequencies = [
        t for t in unfiltered_frequencies
        if t[0] not in stop_words and t[1] > 5 and len(t[0]) > 2
    ]
    return frequencies
def calculo_frecuencias(bag_of_words):
    """Compute word frequencies and plot the most frequent ones.

    Args:
        bag_of_words: list of strings
    """
    freq_dist = FreqDist(bag_of_words)
    print("Number of tokens: %d" % freq_dist.N())
    print("Number of distinct tokens: %d" % freq_dist.B())
    print("The most frequent token is: %s" % str(freq_dist.max()))
    freq_dist.plot(50)
def getFreqDist(self):
    fieldnames = ['Word', 'Frequency']
    with open(self.csvfile, 'w', newline='') as csvf:
        writer = csv.DictWriter(csvf, fieldnames=fieldnames)
        writer.writeheader()
        text = self.text
        # set stopwords
        stopwords = set(nltk.corpus.stopwords.words('english'))
        words = word_tokenize(text)
        # remove words if length of word is not over 1 (i.e. punctuation)
        words = [word for word in words if len(word) > 1]
        # remove numbers
        words = [word for word in words if not word.isnumeric()]
        # make all words lowercase
        words = [word.lower() for word in words]
        # remove stopwords
        words = [word for word in words if word not in stopwords]
        fdist = FreqDist(words)
        # number of all words
        print('Total number of samples: %i' % fdist.N())
        # number of all distinct words
        print('Total number of bins: %i' % fdist.B())
        # write all bins and counts into the CSV file
        for word, frequency in fdist.most_common(fdist.B()):
            writer.writerow({'Word': word, 'Frequency': frequency})
def evaluate(row):
    record, stop_words = row
    text = nltk.word_tokenize(record['text'])

    # stemming
    stemmer = SnowballStemmer("english")
    for indx, word in enumerate(text):
        word = stemmer.stem(word)
        text[indx] = word

    fdist = FreqDist(text)
    unfiltered_frequencies = fdist.most_common(fdist.B())
    frequencies = [
        t for t in unfiltered_frequencies
        if t[0] not in stop_words and t[1] > 5 and len(t[0]) > 2
    ]
    return frequencies
def __init__(self, n, train, estimator=None):
    """
    Creates an ngram language model to capture patterns in n consecutive
    words of training text.  An estimator smooths the probabilities derived
    from the text and may allow generation of ngrams not seen during training.

    @param n: the order of the language model (ngram size)
    @type n: C{int}
    @param train: the training text
    @type train: C{list} of C{string}
    @param estimator: a function for generating a probability distribution
    @type estimator: a function that takes a C{ConditionalFreqDist} and
        returns a C{ConditionalProbDist}
    """
    self._n = n
    self._N = 1 + len(train) - n

    if estimator is None:
        def estimator(fdist, bins):
            return MLEProbDist(fdist)

    if n == 1:
        fd = FreqDist(train)
        self._model = estimator(fd, fd.B())
    else:
        cfd = ConditionalFreqDist()
        self._ngrams = set()
        self._prefix = ('',) * (n - 1)

        for ngram in ingrams(chain(self._prefix, train), n):
            self._ngrams.add(ngram)
            context = tuple(ngram[:-1])
            token = ngram[-1]
            cfd[context][token] += 1

        self._model = ConditionalProbDist(cfd, estimator, len(cfd))

    # recursively construct the lower-order models
    if n > 1:
        self._backoff = NgramModel(n - 1, train, estimator)
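# A usage sketch for the constructor above (not part of the original source).
# The class name NgramModel comes from the recursive call in the constructor;
# the Brown corpus and LidstoneProbDist smoothing are assumptions for the demo.
from nltk.corpus import brown
from nltk.probability import LidstoneProbDist

est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
lm = NgramModel(3, list(brown.words(categories='news')), estimator=est)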
def __init__(self, n, train, pad_left=False, pad_right=False,
             estimator=None, *estimator_args, **estimator_kwargs):
    """
    Creates an ngram language model to capture patterns in n consecutive
    words of training text.  An estimator smooths the probabilities derived
    from the text and may allow generation of ngrams not seen during training.

    :param n: the order of the language model (ngram size)
    :type n: C{int}
    :param train: the training text
    :type train: C{iterable} of C{string} or C{iterable} of C{iterable} of C{string}
    :param estimator: a function for generating a probability distribution---defaults to MLEProbDist
    :type estimator: a function that takes a C{ConditionalFreqDist} and returns a C{ConditionalProbDist}
    :param pad_left: whether to pad the left of each sentence with an (n-1)-gram of <s>
    :type pad_left: bool
    :param pad_right: whether to pad the right of each sentence with </s>
    :type pad_right: bool
    :param estimator_args: Extra arguments for estimator.
        These arguments are usually used to specify extra properties for the
        probability distributions of individual conditions, such as the
        number of bins they contain.
        Note: For backward-compatibility, if no arguments are specified, the
        number of bins in the underlying ConditionalFreqDist are passed to
        the estimator as an argument.
    :type estimator_args: (any)
    :param estimator_kwargs: Extra keyword arguments for the estimator
    :type estimator_kwargs: (any)
    """
    # protection from cryptic behavior for calling programs
    # that use the pre-2.0.2 interface
    assert isinstance(pad_left, bool)
    assert isinstance(pad_right, bool)

    # make sure n is greater than zero, otherwise print it
    assert (n > 0), n

    # For explicitness save the check whether this is a unigram model
    self.is_unigram_model = (n == 1)
    # save the ngram order number
    self._n = n
    # save left and right padding
    self._lpad = ('<s>',) * (n - 1) if pad_left else ()
    # Need _rpad even for unigrams or padded entropy will give
    # wrong answer because '</s>' will be treated as unseen...
    self._rpad = ('</s>',) if pad_right else ()
    self._padLen = len(self._lpad) + len(self._rpad)

    self._N = 0
    delta = 1 + self._padLen - n  # len(sent)+delta == ngrams in sent

    if estimator is None:
        assert (estimator_args == ()) and (estimator_kwargs == {}), \
            "estimator_args (%s) or _kwargs supplied (%s), but no estimator" % (estimator_args, estimator_kwargs)
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    # Given backoff, a generator isn't acceptable
    if not isinstance(train, collections.abc.Sequence):
        train = list(train)
    self._W = len(train)
    # Coerce to list of list -- note that this means to train charGrams,
    # requires exploding the words ahead of time
    if train is not None:
        if isinstance(train[0], compat.string_types):
            train = [train]
            self._W = 1
        elif not isinstance(train[0], collections.abc.Sequence):
            # if you mix strings and generators, you have only yourself to blame!
            for i in range(len(train)):
                train[i] = list(train[i])

    if n == 1:
        if pad_right:
            sents = (chain(s, self._rpad) for s in train)
        else:
            sents = train
        fd = FreqDist()
        for s in sents:
            fd.update(s)
        if not estimator_args and not estimator_kwargs:
            self._model = estimator(fd, fd.B())
        else:
            self._model = estimator(fd, fd.B(), *estimator_args, **estimator_kwargs)
        self._N = fd.N()
    else:
        cfd = ConditionalFreqDist()
        self._ngrams = set()

        for sent in train:
            self._N += len(sent) + delta
            for ngram in ingrams(chain(self._lpad, sent, self._rpad), n):
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                cfd[context][token] += 1

        if not estimator_args and not estimator_kwargs:
            self._model = ConditionalProbDist(cfd, estimator, len(cfd))
        else:
            self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kwargs)

    # recursively construct the lower-order models
    if not self.is_unigram_model:
        self._backoff = NgramModel(n - 1, train,
                                   pad_left, pad_right,
                                   estimator,
                                   *estimator_args,
                                   **estimator_kwargs)

        # Code below here in this method, and the _words_following and _alpha method, are from
        # http://www.nltk.org/_modules/nltk/model/ngram.html "Last updated on Feb 26, 2015"
        self._backoff_alphas = dict()
        # For each condition (or context)
        for ctxt in cfd.conditions():
            backoff_ctxt = ctxt[1:]
            backoff_total_pr = 0.0
            total_observed_pr = 0.0

            # this is the subset of words that we OBSERVED following this context,
            # i.e. Count(word | context) > 0
            for word in self._words_following(ctxt, cfd):
                total_observed_pr += self.prob(word, ctxt)
                # we also need the total (n-1)-gram probability of
                # words observed in this n-gram context
                backoff_total_pr += self._backoff.prob(word, backoff_ctxt)

            if isclose(total_observed_pr, 1.0):
                total_observed_pr = 1.0
            else:
                assert 0.0 <= total_observed_pr <= 1.0, \
                    "sum of probs for %s out of bounds: %.10g" % (ctxt, total_observed_pr)

            # beta is the remaining probability weight after we factor out
            # the probability of observed words.
            # As a sanity check, both total_observed_pr and backoff_total_pr
            # must be GE 0, since probabilities are never negative
            beta = 1.0 - total_observed_pr

            if beta != 0.0:
                assert (0.0 <= backoff_total_pr < 1.0), \
                    "sum of backoff probs for %s out of bounds: %s" % (ctxt, backoff_total_pr)
                alpha_ctxt = beta / (1.0 - backoff_total_pr)
            else:
                assert ((0.0 <= backoff_total_pr < 1.0) or
                        isclose(1.0, backoff_total_pr)), \
                    "sum of backoff probs for %s out of bounds: %s" % (ctxt, backoff_total_pr)
                alpha_ctxt = 0.0

            self._backoff_alphas[ctxt] = alpha_ctxt
token_list = TweetTokenizer().tokenize(line)
for token in token_list:
    # Excludes non-word tokens
    if re.search(r'\W', token) is None:
        # Excludes stopwords
        if EXCLUDE_STOPWORDS and (token not in stopwords.words('english')):
            text.append(token)
        elif not EXCLUDE_STOPWORDS:
            text.append(token)

print(text)

# Create frequency distribution
fdist = FreqDist(text)
total_tokens = fdist.N()
unique_tokens = fdist.B()

if EXCLUDE_STOPWORDS:
    print("Stopwords are excluded")
else:
    print("Stopwords are NOT excluded")

# Print distribution properties
print("\nThe number of total tokens:", total_tokens)
print("The number of unique tokens:", unique_tokens)
print("Lexical density:", (unique_tokens + 0.0) / total_tokens)

print("\nThe most common words:")
print("=======================")
for x in fdist.most_common(100):
    w, n = x
allwords.count('Hamlet')

A = set(allwords)
longwords = [w for w in A if len(w) > 12]  # all words longer than 12 characters
print(sorted(longwords))

from nltk.probability import FreqDist, ConditionalFreqDist
"""
FreqDist: build a frequency distribution over the given data
B(): number of distinct words
N(): total number of words
tabulate(20): print the top 20 entries as a table
fd2.plot(20, cumulative=True): the cumulative parameter plots cumulative counts
"""
fd2 = FreqDist([sx.lower() for sx in allwords if sx.isalpha()])
print("Number of distinct words: %d" % fd2.B())
print("Total number of words: %d" % fd2.N())
fd2.tabulate(20)  # print the top 20 entries as a table
fd2.plot(20)
fd2.plot(20, cumulative=True)

"""
freq('the'): relative frequency of the word 'the'
ConditionalFreqDist(): conditional frequency distribution, used to study
systematic differences between categories
"""
from nltk.corpus import inaugural
print(fd2.freq('the'))  # relative frequency of the word 'the'
cfd = ConditionalFreqDist(
    (fileid, len(w))
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    if fileid > '1980' and fileid < '2010')
print(cfd.items())
cfd.plot()
def preprocess(s, output_train=False, output_test=False):
    """
    preprocess takes a string of the form
    '(ham|spam) words words words\n(ham|spam) more words here....'
    and returns a list of processed texts. The class label is removed, all
    words are lowercase, there are no stop words or punctuation, all the
    words are stemmed, and tokens that appear less than 5 times in the
    entire string are removed entirely.

    Return values:
      - list of processed SMS texts
      - nltk.probability.FreqDist of frequency of tokens
      - labels of each SMS text in order
    """
    # Removing punctuation from unicode is tricky.
    # I'm doing this because the word_tokenizer gives us unicode, so we want
    # everything to be unicode.
    # Anyway, use this punctuation table with unicode.translate()
    # https://stackoverflow.com/questions/11066400/remove-punctuation-from-unicode-formatted-strings#11066687
    # punctuation = dict.fromkeys(i for i in xrange(sys.maxunicode)
    #                             if unicodedata.category(unichr(i)).startswith('P'))
    stopwords = set(sw.words('english'))
    punctuation = [i for i in u'{}'.format(string.punctuation)]

    # Step 1: Remove uppercase, and make utf-8 to be sure
    s = s.lower()

    # Step 2: Tokenize!
    # Split everything by line (one line for each text)
    # Tokenize it
    # Skip word 1 (the class -- spam/ham), put it to array
    # Also trim the last item (it's an empty string after the last \n)
    token_texts = [word_tokenize(text) for text in s.split('\n')][:-1]
    labels = [text[0] for text in token_texts]
    texts = [text[1:] for text in token_texts]

    if output_train:
        answer_question(
            'STEP 2.a',
            'Total number of distinct tokens is ' +
            str(FreqDist([word for text in texts for word in text]).B()) + '.')

    # Step 3: Remove stop words
    # Step 4: Remove punctuation
    # Step 5: Stem all the tokens
    # Doing this all in one go for simplicity.
    stemmer = PorterStemmer()
    for i in range(len(texts)):
        texts[i] = [
            stemmer.stem(word) for word in texts[i]
            if word not in stopwords and word not in punctuation
        ]

    if output_train:
        answer_question('STEP 5.a', 'The list is ' + str(texts[10]) + '.')
    if output_test:
        answer_question('STEP 1.a', 'The list is ' + str(texts[23]) + '.')

    # Get freq distribution of the whole set
    freq = FreqDist([word for text in texts for word in text])

    # Step 6: Dump all infrequent tokens
    # Note that len(freq) can give you the number of unique tokens in the data
    texts = [[word for word in text if freq[word] >= 5] for text in texts]
    freq = FreqDist([word for text in texts for word in text])

    if output_train:
        answer_question(
            'STEP 6.a',
            'Total number of distinct tokens is ' + str(freq.B()) + '.')

    # Done!
    return texts, freq, labels
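# answer_question() used above is not defined in this excerpt; a plausible
# minimal stand-in (name and behaviour assumed) so the function can be run:
def answer_question(step, answer):
    print('{}: {}'.format(step, answer))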
#print("Line Bigrams:",list(bigrams(lineWords)), file=log_file) bgs = bgs + list(bigrams(lineWords)) #print("bgs=",bgs, file=log_file) #out = " ".join(s.encode('ascii', 'ignore') for s in tokens) #file_tokenized.write(out+'\n') print('*end of line*', file=log_file) # Write to file for item in bgs: output_file1.write(str(item) + "\n") output_file1.close() # Compute frequency distribution for all the bigrams in the corpus fdist1 = FreqDist(bgs) total_bigrams = fdist1.N() unique_bigrams = fdist1.B() print("Number of total bigrams:", total_bigrams, file=log_file) print("Number of unique bigrams:", unique_bigrams, file=log_file) print("100 most frequent bigrams:", file=log_file) mostFrequentBigramsList = fdist1.most_common(100) print(mostFrequentBigramsList, file=log_file) # Write to file for item in mostFrequentBigramsList: #print(item[1],"\t",item[0],"\n\n") output_file2.write(str(item[1]) + "\t" + str(item[0]) + "\n") output_file2.close() # Compute frequency distribution for all the words (excluding stopwords) in the corpus fdist2 = FreqDist(words) total_words = fdist2.N()
def __init__(self, n, train, k=5, v=None, liveDangerously=False, quiet=False):
    """
    Creates a Katz-thresholded Ngram language model to capture patterns in
    n consecutive words of training text.  Uses the KGoodTuringProbDist to
    estimate the conditional and unigram probabilities, to provide coverage
    of Ngrams not seen during training.

    @param n: the order of the language model (ngram size)
    @type n: C{int}
    @param train: the training text
    @type train: C{list} of C{string}
    @param k: The threshold above which counts are assumed to be reliable.
        Defaults to 5.
    @type k: C{Int}
    @param v: The number of unseens of degree 1.  Defaults to the number of
        types in the training set.
    @type v: C{Int}
    @param liveDangerously: If False, for each model check that the total
        probability mass after all adjustments is close to 1.  Defaults to False.
    @type liveDangerously: C{Boolean}
    @param quiet: Various information will be printed during model
        construction unless this is True.  Defaults to False.
    @type quiet: C{Boolean}
    """
    self._n = n
    self._N = 1 + len(train) - n
    fd = FreqDist(train)
    if v is None:
        v = fd.B()
    print(('v', v))

    if n == 1:
        # Treat this case specially
        self._model = KGoodTuringProbDist(fd, k, v, liveDangerously, ())
        if not quiet:
            print("%s entries for %s tokens at degree 1, %s" %
                  (len(fd), fd.N(), self._model.status))
    else:
        def estimator(fdist, ctxt):
            return KGoodTuringProbDist(fdist, k, v, liveDangerously, ctxt)

        cfd = ConditionalFreqDist()

        for ngram in ingrams(train, n):
            # self._ngrams.add(ngram)
            context = tuple(ngram[:-1])
            token = ngram[-1]
            cfd[context].inc(token)

        self._model = ConditionalProbDist(cfd, estimator, True)
        if not quiet:
            statuses = {'normal': 0, 'bigSkewed': 0, 'weak': 0, LowHacked: 0}
            for ctx in cfd.conditions():
                statuses[self[ctx].status] += 1
            print("%s conditions at degree %s" % (len(cfd.conditions()), n))
            for s in list(statuses.keys()):
                print("  %s %6d" % (s, statuses[s]))

        # recursively construct the lower-order models
        self._backoff = KBNgramModel(n - 1, train, k, v, liveDangerously)
    for _, r in df:
        sentences = tokenize_sentence(r.Review)
        text_preprocessed.append(
            [remove_stopword(tokenize_word(s), stopwords=stopwords) for s in sentences])
        marker_shopId.append(r.ShopID)

    # Observe result
    print(text_preprocessed[:10])

    # Save result
    # In binary, must be read in binary mode
    with open(r'..\data\preprocessed_{}.pickle'.format(title), 'wb') as f:
        pickle.dump((text_preprocessed, marker_shopId), f)

    return text_preprocessed

text_preprocessed = preprocess()

# --Word count (all docs)
# Word frequency distribution by nltk
fdist = FreqDist([i for i in flatten_list(text_preprocessed)])

# Observe result
print('Unique terms:', fdist.B())
print('Total terms:', fdist.N())
sorted(fdist.items(), key=operator.itemgetter(1), reverse=True)  # Top terms

# Save result
with open(r'..\data\fdist.pickle', 'wb') as f:
    pickle.dump(fdist, f)
def load_filter_list(json_file):
    filter_list = []
    with open(json_file, 'r', encoding='utf8') as filter_file:
        for line in filter_file:
            filter_list.append(json.loads(line)['word'])
    return sorted(filter_list)

start = datetime.datetime.now()
assert os.path.isdir(CORPUS_DIR)
newcorpus = PlaintextCorpusReader(CORPUS_DIR, r'.*\.txt')
print('Corpus begins with {}'.format(newcorpus.words()[:10]))

frequencies = FreqDist(newcorpus.words())
print('Samples: %d' % frequencies.N())
print('Words: %d' % frequencies.B())

pattern = '^[a-zěščřžýáýíéóďťňúů]*$' if DIACRITICS else '^[a-z]*$'
candidates = {word: freq for (word, freq) in frequencies.most_common()
              if 6 < len(word) < 9 and re.match(pattern, word)}
print('Candidates: %d' % len(candidates))
print((datetime.datetime.now() - start).total_seconds())

spell = get_spellcheck_candidates()
print('Spellcheck candidates:\nAdjectives %d\nSubstantives %d\nNouns %d' %
      (len(spell['adjectives']), len(spell['substantives']), len(spell['nouns'])))
print((datetime.datetime.now() - start).total_seconds())

substs = [word for word in candidates.keys() if word in spell['substantives']]
substs.sort(key=lambda subst: -candidates[subst])
adjs = [word for word in candidates.keys() if word in spell['adjectives']]
adjs.sort(key=lambda adj: -candidates[adj])
nouns = [word for word in candidates.keys() if word in spell['nouns']]
nouns.sort(key=lambda noun: -candidates[noun])
def plot_words(wordList):
    fDist = FreqDist(wordList)
    # print(fDist.most_common())
    print("Total number of words: ", fDist.N())
    print("Number of distinct words: ", fDist.B())
    fDist.plot(10)
class TextFeatures:

    parts_of_speech = [
        "NN", "NNS", "NNP", "NNPS", "DT", "RB", "IN", "PRP", "CC", "CD",
        "VB", "VBD", "VBN", "VBG", "JJ", "EX", "FW"
    ]

    most_common_words = [
        "the", "of", "and", "to", "a", "in", "for", "is", "on", "that", "by",
        "this", "with", "i", "you", "it", "not", "or", "be", "are", "from",
        "at", "as", "your", "all", "have", "new", "more", "an", "was", "we",
        "will", "home", "can", "us", "about", "if", "page", "my", "has",
        "search", "free"
    ]

    punctuation = [".", ",", "!", "?", ";", ":"]

    # Descending word-count thresholds used to bucket words by commonality;
    # bucket 0 is the most common band, bucket 19 the rarest.
    rarity_thresholds = [
        500000000, 450000000, 400000000, 350000000, 300000000, 250000000,
        200000000, 150000000, 100000000, 80000000, 65000000, 50000000,
        30000000, 10000000, 8000000, 5500000, 3000000, 1000000, 500000
    ]

    def __init__(self, text, session):
        self.session = session
        self.tokens = nltk.word_tokenize(text)
        self.text = text
        self.fdist = FreqDist()
        for token in self.tokens:
            self.fdist.inc(token.lower())
        self.tagged = nltk.pos_tag(self.tokens)
        self.counts = self.__get_word_commonality_counts(self.text.split())
        self.word_lengths = [len(word) for word in self.tokens]
        self.sentences = nltk.sent_tokenize(self.text)
        self.sentence_lengths = [len(sen.split()) for sen in self.sentences]

    def __get_word_commonality_counts(self, words):
        results = [
            self.session.query(WordCount).filter_by(word=w).first()
            for w in words
        ]
        results = [w.count for w in results if w is not None]
        if len(results) == 0:
            return [0]
        return results

    def _word_freq_to_vector(self):
        dist = self.word_freq()
        return [dist.freq(word) for word in TextFeatures.most_common_words]

    def _punctuation_freq_vector(self):
        dist = self.word_freq()
        return [dist.freq(mark) for mark in TextFeatures.punctuation]

    def _word_length_freq_to_vector(self):
        dist = self.word_length_freq()
        return [dist.freq(length) for length in range(1, 12)]

    def _POS_freq_to_vector(self):
        dist = self.POS_freq()
        return [dist.freq(pos) for pos in TextFeatures.parts_of_speech]

    def _POS_cond_freq_to_vector(self):
        dist = self.POS_cond_freq()
        freq_vector = []
        for pos0 in TextFeatures.parts_of_speech:
            for pos1 in TextFeatures.parts_of_speech:
                freq_vector.append(dist[pos0].freq(pos1))
        return freq_vector

    def _word_rarity_freq_to_vector(self):
        dist = self.word_rarity_freq()
        return [dist.freq(i) for i in range(20)]

    def to_vector(self):
        return ([
            self.avg_word_length(),
            self.std_dev_word_length(),
            float(self.max_word_length()),
            float(self.max_sentence_length()),
            float(self.min_sentence_length()),
            self.avg_sentence_length(),
            self.std_sentence_length(),
            float(self.avg_word_commonality()),
            float(self.std_word_commonality()),
            self.unique_word_freq()
        ] + self._word_rarity_freq_to_vector()
          + self._word_freq_to_vector()
          + self._punctuation_freq_vector()
          + self._word_length_freq_to_vector()
          + self._POS_freq_to_vector()
          # + self._POS_cond_freq_to_vector()
        )

    def word_freq(self):
        return self.fdist

    def word_length_freq(self):
        return FreqDist(len(word) for word in self.tokens)

    def POS_freq(self):
        "Returns the frequency distribution of parts of speech"
        pos_dist = FreqDist()
        for pos_pair in self.tagged:
            pos_dist.inc(pos_pair[1])
        return pos_dist

    def POS_cond_freq(self):
        "Returns the conditional frequency distribution of parts of speech"
        cond_dist = ConditionalFreqDist()
        pos = [word_pos[1] for word_pos in self.tagged]
        [cond_dist[pair[0]].inc(pair[1]) for pair in pairwise(pos)]
        return cond_dist

    def word_rarity_freq(self):
        "Returns the frequency distribution of groups of word rarities"
        rarity_dist = FreqDist()
        for common in self.counts:
            # Assign the word to the first band whose threshold it exceeds;
            # anything rarer than the last threshold goes into bucket 19.
            for bucket, threshold in enumerate(TextFeatures.rarity_thresholds):
                if common > threshold:
                    rarity_dist.inc(bucket)
                    break
            else:
                rarity_dist.inc(19)
        return rarity_dist

    def avg_word_length(self):
        return numpy.average(self.word_lengths)

    def std_dev_word_length(self):
        return numpy.std(self.word_lengths)

    def max_word_length(self):
        return max(self.word_lengths)

    def unique_word_freq(self):
        return float(self.fdist.B()) / self.fdist.N()

    def max_sentence_length(self):
        return max(self.sentence_lengths)

    def min_sentence_length(self):
        return min(self.sentence_lengths)

    def avg_sentence_length(self):
        return numpy.average(self.sentence_lengths)

    def std_sentence_length(self):
        return numpy.std(self.sentence_lengths)

    def avg_word_commonality(self):
        return numpy.average(self.counts)

    def std_word_commonality(self):
        return numpy.std(self.counts)
def frequency_analysis(mode, tokens):
    """
    Performs simple frequency analysis with options for minimum word length,
    number of words and parts-of-speech to be included.
    """
    # Variables for word selection
    num_tokens = 100
    min_token_length = 3
    max_token_length = 16
    all_pos_tags_included = True
    pos_tags_included = {
        'untagged words': True,
        'nouns': True,
        'pronouns': True,
        'verbs': True,
        'adverbs': True,
        'adjectives': True,
        'prepositions': True,
        'miscellaneous words': True
    }

    committed = False
    while not committed:
        # Determining words to use in new FreqDist
        new_tokens = []
        working_tokens = []

        # Only choose words with POS tags
        # matching the classes in pos_tags_included:
        if not all_pos_tags_included:
            for w in tokens:
                word_included = False
                for tag in pos_tags_included:
                    if pos_tags_included[tag]:
                        for t in POS_TAGS[tag]:
                            if w[1] == t:
                                # print(w[0] + ' matches ' + str(w[1]))
                                working_tokens.append(w[0])
                                word_included = True
                if not word_included:
                    if pos_tags_included['miscellaneous words']:
                        working_tokens.append(w[0])
        else:
            working_tokens = tokens.copy()

        print(working_tokens)

        for w in working_tokens:
            if mode == TokenisationMode.CHUNKS:
                token = w
            elif mode == TokenisationMode.NGRAMS:
                token = w
            elif mode == TokenisationMode.WORDS:
                token = w[0]
            if min_token_length <= len(token) <= max_token_length:
                new_tokens.append(token)

        fdist = FreqDist(new_tokens)

        prelim_results = ['\nFrequency analysis has found ']
        prelim_results.append(str(fdist.B()))
        prelim_results.append(' unique tokens of potential interest\n')
        prelim_results.append('out of a total of ')
        prelim_results.append(str(fdist.N()))
        prelim_results.append('.')
        print(''.join(prelim_results))

        intro = ['The ']
        intro.append(str(num_tokens))
        intro.append(' most frequent tokens are currently selected.\n')
        intro.append('Selected words are currently between ')
        intro.append(str(min_token_length))
        intro.append(' and ')
        intro.append(str(max_token_length))
        intro.append(' characters in length.\n')
        if not mode == TokenisationMode.NGRAMS:
            if all_pos_tags_included:
                intro.append('All parts-of-speech are included in the selection.\n')
            else:
                intro.append('Parts-of-speech included in the selection:\n')
                for tag in pos_tags_included:
                    if pos_tags_included[tag]:
                        intro.append(tag)
                        intro.append(', ')
                intro[len(intro) - 1] = '\n'  # replace trailing comma with linebreak
                intro.append('Parts-of-speech excluded from the selection:\n')
                for tag in pos_tags_included:
                    if not pos_tags_included[tag]:
                        intro.append(tag)
                        intro.append(', ')
                intro[len(intro) - 1] = '\n'  # replace trailing comma with linebreak
        intro.append('Below are the selected words, most frequent first:\n')
        print(''.join(intro))

        selected_tokens = fdist.most_common(num_tokens)
        word_string = []
        charcount = 0
        print(selected_tokens)
        for w in selected_tokens:
            if mode == TokenisationMode.CHUNKS:
                token = w[0]
            elif mode == TokenisationMode.NGRAMS:
                token = w[0]
            elif mode == TokenisationMode.WORDS:
                token = w[0]
            charcount += len(token) + 2
            if charcount > 80:
                word_string.append('\n')
                charcount = 0
            word_string.append(token)
            word_string.append(', ')
        word_string[len(word_string) - 1] = '\n'  # strip comma, add linebreak
        print(''.join(word_string))

        chosen = False
        while not chosen:
            print('Enter M below to change the minimum token length.')
            print('Enter X below to change the maximum token length.')
            print('Enter N to change the total number of tokens selected.')
            if not mode == TokenisationMode.NGRAMS:
                print('Enter P to restrict selection with PoS tagging.')
            print('Enter A to accept the current list of tokens and continue.')
            user_input = input()
            if user_input.lower() == 'n':
                num_tokens = int_input_prompt(
                    '\nHow many words do you want selected?\n')
                chosen = True
            elif user_input.lower() == 'm':
                validated = False
                while not validated:
                    min_token_length = int_input_prompt(
                        '\nEnter a new minimum word length...\n')
                    if min_token_length > max_token_length:
                        print("Minimum word length can't exceed maximum!")
                    else:
                        validated = True
                chosen = True
            elif user_input.lower() == 'x':
                validated = False
                while not validated:
                    max_token_length = int_input_prompt(
                        '\nEnter a new maximum word length...\n')
                    if min_token_length > max_token_length:
                        print("Maximum word length cannot be less than "
                              "minimum!")
                    else:
                        validated = True
                chosen = True
            elif user_input.lower() == 'p':
                if not mode == TokenisationMode.NGRAMS:
                    nothing_selected = True
                    while nothing_selected:
                        for tag in pos_tags_included:
                            print('Do you want to include ' + tag + '?')
                            pos_tag_chosen = yes_no_input_prompt()
                            pos_tags_included[tag] = pos_tag_chosen
                            if pos_tag_chosen:
                                chosen = True
                                nothing_selected = False
                            else:
                                all_pos_tags_included = False
                        if nothing_selected:
                            print('Error: you must include at least '
                                  'one class of POS tags.')
                            print('Restarting selection...\n')
                else:
                    print('PoS tagging not applicable to ngrams.')
                    print('Doing nothing...')
            elif user_input.lower() == 'a':
                chosen = True
                committed = True
            else:
                print('Input not recognised')

    words_string = ', '.join([w[0][0] for w in selected_tokens])
    string_to_text_file(sys.argv[3], words_string)
    print('Words/phrases successfully saved to ' + sys.argv[3])
    quit()
# In[50]:

l = [s for ls in l for s in ls if s != '']

# In[81]:

fd['<unk>'] = 1

# In[82]:

b = fd.B()
for k in fd.keys():
    fd[k] = (fd[k] + 1) / (n + b)

# In[83]:

fd

# In[79]:

def generate_unigram_model(corpus, vocab):
    fd = FreqDist(corpus)
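# The final cell above is truncated. For reference only, here is a minimal,
# self-contained sketch of the add-one (Laplace) smoothing applied in cells
# In[81]-In[82]; the function name and '<unk>' handling are assumptions, not
# the notebook's actual code.
from nltk.probability import FreqDist

def unigram_probabilities(corpus):
    fd = FreqDist(corpus)
    fd['<unk>'] = 1            # reserve a bin for unseen words
    n, b = fd.N(), fd.B()      # total tokens and number of bins
    # p(w) = (count(w) + 1) / (N + B); the smoothed values sum to 1 over the bins
    return {w: (fd[w] + 1) / (n + b) for w in fd}

probs = unigram_probabilities(['a', 'b', 'a'])   # p('a') == 3/7 here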