def text_to_dict(docs, metric):
    """ Create dictionaries of term frequencies based on documents

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist()  # Distribution over how many documents each word appears in
    tf_dists = []  # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc:
            fd.inc(word)
        doc_freqs.update(fd.samples())
        tf_dists.append(fd)

    num_docs = len(docs)

    # Build dictionaries
    dicts = []
    for i, fd in enumerate(tf_dists):
        if i % 100 == 0:
            print ' dict', str(i) + '/' + str(len(tf_dists))
        d = {}
        if metric == FrequencyMetrics.TF:
            for word in fd.samples():
                d[word] = fd.freq(word)
        elif metric == FrequencyMetrics.TF_IDF:
            for word in fd.samples():
                d[word] = fd.freq(word) * math.log(float(num_docs) / doc_freqs[word])
        else:
            raise ValueError("No such frequency metric: %s" % metric)
        dicts.append(d)
    return dicts
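# The function above uses the pre-NLTK-3 FreqDist API (fd.inc(), fd.samples()).
# Below is a minimal, self-contained sketch of the same TF-IDF computation with
# the current FreqDist interface; plain whitespace tokenisation stands in for
# the preprocess module, which is not shown in this snippet.

import math
from nltk.probability import FreqDist

def tfidf_dicts_sketch(docs):
    doc_freqs = FreqDist()           # number of documents each word appears in
    tf_dists = []                    # one term-frequency distribution per document
    for doc in docs:
        fd = FreqDist(doc.split())   # fd[word] += 1 replaces the old fd.inc(word)
        doc_freqs.update(fd.keys())  # fd.keys() replaces the old fd.samples()
        tf_dists.append(fd)
    num_docs = len(docs)
    return [{word: fd.freq(word) * math.log(float(num_docs) / doc_freqs[word])
             for word in fd}
            for fd in tf_dists]

# Example: tfidf_dicts_sketch(["the cat sat on the mat", "the dog barked"])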
def text_to_vector(docs, metric):
    """ Create frequency based feature-vector from text

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist()  # Distribution over how many documents each word appears in
    tf_dists = []  # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc:
            fd.inc(word)
        doc_freqs.update(fd.samples())
        tf_dists.append(fd)

    all_tokens = doc_freqs.keys()
    num_docs = len(docs)
    num_features = len(all_tokens)

    # Build feature x document matrix
    matrix = np.zeros((num_features, num_docs))
    for i, fd in enumerate(tf_dists):
        if metric == FrequencyMetrics.TF:
            v = [fd.freq(word) for word in all_tokens]
        elif metric == FrequencyMetrics.TF_IDF:
            v = [fd.freq(word) * math.log(float(num_docs) / doc_freqs[word])
                 for word in all_tokens]
        else:
            raise ValueError("No such frequency metric: %s" % metric)
        matrix[:, i] = v
    return matrix
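# The same feature-by-document matrix against the current FreqDist API and
# NumPy, again assuming whitespace tokenisation in place of the preprocess
# module; only the TF-IDF branch is sketched here.

import math
import numpy as np
from nltk.probability import FreqDist

def tfidf_matrix_sketch(docs):
    tf_dists = [FreqDist(doc.split()) for doc in docs]
    doc_freqs = FreqDist()
    for fd in tf_dists:
        doc_freqs.update(fd.keys())    # document frequency of each word
    all_tokens = sorted(doc_freqs)     # fixed feature ordering
    matrix = np.zeros((len(all_tokens), len(docs)))
    for i, fd in enumerate(tf_dists):
        matrix[:, i] = [fd.freq(w) * math.log(float(len(docs)) / doc_freqs[w])
                        for w in all_tokens]
    return all_tokens, matrix

# Note: scikit-learn's TfidfVectorizer computes a related but not identical
# weighting (smoothed IDF and l2 normalisation), so its output will differ.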
# Shannon entropy (in bits) of the word distribution in the given file
def calculaEntropia(documento):
    freq_dist = FreqDist()
    corpus = Token(TEXT=open(documento).read())
    WhitespaceTokenizer().tokenize(corpus)
    for token in corpus['SUBTOKENS']:
        freq_dist.inc(token['TEXT'])
    entropia = 0
    for i in freq_dist.samples():
        entropia = entropia + (freq_dist.freq(i) * log(freq_dist.freq(i), 2))
    return -entropia
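# calculaEntropia relies on the old Token/Tokenizer interface that later NLTK
# releases no longer provide. A minimal equivalent of the same Shannon entropy
# computation, assuming whitespace tokenisation as above:

from math import log
from nltk.probability import FreqDist

def shannon_entropy(path):
    freq_dist = FreqDist(open(path).read().split())
    return -sum(freq_dist.freq(w) * log(freq_dist.freq(w), 2) for w in freq_dist)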
def getFeatures(self, corpus):
    stemmer = PorterStemmer()
    stems = FreqDist()
    onlyLettersNumbers = re.compile('[^a-zA-Z0-9%!]')
    corpus = onlyLettersNumbers.sub(' ', corpus.lower())
    corpus = TreebankWordTokenizer().tokenize(corpus)
    count = 0
    for word in corpus:
        if not stopwords.STOP_WORDS.get(word) and len(word.strip()) > 1:
            stems.inc(stemmer.stem_word(word))
            count += 1
            if self.__maxFeatures > 0 and count >= self.__maxFeatures:
                break
    features = stems.samples()
    return features
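# getFeatures depends on a project-specific stopwords.STOP_WORDS dict and on the
# old stemmer.stem_word()/FreqDist.samples() calls. A rough stand-alone
# equivalent with current NLTK, using NLTK's own English stopword list instead
# (requires the 'stopwords' corpus to be downloaded):

import re
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.tokenize import TreebankWordTokenizer

def get_features_sketch(corpus, max_features=0):
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    stems = FreqDist()
    corpus = re.sub('[^a-zA-Z0-9%!]', ' ', corpus.lower())
    count = 0
    for word in TreebankWordTokenizer().tokenize(corpus):
        if word not in stop_words and len(word.strip()) > 1:
            stems[stemmer.stem(word)] += 1   # replaces stems.inc(stemmer.stem_word(word))
            count += 1
            if 0 < max_features <= count:
                break
    return list(stems)                       # replaces stems.samples()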
    for tag in sorted(set(tags_found.keys()) | set(tags_actual.keys())):
        found = tags_found[tag]
        actual = tags_actual[tag]
        precision = nltk.metrics.precision(tag_word_refs[tag], tag_word_test[tag])
        recall = nltk.metrics.recall(tag_word_refs[tag], tag_word_test[tag])
        print ' '.join([tag.ljust(taglen), str(found).rjust(9), str(actual).rjust(10),
                        str(precision).ljust(13)[:13], str(recall).ljust(13)[:13]])

    print ' '.join(['='*taglen, '='*9, '='*10, '='*13, '='*13])
else:
    sents = corpus.sents(**kwargs)
    taglen = 7

    if args.fraction != 1.0:
        cutoff = int(math.ceil(len(sents) * args.fraction))
        sents = sents[:cutoff]

    for sent in sents:
        for word, tag in tagger.tag(sent):
            tags_found.inc(tag)

            if len(tag) > taglen:
                taglen = len(tag)

    print ' '.join(['Tag'.center(taglen), 'Count'.center(9)])
    print ' '.join(['='*taglen, '='*9])

    for tag in sorted(tags_found.samples()):
        print ' '.join([tag.ljust(taglen), str(tags_found[tag]).rjust(9)])

    print ' '.join(['='*taglen, '='*9])
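# nltk.metrics.precision() and nltk.metrics.recall() take a reference set and a
# test set, so tag_word_refs[tag] and tag_word_test[tag] above are presumably
# sets of hashable items. A small self-contained illustration:

import nltk.metrics

reference = set(['dog', 'cat', 'bird'])
test = set(['dog', 'cat', 'fish'])
nltk.metrics.precision(reference, test)  # 2/3: two of the three guesses are correct
nltk.metrics.recall(reference, test)     # 2/3: two of the three reference items were found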
print corpus
for token in corpus['SUBTOKENS']:
    freq_dist.inc(token['TEXT'])

# How many times did "the" occur?
freq_dist.count('the')
# What was the frequency of the word "the"?
freq_dist.freq('the')
# How many word tokens were counted?
freq_dist.N()
# What word types were encountered?
freq_dist.samples()
# What was the most common word?
freq_dist.max()

# What is the distribution of word lengths in a corpus?
freq_dist = FreqDist()
for token in corpus['SUBTOKENS']:
    freq_dist.inc(len(token['TEXT']))

# Plot the results.
wordlens = freq_dist.samples()
# Sort the list
wordlens.sort()
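# The same queries against a current FreqDist, where the old Token/SUBTOKENS
# corpus representation is replaced by a plain list of tokens:

from nltk.probability import FreqDist

freq_dist = FreqDist("the cat sat on the mat".split())
freq_dist['the']         # raw count, replaces freq_dist.count('the')
freq_dist.freq('the')    # relative frequency
freq_dist.N()            # total number of tokens counted
list(freq_dist)          # word types encountered, replaces freq_dist.samples()
freq_dist.max()          # most common word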
cutoff = int(math.ceil(len(chunked_sents) * args.fraction))
chunked_sents = chunked_sents[:cutoff]

print chunker.evaluate(chunked_sents), '\n'

if args.trace:
    print 'analyzing chunker coverage of %s with %s\n' % (args.corpus, chunker.__class__.__name__)

iobs_found = FreqDist()
sents = corpus.sents()

if args.fraction != 1.0:
    cutoff = int(math.ceil(len(sents) * args.fraction))
    sents = sents[:cutoff]

for sent in sents:
    tree = chunker.parse(tagger.tag(sent))

    for child in tree.subtrees(lambda t: t.node != 'S'):
        iobs_found.inc(child.node)

iobs = iobs_found.samples()
justify = max(7, *[len(iob) for iob in iobs])

print 'IOB'.center(justify) + ' Found '
print '='*justify + ' ========='

for iob in sorted(iobs):
    print ' '.join([iob.ljust(justify), str(iobs_found[iob]).rjust(9)])

print '='*justify + ' ========='
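# NLTK 3 replaced the Tree attribute .node with the .label() method, so on
# current releases the subtree filter above would read t.label() != 'S'.
# A small self-contained illustration:

from nltk.tree import Tree
from nltk.probability import FreqDist

tree = Tree.fromstring("(S (NP the dog) (VP barked))")
iobs_found = FreqDist()
for child in tree.subtrees(lambda t: t.label() != 'S'):
    iobs_found[child.label()] += 1
# iobs_found now holds {'NP': 1, 'VP': 1}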
from nltk.token import *
from nltk.tokenizer import WhitespaceTokenizer
from nltk.probability import FreqDist
from nltk.draw.plot import Plot

freq_dist = FreqDist()
corpus = Token(TEXT=open('dados/may2001_pdf.torto').read())
print corpus
WhitespaceTokenizer().tokenize(corpus)
print corpus
for token in corpus['SUBTOKENS']:
    freq_dist.inc(token['TEXT'])

# How many times does the word "form" appear in the corpus?
freq_dist.count('form')
# What is the frequency of the word "form"?
freq_dist.freq('form')
# How many word tokens were counted?
freq_dist.N()
# What word types were encountered?
freq_dist.samples()
# What is the most common word?
freq_dist.max()
def test_add_to_freq_dist(self):
    fd = FreqDist()
    fd = coveyquery.add_words_to_freq_dist(fd, "hi how are you doing_today")
    assert_equals(len(fd.samples()), 5)
    assert_equals(fd.keys(), ['doing_today', 'how', 'you', 'hi', 'are'])
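# coveyquery.add_words_to_freq_dist itself is not shown here; an implementation
# consistent with this test would be a whitespace split followed by counting,
# as in the sketch below. The second assertion also depends on the key ordering
# of FreqDist in older NLTK releases, so it is fragile on current versions.

def add_words_to_freq_dist(fd, text):
    for word in text.split():
        fd.inc(word)   # fd[word] += 1 on NLTK 3
    return fd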
# An example of Zipf's law
from nltk.token import *
from nltk.tokenizer import WhitespaceTokenizer
from nltk.probability import FreqDist
from nltk.draw.plot import Plot

freq_dist = FreqDist()
corpus = Token(TEXT=open('dados/may2001_pdf.torto').read())
WhitespaceTokenizer().tokenize(corpus)
for token in corpus['SUBTOKENS']:
    freq_dist.inc(token['TEXT'])

wordcount = freq_dist.samples()
#points = [(freq_dist.freq(l),l) for l in wordcount]
#points.sort()
x = 0
points = list(wordcount)
for l in wordcount:
    points[x] = (freq_dist.count(l), x)
    x = x + 1
points.sort()
print points
Plot(points)
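# The nltk.token and nltk.draw.plot modules used above belong to a very old
# NLTK release and are no longer shipped. A present-day version of the same
# Zipf picture can be drawn with matplotlib and FreqDist.most_common(),
# plotting frequency against rank:

import matplotlib.pyplot as plt
from nltk.probability import FreqDist

freq_dist = FreqDist(open('dados/may2001_pdf.torto').read().split())
counts = [count for word, count in freq_dist.most_common()]
plt.plot(range(1, len(counts) + 1), counts)  # rank on x, frequency on y
plt.xlabel('rank')
plt.ylabel('frequency')
plt.show()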