Example #1
def text_to_dict(docs, metric):
    """ Create dictionaries of term frequencies based on documents

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist()  # Distribution over how many documents each word appears in.
    tf_dists = []  # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc:
            fd.inc(word)
        doc_freqs.update(fd.samples())
        tf_dists.append(fd)

    num_docs = len(docs)
    # Build dictionaries
    dicts = []
    for i, fd in enumerate(tf_dists):
        if i % 100 == 0: print '    dict', str(i) + '/' + str(len(tf_dists))
        d = {}
        if metric == FrequencyMetrics.TF:
            for word in fd.samples():
                d[word] = fd.freq(word)
        elif metric == FrequencyMetrics.TF_IDF:
            for word in fd.samples():
                d[word] = fd.freq(word) * math.log(
                    float(num_docs) / doc_freqs[word])
        else:
            raise ValueError("No such metric: %s" % metric)
        dicts.append(d)
    return dicts
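
The TF-IDF weight used above is the within-document relative frequency times log(num_docs / doc_freq), i.e. fd.freq(word) * log(N / df(word)). A minimal self-contained sketch of the same weighting using only the standard library; sample_docs and the whitespace tokenization are hypothetical stand-ins for the project's preprocess.preprocess_text:

import math
from collections import Counter

sample_docs = ["the cat sat", "the dog sat", "the dog barked"]  # hypothetical input
tokenized = [doc.split() for doc in sample_docs]                # stand-in for preprocess_text

doc_freqs = Counter()                # in how many documents each word appears
for tokens in tokenized:
    doc_freqs.update(set(tokens))

num_docs = len(tokenized)
dicts = []
for tokens in tokenized:
    tf = Counter(tokens)
    total = float(len(tokens))
    dicts.append({w: (tf[w] / total) * math.log(num_docs / float(doc_freqs[w]))
                  for w in tf})

Words that occur in every document get weight 0, since log(N / N) = 0.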
Example #2
def text_to_dict(docs, metric):
    """ Create dictionaries of term frequencies based on documents

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist() # Distribution over how many documents each word appears in.
    tf_dists = [] # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc: fd.inc(word)
        doc_freqs.update(fd.samples())
        tf_dists.append(fd)


    num_docs = len(docs)
    # Build dictionaries
    dicts = []
    for i, fd in enumerate(tf_dists):
        if i % 100 == 0: print '    dict', str(i) + '/' + str(len(tf_dists))
        d = {}
        if metric == FrequencyMetrics.TF:
            for word in fd.samples():
                d[word] = fd.freq(word)
        elif metric == FrequencyMetrics.TF_IDF:
            for word in fd.samples():
                d[word] = fd.freq(word) * math.log(float(num_docs) / doc_freqs[word])
        else:
            raise ValueError("No such metric: %s" % metric)
        dicts.append(d)
    return dicts
Example #3
def text_to_vector(docs, metric):
    """ Create frequency based feature-vector from text

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist() # Distribution over how many documents each word appears in.
    tf_dists = [] # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc: fd.inc(word)
        doc_freqs.update(fd.samples())
        tf_dists.append(fd)


    all_tokens = doc_freqs.keys()
    num_docs = len(docs)
    num_features = len(all_tokens)


    # Build feature x document matrix
    matrix = np.zeros((num_features, num_docs))
    for i, fd in enumerate(tf_dists):
        if metric == FrequencyMetrics.TF:
            v = [fd.freq(word) for word in all_tokens]
        elif metric == FrequencyMetrics.TF_IDF:
            v = [fd.freq(word) * math.log(float(num_docs) / doc_freqs[word]) for word in all_tokens]
        else:
            raise ValueError("No such metric: %s" % metric)
        matrix[:,i] = v

    return matrix
Example #4
def calculaEntropia(documento):
    freq_dist = FreqDist()
    corpus = Token(TEXT=open(documento).read())
    WhitespaceTokenizer().tokenize(corpus)
    for token in corpus['SUBTOKENS']:
        freq_dist.inc(token['TEXT'])
    entropia = 0
    for i in freq_dist.samples():
        entropia = entropia + (freq_dist.freq(i) * log(freq_dist.freq(i), 2))
    return -entropia
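
calculaEntropia computes the Shannon entropy of the file's token distribution, H = -sum over w of p(w) * log2 p(w), where p(w) is the relative frequency returned by freq_dist.freq(w). A self-contained sketch of the same calculation without the old Token/WhitespaceTokenizer API; the entropy() name and the whitespace splitting are assumptions:

import math
from collections import Counter

def entropy(path):
    """Shannon entropy (bits) of the whitespace-token distribution in a file."""
    tokens = open(path).read().split()
    counts = Counter(tokens)
    total = float(len(tokens))
    return -sum((c / total) * math.log(c / total, 2) for c in counts.values())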
Example #5
 def getFeatures(self, corpus):
     stemmer = PorterStemmer()
     stems = FreqDist()
     onlyLettersNumbers = re.compile('[^a-zA-Z0-9%!]')
     corpus = onlyLettersNumbers.sub(' ', corpus.lower())
     corpus = TreebankWordTokenizer().tokenize(corpus)
     
     count = 0
     for word in corpus :
         if not stopwords.STOP_WORDS.get(word) and len(word.strip()) > 1 :
             stems.inc(stemmer.stem_word(word))
             count += 1
             if self.__maxFeatures > 0 and count >= self.__maxFeatures :
                 break
             
     features = stems.samples()
     
     return features
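
getFeatures lower-cases the corpus, keeps only letters, digits, % and !, tokenizes with the Treebank tokenizer, skips stop words and one-character tokens, and returns the Porter stems seen, capped by self.__maxFeatures. A rough standalone equivalent against current NLTK is sketched below; stopwords.words('english') and PorterStemmer().stem() replace the project's custom stopwords module and the old stem_word(), so treat both as assumptions:

import re
from nltk.corpus import stopwords               # requires the 'stopwords' corpus data
from nltk.stem import PorterStemmer
from nltk.tokenize import TreebankWordTokenizer

def get_features(corpus, max_features=0):
    stemmer = PorterStemmer()
    stop = set(stopwords.words('english'))
    corpus = re.sub('[^a-zA-Z0-9%!]', ' ', corpus.lower())
    stems = []
    for word in TreebankWordTokenizer().tokenize(corpus):
        if word not in stop and len(word.strip()) > 1:
            stems.append(stemmer.stem(word))    # stem() replaces the old stem_word()
            if max_features > 0 and len(stems) >= max_features:
                break
    return sorted(set(stems))                   # the distinct stems, analogous to FreqDist.samples()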
Example #6
def text_to_vector(docs, metric):
    """ Create frequency based feature-vector from text

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist()  # Distribution over how many documents each word appears in.
    tf_dists = []  # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc:
            fd.inc(word)
        doc_freqs.update(fd.samples())
        tf_dists.append(fd)

    all_tokens = doc_freqs.keys()
    num_docs = len(docs)
    num_features = len(all_tokens)

    # Build feature x document matrix
    matrix = np.zeros((num_features, num_docs))
    for i, fd in enumerate(tf_dists):
        if metric == FrequencyMetrics.TF:
            v = [fd.freq(word) for word in all_tokens]
        elif metric == FrequencyMetrics.TF_IDF:
            v = [
                fd.freq(word) * math.log(float(num_docs) / doc_freqs[word])
                for word in all_tokens
            ]
        else:
            raise ValueError("No such metric: %s" % metric)
        matrix[:, i] = v

    return matrix
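
The matrix returned here has one row per vocabulary word and one column per document, so matrix[:, i] is document i's TF or TF-IDF vector. A compact self-contained sketch of the same feature-by-document construction with Counter and numpy; sample_docs and the whitespace splitting are hypothetical stand-ins for the project's inputs:

import math
from collections import Counter
import numpy as np

sample_docs = [doc.split() for doc in ["the cat sat", "the dog sat", "the dog barked"]]
doc_freqs = Counter()
for tokens in sample_docs:
    doc_freqs.update(set(tokens))

vocab = sorted(doc_freqs)                  # fixed feature order
num_docs = len(sample_docs)
matrix = np.zeros((len(vocab), num_docs))  # feature x document
for i, tokens in enumerate(sample_docs):
    tf = Counter(tokens)
    total = float(len(tokens))
    matrix[:, i] = [(tf[w] / total) * math.log(num_docs / float(doc_freqs[w]))
                    for w in vocab]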
Example #7
	for tag in sorted(set(tags_found.keys()) | set(tags_actual.keys())):
		found = tags_found[tag]
		actual = tags_actual[tag]
		precision = nltk.metrics.precision(tag_word_refs[tag], tag_word_test[tag])
		recall = nltk.metrics.recall(tag_word_refs[tag], tag_word_test[tag])
		print '  '.join([tag.ljust(taglen), str(found).rjust(9), str(actual).rjust(10),
			str(precision).ljust(13)[:13], str(recall).ljust(13)[:13]])
	
	print '  '.join(['='*taglen, '='*9, '='*10, '='*13, '='*13])
else:
	sents = corpus.sents(**kwargs)
	taglen = 7
	
	if args.fraction != 1.0:
		cutoff = int(math.ceil(len(sents) * args.fraction))
		sents = sents[:cutoff]
	
	for sent in sents:
		for word, tag in tagger.tag(sent):
			tags_found.inc(tag)
			
			if len(tag) > taglen:
				taglen = len(tag)
	
	print '  '.join(['Tag'.center(taglen), 'Count'.center(9)])
	print '  '.join(['='*taglen, '='*9])
	
	for tag in sorted(tags_found.samples()):
		print '  '.join([tag.ljust(taglen), str(tags_found[tag]).rjust(9)])
	
	print '  '.join(['='*taglen, '='*9])
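
The nltk.metrics.precision and nltk.metrics.recall calls above are set-based: precision is the fraction of the test set that also appears in the reference set, recall the fraction of the reference set that was recovered. A tiny sketch with hypothetical word sets standing in for tag_word_refs[tag] and tag_word_test[tag]:

from nltk.metrics import precision, recall

reference = set(['dog', 'cat', 'bird'])   # words that truly carry the tag
test = set(['dog', 'cat', 'fish'])        # words the tagger assigned the tag to

print(precision(reference, test))         # 2/3 of the tagged words were correct
print(recall(reference, test))            # 2/3 of the reference words were found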
Example #8
        print '  '.join([
            tag.ljust(taglen),
            str(found).rjust(9),
            str(actual).rjust(10),
            str(precision).ljust(13)[:13],
            str(recall).ljust(13)[:13]
        ])

    print '  '.join(['=' * taglen, '=' * 9, '=' * 10, '=' * 13, '=' * 13])
else:
    sents = corpus.sents(**kwargs)
    taglen = 7

    if args.fraction != 1.0:
        cutoff = int(math.ceil(len(sents) * args.fraction))
        sents = sents[:cutoff]

    for sent in sents:
        for word, tag in tagger.tag(sent):
            tags_found.inc(tag)

            if len(tag) > taglen:
                taglen = len(tag)

    print '  '.join(['Tag'.center(taglen), 'Count'.center(9)])
    print '  '.join(['=' * taglen, '=' * 9])

    for tag in sorted(tags_found.samples()):
        print '  '.join([tag.ljust(taglen), str(tags_found[tag]).rjust(9)])

    print '  '.join(['=' * taglen, '=' * 9])
Example #9
print corpus

for token in corpus['SUBTOKENS']:
	freq_dist.inc(token['TEXT'])

# How many times did "the" occur?
freq_dist.count('the')

# What was the frequency of the word "the"?
freq_dist.freq('the')

# How many word tokens were counted?
freq_dist.N()

# What word types were encountered?
freq_dist.samples()

# What was the most common word?
freq_dist.max()

# What is the distribution of word lengths in a corpus?
freq_dist = FreqDist()
for token in corpus['SUBTOKENS']:
	freq_dist.inc(len(token['TEXT']))

# Plot the results.
wordlens = freq_dist.samples()

# Sort the list
wordlens.sort()
Example #10
		cutoff = int(math.ceil(len(chunked_sents) * args.fraction))
		chunked_sents = chunked_sents[:cutoff]
	
	print chunker.evaluate(chunked_sents), '\n'

if args.trace:
	print 'analyzing chunker coverage of %s with %s\n' % (args.corpus, chunker.__class__.__name__)

iobs_found = FreqDist()
sents = corpus.sents()

if args.fraction != 1.0:
	cutoff = int(math.ceil(len(sents) * args.fraction))
	sents = sents[:cutoff]

for sent in sents:
	tree = chunker.parse(tagger.tag(sent))
	
	for child in tree.subtrees(lambda t: t.node != 'S'):
		iobs_found.inc(child.node)

iobs = iobs_found.samples()
justify = max(7, *[len(iob) for iob in iobs])

print 'IOB'.center(justify) + '    Found  '
print '='*justify + '  ========='

for iob in sorted(iobs):
	print '  '.join([iob.ljust(justify), str(iobs_found[iob]).rjust(9)])

print '='*justify + '  ========='
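
This fragment counts chunk labels with the old Tree.node attribute and FreqDist.inc; in NLTK 3 those became Tree.label() and Counter-style item assignment. A self-contained sketch with a hand-built tree standing in for chunker.parse(tagger.tag(sent)):

from nltk import FreqDist, Tree

# Hand-built chunk tree standing in for chunker.parse(tagger.tag(sent)).
tree = Tree('S', [Tree('NP', [('the', 'DT'), ('dog', 'NN')]), ('barks', 'VBZ')])

iobs_found = FreqDist()
for child in tree.subtrees(lambda t: t.label() != 'S'):
    iobs_found[child.label()] += 1        # label() replaces the old t.node
print(iobs_found.most_common())           # [('NP', 1)]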
Example #11
from nltk.token import *
from nltk.tokenizer import WhitespaceTokenizer
from nltk.probability import FreqDist
from nltk.draw.plot import Plot

freq_dist = FreqDist()
corpus = Token(TEXT=open('dados/may2001_pdf.torto').read())
print corpus
WhitespaceTokenizer().tokenize(corpus)
print corpus

for token in corpus['SUBTOKENS']:
	freq_dist.inc(token['TEXT'])

# How many times does the word "form" appear in the corpus?
freq_dist.count('form')
# What is the frequency of the word "form"?
freq_dist.freq('form')
# How many word tokens were counted?
freq_dist.N()
# What word types were encountered?
freq_dist.samples()
# What is the most common word?
freq_dist.max()
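
The Token / corpus['SUBTOKENS'] / inc() API above comes from very old NLTK; under NLTK 3 the same queries look roughly like this (same file path assumed):

from nltk import FreqDist
from nltk.tokenize import WhitespaceTokenizer

text = open('dados/may2001_pdf.torto').read()
freq_dist = FreqDist(WhitespaceTokenizer().tokenize(text))

freq_dist['form']        # how many times 'form' occurs (old freq_dist.count)
freq_dist.freq('form')   # relative frequency of 'form'
freq_dist.N()            # total number of tokens counted
list(freq_dist.keys())   # the word types seen (old freq_dist.samples)
freq_dist.max()          # the most common word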
Example #12
 def test_add_to_freq_dist(self):
     fd = FreqDist()
     fd = coveyquery.add_words_to_freq_dist(fd, "hi how are you doing_today")
     assert_equals(len(fd.samples()), 5)
     assert_equals(fd.keys(), ['doing_today', 'how', 'you', 'hi', 'are'])
Example #13
        chunked_sents = chunked_sents[:cutoff]

    print chunker.evaluate(chunked_sents), '\n'

if args.trace:
    print 'analyzing chunker coverage of %s with %s\n' % (
        args.corpus, chunker.__class__.__name__)

iobs_found = FreqDist()
sents = corpus.sents()

if args.fraction != 1.0:
    cutoff = int(math.ceil(len(sents) * args.fraction))
    sents = sents[:cutoff]

for sent in sents:
    tree = chunker.parse(tagger.tag(sent))

    for child in tree.subtrees(lambda t: t.node != 'S'):
        iobs_found.inc(child.node)

iobs = iobs_found.samples()
justify = max(7, *[len(iob) for iob in iobs])

print 'IOB'.center(justify) + '    Found  '
print '=' * justify + '  ========='

for iob in sorted(iobs):
    print '  '.join([iob.ljust(justify), str(iobs_found[iob]).rjust(9)])

print '=' * justify + '  ========='
Example #14
# An example of Zipf's law

from nltk.token import *
from nltk.tokenizer import WhitespaceTokenizer
from nltk.probability import FreqDist
from nltk.draw.plot import Plot
freq_dist = FreqDist()

corpus = Token(TEXT=open('dados/may2001_pdf.torto').read())
WhitespaceTokenizer().tokenize(corpus)

for token in corpus['SUBTOKENS']:
    freq_dist.inc(token['TEXT'])

wordcount = freq_dist.samples()
#points = [(freq_dist.freq(l),l) for l in wordcount]
#points.sort()

x = 0
points = list(wordcount)
for l in wordcount:
    points[x] = (freq_dist.count(l), x)
    x = x + 1
points.sort()

print points
Plot(points)
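
The points list above pairs each word's raw count with a running index and hands it to the long-gone nltk.draw.plot.Plot widget. A Zipf plot is usually drawn as frequency against rank on log-log axes; a sketch with matplotlib (an assumption, it is not used in the original) and the NLTK 3 FreqDist:

import matplotlib.pyplot as plt
from nltk import FreqDist

tokens = open('dados/may2001_pdf.torto').read().split()
freq_dist = FreqDist(tokens)

counts = sorted(freq_dist.values(), reverse=True)  # frequency of the rank-1, rank-2, ... word
ranks = range(1, len(counts) + 1)

plt.loglog(ranks, counts)
plt.xlabel('rank')
plt.ylabel('frequency')
plt.title("Zipf's law")
plt.show()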