Example #1
import sys
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder

def main():
    # Positional arguments: corpus directory, number of text files,
    # algorithm name (e.g. 'bigram_pmi'), minimum-frequency filter, file names.
    corpus_root = sys.argv[1]
    num_text_files = int(sys.argv[2])
    algorithm_type = sys.argv[3]
    pmi_freq_filter = int(sys.argv[4])
    file_list = []
    for i in range(0, num_text_files):
        file_list.append(sys.argv[5 + i])
    # The reader below loads every file under corpus_root; file_list is collected
    # but not used further in this snippet.
    corpus = PlaintextCorpusReader(corpus_root, '.*')
    if 'bigram' in algorithm_type:
        measures = nltk.collocations.BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(corpus.words())
        finder.apply_freq_filter(pmi_freq_filter)
        # f() is defined elsewhere; it is expected to map the algorithm name to an
        # association-measure function (e.g. measures.pmi) -- see the sketch below.
        scored = finder.score_ngrams(f(algorithm_type))
    else:
        measures = nltk.collocations.TrigramAssocMeasures()
        finder = TrigramCollocationFinder.from_words(corpus.words())
        finder.apply_freq_filter(pmi_freq_filter)
        scored = finder.score_ngrams(f(algorithm_type))

    # Sort n-grams by score and print one tab-separated line per n-gram.
    sort = sorted(scored, key=lambda tu: tu[1])
    for key in sort:
        ngrams = len(key[0])
        if ngrams == 2:
            print key[0][0] + "\t" + key[0][1] + "\t" + str(key[1])
        else:
            print key[0][0] + "\t" + key[0][1] + "\t" + key[0][2] + "\t" + str(key[1])
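The example relies on a helper f that is not shown. A minimal sketch of what such a helper could look like, assuming the algorithm name has the form 'bigram_pmi' or 'trigram_likelihood_ratio' (the mapping is an assumption, not part of the original):

# Hypothetical helper assumed by Example #1: resolve the algorithm name
# to the matching NLTK association-measure function.
import nltk

def f(algorithm_type):
    if 'bigram' in algorithm_type:
        measures = nltk.collocations.BigramAssocMeasures()
    else:
        measures = nltk.collocations.TrigramAssocMeasures()
    # 'bigram_pmi' -> measures.pmi, 'trigram_likelihood_ratio' -> measures.likelihood_ratio
    return getattr(measures, algorithm_type.split('_', 1)[1])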
Example #2
def describe(corpus):
  # Print average word length (c/w), average sentence length (w/s), and
  # lexical diversity (w/v) for each file in the corpus.
  print "\t".join(["c/w", "w/s", "w/v", "id"])
  for fileid in corpus.fileids():
    nchars = len(corpus.raw(fileid))
    nwords = len(corpus.words(fileid))
    nsents = len(corpus.sents(fileid))
    nvocab = len(set(w.lower() for w in corpus.words(fileid)))
    print "\t".join([str(nchars / nwords), str(nwords / nsents),
      str(nwords / nvocab), fileid])
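A usage sketch, assuming a directory of plain-text files (the path and pattern are illustrative):

# Hypothetical usage: describe every .txt file under ./corpus.
from nltk.corpus import PlaintextCorpusReader
corpus = PlaintextCorpusReader('./corpus', r'.*\.txt')
describe(corpus)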
Example #3
    def get_cosine_similarity_to_corpora(self, outbound_conv):
        # Concatenate the bodies of the outbound messages into one document.
        outbound = ""
        for tm in outbound_conv:
            outbound += tm["body"] + " "

        # self.corpora maps corpus readers to display names; score the outbound
        # text against each reference corpus.
        similarities = {}
        for corpus, name in self.corpora.iteritems():
            corp_concat = " ".join(corpus.words())
            corp_sim = self._calculate_cosine_similarity(corp_concat, outbound)
            similarities[name] = corp_sim

        return similarities
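The method depends on a _calculate_cosine_similarity helper that is not shown. A minimal sketch of one possible implementation, assuming scikit-learn is available (the TF-IDF approach is an assumption, not the original code):

# Hypothetical helper: cosine similarity between two raw text strings
# computed over TF-IDF vectors.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def _calculate_cosine_similarity(self, text_a, text_b):
    vectors = TfidfVectorizer().fit_transform([text_a, text_b])
    return cosine_similarity(vectors[0], vectors[1])[0][0]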
Example #4
import logging
# base_arguments, context, as_unicode, and as_bytes are helpers from the
# surrounding project and are assumed to be importable here.

def subset_dictionary():
    """Create a subset dictionary from the working context's dictionary.

    This is a bit useless, so not recommended for use. The average person
    has a vocabulary of ~35,000 words. The default dictionary is ~130,000
    words, which is far too large and full of lots of stuff you're not
    likely to actually want to dictate.
    """
    log = logging.getLogger('subset')
    parser = base_arguments('Create a dictionary subset of highest-frequency words from an NLTK corpus')
    parser.add_argument(
        '--corpus', type=bytes, default='webtext',
        help="NLTK corpus to download and process",
    )
    parser.add_argument(
        '--count', type=int, default=10000,
        help="Number of items to include in dictionary",
    )
    arguments = parser.parse_args()
    working_context = context.Context(arguments.context)
    import nltk, nltk.corpus
    nltk.download(arguments.corpus)
    corpus = getattr(nltk.corpus, arguments.corpus)
    total = corpus.words()
    log.info('%s words in corpus', len(total))
    # Frequency distribution over lower-cased alphanumeric tokens.
    fd = nltk.FreqDist([
        as_unicode(w).lower()
        for w in total
        if w.isalnum()
    ])
    all_items = list(fd.iteritems())
    log.info('%s distinct words', len(all_items))
    translations = working_context.dictionary_cache.have_words(
        *[x[0] for x in all_items[:int(arguments.count * 1.5)]]
    )
    count = 0
    items = []
    # FreqDist.iteritems() yields words in decreasing frequency order;
    # keep the most frequent words that have known pronunciations.
    for (word, frequency) in fd.iteritems():
        if translations.get(word):
            count += 1
            for t in translations.get(word):
                items.append((word, t))
            if count >= arguments.count:
                break
    items.sort()
    for word, translation in items:
        print '%s\t%s' % (as_bytes(word), as_bytes(translation))
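Example #4 leans on small project helpers (as_unicode, as_bytes) that are not shown. A minimal sketch of the two encoding helpers, assuming UTF-8 text (purely illustrative, not the project's code):

# Hypothetical encoding helpers assumed by Example #4 (UTF-8 is an assumption).
def as_unicode(value):
    return value.decode('utf-8') if isinstance(value, bytes) else value

def as_bytes(value):
    return value.encode('utf-8') if isinstance(value, unicode) else value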
Example #5
import nltk
from nltk.corpus import PlaintextCorpusReader
from gensim import corpora
from gensim.models import TfidfModel, Phrases

def build_tfidf(corpus_dir, model_filename):
    stemmer = nltk.stem.PorterStemmer()
    corpus = PlaintextCorpusReader(corpus_dir, r'.*\.txt$')  # a memory-friendly iterator
    dictionary = corpora.Dictionary()

    # TextCorpus and BowCorpus are streaming wrapper classes defined elsewhere
    # in the project; Phrases learns bigram collocations from the token stream.
    bigram_transformer = Phrases(TextCorpus(corpus))

    for myfile in corpus.fileids():
        try:
            chunks = bigram_transformer[[word.lower() for word in corpus.words(myfile)]]
            dictionary.add_documents([[stemmer.stem(chunk) for chunk in chunks]])

        except Exception as e:
            print 'Warning: error in file %s: %s' % (myfile, e)

    model = TfidfModel(BowCorpus(corpus, dictionary, bigram_transformer), id2word=dictionary)
    model.save(model_filename)
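Example #5 depends on TextCorpus and BowCorpus wrappers that are not shown. A minimal sketch of what such streaming wrappers could look like (the class bodies are assumptions, not the original code):

# Hypothetical streaming wrappers assumed by Example #5.
import nltk

class TextCorpus(object):
    """Yield each file of an NLTK corpus as a list of lower-cased tokens."""
    def __init__(self, corpus):
        self.corpus = corpus
    def __iter__(self):
        for fileid in self.corpus.fileids():
            yield [word.lower() for word in self.corpus.words(fileid)]

class BowCorpus(object):
    """Yield each file as a bag-of-words vector after bigram merging and stemming."""
    def __init__(self, corpus, dictionary, bigram_transformer):
        self.corpus = corpus
        self.dictionary = dictionary
        self.bigram_transformer = bigram_transformer
        self.stemmer = nltk.stem.PorterStemmer()
    def __iter__(self):
        for fileid in self.corpus.fileids():
            chunks = self.bigram_transformer[[w.lower() for w in self.corpus.words(fileid)]]
            yield self.dictionary.doc2bow([self.stemmer.stem(c) for c in chunks])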
Example #6
		print "Error: Corpus does not exist."
		exit(0)

if args.number:
	topn = int(args.number) if int(args.number) else topn

if args.casefold:
	options.append('cf')
if args.stem:
	options.append('stem')

if args.filters:
	filters = args.filters.split()

# Vocabulary
words = map(preprocess, corpus.words())
vocabulary = FreqDist(words)

# Collocations using all bigrams
bg = bigrams(words)                 # Get processed bigrams
bigramFreq = FreqDist(bg)
freqs.append(bigramFreq)

# Collocations filtered by part of speech
bg = bigrams(corpus.tagged_words()) # Get all tagged bigrams
bg = filter(posFilter, bg)          # Filter bigrams by POS
bg = map(rmPos, bg)                 # Remove tags
bg = map(preprocessBigram, bg)      # Process bigrams
posFreq = FreqDist(bg)
freqs.append(posFreq)
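This excerpt assumes several helpers defined earlier in the same script (preprocess, posFilter, rmPos, preprocessBigram). A rough sketch of what they might look like, given the options and filters variables above (the exact behaviour is an assumption):

# Hypothetical helpers assumed by Example #6; 'options' and 'filters' are the
# globals set from the command-line arguments in the excerpt above.
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

def preprocess(word):
    # Apply the selected normalization options.
    if 'cf' in options:
        word = word.lower()
    if 'stem' in options:
        word = stemmer.stem(word)
    return word

def posFilter(tagged_bigram):
    # Keep a bigram only when both POS tags are in the allowed filter set.
    (w1, t1), (w2, t2) = tagged_bigram
    return t1 in filters and t2 in filters

def rmPos(tagged_bigram):
    # Drop the POS tags, keeping only the two words.
    (w1, _), (w2, _) = tagged_bigram
    return (w1, w2)

def preprocessBigram(bigram):
    return tuple(preprocess(w) for w in bigram)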
Example #7
t.start()

# hyphenation dictionaries
dic = pyphen.Pyphen(lang='pt-BR')
dicEN = pyphen.Pyphen(lang='en-US')
dicFR = pyphen.Pyphen(lang='fr-FR')

escrita = []
banco = open("banco.txt", 'w', encoding='utf-8')

corpus = nltk.corpus.machado  # other corpora: mac_morpho and floresta
corpus.ensure_loaded()
freq = nltk.FreqDist(corpus.words())
# Take the 100,000 most frequent words; every corpus listed above has fewer than 100,000 words.
for palavra, frequencia in freq.most_common(100000):
    palavra = re.sub('[' + string.punctuation + ']', '', palavra)
    # Skip empty strings, words containing digits, duplicates, and Roman numerals.
    if palavra != "" and not hasNumbers(palavra) and palavra not in escrita and roman_to_int(palavra.upper()) == 0:
        escrita.append(palavra.lower())
        escrita.append(".")
        escrita.append("\n")
for palavra in escrita:  # hyphenation
    palavra_hyphen = dic.inserted(palavra)
    if len(palavra_hyphen) > 4 and "-" not in palavra_hyphen:
        palavra_hyphen = dicEN.inserted(palavra_hyphen)
        palavra_hyphen = dicFR.inserted(palavra_hyphen)
    banco.write(palavra_hyphen)
done = True
banco.close()
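The script calls hasNumbers and roman_to_int helpers that are defined elsewhere; a minimal sketch of plausible implementations (these bodies are assumptions, not the original code):

# Hypothetical helpers assumed by Example #7.
def hasNumbers(s):
    # True when the string contains at least one digit.
    return any(ch.isdigit() for ch in s)

def roman_to_int(s):
    # Return the value of a Roman numeral, or 0 when the string is not one.
    values = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    if not s or any(ch not in values for ch in s):
        return 0
    total = 0
    for i, ch in enumerate(s):
        if i + 1 < len(s) and values[ch] < values[s[i + 1]]:
            total -= values[ch]
        else:
            total += values[ch]
    return total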