import sys

import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder


def main():
    # Command line: corpus_root, number of text files, association-measure
    # name (e.g. "bigram_pmi"), frequency filter, then the file names.
    corpus_root = sys.argv[1]
    num_text_files = int(sys.argv[2])
    algorithm_type = sys.argv[3]
    pmi_freq_filter = int(sys.argv[4])
    file_list = []
    for i in range(num_text_files):
        file_list.append(sys.argv[5 + i])

    corpus = PlaintextCorpusReader(corpus_root, '.*')

    if 'bigram' in algorithm_type:
        measures = nltk.collocations.BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(corpus.words())
    else:
        measures = nltk.collocations.TrigramAssocMeasures()
        finder = TrigramCollocationFinder.from_words(corpus.words())
    finder.apply_freq_filter(pmi_freq_filter)
    # f() maps the algorithm name onto the matching association-measure
    # function (see the sketch below) and scores every surviving n-gram.
    scored = finder.score_ngrams(f(algorithm_type))

    # Print the n-grams in ascending score order, tab-separated.
    for ngram, score in sorted(scored, key=lambda tup: tup[1]):
        print "\t".join(list(ngram) + [str(score)])
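# Hedged sketch, not part of the original: main() relies on an f() helper that
# is not shown here. One plausible definition maps the algorithm name passed on
# the command line (e.g. "bigram_pmi", "trigram_likelihood_ratio") onto the
# matching NLTK association-measure function; the exact name matching below is
# an assumption.
def f(algorithm_type):
    if 'bigram' in algorithm_type:
        measures = nltk.collocations.BigramAssocMeasures()
    else:
        measures = nltk.collocations.TrigramAssocMeasures()
    if 'pmi' in algorithm_type:
        return measures.pmi
    if 'likelihood' in algorithm_type:
        return measures.likelihood_ratio
    if 'chi_sq' in algorithm_type:
        return measures.chi_sq
    return measures.raw_freq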
def describe(corpus):
    # Per-file summary: average word length (c/w), sentence length (w/s)
    # and lexical diversity (w/v), followed by the file id.
    print "\t".join(["c/w", "w/s", "w/v", "id"])
    for fileid in corpus.fileids():
        nchars = len(corpus.raw(fileid))
        nwords = len(corpus.words(fileid))
        nsents = len(corpus.sents(fileid))
        nvocab = len(set(w.lower() for w in corpus.words(fileid)))
        print "\t".join([str(nchars / nwords), str(nwords / nsents),
                         str(nwords / nvocab), fileid])
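# Hedged usage sketch (not in the original): describe() works with any NLTK
# corpus reader that exposes raw(), words() and sents(). The path and file
# pattern below are hypothetical.
from nltk.corpus import PlaintextCorpusReader

sample_corpus = PlaintextCorpusReader('/path/to/texts', r'.*\.txt')
describe(sample_corpus)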
def get_cosine_similarity_to_corpora(self, outbound_conv):
    # Concatenate the bodies of the outbound messages into one document.
    outbound = ""
    for tm in outbound_conv:
        outbound += tm["body"] + " "

    # Compare that document against each reference corpus.
    similarities = {}
    for corpus, name in self.corpora.iteritems():
        corp_concat = " ".join(corpus.words())
        corp_sim = self._calculate_cosine_similarity(corp_concat, outbound)
        similarities[name] = corp_sim
    return similarities
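# Hedged sketch (not in the original): the _calculate_cosine_similarity helper
# used above is defined elsewhere in the class. A minimal bag-of-words version
# over whitespace tokens could look like this; the real implementation may use
# TF-IDF weighting instead.
import math
from collections import Counter

def _calculate_cosine_similarity(self, text_a, text_b):
    vec_a = Counter(text_a.lower().split())
    vec_b = Counter(text_b.lower().split())
    dot = sum(vec_a[w] * vec_b[w] for w in set(vec_a) & set(vec_b))
    norm_a = math.sqrt(sum(c * c for c in vec_a.values()))
    norm_b = math.sqrt(sum(c * c for c in vec_b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)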
import logging


def subset_dictionary():
    """Create a subset dictionary from the working context's dictionary.

    This is a bit useless, so not recommended for use.  The average person
    has a ~35,000-word vocabulary.  The default dictionary is ~130,000
    words, way too large and full of lots of stuff you're not likely to
    actually want to dictate.
    """
    log = logging.getLogger('subset')
    parser = base_arguments(
        'Create a dictionary subset of highest-frequency words from an NLTK corpus')
    parser.add_argument(
        '--corpus', type=bytes, default='webtext',
        help="NLTK corpus to download and process",
    )
    parser.add_argument(
        '--count', type=int, default=10000,
        help="Number of items to include in dictionary",
    )
    arguments = parser.parse_args()
    working_context = context.Context(arguments.context)

    import nltk, nltk.corpus
    nltk.download(arguments.corpus)
    corpus = getattr(nltk.corpus, arguments.corpus)
    total = corpus.words()
    log.info('%s words in corpus', len(total))

    # Frequency-rank the alphanumeric words, case-folded.
    fd = nltk.FreqDist([as_unicode(w).lower() for w in total if w.isalnum()])
    all_items = list(fd.iteritems())
    log.info('%s distinct words', len(all_items))

    # Look up translations for ~1.5x the requested count, since some words
    # will have no entry in the dictionary cache.
    translations = working_context.dictionary_cache.have_words(
        *[x[0] for x in all_items[:int(arguments.count * 1.5)]])

    count = 0
    items = []
    for (word, frequency) in fd.iteritems():
        if translations.get(word):
            count += 1
            for t in translations.get(word):
                items.append((word, t))
        if count >= arguments.count:
            break
    items.sort()
    for word, translation in items:
        print '%s\t%s' % (as_bytes(word), as_bytes(translation))
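# Hedged sketch (not in the original): base_arguments(), context.Context and
# the dictionary cache come from the surrounding project and are not shown
# here. The encoding helpers assumed above could be as simple as:
def as_unicode(value, encoding='utf-8'):
    # Decode byte strings; pass text through unchanged.
    if isinstance(value, bytes):
        return value.decode(encoding, 'replace')
    return value

def as_bytes(value, encoding='utf-8'):
    # Encode text; pass byte strings through unchanged.
    if isinstance(value, bytes):
        return value
    return value.encode(encoding)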
import nltk
from nltk.corpus import PlaintextCorpusReader
from gensim import corpora
from gensim.models import TfidfModel
from gensim.models.phrases import Phrases


def build_tfidf(corpus_dir, model_filename):
    stemmer = nltk.stem.PorterStemmer()
    corpus = PlaintextCorpusReader(corpus_dir, r'.*\.txt$')  # a memory-friendly iterator
    dictionary = corpora.Dictionary()

    # Learn common bigrams so frequent word pairs become single tokens.
    bigram_transformer = Phrases(TextCorpus(corpus))

    # Build the dictionary from the stemmed, bigram-merged tokens of each file.
    for myfile in corpus.fileids():
        try:
            chunks = bigram_transformer[[word.lower() for word in corpus.words(myfile)]]
            dictionary.add_documents([[stemmer.stem(chunk) for chunk in chunks]])
        except Exception as e:
            print 'Warning: error in file:', myfile

    model = TfidfModel(BowCorpus(corpus, dictionary, bigram_transformer),
                       id2word=dictionary)
    model.save(model_filename)
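# Hedged sketch (not in the original): build_tfidf() expects TextCorpus and
# BowCorpus wrappers that stream one document at a time, so the whole corpus
# never sits in memory. Plausible definitions, assuming the same lowercasing,
# bigram merging and stemming as above:
import nltk


class TextCorpus(object):
    """Yield each file of an NLTK corpus reader as a list of lowercased tokens."""
    def __init__(self, corpus):
        self.corpus = corpus

    def __iter__(self):
        for fileid in self.corpus.fileids():
            yield [word.lower() for word in self.corpus.words(fileid)]


class BowCorpus(object):
    """Yield each file as a gensim bag-of-words vector (stemmed, bigram-merged)."""
    def __init__(self, corpus, dictionary, bigram_transformer):
        self.corpus = corpus
        self.dictionary = dictionary
        self.bigram_transformer = bigram_transformer
        self.stemmer = nltk.stem.PorterStemmer()

    def __iter__(self):
        for fileid in self.corpus.fileids():
            chunks = self.bigram_transformer[
                [word.lower() for word in self.corpus.words(fileid)]]
            yield self.dictionary.doc2bow([self.stemmer.stem(c) for c in chunks])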
print "Error: Corpus does not exist." exit(0) if args.number: topn = int(args.number) if int(args.number) else topn if args.casefold: options.append('cf') if args.stem: options.append('stem') if args.filters: filters = args.filters.split() # Vocabulary words = map(preprocess, corpus.words()) vocabulary = FreqDist(words) # Collocations using all bigrams bg = bigrams(words) # Get processed bigrams bigramFreq = FreqDist(bg) freqs.append(bigramFreq) # Collocations filtered by part of speech bg = bigrams(corpus.tagged_words()) # Get all tagged bigrams bg = filter(posFilter, bg) # Filter bigrams by POS bg = map(rmPos, bg) # Remove tags bg = map(preprocessBigram, bg) # Process bigrams posFreq = FreqDist(bg) freqs.append(posFreq)
t.start()

# Hyphenation dictionaries.
dic = pyphen.Pyphen(lang='pt-BR')
dicEN = pyphen.Pyphen(lang='en-US')
dicFR = pyphen.Pyphen(lang='fr-FR')

escrita = []
banco = open("banco.txt", 'w', encoding='utf-8')

corpus = nltk.corpus.machado  # other corpora: mac_morpho and floresta
corpus.ensure_loaded()
freq = nltk.FreqDist(corpus.words())

# Take the 100,000 most frequent words; all the corpora listed above have
# fewer than 100,000 words.
for palavra, frequencia in freq.most_common(100000):
    palavra = re.sub('[' + string.punctuation + ']', '', palavra)
    # Exclude empty strings, numbers and Roman numerals.
    if palavra != "" and not hasNumbers(palavra) and palavra not in escrita \
            and roman_to_int(palavra.upper()) == 0:
        escrita.append(palavra.lower())
        escrita.append(".")
        escrita.append("\n")

# Hyphenation: try Portuguese first, then English and French for long
# words that the Portuguese dictionary could not split.
for palavra in escrita:
    palavra_hyphen = dic.inserted(palavra)
    if len(palavra_hyphen) > 4 and "-" not in palavra_hyphen:
        palavra_hyphen = dicEN.inserted(palavra_hyphen)
        palavra_hyphen = dicFR.inserted(palavra_hyphen)
    banco.write(palavra_hyphen)

done = True
banco.close()
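# Hedged sketch (not in the original): hasNumbers() and roman_to_int() are
# assumed by the loop above; roman_to_int() must return 0 for strings that are
# not Roman numerals, which is how non-numerals are detected.
import re

def hasNumbers(s):
    return bool(re.search(r'\d', s))

def roman_to_int(s):
    values = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    if not s or any(ch not in values for ch in s):
        return 0
    total = 0
    for i, ch in enumerate(s):
        if i + 1 < len(s) and values[ch] < values[s[i + 1]]:
            total -= values[ch]
        else:
            total += values[ch]
    return total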