import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader


def generate_words_grammar():
    """
    Use sentence grammar to find words that could be Rent lyrics
    :return: lists of unique plural nouns with 1, 2 and 4 syllables,
             plus singular nouns with 2 syllables
    """
    # Load corpora to look in
    gentrification = PlaintextCorpusReader('corpus', '.*')  # Gentrification articles are in this directory
    gentrify_sents = gentrification.sents()
    wine_sents = nltk.corpus.webtext.sents('wine.txt')
    corpus_sents = gentrify_sents + wine_sents
    syls_1 = []
    syls_2 = []
    syls_4 = []
    syls_2_sing = []
    for sent in corpus_sents:
        parsed_sent = nltk.pos_tag(sent)
        for word in parsed_sent:
            no_syls = count_syllables(word[0])
            if word[1] == 'NNS' and len(word[0]) > 3:
                if no_syls == 1:
                    syls_1.append(word[0].lower())
                elif no_syls == 2:
                    syls_2.append(word[0].lower())
                elif no_syls == 4:
                    syls_4.append(word[0].lower())
            if word[1] == 'NN' and len(word[0]) > 2:
                if no_syls == 2:
                    syls_2_sing.append(word[0].lower())
    return (list(set(syls_1)), list(set(syls_2)), list(set(syls_4)),
            list(set(syls_2_sing)))
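# count_syllables() is not defined in this snippet; a minimal sketch of the
# assumed helper, using a vowel-group heuristic (hypothetical, not the
# original implementation):
def count_syllables(word):
    """Approximate syllable count by counting runs of consecutive vowels."""
    vowels = "aeiouy"
    word = word.lower()
    count = 0
    prev_was_vowel = False
    for ch in word:
        is_vowel = ch in vowels
        if is_vowel and not prev_was_vowel:
            count += 1
        prev_was_vowel = is_vowel
    return max(count, 1)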
import os

import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader


def align(filename):
    # The argument packs two paths into one string, separated by '(':
    # "raw_file(ripe_file"
    files = filename.split('(')
    ripe_file = os.path.abspath(files[1])
    raw_file = os.path.abspath(files[0])
    raw_for_nltk = os.path.abspath('data/newcorpus/source.txt')
    # Copy the raw file into the corpus directory so NLTK can read it
    with open(raw_file) as f:
        with open(raw_for_nltk, "w") as f1:
            for line in f:
                f1.write(line)
    corpusdir = 'data/newcorpus/'
    newcorpus = PlaintextCorpusReader(
        corpusdir, '.*',
        sent_tokenizer=nltk.data.LazyLoader('tokenizers/punkt/german.pickle'))
    # Fileids are relative to corpusdir, so address the copy by its basename
    temp = str(newcorpus.sents('source.txt'))
    # Strip the list syntax from the stringified sentences
    for junk in (",", "u'", "'", "[", "]"):
        temp = temp.replace(junk, "")
    with open(ripe_file, "w") as out:
        out.write(temp + os.linesep)
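# A hypothetical call, showing the '(' packing convention that align() splits:
align("data/raw_article.txt(data/aligned_article.txt")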
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.probability import LidstoneProbDist
from nltk.model import NgramModel  # available in NLTK 2.x; removed in NLTK 3


def generateNgramModel(corpusPath, corpusName):
    generatedCorpus = PlaintextCorpusReader(corpusPath, corpusName)
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    # Bigrams offer a reasonable trade-off between context and data sparsity
    ngrammodel = NgramModel(2, generatedCorpus.sents(), True, False, estimator)
    return ngrammodel
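# NgramModel was dropped from NLTK 3; a rough equivalent with the current
# nltk.lm API (a sketch, assuming NLTK >= 3.4; the gamma of 0.2 mirrors the
# LidstoneProbDist estimator above):
from nltk.lm import Lidstone
from nltk.lm.preprocessing import padded_everygram_pipeline


def generateNgramModelModern(corpusPath, corpusName):
    corpus = PlaintextCorpusReader(corpusPath, corpusName)
    train, vocab = padded_everygram_pipeline(2, corpus.sents())
    lm = Lidstone(0.2, 2)  # gamma=0.2, bigram order
    lm.fit(train, vocab)
    return lm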
import os

import word, pdf
from nltk.corpus.reader.plaintext import PlaintextCorpusReader


def processFile(newCorpusDir):
    if not os.path.isdir(newCorpusDir):
        os.mkdir(newCorpusDir)
    txt1 = getText('sample_feed.txt')
    txt2 = pdf.getTextPDF('VirtualBoxTroubleshooting.pdf')
    txt3 = word.getTextWord('my_doc.docx')
    # Write each extracted text out as a numbered .txt file
    files = [txt1, txt2, txt3]
    for idx, f in enumerate(files):
        with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
            fout.write(f)
    newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')
    print(newCorpus.words())
    print(newCorpus.sents(newCorpus.fileids()[1]))
    print(newCorpus.paras(newCorpus.fileids()[0]))
import os

import word, pdf
from nltk.corpus.reader.plaintext import PlaintextCorpusReader


def getText(txtFileName):
    with open(txtFileName, 'r') as file:
        return file.read()


newCorpusDir = 'mycorpus/'
if not os.path.isdir(newCorpusDir):
    os.mkdir(newCorpusDir)

txt1 = getText('sample_feed.txt')
txt2 = pdf.getTextPDF('sample-pdf.pdf')
txt3 = word.getTextWord('sample-one-line.docx')

files = [txt1, txt2, txt3]
for idx, f in enumerate(files):
    with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
        fout.write(f)

newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')
print(newCorpus.words())
print(newCorpus.sents(newCorpus.fileids()[1]))
print(newCorpus.paras(newCorpus.fileids()[0]))
# Access the plaintext; outputs pure string/basestring.
print newcorpus.raw().strip()
print

# Access paragraphs in the corpus. (list of list of list of strings)
# NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
# nltk.tokenize.word_tokenize.
#
# Each element in the outermost list is a paragraph,
# each paragraph contains sentence(s), and
# each sentence contains token(s).
print newcorpus.paras()
print

# To access paragraphs of a specific fileid.
print newcorpus.paras(newcorpus.fileids()[0])

# Access sentences in the corpus. (list of list of strings)
# NOTE: The texts are flattened into sentences that contain tokens.
print newcorpus.sents()
print

# To access sentences of a specific fileid.
print newcorpus.sents(newcorpus.fileids()[0])

# Access just tokens/words in the corpus. (list of strings)
print newcorpus.words()

# To access tokens of a specific fileid.
print newcorpus.words(newcorpus.fileids()[0])
import configparser
from collections import Counter

# Imports reconstructed from usage: CorpusReader is assumed to be NLTK's
# PlaintextCorpusReader, Tokenizer its RegexpTokenizer, and load_nlp spacy.load.
from nltk.corpus.reader.plaintext import PlaintextCorpusReader as CorpusReader
from nltk.tokenize import RegexpTokenizer as Tokenizer
from spacy import load as load_nlp


class TextAnalizer:
    def __init__(self, my_input_file):
        self.config = configparser.ConfigParser()
        self.config.read("text_analysis.cfg")
        self.input_file = my_input_file
        self.nlp_model = self.config["DEFAULT"]["nlp_model"]
        # The output file name
        self.output_file = self.config["DEFAULT"]["output_file"]
        self.nlp = load_nlp(self.nlp_model)
        self.corpus = CorpusReader(".", self.input_file)
        self.raw_text = self.corpus.raw()
        self.nlp_text = self.nlp(self.raw_text)
        # Collect the information needed for text analysis with spaCy.
        self.analysis_dictionary = Counter()
        self.word_count = 0
        self.get_word_count_nltk()

    def get_paragraph(self):
        return self.corpus.paras()

    def get_sentence(self):
        return self.corpus.sents()

    def get_word(self):
        return self.corpus.words()

    def get_word_count_nltk(self):
        tokenizer = Tokenizer(r'\w+')
        counts = Counter()
        self.word_count = 0  # reset so repeated calls do not double-count
        sentences = self.get_sentence()
        for sentence in sentences:
            tokens = tokenizer.tokenize(" ".join(sentence))
            self.word_count = self.word_count + len(tokens)
            filtered = [w for w in sentence if w.isalnum()]
            counts = counts + Counter(filtered)
        return counts, self.word_count

    def analize_nlp(self):
        analized_data_str = self.config["ANALIZED"]["POS"]
        analized_data = analized_data_str.split(",")
        result_dict = {}
        diff_str, tot_str = (self.config["DEFAULT"]["diff_tot_string"]).split(",")
        lemma_counter = Counter()
        pos_counter = Counter()
        tag_counter = Counter()
        for token in self.nlp_text:
            lemma_counter = lemma_counter + Counter([token.lemma_])
            pos_counter = pos_counter + Counter([token.pos_])
            tag_counter = tag_counter + Counter([token.tag_])
            my_key = token.lemma_ + "_" + token.tag_ + "_" + token.pos_
            self.analysis_dictionary[my_key] += 1
        for pos in analized_data:
            instance_counter = 0
            total_counter = 0
            for key in self.analysis_dictionary.keys():
                try:
                    my_lemma, my_tag, my_pos = key.split("_")
                except ValueError:
                    print("Warning: array has an empty line")  # TODO: add logging
                    continue
                if pos == my_pos:
                    instance_counter += 1
                    total_counter = total_counter + self.analysis_dictionary.get(key)
            result_dict[pos + diff_str] = instance_counter
            result_dict[pos + tot_str] = total_counter
        # Add the counts from NLTK
        diff_word, word_count = self.get_word_count_nltk()
        result_dict["WORDS" + tot_str] = word_count
        result_dict["WORDS" + diff_str] = len(diff_word)
        result_dict["PARAGRAPHS"] = len(self.get_paragraph())
        result_dict["SENTENCES"] = len(self.get_sentence())
        return result_dict

    def write_output(self):
        # get_word_count_nltk() returns (counts, word_count); unpack it once
        # instead of taking len() of the tuple itself.
        diff_word, _ = self.get_word_count_nltk()
        with open(self.output_file, "w+") as f:
            f.write("Number of paragraphs: " + str(len(self.get_paragraph())) + "\n")
            f.write("Number of sentences: " + str(len(self.get_sentence())) + "\n")
            f.write("Number of words: " + str(self.word_count) + "\n")
            f.write("Average words per sentence: " +
                    str(round(self.word_count / len(self.get_sentence()), 2)) + "\n")
            f.write("Number of different words: " + str(len(diff_word)) + "\n")
            f.write("Text variety (different words/total words): " +
                    str(round(len(diff_word) / self.word_count, 2)) + "\n")
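# A hypothetical text_analysis.cfg matching the keys the class reads above
# (nlp_model, output_file, diff_tot_string, and the ANALIZED/POS list are
# inferred from the code, not from any documented format):
#
#   [DEFAULT]
#   nlp_model = en_core_web_sm
#   output_file = analysis_output.txt
#   diff_tot_string = _diff,_tot
#
#   [ANALIZED]
#   POS = NOUN,VERB,ADJ
#
# Sketch of how the class might be driven:
if __name__ == "__main__":
    analyzer = TextAnalizer("my_text.txt")  # assumes my_text.txt in the cwd
    print(analyzer.analize_nlp())
    analyzer.write_output()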
def filter_token_in(self):
    corpusdir = 'data/cv_corpus'
    corpa = PlaintextCorpusReader(corpusdir, '.*', encoding='windows-1252')
    # Keep only tokens whose lemma occurs somewhere in the CV corpus
    # (nlp is a spaCy model assumed to be defined elsewhere)
    corpa_words = set(token.lemma_
                      for sent in corpa.sents()
                      for token in nlp(" ".join(sent).lower()))
    tokens = [t for t in self.tokens_in if t in corpa_words]
    return tokens
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.probability import LidstoneProbDist, WittenBellProbDist
from nltk.model import NgramModel
from nltk.tokenize import sent_tokenize, word_tokenize

corpusdir = 'corpora/'  # Directory of corpus.

SickCorpus = PlaintextCorpusReader(corpusdir, 'sick_tweets.txt')
HealthyCorpus = PlaintextCorpusReader(corpusdir, 'healthy_tweets.txt')

estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

sick_model_1 = NgramModel(1, SickCorpus.sents(), True, False, estimator)
sick_model_2 = NgramModel(2, SickCorpus.sents(), True, False, estimator)
healthy_model_1 = NgramModel(1, HealthyCorpus.sents(), True, False, estimator)
healthy_model_2 = NgramModel(2, HealthyCorpus.sents(), True, False, estimator)

tweet = "Remember when we were all diagnosed with Bieber fever ? Lol"

print "sick_model_1 is: " + str(sick_model_1.perplexity(word_tokenize(tweet)))
print "sick_model_2 is: " + str(sick_model_2.perplexity(word_tokenize(tweet)))
print "healthy_model_1 is: " + str(healthy_model_1.perplexity(word_tokenize(tweet)))
print "healthy_model_2 is: " + str(healthy_model_2.perplexity(word_tokenize(tweet)))
import os

import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

# Variables
# STEP 1
# This is the variable name for the target file to read. Note: it is useful to
# copy and paste everything from the .PDF into a .TXT file to read.
File_to_Read = 'Sample_from_PDF.txt'

# Read the file into a corpus
corpus = PlaintextCorpusReader(os.getcwd(), File_to_Read)
# print(corpus.raw())

# Count the total sentences in the document and create a list of its words
sentences = corpus.sents()
print("\n Total sentences in this corpus : ", len(sentences))
print("\n Words in this corpus : ", corpus.words())

# Find the frequency distribution of words in the document
course_freq_dist = nltk.FreqDist(corpus.words())
print("\n Top 30 words in the corpus : ", course_freq_dist.most_common(30))

# Calculate the distribution for a specific word
print("\n Distribution for \"hydrogen\" : ", course_freq_dist.get('hydrogen'))

# Tokenization
# Read the base file into a raw text variable
base_file = open(os.getcwd() + "/" + File_to_Read, mode='rt', encoding='utf-8')
raw_text = base_file.read()
from nltk.corpus.reader.plaintext import PlaintextCorpusReader as PCR


def try_out_some_functionalities():
    corpusdir = "/media/benzro/OS/Users/benzro/Desktop/Studium Uni/2)" \
                "ZweitesSemester/27)PCL-2/Uebungen/Uebung03/Enron/test/"
    newcorpus = PCR(corpusdir, '.*')
    sep = "~" * 58

    print sep
    print "access one file in the corpus"
    print sep
    # infile = corpusdir + "0001.1999-12-10.farmer.ham.txt"  # overwritten below
    infile = "0004.1999-12-14.farmer.ham.txt"
    fin = newcorpus.open(infile)
    print fin.read().strip()

    print sep
    print "all file ids"
    print sep
    print newcorpus.fileids()

    print sep
    print "access each file in the corpus"
    print sep
    # (reduced output: [0:2])
    for infile in sorted(newcorpus.fileids()):
        # the fileid of each file
        print sep
        print infile
        # open the file
        fin = newcorpus.open(infile)
        # print the content of the file
        print fin.read().strip()

    print sep
    print "access the plaintext; outputs pure string of all files"
    print sep
    print newcorpus.raw().strip()

    print sep
    print "Access paragraphs in the corpus. (list of list of list of strings)"
    print sep
    # NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
    # nltk.tokenize.word_tokenize.
    #
    # Each element in the outermost list is a paragraph,
    # each paragraph contains sentence(s), and
    # each sentence contains token(s).
    print newcorpus.paras()

    print sep
    print "To access paragraphs of a specific fileid."
    print sep
    print newcorpus.paras(newcorpus.fileids()[0])

    print sep
    print "Access sentences in the corpus. (list of list of strings)"
    print sep
    # NOTE: The texts are flattened into sentences that contain tokens.
    print newcorpus.sents()

    print sep
    print "To access sentences of a specific fileid."
    print sep
    print newcorpus.sents(newcorpus.fileids()[0])

    print sep
    print "Access just tokens/words in the corpus. (list of strings)"
    print sep
    print newcorpus.words()

    print sep
    print "To access tokens of a specific fileid."
    print sep
    print newcorpus.words(newcorpus.fileids()[0])
    print sep
import itertools
import string

# Imports reconstructed from usage (gensim < 4 provides the Vowpal Wabbit wrapper):
import matplotlib.pyplot as plt
from gensim import corpora
from gensim.models.wrappers import LdaVowpalWabbit
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.corpus.reader.plaintext import PlaintextCorpusReader as PCR
from nltk.corpus.reader.util import read_line_block
from nltk.stem import WordNetLemmatizer as WNL
from wordcloud import WordCloud


class Contract_Reader():
    def __init__(self, config):
        print('Filepath for texts = ', config.textpath)
        self.corpus = PCR(config.textpath, '.*\.txt',
                          encoding='utf-16',
                          para_block_reader=read_line_block)
        if config.clean_paragraphs == 'yes':
            self.clean(config, mode='para')
        if config.clean_sentences == 'yes':
            self.clean(config, mode='sent')
        # Corpus summaries
        self.corpus_info()
        self.LDA(config.num_topics, config.num_words)
        self.plot(config.num_words)

    def clean(self, config, mode='sent'):
        stop = set(stopwords.words('english'))
        exclude = set(string.punctuation)
        lemma = WNL()
        if mode == 'para':
            # Paragraphs are lists of sentences, each of which is a list of
            # tokens. Reduce each paragraph to a flat list of strings.
            self.para_list = [
                list(itertools.chain.from_iterable(para))
                for para in self.corpus.paras()
            ]
            for index, paragraph in enumerate(self.para_list):
                paragraph = " ".join(paragraph)
                stop_free = " ".join(
                    [i for i in paragraph.lower().split() if i not in stop])
                punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
                normalized = " ".join(
                    lemma.lemmatize(word) for word in punc_free.split())
                self.para_list[index] = normalized
            print(self.para_list[0])
            self.para_list = [para.split() for para in self.para_list]
            print(self.para_list[0])
        if mode == 'sent':
            # Obtain a list of strings, each one a sentence, rather than a
            # list of lists.
            self.sents_list = [" ".join(sent) for sent in self.corpus.sents()]
            for index, sentence in enumerate(self.sents_list):
                stop_free = " ".join(
                    [i for i in sentence.lower().split() if i not in stop])
                punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
                normalized = " ".join(
                    lemma.lemmatize(word) for word in punc_free.split())
                self.sents_list[index] = normalized
            print(self.sents_list[0])
            self.sents_list = [sentence.split() for sentence in self.sents_list]
            print(self.sents_list[0])

    def LDA(self, num_topics, num_words):
        dictionary = corpora.Dictionary(self.para_list)
        doc_term_matrix = [dictionary.doc2bow(para) for para in self.para_list]
        path = '/mnt/APPDATA/Project_Mafia/omkhalil/vowpal_binaries/vw-7.20150623'
        self.ldamodel = LdaVowpalWabbit(path, doc_term_matrix,
                                        num_topics=num_topics,
                                        id2word=dictionary)
        self.ldamodel.save('model/lda_model')
        print(self.ldamodel.print_topics(num_topics=10, num_words=num_words))

    def plot(self, num_words):
        for t in range(self.ldamodel.num_topics):
            plt.figure()
            tuples = [reversed(x) for x in self.ldamodel.show_topic(t, num_words)]
            plt.imshow(WordCloud().fit_words(dict(tuples)))
            plt.axis("off")
            plt.title("Topic #" + str(t))
            plt.savefig('plots/topic' + str(t))

    def corpus_info(self):
        """
        Summary information about the status of a corpus.
        """
        fids = len(self.corpus.fileids())
        paras = len(self.corpus.paras())
        sents = len(self.corpus.sents())
        sperp = sum(len(para) for para in self.corpus.paras()) / float(paras)
        tokens = FreqDist(self.corpus.words())
        count = sum(tokens.values())
        vocab = len(tokens)
        lexdiv = float(count) / float(vocab)
        print(("Text corpus contains {} files\n"
               "Composed of {} paragraphs and {} sentences.\n"
               "{:0.3f} sentences per paragraph\n"
               "Word count of {} with a vocabulary of {}\n"
               "lexical diversity is {:0.3f}").format(
                   fids, paras, sents, sperp, count, vocab, lexdiv))
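# Sketch of how Contract_Reader might be invoked; the config object only needs
# the attributes read above, so a SimpleNamespace stands in for whatever
# argument parser the original used (all values here are hypothetical):
from types import SimpleNamespace

config = SimpleNamespace(textpath='contracts/', clean_paragraphs='yes',
                         clean_sentences='yes', num_topics=10, num_words=20)
reader = Contract_Reader(config)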
import os
from collections import Counter
from pprint import pprint

from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.metrics.association import QuadgramAssocMeasures

# rank_quadgrams and count_ngrams are helpers defined elsewhere in this project.

# corpus = nltk.corpus.gutenberg
# mytexts = TextCollection([text1])

APPDIR = os.path.dirname(__file__)
corpus_root = 'D:\\INSTALL\\Python3\\PROJECTS\\SCRIPTS\\TEXTS\\corpora\\'
# PlaintextCorpusReader expects a regular expression, not a glob,
# so '.*\.txt' rather than '*.txt'.
corpus = PlaintextCorpusReader(corpus_root, '.*\.txt')
print(corpus.words())

pprint(rank_quadgrams(corpus, QuadgramAssocMeasures.likelihood_ratio))

# corpus.words() and corpus.sents() already yield plain strings.
tokens = list(corpus.words())
vocab = Counter(tokens)
sents = [list(sent) for sent in corpus.sents()]

trigram_counts = count_ngrams(3, vocab, sents)

# The frequency distribution for unigrams is available from the
# unigrams attribute.
print(trigram_counts.unigrams)

# For higher-order n-grams, the conditional frequency distribution is
# available from the ngrams attribute.
print(trigram_counts.ngrams[3])
# <FreqDist with 88 samples and 3015993 outcomes>

# The keys of the conditional frequency distribution are the possible
# contexts preceding each word.
# print(sorted(trigram_counts.ngrams[3].conditions()))  # incorrect

# Our model can also return a list of possible next words:
print(list(trigram_counts.ngrams[3][('the', 'President')]))
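# If count_ngrams is unavailable, NLTK's own ConditionalFreqDist over padded
# trigrams provides the same next-word lookup (a minimal sketch, not the
# original helper):
from nltk import ConditionalFreqDist, trigrams

cfd = ConditionalFreqDist(
    ((w1, w2), w3)
    for sent in sents
    for w1, w2, w3 in trigrams(sent, pad_left=True, pad_right=True))
print(list(cfd[('the', 'President')]))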
print("Processing directory " + corpusdir) print(str(len(corpusfiles)) + " files found") print("Loading NLTK") from nltk import pos_tag_sents from nltk.corpus.reader.plaintext import PlaintextCorpusReader counter = 0 for fn in corpusfiles: counter += 1 outputfn = fn + ".pos" print("Processing file " + str(counter) + " of " + str(len(corpusfiles)) + ": " + fn) if not os.path.isfile(fn): print("Warning: " + fn + " is not a regular file. Skipping") elif os.path.exists(os.path.join(corpusdir, outputfn)): print("Warning: " + outputfn + " already exists. Skipping") else: # read file into corpus object using the plain text reader corpus = PlaintextCorpusReader(corpusdir, fn) # POS-tag corpus object using the default NLTK tagger tagged = pos_tag_sents(corpus.sents()) # write tagged corpus object outputpath = os.path.join(corpusdir, outputfn) try: with open(outputpath, 'w') as output: for sent in tagged: line = " ".join(word + "/" + tag for word, tag in sent) output.write(line + '\n') except OSError: print("Error: could not write to file " + outputfn + ". Skipping")
import os

import word, pdf
from nltk.corpus.reader.plaintext import PlaintextCorpusReader


def getText(txtFileName):
    file = open(txtFileName, 'r')
    return file.read()


# Create a new directory for the corpus
newCorpusDir = 'mycorpus/'
if not os.path.isdir(newCorpusDir):
    os.mkdir(newCorpusDir)

txt1 = getText('sample_feed.txt')
txt2 = pdf.getTextPDF('sample-pdf.pdf')
txt3 = word.getTextWord('sample-one-line.docx')

# Write the contents of the three string objects to files on disk (write mode)
files = [txt1, txt2, txt3]
for idx, f in enumerate(files):
    with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
        fout.write(f)

# Create a plaintext corpus object over the directory the files were saved in
newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')
print(newCorpus.words())                        # print all words in the corpus
print(newCorpus.sents(newCorpus.fileids()[1]))  # print the sentences of 1.txt
print(newCorpus.paras(newCorpus.fileids()[0]))  # print the paragraphs of 0.txt
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

corpus = PlaintextCorpusReader(".", 'mobi_dick.txt')
# Print each sentence (oracion) in the corpus
for oracion in corpus.sents():
    print(oracion)
import os

import nltk
# Import this module for drawing graphs
import matplotlib
# Reader from NLTK to access our own text files and treat them as regular corpora
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

# This is the directory in which we store our text file
corpusdir = 'newcorpus/'
# This will make the directory in the folder you are working in.
if not os.path.isdir(corpusdir):
    os.mkdir(corpusdir)

# Access the files inside the directory
newcorpus = PlaintextCorpusReader('newcorpus/', '.*')

# Now let us perform some operations using natural language processing.
# Display the content of the files in newcorpus
print(newcorpus.raw().strip())

# Display the number of words in the files inside the directory newcorpus
a = len(newcorpus.words())
print("This will tell me the words inside the file", a)

# Display the number of sentences in the files inside the directory newcorpus
b = len(newcorpus.sents())
print("This will tell me the sentences inside the file", b)

# Calculate the average words per sentence
aws = a / b
print("This will give me average words per sentence", aws)

# **********************************************************************
words_displayed = newcorpus.words()

# This gives the frequency distribution of each word in the text file
fre_dis = nltk.FreqDist(words_displayed)

# Plot each word and its frequency using the plot function.
fre_dis.plot(title="Frequency Distribution")
import random
import os
from collections import Counter, defaultdict

from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk import bigrams, trigrams
from nltk.tokenize import sent_tokenize, word_tokenize

# Create a folder for your corpus
corpusdir = 'miscme/'
newcorpus = PlaintextCorpusReader(corpusdir, '.*')

# tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# tokenizer.tokenize(newcorpus.strip())

words = newcorpus.words()
sents = newcorpus.sents()
words = [w.lower() for w in words]
sents = [[w.lower() for w in sent] for sent in sents]

# Count (w1, w2) -> w3 transitions
trigram_counts = defaultdict(lambda: Counter())
for sentence in sents:
    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
        trigram_counts[(w1, w2)][w3] += 1

# Normalize the counts into probabilities
trigram_probs = defaultdict(lambda: Counter())
for w1_w2 in trigram_counts:
    total_count = float(sum(trigram_counts[w1_w2].values()))
    trigram_probs[w1_w2] = Counter(
        {w3: c / total_count for w3, c in trigram_counts[w1_w2].items()})

# The original snippet breaks off at this loop; a minimal completion that
# samples ten sentences from the trigram model (assumed intent, not the
# original body):
for i in range(10):
    text = [None, None]  # left padding
    while text[-1] is not None or len(text) == 2:
        context = tuple(text[-2:])
        candidates = list(trigram_probs[context].keys())
        weights = list(trigram_probs[context].values())
        text.append(random.choices(candidates, weights=weights)[0])
    print(" ".join(w for w in text if w is not None))
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import tf_glove

corpusdir = 'abstract/'
corpus = PlaintextCorpusReader(corpusdir, '.*')

model = tf_glove.GloVeModel(embedding_size=200, context_size=10,
                            min_occurrences=25, learning_rate=0.05,
                            batch_size=512)
model.fit_to_corpus(corpus.sents())
model.train(num_epochs=50, log_dir="log/example", summary_batch_interval=1000)
    # Tail of a lemmatization helper whose beginning is not shown:
    if keepWordPOS:
        return words, lemmas, [None if i == '' else i for i in poss]

    return lemmas


regex = re.compile('[_]+')
for f in corpus.fileids():
    outname = args.preprocess + "/" + f + ".out"
    fout = open(outname, "w", encoding="utf8")
    for sent in corpus.sents(f):
        s = []
        for w in sent:
            w = regex.sub('', w).lower()
            if len(w) > 2 and w not in stop_words and w.isalpha():
                s.append(w)
        if args.lemmatize:
            s = lemmatize_sentence(s)
        print(s)
        if len(s) > 1:
            # fout.write(f + "\t")
            # The original breaks off here; writing out the kept tokens is the
            # apparent intent:
            for w in s:
                fout.write(w + " ")
            fout.write("\n")
    fout.close()
    return file.read()


# Create the corpus folder
newCorpusDir = 'mycorpus/'
if not os.path.isdir(newCorpusDir):  # Does the corpus folder already exist?
    os.mkdir(newCorpusDir)

# Read the files
# Plain text file
txt1 = getText('./Files/sample_feed.txt')
# PDF file
txt2 = pdf.getTextPDF('./Files/sample-pdf.pdf')
# DOCX file
txt3 = word.getTextWord('./Files/sample-one-line.docx')

# Write the files
files = [txt1, txt2, txt3]
for idx, f in enumerate(files):
    with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
        fout.write(f)

# Build the user-defined corpus:
# read every file in the folder and create a corpus from them
newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')

# Check that the user-defined corpus was built correctly
print(newCorpus.words())                        # array of all words in the corpus
print(newCorpus.sents(newCorpus.fileids()[1]))  # array of all sentences in 1.txt
print(newCorpus.paras(newCorpus.fileids()[0]))  # array of all paragraphs in 0.txt
import re

import nltk
import treetaggerwrapper
from nltk import FreqDist
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

article_corpus = PlaintextCorpusReader(
    'text_plain/', '.*\.txt',
    sent_tokenizer=nltk.data.LazyLoader('tokenizers/punkt/spanish.pickle'))
stop_words = nltk.corpus.stopwords.words('spanish')
non_alphabetic = re.compile("\W|\d")
words = []
tags = []

# Using TreeTagger
# 1) pip install treetaggerwrapper
# 2) put TreeTagger in %PYHOME%\Lib\site-packages\TreeTagger
# 3) put spanish-utf8.par and spanish-chunker.par in \TreeTagger\lib
# See http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/spanish-tagset.txt
# for tag meanings
tagger = treetaggerwrapper.TreeTagger(TAGLANG='es')
for sentence in article_corpus.sents():
    tagged_sentence = tagger.tag_text(sentence)
    tags.extend(treetaggerwrapper.make_tags(tagged_sentence))

# TODO: create a tagger script, save the tagged files
# TODO: look at alternate taggers, compare
# TODO: profile this and see which part is taking so long

# Keep lower-cased lemmas that are alphabetic and not stop words
for tag in tags:
    lemma = tag[2].lower()
    if lemma not in stop_words and not non_alphabetic.search(lemma):
        words.append(lemma)

freq_dist = FreqDist(words)
with open('./frequency_distribution.txt', 'w', encoding='utf-8') as f:
    # The original snippet ends here; writing the distribution out, one
    # "lemma count" pair per line, is the apparent intent:
    for lemma, count in freq_dist.most_common():
        f.write(lemma + " " + str(count) + "\n")
import os
import re

from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk import word_tokenize

corpusdir = 'python/'  # Directory of corpus.
newcorpus = PlaintextCorpusReader(corpusdir, '.*')

print(newcorpus.fileids()[0])
print(type(newcorpus))
# print(newcorpus.raw())
print(newcorpus.words(newcorpus.fileids()[0]))
print(len(newcorpus.words()))

tokens = word_tokenize(newcorpus.raw())
print(len(tokens))
print(tokens[:50])

print(newcorpus.sents())
print()


# Remove comments from source code
def removeComments(string):
    # remove all occurrences of streamed comments (/* COMMENT */) from string
    string = re.sub(re.compile("/\*.*?\*/", re.DOTALL), "", string)
    # remove all occurrences of single-line comments (// COMMENT\n) from string
    string = re.sub(re.compile("//.*?\n"), "", string)
    return string


# removeComments expects a string, so pass the raw corpus text
print(removeComments(newcorpus.raw()))
# common_words, corpus, and lemmatizer are assumed to be defined earlier.
from nltk.tokenize.treebank import TreebankWordDetokenizer

print(common_words)

data = []
hmap = {}
detokenized = {}
for word, frequency in common_words.items():
    datum = {'word': word, 'frequency': frequency}
    docs = []
    sents = []
    for key, fileid in enumerate(corpus.fileids()):
        if key not in hmap:
            hmap[key] = {}
        for s_id, sentence in enumerate(corpus.sents(fileid)):
            # Cache the lemmatized sentence so each one is only processed once
            if key in hmap and s_id in hmap[key]:
                words = hmap[key][s_id]
            else:
                words = [lemmatizer.lemmatize(w.lower()) for w in sentence]
                hmap[key][s_id] = words
            if word in words:
                s_key = f'{key}-{s_id}'
                # Cache the detokenized sentence as well
                if s_key in detokenized:
                    sent = detokenized[s_key]
                else:
                    sent = TreebankWordDetokenizer().detokenize(sentence)
                    detokenized[s_key] = sent
# Imports reconstructed from usage; nlp (a spaCy model) and stopset are
# assumed to be defined elsewhere.
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from nltk.corpus.reader.plaintext import PlaintextCorpusReader


def cv_to_matrix(self):
    corpusdir = 'data/cv_corpus'
    corpa = PlaintextCorpusReader(corpusdir, '.*', encoding='windows-1252')
    print("Preprocessing words....")
    # Lemmatize each cleaned, lower-cased sentence and drop stop words
    sents = [[token.lemma_
              for token in nlp(" ".join(self.clean(sent)).lower())
              if token.lemma_ not in stopset]
             for sent in corpa.sents()]
    print("training word vectors....")
    model = Word2Vec(sents, window=5, size=self.ncol, min_count=1, workers=4)
    fname = get_tmpfile("vectors.kv")
    model.wv.save(fname)
    print("cv_to_matrix model saved")
    return model.wv
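# The saved KeyedVectors can be reloaded later without retraining (a usage
# sketch; get_tmpfile resolves the same name to the same temp path, and
# "experience" is an arbitrary example token):
from gensim.models import KeyedVectors

vectors_path = get_tmpfile("vectors.kv")
wv = KeyedVectors.load(vectors_path)
print(wv.most_similar("experience", topn=5))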