def load_corpus(race_code=None, gender_code=None):
    # Loads corpora into a list, filtered by race and gender.
    if race_code is None:  # if none is specified, search all
        race_code = ".."
    if gender_code is None:
        gender_code = ".."
    # Uses the filename encoding to load only the specified texts.
    reader = PlaintextCorpusReader(
        corpus_root, ".*_" + race_code + "_" + gender_code + r"\.txt")
    corpora = []
    for fileid in reader.fileids():
        # Create a ComedyCorpus object and populate it with fileid and name.
        new_corpus = ComedyCorpus()
        new_corpus.set_fileid(fileid)
        try:
            new_corpus.set_text(reader.raw(fileid))  # word content for this fileid
        except UnicodeDecodeError:
            continue
        # The name is the fileid without the encoding suffix; the separator
        # must match the reader pattern above (underscore, not hyphen).
        fileid = re.sub("_" + race_code + "_" + gender_code + r"\.txt", "", fileid)
        fileid = fileid.replace("%20", " ")
        fileid = fileid.replace("_", "; ")
        print(fileid)
        new_corpus.set_name(fileid)
        corpora.append(new_corpus)
    return corpora
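# A minimal usage sketch for load_corpus. It assumes corpus_root is defined at
# module level and files follow the "<name>_<race>_<gender>.txt" encoding; the
# codes "AA" and "F" below are hypothetical placeholders, not project values.
# corpus_root = "comedy_transcripts/"
# everything = load_corpus()          # ".." matches any two-character code
# one_slice = load_corpus("AA", "F")  # only files ending in "_AA_F.txt"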
def load_feat_data(dir_array):
    data_list = []
    for direct in dir_array:
        data = []
        corpus_dir = 'dataset/' + direct
        corpus = PlaintextCorpusReader(corpus_dir, r'.*\.*')
        file_ids = corpus.fileids()
        for file in file_ids:
            text = corpus.raw(file)
            e = email.message_from_string(text)
            if e.is_multipart():
                # get_payload is a method, so it must be called.
                for payload in e.get_payload():
                    text = payload.get_payload()
            else:
                text = e.get_payload()
            data.append(extract_features(text, corpus, file))
        data_list.extend(data)
    return data_list
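# Hypothetical call, assuming subdirectories 'dataset/ham' and 'dataset/spam'
# exist and extract_features is defined elsewhere in the project:
# feature_rows = load_feat_data(['ham', 'spam'])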
def parseFolder(dirPath):
    assignments = {}
    draftReader = PlaintextCorpusReader(dirPath, r'\d+draft\d*.*')
    finalReader = PlaintextCorpusReader(dirPath, r'\d+final\d*.*')
    numFiles = len(os.listdir(dirPath))
    assert numFiles % 2 == 0  # every draft should have a matching final
    for pid in finalReader.fileids():
        final = finalReader.paras(pid)
        # The draft reader only knows draft fileids, so derive the matching
        # draft id from the final id rather than reusing pid directly.
        draft = draftReader.paras(pid.replace('final', 'draft'))
        assn = assignment(draft, final)
        assignments[pid] = assn
    return assignments
def parseFolder(dirPath):
    assignments = []
    draftReader = PlaintextCorpusReader(dirPath, r'\d+draft\d*.*')
    finalReader = PlaintextCorpusReader(dirPath, r'\d+final\d*.*')
    numFiles = len(os.listdir(dirPath))
    assert numFiles % 2 == 0  # drafts and finals should pair up
    # fileids() returns sorted ids, so drafts and finals align by index.
    finalIdsSortedList = finalReader.fileids()
    draftIdsSortedList = draftReader.fileids()
    for i in range(len(finalIdsSortedList)):
        final = finalReader.paras(finalIdsSortedList[i])
        draft = draftReader.paras(draftIdsSortedList[i])
        assn = assignment(draft, final)
        assignments.append(assn)
    return assignments
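# Usage sketch for either parseFolder variant above, assuming a folder of
# files named like "1draft.txt" / "1final.txt" and an assignment class
# defined elsewhere in the project:
# assignments = parseFolder('essays/')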
def get_fileid_lst(source_dir):
    '''
    Use NLTK to pull in the list of file ids in the given source directory

    :param {str} source_dir: The relative path to the source directory that
        contains all the data (book) files
    :return {list} fileid_lst: List of all file ids ending in '.txt' in source_dir
    '''
    temp_corp = PlaintextCorpusReader(source_dir, r'.*\.txt')
    fileid_lst = temp_corp.fileids()
    return fileid_lst
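# Example call (the directory name is hypothetical):
# book_ids = get_fileid_lst('books')  # e.g. ['austen.txt', 'melville.txt']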
def main():
    """ Main function of the program """
    corpus_dir = 'NLP_dataset/training_set'  # directory of corpus
    new_corpus = PlaintextCorpusReader(corpus_dir, '.*')
    for file_id in new_corpus.fileids():
        # Read each file to get the matched sentences; the with-block
        # ensures the handle is closed afterwards.
        with open(corpus_dir + "/" + file_id, "r") as file_to_read:
            matched_sen = match_regular_expressions(file_to_read)
        # Write the matched sentences to files.
        write_to_files(matched_sen, file_id)
def load_data(dir_label):
    data_list = []
    labels = []
    for dl in dir_label:
        data = []
        directory = dl[0]
        label = dl[1]
        corpus_dir = 'dataset/' + directory
        corpus = PlaintextCorpusReader(corpus_dir, r'.*\.*')
        file_ids = corpus.fileids()
        for file in file_ids:
            d = []
            text = corpus.raw(file)
            e = email.message_from_string(text)
            if e.is_multipart():
                # get_payload is a method, so it must be called.
                for payload in e.get_payload():
                    text = payload.get_payload()
            else:
                text = e.get_payload()
            feats = [
                cf.charac_feats_extractor(text),
                wf.word_feats_extractor(text),
                syf.syntac_feats_extractor(text),
                stf.struct_feats_extractor(corpus, file, text),
                fwf.funct_word_feats_extractor(text)
            ]
            for f in feats:
                d.extend(list(f.values()))
            data.append(d)
            labels.append(label)
        data_list.extend(data)
    return [data_list, labels]
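# Hypothetical call, assuming the 'dataset/<dir>' layout used above and one
# (directory, label) pair per class:
# data, labels = load_data([('ham', 0), ('spam', 1)])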
def processFile(newCorpusDir):
    if not os.path.isdir(newCorpusDir):
        os.mkdir(newCorpusDir)
    txt1 = getText('sample_feed.txt')
    txt2 = pdf.getTextPDF('VirtualBoxTroubleshooting.pdf')
    txt3 = word.getTextWord('my_doc.docx')
    files = [txt1, txt2, txt3]
    for idx, f in enumerate(files):
        with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
            fout.write(f)
    newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')
    print(newCorpus.words())
    print(newCorpus.sents(newCorpus.fileids()[1]))
    print(newCorpus.paras(newCorpus.fileids()[0]))
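# Example call; the trailing slash matters because the function concatenates
# the directory and the filename directly:
# processFile('mycorpus/')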
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

# Create a new corpus by specifying the parameters
# (1) directory of the new corpus
# (2) the fileids of the corpus
# NOTE: in this case the fileids are simply the filenames.
newcorpus = PlaintextCorpusReader('nltkCorpusAll/', '.*')

# Access each file in the corpus.
for infile in sorted(newcorpus.fileids()):
    print infile                  # the fileid of each file
    fin = newcorpus.open(infile)  # opens the file
    print fin.read().strip()      # prints the content of the file
print

# Access the plaintext; outputs pure string/basestring.
print newcorpus.raw().strip()
print

# Access paragraphs in the corpus. (list of list of list of strings)
# NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
# nltk.tokenize.word_tokenize.
#
# Each element in the outermost list is a paragraph, and
# each paragraph contains sentence(s), and
# each sentence contains token(s).
print newcorpus.paras()
print

# To access paragraphs of a specific fileid.
# Tried to find misspellings in a corpus of text files.
# See find_misspellings.py and grouping_docs.py for documentation.
# There are ~30,400 unique words in these 49 communication files.
# Rebecca's laptop took too long to make the correlation matrix.
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer
import numpy as np
from numpy import linalg

# Make a new corpus.
corpusdir = 'communications/small_test_batch'  # where the files are
newcorpus = PlaintextCorpusReader(corpusdir, '.*')
fileids = newcorpus.fileids()  # list of fileids
j = len(fileids)               # number of docs

words_list = []   # ['doc', '1', 'words', 'doc', '2', 'words', ...]
doc_breaks = [0]  # ith entry = index of first word in doc i in words_list
keywords = set()  # {'doc', '1', 'words', '2', ...}
# Pick out alphanumeric sequences; discard punctuation and whitespace.
tokenizer = RegexpTokenizer(r'\w+')

# Create the set of keywords and the list of file texts.
for id in fileids:
    raw = newcorpus.raw(id)
    raw2 = ''.join([i if ord(i) < 128 else '' for i in raw])  # remove unicode characters
    raw3 = raw2.encode('ascii')
    # List of cleaned words: lower-case, no punctuation, no whitespace.
    file_words = map(str.lower, tokenizer.tokenize(raw3))
    words_list = words_list + file_words
    doc_breaks = doc_breaks + [len(file_words) + doc_breaks[len(doc_breaks) - 1]]
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import nltk

# Might need the line below once:
# nltk.download('punkt')

corpusDir = 'own_corpus/'
newCorpus = PlaintextCorpusReader(corpusDir, r'.*\.txt')

for file in sorted(newCorpus.fileids()):
    words = newCorpus.words(file)
    text = nltk.Text(words)
    print(text)
from nltk import Text
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

# nltk.download()
# nltk.download('gutenberg')
# text1.concordance("water")
# print(FreqDist(text1).most_common(50))
# FreqDist(text1).plot(50, cumulative=True)
# print(set(text1))

corpus_root = '/Users/devindyson/Desktop/nltk/corpora'
corpora = PlaintextCorpusReader(corpus_root, '.*')

# print(corpora.raw('meditations.txt'))
# print(SentimentIntensityAnalyzer().polarity_scores("NLTK is pretty dope."))

print(sorted(corpora.fileids()))
print(len(corpora.words('meditations.txt')))
print(len(corpora.words('benjamin.txt')))

meditations = Text(corpora.words('meditations.txt'))
benjamin = Text(corpora.words('benjamin.txt'))

def lexical_diversity(text_data):
    word_count = len(text_data)
    vocab_size = len(set(text_data))
    diversity_score = vocab_size / word_count
    return diversity_score

print(lexical_diversity(meditations))
# (Fragment: the signature below is the tail of a lemmatizing helper whose
# name and leading parameters fall outside this excerpt.)
                  tokenizer=word_tokenize, postagger=pos_tag,
                  lemmatizer=wnl, stemmer=porter):
    words, lemmas, poss = [], [], []
    for word, pos in postagger(sentence):
        pos = penn2morphy(pos)
        lemmas.append(lemmatize(word.lower(), pos, neverstem,
                                lemmatizer, stemmer))
        poss.append(pos)
        words.append(word)
    if keepWordPOS:
        return words, lemmas, [None if i == '' else i for i in poss]
    return lemmas

regex = re.compile('[_]+')
for f in corpus.fileids():
    outname = args.preprocess + "/" + f + ".out"
    fout = open(outname, "w", encoding="utf8")
    splitter = nltk.data.load('tokenizers/punkt/english.pickle')
    for sent in corpus.sents(f):
        s = []
        for w in sent:
            w = regex.sub('', w).lower()
            if (len(w) > 2 and w not in stop_words
from nltk.stem.porter import *
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
#from modifiedtexttiling import TextTilingTokenizer
import modifiedtexttiling

# Input: all the documents with preprocessed text.
corpusdir = '/home/abc/Desktop/adm/new_dataset'
# Output: all the segmented documents, with their corresponding document
# names as prefix.
corpusdir_p = '/home/abc/Desktop/adm/segments'

newcorpus = PlaintextCorpusReader(corpusdir, '.*')
# Sort all the document names alphabetically.
sortedall = sorted(newcorpus.fileids())
#print sortedall

for filename in sortedall:
    # Open each document.
    fp = open(corpusdir + "/" + filename)
    print 'processing : ' + filename
    # Save the document text as a string.
    n = fp.read()
    # Create a TextTilingTokenizer() object.
    t = modifiedtexttiling.TextTilingTokenizer()
    # Get the segments as a list of strings.
    k = t.tokenize(n)
import datetime
import nltk
from nltk import word_tokenize
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.corpus import floresta, mac_morpho
from parser_portuguese_risk import evaluateModel, splitTrainTestModel, simplify_tag

time1 = datetime.datetime.now()

###############################################################################
### ATTENTION: if there are tmp files like .DS_STORE in Mac OS X, remove them first.

# Reading corpus
corpusdir = '/Users/marceloschiessl/RDF_text_project/corpus/WikiRisk/test/glossAnnotated/'  # directory of corpus
#corpusdir = '/Users/marceloschiessl/RDF_text_project/corpus/WikiRisk/test/test1/'
risco = PlaintextCorpusReader(corpusdir, '.*')
risco.fileids()

raw_text = risco.raw('gloss533.txt')
#print raw_text[0:]

# Some statistics
print 'Number of terms: ', len(risco.words())
print 'Number of unique terms: ', len(set(risco.words()))

fd = nltk.FreqDist(risco.words())
print fd.freq('bem')
print fd['bem']

# Presenting ngrams of the target term
target_word = 'bem como'
import os
import word, pdf
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

def getText(txtFileName):
    file = open(txtFileName, 'r')
    return file.read()

# Create the new corpus directory.
newCorpusDir = 'mycorpus/'
if not os.path.isdir(newCorpusDir):
    os.mkdir(newCorpusDir)

txt1 = getText('sample_feed.txt')
txt2 = pdf.getTextPDF('sample-pdf.pdf')
txt3 = word.getTextWord('sample-one-line.docx')

# Write the contents of the three string objects to files on disk (write mode).
files = [txt1, txt2, txt3]
for idx, f in enumerate(files):
    with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
        fout.write(f)

# Create a plaintext corpus object from the directory the files were saved to.
newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')
print(newCorpus.words())                        # print all the words of 0.txt
print(newCorpus.sents(newCorpus.fileids()[1]))  # print the sentences of 1.txt
print(newCorpus.paras(newCorpus.fileids()[0]))  # print the paragraphs of 0.txt
def try_out_some_functionalities():
    corpusdir = "/media/benzro/OS/Users/benzro/Desktop/Studium Uni/2)" \
                "ZweitesSemester/27)PCL-2/Uebungen/Uebung03/Enron/test/"
    newcorpus = PCR(corpusdir, '.*')
    sep = "~" * 58

    print sep
    print "access one file in the corpus"
    print sep
    infile = corpusdir + "0001.1999-12-10.farmer.ham.txt"
    infile = "0004.1999-12-14.farmer.ham.txt"  # fileids are relative to corpusdir
    fin = newcorpus.open(infile)
    print fin.read().strip()

    print sep
    print "all file ids"
    print sep
    print newcorpus.fileids()

    print sep
    print "access each file in the corpus"
    print sep
    # (reduced output: [0:2])
    for infile in sorted(newcorpus.fileids()):
        print sep
        print infile                  # the fileid of each file
        fin = newcorpus.open(infile)  # opens the file
        print fin.read().strip()      # prints the content of the file

    print sep
    print "access the plaintext; outputs pure string of all files"
    print sep
    print newcorpus.raw().strip()

    print sep
    print "Access paragraphs in the corpus. (list of list of list of strings)"
    print sep
    # NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
    # nltk.tokenize.word_tokenize.
    #
    # Each element in the outermost list is a paragraph, and
    # each paragraph contains sentence(s), and
    # each sentence contains token(s).
    print newcorpus.paras()

    print sep
    print "To access paragraphs of a specific fileid."
    print sep
    print newcorpus.paras(newcorpus.fileids()[0])

    print sep
    print "Access sentences in the corpus. (list of list of strings)"
    print sep
    # NOTE: the texts are flattened into sentences that contain tokens.
    print newcorpus.sents()

    print sep
    print "To access sentences of a specific fileid."
    print sep
    print newcorpus.sents(newcorpus.fileids()[0])

    print sep
    print "Access just tokens/words in the corpus. (list of strings)"
    print sep
    print newcorpus.words()

    print sep
    print "To access tokens of a specific fileid."
    print sep
    print newcorpus.words(newcorpus.fileids()[0])
    print sep
OUTPUT_SIGNATURE = "file*.lemmatized"

import nltk
import os
import glob
from os.path import join
from nltk.collocations import *
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# Read in the corpus and find all the 3-grams above the minimum frequency.
print "Reading in corpus from", CORPUS_ROOT
my_corpus = PlaintextCorpusReader(CORPUS_ROOT, CORPUS_EXTENSION)
print "Read in " + str(len(my_corpus.fileids())) + " files"
print "Finding 3-grams"
finder_3gram = TrigramCollocationFinder.from_words(my_corpus.words())
print "Filtering out 3-grams of frequency less than", MIN_FREQUENCY
finder_3gram.apply_freq_filter(MIN_FREQUENCY)

# Combine all the 3-grams meeting the PMI threshold.
print "Looking for 3-grams with a PMI of at least", MIN_3GRAM_PMI
filelist = [f for f in glob.glob(CORPUS_ROOT + CORPUS_OUTPUT_EXTENSION)]
gen = finder_3gram.above_score(trigram_measures.pmi, MIN_3GRAM_PMI)
processGrams(gen, filelist)

# Now let's do the same for the 2-grams.
# Our previous step altered the corpus, so let's read it in again.
print "Reading in corpus from", CORPUS_ROOT
robotStoryCorpusDir = '../resources/robot_stories'
childStoryCorpus = PlaintextCorpusReader(childStoryCorpusDir, r".*\.txt")
robotStoryCorpus = PlaintextCorpusReader(robotStoryCorpusDir, r".*\.txt")

# Average word length, average sentence length, and the number of times each
# vocabulary item appears in the text on average (our lexical diversity score).
# for fileid in childStoryCorpus.fileids():
#     num_chars = len(childStoryCorpus.raw(fileid))
#     num_words = len(childStoryCorpus.words(fileid))
#     num_sents = len(childStoryCorpus.sents(fileid))
#     num_vocab = len(set([w.lower() for w in childStoryCorpus.words(fileid)]))
#     print((float(num_chars) / float(num_words),
#            float(num_words) / float(num_sents),
#            float(num_words) / float(num_vocab), fileid))

for fileid in childStoryCorpus.fileids():
    print(fileid)
    file_path = os.path.join(childStoryCorpusDir, fileid)
    with open(file_path, 'r') as orgf:
        for line in orgf:
            for s in tokenize.sent_tokenize(line):
                print(s)
                #print(st.tag(tokenize.word_tokenize(s)))
                #print(st.tag(s.split()))
                print(list(parser.raw_parse(s)))
                # for line in parser.raw_parse(s):
                #     for sentence in line:
                #         sentence.draw()
with open(corpusdir + str(filename) + '.txt', 'w') as fout:
    print(text, file=fout)

# Check that our corpus does exist and the files are correct.
assert os.path.isdir(corpusdir)
for infile, text in zip(sorted(os.listdir(corpusdir)), corpus):
    assert open(corpusdir + infile, 'r').read().strip() == text.strip()

# Create a new corpus by specifying the parameters
# (1) directory of the new corpus
# (2) the fileids of the corpus
# NOTE: in this case the fileids are simply the filenames.
newcorpus = PlaintextCorpusReader('newcorpus/', '.*')

# Access each file in the corpus.
for infile in sorted(newcorpus.fileids()):
    print(infile)  # the fileid of each file
    with newcorpus.open(infile) as fin:  # opens the file
        print(fin.read().strip())        # prints the content of the file

# # Access the plaintext; outputs pure string/basestring.
# print(newcorpus.raw().strip())
#
# # Access paragraphs in the corpus. (list of list of list of strings)
# # NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
# # nltk.tokenize.word_tokenize.
# #
# # Each element in the outermost list is a paragraph, and
# # each paragraph contains sentence(s), and
# # each sentence contains token(s).
# print(newcorpus.paras())
import sys
import random
import nltk
from nltk.text import *
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import wsd  # local module providing the collocation and decision-list classes

if len(sys.argv) != 4:
    print "Usage:", sys.argv[0], "word sense1 sense2"
    exit(-1)

focal_word = sys.argv[1]
senses = [sys.argv[2], sys.argv[3]]
#focal_word = "plant"
#senses = ["manufacturing", "life"]

corpus = PlaintextCorpusReader('outcorpus/', '.*')
collocations = [
    wsd.BigramLeft(senses, 0),
    wsd.BigramRight(senses, 1),
    wsd.BigramScope(senses, 2, [2, 10])
]
decision_list = wsd.DecisionList()
decision_list.load("senses_bootstrap_" + focal_word + ".csv")

corpus_ids = corpus.fileids()
random.shuffle(corpus_ids)

num_words = 1
num_words_max = 100
tagged = 0
ambiguous = 0
unknown = 0

for infile in corpus_ids:
    if num_words > num_words_max:
        break
    words = corpus.words(infile)
    text = Text(words)
    c = nltk.ConcordanceIndex(text.tokens)
    offsets = c.offsets(focal_word)
# Access the plaintext; outputs pure string/basestring.
print newcorpus.raw().strip()
print

# Access paragraphs in the corpus. (list of list of list of strings)
# NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
# nltk.tokenize.word_tokenize.
#
# Each element in the outermost list is a paragraph, and
# each paragraph contains sentence(s), and
# each sentence contains token(s).
print newcorpus.paras()
print

# To access paragraphs of a specific fileid.
print newcorpus.paras(newcorpus.fileids()[0])

# Access sentences in the corpus. (list of list of strings)
# NOTE: the texts are flattened into sentences that contain tokens.
print newcorpus.sents()
print

# To access sentences of a specific fileid.
print newcorpus.sents(newcorpus.fileids()[0])

# Access just tokens/words in the corpus. (list of strings)
print newcorpus.words()

# To access tokens of a specific fileid.
print newcorpus.words(newcorpus.fileids()[0])
class Contract_Reader():
    def __init__(self, config):
        print('Filepath for texts = ', config.textpath)
        self.corpus = PCR(config.textpath, r'.*\.txt',
                          encoding='utf-16',
                          para_block_reader=read_line_block)
        if config.clean_paragraphs == 'yes':
            self.clean(config, mode='para')
        if config.clean_sentences == 'yes':
            self.clean(config, mode='sent')
        # Corpus summaries
        self.corpus_info()
        self.LDA(config.num_topics, config.num_words)
        self.plot(config.num_words)

    def clean(self, config, mode='sent'):
        stop = set(stopwords.words('english'))
        exclude = set(string.punctuation)
        lemma = WNL()
        if mode == 'para':
            # Paragraphs are lists of sentences, each of which is a list of
            # tokens. Reduce each paragraph to a single list of tokens.
            self.para_list = [
                list(itertools.chain.from_iterable(para))
                for para in self.corpus.paras()
            ]
            for index, paragraph in enumerate(self.para_list):
                paragraph = " ".join(paragraph)
                stop_free = " ".join(
                    [i for i in paragraph.lower().split() if i not in stop])
                punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
                normalized = " ".join(
                    lemma.lemmatize(word) for word in punc_free.split())
                self.para_list[index] = normalized
            print(self.para_list[0])
            self.para_list = [para.split() for para in self.para_list]
            print(self.para_list[0])
        if mode == 'sent':
            # Obtain a list of strings, each one a sentence, rather than a
            # list of lists.
            self.sents_list = [" ".join(sent) for sent in self.corpus.sents()]
            for index, sentence in enumerate(self.sents_list):
                stop_free = " ".join(
                    [i for i in sentence.lower().split() if i not in stop])
                punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
                normalized = " ".join(
                    lemma.lemmatize(word) for word in punc_free.split())
                self.sents_list[index] = normalized
            print(self.sents_list[0])
            self.sents_list = [sentence.split() for sentence in self.sents_list]
            print(self.sents_list[0])

    def LDA(self, num_topics, num_words):
        dictionary = corpora.Dictionary(self.para_list)
        doc_term_matrix = [dictionary.doc2bow(para) for para in self.para_list]
        path = '/mnt/APPDATA/Project_Mafia/omkhalil/vowpal_binaries/vw-7.20150623'
        self.ldamodel = LdaVowpalWabbit(path, doc_term_matrix,
                                        num_topics=num_topics,
                                        id2word=dictionary)
        self.ldamodel.save('model/lda_model')
        print(self.ldamodel.print_topics(num_topics=10, num_words=num_words))

    def plot(self, num_words):
        for t in range(self.ldamodel.num_topics):
            plt.figure()
            tuples = [reversed(x) for x in self.ldamodel.show_topic(t, num_words)]
            plt.imshow(WordCloud().fit_words(dict(tuples)))
            plt.axis("off")
            plt.title("Topic #" + str(t))
            plt.savefig('plots/topic' + str(t))

    def corpus_info(self):
        """
        Summary information about the status of a corpus.
        """
        fids = len(self.corpus.fileids())
        paras = len(self.corpus.paras())
        sents = len(self.corpus.sents())
        sperp = sum(len(para) for para in self.corpus.paras()) / float(paras)
        tokens = FreqDist(self.corpus.words())
        count = sum(tokens.values())
        vocab = len(tokens)
        lexdiv = float(count) / float(vocab)
        print((
            "Text corpus contains {} files\n"
            "Composed of {} paragraphs and {} sentences.\n"
            "{:0.3f} sentences per paragraph\n"
            "Word count of {} with a vocabulary of {}\n"
            "lexical diversity is {:0.3f}"
        ).format(fids, paras, sents, sperp, count, vocab, lexdiv))
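# A minimal sketch of driving Contract_Reader, assuming a simple namespace
# config; every attribute value below is hypothetical:
# from types import SimpleNamespace
# config = SimpleNamespace(textpath='contracts/', clean_paragraphs='yes',
#                          clean_sentences='no', num_topics=10, num_words=20)
# reader = Contract_Reader(config)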
print(a)

from scipy.spatial.distance import cosine
print(cosine(dtm[0].toarray(), dtm[1].toarray()))

from sklearn.feature_extraction.text import TfidfVectorizer
tfid_vectors = TfidfVectorizer()
tfid_vectors = tfid_vectors.fit_transform([sent1, sent2])
print(pd.DataFrame(data=tfid_vectors.toarray()))
a1 = pairwise_distances(tfid_vectors[0].toarray(),
                        tfid_vectors[1].toarray(), metric='cosine')
print(a1)

print("________________Tf-idf corpus reader__________________________")
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
path = "./text_docs/"
president_corpus = PlaintextCorpusReader(path, ".*", encoding="utf-8")
tfid_vectors_corpus = TfidfVectorizer(input='filename')
files = [path + filename for filename in list(president_corpus.fileids())]
tf_idf_matrix = tfid_vectors_corpus.fit_transform(raw_documents=files)
barack = tf_idf_matrix.toarray()[0]
bush = tf_idf_matrix.toarray()[1]
trump = tf_idf_matrix.toarray()[2]
print(cosine(barack, bush))
print(cosine(bush, trump))
print(cosine(trump, barack))
    return file.read()

# Create the corpus folder.
newCorpusDir = 'mycorpus/'
if not os.path.isdir(newCorpusDir):  # does the corpus folder already exist?
    os.mkdir(newCorpusDir)

# Read the files.
# Plain text file
txt1 = getText('./Files/sample_feed.txt')
# PDF file
txt2 = pdf.getTextPDF('./Files/sample-pdf.pdf')
# DOCX file
txt3 = word.getTextWord('./Files/sample-one-line.docx')

# Write the files.
files = [txt1, txt2, txt3]
for idx, f in enumerate(files):
    with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
        fout.write(f)

# Build the custom corpus:
# read every file in the folder and create a corpus from them.
newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')

# Check that the custom corpus was built correctly.
print(newCorpus.words())                        # array of every word in the corpus
print(newCorpus.sents(newCorpus.fileids()[1]))  # array of the sentences in 1.txt
print(newCorpus.paras(newCorpus.fileids()[0]))  # array of the paragraphs in 0.txt
# (Tail of a helper that strips punctuation: if punctuation
# marks occur, replace them with the empty string.)
    for x in string.lower():
        if x in punctuations:
            string = string.replace(x, "")
    return string

debug = None
big_paras = []
print_timestamp('\n' * 3 + 'End')
print_timestamp('\n' * 3 + 'Begin')

corpusdir = '..\\Thinkful\\Datafiles\\UnsupervisedLearningCapstone\\fiction_corpus\\'
fiction_corpus = PlaintextCorpusReader(corpusdir, r'.*\.txt')
documents_stat = fiction_corpus.fileids()
if debug:
    print("documents_stat={} and is a {} datatype".format(
        documents_stat, type(documents_stat)))
documents_stat_0 = []
# documents_stat_0.append(documents_stat[0])
if debug:
    print("documents_stat_0 is a {} datatype".format(type(documents_stat_0)))

item_num = 0
book_block = []
word_counts = {}
for book in documents_stat:
    item_num += 1
'''
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
import string
import csv
from fileinput import filename

corpusdir = 'C:/Users/Advaith GVK/workspace/Trial/src/Pack/New folder'  # directory of corpus
newcorpus = PlaintextCorpusReader(corpusdir, '.*')
filenames = newcorpus.fileids()
# print newcorpus.sents()

def getWordNetType(tag):
    # Map a Penn Treebank POS tag to the corresponding WordNet POS constant.
    #print tag
    if tag in ['JJ', 'JJR', 'JJS']:
        return wn.ADJ
    elif tag in ['NN', 'NNS', 'NNP', 'NNPS', 'POS', 'FW']:
        return wn.NOUN
    elif tag in ['RB', 'RBR', 'RBS', 'WRB']:
        return wn.ADV
    elif tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
        return wn.VERB
    return wn.NOUN
import os
import sys
import operator
import numpy as np
import gensim
from gensim import corpora, models
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from stop_words import get_stop_words

# Corpus of segments (Python 2 idiom: set the default encoding for these files).
reload(sys)
sys.setdefaultencoding('Cp1252')

corpusdir = '/home/abc/Desktop/adm/segments'
newcorpus = PlaintextCorpusReader(corpusdir, '.*')
sortedall = sorted(newcorpus.fileids())

tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stop_words('en')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

doc_a = "/home/rashmi/Documents/adm_project/appended/0.txt"
doc_b = "/home/rashmi/Documents/adm_project/appended/1.txt"
doc_c = "/home/rashmi/Documents/adm_project/appended/2.txt"
doc_d = "/home/rashmi/Documents/adm_project/appended/3.txt"
doc_e = "/home/rashmi/Documents/adm_project/appended/4.txt"

# compile sample documents into a list
    if verbose and j % inc == 0:
        print('Progress:', j, '/', jobs)
    return term_yn

# Preprocess the text here.
def preprocess(t):
    rem_chars = "[!\"#$%&()*+,:;<=>?@[\\]^_`{|}~0123456789]"  # remove these
    rep_chars = "[-./\']"                                     # replace these
    t_temp = re.sub(rem_chars, "", t.lower())
    t_temp = re.sub(rep_chars, " ", t_temp)
    t_strip_lower_filt = [w for w in t_temp.split()
                          if w not in stopwords.words('english')]
    return " ".join(t_strip_lower_filt)

# Load the data.
corpusdir = 'corpus_txt/'  # directory of corpus
mycorp_raw = PlaintextCorpusReader(corpusdir, '.*')
file_index = mycorp_raw.fileids()

# Preprocess the text (slow).
# Uncomment one of the following lines for serial vs parallel processing.
#mycorp_proc = nltk.Text([preprocess(mycorp_raw.raw(f)) for f in file_index])
mycorp_proc = Parallel(n_jobs=3, verbose=True)(
    delayed(preprocess)(mycorp_raw.raw(f)) for f in file_index)

# Get ngrams (1-3).
vectorizer_ngrams = CountVectorizer(min_df=0.05, ngram_range=(1, 3))
mat_ngrams = vectorizer_ngrams.fit_transform(mycorp_proc)
n_df = pd.DataFrame(data=mat_ngrams.A,
                    columns=vectorizer_ngrams.get_feature_names())
n_df['pt_id'] = [i[:-4] for i in file_index]

# Write results to file.
n_df.to_csv('ngrams_dtm.csv', index=False)
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk import word_tokenize
import re

corpusdir = 'python/'  # directory of corpus
newcorpus = PlaintextCorpusReader(corpusdir, '.*')
print(newcorpus.fileids()[0])
print(type(newcorpus))
#print(newcorpus.raw())
print(newcorpus.words(newcorpus.fileids()[0]))
print(len(newcorpus.words()))

tokens = word_tokenize(newcorpus.raw())
print(len(tokens))
print(tokens[:50])
print(newcorpus.sents())
print()

# To remove comments:
def removeComments(string):
    # Remove all occurrences of streamed comments (/* COMMENT */) from string.
    string = re.sub(re.compile(r"/\*.*?\*/", re.DOTALL), "", string)
    # Remove all occurrences of single-line comments (// COMMENT\n) from string.
    string = re.sub(re.compile(r"//.*?\n"), "", string)
    return string

# removeComments expects a string, so pass the raw corpus text.
print(removeComments(newcorpus.raw()))
#     text_list.append(text)

# preprocessed_docs = []
# for n, t in enumerate(text_list):
#     # print sample of text before and after processing
#     #if n == (len(text_list) - 1):
#     #    print(("Doc {} (before preproc): {}").format(n, t))
#     #    print(("Doc {}: {}").format(n, p))
#     p = preprocess(t)
#     preprocessed_docs.append(p)
# print("Preprocessed docs len:", len(text_list))

texts = PlaintextCorpusReader(d, r".*\.txt")
boc_texts = [extract(texts.raw(fileid)) for fileid in texts.fileids()]
dictionary = gensim.corpora.Dictionary(boc_texts)
#dictionary = gensim.corpora.Dictionary(preprocessed_docs)
#dictionary.filter_extremes(no_below=10, no_above=.5, keep_n=100000)
#bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_docs]
bow_corpus = [dictionary.doc2bow(boc_text) for boc_text in boc_texts]
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
fileids = texts.fileids()
for idx, doc in enumerate(corpus_tfidf):
    new_file.write("Document '{}' key phrases:\n".format(fileids[idx]))
    # Get top 100 terms by TF-IDF score
    for wid, score in heapq.nlargest(100, doc, key=itemgetter(1)):
CORPUS_EXTENSION = r'.*\.txt'

import nltk
import os
from os import listdir
from os.path import isfile, join
from nltk.collocations import *
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# Read in the corpus and find all the 3-grams above the minimum frequency.
print "Reading in corpus from", CORPUS_ROOT
my_corpus = PlaintextCorpusReader(CORPUS_ROOT, CORPUS_EXTENSION)
print "Read in " + str(len(my_corpus.fileids())) + " files"
print "Finding 3-grams"
finder_3gram = TrigramCollocationFinder.from_words(my_corpus.words())
print "Filtering out 3-grams of frequency less than", MIN_FREQUENCY
finder_3gram.apply_freq_filter(MIN_FREQUENCY)

# Combine all the 3-grams meeting the PMI threshold.
print "Looking for 3-grams with a PMI of at least", MIN_3GRAM_PMI
filelist = [join(CORPUS_ROOT, f) for f in listdir(CORPUS_ROOT)
            if isfile(join(CORPUS_ROOT, f))]
gen = finder_3gram.above_score(trigram_measures.pmi, MIN_3GRAM_PMI)
processGrams(gen, filelist)

# Now let's do the same for the 2-grams.
# Our previous step altered the corpus, so let's read it in again.
print "Reading in corpus from", CORPUS_ROOT
my_corpus = PlaintextCorpusReader(CORPUS_ROOT, CORPUS_EXTENSION)
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

# Load the file you want to train on.
corpusdir = r'E:\MTech'  # directory of corpus
newcorpus = PlaintextCorpusReader(corpusdir, '.*')
print(newcorpus.fileids())

dictlist = []
# Convert each word/tag pair to a two-tuple, i.e. [(word1, tag), (word2, tag)].
for i in newcorpus.fileids():
    tagged_sent = newcorpus.raw(i)
    tagged = tagged_sent.split()
    for t in tagged:
        temp1 = nltk.tag.str2tuple(t)
        dictlist.append(temp1)
print(dictlist)
print("This is the number of distinct word/tag pairs")
print(len(set(dictlist)))

fdist = nltk.FreqDist(dictlist)
print("fdist items")
print(fdist.items())
print(fdist.max())

rawtext = '''
'''
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk import download
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

nltk.download('wordnet')
download('punkt')
download('stopwords')
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

############################ Read Data #############################
# If you run this code with ipython you should enter the address of
# your file instead of './txts'.
corpus_directory = './txts'
textsfile = PlaintextCorpusReader(corpus_directory, '.*')
ID_files = textsfile.fileids()
print(ID_files, len(ID_files))

######################### Preprocessing Data ########################
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

Index_of_files = []
texts = []
count = 0

# file with file_ids
for fileid in ID_files:
    texts.append(textsfile.raw(fileids=fileid))
    Index_of_files.append(fileid)
if len(sys.argv) != 4:
    print "Usage:", sys.argv[0], "word sense1 sense2"
    exit(-1)

focal_word = sys.argv[1]
senses = [sys.argv[2], sys.argv[3]]
#focal_word = "plant"
#senses = ["manufacturing", "life"]

corpus = PlaintextCorpusReader('outcorpus/', '.*')
collocations = [
    wsd.BigramLeft(senses, 0),
    wsd.BigramRight(senses, 1),
    wsd.BigramScope(senses, 2, [2, 10])
]
decision_list = wsd.DecisionList()
decision_list.load("senses_bootstrap_" + focal_word + ".csv")

i = 0
for infile in sorted(corpus.fileids()):
    print i, "/", len(corpus.fileids())
    i += 1
    words = corpus.words(infile)
    text = Text(words)
    c = nltk.ConcordanceIndex(text.tokens)
    offsets = c.offsets(focal_word)
    for offset in offsets:
        for collocation in collocations:
            tokens = collocation.get_collocation(text, offset)
            if tokens is None:
                continue
            sense = decision_list.get_sense(tokens, collocation.index)
            if sense is None:
                continue
            collocation.add_collocation(text, offset, sense)
import glob
import os
import string
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
from scipy.cluster.hierarchy import ward, dendrogram
from sklearn.manifold import MDS
import matplotlib.pyplot as plt
import scipy.stats as stats

names = []
corpus = []
co = PlaintextCorpusReader("./election", r".*\.txt")
for fileids in co.fileids():
    names.append(fileids)
    corpus.append(co.raw(fileids))

print len(names), 'documents in the corpus'
print names[:30]

for idx in range(len(corpus) - 1, -1, -1):
    print
    print names[idx]
    print corpus[idx][:70].replace('\n', ' ')

vectorizer = TfidfVectorizer(stop_words='english', min_df=2)
dtm = vectorizer.fit_transform(corpus)
print dtm.shape
vocab = vectorizer.get_feature_names()
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from decimal import Decimal
from math import pi

if __name__ == '__main__':
    # Raw string avoids backslash sequences like \U being parsed as escapes.
    ptcr = PlaintextCorpusReader(r'C:\Users\Jakub\Downloads\pr4\Trzeci plik',
                                 ['znormalizowane.txt', 'katy.txt'])
    data = []
    t = ptcr.raw(fileids=ptcr.fileids()[1]).replace(',', '.').replace('\r', '').split('\n')
    t.remove('')
    for x in t:
        data.append(float(Decimal(x) * 360 / 315))
    print data

    data_ = []
    t = ptcr.raw(fileids=ptcr.fileids()[0]).replace(',', '.').replace('\r', '').split('\n')
    t.remove('')
    for x in t:
        data_.append(float(x) / 100)
    print data_
with open(corpusdir + str(filename) + '.txt', 'w') as fout:
    print >> fout, text

# Check that our corpus does exist and the files are correct.
assert os.path.isdir(corpusdir)
for infile, text in zip(sorted(os.listdir(corpusdir)), corpus):
    assert open(corpusdir + infile, 'r').read().strip() == text.strip()

# Create a new corpus by specifying the parameters
# (1) directory of the new corpus
# (2) the fileids of the corpus
# NOTE: in this case the fileids are simply the filenames.
newcorpus = PlaintextCorpusReader('newcorpus/', '.*')

# Access each file in the corpus.
for infile in sorted(newcorpus.fileids()):
    print infile  # the fileid of each file
    with newcorpus.open(infile) as fin:  # opens the file
        print fin.read().strip()         # prints the content of the file
    print

# Access the plaintext; outputs pure string/basestring.
print newcorpus.raw().strip()
print

# Access paragraphs in the corpus. (list of list of list of strings)
# NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
# nltk.tokenize.word_tokenize.
#
# Each element in the outermost list is a paragraph, and
# each paragraph contains sentence(s), and
# 150 common_words x 6 fileids x sentences (937 total) x words,
# plus append/join of each sentence into a string and 6 searches per fileid.
from nltk.tokenize.treebank import TreebankWordDetokenizer

common_words = dict(freq.most_common(150))
print(common_words)

data = []
hmap = {}
detokenized = {}
for word, frequency in common_words.items():
    datum = {'word': word, 'frequency': frequency}
    docs = []
    sents = []
    for key, fileid in enumerate(corpus.fileids()):
        if key not in hmap:
            hmap[key] = {}
        for s_id, sentence in enumerate(corpus.sents(fileid)):
            # Cache the lemmatized sentence so repeated lookups are cheap.
            if key in hmap and s_id in hmap[key]:
                words = hmap[key][s_id]
            else:
                words = [lemmatizer.lemmatize(w.lower()) for w in sentence]
                hmap[key][s_id] = words
            if word in words:
                s_key = f'{key}-{s_id}'
                sent = ''
                if s_key in detokenized:
import os
import word, pdf
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

def getText(txtFileName):
    file = open(txtFileName, 'r')
    return file.read()

newCorpusDir = 'mycorpus/'
if not os.path.isdir(newCorpusDir):
    os.mkdir(newCorpusDir)

txt1 = getText('sample_feed.txt')
txt2 = pdf.getTextPDF('sample-pdf.pdf')
txt3 = word.getTextWord('sample-one-line.docx')

files = [txt1, txt2, txt3]
for idx, f in enumerate(files):
    with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
        fout.write(f)

newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')
print(newCorpus.words())
print(newCorpus.sents(newCorpus.fileids()[1]))
print(newCorpus.paras(newCorpus.fileids()[0]))
    boc_texts = [
        extract(texts.raw(fileid))
        for fileid in texts.fileids()
    ]
    # make gensim dictionary and corpus
    dictionary = gensim.corpora.Dictionary(boc_texts)
    corpus = [dictionary.doc2bow(boc_text) for boc_text in boc_texts]
    # transform corpus with tf*idf model
    tfidf = gensim.models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    return corpus_tfidf, dictionary

# Can change this to output just keywords by commenting out the 1st, 2nd,
# and 4th "new_file.write" lines.
if __name__ == '__main__':
    tfidfs, id2word = score_keyphrases_by_tfidf(texts)  #, 'words')
    fileids = texts.fileids()
    # Print top keywords by TF-IDF
    for idx, doc in enumerate(tfidfs):
        new_file.write("Document '{}' key phrases:\n".format(fileids[idx]))
        # Get the top keyphrase_num terms by TF-IDF score
        for wid, score in heapq.nlargest(keyphrase_num, doc, key=itemgetter(1)):
            new_file.write("{:0.3f}: {}\n".format(score, id2word[wid]))
            #new_file.write("{}\n".format(id2word[wid]))
        new_file.write("\n")
    print("Done! Look for {} in the 'Classifier' directory".format(new_file_name))
def create_content(gdocs, graphicsdir, gcontent):
    for file in gdocs:
        gcontent.append(open(graphicsdir + '/' + str(file), 'r').read())

# Define the directory path for each category.
graphicsdir = '20news-bydate/train/comp.graphics'
autosdir = '20news-bydate/train/rec.autos'
gunsdir = '20news-bydate/train/talk.politics.guns'
graphicstest = '20news-bydate/test/comp.graphics'
autostest = '20news-bydate/test/rec.autos'
gunstest = '20news-bydate/test/talk.politics.guns'

graphicscorpus = PlaintextCorpusReader(graphicsdir, '.*')
autoscorpus = PlaintextCorpusReader(autosdir, '.*')
gunscorpus = PlaintextCorpusReader(gunsdir, '.*')
graphicscorpustest = PlaintextCorpusReader(graphicstest, '.*')
autoscorpustest = PlaintextCorpusReader(autostest, '.*')
gunscorpustest = PlaintextCorpusReader(gunstest, '.*')

# Initialize the lists.
gdocs, adocs, ndocs = [], [], []
gcontent, acontent, ncontent = [], [], []
gwords, awords, nwords, vocab = [], [], [], []
gtdocs, atdocs, ntdocs = [], [], []
gtcontent, atcontent, ntcontent = [], [], []
gtwords, atwords, ntwords, vtocab = [], [], [], []

# Train dataset:
gdocs.extend(graphicscorpus.fileids())       # graphics category
adocs.extend(autoscorpus.fileids())          # autos category
ndocs.extend(gunscorpus.fileids())           # guns category
# Test dataset:
gtdocs.extend(graphicscorpustest.fileids())  # graphics category
atdocs.extend(autoscorpustest.fileids())     # autos category
ntdocs.extend(gunscorpustest.fileids())      # guns category

# Retrieve the contents for each category.
# Train dataset:
create_content(gdocs, graphicsdir, gcontent)
create_content(adocs, autosdir, acontent)
create_content(ndocs, gunsdir, ncontent)
# Test dataset:
create_content(gtdocs, graphicstest, gtcontent)
create_content(atdocs, autostest, atcontent)
create_content(ntdocs, gunstest, ntcontent)
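# Hypothetical next step, not part of the original snippet: the per-category
# content lists can feed a simple bag-of-words baseline with scikit-learn.
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer = CountVectorizer(stop_words='english')
# X_train = vectorizer.fit_transform(gcontent + acontent + ncontent)
# y_train = [0] * len(gcontent) + [1] * len(acontent) + [2] * len(ncontent)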