def load_corpus(race_code=None, gender_code=None):
    # Loads corpora into a list based on race and gender.
    if race_code is None:  # if none is specified, search all
        race_code = ".."
    if gender_code is None:
        gender_code = ".."
    # Uses the filename encoding to load the specified texts.
    reader = PlaintextCorpusReader(
        corpus_root, ".*_" + race_code + "_" + gender_code + r"\.txt")
    corpora = []
    for fileid in reader.fileids():
        # Creates a ComedyCorpus object and populates it with the fileid and name.
        new_corpus = ComedyCorpus()
        new_corpus.set_fileid(fileid)
        try:
            # Gets word content based on the fileid.
            new_corpus.set_text(reader.raw(fileid))
        except UnicodeDecodeError:
            continue
        # The name is the fileid without the _race_gender.txt encoding suffix.
        fileid = re.sub("_" + race_code + "_" + gender_code + r"\.txt", "", fileid)
        fileid = fileid.replace("%20", " ")
        fileid = fileid.replace("_", "; ")
        print(fileid)
        new_corpus.set_name(fileid)
        corpora.append(new_corpus)
    return corpora
def generateNgramModel(corpusPath, corpusName):
    corpusdir = 'corpora/'  # Directory of corpus.
    generatedCorpus = PlaintextCorpusReader(corpusPath, corpusName)
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    # Bigram model. NgramModel is only available in older NLTK releases;
    # it was removed from NLTK 3.
    ngrammodel = NgramModel(2, generatedCorpus.sents(), True, False, estimator)
    return ngrammodel
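# NOTE: NgramModel and the LidstoneProbDist-based estimator are gone from current NLTK.
# The sketch below is a rough substitute using the nltk.lm package, with Laplace
# (add-one) smoothing standing in for the Lidstone estimator above; it is an
# assumption about intent, not a drop-in replacement.
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.lm import Laplace
from nltk.lm.preprocessing import padded_everygram_pipeline

def generate_bigram_lm(corpus_path, corpus_name):
    # Read the corpus and collect its sentences as lists of words.
    corpus = PlaintextCorpusReader(corpus_path, corpus_name)
    sents = [list(sent) for sent in corpus.sents()]
    # Build padded bigram training data plus the vocabulary stream.
    train, vocab = padded_everygram_pipeline(2, sents)
    # Fit a bigram model with Laplace smoothing.
    lm = Laplace(2)
    lm.fit(train, vocab)
    return lm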
def get_phrase():
    root_dir = r'E:\github_repo\python_basic\pythonbasictest\self_nltk\files'
    wordlists = PlaintextCorpusReader(root_dir, ".*")
    x = nltk.Text(wordlists.words("test.txt"))
    print(x)
    print(x.collocations())
def align(filename):
    files = filename.split('(')
    ripe_file = os.path.abspath(files[1])
    raw_file = os.path.abspath(files[0])
    raw_for_nltk = os.path.abspath('data/newcorpus/source.txt')
    with open(files[0]) as f:
        with open(raw_for_nltk, "w") as f1:
            for line in f:
                f1.write(line)
    corpusdir = 'data/newcorpus/'
    newcorpus = PlaintextCorpusReader(
        corpusdir, '.*',
        sent_tokenizer=nltk.data.LazyLoader('tokenizers/punkt/german.pickle'))
    out = open(ripe_file, "w")
    i = 0
    temp = []
    temp.append(newcorpus.sents(raw_for_nltk))
    tempVal = str(temp[i])
    tempVal = tempVal.replace(",", "")
    tempVal = tempVal.replace("u'", "")
    tempVal = tempVal.replace("'", "")
    tempVal = tempVal.replace("[", "")
    tempVal = tempVal.replace("]", "")
    out.write(tempVal + os.linesep)
    out.close()
    return
def read_corpus(corpus_path):
    from nltk.corpus.reader.plaintext import PlaintextCorpusReader
    corpus = PlaintextCorpusReader(corpus_path, r".*\.txt")
    ctext = corpus.raw()
    # with open('corpus.txt', 'w') as cf:
    #     cf.write(ctext.encode('utf-8'))
    return ctext
def generate_words_grammar():
    """
    Use sentence grammar to find words that could be Rent lyrics
    :return:
    """
    # Load corpora to look in
    gentrification = PlaintextCorpusReader('corpus', '.*')  # Gentrification articles are in this directory
    gentrify_sents = gentrification.sents()
    wine_sents = nltk.corpus.webtext.sents('wine.txt')
    corpus_sents = gentrify_sents + wine_sents
    syls_1 = []
    syls_2 = []
    syls_4 = []
    syls_2_sing = []
    for sent in corpus_sents:
        parsed_sent = nltk.pos_tag(sent)
        for word in parsed_sent:
            no_syls = count_syllables(word[0])
            if word[1] == 'NNS' and len(word[0]) > 3:
                if no_syls == 1:
                    syls_1 = syls_1 + [word[0].lower()]
                elif no_syls == 2:
                    syls_2 = syls_2 + [word[0].lower()]
                elif no_syls == 4:
                    syls_4 = syls_4 + [word[0].lower()]
            if word[1] == 'NN' and len(word[0]) > 2:
                if no_syls == 2:
                    syls_2_sing = syls_2_sing + [word[0].lower()]
    return list(set(syls_1)), list(set(syls_2)), list(set(syls_4)), list(set(syls_2_sing))
def load_feat_data(dir_array):
    data_list = []
    for direct in dir_array:
        data = []
        corpus_dir = 'dataset/' + direct
        corpus = PlaintextCorpusReader(corpus_dir, r'.*\.*')
        file_ids = corpus.fileids()
        for file in file_ids:
            text = corpus.raw(file)
            e = email.message_from_string(text)
            if e.is_multipart():
                # For multipart messages, use the part payloads as the text.
                for payload in e.get_payload():
                    text = payload.get_payload()
            else:
                text = e.get_payload()
            data.append(extract_features(text, corpus, file))
        data_list.extend(data)
    return data_list
def __init__(self, master):
    '''
    Constructor. master is a string naming a directory in this repository
    that contains all the works used for inspiration.
    '''
    self.master = 'masters/' + master
    self.reader = PlaintextCorpusReader(self.master, r'.*', encoding='utf-8')
    self.text = self.reader.words()
def get_corpus_words():
    '''
    Returns all the words from the corpus.
    '''
    reader = PlaintextCorpusReader(settings.CORPUS_ROOT,
                                   settings.CORPUS_FILES_GLOBB)
    if reader:
        return reader.words()
    return []
def cv_to_matrix(self):
    corpusdir = 'data/cv_corpus'
    corpa = PlaintextCorpusReader(corpusdir, '.*', encoding='windows-1252')
    print("Preprocessing words....")
    sents = [[token.lemma_ for token in nlp(" ".join(self.clean(sent)).lower())
              if token.lemma_ not in stopset]
             for sent in corpa.sents()]
    print("training word vectors....")
    model = Word2Vec(sents, window=5, size=self.ncol, min_count=1, workers=4)
    fname = get_tmpfile("vectors.kv")
    model.wv.save(fname)
    print("cv_to_matrix model saved")
    return model.wv
def __init__(self, data_root):
    self.data_root = data_root
    self.data = PlaintextCorpusReader(data_root, '.*')
    self.words = [i for i in self.data.words() if i.isalpha()]
    self.text = Text(self.words)
    self.stop = set(stopwords.words('english')).union({
        'cid', 'et', 'al', 'also', 'and', 'editingboston', 'arxiv', 'pages',
        'trackboston', 'preprint', 'page', 'vol', 'volume', 'march', 'boston',
        'table'
    })
    with open('bib.json') as fi:
        self.bib = json.load(fi)
def build_d2v_model(self):
    print("Starting to build the Doc2Vec model")
    corpusdir = 'data/cv_corpus'
    corpa = PlaintextCorpusReader(corpusdir, '.*', encoding='windows-1252')
    print("tokenizing...")
    resumes = [[token.lemma_ for sent in paras
                for token in nlp(" ".join(self.clean(sent)).lower())
                if token.lemma_ not in stopset]
               for paras in corpa.paras()]
    # print(resumes[0:3])
    print("tokenization completed")
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(resumes)]
    model = Doc2Vec(documents, vector_size=self.cv_length, window=5,
                    min_count=1, workers=4)
    print("Finished building the Doc2Vec model")
    return model
def token_in_coverage(self):
    corpusdir = 'data/cv_corpus'
    corpa = PlaintextCorpusReader(corpusdir, '.*', encoding='windows-1252')
    resumes = [[item for sent in paras for item in sent] for paras in corpa.paras()]
    cpt = 0
    for resume in resumes:
        resume_text = " ".join(resume)
        resume_sents = nltk.sent_tokenize(resume_text)
        # Each sent is already a string, so lemmatize its tokens directly.
        resume_words = set(token.lemma_ for sent in resume_sents
                           for token in nlp(sent.lower()))
        if not resume_words.isdisjoint(self.tokens_in):
            cpt += 1
    coverage = cpt * 1.0 / len(resumes)
    print("token_in coverage : {}".format(coverage))
def get_fileid_lst(source_dir):
    '''
    Use NLTK to pull in the list of file ids in the given source directory

    :param {str} source_dir: The relative path to the source directory that
        contains all the data (book) files
    :return {str} fileid_lst: List of all file ids ending in '.txt' in the source_dir
    '''
    temp_corp = PlaintextCorpusReader(source_dir, r'.*\.txt')
    fileid_lst = temp_corp.fileids()
    return fileid_lst
def main():
    """ Main function of the program """
    corpus_dir = 'NLP_dataset/training_set'  # Directory of corpus.
    new_corpus = PlaintextCorpusReader(corpus_dir, '.*')
    for file_id in new_corpus.fileids():
        file_to_read = open(corpus_dir + "/" + file_id, "r")
        # reading each file to get matched sentences
        matched_sen = match_regular_expressions(file_to_read)
        # writing the matched sentences to files
        write_to_files(matched_sen, file_id)
def create_corpus():
    ## Create a corpus from the abstracts fetched by BIBOT
    ## and return a corpus object.

    ## Read the abstract result file.
    abstract_to_content = {}
    abstract_file = open("fetched/pubmed_abstract.txt", "r")
    for line in abstract_file:
        line = line.replace("\n", "")
        if line[0] == ">":
            abstract = line[1:]
            abstract_to_content[abstract] = ""
        else:
            content = line
            abstract_to_content[abstract] = content
    abstract_file.close()

    ## Create one file per abstract.
    for key in abstract_to_content.keys():
        text_file = open("fetched/corpus/" + str(key) + ".txt", "w")
        text_file.write(abstract_to_content[key])
        text_file.close()

    ## Build the NLTK corpus.
    corpusdir = 'fetched/corpus/'
    newcorpus = PlaintextCorpusReader(corpusdir, '.*')
    return newcorpus
def getCorupsFromCorpusFile(CorpusFile):
    CorpusDir, CorpusFile = os.path.split(CorpusFile)
    corpus = PlaintextCorpusReader(CorpusDir, CorpusFile)
    return corpus
def load_corpus(self):
    if len(self.corpus) == 0:
        raise Exception('No corpus defined.')
    if os.path.isdir(self.corpusdir) is False:
        self.generate_corpus_files()
    newcorpus = PlaintextCorpusReader(self.corpusdir, '.*')
    # bard.sents = newcorpus.sents
    bard.tokens = newcorpus.words()
    print len(bard.tokens)
    # print 'init markov NLG text generator'
    self.generator = bard.generators.markov.IntelligentMarkovGenerator(bard.tokens)
class App:
    def makeTrainingData(reader):
        for category in reader.categories():
            for file in reader.fileids(category):
                yield FreqDist(reader.words(fileids=[file])), category

corpusDirectory = "../../resources/input/"
# Was using PlaintextCorpusReader; switching to a CategorizedPlaintextCorpusReader would provide categories.
wattsCorpus = PlaintextCorpusReader(corpusDirectory, '.*')

print wattsCorpus.raw().strip()
print wattsCorpus.words()
for sentence in wattsCorpus.sents():
    print sentence
print len(wattsCorpus.sents())

text = nltk.tokenize.word_tokenize(wattsCorpus.raw())
print "tokenized text: ", text

# Example of finding a similar word.
text = nltk.Text(word.lower() for word in wattsCorpus.words())
print "similar to god: ", text.similar('god')

words = nltk.pos_tag(text)
fdist = nltk.FreqDist(words)
print "frequency distribution: ", fdist

sentence = "So there are two ways of playing the game. The first way, which is the usual way, is that a guru or teacher who wants "
sentenceWords = nltk.word_tokenize(sentence)
fdistForSentence = nltk.FreqDist(sentenceWords)
fdistForSentence.plot()
def construct_models():
    """ Builds the classification models. """
    sources = [
        'Conservative',  # Scalia + Rehnquist
        'Progressive',   # Ginsburg + Stevens
    ]
    corpus = [(PlaintextCorpusReader('data/' + path + '/', '.*'), path) for path in sources]

    documents = []
    for (c, cat) in corpus:
        for fileid in c.fileids():
            documents.append((c.words(fileid), cat))
    random.shuffle(documents)

    all_words = []
    for (c, cat) in corpus:
        all_words.extend(c.words())
    all_words = nltk.FreqDist(all_words)
    word_features = list(all_words.keys())[:3000]

    featuresets = [(find_features(opinion, word_features), cat) for (opinion, cat) in documents]
    training_subset = int(len(featuresets) * 0.9)
    training_set = featuresets[:training_subset]
    testing_set = featuresets[training_subset:]

    ensemble = EnsembleClassifer(training_set, testing_set)
    ensemble.show_most_useful_features()
    ensemble.accuracy()
    print(ensemble.classify(testing_set[0][0]))
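# NOTE: find_features is not defined in this snippet. Below is a minimal sketch of what
# it plausibly does, given how it is called above (the conventional NLTK bag-of-words
# membership featurizer); the actual helper in the original project may differ.
def find_features(opinion, word_features):
    # Mark, for each candidate feature word, whether it occurs in the opinion.
    words = set(opinion)
    return {word: (word in words) for word in word_features}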
def main():
    corpus_root = sys.argv[1]
    num_text_files = int(sys.argv[2])
    algorithm_type = sys.argv[3]
    pmi_freq_filter = int(sys.argv[4])

    file_list = []
    for i in range(0, num_text_files):
        file_list.append(sys.argv[5 + i])

    corpus = PlaintextCorpusReader(corpus_root, '.*')
    if 'bigram' in algorithm_type:
        measures = nltk.collocations.BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(corpus.words())
        finder.apply_freq_filter(pmi_freq_filter)
        scored = finder.score_ngrams(f(algorithm_type))
    else:
        measures = nltk.collocations.TrigramAssocMeasures()
        finder = TrigramCollocationFinder.from_words(corpus.words())
        finder.apply_freq_filter(pmi_freq_filter)
        scored = finder.score_ngrams(f(algorithm_type))

    sort = sorted(scored, key=lambda tu: tu[1])
    for key in sort:
        ngrams = len(key[0])
        if ngrams == 2:
            print key[0][0] + "\t" + key[0][1] + "\t" + str(key[1])
        else:
            print key[0][0] + "\t" + key[0][1] + "\t" + key[0][2] + "\t" + str(key[1])
from nltk.tokenize import TreebankWordTokenizer, PunktSentenceTokenizer

def create_corpus(directory):
    # The reader calls .tokenize() on its tokenizers, so tokenizer objects are
    # required here rather than the bare word_tokenize/sent_tokenize functions.
    corpus = PlaintextCorpusReader(directory, '.*',
                                   encoding="iso-8859-1",
                                   word_tokenizer=TreebankWordTokenizer(),
                                   sent_tokenizer=PunktSentenceTokenizer())
    return corpus
def corpus_reader(corpus_name):
    '''
    Open a PlaintextCorpusReader for the given UDN corpus.
    '''
    # If the user requested an unfiltered corpus version, we need to know the root corpus name.
    root_corpus = corpus_name.replace('-unfiltered', '')

    # Ensure the desired corpus's submodule is checked out.
    if not os.path.exists('./corpora/{}/README.md'.format(root_corpus)):
        retcode = subprocess.call(
            "git submodule update --init -- corpora/{}".format(root_corpus).split(" "))
        if retcode != 0:
            print("Attempt to checkout submodule for corpus '{}' failed. "
                  "Try running 'git submodule update --init' manually.".format(root_corpus))
            exit()

    percentage = ''
    with open('./corpora/{0}/{0}.txt'.format(root_corpus), 'r') as f:
        manifest = f.readlines()
        query = manifest[0].split(" ")[3]
        num_found = util.dry_make_request(query, 0, 1)[0]['numFound']
        num_in_corpus, last_one = util.files_in_dir('./corpora/{}/{}'.format(root_corpus, corpus_name))
        percentage = '{0:.0%}'.format(num_in_corpus / num_found)

    if percentage != '100%':
        print('NOTE: This corpus is only {} complete. Last file: {}\n'.format(percentage, last_one))

    corpus = PlaintextCorpusReader('./corpora/{}/{}'.format(root_corpus, corpus_name), r'.*\.txt')
    return corpus
def __init__(self, input_folder_name, doc_pattern, categ_pattern, encoding='utf-8'):
    CategorizedPlaintextCorpusReader.__init__(self, input_folder_name,
                                              doc_pattern,
                                              cat_pattern=categ_pattern)
    self.input_folder_name = input_folder_name
    self.encoding = encoding
    self.root_reader = PlaintextCorpusReader(input_folder_name,
                                             fileids=r'[^\/]*.' + doc_pattern[-3:])
    # self.root_ids = [os.path.join(input_folder_name, item) for item in self.root_reader.fileids()]
    self.root_ids = list(self.root_reader.fileids())
def setup_corpus(self, corpus_dir, paths='.*'):
    """Setting up a corpus.

    Args:
        corpus_dir(str): Path to corpus directory.
    """
    self.corpus = PlaintextCorpusReader(corpus_dir, paths)
    return self.corpus
def load_data(dir_label):
    data_list = []
    labels = []
    for dl in dir_label:
        data = []
        directory = dl[0]
        label = dl[1]
        corpus_dir = 'dataset/' + directory
        corpus = PlaintextCorpusReader(corpus_dir, r'.*\.*')
        file_ids = corpus.fileids()
        for file in file_ids:
            d = []
            text = corpus.raw(file)
            e = email.message_from_string(text)
            if e.is_multipart():
                # For multipart messages, use the part payloads as the text.
                for payload in e.get_payload():
                    text = payload.get_payload()
            else:
                text = e.get_payload()
            feats = [
                cf.charac_feats_extractor(text),
                wf.word_feats_extractor(text),
                syf.syntac_feats_extractor(text),
                stf.struct_feats_extractor(corpus, file, text),
                fwf.funct_word_feats_extractor(text)
            ]
            for f in feats:
                d.extend(list(f.values()))
            data.append(d)
            labels.append(label)
        data_list.extend(data)
    return [data_list, labels]
def pdf_to_corpus():
    path = 'D://Eclipse Workspace//NLP//Assignment//res//'
    for filename in glob.glob(os.path.join(path, '*.pdf')):
        print(filename)
        pdfFileObj = open(filename, 'rb')
        # Create a PDF reader object.
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        # Print the number of pages in the PDF file.
        print(pdfReader.numPages)
        # Create a page object.
        pageObj = pdfReader.getPage(0)
        # Extract text from the page.
        text = pageObj.extractText()
        strings_list = text.split("\n")

        # Make a new dir for the corpus.
        corpusdir = 'customcorpus/'
        if not os.path.isdir(corpusdir):
            os.mkdir(corpusdir)

        # Output the files into the directory.
        file_name = filename.split("\\")[-1]
        print(file_name)
        pbar = ProgressBar(widgets=[
            'Creating Corpus', Bar('#', '[', ']'), ' ', Percentage(), ' ', ETA()
        ], maxval=100)
        for text in pbar(strings_list):
            with open(corpusdir + '[PDF] ' + file_name + '.txt', 'ab') as fout:
                fout.write(text.encode('utf-8'))
        pbar.finish()
        # create_corpus(text)

    corpus = PlaintextCorpusReader('customcorpus/', '.*')
    print(corpus.raw())
def _strip_tags(self, title):
    new_title = ''
    custom_corpus = PlaintextCorpusReader('../custom_corpora/', '.*')
    # For each word in the title:
    for word in title.split():
        # Remove all punctuation.
        noPunc = ''.join(c for c in word if c not in string.punctuation)
        # If this word isn't in stopwords and isn't just a single letter:
        if noPunc.lower() not in stopwords.words('english') and len(noPunc) > 1:
            stripped_word = self._strip_word(word)
            if stripped_word not in custom_corpus.words('media') and len(stripped_word) > 1:
                new_title = ' '.join([new_title, stripped_word])
    return new_title[1:]
def corpus_reader(filepath):
    """
    Takes a filepath (including the filename), reformats the file if it is CSV,
    and loads it into a PlaintextCorpusReader.
    """
    print "TEST: corpus_reader call"
    csv_file = open(filepath, 'rb')  # use test_1.csv as test case
    csv_data = csv.reader(csv_file)
    global csv_read
    csv_read = open('uploads/tmp/read.tmp', 'w')
    for line in csv_data:
        line_to_write = re.sub(r'[\s\t]+', ' ', str(line))
        line_to_write = line_to_write.lstrip('[\'')
        line_to_write = line_to_write.rstrip('\']')
        csv_read.write(str(line_to_write) + "\n\n")
    root = 'uploads/'
    corpus = PlaintextCorpusReader(root, 'tmp/read.tmp')
    # response = corpus.paras()
    words = corpus.words()
    return words
def __init__(self, config):
    print('Filepath for texts = ', config.textpath)
    self.corpus = PCR(config.textpath, r'.*\.txt',
                      encoding='utf-16',
                      para_block_reader=read_line_block)
    if config.clean_paragraphs == 'yes':
        self.clean(config, mode='para')
    if config.clean_sentences == 'yes':
        self.clean(config, mode='sent')
    # Corpus summaries
    self.corpus_info()
    self.LDA(config.num_topics, config.num_words)
    self.plot(config.num_words)
def Read_corpus(path_c, fname_c, fo1):
    import nltk
    import re
    import spacy
    import en_core_web_sm
    import fileinput
    nlp = spacy.load('en_core_web_sm')
    from nltk.corpus.reader.plaintext import PlaintextCorpusReader

    pcorpus = PlaintextCorpusReader(path_c, fname_c, encoding="utf")

    # Write the opening HTML tags to the file.
    fappend(fo1, P_htmltag.writehtmltag1(fname_c), fname_c)

    # Iterate through each paragraph.
    for para in pcorpus.paras():
        L0 = rep_tags(para)
        L1 = L0.split("\n")
        for i, w in enumerate(L1):
            if w != "":
                ApplyNLP(nlp(str(w[1:])), fo1)

    # Write the closing HTML tags to the file.
    fappend(fo1, P_htmltag.writehtmltag3(fname_c), fname_c)
def token_assamese():
    # Modify these to change the location and the name of the corpus file.
    corpus_path = "/Users/partha/All/Python/ProjectMaterials/Learned material/Arts"
    corpus_filename = 'Psychology.txt'
    newcorpus = PlaintextCorpusReader(corpus_path, corpus_filename, encoding='utf16')
    # Replace the danda with '.' so the default tokenizer can split sentences.
    text = newcorpus.raw().strip().replace('।', '.')
    words = nltk.word_tokenize(text)
    # Restore the danda after tokenization.
    for index, item in enumerate(words):
        if str(item) == '.':
            words[index] = '।'
    output_file_path = "C:/Users/HEMANT/Documents/1.Project/"
    output_filename = 'Result.txt'
    with open(output_file_path + output_filename, 'w', encoding='utf8') as f:
        for i in words:
            f.writelines(str(i) + '\n')
def __init__(self, my_input_file):
    self.config = configparser.ConfigParser()
    self.config.read("text_analysis.cfg")
    self.input_file = my_input_file
    self.nlp_model = self.config["DEFAULT"]["nlp_model"]
    # The output file name
    self.output_file = self.config["DEFAULT"]["output_file"]
    self.nlp = load_nlp(self.nlp_model)
    self.corpus = CorpusReader(".", self.input_file)
    self.raw_text = self.corpus.raw()
    self.nlp_text = self.nlp(self.raw_text)
    # Put together the information for text analysis with spaCy.
    self.analysis_dictionary = Counter()
    self.word_count = 0
    self.get_word_count_nltk()
def read_article(file_path):
    # file = open(file_path, "r")
    ## INSERT FILE NAME IN FUNCTION CALL BELOW ######
    bcr = PlaintextCorpusReader(file_path, 'bernie.txt')
    # filedata = file.read()
    filedata = bcr.raw()
    # for word in filedata.split():
    #     if word == 'Mr.':
    #         filedata[word] = 'Mr'
    article = filedata.replace("\n\n", '. ').replace('Mr.', 'Mr').replace(
        "\r", ' ').replace('\n', ' ').split('. ')
    articlez = []
    for line in article:
        if line == '':
            continue
        if line[0] == '\n':
            line = line[1:]
        articlez.append(line)
    sentences = []
    for sentence in articlez:
        sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
    sentences.pop()
    return sentences
def parseFolder(dirPath):
    assignments = []
    draftReader = PlaintextCorpusReader(dirPath, r'\d+draft\d*.*')
    finalReader = PlaintextCorpusReader(dirPath, r'\d+final\d*.*')
    numFiles = len(os.listdir(dirPath))
    assert numFiles % 2 == 0
    finalIdsSortedList = finalReader.fileids()
    draftIdsSortedList = draftReader.fileids()
    for i in range(len(finalReader.fileids())):
        final = finalReader.paras(finalIdsSortedList[i])
        draft = draftReader.paras(draftIdsSortedList[i])
        assn = assignment(draft, final)
        assignments.append(assn)
    return assignments
def parseFolder(dirPath):
    assignments = {}
    draftReader = PlaintextCorpusReader(dirPath, r'\d+draft\d*.*')
    finalReader = PlaintextCorpusReader(dirPath, r'\d+final\d*.*')
    numFiles = len(os.listdir(dirPath))
    assert numFiles % 2 == 0
    finalIdsSortedList = finalReader.fileids()
    draftIdsSortedList = draftReader.fileids()
    for pid in finalReader.fileids():
        final = finalReader.paras(pid)  # finalIdsSortedList[i]
        draft = draftReader.paras(pid)  # draftIdsSortedList[i]
        assn = assignment(draft, final)
        assignments[pid] = assn
    return assignments
def processFile(newCorpusDir):
    if not os.path.isdir(newCorpusDir):
        os.mkdir(newCorpusDir)
    txt1 = getText('sample_feed.txt')
    txt2 = pdf.getTextPDF('VirtualBoxTroubleshooting.pdf')
    txt3 = word.getTextWord('my_doc.docx')
    files = [txt1, txt2, txt3]
    for idx, f in enumerate(files):
        with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
            fout.write(f)
    newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')
    print(newCorpus.words())
    print(newCorpus.sents(newCorpus.fileids()[1]))
    print(newCorpus.paras(newCorpus.fileids()[0]))
import nltk.data
import re
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import treetaggerwrapper

article_corpus = PlaintextCorpusReader(
    'text_plain/', r'.*\.txt',
    sent_tokenizer=nltk.data.LazyLoader('tokenizers/punkt/spanish.pickle'))
stop_words = nltk.corpus.stopwords.words('spanish')
non_alphabetic = re.compile(r"\W|\d")
words = []
tags = []

# Using TreeTagger
# 1) pip install treetaggerwrapper
# 2) put TreeTagger in %PYHOME%\Lib\site-packages\TreeTagger
# 3) put spanish-utf8.par and spanish-chunker.par in \TreeTagger\lib
# See http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/spanish-tagset.txt for tag meanings
tagger = treetaggerwrapper.TreeTagger(TAGLANG='es')
for sentence in article_corpus.sents():
    tagged_sentence = tagger.tag_text(sentence)
    tags.extend(treetaggerwrapper.make_tags(tagged_sentence))

# TODO: create a tagger script, save the tagged files
# TODO: look at alternate taggers, compare
# TODO: profile this and see which part is taking so long
for tag in tags:
    lemma = tag[2].lower()
import os
import nltk
import pickle
import zlib
import base64
from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.classify import PositiveNaiveBayesClassifier
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

corpusdir = "./text"
newcorpus = PlaintextCorpusReader(corpusdir, ".*")
labeled_names = (
    [(name, "comp") for name in newcorpus.words("comp.txt")]
    + [(name, "animal") for name in newcorpus.words("animal.txt")]
    + [(word, "ignore") for word in newcorpus.words("ignorethese.txt")]
)
features = [({n: n}, thing) for (n, thing) in labeled_names]
training = features[:]
testing = "What color is the mouse?".lower().split(" ")

classifier = NaiveBayesClassifier.train(training)
pickleclf = pickle.dumps(classifier)
compressed = base64.b64encode(zlib.compress(pickleclf, 9))
with open("PickledClassifier.txt", "wb") as outobj:
    outobj.write(compressed)

compScore = 0
animalScore = 0
for word in testing:
    if (
        word[len(word) - 1] == "."
        or word[len(word) - 1] == ","
        or word[len(word) - 1] == "?"
#!/usr/bin/python
import sys
import wsd
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.text import *

if len(sys.argv) != 4:
    print "Usage:", sys.argv[0], "word sense1 sense2"
    exit(-1)

focal_word = sys.argv[1]
senses = [sys.argv[2], sys.argv[3]]
# focal_word = "plant"
# senses = ["manufacturing", "life"]

corpus = PlaintextCorpusReader('outcorpus/', '.*')
collocations = [
    wsd.BigramLeft(senses, 0),
    wsd.BigramRight(senses, 1),
    wsd.BigramScope(senses, 2, [2, 10])
]

decision_list = wsd.DecisionList()
decision_list.load("senses_bootstrap_" + focal_word + ".csv")

i = 0
for infile in sorted(corpus.fileids()):
    print i, "/", len(corpus.fileids())
    i += 1
    words = corpus.words(infile)
    text = Text(words)
    c = nltk.ConcordanceIndex(text.tokens)
    offsets = c.offsets(focal_word)
    for offset in offsets:
for j in range(len(gwords[i])):
    # Write the words to the file in the expected format.
    file.write(str(gwords[i][j]) + ':' + str(gwords[i].count(gwords[i][j])) + ' ')
# Add the label at the end of each file.
file.write('#label#:' + str(category) + '\n')
file.close()

def create_content(gdocs, graphicsdir, gcontent):
    for file in gdocs:
        gcontent.append(open(graphicsdir + '/' + str(file), 'r').read())

# Define the directory path for each category.
graphicsdir, autosdir, gunsdir = '20news-bydate/train/comp.graphics', '20news-bydate/train/rec.autos', '20news-bydate/train/talk.politics.guns'
graphicstest, autostest, gunstest = '20news-bydate/test/comp.graphics', '20news-bydate/test/rec.autos', '20news-bydate/test/talk.politics.guns'
graphicscorpus, autoscorpus, gunscorpus = PlaintextCorpusReader(graphicsdir, '.*'), PlaintextCorpusReader(autosdir, '.*'), PlaintextCorpusReader(gunsdir, '.*')
graphicscorpustest, autoscorpustest, gunscorpustest = PlaintextCorpusReader(graphicstest, '.*'), PlaintextCorpusReader(autostest, '.*'), PlaintextCorpusReader(gunstest, '.*')

# Initialize the lists.
gdocs, adocs, ndocs, gcontent, acontent, ncontent, gwords, awords, nwords, vocab = [], [], [], [], [], [], [], [], [], []
gtdocs, atdocs, ntdocs, gtcontent, atcontent, ntcontent, gtwords, atwords, ntwords, vtocab = [], [], [], [], [], [], [], [], [], []

# Training dataset
gdocs.extend(graphicscorpus.fileids())   # graphics category
adocs.extend(autoscorpus.fileids())      # autos category
ndocs.extend(gunscorpus.fileids())       # guns category

# Test dataset
gtdocs.extend(graphicscorpustest.fileids())  # graphics category
atdocs.extend(autoscorpustest.fileids())     # autos category
ntdocs.extend(gunscorpustest.fileids())      # guns category

# Retrieve the words for each category
# Training dataset
class DumbClusterer():
    """A rather dumb clusterer."""
    def __init__(self, corpus_dir=None, mwes=[], setup_mwes=True, **kwargs):
        self.mwes = mwes
        if corpus_dir is not None:
            self.setup_corpus(corpus_dir, '.*')
            if setup_mwes:
                self.setup_mwes(**kwargs)

    def setup_corpus(self, corpus_dir, paths='.*'):
        """Setting up a corpus.

        Args:
            corpus_dir(str): Path to corpus directory.
        """
        self.corpus = PlaintextCorpusReader(corpus_dir, paths)
        return self.corpus

    def extract_expressions(self, document, features=None):
        """Returns expressions from given features and multi-word expressions.

        In addition to passing a document into this method, MWEs or Multi-Word
        Expressions can be given to treat some multi words as one expression.

        >>> from document import ArthurDocument
        >>> pdf_path = base_path + '/test/test.pdf'
        >>> with open(pdf_path, 'rb') as f:
        ...     document = ArthurDocument(f.read())
        >>> features = document.get_features()[730:816,:]
        >>> print(document.get_text(features)) # doctest:+ELLIPSIS
        VICTORIA'S CROWN JEWEL OF WATERFRONT ESTATES. Nestled on a quiet cove in the exclusive

        Multi-word expression should be detected:
        >>> clusterer = DumbClusterer(mwes=['crown jewel', 'waterfront estates'])
        >>> expressions = clusterer.extract_expressions(document, features)
        >>> print(expressions[2]['text'])
        CROWN JEWEL

        x position should equal x of "C" from "CROWN JEWEL":
        >>> expressions[2]['x'] == features[11, ArthurDocument.get_feature_id('x')]
        True

        and width should equal the width of "CROWN JEWEL":
        >>> expr_width = expressions[2]['x1']-expressions[2]['x']
        >>> ftr_width = features[21, ArthurDocument.get_feature_id('x1')] - features[11, ArthurDocument.get_feature_id('x')]
        >>> expr_width == ftr_width
        True

        Args:
            document(ArthurDocument): Document to extract data fields from.
            features(list): List of features containing data fields to extract.
                            If not given, use all document features.
            mwes(list): List of Multi-Word Expressions. Example value:
                        `['property type', 'single family']`. With that list, both
                        "property type" and "single family" will each be treated as
                        single expressions.
        Returns:
            np.array: An array of data_fields.
        """
        mwes = self.mwes
        if features is None:
            features = document.get_features()
        text = document.get_text(features)
        for idx, mwe in enumerate(mwes):
            if isinstance(mwe, str):
                mwes[idx] = word_tokenize(mwe.lower())
            elif hasattr(mwe, '__iter__'):
                mwes[idx] = [x.lower() for x in mwe]
        tokenizer = MWETokenizer(mwes, separator=' ')
        tokenized = tokenizer.tokenize(word_tokenize(text.lower()))

        expressions = []
        pos = 0
        for token in tokenized:
            # A token could be an MWE such as "deez nutz" while the text contains multiple
            # spaces (e.g. "deez  nutz"), so split the token and find the positions of the
            # first and last characters.
            words = token.split()
            start_pos = text.lower().index(words[0], pos)
            for word in words:
                ipos = text.lower().index(word, pos)
                end_pos = ipos + len(word)
                pos = end_pos

            min_x = 0
            max_x = 0
            min_y = 0
            max_y = 0
            page = 0
            if len(features[start_pos:end_pos, :]) > 0:
                min_x = np.amin(features[start_pos:end_pos, :], axis=0)[ArthurDocument.get_feature_id('x')]
                max_x = np.amax(features[start_pos:end_pos, :], axis=0)[ArthurDocument.get_feature_id('x1')]
                min_y = np.amin(features[start_pos:end_pos, :], axis=0)[ArthurDocument.get_feature_id('y')]
                max_y = np.amax(features[start_pos:end_pos, :], axis=0)[ArthurDocument.get_feature_id('y1')]
                page = features[start_pos, ArthurDocument.get_feature_id('page')]

            expressions.append({
                'text': text[start_pos:end_pos],
                'x': min_x,
                'x1': max_x,
                'y': min_y,
                'y1': max_y,
                'page': page
            })
        return expressions

    def setup_mwes(self, trigram_nbest=100, bigram_nbest=2000):
        """Create multi-word expressions by learning a corpus located in a corpus directory.

        Testing setting up MWEs with a custom path and setting it up twice (correct when
        no exception is raised):
        >>> corpus_dir = os.path.join(base_path, 'test', 'corpus')
        >>> clusterer = DumbClusterer(corpus_dir=corpus_dir, mwes=['custom mwe'])
        >>> mwes = clusterer.setup_mwes(trigram_nbest=1000, bigram_nbest=15000)
        >>> 'custom mwe' not in mwes
        True
        >>> 'custom mwe' in clusterer.mwes
        True

        Args:
            trigram_nbest(int): Number of highest ranked trigrams to acquire.
            bigram_nbest(int): Number of highest ranked bigrams to acquire.
        Returns:
            list: List of multi-word expressions.
        """
        if self.corpus is None:
            raise Exception("Corpus not found. Run method `setup_corpus` with given corpus directory first.")

        bigram_measures = BigramAssocMeasures()
        trigram_measures = TrigramAssocMeasures()

        # The following is not used since ne_chunk takes too much time.
        # Text processing before bigrams and trigrams are calculated:
        # words = []
        # for sent in self.corpus.sents():
        #     for chunk in nltk.ne_chunk(nltk.pos_tag(sent)):
        #         if not isinstance(chunk, nltk.Tree):
        #             w = chunk[0]
        #             # - Removal of words containing numbers or punctuation
        #             if not any((ch.isdigit() or ch in string.punctuation) for ch in w):
        #                 # - Lowercasing all words
        #                 words.append(w.lower())
        #                 print(w.lower().encode("utf-8")),

        # Text processing before bigrams and trigrams are calculated.
        words = []
        for w in self.corpus.words():
            # - Removal of words containing numbers or punctuation
            if not any((ch.isdigit() or ch in string.punctuation) for ch in w):
                # - Lowercasing all words
                words.append(w.lower())

        bigram_finder = BigramCollocationFinder.from_words(words)
        trigram_finder = TrigramCollocationFinder.from_words(words)
        mwes = (trigram_finder.nbest(trigram_measures.pmi, trigram_nbest)
                + bigram_finder.nbest(bigram_measures.pmi, bigram_nbest))

        # Combine the two lists by turning them into sets so a union is returned,
        # i.e. `set1 | set2`, where set1 may contain strings or lists; lists need
        # to be converted into tuples first.
        set1 = {(tuple(mwe) if isinstance(mwe, list) else mwe) for mwe in self.mwes}
        set2 = set(mwes)
        self.mwes = list(set1 | set2)
        return mwes
import nltk
import re
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk import FreqDist

corpus_root = '/home/aman/entire-src/py/dir'
speeches = PlaintextCorpusReader(corpus_root, r'.*\.txt')
print "Finished importing corpus"

raw = speeches.raw().lower()
tokens = nltk.word_tokenize(raw)
tgs = nltk.trigrams(tokens)
fdist = nltk.FreqDist(tgs)
for k, v in fdist.items():
    print k, v
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

# Create a new corpus by specifying the parameters:
# (1) the directory of the new corpus
# (2) the fileids of the corpus
# NOTE: in this case the fileids are simply the filenames.
newcorpus = PlaintextCorpusReader('nltkCorpusAll/', '.*')

# Access each file in the corpus.
for infile in sorted(newcorpus.fileids()):
    print infile                  # The fileid of each file.
    fin = newcorpus.open(infile)  # Opens the file.
    print fin.read().strip()      # Prints the content of the file.
print

# Access the plaintext; outputs pure string/basestring.
print newcorpus.raw().strip()
print

# Access paragraphs in the corpus. (list of list of list of strings)
# NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
# nltk.tokenize.word_tokenize.
#
# Each element in the outermost list is a paragraph,
# each paragraph contains sentence(s), and
# each sentence contains token(s).
print newcorpus.paras()
print

# To access paragraphs of a specific fileid.
# Tried to find misspellings in a corpus of text files. See find_misspellings.py
# and grouping_docs.py for documentation.
# There are ~30,400 unique words in these 49 communication files.
# Rebecca's laptop took too long to make the correlation matrix.
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer
import numpy as np
from numpy import linalg

# Make a new corpus.
corpusdir = 'communications/small_test_batch'  # where the files are
newcorpus = PlaintextCorpusReader(corpusdir, '.*')
fileids = newcorpus.fileids()  # list of fileids
j = len(fileids)               # number of docs

words_list = []   # ['doc', '1', 'words', 'doc', '2', 'words', ...]
doc_breaks = [0]  # ith entry = index of first word in doc i in words_list
keywords = set()  # {'doc', '1', 'words', '2', ...}
# Pick out alphanumeric sequences; discard punctuation and whitespace.
tokenizer = RegexpTokenizer(r'\w+')

# Create the set of keywords and the list of file texts.
for id in fileids:
    raw = newcorpus.raw(id)
    raw2 = ''.join([i if ord(i) < 128 else '' for i in raw])  # remove non-ASCII characters
    raw3 = raw2.encode('ascii')
    # List of cleaned words: lower-case, no punctuation, no whitespace.
    file_words = map(str.lower, tokenizer.tokenize(raw3))
    words_list = words_list + file_words
    doc_breaks = doc_breaks + [len(file_words) + doc_breaks[len(doc_breaks) - 1]]
import sys
import wsd
import random
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.text import *
import nltk

if len(sys.argv) != 4:
    print "Usage:", sys.argv[0], "word sense1 sense2"
    exit(-1)

focal_word = sys.argv[1]
senses = [sys.argv[2], sys.argv[3]]
# focal_word = "plant"
# senses = ["manufacturing", "life"]

corpus = PlaintextCorpusReader('outcorpus/', '.*')
collocations = [
    wsd.BigramLeft(senses, 0),
    wsd.BigramRight(senses, 1),
    wsd.BigramScope(senses, 2, [2, 10])
]

decision_list = wsd.DecisionList()
decision_list.load("senses_bootstrap_" + focal_word + ".csv")

corpus_ids = corpus.fileids()
random.shuffle(corpus_ids)

num_words = 1
num_words_max = 100
tagged = 0
ambiguous = 0
unknown = 0
for infile in corpus_ids:
    if num_words > num_words_max:
        break
import datetime
import nltk
from nltk import word_tokenize
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.corpus import floresta, mac_morpho
from parser_portuguese_risk import evaluateModel, splitTrainTestModel, simplify_tag

time1 = datetime.datetime.now()

###############################################################################
### ATTENTION: if there are temp files such as .DS_STORE (Mac OS X) in the corpus
### directory, they must be removed first.

# Reading the corpus
corpusdir = '/Users/marceloschiessl/RDF_text_project/corpus/WikiRisk/test/glossAnnotated/'  # Directory of corpus.
# corpusdir = '/Users/marceloschiessl/RDF_text_project/corpus/WikiRisk/test/test1/'  # Directory of corpus.
risco = PlaintextCorpusReader(corpusdir, '.*')
risco.fileids()

raw_text = risco.raw('gloss533.txt')
# print raw_text[0:]

# Some statistics
print 'Number of terms: ', len(risco.words())
print 'Number of unique terms: ', len(set(risco.words()))

fd = nltk.FreqDist(risco.words())
print fd.freq('bem')
print fd['bem']

# Presenting n-grams of the term
#
# for line in p:
#     for sentence in line:
#         sentence.draw()

st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
parser = StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

# Set up the corpora of texts.
childStoryCorpusDir = '../resources/org_transcripts'
robotStoryCorpusDir = '../resources/robot_stories'

childStoryCorpus = PlaintextCorpusReader(childStoryCorpusDir, r".*\.txt")
robotStoryCorpus = PlaintextCorpusReader(robotStoryCorpusDir, r".*\.txt")

# Average word length, average sentence length, and the number of times each vocabulary
# item appears in the text on average (our lexical diversity score).
# for fileid in childStoryCorpus.fileids():
#     num_chars = len(childStoryCorpus.raw(fileid))
#     num_words = len(childStoryCorpus.words(fileid))
#     num_sents = len(childStoryCorpus.sents(fileid))
#     num_vocab = len(set([w.lower() for w in childStoryCorpus.words(fileid)]))
#     print ((float(num_chars)/float(num_words)), float(num_words)/float(num_sents), float(num_words)/float(num_vocab), fileid)

for fileid in childStoryCorpus.fileids():
    print (fileid)
@author: Advaith GVK
'''

from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
import string
import csv
from fileinput import filename

corpusdir = 'C:/Users/Advaith GVK/workspace/Trial/src/Pack/New folder'  # Directory of corpus.
newcorpus = PlaintextCorpusReader(corpusdir, '.*')
filenames = newcorpus.fileids()
# print newcorpus.sents()

def getWordNetType(tag):
    # print tag
    if tag in ['JJ', 'JJR', 'JJS']:
        return wn.ADJ
    elif tag in ['NN', 'NNS', 'NNP', 'NNPS', 'POS', 'FW']:
        return wn.NOUN
    elif tag in ['RB', 'RBR', 'RBS', 'WRB']:
        return wn.ADV
    elif tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
        return wn.VERB
    return wn.NOUN
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.probability import FreqDist

# Create a corpus.
corpusdir = "/home/erdinc/nltk/cs290f_proj/tos/"
newcorpus = PlaintextCorpusReader(corpusdir, '.*')
corpusWords = nltk.Text(newcorpus.words())
posTags = nltk.pos_tag(corpusWords)

# Total number of words in the corpus.
def getTotalNumberOfWords(words):
    return len(words)

# Number of unique words in the corpus.
def getNumberOfUniqueWords(words):
    return len(set(words))

# Most frequently used 25 words.
def getMostFreqWords(words):
    fdist = FreqDist(words)
    vocab = fdist.keys()
    return vocab[:25]

# Name list.
def getNameList(tags):
    nameList = []
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from decimal import Decimal
from math import pi

if __name__ == '__main__':
    ptcr = PlaintextCorpusReader(r'C:\Users\Jakub\Downloads\pr4\Trzeci plik',
                                 ['znormalizowane.txt', 'katy.txt'])

    data = []
    t = ptcr.raw(fileids=ptcr.fileids()[1]).replace(',', '.').replace('\r', '').split('\n')
    t.remove('')
    for x in t:
        data.append(float(Decimal(x) * 360 / 315))
    print data

    data_ = []
    t = ptcr.raw(fileids=ptcr.fileids()[0]).replace(',', '.').replace('\r', '').split('\n')
    t.remove('')
    for x in t:
        data_.append(float(x) / 100)
    print data_
import os
import nltk
from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.classify import PositiveNaiveBayesClassifier
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

def features(sentence):
    words = sentence.lower().split()
    return dict(('contains(%s)' % w, True) for w in words)

corpusdir = './text'
newcorpus = PlaintextCorpusReader(corpusdir, '.*')

# Build one featureset per sentence of each document.
positive_featuresets = [features(sent) for sent in nltk.sent_tokenize(newcorpus.raw('comp.txt'))]
unlabeled_featuresets = [features(sent) for sent in nltk.sent_tokenize(newcorpus.raw('animal.txt'))]
classifier = PositiveNaiveBayesClassifier.train(positive_featuresets,
                                                unlabeled_featuresets, .3)

print classifier.classify(features('.'))
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from wordcloud import STOPWORDS

_stop_words = set(STOPWORDS)
stop_words = set(stopwords.words('english'))
stop_words.update(_stop_words, ('thing', 'u', 'us', 'nt'))
lemmatizer = WordNetLemmatizer()

# Read .txt files from the ./docs directory into a corpus.
corpus = PlaintextCorpusReader('./docs/', r".*\.txt")

# Filter the list of words to remove unneeded ones and punctuation.
# This loses "U.S.", which is not ideal; splitting sentences on spaces and
# preserving dots just for that case was also tried.
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
tokenized = tokenizer.tokenize(corpus.raw())

# Drop punctuation.
non_punct = list(
    filter(lambda token: nltk.tokenize.punkt.PunktToken(token).is_non_punct,
           tokenized))

# Lowercase everything.
lowercased = [word.lower() for word in non_punct]
#!/usr/bin/python
import sys
import csv
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.text import *
import wsd

if len(sys.argv) != 4:
    print "Usage:", sys.argv[0], "word sense1 sense2"
    exit(-1)

corpus = PlaintextCorpusReader('outcorpus/', '.*')

focal_word = sys.argv[1]
senses = [sys.argv[2], sys.argv[3]]
# senses = ["manufacturing", "life"]

collocations = [
    wsd.BigramLeft(senses, 0),
    wsd.BigramRight(senses, 1),
    wsd.BigramScope(senses, 2, [2, 10])
]

with open("senses_" + focal_word + ".csv") as senses_file:
    reader = csv.reader(senses_file)
    for row in reader:
        infile, offset, sense = row
        offset = int(offset)
        words = corpus.words(infile)
        text = Text(words)
        for collocation in collocations:
            collocation.add_collocation(text, offset, sense)

# print collocations[0].frequencies.items()[0][1].items()[0][1]
# New file with weightings.
new_file = open(new_file_name, "w+", encoding="utf-8")
more_stopwords = open("stopwords.txt", "r", encoding="utf-8")

stop_words = set(nltk.corpus.stopwords.words('english'))
for line in more_stopwords:
    stop_words.add(line[:-1])
    # words = line.split()
    # for word in words:
    #     stop_words.add(word)

regex = re.compile(r'(?:^|)[a-zA-Z0-9\-]+')
not_regex = re.compile(r'\@[a-zA-Z0-9\-]+')
# print(stop_words)

texts = PlaintextCorpusReader(CORPUS_TEXT, r'.*\.txt')

def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    # pdb.set_trace()
    # print(stop_words)
    # Exclude candidates that are stop words or entirely punctuation.
    punct = set(string.punctuation)
    # Tokenize, POS-tag, and chunk using regular expressions.
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(
        nltk.chunk.tree2conlltags(chunker.parse(tagged_sent)) for tagged_sent in tagged_sents))
    # Join constituent chunk words into a single chunked phrase.
    lambda_func = lambda w_p_c: w_p_c[2] != 'O'
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda_func) if key]
CORPUS_ROOT = '/home/ksotala/Hiit/mallet-2.0.7/dataset/lemmatized/nostop2/'
CORPUS_EXTENSION = r'.*\.txt'

import nltk
import os
from os import listdir
from os.path import isfile, join
from nltk.collocations import *
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# Read in the corpus and find all the 3-grams above the minimum frequency.
print "Reading in corpus from", CORPUS_ROOT
my_corpus = PlaintextCorpusReader(CORPUS_ROOT, CORPUS_EXTENSION)
print "Read in " + str(len(my_corpus.fileids())) + " files"
print "Finding 3-grams"
finder_3gram = TrigramCollocationFinder.from_words(my_corpus.words())
print "Filtering out 3-grams of frequency less than", MIN_FREQUENCY
finder_3gram.apply_freq_filter(MIN_FREQUENCY)

# Combine all the 3-grams meeting the PMI threshold.
print "Looking for 3-grams with a PMI of at least", MIN_3GRAM_PMI
filelist = [join(CORPUS_ROOT, f) for f in listdir(CORPUS_ROOT) if isfile(join(CORPUS_ROOT, f))]
gen = finder_3gram.above_score(trigram_measures.pmi, MIN_3GRAM_PMI)
processGrams(gen, filelist)

# Now do the same for the 2-grams.
# The previous step altered the corpus, so read it in again.
print "Reading in corpus from", CORPUS_ROOT
import os
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer

## Create the corpus of 1965 songs from HTML files.
corpusdir = '../../data/billboard_data/1960/billboard_1965/'
bb_1965 = PlaintextCorpusReader(corpusdir, '.*')

## Get the raw text from specific songs/files.
help = bb_1965.raw('help.html')
desolation_row = bb_1965.raw('desolation_row.html')

## Clean the raw text to remove the p tags.
## NOTE: nltk.clean_html is only available in old NLTK releases; see the
## BeautifulSoup sketch after this listing.
clean_help = nltk.clean_html(help)
clean_desolation = nltk.clean_html(desolation_row)

# Word tokenization.
tokens_help = nltk.word_tokenize(clean_help)
tokens_desolation = nltk.word_tokenize(clean_desolation)

# Part-of-speech tagging.
tags_help = nltk.pos_tag(tokens_help)
tags_desolation = nltk.pos_tag(tokens_desolation)

tokenizer = RegexpTokenizer(r'\w+')

## Print the unique, sorted POS tags.
for item in sorted(set(tags_help)):
    print 'help tags: ', item
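# NOTE: on NLTK 3, nltk.clean_html() raises NotImplementedError and points users to
# BeautifulSoup. Below is a minimal sketch of the same cleaning step using bs4
# (assuming it is installed); the rest of the workflow above is unchanged.
from bs4 import BeautifulSoup

def clean_html_text(html):
    # Strip markup such as <p> tags and return only the visible text.
    return BeautifulSoup(html, "html.parser").get_text()

clean_help = clean_html_text(help)
clean_desolation = clean_html_text(desolation_row)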