def main():
    corpus_root = sys.argv[1]
    num_text_files = int(sys.argv[2])
    algorithm_type = sys.argv[3]
    pmi_freq_filter = int(sys.argv[4])
    file_list = []
    for i in range(num_text_files):
        file_list.append(sys.argv[5 + i])
    corpus = PlaintextCorpusReader(corpus_root, '.*')
    if 'bigram' in algorithm_type:
        measures = nltk.collocations.BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(corpus.words())
        finder.apply_freq_filter(pmi_freq_filter)
        scored = finder.score_ngrams(f(algorithm_type))
    else:
        measures = nltk.collocations.TrigramAssocMeasures()
        finder = TrigramCollocationFinder.from_words(corpus.words())
        finder.apply_freq_filter(pmi_freq_filter)
        scored = finder.score_ngrams(f(algorithm_type))
    # Print each scored n-gram as tab-separated columns, sorted by score
    for key in sorted(scored, key=lambda tu: tu[1]):
        ngrams = len(key[0])
        if ngrams == 2:
            print(key[0][0] + "\t" + key[0][1] + "\t" + str(key[1]))
        else:
            print(key[0][0] + "\t" + key[0][1] + "\t" + key[0][2] + "\t" + str(key[1]))
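# The invocation below is a hypothetical sketch of how main() expects its arguments,
# inferred from the sys.argv parsing above; the script name, corpus path and the
# "bigram_pmi" label are illustrative, and f() is assumed to map that label to an
# NLTK association measure.
#
#   python collocations.py ./my_corpus 2 bigram_pmi 3 doc1.txt doc2.txt
#
# argv[1] = corpus root, argv[2] = number of text files, argv[3] = algorithm type,
# argv[4] = frequency filter, argv[5:] = the text file names.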
def corpus_reader(corpus_name):
    ''' Open a PlaintextCorpusReader for the given UDN corpus. '''
    # If the user requested an unfiltered corpus version, we need to know the root corpus name
    root_corpus = corpus_name.replace('-unfiltered', '')
    # Ensure the desired corpus's submodule is checked out
    if not os.path.exists('./corpora/{}/README.md'.format(root_corpus)):
        retcode = subprocess.call(
            "git submodule update --init -- corpora/{}".format(root_corpus).split(" "))
        if retcode != 0:
            print("Failed to check out the submodule for corpus '{}'. "
                  "Try running 'git submodule update --init' manually.".format(root_corpus))
            exit()
    percentage = ''
    with open('./corpora/{0}/{0}.txt'.format(root_corpus), 'r') as f:
        manifest = f.readlines()
        query = manifest[0].split(" ")[3]
        num_found = util.dry_make_request(query, 0, 1)[0]['numFound']
        num_in_corpus, last_one = util.files_in_dir('./corpora/{}/{}'.format(
            root_corpus, corpus_name))
        percentage = '{0:.0%}'.format(num_in_corpus / num_found)
    if percentage != '100%':
        print('NOTE: This corpus is only {} complete. Last file: {}\n'.format(
            percentage, last_one))
    corpus = PlaintextCorpusReader(
        './corpora/{}/{}'.format(root_corpus, corpus_name), r'.*\.txt')
    return corpus
def get_phrase():
    root_dir = r'E:\github_repo\python_basic\pythonbasictest\self_nltk\files'
    wordlists = PlaintextCorpusReader(root_dir, ".*")
    x = nltk.Text(wordlists.words("test.txt"))
    print(x)
    # Text.collocations() prints its results itself and returns None,
    # so call it directly rather than wrapping it in print().
    x.collocations()
def construct_models():
    """ Builds the classification models. """
    sources = [
        'Conservative',  # Scalia + Rehnquist
        'Progressive',   # Ginsburg + Stevens
    ]
    corpus = [(PlaintextCorpusReader('data/' + path + '/', '.*'), path)
              for path in sources]
    documents = []
    for (c, cat) in corpus:
        for fileid in c.fileids():
            documents.append((c.words(fileid), cat))
    random.shuffle(documents)
    all_words = []
    for (c, cat) in corpus:
        all_words.extend(c.words())
    all_words = nltk.FreqDist(all_words)
    word_features = list(all_words.keys())[:3000]
    featuresets = [(find_features(opinion, word_features), cat)
                   for (opinion, cat) in documents]
    training_subset = int(len(featuresets) * 0.9)
    training_set = featuresets[:training_subset]
    testing_set = featuresets[training_subset:]
    ensemble = EnsembleClassifer(training_set, testing_set)
    ensemble.show_most_useful_features()
    ensemble.accuracy()
    print(ensemble.classify(testing_set[0][0]))
def create_corpus(directory):
    corpus = PlaintextCorpusReader(directory,
                                   '.*',
                                   encoding="iso-8859-1",
                                   word_tokenizer=word_tokenize,
                                   sent_tokenizer=sent_tokenize)
    return corpus
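# Note, as an assumption rather than part of the snippet above: PlaintextCorpusReader
# calls .tokenize() on whatever is passed as word_tokenizer/sent_tokenizer, so bare
# functions like word_tokenize/sent_tokenize can fail when corpus.words() or
# corpus.sents() is evaluated. A minimal sketch that passes tokenizer objects instead:
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import TreebankWordTokenizer
import nltk.data

def create_corpus_with_tokenizer_objects(directory):
    # Hypothetical variant of create_corpus() above, using tokenizer objects.
    return PlaintextCorpusReader(
        directory, '.*', encoding="iso-8859-1",
        word_tokenizer=TreebankWordTokenizer(),
        sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle'))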
class App:
    def makeTrainingData(reader):
        for category in reader.categories():
            for file in reader.fileids(category):
                yield FreqDist(reader.words(fileids=[file])), category

    corpusDirectory = "../../resources/input/"
    # Was using PlaintextCorpusReader, switched to Categorized to provide categories
    wattsCorpus = PlaintextCorpusReader(corpusDirectory, '.*')
    print(wattsCorpus.raw().strip())
    print(wattsCorpus.words())
    for sentence in wattsCorpus.sents():
        print(sentence)
    print(len(wattsCorpus.sents()))
    text = nltk.tokenize.word_tokenize(wattsCorpus.raw())
    print("tokenized text: ", text)
    # example of finding a similar word
    text = nltk.Text(word.lower() for word in wattsCorpus.words())
    print("similar to god: ", text.similar('god'))
    words = nltk.pos_tag(text)
    fdist = nltk.FreqDist(words)
    print("frequency distribution: ", fdist)
    sentence = "So there are two ways of playing the game. The first way, which is the usual way, is that a guru or teacher who wants "
    sentenceWords = nltk.word_tokenize(sentence)
    fdistForSentence = nltk.FreqDist(sentenceWords)
    fdistForSentence.plot()
def load_corpus(race_code=None, gender_code=None):
    # Loads corpora into an array based on race and gender
    if race_code is None:  # if none is specified, search all
        race_code = ".."
    if gender_code is None:
        gender_code = ".."
    # Uses the filename encoding to load the specified texts
    reader = PlaintextCorpusReader(
        corpus_root, ".*_" + race_code + "_" + gender_code + r"\.txt")
    corpora = []
    # Creates a ComedyCorpus object per file, populated with fileid and name
    for fileid in reader.fileids():
        new_corpus = ComedyCorpus()
        new_corpus.set_fileid(fileid)
        try:
            # Gets word content based on fileid
            new_corpus.set_text(reader.raw(fileid))
        except UnicodeDecodeError:
            continue
        # The name is the fileid without the race/gender encoding
        # (the separator matches the "_race_gender.txt" pattern used above)
        fileid = re.sub("_" + race_code + "_" + gender_code + r"\.txt", "", fileid)
        fileid = fileid.replace("%20", " ")
        fileid = fileid.replace("_", "; ")
        print(fileid)
        new_corpus.set_name(fileid)
        corpora.append(new_corpus)
    return corpora
def generateNgramModel(corpusPath, corpusName):
    corpusdir = 'corpora/'  # Directory of corpus (unused here).
    generatedCorpus = PlaintextCorpusReader(corpusPath, corpusName)
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    # Bigram model with Lidstone smoothing
    ngrammodel = NgramModel(2, generatedCorpus.sents(), True, False, estimator)
    return ngrammodel
def getCorupsFromCorpusFile(CorpusFile):
    CorpusDir, CorpusFile = os.path.split(CorpusFile)
    corpus = PlaintextCorpusReader(CorpusDir, CorpusFile)
    return corpus
def create_corpus():
    ## Create a corpus from the abstracts
    ## fetched by BIBOT and
    ## return a corpus object.

    ## Read the abstract result file
    abstract_to_content = {}
    abstract_file = open("fetched/pubmed_abstract.txt", "r")
    for line in abstract_file:
        line = line.replace("\n", "")
        if line[0] == ">":
            abstract = line[1:]
            abstract_to_content[abstract] = ""
        else:
            content = line
            abstract_to_content[abstract] = content
    abstract_file.close()

    ## Create one file per abstract
    for key in abstract_to_content.keys():
        text_file = open("fetched/corpus/" + str(key) + ".txt", "w")
        text_file.write(abstract_to_content[key])
        text_file.close()

    ## Build the NLTK corpus reader over the generated files
    corpusdir = 'fetched/corpus/'
    newcorpus = PlaintextCorpusReader(corpusdir, '.*')
    return newcorpus
def generate_words_grammar():
    """
    Use sentence grammar to find words that could be Rent lyrics
    :return:
    """
    # Load corpora to look in
    gentrification = PlaintextCorpusReader(
        'corpus', '.*')  # Gentrification articles are in this directory
    gentrify_sents = gentrification.sents()
    # wine_sents is used below, so load the webtext corpus rather than leaving it commented out
    wine_sents = nltk.corpus.webtext.sents('wine.txt')
    corpus_sents = gentrify_sents + wine_sents
    syls_1 = []
    syls_2 = []
    syls_4 = []
    syls_2_sing = []
    for sent in corpus_sents:
        parsed_sent = nltk.pos_tag(sent)
        for word in parsed_sent:
            no_syls = count_syllables(word[0])
            if word[1] == 'NNS' and len(word[0]) > 3:
                if no_syls == 1:
                    syls_1 = syls_1 + [word[0].lower()]
                elif no_syls == 2:
                    syls_2 = syls_2 + [word[0].lower()]
                elif no_syls == 4:
                    syls_4 = syls_4 + [word[0].lower()]
            if word[1] == 'NN' and len(word[0]) > 2:
                if no_syls == 2:
                    syls_2_sing = syls_2_sing + [word[0].lower()]
    return (list(set(syls_1)), list(set(syls_2)), list(set(syls_4)),
            list(set(syls_2_sing)))
def read_corpus(corpus_path):
    from nltk.corpus.reader.plaintext import PlaintextCorpusReader
    corpus = PlaintextCorpusReader(corpus_path, r".*\.txt")
    ctext = corpus.raw()
    # with open('corpus.txt', 'w') as cf:
    #     cf.write(ctext.encode('utf-8'))
    return ctext
def load_feat_data(dir_array):
    data_list = []
    for direct in dir_array:
        data = []
        corpus_dir = 'dataset/' + direct
        corpus = PlaintextCorpusReader(corpus_dir, r'.*\.*')
        file_ids = corpus.fileids()
        for file in file_ids:
            text = corpus.raw(file)
            e = email.message_from_string(text)
            if e.is_multipart():
                # get_payload() is a method; call it to get the parts and their bodies
                for payload in e.get_payload():
                    text = payload.get_payload()
            else:
                text = e.get_payload()
            data.append(extract_features(text, corpus, file))
        data_list.extend(data)
    return data_list
def __init__(self, master):
    '''
    Constructor. master is a string that names a directory in the same
    repository that contains all the work from inspiration
    '''
    self.master = 'masters/' + master
    self.reader = PlaintextCorpusReader(self.master, r'.*', encoding='utf-8')
    self.text = self.reader.words()
def cv_to_matrix(self):
    corpusdir = 'data/cv_corpus'
    corpa = PlaintextCorpusReader(corpusdir, '.*', encoding='windows-1252')
    print("Preprocessing words....")
    sents = [[token.lemma_
              for token in nlp(" ".join(self.clean(sent)).lower())
              if token.lemma_ not in stopset]
             for sent in corpa.sents()]
    print("training word vectors....")
    model = Word2Vec(sents, window=5, size=self.ncol, min_count=1, workers=4)
    fname = get_tmpfile("vectors.kv")
    model.wv.save(fname)
    print("cv_to_matrix model saved")
    return model.wv
def build_d2v_model(self):
    print("Starting construction of the Doc2Vec model")
    corpusdir = 'data/cv_corpus'
    corpa = PlaintextCorpusReader(corpusdir, '.*', encoding='windows-1252')
    print("tokenizing...")
    resumes = [[token.lemma_
                for sent in paras
                for token in nlp(" ".join(self.clean(sent)).lower())
                if token.lemma_ not in stopset]
               for paras in corpa.paras()]
    # print(resumes[0:3])
    print("tokenization completed")
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(resumes)]
    model = Doc2Vec(documents, vector_size=self.cv_length, window=5,
                    min_count=1, workers=4)
    print("Finished construction of the Doc2Vec model")
    return model
def __init__(self, data_root):
    self.data_root = data_root
    self.data = PlaintextCorpusReader(data_root, '.*')
    self.words = [i for i in self.data.words() if i.isalpha()]
    self.text = Text(self.words)
    self.stop = set(stopwords.words('english')).union({
        'cid', 'et', 'al', 'also', 'and', 'editingboston', 'arxiv', 'pages',
        'trackboston', 'preprint', 'page', 'vol', 'volume', 'march', 'boston',
        'table'
    })
    with open('bib.json') as fi:
        self.bib = json.load(fi)
def get_fileid_lst(source_dir):
    '''
    Use NLTK to pull in the list of file ids in the given source directory

    :param {str} source_dir: The relative path to the source directory that
        contains all the data (book) files
    :return {str} fileid_lst: List of all file ids ending in '.txt' in the source_dir
    '''
    temp_corp = PlaintextCorpusReader(source_dir, r'.*\.txt')
    fileid_lst = temp_corp.fileids()
    return fileid_lst
def token_in_coverage(self):
    corpusdir = 'data/cv_corpus'
    corpa = PlaintextCorpusReader(corpusdir, '.*', encoding='windows-1252')
    resumes = [[item for sent in paras for item in sent]
               for paras in corpa.paras()]
    cpt = 0
    for resume in resumes:
        resume_text = " ".join(resume)
        resume_sents = nltk.sent_tokenize(resume_text)
        # Each sent is already a string, so pass it to the spaCy pipeline directly
        resume_words = set(token.lemma_
                           for sent in resume_sents
                           for token in nlp(sent.lower()))
        if not resume_words.isdisjoint(self.tokens_in):
            cpt += 1
    coverage = cpt * 1.0 / len(resumes)
    print("token_in coverage : {}".format(coverage))
def __init__(self, input_folder_name, doc_pattern, categ_pattern, encoding='utf-8'):
    CategorizedPlaintextCorpusReader.__init__(self,
                                              input_folder_name,
                                              doc_pattern,
                                              cat_pattern=categ_pattern)
    self.input_folder_name = input_folder_name
    self.encoding = encoding
    self.root_reader = PlaintextCorpusReader(input_folder_name,
                                             fileids=r'[^\/]*.' + doc_pattern[-3:])
    # self.root_ids = [os.path.join(input_folder_name, item) for item in self.root_reader.fileids()]
    self.root_ids = list(self.root_reader.fileids())
def processFile(newCorpusDir):
    if not os.path.isdir(newCorpusDir):
        os.mkdir(newCorpusDir)
    txt1 = getText('sample_feed.txt')
    txt2 = pdf.getTextPDF('VirtualBoxTroubleshooting.pdf')
    txt3 = word.getTextWord('my_doc.docx')
    files = [txt1, txt2, txt3]
    for idx, f in enumerate(files):
        with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
            fout.write(f)
    newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')
    print(newCorpus.words())
    print(newCorpus.sents(newCorpus.fileids()[1]))
    print(newCorpus.paras(newCorpus.fileids()[0]))
def load_data(dir_label):
    data_list = []
    labels = []
    for dl in dir_label:
        data = []
        directory = dl[0]
        label = dl[1]
        corpus_dir = 'dataset/' + directory
        corpus = PlaintextCorpusReader(corpus_dir, r'.*\.*')
        file_ids = corpus.fileids()
        for file in file_ids:
            d = []
            text = corpus.raw(file)
            e = email.message_from_string(text)
            if e.is_multipart():
                # get_payload() is a method; call it to get the parts and their bodies
                for payload in e.get_payload():
                    text = payload.get_payload()
            else:
                text = e.get_payload()
            feats = [
                cf.charac_feats_extractor(text),
                wf.word_feats_extractor(text),
                syf.syntac_feats_extractor(text),
                stf.struct_feats_extractor(corpus, file, text),
                fwf.funct_word_feats_extractor(text)
            ]
            for f in feats:
                d.extend(list(f.values()))
            data.append(d)
            labels.append(label)
        data_list.extend(data)
    return [data_list, labels]
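# The multipart handling in load_feat_data() and load_data() keeps only the last
# top-level part and does not descend into nested multiparts. If that matters, here is
# a minimal sketch (an assumption, not part of the original snippets) using the standard
# library's Message.walk() to collect every text/plain body:
import email

def extract_plain_text(raw_message):
    # Concatenate the decoded text/plain parts of a (possibly nested) MIME message.
    msg = email.message_from_string(raw_message)
    parts = []
    for part in msg.walk():
        if part.get_content_type() == 'text/plain' and not part.is_multipart():
            payload = part.get_payload(decode=True)
            if payload is not None:
                parts.append(payload.decode(part.get_content_charset() or 'utf-8',
                                            errors='replace'))
    return "\n".join(parts)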
def pdf_to_corpus():
    path = 'D://Eclipse Workspace//NLP//Assignment//res//'
    for filename in glob.glob(os.path.join(path, '*.pdf')):
        print(filename)
        pdfFileObj = open(filename, 'rb')
        # creating a pdf reader object
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        # printing number of pages in pdf file
        print(pdfReader.numPages)
        # creating a page object
        pageObj = pdfReader.getPage(0)
        # extracting text from page
        text = pageObj.extractText()
        strings_list = text.split("\n")
        # Make a new dir for the corpus.
        corpusdir = 'customcorpus/'
        if not os.path.isdir(corpusdir):
            os.mkdir(corpusdir)
        # Output the files into the directory.
        file_name = filename.split("\\")[-1]
        print(file_name)
        pbar = ProgressBar(widgets=[
            'Creating Corpus', Bar('#', '[', ']'), ' ', Percentage(), ' ', ETA()
        ], maxval=100)
        for text in pbar(strings_list):
            with open(corpusdir + '[PDF] ' + file_name + '.txt', 'ab') as fout:
                fout.write(text.encode('utf-8'))
        pbar.finish()
        # create_corpus(text)
    corpus = PlaintextCorpusReader('customcorpus/', '.*')
    print(corpus.raw())
def token_assamese():
    # Modify these to change the location and the name of the corpus file
    corpus_path = "/Users/partha/All/Python/ProjectMaterials/Learned material/Arts"
    corpus_filename = 'Psychology.txt'
    newcorpus = PlaintextCorpusReader(corpus_path, corpus_filename, encoding='utf16')
    # Replace the Assamese danda with '.' so the default tokenizer recognizes sentence ends
    text = newcorpus.raw().strip().replace('।', '.')
    words = nltk.word_tokenize(text)
    # Restore the danda after tokenization
    for index, item in enumerate(words):
        if str(item) == '.':
            words[index] = '।'
    output_file_path = "C:/Users/HEMANT/Documents/1.Project/"
    output_filename = 'Result.txt'
    with open(output_file_path + output_filename, 'w', encoding='utf8') as f:
        for i in words:
            f.writelines(str(i) + '\n')
def Read_corpus(path_c, fname_c, fo1):
    import nltk
    import re
    import spacy
    import en_core_web_sm
    import fileinput
    nlp = spacy.load('en_core_web_sm')
    from nltk.corpus.reader.plaintext import PlaintextCorpusReader
    pcorpus = PlaintextCorpusReader(path_c, fname_c, encoding="utf")
    # Write the opening HTML tags to the output file
    fappend(fo1, P_htmltag.writehtmltag1(fname_c), fname_c)
    # Iterate through each paragraph
    for para in pcorpus.paras():
        L0 = rep_tags(para)
        L1 = L0.split("\n")
        for i, w in enumerate(L1):
            if w != "":
                ApplyNLP(nlp(str(w[1:])), fo1)
    fappend(fo1, P_htmltag.writehtmltag3(fname_c), fname_c)
def read_article(file_path):
    # file = open(file_path, "r")
    ## INSERT FILE NAME IN FUNCTION CALL BELOW ##
    bcr = PlaintextCorpusReader(file_path, 'bernie.txt')
    # filedata = file.read()
    filedata = bcr.raw()
    # for word in filedata.split():
    #     if word == 'Mr.':
    #         filedata[word] = 'Mr'
    article = filedata.replace("\n\n", '. ').replace('Mr.', 'Mr').replace(
        "\r", ' ').replace('\n', ' ').split('. ')
    articlez = []
    for line in article:
        if line == '':
            continue
        if line[0] == '\n':
            line = line[1:]
        articlez.append(line)
    sentences = []
    for sentence in articlez:
        sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
    sentences.pop()
    return sentences
# new file with weightings
new_file = open(new_file_name, "w+", encoding="utf-8")
more_stopwords = open("stopwords.txt", "r", encoding="utf-8")
stop_words = set(nltk.corpus.stopwords.words('english'))
for line in more_stopwords:
    stop_words.add(line[:-1])
    # words = line.split()
    # for word in words:
    #     stop_words.add(word)
regex = re.compile(r'(?:^|)[a-zA-Z0-9\-]+')
not_regex = re.compile(r'\@[a-zA-Z0-9\-]+')
# print(stop_words)
texts = PlaintextCorpusReader(CORPUS_TEXT, r'.*\.txt')


def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    # pdb.set_trace()
    # print(stop_words)
    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent)
                                      for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(
        nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
        for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    lambda_func = lambda w_p_c: w_p_c[2] != 'O'
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda_func)
                  if key]
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from wordcloud import STOPWORDS

_stop_words = set(STOPWORDS)
stop_words = set(stopwords.words('english'))
stop_words.update(_stop_words, ('thing', 'u', 'us', 'nt'))
lemmatizer = WordNetLemmatizer()

# Read .txt files from the ./docs directory into a corpus
corpus = PlaintextCorpusReader('./docs/', r".*\.txt")

# Filter the list of words to remove unneeded ones and punctuation.
# This loses "U.S.", which is not ideal; tried splitting sentences on spaces
# and preserving dots just for it.
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
tokenized = tokenizer.tokenize(corpus.raw())

# drop punctuation
non_punct = list(
    filter(lambda token: nltk.tokenize.punkt.PunktToken(token).is_non_punct,
           tokenized))

# lowercase everything
lowercased = [word.lower() for word in non_punct]
def detect(request):
    # Data input
    if request.method == 'POST':
        identificacion = request.POST.get('dni')
        a = request.FILES['document']
        documento = str(a)
        datos_doc = documento.split('.')
        nombre_doc = datos_doc[0]
        tipo_doc = datos_doc[1]
        if tipo_doc == 'txt':
            name = request.FILES['document'].read().lower()
            print(datos_doc)
            # mul = set(stopwords.words("spanish"))
            mul = codecs.open('mul.txt', "r", encoding='UTF-8').read()
            remove('muletillas.txt')
            discurso = name.decode('UTF-8')
            # Separate filler words from common words
            text_completo = wordpunct_tokenize(discurso)
            m = []
            m = [w for w in text_completo if w in mul]
            muletillas = codecs.open('muletillas.txt', "a")
            for i in m:
                muletillas.write(i)
                muletillas.write(" ")
            muletillas.close()
            # Count filler words
            tokenizador = RegexpTokenizer(r'\w+|[^\w\s]+')
            corpus = PlaintextCorpusReader(".", 'muletillas.txt',
                                           word_tokenizer=tokenizador,
                                           encoding='Latin-1')
            frecuencia = FreqDist(corpus.words())
            salida = codecs.open("muletillasR.txt", "w", encoding="utf-8")
            palabras = []
            repeticiones = []
            # Write the extracted data to a txt file for later presentation
            for mc in frecuencia.most_common():
                palabra = mc[0]
                frecuencia_absoluta = mc[1]
                frecuencia_relativa = frecuencia.freq(palabra)
                cadena = str(frecuencia_absoluta) + "\t" + str(frecuencia_relativa) + "\t" + palabra
                palabras.append(palabra.upper())
                repeticiones.append(frecuencia_absoluta)
                salida.write(cadena + "\n")
            try:
                collection.insert_one({
                    'identificacion': identificacion,
                    'documento': documento,
                    'discurso': discurso,
                    'muletillas': palabras
                })
            except Exception as e:
                print("Error : ", type(e), e)
            # Send the data to the front end
            context = {
                'documento': nombre_doc,
                'muletillas': palabras[0:10],
                'repeticiones': repeticiones[0:10]
            }
            return render(request, 'responde.html', context)
        else:
            messages.warning(request, "Verifique el tipo de archivo", extra_tags='file')
            return render(request, 'home.html')
    return render(request, 'home.html')


# class LineChartJSONView(BaseLineChartView):
#     def get_labels():
#         """Return the labels for the x-axis."""
#         return ["January", "February", "March", "April", "May", "June",
#                 "July", "August", "September", "October"]
#
#     def get_providers(self):
#         """Return names of datasets."""
#         return ["Repeticiones"]
#
#     def get_data(self):
#         """Return datasets to plot."""
#         return [[75, 44, 92, 11, 44, 95, 35, 11, 44, 95, 35]]
#
# line_chart = TemplateView.as_view(template_name='responde.html')
# line_chart_json = LineChartJSONView.as_view()
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.probability import LidstoneProbDist, WittenBellProbDist
from nltk.model import NgramModel
from nltk.tokenize import sent_tokenize, word_tokenize

corpusdir = 'corpora/'  # Directory of corpus.

SickCorpus = PlaintextCorpusReader(corpusdir, 'sick_tweets.txt')
HealthyCorpus = PlaintextCorpusReader(corpusdir, 'healthy_tweets.txt')

estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

sick_model_1 = NgramModel(1, SickCorpus.sents(), True, False, estimator)
sick_model_2 = NgramModel(2, SickCorpus.sents(), True, False, estimator)
healthy_model_1 = NgramModel(1, HealthyCorpus.sents(), True, False, estimator)
healthy_model_2 = NgramModel(2, HealthyCorpus.sents(), True, False, estimator)

tweet = "Remember when we were all diagnosed with Bieber fever ? Lol"

print("sick_model_1 is: " + str(sick_model_1.perplexity(word_tokenize(tweet))))
print("sick_model_2 is: " + str(sick_model_2.perplexity(word_tokenize(tweet))))
print("healthy_model_1 is: " + str(healthy_model_1.perplexity(word_tokenize(tweet))))
print("healthy_model_2 is: " + str(healthy_model_2.perplexity(word_tokenize(tweet))))
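# nltk.model.NgramModel is no longer shipped with current NLTK releases, so the snippet
# above only runs against old NLTK versions. The sketch below is an assumed rough
# equivalent using the modern nltk.lm API (NLTK >= 3.4); Laplace smoothing stands in for
# the Lidstone estimator and it is not a drop-in replacement for the NgramModel calls above.
from nltk.lm import Laplace
from nltk.lm.preprocessing import padded_everygram_pipeline, pad_both_ends
from nltk.util import ngrams


def build_ngram_model(order, sents):
    # Train a smoothed n-gram language model on tokenized sentences.
    train, vocab = padded_everygram_pipeline(order, sents)
    model = Laplace(order)
    model.fit(train, vocab)
    return model


def tweet_perplexity(model, order, tweet_tokens):
    # Score a tokenized tweet as padded n-grams of the model's order.
    padded = list(pad_both_ends(tweet_tokens, n=order))
    return model.perplexity(list(ngrams(padded, order)))

# Hypothetical usage with the corpora defined above:
# sick_model_2 = build_ngram_model(2, SickCorpus.sents())
# print("sick_model_2 is: " + str(tweet_perplexity(sick_model_2, 2, word_tokenize(tweet))))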