def tag_files(direc):
    with open('nltk_german_classifier_data.pickle', 'rb') as t:
        tagger = pickle.load(t)
    for subdir, dirs, files in os.walk(direc):
        for file in files:
            if file.endswith('.txt') and "nounlist" not in file:
                filename = os.path.join(direc, file)
                wordlist = PlaintextCorpusReader(direc, '.*')
                name = filename.split("/")[-1]
                #to_tag = wordlist.words(filename)
                to_tag = wordlist.words(name)
                tagged = tagger.tag(to_tag)
                name = name.replace(".txt", "")
                p = filename.split("/")
                path = p[0] + "/" + p[1] + "/" + p[2] + "/" + p[3] + "/tagged/"
                with open('%stagged_%s_data.pickle' % (path, name), 'wb') as f:
                    pickle.dump(tagged, f)
                nouns = []
                for word in tagged:
                    if word[1] == 'NN':
                        nouns.append(word[0])
                path = path.replace("tagged", "nouns")
                with open('%snoun-list_%s_data.pickle' % (path, name), 'wb') as f:
                    pickle.dump(nouns, f)
                # count noun frequency
                noun_frequency = Counter(nouns)
                with open('%snoun-frequ_%s_data.pickle' % (path, name), 'wb') as f:
                    pickle.dump(noun_frequency, f)
                with open('%snounlist_%s.txt' % (path, name), 'w') as f:
                    f.write("\n".join(nouns))
def give(filename):
    corpus_root = '/home/helios/Desktop/easclepius/easclepius'
    wordlists = PlaintextCorpusReader(corpus_root, '.*')
    name = "/home/helios/Desktop/easclepius/easclepius/" + filename
    print(filename)
    print(name)
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    textwords = [w.lower() for w in wordlists.words(name)]
    print(textwords)
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(textwords)
    finder.apply_freq_filter(2)
    ignored_words = set(stopwords.words('english'))
    finder.apply_word_filter(lambda w: len(w) < 3 or w in ignored_words)
    a = finder.nbest(bigram_measures.likelihood_ratio, 5)
    fd = FreqDist(a)
    print(fd)  # print the distribution itself, not the literal string "fd"
    return a
def get_pos_features(dataset, feature_set_file, output_file):
    corpus_root = '/home1/c/cis530/data-hw2/'
    dataset_path = corpus_root + dataset
    files = PlaintextCorpusReader(dataset_path, '.*')
    ids = files.fileids()
    feature_set_tuples = get_feature_set_tuples(feature_set_file)
    out_file = open(output_file, 'w')
    # iterate over the file ids directly instead of indexing with range
    for current_id in ids:
        out_string = ''
        current_file = dataset + '/' + current_id
        e = current_file.split('/')
        out_string = out_string + current_file + ' ' + e[-2]
        tagged_file = nltk.pos_tag(files.words(current_id))
        for feature in feature_set_tuples:
            count = 0
            for tag in tagged_file:
                if feature == tag:
                    count = count + 1
            out_string = out_string + " " + feature[1] + feature[0] + ':' + str(count)
        out_file.write(out_string + '\n')
        out_file.flush()
def tag(self, path, files):
    corpus = PlaintextCorpusReader(path, files)
    count = 1
    taggedDocs = {}
    nes = {}
    for f in files:
        print("Tagging file " + f + ", " + str(count) + "/" + str(len(files)))
        count += 1
        chunked = self.process(corpus.raw(f))
        for i in range(len(chunked)):
            for j in range(len(chunked[i])):
                chunked[i][j] = self.neTagSentence(chunked[i][j])
        taggedDocs[f] = chunked
        foundNes = []
        for split in chunked:
            for sentence in split:
                foundNes += self.getNesFromTree(sentence)
        nes[f] = foundNes
    return (taggedDocs, nes)
def extractWordsOnly(self, article):
    templist = []
    listtextstring = []
    articlename = article + '.txt'
    #corpus_root = '/home/jesal/onedump/'
    wl = PlaintextCorpusReader(corpus_root, '.*')
    allwords = wl.words(fileids=articlename)
    exturllist = self.extractexternalURL(article)
    textstring = wl.raw(articlename)
    for item in exturllist:
        textstring = textstring.replace(item, ' ')
    #templist = re.sub(r'[.!,;?]', ' ', textstring).split()
    templist = nltk.word_tokenize(textstring)
    listtemp = []
    for i in templist:
        j = re.sub('[^A-Za-z]+', '', i)
        listtemp.append(str(j))
    templistfinal = []
    templistfinal = self.removeEmpty(listtemp)
    return templistfinal
def generate_custom_lists(project_dir):
    """
    Same as generate_existing_lists, but on custom corpora that reside in
    root/custom_corpora. Write a pickle file for each corpus.
    """
    from nltk.corpus import PlaintextCorpusReader
    import os

    for dir in os.scandir("{}/custom-corpora/".format(project_dir)):
        if dir.is_dir():
            print("generating list for custom corpus '{}'".format(dir.name))
            custom_word_tags = defaultdict(set)
            custom_corpus = PlaintextCorpusReader(
                root="{}/custom-corpora/{}/".format(project_dir, dir.name),
                fileids=".*")
            # tokenize and tag sentences
            try:
                tags = get_tags_sentence(list(custom_corpus.sents()))
            except ValueError:
                print("No sentences found for corpus '{0}'. Did you place your"
                      " text files inside of /custom-corpora/{0}/ ?".format(dir.name))
            else:
                for tag in tags:
                    custom_word_tags[tag[-1]].update([tag[0]])
            # write results, dump .pkl regardless if empty
            with open("{}/pre-generated-lists/custom_word_tags_{}.pkl".format(
                    project_dir, dir.name), "wb") as outfile:
                pickle.dump(custom_word_tags, outfile)
            print("Completed dumping of `{}` custom corpus. {} total words "
                  "saved".format(
                      dir.name,
                      sum([len(values) for values in custom_word_tags.values()])))
def LDA_pretreatment(filename):
    # read words from files
    corpus_root = filename
    file_pattern = r".*"
    ptb = PlaintextCorpusReader(corpus_root, file_pattern)
    # keep only words longer than six characters
    initialwords = [[w for w in ptb.words(fileid) if len(w) > 6]
                    for fileid in ptb.fileids()]
    # remove misspelled words
    spellcheckedwords = [[w for w in document if d.check(w)]
                         for document in initialwords]
    # convert to lowercase
    lowerwords = [[w.lower() for w in document] for document in spellcheckedwords]
    # lemmatize with WordNet
    wnl = nltk.WordNetLemmatizer()
    rectifywords = [[wnl.lemmatize(s) for s in document] for document in lowerwords]
    # remove stopwords
    stopwords = nltk.corpus.stopwords.words('english')
    finitialwords = [[w for w in document if w not in stopwords]
                     for document in rectifywords]
    return finitialwords
def print_my_count(self, corpus, patt, n):
    wordlists = PlaintextCorpusReader(corpus, patt)
    fileids = wordlists.fileids()
    for id in fileids:
        words = wordlists.words(id)
        wordt = len(words)
        wordc = len(set(words))
        wor = "=> corpus tokens : " + `wordt`
        dis = "=> distinct token types : " + `wordc`
        ric = "=> ind lex richness : " + `wordt / wordc`
        print "********************************************"
        print "1. Corpus parameters", "(= " + id + ")"
        print "********************************************"
        print dis
        print ric
        print wor
        print "********************************************"
        fre = FreqDist(word.lower() for word in words if word.isalpha())
        print "2. Top 100 words"
        print "********************************************"
        for word in fre.keys()[:n]:
            t = word, `fre[word]` + "/" + `wordt`
            print t
def read_corpus(path, pattern):
    cr = PlaintextCorpusReader(path, pattern)
    for fid in cr.fileids():
        code = cr.raw(fid)
        try:
            tokens = list(tokenize.generate_tokens(io.StringIO(code).readline))
            tokens = [
                token for t in tokens
                if t.type == tokenize.NAME or t.type == tokenize.OP
                for token in t.string.split(" ")
            ]
            if tokens:
                # print("Task: %s; Color: %s" % (programming_task.name, color_val))
                yield TaggedDocument(tokens, fid)
        except tokenize.TokenError as e:
            # print("%s: %s" % (type(e).__name__, e))
            pass
        except IndentationError as e:
            # print("%s: %s" % (type(e).__name__, e))
            pass
        except Exception as e:
            print("%s: %s" % (type(e).__name__, e))
            pass
def my_bar(self, corpus, patt, n):
    wordlists = PlaintextCorpusReader(corpus, patt)
    fileids = wordlists.fileids()
    k = len(fileids)
    figA = pylab.figure(1)
    figB = pylab.figure(2)
    li = ['Corpus']
    for id in fileids:
        if k > 1:
            i = fileids.index(id) + 1
            words = wordlists.words(id)
            fre = FreqDist(word.lower() for word in words if word.isalpha())
            self.bar_count(fre, n, figA, 2 * k, 2 * i, id, li)
            self.bar_freq(fre, n, figB, 2 * k, 2 * i, id, li)
            figA.savefig('../data/complex-freq.pdf')   # was '..data/...', missing the path separator
            figB.savefig('../data/complex-relfreq.pdf')
        else:
            words = wordlists.words(id)
            fre = FreqDist(word.lower() for word in words if word.isalpha())
            self.bar_count(fre, n, figA, k, 1, id, li)
            self.bar_freq(fre, n, figB, k, 1, id, li)
            figA.savefig('../data/simple-freq.pdf')
            figB.savefig('../data/simple-relfreq.pdf')
    pylab.show()
def __init__(self, dname, sfile=None, stemmer=None):
    if not os.path.isdir(os.path.abspath(dname)):
        raise FileExistsError('invalid directory!')
    if sfile == "":
        self.stop_words = ()
    elif sfile is None:
        self.stop_words = set(nltk.corpus.stopwords.words('english'))
    else:
        if not os.path.exists(sfile):
            raise FileExistsError('invalid file!')
        else:
            reader = PlaintextCorpusReader(str(os.getcwd()), sfile)
            self.stop_words = set(reader.words([sfile]))
    self.root = os.path.abspath(dname)
    if isinstance(stemmer, nltk.stem.porter.PorterStemmer) or \
            isinstance(stemmer, nltk.stem.snowball.SnowballStemmer) or \
            stemmer is None:
        self.stemmer = stemmer
    else:
        raise Exception('invalid stemmer')
def __init__(self, corpus, tf="raw", idf="base",
             stopword=nltk.corpus.stopwords.words('english'),
             stemmer=PorterStemmer(), ignorecase="yes"):
    self.corpus = corpus
    self.tf_key = tf
    self.idf_key = idf
    # Setup stop words
    if stopword == "none":
        self.stop_words = ()
    elif stopword is None or stopword == nltk.corpus.stopwords.words('english'):
        self.stop_words = set(nltk.corpus.stopwords.words('english'))
    else:
        if not os.path.exists(stopword):
            raise FileExistsError('Invalid stopword file!')
        else:
            reader = PlaintextCorpusReader(str(os.getcwd()), stopword)
            self.stop_words = set(reader.words([stopword]))
    self.stemmer = stemmer
    self.ignorecase = ignorecase
    # Setup caching self variables
    self.dim = set()
    self.tf = {}
    self.idf = {}
    self.tf_idf_res = {}
    self.words_dict = {}
def read(self):
    portuguese_sent_tokenizer = nltk.data.load("tokenizers/punkt/portuguese.pickle")
    newcorpus = PlaintextCorpusReader(
        os.path.join(self.s.path, "leis"), ".*",
        sent_tokenizer=portuguese_sent_tokenizer)
    ponctuation = [
        ".", "!", "-", "?", ",", "lei", "artigo", ")", "(",
        "subchefia", ":", "presidente", "$", "°"
    ]
    #print(newcorpus.words())
    for files in newcorpus.fileids():
        words = newcorpus.words(files)
        words = [w.lower() for w in words]
        filtered_words = [word for word in words
                          if word not in stopwords.words("portuguese")]
        filtered_words = [word for word in filtered_words
                          if word not in ponctuation]
        fd1 = nltk.FreqDist(filtered_words)
        print(files, fd1.most_common(10))
def occStats(self, path, format, list, plotting):
    wordlists = PlaintextCorpusReader(path, format)
    fileids = wordlists.fileids()
    k = len(fileids)
    # computing rel frequencies
    self.fileStats(path, fileids)
    # closing threads
    #self.pool.close()
    #self.pool.join()
    # plotting vars
    figname = "Base GQs"
    figpath = plotting + '/' + figname.replace(' ', '-') + '-stats.pdf'
    savpath = plotting + '/' + figname.replace(' ', '-')
    # plotting
    MyPlot(self.stats, self.classstats, figname, "one", plotting, list)  # all
    # generating report
    SaveStats(self.classstats, self.stats, figpath, savpath, plotting)  # all
################### Correct character encoding ############################
#coding: utf-8
###########################################################################
####################### Required libraries ################################
import nltk
from nltk.corpus import PlaintextCorpusReader
import os
###########################################################################
###################### Initial configuration ##############################
os.chdir('/home/joaopedropp/Uptpt/')
A = os.listdir('/home/joaopedropp/Uptpt/')
textos = '/home/joaopedropp/Uptpt/'
t = PlaintextCorpusReader(textos, '.*')
###########################################################################
###################### Classification vectors #############################
Tele = [
    "dsl", "amps", "analógico", "amplificador", "ansi", "antena", "banda",
    "broadband", "cdma", "comutação", "discagem", "dsl", "dtmf", "erlang",
    "espectro", "rádio-base", "frequência", "gsm", "hertz", "interferência",
    "isdn", "largura de banda", "modulação", "qpsk", "multimídia",
    "multiplexador", "pcm", "propagação", "roaming", "ruído", "sinalização",
    "tdd", "tdm", "tdma", "telefonia fixa", "canal", "viva-voz", "voip"
]
Eletro = [
    'amperímetro', 'capacitância', 'capacitor', 'circuito', 'tensão', 'smd',
    'corrente alternada', 'corrente contínua', 'diodo', 'eletromagnetismo',
    'fet', 'led', 'mosfet', 'multímetro', 'ohmímetro', 'potenciômetro',
def extract_data(self, filepath, ind_features=_PARAIND_FEAT,
                 dep_features=_PARADEP_FEAT, labels_per_sent=None,
                 labels_per_window=None):
    """Extract features, reduce dimensions with a PCA and return data.
    Exports raw- and PCA-reduced data both in arff- and numpy-format.
    """
    start = time.clock()
    self.dictVectorizer = DictVectorizer(sparse=False)
    filename = os.path.split(filepath)[1]
    directory = os.path.split(filepath)[0]
    plain_reader = PlaintextCorpusReader(
        directory, [filename],
        word_tokenizer=RegexpTokenizer("(-?\d+\.\d+)|[\w']+|[" + string.punctuation + "]"),
        sent_tokenizer=LineTokenizer(blanklines="discard"),
        encoding='utf8')

    # create new subdir for extracted data
    if _NEW_SUBDIR is not None:
        path = os.path.join(directory, _NEW_SUBDIR)
        if not os.path.exists(path):
            os.makedirs(path)
        path = os.path.join(path, os.path.splitext(filename)[0])
        # print "path {}".format(path)
    else:
        path = os.path.splitext(filepath)[0]
        # print "path {}".format(path)

    # filepaths for weka- and numpy-files
    arff_filepath = path + ".arff"
    arff_filepath_pca = path + "_pca95.arff"
    numpy_filepath = path + ".npy"
    numpy_filepath_pca = path + "_pca95.npy"

    # print(":time: Reader created, time elapsed {}").format(time.clock() - start)
    paras = plain_reader.paras()
    # print(":time: Paras created, time elapsed {}").format(time.clock() - start)
    sents = plain_reader.sents()
    # print(":time: Sents created, time elapsed {}").format(time.clock() - start)

    # get paragraph boundaries for sliding-window
    self.boundaries = util.get_boundaries(paras)
    boundaries_backup = self.boundaries

    # check if all files necessary exist, if yes - unpickle/load them and return data
    if util.files_already_exist([numpy_filepath_pca, ]):
        print "Features already extracted. Calculating clusters...\n"
        matrix_sklearn_pca = numpy.load(numpy_filepath_pca)
        return filepath, self.boundaries, matrix_sklearn_pca, len(sents)

    # save correct target-labels and additional info of current data
    targets_path = open(path + ".tbs", "wb")
    pickle.dump((labels_per_sent, labels_per_window, boundaries_backup,
                 len(sents), _WINDOW_SIZE, _STEP_SIZE), targets_path)

    # print(":time: Boundaries calculated, time elapsed {}").format(time.clock() - start)
    self.data = self.extract_features(sents, _WINDOW_SIZE, _STEP_SIZE,
                                      ind_features, dep_features)
    # self.data[year] = self.extract_features_para(paras, ind_features, dep_features)
    # print(":time: Features extracted, time elapsed {}").format(time.clock() - start)
    self.all_features = self.unified_features(self.data)
    # print(":time: Unified features, time elapsed {}").format(time.clock() - start)
    matrix_sklearn = self.feature_matrix_sklearn(self.generator_data(self.data))
    # print(":time: Matrix sklearn created, time elapsed {}").format(time.clock() - start)
    matrix_sklearn = util.normalize(matrix_sklearn)
    # print(":time: Matrix normalized, time elapsed {}").format(time.clock() - start)

    print "Exporting raw-data..."
    util.export_arff(matrix_sklearn, self.dictVectorizer.get_feature_names(),
                     arff_filepath, filename + "_RAW", labels_per_window,
                     file_info=None)
    numpy.save(numpy_filepath, matrix_sklearn)
    # print "matrix dimensions before pca: {}".format(matrix_sklearn.shape)

    feature_names, feature_names_part = None, None
    if _DO_PCA:
        print "PCA calculation..."
        matrix_sklearn_pca, feature_names = util.pca(
            matrix_sklearn, self.dictVectorizer.get_feature_names())
        util.export_arff(matrix_sklearn_pca, feature_names, arff_filepath_pca,
                         filename + "_PCA95", labels_per_window, file_info=None)
        numpy.save(numpy_filepath_pca, matrix_sklearn_pca)
        del matrix_sklearn
        gc.collect()
    return filepath, boundaries_backup, matrix_sklearn_pca, len(sents)
# Text processing file. Ingest and initial processing
# Project: Congressional Testimony Essay
# Ian P. Cook
import os, re, csv, string, operator
import nltk
from nltk.corpus import PlaintextCorpusReader
import time

start_time = time.time()
dir = '/Users/ian/Dropbox/Academia/Dissertation/testimony/hearings/112Congress/'
corpus_root = dir
hearings = PlaintextCorpusReader(corpus_root, '.*')
cfd = nltk.ConditionalFreqDist(
    (target, fileid[-4:-9])   # note: [-4:-9] is an empty slice; [:-4] may have been intended
    for fileid in hearings.fileids()
    if os.path.getsize(fileid) > 0
    for w in hearings.words(fileid)
    for target in ['apologize', 'regret']
    if w.lower().startswith(target))
print(time.time() - start_time)
cfd.plot()

# The above code works, with one exception: longer/multi-word "targets" cause an
# error: "local variable 'legend-loc' referenced before assignment". Not sure what
# that is about.
# Working on getting the filenames into a CSV: pass the function a directory and it
# should traverse all the files and subfolders, read the filenames, and put them
# into the CSV's first column.
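# A minimal sketch of the CSV step described above, assuming Python 3 and reusing
# the csv/os imports from this script; walk_to_csv and the output filename are
# illustrative assumptions, not part of the original code.
def walk_to_csv(directory, out_csv='filenames.csv'):
    """Traverse directory and all subfolders, writing each filename into column 1."""
    with open(out_csv, 'w', newline='') as f:
        writer = csv.writer(f)
        for root, dirs, filenames in os.walk(directory):
            for name in filenames:
                writer.writerow([os.path.join(root, name)])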
# CALCULATION OF TF VALUES
# TF = (NO OF TIMES A TERM APPEARS IN A SENTENCE) / (NO OF TERMS IN THAT SENTENCE)
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

corpus_root = 'C:\MyData\PythonPractice\Mycorpus'
wordlists = PlaintextCorpusReader(corpus_root, 'resort.*\.txt')
print('\nFollowing file ids are there in this corpus: \n ')
print(wordlists.fileids())
print("\nNumber of sentences in the file are :")
sencount = len(wordlists.sents(fileids=['resort.txt']))
print(sencount)
print('\n Sentences are : \n')
sentences = wordlists.sents(fileids='resort.txt')
print(sentences)
sample = wordlists.raw("resort.txt")
s = sample.split('.')
# NUMBER OF TIMES A TERM APPEARS IN EACH SENTENCE
# NUMBER OF TERMS IN EACH SENTENCE
wordfreq = []
term_freq = []
terms_count_doc = []
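# A hedged sketch of the TF computation the comments above describe; the original
# script is cut off here, so this loop is illustrative only.
# TF(term, sentence) = count(term in sentence) / number of terms in the sentence
for sent in sentences:
    counts = {}
    for term in sent:
        counts[term] = counts.get(term, 0) + 1
    term_freq.append({term: count / len(sent) for term, count in counts.items()})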
# Module 3: Corpus
# Own Corpus
# Author: Dr. Alfred
from nltk.corpus import PlaintextCorpusReader

corpus_root = 'corpus'
my_corpus = PlaintextCorpusReader(corpus_root, '.*')
# print(my_corpus.fileids())
fileid = 'file1.txt'
text = my_corpus.raw(fileid)
print(text)
print(" Num of chars :", len(my_corpus.raw(fileid)))
print(" Num of words :", len(my_corpus.words(fileid)))
print(" Num of sentences :", len(my_corpus.sents(fileid)))
# let's make our program compatible with Python 3.0/1/2/3
from __future__ import division, print_function
from future_builtins import ascii, filter, hex, map, oct, zip

search_word = 'samsung'  # one-word string for this program

import os  # operating system commands
import re  # regular expressions
import nltk  # draw on the Python natural language toolkit
from nltk.corpus import PlaintextCorpusReader
from numpy import *  # for array calculations

# create lists of positive and negative words using Hu and Liu (2004) lists
my_directory = '/Users/ngaonkar/Desktop/Predict542/final_project'
positive_list = PlaintextCorpusReader(my_directory, 'Hu_Liu_positive_word_list.txt',
                                      encoding='latin-1')
negative_list = PlaintextCorpusReader(my_directory, 'Hu_Liu_negative_word_list.txt',
                                      encoding='latin-1')
positive_words = positive_list.words()
negative_words = negative_list.words()

# define bag-of-words dictionaries
def bag_of_words(words, value):
    return dict([(word, value) for word in words])

positive_scoring = bag_of_words(positive_words, 1)
negative_scoring = bag_of_words(negative_words, -1)
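# A hedged usage sketch, not part of the original script: score a tokenized review
# by summing +1/-1 lookups in the two bag-of-words dictionaries defined above.
def score_words(words):
    return sum(positive_scoring.get(w.lower(), 0) + negative_scoring.get(w.lower(), 0)
               for w in words)

# e.g. score_words(['great', 'battery', 'terrible', 'screen']) -> net sentiment score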
print(result)
# 3b) If it is a NP, print out the text inside it
print("Matching texts in NP chunks: ")
for subtree in result.subtrees():
    if subtree.label() == "NP":
        subtree = list(map(lambda x: x[0], subtree.leaves()))
        subtree = " ".join(subtree)
        print(subtree)
print()


if __name__ == '__main__':
    # Initialize parser
    grammar = r"""
    NP: {<NNP>*}
        {<DT>?<JJ>?<NNS>}
        {<NN><NN>}
    """
    cp = nltk.RegexpParser(grammar)
    # Initialize corpus reader
    corpus_reader = PlaintextCorpusReader(root="./SpaceX", fileids=".*\.txt")
    # Tag all sentences
    sents = corpus_reader.sents("SpaceX.txt")
    tagged_sents = nltk.pos_tag_sents(sents)
    # Run parser on first five sentences
    run_parser(cp, tagged_sents[:5])
def build_index(in_dir, out_dict, out_postings):
    """
    build index from documents stored in the input directory,
    then output the dictionary file and postings file
    """
    print('indexing...')

    # reading the files
    corpus = PlaintextCorpusReader(in_dir, '.*')
    file_names_str = corpus.fileids()
    file_names = sorted(map(int, file_names_str))

    # Load corpus and generate the postings dictionary
    postings = defaultdict(dict)
    tokens = list()
    for docID in file_names:
        content = corpus.raw(str(docID))  # read file content
        content = preprocess(content)
        words = tokenize(content)  # tokenization: content -> words
        tokens = stemming(words)  # stemming

        if phrasal_query:
            token_len = defaultdict(list)
        else:
            token_len = defaultdict(int)

        # count how many times each token appears in the file (and, for phrasal
        # queries, record its term positions)
        term_pos = 0
        for token in tokens:
            if phrasal_query:
                if token in token_len.keys():
                    token_len[token][0] += 1
                    token_len[token][1].append(term_pos)
                else:
                    token_len[token] = [1, [term_pos]]
            else:
                token_len[token] += 1
            term_pos += 1

        '''
        Generate weighted token frequency.
        Generate dictionary of key -> token, value -> a dict with k,v as
        file_name, weighted_token_frequency
        '''
        if phrasal_query:
            weighted_tokenfreq = normalize(
                [get_tf(y[0]) for (x, y) in token_len.items()])
            for ((token, freq), w_tf) in zip(token_len.items(), weighted_tokenfreq):
                postings[token][docID] = PhrasalToken(freq[0], freq[1], w_tf)
        else:
            weighted_tokenfreq = normalize(
                [get_tf(y) for (x, y) in token_len.items()])
            for ((token, freq), w_tf) in zip(token_len.items(), weighted_tokenfreq):
                postings[token][docID] = Token(w_tf)

    '''
    Output dictionary and postings files
    - Dictionary file stores all the tokens, with their doc frequency and their
      offset in the postings file.
    - Postings file stores the list of tuples -> (document ID, term freq).
    '''
    # write postings file
    dictionary = defaultdict(Entry)
    #print(postings.items())
    with open(out_postings, mode="wb") as postings_file:
        for key, value in postings.items():
            #print(value)
            '''
            len(value) := the document frequency of the token
                       := the number of documents the token appears in
            offset := current writing position of the postings file
            '''
            offset = postings_file.tell()
            # serialize once and record how many bytes were written; the original
            # wrote each value twice (pickle.dump followed by pickle.dumps)
            size = postings_file.write(pickle.dumps(value))
            dictionary[key] = Entry(len(value), offset, size)

    # write dictionary file
    with open(out_dict, mode="wb") as dictionary_file:
        pickle.dump(url_map, dictionary_file)
        pickle.dump(doc_id_map, dictionary_file)
        pickle.dump(pr_result, dictionary_file)
        pickle.dump(dictionary, dictionary_file)

    print("dictionary done")
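# A minimal read-back sketch, not from the original project: load one token's
# postings using the Entry written by build_index; it assumes Entry exposes the
# recorded byte offset as an attribute named offset.
def load_postings(token, dictionary, postings_path):
    entry = dictionary[token]
    with open(postings_path, mode="rb") as postings_file:
        postings_file.seek(entry.offset)   # jump to this token's postings
        return pickle.load(postings_file)  # dict of docID -> Token/PhrasalToken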
'''
Statistics about a given corpus
'''
from math import sqrt
from nltk.corpus import PlaintextCorpusReader

#corpus_root = "/Users/epb/Documents/uni/kandidat/speciale/data/personae/data_50"
#corpus_root = "/Users/epb/Documents/uni/kandidat/speciale/data/fed_papers/F3"
corpus_root = "/Users/epb/Documents/uni/kandidat/speciale/data/dw/almedad/al3"
#corpus_root = "/Users/epb/Documents/uni/kandidat/speciale/data/dw/ansar1/an9"
#corpus_root = "/Users/epb/Documents/uni/kandidat/speciale/data/blog_corpus/B2"
#corpus_root = "/Users/epb/Documents/uni/kandidat/speciale/data/test/test1_64"

corpus = PlaintextCorpusReader(corpus_root, '.*txt', encoding="UTF-8")
n_texts = len(corpus.fileids())
txt_lengths = []
text_classes = []

DISTINCT_AUTHORS = True
TEXT_STATS = True

for text in corpus.fileids():
    if DISTINCT_AUTHORS:
        found_category = text.partition(".")[0]
        text_classes.append(found_category)
    if TEXT_STATS:
        wrd_tokens = corpus.words(text)
# Retrieve a file list
import os
import nltk

files = os.listdir(".")
print "All the files in the directory"
# Starts a loop that prints the name of each file,
# with the condition that it ends with ".txt"
for file in files:
    if file.endswith(".txt"):
        print file

file_name = raw_input("Choose the file:")
print "The file that was chosen is {0}".format(file_name)

from nltk.corpus import PlaintextCorpusReader
corpus_root = "."  # "." means the existing directory I am in
search_text = PlaintextCorpusReader(corpus_root, file_name)
search_text = nltk.Text(search_text.words())  # creates text object
keyword = raw_input("Specify word to search:")
search_text.concordance(keyword, 80, lines=30)

## NEW THING ##
from nltk.corpus import PlaintextCorpusReader
corpus_root = '.'
search_text = PlaintextCorpusReader(corpus_root, file_name)
search_text = nltk.Text(search_text.words())
# from nltk.corpus import stopwords
## path is andreaantenan/Desktop/cs195/nltk_data/corpora/stopwords/english.txt
from sklearn.ensemble import RandomForestClassifier  # scikit-learn random forest classifier
from sklearn.neighbors import KNeighborsClassifier  # scikit-learn KNN classifier
from sklearn.svm import SVC  # scikit-learn support vector machine classifier
from sklearn import metrics  # evaluation metrics
from sklearn import cross_validation  # split into training and test sets
from nltk.corpus import PlaintextCorpusReader
from gensim.models import word2vec
import numpy as np
from numpy import *

# Load our own corpus
print('Loading corpus...')
corpus_root_neg = r"E:\Strange\SRTP.11\NLP\data\ChnSentiCorp_htl_ba_6000\neg_pre6000"
corpus_root_pos = r"E:\Strange\SRTP.11\NLP\data\ChnSentiCorp_htl_ba_6000\pos_pre6000"
neg = PlaintextCorpusReader(corpus_root_neg, '.*')
pos = PlaintextCorpusReader(corpus_root_pos, '.*')
documents_neg = [(list(neg.words(fileid)), 0) for fileid in neg.fileids()]
documents_pos = [(list(pos.words(fileid)), 1) for fileid in pos.fileids()]
documents_neg.extend(documents_pos)
documents = documents_neg
random.shuffle(documents)  # shuffle randomly

sentences = word2vec.Text8Corpus(
    r"E:\Strange\SRTP.11\NLP\data\ChnSentiCorp_htl_ba_6000\merge\1000.txt"
)  # load the word-vector training corpus
model = word2vec.Word2Vec(sentences, size=150, min_count=1)  # train a word2vec model from scratch
# Incremental training
#print('Loading model...')
import nltk
from nltk.corpus import PlaintextCorpusReader

mycorpus = PlaintextCorpusReader('.', '.*.txt')
mycorpus.fileids()
part2 = mycorpus.fileids()[1]
part2
part2string = mycorpus.raw('state_union_part2.txt')
part2tokens = nltk.word_tokenize(part2string)
part2tokens[:100]
len(part2string)
len(part2tokens)
alphapart2 = [w for w in part2tokens if w.isalpha()]
alphapart2[:100]
alphalowerpart2 = [w.lower() for w in alphapart2]
alphalowerpart2[:50]
stopwords = nltk.corpus.stopwords.words('english')
len(stopwords)
stopwords
stoppedalphalowerpart2 = [w for w in alphalowerpart2 if w not in stopwords]

from nltk import FreqDist
fdist = FreqDist(stoppedalphalowerpart2)
fdistkeys = list(fdist.keys())
fdistkeys[:50]
print('Printing top 50 words by frequency: ')
topkeys = fdist.most_common(50)
for pair in topkeys:
    print(pair)

from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(alphalowerpart2)
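# A hedged continuation sketch (not in the original transcript): filter the
# collocation finder built above and list the top bigrams by likelihood ratio.
finder.apply_freq_filter(2)                          # keep bigrams seen at least twice
finder.apply_word_filter(lambda w: w in stopwords)   # drop bigrams containing stopwords
print(finder.nbest(bigram_measures.likelihood_ratio, 20))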
if op == '':
    op = '1'
dataset = int(op)

if dataset == 2:  # scientific articles
    corpus_root = '/home/mguevara/Dropbox/DOCTORADO/TECNOLOGIA BUSQUEDA/Tarea1/SolucionconNLTK/articulos2'
    export_indices = '/home/mguevara/datasets/info/indices/articulos/'
    export_matrices = '/home/mguevara/datasets/info/matrices/articulos/'
    export_vocabularios = '/home/mguevara/datasets/info/vocabularios/articulos/'
    exp_archivos = '.*'
    termino_ejemplo = 'actor'  # 'articulo857.txt'
    documento_ejemplo = '857'
    print_titulo("CREAR CORPUS")
    from nltk.corpus import PlaintextCorpusReader
    corpus = PlaintextCorpusReader(corpus_root, exp_archivos)

if dataset == 3:
    export_indices = '/home/mguevara/datasets/info/indices/reuters/'
    export_matrices = '/home/mguevara/datasets/info/matrices/reuters/'
    export_vocabularios = '/home/mguevara/datasets/info/vocabularios/reuters/'
    #exp_archivos = '.*'
    termino_ejemplo = 'aguila'  # 'articulo857.txt'
    documento_ejemplo = 'training/844'
    print_titulo("CREAR CORPUS")
    from nltk.corpus import reuters
    corpus = reuters  # reuters27000
    #corpus_root = '/home/mguevara/datasets'
    #exp_archivos = 'reuters/.*'
import nltk
from nltk import FreqDist
from nltk.corpus import PlaintextCorpusReader

corpus_root = '/Users/sydshir/Desktop/Code'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()
pillar = nltk.Text(wordlists.words('cityandthepillar.txt'))
pillar[:10]
pillar.concordance('gay')
list(nltk.bigrams(pillar))

# Frequency distribution calculator
fdist1 = FreqDist(pillar)
print(fdist1)
fdist1.most_common(50)
fdist1['gay']

# Bigram-based generator: repeatedly emit the most likely next word
def generate_model(cfdist, word, num=1):
    for i in range(num):
        print(word, end=' ')
        word = cfdist[word].max()

text = nltk.Text(wordlists.words('cityandthepillar.txt'))
bigrams = nltk.bigrams(pillar)
cfd = nltk.ConditionalFreqDist(bigrams)
cfd['gay']
generate_model(cfd, 'gay')
plt.title('Number of Unique Words', fontsize=20)

plt.subplot(1, 3, 3)
plt.barh(y_pos, data_wpm_sort.words_per_posts, align='center')
plt.yticks(y_pos, data_wpm_sort.Anio)
plt.title('Number of Words Per Posts', fontsize=20)

plt.tight_layout()
plt.show()

# Word frequency
import nltk
from nltk.corpus import PlaintextCorpusReader

#corpus_root = './python_projects/blog'
corpus_root = path
wordlists = PlaintextCorpusReader(corpus_root, '.*', encoding='latin-1')
#wordlists.fileids()  # this lists the files in the directory
cfd = nltk.ConditionalFreqDist(
    (word, genre)
    for genre in anios
    for w in wordlists.words(genre + '.txt')
    for word in ['casa', 'mundo', 'tiempo', 'vida']
    if w.lower().startswith(word))
cfd.plot()  # (doesn't work)

# Sentiment analysis
'''
from classifier import SentimentClassifier
import sys
import os.path
import math
import nltk
from nltk.corpus import PlaintextCorpusReader

# sys.argv.append('./gold/pku_training_words.utf8')
# sys.argv.append('./training/pku_training.utf8')
# sys.argv.append('./testing/pku_test.utf8')
assert len(sys.argv) == 4

with open(sys.argv[1], 'rt', encoding='utf8') as f:
    training_words = [w.strip() for w in f.readlines()]

training = PlaintextCorpusReader(*os.path.split(sys.argv[2]))
training_words += list(training.words())
#training_words = list(training.words())

N = len(training_words)
V = len(set(training_words))
fdist = nltk.FreqDist(training_words)
# add-one smoothed log-probabilities: log((count + 1) / (N + V))
fdist = dict([(w, math.log((c + 1.0) / (N + V))) for w, c in fdist.items()])
defprob = math.log(1.0 / (N + V))

with open(sys.argv[3], 'rt', encoding='utf8') as f:
    test = f.readlines()


def get_DAG(sentence):
    DAG = {}
    T = len(sentence)