def compare(request):
    errors = []
    statistics = []
    stats = []
    for x in range(1, 3):
        cantoname = "canto" + str(x) + ".txt"
        w = PlaintextCorpusReader("./", cantoname)
        t = nltk.text.Text(w.words())
        l_lines = len(line_tokenize(w.raw()))
        l_uwords = len(set(w.words()))
        l_words = len(w.words())
        l_sents = len(w.sents())
        l_paras = len(w.paras())
        l_linperpara = l_lines / l_paras
        statistics.append(x)
        statistics.append("Number of Words - " + str(l_words))
        statistics.append("Number of Unique Words - " + str(l_uwords))
        statistics.append("Number of Sentences - " + str(l_sents))
        statistics.append("Number of Lines - " + str(l_lines))
        statistics.append("Number of Paras - " + str(l_paras))
        statistics.append("Number of Lines/Paras - " + str(l_linperpara))
        lexical_density = l_words / l_uwords
        l_wordpersent = l_words / l_sents
        statistics.append("Lexical Density (Total/Uniq) words - " + str(lexical_density))
        statistics.append("Words per sentence - " + str(l_wordpersent))
        stats.append(statistics)
    return render_to_response('compare.html', {'stats': statistics})
def stats(request):
    errors = []
    statistics = []
    if 'q' in request.GET:
        q = request.GET['q']
        if not q:
            errors.append('Enter a Canto Number')
        else:
            cantoname = "canto" + q + ".txt"
            w = PlaintextCorpusReader("./", cantoname)
            t = nltk.text.Text(w.words())
            l_lines = len(line_tokenize(w.raw()))
            l_uwords = len(set(w.words()))
            l_words = len(w.words())
            l_sents = len(w.sents())
            l_paras = len(w.paras())
            l_linperpara = l_lines / l_paras
            statistics.append("Number of Words - " + str(l_words))
            statistics.append("Number of Unique Words - " + str(l_uwords))
            statistics.append("Number of Sentences - " + str(l_sents))
            statistics.append("Number of Lines - " + str(l_lines))
            statistics.append("Number of Paras - " + str(l_paras))
            statistics.append("Number of Lines/Paras - " + str(l_linperpara))
            lexical_density = l_words / l_uwords
            l_wordpersent = l_words / l_sents
            statistics.append("Lexical Density (Total/Uniq) words - " + str(lexical_density))
            statistics.append("Words per sentence - " + str(l_wordpersent))
            return render_to_response('stats.html', {'statistics': statistics})
    return render_to_response('stats.html', {'errors': errors})
def extractParasInList(name):
    corpuslocation = '/Users/anis/seniorProject/aligned Paragraphs/algebra'
    reader = PlaintextCorpusReader(corpuslocation, '.*\.txt')
    # reader.paras() gives the list of paragraphs; every paragraph is a list of
    # sentences, and every sentence is a list of words, so it is a list of lists of lists.
    pList = []
    paragraphlist = reader.paras(name)  # e.g. 'simpleTuring.txt'
    numpara = len(paragraphlist)
    for sentlist in paragraphlist:
        # print sentlist
        numsent = len(sentlist)
        # print type(sentlist),
        # print numsent
        paraAsAList = []
        # loop through all the sentence lists and merge them into one list
        for i in range(numsent):
            paraAsAList = paraAsAList + sentlist[i]
        # print paraAsAList  # the whole paragraph as one list of words
        paraAsAString = ""
        for word in paraAsAList:
            paraAsAString = paraAsAString + word + " "
        # print paraAsAString
        pList.append(paraAsAString)
    # print len(pList)
    return pList
def train():
    wordlists = PlaintextCorpusReader('', file_path)
    st = stemmer()

    # Get blocks of text using NLTK
    words = wordlists.words(file_path)
    sents = wordlists.sents(file_path)
    paras = wordlists.paras(file_path)

    # LOGIC
    # If a sentence contains a known [posi/nega]tive word, count the instances of
    # words in that sentence as [posi/nega]tive

    # Count words
    word_features = []

    # Go through paragraphs
    for p in paras:
        # Score the paragraph
        score_positive_negative = 0
        for s in p:
            for word in s:
                word = st.stem(word)
                if word in words_positive:
                    score_positive_negative += 1
                elif word in words_negative:
                    score_positive_negative -= 1

        # Record the class of the paragraph for any words present
        for s in p:
            for word in s:
                word = st.stem(word)
                if score_positive_negative > 0:
                    word_features.append(({"word": word}, "+"))
                elif score_positive_negative < 0:
                    word_features.append(({"word": word}, "-"))
                else:
                    word_features.append(({"word": word}, " "))

    # Create and return classifier
    classifier = nltk.NaiveBayesClassifier.train(word_features)
    return classifier
def main():
    st = stemmer()

    # Get data
    wordlists = PlaintextCorpusReader('', file_path)
    words = wordlists.words(file_path)
    sents = wordlists.sents(file_path)
    paras = wordlists.paras(file_path)

    # Train
    classifier = train()

    # Get class probabilities (for MAP estimation)
    counts = {"P": 0, "-": 0, "N": 0}
    for i in range(0, len(paras)):
        for s in paras[i]:
            score_pos = 0
            score_neg = 0

            # Classify the sentence word by word
            for word in s:
                word = st.stem(word)
                feature = {"word": word}
                classified = classifier.classify(feature)
                if classified == "+":
                    score_pos += 1
                elif classified == "-":
                    score_neg += 1

            # Record result
            if score_pos > score_neg:
                counts["P"] += 1
            elif score_pos < score_neg:
                counts["N"] += 1
            else:
                counts["-"] += 1

    # Done!
    print counts
def extractParasInList(name):
    corpuslocation = '/home/aniszaman/seniorProject/combined/carnivore'
    reader = PlaintextCorpusReader(corpuslocation, '.*\.txt')
    # reader.paras() gives the list of paragraphs; every paragraph is a list of
    # sentences, and every sentence is a list of words, so it is a list of lists of lists.
    pList = []
    paragraphlist = reader.paras(name)  # e.g. 'simpleTuring.txt'
    numpara = len(paragraphlist)
    for sentlist in paragraphlist:
        numsent = len(sentlist)
        paraAsAList = []
        # loop through all the sentence lists and merge them into one list
        for i in range(numsent):
            paraAsAList = paraAsAList + sentlist[i]
        paraAsAString = ""
        for word in paraAsAList:
            paraAsAString = paraAsAString + word + " "
        pList.append(paraAsAString)
    return pList
def extract_data(self, filepath, ind_features=_PARAIND_FEAT,
                 dep_features=_PARADEP_FEAT, labels_per_sent=None,
                 labels_per_window=None):
    """Extract features, reduce dimensions with a PCA and return data.

    Exports raw- and PCA-reduced data both in arff- and numpy-format.
    """
    start = time.clock()
    self.dictVectorizer = DictVectorizer(sparse=False)
    filename = os.path.split(filepath)[1]
    directory = os.path.split(filepath)[0]
    plain_reader = PlaintextCorpusReader(
        directory,
        [filename],
        word_tokenizer=RegexpTokenizer("(-?\d+\.\d+)|[\w']+|[" + string.punctuation + "]"),
        sent_tokenizer=LineTokenizer(blanklines="discard"),
        encoding='utf8')

    # create new subdir for extracted data
    if _NEW_SUBDIR is not None:
        path = os.path.join(directory, _NEW_SUBDIR)
        if not os.path.exists(path):
            os.makedirs(path)
        path = os.path.join(path, os.path.splitext(filename)[0])
        # print "path {}".format(path)
    else:
        path = os.path.splitext(filepath)[0]
        # print "path {}".format(path)

    # filepaths for weka- and numpy-files
    arff_filepath = path + ".arff"
    arff_filepath_pca = path + "_pca95.arff"
    numpy_filepath = path + ".npy"
    numpy_filepath_pca = path + "_pca95.npy"

    # print ":time: Reader created, time elapsed {}".format(time.clock() - start)
    paras = plain_reader.paras()
    # print ":time: Paras created, time elapsed {}".format(time.clock() - start)
    sents = plain_reader.sents()
    # print ":time: Sents created, time elapsed {}".format(time.clock() - start)

    # get paragraph boundaries for sliding-window
    self.boundaries = util.get_boundaries(paras)
    boundaries_backup = self.boundaries

    # check if all necessary files exist; if yes, unpickle/load them and return data
    if util.files_already_exist([numpy_filepath_pca]):
        print "Features already extracted. Calculating clusters...\n"
        matrix_sklearn_pca = numpy.load(numpy_filepath_pca)
        return filepath, self.boundaries, matrix_sklearn_pca, len(sents)

    # save correct target-labels and additional info of current data
    targets_path = open(path + ".tbs", "wb")
    pickle.dump((labels_per_sent, labels_per_window, boundaries_backup,
                 len(sents), _WINDOW_SIZE, _STEP_SIZE), targets_path)

    # print ":time: Boundaries calculated, time elapsed {}".format(time.clock() - start)
    self.data = self.extract_features(sents, _WINDOW_SIZE, _STEP_SIZE,
                                      ind_features, dep_features)
    # self.data[year] = self.extract_features_para(paras, ind_features, dep_features)
    # print ":time: Features extracted, time elapsed {}".format(time.clock() - start)
    self.all_features = self.unified_features(self.data)
    # print ":time: Unified features, time elapsed {}".format(time.clock() - start)
    matrix_sklearn = self.feature_matrix_sklearn(self.generator_data(self.data))
    # print ":time: Matrix sklearn created, time elapsed {}".format(time.clock() - start)
    matrix_sklearn = util.normalize(matrix_sklearn)
    # print ":time: Matrix normalized, time elapsed {}".format(time.clock() - start)

    print "Exporting raw-data..."
    util.export_arff(matrix_sklearn, self.dictVectorizer.get_feature_names(),
                     arff_filepath, filename + "_RAW", labels_per_window,
                     file_info=None)
    numpy.save(numpy_filepath, matrix_sklearn)
    # print "matrix dimensions before pca: {}".format(matrix_sklearn.shape)

    feature_names, feature_names_part = None, None
    if _DO_PCA:
        print "PCA calculation..."
        matrix_sklearn_pca, feature_names = util.pca(
            matrix_sklearn, self.dictVectorizer.get_feature_names())
        util.export_arff(matrix_sklearn_pca, feature_names, arff_filepath_pca,
                         filename + "_PCA95", labels_per_window, file_info=None)
        numpy.save(numpy_filepath_pca, matrix_sklearn_pca)
    del matrix_sklearn
    gc.collect()
    return filepath, boundaries_backup, matrix_sklearn_pca, len(sents)
def main(print_out, motifs, chapter):
    wordlists = PlaintextCorpusReader('', 'Punctuated/pot_ch[12345]\.txt')
    # rep_words = nltk.FreqDist(brown.words())  # Get representative word counts
    st = LancasterStemmer()
    # st = RegexpStemmer('ing$|s$|e$', min=4)

    for i in range(1, 6):
        if i != chapter:
            continue

        g = nx.Graph()
        words = wordlists.words('Punctuated/pot_ch{!s}.txt'.format(str(i)))
        paras = wordlists.paras('Punctuated/pot_ch{!s}.txt'.format(str(i)))

        # Generate HTML
        # with open("test" + str(i) + ".txt", "w+") as fi:
        #     output = generate_html_level2(wordlists, st, words, paras, i)
        #     fi.write(output)

        json_dict = {}
        json_dict["nodes"] = []
        json_dict["edges"] = []

        # Get correlation coefficients
        corr_data = get_corr_coefs(wordlists, st, words, paras, print_out, motifs)
        corr_coefs = corr_data[0]
        corr_freqs = corr_data[1]

        # ---------------------------------- NetworkX ----------------------------------
        # Get NetworkX nodes
        nx_added_nodes = []
        for m1 in corr_coefs:
            g.add_node(m1)

        # Get NetworkX edges
        for m1 in corr_coefs:
            for m2 in corr_coefs[m1]:
                # Avoid repeats
                if m1 <= m2:
                    continue
                g.add_edge(m1, m2)
        # -------------------------------- End NetworkX --------------------------------

        # -------------------------------------- d3.js --------------------------------------
        # Get d3-js nodes
        json_node_numbers = dict()
        square_size = 0
        for m1 in corr_coefs:
            sz = int(min(corr_freqs[m1] / 3.0, 50)) * 3
            # print sz
            json_node = {"name": m1, "size": str(sz), "color": "#aaaaaa"}
            json_dict["nodes"].append(json_node)
            json_node_numbers[m1] = len(json_node_numbers)

        # Get d3-js edges
        m1m2 = 0
        for m1 in corr_coefs:
            for m2 in corr_coefs[m1]:
                # Avoid repeats
                if m1 <= m2:
                    continue
                # No need to worry about repeats, since corr_coefs won't contain them
                edge_size = corr_coefs[m1][m2]
                # print "ES " + m1 + "/" + m2 + ": " + str(edge_size)
                json_edge = {"name": m1 + "-" + m2,
                             "source": json_node_numbers[m1],
                             "target": json_node_numbers[m2],
                             "size": str(edge_size)}
                json_dict["edges"].append(json_edge)

        # Add boundary d3-js node
        json_dict["nodes"].append({"id": "the-end", "x": square_size, "y": square_size,
                                   "size": "1", "color": "#000000"})

        # Write JSON to file
        if not print_out:
            with open("OFFICIAL/data" + str(i) + ".json", "w+") as fi:
                fi.write("var json_str_" + str(i) + "=" +
                         json.dumps(json_dict, indent=2))
        else:
            mn = smin(motifs[0], motifs[1])
            mx = smax(motifs[0], motifs[1])
            path = ("OFFICIAL/inserts/" + mn + ("-" if len(mn) != 0 else "") +
                    mx + "-" + str(chapter) + ".html")
            print path
            with open(path, "w+") as fi:
                fi.write(corr_data[2])
NUM_WORDS = 20

wordlist = PlaintextCorpusReader('', 'ofk(_chap_[1234])?\.txt')

def clean_words(words):
    # convert everything to lower case
    words = [w.lower() for w in words]
    # remove periods from the ends of sentences
    words = [re.sub('\.', '', w) for w in words]
    # only keep alphabetic strings
    words = [w for w in words if w.isalpha()]
    # remove stopwords
    words = [w for w in words if w not in stopwords.words('english')]
    # do stemming: "goes" => "go"
    words = [nltk.PorterStemmer().stem(w) for w in words]
    return words

# Flatten each paragraph (a list of sentences) into a single list of words.
paras = [sum(para, []) for para in wordlist.paras('ofk.txt')]
words = clean_words(wordlist.words('ofk.txt'))

# Group the paragraphs into chunks of GROUP_LENGTH paragraphs each.
groups = []
for i in range(0, len(paras), GROUP_LENGTH):
    group = sum(paras[i:min(i + GROUP_LENGTH, len(paras))], [])
    groups.append(group)

# Build a word-frequency table for each group.
freqs = []
for group in groups:
    freq = FreqDist(clean_words(group))
    table = {w: freq[w] for w in freq}
    freqs.append(table)

top_words = [w for w in FreqDist(words)][:NUM_WORDS]

def get_word_freqs(word):
    return {'word': word,
            'values': [{'x': i, 'y': freqs[i].get(word, 0)} for i in range(len(freqs))]}
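# A small usage sketch (my addition, not from the original module): collect one
# series per top word, in the {'word', 'values': [{'x', 'y'}, ...]} shape that
# get_word_freqs produces for a plotting front-end.
word_freq_series = [get_word_freqs(w) for w in top_words]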
class Corpus:
    """
    The Corpus class creates a corpus, that is, a set of speeches to be analyzed.
    The constructor takes a list of files from the speeches folder as parameter,
    e.g. ["1977.txt", "1980.txt"].
    If the list is empty, the corpus is created with the complete collection of
    speeches (from 1975 to 2017).
    """

    def __init__(self, files):
        if not files:
            self.corpus = PlaintextCorpusReader('./data/speeches', '.*')
        else:
            self.corpus = PlaintextCorpusReader('./data/speeches', files)
        self.speech = Speech(self.corpus.raw(), self.corpus.words(),
                             self.corpus.sents(), self.corpus.paras(),
                             None, None, None, None)
        self.speeches = build_speeches_dict(self.corpus)
        self.years = [int(year.split('.')[0]) for year in self.corpus.fileids()]
        complementary_years = list(
            set(os.listdir("./data/speeches")) -
            set([str(year) + '.txt' for year in self.years]))
        if not files:
            self.complementary = None
            self.unique_words = None
        else:
            self.complementary = ComplementaryCorpus(complementary_years)
            self.unique_words = [
                word for word in self.speech.tokens
                if word not in self.complementary.speech.tokens
            ]

    def to_speeches_list(self):
        speeches_list = []
        for key, speech in self.speeches.items():
            speeches_list.append(speech.speech_to_dict())
        return speeches_list

    def print_graph(self, my_words):
        """
        :param my_words: list of words whose frequency is to be plotted
        :return: a frequency plot
        """
        cfd = nltk.ConditionalFreqDist(
            (target, fileid)
            for fileid in self.speeches.keys()
            for w in self.speeches[fileid].tokens
            for target in my_words
            if w.lower() == target)
        cfd.plot()

    def get_files(self):
        """
        :return: list of files in the Corpus object
        """
        return self.corpus.fileids()

    def unique_words_freq(self):
        """
        :return: the words in the corpus object that are unique to that corpus
                 (i.e., these words don't appear in the rest of the speeches)
        """
        if self.unique_words is None:
            return "The corpus contains all speeches, so no comparison can be made"
        else:
            return nltk.FreqDist(self.unique_words).most_common(50)

    def radiography(self):
        """
        Returns the lexical radiography of the corpus.
        :return: prints lexical analysis of the corpus
        """
        print("Lexical data for period from " + str(self.years[0]) +
              " to " + str(self.years[-1]))
        print(str(len(self.years)) + " total speeches")
        print(str(len(self.corpus.words())) + " total words")
        print(str(len(self.corpus.words()) / len(self.get_files())) + " words per speech")
        print("Frequency distribution:")
        print(self.speech.frequencies())
        print("Content words frequency distribution:")
        print(self.speech.most_frequent_content_words())
        print("Unique words frequency distribution:")
        print(self.unique_words_freq())
        print("Most frequent content bigrams:")
        print(self.speech.most_frequent_bigrams())
        print("Most frequent content trigrams:")
        print(self.speech.most_frequent_trigrams())
        print("#######################################")
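# A minimal usage sketch (my addition; the file names come from the constructor
# docstring and assume the speech files exist under ./data/speeches).
corpus = Corpus(["1977.txt", "1980.txt"])
print(corpus.get_files())   # -> ['1977.txt', '1980.txt']
corpus.radiography()        # prints the lexical summary of this sub-corpus
corpus.unique_words_freq()  # words that appear only in these two speeches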
def text_to_paras(name, book_dir):
    book_id = get_book_id(name)
    book_path = get_path(book_id, book_dir)
    corpus = PlaintextCorpusReader(book_path, '.*')
    paragraphs = corpus.paras('book%s.txt' % book_id)
    return paragraphs
def text_to_paras(book_id):
    # book_dir points at the directory holding the .txt files
    book_path = get_path(book_id, book_dir)
    corpus = PlaintextCorpusReader(book_path, '.*')
    paragraphs = corpus.paras('book%s.txt' % book_id)
    return paragraphs
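# A hedged usage sketch (the book id is illustrative, not from the original code):
# paras() returns each paragraph as a list of sentences, and each sentence as a
# list of words, so a paragraph can be flattened back into a string the same way
# extractParasInList does above.
paragraphs = text_to_paras('1342')
first_para_words = [word for sentence in paragraphs[0] for word in sentence]
print(' '.join(first_para_words))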
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import wordnet as wn

# Grab stopwords.
stopwords = nltk.corpus.stopwords.words('english')
len(stopwords)  #127

# Read the plain text.
corpus_root = 'corpora'
aow = PlaintextCorpusReader(corpus_root, 'Art-of-War.txt')
aow.fileids()     #['Art-of-War.txt']
aow.words()       #['THE', 'ART', 'OF', 'WAR', 'BY', 'SUN', 'TZU', ...]
len(aow.words())  #13038
len(aow.sents())  #943
len(aow.paras())  #399
len([s for s in aow.sents() if 'enemy' in s])    #111
len([s for s in aow.sents() if 'enemies' in s])  #1
len([s for s in aow.sents() if 'ally' in s])     #2
len([s for s in aow.sents() if 'allies' in s])   #3
len([s for s in aow.sents() if 'spy' in s])      #8
len([s for s in aow.sents() if 'spies' in s])    #11

# Extract the list of sentences with /^enem(?:y|ies)$/i words.
enemy_sents = []
for s in aow.sents():
    for w in s:
        if w.lower().startswith('enem'):
            enemy_sents.append(s)  # TODO Skip if seen.
len(enemy_sents)  #126  XXX Can contain duplicates. Fix TODO above.
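# A minimal sketch of the fix flagged by the TODO/XXX above (my addition, not the
# original author's): break out of the inner loop after the first matching word,
# so a sentence containing several 'enem*' words is appended only once.
enemy_sents = []
for s in aow.sents():
    for w in s:
        if w.lower().startswith('enem'):
            enemy_sents.append(s)
            break  # skip the rest of the sentence once it has matched
len(enemy_sents)  # each matching sentence is now counted exactly once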
    else:
        return ''

# Reading corpus
corpus_root = '/Users/elgayaro/nltk_data/corpora/tudiab'
ww = PlaintextCorpusReader(corpus_root, r'(?!README|\.).*\.txt',
                           para_block_reader=read_line_block)

# Displaying sample corpus
print('The files in this corpus are: ', ww.fileids())
print('Number of unique words (before pre-processing) in the corpus = ', len(set(ww.words())))
print('Number of documents/paragraphs/threads in the corpus = ', len(ww.paras()))
print("+++++++++ Sample 1000 un-filtered words +++++++++")
words = set(ww.words())
print(random.sample(words, 1000))
# print(ww.words())
# print(ww.sents()[0])
# print(ww.paras()[10])
# print(ww.raw()[:10])

# Defining stop words
stopwords = stopwords.words('english')
add_stopwords = [
    'http', 'https', '://', 'www', 'com', '8800', '...', '....', 'yep',
    '.).', '](#', '.:).', '++..', 'github', 'etc', 'also', 'org', 'gee', 'let',
'''
def beatles_corpus.sents(self, fileids=None):
    """
    :return: the given file(s) as a list of sentences or utterances,
        each encoded as a list of word strings.
    :rtype: list(list(str))
    """
    if self._sent_tokenizer is None:
        raise ValueError('No sentence tokenizer for this corpus')
    return concat([self.CorpusView(path, self._read_sent_block, encoding=enc)
                   for (path, enc, fileid) in self.abspaths(fileids, True, True)])
'''

k = 0
custom_corpus = nltk.Text(beatles_corpus.words())

for i in beatles_corpus.paras():
    # Iterate through songs, printing out contents
    # print "Song # " + str(k)
    k += 1
    l = 0
    while l < len(i):
        # print "Line " + str(l)
        # print i[l]
        l += 1
    # for j in i:
    #     print j[0]

# new_song = custom_corpus.generate(100)