def load_feat_data(dir_array):
    data_list = []
    for direct in dir_array:
        data = []
        corpus_dir = 'dataset/' + direct
        corpus = PlaintextCorpusReader(corpus_dir, r'.*\.*')
        file_ids = corpus.fileids()
        for file_id in file_ids:
            text = corpus.raw(file_id)
            e = email.message_from_string(text)
            if e.is_multipart():
                # get_payload is a method and must be called; note this loop
                # keeps only the text of the last part
                for payload in e.get_payload():
                    text = payload.get_payload()
            else:
                text = e.get_payload()
            data.append(extract_features(text, corpus, file_id))
        data_list.extend(data)
    return data_list
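# The multipart branch above keeps only the last part and does not recurse into
# nested multiparts. A minimal sketch of a more robust extractor using the
# standard library's Message.walk(); extract_plain_text is a hypothetical
# helper name, not part of the original code:
import email

def extract_plain_text(raw_message):
    """Collect the text of every text/plain part, however deeply nested."""
    msg = email.message_from_string(raw_message)
    parts = []
    for part in msg.walk():  # walk() yields the message and all sub-parts
        if part.get_content_type() == 'text/plain':
            payload = part.get_payload(decode=True)  # undo base64/qp encoding
            if payload is not None:
                parts.append(payload.decode('utf-8', errors='replace'))
    return '\n'.join(parts)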
def read_corpus(corpus_path):
    from nltk.corpus.reader.plaintext import PlaintextCorpusReader
    corpus = PlaintextCorpusReader(corpus_path, r".*\.txt")
    ctext = corpus.raw()
    # with open('corpus.txt', 'w', encoding='utf-8') as cf:
    #     cf.write(ctext)
    return ctext
def load_corpus(race_code=None, gender_code=None):
    # loads corpora into an array based on race and gender
    if race_code is None:  # if none is specified, search all
        race_code = ".."
    if gender_code is None:
        gender_code = ".."
    # uses the filename encoding to load the specified texts
    reader = PlaintextCorpusReader(
        corpus_root, ".*_" + race_code + "_" + gender_code + r"\.txt")
    corpora = []
    for fileid in reader.fileids():
        # creates a ComedyCorpus object, populates it with fileid and name
        new_corpus = ComedyCorpus()
        new_corpus.set_fileid(fileid)
        try:
            new_corpus.set_text(reader.raw(fileid))  # word content by fileid
        except UnicodeDecodeError:
            continue
        # the name is the fileid without the encoding suffix; the original
        # substituted "-" between the codes, which never matches the "_"
        # used in the filename pattern above
        name = re.sub("_" + race_code + "_" + gender_code + r"\.txt", "", fileid)
        name = name.replace("%20", " ")
        name = name.replace("_", "; ")
        print(name)
        new_corpus.set_name(name)
        corpora.append(new_corpus)
    return corpora
def load_data(dir_label):
    data_list = []
    labels = []
    for dl in dir_label:
        data = []
        directory = dl[0]
        label = dl[1]
        corpus_dir = 'dataset/' + directory
        corpus = PlaintextCorpusReader(corpus_dir, r'.*\.*')
        file_ids = corpus.fileids()
        for file_id in file_ids:
            d = []
            text = corpus.raw(file_id)
            e = email.message_from_string(text)
            if e.is_multipart():
                # as above, get_payload must be called as a method
                for payload in e.get_payload():
                    text = payload.get_payload()
            else:
                text = e.get_payload()
            feats = [
                cf.charac_feats_extractor(text),
                wf.word_feats_extractor(text),
                syf.syntac_feats_extractor(text),
                stf.struct_feats_extractor(corpus, file_id, text),
                fwf.funct_word_feats_extractor(text)
            ]
            for f in feats:
                d.extend(list(f.values()))
            data.append(d)
            labels.append(label)
        data_list.extend(data)
    return [data_list, labels]
def pdf_to_corpus():
    path = 'D://Eclipse Workspace//NLP//Assignment//res//'
    for filename in glob.glob(os.path.join(path, '*.pdf')):
        print(filename)
        with open(filename, 'rb') as pdfFileObj:
            # creating a pdf reader object (legacy PyPDF2 1.x API)
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            # printing the number of pages in the pdf file
            print(pdfReader.numPages)
            # extracting text from the first page only
            pageObj = pdfReader.getPage(0)
            text = pageObj.extractText()
        strings_list = text.split("\n")
        # Make a new dir for the corpus.
        corpusdir = 'customcorpus/'
        if not os.path.isdir(corpusdir):
            os.mkdir(corpusdir)
        # Output the files into the directory.
        file_name = filename.split("\\")[-1]
        print(file_name)
        pbar = ProgressBar(widgets=[
            'Creating Corpus', Bar('#', '[', ']'), ' ', Percentage(), ' ', ETA()
        ], maxval=len(strings_list))
        for text in pbar(strings_list):
            with open(corpusdir + '[PDF] ' + file_name + '.txt', 'ab') as fout:
                fout.write(text.encode('utf-8'))
        pbar.finish()
        # create_corpus(text)
    corpus = PlaintextCorpusReader('customcorpus/', '.*')
    print(corpus.raw())
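# The PdfFileReader / numPages / getPage / extractText names above are the
# legacy PyPDF2 1.x API and were removed in PyPDF2 3.0. A minimal sketch of
# the same page-0 extraction against the modern API, assuming PyPDF2 >= 3:
from PyPDF2 import PdfReader

def first_page_text(pdf_path):
    reader = PdfReader(pdf_path)
    print(len(reader.pages))               # number of pages
    return reader.pages[0].extract_text()  # text of the first page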
def token_assamese():
    # Modify these to change the location and the name of the corpus file
    corpus_path = "/Users/partha/All/Python/ProjectMaterials/Learned material/Arts"
    corpus_filename = 'Psychology.txt'
    newcorpus = PlaintextCorpusReader(corpus_path, corpus_filename,
                                      encoding='utf16')
    # swap the Assamese danda for '.' so word_tokenize sees sentence breaks
    text = newcorpus.raw().strip().replace('।', '.')
    words = nltk.word_tokenize(text)
    # restore the danda after tokenization
    for index, item in enumerate(words):
        if str(item) == '.':
            words[index] = '।'
    output_file_path = "C:/Users/HEMANT/Documents/1.Project/"
    output_filename = 'Result.txt'
    with open(output_file_path + output_filename, 'w', encoding='utf8') as f:
        for i in words:
            f.write(str(i) + '\n')
def read_article(file_path):
    ## INSERT FILE NAME IN FUNCTION CALL BELOW ##
    bcr = PlaintextCorpusReader(file_path, 'bernie.txt')
    filedata = bcr.raw()
    # normalize abbreviations and line breaks, then split into sentences
    article = filedata.replace("\n\n", '. ').replace('Mr.', 'Mr') \
                      .replace("\r", ' ').replace('\n', ' ').split('. ')
    articlez = []
    for line in article:
        if line == '':
            continue
        if line[0] == '\n':
            line = line[1:]
        articlez.append(line)
    sentences = []
    for sentence in articlez:
        # the original used str.replace here, which treats the pattern as a
        # literal string; re.sub applies the intended character-class regex
        sentences.append(re.sub("[^a-zA-Z]", " ", sentence).split(" "))
    sentences.pop()  # drop the last (usually empty) fragment
    return sentences
# The excerpt begins mid-function; the enclosing def is reconstructed here
# with a hypothetical name so the indentation parses.
def count_used_vocab(used_vocab):
    refined_used_vocab = []
    n = 0
    for word in used_vocab:
        if word not in refined_used_vocab:
            refined_used_vocab.append(word)
            n = n + 1
    print ' Used Vocab words: '
    print '\t\t\t', refined_used_vocab
    print ' No. of vocab words used: ', n, '\n'
    return n

datadir = 'just for test/moreErrors/'
train_data = PlaintextCorpusReader(datadir, '.*')
all_contents = train_data.raw().strip()
all_text = preprocess(all_contents)
bag_of_words = [word for word, word_count in Counter(all_text).most_common(20)]
print('\n Bag of Words:')
print bag_of_words
print('\n')

path1 = '/home/sudo-this/PycharmProjects/Automated Essay Marking/top_scored_essay/*.txt'
files = glob.glob(path1)
reference_collection = []
for filename in sorted(files):
    with open(filename, 'r') as f:
        f_contents = f.read()
    preprocessed_text = preprocess(f_contents)
    reference_text = preprocessed_text[:-1]
    reference_collection.append(reference_text)
# The excerpt begins mid-function; preprocess() is the function referenced
# below, so its reconstructed signature is used here.
def preprocess(t):
    rem_chars = "[!\"#$%&()*+,:;<=>?@[\\]^_`{|}~0123456789]"  # remove these
    rep_chars = "[-./\']"  # replace these with spaces
    t_temp = re.sub(rem_chars, "", t.lower())
    t_temp = re.sub(rep_chars, " ", t_temp)
    t_strip_lower_filt = [w for w in t_temp.split()
                          if w not in stopwords.words('english')]
    return " ".join(t_strip_lower_filt)

# load the data
corpusdir = 'corpus_txt/'  # Directory of corpus.
mycorp_raw = PlaintextCorpusReader(corpusdir, '.*')
file_index = mycorp_raw.fileids()

# preprocess the text (slow)
# uncomment one of the following lines for serial vs parallel processing
#mycorp_proc = nltk.Text([preprocess(mycorp_raw.raw(f)) for f in file_index])
mycorp_proc = Parallel(n_jobs=3, verbose=True)(
    delayed(preprocess)(mycorp_raw.raw(f)) for f in file_index)

# get ngrams (1-3)
vectorizer_ngrams = CountVectorizer(min_df=0.05, ngram_range=(1, 3))
mat_ngrams = vectorizer_ngrams.fit_transform(mycorp_proc)
n_df = pd.DataFrame(data=mat_ngrams.A,
                    columns=vectorizer_ngrams.get_feature_names())
n_df['pt_id'] = [i[:-4] for i in file_index]

# write results to file
n_df.to_csv('ngrams_dtm.csv', index=False)

# note: the following analysis is redundant in this small case, since all words
# will be captured. In a very large dataset we might limit the capture of
# n-grams to those that occur in, say, at least 5% of the documents, thus
# potentially excluding other terms of interest such as those below
import os
import nltk
from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.classify import PositiveNaiveBayesClassifier
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

def features(sentence):
    words = sentence.lower().split()
    return dict(('contains(%s)' % w, True) for w in words)

corpusdir = './text'
newcorpus = PlaintextCorpusReader(corpusdir, '.*')
# raw() returns a single string, so mapping features over it (as the original
# did) would produce one useless featureset per character; build one
# featureset per sentence instead
positive_featuresets = [features(' '.join(sent))
                        for sent in newcorpus.sents('comp.txt')]
unlabeled_featuresets = [features(' '.join(sent))
                         for sent in newcorpus.sents('animal.txt')]
classifier = PositiveNaiveBayesClassifier.train(positive_featuresets,
                                                unlabeled_featuresets, .3)
print classifier.classify(features('.'))
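# A self-contained sanity check of the same pattern, with inline sentences in
# place of corpus files (the sentences here are made up for illustration):
sports = ['The team dominated the game', 'They lost the ball',
          'The game was intense', 'The goalkeeper made a great save']
mixed = ['The President did not comment', 'I lost the keys',
         'The team won the cup', 'Sara has two kids', 'The ball went off']
pos_feats = [features(s) for s in sports]
unlab_feats = [features(s) for s in mixed]
clf = PositiveNaiveBayesClassifier.train(pos_feats, unlab_feats)
print clf.classify(features('The cat is on the table'))  # likely False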
def try_out_some_functionalities():
    corpusdir = "/media/benzro/OS/Users/benzro/Desktop/Studium Uni/2)" \
                "ZweitesSemester/27)PCL-2/Uebungen/Uebung03/Enron/test/"
    newcorpus = PCR(corpusdir, '.*')
    sep = "~" * 58
    print sep
    print "access one file in the corpus"
    print sep
    # note: open() takes a fileid relative to the corpus root, so the
    # original's absolute path built from corpusdir would not work here
    infile = "0004.1999-12-14.farmer.ham.txt"
    fin = newcorpus.open(infile)
    print fin.read().strip()
    print sep
    print "all file ids"
    print sep
    print newcorpus.fileids()
    print sep
    print "access each file in the corpus"
    print sep
    # (reduced output: [0:2])
    for infile in sorted(newcorpus.fileids()):  # the fileid of each file
        print "~" * 54
        print infile
        fin = newcorpus.open(infile)  # opens the file
        print fin.read().strip()      # prints the content of the file
    print sep
    print "access the plaintext; outputs pure string of all files"
    print sep
    print newcorpus.raw().strip()
    print sep
    print "access paragraphs in the corpus (list of list of list of strings)"
    print sep
    # NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
    # nltk.tokenize.word_tokenize:
    # each element in the outermost list is a paragraph,
    # each paragraph contains sentence(s), and
    # each sentence contains token(s)
    print newcorpus.paras()
    print sep
    print "access paragraphs of a specific fileid"
    print sep
    print newcorpus.paras(newcorpus.fileids()[0])
    print sep
    print "access sentences in the corpus (list of list of strings)"
    print sep
    # NOTE: the texts are flattened into sentences that contain tokens
    print newcorpus.sents()
    print sep
    print "access sentences of a specific fileid"
    print sep
    print newcorpus.sents(newcorpus.fileids()[0])
    print sep
    print "access just tokens/words in the corpus (list of strings)"
    print sep
    print newcorpus.words()
    print sep
    print "access tokens of a specific fileid"
    print sep
    print newcorpus.words(newcorpus.fileids()[0])
    print sep
# Create a new corpus by specifying the parameters:
# (1) the directory of the new corpus
# (2) the fileids of the corpus
# NOTE: in this case the fileids are simply the filenames.
newcorpus = PlaintextCorpusReader('nltkCorpusAll/', '.*')

# Access each file in the corpus.
for infile in sorted(newcorpus.fileids()):
    print infile                  # the fileid of each file
    fin = newcorpus.open(infile)  # opens the file
    print fin.read().strip()      # prints the content of the file
print

# Access the plaintext; outputs a pure string/basestring.
print newcorpus.raw().strip()
print

# Access paragraphs in the corpus. (list of list of list of strings)
# NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
# nltk.tokenize.word_tokenize.
#
# Each element in the outermost list is a paragraph, and
# each paragraph contains sentence(s), and
# each sentence contains token(s).
print newcorpus.paras()
print

# To access paragraphs of a specific fileid.
print newcorpus.paras(newcorpus.fileids()[0])
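# The snippets above assume the corpus directory already holds text files. A
# minimal sketch of bootstrapping such a directory from plain strings, so the
# reader has something to read (directory name and contents are illustrative):
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

corpusdir = 'newcorpus/'
if not os.path.isdir(corpusdir):
    os.mkdir(corpusdir)
for name, text in [('doc1.txt', 'Hello world. This is the first file.'),
                   ('doc2.txt', 'A second file. It has two sentences too.')]:
    with open(os.path.join(corpusdir, name), 'w') as fout:
        fout.write(text)
demo = PlaintextCorpusReader(corpusdir, r'.*\.txt')
print(demo.sents())  # [['Hello', 'world', '.'], ...]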
import configparser
from collections import Counter
# assumed aliases for the names used below (the original excerpt omits them)
from nltk.corpus.reader.plaintext import PlaintextCorpusReader as CorpusReader
from nltk.tokenize import RegexpTokenizer as Tokenizer

config = configparser.ConfigParser()
config.read("text_analysis.cfg")

# Read out the configuration parameters:
# the filename of the text file to analyze,
input_file = config["DEFAULT"]["input_file"]
# the nlp model used,
nlp_model = config["DEFAULT"]["nlp_model"]
# and the output file name.
output_file = config["DEFAULT"]["output_file"]

# Load the nlp model
nlp = load_nlp(nlp_model)

# This section generates a corpus (for nltk) and a string-text (for spacy)
corpus = CorpusReader(".", input_file)
my_text = corpus.raw()

# This section deals with the nltk side of the analysis
paragraphs = corpus.paras()
sentences = corpus.sents()
words = corpus.words()
tokenizer = Tokenizer(r'\w+')
word_count = 0
counts = Counter()
for sentence in sentences:
    tokens = tokenizer.tokenize(" ".join(sentence))
    word_count = word_count + len(tokens)
    filtered = [w for w in sentence if w.isalnum()]
    counts = counts + Counter(filtered)
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from decimal import Decimal
from math import pi

if __name__ == '__main__':
    # raw string avoids backslash-escape problems in the Windows path
    ptcr = PlaintextCorpusReader(r'C:\Users\Jakub\Downloads\pr4\Trzeci plik',
                                 ['znormalizowane.txt', 'katy.txt'])
    data = []
    t = ptcr.raw(fileids=ptcr.fileids()[1]).replace(',', '.') \
            .replace('\r', '').split('\n')
    t.remove('')
    for x in t:
        data.append(float(Decimal(x) * 360 / 315))
    print data
    data_ = []
    t = ptcr.raw(fileids=ptcr.fileids()[0]).replace(',', '.') \
            .replace('\r', '').split('\n')
    t.remove('')
    for x in t:
        data_.append(float(x) / 100)
    print data_
import os
# import the nltk module
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

corpusdir = 'newcorpus2/'
if not os.path.isdir(corpusdir):
    os.mkdir(corpusdir)

# Read the content of the file placed inside the directory newcorpus
newcorpus = PlaintextCorpusReader('newcorpus/', '.*')
print("This is the text file inside the newcorpus directory:")
print(newcorpus.raw())

# Read the content of the file placed inside the directory newcorpus2
newcorpus2 = PlaintextCorpusReader('newcorpus2/', '.*')
print("This is the text file inside the newcorpus2 directory:")
print(newcorpus2.raw())

file_1_count = newcorpus.words()
print()
print("Each word of the file inside the directory newcorpus:")
print(file_1_count)

# count the frequency distribution of each word in the text file
fre_count_file_1 = nltk.FreqDist(file_1_count)
print()
print("The frequency distribution of each word:")
print(fre_count_file_1)

most_common_word = fre_count_file_1.most_common(2)
print()
print("The two most commonly used words in the file:")
print(most_common_word)
import nltk
import re
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk import FreqDist

corpus_root = '/home/aman/entire-src/py/dir'
speeches = PlaintextCorpusReader(corpus_root, r'.*\.txt')
print "Finished importing corpus"

raw = speeches.raw().lower()
tokens = nltk.word_tokenize(raw)
# count every trigram in the corpus
tgs = nltk.trigrams(tokens)
fdist = nltk.FreqDist(tgs)
for k, v in fdist.items():
    print k, v
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk import word_tokenize
import re

corpusdir = 'python/'  # Directory of corpus.
newcorpus = PlaintextCorpusReader(corpusdir, '.*')
print newcorpus.fileids()[0]
print type(newcorpus)
#print newcorpus.raw()
print newcorpus.words(newcorpus.fileids()[0])
print len(newcorpus.words())

tokens = word_tokenize(newcorpus.raw())
print len(tokens)
print tokens[:50]
print newcorpus.sents()
print

def removeComments(string):
    # remove all occurrences of streamed comments (/* COMMENT */)
    string = re.sub(re.compile(r"/\*.*?\*/", re.DOTALL), "", string)
    # remove all occurrences of single-line comments (// COMMENT\n)
    string = re.sub(re.compile(r"//.*?\n"), "", string)
    return string

# removeComments expects a string, so pass the raw text; the original passed
# newcorpus.words(newcorpus.raw()), which is not a valid call
print removeComments(newcorpus.raw())
import os
# import the nltk module
import nltk
# import this module for drawing graphs
import matplotlib
# reader from NLTK to access our own text files and treat them as regular corpora
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

# this is the directory in which we store our text file;
# it is created in the folder you are working in
corpusdir = 'newcorpus/'
if not os.path.isdir(corpusdir):
    os.mkdir(corpusdir)

# access the file inside the directory
newcorpus = PlaintextCorpusReader('newcorpus/', '.*')

# Now let us perform some operations using natural language processing.
# Display the content of the file inside the newcorpus directory
print(newcorpus.raw().strip())

# display the number of words in the file inside the directory
a = len(newcorpus.words())
print("Number of words inside the file:", a)
# display the number of sentences in the file inside the directory
b = len(newcorpus.sents())
print("Number of sentences inside the file:", b)
# calculate the average words per sentence
aws = a / b
print("Average words per sentence:", aws)

words_displayed = newcorpus.words()
# frequency distribution of each word in the text file
fre_dis = nltk.FreqDist(words_displayed)
# plot each word and its frequency using the plot function
fre_dis.plot(title="Frequency Distribution")
import os
import datetime
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.corpus import floresta, mac_morpho
from parser_portuguese_risk import evaluateModel, splitTrainTestModel, simplify_tag

time1 = datetime.datetime.now()

###############################################################################
### ATTENTION: if there are tmp files like .DS_STORE (Mac OS X) in the corpus
### directory, they must be removed first.

# Reading the corpus
corpusdir = '/Users/marceloschiessl/RDF_text_project/corpus/WikiRisk/test/glossAnnotated/'  # Directory of corpus.
#corpusdir = '/Users/marceloschiessl/RDF_text_project/corpus/WikiRisk/test/test1/'  # Directory of corpus.
risco = PlaintextCorpusReader(corpusdir, '.*')
risco.fileids()

raw_text = risco.raw('gloss533.txt')
#print raw_text[0:]

# Some statistics
print 'Number of terms: ', len(risco.words())
print 'Number of unique terms: ', len(set(risco.words()))

fd = nltk.FreqDist(risco.words())
print fd.freq('bem')
print fd['bem']

# presenting ngrams of the target term
target_word = 'bem como'
fd = nltk.FreqDist(ng for ng in nltk.ngrams(risco.words(), 6))
import os
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer

## create the corpus of 1965 songs from html files
corpusdir = '../../data/billboard_data/1960/billboard_1965/'
bb_1965 = PlaintextCorpusReader(corpusdir, '.*')

## get the raw text from specific songs/files
## ('help' would shadow the built-in, hence the renamed variable)
help_song = bb_1965.raw('help.html')
desolation_row = bb_1965.raw('desolation_row.html')

## clean the raw text to remove the html tags
## (nltk.clean_html was removed in NLTK 3; see the BeautifulSoup sketch below)
clean_help = nltk.clean_html(help_song)
clean_desolation = nltk.clean_html(desolation_row)

# word tokenize
tokens_help = nltk.word_tokenize(clean_help)
tokens_desolation = nltk.word_tokenize(clean_desolation)

# part-of-speech tagging
tags_help = nltk.pos_tag(tokens_help)
tags_desolation = nltk.pos_tag(tokens_desolation)

tokenizer = RegexpTokenizer(r'\w+')

## print the unique, sorted pos tags
for item in sorted(set(tags_help)):
    print 'help tags: ', item
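# nltk.clean_html() raises NotImplementedError in NLTK 3.x, which recommends
# BeautifulSoup instead. A minimal sketch of an equivalent tag stripper,
# assuming the bs4 package is installed:
from bs4 import BeautifulSoup

def clean_html(html):
    """Strip markup and return the visible text of an HTML document."""
    return BeautifulSoup(html, 'html.parser').get_text()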
#     text_list.append(text)
# preprocessed_docs = []
# for n, t in enumerate(text_list):
#     # print a sample of text before and after processing
#     #if n == (len(text_list) - 1):
#     #    print(("Doc {} (before preproc): {}").format(n, t))
#     #    print(("Doc {}: {}").format(n, p))
#     p = preprocess(t)
#     preprocessed_docs.append(p)
# print("Preprocessed docs len:", len(text_list))

texts = PlaintextCorpusReader(d, r".*\.txt")
boc_texts = [extract(texts.raw(fileid)) for fileid in texts.fileids()]
dictionary = gensim.corpora.Dictionary(boc_texts)
#dictionary = gensim.corpora.Dictionary(preprocessed_docs)
#dictionary.filter_extremes(no_below=10, no_above=.5, keep_n=100000)
#bow_corpus = [dictionary.doc2bow(doc) for doc in preprocessed_docs]
# the comprehension must bind and use the same variable name; the original
# called doc2bow(doc) while iterating over boc_text
bow_corpus = [dictionary.doc2bow(boc_text) for boc_text in boc_texts]
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
fileids = texts.fileids()
for idx, doc in enumerate(corpus_tfidf):
    new_file.write("Document '{}' key phrases:\n".format(fileids[idx]))
    # Get the top 100 terms by TF-IDF score
    for wid, score in heapq.nlargest(100, doc, key=itemgetter(1)):
# make a new corpus
corpusdir = 'communications/small_test_batch'  # where the files are
newcorpus = PlaintextCorpusReader(corpusdir, '.*')
fileids = newcorpus.fileids()  # list of fileids
j = len(fileids)               # number of docs
words_list = []   # ['doc', '1', 'words', 'doc', '2', 'words', ...]
doc_breaks = [0]  # ith entry = index of first word in doc i in words_list
keywords = set()  # {'doc', '1', 'words', '2', ...}
# pick out alphanumeric sequences; discard punctuation and whitespace
tokenizer = RegexpTokenizer(r'\w+')

# create the set of keywords and the list of file texts
for fid in fileids:
    raw = newcorpus.raw(fid)
    raw2 = ''.join([i if ord(i) < 128 else '' for i in raw])  # remove unicode characters
    raw3 = raw2.encode('ascii')
    # list of cleaned words: lower-case, no punctuation, no whitespace
    file_words = map(str.lower, tokenizer.tokenize(raw3))
    words_list = words_list + file_words
    doc_breaks = doc_breaks + [len(file_words) + doc_breaks[-1]]
doc_breaks = doc_breaks + [len(words_list)]
keywords = set(words_list)

print 'Number of keywords: ' + str(len(keywords))
print 'Number of total words: ' + str(len(words_list))

# reduced set of keywords; try to remove too-common words
# to save matrix computation later
red_keywords = set()
cutoff = 3 * j
sorted_words_list = sorted(words_list)
from nltk import download
from nltk.corpus import stopwords
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import word_tokenize
from gensim.models import LsiModel
from gensim.similarities import MatrixSimilarity

download('punkt')
download('stopwords')

corpusdir = './txts'  # Directory of corpus.
all_files = PlaintextCorpusReader(corpusdir, '.*')
fileids = all_files.fileids()
print fileids
print len(fileids)

texts = []
fileindex = []
i = 0
for fileid in fileids:
    texts.append(all_files.raw(fileids=fileid))
    fileindex.append(fileid)
    i += 1

stop_words = stopwords.words('english')

def preprocess(text):
    text = text.lower()
    doc = word_tokenize(text)
    doc = [word for word in doc if word not in stop_words]
    doc = [word for word in doc if word.isalpha()]
    return doc

texts_og = texts
corpus = [preprocess(text) for text in texts]
number_of_docs = len(corpus)
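# The LsiModel / MatrixSimilarity imports above are never used in the excerpt.
# A minimal sketch of the typical next steps they suggest (num_topics and the
# query string are illustrative choices, not from the original):
from gensim import corpora

dictionary = corpora.Dictionary(corpus)
bow = [dictionary.doc2bow(doc) for doc in corpus]
lsi = LsiModel(bow, id2word=dictionary, num_topics=10)
index = MatrixSimilarity(lsi[bow])
query_bow = dictionary.doc2bow(preprocess('some query text'))
sims = sorted(enumerate(index[lsi[query_bow]]), key=lambda x: -x[1])
print sims[:5]  # the five most similar documents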
import os
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer

## create the corpus of 1965 songs from html files
corpusdir = 'text/'
bb_1965 = PlaintextCorpusReader(corpusdir, '.*')

## get the raw text from specific songs/files
beatles_help = bb_1965.raw('help.html')
desolation_row = bb_1965.raw('desolation_row.html')

## clean the raw text to remove the html tags
## (as above, nltk.clean_html was removed in NLTK 3)
clean_help = nltk.clean_html(beatles_help)
clean_desolation = nltk.clean_html(desolation_row)

# word tokenize
tokens_help = nltk.word_tokenize(clean_help)
tokens_desolation = nltk.word_tokenize(clean_desolation)

# part-of-speech tagging
tags_help = nltk.pos_tag(tokens_help)
tags_desolation = nltk.pos_tag(tokens_desolation)

tokenizer = RegexpTokenizer(r'\w+')

help_tags = [item for item in sorted(set(tags_help))]
desolation_tags = [item for item in sorted(set(tags_desolation))]
print 'help_tags: ', help_tags
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

# Loading the file you want to train on
corpusdir = r'E:\MTech'  # Directory of corpus.
newcorpus = PlaintextCorpusReader(corpusdir, '.*')
print(newcorpus.fileids())

dictlist = []
# Converting from word/tag pairs to a list of 2-tuples, i.e. [(word1, tag), (word2, tag)]
for i in newcorpus.fileids():
    tagged_sent = newcorpus.raw(i)
    tagged = tagged_sent.split()
    for t in tagged:
        temp1 = nltk.tag.str2tuple(t)
        dictlist.append(temp1)
print(dictlist)
print("This is the number of distinct (word, tag) pairs")
print(len(set(dictlist)))

fdist = nltk.FreqDist(dictlist)
print("fdist items")
print(fdist.items())
print(fdist.max())

rawtext = '''
'''
# Create a new corpus by specifying the parameters:
# (1) the directory of the new corpus
# (2) the fileids of the corpus
# NOTE: in this case the fileids are simply the filenames.
newcorpus = PlaintextCorpusReader('newcorpus/', '.*')

# Access each file in the corpus.
for infile in sorted(newcorpus.fileids()):
    print infile                         # the fileid of each file
    with newcorpus.open(infile) as fin:  # opens the file
        print fin.read().strip()         # prints the content of the file
print

# Access the plaintext; outputs a pure string/basestring.
print newcorpus.raw().strip()
print

# Access paragraphs in the corpus. (list of list of list of strings)
# NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
# nltk.tokenize.word_tokenize.
#
# Each element in the outermost list is a paragraph, and
# each paragraph contains sentence(s), and
# each sentence contains token(s).
print newcorpus.paras()
print

# To access paragraphs of a specific fileid.
print newcorpus.paras(newcorpus.fileids()[0])
# (excerpt: the imports and corpus_directory are defined earlier in the original file)
textsfile = PlaintextCorpusReader(corpus_directory, '.*')
ID_files = textsfile.fileids()
print(ID_files, len(ID_files))

############################## Preprocessing Data ######################################
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

Index_of_files = []
texts = []
count = 0
# read each file by its file_id
for fileid in ID_files:
    texts.append(textsfile.raw(fileids=fileid))
    Index_of_files.append(fileid)
    count += 1

def get_lemma(word):
    # reuse the lemmatizer built above instead of constructing a new
    # WordNetLemmatizer on every call, as the original did
    lemma = wn.morphy(word)
    if lemma is None:
        return lemmatizer.lemmatize(word)
    else:
        return lemmatizer.lemmatize(lemma)

def clean_preprocessing(text):
    text = text.lower()
    doc = word_tokenize(text)
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from collections import Counter
from nltk.stem import WordNetLemmatizer

corpusdir = 'train_data/'
newcorpus = PlaintextCorpusReader(corpusdir, '.*')
#print newcorpus.raw().strip()
all_contents = newcorpus.raw().strip()
#print(all_contents)

tokenizer = RegexpTokenizer(r'\w+')
words = tokenizer.tokenize(all_contents)

lowered_words = []
for w in words:
    lowered_words.append(w.lower())

# note: this rebinds the name, shadowing the imported stopwords module
stopwords = set(stopwords.words("english"))
filtered_contents = [w for w in lowered_words if w not in stopwords]

lemmatized_contents = []
lemmatizer = WordNetLemmatizer()
for w in filtered_contents:
    lemmatized_contents.append(lemmatizer.lemmatize(w))

#most_common_words = [word for word, word_count in Counter(lemmatized_contents).most_common(20)]
#bag_of_words = most_common_words
import nltk
from nltk.corpus import stopwords
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.stem import WordNetLemmatizer
from wordcloud import STOPWORDS

_stop_words = set(STOPWORDS)
stop_words = set(stopwords.words('english'))
stop_words.update(_stop_words, ('thing', 'u', 'us', 'nt'))
lemmatizer = WordNetLemmatizer()

# Read .txt files from the ./docs directory into a corpus
corpus = PlaintextCorpusReader('./docs/', r".*\.txt")

# filter the word list to remove unneeded tokens and punctuation; this loses
# "U.S.", which is not ideal - splitting sentences on spaces and preserving
# dots just for it was tried as a workaround
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
tokenized = tokenizer.tokenize(corpus.raw())

# drop punctuation
non_punct = list(
    filter(lambda token: nltk.tokenize.punkt.PunktToken(token).is_non_punct,
           tokenized))
# lowercase everything
lowercased = [word.lower() for word in non_punct]
# filter stop words
filtered = list(filter(lambda token: token not in stop_words, lowercased))
# lemmatize to get the root of each word
token_list = [lemmatizer.lemmatize(word) for word in filtered]
class TextAnalizer:
    def __init__(self, my_input_file):
        self.config = configparser.ConfigParser()
        self.config.read("text_analysis.cfg")
        self.input_file = my_input_file
        self.nlp_model = self.config["DEFAULT"]["nlp_model"]
        # the output file name
        self.output_file = self.config["DEFAULT"]["output_file"]
        self.nlp = load_nlp(self.nlp_model)
        self.corpus = CorpusReader(".", self.input_file)
        self.raw_text = self.corpus.raw()
        self.nlp_text = self.nlp(self.raw_text)
        # Here, let's put together the infos for text analysis with spacy.
        self.analysis_dictionary = Counter()
        self.word_count = 0
        self.get_word_count_nltk()

    def get_paragraph(self):
        return self.corpus.paras()

    def get_sentence(self):
        return self.corpus.sents()

    def get_word(self):
        return self.corpus.words()

    def get_word_count_nltk(self):
        tokenizer = Tokenizer(r'\w+')
        counts = Counter()
        sentences = self.get_sentence()
        for sentence in sentences:
            tokens = tokenizer.tokenize(" ".join(sentence))
            self.word_count = self.word_count + len(tokens)
            filtered = [w for w in sentence if w.isalnum()]
            counts = counts + Counter(filtered)
        return counts, self.word_count

    def analize_nlp(self):
        analized_data_str = self.config["ANALIZED"]["POS"]
        analized_data = analized_data_str.split(",")
        result_dict = {}
        diff_str, tot_str = (
            self.config["DEFAULT"]["diff_tot_string"]).split(",")
        lemma_counter = Counter()
        pos_counter = Counter()
        tag_counter = Counter()
        for token in self.nlp_text:
            lemma_counter = lemma_counter + Counter([token.lemma_])
            pos_counter = pos_counter + Counter([token.pos_])
            tag_counter = tag_counter + Counter([token.tag_])
            my_key = token.lemma_ + "_" + token.tag_ + "_" + token.pos_
            self.analysis_dictionary[my_key] += 1
        for pos in analized_data:
            instance_counter = 0
            total_counter = 0
            for key in self.analysis_dictionary.keys():
                try:
                    my_lemma, my_tag, my_pos = key.split("_")
                except ValueError:
                    print("Warning: array has an empty line")  # add logging
                if pos == my_pos:
                    instance_counter += 1
                    total_counter = total_counter + self.analysis_dictionary.get(key)
            result_dict[pos + diff_str] = instance_counter
            result_dict[pos + tot_str] = total_counter
        # add the stuff from nltk
        diff_word, word_count = self.get_word_count_nltk()
        result_dict["WORDS" + tot_str] = word_count
        result_dict["WORDS" + diff_str] = len(diff_word)
        result_dict["PARAGRAPHS"] = len(self.get_paragraph())
        result_dict["SENTENCES"] = len(self.get_sentence())
        return result_dict

    def write_output(self):
        # get_word_count_nltk() returns (counts, word_count); taking len() of
        # the tuple itself, as the original did, would always yield 2
        diff_word, _ = self.get_word_count_nltk()
        with open(self.output_file, "w+") as f:
            f.write("Number of paragraphs: " +
                    str(len(self.get_paragraph())) + "\n")
            f.write("Number of sentences: " +
                    str(len(self.get_sentence())) + "\n")
            f.write("Number of words: " + str(self.word_count) + "\n")
            f.write("Average words per sentence: " +
                    str(round(self.word_count / len(self.get_sentence()), 2)) + "\n")
            f.write("Number of different words: " + str(len(diff_word)) + "\n")
            f.write("Text variety (different words/total words): " +
                    str(round(len(diff_word) / self.word_count, 2)) + "\n")
import string
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
from scipy.cluster.hierarchy import ward, dendrogram
from sklearn.manifold import MDS
import matplotlib.pyplot as plt
import scipy.stats as stats

names = []
corpus = []
co = PlaintextCorpusReader("./election", r".*\.txt")
for fileids in co.fileids():
    names.append(fileids)
    corpus.append(co.raw(fileids))

print len(names), 'documents in the corpus'
print names[:30]

for idx in range(len(corpus) - 1, -1, -1):
    print
    print names[idx]
    print corpus[idx][:70].replace('\n', ' ')

vectorizer = TfidfVectorizer(stop_words='english', min_df=2)
dtm = vectorizer.fit_transform(corpus)
print dtm.shape

vocab = vectorizer.get_feature_names()  # list of unique vocab, we will use this later
print len(vocab), '# of unique words'
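# The ward/dendrogram and cosine_similarity imports above suggest hierarchical
# clustering of the documents. A minimal sketch of that step on the dtm built
# above (the condensed-distance conversion is an assumption of this sketch):
from scipy.spatial.distance import squareform

dist = 1 - cosine_similarity(dtm)                       # pairwise cosine distances
linkage_matrix = ward(squareform(dist, checks=False))   # condensed form for ward()
dendrogram(linkage_matrix, labels=names)
plt.tight_layout()
plt.show()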