def w_count(text):
    """ raw word counts for full document """
    tokens = tm.tokenize(text, casefold=True)
    # count each unique token once
    output = dict([(token, tokens.count(token)) for token in set(tokens)])
    return output
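# quick sanity check of w_count on a short string; the exact keys depend on how
# tm.tokenize splits and casefolds, so the expected counts below are an assumption
sample = 'The cat sat on the mat. The cat slept.'
counts = w_count(sample)
print counts.get('the')  # expected 3 with casefold=True
print counts.get('cat')  # expected 2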
def get_para(filepath):
    """ split a raw document into cleaned paragraphs """
    # (reconstructed header) read the document; the original opening of this function is missing
    with open(filepath, 'r') as f:
        text = f.read()
    paragraphs = []
    for s in text.split('\n\n'):
        if s:
            paragraph = s  # .lower()
            # paragraph = re.sub(r'\W', ' ', paragraph)
            # paragraph = re.sub(r'\d', ' ', paragraph)
            paragraph = re.sub(r'[^A-Za-z]', ' ', paragraph)  # keep letters only
            paragraph = re.sub(r' +', ' ', paragraph)         # collapse repeated spaces
            paragraphs.append(paragraph.rstrip())
    return paragraphs

paragraphs = get_para(filepath)
print paragraphs[0]

### parts of speech tagging
print pos_tag(tm.tokenize(paragraphs[100]), tagset='universal', lang='eng')

# monster tokenizer that keeps only a specific part of speech (nouns)
i = 0
para_token = []
for paragraph in paragraphs:
    print i
    tokens = tm.tokenize(paragraph, length=1, casefold=False)  # keep case for the tagger
    tagset = pos_tag(tokens, tagset='universal', lang='eng')   # tag tokens with their category
    tokens = [tag[0] for tag in tagset if tag[1] in ['NOUN']]  # retain nouns only
    tokens = [token.lower() for token in tokens]               # lowercase the nouns
    para_token.append(tokens)
    i += 1
print para_token[100]

# generate stopword list from text
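# the stopword step itself is not shown above; a minimal sketch using the
# tm.gen_ls_stoplist helper from the other scripts (the cut-off of 100 nouns
# is illustrative, not taken from the source)
sw = tm.gen_ls_stoplist(para_token, 100)
para_token_nosw = []
for tokens in para_token:
    para_token_nosw.append([token for token in tokens if token not in sw])
print para_token_nosw[100]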
# extract dates from each sermon
dates = []
for sermon in sermons:
    date = date_reg.findall(sermon)
    dates.append(date)

# extract the month (characters 4-6 of the first match); 'FALSE' when no date was found
month = []
for date in dates:
    try:
        month.append(date[0][4:6])
    except IndexError:
        month.append('FALSE')

# tokenize the list of sermons with tm.tokenize; we lowercase everything since we are
# not interested in all-caps
tokenized_sermons = []
for i in sermons:
    tokenized_sermons.append(tm.tokenize(i.lower()))

# use pruning to remove unwanted words
prune = tm.prune_multi(tokenized_sermons, 50, 500)

# alternatively, create and apply a stopword list
sw = tm.gen_ls_stoplist(tokenized_sermons, 250)  # how many words do we want to delete
sermons_nosw = []
for sermon in tokenized_sermons:                                      # for each sermon:
    nosw_sermon = [token for token in sermon if token not in sw]      # keep only the words not in sw
    sermons_nosw.append(nosw_sermon)                                  # add the filtered list to sermons_nosw
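# if the course's textminer module is not at hand, a frequency-based stoplist similar to
# what gen_ls_stoplist appears to produce can be sketched with collections.Counter
# (this is an assumption about the helper's behaviour, not its actual implementation)
from collections import Counter

def gen_ls_stoplist_sketch(tokenized_texts, n):
    counts = Counter()
    for tokens in tokenized_texts:
        counts.update(tokens)                              # accumulate raw frequencies across all texts
    return [token for token, _ in counts.most_common(n)]   # the n most frequent tokens

# sw = gen_ls_stoplist_sketch(tokenized_sermons, 250)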
#### Section 1.2 ####

df = pd.read_csv('fake_or_real_news_cleaned_sent.csv', encoding='utf-8')
print df.label.value_counts()  # balanced dataset (approx. 3000 of each label)
print df.loc[1]

# MAKING A TOPIC MODEL DATAFRAME
# defining a working df - change this when we want to work with all of the texts
tp_df = df

# insert articles into a list
texts_tokenized = []
for text in tp_df['text_clean']:
    tokens = tm1.tokenize(text, length=1, casefold=False)      # casefold=False because pos_tag uses capitalisation to categorise tokens
    tagset = pos_tag(tokens, tagset='universal', lang='eng')   # tag tokens with their category
    tokens = [tag[0] for tag in tagset if tag[1] in ['NOUN']]  # only retain nouns
    tokens = [token.lower() for token in tokens]               # lowercase the tokens
    texts_tokenized.append(tokens)

print type(texts_tokenized[0][0])  # the word in the text
print type(texts_tokenized[0])     # list of words in a text
print type(texts_tokenized)        # list of texts
# so it is a string within a list within a list: the outer list holds the texts,
# the inner list holds the nouns of one text, and each string is a noun

# making a stopword list
sw = tm1.gen_ls_stoplist(texts_tokenized, 40)
print sw
# this stopword list might say more about the period of the articles than about the topics,
# so for now let's just not use it (the code applying it to all texts is left commented out)
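# which topic-modelling toolkit comes next is not shown in this script; purely as an
# illustration, the noun lists could be turned into a bag-of-words corpus with gensim
# (assumed library; the variable names and the number of topics below are illustrative)
from gensim import corpora
from gensim.models import LdaModel

dictionary = corpora.Dictionary(texts_tokenized)                     # map nouns to integer ids
corpus = [dictionary.doc2bow(tokens) for tokens in texts_tokenized]  # bag-of-words per article
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10)     # 10 topics is an arbitrary choice
print lda.show_topics(num_topics=5)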
# use regex to identify START and END markers of the Project Gutenberg text
pat1 = r'\*{3} STAR(.*?)\*{3}'
pat2 = r'\*{3} END(.*?)\*{3}'
start_idx = [(m.start(0), m.end(0)) for m in re.finditer(pat1, text)]
end_idx = [(m.start(0), m.end(0)) for m in re.finditer(pat2, text)]

# print start string of Gutenberg text
print(text[start_idx[0][0]:start_idx[0][1]])

idx1 = start_idx[0][1] + 1  # beginning of content
idx2 = end_idx[0][0]        # end of content

# extract text content and assign to variable
content = text[idx1:idx2]
print(content[:100])

tokens = tm.tokenize(content, lentoken=1)
print(tokens[:100])


def slice_tokens(tokens, n=100, cut_off=True):
    """ split a token list into consecutive slices of n tokens """
    # result: list of slices
    slices = []
    # slice tokens
    for i in range(0, len(tokens), n):
        slices.append(tokens[i:(i + n)])
    # cut_off: drop the final slice when it is shorter than n, so all slices have equal length
    if cut_off and slices and len(slices[-1]) < n:
        del slices[-1]
    return slices
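# quick check of slice_tokens on the token list above: with cut_off=True the short
# final slice is dropped, so every remaining slice holds exactly n tokens
slices = slice_tokens(tokens, n=100, cut_off=True)
print(len(slices))     # number of complete 100-token slices
print(len(slices[0]))  # 100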
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
from __future__ import division

import os
import matplotlib.pyplot as plt

# set working directory
os.chdir(os.path.expanduser('~/Documents/tmgu17/scripts'))
import textminer as tm

## get data
text_ls, text_names = tm.read_dir_txt('data/')
text = text_ls[3]
tokens = tm.tokenize(text, casefold=True)

## tag cloud from tokenized text
from wordcloud import WordCloud
# help(WordCloud)  # for more information


def tag_cloud(tokens, stop_set=None):
    """ draw a word cloud from a token list, optionally removing stopwords """
    wc = WordCloud(stopwords=stop_set).generate(' '.join(tokens))
    plt.figure(figsize=(12, 12), dpi=200)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    # plt.savefig('wordcloud.png', bbox_inches='tight')
    plt.show()
    plt.close()

# run
tag_cloud(tokens)
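# the stop_set parameter is never exercised above; one option (illustration only) is the
# English stopword set that ships with the wordcloud package
from wordcloud import STOPWORDS
tag_cloud(tokens, stop_set=set(STOPWORDS))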