Example #1
def w_count(text):
    """
    raw word counts for full document
    """
    tokens = tm.tokenize(text, casefold=True)
    output = {token: tokens.count(token) for token in set(tokens)}
    return output
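# A minimal usage sketch, assuming tm is the course's textminer module and
# tm.tokenize(text, casefold=True) returns a flat list of lowercased word tokens:
sample = "The cat sat on the mat and the dog sat too"
counts = w_count(sample)
print counts.get('the', 0)  # expected 3 under those assumptions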
Example #2
import re

def get_para(filepath):
    """Read a plain-text file and return a list of cleaned paragraphs."""
    with open(filepath, 'r') as f:  # assumed setup; the call below passes a filepath
        text = f.read()
    paragraphs = []
    for s in text.split('\n\n'):
        if s:
            paragraph = s  # .lower()
            # paragraph = re.sub(r'\W', ' ', paragraph)
            # paragraph = re.sub(r'\d', ' ', paragraph)
            paragraph = re.sub(r'[^A-Za-z]', ' ', paragraph)
            paragraph = re.sub(r' +', ' ', paragraph)
            paragraphs.append(paragraph.rstrip())
    return paragraphs


paragraphs = get_para(filepath)
print paragraphs[0]

### part-of-speech tagging
from nltk import pos_tag
print pos_tag(tm.tokenize(paragraphs[100]), tagset='universal', lang='eng')

# tokenizer that keeps only a specific part of speech (here: nouns)
para_token = []
for i, paragraph in enumerate(paragraphs):
    print i  # progress counter
    tokens = tm.tokenize(paragraph, length=1, casefold=False)
    tagset = pos_tag(tokens, tagset='universal', lang='eng')
    tokens = [tag[0] for tag in tagset if tag[1] == 'NOUN']
    tokens = [token.lower() for token in tokens]
    para_token.append(tokens)

print para_token[100]
# generate stopword list from text
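# A minimal sketch of that step, assuming tm.gen_ls_stoplist(tokenized_texts, n)
# returns the n most frequent tokens, as it is used in the later examples:
sw = tm.gen_ls_stoplist(para_token, 100)  # 100 is an arbitrary cut-off
para_token_nosw = [[token for token in tokens if token not in sw]
                   for tokens in para_token]
print sw[:20]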
Example #3
dates = []
for sermon in sermons:
    date = date_reg.findall(sermon)
    dates.append(date)

month = []
for date in dates:
    try:
        month.append(date[0][4:6])  # month characters, assuming a YYYYMMDD-style date string
    except IndexError:  # no date was found for this sermon
        month.append('FALSE')

# Tokenize the list of sermons with tm.tokenize; lowercase everything since we are not interested in capitalization.

tokenized_sermons = []
for sermon in sermons:
    tokenized_sermons.append(tm.tokenize(sermon.lower()))
    
#Use pruning to remove unwanted words:
prune = tm.prune_multi(tokenized_sermons, 50, 500)
    
#Alternatively you can use a stopword-list
#Create and apply stopword-list
sw = tm.gen_ls_stoplist(tokenized_sermons, 250)  # 250 = number of most frequent words to treat as stopwords
sermons_nosw = []
for sermon in tokenized_sermons:
    nosw_sermon = [token for token in sermon if token not in sw]  # keep only tokens not in sw
    sermons_nosw.append(nosw_sermon)  # add the filtered sermon to sermons_nosw
    

Example #4
###

#### Section 1.2 #### 

df = pd.read_csv('fake_or_real_news_cleaned_sent.csv', encoding = 'utf-8')
print df.label.value_counts() #balanced dataset (approx 3000 of each)
print df.loc[1]


# MAKING A TOPIC MODEL DATAFRAME
# define a working df - change this when we want to work with all of the texts
tp_df = df
#insert articles into a list
texts_tokenized = []
for text in tp_df['text_clean']:
    tokens = tm1.tokenize(text, length=1, casefold=False)  # casefold=False because pos_tag uses capitalization to help categorize tokens
    tagset = pos_tag(tokens, tagset='universal', lang='eng')  # tag each token with its part of speech
    tokens = [tag[0] for tag in tagset if tag[1] == 'NOUN']  # only retain nouns
    tokens = [token.lower() for token in tokens]  # lowercase the tokens
    texts_tokenized.append(tokens)
print type(texts_tokenized[0][0])  #the word in the text
print type(texts_tokenized[0])  #list of words in text
print type(texts_tokenized) #list of texts
# So it is a string within a list within a list: the outer list holds the texts, each inner list holds the nouns of one text, and each string is a noun.
# making a stopword list
sw = tm1.gen_ls_stoplist(texts_tokenized, 40)
print sw  # this stopword list might say more about the period of the articles than about their topics
# for now, let's not use it

"""
#applying stopword list to all texts#
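# A minimal sketch of how the stopword list could be applied, mirroring the
# loop from Example #3 (sw is the list built above with tm1.gen_ls_stoplist):
texts_nosw = []
for tokens in texts_tokenized:
    texts_nosw.append([token for token in tokens if token not in sw])
print texts_nosw[0][:20]  # first twenty remaining nouns of the first article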
Example #5
# use regex to identify START and END of Gutenberg text
pat1 = r'\*{3} START(.*?)\*{3}'
pat2 = r'\*{3} END(.*?)\*{3}'
start_idx = [(m.start(0), m.end(0)) for m in re.finditer(pat1, text)]
end_idx = [(m.start(0), m.end(0)) for m in re.finditer(pat2, text)]

# print start string of Gutenberg text
print(text[start_idx[0][0]:start_idx[0][1]])
idx1 = start_idx[0][1] + 1  # beginning of content
idx2 = end_idx[0][0]  # end of content

# extract text content and assign to variable
content = text[idx1:idx2]
print(content[:100])

tokens = tm.tokenize(content, lentoken=1)
print(tokens[:100])


def slice_tokens(tokens, n=100, cut_off=True):
    """Split tokens into consecutive slices of n tokens."""
    slices = []
    # slice tokens in steps of n
    for i in range(0, len(tokens), n):
        slices.append(tokens[i:(i + n)])
    # optionally drop a trailing slice that is shorter than n
    if cut_off and slices and len(slices[-1]) < n:
        del slices[-1]
    return slices
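# A short usage sketch, assuming tokens is the list produced from the Gutenberg content above:
slices = slice_tokens(tokens, n=100, cut_off=True)
print(len(slices))     # number of 100-token slices
print(slices[0][:10])  # first ten tokens of the first slice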

Example #6
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
from __future__ import division
import os
import matplotlib.pyplot as plt
# set working directory
os.chdir(os.path.expanduser('~/Documents/tmgu17/scripts'))
import textminer as tm
## get data
text_ls, text_names = tm.read_dir_txt('data/')
text = text_ls[3]
tokens = tm.tokenize(text, casefold=True)
## tag cloud from tokenized text
from wordcloud import WordCloud
# help(WordCloud)# for more information


def tag_cloud(tokens, stop_set=None):
    wc = WordCloud(stopwords=stop_set).generate(' '.join(tokens))
    plt.figure(figsize=(12, 12), dpi=200)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    #plt.savefig('wordcloud.png',bbox_inches='tight')
    plt.show()
    plt.close()


# run
tag_cloud(tokens)
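# A sketch of the same call with a stop set, assuming tm.gen_ls_stoplist accepts a
# list of tokenized texts and returns the most frequent tokens as in the earlier
# examples; WordCloud expects stopwords as a set of strings:
stop_set = set(tm.gen_ls_stoplist([tokens], 100))
tag_cloud(tokens, stop_set=stop_set)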