return (ratio >= threshold)

def generate_bow(sentence):
    tokenset = [token.lower().strip(string.punctuation) for token in nltk.word_tokenize(sentence)\
                if token.lower().strip(string.punctuation) not in stopwords]
    return tokenset

tokensets = []
#Generate all BoW's
for i in unprocessed_sentences:
    tokensets.append(generate_bow(i))

print "Finished creating bow representations."
# Compare all sentences s for similarity with the sentences on index > s

keep_items = []
for i in range(len(unprocessed_sentences)):
    unique_item = True
    for item in range(i+1, len(unprocessed_sentences)):
        if is_ci_token_stopword_set_match(tokensets[i], tokensets[item]):
            unique_item = False
    if unique_item:
        keep_items.append(unprocessed_sentences[i])
    if i%500 == 0: # Used to track progress of the deduplication
        print "Processed sentences:\t %d/%d"%(i,len(unprocessed_sentences))

print "Amount of kept items: %d"%len(keep_items)

# Write away processed data
w.write_to_pickle("output_file_name", keep_items)
Example #2
            keep_items.append(sentence)
        if (source_list.index(sentence)+ 1) % 5000 == 0:
            print "Kept/processed: %d/%d"%(len(keep_items), source_list.index(sentence)+1)
    return keep_items

similar_sentences = find_similar_sentences(wikipedia, top_words)
print len(similar_sentences)

if len(similar_sentences) > len(oneliners):
    wiki_sentences = draw_from_list_randomly(similar_sentences, len(oneliners))
else:
    print 'Not enough sentences were found in the first shift.\nThe process will be repeated with the top 2000 n-grams.'
    wiki_sentences = similar_sentences
    while len(wiki_sentences) < len(oneliners):
        print "No. of Wikipedia sentences similar to oneliners; %d" %len(wiki_sentences)
        wikipedia2 = [s for s in wikipedia if s not in wiki_sentences]
        top_2000 = print_top_tfidf(vectorizer, clf, ['POS'], 2000)
        second_shift_sents = find_similar_sentences(wikipedia2, top_2000)
        no_required_sents = len(oneliners) - len(wiki_sentences)
        if len(second_shift_sents) < no_required_sents:
            for i in second_shift_sents:
                wiki_sentences.append(i)
        else:
            for i in draw_from_list_randomly(second_shift_sents, no_required_sents):
                wiki_sentences.append(i)

print "Amount of kept items: %d"%len(wiki_sentences)

# Write away processed data
w.write_to_pickle("similar_wiki_sentences", wiki_sentences)
Example #3
remove_items = []
keep_sentences = []
tokensets = []

#Generate all bows
for i in unprocessed_sentences:
    tokensets.append(generate_bow(i))

print "Finished creating bow representations."

# Compare all sentences s for similarity with the sentences on index > s
for i in range(len(unprocessed_sentences)):
    if i not in remove_items:
        for item in range(i + 1, len(unprocessed_sentences)):
            if is_ci_token_stopword_set_match(tokensets[i],
                                              tokensets[item]):
                remove_items.append(item)
    if i % 50 == 0:  # Used to track progress of the deduplication
        print "processed %d sentences" % i

remove_items = list(OrderedDict.fromkeys(remove_items))
print "Removed items:"
print remove_items

for item in range(len(unprocessed_sentences)):
    if item not in remove_items:
        keep_sentences.append(unprocessed_sentences[item])

print "Amount of kept items: %d" % len(keep_sentences)
# Write away processed data
w.write_to_pickle("humorous_jokes", keep_sentences)
Example #4
user = '******'

latest_status_id = 987796177497096200  # Arbitrarily high number. NOTE: make sure the latest tweet sent by your user has an ID below this threshold
complete_statuses_list = []
amount_of_tweets_desired = 3200  # The number of tweets to gather from this user; retweets count towards this total.
# Request extra on accounts that retweet a lot. Twitter allows at most the 3200 latest tweets per user account and 200 per request.
for i in range(amount_of_tweets_desired / 200):
    statuses = twitter_api.statuses.user_timeline(screen_name=user,
                                                  count=200,
                                                  max_id=(latest_status_id -
                                                          1),
                                                  include_entities=False,
                                                  include_rts=False)
    #if you want to include retweets by the user, include_rts should be set to True
    status_ids = [status['id'] for status in statuses]

    # Use "[re.sub(....)" if you need all links to be removed from the tweet and "[status['text']" if not.
    complete_statuses_list += [
        re.sub(r"http\S+", "", status['text'])  #[status['text']
        for status in statuses
    ]

    latest_status_id = status_ids[-1]  # Oldest ID in this batch; used as max_id for the next request
    print "Scraped the %d latest tweets with an ID lower than %d" % (
        len(status_ids), latest_status_id)
print "Total number of tweets found: %d." % len(complete_statuses_list)

# Write away the data
savename = "%s" % user
w.write_to_pickle(savename, complete_statuses_list)
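# twitter_api is never constructed in this snippet. A minimal sketch assuming
# the Python Twitter Tools ('twitter') package, whose attribute-style
# statuses.user_timeline call matches the code above; all credentials below
# are placeholders.
import twitter

CONSUMER_KEY = 'YOUR_CONSUMER_KEY'
CONSUMER_SECRET = 'YOUR_CONSUMER_SECRET'
OAUTH_TOKEN = 'YOUR_OAUTH_TOKEN'
OAUTH_TOKEN_SECRET = 'YOUR_OAUTH_TOKEN_SECRET'

twitter_api = twitter.Twitter(auth=twitter.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                                                 CONSUMER_KEY, CONSUMER_SECRET))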
# Required library imports
import cPickle as pickle
from random import randrange
import write_functions as w
import sys  
reload(sys)  
sys.setdefaultencoding('utf8')

# Import wikipedia dataset
file_object = open('wiki.txt', 'r')
all_wiki_sentences = file_object.readlines()
print "File successfully opened. It contains %d sentences."%(len(all_wiki_sentences))

# Import dataset of oneliners
filename = "humorous_jokes.pickle"
oneliners = pickle.load(open(filename))

# Randomly pick as many sentences from the wikipedia dataset as there are jokes in the oneliner dataset
random_index = []
while len(random_index) < len(oneliners):
    random_no = randrange(len(all_wiki_sentences))
    if random_no not in random_index: # Ensures no sentence is picked twice
        random_index.append(random_no)

wiki_sents = []
for i in random_index:
    wiki_sents.append(all_wiki_sentences[i])
w.write_to_pickle("wiki_sentences", wiki_sents)
for p in range(1, page_max + 1):
    tree = html.fromstring(
        requests.get('http://onelinefun.com/%d/' % (p)).content)
    category_scores += scrape_single_user_rating(tree, rating_classname)
found_lines += scrape_webpages('http://onelinefun.com/', page_max,
                               text_class_name)

# Filter out jokes that are not funny.
if len(category_scores) == len(found_lines):
    for i in range(len(found_lines)):
        if category_scores[i] > 40.0:
            keep_lines.append(
                found_lines[i].text_content().split('One-liner')[0])
else:
    print 'The number of scores and the number of lines are imbalanced'
    print '%d scores found' % len(category_scores)
    print '%d lines found' % len(found_lines)

print "Finished processing.\nIt contained %d Funny lines." % (len(keep_lines))

lines.append(keep_lines)

for cat_lines in lines:
    for l in cat_lines:
        strip1 = l.strip('\n')
        strip2 = strip1.strip(' ')
        output1.append(strip2.strip('\n'))

# Write away the found oneliners
w.write_to_pickle("onelinefun", output1)
    for p in range(1,page_max):
        tree = html.fromstring(requests.get('http://www.funnyshortjokes.com/c/%s/page/%d'%(c,p)).content)       
        category_scores = category_scores + scrape_user_ratings(tree, pos_classname,neg_classname)
    found_lines += scrape_webpages('http://www.funnyshortjokes.com/c/%s/page/'%(c),page_max,text_class_name)
    # Filter out not funny jokes by removing all jokes that have received more negative than positive feedback
    if len(category_scores) == len(found_lines):
        print '%d scores and %d lines found'%(len(category_scores), len(found_lines))
        for i in range(len(found_lines)):
            if category_scores[i] >= 0:
                keep_lines.append(found_lines[i].text_content())
    else:
        print 'The number of scores and the number of lines are imbalanced'
        print '%d scores found'%len(category_scores)
        print '%d lines found'%len(found_lines)
    
    print "Finished processing %s.\nThe category contained %d Funny lines."%(c,len(keep_lines))
   
    lines.append(keep_lines)
    
print lines[0:2]
for cat_lines in lines:
    for l in cat_lines:
        strip1 = l.strip('\n')
        strip2 = strip1.strip(' ')
        output1.append(strip2.strip('\n').decode('utf-8', "replace"))
print output1[0:2]

# Write away the found oneliners
w.write_to_pickle("funnyshortjokes",output1)

Example #8
    for p in range(1, int(upper_bound_page_no) + 1):
        #request_page = requests.get('%s%s'%(url,p))
        tree = html.fromstring(requests.get('%s%s' % (url[:-1], p)).content)
        elements += tree.find_class(name_text_class)

    print "Found %d lines" % len(elements)
    return elements


uri_string = 'http://www.reuters.com/news/archive/businessNews?view=page&page=1'
output1 = []
tree = html.fromstring(requests.get('%s' % uri_string).content)
# print html.tostring(tree)  # Uncomment to print the entire HTML structure and find the relevant classnames for your website

text_class = 'story-title'
max_page_no = 400
categories = ['business', 'world', 'politics', 'technology']
headlines = []
for c in categories:
    cat_uri = 'http://www.reuters.com/news/archive/%sNews?view=page&page=1' % c
    cat_headlines = scrape_webpages(cat_uri, max_page_no, text_class)
    for hl in cat_headlines:
        headlines.append(hl)

for h in range(len(headlines)):
    headlines[h] = headlines[h].text_content().strip()

# Write away the gathered headlines:
w.write_to_pickle("reuters_scraped", headlines)
Example #9
        tree = html.fromstring(requests.get('%s%s' % (url[:-1], p)).content)
        elements += tree.find_class(name_text_class)

    print "Found %d lines" % len(elements)
    return elements


uri_string = 'http://www.reuters.com/news/archive/businessNews?view=page&page=1'
output1 = []
tree = html.fromstring(requests.get('%s' % uri_string).content)

# Uncomment this print to print the entire html structure and find the relevant classnames for your website
#print html.tostring(tree)

text_class = 'story-title'
max_page_no = 400
# Since Reuters currently stores 10 headlines in one container, this will result in 400*10 headlines per category
categories = ['business', 'world', 'politics', 'technology']
headlines = []
for c in categories:
    cat_uri = 'http://www.reuters.com/news/archive/%sNews?view=page&page=1' % c
    cat_headlines = scrape_webpages(cat_uri, max_page_no, text_class)
    for hl in cat_headlines:
        headlines.append(hl)

for h in range(len(headlines)):
    headlines[h] = headlines[h].text_content().strip()

# Write away the gathered headlines:
w.write_to_pickle("reuters_raw", headlines)
reload(sys)  
sys.setdefaultencoding('utf8')

# The following code opens a pickle file and stores the lines in a variable of type list
filename = 'humorous_jokes.pickle'
sentences = []
sentences = pickle.load(open(filename))
print "File succesfully imported. The file contains %d sentences." %len(sentences)

# Count the number of letters in each sentence
short_jokes = []
long_jokes = []

for sent in sentences:
    letter_counter = 0
    for char in sent:
        if char.lower() in 'abcdefghijklmnopqrstuvwxyz':
            letter_counter +=1
    if letter_counter <= 140:
        short_jokes.append(sent)
    else:
        long_jokes.append(sent)

# Some information on the data created
print "The short jokes file contains %d sentences."%len(short_jokes)
print "The longer jokes file contains %d sentences."%len(long_jokes)

# Write away the data to two separate pickle files
w.write_to_pickle('short_oneliners', short_jokes)
w.write_to_pickle('longer_jokes', long_jokes)
for p in range(1, page_max + 1):
    tree = html.fromstring(
        requests.get('https://unijokes.com/%d/' % (p), verify=False).content)
    category_scores += scrape_single_user_rating(tree, rating_classname)
found_lines += scrape_webpages('https://unijokes.com/', page_max,
                               text_class_name)

# Filter out jokes that are not funny.
if len(category_scores) == len(found_lines):
    for i in range(len(found_lines)):
        if category_scores[i] > 40.0:
            keep_lines.append(found_lines[i].text_content().split('Vote:')[0])
else:
    print 'The number of scores and the number of lines are imbalanced'
    print '%d scores found' % len(category_scores)
    print '%d lines found' % len(found_lines)

print "Finished processing.\nIt contained %d Funny lines." % (len(keep_lines))

for l in keep_lines:
    r = l.strip('\r')
    n = r.strip('\n')
    rn = n.replace('\r\n', '')
    space = rn.strip(' ')
    q = space.replace('Q: ', '')
    a = q.replace('A: ', ' ')
    lines.append(a.decode('utf-8', "replace"))

# Write away the found oneliners
w.write_to_pickle("unijokes", lines)
reload(sys)
sys.setdefaultencoding('utf8')

# The following code opens a pickle file and stores the lines in a variable of type list
filename = 'humorous_jokes.pickle'
sentences = []
sentences = pickle.load(open(filename))
print "File succesfully imported. The file contains %d sentences." % len(
    sentences)

# Count the number of letters in each sentence
short_jokes = []
long_jokes = []

for sent in sentences:
    letter_counter = 0
    for char in sent:
        if char.lower() in 'abcdefghijklmnopqrstuvwxyz':
            letter_counter += 1
    if letter_counter <= 140:
        short_jokes.append(sent)
    else:
        long_jokes.append(sent)

# Some information on the data created
print "The short jokes file contains %d sentences." % len(short_jokes)
print "The longer jokes file contains %d sentences." % len(long_jokes)

# Write away the data to two separate pickle files
w.write_to_pickle('short_oneliners', short_jokes)
w.write_to_pickle('longer_jokes', long_jokes)
        #request_page = requests.get('%s%s'%(url,p))
        tree = html.fromstring(requests.get('%s%s'%(url[:-1],p)).content)
        elements += tree.find_class(name_text_class)
        
    print "Found %d lines"%len(elements)
    return elements

uri_string = 'http://www.reuters.com/news/archive/businessNews?view=page&page=1'
output1 = []
tree = html.fromstring(requests.get('%s'%uri_string).content)

# Uncomment this print to print the entire html structure and find the relevant classnames for your website
#print html.tostring(tree) 

text_class = 'story-title'
max_page_no = 400
# Since Reuters currently stores 10 headlines in one container, this will result in 400*10 headlines per category
categories = ['business', 'world', 'politics', 'technology']
headlines = []
for c in categories:
    cat_uri = 'http://www.reuters.com/news/archive/%sNews?view=page&page=1'%c
    cat_headlines = scrape_webpages(cat_uri, max_page_no, text_class)
    for hl in cat_headlines:
        headlines.append(hl)

for h in range(len(headlines)):
    headlines[h] = headlines[h].text_content().strip()

# Write away the gathered headlines:
w.write_to_pickle("reuters_raw", headlines)
Example #14
# Required library imports
import cPickle as pickle
from random import randrange
import write_functions as w
import sys  
reload(sys)  
sys.setdefaultencoding('utf8')

# Import wikipedia dataset
file_object = open('wiki.txt', 'r')
all_wiki_sentences = file_object.readlines()
print "File successfully opened. It contains %d sentences."%(len(all_wiki_sentences))

# Import dataset of oneliners
filename = "humorous_jokes.pickle"
oneliners = pickle.load(open(filename))

# Randomly pick as many sentences from the wikipedia dataset as there are jokes in the oneliner dataset
random_index = []
while len(random_index) < len(oneliners):
    random_no = randrange(len(all_wiki_sentences))
    if random_no not in random_index: # Ensures no sentence is picked twice
        random_index.append(random_no)

wiki_sents = []
for i in random_index:
    wiki_sents.append(all_wiki_sentences[i])
w.write_to_pickle("wiki_sentences", wiki_sents)
tree = html.fromstring(requests.get('http://onelinefun.com/1/').content)
page_max = 359

for p in range(1,page_max+1):
    tree = html.fromstring(requests.get('http://onelinefun.com/%d/'%(p)).content)       
    category_scores += scrape_single_user_rating(tree, rating_classname)
found_lines += scrape_webpages('http://onelinefun.com/',page_max,text_class_name)

# Filter out jokes that are not funny.
if len(category_scores) == len(found_lines):
    for i in range(len(found_lines)):
        if category_scores[i] > 40.0:
            keep_lines.append(found_lines[i].text_content().split('One-liner')[0])
else:
    print 'The number of scores and the number of lines are imbalanced'
    print '%d scores found'%len(category_scores)
    print '%d lines found'%len(found_lines)
    
print "Finished processing.\nIt contained %d Funny lines."%(len(keep_lines))
   
lines.append(keep_lines)

for cat_lines in lines:
    for l in cat_lines:
        strip1 = l.strip('\n')
        strip2 = strip1.strip(' ')
        output1.append(strip2.strip('\n'))

# Write away the found oneliners
w.write_to_pickle("onelinefun",output1)
'''Tweet scraping, stripping (URLS) and saving step'''

import re
import write_functions as w

#Define the user you wish to gather the latest tweets from
user = '******'

latest_status_id = 987796177497096200 # Arbitrarily high number. NOTE: make sure the latest tweet sent by your user has an ID below this threshold
complete_statuses_list = []
amount_of_tweets_desired = 3200 # The number of tweets to gather from this user; retweets count towards this total.
# Request extra on accounts that retweet a lot. Twitter allows at most the 3200 latest tweets per user account and 200 per request.
for i in range(amount_of_tweets_desired/200):
    statuses = twitter_api.statuses.user_timeline(screen_name=user, count=200, max_id=(latest_status_id-1),
                                                  include_entities=False, include_rts=False) 
    #if you want to include retweets by the user, include_rts should be set to True
    status_ids = [status['id']
        for status in statuses ]
    
    # Use "[re.sub(....)" if you need all links to be removed from the tweet and "[status['text']" if not.
    complete_statuses_list +=  [re.sub(r"http\S+", "",status['text']) #[status['text']
        for status in statuses]

    latest_status_id = status_ids[-1] # Oldest ID in this batch; used as max_id for the next request
    print "Scraped the %d latest tweets with an ID lower than %d"% (len(status_ids), latest_status_id)
print "Total number of tweets found: %d." %len(complete_statuses_list)

# Write away the data
savename = "%s"%user
w.write_to_pickle(savename,complete_statuses_list)
Example #17
# Make sure all sentences are decoded correctly
unprocessed_sentences = [i.decode('utf8','replace') for i in unprocessed_sentences]
print "A total of %d lines were imported and are ready for deduplication!"%len(unprocessed_sentences)

'''Compare all sentences s for similarity with the sentences on index > s'''
keep_items = []
for i in range(len(unprocessed_sentences)):
    unique_item = True
    for item in keep_items:
        if tp.measure_overlap_bows(unprocessed_sentences[i], item, 0.9):
            unique_item = False
    if unique_item:
        keep_items.append(unprocessed_sentences[i])
    if i%50 == 0: # Used to track progress of the deduplication process, might take a while
        print "Processed sentences:\t %d"%i

print "Amount of kept items: %d"%len(keep_items)
keep_short = []
keep_long = []
for item in keep_items:
    if len(item) < 140:
        keep_short.append(item)
    else:
        keep_long.append(item)

print "Amount of short items kept: %d"%len(keep_short)
# Write away processed data
w.write_to_pickle("deduplicated_file", keep_short)
# Uncomment next line to also store longer items
#w.write_to_pickle("deduplicated_long_jokes", keep_long)
remove_items = []
keep_sentences = []
tokensets = []

#Generate all bows
for i in unprocessed_sentences:
    tokensets.append(generate_bow(i))

print "Finished creating bow representations."

# Compare all sentences s for similarity with the sentences on index > s
for i in range(len(unprocessed_sentences)):
    if i not in remove_items:
        for item in range(i+1, len(unprocessed_sentences)):
            if is_ci_token_stopword_set_match(tokensets[i], tokensets[item]):
                remove_items.append(item)
    if i%50 == 0: # Used to track progress of the deduplication
        print "processed %d sentences"%i
        
remove_items = list(OrderedDict.fromkeys(remove_items))
print "Removed items:"
print remove_items

for item in range(len(unprocessed_sentences)):
    if item not in remove_items:
        keep_sentences.append(unprocessed_sentences[item])

print "Amount of kept items: %d"%len(keep_sentences)
# Write away processed data
w.write_to_pickle("humorous_jokes",keep_sentences)
import requests
import write_functions as w

output1 = []
page_max = 385  # Enough pages to cover all jokes on the website

for i in range(1, page_max + 1):
    request_page = requests.get(
        'http://goodriddlesnow.com/jokes/find?sort=popularity&direction=desc&page=%d'
        % i)
    tree = html.fromstring(request_page.content)
    elements = tree.find_class('joke-question') + tree.find_class(
        'joke-answer hide')

    output = []
    for el in elements:
        # Drop the 'Joke: ' and 'Punch line: ' labels (use replace(), since
        # strip() removes a character set rather than a prefix)
        text = el.text_content().strip('\n').strip(' ')
        text = text.replace('Joke: ', '').replace('Punch line: ', '')
        output.append(text.replace('\n', ' ').decode('utf-8', "replace"))
    # Merge each question with its punch line and filter out noise entries
    for k in range(0, len(output) / 2):
        if 'Show Your Support :)' not in output[k + (len(output) / 2)]:
            output[k] = '%s %s' % (output[k], output[k + (len(output) / 2)])
    output = output[0:5]
    for out in output:
        output1.append(out)

# Write away the found oneliners
w.write_to_pickle('goodriddlesnow', output1)
        print '%d scores found'%len(category_scores)
        print '%d lines found'%len(found_lines)
    
    print "Finished processing %s.\nIt contained %d funny lines."%(c,len(keep_lines))
    for l in keep_lines:
        r = l.strip('\r')
        n = r.strip('\n')
        rn = n.replace('\r\n', '')
        space = rn.strip(' ')
        q = space.replace('Q:','')
        a = q.replace('A:','')
        lines.append(a.decode('utf-8', "replace"))
        
print len(lines)
print lines[0:5]
# split_output1 = []
# split2_output1 = []
# output2 = []
# for i in lines:
#     split_output1 += i.split('\n')
# for i in split_output1:
#     split2_output1 += i.split('\r')
# split2_output1 = filter(None,split2_output1)
# split3_output1 = [[i.strip(' ')] for i in split2_output1]
# 
# for i in split3_output1:
#     output2 += filter(None,i)
# print output2[0:5]
# Write away the found oneliners
w.write_to_pickle("laughfactory", lines)
page_max = 1100

for p in range(1,page_max+1):
    tree = html.fromstring(requests.get('https://unijokes.com/%d/'%(p), verify=False).content)       
    category_scores += scrape_single_user_rating(tree, rating_classname)
found_lines += scrape_webpages('https://unijokes.com/', page_max, text_class_name)

# Filter out jokes that are not funny.
if len(category_scores) == len(found_lines):
    for i in range(len(found_lines)):
        if category_scores[i] > 40.0:
            keep_lines.append(found_lines[i].text_content().split('Vote:')[0])
else:
    print 'The number of scores and the number of lines are imbalanced'
    print '%d scores found'%len(category_scores)
    print '%d lines found'%len(found_lines)
    
print "Finished processing.\nIt contained %d Funny lines."%(len(keep_lines))
   
for l in keep_lines:
    r = l.strip('\r')
    n = r.strip('\n')
    rn = n.replace('\r\n', '')
    space = rn.strip(' ')
    q = space.replace('Q: ','')
    a = q.replace('A: ',' ')
    lines.append(a.decode('utf-8', "replace"))

# Write away the found oneliners
w.write_to_pickle("unijokes", lines)