    return (ratio >= threshold)


def generate_bow(sentence):
    # Lower-case every token, strip punctuation and drop stop words
    tokenset = [token.lower().strip(string.punctuation) for token in nltk.word_tokenize(sentence)
                if token.lower().strip(string.punctuation) not in stopwords]
    return tokenset


tokensets = []
# Generate all BoW representations
for i in unprocessed_sentences:
    tokensets.append(generate_bow(i))
print "Finished creating bow representations."

# Compare every sentence s for similarity with the sentences at index > s
keep_items = []
for i in range(len(unprocessed_sentences)):
    unique_item = True
    for item in range(i + 1, len(unprocessed_sentences)):
        if is_ci_token_stopword_set_match(tokensets[i], tokensets[item]):
            unique_item = False
    if unique_item:
        keep_items.append(unprocessed_sentences[i])
    if i % 500 == 0:  # Used to track progress of the deduplication
        print "Processed sentences:\t %d/%d" % (i, len(unprocessed_sentences))
print "Number of kept items: %d" % len(keep_items)

# Write away processed data
w.write_to_pickle("output_file_name", keep_items)
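# The deduplication above relies on is_ci_token_stopword_set_match(), whose head
# is cut off in this listing. The sketch below shows one way such a helper could
# work, assuming it computes the overlap ratio of the two bag-of-words sets and
# compares it against a threshold; the exact measure and the 0.9 default are
# assumptions, not the original implementation.
def is_ci_token_stopword_set_match(tokenset_a, tokenset_b, threshold=0.9):
    """Return True when the two token sets overlap by at least `threshold`."""
    set_a, set_b = set(tokenset_a), set(tokenset_b)
    if not set_a or not set_b:
        return False
    ratio = float(len(set_a & set_b)) / min(len(set_a), len(set_b))
    return (ratio >= threshold)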
        keep_items.append(sentence)
        if (source_list.index(sentence) + 1) % 5000 == 0:
            print "Kept/processed: %d/%d" % (len(keep_items), source_list.index(sentence) + 1)
    return keep_items


similar_sentences = find_similar_sentences(wikipedia, top_words)
print len(similar_sentences)
if len(similar_sentences) > len(oneliners):
    wiki_sentences = draw_from_list_randomly(similar_sentences, len(oneliners))
else:
    print 'Not enough sentences were found in the first shift. \nProcess will be reiterated with the top 2000 ngrams.'
    wiki_sentences = similar_sentences
    while len(wiki_sentences) < len(oneliners):
        print "No. of Wikipedia sentences similar to oneliners: %d" % len(wiki_sentences)
        wikipedia2 = [s for s in wikipedia if s not in wiki_sentences]
        top_2000 = print_top_tfidf(vectorizer, clf, ['POS'], 2000)
        second_shift_sents = find_similar_sentences(wikipedia2, top_2000)
        no_required_sents = len(oneliners) - len(wiki_sentences)
        if len(second_shift_sents) < no_required_sents:
            for i in second_shift_sents:
                wiki_sentences.append(i)
        else:
            for i in draw_from_list_randomly(second_shift_sents, no_required_sents):
                wiki_sentences.append(i)

print "Number of kept items: %d" % len(wiki_sentences)
# Write away processed data
w.write_to_pickle("similar_wiki_sentences", wiki_sentences)
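# draw_from_list_randomly() is used above but not defined in this listing. A
# minimal sketch, assuming it simply draws n unique items without replacement:
import random


def draw_from_list_randomly(source_list, n):
    """Return n randomly chosen, unique items from source_list."""
    return random.sample(source_list, n)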
# Required library imports
import cPickle as pickle
from random import randrange
import write_functions as w
import sys
reload(sys)
sys.setdefaultencoding('utf8')

# Import the Wikipedia dataset
file_object = open('wiki.txt', 'r')
all_wiki_sentences = file_object.readlines()
print "File successfully opened. It contains %d sentences." % len(all_wiki_sentences)

# Import the dataset of oneliners
filename = "humorous_jokes.pickle"
oneliners = pickle.load(open(filename))

# Randomly pick as many sentences from the Wikipedia dataset as there are jokes in the oneliner dataset
random_index = []
while len(random_index) < len(oneliners):
    random_no = randrange(len(all_wiki_sentences))
    if random_no not in random_index:  # This check ensures no sentence is picked twice
        random_index.append(random_no)

wiki_sents = []
for i in random_index:
    wiki_sents.append(all_wiki_sentences[i])

w.write_to_pickle("wiki_sentences", wiki_sents)
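# Every listing stores its results through write_functions.write_to_pickle().
# That module is not reproduced here; judging by the file names loaded back
# elsewhere, the helper is assumed to do no more than serialise an object to
# "<name>.pickle", roughly as sketched below.
import cPickle as pickle


def write_to_pickle(filename, data):
    """Dump `data` to <filename>.pickle in the working directory."""
    with open('%s.pickle' % filename, 'wb') as handle:
        pickle.dump(data, handle)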
for p in range(1, page_max):
    tree = html.fromstring(requests.get('http://www.funnyshortjokes.com/c/%s/page/%d' % (c, p)).content)
    category_scores = category_scores + scrape_user_ratings(tree, pos_classname, neg_classname)
found_lines += scrape_webpages('http://www.funnyshortjokes.com/c/%s/page/' % (c), page_max, text_class_name)

# Filter out unfunny jokes by removing all jokes that received more negative than positive feedback
if len(category_scores) == len(found_lines):
    print '%d scores and %d lines found' % (len(category_scores), len(found_lines))
    for i in range(len(found_lines)):
        if category_scores[i] >= 0:
            keep_lines.append(found_lines[i].text_content())
else:
    print 'The number of scores and the number of lines are imbalanced'
    print '%d scores found' % len(category_scores)
    print '%d lines found' % len(found_lines)
print "Finished processing %s.\nThe category contained %d funny lines." % (c, len(keep_lines))
lines.append(keep_lines)

print lines[0:2]
for cat_lines in lines:
    for l in cat_lines:
        strip1 = l.strip('\n')
        strip2 = strip1.strip(' ')
        output1.append(strip2.strip('\n').decode('utf-8', "replace"))
print output1[0:2]

# Write away the found oneliners
w.write_to_pickle("funnyshortjokes", output1)
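# scrape_user_ratings() is not part of this listing. It is assumed to read the
# up- and down-vote counters from the page and return one net score per joke;
# a sketch of such a helper (the exact parsing is an assumption):
def scrape_user_ratings(tree, pos_classname, neg_classname):
    """Return a list of (positive - negative) vote counts, one per joke."""
    pos_counts = [int(el.text_content().strip()) for el in tree.find_class(pos_classname)]
    neg_counts = [int(el.text_content().strip()) for el in tree.find_class(neg_classname)]
    return [p - n for p, n in zip(pos_counts, neg_counts)]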
    for p in range(1, int(upper_bound_page_no) + 1):
        # request_page = requests.get('%s%s' % (url, p))
        tree = html.fromstring(requests.get('%s%s' % (url[:-1], p)).content)
        elements += tree.find_class(name_text_class)
    print "Found %d lines" % len(elements)
    return elements


uri_string = 'http://www.reuters.com/news/archive/businessNews?view=page&page=1'
output1 = []
tree = html.fromstring(requests.get('%s' % uri_string).content)
# Uncomment this print to print the entire html structure and find the relevant classnames for your website
# print html.tostring(tree)
text_class = 'story-title'
max_page_no = 400
categories = ['business', 'world', 'politics', 'technology']
headlines = []
for c in categories:
    cat_uri = 'http://www.reuters.com/news/archive/%sNews?view=page&page=1' % c
    cat_headlines = scrape_webpages(cat_uri, max_page_no, text_class)
    for hl in cat_headlines:
        headlines.append(hl)
for h in range(len(headlines)):
    headlines[h] = headlines[h].text_content().strip()

# Write away the gathered headlines:
w.write_to_pickle("reuters_scraped", headlines)
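# The listing above begins inside scrape_webpages(); based on how it is called,
# its full definition is assumed to look roughly like this:
import requests
from lxml import html


def scrape_webpages(url, upper_bound_page_no, name_text_class):
    """Collect every element carrying the CSS class `name_text_class` from pages 1..N of `url`."""
    elements = []
    for p in range(1, int(upper_bound_page_no) + 1):
        tree = html.fromstring(requests.get('%s%s' % (url[:-1], p)).content)
        elements += tree.find_class(name_text_class)
    print "Found %d lines" % len(elements)
    return elements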
reload(sys)
sys.setdefaultencoding('utf8')

# The following code opens a pickle file and stores its lines in a variable of type list
filename = 'humorous_jokes.pickle'
sentences = pickle.load(open(filename))
print "File successfully imported. The file contains %d sentences." % len(sentences)

# Count the number of letters in each sentence
short_jokes = []
long_jokes = []
for sent in sentences:
    letter_counter = 0
    for char in sent:
        if char.lower() in 'abcdefghijklmnopqrstuvwxyz':
            letter_counter += 1
    if letter_counter <= 140:
        short_jokes.append(sent)
    else:
        long_jokes.append(sent)

# Some information on the data created
print "The short jokes file contains %d sentences." % len(short_jokes)
print "The longer jokes file contains %d sentences." % len(long_jokes)

# Write away the data to two separate pickle files
w.write_to_pickle('short_oneliners', short_jokes)
w.write_to_pickle('longer_jokes', long_jokes)
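# The letter-counting loop above can also be expressed as a small helper; this
# is only an equivalent shorthand, not a change in behaviour:
def count_letters(sentence):
    """Count the ASCII letters in a sentence, ignoring digits, spaces and punctuation."""
    return sum(1 for char in sentence if char.lower() in 'abcdefghijklmnopqrstuvwxyz')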
        # request_page = requests.get('%s%s' % (url, p))
        tree = html.fromstring(requests.get('%s%s' % (url[:-1], p)).content)
        elements += tree.find_class(name_text_class)
    print "Found %d lines" % len(elements)
    return elements


uri_string = 'http://www.reuters.com/news/archive/businessNews?view=page&page=1'
output1 = []
tree = html.fromstring(requests.get('%s' % uri_string).content)
# Uncomment this print to print the entire html structure and find the relevant classnames for your website
# print html.tostring(tree)
text_class = 'story-title'
max_page_no = 400  # Since Reuters currently stores 10 headlines in one container, this results in 400*10 headlines per category
categories = ['business', 'world', 'politics', 'technology']
headlines = []
for c in categories:
    cat_uri = 'http://www.reuters.com/news/archive/%sNews?view=page&page=1' % c
    cat_headlines = scrape_webpages(cat_uri, max_page_no, text_class)
    for hl in cat_headlines:
        headlines.append(hl)
for h in range(len(headlines)):
    headlines[h] = headlines[h].text_content().strip()

# Write away the gathered headlines:
w.write_to_pickle("reuters_raw", headlines)
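# The stored headlines can later be loaded back with cPickle, e.g. (assuming
# write_to_pickle() appends the ".pickle" extension as sketched earlier):
import cPickle as pickle

reuters_headlines = pickle.load(open('reuters_raw.pickle'))
print "Loaded %d headlines." % len(reuters_headlines)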
tree = html.fromstring(requests.get('http://onelinefun.com/1/').content)
page_max = 359
for p in range(1, page_max + 1):
    tree = html.fromstring(requests.get('http://onelinefun.com/%d/' % (p)).content)
    category_scores += scrape_single_user_rating(tree, rating_classname)
found_lines += scrape_webpages('http://onelinefun.com/', page_max, text_class_name)

# Filter out unfunny jokes.
if len(category_scores) == len(found_lines):
    for i in range(len(found_lines)):
        if category_scores[i] > 40.0:
            keep_lines.append(found_lines[i].text_content().split('One-liner')[0])
else:
    print 'The number of scores and the number of lines are imbalanced'
    print '%d scores found' % len(category_scores)
    print '%d lines found' % len(found_lines)
print "Finished processing.\nIt contained %d funny lines." % (len(keep_lines))
lines.append(keep_lines)

for cat_lines in lines:
    for l in cat_lines:
        strip1 = l.strip('\n')
        strip2 = strip1.strip(' ')
        output1.append(strip2.strip('\n'))

# Write away the found oneliners
w.write_to_pickle("onelinefun", output1)
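# scrape_single_user_rating() is assumed to extract one numeric rating per joke
# from the elements carrying `rating_classname`; a sketch of such a helper (the
# parsing of the rating text is an assumption):
import re


def scrape_single_user_rating(tree, rating_classname):
    """Return one rating value per joke found on the page."""
    scores = []
    for el in tree.find_class(rating_classname):
        match = re.search(r'\d+(\.\d+)?', el.text_content())
        scores.append(float(match.group()) if match else 0.0)
    return scores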
'''Tweet scraping, stripping (URLs) and saving step'''
import re
import write_functions as w

# Define the user you wish to gather the latest tweets from
user = '******'
latest_status_id = 987796177497096200  # Arbitrary high number. NOTE: make sure the latest tweet sent by your user is below this threshold
complete_statuses_list = []
amount_of_tweets_desired = 3200  # The number of tweets you want to gather from the respective user. This includes RTs in its count.
# Add extra on accounts that retweet a lot. Twitter allows you to request a maximum of 3200 of the latest tweets per user account and 200 per request.
for i in range(amount_of_tweets_desired / 200):
    statuses = twitter_api.statuses.user_timeline(screen_name=user,
                                                  count=200,
                                                  max_id=(latest_status_id - 1),
                                                  include_entities=False,
                                                  include_rts=False)  # If you want to include retweets by the user, set include_rts to True
    status_ids = [status['id'] for status in statuses]
    # Use "re.sub(...)" if you need all links to be removed from the tweets and "status['text']" if not.
    complete_statuses_list += [re.sub(r"http\S+", "", status['text'])
                               # [status['text']
                               for status in statuses]
    latest_status_id = status_ids[-1]  # Used to iterate to the next batch of tweets
    print "Scraped the %d latest tweets with an ID lower than %d" % (len(status_ids), latest_status_id)

print "Total number of tweets found: %d." % len(complete_statuses_list)

# Write away the data
savename = "%s" % user
w.write_to_pickle(savename, complete_statuses_list)
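# The listing assumes an already-authenticated `twitter_api` object. With the
# Python Twitter Tools package (pip install twitter), whose call style matches
# twitter_api.statuses.user_timeline(...) above, it could be set up as follows;
# the credential placeholders stand for your own application keys:
import twitter

CONSUMER_KEY = '...'
CONSUMER_SECRET = '...'
OAUTH_TOKEN = '...'
OAUTH_TOKEN_SECRET = '...'

auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                           CONSUMER_KEY, CONSUMER_SECRET)
twitter_api = twitter.Twitter(auth=auth)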
# Make sure all sentences are decoded correctly
unprocessed_sentences = [i.decode('utf8', 'replace') for i in unprocessed_sentences]
print "A total of %d lines were imported and are ready for deduplication!" % len(unprocessed_sentences)

# Compare all sentences s for similarity with the sentences at index > s
keep_items = []
for i in range(len(unprocessed_sentences)):
    unique_item = True
    for item in keep_items:
        if tp.measure_overlap_bows(unprocessed_sentences[i], item, 0.9):
            unique_item = False
    if unique_item:
        keep_items.append(unprocessed_sentences[i])
    if i % 50 == 0:  # Used to track progress of the deduplication process, which might take a while
        print "Processed sentences:\t %d" % i
print "Number of kept items: %d" % len(keep_items)

keep_short = []
keep_long = []
for item in keep_items:
    if len(item) < 140:
        keep_short.append(item)
    else:
        keep_long.append(item)
print "Number of short items kept: %d" % len(keep_short)

# Write away processed data
w.write_to_pickle("deduplicated_file", keep_short)
# Uncomment the next line to also store the longer items
# w.write_to_pickle("deduplicated_long_jokes", keep_long)
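# tp.measure_overlap_bows() comes from a helper module that is not part of this
# listing. It is assumed to tokenise both sentences into bag-of-words sets and
# report whether their overlap ratio reaches the given threshold; a sketch:
import string
import nltk


def measure_overlap_bows(sentence_a, sentence_b, threshold):
    """Return True when the two sentences' bag-of-words overlap ratio reaches `threshold`."""
    bow_a = set(t.lower().strip(string.punctuation) for t in nltk.word_tokenize(sentence_a))
    bow_b = set(t.lower().strip(string.punctuation) for t in nltk.word_tokenize(sentence_b))
    if not bow_a or not bow_b:
        return False
    ratio = float(len(bow_a & bow_b)) / min(len(bow_a), len(bow_b))
    return ratio >= threshold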
remove_items = []
keep_sentences = []
tokensets = []

# Generate all BoW representations
for i in unprocessed_sentences:
    tokensets.append(generate_bow(i))
print "Finished creating bow representations."

# Compare every sentence s for similarity with the sentences at index > s
for i in range(len(unprocessed_sentences)):
    if i not in remove_items:
        for item in range(i + 1, len(unprocessed_sentences)):
            if is_ci_token_stopword_set_match(tokensets[i], tokensets[item]):
                remove_items.append(item)
    if i % 50 == 0:  # Used to track progress of the deduplication
        print "processed %d sentences" % i

remove_items = list(OrderedDict.fromkeys(remove_items))
print "Removed items:"
print remove_items

for item in range(len(unprocessed_sentences)):
    if item not in remove_items:
        keep_sentences.append(unprocessed_sentences[item])
print "Number of kept items: %d" % len(keep_sentences)

# Write away processed data
w.write_to_pickle("humorous_jokes", keep_sentences)
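# The "humorous_jokes" pickle written above is the file that the splitting and
# Wikipedia-sampling listings load again, e.g.:
import cPickle as pickle

oneliners = pickle.load(open('humorous_jokes.pickle'))
print "Loaded %d deduplicated jokes." % len(oneliners)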
import requests
from lxml import html
import write_functions as w

output1 = []
page_max = 385  # 385 pages cover all jokes on the website
for i in range(1, page_max + 1):
    request_page = requests.get('http://goodriddlesnow.com/jokes/find?sort=popularity&direction=desc&page=%d' % i)
    tree = html.fromstring(request_page.content)
    elements = tree.find_class('joke-question') + tree.find_class('joke-answer hide')
    output = []
    for el in elements:
        text = el.text_content().strip('\n').strip(' ')
        text = text.replace('Joke: ', '').replace('Punch line: ', '')
        output.append(text.replace('\n', ' ').decode('utf-8', "replace"))
    # Pair each question with its punch line and filter out all noise
    for k in range(0, len(output) / 2):
        if 'Show Your Support :)' not in output[k + (len(output) / 2)]:
            output[k] = '%s %s' % (output[k], output[k + (len(output) / 2)])
    output = output[0:5]  # Keep only the merged jokes
    for out in output:
        output1.append(out)

# Write away the found oneliners
w.write_to_pickle('goodriddlesnow', output1)
    print '%d scores found' % len(category_scores)
    print '%d lines found' % len(found_lines)
print "Finished processing %s.\nIt contained %d funny lines." % (c, len(keep_lines))

for l in keep_lines:
    r = l.strip('\r')
    n = r.strip('\n')
    rn = n.replace('\r\n', '')
    space = rn.strip(' ')
    q = space.replace('Q:', '')
    a = q.replace('A:', '')
    lines.append(a.decode('utf-8', "replace"))

print len(lines)
print lines[0:5]

# split_output1 = []
# split2_output1 = []
# output2 = []
# for i in lines:
#     split_output1 += i.split('\n')
# for i in split_output1:
#     split2_output1 += i.split('\r')
# split2_output1 = filter(None, split2_output1)
# split3_output1 = [[i.strip(' ')] for i in split2_output1]
#
# for i in split3_output1:
#     output2 += filter(None, i)
# print output2[0:5]

# Write away the found oneliners
w.write_to_pickle("laughfactory", lines)
page_max = 1100
for p in range(1, page_max + 1):
    tree = html.fromstring(requests.get('https://unijokes.com/%d/' % (p), verify=False).content)
    category_scores += scrape_single_user_rating(tree, rating_classname)
found_lines += scrape_webpages('https://unijokes.com/', page_max, text_class_name)

# Filter out unfunny jokes.
if len(category_scores) == len(found_lines):
    for i in range(len(found_lines)):
        if category_scores[i] > 40.0:
            keep_lines.append(found_lines[i].text_content().split('Vote:')[0])
else:
    print 'The number of scores and the number of lines are imbalanced'
    print '%d scores found' % len(category_scores)
    print '%d lines found' % len(found_lines)
print "Finished processing.\nIt contained %d funny lines." % (len(keep_lines))

for l in keep_lines:
    r = l.strip('\r')
    n = r.strip('\n')
    rn = n.replace('\r\n', '')
    space = rn.strip(' ')
    q = space.replace('Q: ', '')
    a = q.replace('A: ', ' ')
    lines.append(a.decode('utf-8', "replace"))

# Write away the found oneliners
w.write_to_pickle("unijokes", lines)
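# Passing verify=False disables certificate verification, so urllib3 emits an
# InsecureRequestWarning for every request. If that behaviour is intended, the
# warning can be silenced up front:
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)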