''' Week3 Word count example. By Paul Wu [email protected] '''
from __future__ import print_function

import re
import sys
import time

from util import spark, sc

start_time = time.time()

# Load the text and split every line into lowercase words.
text = sc.textFile("data/war_and_peace.txt")
words = text.flatMap(lambda line: re.split(r'\W+', line.lower()))

# Lines that mention both 'peace' and 'war' (computed but not printed here).
peace_and_war = text.filter(lambda line: 'peace' in re.split(r'\W+', line.lower())
                            and 'war' in re.split(r'\W+', line.lower()))

# Classic word count: (word, 1) pairs reduced by key, then sorted by descending frequency.
wordsPairs = words.map(lambda word: (word, 1))
counts = wordsPairs.reduceByKey(lambda value1, value2: value1 + value2)
counts = counts.sortBy(lambda pair: -pair[1])
counts.foreach(print)

sc.stop()
print("--- %s seconds ---" % (time.time() - start_time))
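Each of these scripts imports `spark` and `sc` from a local `util` module that is not shown in this section. A minimal sketch of what it might contain, assuming a local-mode SparkSession (the app name and master setting are placeholders, not the original configuration; the tweet-loading helpers used later are omitted here):

"""Minimal sketch of util.py (assumed, not the original module)."""
from pyspark.sql import SparkSession

# One shared SparkSession / SparkContext reused by every example script.
spark = SparkSession.builder \
    .appName("spark-examples") \
    .master("local[*]") \
    .getOrCreate()
sc = spark.sparkContext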
''' By Paul Wu [email protected] '''
from __future__ import print_function

from util import sc

lines = sc.textFile("../data/airport-codes.csv")
print("partition size", lines.getNumPartitions())

# Repartition the data; without this the job runs out of memory.
lines = lines.repartition(200)

# Keep only rows for Californian airports.
california = lines.filter(lambda line: "US-CA" in line)
print(california.take(10))

# Keep rows whose fifth CSV column is greater than 37 (roughly Northern California latitudes).
norcal = california.filter(
    lambda line: float(line.split(',')[4].strip('"')) > 37)

# The airport code is the first CSV column.
codes = norcal.map(lambda line: line.split(',')[0])
print("count", codes.count())

sc.stop()
from __future__ import print_function

from util import sc


# Convert a string to a list of its alphabetic characters,
# dropping digits, punctuation and whitespace.
def to_list(text):
    ret = []
    for c in text:
        if c.isalpha():
            ret.append(c)
    return ret


if __name__ == '__main__':
    # Read files...can be from hdfs.
    text_file = sc.textFile("../data/ch_words*.txt")
    text_file.foreach(print)

    # Transform the RDD into key-value pairs (word, count) and reduce by key.
    counts = text_file.flatMap(lambda line: to_list(line)) \
        .map(lambda word: (word.lower(), 1)) \
        .reduceByKey(lambda a, b: a + b)

    # Sort by descending frequency.
    counts = counts.sortBy(lambda pair: -pair[1])

    # Collect as a list of tuples and print.
    count_list = counts.collect()
    for item in count_list:
        print(item[0] + ":", item[1])

    sc.stop()
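to_list works for Chinese text because Python 3's str.isalpha() is true for CJK characters as well as ASCII letters, so each Chinese character survives the filter as its own "word". A quick check:

# str.isalpha() keeps CJK characters; digits and punctuation are dropped.
assert '中'.isalpha()
assert not '，'.isalpha() and not '3'.isalpha()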
''' By Paul Wu [email protected] '''
from __future__ import print_function

from util import spark, sc

# numbers.txt holds tab-separated numeric values.
input_rdd = sc.textFile("../data/numbers.txt")
numbers = input_rdd.flatMap(lambda line: line.split('\t'))
numbers.foreach(print)

# Reduce to a running total; float() is idempotent, so re-converting the
# accumulated value on later steps is harmless.
sums = numbers.reduce(lambda a, b: float(a) + float(b))
print('sum={0}'.format(sums))
print('avg={0}'.format(sums / numbers.count()))
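An equivalent formulation, sketched under the same assumption of tab-separated numeric input, converts to float once up front so the RDD can be summed directly:

# Alternative sketch: parse once with map(float), then use RDD.sum().
values = sc.textFile("../data/numbers.txt") \
    .flatMap(lambda line: line.split('\t')) \
    .map(float)
total = values.sum()
print('sum={0}'.format(total))
print('avg={0}'.format(total / values.count()))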
def main(training, input_file, output, sample):
    # Import libraries here to avoid the cost when the argument parser fails prematurely.
    from pyspark import SparkContext
    from pyspark import SparkConf
    from util import load_tweets, sc, load_stopwords

    # Find the total tweet count.
    tweets = load_tweets(training, sample=sample)
    tweet_count = tweets.count()

    # Read the input text and turn it into a set of lowercase words.
    raw_input_text = sc.textFile(input_file)
    input_text = raw_input_text.map(lambda row: row.lower().split(' '))
    input_text = input_text.collect()[0]
    input_text_set = set(input_text)

    # Map each tweet to (location, list of lowercase words).
    tweets = tweets.map(lambda row: (row[4], row[10].lower().split(' ')))

    # Calculate tweets per location.
    location_count_int = tweets.map(lambda row: (row[0], 1))
    # ('Framingham, MA', 1)
    location_count = location_count_int.reduceByKey(lambda a, b: a + b)
    # ('Manhattan, NY', 686809)

    # Drop every tweet word that does not appear in the input text.
    tweets = tweets.map(lambda row: (row[0], set(row[1]).intersection(input_text_set)))

    # Flat-map tweet words to (location, word) tuples.
    # A set is used above so word occurrences within a single tweet are not double counted.
    location_word = tweets.flatMap(lambda row: ((row[0], word) for word in row[1]))
    # ('Framingham, MA', 'can')

    # Count occurrences of each word per city.
    location_word_int = location_word.map(lambda row: (row, 1))
    # (('East Hartford, CT', '#easthartford'), 1)
    location_word_count_int = location_word_int.reduceByKey(lambda a, b: a + b)
    # (('East Hartford, CT', '#easthartford'), 21)
    location_word_count = location_word_count_int.map(
        lambda row: (row[0][0], [(row[0][1], row[1])]))
    # ('East Hartford, CT', [('#easthartford', 21)])

    # Reverse of the flatMap, so each city carries a list of (word, frequency) tuples.
    location_words = location_word_count.combineByKey(list, list.__add__, list.__add__)
    # ('El Oro, Mxico', [('oro', 1), ... ('norte', 4), ('tiro', 2), ('#puebleando', 1)])

    # Convert the list of tuples to a dictionary with words as keys and frequencies as values.
    location_word_freqs = location_words.map(
        lambda row: (row[0], {word: frequency for word, frequency in row[1]}))
    # ('El Oro, Mxico', {'oro': 1, '(': 1, '@': 3, ... '#puebleando': 1})

    # Join in Tc (tweet count per city).
    location_word_freq_tc = location_word_freqs.join(location_count)
    # ('El Oro, Mxico', (frequency_dict, Tc))

    # Product of the per-word weights: Tc,w1/Tc * Tc,w2/Tc * ... * Tc,wn/Tc
    location_prob_intermediate = location_word_freq_tc.map(lambda row: (
        row[0],
        multiply_list([get_or_zero(row[1][0], word) for word in input_text], row[1][1]),
        row[1][1]))

    # Calculate the final probability: (Tc / T) * product of (Tc,wi / Tc).
    location_prob = location_prob_intermediate.map(
        lambda row: (row[0], (row[2] / tweet_count) * row[1]))

    # Get the top two results.
    top_results = location_prob.takeOrdered(2, key=lambda row: -row[1])

    # Print results that fit the requirements.
    if not top_results:
        # There are no matching cities.
        top_results = [['', '']]
    elif top_results[0][1] != top_results[1][1]:
        # The top result is not tied with second place, so keep only the top result.
        top_results = [top_results[0]]

    if top_results[0][1] == 0.0:
        # The top probability is zero; replace it with an empty string.
        top_results = [['', '']]

    with open(output, 'w+') as out:
        # Write the result to the output file.
        for line in top_results:
            out.write(f'{line[0]}\t{line[1]}\t')
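main() calls two helpers, multiply_list and get_or_zero, that are defined elsewhere in the project and not shown here. A hedged sketch of what they plausibly compute, inferred only from the (Tc / T) * product(Tc,wi / Tc) formula in the comments:

# Assumed helpers (not the original implementations).

def get_or_zero(freq_dict, word):
    # Frequency of `word` in this city's tweets, or 0 if the word never appears.
    return freq_dict.get(word, 0)


def multiply_list(frequencies, city_tweet_count):
    # Product of Tc,w / Tc over every word in the query text.
    product = 1.0
    for frequency in frequencies:
        product *= frequency / city_tweet_count
    return product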
''' Week3 Word count example. By Paul Wu [email protected] '''
from __future__ import print_function

import re
import sys
import time

from util import spark, sc

start_time = time.time()

# Load the text and split every line into lowercase words.
text = sc.textFile("../../data/war_and_peace-300.txt")
words = text.flatMap(lambda line: re.split(r'\W+', line.lower()))

# Classic word count: (word, 1) pairs reduced by key, then sorted by descending frequency.
wordsPairs = words.map(lambda word: (word, 1))
counts = wordsPairs.reduceByKey(lambda value1, value2: value1 + value2)
counts = counts.sortBy(lambda pair: -pair[1])
counts.foreach(print)

sc.stop()
print("--- %s seconds ---" % (time.time() - start_time))