Example #1
'''

Week 3 word count example.

By Paul Wu [email protected]

'''

from __future__ import print_function
import sys
import re

from util import spark, sc
import time

start_time = time.time()

text = sc.textFile("data/war_and_peace.txt")
words = text.flatMap(lambda line: re.split(r'\W+', line.lower()))
# Lines that mention both "peace" and "war" as whole words.
peace_and_war = text.filter(lambda line: 'peace' in re.split(
    r'\W+', line.lower()) and 'war' in re.split(r'\W+', line.lower()))
word_pairs = words.map(lambda word: (word, 1))
counts = word_pairs.reduceByKey(lambda value1, value2: value1 + value2)
counts = counts.sortBy(lambda pair: -pair[1])
counts.foreach(print)

sc.stop()
print("--- %s seconds ---" % (time.time() - start_time))
Example #2

'''

By Paul Wu [email protected]

'''

from __future__ import print_function
from util import sc

lines = sc.textFile("../data/airport-codes.csv")
print("partition size", lines.getNumPartitions())
# Without repartitioning, this job can run out of memory on a single large partition.
lines = lines.repartition(200)
california = lines.filter(lambda line: "US-CA" in line)
print(california.take(10))
# Column index 4 is assumed to hold the latitude; keep airports north of 37 degrees.
norcal = california.filter(
    lambda line: float(line.split(',')[4].strip('"')) > 37)
codes = norcal.map(lambda line: line.split(',')[0])
print("count", codes.count())
sc.stop()
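
Splitting CSV rows with line.split(',') breaks as soon as a quoted field itself contains a comma. A more defensive sketch of the same filter chain, assuming the same column layout (column index 4 holding the latitude), parses each line with Python's csv module:

import csv
from io import StringIO

def parse_csv_line(line):
    # Parse one CSV line, honouring quoted fields; empty lines become empty rows.
    return next(csv.reader(StringIO(line)), [])

rows = sc.textFile("../data/airport-codes.csv").map(parse_csv_line)
california = rows.filter(lambda row: "US-CA" in row)
norcal = california.filter(
    lambda row: len(row) > 4 and row[4] and float(row[4]) > 37)
codes = norcal.map(lambda row: row[0])
print("count", codes.count())
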
Example #3

from __future__ import print_function
from util import sc


# Return the alphabetic characters of a string as a list.
def to_list(s):
    ret = []
    for c in s:
        if c.isalpha():
            ret.append(c)
    return ret


if __name__ == '__main__':
    # Read the input files; the path can also point to HDFS.
    text_file = sc.textFile("../data/ch_words*.txt")
    # Print the raw input lines (debug output).
    text_file.foreach(print)

    # Transform the RDD into (character, 1) pairs and sum the counts per character.
    counts = text_file.flatMap(lambda line: to_list(line)) \
        .map(lambda word: (word.lower(), 1)) \
        .reduceByKey(lambda a, b: a + b)
    counts = counts.sortBy(lambda pair: -pair[1])
    # Collect it as tuple and print.
    count_list = counts.collect()
    for item in count_list:
        print(item[0] + ":", item[1])
    sc.stop()
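
As a side note, the flatMap/map/reduceByKey chain above could also be collapsed with countByValue(), which ships the per-character counts back to the driver as a dict; a sketch that is reasonable here only because the number of distinct characters is small:

char_counts = text_file.flatMap(to_list) \
    .map(lambda c: c.lower()) \
    .countByValue()
for char, n in sorted(char_counts.items(), key=lambda kv: -kv[1]):
    print(char + ":", n)
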
Example #4
'''

By Paul Wu [email protected]

'''
from __future__ import print_function
from util import spark, sc

input_rdd = sc.textFile("../data/numbers.txt")

# Each line holds tab-separated numbers; flatten them into one RDD of floats.
numbers = input_rdd.flatMap(lambda line: line.split('\t')).map(float)
numbers.foreach(print)

sums = numbers.reduce(lambda a, b: a + b)
print('sum={0}'.format(sums))
print('avg={0}'.format(sums / numbers.count()))
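
The same figures can also be read off the RDD with stats(), which returns a StatCounter exposing count, mean, sum, stdev and so on; a short sketch reusing the numbers RDD from above:

st = numbers.stats()
print('sum={0}'.format(st.sum()))
print('avg={0}'.format(st.mean()))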
Example #5
def main(training, input_file, output, sample):
    # Import libraries here to avoid the cost when the parser fails prematurely

    from pyspark import SparkContext
    from pyspark import SparkConf
    from util import load_tweets, sc, load_stopwords

    # Find total tweet count
    tweets = load_tweets(training, sample=sample)
    tweet_count = tweets.count()

    # Read input text
    raw_input_text = sc.textFile(input_file)
    input_text = raw_input_text.map(lambda row: row.lower().split(' '))
    input_text = input_text.collect()[0]
    input_text_set = set(input_text)

    # Load tweets
    tweets = tweets.map(lambda row: (row[4], (row[10].lower().split(' '))))

    # Calculate tweets per location
    location_count_int = tweets.map(lambda row: (row[0], 1))
    # ('Framingham, MA', 1)

    location_count = location_count_int.reduceByKey(lambda a, b: (a + b))
    # ('Manhattan, NY', 686809)

    # Keep only the tweet words that also appear in the input text.
    tweets = tweets.map(lambda row:
                        (row[0], set(row[1]).intersection(input_text_set)))

    # Flatmap tweet words to (location, word) tuples
    # Use a set so we don't double count word occurrences in a single tweet
    location_word = tweets.flatMap(lambda row:
                                   (((row[0]), word) for word in row[1]))
    # ('Framingham, MA', 'can')

    # Count occurrences of each word per city
    location_word_int = location_word.map(lambda row: (row, 1))
    # (('East Hartford, CT', '#easthartford'), 1)
    location_word_count_int = location_word_int.reduceByKey(lambda a, b: a + b)
    # (('East Hartford, CT', '#easthartford'), 21)
    location_word_count = location_word_count_int.map(
        lambda row: ((row[0][0]), [(row[0][1], row[1])]))
    # ('East Hartford, CT', ('#easthartford', 21) )

    # Reverse of flatmap so we can have lists of tuples (word, frequency) for each city
    location_words = location_word_count.combineByKey(list, list.__add__,
                                                      list.__add__)
    # ('El Oro, México', [('oro', 1), ... ('norte', 4), ('tiro', 2), ('#puebleando', 1)])

    # Convert list of tuples to a dictionary with words as keys, and frequencies as values
    location_word_freqs = location_words.map(
        lambda row: (row[0], {word: frequency
                              for word, frequency in row[1]}))
    # ('El Oro, México', {'oro': 1, '(': 1, '@': 3, ... '#puebleando': 1})
    # Join in Tc (Tweet count per city)
    location_word_freq_tc = location_word_freqs.join(location_count)
    # ('El Oro, México', (frequency_dict, Tc))

    # This gives all the weights of Tcw1/Tc * Tcw2/Tc * ... * Tcwn/Tc
    location_prob_intermediate = location_word_freq_tc.map(lambda row: (
        (row[0]),
        (multiply_list([get_or_zero(row[1][0], word)
                        for word in input_text], row[1][1])), row[1][1]))

    # Calculate final probability: (Tc / T) * product over i of (Tc,w_i / Tc)
    location_prob = location_prob_intermediate.map(
        lambda row: (row[0], (row[2] / tweet_count) * row[1]))

    # Get top results
    top_results = location_prob.takeOrdered(2, key=lambda row: -row[1])

    # Print results that fit the requirements
    if not top_results:
        # There are no matching cities
        top_results = [['', '']]
    elif len(top_results) < 2 or top_results[0][1] != top_results[1][1]:
        # Only one candidate, or the top result is not tied with second place:
        # keep the top result only.
        top_results = [top_results[0]]
    if top_results[0][1] == 0.0:
        # Top result is zero, replace with empty string
        top_results = [['', '']]

    with open(output, 'w+') as out:
        # Print result to file
        for line in top_results:
            out.write(f'{line[0]}\t{line[1]}\t')
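
main() relies on two helpers, get_or_zero and multiply_list, that are defined elsewhere in the module and not shown here. Judging only from how they are called (and from the Tc,w/Tc comment above), plausible definitions might look like the following; this is an illustrative assumption, not the author's code:

def get_or_zero(freqs, word):
    # Per-city frequency of `word`, or 0 if the word never occurred there.
    return freqs.get(word, 0)


def multiply_list(frequencies, tc):
    # Product of Tc,w_i / Tc over all words of the input text.
    product = 1.0
    for f in frequencies:
        product *= f / tc
    return product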
Example #6
'''

Week 3 word count example.

By Paul Wu [email protected]

'''

from __future__ import print_function
import sys
import re

from util import spark, sc
import time
start_time = time.time()

text = sc.textFile("../../data/war_and_peace-300.txt")
words = text.flatMap(lambda line: re.split(r'\W+', line.lower()))
word_pairs = words.map(lambda word: (word, 1))
counts = word_pairs.reduceByKey(lambda value1, value2: value1 + value2)
counts = counts.sortBy(lambda pair: -pair[1])
counts.foreach(print)

sc.stop()
print("--- %s seconds ---" % (time.time() - start_time))