from operator import add from pyspark import SparkContext from csv import reader from myUtils import readFiles, getSomeFileNames, getAllFileNames def checkPassengerCountValid(passenger_count): try: int(passenger_count) if int(passenger_count) >= 0 and int(passenger_count) < 10: return "Valid" else: return "Invalid_not_within_range_" + passenger_count except ValueError: return "Invalid_not_integer_" + passenger_count if __name__ == "__main__": sc = SparkContext() #(taxi_data,prefix) = readFiles(['data/yellow_tripdata_2016-01.csv','data/yellow_tripdata_2016-02.csv','data/yellow_tripdata_2016-03.csv','data/yellow_tripdata_2016-04.csv','data/yellow_tripdata_2016-05.csv','data/yellow_tripdata_2016-06.csv'],sc ) filenames = getAllFileNames() (taxi_data, prefix) = readFiles(filenames, sc) vendorID = taxi_data.map(lambda entry: (checkValid(entry[3]), 1) ).reduceByKey(lambda x, y: x + y) tabSeparated = vendorID.map(lambda x: x[0] + "\t" + str(x[1])) tabSeparated.saveAsTextFile("passenger_count_valid.out") sc.stop()
from __future__ import print_function import sys from operator import add from pyspark import SparkContext from csv import reader from myUtils import readFiles if __name__ == "__main__": sc = SparkContext() (taxi_data, prefix) = readFiles([ 'data/yellow_tripdata_2016-01.csv', 'data/yellow_tripdata_2016-02.csv', 'data/yellow_tripdata_2016-03.csv', 'data/yellow_tripdata_2016-04.csv', 'data/yellow_tripdata_2016-05.csv', 'data/yellow_tripdata_2016-06.csv' ], sc) vendorID = taxi_data.map(lambda entry: (entry[0], 1)).reduceByKey( lambda x, y: x + y) tabSeparated = vendorID.map(lambda x: x[0] + "\t" + str(x[1])) tabSeparated.saveAsTextFile(sys.argv[0].split('.')[0] + "_valid.out") sc.stop()