from pyspark import SparkContext
from pyspark.sql import HiveContext, Row

included_python_files_package = ['/var/machine_learning/stocks/python/stocks_python.zip']
mysql_url = "jdbc:mysql://10.211.55.4:3306/stocks?user=parallels&password=dellc123"
data_files = "file:///var/data/stocks/historical_data/*.csv"

# spark_url and spark_context_name are assumed to be defined earlier in the script
sc = SparkContext(spark_url, spark_context_name, pyFiles=included_python_files_package)
sqlContext = HiveContext(sc)

from stockRdd import StockRdd
from dateInterval import DateInterval, DateIntervalManager

sample_data_rdd = sc.textFile(data_files).distinct()

yesterday_date = DateInterval.getYesterdayDate()

dailyDateIntervalDictionaryToCalculateFor = DateIntervalManager.createDailyIntervalDictionaryForPastYear(yesterday_date)

number_of_days_in_dictionary = dailyDateIntervalDictionaryToCalculateFor.getNumberOfDaysInDictionary()
minimum_number_of_days = int((4.0 / 7.0) * float(number_of_days_in_dictionary))

mapStockCsvToKeyValueClosure = StockRdd.getMapStockCsvToKeyValueForDatesInDictionaryClosure(dailyDateIntervalDictionaryToCalculateFor)
symbol_creation_function_closure = StockRdd.getSymbolDataInstanceForDateDictionaryDataPointsClosure(dailyDateIntervalDictionaryToCalculateFor, yesterday_date)

# Build (symbol, data points) pairs, keep only symbols with enough history,
# and construct a SymbolData instance per surviving symbol
symbol_down_stocks_data_filtered = sample_data_rdd.map(mapStockCsvToKeyValueClosure)\
    .filter(lambda line: line is not None)\
    .reduceByKey(lambda a, b: a + b)\
    .map(lambda symbol_and_points: (symbol_and_points[0], StockRdd.sort_and_compute_deltas(list(symbol_and_points[1]))))\
    .filter(lambda symbol_and_points: len(list(symbol_and_points[1])) > minimum_number_of_days)\
    .map(symbol_creation_function_closure)\
    .filter(lambda symbol_and_instance: symbol_and_instance[1].getTodayPrice() is not None)
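The pipeline leans on function closures so that the date-interval dictionary is captured once on the driver and shipped to the workers inside the mapper. As a minimal sketch of that pattern (not the actual StockRdd implementation), assuming a simplified CSV layout of "symbol,date,close" and a hypothetical containsDate() helper on the dictionary:

# Minimal sketch of the closure pattern used above -- NOT the real StockRdd code.
# Assumes each CSV line looks like "symbol,date,close"; containsDate() is a
# hypothetical helper on the date-interval dictionary.
def get_map_stock_csv_to_key_value_closure(date_dictionary):
    def map_stock_csv_to_key_value(line):
        fields = line.split(',')
        if len(fields) < 3:
            return None
        symbol, date_string, close_price = fields[0], fields[1], fields[2]
        if not date_dictionary.containsDate(date_string):
            return None  # drop rows dated outside the intervals we care about
        # Key by symbol so reduceByKey can concatenate each symbol's data points
        return (symbol, [(date_string, float(close_price))])
    return map_stock_csv_to_key_value

Returning None for unwanted rows is what makes the `.filter(lambda line: line is not None)` step after the map necessary. The annotated listing below walks through the same setup step by step, this time pinned to a fixed date and a smaller Z*.csv sample.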
from pyspark import SparkContext
from pyspark.sql import HiveContext

from stockRdd import StockRdd
from dateInterval import DateIntervalManager

included_python_files_package = ['/var/machine_learning/stocks/python/stocks_python.zip']
mysql_url = "jdbc:mysql://localhost:3306/stocks?user=parallels&password=dellc123"
data_files = "file:///var/data/stocks/historical_data/Z*.csv"

# In a production environment this value would be generated dynamically from the current date
today_date = '2016-03-24'

# Instantiate the Spark Context to be used for this script
# (spark_url and spark_context_name are assumed to be defined earlier)
sc = SparkContext(spark_url, spark_context_name, pyFiles=included_python_files_package)
sqlContext = HiveContext(sc)

# Initialize the RDD with the stock data files
sample_data_rdd = sc.textFile(data_files).distinct()

# Create a dictionary of date intervals: 26 two-week spans covering the past year
dateDictionaryToCalculateFor = DateIntervalManager.createDateIntervalDictionaryForPastYear(today_date)

# We want to ensure that any stock being calculated existed during the entire period
number_of_days_in_dictionary = dateDictionaryToCalculateFor.getNumberOfDaysInDictionary()
minimum_number_of_days_for_stock = int((4.0 / 7.0) * float(number_of_days_in_dictionary))

# map_stock_csv_to_key_value_closure is a function closure that filters out lines of data
# whose dates fall outside the time frame we are concerned with
map_stock_csv_to_key_value_closure = StockRdd.getMapStockCsvToKeyValueForDatesInDictionaryClosure(dateDictionaryToCalculateFor)

# symbol_creation_function_closure is a function closure that converts a list of CSV data lines
# into a SymbolData object, which can return the data points we need to cluster a stock
symbol_creation_function_closure = StockRdd.getSymbolDataInstanceForDateDictionaryDataPointsClosure(dateDictionaryToCalculateFor, today_date)

# symbol_cluster_data_closure is a function closure that converts a SymbolData object into the
# list of data points a stock should be clustered by
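The listing breaks off after introducing symbol_cluster_data_closure, but to make its intent concrete, here is a hedged sketch of how per-symbol vectors could feed MLlib's k-means. The getDeltaPercentages() method, the k=5 choice, and the symbol_data_rdd name are illustrative assumptions, not the article's confirmed API; symbol_data_rdd stands in for the (symbol, SymbolData) pairs produced by a pipeline like the one in the first listing.

from pyspark.mllib.clustering import KMeans

# symbol_data_rdd is assumed to hold (symbol, SymbolData) pairs; getDeltaPercentages()
# is a hypothetical SymbolData method returning a fixed-length list of interval deltas.
symbol_vectors_rdd = symbol_data_rdd.map(lambda pair: (pair[0], pair[1].getDeltaPercentages()))

# Train k-means on the vectors alone; k=5 is an arbitrary illustrative choice.
kmeans_model = KMeans.train(symbol_vectors_rdd.map(lambda pair: pair[1]), k=5, maxIterations=25)

# Label each symbol with the cluster its vector falls into.
symbol_clusters = symbol_vectors_rdd.map(lambda pair: (pair[0], kmeans_model.predict(pair[1])))

Whatever the exact method names, the key design point is that every symbol must be reduced to a vector of identical length (one entry per date interval) before k-means can compare them.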