def main():
    """ Application main.

    Build a grid of k-fold configurations in which only the time scale
    varies, evaluate every configuration against each partition file on
    Spark, and save one text report per partition under "results/".
    """
    fold_counts = [10]
    neighbor_counts = [3]
    powers = [4.5]
    alphas = [0.75]
    bag_counts = [3]

    # The time scale is the single parameter being tuned here: 24 fine
    # steps of 0.001 followed by 80 coarser steps of 0.025.
    time_scales = ([0.001 * step for step in range(1, 25)] +
                   [0.025 * step for step in range(1, 81)])

    # Every combination of the option lists above becomes one KFoldConf.
    # NOTE(review): the fourth KFoldConf argument is always None here; its
    # meaning is not visible from this function.
    conf_list = [
        kfold.KFoldConf(folds, neighbors, power, None, scale, alpha, bags)
        for folds in fold_counts
        for neighbors in neighbor_counts
        for power in powers
        for scale in time_scales
        for alpha in alphas
        for bags in bag_counts
    ]

    # 104 configurations spread over 104 slices, cached for reuse across
    # the partition files.
    conf_rdd = SC.parallelize(conf_list, 104).cache()

    partition_files = [
        'partitions/monthly_ozone_1990-2015_partition.csv',
        'partitions/monthly_pm25_1990-2015_partition.csv'
    ]

    # Keyword looked for in the file name -> directory the report goes to.
    # Checked in this order; the first keyword found wins.
    output_dirs = (
        ('no2', 'results/no2_max_results'),
        ('ozone', 'results/ozone_max_results'),
        ('pm25', 'results/pm25_max_results'),
    )

    for file_name in partition_files:
        # The loader is shared with the "point" module.
        point_list = point.load_point_file(file_name)
        point_list_brd = SC.broadcast(point_list)

        def fold(conf):
            """ Score one configuration against the broadcast points. """
            mare_stat = kfold.mare(conf, point_list_brd)
            rmspe_stat = kfold.rmspe(conf, point_list_brd)
            return (conf, mare_stat, rmspe_stat)

        scored_rdd = conf_rdd.map(fold)
        report_rdd = scored_rdd.map(report)

        # The output directory depends only on the pollutant named in the
        # file, not on the position of the file in the list.
        for keyword, out_dir in output_dirs:
            if keyword in file_name:
                report_rdd.saveAsTextFile(out_dir)
                break
        else:
            # Unrecognised partition file: abort with a failure status.
            import sys
            sys.exit(1)
def main():
    """ Application main.

    Sweep a grid of k-fold configurations (neighbors x powers x time
    scales) over three data partitions on Spark, save one text report per
    partition under "results/", then merge those reports into a single
    local "results.csv" file.
    """
    # Number of partitions analysed; used for both the learning loop and
    # the final merge so the two cannot disagree.
    partition_count = 3
    # Single template for writing and re-reading the per-partition reports,
    # so the write path and the read path cannot drift apart.
    result_dir = 'results/partition%02d'

    K = [10]                                   # folds
    N = [3, 4, 5, 6, 7, 8]                     # neighbors
    P = [1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]    # powers
    C = [0.001 * i for i in range(1, 25)]      # time_scales
    C.extend([0.025 * i for i in range(1, 81)])

    # Build the list of KFoldConf objects under analysis.
    # NOTE(review): the fourth KFoldConf argument is always None here; its
    # meaning is not visible from this function.
    conf_list = [kfold.KFoldConf(k, n, p, None, c)
                 for k in K
                 for n in N
                 for p in P
                 for c in C]

    # Tag each configuration with an incremental "conf_id" attribute.
    for conf_id, conf in enumerate(conf_list):
        conf.conf_id = conf_id

    conf_rdd = SC.parallelize(conf_list, 150).cache()

    # Load the radius table and broadcast it. Pickle data is a byte stream,
    # so the file must be opened in binary mode.
    with open('radius_table.pkl', 'rb') as f:
        radius_table = pickle.load(f)
    radius_table_brd = SC.broadcast(radius_table)

    # Run the learning tasks for each partition.
    for partition_id in range(partition_count):
        point_list = load_partition(partition_id)
        point_list_brd = SC.broadcast(point_list)

        def fold(conf):
            """ Return a result tuple for the given configuration. """
            mare_stat = kfold.mare(conf, point_list_brd, radius_table_brd)
            rmspe_stat = kfold.rmspe(conf, point_list_brd, radius_table_brd)
            return (partition_id, conf, mare_stat, rmspe_stat)

        # saveAsTextFile runs the job inside this iteration, so `fold` is
        # evaluated with the current partition_id and broadcast.
        report_rdd = conf_rdd.map(fold).map(report)
        report_rdd.saveAsTextFile(result_dir % partition_id)

    # Collect all per-partition results into one RDD, in partition order.
    result_rdds = [SC.textFile(result_dir % partition_id + '/')
                   for partition_id in range(partition_count)]
    merged_rdd = result_rdds[0]
    for result_rdd in result_rdds[1:]:
        merged_rdd = merged_rdd.union(result_rdd)
    results = merged_rdd.collect()

    # Write every collected line to a single local file.
    with open('results.csv', 'w') as output:
        output.writelines(line + '\n' for line in results)
def main():
    """ Application main.

    Evaluate a grid of 10-fold configurations (every N value paired with
    every P value) against the shuffled points loaded from the 2009 PM2.5
    file, and save the resulting report to HDFS.
    """
    n_options = [3, 4, 5, 6, 7]
    p_options = [1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]

    # One configuration per (n, p) pair; the remaining arguments are fixed.
    configurations = []
    for n_value in n_options:
        for p_value in p_options:
            configurations.append(
                kfold.KFoldConf(10, n_value, p_value, None, 0.1086))

    # 5 x 9 = 45 configurations, distributed over 45 slices.
    conf_rdd = SC.parallelize(configurations, 45).cache()

    point_list = point.load_pm25_file('../../data/pm25_2009_measured.csv')
    random.shuffle(point_list)
    # For a quick local trial run, the list was truncated at this point to
    # its first 250 entries before broadcasting.
    point_list_brd = SC.broadcast(point_list)

    def fold(conf):
        """ Return (conf, MARE, RMSPE) for the given configuration. """
        mare_stat = kfold.mare(conf, point_list_brd)
        rmspe_stat = kfold.rmspe(conf, point_list_brd)
        return (conf, mare_stat, rmspe_stat)

    scored_rdd = conf_rdd.map(fold)
    report_rdd = scored_rdd.map(report)
    report_rdd.saveAsTextFile(
        "hdfs:///user/jf00936/aeolus/experiment_01/results")