def aggregate(rdd, op=None):
    # Collapse repeated samples per (experiment, second) key, using the
    # supplied reduction if one is given, otherwise averaging them.
    rdd = rdd.reduceByKey(op) if op else average_by_key(rdd)
    # ((experiment, second), value) -> (second, value)
    rdd = rdd.map(lambda kv: (kv[0][1], kv[1]))
    # Average across experiments for each second.
    rdd = average_by_key(rdd)
    rdd = rdd.map(lambda kv: (kv[0], int(kv[1])))
    rdd = rdd.sortByKey()
    data = rdd.collect()
    return zip(*data)
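# `average_by_key` is used by every function in this section but not shown;
# a minimal sketch, assuming it maps each key to the arithmetic mean of its
# values (the real helper may differ):
def average_by_key(rdd):
    # Pair each value with a count of 1, sum both per key, then divide.
    totals = rdd.mapValues(lambda v: (v, 1.0))
    totals = totals.reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
    return totals.mapValues(lambda t: t[0] / t[1])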
def process(sc, series_dir):
    log_paths = fetch_log_paths(series_dir)
    rdd = sc.parallelize(log_paths)
    # Each log file expands into ((experiment, measurement), value) records.
    rdd = rdd.flatMap(parse_log_file)
    # Average repeated samples per (experiment, measurement) key.
    rdd = average_by_key(rdd)
    # ((experiment, measurement), value) -> (measurement, value)
    rdd = rdd.map(lambda kv: (kv[0][1], kv[1]))
    # Average across experiments for each measurement.
    rdd = average_by_key(rdd)
    rdd = rdd.sortByKey()
    return zip(*rdd.collect())
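# `fetch_log_paths` and `parse_log_file` are likewise assumed; the stand-ins
# below are hypothetical, for illustration only. Note the key shape that
# parse_log_file yields varies per script: (experiment, measurement) here,
# (experiment, second) for the aggregate variants, and
# (nodes_count, experiment, measurement) in the last script.
import os

def fetch_log_paths(logs_dir):
    # List every file in the directory so Spark can distribute the parsing.
    return [os.path.join(logs_dir, name) for name in sorted(os.listdir(logs_dir))]

def parse_log_file(path):
    # Hypothetical format: one "experiment measurement value" triple per line.
    with open(path) as handle:
        for line in handle:
            experiment, measurement, value = line.split()
            yield ((experiment, int(measurement)), float(value))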
def aggregate(rdd):
    # Discard sentinel readings of -10000 and below, then keep the minimum
    # sample per (experiment, second) key.
    rdd = rdd.filter(lambda kv: kv[1] > -10000)
    rdd = rdd.reduceByKey(min)
    # ((experiment, second), value) -> (second, value)
    rdd = rdd.map(lambda kv: (kv[0][1], kv[1]))
    # Average across experiments for each second.
    rdd = average_by_key(rdd)
    rdd = rdd.sortByKey()
    data = rdd.collect()
    return zip(*data)
from operator import add

def aggregate(rdd):
    # Score each sample as 1 if its value is below 10, else 0, then count
    # the hits per (experiment, second) key.
    rdd = rdd.map(lambda kv: (kv[0], 1 if kv[1] < 10 else 0))
    rdd = rdd.reduceByKey(add)
    # ((experiment, second), count) -> (second, count)
    rdd = rdd.map(lambda kv: (kv[0][1], kv[1]))
    # Average the counts across experiments for each second.
    rdd = average_by_key(rdd)
    rdd = rdd.map(lambda kv: (kv[0], int(kv[1])))
    rdd = rdd.sortByKey()
    data = rdd.collect()
    return zip(*data)
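# How an aggregate variant might be wired into a pipeline, mirroring the
# `process` functions; `process_counts` is a hypothetical driver, and
# parse_log_file would need to yield ((experiment, second), value) records:
def process_counts(sc, logs_dir):
    rdd = sc.parallelize(fetch_log_paths(logs_dir))
    rdd = rdd.flatMap(parse_log_file)
    return aggregate(rdd)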
from operator import add

def process(sc, logs_dir):
    log_paths = fetch_log_paths(logs_dir)
    rdd = sc.parallelize(log_paths)
    # Each record is ((nodes_count, experiment, measurement), value).
    rdd = rdd.flatMap(parse_log_file)
    # Restrict to the steady-state window of measurements between 50 and 250.
    rdd = rdd.filter(lambda kv: 50 < kv[0][2] < 250)
    # Re-key on (nodes_count, experiment) and sum values over the window.
    rdd = rdd.map(lambda kv: ((kv[0][0], kv[0][1]), kv[1]))
    rdd = rdd.reduceByKey(add)
    # Divide by the 200-second window to get a per-second rate, then
    # average across experiments for each cluster size.
    rdd = rdd.map(lambda kv: (kv[0][0], kv[1] / 200.0))
    rdd = average_by_key(rdd)
    rdd = rdd.sortByKey()
    return zip(*rdd.collect())
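# A sketch of how these entry points might be driven; the app name and
# logs directory are placeholders:
from pyspark import SparkContext

if __name__ == '__main__':
    sc = SparkContext(appName='log-aggregation')
    nodes, rates = process(sc, 'logs/')
    sc.stop()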