    return result


# test_ts = generate_ts()
# test_query = generate_query()
#
# match_result = best_match_ts(test_query, test_ts)
#
# plt.figure(figsize=(15, 15))
# plt.plot(match_result['value'], label='match result')
# plt.plot(test_query, label='test query')
# plt.show()

# Prepare test ts and query
features_to_append = [0, 1, 2, 3, 4]
res_list, time_series_dict, global_min, global_max = generate_source(
    '2013e_001_2_channels_02backs.csv', features_to_append)
normalized_ts_dict = normalize_ts_with_min_max(time_series_dict, global_min, global_max)


def generate_query_v2(ts_dict, amount=5):
    query_results = []
    for i in range(amount):
        query_id, query_raw_ts = random.choice(list(ts_dict.items()))
        start = (i + 1) * 2
        query_sequence = query_raw_ts[start:numpy.random.randint(
            low=start + 5, high=len(query_raw_ts) // 2)]
        query_results.append(query_sequence)
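
# A minimal usage sketch, not part of the original pipeline: it assumes that
# generate_query_v2 ultimately returns query_results and that normalized_ts_dict
# (built above) is in scope; it only inspects the randomly extracted query sequences.
# test_queries = generate_query_v2(normalized_ts_dict, amount=3)
# for q in test_queries:
#     print('query length: ' + str(len(q)))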
        gp_not_opened_error()
    else:
        if len(args) != 2:  # wrong number of arguments given
            err_msg = FormattedText([
                ('class:error',
                 'Wrong number of arguments, please specify the path to the data you wish to load'),
            ])
            print_formatted_text(err_msg, style=style)
        elif not os.path.isfile(args[1]):
            load_file_not_found_error(args[1])
        else:
            time_series_list, time_series_dict, global_min, global_max = generate_source(
                args[1], features_to_append)
            print("loaded file " + args[1])
            print("Global Max is " + str(global_max))
            print("Global Min is " + str(global_min))
            normalized_ts_dict = normalize_ts_with_min_max(
                time_series_dict, global_min, global_max)
            # gp_project.save_time_series(time_series_dict, normalized_ts_dict, args[1])
            # TODO include load history
            try:
                gp_project.load_time_series(
                    time_series_dict, normalized_ts_dict, time_series_list)
            except DuplicateIDError as e:
                err_msg = FormattedText([
                    ('class:error',
                     'Error: duplicate ID(s) found in existing time series and newly loaded time series, duplicate ID(s):'
def main(args):
    file_path = args.input  # './dataset/001-SART-August2017-MB.csv'
    Server_path = ['/usr/lib/jvm/java-1.8.0-openjdk-amd64', './res/saved_dataset', file_path]
    Yu_path = ['/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home', './res/saved_dataset',
               './dataset/001-SART-August2017-MB-50.csv']
    Leo_path = ['/Library/Java/JavaVirtualMachines/jdk1.8.0_151.jdk/Contents/Home', './res/saved_dataset', file_path]
    Yuncong_path = ['/Library/Java/JavaVirtualMachines/jdk1.8.0_161.jdk/Contents/Home', './res/saved_dataset', file_path]

    path = Server_path
    os.environ['JAVA_HOME'] = path[0]

    # create a Spark job
    cores = args.cores
    st = args.st
    full_length = args.full_length
    sc = SparkContext('local[' + str(cores) + ']', "First App")
    # sc = SparkContext("local[4]", "First App")
    # st = 0.25

    new_path = re.match(r"(.*)\.csv", path[2]).group(1)
    path_save_res = path[1] + '/' + new_path + '_' + str(st)

    # if the result path already exists, reload the saved RDDs instead of recomputing
    if os.path.isdir(path_save_res):
        group_rdd = sc.pickleFile(path_save_res + '/group/')
        cluster_rdd = sc.pickleFile(path_save_res + '/cluster/')
        global_dict_rdd = sc.pickleFile(path_save_res + '/dict/')
        # shutil.rmtree(path_save_res)
    else:
        # TODO
        file = path[2]
        # add test for commit
        features_to_append = [0, 1, 2, 3, 4]
        # res_list: list of raw time series data to be distributed
        # time_series_dict: a dictionary version of res_list, used for subsequence look-up
        res_list, time_series_dict, global_min, global_max = generate_source(file, features_to_append)
        print('processing dataset ' + path[2])
        print("Global Max is " + str(global_max))
        print("Global Min is " + str(global_min))

        normalized_ts_dict = normalize_ts_with_min_max(time_series_dict, global_min, global_max)

        # TODO add clustering method after grouping
        # This broadcast object can be accessed from all nodes in the computer cluster;
        # to read the value, just use val = global_dict.value.
        # NOTE that the data being broadcast is the min-max-normalized data.
        global_dict = sc.broadcast(normalized_ts_dict)
        time_series_dict = sc.broadcast(time_series_dict)
        # max(flows, key=lambda k: len(flows[k]))  # find the key with the largest length
        # max_len_key = max(global_dict.value, key=lambda k: len(global_dict.value[k]))
        # max_length = len(global_dict.value[max_len_key])

        if full_length:
            grouping_range = (1, max([len(v) for v in global_dict.value.values()]))
        else:
            grouping_range = (89, 90)
        # grouping_range = (1, length)

        global_dict_rdd = sc.parallelize(res_list[1:], numSlices=16)
        global_dict_rdd.saveAsPickleFile(path_save_res + '/dict/')
        # global_dict_res = global_dict_rdd.collect()

        # Grouping is done here; the result is a (key, value) pair where
        # key is the length of the sub-sequence and
        # value is [id of source time series, start_point, end_point].
        # res_rdd = global_dict_rdd.flatMap(lambda x: get_all_subsquences(x)).collect()
        # In get_subsquences(x, 100, 110): we are grouping subsequences that are of length 100 to 110

        """
        ##### group
        group_rdd_res: list of items (length, time series list),
        where each time series list item is (id, start, end)
        """
        # add save option or not
        group_start_time = time.time()
        group_rdd = global_dict_rdd.flatMap(
            lambda x: get_subsquences(x, grouping_range[0], grouping_range[1])).map(
            lambda x: (x[0], [x[1:]])).reduceByKey(lambda a, b: a + b)
        group_rdd.saveAsPickleFile(path_save_res + '/group/')
        group_end_time = time.time()
        print('group of timeseries from ' + str(grouping_range[0]) + ' to ' + str(grouping_range[1]) +
              ' using ' + str(group_end_time - group_start_time) + ' seconds')
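
        # Illustration only (hedged sketch, not part of the pipeline): the grouping above
        # assumes get_subsquences emits tuples of the form (length, id, start, end);
        # the map/reduceByKey pair then turns them into (length, [(id, start, end), ...]).
        # A toy equivalent on hand-made tuples (output order may vary):
        # toy = sc.parallelize([(2, 'ts1', 0, 1), (2, 'ts2', 3, 4), (3, 'ts1', 0, 2)])
        # grouped = toy.map(lambda x: (x[0], [x[1:]])).reduceByKey(lambda a, b: a + b)
        # print(grouped.collect())  # e.g. [(2, [('ts1', 0, 1), ('ts2', 3, 4)]), (3, [('ts1', 0, 2)])]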
        # group_rdd_res = group_rdd.collect()
        print("grouping done, saved to dataset")

        """
        ##### cluster
        The following code is for testing the clustering operation:
        cluster one group without using RDDs (4/15/19).
        # print("Test clustering")
        # group_res = group_rdd.collect()
        # cluster(group_res[1][1], group_res[1][0], st, global_dict.value)  # testing group with length of 9
        """
        # print("Test clustering")
        # group_res = group_rdd.collect()
        # # cluster_two_pass(group_res[1][1], group_res[1][0], st, global_dict.value)  # testing group with length of 9
        # cluster(group_res[1][1], group_res[1][0], st, global_dict.value)  # testing group with length of 9

        print("Working on clustering")
        cluster_start_time = time.time()
        cluster_rdd = group_rdd.map(lambda x: cluster(x[1], x[0], st, global_dict.value))
        cluster_rdd.saveAsPickleFile(path_save_res + '/cluster/')  # save all the clusters to disk
        cluster_rdd_reload = sc.pickleFile(path_save_res + '/cluster/').collect()  # reload all the clusters into memory
        # first_dict = cluster_rdd_reload[0]
        cluster_end_time = time.time()
        print('clustering of timeseries from ' + str(grouping_range[0]) + ' to ' + str(grouping_range[1]) +
              ' using ' + str(cluster_end_time - cluster_start_time) + ' seconds')
        print("clustering done, saved to dataset")

        # plot all the clusters
        # plot_cluster(cluster_rdd_reload, 2, time_series_dict, 5)

        """
        ##### query
        Current implementation: to find the k best matches, we return the first k best matches
        for the given sequence length range.
        The following line is for testing querying on one cluster:
        # query_result = query(query_sequence, cluster_rdd_reload[0], k, time_series_dict.value)
        """
        # print("Using Twopass")
        # total_cluster_count = 0
        # for cluster_dic in cluster_rdd.collect():
        #     # representative, cluster_subsequences = random.choice(list(cluster_dic.items()))
        #     # cluster_length = representative.get_length()
        #     total_cluster_count = total_cluster_count + len(cluster_dic.keys())
        #     # print("length " + str(cluster_length) + " has cluster count of " + str(len(cluster_dic.keys())))
        # print("Total cluster count is: " + str(total_cluster_count))

        # example query IDs:
        # '(001-SART-August2017-MB)_(211-Current-Item:-3)_(A-DC1)_(64434.0)_(105950.0)'
        # '(2013e_001)_(100-0-Back)_(B-DC8)_(232665953.1250)'
        query_id = '(001-SART-August2017-MB)_(211-Current-Item:-3)_(A-DC1)_(64434.0)_(105950.0)'
        query_sequence = get_data(query_id, 24, 117, time_series_dict.value)  # get an example query
        filter_rdd = cluster_rdd.filter(lambda x: exclude_same_id(x, query_id))

        # raise an exception if the query range exceeds the grouping range
        querying_range = (90, 91)
        k = 5  # looking for k best matches
        if querying_range[0] < grouping_range[0] or querying_range[1] > grouping_range[1]:
            raise Exception("query_operations: query: Query range does not match group range")

        query_result = cluster_rdd.filter(lambda x: x).map(
            lambda clusters: query(query_sequence, querying_range, clusters, k, time_series_dict.value)).collect()

        exclude_overlapping = True
        query_result = filter_rdd.map(
            lambda clusters: query(query_sequence, querying_range, clusters, k, time_series_dict.value,
                                   exclude_overlapping, 0.5)).collect()

        plot_query_result(query_sequence, query_result, time_series_dict.value)

    sc.stop()
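
    # Hedged sanity-check sketch (not the project's query algorithm): a brute-force
    # scan that compares the query against every same-length window of one raw series
    # under a plain Euclidean distance. The distance measure and the data shapes
    # (lists of numbers keyed by id) are assumptions made for illustration only;
    # the real query() may use a different similarity measure.
    # def brute_force_best_match(query_seq, raw_ts):
    #     best_start, best_dist = None, float('inf')
    #     for s in range(len(raw_ts) - len(query_seq) + 1):
    #         window = raw_ts[s:s + len(query_seq)]
    #         dist = sum((a - b) ** 2 for a, b in zip(query_seq, window)) ** 0.5
    #         if dist < best_dist:
    #             best_start, best_dist = s, dist
    #     return best_start, best_dist
    # print(brute_force_best_match(query_sequence, time_series_dict.value[query_id]))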
        if ts_list is not None and ts_dict is not None and normalized_ts_dict is not None:
            is_Update_pre_infor = prompt(
                "Project " + args[1] + " pre-processing information exists, would you like to update it? [y/n]")
            if is_Update_pre_infor == 'y':
                # regenerate the pre-processing information for this project
                update_pre_processing = True
                print("update pre-processing information " + args[1])

        # change here
        if update_pre_processing or not ts_list or not ts_dict or not normalized_ts_dict:
            ts_list, global_min, global_max = generate_source(
                args[1], features_to_append)
            print("loaded file " + args[1])
            print("Global Max is " + str(global_max))
            print("Global Min is " + str(global_min))
            # get a normalized version of the time series
            norm_ts_list = normalize_ts_with_min_max(
                ts_list, global_min, global_max)
            global_norm_list = sc.parallelize(norm_ts_list)

    elif args[0] == 'save':  # TODO save changes to the GenexPlusProject pickle file
        path_to_save = SAVED_DATASET_DIR + os.sep + gp_project.get_project_name()
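
        # Hedged sketch for the TODO above, not the project's actual save logic:
        # one plausible way to persist the project is Python's pickle module,
        # assuming gp_project is picklable, SAVED_DATASET_DIR already exists,
        # and a '.pkl' suffix is acceptable for the saved file.
        # import pickle
        # with open(path_to_save + '.pkl', 'wb') as f:
        #     pickle.dump(gp_project, f)
        # print("saved project to " + path_to_save + '.pkl')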