Example #1
    return result


# test_ts = generate_ts()
# test_query = generate_query()
#
# match_result = best_match_ts(test_query, test_ts)
#
# plt.figure(figsize=(15, 15))
# plt.plot(match_result['value'], label='match result')
# plt.plot(test_query, label='test query')
# plt.show()

# Prepare test ts and query
features_to_append = [0, 1, 2, 3, 4]
res_list, time_series_dict, global_min, global_max = generate_source(
    '2013e_001_2_channels_02backs.csv', features_to_append)

normalized_ts_dict = normalize_ts_with_min_max(time_series_dict, global_min,
                                               global_max)
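
# For reference, a minimal sketch of the min-max normalization that normalize_ts_with_min_max
# presumably performs (an assumption based on its name and arguments; the real project
# function may differ, e.g. in how it handles global_max == global_min):
def _minmax_normalize_sketch(ts_dict, g_min, g_max):
    span = (g_max - g_min) or 1  # guard against division by zero for constant data
    return {ts_id: [(v - g_min) / span for v in values]
            for ts_id, values in ts_dict.items()}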


def generate_query_v2(ts_dict, amount=5):
    query_results = []
    for i in range(amount):
        query_id, query_raw_ts = random.choice(list(ts_dict.items()))
        start = (i + 1) * 2
        end = numpy.random.randint(low=start + 5, high=len(query_raw_ts) // 2)
        query_sequence = query_raw_ts[start:end]

        query_results.append(query_sequence)

    return query_results
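
# Hypothetical usage, in the spirit of the commented-out test block above; each query is a
# random slice taken from one of the normalized source time series:
# test_queries = generate_query_v2(normalized_ts_dict, amount=3)
# for q in test_queries:
#     print('query length:', len(q))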
Example #2
                    gp_not_opened_error()
                else:
                    if len(args) != 2:  # if the wrong number of arguments is given
                        err_msg = FormattedText([
                            ('class:error',
                             'Wrong number of arguments, please specify the path to the data you wish to load'),
                        ])
                        print_formatted_text(err_msg, style=style)

                    elif not os.path.isfile(args[1]):
                        load_file_not_found_error(args[1])
                    else:

                        time_series_list, time_series_dict, global_min, global_max = generate_source(
                            args[1], features_to_append)
                        print("loaded file " + args[1])
                        print("Global Max is " + str(global_max))
                        print("Global Min is " + str(global_min))
                        normalized_ts_dict = normalize_ts_with_min_max(
                            time_series_dict, global_min, global_max)

                        # gp_project.save_time_series(time_series_dict, normalized_ts_dict, args[1])  # TODO include load history
                        try:
                            gp_project.load_time_series(
                                time_series_dict, normalized_ts_dict,
                                time_series_list)
                        except DuplicateIDError as e:
                            err_msg = FormattedText([
                                ('class:error',
                                 'Error: duplicate ID(s) found in existing time series and newly loaded time series, duplicate ID(s):'
Example #3
def main(args):
    file_path = args.input
    # './dataset/001-SART-August2017-MB.csv'
    Server_path = ['/usr/lib/jvm/java-1.8.0-openjdk-amd64',
                   './res/saved_dataset',
                   file_path
                   ]
    Yu_path = ['/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home',
               './res/saved_dataset',
               './dataset/001-SART-August2017-MB-50.csv']
    Leo_path = ['/Library/Java/JavaVirtualMachines/jdk1.8.0_151.jdk/Contents/Home',
                './res/saved_dataset',
                file_path]
    Yuncong_path = ['/Library/Java/JavaVirtualMachines/jdk1.8.0_161.jdk/Contents/Home',
                    './res/saved_dataset',
                    file_path]

    path = Server_path
    os.environ['JAVA_HOME'] = path[0]
    # create a spark job
    cores = args.cores
    st = args.st
    full_length = args.full_length
    sc = SparkContext('local[' + str(cores) + ']', "First App")

    # sc = SparkContext("local[4]", "First App")
    # st = 0.25
    new_path = re.match(r"(.*)\.csv", path[2]).group(1)
    path_save_res = path[1] + '/' + new_path + '_' + str(st)
    # if the result path already exists, reload the saved RDDs instead of recomputing them
    if os.path.isdir(path_save_res):
        group_rdd = sc.pickleFile(path_save_res + '/group/')
        cluster_rdd = sc.pickleFile(path_save_res + '/cluster/')
        global_dict_rdd = sc.pickleFile(path_save_res + '/dict/')
        # shutil.rmtree(path_save_res)
    else:
        # TODO
        file = path[2]
        # add test for commit
        features_to_append = [0, 1, 2, 3, 4]

        # res_list: list of raw time series data to be distributed
        # time_series_dict: a dictionary version of res_list, used for subsequence look-up
        res_list, time_series_dict, global_min, global_max = generate_source(file, features_to_append)
        print('processing dataset ' + path[2])
        print("Global Max is " + str(global_max))
        print("Global Min is " + str(global_min))

        normalized_ts_dict = normalize_ts_with_min_max(time_series_dict, global_min, global_max)

        # TODO
        # add clustering method after grouping

        # this broadcast object can be accessed from all nodes in computer cluster
        # in order to access the value this, just use val = global_dict.value
        # for future reading data
        # NOTE that the data being broadcasted is the minmax-normalized data
        global_dict = sc.broadcast(normalized_ts_dict)
        time_series_dict = sc.broadcast(time_series_dict)
        # max(flows, key=lambda k: len(flows[k]))
        # find the key of the time series with the greatest length
        # max_len_key = max(global_dict.value, key=lambda k: len(global_dict.value[k]))
        # max_length = len(global_dict.value[max_len_key])
        if full_length:
            grouping_range = (1, max([len(v) for v in global_dict.value.values()]))

        else:
            grouping_range = (89, 90)
        #         grouping_range = (1, length)

        global_dict_rdd = sc.parallelize(res_list[1:], numSlices=16)
        global_dict_rdd.saveAsPickleFile(path_save_res + '/dict/')

        # global_dict_res = global_dict_rdd.collect()
        # finish grouping here, result in a key, value pair where
        # key is the length of sub-sequence, value is the [id of source time series, start_point, end_point]
        # res_rdd = global_dict_rdd.flatMap(lambda x: get_all_subsquences(x)).collect()

        # get_subsquences(x, low, high) collects the subsequences of x whose lengths fall within [low, high]

        """
        ##### group
        group_rdd_res: list: items = (length, time series list) -> time series list: items = (id, start, end)
        """
        # TODO: make saving the grouping result optional
        group_start_time = time.time()
        group_rdd = global_dict_rdd.flatMap(lambda x: get_subsquences(x, grouping_range[0], grouping_range[1])).map(
            lambda x: (x[0], [x[1:]])).reduceByKey(
            lambda a, b: a + b)
        group_rdd.saveAsPickleFile(path_save_res + '/group/')
        group_end_time = time.time()
        print('group of timeseries from ' + str(grouping_range[0]) + ' to ' + str(grouping_range[1]) + ' using ' + str(
            group_end_time - group_start_time) + ' seconds')
        # group_rdd_res = group_rdd.collect()
        print("grouping done, saved to dataset")

        """
        ##### cluster

        The following code is for testing clustering operation. Cluster one group without using RDD
        4/15/19
        # print("Test clustering")
        # group_res = group_rdd.collect()
        # cluster(group_res[1][1], group_res[1][0], st, global_dict.value)  # testing group with length of 9
        """

        # print("Test clustering")
        # group_res = group_rdd.collect()
        # # cluster_two_pass(group_res[1][1], group_res[1][0], st, global_dict.value)  # testing group with length of 9
        # cluster(group_res[1][1], group_res[1][0], st, global_dict.value)  # testing group with length of 9

        print("Working on clustering")
        cluster_start_time = time.time()
        cluster_rdd = group_rdd.map(lambda x: cluster(x[1], x[0], st, global_dict.value))

        cluster_rdd.saveAsPickleFile(path_save_res + '/cluster/')  # save all the cluster to the hard drive
        cluster_rdd_reload = sc.pickleFile(path_save_res + '/cluster/').collect()  # reload all the clusters into memory
        # first_dict = cluster_rdd_reload[0]
        cluster_end_time = time.time()

        print('clustering of timeseries from ' + str(grouping_range[0]) + ' to ' + str(
            grouping_range[1]) + ' using ' + str(cluster_end_time - cluster_start_time) + ' seconds')

        print("clustering done, saved to dataset")

        # plot all the clusters
        # plot_cluster(cluster_rdd_reload, 2, time_series_dict, 5)

        """
            ##### query
            Current implementation: to find the k best matches, we return the first k best matches within the given sequence length range


            The following line is for testing querying on one cluster
            # query_result = query(query_sequence, cluster_rdd_reload[0], k, time_series_dict.value)

        """



        # print("Using Twopass")
        # total_cluster_count = 0
        # for cluster_dic in cluster_rdd.collect():
        #
        #     representative, cluster_subsequences = random.choice(list(cluster_dic.items()))
        #
        #     cluster_length = representative.get_length()
        #     total_cluster_count = total_cluster_count + len(cluster_dic.keys())
        #
        #     print("length " + str(cluster_length) + " has cluster count of " + str(len(cluster_dic.keys())))
        # print("Total cluster count is: " + str(total_cluster_count))

        # # '(001-SART-August2017-MB)_(211-Current-Item:-3)_(A-DC1)_(64434.0)_(105950.0)'
        # '(2013e_001)_(100-0-Back)_(B-DC8)_(232665953.1250)'
        query_id = '(001-SART-August2017-MB)_(211-Current-Item:-3)_(A-DC1)_(64434.0)_(105950.0)'
        query_sequence = get_data(query_id, 24, 117, time_series_dict.value)  # get an example query
        filter_rdd = cluster_rdd.filter(lambda x: exclude_same_id(x, query_id))
        # raise exception if the query_range exceeds the grouping range
        querying_range = (90, 91)
        k = 5  # looking for k best matches
        if querying_range[0] < grouping_range[0] or querying_range[1] > grouping_range[1]:
            raise Exception("query_operations: query: Query range does not match group range")

        query_result = cluster_rdd.filter(lambda x: x).map(
            lambda clusters: query(query_sequence, querying_range, clusters, k, time_series_dict.value)).collect()
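        # note: the unfiltered query_result above is immediately overwritten below by the
        # filtered query, which excludes clusters that come from the query's own time series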
        exclude_overlapping = True
        query_result = filter_rdd.map(
            lambda clusters: query(query_sequence, querying_range, clusters, k, time_series_dict.value,
                                   exclude_overlapping,
                                   0.5)).collect()

        plot_query_result(query_sequence, query_result, time_series_dict.value)

    sc.stop()
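
# main() reads args.input, args.cores, args.st and args.full_length, but this example does not
# show how it is invoked. The following is a minimal sketch of a matching argparse entry point;
# the option names mirror the attributes used above, while the defaults and help strings are
# illustrative assumptions rather than part of the original project.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Group, cluster and query time series with Spark')
    parser.add_argument('--input', required=True, help='path to the source csv file')
    parser.add_argument('--cores', type=int, default=4, help='number of local Spark cores')
    parser.add_argument('--st', type=float, default=0.25, help='similarity threshold used when clustering')
    parser.add_argument('--full_length', action='store_true', help='group over the full subsequence length range')
    main(parser.parse_args())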
Example #4
                        if ts_list is not None and ts_dict is not None and normalized_ts_dict is not None:

                            is_Update_pre_infor = prompt(
                                "Project " + args[1] +
                                " pre-processing information exists, would you like to update it? [y/n]"
                            )

                            if is_Update_pre_infor == 'y':
                                # regenerate the pre-processing information for this project
                                update_pre_processing = True
                                print("updating pre-processing information for " +
                                      args[1])

                        # change here
                        if update_pre_processing or not ts_list or not ts_dict or not normalized_ts_dict:
                            ts_list, global_min, global_max = generate_source(
                                args[1], features_to_append)

                            print("loaded file " + args[1])
                            print("Global Max is " + str(global_max))
                            print("Global Min is " + str(global_min))

                            # get a normalized version of the time series
                            norm_ts_list = normalize_ts_with_min_max(
                                ts_list, global_min, global_max)

                            global_norm_list = sc.parallelize(norm_ts_list)

            elif args[0] == 'save':  # TODO save changes to the GenexPlusProject pickle file
                path_to_save = SAVED_DATASET_DIR + os.sep + gp_project.get_project_name()
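                # Hypothetical continuation of the TODO above (assumption: the project object is
                # serialized with the standard pickle module; the '.pkl' suffix is illustrative):
                # import pickle
                # with open(path_to_save + '.pkl', 'wb') as f:
                #     pickle.dump(gp_project, f)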