Example #1
File: genex.py  Project: yuli14/GenexPlus
import os

from pyspark import SparkConf, SparkContext

# helper functions such as get_global_min_max, normalize_ts_with_min_max,
# get_subsquences, cluster and exclude_same_id are defined in other GenexPlus modules


def genex(ts_dict, query, similarity_threshold):
    """
    make sure that all time series in the ts_dict are of the same length
    :param ts_dict:
    :param query:
    """
    javaHome_path = '/Library/Java/JavaVirtualMachines/jdk1.8.0_151.jdk/Contents/Home'

    os.environ['JAVA_HOME'] = javaHome_path

    conf = SparkConf().setAppName("GenexPlus").setMaster("local[*]")  # using all available cores
    sc = SparkContext(conf=conf)

    ts_len = 0
    # get the length of the longest ts
    for id, data in ts_dict.items():
        ts_len = max(ts_len, len(data))

    global_min, global_max = get_global_min_max(ts_dict)

    normalized_ts_dict = normalize_ts_with_min_max(ts_dict, global_min, global_max)

    global_dict = sc.broadcast(normalized_ts_dict)
    time_series_dict = sc.broadcast(ts_dict)

    # make the ts dict into a list so that we can parallelize the list
    ts_list = ts_dict_to_list(normalized_ts_dict)
    ts_list_rdd = sc.parallelize(ts_list[1:], numSlices=16)

    """
    grouping
    """
    group_rdd = ts_list_rdd.flatMap(lambda x: get_subsquences(x, 0, ts_len)).map(
        lambda x: (x[0], [x[1:]])).reduceByKey(
        lambda a, b: a + b)
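    # each element of group_rdd is expected to be a pair:
    #   (subsequence length, [(id, start, end), ...])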
    """
    clustering
    """
    cluster_rdd = group_rdd.map(lambda x: cluster(x[1], x[0], similarity_threshold, global_dict.value))
    cluster_result = cluster_rdd.collect()
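    # each element of cluster_result is expected to be a dict mapping a representative
    # subsequence to the list of subsequences clustered under it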

    """
    querying
    """
    # query_id is expected to be the id of the query sequence; it is not defined in this excerpt
    filter_rdd = cluster_rdd.filter(lambda x: exclude_same_id(x, query_id))

    print()
Example #2
import os
import re
import time

from pyspark import SparkContext


def main(args):
    file_path = args.input
    # './dataset/001-SART-August2017-MB.csv'
    Server_path = ['/usr/lib/jvm/java-1.8.0-openjdk-amd64',
                   './res/saved_dataset',
                   file_path
                   ]
    Yu_path = ['/Library/Java/JavaVirtualMachines/jdk1.8.0_171.jdk/Contents/Home',
               './res/saved_dataset',
               './dataset/001-SART-August2017-MB-50.csv']
    Leo_path = ['/Library/Java/JavaVirtualMachines/jdk1.8.0_151.jdk/Contents/Home',
                './res/saved_dataset',
                file_path]
    Yuncong_path = ['/Library/Java/JavaVirtualMachines/jdk1.8.0_161.jdk/Contents/Home',
                    './res/saved_dataset',
                    file_path]

    path = Server_path
    os.environ['JAVA_HOME'] = path[0]
    # create a spark job
    cores = args.cores
    st = args.st
    full_length = args.full_length
    sc = SparkContext('local[' + str(cores) + ']', "First App")

    # sc = SparkContext("local[4]", "First App")
    # st = 0.25
    new_path = re.match(r"(.*)\.csv", path[2]).group(1)
    path_save_res = path[1] + '/' + new_path + '_' + str(st)
    # if the saved results already exist, reload them instead of recomputing
    if os.path.isdir(path_save_res):
        group_rdd = sc.pickleFile(path_save_res + '/group/')
        cluster_rdd = sc.pickleFile(path_save_res + '/cluster/')
        global_dict_rdd = sc.pickleFile(path_save_res + '/dict/')
        # shutil.rmtree(path_save_res)
    else:
        # TODO
        file = path[2]
        # add test for commit
        features_to_append = [0, 1, 2, 3, 4]

        # res_list: list of raw time series data to be distributed
        # time_series_dict: a dictionary version of res_list, used for subsequence look-up
        res_list, time_series_dict, global_min, global_max = generate_source(file, features_to_append)
        print('processing dataset ' + path[2])
        print("Global Max is " + str(global_max))
        print("Global Min is " + str(global_min))

        normalized_ts_dict = normalize_ts_with_min_max(time_series_dict, global_min, global_max)

        # TODO
        # add clustering method after grouping

        # this broadcast object can be accessed from all nodes in the cluster;
        # to read the broadcast value, use val = global_dict.value
        # NOTE that the data being broadcast is the min-max normalized data
        global_dict = sc.broadcast(normalized_ts_dict)
        time_series_dict = sc.broadcast(time_series_dict)
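        # illustrative access to the broadcast values (some_id is a placeholder id):
        #   normalized_series = global_dict.value[some_id]
        #   raw_series = time_series_dict.value[some_id]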
        # max(flows, key=lambda k: len(flows[k]))
        # find the key of the time series with the largest length
        # max_len_key = max(global_dict.value, key=lambda k: len(global_dict.value[k]))
        # max_length = len(global_dict.value[max_len_key])
        if full_length:
            grouping_range = (1, max([len(v) for v in global_dict.value.values()]))

        else:
            grouping_range = (89, 90)
        #         grouping_range = (1, length)

        global_dict_rdd = sc.parallelize(res_list[1:], numSlices=16)
        global_dict_rdd.saveAsPickleFile(path_save_res + '/dict/')

        # global_dict_res = global_dict_rdd.collect()
        # finish grouping here, result in a key, value pair where
        # key is the length of sub-sequence, value is the [id of source time series, start_point, end_point]
        # res_rdd = global_dict_rdd.flatMap(lambda x: get_all_subsquences(x)).collect()

        # e.g. get_subsquences(x, 100, 110) groups subsequences of length 100 to 110

        """
        ##### group
        group_rdd_res: list: items = (length, time series list) -> time series list: items = (id, start, end)
        """
        # add save option or not
        group_start_time = time.time()
        group_rdd = global_dict_rdd.flatMap(lambda x: get_subsquences(x, grouping_range[0], grouping_range[1])).map(
            lambda x: (x[0], [x[1:]])).reduceByKey(
            lambda a, b: a + b)
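        # toy illustration of the pipeline above (ids and numbers are made up):
        #   flatMap     -> (3, 'ts_a', 0, 2), (3, 'ts_b', 5, 7), ...
        #   map         -> (3, [('ts_a', 0, 2)]), (3, [('ts_b', 5, 7)]), ...
        #   reduceByKey -> (3, [('ts_a', 0, 2), ('ts_b', 5, 7)])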
        group_rdd.saveAsPickleFile(path_save_res + '/group/')
        group_end_time = time.time()
        print('group of timeseries from ' + str(grouping_range[0]) + ' to ' + str(grouping_range[1]) + ' using ' + str(
            group_end_time - group_start_time) + ' seconds')
        # group_rdd_res = group_rdd.collect()
        print("grouping done, saved to dataset")

        """
        ##### cluster

        The following code is for testing clustering operation. Cluster one group without using RDD
        4/15/19
        # print("Test clustering")
        # group_res = group_rdd.collect()
        # cluster(group_res[1][1], group_res[1][0], st, global_dict.value)  # testing group with length of 9
        """

        # print("Test clustering")
        # group_res = group_rdd.collect()
        # # cluster_two_pass(group_res[1][1], group_res[1][0], st, global_dict.value)  # testing group with length of 9
        # cluster(group_res[1][1], group_res[1][0], st, global_dict.value)  # testing group with length of 9

        print("Working on clustering")
        cluster_start_time = time.time()
        cluster_rdd = group_rdd.map(lambda x: cluster(x[1], x[0], st, global_dict.value))

        cluster_rdd.saveAsPickleFile(path_save_res + '/cluster/')  # save all the cluster to the hard drive
        cluster_rdd_reload = sc.pickleFile(path_save_res + '/cluster/').collect()  # here we have all the clusters in memory
        # first_dict = cluster_rdd_reload[0]
        cluster_end_time = time.time()

        print('clustering of timeseries from ' + str(grouping_range[0]) + ' to ' + str(
            grouping_range[1]) + ' using ' + str(cluster_end_time - cluster_start_time) + ' seconds')

        print("clustering done, saved to dataset")

        # plot all the clusters
        # plot_cluster(cluster_rdd_reload, 2, time_series_dict, 5)

        """
            ##### query
            Current implementation: if we want to find k best matches, we give the first k best matches for given sequence length range


            The following line is for testing querying on one cluster
            # query_result = query(query_sequence, cluster_rdd_reload[0], k, time_series_dict.value)

        """



        # print("Using Twopass")
        # total_cluster_count = 0
        # for cluster_dic in cluster_rdd.collect():
        #
        #     representative, cluster_subsequences = random.choice(list(cluster_dic.items()))
        #
        #     cluster_length = representative.get_length()
        #     total_cluster_count = total_cluster_count + len(cluster_dic.keys())
        #
        #     print("length " + str(cluster_length) + " has cluster count of " + str(len(cluster_dic.keys())))
        # print("Total cluster count is: " + str(total_cluster_count))

        # # '(001-SART-August2017-MB)_(211-Current-Item:-3)_(A-DC1)_(64434.0)_(105950.0)'
        # '(2013e_001)_(100-0-Back)_(B-DC8)_(232665953.1250)'
        query_id = '(001-SART-August2017-MB)_(211-Current-Item:-3)_(A-DC1)_(64434.0)_(105950.0)'
        query_sequence = get_data(query_id, 24, 117, time_series_dict.value)  # get an example query
        filter_rdd = cluster_rdd.filter(lambda x: exclude_same_id(x, query_id))
        # raise exception if the query_range exceeds the grouping range
        querying_range = (90, 91)
        k = 5  # looking for k best matches
        if querying_range[0] < grouping_range[0] or querying_range[1] > grouping_range[1]:
            raise Exception("query_operations: query: Query range does not match group range")

        query_result = cluster_rdd.filter(lambda x: x).map(
            lambda clusters: query(query_sequence, querying_range, clusters, k, time_series_dict.value)).collect()
        # this second query runs on filter_rdd (clusters whose id differs from the query's)
        # with overlap exclusion, and overwrites the unfiltered query_result above
        exclude_overlapping = True
        query_result = filter_rdd.map(
            lambda clusters: query(query_sequence, querying_range, clusters, k, time_series_dict.value,
                                   exclude_overlapping,
                                   0.5)).collect()

        plot_query_result(query_sequence, query_result, time_series_dict.value)

    sc.stop()
Example #3
File: CLI.py  Project: yuli14/GenexPlus
                    print("grouping done, saved to dataset")

            elif args[0] == 'cluster':
                if gp_project is None:
                    gp_not_opened_error()
                elif sc is None:
                    spark_context_not_set_error()
                elif gp_project.time_series_dict is None:
                    no_group_before_cluster_error()
                else:
                    print("Working on clustering")
                    cluster_start_time = time.time()
                    # TODO Question: why do we call cluster on the global_dict?
                    cluster_rdd = group_rdd.map(
                        lambda x: cluster(x[1], x[0], 0.1, global_dict.value)
                    )  # TODO have the user decide the similarity threshold

                    # cluster_rdd_collected = cluster_rdd.collect()
                    # first_dict = cluster_rdd_reload[0]
                    gp_project.set_cluster_data(
                        cluster_rdd
                    )  # save cluster information to the project

                    cluster_end_time = time.time()

                    print('clustering of timeseries from ' +
                          str(grouping_range[0]) + ' to ' +
                          str(grouping_range[1]) + ' using ' +
                          str(cluster_end_time - cluster_start_time) +
                          ' seconds')
Example #4
                    gp_not_opened_error()

                elif sc is None:
                    spark_context_not_set_error()
                else:
                    grouping_range = (1, max([len(v) for v in dict(norm_ts_list).values()]))
                    group_rdd = global_norm_list.flatMap(
                        lambda x: get_subsquences(x, grouping_range[0], grouping_range[1])).map(
                            lambda x: (x[0], [x[1:]])).reduceByKey(lambda a, b: a + b)
                    global_norm_dict = sc.broadcast(dict(norm_ts_list))
                    cluster_rdd = group_rdd.map(lambda x: cluster(
                        x[1], x[0], 0.1, global_norm_dict.value))

            elif args[0] == 'group':
                if gp_project is None:
                    gp_not_opened_error()
                elif sc is None:
                    spark_context_not_set_error()
                elif ts_list is None or ts_dict is None or normalized_ts_dict is None:
                    get_arg_error()
                else:
                    update_group = False
                    if group_rdd:

                        is_Update_group_infor = prompt(
                            "Project " + gp_project.get_project_name() +
                            "'s group information exists, would you like to update it? [y/n]"
Example #5
    "local[*]")  # using all available cores
sc = SparkContext(conf=conf)

features_to_append = [0, 1, 2, 3, 4]
ts_list, global_min, global_max = generate_source(
    '2013e_001_2_channels_02backs.csv', features_to_append)
norm_ts_list = normalize_ts_with_min_max(ts_list, global_min, global_max)

global_norm_list = sc.parallelize(norm_ts_list)
grouping_range = (1, max([len(v) for v in dict(norm_ts_list).values()]))

# group operation
group_rdd = global_norm_list.flatMap(
    lambda x: get_subsquences(x, grouping_range[0], grouping_range[1])).map(
        lambda x: (x[0], [x[1:]])).reduceByKey(lambda a, b: a + b)

group_result = group_rdd.collect()
global_norm_dict = sc.broadcast(dict(norm_ts_list))
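# broadcasting dict(norm_ts_list) once lets the cluster() calls below read it via
# global_norm_dict.value instead of shipping the whole dict inside every task closure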

# start = time.time()
# for group_entry in group_result:
#     result = cluster(group_entry[1], group_entry[0], 0.1, dict(norm_ts_list))
# end = time.time()
# print('Clustering all group WITHOUT Spark took ' + str(end - start) + ' seconds')

start = time.time()
cluster_rdd = group_rdd.map(
    lambda x: cluster(x[1], x[0], 0.1, global_norm_dict.value)).collect()
end = time.time()
print('Clustering all group WITH Spark took ' + str(end - start) + ' seconds')