Пример #1
0
def _record_entropy_drop(recorder, step, start, distance_c, max_distance_c, dataset, pile_count, entropy):
    """Write one "entropy decreased" record for sweep step ``step`` into ``recorder``."""
    key = str(Properties.name_str_static() + "#" + str(step))
    recorder.setValue(key, 'id', Properties.name_str_static())
    recorder.setValue(key, 'start', start)
    recorder.setValue(key, 'd_c', distance_c)
    recorder.setValue(key, 'max_distance_c', max_distance_c)
    recorder.setValue(key, 'dataset', dataset)
    recorder.setValue(key, 'pile_size', pile_count)
    recorder.setValue(key, 'H', entropy)
    # Runtime note text, roughly "time at which a new decrease was found".
    recorder.setValue(key, 'note', '发现新下降时间')
    recorder.setValue(key, 'end', Properties.name_str_HMS())


def ent_dc_step_by_step(id_index, index_id, data, threshold, distance, distance_c, dataset='none'):
    """
    Sweep the cutoff ``distance_c`` upward in fixed steps and track the entropy
    of the pile-size distribution produced at every step.

    Each step calls ``pile_function``, merges all outlier piles into a single
    bucket, computes ``_calc_ent(sizes / N)`` and appends
    ``[entropy, distance_c, pile]`` to ``threshold`` through ``add_row``.
    Whenever the entropy is lower than at the previous step, the step is written
    to a ``ClusterRecorder`` and ``save_show_cluster`` is called. The sweep ends
    when ``distance_c`` exceeds ``max_distance(distance, distance_c)`` or the
    entropy reaches 0.

    :param id_index: forwarded unchanged to ``pile_function``.
    :param index_id: object with ``.shape``; ``shape[0]`` is used as the sample count N.
    :param data: forwarded to ``pile_function`` and ``save_show_cluster``.
    :param threshold: accumulator handed to ``add_row`` once per step.
    :param distance: array-like with ``copy()``/``min(axis=0)`` that may contain NaN
        (presumably a pairwise distance matrix -- confirm with the caller).
    :param distance_c: starting cutoff distance.
    :param dataset: label passed to the recorder, ``pile_function`` and ``save_show_cluster``.
    :return: ``threshold`` as returned by the last ``add_row`` call (the input
        value if the loop body never runs).
    """
    # TODO modify the clustering without outlier
    i = 0
    level = "INFO"
    N = int(index_id.shape[0])
    max_distance_c = max_distance(distance, distance_c)
    learning_rate = 0
    pre = 65535  # sentinel "previous entropy", larger than any value expected from _calc_ent

    # Step size for the sweep: replace NaN with the standard deviation of the
    # (NaN -> 0) distances, then take the standard deviation of the column minima.
    temp = distance.copy()
    temp[np.isnan(temp)] = 0
    stand = np.std(temp)
    temp = distance.copy()
    temp[np.isnan(temp)] = stand
    temp = temp.min(axis=0)
    next_distance_c = np.std(temp)

    clusterRecorder = ClusterRecorder(dataset)
    # Record "#0" describes the whole run; its 'end' field is filled in after the loop.
    cr_i = str(Properties.name_str_static() + "#" + str(i))
    clusterRecorder.setValue(cr_i, 'id', Properties.name_str_static())
    clusterRecorder.setValue(cr_i, 'start', Properties.name_str_HMS())
    clusterRecorder.setValue(cr_i, 'd_c', distance_c)
    clusterRecorder.setValue(cr_i, 'max_distance_c', max_distance_c)
    clusterRecorder.setValue(cr_i, 'dataset', dataset)
    clusterRecorder.setValue(cr_i, 'pile_size', N)
    clusterRecorder.setValue(cr_i, 'H', 65535)
    # Runtime note text, roughly "running time of the whole algorithm".
    clusterRecorder.setValue(cr_i, 'note', '整个算法运行时间')
    if learning_rate != 0:
        distance_c = distance_c + learning_rate

    start_time = Properties.name_str_HMS()

    while max_distance_c >= distance_c:
        i = i + 1
        last_time = Properties.name_str_HMS()
        # Empty frame describing the piles; pile_function returns the populated result.
        pile_id = DataFrame([], columns=['p_id', 'pile', 'size', 'outlier'])
        pile_id = pile_function(pile_id, id_index, index_id, data, distance, distance_c, next_distance_c, dataset)
        pile = pile_id.shape[0] - np.sum(pile_id['outlier'])

        pile_id = pile_id.sort_values('size', ascending=False)
        pile_id = pile_id.reset_index(drop=True)

        # Sizes of the regular piles, plus ONE bucket holding all outlier members.
        # The row loop deliberately does not reuse ``i`` (the step counter), and the
        # outlier bucket is appended once, after every row has been visited.
        sizes = []
        outlier_total = 0
        for size, is_outlier in zip(pile_id['size'], pile_id['outlier']):
            if not is_outlier:
                sizes.append(size)
            else:
                outlier_total += size
        if outlier_total > 0:
            sizes.append(outlier_total)

        e = _calc_ent(np.array(sizes) / N)
        threshold = add_row(threshold, [e, distance_c, pile])

        if pre - e > 0:
            # Entropy dropped compared with the previous step.
            # NOTE(review): two records are written under the same key; the second
            # replaces 'start' (this step's start) with the time of the previous
            # drop, and save_show_cluster runs twice. Kept as in the original --
            # confirm whether the first record/call is still needed.
            _record_entropy_drop(clusterRecorder, i, last_time, distance_c, max_distance_c,
                                 dataset, len(pile_id), e)
            save_show_cluster(index_id, data, distance_c, pile_id, dataset)

            _record_entropy_drop(clusterRecorder, i, start_time, distance_c, max_distance_c,
                                 dataset, len(pile_id), e)
            save_show_cluster(index_id, data, distance_c, pile_id, dataset)
            start_time = Properties.name_str_HMS()
            clusterRecorder.save()
            # ``level`` is fixed to "INFO" above, so this branch only runs when the
            # constant is edited by hand for debugging.
            if level == "DEBUG":
                pile_id = pile_function(pile_id, id_index, index_id, data, distance, distance_c - next_distance_c,
                                        next_distance_c, dataset, level="DEBUG")
                save_show_cluster(index_id, data, distance_c, pile_id, dataset, level="DEBUG")

        pre = e
        distance_c = distance_c + next_distance_c

        if e == 0:
            log.debug("e is 0.")
            break

        log.info(
            str(i) + " time, finished the next_distance_c about: " + str(next_distance_c) + " distance_c:" + str(
                distance_c) + " next-learning_rate:" + str(learning_rate) + " H:" + str(e))
    clusterRecorder.setValue(cr_i, 'end', Properties.name_str_HMS())
    clusterRecorder.save()
    log.debug(threshold)
    return threshold
Пример #2
0
def ent_dc_step_by_step(id_index,
                        index_id,
                        data,
                        threshold,
                        distance,
                        distance_c,
                        dataset='none'):
    """
    Increase the cutoff ``distance_c`` step by step and log the entropy of the
    pile sizes obtained at each cutoff.

    For every cutoff, ``pile_function`` is run, the entropy
    ``_calc_ent(sizes / N)`` is computed, and ``[entropy, distance_c, pile]`` is
    added to ``threshold`` with ``add_row``. A step whose entropy is lower than
    the previous one is written to a ``ClusterRecorder`` and passed to
    ``save_show_cluster``. The loop stops once ``distance_c`` is greater than
    ``max_distance(distance, distance_c)`` or the entropy equals 0.

    :param id_index: object with ``.values``; used as the index of the scratch
        frame given to ``pile_function`` and forwarded to it.
    :param index_id: object with ``.shape``; ``shape[0]`` is the sample count N.
    :param data: forwarded to ``save_show_cluster``.
    :param threshold: accumulator handed to ``add_row`` once per step.
    :param distance: array-like supporting ``copy()`` and ``min(axis=0)``; may hold NaN.
    :param distance_c: starting cutoff distance.
    :param dataset: label stored by the recorder.
    :return: ``threshold`` after the final ``add_row`` call.
    """
    # TODO modify the clustering without outlier
    round_no = 0
    sample_count = int(index_id.shape[0])
    max_distance_c = max_distance(distance, distance_c)
    learning_rate = 0
    previous_entropy = 65535

    # Step size: fill NaN with the std of the zero-filled distances, then take
    # the std of the per-column minima.
    filled = distance.copy()
    filled[np.isnan(filled)] = 0
    overall_std = np.std(filled)
    filled = distance.copy()
    filled[np.isnan(filled)] = overall_std
    column_minima = filled.min(axis=0)
    next_distance_c = np.std(column_minima)

    recorder = ClusterRecorder(dataset)
    run_key = str(Properties.name_str_static() + "#" + str(round_no))

    def record_descent(step, began, cutoff, pile_count, entropy):
        # One recorder row for a step on which the entropy went down.
        key = str(Properties.name_str_static() + "#" + str(step))
        recorder.setValue(key, 'id', Properties.name_str_static())
        for field, value in (('start', began),
                             ('d_c', cutoff),
                             ('max_distance_c', max_distance_c),
                             ('dataset', dataset),
                             ('pile_size', pile_count),
                             ('H', entropy),
                             ('note', '发现新下降时间')):
            recorder.setValue(key, field, value)
        recorder.setValue(key, 'end', Properties.name_str_HMS())

    # Row describing the run as a whole; 'end' is written after the loop.
    recorder.setValue(run_key, 'id', Properties.name_str_static())
    recorder.setValue(run_key, 'start', Properties.name_str_HMS())
    for field, value in (('d_c', distance_c),
                         ('max_distance_c', max_distance_c),
                         ('dataset', dataset),
                         ('pile_size', sample_count),
                         ('H', 65535),
                         ('note', '整个算法运行时间')):
        recorder.setValue(run_key, field, value)

    if learning_rate != 0:
        distance_c = distance_c + learning_rate
    log.debug("init the first max distance_c:%s distance shape:%s" %
              (str(max_distance_c), str(distance.shape)))
    descent_started = Properties.name_str_HMS()

    while max_distance_c >= distance_c:
        round_no = round_no + 1
        round_started = Properties.name_str_HMS()

        # Empty pile table and per-point scratch table handed to pile_function.
        empty_piles = DataFrame([], columns=['p_id', 'pile', 'size', 'outlier'])
        scratch = DataFrame(
            [],
            columns=['i_id', 'j_id', 'rho', 'delta', 'gamma', 'i', 'j', 'pile'],
            index=id_index.values)
        pile_id = pile_function(empty_piles, id_index, index_id, scratch,
                                distance, distance_c, next_distance_c)

        sizes = pile_id['size']
        pile = pile_id.shape[0] - np.sum(pile_id['outlier'])
        entropy = _calc_ent(sizes.values / sample_count)
        threshold = add_row(threshold, [entropy, distance_c, pile])

        if previous_entropy - entropy > 0:
            # Entropy fell: record the step twice (first with this round's start
            # time, then with the time of the previous descent) and save the
            # clustering after each record.
            for began in (round_started, descent_started):
                record_descent(round_no, began, distance_c, len(pile_id), entropy)
                save_show_cluster(index_id, data, distance_c, pile_id)
            descent_started = Properties.name_str_HMS()

        previous_entropy = entropy
        distance_c = distance_c + next_distance_c

        if entropy == 0:
            log.debug("e is 0.")
            break

        log.info(
            "%s time, finished the next_distance_c about: %s distance_c:%s"
            " next-learning_rate:%s H:%s" %
            (str(round_no), str(next_distance_c), str(distance_c),
             str(learning_rate), str(entropy)))

    recorder.setValue(run_key, 'end', Properties.name_str_HMS())
    recorder.save()
    log.debug(threshold)
    return threshold