Пример #1
0
def returnRuleWithMinSupport(binary_data_path, label, ruleList, minSupport,
                             minConfidence, maxLength, file_path, length):
    """Filter candidate rules by support and log those that are also confident.

    For every rule (a sequence of keys into the loaded binary data) the
    entries equal to 1 under *all* of the rule's keys are counted. The
    support is that count divided by ``maxLength``.

    Args:
        binary_data_path: Location passed to ``load_daily_data_simple``.
        label: Object with a ``copy()`` method that can be masked with the
            boolean frame built from the rule (assumed to be a pandas
            DataFrame aligned with the binary data -- TODO confirm).
        ruleList: Iterable of rules; each rule is an indexable, non-empty
            sequence of keys of the binary data.
        minSupport: Base support threshold.
        minConfidence: Confidence threshold for writing a rule to the file.
        maxLength: Denominator used to turn the match count into a support.
        file_path: Text file the accepted rules are appended to.
        length: Current rule length; when it equals 5 the support threshold
            used for the returned list is raised by 0.005.

    Returns:
        The list of rules whose support reaches the (possibly raised)
        threshold. If an exception interrupts processing, the traceback is
        printed and the rules collected so far are returned.
    """
    binary_data = load_daily_data_simple(binary_data_path)
    minSupport_return = minSupport + 0.005 if length == 5 else minSupport

    # Bound before the try block: previously a failing open() left the name
    # undefined and the final return raised UnboundLocalError.
    returnList = []
    try:
        # The context manager closes the file even when a rule raises.
        with open(file_path, 'a+') as f:
            for rule in ruleList:
                df = (binary_data[rule[0]] == 1)
                for idx in range(1, len(rule)):
                    df = df & (binary_data[rule[idx]] == 1)
                sup = np.sum(np.sum(df))
                support = sup / maxLength
                if support < minSupport_return:
                    continue
                returnList.append(rule)

                # Keep only the label entries covered by the rule.
                label_tem = label.copy()
                label_tem[~(df == True)] = np.nan
                ret = np.sum(np.sum(label_tem))
                confidence = ret / sup
                # support >= minSupport_return >= minSupport already holds
                # here, so only the confidence needs to be checked.
                if confidence >= minConfidence:
                    f.write('%s with sup %.6lf and conf %.6lf\n' %
                            (str(rule), support, confidence))
                    # Flush per rule so partial results survive a crash.
                    f.flush()
    except Exception:
        # Best effort: report the problem and return what was collected.
        print(traceback.format_exc())
    del binary_data
    return returnList
def adjust_key(args,
               result_data,
               key,
               key_rule,
               label,
               accuracy=0.002,
               epochs=10):
    """Try to shift the interval boundaries of one binarized feature.

    ``key`` is a feature name followed by a three-character interval number.
    The interval's boundaries are read from the split-point JSON file at
    ``args.splitPointPath`` and then moved in steps of ``accuracy_length``
    positions within the sorted finite raw values of the feature. A move is
    kept when the confidence returned by ``get_interval_conf`` is greater
    than the baseline returned by ``get_conf`` (first/last interval) or when
    ``compare_conf`` selects it (interior interval).

    Args:
        args: Object providing ``splitPointPath`` and ``dataPath``.
        result_data: Mapping that is passed to the confidence helpers;
            ``result_data[key]`` is overwritten whenever a move is accepted.
        key: Feature name plus a 3-character numeric suffix.
        key_rule: Rule(s) involving ``key``; when empty nothing is adjusted.
        label: Label data forwarded to the confidence helpers.
        accuracy: Fraction of the number of finite samples used as the step
            size (in sorted positions) for one boundary move.
        epochs: Maximum number of adjustment iterations.

    Returns:
        ``[[orig_left, orig_right], [new_left, new_right]]``. On the early
        exits both pairs are identical.

    NOTE(review): ``conf_baseline`` is computed once and never refreshed
    inside the loops. Index arithmetic such as
    ``array[idx + accuracy_length]`` is not bounds-checked, so it can raise
    IndexError near the upper end and ``array[idx - accuracy_length]`` can
    wrap around to the end of the list near the lower end.
    """
    conf_baseline = get_conf(result_data, key_rule, label)
    with open(args.splitPointPath, 'r') as f:
        split_point_dict = json.load(fp=f)
    # The last three characters of the key are the interval number, the rest
    # is the feature name.
    key_name = key[:-3]
    key_num = int(key[-3:])
    data = load_daily_data_simple(args.dataPath, ['%s' % key_name])
    data = data[key_name]
    # Flatten all values, drop NaN/inf and sort, so that positions in `array`
    # correspond to ranks of the raw values.
    array = data.values.reshape(len(data) * len(data.columns))
    array = array[np.isfinite(array)]
    array = list(sorted(array))
    # Step size of one boundary move, measured in sorted positions.
    accuracy_length = int(len(array) * accuracy)
    if key_num == 0:
        # First interval: the left boundary is fixed at -inf, only the right
        # boundary is adjusted.
        left_point = -np.inf
        right_point = split_point_dict[key_name][key_num]
        if len(key_rule) == 0:
            return [[-np.inf, right_point], [-np.inf, right_point]]
        for epoch in range(epochs):
            try:
                right_idx = array.index(right_point)
            except:
                # The boundary value is not present in the data: guess its
                # position (presumably split points sit at multiples of 10%
                # of the samples -- TODO confirm) and accept the guess only
                # if the integer parts agree; otherwise give up.
                # NOTE(review): bare except also hides unrelated errors.
                right_idx = int(len(array) * (key_num + 1) * 0.1)
                if int(array[right_idx]) == int(right_point):
                    pass
                else:
                    return [[-np.inf, right_point], [-np.inf, right_point]]

            # Right boundary -> right (widen the interval).
            right_point = array[right_idx + accuracy_length]
            conf_right, data_tem_right_1 = get_interval_conf(
                result_data, data, key, key_rule, label, left_point,
                right_point)

            if conf_right > conf_baseline:
                result_data[key] = data_tem_right_1
                right_point = array[right_idx + accuracy_length]
                continue

            # Right boundary -> left (shrink the interval).
            right_point = array[right_idx - accuracy_length]
            conf_right, data_tem_right_2 = get_interval_conf(
                result_data, data, key, key_rule, label, left_point,
                right_point)

            if conf_right > conf_baseline:
                result_data[key] = data_tem_right_2
                right_point = array[right_idx - accuracy_length]
            else:
                # Neither direction beats the baseline: restore the boundary
                # and stop.
                right_point = array[right_idx]
                break
        return [[-np.inf, split_point_dict[key_name][key_num]],
                [-np.inf, right_point]]
    elif key_num == len(split_point_dict[key_name]):
        # Last interval: the right boundary is fixed at +inf, only the left
        # boundary is adjusted.
        left_point = split_point_dict[key_name][key_num - 1]
        right_point = np.inf
        if len(key_rule) == 0:
            return [[left_point, np.inf], [left_point, np.inf]]
        for epoch in range(epochs):
            try:
                left_idx = array.index(left_point)
            except:
                # Same fallback as for the first interval: guess the position
                # and give up when the integer parts differ.
                left_idx = int(len(array) * (key_num) * 0.1)
                if int(array[left_idx]) == int(left_point):
                    pass
                else:
                    return [[left_point, np.inf], [left_point, np.inf]]

            # Left boundary -> left (widen the interval).
            left_point = array[left_idx - accuracy_length]
            conf_left, data_tem_left_1 = get_interval_conf(
                result_data, data, key, key_rule, label, left_point,
                right_point)

            if conf_left > conf_baseline:
                result_data[key] = data_tem_left_1
                left_point = array[left_idx - accuracy_length]
                continue

            # Left boundary -> right (shrink the interval).
            left_point = array[left_idx + accuracy_length]
            conf_left, data_tem_left_2 = get_interval_conf(
                result_data, data, key, key_rule, label, left_point,
                right_point)

            if conf_left > conf_baseline:
                result_data[key] = data_tem_left_2
                left_point = array[left_idx + accuracy_length]
            else:
                # Neither direction beats the baseline: restore the boundary
                # and stop.
                left_point = array[left_idx]
                break
        return [[split_point_dict[key_name][key_num - 1], np.inf],
                [left_point, np.inf]]
    else:
        # Interior interval: both boundaries may move.
        left_point = split_point_dict[key_name][key_num - 1]
        right_point = split_point_dict[key_name][key_num]
        if len(key_rule) == 0:
            return [[left_point, right_point], [left_point, right_point]]
        for epoch in range(epochs):
            try:
                left_idx = array.index(left_point)
                right_idx = array.index(right_point)
            except:
                # Fallback position guess for both boundaries; give up when
                # either guess does not match on the integer part.
                # NOTE(review): in later epochs this returns the already
                # adjusted boundaries as both the "original" and new pair.
                left_idx = int(len(array) * (key_num) * 0.1)
                right_idx = int(len(array) * (key_num + 1) * 0.1)
                if int(array[left_idx]) == int(left_point) and int(
                        array[right_idx]) == int(right_point):
                    pass
                else:
                    return [[left_point, right_point],
                            [left_point, right_point]]

            # First try widening. Left boundary -> left.
            left_point = array[left_idx - accuracy_length]
            right_point = array[right_idx]
            conf_left, data_tem_left_1 = get_interval_conf(
                result_data, data, key, key_rule, label, left_point,
                right_point)

            # Right boundary -> right.
            left_point = array[left_idx]
            right_point = array[right_idx + accuracy_length]
            conf_right, data_tem_right_1 = get_interval_conf(
                result_data, data, key, key_rule, label, left_point,
                right_point)

            # As used below: flag 1 keeps the left move, flag 2 keeps the
            # right move; any other value falls through to the shrink trial.
            flag = compare_conf(conf_left, conf_right, conf_baseline)
            if flag == 1:
                result_data[key] = data_tem_left_1
                left_point = array[left_idx - accuracy_length]
                right_point = array[right_idx]
                continue
            elif flag == 2:
                result_data[key] = data_tem_right_1
                left_point = array[left_idx]
                right_point = array[right_idx + accuracy_length]
                continue

            # Widening did not help, try shrinking. Left boundary -> right.
            left_point = array[left_idx + accuracy_length]
            right_point = array[right_idx]
            conf_left, data_tem_left_2 = get_interval_conf(
                result_data, data, key, key_rule, label, left_point,
                right_point)

            # Right boundary -> left.
            left_point = array[left_idx]
            right_point = array[right_idx - accuracy_length]
            conf_right, data_tem_right_2 = get_interval_conf(
                result_data, data, key, key_rule, label, left_point,
                right_point)

            flag = compare_conf(conf_left, conf_right, conf_baseline)
            if flag == 0:
                # No move is kept: restore both boundaries and stop.
                left_point = array[left_idx]
                right_point = array[right_idx]
                break
            elif flag == 1:
                result_data[key] = data_tem_left_2
                left_point = array[left_idx + accuracy_length]
                right_point = array[right_idx]
            elif flag == 2:
                result_data[key] = data_tem_right_2
                left_point = array[left_idx]
                right_point = array[right_idx - accuracy_length]
        return [[
            split_point_dict[key_name][key_num - 1],
            split_point_dict[key_name][key_num]
        ], [left_point, right_point]]
Пример #3
0
            adjust_interval(args, binary_data, label, ruleList, elogger)
            for key in binary_data.keys():
                binary_data[key].to_csv('./binary_data_adjusted/%s.csv' %
                                        str(key))
            binary_data_path = './binary_data_adjusted'
        length += 1

        elogger.log('Time spent in filtering ruleList: %d seconds' %
                    (end - start).seconds)
        candidateList = get_candidate_List(ruleList, elogger)
        elogger.log("The number of candidateList filtered is %d" %
                    len(candidateList))


if __name__ == '__main__':
    # Load the feature tables and the label table, keeping only label rows
    # whose index is not later than '20171231'.
    data = load_daily_data_simple(args.dataPath)
    label = load_daily_data_simple(args.labelPath, ['label'])['label']
    label = label.loc[label.index <= '20171231']

    # Mining thresholds taken from the parsed arguments.
    minSup, minConf = args.sup, args.conf

    # Record the process id and the full argument set at start-up.
    elogger = Logger(args.logger)
    elogger.log(str(os.getpid()))
    elogger.log(str(args._get_kwargs()))

    # Binarize the features with the stored split points.
    with open('./EquiDepth_Label000_pct5.json', 'r') as fp:
        split_point_dict = json.load(fp)
    binary_data = GetBinaryBySplitPoint(data, split_point_dict)

    attr_set = get_attr()
    result = {}
    split_point_list = {}
    for key in data.keys():
        print("Processing feature %s." % key)
        if key == 'label':
            continue
        key_type = get_type(data[key])
        print("Processing feature %s. It's type %d" % (key, key_type))
        if key_type == 0:
            result[key] = data[key]
        elif key_type == 1:
            result_tem, split_point_tem = categorical_to_binary(data[key], key)
            result.update(result_tem)
            split_point_list[key] = split_point_tem
        elif key_type == 2:
            result_tem, split_point_tem = quantitative_to_binary(data[key], data['label'], key)
            result.update(result_tem)
            split_point_list[key] = split_point_tem
    with open(config['split_point_save_path'], 'w') as f:
        json.dump(split_point_list, f)
    return result

if __name__ == '__main__':
    # Load all tables, cut the label at '20171231' and align every table to
    # the remaining label index before binarizing.
    data = load_daily_data_simple(config['path_data'])
    full_label = data['label']
    label = full_label.loc[full_label.index <= '20171231']
    target_index = label.index
    for key in data.keys():
        data[key] = data[key].reindex(target_index)
    data['label'] = label

    result = get_binary(data)
    # Number of entries produced by the binarization.
    print(len(result.keys()))