Пример #1
0
def clever_cluster(dcpc, k, logger=None):
    """Cluster the columns of ``dcpc`` into ``k`` groups with KMeans.

    Runs KMeans 20 times and keeps the fitted model with the smallest total
    within-cluster distance, then returns, for each kept cluster center, the
    index of the closest row of the transposed input.

    Args:
        dcpc: 2-D array whose columns are the instances to cluster.
        k: number of clusters.
        logger: optional logger; a default one is created when omitted.

    Returns:
        Array of row indices (one per cluster center) closest to each center.
    """
    if logger is None:
        logger = init_logging('')

    keep_model = None
    keep_dis = -1
    # Cluster the columns of the original matrix (rows after transpose).
    dcpc = dcpc.T
    for _ in range(20):
        model = KMeans(n_clusters=k).fit(dcpc)
        centers = np.array(model.cluster_centers_)
        labels = model.labels_
        overall_dis = 0
        for label in range(k):
            clu_idx = np.where(labels == label)[0]
            if len(clu_idx) == 0:
                # An empty cluster contributes no distance.
                continue
            # Gather the cluster members with fancy indexing instead of a
            # per-row append loop.
            clu_ins = dcpc[clu_idx, :]
            center_label = centers[label].reshape(1, -1)
            # Sum of member-to-center distances for this cluster.
            overall_dis += np.sum(euclidean_distances(clu_ins, center_label))
        if keep_dis < 0 or keep_dis > overall_dis:
            keep_model = model
            keep_dis = overall_dis
            # print() call instead of the Python-2-only print statement.
            print(model.labels_)

    closest, _ = pairwise_distances_argmin_min(keep_model.cluster_centers_,
                                               dcpc)

    return closest
Пример #2
0
def run_ijcnn_fcn(train_x_matrix,
                  train_y_matrix,
                  test_x_matrix,
                  test_y_matrix,
                  cnn_setting,
                  saver_file_profix='',
                  logger=None):
    """Configure and train an FCN-style network, then evaluate it.

    Args:
        train_x_matrix: training data, shape (rows, attr_len, attr_num, maps).
        train_y_matrix: one-hot training labels, shape (rows, num_classes).
        test_x_matrix: test data with the same layout as the training data.
        test_y_matrix: one-hot test labels.
        cnn_setting: CNN configuration object; feature_method is forced off.
        saver_file_profix: prefix prepended to the checkpoint file name.
        logger: optional logger; a default one is created when omitted.

    Returns:
        (eval value, train time, test time, predicted probabilities,
        checkpoint file, feature-list object file). A NaN eval value is
        normalized to 0.
    """
    # `is None` instead of `== None` (PEP 8 identity test for None).
    if logger is None:
        logger = init_logging('')
    start_class = 0
    class_column = 0
    train_row, attr_len, attr_num, input_map = train_x_matrix.shape
    # The FCN variant does not use an extra feature method.
    cnn_setting.feature_method = 'none'

    num_classes = train_y_matrix.shape[1]
    data_stru = return_data_stru(num_classes, start_class, attr_num, attr_len,
                                 class_column)

    input_x_placeholder = tf.placeholder(tf.float32,
                                         [None, attr_len, attr_num, input_map])
    output_y_placeholder = tf.placeholder(tf.float32, [None, num_classes])
    predict_y_prob, keep_prob_placeholder, keeped_feature_list, saver_file = fcn_configure(
        input_x_placeholder, num_classes, logger)

    saver_file = saver_file_profix + saver_file
    cnn_eval_value, train_run_time, test_run_time, cnn_predict_prob, saver_file, feature_list_obj_file = cnn_train(
        train_x_matrix, train_y_matrix, test_x_matrix, test_y_matrix,
        num_classes, cnn_setting, input_x_placeholder, output_y_placeholder,
        predict_y_prob, keep_prob_placeholder, keeped_feature_list, saver_file,
        logger)
    # Training can diverge and yield NaN; report 0 in that case.
    if str(cnn_eval_value) == 'nan':
        cnn_eval_value = 0
    return cnn_eval_value, train_run_time, test_run_time, cnn_predict_prob, saver_file, feature_list_obj_file
Пример #3
0
def run_projected_cnn(train_x_matrix,
                      train_y_matrix,
                      test_x_matrix,
                      test_y_matrix,
                      data_stru,
                      cnn_setting,
                      group_all=False,
                      saver_file_profix='',
                      logger=None):
    """Build the projected-CNN graph, train it, and evaluate on test data.

    The data structure's attr_num/attr_len are refreshed from the actual
    training-tensor shape before the graph is built.
    """
    if logger is None:
        logger = init_logging('')
    num_classes = data_stru.num_classes
    attr_num = data_stru.attr_num
    attr_len = data_stru.attr_len
    logger.info(cnn_setting)

    # Trust the real tensor shape over the bookkeeping object and sync back.
    train_row, attr_len, attr_num, input_map = train_x_matrix.shape
    data_stru.attr_num = attr_num
    data_stru.attr_len = attr_len

    (train_x_placeholder, output_y_placeholder, logits_out,
     keep_prob_placeholder, keeped_feature_list,
     saver_file) = cnn_set_flow_graph(data_stru, cnn_setting, input_map,
                                      group_all, logger)

    saver_file = saver_file_profix + "_group_" + str(group_all) + saver_file
    (cnn_eval_value, train_run_time, test_run_time, cnn_predict_proba,
     saver_file, feature_list_obj_file) = cnn_train(
         train_x_matrix, train_y_matrix, test_x_matrix, test_y_matrix,
         num_classes, cnn_setting, train_x_placeholder, output_y_placeholder,
         logits_out, keep_prob_placeholder, keeped_feature_list, saver_file,
         logger)
    # NaN evaluation results are reported as 0.
    cnn_eval_value = 0 if str(cnn_eval_value) == 'nan' else cnn_eval_value
    return (cnn_eval_value, train_run_time, test_run_time, cnn_predict_proba,
            saver_file, feature_list_obj_file)
Пример #4
0
def fcn_configure(input_x_placeholder, num_classes, logger):
    """Build the FCN graph: stacked conv + batch-norm layers, global pooling,
    dropout, and a final output layer.

    Args:
        input_x_placeholder: 4-D input placeholder (batch, rows, cols, maps).
        num_classes: number of output classes.
        logger: logger instance; a default one is created when None.

    Returns:
        (predict_y_prob, keep_prob_placeholder, keeped_feature_list,
        saver_file): output tensor, dropout placeholder, list holding the
        pooled feature tensor, and a checkpoint file name encoding the layer
        configuration.
    """
    # `is None` instead of `== None` (PEP 8 identity test for None).
    if logger is None:
        logger = init_logging('')
    # Kernels run along the row (time) axis; one conv layer per entry.
    conv_kernel_list = np.array([[8, 1], [5, 1], [3, 1]])
    feature_num_list = [50, 40, 20]
    activation_fun = 0
    num_input_map = 1
    conv_row_num = len(conv_kernel_list)
    saver_file = ''
    strides_list = [1, 1, 1, 1]
    std_value = 0.02
    same_size = False
    out_conv = input_x_placeholder
    keeped_feature_list = []
    for i in range(conv_row_num):
        logger.info('layer: ' + str(i) + " input:")
        logger.info(out_conv.get_shape())
        conv_row_kernel = conv_kernel_list[i, 0]
        conv_col_kernel = conv_kernel_list[i, 1]

        num_output_map = feature_num_list[i]

        saver_file = saver_file + "_c" + str(conv_row_kernel) + "_" + str(
            conv_col_kernel)
        out_conv = conf_conv_layer(i, conv_row_kernel, conv_col_kernel,
                                   out_conv, num_input_map, num_output_map,
                                   activation_fun, strides_list, std_value,
                                   same_size)
        logger.info("Conv output: " + str(out_conv.get_shape()))
        out_conv = tf.layers.batch_normalization(out_conv)
        logger.info("Conv after batch normal: " + str(out_conv.get_shape()))
        num_input_map = num_output_map

    # Global pooling over the entire remaining row axis.
    row_samp_rate = out_conv.get_shape()[1]
    col_samp_rate = 1
    out_conv = conf_pool_layer(out_conv, row_samp_rate, col_samp_rate, False)
    keeped_feature_list.append(out_conv)
    logger.info("Feature result shape")
    logger.info(out_conv.get_shape())

    saver_file = saver_file + "global_p" + str(row_samp_rate) + "_" + str(
        col_samp_rate) + '.ckpt'

    # Dropout before the output layer.
    keep_prob_placeholder = tf.placeholder(tf.float32)
    out_conv = tf.nn.dropout(out_conv, keep_prob_placeholder)

    # Flatten (batch, r, c, maps) -> (batch, features).
    out_fir, out_sec, out_thi, out_for = out_conv.get_shape()
    feature_num = int(out_sec * out_thi * out_for)
    # print() calls instead of the Python-2-only print statements.
    print(out_conv.get_shape())
    out_conv = tf.reshape(out_conv, [-1, feature_num])
    print(std_value)
    print(feature_num)
    predict_y_prob = conf_out_layer(out_conv, feature_num, num_classes,
                                    std_value)
    print(predict_y_prob.get_shape())
    return predict_y_prob, keep_prob_placeholder, keeped_feature_list, saver_file
Пример #5
0
def rf_feature_extraction(x_matrix,
                          y_vector,
                          predict=False,
                          logger=None,
                          rf_estimator=50):
    """Rank features with an extremely-randomized-trees classifier.

    Args:
        x_matrix: 2-D feature matrix (instances x features).
        y_vector: class labels.
        predict: when True, also score the fitted model on the training data.
        logger: optional logger; a default one is created when omitted.
        rf_estimator: number of trees in the ensemble.

    Returns:
        (feature_value_vector, rf_model, f1_value, run_time): absolute
        feature importances, the fitted model, the training F1 value (or -1
        when predict is False), and the fit time in seconds.
    """
    if logger is None:
        logger = init_logging('')

    rf_model = ExtraTreesClassifier(n_estimators=rf_estimator, random_state=0)
    start_time = time.time()
    rf_model.fit(x_matrix, y_vector)
    run_time = time.time() - start_time

    feature_value_vector = np.absolute(rf_model.feature_importances_)
    if predict is True:
        predict_y = rf_model.predict(x_matrix)
        accuracy, precision, recall, f1_value, tp, fp, tn, fn = f1_value_precision_recall_accuracy(
            predict_y, y_vector, 1)
    else:
        # BUG FIX: the function returns f1_value, which was previously left
        # unbound on this branch (NameError); -1 marks "not evaluated".
        f1_value = -1
    return feature_value_vector, rf_model, f1_value, run_time
Пример #6
0
def run_cnn_lda_class_based_feature_analysis_main(feature_folder, feature_file_keyword, class_label, data_folder, data_stru, logger=None):
    """Aggregate LDA-based feature importance for one class across folds.

    Scans feature_folder for files matching both feature_file_keyword and
    the 'class_<label>_' marker, runs the per-fold LDA analysis against a
    binarized (one-vs-rest) label vector, and sums the fold scores.

    Returns:
        (class_based_index_array, class_based_value_array, overall_time):
        feature indices sorted by descending aggregated score, the aggregated
        scores, and total analysis time in seconds.
    """
    # `is None` instead of `== None` (PEP 8 identity test for None).
    if logger is None:
        logger = init_logging('')

    lda_weight_feature_matrix = []

    feature_file_list = listFiles(feature_folder)
    overall_time = 0
    file_count = 0
    class_keyword = 'class_' + str(class_label) + '_'

    file_demiliter = '_'

    for feature_file in feature_file_list:
        if feature_file_keyword not in feature_file or class_keyword not in feature_file:
            continue
        logger.info(feature_file)

        file_count = file_count + 1
        # Feature file names embed the originating train file name:
        # <prefix>_<a>_<b>_... -> train file "<a>_<b>.txt".
        feature_file_array = feature_file.split(file_demiliter)
        train_file = feature_file_array[1] + file_demiliter + feature_file_array[2] + '.txt'
        logger.info(train_file)
        train_x_matrix, train_y_vector = readFile(data_folder + train_file)
        # One-vs-rest labels for the requested class.
        temp_train_y_vector = np.where(train_y_vector == class_label, 1, 0)
        fold_positive_len = len(np.where(temp_train_y_vector == 1)[0])
        fold_negative_len = len(temp_train_y_vector) - fold_positive_len

        logger.info("=====")
        logger.info("positive class labels length: " + str(fold_positive_len))
        logger.info("negative class labels length: " + str(fold_negative_len))

        [fold_train_sensor_result, fold_weight_fullconn, fold_bias_fullconn] = load_obj(feature_folder + feature_file)

        logger.info(fold_train_sensor_result.shape)
        logger.info(fold_weight_fullconn.shape)
        logger.info(fold_bias_fullconn.shape)

        fold_train_sensor_result = np.squeeze(fold_train_sensor_result)
        logger.info(fold_train_sensor_result.shape)
        fold_attr_imp_index, skip_count, fold_time = project_cnn_feature_lda_analysis(fold_train_sensor_result, temp_train_y_vector, logger)
        overall_time = overall_time + fold_time
        logger.info(fold_attr_imp_index.shape)
        logger.info("skip: " + str(skip_count))
        lda_weight_feature_matrix.append(fold_attr_imp_index)

    lda_weight_feature_matrix = np.array(lda_weight_feature_matrix)
    lda_weight_feature_matrix = np.squeeze(lda_weight_feature_matrix)
    logger.info(lda_weight_feature_matrix.shape)

    start_time = time.time()
    # Sum fold scores, then rank features by descending aggregate value.
    class_based_value_array = np.sum(lda_weight_feature_matrix, axis=0)
    class_based_index_array = np.argsort(-class_based_value_array)
    overall_time = overall_time + time.time() - start_time
    return class_based_index_array, class_based_value_array, overall_time
Пример #7
0
def run_cnn_pca_class_based_feature_analysis_main(feature_folder, feature_file_keyword, class_label, data_folder, data_stru, logger=None):
    """Aggregate PCA-based feature rankings for one class across folds.

    Scans feature_folder for files matching both feature_file_keyword and
    the 'class_<label>_' marker, restricts each fold's sensor output to the
    instances of that class, ranks features with the 3-D PCA projection, and
    merges the fold rankings with a majority vote.

    Returns:
        (lda_feature_vector, lda_feature_value_vector, overall_time).
    """
    # `is None` instead of `== None` (PEP 8 identity test for None).
    if logger is None:
        logger = init_logging('')

    weight_feature_matrix = []

    feature_file_list = listFiles(feature_folder)
    overall_time = 0
    file_count = 0
    class_keyword = 'class_' + str(class_label) + '_'

    file_demiliter = '_'

    for feature_file in feature_file_list:
        if feature_file_keyword not in feature_file or class_keyword not in feature_file:
            continue
        logger.info(feature_file)
        file_count = file_count + 1
        # Feature file names embed the originating train file name.
        feature_file_array = feature_file.split(file_demiliter)
        train_file = feature_file_array[1] + file_demiliter + feature_file_array[2] + '.txt'
        logger.info(train_file)
        train_x_matrix, train_y_vector = readFile(data_folder + train_file)

        class_label_index = np.where(train_y_vector == class_label)[0]

        logger.info("=====")
        logger.info("positive class labels length: " + str(len(class_label_index)))

        [fold_train_sensor_result, fold_weight_fullconn, fold_bias_fullconn] = load_obj(feature_folder + feature_file)

        logger.info(fold_train_sensor_result.shape)
        logger.info(fold_weight_fullconn.shape)
        logger.info(fold_bias_fullconn.shape)

        fold_train_sensor_result = np.squeeze(fold_train_sensor_result)
        # Only the instances belonging to the target class are projected.
        fold_train_sensor_result = fold_train_sensor_result[class_label_index, :, :]
        logger.info(fold_train_sensor_result.shape)
        start_time = time.time()
        fold_attr_imp_index, fold_attr_imp = run_pca_proj_feature_3D(fold_train_sensor_result)
        overall_time = overall_time + time.time() - start_time
        logger.info(fold_attr_imp_index.shape)
        logger.info(fold_attr_imp.shape)
        weight_feature_matrix.append(fold_attr_imp_index)

    weight_feature_matrix = np.array(weight_feature_matrix)
    weight_feature_matrix = weight_feature_matrix.astype(int)
    logger.info(weight_feature_matrix.shape)
    start_time = time.time()
    # Merge per-fold rankings by majority vote.
    lda_feature_vector, lda_feature_value_vector = majority_vote_index(weight_feature_matrix, -1)
    logger.info(lda_feature_vector.shape)
    overall_time = overall_time + time.time() - start_time
    return lda_feature_vector, lda_feature_value_vector, overall_time
Пример #8
0
def run_cnn_combined_rf_lda_class_based_feature_analysis_main(feature_folder, feature_file_pre, feature_file_post, start_class, end_class, data_folder, data_stru, logger=None):
    """Aggregate combined RF+LDA feature scores per class over all folds.

    For every 'train_*' data file and every class in [start_class,
    end_class), loads the matching CNN feature dump, scores attributes with
    the combined RF/LDA analysis against one-vs-rest labels, and sums the
    scores across maps, classes, and folds.

    Returns:
        (feature_index_feature_matrix, feature_weight_feature_matrix,
        overall_time): per-class descending feature index order, the summed
        weights, and total analysis time in seconds.
    """
    # `is None` instead of `== None` (PEP 8 identity test for None).
    if logger is None:
        logger = init_logging('')

    feature_weight_feature_matrix = []
    data_file_list = listFiles(data_folder)
    overall_time = 0
    for train_file in data_file_list:
        if 'train_' not in train_file:
            continue
        logger.info(train_file)
        train_keyword = train_file.replace('.txt', '')
        fold_feature_weight_matrix = []
        train_x_matrix, train_y_vector = readFile(data_folder + train_file)
        for class_label in range(start_class, end_class):
            logger.info("class label: " + str(class_label))
            feature_file = feature_file_pre + train_keyword + "_class_" + str(class_label) + feature_file_post
            [fold_train_sensor_result, fold_weight_fullconn, fold_bias_fullconn] = load_obj(feature_folder + feature_file)
            fold_train_sensor_result = np.squeeze(fold_train_sensor_result)

            # One-vs-rest labels for this class.
            temp_train_y_vector = np.where(train_y_vector == class_label, 1, 0)
            fold_attr_imp_index, fold_time = project_cnn_feature_combined_rf_lda_analysis(fold_train_sensor_result, temp_train_y_vector, logger)
            overall_time = overall_time + fold_time

            if class_label == 0:
                # Log the shapes only once per fold.
                logger.info(fold_train_sensor_result.shape)
                logger.info(fold_weight_fullconn.shape)
                logger.info(fold_bias_fullconn.shape)
                logger.info(fold_attr_imp_index.shape)
            fold_feature_weight_matrix.append(fold_attr_imp_index)

        fold_feature_weight_matrix = np.array(fold_feature_weight_matrix)
        logger.info("fold_feature_weight_matrix.shape")
        logger.info(fold_feature_weight_matrix.shape)

        # Collapse the per-map axis for every class.
        fold_feature_weight_matrix = np.sum(fold_feature_weight_matrix, axis=1)

        logger.info("fold_feature_weight_matrix final shape")
        logger.info(fold_feature_weight_matrix.shape)

        feature_weight_feature_matrix.append(fold_feature_weight_matrix)

    feature_weight_feature_matrix = np.array(feature_weight_feature_matrix)
    logger.info("feature_weight_feature_matrix.shape")
    logger.info(feature_weight_feature_matrix.shape)

    start_time = time.time()
    # Sum over folds, then rank features per class by descending weight.
    feature_weight_feature_matrix = np.sum(feature_weight_feature_matrix, axis=0)
    feature_index_feature_matrix = np.argsort(-feature_weight_feature_matrix, axis=1)
    overall_time = overall_time + time.time() - start_time
    logger.info(feature_index_feature_matrix.shape)
    logger.info(feature_index_feature_matrix[0:5, 0:6])
    logger.info(feature_weight_feature_matrix.shape)
    logger.info(feature_weight_feature_matrix[0:5, 0:6])
    logger.info("fold cnn combined rf and lda projected feature generation overall time (sec)")
    logger.info(overall_time)
    return feature_index_feature_matrix, feature_weight_feature_matrix, overall_time
Пример #9
0
def multiple_f1_value_precision_recall_accuracy(predict_y_vector,
                                                real_y_vector,
                                                logger=None):
    """Compute overall accuracy and per-class one-vs-rest F1 scores.

    Classes are every integer from min(real_y_vector) to max(real_y_vector)
    inclusive, scored one-vs-rest against the predictions. Labels are
    expected to be integers.

    Args:
        predict_y_vector: predicted integer labels.
        real_y_vector: ground-truth integer labels, same length.
        logger: optional logger; a default one is created when omitted.

    Returns:
        (accuracy, f1_value_list): overall accuracy as a float and a numpy
        array holding one F1 value per class label in ascending order.

    Raises:
        Exception: when the two vectors differ in length.
    """
    # `is None` instead of `== None` (PEP 8 identity test for None).
    if logger is None:
        logger = init_logging('')
    if len(predict_y_vector) != len(real_y_vector):
        raise Exception("Length for prediction is not same")

    predict_y_vector = np.asarray(predict_y_vector)
    real_y_vector = np.asarray(real_y_vector)
    min_class = int(min(real_y_vector))
    max_class = int(max(real_y_vector))

    instance_num = len(predict_y_vector)
    f1_value_list = []
    for i in range(min_class, max_class + 1):
        class_predict_y = predict_y_vector == i
        class_real_y = real_y_vector == i
        # One-vs-rest confusion counts, vectorized instead of a per-instance
        # python loop.
        tp = int(np.sum(class_predict_y & class_real_y))
        fp = int(np.sum(class_predict_y & ~class_real_y))
        fn = int(np.sum(~class_predict_y & class_real_y))

        if tp == 0:
            # No true positives: precision/recall are 0, so F1 is 0.
            f1_value = 0
        else:
            precision = float(tp) / float(tp + fp)
            recall = float(tp) / float(tp + fn)
            f1_value = float(
                2 * precision * recall) / float(precision + recall)
        f1_value_list.append(f1_value)
    f1_value_list = np.array(f1_value_list)

    # Overall accuracy over all instances.
    accuracy = float(np.sum(predict_y_vector == real_y_vector)) / float(
        instance_num)
    return accuracy, f1_value_list
Пример #10
0
def clever_rank(dcpc, logger=None):
    """Score every attribute by the squared L2 norm of its DCPC column.

    Args:
        dcpc: 2-D array of shape (components, attributes).
        logger: optional logger; a default one is created when omitted.

    Returns:
        Dict mapping attribute index -> squared L2 norm of its column.
    """
    if logger is None:
        logger = init_logging('')
    attr_count = dcpc.shape[1]
    # Squared column norms, keyed by attribute index.
    return {col: np.sum(np.square(dcpc[:, col])) for col in range(attr_count)}
Пример #11
0
def project_cnn_feature_combined_rf_lda_analysis(feature_matrix,
                                                 y_vector,
                                                 logger=None):
    """Score attributes per feature map by combining RF and LDA importance.

    For each of the num_map feature maps, random-forest and LDA feature
    importances are computed on the (instances x attributes) slice and added
    together.

    Args:
        feature_matrix: array of shape (num_instance, num_attribute, num_map).
        y_vector: class labels for the instances.
        logger: optional logger; a default one is created when omitted.

    Returns:
        (map_attr_imp_matrix, total_time): one combined importance row per
        map, and the summed RF + LDA analysis time in seconds.
    """
    # `is None` instead of `== None` (PEP 8 identity test for None).
    if logger is None:
        logger = init_logging('')

    num_instance, num_attribute, num_map = feature_matrix.shape
    map_attr_imp_matrix = []  # combined importance vector per map
    predict = True
    rf_time = 0
    lda_time = 0
    for i in range(num_map):
        map_feature_matrix = feature_matrix[:, :, i]
        # BUG FIX: start_time used to be set to 0, so the elapsed-time delta
        # below equaled the absolute epoch time instead of the RF runtime.
        start_time = time.time()
        # NOTE(review): this unpacking assumes rf_feature_extraction returns
        # four values led by a rank vector — confirm it matches the current
        # rf_feature_extraction signature.
        feature_vector_norm, feature_value_vector, rf_model, averaged_acc = rf_feature_extraction(
            map_feature_matrix, y_vector, predict, logger)
        rf_time = rf_time + time.time() - start_time
        if i == 0:
            logger.info(feature_value_vector.shape)

        # BUG FIX: same epoch-time bug as above for the LDA timing.
        start_time = time.time()
        lda_feature_vector_norm, lda_feature_value_vector, lda_model, lda_averaged_acc = gene_lda_feature_v2(
            map_feature_matrix, y_vector, predict, logger)
        lda_time = lda_time + time.time() - start_time
        # Combine the two importance estimates additively.
        feature_value_vector = feature_value_vector + lda_feature_value_vector

        map_attr_imp_matrix.append(feature_value_vector)

    map_attr_imp_matrix = np.array(map_attr_imp_matrix)
    logger.info(map_attr_imp_matrix.shape)

    return map_attr_imp_matrix, rf_time + lda_time
Пример #12
0
def run_cnn_lda_feature_analysis_main(feature_folder, feature_file_keyword, data_folder, data_stru, feature_postfix, logger=None):
    """Aggregate per-fold LDA feature analysis across all training files.

    Pairs every data file with the feature files that reference it, runs the
    LDA projection analysis per fold, and combines the fold rankings with
    fold_feature_combination_F_C_A.

    Returns:
        (lda_feature_matrix, overall_time): combined feature matrix and the
        total analysis time in seconds.
    """
    # `is None` instead of `== None` (PEP 8 identity test for None).
    if logger is None:
        logger = init_logging('')

    lda_weight_feature_matrix = []

    file_list = listFiles(data_folder)
    feature_file_list = listFiles(feature_folder)
    overall_time = 0
    file_count = 0
    for train_file in file_list:
        logger.info(train_file)
        file_count = file_count + 1
        for feature_file in feature_file_list:
            # Only feature files that match the keyword AND reference this
            # training file are analyzed.
            if feature_file_keyword not in feature_file or train_file not in feature_file:
                continue
            logger.info(feature_file)
            train_x_matrix, train_y_vector = readFile(data_folder + train_file)
            [fold_train_sensor_result, fold_weight_fullconn, fold_bias_fullconn] = load_obj(feature_folder + feature_file)

            logger.info(fold_train_sensor_result.shape)
            logger.info(fold_weight_fullconn.shape)
            logger.info(fold_bias_fullconn.shape)

            fold_train_sensor_result = np.squeeze(fold_train_sensor_result)
            logger.info(fold_train_sensor_result.shape)
            fold_attr_imp_index, fold_attr_imp, skip_count, fold_time = project_cnn_feature_lda_analysis(fold_train_sensor_result, train_y_vector, logger)
            overall_time = overall_time + fold_time
            logger.info(fold_attr_imp_index.shape)
            logger.info(fold_attr_imp.shape)
            logger.info("skip: " + str(skip_count))
            lda_weight_feature_matrix.append(fold_attr_imp_index)

    lda_weight_feature_matrix = np.array(lda_weight_feature_matrix)
    lda_weight_feature_matrix = lda_weight_feature_matrix.astype(int)
    logger.info(lda_weight_feature_matrix.shape)
    start_time = time.time()
    # Merge fold rankings (fold x class x attribute combination helper).
    lda_feature_matrix = fold_feature_combination_F_C_A(lda_weight_feature_matrix)
    logger.info(lda_feature_matrix.shape)
    overall_time = overall_time + time.time() - start_time
    logger.info(lda_feature_matrix.shape)
    logger.info(lda_feature_matrix[0:5, 0:6])
    return lda_feature_matrix, overall_time
Пример #13
0
def computeDCPC(mts_data, threshold=0.9, logger=None):
    """Compute descriptive common principal components (DCPC) for MTS data.

    For every instance, the singular vectors of its row correlation matrix
    are accumulated element-wise into H = sum(U.T * U); the leading singular
    vectors of H form the common components. The component count p is the
    largest per-instance index needed to reach ``threshold`` of explained
    variance.

    Args:
        mts_data: array of shape (row_num, attr_len, attr_num).
        threshold: cumulative explained-variance cutoff per instance.
        logger: optional logger; a default one is created when omitted.

    Returns:
        The first p rows of the singular-vector matrix of H.
    """
    # `is None` instead of `== None` (PEP 8 identity test for None).
    if logger is None:
        logger = init_logging('')

    row_num, attr_len, attr_num = mts_data.shape
    # print() call instead of the Python-2-only print statement.
    print(mts_data.shape)
    loading = []
    percent = []
    for r in range(row_num):
        mts_item = mts_data[r, :, :]
        corr_matrix = np.corrcoef(mts_item)
        # Constant rows produce NaN correlations; neutralize them.
        indices = np.where(np.isnan(corr_matrix))
        corr_matrix[indices] = 0
        u, s, vh = np.linalg.svd(corr_matrix, full_matrices=True)
        percent_var = (s / sum(s))
        # Index at which the cumulative explained variance reaches threshold.
        p_sum = float(0)
        for p in range(0, len(percent_var)):
            p_sum = p_sum + percent_var[p]
            if p_sum >= threshold:
                break
        percent.append(p)
        loading.append(u)
    # Use the largest per-instance component index for the common space.
    # NOTE(review): p is the zero-based index where the threshold was met,
    # and dcpc[0:p] keeps p rows — confirm the off-by-one is intended.
    p = max(percent)

    h_matrix = []
    for r in range(row_num):
        load_m = loading[r]
        # Element-wise product of U.T and U (NOT a matrix product).
        mul_load = np.multiply(load_m.T, load_m)
        if len(h_matrix) == 0:
            h_matrix = mul_load
        else:
            h_matrix = h_matrix + mul_load

    logger.info(h_matrix.shape)
    indices = np.where(np.isnan(h_matrix))
    h_matrix[indices] = 0
    dcpc, h_s, h_v = np.linalg.svd(h_matrix, full_matrices=True)
    logger.info(dcpc[0:p, :])
    return dcpc[0:p, :]
Пример #14
0
def run_dcpc_main(data_folder,
                  class_column,
                  num_classes,
                  obj_folder,
                  threshold,
                  logger=None):
    """Compute per-class DCPC matrices for every training file in a folder.

    For each 'train_*' file, the instances of every class label are run
    through computeDCPC and the resulting matrices are saved (keyed by
    label) to '<train file>_dcpc.obj' in obj_folder. Returns None; results
    are persisted as a side effect.
    """
    # `is None` instead of `== None` (PEP 8 identity test for None).
    if logger is None:
        logger = init_logging('')

    file_list = list_files(data_folder)
    overall_time = 0

    file_count = 0
    # NOTE(review): this dict is created once and reused across train files,
    # so later files can overwrite earlier labels before each save — confirm
    # whether a fresh dict per file was intended.
    out_obj_dict = {}
    for train_file in file_list:
        if "train_" not in train_file:
            continue
        logger.info(train_file)
        out_obj_file = train_file.replace('.txt', '_dcpc.obj')
        file_count = file_count + 1

        test_file = train_file.replace('train_', 'test_')

        x_matrix, y_vector = file_read_split(data_folder + train_file)
        min_class = min(y_vector)
        max_class = max(y_vector) + 1
        logger.info("x matrix tran after shape: " + str(x_matrix.shape))
        for label in range(min_class, max_class):
            label_index = np.where(y_vector == label)[0]
            label_x_matrix = x_matrix[label_index, :, :]
            logger.info("class: " + str(label))
            # print() call instead of the Python-2-only print statement.
            print("class: " + str(label))
            logger.info("x matrix tran before shape: " +
                        str(label_x_matrix.shape))
            label_dcpc = computeDCPC(label_x_matrix, threshold)
            logger.info("class: " + str(label) + " dcpc shape: " +
                        str(label_dcpc.shape))
            out_obj_dict[label] = label_dcpc
        logger.info("dcpc out obj: " + str(obj_folder + out_obj_file))
        save_obj([out_obj_dict], obj_folder + out_obj_file)
Пример #15
0
def cnn_set_flow_graph(data_stru,
                       cnn_setting,
                       input_map,
                       group_all=False,
                       logger=None):
    """Reset the TF graph and build the CNN placeholders plus the network.

    Returns the input/output placeholders, the network logits, the dropout
    placeholder, the kept feature tensors, and the checkpoint file name.
    """
    if logger is None:
        logger = init_logging('')
    # Fresh, deterministically-seeded graph for every configuration run.
    tf.reset_default_graph()
    tf.random.set_random_seed(0)

    row_count = data_stru.attr_len
    col_count = data_stru.attr_num
    class_count = data_stru.num_classes

    output_y_placeholder = tf.placeholder(tf.float32, [None, class_count])
    train_x_placeholder = tf.placeholder(
        tf.float32, [None, row_count, col_count, input_map])
    (logits_out, keep_prob_placeholder, keeped_feature_list,
     saver_file) = cnn_configure(train_x_placeholder, cnn_setting,
                                 class_count, group_all, logger)
    return (train_x_placeholder, output_y_placeholder, logits_out,
            keep_prob_placeholder, keeped_feature_list, saver_file)
Пример #16
0
def cnn_feature_lda(train_x_matrix,
                    train_y_vector,
                    predict=False,
                    logger=None):
    """Rank CNN features with a binary LDA model.

    The feature matrix is scaled column-wise by its infinity norm before a
    binary LDA model is fitted; feature weights come from the LDA scalings.

    Args:
        train_x_matrix: 2-D feature matrix (instances x features).
        train_y_vector: binary class labels.
        predict: when True, also compute class-averaged training accuracy
            (and L2-normalize the weight vector first).
        logger: optional logger; a default one is created when omitted.

    Returns:
        (feature_vector_norm, feature_value_vector, lda_model, averaged_acc):
        per-feature ranks (0 = smallest weight), the absolute LDA weights,
        the fitted model, and the averaged accuracy (-1 when predict=False).
    """
    # `is None` instead of `== None` (PEP 8 identity test for None).
    if logger is None:
        logger = init_logging('')
    # Column-wise infinity-norm scaling; zero-norm columns left untouched.
    train_norm_vector = np.linalg.norm(train_x_matrix, axis=0,
                                       ord=np.inf)[None, :]
    train_x_matrix = np.true_divide(train_x_matrix,
                                    train_norm_vector,
                                    where=(train_norm_vector != 0))
    train_x_matrix[np.isnan(train_x_matrix)] = 0
    train_x_matrix[np.isinf(train_x_matrix)] = 1
    lda_model, train_time = bi_gene_lda_model(train_x_matrix, train_y_vector)
    feature_value_vector = np.absolute(lda_model.scalings_.T[0])
    # Truthiness test instead of `== True`.
    if predict:
        feature_value_vector = preprocessing.normalize(
            feature_value_vector.reshape(1, len(feature_value_vector)),
            norm='l2')[0]
        predict_y = lda_model.predict(train_x_matrix)

        averaged_acc = averaged_class_based_accuracy(predict_y, train_y_vector)
    else:
        averaged_acc = -1

    # Rank of each feature by weight: double argsort gives, for every
    # feature, its position in the ascending weight ordering (same result
    # as the previous manual fill loop, without the python loop).
    feature_vector_norm = np.argsort(
        np.argsort(feature_value_vector)).astype(float)

    return feature_vector_norm, feature_value_vector, lda_model, averaged_acc
Пример #17
0
def lda_feature_extraction(x_matrix, y_vector, predict=False, logger=None):
    """Rank features with a prior-weighted LDA model.

    Args:
        x_matrix: 2-D feature matrix (instances x features).
        y_vector: integer class labels.
        predict: when True, evaluate the fitted model on the training data.
        logger: optional logger; a default one is created when omitted.

    Returns:
        (feature_value_vector, lda_model, f1_value, run_time); the first two
        are None and f1_value is -1 when the normalized matrix is constant
        or all-zero, so no model can be fitted.
    """
    if logger is None:
        logger = init_logging('')
    # Column-wise infinity-norm scaling; zero-norm columns left untouched.
    train_norm_vector = np.linalg.norm(x_matrix, axis=0, ord=np.inf)[None, :]
    x_matrix = np.true_divide(x_matrix,
                              train_norm_vector,
                              where=(train_norm_vector != 0))
    x_matrix[np.isnan(x_matrix)] = 0
    x_matrix[np.isinf(x_matrix)] = 1
    # Degenerate matrices cannot be fitted; bail out early.
    if x_matrix.max() == x_matrix.min():
        return None, None, -1, 0
    # BUG FIX: np.any returns a numpy bool, which is never the built-in
    # False singleton, so the old `np.any(...) is False` test never fired.
    if not np.any(x_matrix):
        return None, None, -1, 0
    # Class priors proportional to the observed label frequencies.
    prior_vector = []
    min_class = min(y_vector)
    max_class = max(y_vector) + 1
    all_count = len(y_vector)
    for i in range(min_class, max_class):
        c_count = len(np.where(y_vector == i)[0])
        prior_vector.append(float(c_count) / all_count)
    lda_model = LinearDiscriminantAnalysis(priors=prior_vector)
    start_time = time.time()
    lda_model.fit(x_matrix, y_vector)
    run_time = time.time() - start_time
    feature_value_vector = np.absolute(lda_model.scalings_.T[0])
    if predict is True:
        predict_y = lda_model.predict(x_matrix)
        accuracy, precision, recall, f1_value, tp, fp, tn, fn = f1_value_precision_recall_accuracy(
            predict_y, y_vector, 1)
    else:
        # BUG FIX: f1_value is what gets returned; it was previously left
        # unbound on this branch (NameError); -1 marks "not evaluated".
        f1_value = -1

    return feature_value_vector, lda_model, f1_value, run_time
Пример #18
0
def conf_conv_layer(layer,
                    kernel_r,
                    kernel_c,
                    input_matrix,
                    num_input_map,
                    num_output_map,
                    activation_fun=0,
                    strides_list=(1, 1, 1, 1),
                    std_value=0.1,
                    same_size=False,
                    logger=None):
    """Build one conv2d + ReLU layer with seeded random weights.

    Args:
        layer: layer index; used as the random seed and in variable names.
        kernel_r, kernel_c: kernel height and width.
        input_matrix: input tensor (batch, rows, cols, num_input_map).
        num_input_map: input channel count.
        num_output_map: output channel count.
        activation_fun: unused here; kept for interface compatibility.
        strides_list: conv2d strides. Previously accepted but ignored in
            favor of a hard-coded [1, 1, 1, 1]; it is now passed through
            (the default, now an immutable tuple, preserves old behavior).
        std_value: stddev for weight init and the constant bias value.
        same_size: when truthy, use SAME padding so the output keeps the
            input spatial size; otherwise VALID.
        logger: optional logger; a default one is created when omitted.

    Returns:
        The ReLU-activated convolution output tensor.
    """
    if logger is None:
        logger = init_logging("")
    # Deterministic per-layer initialization.
    tf.random.set_random_seed(layer)
    weight_variable = tf.Variable(tf.truncated_normal(
        [kernel_r, kernel_c, num_input_map, num_output_map], stddev=std_value),
                                  name='conv_w_' + str(layer))

    bias_variable = tf.Variable(tf.constant(std_value, shape=[num_output_map]),
                                name='conv_b_' + str(layer))
    # BUG FIX: the flag used to be compared against the string "True", which
    # a boolean can never equal, so SAME padding was unreachable.
    if same_size:
        str_padding = 'SAME'
    else:
        str_padding = 'VALID'

    ret_conv_before_act = tf.nn.conv2d(input_matrix,
                                       weight_variable,
                                       strides=list(strides_list),
                                       padding=str_padding) + bias_variable

    ret_conv = tf.nn.relu(ret_conv_before_act)
    return ret_conv
Пример #19
0
def run_load_predict_cnn(fold_keyword, model_saved_folder, feature_array, top_k, test_x_matrix, test_y_vector, data_stru, cnn_setting, group_all=True, save_obj_folder="./", logger=None):
    """Combine per-class saved binary CNN models into a multi-class prediction.

    For each class c, the checkpoint whose file name contains both
    `fold_keyword` and "class<c>_" is restored, fed the test data projected
    onto that class's top-k features, and its probability for label 1 is
    collected.  The final label is the argmax over classes.

    Args:
        fold_keyword: substring identifying the fold's model files.
        model_saved_folder: folder containing the ".index" checkpoint files.
        feature_array: per-class ranked feature index lists.
        top_k: number of top features kept per class.
        test_x_matrix: 4-D test data; features are selected on axis 2.
        test_y_vector: true class labels.
        data_stru: data descriptor; num_classes is temporarily set to 2
            while loading the binary models and restored before returning.
        cnn_setting: CNN configuration forwarded to load_model.
        group_all: forwarded to load_model.
        save_obj_folder: where the stacked probability matrix is dumped.
        logger: optional logger; a default one is created when None.

    Returns:
        (accuracy, per-class f1 list, total model-load time, total
        prediction time).

    Raises:
        Exception: when no model file matches a class/fold pair, or the
        two accuracy computations disagree.
    """
    if logger is None:
        logger = init_logging('')

    real_num_classes = data_stru.num_classes
    model_list = list_files(model_saved_folder)
    data_stru.num_classes = 2  # each saved model is a binary classifier

    load_time = 0
    test_time = 0
    multi_predict = []
    for c in range(real_num_classes):
        logger.info("Class: " + str(c))
        class_keyword = "class" + str(c) + "_"
        found_model_file = ""
        for model_file in model_list:
            if ".index" not in model_file:
                continue
            if fold_keyword not in model_file:
                continue
            if class_keyword not in model_file:
                continue
            found_model_file = model_file.replace(".index", "")
            print(found_model_file)
            break

        if found_model_file == "":
            raise Exception("Model for " + class_keyword + " and " + fold_keyword + " Not Found!!!")
        else:
            found_model_file = model_saved_folder + found_model_file
        class_feature = feature_array[c]
        class_feature = class_feature[0:top_k]
        # BUG FIX: found_model_file already includes the folder prefix; the
        # old log line prepended model_saved_folder a second time.
        logger.info("model file: " + str(found_model_file))
        logger.info("feature list: " + str(class_feature))

        # Keep only this class's top-k attributes (axis 2 of the 4-D data).
        temp_test_x_matrix = test_x_matrix[:, :, class_feature, :]
        logger.info("In run_load_predict_cnn: " + str(temp_test_x_matrix.shape))
        start_time = time.time()
        cnn_session, predict_y_proba, train_x_placeholder, keep_prob_placeholder = load_model(found_model_file, data_stru, cnn_setting, group_all, logger)
        load_time = load_time + time.time() - start_time
        start_time = time.time()
        cnn_predict_proba = load_model_predict(cnn_session, temp_test_x_matrix, predict_y_proba, train_x_placeholder, keep_prob_placeholder)
        test_time = test_time + time.time() - start_time
        multi_predict.append(cnn_predict_proba[:, 1])  # P(label == c)
        cnn_session.close()

    multi_predict = np.array(multi_predict)
    multi_predict_vector = np.argmax(multi_predict, axis=0)
    save_obj_file = save_obj_folder + fold_keyword + "_" + str(top_k) + ".out"
    save_obj([multi_predict], save_obj_file)
    logger.info("output obj saved to: " + save_obj_file)
    logger.info("multi predict matrix shape: " + str(multi_predict.shape))
    logger.info("multi predict vector shape: " + str(multi_predict_vector.shape))
    logger.info("test y vector: " + str(test_y_vector.shape))
    acc = accuracy_score(test_y_vector, multi_predict_vector)
    data_stru.num_classes = real_num_classes  # restore the caller's value
    acc1, f1_list = multiple_f1_value_precision_recall_accuracy(multi_predict_vector, test_y_vector, logger)
    # Cross-check the two accuracy implementations; use a float tolerance
    # instead of the fragile exact equality used previously.
    if abs(acc - acc1) > 1e-9:
        raise Exception("check accuracy")
    return acc, f1_list, load_time, test_time
Пример #20
0
def run_feature_svm_load_proba(model_pre,
                               test_x_matrix,
                               test_y_vector,
                               feature_array,
                               attr_num,
                               logger=None):
    """Evaluate per-class saved libsvm models and fuse them via probabilities.

    For each class i a one-vs-rest model is loaded from
    '<model_pre>_class<i>_top<k>.model', run on the test data projected
    onto that class's feature subset, and its binary prediction plus
    probability for label 1 are stored column-wise.  The fused prediction
    is produced by predict_matrix_with_proba_to_predict_accuracy.

    Args:
        model_pre: path prefix of the saved model files.
        test_x_matrix: 2-D test data (rows x columns).
        test_y_vector: true class labels.
        feature_array: (num_classes x num_features) per-class feature lists.
        attr_num: attribute count forwarded to feature_data_generation.
        logger: optional logger; a default one is created when None.

    Returns:
        (fused accuracy, total model-load time, total prediction time,
        fused prediction vector).
    """
    if logger is None:
        logger = init_logging("")
        logger.info('no log file: ')
    num_classes, num_features = feature_array.shape
    test_row, test_col = test_x_matrix.shape
    # One column per class: hard binary prediction and P(label == 1).
    svm_predict_matrix = np.zeros(test_row * num_classes).reshape(
        test_row, num_classes)
    svm_predict_proba = np.zeros(test_row * num_classes).reshape(
        test_row, num_classes)

    svm_train_time = 0  # accumulated model-load time
    svm_test_time = 0  # accumulated prediction time

    for i in range(0, num_classes):
        logger.info("class: " + str(i))
        logger.info(str(feature_array[i]))

        # Project the test data onto this class's feature subset.
        temp_test_x_matrix, temp_attr_num, temp_attr_len = feature_data_generation(
            test_x_matrix, attr_num, feature_array[i])
        model_file = model_pre + '_class' + str(i) + "_top" + str(
            temp_attr_len) + ".model"
        print(model_file)
        logger.info('model file: ' + model_file)
        start_time = time.time()
        svm_model = svm_load_model(model_file)
        temp_train_time = time.time() - start_time
        svm_train_time = svm_train_time + temp_train_time

        if i == 0:
            logger.info('sub feature data shape: ')
            logger.info(str(temp_test_x_matrix.shape))

        # One-vs-rest relabeling: this class -> 1, everything else -> 0.
        temp_test_y_vector = np.where(test_y_vector == i, 1, 0)
        temp_test_x_matrix = temp_test_x_matrix.tolist()
        temp_test_y_vector = temp_test_y_vector.astype(int).tolist()

        start_time = time.time()
        temp_predict_y, temp_accuracy, temp_predict_y_proba = svm_predict(
            temp_test_y_vector, temp_test_x_matrix, svm_model, '-b 1')
        temp_test_time = time.time() - start_time
        # BUG FIX: prediction time was previously added to svm_train_time,
        # so the returned svm_test_time was always 0.
        svm_test_time = svm_test_time + temp_test_time

        temp_accuracy, temp_precision, temp_recall, temp_f1_value, temp_tp, temp_fp, temp_tn, temp_fn = f1_value_precision_recall_accuracy(
            temp_predict_y, temp_test_y_vector)
        temp_predict_y = np.array(temp_predict_y)
        temp_predict_y_proba = np.array(temp_predict_y_proba)

        logger.info("Accuracy for class " + str(i) + ": " + str(temp_accuracy))
        logger.info("Recall for class " + str(i) + ": " + str(temp_recall))
        logger.info("Precision for class " + str(i) + ": " +
                    str(temp_precision))
        logger.info("F1 Score for class " + str(i) + ": " + str(temp_f1_value))
        logger.info("Prediction matrix:")
        logger.info("TP=" + str(temp_tp) + " FP=" + str(temp_fp))
        logger.info("TN=" + str(temp_tn) + " FN=" + str(temp_fn))

        svm_predict_matrix[:, i] = temp_predict_y
        svm_predict_proba[:, i] = temp_predict_y_proba[:, 1]
        logger.info('=============')

    svm_accuracy, svm_predict_y = predict_matrix_with_proba_to_predict_accuracy(
        svm_predict_matrix, svm_predict_proba, test_y_vector)
    return svm_accuracy, svm_train_time, svm_test_time, svm_predict_y
Пример #21
0
def run_feature_projected_cnn(train_x_matrix,
                              train_y_vector,
                              test_x_matrix,
                              test_y_vector,
                              data_stru,
                              cnn_setting,
                              feature_dict,
                              top_k,
                              saver_file_profix='',
                              class_id=-1,
                              logger=None):
    """Train one binary CNN per class on that class's projected features.

    For each class (all classes when class_id == -1, otherwise only
    class_id), the labels are binarized one-vs-rest, the data is projected
    onto the class's top_k features, and a 2-class CNN is trained via
    cnn_train.  The per-class positive probabilities are stacked column-wise
    and fused into a final prediction.

    Args:
        train_x_matrix, test_x_matrix: 4-D data; features selected on axis 2.
        train_y_vector, test_y_vector: class label vectors.
        data_stru: data descriptor; num_classes is temporarily set to 2 and
            restored before returning.
        cnn_setting: CNN configuration (num_classes is also forced to 2).
        feature_dict: per-class ranked feature lists.
        top_k: number of top features kept per class.
        saver_file_profix: prefix for the per-class checkpoint files.
        class_id: -1 to train every class, otherwise a single class index.
        logger: optional logger; a default one is created when None.

    Returns:
        (fused accuracy, per-class f1 list, fused prediction vector,
        per-class train times, per-class test times, probability matrix).
    """
    if logger is None:
        logger = init_logging('')
    method = 'cnn'

    real_num_classes = data_stru.num_classes
    data_stru.num_classes = 2
    # NOTE(review): cnn_setting.num_classes is forced to 2 here but never
    # restored (data_stru.num_classes is) — confirm callers do not rely on
    # the old value afterwards.
    cnn_setting.num_classes = 2
    cnn_setting.feature_method = 'none'
    num_classes = 2

    train_row, attr_len, attr_num, input_map = train_x_matrix.shape
    test_row, attr_len, attr_num, input_map = test_x_matrix.shape

    # One probability column per real class.
    all_predict_matrix = np.zeros(test_row * real_num_classes).reshape(
        test_row, real_num_classes)

    if class_id == -1:
        min_class = min(train_y_vector)
        max_class = max(train_y_vector) + 1
    else:
        min_class = class_id
        max_class = class_id + 1

    saver_file_profix = saver_file_profix + '_class'

    keep_saver_file = ''
    all_f1_value = []
    all_train_time = []
    all_test_time = []
    for i in range(min_class, max_class):
        logger.info('class: ' + str(i))
        # One-vs-rest relabeling for the current class.
        temp_train_y_vector = np.where(train_y_vector == i, 1, 0)
        temp_test_y_vector = np.where(test_y_vector == i, 1, 0)
        class_saver_profix = saver_file_profix + str(i)

        fold_positive_len = len(np.where(temp_train_y_vector == 1)[0])
        fold_negative_len = len(temp_train_y_vector) - fold_positive_len

        logger.info("=====")
        logger.info("positive class labels length: " + str(fold_positive_len))
        logger.info("negative class labels length: " + str(fold_negative_len))
        class_feature = feature_dict[i]
        class_feature = class_feature[0:top_k]
        print("class: " + str(i))
        print("number of features: " + str(top_k))
        print("Top features list: " + str(class_feature))
        logger.info("Top feature list: " + str(class_feature))

        # Project both splits onto this class's top-k features (axis 2).
        temp_train_x_matrix = train_x_matrix[:, :, class_feature, :]
        temp_test_x_matrix = test_x_matrix[:, :, class_feature, :]
        temp_train_y_matrix = y_vector_to_matrix(temp_train_y_vector,
                                                 num_classes)
        temp_test_y_matrix = y_vector_to_matrix(temp_test_y_vector,
                                                num_classes)

        # The TF graph is built once, on the first class, and reused.
        if i == min_class:
            train_x_placeholder, output_y_placeholder, predict_y_prob, keep_prob_placeholder, keeped_feature_list, saver_file = cnn_set_flow_graph(
                data_stru, cnn_setting, input_map, False, logger)
            keep_saver_file = saver_file

        saver_file = cnn_setting.temp_obj_folder + class_saver_profix + keep_saver_file + "_top" + str(
            top_k)
        print(saver_file)
        class_eval_value, class_train_time, class_test_time, class_predict_prob, fold_saver_file, fold_obj_file = cnn_train(
            temp_train_x_matrix, temp_train_y_matrix, temp_test_x_matrix,
            temp_test_y_matrix, num_classes, cnn_setting, train_x_placeholder,
            output_y_placeholder, predict_y_prob, keep_prob_placeholder,
            keeped_feature_list, saver_file, logger)

        class_predict_y = np.argmax(class_predict_prob, axis=1)
        class_accuracy, precision, recall, class_f1, tp, fp, tn, fn = f1_value_precision_recall_accuracy(
            class_predict_y, temp_test_y_vector, 1)
        if str(class_eval_value) == 'nan':
            class_eval_value = 0
            class_f1 = 0
        logger.info(method + " f1 for class " + str(i) + ": " + str(class_f1))
        logger.info(method + " accuracy for class " + str(i) + ": " +
                    str(class_accuracy))
        logger.info(method + ' model saved: ' + fold_saver_file)
        all_f1_value.append(class_f1)
        all_train_time.append(class_train_time)
        all_test_time.append(class_test_time)
        all_predict_matrix[:, i] = class_predict_prob[:, 1]
    all_accuracy, all_predict_y = predict_matrix_with_prob_to_predict_accuracy(
        all_predict_matrix, test_y_vector)
    data_stru.num_classes = real_num_classes  # restore the caller's value
    return all_accuracy, all_f1_value, all_predict_y, all_train_time, all_test_time, all_predict_matrix
Пример #22
0
def run_feature_rf_use_proba(train_x_matrix,
                             train_y_vector,
                             test_x_matrix,
                             test_y_vector,
                             feature_array,
                             attr_num,
                             logger=None):
    """Train one random forest per class on projected features and fuse them.

    For each class i the labels are binarized one-vs-rest, both splits are
    projected onto that class's feature subset, and a 20-tree RF is trained
    via run_rf with probability output.  The per-class hard predictions and
    P(label == 1) columns are fused by
    predict_matrix_with_proba_to_predict_accuracy.

    Args:
        train_x_matrix, test_x_matrix: 2-D data matrices.
        train_y_vector, test_y_vector: true class labels.
        feature_array: (num_classes x num_features) per-class feature lists.
        attr_num: attribute count forwarded to feature_data_generation.
        logger: optional logger; a default one is created when None.

    Returns:
        (fused accuracy, total train time, total test time, fused
        prediction vector).
    """
    if logger is None:
        logger = init_logging("")
        logger.info('no log file: ')
    num_classes, num_features = feature_array.shape
    test_row, test_col = test_x_matrix.shape
    # One column per class: hard binary prediction and P(label == 1).
    rf_predict_matrix = np.zeros(test_row * num_classes).reshape(
        test_row, num_classes)
    rf_predict_proba = np.zeros(test_row * num_classes).reshape(
        test_row, num_classes)

    rf_train_time = 0
    rf_test_time = 0

    for i in range(0, num_classes):
        logger.info("class: " + str(i))
        logger.info(str(feature_array[i]))
        # Project both splits onto this class's feature subset.
        temp_train_x_matrix, temp_attr_num, temp_attr_len = feature_data_generation(
            train_x_matrix, attr_num, feature_array[i])
        temp_test_x_matrix, temp_attr_num, temp_attr_len = feature_data_generation(
            test_x_matrix, attr_num, feature_array[i])

        if i == 0:
            logger.info('sub feature data shape: ')
            logger.info(str(temp_train_x_matrix.shape))
            logger.info(str(temp_test_x_matrix.shape))

        # One-vs-rest relabeling for the current class.
        temp_train_y_vector = np.where(train_y_vector == i, 1, 0)
        temp_test_y_vector = np.where(test_y_vector == i, 1, 0)

        temp_accuracy, temp_predict_y, temp_predict_y_proba, temp_train_time, temp_test_time = run_rf(
            temp_train_x_matrix, temp_train_y_vector, temp_test_x_matrix,
            temp_test_y_vector, 20, True)
        temp_accuracy, temp_precision, temp_recall, temp_f1_value, temp_tp, temp_fp, temp_tn, temp_fn = f1_value_precision_recall_accuracy(
            temp_predict_y, temp_test_y_vector)

        logger.info("Accuracy for class " + str(i) + ": " + str(temp_accuracy))
        logger.info("Recall for class " + str(i) + ": " + str(temp_recall))
        logger.info("Precision for class " + str(i) + ": " +
                    str(temp_precision))
        logger.info("F1 Score for class " + str(i) + ": " + str(temp_f1_value))
        logger.info("Prediction matrix:")
        logger.info("TP=" + str(temp_tp) + " FP=" + str(temp_fp))
        logger.info("TN=" + str(temp_tn) + " FN=" + str(temp_fn))

        rf_train_time = rf_train_time + temp_train_time
        rf_test_time = rf_test_time + temp_test_time

        rf_predict_matrix[:, i] = temp_predict_y
        rf_predict_proba[:, i] = temp_predict_y_proba[:, 1]
        logger.info('=============')

    rf_accuracy, rf_predict_y = predict_matrix_with_proba_to_predict_accuracy(
        rf_predict_matrix, rf_predict_proba, test_y_vector)
    return rf_accuracy, rf_train_time, rf_test_time, rf_predict_y
Пример #23
0
    #k = 2
    #clever_rank(train_x_matrix, k, threshold)
    #dcpc = computeDCPC(train_x_matrix, threshold)
    #clever_cluster(dcpc, k)
    #sdfs

    data_keyword = 'dsa'
    data_keyword = 'rar'
    data_keyword = 'arc'
    data_keyword = 'asl'
    data_keyword = 'fixed_arc'

    data_folder = '../../data/' + data_keyword + '/train_test_10_fold/'
    data_folder = '../../data/' + data_keyword + '/train_test_3_fold/'
    data_folder = '../../data/' + data_keyword + '/train_test_1_fold/'
    class_column = 0
    num_classes = 18
    threshold = 0.9
    obj_folder = '../../object/' + data_keyword + '/tkde_2005_dcpc/'
    obj_folder = init_folder(obj_folder)
    log_folder = '../../log/' + data_keyword + '/tkde_2005/'
    log_folder = init_folder(log_folder)
    #log_file = log_folder + data_keyword + "_tkde_dcpc.log"
    #logger = init_logging(log_file)
    #run_dcpc_main(data_folder, class_column, num_classes, obj_folder, threshold, logger)

    method = 0
    log_file = log_folder + data_keyword + "_dcpc_to_score.log"
    logger = init_logging(log_file)
    run_dcpc_processing(obj_folder, num_classes, method, logger)
Пример #24
0
def cnn_train(train_x_matrix,
              train_y_matrix,
              test_x_matrix,
              test_y_matrix,
              num_classes,
              cnn_setting,
              input_x_placeholder,
              output_y_placeholder,
              logits_out,
              keep_prob,
              keeped_feature_list,
              saver_file="./",
              logger=None):
    """Train the CNN graph ending at `logits_out` and keep the best model.

    Runs mini-batch Adam training until the test-set evaluation value
    reaches cnn_setting.stop_threshold or cnn_setting.max_iter epochs
    elapse.  Every 100 steps the test set is evaluated and the session is
    checkpointed to `saver_file` whenever the score improves; the best
    checkpoint is restored before the final prediction.

    When cnn_setting.eval_method is 'f1', each mini-batch is re-balanced so
    every class is represented and a class-weighted cross entropy
    (tf.nn.weighted_cross_entropy_with_logits) is minimized; a true F1
    tensor is only built for the binary case, otherwise plain accuracy is
    used as the evaluation value.

    Returns:
        (best_eval_value, train_run_time, test_run_time, cnn_predict_proba,
        saver_file, feature_obj_file) — feature_obj_file is '' when
        cnn_setting.feature_method == 'none'; otherwise intermediate
        feature tensors for train/test are dumped to it via save_obj.
    """
    if logger is None:
        logger = init_logging('')
    min_class = 0
    eval_method = cnn_setting.eval_method
    batch_size = cnn_setting.batch_size
    stop_threshold = cnn_setting.stop_threshold
    max_iter = cnn_setting.max_iter
    feature_method = cnn_setting.feature_method
    feature_obj_file = cnn_setting.out_obj_folder + saver_file
    saver_file = cnn_setting.out_model_folder + saver_file
    # Prediction / accuracy tensors shared by both evaluation modes.
    predict_y_proba = tf.nn.softmax(logits_out)
    prediction = tf.argmax(predict_y_proba, 1)
    actual = tf.argmax(output_y_placeholder, 1)
    correct_prediction = tf.equal(prediction, actual)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    if eval_method == 'f1':
        # Index of every class's training rows, used later to top up
        # under-represented classes in each batch.
        train_y_vector = np.argmax(train_y_matrix, axis=1)
        train_class_index_dict, train_min_length, train_max_length = class_label_vector_checking(
            train_y_vector)
        min_class = 0
        max_class = max(train_y_vector)
        num_classes = max_class + 1
        if max_class == 1:
            # Binary case: build a real F1 tensor from TP/TN/FP/FN counts.
            TP = tf.count_nonzero(prediction * actual, dtype=tf.float32)
            TN = tf.count_nonzero((prediction - 1) * (actual - 1),
                                  dtype=tf.float32)
            FP = tf.count_nonzero(prediction * (actual - 1), dtype=tf.float32)
            FN = tf.count_nonzero((prediction - 1) * actual, dtype=tf.float32)
            precision = (TP) / (TP + FP)
            recall = (TP) / (TP + FN)
            f1 = (2 * precision * recall) / (precision + recall)
            eval_method_value = f1
            eval_method_keyword = "f1"
        else:
            # Multi-class: fall back to accuracy but keep batch balancing.
            eval_method_value = accuracy
            eval_method_keyword = "acc with batch"
        # Per-class weights are fed each step via this placeholder.
        coefficient_placeholder = tf.placeholder(tf.float32,
                                                 shape=[num_classes])
        cross_entropy = tf.reduce_mean(
            tf.nn.weighted_cross_entropy_with_logits(
                targets=output_y_placeholder,
                logits=logits_out,
                pos_weight=coefficient_placeholder))
    else:
        cross_entropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                labels=output_y_placeholder, logits=logits_out))
        eval_method_value = accuracy
        eval_method_keyword = "acc"
    #print cross_entropy.get_shape()
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
    cnn_session = tf.InteractiveSession()
    cnn_session.run(tf.global_variables_initializer())

    test_eval_value = 0
    best_eval_value = 0
    i = 0  # global step counter
    start = 0
    epoch = 0
    end = batch_size
    batch_each_class = int(batch_size / num_classes)
    overall_len = len(train_y_matrix)
    saver = tf.train.Saver()
    train_run_time = 0
    # Deterministic shuffling: the epoch number seeds each permutation.
    np.random.seed(epoch)
    batch_index = np.random.permutation(overall_len)
    logger.info("Random Epoch:" + str(epoch) + str(batch_index[0:5]))
    # Counts how often each class was missing from a raw batch.
    f1_unbalance_count = np.zeros(num_classes)
    # NOTE(review): second_chance and re_init are never read below.
    second_chance = False
    re_init = False
    while (test_eval_value < stop_threshold):
        if start >= overall_len:
            # Epoch rollover: reshuffle and restart from the beginning.
            # NOTE(review): this logs batch_index from the *previous*
            # epoch because the permutation is regenerated on the next
            # line — confirm whether the log order is intentional.
            start = 0
            end = start + batch_size
            epoch = epoch + 1
            np.random.seed(epoch)
            logger.info("Random Epoch:" + str(epoch) + str(batch_index[0:5]))
            batch_index = np.random.permutation(overall_len)
        elif end > overall_len:
            end = overall_len
        batch_x_matrix = train_x_matrix[batch_index[start:end], :, :, :]
        batch_y_matrix = train_y_matrix[batch_index[start:end], :]

        #print 'batch_x_matrix shape'
        #print batch_x_matrix.shape
        #print batch_y_matrix.shape
        if eval_method == 'f1':
            if i == 0:
                logger.info("Batch controlled")
            ### Normal BATCH Weight
            #batch_y_vector = np.argmax(batch_y_matrix, axis=1)
            #batch_class_index_dict, batch_min_length, batch_max_length = class_label_vector_checking(batch_y_vector)
            #coefficients_vector = []
            #batch_class_index_dict_keys = batch_class_index_dict.keys()
            #for c_label in range(min_class, max_class+1):
            #    if c_label not in batch_class_index_dict_keys:
            #        add_index_vector_len = 0.1
            #    else:
            #        add_index_vector_len = len(batch_class_index_dict[c_label])
            #    coefficients_vector.append(float(batch_max_length)/float(add_index_vector_len))
            #coefficients_vector = np.array(coefficients_vector)
            ### End of Normal BATCH Weight
            # BATCH_CONTROLLED
            # Re-balance the batch: ensure every class contributes roughly
            # batch_each_class rows, adding rows for missing/scarce classes
            # and trimming classes above 2 * batch_each_class.
            batch_y_vector = np.argmax(batch_y_matrix, axis=1)
            batch_class_index_dict, batch_min_length, batch_max_length = class_label_vector_checking(
                batch_y_vector)
            if i < 3:
                logger.info("class index before: ")
                logger.info(batch_class_index_dict)
            coefficients_vector = []
            batch_class_index_dict_keys = batch_class_index_dict.keys()
            for c_label in range(min_class, max_class + 1):
                #print "class: " + str(c_label)
                #print class_label_vector_checking
                if c_label not in batch_class_index_dict_keys:
                    # Class absent from the raw batch: pull rows for it
                    # from the full training set.
                    f1_unbalance_count[
                        c_label] = f1_unbalance_count[c_label] + 1
                    c_label_index = train_class_index_dict[c_label]
                    c_label_index_len = len(c_label_index)
                    add_index_vector_len = 0
                    if c_label_index_len > batch_each_class:
                        add_index_vector = np.random.choice(c_label_index_len,
                                                            batch_each_class,
                                                            replace=False)
                        if (i < 3):
                            logger.info("add index vector for c " +
                                        str(c_label))
                            logger.info(add_index_vector)
                        add_index_vector_len = len(add_index_vector)
                        batch_x_matrix = np.concatenate(
                            (batch_x_matrix, train_x_matrix[
                                c_label_index[add_index_vector], :, :, :]),
                            axis=0)
                        batch_y_matrix = np.concatenate(
                            (batch_y_matrix,
                             train_y_matrix[c_label_index[add_index_vector], :]
                             ),
                            axis=0)
                    else:
                        # Fewer rows than the quota: take all of them.
                        batch_x_matrix = np.concatenate(
                            (batch_x_matrix,
                             train_x_matrix[c_label_index, :, :, :]),
                            axis=0)
                        batch_y_matrix = np.concatenate(
                            (batch_y_matrix, train_y_matrix[c_label_index, :]),
                            axis=0)
                        add_index_vector_len = c_label_index_len
                else:
                    batch_class_index = batch_class_index_dict[c_label]
                    add_index_vector_len = len(batch_class_index)
                    c_label_index = train_class_index_dict[c_label]
                    c_label_index_len = len(c_label_index)
                    if add_index_vector_len < batch_each_class:
                        # Under-represented: top up to the per-class quota.
                        add_count = batch_each_class - add_index_vector_len
                        if c_label_index_len > add_count:
                            add_index_vector = np.random.choice(
                                c_label_index_len, add_count, replace=False)
                            if (i < 3):
                                logger.info("add index vector for c " +
                                            str(c_label))
                                logger.info(add_index_vector)
                            add_index_vector_len = add_index_vector_len + len(
                                add_index_vector)
                            batch_x_matrix = np.concatenate(
                                (batch_x_matrix, train_x_matrix[
                                    c_label_index[add_index_vector], :, :, :]),
                                axis=0)
                            batch_y_matrix = np.concatenate(
                                (batch_y_matrix, train_y_matrix[
                                    c_label_index[add_index_vector], :]),
                                axis=0)
                        else:
                            batch_x_matrix = np.concatenate(
                                (batch_x_matrix,
                                 train_x_matrix[c_label_index, :, :, :]),
                                axis=0)
                            batch_y_matrix = np.concatenate(
                                (batch_y_matrix,
                                 train_y_matrix[c_label_index, :]),
                                axis=0)
                            add_index_vector_len = add_index_vector_len + c_label_index_len
                    elif add_index_vector_len > 2 * batch_each_class:
                        # Over-represented: randomly drop the excess rows.
                        remove_count = (add_index_vector_len -
                                        2 * batch_each_class)
                        remove_index_vector = np.random.choice(
                            batch_class_index, remove_count, replace=False)
                        add_index_vector_len = add_index_vector_len - len(
                            remove_index_vector)
                        batch_x_matrix = np.delete(batch_x_matrix,
                                                   remove_index_vector,
                                                   axis=0)
                        batch_y_matrix = np.delete(batch_y_matrix,
                                                   remove_index_vector,
                                                   axis=0)
                        # Recompute indices since row positions shifted.
                        batch_y_vector = np.argmax(batch_y_matrix, axis=1)
                        batch_class_index_dict, batch_min_length, batch_max_length = class_label_vector_checking(
                            batch_y_vector)
                coefficients_vector.append(float(add_index_vector_len))
            #print "End of F1"

            # Inverse-frequency weights: the largest class gets weight 1,
            # smaller classes get proportionally larger weights.
            coefficients_vector = np.array(coefficients_vector)
            batch_max_len = float(max(coefficients_vector))
            coefficients_vector = batch_max_len / coefficients_vector
            if i < 3:
                batch_y_vector = np.argmax(batch_y_matrix, axis=1)
                batch_class_index_dict, batch_min_length, batch_max_length = class_label_vector_checking(
                    batch_y_vector)
                logger.info("class index after: ")
                logger.info(batch_class_index_dict)
                logger.info("coefficient vector: ")
                logger.info(coefficients_vector)

            start_time = time.time()
            train_step.run(
                feed_dict={
                    input_x_placeholder: batch_x_matrix,
                    output_y_placeholder: batch_y_matrix,
                    coefficient_placeholder: coefficients_vector,
                    keep_prob: 1
                })
            train_run_time = train_run_time + time.time() - start_time
        else:
            start_time = time.time()
            train_step.run(
                feed_dict={
                    input_x_placeholder: batch_x_matrix,
                    output_y_placeholder: batch_y_matrix,
                    keep_prob: 1
                })
            train_run_time = train_run_time + time.time() - start_time
        if i % 100 == 0:
            # Periodic evaluation: log a slice of the first conv layer's
            # weights (sanity check that training is moving), then score
            # the test set and checkpoint on improvement.
            fir_weight_variable = tf.get_default_graph().get_tensor_by_name(
                "conv_w_0:0")
            logger.info("fir weight")
            logger.info(fir_weight_variable.get_shape())
            fir_weight_var_val = cnn_session.run(fir_weight_variable)
            logger.info(fir_weight_var_val[0, 0:5, 0, 0])
            test_eval_value = eval_method_value.eval(
                feed_dict={
                    input_x_placeholder: test_x_matrix,
                    output_y_placeholder: test_y_matrix,
                    keep_prob: 1
                })
            if str(test_eval_value) == 'nan':
                test_eval_value = 0
            print_str = "step " + str(
                i) + ", testing " + eval_method_keyword + ": " + str(
                    test_eval_value)
            logger.info(print_str)
            if best_eval_value < test_eval_value:
                # Save the variables to disk.
                best_eval_value = test_eval_value
                save_path = saver.save(cnn_session, saver_file)
                print_str = "Model saved in file: " + save_path + ' at iteration: ' + str(
                    i)
                logger.info(print_str)

        i = i + 1
        start = end
        end = end + batch_size
        if epoch > max_iter:
            logger.info("best eval value at epoch: " + str(epoch))
            logger.info("best eval value to break")
            logger.info(best_eval_value)
            break

    # Final evaluation; restore the best checkpoint if the last state is
    # worse than the best one seen during training.
    start_time = time.time()
    test_eval_value = eval_method_value.eval(
        feed_dict={
            input_x_placeholder: test_x_matrix,
            output_y_placeholder: test_y_matrix,
            keep_prob: 1
        })
    test_run_time = time.time() - start_time
    if test_eval_value < best_eval_value:
        cnn_session.close()
        cnn_session = tf.InteractiveSession()
        saver.restore(cnn_session, saver_file)
    else:
        best_eval_value = test_eval_value

    #if best_eval_value == 0:
    #    return
    logger.info("Running iteration: %d" % (i))
    logger.info("final best " + eval_method_keyword + ": " +
                str(best_eval_value))
    logger.info(f1_unbalance_count)

    cnn_predict_proba = cnn_session.run(predict_y_proba,
                                        feed_dict={
                                            input_x_placeholder: test_x_matrix,
                                            keep_prob: 1.0
                                        })
    logger.info("CNN model saved: " + str(saver_file))

    if cnn_setting.feature_method == 'none':
        # No feature extraction requested: return without dumping features.
        cnn_session.close()
        return best_eval_value, train_run_time, test_run_time, cnn_predict_proba, saver_file, ''

    #keeped_feature_value_list = []
    logger.info("feature value generation")
    #for feature_placeholder in keeped_feature_list:
    #    feature_value = feature_placeholder.eval(feed_dict={input_x_placeholder: train_x_matrix, keep_prob: 1.0})
    #    keeped_feature_value_list.append(feature_value)
    #    logger.info(feature_value.shape)
    test_keeped_feature_value_list = cnn_session.run(keeped_feature_list,
                                                     feed_dict={
                                                         input_x_placeholder:
                                                         test_x_matrix,
                                                         keep_prob: 1.0
                                                     })
    logger.info('test feature list ready')
    # Evaluate the training-set features in chunks of 1000 rows to bound
    # memory usage.
    start = 0
    end = 0
    train_row = len(train_x_matrix)
    train_obj_list = []
    while (start < train_row):
        logger.info(start)
        end = start + 1000
        if end > train_row:
            end = train_row
        keep_obj = cnn_session.run(keeped_feature_list[0],
                                   feed_dict={
                                       input_x_placeholder:
                                       train_x_matrix[start:end, :, :, :],
                                       keep_prob:
                                       1.0
                                   })
        train_obj_list.append(keep_obj)
        start = end
    #keeped_feature_value_list = cnn_session.run(keeped_feature_list, feed_dict={input_x_placeholder: train_x_matrix, keep_prob: 1.0})
    logger.info('train feature list ready')
    logger.info(
        "The order of feature value list: fir_out_conv_no_act, fir_out_conv, fir_weight, fir_bias, last_conv, weight_full, bias_full"
    )
    logger.info("All features saved to ")
    logger.info("CNN feature list saved to: " + feature_obj_file)
    save_obj([train_obj_list, test_keeped_feature_value_list],
             feature_obj_file)
    cnn_session.close()
    return best_eval_value, train_run_time, test_run_time, cnn_predict_proba, saver_file, feature_obj_file
Пример #25
0
    #y, x = svm_read_problem('/Users/ivan/Platform/python/libsvm-3.22/heart_scale')
    #
    #
    #
    #train_x_matrix = np.array(x[0:10])
    #train_y_vector = np.array(y[0:10]).astype(np.float64)
    #
    #test_x_matrix = np.array(x[10:100])
    #test_y_vector = np.array(y[10:100]).astype(np.float64)
    #print train_y_vector
    #distance_matrix, predict_y_vector, train_time, test_time = run_knn_with_dist(train_x_matrix, train_y_vector, test_x_matrix, test_y_vector, n_neighbors)

    save_file = 'test.model'

    logger = init_logging('')

    accuracy, predict_y, predict_y_proba, train_time, test_time = run_libsvm(
        train_x_matrix, train_y_vector, test_x_matrix, test_y_vector, logger,
        True, save_file)

    sdfs

    model = svm_load_model(save_file)
    predict_y, predict_acc, predict_val = svm_predict(test_y_vector.tolist(),
                                                      test_x_matrix.tolist(),
                                                      model, '-b 1')

    print predict_acc
    print predict_y
    print predict_val
Пример #26
0
def cnn_configure(train_x_placeholder,
                  cnn_setting,
                  num_classes,
                  group_all=False,
                  logger=None):
    """Build the TF1 conv/pool/fully-connected graph for a CNN classifier.

    Args:
        train_x_placeholder: 4-D input placeholder (batch, rows, cols, maps).
        cnn_setting: configuration object providing conv_kernel_list,
            pool_rate_list, feature_num_list, activation_fun, std_value,
            same_size and input_map.
        num_classes: number of output classes of the final layer.
        group_all: if True, the first conv layer spans all input columns.
        logger: optional logger; a default one is created when None.

    Returns:
        Tuple (logits_out, keep_prob_placeholder, keeped_feature_list,
        saver_file): output logits, dropout keep-probability placeholder,
        a list holding the last conv output tensor (features kept before the
        fully connected layer), and a checkpoint file name that encodes the
        layer geometry.
    """
    if logger is None:
        logger = init_logging('')

    # CNN parameters taken from the configuration object.
    conv_kernel_list = cnn_setting.conv_kernel_list
    pool_rate_list = cnn_setting.pool_rate_list
    feature_num_list = cnn_setting.feature_num_list
    activation_fun = cnn_setting.activation_fun
    std_value = cnn_setting.std_value
    same_size = cnn_setting.same_size
    conv_row_num = len(conv_kernel_list)
    saver_file = ''

    keeped_feature_list = []

    num_input_map = cnn_setting.input_map
    strides_list = [1, 1, 1, 1]

    for i in range(0, conv_row_num):
        logger.info('layer: ' + str(i) + " input:")
        logger.info(train_x_placeholder.get_shape())
        conv_row_kernel = conv_kernel_list[i, 0]
        conv_col_kernel = conv_kernel_list[i, 1]

        train_x_row = int(train_x_placeholder.get_shape()[1])
        train_x_col = int(train_x_placeholder.get_shape()[2])

        # Clamp the row kernel to the data height; negative means "full height".
        if conv_row_kernel < 0 or conv_row_kernel > train_x_row:
            conv_row_kernel = train_x_row

        num_output_map = feature_num_list[i]
        # First layer optionally spans every input column; otherwise clamp
        # the column kernel to the data width (negative means "full width").
        if i == 0 and group_all:
            conv_col_kernel = train_x_col
        elif conv_col_kernel > train_x_col or conv_col_kernel < 0:
            conv_col_kernel = train_x_col

        # Encode the layer geometry into the checkpoint file name.
        saver_file = saver_file + "_c" + str(conv_row_kernel) + "_" + str(
            conv_col_kernel)
        out_conv = conf_conv_layer(i, conv_row_kernel, conv_col_kernel,
                                   train_x_placeholder, num_input_map,
                                   num_output_map, activation_fun,
                                   strides_list, std_value, same_size, logger)

        logger.info("Conv output: " + str(out_conv.get_shape()))
        pool_row_kernel = pool_rate_list[i, 0]
        pool_col_kernel = pool_rate_list[i, 1]

        saver_file = saver_file + "_p" + str(pool_row_kernel) + "_" + str(
            pool_col_kernel)

        out_conv_row = int(out_conv.get_shape()[1])
        out_conv_col = int(out_conv.get_shape()[2])

        if pool_row_kernel > 0 and pool_col_kernel > 0:
            # Clamp pooling kernels to the conv output size, with a warning.
            if pool_row_kernel > out_conv_row:
                warning_str = "Warning: given pooling row number " + str(pool_row_kernel) + \
                    " is bigger than the data row number " + str(out_conv_row)
                logger.info(warning_str)
                warning_str = "Setting the pooling row number to be the data row number"
                logger.info(warning_str)
                pool_row_kernel = out_conv_row
            if pool_col_kernel > out_conv_col:
                # BUG FIX: the original message printed the row count
                # (out_conv_row) instead of the column count here.
                warning_str = "Warning: given pooling column number " + \
                    str(pool_col_kernel) + \
                    " is bigger than the data column number " + \
                    str(out_conv_col)
                logger.info(warning_str)
                warning_str = "Setting the pooling column number to be the data column number"
                logger.info(warning_str)
                pool_col_kernel = out_conv_col
            train_x_placeholder = conf_pool_layer(out_conv, pool_row_kernel,
                                                  pool_col_kernel, same_size)
            logger.info("Pooling output: " +
                        str(train_x_placeholder.get_shape()))
        else:
            # Non-positive pooling rate disables pooling for this layer.
            train_x_placeholder = out_conv
        num_input_map = num_output_map

    saver_file = saver_file + '.ckpt'
    #############################################
    # Fully connected layer on top of the last conv/pool output.
    last_out_conv = train_x_placeholder
    # Only save the matrix before the fully connected layer.
    keeped_feature_list.append(last_out_conv)
    logger.info("Feature result shape")
    logger.info(last_out_conv.get_shape())
    second_feature_num = int(last_out_conv.get_shape()[1] *
                             last_out_conv.get_shape()[2] *
                             last_out_conv.get_shape()[3])
    output_feature_num = 400  # fixed width of the hidden FC layer

    # Seed so the fully connected weights are reproducible across runs.
    tf.random.set_random_seed(0)
    weight_fullconn = tf.Variable(
        tf.truncated_normal([second_feature_num, output_feature_num],
                            stddev=std_value))
    logger.info("full conn weight shape")
    logger.info(weight_fullconn.get_shape())
    bias_fullconn = tf.Variable(
        tf.constant(std_value, shape=[output_feature_num]))
    h_pool2_flat = tf.reshape(last_out_conv, [-1, second_feature_num])
    output_fullconn_no_act = tf.matmul(h_pool2_flat,
                                       weight_fullconn) + bias_fullconn

    output_fullconn = tf.nn.relu(output_fullconn_no_act)
    logger.info('last full connect layer output:')
    logger.info(str(output_fullconn.get_shape()))

    # Dropout between the hidden FC layer and the output layer.
    keep_prob_placeholder = tf.placeholder(tf.float32)
    output_fullconn_drop = tf.nn.dropout(output_fullconn,
                                         keep_prob_placeholder)

    logits_out = conf_out_layer(output_fullconn_drop, output_feature_num,
                                num_classes, std_value)
    return logits_out, keep_prob_placeholder, keeped_feature_list, saver_file
Пример #27
0
def run_feature_knn_use_proba(train_x_matrix,
                              train_y_vector,
                              test_x_matrix,
                              test_y_vector,
                              feature_array,
                              attr_num,
                              n_neighbors,
                              class_id=-1,
                              logger=None):
    """One-vs-rest kNN classification using per-class feature subsets.

    For each class c, the train/test matrices are projected onto
    feature_array[c], labels are binarized to (c vs rest), and kNN with
    probability output is run. Column c of the score matrix receives the
    positive-class probability; the final multi-class prediction is derived
    from that matrix.

    Args:
        train_x_matrix, test_x_matrix: 2-D data matrices (rows = instances).
        train_y_vector, test_y_vector: integer class label vectors.
        feature_array: (num_classes, num_features) per-class feature indexes.
        attr_num: attribute count, forwarded to feature_data_generation.
        n_neighbors: k for the kNN classifier.
        class_id: if >= 0, evaluate only this single class; -1 means all.
        logger: optional logger; a default one is created when None.

    Returns:
        Tuple (knn_accuracy, knn_train_time, knn_test_time, knn_predict_y).
    """
    if logger is None:
        logger = init_logging("")
        logger.info('no log file: ')

    num_classes, num_features = feature_array.shape
    test_row, test_col = test_x_matrix.shape
    # Column c holds P(class == c) for every test instance.
    knn_predict_matrix = np.zeros((test_row, num_classes))

    knn_train_time = 0
    knn_test_time = 0

    proba = True  # request probability estimates from run_knn
    if class_id == -1:
        min_class = min(train_y_vector)
        max_class = max(train_y_vector) + 1
    else:
        min_class = class_id
        max_class = class_id + 1
    for i in range(min_class, max_class):
        logger.info('class: ' + str(i))
        logger.info(str(feature_array[i]))
        # Project data onto the feature subset selected for class i.
        temp_train_x_matrix, temp_attr_num, temp_attr_len = feature_data_generation(
            train_x_matrix, attr_num, feature_array[i])
        temp_test_x_matrix, temp_attr_num, temp_attr_len = feature_data_generation(
            test_x_matrix, attr_num, feature_array[i])
        # Binarize labels: 1 for class i, 0 for everything else.
        temp_train_y_vector = np.where(train_y_vector == i, 1, 0)
        temp_test_y_vector = np.where(test_y_vector == i, 1, 0)
        if i == 0:
            logger.info('sub feature data shape: ')
            logger.info(str(temp_train_x_matrix.shape))
            logger.info(str(temp_test_x_matrix.shape))

        temp_accuracy, temp_predict_y, temp_predict_y_proba, temp_train_time, temp_test_time = run_knn(
            temp_train_x_matrix, temp_train_y_vector, temp_test_x_matrix,
            temp_test_y_vector, n_neighbors, proba)

        temp_accuracy, temp_precision, temp_recall, temp_f1_value, temp_tp, temp_fp, temp_tn, temp_fn = f1_value_precision_recall_accuracy(
            temp_predict_y, temp_test_y_vector)
        logger.info("Accuracy for class " + str(i) + ": " + str(temp_accuracy))
        logger.info("Recall for class " + str(i) + ": " + str(temp_recall))
        logger.info("Precision for class " + str(i) + ": " +
                    str(temp_precision))
        logger.info("F1 Score for class " + str(i) + ": " + str(temp_f1_value))
        logger.info("Prediction matrix:")
        logger.info("TP=" + str(temp_tp) + " FP=" + str(temp_fp))
        logger.info("TN=" + str(temp_tn) + " FN=" + str(temp_fn))

        knn_train_time = knn_train_time + temp_train_time
        knn_test_time = knn_test_time + temp_test_time

        # Probability of the positive (== class i) label.
        knn_predict_matrix[:, i] = temp_predict_y_proba[:, 1]
        logger.info('=============')

    # NOTE(review): the same matrix is passed for both the prediction and the
    # probability arguments (preserved from the original). Compare with
    # run_feature_svm_use_proba, which passes separate matrices -- confirm
    # the intended second argument.
    knn_accuracy, knn_predict_y = predict_matrix_with_proba_to_predict_accuracy(
        knn_predict_matrix, knn_predict_matrix, test_y_vector)

    return knn_accuracy, knn_train_time, knn_test_time, knn_predict_y
Пример #28
0
def run_feature_projected_classification(train_x_matrix,
                                         train_y_vector,
                                         test_x_matrix,
                                         test_y_vector,
                                         feature_array,
                                         top_k,
                                         method,
                                         class_id=-1,
                                         logger=None):
    """One-vs-rest classification on data projected to top-k class features.

    For each class c, the 4-D input is sliced to the top_k attributes in
    feature_array[c], flattened to 2-D, relabeled as (c vs rest) and fed to
    the classifier chosen by `method`. Column c of the prediction matrix
    receives the positive-class probability.

    Args:
        train_x_matrix, test_x_matrix: 4-D arrays
            (rows, attr_len, attr_num, input_map).
        train_y_vector, test_y_vector: integer class label vectors.
        feature_array: (num_classes, attr_num) per-class ranked attributes.
        top_k: number of top-ranked attributes to keep per class.
        method: one of 'knn', 'rf', 'libsvm'.
        class_id: if >= 0, evaluate only this single class; -1 means all.
        logger: optional logger; a default one is created when None.

    Returns:
        Tuple (all_accuracy, all_f1_value, all_predict_y, all_train_time,
        all_test_time, all_predict_matrix).

    Raises:
        ValueError: if `method` is not one of the supported classifiers.
    """
    if logger is None:
        logger = init_logging('')

    train_row, attr_len, attr_num, input_map = train_x_matrix.shape
    test_row, attr_len, attr_num, input_map = test_x_matrix.shape
    real_num_classes, attr_num = feature_array.shape
    # Column c holds P(class == c) for every test instance.
    all_predict_matrix = np.zeros((test_row, real_num_classes))

    # Width of the flattened per-class feature matrix.
    feature_col = attr_len * top_k * input_map

    if class_id == -1:
        min_class = min(train_y_vector)
        max_class = max(train_y_vector) + 1
    else:
        min_class = class_id
        max_class = class_id + 1

    n_neighbors = 1    # k for the kNN variant
    samples_leaf = 20  # min_samples_leaf for the random-forest variant
    prob = True        # request probability estimates

    all_f1_value = []
    all_train_time = []
    all_test_time = []
    for i in range(min_class, max_class):
        logger.info('class: ' + str(i))
        # Binarize labels: 1 for class i, 0 for everything else.
        temp_train_y_vector = np.where(train_y_vector == i, 1, 0)
        temp_test_y_vector = np.where(test_y_vector == i, 1, 0)

        fold_positive_len = len(np.where(temp_train_y_vector == 1)[0])
        fold_negative_len = len(temp_train_y_vector) - fold_positive_len

        logger.info("=====")
        logger.info("positive class labels length: " + str(fold_positive_len))
        logger.info("negative class labels length: " + str(fold_negative_len))
        # Keep only the top_k ranked attributes for this class.
        class_feature = feature_array[i]
        class_feature = class_feature[0:top_k]
        logger.info("feature list: " + str(class_feature))

        temp_train_x_matrix = train_x_matrix[:, :, class_feature, :]
        temp_test_x_matrix = test_x_matrix[:, :, class_feature, :]
        temp_train_x_matrix = temp_train_x_matrix.reshape(
            train_row, feature_col)
        temp_test_x_matrix = temp_test_x_matrix.reshape(test_row, feature_col)

        if method == 'knn':
            class_accuracy, class_predict_y, class_predict_prob, class_train_time, class_test_time = run_knn(
                temp_train_x_matrix, temp_train_y_vector, temp_test_x_matrix,
                temp_test_y_vector, n_neighbors, prob)
        elif method == 'rf':
            class_accuracy, class_predict_y, class_predict_prob, class_train_time, class_test_time = run_rf(
                temp_train_x_matrix, temp_train_y_vector, temp_test_x_matrix,
                temp_test_y_vector, samples_leaf, prob)
        elif method == 'libsvm':
            class_accuracy, class_predict_y, class_predict_prob, class_train_time, class_test_time = run_libsvm(
                temp_train_x_matrix, temp_train_y_vector, temp_test_x_matrix,
                temp_test_y_vector, logger, prob, '', True)
        else:
            # BUG FIX: the original fell through and raised NameError on the
            # undefined class_predict_y; fail fast with a clear message.
            raise ValueError("unsupported method: " + str(method))

        class_accuracy, precision, recall, class_f1, tp, fp, tn, fn = f1_value_precision_recall_accuracy(
            class_predict_y, temp_test_y_vector, 1)

        logger.info(method + " f1 for class " + str(i) + ": " + str(class_f1))
        logger.info(method + " accuracy for class " + str(i) + ": " +
                    str(class_accuracy))

        all_f1_value.append(class_f1)
        all_train_time.append(class_train_time)
        all_test_time.append(class_test_time)
        all_predict_matrix[:, i] = class_predict_prob[:, 1]
    all_accuracy, all_predict_y = predict_matrix_with_prob_to_predict_accuracy(
        all_predict_matrix, test_y_vector)
    return all_accuracy, all_f1_value, all_predict_y, all_train_time, all_test_time, all_predict_matrix
Пример #29
0
def run_feature_svm_use_proba(train_x_matrix,
                              train_y_vector,
                              test_x_matrix,
                              test_y_vector,
                              feature_array,
                              attr_num,
                              logger=None,
                              save_pre=''):
    """One-vs-rest SVM classification using per-class feature subsets.

    For each class c, the data is projected onto feature_array[c], labels are
    binarized to (c vs rest), the training set is rebalanced, and an sklearn
    libsvm model with probability output is trained. Column c of the
    prediction matrix receives the predicted binary label and column c of the
    probability matrix the positive-class probability; the final multi-class
    prediction is derived from both.

    Args:
        train_x_matrix, test_x_matrix: 2-D data matrices (rows = instances).
        train_y_vector, test_y_vector: integer class label vectors.
        feature_array: (num_classes, num_features) per-class feature indexes.
        attr_num: attribute count, forwarded to feature_data_generation.
        logger: optional logger; a default one is created when None.
        save_pre: prefix used to build per-class model file names (the name
            is only logged here; run_sklearn_libsvm does not receive it).

    Returns:
        Tuple (svm_accuracy, svm_train_time, svm_test_time, svm_predict_y).
    """
    if logger is None:
        logger = init_logging("")
        logger.info('no log file: ')
    num_classes, num_features = feature_array.shape
    test_row, test_col = test_x_matrix.shape
    # Column c holds the binary prediction / P(class == c) per test instance.
    svm_predict_matrix = np.zeros((test_row, num_classes))
    svm_predict_proba = np.zeros((test_row, num_classes))

    svm_train_time = 0
    svm_test_time = 0

    proba = True  # request probability estimates

    # Negative:positive cap used by banlanced_binary_processing (sic).
    banlanced_ratio = 5

    for i in range(0, num_classes):
        logger.info("class: " + str(i))
        logger.info(str(feature_array[i]))
        # Project data onto the feature subset selected for class i.
        temp_train_x_matrix, temp_attr_num, temp_attr_len = feature_data_generation(
            train_x_matrix, attr_num, feature_array[i])
        temp_test_x_matrix, temp_attr_num, temp_attr_len = feature_data_generation(
            test_x_matrix, attr_num, feature_array[i])
        if i == 0:
            logger.info('sub feature data shape: ')
            logger.info(str(temp_train_x_matrix.shape))
            logger.info(str(temp_test_x_matrix.shape))

        # Binarize labels: 1 for class i, 0 for everything else.
        temp_train_y_vector = np.where(train_y_vector == i, 1, 0)
        temp_test_y_vector = np.where(test_y_vector == i, 1, 0)

        # Rebalance the heavily skewed one-vs-rest training set.
        temp_train_x_matrix, temp_train_y_vector = banlanced_binary_processing(
            temp_train_x_matrix, temp_train_y_vector, banlanced_ratio)

        save_file = save_pre + "_class" + str(i) + "_top" + str(
            temp_attr_num) + ".model"

        logger.info('svm saved to ' + save_file)
        temp_accuracy, temp_predict_y, temp_predict_y_proba, temp_train_time, temp_test_time = run_sklearn_libsvm(
            temp_train_x_matrix, temp_train_y_vector, temp_test_x_matrix,
            temp_test_y_vector, logger, proba)
        temp_accuracy, temp_precision, temp_recall, temp_f1_value, temp_tp, temp_fp, temp_tn, temp_fn = f1_value_precision_recall_accuracy(
            temp_predict_y, temp_test_y_vector)
        temp_predict_y = np.array(temp_predict_y)
        temp_predict_y_proba = np.array(temp_predict_y_proba)

        logger.info("Accuracy for class " + str(i) + ": " + str(temp_accuracy))
        logger.info("Recall for class " + str(i) + ": " + str(temp_recall))
        logger.info("Precision for class " + str(i) + ": " +
                    str(temp_precision))
        logger.info("F1 Score for class " + str(i) + ": " + str(temp_f1_value))
        logger.info("Prediction matrix:")
        logger.info("TP=" + str(temp_tp) + " FP=" + str(temp_fp))
        logger.info("TN=" + str(temp_tn) + " FN=" + str(temp_fn))

        svm_train_time = svm_train_time + temp_train_time
        svm_test_time = svm_test_time + temp_test_time

        svm_predict_matrix[:, i] = temp_predict_y
        # Probability of the positive (== class i) label.
        svm_predict_proba[:, i] = temp_predict_y_proba[:, 1]
        logger.info('=============')

    svm_accuracy, svm_predict_y = predict_matrix_with_proba_to_predict_accuracy(
        svm_predict_matrix, svm_predict_proba, test_y_vector)
    return svm_accuracy, svm_train_time, svm_test_time, svm_predict_y