Пример #1
0
def SVM_validation(X_train, y_train, X_val, y_val, sample_amount, data_ratio,
                   C_parameter, gamma_parameter):
    """Average SVM validation metrics over repeated training-set resamples.

    For each of 4 repetitions a training sample is requested from
    ``choose_data_seperately`` and passed, together with the validation set,
    to ``SVM_base_fuction`` using ``2**C_parameter`` and
    ``2**gamma_parameter`` as hyper-parameters.

    Args:
        X_train, y_train: Training pool handed to ``choose_data_seperately``.
        X_val, y_val: Validation data handed to ``SVM_base_fuction``.
        sample_amount: Total number of training samples requested per run.
        data_ratio: Label-1 : label-2 ratio used to split ``sample_amount``.
        C_parameter: Base-2 exponent of the C value.
        gamma_parameter: Base-2 exponent of the gamma value.

    Returns:
        Tuple ``(tpr, fpr, BER, f1_score)`` holding the arithmetic mean of
        each metric across the repetitions.
    """

    def _mean(values):
        return sum(values) / len(values)

    # Split the requested sample size according to data_ratio.
    first_label_count = int(sample_amount * (data_ratio / (data_ratio + 1)))
    second_label_count = int(sample_amount - first_label_count)

    repetitions = 4
    recalls = []
    false_positive_rates = []
    ber_values = []
    f1_values = []

    for _ in range(repetitions):
        features, labels = choose_data_seperately(
            X_train, y_train, first_label_count, second_label_count)
        # The fifth return value (predictions) is not needed here.
        recall, f1, ber, fp_rate, _predictions = SVM_base_fuction(
            features, labels, X_val, y_val, 2**C_parameter,
            2**gamma_parameter)
        recalls.append(recall)
        false_positive_rates.append(fp_rate)
        ber_values.append(ber)
        f1_values.append(f1)

    return (_mean(recalls), _mean(false_positive_rates), _mean(ber_values),
            _mean(f1_values))
Пример #2
0
def kNN_validation(X_train, y_train, X_val, y_val, sample_amount, data_ratio,
                   k_value):
    """Average kNN validation metrics over repeated training-set resamples.

    For each of 10 repetitions a training sample is requested from
    ``choose_data_seperately`` and evaluated against the validation set by
    ``kNN_base_function`` with ``k_value`` neighbours.

    Args:
        X_train, y_train: Training pool handed to ``choose_data_seperately``.
        X_val, y_val: Validation data handed to ``kNN_base_function``.
        sample_amount: Total number of training samples requested per run.
        data_ratio: Label-1 : label-2 ratio used to split ``sample_amount``.
        k_value: The k passed through to ``kNN_base_function``.

    Returns:
        Tuple ``(tpr, fpr, BER, f1_score)`` holding the arithmetic mean of
        each metric across the repetitions.
    """

    def _mean(values):
        return sum(values) / len(values)

    # Split the requested sample size according to data_ratio.
    first_label_count = int(sample_amount * (data_ratio / (data_ratio + 1)))
    second_label_count = int(sample_amount - first_label_count)

    repetitions = 10
    recalls = []
    false_positive_rates = []
    ber_values = []
    f1_values = []

    for _ in range(repetitions):
        features, labels = choose_data_seperately(
            X_train, y_train, first_label_count, second_label_count)
        recall, f1, ber, fp_rate = kNN_base_function(
            features, labels, X_val, y_val, k_value)
        recalls.append(recall)
        false_positive_rates.append(fp_rate)
        ber_values.append(ber)
        f1_values.append(f1)

    return (_mean(recalls), _mean(false_positive_rates), _mean(ber_values),
            _mean(f1_values))
Пример #3
0
def parameter_adjust(X_train, y_train, sample_amount, data_ratio):
    """Grid-search the SVM gamma and C exponents with cross-validation.

    Every (gamma_exp, C_exp) pair from the fixed grids below is evaluated on
    a freshly requested sample: ``choose_data_seperately`` supplies the data
    and ``SVM_cross_validation`` is called with ``2**C_exp`` and
    ``2**gamma_exp``. The metrics, the exponents and the time reported by
    ``SVM_cross_validation`` are recorded in a ``CalParList``.

    Args:
        X_train, y_train: Training pool handed to ``choose_data_seperately``.
        sample_amount: Total number of samples requested per grid point.
        data_ratio: Label-1 : label-2 ratio used to split ``sample_amount``.

    Returns:
        Whatever ``CalParList.return_result()`` produces for the collected
        rows.
    """
    start = time.time()
    label_1_amount = int(sample_amount * (data_ratio / (data_ratio + 1)))
    label_2_amount = int(sample_amount - label_1_amount)
    Result_List = CalParList(3, "gamma_exp", "C_exp", "time")

    gamma_exponents = (-15, -13, -11, -9, -7, -5, -3, -1, 1, 3)
    C_exponents = (-5, -3, -1, 1, 3, 5, 7, 9, 11, 13, 15)

    for gamma_exp in gamma_exponents:
        for C_exp in C_exponents:
            # A new sample is requested for every grid point.
            sample_feature, sample_label = choose_data_seperately(
                X_train, y_train, label_1_amount, label_2_amount)

            tpr, fpr, BER, f1_score, time_var = SVM_cross_validation(
                sample_feature, sample_label, 2**C_exp, 2**gamma_exp)

            Result_List.list_append(tpr, f1_score, BER, fpr, gamma_exp, C_exp,
                                    time_var)

    print("the total executing time:%5.1fminute" %
          ((time.time() - start) / 60))
    return Result_List.return_result()
Пример #4
0
def kNN_data_ratio_adjust(training_feature, training_label, k_value):
    """Evaluate kNN cross-validation across a sweep of label-1 sample sizes.

    The label-2 amount stays at 10000 while the label-1 amount starts at
    40000 and shrinks by 5000 while it is above 10000, then by 2500, until
    it is no longer greater than 2000. For each setting,
    ``kNN_cross_validation`` is run on 5 fresh samples from
    ``choose_data_seperately`` and the averaged metrics are recorded.

    Args:
        training_feature, training_label: Training pool handed to
            ``choose_data_seperately``.
        k_value: The k passed through to ``kNN_cross_validation``.

    Returns:
        Whatever ``CalParList.return_result()`` produces for the collected
        rows (metrics, both label amounts, their ratio and the mean time).
    """
    start = time.time()
    label_1_amount = 40000
    label_2_amount = 10000

    Result_List = CalParList(4, "label_1_amount", "label_2_amount", "ratio",
                             "time")
    iter_amount = 5

    while label_1_amount > 2000:
        Cal_Result_List = CalList()
        time_list_temp = []

        for _ in range(iter_amount):
            sample_feature, sample_label = choose_data_seperately(
                training_feature, training_label, label_1_amount,
                label_2_amount)

            Precall, FPR, BER, f1_score, time_var = kNN_cross_validation(
                sample_feature, sample_label, k_value)
            # CalList.list_append takes the metrics in a different order
            # than kNN_cross_validation returns them.
            Cal_Result_List.list_append(Precall, f1_score, BER, FPR)
            time_list_temp.append(time_var)

        Precall, FPR, BER, f1_score = Cal_Result_List.list_average_cal()
        time_ave = sum(time_list_temp) / len(time_list_temp)
        Result_List.list_append(Precall, f1_score, BER, FPR, label_1_amount,
                                label_2_amount,
                                label_1_amount / label_2_amount, time_ave)

        # Coarse steps while label 1 is large, finer steps near the end.
        if label_1_amount > 10000:
            label_1_amount -= 5000
        else:
            label_1_amount -= 2500

    print("the total executing time:%5.1fminute" %
          ((time.time() - start) / 60))
    return Result_List.return_result()
Пример #5
0
def parameter_adjust(X_train, y_train, sample_amount, data_ratio):
    """Grid-search the SVM gamma and C exponents with cross-validation.

    Every (gamma_exp, C_exp) pair from the fixed grids below is evaluated on
    a freshly requested sample: ``choose_data_seperately`` supplies the data
    and ``SVM_cross_validation`` is called with ``2**C_exp`` and
    ``2**gamma_exp``.

    Args:
        X_train, y_train: Training pool handed to ``choose_data_seperately``.
        sample_amount: Total number of samples requested per grid point.
        data_ratio: Label-1 : label-2 ratio used to split ``sample_amount``.

    Returns:
        A ``pandas.DataFrame`` with one row per grid point and the columns
        ``gamma_exp``, ``C_exp``, ``f1_score``, ``TPR``, ``FPR``, ``BER``
        and ``time``.
    """
    start = time.time()
    label_1_amount = int(sample_amount * (data_ratio / (data_ratio + 1)))
    label_2_amount = int(sample_amount - label_1_amount)

    columns = ["gamma_exp", "C_exp", "f1_score", "TPR", "FPR", "BER", "time"]
    # One list per output column, filled in grid order.
    result = {column: [] for column in columns}

    gamma_exponents = (-15, -13, -11, -9, -7, -5, -3, -1, 1, 3)
    C_exponents = (-5, -3, -1, 1, 3, 5, 7, 9, 11, 13, 15)

    for gamma_exp in gamma_exponents:
        for C_exp in C_exponents:
            # A new sample is requested for every grid point.
            sample_feature, sample_label = choose_data_seperately(
                X_train, y_train, label_1_amount, label_2_amount)

            tpr, fpr, BER, f1_score, time_var = SVM_cross_validation(
                sample_feature, sample_label, 2**C_exp, 2**gamma_exp)

            result["time"].append(time_var)
            result["TPR"].append(tpr)
            result["f1_score"].append(f1_score)
            result["BER"].append(BER)
            result["FPR"].append(fpr)
            result["gamma_exp"].append(gamma_exp)
            result["C_exp"].append(C_exp)

    print("the total executing time:%5.1fminute" %
          ((time.time() - start) / 60))
    return pd.DataFrame(data=result, columns=columns)
Пример #6
0
def SVC_data_ratio_adjust(X_train, y_train, sample_amount):
    """Evaluate SVM cross-validation across a sweep of label ratios.

    ``data_ratio`` starts at 4 and is reduced each step (halved while above
    2, otherwise decreased by 0.1, or by 0.25 once below 0.8) for as long as
    it stays above 0.2. For each ratio, ``sample_amount`` is split into the
    two label amounts and ``SVM_cross_validation`` is run with C=1 and
    gamma='auto' on 5 fresh samples from ``choose_data_seperately``; the
    averaged metrics are recorded.

    Args:
        X_train, y_train: Training pool handed to ``choose_data_seperately``.
        sample_amount: Total number of samples requested per run.

    Returns:
        Whatever ``CalParList.return_result()`` produces for the collected
        rows (metrics, both label amounts, their ratio and the mean time).
    """
    start = time.time()
    Result_List = CalParList(4, "label_1_amount", "label_2_amount", "ratio",
                             "time")
    iter_amount = 5
    data_ratio = 4

    while data_ratio > 0.2:
        Cal_Result_List = CalList()
        time_list_temp = []
        label_1_amount = int(sample_amount * (data_ratio / (data_ratio + 1)))
        label_2_amount = int(sample_amount - label_1_amount)

        for _ in range(iter_amount):
            sample_feature, sample_label = choose_data_seperately(
                X_train, y_train, label_1_amount, label_2_amount)

            tpr, fpr, BER, f1_score, time_var = SVM_cross_validation(
                sample_feature, sample_label, 1, 'auto')
            time_list_temp.append(time_var)
            Cal_Result_List.list_append(tpr, f1_score, BER, fpr)

        Precall, FPR, BER, f1_score = Cal_Result_List.list_average_cal()
        time_ave = sum(time_list_temp) / len(time_list_temp)
        Result_List.list_append(Precall, f1_score, BER, FPR, label_1_amount,
                                label_2_amount,
                                label_1_amount / label_2_amount, time_ave)

        # NOTE(review): the ratio is stepped with repeated float
        # subtraction, so the comparisons against 0.8 and 0.2 are subject to
        # rounding error - confirm the intended schedule.
        if data_ratio > 2:
            data_ratio = data_ratio / 2
        elif data_ratio < 0.8:
            data_ratio = data_ratio - 0.25
        else:
            data_ratio = data_ratio - 0.1

    print("the total executing time:%5.1fminute" %
          ((time.time() - start) / 60))
    return Result_List.return_result()
Пример #7
0
def kNN_k_parameter_adjust_with_specific_data_ratio(X_train, y_train,
                                                    data_ratio):
    """Average the kNN k-sweep results over several resampled data sets.

    ``kNN_k_parameter_adjust`` is run on 5 fresh samples from
    ``choose_data_seperately`` (label 2 fixed at 20000, label 1 at
    ``int(20000 * data_ratio)``) and the ``TPR``, ``FPR``, ``f1_score`` and
    ``BER`` columns of its results are combined as a running mean.

    Args:
        X_train, y_train: Training pool handed to ``choose_data_seperately``.
        data_ratio: Label-1 : label-2 ratio applied to the fixed label-2
            amount.

    Returns:
        A ``pandas.DataFrame`` with the columns ``k_value``, ``f1_score``,
        ``TPR``, ``FPR`` and ``BER``, where the metric columns hold the
        averages over all iterations and ``k_value`` is taken from the last
        iteration's result.
    """
    iter_count = 5
    # The sample sizes do not change between iterations.
    label_1_amount = int(20000 * data_ratio)
    label_2_amount = 20000

    tpr_list = []
    fpr_list = []
    BER_list = []
    f1_score_list = []

    for count in range(iter_count):
        sample_feature, sample_label = choose_data_seperately(
            X_train, y_train, label_1_amount, label_2_amount)

        result = kNN_k_parameter_adjust(sample_feature, sample_label)

        if count == 0:
            tpr_list = result['TPR']
            fpr_list = result['FPR']
            f1_score_list = result['f1_score']
            BER_list = result['BER']
        else:
            # Incremental mean: fold the new run into the average so far.
            tpr_list = (count * tpr_list + result['TPR']) / (count + 1)
            fpr_list = (count * fpr_list + result['FPR']) / (count + 1)
            f1_score_list = (count * f1_score_list +
                             result['f1_score']) / (count + 1)
            BER_list = (count * BER_list + result['BER']) / (count + 1)

    result_total = {
        "k_value": result['k_value'],
        "TPR": tpr_list,
        "FPR": fpr_list,
        "f1_score": f1_score_list,
        "BER": BER_list
    }
    columns = ["k_value", "f1_score", "TPR", "FPR", "BER"]
    # Build the frame from the averaged values; building it from `result`
    # would return only the final iteration and discard the averaging.
    return pd.DataFrame(data=result_total, columns=columns)
Пример #8
0
def kNN_validation(X_train, y_train, X_val, y_val, sample_amount, data_ratio,
                   k_value):
    """Average kNN validation metrics over repeated training-set resamples.

    For each of 10 repetitions a training sample is requested from
    ``choose_data_seperately`` and evaluated against the validation set by
    ``kNN_base_function``; the per-run metrics are accumulated in a
    ``CalList`` and averaged by its ``list_average_cal`` method.

    Args:
        X_train, y_train: Training pool handed to ``choose_data_seperately``.
        X_val, y_val: Validation data handed to ``kNN_base_function``.
        sample_amount: Total number of training samples requested per run.
        data_ratio: Label-1 : label-2 ratio used to split ``sample_amount``.
        k_value: The k passed through to ``kNN_base_function``.

    Returns:
        The four values produced by ``CalList.list_average_cal()``, in the
        order it returns them (unpacked here as Precall, FPR, BER, f1_score).
    """
    # Split the requested sample size according to data_ratio.
    first_label_count = int(sample_amount * (data_ratio / (data_ratio + 1)))
    second_label_count = int(sample_amount - first_label_count)

    accumulator = CalList()
    repetitions = 10

    for _ in range(repetitions):
        features, labels = choose_data_seperately(
            X_train, y_train, first_label_count, second_label_count)
        recall, f1, ber, fp_rate = kNN_base_function(
            features, labels, X_val, y_val, k_value)
        accumulator.list_append(recall, f1, ber, fp_rate)

    mean_recall, mean_fpr, mean_ber, mean_f1 = accumulator.list_average_cal()
    return mean_recall, mean_fpr, mean_ber, mean_f1
Пример #9
0
def SVM_validation(X_train, y_train, X_val, y_val, sample_amount, data_ratio,
                   C_parameter, gamma_parameter):
    """Average SVM validation metrics over repeated training-set resamples.

    For each of 4 repetitions a training sample is requested from
    ``choose_data_seperately`` and passed, together with the validation set,
    to ``SVM_base_fuction`` using ``2**C_parameter`` and
    ``2**gamma_parameter``; the per-run metrics are accumulated in a
    ``CalList`` and averaged by its ``list_average_cal`` method.

    Args:
        X_train, y_train: Training pool handed to ``choose_data_seperately``.
        X_val, y_val: Validation data handed to ``SVM_base_fuction``.
        sample_amount: Total number of training samples requested per run.
        data_ratio: Label-1 : label-2 ratio used to split ``sample_amount``.
        C_parameter: Base-2 exponent of the C value.
        gamma_parameter: Base-2 exponent of the gamma value.

    Returns:
        The four values produced by ``CalList.list_average_cal()``, in the
        order it returns them (unpacked here as Precall, FPR, BER, f1_score).
    """
    # Split the requested sample size according to data_ratio.
    first_label_count = int(sample_amount * (data_ratio / (data_ratio + 1)))
    second_label_count = int(sample_amount - first_label_count)

    accumulator = CalList()
    repetitions = 4

    for _ in range(repetitions):
        features, labels = choose_data_seperately(
            X_train, y_train, first_label_count, second_label_count)
        # The fifth return value (predictions) is not needed here.
        recall, f1, ber, fp_rate, _predictions = SVM_base_fuction(
            features, labels, X_val, y_val, 2**C_parameter,
            2**gamma_parameter)
        accumulator.list_append(recall, f1, ber, fp_rate)

    mean_recall, mean_fpr, mean_ber, mean_f1 = accumulator.list_average_cal()
    return mean_recall, mean_fpr, mean_ber, mean_f1
Пример #10
0
def kNN_data_ratio_adjust(training_feature, training_label, k_value):
    """Evaluate kNN cross-validation across a sweep of label-1 sample sizes.

    The label-2 amount stays at 10000 while the label-1 amount starts at
    40000 and shrinks by 5000 while it is above 10000, then by 2500, until
    it is no longer greater than 2000. For each setting,
    ``kNN_cross_validation`` is run on 5 fresh samples from
    ``choose_data_seperately`` and the metrics and times are averaged.

    Args:
        training_feature, training_label: Training pool handed to
            ``choose_data_seperately``.
        k_value: The k passed through to ``kNN_cross_validation``.

    Returns:
        A ``pandas.DataFrame`` with one row per label-1 amount and the
        columns ``label_1_amount``, ``label_2_amount``,
        ``label 1: label 2 ratio``, ``f1_score``, ``TPR``, ``FPR``, ``BER``
        and ``time``.
    """

    def _mean(values):
        return sum(values) / len(values)

    start = time.time()
    label_1_amount = 40000
    label_2_amount = 10000
    iter_amount = 5

    columns = [
        "label_1_amount", "label_2_amount", "label 1: label 2 ratio",
        "f1_score", "TPR", "FPR", "BER", "time"
    ]
    # One list per output column, filled once per label-1 amount.
    result = {column: [] for column in columns}

    while label_1_amount > 2000:
        tpr_list_temp = []
        fpr_list_temp = []
        BER_list_temp = []
        f1_score_list_temp = []
        time_list_temp = []

        for _ in range(iter_amount):
            sample_feature, sample_label = choose_data_seperately(
                training_feature, training_label, label_1_amount,
                label_2_amount)

            Precall, FPR, BER, f1_score, time_var = kNN_cross_validation(
                sample_feature, sample_label, k_value)

            time_list_temp.append(time_var)
            tpr_list_temp.append(Precall)
            fpr_list_temp.append(FPR)
            BER_list_temp.append(BER)
            f1_score_list_temp.append(f1_score)

        result["label_1_amount"].append(label_1_amount)
        result["label_2_amount"].append(label_2_amount)
        result["label 1: label 2 ratio"].append(
            label_1_amount / label_2_amount)
        result["TPR"].append(_mean(tpr_list_temp))
        result["FPR"].append(_mean(fpr_list_temp))
        result["BER"].append(_mean(BER_list_temp))
        result["f1_score"].append(_mean(f1_score_list_temp))
        result["time"].append(_mean(time_list_temp))

        # Coarse steps while label 1 is large, finer steps near the end.
        if label_1_amount > 10000:
            label_1_amount -= 5000
        else:
            label_1_amount -= 2500

    print("the total executing time:%5.1fminute" %
          ((time.time() - start) / 60))
    return pd.DataFrame(data=result, columns=columns)
Пример #11
0
def SVC_data_ratio_adjust(X_train, y_train, sample_amount):
    """Evaluate SVM cross-validation across a sweep of label ratios.

    ``data_ratio`` starts at 4 and is reduced each step (halved while above
    2, otherwise decreased by 0.1, or by 0.25 once below 0.8) for as long as
    it stays above 0.2. For each ratio, ``sample_amount`` is split into the
    two label amounts and ``SVM_cross_validation`` is run with C=1 and
    gamma='auto' on 5 fresh samples from ``choose_data_seperately``; the
    metrics and times are averaged.

    Args:
        X_train, y_train: Training pool handed to ``choose_data_seperately``.
        sample_amount: Total number of samples requested per run.

    Returns:
        A ``pandas.DataFrame`` with one row per ratio and the columns
        ``label_1_amount``, ``label_2_amount``, ``label 1: label 2 ratio``,
        ``f1_score``, ``TPR``, ``FPR``, ``BER`` and ``time``.
    """

    def _mean(values):
        return sum(values) / len(values)

    start = time.time()
    iter_amount = 5
    data_ratio = 4

    columns = [
        "label_1_amount", "label_2_amount", "label 1: label 2 ratio",
        "f1_score", "TPR", "FPR", "BER", "time"
    ]
    # One list per output column, filled once per ratio.
    result = {column: [] for column in columns}

    while data_ratio > 0.2:
        tpr_list_temp = []
        fpr_list_temp = []
        BER_list_temp = []
        f1_score_list_temp = []
        time_list_temp = []
        label_1_amount = int(sample_amount * (data_ratio / (data_ratio + 1)))
        label_2_amount = int(sample_amount - label_1_amount)

        for _ in range(iter_amount):
            sample_feature, sample_label = choose_data_seperately(
                X_train, y_train, label_1_amount, label_2_amount)

            tpr, fpr, BER, f1_score, time_var = SVM_cross_validation(
                sample_feature, sample_label, 1, 'auto')

            time_list_temp.append(time_var)
            tpr_list_temp.append(tpr)
            fpr_list_temp.append(fpr)
            BER_list_temp.append(BER)
            f1_score_list_temp.append(f1_score)

        result["label_1_amount"].append(label_1_amount)
        result["label_2_amount"].append(label_2_amount)
        result["label 1: label 2 ratio"].append(
            label_1_amount / label_2_amount)
        result["TPR"].append(_mean(tpr_list_temp))
        result["FPR"].append(_mean(fpr_list_temp))
        result["BER"].append(_mean(BER_list_temp))
        result["f1_score"].append(_mean(f1_score_list_temp))
        result["time"].append(_mean(time_list_temp))

        # NOTE(review): the ratio is stepped with repeated float
        # subtraction, so the comparisons against 0.8 and 0.2 are subject to
        # rounding error - confirm the intended schedule.
        if data_ratio > 2:
            data_ratio = data_ratio / 2
        elif data_ratio < 0.8:
            data_ratio = data_ratio - 0.25
        else:
            data_ratio = data_ratio - 0.1

    print("the total executing time:%5.1fminute" %
          ((time.time() - start) / 60))
    return pd.DataFrame(data=result, columns=columns)