Example #1
0
def run_self_CFS(project):
    """Within-project defect prediction with SMOTE oversampling and CFS.

    Splits *project* data 60/40, balances the training fold with SMOTE,
    selects features via correlation-based feature selection (CFS), fits a
    RandomForest, and evaluates on the held-out fold.

    Parameters
    ----------
    project : identifier accepted by ``load_data``.

    Returns
    -------
    tuple
        (recall, precision, pf, f1, g_score, auc, pci_20, ifa) scalars.
    """
    X, y = load_data(project)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.40,
                                                        random_state=18)
    # LOC of the test rows is needed by effort-aware measures (pci_20, ifa).
    loc = X_test.CountLineCode
    df_smote = pd.concat([X_train, y_train], axis=1)
    df_smote = apply_smote(df_smote)
    df_smote, cols = apply_cfs(df_smote)
    y_train = df_smote.Bugs
    X_train = df_smote.drop('Bugs', axis=1)
    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    # cols[:-1] drops the label column that apply_cfs keeps at the end,
    # so the test set is restricted to the selected features only.
    predicted = clf.predict(X_test[cols[:-1]])
    abcd = metrices.measures(y_test, predicted, loc)
    pf = abcd.get_pf()
    recall = abcd.calculate_recall()
    precision = abcd.calculate_precision()
    f1 = abcd.calculate_f1_score()
    g_score = abcd.get_g_score()
    pci_20 = abcd.get_pci_20()
    ifa = abcd.get_ifa()
    try:
        auc = roc_auc_score(y_test, predicted)
    except ValueError:
        # roc_auc_score raises ValueError when y_test contains one class;
        # was a bare `except:` which also masked unrelated failures.
        auc = 0
    print(classification_report(y_test, predicted))
    return recall, precision, pf, f1, g_score, auc, pci_20, ifa
Example #2
0
def run_self_k(project, metric):
    """Within-project 5x5 stratified cross-validation with an SVC.

    Runs StratifiedKFold(5) five times over the data loaded for *project*
    and *metric*, balancing each training fold with SMOTE.

    NOTE(review): another ``run_self_k(project, projects, metric)`` is
    defined later in this file and will shadow this one at import time —
    confirm which definition callers expect.

    Parameters
    ----------
    project : identifier accepted by ``load_both_data``.
    metric : str
        'process' or 'product'; selects which columns approximate LOC for
        the effort-aware measures. Any other value falls back to 'process'.

    Returns
    -------
    tuple
        Lists (one entry per fold) of recall, precision, pf, f1, g_score,
        auc, pci_20, ifa, plus ``importance`` (always 0 — SVC exposes no
        feature importances).
    """
    precision = []
    recall = []
    pf = []
    f1 = []
    g_score = []
    auc = []
    pci_20 = []
    ifa = []
    # SVC has no feature_importances_; a constant 0 keeps the return shape
    # consistent with the RandomForest-based siblings. Hoisted out of the
    # loop — it was reassigned identically on every fold.
    importance = 0
    X, y = load_both_data(project, metric)
    for _ in range(5):
        skf = StratifiedKFold(n_splits=5)
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X.loc[train_index], X.loc[test_index]
            y_train, y_test = y.loc[train_index], y.loc[test_index]
            if metric == 'product':
                loc = X_test.CountLineCode
            else:
                # 'process' and any unknown metric: lines added + total.
                loc = X_test['file_la'] + X_test['file_lt']
            df_smote = pd.concat([X_train, y_train], axis=1)
            df_smote = apply_smote(df_smote)
            y_train = df_smote.Bugs
            X_train = df_smote.drop('Bugs', axis=1)
            clf = SVC()
            clf.fit(X_train, y_train)
            predicted = clf.predict(X_test)
            abcd = metrices.measures(y_test, predicted, loc)
            pf.append(abcd.get_pf())
            recall.append(abcd.calculate_recall())
            precision.append(abcd.calculate_precision())
            f1.append(abcd.calculate_f1_score())
            g_score.append(abcd.get_g_score())
            pci_20.append(abcd.get_pci_20())
            ifa.append(abcd.get_ifa())
            try:
                auc.append(roc_auc_score(y_test, predicted))
            except ValueError:
                # Single-class fold: AUC undefined; was a bare `except:`.
                auc.append(0)
    return recall, precision, pf, f1, g_score, auc, pci_20, ifa, importance
Example #3
0
def run_self_release(project, metric):
    """Release-level evaluation: train once, test on 5 commit-hash chunks.

    Fits a RandomForest on the SMOTE-balanced training release and
    evaluates on the test release split into 5 groups of unique commits.

    Parameters
    ----------
    project : identifier accepted by ``load_data_commit_level``.
    metric : str
        'process' or 'product'; selects the LOC proxy for effort-aware
        measures. Any other value falls back to 'process'.

    Returns
    -------
    tuple
        Lists (one entry per commit chunk) of recall, precision, pf, f1,
        g_score, auc, pci_20, ifa, plus the trained model's
        ``feature_importances_`` array.
    """
    precision = []
    recall = []
    pf = []
    f1 = []
    g_score = []
    auc = []
    pci_20 = []
    ifa = []
    X_train, y_train, test_df = load_data_commit_level(project, metric)
    df_smote = pd.concat([X_train, y_train], axis=1)
    df_smote = apply_smote(df_smote)
    y_train = df_smote.Bugs
    X_train = df_smote.drop('Bugs', axis=1)
    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    importance = clf.feature_importances_
    # Partition the test commits into 5 roughly equal evaluation chunks.
    unique_commits_list = np.array_split(test_df.commit_hash.unique(), 5)
    for commit_chunk in unique_commits_list:
        test_df_subset = test_df[test_df.commit_hash.isin(commit_chunk)]
        y_test = test_df_subset.Bugs
        X_test = test_df_subset.drop(['Bugs', 'commit_hash'], axis=1)
        if metric == 'product':
            loc = X_test.CountLineCode
        else:
            # 'process' and any unknown metric: lines added + total.
            loc = X_test['file_la'] + X_test['file_lt']
        predicted = clf.predict(X_test)
        abcd = metrices.measures(y_test, predicted, loc)
        pf.append(abcd.get_pf())
        recall.append(abcd.calculate_recall())
        precision.append(abcd.calculate_precision())
        f1.append(abcd.calculate_f1_score())
        g_score.append(abcd.get_g_score())
        pci_20.append(abcd.get_pci_20())
        ifa.append(abcd.get_ifa())
        try:
            auc.append(roc_auc_score(y_test, predicted))
        except ValueError:
            # Single-class chunk: AUC undefined; was a bare `except:`.
            auc.append(0)
        print(classification_report(y_test, predicted))
    return recall, precision, pf, f1, g_score, auc, pci_20, ifa, importance
Example #4
0
def run_self_k(project, projects, metric):
    """Cross-project evaluation: train on *project*, test on each of *projects*.

    Fits a RandomForest on the SMOTE-balanced data of *project* and
    evaluates it on every project in *projects*, skipping any project
    whose data fails to load or predict.

    NOTE(review): this redefines the earlier ``run_self_k(project, metric)``
    in this file; only this definition survives at import time — confirm
    that is intended.

    Parameters
    ----------
    project : identifier accepted by ``load_both_data``; the training source.
    projects : iterable of identifiers to evaluate on.
    metric : str
        'process' or 'product'; selects the LOC proxy. Any other value
        falls back to 'process'.

    Returns
    -------
    tuple
        Lists (one entry per successfully evaluated project) of recall,
        precision, pf, f1, g_score, auc, pci_20, ifa, plus the trained
        model's ``feature_importances_`` array.
    """
    precision = []
    recall = []
    pf = []
    f1 = []
    g_score = []
    auc = []
    pci_20 = []
    ifa = []
    X_train, y_train = load_both_data(project, metric)
    df_smote = pd.concat([X_train, y_train], axis=1)
    df_smote = apply_smote(df_smote)
    y_train = df_smote.Bugs
    X_train = df_smote.drop('Bugs', axis=1)
    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    importance = clf.feature_importances_
    for _project in projects:
        try:
            X_test, y_test = load_both_data(_project, metric)
            if metric == 'product':
                loc = X_test.CountLineCode
            else:
                # 'process' and any unknown metric: lines added + total.
                loc = X_test['file_la'] + X_test['file_lt']
            predicted = clf.predict(X_test)
            abcd = metrices.measures(y_test, predicted, loc)
            pf.append(abcd.get_pf())
            recall.append(abcd.calculate_recall())
            precision.append(abcd.calculate_precision())
            f1.append(abcd.calculate_f1_score())
            g_score.append(abcd.get_g_score())
            pci_20.append(abcd.get_pci_20())
            ifa.append(abcd.get_ifa())
            try:
                auc.append(roc_auc_score(y_test, predicted))
            except ValueError:
                # Single-class test set: AUC undefined; was a bare `except:`.
                auc.append(0)
        except Exception as e:
            # Best-effort across projects: log the failure and move on.
            print('error in test', _project, e)
            continue
    return recall, precision, pf, f1, g_score, auc, pci_20, ifa, importance
def run_self(project, projects):
    """Cross-project evaluation returning per-measure nan-medians.

    Trains a RandomForest on the SMOTE-balanced data of *project*, tests
    on every project in *projects* (best-effort: failures are logged and
    skipped), and returns the nan-median of each measure.

    Parameters
    ----------
    project : identifier accepted by ``load_data``; the training source.
    projects : iterable of identifiers to evaluate on.

    Returns
    -------
    tuple
        nan-medians of recall, precision, pf, f1, g, auc, pci_20, ifa,
        plus ``importance`` (always 0 here).
    """
    precision_list = []
    recall_list = []
    pf_list = []
    f1_list = []
    g_list = []
    auc_list = []
    pci_20_list = []
    ifa_list = []
    X_train, y_train = load_data(project)
    df_smote = pd.concat([X_train, y_train], axis=1)
    df_smote = apply_smote(df_smote)
    y_train = df_smote.Bugs
    X_train = df_smote.drop('Bugs', axis=1)
    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    importance = 0
    # Renamed loop variable: it previously shadowed the `project` parameter.
    for _project in projects:
        try:
            X_test, y_test = load_data(_project)
            loc = X_test.CountLineCode
            predicted = clf.predict(X_test)
            abcd = metrices.measures(y_test, predicted, loc)
            pf_list.append(abcd.get_pf())
            recall_list.append(abcd.calculate_recall())
            precision_list.append(abcd.calculate_precision())
            f1_list.append(abcd.calculate_f1_score())
            g_list.append(abcd.get_g_score())
            pci_20_list.append(abcd.get_pci_20())
            ifa_list.append(abcd.get_ifa())
            try:
                auc_list.append(roc_auc_score(y_test, predicted))
            except ValueError:
                # BUG FIX: the fallback appended 0 to precision_list,
                # corrupting the precision median and leaving auc_list
                # short; it must pad auc_list instead.
                auc_list.append(0)
        except Exception as e:
            # Best-effort: log instead of silently swallowing (the old
            # bare `except:` hid every failure, matching nothing else in
            # this file).
            print('error in test', _project, e)
            continue
    return np.nanmedian(recall_list), np.nanmedian(
        precision_list), np.nanmedian(pf_list), np.nanmedian(
            f1_list), np.nanmedian(g_list), np.nanmedian(
                auc_list), np.nanmedian(pci_20_list), np.nanmedian(
                    ifa_list), importance