Пример #1
0
def get_input_data(epathName, efileName, tolFlag, splitFlag):
    """Assemble embedding-plus-feature arrays and optionally split them.

    For the last sub-index (``subSum - 1``) the pickled embedding is loaded,
    concatenated column-wise with either the "tol" or the "delta" feature
    table, and written into slice ``[:, subSum - 1, :]`` of zero-initialised
    arrays.  Every other slice along axis 1 stays at zero because the loop
    only visits the last index.

    Args:
        epathName: First argument passed to ``load_pickle`` for the embedding.
        efileName: File-name prefix; ``"_<idx>.pkl"`` is appended to it.
        tolFlag: Truthy to join ``tolFeature_<idx>.pkl``, falsy to join
            ``deltaFeature_<idx>.pkl``.
        splitFlag: Truthy to return an 80/20 ``train_test_split`` seeded with
            ``RANDOM_SEED``; falsy (the unsupervised case) to return the
            full arrays.

    Returns:
        ``(trainX, trainY, testX, testY)``.  When ``splitFlag`` is falsy the
        two test entries are the integer ``0``.
    """
    tolX = np.zeros((SAMPLE_SIZE, subSum, FeatureSize))
    tolY = np.zeros((SAMPLE_SIZE, subSum, 1))
    y_cols_name = ['label']
    for idx in range(subSum - 1, subSum):
        embedding = load_pickle(epathName, efileName + "_%d.pkl" % idx)
        embedding_df = pd.DataFrame(embedding)
        # Load only the feature table that is actually joined; reading the
        # other pickle as well would be wasted disk I/O.
        if tolFlag:
            feature_df = load_pickle(
                tolFeaturePathName, "/tolFeature_%d.pkl" % idx)
        else:
            feature_df = load_pickle(
                deltaFeaturePathName, "/deltaFeature_%d.pkl" % idx)
        # Merge the embedding with the selected feature set.
        aggreFeature = pd.concat([embedding_df, feature_df], axis=1)
        x_cols_name = [x for x in aggreFeature.columns if x not in y_cols_name]
        # Slice assignment copies the values into tolX / tolY, so no extra
        # deep copy of the intermediate frames is required.
        tolX[:, idx, :] = aggreFeature[x_cols_name].values
        tolY[:, idx, :] = aggreFeature[y_cols_name].values
    if splitFlag:
        trainX, testX, trainY, testY = train_test_split(
            tolX, tolY, test_size=0.2, random_state=RANDOM_SEED)
    else:  # Falsy flag is used for the unsupervised case: no split.
        trainX, trainY, testX, testY = tolX, tolY, 0, 0
    return trainX, trainY, testX, testY
Пример #2
0
def GCN_tol_embedding(i):
    """Compute and pickle a GCN embedding for every sub-index.

    For each ``idx`` in ``range(subSum)`` the function loads
    ``tolFeature_<idx>.pkl`` and ``G_<idx>.pkl``, stores the feature frame,
    label frame and adjacency matrix in the module globals ``train_x``,
    ``train_y`` and ``scipy_adj_matrix``, calls ``get_GCN_embedding`` and
    saves the result as ``fGCNembedding_<idx>.pkl``.

    Args:
        i: Offset added to the base random seed (``7 + i``).
    """
    global scipy_adj_matrix, train_x, train_y

    frame_columns = ['label', 'AF1', 'AF2', 'AF3', 'AF4',
                     'AF5', 'AF6', 'AF7', 'AF8']
    target_cols = ['label']
    input_cols = [name for name in frame_columns if name not in target_cols]
    emb_dim = 8

    for idx in range(0, subSum):
        print(idx)
        raw_features = load_pickle(
            tolFeaturePathName, "/tolFeature_%d.pkl" % idx)
        feature_frame = pd.DataFrame(raw_features, columns=frame_columns)
        graph = load_pickle(muldigPathName, "/G_%d.pkl" % idx)

        # The globals are assigned before get_GCN_embedding runs; it is
        # given none of them as arguments, so presumably it reads them.
        train_x = dcopy(feature_frame[input_cols])
        train_y = dcopy(feature_frame[target_cols])
        label_sum = train_y.sum()
        pos_cnt = int(label_sum)
        neg_cnt = int(len(train_y) - label_sum)

        scipy_adj_matrix = get_scipy_adj_matrix(graph)
        print('pos node cnts:', pos_cnt)
        print('neg node cnts:', neg_cnt, 'pos/all ratio:',
              pos_cnt / (pos_cnt + neg_cnt))

        fGCNembedding = get_GCN_embedding(
            epoch=6,
            lr=0.005,
            weight_decay=1e-6,
            esize=emb_dim,
            random_seed=7 + i,
        )
        print("finish calculate embedding data!")
        save_pickle(
            fGCNembedding,
            GCNPathName + "%d_%d" % (emb_dim, testNum),
            "/fGCNembedding_%d.pkl" % idx,
        )
Пример #3
0
def GCN_classfication(emsize):
    """Average classifier results over repeated GCN-embedding runs.

    Five times in a row, the last sub-index (``subSum - 1``) is processed:
    its features and graph are loaded, the module globals ``train_x``,
    ``train_y`` and ``scipy_adj_matrix`` are set, a GCN embedding of size
    ``emsize`` is computed with seed ``7 + run`` and pickled, the data is
    read back through ``get_input_data`` and passed to
    ``lgb_train_model_with_split``.  The first four entries of each result
    are summed and finally divided by the number of runs.

    Args:
        emsize: Embedding size forwarded to ``get_GCN_embedding`` as ``esize``.

    Returns:
        A list of four averaged values, which is also printed.
    """
    global scipy_adj_matrix, train_x, train_y

    repeats = 5
    base_seed = 7
    totals = [0, 0, 0, 0]
    frame_columns = ['label', 'AF1', 'AF2', 'AF3', 'AF4',
                     'AF5', 'AF6', 'AF7', 'AF8']
    target_cols = ['label']
    input_cols = [name for name in frame_columns if name not in target_cols]

    for run in range(repeats):
        for idx in range(subSum - 1, subSum):
            raw_features = load_pickle(tolFeaturePathName,
                                       "/tolFeature_%d.pkl" % idx)
            feature_frame = pd.DataFrame(raw_features, columns=frame_columns)
            graph = load_pickle(muldigPathName, "/G_%d.pkl" % idx)

            train_x = dcopy(feature_frame[input_cols])
            train_y = dcopy(feature_frame[target_cols])
            # The class counts are still computed although nothing below
            # reads them (the prints that used them are disabled).
            label_sum = train_y.sum()
            pos_cnt = int(label_sum)
            neg_cnt = int(len(train_y) - label_sum)

            scipy_adj_matrix = get_scipy_adj_matrix(graph)

            fGCNembedding = get_GCN_embedding(
                epoch=6,
                lr=0.005,
                weight_decay=1e-6,
                esize=emsize,
                random_seed=base_seed + run,
            )
            print("finish calculate embedding data!")
            embedding_dir = GCNPathName + "%d" % testNum
            save_pickle(fGCNembedding, embedding_dir,
                        "/fGCNembedding_%d.pkl" % idx)
            # NOTE(review): `esize` is not bound inside this function (the
            # parameter is `emsize`), so it must exist at module level or
            # this raises NameError; the six-argument call also needs a
            # matching get_input_data signature -- confirm both.
            trainX, trainY, testX, testY = get_input_data(
                embedding_dir, "/fGCNembedding", esize, True, False, True)

            # Only the slice belonging to the last sub-index is used.
            trainX_2D = trainX[:, subSum - 1, :]
            trainY_1D = trainY[:, subSum - 1, 0]
            lgb_res = lgb_train_model_with_split(pd.DataFrame(trainX_2D),
                                                 pd.DataFrame(trainY_1D),
                                                 2011)
        # Accumulate this run's result into the running totals.
        for j, running in enumerate(totals):
            totals[j] = running + lgb_res[j]

    totals = [total / repeats for total in totals]
    print(totals)
    gc.collect()
    return totals
Пример #4
0
def get_trainx_trainy():
    """Load the last sub-index's features into ``train_x`` / ``train_y``.

    Reads ``tolFeature_<subSum - 1>.pkl``, separates the ``label`` column
    from the remaining feature columns and stores deep copies of both in the
    module globals ``train_x`` and ``train_y``.  The function has no return
    statement, so it returns ``None``.
    """
    global train_x, train_y

    last_idx = subSum - 1
    frame_columns = ['label', 'AF1', 'AF2', 'AF3', 'AF4',
                     'AF5', 'AF6', 'AF7', 'AF8']
    raw_features = load_pickle(tolFeaturePathName,
                               "/tolFeature_%d.pkl" % last_idx)
    feature_frame = pd.DataFrame(raw_features, columns=frame_columns)
    # The graph is loaded but not used anywhere else in this function.
    sp_muldG = load_pickle(muldigPathName, "/G_%d.pkl" % last_idx)

    target_cols = ['label']
    input_cols = [
        name for name in feature_frame.columns if name not in target_cols
    ]
    train_x = dcopy(feature_frame[input_cols])
    train_y = dcopy(feature_frame[target_cols])

    # Positive / negative label counts; computed but not returned.
    label_sum = train_y.sum()
    pos_cnt = int(label_sum)
    neg_cnt = int(len(train_y) - label_sum)
Пример #5
0
def get_pure_feature(splitFlag):
    """Build feature/label arrays from the raw feature table alone.

    Loads ``tolFeature_<subSum - 1>.pkl`` and writes its non-label columns
    and its ``label`` column into slice ``[:, subSum - 1, :]`` of
    zero-initialised arrays; all other slices along axis 1 stay zero.

    Args:
        splitFlag: When equal to ``True`` the arrays are split 80/20 with
            ``train_test_split`` seeded by ``RANDOM_SEED``; otherwise the
            full arrays are returned.

    Returns:
        ``(trainX, trainY, testX, testY)``.  Without a split the two test
        entries are the integer ``0``.
    """
    features = np.zeros((SAMPLE_SIZE, subSum, fSize))
    labels = np.zeros((SAMPLE_SIZE, subSum, 1))
    last_idx = subSum - 1
    feature_frame = load_pickle(tolFeaturePathName,
                                "/tolFeature_%d.pkl" % last_idx)

    target_cols = ['label']
    input_cols = [
        col for col in feature_frame.columns if col not in target_cols
    ]

    features[:, last_idx, :] = dcopy(feature_frame[input_cols]).values
    labels[:, last_idx, :] = dcopy(feature_frame[target_cols]).values

    # Equality test kept as-is so non-bool flag values behave as before.
    if splitFlag == True:  # noqa: E712
        trainX, testX, trainY, testY = train_test_split(
            features, labels, test_size=0.2, random_state=RANDOM_SEED)
        return trainX, trainY, testX, testY
    return features, labels, 0, 0