def get_input_data(epathName, efileName, tolFlag, splitFlag):
    """Build the embedding + feature arrays for the last sub-graph index.

    The embedding pickle ``efileName + "_<idx>.pkl"`` is loaded from
    ``epathName`` and concatenated column-wise with one feature table.
    The ``'label'`` column becomes the target; every other column is an
    input feature.

    Args:
        epathName: Path argument forwarded to ``load_pickle`` for the
            embedding file.
        efileName: Embedding file-name prefix.
        tolFlag: If equal to ``True`` the ``tolFeature`` table is joined,
            otherwise the ``deltaFeature`` table.
        splitFlag: If equal to ``True`` the arrays are split 80/20 with
            ``train_test_split``; otherwise everything is returned as the
            training part.

    Returns:
        Tuple ``(trainX, trainY, testX, testY)``. When no split is
        requested, ``testX`` and ``testY`` are the integer ``0``.
    """
    tolX = np.zeros((SAMPLE_SIZE, subSum, FeatureSize))
    tolY = np.zeros((SAMPLE_SIZE, subSum, 1))
    y_cols_name = ['label']

    # Only index subSum - 1 is filled; all other slices stay zero.
    for idx in range(subSum - 1, subSum):
        embedding = load_pickle(epathName, efileName + "_%d.pkl" % idx)
        embedding_df = pd.DataFrame(embedding)

        # Load only the feature table that is actually merged, instead of
        # reading both pickles and discarding one. The flag is compared by
        # equality (not truthiness) so non-bool flags behave as before.
        if tolFlag == True:
            feature_df = load_pickle(
                tolFeaturePathName, "/tolFeature_%d.pkl" % idx)
        else:
            feature_df = load_pickle(
                deltaFeaturePathName, "/deltaFeature_%d.pkl" % idx)

        # Merge the two feature sets side by side.
        aggreFeature = pd.concat([embedding_df, feature_df], axis=1)

        x_cols_name = [x for x in aggreFeature.columns
                       if x not in y_cols_name]
        subX = dcopy(aggreFeature[x_cols_name]).values
        subY = dcopy(aggreFeature[y_cols_name]).values
        tolX[:, idx, :] = subX
        tolY[:, idx, :] = subY

    if splitFlag == True:
        # train_test_split returns X-train, X-test, y-train, y-test.
        trainX, testX, trainY, testY = train_test_split(
            tolX, tolY, test_size=0.2, random_state=RANDOM_SEED)
    else:
        # False is used for the unsupervised case: no held-out part.
        trainX, trainY, testX, testY = tolX, tolY, 0, 0
    return trainX, trainY, testX, testY
def GCN_tol_embedding(i):
    """Compute and save a GCN embedding for every sub-graph index.

    For each ``idx`` in ``range(subSum)`` the ``tolFeature`` table and the
    graph pickle ``G_<idx>.pkl`` are loaded, the module-level globals
    ``train_x``, ``train_y`` and ``scipy_adj_matrix`` are rebound, class
    counts are printed, and the result of ``get_GCN_embedding`` is written
    with ``save_pickle``.

    Args:
        i: Offset added to the base random seed 7.

    Returns:
        None.
    """
    # presumably get_GCN_embedding reads these globals -- TODO confirm
    global scipy_adj_matrix, train_x, train_y

    all_columns = ['label', 'AF1', 'AF2', 'AF3', 'AF4',
                   'AF5', 'AF6', 'AF7', 'AF8']
    target_columns = ['label']
    embSize = 8

    for idx in range(subSum):
        print(idx)
        raw_features = load_pickle(
            tolFeaturePathName, "/tolFeature_%d.pkl" % idx)
        feature_frame = pd.DataFrame(raw_features, columns=all_columns)
        graph = load_pickle(muldigPathName, "/G_%d.pkl" % idx)

        input_columns = [col for col in feature_frame.columns
                         if col not in target_columns]
        train_x = dcopy(feature_frame[input_columns])
        train_y = dcopy(feature_frame[target_columns])

        pos_cnt = int(train_y.sum())
        neg_cnt = int(len(train_y) - train_y.sum())
        scipy_adj_matrix = get_scipy_adj_matrix(graph)

        print('pos node cnts:', pos_cnt)
        print('neg node cnts:', neg_cnt, 'pos/all ratio:',
              pos_cnt / (pos_cnt + neg_cnt))

        fGCNembedding = get_GCN_embedding(
            epoch=6,
            lr=0.005,
            weight_decay=1e-6,
            esize=embSize,
            random_seed=7 + i,
        )
        print("finish calculate embedding data!")

        out_dir = GCNPathName + "%d_%d" % (embSize, testNum)
        save_pickle(fGCNembedding, out_dir, "/fGCNembedding_%d.pkl" % idx)
def GCN_classfication(emsize):
    """Repeat embedding + classification and average the scores.

    The procedure runs ``rcnt`` (5) times with seeds ``7 + i``. Each run
    recomputes the GCN embedding for sub-graph index ``subSum - 1``, saves
    it, reloads it through ``get_input_data`` and passes the slice for that
    index to ``lgb_train_model_with_split``. The first four values of each
    result are accumulated and finally divided by ``rcnt``.

    Side effects: rebinds the module-level globals ``scipy_adj_matrix``,
    ``train_x`` and ``train_y``, writes the embedding pickle, prints
    progress and the averaged result, and calls ``gc.collect()``.

    Args:
        emsize: Embedding size forwarded to ``get_GCN_embedding`` and to
            ``get_input_data``.

    Returns:
        List of four averaged values.
    """
    # presumably get_GCN_embedding reads these globals -- TODO confirm
    global scipy_adj_matrix, train_x, train_y

    gcn_res, rcnt = [0, 0, 0, 0], 5
    rs = 7
    columns = ['label', 'AF1', 'AF2', 'AF3', 'AF4',
               'AF5', 'AF6', 'AF7', 'AF8']
    y_cols_name = ['label']

    for i in range(rcnt):
        # Only the last sub-graph index is processed.
        for idx in range(subSum - 1, subSum):
            tolFeature = load_pickle(
                tolFeaturePathName, "/tolFeature_%d.pkl" % idx)
            tolFeature_df = pd.DataFrame(tolFeature, columns=columns)
            sp_muldG = load_pickle(muldigPathName, "/G_%d.pkl" % idx)

            x_cols_name = [x for x in tolFeature_df.columns
                           if x not in y_cols_name]
            train_x = dcopy(tolFeature_df[x_cols_name])
            train_y = dcopy(tolFeature_df[y_cols_name])
            scipy_adj_matrix = get_scipy_adj_matrix(sp_muldG)

            # A different seed per repetition so the runs are independent.
            fGCNembedding = get_GCN_embedding(epoch=6,
                                              lr=0.005,
                                              weight_decay=1e-6,
                                              esize=emsize,
                                              random_seed=rs + i)
            print("finish calculate embedding data!")
            save_pickle(fGCNembedding, GCNPathName + "%d" % testNum,
                        "/fGCNembedding_%d.pkl" % idx)

        # Fixed: the original passed `esize`, which is not bound in this
        # function; the embedding size parameter is `emsize`.
        # NOTE(review): this call passes six positional arguments; confirm
        # that they match the signature of the get_input_data definition
        # that is live at call time.
        trainX, trainY, _, _ = get_input_data(
            GCNPathName + "%d" % testNum, "/fGCNembedding", emsize,
            True, False, True)

        inputX = trainX[:, subSum - 1, :]
        labels_1d = trainY[:, subSum - 1, 0]
        lgb_res = lgb_train_model_with_split(pd.DataFrame(inputX),
                                             pd.DataFrame(labels_1d),
                                             2011)
        for j in range(len(gcn_res)):
            gcn_res[j] += lgb_res[j]

    gcn_res = [total / rcnt for total in gcn_res]
    print(gcn_res)
    gc.collect()
    return gcn_res
def get_trainx_trainy():
    """Rebind the globals ``train_x`` / ``train_y`` for the last sub-graph.

    Loads ``tolFeature_<subSum - 1>.pkl``, labels its columns, and stores
    deep copies of the attribute columns (``AF1``..``AF8``) in the
    module-level ``train_x`` and of the ``'label'`` column in ``train_y``.

    The original also loaded the graph pickle and computed positive and
    negative counts, but never used either; that dead work is removed.

    Returns:
        None. The result is communicated through the two globals.
    """
    global train_x, train_y

    idx = subSum - 1
    tolFeature = load_pickle(tolFeaturePathName, "/tolFeature_%d.pkl" % idx)
    tolFeature_df = pd.DataFrame(tolFeature,
                                 columns=[
                                     'label', 'AF1', 'AF2', 'AF3', 'AF4',
                                     'AF5', 'AF6', 'AF7', 'AF8'
                                 ])

    y_cols_name = ['label']
    x_cols_name = [x for x in tolFeature_df.columns if x not in y_cols_name]
    train_x = dcopy(tolFeature_df[x_cols_name])
    train_y = dcopy(tolFeature_df[y_cols_name])
def get_pure_feature(splitFlag):
    """Return the raw ``tolFeature`` table of the last sub-graph as arrays.

    Two zero arrays of shape ``(SAMPLE_SIZE, subSum, fSize)`` and
    ``(SAMPLE_SIZE, subSum, 1)`` are allocated and only the slice at index
    ``subSum - 1`` is filled: inputs from every column except ``'label'``,
    targets from the ``'label'`` column.

    Args:
        splitFlag: If equal to ``True`` the arrays are split 80/20 with
            ``train_test_split``; otherwise they are returned unsplit.

    Returns:
        Tuple ``(trainX, trainY, testX, testY)``. Without a split,
        ``testX`` and ``testY`` are the integer ``0``.
    """
    features = np.zeros((SAMPLE_SIZE, subSum, fSize))
    targets = np.zeros((SAMPLE_SIZE, subSum, 1))
    last = subSum - 1

    table = load_pickle(tolFeaturePathName, "/tolFeature_%d.pkl" % last)

    target_columns = ['label']
    input_columns = [name for name in table.columns
                     if name not in target_columns]

    input_values = dcopy(table[input_columns]).values
    target_values = dcopy(table[target_columns]).values
    features[:, last, :] = input_values
    targets[:, last, :] = target_values

    if splitFlag == True:
        # train_test_split returns X-train, X-test, y-train, y-test.
        x_train, x_test, y_train, y_test = train_test_split(
            features, targets, test_size=0.2, random_state=RANDOM_SEED)
        return x_train, y_train, x_test, y_test

    # No split requested: everything is training data, test parts are 0.
    return features, targets, 0, 0