def changeIndex():
    """Replace the row index of every training DataFrame with the
    identifiers listed (one per line) in ``args.index_path``."""
    if not os.path.exists(args.index_path):
        printT("don't find index file, fail to change index.")
        return
    # change index
    with open(args.index_path, 'r') as handle:
        new_index = [row.strip() for row in handle.readlines()]
    for type_i in range(data.type_num):
        data.trainDatas_df[type_i].index = new_index
def del_non_value():
    """Drop the labels whose entries are entirely NaN in every similarity
    matrix, then write each pruned matrix back to disk."""
    drop_labels = None
    for type_i in range(data.type_num):
        df = data.edges_df_all[type_i]
        # labels whose whole column is NaN in this matrix (index == columns
        # is assumed: the boolean series over columns selects matching rows)
        all_nan = df[df.isnull().all()].index
        drop_labels = all_nan if drop_labels is None else drop_labels & all_nan
    for type_i in range(data.type_num):
        data.edges_df_all[type_i] = data.edges_df_all[type_i].drop(
            index=drop_labels, columns=drop_labels)
        out_path = args.result_dir + str(type_i) + '_cor_new.csv'
        printT(" output file", out_path)
        data.edges_df_all[type_i].to_csv(out_path)
    printT(" remain shape", data.edges_df_all[0].shape)
def cutOffByMin(minNum):
    """Keep only the features whose mean value exceeds ``minNum`` in
    every data type (intersection of the passing index sets)."""
    kept = None
    for type_i in range(data.type_num):
        row_means = data.trainDatas_df[type_i].mean(1)  # per-feature mean
        passing = row_means[row_means > minNum].index
        kept = passing if kept is None else kept & passing
    for type_i in range(data.type_num):
        data.trainDatas_df[type_i] = data.trainDatas_df[type_i].loc[kept]
    printT(' remain features count:', data.trainDatas_df[0].shape[0])
    return
def run(cor_cut, limit_up, limit_down, model_4trend, path_model_cor,
        name_list, model_edges_count):
    """Enumerate the 81 four-digit trend codes, select the strongly
    positive/negative ones, and fan each dataset out to ``each_data``
    workers in a pool of 5 processes."""
    dir_int_str = {}
    trends_pos_int_list = []
    trends_neg_int_list = []
    trends81_int_list = []
    for a_ in (-1, 0, 1):
        for b_ in (-1, 0, 1):
            for c_ in (-1, 0, 1):
                for d_ in (-1, 0, 1):
                    code = a_ * 1000 + b_ * 100 + c_ * 10 + d_
                    trends81_int_list.append(code)
                    dir_int_str[code] = "%2d%2d%2d%2d" % (a_, b_, c_, d_)
                    total = a_ + b_ + c_ + d_
                    # strong positive: first step up and at least 3 ups
                    if (total > 2) & (a_ == 1):
                        trends_pos_int_list.append(code)
                    elif (total < -2) & (a_ == -1):
                        trends_neg_int_list.append(code)
    printT(dir_int_str)
    printT(trends81_int_list)
    pool = multiprocessing.Pool(processes=5)
    pending = [
        pool.apply_async(each_data,
                         (name, cor_cut, limit_up, limit_down, model_4trend,
                          path_model_cor, dir_int_str, trends_pos_int_list,
                          trends_neg_int_list, model_edges_count))
        for name in name_list
    ]
    pool.close()
    pool.join()
def draw_boxplot(
    name,
    f_list,
    data_path_model,
    data_path_p_model,
    out_path_pic_model,
):
    """Draw one boxplot panel per entry of ``f_list`` for dataset ``name``.

    For each feature set and each of the 5 stages, the correlation matrix
    and its p-values are loaded, entries with p > 0.01 are zeroed, absolute
    values are taken, and the nonzero entries are box-plotted.

    Previously the figure was hard-coded to exactly two panels even though
    data was collected for all of ``f_list``; the panel loop now adapts to
    the list length (identical output for the 2-element case).
    """
    data_f = []
    for f in f_list:
        data_f_pre = []
        for i in range(5):
            printT("read data ", data_path_model % (name, name, i, f))
            with open(data_path_model % (name, name, i, f), 'rb') as fr:
                data_pre = pickle.load(fr)[0]
            with open(data_path_p_model % (name, name, i, f), 'rb') as fr:
                p_pre = pickle.load(fr)[0]
            # discard non-significant correlations, then take |r|
            data_pre[p_pre > 0.01] = 0
            data_pre[data_pre < 0] = -data_pre
            data_pre_np = data_pre.values
            data_list = data_pre_np.ravel()[np.flatnonzero(data_pre_np)]
            data_f_pre.append(data_list)
        data_f.append(data_f_pre)
    plt.figure(figsize=(20, 10))
    p_list = ["Normal", "S1", "S2", "S3", "S4"]
    y = np.linspace(0, 1, 6)
    printT("draw ", name)
    # one panel per feature set (was fixed at 2 panels)
    for idx, f in enumerate(f_list):
        plt.subplot(1, len(f_list), idx + 1)
        plt.boxplot(data_f[idx], labels=p_list)
        plt.yticks(y)
        plt.title(f)
    plt.suptitle(name)
    printT("save ", name)
    plt.savefig(out_path_pic_model % name)
    # fixed: previously logged the loop variable `f` after the loop
    # (NameError for an empty f_list, misleading otherwise)
    printT("finish ", name)
def cal_out_subgraph(connect_graph, data_name, index, trend_data_pre,
                     subgraphs_model, trend_pre, subgraph_nodelist_model_new,
                     subgraphs_max_edges_model, subgraphs_max_model_edges_v_new):
    """Group nodes of ``connect_graph`` into subgraphs, write all of them,
    and export the node list and edge tables of the largest one.

    Fixes: the subgraph output file handle was opened but never closed
    (leak / possibly unflushed data) — now a ``with`` block; node-membership
    test is now a set instead of an O(n) list scan.
    """
    connect_graph_pd = pd.DataFrame(connect_graph, index=index, columns=index)
    subgraphs = []
    added_index = set()
    # NOTE(review): each group is a node plus its direct neighbours
    # (row where connect_graph[pre] >= 1); this equals a connected
    # component only if connect_graph is already a reachability /
    # transitive-closure matrix — confirm upstream.
    for pre in connect_graph_pd.index:
        if pre in added_index:
            continue
        pre_index = connect_graph_pd.loc[connect_graph_pd[pre] >= 1].index
        subgraphs.append(pre_index.tolist())
        added_index.update(pre_index)
    printT(data_name, trend_pre, "subgraph count:", len(subgraphs))
    # largest subgraph first (project-defined comparator `cmp`)
    subgraphs.sort(key=functools.cmp_to_key(cmp))
    with open(subgraphs_model % (trend_pre, data_name), 'w') as output:
        for row in subgraphs:
            row_str = str(row).replace("[", "").replace("]", "")
            output.write(row_str + '\n')
    submax_pd = pd.DataFrame(subgraphs[0])
    submax_pd.to_csv(subgraph_nodelist_model_new % (data_name, trend_pre),
                     header=None, index=False)
    # edges with both endpoints inside the largest subgraph
    subnet = trend_data_pre[trend_data_pre['Node_A'].isin(subgraphs[0])
                            & trend_data_pre['Node_B'].isin(subgraphs[0])]
    printT(data_name, trend_pre, "subnet nodes count", len(subgraphs[0]))
    printT(data_name, trend_pre, "subnet edges count", subnet.shape[0])
    subnet[['Node_A', 'Node_B']].to_csv(subgraphs_max_edges_model %
                                        (trend_pre, data_name),
                                        index=False,
                                        sep="\t")
    subnet.to_csv(subgraphs_max_model_edges_v_new % (trend_pre, data_name),
                  index=False)
def draw_heatmap(
    name,
    f_list,
    data_path_model,
    out_path_pic_model,
):
    """Draw a clustered heatmap of the 5-stage correlation profiles for
    each trend file in ``f_list`` of dataset ``name``."""
    sort_by = ["4", "0"]
    for f in f_list:
        printT("read data ", data_path_model % (f, name))
        edges_pre = pd.read_csv(data_path_model % (f, name))
        # fixed: sort_values returns a new frame; the result was
        # previously discarded, so the rows were never actually sorted
        edges_pre = edges_pre.sort_values(sort_by)
        printT("draw ", name)
        plt.figure()
        plt.suptitle(name + f)
        sns.clustermap(edges_pre[["0", "1", "2", "3", "4"]],
                       col_cluster=False,
                       cmap=sns.diverging_palette(270, 5, as_cmap=True))
        plt.savefig(out_path_pic_model % (name, f))
    printT("finish ", name)
def runAll():
    """Run the whole co-expression pipeline:
    load -> mean filter -> log transform -> Spearman -> p-value pruning."""
    initFile(args)
    initCode()
    # start
    printT("1.0 read data")
    initData.createDataSet()
    printT("1.1 initData min by", args.cut_value)
    # drop features whose mean is below the cutoff in any data type
    initData.cutOffByMin(minNum=args.cut_value)
    printT("1.2 change data to log")
    initData.toLog()
    printT("2 calculate the similarity matrix")
    calculate.getSprearmon(need_output=True)
    printT("3 delete pvalue larger than", args.remain_p_value)
    # NOTE(review): del_non_value drops all-NaN rows/columns; the actual
    # p-value filtering happens inside the similarity step — confirm.
    calculate.del_non_value()
    # # if want to compare co-expression with expression value
    # printT("cal wilcox")
    # cal_wilconxon.get_wilcox()
    printT("over.")
def run(namelist, path_model, sub_model, stat_model):
    # Per dataset: tally edge/node counts of the full trend networks and of
    # their largest / second-largest subgraphs, then count nodes shared
    # between the pos and neg networks (categories C1..C4).
    df_stat = pd.DataFrame(None, columns=namelist)
    trendlist = ['pos', 'neg']
    df_result = pd.DataFrame(None, columns=['C1', 'C2', 'C3', 'C4'])
    for name in namelist:
        index_maxsub_s = []  # per trend: node list of the largest subgraph
        index_secsub_s = []  # per trend: node list of the second subgraph
        printT()
        printT(name)
        for trend in trendlist:
            subgraph_path = sub_model % (trend, name)
            printT(subgraph_path, end=" ")
            # the first two lines of the subgraph file hold the two biggest
            # subgraphs as comma-separated quoted node names
            with open(subgraph_path, 'r') as f:
                lines = f.readlines()
            points_list = lines[0].split(', ')
            points_list2 = lines[1].split(', ')
            points_list = [pre.replace("'", "").strip() for pre in points_list]
            points_list2 = [
                pre.replace("'", "").strip() for pre in points_list2
            ]
            printT(len(points_list))
            index_maxsub_s.append(points_list)
            index_secsub_s.append(points_list2)
            # NOTE(review): 'sub_…_points' is overwritten again below from
            # the edge table — confirm which value is intended to stick.
            df_stat.at['sub_' + trend + '_points',
                       name] = len(list(set(points_list)))
            df_stat.at['sec_' + trend + '_points',
                       name] = len(list(set(points_list2)))
        node_count_s = []
        for i, trend in enumerate(trendlist):
            data_path = path_model % (trend, name)
            data = pd.read_csv(data_path)
            df_stat.at['all_' + trend + '_edges', name] = data.shape[0]
            df_stat.at['all_' + trend + '_points', name] = len(
                list(set(data['Node_A'].tolist() + data['Node_B'].tolist())))
            printT(data.shape, end="->")
            # edges with both endpoints inside the second-largest subgraph
            data_sec = data[data['Node_A'].isin(index_secsub_s[i])]
            data_sec = data_sec[data_sec['Node_B'].isin(index_secsub_s[i])]
            printT(data_sec.shape)
            df_stat.at['sec_' + trend + '_edges', name] = data_sec.shape[0]
            # edges with both endpoints inside the largest subgraph
            data = data[data['Node_A'].isin(index_maxsub_s[i])]
            data = data[data['Node_B'].isin(index_maxsub_s[i])]
            printT(data.shape, data_path)
            df_stat.at['sub_' + trend + '_edges', name] = data.shape[0]
            index = data['Node_A'].tolist() + data['Node_B'].tolist()
            df_stat.at['sub_' + trend + '_points',
                       name] = len(list(set(index)))
            # per-node occurrence count among the subgraph's edges
            result = Counter(index)
            node_count_pre = pd.DataFrame(list(result.most_common()),
                                          columns=['Node', trend + '_num'])
            node_count_pre.set_index(["Node"], inplace=True)
            printT(i, "node count", node_count_pre.shape)
            node_count_s.append(node_count_pre)
        node_count_df = node_count_s[0].join(node_count_s[1], how='outer')
        printT("all node count", node_count_df.shape)
        node_count_df = node_count_df.fillna(0)
        # C1: node in both nets; C2: >5 occurrences in both;
        # C3/C4: exclusive to the pos / neg net respectively
        c1 = node_count_df[(node_count_df[trendlist[0] + '_num'] != 0)
                           & (node_count_df[trendlist[1] + '_num'] != 0)]
        c2 = node_count_df[(node_count_df[trendlist[0] + '_num'] > 5)
                           & (node_count_df[trendlist[1] + '_num'] > 5)]
        c3_pos = node_count_df[(node_count_df[trendlist[0] + '_num'] >= 1)
                               & (node_count_df[trendlist[1] + '_num'] == 0)]
        c3_neg = node_count_df[(node_count_df[trendlist[0] + '_num'] == 0)
                               & (node_count_df[trendlist[1] + '_num'] >= 1)]
        df_result.loc[name] = [
            c1.shape[0], c2.shape[0], c3_pos.shape[0], c3_neg.shape[0]
        ]
        # NOTE(review): df_result is filled but never printed or saved —
        # confirm whether it should be written out alongside df_stat.
    df_stat = df_stat.sort_index()
    print(df_stat)
    df_stat.to_csv(stat_model)
def each_data(data_name, cor_cut, limit_up, limit_down, model_4trend,
              path_model_cor, dir_int_str, trends_pos_int_list,
              trends_neg_int_list, model_edges_count):
    """Classify every pair of one dataset into 4-step trend codes.

    Reads the 5 stage correlation matrices, harmonises signs across stages,
    derives step-to-step trend digits (+1/0/-1) with hysteresis thresholds
    ``limit_up``/``limit_down``, and writes the pairs whose code is in the
    positive / negative trend lists together with their per-stage values.

    Fixes relative to the previous version:
    * the ``|r| < cor_cut`` threshold was applied to a frame that was then
      overwritten by the pre-threshold triu copy, so it had no effect;
    * the sign encoding's second ``np.where`` used the raw correlations as
      its else-branch, destroying the positive->1 encoding that the
      ``% 10`` sign test relies on;
    * the first stage entered ``adj_sum`` without ``abs``, letting opposite
      signs cancel in the union edge count;
    * the output log line always printed the "pos" path.
    """
    adj_sum = None       # sum of |r| across stages (union edge support)
    na_num_list = []     # nonzero-edge count per stage, then the union count
    cors_ori_pd = []     # upper-triangular, thresholded correlation frames
    cors_sign_np = []
    cal_sign_np = None   # per-pair sum of sign codes (pos -> 1, neg -> 10)
    for i in range(5):
        printT(path_model_cor % (data_name, i))
        cor_pre_pd = pd.read_csv(path_model_cor % (data_name, i), index_col=0)
        # keep the strict upper triangle so each pair is counted once
        cor_pre_pd = pd.DataFrame(np.triu(cor_pre_pd.values, 1),
                                  index=cor_pre_pd.index,
                                  columns=cor_pre_pd.columns)
        # drop weak correlations, then treat missing values as absent edges
        cor_pre_pd[np.abs(cor_pre_pd) < cor_cut] = 0
        cor_pre_pd = cor_pre_pd.fillna(0)
        if adj_sum is None:
            adj_sum = abs(cor_pre_pd)
        else:
            adj_sum = adj_sum + abs(cor_pre_pd)
        na_num_list.append((cor_pre_pd != 0).sum().sum())
        cors_ori_pd.append(cor_pre_pd)
        # encode sign per stage: positive -> 1, negative -> 10, zero -> 0
        cors_sign_pre_np = np.where(cor_pre_pd > 0, 1, cor_pre_pd)
        cors_sign_pre_np = np.where(cors_sign_pre_np < 0, 10,
                                    cors_sign_pre_np)
        cors_sign_np.append(cors_sign_pre_np)
        if cal_sign_np is None:
            cal_sign_np = cors_sign_pre_np
        else:
            cal_sign_np = cal_sign_np + cors_sign_pre_np
    na_num_list.append((adj_sum != 0).sum().sum())
    pd.DataFrame(na_num_list).to_csv(model_edges_count % data_name)
    # a pair is "all negative (or zero)" when its sign sum has no 1s digit:
    # flip those pairs so trends are measured on consistently-signed values
    cal_sign_np = np.where((cal_sign_np % 10 == 0) & (cal_sign_np != 0), -1,
                           1)  # all<0 | all=0
    cors_abs = []
    printT("calculate sign", data_name)
    for i in range(5):
        cors_abs.append(cal_sign_np * cors_ori_pd[i])
    # cal trend up
    printT("calculate trend", data_name)
    trends = None       # running 4-digit trend code
    trends_add = None   # running sum of trend digits (hysteresis state)
    for i in range(4):
        pre_distance = cors_abs[i + 1] - cors_abs[i]
        if i == 0:
            pre_distance[pre_distance >= limit_up] = 1
            pre_distance[pre_distance <= -limit_up] = -1
            pre_distance[(pre_distance < limit_up)
                         & (pre_distance > -limit_up)] = 0
        else:
            # hysteresis: the dead zone depends on the trend so far:
            #   add>0 -> (-limit_down, limit_up)
            #   add=0 -> (-limit_up,   limit_up)
            #   add<0 -> (-limit_up,   limit_down)
            pre_distance[(pre_distance >= limit_up) & (trends_add >= 0)] = 1
            pre_distance[(pre_distance >= limit_down) & (trends_add < 0)] = 1
            pre_distance[(pre_distance <= -limit_down) & (trends_add > 0)] = -1
            pre_distance[(pre_distance <= -limit_up) & (trends_add <= 0)] = -1
            pre_distance[(pre_distance < limit_up)
                         & (pre_distance > -limit_down)
                         & (trends_add > 0)] = 0
            pre_distance[(pre_distance < limit_up)
                         & (pre_distance > -limit_up)
                         & (trends_add == 0)] = 0
            pre_distance[(pre_distance < limit_down)
                         & (pre_distance > -limit_up)
                         & (trends_add < 0)] = 0
        if trends is None:
            trends = pre_distance
            trends_add = pre_distance
        else:
            trends = trends * 10 + pre_distance
            trends_add = trends_add + pre_distance
    # output
    index = trends.index
    columns = trends.columns
    for trend_pre, trends_int_list in zip(
            ["pos", "neg"], [trends_pos_int_list, trends_neg_int_list]):
        trends4_pre = pd.DataFrame(
            None,
            columns=["trend", "Node_A", "Node_B", "0", "1", "2", "3", "4"])
        printT("collect", data_name, trend_pre)
        for pre_trend in trends_int_list:
            locs = np.where(trends == pre_trend)
            index_pre_name = [index[i] for i in locs[0]]
            columns_pre_name = [columns[i] for i in locs[1]]
            pd_pre = pd.DataFrame(None, columns=trends4_pre.columns)
            pd_pre["Node_A"] = index_pre_name
            pd_pre["Node_B"] = columns_pre_name
            for i in range(5):
                pd_pre[str(i)] = [
                    cors_ori_pd[i].at[a, b]
                    for a, b in zip(index_pre_name, columns_pre_name)
                ]
            pd_pre["trend"] = dir_int_str[pre_trend]
            trends4_pre = pd.concat([trends4_pre, pd_pre], axis=0)
        printT(data_name, trend_pre, trends4_pre.shape, "output",
               model_4trend % (trend_pre, data_name))
        trends4_pre.to_csv(model_4trend % (trend_pre, data_name))
def readFile(path):
    """Load one tab-separated data table and append it to the global
    training set."""
    frame = pd.read_csv(path, sep='\t')
    printT(' ' + path, frame.shape)
    data.trainDatas_df.append(frame)
def each_process_fold(sample_num, i, trainDataFrame, need_output):
    """Estimate a Spearman correlation matrix for data type ``i`` by
    averaging over 10 random column folds of ``sample_num`` samples each.

    Returns the fold-averaged correlation frame with entries whose
    combined p-value exceeds ``args.remain_p_value`` set to NaN; when
    ``need_output`` is true, both frames are also pickled to disk first.
    """
    # non-multi
    import pickle
    printT(" calculate spearmon %d, " % i, trainDataFrame.shape)
    # (a commented-out non-fold variant that wrote *_unfold.pkl files
    # was removed here)
    np.random.seed(66)
    # NOTE(review): np.random.seed does not seed `shuffle` if it is
    # random.shuffle rather than np.random.shuffle — confirm the import.
    fold_num = 10
    sccDF = 0  # running sum of per-fold correlation frames
    pDF = 1    # running product of per-fold p-value frames
    # enough shuffled copies of the column indices to draw
    # fold_num * sample_num columns in total
    need_g = 1 + int((sample_num * fold_num) / data.trainDatas_df[i].shape[1])
    list_all = []
    for nee in range(need_g):
        list_pre = [j for j in range(data.trainDatas_df[i].shape[1])]
        shuffle(list_pre)
        list_all.extend(list_pre)
    for j in range(fold_num):
        # 10-fold
        train_index = list_all[j * sample_num:(j + 1) * sample_num]
        printT(" spearmon %d by fold=%d/%d %d sample=%s" %
               (i, j, fold_num, len(train_index), train_index))
        trainDataFrame_pre = trainDataFrame.iloc[:, train_index]
        scc, p = getSimilarAndPvalue(trainDataFrame_pre)
        sccDF_pre = pd.DataFrame(scc,
                                 index=trainDataFrame_pre.index,
                                 columns=trainDataFrame_pre.index)
        pDF_pre = pd.DataFrame(p,
                               index=trainDataFrame_pre.index,
                               columns=trainDataFrame_pre.index)
        sccDF = sccDF + sccDF_pre
        pDF = pDF * pDF_pre
    printT(" spearmon %d finish" % i)
    cDF_result = sccDF / fold_num       # arithmetic mean of correlations
    pDF_result = pDF ** (1 / fold_num)  # geometric mean of p-values
    if need_output:
        printT(" output file",
               args.result_dir + args.data_name + '_' + str(i) + '_cor_fold.pkl')
        with open(args.result_dir + args.data_name + '_' + str(i) +
                  '_cor_fold.pkl', 'wb') as fw:
            pickle.dump([cDF_result], fw, 0)
        printT(" output file",
               args.result_dir + args.data_name + '_' + str(i) + '_p_fold.pkl')
        with open(args.result_dir + args.data_name + '_' + str(i) +
                  '_p_fold.pkl', 'wb') as fw:
            pickle.dump([pDF_result], fw, 0)
    printT(" delete spearmon %d by p>%f" % (i, args.remain_p_value))
    # blank out entries that are not significant after combining folds
    cDF_result[pDF_result > args.remain_p_value] = np.nan
    printT(" calculate spearmon %d finish" % i)
    return cDF_result