def draw_one(filepath, output_path, score_col=3, label_col=4):
    """Rank the (score, label) pairs of a file and dump the leading ones.

    The score and label columns are read with ``single_column``, converted to
    ``float`` / ``int``, and the scores are passed through ``normalization``.
    The pairs are ordered by label first and score second (both descending),
    and the first ``proportion * sum(labels)`` pairs are written to
    ``output_path`` as ``"<score> <label>"`` lines.

    NOTE(review): ``proportion`` is not a parameter of this function; it is
    resolved from the enclosing scope at call time -- confirm it is defined.

    Args:
        filepath: File handed to ``single_column``.
        output_path: Text file to (over)write, UTF-8 encoded.
        score_col: Column index of the scores.
        label_col: Column index of the labels.

    Returns:
        Tuple ``(score, label)`` holding the two lists that were written.
    """
    raw_scores = single_column(filepath, col_num=score_col)
    raw_labels = single_column(filepath, col_num=label_col)
    scores = [float(value) for value in raw_scores]
    labels = [int(value) for value in raw_labels]

    # Normalise the scores before ranking.
    scores = normalization(scores)
    label_total = sum(labels)

    # Sort on the label first; equal labels are ordered by score.
    ranked = sorted(
        zip(scores, labels),
        key=lambda pair: (pair[1], pair[0]),
        reverse=True,
    )
    score_sorted, label_sorted = (list(column) for column in zip(*ranked))

    cutoff = proportion * label_total
    score = score_sorted[:cutoff]
    label = label_sorted[:cutoff]

    with open(output_path, 'w', encoding='utf-8') as output:
        for value, tag in zip(score, label):
            output.write('{} {}'.format(value, tag) + '\n')
    return score, label
def sample(filepath_pos, filepath_neg, seed, proportion=1):
    """Randomly down-sample the negative samples and merge them with the positives.

    Column 0 (name) and column 2 (average score) of both files are read with
    ``single_column``.  ``len(positives) * proportion`` negative rows are drawn
    without replacement after seeding ``random`` with ``seed``.

    Row *positions* are sampled rather than names, so every sampled name is
    paired with the score of its own row even when a name occurs more than
    once in the negative file.

    Args:
        filepath_pos: File with the positive samples.
        filepath_neg: File with the negative samples.
        seed: Seed passed to ``random.seed`` for reproducible sampling.
        proportion: Number of negatives to draw per positive.

    Returns:
        Tuple ``(neg_name_sample, labels, scores)``: the sampled negative
        names, a label list (1 per positive followed by 0 per sampled
        negative), and the positive scores followed by the sampled negative
        scores.

    Raises:
        ValueError: From ``random.sample`` when more negatives are requested
            than the negative file contains.
    """
    random.seed(seed)
    pos_name = single_column(filepath_pos, 0, removetitle=False)
    pos_ave = single_column(filepath_pos, 2, removetitle=False)
    neg_name = single_column(filepath_neg, 0, removetitle=False)
    neg_ave = single_column(filepath_neg, 2, removetitle=False)

    # int() keeps an integral proportion unchanged and lets a fractional one
    # (e.g. 0.5) work instead of failing inside random.sample.
    neg_num = int(len(pos_name) * proportion)

    picked_rows = random.sample(range(len(neg_name)), neg_num)
    neg_name_sample = [neg_name[row] for row in picked_rows]
    neg_ave_sample = [neg_ave[row] for row in picked_rows]

    labels = [1] * len(pos_name) + [0] * len(neg_name_sample)
    scores = list(pos_ave) + neg_ave_sample
    return neg_name_sample, labels, scores
def r_c(filepath):
    """Check whether the row titles and column titles of a matrix file agree.

    The row titles are the first column (read with ``single_column`` with the
    title row removed); the column titles are the tokens of the first line
    after its first token.  The function prints both lengths, the sizes of
    the union and intersection of the two title sets, and, at the first
    position where the titles differ, that position together with the number
    of titles that matched before it.  A row title with no column title at
    the same position counts as a mismatch.

    Args:
        filepath: Path of the matrix file to inspect.
    """
    col = single_column(filepath, 0, removetitle=True)
    print('第一列长度:', len(col))

    # Only the header line is needed, so do not load the whole file.
    with open(filepath, 'r', encoding='utf-8') as file:
        header = file.readline()
    line_1 = header.split()[1:]
    print('第一行长度: ', len(line_1))
    print("并集的个数", len(set(col) | set(line_1)))
    print('交集的个数', len(set(col) & set(line_1)))

    # Compare the two title sequences position by position.
    yes = 0
    for v, k in enumerate(col):
        if v < len(line_1) and k == line_1[v]:
            yes += 1
        else:
            print('行和列从第:' + str(v + 1) + '个数开始不匹配')
            print('连续匹配的个数', yes)
            break
from extract import single_column "这个可以根据文件中的某一列数字,选择出TOP,相比于版本1,这里利用了数字出现的顺序,跑起来会更快。但是要注意几个参数" top = 20 # 这里是设置想要前几 col = 2 # 这里是选择第几列数字来排序(由0开始计数) genenum = 116 # 词典和文件匹配的蛋白质数目,用于写文件 drugnum = 2953 # 药物的数量,也就是一个蛋白出现了多少次,用于循环 # 文件路径,注意,这里的数据不是在data文件夹,而是在result文件夹。因为该文件是之前的函数运行的结果 filepath = 'C:\\Users\\hw\\Desktop\\test\\list_only_cov.txt' filepath_dic = 'H:\\Data\\dict\\Covs-protein_dict127.txt' # 新冠蛋白质词典 output_path = 'C:\\Users\\hw\\Desktop\\test\\list_top20.txt' # 设置输出结果路径 # 下面两行注意列的参数的修改 covid19_protein_dic = single_column(filepath_dic, 1, removetitle=True) only_procol = single_column(filepath, 1, removetitle=False) # 首先把每一行转换成列表,并且提前把要比较数字的那一列由字符串转换为数字 contents = [] with open(filepath, 'r', encoding='utf-8') as f: lines = f.readlines() for line in lines: content = line.strip('\n').split() a = float(content[col]) content[col] = a contents.append(content) print("------------数据转换为列表阶段结束---------------") results = [] count = 0 for i in range(genenum): temps = [] # 每个蛋白(temp)都有2953个 while len(temps) < drugnum:
from extract import single_column filepath_list = 'C:\\Users\\hw\\Desktop\\list_only_morethan0.txt' filepath_pos = 'C:\\Users\\hw\\Desktop\\positive.txt' line_0 = single_column(filepath_list, 0, removetitle=False) line_2 = single_column(filepath_list, 2, removetitle=False) line_pos = single_column(filepath_pos, 0, removetitle=False) pos_name, neg_name = [], [] pos_score, neg_score = [], [] for loc, drug in enumerate(line_0): if drug in line_pos: pos_name.append(drug) pos_score.append(line_2[loc]) else: neg_name.append(drug) neg_score.append(line_2[loc]) with open('C:\\Users\\hw\\Desktop\\数据\\positive_data3.txt', 'w', encoding='utf-8') as f: for i, j in zip(pos_name, pos_score): rowtext = '{} {}'.format(i, j) f.write(rowtext) f.write('\n') with open('C:\\Users\\hw\\Desktop\\数据\\negative_data3.txt', 'w', encoding='utf-8') as ll: for k, l in zip(neg_name, neg_score):
from extract import single_column

# Sort the values of column 2 in descending order, then report how many lie
# in [0.5, 1), how many are >= 1, and the 20 largest values.
filepath = 'H:\\PyCharm_Projects\\DataProcessing\\test_list.txt'

# Named raw_values (not `list`) so the builtin is not shadowed.
raw_values = single_column(filepath, 2, removetitle=False)
nums = sorted((float(value) for value in raw_values), reverse=True)

count_1 = sum(1 for num in nums if num >= 1)
count_05 = sum(1 for num in nums if 0.5 <= num < 1)

print(count_05)
print(count_1)
print(nums[0:20])
from extract import single_column

# Read the first column of the file, sort it, and write it out one entry
# per line.
filepath = 'C:\\Users\\hw\\Desktop\\drug_wrong.txt'

names = single_column(filepath, 0, removetitle=False)
names.sort()

with open('hahahaha.txt', 'w', encoding='utf-8') as out:
    out.writelines(name + '\n' for name in names)
from extract import single_column

# Pick out the rows of a list file whose protein appears in the COVID-19
# protein dictionary.  The only settings to adjust are the three paths below:
# the dictionary, the input list, and the output file name.
filepath_covid19_dic = 'H:\\Data\\dict\\Covs-protein_dict127.txt'
filepath_list = 'C:\\Users\\hw\\Desktop\\test\\test_list.txt'
file_output = 'C:\\Users\\hw\\Desktop\\test\\list_only_cov.txt'

# Load the data.
covid19_dic = single_column(filepath_covid19_dic, 0, removetitle=True)
proteins = single_column(filepath_list, 1, removetitle=False)

# A set gives constant-time membership tests in the filter below.
covid19_names = set(covid19_dic)
print(len(covid19_names))
print(len(set(proteins)))

# Row numbers whose protein is in the dictionary.
locs = [row_no for row_no, protein in enumerate(proteins)
        if protein in covid19_names]

# Keep only those rows, split into fields.
with open(filepath_list, 'r', encoding='utf-8') as f:
    l_temp = f.readlines()
contents = [l_temp[r].strip('\n').split() for r in locs]

# Count how many kept rows have '1' in their fourth field.
cc_num = sum(1 for cc in contents if cc[3] == '1')
print('最后的结果中1的个数为:', cc_num)
import numpy as np
from extract import single_column
from tomatrix import tomatrix
from check import check

# Driver script: use tomatrix to drop the rows and columns of a matrix that
# are not listed in a dictionary file, then save the result.

# file_1 is the dictionary, file_2 is the matrix to be reduced.
file_1 = 'H:\\Data\\data422\\data\\dict\\gene_dict.txt'
file_2 = 'H:\\Data\\data422\\data\\protein similarity\\protein_sequence_similarity.txt'

# Dictionary entries (first column, title row removed).
file_1_title = single_column(file_1, 0, removetitle=True)

# Arguments: dictionary, matrix file, drop the first row, drop the first column.
pure_mat = tomatrix(
    file_1_title,
    file_2,
    removetitle=True,
    removefirstcol=True,
)

# Inspect the format of the resulting matrix.
check(pure_mat)

# Work on a copy: negative entries become 0 and positive entries become 1,
# so the sum is the number of positive entries.
repeat_mat = pure_mat.copy()
negative_mask = repeat_mat < 0
repeat_mat[negative_mask] = 0
positive_mask = repeat_mat > 0
repeat_mat[positive_mask] = 1
num = np.sum(repeat_mat)
print("非0个数", num)

# Write the reduced matrix to a text file.
np.savetxt('C:\\Users\\hw\\Desktop\\2\\Sim_prosequence.txt', pure_mat, fmt="%f")
print('end')
from extract import single_column
import random

# Settings.
file_path_rel = '/Users/huangwei/Downloads/drug_disease_association.txt'

drug_name_col = single_column(file_path_rel, col_num=0, removetitle=False)
target_name_col = single_column(file_path_rel, col_num=1, removetitle=False)

# Drug identifiers: the part of each entry before the first ':'.
temp_drug_col = [entry.split(':')[0] for entry in drug_name_col]  # with duplicates
drug_col = set(temp_drug_col)  # duplicates removed
print('没有去重,药物总共有:' + str(len(temp_drug_col)))
print('去重后, 药物总共有:' + str(len(drug_col)))
print('----------------------------------------------')

# Target identifiers, extracted the same way from the second column.
temp_target_col = [entry.split(':')[0] for entry in target_name_col]  # with duplicates
target_col = set(temp_target_col)  # duplicates removed
print('没有去重,靶点总共有:' + str(len(temp_target_col)))
print('去重后, 靶点总共有:' + str(len(target_col)))
print('----------------------------------------------')
from extract import single_column
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

file_dic = 'H:\\Data\\dict\\Covs-protein_dict127.txt'
file_protein = 'H:\\Data\\dict\\protein5390_dict.txt'
dic_temp = single_column(file_dic, 0, removetitle=True)
pro = single_column(file_protein, 0, removetitle=True)

# Remove duplicate dictionary entries.  dict.fromkeys keeps the first
# occurrence of each name in file order, so the row order of the vectors
# built below is the same on every run (a plain set would not guarantee that).
dic = list(dict.fromkeys(dic_temp))

# Position of the first occurrence of every protein name, built once so the
# lookup below does not rescan the protein list for each dictionary entry.
first_position = {}
for position, name in enumerate(pro):
    first_position.setdefault(name, position)
locs = [first_position[name] for name in dic if name in first_position]

# Collect the vectors of the dictionary proteins: one line of the vector file
# per protein, addressed by the protein's position in the protein dictionary.
file_protein = 'C:\\Users\\hw\\Desktop\\all_protein.txt'
with open(file_protein, 'r', encoding='utf-8') as f:
    contents = f.readlines()
data_pro = [contents[loc].split() for loc in locs]

# Convert the string fields of every vector to floats.
data_protein = [[float(value) for value in li] for li in data_pro]
from extract import single_column

# Convert an interaction matrix into a list whose columns are drug, protein
# and value.
# Two settings need attention: the `filepath` value and the output path (the
# original note places them on lines 11 and 39 of the script).
results = []
contents = []
count = 0  # total number of interactions found, kept for cross-checking

# Paths of the dictionaries and the matrix data.
filepath_drug_dic = 'H:\\Data\\dict\\drug2953_dict.txt'
filepath_protein_dic = 'H:\\Data\\dict\\protein5390_dict.txt'
filepath = 'C:\\Users\\hw\\Desktop\\test\\Zscore.txt'
filepath_interaction = 'C:\\Users\\hw\\Desktop\\test\\interaction.txt'

# First column of the drug and protein dictionaries.
drug_list = single_column(filepath_drug_dic, 0, removetitle=True)
protein_list = single_column(filepath_protein_dic, 0, removetitle=True)

# Report the dictionary sizes so they can be verified.
print("药物词典的长度:", len(drug_list))
print("蛋白词典的长度:", len(protein_list))

# Interaction data: every line split into its whitespace-separated fields.
with open(filepath_interaction, 'r', encoding='utf-8') as f:
    tempts = f.readlines()
rows = [tempt.strip('\n').split() for tempt in tempts]
def tomatrix(file_dic, filepath, removetitle=False, removefirstcol=False):
    """Load a labelled square matrix and re-arrange it in dictionary order.

    The row titles are taken from the first column of ``filepath`` (always
    read with the title row removed).  Rows and columns whose title is not in
    ``file_dic`` are discarded; the remaining values are copied into a
    ``len(file_dic) x len(file_dic)`` zero matrix at the positions their
    titles have in ``file_dic``.  Cells that cannot be parsed as a number
    (for example ``"NA"``) are treated as 0.  Progress and sanity-check
    figures are printed along the way.

    Args:
        file_dic: Sequence of titles defining the order and size of the
            output matrix.
        filepath: Path of the whitespace-separated matrix file.
        removetitle: Drop the first line of the file before parsing.
        removefirstcol: Drop the first field of every line before parsing.

    Returns:
        ``numpy.ndarray`` of shape ``(len(file_dic), len(file_dic))``.
    """
    dic_1 = file_dic
    col_1 = single_column(filepath, 0, removetitle=True)

    # Start from an all-zero matrix.
    long_mat = len(dic_1)
    mat_a = np.zeros((long_mat, long_mat))

    with open(filepath, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    if removetitle:
        del lines[0]

    final_list = _parse_rows(lines, removefirstcol)
    # Every parsed cell is a float at this point, so this is the cell count.
    print("正确的元素的个数", sum(len(row) for row in final_list))
    print('总的行数', len(final_list))

    # Discard the rows and the matching columns whose title is not in the
    # dictionary.  Positions are used, so repeated titles are handled too.
    wanted = set(dic_1)
    dropped = {pos for pos, title in enumerate(col_1) if title not in wanted}
    final_list = [
        [value for c, value in enumerate(row) if c not in dropped]
        for r, row in enumerate(final_list)
        if r not in dropped
    ]
    col_1 = [title for title in col_1 if title in wanted]

    # The reduced data should still be square.
    print('*****删除后的行数:', len(final_list))
    if any(len(row) != len(final_list) for row in final_list):
        print('删除个数不对')
        print('------------------------')

    # Number of zeros in the reduced data, printed for cross-checking.
    ehe_num = sum(row.count(0.0) for row in final_list)
    print('*****最后数据中有' + str(ehe_num) + '个0')

    # Copy every kept value to the place its titles have in the dictionary.
    # The first occurrence of a title decides its position on both sides.
    dic_pos = _first_positions(dic_1)
    col_pos = _first_positions(col_1)
    targets = [dic_pos[title] for title in col_1]
    sources = [col_pos[title] for title in col_1]
    for a, aa in zip(targets, sources):
        source_row = final_list[aa]
        for c, cc in zip(targets, sources):
            mat_a[a][c] = source_row[cc]
    return mat_a


def _parse_rows(lines, removefirstcol):
    """Split every line into float cells, optionally dropping the first field."""
    rows = []
    for line in lines:
        cells = line.strip('\n').split()
        if removefirstcol:
            del cells[0]
        rows.append([_to_float(cell) for cell in cells])
    return rows


def _to_float(cell):
    """Convert one text cell to float; unparsable cells such as "NA" give 0.0."""
    try:
        return float(cell)
    except ValueError:
        return float(0)


def _first_positions(titles):
    """Map each title to the index of its first occurrence in ``titles``."""
    positions = {}
    for pos, title in enumerate(titles):
        positions.setdefault(title, pos)
    return positions
from extract import single_column
from combine import combine
import numpy as np
from check import check, precheck

# Driver script: turn an interaction file into a 0/1 adjacency matrix whose
# rows and columns follow the order of the dictionaries.

# Two dictionaries and one interaction file; file_1 and file_2 are the
# dictionaries of the two attributes that appear in file_1x2.
file_1 = 'H:\\Data\\data422\\data\\dict\\gene_dict.txt'
file_2 = 'H:\\Data\\data422\\data\\dict\\gene_dict.txt'
file_1x2 = 'H:\\Data\\data422\\data\\interaction\\PPI.txt'

# single_column(file, column index, removetitle=...) extracts one column of a
# data file, optionally skipping the title row.
file_1_list, file_2_list = (
    single_column(dict_path, 0, removetitle=True)
    for dict_path in (file_1, file_2)
)
file_1x2_list1, file_1x2_list2 = (
    single_column(file_1x2, column, removetitle=True)
    for column in (0, 1)
)

# Sanity-check the extracted columns: arguments 1 and 3 are the dictionaries,
# arguments 2 and 4 the two columns of the interaction data.
precheck(file_1_list, file_1x2_list1, file_2_list, file_1x2_list2)

# Assemble the matrix and check it.
hello = combine(file_1_list, file_1x2_list1, file_2_list, file_1x2_list2)
check(hello)

# Save as a text file.
np.savetxt('C:\\Users\\hw\\Desktop\\protein_protein_mat.txt', hello, fmt="%d")
print("end")
from extract import single_column '''这是从数据中挑出大于阈值的''' filepath_list = 'C:\\Users\\hw\\Desktop\\test\\list_only_cov.txt' file_output = 'C:\\Users\\hw\\Desktop\\test\\list_only_morethan0.txt' door_num = 0 #载入数据 nums = single_column(filepath_list, 2, removetitle=False) #记录大于阈值的位置 locs = [] count = 0 for key, value in enumerate(nums): if float(value) > door_num: locs.append(key) count += 1 print("大于阈值的个数为:", count) #保留大于阈值的行 contents = [] with open(filepath_list, 'r', encoding='utf-8') as f: lines = f.readlines() for loc in locs: content = lines[loc].strip('\n').split() contents.append(content) #把数据写入文件 with open(file_output, 'w', encoding='utf-8') as output: for content in contents: rowtext = '{} {} {} {}'.format(content[0], content[1], content[2], content[3])