def get_gene():
    """Export gene names and gene numbers to ``../final_data/gene.xls``.

    Reads the sheet returned by ``util.get_modified_data()``. For every row
    except row 0, column 3 is split with ``util.get_list_by_enter`` and each
    non-empty piece is matched (from its start) against a
    ``<name>, {<number>}`` pattern. Pieces that do not match are ignored.

    The output workbook has a single sheet named ``gene`` with the columns
    ``id`` (1-based running index), ``geneName`` (text before the comma) and
    ``geneNumber`` (text between the braces). Duplicates are not removed.
    """
    table = util.get_modified_data()
    # Raw string: \s, \{, \. and \} are regex escapes, not Python string
    # escapes, and would trigger invalid-escape warnings in a plain literal.
    pattern = re.compile(r'[A-Za-z0-9]*-*[A-Za-z0-9]*,\s*\{[0-9]*\.?[0-9]*\}')

    genes = []  # (gene name, gene number) pairs in encounter order
    # Row 0 is skipped (presumably the header row -- confirm against the sheet).
    for row_index in range(1, table.nrows):
        # Column 3 holds the gene data.
        entries = util.get_list_by_enter(table.row_values(row_index)[3])
        for item in entries:
            if item == '\n' or item == '':
                continue
            match = pattern.match(item)
            if match is None:
                continue
            # The pattern admits exactly one comma, so partition() yields the
            # same two pieces that split(',') would.
            name, _, rest = match.group().partition(',')
            number = rest[rest.find('{') + 1:rest.find('}')]
            genes.append((name, number))

    wb = xlwt.Workbook()
    sheet = wb.add_sheet('gene')
    for col, title in enumerate(('id', 'geneName', 'geneNumber')):
        sheet.write(0, col, title)
    for out_row, (name, number) in enumerate(genes, start=1):
        sheet.write(out_row, 0, out_row)
        sheet.write(out_row, 1, name)
        sheet.write(out_row, 2, number)
    wb.save("../final_data/gene.xls")
def get_hpo():
    """Collect the distinct HPO entries of ``system.xls`` into ``HPO.xls``.

    Reads ``../final_data/system.xls`` through ``util.get_file_data``. For
    every row except row 0, column 8 is taken; empty cells are skipped. The
    text after the first ``:`` of each cell is split with
    ``util.get_list_by_colon`` and every non-empty piece is kept once, in
    first-seen order.

    The result is written to ``../final_data/HPO.xls`` (sheet ``sheet1``)
    with the columns ``hid`` (1-based running index) and ``hpo``. A row that
    fails to be written is reported on stdout and skipped. ``"end"`` is
    printed when the file has been saved.

    Raises:
        ValueError: if a non-empty cell in column 8 contains no ``:``.
    """
    table = util.get_file_data('../final_data/system.xls')
    # Row 0 is skipped (presumably the header row -- confirm against the sheet).
    cells = [table.row_values(i)[8] for i in range(1, table.nrows)]

    terms = []    # distinct entries, first-seen order
    seen = set()  # O(1) duplicate check; assumes entries are strings
    for cell in cells:
        if cell == "":
            continue
        # Drop everything up to and including the first ':' (looks like a
        # leading label in the column -- TODO confirm the cell format).
        payload = cell[cell.index(":") + 1:]
        for term in util.get_list_by_colon(payload):
            if term == "" or term in seen:
                continue
            seen.add(term)
            terms.append(term)

    wb = xlwt.Workbook()
    sheet = wb.add_sheet('sheet1')
    sheet.write(0, 0, 'hid')
    sheet.write(0, 1, 'hpo')
    for i, term in enumerate(terms):
        try:
            sheet.write(i + 1, 0, i + 1)
            sheet.write(i + 1, 1, term)
        except Exception:
            # Best effort: report the offending entry and keep going.
            print(i, term)
    wb.save("../final_data/HPO.xls")
    print("end")
def get_all():
    """Build ``../final_data/system.xls`` from the modified data sheet.

    Reads the sheet returned by ``util.get_modified_data()``. For every row
    except row 0, column 4 is split with ``util.get_list_by_enter``. Each
    resulting line (other than ``""`` and ``";snomedct:;;;;"``) is cut into:

    * the body site: text before the first ``:``;
    * the phenomenon: text before the first ``;`` (this still includes the
      site prefix);
    * the system info: text after the first ``;``.

    Only the first occurrence of each phenomenon is kept; this assumes a
    given site/phenotype pair has a single description.

    One output row is written per kept phenomenon: a 1-based ``pid``, the
    phenomenon, the 1-based position of the site in ``util.get_type_list()``,
    the raw system info, and the first five pieces returned by
    ``util.get_list_by_semicolon`` for the system info (columns
    ``snomedct``, ``UMLS``, ``ICD10CM``, ``ICD9Cm``, ``HPO``). If anything
    fails while a row is being written, the row is reported on stdout and
    the cells already written for it are left in place. ``"end"`` is printed
    after saving.

    Raises:
        ValueError: if a processed line contains no ``:`` or no ``;``.
    """
    # Fetch the required inputs.
    type_dict = util.get_type_list()
    raw_data = util.get_modified_data()

    # Row 0 is skipped (presumably the header row -- confirm against the sheet).
    synopses = [raw_data.row_values(i)[4] for i in range(1, raw_data.nrows)]

    records = []  # (phenomenon, site, system) in first-seen order
    seen = set()  # phenomena already recorded
    for synopsis in synopses:
        for line in util.get_list_by_enter(synopsis):
            if line == ";snomedct:;;;;" or line == "":
                continue
            first_colon = line.index(':')
            first_semicolon = line.index(';')
            site = line[:first_colon]              # body site
            phenomenon = line[:first_semicolon]    # site-prefixed description
            system = line[first_semicolon + 1:]    # info for all coding systems
            # Assumes the same site + phenotype has only one description, so
            # later duplicates are dropped.
            if phenomenon in seen:
                continue
            seen.add(phenomenon)
            records.append((phenomenon, site, system))

    wb = xlwt.Workbook()
    sheet = wb.add_sheet('sheet1')
    headers = ('pid', 'phenomenon', 'type', 'system', 'snomedct', 'UMLS',
               'ICD10CM', 'ICD9Cm', 'HPO')
    for col, title in enumerate(headers):
        sheet.write(0, col, title)

    for i, (phenomenon, site, system) in enumerate(records):
        out_row = i + 1
        try:
            sheet.write(out_row, 0, out_row)
            sheet.write(out_row, 1, phenomenon)
            # Raises ValueError for a site missing from the site list.
            sheet.write(out_row, 2, type_dict.index(site) + 1)
            sheet.write(out_row, 3, system)
            codes = util.get_list_by_semicolon(system)
            # Columns 4..8 take the first five pieces; fewer pieces raise
            # IndexError and the row is reported below.
            for offset in range(5):
                sheet.write(out_row, 4 + offset, codes[offset])
        except Exception:
            # Best effort: report the offending row and keep going.
            print(i, phenomenon, ":", site)
    wb.save("../final_data/system.xls")
    print("end")
def prehandler():
    """Normalise the clinical synopsis column and save the modified sheet.

    Reads the sheet returned by ``util.get_raw_data()``. For every row except
    row 0, column 4 is split with ``util.get_list_by_enter`` (a trailing empty
    piece is dropped) and each line except ``";snomedct:;;;;"`` is processed:

    * if the text before the line's first ``:`` is an entry of
      ``util.get_type_list()`` and that ``:`` comes before the first ``;``,
      the line is kept as is and that entry becomes the current body site;
    * otherwise the line is prefixed with ``"<current site>:"`` (the current
      site is empty until the first site line is seen).

    The processed lines are concatenated, each preceded by a newline.
    Columns 0-3 are copied unchanged. The result is written to
    ``../final_data/omim_phenotype_modified.xls`` (sheet ``sheet1``) with the
    columns ``mimnumber``, ``preferredTitle``, ``inheritance``,
    ``molecularBasis`` and ``clinicalSynopsis``. ``"modify data end"`` is
    printed after saving.

    Raises:
        ValueError: if a processed line contains no ``:`` or no ``;``.
    """
    # Load the original table; row 0 is skipped (presumably the header row).
    raw_data = util.get_raw_data()
    rows = [raw_data.row_values(i) for i in range(1, raw_data.nrows)]

    # Known body sites. NOTE(review): assumed to be a list/tuple of strings,
    # since it is searched by value -- confirm in util.get_type_list().
    type_dict = util.get_type_list()

    # Prefix every phenotype line with its body site and drop filler lines.
    synopses = []
    for row in rows:
        lines = util.get_list_by_enter(row[4])
        # Guard against an empty split result before peeking at the last item.
        if lines and lines[-1] == "":
            lines.pop()
        current_site = ''
        prefixed = []
        for line in lines:
            if line == ";snomedct:;;;;":
                continue
            first_colon = line.index(':')
            first_semicolon = line.index(';')
            if first_colon < first_semicolon and line[:first_colon] in type_dict:
                # The line already starts with a site: remember it for the
                # lines that follow.
                current_site = line[:first_colon]
            else:
                line = current_site + ":" + line
            prefixed.append(line)
        # Every kept line is preceded by a newline, including the first one.
        synopses.append("".join("\n" + line for line in prefixed))

    wb = xlwt.Workbook()
    sheet = wb.add_sheet('sheet1')
    headers = ('mimnumber', 'preferredTitle', 'inheritance', 'molecularBasis',
               'clinicalSynopsis')
    for col, title in enumerate(headers):
        sheet.write(0, col, title)
    for out_row, (row, synopsis) in enumerate(zip(rows, synopses), start=1):
        for col in range(4):
            sheet.write(out_row, col, row[col])
        sheet.write(out_row, 4, synopsis)
    wb.save("../final_data/omim_phenotype_modified.xls")
    print("modify data end")