def ubiquitin(filepath,fts): table = functions.connectMongoDB('uniprot','ubiquitinTable') # Open a file ac_flag = 0 out_ac = [] out_position = [] out_data = dict() special = 0 specials = ['Glycyllysineisopeptide(Lys-Gly)','Peptide(Met-Gly)(interchainwithG-Cter','Glycylserineester(Ser-Gly)','Glycylcysteinethioester(Cys-Gly)'] with open(filepath) as fp: for line in fp: collapsed = ' '.join(line.split()) data = collapsed.split(";") parsed_1 = data[0].split(" ") if parsed_1[0] == "AC" and ac_flag == 0: ac_flag = 1 out_ac.append(parsed_1[1]) if len(data) > 2: for x in range(1, len(data)-1): out_ac.append(data[x]) out_data = {'ac':out_ac} ##[go,interpro,pfam,prosite,smart,supfam] elif parsed_1[0] == "FT": if len(parsed_1) > 4 and special == 0: ft = '' for i in range(4,len(parsed_1)): ft = ft + parsed_1[i] ft = re.sub('[.]', '', ft) out_position = functions.remove_duplicates([parsed_1[2],parsed_1[3]]) if ft in specials: special = 1 continue if ft in fts: fts.setdefault(ft, []).append(out_position) out_position = [] elif special == 1: for i in range(1,len(parsed_1)): ft = ft + parsed_1[i] ft = re.sub('[.]', '', ft) if ft in fts: fts.setdefault(ft, []).append(out_position) out_position = [] special = 0 elif parsed_1[0] == '//': fts = dict( [(k,list(itertools.chain.from_iterable(v))) for k,v in fts.items() if len(v)>0]) #delete empty FTs from dictionary ##list(itertools.chain.from_iterable(v)) format out_data = functions.merge_two_dicts(out_data,fts) #print(out_data) table.save(out_data) fts = {'Glycyllysineisopeptide(Lys-Gly)(interchainwithG-Cterinubiquitin)':[],'Glycylserineester(Ser-Gly)(interchainwithG-Cterinubiquitin)':[], 'Peptide(Met-Gly)(interchainwithG-Cterinubiquitin)':[],'Glycylcysteinethioester(Cys-Gly)(interchainwithG-Cterinubiquitin)':[]} ##rewind out_ac = [] ac_flag = 0 out_position = [] fp.close()
def tableGeneration(filepath, fts):
    """Parse a UniProt text-format flat file and store one document per entry
    (_id, accessions, PTM feature positions, sequence) into the Mongo
    collection 'uniprot.table'.

    filepath -- path to the UniProt flat file
    fts      -- dict mapping PTM feature names to lists of positions; used as
                the per-record accumulator and re-seeded after every record
    Relies on module-level names: functions, re, itertools.
    """
    # NOTE(review): SOURCE arrived with all indentation stripped; the nesting
    # below is reconstructed from statement order — confirm against the
    # original file.
    table = functions.connectMongoDB('uniprot', 'table')
    # Open a file
    id_flag = 0        # set once the ID line of a record is consumed
    ac_flag = 0        # set once the first AC line of a record is consumed
    out_ac = []
    out_position = []  # [start, end] of the feature currently being parsed
    out_data = dict()
    special = 0        # 1 while a feature name continues on the next FT line
    sequence = ''      # raw sequence text accumulated from long-tag lines
    with open(filepath) as fp:
        for line in fp:
            # Collapse runs of whitespace so positional splitting is stable.
            collapsed = ' '.join(line.split())
            data = collapsed.split(";")
            parsed_1 = data[0].split(" ")
            if parsed_1[0] == "ID" and id_flag == 0:
                id_flag = 1
                out_id = parsed_1[1]
            elif parsed_1[0] == "AC" and ac_flag == 0:
                # First AC line of the record: collect all accession numbers.
                ac_flag = 1
                out_ac.append(parsed_1[1])
                if len(data) > 2:
                    for x in range(1, len(data) - 1):
                        out_ac.append(data[x])
                out_data = {'_id': out_id, 'ac': out_ac}
            ## [go, interpro, pfam, prosite, smart, supfam]
            elif parsed_1[0] == "FT":
                if len(parsed_1) > 4 and special == 0:
                    # Feature line: tokens 2 and 3 are positions, 4.. the name.
                    ft = ''
                    for i in range(4, len(parsed_1)):
                        ft = ft + parsed_1[i]
                    ft = re.sub('[.]', '', ft)  # strip dots from the name
                    out_position = functions.remove_duplicates(
                        [parsed_1[2], parsed_1[3]])
                    if ft == 'Glycyllysineisopeptide(Lys-Gly)':
                        # Name wraps onto the next FT line; finish it there.
                        special = 1
                        continue
                    if ft in fts:
                        fts.setdefault(ft, []).append(out_position)
                        out_position = []
                elif special == 1:
                    # Continuation line: `ft` still holds the truncated name
                    # from the previous iteration; append the remaining tokens.
                    for i in range(1, len(parsed_1)):
                        ft = ft + parsed_1[i]
                    ft = re.sub('[.]', '', ft)
                    if ft in fts:
                        fts.setdefault(ft, []).append(out_position)
                        out_position = []
                    special = 0
            ## parsed_1[0] is usually RT, DR, FT, or SQ etc...
            ## only the sequence part has a tag longer than 2 characters
            elif len(parsed_1[0]) > 2:
                sequence += collapsed
            elif parsed_1[0] == '//':
                # End of record: drop empty feature lists and flatten the rest.
                fts = dict([(k, list(itertools.chain.from_iterable(v)))
                            for k, v in fts.items() if len(v) > 0])  # delete empty FTs from dictionary
                ## list(itertools.chain.from_iterable(v)) format
                out_data = functions.merge_two_dicts(out_data, fts)
                sequence = ''.join(sequence.split())  # strip internal whitespace
                out_data['sequence'] = sequence
                # print(out_data)
                # NOTE(review): Collection.save() is deprecated in pymongo>=3
                # and removed in 4 — confirm the driver version in use.
                table.save(out_data)
                # Re-seed the per-record PTM accumulator.
                fts = {
                    'Phosphoserine': [],
                    'Phosphothreonine': [],
                    'Phosphotyrosine': [],
                    'N6-acetyllysine': [],
                    'Omega-N-methylarginine': [],
                    'N6-methyllysine': [],
                    'N6,N6-dimethyllysine': [],
                    'N6,N6,N6-trimethyllysine': [],
                    'N-linked(GlcNAc)asparagine': [],
                    'S-palmitoylcysteine': [],
                    'Pyrrolidonecarboxylicacid': [],
                    'Glycyllysineisopeptide(Lys-Gly)(interchainwithG-CterinSUMO)': [],
                    'Glycyllysineisopeptide(Lys-Gly)(interchainwithG-Cterinubiquitin)': []
                }
                ## rewind per-record state
                out_ac = []
                id_flag = 0
                ac_flag = 0
                out_position = []
                sequence = ''
    fp.close()  # redundant: the with-statement already closed fp
def tableGeneration(filepath, ptms):
    """Parse a UniProt text-format flat file and store one document per entry
    (_id, accessions, species lineage, PTM positions, sequence) into the
    Mongo collection 'uniprot.table', which is dropped and rebuilt.

    filepath -- path to the UniProt flat file
    ptms     -- dict mapping PTM names to lists of positions; used as the
                per-record accumulator and re-seeded after every record
    Relies on module-level names: functions, is_number, seq_read, re, itertools.
    NOTE(review): this definition shadows an earlier tableGeneration in the
    same file — confirm which version callers are meant to get.
    """
    # NOTE(review): SOURCE arrived with all indentation stripped; the nesting
    # below is reconstructed from statement order — confirm against the
    # original file.
    table = functions.connectMongoDB('uniprot', 'table')
    table.drop()  # rebuild the collection from scratch
    out_id = ""
    out_ac = []
    out_position = []  # [start, end] of the feature currently being parsed
    out_data = dict()
    sequence = ""
    temp_ptm = ""      # feature name being stitched from wrapped FT lines
    prev_fp_pos = 0    # file offset used to rewind after FT look-ahead
    check = []         # species / taxonomy (OC) values
    fp = open(filepath)
    line = fp.readline()
    while line:
        # Collapse runs of whitespace so positional splitting is stable.
        collapsed = ' '.join(line.split())
        data = collapsed.split(";")
        info = data[0].split(" ")
        tag = info[0]
        if tag == "ID":
            out_id = info[1]
        elif tag == "AC":
            # Accession line: collect all semicolon-separated accessions.
            out_ac.append(info[1])
            if len(data) > 2:
                for x in range(1, len(data) - 1):
                    out_ac.append(data[x].lstrip())
        elif tag == "OC":
            # Organism-classification line: collect lineage terms.
            check.append(info[1].lstrip())
            if len(data) > 2:
                for x in range(1, len(data) - 1):
                    check.append(data[x].lstrip())
            out_data = {"_id": out_id, "ac": out_ac, "species": check}
        elif tag == "FT":
            # Feature block: read ahead through all consecutive FT lines,
            # stitching wrapped feature names together.
            temp_ptm = ""
            out_position = functions.remove_duplicates([info[2], info[3]])
            temp_ptm = " ".join(info[4:])
            prev_fp_pos = fp.tell()
            line = ' '.join(fp.readline().split())
            info = line.split(" ")
            while info[0] == "FT":
                if len(info) > 3 and is_number(info[2]) and is_number(info[3]):
                    # A new feature starts here: flush the pending one first.
                    # NOTE(review): unraw regex strings kept as-is.
                    temp_ptm = re.sub('(\.*)\)', ')', temp_ptm)  # drop dots before ')'
                    for doc in ptms:
                        # Match after stripping everything from the first '.' or ';'.
                        if doc == re.sub('[\.|\;].*', '', temp_ptm):
                            ptms.setdefault(doc, []).append(out_position)
                    temp_ptm = ""
                    out_position = functions.remove_duplicates(
                        [info[2], info[3]])
                    temp_ptm = " ".join(info[4:])
                else:
                    # Continuation line: append to the pending feature name.
                    temp_ptm = temp_ptm + " ".join(info[1:])
                prev_fp_pos = fp.tell()
                line = ' '.join(fp.readline().split())
                info = line.split(" ")
            # Flush the last pending feature of the block.
            temp_ptm = re.sub('(\.*)\)', ')', temp_ptm)
            for doc in ptms:
                if doc == re.sub('[\.|\;].*', '', temp_ptm):
                    ptms.setdefault(doc, []).append(out_position)
            # Drop empty PTMs and flatten each feature's position lists.
            ptms = dict([(k, list(itertools.chain.from_iterable(v)))
                         for k, v in ptms.items() if len(v) > 0])
            # Rewind so the first non-FT line is re-read by the outer loop.
            fp.seek(prev_fp_pos)
        elif tag == "SQ":
            # Sequence block ends the record: assemble and save the document.
            sequence = seq_read(fp)
            out_data = functions.merge_two_dicts(out_data, ptms)
            out_data['sequence'] = sequence
            # NOTE(review): Collection.save() is deprecated in pymongo>=3
            # and removed in 4 — confirm the driver version in use.
            table.save(out_data)
            ## rewind per-record state
            ptms = {
                'Phosphoserine': [],
                'Phosphothreonine': [],
                'Phosphotyrosine': [],
                'N6-acetyllysine': [],
                'Omega-N-methylarginine': [],
                'Dimethylated arginine': [],
                'Symmetric dimethylarginine': [],
                'Asymmetric dimethylarginine': [],
                'N6-methyllysine': [],
                'N6,N6-dimethyllysine': [],
                'N6,N6,N6-trimethyllysine': [],
                'N-linked (GlcNAc) asparagine': [],
                'S-palmitoyl cysteine': [],
                'Pyrrolidone carboxylic acid': [],
                'Glycyl lysine isopeptide (Lys-Gly)(interchain with G-Cter in SUMO)': [],
                'Glycyl lysine isopeptide (Lys-Gly)(interchain with G-Cter in ubiquitin)': []
            }
            out_data.clear()
            out_ac = []
            out_position = []
            sequence = ""
            check = []
        line = fp.readline()
    fp.close()
# NOTE(review): fragment of a stock-info crawler routine whose `def`,
# enclosing `if`, and per-stock `for` headers lie outside this chunk;
# indentation below is reconstructed — confirm nesting against the original.
            # Append this stock's names (and de-marked variants) to the dictionary file.
            with open(Dic_File, 'a') as f:
                f.write(stock_info['公司名称:'] + "\n")
                f.write(stock_info['公司简称:'] + "\n")
                for n in stock_info['证券简称更名历史:']:
                    f.write(n + '\n')
                    # Also emit the name with A/B/G/N share-class markers stripped.
                    if 'A' in n or 'B' in n or 'G' in n or 'N' in n:
                        f.write(
                            n.replace('A', '').replace('B', '').replace(
                                'G', '').replace('N', '') + '\n')
            # Flatten the rename history to one space-joined string for CSV output.
            stock_info['证券简称更名历史:'] = ' '.join(stock_info['证券简称更名历史:'])
            stock_info_csv.append(stock_info)
            i += 1
            print("%s/%s" % (i, len(Stock_Codes)), stock_info)
            # Randomized 10-20 s delay to throttle crawling.
            time.sleep(10 + random.random() * 10)
        ## Re-read the dictionary file and remove duplicate lines.
        remove_duplicates(Dic_File)
        ## Write the CSV file (append; header only on first creation).
        csv_exist = os.path.exists(Csv_File)
        with open(Csv_File, 'a') as f:
            header = ['code', '公司名称:', '公司简称:', '证券简称更名历史:']
            f_csv = csv.DictWriter(f, header)
            if not csv_exist:
                f_csv.writeheader()
            f_csv.writerows(stock_info_csv)
        remove_duplicates(Csv_File, csv=True)
    else:
        # No crawl requested: regenerate sub-dictionaries from cached data.
        from functions import getdic
        getdic("stock_name.txt")
        print("Complete 1/2. See ./sub_dics/stock_name.txt")
        getdic("indus_name.txt")
        print("Complete 2/2. See ./sub_dics/indus_name.txt")
# NOTE(review): this bare `return` terminates an enclosing function whose
# `def` line is outside this chunk; its indentation is reconstructed.
    return


if __name__ == "__main__":
    ## Parse command-line arguments; only "-update" is recognized.
    Need_Craw = False
    args = sys.argv[1:]
    for arg in args:
        if arg == "-update":
            Need_Craw = True
        else:
            print("无效的参数:" + arg)
            sys.exit(1)
    if Need_Craw:
        ## Crawl the glossary from the web.
        get_glossary()
        ## Write the dictionary file, dropping duplicate lines.
        remove_duplicates(Dic_File)
        ## Write the CSV file (append; header only on first creation).
        csv_exist = os.path.exists(Csv_File)
        with open(Csv_File, 'a') as f:
            header = ['iterm', 'link', 'father', 'tag']
            f_csv = csv.DictWriter(f, header)
            if not csv_exist:
                f_csv.writeheader()
            f_csv.writerows(glossary)
    else:
        # No crawl requested: regenerate the sub-dictionary from cached data.
        from functions import getdic
        getdic("glossary.txt")
        print("Complete. See ./sub_dics/glossary.txt")