def MongotoPTMannotation(proteinIDs, Tag_FTs, output_types, output_prefix):
    """Write PTM annotation records for the given proteins, one file per tag.

    A file ``<output_prefix>/<tag>.fasta`` is opened for every entry of
    ``Tag_FTs``.  Each protein id is looked up in the ``test.table``
    collection; for every tag the positions stored under the tag's
    space-separated feature keys are collected and, when at least one
    position has been collected, a record built by ``duolin`` or ``chunhui``
    is written to that tag's file.

    Args:
        proteinIDs: iterable of ``_id`` values to look up.
        Tag_FTs: sequence of tag strings; a tag may contain several
            space-separated feature keys.
        output_types: ``1`` selects ``duolin`` formatting, any other value
            selects ``chunhui``.
        output_prefix: output directory, created when it does not exist.

    NOTE(review): the original indentation was lost; the nesting below
    (one write per tag, inside the tag loop) is reconstructed -- confirm.
    """
    table = functions.connectMongoDB('test', 'table')
    if not os.path.exists(output_prefix):
        os.makedirs(output_prefix)

    handles = []
    try:
        for tag in Tag_FTs:
            handles.append(open(output_prefix + '/' + tag + '.fasta', 'w'))

        for protein_id in proteinIDs:
            ptm = table.find_one({'_id': protein_id})
            print(ptm)
            if ptm is None:
                # Unknown id: nothing to annotate (previously a TypeError).
                continue
            # NOTE(review): positions accumulate across tags for one protein,
            # as in the original -- confirm this is intended.
            ft_index = []
            for index, ft in enumerate(Tag_FTs):
                for new_ft in ft.split(" "):
                    if new_ft in ptm:
                        ft_index.extend(ptm[new_ft])
                if ft_index:
                    sequence = ptm['sequence']
                    if output_types == 1:  # DUOLIN
                        out_data = duolin(ptm['_id'], ft_index, sequence)
                    else:  # CHUNHUI
                        out_data = chunhui(ptm['_id'], sequence)
                    handles[index].write(out_data)
    finally:
        # Close every file that was opened, even if a lookup or write failed.
        for handle in handles:
            handle.close()
def main():
    """Count stored PTM sites per modification type and dump the totals.

    Walks every document of the ``uniprot.table`` collection, adds up the
    number of positions recorded under each tracked modification name,
    prints the totals and writes them as JSON to ``display.txt``.
    """
    tracked = [
        'Phosphoserine',
        'Phosphothreonine',
        'Phosphotyrosine',
        'N6-acetyllysine',
        'Omega-N-methylarginine',
        'N6-methyllysine',
        'N6,N6-dimethyllysine',
        'N6,N6,N6-trimethyllysine',
        'N-linked(GlcNAc)asparagine',
        'S-palmitoylcysteine',
        'Pyrrolidonecarboxylicacid',
        'Glycyllysineisopeptide(Lys-Gly)(interchainwithG-CterinSUMO)',
    ]
    totals = dict.fromkeys(tracked, 0)

    collection = functions.connectMongoDB('uniprot', 'table')
    for record in collection.find({}):
        for name in tracked:
            if name in record:
                totals[name] += len(record[name])

    print(totals)
    with open('display.txt', 'w') as outfile:
        json.dump(totals, outfile)
def ptmPosition(Tag_FTs):
    """Write alignment-relative PTM positions for every hit in ``format8.txt``.

    A file ``data/<tag>.txt`` is opened for every entry of ``Tag_FTs``.  Each
    line of ``format8.txt`` is split on whitespace; column 1 is used as the
    ``_id`` to look up in ``uniprot.table`` and columns 6, 8 and 9 are passed
    as integers to ``calc_psition`` together with the collected positions.
    The record returned by ``prepare`` is written to the tag's file.

    Args:
        Tag_FTs: sequence of feature keys, one output file per key.

    NOTE(review): the original indentation was lost; the nesting below
    (compute and write only when the tag is present in the document) is
    reconstructed -- confirm against the intended behaviour.
    """
    table = functions.connectMongoDB('uniprot', 'table')
    if not os.path.exists("data"):
        os.makedirs("data")

    handles = []
    try:
        for tag in Tag_FTs:
            handles.append(open('data/' + tag + '.txt', 'w'))

        with open("format8.txt") as fp:
            for line in fp:
                parse = line.split()
                if not parse:
                    # Blank line: nothing to look up.
                    continue
                ptm = table.find_one({'_id': parse[1]})
                if ptm is None:
                    # Hit is not in the table (previously a TypeError).
                    continue
                # NOTE(review): positions accumulate across tags for one
                # hit, as in the original -- confirm this is intended.
                ptm_pos = []
                for index, ft in enumerate(Tag_FTs):
                    if ft in ptm:
                        ptm_pos.extend(ptm[ft])
                        relative_positions = calc_psition(
                            int(parse[6]), int(parse[8]), int(parse[9]),
                            ptm_pos)
                        out_data = prepare(ptm['_id'], relative_positions)
                        handles[index].write(out_data)
    finally:
        # Close every file that was opened, even if parsing failed midway.
        for handle in handles:
            handle.close()
def get_ids(sp):
    """Return the ``_id`` of every ``uniprot.table`` document whose
    non-empty ``species`` field contains ``sp``."""
    collection = functions.connectMongoDB('uniprot', 'table')
    return [
        record['_id']
        for record in collection.find()
        if record['species'] and sp in record['species']
    ]
def ubiquitin(filepath,fts):
    """Parse a flat file into the ``uniprot.ubiquitinTable`` collection.

    The file is read line by line.  The first ``AC`` line of a record gives
    its accession list, ``FT`` lines give feature positions, and a ``//``
    line ends the record: features with at least one position are merged
    with the accessions and saved.

    A feature name is built by concatenating the tokens from index 4 on and
    removing dots.  When that name is one of ``specials`` the description
    continues on the next ``FT`` line, whose tokens (from index 1) are
    appended before the lookup.

    Args:
        filepath: path of the flat file to read.
        fts: dict mapping feature name to a list; only names present as keys
            are recorded.  The dict is copied, so the caller's object is no
            longer mutated.

    Note:
        After the first record the feature table is reset to the four fixed
        ubiquitin keys below, regardless of the keys passed in ``fts``.
    """
    table = functions.connectMongoDB('uniprot','ubiquitinTable')
    # Work on a private copy: the original appended into the caller's lists.
    fts = dict((key, list(value)) for key, value in fts.items())
    ac_flag = 0
    out_ac = []
    out_position = []
    out_data = dict()
    special = 0
    specials = ['Glycyllysineisopeptide(Lys-Gly)','Peptide(Met-Gly)(interchainwithG-Cter','Glycylserineester(Ser-Gly)','Glycylcysteinethioester(Cys-Gly)']
    with open(filepath) as fp:
        for line in fp:
            collapsed = ' '.join(line.split())
            data = collapsed.split(";")
            parsed_1 = data[0].split(" ")
            if parsed_1[0] == "AC" and ac_flag == 0:
                ac_flag = 1
                out_ac.append(parsed_1[1])
                # Everything between the first and the trailing ';' is a
                # further accession; strip the blank left by "; ".
                out_ac.extend(item.lstrip() for item in data[1:-1])
                out_data = {'ac': out_ac}
            elif parsed_1[0] == "FT":
                if len(parsed_1) > 4 and special == 0:
                    ft = re.sub('[.]', '', ''.join(parsed_1[4:]))
                    out_position = functions.remove_duplicates([parsed_1[2], parsed_1[3]])
                    if ft in specials:
                        # Description continues on the next FT line.
                        special = 1
                        continue
                    if ft in fts:
                        fts[ft].append(out_position)
                    out_position = []
                elif special == 1:
                    ft = re.sub('[.]', '', ft + ''.join(parsed_1[1:]))
                    if ft in fts:
                        fts[ft].append(out_position)
                    out_position = []
                    special = 0
            elif parsed_1[0] == '//':
                # Drop features with no positions and flatten each list of
                # position lists into a single list.
                fts = dict(
                    (key, list(itertools.chain.from_iterable(value)))
                    for key, value in fts.items() if len(value) > 0)
                out_data = functions.merge_two_dicts(out_data, fts)
                # NOTE(review): Collection.save was removed in PyMongo 4;
                # confirm the driver version this project targets.
                table.save(out_data)
                # Reset per-record state.
                fts = {'Glycyllysineisopeptide(Lys-Gly)(interchainwithG-Cterinubiquitin)':[],'Glycylserineester(Ser-Gly)(interchainwithG-Cterinubiquitin)':[],
                       'Peptide(Met-Gly)(interchainwithG-Cterinubiquitin)':[],'Glycylcysteinethioester(Cys-Gly)(interchainwithG-Cterinubiquitin)':[]}
                out_ac = []
                ac_flag = 0
                out_position = []
                # A pending continuation must not leak into the next record.
                special = 0
def db_to_fasta(output_prefix):
    """Dump every document of ``uniprot.table`` to ``<output_prefix>.fasta``.

    Each document's ``_id`` and ``sequence`` are formatted by ``prepareData``
    and the result is written to the output file.

    Args:
        output_prefix: output path without the ``.fasta`` extension.
    """
    entry = functions.connectMongoDB('uniprot', 'table')
    # ``with`` closes the file even when a lookup or write fails.
    with open(output_prefix + '.fasta', 'w') as out_file:
        for doc in entry.find({}):
            out_file.write(prepareData(doc['_id'], doc['sequence']))
def main():
    """Export the stored feature positions of every accession in ``acs``.

    For each accession the matching ``uniprot.ubiquitinTable`` document is
    fetched and all of its fields except ``_id`` and ``ac`` are copied into
    a dict keyed with ``'AC'``.  The list of dicts is written as JSON to
    ``positionInfo.txt``.

    An accession with no matching document yields an entry containing only
    ``'AC'`` (previously this raised ``TypeError``).
    """
    collection = functions.connectMongoDB('uniprot', 'ubiquitinTable')
    takeoff = ('_id', 'ac')
    write2file = []
    with open('positionInfo.txt', 'w') as outfile:
        for ac in acs:
            result = collection.find_one({'ac': ac})
            print(ac)
            temp = {'AC': ac}
            if result is not None:
                for key in result:
                    if key not in takeoff:
                        temp[key] = result[key]
            write2file.append(temp)
        json.dump(write2file, outfile)
def blast_output(output_prefix):
    """Write query and subject sequences of ``format8.txt`` hits to a file.

    Each line of ``format8.txt`` is split on whitespace.  Whenever column 0
    (the query id) changes, the query sequence from ``get_query_seq`` is
    written once.  For every line the subject (column 1) is looked up in
    ``uniprot.table``, its sequence is sliced to columns 8..9 (1-based,
    inclusive), passed to ``fillup`` with column 6 minus one, and written.
    Ids are left-aligned in a 14-character field.

    Args:
        output_prefix: output directory, created when missing; the result is
            written to ``<output_prefix>/input1.txt``.
    """
    table = functions.connectMongoDB('uniprot', 'table')
    if not os.path.exists(output_prefix):
        os.makedirs(output_prefix)

    current = " "
    # Both files are closed by ``with`` even if a lookup fails midway.
    with open(output_prefix + '/input1.txt', 'w') as out_file:
        with open("format8.txt") as fp:
            for line in fp:
                parse = line.split()
                if not parse:
                    # Blank line (previously an IndexError).
                    continue
                if current != parse[0]:
                    current = parse[0]
                    seq = get_query_seq(current)
                    out_file.write('{:14}'.format(current) + seq + "\n")
                p_doc = table.find_one({'_id': parse[1]})
                if p_doc is None:
                    # Subject not in the table (previously a TypeError).
                    continue
                start = int(parse[8]) - 1
                end = int(parse[9])
                p_seq = fillup(int(parse[6]) - 1, p_doc["sequence"][start:end])
                out_file.write('{:14}'.format(parse[1]) + p_seq + "\n")
def tableGeneration(filepath, fts):
    """Parse a flat file into the ``uniprot.table`` collection.

    The file is read line by line.  The first ``ID`` line of a record gives
    its ``_id``, the first ``AC`` line its accession list, ``FT`` lines give
    feature positions, and any line whose first token is longer than two
    characters is treated as sequence data.  A ``//`` line ends the record:
    features with at least one position are merged with the ids, the
    whitespace-free sequence is added, and the document is saved.

    A feature name is built by concatenating the tokens from index 4 on and
    removing dots.  When that name is exactly
    ``'Glycyllysineisopeptide(Lys-Gly)'`` the description continues on the
    next ``FT`` line, whose tokens (from index 1) are appended before the
    lookup.

    Args:
        filepath: path of the flat file to read.
        fts: dict mapping feature name to a list; only names present as keys
            are recorded.  The dict is copied, so the caller's object is no
            longer mutated.

    Note:
        After the first record the feature table is reset to the fixed key
        set below, regardless of the keys passed in ``fts``.
    """
    table = functions.connectMongoDB('uniprot', 'table')
    # Work on a private copy: the original appended into the caller's lists.
    fts = dict((key, list(value)) for key, value in fts.items())
    id_flag = 0
    ac_flag = 0
    out_ac = []
    out_position = []
    out_data = dict()
    special = 0
    sequence = ''
    with open(filepath) as fp:
        for line in fp:
            collapsed = ' '.join(line.split())
            data = collapsed.split(";")
            parsed_1 = data[0].split(" ")
            if parsed_1[0] == "ID" and id_flag == 0:
                id_flag = 1
                out_id = parsed_1[1]
            elif parsed_1[0] == "AC" and ac_flag == 0:
                ac_flag = 1
                out_ac.append(parsed_1[1])
                # Everything between the first and the trailing ';' is a
                # further accession; strip the blank left by "; ".
                out_ac.extend(item.lstrip() for item in data[1:-1])
                out_data = {'_id': out_id, 'ac': out_ac}
            elif parsed_1[0] == "FT":
                if len(parsed_1) > 4 and special == 0:
                    ft = re.sub('[.]', '', ''.join(parsed_1[4:]))
                    out_position = functions.remove_duplicates(
                        [parsed_1[2], parsed_1[3]])
                    if ft == 'Glycyllysineisopeptide(Lys-Gly)':
                        # Description continues on the next FT line.
                        special = 1
                        continue
                    if ft in fts:
                        fts[ft].append(out_position)
                    out_position = []
                elif special == 1:
                    ft = re.sub('[.]', '', ft + ''.join(parsed_1[1:]))
                    if ft in fts:
                        fts[ft].append(out_position)
                    out_position = []
                    special = 0
            # The first token is usually a two-letter code (RT, DR, FT, SQ,
            # ...); only sequence lines start with a longer token.
            elif len(parsed_1[0]) > 2:
                sequence += collapsed
            elif parsed_1[0] == '//':
                # Drop features with no positions and flatten each list of
                # position lists into a single list.
                fts = dict(
                    (key, list(itertools.chain.from_iterable(value)))
                    for key, value in fts.items() if len(value) > 0)
                out_data = functions.merge_two_dicts(out_data, fts)
                out_data['sequence'] = ''.join(sequence.split())
                # NOTE(review): Collection.save was removed in PyMongo 4;
                # confirm the driver version this project targets.
                table.save(out_data)
                # Reset per-record state.
                fts = {
                    'Phosphoserine': [],
                    'Phosphothreonine': [],
                    'Phosphotyrosine': [],
                    'N6-acetyllysine': [],
                    'Omega-N-methylarginine': [],
                    'N6-methyllysine': [],
                    'N6,N6-dimethyllysine': [],
                    'N6,N6,N6-trimethyllysine': [],
                    'N-linked(GlcNAc)asparagine': [],
                    'S-palmitoylcysteine': [],
                    'Pyrrolidonecarboxylicacid': [],
                    'Glycyllysineisopeptide(Lys-Gly)(interchainwithG-CterinSUMO)': [],
                    'Glycyllysineisopeptide(Lys-Gly)(interchainwithG-Cterinubiquitin)': []
                }
                out_ac = []
                id_flag = 0
                ac_flag = 0
                out_position = []
                sequence = ''
                # A pending continuation must not leak into the next record.
                special = 0
def tableGeneration(filepath, ptms):
    """Rebuild the ``uniprot.table`` collection from a flat file.

    The collection is dropped first.  The file is then read with
    ``readline`` and dispatched on the first token of each line:

    * ``ID``  - token 1 becomes the document ``_id``.
    * ``AC``  - accessions (split on ``;``) are appended to ``out_ac``.
    * ``OC``  - entries (split on ``;``) are appended to ``check`` and the
      base document ``{_id, ac, species}`` is (re)built.
    * ``FT``  - consumes the whole run of consecutive ``FT`` lines, joining
      continuation lines into one description, and records the position
      under every key of ``ptms`` that equals the cleaned description.
    * ``SQ``  - reads the sequence via ``seq_read``, merges the collected
      features into the document, saves it and resets per-record state.

    Args:
        filepath: path of the flat file to read.
        ptms: dict mapping modification name to a list.  It is mutated in
            place until it is rebound at the end of the first ``FT`` run.

    NOTE(review): the original indentation was lost; the nesting below is
    reconstructed from syntax -- confirm against the upstream file.
    """
    table = functions.connectMongoDB('uniprot', 'table')
    table.drop()
    out_id = ""
    out_ac = []
    out_position = []
    out_data = dict()
    sequence = ""
    temp_ptm = ""
    prev_fp_pos = 0
    check = []
    fp = open(filepath)
    line = fp.readline()
    while line:
        collapsed = ' '.join(line.split())
        data = collapsed.split(";")
        info = data[0].split(" ")
        tag = info[0]
        if tag == "ID":
            out_id = info[1]
        elif tag == "AC":
            out_ac.append(info[1])
            if len(data) > 2:
                # data[-1] is the text after the trailing ';' and is skipped.
                for x in range(1, len(data) - 1):
                    out_ac.append(data[x].lstrip())
        elif tag == "OC":
            check.append(info[1].lstrip())
            if len(data) > 2:
                for x in range(1, len(data) - 1):
                    check.append(data[x].lstrip())
            out_data = {"_id": out_id, "ac": out_ac, "species": check}
        elif tag == "FT":
            temp_ptm = ""
            out_position = functions.remove_duplicates([info[2], info[3]])
            temp_ptm = " ".join(info[4:])
            # Remember where the next line starts so that the first non-FT
            # line can be re-read by the outer loop.
            prev_fp_pos = fp.tell()
            line = ' '.join(fp.readline().split())
            info = line.split(" ")
            while info[0] == "FT":
                if len(info) > 3 and is_number(info[2]) and is_number(info[3]):
                    # A new feature starts: flush the previous description.
                    # Dots directly before ')' are removed first.
                    temp_ptm = re.sub('(\.*)\)', ')', temp_ptm)
                    for doc in ptms:
                        # Compare against the description cut at the first
                        # '.', '|' or ';'.
                        if doc == re.sub('[\.|\;].*', '', temp_ptm):
                            ptms.setdefault(doc, []).append(out_position)
                    temp_ptm = ""
                    out_position = functions.remove_duplicates(
                        [info[2], info[3]])
                    temp_ptm = " ".join(info[4:])
                else:
                    # Continuation line: extend the current description.
                    temp_ptm = temp_ptm + " ".join(info[1:])
                prev_fp_pos = fp.tell()
                line = ' '.join(fp.readline().split())
                info = line.split(" ")
            # Flush the last feature of the FT run.
            temp_ptm = re.sub('(\.*)\)', ')', temp_ptm)
            for doc in ptms:
                if doc == re.sub('[\.|\;].*', '', temp_ptm):
                    ptms.setdefault(doc, []).append(out_position)
            # Keep only modifications that were seen and flatten each list of
            # position lists into a single list.
            # NOTE(review): a second FT run within one record would flatten
            # already-flattened values again -- confirm FT lines are always
            # contiguous in the input.
            ptms = dict([(k, list(itertools.chain.from_iterable(v)))
                         for k, v in ptms.items() if len(v) > 0])
            # Step back so the outer readline() returns the first non-FT line.
            fp.seek(prev_fp_pos)
        elif tag == "SQ":
            sequence = seq_read(fp)
            out_data = functions.merge_two_dicts(out_data, ptms)
            out_data['sequence'] = sequence
            table.save(out_data)
            # Reset per-record state for the next entry.
            ptms = {
                'Phosphoserine': [],
                'Phosphothreonine': [],
                'Phosphotyrosine': [],
                'N6-acetyllysine': [],
                'Omega-N-methylarginine': [],
                'Dimethylated arginine': [],
                'Symmetric dimethylarginine': [],
                'Asymmetric dimethylarginine': [],
                'N6-methyllysine': [],
                'N6,N6-dimethyllysine': [],
                'N6,N6,N6-trimethyllysine': [],
                'N-linked (GlcNAc) asparagine': [],
                'S-palmitoyl cysteine': [],
                'Pyrrolidone carboxylic acid': [],
                'Glycyl lysine isopeptide (Lys-Gly)(interchain with G-Cter in SUMO)': [],
                'Glycyl lysine isopeptide (Lys-Gly)(interchain with G-Cter in ubiquitin)': []
            }
            out_data.clear()
            out_ac = []
            out_position = []
            sequence = ""
            check = []
        line = fp.readline()
    fp.close()
def blast_output(filepath,ptms,out_folder):
    """Generate display files from a BLAST text report.

    One file ``<out_folder>/<ptm>.txt`` is opened per entry of ``ptms`` and
    the combined alignment goes to ``<out_folder>/blast_output.txt``.  The
    report is read with ``readline``; a line whose first token is
    ``Query_1`` starts an alignment section that is consumed up to a line
    starting with ``Lambda`` (or end of file).  Within a section the query
    pieces are concatenated into ``q_seq`` and each subject's pieces into
    ``output[<subject id>]``.

    Args:
        filepath: path of the BLAST report.
        ptms: iterable of modification names, one output file each.
        out_folder: existing directory that receives the output files.

    NOTE(review): the original indentation was lost; the nesting below is
    reconstructed from syntax -- confirm against the upstream file.
    NOTE(review): the files opened here are not closed if an exception is
    raised before the end of the function.
    """
    file = []
    for ptm in ptms:
        file.append(open(out_folder+'/'+ptm+'.txt','w'))
    table = functions.connectMongoDB('uniprot','table')
    out_file = open(out_folder+'/blast_output.txt','w')
    seqs_start_position = 0
    seqs_end_position = 0
    output = dict()
    ac_deletions = dict()
    insertions = dict()
    ab_ptms = dict()
    fp = open(filepath)
    line = fp.readline()
    # Incremented once per Query_1 line, i.e. per alignment chunk.
    sequence_pad = -1
    while line:
        collapsed = ' '.join(line.split())
        data = collapsed.split(" ")
        tag = data[0]
        # 1. read blast result sequences
        if tag == 'Query_1':  # blast result start
            sequence_pad += 1
            temp_q_end = int(data[3])
            q_seq = data[2]
            seqs_start_position = line.find(data[2])  # start column in the text line
            seqs_end_position = line.find(data[3]) - 2  # end column in the text line
            seqs_end_index = dict()  # sequence end index per subject
            seqs_start_index = dict()  # sequence start index per subject
            line = fp.readline()
            collapsed = ' '.join(line.split())
            data = collapsed.split(" ")
            prev_ac = ""
            while line and data[0] != "Lambda":
                if data[0] == 'Query_1':  # next chunk of the query
                    sequence_pad += 1
                    seqs_start_position = line.find(data[2])
                    seqs_end_position = line.find(data[3]) - 2
                    # Only append when this chunk starts right after the
                    # previous one ended.
                    if temp_q_end == int(data[1])-1:
                        temp_q_end = int(data[3])
                        q_seq += data[2]
                    else:
                        print("special case!")
                elif len(data) == 4 and is_int(data[1]) and is_int(data[3]):  # subject line
                    if data[0] in output:  # subject already seen: append chunk
                        prev_ac = data[0]
                        seqs_end_index[data[0]] = int(data[3])
                        output[data[0]] += line[seqs_start_position:seqs_end_position]
                    else:  # first chunk of this subject
                        prev_ac = data[0]
                        seqs_start_index[data[0]] = int(data[1])
                        seqs_end_index[data[0]] = int(data[3])
                        output[data[0]] = reposition_seq(line[seqs_start_position:seqs_end_position],sequence_pad)
                elif data[0] == '\\':
                    # A line starting with a backslash belongs to the last
                    # subject seen; get_deletions reads on from ``fp``.
                    if prev_ac in ac_deletions:
                        delete = get_deletions(fp,sequence_pad)
                        ac_deletions[prev_ac] += delete
                    else:
                        delete = get_deletions(fp,sequence_pad)
                        ac_deletions[prev_ac] = delete
                line = fp.readline()
                collapsed = ' '.join(line.split())
                data = collapsed.split(" ")
            # preprocess data: pad every subject to the query length and mark
            # blanks with '.'
            for id in output:
                output[id] = output[id].ljust(len(q_seq))
                output[id] = output[id].replace(" ",".")
                insertions[id] = get_inserts(output[id])
            # NOTE(review): the result of this expression is discarded, and
            # dict.iteritems() exists only on Python 2 -- confirm whether it
            # was meant to be assigned back to ``insertions``.
            dict((k, v) for k, v in insertions.iteritems() if v)
            # ptm position is relative to the line in the file, not the sequence
            for counter, ptm in enumerate(ptms):
                # generate the ptm position for display
                ab_ptms = get_ptms(ptm,table,seqs_start_index,seqs_end_index,insertions,ac_deletions,output)  #TODO check if ids are right
                display_ptm(ab_ptms,file[counter],output)  #TODO one more ids
            identities = get_identities(out_folder)
            display_output(q_seq,output,identities,out_file)  #TODO ids here
        line = fp.readline()
    out_file.close()
    fp.close()
    for index, ptm in enumerate(ptms):
        file[index].close()