def GIID2Seq_HTTP(uniprot_url, idList, addedIDset, fpout):#{{{
    """Retrieve protein sequences by GI number through Entrez and write them to fpout.

    Every ID in idList that is not yet in addedIDset is searched in the Entrez
    "protein" database; the first hit is fetched in FASTA format and rewritten
    as a ">gi|<giid> <annotation>" header line followed by the sequence line.

    Args:
        uniprot_url: Not used by this function.
        idList: Sequence of GI identifiers to retrieve.
        addedIDset: Set of IDs that have already been written. It is updated
            in place, and only after the record has been written successfully.
        fpout: Writable file-like object receiving the FASTA records.

    Returns:
        The number of records retrieved and written by this call.

    IDs whose lookup raises IndexError (e.g. an empty search result),
    urllib2.HTTPError or urllib2.URLError are skipped silently (best effort).
    """
    isShowProgress = g_params['isShowProgress']
    cntRetrieved = 0
    for cnt, giid in enumerate(idList, 1):
        if isShowProgress and (cnt % 100) == 1:
            print >> sys.stderr, "Running %d ..." % (cnt)
        if giid in addedIDset:
            continue
        try:
            # Close both Entrez handles explicitly so that a long ID list does
            # not accumulate open connections.
            search_handle = Entrez.esearch(db="protein", term="%s" % (giid))
            try:
                rec = Entrez.read(search_handle)
            finally:
                search_handle.close()
            handle = Entrez.efetch(db="protein", id=rec['IdList'][0],
                                   rettype="fasta")
            try:
                record_str = handle.read()
            finally:
                handle.close()
            if record_str:
                (tmpid, tmpanno, tmpseq) = myfunc.ExtractFromSeqWithAnno(
                    record_str)
                fpout.write(">gi|%s %s\n" % (giid, tmpanno))
                fpout.write("%s\n" % (tmpseq))
                # Mark the ID as done only once it has really been written.
                addedIDset.add(giid)
                cntRetrieved += 1
        except (IndexError, urllib2.HTTPError, urllib2.URLError):
            # Best effort: an ID that cannot be resolved is simply skipped.
            pass
    return cntRetrieved
def OutputPfamFastaFile(seqidList, pfamid, seqdbDict, hdl_seqdb, extra_desp_dict, outpath): #{{{ outfile = "%s%s%s%s" % (outpath, os.sep, pfamid, g_params['out_ext']) fpout = myfunc.myopen(outfile, None, "w", True) isAddExtraDescription = False if len(extra_desp_dict) > 0: isAddExtraDescription = True for seqid in seqidList: if seqid.find("UniRef") != -1: try: ss = seqid.split("_") seqid = ss[1] except IndexError: pass if g_params['isBigmem']: try: record = seqdbDict[seqid] (tmpanno, tmpseq) = record if isAddExtraDescription: try: extraanno = extra_desp_dict[seqid] except KeyError: extraanno = "" if extraanno != "": tmpanno = "%s %s" % (extraanno, tmpanno) fpout.write(">%s\n%s\n" % (tmpanno, tmpseq)) except KeyError: print >> sys.stderr, "seqid %s not found in seqdb" % (seqid) else: record = hdl_seqdb.GetRecord(seqid) if record: if isAddExtraDescription: try: extraanno = extra_desp_dict[seqid] except KeyError: extraanno = "" if extraanno == "": fpout.write("%s" % (record)) else: (tmpseqid, tmpanno, tmpseq) = myfunc.ExtractFromSeqWithAnno(record) tmpanno = "%s %s" % (extraanno, tmpanno) fpout.write(">%s\n%s\n" % (tmpanno, tmpseq)) else: fpout.write("%s" % (record)) else: print >> sys.stderr, "seqid %s not found in seqdb" % (seqid) myfunc.myclose(fpout) if g_params['isGzip']: cmd = ["gzip", "-N", "-f", outfile] print " ".join(cmd) subprocess.check_call(cmd, stdout=open(os.devnull, "w"))
def GetTopoDict(topoDB, idList):
    """Build a dictionary seqid -> topology string for the IDs in idList.

    Args:
        topoDB: Name of the topology database opened with myfunc.MyDB.
        idList: Sequence IDs to look up.

    Returns:
        A dict holding the third field returned by
        myfunc.ExtractFromSeqWithAnno for every ID that has a non-empty
        record. An empty dict is returned when the database cannot be opened.
    """
    topo_db = myfunc.MyDB(topoDB)
    if topo_db.failure:
        return {}

    seqid2topo = {}
    for seqid in idList:
        raw_record = topo_db.GetRecord(seqid)
        if not raw_record:
            # IDs without a record are left out of the result.
            continue
        (_, _, topology) = myfunc.ExtractFromSeqWithAnno(raw_record)
        seqid2topo[seqid] = topology

    topo_db.close()
    return seqid2topo
def MatchMSATopo_using_topodb( msafile, topodb, isIgnoreBadseq, #{{{
        method_match, outfile):
    """Map topologies onto the aligned sequences of an MSA file.

    For every record read from msafile, the topology is looked up in topodb
    by the record's seqid and matched to the aligned sequence with
    MatchSeqToTopo. The result is written in FASTA-like form (">description"
    line followed by the matched topology).

    Args:
        msafile: Alignment file read with myfunc.ReadFastaByBlock.
        topodb: Topology database opened with myfunc.MyDB.
        isIgnoreBadseq: When true, records whose matched topology equals
            "BADSEQ" are not written.
        method_match: Passed through to MatchSeqToTopo.
        outfile: Output file name, handed to myfunc.myopen with sys.stdout as
            the default handle.

    Returns:
        0 on success, 1 if either input could not be opened.
    """
    hdl_topo = myfunc.MyDB(topodb)
    if hdl_topo.failure:
        return 1
    hdl = myfunc.ReadFastaByBlock(msafile)
    if hdl.failure:
        # The topology database is already open at this point; release it.
        hdl_topo.close()
        return 1

    fpout = myfunc.myopen(outfile, sys.stdout, "w", False)
    try:
        recordList = hdl.readseq()
        while recordList is not None:
            for rd in recordList:
                topowithanno = hdl_topo.GetRecord(rd.seqid)
                if topowithanno is not None:
                    (topoid, topoanno, topo) = myfunc.ExtractFromSeqWithAnno(topowithanno)
                else:
                    print("topo not found for ID %s" % (rd.seqid), file=sys.stderr)
                    # An empty topology is still passed to MatchSeqToTopo.
                    topo = ""
                matchedtopo = MatchSeqToTopo(rd.seq, topo, method_match)
                if not (matchedtopo == "BADSEQ" and isIgnoreBadseq):
                    print(">%s" % (rd.description), file=fpout)
                    print("%s" % (matchedtopo), file=fpout)
            recordList = hdl.readseq()
    finally:
        # Release all handles even when matching or writing fails.
        myfunc.myclose(fpout)
        hdl.close()
        hdl_topo.close()
    return 0
def UniprotID2Seq_HTTP(uniprot_url, idList, addedIDset, fpout): #{{{
    """Download sequences by UniProt ID over HTTP and write them to fpout.

    For every ID in idList that is not yet in addedIDset, the document at
    "<uniprot_url><uniprotid>.fasta" is downloaded and rewritten as a
    ">uniprotid annotation" header line followed by the sequence line.

    Args:
        uniprot_url: URL prefix to which "<uniprotid>.fasta" is appended.
        idList: Sequence of UniProt identifiers to retrieve.
        addedIDset: Set of IDs that have already been written; updated in
            place after each successful write.
        fpout: Writable file-like object receiving the FASTA records.

    Returns:
        The number of records retrieved and written by this call.

    IDs whose download raises urllib2.HTTPError or urllib2.URLError are
    reported on stderr and skipped.
    """
    cntRetrieved = 0
    isShowProgress = g_params['isShowProgress']
    for cnt, uniprotid in enumerate(idList, 1):
        if isShowProgress and (cnt % 100) == 1:
            print >> sys.stderr, "Running %d ..." % (cnt)
        if uniprotid in addedIDset:
            continue
        url = uniprot_url + uniprotid + ".fasta"
        try:
            response = urllib2.urlopen(url)
            try:
                data = response.read()
            finally:
                # Do not leave one open connection behind per ID.
                response.close()
        except urllib2.HTTPError:
            # HTTPError is a subclass of URLError, so it must be handled first.
            print >> sys.stderr, "HTTPError for ID %s" % (uniprotid)
            continue
        except urllib2.URLError:
            print >> sys.stderr, "URLError for ID %s" % (uniprotid)
            continue
        if data:
            (tmpid, tmpanno, tmpseq) = myfunc.ExtractFromSeqWithAnno(data)
            fpout.write(">%s %s\n" % (uniprotid, tmpanno))
            fpout.write("%s\n" % (tmpseq))
            addedIDset.add(uniprotid)
            cntRetrieved += 1
    return cntRetrieved
def main(): #{{{
    """Scratch driver holding a series of small, independent experiments.

    Every experiment is wrapped in its own `if 0:` / `if 1:` guard. All of
    them are disabled (`if 0:`) except the last one, which demonstrates
    calling external programs through subprocess without a shell.

    Most experiments read their input file (and further arguments) from
    sys.argv.

    NOTE(review): the nesting of a few statements was reconstructed from
    context; confirm against the original layout before re-enabling any of
    the disabled experiments.
    """
    # Experiment: gapless comparison of two hard-coded topology strings.
    if 0: #{{{
        strTop1 = "---MMMM-----i-i-i---MMM----MMMM-ooo"
        strTop2 = "----MMMM-----i-ii-----MMM---MMM--oo"
        strProtein1 = "id1"
        strProtein2 = "id2"
        fpLog = sys.stdout
        class_gapless, num1_gapless, num2_gapless = ct.CompareToposGaplesslyNew(
            strTop1, strTop2, strProtein1, strProtein2, fpLog)
        # Note: calling the int, float, string will not change their original value
        # calling the dict, list will change their original value
        print "strTop1:", strTop1
        print "strTop2:", strTop2
    #}}}
    # Experiment: print the function name and the name of this file.
    if 0: #{{{
        PrintFuncName()
        print("this file name is: %s" % __file__)
    #}}}
    # Experiment: read a whole file with readlines().
    if 0: #{{{
        # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta"
        filename = sys.argv[1]
        print filename
        fp = open(filename, "r")
        lines = fp.readlines()
        fp.close()
    #}}}
    # Experiment: read a file in fixed-size blocks until read() returns "".
    if 0: #{{{
        # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta"
        filename = sys.argv[1]
        print filename
        BLOCK_SIZE = 100000
        fp = open(filename, "r")
        buff = fp.read(BLOCK_SIZE)
        while buff:
            buff = fp.read(BLOCK_SIZE)
        fp.close()
    #}}}
    # Experiment: read a file line by line with readline().
    if 0: #{{{
        # filename="/nanjiang/data/blastdb/uniprot_KW181_idt50.fasta"
        filename = sys.argv[1]
        print filename
        fp = open(filename, "r")
        line = fp.readline()
        while line:
            line = fp.readline()
        fp.close()
    #}}}
    # Experiment: parse FASTA block-wise with myfunc.ReadFastaFromBuffer and
    # echo fields 1 and 2 of every record to stdout.
    if 0: #{{{
        try:
            BLOCK_SIZE = 100000
            infile = sys.argv[1]
            fpin = open(infile, 'rb')
            unprocessedBuffer = ""
            isEOFreached = False
            while 1:
                buff = fpin.read(BLOCK_SIZE)
                # A short read is taken as the end of the file.
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                # Prepend what the previous round could not consume.
                buff = unprocessedBuffer + buff
                recordList = []
                unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                    buff, recordList, isEOFreached)
                if len(recordList) > 0:
                    for record in recordList:
                        sys.stdout.write(">%s\n" % record[1])
                        sys.stdout.write("%s\n" % record[2])
                if isEOFreached == True:
                    break
            fpin.close()
        except IOError:
            raise
    #}}}
    # Experiment: read FASTA with myfunc.ReadFasta_without_id and echo it.
    if 0: #{{{
        try:
            infile = sys.argv[1]
            (annoList, seqList) = myfunc.ReadFasta_without_id(infile)
            for i in xrange(len(seqList)):
                sys.stdout.write(">%s\n" % annoList[i])
                sys.stdout.write("%s\n" % seqList[i])
        except IOError:
            raise
    #}}}
    # Experiment: run IsDuplicatedByHHSearch on a hard-coded hhr file.
    if 0: #{{{
        hhrfile = "hhsearch/A1RZ92-Q74DY9.hhr"
        if IsDuplicatedByHHSearch(hhrfile):
            print "yes"
    #}}}
    # Experiment: alignment factor of a hard-coded pairwise alignment.
    # seq1/seq2 are assigned twice; only the second pair reaches the call.
    if 0: #{{{
        import pairlistwithfamid2pairaln_by_msa
        seq1 = "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------MLSSTATTMLRAGVSRSSGALQPMLLRSAACPCSPFSMNTKLSQPTSV-----RPLSTSPSALVLRFRAQQQAQLAQQQLRRASSSSSSSSSSTRPRSDAELDANAAEAAAAAQSAAHAGEPVLDWNTFFKLRKTRRRVQLAFSVIMTLITSGAGGAVLSTGVADAMVAQVPLEPMFAVGLMTASFGALGWLMGPAMGGMVFNALKSKYRGQMEIKEGQFFARIKKHRVDPSASSMGNPVPDFYGEKISSVAGYRQWLKDQRAFNKKRTTFV"
        seq2 = "MDILLAVLEQGFIFSIVCFGVYITYKILDFPDLSVDGTFPLGAAVAAAFLVKGYSPVLSSLAALVAGAIAGGITGILHVKFKITNLLSGILVMVGLYSINLRIMGKSNIPLFNKIHLFSDTMNPIIIITVFLLICKITLDLFLKTKAGFILKATGDNEQLVLSLGVNKDLVKIMGLMLSNALVALGGALMAQYQGFSDVGMGTGIVVMGLASVIIGESLFGRIKALNATTRVLLGALVYKLSVSI---ALTVGLAP-------TDLKLVTAIIVVIALSLNKNPLKIITKQKTKEGGIL------NASNTKSAQSVQ-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"
        seq1 = "---------------------------------------------------------------------------------------------------------------------------------------MALSSLFFTASALLLMFLAFLGGARNSNPLDRIYWLEAATGNIPGAPALSRWTYWNLCAVNSEGHNECGKSYPDYPFDPPSHRNFNTHVNIPAAFIGTRHYFLTSRFMFPFHIIALFFATCSLLTGFLAMCTRIGNWVSAFSAYFALTFQTITTCLMTAVYVQGRDKFNNNGQSSHLGVKAFAFMWTSVALLFLSCVIYCMGGAVGRKDGGYSGREQRRRGFFNSHRSGSLRSNKETAP"
        seq2 = "MRKIAAIGGIVFISFILTIVAMFTKLWISWSIGKFSYGIGIVPYHSNSAGWFTAASWMVFISFGLFIPLILVVLFTAYKVHHDGCCHSIRHCFNSICLICSIIAVLEIIAFVLMAVNASRYVKGASISEKKSLLQLGSSAYLDLVSAILIIVATVLSGHASHHDCH----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"
        alignFactor = pairlistwithfamid2pairaln_by_msa.GetAlignmentFactorFromPairAlignment(
            seq1, seq2)
        print alignFactor
    #}}}
    # Experiment: fetch one hard-coded record ("A0FGX9") from a MyDB database.
    if 0: #{{{
        try:
            dbname = sys.argv[1]
            print dbname
            from myfunc import MyDB
            cls = MyDB(dbname)
            # print cls.idList
            record = cls.GetRecord("A0FGX9")
            if record:
                print record
                # for rd in cls.GetAllRecord():
                #     print rd
                (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record)
                print(seqid, anno, seq)
        except IndexError:
            pass
    #}}}
    # Experiment: write the records of all IDs listed in a file to stdout.
    if 0: #{{{
        import my_extractdb
        # mimicking my_extractdb.py, see which one is faster
        try:
            dbname = sys.argv[1]
            idlistfile = sys.argv[2]
            cls = myfunc.MyDB(dbname)
            if cls.failure:
                print >> sys.stderr, "MyDB init failed"
            else:
                idlist = open(idlistfile, "r").read().split("\n")
                fpout = sys.stdout
                for seqid in idlist:
                    # Skip the empty strings produced by split("\n").
                    if seqid:
                        record = cls.GetRecord(seqid)
                        fpout.write(record)
                        # for rd in cls.GetAllRecord():
                        #     print rd
                        # (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record)
                        # print (seqid, anno, seq)
        except IndexError:
            print "error"
            pass
    #}}}
    if 0: #{{{
        #test ReadLineByBlock
        try:
            infile = sys.argv[1]
            from myfunc import ReadLineByBlock
            cls = ReadLineByBlock(infile)
            lines = cls.readlines()
            # readlines() is polled until it returns None.
            while lines != None:
                for line in lines:
                    print line
                lines = cls.readlines()
        except IndexError:
            pass
    #}}}
    if 0: #{{{
        #test speed of ReadLineByBlock
        # ReadLineByBlock is about 3 times faster than file.readline()
        try:
            from myfunc import ReadLineByBlock
            infile = sys.argv[1]
            # Timing 1: block-wise reading.
            start = time.time()
            hdl = ReadLineByBlock(infile)
            lines = hdl.readlines()
            while lines != None:
                lines = hdl.readlines()
            hdl.close()
            end = time.time()
            msg = "Reading %s by ReadLineByBlock costs %.3fs seconds"
            print msg % (infile, (end - start))
            # Timing 2: plain readline() on the same file.
            start = time.time()
            hdl = open(infile, "r")
            line = hdl.readline()
            while line:
                line = hdl.readline()
            hdl.close()
            end = time.time()
            msg = "Reading %s by readline() costs %.3fs seconds"
            print msg % (infile, (end - start))
        except IndexError:
            pass
    #}}}
    if 0: #{{{
        #test readline
        try:
            infile = sys.argv[1]
            fp = open(infile, "r")
            line = fp.readline()
            while line:
                print line
                line = fp.readline()
            fp.close()
        except IndexError:
            pass
    #}}}
    if 0: #{{{
        #test the speed of GetFirstWord
        try:
            nloop = int(sys.argv[1])
            string = "kjdafk jasdfj j"
            #string = "askdf askdf "
            # string = "kajsdfasdfsdfjakasjdfka"
            # string = "kajsdfasdf,sdfjakasjdfka"
            delimiter = " \t\r,.\n"
            delimiter = " "
            for i in xrange(nloop):
                #firstword = myfunc.GetFirstWord(string, delimiter)
                #firstword = string.split()[0]
                #firstword = string.partition(" ")[0]
                firstword = myfunc.GetFirstWord(string)
                #pass
                #print firstword
        except (IndexError, ValueError):
            pass
    #}}}
    # Experiment: time four ways of reading the same FASTA file.
    if 0: #{{{
        # read seq by SeqIO
        from Bio import SeqIO
        try:
            seqfile = sys.argv[1]
            # 1. SeqIO ####################
            start = time.time()
            handle = open(seqfile, "rU")
            cnt = 0
            for record in SeqIO.parse(handle, "fasta"):
                cnt += 1
            handle.close()
            end = time.time()
            msg = "Reading %d sequences by SeqIO costs %.3fs seconds"
            print msg % (cnt, (end - start))
            # 2. ReadFasta ####################
            start = time.time()
            seqfile = sys.argv[1]
            (idList, annoList, seqList) = myfunc.ReadFasta(seqfile)
            end = time.time()
            msg = "Reading %d sequences by ReadFasta costs %.3fs seconds"
            print msg % (len(idList), (end - start))
            # 3. ReadFasta from buffer
            BLOCK_SIZE = 100000
            start = time.time()
            cnt = 0
            fpin = open(seqfile, 'rb')
            unprocessedBuffer = ""
            isEOFreached = False
            while 1:
                buff = fpin.read(BLOCK_SIZE)
                # A short read is taken as the end of the file.
                if len(buff) < BLOCK_SIZE:
                    isEOFreached = True
                buff = unprocessedBuffer + buff
                recordList = []
                unprocessedBuffer = myfunc.ReadFastaFromBuffer(
                    buff, recordList, isEOFreached)
                cnt += len(recordList)
                if isEOFreached == True:
                    break
            fpin.close()
            end = time.time()
            msg = "Reading %d sequences by ReadFastaFromBuffer costs %.3fs seconds"
            print msg % (cnt, (end - start))
            # 4. ReadFastaByBlock ####################
            start = time.time()
            seqfile = sys.argv[1]
            hdl = myfunc.ReadFastaByBlock(seqfile, 0, 0)
            if hdl.failure:
                print >> sys.stderr, "Failed to init ReadFastaByBlock"
                return 1
            recordList = hdl.readseq()
            cnt = 0
            while recordList != None:
                cnt += len(recordList)
                # for rd in recordList:
                #     print ">%s"%rd.description
                #     print rd.seq
                recordList = hdl.readseq()
            hdl.close()
            end = time.time()
            msg = "Reading %d sequences by ReadFastaByBlock costs %.3fs seconds"
            print msg % (cnt, (end - start))
        except (IndexError, ValueError):
            pass
    #}}}
    if 0: #{{{
        #test RemoveUnnecessaryGap
        try:
            infile = sys.argv[1]
            # Old implementation: timing goes to stderr, sequences to stdout.
            start = time.time()
            (idList, seqList) = myfunc.ReadFasta_without_annotation(infile)
            seqList = lcmp.RemoveUnnecessaryGap_old(seqList)
            end = time.time()
            msg = "Run RemoveUnnecessaryGap_old for %s costs %.3fs seconds"
            print >> sys.stderr, msg % (infile, (end - start))
            for seq in seqList:
                print seq
            # Current implementation on the same, freshly read input.
            start = time.time()
            (idList, seqList) = myfunc.ReadFasta_without_annotation(infile)
            seqList = lcmp.RemoveUnnecessaryGap(seqList)
            end = time.time()
            msg = "Run RemoveUnnecessaryGap for %s costs %.3fs seconds"
            print >> sys.stderr, msg % (infile, (end - start))
            for seq in seqList:
                print seq
        except IndexError:
            pass
    #}}}
    if 0: #{{{
        #test ReadMPAByBlock
        try:
            infile = sys.argv[1]
            hdl = myfunc.ReadMPAByBlock(infile)
            if hdl.failure:
                return
            recordList = hdl.readseq()
            while recordList != None:
                for rd in recordList:
                    #print rd.seqid
                    print ">%s" % (rd.description)
                    print "%s" % (myfunc.mpa2seq(rd.mpa))
                recordList = hdl.readseq()
            hdl.close()
        except IndexError:
            pass
    #}}}
    # Experiment: same MyDB lookup of "A0FGX9" as the earlier block above.
    if 0: #{{{
        try:
            dbname = sys.argv[1]
            print dbname
            from myfunc import MyDB
            cls = MyDB(dbname)
            # print cls.idList
            record = cls.GetRecord("A0FGX9")
            if record:
                print record
                # for rd in cls.GetAllRecord():
                #     print rd
                (seqid, anno, seq) = myfunc.ExtractFromSeqWithAnno(record)
                print(seqid, anno, seq)
        except IndexError:
            pass
    #}}}
    if 0: #{{{
        #test subprocess
        import glob
        #invoke shell explicitly, not very good, may have security problems
        subprocess.call("seq 10", shell=True)
        subprocess.call("echo wait for 2 seconds...; sleep 2", shell=True)
        subprocess.call("ls topo*.py", shell=True)
    # The only enabled experiment.
    if 1: #{{{
        #test subprocess
        import glob
        #invoke shell implicitly, recommended way
        subprocess.call(["seq", "10"], shell=False)
        subprocess.call(["echo", "wait for 1 seconds..."])
        subprocess.call(["sleep", "1"])
        try:
            # No shell is involved, so "topo*.py" reaches ls as a literal
            # file name instead of being expanded.
            print subprocess.check_call(["ls", "topo*.py"]) #This will not work
        except subprocess.CalledProcessError, e:
            print "error message:", e
        # Working variant: expand the pattern in Python with glob first.
        subprocess.call(["ls"] + glob.glob("topo*.py"))
def _GetCoverageOfShorterSeq(hit):
    """Return the aligned fraction of the shorter sequence of an hhalign hit.

    The number of non-gap characters in the aligned string of the shorter
    sequence is divided by the full length of that sequence; the template is
    used when both lengths are equal. Raises KeyError if the hit lacks one
    of the required fields.
    """
    if hit['query_length'] >= hit['template_length']:
        return myfunc.FloatDivision(
            len(hit['template_alignseq'].replace("-", "")),
            hit['template_length'])
    return myfunc.FloatDivision(
        len(hit['query_alignseq'].replace("-", "")),
        hit['query_length'])


def HHAlign2Pairaln( infile, evalue_threshold, coverage_threshold, hdl_seq, #{{{
        fpout, fpout_tableinfo, fpout_stat):
    """Convert the single hit of an hhalign result file to a pairwise alignment.

    The aligned region reported in the hit is extended with the unaligned
    head and tail of both sequences (taken from hdl_seq). A head or tail is
    written in upper case when the aligned region starts/ends within a soft
    margin of 5 positions of either sequence end, otherwise in lower case.

    Args:
        infile: Result file parsed with ReadHHAlignResult.
        evalue_threshold: Hits with an e-value above this value are ignored.
        coverage_threshold: Minimal coverage of the shorter sequence. A
            negative value disables the coverage filter.
        hdl_seq: Sequence database handle providing GetRecord(seqid).
        fpout: Output handle for the pairwise alignment, or None.
        fpout_tableinfo: Output handle for the table-info line, or None.
        fpout_stat: Output handle for the statistics line, or None.

    Returns:
        1 if the file is skipped (missing file, hit count other than one,
        bad hit, thresholds not met, sequence not found). Nothing is
        returned explicitly after a pair has been written.
    """
    if not os.path.exists(infile):
        print >> sys.stderr, "infile %s does not exist, Ignore" % (infile)
        return 1
    hhalignHitList = ReadHHAlignResult(infile)
    numHit = len(hhalignHitList)
    if numHit < 1:
        print >> sys.stderr, "No hit found for file %s. Ignore" % infile
        return 1
    elif numHit > 1:
        print >> sys.stderr, "More than 1 (%d) hit found for file %s." % (
            numHit, infile)
        return 1

    hit = hhalignHitList[0]

    # The coverage is needed for the filter below and for the statistics
    # line; it is computed lazily so that it is always defined when used.
    coverage_of_shorter_seq = None
    if coverage_threshold >= 0.0:
        try:
            coverage_of_shorter_seq = _GetCoverageOfShorterSeq(hit)
        except KeyError:
            print >> sys.stderr, "bad hit for file %s" % (infile)
            return 1
        if coverage_of_shorter_seq < coverage_threshold:
            print >> sys.stderr, "coverage (%.3f) < %g for %s. Ignore" % (
                coverage_of_shorter_seq, coverage_threshold, infile)
            return 1
    if hit['evalue'] > evalue_threshold:
        print >> sys.stderr, "evalue (%g) > %g for %s. Ignore" % (
            hit['evalue'], evalue_threshold, infile)
        return 1

    query_rawseq = hdl_seq.GetRecord(hit['query_seqid'])
    if query_rawseq is None:
        return 1
    hit_rawseq = hdl_seq.GetRecord(hit['hit_seqid'])
    if hit_rawseq is None:
        return 1
    (hit_seqid, hit_annotation, hit_seq) = myfunc.ExtractFromSeqWithAnno(hit_rawseq)
    (query_seqid, query_annotation, query_seq) = myfunc.ExtractFromSeqWithAnno(query_rawseq)

    (hit_unaligned_head, hit_unaligned_tail) = GetUnalignedHeadTail(
        hit_seq, hit['pos_template_begin'], hit['pos_template_end'])
    (query_unaligned_head, query_unaligned_tail) = GetUnalignedHeadTail(
        query_seq, hit['pos_query_begin'], hit['pos_query_end'])
    (hit_unaligned_head, query_unaligned_head) = FillUnalignedGapForHead(
        hit_unaligned_head, query_unaligned_head)
    (hit_unaligned_tail, query_unaligned_tail) = FillUnalignedGapForTail(
        hit_unaligned_tail, query_unaligned_tail)

    # output pairaln
    softmargin = 5
    if (hit['pos_query_begin'] <= softmargin
            or hit['pos_template_begin'] <= softmargin):
        # The alignment starts close enough to a sequence start.
        query_unaligned_head = query_unaligned_head.upper()
        hit_unaligned_head = hit_unaligned_head.upper()
    else:
        query_unaligned_head = query_unaligned_head.lower()
        hit_unaligned_head = hit_unaligned_head.lower()
    if (hit['pos_query_end'] >= hit['query_length'] - softmargin
            or hit['pos_template_end'] >= hit['template_length'] - softmargin):
        # The alignment ends close enough to a sequence end.
        query_unaligned_tail = query_unaligned_tail.upper()
        hit_unaligned_tail = hit_unaligned_tail.upper()
    else:
        query_unaligned_tail = query_unaligned_tail.lower()
        hit_unaligned_tail = hit_unaligned_tail.lower()

    complete_query_alignseq = "%s%s%s" % (query_unaligned_head,
                                          hit['query_alignseq'].upper(),
                                          query_unaligned_tail)
    complete_template_alignseq = "%s%s%s" % (hit_unaligned_head,
                                             hit['template_alignseq'].upper(),
                                             hit_unaligned_tail)
    if fpout is not None:
        fpout.write(">%s\n" % (hit['query_description']))
        fpout.write("%s\n" % complete_query_alignseq)
        fpout.write(">%s\n" % (hit['hit_description']))
        fpout.write("%s\n" % complete_template_alignseq)

    # output stat
    if fpout_stat is not None:
        if coverage_of_shorter_seq is None:
            # The coverage filter was disabled, so the value has not been
            # computed yet.
            coverage_of_shorter_seq = _GetCoverageOfShorterSeq(hit)
        pos_query = "%d-%d" % (hit['pos_query_begin'], hit['pos_query_end'])
        pos_template = "%d-%d" % (hit['pos_template_begin'],
                                  hit['pos_template_end'])
        fpout_stat.write(
            "%-8s %-8s %7g %8.3f %6.1f %6.1f %6d %9s %4d %9s %4d\n" % (
                hit['query_seqid'],
                hit['hit_seqid'],
                hit['evalue'],
                coverage_of_shorter_seq,
                hit['identity'],
                hit['prob'],
                hit['num_align_col'],
                pos_query,
                hit['query_length'],
                pos_template,
                hit['template_length'],
            ))

    # output tableinfo
    if fpout_tableinfo is not None:
        # The alignment factors are computed on the aligned region only, not
        # on the completed sequences.
        isLocalAlignment = True
        rd = lcmp.GetAlignmentFactorFromPairAlignment(hit['query_alignseq'],
                                                      hit['template_alignseq'],
                                                      isLocalAlignment)
        fpout_tableinfo.write(
            "%-16s %-15s %6.1f %6.1f %9d %6d %6d %9.1f %6d %6d %6d %6.1f %6.1f\n"
            % (hit['query_seqid'], hit['hit_seqid'], rd['seqidt0'],
               hit['similarity'] * 100, rd['alnLength'], rd['seqLength1'],
               rd['seqLength2'], hit['score'], rd['numIDT'], -1, rd['numGap'],
               rd['seqidt1'], rd['seqidt2']))