def getBlastScoreRatios(FASTAfile, allelescore, queryDef, databasePath, queryProteomeName, referenceGenomeArray, referenceCDS, isXML):
    """Run blastp for the query proteome and pass the hits to the matching parser.

    When ``isXML`` equals the string ``'True'`` the command is built with
    ``outfmt=5`` and output file ``BLASTresults.xml``, and the records go
    through ``runBlastParser`` / ``parseBLASTRecordsXML``.  Any other value
    selects ``outfmt=6`` with ``BLASTresults.tab`` and the ``*TAB`` helpers.
    The file named by ``queryProteomeName`` is deleted before returning.

    ``FASTAfile`` is accepted but never read here.

    Returns the value produced by the selected ``parseBLASTRecords*`` helper
    (both helpers are defined elsewhere; their return shape is not visible
    from this function).
    """
    # The flag arrives as text, so it is compared against 'True' literally.
    if isXML == 'True':
        blast_out_file, out_format = 'BLASTresults.xml', 5
        run_blast, parse_records = runBlastParser, parseBLASTRecordsXML
    else:
        blast_out_file, out_format = 'BLASTresults.tab', 6
        run_blast, parse_records = runBlastParserTAB, parseBLASTRecordsTAB

    cline = NcbiblastpCommandline(query=queryProteomeName, db=databasePath,
                                  out=blast_out_file, outfmt=out_format,
                                  num_alignments=7000, num_descriptions=7000)
    print('BSR:')
    blast_records = run_blast(cline, blast_out_file, False)

    # Only the parsing step is timed.
    startTime = datetime.now()
    ToNewAllele = parse_records(blast_records, allelescore, queryDef,
                                referenceGenomeArray, referenceCDS)
    print('CheckResults:' + str(datetime.now() - startTime))

    os.remove(queryProteomeName)
    return ToNewAllele
def reDogetBlastScoreRatios(genefile, basepath, alleleI, allelescores2, newGene_Blast_DB_name, alleleList2, picklepath):
    """Blast ``genefile`` against an existing database and record every HSP score.

    The incoming allele counter is incremented by one.  ``genefile`` is used
    as the blastp query against ``newGene_Blast_DB_name`` with XML output in
    ``<basepath>/blastdbs/temp.xml``.  The integer score of every HSP of
    every alignment is appended to ``allelescores2`` (mutated in place), and
    ``[counter, allelescores2]`` is pickled to ``picklepath``.

    Returns ``(int(counter), allelescores2, alleleList2)``; ``alleleList2``
    is passed through untouched.
    """
    # The original builds a reader here and never consumes it; kept as is.
    gene_fp = HTSeq.FastaReader(genefile)

    alleleI += 1
    print("Re-starting Blast alleles at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

    xml_path = os.path.join(basepath, 'blastdbs/temp.xml')
    cline = NcbiblastpCommandline(query=genefile, db=newGene_Blast_DB_name,
                                  evalue=0.001, out=xml_path, outfmt=5)
    records = runBlastParser(cline, xml_path, genefile)
    print("Blasted alleles at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

    allelescores2.extend(
        int(hsp.score)
        for record in records
        for alignment in record.alignments
        for hsp in alignment.hsps
    )

    with open(picklepath, 'wb') as handle:
        pickle.dump([alleleI, allelescores2], handle)

    return int(alleleI), allelescores2, alleleList2
def getOwnBlastScore(FASTAfile, databasePath, queryProteomeName, numberOfLocus, blastResultsPath, LocusToUse, queryFile):
    """Blast a locus' proteins against their own database and collect the scores.

    ``CreateQueryDatabase`` (defined elsewhere) supplies the database path,
    an ``isEmpty`` flag, the proteins-to-query value, the query allele list
    and the previous-allele-name mapping.  If ``isEmpty`` is truthy the
    function returns immediately with empty score containers.

    Otherwise blastp runs with XML output in
    ``<blastResultsPath>/<numberOfLocus>_BLASTresults.xml``, the records are
    handed to ``parseOwnBLASTRecordsAndDuplicates`` and the resulting allele
    list to ``translateAlleleList``.  The database side files (``.pin``,
    ``.phr``, ``.psq``, ``_blast.log``) and the XML file are then removed.

    Returns ``(allelescores, isEmpty, proteinsToQueryFile, alleleNumbers,
    sameAlleles, prevAlleleName)``.
    """
    created = CreateQueryDatabase(FASTAfile, databasePath, queryProteomeName)
    databasePath, isEmpty, proteinsToQueryFile, queryAlleleList, prevAlleleName = created

    if isEmpty:
        # Nothing to blast: hand back empty containers alongside the flag.
        return [], isEmpty, proteinsToQueryFile, {}, {}, prevAlleleName

    xml_path = blastResultsPath + '/' + numberOfLocus + '_BLASTresults.xml'
    cline = NcbiblastpCommandline(query=queryProteomeName, db=databasePath,
                                  out=xml_path, outfmt=5,
                                  num_alignments=7000, num_descriptions=7000)
    records = runBlastParser(cline, xml_path, False)

    allelescores, alleleList, alleleNumbers, sameAlleles = \
        parseOwnBLASTRecordsAndDuplicates(records, FASTAfile, queryAlleleList)
    proteinsToQueryFile = translateAlleleList(alleleList, queryProteomeName,
                                              LocusToUse, queryFile)

    # Remove the per-locus database files first, then the BLAST output.
    for suffix in (".pin", ".phr", ".psq", "_blast.log"):
        os.remove(databasePath + suffix)
    os.remove(xml_path)

    return allelescores, isEmpty, proteinsToQueryFile, alleleNumbers, sameAlleles, prevAlleleName
def getBlastScoreRatios(allelescore, alleleList, databasePath, queryProteomeName, referenceGenomeArray, referenceCDS, bestmatches, referenceCDSsequences, referenceFileName, countNumberOfGenomes, blastResultsPath, LocusToUse):
    """Blast one genome's proteome against a locus database and parse the hits.

    The XML output path is the plain concatenation
    ``blastResultsPath + countNumberOfGenomes + '_BLASTresults.xml'`` (no
    separator is inserted).  The parsed records and all reference arguments
    are forwarded to ``parseBLASTRecordsXML`` (defined elsewhere), after
    which the XML file is deleted.

    Returns ``(resultsList, addNewAlleles)`` as produced by
    ``parseBLASTRecordsXML``.
    """
    xml_path = blastResultsPath + countNumberOfGenomes + '_BLASTresults.xml'
    cline = NcbiblastpCommandline(query=queryProteomeName, db=databasePath,
                                  out=xml_path, outfmt=5,
                                  num_alignments=7000, num_descriptions=7000)
    records = runBlastParser(cline, xml_path, False)

    resultsList, addNewAlleles = parseBLASTRecordsXML(
        records, allelescore, alleleList, referenceGenomeArray, referenceCDS,
        bestmatches, referenceCDSsequences, referenceFileName, LocusToUse)

    # The XML is only removed once parsing has succeeded.
    os.remove(xml_path)
    return resultsList, addNewAlleles
def getOwnBlastScore(FASTAfile):
    """Translate every allele in ``FASTAfile``, blast the proteins and map hit -> score.

    Each record whose sequence ``translateSeq`` can translate is written as
    a FASTA entry (``>name`` / protein) to both
    ``pathRef + 'allAllelesAA.fasta'`` and
    ``pathRef + nameOrg + 'proteome.fasta'``; both files get identical
    content.  A BLAST database is created from the first file and blastp is
    run with the second file as query.

    ``pathRef``, ``nameOrg``, ``name`` and ``blast_out_file`` are not defined
    in this function and must exist in an enclosing/module scope.

    NOTE(review): the command uses ``db=name`` rather than the value returned
    by ``Create_Blastdb``; confirm that ``name`` refers to the same database.

    Returns a dict mapping ``str(alignment.hit_def)`` to an integer HSP
    score.  For each alignment the HSPs are scanned in order and scanning
    stops at the first HSP that creates or raises the stored score.
    """
    fasta_entries = []
    for allele in HTSeq.FastaReader(FASTAfile):
        try:
            protein = str(translateSeq(allele.seq))
        except Exception:
            # Alleles that cannot be translated are skipped on purpose.
            # ``Exception`` keeps Ctrl-C / SystemExit from being swallowed,
            # which the previous bare ``except`` did.
            continue
        fasta_entries.append(">" + str(allele.name) + "\n" + protein + "\n")
    protein_fasta = "".join(fasta_entries)

    alleles_path = pathRef + 'allAllelesAA.fasta'
    with open(alleles_path, "wb") as f:
        f.write(protein_fasta)
    proteome_path = pathRef + nameOrg + 'proteome.fasta'
    with open(proteome_path, "wb") as v:
        v.write(protein_fasta)

    Create_Blastdb(alleles_path, 1, True)

    # --- get BLAST score ratio --- #
    cline = NcbiblastpCommandline(query=proteome_path, db=name,
                                  out=blast_out_file, outfmt=5,
                                  num_alignments=7000, num_descriptions=7000)
    blast_records = runBlastParser(cline, blast_out_file, protein_fasta)

    allelescores = {}
    for blast_record in blast_records:
        for alignment in blast_record.alignments:
            for match in alignment.hsps:
                hit = str(alignment.hit_def)
                if hit not in allelescores or allelescores[hit] < match.score:
                    allelescores[hit] = int(match.score)
                    break
    return allelescores
def main():
    """Split an ffn file into one FASTA per gene, dropping paralogs and short genes.

    Command line:
        -i  path to the ffn (multi-FASTA) file          (required)
        -g  minimum sequence length, exclusive (int)    (required)

    The file is blasted (blastn, e-value 0.001) against a database built
    from itself.  For every query that has at least two alignments, the
    ``hit_def`` of the second alignment is recorded as a paralog.  Each
    record whose ``"<name> <descr>"`` is not in that set and whose sequence
    is longer than ``-g`` is written to ``<dir of input>/<name>.fasta``
    (``|`` in the name replaced by ``_``) under the header ``>1``.

    Prints the output directory, the name of each removed paralog, and the
    two removal counters.
    """
    parser = argparse.ArgumentParser(description="Given an ffn file, recovers the genes that are not paralogs and have a size bigger than the g parameter provided")
    parser.add_argument('-i', nargs='?', type=str, help='ffn file', required=True)
    parser.add_argument('-g', nargs='?', type=int, help='int minimum size', required=True)
    args = parser.parse_args()

    genes = args.i
    sizethresh = args.g

    geneFile = os.path.abspath(genes)
    Gene_Blast_DB_name = Create_Blastdb(geneFile, 1)
    blast_out_file = os.path.splitext(geneFile)[0] + '.xml'

    # ------------------------------ RUNNING BLAST ------------------------------ #
    cline = NcbiblastnCommandline(query=geneFile, db=Gene_Blast_DB_name,
                                  evalue=0.001, out=blast_out_file, outfmt=5)
    blast_records = runBlastParser(cline, blast_out_file, geneFile)

    # A set gives O(1) membership tests in the per-contig loop below.
    paralogs = set()
    for blast_record in blast_records:
        try:
            second_hit = blast_record.alignments[1]
        except IndexError:
            # Fewer than two alignments: no second hit to record.
            continue
        paralogs.add(second_hit.hit_def)

    pathfiles = os.path.dirname(geneFile) + "/"
    print(pathfiles)

    removedparalogs = 0
    removedsize = 0
    for contig in HTSeq.FastaReader(genes):
        name = contig.name + " " + contig.descr
        if name in paralogs:
            print(name)
            removedparalogs += 1
            continue
        if len(contig.seq) <= sizethresh:
            removedsize += 1
            continue
        namefile = contig.name.replace("|", "_")
        with open(pathfiles + namefile + ".fasta", "wb") as f:
            f.write(">1\n" + contig.seq + "\n")

    print("Removed %s paralog genes" % str(removedparalogs))
    print("Removed %s because of size :" % str(removedsize))
def reDogetBlastScoreRatios(genefile, basepath, alleleI, allelescores2, newGene_Blast_DB_name, alleleList2):
    """Blast ``genefile`` against an existing database and append self-hit scores.

    ``genefile`` is used directly as the blastp query (e-value 0.001, XML
    output in ``<basepath>/blastdbs/temp.xml``).  For every BLAST record the
    alignments are scanned in order; the first HSP of the first alignment
    whose ``hit_def`` equals the record's ``query`` (both compared as
    integers) has its integer score appended to ``allelescores2``.

    This variant prints the query path, every record, every visited
    alignment with its HSP list, and every visited HSP.

    Returns ``(alleleI + 1, allelescores2, alleleList2)``.
    """
    # Reader is created but never consumed, as in the original.
    gene_fp = HTSeq.FastaReader(genefile)

    alleleI += 1
    print(genefile)

    xml_path = os.path.join(basepath, 'blastdbs/temp.xml')
    # --- get BLAST score ratio --- #
    cline = NcbiblastpCommandline(query=genefile, db=newGene_Blast_DB_name,
                                  evalue=0.001, out=xml_path, outfmt=5)
    records = runBlastParser(cline, xml_path, genefile)

    for record in records:
        self_hit_seen = False
        print(record)
        for alignment in record.alignments:
            # The alignment is printed before the stop check, so the one
            # following a self hit is still printed.
            print("%s %s" % (alignment, alignment.hsps))
            if self_hit_seen:
                break
            for hsp in alignment.hsps:
                print(hsp)
                if int(alignment.hit_def) == int(record.query):
                    allelescores2.append(int(hsp.score))
                    self_hit_seen = True
                    break

    return alleleI, allelescores2, alleleList2
def getBlastScoreRatios(genefile, basepath):
    """Translate a gene's alleles, blast them against themselves and collect self scores.

    Every record of ``genefile`` is numbered from 1 in file order; its DNA
    sequence is appended to the returned allele list and the first element
    of ``translateSeq(seq)`` is written as protein ``>N`` to
    ``<basepath>/<gene basename>_protein.fasta``.  A BLAST database is built
    from that file and the same file is used as the blastp query (e-value
    0.001), with XML output in ``<basepath>/blastdbs/<gene basename>.xml``.

    For each BLAST record, the first HSP of the first alignment whose
    ``hit_def`` equals the record's ``query`` (compared as integers)
    contributes its integer score.

    Returns ``(allele_count, allelescores, Gene_Blast_DB_name, alleleList)``.
    """
    alleleI = 0
    alleleList = []
    fasta_entries = []
    for alleleI, allele in enumerate(HTSeq.FastaReader(genefile), 1):
        alleleList.append(allele.seq)
        translatedSequence, _x, _y = translateSeq(allele.seq)
        fasta_entries.append(">" + str(alleleI) + "\n" + str(translatedSequence + "\n"))
    # Joined once instead of growing a string inside the loop.
    alleleProt = "".join(fasta_entries)

    gene_basename = os.path.basename(genefile)
    proteinfastaPath = os.path.join(basepath, str(gene_basename + '_protein.fasta'))
    blast_out_file = os.path.join(basepath, "blastdbs/" + gene_basename + '.xml')

    with open(proteinfastaPath, "wb") as f:
        f.write(alleleProt)

    print("Starting Blast alleles at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))
    Gene_Blast_DB_name = Create_Blastdb(proteinfastaPath, 1, True)
    print(proteinfastaPath)

    # --- get BLAST score ratio --- #
    cline = NcbiblastpCommandline(query=proteinfastaPath, db=Gene_Blast_DB_name,
                                  evalue=0.001, out=blast_out_file, outfmt=5)
    print("Parse bsr blast at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))
    blast_records = runBlastParser(cline, blast_out_file, alleleProt)
    print("Blasted alleles at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

    allelescores = []
    for blast_record in blast_records:
        found = False
        for alignment in blast_record.alignments:
            if found:
                break
            for match in alignment.hsps:
                if int(alignment.hit_def) == int(blast_record.query):
                    allelescores.append(int(match.score))
                    found = True
                    break

    return alleleI, allelescores, Gene_Blast_DB_name, alleleList
def reDogetBlastScoreRatios(genefile, basepath, alleleI, allelescores2, newGene_Blast_DB_name, alleleList2, picklepath, verbose, blastPath, listAllelesNames):
    """Blast ``genefile`` and store the last HSP score under ``allelescores2[alleleI]``.

    blastp is invoked through the executable ``blastPath`` (e-value 0.001,
    one thread, XML output in ``<basepath>/blastdbs/temp.xml``) with
    ``genefile`` as query against ``newGene_Blast_DB_name``.  All records,
    alignments and HSPs are walked in order and the integer score of the
    last HSP seen (0 when there is none) is assigned to
    ``allelescores2[alleleI]``.  ``allelescores2`` is then pickled to
    ``picklepath``.

    Two progress messages are printed when ``verbose`` is truthy.

    Returns ``(allelescores2, alleleList2, listAllelesNames)``; the last two
    are passed through untouched.
    """
    def report(message):
        # Progress output is opt-in through ``verbose``.
        if verbose:
            print(message)

    report("Starting Blast of new alleles to calculate BSR at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

    xml_path = os.path.join(basepath, 'blastdbs/temp.xml')
    cline = NcbiblastpCommandline(cmd=blastPath, query=genefile,
                                  db=newGene_Blast_DB_name, evalue=0.001,
                                  out=xml_path, outfmt=5, num_threads=1)
    records = runBlastParser(cline, xml_path)
    report("Blasted alleles at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

    last_score = 0
    for record in records:
        for alignment in record.alignments:
            for hsp in alignment.hsps:
                last_score = int(hsp.score)
    allelescores2[alleleI] = last_score

    with open(picklepath, 'wb') as handle:
        pickle.dump(allelescores2, handle)

    return allelescores2, alleleList2, listAllelesNames
def BLASTp(queryFile, dbName, blast_out_path, queryNames, sequenceLengths):
    """Return the ``hit_def`` of the best-scoring hit that passes the identity filter.

    blastp is run for ``queryFile`` against ``dbName`` with XML output in
    ``<blast_out_path>/blastOut.xml``.  For each record the query name
    (with leading/trailing ``|`` stripped) is looked up in ``queryNames``
    and the length at the same position of ``sequenceLengths`` is used as
    denominator.  An HSP qualifies when ``identities / query_length >= 0.8``.
    Among all qualifying HSPs of all records, the one with the strictly
    highest score wins; the first one wins ties.

    Returns the winning ``hit_def``, or ``''`` if no HSP qualifies.
    Raises ``ValueError`` (from ``list.index``) if a query name is not in
    ``queryNames``.
    """
    xml_path = os.path.join(blast_out_path, 'blastOut.xml')
    cline = NcbiblastpCommandline(query=queryFile, db=dbName, out=xml_path, outfmt=5)

    best_hit = ''
    best_score = -1
    for record in runBlastParser(cline, xml_path, ""):
        position = queryNames.index(record.query.strip('|'))
        query_length = sequenceLengths[position]
        for alignment in record.alignments:
            for hsp in alignment.hsps:
                identity_ratio = float(hsp.identities) / float(query_length)
                if not identity_ratio >= 0.8:
                    continue
                if best_score < hsp.score:
                    best_hit = alignment.hit_def
                    best_score = hsp.score
    return best_hit
def getBlastScoreRatios(allelescore, alleleList, databasePath, queryProteomeName, referenceGenomeArray, referenceCDS, bestmatches, referenceCDSsequences, referenceFileName, countNumberOfGenomes, blastResultsPath, LocusToUse):
    """Run blastp for one genome against a locus database and parse the XML hits.

    BLAST writes XML (``outfmt=5``) to
    ``blastResultsPath + countNumberOfGenomes + '_BLASTresults.xml'``; the
    pieces are concatenated as given, without a path separator.  The parsed
    records are forwarded together with the reference arguments to
    ``parseBLASTRecordsXML`` (defined elsewhere), and the XML file is
    deleted once that call has returned.

    Returns the pair ``(resultsList, addNewAlleles)`` from
    ``parseBLASTRecordsXML``.
    """
    blast_xml = blastResultsPath + countNumberOfGenomes + '_BLASTresults.xml'

    blast_options = dict(query=queryProteomeName, db=databasePath,
                         out=blast_xml, outfmt=5,
                         num_alignments=7000, num_descriptions=7000)
    command = NcbiblastpCommandline(**blast_options)
    parsed_records = runBlastParser(command, blast_xml, False)

    resultsList, addNewAlleles = parseBLASTRecordsXML(
        parsed_records, allelescore, alleleList, referenceGenomeArray,
        referenceCDS, bestmatches, referenceCDSsequences, referenceFileName,
        LocusToUse)

    os.remove(blast_xml)
    return resultsList, addNewAlleles
def getOwnBlastScore(FASTAfile, databasePath, queryProteomeName, numberOfLocus, blastResultsPath, LocusToUse, queryFile):
    """Self-blast a locus' proteins and return their scores plus bookkeeping.

    Steps, in order:
      1. ``CreateQueryDatabase`` (defined elsewhere) returns the database
         path, an ``isEmpty`` flag, the proteins-to-query value, the query
         allele list and the previous-allele-name mapping.  A truthy
         ``isEmpty`` ends the function with empty score containers.
      2. blastp runs with XML output in
         ``<blastResultsPath>/<numberOfLocus>_BLASTresults.xml``.
      3. ``parseOwnBLASTRecordsAndDuplicates`` and ``translateAlleleList``
         post-process the records.
      4. The ``.pin`` / ``.phr`` / ``.psq`` / ``_blast.log`` files next to
         the database and the XML output are deleted.

    Returns ``(allelescores, isEmpty, proteinsToQueryFile, alleleNumbers,
    sameAlleles, prevAlleleName)``.
    """
    (databasePath, isEmpty, proteinsToQueryFile,
     queryAlleleList, prevAlleleName) = CreateQueryDatabase(
        FASTAfile, databasePath, queryProteomeName)

    if isEmpty:
        empty_scores, empty_numbers, empty_same = [], {}, {}
        return (empty_scores, isEmpty, proteinsToQueryFile,
                empty_numbers, empty_same, prevAlleleName)

    blast_xml = blastResultsPath + '/' + numberOfLocus + '_BLASTresults.xml'
    command = NcbiblastpCommandline(query=queryProteomeName, db=databasePath,
                                    out=blast_xml, outfmt=5,
                                    num_alignments=7000, num_descriptions=7000)
    parsed_records = runBlastParser(command, blast_xml, False)

    allelescores, alleleList, alleleNumbers, sameAlleles = \
        parseOwnBLASTRecordsAndDuplicates(parsed_records, FASTAfile, queryAlleleList)
    proteinsToQueryFile = translateAlleleList(
        alleleList, queryProteomeName, LocusToUse, queryFile)

    # Database side files first, BLAST output last (same order as before).
    leftovers = [databasePath + ext for ext in (".pin", ".phr", ".psq", "_blast.log")]
    leftovers.append(blast_xml)
    for leftover in leftovers:
        os.remove(leftover)

    return allelescores, isEmpty, proteinsToQueryFile, alleleNumbers, sameAlleles, prevAlleleName
def getBlastScoreRatios(genefile):
    """Translate a gene's alleles, blast them against themselves and collect self scores.

    Alleles are numbered from 1 in file order and their translations are
    written as ``>N`` entries to ``./blastdbs/temp<gene basename>/protein.fasta``
    (the directory is created if missing).  A BLAST database is built from
    that file, which is also used as the blastp query (e-value 0.001).

    The BLAST XML is written to ``protein.xml`` inside the same temp
    directory.  It used to be written to ``basepath + 'protein.xml'``
    (no separator), i.e. beside the directory, where removing the directory
    did not clean it up.

    For each BLAST record, the first HSP of the first alignment whose
    ``hit_def`` equals the record's ``query`` (compared as integers)
    contributes its integer score.

    Returns ``(allele_count, allelescores, Gene_Blast_DB_name)``.
    """
    alleleI = 0
    fasta_entries = []
    for alleleI, allele in enumerate(HTSeq.FastaReader(genefile), 1):
        fasta_entries.append(">" + str(alleleI) + "\n" + str(translateSeq(allele.seq) + "\n"))
    # Joined once instead of growing a string inside the loop.
    alleleProt = "".join(fasta_entries)

    basepath = "./blastdbs/temp" + str(os.path.basename(genefile))
    if not os.path.exists(basepath):
        os.makedirs(basepath)

    protein_fasta = basepath + '/protein.fasta'
    blast_xml = basepath + '/protein.xml'
    with open(protein_fasta, "wb") as f:
        f.write(alleleProt)

    Gene_Blast_DB_name = Create_Blastdb(protein_fasta, 1, True)

    # --- get BLAST score ratio --- #
    cline = NcbiblastpCommandline(query=protein_fasta, db=Gene_Blast_DB_name,
                                  evalue=0.001, out=blast_xml, outfmt=5)
    blast_records = runBlastParser(cline, blast_xml, alleleProt)

    allelescores = []
    for blast_record in blast_records:
        found = False
        for alignment in blast_record.alignments:
            if found:
                break
            for match in alignment.hsps:
                if int(alignment.hit_def) == int(blast_record.query):
                    allelescores.append(int(match.score))
                    found = True
                    break

    return alleleI, allelescores, Gene_Blast_DB_name
def reDogetBlastScoreRatios(genefile, basepath, alleleI, allelescores2, newGene_Blast_DB_name, alleleList2, picklepath):
    """Re-run blastp for ``genefile`` and extend ``allelescores2`` with HSP scores.

    The allele counter is bumped by one, ``genefile`` is blasted against
    ``newGene_Blast_DB_name`` (e-value 0.001, XML written to
    ``<basepath>/blastdbs/temp.xml``), and the integer score of each HSP in
    each alignment of each record is appended to ``allelescores2`` in
    iteration order.  The list ``[counter, allelescores2]`` is pickled to
    ``picklepath``.

    Prints a timestamped line before and after the BLAST run.

    Returns ``(int(counter), allelescores2, alleleList2)``.
    """
    # Constructed but never iterated, exactly as in the original.
    gene_fp = HTSeq.FastaReader(genefile)
    alleleI += 1

    stamp_format = "%H:%M:%S-%d/%m/%Y"
    print("Re-starting Blast alleles at : " + time.strftime(stamp_format))

    blast_xml = os.path.join(basepath, 'blastdbs/temp.xml')
    command = NcbiblastpCommandline(query=genefile, db=newGene_Blast_DB_name,
                                    evalue=0.001, out=blast_xml, outfmt=5)
    parsed_records = runBlastParser(command, blast_xml, genefile)
    print("Blasted alleles at : " + time.strftime(stamp_format))

    for parsed in parsed_records:
        for hit in parsed.alignments:
            for segment in hit.hsps:
                allelescores2.append(int(segment.score))

    snapshot = [alleleI, allelescores2]
    with open(picklepath, 'wb') as out:
        pickle.dump(snapshot, out)

    return int(alleleI), allelescores2, alleleList2
def callAlleles(argumentList):
    """Call one allele per genome for a gene, using BLAST score ratios on proteins.

    ``argumentList`` is indexed positionally:
        [0] path of the gene FASTA file (new alleles are appended to it)
        [1] list of genome file paths (only their basenames are used)
        [2] list of protein FASTA strings, one per genome
        [3] list of dicts, one per genome, mapping ``">" + query name`` to
            the sequence written for a new allele

    For every genome the protein string is blasted against the gene's
    allele database.  The score ratio of an HSP is its score divided by the
    self score of the allele it hit (``allelescores[int(hit_def) - 1]``).
    The call is then:
        * ``'LNF3:-1'`` / ``'LNF'``  when no HSP was accepted;
        * ``'EXC:<id>'`` / ``'<id>'`` when an HSP with ratio exactly 1 was
          found (the highest-scoring one wins);
        * ``'INF<n>'`` / ``'INF-<n>'`` otherwise (ratio above 0.4): the
          matched sequence is appended to the gene file as allele ``n`` and
          the self scores / database are rebuilt.

    The best match is reset for every genome; it used to be initialised once
    before the loop, so a genome could inherit the previous genome's match.

    The temp directory ``./blastdbs/temp<gene basename>`` is removed at the
    end.  It is presumably created by ``getBlastScoreRatios`` - confirm
    against that helper.

    Returns ``(resultsList, perfectMatchIdAllele)``, two lists with one
    entry per genome.
    """
    geneFile = argumentList[0]
    genomesList = argumentList[1]
    listOfProts = argumentList[2]
    listAllCDS = argumentList[3]

    resultsList = []
    perfectMatchIdAllele = []

    alleleI, allelescores, Gene_Blast_DB_name = getBlastScoreRatios(geneFile)

    # Loop-invariant paths.  Binding basepath here also keeps the final
    # rmtree from failing with NameError when listOfProts is empty.
    basepath = "./blastdbs/temp" + str(os.path.basename(geneFile))
    query_fasta = basepath + '/proteinList.fasta'
    # Kept inside the temp directory so rmtree removes it as well; the old
    # path lacked the separator and left the XML behind.
    blast_xml = basepath + '/proteinList.xml'

    for genome, protList in enumerate(listOfProts):
        # [score, score ratio, perfect match?, query (CDS) name, allele id]
        bestmatch = [0, 0, False, '', 0]

        with open(query_fasta, "wb") as f:
            f.write(protList)

        cline = NcbiblastpCommandline(query=query_fasta, db=Gene_Blast_DB_name,
                                      evalue=0.001, out=blast_xml, outfmt=5)
        blast_records = runBlastParser(cline, blast_xml, query_fasta)

        for blast_record in blast_records:
            cdsStrName = blast_record.query
            for alignment in blast_record.alignments:
                for match in alignment.hsps:
                    alleleId = int(alignment.hit_def)
                    scoreRatio = float(match.score) / float(allelescores[alleleId - 1])

                    if scoreRatio == 1 and (bestmatch[2] is False or match.score > bestmatch[0]):
                        # First perfect hit, or a higher-scoring perfect hit.
                        bestmatch = [match.score, scoreRatio, True, cdsStrName, alleleId]
                    elif (bestmatch[2] is False and match.score > bestmatch[0]
                          and scoreRatio > 0.4 and scoreRatio > bestmatch[1]):
                        # Better imperfect candidate; never replaces a perfect one.
                        bestmatch = [match.score, scoreRatio, False, cdsStrName, alleleId]

        if bestmatch[0] == 0:
            ###################
            # LOCUS NOT FOUND #
            ###################
            resultsList.append('LNF3:-1')
            perfectMatchIdAllele.append('LNF')
            print("Locus not found, no matches \n")

        elif bestmatch[2] is True:
            ################################################
            # EXACT MATCH --- MATCH == GENE --- GENE FOUND #
            ################################################
            perfectMatchIdAllele.append(str(bestmatch[4]))
            resultsList.append('EXC:' + str(bestmatch[4]))

        else:
            #######################
            # ADD INFERRED ALLELE #
            #######################
            tagAux = 'INF'
            newAlleleId = str(alleleI + 1)
            perfectMatchIdAllele.append(tagAux + "-" + newAlleleId)
            print("New allele! Adding allele " + tagAux + newAlleleId + " to the database\n")
            resultsList.append(tagAux + newAlleleId)

            # --- add the new allele to the gene fasta --- #
            # Both lookups happen before the file is touched, so a missing
            # key cannot leave a header without a sequence.
            header = ('>allele_' + newAlleleId + '_' + tagAux[:-1] + "_"
                      + str(os.path.basename(genomesList[genome])) + '\n')
            sequence = listAllCDS[genome][">" + bestmatch[3]] + '\n'
            with open(geneFile, 'a') as fG:
                fG.write(header)
                fG.write(sequence)

            # --- remake blast DB --- #
            Gene_Blast_DB_name = Create_Blastdb(basepath + '/protein.fasta', 1, True)
            alleleI, allelescores, Gene_Blast_DB_name = getBlastScoreRatios(geneFile)

    shutil.rmtree(basepath)
    return (resultsList, perfectMatchIdAllele)
def callAlleles(argumentList): geneFile = argumentList[0] genomesList = argumentList[1] listOfCDSDicts = argumentList[2] listOfGenomesDict = argumentList[3] gene_fp = HTSeq.FastaReader(geneFile) geneDict = {} alleleI = 1 inverted=False orderedAlleleNames=[] biggestAllelelen=0 for allele in gene_fp: if allele.seq in geneDict: print "\nWARNING: this file contains a repeated allele, it should be checked. Ignoring it now!\n", geneFile else: if len(allele.seq)>biggestAllelelen: biggestAllelelen=len(allele.seq) orderedAlleleNames.append(allele.name) geneDict[ allele.seq ] = alleleI alleleI += 1 #print geneDict #print orderedAlleleNames # --- make 1st blast DB --- # Gene_Blast_DB_name = Create_Blastdb( geneFile, 1 ) geneF = os.path.splitext( geneFile )[0] blast_out_file = geneF + '.xml' # list of results - the output of the function resultsList = [] i = 0 perfectMatchIdAllele=[] for genomeFile in genomesList: #print geneDict currentCDSDict = listOfCDSDicts[i] currentGenomeDict = listOfGenomesDict[i] #print genomeFile #print resultsList #print geneDict #print orderedAlleleNames i+=1 # it has to be incremented here if genomeFile[-1] == '\n': genomeFile = genomeFile[:-1] # ------------------------------ RUNNING BLAST ------------------------------ # cline = NcbiblastnCommandline(query=genomeFile, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5) blast_records = runBlastParser(cline, blast_out_file, genomeFile) # ------ DETERMINING BEST MATCH ------ # # bestMatch = ['rec.query','hsp', lenRatio] bestMatch = ['','', 0] bestMatchContig='' bestMatchContigLen='' bestalignlen=0 perfectMatch=False bmAlleleLen2=0 bmAllele='' #noAlignment=False for blast_record in blast_records: # --- the LNF cases are now called outside de loop --- # #print blast_record if perfectMatch==True: break try: #print blast_record.alignments hspC = blast_record.alignments[0] if bestMatch[0] == '' and bestMatch[1] == '': bestMatch[0] = blast_record.query bestMatch[1] = hspC except IndexError: 
continue # --- the contig tag is used in the progigal function --- # contigTag = blast_record.query # --- brute force parsing of the contig tag - better solution is advisable --- # j=0 for l in contigTag: if l == ' ': break j+=1 contigTag = contigTag[:j] contigLen = blast_record.query_letters #print blast_record.query_id # --- iterating over all the results to determine the best match --- # for alignment in blast_record.alignments: index=orderedAlleleNames.index(alignment.hit_def) #print alignment.hit_def for k, v in geneDict.iteritems(): if v == index+1: bmAlleleLen2= len(k) if perfectMatch: break for match in alignment.hsps: #print match scoreRatio = float(match.score) / float(bmAlleleLen2) #print alignment.hit_def #print match.identities #print bmAlleleLen2 #print #print match.identities #print len(match.sbjct) if (int(match.identities)==int(bmAlleleLen2) and int(match.identities)==int(len(match.sbjct)) and "N" not in match.query ): index=orderedAlleleNames.index(alignment.hit_def) bmAlleleLen= len(geneDict.keys()[index]) lenratio=float(len(match.query))/float(bmAlleleLen) bestMatch = [blast_record.query, match, scoreRatio, alignment.hit_def,lenratio,bmAlleleLen] bestMatchContig=contigTag perfectMatch=True index=orderedAlleleNames.index(alignment.hit_def) bmAlleleLen= len(geneDict.keys()[index]) break elif scoreRatio > bestMatch[2]: index=orderedAlleleNames.index(alignment.hit_def) bmAllele=geneDict.keys()[index] bmAlleleLen= len(bmAllele) lenratio=float(len(match.query))/float(bmAlleleLen) bestMatch = [blast_record.query, match, scoreRatio, alignment.hit_def,lenratio,bmAlleleLen] bestMatchContig=contigTag bestMatchContigLen=blast_record.query_letters if match.sbjct_start > match.sbjct_end: inverted=True #print match.query bestalignlen=alignment.length if perfectMatch==True: break # ---------- ALLELE CALLING AFTER DETERMINING BEST MATCH ---------- # try: #print bestMatch[0] match = bestMatch[1] #print match.query geneLen = bestMatch[5] alleleStr = match.query 
nIdentities = match.identities idPercent = float(nIdentities) / float(geneLen) scoreRatio = bestMatch[2] lenRatio = bestMatch[4] #print perfectMatch #print "\nContig best/exact match is :" #print bestMatchContig +"\n" except: resultsList.append('LNF3:-1') # append result to the list of results perfectMatchIdAllele.append('LNF') printinfo(genomeFile,geneFile) print "Locus not found \n" continue #TODO check identities >0.8 if perfectMatch is True: #TODO perfect match to top if match.sbjct_start > match.sbjct_end: alleleStr = reverseComplement(alleleStr) #TODO test replace - #alleleStr = alleleStr.replace('-', '') alleleNumber = geneDict[ alleleStr ] ################################################ # EXACT MATCH --- MATCH == GENE --- GENE FOUND # ################################################ if "_" in bestMatch[3]: a=bestMatch[3].split("_") perfectMatchIdAllele.append(a[1]) else: perfectMatchIdAllele.append(bestMatch[3]) resultsList.append('EXC:' + str(alleleNumber) ) ################### # LOCUS NOT FOUND # ################### #elif bestMatch[0] == '': # resultsList.append('LNF:-1') # append result to the list of results # perfectMatchIdAllele.append('LNF') # printinfo(genomeFile,geneFile) # print "Locus not found \n" elif bestMatch[0] != '' and perfectMatch is not True: ########################### # LOCUS ON THE CONTIG TIP # ########################### #if match.query_start == 1 or bestMatchContigLen <= match.query_end: ## TODO- ## 1 - LOT5 match.query_start ==1 and match.length < match.subj.length (allele length) alignement length ## 2 - LOT 3' match.query_end == match.query.length (contig length) and match.length < contig length (allele length??) 
## 3 - LOT SC bestMatchContigLen <= allele length if match.query_start ==1 and len(match.query) < geneLen: resultsList.append('LOT5:-1') perfectMatchIdAllele.append('LOT5') printinfo(genomeFile,geneFile) print "Locus is on the 5' tip of the contig \n" elif match.query_end == bestMatchContigLen and len(match.query) < bestMatchContigLen: resultsList.append('LOT3:-1') perfectMatchIdAllele.append('LOT3') printinfo(genomeFile,geneFile) print "Locus is on the 3' tip of the contig \n" elif bestMatchContigLen <= geneLen: resultsList.append('LOTSC:-1') perfectMatchIdAllele.append('LOTSC') printinfo(genomeFile,geneFile) #print match.query_start print "Locus is bigger than the contig \n" elif 'N' in alleleStr: #TODO gravar para ficheiro ##################### # ALLELE NOT FOUND # # N base found! ##################### geneFile2= os.path.splitext(geneFile)[0] + "LNFN.fasta" print geneFile2 with open(geneFile2, 'a') as f: f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+"\n") f.write((alleleStr) +"\n") resultsList.append('LNFN:-1') perfectMatchIdAllele.append('LNFN') printinfo(genomeFile,geneFile) print "LNFN, contains N bases! 
\n" else: # ------------------------------------------------------------------------------------------------------- # # # # USING PRODIGAL TO TRY TO EXTEND CDS # # # # ------------------------------------------------------------------------------------------------------- # CDSType='' extended, strCDS, CDSType = extendCDS(bestMatchContig, currentCDSDict, match.query_start, match.query_end, currentGenomeDict, geneLen) # --- if it was possible to extend it using prodigal --- # #print extended #print strCDS #print CDSType if extended : alleleStr = strCDS lenRatio = float(len(strCDS)) / float(geneLen) #print alleleStr #print lenRatio elif not extended and biggestAllelelen > geneLen: extended, strCDS, CDSType = extendCDS(bestMatchContig, currentCDSDict, match.query_start, match.query_end, currentGenomeDict, biggestAllelelen) if extended : alleleStr = strCDS lenRatio = float(len(strCDS)) / float(geneLen) else: alleleStr = alleleStr.replace('-', '') else: # --- removing gaps '-' --- # #print alleleStr alleleStr = alleleStr.replace('-', '') # --- continuing the allele calling --- # #print geneDict #print alleleStr # --- it might be needed to obtain the reverse complement of the allele string --- # if match.sbjct_start > match.sbjct_end: alleleStr = reverseComplement(alleleStr) if alleleStr in geneDict: alleleNumber = geneDict[ alleleStr ] ################################################ # EXACT MATCH --- MATCH == GENE --- GENE FOUND # ################################################ perfectMatchIdAllele.append(alleleNumber) resultsList.append('EXC:' + str(alleleNumber) ) else: isUndefined = False #print geneDict.keys()[0] defAllele='' defAlleleName='' for k in geneDict.keys(): if alleleStr in k: defAllele=k #print alleleStr isUndefined = True defAlleleName=geneDict.get(k) break if extended and isUndefined and idPercent > 0.8 and ((int(len(match.query))==int(len(defAllele)) or int(len(match.query))==int(len(defAllele))+1 or int(len(match.query))==int(len(defAllele))-1)) : 
#extended allele to compare may be different from the allele to compare from bm alleleStr=match.query alleleStr = alleleStr.replace('-', '') if match.sbjct_start > match.sbjct_end: #### - error?? alleleStr = reverseComplement(alleleStr) if int(len(alleleStr))==int(len(defAllele)): # se o match for do mesmo tamanho que o alello tagAux = 'NA1:' printinfo(genomeFile,geneFile) perfectMatchIdAllele.append("NA1-"+str(alleleI)) elif int(len(alleleStr))==int(len(defAllele))-1 : # se o match tiver uma base a mais que o alelo tagAux = 'NA2:' printinfo(genomeFile,geneFile) perfectMatchIdAllele.append("NA2-"+str(alleleI)) else: #se o match tiver uma base a menos que o alelo tagAux = 'NA3:' printinfo(genomeFile,geneFile) perfectMatchIdAllele.append("NA3-"+str(alleleI)) print "New allele found! Adding allele "+ tagAux + str(alleleI) +" to the database" geneDict[alleleStr] = alleleI resultsList.append( tagAux + str(alleleI) ) orderedAlleleNames.append('allele_' + str(alleleI) + '_' + tagAux[:-1] + str(os.path.basename(genomeFile))) # --- add the new allele to the gene fasta --- # fG = open( geneFile, 'a' ) fG.write('>allele_' + str(alleleI) + '_' + tagAux[:-1] + str(os.path.basename(genomeFile)) + '\n') fG.write( alleleStr + '\n') fG.close() alleleI += 1 Gene_Blast_DB_name = Create_Blastdb( geneFile, 1 ) elif not extended and idPercent > 0.8 and ((int(len(match.query))==int(geneLen) or int(len(match.query))==int(geneLen)+1 or int(len(match.query))==int(geneLen)-1)) : alleleStr=match.query alleleStr = alleleStr.replace('-', '') if match.sbjct_start > match.sbjct_end: #### - error?? 
alleleStr = reverseComplement(alleleStr) if int(len(alleleStr))==int(geneLen): # se o match for do mesmo tamanho que o alello tagAux = 'NA4:' printinfo(genomeFile,geneFile) perfectMatchIdAllele.append("NA4-"+str(alleleI)) elif int(len(alleleStr))==int(geneLen)-1 : # se o match tiver uma base a mais que o alelo tagAux = 'NA5:' printinfo(genomeFile,geneFile) perfectMatchIdAllele.append("NA5-"+str(alleleI)) else: #se o match tiver uma base a menos que o alelo tagAux = 'NA6:' printinfo(genomeFile,geneFile) perfectMatchIdAllele.append("NA6-"+str(alleleI)) print "New allele found! Adding allele "+ tagAux + str(alleleI) +" to the database" geneDict[alleleStr] = alleleI resultsList.append( tagAux + str(alleleI) ) orderedAlleleNames.append('allele_' + str(alleleI) + '_' + tagAux[:-1] + str(os.path.basename(genomeFile))) # --- add the new allele to the gene fasta --- # fG = open( geneFile, 'a' ) fG.write('>allele_' + str(alleleI) + '_' + tagAux[:-1] + str(os.path.basename(genomeFile)) + '\n') fG.write( alleleStr + '\n') fG.close() alleleI += 1 Gene_Blast_DB_name = Create_Blastdb( geneFile, 1 ) elif isUndefined: #################### # UNDEFINED ALLELE # # it is contained in another allele #################### alleleStr=match.query #if match.sbjct_start > match.sbjct_end: #### - error #alleleStr = reverseComplement(alleleStr) resultsList.append('UND:-1') perfectMatchIdAllele.append("undefined allele") printinfo(genomeFile,geneFile) print "Undefined allele \n" geneFile2= os.path.splitext(geneFile)[0] + "undefined.fasta" print geneFile2 with open(geneFile2, 'a') as f: f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n") f.write((alleleStr) +"\n") #f.write(">BlastBestMatch"+str(defAlleleName)+"\n") #f.write((alleleStr)+"\n") f.write(">Allele"+str(defAlleleName)+"\n") f.write((defAllele)+"\n") else: if not extended : if lenRatio < 0.5: ############### # SMALL MATCH # ############### resultsList.append('SAC:-1') # 
don't know what 'SAC' stands for perfectMatchIdAllele.append('small match') printinfo(genomeFile,geneFile) print "lower than 50% match \n" elif lenRatio < 0.8 and idPercent < 0.5: ##################### # INCOMPLETE ALLELE # # it was not possible to extend it to at least 80% of the length of the gene ##################### resultsList.append('INC:-1') perfectMatchIdAllele.append('allele incomplete') printinfo(genomeFile,geneFile) print "Incomplete allele\n" else: ################## # LNF WTFFF # ################## geneFile2= os.path.splitext(geneFile)[0] + "LNF2.fasta" print geneFile2 with open(geneFile2, 'a') as f: f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n") f.write((alleleStr) +"\n") f.write(">Allele\n") f.write((bmAllele)+"\n") resultsList.append('LNF2') printinfo(genomeFile,geneFile) perfectMatchIdAllele.append("LNF2") print "Not extended and no allele found" else: ####################### # ADD INFERRED ALLELE # # a new allele that was extended with prodigal ####################### if(CDSType=='larger than match'): tagAux = 'INF1:' elif(CDSType=='start codon inside match'): tagAux = 'INF2:' elif(CDSType=='early stop codon in match'): tagAux = 'INF3:' elif(CDSType=='same size as allele'): tagAux = 'INF4:' else: tagAux = 'INF5:' print "infered allele has location : "+(CDSType) printinfo(genomeFile,geneFile) perfectMatchIdAllele.append( tagAux +"-"+str(alleleI)) print "New allele Infered with prodigal! 
Adding allele "+ tagAux + str(alleleI) +" to the database\n" geneDict[alleleStr] = alleleI resultsList.append( tagAux + str(alleleI) ) orderedAlleleNames.append('allele_' + str(alleleI) + '_' + tagAux[:-1] + str(os.path.basename(genomeFile))) # --- add the new allele to the gene fasta --- # fG = open( geneFile, 'a' ) fG.write('>allele_' + str(alleleI) + '_' + tagAux[:-1] + str(os.path.basename(genomeFile)) + '\n') #print alleleStr fG.write( alleleStr + '\n') fG.close() alleleI += 1 # --- remake blast DB --- # Gene_Blast_DB_name = Create_Blastdb( geneFile, 1 ) final = (resultsList,perfectMatchIdAllele) #return (resultsList) return final
def main():
    """Reduce an ffn gene file to a set of unique, non-paralogous loci.

    Command-line arguments:
        -i  path to the ffn (nucleotide FASTA) file.
        -g  minimum nucleotide length; kept genes that are not longer than
            this are counted as removed by size.

    Steps performed:
        1. Translate every gene.  Genes whose translation has length <= 1
           have their name printed; proteins already seen or shorter than
           67 residues are counted and dropped.
        2. Drop proteins that are a substring of an already accepted
           protein (longest first) and write the rest to ``proteins.fasta``
           in the directory of the input file.
        3. BLAST the remaining proteins against themselves and decide, from
           the score ratio (> 0.6) and the gene lengths, which genes to keep
           and which to remove.  Decisions are written to ``logfile.txt``.
        4. Write one ``<name>.fasta`` per kept gene and a
           ``concatenated.fasta`` holding all of them.
    """
    parser = argparse.ArgumentParser(description="Given an ffn file, recovers the genes that are not paralogs and have a size bigger than the g parameter provided")
    parser.add_argument('-i', nargs='?', type=str, help='ffn file', required=True)
    parser.add_argument('-g', nargs='?', type=int, help='int minimum size', required=True)

    args = parser.parse_args()
    genes = args.i
    sizethresh = args.g
    # Hard-coded switch: when True the protein file on disk is reused.
    passSteps = False

    # translate to protein and create new file
    abspath = os.path.abspath(genes)
    filename = os.path.basename(genes)
    abspath = abspath.replace(filename, '')
    proteinfile = os.path.join(abspath, 'proteins.fasta')

    geneDict = {}
    protDict = {}
    orderedprotDict = collections.OrderedDict()
    # A set gives O(1) duplicate detection (the original used a list).
    alreadyIn = set()
    totalgenes = 0
    repeatedgenes = 0
    smallgenes = 0
    g = 0

    if not passSteps:
        print("not passing steps")
        with open(proteinfile, "wb") as f:
            for gene in HTSeq.FastaReader(genes):
                # Count every gene read; the original incremented this once,
                # before the loop, so the totals printed below were always 1.
                totalgenes += 1
                protseq, x, y = translateSeq(str(gene.seq))
                if len(protseq) > 1:
                    protstr = str(protseq)
                    if protstr in alreadyIn:
                        repeatedgenes += 1
                    elif len(protstr) < 67:
                        smallgenes += 1
                    else:
                        alreadyIn.add(protstr)
                        protname = ">" + str(gene.name) + "\n"
                        f.write(protname + protstr + "\n")
                        protDict[protname] = protstr
                        geneDict[str(gene.name)] = gene.seq
                else:
                    print(gene.name)

        # Longest proteins first, so that shorter ones can be detected as
        # substrings of the ones already accepted.
        for protname, protstr in sorted(protDict.items(), key=lambda x: len(x[1]), reverse=True):
            orderedprotDict[protname] = protstr

        print(str(repeatedgenes) + " repeated genes out of " + str(totalgenes))
        print(str(smallgenes) + " small genes out of " + str(totalgenes))
        print("protein file created")

        # first step - remove genes contained in other genes or 100% equal genes
        auxDict = {}
        j = 0
        print("Checking if proteins are equal or substring of others...")
        # Starting from an empty collection, a protein is only added when it
        # is not contained in (or equal to) one that was already added.
        auxprot = []
        for protname, protstr in orderedprotDict.items():
            prot = str(protstr)
            if any(prot in x for x in auxprot):
                g += 1
            else:
                auxDict[protstr] = protname
                auxprot.append(prot)
            # NOTE(review): progress line assumed to be printed for every
            # protein, not only for the accepted ones - confirm.
            print(str(j) + " out of " + str(len(orderedprotDict)))
            j += 1

        print("%s genes are contained in other genes" % (g))

        # overwrite the original file, obtaining a new file with unique genes
        with open(proteinfile, "wb") as f:
            # auxDict maps sequence -> ">name\n", hence header (v) then sequence (k).
            f.write("".join(v + k + "\n" for k, v in auxDict.items()))
    else:
        totalgenes = 0
        smallgenes = 0
        for gene in HTSeq.FastaReader(genes):
            totalgenes += 1
            protseq, x, y = translateSeq(str(gene.seq))
            if len(protseq) > 1:
                protstr = str(protseq)
                if protstr in alreadyIn:
                    repeatedgenes += 1
                elif len(protstr) < 67:
                    smallgenes += 1
                else:
                    alreadyIn.add(protstr)
                    protname = ">" + str(gene.name) + "\n"
                    protDict[protname] = protstr
                    geneDict[str(gene.name)] = gene.seq
            else:
                print(gene.name)

    geneFile = os.path.abspath(proteinfile)
    print(proteinfile)
    Gene_Blast_DB_name = Create_Blastdb(geneFile, 1, True)
    geneF = os.path.splitext(geneFile)[0]
    blast_out_file = geneF + '.xml'

    # ------------------------------ RUNNING BLAST ------------------------------ #
    cline = NcbiblastpCommandline(query=geneFile, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5)
    blast_records = runBlastParser(cline, blast_out_file, geneFile)

    toRemove = []
    genesToKeep = []
    log = ["removed\tcause\texplanation"]

    # The bare ``raise`` statements below are used as control flow: they
    # abandon the current BLAST record and are swallowed by the
    # ``except Exception`` at the end of the ``try``.  Any genuine error
    # raised inside the ``try`` is silently skipped in the same way.
    # NOTE(review): the nesting of this loop was reconstructed from
    # whitespace-mangled source; confirm the position of each ``raise`` and
    # of the ``toRemove.append`` that follows the ``elif``.
    for blast_record in blast_records:
        allelename = blast_record.query
        allelename = allelename.split(" ")
        allelename = allelename[0]
        alleleLength = len(geneDict[allelename])

        try:
            # if gene A is not on the toRemove list yet, add to genesToKeep list
            if str(blast_record.query) not in toRemove:
                genesToKeep.append(blast_record.query)
                i = 0

                # if the first alignment is not against self, remove gene A
                # from genesToKeep and add gene B instead
                if not str(blast_record.query) == str((blast_record.alignments[0]).hit_def):
                    genesToKeep.remove(str(blast_record.query))
                    toRemove.append(str(blast_record.query))
                    log.append(str(blast_record.query)+"\t"+str((blast_record.alignments[0]).hit_def)+"\t"+"2 is first best match")
                    # if gene B is not on the toRemove list, add to genesToKeep list
                    if str((blast_record.alignments[0]).hit_def) not in toRemove:
                        genesToKeep.append(str((blast_record.alignments[0]).hit_def))
                    raise

                selfblastscore = (((blast_record.alignments[0]).hsps)[0]).score
                while i < len(blast_record.alignments):
                    align = blast_record.alignments[i]
                    match = (align.hsps)[0]
                    scoreRatio = float(match.score)/float(selfblastscore)
                    alleleLength2 = len(geneDict[str(align.hit_def)])

                    # if good match and gene B not in toRemove list
                    if (scoreRatio > 0.6 and not str(align.hit_def) == str(blast_record.query) and str(align.hit_def) not in toRemove):
                        # if gene B is bigger than gene A, keep bigger gene B
                        if alleleLength2 > alleleLength:
                            genesToKeep.append(str(align.hit_def))
                            genesToKeep.remove(str(blast_record.query))
                            toRemove.append(str(blast_record.query))
                            log.append(str(blast_record.query)+"\t"+str(align.hit_def)+"\t"+"2 is bigger and bsr >0.6")
                            raise
                        # else add gene B to toRemove list
                        elif str(align.hit_def) in genesToKeep:
                            genesToKeep.remove(str(align.hit_def))
                        toRemove.append(str(align.hit_def))
                        log.append(str(align.hit_def)+"\t"+str(blast_record.query)+"\t"+"2 is bigger and bsr >0.6")
                    i += 1

            # else gene A is on the toRemove list: add all similar genes
            # (not in genesToKeep) to the toRemove list
            else:
                i = 0
                selfblastscore = 0
                for align in blast_record.alignments:
                    if not (str(align.hit_def) == str(blast_record.query)):
                        selfblastscore = ((align.hsps)[0]).score
                        print("gene "+str(align.hit_def)+" is bigger than gene "+str(blast_record.query))
                        raise
                # NOTE(review): selfblastscore is still 0 when this loop is
                # reached, so the division fails on its first iteration and
                # the error is swallowed below - confirm the intent.
                while i < len(blast_record.alignments):
                    align = blast_record.alignments[i]
                    match = (align.hsps)[0]
                    scoreRatio = float(match.score)/float(selfblastscore)
                    if align.hit_def not in genesToKeep and not str(align.hit_def) == str(blast_record.query) and scoreRatio > 0.6:
                        toRemove.append(align.hit_def)
                        log.append(str(align.hit_def)+"\t"+str(blast_record.query)+"\t"+"2 was on the removed list and bsr >0.6")
                    i += 1
        except Exception as e:
            pass

    with open("logfile.txt", "wb") as f:
        for elem in log:
            f.write(str(elem)+"\n")

    # Sets give O(1) membership tests while writing the per-gene files.
    keptSet = set(genesToKeep)
    removedSet = set(toRemove)
    print(len(removedSet))
    print(len(keptSet))
    print(len(keptSet - removedSet))

    pathfiles = os.path.dirname(geneFile)
    pathfiles = pathfiles + "/"

    removedparalogs = 0
    removedsize = 0
    totalgenes = 0
    rest = 0
    concatenatedParts = []

    for contig in HTSeq.FastaReader(genes):
        totalgenes += 1
        name2 = contig.name
        if name2 not in removedSet and name2 in keptSet:
            if int(len(contig.seq)) > sizethresh:
                # "|" is replaced so the gene name can be used as a file name
                namefile = contig.name.replace("|", "_")
                with open(pathfiles + namefile + ".fasta", "wb") as f:
                    f.write(">1\n" + contig.seq + "\n")
                rest += 1
                concatenatedParts.append(">" + namefile + "\n" + contig.seq + "\n")
            else:
                removedsize += 1
        else:
            removedparalogs += 1

    print("%s genes are contained in other genes" % (g))
    print("Removed %s same Locus genes" % str(removedparalogs))
    print("Removed %s because of size " % str(removedsize))
    print("%s Scheme genes " % str(rest))
    print("total genes:" + str(totalgenes))

    with open(pathfiles + "concatenated.fasta", "wb") as f:
        f.write("".join(concatenatedParts))
def main():
    """Call the allele of one locus in each genome of a list.

    Command-line arguments (``sys.argv``):
        1. path to a pickle holding ``[geneFile, genomesList]``.
        2. path to the temporary working directory.

    For every genome the translated locus alleles are BLASTed against the
    genome's protein database and the best hit is classified (LNF3, LNFN,
    NIPL, EXC, PLOT*, LOT*, ALM, ASM or a new INF allele).  A new allele is
    appended to the locus FASTA and the self-BLAST scores are recomputed.

    The tuple ``(resultsList, perfectMatchIdAllele)`` is pickled to
    ``<temppath>/<locus>_result.txt`` and the location list to
    ``<temppath>/<locus>_result2.txt``; the per-locus working directory is
    removed afterwards.

    Returns:
        True when the run completes, False when the command-line arguments
        are missing.
    """
    print("Starting script at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))
    try:
        input_file = sys.argv[1]
        temppath = sys.argv[2]
    except IndexError:
        print("usage: list_pickle_obj")
        # Without both arguments the original went on and crashed on an
        # unbound name; stop here instead.
        return False

    # NOTE(review): pickle.load can execute arbitrary code; these files are
    # assumed to be produced by the pipeline itself - confirm.
    with open(input_file, 'rb') as f:
        argumentList = pickle.load(f)

    geneFile = argumentList[0]
    genomesList = argumentList[1]
    locusName = os.path.basename(geneFile)

    basepath = os.path.join(temppath, os.path.splitext(geneFile)[0])
    if not os.path.exists(basepath):
        os.makedirs(basepath)

    resultsList = []
    perfectMatchIdAllele = []
    perfectMatchIdAllele2 = []

    def record(result, alleleTag, locationTag):
        # One entry per list for the genome being processed.
        resultsList.append(result)
        perfectMatchIdAllele.append(alleleTag)
        perfectMatchIdAllele2.append(locationTag)

    def strandSign(isReversed):
        return "-" if isReversed else "+"

    print("Getting BSR at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))
    geneScorePickle = os.path.abspath(geneFile) + '_bsr.txt'

    # Reuse the stored self-BLAST scores when they exist, otherwise compute them.
    scoresAreCached = os.path.isfile(geneScorePickle)
    alleleI, allelescores, alleleList = getBlastScoreRatios(
        geneFile, basepath, not scoresAreCached)

    print("Finished BSR at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

    proteinfastaPath = os.path.join(basepath, str(locusName + '_protein.fasta'))
    protein2fastaPath = os.path.join(basepath, str(locusName + '_protein2.fasta'))

    genome = -1
    # NOTE(review): contigs of all genomes accumulate in this dictionary; it
    # is not reset between genomes.
    genomeDict = {}
    print("starting allele call at: " + time.strftime("%H:%M:%S-%d/%m/%Y"))
    for genomeFile in genomesList:
        print(genomeFile)
        genomeBase = str(os.path.basename(genomeFile))

        # score, score ratio, perfect match, key name of the DNA sequence
        # string, allele ID (the match object and the allele length are
        # appended once a hit is stored)
        bestmatch = [0, 0, False, '', 0]

        # load the translated CDS from the genome to a dictionary
        filepath = os.path.join(temppath, genomeBase + "_ORF_Protein.txt")
        with open(filepath, 'rb') as f:
            listOfCDS = pickle.load(f)

        # load the contig info of the genome to a dictionary
        for contig in HTSeq.FastaReader(genomeFile):
            genomeDict[contig.name] = str(contig.seq)
        currentGenomeDict = genomeDict
        genome += 1

        print("Blasting alleles on genome at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))
        blast_out_file = os.path.join(basepath, "blastdbs/" + locusName + '_List.xml')
        Gene_Blast_DB_name = os.path.join(temppath, genomeBase + "/" + genomeBase + "_db")

        # blast the genome CDS against the translated locus
        cline = NcbiblastpCommandline(query=proteinfastaPath, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5)
        blast_records = runBlastParser(cline, blast_out_file, proteinfastaPath)
        print("Blasted alleles on genome at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

        # Most frequent allele length; when every length occurs once the
        # length of the first allele is used.
        alleleSizes = [len(allele) for allele in alleleList]
        moda = max(set(alleleSizes), key=alleleSizes.count)
        contador = Counter(alleleSizes).most_common()
        if (contador[0])[1] == 1:
            moda = alleleSizes[0]

        try:
            # iterate through the blast results
            for blast_record in blast_records:
                locationcontigs = []
                for alignment in blast_record.alignments:
                    # select the best match
                    for match in alignment.hsps:
                        alleleMatchid = str(blast_record.query_id).split("_")[1]
                        alleleIndex = int(alleleMatchid) - 1
                        scoreRatio = float(match.score) / float(allelescores[alleleIndex])
                        cdsStrName = ((alignment.title).split(" "))[1]
                        DNAstr = listOfCDS[">" + cdsStrName]
                        AlleleDNAstr = alleleList[alleleIndex]

                        # Compare the DNA of the match with the DNA of the
                        # allele (proteins may be equal while DNA differs),
                        # also trying the reverse complement.  The original
                        # test ``DNAstr == AlleleDNAstr is False`` is a
                        # chained comparison that is always false, which made
                        # every hit count as a DNA match.
                        compare = bool(DNAstr == AlleleDNAstr)
                        if not compare:
                            try:
                                compare = bool(reverseComplement(DNAstr) == AlleleDNAstr)
                            except Exception:
                                # best effort: keep compare False when the
                                # sequence cannot be reverse complemented
                                pass

                        if scoreRatio > 0.6:
                            locationcontigs.append(cdsStrName)

                        candidate = [match.score, scoreRatio, compare, cdsStrName,
                                     int(alleleMatchid), match, len(AlleleDNAstr)]
                        if "N" in DNAstr or "K" in DNAstr or "R" in DNAstr:
                            pass
                        elif scoreRatio == 1 and (bestmatch[2] is False or match.score > bestmatch[0]):
                            # identical protein: the hit is "perfect" only
                            # when the DNA is identical as well
                            bestmatch = candidate
                        elif (match.score > bestmatch[0] and scoreRatio > 0.6
                              and scoreRatio > bestmatch[1] and bestmatch[2] is False):
                            candidate[2] = False
                            bestmatch = candidate

            print("Classifying the match at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

            # if no best match was found it's a Locus Not Found
            if bestmatch[0] == 0 or "N" in AlleleDNAstr or "K" in AlleleDNAstr or "R" in AlleleDNAstr:
                ###################
                # LOCUS NOT FOUND #
                ###################
                if bestmatch[0] == 0:
                    record('LNF3:-1', 'LNF', 'LNF')
                    print("Locus not found, no matches \n")
                else:
                    record('LNFN:-1', 'LNF', 'LNF')
                    print("Locus has strange base (N, K or R) \n")

            # more than one CDS with BSR > 0.6: Non Paralog Locus
            elif len(set(locationcontigs)) > 1:
                record('NIPL', 'NIPL', 'NIPL')
                for elem in locationcontigs:
                    print(elem)

            # the DNA of the match is equal to the DNA of the compared allele
            elif bestmatch[2] is True:
                # CDS key is split on "&": contig name first, location third
                nameParts = bestmatch[3].split("&")
                matchLocation = nameParts[2]
                contigname = nameParts[0]
                print(contigname)
                alleleStr = listOfCDS[">" + bestmatch[3]]
                protSeq, alleleStr, Reversed = translateSeq(alleleStr)

                # check for possible locus on tip
                match = bestmatch[5]
                matchLocation2 = matchLocation.split("-")
                bestMatchContigLen = len(currentGenomeDict[contigname])
                rightmatchContig = bestMatchContigLen - int(matchLocation2[1])
                leftmatchContig = int(matchLocation2[0])
                if Reversed:
                    leftmatchContig, rightmatchContig = rightmatchContig, leftmatchContig

                # extra space to the right and left between the allele and match
                possibleExtra = int(moda) - ((int(match.query_end) * 3) - (int(match.query_start) * 3))
                locationTag = str(contigname) + "&" + str(matchLocation) + "&" + strandSign(Reversed)

                tip = None
                if possibleExtra >= 0:
                    rightmatchAllele = possibleExtra
                    leftmatchAllele = possibleExtra
                    if leftmatchContig < leftmatchAllele and rightmatchContig < rightmatchAllele:
                        tip = ('PLOTSC', "Locus is possibly bigger than the contig \n")
                    elif leftmatchContig < leftmatchAllele:
                        tip = ('PLOT3', "Locus is possibly on the 3' tip of the contig \n")
                    elif rightmatchContig < rightmatchAllele:
                        tip = ('PLOT5', "Locus is possibly on the 5' tip of the contig \n")

                if tip is None:
                    ################################################
                    # EXACT MATCH --- MATCH == GENE --- GENE FOUND #
                    ################################################
                    record('EXC:' + str(bestmatch[4]), str(bestmatch[4]), locationTag)
                else:
                    record(tip[0] + ':-1', tip[0], tip[0])
                    print(match)
                    print("contig extras (l,r)")
                    print("%s %s" % (leftmatchContig, rightmatchContig))
                    print("allele extras (l,r)")
                    print("%s %s" % (leftmatchAllele, rightmatchAllele))
                    print(tip[1])

            # match with BSR > 0.6 and not equal DNA sequences
            else:
                match = bestmatch[5]
                geneLen = bestmatch[6]
                nameParts = bestmatch[3].split("&")
                matchLocation = nameParts[2].split("-")
                contigname = nameParts[0]
                bestMatchContigLen = len(currentGenomeDict[contigname])
                alleleStr = listOfCDS[">" + bestmatch[3]]
                protSeq, alleleStr, Reversed = translateSeq(alleleStr)

                rightmatchContig = bestMatchContigLen - int(matchLocation[1])
                leftmatchContig = int(matchLocation[0])
                if Reversed:
                    leftmatchContig, rightmatchContig = rightmatchContig, leftmatchContig
                # NOTE(review): assumed to be printed whether or not the
                # match is reversed - confirm.
                print("%s %s" % (rightmatchContig, leftmatchContig))

                # extra space to the right and left between the allele and
                # the match, to check if it is still inside the contig
                rightmatchAllele = geneLen - ((int(match.query_end) + 1) * 3)
                leftmatchAllele = ((int(match.query_start) - 1) * 3)

                ###########################
                # LOCUS ON THE CONTIG TIP #
                ###########################
                tip = None
                if leftmatchContig < leftmatchAllele and rightmatchContig < rightmatchAllele:
                    tip = ('LOTSC', "Locus is bigger than the contig \n")
                elif leftmatchContig < leftmatchAllele:
                    tip = ('LOT3', "Locus is on the 3' tip of the contig \n")
                elif rightmatchContig < rightmatchAllele:
                    tip = ('LOT5', "Locus is on the 5' tip of the contig \n")

                if tip is not None:
                    record(tip[0] + ':-1', tip[0], tip[0])
                    print(match)
                    print(contigname)
                    print(geneFile)
                    print("%s %s" % (leftmatchAllele, rightmatchAllele))
                    print(tip[1])
                elif len(alleleStr) > moda + (moda * 0.2):
                    # more than 20% longer than the most frequent length
                    print(moda)
                    print(alleleStr)
                    record('ALM', 'ALM', 'ALM')
                elif len(alleleStr) < moda - (moda * 0.2):
                    # more than 20% shorter than the most frequent length
                    print(moda)
                    print(alleleStr)
                    record('ASM', 'ASM', 'ASM')
                else:
                    #######################
                    # ADD INFERRED ALLELE #		# a new allele
                    #######################
                    tagAux = 'INF'
                    newAlleleId = str(alleleI + 1)
                    record(tagAux + newAlleleId,
                           tagAux + "-" + newAlleleId,
                           str(contigname) + "&" + str(matchLocation[0]) + "-" + str(matchLocation[1]) + "&" + strandSign(Reversed))
                    print("New allele! Adding allele " + tagAux + newAlleleId + " to the database\n")

                    # --- add the new allele to the gene fasta --- #
                    # NOTE(review): tagAux[:-1] yields 'IN' because tagAux
                    # has no trailing ':' here; kept as in the original.
                    appendAllele = '>allele_' + newAlleleId + '_' + tagAux[:-1] + "_" + str(os.path.basename(genomesList[genome])) + '\n'
                    with open(geneFile, 'a') as fG:
                        fG.write(appendAllele)
                        fG.write(alleleStr + '\n')

                    proteinRecord = '>' + newAlleleId + '\n' + str(protSeq) + '\n'
                    # the single-allele file is overwritten, the locus
                    # protein file is extended
                    with open(protein2fastaPath, 'w') as fG:
                        fG.write(proteinRecord)
                    with open(proteinfastaPath, 'a') as fG:
                        fG.write(proteinRecord)

                    # --- remake blast DB and recalculate the BSR for the locus --- #
                    alleleList.append(alleleStr)
                    print(proteinfastaPath)
                    Gene_Blast_DB_name2 = Create_Blastdb(protein2fastaPath, 1, True)
                    print("Re-calculating BSR at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))
                    alleleI, allelescores, alleleList = reDogetBlastScoreRatios(
                        protein2fastaPath, basepath, alleleI, allelescores,
                        Gene_Blast_DB_name2, alleleList, geneScorePickle)
                    print("allele id " + str(alleleI))
                    print("Done Re-calculating BSR at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

        except Exception as e:
            # Any failure while processing a genome is reported and stored as
            # an ERROR entry so the remaining genomes are still processed.
            print("some error occurred")
            print(e)
            print('Error on line {}'.format(sys.exc_info()[-1].tb_lineno))
            perfectMatchIdAllele2.append("ERROR")
            perfectMatchIdAllele.append("ERROR")
            resultsList.append('ERROR')

    final = (resultsList, perfectMatchIdAllele)
    print("Finished allele calling at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))
    filepath = os.path.join(temppath, locusName + "_result.txt")
    filepath2 = os.path.join(temppath, locusName + "_result2.txt")
    with open(filepath, 'wb') as f:
        pickle.dump(final, f)
    with open(filepath2, 'wb') as f:
        pickle.dump(perfectMatchIdAllele2, f)
    shutil.rmtree(basepath)
    return True
def getBlastScoreRatios(orgName,allelescores,cdsDict,prodigalPath):
    """BLAST the proteome of an organism and collect hits with BSR > 0.6.

    The function relies on names that are not parameters and must exist at
    module level: ``isContig``, ``nameOrg``, ``pathRef``, ``name``,
    ``blast_out_file`` and ``genomeDB``.
    NOTE(review): ``orgName`` is not used; ``nameOrg`` is read instead -
    confirm whether they are meant to be the same value.

    Args:
        orgName: unused (see note above).
        allelescores: mapping from a hit definition (``alignment.hit_def``)
            to the score used as denominator of the BLAST score ratio.
        cdsDict: passed to ``CreateProteomeContig`` when ``isContig`` is not
            ``"no"``.
        prodigalPath: path of a file that is opened and read; its content is
            not used.

    Returns:
        ``(bestMatches, queryCDS)``.  ``bestMatches`` maps the 1-based
        record number (as a string) to ``[names, lengths, query starts,
        query lengths, query]``.  ``queryCDS`` is the value returned by
        ``CreateProteomeContig``, or None when ``isContig == "no"`` (the
        original raised an unbound-variable error in that case).
    """
    # The content is unused, but reading is kept so that a missing file
    # still fails here; the handle is now closed.
    with open(prodigalPath, 'r') as Presults:
        Presults.readlines()

    alleleProt = ''
    queryCDS = None

    if isContig == "no":
        CreateProteome(nameOrg)
    else:
        queryCDS = CreateProteomeContig(nameOrg, cdsDict)

    proteomePath = pathRef + nameOrg + 'proteome.fasta'
    cline = NcbiblastpCommandline(query=proteomePath, db=name, out=blast_out_file, outfmt=5, num_alignments=7000, num_descriptions=7000)
    blast_records = runBlastParser(cline, blast_out_file, alleleProt)
    # the proteome file is only needed as the BLAST query
    os.remove(proteomePath)

    countRecords = 0
    bestMatches = {}

    for blast_record in blast_records:
        countRecords += 1
        BestMatchResults = []
        length = []
        alignment_posStart = []
        query_length = []

        for alignment in blast_record.alignments:
            # NOTE(review): only the first HSP of each alignment is examined
            # (the original left the HSP loop with ``break``); the nesting was
            # reconstructed from whitespace-mangled source - confirm.
            for match in alignment.hsps[:1]:
                blastScoreRatio = float(match.score) / float(allelescores[str(alignment.hit_def)])

                # the gene name is the 6th "|"-separated field of the title,
                # or the 3rd when the title has fewer fields
                try:
                    geneName = alignment.title.split("|")[5]
                except IndexError:
                    geneName = alignment.title.split("|")[2]

                # NOTE(review): entries are stored with a genomeDB prefix, so
                # this membership test on the bare name does not detect
                # duplicates unless the prefix is empty.
                if geneName.strip() not in BestMatchResults and blastScoreRatio > 0.6:
                    BestMatchResults.append(genomeDB + "..." + str(geneName).strip())
                    length.append(str(match.align_length - 1))
                    alignment_posStart.append(str(match.query_start))
                    query_length.append(str(len(match.query)))

        bestMatches[str(countRecords)] = [BestMatchResults, length, alignment_posStart, query_length, str(blast_record.query)]

    # NOTE(review): assumed to be printed once, after all records - confirm.
    print(countRecords)
    return bestMatches, queryCDS
def getBlastScoreRatios(genefile, basepath, doAll):
    """Translate the alleles of a locus and obtain their self-BLAST scores.

    Every allele that translates to a non-empty protein is written on its
    own to ``<basepath>/<locus>_protein2.fasta`` and a BLAST database is
    built from that file.  All translated alleles are finally written to
    ``<basepath>/<locus>_protein.fasta``.

    Args:
        genefile: FASTA file with the alleles of the locus.
        basepath: working directory of the locus.
        doAll: when true, each allele is BLASTed against its own database,
            the HSP scores are collected and pickled, together with the
            allele number, to ``<genefile>_bsr.txt``; when false the scores
            are loaded from that pickle instead.

    Returns:
        ``(number of alleles, list of scores, list of allele sequences)``.
    """
    geneScorePickle = os.path.abspath(genefile) + '_bsr.txt'
    locusName = os.path.basename(genefile)
    singleAllelePath = os.path.join(basepath, str(locusName + '_protein2.fasta'))

    alleleI = 0
    allelescores = []
    alleleList = []
    proteinRecords = []
    # Scores read from the pickle; loaded once instead of once per allele.
    storedScores = None

    for allele in HTSeq.FastaReader(genefile):
        alleleI += 1
        alleleList.append(allele.seq)
        translatedSequence, x, y = translateSeq(allele.seq)

        # NOTE(review): alleles that do not translate are skipped entirely
        # (no database, no score); the nesting was reconstructed from
        # whitespace-mangled source - confirm.
        if translatedSequence == '':
            continue

        alleleProt = ">" + str(alleleI) + "\n" + str(translatedSequence + "\n")
        proteinRecords.append(alleleProt)

        # new db for each allele to blast it against itself
        with open(singleAllelePath, "wb") as f:
            f.write(alleleProt)
        Gene_Blast_DB_name = Create_Blastdb(singleAllelePath, 1, True)

        if doAll:
            blast_out_file = os.path.join(basepath, 'blastdbs/temp.xml')
            print("Starting Blast alleles at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

            # --- get BLAST score ratio --- #
            cline = NcbiblastpCommandline(query=singleAllelePath, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5)
            blast_records = runBlastParser(cline, blast_out_file, alleleProt)
            print("Blasted alleles at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

            for blast_record in blast_records:
                for alignment in blast_record.alignments:
                    for match in alignment.hsps:
                        allelescores.append(int(match.score))

            print("________")
            # the pickle is rewritten after every allele
            with open(geneScorePickle, 'wb') as f:
                pickle.dump([alleleI, allelescores], f)
        else:
            if storedScores is None:
                with open(geneScorePickle, 'rb') as f:
                    storedScores = pickle.load(f)[1]
            allelescores = storedScores

    proteinfastaPath = os.path.join(basepath, str(locusName + '_protein.fasta'))
    with open(proteinfastaPath, "wb") as f:
        f.write("".join(proteinRecords))
    return int(alleleI), allelescores, alleleList
def getBlastScoreRatios(genefile,basepath,doAll):
    """Translate the alleles of a locus and obtain their self-BLAST scores.

    Every allele that translates to a non-empty protein is written on its
    own to ``<basepath>/<locus>_protein2.fasta`` and a BLAST database is
    built from that file.  All translated alleles are finally written to
    ``<basepath>/<locus>_protein.fasta``.

    Args:
        genefile: FASTA file with the alleles of the locus.
        basepath: working directory of the locus.
        doAll: when true, each allele is BLASTed against its own database,
            the HSP scores are collected and pickled, together with the
            allele number, to ``<genefile>_bsr.txt``; when false the scores
            are loaded from that pickle instead.

    Returns:
        ``(number of alleles, list of scores, list of allele sequences)``.
    """
    geneScorePickle = os.path.abspath(genefile) + '_bsr.txt'
    locusName = os.path.basename(genefile)
    singleAllelePath = os.path.join(basepath, str(locusName + '_protein2.fasta'))

    alleleI = 0
    allelescores = []
    alleleList = []
    proteinRecords = []
    # Scores read from the pickle; loaded once instead of once per allele.
    storedScores = None

    for allele in HTSeq.FastaReader(genefile):
        alleleI += 1
        alleleList.append(allele.seq)
        translatedSequence, x, y = translateSeq(allele.seq)

        # NOTE(review): alleles that do not translate are skipped entirely
        # (no database, no score); the nesting was reconstructed from
        # whitespace-mangled source - confirm.
        if translatedSequence == '':
            continue

        alleleProt = ">" + str(alleleI) + "\n" + str(translatedSequence + "\n")
        proteinRecords.append(alleleProt)

        # new db for each allele to blast it against itself
        with open(singleAllelePath, "wb") as f:
            f.write(alleleProt)
        Gene_Blast_DB_name = Create_Blastdb(singleAllelePath, 1, True)

        if doAll:
            blast_out_file = os.path.join(basepath, 'blastdbs/temp.xml')
            print("Starting Blast alleles at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

            # --- get BLAST score ratio --- #
            cline = NcbiblastpCommandline(query=singleAllelePath, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5)
            blast_records = runBlastParser(cline, blast_out_file, alleleProt)
            print("Blasted alleles at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

            for blast_record in blast_records:
                for alignment in blast_record.alignments:
                    for match in alignment.hsps:
                        allelescores.append(int(match.score))

            print("________")
            # the pickle is rewritten after every allele
            with open(geneScorePickle, 'wb') as f:
                pickle.dump([alleleI, allelescores], f)
        else:
            if storedScores is None:
                with open(geneScorePickle, 'rb') as f:
                    storedScores = pickle.load(f)[1]
            allelescores = storedScores

    proteinfastaPath = os.path.join(basepath, str(locusName + '_protein.fasta'))
    with open(proteinfastaPath, "wb") as f:
        f.write("".join(proteinRecords))
    return int(alleleI), allelescores, alleleList
def main():
    """Call the allele of one locus in each genome from protein BLAST score ratios.

    Command line: ``<script> <input pickle> <temp dir>``.  The input pickle
    holds ``[geneFile, genomesList]``.  For every genome the translated
    alleles of the locus are BLASTed (blastp) against the genome's protein
    database ``<temp dir>/<genome>/<genome>_db`` and the best hit is
    classified.  One entry per genome is appended to the result lists:

    * ``LNF3:-1`` / ``LNFN:-1`` - no hit / ambiguous base (N, K, R);
    * ``NIPL`` - hits with score ratio > 0.6 on more than one CDS;
    * ``EXC:<id>`` - CDS DNA identical to a known allele;
    * ``PLOTSC`` / ``PLOT3`` / ``PLOT5`` and ``LOTSC`` / ``LOT3`` / ``LOT5``
      - (possible) locus on a contig tip;
    * ``ALM`` / ``ASM`` - CDS more than 20% longer / shorter than the modal
      allele length;
    * ``INF<id>`` - new allele, appended to the locus FASTA and scored;
    * ``ERROR`` - an exception was raised while classifying.

    Side effects: writes ``<gene>_result.txt`` (pickle of
    ``(resultsList, perfectMatchIdAllele)``) and ``<gene>_result2.txt``
    (pickle of ``perfectMatchIdAllele2``) into the temp dir, may append
    alleles to ``geneFile``, and deletes the per-locus working directory.

    Returns:
        True once both result pickles are written.
    """
    print("Starting script at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

    try:
        input_file = sys.argv[1]
        temppath = sys.argv[2]
    except IndexError:
        print("usage: list_pickle_obj")
        # Without both arguments nothing below can run; previously execution
        # continued and died with a NameError on input_file.
        sys.exit(1)

    # NOTE: pickle can execute code on load; only feed files written by the
    # pipeline itself.
    with open(input_file, 'rb') as f:
        argumentList = pickle.load(f)
    geneFile = argumentList[0]
    genomesList = argumentList[1]

    basepath = os.path.join(temppath, os.path.splitext(geneFile)[0])
    if not os.path.exists(basepath):
        os.makedirs(basepath)

    resultsList = []
    perfectMatchIdAllele = []
    perfectMatchIdAllele2 = []

    print("Getting BSR at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))
    geneScorePickle = os.path.abspath(geneFile) + '_bsr.txt'
    # Self-scores are computed only when no score pickle exists yet.
    alleleI, allelescores, alleleList = getBlastScoreRatios(
        geneFile, basepath, not os.path.isfile(geneScorePickle))
    print("Finished BSR at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

    geneBaseName = os.path.basename(geneFile)
    allProteinPath = os.path.join(basepath, str(geneBaseName + '_protein.fasta'))
    newProteinPath = os.path.join(basepath, str(geneBaseName + '_protein2.fasta'))

    print("starting allele call at: " + time.strftime("%H:%M:%S-%d/%m/%Y"))
    for genomeFile in genomesList:
        print(genomeFile)
        genomeName = str(os.path.basename(genomeFile))

        # score, score ratio, exact DNA match, CDS name, allele id; the HSP
        # and the allele DNA length are appended once a hit is selected.
        bestmatch = [0, 0, False, '', 0]

        # CDS name (">"-prefixed) -> CDS DNA sequence of this genome.
        filepath = os.path.join(temppath, genomeName + "_ORF_Protein.txt")
        with open(filepath, 'rb') as f:
            listOfCDS = pickle.load(f)

        # Contig name -> sequence, for this genome only (the table used to be
        # shared by all genomes and grew with every genome processed).
        currentGenomeDict = {}
        for contig in HTSeq.FastaReader(genomeFile):
            currentGenomeDict[contig.name] = str(contig.seq)

        print("Blasting alleles on genome at : " +
              time.strftime("%H:%M:%S-%d/%m/%Y"))
        blast_out_file = os.path.join(
            basepath, "blastdbs/" + os.path.basename(geneFile) + '_List.xml')
        Gene_Blast_DB_name = os.path.join(
            temppath, genomeName + "/" + genomeName + "_db")

        # blast the translated locus against the genome CDS database
        cline = NcbiblastpCommandline(query=allProteinPath,
                                      db=Gene_Blast_DB_name,
                                      evalue=0.001,
                                      out=blast_out_file,
                                      outfmt=5)
        blast_records = runBlastParser(cline, blast_out_file, allProteinPath)
        print("Blasted alleles on genome at : " +
              time.strftime("%H:%M:%S-%d/%m/%Y"))

        # Modal allele length; when every length is unique the first
        # allele's length is used instead.
        alleleSizes = [len(allele) for allele in alleleList]
        moda = max(set(alleleSizes), key=alleleSizes.count)
        contador = Counter(alleleSizes).most_common()
        if contador[0][1] == 1:
            moda = alleleSizes[0]

        try:
            # iterate through the blast results
            for blast_record in blast_records:
                locationcontigs = []
                for alignment in blast_record.alignments:
                    # select the best match
                    for match in alignment.hsps:
                        alleleMatchid = int(
                            str(blast_record.query_id).split("_")[1])
                        scoreRatio = (float(match.score) /
                                      float(allelescores[alleleMatchid - 1]))
                        cdsStrName = alignment.title.split(" ")[1]
                        DNAstr = listOfCDS[">" + cdsStrName]
                        AlleleDNAstr = alleleList[alleleMatchid - 1]

                        # Compare the CDS DNA with the allele DNA on both
                        # strands (proteins may be equal while the DNA
                        # differs).  The former test `a == b is False` was a
                        # chained comparison that could never be true, so
                        # every hit was flagged as an exact DNA match.
                        compare = bool(DNAstr == AlleleDNAstr)
                        if not compare:
                            try:
                                compare = bool(
                                    reverseComplement(DNAstr) == AlleleDNAstr)
                            except Exception:
                                # reverse complement not computable: treat
                                # the sequences as different
                                compare = False

                        if scoreRatio > 0.6:
                            locationcontigs.append(cdsStrName)

                        candidate = [match.score, scoreRatio, compare,
                                     cdsStrName, alleleMatchid, match,
                                     len(AlleleDNAstr)]
                        if "N" in DNAstr or "K" in DNAstr or "R" in DNAstr:
                            # CDS with ambiguous bases are never selected
                            pass
                        elif scoreRatio == 1 and (bestmatch[2] is False or
                                                  match.score > bestmatch[0]):
                            # Full-score hit: taken when no exact hit is held
                            # yet or when it out-scores the current one.
                            bestmatch = candidate
                        elif (match.score > bestmatch[0] and
                              scoreRatio > 0.6 and
                              scoreRatio > bestmatch[1] and
                              bestmatch[2] is False):
                            candidate[2] = False
                            bestmatch = candidate

            print("Classifying the match at : " +
                  time.strftime("%H:%M:%S-%d/%m/%Y"))

            # NOTE(review): AlleleDNAstr is the allele of the last HSP
            # examined above, not necessarily the best match's allele.
            if (bestmatch[0] == 0 or "N" in AlleleDNAstr or
                    "K" in AlleleDNAstr or "R" in AlleleDNAstr):
                ###################
                # LOCUS NOT FOUND #
                ###################
                if bestmatch[0] == 0:
                    resultsList.append('LNF3:-1')
                    perfectMatchIdAllele.append('LNF')
                    perfectMatchIdAllele2.append('LNF')
                    print("Locus not found, no matches \n")
                else:
                    resultsList.append('LNFN:-1')
                    perfectMatchIdAllele.append('LNF')
                    perfectMatchIdAllele2.append('LNF')
                    print("Locus has strange base (N, K or R) \n")

            # more than one CDS with BSR > 0.6 (only the last BLAST record's
            # hits are considered here, as before)
            elif len(list(set(locationcontigs))) > 1:
                resultsList.append('NIPL')
                perfectMatchIdAllele.append('NIPL')
                perfectMatchIdAllele2.append('NIPL')
                for elem in locationcontigs:
                    print(elem)

            # the CDS DNA equals the DNA of the matched allele
            elif bestmatch[2] is True:
                # CDS names look like "<contig>&<x>&<start>-<end>"
                nameParts = bestmatch[3].split("&")
                contigname = nameParts[0]
                matchLocation = nameParts[2]
                print(contigname)

                alleleStr = listOfCDS[">" + bestmatch[3]]
                protSeq, alleleStr, Reversed = translateSeq(alleleStr)

                # check for possible locus on tip
                match = bestmatch[5]
                matchLocation2 = matchLocation.split("-")
                bestMatchContigLen = len(currentGenomeDict[contigname])
                rightmatchContig = bestMatchContigLen - int(matchLocation2[1])
                leftmatchContig = int(matchLocation2[0])
                if Reversed:
                    rightmatchContig, leftmatchContig = (leftmatchContig,
                                                         rightmatchContig)

                # extra space the modal allele would need beyond the match
                possibleExtra = int(moda) - ((int(match.query_end) * 3) -
                                             (int(match.query_start) * 3))
                strand = "-" if Reversed else "+"
                exactLocation = (str(contigname) + "&" + str(matchLocation) +
                                 "&" + strand)

                if possibleExtra < 0:
                    perfectMatchIdAllele.append(str(bestmatch[4]))
                    perfectMatchIdAllele2.append(exactLocation)
                    resultsList.append('EXC:' + str(bestmatch[4]))
                else:
                    rightmatchAllele = possibleExtra
                    leftmatchAllele = possibleExtra
                    if (leftmatchContig < leftmatchAllele and
                            rightmatchContig < rightmatchAllele):
                        tipTag = 'PLOTSC'
                        tipMessage = "Locus is possibly bigger than the contig \n"
                    elif leftmatchContig < leftmatchAllele:
                        tipTag = 'PLOT3'
                        tipMessage = "Locus is possibly on the 3' tip of the contig \n"
                    elif rightmatchContig < rightmatchAllele:
                        tipTag = 'PLOT5'
                        tipMessage = "Locus is possibly on the 5' tip of the contig \n"
                    else:
                        tipTag = None

                    if tipTag is None:
                        ################################################
                        # EXACT MATCH --- MATCH == GENE --- GENE FOUND #
                        ################################################
                        perfectMatchIdAllele.append(str(bestmatch[4]))
                        perfectMatchIdAllele2.append(exactLocation)
                        resultsList.append('EXC:' + str(bestmatch[4]))
                    else:
                        resultsList.append(tipTag + ':-1')
                        perfectMatchIdAllele.append(tipTag)
                        perfectMatchIdAllele2.append(tipTag)
                        print(match)
                        print("contig extras (l,r)")
                        print("%s %s" % (leftmatchContig, rightmatchContig))
                        print("allele extras (l,r)")
                        print("%s %s" % (leftmatchAllele, rightmatchAllele))
                        print(tipMessage)

            # match with BSR > 0.6 whose DNA differs from the allele
            else:
                match = bestmatch[5]
                geneLen = bestmatch[6]
                nameParts = bestmatch[3].split("&")
                contigname = nameParts[0]
                matchLocation = nameParts[2].split("-")
                bestMatchContigLen = len(currentGenomeDict[contigname])

                alleleStr = listOfCDS[">" + bestmatch[3]]
                protSeq, alleleStr, Reversed = translateSeq(alleleStr)

                rightmatchContig = bestMatchContigLen - int(matchLocation[1])
                leftmatchContig = int(matchLocation[0])
                if Reversed:
                    rightmatchContig, leftmatchContig = (leftmatchContig,
                                                         rightmatchContig)
                print("%s %s" % (rightmatchContig, leftmatchContig))

                # allele bases left unaligned on each side of the match
                rightmatchAllele = geneLen - ((int(match.query_end) + 1) * 3)
                leftmatchAllele = (int(match.query_start) - 1) * 3

                ###########################
                # LOCUS ON THE CONTIG TIP #
                ###########################
                if (leftmatchContig < leftmatchAllele and
                        rightmatchContig < rightmatchAllele):
                    tipTag = 'LOTSC'
                    tipMessage = "Locus is bigger than the contig \n"
                elif leftmatchContig < leftmatchAllele:
                    tipTag = 'LOT3'
                    tipMessage = "Locus is on the 3' tip of the contig \n"
                elif rightmatchContig < rightmatchAllele:
                    tipTag = 'LOT5'
                    tipMessage = "Locus is on the 5' tip of the contig \n"
                else:
                    tipTag = None

                if tipTag is not None:
                    resultsList.append(tipTag + ':-1')
                    perfectMatchIdAllele.append(tipTag)
                    perfectMatchIdAllele2.append(tipTag)
                    print(match)
                    print(contigname)
                    print(geneFile)
                    print("%s %s" % (leftmatchAllele, rightmatchAllele))
                    print(tipMessage)
                elif len(alleleStr) > moda + (moda * 0.2):
                    print(moda)
                    print(alleleStr)
                    resultsList.append('ALM')
                    perfectMatchIdAllele.append('ALM')
                    perfectMatchIdAllele2.append('ALM')
                elif len(alleleStr) < moda - (moda * 0.2):
                    print(moda)
                    print(alleleStr)
                    resultsList.append('ASM')
                    perfectMatchIdAllele.append('ASM')
                    perfectMatchIdAllele2.append('ASM')
                else:
                    #######################
                    # ADD INFERRED ALLELE #
                    #######################
                    tagAux = 'INF'
                    newAlleleId = alleleI + 1
                    strand = "-" if Reversed else "+"
                    perfectMatchIdAllele.append(tagAux + "-" + str(newAlleleId))
                    perfectMatchIdAllele2.append(
                        str(contigname) + "&" + str(matchLocation[0]) + "-" +
                        str(matchLocation[1]) + "&" + strand)
                    print("New allele! Adding allele " + tagAux +
                          str(newAlleleId) + " to the database\n")
                    resultsList.append(tagAux + str(newAlleleId))

                    # --- add the new allele to the gene fasta --- #
                    # NOTE(review): tagAux[:-1] yields "IN" here (the slice
                    # looks copied from code whose tags end in ":").  Kept
                    # unchanged so existing FASTA headers stay consistent.
                    appendAllele = ('>allele_' + str(newAlleleId) + '_' +
                                    tagAux[:-1] + "_" + genomeName + '\n')
                    with open(geneFile, 'a') as fG:
                        fG.write(appendAllele)
                        fG.write(alleleStr + '\n')

                    proteinRecord = ('>' + str(newAlleleId) + '\n' +
                                     str(protSeq) + '\n')
                    with open(newProteinPath, 'w') as fG:
                        fG.write(proteinRecord)
                    with open(allProteinPath, 'a') as fG:
                        fG.write(proteinRecord)

                    # --- remake blast DB and recalculate the BSR for the locus --- #
                    alleleList.append(alleleStr)
                    print(allProteinPath)
                    Gene_Blast_DB_name2 = Create_Blastdb(newProteinPath, 1, True)
                    print("Re-calculating BSR at : " +
                          time.strftime("%H:%M:%S-%d/%m/%Y"))
                    alleleI, allelescores, alleleList = reDogetBlastScoreRatios(
                        newProteinPath, basepath, alleleI, allelescores,
                        Gene_Blast_DB_name2, alleleList, geneScorePickle)
                    print("allele id " + str(alleleI))
                    print("Done Re-calculating BSR at : " +
                          time.strftime("%H:%M:%S-%d/%m/%Y"))

        except Exception as e:
            # Best effort: a failure on one genome is recorded and the run
            # continues with the next genome.
            print("some error occurred")
            print(e)
            print('Error on line {}'.format(sys.exc_info()[-1].tb_lineno))
            perfectMatchIdAllele2.append("ERROR")
            perfectMatchIdAllele.append("ERROR")
            resultsList.append('ERROR')

    final = (resultsList, perfectMatchIdAllele)
    print("Finished allele calling at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))
    filepath = os.path.join(temppath, os.path.basename(geneFile) + "_result.txt")
    filepath2 = os.path.join(temppath, os.path.basename(geneFile) + "_result2.txt")
    with open(filepath, 'wb') as f:
        pickle.dump(final, f)
    with open(filepath2, 'wb') as f:
        pickle.dump(perfectMatchIdAllele2, f)
    shutil.rmtree(basepath)
    return True
def main():
    """Count the alleles of two gene lists that fall on the same locus.

    Command line: ``-i <1st list of gene files> -g <2nd list of gene files>``.
    Both lists are concatenated (``concat_genes``) into ``concat1.fasta`` and
    ``concat2.fasta``; a BLAST database is built from the first and the second
    is BLASTed (blastn) against it.  For each BLAST record the first alignment
    is passed to ``alignHasGoodMatch`` and three counters are updated, which
    are printed at the end.

    Side effects: creates ``./sameLocus`` if missing, and finally removes both
    concatenated FASTA files and ``./blastdbs``.
    """
    parser = argparse.ArgumentParser(description="Given two list of genes, creates a folder with paired files when located on the same locus")
    parser.add_argument('-i', nargs='?', type=str, help='1st list of genes files to compare', required=True)
    parser.add_argument('-g', nargs='?', type=str, help='2nd list of genes files to compare', required=True)
    args = parser.parse_args()
    geneFiles1 = args.i
    geneFiles2 = args.g
    name1="concat1.fasta"
    name2="concat2.fasta"
    concat_genes(geneFiles1, name1)
    concat_genes(geneFiles2, name2)
    # allele sequence -> allele name, for every record of the first list
    # (a repeated sequence keeps the name of its last occurrence)
    geneDict={}
    gene_fp = HTSeq.FastaReader(name1)
    alleleI=0
    for allele in gene_fp:
        geneDict[ allele.seq ] = allele.name
        alleleI += 1
    gene_fp = HTSeq.FastaReader(name1)
    geneFile = os.path.abspath( name1 )
    Gene_Blast_DB_name = Create_Blastdb( geneFile, 1 , False)
    geneF = os.path.splitext( geneFile )[0]
    blast_out_file = geneF + '.xml'
    # list of results - the output of the function
    resultsList = []
    # ------------------------------ RUNNING BLAST ------------------------------ #
    cline = NcbiblastnCommandline(query=name2, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5)
    blast_records = runBlastParser(cline, blast_out_file, name2)
    samelocus=0
    alreadyUsed=[]
    nomatch=0
    small=0
    if not os.path.exists("./sameLocus"):
        os.makedirs("./sameLocus")
    LocusID=0
    for blast_record in blast_records:
        try:
            # Probe only: raises IndexError when the record has fewer than two
            # alignments, which diverts to the bare ``except`` branch below.
            alignment=blast_record.alignments[1]
            try:
                i=0
                align=blast_record.alignments[i]
                # NOTE(review): ``align`` is never re-assigned inside the loop,
                # so every iteration inspects alignments[0]; the ``i+=999``
                # jumps act as a ``break``.
                while i<len(blast_record.alignments):
                    if align.hit_def:
                        result,allelename2,LocusID,alreadyUsed=alignHasGoodMatch(align,geneDict,LocusID,blast_record,alreadyUsed)
                        if result>0 and allelename2:
                            samelocus+=result
                            i+=999
                        else:
                            small+=1
                            i+=999
                            alreadyUsed.append(allelename2)
                    # NOTE(review): ``allelename`` is only bound by earlier
                    # iterations/records; if this test is reached before that
                    # it raises NameError, which the handler below prints.
                    elif allelename :
                        result,allelename,LocusID,alreadyUsed=alignHasGoodMatch(align,geneDict,LocusID,blast_record,alreadyUsed)
                        if result>0:
                            samelocus+=result
                            i+=999
                        else:
                            small+=1
                            i+=999
                    else :
                        nomatch+=1
                    i+=1
            except Exception as e:
                print e
                pass
        except:
            # Fewer than two alignments: evaluate the single alignment, or
            # count the record as unmatched when there is none (or when the
            # evaluation itself fails).
            try:
                alignment=blast_record.alignments[0]
                result,allelename,LocusID,alreadyUsed=alignHasGoodMatch(alignment,geneDict,LocusID,blast_record,alreadyUsed)
                if result>0 and allelename:
                    samelocus+=result
                else :
                    small+=1
            except:
                nomatch+=1
    print "%s are within same locus, %s had no match and %s had a bigger than 0.2 ratio size difference or less than 0.8 similarity ratio" % (samelocus,nomatch, small)
    os.remove(name1)
    os.remove(name2)
    shutil.rmtree('./blastdbs')
def main():
    """Call the allele of one locus in each genome using nucleotide BLAST.

    Command line: ``<script> <input pickle> <temp dir>``.  The input pickle
    holds ``[geneFile, genomesList]``.  A BLAST database is built from the
    locus FASTA and each genome is BLASTed (blastn) against it.  The best
    HSP is then classified and one entry per genome is appended to the
    result lists:

    * ``LNF3:-1`` - no usable hit;
    * ``EXC:<n>`` - hit identical to a known allele over its whole length;
    * ``LOT5:-1`` / ``LOT3:-1`` / ``LOTSC:-1`` - locus on a contig tip or
      bigger than the contig;
    * ``LNFN:-1`` - matched region contains ``N``;
    * ``EXC2:<n>`` / ``INF1:``..``INF7:<n>`` - CDS returned by ``extendCDS``
      is a known / new allele (new alleles are appended to ``geneFile`` and
      the BLAST database is rebuilt);
    * ``LNF2`` / ``LNF99`` - no CDS found / error while handling the CDS.

    Side effects: writes ``<temp dir>/<gene>_result.txt`` (pickle of
    ``(resultsList, perfectMatchIdAllele)``) and may append to
    ``<gene>LNFN.fasta``, ``<gene>LNF2.fasta`` and ``<gene>LNF99.fasta``.

    Returns:
        True once the result pickle is written.
    """
    try:
        input_file = sys.argv[1]
        temppath = sys.argv[2]
    except IndexError:
        print("usage: list_pickle_obj")
        # Without both arguments nothing below can run; previously execution
        # continued and died with a NameError on input_file.
        sys.exit(1)

    # NOTE: pickle can execute code on load; only feed files written by the
    # pipeline itself.
    with open(input_file, 'rb') as f:
        argumentList = pickle.load(f)
    geneFile = argumentList[0]
    genomesList = argumentList[1]
    basepath = temppath

    geneDict = {}              # allele DNA sequence -> 1-based allele number
    orderedAlleleNames = []    # allele names, position == allele number - 1
    alleleI = 1
    biggestAllelelen = 0
    smallestAllelelen = 99999
    for allele in HTSeq.FastaReader(geneFile):
        if allele.seq in geneDict:
            print("\nWARNING: this file contains a repeated allele, it should be checked. Ignoring it now!\n" + str(geneFile))
        else:
            if len(allele.seq) > biggestAllelelen:
                biggestAllelelen = len(allele.seq)
            if len(allele.seq) < smallestAllelelen:
                smallestAllelelen = len(allele.seq)
            orderedAlleleNames.append(allele.name)
            geneDict[allele.seq] = alleleI
            alleleI += 1

    # --- make 1st blast DB --- #
    Gene_Blast_DB_name = Create_Blastdb(geneFile, 1, False)
    geneF = os.path.basename(geneFile)
    blast_out_file = os.path.dirname(geneFile) + "/blastdbs/" + geneF + '.xml'

    # CDS type reported by extendCDS -> tag of the inferred allele
    tagByCDSType = {
        'stop codon in match end': 'INF1:',
        'start codon in match beggining': 'INF2:',
        'bigger than match': 'INF3:',
        'same size as match': 'INF4:',
        'cds inside match': 'INF5:',
        'start codon inside match': 'INF6:',
    }

    # list of results - the output of the function
    resultsList = []
    perfectMatchIdAllele = []
    # NOTE(review): contig sequences accumulate across genomes in this one
    # dict, which is handed to extendCDS -- confirm whether that is intended.
    genomeDict = {}

    for genomeFile in genomesList:
        # Strip the trailing newline before the path is used (it used to be
        # removed only after the ORF pickle and the FASTA had been opened).
        if genomeFile.endswith('\n'):
            genomeFile = genomeFile[:-1]

        filepath = os.path.join(
            basepath, str(os.path.basename(genomeFile)) + "_ORF.txt")
        with open(filepath, 'rb') as f:
            currentCDSDict = pickle.load(f)

        for contig in HTSeq.FastaReader(genomeFile):
            genomeDict[contig.name] = str(contig.seq)
        currentGenomeDict = genomeDict

        # ------------------------------ RUNNING BLAST ------------------------------ #
        cline = NcbiblastnCommandline(query=genomeFile,
                                      db=Gene_Blast_DB_name,
                                      evalue=0.001,
                                      out=blast_out_file,
                                      outfmt=5)
        blast_records = runBlastParser(cline, blast_out_file, genomeFile)
        print("Finished Blast at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))

        # ------ DETERMINING BEST MATCH ------ #
        # Placeholder [query, alignment, score ratio]; replaced by the
        # six-item [query, hsp, score ratio, hit_def, length ratio, allele
        # length] as soon as an HSP is selected.
        bestMatch = ['', '', 0]
        bestMatchContig = ''
        bestMatchContigLen = ''
        perfectMatch = False
        bmAlleleLen2 = 0
        bmAllele = ''

        for blast_record in blast_records:
            # --- the LNF cases are now called outside the loop --- #
            if perfectMatch == True:
                break
            try:
                hspC = blast_record.alignments[0]
                if bestMatch[0] == '' and bestMatch[1] == '':
                    bestMatch[0] = blast_record.query
                    bestMatch[1] = hspC
            except IndexError:
                # record without alignments
                continue

            # --- the contig tag (query text up to the first space) is used
            # --- by the CDS extension step
            contigTag = blast_record.query.split(' ', 1)[0]

            # --- iterating over all the results to determine the best match --- #
            for alignment in blast_record.alignments:
                index = orderedAlleleNames.index(alignment.hit_def)
                for k, v in geneDict.iteritems():
                    if v == index + 1:
                        bmAlleleLen2 = len(k)
                if perfectMatch:
                    break

                for match in alignment.hsps:
                    scoreRatio = float(match.score) / float(bmAlleleLen2)

                    # identities equal the allele length and the aligned
                    # subject length, and no N/K/R in the query
                    if (int(match.identities) == int(bmAlleleLen2) and
                            int(match.identities) == int(len(match.sbjct)) and
                            "N" not in match.query and
                            "K" not in match.query and
                            "R" not in match.query):
                        index = orderedAlleleNames.index(alignment.hit_def)
                        for seq, alleleid in geneDict.iteritems():
                            if alleleid == index + 1:
                                bmAllele = seq
                                break
                        bmAlleleLen = len(bmAllele)
                        lenratio = float(len(match.query)) / float(bmAlleleLen)
                        bestMatch = [blast_record.query, match, scoreRatio,
                                     alignment.hit_def, lenratio, bmAlleleLen]
                        bestMatchContig = contigTag
                        perfectMatch = True
                        break

                    # choose the match with the best score / allele length
                    elif scoreRatio > bestMatch[2]:
                        index = orderedAlleleNames.index(alignment.hit_def)
                        for seq, alleleid in geneDict.iteritems():
                            if alleleid == index + 1:
                                bmAllele = seq
                                break
                        bmAlleleLen = len(bmAllele)
                        lenratio = float(len(match.query)) / float(bmAlleleLen)
                        bestMatch = [blast_record.query, match, scoreRatio,
                                     alignment.hit_def, lenratio, bmAlleleLen]
                        bestMatchContig = contigTag
                        bestMatchContigLen = blast_record.query_letters

                if perfectMatch == True:
                    break

        # ---------- ALLELE CALLING AFTER DETERMINING BEST MATCH ---------- #
        print("Finished choosing best match at : " +
              time.strftime("%H:%M:%S-%d/%m/%Y"))
        try:
            match = bestMatch[1]
            # IndexError here means bestMatch is still the three-item
            # placeholder, i.e. no HSP was ever selected.
            geneLen = bestMatch[5]
            alleleStr = match.query
        except Exception:
            ###################
            # LOCUS NOT FOUND #
            ###################
            resultsList.append('LNF3:-1')
            perfectMatchIdAllele.append('LNF')
            printinfo(genomeFile, geneFile)
            print("Locus not found, no matches \n")
            continue

        if perfectMatch is True:
            if match.sbjct_start > match.sbjct_end:
                # reverse the order if needed
                alleleStr = reverseComplement(alleleStr)
            alleleNumber = geneDict[alleleStr]

            ################################################
            # EXACT MATCH --- MATCH == GENE --- GENE FOUND #
            ################################################
            if "_" in bestMatch[3]:
                a = bestMatch[3].split("_")
                perfectMatchIdAllele.append(a[1])
            else:
                perfectMatchIdAllele.append(bestMatch[3])
            resultsList.append('EXC:' + str(alleleNumber))

        elif bestMatch[0] != '' and perfectMatch is not True:
            # a best match was found but it is not an exact match

            ###########################
            # LOCUS ON THE CONTIG TIP #
            ###########################
            if match.query_start == 1 and len(match.query) < geneLen:
                resultsList.append('LOT5:-1')
                perfectMatchIdAllele.append('LOT5')
                printinfo(genomeFile, geneFile)
                print("Locus is on the 5' tip of the contig \n")

            elif (match.query_end == bestMatchContigLen and
                  len(match.query) < geneLen):
                resultsList.append('LOT3:-1')
                perfectMatchIdAllele.append('LOT3')
                printinfo(genomeFile, geneFile)
                print("Locus is on the 3' tip of the contig \n")

            elif bestMatchContigLen <= geneLen:
                resultsList.append('LOTSC:-1')
                perfectMatchIdAllele.append('LOTSC')
                printinfo(genomeFile, geneFile)
                print("Locus is bigger than the contig \n")

            elif 'N' in alleleStr:
                #####################
                # ALLELE NOT FOUND  #
                # N base found!     #
                #####################
                geneFile2 = os.path.splitext(geneFile)[0] + "LNFN.fasta"
                print(geneFile2)
                with open(geneFile2, 'a') as f:
                    f.write(">" + (str(os.path.basename(genomeFile))) + "|" +
                            (str(os.path.basename(geneFile))) + "\n")
                    f.write((alleleStr) + "\n")
                resultsList.append('LNFN:-1')
                perfectMatchIdAllele.append('LNFN')
                printinfo(genomeFile, geneFile)
                print("LNFN, contains N bases! \n")

            else:
                # --------------------------------------------- #
                #       USING PRODIGAL TO TRY TO EXTEND CDS       #
                # --------------------------------------------- #
                sizeratio = 0.2
                ORFFoundInMatch, strCDS, CDSType = extendCDS(
                    bestMatchContig, currentCDSDict, match.query_start,
                    match.query_end, currentGenomeDict, biggestAllelelen,
                    smallestAllelelen, sizeratio)
                print("Finished extension at : " +
                      time.strftime("%H:%M:%S-%d/%m/%Y"))

                try:
                    if ORFFoundInMatch:
                        # --- it was possible to extend it using prodigal --- #
                        alleleStr = strCDS
                        if match.sbjct_start > match.sbjct_end:
                            # reverse the order if needed
                            alleleStr = reverseComplement(alleleStr)

                        if alleleStr in geneDict:
                            # the ORF found is already a defined allele
                            alleleNumber = geneDict[alleleStr]
                            ################################################
                            # EXACT MATCH --- MATCH == GENE --- GENE FOUND #
                            ################################################
                            perfectMatchIdAllele.append(alleleNumber)
                            resultsList.append('EXC2:' + str(alleleNumber))
                        else:
                            #######################
                            # ADD INFERRED ALLELE #
                            # a new allele that was extended with prodigal
                            #######################
                            tagAux = tagByCDSType.get(CDSType, 'INF7:')
                            print("infered allele has location : " + (CDSType))
                            printinfo(genomeFile, geneFile)
                            perfectMatchIdAllele.append(
                                tagAux + "-" + str(alleleI))
                            print("New allele Infered with prodigal! Adding allele " + tagAux + str(alleleI) + " to the database\n")
                            geneDict[alleleStr] = alleleI
                            resultsList.append(tagAux + str(alleleI))
                            newAlleleName = ('allele_' + str(alleleI) + '_' +
                                             tagAux[:-1] + "_" +
                                             str(os.path.basename(genomeFile)))
                            orderedAlleleNames.append(newAlleleName)

                            # --- add the new allele to the gene fasta --- #
                            with open(geneFile, 'a') as fG:
                                fG.write('>' + newAlleleName + '\n')
                                fG.write(alleleStr + '\n')
                            alleleI += 1

                            # --- remake blast DB --- #
                            Gene_Blast_DB_name = Create_Blastdb(
                                geneFile, 1, False)
                    else:
                        ##################
                        # CDS NOT FOUND  #
                        ##################
                        geneFile2 = os.path.splitext(geneFile)[0] + "LNF2.fasta"
                        print(geneFile2)
                        with open(geneFile2, 'a') as f:
                            f.write(">" + (str(os.path.basename(genomeFile))) +
                                    "|" + (str(os.path.basename(geneFile))) +
                                    " | " + str(bestMatchContig) + "\n")
                            f.write((alleleStr) + "\n")
                            f.write(">Allele\n")
                            f.write((bmAllele) + "\n")
                        resultsList.append('LNF2')
                        printinfo(genomeFile, geneFile)
                        perfectMatchIdAllele.append("LNF2")
                        print("CDS not found")

                except Exception:
                    # Best effort: dump what was being processed and record
                    # the locus as LNF99 for this genome.
                    if ORFFoundInMatch:
                        alleleStr = strCDS
                    geneFile2 = os.path.splitext(geneFile)[0] + "LNF99.fasta"
                    print(geneFile2)
                    with open(geneFile2, 'a') as f:
                        f.write(">" + (str(os.path.basename(genomeFile))) +
                                "|" + (str(os.path.basename(geneFile))) +
                                " | " + str(bestMatchContig) + "\n")
                        f.write((alleleStr) + "\n")
                        f.write(">Allele\n")
                        f.write((bmAllele) + "\n")
                    resultsList.append('LNF99')
                    printinfo(genomeFile, geneFile)
                    perfectMatchIdAllele.append("LNF99")
                    print("A problem occurred")

    final = (resultsList, perfectMatchIdAllele)
    print("Finished allele calling at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))
    filepath = os.path.join(basepath, os.path.basename(geneFile) + "_result.txt")
    with open(filepath, 'wb') as f:
        pickle.dump(final, f)
    return True
def callAlleles(argumentList): geneFile = argumentList[0] genomesList = argumentList[1] listOfGenomesDict = argumentList[2] gene_fp = HTSeq.FastaReader(geneFile) geneDict = {} alleleI = 1 orderedAlleleNames=[] biggestAllelelen=0 smallestAllelelen=99999 for allele in gene_fp: if allele.seq in geneDict: print "\nWARNING: this file contains a repeated allele, it should be checked. Ignoring it now!\n", geneFile else: if len(allele.seq)>biggestAllelelen: biggestAllelelen=len(allele.seq) if len(allele.seq)<smallestAllelelen: smallestAllelelen=len(allele.seq) orderedAlleleNames.append(allele.name) geneDict[ allele.seq ] = alleleI alleleI += 1 #print geneDict #print orderedAlleleNames # --- make 1st blast DB --- # Gene_Blast_DB_name = Create_Blastdb( geneFile, 1 ) geneF = os.path.splitext( geneFile )[0] blast_out_file = geneF + '.xml' # list of results - the output of the function resultsList = [] i = 0 perfectMatchIdAllele=[] for genomeFile in genomesList: #print geneDict currentGenomeDict = listOfGenomesDict[i] #print genomeFile #print resultsList #print geneDict #print orderedAlleleNames i+=1 # it has to be incremented here if genomeFile[-1] == '\n': genomeFile = genomeFile[:-1] # ------------------------------ RUNNING BLAST ------------------------------ # cline = NcbiblastnCommandline(query=genomeFile, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5) blast_records = runBlastParser(cline, blast_out_file, genomeFile) # ------ DETERMINING BEST MATCH ------ # # bestMatch = ['rec.query','hsp', lenRatio] bestMatch = ['','', 0] bestMatchContig='' bestMatchContigLen='' bestalignlen=0 perfectMatch=False bmAlleleLen2=0 bmAllele='' #noAlignment=False for blast_record in blast_records: # --- the LNF cases are now called outside de loop --- # #print blast_record if perfectMatch==True: break try: #print blast_record.alignments hspC = blast_record.alignments[0] if bestMatch[0] == '' and bestMatch[1] == '': bestMatch[0] = blast_record.query bestMatch[1] = hspC 
except IndexError: continue # --- the contig tag is used in the progigal function --- # contigTag = blast_record.query # --- brute force parsing of the contig tag - better solution is advisable --- # j=0 for l in contigTag: if l == ' ': break j+=1 contigTag = contigTag[:j] contigLen = blast_record.query_letters #print blast_record.query_id # --- iterating over all the results to determine the best match --- # for alignment in blast_record.alignments: index=orderedAlleleNames.index(alignment.hit_def) #print alignment.hit_def for k, v in geneDict.iteritems(): if v == index+1: bmAlleleLen2= len(k) if perfectMatch: break for match in alignment.hsps: #print match scoreRatio = float(match.score) / float(bmAlleleLen2) #print alignment.hit_def #print match.identities #print bmAlleleLen2 #print #print match.identities #print len(match.sbjct) #if #identities is the same as the length of the allele and it has no gaps or N's if (int(match.identities)==int(bmAlleleLen2) and int(match.identities)==int(len(match.sbjct)) and "N" not in match.query ): index=orderedAlleleNames.index(alignment.hit_def) for seq, alleleid in geneDict.iteritems(): if alleleid == index+1: bmAllele=seq break bmAlleleLen= len(bmAllele) lenratio=float(len(match.query))/float(bmAlleleLen) bestMatch = [blast_record.query, match, scoreRatio, alignment.hit_def,lenratio,bmAlleleLen] bestMatchContig=contigTag perfectMatch=True index=orderedAlleleNames.index(alignment.hit_def) bmAlleleLen= len(geneDict.keys()[index]) break #choose the match with the best score ratio score/length of allele elif scoreRatio > bestMatch[2]: index=orderedAlleleNames.index(alignment.hit_def) #print orderedAlleleNames #print geneDict #print orderedAlleleNames #print alignment.hit_def #print index #print geneDict for seq, alleleid in geneDict.iteritems(): if alleleid == index+1: bmAllele=seq break bmAlleleLen= len(bmAllele) #print bmAllele lenratio=float(len(match.query))/float(bmAlleleLen) bestMatch = [blast_record.query, match, 
scoreRatio, alignment.hit_def,lenratio,bmAlleleLen] bestMatchContig=contigTag bestMatchContigLen=blast_record.query_letters #print match.query bestalignlen=alignment.length if perfectMatch==True: break # ---------- ALLELE CALLING AFTER DETERMINING BEST MATCH ---------- # try: #print bestMatch[0] match = bestMatch[1] #print match #print match.sbjct geneLen = bestMatch[5] alleleStr = match.query nIdentities = match.identities idPercent = float(nIdentities) / float(geneLen) scoreRatio = bestMatch[2] lenRatio = bestMatch[4] #print perfectMatch #print "\nContig best/exact match is :" #print bestMatchContig +"\n" except: #if no best match was found ################### # LOCUS NOT FOUND # ################### resultsList.append('LNF:-1') # append result to the list of results perfectMatchIdAllele.append('LNF') printinfo(genomeFile,geneFile) print "Locus not found \n" continue if perfectMatch is True: #if a perfect match was found if match.sbjct_start > match.sbjct_end: #reverse the order if needed alleleStr = reverseComplement(alleleStr) alleleNumber = geneDict[ alleleStr ] ################################################ # EXACT MATCH --- MATCH == GENE --- GENE FOUND # ################################################ if "_" in bestMatch[3]: a=bestMatch[3].split("_") perfectMatchIdAllele.append(a[1]) else: perfectMatchIdAllele.append(bestMatch[3]) resultsList.append('EXC:' + str(alleleNumber) ) elif bestMatch[0] != '' and perfectMatch is not True: #if a best match was found but it's not an exact match ########################### # LOCUS ON THE CONTIG TIP # ########################### if match.query_start ==1 and len(match.query) < geneLen: resultsList.append('LOT5:-1') perfectMatchIdAllele.append('LOT5') printinfo(genomeFile,geneFile) print "Locus is on the 5' tip of the contig \n" elif match.query_end == bestMatchContigLen and len(match.query) < bestMatchContigLen: resultsList.append('LOT3:-1') perfectMatchIdAllele.append('LOT3') printinfo(genomeFile,geneFile) print 
"Locus is on the 3' tip of the contig \n" elif bestMatchContigLen <= geneLen: resultsList.append('LOTSC:-1') perfectMatchIdAllele.append('LOTSC') printinfo(genomeFile,geneFile) #print match.query_start print "Locus is bigger than the contig \n" elif 'N' in alleleStr: #TODO gravar para ficheiro ##################### # ALLELE NOT FOUND # # N base found! ##################### geneFile2= os.path.splitext(geneFile)[0] + "LNFN.fasta" print geneFile2 with open(geneFile2, 'a') as f: f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+"\n") f.write((alleleStr) +"\n") resultsList.append('LNFN:-1') perfectMatchIdAllele.append('LNFN') printinfo(genomeFile,geneFile) print "LNFN, contains N bases! \n" else: #removing gaps alleleStr = alleleStr.replace('-', '') #lenExtraThresh=int(biggestAllelelen*0.1) lenExtraThresh=50 #print alleleStr # --- it might be needed to obtain the reverse complement of the allele string --- # if match.sbjct_start > match.sbjct_end: alleleStr = reverseComplement(alleleStr) #if alleleStr in geneDict: #if best match without gaps is already defined, example: best match allele was already defined but without gaps it's equal to a NA added # alleleNumber = geneDict[ alleleStr ] ################################################ # EXACT MATCH --- MATCH == GENE --- GENE FOUND # ################################################ # perfectMatchIdAllele.append("EXC2-"+str(alleleNumber)) # resultsList.append('EXC2:' + str(alleleNumber) ) #else: #check if best match without gaps are contained inside an already defined allele isContainedDefinedAllele = False #print geneDict.keys()[0] definedAllele='' definedAlleleName='' for k in geneDict.keys(): if alleleStr in k: definedAllele=k #print alleleStr isContainedDefinedAllele = True definedAlleleName=geneDict.get(k) break if isContainedDefinedAllele and int(len(match.query))<=int(len(definedAllele))+lenExtraThresh and int(len(match.query))>=int(len(definedAllele))-lenExtraThresh : #allele 
without gaps is contained in a defined allele #best match with gaps has same size +1/-1 base as the defined allele #print int(len(definedAllele)), int(len(match.sbjct)) if int(len(alleleStr))==int(len(definedAllele)): # if match without gaps has same size as the defined allele tagAux = 'NA1:' printinfo(genomeFile,geneFile) perfectMatchIdAllele.append("NA1-"+str(alleleI)) elif int(len(alleleStr))==int(len(definedAllele))-1 : # if match without gaps has minus one base than the defined allele tagAux = 'NA2:' printinfo(genomeFile,geneFile) perfectMatchIdAllele.append("NA2-"+str(alleleI)) #elif int(len(alleleStr))==int(len(definedAllele))+1 : # if match without gaps has plus one base than the defined allele # tagAux = 'NA3:' # printinfo(genomeFile,geneFile) # perfectMatchIdAllele.append("NA3-"+str(alleleI)) else: # if match without gaps has more than one base missing comparing to the defined allele tagAux = 'NA4:' printinfo(genomeFile,geneFile) perfectMatchIdAllele.append("NA4-"+str(alleleI)) #TODO catch +1 and others print "New allele found! 
Adding allele "+ tagAux + str(alleleI) +" to the database" geneDict[alleleStr] = alleleI resultsList.append( tagAux + str(alleleI) ) orderedAlleleNames.append('allele_' + str(alleleI) + '_' + tagAux[:-1] +'_' + str(os.path.basename(genomeFile))) # --- add the new allele to the gene fasta --- # fG = open( geneFile, 'a' ) fG.write('>allele_' + str(alleleI) + '_' + tagAux[:-1] +'_' + str(os.path.basename(genomeFile)) + '\n') fG.write( alleleStr + '\n') fG.close() alleleI += 1 Gene_Blast_DB_name = Create_Blastdb( geneFile, 1 ) #if best match is not contained in an already defined allele, check if it has similar size with the match allele and has 0.8 similarity elif not isContainedDefinedAllele and idPercent > 0.8 and int(len(match.query))<=int(geneLen)+lenExtraThresh and int(len(match.query))>=int(geneLen)-lenExtraThresh : #best match with gaps has 80% identity #best match with gaps is the same size or +1/-1 as the defined allele ratio=float(len(alleleStr)) / float(geneLen) if ratio>=0.8 and ratio<=1.2: # if match without gaps has same size as the best match allele and 80%similarity tagAux = '' extraleft=0 extraright=0 tS=0 tE=0 #print int(geneLen), len(match.sbjct) #print match.sbjct #print match handle = open(genomeFile, "rU") record_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta")) handle.close() record= record_dict[bestMatchContig] #print match.sbjct #if(int(len(alleleStr))<int(len(match.query)) and int(len(match.query))<int(geneLen)) and int(geneLen)==int(match.sbjct_start): #if best match allele has missing bases, the tips would be cut #if len(match.sbjct)<geneLen and "-" not in match.sbjct: #if the allele is not fully used against the match, compensate the tips if (1<int(match.sbjct_start) and 1<int(match.sbjct_end)): if match.sbjct_start > match.sbjct_end: extraleft=match.sbjct_end-1 else: extraleft=match.sbjct_start-1 if (int(geneLen)>int(match.sbjct_start) and int(geneLen)>int(match.sbjct_end) ): # if 3' tip bases of the allele are missing on the match if 
match.sbjct_start > match.sbjct_end: extraright=geneLen-match.sbjct_start else: extraright=geneLen-match.sbjct_end #print extraleft, extraright if match.sbjct_start > match.sbjct_end: tS=match.query_start-extraright-1 tE=match.query_end+extraleft alleleStr=str(record.seq[tS:tE]) alleleStr = reverseComplement(alleleStr) else: tS=match.query_start-extraleft-1 tE=match.query_end+extraright alleleStr=str(record.seq[tS:tE]) tagAux = 'NA5:' printinfo(genomeFile,geneFile) perfectMatchIdAllele.append("NA5-"+str(alleleI)) print "New allele found! Adding allele "+ tagAux + str(alleleI) +" to the database" geneDict[alleleStr] = alleleI resultsList.append( tagAux + str(alleleI) ) orderedAlleleNames.append('allele_' + str(alleleI) + '_' + tagAux[:-1] +'_' + str(os.path.basename(genomeFile))) # --- add the new allele to the gene fasta --- # fG = open( geneFile, 'a' ) fG.write('>allele_' + str(alleleI) + '_' + tagAux[:-1] +'_' + str(os.path.basename(genomeFile)) + '\n') fG.write( alleleStr + '\n') fG.close() alleleI += 1 Gene_Blast_DB_name = Create_Blastdb( geneFile, 1 ) else: ################## # LNF WTFFF # ################## geneFile2= os.path.splitext(geneFile)[0] + "LNF3.fasta" print geneFile2 with open(geneFile2, 'a') as f: f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n") f.write((alleleStr) +"\n") f.write(">Allele\n") f.write((bmAllele)+"\n") resultsList.append('LNF3') printinfo(genomeFile,geneFile) perfectMatchIdAllele.append("LNF3") print "No allele found" elif isContainedDefinedAllele: #################### # UNDEFINED ALLELE # # it is contained in another allele #################### alleleStr=match.query #if match.sbjct_start > match.sbjct_end: #### - error #alleleStr = reverseComplement(alleleStr) resultsList.append('UND:-1') perfectMatchIdAllele.append("undefined allele") printinfo(genomeFile,geneFile) print "Undefined allele \n" geneFile2= os.path.splitext(geneFile)[0] + "undefined.fasta" print 
geneFile2 with open(geneFile2, 'a') as f: f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n") f.write((alleleStr) +"\n") #f.write(">BlastBestMatch"+str(definedAlleleName)+"\n") #f.write((alleleStr)+"\n") f.write(">Allele"+str(definedAlleleName)+"\n") f.write((definedAllele)+"\n") elif lenRatio < 0.5: ############### # SMALL MATCH # ############### resultsList.append('SAC:-1') # don't know what 'SAC' stands for perfectMatchIdAllele.append('small match') printinfo(genomeFile,geneFile) print "lower than 50% match \n" elif lenRatio < 0.8 and idPercent < 0.5: ##################### # INCOMPLETE ALLELE # # it was not possible to extend it to at least 80% of the length of the gene ##################### resultsList.append('INC:-1') perfectMatchIdAllele.append('allele incomplete') printinfo(genomeFile,geneFile) print "Incomplete allele\n" else: ################## # LNF WTFFF # ################## geneFile2= os.path.splitext(geneFile)[0] + "LNF2.fasta" print geneFile2 with open(geneFile2, 'a') as f: f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n") f.write((alleleStr) +"\n") f.write(">Allele\n") f.write((bmAllele)+"\n") resultsList.append('LNF2') printinfo(genomeFile,geneFile) perfectMatchIdAllele.append("LNF2") print "No allele found" final = (resultsList,perfectMatchIdAllele) #return (resultsList) return final
def main(): try: input_file = sys.argv[1] temppath = sys.argv[2] except IndexError: print "usage: list_pickle_obj" argumentList=[] with open(input_file,'rb') as f: argumentList = pickle.load(f) geneFile = argumentList[0] genomesList = argumentList[1] basepath=temppath+"/"+os.path.basename(geneFile) if not os.path.exists(basepath+"/blastdbs/"): os.makedirs(basepath+"/blastdbs/") gene_fp = HTSeq.FastaReader(geneFile) geneDict = {} alleleI = 1 inverted=False orderedAlleleNames=[] biggestAllelelen=0 smallestAllelelen=99999 for allele in gene_fp: if allele.seq in geneDict: print "\nWARNING: this file contains a repeated allele, it should be checked. Ignoring it now!\n", geneFile else: if len(allele.seq)>biggestAllelelen: biggestAllelelen=len(allele.seq) if len(allele.seq)<smallestAllelelen: smallestAllelelen=len(allele.seq) orderedAlleleNames.append(str(alleleI)) geneDict[ allele.seq ] = alleleI alleleI += 1 # --- make 1st blast DB --- # geneF = os.path.basename(geneFile) blast_out_file = os.path.dirname(geneFile)+"/blastdbs/"+geneF + '.xml' # list of results - the output of the function resultsList = [] i = 0 perfectMatchIdAllele=[] perfectMatchIdAllele2=[] genomeDict = {} genome=-1 print genomesList for genomeFile in genomesList: print "_______________________________________________________" printinfo(genomeFile,geneFile) g_fp = HTSeq.FastaReader( genomeFile ) for contig in g_fp: sequence=str(contig.seq) genomeDict[ contig.name ] = sequence currentGenomeDict = genomeDict genome+=1 print ("Blasting alleles on genome at : "+time.strftime("%H:%M:%S-%d/%m/%Y")) blast_out_file = os.path.join(basepath,"blastdbs/"+os.path.basename(geneFile)+ '_List.xml') Gene_Blast_DB_name = os.path.join(temppath,str(os.path.basename(genomeFile))+"/"+str(os.path.basename(genomeFile))+"_db") cline = NcbiblastnCommandline(query=geneFile, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5) blast_records = runBlastParser(cline, blast_out_file, geneFile) print ("Blasted alleles on 
genome at : "+time.strftime("%H:%M:%S-%d/%m/%Y")) # ------ DETERMINING BEST MATCH ------ # bestMatch = ['','', 0] bestMatchContig='' bestMatchContigLen='' bestalignlen=0 perfectMatch=False bmAlleleLen2=0 bmAllele='' for blast_record in blast_records: if perfectMatch==True: break try: hspC = blast_record.alignments[0] if bestMatch[0] == '' and bestMatch[1] == '': bestMatch[0] = blast_record.query bestMatch[1] = hspC except IndexError: continue # --- the contig tag is used in the progigal function --- # contigTag = blast_record.query j=0 for l in contigTag: if l == ' ': break j+=1 contigTag = contigTag[:j] contigLen = blast_record.query_letters # --- iterating over all the results to determine the best match --- # for alignment in blast_record.alignments: contigTag = alignment.hit_def contigTag=(contigTag.split(" "))[0] index=orderedAlleleNames.index(str(blast_record.query_id).split("_")[1]) #print alignment.hit_def for k, v in geneDict.iteritems(): if v == index+1: bmAlleleLen2= len(k) if perfectMatch: break for match in alignment.hsps: scoreRatio = float(match.score) / float(bmAlleleLen2) #if # of identities is the same as the length of the allele and it has no gaps or N's if (int(match.identities)==int(bmAlleleLen2) and int(match.identities)==int(len(match.query)) and "N" not in match.sbjct and "K" not in match.sbjct and "Y" not in match.sbjct and "R" not in match.sbjct ): index=orderedAlleleNames.index(str(blast_record.query_id).split("_")[1]) for seq, alleleid in geneDict.iteritems(): if alleleid == index+1: bmAllele=seq break bmAlleleLen= len(bmAllele) lenratio=float(len(match.sbjct))/float(bmAlleleLen) bestMatch = [blast_record.query, match, scoreRatio, blast_record.query_id,lenratio,bmAlleleLen] bestMatchContig=contigTag perfectMatch=True index=orderedAlleleNames.index(str(blast_record.query_id).split("_")[1]) bmAlleleLen= len(geneDict.keys()[index]) break #choose the match with the best score ratio score/length of allele elif scoreRatio > bestMatch[2]: 
index=orderedAlleleNames.index(str(blast_record.query_id).split("_")[1]) for seq, alleleid in geneDict.iteritems(): if alleleid == index+1: bmAllele=seq break bmAlleleLen= len(bmAllele) lenratio=float(len(match.sbjct))/float(bmAlleleLen) bestMatch = [blast_record.query, match, scoreRatio, blast_record.query_id,lenratio,bmAlleleLen] bestMatchContig=contigTag bestMatchContigLen=len(currentGenomeDict[contigTag]) print contigTag bestalignlen=alignment.length if perfectMatch==True: break # ---------- ALLELE CALLING AFTER DETERMINING BEST MATCH ---------- # print ("Finished choosing best match at : "+time.strftime("%H:%M:%S-%d/%m/%Y")) try: match = bestMatch[1] bestMatchStart=match.sbjct_start bestMatchEnd=match.sbjct_end if match.query_start > match.query_end: bestMatchEnd=match.sbjct_start bestMatchStart=match.sbjct_end geneLen = bestMatch[5] alleleStr = match.sbjct nIdentities = match.identities idPercent = float(nIdentities) / float(geneLen) scoreRatio = bestMatch[2] lenRatio = bestMatch[4] except: #if no best match was found ################### # LOCUS NOT FOUND # ################### resultsList.append('LNF3:-1') perfectMatchIdAllele.append('LNF') perfectMatchIdAllele2.append('LNF') print "Locus not found, no matches \n" continue print "is perfect match true?" 
+str(perfectMatch) if perfectMatch is True: #if a perfect match was found (DNA sequence is the same) try: alleleNumber = geneDict[ alleleStr ] except: alleleStr=reverseComplement(alleleStr) alleleNumber = geneDict[ alleleStr ] ################################################ # EXACT MATCH --- MATCH == GENE --- GENE FOUND # ################################################ if "_" in bestMatch[3]: a=bestMatch[3].split("_") perfectMatchIdAllele.append(a[1]) perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(bestMatchStart)+"-"+str(bestMatchEnd)+"&"+"+") else: perfectMatchIdAllele.append(bestMatch[3]) perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(bestMatchStart)+"-"+str(bestMatchEnd)+"&"+"+") resultsList.append('EXC:' + str(alleleNumber) ) printinfo(genomeFile,geneFile) print "Exact match \n" continue elif bestMatch[0] != '' and perfectMatch is not True: #if a best match was found but it's not an exact match ########################### # LOCUS ON THE CONTIG TIP # ########################### #check if the match is on the tip of the contig if bestMatchContigLen <= geneLen: resultsList.append('LOTSC:-1') perfectMatchIdAllele.append('LOTSC') perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(bestMatchStart)+"-"+str(bestMatchEnd)+"&"+"+") printinfo(genomeFile,geneFile) print "Locus is bigger than the contig \n" elif match.sbjct_start ==1 and len(match.query) < geneLen: resultsList.append('LOT5:-1') perfectMatchIdAllele.append('LOT5') perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(bestMatchStart)+"-"+str(bestMatchEnd)+"&"+"+") printinfo(genomeFile,geneFile) print "Locus is on the 5' tip of the contig \n" elif match.sbjct_end ==1 and len(match.query) < geneLen and match.sbjct_start > match.sbjct_end: resultsList.append('LOT3:-1') perfectMatchIdAllele.append('LOT3') perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(bestMatchStart)+"-"+str(bestMatchEnd)+"&"+"+") printinfo(genomeFile,geneFile) print "Locus is on the 3' tip of the 
contig \n" elif match.sbjct_end == bestMatchContigLen and len(match.query) < bestMatchContigLen: resultsList.append('LOT3:-1') perfectMatchIdAllele.append('LOT3') perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(bestMatchStart)+"-"+str(bestMatchEnd)+"&"+"+") printinfo(genomeFile,geneFile) print "Locus is on the 3' tip of the contig \n" elif match.sbjct_start == bestMatchContigLen and len(match.query) < bestMatchContigLen and match.sbjct_start > match.sbjct_end: resultsList.append('LOT5:-1') perfectMatchIdAllele.append('LOT5') perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(bestMatchStart)+"-"+str(bestMatchEnd)+"&"+"+") printinfo(genomeFile,geneFile) print "Locus is on the 5' tip of the contig \n" elif 'N' in alleleStr or "K" in alleleStr or "R" in alleleStr or "Y" in alleleStr: ##################### # ALLELE NOT FOUND # ##################### # strange base found! geneFile2= os.path.splitext(geneFile)[0] + "LNFN.fasta" with open(geneFile2, 'a') as f: f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+"\n") f.write((alleleStr) +"\n") resultsList.append('LNFN:-1') perfectMatchIdAllele.append('LNFN') perfectMatchIdAllele2.append('LNFN') printinfo(genomeFile,geneFile) print "LNFN, contains strange (N,K,R) bases! \n" else: print "new allele?" alleleStr = alleleStr.replace('-', '') lenExtraThresh=int(biggestAllelelen*0.2) #else: #check if best match without gaps are contained inside an already defined allele isContainedDefinedAllele = False definedAllele='' definedAlleleName='' for k in geneDict.keys(): if alleleStr in k: definedAllele=k isContainedDefinedAllele = True definedAlleleName=geneDict.get(k) break print "is contained? 
" + str(isContainedDefinedAllele) print idPercent print geneLen print lenExtraThresh print lenRatio if isContainedDefinedAllele and int(len(match.sbjct))<=int(len(definedAllele))+lenExtraThresh and int(len(match.sbjct))>=int(len(definedAllele))-lenExtraThresh : #allele without gaps is contained in a defined allele #best match with gaps has same size +1/-1 base as the defined allele isnewallele=False if int(len(alleleStr))==int(len(definedAllele)): # if match without gaps has same size as the defined allele tagAux = 'NA1:' printinfo(genomeFile,geneFile) perfectMatchIdAllele.append("NA1-"+str(alleleI)) perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(bestMatchStart)+"-"+str(bestMatchEnd)+"&"+"+") isnewallele=True elif int(len(alleleStr))==int(len(definedAllele))-1 : # if match without gaps has minus one base than the defined allele tagAux = 'NA2:' printinfo(genomeFile,geneFile) perfectMatchIdAllele.append("NA2-"+str(alleleI)) perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(bestMatchStart)+"-"+str(bestMatchEnd)+"&"+"+") isnewallele=True else: extraleft=0 extraright=0 tS=0 tE=0 handle = open(genomeFile, "rU") record_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta")) handle.close() record= record_dict[bestMatchContig] # if match without gaps has more than one base missing comparing to the defined allele if (1<int(match.query_start) and 1<int(match.query_end)): if match.query_start > match.query_end: extraleft=match.query_end-1 else: extraleft=match.query_start-1 print extraleft, extraright # if 3' tip bases of the allele are missing on the match if (int(geneLen)>int(match.query_start) and int(geneLen)>int(match.query_end) ): if match.query_start > match.query_end: extraright=geneLen-match.query_start else: extraright=geneLen-match.query_end print extraleft, extraright if match.sbjct_start > match.sbjct_end: tE=match.sbjct_start+extraleft tS=match.sbjct_end-extraright-1 alleleStr=str(record.seq[tS:tE]) alleleStr = reverseComplement(alleleStr) else: 
tS=match.sbjct_start-extraleft-1 tE=match.sbjct_end+extraright alleleStr=str(record.seq[tS:tE]) print tS print tE print "allele is:" print alleleStr if tE> bestMatchContigLen: resultsList.append('LOT3B:-1') perfectMatchIdAllele.append('LOT3B') perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(tS)+"-"+str(bestMatchContigLen)+"&"+"+") printinfo(genomeFile,geneFile) print "Locus is on the 3B' tip of the contig \n" elif tS<0: resultsList.append('LOT5B:-1') perfectMatchIdAllele.append('LOT5B') perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(0)+"-"+str(tE)+"&"+"+") printinfo(genomeFile,geneFile) print "Locus is on the 5B' tip of the contig \n" else: tagAux = 'NA4:' printinfo(genomeFile,geneFile) perfectMatchIdAllele.append("NA4-"+str(alleleI)) perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(tS)+"-"+str(tE)+"&"+"+") isnewallele=True if isnewallele: print "New allele found! Adding allele "+ tagAux + str(alleleI) +" to the database" geneDict[alleleStr] = alleleI resultsList.append( tagAux + str(alleleI) ) orderedAlleleNames.append(str(alleleI)) # --- add the new allele to the gene fasta --- # fG = open( geneFile, 'a' ) fG.write('>allele_' + str(alleleI) + '_' + tagAux[:-1] +'_' + str(os.path.basename(genomeFile)) + '\n') fG.write( alleleStr + '\n') fG.close() alleleI += 1 #if best match is not contained in an already defined allele, check if it has similar size with the match allele and has 0.8 similarity elif not isContainedDefinedAllele and idPercent >= 0.8 and int(len(match.sbjct))<=int(geneLen)+lenExtraThresh and int(len(match.sbjct))>=int(geneLen)-lenExtraThresh : ratio=float(len(alleleStr)) / float(geneLen) if ratio>=0.8 and ratio<=1.2: # if match without gaps has same size as the best match allele and 80%similarity tagAux = '' extraleft=0 extraright=0 tS=0 tE=0 handle = open(genomeFile, "rU") record_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta")) handle.close() record= record_dict[bestMatchContig] #if len(match.sbjct)<geneLen and "-" 
not in match.sbjct: #if the allele is not fully used against the match, compensate the tips try: print match if (1<int(match.query_start) and 1<int(match.query_end)): if match.query_start > match.query_end: extraleft=match.query_end-1 else: extraleft=match.query_start-1 print extraleft, extraright if (int(geneLen)>int(match.query_start) and int(geneLen)>int(match.query_end) ): # if 3' tip bases of the allele are missing on the match if match.query_start > match.query_end: extraright=geneLen-match.query_start else: extraright=geneLen-match.query_end print extraleft, extraright if match.sbjct_start > match.sbjct_end: tE=match.sbjct_start+extraleft tS=match.sbjct_end-extraright-1 alleleStr=str(record.seq[tS:tE]) alleleStr = reverseComplement(alleleStr) else: tS=match.sbjct_start-extraleft-1 tE=match.sbjct_end+extraright alleleStr=str(record.seq[tS:tE]) print tS print tE print "allele is:" print alleleStr if tE> bestMatchContigLen: resultsList.append('LOT3B:-1') perfectMatchIdAllele.append('LOT3B') perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(tS)+"-"+str(bestMatchContigLen)+"&"+"+") printinfo(genomeFile,geneFile) print "Locus is on the 3B' tip of the contig \n" elif tS<0: resultsList.append('LOT5B:-1') perfectMatchIdAllele.append('LOT5B') perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(0)+"-"+str(tE)+"&"+"+") printinfo(genomeFile,geneFile) print "Locus is on the 5B' tip of the contig \n" else: tagAux = 'NA5:' printinfo(genomeFile,geneFile) perfectMatchIdAllele.append("NA5-"+str(alleleI)) perfectMatchIdAllele2.append(str(bestMatchContig)+"&"+str(tS)+"-"+str(tE)+"&"+"+") print "New allele found! 
Adding allele "+ tagAux + str(alleleI) +" to the database" geneDict[alleleStr] = alleleI resultsList.append( tagAux + str(alleleI) ) #orderedAlleleNames.append('allele_' + str(alleleI) + '_' + tagAux[:-1] +'_' + str(os.path.basename(genomeFile))) orderedAlleleNames.append(str(alleleI)) # --- add the new allele to the gene fasta --- # fG = open( geneFile, 'a' ) fG.write('>allele_' + str(alleleI) + '_' + tagAux[:-1] +'_' + str(os.path.basename(genomeFile)) + '\n') fG.write( alleleStr + '\n') fG.close() alleleI += 1 except Exception as e: ################## # LNF WTF # ################## print e geneFile2= os.path.splitext(geneFile)[0] + "LNF3.fasta" print geneFile2 with open(geneFile2, 'a') as f: f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n") f.write((alleleStr) +"\n") f.write(">Allele\n") f.write((bmAllele)+"\n") resultsList.append('LNF3') printinfo(genomeFile,geneFile) perfectMatchIdAllele.append("LNF3") perfectMatchIdAllele2.append("LNF3") print "No allele found" else: ################## # LNF WTF2 # ################## geneFile2= os.path.splitext(geneFile)[0] + "LNF4.fasta" print geneFile2 with open(geneFile2, 'a') as f: f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n") f.write((alleleStr) +"\n") f.write(">Allele\n") f.write((bmAllele)+"\n") resultsList.append('LNF4') printinfo(genomeFile,geneFile) perfectMatchIdAllele.append("LNF4") perfectMatchIdAllele2.append("LNF4") print "No allele found" elif isContainedDefinedAllele: #################### # UNDEFINED ALLELE # # it is contained in another allele #################### alleleStr=match.query resultsList.append('UND:-1') perfectMatchIdAllele.append("undefined allele") perfectMatchIdAllele2.append("undefined allele") printinfo(genomeFile,geneFile) print "Undefined allele \n" geneFile2= os.path.splitext(geneFile)[0] + "undefined.fasta" print geneFile2 """with open(geneFile2, 
'a') as f: f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n") f.write((alleleStr) +"\n") #f.write(">BlastBestMatch"+str(definedAlleleName)+"\n") #f.write((alleleStr)+"\n") f.write(">Allele"+str(definedAlleleName)+"\n") f.write((definedAllele)+"\n")""" elif lenRatio < 0.5: ############### # SMALL MATCH # ############### resultsList.append('SAC:-1') # don't know what 'SAC' stands for perfectMatchIdAllele.append('small match') perfectMatchIdAllele2.append('small match') printinfo(genomeFile,geneFile) print "lower than 50% match \n" elif lenRatio < 0.8 and idPercent < 0.5: ##################### # INCOMPLETE ALLELE # # it was not possible to extend it to at least 80% of the length of the gene ##################### resultsList.append('INC:-1') perfectMatchIdAllele.append('allele incomplete') perfectMatchIdAllele2.append('allele incomplete') printinfo(genomeFile,geneFile) print "Incomplete allele\n" else: ################## # LNF WTFFF # ################## geneFile2= os.path.splitext(geneFile)[0] + "LNF.fasta" print geneFile2 with open(geneFile2, 'a') as f: f.write(">"+ (str(os.path.basename(genomeFile)))+"|"+(str(os.path.basename(geneFile)))+" | "+str(bestMatchContig)+"\n") f.write((alleleStr) +"\n") f.write(">Allele\n") f.write((bmAllele)+"\n") resultsList.append('LNF5') printinfo(genomeFile,geneFile) perfectMatchIdAllele.append("LNF5") perfectMatchIdAllele2.append("LNF5") print "Locus not found" final = (resultsList,perfectMatchIdAllele) print ("Finished allele calling at : "+time.strftime("%H:%M:%S-%d/%m/%Y")) filepath=os.path.join(temppath , os.path.basename(geneFile)+"_result.txt") filepath2=os.path.join(temppath , os.path.basename(geneFile)+"_result2.txt") with open(filepath, 'wb') as f: pickle.dump(final, f) with open(filepath2, 'wb') as f: pickle.dump(perfectMatchIdAllele2, f) return True
def main():
    """Call the alleles of one locus on every genome of a pickled job.

    Command line (read from ``sys.argv``):
        argv[1] -- pickle file holding a list whose first item is the locus
                   FASTA path and whose second item is the list of genome
                   FASTA paths.
        argv[2] -- temporary working directory. It must already contain, for
                   every genome, ``<genome>_ORF_Protein.txt`` (pickled CDS
                   dictionary) and ``<genome>_Protein.fasta``.

    For each genome the genome proteins are BLASTed against the locus protein
    database and the best hit is classified. One entry per genome is appended
    to the result lists using these tags:
        ``LNF3:-1`` / ``LNFN:-1``  locus not found / ambiguous base
        ``EXC:<id>``               exact match with allele ``<id>``
        ``LOTSC:-1``               locus is bigger than the contig
        ``LOT3:-1`` / ``LOT5:-1``  locus hangs over a contig tip
        ``INF<id>``                new (inferred) allele, appended to the
                                   locus FASTA and to the BLAST database

    The results are pickled to ``<temp>/<locus>_result.txt`` and
    ``<temp>/<locus>_result2.txt`` and the per-locus work directory is
    removed.

    Returns:
        True once both result files have been written.

    Raises:
        SystemExit: when the two command-line arguments are missing.
    """

    def now():
        # Timestamp format shared by every progress message below.
        return time.strftime("%H:%M:%S-%d/%m/%Y")

    print("Starting script at : " + now())
    try:
        input_file = sys.argv[1]
        temppath = sys.argv[2]
    except IndexError:
        # Previously execution continued and died with a NameError.
        print("usage: list_pickle_obj")
        sys.exit(1)

    # NOTE: pickle executes arbitrary code on load; these files must only
    # ever be the ones written by the pipeline itself.
    with open(input_file, 'rb') as f:
        argumentList = pickle.load(f)
    geneFile = argumentList[0]
    genomesList = argumentList[1]

    basepath = os.path.join(temppath, os.path.splitext(geneFile)[0])
    if not os.path.exists(basepath):
        os.makedirs(basepath)

    locusName = os.path.basename(geneFile)
    proteinFasta = os.path.join(basepath, str(locusName + '_protein.fasta'))
    proteinFasta2 = os.path.join(basepath, str(locusName + '_protein2.fasta'))
    blast_out_file = os.path.join(
        basepath, "blastdbs/" + locusName + '_List.xml')

    resultsList = []
    perfectMatchIdAllele = []
    perfectMatchIdAllele2 = []

    print("Getting BSR at : " + now())
    # NOTE(review): this two-argument call / four-value return does not match
    # the getBlastScoreRatios signatures visible elsewhere in this file --
    # confirm which helper version this entry point is meant to run against.
    alleleI, allelescores, Gene_Blast_DB_name, alleleList = \
        getBlastScoreRatios(geneFile, basepath)
    print("Finished BSR at : " + now())

    # Contig name -> sequence. Shared by all genomes: entries of previously
    # processed genomes stay in the dictionary.
    genomeDict = {}
    print("starting allele call at: " + now())
    for genomeFile in genomesList:
        print(genomeFile)
        # score, score ratio, perfect match, CDS name, allele ID
        # (a selected hit additionally carries: HSP, allele DNA length,
        # query length)
        bestmatch = [0, 0, False, '', 0]
        genomeName = os.path.basename(genomeFile)

        filepath = os.path.join(
            temppath, str(genomeName) + "_ORF_Protein.txt")
        with open(filepath, 'rb') as f:
            listOfCDS = pickle.load(f)

        for contig in HTSeq.FastaReader(genomeFile):
            genomeDict[contig.name] = str(contig.seq)
        currentGenomeDict = genomeDict

        genomeProteinfastaPath = os.path.join(
            temppath, str(genomeName + '_Protein.fasta'))

        print("Blasting alleles on genome at : " + now())
        cline = NcbiblastpCommandline(query=genomeProteinfastaPath,
                                      db=Gene_Blast_DB_name,
                                      evalue=0.001,
                                      out=blast_out_file,
                                      outfmt=5)
        print("Parse bsr blast at : " + now())
        blast_records = runBlastParser(cline, blast_out_file,
                                       genomeProteinfastaPath)
        print("Blasted alleles on genome at : " + now())

        for blast_record in blast_records:
            for alignment in blast_record.alignments:
                hitId = int(alignment.hit_def)
                for match in alignment.hsps:
                    scoreRatio = float(match.score) / float(
                        allelescores[hitId - 1])
                    cdsStrName = blast_record.query
                    DNAstr = listOfCDS[">" + cdsStrName]
                    AlleleDNAstr = alleleList[hitId - 1]

                    # The original test ``DNAstr==AlleleDNAstr is False`` is
                    # a chained comparison (``a == b and b is False``) that
                    # is never true, so every hit was flagged as DNA-equal.
                    # Compare the sequences explicitly, trying the reverse
                    # complement when the forward strand differs.
                    compare = DNAstr == AlleleDNAstr
                    if not compare:
                        try:
                            DNAstr = reverseComplement(DNAstr)
                            compare = DNAstr == AlleleDNAstr
                        except Exception:
                            # best effort: an unreversible CDS is simply
                            # treated as "not identical"
                            pass

                    # CDSs carrying ambiguous bases are never candidates.
                    if "N" in DNAstr or "K" in DNAstr or "R" in DNAstr:
                        continue

                    if scoreRatio == 1 and (bestmatch[2] is False
                                            or match.score > bestmatch[0]):
                        # Same protein as a known allele; it is a perfect
                        # match only when the DNA is identical as well.
                        bestmatch = [match.score, scoreRatio, compare,
                                     cdsStrName, hitId, match,
                                     len(AlleleDNAstr),
                                     blast_record.query_letters]
                    elif (match.score > bestmatch[0] and scoreRatio > 0.6
                          and scoreRatio > bestmatch[1]
                          and bestmatch[2] is False):
                        bestmatch = [match.score, scoreRatio, False,
                                     cdsStrName, hitId, match,
                                     len(AlleleDNAstr),
                                     blast_record.query_letters]

        print("Classifying the match at : " + now())

        # NOTE(review): AlleleDNAstr is whatever allele the BLAST loop
        # visited last, not necessarily the allele of the best match --
        # confirm this is the sequence meant to be screened here.
        if (bestmatch[0] == 0 or "N" in AlleleDNAstr or "K" in AlleleDNAstr
                or "R" in AlleleDNAstr):
            ###################
            # LOCUS NOT FOUND #
            ###################
            if bestmatch[0] == 0:
                resultsList.append('LNF3:-1')
                perfectMatchIdAllele.append('LNF')
                perfectMatchIdAllele2.append('LNF')
                print("Locus not found, no matches \n")
            else:
                resultsList.append('LNFN:-1')
                perfectMatchIdAllele.append('LNF')
                perfectMatchIdAllele2.append('LNF')
                print("Locus has strange base (N, K or R) \n")

        elif bestmatch[2] is True:
            ################################################
            # EXACT MATCH --- MATCH == GENE --- GENE FOUND #
            ################################################
            # CDS names are "<contig>&<something>&<start>-<end>"
            nameParts = bestmatch[3].split("&")
            matchLocation = nameParts[2]
            contigname = nameParts[0]
            alleleStr = listOfCDS[">" + bestmatch[3]]
            protSeq, alleleStr, Reversed = translateSeq(alleleStr)

            perfectMatchIdAllele.append(str(bestmatch[4]))
            strand = "-" if Reversed else "+"
            perfectMatchIdAllele2.append(
                str(contigname) + "&" + str(matchLocation) + "&" + strand)
            resultsList.append('EXC:' + str(bestmatch[4]))

        else:
            match = bestmatch[5]
            geneLen = bestmatch[6]
            nameParts = bestmatch[3].split("&")
            matchLocation = nameParts[2].split("-")
            contigname = nameParts[0]
            bestMatchContigLen = len(currentGenomeDict[contigname])
            alleleStr = listOfCDS[">" + bestmatch[3]]
            protSeq, alleleStr, Reversed = translateSeq(alleleStr)

            print(match)
            print(matchLocation)
            print(bestMatchContigLen)

            # free contig space to the right and left of the match
            rightmatchContig = bestMatchContigLen - int(matchLocation[1])
            leftmatchContig = int(matchLocation[0])
            if Reversed:
                rightmatchContig, leftmatchContig = \
                    leftmatchContig, rightmatchContig
            print(str(rightmatchContig) + " " + str(leftmatchContig))

            # allele bases left uncovered on each side of the match
            # (protein coordinates are multiplied by 3)
            rightmatchAllele = geneLen - (int(match.sbjct_end) * 3)
            leftmatchAllele = (int(match.sbjct_start) * 3)
            print(str(rightmatchAllele) + " " + str(leftmatchAllele))
            print(geneLen)

            ###########################
            # LOCUS ON THE CONTIG TIP #
            ###########################
            if (leftmatchContig < leftmatchAllele
                    and rightmatchContig < rightmatchAllele):
                resultsList.append('LOTSC:-1')
                perfectMatchIdAllele.append('LOTSC')
                perfectMatchIdAllele2.append('LOTSC')
                print("Locus is bigger than the contig \n")

            elif leftmatchContig < leftmatchAllele:
                resultsList.append('LOT3:-1')
                perfectMatchIdAllele.append('LOT3')
                perfectMatchIdAllele2.append('LOT3')
                print("Locus is on the 3' tip of the contig \n")

            elif rightmatchContig < rightmatchAllele:
                resultsList.append('LOT5:-1')
                perfectMatchIdAllele.append('LOT5')
                perfectMatchIdAllele2.append('LOT5')
                print("Locus is on the 5' tip of the contig \n")

            else:
                #######################
                # ADD INFERRED ALLELE #
                #######################
                tagAux = 'INF'
                newAlleleId = str(alleleI + 1)
                perfectMatchIdAllele.append(tagAux + "-" + newAlleleId)
                strand = "-" if Reversed else "+"
                perfectMatchIdAllele2.append(
                    str(contigname) + "&" + str(matchLocation[0]) + "-"
                    + str(matchLocation[1]) + "&" + strand)

                print("New allele! Adding allele " + tagAux + newAlleleId
                      + " to the database\n")
                resultsList.append(tagAux + newAlleleId)

                # --- add the new allele to the gene fasta --- #
                appendAllele = ('>allele_' + newAlleleId + '_' + tagAux[:-1]
                                + "_" + str(os.path.basename(genomeFile))
                                + '\n')
                with open(geneFile, 'a') as fG:
                    fG.write(appendAllele)
                    fG.write(alleleStr + '\n')

                proteinRecord = '>' + newAlleleId + '\n' + str(protSeq) + '\n'
                # protein2 holds only the new allele (used for its
                # self-BLAST); protein accumulates every allele of the locus
                with open(proteinFasta2, 'w') as fG:
                    fG.write(proteinRecord)
                with open(proteinFasta, 'a') as fG:
                    fG.write(proteinRecord)

                # --- remake blast DB --- #
                alleleList.append(alleleStr)
                Gene_Blast_DB_name = Create_Blastdb(proteinFasta, 1, True)
                print(proteinFasta)
                Gene_Blast_DB_name2 = Create_Blastdb(proteinFasta2, 1, True)

                print("Re-calculating BSR at : " + now())
                # NOTE(review): the reDogetBlastScoreRatios definition
                # visible at the top of this file takes a seventh
                # ``picklepath`` argument -- confirm the intended helper.
                alleleI, allelescores, alleleList = reDogetBlastScoreRatios(
                    proteinFasta2, basepath, alleleI, allelescores,
                    Gene_Blast_DB_name2, alleleList)
                print(allelescores)
                print("Done Re-calculating BSR at : " + now())

    final = (resultsList, perfectMatchIdAllele)
    print("Finished allele calling at : " + now())
    filepath = os.path.join(temppath, locusName + "_result.txt")
    filepath2 = os.path.join(temppath, locusName + "_result2.txt")
    with open(filepath, 'wb') as f:
        pickle.dump(final, f)
    with open(filepath2, 'wb') as f:
        pickle.dump(perfectMatchIdAllele2, f)
    shutil.rmtree(basepath)
    return True
def getBlastScoreRatios(genefile, basepath, doAll, verbose, blastPath):
    """Compute (or reload) the self-BLAST score of every allele of a locus.

    Every allele of ``genefile`` is translated with ``translateSeq``. Each
    translatable allele is written on its own to
    ``<basepath>/<locus>_protein2.fasta`` and turned into a BLAST database;
    all translated alleles together are written to
    ``<basepath>/<locus>_protein.fasta``.

    Args:
        genefile: path of the locus FASTA. Allele ids are either a bare
            integer or end in ``_<integer>``.
        basepath: work directory for the protein FASTA files and BLAST
            output.
        doAll: when true, BLAST each allele against itself and pickle the
            resulting ``{allele id: score}`` dictionary to
            ``<genefile>_bsr.txt``; when false, load that pickle instead.
        verbose: print progress messages when true.
        blastPath: blastp executable handed to ``NcbiblastpCommandline``.

    Returns:
        ``(scores, alleleList, listAllelesNames)``: the allele-id -> score
        dictionary, the DNA sequence of every allele and the id of every
        allele, both in file order.
    """
    if verbose:
        def verboseprint(*args):
            # one line per call, arguments separated by a space
            print(" ".join(str(arg) for arg in args))
    else:
        def verboseprint(*args):
            pass

    locusName = os.path.basename(genefile)
    geneScorePickle = os.path.abspath(genefile) + '_bsr.txt'
    singleProteinPath = os.path.join(
        basepath, str(locusName + '_protein2.fasta'))
    allProteinPath = os.path.join(
        basepath, str(locusName + '_protein.fasta'))
    blast_out_file = os.path.join(basepath, 'blastdbs/temp.xml')

    if doAll:
        var = {}
    else:
        # Scores were computed on an earlier run: load them once, up front,
        # instead of once per allele. NOTE: only load pickles written by
        # this pipeline.
        with open(geneScorePickle, 'rb') as f:
            var = pickle.load(f)

    alleleList = []
    listAllelesNames = []
    proteinRecords = []

    for allele in SeqIO.parse(genefile, "fasta", generic_dna):
        # usually the first allele is named just ">1"; later ones are
        # ">gene_id_genome_<n>"
        aux = allele.id.split("_")
        alleleI = int(aux[0]) if len(aux) < 2 else int(aux[-1])

        alleleList.append(str(allele.seq))
        listAllelesNames.append(allele.id)

        translatedSequence, _, _ = translateSeq(allele.seq)
        if translatedSequence == '':
            print("cannot translate allele on bsr calculation")
            continue

        alleleProt = ">" + str(alleleI) + "\n" + str(translatedSequence + "\n")
        proteinRecords.append(alleleProt)

        # new db for each allele so it can be BLASTed against itself; the
        # database is (re)built even when the scores are merely reloaded,
        # as before
        with open(singleProteinPath, "w") as f:
            f.write(alleleProt)
        Gene_Blast_DB_name = Create_Blastdb(singleProteinPath, 1, True)

        if not doAll:
            continue

        verboseprint("Starting Blast alleles at : " +
                     time.strftime("%H:%M:%S-%d/%m/%Y"))
        # --- get BLAST score ratio --- #
        cline = NcbiblastpCommandline(cmd=blastPath,
                                      query=singleProteinPath,
                                      db=Gene_Blast_DB_name,
                                      evalue=0.001,
                                      out=blast_out_file,
                                      outfmt=5,
                                      num_threads=1)
        blast_records = runBlastParser(cline, blast_out_file)
        verboseprint("Blasted alleles at : " +
                     time.strftime("%H:%M:%S-%d/%m/%Y"))

        # Key the score by this allele's own id. The previous
        # ``dict(zip(ids, scores))`` paired ids and scores by position, so a
        # single untranslatable allele or a second HSP shifted every later
        # score onto the wrong allele.
        hspScores = [int(match.score)
                     for blast_record in blast_records
                     for alignment in blast_record.alignments
                     for match in alignment.hsps]
        if hspScores:
            var[alleleI] = max(hspScores)
        verboseprint("________")

    if doAll and var:
        # written once, after all alleles were scored
        with open(geneScorePickle, 'wb') as f:
            pickle.dump(var, f)

    with open(allProteinPath, "w") as f:
        f.write("".join(proteinRecords))

    # all allele BSR scores and the list of alleles for this gene
    return var, alleleList, listAllelesNames
def main():
    """Build a schema seed from a FASTA file of coding sequences.

    Steps:
      1. Unless a protein file is supplied with ``-p``, translate every gene
         and drop the untranslatable ones, those whose protein was already
         seen, those shorter than ``-l`` and those whose protein is contained
         in a longer kept protein; write the survivors to ``proteins.fasta``
         next to the input file.
      2. BLAST the protein file against itself and, using the BLAST score
         ratio threshold ``--bsr``, decide which loci are kept and which are
         removed as too similar to another locus.
      3. Write the kept loci: one FASTA per locus into ``schema_seed`` (or
         the ``-o`` directory), or a single concatenated FASTA at ``-o`` when
         ``-p`` was given; then remove the intermediate files.

    All options are read from the command line with ``argparse``.
    """

    class _SkipRecord(Exception):
        """Raised to abandon the BLAST record currently being examined."""

    parser = argparse.ArgumentParser(
        description=
        "Given an ffn file, recovers the genes that are not paralogs and have a size bigger than the g parameter provided"
    )
    parser.add_argument('-i', nargs='?', type=str, help='ffn file',
                        required=True)
    parser.add_argument('-l', nargs='?', type=int, help='int minimum length',
                        required=True)
    parser.add_argument(
        '--cpu', nargs='?', type=int,
        help="Number of cpus, if over the maximum uses maximum -2",
        required=False)
    parser.add_argument('-p', nargs='?', type=str, help="file with protein",
                        required=False, default=False)
    parser.add_argument('-o', nargs='?', type=str, help="output filename",
                        required=False, default=False)
    parser.add_argument('-b', nargs='?', type=str, help="BLAST full path",
                        required=False, default='blastp')
    parser.add_argument('--bsr', nargs='?', type=float,
                        help="minimum BSR similarity", required=False,
                        default=0.6)
    parser.add_argument("-v", "--verbose", help="increase output verbosity",
                        dest='verbose', action="store_true", default=False)
    args = parser.parse_args()

    genes = args.i
    sizethresh = args.l
    cpuToUse = args.cpu
    proteinFIlePath = args.p
    outputFIlePath = args.o
    BlastpPath = args.b
    bsr = args.bsr
    verbose = args.verbose

    if verbose:
        def verboseprint(*messages):
            print(" ".join(str(message) for message in messages))
    else:
        def verboseprint(*messages):
            pass

    starttime = "\nStarting Script at : " + time.strftime("%H:%M:%S-%d/%m/%Y")
    verboseprint("\nStarting Script at : " +
                 time.strftime("%H:%M:%S-%d/%m/%Y"))
    verboseprint("Checking Blast installed... " + str(which(BlastpPath)))

    # The protein file lives next to the input. os.path.dirname replaces the
    # former ``abspath.replace(filename, '')``, which also deleted the file
    # name wherever it occurred in a parent directory.
    proteinfile = os.path.join(os.path.dirname(os.path.abspath(genes)),
                               'proteins.fasta')
    geneDict = {}   # gene name -> DNA sequence
    protDict = {}   # FASTA header line -> protein sequence

    verboseprint("Checking translatability of the loci:\n")
    if not proteinFIlePath:
        alreadyIn = set()
        totalgenes = 0
        repeatedgenes = 0
        smallgenes = 0
        nottranslatable = 0
        for gene in SeqIO.parse(genes, "fasta", generic_dna):
            dnaseq = str(gene.seq)
            protseq, seq, _ = translateSeq(dnaseq, gene.id)
            totalgenes += 1
            if len(protseq) <= 1:
                nottranslatable += 1
                continue
            if str(protseq) in alreadyIn:
                repeatedgenes += 1
            elif len(str(seq)) < sizethresh:
                smallgenes += 1
            else:
                alreadyIn.add(str(protseq))
                protname = ">" + str(gene.id) + "\n"
                protDict[protname] = str(protseq)
                geneDict[str(gene.name)] = dnaseq

        verboseprint(
            str(nottranslatable) + " not translatable out of " +
            str(totalgenes))
        verboseprint("\nChecking if repeated protein sequences:\n")
        # longest proteins first, so a protein can only be contained in one
        # that was examined before it
        orderedprotList = sorted(protDict.items(),
                                 key=lambda x: len(x[1]),
                                 reverse=True)
        verboseprint(
            str(repeatedgenes) + " repeated loci out of " + str(totalgenes))
        verboseprint(
            str(smallgenes) + " loci out of " + str(totalgenes) +
            " smaller than " + str(sizethresh) + "bp")
        verboseprint("\nprotein file created\n")

        # first step - remove genes contained in other genes or 100% equal
        verboseprint(
            "Checking if protein sequences are contained in others...")
        g = 0
        keptRecords = []   # FASTA text of every protein that is kept
        auxprot = []       # sequences of the proteins kept so far
        for protname, protseq in orderedprotList:
            prot = str(protseq)
            if any(prot in x for x in auxprot):
                g += 1
            else:
                keptRecords.append(protname + prot + "\n")
                auxprot.append(prot)
        verboseprint(str(g) + " loci are contained in other genes\n")

        # single write: the file formerly written during translation was
        # always overwritten by this one
        with open(proteinfile, "w") as f:
            f.write("".join(keptRecords))
    else:
        proteinfile = proteinFIlePath
        for gene in SeqIO.parse(genes, "fasta", generic_dna):
            geneDict[str(gene.name)] = str(gene.seq)

    verboseprint("Starting Blast")
    geneFile = os.path.abspath(proteinfile)
    Gene_Blast_DB_name = Create_Blastdb(geneFile, 1, True)
    blast_out_file = os.path.splitext(geneFile)[0] + '.xml'

    # ---------------------------- RUNNING BLAST ---------------------------- #
    cline = NcbiblastpCommandline(cmd=BlastpPath,
                                  query=geneFile,
                                  db=Gene_Blast_DB_name,
                                  evalue=0.001,
                                  out=blast_out_file,
                                  outfmt=5,
                                  num_threads=int(cpuToUse) if cpuToUse else 1)
    blast_records = runBlastParser(cline, blast_out_file)
    verboseprint("Finished blast")

    toRemove = []
    genesToKeep = []
    log = ["removed\tcause\texplanation"]

    # NOTE(review): the nesting below was reconstructed from whitespace-
    # flattened source -- confirm against version control. genesToKeep and
    # toRemove stay lists on purpose: they may hold duplicates and
    # ``list.remove`` drops only one occurrence.
    for blast_record in blast_records:
        allelename = blast_record.query.split(" ")[0]
        alleleLength = len(geneDict[allelename])
        query = str(blast_record.query)

        try:
            # if gene A is not on the toRemove list yet, add it to
            # genesToKeep
            if query not in toRemove:
                genesToKeep.append(blast_record.query)
                firstHit = str(blast_record.alignments[0].hit_def)

                # if the first alignment is not against self, drop gene A
                # and keep gene B instead
                if query != firstHit:
                    genesToKeep.remove(query)
                    toRemove.append(query)
                    log.append(query + "\t" + firstHit + "\t" +
                               "2 is first best match")
                    if firstHit not in toRemove:
                        genesToKeep.append(firstHit)
                    raise _SkipRecord()

                selfblastscore = blast_record.alignments[0].hsps[0].score
                for align in blast_record.alignments:
                    match = align.hsps[0]
                    scoreRatio = float(match.score) / float(selfblastscore)
                    hit = str(align.hit_def)
                    alleleLength2 = len(geneDict[hit])

                    # good match and gene B not in the toRemove list
                    if scoreRatio > bsr and hit != query and hit not in toRemove:
                        # gene B is bigger than gene A: keep gene B
                        if alleleLength2 > alleleLength:
                            genesToKeep.append(hit)
                            genesToKeep.remove(query)
                            toRemove.append(query)
                            log.append(query + "\t" + hit + "\t" +
                                       "2 is bigger and bsr >" + str(bsr))
                            raise _SkipRecord()
                        # else gene B goes to the toRemove list
                        elif hit in genesToKeep:
                            genesToKeep.remove(hit)
                        toRemove.append(hit)
                        log.append(hit + "\t" + query + "\t" +
                                   "2 is bigger and bsr >" + str(bsr))

            # gene A is already on the toRemove list
            else:
                # NOTE(review): as written this branch does not appear to
                # ever record anything -- a non-self hit abandons the record,
                # and otherwise selfblastscore stays 0 so the division below
                # fails and is swallowed. Confirm the intended behaviour.
                selfblastscore = 0
                for align in blast_record.alignments:
                    if not (str(align.hit_def) == query):
                        selfblastscore = align.hsps[0].score
                        raise _SkipRecord()

                for align in blast_record.alignments:
                    match = align.hsps[0]
                    scoreRatio = float(match.score) / float(selfblastscore)
                    if (align.hit_def not in genesToKeep
                            and not str(align.hit_def) == query
                            and scoreRatio > bsr):
                        toRemove.append(align.hit_def)
                        log.append(
                            str(align.hit_def) + "\t" + query + "\t" +
                            "2 was on the removed list and bsr >" + str(bsr))

        except Exception:
            # Deliberate best effort, unchanged from the original: both the
            # explicit skip above and any lookup error (e.g. a hit missing
            # from geneDict) just move on to the next record.
            pass

    keepSet = set(genesToKeep)
    removeSet = set(toRemove)

    pathfiles = os.path.dirname(geneFile) + "/"
    listfiles = []
    concatenatedRecords = []
    removedparalogs = 0
    rest = 0

    schema_folder_path = os.path.join(pathfiles, 'schema_seed')
    # create the output directory only when missing; os.makedirs used to
    # fail on an already existing -o directory
    if not proteinFIlePath and not outputFIlePath:
        if not os.path.exists(schema_folder_path):
            os.makedirs(schema_folder_path)
    elif not proteinFIlePath and outputFIlePath:
        if not os.path.exists(outputFIlePath):
            os.makedirs(outputFIlePath)

    for contig in SeqIO.parse(genes, "fasta", generic_dna):
        name2 = contig.id
        if name2 in removeSet or name2 not in keepSet:
            removedparalogs += 1
            continue
        if int(len(contig.seq)) <= sizethresh:
            continue

        # make the locus name safe to use as a file name; order matters:
        # "|" becomes "_" first, then every "_" becomes "-"
        namefile = contig.name
        namefile = namefile.replace("|", "_")
        namefile = namefile.replace("_", "-")
        namefile = namefile.replace("(", "")
        namefile = namefile.replace(")", "")
        namefile = namefile.replace("'", "")
        namefile = namefile.replace("\"", "")
        namefile = namefile.replace(":", "")

        if not proteinFIlePath:
            targetFolder = outputFIlePath if outputFIlePath else schema_folder_path
            newFile = os.path.join(targetFolder, namefile + ".fasta")
            listfiles.append(newFile)
            with open(newFile, "w") as f:
                f.write(">" + namefile + "_1\n" + str(contig.seq) + "\n")
        else:
            concatenatedRecords.append(">" + contig.id + " \n" +
                                       str(contig.seq) + "\n")
        rest += 1

    if proteinFIlePath and outputFIlePath:
        with open(outputFIlePath, "w") as f:
            f.write("".join(concatenatedRecords))
    else:
        # create short folder
        # NOTE(review): when -p is given without -o this removes the
        # user-supplied protein file and never writes the concatenated
        # sequences -- confirm that this is intended.
        get_Short(listfiles)
        verboseprint("\nRemoved " + str(removedparalogs) +
                     " with a high similarity (BSR>" + str(bsr) + ")")
        print("Total of " + str(rest) + " loci that constitute the schema")
        os.remove(proteinfile)

    shutil.rmtree(os.path.join(pathfiles, 'blastdbs'))
    os.remove(blast_out_file)
    verboseprint(starttime)
    verboseprint("Finished Script at : " + time.strftime("%H:%M:%S-%d/%m/%Y"))
def main(): try: input_file = sys.argv[1] temppath = sys.argv[2] blastPath = sys.argv[3] verbose = sys.argv[4] bsrTresh = sys.argv[5] if verbose == 'True': verbose = True else: verbose = False except IndexError: print( "Error starting the callAlleleles_protein3 script. usage: list_pickle_obj" ) bsrTresh = float(bsrTresh) argumentList = [] with open(input_file, 'rb') as f: argumentList = pickle.load(f) if verbose: def verboseprint(*args): for arg in args: print(arg), print else: verboseprint = lambda *a: None geneFile = argumentList[0] verboseprint("Using gene: " + str(geneFile)) shortgeneFile = os.path.join(os.path.dirname(argumentList[0]), "short", os.path.basename(argumentList[0])) shortgeneFile = shortgeneFile.replace(".fasta", "_short.fasta") genomesList = argumentList[1] genesList = argumentList[2] newListgenes = [] with open(genesList, 'r') as gene_fp: for gene in gene_fp: gene = gene.rstrip('\n') gene = gene.rstrip('\r') newListgenes.append(gene) statusbar = float(newListgenes.index(str(geneFile))) / len(newListgenes) locusnumber = (newListgenes.index(str(geneFile))) totalocusnumber = len(newListgenes) basepath = os.path.join(temppath, os.path.splitext(geneFile)[0]) print("\rProcessing " + os.path.basename(geneFile) + ". Start " + time.strftime("%H:%M:%S-%d/%m/%Y") + " Locus " + str(locusnumber) + " of " + str(totalocusnumber) + ". 
Done " + str(int(statusbar * 100)) + "%.", end="") if not os.path.exists(basepath): os.makedirs(basepath) #gene_fp = HTSeq.FastaReader(geneFile) fullAlleleList = [] fullAlleleNameList = [] alleleI = 0 # get full list of alleles from main gene file and last allele number id for allele in SeqIO.parse(geneFile, "fasta", generic_dna): aux = allele.id.split("_") if len(aux) < 2: alleleI = int(aux[0]) else: alleleI = int(aux[-1]) fullAlleleList.append(str(allele.seq)) fullAlleleNameList.append(allele.id) resultsList = [] i = 0 perfectMatchIdAllele = [] perfectMatchIdAllele2 = [] allelescores = [] listShortAllelesNames = [] verboseprint("Getting BSR at : " + time.strftime("%H:%M:%S-%d/%m/%Y")) geneScorePickle = os.path.abspath(shortgeneFile) + '_bsr.txt' # check if bsr as arealdy been calculated and recalculate it if necessary if os.path.isfile(geneScorePickle): allelescores, alleleList, listShortAllelesNames = getBlastScoreRatios( shortgeneFile, basepath, False, verbose, blastPath) else: allelescores, alleleList, listShortAllelesNames = getBlastScoreRatios( shortgeneFile, basepath, True, verbose, blastPath) verboseprint("Finished BSR at : " + time.strftime("%H:%M:%S-%d/%m/%Y")) verboseprint("starting allele call blast at: " + time.strftime("%H:%M:%S-%d/%m/%Y")) for genomeFile in genomesList: verboseprint(genomeFile) bestmatch = [ 0, 0, False, '', 0 ] # score, score ratio, perfectmatch, key name of the DNA sequence string, allele ID currentGenomeDict = {} currentCDSDict = {} # load the CDS from the genome to a dictionary filepath = os.path.join( temppath, str(os.path.basename(genomeFile)) + "_ORF_Protein.txt") with open(filepath, 'rb') as f: currentCDSDict = pickle.load(f) try: intersection = set(fullAlleleList).intersection( currentCDSDict.values()) intersection = list(intersection) if len(intersection) > 1: perfectMatchIdAllele.append('NIPHEM') perfectMatchIdAllele2.append('NIPHEM') verboseprint( os.path.basename(genomeFile) + " has " + str(len(intersection)) + " 
multiple exact match : " + os.path.basename(geneFile) + " MULTIPLE ALLELES as EXACT MATCH") raise ValueError("MULTIPLE ALLELES as EXACT MATCH") elif len(intersection) == 1: alleleStr = intersection[0] # it doenst return both keys with equal values # ~ elem=currentCDSDict.keys()[currentCDSDict.values().index(alleleStr)] elem = [ key for key, value in currentCDSDict.items() if value == alleleStr ] if len(elem) > 1: perfectMatchIdAllele.append('NIPHEM') perfectMatchIdAllele2.append('NIPHEM') verboseprint( os.path.basename(genomeFile) + " has " + str(len(intersection)) + " multiple exact match : " + os.path.basename(geneFile) + " MULTIPLE ALLELES as EXACT MATCH") raise ValueError("MULTIPLE ALLELES as EXACT MATCH") contigname = elem[0].split("&") matchLocation = contigname[2] # starting CDS base need to be +1 matchLocation = matchLocation.split("-") matchLocation = [ int(matchLocation[0]) + 1, int(matchLocation[1]) ] contigname = (contigname[0]).replace(">", "") alleleName = '' alleleMatchid = 0 alleleName = fullAlleleNameList[fullAlleleList.index( alleleStr)] alleleMatchid = int((alleleName.split("_"))[-1]) perfectMatchIdAllele.append(str(alleleMatchid)) if matchLocation[0] > matchLocation[1]: perfectMatchIdAllele2.append( str(contigname) + "&" + str(matchLocation[0]) + "-" + str(matchLocation[1]) + "&" + "-") else: perfectMatchIdAllele2.append( str(contigname) + "&" + str(matchLocation[0]) + "-" + str(matchLocation[1]) + "&" + "+") # check if atributed allele is contained or contains try: containedInfo = (alleleName.split("_"))[1] except: containedInfo = '' if containedInfo == "CD": resultsList.append([(os.path.basename(genomeFile)), str(alleleMatchid), containedInfo.rstrip()]) elif containedInfo == "CS": resultsList.append([(os.path.basename(genomeFile)), str(alleleMatchid), containedInfo.rstrip()]) else: pass raise ValueError("EQUAL") except Exception as e: # ~ exc_type, exc_obj, exc_tb = sys.exc_info() # ~ fname = 
os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] # ~ print(exc_tb.tb_lineno) # ~ print e continue else: verboseprint("Blasting alleles on genome at : " + time.strftime("%H:%M:%S-%d/%m/%Y")) blast_out_file = os.path.join( basepath, "blastdbs/" + os.path.basename(geneFile) + '_List.xml') Gene_Blast_DB_name = os.path.join( temppath, str(os.path.basename(genomeFile)) + "/" + str(os.path.basename(genomeFile)) + "_db") proteinfastaPath = os.path.join( basepath, str(os.path.basename(shortgeneFile) + '_protein.fasta')) # blast the genome CDS against the translated locus # cline = NcbiblastpCommandline(query=proteinfastaPath, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5,max_target_seqs=10,max_hsps_per_subject=10) # 2.2.28 up cline = NcbiblastpCommandline(cmd=blastPath, query=proteinfastaPath, db=Gene_Blast_DB_name, evalue=0.001, out=blast_out_file, outfmt=5, max_target_seqs=10, max_hsps=10, num_threads=1) blast_records = runBlastParser(cline, blast_out_file) verboseprint("Blasted alleles on genome at : " + time.strftime("%H:%M:%S-%d/%m/%Y")) alleleSizes = [] for allele in fullAlleleList: alleleSizes.append(len(allele)) biggestSizeAllele = max(alleleSizes) # get mode allele size moda = max(set(alleleSizes), key=alleleSizes.count) contador = Counter(alleleSizes).most_common() # if most common allele size appears 1 time, get first allele size if (contador[0])[1] == 1: moda = alleleSizes[0] try: # iterate through the blast results for blast_record in blast_records: locationcontigs = [] for alignment in blast_record.alignments: # select the best match for match in alignment.hsps: # query id comes with query_id, not name of the allele alleleMatchid = int( (blast_record.query_id.split("_"))[-1]) # ~ scoreRatio=float(match.score)/float(allelescores[int(alleleMatchid)-1]) # query_id starts with 1 alleleMatchid2 = (( listShortAllelesNames[alleleMatchid - 1]).split("_"))[-1] scoreRatio = float(match.score) / float( allelescores[int(alleleMatchid2)]) cdsStrName 
= (alignment.title.split(" "))[1] #DNAstr = str(currentCDSDict[">" + cdsStrName]) AlleleDNAstr = alleleList[int(alleleMatchid) - 1] if scoreRatio >= bsrTresh: locationcontigs.append(cdsStrName) # select the best match from BLAST results if scoreRatio == 1 and match.score > bestmatch[0]: bestmatch = [ match.score, scoreRatio, False, cdsStrName, int(alleleMatchid), match, len(AlleleDNAstr) ] elif (match.score > bestmatch[0] and scoreRatio >= bsrTresh and scoreRatio > bestmatch[1] and bestmatch[2] is False): bestmatch = [ match.score, scoreRatio, False, cdsStrName, int(alleleMatchid), match, len(AlleleDNAstr) ] verboseprint("Classifying the match at : " + time.strftime("%H:%M:%S-%d/%m/%Y")) # if no best match was found it's a Locus Not Found # check for ambiguious bases if not bestmatch[0] == 0: alleleStr = currentCDSDict[">" + bestmatch[3]] listFoundAmbiguities = [] listambiguousBases = [ 'K', 'M', 'R', 'Y', 'S', 'W', 'B', 'V', 'H', 'D', 'X', 'N', '-', '.' ] listFoundAmbiguities = [ e for e in listambiguousBases if e in alleleStr ] if bestmatch[0] == 0 or len(listFoundAmbiguities) > 0: ################### # LOCUS NOT FOUND # ################### if bestmatch[0] == 0: perfectMatchIdAllele.append('LNF') perfectMatchIdAllele2.append('LNF') verboseprint("Locus not found, no matches \n") else: perfectMatchIdAllele.append('LNF') perfectMatchIdAllele2.append('LNF') verboseprint("Locus has strange base \n") # if more than one BSR >0.6 in two different CDSs it's a Non Paralog Locus elif len(list(set(locationcontigs))) > 1: verboseprint("NIPH", "") perfectMatchIdAllele.append('NIPH') perfectMatchIdAllele2.append('NIPH') for elem in locationcontigs: verboseprint(elem) # if match with BSR >0.6 and not equal DNA sequences else: # load the contig info of the genome to a dictionary #g_fp = HTSeq.FastaReader(genomeFile) for contig in SeqIO.parse(genomeFile, "fasta", generic_dna): currentGenomeDict[contig.id] = len(str(contig.seq)) match = bestmatch[5] geneLen = bestmatch[6] alleleStr 
= currentCDSDict[">" + bestmatch[3]] contigname = bestmatch[3] contigname = contigname.split("&") matchLocation = contigname[2] matchLocation = matchLocation.split("-") matchLocation = [ int(matchLocation[0]) + 1, matchLocation[1] ] contigname = contigname[0] bestMatchContigLen = currentGenomeDict[contigname] protSeq, alleleStr, Reversed = translateSeq(alleleStr) # get extra space to the right and left between the allele and match and check if it's still inside the contig rightmatchAllele = geneLen - ( (int(match.query_end) + 1) * 3) leftmatchAllele = ((int(match.query_start) - 1) * 3) # ~ if Reversed swap left and right contig extra if int(matchLocation[1]) < int(matchLocation[0]): rightmatchContig = bestMatchContigLen - int( matchLocation[0]) leftmatchContig = int(matchLocation[1]) aux = rightmatchAllele rightmatchAllele = leftmatchAllele leftmatchAllele = aux else: rightmatchContig = bestMatchContigLen - int( matchLocation[1]) leftmatchContig = int(matchLocation[0]) ########################### # LOCUS ON THE CONTIG TIP # ########################### # check if contig is smaller than the matched allele if leftmatchContig < leftmatchAllele and rightmatchContig < rightmatchAllele: # ~ resultsList.append('PLOTSC:-1') perfectMatchIdAllele.append('LOTSC') perfectMatchIdAllele2.append('LOTSC') # ~ if not Reversed: # ~ perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"+") # ~ else: # ~ perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"-") verboseprint(match, contigname, geneFile, leftmatchAllele, rightmatchAllele, "Locus is bigger than the contig \n") elif leftmatchContig < leftmatchAllele: # ~ resultsList.append('PLOT3:-1') perfectMatchIdAllele.append('PLOT3') perfectMatchIdAllele2.append('PLOT3') # ~ if not Reversed: # ~ perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"+") # ~ else: # ~ 
perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"-") verboseprint( match, contigname, geneFile, leftmatchAllele, rightmatchAllele, "Locus is on the 3' tip of the contig \n") elif rightmatchContig < rightmatchAllele: # ~ resultsList.append('PLOT5:-1') perfectMatchIdAllele.append('PLOT5') perfectMatchIdAllele2.append('PLOT5') # ~ if not Reversed: # ~ perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"+") # ~ else: # ~ perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"-") verboseprint( match, contigname, geneFile, leftmatchAllele, rightmatchAllele, "Locus is on the 5' tip of the contig \n") elif float(len(alleleStr)) > moda + (moda * 0.2): verboseprint("Locus is larger than mode", moda, alleleStr) # ~ resultsList.append('ALM') perfectMatchIdAllele.append('ALM') perfectMatchIdAllele2.append('ALM') # ~ if not Reversed: # ~ perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"+") # ~ else: # ~ perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"-") elif float(len(alleleStr)) < moda - (moda * 0.2): verboseprint("Locus is smaller than mode", moda, alleleStr) # ~ resultsList.append('ASM') perfectMatchIdAllele.append('ASM') perfectMatchIdAllele2.append('ASM') # ~ if not Reversed: # ~ perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"+") # ~ else: # ~ perfectMatchIdAllele2.append(str(contigname)+"&"+str(matchLocation[0])+"-"+str(matchLocation[1])+"&"+"-") else: ####################### # ADD INFERRED ALLELE # # a new allele ####################### wasContained = False tagAuxC = 'S' for alleleaux in fullAlleleList: if alleleStr in alleleaux: alleleName = fullAlleleNameList[ fullAlleleList.index(alleleaux)] alleleMatchid = (alleleName.split("_"))[-1] tagAuxC = 'CD' + 
alleleMatchid.rstrip() resultsList.append([ (os.path.basename(genomeFile)), str(alleleI + 1), tagAuxC ]) break elif alleleaux in alleleStr: alleleName = fullAlleleNameList[ fullAlleleList.index(alleleaux)] alleleMatchid = (alleleName.split("_"))[-1] tagAuxC = 'CS' + alleleMatchid.rstrip() resultsList.append([ (os.path.basename(genomeFile)), str(alleleI + 1), tagAuxC ]) break if not wasContained: tagAux = 'INF' perfectMatchIdAllele.append(tagAux + "-" + str(alleleI + 1)) if not Reversed: perfectMatchIdAllele2.append( str(contigname) + "&" + str(matchLocation[0]) + "-" + str(matchLocation[1]) + "&" + "+") else: perfectMatchIdAllele2.append( str(contigname) + "&" + str(matchLocation[0]) + "-" + str(matchLocation[1]) + "&" + "-") verboseprint("New allele! Adding allele " + tagAux + str(alleleI + 1) + " to the database\n") # --- add the new allele to the gene fasta --- # alleleI += 1 appendAllele = '>' + str(( ((os.path.basename(geneFile)).split("."))[0] ).replace("_", "-")) + "_" + tagAuxC + "_" + (str( os.path.basename(genomeFile))).replace( "_", "-") + "_" + str(alleleI) + '\n' fG = open(geneFile, 'a') fG.write(appendAllele) fG.write(alleleStr + '\n') fG.close() fullAlleleList.append(alleleStr) fullAlleleNameList.append(appendAllele) if bestmatch[1] >= int(bsrTresh) and float( bestmatch[1]) < int(bsrTresh) + 0.1: fG = open(shortgeneFile, 'a') fG.write(appendAllele) fG.write(alleleStr + '\n') fG.close() geneTransalatedPath2 = os.path.join( basepath, str( os.path.basename(shortgeneFile) + '_protein2.fasta')) geneTransalatedPath = os.path.join( basepath, str( os.path.basename(shortgeneFile) + '_protein.fasta')) with open(geneTransalatedPath2, 'w') as fG: fG.write('>' + str(alleleI) + '\n' + str(protSeq) + '\n') with open(geneTransalatedPath, 'a') as fG: fG.write('>' + str(alleleI) + '\n' + str(protSeq) + '\n') match = bestmatch[5] # --- remake blast DB and recalculate the BSR for the locus --- # alleleList.append(alleleStr) listShortAllelesNames.append(appendAllele) 
genefile2 = geneTransalatedPath2 Gene_Blast_DB_name2 = Create_Blastdb( genefile2, 1, True) verboseprint( "Re-calculating BSR at : " + time.strftime("%H:%M:%S-%d/%m/%Y")) allelescores, alleleList, listShortAllelesNames = reDogetBlastScoreRatios( genefile2, basepath, alleleI, allelescores, Gene_Blast_DB_name2, alleleList, geneScorePickle, verbose, blastPath, listShortAllelesNames) verboseprint( "Done Re-calculating BSR at : " + time.strftime("%H:%M:%S-%d/%m/%Y")) except Exception as e: print("some error occurred") print(e) print('Error on line {}'.format(sys.exc_info()[-1].tb_lineno)) perfectMatchIdAllele2.append("ERROR") perfectMatchIdAllele.append("ERROR") final = (resultsList, perfectMatchIdAllele) verboseprint("Finished allele calling at : " + time.strftime("%H:%M:%S-%d/%m/%Y")) filepath = os.path.join(temppath, os.path.basename(geneFile) + "_result.txt") filepath2 = os.path.join(temppath, os.path.basename(geneFile) + "_result2.txt") with open(filepath, 'wb') as f: pickle.dump(final, f) with open(filepath2, 'wb') as f: pickle.dump(perfectMatchIdAllele2, f) shutil.rmtree(basepath) return True
def main():
    """Build a paralog-free set of loci from an ffn (nucleotide FASTA) file.

    Command line: ``-i <ffn file>`` and ``-g <int minimum size>``.

    Steps, in order:

    1. Translate every entry of the input with ``translateSeq`` and write the
       distinct proteins of at least 67 residues to ``proteins.fasta`` in the
       directory of the input file.
    2. Drop proteins that are a substring of an already kept protein
       (proteins are visited longest first) and rewrite ``proteins.fasta``.
    3. BLAST the protein file against itself and classify genes into
       ``genesToKeep`` / ``toRemove`` using a score ratio threshold of 0.6.
    4. Write one ``<name>.fasta`` per kept gene longer than ``-g`` plus a
       ``concatenated.fasta``, and a ``logfile.txt`` in the working
       directory describing the removals.

    NOTE(review): the block nesting of this function was reconstructed from
    a whitespace-collapsed copy of the file; confirm against the upstream
    source before relying on the exact branch structure.
    """

    class _SkipRecord(Exception):
        """Internal signal: stop processing the current BLAST record."""

    parser = argparse.ArgumentParser(
        description=
        "Given an ffn file, recovers the genes that are not paralogs and have a size bigger than the g parameter provided"
    )
    parser.add_argument('-i',
                        nargs='?',
                        type=str,
                        help='ffn file',
                        required=True)
    parser.add_argument('-g',
                        nargs='?',
                        type=int,
                        help='int minimum size',
                        required=True)

    args = parser.parse_args()
    genes = args.i
    sizethresh = args.g
    # Hard-coded switch: when True the protein file is assumed to exist
    # already and steps 1-2 only rebuild the in-memory dictionaries.
    passSteps = False

    # Directory of the input file. os.path.dirname is used instead of
    # str.replace(filename, '') because replace also strips the file name
    # from any directory component that happens to contain it.
    abspath = os.path.dirname(os.path.abspath(genes))
    proteinfile = os.path.join(abspath, 'proteins.fasta')

    geneDict = {}  # gene name -> DNA sequence
    protDict = {}  # ">name\n" header -> protein sequence
    seenProteins = set()  # proteins already accepted (membership only)
    totalgenes = 0
    repeatedgenes = 0
    smallgenes = 0
    containedCount = 0  # defined up-front so the summary works on both paths

    # ---- step 1: translate and filter ---------------------------------- #
    proteinHandle = None
    if not passSteps:
        print("not passing steps")
        proteinHandle = open(proteinfile, "wb")
    try:
        for gene in HTSeq.FastaReader(genes):
            # counted per gene; the original incremented once before the loop
            totalgenes += 1
            protseq, _unused_dna, _unused_reversed = translateSeq(
                str(gene.seq))
            if len(protseq) > 1:
                protein = str(protseq)
                if protein in seenProteins:
                    repeatedgenes += 1
                elif len(protein) < 67:
                    smallgenes += 1
                else:
                    seenProteins.add(protein)
                    protname = ">" + str(gene.name) + "\n"
                    if proteinHandle is not None:
                        proteinHandle.write(protname + protein + "\n")
                    protDict[protname] = protein
                    geneDict[str(gene.name)] = gene.seq
            else:
                # translation yielded nothing usable: report the gene name
                print(gene.name)
    finally:
        if proteinHandle is not None:
            proteinHandle.close()

    if not passSteps:
        # longest proteins first, so shorter ones are tested for containment
        orderedprotDict = collections.OrderedDict(
            sorted(protDict.items(), key=lambda x: len(x[1]), reverse=True))

        print(str(repeatedgenes) + " repeated genes out of " +
              str(totalgenes))
        print(str(smallgenes) + " small genes out of " + str(totalgenes))
        print("protein file created")

        # ---- step 2: remove proteins contained in other proteins ------- #
        print("Checking if proteins are equal or substring of others...")
        auxDict = {}  # protein sequence -> ">name\n" header
        auxprot = []
        for j, (header, protein) in enumerate(orderedprotDict.items()):
            prot = str(protein)
            if any(prot in kept for kept in auxprot):
                containedCount += 1
            else:
                auxDict[protein] = header
                auxprot.append(prot)
            print(str(j) + " out of " + str(len(orderedprotDict)))

        print("%s genes are contained in other genes" % (containedCount))

        # overwrite the protein file so it only holds the unique proteins
        with open(proteinfile, "wb") as f:
            f.write("".join(header + protein + "\n"
                            for protein, header in auxDict.items()))

    # ---- step 3: self BLAST -------------------------------------------- #
    geneFile = os.path.abspath(proteinfile)
    print(proteinfile)
    Gene_Blast_DB_name = Create_Blastdb(geneFile, 1, True)
    blast_out_file = os.path.splitext(geneFile)[0] + '.xml'

    cline = NcbiblastpCommandline(query=geneFile,
                                  db=Gene_Blast_DB_name,
                                  evalue=0.001,
                                  out=blast_out_file,
                                  outfmt=5)
    blast_records = runBlastParser(cline, blast_out_file, geneFile)

    toRemove = []
    genesToKeep = []
    log = ["removed\tcause\texplanation"]

    for blast_record in blast_records:
        query = str(blast_record.query)
        allelename = blast_record.query.split(" ")[0]
        alleleLength = len(geneDict[allelename])
        alignments = blast_record.alignments

        try:
            if query not in toRemove:
                # gene A is not discarded yet: keep it for now
                genesToKeep.append(blast_record.query)

                # first alignment is not against self: gene B is the better
                # representative, so A is dropped and B kept instead
                firstHit = str(alignments[0].hit_def)
                if query != firstHit:
                    genesToKeep.remove(query)
                    toRemove.append(query)
                    log.append(query + "\t" + firstHit + "\t" +
                               "2 is first best match")
                    if firstHit not in toRemove:
                        genesToKeep.append(firstHit)
                    raise _SkipRecord()

                selfblastscore = alignments[0].hsps[0].score

                for align in alignments:
                    match = align.hsps[0]
                    hit = str(align.hit_def)
                    scoreRatio = float(match.score) / float(selfblastscore)
                    alleleLength2 = len(geneDict[hit])

                    # good match against another gene that is still a
                    # candidate
                    if scoreRatio > 0.6 and hit != query and hit not in toRemove:
                        if alleleLength2 > alleleLength:
                            # gene B is bigger than gene A: keep B, drop A
                            genesToKeep.append(hit)
                            genesToKeep.remove(query)
                            toRemove.append(query)
                            log.append(query + "\t" + hit + "\t" +
                                       "2 is bigger and bsr >0.6")
                            raise _SkipRecord()
                        elif hit in genesToKeep:
                            genesToKeep.remove(hit)
                        # otherwise gene B is the one discarded
                        toRemove.append(hit)
                        log.append(hit + "\t" + query + "\t" +
                                   "2 is bigger and bsr >0.6")
            else:
                # gene A was already discarded
                selfblastscore = 0
                for align in alignments:
                    if str(align.hit_def) != query:
                        selfblastscore = align.hsps[0].score
                        print("gene " + str(align.hit_def) +
                              " is bigger than gene " + query)
                        raise _SkipRecord()

                # NOTE(review): as reconstructed, selfblastscore is still 0
                # whenever this loop runs, so the division below raises
                # ZeroDivisionError on the first alignment and the record is
                # abandoned by the handler at the bottom. Kept unchanged
                # pending confirmation of the intended logic.
                for align in alignments:
                    match = align.hsps[0]
                    scoreRatio = float(match.score) / float(selfblastscore)
                    if (align.hit_def not in genesToKeep
                            and str(align.hit_def) != query
                            and scoreRatio > 0.6):
                        toRemove.append(align.hit_def)
                        log.append(
                            str(align.hit_def) + "\t" + query + "\t" +
                            "2 was on the removed list and bsr >0.6")
        except _SkipRecord:
            # explicit replacement for the bare `raise` the code used as a
            # jump to the next record
            continue
        except Exception:
            # Deliberate best-effort: a record with no alignments or an
            # unexpected hit name is skipped, as in the original code.
            pass

    with open("logfile.txt", "wb") as f:
        for elem in log:
            f.write(str(elem) + "\n")

    # ---- step 4: write the surviving genes ----------------------------- #
    genesToKeep = list(set(genesToKeep))
    toRemove = list(set(toRemove))
    removedSet = set(toRemove)
    keepSet = set(genesToKeep)
    notcommonToKeep = [x for x in genesToKeep if x not in removedSet]
    print(len(toRemove))
    print(len(genesToKeep))
    print(len(notcommonToKeep))

    pathfiles = os.path.dirname(geneFile) + "/"

    removedparalogs = 0
    removedsize = 0
    totalgenes = 0
    rest = 0
    concatenatedParts = []

    for contig in HTSeq.FastaReader(genes):
        totalgenes += 1
        name2 = contig.name

        if name2 not in removedSet and name2 in keepSet:
            if int(len(contig.seq)) > sizethresh:
                namefile = contig.name.replace("|", "_")
                with open(pathfiles + namefile + ".fasta", "wb") as f:
                    f.write(">1\n" + contig.seq + "\n")
                rest += 1
                concatenatedParts.append(">" + namefile + "\n" + contig.seq +
                                         "\n")
            else:
                removedsize += 1
        else:
            removedparalogs += 1

    print("%s genes are contained in other genes" % (containedCount))
    print("Removed %s same Locus genes" % str(removedparalogs))
    print("Removed %s because of size " % str(removedsize))
    print("%s Scheme genes " % str(rest))
    print("total genes:" + str(totalgenes))

    with open(pathfiles + "concatenated.fasta", "wb") as f:
        f.write("".join(concatenatedParts))
def _extend_match_to_allele(match, geneLen, record):
    """Stretch a partial BLAST hit to the full allele length on its contig.

    ``match`` is the BLAST HSP, ``geneLen`` the length of the query allele and
    ``record`` the SeqIO record of the contig that was hit. The query
    positions not covered by the HSP are added on each side of the subject
    coordinates and the resulting slice of ``record.seq`` is returned
    (reverse-complemented when the subject coordinates are descending).

    Returns ``(tS, tE, alleleStr)``: the slice bounds used and the sequence.
    Bounds are not clamped; the caller checks them against the contig.
    """
    extraleft = 0
    extraright = 0

    # allele positions missing before the start of the alignment
    if 1 < int(match.query_start) and 1 < int(match.query_end):
        extraleft = min(match.query_start, match.query_end) - 1
    print(str(extraleft) + " " + str(extraright))

    # allele positions missing after the end of the alignment
    if int(geneLen) > int(match.query_start) and int(geneLen) > int(
            match.query_end):
        extraright = geneLen - max(match.query_start, match.query_end)
    print(str(extraleft) + " " + str(extraright))

    if match.sbjct_start > match.sbjct_end:
        tE = match.sbjct_start + extraleft
        tS = match.sbjct_end - extraright - 1
        alleleStr = reverseComplement(str(record.seq[tS:tE]))
    else:
        tS = match.sbjct_start - extraleft - 1
        tE = match.sbjct_end + extraright
        alleleStr = str(record.seq[tS:tE])

    print(tS)
    print(tE)
    print("allele is:")
    print(alleleStr)
    return tS, tE, alleleStr


def _append_new_allele(geneFile, genomeFile, alleleStr, alleleI, tagAux,
                       geneDict, orderedAlleleNames):
    """Register ``alleleStr`` as allele ``alleleI`` and append it to geneFile.

    Updates ``geneDict`` and ``orderedAlleleNames`` in place and returns the
    next free allele number.
    """
    print("New allele found! Adding allele " + tagAux + str(alleleI) +
          " to the database")
    geneDict[alleleStr] = alleleI
    orderedAlleleNames.append(str(alleleI))
    # --- add the new allele to the gene fasta --- #
    with open(geneFile, 'a') as fG:
        fG.write('>allele_' + str(alleleI) + '_' + tagAux[:-1] + '_' +
                 str(os.path.basename(genomeFile)) + '\n')
        fG.write(alleleStr + '\n')
    return alleleI + 1


def _dump_unresolved_match(geneFile, suffix, genomeFile, bestMatchContig,
                           alleleStr, bmAllele):
    """Append a rejected candidate and its reference allele to a side FASTA.

    The file is ``<geneFile without extension><suffix>.fasta``.
    """
    geneFile2 = os.path.splitext(geneFile)[0] + suffix + ".fasta"
    print(geneFile2)
    with open(geneFile2, 'a') as f:
        f.write(">" + (str(os.path.basename(genomeFile))) + "|" +
                (str(os.path.basename(geneFile))) + " | " +
                str(bestMatchContig) + "\n")
        f.write((alleleStr) + "\n")
        f.write(">Allele\n")
        f.write((bmAllele) + "\n")


def main():
    """Call the allele of one locus in every genome of a pickled work item.

    ``sys.argv[1]`` is a pickle holding ``[geneFile, genomesList]`` and
    ``sys.argv[2]`` a temporary directory. For each genome the locus alleles
    are BLASTed (blastn) against the genome database found under
    ``<temppath>/<genome>/<genome>_db``, the best hit is selected and one
    classification string is appended to ``perfectMatchIdAllele`` (allele
    number, ``LNF``, ``LOTSC``, ``LOT5``, ``LOT3``, ``NA?-n``, ...) plus a
    location string to ``perfectMatchIdAllele2``. Newly inferred alleles are
    appended to ``geneFile``.

    Results are pickled to ``<temppath>/<gene>_result.txt`` and
    ``<temppath>/<gene>_result2.txt``. Returns True.

    NOTE(review): the block nesting of this function was reconstructed from
    a whitespace-collapsed copy of the file; confirm against the upstream
    source before relying on the exact branch structure.
    """
    try:
        input_file = sys.argv[1]
        temppath = sys.argv[2]
    except IndexError:
        print("usage: list_pickle_obj")
        # Stop here: continuing would only fail on the unbound input_file.
        sys.exit(1)

    # NOTE(review): pickle.load can execute arbitrary code; the input file
    # must come from a trusted producer.
    with open(input_file, 'rb') as f:
        argumentList = pickle.load(f)

    geneFile = argumentList[0]
    genomesList = argumentList[1]

    basepath = temppath + "/" + os.path.basename(geneFile)
    if not os.path.exists(basepath + "/blastdbs/"):
        os.makedirs(basepath + "/blastdbs/")

    # ---- load the known alleles of the locus --------------------------- #
    geneDict = {}  # allele sequence -> allele number (1-based)
    orderedAlleleNames = []  # allele numbers as strings, in file order
    alleleI = 1
    biggestAllelelen = 0

    for allele in HTSeq.FastaReader(geneFile):
        if allele.seq in geneDict:
            print(
                "\nWARNING: this file contains a repeated allele, it should be checked. Ignoring it now!\n"
                + geneFile)
        else:
            biggestAllelelen = max(biggestAllelelen, len(allele.seq))
            orderedAlleleNames.append(str(alleleI))
            geneDict[allele.seq] = alleleI
            alleleI += 1

    perfectMatchIdAllele = []
    perfectMatchIdAllele2 = []
    genomeDict = {}  # contig name -> sequence, accumulated over genomes
    resultsList = []
    print(genomesList)

    for genomeFile in genomesList:
        print("_______________________________________________________")
        print(perfectMatchIdAllele)
        printinfo(genomeFile, geneFile)

        for contig in HTSeq.FastaReader(genomeFile):
            genomeDict[contig.name] = str(contig.seq)
        currentGenomeDict = genomeDict

        print("Blasting alleles on genome at : " +
              time.strftime("%H:%M:%S-%d/%m/%Y"))

        blast_out_file = os.path.join(
            basepath, "blastdbs/" + os.path.basename(geneFile) + '_List.xml')
        Gene_Blast_DB_name = os.path.join(
            temppath,
            str(os.path.basename(genomeFile)) + "/" +
            str(os.path.basename(genomeFile)) + "_db")

        cline = NcbiblastnCommandline(query=geneFile,
                                      db=Gene_Blast_DB_name,
                                      evalue=0.001,
                                      out=blast_out_file,
                                      outfmt=5)
        blast_records = runBlastParser(cline, blast_out_file, geneFile)
        print("Blasted alleles on genome at : " +
              time.strftime("%H:%M:%S-%d/%m/%Y"))

        # ------ DETERMINING BEST MATCH ------ #
        # bestMatch = [query, hsp, scoreRatio, query_id, lenratio, alleleLen]
        bestMatch = ['', '', 0]
        bestMatchContig = ''
        bestMatchContigLen = ''
        perfectMatch = False
        bmAlleleLen2 = 0
        bmAllele = ''

        for blast_record in blast_records:
            if perfectMatch:
                break

            if not blast_record.alignments:
                continue
            if bestMatch[0] == '' and bestMatch[1] == '':
                # placeholder so "had hits but none qualified" is detectable
                bestMatch[0] = blast_record.query
                bestMatch[1] = blast_record.alignments[0]

            # The query allele only depends on the record, so it is looked up
            # once here instead of once per alignment/HSP.
            queryAlleleId = orderedAlleleNames.index(
                str(blast_record.query_id).split("_")[1]) + 1
            queryAllele = None
            for seq, alleleid in geneDict.items():
                if alleleid == queryAlleleId:
                    queryAllele = seq
                    break
            if queryAllele is not None:
                bmAlleleLen2 = len(queryAllele)

            # --- iterate over all results to determine the best match --- #
            for alignment in blast_record.alignments:
                contigTag = (alignment.hit_def.split(" "))[0]

                for match in alignment.hsps:
                    # score ratio = score / length of the query allele
                    scoreRatio = float(match.score) / float(bmAlleleLen2)

                    # exact: identities cover the whole allele, no gaps and
                    # no N/K/Y/R in the subject
                    isExact = (int(match.identities) == int(bmAlleleLen2)
                               and int(match.identities) == int(
                                   len(match.query))
                               and "N" not in match.sbjct
                               and "K" not in match.sbjct
                               and "Y" not in match.sbjct
                               and "R" not in match.sbjct)

                    if isExact or scoreRatio > bestMatch[2]:
                        if queryAllele is not None:
                            bmAllele = queryAllele
                        bmAlleleLen = len(bmAllele)
                        lenratio = float(len(match.sbjct)) / float(bmAlleleLen)
                        bestMatch = [
                            blast_record.query, match, scoreRatio,
                            blast_record.query_id, lenratio, bmAlleleLen
                        ]
                        bestMatchContig = contigTag
                        if isExact:
                            perfectMatch = True
                            break
                        bestMatchContigLen = len(currentGenomeDict[contigTag])
                        print(contigTag)

                if perfectMatch:
                    break

        # ---------- ALLELE CALLING AFTER DETERMINING BEST MATCH ---------- #
        print("Finished choosing best match at : " +
              time.strftime("%H:%M:%S-%d/%m/%Y"))

        try:
            match = bestMatch[1]
            bestMatchStart = match.sbjct_start
            bestMatchEnd = match.sbjct_end
            if match.query_start > match.query_end:
                bestMatchEnd = match.sbjct_start
                bestMatchStart = match.sbjct_end
            print(match)
            geneLen = bestMatch[5]
            alleleStr = match.sbjct
            nIdentities = match.identities
            idPercent = float(nIdentities) / float(geneLen)
            lenRatio = bestMatch[4]
        except Exception:
            # bestMatch still holds the placeholder (or nothing): no usable hit
            ###################
            # LOCUS NOT FOUND #
            ###################
            perfectMatchIdAllele.append('LNF')
            perfectMatchIdAllele2.append('LNF')
            print("Locus not found, no matches \n")
            continue

        matchLocation = (str(bestMatchContig) + "&" + str(bestMatchStart) +
                         "-" + str(bestMatchEnd) + "&" + "+")

        print("is perfect match true?" + str(perfectMatch))
        if perfectMatch is True:
            # the subject may be on the reverse strand; a second miss is an
            # inconsistency and is allowed to raise KeyError
            try:
                geneDict[alleleStr]
            except KeyError:
                alleleStr = reverseComplement(alleleStr)
                geneDict[alleleStr]

            ################################################
            # EXACT MATCH --- MATCH == GENE --- GENE FOUND #
            ################################################
            if "_" in bestMatch[3]:
                perfectMatchIdAllele.append(bestMatch[3].split("_")[1])
            else:
                perfectMatchIdAllele.append(bestMatch[3])
            perfectMatchIdAllele2.append(matchLocation)
            printinfo(genomeFile, geneFile)
            print("Exact match \n")
            continue

        # a best match was found but it is not an exact match
        ###########################
        # LOCUS ON THE CONTIG TIP #
        ###########################
        print(geneLen)

        if bestMatchContigLen <= geneLen:
            perfectMatchIdAllele.append('LOTSC')
            perfectMatchIdAllele2.append(matchLocation)
            printinfo(genomeFile, geneFile)
            print("Locus is bigger than the contig \n")

        elif (match.sbjct_start == 1 and len(match.query) < geneLen) or (
                match.sbjct_start == bestMatchContigLen
                and len(match.query) < bestMatchContigLen
                and match.sbjct_start > match.sbjct_end):
            perfectMatchIdAllele.append('LOT5')
            perfectMatchIdAllele2.append(matchLocation)
            printinfo(genomeFile, geneFile)
            print("Locus is on the 5' tip of the contig \n")

        elif (match.sbjct_end == 1 and len(match.query) < geneLen
              and match.sbjct_start > match.sbjct_end) or (
                  match.sbjct_end == bestMatchContigLen
                  and len(match.query) < bestMatchContigLen):
            perfectMatchIdAllele.append('LOT3')
            perfectMatchIdAllele2.append(matchLocation)
            printinfo(genomeFile, geneFile)
            print("Locus is on the 3' tip of the contig \n")

        elif 'N' in alleleStr or "K" in alleleStr or "R" in alleleStr or "Y" in alleleStr:
            #####################
            # ALLELE NOT FOUND  #  ambiguous base found
            #####################
            geneFile2 = os.path.splitext(geneFile)[0] + "LNFN.fasta"
            with open(geneFile2, 'a') as f:
                f.write(">" + (str(os.path.basename(genomeFile))) + "|" +
                        (str(os.path.basename(geneFile))) + "\n")
                f.write((alleleStr) + "\n")
            perfectMatchIdAllele.append('LNFN')
            perfectMatchIdAllele2.append('LNFN')
            printinfo(genomeFile, geneFile)
            print("LNFN, contains strange (N,K,R) bases! \n")

        else:
            print("new allele?")
            # removing gaps
            alleleStr = alleleStr.replace('-', '')
            lenExtraThresh = int(biggestAllelelen * 0.2)

            # is the gap-free match contained in an already defined allele?
            isContainedDefinedAllele = False
            definedAllele = ''
            for k in geneDict.keys():
                if alleleStr in k:
                    definedAllele = k
                    isContainedDefinedAllele = True
                    break
            print("is contained? " + str(isContainedDefinedAllele))
            print(idPercent)
            print(geneLen)
            print(lenExtraThresh)
            print(lenRatio)

            sbjctLen = int(len(match.sbjct))

            if (isContainedDefinedAllele
                    and sbjctLen <= int(len(definedAllele)) + lenExtraThresh
                    and sbjctLen >= int(len(definedAllele)) - lenExtraThresh):
                # contained in a defined allele and of comparable size
                isnewallele = False

                if int(len(alleleStr)) == int(len(definedAllele)):
                    # gap-free match has the same size as the defined allele
                    tagAux = 'NA?:'
                    printinfo(genomeFile, geneFile)
                    perfectMatchIdAllele.append("NA?-" + str(alleleI))
                    perfectMatchIdAllele2.append(matchLocation)
                    isnewallele = True

                elif int(len(alleleStr)) == int(len(definedAllele)) - 1:
                    # gap-free match is one base shorter
                    tagAux = 'NA2:'
                    printinfo(genomeFile, geneFile)
                    perfectMatchIdAllele.append("NA2-" + str(alleleI))
                    perfectMatchIdAllele2.append(matchLocation)
                    isnewallele = True

                else:
                    # more than one base missing: rebuild from the contig
                    with open(genomeFile, "rU") as handle:
                        record_dict = SeqIO.to_dict(
                            SeqIO.parse(handle, "fasta"))
                    record = record_dict[bestMatchContig]

                    tS, tE, alleleStr = _extend_match_to_allele(
                        match, geneLen, record)

                    if tE > bestMatchContigLen:
                        perfectMatchIdAllele.append('LOT3B')
                        perfectMatchIdAllele2.append(
                            str(bestMatchContig) + "&" + str(tS) + "-" +
                            str(bestMatchContigLen) + "&" + "+")
                        printinfo(genomeFile, geneFile)
                        print("Locus is on the 3B' tip of the contig \n")
                    elif tS < 0:
                        perfectMatchIdAllele.append('LOT5B')
                        perfectMatchIdAllele2.append(
                            str(bestMatchContig) + "&" + str(0) + "-" +
                            str(tE) + "&" + "+")
                        printinfo(genomeFile, geneFile)
                        print("Locus is on the 5B' tip of the contig \n")
                    else:
                        tagAux = 'NA2:'
                        printinfo(genomeFile, geneFile)
                        perfectMatchIdAllele.append("NA2-" + str(alleleI))
                        perfectMatchIdAllele2.append(
                            str(bestMatchContig) + "&" + str(tS) + "-" +
                            str(tE) + "&" + "+")
                        isnewallele = True

                if isnewallele:
                    alleleI = _append_new_allele(geneFile, genomeFile,
                                                 alleleStr, alleleI, tagAux,
                                                 geneDict, orderedAlleleNames)

            elif (not isContainedDefinedAllele and idPercent >= 0.8
                  and sbjctLen <= int(geneLen) + lenExtraThresh
                  and sbjctLen >= int(geneLen) - lenExtraThresh):
                # not contained, at least 80% identity and comparable size
                ratio = float(len(alleleStr)) / float(geneLen)

                if ratio >= 0.8 and ratio <= 1.2:
                    with open(genomeFile, "rU") as handle:
                        record_dict = SeqIO.to_dict(
                            SeqIO.parse(handle, "fasta"))
                    record = record_dict[bestMatchContig]

                    # if the allele is not fully covered by the match,
                    # compensate the tips; any failure is reported as LNF3
                    try:
                        print(match)
                        tS, tE, alleleStr = _extend_match_to_allele(
                            match, geneLen, record)

                        if tE > bestMatchContigLen:
                            perfectMatchIdAllele.append('LOT3C')
                            perfectMatchIdAllele2.append(
                                str(bestMatchContig) + "&" + str(tS) + "-" +
                                str(bestMatchContigLen) + "&" + "+")
                            printinfo(genomeFile, geneFile)
                            print("Locus is on the 3C' tip of the contig \n")
                        elif tS < 0:
                            perfectMatchIdAllele.append('LOT5C')
                            perfectMatchIdAllele2.append(
                                str(bestMatchContig) + "&" + str(0) + "-" +
                                str(tE) + "&" + "+")
                            printinfo(genomeFile, geneFile)
                            print("Locus is on the 5C' tip of the contig \n")
                        else:
                            tagAux = 'NA3:'
                            printinfo(genomeFile, geneFile)
                            perfectMatchIdAllele.append("NA3-" + str(alleleI))
                            perfectMatchIdAllele2.append(
                                str(bestMatchContig) + "&" + str(tS) + "-" +
                                str(tE) + "&" + "+")
                            alleleI = _append_new_allele(
                                geneFile, genomeFile, alleleStr, alleleI,
                                tagAux, geneDict, orderedAlleleNames)
                    except Exception as e:
                        ##################
                        #       LNF      #
                        ##################
                        print(e)
                        _dump_unresolved_match(geneFile, "LNF3", genomeFile,
                                               bestMatchContig, alleleStr,
                                               bmAllele)
                        printinfo(genomeFile, geneFile)
                        perfectMatchIdAllele.append("LNF3")
                        perfectMatchIdAllele2.append("LNF3")
                        print("No allele found")
                else:
                    ##################
                    #       LNF      #
                    ##################
                    _dump_unresolved_match(geneFile, "LNF4", genomeFile,
                                           bestMatchContig, alleleStr,
                                           bmAllele)
                    printinfo(genomeFile, geneFile)
                    perfectMatchIdAllele.append("LNF4")
                    perfectMatchIdAllele2.append("LNF4")
                    print("No allele found")

            elif isContainedDefinedAllele:
                ####################
                # UNDEFINED ALLELE #  contained in another allele
                ####################
                perfectMatchIdAllele.append("undefined allele")
                perfectMatchIdAllele2.append("undefined allele")
                printinfo(genomeFile, geneFile)
                print("Undefined allele \n")
                print(os.path.splitext(geneFile)[0] + "undefined.fasta")

            elif lenRatio < 0.5:
                ###############
                # SMALL MATCH #
                ###############
                perfectMatchIdAllele.append('small match')
                perfectMatchIdAllele2.append('small match')
                printinfo(genomeFile, geneFile)
                print("lower than 50% match \n")

            elif lenRatio < 0.8 and idPercent < 0.5:
                #####################
                # INCOMPLETE ALLELE #  could not reach 80% of the gene length
                #####################
                perfectMatchIdAllele.append('allele incomplete')
                perfectMatchIdAllele2.append('allele incomplete')
                printinfo(genomeFile, geneFile)
                print("Incomplete allele\n")

            else:
                ##################
                #       LNF      #
                ##################
                printinfo(genomeFile, geneFile)
                perfectMatchIdAllele.append("LNF5")
                perfectMatchIdAllele2.append("LNF5")
                print("Locus not found")

    final = (resultsList, perfectMatchIdAllele)
    print("Finished allele calling at : " +
          time.strftime("%H:%M:%S-%d/%m/%Y"))

    filepath = os.path.join(temppath,
                            os.path.basename(geneFile) + "_result.txt")
    filepath2 = os.path.join(temppath,
                             os.path.basename(geneFile) + "_result2.txt")
    with open(filepath, 'wb') as f:
        pickle.dump(final, f)
    with open(filepath2, 'wb') as f:
        pickle.dump(perfectMatchIdAllele2, f)
    return True
def _selectBestMatch(blast_records):
    """Pick the HSP with the highest query/subject aligned-length ratio.

    Returns a tuple (match, lenRatio, contigTag, contigLen) describing the
    best HSP and the BLAST record (contig) it came from, or None when no
    record contains any HSP.
    """
    best = None
    bestRatio = 0
    for blast_record in blast_records:
        # Records without alignments contribute nothing (the loop below is
        # simply empty for them).
        for alignment in blast_record.alignments:
            for match in alignment.hsps:
                lenRatio = float(len(match.query)) / float(len(match.sbjct))
                if lenRatio > bestRatio:
                    bestRatio = lenRatio
                    # The contig tag is the record's query title up to the
                    # first space; it is what extendCDS is given later.
                    contigTag = blast_record.query.split(' ', 1)[0]
                    # Keep the contig data together with the HSP so that the
                    # caller never mixes the best HSP with another contig.
                    best = (match, lenRatio, contigTag,
                            blast_record.query_letters)
    return best


def callAlleles(argumentList):
    """Call the allele of one gene in each genome of a list.

    argumentList is a 4-item sequence:
      [0] geneFile          - path of the gene FASTA file; it is read with
                              HTSeq.FastaReader, used to build the BLAST DB
                              and appended to when a new allele is found
      [1] genomesList       - genome file paths (a trailing newline is
                              stripped); each one is used as BLAST query
      [2] listOfCDSDicts    - one entry per genome, handed to extendCDS
      [3] listOfGenomesDict - one entry per genome, handed to extendCDS

    Returns a list with one result string per genome:
      'LNF:-1'  locus not found (no BLAST hit)
      'LOT:-1'  locus on the contig tip
      'SAC:-1'  small match (length ratio below 0.5)
      'INC:-1'  incomplete allele
      'EXC:<n>' exact match with known allele index n
      'UND:-1'  undefined allele (contained in a known allele)
      'NA:<n>'  new allele, stored under index n
      'INF:<n>' inferred allele, stored under index n

    Raises IndexError if listOfCDSDicts or listOfGenomesDict has fewer
    entries than genomesList.
    """
    geneFile = argumentList[0]
    genomesList = argumentList[1]
    listOfCDSDicts = argumentList[2]
    listOfGenomesDict = argumentList[3]

    # --- index the known alleles: sequence -> allele number --- #
    geneDict = {}
    alleleI = 0
    for allele in HTSeq.FastaReader(geneFile):
        if allele.seq in geneDict:
            print("\nWARNING: this file contains a repeated allele, it should be checked. Ignoring it now!\n" + geneFile)
        else:
            geneDict[allele.seq] = alleleI
            alleleI += 1

    # --- make 1st blast DB --- #
    Gene_Blast_DB_name = Create_Blastdb(geneFile, 1)
    blast_out_file = os.path.splitext(geneFile)[0] + '.xml'

    # list of results - the output of the function
    resultsList = []

    for i, genomeFile in enumerate(genomesList):
        currentCDSDict = listOfCDSDicts[i]
        currentGenomeDict = listOfGenomesDict[i]

        if genomeFile.endswith('\n'):
            genomeFile = genomeFile[:-1]

        # ------------------------------ RUNNING BLAST ------------------------------ #
        cline = NcbiblastnCommandline(query=genomeFile, db=Gene_Blast_DB_name,
                                      evalue=0.001, out=blast_out_file,
                                      outfmt=5)
        blast_records = runBlastParser(cline, blast_out_file, genomeFile)

        # ------ DETERMINING BEST MATCH ------ #
        best = _selectBestMatch(blast_records)

        ###################
        # LOCUS NOT FOUND #
        ###################
        if best is None:
            resultsList.append('LNF:-1')
            continue

        match, lenRatio, contigTag, contigLen = best
        geneLen = len(match.sbjct)
        alleleStr = match.query
        idPercent = float(match.identities) / float(geneLen)

        ###########################
        # LOCUS ON THE CONTIG TIP #
        ###########################
        if contigLen <= match.query_start or contigLen <= match.query_end:
            resultsList.append('LOT:-1')
            continue

        ###############
        # SMALL MATCH #
        ###############
        if lenRatio < 0.5:
            resultsList.append('SAC:-1')
            continue

        # --- using prodigal to try to extend the CDS --- #
        extended, strCDS = extendCDS(contigTag, currentCDSDict,
                                     match.sbjct_start, match.sbjct_end,
                                     currentGenomeDict)

        # --- if it was possible to extend it using prodigal --- #
        if extended and (len(strCDS) * lenRatio) >= geneLen:
            alleleStr = strCDS
            lenRatio = float(len(strCDS)) / float(geneLen)

        # --- removing gaps '-' --- #
        alleleStr = alleleStr.replace('-', '')

        #####################
        # INCOMPLETE ALLELE #
        # it was not possible to extend it to at least 80% of the length of the gene
        #####################
        if lenRatio < 0.8 and idPercent < 0.5:
            resultsList.append('INC:-1')
            continue

        # --- it might be needed to obtain the reverse complement of the allele string --- #
        if match.sbjct_start > match.sbjct_end:
            alleleStr = reverseComplement(alleleStr)

        ################################################
        # EXACT MATCH --- MATCH == GENE --- GENE FOUND #
        ################################################
        if alleleStr in geneDict:
            resultsList.append('EXC:' + str(geneDict[alleleStr]))
            continue

        ####################
        # UNDEFINED ALLELE #
        # it is contained in another allele
        ####################
        if any(alleleStr in knownAllele for knownAllele in geneDict):
            resultsList.append('UND:-1')
            continue

        if not extended and idPercent > 0.8:
            # ADD NEW ALLELE
            tagAux = 'NA:'
        else:
            # ADD INFERRED ALLELE - a new allele that was extended with prodigal
            tagAux = 'INF:'

        resultsList.append(tagAux + str(alleleI))
        geneDict[alleleStr] = alleleI
        alleleI += 1

        # --- add the new allele to the gene fasta --- #
        # NOTE(review): the header index is written after the increment, so it
        # is one higher than the index reported in resultsList and stored in
        # geneDict. Kept as in the original - confirm whether the 1-based
        # header is intended.
        with open(geneFile, 'a') as fG:
            fG.write('>allele_' + str(alleleI) + '_' + tagAux[:-1] + '\n')
            fG.write(alleleStr + '\n')

        # --- remake blast DB so the next genomes are compared to the new allele too --- #
        Gene_Blast_DB_name = Create_Blastdb(geneFile, 1)

    return resultsList