def blastall_seq2seq(fastadata=(), filenames=(), output="ncbiparsed",
                     blastprogram="blastp", remove_files=True,
                     extra_blastp_params={'F': 'F', 'e': '10'}):
    """
    Blast one query sequence against one sbjct sequence with blastall.

    Choose exactly one kind of input:
        fastadata   ( ( headerQUERY, seqQUERY ) , ( headerSBJCT, seqSBJCT ) )
         or
        filenames   ( filenameQUERY, filenameSBJCT )

    @type  fastadata: tuple
    @param fastadata: two ( header, sequence ) tuples; both are written to
                      temporary fasta files in the current directory

    @type  filenames: tuple
    @param filenames: ( query filename, sbjct filename ) of existing files

    @type  output: string
    @param output: "ncbiparsed" returns the result of
                   NCBIStandalone.BlastParser().parse(); any other value
                   returns the raw text read from blastall's stdout

    @type  blastprogram: string
    @param blastprogram: 'blastp', 'tblastn', 'tblastx' or 'blastx'

    @type  remove_files: Boolean
    @param remove_files: remove query file, sbjct file and the formatdb
                         index files (sbjct filename + ".*") afterwards

    @type  extra_blastp_params: dict
    @param extra_blastp_params: extra blastall options, passed as "-key value";
                                only read here, never modified

    @attention: with remove_files=True and `filenames` input, the files named
                by the caller are deleted after a successful run

    @raise ValueError: unsupported blastprogram, no input, or improper input
    """
    import glob
    import re

    supported = ('blastp', 'tblastn', 'tblastx', 'blastx')
    if blastprogram not in supported:
        raise ValueError("unsupported blastprogram %r; choose one of: %s" %
                         (blastprogram, ", ".join(supported)))
    # value for formatdb's -p option; tblastn/tblastx get "F", the rest "T"
    if blastprogram in ('tblastn', 'tblastx'):
        dna_or_prot = "F"
    else:
        dna_or_prot = "T"

    has_fastadata = isinstance(fastadata, tuple) and len(fastadata) == 2
    has_filenames = isinstance(filenames, tuple) and len(filenames) == 2
    generated_input = False
    if has_fastadata and not filenames:
        # input is fasta headers and sequences -> temporary files are needed.
        # Headers end up in a shell command line, so keep only characters
        # that are safe in an unquoted file name.
        generated_input = True
        uniquetag = get_random_string_tag()
        safe_q = re.sub(r"[^A-Za-z0-9_.-]", "_", str(fastadata[0][0]))
        safe_s = re.sub(r"[^A-Za-z0-9_.-]", "_", str(fastadata[1][0]))
        fname_q = "_".join([uniquetag, safe_q, 'Q.fa'])
        fname_s = "_".join([uniquetag, safe_s, 'S.fa'])
    elif has_filenames and not fastadata:
        # input is (supposed to be) filenames
        fname_q, fname_s = filenames
    elif not filenames and not fastadata:
        raise ValueError("no input!")
    else:
        raise ValueError("inproper input!")

    succeeded = False
    try:
        if generated_input:
            for fname, record in ((fname_q, fastadata[0]),
                                  (fname_s, fastadata[1])):
                fh = open(fname, 'w')
                try:
                    fh.write(">%s\n%s" % (record[0], record[1]))
                finally:
                    fh.close()

        # formatdb
        OSsystem("%s -i %s -p %s" % (FORMATDB_PATH, fname_s, dna_or_prot))

        # and blastall!
        extra_params = " ".join(
            ["-%s %s" % (k, v) for k, v in extra_blastp_params.items()])
        ci, co, ce = osPopen3("%s -p %s %s -i %s -d %s " % (
            BLASTALL_PATH, blastprogram, extra_params, fname_q, fname_s))
        try:
            ci.close()
            if output == "ncbiparsed":
                blastallout = NCBIStandalone.BlastParser().parse(co)
            else:
                blastallout = co.read()
        finally:
            co.close()
            ce.close()
        succeeded = True
    finally:
        # After a failure only files generated here are cleaned up; files
        # supplied by the caller are left alone so the run can be repeated.
        if remove_files and (succeeded or generated_input):
            doomed = glob.glob("%s.*" % fname_s) + [fname_s, fname_q]
            for path in doomed:
                try:
                    osRemove(path)
                except OSError:
                    # do not mask the error that brought us here
                    if succeeded:
                        raise

    # and return!
    return blastallout
def clustalw(inputfile="", seqs={}, remove_inputfile=True, params={}):
    """
    Run ClustalW and return the parsed alignment.

    @type  inputfile: string
    @param inputfile: name of an existing ClustalW input file; mutually
                      exclusive with `seqs`

    @type  seqs: dict
    @param seqs: { header: sequence }; written to a temporary multi fasta
                 file with writeMultiFasta(); mutually exclusive with
                 `inputfile`. Only read here, never modified

    @type  remove_inputfile: Boolean
    @param remove_inputfile: remove the input file afterwards; this also
                             holds for a caller-supplied `inputfile`

    @type  params: dict
    @param params: extra ClustalW options, passed as "-key=value"

    @rtype:  tuple
    @return: ( _seqs, _alignment ) as returned by _parse_clustalw() for the
             .aln output file

    @raise ValueError: both or neither of `inputfile` and `seqs` are given
    @raise Exception:  `seqs` contains an empty sequence
    """
    if inputfile and seqs:
        raise ValueError("wrong usage!")
    if not inputfile and not seqs:
        raise ValueError("no input specified")

    generated_input = not inputfile
    if generated_input:
        # input is (hopefully) sequences; quick check if all are non-empty
        for seq in seqs.values():
            if not seq:
                raise Exception(
                    "no sequence string(s) specified: %s" % (seqs,))
        # make a kind of semi-unique filename
        uniqueid = get_random_string_tag()
        headers = [_nonstringheader2stringheader(hdr)
                   for hdr in list(seqs.keys())[0:5]]
        inputfile = uniqueid + "_" + "_".join(headers) + ".mfa"
        writeMultiFasta(seqs, inputfile)

    fname_in = inputfile

    # output filenames are derived from the input filename: everything
    # after the last dot (if any) is replaced by .aln / .dnd
    dotpos = fname_in.rfind(".")
    if dotpos == -1:
        base = fname_in
    else:
        base = fname_in[0:dotpos]
    fname_out = base + ".aln"
    fname_tree = base + ".dnd"

    paramstring = " ".join(["-%s=%s" % (k, v) for k, v in params.items()])

    succeeded = False
    try:
        ci, co = osPopen2(
            "%s %s %s" % (EXECUTABLE_CLUSTALW, fname_in, paramstring))
        ci.close()
        # stdout is read until EOF before the .aln file is parsed;
        # the text itself is not used
        co.read()
        co.close()
        # parse alignment output file
        _seqs, _alignment = _parse_clustalw(fname_out)
        succeeded = True
    finally:
        # delete tmp. created files. After a failure a caller-supplied
        # input file is kept, and missing output files are tolerated so
        # the original error is not masked.
        doomed = [fname_out, fname_tree]
        if remove_inputfile and (succeeded or generated_input):
            doomed.append(fname_in)
        for fname in doomed:
            try:
                osRemove(fname)
            except OSError:
                if succeeded:
                    raise

    # and return
    return (_seqs, _alignment)
def clustalw(inputfile="", seqs={}, remove_inputfile=True, params={}):
    """
    Run ClustalW and return the parsed alignment.

    @type  inputfile: string
    @param inputfile: name of an existing ClustalW input file; mutually
                      exclusive with `seqs`

    @type  seqs: dict
    @param seqs: { header: sequence }; written to a temporary multi fasta
                 file with writeMultiFasta(); mutually exclusive with
                 `inputfile`. Only read here, never modified

    @type  remove_inputfile: Boolean
    @param remove_inputfile: remove the input file afterwards; this also
                             holds for a caller-supplied `inputfile`

    @type  params: dict
    @param params: extra ClustalW options, passed as "-key=value"

    @rtype:  tuple
    @return: ( _seqs, _alignment ) as returned by _parse_clustalw() for the
             .aln output file

    @raise ValueError: both or neither of `inputfile` and `seqs` are given
    @raise Exception:  `seqs` contains an empty sequence
    """
    if inputfile and seqs:
        raise ValueError("wrong usage!")
    if not inputfile and not seqs:
        raise ValueError("no input specified")

    generated_input = not inputfile
    if generated_input:
        # input is (hopefully) sequences; quick check if all are non-empty
        for seq in seqs.values():
            if not seq:
                raise Exception(
                    "no sequence string(s) specified: %s" % (seqs,))
        # make a kind of semi-unique filename
        uniqueid = get_random_string_tag()
        headers = [_nonstringheader2stringheader(hdr)
                   for hdr in list(seqs.keys())[0:5]]
        inputfile = uniqueid + "_" + "_".join(headers) + ".mfa"
        writeMultiFasta(seqs, inputfile)

    fname_in = inputfile

    # output filenames are derived from the input filename: everything
    # after the last dot (if any) is replaced by .aln / .dnd
    dotpos = fname_in.rfind(".")
    if dotpos == -1:
        base = fname_in
    else:
        base = fname_in[0:dotpos]
    fname_out = base + ".aln"
    fname_tree = base + ".dnd"

    paramstring = " ".join(["-%s=%s" % (k, v) for k, v in params.items()])

    succeeded = False
    try:
        ci, co = osPopen2(
            "%s %s %s" % (EXECUTABLE_CLUSTALW, fname_in, paramstring))
        ci.close()
        # stdout is read until EOF before the .aln file is parsed;
        # the text itself is not used
        co.read()
        co.close()
        # parse alignment output file
        _seqs, _alignment = _parse_clustalw(fname_out)
        succeeded = True
    finally:
        # delete tmp. created files. After a failure a caller-supplied
        # input file is kept, and missing output files are tolerated so
        # the original error is not masked.
        doomed = [fname_out, fname_tree]
        if remove_inputfile and (succeeded or generated_input):
            doomed.append(fname_in)
        for fname in doomed:
            try:
                osRemove(fname)
            except OSError:
                if succeeded:
                    raise

    # and return
    return (_seqs, _alignment)
def get_cexpander_uniformly_aligned_count(self):
    """
    Run cexpander on the sequences of getmaxsrproteinsequences() and count
    the "1" characters in its binary output string.

    @rtype:  integer
    @return: number of "1" characters in the `binarystring` attribute of
             the object returned by runcexpander()
    """
    # TODO (original author): move running cexpander to one place
    mfa_fname = "%s.tmp.cexpander.mfa" % get_random_string_tag()

    # write all sequences as one multi fasta file
    mfa_handle = open(mfa_fname, 'w')
    fasta_records = [
        ">%s\n%s\n" % (node, seq)
        for node, seq in self.getmaxsrproteinsequences().iteritems()
    ]
    mfa_handle.write("".join(fasta_records))
    mfa_handle.close()

    # per the original author's note, runcexpander takes care of the
    # file cleanup, so the input file is not removed here
    cexpander_output = runcexpander(
        mfa_fname, cbalignp_commandline=" -y", output='binary')

    # do cexpander binary string evaluation
    return cexpander_output.binarystring.count("1")
def blastall_seq2db(header, sequence, dbname="", blastprogram="blastp",
                    output="ncbiparsed",
                    extra_blastp_params={'F': 'F', 'e': '10'}):
    """
    Blast a single sequence against an existing blast database.

    @type  header: * (converted with str())
    @param header: fasta header of the query sequence

    @type  sequence: string
    @param sequence: query sequence

    @type  dbname: string
    @param dbname: blast database, passed to blastall's -d option

    @type  blastprogram: string
    @param blastprogram: 'blastp', 'tblastn', 'blastn' or 'blastx'

    @type  output: string
    @param output: "ncbiparsed" returns the result of
                   NCBIStandalone.BlastParser().parse(); any other value
                   returns the raw text read from blastall's stdout

    @type  extra_blastp_params: dict
    @param extra_blastp_params: extra blastall options, passed as "-key value";
                                only read here, never modified

    @rtype:  * or False
    @return: blast result (see `output`), or False when running blastall
             or parsing its output raised an exception

    @raise ValueError: unsupported blastprogram
    """
    import re

    supported = ('blastp', 'tblastn', 'blastn', 'blastx')
    if blastprogram not in supported:
        raise ValueError("unsupported blastprogram %r; choose one of: %s" %
                         (blastprogram, ", ".join(supported)))

    extra_params = " ".join(
        ["-%s %s" % (k, v) for k, v in extra_blastp_params.items()])

    # generate (semi ;-) unique filename. Header and sequence start end up
    # in a shell command line, so every character that is not safe in an
    # unquoted file name (space, '|', '*', '/', ...) becomes an underscore.
    uniquetag = get_random_string_tag()
    basename = "_".join([uniquetag, str(header), sequence[0:10]])
    basename = re.sub(r"[^A-Za-z0-9_.-]", "_", basename) + ".fa"
    fname = osPathJoin(OSgetcwd(), basename)

    fh = open(fname, 'w')
    try:
        fh.write(">%s\n%s\n" % (header, sequence))
    finally:
        fh.close()

    command = "%s -p %s %s -i %s -d %s " % (
        BLASTALL_PATH, blastprogram, extra_params, fname, dbname)

    blastallout = False
    try:
        try:
            ci, co, ce = osPopen3(command)
            try:
                ci.close()
                if output == "ncbiparsed":
                    blastallout = NCBIStandalone.BlastParser().parse(co)
                else:
                    blastallout = co.read()
            finally:
                co.close()
                ce.close()
        except Exception:
            # for some kind of - obvious or freak accident case -
            # Blast or parsing of the blast record failed
            # No debugging here; just cleanup and return False.
            # KeyboardInterrupt / SystemExit are deliberately not caught.
            print("BLAST CRASHED::")
            print(command)
            blastallout = False
    finally:
        # remove the created Query file, whatever happened
        osRemove(fname)

    # and return!
    return blastallout
def get_cexpander_uniformly_aligned_count(self):
    """
    Run cexpander on the sequences of getmaxsrproteinsequences() and count
    the "1" characters in its binary output string.

    @rtype:  integer
    @return: number of "1" characters in the `binarystring` attribute of
             the object returned by runcexpander()
    """
    # TODO (original author): move running cexpander to one place
    mfa_fname = "%s.tmp.cexpander.mfa" % get_random_string_tag()

    # write all sequences as one multi fasta file
    mfa_handle = open(mfa_fname, 'w')
    fasta_records = [
        ">%s\n%s\n" % (node, seq)
        for node, seq in self.getmaxsrproteinsequences().iteritems()
    ]
    mfa_handle.write("".join(fasta_records))
    mfa_handle.close()

    # per the original author's note, runcexpander takes care of the
    # file cleanup, so the input file is not removed here
    cexpander_output = runcexpander(
        mfa_fname, cbalignp_commandline=" -y", output='binary')

    # do cexpander binary string evaluation
    return cexpander_output.binarystring.count("1")
def get_unguided_nt_identity(self):
    """
    Get identity of an UNGUIDED (ClustalW) re-alignment of both DNA sequences

    @rtype:  float
    @return: identical alignment columns divided by the length of the
             realigned query (a ratio, not a percentage); 0.0 when
             self.length is zero
    """
    # zero-sized object -> nothing to align
    if self.length == 0:
        return 0.0

    # get DNA sequences and strip the gaps of the existing alignment
    gapped_q, gapped_s = self.get_aligned_dna_sequences()
    seq_q = gapped_q.replace("-", "")
    seq_s = gapped_s.replace("-", "")

    # make (semi) unique headers
    tag = get_random_string_tag()
    (qs, qe, ss, se) = self.barcode()[0:4]
    header_q = "query%s%s%s" % (qs, qe, tag)
    header_s = "sbjct%s%s%s" % (ss, se, tag)

    # prepare & run clustalw
    out, alignment = clustalw(seqs={header_q: seq_q, header_s: seq_s})

    # count the alignment columns in which both sequences are identical
    identical = sum(
        1 for pos in range(0, len(out[header_q]))
        if out[header_q][pos] == out[header_s][pos]
    )

    # return relative ratio
    return float(identical) / len(out[header_q])
def get_unguided_nt_identity(self):
    """
    Get identity of an UNGUIDED (ClustalW) re-alignment of both DNA sequences

    @rtype:  float
    @return: identical alignment columns divided by the length of the
             realigned query (a ratio, not a percentage); 0.0 when
             self.length is zero
    """
    # zero-sized object -> nothing to align
    if self.length == 0:
        return 0.0

    # get DNA sequences and strip the gaps of the existing alignment
    gapped_q, gapped_s = self.get_aligned_dna_sequences()
    seq_q = gapped_q.replace("-", "")
    seq_s = gapped_s.replace("-", "")

    # make (semi) unique headers
    tag = get_random_string_tag()
    (qs, qe, ss, se) = self.barcode()[0:4]
    header_q = "query%s%s%s" % (qs, qe, tag)
    header_s = "sbjct%s%s%s" % (ss, se, tag)

    # prepare & run clustalw
    out, alignment = clustalw(seqs={header_q: seq_q, header_s: seq_s})

    # count the alignment columns in which both sequences are identical
    identical = sum(
        1 for pos in range(0, len(out[header_q]))
        if out[header_q][pos] == out[header_s][pos]
    )

    # return relative ratio
    return float(identical) / len(out[header_q])
def blastall_seq2db(header, sequence, dbname="", blastprogram="blastp",
                    output="ncbiparsed",
                    extra_blastp_params={'F': 'F', 'e': '10'}):
    """
    Blast a single sequence against an existing blast database.

    @type  header: * (converted with str())
    @param header: fasta header of the query sequence

    @type  sequence: string
    @param sequence: query sequence

    @type  dbname: string
    @param dbname: blast database, passed to blastall's -d option

    @type  blastprogram: string
    @param blastprogram: 'blastp', 'tblastn', 'blastn' or 'blastx'

    @type  output: string
    @param output: "ncbiparsed" returns the result of
                   NCBIStandalone.BlastParser().parse(); any other value
                   returns the raw text read from blastall's stdout

    @type  extra_blastp_params: dict
    @param extra_blastp_params: extra blastall options, passed as "-key value";
                                only read here, never modified

    @rtype:  * or False
    @return: blast result (see `output`), or False when running blastall
             or parsing its output raised an exception

    @raise ValueError: unsupported blastprogram
    """
    import re

    supported = ('blastp', 'tblastn', 'blastn', 'blastx')
    if blastprogram not in supported:
        raise ValueError("unsupported blastprogram %r; choose one of: %s" %
                         (blastprogram, ", ".join(supported)))

    extra_params = " ".join(
        ["-%s %s" % (k, v) for k, v in extra_blastp_params.items()])

    # generate (semi ;-) unique filename. Header and sequence start end up
    # in a shell command line, so every character that is not safe in an
    # unquoted file name (space, '|', '*', '/', ...) becomes an underscore.
    uniquetag = get_random_string_tag()
    basename = "_".join([uniquetag, str(header), sequence[0:10]])
    basename = re.sub(r"[^A-Za-z0-9_.-]", "_", basename) + ".fa"
    fname = osPathJoin(OSgetcwd(), basename)

    fh = open(fname, 'w')
    try:
        fh.write(">%s\n%s\n" % (header, sequence))
    finally:
        fh.close()

    command = "%s -p %s %s -i %s -d %s " % (
        BLASTALL_PATH, blastprogram, extra_params, fname, dbname)

    blastallout = False
    try:
        try:
            ci, co, ce = osPopen3(command)
            try:
                ci.close()
                if output == "ncbiparsed":
                    blastallout = NCBIStandalone.BlastParser().parse(co)
                else:
                    blastallout = co.read()
            finally:
                co.close()
                ce.close()
        except Exception:
            # for some kind of - obvious or freak accident case -
            # Blast or parsing of the blast record failed
            # No debugging here; just cleanup and return False.
            # KeyboardInterrupt / SystemExit are deliberately not caught.
            print("BLAST CRASHED::")
            print(command)
            blastallout = False
    finally:
        # remove the created Query file, whatever happened
        osRemove(fname)

    # and return!
    return blastallout
def _create_hmm_db(organism, inputdict, cbg, prev, next, orf_must_have_start=False, max_intron_nt_length=200, verbose=False): """ Create fasta ORF database for a organism in a CBG and its viscinity @type organism: * (presumably string) @param organism: Organism identifier recognizable in <input data structure> @type inputdict: dict @param inputdict: <input data structure> @type cbg: CodingBlockGraph or related object @param cbg: CodingBlockGraph upstream/5p of the cbg that must be completed @type prev: CodingBlockGraph or related object (or None) @param prev: CodingBlockGraph upstream/5p of cbg that must be completed @type next: CodingBlockGraph or related object (or None) @param next: CodingBlockGraph downstream/3p of cbg that must be completed @attention: `prev` and `next` CodingBlockGraphs reduce the search space of ORFs to scan with the HMM profile. This Speeds up and improves the quality of results. @type orf_must_have_start: Boolean @param orf_must_have_start: only allow ORFs with methionines as sbjct ORFs @type max_intron_nt_length: integer @param max_intron_nt_length: positive maximum intron length to take into acount when selecting suitable ORFs @type verbose: Boolean @param verbose: report debugging-report on STDOUT (True) or be quiet (False) """ # fullpath filename of result hmm multi fasta database fname_hmm_db_mfa = None if not cbg: return fname_hmm_db_mfa # (1) try to limit searchspace by prev and next CBG prevNode, nextNode = None, None prevMin, nextMax = None, None maskcoords = [] # (1a) check if (informant) organism is in the prev CBG AND if this CBG # has an OMSR -> not per se the case! if prev and organism in prev.organism_set() and\ prev.has_overall_minimal_spanning_range(): prevNode = prev.node_by_organism(organism) try: omsr = prev.overall_minimal_spanning_range(organism=organism) prevMin = (max(omsr) + 1) * 3 maskcoords.append((0, max(omsr))) except KeyError: # hmmm.... block has an OMSR, but not for this organism!??!!? 
pass # (1b) check if (informant) organism is in the next CBG AND if this CBG # has an OMSR -> not per se the case! if next and organism in next.organism_set() and\ next.has_overall_minimal_spanning_range(): nextNode = next.node_by_organism(organism) try: omsr = next.overall_minimal_spanning_range(organism=organism) nextMax = min(omsr) * 3 aaseqlen = len(inputdict[organism]['genomeseq']) / 3 maskcoords.append((min(omsr), aaseqlen)) except KeyError: # hmmm.... block has an OMSR, but not for this organism!??!!? pass # (1c) limit search space if only prev or next was specified if not prev and next and nextMax: prevMin = nextMax - max_intron_nt_length if not next and prev and prevMin: nextMax = prevMin + max_intron_nt_length # (2a) get elegiable sets of orfs from prev and next if not orf_must_have_start: elegiable_orfs = inputdict[organism]['orfs'].get_eligible_orfs( min_orf_end=prevMin, max_orf_start=nextMax) else: # ORFs *must* have starts => searching for a TSS exon/CBG elegiable_orfs = inputdict[organism]['orfs'].get_eligible_orfs( min_orf_end=prevMin, max_orf_start=nextMax, has_starts=True) # (2b) check orf count; can be zero in case of a very tiny region to check if not elegiable_orfs: return fname_hmm_db_mfa # (3) write masked orfs to fasta database multi line string db_fasta = inputdict[organism]['orfs'].tomaskedfasta( coords=maskcoords, orflist=elegiable_orfs, header_prefix=organism) if orf_must_have_start: if len(db_fasta.strip()) == 0: # no UNmasked suitable ORFs remaining! 
# This is recognized lateron in this function pass else: # mask out all AAs before the first start lines = db_fasta.split("\n") for linenr in range(0, len(lines)): line = lines[linenr] if line[0] != ">": mpos = line.find("M") if mpos > 0: line = "X" * mpos + line[mpos:] lines[linenr] = line # recreate db_fasta string db_fasta = "\n".join(lines) ############################################################################ if verbose: if len(elegiable_orfs) > 10: orfidlist = len(elegiable_orfs) else: orfidlist = [orf.id for orf in elegiable_orfs] print "hmm-elegibable orfs:", organism, orfidlist, "/", print len(inputdict[organism]['orfs'].orfs), "prevMin:", prevMin, if prev: print prev.has_overall_minimal_spanning_range(), else: print None, print "nextMax:", nextMax, if next: print next.has_overall_minimal_spanning_range() else: print None ############################################################################ # (4) make unique filename for hmm database file fname_base = get_random_string_tag() fname_hmm_db_mfa = "hmm_database_%s_%s.fa" % (fname_base, organism) # (5) write masked orfs to fasta database fh = open(fname_hmm_db_mfa, 'w') fh.write(db_fasta) fh.close() # (6) make shure that there where orfs written to file; # in case very little orfs are selected and all are masked -> no files! seqs_in_db = parseFasta(open(fname_hmm_db_mfa).readlines()) if not seqs_in_db: # delete this (empty) file osRemove(fname_hmm_db_mfa) return None # (7) return hmm search database filename return fname_hmm_db_mfa
def createblastdbs(input, GSG, OPTIONS, dbfraction=None, organism=None,
                   acceptorfids=[], rejectorfids=[]):
    """
    (Re)create blast-db's by masking the areas that are incorporated in the GSG

    @type  input: dict
    @param input: `input` data structure dictionary; input[org]['blastdb']
                  is set to the new database filename

    @type  GSG: GenestructureOfCodingBlockGraphs
    @param GSG: GenestructureOfCodingBlockGraphs instance

    @type  OPTIONS: optparse options instance
    @param OPTIONS: optparse options instance (attributes 'outdir' and
                    'abinitio' are used)

    @type  dbfraction: string
    @param dbfraction: 'GSGupstream', 'GSGcentral' or 'GSGdownstream' limit
                       the ORFs relative to the GSG; any other value
                       (None, 'all', ...) takes all ORFs. Only used when
                       the GSG is not empty

    @type  organism: organism identifier
    @param organism: only recreate blastdb for this organism/gene identifier

    @type  acceptorfids: list with integers
    @param acceptorfids: list of orf ids to accept

    @type  rejectorfids: list with integers
    @param rejectorfids: list of orf ids to reject

    @attention: acceptorfids and rejectorfids are only used when organism is specified!

    @rtype:  dict
    @return: { organism identifier: number of sequences in its blast database }
    """
    # acceptorfids and rejectorfids only valid in combi with `organism`
    if not organism:
        acceptorfids, rejectorfids = [], []

    seqsindb = {}
    for org in list(input.keys()):
        # if organism is given, do only this one
        if organism and org != organism:
            continue

        # assign blast database name / multi fasta file and open filehandle
        uniquetag = get_random_string_tag()
        fname = '%s-blastdb-%s.fa' % (uniquetag, org)
        fullpath = osPathJoin(OPTIONS.outdir, fname)
        seqsindb[org] = 0

        fh = open(fullpath, 'w')
        try:
            if len(GSG):
                # there is already a GSG, so this is not the first blast
                # iteration; OPTIONS.abinitio is not looked at here
                coords = GSG.omsr2mask(org)
                orf_filter = {'acceptorfids': acceptorfids,
                              'rejectorfids': rejectorfids}
                if dbfraction == 'GSGupstream':
                    # take only orfs LEFT of the first CBG in GSG
                    orf_filter['max_orf_start'] = max(
                        GSG[0].overall_minimal_spanning_range(
                            organism=org)) * 3
                elif dbfraction == 'GSGdownstream':
                    # take only orfs RIGHT of the last CBG in GSG
                    orf_filter['min_orf_end'] = min(
                        GSG[-1].overall_minimal_spanning_range(
                            organism=org)) * 3
                elif dbfraction == 'GSGcentral':
                    # take only orfs in between FIRST and LAST CBG in GSG
                    # (can be only one CBG!)
                    orf_filter['max_orf_start'] = max(
                        GSG[-1].overall_minimal_spanning_range(
                            organism=org)) * 3
                    orf_filter['min_orf_end'] = min(
                        GSG[0].overall_minimal_spanning_range(
                            organism=org)) * 3
                # else: dbfraction equals 'all' or None -> no limitation
                # beyond the sublists of orfids
                orflist = input[org]['orfs'].get_elegiable_orfs(**orf_filter)

                # create masked fasta of this sequence part only
                newfasta = input[org]['orfs'].tomaskedfasta(
                    coords=coords, orflist=orflist, header_prefix=org)
                # write to file and count accessions -> seqsindb[org]
                fh.write(newfasta)
                seqsindb[org] = newfasta.count(">")
            else:
                # No filled GSG objects -> no a priori knowledge yet
                # TODO: dbfraction is not checked/used here -> just
                # OPTIONS.abinitio
                for orf in input[org]['orfs'].orfs:
                    # in case not abinitio, make only a db of orfs in the
                    # current annotation. `== False` is kept on purpose:
                    # an abinitio value of None does not restrict the orfs.
                    if OPTIONS.abinitio == False and \
                            orf.id not in input[org]['orfid-genestructure']:
                        continue
                    if orf.id in rejectorfids:
                        # ignore Orfs that are listed as to-be-ignored
                        continue
                    if acceptorfids and orf.id not in acceptorfids:
                        # ignore Orfs that are not listed as to-be-accepted
                        continue
                    # write fasta of orf to file
                    fh.write(orf.tofasta(
                        header="%s_orf_%s" % (org, orf.id)) + "\n")
                    seqsindb[org] += 1
        finally:
            # close the filehandle, also when selecting or masking failed
            fh.close()

        # run formatdb
        formatdb(fname=fullpath)
        # set name of blastdb in infodict
        input[org]['blastdb'] = fullpath

    # return the counter of how many orf sequences are stored per database
    return seqsindb
def createblastdbs(input, GSG, OPTIONS, dbfraction=None, organism=None,
                   acceptorfids=[], rejectorfids=[]):
    """
    (Re)create blast-db's by masking the areas that are incorporated in the GSG

    @type  input: dict
    @param input: `input` data structure dictionary; input[org]['blastdb']
                  is set to the new database filename

    @type  GSG: GenestructureOfCodingBlockGraphs
    @param GSG: GenestructureOfCodingBlockGraphs instance

    @type  OPTIONS: optparse options instance
    @param OPTIONS: optparse options instance (attributes 'outdir' and
                    'abinitio' are used)

    @type  dbfraction: string
    @param dbfraction: 'GSGupstream', 'GSGcentral' or 'GSGdownstream' limit
                       the ORFs relative to the GSG; any other value
                       (None, 'all', ...) takes all ORFs. Only used when
                       the GSG is not empty

    @type  organism: organism identifier
    @param organism: only recreate blastdb for this organism/gene identifier

    @type  acceptorfids: list with integers
    @param acceptorfids: list of orf ids to accept

    @type  rejectorfids: list with integers
    @param rejectorfids: list of orf ids to reject

    @attention: acceptorfids and rejectorfids are only used when organism is specified!

    @rtype:  dict
    @return: { organism identifier: number of sequences in its blast database }
    """
    # acceptorfids and rejectorfids only valid in combi with `organism`
    if not organism:
        acceptorfids, rejectorfids = [], []

    seqsindb = {}
    for org in list(input.keys()):
        # if organism is given, do only this one
        if organism and org != organism:
            continue

        # assign blast database name / multi fasta file and open filehandle
        uniquetag = get_random_string_tag()
        fname = '%s-blastdb-%s.fa' % (uniquetag, org)
        fullpath = osPathJoin(OPTIONS.outdir, fname)
        seqsindb[org] = 0

        fh = open(fullpath, 'w')
        try:
            if len(GSG):
                # there is already a GSG, so this is not the first blast
                # iteration; OPTIONS.abinitio is not looked at here
                coords = GSG.omsr2mask(org)
                orf_filter = {'acceptorfids': acceptorfids,
                              'rejectorfids': rejectorfids}
                if dbfraction == 'GSGupstream':
                    # take only orfs LEFT of the first CBG in GSG
                    orf_filter['max_orf_start'] = max(
                        GSG[0].overall_minimal_spanning_range(
                            organism=org)) * 3
                elif dbfraction == 'GSGdownstream':
                    # take only orfs RIGHT of the last CBG in GSG
                    orf_filter['min_orf_end'] = min(
                        GSG[-1].overall_minimal_spanning_range(
                            organism=org)) * 3
                elif dbfraction == 'GSGcentral':
                    # take only orfs in between FIRST and LAST CBG in GSG
                    # (can be only one CBG!)
                    orf_filter['max_orf_start'] = max(
                        GSG[-1].overall_minimal_spanning_range(
                            organism=org)) * 3
                    orf_filter['min_orf_end'] = min(
                        GSG[0].overall_minimal_spanning_range(
                            organism=org)) * 3
                # else: dbfraction equals 'all' or None -> no limitation
                # beyond the sublists of orfids
                orflist = input[org]['orfs'].get_elegiable_orfs(**orf_filter)

                # create masked fasta of this sequence part only
                newfasta = input[org]['orfs'].tomaskedfasta(
                    coords=coords, orflist=orflist, header_prefix=org)
                # write to file and count accessions -> seqsindb[org]
                fh.write(newfasta)
                seqsindb[org] = newfasta.count(">")
            else:
                # No filled GSG objects -> no a priori knowledge yet
                # TODO: dbfraction is not checked/used here -> just
                # OPTIONS.abinitio
                for orf in input[org]['orfs'].orfs:
                    # in case not abinitio, make only a db of orfs in the
                    # current annotation. `== False` is kept on purpose:
                    # an abinitio value of None does not restrict the orfs.
                    if OPTIONS.abinitio == False and \
                            orf.id not in input[org]['orfid-genestructure']:
                        continue
                    if orf.id in rejectorfids:
                        # ignore Orfs that are listed as to-be-ignored
                        continue
                    if acceptorfids and orf.id not in acceptorfids:
                        # ignore Orfs that are not listed as to-be-accepted
                        continue
                    # write fasta of orf to file
                    fh.write(orf.tofasta(
                        header="%s_orf_%s" % (org, orf.id)) + "\n")
                    seqsindb[org] += 1
        finally:
            # close the filehandle, also when selecting or masking failed
            fh.close()

        # run formatdb
        formatdb(fname=fullpath)
        # set name of blastdb in infodict
        input[org]['blastdb'] = fullpath

    # return the counter of how many orf sequences are stored per database
    return seqsindb
def _create_hmm_profile(cbg,area="OMSR",prevcbg=None,nextcbg=None, strip_nonaligned_residues=False, verbose=False,**kwargs): """ """ # area must be one of # OMSR MINSR MAXSR # LEFTSPRDIF RIGTHSPRDIF # OMSRANDLEFTSPRDIF OMSRANDRIGTHSPRDIF # RIGTHORFEND # update to default value if not kwargs.has_key('sprdif_min_aa_length'): kwargs['sprdif_min_aa_length'] = 20 if area == "OMSR": if cbg.has_overall_minimal_spanning_range(): coords = cbg.overall_minimal_spanning_range() else: return None, {} elif area == "MINSR": if cbg.has_minimal_spanning_range(): coords = cbg.minimal_spanning_range() else: return None, {} elif area == "MAXSR": if cbg.has_maximal_spanning_range(): coords = cbg.maximal_spanning_range() else: return None, {} elif area == "LEFTSPRDIF": if cbg.has_left_spanningrange_difference(**kwargs): coords = cbg.left_spanningrange_difference(**kwargs) else: return None, {} elif area == "RIGTHSPRDIF": if cbg.has_rigth_spanningrange_difference(**kwargs): coords = cbg.rigth_spanningrange_difference(**kwargs) else: return None, {} elif area == "OMSRANDLEFTSPRDIF": kwargs['sprdif_min_aa_length'] = 20 if not cbg.has_overall_minimal_spanning_range() or\ not cbg.has_left_spanningrange_difference(**kwargs): return None, {} # if here, start preparing coords coords = cbg.left_spanningrange_difference(**kwargs) # remove short contributors to left SPRDIF coords = _remove_short_sprdif_contributors(coords,verbose=verbose) # increase coord range by OMSR area omsr = cbg.overall_minimal_spanning_range() for node,coordrange in coords.iteritems(): coords[node] = Set( range( min(coordrange), max(omsr[node])+1 ) ) elif area == "OMSRANDRIGTHSPRDIF": kwargs['sprdif_min_aa_length'] = 20 if not cbg.has_overall_minimal_spanning_range() or\ not cbg.has_rigth_spanningrange_difference(**kwargs): return None, {} # if here, start preparing coords coords = cbg.rigth_spanningrange_difference(**kwargs) # remove short contributors to left SPRDIF coords = 
_remove_short_sprdif_contributors(coords,verbose=verbose) # increase coord range by OMSR area omsr = cbg.overall_minimal_spanning_range() for node,coordrange in coords.iteritems(): coords[node] = Set( range( min(omsr[node]), max(coordrange)+1 ) ) elif area == "RIGTHORFEND": # area in between MAXSR and orfend if not cbg.has_maximal_spanning_range(): return None, {} # get coords & obtain Orf ends coords = cbg.maximal_spanning_range() nodes = coords.keys() for node in nodes: organism = cbg.organism_by_node(node) theorf = cbg.get_orfs_of_graph(organism=organism)[0] coords[node] = range(max(coords[node])+1,theorf.protein_endPY) # remove zero-length ranges if len(coords[node]) == 0: del(coords[node]) else: raise "WHAT ELSE!?" ############################################################################ if verbose: print area, sum([(max(v)-min(v)) for k,v in coords.iteritems()]),len(coords) ############################################################################ # decrease coord range by prevcbg if applicable if area in ["MAXSR","LEFTSPRDIF","OMSRANDLEFTSPRDIF"] and prevcbg: omsr = prevcbg.overall_minimal_spanning_range() for org in cbg.organism_set().intersection( prevcbg.organism_set() ): # omsr/coords have Node keys -> translate to Organism keys nodeCbg = cbg.get_organism_nodes(org)[0] nodePrev = prevcbg.get_organism_nodes(org)[0] # check if node not deleted earlier in coords dict if not coords.has_key(nodeCbg): continue if not omsr.has_key(nodePrev): continue sta = max( [ max(omsr[nodePrev])+1, min(coords[nodeCbg]) ] ) end = max(coords[nodeCbg])+1 coords[nodeCbg] = Set(range(sta,end)) if not coords[nodeCbg]: del( coords[nodeCbg] ) # decrease coord range by nextcbg if applicable if area in ["MAXSR","RIGTHSPRDIF","OMSRANDRIGTHSPRDIF"] and nextcbg: omsr = nextcbg.overall_minimal_spanning_range() for org in cbg.organism_set().intersection( nextcbg.organism_set() ): # omsr/coords have Node keys -> translate to Organism keys nodeCbg = cbg.get_organism_nodes(org)[0] 
nodeNext = nextcbg.get_organism_nodes(org)[0] # check if node not deleted earlier in coords dict if not coords.has_key(nodeCbg): continue if not omsr.has_key(nodeNext): continue sta = min(coords[nodeCbg]) end = min( [ min(omsr[nodeNext]), max(coords[nodeCbg])+1 ] ) coords[nodeCbg] = Set(range(sta,end)) if not coords[nodeCbg]: del( coords[nodeCbg] ) # check if coords still present if not coords: return None, {} ############################################################################ if verbose: print area, sum([(max(v)-min(v)) for k,v in coords.iteritems()]),len(coords) ############################################################################ # do/redo _remove_short_sprdif_contributors id required if area in ["MAXSR","LEFTSPRDIF","RIGTHSPRDIF", "OMSRANDLEFTSPRDIF","OMSRANDRIGTHSPRDIF","RIGTHORFEND"]: coords = _remove_short_sprdif_contributors(coords) ############################################################################ if verbose: print area, sum([(max(v)-min(v)) for k,v in coords.iteritems()]),len(coords) ############################################################################ # check if at least 2 sequences/nodes are remaining if len(coords) <= 1: return None, {} # check sprdif_min_aa_length if applicable if area in ["RIGTHSPRDIF","LEFTSPRDIF","OMSRANDRIGTHSPRDIF", "OMSRANDLEFTSPRDIF"]: maxlength = max([ len(vlist) for vlist in coords.values() ]) if maxlength < kwargs['sprdif_min_aa_length']: return None, {} # if here, obtain sequences and build HMM search profile # get fasta sequences and fastaseqs = cbg._get_sequences_by_coords(coords) # rewrite dict (node) keys to string keys fastaseqs, coords = _rename_dict_keys_to_strings(fastaseqs, coords) # remove empty sequence strings from fastaseqs dict empty_seq_keys = [] for k,seq in fastaseqs.iteritems(): if seq == "" or len(seq) == 1: empty_seq_keys.append(k) for k in empty_seq_keys: del(coords[k]) del(fastaseqs[k]) # check (again) if at least 2 sequences/nodes are remaining if len(coords) <= 1: 
return None, {} # rewrite coords to (min,max) tuple coords = dict([ (key,[min(vlist),max(vlist)+1]) for key,vlist in coords.iteritems() ]) # perform clustalw multiple alignment (alignedseqs,alignment) = clustalw( seqs= fastaseqs ) # strip exterior gaps in case of OMSR/MINSR area if area in ["OMSR","MINSR"]: alignedseqs,alignment,coords = strip_alignment_for_exterior_gaps( deepcopy(alignedseqs),deepcopy(alignment),deepcopy(coords) ) # strip poorly conserved residues in case of RIGTHORFEND if area in ["RIGTHORFEND"]: alignedseqs,alignment,coords = strip_poorly_supported_tails( deepcopy(alignedseqs),deepcopy(alignment),deepcopy(coords),0.20 ) # strip_overall_nonaligned_residues if requested for: THIS IS VERY RIGID! if strip_nonaligned_residues: alignedseqs,alignment,coords = strip_overall_nonaligned_residues( deepcopy(alignedseqs),deepcopy(alignment),deepcopy(coords) ) # check if alignment was completely consumed or not if not alignment or len(alignment) <= 1: return None, {} ############################################################################ if verbose: print "## HMM clustalw input profile:",prevcbg!=None,area,nextcbg!=None for node,algseq in alignedseqs.iteritems(): print algseq, node, coords[node] print alignment ############################################################################ # make unique filename for hmm profile file fname_hmm_profile = "hmmbuild_profile_%s.hmmprof" % get_random_string_tag() # write multiple alignment input file writeMultiFasta(alignedseqs,fname_hmm_profile) # make hmmbuild file of the multiplealignment fname_hmmbuild_file = hmmbuild_protein( fname_hmm_profile ) # remove hmm profile multiple alignment file osRemove(fname_hmm_profile) # return HMM serach profile filename return fname_hmmbuild_file, coords
def _mask_residues_before_first_start(db_fasta):
    """
    Replace all residues in front of the first 'M' on a sequence line by 'X'

    @type  db_fasta: string
    @param db_fasta: (multi) fasta formatted string

    @attention: header lines (starting with '>') and empty lines are untouched
    @attention: masking is done per LINE; lines without 'M' are untouched

    @rtype:  string
    @return: fasta string with exactly as many lines as the input
    """
    lines = db_fasta.split("\n")
    for linenr, line in enumerate(lines):
        # empty lines (e.g. caused by a trailing newline) have no line[0];
        # skip them together with the fasta header lines
        if not line or line.startswith(">"):
            continue
        mpos = line.find("M")
        # mpos == 0: nothing in front of the start; mpos == -1: no start
        if mpos > 0:
            lines[linenr] = "X" * mpos + line[mpos:]
    return "\n".join(lines)


def _create_hmm_db(organism, inputdict, cbg, prev, next,
        orf_must_have_start=False, max_intron_nt_length=200,
        verbose=False):
    """
    Create fasta ORF database for a organism in a CBG and its viscinity

    @type  organism: * (presumably string)
    @param organism: Organism identifier recognizable in <input data structure>

    @type  inputdict: dict
    @param inputdict: <input data structure>; inputdict[organism]['orfs'] and
                      inputdict[organism]['genomeseq'] are used here

    @type  cbg: CodingBlockGraph or related object
    @param cbg: CodingBlockGraph that must be completed; in this function it
                is only tested for truth (no cbg -> no database)

    @type  prev: CodingBlockGraph or related object (or None)
    @param prev: CodingBlockGraph upstream/5p of cbg that must be completed

    @type  next: CodingBlockGraph or related object (or None)
    @param next: CodingBlockGraph downstream/3p of cbg that must be completed

    @attention: `prev` and `next` CodingBlockGraphs reduce the search space of
                ORFs to scan with the HMM profile. This Speeds up and improves
                the quality of results.

    @type  orf_must_have_start: Boolean
    @param orf_must_have_start: only allow ORFs with methionines as sbjct ORFs

    @type  max_intron_nt_length: integer
    @param max_intron_nt_length: positive maximum intron length to take into
                acount when selecting suitable ORFs

    @type  verbose: Boolean
    @param verbose: report debugging-report on STDOUT (True) or be quiet (False)

    @rtype:  string or None
    @return: filename of the written fasta database; None when no cbg was
             given, when no ORFs are eligible or when the written database
             contains no sequences
    """
    if not cbg:
        return None

    # (1) try to limit searchspace by prev and next CBG
    prevMin, nextMax = None, None
    maskcoords = []

    # (1a) check if (informant) organism is in the prev CBG AND if this CBG
    # has an OMSR -> not per se the case!
    if prev and organism in prev.organism_set() and\
    prev.has_overall_minimal_spanning_range():
        try:
            omsr = prev.overall_minimal_spanning_range(organism=organism)
            prevMin = (max(omsr) + 1) * 3
            maskcoords.append((0, max(omsr)))
        except KeyError:
            # hmmm.... block has an OMSR, but not for this organism!??!!?
            pass

    # (1b) check if (informant) organism is in the next CBG AND if this CBG
    # has an OMSR -> not per se the case!
    if next and organism in next.organism_set() and\
    next.has_overall_minimal_spanning_range():
        try:
            omsr = next.overall_minimal_spanning_range(organism=organism)
            nextMax = min(omsr) * 3
            # floor division: length of the genome sequence in whole AAs
            aaseqlen = len(inputdict[organism]['genomeseq']) // 3
            maskcoords.append((min(omsr), aaseqlen))
        except KeyError:
            # hmmm.... block has an OMSR, but not for this organism!??!!?
            pass

    # (1c) limit search space if only prev or next was specified
    if not prev and next and nextMax:
        prevMin = nextMax - max_intron_nt_length
    if not next and prev and prevMin:
        nextMax = prevMin + max_intron_nt_length

    # (2a) get elegiable sets of orfs from prev and next
    orfset = inputdict[organism]['orfs']
    orf_selection = {'min_orf_end': prevMin, 'max_orf_start': nextMax}
    if orf_must_have_start:
        # ORFs *must* have starts => searching for a TSS exon/CBG
        orf_selection['has_starts'] = True
    elegiable_orfs = orfset.get_eligible_orfs(**orf_selection)

    # (2b) check orf count; can be zero in case of a very tiny region to check
    if not elegiable_orfs:
        return None

    # (3) write masked orfs to fasta database multi line string
    db_fasta = orfset.tomaskedfasta(
            coords=maskcoords,
            orflist=elegiable_orfs,
            header_prefix=organism)
    # an all-whitespace db_fasta means: no UNmasked suitable ORFs remaining;
    # that case is recognized in step (6)
    if orf_must_have_start and db_fasta.strip():
        # mask out all AAs before the first start
        db_fasta = _mask_residues_before_first_start(db_fasta)

    ############################################################################
    if verbose:
        if len(elegiable_orfs) > 10:
            orfidlist = len(elegiable_orfs)
        else:
            orfidlist = [orf.id for orf in elegiable_orfs]
        prev_has_omsr, next_has_omsr = None, None
        if prev:
            prev_has_omsr = prev.has_overall_minimal_spanning_range()
        if next:
            next_has_omsr = next.has_overall_minimal_spanning_range()
        print("hmm-elegibable orfs: %s %s / %s prevMin: %s %s nextMax: %s %s" % (
            organism, orfidlist, len(orfset.orfs),
            prevMin, prev_has_omsr, nextMax, next_has_omsr))
    ############################################################################

    # (4) make unique filename for hmm database file
    fname_base = get_random_string_tag()
    fname_hmm_db_mfa = "hmm_database_%s_%s.fa" % (fname_base, organism)

    # (5) write masked orfs to fasta database
    fh = open(fname_hmm_db_mfa, 'w')
    try:
        fh.write(db_fasta)
    finally:
        fh.close()

    # (6) make shure that there where orfs written to file;
    # in case very little orfs are selected and all are masked -> no files!
    fh = open(fname_hmm_db_mfa)
    try:
        seqs_in_db = parseFasta(fh.readlines())
    finally:
        fh.close()
    if not seqs_in_db:
        # delete this (empty) file
        osRemove(fname_hmm_db_mfa)
        return None

    # (7) return hmm search database filename
    return fname_hmm_db_mfa
def runcexpander(fname_fasta, cbalignp_commandline=" -y", output='binary'):
    """
    Run the complete cascade of cexpander algorithms on an input multi fasta
    file and return the output as a CexpanderOutput object

    @type  fname_fasta: string
    @param fname_fasta: path to input multi fasta file

    @type  cbalignp_commandline: string
    @param cbalignp_commandline: (extra) command line for cbalignp

    @type  output: string
    @param output: 'float' requests the $dumpcvc dump; any other value
                   (default 'binary') requests the $dumpcv dump

    @attention: requires global variable EXECUTABLE_CEXPANDER_ALLVSALL
    @attention: requires global variable EXECUTABLE_CEXPANDER_CBALIGNP
    @attention: requires global variable EXECUTABLE_CEXPANDER_CEXPANDER
    @attention: see cexpander_dr for (additional) command line options
    @attention: only a subset of cexpander_dr commandline options are supported!
    @attention: the input file fname_fasta itself is REMOVED in the cleanup
                step, together with all intermediate files

    @raise ValueError: no fname_fasta was specified
    @raise IOError: fname_fasta is not an existing file

    @rtype:  CexpanderOutput object
    @return: CexpanderOutput object (the result of parse_cexpander)
    """
    # string exceptions (raise "...") are a TypeError since Python 2.6;
    # raise real exceptions carrying the original identifiers as message
    if not fname_fasta:
        raise ValueError("NoProperFunctionArguments")
    if not osPathIsfile(fname_fasta):
        raise IOError("FileDoesNotExist: %s" % fname_fasta)

    # (0) create (~unique) filenames
    uniquetag = get_random_string_tag()
    fname_allvsall = ".".join([fname_fasta, uniquetag, "allvsall"])
    fname_report = ".".join([fname_fasta, uniquetag, "report"])
    fname_aligned = ".".join([fname_fasta, uniquetag, "aligned"])
    fname_settings = ".".join([fname_fasta, uniquetag, "settings"])

    # (1) create complete .fa -> cexpanderstring command
    command = """
    python %s %s %s %s;
    %s -i %s %s > %s;
    %s < %s;
    """ % (
        EXECUTABLE_CEXPANDER_ALLVSALL,
        fname_fasta,
        fname_allvsall,
        fname_report,
        EXECUTABLE_CEXPANDER_CBALIGNP,
        fname_allvsall,
        cbalignp_commandline,
        fname_aligned,
        EXECUTABLE_CEXPANDER_CEXPANDER,
        fname_settings,
        )

    # (2) create fname_settings file
    # BINARY == $dumpcv, FLOAT = $dumpcvc
    binorfloat = "$dumpcv"
    if output == "float":
        binorfloat = "$dumpcvc"
    content = "\n\n".join([
        "$load\n%s\n%s" % (fname_report, fname_aligned),
        "$addquery\n-1",
        "$run",
        "$dumpentries",
        "$cv_linear",
        "%s" % (binorfloat),
        "$exit\n\n",
        ])
    fh = open(fname_settings, 'w')
    try:
        fh.write(content)
    finally:
        fh.close()

    # (3) run the command
    ci, co, ce = osPopen3(command)
    ci.close()
    # output of EXECUTABLE_CEXPANDER_ALLVSALL is cast to STDOUT as well!
    cexpanderdata = co.read()
    co.close()
    # STDERR is read (and discarded) before closing, as the original did
    ce.read()
    ce.close()

    # (4) parse the captured STDOUT to CexpanderOutput object
    cxpdr = parse_cexpander(cexpanderdata, fname_fasta)

    # (5) cleanup files; this includes the input fasta file!
    osSystem("rm -f %s %s.%s.*" % (fname_fasta, fname_fasta, uniquetag))

    # (6) return the output object
    return cxpdr
def _create_hmm_profile(cbg, area="OMSR", prevcbg=None, nextcbg=None,
        strip_nonaligned_residues=False, verbose=False, **kwargs):
    """
    Create a hmmbuild profile file from (an area of) a CodingBlockGraph

    @type  cbg: CodingBlockGraph or related object
    @param cbg: graph of which coordinate ranges and sequences are taken

    @type  area: string
    @param area: one of OMSR, MINSR, MAXSR, LEFTSPRDIF, RIGTHSPRDIF,
                 OMSRANDLEFTSPRDIF, OMSRANDRIGTHSPRDIF, RIGTHORFEND

    @type  prevcbg: CodingBlockGraph or related object (or None)
    @param prevcbg: if given and area is MAXSR, LEFTSPRDIF or
                 OMSRANDLEFTSPRDIF: ranges are cut to start after its OMSR

    @type  nextcbg: CodingBlockGraph or related object (or None)
    @param nextcbg: if given and area is MAXSR, RIGTHSPRDIF or
                 OMSRANDRIGTHSPRDIF: ranges are cut to end before its OMSR

    @type  strip_nonaligned_residues: Boolean
    @param strip_nonaligned_residues: apply strip_overall_nonaligned_residues
                 on the clustalw alignment (THIS IS VERY RIGID!)

    @type  verbose: Boolean
    @param verbose: report debugging-report on STDOUT (True) or be quiet (False)

    @type  kwargs: dict
    @param kwargs: passed to the (has_)left/rigth_spanningrange_difference
                 functions of cbg; 'sprdif_min_aa_length' defaults to 20 and
                 is forced to 20 for the OMSRAND*SPRDIF areas

    @raise ValueError: area is not one of the supported areas

    @rtype:  tuple
    @return: ( hmmbuild filename, coords ) with coords a dict of
             [ min, max+1 ] lists (as left by the strip functions) per
             string key; ( None, {} ) when no profile could be made
    """
    def _report_coords(coords):
        # debugging aid: area, summed (max-min) over all ranges, range count
        total = sum([(max(v) - min(v)) for v in coords.values()])
        print("%s %s %s" % (area, total, len(coords)))

    # update to default value
    kwargs.setdefault('sprdif_min_aa_length', 20)

    if area == "OMSR":
        if not cbg.has_overall_minimal_spanning_range():
            return None, {}
        coords = cbg.overall_minimal_spanning_range()
    elif area == "MINSR":
        if not cbg.has_minimal_spanning_range():
            return None, {}
        coords = cbg.minimal_spanning_range()
    elif area == "MAXSR":
        if not cbg.has_maximal_spanning_range():
            return None, {}
        coords = cbg.maximal_spanning_range()
    elif area == "LEFTSPRDIF":
        if not cbg.has_left_spanningrange_difference(**kwargs):
            return None, {}
        coords = cbg.left_spanningrange_difference(**kwargs)
    elif area == "RIGTHSPRDIF":
        if not cbg.has_rigth_spanningrange_difference(**kwargs):
            return None, {}
        coords = cbg.rigth_spanningrange_difference(**kwargs)
    elif area == "OMSRANDLEFTSPRDIF":
        kwargs['sprdif_min_aa_length'] = 20
        if not cbg.has_overall_minimal_spanning_range() or\
        not cbg.has_left_spanningrange_difference(**kwargs):
            return None, {}
        # if here, start preparing coords
        coords = cbg.left_spanningrange_difference(**kwargs)
        # remove short contributors to left SPRDIF
        coords = _remove_short_sprdif_contributors(coords, verbose=verbose)
        # increase coord range by OMSR area
        omsr = cbg.overall_minimal_spanning_range()
        for node, coordrange in list(coords.items()):
            coords[node] = Set(range(min(coordrange), max(omsr[node]) + 1))
    elif area == "OMSRANDRIGTHSPRDIF":
        kwargs['sprdif_min_aa_length'] = 20
        if not cbg.has_overall_minimal_spanning_range() or\
        not cbg.has_rigth_spanningrange_difference(**kwargs):
            return None, {}
        # if here, start preparing coords
        coords = cbg.rigth_spanningrange_difference(**kwargs)
        # remove short contributors to rigth SPRDIF
        coords = _remove_short_sprdif_contributors(coords, verbose=verbose)
        # increase coord range by OMSR area
        omsr = cbg.overall_minimal_spanning_range()
        for node, coordrange in list(coords.items()):
            coords[node] = Set(range(min(omsr[node]), max(coordrange) + 1))
    elif area == "RIGTHORFEND":
        # area in between MAXSR and orfend
        if not cbg.has_maximal_spanning_range():
            return None, {}
        # get coords & obtain Orf ends
        coords = cbg.maximal_spanning_range()
        # iterate over a copy of the keys; entries are deleted in the loop
        for node in list(coords.keys()):
            organism = cbg.organism_by_node(node)
            theorf = cbg.get_orfs_of_graph(organism=organism)[0]
            coords[node] = list(range(max(coords[node]) + 1,
                    theorf.protein_endPY))
            # remove zero-length ranges
            if not coords[node]:
                del coords[node]
    else:
        # string exceptions are a TypeError since Python 2.6
        raise ValueError("WHAT ELSE!? unsupported area: %r" % (area,))

    ############################################################################
    if verbose: _report_coords(coords)
    ############################################################################

    # decrease coord range by prevcbg if applicable
    if area in ["MAXSR", "LEFTSPRDIF", "OMSRANDLEFTSPRDIF"] and prevcbg:
        omsr = prevcbg.overall_minimal_spanning_range()
        for org in cbg.organism_set().intersection(prevcbg.organism_set()):
            # omsr/coords have Node keys -> translate to Organism keys
            nodeCbg = cbg.get_organism_nodes(org)[0]
            nodePrev = prevcbg.get_organism_nodes(org)[0]
            # check if node not deleted earlier in coords dict
            if nodeCbg not in coords: continue
            if nodePrev not in omsr: continue
            sta = max([max(omsr[nodePrev]) + 1, min(coords[nodeCbg])])
            end = max(coords[nodeCbg]) + 1
            coords[nodeCbg] = Set(range(sta, end))
            if not coords[nodeCbg]:
                del coords[nodeCbg]

    # decrease coord range by nextcbg if applicable
    if area in ["MAXSR", "RIGTHSPRDIF", "OMSRANDRIGTHSPRDIF"] and nextcbg:
        omsr = nextcbg.overall_minimal_spanning_range()
        for org in cbg.organism_set().intersection(nextcbg.organism_set()):
            # omsr/coords have Node keys -> translate to Organism keys
            nodeCbg = cbg.get_organism_nodes(org)[0]
            nodeNext = nextcbg.get_organism_nodes(org)[0]
            # check if node not deleted earlier in coords dict
            if nodeCbg not in coords: continue
            if nodeNext not in omsr: continue
            sta = min(coords[nodeCbg])
            end = min([min(omsr[nodeNext]), max(coords[nodeCbg]) + 1])
            coords[nodeCbg] = Set(range(sta, end))
            if not coords[nodeCbg]:
                del coords[nodeCbg]

    # check if coords still present
    if not coords: return None, {}

    ############################################################################
    if verbose: _report_coords(coords)
    ############################################################################

    # do/redo _remove_short_sprdif_contributors id required
    if area in ["MAXSR", "LEFTSPRDIF", "RIGTHSPRDIF",
    "OMSRANDLEFTSPRDIF", "OMSRANDRIGTHSPRDIF", "RIGTHORFEND"]:
        coords = _remove_short_sprdif_contributors(coords)

    ############################################################################
    if verbose: _report_coords(coords)
    ############################################################################

    # check if at least 2 sequences/nodes are remaining
    if len(coords) <= 1: return None, {}

    # check sprdif_min_aa_length if applicable
    if area in ["RIGTHSPRDIF", "LEFTSPRDIF", "OMSRANDRIGTHSPRDIF",
    "OMSRANDLEFTSPRDIF"]:
        maxlength = max([len(vlist) for vlist in coords.values()])
        if maxlength < kwargs['sprdif_min_aa_length']:
            return None, {}

    # if here, obtain sequences and build HMM search profile

    # get fasta sequences and
    fastaseqs = cbg._get_sequences_by_coords(coords)

    # rewrite dict (node) keys to string keys
    fastaseqs, coords = _rename_dict_keys_to_strings(fastaseqs, coords)

    # remove empty (and single residue) sequence strings from fastaseqs dict
    empty_seq_keys = [k for k, seq in fastaseqs.items()
                      if seq == "" or len(seq) == 1]
    for k in empty_seq_keys:
        del coords[k]
        del fastaseqs[k]

    # check (again) if at least 2 sequences/nodes are remaining
    if len(coords) <= 1: return None, {}

    # rewrite coords to (min,max) tuple
    coords = dict([(key, [min(vlist), max(vlist) + 1])
                   for key, vlist in coords.items()])

    # perform clustalw multiple alignment
    (alignedseqs, alignment) = clustalw(seqs=fastaseqs)

    # strip exterior gaps in case of OMSR/MINSR area
    if area in ["OMSR", "MINSR"]:
        alignedseqs, alignment, coords = strip_alignment_for_exterior_gaps(
                deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords))

    # strip poorly conserved residues in case of RIGTHORFEND
    if area in ["RIGTHORFEND"]:
        alignedseqs, alignment, coords = strip_poorly_supported_tails(
                deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords),
                0.20)

    # strip_overall_nonaligned_residues if requested for: THIS IS VERY RIGID!
    if strip_nonaligned_residues:
        alignedseqs, alignment, coords = strip_overall_nonaligned_residues(
                deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords))
        # check if alignment was completely consumed or not
        if not alignment or len(alignment) <= 1:
            return None, {}

    ############################################################################
    if verbose:
        print("## HMM clustalw input profile: %s %s %s" % (
            prevcbg is not None, area, nextcbg is not None))
        for node, algseq in alignedseqs.items():
            print("%s %s %s" % (algseq, node, coords[node]))
        print(alignment)
    ############################################################################

    # make unique filename for hmm profile file
    fname_hmm_profile = "hmmbuild_profile_%s.hmmprof" % get_random_string_tag()

    # write multiple alignment input file
    writeMultiFasta(alignedseqs, fname_hmm_profile)

    try:
        # make hmmbuild file of the multiplealignment
        fname_hmmbuild_file = hmmbuild_protein(fname_hmm_profile)
    finally:
        # remove hmm profile multiple alignment file, also when hmmbuild fails
        osRemove(fname_hmm_profile)

    # return HMM serach profile filename
    return fname_hmmbuild_file, coords
def runcexpander(fname_fasta, cbalignp_commandline=" -y", output='binary'):
    """
    Run the complete cascade of cexpander algorithms on an input multi fasta
    file and return the output as a CexpanderOutput object

    @type  fname_fasta: string
    @param fname_fasta: path to input multi fasta file

    @type  cbalignp_commandline: string
    @param cbalignp_commandline: (extra) command line for cbalignp

    @type  output: string
    @param output: 'float' requests the $dumpcvc dump; any other value
                   (default 'binary') requests the $dumpcv dump

    @attention: requires global variable EXECUTABLE_CEXPANDER_ALLVSALL
    @attention: requires global variable EXECUTABLE_CEXPANDER_CBALIGNP
    @attention: requires global variable EXECUTABLE_CEXPANDER_CEXPANDER
    @attention: see cexpander_dr for (additional) command line options
    @attention: only a subset of cexpander_dr commandline options are supported!
    @attention: the input file fname_fasta itself is REMOVED in the cleanup
                step, together with all intermediate files

    @raise ValueError: no fname_fasta was specified
    @raise IOError: fname_fasta is not an existing file

    @rtype:  CexpanderOutput object
    @return: CexpanderOutput object (the result of parse_cexpander)
    """
    # string exceptions (raise "...") are a TypeError since Python 2.6;
    # raise real exceptions carrying the original identifiers as message
    if not fname_fasta:
        raise ValueError("NoProperFunctionArguments")
    if not osPathIsfile(fname_fasta):
        raise IOError("FileDoesNotExist: %s" % fname_fasta)

    # (0) create (~unique) filenames
    uniquetag = get_random_string_tag()
    fname_allvsall = ".".join([fname_fasta, uniquetag, "allvsall"])
    fname_report = ".".join([fname_fasta, uniquetag, "report"])
    fname_aligned = ".".join([fname_fasta, uniquetag, "aligned"])
    fname_settings = ".".join([fname_fasta, uniquetag, "settings"])

    # (1) create complete .fa -> cexpanderstring command
    command = """
    python %s %s %s %s;
    %s -i %s %s > %s;
    %s < %s;
    """ % (
        EXECUTABLE_CEXPANDER_ALLVSALL,
        fname_fasta,
        fname_allvsall,
        fname_report,
        EXECUTABLE_CEXPANDER_CBALIGNP,
        fname_allvsall,
        cbalignp_commandline,
        fname_aligned,
        EXECUTABLE_CEXPANDER_CEXPANDER,
        fname_settings,
        )

    # (2) create fname_settings file
    # BINARY == $dumpcv, FLOAT = $dumpcvc
    binorfloat = "$dumpcv"
    if output == "float":
        binorfloat = "$dumpcvc"
    content = "\n\n".join([
        "$load\n%s\n%s" % (fname_report, fname_aligned),
        "$addquery\n-1",
        "$run",
        "$dumpentries",
        "$cv_linear",
        "%s" % (binorfloat),
        "$exit\n\n",
        ])
    fh = open(fname_settings, 'w')
    try:
        fh.write(content)
    finally:
        fh.close()

    # (3) run the command
    ci, co, ce = osPopen3(command)
    ci.close()
    # output of EXECUTABLE_CEXPANDER_ALLVSALL is cast to STDOUT as well!
    cexpanderdata = co.read()
    co.close()
    # STDERR is read (and discarded) before closing, as the original did
    ce.read()
    ce.close()

    # (4) parse the captured STDOUT to CexpanderOutput object
    cxpdr = parse_cexpander(cexpanderdata, fname_fasta)

    # (5) cleanup files; this includes the input fasta file!
    osSystem("rm -f %s %s.%s.*" % (fname_fasta, fname_fasta, uniquetag))

    # (6) return the output object
    return cxpdr
def blastall_seq2seq(fastadata=(), filenames=(), output="ncbiparsed",
        blastprogram="blastp", remove_files=True,
        extra_blastp_params={'F': 'F', 'e': '10'}):
    """
    Blast a single query against a single sbjct with formatdb + blastall

    choose proper input:
    fastadata   ( ( headerQUERY, seqQUERY ) , ( headerSBJCT, seqSBJCT ) )
     or
    filenames   ( filenameQUERY, filenameSBJCT )

    @type  fastadata: tuple
    @param fastadata: 2 ( header, sequence ) tuples; written to files here

    @type  filenames: tuple
    @param filenames: 2 filenames ( query, sbjct ) of existing fasta files

    @type  output: string
    @param output: "ncbiparsed" returns the NCBIStandalone.BlastParser
                   result; any other value returns the raw blastall STDOUT

    @type  blastprogram: string
    @param blastprogram: blastp, tblastn, tblastx or blastx

    @type  remove_files: Boolean
    @param remove_files: remove query file, sbjct file and the sbjct.*
                   database files afterwards; this is done for input that
                   was given as `filenames` too!

    @type  extra_blastp_params: dict
    @param extra_blastp_params: extra blastall command line options; key
                   'e' with value '10' is placed as '-e 10'. The default
                   dict is only read in this function, never changed.

    @raise ValueError: unsupported blastprogram, no input or inproper input

    @rtype:  * (parsed blast record or string)
    @return: blastall output in the format requested by `output`
    """
    if blastprogram not in ('blastp', 'tblastn', 'tblastx', 'blastx'):
        # string exceptions are a TypeError since Python 2.6
        raise ValueError(
            "only blastp, tblastn, tblastx and blastx are supported")
    # value for the formatdb -p option of the sbjct database; presumably
    # T == protein and F == nucleotide (see formatdb documentation)
    if blastprogram in ('tblastn', 'tblastx'):
        dna_or_prot = "F"
    else:
        dna_or_prot = "T"

    if fastadata and isinstance(fastadata, tuple) and\
    len(fastadata) == 2 and not filenames:
        # input is fasta headers and sequence -> write input files
        uniquetag = get_random_string_tag()
        fname_q = "_".join([uniquetag, str(fastadata[0][0]), 'Q.fa'])
        fname_s = "_".join([uniquetag, str(fastadata[1][0]), 'S.fa'])
        for fname, (header, seq) in ((fname_q, fastadata[0]),
                                     (fname_s, fastadata[1])):
            fh = open(fname, 'w')
            try:
                fh.write(">%s\n%s" % (header, seq))
            finally:
                fh.close()
    elif filenames and isinstance(filenames, tuple) and\
    len(filenames) == 2 and not fastadata:
        # input is (supposed to be) filenames
        fname_q = filenames[0]
        fname_s = filenames[1]
    elif not filenames and not fastadata:
        raise ValueError("no input!")
    else:
        raise ValueError("inproper input!")

    # formatdb
    OSsystem("%s -i %s -p %s" % (FORMATDB_PATH, fname_s, dna_or_prot))

    # and blastall!
    extra_params = " ".join(
        ["-%s %s" % (k, v) for k, v in extra_blastp_params.items()])
    ci, co, ce = osPopen3("%s -p %s %s -i %s -d %s " % (
        BLASTALL_PATH, blastprogram, extra_params, fname_q, fname_s))
    ci.close()
    try:
        if output == "ncbiparsed":
            b_parser = NCBIStandalone.BlastParser()
            blastallout = b_parser.parse(co)
        else:
            blastallout = co.read()
    finally:
        # close the pipes, also when parsing the blast output fails
        co.close()
        ce.close()

    if remove_files:
        OSsystem("rm %s.*" % fname_s)
        osRemove("%s" % fname_s)
        osRemove("%s" % fname_q)

    # and return!
    return blastallout