def blastall_seq2seq(fastadata=(), filenames=(), output="ncbiparsed",
                     blastprogram="blastp", remove_files=True,
                     extra_blastp_params={'F': 'F', 'e': '10'}):
    """
    Choose the proper input:
      fastadata   ( ( headerQUERY, seqQUERY ), ( headerSBJCT, seqSBJCT ) )
    or
      filenames   ( filenameQUERY, filenameSBJCT )
    """
    input = None
    if blastprogram not in ['blastp', 'tblastn', 'tblastx', 'blastx']:
        raise ValueError("only blastp, tblastn, tblastx and blastx are supported")
    elif blastprogram in ['tblastn', 'tblastx']:
        dna_or_prot = "F"
    else:
        dna_or_prot = "T"
    if fastadata and type(fastadata) == type(()) and len(fastadata) == 2 and not filenames:
        # input is fasta headers and sequences
        input = "fastadata"
        # write input files
        uniquetag = get_random_string_tag()
        fname_q = "_".join([uniquetag, str(fastadata[0][0]), 'Q.fa'])
        fname_s = "_".join([uniquetag, str(fastadata[1][0]), 'S.fa'])
        fh = open(fname_q, 'w')
        fh.write(">%s\n%s" % (fastadata[0][0], fastadata[0][1]))
        fh.close()
        fh = open(fname_s, 'w')
        fh.write(">%s\n%s" % (fastadata[1][0], fastadata[1][1]))
        fh.close()
    elif filenames and type(filenames) == type(()) and len(filenames) == 2 and not fastadata:
        # input is (supposed to be) filenames
        input = "filenames"
        # get filenames
        fname_q = filenames[0]
        fname_s = filenames[1]
    elif not filenames and not fastadata:
        raise ValueError("no input!")
    else:
        raise ValueError("improper input!")
    # formatdb
    OSsystem("%s -i %s -p %s" % (FORMATDB_PATH, fname_s, dna_or_prot))
    # and blastall!
    extra_params = " ".join(["-%s %s" % (k, v) for k, v in extra_blastp_params.iteritems()])
    ci, co, ce = osPopen3("%s -p %s %s -i %s -d %s " % (
        BLASTALL_PATH, blastprogram, extra_params, fname_q, fname_s))
    ci.close()
    if output == "ncbiparsed":
        b_parser = NCBIStandalone.BlastParser()
        blastallout = b_parser.parse(co)
    else:
        blastallout = co.read()
    co.close()
    ce.close()
    if remove_files:
        OSsystem("rm %s.*" % fname_s)
        osRemove("%s" % fname_s)
        osRemove("%s" % fname_q)
    # and return!
    return blastallout
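# Minimal usage sketch for blastall_seq2seq. It assumes FORMATDB_PATH,
# BLASTALL_PATH and the legacy Biopython NCBIStandalone module are configured
# elsewhere in this module; the two protein sequences are illustrative only.
def _example_blastall_seq2seq():
    record = blastall_seq2seq(
        fastadata=(("queryID", "MSTNPKPQRKTKRNTNRRPQDVK"),
                   ("sbjctID", "MSTNPKPQRKAKRNTNRRPQDVK")),
        blastprogram="blastp")
    # NCBIStandalone.BlastParser returns a Blast record object; walk its HSPs
    for alignment in record.alignments:
        for hsp in alignment.hsps:
            print alignment.title, hsp.score, hsp.expect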
def tearDown(self):
    del self.app
    del self._umlFrame
    for x in range(4):
        try:
            osRemove(f'{HISTORY_FILE_NAME}{x}')
        except (ValueError, Exception):
            pass    # we truly want to ignore
def comprimirArchivo(self, archivo):
    # compress the file to gzip and remove the original
    print "Comprimiendo %s" % archivo
    try:
        f_in = open(archivo, 'rb')
        f_out = gzipOpen(archivo + '.gz', 'wb')
        f_out.writelines(f_in)
        f_out.close()
        f_in.close()
        osRemove(archivo)
    except:
        pass
def _restoreBackup(self):
    preferencesFileName: str = Preferences.getPreferencesLocation()
    source: str = f"{preferencesFileName}{TestPreferences.BACKUP_SUFFIX}"
    target: str = preferencesFileName
    if osPath.exists(source):
        try:
            copyfile(source, target)
        except IOError as e:
            self.logger.error(f"Unable to copy file. {e}")
        osRemove(source)
    else:
        osRemove(target)
def blastall_seq2db(header, sequence, dbname="", blastprogram="blastp",
                    output="ncbiparsed", extra_blastp_params={'F': 'F', 'e': '10'}):
    """ """
    if blastprogram not in ['blastp', 'tblastn', 'blastn', 'blastx']:
        raise ValueError("only blastp, tblastn, blastn and blastx are supported")
    extra_params = " ".join(
        ["-%s %s" % (k, v) for k, v in extra_blastp_params.iteritems()])
    # generate (semi ;-) unique filename
    uniquetag = get_random_string_tag()
    fname = "_".join(
        [uniquetag, str(header).replace(" ", "_"), sequence[0:10] + ".fa"])
    fname = osPathJoin(OSgetcwd(), fname)
    fh = open(fname, 'w')
    fh.write(">%s\n%s\n" % (header, sequence))
    fh.close()
    command = "%s -p %s %s -i %s -d %s " % (
        BLASTALL_PATH, blastprogram, extra_params, fname, dbname)
    try:
        ci, co, ce = osPopen3(command)
        ci.close()
        if output == "ncbiparsed":
            b_parser = NCBIStandalone.BlastParser()
            blastallout = b_parser.parse(co)
        else:
            blastallout = co.read()
        co.close()
        ce.close()
    except:
        # for some kind of - obvious or freak accident - case,
        # Blast or parsing of the blast record failed.
        # No debugging here; just cleanup and return False
        print "BLAST CRASHED::"
        print command
        blastallout = False
    # remove the created Query file
    osRemove(fname)
    # and return!
    return blastallout
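# Usage sketch for blastall_seq2db. It assumes a formatdb-indexed protein
# database already exists at the given path; the database name and the query
# sequence below are illustrative only.
def _example_blastall_seq2db():
    record = blastall_seq2db("my_query", "MSTNPKPQRKTKRNTNRR",
                             dbname="/data/proteins.fa", blastprogram="blastp")
    if record and record.alignments:
        # report the best hit, if any
        print record.alignments[0].title
    elif record is False:
        print "BLAST failed for my_query"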
def _file_cleanup(fnamelist, include_directories=False):
    """ """
    for fname in fnamelist:
        if osPathExists(str(fname)):
            try:
                osRemove(str(fname))
            except OSError:
                if osPathIsdir(str(fname)) and include_directories:
                    osSystem("rm -rf %s" % fname)
            except:
                # failed !? on cluster computing, this
                # could be the case when an identical filename
                # is created/owned by another user.
                # I suspect this might happen for formatdb.log ;-)
                pass
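# Usage sketch for _file_cleanup: collect the temporary files created during a
# run and remove them in a single call. The filenames below are hypothetical.
def _example_file_cleanup():
    created = ["formatdb.log", "tmp_query_Q.fa", "tmp_sbjct_S.fa", "tmp_workdir"]
    _file_cleanup(created, include_directories=True)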
def _assertIdenticalFiles(self, baseName: str, generatedFileName: str, failMessage: str,
                          removeTestFile: bool = True) -> None:
    """
    The side effect here is that if the assertion passes then this method
    removes the generated file

    Args:
        baseName:           The base file name
        generatedFileName:  The generated file name
        failMessage:        The message to display if the files fail comparison
        removeTestFile:     If True, remove the generated file after a passing comparison
    """
    standardFileName: str = self._getFullyQualifiedPdfPath(
        f'{baseName}{TestDiagramParent.STANDARD_SUFFIX}{TestConstants.TEST_SUFFIX}')
    status: int = self._runPdfDiff(baseFileName=generatedFileName, standardFileName=standardFileName)

    self.assertTrue(status == 0, failMessage)

    if removeTestFile is True:
        self.logger.info(f'Removing: {generatedFileName}')
        osRemove(generatedFileName)
def delete(user, ui):
    if len(ui) != 2:
        print(">>> USAGE: delete <tablename>")
        return
    try:
        with open(ui[1] + '.csv', 'r') as f:
            if not authenticate(user, ui[1], 'w'):
                print(">>> ERROR: User", user, "does not have access to", ui[1])
                return
            print("This will permanently delete table", ui[1])
            if input('Are you sure you want to proceed? (y/n)>') != 'y':
                return
    except FileNotFoundError:
        print(">>> ERROR: Table", ui[1], "does not exist.")
        return
    osRemove(ui[1] + '.csv')
    print("Table", ui[1], "deleted.")
def testJsonSerialization(self):
    gState: GameState = GameState()
    gState.playerType = PlayerType.Emeritus
    gState.gameType = GameType.Medium
    gState.starDate = 40501.0
    gState.remainingGameTime = 42.42424242
    gState.currentQuadrantCoordinates = Coordinates(4, 4)
    gState.currentSectorCoordinates = Coordinates(9, 9)

    jsonGState: str = jsonpickle.encode(gState, indent=4)
    self.assertIsNotNone(jsonGState, "Pickling failed")
    self.logger.info(f'json game stats: {jsonGState}')

    file: TextIO = open(TestGameState.TEST_PICKLE_FILENAME, 'w')
    file.write(jsonGState)
    file.close()

    jsonFile: TextIO = open(TestGameState.TEST_PICKLE_FILENAME, 'r')
    jsonStr: str = jsonFile.read()
    self.assertIsNotNone(jsonStr)
    jsonFile.close()

    thawedGameState: GameState = jsonpickle.decode(jsonStr)
    self.assertIsNotNone(thawedGameState, "Did that thaw?")

    self.assertEqual(gState.playerType, thawedGameState.playerType, "Player type did not thaw")
    self.assertEqual(gState.gameType, thawedGameState.gameType, "Game type did not thaw")
    self.assertEqual(gState.starDate, thawedGameState.starDate, "Star date did not thaw")
    self.assertEqual(gState.remainingGameTime, thawedGameState.remainingGameTime, "Remaining game time did not thaw")

    osRemove(TestGameState.TEST_PICKLE_FILENAME)
def clustalw(inputfile="", seqs={}, remove_inputfile=True, params={}): """ """ if inputfile and seqs: raise "wrong usage!" elif inputfile and not seqs: # input is (hopefully) a filename pass elif not inputfile and seqs: # input is (hopefully) sequences # do a quick check if (sequence) strings are given ARE_ALL_STRINGS = True for header, seq in seqs.iteritems(): if not seq: ARE_ALL_STRINGS = False break if not ARE_ALL_STRINGS: raise Exception, "no sequence string(s) specified: %s" % seqs # make a kind of semi-unique filename uniqueid = get_random_string_tag() inputfile = uniqueid + "_" + "_".join( [_nonstringheader2stringheader(hdr) for hdr in seqs.keys()[0:5]]) inputfile += ".mfa" writeMultiFasta(seqs, inputfile) else: # no input at all raise "no input specified" # okay, do the clustalw fname_in = inputfile # get hard-assigned parameters paramstring = " ".join(["-%s=%s" % (k, v) for k, v in params.iteritems()]) ci, co = osPopen2("%s %s %s" % (EXECUTABLE_CLUSTALW, fname_in, paramstring)) ci.close() clwout = co.read() co.close() # abstract output filenames from input filename if fname_in.find(".") == -1: fname_out = fname_in + ".aln" fname_tree = fname_in + ".dnd" else: _base = fname_in[0:fname_in.rfind(".")] fname_out = _base + ".aln" fname_tree = _base + ".dnd" # parse alignment output file _seqs, _alignment = _parse_clustalw(fname_out) # and delete tmp. created files osRemove(fname_out) osRemove(fname_tree) if remove_inputfile: osRemove(fname_in) # check if the keys (headers) in _seqs correspont to those in seqs # differences can occur when non-string headers are used # and return return (_seqs, _alignment)
def clustalw(inputfile="",seqs={},remove_inputfile=True,params={}): """ """ if inputfile and seqs: raise "wrong usage!" elif inputfile and not seqs: # input is (hopefully) a filename pass elif not inputfile and seqs: # input is (hopefully) sequences # do a quick check if (sequence) strings are given ARE_ALL_STRINGS = True for header, seq in seqs.iteritems(): if not seq: ARE_ALL_STRINGS = False break if not ARE_ALL_STRINGS: raise Exception, "no sequence string(s) specified: %s" % seqs # make a kind of semi-unique filename uniqueid = get_random_string_tag() inputfile = uniqueid+"_"+"_".join([ _nonstringheader2stringheader(hdr) for hdr in seqs.keys()[0:5] ]) inputfile+=".mfa" writeMultiFasta(seqs,inputfile) else: # no input at all raise "no input specified" # okay, do the clustalw fname_in = inputfile # get hard-assigned parameters paramstring = " ".join([ "-%s=%s" % (k,v) for k,v in params.iteritems() ]) ci,co = osPopen2("%s %s %s" % (EXECUTABLE_CLUSTALW,fname_in, paramstring)) ci.close() clwout = co.read() co.close() # abstract output filenames from input filename if fname_in.find(".") == -1: fname_out = fname_in+".aln" fname_tree = fname_in+".dnd" else: _base = fname_in[0:fname_in.rfind(".")] fname_out = _base+".aln" fname_tree = _base+".dnd" # parse alignment output file _seqs,_alignment = _parse_clustalw(fname_out) # and delete tmp. created files osRemove(fname_out) osRemove(fname_tree) if remove_inputfile: osRemove(fname_in) # check if the keys (headers) in _seqs correspont to those in seqs # differences can occur when non-string headers are used # and return return (_seqs,_alignment)
def _create_hmm_profile(cbg, area="OMSR", prevcbg=None, nextcbg=None, strip_nonaligned_residues=False, verbose=False, **kwargs): """ """ # area must be one of # OMSR MINSR MAXSR # LEFTSPRDIF RIGTHSPRDIF # OMSRANDLEFTSPRDIF OMSRANDRIGTHSPRDIF # RIGTHORFEND # update to default value if not kwargs.has_key('sprdif_min_aa_length'): kwargs['sprdif_min_aa_length'] = 20 if area == "OMSR": if cbg.has_overall_minimal_spanning_range(): coords = cbg.overall_minimal_spanning_range() else: return None, {} elif area == "MINSR": if cbg.has_minimal_spanning_range(): coords = cbg.minimal_spanning_range() else: return None, {} elif area == "MAXSR": if cbg.has_maximal_spanning_range(): coords = cbg.maximal_spanning_range() else: return None, {} elif area == "LEFTSPRDIF": if cbg.has_left_spanningrange_difference(**kwargs): coords = cbg.left_spanningrange_difference(**kwargs) else: return None, {} elif area == "RIGTHSPRDIF": if cbg.has_rigth_spanningrange_difference(**kwargs): coords = cbg.rigth_spanningrange_difference(**kwargs) else: return None, {} elif area == "OMSRANDLEFTSPRDIF": kwargs['sprdif_min_aa_length'] = 20 if not cbg.has_overall_minimal_spanning_range() or\ not cbg.has_left_spanningrange_difference(**kwargs): return None, {} # if here, start preparing coords coords = cbg.left_spanningrange_difference(**kwargs) # remove short contributors to left SPRDIF coords = _remove_short_sprdif_contributors(coords, verbose=verbose) # increase coord range by OMSR area omsr = cbg.overall_minimal_spanning_range() for node, coordrange in coords.iteritems(): coords[node] = Set(range(min(coordrange), max(omsr[node]) + 1)) elif area == "OMSRANDRIGTHSPRDIF": kwargs['sprdif_min_aa_length'] = 20 if not cbg.has_overall_minimal_spanning_range() or\ not cbg.has_rigth_spanningrange_difference(**kwargs): return None, {} # if here, start preparing coords coords = cbg.rigth_spanningrange_difference(**kwargs) # remove short contributors to left SPRDIF coords = _remove_short_sprdif_contributors(coords, verbose=verbose) # increase coord range by OMSR area omsr = cbg.overall_minimal_spanning_range() for node, coordrange in coords.iteritems(): coords[node] = Set(range(min(omsr[node]), max(coordrange) + 1)) elif area == "RIGTHORFEND": # area in between MAXSR and orfend if not cbg.has_maximal_spanning_range(): return None, {} # get coords & obtain Orf ends coords = cbg.maximal_spanning_range() nodes = coords.keys() for node in nodes: organism = cbg.organism_by_node(node) theorf = cbg.get_orfs_of_graph(organism=organism)[0] coords[node] = range(max(coords[node]) + 1, theorf.protein_endPY) # remove zero-length ranges if len(coords[node]) == 0: del (coords[node]) else: raise "WHAT ELSE!?" 
############################################################################ if verbose: print area, sum([(max(v) - min(v)) for k, v in coords.iteritems()]), len(coords) ############################################################################ # decrease coord range by prevcbg if applicable if area in ["MAXSR", "LEFTSPRDIF", "OMSRANDLEFTSPRDIF"] and prevcbg: omsr = prevcbg.overall_minimal_spanning_range() for org in cbg.organism_set().intersection(prevcbg.organism_set()): # omsr/coords have Node keys -> translate to Organism keys nodeCbg = cbg.get_organism_nodes(org)[0] nodePrev = prevcbg.get_organism_nodes(org)[0] # check if node not deleted earlier in coords dict if not coords.has_key(nodeCbg): continue if not omsr.has_key(nodePrev): continue sta = max([max(omsr[nodePrev]) + 1, min(coords[nodeCbg])]) end = max(coords[nodeCbg]) + 1 coords[nodeCbg] = Set(range(sta, end)) if not coords[nodeCbg]: del (coords[nodeCbg]) # decrease coord range by nextcbg if applicable if area in ["MAXSR", "RIGTHSPRDIF", "OMSRANDRIGTHSPRDIF"] and nextcbg: omsr = nextcbg.overall_minimal_spanning_range() for org in cbg.organism_set().intersection(nextcbg.organism_set()): # omsr/coords have Node keys -> translate to Organism keys nodeCbg = cbg.get_organism_nodes(org)[0] nodeNext = nextcbg.get_organism_nodes(org)[0] # check if node not deleted earlier in coords dict if not coords.has_key(nodeCbg): continue if not omsr.has_key(nodeNext): continue sta = min(coords[nodeCbg]) end = min([min(omsr[nodeNext]), max(coords[nodeCbg]) + 1]) coords[nodeCbg] = Set(range(sta, end)) if not coords[nodeCbg]: del (coords[nodeCbg]) # check if coords still present if not coords: return None, {} ############################################################################ if verbose: print area, sum([(max(v) - min(v)) for k, v in coords.iteritems()]), len(coords) ############################################################################ # do/redo _remove_short_sprdif_contributors id required if area in [ "MAXSR", "LEFTSPRDIF", "RIGTHSPRDIF", "OMSRANDLEFTSPRDIF", "OMSRANDRIGTHSPRDIF", "RIGTHORFEND" ]: coords = _remove_short_sprdif_contributors(coords) ############################################################################ if verbose: print area, sum([(max(v) - min(v)) for k, v in coords.iteritems()]), len(coords) ############################################################################ # check if at least 2 sequences/nodes are remaining if len(coords) <= 1: return None, {} # check sprdif_min_aa_length if applicable if area in [ "RIGTHSPRDIF", "LEFTSPRDIF", "OMSRANDRIGTHSPRDIF", "OMSRANDLEFTSPRDIF" ]: maxlength = max([len(vlist) for vlist in coords.values()]) if maxlength < kwargs['sprdif_min_aa_length']: return None, {} # if here, obtain sequences and build HMM search profile # get fasta sequences and fastaseqs = cbg._get_sequences_by_coords(coords) # rewrite dict (node) keys to string keys fastaseqs, coords = _rename_dict_keys_to_strings(fastaseqs, coords) # remove empty sequence strings from fastaseqs dict empty_seq_keys = [] for k, seq in fastaseqs.iteritems(): if seq == "" or len(seq) == 1: empty_seq_keys.append(k) for k in empty_seq_keys: del (coords[k]) del (fastaseqs[k]) # check (again) if at least 2 sequences/nodes are remaining if len(coords) <= 1: return None, {} # rewrite coords to (min,max) tuple coords = dict([(key, [min(vlist), max(vlist) + 1]) for key, vlist in coords.iteritems()]) # perform clustalw multiple alignment (alignedseqs, alignment) = clustalw(seqs=fastaseqs) # strip exterior gaps in case of 
OMSR/MINSR area if area in ["OMSR", "MINSR"]: alignedseqs, alignment, coords = strip_alignment_for_exterior_gaps( deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords)) # strip poorly conserved residues in case of RIGTHORFEND if area in ["RIGTHORFEND"]: alignedseqs, alignment, coords = strip_poorly_supported_tails( deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords), 0.20) # strip_overall_nonaligned_residues if requested for: THIS IS VERY RIGID! if strip_nonaligned_residues: alignedseqs, alignment, coords = strip_overall_nonaligned_residues( deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords)) # check if alignment was completely consumed or not if not alignment or len(alignment) <= 1: return None, {} ############################################################################ if verbose: print "## HMM clustalw input profile:", prevcbg != None, area, nextcbg != None for node, algseq in alignedseqs.iteritems(): print algseq, node, coords[node] print alignment ############################################################################ # make unique filename for hmm profile file fname_hmm_profile = "hmmbuild_profile_%s.hmmprof" % get_random_string_tag() # write multiple alignment input file writeMultiFasta(alignedseqs, fname_hmm_profile) # make hmmbuild file of the multiplealignment fname_hmmbuild_file = hmmbuild_protein(fname_hmm_profile) # remove hmm profile multiple alignment file osRemove(fname_hmm_profile) # return HMM serach profile filename return fname_hmmbuild_file, coords
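# Sketch of how _create_hmm_profile is typically combined with an HMM search.
# The hmmsearch_protein wrapper and the cbg object are assumed to exist
# elsewhere in this codebase; this only illustrates the
# create -> search -> cleanup flow, not an exact API.
def _example_hmm_profile(cbg):
    fname_profile, coords = _create_hmm_profile(cbg, area="OMSR")
    if not fname_profile:
        return []
    try:
        # search the profile against some multi-fasta database file (assumed wrapper)
        results = hmmsearch_protein(fname_profile, "informant_orfs.fa")
    finally:
        # the profile file is temporary; remove it after the search
        osRemove(fname_profile)
    return results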
def _create_hmm_db(organism, inputdict, cbg, prev, next,
                   orf_must_have_start=False, max_intron_nt_length=200,
                   verbose=False):
    """
    Create fasta ORF database for an organism in a CBG and its vicinity

    @type  organism: * (presumably string)
    @param organism: Organism identifier recognizable in <input data structure>

    @type  inputdict: dict
    @param inputdict: <input data structure>

    @type  cbg: CodingBlockGraph or related object
    @param cbg: CodingBlockGraph upstream/5p of the cbg that must be completed

    @type  prev: CodingBlockGraph or related object (or None)
    @param prev: CodingBlockGraph upstream/5p of cbg that must be completed

    @type  next: CodingBlockGraph or related object (or None)
    @param next: CodingBlockGraph downstream/3p of cbg that must be completed

    @attention: `prev` and `next` CodingBlockGraphs reduce the search space of
                ORFs to scan with the HMM profile. This speeds up and improves
                the quality of results.

    @type  orf_must_have_start: Boolean
    @param orf_must_have_start: only allow ORFs with methionines as sbjct ORFs

    @type  max_intron_nt_length: integer
    @param max_intron_nt_length: positive maximum intron length to take into
                                 account when selecting suitable ORFs

    @type  verbose: Boolean
    @param verbose: report debugging-report on STDOUT (True) or be quiet (False)
    """
    # fullpath filename of result hmm multi fasta database
    fname_hmm_db_mfa = None
    if not cbg:
        return fname_hmm_db_mfa

    # (1) try to limit searchspace by prev and next CBG
    prevNode, nextNode = None, None
    prevMin, nextMax = None, None
    maskcoords = []

    # (1a) check if (informant) organism is in the prev CBG AND if this CBG
    #      has an OMSR -> not per se the case!
    if prev and organism in prev.organism_set() and\
       prev.has_overall_minimal_spanning_range():
        prevNode = prev.node_by_organism(organism)
        try:
            omsr = prev.overall_minimal_spanning_range(organism=organism)
            prevMin = (max(omsr) + 1) * 3
            maskcoords.append((0, max(omsr)))
        except KeyError:
            # hmmm.... block has an OMSR, but not for this organism!??!!?
            pass

    # (1b) check if (informant) organism is in the next CBG AND if this CBG
    #      has an OMSR -> not per se the case!
    if next and organism in next.organism_set() and\
       next.has_overall_minimal_spanning_range():
        nextNode = next.node_by_organism(organism)
        try:
            omsr = next.overall_minimal_spanning_range(organism=organism)
            nextMax = min(omsr) * 3
            aaseqlen = len(inputdict[organism]['genomeseq']) / 3
            maskcoords.append((min(omsr), aaseqlen))
        except KeyError:
            # hmmm.... block has an OMSR, but not for this organism!??!!?
            pass

    # (1c) limit search space if only prev or next was specified
    if not prev and next and nextMax:
        prevMin = nextMax - max_intron_nt_length
    if not next and prev and prevMin:
        nextMax = prevMin + max_intron_nt_length

    # (2a) get eligible sets of orfs from prev and next
    if not orf_must_have_start:
        elegiable_orfs = inputdict[organism]['orfs'].get_eligible_orfs(
            min_orf_end=prevMin, max_orf_start=nextMax)
    else:
        # ORFs *must* have starts => searching for a TSS exon/CBG
        elegiable_orfs = inputdict[organism]['orfs'].get_eligible_orfs(
            min_orf_end=prevMin, max_orf_start=nextMax, has_starts=True)

    # (2b) check orf count; can be zero in case of a very tiny region to check
    if not elegiable_orfs:
        return fname_hmm_db_mfa

    # (3) write masked orfs to fasta database multi line string
    db_fasta = inputdict[organism]['orfs'].tomaskedfasta(
        coords=maskcoords, orflist=elegiable_orfs, header_prefix=organism)
    if orf_must_have_start:
        if len(db_fasta.strip()) == 0:
            # no UNmasked suitable ORFs remaining!
            # This is recognized later on in this function
            pass
        else:
            # mask out all AAs before the first start
            lines = db_fasta.split("\n")
            for linenr in range(0, len(lines)):
                line = lines[linenr]
                if line[0] != ">":
                    mpos = line.find("M")
                    if mpos > 0:
                        line = "X" * mpos + line[mpos:]
                lines[linenr] = line
            # recreate db_fasta string
            db_fasta = "\n".join(lines)

    ############################################################################
    if verbose:
        if len(elegiable_orfs) > 10:
            orfidlist = len(elegiable_orfs)
        else:
            orfidlist = [orf.id for orf in elegiable_orfs]
        print "hmm-elegibable orfs:", organism, orfidlist, "/",
        print len(inputdict[organism]['orfs'].orfs), "prevMin:", prevMin,
        if prev:
            print prev.has_overall_minimal_spanning_range(),
        else:
            print None,
        print "nextMax:", nextMax,
        if next:
            print next.has_overall_minimal_spanning_range()
        else:
            print None
    ############################################################################

    # (4) make unique filename for hmm database file
    fname_base = get_random_string_tag()
    fname_hmm_db_mfa = "hmm_database_%s_%s.fa" % (fname_base, organism)

    # (5) write masked orfs to fasta database
    fh = open(fname_hmm_db_mfa, 'w')
    fh.write(db_fasta)
    fh.close()

    # (6) make sure that there were orfs written to file;
    #     in case very few orfs are selected and all are masked -> no files!
    seqs_in_db = parseFasta(open(fname_hmm_db_mfa).readlines())
    if not seqs_in_db:
        # delete this (empty) file
        osRemove(fname_hmm_db_mfa)
        return None

    # (7) return hmm search database filename
    return fname_hmm_db_mfa
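# Sketch of combining _create_hmm_db with a profile built as above: create the
# informant ORF database, search it, and drop it again when done. The
# hmmsearch_protein call is a placeholder for whatever search wrapper this
# module actually uses, not a confirmed API.
def _example_hmm_db(organism, inputdict, cbg, prev, next, fname_profile):
    fname_db = _create_hmm_db(organism, inputdict, cbg, prev, next)
    if not fname_db:
        return []
    results = hmmsearch_protein(fname_profile, fname_db)   # assumed wrapper
    _file_cleanup([fname_db])
    return results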
def _create_hmm_profile(cbg,area="OMSR",prevcbg=None,nextcbg=None, strip_nonaligned_residues=False, verbose=False,**kwargs): """ """ # area must be one of # OMSR MINSR MAXSR # LEFTSPRDIF RIGTHSPRDIF # OMSRANDLEFTSPRDIF OMSRANDRIGTHSPRDIF # RIGTHORFEND # update to default value if not kwargs.has_key('sprdif_min_aa_length'): kwargs['sprdif_min_aa_length'] = 20 if area == "OMSR": if cbg.has_overall_minimal_spanning_range(): coords = cbg.overall_minimal_spanning_range() else: return None, {} elif area == "MINSR": if cbg.has_minimal_spanning_range(): coords = cbg.minimal_spanning_range() else: return None, {} elif area == "MAXSR": if cbg.has_maximal_spanning_range(): coords = cbg.maximal_spanning_range() else: return None, {} elif area == "LEFTSPRDIF": if cbg.has_left_spanningrange_difference(**kwargs): coords = cbg.left_spanningrange_difference(**kwargs) else: return None, {} elif area == "RIGTHSPRDIF": if cbg.has_rigth_spanningrange_difference(**kwargs): coords = cbg.rigth_spanningrange_difference(**kwargs) else: return None, {} elif area == "OMSRANDLEFTSPRDIF": kwargs['sprdif_min_aa_length'] = 20 if not cbg.has_overall_minimal_spanning_range() or\ not cbg.has_left_spanningrange_difference(**kwargs): return None, {} # if here, start preparing coords coords = cbg.left_spanningrange_difference(**kwargs) # remove short contributors to left SPRDIF coords = _remove_short_sprdif_contributors(coords,verbose=verbose) # increase coord range by OMSR area omsr = cbg.overall_minimal_spanning_range() for node,coordrange in coords.iteritems(): coords[node] = Set( range( min(coordrange), max(omsr[node])+1 ) ) elif area == "OMSRANDRIGTHSPRDIF": kwargs['sprdif_min_aa_length'] = 20 if not cbg.has_overall_minimal_spanning_range() or\ not cbg.has_rigth_spanningrange_difference(**kwargs): return None, {} # if here, start preparing coords coords = cbg.rigth_spanningrange_difference(**kwargs) # remove short contributors to left SPRDIF coords = _remove_short_sprdif_contributors(coords,verbose=verbose) # increase coord range by OMSR area omsr = cbg.overall_minimal_spanning_range() for node,coordrange in coords.iteritems(): coords[node] = Set( range( min(omsr[node]), max(coordrange)+1 ) ) elif area == "RIGTHORFEND": # area in between MAXSR and orfend if not cbg.has_maximal_spanning_range(): return None, {} # get coords & obtain Orf ends coords = cbg.maximal_spanning_range() nodes = coords.keys() for node in nodes: organism = cbg.organism_by_node(node) theorf = cbg.get_orfs_of_graph(organism=organism)[0] coords[node] = range(max(coords[node])+1,theorf.protein_endPY) # remove zero-length ranges if len(coords[node]) == 0: del(coords[node]) else: raise "WHAT ELSE!?" 
############################################################################ if verbose: print area, sum([(max(v)-min(v)) for k,v in coords.iteritems()]),len(coords) ############################################################################ # decrease coord range by prevcbg if applicable if area in ["MAXSR","LEFTSPRDIF","OMSRANDLEFTSPRDIF"] and prevcbg: omsr = prevcbg.overall_minimal_spanning_range() for org in cbg.organism_set().intersection( prevcbg.organism_set() ): # omsr/coords have Node keys -> translate to Organism keys nodeCbg = cbg.get_organism_nodes(org)[0] nodePrev = prevcbg.get_organism_nodes(org)[0] # check if node not deleted earlier in coords dict if not coords.has_key(nodeCbg): continue if not omsr.has_key(nodePrev): continue sta = max( [ max(omsr[nodePrev])+1, min(coords[nodeCbg]) ] ) end = max(coords[nodeCbg])+1 coords[nodeCbg] = Set(range(sta,end)) if not coords[nodeCbg]: del( coords[nodeCbg] ) # decrease coord range by nextcbg if applicable if area in ["MAXSR","RIGTHSPRDIF","OMSRANDRIGTHSPRDIF"] and nextcbg: omsr = nextcbg.overall_minimal_spanning_range() for org in cbg.organism_set().intersection( nextcbg.organism_set() ): # omsr/coords have Node keys -> translate to Organism keys nodeCbg = cbg.get_organism_nodes(org)[0] nodeNext = nextcbg.get_organism_nodes(org)[0] # check if node not deleted earlier in coords dict if not coords.has_key(nodeCbg): continue if not omsr.has_key(nodeNext): continue sta = min(coords[nodeCbg]) end = min( [ min(omsr[nodeNext]), max(coords[nodeCbg])+1 ] ) coords[nodeCbg] = Set(range(sta,end)) if not coords[nodeCbg]: del( coords[nodeCbg] ) # check if coords still present if not coords: return None, {} ############################################################################ if verbose: print area, sum([(max(v)-min(v)) for k,v in coords.iteritems()]),len(coords) ############################################################################ # do/redo _remove_short_sprdif_contributors id required if area in ["MAXSR","LEFTSPRDIF","RIGTHSPRDIF", "OMSRANDLEFTSPRDIF","OMSRANDRIGTHSPRDIF","RIGTHORFEND"]: coords = _remove_short_sprdif_contributors(coords) ############################################################################ if verbose: print area, sum([(max(v)-min(v)) for k,v in coords.iteritems()]),len(coords) ############################################################################ # check if at least 2 sequences/nodes are remaining if len(coords) <= 1: return None, {} # check sprdif_min_aa_length if applicable if area in ["RIGTHSPRDIF","LEFTSPRDIF","OMSRANDRIGTHSPRDIF", "OMSRANDLEFTSPRDIF"]: maxlength = max([ len(vlist) for vlist in coords.values() ]) if maxlength < kwargs['sprdif_min_aa_length']: return None, {} # if here, obtain sequences and build HMM search profile # get fasta sequences and fastaseqs = cbg._get_sequences_by_coords(coords) # rewrite dict (node) keys to string keys fastaseqs, coords = _rename_dict_keys_to_strings(fastaseqs, coords) # remove empty sequence strings from fastaseqs dict empty_seq_keys = [] for k,seq in fastaseqs.iteritems(): if seq == "" or len(seq) == 1: empty_seq_keys.append(k) for k in empty_seq_keys: del(coords[k]) del(fastaseqs[k]) # check (again) if at least 2 sequences/nodes are remaining if len(coords) <= 1: return None, {} # rewrite coords to (min,max) tuple coords = dict([ (key,[min(vlist),max(vlist)+1]) for key,vlist in coords.iteritems() ]) # perform clustalw multiple alignment (alignedseqs,alignment) = clustalw( seqs= fastaseqs ) # strip exterior gaps in case of OMSR/MINSR area if area in 
["OMSR","MINSR"]: alignedseqs,alignment,coords = strip_alignment_for_exterior_gaps( deepcopy(alignedseqs),deepcopy(alignment),deepcopy(coords) ) # strip poorly conserved residues in case of RIGTHORFEND if area in ["RIGTHORFEND"]: alignedseqs,alignment,coords = strip_poorly_supported_tails( deepcopy(alignedseqs),deepcopy(alignment),deepcopy(coords),0.20 ) # strip_overall_nonaligned_residues if requested for: THIS IS VERY RIGID! if strip_nonaligned_residues: alignedseqs,alignment,coords = strip_overall_nonaligned_residues( deepcopy(alignedseqs),deepcopy(alignment),deepcopy(coords) ) # check if alignment was completely consumed or not if not alignment or len(alignment) <= 1: return None, {} ############################################################################ if verbose: print "## HMM clustalw input profile:",prevcbg!=None,area,nextcbg!=None for node,algseq in alignedseqs.iteritems(): print algseq, node, coords[node] print alignment ############################################################################ # make unique filename for hmm profile file fname_hmm_profile = "hmmbuild_profile_%s.hmmprof" % get_random_string_tag() # write multiple alignment input file writeMultiFasta(alignedseqs,fname_hmm_profile) # make hmmbuild file of the multiplealignment fname_hmmbuild_file = hmmbuild_protein( fname_hmm_profile ) # remove hmm profile multiple alignment file osRemove(fname_hmm_profile) # return HMM serach profile filename return fname_hmmbuild_file, coords
def _merge_pacbporfs_by_tinyexon_and_two_introns(pacbporfD, pacbporfA,
                                                 orfSetObject, queryorsbjct,
                                                 verbose=False, **kwargs):
    """
    Merge 2 PacbPORF objects by introns

    @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  orfSetObject: object with elegiable Orfs
    @param orfSetObject: object with elegiable Orfs

    @type  queryorsbjct: string
    @param queryorsbjct: literal string 'query' or 'sbjct'

    @type  verbose: Boolean
    @param verbose: print debugging info to STDOUT when True

    @rtype:  list
    @return: list with ( intron, ExonOnOrf, intron ) on the query sequence
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)
    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs, KWARGS_PROJECTED_TINYEXON)
    MAX_TINYEXON_NT_LENGTH = 33
    MIN_TINYEXON_NT_LENGTH = 6

    tinyexons = []
    if queryorsbjct == "query":
        donorOrf = pacbporfD.orfQ
        accepOrf = pacbporfA.orfQ
        prjctOrf = pacbporfD.orfS
        alignedDonorRange = pacbporfD.alignment_dna_range_query()
        alignedAccepRange = pacbporfA.alignment_dna_range_query()
    elif queryorsbjct == "sbjct":
        donorOrf = pacbporfD.orfS
        accepOrf = pacbporfA.orfS
        prjctOrf = pacbporfD.orfQ
        alignedDonorRange = pacbporfD.alignment_dna_range_sbjct()
        alignedAccepRange = pacbporfA.alignment_dna_range_sbjct()
    else:
        message = "'queryorsbjct' (%s), not 'query' or 'sbjct'" % queryorsbjct
        raise InproperlyAppliedArgument(message)

    for dObj in donorOrf._donor_sites:
        # do not make a projection OVER the aligned area
        if dObj.pos < min(alignedDonorRange):
            continue
        if queryorsbjct == "query":
            (dPos, dPhase) = pacbporfD.dnaposition_query(dObj.pos, forced_return=True)
        else:
            (dPos, dPhase) = pacbporfD.dnaposition_sbjct(dObj.pos, forced_return=True)
        try:
            algDobj = pacbporfD._positions[dPos]
        except IndexError:
            # site out of range of PacbPORF -> break
            break
        for aObj in accepOrf._acceptor_sites:
            # do not make a projection OVER the aligned area
            if aObj.pos > max(alignedAccepRange):
                continue
            if queryorsbjct == "query":
                (aPos, aPhase) = pacbporfA.dnaposition_query(aObj.pos, forced_return=True)
            else:
                (aPos, aPhase) = pacbporfA.dnaposition_sbjct(aObj.pos, forced_return=True)
            try:
                algAobj = pacbporfA._positions[aPos]
            except IndexError:
                # site out of range of PacbPORF -> break
                break
            if queryorsbjct == "query":
                posDsbjct = algDobj.sbjct_dna_start + dPhase
                posAsbjct = algAobj.sbjct_dna_start + aPhase
            else:
                posDsbjct = algDobj.query_dna_start + dPhase
                posAsbjct = algAobj.query_dna_start + aPhase
            distance = posAsbjct - posDsbjct
            if distance >= MAX_TINYEXON_NT_LENGTH:
                break
            if distance < MIN_TINYEXON_NT_LENGTH:
                continue

            ####################################################
            # generate a ScanForMatches pattern file
            ####################################################
            # example pattern: 6...6 AG NNGNNANNANNGN[2,0,0] GT 3...3
            query = list(prjctOrf.inputgenomicsequence[posDsbjct:posAsbjct])
            # mask all non-phase0 nucleotides to N residues;
            # this represents the regular expression for a specific
            # peptide sequence
            firstphasepositions = range(3 - dPhase % 3, len(query), 3)
            for pos in range(0, len(query)):
                if pos not in firstphasepositions:
                    query[pos] = "N"
            # calculate a ~50% mismatch number
            mismatches = max([0, (len(query) - query.count("N")) / 2])
            # write the pattern to string and subsequently to file
            if kwargs['allow_non_canonical_donor']:
                sfmpat = "%s...%s AG %s[%s,0,0] G (T | C) %s...%s" % (
                    AUSO, AUSO, "".join(query), mismatches, DDSO, DDSO)
            else:
                sfmpat = "%s...%s AG %s[%s,0,0] GT %s...%s" % (
                    AUSO, AUSO, "".join(query), mismatches, DDSO, DDSO)

            ####################################################
            if verbose:
                print (pacbporfD.orfQ.id, pacbporfA.orfQ.id),
                print distance, dObj, aObj
                print sfmpat
            ####################################################

            fname = "sfmpat_tinyexon_%s_%s_%s_%s" % (
                donorOrf.id, accepOrf.id, posDsbjct, posAsbjct)
            fh = open(fname, 'w')
            fh.write(sfmpat + "\n")
            fh.close()

            ####################################################
            # run ScanForMatches
            ####################################################
            command = """echo ">myseq\n%s" | %s %s | tr "[,]" "\t\t#" | """ +\
                      """tr -d "\n " | sed "s/>/\\n>/g" | tr "#" "\t" | """ +\
                      """awk -F'\t' '{ if (NF==4 && $2>%s && $3<%s) """ +\
                      """{ print $1"["$2","$3"]\\n"$4 } }' """
            command = command % (
                donorOrf.inputgenomicsequence,
                EXECUTABLE_SFM, fname,
                dObj.pos + (kwargs['min_intron_nt_length'] - 3),
                aObj.pos - (kwargs['min_intron_nt_length'] - 3))
            co = osPopen(command)
            matches = parseFasta(co.readlines())
            co.close()

            # filter matches for:
            # (1) correct donor & acceptor phase
            # (2) high enough donor & acceptor site scores
            for hdr, seqmatch in matches.iteritems():
                startQ, stopQ = [int(item) for item in
                                 hdr.split(":")[1][1:-1].split(",")]
                exonQstart = startQ + AUSO + 2 - 1
                exonQstop = stopQ - DDSO - 2

                ####################################
                # get Orf object of tinyexon
                ####################################
                tinyexonorf = None
                # select the Orf on which the tinyexon is located
                for orfObj in orfSetObject.get_elegiable_orfs(
                        max_orf_start=exonQstart, min_orf_end=exonQstop):
                    orfPhase = (exonQstart - orfObj.startPY) % 3
                    if orfPhase == dPhase:
                        tinyexonorf = orfObj
                        break
                else:
                    # No tinyexonorf assigned!! In case a regex matched
                    # over a STOP-codon or the regex length is smaller
                    # than the smallest Orf, no Orf can be assigned
                    continue

                # filter for donor & acceptor score
                dScore = _score_splice_site(seqmatch[-9:], splicetype='donor')
                aScore = _score_splice_site(seqmatch[0:11], splicetype='acceptor')
                if dScore < kwargs['min_donor_pssm_score']:
                    continue
                if aScore < kwargs['min_acceptor_pssm_score']:
                    continue

                # scan Orf for splicesites
                tinyexonorf.scan_orf_for_pssm_splice_sites(
                    splicetype="donor",
                    min_pssm_score=kwargs['min_donor_pssm_score'],
                    allow_non_canonical=kwargs['allow_non_canonical_donor'],
                    non_canonical_min_pssm_score=kwargs['non_canonical_min_donor_pssm_score'])
                tinyexonorf.scan_orf_for_pssm_splice_sites(
                    splicetype="acceptor",
                    min_pssm_score=kwargs['min_acceptor_pssm_score'],
                    allow_non_canonical=kwargs['allow_non_canonical_acceptor'],
                    non_canonical_min_pssm_score=kwargs['non_canonical_min_acceptor_pssm_score'])

                # get 1st intron acceptor object
                intron1_aObj = None
                for a in tinyexonorf._acceptor_sites:
                    if a.pos == exonQstart:
                        intron1_aObj = a
                        break
                else:
                    # pseudo-acceptorsite as found by the SFM regex
                    # is not a valid acceptor site of high enough score;
                    # continue to next iteration of (hdr,seqmatch) pair
                    continue

                # get 2nd intron donor object
                intron2_dObj = None
                for d in tinyexonorf._donor_sites:
                    if d.pos == exonQstop:
                        intron2_dObj = d
                        break
                else:
                    # pseudo-donorsite as found by the SFM regex
                    # is not a valid donor site of high enough score;
                    # continue to next iteration of (hdr,seqmatch) pair
                    continue

                # check if introns are of elegiable lengths
                if (intron1_aObj.pos - dObj.pos) > kwargs['max_intron_nt_length']:
                    continue
                if (aObj.pos - intron2_dObj.pos) > kwargs['max_intron_nt_length']:
                    continue

                ####################################################
                if True or verbose:
                    # if here, a candidate!!!
                    print (pacbporfD.orfQ.id, tinyexonorf.id, pacbporfA.orfQ.id),
                    print hdr, dScore, aScore
                    print seqmatch
                ####################################################

                # append to found tinyexons
                query_data = (tinyexonorf, exonQstart, exonQstop)
                sbjct_data = (prjctOrf, posDsbjct, posAsbjct)
                splicesite_data = (dObj, intron1_aObj, intron2_dObj, aObj)
                tinyexons.append((query_data, sbjct_data, splicesite_data))

            # file cleanup
            osRemove(fname)

    # return - End Of Function - if no tinyexons are found
    if not tinyexons:
        return []

    ####################################
    # select the **best** tinyexon
    ####################################
    (query_data, sbjct_data, splicesite_data) = tinyexons[0]
    orfQ, query_dna_start, query_dna_end = query_data
    orfS, sbjct_dna_start, sbjct_dna_end = sbjct_data
    (intron1_dObj, intron1_aObj, intron2_dObj, intron2_aObj) = splicesite_data

    ####################################################
    if verbose:
        print "tinyexon orf:", orfQ
        print "tinyexon orf:", intron1_aObj
        print "tinyexon orf:", intron2_dObj
    ####################################################

    ####################################
    # make tinyexon PacbPORF
    ####################################
    startQaa = orfQ.dnapos2aapos(query_dna_start) - 1
    startSaa = orfS.dnapos2aapos(sbjct_dna_start) - 1
    stopQaa = orfQ.dnapos2aapos(query_dna_end) + 1
    stopSaa = orfS.dnapos2aapos(sbjct_dna_end) + 1
    # check for directly leading stop codon on tinyexon
    while startQaa <= orfQ.protein_startPY:
        startQaa += 1
        startSaa += 1
        query_dna_start += 3
        sbjct_dna_start += 3
    while startSaa <= orfS.protein_startPY:
        startQaa += 1
        startSaa += 1
        query_dna_start += 3
        sbjct_dna_start += 3
    # check for directly tailing stop codon on tinyexon
    while stopQaa > orfQ.protein_endPY:
        stopQaa -= 1
        stopSaa -= 1
        query_dna_end -= 3
        sbjct_dna_end -= 3
    while stopSaa > orfS.protein_endPY:
        stopQaa -= 1
        stopSaa -= 1
        query_dna_end -= 3
        sbjct_dna_end -= 3
    # get sequences
    qAAseq = orfQ.getaas(abs_pos_start=startQaa, abs_pos_end=stopQaa)
    sAAseq = orfS.getaas(abs_pos_start=startSaa, abs_pos_end=stopSaa)

    ####################################################
    if verbose or len(qAAseq) != len(sAAseq):
        # if unequal lengths, error will be raised upon PacbP.__init__()
        print orfQ, qAAseq, startQaa, stopQaa, (stopQaa - startQaa),
        print (query_dna_start, query_dna_end)
        print orfS, sAAseq, startSaa, stopSaa, (stopSaa - startSaa),
        print (sbjct_dna_start, sbjct_dna_end)
        print orfQ.inputgenomicsequence[query_dna_start - 2:query_dna_end + 2]
        print orfS.inputgenomicsequence[sbjct_dna_start - 2:sbjct_dna_end + 2]
    ####################################################

    # initialize extended tinyexon PacbPORF
    from pacb import PacbP
    pacbp = PacbP(input=(qAAseq, sAAseq, startQaa, startSaa))
    pacbp.strip_unmatched_ends()
    pacbporf = pacbp2pacbporf(pacbp, orfQ, orfS)
    pacbporf.extend_pacbporf_after_stops()
    pacbporf.source = 'ABGPprojectingTE'

    ####################################
    # make introns
    ####################################
    intron1 = IntronConnectingOrfs(intron1_dObj, intron1_aObj, None,
                                   donorOrf, pacbporf.orfQ)
    intron2 = IntronConnectingOrfs(intron2_dObj, intron2_aObj, None,
                                   pacbporf.orfQ, accepOrf)

    ################################################################
    # set some meta-data properties on the intron objects
    ################################################################
    # add distance score to intron
    intron1._distance = 0
    intron2._distance = 0
    # add Alignment Positional Periphery Score into objects
    if queryorsbjct == "query":
        succes = set_apps_intron_query(intron1, pacbporfD, pacbporf)
        succes = set_apps_intron_query(intron2, pacbporf, pacbporfA)
    else:
        succes = set_apps_intron_sbjct(intron1, pacbporfD, pacbporf)
        succes = set_apps_intron_sbjct(intron2, pacbporf, pacbporfA)
    # set GFF fsource attribute for recognition of intron sources
    intron1._gff['fsource'] = "ABGPprojectingTE"
    intron2._gff['fsource'] = "ABGPprojectingTE"
    # create _linked_to_xxx attributes
    intron1._linked_to_pacbporfs = [pacbporf]
    intron2._linked_to_pacbporfs = [pacbporf]
    intron1._linked_to_introns = [intron2]
    intron2._linked_to_introns = [intron1]

    ####################################################
    if verbose:
        print pacbporf
        pacbporf.print_protein_and_dna()
        print intron1
        print intron2
    if False:
        # printing data when this function needs to be debugged:
        print ""
        print intron1
        print intron2
        print ""
        print pacbporfD
        pacbporfD.print_protein_and_dna()
        print ""
        print pacbporf
        pacbporf.print_protein_and_dna()
        print ""
        print pacbporfA
        pacbporfA.print_protein_and_dna()
        import sys
        sys.exit()
    ####################################################

    # return introns and intermediate tinyexon PacbPORF
    return [(intron1, intron2, pacbporf)]
    return polygonPolyData    # end of the (not shown) polydata construction function


# FFD transform
polygonPolyData = FFD(polygonPolyData)

Mapper = vtk.vtkPolyDataMapper()
Mapper.SetInputData(polygonPolyData)

Actor = vtk.vtkActor()
Actor.SetMapper(Mapper)

Ren1 = vtk.vtkRenderer()
Ren1.AddActor(Actor)

renWin = vtk.vtkRenderWindow()
renWin.AddRenderer(Ren1)

# save the obj data
dir0 = r"C:\Users\hxu13\Desktop\pp\ExportData"
porter = vtk.vtkOBJExporter()
porter.SetFilePrefix(dir0 + r"\cells")
porter.SetInput(renWin)
porter.Write()
osRemove(dir0 + r'\cells.mtl')

iren = vtk.vtkRenderWindowInteractor()
iren.SetRenderWindow(renWin)
iren.Initialize()
iren.Start()
def tearDown(self):
    osRemove(TestGMLExporter.UNIT_TEST_FILENAME)
def _cleanupTempFile(self):
    osRemove(ToFastEdit.FAST_EDIT_TEMP_FILE)