def make_gff(fasta_file, tabular_file, gff_file, cut_method):
    """Write a GFF3 file of signal peptide / mature protein regions.

    The FASTA file and the tabular file are read in lock-step: the n-th
    data row of the table is paired with the n-th FASTA record, and the
    row's first column must be a prefix of the FASTA title.

    Arguments:
     - fasta_file - passed to fasta_iterator, which is expected to yield
       (title, sequence) pairs.
     - tabular_file - tab separated input whose first line must start
       with "#ID" followed by a tab.
     - gff_file - output filename, opened for writing (overwritten).
     - cut_method - one of "NN_Cmax", "NN_Ymax", "NN_Smax" or "HMM_Cmax";
       selects which pair of table columns supplies the cut position and
       the score.

    For every non-empty sequence a "##sequence-region" line is written,
    followed by a "signal_peptide" feature covering 1..cut-1 (only when
    cut > 1) and a "mature_protein_region" feature covering cut..length.

    Raises KeyError for an unknown cut_method, AssertionError when the
    header, identifiers or cut position fail the sanity checks, and
    ValueError if the cut column is not an integer.
    """
    # Zero-based (cut position column, score column) for each method.
    cut_col, score_col = {
        "NN_Cmax": (2, 1),
        "NN_Ymax": (5, 4),
        "NN_Smax": (8, 7),
        "HMM_Cmax": (16, 15),
    }[cut_method]

    source = "SignalP"
    strand = "."  # not stranded
    phase = "."  # not phased
    tags = "Note=%s" % cut_method
    # Nine tab separated GFF3 columns; start and end are integers.
    feature_fmt = "%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n"

    # Context managers guarantee both handles are closed (and the output
    # flushed) even if one of the checks below fails part way through.
    with open(tabular_file) as tab_handle:
        line = tab_handle.readline()
        assert line.startswith("#ID\t"), line

        # Only create/truncate the output once the header looks right.
        with open(gff_file, "w") as gff_handle:
            gff_handle.write("##gff-version 3\n")

            # NOTE(review): zip() stops at the shorter input, so a row
            # count mismatch between the two files is silently ignored.
            for (title, seq), line in zip(fasta_iterator(fasta_file), tab_handle):
                parts = line.rstrip("\n").split("\t")
                seqid = parts[0]
                assert title.startswith(seqid), "%s vs %s" % (seqid, title)

                length = len(seq)
                if length == 0:
                    # Is it possible to have a zero length reference in GFF3?
                    continue

                cut = int(parts[cut_col])
                if cut == 0:
                    assert cut_method == "HMM_Cmax", cut_method
                    # TODO - Why does it do this?
                    cut = 1
                assert 1 <= cut <= length, "%i for %s len %i" % (cut, seqid, length)

                score = parts[score_col]
                gff_handle.write("##sequence-region %s %i %i\n" % (seqid, 1, length))

                # If the cut is at the very beginning, there is no signal peptide!
                if cut > 1:
                    # signal_peptide = SO:0000418
                    gff_handle.write(
                        feature_fmt
                        % (seqid, source, "signal_peptide", 1, cut - 1,
                           score, strand, phase, tags)
                    )
                # mature_protein_region = SO:0000419
                gff_handle.write(
                    feature_fmt
                    % (seqid, source, "mature_protein_region", cut, length,
                       score, strand, phase, tags)
                )
def make_gff(fasta_file, tabular_file, gff_file, cut_method):
    """Write a GFF3 file of signal peptide / mature protein regions.

    The FASTA file and the tabular file are read in lock-step: the n-th
    data row of the table is paired with the n-th FASTA record, and the
    row's first column must be a prefix of the FASTA title.

    Arguments:
     - fasta_file - passed to fasta_iterator, which is expected to yield
       (title, sequence) pairs.
     - tabular_file - tab separated input whose first line must start
       with "#ID" followed by a tab.
     - gff_file - output filename, opened for writing (overwritten).
     - cut_method - one of "NN_Cmax", "NN_Ymax", "NN_Smax" or "HMM_Cmax";
       selects which pair of table columns supplies the cut position and
       the score.

    For every non-empty sequence a "##sequence-region" line is written,
    followed by a "signal_peptide" feature covering 1..cut-1 (only when
    cut > 1) and a "mature_protein_region" feature covering cut..length.

    Raises KeyError for an unknown cut_method, AssertionError when the
    header, identifiers or cut position fail the sanity checks, and
    ValueError if the cut column is not an integer.
    """
    # Zero-based (cut position column, score column) for each method.
    cut_col, score_col = {
        "NN_Cmax": (2, 1),
        "NN_Ymax": (5, 4),
        "NN_Smax": (8, 7),
        "HMM_Cmax": (16, 15),
    }[cut_method]

    source = "SignalP"
    strand = "."  # not stranded
    phase = "."  # not phased
    tags = "Note=%s" % cut_method
    # Nine tab separated GFF3 columns; start and end are integers.
    feature_fmt = "%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n"

    # Context managers guarantee both handles are closed (and the output
    # flushed) even if one of the checks below fails part way through.
    with open(tabular_file) as tab_handle:
        line = tab_handle.readline()
        assert line.startswith("#ID\t"), line

        # Only create/truncate the output once the header looks right.
        with open(gff_file, "w") as gff_handle:
            gff_handle.write("##gff-version 3\n")

            # NOTE(review): zip() stops at the shorter input, so a row
            # count mismatch between the two files is silently ignored.
            for (title, seq), line in zip(fasta_iterator(fasta_file), tab_handle):
                parts = line.rstrip("\n").split("\t")
                seqid = parts[0]
                assert title.startswith(seqid), "%s vs %s" % (seqid, title)

                length = len(seq)
                if length == 0:
                    # Is it possible to have a zero length reference in GFF3?
                    continue

                cut = int(parts[cut_col])
                if cut == 0:
                    assert cut_method == "HMM_Cmax", cut_method
                    # TODO - Why does it do this?
                    cut = 1
                assert 1 <= cut <= length, "%i for %s len %i" % (cut, seqid, length)

                score = parts[score_col]
                gff_handle.write("##sequence-region %s %i %i\n" % (seqid, 1, length))

                # If the cut is at the very beginning, there is no signal peptide!
                if cut > 1:
                    # signal_peptide = SO:0000418
                    gff_handle.write(
                        feature_fmt
                        % (seqid, source, "signal_peptide", 1, cut - 1,
                           score, strand, phase, tags)
                    )
                # mature_protein_region = SO:0000419
                gff_handle.write(
                    feature_fmt
                    % (seqid, source, "mature_protein_region", cut, length,
                       score, strand, phase, tags)
                )
raise ValueError("Could not determine version of %s" % exe) # Run hmmsearch for Whisson et al. (2007) if model == "Whisson2007": hmm_file = os.path.join( os.path.split(sys.argv[0])[0], "whisson_et_al_rxlr_eer_cropped.hmm" ) if not os.path.isfile(hmm_file): sys.exit("Missing HMM file for Whisson et al. (2007)") if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"): sys.exit("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_search) hmm_hits = set() valid_ids = set() for title, seq in fasta_iterator(fasta_file): name = title.split(None, 1)[0] if name in valid_ids: sys.exit("Duplicated identifier %r" % name) else: valid_ids.add(name) if not valid_ids: # Special case, don't need to run HMMER if there are no sequences pass else: # I've left the code to handle HMMER 3 in situ, in case # we revisit the choice to insist on HMMER 2. hmmer3 = 3 == get_hmmer_version(hmmer_search) # Using zero (or 5.6?) for bitscore threshold if hmmer3: # The HMMER3 table output is easy to parse
else: raise ValueError("Could not determine version of %s" % exe) # Run hmmsearch for Whisson et al. (2007) if model == "Whisson2007": hmm_file = os.path.join(os.path.split(sys.argv[0])[0], "whisson_et_al_rxlr_eer_cropped.hmm") if not os.path.isfile(hmm_file): sys.exit("Missing HMM file for Whisson et al. (2007)") if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"): sys.exit("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_search) hmm_hits = set() valid_ids = set() for title, seq in fasta_iterator(fasta_file): name = title.split(None, 1)[0] if name in valid_ids: sys.exit("Duplicated identifier %r" % name) else: valid_ids.add(name) if not valid_ids: # Special case, don't need to run HMMER if there are no sequences pass else: # I've left the code to handle HMMER 3 in situ, in case # we revisit the choice to insist on HMMER 2. hmmer3 = (3 == get_hmmer_version(hmmer_search)) # Using zero (or 5.6?) for bitscore threshold if hmmer3: # The HMMER3 table output is easy to parse