示例#1
0
def make_gff(fasta_file, tabular_file, gff_file, cut_method):
    """Write SignalP cleavage predictions as a GFF3 file.

    The FASTA records (from ``fasta_iterator(fasta_file)``) and the data
    lines of the tabular file are walked in lockstep; for each non-empty
    sequence a ``##sequence-region`` line, an optional ``signal_peptide``
    feature and a ``mature_protein_region`` feature are written.

    Arguments:
     - fasta_file - passed to ``fasta_iterator``, which is expected to
       yield ``(title, sequence)`` pairs.
     - tabular_file - tab separated file whose first line starts with
       ``#ID<tab>`` and whose first column is an identifier which must
       be a prefix of the matching FASTA title.
     - gff_file - output filename, overwritten if it already exists.
     - cut_method - one of "NN_Cmax", "NN_Ymax", "NN_Smax" or
       "HMM_Cmax"; selects which columns hold the cut position and
       its score.

    Raises KeyError for an unknown cut_method, and AssertionError if the
    tabular header, the identifiers or the cut position fail the sanity
    checks below.
    """
    # Zero-based (cut position column, score column) for each method
    cut_col, score_col = {
        "NN_Cmax": (2, 1),
        "NN_Ymax": (5, 4),
        "NN_Smax": (8, 7),
        "HMM_Cmax": (16, 15),
    }[cut_method]

    source = "SignalP"
    strand = "."  # not stranded
    phase = "."  # not phased
    tags = "Note=%s" % cut_method
    feature_line = "%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n"

    # Context managers guarantee both files are closed exactly once, after
    # the whole loop has finished (or on error). Previously the tabular
    # handle was closed inside the loop, breaking multi-record input.
    with open(tabular_file) as tab_handle:
        line = tab_handle.readline()
        assert line.startswith("#ID\t"), line

        # Only create the output once the header has been validated
        with open(gff_file, "w") as gff_handle:
            gff_handle.write("##gff-version 3\n")

            records = zip(fasta_iterator(fasta_file), tab_handle)
            for (title, seq), line in records:
                parts = line.rstrip("\n").split("\t")
                seqid = parts[0]
                assert title.startswith(seqid), "%s vs %s" % (seqid, title)
                if len(seq) == 0:
                    # Is it possible to have a zero length reference in GFF3?
                    continue
                cut = int(parts[cut_col])
                if cut == 0:
                    assert cut_method == "HMM_Cmax", cut_method
                    # TODO - Why does it do this?
                    cut = 1
                assert 1 <= cut <= len(seq), \
                    "%i for %s len %i" % (cut, seqid, len(seq))
                score = parts[score_col]
                gff_handle.write("##sequence-region %s %i %i\n"
                                 % (seqid, 1, len(seq)))
                # If the cut is at the very beginning, there is no
                # signal peptide!
                if cut > 1:
                    # signal_peptide = SO:0000418
                    gff_handle.write(feature_line
                                     % (seqid, source,
                                        "signal_peptide", 1, cut - 1,
                                        score, strand, phase, tags))
                # mature_protein_region = SO:0000419
                gff_handle.write(feature_line
                                 % (seqid, source,
                                    "mature_protein_region", cut, len(seq),
                                    score, strand, phase, tags))
示例#2
0
def make_gff(fasta_file, tabular_file, gff_file, cut_method):
    """Write SignalP cleavage predictions as a GFF3 file.

    The FASTA records (from ``fasta_iterator(fasta_file)``) and the data
    lines of the tabular file are walked in lockstep; for each non-empty
    sequence a ``##sequence-region`` line, an optional ``signal_peptide``
    feature and a ``mature_protein_region`` feature are written.

    Arguments:
     - fasta_file - passed to ``fasta_iterator``, which is expected to
       yield ``(title, sequence)`` pairs.
     - tabular_file - tab separated file whose first line starts with
       ``#ID<tab>`` and whose first column is an identifier which must
       be a prefix of the matching FASTA title.
     - gff_file - output filename, overwritten if it already exists.
     - cut_method - one of "NN_Cmax", "NN_Ymax", "NN_Smax" or
       "HMM_Cmax"; selects which columns hold the cut position and
       its score.

    Raises KeyError for an unknown cut_method, and AssertionError if the
    tabular header, the identifiers or the cut position fail the sanity
    checks below.
    """
    # Zero-based (cut position column, score column) for each method
    cut_col, score_col = {"NN_Cmax": (2, 1),
                          "NN_Ymax": (5, 4),
                          "NN_Smax": (8, 7),
                          "HMM_Cmax": (16, 15),
                          }[cut_method]

    source = "SignalP"
    strand = "."  # not stranded
    phase = "."  # not phased
    tags = "Note=%s" % cut_method
    feature_line = "%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n"

    # Context managers guarantee both files are closed exactly once, after
    # the whole loop has finished (or on error). Previously the tabular
    # handle was closed inside the loop, breaking multi-record input.
    with open(tabular_file) as tab_handle:
        line = tab_handle.readline()
        assert line.startswith("#ID\t"), line

        # Only create the output once the header has been validated
        with open(gff_file, "w") as gff_handle:
            gff_handle.write("##gff-version 3\n")

            records = zip(fasta_iterator(fasta_file), tab_handle)
            for (title, seq), line in records:
                parts = line.rstrip("\n").split("\t")
                seqid = parts[0]
                assert title.startswith(seqid), "%s vs %s" % (seqid, title)
                if len(seq) == 0:
                    # Is it possible to have a zero length reference in GFF3?
                    continue
                cut = int(parts[cut_col])
                if cut == 0:
                    assert cut_method == "HMM_Cmax", cut_method
                    # TODO - Why does it do this?
                    cut = 1
                assert 1 <= cut <= len(seq), \
                    "%i for %s len %i" % (cut, seqid, len(seq))
                score = parts[score_col]
                gff_handle.write("##sequence-region %s %i %i\n"
                                 % (seqid, 1, len(seq)))
                # If the cut is at the very beginning, there is no
                # signal peptide!
                if cut > 1:
                    # signal_peptide = SO:0000418
                    gff_handle.write(feature_line
                                     % (seqid, source,
                                        "signal_peptide", 1, cut - 1,
                                        score, strand, phase, tags))
                # mature_protein_region = SO:0000419
                gff_handle.write(feature_line
                                 % (seqid, source,
                                    "mature_protein_region", cut, len(seq),
                                    score, strand, phase, tags))
示例#3
0
        raise ValueError("Could not determine version of %s" % exe)


# Run hmmsearch for Whisson et al. (2007)
if model == "Whisson2007":
    hmm_file = os.path.join(
        os.path.split(sys.argv[0])[0], "whisson_et_al_rxlr_eer_cropped.hmm"
    )
    if not os.path.isfile(hmm_file):
        sys.exit("Missing HMM file for Whisson et al. (2007)")
    if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"):
        sys.exit("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_search)

    hmm_hits = set()
    valid_ids = set()
    for title, seq in fasta_iterator(fasta_file):
        name = title.split(None, 1)[0]
        if name in valid_ids:
            sys.exit("Duplicated identifier %r" % name)
        else:
            valid_ids.add(name)
    if not valid_ids:
        # Special case, don't need to run HMMER if there are no sequences
        pass
    else:
        # I've left the code to handle HMMER 3 in situ, in case
        # we revisit the choice to insist on HMMER 2.
        hmmer3 = 3 == get_hmmer_version(hmmer_search)
        # Using zero (or 5.6?) for bitscore threshold
        if hmmer3:
            # The HMMER3 table output is easy to parse
示例#4
0
    else:
        raise ValueError("Could not determine version of %s" % exe)


# Run hmmsearch for Whisson et al. (2007)
if model == "Whisson2007":
    hmm_file = os.path.join(os.path.split(sys.argv[0])[0],
                       "whisson_et_al_rxlr_eer_cropped.hmm")
    if not os.path.isfile(hmm_file):
        sys.exit("Missing HMM file for Whisson et al. (2007)")
    if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"):
        sys.exit("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_search)

    hmm_hits = set()
    valid_ids = set()
    for title, seq in fasta_iterator(fasta_file):
        name = title.split(None, 1)[0]
        if name in valid_ids:
            sys.exit("Duplicated identifier %r" % name)
        else:
            valid_ids.add(name)
    if not valid_ids:
        # Special case, don't need to run HMMER if there are no sequences
        pass
    else:
        # I've left the code to handle HMMER 3 in situ, in case
        # we revisit the choice to insist on HMMER 2.
        hmmer3 = (3 == get_hmmer_version(hmmer_search))
        # Using zero (or 5.6?) for bitscore threshold
        if hmmer3:
            # The HMMER3 table output is easy to parse