Exemplo n.º 1
0
def hmmsearch_print_then_parse(gbkf, hmmf):
    """
    Search the CDS proteins of a GenBank file against an HMM file with hmmsearch.

    Every CDS feature is written as a FASTA record to a temporary file (using
    the feature's 'translation' qualifier when present, otherwise translating
    the extracted feature), hmmsearch is run on that file, and its text output
    is parsed with SearchIO. A one-line summary of the number of results and
    hits is printed to stdout. The temporary file is removed before returning.

    :param gbkf: the GenBank file, handed to genbank_seqio
    :param hmmf: the HMM file, handed to hmmsearch
    :return: None
    :raises SystemExit: if hmmsearch cannot be started or exits with a non-zero status
    """
    # Local import so this function does not depend on the module importing os.
    import os

    # delete=False because hmmsearch must reopen the file by name after we
    # close it; we are therefore responsible for removing it ourselves.
    aaout = NamedTemporaryFile(mode='w+t', delete=False)
    try:
        sys.stderr.write(f"Writing the amino acids to {aaout.name}\n")
        with aaout:
            for seq in genbank_seqio(gbkf):
                for feat in seq.features:
                    if feat.type != 'CDS':
                        continue
                    if 'translation' in feat.qualifiers:
                        aa = feat.qualifiers['translation'][0]
                    else:
                        aa = str(feat.extract(seq).translate().seq)
                    myid = feature_id(seq, feat)
                    aaout.write(f">{myid}\n{aa}\n")

        sys.stderr.write("Searching\n")
        try:
            search = subprocess.Popen([
                "hmmsearch", '--cpu', '6', '-E', '1e-10', '--domE', '1e-5',
                '--noali', hmmf, aaout.name
            ],
                                      stdout=subprocess.PIPE)
        except (OSError, subprocess.SubprocessError) as e:
            # Popen reports a missing or non-executable binary as OSError; it
            # never raises CalledProcessError.
            sys.stderr.write(f"Error running hmmsearch:\n{e}\n")
            sys.exit(-1)

        sys.stderr.write("Parsing\n")
        hmmresult = search.communicate()[0]
        if search.returncode != 0:
            # Without this check a failed search would be reported as "0 hits".
            sys.stderr.write(
                f"Error running hmmsearch: exit status {search.returncode}\n")
            sys.exit(-1)
    finally:
        # Runs on success, on exceptions and on sys.exit, so the file never leaks.
        if not aaout.closed:
            aaout.close()
        os.unlink(aaout.name)

    results = SearchIO.parse(StringIO(hmmresult.decode()), 'hmmer3-text')
    allhits = {}
    hitcount = 0
    rescount = 0
    for res in results:
        allhits[res.id] = {}

        rescount += 1
        for hit in res:
            allhits[res.id][hit.id] = hit.evalue
            hitcount += 1

    print(
        f"Using hmmsearch and tempfiles there were {rescount} results and {hitcount} hits, and our dict has {len(allhits)} entries"
    )
Exemplo n.º 2
0
def stream_hmmsearch(gbkf, hmmf):
    """
    WARNING: THIS APPROACH DOES NOT WORK!

    It tries to pipe the proteins of each sequence record to hmmsearch on
    stdin, but hmmsearch cannot rewind a streamed sequence source. Use hmmscan
    (slow) or write the proteins to a temp file first (fast) instead.

    For every record, each CDS feature is turned into a FASTA entry (the
    'translation' qualifier when present, otherwise a translation of the
    extracted feature), the entries are sent to one hmmsearch process, and the
    text output is parsed with SearchIO. A one-line summary is printed.

    :param gbkf: the GenBank file, handed to genbank_seqio
    :param hmmf: the HMM file, handed to hmmsearch
    :return: None
    """
    # '-' as the sequence file asks hmmsearch to read from stdin.
    command = [
        "hmmsearch", '--cpu', '6', '-E', '1e-10', '--domE', '1e-5',
        '--noali', hmmf, '-'
    ]

    evalues = {}
    n_results = 0
    n_hits = 0

    for record in genbank_seqio(gbkf):
        fasta = []
        for feature in record.features:
            if feature.type == 'CDS':
                if 'translation' in feature.qualifiers:
                    protein = feature.qualifiers['translation'][0]
                else:
                    protein = str(feature.extract(record).translate().seq)
                header = feature_id(record, feature)
                fasta.append(f">{header}\n{protein}")

        payload = "\n".join(fasta).encode()
        proc = subprocess.Popen(command,
                                stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE)
        raw_output = proc.communicate(input=payload)[0]

        parsed = SearchIO.parse(StringIO(raw_output.decode()), 'hmmer3-text')
        for result in parsed:
            # A repeated query id starts again from an empty dict.
            evalues[result.id] = {}
            n_results += 1
            for hit in result:
                evalues[result.id][hit.id] = hit.evalue
                n_hits += 1

    print(
        f"Using hmmsearch and streaming all at once there were {n_results} results and {n_hits} hits, and our dict has {len(evalues)} entries"
    )
Exemplo n.º 3
0
def run_hmmscan_aao(gbkf, hmmf):
    """
    Run hmmscan once per sequence record, streaming that record's CDS proteins on stdin.

    Each CDS feature becomes a FASTA entry, using its 'translation' qualifier
    when present and a translation of the extracted feature otherwise. The
    hmmscan text output is parsed with SearchIO and the e-value of every hit is
    collected per query. A one-line summary of the counts is printed to stdout.

    :param gbkf: the GenBank file, handed to genbank_seqio
    :param hmmf: the HMM file, handed to hmmscan
    :return: None
    """
    hits_by_query = {}
    total_hits = 0
    total_results = 0

    for record in genbank_seqio(gbkf):
        entries = []
        for cds in record.features:
            if cds.type != 'CDS':
                continue
            peptide = (cds.qualifiers['translation'][0]
                       if 'translation' in cds.qualifiers else str(
                           cds.extract(record).translate().seq))
            name = feature_id(record, cds)
            entries.append(f">{name}\n{peptide}")

        # The trailing '-' makes hmmscan read the sequences from stdin.
        scan = subprocess.Popen(
            [
                "hmmscan", '--cpu', '6', '-E', '1e-10', '--domE', '1e-5',
                '--noali', hmmf, '-'
            ],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )
        fasta_bytes = "\n".join(entries).encode()
        scan_stdout = scan.communicate(input=fasta_bytes)[0]

        report = StringIO(scan_stdout.decode())
        for query in SearchIO.parse(report, 'hmmer3-text'):
            # Rebinding replaces any earlier entry recorded for the same query id.
            per_query = hits_by_query[query.id] = {}
            total_results += 1
            for hit in query:
                per_query[hit.id] = hit.evalue
                total_hits += 1

    print(
        f"Using hmmscan and streaming all at once there were {total_results} results and {total_hits} hits, and our dict has {len(hits_by_query)} entries"
    )