Пример #1
0
def taxonomy_ref(in_filename, ref_taxonomy_filename, out_filename):
    """Write the reference taxonomy of every sequence in a FASTA file.

    1) load reference taxonomy file (REF_TAXONOMY_FILENAME)
    2) write the taxonomy file (OUT_FILENAME): one tab-separated row
       "<sequence id>\t<taxonomy>" per record of IN_FILENAME

    Args:
        in_filename: FASTA file whose record IDs are looked up.
        ref_taxonomy_filename: tab-separated file; the first column is the
            sequence ID and the second column is the taxonomy string handed
            to utils.parse_taxonomy. Blank lines are ignored.
        out_filename: destination of the tab-separated taxonomy table
            (overwritten if it exists).

    Raises:
        KeyError: if a record ID of IN_FILENAME is not present in the
            reference taxonomy file.
    """

    # load the taxonomy file
    ref_taxonomy = dict()
    with open(ref_taxonomy_filename, 'r') as ref_taxonomy_handler:
        for row in csv.reader(ref_taxonomy_handler, delimiter='\t'):
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing
                # newline); indexing them would raise IndexError.
                continue
            ref_taxonomy[row[0]] = utils.parse_taxonomy(row[1])

    # write the taxonomy file; the context manager closes the handle even
    # when a record ID is missing from the reference taxonomy
    with open(out_filename, 'w') as out_handle:
        out_writer = csv.writer(out_handle, delimiter='\t',
                                lineterminator='\n')
        for record in SeqIO.parse(in_filename, "fasta"):
            out_writer.writerow([record.id, ref_taxonomy[record.id]])
Пример #2
0
def _run_blast_command(cmd, logger):
    """Run CMD with stdout discarded; raise Exception(stderr) on failure."""
    logger.info(' '.join(cmd))
    with open(os.devnull, "w") as devnull:
        proc = subprocess.Popen(cmd, stdout=devnull, stderr=subprocess.PIPE)
        _, out_stderr = proc.communicate()
    if proc.returncode:
        logger.error(out_stderr)
        raise Exception(out_stderr)


def taxonomy_blast(in_filename, ref_filename, ref_taxonomy_filename,
                   out_filename, task='blastn', num_threads=1, evalue=10e-30,
                   perc_identity=90):
    """Assign taxonomy.

    1) makeblastdb -in REF_FILENAME -parse_seqids -dbtype nucl -out DB_FILENAME
    2) blastn -task blastn -db DB_FILENAME -query IN_FILENAME -out BLAST_OUT_FILENAME[.xml]
       -evalue EVALUE -perc_identity PERC_IDENTITY -num_threads NUM_THREADS -outfmt 5
    3) load reference taxonomy file (REF_TAXONOMY_FILENAME)
    4) write the taxonomy file (OUT_FILENAME)

    Args:
        in_filename: query sequences passed to blastn via -query.
        ref_filename: reference sequences used to build the BLAST database.
        ref_taxonomy_filename: tab-separated file; the first column is the
            sequence ID and the second column is the taxonomy string handed
            to utils.parse_taxonomy. Blank lines are ignored.
        out_filename: destination of the tab-separated taxonomy table. Its
            path without extension is also used as the prefix of the
            temporary BLAST database and BLAST XML output.
        task: value of the blastn -task option.
        num_threads: value of the blastn -num_threads option.
        evalue: value of the blastn -evalue option. Note that the default
            literal 10e-30 equals 1e-29.
        perc_identity: value of the blastn -perc_identity option.

    Queries without any alignment are written with the taxonomy "Unknown".

    Raises:
        Exception: if makeblastdb or blastn exits with a non-zero status;
            the message is the captured stderr of the tool.
        KeyError: if the ID of a best hit is not present in the reference
            taxonomy file.
    """

    logger = logging.getLogger('otu.taxonomy_blast')
    basepath = os.path.splitext(out_filename)[0]
    db_prefix = basepath + "_BLAST_DB_TMP"
    blast_out_filename = basepath + "_BLAST_OUT_TMP.xml"

    try:
        # makeblastdb
        _run_blast_command(
            ["makeblastdb", "-in", ref_filename, "-parse_seqids", "-dbtype",
             "nucl", "-out", db_prefix],
            logger)

        # blastn
        _run_blast_command(
            ["blastn", "-task", task, "-db", db_prefix, "-query", in_filename,
             "-out", blast_out_filename, "-evalue", str(evalue),
             "-perc_identity", str(perc_identity), "-num_threads",
             str(num_threads), "-outfmt", "5"],
            logger)

        logger.info("write the taxonomy file %s" % os.path.basename(out_filename))

        # load reference taxonomy file
        ref_taxonomy = dict()
        with open(ref_taxonomy_filename, 'r') as ref_taxonomy_handler:
            for row in csv.reader(ref_taxonomy_handler, delimiter='\t'):
                if not row:
                    # csv.reader yields [] for blank lines (e.g. a trailing
                    # newline); indexing them would raise IndexError.
                    continue
                ref_taxonomy[row[0]] = utils.parse_taxonomy(row[1])

        # write the output file
        with open(blast_out_filename, 'r') as blast_out_handle, \
                open(out_filename, 'w') as out_handle:
            out_writer = csv.writer(out_handle, delimiter='\t',
                                    lineterminator='\n')
            for record in NCBIXML.parse(blast_out_handle):
                if record.alignments:
                    # only the first reported alignment is used as best hit
                    best_hit_id = record.alignments[0].hit_id
                    ta = ref_taxonomy[best_hit_id]
                else:
                    ta = "Unknown"
                out_writer.writerow([record.query, ta])
    finally:
        # delete tmp files, also when one of the steps above failed, so that
        # a failed run does not leave the BLAST database or XML output behind
        for filename in glob.glob(db_prefix + '*'):
            os.remove(filename)
        if os.path.exists(blast_out_filename):
            os.remove(blast_out_filename)