예제 #1
0
def align_reference_genome(config, query_genome_path, ref_genome_id):
    """Perform per-genome calculation of ANI/conserved DNA values.

    DNA fragments are built from the reference genome file
    ``<ref_genome_id>.fna.gz`` located below ``config['db_path']`` and are
    aligned against the query genome via nucmer. All intermediate files live
    in a private temporary directory that is removed before returning, also
    when fragment building or the alignment fails.

    :param config: a global config object encapsulating global runtime vars
    :param query_genome_path: Path to query genome Fasta file.
    :param ref_genome_id: reference genome id.

    :rtype: A tuple ``(ref_genome_id, ani, conserved_dna)``.
    """

    reference_genome_path = config['db_path'].joinpath(
        f'{ref_genome_id}.fna.gz')
    tmp_dir = Path(tempfile.mkdtemp())
    try:
        dna_fragments_path = tmp_dir.joinpath('dna-fragments.fasta')
        dna_fragments = util.build_dna_fragments(reference_genome_path,
                                                 dna_fragments_path)

        # perform global alignments via nucmer
        dna_fragment_matches = execute_nucmer(config, tmp_dir, dna_fragments,
                                              dna_fragments_path,
                                              query_genome_path)
    finally:
        # always drop the scratch directory so that a failing run does not
        # leave temporary directories behind
        shutil.rmtree(str(tmp_dir))

    ani = calculate_ani(dna_fragment_matches)
    conserved_dna = calculate_conserved_dna(dna_fragments,
                                            dna_fragment_matches)

    return (ref_genome_id, ani, conserved_dna)
예제 #2
0
def test_build_dna_fragments(tmpdir):
    """Check the fragment lengths reported by ``ru.build_dna_fragments``."""
    fasta_file = Path('tests/Salmonella_enterica_CFSAN000189.fasta').resolve()
    fragments = ru.build_dna_fragments(fasta_file, tmpdir / 'fragments.fna')

    # the fragment stored under key 1 has to be exactly FRAGMENT_SIZE long
    first_length = fragments[1]['length']
    assert first_length == rc.FRAGMENT_SIZE

    # the fragment with the highest key must differ from the standard size
    highest_id = sorted(fragments)[-1]
    last_length = fragments[highest_id]['length']
    assert last_length != rc.FRAGMENT_SIZE
예제 #3
0
def cohort(args, config):
    """Run the cohort analysis and print references shared by all genomes.

    Every path in ``args.cohort_genomes`` is validated, Mash is executed to
    prefilter the reference genomes, each query genome is aligned against
    the screened references (and vice versa if ``args.bidirectional`` is
    set) and the references that pass the thresholds for *every* query
    genome are printed to STDOUT as a tab separated table.

    :param args: parsed command line arguments; ``cohort_genomes``,
        ``verbose``, ``threads``, ``bidirectional`` and ``unfiltered`` are
        read here and the object is forwarded to ``algo.calculate``.
    :param config: a global config object encapsulating global runtime vars.
        ``config['genome_path']`` is set to the list of checked query paths
        and the directory ``config['tmp']`` is removed once the alignments
        are done (also if they fail).

    Exits via ``sys.exit`` if a genome path fails validation.
    """
    all_paths = []
    for path in args.cohort_genomes:
        try:
            all_paths.append(util.check_path(path))
        except FileNotFoundError:
            sys.exit('ERROR: genome file %s is not readable!' % path)
        except PermissionError:
            sys.exit('ERROR (permission): genome file %s is not accessible' %
                     path)
        except OSError:
            sys.exit('ERROR: genome file %s is empty!' % path)
    if not all_paths:
        sys.exit(
            "ERROR: an unexpected path error has occured. Please check your query paths."
        )
    config['genome_path'] = all_paths

    try:
        # mashing
        if args.verbose:
            print('\nEstimate genome distances...')
        mash_output_path = config['tmp'].joinpath('mash.out')
        mash.exec_mash(config, mash_output_path)

        # Parse mash results
        _mash_results, filtered_ids, mash_distances_list = mash.parse_mash_cohort(
            config, mash_output_path)

        # get genomes from RefSeq by accessions
        ref_genomes = util.read_reference_genomes(
            config)  # read database reference genomes
        screened_ref_genome_ids = [k for k in ref_genomes if k in filtered_ids]

        # build DNA fragments; every query genome gets its own fragment file
        # so that a later genome cannot overwrite the fragments of an earlier
        # one before they are aligned
        dna_fragments_paths = []
        dna_fragments_list = []
        for index, path in enumerate(config['genome_path']):
            dna_fragments_path = config['tmp'].joinpath(
                f'dna-fragments-{index}.fasta')
            dna_fragments_list.append(
                util.build_dna_fragments(path, dna_fragments_path))
            dna_fragments_paths.append(dna_fragments_path)

        # align query fragments to reference genomes and compute ANI/conserved DNA
        cohort_results = []
        query_genomes = []
        if args.verbose:
            print('\nCompute ANIs...')
        with cf.ThreadPoolExecutor(max_workers=args.threads) as tpe:
            for genome_path, dna_fragments_path, dna_fragments in zip(
                    config['genome_path'], dna_fragments_paths,
                    dna_fragments_list):
                # fresh containers per query genome: sharing one dict across
                # genomes would make all cohort entries alias the last result
                results = {}
                futures = [
                    tpe.submit(rani.align_query_genome, config,
                               dna_fragments_path, dna_fragments, identifier)
                    for identifier in screened_ref_genome_ids
                ]
                for f in futures:
                    ref_genome_id, ani, conserved_dna = f.result()
                    results[ref_genome_id] = [(ani, conserved_dna)]

                # align reference genomes fragments to query genome and compute ANI/conserved DNA
                if args.bidirectional:
                    if args.verbose:
                        print('\nCompute reverse ANIs...')
                    futures = [
                        tpe.submit(rani.align_reference_genome, config,
                                   genome_path, identifier)
                        for identifier in screened_ref_genome_ids
                    ]
                    for f in futures:
                        ref_genome_id, ani, conserved_dna = f.result()
                        results[ref_genome_id].append((ani, conserved_dna))

                query_genomes.append(
                    ntpath.basename(genome_path).split(".", 1)[0])
                cohort_results.append(results)
    finally:
        # remove tmp dir, also when Mash or an alignment failed
        shutil.rmtree(str(config['tmp']))

    # filter results: one list of accepted reference ids per query genome
    directions = 2 if args.bidirectional else 1
    filtered_reference_ids_list = []
    for results in cohort_results:
        if args.unfiltered:
            filtered_reference_ids = list(results)
        else:
            filtered_reference_ids = [
                ref_genome_id for ref_genome_id, result in results.items()
                if all((ani >= config['ani'])
                       and (conserved_dna >= config['conserved_dna'])
                       for ani, conserved_dna in result[:directions])
            ]
        filtered_reference_ids_list.append(filtered_reference_ids)

    # Find common Reference genomes: ids accepted for every query genome,
    # kept in the order of the first query genome
    common_references = []
    if filtered_reference_ids_list:
        first_ids, *other_id_lists = filtered_reference_ids_list
        other_id_sets = [set(ids) for ids in other_id_lists]
        common_references = [
            ref_id for ref_id in first_ids
            if all(ref_id in ids for ids in other_id_sets)
        ]

    # Calculate results based on ANI and conDNA
    ref_id_values = {r: [1, 1, 1] for r in common_references}
    ref_id_values = algo.calculate(args, ref_id_values, common_references,
                                   cohort_results, query_genomes)

    common_references = sorted(common_references,
                               key=lambda k: ref_id_values[k][2],
                               reverse=True)

    # print results to STDOUT
    print(
        '#ID\tMash Distance\tANI\tCon. DNA\tANIconDNA-coefficient\tTaxonomy ID\tAssembly Status\tOrganism'
    )
    for ref_id in common_references:
        ref_genome = ref_genomes[ref_id]
        result = ref_id_values[ref_id]

        # NOTE(review): only the Mash distances of the first query genome are
        # reported here -- confirm that this is intended.
        print('%s\t%1.5f\t%2.2f\t%2.2f\t%2.2f\t%s\t%s\t%s' %
              (ref_id, mash_distances_list[0][ref_id], result[0] * 100,
               result[1] * 100, result[2] * 100, ref_genome['tax'],
               ref_genome['status'], ref_genome['name']))
예제 #4
0
def main():
    """Command line entry point: find suitable reference genomes for a query.

    Parses the command line, validates the database and genome paths,
    estimates genome distances via Mash, computes ANI / conserved DNA values
    against the screened candidates (optionally in both directions) and
    prints the references passing the thresholds to STDOUT as a tab
    separated table. The directory ``config['tmp']`` is removed once the
    alignments are done, also if Mash or an alignment fails.

    Exits via ``sys.exit`` if the database directory or the genome file is
    not readable or if the genome file is empty.
    """
    # parse options and arguments
    parser = argparse.ArgumentParser(
        prog='referenceseeker',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description='Rapid determination of appropriate reference genomes.',
        epilog=
        "Citation:\n%s\n\nGitHub:\nhttps://github.com/oschwengers/referenceseeker"
        % rc.CITATION,
        add_help=False)
    parser.add_argument('db',
                        metavar='<database>',
                        help='ReferenceSeeker database path')
    parser.add_argument('genome',
                        metavar='<genome>',
                        help='target draft genome in fasta format')
    group_workflow = parser.add_argument_group(
        'Filter options / thresholds',
        'These options control the filtering and alignment workflow.')
    group_workflow.add_argument(
        '--crg',
        '-r',
        action='store',
        type=int,
        default=100,
        help=
        'Max number of candidate reference genomes to pass kmer prefilter (default = 100)'
    )
    group_workflow.add_argument('--ani',
                                '-a',
                                action='store',
                                type=float,
                                default=0.95,
                                help='ANI threshold (default = 0.95)')
    group_workflow.add_argument(
        '--conserved-dna',
        '-c',
        action='store',
        dest='conserved_dna',
        type=float,
        default=0.69,
        help='Conserved DNA threshold (default = 0.69)')
    group_workflow.add_argument(
        '--unfiltered',
        '-u',
        action='store_true',
        help=
        'Set kmer prefilter to extremely conservative values and skip species level ANI cutoffs (ANI >= 0.95 and conserved DNA >= 0.69'
    )
    group_workflow.add_argument(
        '--bidirectional',
        '-b',
        action='store_true',
        help='Compute bidirectional ANI/conserved DNA values (default = False)'
    )

    group_runtime = parser.add_argument_group('Runtime & auxiliary options')
    group_runtime.add_argument('--help',
                               '-h',
                               action='help',
                               help='Show this help message and exit')
    group_runtime.add_argument('--version',
                               '-V',
                               action='version',
                               version='%(prog)s ' +
                               referenceseeker.__version__)
    group_runtime.add_argument('--verbose',
                               '-v',
                               action='store_true',
                               help='Print verbose information')
    group_runtime.add_argument(
        '--threads',
        '-t',
        action='store',
        type=int,
        default=mp.cpu_count(),
        help='Number of used threads (default = number of available CPU cores)'
    )
    args = parser.parse_args()

    # setup global configuration
    config = util.setup_configuration(args)
    util.test_binaries(config)

    # check parameters
    db_path = Path(args.db)
    if not os.access(str(db_path), os.R_OK):
        sys.exit('ERROR: database directory not readable!')
    db_path = db_path.resolve()
    config['db_path'] = db_path

    genome_path = Path(args.genome)
    if not os.access(str(genome_path), os.R_OK):
        sys.exit('ERROR: genome file not readable!')
    if genome_path.stat().st_size == 0:
        sys.exit('ERROR: genome file (%s) is empty!' % genome_path)
    genome_path = genome_path.resolve()
    config['genome_path'] = genome_path

    # print verbose information
    if args.verbose:
        print("ReferenceSeeker v%s" % referenceseeker.__version__)
        print('Options, parameters and arguments:')
        print("\tuse bundled binaries: %s" % str(config['bundled-binaries']))
        print("\tdb path: %s" % str(config['db_path']))
        print("\tgenome path: %s" % str(config['genome_path']))
        print("\ttmp path: %s" % str(config['tmp']))
        print("\tunfiltered: %s" % str(config['unfiltered']))
        print("\tbidirectional: %s" % str(config['bidirectional']))
        print("\tANI: %0.2f" % config['ani'])
        print("\tconserved DNA: %0.2f" % config['conserved_dna'])
        print("\t# CRG: %d" % config['crg'])
        print("\t# threads: %d" % config['threads'])

    try:
        # calculate genome distances via Mash
        if args.verbose:
            print('\nEstimate genome distances...')
        mash_output_path = config['tmp'].joinpath('mash.out')
        mash.run_mash(config, mash_output_path)

        # extract hits and store dist
        screened_ref_genome_ids, mash_distances = mash.parse_mash_results(
            config, mash_output_path)
        if args.verbose:
            print("\tscreened %d potential reference genome(s)" %
                  len(screened_ref_genome_ids))

        # reduce Mash output to best hits (args.crg)
        if len(screened_ref_genome_ids) > args.crg:
            if args.verbose:
                print("\treduce to best %d hits..." % args.crg)
            screened_ref_genome_ids = sorted(
                screened_ref_genome_ids,
                key=lambda k: mash_distances[k])[:args.crg]

        # get genomes from RefSeq by accessions; a set keeps the membership
        # test constant-time instead of scanning the id list per reference
        ref_genomes = util.read_reference_genomes(config)
        screened_id_set = set(screened_ref_genome_ids)
        candidate_ids = [k for k in ref_genomes if k in screened_id_set]

        # build dna fragments
        dna_fragments_path = config['tmp'].joinpath('dna-fragments.fasta')
        dna_fragments = util.build_dna_fragments(genome_path,
                                                 dna_fragments_path)

        # align query fragments to reference genomes and compute ANI/conserved DNA
        results = {}
        if args.verbose:
            print('\nCompute ANIs...')
        with cf.ThreadPoolExecutor(max_workers=args.threads) as tpe:
            futures = [
                tpe.submit(rani.align_query_genome, config, dna_fragments_path,
                           dna_fragments, ref_id) for ref_id in candidate_ids
            ]
            for f in futures:
                ref_genome_id, ani, conserved_dna = f.result()
                results[ref_genome_id] = [(ani, conserved_dna)]

        # align reference genomes fragments to query genome and compute ANI/conserved DNA
        if args.bidirectional:
            if args.verbose:
                print('\nCompute reverse ANIs...')
            with cf.ProcessPoolExecutor(args.threads) as ppe:
                futures = [
                    ppe.submit(rani.align_reference_genome, config,
                               genome_path, ref_id)
                    for ref_id in candidate_ids
                ]
                for f in futures:
                    ref_genome_id, ani, conserved_dna = f.result()
                    results[ref_genome_id].append((ani, conserved_dna))
    finally:
        # remove tmp dir, also when Mash or an alignment failed
        shutil.rmtree(str(config['tmp']))

    # filter and sort results
    filtered_reference_ids = []
    for ref_genome_id, result in results.items():
        if args.unfiltered:
            filtered_reference_ids.append(ref_genome_id)
        elif args.bidirectional:
            query_ref = result[0]
            ref_query = result[1]
            if ((query_ref[0] >= config['ani'])
                    and (query_ref[1] >= config['conserved_dna'])
                    and (ref_query[0] >= config['ani'])
                    and (ref_query[1] >= config['conserved_dna'])):
                filtered_reference_ids.append(ref_genome_id)
        else:
            (ani, conserved_dna) = result[0]
            if ((conserved_dna >= config['conserved_dna'])
                    and (ani >= config['ani'])):
                filtered_reference_ids.append(ref_genome_id)

    # sort and print results according to ANI * conserved DNA values
    if args.bidirectional:
        filtered_reference_ids = sorted(filtered_reference_ids,
                                        key=lambda k:
                                        (results[k][0][0] * results[k][0][1] *
                                         results[k][1][0] * results[k][1][1]),
                                        reverse=True)
        if args.verbose:
            print('')
        print(
            '#ID\tMash Distance\tQR ANI\tQR Con. DNA\tRQ ANI\tRQ Con. DNA\tTaxonomy ID\tAssembly Status\tOrganism'
        )
        for ref_id in filtered_reference_ids:  # print results to STDOUT
            ref_genome = ref_genomes[ref_id]
            result = results[ref_id]
            print(
                '%s\t%1.5f\t%2.2f\t%2.2f\t%2.2f\t%2.2f\t%s\t%s\t%s' %
                (ref_id, mash_distances[ref_id], result[0][0] * 100,
                 result[0][1] * 100, result[1][0] * 100, result[1][1] * 100,
                 ref_genome['tax'], ref_genome['status'], ref_genome['name']))
    else:
        filtered_reference_ids = sorted(filtered_reference_ids,
                                        key=lambda k:
                                        (results[k][0][0] * results[k][0][1]),
                                        reverse=True)
        if args.verbose:
            print('')
        print(
            '#ID\tMash Distance\tANI\tCon. DNA\tTaxonomy ID\tAssembly Status\tOrganism'
        )
        for ref_id in filtered_reference_ids:  # print results to STDOUT
            ref_genome = ref_genomes[ref_id]
            result = results[ref_id][0]
            print(
                '%s\t%1.5f\t%2.2f\t%2.2f\t%s\t%s\t%s' %
                (ref_id, mash_distances[ref_id], result[0] * 100,
                 result[1] * 100, ref_genome['tax'], ref_genome['status'],
                 ref_genome['name']))
예제 #5
0
def single(args, config):
    """Run the single genome analysis and print suitable reference genomes.

    The query genome path is validated, Mash is run to screen candidate
    reference genomes, ANI / conserved DNA values are computed against the
    candidates (in both directions if ``args.bidirectional`` is set) and the
    references passing the thresholds are printed to STDOUT as a tab
    separated table.

    :param args: parsed command line arguments; ``genome``, ``verbose``,
        ``threads``, ``bidirectional`` and ``unfiltered`` are read here and
        the object is forwarded to ``mash.run_mash``.
    :param config: a global config object encapsulating global runtime vars.
        ``config['genome_path']`` is set to the checked query path and the
        directory ``config['tmp']`` is removed once the alignments are done
        (also if they fail).

    Exits via ``sys.exit`` if the genome path fails validation.
    """
    try:
        config['genome_path'] = [util.check_path(args.genome)]
    except FileNotFoundError:
        sys.exit('ERROR: genome file %s is not readable!' % args.genome)
    except PermissionError:
        sys.exit('ERROR (permission): genome file %s is not accessible' %
                 args.genome)
    except OSError:
        sys.exit('ERROR: genome file %s is empty!' % args.genome)

    try:
        # mash out best hits; the genome path is handed over as a
        # one-element list and unwrapped again afterwards
        mash_output_path = config['tmp'].joinpath('mash.out')
        screened_ref_genome_ids, mash_distances = mash.run_mash(
            args, config, mash_output_path)
        config['genome_path'] = config['genome_path'][0]

        # get genomes from RefSeq by accessions
        ref_genomes = util.read_reference_genomes(config)
        candidate_ids = [
            k for k in ref_genomes if k in screened_ref_genome_ids
        ]

        # build dna fragments
        dna_fragments_path = config['tmp'].joinpath('dna-fragments.fasta')
        dna_fragments = util.build_dna_fragments(config['genome_path'],
                                                 dna_fragments_path)

        # align query fragments to reference genomes and compute ANI/conserved DNA
        results = {}
        if args.verbose:
            print('\nCompute ANIs...')
        with cf.ThreadPoolExecutor(max_workers=args.threads) as tpe:
            futures = [
                tpe.submit(rani.align_query_genome, config, dna_fragments_path,
                           dna_fragments, identifier)
                for identifier in candidate_ids
            ]
            for f in futures:
                ref_genome_id, ani, conserved_dna = f.result()
                results[ref_genome_id] = [(ani, conserved_dna)]

            # align reference genomes fragments to query genome and compute ANI/conserved DNA
            if args.bidirectional:
                if args.verbose:
                    print('\nCompute reverse ANIs...')
                futures = [
                    tpe.submit(rani.align_reference_genome, config,
                               config['genome_path'], identifier)
                    for identifier in candidate_ids
                ]
                for f in futures:
                    ref_genome_id, ani, conserved_dna = f.result()
                    results[ref_genome_id].append((ani, conserved_dna))
    finally:
        # remove tmp dir, also when Mash or an alignment failed
        shutil.rmtree(str(config['tmp']))

    # filter and sort results
    filtered_reference_ids = []
    for ref_genome_id, result in results.items():
        if args.unfiltered:
            filtered_reference_ids.append(ref_genome_id)
        elif args.bidirectional:
            query_ref = result[0]
            ref_query = result[1]
            if ((query_ref[0] >= config['ani'])
                    and (query_ref[1] >= config['conserved_dna'])
                    and (ref_query[0] >= config['ani'])
                    and (ref_query[1] >= config['conserved_dna'])):
                filtered_reference_ids.append(ref_genome_id)
        else:
            (ani, conserved_dna) = result[0]
            if (conserved_dna >=
                    config['conserved_dna']) and (ani >= config['ani']):
                filtered_reference_ids.append(ref_genome_id)

    # sort and print results according to ANI * conserved DNA values
    if args.bidirectional:
        filtered_reference_ids = sorted(filtered_reference_ids,
                                        key=lambda k:
                                        (results[k][0][0] * results[k][0][1] *
                                         results[k][1][0] * results[k][1][1]),
                                        reverse=True)
        if args.verbose:
            print('')
        print(
            '#ID\tMash Distance\tQR ANI\tQR Con. DNA\tRQ ANI\tRQ Con. DNA\tTaxonomy ID\tAssembly Status\tOrganism'
        )
        for ref_id in filtered_reference_ids:  # print results to STDOUT
            ref_genome = ref_genomes[ref_id]
            result = results[ref_id]
            print(
                '%s\t%1.5f\t%2.2f\t%2.2f\t%2.2f\t%2.2f\t%s\t%s\t%s' %
                (ref_id, mash_distances[ref_id], result[0][0] * 100,
                 result[0][1] * 100, result[1][0] * 100, result[1][1] * 100,
                 ref_genome['tax'], ref_genome['status'], ref_genome['name']))
    else:
        filtered_reference_ids = sorted(filtered_reference_ids,
                                        key=lambda k:
                                        (results[k][0][0] * results[k][0][1]),
                                        reverse=True)
        if args.verbose:
            print('')
        print(
            '#ID\tMash Distance\tANI\tCon. DNA\tTaxonomy ID\tAssembly Status\tOrganism'
        )
        for ref_id in filtered_reference_ids:  # print results to STDOUT
            ref_genome = ref_genomes[ref_id]
            result = results[ref_id][0]
            print(
                '%s\t%1.5f\t%2.2f\t%2.2f\t%s\t%s\t%s' %
                (ref_id, mash_distances[ref_id], result[0] * 100,
                 result[1] * 100, ref_genome['tax'], ref_genome['status'],
                 ref_genome['name']))