示例#1
0
    def test_run_all_noncoding(self):
        '''Run RefPreparer with no metadata files and all_coding='no', then compare every output file'''
        input_names = (
            'ref_preparer_test_run.in.1.fa',
            'ref_preparer_test_run.in.2.fa',
            'ref_preparer_test_run.in.3.fa',
        )
        fasta_in = [os.path.join(data_dir, name) for name in input_names]

        tmp_out = 'tmp.ref_preparer_test_run'
        expected_outdir = os.path.join(data_dir, 'ref_preparer_test_run_all_noncoding.out')

        extern_progs = external_progs.ExternalProgs()
        refprep = ref_preparer.RefPreparer(
            fasta_in,
            extern_progs,
            all_coding='no',
            genetic_code=1,
        )
        refprep.run(tmp_out)

        # Each file must be byte-identical to the stored expected output.
        for basename in (
            '00.auto_metadata.tsv',
            '01.filter.check_metadata.tsv',
            '01.filter.check_genes.log',
            '01.filter.check_noncoding.log',
            '01.filter.check_metadata.log',
            '02.cdhit.all.fa',
            '02.cdhit.clusters.tsv',
            '02.cdhit.gene.fa',
            '02.cdhit.gene.varonly.fa',
            '02.cdhit.noncoding.fa',
            '02.cdhit.noncoding.varonly.fa',
        ):
            self.assertTrue(
                filecmp.cmp(
                    os.path.join(expected_outdir, basename),
                    os.path.join(tmp_out, basename),
                    shallow=False,
                )
            )

        common.rmtree(tmp_out)
示例#2
0
文件: cdhit.py 项目: turrisita/ariba
    def __init__(
        self,
        infile,
        seq_identity_threshold=0.9,
        threads=1,
        length_diff_cutoff=0.0,
        memory_limit=None,
        verbose=False,
        min_cluster_number=0,
    ):
        '''Check the inputs and record the settings for a cd-hit run.

        Raises Error if infile does not exist, or if memory_limit is given
        and is negative. The input path is stored as an absolute path, and
        the 'cdhit' executable is looked up through
        external_progs.ExternalProgs(fail_on_error=True, using_spades=False).
        '''
        if not os.path.exists(infile):
            raise Error('File not found: "' + infile + '". Cannot continue')

        # None means no limit was requested; only an explicit negative value is rejected.
        negative_memory_limit = memory_limit is not None and memory_limit < 0
        if negative_memory_limit:
            raise Error('Input parameter cdhit_max_memory is set to an invalid value. Cannot continue')

        self.infile = os.path.abspath(infile)

        self.seq_identity_threshold = seq_identity_threshold
        self.length_diff_cutoff = length_diff_cutoff
        self.min_cluster_number = min_cluster_number
        self.memory_limit = memory_limit
        self.threads = threads
        self.verbose = verbose

        progs = external_progs.ExternalProgs(fail_on_error=True, using_spades=False)
        self.cd_hit_est = progs.exe('cdhit')
示例#3
0
    def __init__(
        self,
        reads1,
        reads2,
        ref_fasta,
        ref_fastas,
        working_dir,
        final_assembly_fa,
        final_assembly_bam,
        log_fh,
        all_reference_fasta,
        contig_name_prefix='ctg',
        assembler='fermilite',
        max_insert=1000,
        min_scaff_depth=10,
        min_scaff_length=50,
        nucmer_min_id=90,
        nucmer_min_len=20,
        nucmer_breaklen=200,
        extern_progs=None,
        clean=True,
    ):
        '''Store the assembly settings and create the working directory.

        The path arguments (reads1, reads2, ref_fasta, ref_fastas,
        working_dir, final_assembly_fa, final_assembly_bam and
        all_reference_fasta) are converted to absolute paths. The remaining
        arguments are stored unchanged on the instance. If extern_progs is
        None, a new external_progs.ExternalProgs() is created.

        Raises Error if working_dir cannot be created with os.mkdir (for
        example because it already exists or its parent is missing).
        '''
        self.reads1 = os.path.abspath(reads1)
        self.reads2 = os.path.abspath(reads2)
        self.ref_fasta = os.path.abspath(ref_fasta)
        self.ref_fastas = os.path.abspath(ref_fastas)
        self.working_dir = os.path.abspath(working_dir)
        self.final_assembly_fa = os.path.abspath(final_assembly_fa)
        self.final_assembly_bam = os.path.abspath(final_assembly_bam)
        self.log_fh = log_fh
        self.all_reference_fasta = os.path.abspath(all_reference_fasta)
        self.contig_name_prefix = contig_name_prefix

        self.ref_seq_name = None
        self.assembler = assembler
        self.max_insert = max_insert
        self.min_scaff_depth = min_scaff_depth
        self.min_scaff_length = min_scaff_length
        self.nucmer_min_id = nucmer_min_id
        self.nucmer_min_len = nucmer_min_len
        self.nucmer_breaklen = nucmer_breaklen
        self.clean = clean

        if extern_progs is None:
            self.extern_progs = external_progs.ExternalProgs()
        else:
            self.extern_progs = extern_progs

        # Only filesystem failures are translated into Error; the original
        # OSError is chained so the real reason stays visible in the
        # traceback, and KeyboardInterrupt/SystemExit are no longer masked.
        try:
            os.mkdir(self.working_dir)
        except OSError as err:
            raise Error('Error mkdir ' + self.working_dir) from err

        self.assembler_dir = os.path.join(self.working_dir, 'Assemble')
        self.all_assembly_contigs_fa = os.path.join(self.working_dir,
                                                    'debug_all_contigs.fa')
        self.best_assembly_fa = os.path.join(self.working_dir,
                                             'debug_best_assembly.fa')
示例#4
0
def get_all_versions(raise_error=True, using_spades=True):
    '''Collect a version report for ARIBA, its external programs and Python packages.

    Args:
        raise_error: if true and any dependency check fails, the report is
            printed to stderr and the process exits via sys.exit(1).
        using_spades: passed through to external_progs.ExternalProgs.

    Returns:
        A tuple (extern_progs, report_lines): the ExternalProgs instance that
        was created, and the report as a list of strings.
    '''
    # Imported locally so this function does not depend on a file-level import.
    import importlib

    extern_progs = external_progs.ExternalProgs(fail_on_error=False,
                                                using_spades=using_spades)

    report_lines = [
        'ARIBA version: ' + ariba_version,
        '\nExternal dependencies:',
        '\n'.join(extern_progs.version_report),
        '\nExternal dependencies OK: ' + str(extern_progs.all_deps_ok),
        '\nPython version:',
        str(sys.version),
        '\nPython packages:',
    ]

    python_packages_ok = True

    for package in [
            'ariba', 'bs4', 'dendropy', 'pyfastaq', 'pymummer', 'pysam'
    ]:
        # importlib is used instead of exec('import ...') + eval(...): names
        # bound by exec() inside a function are not reliably visible to a
        # later eval() (they are lost under the Python 3.13 locals()
        # semantics), which would report every package as NOT_FOUND.
        try:
            module = importlib.import_module(package)
            version = module.__version__
            path = module.__file__
        except Exception:
            # Import failure or a missing __version__/__file__ attribute.
            version = 'NOT_FOUND'
            path = 'NOT_FOUND'
            python_packages_ok = False

        if version != 'NOT_FOUND':
            if package in package_min_versions and LooseVersion(
                    version) < package_min_versions[package]:
                version += '... THIS IS TOO LOW. Needs>=' + package_min_versions[
                    package]
                python_packages_ok = False
            elif package in package_max_versions and LooseVersion(
                    version) > package_max_versions[package]:
                version += '...THIS IS TOO HIGH. Needs <=' + package_max_versions[
                    package]
                python_packages_ok = False

        report_lines.append(package + '\t' + version + '\t' + path)

    all_ok = extern_progs.all_deps_ok and python_packages_ok

    report_lines.extend([
        '\nPython packages OK: ' + str(python_packages_ok),
        '\nEverything looks OK: ' + str(all_ok),
    ])

    if raise_error and not all_ok:
        print(*report_lines, sep='\n', file=sys.stderr)
        print('Some dependencies not satisfied. Cannot continue.',
              file=sys.stderr)
        sys.exit(1)

    return extern_progs, report_lines
示例#5
0
    def __init__(
        self,
        readstore_obj,
        references_fa,
        cluster_name,
        log_fh,
        extern_progs=None,
    ):
        '''Keep references to the read store, references file, cluster name and log handle.

        When extern_progs is None a new external_progs.ExternalProgs() is
        created; otherwise the supplied object is used as is.
        '''
        self.readstore = readstore_obj
        self.references_fa = references_fa
        self.cluster_name = cluster_name
        self.log_fh = log_fh
        self.extern_progs = (
            external_progs.ExternalProgs() if extern_progs is None else extern_progs
        )
示例#6
0
    def __init__(
        self,
        infile,
        seq_identity_threshold=0.9,
        threads=1,
        length_diff_cutoff=0.0,
        verbose=False,
        min_cluster_number=0,
    ):
        '''Check that infile exists and record the settings for a cd-hit run.

        Raises Error if infile does not exist. The input path is stored as
        an absolute path, and the 'cdhit' executable is looked up through
        external_progs.ExternalProgs(fail_on_error=True).
        '''
        input_missing = not os.path.exists(infile)
        if input_missing:
            raise Error('File not found: "' + infile + '". Cannot continue')

        self.infile = os.path.abspath(infile)

        self.seq_identity_threshold = seq_identity_threshold
        self.length_diff_cutoff = length_diff_cutoff
        self.min_cluster_number = min_cluster_number
        self.threads = threads
        self.verbose = verbose

        progs = external_progs.ExternalProgs(fail_on_error=True)
        self.cd_hit_est = progs.exe('cdhit')
示例#7
0
    def test_run(self):
        '''Run RefPreparer with metadata TSV files and compare every output file'''
        fasta_names = (
            'ref_preparer_test_run.in.1.fa',
            'ref_preparer_test_run.in.2.fa',
            'ref_preparer_test_run.in.3.fa',
        )
        tsv_names = (
            'ref_preparer_test_run.in.1.tsv',
            'ref_preparer_test_run.in.2.tsv',
        )
        fasta_in = [os.path.join(data_dir, name) for name in fasta_names]
        tsv_in = [os.path.join(data_dir, name) for name in tsv_names]

        tmp_out = 'tmp.ref_preparer_test_run'
        expected_outdir = os.path.join(data_dir, 'ref_preparer_test_run.out')

        extern_progs = external_progs.ExternalProgs()
        refprep = ref_preparer.RefPreparer(
            fasta_in,
            extern_progs,
            metadata_tsv_files=tsv_in,
            genetic_code=1,
        )
        refprep.run(tmp_out)

        # Each file must be byte-identical to the stored expected output.
        for basename in (
            '01.filter.check_metadata.tsv',
            '01.filter.check_genes.log',
            '01.filter.check_metadata.log',
            '02.cdhit.all.fa',
            '02.cdhit.clusters.tsv',
            '02.cdhit.gene.fa',
            '02.cdhit.gene.varonly.fa',
            '02.cdhit.noncoding.fa',
            '02.cdhit.noncoding.varonly.fa',
        ):
            self.assertTrue(
                filecmp.cmp(
                    os.path.join(expected_outdir, basename),
                    os.path.join(tmp_out, basename),
                    shallow=False,
                )
            )

        shutil.rmtree(tmp_out)
示例#8
0
    def test_run_noncoding_checks(self):
        '''Run RefPreparer with min/max noncoding length limits and compare every output file'''
        fasta_in = [os.path.join(data_dir, 'ref_preparer_test_run.in.4.fa')]
        tsv_in = [os.path.join(data_dir, 'ref_preparer_test_run.in.4.tsv')]

        tmp_out = 'tmp.ref_preparer_test_run_noncoding_checks'
        expected_outdir = os.path.join(data_dir, 'ref_preparer_test_run_noncoding_checks.out')

        extern_progs = external_progs.ExternalProgs()
        refprep = ref_preparer.RefPreparer(
            fasta_in,
            extern_progs,
            min_noncoding_length=6,
            max_noncoding_length=20,
            metadata_tsv_files=tsv_in,
            genetic_code=1,
        )
        refprep.run(tmp_out)

        # Each file must be byte-identical to the stored expected output.
        for basename in (
            '01.filter.check_metadata.tsv',
            '01.filter.check_genes.log',
            '01.filter.check_noncoding.log',
            '01.filter.check_metadata.log',
            '02.cdhit.all.fa',
            '02.cdhit.clusters.tsv',
            '02.cdhit.gene.fa',
            '02.cdhit.gene.varonly.fa',
            '02.cdhit.noncoding.fa',
            '02.cdhit.noncoding.varonly.fa',
        ):
            self.assertTrue(
                filecmp.cmp(
                    os.path.join(expected_outdir, basename),
                    os.path.join(tmp_out, basename),
                    shallow=False,
                )
            )

        common.rmtree(tmp_out)
示例#9
0
import unittest
import shutil
import os
import pickle
import pyfastaq
import filecmp
from ariba import clusters, common, external_progs, histogram, sequence_metadata

# Directory containing the imported ariba.clusters module; the test data
# is read from its 'tests/data' subdirectory.
modules_dir = os.path.dirname(os.path.abspath(clusters.__file__))
data_dir = os.path.join(modules_dir, 'tests', 'data')
# Constructed once, when this test module is imported.
extern_progs = external_progs.ExternalProgs()


def file_to_list(infile):
    '''Return all lines of infile as a list, exactly as readlines() yields them.

    The file is opened with pyfastaq.utils.open_file_read and is always
    closed again with pyfastaq.utils.close, even if reading fails.
    '''
    f = pyfastaq.utils.open_file_read(infile)
    try:
        # readlines() already returns a new list; no extra copy is needed.
        return f.readlines()
    finally:
        pyfastaq.utils.close(f)


class TestClusters(unittest.TestCase):
    def setUp(self):
        self.cluster_dir = 'tmp.Cluster'
        self.refdata_dir = 'tmp.RefData'
        os.mkdir(self.refdata_dir)
        shutil.copyfile(os.path.join(data_dir, 'clusters_test_dummy_db.fa'),
                        os.path.join(self.refdata_dir, '02.cdhit.all.fa'))
        shutil.copyfile(
            os.path.join(data_dir, 'clusters_test_dummy_db.tsv'),
            os.path.join(self.refdata_dir, '01.filter.check_metadata.tsv'))
        with open(os.path.join(self.refdata_dir, '00.info.txt'), 'w') as f:
示例#10
0
 def test_external_progs_ok(self):
     '''Construct ExternalProgs with verbose=True; the test passes if construction does not raise'''
     external_progs.ExternalProgs(verbose=True)
示例#11
0
文件: cluster.py 项目: jis958/ariba
    def __init__(
            self,
            root_dir,
            name,
            refdata,
            all_ref_seqs_fasta=None,
            total_reads=None,
            total_reads_bases=None,
            fail_file=None,
            read_store=None,
            reference_names=None,
            logfile=None,
            assembly_coverage=50,
            assembly_kmer=21,
            assembler='fermilite',
            max_insert=1000,
            min_scaff_depth=10,
            nucmer_min_id=90,
            nucmer_min_len=20,
            nucmer_breaklen=200,
            reads_insert=500,
            sspace_k=20,
            sspace_sd=0.4,
            threads=1,
            assembled_threshold=0.95,
            min_var_read_depth=5,
            min_second_var_read_depth=2,
            max_allele_freq=0.90,
            unique_threshold=0.03,
            max_gene_nt_extend=30,
            spades_mode="rna",  #["rna","wgs"]
            spades_options=None,
            clean=True,
            extern_progs=None,
            random_seed=42,
            threads_total=1):
        '''Record the settings for one cluster and derive its file paths under root_dir.

        root_dir is made absolute and all per-cluster file names are built
        from it. If root_dir already exists, self._input_files_exist() is
        called once the input file paths have been set.

        Side effects: self._receive_signal is installed as the handler for
        SIGABRT, SIGINT, SIGSEGV and SIGTERM. When the name `unittest`
        visible to this method is falsy, self._atexit is registered with
        atexit and log_fh is left as None; otherwise log_fh is sys.stdout.
        '''
        def _in_root(filename):
            # Path of a file that lives directly inside this cluster's directory.
            return os.path.join(self.root_dir, filename)

        self.root_dir = os.path.abspath(root_dir)
        self.read_store = read_store
        self.refdata = refdata
        self.name = name
        self.fail_file = fail_file
        self.reference_fa = _in_root('reference.fa')
        self.reference_names = reference_names
        self.all_reads1 = _in_root('reads_1.fq')
        self.all_reads2 = _in_root('reads_2.fq')
        self.references_fa = _in_root('references.fa')

        # Everything above is set before this call, as in the original ordering.
        if os.path.exists(self.root_dir):
            self._input_files_exist()

        # Read counts and logging.
        self.total_reads = total_reads
        self.total_reads_bases = total_reads_bases
        self.logfile = logfile

        # Assembly settings.
        self.assembly_coverage = assembly_coverage
        self.assembly_kmer = assembly_kmer
        self.assembler = assembler
        self.sspace_k = sspace_k
        self.sspace_sd = sspace_sd
        self.reads_insert = reads_insert
        self.spades_mode = spades_mode
        self.spades_options = spades_options
        self.reads_for_assembly1 = _in_root('reads_for_assembly_1.fq')
        self.reads_for_assembly2 = _in_root('reads_for_assembly_2.fq')
        self.ref_sequence = None
        self.max_insert = max_insert
        self.min_scaff_depth = min_scaff_depth

        # nucmer settings.
        self.nucmer_min_id = nucmer_min_id
        self.nucmer_min_len = nucmer_min_len
        self.nucmer_breaklen = nucmer_breaklen

        # Variant-calling thresholds.
        self.min_var_read_depth = min_var_read_depth
        self.min_second_var_read_depth = min_second_var_read_depth
        self.max_allele_freq = max_allele_freq

        self.threads = threads
        self.assembled_threshold = assembled_threshold
        self.unique_threshold = unique_threshold
        self.max_gene_nt_extend = max_gene_nt_extend
        self.status_flag = flag.Flag()
        self.clean = clean
        self.threads_total = threads_total
        self.remaining_clusters = None

        # Output locations inside root_dir.
        self.assembly_dir = _in_root('Assembly')
        self.final_assembly_fa = _in_root('assembly.fa')
        self.final_assembly_bam = _in_root('assembly.reads_mapped.bam')
        self.final_assembly_read_depths = _in_root('assembly.reads_mapped.bam.read_depths.gz')
        self.final_assembly_vcf = _in_root('assembly.reads_mapped.bam.vcf')
        self.samtools_vars_prefix = self.final_assembly_bam
        self.assembly_compare = None
        self.variants_from_samtools = {}
        self.assembly_compare_prefix = _in_root('assembly_compare')

        self.mummer_variants = {}
        self.variant_depths = {}
        self.percent_identities = {}

        # self.log_fh is normally set at the start of run(), and many other
        # methods use it. Unit tests do not call run(), so they need it set
        # here. Setting it here in a real ARIBA run breaks multiprocessing
        # with: TypeError: cannot serialize '_io.TextIOWrapper' object.
        # Hence the two cases below.
        if not unittest:
            atexit.register(self._atexit)
            self.log_fh = None
        else:
            self.log_fh = sys.stdout

        self.extern_progs = (
            external_progs.ExternalProgs() if extern_progs is None else extern_progs
        )

        self.all_refs_fasta = (
            self.references_fa
            if all_ref_seqs_fasta is None
            else os.path.abspath(all_ref_seqs_fasta)
        )

        self.random_seed = random_seed

        for signum in (signal.SIGABRT, signal.SIGINT, signal.SIGSEGV, signal.SIGTERM):
            signal.signal(signum, self._receive_signal)
示例#12
0
import unittest
import sys
import os
import shutil
import filecmp
import pyfastaq
from ariba import assembly
from ariba import external_progs

# Directory containing the imported ariba.assembly module; the test data
# is read from its 'tests/data' subdirectory.
modules_dir = os.path.dirname(os.path.abspath(assembly.__file__))
data_dir = os.path.join(modules_dir, 'tests', 'data')
# Constructed once, when this test module is imported.
extern_progs = external_progs.ExternalProgs(using_spades=True)


class TestAssembly(unittest.TestCase):
    def test_run_fermilite(self):
        '''_run_fermilite should return 0 and write the expected contigs and log files'''
        reads = os.path.join(data_dir, 'assembly_run_fermilite.reads.fq')
        tmp_fa = 'tmp.test_run_fermilite.fa'
        tmp_log = 'tmp.test_run_fermilite.log'
        # (expected file, file written by the call) pairs, checked in this order.
        comparisons = [
            (os.path.join(data_dir, 'assembly_run_fermilite.expected.fa'), tmp_fa),
            (os.path.join(data_dir, 'assembly_run_fermilite.expected.log'), tmp_log),
        ]

        got = assembly.Assembly._run_fermilite(reads, tmp_fa, tmp_log, 'contig')
        self.assertEqual(0, got)

        for expected_path, written_path in comparisons:
            self.assertTrue(filecmp.cmp(expected_path, written_path, shallow=False))

        # Temporary files are removed only after every comparison has passed.
        for _, written_path in comparisons:
            os.unlink(written_path)
示例#13
0
 def setUp(self):
     '''Create a fresh ExternalProgs instance for each test'''
     progs = external_progs.ExternalProgs()
     self.external_progs = progs