def test_run_all_noncoding(self):
    '''test run with no metadata input, all sequences are noncoding'''
    # Three input fastas and no metadata TSVs: the preparer must
    # auto-generate metadata treating every sequence as noncoding.
    fasta_in = [
        os.path.join(data_dir, 'ref_preparer_test_run.in.' + str(i) + '.fa')
        for i in (1, 2, 3)
    ]
    extern_progs = external_progs.ExternalProgs()
    refprep = ref_preparer.RefPreparer(
        fasta_in, extern_progs, all_coding='no', genetic_code=1)
    tmp_out = 'tmp.ref_preparer_test_run'
    refprep.run(tmp_out)

    expected_outdir = os.path.join(
        data_dir, 'ref_preparer_test_run_all_noncoding.out')
    # Every generated file must byte-for-byte match the expected one.
    for filename in (
        '00.auto_metadata.tsv',
        '01.filter.check_metadata.tsv',
        '01.filter.check_genes.log',
        '01.filter.check_noncoding.log',
        '01.filter.check_metadata.log',
        '02.cdhit.all.fa',
        '02.cdhit.clusters.tsv',
        '02.cdhit.gene.fa',
        '02.cdhit.gene.varonly.fa',
        '02.cdhit.noncoding.fa',
        '02.cdhit.noncoding.varonly.fa',
    ):
        self.assertTrue(filecmp.cmp(
            os.path.join(expected_outdir, filename),
            os.path.join(tmp_out, filename),
            shallow=False))

    common.rmtree(tmp_out)
def __init__(self, infile, seq_identity_threshold=0.9, threads=1,
             length_diff_cutoff=0.0, memory_limit=None, verbose=False,
             min_cluster_number=0):
    '''Validate inputs and store cd-hit-est settings.

    Raises Error if infile does not exist or memory_limit is negative.
    The cdhit executable path is resolved immediately via ExternalProgs
    (constructed with fail_on_error=True).
    '''
    # Validate everything before storing any state.
    if not os.path.exists(infile):
        raise Error('File not found: "' + infile + '". Cannot continue')
    # None means "use cd-hit's own default"; a negative limit is invalid.
    if memory_limit is not None and memory_limit < 0:
        raise Error('Input parameter cdhit_max_memory is set to an invalid value. Cannot continue')

    self.infile = os.path.abspath(infile)
    self.seq_identity_threshold = seq_identity_threshold
    self.threads = threads
    self.length_diff_cutoff = length_diff_cutoff
    self.memory_limit = memory_limit
    self.verbose = verbose
    self.min_cluster_number = min_cluster_number
    # Resolve the executable now so a missing cdhit fails fast.
    self.cd_hit_est = external_progs.ExternalProgs(
        fail_on_error=True, using_spades=False).exe('cdhit')
def __init__(self, reads1, reads2, ref_fasta, ref_fastas, working_dir,
             final_assembly_fa, final_assembly_bam, log_fh,
             all_reference_fasta, contig_name_prefix='ctg',
             assembler='fermilite', max_insert=1000, min_scaff_depth=10,
             min_scaff_length=50, nucmer_min_id=90, nucmer_min_len=20,
             nucmer_breaklen=200, extern_progs=None, clean=True):
    '''Store absolute paths and options for one assembly run, and create
    the working directory.

    Raises Error if working_dir cannot be created (e.g. already exists).
    '''
    self.reads1 = os.path.abspath(reads1)
    self.reads2 = os.path.abspath(reads2)
    self.ref_fasta = os.path.abspath(ref_fasta)
    self.ref_fastas = os.path.abspath(ref_fastas)
    self.working_dir = os.path.abspath(working_dir)
    self.final_assembly_fa = os.path.abspath(final_assembly_fa)
    self.final_assembly_bam = os.path.abspath(final_assembly_bam)
    self.log_fh = log_fh
    self.all_reference_fasta = os.path.abspath(all_reference_fasta)
    self.contig_name_prefix = contig_name_prefix
    self.ref_seq_name = None
    self.assembler = assembler
    self.max_insert = max_insert
    self.min_scaff_depth = min_scaff_depth
    self.min_scaff_length = min_scaff_length
    self.nucmer_min_id = nucmer_min_id
    self.nucmer_min_len = nucmer_min_len
    self.nucmer_breaklen = nucmer_breaklen
    self.clean = clean

    if extern_progs is None:
        self.extern_progs = external_progs.ExternalProgs()
    else:
        self.extern_progs = extern_progs

    try:
        os.mkdir(self.working_dir)
    except OSError as err:
        # Was a bare "except:", which swallowed even KeyboardInterrupt and
        # lost the original cause. Catch the actual mkdir failure and chain it.
        raise Error('Error mkdir ' + self.working_dir) from err

    self.assembler_dir = os.path.join(self.working_dir, 'Assemble')
    self.all_assembly_contigs_fa = os.path.join(self.working_dir, 'debug_all_contigs.fa')
    self.best_assembly_fa = os.path.join(self.working_dir, 'debug_best_assembly.fa')
    # NOTE: removed a redundant re-assignment of self.final_assembly_fa here;
    # it was already set to the same value above.
def get_all_versions(raise_error=True, using_spades=True):
    '''Gather version information for ARIBA, its external program
    dependencies, Python itself and required Python packages.

    Returns a tuple (extern_progs, report_lines). If raise_error is True and
    any dependency check fails, prints the report to stderr and exits with
    status 1.
    '''
    import importlib

    extern_progs = external_progs.ExternalProgs(fail_on_error=False, using_spades=using_spades)
    report_lines = [
        'ARIBA version: ' + ariba_version,
        '\nExternal dependencies:',
        '\n'.join(extern_progs.version_report),
        '\nExternal dependencies OK: ' + str(extern_progs.all_deps_ok),
        '\nPython version:',
        str(sys.version),
        '\nPython packages:',
    ]
    python_packages_ok = True

    for package in ['ariba', 'bs4', 'dendropy', 'pyfastaq', 'pymummer', 'pysam']:
        try:
            # importlib instead of exec('import ...')/eval(...): same effect,
            # no dynamic code execution, and easier to reason about.
            module = importlib.import_module(package)
            version = module.__version__
            path = module.__file__
        except Exception:
            # Narrowed from a bare "except:" so SystemExit/KeyboardInterrupt
            # still propagate; a missing module or __version__ lands here.
            version = 'NOT_FOUND'
            path = 'NOT_FOUND'
            python_packages_ok = False

        if version != 'NOT_FOUND':
            if package in package_min_versions and LooseVersion(version) < package_min_versions[package]:
                version += '... THIS IS TOO LOW. Needs>=' + package_min_versions[package]
                python_packages_ok = False
            elif package in package_max_versions and LooseVersion(version) > package_max_versions[package]:
                version += '...THIS IS TOO HIGH. Needs <=' + package_max_versions[package]
                python_packages_ok = False

        report_lines.append(package + '\t' + version + '\t' + path)

    all_ok = extern_progs.all_deps_ok and python_packages_ok
    report_lines.extend([
        '\nPython packages OK: ' + str(python_packages_ok),
        '\nEverything looks OK: ' + str(all_ok),
    ])

    if raise_error and not all_ok:
        print(*report_lines, sep='\n', file=sys.stderr)
        print('Some dependencies not satisfied. Cannot continue.', file=sys.stderr)
        sys.exit(1)

    return extern_progs, report_lines
def __init__(self, readstore_obj, references_fa, cluster_name, log_fh,
             extern_progs=None):
    '''Hold the read store, reference fasta, cluster name and log handle
    for one cluster, plus an ExternalProgs lookup.'''
    self.readstore = readstore_obj
    self.references_fa = references_fa
    self.cluster_name = cluster_name
    self.log_fh = log_fh
    # Build a default program lookup unless the caller supplied one.
    self.extern_progs = (external_progs.ExternalProgs()
                         if extern_progs is None else extern_progs)
def __init__(self, infile, seq_identity_threshold=0.9, threads=1,
             length_diff_cutoff=0.0, verbose=False, min_cluster_number=0):
    '''Validate the input file and store cd-hit-est settings.

    Raises Error if infile does not exist. The cdhit executable path is
    resolved immediately (ExternalProgs with fail_on_error=True).
    '''
    # Check the input exists before storing any state.
    if not os.path.exists(infile):
        raise Error('File not found: "' + infile + '". Cannot continue')

    self.infile = os.path.abspath(infile)
    self.seq_identity_threshold = seq_identity_threshold
    self.threads = threads
    self.length_diff_cutoff = length_diff_cutoff
    self.verbose = verbose
    self.min_cluster_number = min_cluster_number
    # Resolve the executable now so a missing cdhit fails fast.
    self.cd_hit_est = external_progs.ExternalProgs(fail_on_error=True).exe('cdhit')
def test_run(self):
    '''test run'''
    # Three input fastas plus two metadata TSVs; output files must match
    # the expected directory byte-for-byte.
    fasta_in = [
        os.path.join(data_dir, 'ref_preparer_test_run.in.1.fa'),
        os.path.join(data_dir, 'ref_preparer_test_run.in.2.fa'),
        os.path.join(data_dir, 'ref_preparer_test_run.in.3.fa'),
    ]
    tsv_in = [
        os.path.join(data_dir, 'ref_preparer_test_run.in.1.tsv'),
        os.path.join(data_dir, 'ref_preparer_test_run.in.2.tsv'),
    ]
    extern_progs = external_progs.ExternalProgs()
    refprep = ref_preparer.RefPreparer(fasta_in, extern_progs, metadata_tsv_files=tsv_in, genetic_code=1)
    tmp_out = 'tmp.ref_preparer_test_run'
    refprep.run(tmp_out)
    expected_outdir = os.path.join(data_dir, 'ref_preparer_test_run.out')
    test_files = [
        '01.filter.check_metadata.tsv',
        '01.filter.check_genes.log',
        '01.filter.check_metadata.log',
        '02.cdhit.all.fa',
        '02.cdhit.clusters.tsv',
        '02.cdhit.gene.fa',
        '02.cdhit.gene.varonly.fa',
        '02.cdhit.noncoding.fa',
        '02.cdhit.noncoding.varonly.fa',
    ]
    for filename in test_files:
        expected = os.path.join(expected_outdir, filename)
        got = os.path.join(tmp_out, filename)
        self.assertTrue(filecmp.cmp(expected, got, shallow=False))
    # Use common.rmtree, as the sibling tests in this file do, instead of
    # shutil.rmtree — consistent cleanup behaviour across tests.
    common.rmtree(tmp_out)
def test_run_noncoding_checks(self):
    '''test run with noncoding sequences that are outside of the allowed size range'''
    fasta_in = [os.path.join(data_dir, 'ref_preparer_test_run.in.4.fa')]
    tsv_in = [os.path.join(data_dir, 'ref_preparer_test_run.in.4.tsv')]
    extern_progs = external_progs.ExternalProgs()
    # Length limits chosen so some input sequences fall outside the range.
    refprep = ref_preparer.RefPreparer(
        fasta_in,
        extern_progs,
        min_noncoding_length=6,
        max_noncoding_length=20,
        metadata_tsv_files=tsv_in,
        genetic_code=1,
    )
    tmp_out = 'tmp.ref_preparer_test_run_noncoding_checks'
    refprep.run(tmp_out)

    expected_outdir = os.path.join(data_dir, 'ref_preparer_test_run_noncoding_checks.out')
    # Every generated file must byte-for-byte match the expected one.
    for filename in (
        '01.filter.check_metadata.tsv',
        '01.filter.check_genes.log',
        '01.filter.check_noncoding.log',
        '01.filter.check_metadata.log',
        '02.cdhit.all.fa',
        '02.cdhit.clusters.tsv',
        '02.cdhit.gene.fa',
        '02.cdhit.gene.varonly.fa',
        '02.cdhit.noncoding.fa',
        '02.cdhit.noncoding.varonly.fa',
    ):
        self.assertTrue(filecmp.cmp(
            os.path.join(expected_outdir, filename),
            os.path.join(tmp_out, filename),
            shallow=False))

    common.rmtree(tmp_out)
import unittest import shutil import os import pickle import pyfastaq import filecmp from ariba import clusters, common, external_progs, histogram, sequence_metadata modules_dir = os.path.dirname(os.path.abspath(clusters.__file__)) data_dir = os.path.join(modules_dir, 'tests', 'data') extern_progs = external_progs.ExternalProgs() def file_to_list(infile): f = pyfastaq.utils.open_file_read(infile) lines = [x for x in f.readlines()] pyfastaq.utils.close(f) return lines class TestClusters(unittest.TestCase): def setUp(self): self.cluster_dir = 'tmp.Cluster' self.refdata_dir = 'tmp.RefData' os.mkdir(self.refdata_dir) shutil.copyfile(os.path.join(data_dir, 'clusters_test_dummy_db.fa'), os.path.join(self.refdata_dir, '02.cdhit.all.fa')) shutil.copyfile( os.path.join(data_dir, 'clusters_test_dummy_db.tsv'), os.path.join(self.refdata_dir, '01.filter.check_metadata.tsv')) with open(os.path.join(self.refdata_dir, '00.info.txt'), 'w') as f:
def test_external_progs_ok(self):
    '''Test that external programs are found'''
    # Constructing ExternalProgs with verbose=True is the whole test:
    # it is expected to raise if any required program is missing.
    external_progs.ExternalProgs(verbose=True)
def __init__(self, root_dir, name, refdata, all_ref_seqs_fasta=None,
             total_reads=None, total_reads_bases=None, fail_file=None,
             read_store=None, reference_names=None, logfile=None,
             assembly_coverage=50, assembly_kmer=21, assembler='fermilite',
             max_insert=1000, min_scaff_depth=10, nucmer_min_id=90,
             nucmer_min_len=20, nucmer_breaklen=200, reads_insert=500,
             sspace_k=20, sspace_sd=0.4, threads=1,
             assembled_threshold=0.95, min_var_read_depth=5,
             min_second_var_read_depth=2, max_allele_freq=0.90,
             unique_threshold=0.03, max_gene_nt_extend=30,
             spades_mode="rna",  # ["rna","wgs"]
             spades_options=None, clean=True, extern_progs=None,
             random_seed=42, threads_total=1):
    '''Set up all paths, thresholds and state for processing one cluster.

    Only records configuration and registers signal/atexit handlers; no
    heavy work happens here — that is done later (see the log_fh note
    below about run()).
    '''
    self.root_dir = os.path.abspath(root_dir)
    self.read_store = read_store
    self.refdata = refdata
    self.name = name
    self.fail_file = fail_file
    # Well-known file names inside the cluster's root directory.
    self.reference_fa = os.path.join(self.root_dir, 'reference.fa')
    self.reference_names = reference_names
    self.all_reads1 = os.path.join(self.root_dir, 'reads_1.fq')
    self.all_reads2 = os.path.join(self.root_dir, 'reads_2.fq')
    self.references_fa = os.path.join(self.root_dir, 'references.fa')

    # If the directory already exists, check its expected input files.
    if os.path.exists(self.root_dir):
        self._input_files_exist()

    self.total_reads = total_reads
    self.total_reads_bases = total_reads_bases
    self.logfile = logfile
    self.assembly_coverage = assembly_coverage
    self.assembly_kmer = assembly_kmer
    self.assembler = assembler
    self.sspace_k = sspace_k
    self.sspace_sd = sspace_sd
    self.reads_insert = reads_insert
    self.spades_mode = spades_mode
    self.spades_options = spades_options
    self.reads_for_assembly1 = os.path.join(self.root_dir, 'reads_for_assembly_1.fq')
    self.reads_for_assembly2 = os.path.join(self.root_dir, 'reads_for_assembly_2.fq')
    self.ref_sequence = None
    self.max_insert = max_insert
    self.min_scaff_depth = min_scaff_depth
    self.nucmer_min_id = nucmer_min_id
    self.nucmer_min_len = nucmer_min_len
    self.nucmer_breaklen = nucmer_breaklen
    self.min_var_read_depth = min_var_read_depth
    self.min_second_var_read_depth = min_second_var_read_depth
    self.max_allele_freq = max_allele_freq
    self.threads = threads
    self.assembled_threshold = assembled_threshold
    self.unique_threshold = unique_threshold
    self.max_gene_nt_extend = max_gene_nt_extend
    self.status_flag = flag.Flag()
    self.clean = clean
    self.threads_total = threads_total
    self.remaining_clusters = None

    # Output paths produced by the assembly/mapping/variant-calling stages.
    self.assembly_dir = os.path.join(self.root_dir, 'Assembly')
    self.final_assembly_fa = os.path.join(self.root_dir, 'assembly.fa')
    self.final_assembly_bam = os.path.join(self.root_dir, 'assembly.reads_mapped.bam')
    self.final_assembly_read_depths = os.path.join(self.root_dir, 'assembly.reads_mapped.bam.read_depths.gz')
    self.final_assembly_vcf = os.path.join(self.root_dir, 'assembly.reads_mapped.bam.vcf')
    self.samtools_vars_prefix = self.final_assembly_bam
    self.assembly_compare = None
    self.variants_from_samtools = {}
    self.assembly_compare_prefix = os.path.join(self.root_dir, 'assembly_compare')
    self.mummer_variants = {}
    self.variant_depths = {}
    self.percent_identities = {}

    # The log filehandle self.log_fh is set at the start of the run() method.
    # Lots of other methods use self.log_fh. But for unit testing, run() isn't
    # run. So we need to set this to something for unit testing.
    # On the other hand, setting it here breaks a real run of ARIBA because
    # multiprocessing complains with the error:
    #     TypeError: cannot serialize '_io.TextIOWrapper' object.
    # Hence the following two lines...
    # NOTE(review): 'unittest' here is presumably a module-level truthiness
    # flag rather than the stdlib module — verify where it is set.
    if unittest:
        self.log_fh = sys.stdout
    else:
        atexit.register(self._atexit)
        self.log_fh = None

    if extern_progs is None:
        self.extern_progs = external_progs.ExternalProgs()
    else:
        self.extern_progs = extern_progs

    if all_ref_seqs_fasta is None:
        self.all_refs_fasta = self.references_fa
    else:
        self.all_refs_fasta = os.path.abspath(all_ref_seqs_fasta)

    self.random_seed = random_seed
    # Route fatal signals through our handler so cleanup can happen.
    wanted_signals = [signal.SIGABRT, signal.SIGINT, signal.SIGSEGV, signal.SIGTERM]
    for s in wanted_signals:
        signal.signal(s, self._receive_signal)
import unittest
import sys
import os
import shutil
import filecmp
import pyfastaq
from ariba import assembly
from ariba import external_progs

modules_dir = os.path.dirname(os.path.abspath(assembly.__file__))
data_dir = os.path.join(modules_dir, 'tests', 'data')
extern_progs = external_progs.ExternalProgs(using_spades=True)


class TestAssembly(unittest.TestCase):
    def test_run_fermilite(self):
        '''test _run_fermilite'''
        reads = os.path.join(data_dir, 'assembly_run_fermilite.reads.fq')
        tmp_fa = 'tmp.test_run_fermilite.fa'
        tmp_log = 'tmp.test_run_fermilite.log'
        # Fermilite should succeed (return code 0) and its fasta/log output
        # must exactly match the stored expected files.
        exit_code = assembly.Assembly._run_fermilite(reads, tmp_fa, tmp_log, 'contig')
        self.assertEqual(0, exit_code)
        for expected_name, got_file in (
            ('assembly_run_fermilite.expected.fa', tmp_fa),
            ('assembly_run_fermilite.expected.log', tmp_log),
        ):
            self.assertTrue(filecmp.cmp(
                os.path.join(data_dir, expected_name), got_file, shallow=False))
        os.unlink(tmp_fa)
        os.unlink(tmp_log)
def setUp(self):
    # Fresh ExternalProgs lookup per test so tests stay independent.
    self.external_progs = external_progs.ExternalProgs()