Пример #1
0
    def _closest_nucmer_match_between_fastas(cls, ref_fasta, qry_fasta, log_fh,
                                             min_id, min_length, breaklen,
                                             use_qry_length, check_flanking):
        '''Run nucmer on ref_fasta and qry_fasta and choose the best match.

        The nucmer coords file is written into a temporary directory made in
        the current working directory. That directory is always removed
        before returning, including when nucmer or the coords loading raises.

        min_id, min_length and breaklen are passed to pymummer.nucmer.Runner.
        use_qry_length and check_flanking are passed to
        RefSeqChooser._choose_best_nucmer_match.

        Returns a tuple (best_hit, nucmer_matches). If there are no nucmer
        matches, returns (None, {}).
        '''
        tmpdir = tempfile.mkdtemp(prefix='tmp.closest_nucmer_match.',
                                  dir=os.getcwd())
        try:
            coords_file = os.path.join(tmpdir,
                                       'nucmer_vs_cluster_refs.coords')
            pymummer.nucmer.Runner(
                ref_fasta,
                qry_fasta,
                coords_file,
                min_id=min_id,
                min_length=min_length,
                breaklen=breaklen,
                maxmatch=True,
            ).run()
            nucmer_matches = RefSeqChooser._load_nucmer_coords_file(
                coords_file, log_fh=log_fh)
        finally:
            # Never leave the scratch directory behind, even on failure
            common.rmtree(tmpdir)

        if len(nucmer_matches) == 0:
            return None, {}

        best_hit = RefSeqChooser._choose_best_nucmer_match(
            nucmer_matches,
            use_qry_length=use_qry_length,
            check_flanking=check_flanking)
        return best_hit, nucmer_matches
Пример #2
0
 def test_assemble_with_fermilite(self):
     '''test _assemble_with_fermilite'''
     reads1 = os.path.join(data_dir,
                           'assembly_assemble_with_fermilite.reads_1.fq')
     reads2 = os.path.join(data_dir,
                           'assembly_assemble_with_fermilite.reads_2.fq')
     expected_log = os.path.join(
         data_dir, 'assembly_assemble_with_fermilite.expected.log')
     expected_fa = os.path.join(
         data_dir, 'assembly_assemble_with_fermilite.expected.fa')
     tmp_dir = 'tmp.test_assemble_with_fermilite'
     tmp_log = 'tmp.test_assemble_with_fermilite.log'
     # The context manager closes (and flushes) the log before it is compared
     # below, and also when the assembler raises.
     with open(tmp_log, 'w') as tmp_log_fh:
         print('First line', file=tmp_log_fh)
         a = assembly.Assembly(reads1, reads2, 'not needed', 'not needed',
                               tmp_dir, 'not_needed_for_this_test.fa',
                               'not_needed_for_this_test.bam', tmp_log_fh,
                               'not needed')
         a._assemble_with_fermilite()
     self.assertTrue(a.assembled_ok)
     self.assertTrue(filecmp.cmp(expected_log, tmp_log, shallow=False))
     self.assertTrue(
         filecmp.cmp(expected_fa,
                     os.path.join(tmp_dir, 'debug_all_contigs.fa'),
                     shallow=False))
     common.rmtree(tmp_dir)
     os.unlink(tmp_log)
Пример #3
0
    def test_load_fasta_files_and_write_clusters_file(self):
        '''Check _load_fasta_files_and_write_clusters_file output and loaded data'''
        input_dir = os.path.join(
            data_dir, 'pubmlst_ref_prepare.test_load_fa_and_clusters.in')
        output_dir = 'tmp.test.pubmlst_ref_prepare.test_load_fa_and_clusters'
        os.mkdir(output_dir)
        preparer = pubmlst_ref_preparer.PubmlstRefPreparer('species',
                                                           output_dir)
        preparer.profile = mlst_profile.MlstProfile(
            os.path.join(input_dir, 'profile.txt'))
        preparer._load_fasta_files_and_write_clusters_file(input_dir)

        # The clusters file must match the stored expectation byte for byte
        clusters_tsv_ok = filecmp.cmp(
            os.path.join(
                data_dir,
                'pubmlst_ref_prepare.test_load_fa_and_clusters.expect.tsv'),
            preparer.clusters_file,
            shallow=False)
        self.assertTrue(clusters_tsv_ok)
        common.rmtree(output_dir)

        self.assertEqual(
            [os.path.join(input_dir, 'gene1.tfa'),
             os.path.join(input_dir, 'gene2.tfa')],
            preparer.fasta_files)

        make_fasta = pyfastaq.sequences.Fasta
        want_seqs = {
            'gene1': {
                'gene1_1': make_fasta('gene1_1', 'ACGT'),
                'gene1_2': make_fasta('gene1_2', 'AAAA'),
            },
            'gene2': {
                'gene2_1': make_fasta('gene2_1', 'GGGG'),
                'gene2_2': make_fasta('gene2_2', 'TTTT'),
            },
        }
        self.assertEqual(want_seqs, preparer.sequences)
Пример #4
0
    def run(self, reads_out1, reads_out2):
        '''Filter this cluster's reads with cd-hit-est-2d and write the kept reads.

        All reads of self.cluster_name are first written as FASTA into a
        temporary directory (made in the current working directory) and
        compared against self.references_fa. The read IDs taken from the
        resulting .clstr file are then fetched from the read store into
        reads_out1 and reads_out2.

        The temporary directory is always removed, including when one of the
        steps raises.

        Returns the (total_reads, total_bases) tuple from
        self.readstore.get_reads.
        '''
        tmpdir = tempfile.mkdtemp(prefix='tmp.filter_reads.', dir=os.getcwd())
        try:
            all_reads_fasta = os.path.join(tmpdir, 'all_reads_for_cdhit.fa')
            self.readstore.get_reads(self.cluster_name,
                                     all_reads_fasta,
                                     fasta=True,
                                     log_fh=self.log_fh)
            cdhit_out = os.path.join(tmpdir, 'cdhit')
            ReadFilter._run_cdhit_est_2d(self.references_fa,
                                         all_reads_fasta,
                                         cdhit_out,
                                         self.extern_progs.exe('cdhit2d'),
                                         verbose=True,
                                         verbose_fh=self.log_fh)

            wanted_read_ids = ReadFilter._cdhit_clstr_to_reads(cdhit_out +
                                                               '.clstr')
            total_reads, total_bases = self.readstore.get_reads(
                self.cluster_name,
                reads_out1,
                out2=reads_out2,
                log_fh=self.log_fh,
                wanted_ids=wanted_read_ids)
        finally:
            # Do not leak the scratch directory if any step above fails
            common.rmtree(tmpdir)

        return total_reads, total_bases
Пример #5
0
 def test_assemble_with_spades_fail(self):
     '''Check _assemble_with_spades copes with a spades failure'''
     fq1 = os.path.join(
         data_dir, 'assembly_test_assemble_with_spades_fails_reads_1.fq')
     fq2 = os.path.join(
         data_dir, 'assembly_test_assemble_with_spades_fails_reads_2.fq')
     out_dir = 'tmp.test_assemble_with_spades_fail'
     log_name = 'tmp.test_assemble_with_spades_fail.log'
     with open(log_name, 'w') as log_fh:
         print('First line', file=log_fh)
         common.rmtree(out_dir)
         asm = assembly.Assembly(
             fq1, fq2,
             'not needed', 'not needed',
             out_dir,
             'not_needed_for_this_test.fa',
             'not_needed_for_this_test.bam',
             log_fh,
             'not needed',
             assembler="spades",
             spades_options=" --only-assembler")
         asm._assemble_with_spades()
     # The assembly must be flagged as unsuccessful
     self.assertFalse(asm.assembled_ok)
     common.rmtree(out_dir)
     os.unlink(log_name)
Пример #6
0
 def test_assemble_with_spades(self):
     '''Check _assemble_with_spades on a successful assembly'''
     fq1 = os.path.join(data_dir,
                        'assembly_test_assemble_with_spades_reads_1.fq')
     fq2 = os.path.join(data_dir,
                        'assembly_test_assemble_with_spades_reads_2.fq')
     out_dir = 'tmp.test_assemble_with_spades'
     log_name = 'tmp.test_assemble_with_spades.log'
     with open(log_name, 'w') as log_fh:
         print('First line', file=log_fh)
         common.rmtree(out_dir)
         # spades_options=" --only-assembler" is used because error
         # correction cannot determine the quality offset on this
         # artificial dataset
         asm = assembly.Assembly(
             fq1, fq2,
             'not needed', 'not needed',
             out_dir,
             'not_needed_for_this_test.fa',
             'not_needed_for_this_test.bam',
             log_fh,
             'not needed',
             assembler="spades",
             spades_options=" --only-assembler")
         asm._assemble_with_spades()
     self.assertTrue(asm.assembled_ok)
     common.rmtree(out_dir)
     os.unlink(log_name)
Пример #7
0
    def test_run_all_noncoding(self):
        '''Check run with no metadata input, where all sequences are noncoding'''
        input_names = (
            'ref_preparer_test_run.in.1.fa',
            'ref_preparer_test_run.in.2.fa',
            'ref_preparer_test_run.in.3.fa',
        )
        input_fastas = [os.path.join(data_dir, name) for name in input_names]

        progs = external_progs.ExternalProgs()
        preparer = ref_preparer.RefPreparer(input_fastas, progs, all_coding='no', genetic_code=1)
        out_dir = 'tmp.ref_preparer_test_run'
        preparer.run(out_dir)
        want_dir = os.path.join(data_dir, 'ref_preparer_test_run_all_noncoding.out')

        # Every one of these output files must be identical to the stored copy
        files_to_compare = (
            '00.auto_metadata.tsv',
            '01.filter.check_metadata.tsv',
            '01.filter.check_genes.log',
            '01.filter.check_noncoding.log',
            '01.filter.check_metadata.log',
            '02.cdhit.all.fa',
            '02.cdhit.clusters.tsv',
            '02.cdhit.gene.fa',
            '02.cdhit.gene.varonly.fa',
            '02.cdhit.noncoding.fa',
            '02.cdhit.noncoding.varonly.fa',
        )

        for basename in files_to_compare:
            same = filecmp.cmp(os.path.join(want_dir, basename),
                               os.path.join(out_dir, basename),
                               shallow=False)
            self.assertTrue(same)

        common.rmtree(out_dir)
Пример #8
0
def _run_cluster(obj, verbose, clean, fails_dir, remaining_clusters, remaining_clusters_lock):
    '''Run one cluster object, recording a failure marker if it raises.

    If fails_dir already contains any entry, the cluster is not started.
    Otherwise obj.run() is called; when it raises, an empty file named
    obj.name is created in fails_dir and the error is not propagated.
    With clean set, obj.root_dir is then removed if it exists, ignoring
    any error from the removal.

    Always returns obj.
    '''
    if os.listdir(fails_dir):
        print('Other clusters failed. Will not start cluster', obj.name, file=sys.stderr)
        return obj

    if verbose:
        print('Start running cluster', obj.name, 'in directory', obj.root_dir, flush=True)

    try:
        obj.run(remaining_clusters=remaining_clusters,remaining_clusters_lock=remaining_clusters_lock)
    except BaseException:
        # NOTE(review): this catches everything, including KeyboardInterrupt
        # and SystemExit - confirm that is intended.
        print('Failed cluster:', obj.name, file=sys.stderr)
        marker = os.path.join(fails_dir, obj.name)
        open(marker, 'w').close()

    if verbose:
        print('Finished running cluster', obj.name, 'in directory', obj.root_dir, flush=True)

    if not clean:
        return obj

    if verbose:
        print('Deleting cluster dir', obj.root_dir, flush=True)
    if os.path.exists(obj.root_dir):
        try:
            common.rmtree(obj.root_dir)
        except BaseException:
            # Best-effort cleanup: a failed delete is ignored
            pass

    return obj
Пример #9
0
    def _get_from_vfdb_common(self, outprefix, filename, info_text):
        '''Download one VFDB file and run vfdb_parser.VfdbParser on it.

        filename is appended to the VFDB download URL and saved inside the
        directory <outprefix>.tmp.download. That directory is removed after
        parsing unless self.debug is set. info_text is not used here.

        Prints the names <outprefix>.fa and <outprefix>.tsv as the final
        files, followed by usage and citation information.

        Raises Error if the download directory cannot be created.
        '''
        outprefix = os.path.abspath(outprefix)
        tmpdir = outprefix + '.tmp.download'

        try:
            os.mkdir(tmpdir)
        except OSError as err:
            # Keep the original cause attached to the package-level error
            raise Error('Error mkdir ' + tmpdir) from err

        # Not called "zipfile", so the stdlib module name is not shadowed
        downloaded_zip = os.path.join(tmpdir, filename)
        common.download_file('http://www.mgc.ac.cn/VFs/Down/' + filename, downloaded_zip, max_attempts=self.max_download_attempts, sleep_time=self.sleep_time, verbose=True)
        print('Extracting files ... ', end='', flush=True)
        vparser = vfdb_parser.VfdbParser(downloaded_zip, outprefix)
        vparser.run()
        if not self.debug:
            common.rmtree(tmpdir)
        print('done')
        final_fasta = outprefix + '.fa'
        final_tsv = outprefix + '.tsv'

        print('Extracted core DNA sequence dataset and metadata. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
        print('You can use them with ARIBA like this:')
        print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
        print('If you use this downloaded data, please cite:')
        print('"VFDB 2016: hierarchical and refined dataset for big data analysis-10 years on",\nChen LH et al 2016, Nucleic Acids Res. 44(Database issue):D694-D697. PMID: 26578559\n')
Пример #10
0
    def _extract_files(cls, zip_file, outdir):
        '''Extract the annotations, fasta and header mappings files from a zip.

        outdir is created (it must not already exist). A zip member is
        extracted into it when its name contains '_annotations_', contains
        '_database_' and ends with '.fasta', or contains '_header_mappings_'.

        Returns a dict with keys 'annotations', 'fasta' and
        'header_mappings', whose values are the member names as stored in
        the zip (i.e. paths relative to outdir).

        Raises Error if outdir cannot be created, or if any of the three
        kinds of file is missing. In the latter case outdir is removed first.
        '''
        original_files = {
            'annotations': None,
            'fasta': None,
            'header_mappings': None
        }

        try:
            os.mkdir(outdir)
        except OSError as err:
            raise Error('Error making directory ' + outdir) from err

        # The context manager closes the archive handle even if extract fails
        with zipfile.ZipFile(zip_file) as zfile:
            for member in zfile.namelist():
                if '_annotations_' in member:
                    original_files['annotations'] = member
                elif '_database_' in member and member.endswith('.fasta'):
                    original_files['fasta'] = member
                elif '_header_mappings_' in member:
                    original_files['header_mappings'] = member
                else:
                    continue

                zfile.extract(member, path=outdir)

        if None in original_files.values():
            common.rmtree(outdir)
            raise Error(
                'Error. Not all expected files found in downloaded megares zipfile. '
                + str(original_files))

        return original_files
Пример #11
0
    def test_extract_files_ok(self):
        '''Check _extract_files when everything is ok'''
        out_dir = 'tmp.test_megares_extract_files_ok'
        zip_path = os.path.join(data_dir,
                                'megares_zip_parse_extract_files_ok.zip')
        got = megares_zip_parser.MegaresZipParser._extract_files(
            zip_path, out_dir)

        # All three expected files sit in the same directory inside the zip
        inner_dir = os.path.join('megares_zip_parse_extract_files_ok',
                                 'megares_v1.01')
        basenames = {
            'annotations': 'megares_annotations_v1.01.csv',
            'fasta': 'megares_database_v1.01.fasta',
            'header_mappings': 'megares_to_external_header_mappings_v1.01.tsv',
        }
        expected = {
            key: os.path.join(inner_dir, name)
            for key, name in basenames.items()
        }
        self.assertEqual(expected, got)

        for relpath in expected.values():
            extracted = os.path.join(out_dir, relpath)
            self.assertTrue(os.path.exists(extracted))

        common.rmtree(out_dir)
Пример #12
0
 def test_make_prepareref_dir(self):
     '''test make_prepareref_dir'''
     outdir = 'tmp.make_prepareref_dir'
     # Remove the directory first, so the existence check below reflects
     # this call to make_prepareref_dir
     common.rmtree(outdir)
     tb.make_prepareref_dir(outdir)
     self.assertTrue(os.path.exists(outdir))
     # NOTE(review): only the existence of the directory is checked here;
     # consider also asserting on its contents (e.g. 00.params.json).
     common.rmtree(outdir)
Пример #13
0
    def _get_from_argannot(self, outprefix):
        '''Download ARG-ANNOT and write <outprefix>.fa and <outprefix>.tsv.

        The zip is downloaded and unzipped inside <outprefix>.tmp.download.
        Each sequence of 'Database Nt Sequences File.txt' is renamed (first
        word of the ID, with '(...)' replaced by '....') and written to the
        FASTA file, with a matching line in the TSV file recording the
        original name. The download directory is removed at the end unless
        self.debug is set.

        The working directory is changed while downloading/unzipping and is
        always restored, including when the download or unzip fails.

        Raises Error if the download directory cannot be created or entered.
        '''
        outprefix = os.path.abspath(outprefix)
        tmpdir = outprefix + '.tmp.download'
        current_dir = os.getcwd()

        try:
            os.mkdir(tmpdir)
            os.chdir(tmpdir)
        except OSError as err:
            raise Error('Error mkdir/chdir ' + tmpdir) from err

        # Not called "zipfile", so the stdlib module name is not shadowed
        zip_name = 'arg-annot-database_doc.zip'
        try:
            common.download_file(
                'http://www.mediterranee-infection.com/arkotheque/client/ihumed/_depot_arko/articles/304/arg-annot-database_doc.zip',
                zip_name,
                max_attempts=self.max_download_attempts,
                sleep_time=self.sleep_time,
                verbose=True)
            common.syscall('unzip ' + zip_name)
        finally:
            # Do not leave the process stranded in tmpdir on failure
            os.chdir(current_dir)
        print('Extracted files.')

        genes_file = os.path.join(tmpdir, 'Database Nt Sequences File.txt')
        final_fasta = outprefix + '.fa'
        final_tsv = outprefix + '.tsv'

        seq_reader = pyfastaq.sequences.file_reader(genes_file)
        f_out_tsv = pyfastaq.utils.open_file_write(final_tsv)
        f_out_fa = pyfastaq.utils.open_file_write(final_fasta)

        try:
            for seq in seq_reader:
                original_id = seq.id
                seq.id = re.sub(r'\((.*)\)', r'\1.', seq.id.split()[0])
                print(seq, file=f_out_fa)
                print(seq.id,
                      '1',
                      '0',
                      '.',
                      '.',
                      'Original name: ' + original_id,
                      sep='\t',
                      file=f_out_tsv)
        finally:
            # Close the outputs even if reading or writing a sequence fails
            pyfastaq.utils.close(f_out_tsv)
            pyfastaq.utils.close(f_out_fa)

        if not self.debug:
            common.rmtree(tmpdir)

        print('Finished. Final files are:',
              final_fasta,
              final_tsv,
              sep='\n\t',
              end='\n\n')
        print('You can use them with ARIBA like this:')
        print('ariba prepareref -f', final_fasta, '-m', final_tsv,
              'output_directory\n')
        print('If you use this downloaded data, please cite:')
        print(argannot_ref)
Пример #14
0
    def _get_from_virulencefinder(self, outprefix):
        '''Download virulencefinder and write <outprefix>.fa and <outprefix>.tsv.

        When self.version is 'old', the data is downloaded with curl and
        unzipped inside <outprefix>.tmp.download. Otherwise it is fetched by
        RefGenesGetter._get_genetic_epi_database_from_bitbucket, using
        self.version as the git commit. Every '.fsa' file in the download
        directory is then fixed, renamed per sequence and appended to the
        output FASTA, with one TSV line per sequence recording the original
        name. Repeated names get a '.<count>' suffix.

        The working directory is changed while this runs and is always
        restored, including on failure. The download directory is removed
        at the end unless self.debug is set.

        Raises Error if, for version 'old', the download directory cannot be
        created or entered.
        '''
        outprefix = os.path.abspath(outprefix)
        final_fasta = outprefix + '.fa'
        final_tsv = outprefix + '.tsv'
        tmpdir = outprefix + '.tmp.download'
        current_dir = os.getcwd()

        try:
            if self.version == 'old':
                try:
                    os.mkdir(tmpdir)
                    os.chdir(tmpdir)
                except OSError as err:
                    raise Error('Error mkdir/chdir ' + tmpdir) from err

                # Not called "zipfile", so the stdlib module is not shadowed
                zip_name = 'virulencefinder.zip'
                cmd = 'curl -X POST --data "folder=virulencefinder&filename=virulencefinder.zip" -o ' + zip_name + ' https://cge.cbs.dtu.dk/cge/download_data.php'
                print('Downloading data with:', cmd, sep='\n')
                common.syscall(cmd)
                common.syscall('unzip ' + zip_name)
            else:
                RefGenesGetter._get_genetic_epi_database_from_bitbucket('virulencefinder', tmpdir, git_commit=self.version)
                os.chdir(tmpdir)

            print('Combining downloaded fasta files...')
            fout_fa = pyfastaq.utils.open_file_write(final_fasta)
            fout_tsv = pyfastaq.utils.open_file_write(final_tsv)
            name_count = {}

            try:
                for filename in os.listdir(tmpdir):
                    if not filename.endswith('.fsa'):
                        continue

                    print('   ', filename)
                    fix_file = os.path.join(tmpdir, filename + '.fix.fsa')
                    RefGenesGetter._fix_virulencefinder_fasta_file(os.path.join(tmpdir, filename), fix_file)
                    file_reader = pyfastaq.sequences.file_reader(fix_file)
                    for seq in file_reader:
                        original_id = seq.id
                        seq.id = seq.id.replace('_', '.', 1)
                        seq.id = seq.id.replace(' ', '_')
                        if seq.id in name_count:
                            # Repeated name: count it under the unsuffixed
                            # name, then append the new count
                            name_count[seq.id] += 1
                            seq.id = seq.id + '.' + str(name_count[seq.id])
                        else:
                            name_count[seq.id] = 1
                        print(seq, file=fout_fa)
                        print(seq.id, '0', '0', '.', '.', 'Original name was ' + original_id, sep='\t', file=fout_tsv)
            finally:
                # Close the outputs even if a file fails to parse
                pyfastaq.utils.close(fout_fa)
                pyfastaq.utils.close(fout_tsv)

            print('\nFinished combining files\n')
        finally:
            # Always return to the caller's working directory
            os.chdir(current_dir)

        if not self.debug:
            common.rmtree(tmpdir)
        print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
        print('You can use them with ARIBA like this:')
        print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
        print('If you use this downloaded data, please cite:')
        print('"Real-time whole-genome sequencing for routine typing, surveillance, and outbreak detection of verotoxigenic Escherichia coli", Joensen al 2014, PMID: 24574290\n')
Пример #15
0
 def run(self):
     '''Run the command from self.get_run_cmd and return the clusters.

     The command writes into a temporary directory made in the current
     working directory. The clusters are read from the '.bak.clstr' file
     using self.min_cluster_number. The temporary directory is always
     removed, including when the command or the parsing raises.
     '''
     tmpdir = tempfile.mkdtemp(prefix='tmp.run_cd-hit.', dir=os.getcwd())
     try:
         cdhit_fasta = os.path.join(tmpdir, 'cdhit')
         cluster_info_outfile = cdhit_fasta + '.bak.clstr'
         cmd = self.get_run_cmd(cdhit_fasta)
         common.syscall(cmd, verbose=self.verbose)
         clusters = self._get_clusters_from_bak_file(cluster_info_outfile, self.min_cluster_number)
     finally:
         # Do not leak the scratch directory if the run fails
         common.rmtree(tmpdir)
     return clusters
Пример #16
0
    def test_rmtree(self):
        '''Check rmtree deletes a directory that has a file in it'''
        target = 'tmp.rmtree'
        os.mkdir(target)
        # Create an empty file so the directory is not empty
        open(os.path.join(target, 'foo'), 'w').close()

        self.assertTrue(os.path.exists(target))
        common.rmtree(target)
        self.assertFalse(os.path.exists(target))
Пример #17
0
 def run(self):
     '''Download the megares zip, convert it, then delete the download.

     The zip at self.zip_url is saved to self.zip_file and the wanted files
     are extracted into <self.zip_file>.tmp.extract. The annotations, header
     mappings and sequences are loaded from there and passed to
     MegaresZipParser._write_files with self.outprefix.

     The extraction directory is removed even if loading or writing fails.
     The zip file itself is only deleted after a successful run.
     '''
     common.download_file(self.zip_url, self.zip_file, verbose=True)
     tmpdir = self.zip_file + '.tmp.extract'
     # Extraction stays outside the try block: if it fails because tmpdir
     # already exists, that directory was not made here and must not be
     # deleted.
     original_files = MegaresZipParser._extract_files(self.zip_file, tmpdir)
     try:
         annotation_data = MegaresZipParser._load_annotations_file(os.path.join(tmpdir, original_files['annotations']))
         header_data = MegaresZipParser._load_header_mappings_file(os.path.join(tmpdir, original_files['header_mappings']))
         sequences = {}
         pyfastaq.tasks.file_to_dict(os.path.join(tmpdir, original_files['fasta']), sequences)
         MegaresZipParser._write_files(self.outprefix, sequences, annotation_data, header_data)
     finally:
         common.rmtree(tmpdir)
     os.unlink(self.zip_file)
Пример #18
0
    def _get_xml_file_tree(self):
        '''Download the pubmlst databases XML file and return it parsed.

        The file is saved in a temporary directory made in the current
        working directory. Unless self.debug is set, that directory is
        removed, including when the download or the parsing raises.

        Returns the object returned by ET.parse.
        '''
        xml_url = 'http://pubmlst.org/data/dbases.xml'
        tmpdir = tempfile.mkdtemp(prefix='tmp.get_pubmlst_xml', dir=os.getcwd())
        try:
            xml_file = os.path.join(tmpdir, 'out.xml')
            self._download_file(xml_url, xml_file)
            xml_tree = ET.parse(xml_file)
        finally:
            # In debug mode the download is kept for inspection
            if not self.debug:
                common.rmtree(tmpdir)

        return xml_tree
Пример #19
0
 def test_full_run_ok_gene_start_mismatch(self):
     '''Complete run where the gene is extended because it is too different at the end for a full nucmer match'''
     ref_fa = os.path.join(data_dir, 'cluster_test_full_run_ok_gene_start_mismatch.fa')
     ref_tsv = os.path.join(data_dir, 'cluster_test_full_run_ok_gene_start_mismatch.metadata.tsv')
     refs = reference_data.ReferenceData([ref_fa], [ref_tsv])

     # Work on a fresh copy of the input directory
     workdir = 'tmp.cluster_test_full_run_ok_gene_start_mismatch'
     common.rmtree(workdir)
     fixture_dir = os.path.join(data_dir, 'cluster_test_full_run_ok_gene_start_mismatch')
     shutil.copytree(fixture_dir, workdir)

     clstr = cluster.Cluster(workdir, 'cluster_name', refs, total_reads=112, total_reads_bases=1080)
     clstr.run()
     want_line = 'gene\tgene\t1\t0\t27\t112\tcluster_name\t96\t96\t100.0\tcluster_name.l6.c30.ctg.1\t362\t27.8\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tGeneric description of gene'
     self.assertEqual([want_line], clstr.report_lines)
     common.rmtree(workdir)
Пример #20
0
 def test_full_run_smtls_snp_varonly_nonc(self):
     '''Complete run where samtools calls a snp in a presence/absence noncoding sequence'''
     ref_fa = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc.fa')
     ref_tsv = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc.tsv')
     refs = reference_data.ReferenceData([ref_fa], [ref_tsv])

     # Work on a fresh copy of the input directory
     workdir = 'tmp.cluster_full_run_smtls_snp_varonly_nonc'
     common.rmtree(workdir)
     fixture_dir = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc')
     shutil.copytree(fixture_dir, workdir)

     clstr = cluster.Cluster(workdir, 'cluster_name', refs, total_reads=148, total_reads_bases=13320)
     clstr.run()
     want_line = 'ref_seq\tref_seq\t0\t1\t147\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t0\tHET\t.\t.\t.\tG18A\t.\t18\t18\tG\t137\t137\tG\t63\tG,A\t32,31\t.\tGeneric description of ref_seq'
     self.assertEqual([want_line], clstr.report_lines)
     common.rmtree(workdir)
Пример #21
0
    def test_full_run_insert_codon(self):
        '''Complete run where there is an inserted codon'''
        ref_fa = os.path.join(data_dir, 'cluster_test_full_run_insert_codon.fa')
        ref_tsv = os.path.join(data_dir, 'cluster_test_full_run_insert_codon.tsv')
        refs = reference_data.ReferenceData([ref_fa], [ref_tsv])

        # Work on a fresh copy of the input directory
        workdir = 'tmp.cluster_test_full_insert_codon'
        common.rmtree(workdir)
        fixture_dir = os.path.join(data_dir, 'cluster_test_full_run_insert_codon')
        shutil.copytree(fixture_dir, workdir)

        clstr = cluster.Cluster(workdir, 'cluster_name', refs, total_reads=292, total_reads_bases=20900)
        clstr.run()

        want_line = 'presence_absence1\tpresence_absence1\t1\t0\t539\t292\tcluster_name\t108\t108\t92.31\tcluster_name.l15.c30.ctg.1\t1115\t19.9\t0\t.\tp\t.\t0\tS25_M26insELI\tINS\t73\t73\tA\t554\t554\tG\t24\tG\t24\t.\tGeneric description of presence_absence1'
        self.assertEqual([want_line], clstr.report_lines)
        common.rmtree(workdir)
Пример #22
0
    def test_full_run_partial_assembly(self):
        '''Complete run where only part of the ref gene is present in the reads'''
        ref_fa = os.path.join(data_dir, 'cluster_test_full_run_partial_asmbly.fa')
        ref_tsv = os.path.join(data_dir, 'cluster_test_full_run_partial_asmbly.tsv')
        refs = reference_data.ReferenceData([ref_fa], [ref_tsv])

        # Work on a fresh copy of the input directory
        workdir = 'tmp.cluster_test_full_run_partial_assembly'
        common.rmtree(workdir)
        fixture_dir = os.path.join(data_dir, 'cluster_test_full_run_partial_asmbly')
        shutil.copytree(fixture_dir, workdir)

        clstr = cluster.Cluster(workdir, 'cluster_name', refs, total_reads=278, total_reads_bases=15020)
        clstr.run()

        want_line = 'presence_absence1\tpresence_absence1\t1\t0\t19\t278\tcluster_name\t96\t77\t100.0\tcluster_name.l15.c17.ctg.1\t949\t20.5\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tGeneric description of presence_absence1'
        self.assertEqual([want_line], clstr.report_lines)
        common.rmtree(workdir)
Пример #23
0
    def test_full_run_ok_variants_only_variant_not_present_always_report(self):
        '''Complete run on a variants only gene when the variant is not present but is always reported'''
        ref_fa = os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only.fa')
        ref_tsv = os.path.join(data_dir, 'cluster_full_run_varonly.not_present.always_report.tsv')
        refs = reference_data.ReferenceData([ref_fa], [ref_tsv])

        # Work on a fresh copy of the input directory
        workdir = 'tmp.cluster_full_run_varonly.not_present.always_report'
        common.rmtree(workdir)
        fixture_dir = os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only')
        shutil.copytree(fixture_dir, workdir)

        clstr = cluster.Cluster(workdir, 'cluster_name', refs, total_reads=66, total_reads_bases=3300)
        clstr.run()
        want_line = 'variants_only1\tvariants_only1\t1\t1\t27\t66\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t215\t15.3\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tCGC\t65\t67\tCGC\t18;18;19\tC;G;C\t18;18;19\tvariants_only1:1:1:R3S:.:Ref and assembly have wild type, but always report anyway\tGeneric description of variants_only1'
        self.assertEqual([want_line], clstr.report_lines)
        common.rmtree(workdir)
Пример #24
0
    def test_full_run_no_reads_after_filtering(self):
        '''Complete run of cluster when filtering removes all reads'''
        ref_fa = os.path.join(data_dir, 'cluster_test_full_run_no_reads_after_filtering.in.fa')
        ref_tsv = os.path.join(data_dir, 'cluster_test_full_run_no_reads_after_filtering.in.tsv')
        refs = reference_data.ReferenceData([ref_fa], [ref_tsv])

        # Work on a fresh copy of the input directory
        workdir = 'tmp.test_full_run_no_reads_after_filtering'
        common.rmtree(workdir)
        fixture_dir = os.path.join(data_dir, 'cluster_test_full_run_no_reads_after_filtering')
        shutil.copytree(fixture_dir, workdir)

        clstr = cluster.Cluster(workdir, 'cluster_name', refs, total_reads=0, total_reads_bases=0)
        clstr.run()

        # Four placeholders, three filled columns, then 24 more placeholders
        want_fields = ['.'] * 4 + ['64', '0', 'cluster_name'] + ['.'] * 24
        self.assertEqual(['\t'.join(want_fields)], clstr.report_lines)
        self.assertFalse(clstr.status_flag.has('ref_seq_choose_fail'))
        self.assertTrue(clstr.status_flag.has('assembly_fail'))
        common.rmtree(workdir)
Пример #25
0
    def test_full_run_multiple_vars_in_codon(self):
        '''Complete run where there is a codon with a SNP and an indel'''
        ref_fa = os.path.join(data_dir, 'cluster_test_full_run_multiple_vars.fa')
        ref_tsv = os.path.join(data_dir, 'cluster_test_full_run_multiple_vars.tsv')
        refs = reference_data.ReferenceData([ref_fa], [ref_tsv])

        # Work on a fresh copy of the input directory
        workdir = 'tmp.cluster_test_full_run_multiple_vars'
        common.rmtree(workdir)
        fixture_dir = os.path.join(data_dir, 'cluster_test_full_run_multiple_vars')
        shutil.copytree(fixture_dir, workdir)

        clstr = cluster.Cluster(workdir, 'cluster_name', refs, total_reads=292, total_reads_bases=20900)
        clstr.run()

        multiple_line = 'presence_absence1\tpresence_absence1\t1\t0\t539\t292\tcluster_name\t96\t96\t96.91\tcluster_name.l15.c30.ctg.1\t1074\t20.4\t0\t.\tp\t.\t0\t.\tMULTIPLE\t25\t26\tGA\t487\t489\tCAT\t27;26;25\tC;A;T\t27;26;25\t.\tGeneric description of presence_absence1'
        fshift_line = 'presence_absence1\tpresence_absence1\t1\t0\t539\t292\tcluster_name\t96\t96\t96.91\tcluster_name.l15.c30.ctg.1\t1074\t20.4\t0\t.\tp\t.\t0\tA10fs\tFSHIFT\t28\t28\tG\t491\t491\tG\t26\tG\t26\t.\tGeneric description of presence_absence1'
        self.assertEqual([multiple_line, fshift_line], clstr.report_lines)
        common.rmtree(workdir)
Пример #26
0
 def test_full_run_smtls_snp_varonly_gene_2(self):
     '''Complete run where samtools calls a snp in a variant only gene'''
     # Named _2 because test_full_run_smtls_snp_varonly_gene probably tests
     # the same functionality; both tests are kept anyway.
     ref_fa = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene_2.fa')
     ref_tsv = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene_2.tsv')
     refs = reference_data.ReferenceData([ref_fa], [ref_tsv])

     # Work on a fresh copy of the input directory
     workdir = 'tmp.cluster_full_run_smtls_snp_varonly_gene_2'
     common.rmtree(workdir)
     fixture_dir = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene_2')
     shutil.copytree(fixture_dir, workdir)

     clstr = cluster.Cluster(workdir, 'cluster_name', refs, total_reads=148, total_reads_bases=13320)
     clstr.run()
     want_line = 'ref_gene\tref_gene\t1\t1\t155\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t0\tHET\t.\t.\t.\tG18A\t.\t18\t18\tG\t137\t137\tG\t63\tG,A\t32,31\t.\tGeneric description of ref_gene'
     self.assertEqual([want_line], clstr.report_lines)
     common.rmtree(workdir)
Пример #27
0
def clean_cluster_dir(d, exclude=None):
    '''Cleans up all files made except original ones in a cluster directory.

    Everything directly inside d is deleted, except entries named
    'genes.fa', 'reads_1.fq' or 'reads_2.fq' and any names given in
    exclude (an iterable of names, or None). Directories are removed with
    common.rmtree, other entries with os.unlink.

    Does nothing if d does not exist.
    '''
    if not os.path.exists(d):
        return

    keep = {'genes.fa', 'reads_1.fq', 'reads_2.fq'}
    if exclude is not None:
        keep.update(exclude)

    for name in os.listdir(d):
        if name in keep:
            continue

        full_path = os.path.join(d, name)
        if os.path.isdir(full_path):
            common.rmtree(full_path)
        else:
            os.unlink(full_path)
Пример #28
0
    def _extract_files(cls, zip_file, outdir):
        '''Extract the annotations, fasta and header mappings files from a zip.

        outdir is created (it must not already exist). Zip members are
        visited in sorted name order, and a member is extracted into outdir
        when its name contains '_annotations_', contains '_database_' and
        ends with '.fasta', or contains '_header_mappings_'. When several
        members match the same kind, the last one in sorted order is the
        one reported.

        Returns a dict with keys 'annotations', 'fasta' and
        'header_mappings', whose values are the member names as stored in
        the zip (i.e. paths relative to outdir).

        Raises Error if outdir cannot be created, or if any of the three
        kinds of file is missing. In the latter case outdir is removed first.
        '''
        original_files = {
            'annotations': None,
            'fasta': None,
            'header_mappings': None
        }

        try:
            os.mkdir(outdir)
        except OSError as err:
            raise Error('Error making directory ' + outdir) from err

        # Old <2.0.0 megares has eg these files:
        #  megares_annotations_v1.01.csv
        #  megares_database_v1.01.fasta
        #  megares_to_external_header_mappings_v1.01.tsv
        # megares 2.0.0 has these files:
        #  megares_drugs_annotations_v2.00.csv
        #  megares_drugs_database_v2.00.fasta
        #  megares_modified_annotations_v2.00.csv
        #  megares_modified_database_v2.00.fasta
        #  megares_to_external_header_mappings_v2.00.csv
        # The sequences in *_modified_* files seem to be a superset of
        # *_drugs_*, so use the *_modified_* ones. This will happen
        # as long as we loop over sorted filenames, because the _modified_
        # csv and fasta are listed last
        #
        # The context manager closes the archive handle even if extract fails
        with zipfile.ZipFile(zip_file) as zfile:
            for member in sorted(zfile.namelist()):
                if '_annotations_' in member:
                    original_files['annotations'] = member
                elif '_database_' in member and member.endswith('.fasta'):
                    original_files['fasta'] = member
                elif '_header_mappings_' in member:
                    original_files['header_mappings'] = member
                else:
                    continue

                zfile.extract(member, path=outdir)

        if None in original_files.values():
            common.rmtree(outdir)
            raise Error(
                'Error. Not all expected files found in downloaded megares zipfile. '
                + str(original_files))

        return original_files
Пример #29
0
    def test_full_run_cluster_test_full_run_smtls_snp_varonly_nonc(self):
        '''Complete run where samtools calls a snp at a known snp location in a presence/absence noncoding and the sample has the var'''
        ref_fa = os.path.join(data_dir, 'cluster_test_full_run_smtls_snp_varonly_nonc.fa')
        ref_tsv = os.path.join(data_dir, 'cluster_test_full_run_smtls_snp_varonly_nonc.tsv')
        refs = reference_data.ReferenceData([ref_fa], [ref_tsv])

        # Work on a fresh copy of the input directory
        workdir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding'
        common.rmtree(workdir)
        fixture_dir = os.path.join(data_dir, 'cluster_test_full_run_smtls_snp_varonly_nonc')
        shutil.copytree(fixture_dir, workdir)

        clstr = cluster.Cluster(workdir, 'cluster_name', refs, total_reads=148, total_reads_bases=13320)
        clstr.run()

        # There should be no extra 'HET' line: the snp is already known, so
        # it is included in the report line of the known snp
        want_line = 'ref_seq\tref_seq\t0\t1\t147\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t1\tSNP\tn\tA18G\t1\t.\t.\t18\t18\tG\t137\t137\tG\t63\tG,A\t32,31\tref_seq:0:1:A18G:.:Description of A18G snp\t.'
        self.assertEqual([want_line], clstr.report_lines)
        common.rmtree(workdir)
Пример #30
0
    def test_full_run_smtls_snp_varonly_gene(self):
        '''Complete run where samtools calls a snp at a known snp location in a variant only gene, and the gene does have the variant'''
        ref_fa = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene.fa')
        ref_tsv = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene.tsv')
        refs = reference_data.ReferenceData([ref_fa], [ref_tsv])

        # Work on a fresh copy of the input directory
        workdir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_have_var'
        common.rmtree(workdir)
        fixture_dir = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_gene')
        shutil.copytree(fixture_dir, workdir)

        clstr = cluster.Cluster(workdir, 'cluster_name', refs, total_reads=148, total_reads_bases=13320)
        clstr.run()

        # There should be no extra 'HET' line: the snp is already known, so
        # it is included in the report line of the known snp
        want_line = 'ref_gene\tref_gene\t1\t1\t155\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t1\tSNP\tp\tI6M\t1\t.\t.\t16\t18\tATG\t135\t137\tATG\t65;64;63\tA;T;G,A\t65;64;32,31\tref_gene:1:1:I6M:.:Description of I6M snp\t.'
        self.assertEqual([want_line], clstr.report_lines)
        common.rmtree(workdir)