Пример #1
0
    def test_init_ok(self):
        '''Test that ReferenceData initialises correctly from valid input.

        The object is built twice: first from a FASTA and a TSV file only,
        then again with a rename file and a parameters file as well. The
        loaded attributes are checked after each construction.
        '''
        in_fasta = os.path.join(data_dir, 'reference_data_init_ok.in.fa')
        in_tsv = os.path.join(data_dir, 'reference_data_init_ok.in.tsv')
        gene1_meta = sequence_metadata.SequenceMetadata(
            'gene1\t1\t0\tR2S\t.\tconfers killer rabbit resistance')
        gene2_meta = sequence_metadata.SequenceMetadata(
            "gene2\t1\t0\tI42L\t.\tremoves tardigrade's space-living capability"
        )

        # Both genes share the same layout; only the key under 'p' and the
        # metadata object stored there differ.
        want_metadata = {
            name: {
                'seq_type': 'p',
                'variant_only': False,
                'n': {},
                'p': {position: {meta}},
                '.': set(),
            }
            for name, position, meta in (
                ('gene1', 1, gene1_meta),
                ('gene2', 41, gene2_meta),
            )
        }

        # --- construction without the optional files ---
        plain = reference_data.ReferenceData([in_fasta], [in_tsv])
        self.assertEqual(want_metadata, plain.metadata)

        want_seqs = {
            'gene1': pyfastaq.sequences.Fasta('gene1',
                                              'CATCGTCGTCTATCGTCGTCCTAG'),
            'gene2': pyfastaq.sequences.Fasta('gene2', 'AAAAACCCCGGGGTTTT'),
        }
        self.assertEqual(want_seqs, plain.sequences)
        self.assertEqual({}, plain.ariba_to_original_name)
        self.assertEqual({}, plain.extra_parameters)

        # --- construction with rename and parameters files ---
        with_extras = reference_data.ReferenceData(
            [in_fasta],
            [in_tsv],
            rename_file=os.path.join(data_dir,
                                     'reference_data_init_ok.rename.tsv'),
            parameters_file=os.path.join(
                data_dir, 'reference_data_init_ok.params.json'),
        )
        want_renames = {
            'gene1': 'original_gene1',
            'gene2': 'original_gene2',
        }
        self.assertEqual(want_renames, with_extras.ariba_to_original_name)
        want_params = {'foo': 'bar', 'spam': 'eggs'}
        self.assertEqual(want_params, with_extras.extra_parameters)
Пример #2
0
    def test_init_fails(self):
        '''Test __init__ fails when it should.

        Two bad inputs are checked: an empty FASTA paired with an empty TSV,
        and a FASTA paired with an empty TSV. Each construction is expected
        to raise reference_data.Error.
        '''
        empty_fasta = os.path.join(data_dir,
                                   'reference_data_init_fails.empty.fa')
        empty_tsv = os.path.join(data_dir,
                                 'reference_data_init_fails.empty.tsv')
        fasta = os.path.join(data_dir, 'reference_data_init_fails.in.fa')

        # Each call needs its own assertRaises block: the context manager is
        # left as soon as the first exception is raised, so a second call in
        # the same block would never be executed (and so never be tested).
        with self.assertRaises(reference_data.Error):
            reference_data.ReferenceData([empty_fasta], [empty_tsv])

        with self.assertRaises(reference_data.Error):
            reference_data.ReferenceData([fasta], [empty_tsv])
Пример #3
0
    def test_full_run_multiple_vars_in_codon(self):
        '''Test a complete cluster run where one codon has a SNP and an indel.'''
        # The FASTA, the TSV and the input directory all share this prefix
        prefix = os.path.join(data_dir, 'cluster_test_full_run_multiple_vars')
        work_dir = 'tmp.cluster_test_full_run_multiple_vars'
        want_lines = [
            'presence_absence1\tpresence_absence1\t1\t0\t539\t292\tcluster_name\t96\t96\t96.91\tcluster_name.l15.c30.ctg.1\t1074\t20.4\t0\t.\tp\t.\t0\t.\tMULTIPLE\t25\t26\tGA\t487\t489\tCAT\t27;26;25\tC;A;T\t27;26;25\t.\tGeneric description of presence_absence1',
            'presence_absence1\tpresence_absence1\t1\t0\t539\t292\tcluster_name\t96\t96\t96.91\tcluster_name.l15.c30.ctg.1\t1074\t20.4\t0\t.\tp\t.\t0\tA10fs\tFSHIFT\t28\t28\tG\t491\t491\tG\t26\tG\t26\t.\tGeneric description of presence_absence1',
        ]

        ref = reference_data.ReferenceData([prefix + '.fa'], [prefix + '.tsv'])
        # Start from a fresh copy of the input directory
        shutil.rmtree(work_dir, ignore_errors=True)
        shutil.copytree(prefix, work_dir)

        clust = cluster.Cluster(
            work_dir,
            'cluster_name',
            ref,
            total_reads=292,
            total_reads_bases=20900,
        )
        clust.run()

        self.assertEqual(want_lines, clust.report_lines)
        shutil.rmtree(work_dir)
Пример #4
0
    def test_full_run_cluster_test_full_run_smtls_snp_varonly_nonc(self):
        '''Test a complete run where samtools calls a known SNP in a noncoding sequence.

        The SNP is at a known SNP location in a presence/absence noncoding
        sequence, and the sample has the variant.
        '''
        # The FASTA, the TSV and the input directory all share this prefix
        prefix = os.path.join(data_dir,
                              'cluster_test_full_run_smtls_snp_varonly_nonc')
        work_dir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_var_only_noncoding'

        ref = reference_data.ReferenceData([prefix + '.fa'], [prefix + '.tsv'])
        shutil.rmtree(work_dir, ignore_errors=True)
        shutil.copytree(prefix, work_dir)

        read_counts = {'total_reads': 148, 'total_reads_bases': 13320}
        clust = cluster.Cluster(work_dir, 'cluster_name', ref, **read_counts)
        clust.run()

        # The SNP is already known, so it is reported on the known-SNP line
        # and no additional 'HET' line is expected.
        want_lines = [
            'ref_seq\tref_seq\t0\t1\t147\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t1\tSNP\tn\tA18G\t1\t.\t.\t18\t18\tG\t137\t137\tG\t63\tG,A\t32,31\tref_seq:0:1:A18G:.:Description of A18G snp\t.'
        ]
        self.assertEqual(want_lines, clust.report_lines)
        shutil.rmtree(work_dir)
Пример #5
0
    def test_full_run_smtls_snp_varonly_gene(self):
        '''Test a complete run where samtools calls a known SNP in a variant only gene.

        The SNP is at a known SNP location and the gene does have the variant.
        '''
        # The FASTA, the TSV and the input directory all share this prefix
        prefix = os.path.join(data_dir,
                              'cluster_full_run_smtls_snp_varonly_gene')
        work_dir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_var_only_gene_does_have_var'

        ref = reference_data.ReferenceData([prefix + '.fa'], [prefix + '.tsv'])
        shutil.rmtree(work_dir, ignore_errors=True)
        shutil.copytree(prefix, work_dir)

        clust = cluster.Cluster(
            work_dir,
            'cluster_name',
            ref,
            total_reads=148,
            total_reads_bases=13320,
        )
        clust.run()

        # The SNP is already known, so it is reported on the known-SNP line
        # and no additional 'HET' line is expected.
        want_lines = [
            'ref_gene\tref_gene\t1\t1\t155\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t1\tSNP\tp\tI6M\t1\t.\t.\t16\t18\tATG\t135\t137\tATG\t65;64;63\tA;T;G,A\t65;64;32,31\tref_gene:1:1:I6M:.:Description of I6M snp\t.'
        ]
        self.assertEqual(want_lines, clust.report_lines)
        shutil.rmtree(work_dir)
Пример #6
0
 def test_full_run_smtls_snp_varonly_gene_2(self):
     '''Test a complete run where samtools calls a SNP in a variant only gene.

     The "_2" suffix is there because test_full_run_smtls_snp_varonly_gene
     is thought to test the same functionality; both tests are kept anyway.
     '''
     # The FASTA, the TSV and the input directory all share this prefix
     prefix = os.path.join(data_dir,
                           'cluster_full_run_smtls_snp_varonly_gene_2')
     work_dir = 'tmp.cluster_full_run_smtls_snp_varonly_gene_2'
     want_lines = [
         'ref_gene\tref_gene\t1\t1\t155\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t0\tHET\t.\t.\t.\tG18A\t.\t18\t18\tG\t137\t137\tG\t63\tG,A\t32,31\t.\tGeneric description of ref_gene'
     ]

     ref = reference_data.ReferenceData([prefix + '.fa'], [prefix + '.tsv'])
     shutil.rmtree(work_dir, ignore_errors=True)
     shutil.copytree(prefix, work_dir)

     clust = cluster.Cluster(
         work_dir,
         'cluster_name',
         ref,
         total_reads=148,
         total_reads_bases=13320,
     )
     clust.run()

     self.assertEqual(want_lines, clust.report_lines)
     shutil.rmtree(work_dir)
Пример #7
0
 def test_full_run_ok_gene_start_mismatch(self):
     '''Test a complete run where the gene is extended.

     The extension happens because the gene is too different at the end
     for a full nucmer match.
     '''
     # The FASTA, the metadata TSV and the input directory share this prefix
     prefix = os.path.join(data_dir,
                           'cluster_test_full_run_ok_gene_start_mismatch')
     work_dir = 'tmp.cluster_test_full_run_ok_gene_start_mismatch'

     ref = reference_data.ReferenceData([prefix + '.fa'],
                                        [prefix + '.metadata.tsv'])
     shutil.rmtree(work_dir, ignore_errors=True)
     shutil.copytree(prefix, work_dir)

     read_counts = {'total_reads': 112, 'total_reads_bases': 1080}
     clust = cluster.Cluster(work_dir, 'cluster_name', ref, **read_counts)
     clust.run()

     want_lines = [
         'gene\tgene\t1\t0\t27\t112\tcluster_name\t96\t96\t100.0\tcluster_name.l6.c30.ctg.1\t362\t27.8\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tGeneric description of gene'
     ]
     self.assertEqual(want_lines, clust.report_lines)
     shutil.rmtree(work_dir)
Пример #8
0
    def test_full_run_ok_variants_only_variant_is_present(self):
        '''Test a complete cluster run on a variants only gene with the variant present.'''
        # The FASTA, the metadata TSV and the input directory share this prefix
        prefix = os.path.join(data_dir,
                              'cluster_test_full_run_ok_variants_only')
        work_dir = 'tmp.cluster_test_full_run_ok_variants_only.present'
        want_lines = [
            'variants_only1\tvariants_only1\t1\t1\t27\t66\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t215\t15.3\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tCGC\t65\t67\tCGC\t18;18;19\tC;G;C\t18;18;19\tvariants_only1:1:1:R3S:.:Ref and assembly have wild type\tGeneric description of variants_only1',
            'variants_only1\tvariants_only1\t1\t1\t27\t66\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t215\t15.3\t1\tSNP\tp\tI5A\t1\t.\t.\t13\t15\tGCG\t71\t73\tGCG\t17;17;17\tG;C;G\t17;17;17\tvariants_only1:1:1:I5A:.:Ref and reads have variant so report\tGeneric description of variants_only1',
        ]

        ref = reference_data.ReferenceData(
            [prefix + '.fa'], [prefix + '.present.metadata.tsv'])
        shutil.rmtree(work_dir, ignore_errors=True)
        shutil.copytree(prefix, work_dir)

        clust = cluster.Cluster(
            work_dir,
            'cluster_name',
            ref,
            total_reads=66,
            total_reads_bases=3300,
        )
        clust.run()

        self.assertEqual(want_lines, clust.report_lines)
        shutil.rmtree(work_dir)
Пример #9
0
    def test_full_run_ok_presence_absence(self):
        '''Test a complete cluster run on a presence absence gene.'''
        # The FASTA, the metadata TSV and the input directory share this prefix
        prefix = os.path.join(data_dir,
                              'cluster_test_full_run_ok_presence_absence')
        work_dir = 'tmp.cluster_test_full_run_ok_presence_absence'

        ref = reference_data.ReferenceData([prefix + '.fa'],
                                           [prefix + '.metadata.tsv'])
        shutil.rmtree(work_dir, ignore_errors=True)
        shutil.copytree(prefix, work_dir)

        read_counts = {'total_reads': 64, 'total_reads_bases': 3200}
        clust = cluster.Cluster(work_dir, 'cluster_name', ref, **read_counts)
        clust.run()

        want_lines = [
            'presence_absence1\tpresence_absence1\t1\t0\t539\t64\tcluster_name\t96\t96\t97.92\tcluster_name.l15.c30.ctg.1\t213\t15.0\t1\tSNP\tp\tA10V\t1\tA10V\tNONSYN\t28\t30\tGCG\t83\t85\tGTG\t22;22;21\tG;T;G\t22;22;21\tpresence_absence1:1:0:A10V:.:Ref has wild, reads have variant so report\tGeneric description of presence_absence1',
            'presence_absence1\tpresence_absence1\t1\t0\t539\t64\tcluster_name\t96\t96\t97.92\tcluster_name.l15.c30.ctg.1\t213\t15.0\t0\t.\tp\t.\t0\t.\tSYN\t52\t54\tATT\t107\t109\tATC\t31;31;32\tA;T;C\t31;31;32\t.\tGeneric description of presence_absence1',
            'presence_absence1\tpresence_absence1\t1\t0\t539\t64\tcluster_name\t96\t96\t97.92\tcluster_name.l15.c30.ctg.1\t213\t15.0\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tCGC\t62\t64\tCGC\t18;17;17\tC;G;C\t18;17;17\tpresence_absence1:1:0:R3S:.:Ref and assembly have wild type\tGeneric description of presence_absence1',
            'presence_absence1\tpresence_absence1\t1\t0\t539\t64\tcluster_name\t96\t96\t97.92\tcluster_name.l15.c30.ctg.1\t213\t15.0\t1\tSNP\tp\tI5A\t1\t.\t.\t13\t15\tGCG\t68\t70\tGCG\t18;20;20\tG;C;G\t18;20;20\tpresence_absence1:1:0:I5A:.:Ref and reads have variant so report\tGeneric description of presence_absence1',
        ]
        self.assertEqual(want_lines, clust.report_lines)
        shutil.rmtree(work_dir)
Пример #10
0
    def test_full_run_ok_non_coding(self):
        '''Test a complete cluster run on a noncoding sequence.'''
        # The FASTA, the metadata TSV and the input directory share this prefix
        prefix = os.path.join(data_dir, 'cluster_test_full_run_ok_non_coding')
        work_dir = 'tmp.test_full_run_ok_non_coding'

        ref = reference_data.ReferenceData([prefix + '.fa'],
                                           [prefix + '.metadata.tsv'])
        shutil.rmtree(work_dir, ignore_errors=True)
        shutil.copytree(prefix, work_dir)

        clust = cluster.Cluster(
            work_dir,
            'cluster_name',
            ref,
            total_reads=72,
            total_reads_bases=3600,
        )
        clust.run()

        # The report lines are long; lift unittest's diff length limit so a
        # failure shows the whole difference.
        self.maxDiff = None
        want_lines = [
            'noncoding1\tnoncoding1\t0\t0\t531\t72\tcluster_name\t120\t120\t95.87\tcluster_name.l15.c30.ctg.1\t234\t15.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t14\t14\tA\t74\t74\tT\t19\tT\t19\tnoncoding1:0:0:A14T:.:ref has wild type, reads has variant so should report\tgeneric description of noncoding1',
            'noncoding1\tnoncoding1\t0\t0\t531\t72\tcluster_name\t120\t120\t95.87\tcluster_name.l15.c30.ctg.1\t234\t15.4\t0\t.\tn\t.\t0\tG61T\tSNP\t61\t61\tG\t121\t121\tT\t24\tT\t24\t.\tgeneric description of noncoding1',
            'noncoding1\tnoncoding1\t0\t0\t531\t72\tcluster_name\t120\t120\t95.87\tcluster_name.l15.c30.ctg.1\t234\t15.4\t0\t.\tn\t.\t0\t.82C\tINS\t82\t82\tA\t143\t143\tC\t23\tC\t23\t.\tgeneric description of noncoding1',
            'noncoding1\tnoncoding1\t0\t0\t531\t72\tcluster_name\t120\t120\t95.87\tcluster_name.l15.c30.ctg.1\t234\t15.4\t0\t.\tn\t.\t0\tT108.\tDEL\t108\t108\tT\t168\t168\tC\t17\tC\t17\t.\tgeneric description of noncoding1',
            'noncoding1\tnoncoding1\t0\t0\t531\t72\tcluster_name\t120\t120\t95.87\tcluster_name.l15.c30.ctg.1\t234\t15.4\t1\tSNP\tn\tA6G\t1\t.\t.\t6\t6\tG\t66\t66\tG\t19\tG\t19\tnoncoding1:0:0:A6G:.:variant in ref and reads so should report\tgeneric description of noncoding1',
            'noncoding1\tnoncoding1\t0\t0\t531\t72\tcluster_name\t120\t120\t95.87\tcluster_name.l15.c30.ctg.1\t234\t15.4\t1\tSNP\tn\tG9T\t0\t.\t.\t9\t9\tG\t69\t69\tG\t19\tG\t19\tnoncoding1:0:0:G9T:.:wild type in ref and reads\tgeneric description of noncoding1'
        ]
        self.assertEqual(want_lines, clust.report_lines)
        shutil.rmtree(work_dir)
Пример #11
0
 def test_sequence(self):
     '''Test that sequence() returns the expected Fasta object for a name.'''
     # The FASTA and TSV inputs differ only in their extension
     prefix = os.path.join(data_dir, 'reference_data_sequence.in')
     ref = reference_data.ReferenceData([prefix + '.fa'], [prefix + '.tsv'])
     want = pyfastaq.sequences.Fasta('seq1', 'ATGTTTTAA')
     got = ref.sequence('seq1')
     self.assertEqual(want, got)
Пример #12
0
    def test_full_run_ref_not_in_cluster(self):
        '''Test a complete cluster run when the nearest ref is outside the cluster.

        Expects a single placeholder report line and the
        'ref_seq_choose_fail' status flag set, without 'assembly_fail'.
        '''
        # All input files and the input directory share this prefix
        prefix = os.path.join(data_dir,
                              'cluster_test_full_run_ref_not_in_cluster')
        work_dir = 'tmp.test_full_run_ref_not_in_cluster'

        ref = reference_data.ReferenceData([prefix + '.in.fa'],
                                           [prefix + '.in.tsv'])
        shutil.rmtree(work_dir, ignore_errors=True)
        shutil.copytree(prefix, work_dir)

        clust = cluster.Cluster(
            work_dir,
            'cluster_name',
            ref,
            total_reads=72,
            total_reads_bases=3600,
            all_ref_seqs_fasta=prefix + '.all_refs.fa',
        )
        clust.run()

        # Only the read counts and the cluster name are filled in; every
        # other column is a '.' placeholder.
        fields = ['.'] * 4 + ['1024', '72', 'cluster_name'] + ['.'] * 24
        want_line = '\t'.join(fields)
        self.assertEqual([want_line], clust.report_lines)
        self.assertTrue(clust.status_flag.has('ref_seq_choose_fail'))
        self.assertFalse(clust.status_flag.has('assembly_fail'))
        shutil.rmtree(work_dir)
Пример #13
0
    def test_full_run_assembly_fail(self):
        '''Test a complete cluster run when the assembly fails.

        Expects a single placeholder report line and the 'assembly_fail'
        status flag set, without 'ref_seq_choose_fail'.
        '''
        # All input files and the input directory share this prefix
        prefix = os.path.join(data_dir, 'cluster_test_full_run_assembly_fail')
        work_dir = 'tmp.test_full_run_assembly_fail'

        ref = reference_data.ReferenceData([prefix + '.in.fa'],
                                           [prefix + '.in.tsv'])
        shutil.rmtree(work_dir, ignore_errors=True)
        shutil.copytree(prefix, work_dir)

        read_counts = {'total_reads': 4, 'total_reads_bases': 304}
        clust = cluster.Cluster(work_dir, 'cluster_name', ref, **read_counts)
        clust.run()

        # Only the read counts and the cluster name are filled in; every
        # other column is a '.' placeholder.
        fields = ['.'] * 4 + ['64', '4', 'cluster_name'] + ['.'] * 24
        want_line = '\t'.join(fields)
        self.assertEqual([want_line], clust.report_lines)
        self.assertFalse(clust.status_flag.has('ref_seq_choose_fail'))
        self.assertTrue(clust.status_flag.has('assembly_fail'))
        shutil.rmtree(work_dir)
Пример #14
0
    def test_cluster_w_cdhit_nocluster(self):
        '''Test cluster_with_cdhit when called with nocluster=True.

        Checks both the returned clusters and the clusters file written
        under the output prefix, then removes the files that were written.
        '''
        # All input and expected files share this prefix
        prefix = os.path.join(data_dir,
                              'reference_data_cluster_w_cdhit_nocluster')
        ref = reference_data.ReferenceData([prefix + '.in.fa'],
                                           [prefix + '.in.tsv'])
        outprefix = 'tmp.test_cluster_with_cdhit_nocluster'

        # One single-member cluster per sequence, keyed '0' to '5'
        seq_names = (
            'noncoding1',
            'noncoding2',
            'presence_absence1',
            'presence_absence2',
            'presence_absence3',
            'presence_absence4',
        )
        want_clusters = {
            str(index): {name}
            for index, name in enumerate(seq_names)
        }

        self.assertEqual(want_clusters,
                         ref.cluster_with_cdhit(outprefix, nocluster=True))

        clusters_suffix = '.clusters.tsv'
        same_contents = filecmp.cmp(prefix + '.expect.tsv',
                                    outprefix + clusters_suffix,
                                    shallow=False)
        self.assertTrue(same_contents)

        # Remove every output file, clusters file first
        leftover_suffixes = (
            clusters_suffix,
            '.all.fa',
            '.gene.fa',
            '.gene.varonly.fa',
            '.noncoding.fa',
            '.noncoding.varonly.fa',
        )
        for suffix in leftover_suffixes:
            os.unlink(outprefix + suffix)
Пример #15
0
    def test_all_non_wild_type_variants(self):
        '''Test all_non_wild_type_variants for each kind of sequence.

        Covers a variant only gene, a presence/absence gene, a noncoding
        sequence and a sequence name that is not known.
        '''
        # The TSV and the reference FASTA share this prefix
        prefix = os.path.join(
            data_dir, 'reference_data_test_all_non_wild_type_variants')
        ref = reference_data.ReferenceData([prefix + '.ref.fa'],
                                           [prefix + '.tsv'])

        new_meta = sequence_metadata.SequenceMetadata
        var_only = [
            new_meta('var_only_gene\t1\t1\tP3Q\t.\tref has wild type P'),
            new_meta('var_only_gene\t1\t1\tG4I\t.\tref has wild type F'),
            new_meta(
                'var_only_gene\t1\t1\tI5V\t.\tref has variant V instead of I'),
            new_meta('var_only_gene\t1\t1\tF6I\t.\tref has wild type F'),
        ]
        pres_abs_wild = new_meta(
            'presence_absence_gene\t1\t0\tN2I\t.\tref has wild type N')
        pres_abs_variant = new_meta(
            'presence_absence_gene\t1\t0\tA4G\t.\tref has variant G instead of A'
        )
        noncoding_wild = new_meta(
            'non_coding\t0\t0\tA2C\t.\tref has wild type A')
        noncoding_variant = new_meta(
            'non_coding\t0\t0\tC4T\t.\tref has variant T instead of C')

        # (expected result, sequence name) pairs, checked in this order
        cases = (
            (
                {
                    'n': {},
                    # the four var_only_gene variants sit under keys 2 to 5
                    'p': {key: {meta}
                          for key, meta in enumerate(var_only, start=2)},
                },
                'var_only_gene',
            ),
            (
                {
                    'n': {},
                    'p': {1: {pres_abs_wild}, 3: {pres_abs_variant}},
                },
                'presence_absence_gene',
            ),
            (
                {
                    'n': {1: {noncoding_wild}, 3: {noncoding_variant}},
                    'p': {},
                },
                'non_coding',
            ),
            # A name that is not in the reference data gives empty dicts
            ({'n': {}, 'p': {}}, 'not_a_known_sequence'),
        )

        for want, seq_name in cases:
            self.assertEqual(want, ref.all_non_wild_type_variants(seq_name))
Пример #16
0
    def test_get_variants_variants_only(self):
        '''Test get_variants on a sequence named 'variants_only'.

        The metadata TSV is written to a temporary file in the current
        directory, loaded into a ReferenceData, and then removed. The
        variants returned for two contigs are compared with the expected
        tuples.
        '''
        meta1 = sequence_metadata.SequenceMetadata(
            'variants_only\t1\t0\tD2E\tid1\tref has wild type D (GAT=D, GAA=E)'
        )
        meta2 = sequence_metadata.SequenceMetadata(
            'variants_only\t1\t0\tS3R\tid1\tref has variant type R (AGA=R, AGT=S)'
        )
        meta3 = sequence_metadata.SequenceMetadata(
            'variants_only\t1\t0\tD4E\tid1\tref has variant type E (GAA=E, GAC=D)'
        )

        metadata_tsv = 'tmp.test_get_variants_variants_only.metadata.tsv'
        with open(metadata_tsv, 'w') as f:
            print(meta1, file=f)
            print(meta2, file=f)
            print(meta3, file=f)

        fasta_in = os.path.join(
            data_dir, 'assembly_variants_test_get_variants_variants_only.fa')
        # Remove the temporary TSV even if loading it raises, so that a
        # failure here does not leave a stray file behind.
        try:
            refdata = reference_data.ReferenceData([fasta_in], [metadata_tsv])
        finally:
            os.unlink(metadata_tsv)

        nucmer_snp_file = os.path.join(
            data_dir, 'assembly_variants_test_get_variants_variants_only.snps')
        v2 = pymummer.variant.Variant(
            pymummer.snp.Snp(
                '14\tC\tA\t14\tx\tx\t42\t42\tx\tx\tvariants_only\tcontig1'))
        v3 = pymummer.variant.Variant(
            pymummer.snp.Snp(
                '15\tG\tC\t15\tx\tx\t42\t42\tx\tx\tvariants_only\tcontig1'))

        ctg_nucmer_coords = {
            'contig1': [pyfastaq.intervals.Interval(0, 41)],
            'contig2': [pyfastaq.intervals.Interval(10, 41)],
        }

        ref_nucmer_coords = {
            'contig1': [pyfastaq.intervals.Interval(0, 41)],
            'contig2': [pyfastaq.intervals.Interval(10, 41)],
        }
        expected = {
            'contig1': [
                (4, 'p', 'A5D', 'NONSYN', [v2, v3], set(), set()),
                (None, 'p', None, None, None, {meta1}, set()),
                (None, 'p', None, None, None, {meta3}, set()),
            ],
            'contig2': [(None, 'p', None, None, None, {meta3}, set())],
        }

        a_variants = assembly_variants.AssemblyVariants(
            refdata, nucmer_snp_file)
        got = a_variants.get_variants('variants_only', ctg_nucmer_coords,
                                      ref_nucmer_coords)
        self.assertEqual(expected, got)
Пример #17
0
 def test_write_seqs_to_fasta(self):
     '''Test that write_seqs_to_fasta writes the expected FASTA file.

     Three sequence names are requested and the output file is compared
     with the expected file, then deleted.
     '''
     # The input and expected files share this prefix
     prefix = os.path.join(data_dir,
                           'reference_data_test_write_seqs_to_fasta')
     ref = reference_data.ReferenceData([prefix + '.in.fa'],
                                        [prefix + '.in.tsv'])
     out_fasta = 'tmp.test.reference_data.write_seqs_to_fasta.out.fa'
     wanted_names = {'seq1', 'seq4', 'seq5'}

     ref.write_seqs_to_fasta(out_fasta, wanted_names)

     files_match = filecmp.cmp(prefix + '.expected.fa',
                               out_fasta,
                               shallow=False)
     self.assertTrue(files_match)
     os.unlink(out_fasta)
Пример #18
0
    def test_one_var_one_ctg_noncdg(self):
        '''Test _get_one_variant_for_one_contig_non_coding.

        Each nucmer variant is passed in on its own, and both the returned
        tuple and the returned set of used variants are checked.
        '''
        # The FASTA and TSV inputs differ only in their extension
        prefix = os.path.join(data_dir,
                              'assembly_variants_one_var_one_ctg_noncdg')
        ref = reference_data.ReferenceData([prefix + '.fa'], [prefix + '.tsv'])
        known_variants = ref.metadata['non_coding']

        def nucmer_variant(snp_line):
            # Wrap one nucmer SNP line in a pymummer Variant
            return pymummer.variant.Variant(pymummer.snp.Snp(snp_line))

        var_pos2 = nucmer_variant(
            '2\tT\tA\t2\tx\tx\t42\t42\tx\tx\tnon_coding\tcontig')

        # The ref has A at position 3, which is the variant type. This gives
        # the contig the wild type C, so it should not be reported.
        var_pos3 = nucmer_variant(
            '3\tA\tC\t3\tx\tx\t42\t42\tx\tx\tnon_coding\tcontig')

        # The ref has T at position 5, which is the wild type. This gives
        # the contig the variant type A, so it should be reported.
        var_pos5 = nucmer_variant(
            '5\tT\tA\t5\tx\tx\t42\t42\tx\tx\tnon_coding\tcontig')

        meta_c3a = sequence_metadata.SequenceMetadata(
            'non_coding\t0\t0\tC3A\tid1\tref has variant type A')
        meta_t5a = sequence_metadata.SequenceMetadata(
            'non_coding\t0\t0\tT5A\tid1\tref has wild type T')

        # The three lists run in parallel: entry i of each belongs together
        inputs = [var_pos2, var_pos3, var_pos5]
        want_tuples = [
            (1, 'n', 'T2A', 'SNP', [var_pos2], set(), set()),
            None,
            (4, 'n', 'T5A', 'SNP', [var_pos5], {meta_t5a}, set()),
        ]
        want_used = [
            set(),
            {meta_c3a},
            {meta_t5a},
        ]

        assert len(inputs) == len(want_tuples) == len(want_used)

        get_one = assembly_variants.AssemblyVariants._get_one_variant_for_one_contig_non_coding
        for variant, want_tuple, want_used_set in zip(inputs, want_tuples,
                                                      want_used):
            got_tuple, got_used_set = get_one(known_variants, variant)
            self.assertEqual(want_tuple, got_tuple)
            self.assertEqual(want_used_set, got_used_set)
Пример #19
0
 def test_full_run_smtls_snp_varonly_nonc(self):
     '''Test a complete run where samtools calls a SNP in a noncoding sequence.

     The reference is a presence/absence noncoding sequence. The input
     directory is copied to a temporary working directory, the cluster is
     run there, and the report lines are compared with the expected line.
     '''
     fasta_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc.fa')
     tsv_in = os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc.tsv')
     refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
     tmpdir = 'tmp.cluster_full_run_smtls_snp_varonly_nonc'
     # copytree refuses to copy onto an existing directory, so clear any
     # leftover from an earlier run that failed before its final cleanup.
     shutil.rmtree(tmpdir, ignore_errors=True)
     shutil.copytree(os.path.join(data_dir, 'cluster_full_run_smtls_snp_varonly_nonc'), tmpdir)
     c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320)
     c.run()
     expected = [
         'ref_seq\tref_seq\t0\t1\t147\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t0\tHET\t.\t.\t.\tG18A\t.\t18\t18\tG\t137\t137\tG\t63\tG,A\t32,31\t.\tGeneric description of ref_seq'
     ]
     self.assertEqual(expected, c.report_lines)
     shutil.rmtree(tmpdir)
Пример #20
0
    def test_sequence_type(self):
        '''Test that sequence_type returns the expected tuple for each name.'''
        # The FASTA and TSV inputs differ only in their extension
        prefix = os.path.join(data_dir, 'reference_data_sequence_type.in')
        ref = reference_data.ReferenceData([prefix + '.fa'], [prefix + '.tsv'])

        # sequence name -> tuple expected back from sequence_type
        want_by_name = {
            'gene': ('p', False),
            'gene.var_only': ('p', True),
            'noncoding': ('n', False),
            'noncoding.var_only': ('n', True),
        }

        for seq_name, want in want_by_name.items():
            got = ref.sequence_type(seq_name)
            self.assertEqual(want, got)
Пример #21
0
    def test_full_run_insert_codon(self):
        '''Test a complete run where there is an inserted codon.

        The input directory is copied to a temporary working directory, the
        cluster is run there, and the report lines are compared with the
        expected line.
        '''
        fasta_in = os.path.join(data_dir, 'cluster_test_full_run_insert_codon.fa')
        tsv_in = os.path.join(data_dir, 'cluster_test_full_run_insert_codon.tsv')
        refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
        tmpdir = 'tmp.cluster_test_full_insert_codon'
        # copytree refuses to copy onto an existing directory, so clear any
        # leftover from an earlier run that failed before its final cleanup.
        shutil.rmtree(tmpdir, ignore_errors=True)
        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_insert_codon'), tmpdir)
        c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=292, total_reads_bases=20900)
        c.run()

        expected = [
            'presence_absence1\tpresence_absence1\t1\t0\t539\t292\tcluster_name\t108\t108\t92.31\tcluster_name.l15.c30.ctg.1\t1115\t19.9\t0\t.\tp\t.\t0\tS25_M26insELI\tINS\t73\t73\tA\t554\t554\tG\t24\tG\t24\t.\tGeneric description of presence_absence1'
        ]
        self.assertEqual(expected, c.report_lines)
        shutil.rmtree(tmpdir)
Пример #22
0
    def test_full_run_partial_assembly(self):
        '''Test a complete run where only part of the ref gene is in the reads.

        The input directory is copied to a temporary working directory, the
        cluster is run there, and the report lines are compared with the
        expected line.
        '''
        fasta_in = os.path.join(data_dir, 'cluster_test_full_run_partial_asmbly.fa')
        tsv_in = os.path.join(data_dir, 'cluster_test_full_run_partial_asmbly.tsv')
        refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
        tmpdir = 'tmp.cluster_test_full_run_partial_assembly'
        # copytree refuses to copy onto an existing directory, so clear any
        # leftover from an earlier run that failed before its final cleanup.
        shutil.rmtree(tmpdir, ignore_errors=True)
        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_partial_asmbly'), tmpdir)
        c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=278, total_reads_bases=15020)
        c.run()

        expected = [
            'presence_absence1\tpresence_absence1\t1\t0\t19\t278\tcluster_name\t96\t77\t100.0\tcluster_name.l15.c17.ctg.1\t949\t20.5\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tGeneric description of presence_absence1'
        ]
        self.assertEqual(expected, c.report_lines)
        shutil.rmtree(tmpdir)
Пример #23
0
 def test_full_run_smtls_known_snp_presabs_nonc(self):
     '''Complete run: samtools calls a SNP at a known SNP position in a presence/absence noncoding sequence'''
     # The fasta, the tsv and the input directory all share this path prefix.
     input_prefix = os.path.join(data_dir, 'cluster_full_run_smtls_known_snp_presabs_nonc')
     ref = reference_data.ReferenceData([input_prefix + '.fa'], [input_prefix + '.tsv'])

     # Scratch copy of the input directory; common.rmtree is called on the
     # path first (presumably to clear leftovers from an earlier run).
     workdir = 'tmp.cluster_test_full_run_smtls_known_snp_presabs_nonc'
     common.rmtree(workdir)
     shutil.copytree(input_prefix, workdir)

     clust = cluster.Cluster(workdir, 'cluster_name', ref, total_reads=148, total_reads_bases=13320)
     clust.run()

     # The single report line the run must produce.
     wanted_line = 'ref_seq\tref_seq\t0\t0\t147\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t1\tSNP\tn\tG18A\t0\t.\t.\t18\t18\tG\t137\t137\tG\t63\tG,A\t32,31\tref_seq:0:0:G18A:.:Description of G18A\tGeneric description of ref_seq'
     self.assertEqual([wanted_line], clust.report_lines)
     common.rmtree(workdir)
Пример #24
0
 def test_full_run_smtls_snp_presabs_gene(self):
     '''Complete run: samtools calls a SNP in a presence/absence gene'''
     # The fasta, the tsv and the input directory all share this path prefix.
     input_prefix = os.path.join(data_dir, 'cluster_full_run_smtls_snp_presabs_gene')
     ref = reference_data.ReferenceData([input_prefix + '.fa'], [input_prefix + '.tsv'])

     # Scratch copy of the input directory; common.rmtree is called on the
     # path first (presumably to clear leftovers from an earlier run).
     workdir = 'tmp.cluster_test_full_run_ok_samtools_snp_pres_abs_gene'
     common.rmtree(workdir)
     shutil.copytree(input_prefix, workdir)

     clust = cluster.Cluster(workdir, 'cluster_name', ref, total_reads=148, total_reads_bases=13320)
     clust.run()

     # The single report line the run must produce (a HET entry for G18A).
     wanted_line = 'ref_gene\tref_gene\t1\t0\t155\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t0\tHET\t.\t.\t.\tG18A\t.\t18\t18\tG\t137\t137\tG\t63\tG,A\t32,31\t.\tGeneric description of ref_gene'
     self.assertEqual([wanted_line], clust.report_lines)
     common.rmtree(workdir)
Пример #25
0
    def test_full_run_ok_variants_only_variant_not_present_always_report(self):
        '''test complete run of cluster on a variants only gene when variant not present but always report variant'''
        fasta_in = os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only.fa')
        tsv_in = os.path.join(data_dir, 'cluster_full_run_varonly.not_present.always_report.tsv')
        refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
        tmpdir = 'tmp.cluster_full_run_varonly.not_present.always_report'
        # copytree refuses to write into an existing directory, so clear
        # anything left behind by an earlier run that aborted before cleanup.
        shutil.rmtree(tmpdir, ignore_errors=True)
        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only'), tmpdir)

        c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=66, total_reads_bases=3300)
        c.run()
        # Exactly one report line is expected: the R3S variant is still listed,
        # with CGC in both the reference and assembly columns.
        expected = [
            'variants_only1\tvariants_only1\t1\t1\t27\t66\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t215\t15.3\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tCGC\t65\t67\tCGC\t18;18;19\tC;G;C\t18;18;19\tvariants_only1:1:1:R3S:.:Ref and assembly have wild type, but always report anyway\tGeneric description of variants_only1'
        ]
        self.assertEqual(expected, c.report_lines)
        shutil.rmtree(tmpdir)
Пример #26
0
    def test_full_run_choose_ref_fail(self):
        '''test complete run of cluster when choosing ref seq fails'''
        fasta_in = os.path.join(data_dir, 'cluster_test_full_run_choose_ref_fail.in.fa')
        tsv_in = os.path.join(data_dir, 'cluster_test_full_run_choose_ref_fail.in.tsv')
        refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
        tmpdir = 'tmp.test_full_run_choose_ref_fail'
        # copytree refuses to write into an existing directory, so clear
        # anything left behind by an earlier run that aborted before cleanup.
        shutil.rmtree(tmpdir, ignore_errors=True)
        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_choose_ref_fail'), tmpdir)

        c = cluster.Cluster(tmpdir, 'cluster_name', refdata, total_reads=2, total_reads_bases=108, spades_other_options='--only-assembler')
        c.run()

        # The single report line is all '.' placeholders except for the
        # '1024', '2' and 'cluster_name' fields.
        expected = '\t'.join(['.', '.', '.', '.', '1024', '2', 'cluster_name'] + ['.'] * 24)
        self.assertEqual([expected], c.report_lines)
        # Only the ref-choice failure may be flagged, not an assembly failure.
        self.assertTrue(c.status_flag.has('ref_seq_choose_fail'))
        self.assertFalse(c.status_flag.has('assembly_fail'))
        shutil.rmtree(tmpdir)
Пример #27
0
    def test_full_run_delete_codon(self):
        '''Complete run where a codon has been deleted'''
        # The fasta, the tsv and the input directory all share this path prefix.
        input_prefix = os.path.join(data_dir, 'cluster_test_full_run_delete_codon')
        ref = reference_data.ReferenceData([input_prefix + '.fa'], [input_prefix + '.tsv'])

        # Scratch copy of the input directory; common.rmtree is called on the
        # path first (presumably to clear leftovers from an earlier run).
        workdir = 'tmp.cluster_test_full_delete_codon'
        common.rmtree(workdir)
        shutil.copytree(input_prefix, workdir)

        clust = cluster.Cluster(workdir, 'cluster_name', ref, total_reads=292, total_reads_bases=20900)
        clust.run()

        # The single report line the run must produce (R25_A26del / DEL).
        wanted_line = 'presence_absence1\tpresence_absence1\t1\t0\t539\t292\tcluster_name\t117\t117\t92.31\tcluster_name.l15.c30.ctg.1\t1104\t20.0\t0\t.\tp\t.\t0\tR25_A26del\tDEL\t73\t73\tA\t553\t553\tA\t27\tA\t27\t.\tGeneric description of presence_absence1'
        self.assertEqual([wanted_line], clust.report_lines)
        common.rmtree(workdir)
Пример #28
0
    def test_full_run_no_reads_after_filtering(self):
        '''Complete cluster run in which filtering leaves no reads'''
        # The fasta, the tsv and the input directory all share this path prefix.
        input_prefix = os.path.join(data_dir, 'cluster_test_full_run_no_reads_after_filtering')
        ref = reference_data.ReferenceData([input_prefix + '.in.fa'], [input_prefix + '.in.tsv'])

        # Scratch copy of the input directory; common.rmtree is called on the
        # path first (presumably to clear leftovers from an earlier run).
        workdir = 'tmp.test_full_run_no_reads_after_filtering'
        common.rmtree(workdir)
        shutil.copytree(input_prefix, workdir)

        clust = cluster.Cluster(workdir, 'cluster_name', ref, total_reads=0, total_reads_bases=0)
        clust.run()

        # Four placeholders, then '64', '0' and the cluster name, then 24 more placeholders.
        fields = ['.'] * 4 + ['64', '0', 'cluster_name'] + ['.'] * 24
        wanted_line = '\t'.join(fields)
        self.assertEqual([wanted_line], clust.report_lines)

        # The assembly failure must be flagged, the ref-choice failure must not.
        flags = clust.status_flag
        self.assertFalse(flags.has('ref_seq_choose_fail'))
        self.assertTrue(flags.has('assembly_fail'))
        common.rmtree(workdir)
Пример #29
0
    def test_full_run_known_smtls_snp_presabs_gene(self):
        '''test complete run where samtools calls a snp at a known snp location in a presence/absence gene'''
        fasta_in = os.path.join(data_dir, 'cluster_full_run_known_smtls_snp_presabs_gene.fa')
        tsv_in = os.path.join(data_dir, 'cluster_full_run_known_smtls_snp_presabs_gene.tsv')
        refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
        tmpdir = 'tmp.cluster_test_full_run_ok_samtools_snp_known_position_pres_abs_gene'
        # copytree refuses to write into an existing directory, so clear
        # anything left behind by an earlier run that aborted before cleanup.
        shutil.rmtree(tmpdir, ignore_errors=True)
        shutil.copytree(os.path.join(data_dir, 'cluster_full_run_known_smtls_snp_presabs_gene'), tmpdir)
        c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler', total_reads=148, total_reads_bases=13320)
        c.run()

        # We shouldn't get an extra 'HET' line: the snp is already known, so the
        # samtools call is included in the report line of the known snp.
        expected = [
            'ref_gene\tref_gene\t1\t0\t155\t148\tcluster_name\t96\t96\t100.0\tcluster_name.l15.c30.ctg.1\t335\t39.8\t1\tSNP\tp\tM6I\t0\t.\t.\t16\t18\tATG\t135\t137\tATG\t65;64;63\tA;T;G,A\t65;64;32,31\tref_gene:1:0:M6I:.:Description of M6I snp\t.'
        ]
        self.assertEqual(expected, c.report_lines)
        shutil.rmtree(tmpdir)
Пример #30
0
    def test_init_fail_files_missing(self):
        '''Test that constructing a Cluster raises cluster.Error for each incomplete input directory'''
        refdata_fa = os.path.join(data_dir, 'cluster_test_init_refdata.fa')
        metadata_tsv = os.path.join(data_dir, 'cluster_test_init_refdata.tsv')
        refdata = reference_data.ReferenceData([refdata_fa], [metadata_tsv])

        # Going by their names, each directory lacks one required input file
        # (the refs fasta, or one of the two reads files).
        dirs = [
            'cluster_test_init_no_refs_fa',
            'cluster_test_init_no_reads_1',
            'cluster_test_init_no_reads_2',
        ]
        dirs = [os.path.join(data_dir, d) for d in dirs]
        tmpdir = 'tmp.cluster_test_init_fail_files_missing'
        for d in dirs:
            # copytree refuses to write into an existing directory, so clear
            # anything left behind by an earlier run that aborted before cleanup.
            shutil.rmtree(tmpdir, ignore_errors=True)
            shutil.copytree(d, tmpdir)
            try:
                with self.assertRaises(cluster.Error):
                    cluster.Cluster(tmpdir, 'name', refdata=refdata, total_reads=42, total_reads_bases=4242)
            finally:
                # Remove the scratch copy even when the assertion fails.
                shutil.rmtree(tmpdir)