예제 #1
0
    def test_error_non_numeric_values_in_score_column(self):
        """Loading scores with a non-numeric score cell raises ``ValueError``."""
        header = f"{self.HGVS_NT_COL},{self.SCORE_COL}"
        variant = generate_hgvs(prefix="c")
        handle = StringIO(f"{header}\n{variant},I am not a number")

        # The failure is expected while the file is loaded, before validate().
        with self.assertRaises(ValueError):
            MaveDataset.for_scores(handle)
예제 #2
0
    def test_defines_same_variants(self):
        """Check ``match_other`` between a scores and a counts dataset.

        Every case is a ``(scores csv, counts csv, expected)`` triple; both
        files are loaded and validated, then the result of
        ``scores_dataset.match_other(counts_dataset)`` is compared with
        ``expected``.
        """
        nt, pro, score = self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL
        nt_scores = f"{nt},{score}\nc.1A>G,0.0"
        nt_pro_scores = f"{nt},{pro},{score}\nc.1A>G,p.Ile1Val,0.0"

        cases = (
            # Same / different nucleotide variant.
            (nt_scores, f"{nt},count\nc.1A>G,0.0", True),
            (nt_scores, f"{nt},count\nc.2A>G,0.0", False),
            # Same / different protein variant alongside the same nt variant.
            (
                nt_pro_scores,
                f"{nt},{pro},count\nc.1A>G,p.Ile1Val,0.0",
                True,
            ),
            (
                nt_pro_scores,
                f"{nt},{pro},count\nc.1A>G,p.Ile1Phe,0.0",
                False,
            ),
            # A wrong header on either side is expected to give None.
            (
                f"wrong_columns,{score}\nc.1A>G,0.0",
                f"{nt},count\nc.1A>G,0.0",
                None,
            ),
            (nt_scores, "wrong_column,count\nc.1A>G,0.0", None),
        )

        for case in cases:
            scores, counts, expected = case
            with self.subTest(msg=case):
                scores_dataset = MaveDataset.for_scores(StringIO(scores))
                scores_dataset.validate()

                counts_dataset = MaveDataset.for_counts(StringIO(counts))
                counts_dataset.validate()

                outcome = scores_dataset.match_other(counts_dataset)
                self.assertEqual(outcome, expected)
예제 #3
0
    def test_invalid_hgvs_in_column(self):
        """A variant with the wrong prefix for its column gives one error.

        Cases pair a column with a variant generated with a mismatching
        prefix: ``c`` in the protein column, ``g`` in the splice column and
        ``p`` in the nucleotide column. The splice case also supplies a
        genomic variant in the nucleotide column.
        """
        tests = [
            (self.HGVS_PRO_COL, generate_hgvs(prefix="c")),
            (self.HGVS_SPLICE_COL, generate_hgvs(prefix="g")),
            (self.HGVS_NT_COL, generate_hgvs(prefix="p")),
        ]
        for (column, variant) in tests:
            with self.subTest(msg=f"{column}: {variant}"):
                if column == self.HGVS_SPLICE_COL:
                    data = "{},{},{}\n{},{},1.0".format(
                        self.HGVS_NT_COL,
                        column,
                        self.SCORE_COL,
                        generate_hgvs(prefix="g"),
                        variant,
                    )
                else:
                    data = "{},{}\n{},1.0".format(column, self.SCORE_COL,
                                                  variant)

                dataset = MaveDataset.for_scores(StringIO(data))
                dataset.validate()

                self.assertFalse(dataset.is_valid)
                # Attach the errors to the assertion so they are reported when
                # the count is wrong; a print placed after the assertions only
                # ever ran on success.
                self.assertEqual(len(dataset.errors), 1, msg=dataset.errors)
예제 #4
0
    def test_to_dict(self):
        """``to_dict`` keys rows by nt variant and reports empty cells as None.

        Two rows are loaded with empty protein/splice cells; the first also
        has an empty score while the second has a score of ``1.0``.
        """
        first = generate_hgvs(prefix="c")
        second = generate_hgvs(prefix="c")
        header = (
            f"{self.HGVS_NT_COL},{self.HGVS_PRO_COL},"
            f"{self.HGVS_SPLICE_COL},{self.SCORE_COL}"
        )
        csv_text = f"{header}\n{first},,,\n{second},,,1.0"

        dataset = MaveDataset.for_scores(StringIO(csv_text))
        dataset.validate()
        self.assertTrue(dataset.is_valid)

        # Cells left blank in the input for both rows.
        blank_hgvs = {self.HGVS_SPLICE_COL: None, self.HGVS_PRO_COL: None}
        expected = {
            first: {
                self.HGVS_NT_COL: first,
                **blank_hgvs,
                self.SCORE_COL: None,
            },
            second: {
                self.HGVS_NT_COL: second,
                **blank_hgvs,
                self.SCORE_COL: 1.0,
            },
        }
        self.assertDictEqual(dataset.to_dict(), expected)
예제 #5
0
    def test_sorts_header(self):
        """``columns`` lists nt, splice, pro, score, then the extra column.

        The input header is deliberately given in a different order
        (pro, nt, colA, score, splice).
        """
        nt_variant = generate_hgvs(prefix="g")
        pro_variant = generate_hgvs(prefix="p")
        splice_variant = generate_hgvs(prefix="c")

        header = (
            f"{self.HGVS_PRO_COL},{self.HGVS_NT_COL},colA,"
            f"{self.SCORE_COL},{self.HGVS_SPLICE_COL}"
        )
        row = f"{pro_variant},{nt_variant},hello,1.0,{splice_variant}"

        dataset = MaveDataset.for_scores(StringIO(f"{header}\n{row}"))
        dataset.validate()
        self.assertTrue(dataset.is_valid)

        expected_order = [
            self.HGVS_NT_COL,
            self.HGVS_SPLICE_COL,
            self.HGVS_PRO_COL,
            self.SCORE_COL,
            "colA",
        ]
        self.assertListEqual(dataset.columns, expected_order)
예제 #6
0
    def test_does_not_split_double_quoted_variants(self):
        """A double-quoted multi-variant containing ``;`` stays one value."""
        variant = "c.[123A>G;124A>G]"
        csv_text = f'{self.HGVS_NT_COL},{self.SCORE_COL}\n"{variant}",1.0'

        dataset = MaveDataset.for_scores(StringIO(csv_text))
        dataset.validate()
        self.assertTrue(dataset.is_valid)

        # NOTE(review): if data() returns a pandas DataFrame, `in` on a column
        # tests the index labels rather than the cell values - confirm this is
        # the intended check.
        nt_column = dataset.data()[self.HGVS_NT_COL]
        self.assertIn(variant, nt_column)
예제 #7
0
    def test_valid_targetseq_validation_fails(self):
        """``c.1A>G`` / ``p.Ile1Val`` validated against ``ATC`` is valid.

        Despite the ``_fails`` suffix in the name, the test asserts that the
        dataset is valid after validation.
        """
        header = f"{self.HGVS_NT_COL},{self.HGVS_PRO_COL},{self.SCORE_COL}"
        csv_text = header + "\nc.1A>G,p.Ile1Val,0.5"

        dataset = MaveDataset.for_scores(StringIO(csv_text))
        dataset.validate(targetseq="ATC")

        self.assertTrue(dataset.is_valid)
예제 #8
0
    def test_invalid_row_hgvs_is_not_a_string(self):
        """A numeric cell (``1.0``) in the ``HGVS_NT_COL`` column gives one error."""
        data = "{},{}\n1.0,1.0".format(self.HGVS_NT_COL, self.SCORE_COL)

        dataset = MaveDataset.for_scores(StringIO(data))
        dataset.validate()

        self.assertFalse(dataset.is_valid)
        # Attach the errors to the assertion so they are reported when the
        # count is wrong; a print placed after the assertions only ever ran on
        # success.
        self.assertEqual(len(dataset.errors), 1, msg=dataset.errors)
예제 #9
0
    def test_empty_no_variants_parsed(self):
        """A header-only file is empty, invalid and carries exactly one error."""
        data = "{},{}\n".format(self.HGVS_NT_COL, self.SCORE_COL)

        dataset = MaveDataset.for_scores(StringIO(data))
        dataset.validate()

        self.assertTrue(dataset.is_empty)
        self.assertFalse(dataset.is_valid)
        # Attach the errors to the assertion so they are reported when the
        # count is wrong; a print placed after the assertions only ever ran on
        # success.
        self.assertEqual(len(dataset.errors), 1, msg=dataset.errors)
예제 #10
0
    def test_invalid_missing_either_required_hgvs_column(self):
        """Scores with only the splice and score columns give one error.

        Neither ``HGVS_NT_COL`` nor ``HGVS_PRO_COL`` is present in the header.
        """
        data = "{},{}\n{},{}".format(self.HGVS_SPLICE_COL, self.SCORE_COL,
                                     generate_hgvs(prefix="c"), 1.0)

        dataset = MaveDataset.for_scores(StringIO(data))
        dataset.validate()

        self.assertFalse(dataset.is_valid)
        # Attach the errors to the assertion so they are reported when the
        # count is wrong; a print placed after the assertions only ever ran on
        # success.
        self.assertEqual(len(dataset.errors), 1, msg=dataset.errors)
예제 #11
0
    def test_scores_missing_scores_column(self):
        """A scores file with ``scores_rna`` but no ``SCORE_COL`` gives one error."""
        data = "{},{}\n{},{}".format(self.HGVS_NT_COL, "scores_rna",
                                     generate_hgvs(prefix="g"), 1.0)

        dataset = MaveDataset.for_scores(StringIO(data))
        dataset.validate()

        self.assertFalse(dataset.is_valid)
        # Attach the errors to the assertion so they are reported when the
        # count is wrong; a print placed after the assertions only ever ran on
        # success.
        self.assertEqual(len(dataset.errors), 1, msg=dataset.errors)
예제 #12
0
    def test_parses_numeric_column_values_into_float(self):
        """The score cell ``1.0`` comes back from ``data()`` as a ``float``."""
        variant = generate_hgvs(prefix="c")
        csv_text = f"{self.HGVS_NT_COL},{self.SCORE_COL}\n{variant},1.0"

        dataset = MaveDataset.for_scores(StringIO(csv_text))
        dataset.validate()
        self.assertTrue(dataset.is_valid)

        score_column = dataset.data()[self.SCORE_COL]
        self.assertIsInstance(score_column.values[0], float)
예제 #13
0
    def test_primary_column_is_pro_when_nt_is_not_defined(self):
        """Without an nt column, ``index_column`` is ``HGVS_PRO_COL``."""
        protein_variant = generate_hgvs(prefix="p")
        header = f"{self.HGVS_PRO_COL},{self.SCORE_COL}"
        csv_text = f"{header}\n{protein_variant},1.0"

        dataset = MaveDataset.for_scores(StringIO(csv_text))
        dataset.validate()

        self.assertTrue(dataset.is_valid)
        self.assertEqual(dataset.index_column, self.HGVS_PRO_COL)
예제 #14
0
    def test_invalid_zero_is_not_parsed_as_none(self):
        """A score of ``0.0`` is kept as zero and the dataset is valid.

        Despite the ``invalid`` in the name, the test asserts a valid dataset.
        """
        variant = generate_hgvs(prefix="c")
        csv_text = f"{self.HGVS_NT_COL},{self.SCORE_COL}\n{variant},0.0"

        dataset = MaveDataset.for_scores(StringIO(csv_text))
        dataset.validate()
        self.assertTrue(dataset.is_valid)

        frame = dataset.data()
        first_score = frame[self.SCORE_COL].values[0]
        self.assertEqual(first_score, 0)
예제 #15
0
    def test_invalid_missing_hgvs_columns(self):
        """A header of ``not_hgvs`` plus the score column gives one error."""
        data = "{},{}\n{},1.0".format("not_hgvs", self.SCORE_COL,
                                      generate_hgvs())

        dataset = MaveDataset.for_scores(StringIO(data))
        dataset.validate()

        self.assertFalse(dataset.is_valid)
        # Attach the errors to the assertion so they are reported when the
        # count is wrong; a print placed after the assertions only ever ran on
        # success.
        self.assertEqual(len(dataset.errors), 1, msg=dataset.errors)
예제 #16
0
    def test_invalid_splice_not_defined_when_nt_is_genomic(self):
        """A genomic (``g``) nt variant without a splice column gives two errors."""
        data = "{},{}\n{},1.0".format(self.HGVS_NT_COL, self.SCORE_COL,
                                      generate_hgvs(prefix="g"))

        dataset = MaveDataset.for_scores(StringIO(data))
        dataset.validate()

        self.assertFalse(dataset.is_valid)
        # Attach the errors to the assertion so they are reported when the
        # count is wrong; a print placed after the assertions only ever ran on
        # success.
        self.assertEqual(len(dataset.errors), 2, msg=dataset.errors)
예제 #17
0
    def test_invalid_same_variant_defined_in_two_rows_in_hgvs_pro(self):
        """Counts repeating one protein variant in two rows give one error."""
        hgvs = generate_hgvs(prefix="p")
        data = "{},{}\n{},1.0\n{},1.0".format(self.HGVS_PRO_COL, "count", hgvs,
                                              hgvs)

        dataset = MaveDataset.for_counts(StringIO(data))
        dataset.validate()

        self.assertFalse(dataset.is_valid)
        # Attach the errors to the assertion so they are reported when the
        # count is wrong; a print placed after the assertions only ever ran on
        # success.
        self.assertEqual(len(dataset.errors), 1, msg=dataset.errors)
예제 #18
0
    def test_invalid_same_hgvs_nt_defined_in_two_rows(self):
        """Scores repeating one nt variant in two rows give one error."""
        hgvs = generate_hgvs(prefix="c")
        data = "{},{}\n{},1.0\n{},1.0".format(self.HGVS_NT_COL, self.SCORE_COL,
                                              hgvs, hgvs)

        dataset = MaveDataset.for_scores(StringIO(data))
        dataset.validate()

        self.assertFalse(dataset.is_valid)
        # Attach the errors to the assertion so they are reported when the
        # count is wrong; a print placed after the assertions only ever ran on
        # success.
        self.assertEqual(len(dataset.errors), 1, msg=dataset.errors)
예제 #19
0
    def test_error_missing_value_in_pro_column_when_pro_is_primary(self):
        """A null value in the protein column of a pro-only file gives one error.

        Runs once per entry of ``null_values_list``; the null value is placed
        in the second data row, after a row holding a generated ``p`` variant.
        """
        for v in null_values_list:
            with self.subTest(msg=v):
                data = "{},{}\n{},1.0\n{},1.0".format(
                    self.HGVS_PRO_COL, self.SCORE_COL,
                    generate_hgvs(prefix="p"), v)

                dataset = MaveDataset.for_scores(StringIO(data))
                dataset.validate()

                self.assertFalse(dataset.is_valid)
                # Attach the errors to the assertion so they are reported when
                # the count is wrong; a print placed after the assertions only
                # ever ran on success.
                self.assertEqual(len(dataset.errors), 1, msg=dataset.errors)
예제 #20
0
    def test_invalid_target_sequence_not_a_multiple_of_3(self):
        """Validating against the 4-base target ``ATCG`` gives one error.

        The single error message must contain ``"multiple of 3"``.
        """
        header = f"{self.HGVS_NT_COL},{self.HGVS_PRO_COL},{self.SCORE_COL}"
        csv_text = header + "\nc.1A>G,p.Ile1Val,0.5"

        dataset = MaveDataset.for_scores(StringIO(csv_text))
        dataset.validate(targetseq="ATCG")
        self.assertFalse(dataset.is_valid)

        # Emit the collected errors before inspecting their count and text.
        print(dataset.errors)
        self.assertEqual(dataset.n_errors, 1)
        self.assertIn("multiple of 3", dataset.errors[0])
예제 #21
0
    def test_invalid_targetseq_validation_fails(self):
        """``p.Val1Phe`` validated against target ``ATC`` gives one error.

        The single error message must mention ``p.Val1Phe``.
        """
        header = f"{self.HGVS_NT_COL},{self.HGVS_PRO_COL},{self.SCORE_COL}"
        csv_text = header + "\nc.1A>G,p.Val1Phe,0.5"

        dataset = MaveDataset.for_scores(StringIO(csv_text))
        dataset.validate(targetseq="ATC")
        self.assertFalse(dataset.is_valid)

        # Emit the collected errors before inspecting their count and text.
        print(dataset.errors)
        self.assertEqual(dataset.n_errors, 1)
        self.assertIn("p.Val1Phe", dataset.errors[0])
예제 #22
0
    def test_invalid_null_values_in_header(self):
        """A null value used as a column name gives one error.

        Runs once per entry of ``null_values_list``, using the entry as the
        third header field.
        """
        for value in null_values_list:
            with self.subTest(msg=f"'{value}'"):
                data = "{},{},{}\n{},1.0,1.0".format(self.HGVS_NT_COL,
                                                     self.SCORE_COL, value,
                                                     generate_hgvs())

                dataset = MaveDataset.for_scores(StringIO(data))
                dataset.validate()

                self.assertFalse(dataset.is_valid)
                # Attach the errors to the assertion so they are reported when
                # the count is wrong; a print placed after the assertions only
                # ever ran on success.
                self.assertEqual(len(dataset.errors), 1, msg=dataset.errors)
예제 #23
0
    def test_replaces_null_with_none_in_numeric_columns(self):
        """Null score cells become ``None`` in the serializable data.

        Runs once per entry of ``null_values_list``, using it as the score.
        """
        variant = generate_hgvs(prefix="c")
        for null in null_values_list:
            with self.subTest(msg=f"'{null}'"):
                header = f"{self.HGVS_NT_COL},{self.SCORE_COL}"
                csv_text = f"{header}\n{variant},{null}"

                dataset = MaveDataset.for_scores(StringIO(csv_text))
                dataset.validate()
                self.assertTrue(dataset.is_valid)

                scores = dataset.data(serializable=True)[self.SCORE_COL]
                self.assertListEqual(list(scores), [None])
예제 #24
0
    def test_df_indexed_by_primary_column(self):
        """The index of ``data()`` equals ``dataset.index``."""
        header = f"{self.HGVS_NT_COL},{self.HGVS_PRO_COL},{self.SCORE_COL}"
        nt_variant = generate_hgvs(prefix="c")
        pro_variant = generate_hgvs(prefix="p")
        csv_text = f"{header}\n{nt_variant},{pro_variant},1.0"

        dataset = MaveDataset.for_scores(StringIO(csv_text))
        dataset.validate()
        self.assertTrue(dataset.is_valid)

        frame_index = dataset.data().index
        assert_index_equal(frame_index, dataset.index)
예제 #25
0
    def test_invalid_genomic_and_transcript_mixed_in_nt_column(self):
        """Mixing ``g`` and ``c`` variants in the nt column gives two errors."""
        data = "{},{}\n{},1.0\n{},2.0".format(
            self.HGVS_NT_COL,
            self.SCORE_COL,
            generate_hgvs(prefix="g"),
            generate_hgvs(prefix="c"),
        )

        dataset = MaveDataset.for_scores(StringIO(data))
        dataset.validate()

        self.assertFalse(dataset.is_valid)
        # Attach the errors to the assertion so they are reported when the
        # count is wrong; a print placed after the assertions only ever ran on
        # success.
        self.assertEqual(len(dataset.errors), 2, msg=dataset.errors)
예제 #26
0
    def test_data_method_converts_null_values_to_None(self):
        """``data(serializable=True)`` reports a null score cell as ``None``.

        Runs once per entry of ``null_values_list``; the variant cell of the
        same row must remain non-``None``.
        """
        variant = generate_hgvs()
        for null in null_values_list:
            with self.subTest(msg=null):
                header = f"{self.HGVS_NT_COL},{self.SCORE_COL}"
                csv_text = f"{header}\n{variant},{null}"

                dataset = MaveDataset.for_scores(StringIO(csv_text))
                dataset.validate()
                self.assertTrue(dataset.is_valid)

                frame = dataset.data(serializable=True)
                first_variant = frame[self.HGVS_NT_COL].values[0]
                self.assertIsNotNone(first_variant)
                first_score = frame[self.SCORE_COL].values[0]
                self.assertIsNone(first_score)
예제 #27
0
    def test_invalid_no_additional_columns_outside_hgvs_ones(self):
        """A counts file holding only the three hgvs columns gives one error."""
        data = "{},{},{}\n{},{},{}".format(
            self.HGVS_NT_COL,
            self.HGVS_SPLICE_COL,
            self.HGVS_PRO_COL,
            generate_hgvs(prefix="g"),
            generate_hgvs(prefix="c"),
            generate_hgvs(prefix="p"),
        )

        dataset = MaveDataset.for_counts(StringIO(data))
        dataset.validate()

        self.assertFalse(dataset.is_valid)
        # Attach the errors to the assertion so they are reported when the
        # count is wrong; a print placed after the assertions only ever ran on
        # success.
        self.assertEqual(len(dataset.errors), 1, msg=dataset.errors)
예제 #28
0
    def test_invalid_duplicates_in_index(self):
        """Repeating one nt variant across two rows gives one error.

        Each row carries its own generated protein variant; only the nt
        variant is shared between the rows.
        """
        hgvs = generate_hgvs(prefix="c")
        data = "{},{},{}\n{},{},1.0\n{},{},2.0".format(
            self.HGVS_NT_COL,
            self.HGVS_PRO_COL,
            self.SCORE_COL,
            hgvs,
            generate_hgvs(prefix="p"),
            hgvs,
            generate_hgvs(prefix="p"),
        )

        dataset = MaveDataset.for_scores(StringIO(data))
        dataset.validate()

        self.assertFalse(dataset.is_valid)
        # Attach the errors to the assertion so they are reported when the
        # count is wrong; a print placed after the assertions only ever ran on
        # success.
        self.assertEqual(len(dataset.errors), 1, msg=dataset.errors)
예제 #29
0
    def test_does_not_allow_wt_and_sy(self):
        """``_wt`` / ``_sy`` placeholders in the hgvs columns give three errors.

        The row uses ``_wt`` for the nt and splice columns and ``_sy`` for the
        protein column.
        """
        wt = "_wt"
        sy = "_sy"
        data = "{},{},{},{}\n{},{},{},1.0".format(
            self.HGVS_NT_COL,
            self.HGVS_SPLICE_COL,
            self.HGVS_PRO_COL,
            self.SCORE_COL,
            wt,
            wt,
            sy,
        )

        dataset = MaveDataset.for_scores(StringIO(data))
        dataset.validate()

        self.assertFalse(dataset.is_valid)
        # Attach the errors to the assertion so they are reported when the
        # count is wrong; a print placed after the assertions only ever ran on
        # success.
        self.assertEqual(len(dataset.errors), 3, msg=dataset.errors)