def test_error_non_numeric_values_in_score_column(self):
    """``MaveDataset.for_scores`` raises ``ValueError`` on a non-numeric score."""
    header = (self.HGVS_NT_COL, self.SCORE_COL)
    row = (generate_hgvs(prefix="c"), "I am not a number")
    csv_text = "{},{}\n{},{}".format(*header, *row)
    with self.assertRaises(ValueError):
        MaveDataset.for_scores(StringIO(csv_text))
def test_defines_same_variants(self):
    """Check ``match_other`` between a scores and a counts dataset.

    Each case is ``(scores csv, counts csv, expected result)``; the expected
    result is ``True``, ``False`` or ``None`` as listed in the table below.
    """
    nt, pro, score = self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL

    cases = [
        # nt only
        (
            "{},{}\nc.1A>G,0.0".format(nt, score),
            "{},count\nc.1A>G,0.0".format(nt),
            True,
        ),
        (
            "{},{}\nc.1A>G,0.0".format(nt, score),
            "{},count\nc.2A>G,0.0".format(nt),
            False,
        ),
        # nt + pro
        (
            "{},{},{}\nc.1A>G,p.Ile1Val,0.0".format(nt, pro, score),
            "{},{},count\nc.1A>G,p.Ile1Val,0.0".format(nt, pro),
            True,
        ),
        (
            "{},{},{}\nc.1A>G,p.Ile1Val,0.0".format(nt, pro, score),
            "{},{},count\nc.1A>G,p.Ile1Phe,0.0".format(nt, pro),
            False,
        ),
        # Check returns None if either dataset invalid
        (
            "wrong_columns,{}\nc.1A>G,0.0".format(score),
            "{},count\nc.1A>G,0.0".format(nt),
            None,
        ),
        (
            "{},{}\nc.1A>G,0.0".format(nt, score),
            "wrong_column,count\nc.1A>G,0.0",
            None,
        ),
    ]

    for scores_csv, counts_csv, expected in cases:
        with self.subTest(msg=(scores_csv, counts_csv, expected)):
            scores_dataset = MaveDataset.for_scores(StringIO(scores_csv))
            scores_dataset.validate()

            counts_dataset = MaveDataset.for_counts(StringIO(counts_csv))
            counts_dataset.validate()

            outcome = scores_dataset.match_other(counts_dataset)
            self.assertEqual(outcome, expected)
def test_invalid_hgvs_in_column(self):
    """Variants with prefix c/g/p placed in the pro/splice/nt column are rejected.

    Each case must leave the dataset invalid with exactly one error.
    """
    tests = [
        (self.HGVS_PRO_COL, generate_hgvs(prefix="c")),
        (self.HGVS_SPLICE_COL, generate_hgvs(prefix="g")),
        (self.HGVS_NT_COL, generate_hgvs(prefix="p")),
    ]

    for column, variant in tests:
        with self.subTest(msg=f"{column}: {variant}"):
            if column == self.HGVS_SPLICE_COL:
                # The splice case also supplies an nt column holding a
                # "g"-prefixed variant next to the splice value under test.
                data = "{},{},{}\n{},{},1.0".format(
                    self.HGVS_NT_COL,
                    column,
                    self.SCORE_COL,
                    generate_hgvs(prefix="g"),
                    variant,
                )
            else:
                data = "{},{}\n{},1.0".format(column, self.SCORE_COL, variant)

            dataset = MaveDataset.for_scores(StringIO(data))
            dataset.validate()

            # Print before asserting so the collected errors are visible
            # when one of the assertions below fails.
            print(dataset.errors)
            self.assertFalse(dataset.is_valid)
            self.assertEqual(len(dataset.errors), 1)
def test_to_dict(self):
    """``to_dict`` maps each nt variant to its row, with blank cells as ``None``."""
    first_variant = generate_hgvs(prefix="c")
    second_variant = generate_hgvs(prefix="c")

    csv_text = "{},{},{},{}\n{},,,\n{},,,1.0".format(
        self.HGVS_NT_COL,
        self.HGVS_PRO_COL,
        self.HGVS_SPLICE_COL,
        self.SCORE_COL,
        first_variant,
        second_variant,
    )

    dataset = MaveDataset.for_scores(StringIO(csv_text))
    dataset.validate()
    self.assertTrue(dataset.is_valid)

    actual = dataset.to_dict()
    expected = {
        first_variant: {
            self.HGVS_NT_COL: first_variant,
            self.HGVS_SPLICE_COL: None,
            self.HGVS_PRO_COL: None,
            self.SCORE_COL: None,
        },
        second_variant: {
            self.HGVS_NT_COL: second_variant,
            self.HGVS_SPLICE_COL: None,
            self.HGVS_PRO_COL: None,
            self.SCORE_COL: 1.0,
        },
    }
    self.assertDictEqual(actual, expected)
def test_sorts_header(self):
    """Columns given as pro, nt, colA, score, splice come back as nt, splice, pro, score, colA."""
    nt_variant = generate_hgvs(prefix="g")
    pro_variant = generate_hgvs(prefix="p")
    splice_variant = generate_hgvs(prefix="c")

    header = (
        self.HGVS_PRO_COL,
        self.HGVS_NT_COL,
        "colA",
        self.SCORE_COL,
        self.HGVS_SPLICE_COL,
    )
    row = (pro_variant, nt_variant, "hello", 1.0, splice_variant)
    csv_text = "{},{},{},{},{}\n{},{},{},{},{}".format(*header, *row)

    dataset = MaveDataset.for_scores(StringIO(csv_text))
    dataset.validate()
    self.assertTrue(dataset.is_valid)

    expected_order = [
        self.HGVS_NT_COL,
        self.HGVS_SPLICE_COL,
        self.HGVS_PRO_COL,
        self.SCORE_COL,
        "colA",
    ]
    self.assertListEqual(dataset.columns, expected_order)
def test_does_not_split_double_quoted_variants(self):
    """A double-quoted multi-variant cell is kept as a single nt value."""
    variant = "c.[123A>G;124A>G]"
    csv_text = '{},{}\n"{}",1.0'.format(
        self.HGVS_NT_COL, self.SCORE_COL, variant
    )

    dataset = MaveDataset.for_scores(StringIO(csv_text))
    dataset.validate()
    self.assertTrue(dataset.is_valid)

    nt_values = dataset.data()[self.HGVS_NT_COL]
    self.assertIn(variant, nt_values)
def test_valid_targetseq_validation_fails(self):
    """Validating ``c.1A>G`` / ``p.Ile1Val`` against target ``ATC`` leaves the dataset valid."""
    header = (self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL)
    csv_text = "{},{},{}\nc.1A>G,p.Ile1Val,0.5".format(*header)

    dataset = MaveDataset.for_scores(StringIO(csv_text))
    dataset.validate(targetseq="ATC")

    self.assertTrue(dataset.is_valid)
def test_invalid_row_hgvs_is_not_a_string(self):
    """A numeric-looking value (``1.0``) in the nt column yields one validation error."""
    data = "{},{}\n1.0,1.0".format(self.HGVS_NT_COL, self.SCORE_COL)

    dataset = MaveDataset.for_scores(StringIO(data))
    dataset.validate()

    # Print before asserting so the collected errors are visible when one
    # of the assertions below fails.
    print(dataset.errors)
    self.assertFalse(dataset.is_valid)
    self.assertEqual(len(dataset.errors), 1)
def test_empty_no_variants_parsed(self):
    """A header-only file is reported as empty and invalid with one error."""
    data = "{},{}\n".format(self.HGVS_NT_COL, self.SCORE_COL)

    dataset = MaveDataset.for_scores(StringIO(data))
    dataset.validate()

    # Print before asserting so the collected errors are visible when one
    # of the assertions below fails.
    print(dataset.errors)
    self.assertTrue(dataset.is_empty)
    self.assertFalse(dataset.is_valid)
    self.assertEqual(len(dataset.errors), 1)
def test_invalid_missing_either_required_hgvs_column(self):
    """A file with only splice and score columns fails validation with one error."""
    data = "{},{}\n{},{}".format(
        self.HGVS_SPLICE_COL,
        self.SCORE_COL,
        generate_hgvs(prefix="c"),
        1.0,
    )

    dataset = MaveDataset.for_scores(StringIO(data))
    dataset.validate()

    # Print before asserting so the collected errors are visible when one
    # of the assertions below fails.
    print(dataset.errors)
    self.assertFalse(dataset.is_valid)
    self.assertEqual(len(dataset.errors), 1)
def test_scores_missing_scores_column(self):
    """A scores file whose only data column is ``scores_rna`` yields one error."""
    data = "{},{}\n{},{}".format(
        self.HGVS_NT_COL,
        "scores_rna",
        generate_hgvs(prefix="g"),
        1.0,
    )

    dataset = MaveDataset.for_scores(StringIO(data))
    dataset.validate()

    # Print before asserting so the collected errors are visible when one
    # of the assertions below fails.
    print(dataset.errors)
    self.assertFalse(dataset.is_valid)
    self.assertEqual(len(dataset.errors), 1)
def test_parses_numeric_column_values_into_float(self):
    """The score cell ``1.0`` is exposed by ``data()`` as a ``float`` instance."""
    variant = generate_hgvs(prefix="c")
    csv_text = "{},{}\n{},1.0".format(
        self.HGVS_NT_COL, self.SCORE_COL, variant
    )

    dataset = MaveDataset.for_scores(StringIO(csv_text))
    dataset.validate()
    self.assertTrue(dataset.is_valid)

    score_column = dataset.data()[self.SCORE_COL]
    first_score = score_column.values[0]
    self.assertIsInstance(first_score, float)
def test_primary_column_is_pro_when_nt_is_not_defined(self):
    """With no nt column present, ``index_column`` is the pro column."""
    pro_variant = generate_hgvs(prefix="p")
    csv_text = "{},{}\n{},1.0".format(
        self.HGVS_PRO_COL, self.SCORE_COL, pro_variant
    )

    dataset = MaveDataset.for_scores(StringIO(csv_text))
    dataset.validate()

    self.assertTrue(dataset.is_valid)
    self.assertEqual(dataset.index_column, self.HGVS_PRO_COL)
def test_invalid_zero_is_not_parsed_as_none(self):
    """A score of ``0.0`` is kept as zero and the dataset stays valid."""
    variant = generate_hgvs(prefix="c")
    csv_text = "{},{}\n{},0.0".format(
        self.HGVS_NT_COL, self.SCORE_COL, variant
    )

    dataset = MaveDataset.for_scores(StringIO(csv_text))
    dataset.validate()
    self.assertTrue(dataset.is_valid)

    frame = dataset.data()
    first_score = frame[self.SCORE_COL].values[0]
    self.assertEqual(first_score, 0)
def test_invalid_missing_hgvs_columns(self):
    """A file whose variant column is named ``not_hgvs`` yields one error."""
    data = "{},{}\n{},1.0".format("not_hgvs", self.SCORE_COL, generate_hgvs())

    dataset = MaveDataset.for_scores(StringIO(data))
    dataset.validate()

    # Print before asserting so the collected errors are visible when one
    # of the assertions below fails.
    print(dataset.errors)
    self.assertFalse(dataset.is_valid)
    self.assertEqual(len(dataset.errors), 1)
def test_invalid_splice_not_defined_when_nt_is_genomic(self):
    """A "g"-prefixed nt variant without a splice column yields two errors."""
    data = "{},{}\n{},1.0".format(
        self.HGVS_NT_COL,
        self.SCORE_COL,
        generate_hgvs(prefix="g"),
    )

    dataset = MaveDataset.for_scores(StringIO(data))
    dataset.validate()

    # Print before asserting so the collected errors are visible when one
    # of the assertions below fails.
    print(dataset.errors)
    self.assertFalse(dataset.is_valid)
    self.assertEqual(len(dataset.errors), 2)
def test_invalid_same_variant_defined_in_two_rows_in_hgvs_pro(self):
    """The same pro variant on two rows of a counts file yields one error."""
    hgvs = generate_hgvs(prefix="p")
    data = "{},{}\n{},1.0\n{},1.0".format(
        self.HGVS_PRO_COL, "count", hgvs, hgvs
    )

    dataset = MaveDataset.for_counts(StringIO(data))
    dataset.validate()

    # Print before asserting so the collected errors are visible when one
    # of the assertions below fails.
    print(dataset.errors)
    self.assertFalse(dataset.is_valid)
    self.assertEqual(len(dataset.errors), 1)
def test_invalid_same_hgvs_nt_defined_in_two_rows(self):
    """The same nt variant on two rows of a scores file yields one error."""
    hgvs = generate_hgvs(prefix="c")
    data = "{},{}\n{},1.0\n{},1.0".format(
        self.HGVS_NT_COL, self.SCORE_COL, hgvs, hgvs
    )

    dataset = MaveDataset.for_scores(StringIO(data))
    dataset.validate()

    # Print before asserting so the collected errors are visible when one
    # of the assertions below fails.
    print(dataset.errors)
    self.assertFalse(dataset.is_valid)
    self.assertEqual(len(dataset.errors), 1)
def test_error_missing_value_in_pro_column_when_pro_is_primary(self):
    """Each entry of ``null_values_list`` used as a pro variant yields one error.

    The file has only pro and score columns; the first row holds a generated
    pro variant and the second row holds the null value under test.
    """
    for v in null_values_list:
        with self.subTest(msg=v):
            data = "{},{}\n{},1.0\n{},1.0".format(
                self.HGVS_PRO_COL,
                self.SCORE_COL,
                generate_hgvs(prefix="p"),
                v,
            )

            dataset = MaveDataset.for_scores(StringIO(data))
            dataset.validate()

            # Print before asserting so the collected errors are visible
            # when one of the assertions below fails.
            print(dataset.errors)
            self.assertFalse(dataset.is_valid)
            self.assertEqual(len(dataset.errors), 1)
def test_invalid_target_sequence_not_a_multiple_of_3(self):
    """Target ``ATCG`` yields a single error mentioning "multiple of 3"."""
    header = (self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL)
    csv_text = "{},{},{}\nc.1A>G,p.Ile1Val,0.5".format(*header)

    dataset = MaveDataset.for_scores(StringIO(csv_text))
    dataset.validate(targetseq="ATCG")

    self.assertFalse(dataset.is_valid)
    print(dataset.errors)
    self.assertEqual(dataset.n_errors, 1)
    first_error = dataset.errors[0]
    self.assertIn("multiple of 3", first_error)
def test_invalid_targetseq_validation_fails(self):
    """``p.Val1Phe`` against target ``ATC`` yields one error naming that variant."""
    header = (self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL)
    csv_text = "{},{},{}\nc.1A>G,p.Val1Phe,0.5".format(*header)

    dataset = MaveDataset.for_scores(StringIO(csv_text))
    dataset.validate(targetseq="ATC")

    self.assertFalse(dataset.is_valid)
    print(dataset.errors)
    self.assertEqual(dataset.n_errors, 1)
    first_error = dataset.errors[0]
    self.assertIn("p.Val1Phe", first_error)
def test_invalid_null_values_in_header(self):
    """Each entry of ``null_values_list`` used as a column name yields one error."""
    for value in null_values_list:
        with self.subTest(msg=f"'{value}'"):
            data = "{},{},{}\n{},1.0,1.0".format(
                self.HGVS_NT_COL,
                self.SCORE_COL,
                value,
                generate_hgvs(),
            )

            dataset = MaveDataset.for_scores(StringIO(data))
            dataset.validate()

            # Print before asserting so the collected errors are visible
            # when one of the assertions below fails.
            print(dataset.errors)
            self.assertFalse(dataset.is_valid)
            self.assertEqual(len(dataset.errors), 1)
def test_replaces_null_with_none_in_numeric_columns(self):
    """Each null score value is returned as ``None`` by ``data(serializable=True)``."""
    nt_variant = generate_hgvs(prefix="c")

    for null_value in null_values_list:
        with self.subTest(msg=f"'{null_value}'"):
            csv_text = "{},{}\n{},{}".format(
                self.HGVS_NT_COL, self.SCORE_COL, nt_variant, null_value
            )

            dataset = MaveDataset.for_scores(StringIO(csv_text))
            dataset.validate()
            self.assertTrue(dataset.is_valid)

            frame = dataset.data(serializable=True)
            scores = list(frame[self.SCORE_COL])
            self.assertListEqual(scores, [None])
def test_df_indexed_by_primary_column(self):
    """The frame returned by ``data()`` has the same index as ``dataset.index``."""
    header = (self.HGVS_NT_COL, self.HGVS_PRO_COL, self.SCORE_COL)
    row = (generate_hgvs(prefix="c"), generate_hgvs(prefix="p"))
    csv_text = "{},{},{}\n{},{},1.0".format(*header, *row)

    dataset = MaveDataset.for_scores(StringIO(csv_text))
    dataset.validate()
    self.assertTrue(dataset.is_valid)

    frame_index = dataset.data().index
    assert_index_equal(frame_index, dataset.index)
def test_invalid_genomic_and_transcript_mixed_in_nt_column(self):
    """A "g" and a "c" variant in the same nt column yield two errors."""
    data = "{},{}\n{},1.0\n{},2.0".format(
        self.HGVS_NT_COL,
        self.SCORE_COL,
        generate_hgvs(prefix="g"),
        generate_hgvs(prefix="c"),
    )

    dataset = MaveDataset.for_scores(StringIO(data))
    dataset.validate()

    # Print before asserting so the collected errors are visible when one
    # of the assertions below fails.
    print(dataset.errors)
    self.assertFalse(dataset.is_valid)
    self.assertEqual(len(dataset.errors), 2)
def test_data_method_converts_null_values_to_None(self):
    """``data(serializable=True)`` keeps the nt value and turns a null score into ``None``."""
    variant = generate_hgvs()

    for null_value in null_values_list:
        with self.subTest(msg=null_value):
            csv_text = "{},{}\n{},{}".format(
                self.HGVS_NT_COL, self.SCORE_COL, variant, null_value
            )

            dataset = MaveDataset.for_scores(StringIO(csv_text))
            dataset.validate()
            self.assertTrue(dataset.is_valid)

            frame = dataset.data(serializable=True)
            nt_cell = frame[self.HGVS_NT_COL].values[0]
            self.assertIsNotNone(nt_cell)
            score_cell = frame[self.SCORE_COL].values[0]
            self.assertIsNone(score_cell)
def test_invalid_no_additional_columns_outside_hgvs_ones(self):
    """A counts file with only nt, splice and pro columns yields one error."""
    data = "{},{},{}\n{},{},{}".format(
        self.HGVS_NT_COL,
        self.HGVS_SPLICE_COL,
        self.HGVS_PRO_COL,
        generate_hgvs(prefix="g"),
        generate_hgvs(prefix="c"),
        generate_hgvs(prefix="p"),
    )

    dataset = MaveDataset.for_counts(StringIO(data))
    dataset.validate()

    # Print before asserting so the collected errors are visible when one
    # of the assertions below fails.
    print(dataset.errors)
    self.assertFalse(dataset.is_valid)
    self.assertEqual(len(dataset.errors), 1)
def test_invalid_duplicates_in_index(self):
    """Two rows sharing one nt variant (with generated pro variants) yield one error."""
    hgvs = generate_hgvs(prefix="c")
    data = "{},{},{}\n{},{},1.0\n{},{},2.0".format(
        self.HGVS_NT_COL,
        self.HGVS_PRO_COL,
        self.SCORE_COL,
        hgvs,
        generate_hgvs(prefix="p"),
        hgvs,
        generate_hgvs(prefix="p"),
    )

    dataset = MaveDataset.for_scores(StringIO(data))
    dataset.validate()

    # Print before asserting so the collected errors are visible when one
    # of the assertions below fails.
    print(dataset.errors)
    self.assertFalse(dataset.is_valid)
    self.assertEqual(len(dataset.errors), 1)
def test_does_not_allow_wt_and_sy(self):
    """``_wt`` in nt and splice plus ``_sy`` in pro yield three errors."""
    wt = "_wt"
    sy = "_sy"
    data = "{},{},{},{}\n{},{},{},1.0".format(
        self.HGVS_NT_COL,
        self.HGVS_SPLICE_COL,
        self.HGVS_PRO_COL,
        self.SCORE_COL,
        wt,
        wt,
        sy,
    )

    dataset = MaveDataset.for_scores(StringIO(data))
    dataset.validate()

    # Print before asserting so the collected errors are visible when one
    # of the assertions below fails.
    print(dataset.errors)
    self.assertFalse(dataset.is_valid)
    self.assertEqual(len(dataset.errors), 3)