示例#1
0
    def testGetPredsAboveThreshold(self, input_df, expected, threshold):
        """Compare thresholded predictions against the expected dataframe.

        The three arguments are presumably supplied by a parameterized
        decorator that is not visible in this excerpt.

        Args:
          input_df: dataframe passed to `get_preds_at_or_above_threshold`.
          expected: dataframe the call is expected to return.
          threshold: cutoff forwarded unchanged to the function under test.
        """
        # A single fixture inferrer is all the function under test is given.
        inferrers = [_InferrerFixture(activation_rank=2)]

        thresholded = inference.get_preds_at_or_above_threshold(
            input_df, inferrers, threshold)

        # Per the original note, `expected` reflects the first sequence
        # having been removed.
        test_util.assert_dataframes_equal(self, thresholded, expected)
    def test_assert_dataframes_equal_nan_raises(self):
        """A NaN cell fails the comparison when nan_equals_nan=False."""
        nan_frame = pd.DataFrame({'col1': [float('nan')]})

        # The frame is compared with itself, so the only possible source of
        # a mismatch is the NaN cell.
        with self.assertRaisesRegex(AssertionError, 'nan'):
            test_util.assert_dataframes_equal(
                self, nan_frame, nan_frame, nan_equals_nan=False)
    def test_load_blast_output(self):
        """Load BLAST output that has a hit for only one of two test sequences.

        The test ground truth holds ACCESSION and ACCESSION2, while the BLAST
        file only has a row for ACCESSION. The expected frame therefore has
        NaN / empty / 0.0 entries for ACCESSION2.
        """
        # Ground truth for the two test sequences.
        test_fasta_contents = (
            '>accession="ACCESSION"\tlabels="GO:101010,EC:9.9.9.9"\n'
            "ADE\n"
            '>accession="ACCESSION2"\tlabels="EC:1.2.-.-"\n'
            "WWWW\n")
        test_fasta_path = _write_to_file(test_fasta_contents)
        test_ground_truth = baseline_utils.load_ground_truth(test_fasta_path)

        # Ground truth for the single training sequence.
        train_fasta_contents = (
            '>accession="MATCHACCESSION"\tlabels="GO:101010,EC:9.9.9.9,Pfam:PF12345"\n'
            "ADE\n")
        train_fasta_path = _write_to_file(train_fasta_contents)
        train_ground_truth = baseline_utils.load_ground_truth(train_fasta_path)

        # The BLAST output has no row for the second ground-truth sequence.
        blast_contents = (
            'accession="ACCESSION"\taccession="MATCHACCESSION"\t82.456\t57\t10\t0\t1\t57\t1\t57\t6.92e-21\t79.3\n'
        )
        vocab = np.array(
            ["EC:1.2.-.-", "EC:9.9.9.9", "GO:101010", "Pfam:PF12345"])
        blast_path = _write_to_file(blast_contents)

        loaded = baseline_utils.load_blast_output(
            filename=blast_path,
            label_vocab=vocab,
            test_data_ground_truth=test_ground_truth,
            training_data_ground_truth=train_ground_truth)

        expected_columns = {
            "sequence_name": ["ACCESSION", "ACCESSION2"],
            "closest_sequence": ["MATCHACCESSION", float("nan")],
            "true_label": [{"GO:101010", "EC:9.9.9.9"}, {"EC:1.2.-.-"}],
            "predicted_label": [
                {"GO:101010", "EC:9.9.9.9", "Pfam:PF12345"},
                frozenset(),
            ],
            "percent_seq_identity": [82.456, float("nan")],
            "e_value": [6.92e-21, float("nan")],
            "bit_score": [79.3, 0.0],
        }
        expected = pd.DataFrame(expected_columns)

        # The predictions column is excluded from this comparison; it is left
        # to dedicated unit tests (see test_blast_row_to_confidence_array) to
        # keep this test readable.
        loaded_without_predictions = loaded.drop(columns=["predictions"])
        test_util.assert_dataframes_equal(
            self,
            loaded_without_predictions,
            expected,
            nan_equals_nan=True)
示例#4
0
    def test_parse_input(self):
        """Parse a one-record input file into a name/sequence dataframe."""
        input_file = self.create_tempfile(content='>SEQUENCE_NAME\nACDE')
        parsed_text = proteinfer.parse_input_to_text(input_file.full_path)
        parsed_df = proteinfer.input_text_to_df(parsed_text)

        # BioPython yields Bio.Seq.Seq objects, which mostly behave like
        # sequences but can surprise in some situations. The exact-type check
        # (rather than isinstance) makes sure a plain str is stored.
        first_sequence = parsed_df.sequence.values[0]
        self.assertEqual(type(first_sequence), str)

        expected_df = pd.DataFrame({
            'sequence_name': ['SEQUENCE_NAME'],
            'sequence': ['ACDE'],
        })
        test_util.assert_dataframes_equal(self, parsed_df, expected_df)
    def test_load_ground_truth(self):
        """Load a two-record FASTA-style file and check names, labels, sequences."""
        fasta_contents = (
            '>accession="ACCESSION"\tlabels="GO:101010,EC:9.9.9.9"\n'
            "ADE\n"
            '>accession="ACCESSION2"\tlabels="EC:1.2.-.-"\n'
            "WWWW\n")
        fasta_path = _write_to_file(fasta_contents)

        loaded = baseline_utils.load_ground_truth(fasta_path)

        # One row per record: the accession, its label set, and the sequence.
        expected_columns = {
            "sequence_name": ["ACCESSION", "ACCESSION2"],
            "true_label": [{"GO:101010", "EC:9.9.9.9"}, {"EC:1.2.-.-"}],
            "sequence": ["ADE", "WWWW"],
        }
        test_util.assert_dataframes_equal(
            self, loaded, pd.DataFrame(expected_columns))
示例#6
0
    def testLoadShardedCsvUseGivenHeader(self):
        """Load two CSV shards that each carry their own header row.

        With `use_given_header=True` the loaded frame is expected to equal a
        single CSV with one header and the rows of both shards.
        """
        expected = pd.read_csv(
            io.StringIO("col1,col2,col3\n"
                        "A,B,C\n"
                        "D,E,F"))

        # TemporaryDirectory deletes the shard directory when the block
        # exits, pass or fail. A bare mkdtemp() is never cleaned up and
        # leaves a directory behind on every test run.
        with tempfile.TemporaryDirectory() as input_csv_dir:
            _write_to_file("""col1,col2,col3\nA,B,C""", input_csv_dir)
            _write_to_file("""col1,col2,col3\nD,E,F""", input_csv_dir)

            actual = model_performance_analysis.load_sharded_df_csvs(
                input_csv_dir, use_given_header=True)

            # Sort before comparing so the result does not depend on the
            # order in which the shards were read.
            test_util.assert_dataframes_equal(self,
                                              actual,
                                              expected,
                                              sort_by_column="col1")
示例#7
0
    def testLoadShardedCsvTest(self):
        """Load two header-less CSV shards using explicitly supplied columns.

        The shards contain only data rows; the column names are passed via
        `column_names` and are expected to appear as the frame's header.
        """
        input_columns = ["letter_1", "letter_2", "letter_3"]

        expected = pd.read_csv(
            io.StringIO("letter_1,letter_2,letter_3\n"
                        "A,B,C\n"
                        "D,E,F"))

        # TemporaryDirectory deletes the shard directory when the block
        # exits, pass or fail. A bare mkdtemp() is never cleaned up and
        # leaves a directory behind on every test run.
        with tempfile.TemporaryDirectory() as input_csv_dir:
            _write_to_file("""A,B,C""", input_csv_dir)
            _write_to_file("""D,E,F""", input_csv_dir)

            actual = model_performance_analysis.load_sharded_df_csvs(
                input_csv_dir, column_names=input_columns)

            # Sort before comparing so the result does not depend on the
            # order in which the shards were read.
            test_util.assert_dataframes_equal(self,
                                              actual,
                                              expected,
                                              sort_by_column="letter_1")
示例#8
0
    def test_format_output_adds_description_and_formats_float_confidence(self):
        """Formatting adds a description column and rounds the confidence."""
        predictions = pd.DataFrame({
            'sequence_name': ['SEQ_A'],
            'predicted_label': ['Pfam:PF000042'],
            'confidence': [.991]
        })
        descriptions = {'Pfam:PF000042': 'Oxygen carrier'}
        decimals = 2

        formatted = proteinfer.format_df_for_output(
            predictions, descriptions, decimals)

        # .991 at two decimal places becomes .99, and the label's description
        # is appended as the last column.
        expected_columns = {
            'sequence_name': ['SEQ_A'],
            'predicted_label': ['Pfam:PF000042'],
            'confidence': [.99],
            'description': ['Oxygen carrier']
        }
        test_util.assert_dataframes_equal(
            self, formatted, pd.DataFrame(expected_columns))
    def test_limit_set_of_labels(self):
        """Labels outside the acceptable set are dropped; the input is untouched."""
        original_df = pd.DataFrame(
            {"labels": [frozenset(["a"]),
                        frozenset(["a", "b"])]})
        allowed = frozenset(["a"])
        target_column = "labels"

        # Snapshot the input before the call so that any in-place mutation
        # by the function under test can be detected afterwards.
        snapshot = original_df.copy()

        limited = baseline_utils.limit_set_of_labels(
            original_df, allowed, target_column)

        # Only "a" is acceptable, so "b" disappears from the second row.
        expected = pd.DataFrame(
            {"labels": [frozenset(["a"]), frozenset(["a"])]})
        test_util.assert_dataframes_equal(self, limited, expected)

        # The caller's dataframe must still match the snapshot.
        test_util.assert_dataframes_equal(self, original_df, snapshot)
    def test_assert_dataframes_equal_nan_equal_nan(self):
        """A NaN cell compares equal to itself when nan_equals_nan=True."""
        nan_frame = pd.DataFrame({'col1': [float('nan')]})

        # No exception is expected; the test passes if the call returns.
        test_util.assert_dataframes_equal(
            self, nan_frame, nan_frame, nan_equals_nan=True)
 def test_assert_dataframes_equal_error(self, df1, df2, order_by_column=None):
     """Expect `assert_dataframes_equal` to reject the given pair of frames.

     Args:
       df1: first dataframe of the pair.
       df2: second dataframe of the pair.
       order_by_column: forwarded positionally as the helper's fourth
         argument; defaults to None.
     """
     # Callable form of assertRaises: the helper is invoked with the
     # trailing arguments and must raise AssertionError.
     self.assertRaises(AssertionError, test_util.assert_dataframes_equal,
                       self, df1, df2, order_by_column)
 def test_assert_dataframes_equal_no_error(self, df1, df2, order_by_column=None):
     """Expect `assert_dataframes_equal` to accept the given pair of frames.

     Args:
       df1: first dataframe of the pair.
       df2: second dataframe of the pair.
       order_by_column: forwarded positionally as the helper's fourth
         argument; defaults to None.
     """
     # The test passes as long as the helper returns without raising.
     comparison_args = (df1, df2, order_by_column)
     test_util.assert_dataframes_equal(self, *comparison_args)
示例#13
0
 def test_order_df_for_output(self, input_df, expected):
     """Check `order_df_for_output` against the expected dataframe.

     Args:
       input_df: dataframe passed to `proteinfer.order_df_for_output`.
       expected: dataframe the call is expected to return.
     """
     reordered = proteinfer.order_df_for_output(input_df)
     test_util.assert_dataframes_equal(self, reordered, expected)