def testGetPredsAboveThreshold(self, input_df, expected, threshold):
    """Checks `get_preds_at_or_above_threshold` against an expected dataframe.

    Args:
      input_df: dataframe handed to the function under test.
      expected: dataframe the call is expected to produce.
      threshold: cutoff forwarded unchanged to the function under test.
    """
    # NOTE(review): the argument values are presumably supplied by a
    # parameterized decorator that is not visible here -- confirm.
    single_inferrer_ensemble = [_InferrerFixture(activation_rank=2)]

    thresholded = inference.get_preds_at_or_above_threshold(
        input_df, single_inferrer_ensemble, threshold)

    test_util.assert_dataframes_equal(self, thresholded, expected)
def test_assert_dataframes_equal_nan_raises(self):
    """A frame holding NaN must not compare equal when nan_equals_nan=False."""
    nan_frame = pd.DataFrame({'col1': [float('nan')]})

    # The same frame is passed on both sides, so only the NaN handling can
    # make the comparison fail; the failure message must mention 'nan'.
    with self.assertRaisesRegex(AssertionError, 'nan'):
        test_util.assert_dataframes_equal(
            self, nan_frame, nan_frame, nan_equals_nan=False)
def test_load_blast_output(self):
    """Loads a one-hit BLAST file and checks every column but `predictions`."""
    # Test-set ground truth: two sequences.
    test_fasta_contents = (
        '>accession="ACCESSION"\tlabels="GO:101010,EC:9.9.9.9"\n'
        "ADE\n"
        '>accession="ACCESSION2"\tlabels="EC:1.2.-.-"\n'
        "WWWW\n")
    test_fasta_path = _write_to_file(test_fasta_contents)
    test_ground_truth = baseline_utils.load_ground_truth(test_fasta_path)

    # Training-set ground truth: the single sequence the BLAST hit points at.
    train_fasta_contents = (
        '>accession="MATCHACCESSION"\tlabels="GO:101010,EC:9.9.9.9,Pfam:PF12345"\n'
        "ADE\n")
    train_fasta_path = _write_to_file(train_fasta_contents)
    train_ground_truth = baseline_utils.load_ground_truth(train_fasta_path)

    # The BLAST output holds a row for ACCESSION only; ACCESSION2 has no hit.
    blast_contents = (
        'accession="ACCESSION"\taccession="MATCHACCESSION"\t82.456\t57\t10\t0\t1\t57\t1\t57\t6.92e-21\t79.3\n'
    )
    label_vocab = np.array(
        ["EC:1.2.-.-", "EC:9.9.9.9", "GO:101010", "Pfam:PF12345"])
    blast_path = _write_to_file(blast_contents)

    loaded = baseline_utils.load_blast_output(
        filename=blast_path,
        label_vocab=label_vocab,
        test_data_ground_truth=test_ground_truth,
        training_data_ground_truth=train_ground_truth)

    nan = float("nan")
    expected = pd.DataFrame({
        "sequence_name": ["ACCESSION", "ACCESSION2"],
        "closest_sequence": ["MATCHACCESSION", nan],
        "true_label": [{"GO:101010", "EC:9.9.9.9"}, {"EC:1.2.-.-"}],
        "predicted_label": [
            {"GO:101010", "EC:9.9.9.9", "Pfam:PF12345"},
            frozenset(),
        ],
        "percent_seq_identity": [82.456, nan],
        "e_value": [6.92e-21, nan],
        "bit_score": [79.3, 0.0],
    })

    # The predictions column is deliberately excluded from this comparison to
    # keep the test readable; it is meant to be covered by its own unit test
    # (test_blast_row_to_confidence_array).
    loaded_without_predictions = loaded.drop(columns=["predictions"])
    test_util.assert_dataframes_equal(
        self, loaded_without_predictions, expected, nan_equals_nan=True)
def test_parse_input(self):
    """Parses a one-record FASTA file and checks the resulting dataframe."""
    fasta_file = self.create_tempfile(content='>SEQUENCE_NAME\nACDE')

    parsed_text = proteinfer.parse_input_to_text(fasta_file.full_path)
    actual_df = proteinfer.input_text_to_df(parsed_text)

    # BioPython hands back Bio.Seq.Seq objects, which mostly behave like
    # strings but can surprise in some situations. Require an actual str
    # here rather than something merely string-like.
    first_sequence = actual_df.sequence.values[0]
    self.assertEqual(type(first_sequence), str)

    expected = pd.DataFrame({
        'sequence_name': ['SEQUENCE_NAME'],
        'sequence': ['ACDE'],
    })
    test_util.assert_dataframes_equal(self, actual_df, expected)
def test_load_ground_truth(self):
    """Loads a two-record FASTA file and checks names, labels and sequences."""
    fasta_contents = (
        '>accession="ACCESSION"\tlabels="GO:101010,EC:9.9.9.9"\n'
        "ADE\n"
        '>accession="ACCESSION2"\tlabels="EC:1.2.-.-"\n'
        "WWWW\n")
    fasta_path = _write_to_file(fasta_contents)

    loaded = baseline_utils.load_ground_truth(fasta_path)

    expected = pd.DataFrame({
        "sequence_name": ["ACCESSION", "ACCESSION2"],
        "true_label": [{"GO:101010", "EC:9.9.9.9"}, {"EC:1.2.-.-"}],
        "sequence": ["ADE", "WWWW"],
    })
    test_util.assert_dataframes_equal(self, loaded, expected)
def testLoadShardedCsvUseGivenHeader(self):
    """Loads two CSV shards that each carry their own header row.

    Both shards start with the same `col1,col2,col3` header; with
    `use_given_header=True` the loaded frame is expected to contain one
    header and the two data rows.
    """
    expected = pd.read_csv(
        io.StringIO("col1,col2,col3\n"
                    "A,B,C\n"
                    "D,E,F"))

    # TemporaryDirectory removes the shard directory (and the files written
    # into it) when the block exits, even if an assertion fails; a bare
    # mkdtemp() would leave it behind after every test run.
    with tempfile.TemporaryDirectory() as input_csv_dir:
        _write_to_file("""col1,col2,col3\nA,B,C""", input_csv_dir)
        _write_to_file("""col1,col2,col3\nD,E,F""", input_csv_dir)

        actual = model_performance_analysis.load_sharded_df_csvs(
            input_csv_dir, use_given_header=True)

        # Sort before comparing: the visible code does not fix the order in
        # which the shards are read.
        test_util.assert_dataframes_equal(
            self, actual, expected, sort_by_column="col1")
def testLoadShardedCsvTest(self):
    """Loads two headerless CSV shards using caller-supplied column names.

    Each shard holds a single data row and no header; the names passed via
    `column_names` are expected to become the columns of the loaded frame.
    """
    input_columns = ["letter_1", "letter_2", "letter_3"]
    expected = pd.read_csv(
        io.StringIO("letter_1,letter_2,letter_3\n"
                    "A,B,C\n"
                    "D,E,F"))

    # TemporaryDirectory removes the shard directory (and the files written
    # into it) when the block exits, even if an assertion fails; a bare
    # mkdtemp() would leave it behind after every test run.
    with tempfile.TemporaryDirectory() as input_csv_dir:
        _write_to_file("""A,B,C""", input_csv_dir)
        _write_to_file("""D,E,F""", input_csv_dir)

        actual = model_performance_analysis.load_sharded_df_csvs(
            input_csv_dir, column_names=input_columns)

        # Sort before comparing: the visible code does not fix the order in
        # which the shards are read.
        test_util.assert_dataframes_equal(
            self, actual, expected, sort_by_column="letter_1")
def test_format_output_adds_description_and_formats_float_confidence(self):
    """Checks the description column and the rounding of the confidence."""
    predictions = pd.DataFrame({
        'sequence_name': ['SEQ_A'],
        'predicted_label': ['Pfam:PF000042'],
        'confidence': [.991],
    })
    descriptions_by_label = {'Pfam:PF000042': 'Oxygen carrier'}
    decimal_places = 2

    formatted = proteinfer.format_df_for_output(
        predictions, descriptions_by_label, decimal_places)

    # Two decimal places turn .991 into .99, and the label's description is
    # expected to appear as an additional column.
    expected = pd.DataFrame({
        'sequence_name': ['SEQ_A'],
        'predicted_label': ['Pfam:PF000042'],
        'confidence': [.99],
        'description': ['Oxygen carrier'],
    })
    test_util.assert_dataframes_equal(self, formatted, expected)
def test_limit_set_of_labels(self):
    """Restricts label sets to an allow-list without touching the input."""
    column_to_limit = "labels"
    acceptable_labels = frozenset(["a"])
    input_df = pd.DataFrame(
        {"labels": [frozenset(["a"]), frozenset(["a", "b"])]})

    # Snapshot taken before the call so that mutation of the input can be
    # detected afterwards.
    snapshot_before_call = input_df.copy()

    limited = baseline_utils.limit_set_of_labels(
        input_df, acceptable_labels, column_to_limit)

    # "b" is not acceptable, so it is expected to be gone from the second row.
    expected = pd.DataFrame(
        {"labels": [frozenset(["a"]), frozenset(["a"])]})
    test_util.assert_dataframes_equal(self, limited, expected)

    # The caller's dataframe must be left exactly as it was.
    test_util.assert_dataframes_equal(self, input_df, snapshot_before_call)
def test_assert_dataframes_equal_nan_equal_nan(self):
    """A frame holding NaN compares equal to itself when nan_equals_nan=True."""
    nan_frame = pd.DataFrame({'col1': [float('nan')]})

    # Passing means the call below returns without raising.
    test_util.assert_dataframes_equal(
        self, nan_frame, nan_frame, nan_equals_nan=True)
def test_assert_dataframes_equal_error(self, df1, df2, order_by_column=None):
    """Expects `assert_dataframes_equal` to reject the given pair of frames.

    Args:
      df1: first dataframe of the comparison.
      df2: second dataframe of the comparison.
      order_by_column: forwarded positionally as the helper's fourth argument.
    """
    # NOTE(review): the argument values are presumably supplied by a
    # parameterized decorator that is not visible here -- confirm.
    self.assertRaises(
        AssertionError,
        test_util.assert_dataframes_equal,
        self, df1, df2, order_by_column)
def test_assert_dataframes_equal_no_error(self, df1, df2, order_by_column=None):
    """Expects `assert_dataframes_equal` to accept the given pair of frames.

    Args:
      df1: first dataframe of the comparison.
      df2: second dataframe of the comparison.
      order_by_column: forwarded positionally as the helper's fourth argument.
    """
    # NOTE(review): the argument values are presumably supplied by a
    # parameterized decorator that is not visible here -- confirm.
    comparison_args = (self, df1, df2, order_by_column)

    # Passing means the helper returns without raising.
    test_util.assert_dataframes_equal(*comparison_args)
def test_order_df_for_output(self, input_df, expected):
    """Checks `order_df_for_output` against an expected dataframe.

    Args:
      input_df: dataframe handed to `proteinfer.order_df_for_output`.
      expected: dataframe the call is expected to produce.
    """
    # NOTE(review): the argument values are presumably supplied by a
    # parameterized decorator that is not visible here -- confirm.
    ordered = proteinfer.order_df_for_output(input_df)

    test_util.assert_dataframes_equal(self, ordered, expected)