def test_match_table_and_data_ranked_features_not_in_table():
    """Check that match_table_and_data() errors if any ranked feature is
    missing from the BIOM table.

    Qurro is pretty accepting of mismatched data, but if any of your ranked
    features aren't in the BIOM table Qurro will immediately throw an error
    (...because that is not a good situation).
    """
    # pd.concat() replaces DataFrame.append(), which was removed in pandas 2.0
    import pandas as pd

    table, metadata, ranks = get_test_data()
    new_feature_row = DataFrame([[9, 0]], columns=ranks.columns, index=["F9"])
    # verify_integrity=True makes concat raise if "F9" were already an index
    # entry, matching the old append(..., verify_integrity=True) behavior
    ranks_modified = pd.concat(
        [ranks, new_feature_row], verify_integrity=True
    )
    with pytest.raises(ValueError) as exception_info:
        match_table_and_data(table, ranks_modified, metadata)
    expected_message = (
        "Of the 9 ranked features, 1 was not present in the input BIOM table"
    )
    assert expected_message in str(exception_info.value)

    # Try this again; verify it works with more than 1 ranked feature not in
    # the table
    # (also, the error message should use "were" instead of "was" now :)
    new_feature_row = DataFrame(
        [[10, -1]], columns=ranks.columns, index=["F10"]
    )
    ranks_modified = pd.concat(
        [ranks_modified, new_feature_row], verify_integrity=True
    )
    with pytest.raises(ValueError) as exception_info:
        match_table_and_data(table, ranks_modified, metadata)
    expected_message = (
        "Of the 10 ranked features, 2 were not present in the input BIOM table"
    )
    assert expected_message in str(exception_info.value)
def test_match_table_and_data_complete_sample_mismatch():
    """Verify that an error is raised when the table and sample metadata
    share zero samples."""
    table, metadata, ranks = get_test_data()
    # Rename every metadata sample (Sample1, ... -> S1, ...) so that none of
    # the metadata's sample IDs overlap with the table's
    metadata.index = ["S{}".format(i) for i in range(1, 5)]
    with pytest.raises(ValueError) as err_ctx:
        match_table_and_data(table, ranks, metadata)
    assert (
        "No samples are shared between the sample metadata file and BIOM table"
        in str(err_ctx.value)
    )
def test_match_table_and_data_table_extra_feature(capsys):
    """Check that a table feature absent from the ranks is dropped, and that
    a message about the dropped feature is printed."""
    # pd.concat() replaces DataFrame.append(), which was removed in pandas 2.0
    import pandas as pd

    table, metadata, ranks = get_test_data()
    new_row = DataFrame(
        [[20, 20, 20, 20]],
        columns=table.columns,
        index=["FeatureInTableButNotRanks"],
    )
    table = pd.concat([table, new_row], verify_integrity=True)

    m_table, m_metadata = match_table_and_data(table, ranks, metadata)

    # Only features in the ranks' index should be left in the table's index,
    # and the table and ranks' indices should line up.
    assert len(set(m_table.index) & set(ranks.index)) == len(ranks.index)
    # Sanity check -- another way of verifying the above
    assert "FeatureInTableButNotRanks" not in m_table.index

    # Check that the matched-up fields' data wasn't altered somehow
    assert_frame_equal(table.loc[ranks.index], m_table)
    assert_frame_equal(metadata, m_metadata)

    # Check that a feature-dropping message was printed
    captured = capsys.readouterr()
    expected_msg = (
        "1 feature(s) in the BIOM table were not present in the feature "
        "rankings"
    )
    assert expected_msg in captured.out
def test_match_table_and_data_metadata_extra_sample(capsys):
    """Check that a metadata sample absent from the table is dropped, and
    that a message about the dropped sample is printed."""
    # pd.concat() replaces DataFrame.append(), which was removed in pandas 2.0
    import pandas as pd

    table, metadata, ranks = get_test_data()
    # Add a new row to the metadata
    new_row = DataFrame(
        [[20, 20, 20, 20]],
        columns=metadata.columns,
        index=["SampleInMDButNotTable"],
    )
    metadata = pd.concat([metadata, new_row], verify_integrity=True)

    m_table, m_metadata = match_table_and_data(table, ranks, metadata)

    assert len(set(m_table.columns) & set(m_metadata.index)) == len(
        table.columns
    )
    assert "SampleInMDButNotTable" not in m_table.columns
    assert "SampleInMDButNotTable" not in m_metadata.index

    # Check that the matched-up fields' data wasn't altered somehow
    assert_frame_equal(table, m_table)
    assert_frame_equal(metadata.loc[table.columns], m_metadata)

    # Check that a message re: the sample being dropped was printed
    captured = capsys.readouterr()
    expected_msg = (
        "1 sample(s) in the sample metadata file were not present in the BIOM "
        "table"
    )
    assert expected_msg in captured.out
def test_match_table_and_data_no_change(capsys):
    """When the table, ranks, and metadata already match up, nothing should
    be changed (and nothing should be printed)."""
    table, metadata, ranks = get_test_data()

    matched_table, matched_metadata = match_table_and_data(
        table, ranks, metadata
    )

    # The matched outputs should be equal to the original inputs
    assert_frame_equal(table, matched_table)
    assert_frame_equal(metadata, matched_metadata)

    # No "sample/feature dropped" messages (or anything else) should have
    # been printed
    assert capsys.readouterr().out == ""
def test_match_table_and_data_table_extra_sample(capsys):
    """A table sample absent from the metadata should be dropped, and a
    message about the dropped sample should be printed."""
    table, metadata, ranks = get_test_data()
    table["SampleInTableButNotMD"] = 10

    matched_table, matched_metadata = match_table_and_data(
        table, ranks, metadata
    )

    # Only the samples present in the metadata should remain
    shared_samples = set(matched_table.columns) & set(matched_metadata.index)
    assert len(shared_samples) == len(metadata.index)
    assert "SampleInTableButNotMD" not in matched_table.columns
    assert "SampleInTableButNotMD" not in matched_metadata.index

    # The data that *was* matched up shouldn't have been altered somehow
    assert_frame_equal(table[metadata.index], matched_table)
    assert_frame_equal(metadata, matched_metadata)

    # A message about the sample being dropped should have been printed
    expected_msg = (
        "1 sample(s) in the BIOM table were not present in the sample "
        "metadata file"
    )
    assert expected_msg in capsys.readouterr().out
def test_match_table_and_data_complex(capsys):
    """Test the case where there are multiple sources of mismatched data:

    -> 1 extra feature in the table ("F9")
    -> 1 extra sample in the table ("Sample5")
    -> 1 extra sample in the metadata ("SampleM")
    """
    # pd.concat() replaces DataFrame.append(), which was removed in pandas 2.0
    import pandas as pd

    table, metadata, ranks = get_test_data()
    # Add the extra feature to the table
    new_f_row = DataFrame([[1, 2, 3, 4]], columns=table.columns, index=["F9"])
    table = pd.concat([table, new_f_row], verify_integrity=True)
    # Add the extra sample to the table
    table["Sample5"] = 5
    # Add the extra sample to the metadata
    new_s_row = DataFrame(
        [[4, 3, 2, 1]], columns=metadata.columns, index=["SampleM"]
    )
    metadata = pd.concat([metadata, new_s_row], verify_integrity=True)

    # Ok, actually run the function!
    m_table, m_metadata = match_table_and_data(table, ranks, metadata)
    captured = capsys.readouterr()

    # ...Now we can check all of the output messages. There'll be a lot.
    expected_message_1 = (
        "1 feature(s) in the BIOM table were not present in the feature "
        "rankings"
    )
    expected_message_2 = (
        "1 sample(s) in the BIOM table were not present in the sample "
        "metadata file"
    )
    expected_message_3 = (
        "1 sample(s) in the sample metadata file were not present in the BIOM "
        "table"
    )
    assert expected_message_1 in captured.out
    assert expected_message_2 in captured.out
    assert expected_message_3 in captured.out
def validate_rank_plot_json( biom_table_loc, metadata_loc, input_ranks_loc, rank_json ): """Ensure that the rank plot JSON makes sense.""" # TODO check that feature metadata annotations were properly applied to the # features. Will need the feature metadata file location to be passed here ref_feature_ranks = read_rank_file(input_ranks_loc) # Load the table as a Sparse DF, and then match it up with the sample # metadata. This is needed in order to ensure that the table only describes # samples in the sample metadata. # (And the reason we do *that* is so that, when we're trying to figure out # if a feature is "empty," we can just compute the sum of that feature's # row in the table -- which we couldn't do if the table contained samples # that would be filtered out in Qurro.) table = biom_table_to_sparse_df(load_table(biom_table_loc)) sample_metadata = read_metadata_file(metadata_loc) table, _ = match_table_and_data(table, ref_feature_ranks, sample_metadata) # Validate some basic properties of the plot # (This is all handled by Altair, so these property tests aren't # exhaustive; they're mainly intended to verify that a general plot # matching our specs is being created) assert rank_json["mark"] == "bar" assert rank_json["title"] == "Features" basic_vegalite_json_validation(rank_json) # Loop over every feature in the reference feature ranks. Check that each # feature's corresponding rank data in the rank plot JSON matches. rank_ordering = rank_json["datasets"]["qurro_rank_ordering"] rank_json_feature_data = get_data_from_plot_json( rank_json, id_field="Feature ID" ) for ref_feature_id in ref_feature_ranks.index: # If this feature is empty, it should have been filtered! if sum(table.loc[ref_feature_id]) == 0: assert ref_feature_id not in rank_json_feature_data continue # ...If this feature isn't empty, though, it shouldn't have been # filtered. (We assume that the user didn't pass in -x in this test.) 
# # Check to make sure that this feature ID is actually in the rank plot # JSON assert ref_feature_id in rank_json_feature_data # Get the corresponding feature's ranking information stored in the # rank plot JSON json_feature_data = rank_json_feature_data[ref_feature_id] # Note that we allow for mismatches in ranking names between the # reference and JSON feature rank data -- instead, we compare based on # the *order* of the feature rankings (aka the order of the columns in # either the feature differentials or ordination feature loadings). # This is fine, because we may want to rename certain rankings' names # (e.g. the axes in DEICODE's feature loadings, which default to just # 0, 1, 2) for ref_ranking, json_ranking in zip_longest( ref_feature_ranks.columns, rank_ordering ): # We use pytest's approx class to get past floating point # imprecisions. Note that we just leave this at the default for # approx, so if this starts failing then adjusting the tolerances # in approx() might be needed. actual_rank_val = ref_feature_ranks[ref_ranking][ref_feature_id] assert actual_rank_val == approx(json_feature_data[json_ranking])
def process_input(
    feature_ranks,
    sample_metadata,
    biom_table,
    feature_metadata=None,
    extreme_feature_count=None,
):
    """Validates/processes the input files and parameter(s) to Qurro.

    In particular, this function

    1. Calls validate_df() and then check_column_names() on all of the input
       DataFrames passed (feature ranks, sample metadata, feature metadata
       if passed).

    2. Calls replace_nan() on the metadata DataFrame(s), so that all
       missing values are represented consistently with a None (which will
       be represented as a null in JSON/JavaScript).

    3. Converts the BIOM table to a SparseDataFrame by calling
       biom_table_to_sparse_df().

    4. Matches up the table with the feature ranks and sample metadata by
       calling match_table_and_data().

    5. Calls filter_unextreme_features() using the provided
       extreme_feature_count. (If it's None, then nothing will be done.)

    6. Calls remove_empty_samples_and_features() to filter empty samples
       (and features). This is purposefully done *after*
       filter_unextreme_features() is called.

    7. Calls merge_feature_metadata() on the feature ranks and feature
       metadata. (If feature metadata is None, nothing will be done.)

    Returns
    -------
    output_metadata: pd.DataFrame
         Sample metadata, but matched with the table and with empty samples
         removed.

    output_ranks: pd.DataFrame
         Feature ranks, post-filtering and with feature metadata columns
         added in.

    ranking_ids
         The ranking columns' names in output_ranks.

    feature_metadata_cols: list
         The feature metadata columns' names in output_ranks.

    output_table: pd.SparseDataFrame
         The BIOM table, post matching with the feature ranks and sample
         metadata and with empty samples removed.
    """
    logging.debug("Starting processing input.")

    # Basic structural validation: ranks need >= 2 rows and >= 1 column,
    # sample metadata needs >= 1 row and >= 1 column
    validate_df(feature_ranks, "feature ranks", 2, 1)
    validate_df(sample_metadata, "sample metadata", 1, 1)
    if feature_metadata is not None:
        # It's cool if there aren't any features actually described in the
        # feature metadata (hence why we pass in 0 as the minimum # of rows in
        # the feature metadata DataFrame), but we still pass it to
        # validate_df() in order to ensure that:
        # 1) there's at least one feature metadata column (because
        #    otherwise the feature metadata is useless)
        # 2) column names are unique
        validate_df(feature_metadata, "feature metadata", 0, 1)

    check_column_names(sample_metadata, feature_ranks, feature_metadata)

    # Replace NaN values (which both _metadata_utils.read_metadata_file() and
    # qiime2.Metadata use to represent missing values, i.e. ""s) with None --
    # this is generally easier for us to handle in the JS side of things (since
    # it'll just be consistently converted to null by json.dumps()).
    sample_metadata = replace_nan(sample_metadata)
    if feature_metadata is not None:
        feature_metadata = replace_nan(feature_metadata)

    table = biom_table_to_sparse_df(biom_table)

    # Match up the table with the feature ranks and sample metadata.
    m_table, m_sample_metadata = match_table_and_data(
        table, feature_ranks, sample_metadata
    )

    # Note that although we always call filter_unextreme_features(), filtering
    # isn't necessarily always done (whether or not depends on the value of
    # extreme_feature_count and the contents of the table/ranks).
    # NOTE(review): this passes the original feature_ranks rather than a
    # "matched" version -- presumably fine since match_table_and_data()
    # requires every ranked feature to be present in the table; confirm.
    filtered_table, filtered_ranks = filter_unextreme_features(
        m_table, feature_ranks, extreme_feature_count
    )

    # Filter now-empty samples (and empty features) from the BIOM table.
    output_table, output_metadata, u_ranks = remove_empty_samples_and_features(
        filtered_table, m_sample_metadata, filtered_ranks
    )

    # Save a list of ranking IDs (before we add in feature metadata)
    # TODO: just have merge_feature_metadata() give us this?
    ranking_ids = u_ranks.columns

    output_ranks, feature_metadata_cols = merge_feature_metadata(
        u_ranks, feature_metadata
    )

    logging.debug("Finished input processing.")
    return (
        output_metadata,
        output_ranks,
        ranking_ids,
        feature_metadata_cols,
        output_table,
    )