def test_check_column_names():
    _, sm, fr = get_test_data()
    fm = fr.copy()
    fm.columns = ["FM1", "FM2"]

    # Shouldn't get an error with default col names
    check_column_names(sm, fr, fm)

    # 1. Check for problematic names in sample metadata columns ("Sample ID",
    # "qurro_balance")
    sm.columns = ["Metadata1", "Sample ID", "Metadata3", "Metadata4"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"Sample ID"' in str(exception_info.value)

    # Shouldn't get an error with different case
    sm.columns = ["Metadata1", "sample id", "Metadata3", "Metadata4"]
    check_column_names(sm, fr, fm)

    sm.columns = ["qurro_balance", "Sample ID", "Metadata3", "Metadata4"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    # The "Sample ID" check has priority in the error message
    assert '"Sample ID"' in str(exception_info.value)

    sm.columns = ["qurro_balance", "Metadata2", "Metadata3", "Metadata4"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"qurro_balance"' in str(exception_info.value)

    # Reset sample metadata columns to be sane
    sm.columns = ["Metadata1", "Metadata2", "Metadata3", "Metadata4"]

    # 2. Check for problematic names in feature ranking columns ("Feature ID",
    # "qurro_classification", "qurro_spc", "qurro_x")
    fr.columns = ["R1", "Feature ID"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"Feature ID"' in str(exception_info.value)

    # If multiple problematic names are present, the ID one takes precedence,
    # then the _classification one, then the _x one, then the _spc one. (This
    # is just set by the order of "if" statements in check_column_names();
    # it's an arbitrary choice that doesn't matter much in the grand scheme
    # of things, but we might as well test these cases. Also, if you somehow
    # have some or all of these column names in a real dataset, then I have a
    # lot of questions.)
    fr.columns = ["Feature ID", "qurro_classification"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"Feature ID"' in str(exception_info.value)

    fr.columns = ["R1", "qurro_classification"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"qurro_classification"' in str(exception_info.value)

    fr.columns = ["qurro_x", "qurro_classification"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"qurro_classification"' in str(exception_info.value)

    fr.columns = ["qurro_x", "R2"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"qurro_x"' in str(exception_info.value)

    fr.columns = ["qurro_x", "Feature ID"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"Feature ID"' in str(exception_info.value)

    fr.columns = ["qurro_spc", "qurro_x"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"qurro_x"' in str(exception_info.value)

    fr.columns = ["qurro_spc", "R2"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"qurro_spc"' in str(exception_info.value)

    fr.columns = ["qurro_spc", "qurro_classification"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"qurro_classification"' in str(exception_info.value)

    # Reset feature ranking columns to be sane
    fr.columns = ["R1", "R2"]

    # 3. Check for problematic names in feature metadata columns
    # ("Feature ID", "qurro_classification", "qurro_spc").
    # This is essentially the same stuff as the feature ranking test above.
    fm.columns = ["FM1", "Feature ID"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"Feature ID"' in str(exception_info.value)

    fm.columns = ["Feature ID", "qurro_classification"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"Feature ID"' in str(exception_info.value)

    fm.columns = ["FM1", "qurro_classification"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"qurro_classification"' in str(exception_info.value)

    fm.columns = ["qurro_spc", "qurro_classification"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"qurro_classification"' in str(exception_info.value)

    fm.columns = ["qurro_spc", "FM2"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"qurro_spc"' in str(exception_info.value)

    fm.columns = ["FM1", "qurro_spc"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"qurro_spc"' in str(exception_info.value)

    # Reset feature metadata columns to be sane
    fm.columns = ["FM1", "FM2"]

    # 4. Check for the case where the feature ranking and feature metadata
    # columns are not distinct
    fr.columns = ["FM1", "R2"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert "must be distinct" in str(exception_info.value)

    fr.columns = ["FM1", "FM2"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert "must be distinct" in str(exception_info.value)
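# The assertions above pin down a precedence order: "Sample ID" beats
# "qurro_balance" in sample metadata, and "Feature ID" beats
# "qurro_classification" beats "qurro_x" beats "qurro_spc" in the feature
# DataFrames. The sketch below is a hypothetical reconstruction of that logic
# from the tested behavior alone -- it is NOT Qurro's actual
# check_column_names(). Only the quoted column names and the "must be
# distinct" phrase are pinned down by the test; the rest of each error
# message here is made up for illustration.


def _sketch_check_column_names(sample_metadata, feature_ranks, feature_metadata):
    """Hypothetical sketch of a column-name check consistent with the test
    above."""

    def first_bad_name(df, disallowed):
        # Return the highest-priority disallowed name present in df's
        # columns, or None. Matching is exact and case-sensitive (the test
        # shows that a lowercase "sample id" column is fine).
        for name in disallowed:
            if name in df.columns:
                return name
        return None

    checks = (
        (sample_metadata, "sample metadata", ["Sample ID", "qurro_balance"]),
        (
            feature_ranks,
            "feature ranking",
            ["Feature ID", "qurro_classification", "qurro_x", "qurro_spc"],
        ),
        (
            feature_metadata,
            "feature metadata",
            ["Feature ID", "qurro_classification", "qurro_spc"],
        ),
    )
    for df, desc, disallowed in checks:
        bad = first_bad_name(df, disallowed)
        if bad is not None:
            raise ValueError(
                '"{}" can\'t be used as a {} column name.'.format(bad, desc)
            )
    # Finally, the ranking and metadata column names must not overlap.
    if set(feature_ranks.columns) & set(feature_metadata.columns):
        raise ValueError(
            "Feature ranking and feature metadata column names must be "
            "distinct."
        )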
def process_input(
    feature_ranks,
    sample_metadata,
    biom_table,
    feature_metadata=None,
    extreme_feature_count=None,
):
    """Validates/processes the input files and parameter(s) to Qurro.

    In particular, this function

    1. Calls validate_df() and then check_column_names() on all of the
       input DataFrames passed (feature ranks, sample metadata, feature
       metadata if passed).

    2. Calls replace_nan() on the metadata DataFrame(s), so that all
       missing values are represented consistently with a None (which will
       be represented as a null in JSON/JavaScript).

    3. Converts the BIOM table to a SparseDataFrame by calling
       biom_table_to_sparse_df().

    4. Matches up the table with the feature ranks and sample metadata by
       calling match_table_and_data().

    5. Calls filter_unextreme_features() using the provided
       extreme_feature_count. (If it's None, then nothing will be done.)

    6. Calls remove_empty_samples_and_features() to filter empty samples
       (and features). This is purposefully done *after*
       filter_unextreme_features() is called.

    7. Calls merge_feature_metadata() on the feature ranks and feature
       metadata. (If feature metadata is None, nothing will be done.)

    Returns
    -------
    output_metadata: pd.DataFrame
        Sample metadata, but matched with the table and with empty samples
        removed.

    output_ranks: pd.DataFrame
        Feature ranks, post-filtering and with feature metadata columns
        added in.

    ranking_ids
        The ranking columns' names in output_ranks.

    feature_metadata_cols: list
        The feature metadata columns' names in output_ranks.

    output_table: pd.SparseDataFrame
        The BIOM table, post matching with the feature ranks and sample
        metadata and with empty samples removed.
    """
    logging.debug("Starting processing input.")

    validate_df(feature_ranks, "feature ranks", 2, 1)
    validate_df(sample_metadata, "sample metadata", 1, 1)
    if feature_metadata is not None:
        # It's cool if there aren't any features actually described in the
        # feature metadata (hence why we pass in 0 as the minimum number of
        # rows in the feature metadata DataFrame), but we still pass it to
        # validate_df() in order to ensure that:
        # 1) there's at least one feature metadata column (because otherwise
        #    the feature metadata is useless)
        # 2) column names are unique
        validate_df(feature_metadata, "feature metadata", 0, 1)

    check_column_names(sample_metadata, feature_ranks, feature_metadata)

    # Replace NaN values (which both _metadata_utils.read_metadata_file() and
    # qiime2.Metadata use to represent missing values, i.e. ""s) with None --
    # this is generally easier for us to handle on the JS side of things
    # (since it'll just be consistently converted to null by json.dumps()).
    sample_metadata = replace_nan(sample_metadata)
    if feature_metadata is not None:
        feature_metadata = replace_nan(feature_metadata)

    table = biom_table_to_sparse_df(biom_table)

    # Match up the table with the feature ranks and sample metadata.
    m_table, m_sample_metadata = match_table_and_data(
        table, feature_ranks, sample_metadata
    )

    # Note that although we always call filter_unextreme_features(), filtering
    # isn't necessarily always done (whether or not it is depends on the value
    # of extreme_feature_count and the contents of the table/ranks).
    filtered_table, filtered_ranks = filter_unextreme_features(
        m_table, feature_ranks, extreme_feature_count
    )

    # Filter now-empty samples (and empty features) from the BIOM table.
    output_table, output_metadata, u_ranks = remove_empty_samples_and_features(
        filtered_table, m_sample_metadata, filtered_ranks
    )

    # Save a list of ranking IDs (before we add in feature metadata).
    # TODO: just have merge_feature_metadata() give us this?
    ranking_ids = u_ranks.columns

    output_ranks, feature_metadata_cols = merge_feature_metadata(
        u_ranks, feature_metadata
    )

    logging.debug("Finished input processing.")
    return (
        output_metadata,
        output_ranks,
        ranking_ids,
        feature_metadata_cols,
        output_table,
    )
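# A minimal usage sketch for process_input(). Everything here except
# process_input() itself is an assumption for illustration: the helper name,
# its parameters, and the pd.read_csv()-based loading are not part of Qurro
# (the real entry points are Qurro's standalone CLI and its QIIME 2 plugin,
# which assemble these inputs themselves).


def _example_process_input_usage(table_path, ranks_path, metadata_path):
    """Hypothetical sketch: load inputs from disk and run process_input()."""
    import biom
    import pandas as pd

    # biom.load_table() reads a BIOM-format feature table from a file path.
    biom_table = biom.load_table(table_path)
    # Assume TSV files with feature/sample IDs in the first column.
    feature_ranks = pd.read_csv(ranks_path, sep="\t", index_col=0)
    sample_metadata = pd.read_csv(metadata_path, sep="\t", index_col=0)
    return process_input(feature_ranks, sample_metadata, biom_table)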