Example #1
import pytest

# check_column_names() and get_test_data() are assumed to be imported from
# Qurro's utility and test-helper modules (their import paths aren't shown
# in this snippet).


def test_check_column_names():

    _, sm, fr = get_test_data()
    fm = fr.copy()
    fm.columns = ["FM1", "FM2"]

    # Shouldn't get an error with default col names
    check_column_names(sm, fr, fm)

    # 1. Check for problematic names in sample metadata columns ("Sample ID",
    # "qurro_balance")

    sm.columns = ["Metadata1", "Sample ID", "Metadata3", "Metadata4"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"Sample ID"' in str(exception_info.value)

    # Shouldn't get an error when the case differs (the name check is
    # case-sensitive)
    sm.columns = ["Metadata1", "sample id", "Metadata3", "Metadata4"]
    check_column_names(sm, fr, fm)

    sm.columns = ["qurro_balance", "Sample ID", "Metadata3", "Metadata4"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    # Sample ID check has priority in error msg
    assert '"Sample ID"' in str(exception_info.value)

    sm.columns = ["qurro_balance", "Metadata2", "Metadata3", "Metadata4"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"qurro_balance"' in str(exception_info.value)

    # reset sample metadata columns to be sane
    sm.columns = ["Metadata1", "Metadata2", "Metadata3", "Metadata4"]

    # 2. Check for problematic names in feature ranking columns ("Feature ID",
    # "qurro_classification", "qurro_spc", "qurro_x")

    fr.columns = ["R1", "Feature ID"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"Feature ID"' in str(exception_info.value)

    # If multiple problematic names are present, the ID one takes precedence,
    # then the _classification one, then the _x one, then the _spc one.
    # (This ordering is arbitrary; it just reflects the order of the "if"
    # statements in check_column_names(). It doesn't matter much in the grand
    # scheme of things, but we might as well pin it down in tests.)
    # (Also, if a real dataset somehow has some or all of these column names,
    # that raises a lot of questions.)
    fr.columns = ["Feature ID", "qurro_classification"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"Feature ID"' in str(exception_info.value)

    fr.columns = ["R1", "qurro_classification"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"qurro_classification"' in str(exception_info.value)

    fr.columns = ["qurro_x", "qurro_classification"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"qurro_classification"' in str(exception_info.value)

    fr.columns = ["qurro_x", "R2"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"qurro_x"' in str(exception_info.value)

    fr.columns = ["qurro_x", "Feature ID"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"Feature ID"' in str(exception_info.value)

    fr.columns = ["qurro_spc", "qurro_x"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"qurro_x"' in str(exception_info.value)

    fr.columns = ["qurro_spc", "R2"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"qurro_spc"' in str(exception_info.value)

    fr.columns = ["qurro_spc", "qurro_classification"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"qurro_classification"' in str(exception_info.value)

    # reset feature ranking columns to be sane
    fr.columns = ["R1", "R2"]

    # 3. Check for problematic names in feature metadata columns ("Feature ID",
    # "qurro_classification", "qurro_spc")
    # This is essentially the same stuff as the feature ranking test above.

    fm.columns = ["FM1", "Feature ID"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"Feature ID"' in str(exception_info.value)

    fm.columns = ["Feature ID", "qurro_classification"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"Feature ID"' in str(exception_info.value)

    fm.columns = ["FM1", "qurro_classification"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"qurro_classification"' in str(exception_info.value)

    fm.columns = ["qurro_spc", "qurro_classification"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"qurro_classification"' in str(exception_info.value)

    fm.columns = ["qurro_spc", "FM2"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"qurro_spc"' in str(exception_info.value)

    fm.columns = ["FM1", "qurro_spc"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert '"qurro_spc"' in str(exception_info.value)

    # reset feature metadata columns to be sane
    fm.columns = ["FM1", "FM2"]

    # 4. Check for the case where the feature ranking and feature metadata
    # columns are not distinct
    fr.columns = ["FM1", "R2"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert "must be distinct" in str(exception_info.value)

    fr.columns = ["FM1", "FM2"]
    with pytest.raises(ValueError) as exception_info:
        check_column_names(sm, fr, fm)
    assert "must be distinct" in str(exception_info.value)
Example #2
import logging

# The helper functions used below (validate_df, check_column_names,
# replace_nan, biom_table_to_sparse_df, match_table_and_data,
# filter_unextreme_features, remove_empty_samples_and_features,
# merge_feature_metadata) are assumed to be imported from Qurro's internal
# utility modules (not shown in this snippet).


def process_input(
    feature_ranks,
    sample_metadata,
    biom_table,
    feature_metadata=None,
    extreme_feature_count=None,
):
    """Validates/processes the input files and parameter(s) to Qurro.

       In particular, this function

       1. Calls validate_df() and then check_column_names() on all of the
          input DataFrames passed (feature ranks, sample metadata, feature
          metadata if passed).

       2. Calls replace_nan() on the metadata DataFrame(s), so that all
          missing values are consistently represented as None (which will
          be serialized as null in JSON/JavaScript).

       3. Converts the BIOM table to a SparseDataFrame by calling
          biom_table_to_sparse_df().

       4. Matches up the table with the feature ranks and sample metadata by
          calling match_table_and_data().

       5. Calls filter_unextreme_features() using the provided
          extreme_feature_count. (If it's None, then nothing will be done.)

       6. Calls remove_empty_samples_and_features() to filter empty samples
          (and features). This is purposefully done *after*
          filter_unextreme_features() is called.

       7. Calls merge_feature_metadata() on the feature ranks and feature
          metadata. (If feature metadata is None, nothing will be done.)

       Returns
       -------
       output_metadata: pd.DataFrame
            Sample metadata, but matched with the table and with empty samples
            removed.

       output_ranks: pd.DataFrame
            Feature ranks, post-filtering and with feature metadata columns
            added in.

       ranking_ids: pd.Index
            The ranking columns' names in output_ranks.

       feature_metadata_cols: list
            The feature metadata columns' names in output_ranks.

       output_table: pd.SparseDataFrame
            The BIOM table, post matching with the feature ranks and sample
            metadata and with empty samples removed.
    """

    logging.debug("Starting processing input.")

    validate_df(feature_ranks, "feature ranks", 2, 1)
    validate_df(sample_metadata, "sample metadata", 1, 1)
    if feature_metadata is not None:
        # It's fine if no features are actually described in the feature
        # metadata (which is why we pass 0 as the minimum number of rows in
        # the feature metadata DataFrame), but we still pass it to
        # validate_df() in order to ensure that:
        #   1) there's at least one feature metadata column (because
        #      otherwise the feature metadata is useless)
        #   2) column names are unique
        validate_df(feature_metadata, "feature metadata", 0, 1)

    check_column_names(sample_metadata, feature_ranks, feature_metadata)

    # Replace NaN values (which both _metadata_utils.read_metadata_file() and
    # qiime2.Metadata use to represent missing values, i.e. empty strings)
    # with None. This is generally easier for us to handle on the JS side of
    # things, since None is consistently converted to null by json.dumps().
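    # (For example, a metadata cell read in as NaN becomes None here, and
    # json.dumps({"col": None}) then produces '{"col": null}'.)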
    sample_metadata = replace_nan(sample_metadata)
    if feature_metadata is not None:
        feature_metadata = replace_nan(feature_metadata)

    table = biom_table_to_sparse_df(biom_table)

    # Match up the table with the feature ranks and sample metadata.
    m_table, m_sample_metadata = match_table_and_data(table, feature_ranks,
                                                      sample_metadata)

    # Note that although we always call filter_unextreme_features(), filtering
    # isn't necessarily done every time (whether it happens depends on the
    # value of extreme_feature_count and the contents of the table/ranks).
    filtered_table, filtered_ranks = filter_unextreme_features(
        m_table, feature_ranks, extreme_feature_count)

    # Filter now-empty samples (and empty features) from the BIOM table.
    # (Done after filter_unextreme_features(), since removing features there
    # can leave some samples empty.)
    output_table, output_metadata, u_ranks = remove_empty_samples_and_features(
        filtered_table, m_sample_metadata, filtered_ranks)

    # Save a list of ranking IDs (before we add in feature metadata)
    # TODO: just have merge_feature_metadata() give us this?
    ranking_ids = u_ranks.columns

    output_ranks, feature_metadata_cols = merge_feature_metadata(
        u_ranks, feature_metadata)

    logging.debug("Finished input processing.")
    return (
        output_metadata,
        output_ranks,
        ranking_ids,
        feature_metadata_cols,
        output_table,
    )
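
A minimal usage sketch, assuming the imports above plus biom, numpy, and pandas. The toy table and column names here are illustrative, not from the original source.

import biom
import numpy as np
import pandas as pd

# Three ranked features and two samples with one metadata column.
feature_ranks = pd.DataFrame({"R1": [1.0, -2.0, 0.5]}, index=["F1", "F2", "F3"])
sample_metadata = pd.DataFrame({"Metadata1": ["a", "b"]}, index=["S1", "S2"])
biom_table = biom.Table(
    np.array([[1, 0], [2, 3], [4, 5]]),
    observation_ids=["F1", "F2", "F3"],
    sample_ids=["S1", "S2"],
)

(output_metadata, output_ranks, ranking_ids,
 feature_metadata_cols, output_table) = process_input(
    feature_ranks, sample_metadata, biom_table
)
# output_ranks keeps the ranking column(s); ranking_ids lists their names.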