Example #1
def test_match_table_and_data_ranked_features_not_in_table():
    # Qurro is pretty accepting of mismatched data, but if any of your ranked
    # features aren't in the BIOM table, Qurro will immediately throw an error.
    # (...because that is not a good situation.)
    table, metadata, ranks = get_test_data()
    new_feature_row = DataFrame([[9, 0]], columns=ranks.columns, index=["F9"])
    ranks_modified = ranks.append(new_feature_row, verify_integrity=True)
    with pytest.raises(ValueError) as exception_info:
        match_table_and_data(table, ranks_modified, metadata)
    expected_message = (
        "Of the 9 ranked features, 1 was not present in the input BIOM table")
    assert expected_message in str(exception_info.value)

    # Try this again; verify it works with more than 1 ranked feature not in
    # the table
    # (also, the error message should use "were" instead of "was" now :)
    new_feature_row = DataFrame([[10, -1]],
                                columns=ranks.columns,
                                index=["F10"])
    ranks_modified = ranks_modified.append(new_feature_row,
                                           verify_integrity=True)
    with pytest.raises(ValueError) as exception_info:
        match_table_and_data(table, ranks_modified, metadata)
    expected_message = (
        "Of the 10 ranked features, 2 were not present in the input BIOM table"
    )
    assert expected_message in str(exception_info.value)
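
Every test in this section calls get_test_data(), which isn't shown in these excerpts. Below is a minimal, hypothetical sketch of what it presumably returns, inferred only from how the tests use it; the actual values and the metadata/ranking column names here are made up.

from pandas import DataFrame


def get_test_data():
    """Hypothetical stand-in for the real helper used by these tests.

    Inferred from the tests: the ranks index holds 8 features (F1 through
    F8) with two ranking columns, the metadata index holds Sample1 through
    Sample4 with four columns, and the table is a features-by-samples
    DataFrame whose index/columns line up exactly with the ranks/metadata.
    All values and column names here are arbitrary.
    """
    feature_ids = ["F1", "F2", "F3", "F4", "F5", "F6", "F7", "F8"]
    sample_ids = ["Sample1", "Sample2", "Sample3", "Sample4"]
    # Feature table: rows are features, columns are samples
    table = DataFrame(
        [[i + j for j in range(4)] for i in range(8)],
        index=feature_ids,
        columns=sample_ids,
    )
    # Sample metadata: rows are samples, columns are metadata fields
    metadata = DataFrame(
        [[1, 2, 3, 4]] * 4,
        index=sample_ids,
        columns=["MD1", "MD2", "MD3", "MD4"],
    )
    # Feature ranks: rows are features, columns are rankings
    ranks = DataFrame(
        [[i, -i] for i in range(8)],
        index=feature_ids,
        columns=["Rank 0", "Rank 1"],
    )
    return table, metadata, ranks
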
Example #2
def test_match_table_and_data_complete_sample_mismatch():
    # Test that, if no samples are shared between the table and metadata, an
    # error is raised.
    table, metadata, ranks = get_test_data()

    # Instead of Sample1, ... use S1, ...
    metadata.index = ["S1", "S2", "S3", "S4"]
    with pytest.raises(ValueError) as exception_info:
        match_table_and_data(table, ranks, metadata)
    expected_message = (
        "No samples are shared between the sample metadata file and BIOM table"
    )
    assert expected_message in str(exception_info.value)
Example #3
def test_match_table_and_data_table_extra_feature(capsys):
    # Test case where table contains a feature that isn't in the ranks
    table, metadata, ranks = get_test_data()
    new_row = DataFrame(
        [[20, 20, 20, 20]],
        columns=table.columns,
        index=["FeatureInTableButNotRanks"],
    )
    table = table.append(new_row, verify_integrity=True)
    m_table, m_metadata = match_table_and_data(table, ranks, metadata)
    # Only features in the ranks' index should be left in the table's index,
    # and the table and ranks' indices should line up.
    assert len(set(m_table.index) & set(ranks.index)) == len(ranks.index)
    # Sanity check -- another way of verifying the above
    assert "FeatureInTableButNotRanks" not in m_table.index
    # Check that the matched-up fields' data wasn't altered somehow
    assert_frame_equal(table.loc[ranks.index], m_table)
    assert_frame_equal(metadata, m_metadata)
    # Check that a feature-dropping message was printed
    captured = capsys.readouterr()
    expected_msg = (
        "1 feature(s) in the BIOM table were not present in the feature "
        "rankings"
    )
    assert expected_msg in captured.out
Example #4
def test_match_table_and_data_metadata_extra_sample(capsys):
    # Test case where metadata contains a sample that isn't in the table
    table, metadata, ranks = get_test_data()

    # Add a new row to the metadata
    new_row = DataFrame(
        [[20, 20, 20, 20]],
        columns=metadata.columns,
        index=["SampleInMDButNotTable"],
    )
    metadata = metadata.append(new_row, verify_integrity=True)
    m_table, m_metadata = match_table_and_data(table, ranks, metadata)
    assert len(set(m_table.columns) & set(m_metadata.index)) == len(
        table.columns)
    assert "SampleInMDButNotTable" not in m_table.columns
    assert "SampleInMDButNotTable" not in m_metadata.index
    # Check that the matched-up fields' data wasn't altered somehow
    assert_frame_equal(table, m_table)
    assert_frame_equal(metadata.loc[table.columns], m_metadata)
    # Check that a message re: the sample being dropped was printed
    captured = capsys.readouterr()
    expected_msg = (
        "1 sample(s) in the sample metadata file were not present in the BIOM "
        "table")
    assert expected_msg in captured.out
Example #5
def test_match_table_and_data_no_change(capsys):
    # In basic case, nothing should change
    table, metadata, ranks = get_test_data()

    m_table, m_metadata = match_table_and_data(table, ranks, metadata)
    # Check that table and metadata are actually equal
    assert_frame_equal(table, m_table)
    assert_frame_equal(metadata, m_metadata)
    # Check that nothing was printed (i.e. no "sample/feature dropped" messages
    # or whatever)
    captured = capsys.readouterr()
    assert captured.out == ""
Example #6
def test_match_table_and_data_table_extra_sample(capsys):
    # Test case where table contains a sample that isn't in the metadata
    table, metadata, ranks = get_test_data()

    table["SampleInTableButNotMD"] = 10
    m_table, m_metadata = match_table_and_data(table, ranks, metadata)

    assert len(set(m_table.columns) & set(m_metadata.index)) == len(
        metadata.index)
    assert "SampleInTableButNotMD" not in m_table.columns
    assert "SampleInTableButNotMD" not in m_metadata.index
    # Check that the matched-up fields' data wasn't altered somehow
    assert_frame_equal(table[metadata.index], m_table)
    assert_frame_equal(metadata, m_metadata)
    # Check that a message re: the sample being dropped was printed
    captured = capsys.readouterr()
    expected_msg = (
        "1 sample(s) in the BIOM table were not present in the sample "
        "metadata file")
    assert expected_msg in captured.out
Example #7
def test_match_table_and_data_complex(capsys):
    # Test the case where there are multiple sources of mismatched data:
    # -> 1 extra feature in the table ("F9")
    # -> 1 extra sample in the table ("Sample5")
    # -> 1 extra sample in the metadata ("SampleM")
    table, metadata, ranks = get_test_data()

    # Add the extra feature to the table
    new_f_row = DataFrame([[1, 2, 3, 4]], columns=table.columns, index=["F9"])
    table = table.append(new_f_row, verify_integrity=True)

    # Add the extra sample to the table
    table["Sample5"] = 5

    # Add the extra sample to the metadata
    new_s_row = DataFrame(
        [[4, 3, 2, 1]], columns=metadata.columns, index=["SampleM"]
    )
    metadata = metadata.append(new_s_row, verify_integrity=True)

    # Ok, actually run the function!
    m_table, m_metadata = match_table_and_data(table, ranks, metadata)
    captured = capsys.readouterr()

    # ...Now we can check all of the output messages. There'll be a lot.
    expected_message_1 = (
        "1 feature(s) in the BIOM table were not present in the feature "
        "rankings"
    )
    expected_message_2 = (
        "1 sample(s) in the BIOM table were not present in the sample "
        "metadata file"
    )
    expected_message_3 = (
        "1 sample(s) in the sample metadata file were not present in the BIOM "
        "table"
    )
    assert expected_message_1 in captured.out
    assert expected_message_2 in captured.out
    assert expected_message_3 in captured.out
Example #8
def validate_rank_plot_json(
    biom_table_loc, metadata_loc, input_ranks_loc, rank_json
):
    """Ensure that the rank plot JSON makes sense."""

    # TODO check that feature metadata annotations were properly applied to the
    # features. Will need the feature metadata file location to be passed here

    ref_feature_ranks = read_rank_file(input_ranks_loc)

    # Load the table as a Sparse DF, and then match it up with the sample
    # metadata. This is needed in order to ensure that the table only describes
    # samples in the sample metadata.
    # (And the reason we do *that* is so that, when we're trying to figure out
    # if a feature is "empty," we can just compute the sum of that feature's
    # row in the table -- which we couldn't do if the table contained samples
    # that would be filtered out in Qurro.)
    table = biom_table_to_sparse_df(load_table(biom_table_loc))
    sample_metadata = read_metadata_file(metadata_loc)
    table, _ = match_table_and_data(table, ref_feature_ranks, sample_metadata)

    # Validate some basic properties of the plot
    # (This is all handled by Altair, so these property tests aren't
    # exhaustive; they're mainly intended to verify that a general plot
    # matching our specs is being created)
    assert rank_json["mark"] == "bar"
    assert rank_json["title"] == "Features"
    basic_vegalite_json_validation(rank_json)

    # Loop over every feature in the reference feature ranks. Check that each
    # feature's corresponding rank data in the rank plot JSON matches.
    rank_ordering = rank_json["datasets"]["qurro_rank_ordering"]
    rank_json_feature_data = get_data_from_plot_json(
        rank_json, id_field="Feature ID"
    )

    for ref_feature_id in ref_feature_ranks.index:
        # If this feature is empty, it should have been filtered!
        if sum(table.loc[ref_feature_id]) == 0:
            assert ref_feature_id not in rank_json_feature_data
            continue
        # ...If this feature isn't empty, though, it shouldn't have been
        # filtered. (We assume that the user didn't pass in -x in this test.)
        #
        # Check to make sure that this feature ID is actually in the rank plot
        # JSON
        assert ref_feature_id in rank_json_feature_data
        # Get the corresponding feature's ranking information stored in the
        # rank plot JSON
        json_feature_data = rank_json_feature_data[ref_feature_id]

        # Note that we allow for mismatches in ranking names between the
        # reference and JSON feature rank data -- instead, we compare based on
        # the *order* of the feature rankings (aka the order of the columns in
        # either the feature differentials or ordination feature loadings).
        # This is fine, because we may want to rename certain rankings
        # (e.g. the axes in DEICODE's feature loadings, which default to just
        # 0, 1, 2)
        for ref_ranking, json_ranking in zip_longest(
            ref_feature_ranks.columns, rank_ordering
        ):
            # We use pytest's approx class to get past floating point
            # imprecisions. Note that we just leave this at the default for
            # approx, so if this starts failing then adjusting the tolerances
            # in approx() might be needed.
            actual_rank_val = ref_feature_ranks[ref_ranking][ref_feature_id]
            assert actual_rank_val == approx(json_feature_data[json_ranking])
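
For reference, here is a hand-constructed sketch of the parts of the rank plot JSON that validate_rank_plot_json() actually inspects. This is not real Qurro output; only the keys touched by the assertions above are shown, and the ranking names are placeholders.

# Hypothetical, minimal illustration of the fields checked above. A real
# rank plot JSON is a full Vega-Lite spec generated by Altair; everything
# not asserted on in validate_rank_plot_json() is omitted.
example_rank_json = {
    "mark": "bar",
    "title": "Features",
    "datasets": {
        # A list of ranking names, compared positionally (via zip_longest)
        # against the columns of the reference feature ranks
        "qurro_rank_ordering": ["Rank 0", "Rank 1"],
        # The main per-feature dataset also lives under "datasets", but its
        # name is generated, so it isn't spelled out here. After
        # get_data_from_plot_json(..., id_field="Feature ID") it presumably
        # looks like: {"F1": {"Rank 0": 1.0, "Rank 1": -1.0}, ...}
    },
}
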
Example #9
def process_input(
    feature_ranks,
    sample_metadata,
    biom_table,
    feature_metadata=None,
    extreme_feature_count=None,
):
    """Validates/processes the input files and parameter(s) to Qurro.

       In particular, this function

       1. Calls validate_df() and then check_column_names() on all of the
          input DataFrames passed (feature ranks, sample metadata, feature
          metadata if passed).

       2. Calls replace_nan() on the metadata DataFrame(s), so that all
          missing values are represented consistently with a None (which
          will be represented as a null in JSON/JavaScript).

       3. Converts the BIOM table to a SparseDataFrame by calling
          biom_table_to_sparse_df().

       4. Matches up the table with the feature ranks and sample metadata by
          calling match_table_and_data().

       5. Calls filter_unextreme_features() using the provided
          extreme_feature_count. (If it's None, then nothing will be done.)

       6. Calls remove_empty_samples_and_features() to filter empty samples
          (and features). This is purposefully done *after*
          filter_unextreme_features() is called.

       7. Calls merge_feature_metadata() on the feature ranks and feature
          metadata. (If feature metadata is None, nothing will be done.)

       Returns
       -------
       output_metadata: pd.DataFrame
            Sample metadata, but matched with the table and with empty samples
            removed.

       output_ranks: pd.DataFrame
            Feature ranks, post-filtering and with feature metadata columns
            added in.

       ranking_ids: pd.Index
            The ranking columns' names in output_ranks.

       feature_metadata_cols: list
            The feature metadata columns' names in output_ranks.

       output_table: pd.SparseDataFrame
            The BIOM table, post matching with the feature ranks and sample
            metadata and with empty samples removed.
    """

    logging.debug("Starting processing input.")

    validate_df(feature_ranks, "feature ranks", 2, 1)
    validate_df(sample_metadata, "sample metadata", 1, 1)
    if feature_metadata is not None:
        # It's cool if there aren't any features actually described in the
        # feature metadata (hence why we pass in 0 as the minimum # of rows in
        # the feature metadata DataFrame), but we still pass it to
        # validate_df() in order to ensure that:
        #   1) there's at least one feature metadata column (because
        #      otherwise the feature metadata is useless)
        #   2) column names are unique
        validate_df(feature_metadata, "feature metadata", 0, 1)

    check_column_names(sample_metadata, feature_ranks, feature_metadata)

    # Replace NaN values (which both _metadata_utils.read_metadata_file() and
    # qiime2.Metadata use to represent missing values, i.e. ""s) with None --
    # this is generally easier for us to handle in the JS side of things (since
    # it'll just be consistently converted to null by json.dumps()).
    sample_metadata = replace_nan(sample_metadata)
    if feature_metadata is not None:
        feature_metadata = replace_nan(feature_metadata)

    table = biom_table_to_sparse_df(biom_table)

    # Match up the table with the feature ranks and sample metadata.
    m_table, m_sample_metadata = match_table_and_data(table, feature_ranks,
                                                      sample_metadata)

    # Note that although we always call filter_unextreme_features(), filtering
    # isn't always actually performed (whether it happens depends on the value
    # of extreme_feature_count and the contents of the table/ranks).
    filtered_table, filtered_ranks = filter_unextreme_features(
        m_table, feature_ranks, extreme_feature_count)

    # Filter now-empty samples (and empty features) from the BIOM table.
    output_table, output_metadata, u_ranks = remove_empty_samples_and_features(
        filtered_table, m_sample_metadata, filtered_ranks)

    # Save a list of ranking IDs (before we add in feature metadata)
    # TODO: just have merge_feature_metadata() give us this?
    ranking_ids = u_ranks.columns

    output_ranks, feature_metadata_cols = merge_feature_metadata(
        u_ranks, feature_metadata)

    logging.debug("Finished input processing.")
    return (
        output_metadata,
        output_ranks,
        ranking_ids,
        feature_metadata_cols,
        output_table,
    )
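
To round out Example #9, here is a hedged sketch of how process_input() might be driven from files, reusing load_table and the reader helpers that appear in Example #8. The file paths are placeholders, and the module imports for the Qurro helpers aren't shown in these excerpts, so they are omitted here as well.

from biom import load_table

# read_rank_file, read_metadata_file, and process_input are the same helpers
# used/defined in the examples above; their import paths aren't shown in
# these excerpts, so they're left out of this sketch.
feature_ranks = read_rank_file("differentials.tsv")          # placeholder path
sample_metadata = read_metadata_file("sample-metadata.tsv")  # placeholder path
biom_table = load_table("table.biom")                        # placeholder path

(
    output_metadata,
    output_ranks,
    ranking_ids,
    feature_metadata_cols,
    output_table,
) = process_input(
    feature_ranks,
    sample_metadata,
    biom_table,
    feature_metadata=None,       # no feature metadata in this sketch
    extreme_feature_count=None,  # keep all features (no extreme-feature filtering)
)
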