예제 #1
0
def test_vibe_check_successes():
    """Checks cases where vibe_check() should accept the test data."""
    table, _metadata, ranks = get_test_data()

    # With no safe_range given, the default one is used; all of the test
    # data falls inside of it, so this call shouldn't raise anything.
    vibe_check(ranks, table)

    # The numbers in the test data (both the table and the ranks) go from
    # 0 to 8, so a safe range of exactly [0, 8] should be accepted as well.
    exact_fit_range = [0, 8]
    vibe_check(ranks, table, safe_range=exact_fit_range)
예제 #2
0
def test_vibe_check_safe_range_invalid_safe_ranges():
    """Checks that vibe_check() rejects malformed safe_range arguments.

    Two kinds of invalid ranges are tried: ranges that don't contain exactly
    two entries, and a two-entry range whose bounds are in the wrong order.
    Both are expected to produce a ValueError with a specific message.
    """
    table, _metadata, ranks = get_test_data()

    bad_length_msg = "safe_range must have a length of 2."
    bad_order_msg = "safe_range[1] must be GREATER THAN safe_range[0]."

    # Lists and tuples that are too long, empty, or only have one entry
    wrong_length_ranges = [[1, 2, 3, 4, 5], [], [1], (), (2, )]
    for bad_range in wrong_length_ranges:
        with pytest.raises(ValueError) as err:
            vibe_check(ranks, table, safe_range=bad_range)
        assert bad_length_msg in str(err.value)

    # Correct length, but the "upper" bound is below the "lower" bound
    with pytest.raises(ValueError) as err:
        vibe_check(ranks, table, safe_range=[10, 1])
    assert bad_order_msg in str(err.value)
예제 #3
0
def test_vibe_check_failures():
    """Checks cases where vibe_check() should raise an OverflowError.

    First, custom safe ranges that exclude part of the test data's feature
    table are tried. Then, using the default safe range, a single entry in
    the feature rankings is overwritten with values just outside (and well
    outside) the default limits that the expected error messages name:
    -9007199254740991 and 9007199254740991, i.e. -(2**53 - 1) and 2**53 - 1.
    """
    table, _metadata, ranks = get_test_data()

    # The numbers in the test data range from 0 to 8, so a safe range that
    # starts at 1 excludes the smallest table entries and should fail.
    with pytest.raises(OverflowError) as exception_info:
        vibe_check(ranks, table, safe_range=[1, 8])
    assert (
        'The input feature table contains entries lower than the "safe" lower '
        "limit for numbers of 1.") in str(exception_info.value)

    # Likewise, a safe range that ends at 7 excludes the largest table
    # entries and should also fail.
    with pytest.raises(OverflowError) as exception_info:
        vibe_check(ranks, table, safe_range=[0, 7])
    assert ('The input feature table contains entries larger than the "safe" '
            "upper limit for numbers of 7.") in str(exception_info.value)

    # Default safe limits, as named in the error messages asserted below.
    # The lower limit is the negation of the upper one: writing it as
    # "-(2**53) - 1" would give -9007199254740993 due to operator
    # precedence, which would make "lower_lim - 1" miss the real boundary.
    upper_lim = (2**53) - 1
    lower_lim = -upper_lim

    # Test failure, with the default safe range, on a few small cases.
    # (lower_lim - 1 is the first value below the safe lower limit.)
    weird_small_values = [lower_lim - 1, lower_lim * 2, lower_lim * 3]
    for w in weird_small_values:
        # NOTE(review): this is chained assignment; it presumably relies on
        # ranks being a DataFrame that is modified in place -- confirm.
        ranks["Rank 0"]["F3"] = w

        with pytest.raises(OverflowError) as exception_info:
            vibe_check(ranks, table)
        assert (
            "The input feature rankings data contains entries lower than the "
            '"safe" lower limit for numbers of -9007199254740991.') in str(
                exception_info.value)

    # Test failure, with the default safe range, on a few large cases.
    # (upper_lim + 1 is the first value above the safe upper limit.)
    weird_large_values = [upper_lim + 1, upper_lim * 2, upper_lim * 3]
    for w in weird_large_values:
        ranks["Rank 0"]["F3"] = w

        with pytest.raises(OverflowError) as exception_info:
            vibe_check(ranks, table)
        assert (
            "The input feature rankings data contains entries larger than the "
            '"safe" upper limit for numbers of 9007199254740991.') in str(
                exception_info.value)
예제 #4
0
def process_input(
    feature_ranks,
    sample_metadata,
    biom_table,
    feature_metadata=None,
    extreme_feature_count=None,
):
    """Validates and processes the inputs to Qurro.

    The steps below are run in order:

    1. validate_df() is called on each input DataFrame (feature ranks,
       sample metadata, and feature metadata if it was passed), followed
       by a single check_column_names() call covering all of them.

    2. replace_nan() is applied to the metadata DataFrame(s) so that
       missing values are consistently represented as None (which turns
       into null in JSON/JavaScript).

    3. The BIOM table is converted to a SparseDataFrame using
       biom_table_to_sparse_df().

    4. vibe_check() is run on the feature ranks and the converted table to
       make sure their numbers are within the range of safe IEEE 754
       numbers for JavaScript. NOTE: the metadata is not yet checked this
       way; this still needs to be done somehow.

    5. match_table_and_data() matches the table up with the feature ranks
       and sample metadata.

    6. filter_unextreme_features() is called with extreme_feature_count.
       (If that is None, nothing will be done.)

    7. remove_empty_samples_and_features() filters out empty samples (and
       features). This is intentionally done *after* step 6.

    8. merge_feature_metadata() is called on the feature ranks and feature
       metadata. (If feature metadata is None, nothing will be done.)

    Parameters
    ----------
    feature_ranks: pd.DataFrame
         Feature ranks. Validated by validate_df() with the arguments 2
         and 1 (presumably a minimum of 2 rows and 1 column).

    sample_metadata: pd.DataFrame
         Sample metadata. Validated by validate_df() with the arguments 1
         and 1 (presumably a minimum of 1 row and 1 column).

    biom_table
         The BIOM table; passed as-is to biom_table_to_sparse_df().

    feature_metadata: pd.DataFrame or None
         Optional feature metadata. It may describe zero features, but
         must have at least one column.

    extreme_feature_count
         Passed as-is to filter_unextreme_features(); None by default.

    Returns
    -------
    output_metadata: pd.DataFrame
         Sample metadata, but matched with the table and with empty samples
         removed.

    output_ranks: pd.DataFrame
         Feature ranks, post-filtering and with feature metadata columns
         added in.

    ranking_ids
         The ranking columns' names in output_ranks (the .columns of the
         filtered ranks, taken before feature metadata is merged in).

    feature_metadata_cols: list
         The feature metadata columns' names in output_ranks.

    output_table: pd.SparseDataFrame
         The BIOM table, post matching with the feature ranks and sample
         metadata and with empty samples removed.
    """

    logging.debug("Starting processing input.")

    has_feature_metadata = feature_metadata is not None

    validate_df(feature_ranks, "feature ranks", 2, 1)
    validate_df(sample_metadata, "sample metadata", 1, 1)
    if has_feature_metadata:
        # A minimum of 0 rows is passed here, since feature metadata that
        # doesn't actually describe any features is acceptable. Validation
        # is still worthwhile, because it ensures that:
        #   1) at least one feature metadata column exists (feature
        #      metadata without any columns would be useless)
        #   2) the column names are unique
        validate_df(feature_metadata, "feature metadata", 0, 1)

    check_column_names(sample_metadata, feature_ranks, feature_metadata)

    # Missing values (i.e. ""s) show up as NaN from both
    # _metadata_utils.read_metadata_file() and qiime2.Metadata. Swapping
    # them for None makes things simpler on the JS side, since json.dumps()
    # will consistently convert None to null.
    cleaned_sample_metadata = replace_nan(sample_metadata)
    cleaned_feature_metadata = (
        replace_nan(feature_metadata) if has_feature_metadata else None
    )

    sparse_table = biom_table_to_sparse_df(biom_table)

    # The solely-numeric data should only contain "safe" numbers
    vibe_check(feature_ranks, sparse_table)

    # Line the table up with the feature ranks and the sample metadata.
    matched_table, matched_sample_metadata = match_table_and_data(
        sparse_table, feature_ranks, cleaned_sample_metadata
    )

    # filter_unextreme_features() is always called, but it doesn't
    # necessarily filter anything: that depends on extreme_feature_count
    # and on the contents of the table/ranks.
    extreme_table, extreme_ranks = filter_unextreme_features(
        matched_table, feature_ranks, extreme_feature_count
    )

    # Drop samples (and features) that are now empty from the BIOM table.
    nonempty = remove_empty_samples_and_features(
        extreme_table, matched_sample_metadata, extreme_ranks
    )
    output_table, output_metadata, nonempty_ranks = nonempty

    # Record the ranking IDs now, before feature metadata columns get added
    # TODO: just have merge_feature_metadata() give us this?
    ranking_ids = nonempty_ranks.columns

    output_ranks, feature_metadata_cols = merge_feature_metadata(
        nonempty_ranks, cleaned_feature_metadata
    )

    logging.debug("Finished input processing.")
    return (
        output_metadata,
        output_ranks,
        ranking_ids,
        feature_metadata_cols,
        output_table,
    )