def test_add_sample_presence_count_zeros():
    """Checks the case when some features aren't present in any samples.

    The same table is mutated in three successive steps, and after each step
    add_sample_presence_count() is re-run and its "qurro_spc" column is
    compared against the expected per-feature counts:

    1. All counts for feature F3 are zeroed out.
    2. All counts in the entire table are zeroed out.
    3. A single count (feature F2 in Sample4) is set back to 1.
    """
    table, metadata, ranks = get_test_data()

    # Test 1: zero out all counts for feature F3
    table.loc["F3"] = 0
    output_feature_data = add_sample_presence_count(ranks, table)
    assert_series_equal(
        output_feature_data["qurro_spc"],
        Series([3, 2, 0, 3, 2, 2, 2, 2], index=ranks.index, name="qurro_spc"),
    )
    verify_spc_data_integrity(output_feature_data, ranks)

    # Test 2: zero out all counts
    table.loc[:] = 0
    ofd_2 = add_sample_presence_count(ranks, table)
    assert_series_equal(
        ofd_2["qurro_spc"],
        Series([0] * 8, index=ranks.index, name="qurro_spc"),
    )
    verify_spc_data_integrity(ofd_2, ranks)

    # Test 3: just one count for one feature
    # NOTE: this uses a single .loc[row, column] assignment instead of chained
    # indexing (table["Sample4"]["F2"] = 1). Chained assignment may write to a
    # temporary copy, and never updates the original DataFrame when pandas'
    # Copy-on-Write mode is active -- which would silently break this test.
    table.loc["F2", "Sample4"] = 1
    ofd_3 = add_sample_presence_count(ranks, table)
    assert_series_equal(
        ofd_3["qurro_spc"],
        Series([0, 1, 0, 0, 0, 0, 0, 0], index=ranks.index, name="qurro_spc"),
    )
    verify_spc_data_integrity(ofd_3, ranks)
def test_add_sample_presence_count_name_error():
    """Checks that a pre-existing "qurro_spc" column triggers a ValueError.

    In practice this situation should never come up, since
    check_column_names() is called beforehand -- but it doesn't hurt to be
    careful and verify the behavior here anyway.
    """
    table, _, ranks = get_test_data()

    # Rename the feature data's columns so that one of them collides with the
    # column name add_sample_presence_count() wants to add.
    ranks.columns = ["Rank 0", "qurro_spc"]

    with pytest.raises(ValueError):
        add_sample_presence_count(ranks, table)
def test_add_sample_presence_count_basic():
    """Checks add_sample_presence_count() on the unmodified test data."""
    # NOTE: for reference, the get_test_data() table initially looks like this:
    # "Sample1": [1, 2, 3, 4, 5, 6, 7, 8],
    # "Sample2": [8, 7, 6, 5, 4, 3, 2, 1],
    # "Sample3": [1, 0, 0, 0, 0, 0, 0, 0],
    # "Sample4": [0, 0, 0, 1, 0, 0, 0, 0],
    table, _, ranks = get_test_data()

    # Expected number of samples with a nonzero count, one entry per feature.
    expected_spc = Series(
        [3, 2, 2, 3, 2, 2, 2, 2], index=ranks.index, name="qurro_spc"
    )

    # Test a basic case.
    feature_data_with_spc = add_sample_presence_count(ranks, table)
    assert_series_equal(feature_data_with_spc["qurro_spc"], expected_spc)

    # Make sure that the underlying feature data remains the same
    verify_spc_data_integrity(feature_data_with_spc, ranks)
def gen_rank_plot(V, rank_type, ranking_ids, feature_metadata_cols, table_sdf):
    """Uses Altair to generate a JSON Vega-Lite spec for the rank plot.

    Parameters
    ----------

    V: pd.DataFrame
        DataFrame containing feature rank (and feature metadata, if
        applicable) information. (Indices correspond to features, and columns
        correspond to feature ranking or feature metadata fields.)

        This should have already been matched with the BIOM table, filtered
        (if -x passed), had empty features removed, etc.

    rank_type: str
        Human-readable name for a given ranking column that will be used as
        the prefix for each y-axis label in the rank plot. (This should be
        either "Differential" or "Feature Loading".)

    ranking_ids: pd.Index
        IDs of the actual "feature ranking" columns in V.

    feature_metadata_cols: pd.Index or list
        IDs of the "feature metadata" columns in V (if there wasn't any
        feature metadata provided, this can just be an empty list).

    table_sdf: pd.SparseDataFrame
        A representation of the input BIOM table containing count data. This
        is used to calculate qurro_spc (the number of samples a feature is
        present in) for each feature in V. This should ONLY contain samples
        that will be used in the Qurro visualization -- the presence of extra
        samples will mess up _df_utils.add_sample_presence_count().

    Returns
    -------

    rank_chart_json: dict
        A dict version of the alt.Chart for the rank plot, with
        qurro_rank_ordering and qurro_feature_metadata_ordering datasets
        added in indicating which columns describe feature rankings and
        which describe feature metadata. (Also has a qurro_rank_type "dataset"
        (really just a string) that points to the specified rank_type.)
    """

    # Work on a copy so that the caller's DataFrame isn't modified.
    rank_data = V.copy()

    # NOTE that until this point we've treated the actual rank values as just
    # "objects", as far as pandas is concerned. However, if we continue to
    # treat them as objects when sorting them, we'll get a list of feature
    # ranks in lexicographic order... which is not what we want. So we just
    # ensure that all of the columns contain numeric data.
    for col in ranking_ids:
        rank_data[col] = pd.to_numeric(rank_data[col])

    # The default rank column is just whatever the first rank is. This is what
    # the rank plot will use when it's first drawn.
    default_rank_col = ranking_ids[0]

    # Set default classification of every feature to "None"
    # (This value will be updated when a feature is selected in the rank plot
    # as part of the numerator, denominator, or both parts of the current log
    # ratio.)
    rank_data["qurro_classification"] = "None"

    # Add a "qurro_spc" column indicating how many samples each feature is
    # present in.
    rank_data = add_sample_presence_count(rank_data, table_sdf)

    # Replace "index" with "Feature ID". looks nicer in the visualization :)
    rank_data.rename_axis("Feature ID", axis="index", inplace=True)
    rank_data.reset_index(inplace=True)

    # Now, we can actually create the rank plot.
    rank_chart = (
        alt.Chart(
            rank_data,
            title="Features",
            background="#FFFFFF",
            autosize=alt.AutoSizeParams(resize=True),
        )
        .mark_bar()
        .transform_window(
            sort=[alt.SortField(field=default_rank_col, order="ascending")],
            # We don't use an alt.WindowFieldDef here because python gets
            # confused when you use "as" as an actual argument name. So we just
            # use this syntax.
            window=[{"op": "row_number", "as": "qurro_x"}],
        )
        .encode(
            # type="ordinal" needed on the scale here to make bars adjacent;
            # see https://stackoverflow.com/a/55544817/10730311.
            x=alt.X(
                "qurro_x",
                title="Feature Rankings",
                type="ordinal",
                scale=alt.Scale(paddingOuter=1, paddingInner=0, rangeStep=1),
                axis=alt.Axis(ticks=False, labelAngle=0),
            ),
            y=alt.Y(default_rank_col, type="quantitative"),
            color=alt.Color(
                "qurro_classification",
                title="Log-Ratio Classification",
                scale=alt.Scale(
                    domain=["None", "Numerator", "Denominator", "Both"],
                    range=["#e0e0e0", "#f00", "#00f", "#949"],
                ),
            ),
            tooltip=[
                alt.Tooltip(
                    field="qurro_x",
                    title="Current Ranking",
                    type="quantitative",
                ),
                alt.Tooltip(
                    field="qurro_classification",
                    title="Log-Ratio Classification",
                    type="nominal",
                ),
                alt.Tooltip(
                    field="qurro_spc",
                    title="Sample Presence Count",
                    type="quantitative",
                ),
                "Feature ID",
                *feature_metadata_cols,
                *ranking_ids,
            ],
        )
        .configure_axis(
            # Done in order to differentiate "None"-classification features
            # from grid lines
            gridColor="#f2f2f2",
            labelBound=True,
        )
        .interactive()
    )

    rank_chart_json = rank_chart.to_dict()
    rank_ordering = "qurro_rank_ordering"
    fm_col_ordering = "qurro_feature_metadata_ordering"
    dataset_name_for_rank_type = "qurro_rank_type"

    # Check the three dataset *names* that get written into
    # rank_chart_json["datasets"] below. (The rank type's dataset name is
    # passed here, not the rank_type value itself -- the value is what gets
    # stored, not the key it is stored under.)
    # NOTE(review): presumably check_json_dataset_names() guards against
    # clobbering datasets that already exist in the chart JSON -- confirm
    # against its definition.
    check_json_dataset_names(
        rank_chart_json,
        rank_ordering,
        fm_col_ordering,
        dataset_name_for_rank_type,
    )

    # Note we don't use rank_data.columns for setting the rank ordering. This
    # is because rank_data's columns now include both the ranking IDs and the
    # "Feature ID" and "qurro_classification" columns (as well as any feature
    # metadata the user saw fit to pass in).
    rank_chart_json["datasets"][rank_ordering] = list(ranking_ids)
    rank_chart_json["datasets"][fm_col_ordering] = list(feature_metadata_cols)
    rank_chart_json["datasets"][dataset_name_for_rank_type] = rank_type
    return rank_chart_json