def test_modz_multiple_columns():
    """Check modz consensus when replicates are keyed by two metadata columns."""
    replicate_columns = ["Metadata_g", "Metadata_h"]
    data_replicate_multi_df = data_replicate_df.assign(
        Metadata_h=["c", "c", "c", "d", "d", "d"]
    )

    # The expected result is to completely remove influence of anticorrelated sample
    consensus_df = modz(
        data_replicate_multi_df,
        replicate_columns,
        min_weight=0,
        precision=precision,
    )
    expected_result = pd.DataFrame(
        {
            "Metadata_g": ["a", "b"],
            "Metadata_h": ["c", "d"],
            "Cells_x": [1.0, 4.0],
            "Cytoplasm_y": [5.0, 2.0],
            "Nuclei_z": [2.0, -0.5],
        }
    )
    pd.testing.assert_frame_equal(
        expected_result.reset_index(), consensus_df.reset_index()
    )

    # With the min_weight = 1, then modz is mean
    consensus_df = modz(
        data_replicate_multi_df,
        replicate_columns,
        min_weight=1,
        precision=precision,
    )
    expected_result = (
        data_replicate_multi_df.groupby(replicate_columns).mean().round(4)
    )
    # `check_less_precise` was deprecated in pandas 1.1 and removed in 2.0.
    # A tolerance of 0.5e-3 corresponds to the former check_less_precise=True
    # (comparison to three digits after the decimal point).
    pd.testing.assert_frame_equal(
        expected_result.reset_index(),
        consensus_df,
        rtol=0.5e-3,
        atol=0.5e-3,
    )
def test_modz():
    """Check modz consensus with min_weight=0 and with min_weight=1 (plain mean)."""
    # The expected result is to completely remove influence of anticorrelated sample
    consensus_df = modz(
        data_replicate_df,
        replicate_columns,
        min_weight=0,
        precision=precision,
    )
    expected_result = pd.DataFrame(
        {
            "Cells_x": [1.0, 4.0],
            "Cytoplasm_y": [5.0, 2.0],
            "Nuclei_z": [2.0, -0.5],
        },
        index=["a", "b"],
    )
    expected_result.index.name = replicate_columns
    pd.testing.assert_frame_equal(expected_result.reset_index(), consensus_df)

    # With the min_weight = 1, then modz is mean
    consensus_df = modz(
        data_replicate_df,
        replicate_columns,
        min_weight=1,
        precision=precision,
    )
    expected_result = data_replicate_df.groupby(replicate_columns).mean().round(4)
    expected_result.index.name = replicate_columns
    # `check_less_precise` was deprecated in pandas 1.1 and removed in 2.0.
    # A tolerance of 0.5e-3 corresponds to the former check_less_precise=True
    # (comparison to three digits after the decimal point).
    pd.testing.assert_frame_equal(
        expected_result.reset_index(),
        consensus_df,
        rtol=0.5e-3,
        atol=0.5e-3,
    )
def test_modz_extraneous_column():
    """Check that an extra constant metadata column leaves the consensus unchanged."""
    # Anticorrelated samples are expected to contribute nothing to the result
    wanted_df = pd.DataFrame(
        {
            "Cells_x": [1.0, 4.0],
            "Cytoplasm_y": [5.0, 2.0],
            "Nuclei_z": [2.0, -0.5],
        },
        index=pd.Index(["a", "b"], name=replicate_columns),
    )

    # Add a metadata column that is not part of the replicate definition
    with_extra_column_df = data_replicate_df.assign(Metadata_h="c")
    observed_df = modz(
        with_extra_column_df,
        replicate_columns,
        min_weight=0,
        precision=precision,
    )

    pd.testing.assert_frame_equal(wanted_df.reset_index(), observed_df)
def test_modz_unbalanced_sample_numbers():
    """Check that modz copes with a replicate group containing a single sample."""
    # Five samples land in group "c" and only one in group "d"
    group_labels = ["c", "c", "c", "c", "c", "d"]
    unbalanced_df = data_replicate_df.assign(Metadata_h=group_labels)

    observed_df = modz(
        unbalanced_df,
        replicate_columns="Metadata_h",
        min_weight=0,
        precision=precision,
    )

    wanted_values = {
        "Metadata_h": ["c", "d"],
        "Cells_x": [0.9999, 5.0],
        "Cytoplasm_y": [5.9994, 1.0],
        "Nuclei_z": [2.9997, 1],
    }
    wanted_df = pd.DataFrame(wanted_values)

    pd.testing.assert_frame_equal(wanted_df, observed_df)
def test_modz_multiple_columns_feature_specify():
    """Check modz when the feature columns are passed explicitly."""
    # Two replicate groups, "a" and "b", with three samples each
    group_a_df = pd.DataFrame(
        {"g": "a", "x": [1, 1, -1], "y": [5, 5, -5], "z": [2, 2, -2]}
    )
    group_b_df = pd.DataFrame(
        {"g": "b", "x": [1, 3, 5], "y": [8, 3, 1], "z": [5, -2, 1]}
    )
    data_replicate_feature_df = pd.concat(
        [group_a_df, group_b_df], ignore_index=True
    )

    # Replace the positional index with string sample labels
    data_replicate_feature_df.index = list(
        map("sample_{}".format, data_replicate_feature_df.index)
    )

    # Anticorrelated samples are expected to contribute nothing to the result
    observed_df = modz(
        data_replicate_feature_df,
        replicate_columns="g",
        features=["x", "y", "z"],
        min_weight=0,
        precision=precision,
    )

    wanted_df = pd.DataFrame(
        {"x": [1.0, 4.0], "y": [5.0, 2.0], "z": [2.0, -0.5]},
        index=pd.Index(["a", "b"], name="g"),
    )
    pd.testing.assert_frame_equal(wanted_df.reset_index(), observed_df)
def consensus(
    profiles,
    replicate_columns=None,
    operation="median",
    features="infer",
    output_file="none",
    compression_options=None,
    float_format=None,
    modz_args=None,
):
    """Form level 5 consensus profile data.

    :param profiles: A file or pandas DataFrame of profile data
    :type profiles: str, pandas.DataFrame
    :param replicate_columns: Metadata columns indicating which replicates to
        collapse, defaults to None, which is interpreted as
        ["Metadata_Plate", "Metadata_Well"]
    :type replicate_columns: list
    :param operation: The method used to form consensus profiles, defaults to
        "median"
    :type operation: str
    :param features: The features to collapse, defaults to "infer"
    :type features: str, list
    :param output_file: If specified, the location to write the file, defaults
        to "none"
    :type output_file: str
    :param compression_options: the method to compress output data, defaults
        to None. See pycytominer.cyto_utils.output.py for options
    :type compression_options: str
    :param float_format: decimal precision to use in writing output file,
        defaults to None. For example, use "%.3g" for 3 decimal precision.
    :type float_format: str
    :param modz_args: Additional custom arguments passed as kwargs if
        operation="modz", defaults to None, which is interpreted as
        {"method": "spearman"}. See pycytominer.cyto_utils.modz for more
        details.
    :type modz_args: dict
    :return: The consensus profiles when output_file is "none"; otherwise the
        profiles are handed to ``output`` and nothing is returned.

    :Example:

    import pandas as pd
    from pycytominer import consensus

    data_df = pd.concat(
        [
            pd.DataFrame(
                {
                    "Metadata_Plate": "X",
                    "Metadata_Well": "a",
                    "Cells_x": [0.1, 0.3, 0.8],
                    "Nuclei_y": [0.5, 0.3, 0.1],
                }
            ),
            pd.DataFrame(
                {
                    "Metadata_Plate": "X",
                    "Metadata_Well": "b",
                    "Cells_x": [0.4, 0.2, -0.5],
                    "Nuclei_y": [-0.8, 1.2, -0.5],
                }
            ),
        ]
    ).reset_index(drop=True)

    consensus_df = consensus(
        profiles=data_df,
        replicate_columns=["Metadata_Plate", "Metadata_Well"],
        operation="median",
        features="infer",
        output_file="none",
    )
    """
    # Resolve defaults per call so that no mutable default object is shared
    # between invocations or exposed to mutation by the callees below.
    if replicate_columns is None:
        replicate_columns = ["Metadata_Plate", "Metadata_Well"]
    if modz_args is None:
        modz_args = {"method": "spearman"}

    # Confirm that the operation is supported
    check_consensus_operation(operation)

    # Load Data
    profiles = load_profiles(profiles)

    if operation == "modz":
        consensus_df = modz(
            population_df=profiles,
            replicate_columns=replicate_columns,
            features=features,
            **modz_args
        )
    else:
        consensus_df = aggregate(
            population_df=profiles,
            strata=replicate_columns,
            features=features,
            operation=operation,
            subset_data_df="none",
        )

    # The string "none" (not None) is the sentinel for "do not write a file"
    if output_file != "none":
        output(
            df=consensus_df,
            output_filename=output_file,
            compression_options=compression_options,
            float_format=float_format,
        )
    else:
        return consensus_df