def test_aggregate_median_allvar():
    """
    Median aggregation over all inferred features, in memory and via output_file
    """
    # Both aggregate() calls below share these arguments
    shared_kwargs = {
        "population_df": data_df,
        "strata": ["g"],
        "features": "infer",
        "operation": "median",
    }

    aggregate_result = aggregate(**shared_kwargs)

    # One single-row frame per group, stacked and re-indexed from zero
    per_group_frames = [
        pd.DataFrame({"g": group, "Cells_x": [3], "Nuclei_y": [3]})
        for group in ("a", "b")
    ]
    expected_result = (
        pd.concat(per_group_frames)
        .reset_index(drop=True)
        .astype(dtype_convert_dict)
    )

    assert aggregate_result.equals(expected_result)

    # Test output
    aggregate(output_file=test_output_file, **shared_kwargs)
    written_df = pd.read_csv(test_output_file)

    pd.testing.assert_frame_equal(written_df, expected_result)
def test_aggregate_median_with_missing_values():
    """
    Median aggregation of data_missing_df after one feature column is cast to str

    The expected output is the same per-group median frame used for the
    regular median test, converted with dtype_convert_dict.
    """
    # Convert dtype of one variable to object
    data_dtype_df = data_missing_df.copy()
    data_dtype_df.Cells_x = data_dtype_df.Cells_x.astype(str)

    aggregate_result = aggregate(
        population_df=data_dtype_df,
        strata=["g"],
        features="infer",
        operation="median",
    )

    expected_result = pd.concat(
        [
            pd.DataFrame({"g": "a", "Cells_x": [3], "Nuclei_y": [3]}),
            pd.DataFrame({"g": "b", "Cells_x": [3], "Nuclei_y": [3]}),
        ]
    ).reset_index(drop=True)
    expected_result = expected_result.astype(dtype_convert_dict)

    assert aggregate_result.equals(expected_result)
def test_aggregate_mean_subsetvar():
    """
    Mean aggregation restricted to an explicit feature subset (Cells_x only)
    """
    observed = aggregate(
        population_df=data_df,
        strata=["g"],
        features=["Cells_x"],
        operation="mean",
    )

    # The expected feature column is compared as float
    expected = pd.DataFrame({"g": ["a", "b"], "Cells_x": [4, 3]}).astype(
        {"Cells_x": float}
    )

    assert observed.equals(expected)
def test_aggregate_incorrect_object_feature():
    """
    Testing aggregate pycytominer function

    Covers two things: a KeyError when object_feature names a column that
    does not exist, and that rows whose strata value is NaN still form a group.
    """
    incorrect_object_feature = "DOES NOT EXIST"

    with pytest.raises(KeyError) as err:
        # Only the raising call belongs in this block: nothing placed after it
        # inside the `with` body would ever run.
        aggregate(
            population_df=data_df,
            strata=["g"],
            features="infer",
            operation="median",
            compute_object_count=True,
            object_feature=incorrect_object_feature,
        )

    # Inspect the raised exception itself (err.value), not the ExceptionInfo
    # wrapper, and check for the offending name rather than one specific
    # library wording of the message.
    assert incorrect_object_feature in str(err.value)

    # Test that aggregate doesn't drop samples if strata is na
    data_missing_group_df = pd.concat(
        [
            data_df,
            pd.DataFrame({"g": np.nan, "Cells_x": [1, 3, 8], "Nuclei_y": [5, 3, 1]}),
        ]
    )

    result = aggregate(
        population_df=data_missing_group_df,
        strata=["g"],
        features="infer",
        operation="median",
    )

    # There should be three total groups
    assert result.shape[0] == 3
def test_aggregate_comparment():
    """
    aggregate() on the merged image/cells table must agree with
    ap.aggregate_compartment("cells") and with the hard-coded expected frame
    """
    merged_cells_df = image_df.merge(
        cells_df,
        how="inner",
        on=["TableNumber", "ImageNumber"],
    )
    result = aggregate(merged_cells_df)
    ap_result = ap.aggregate_compartment("cells")

    expected_result = pd.DataFrame(
        {
            "Metadata_Plate": ["plate", "plate"],
            "Metadata_Well": ["A01", "A02"],
            "Cells_a": [368.0, 583.5],
            "Cells_b": [482.0, 478.5],
            "Cells_c": [531.0, 461.5],
            "Cells_d": [585.5, 428.0],
        }
    )

    # Pairwise comparisons, in the same order as before
    comparisons = (
        (result, expected_result),
        (result, ap_result),
        (ap_result, expected_result),
    )
    for left_df, right_df in comparisons:
        pd.testing.assert_frame_equal(left_df, right_df)
def test_custom_objectnumber_feature():
    """
    Object counting with a non-default object_feature column name
    """
    custom_object_feature = "Custom_ObjectNumber_Feature"

    renamed_df = data_df.copy()
    renamed_df = renamed_df.rename(
        columns={"Metadata_ObjectNumber": custom_object_feature}
    )

    aggregate_result = aggregate(
        population_df=renamed_df,
        strata=["g"],
        features="infer",
        operation="median",
        compute_object_count=True,
        object_feature=custom_object_feature,
    )

    # One single-row frame per group; key order fixes the column order
    per_group_frames = [
        pd.DataFrame(
            {
                "g": group,
                "Metadata_Object_Count": [3],
                "Cells_x": [3],
                "Nuclei_y": [3],
            }
        )
        for group in ("a", "b")
    ]
    expected_result = (
        pd.concat(per_group_frames)
        .reset_index(drop=True)
        .astype(dtype_convert_dict)
    )

    assert aggregate_result.equals(expected_result)
def test_aggregate_profiles():
    """
    ap.aggregate_profiles() must match the hard-coded profiles, and must match
    aggregate() applied to the merged single cells (site count aside)
    """
    result = ap.aggregate_profiles()

    expected_columns = {
        "Metadata_Plate": ["plate", "plate"],
        "Metadata_Well": ["A01", "A02"],
        "Metadata_Object_Count": [50, 50],
        "Metadata_Site_Count": [1, 1],
        "Cells_a": [368.0, 583.5],
        "Cells_b": [482.0, 478.5],
        "Cells_c": [531.0, 461.5],
        "Cells_d": [585.5, 428.0],
        "Cytoplasm_a": [479.5, 495.5],
        "Cytoplasm_b": [445.5, 459.0],
        "Cytoplasm_c": [407.5, 352.0],
        "Cytoplasm_d": [533.0, 545.0],
        "Nuclei_a": [591.5, 435.5],
        "Nuclei_b": [574.0, 579.0],
        "Nuclei_c": [588.5, 538.5],
        "Nuclei_d": [483.0, 560.0],
    }
    expected_result = pd.DataFrame(expected_columns)

    # Column order is not part of the contract here, so sort both sides
    pd.testing.assert_frame_equal(
        result.sort_index(axis="columns"),
        expected_result.sort_index(axis="columns"),
    )

    # Confirm aggregation after merging single cells
    sc_df = ap.merge_single_cells()
    sc_aggregated_df = aggregate(sc_df, compute_object_count=True)
    sc_aggregated_df = sc_aggregated_df.sort_index(axis="columns")

    # The single-cell aggregation has no site count column to compare against
    result_without_site_count = result.sort_index(axis="columns").drop(
        columns="Metadata_Site_Count"
    )
    pd.testing.assert_frame_equal(result_without_site_count, sc_aggregated_df)
def aggregate_compartment(
    self,
    compartment,
    compute_subsample=False,
    compute_counts=False,
    add_image_features=False,
    n_aggregation_memory_strata=1,
):
    """Aggregate one compartment's measurements, chunk by chunk.

    Each chunk produced by ``self._compartment_df_generator`` is merged with
    ``self.image_df``, passed to ``aggregate()``, and the per-chunk results
    are concatenated row-wise.

    Parameters
    ----------
    compartment : str
        Compartment to aggregate.
    compute_subsample : bool, default False
        Whether or not to subsample. Subsampling only runs when the instance
        is also configured for it (``subsample_frac < 1`` or
        ``subsample_n != "all"``).
    compute_counts : bool, default False
        Whether or not to compute the number of objects in each compartment
        and the number of fields of view per well.
    add_image_features : bool, default False
        Whether or not to add image features. Only consulted on the path that
        also merges the fields-of-view counts.
    n_aggregation_memory_strata : int, default 1
        Number of unique strata to pull from the database into working memory
        at once. Typically 1 is fastest. A larger number uses more memory.
        For example, if aggregating by "well", then
        n_aggregation_memory_strata=1 means that one "well" will be pulled
        from the SQLite database into memory at a time.

    Returns
    -------
    pandas.core.frame.DataFrame
        DataFrame of aggregated profiles.
    """
    check_compartments(compartment)

    subsampling_configured = self.subsample_frac < 1 or self.subsample_n != "all"
    if subsampling_configured and compute_subsample:
        self.get_subsample(compartment=compartment)

    # Load image data if not already loaded
    if not self.load_image_data:
        self.load_image()
        self.load_image_data = True

    # Aggregate each chunk of the compartment table independently
    aggregated_chunks = []
    chunk_iterator = self._compartment_df_generator(
        compartment=compartment,
        n_aggregation_memory_strata=n_aggregation_memory_strata,
    )
    for compartment_chunk in chunk_iterator:
        merged_chunk = self.image_df.merge(
            compartment_chunk, how="inner", on=self.merge_cols
        )
        merged_chunk = merged_chunk.rename(self.linking_col_rename, axis="columns")

        chunk_features = (
            infer_cp_features(merged_chunk, compartments=compartment)
            if self.features == "infer"
            else self.features
        )

        chunk_profiles = aggregate(
            population_df=merged_chunk,
            strata=self.strata,
            compute_object_count=compute_counts,
            operation=self.aggregation_operation,
            subset_data_df=self.subset_data_df,
            features=chunk_features,
            object_feature=self.object_feature,
        )

        if compute_counts and self.fields_of_view_feature not in self.strata:
            site_counts = aggregate_fields_count(
                self.image_df, self.strata, self.fields_of_view_feature
            )

            if add_image_features:
                site_counts = aggregate_image_features(
                    site_counts,
                    self.image_features_df,
                    self.image_feature_categories,
                    self.image_cols,
                    self.strata,
                    self.aggregation_operation,
                )

            # Right merge keeps every aggregated profile row
            chunk_profiles = site_counts.merge(
                chunk_profiles, on=self.strata, how="right"
            )

            # Put metadata columns first, feature columns after
            leading_metadata = infer_cp_features(chunk_profiles, metadata=True)
            trailing_features = infer_cp_features(
                chunk_profiles, image_features=True
            )
            chunk_profiles = chunk_profiles.reindex(
                columns=leading_metadata + trailing_features
            )

        aggregated_chunks.append(chunk_profiles)

    # Stack the per-chunk results and renumber the rows from zero
    return pd.concat(aggregated_chunks, axis=0).reset_index(drop=True)
else: warnings.warn( f"{site_file} does not exist. There must have been an error in processing" ) single_cell_df = pd.concat(single_cell_df, axis="rows").reset_index(drop=True) # Perform the aggregation based on the defined levels and columns aggregate_output_dir.mkdir(parents=True, exist_ok=True) for aggregate_level, aggregate_columns in aggregate_levels.items(): aggregate_output_file = aggregate_output_files[aggregate_level] print( f"Now aggregating by {aggregate_level}...with operation: {aggregate_operation}" ) aggregate_df = aggregate( population_df=single_cell_df, strata=aggregate_columns, features=aggregate_features, operation=aggregate_operation, ) output( aggregate_df, output_filename=aggregate_output_file, compression=compression, float_format=float_format, )
def aggregate_compartment(
    self,
    compartment,
    compute_subsample=False,
    compute_counts=False,
    aggregate_args=None,
):
    """Aggregate morphological profiles. Uses pycytominer.aggregate()

    Parameters
    ----------
    compartment : str
        Compartment to aggregate.
    compute_subsample : bool, default False
        Whether or not to subsample.
    compute_counts : bool, default False
        Whether or not to compute the number of objects in each compartment
        and the number of fields of view per well.
    aggregate_args : dict, optional
        Additional arguments passed as input to pycytominer.aggregate().
        The dictionary is copied before use, so the caller's object is
        never modified.

    Returns
    -------
    pandas.core.frame.DataFrame
        DataFrame of aggregated profiles.
    """
    check_compartments(compartment)

    if (self.subsample_frac < 1 or self.subsample_n != "all") and compute_subsample:
        self.get_subsample(compartment=compartment)

    # Load image data if not already loaded
    if not self.load_image_data:
        self.load_image()
        self.load_image_data = True

    population_df = self.image_df.merge(
        self.load_compartment(compartment=compartment),
        how="inner",
        on=self.merge_cols,
    ).rename(self.linking_col_rename, axis="columns")

    # Work on a shallow copy: the keys filled in below must not leak back
    # into a dictionary the caller may reuse for another compartment.
    aggregate_args = {} if aggregate_args is None else dict(aggregate_args)

    # Inferring features is tricky with non-canonical data, so only infer
    # when no explicit feature list was supplied (missing key or "infer").
    requested_features = aggregate_args.get("features", "infer")
    if isinstance(requested_features, str) and requested_features == "infer":
        requested_features = infer_cp_features(
            population_df, compartments=compartment
        )
    aggregate_args["features"] = requested_features

    aggregate_args.setdefault("object_feature", self.object_feature)

    object_df = aggregate(
        population_df=population_df,
        strata=self.strata,
        compute_object_count=compute_counts,
        operation=self.aggregation_operation,
        subset_data_df=self.subset_data_df,
        **aggregate_args,
    )

    if compute_counts and self.fields_of_view_feature not in self.strata:
        # Count fields of view per stratum from the image table
        count_columns = list(np.union1d(self.strata, self.fields_of_view_feature))
        fields_count_df = self.image_df.loc[:, count_columns]
        fields_count_df = (
            fields_count_df.groupby(self.strata)[self.fields_of_view_feature]
            .count()
            .reset_index()
            .rename(columns={self.fields_of_view_feature: "Metadata_Site_Count"})
        )

        # Right merge keeps every aggregated profile row
        object_df = fields_count_df.merge(object_df, on=self.strata, how="right")

    return object_df
def consensus(
    profiles,
    replicate_columns=None,
    operation="median",
    features="infer",
    output_file="none",
    compression_options=None,
    float_format=None,
    modz_args=None,
):
    """Form level 5 consensus profile data.

    :param profiles: A file or pandas DataFrame of profile data
    :type profiles: str
    :param replicate_columns: Metadata columns indicating which replicates to collapse, defaults to ["Metadata_Plate", "Metadata_Well"]
    :type replicate_columns: list
    :param operation: The method used to form consensus profiles, defaults to "median"
    :type operation: str
    :param features: The features to collapse, defaults to "infer"
    :type features: str, list
    :param output_file: If specified, the location to write the file, defaults to "none"
    :type output_file: str
    :param compression_options: the method to compress output data, defaults to None. See pycytominer.cyto_utils.output.py for options
    :type compression_options: str
    :param float_format: decimal precision to use in writing output file, defaults to None. For example, use "%.3g" for 3 decimal precision.
    :type float_format: str
    :param modz_args: Additional custom arguments passed as kwargs if operation="modz", defaults to {"method": "spearman"}. See pycytominer.cyto_utils.modz for more details.
    :type modz_args: dict
    :return: The consensus profiles when output_file is "none"; otherwise the profiles are written to output_file and None is returned
    :rtype: pandas.core.frame.DataFrame or None

    :Example:

    import pandas as pd
    from pycytominer import consensus

    data_df = pd.concat(
        [
            pd.DataFrame(
                {
                    "Metadata_Plate": "X",
                    "Metadata_Well": "a",
                    "Cells_x": [0.1, 0.3, 0.8],
                    "Nuclei_y": [0.5, 0.3, 0.1],
                }
            ),
            pd.DataFrame(
                {
                    "Metadata_Plate": "X",
                    "Metadata_Well": "b",
                    "Cells_x": [0.4, 0.2, -0.5],
                    "Nuclei_y": [-0.8, 1.2, -0.5],
                }
            ),
        ]
    ).reset_index(drop=True)

    consensus_df = consensus(
        profiles=data_df,
        replicate_columns=["Metadata_Plate", "Metadata_Well"],
        operation="median",
        features="infer",
        output_file="none",
    )
    """
    # Resolve defaults per call: mutable objects in the signature would be
    # shared by every invocation of this function.
    if replicate_columns is None:
        replicate_columns = ["Metadata_Plate", "Metadata_Well"]
    if modz_args is None:
        modz_args = {"method": "spearman"}

    # Confirm that the operation is supported
    check_consensus_operation(operation)

    # Load Data
    profiles = load_profiles(profiles)

    if operation == "modz":
        consensus_df = modz(
            population_df=profiles,
            replicate_columns=replicate_columns,
            features=features,
            **modz_args,
        )
    else:
        consensus_df = aggregate(
            population_df=profiles,
            strata=replicate_columns,
            features=features,
            operation=operation,
            subset_data_df="none",
        )

    # The string "none" is the sentinel for "return instead of writing"
    if output_file == "none":
        return consensus_df

    output(
        df=consensus_df,
        output_filename=output_file,
        compression_options=compression_options,
        float_format=float_format,
    )
    return None
axis="columns")) file = os.path.join("results", "all_profile_metadata.tsv") all_measurements_df.to_csv(file, sep='\t', index=False) print(all_measurements_df.shape) all_measurements_df.head() # ## A. Apply Median Consensus Aggregation # # ### 1) To the Cell Painting Data # In[10]: x_median_df = aggregate(x_df, strata=["Metadata_cell_line", "Metadata_pert_name"], features="infer", operation="median") x_median_df = (x_median_df.query( "Metadata_pert_name in @all_measurements_df.Metadata_pert_name.unique()" ).query( "Metadata_cell_line in @all_measurements_df.Metadata_cell_line.unique()" ).reset_index(drop=True).reset_index().rename({"index": "Metadata_profile_id"}, axis='columns')) x_median_df.Metadata_profile_id = [ "profile_{}".format(x) for x in x_median_df.Metadata_profile_id ] print(x_median_df.shape) x_median_df.head()
output_file, index=False, sep="\t", compression={"method": "gzip", "mtime": 1} ) # ## Calculate bulk perturbseq data # In[7]: # Perform single cell aggregation into bulk bulk_df = aggregate( population_df=sc_df, strata=["Metadata_guide_identity"], features=gene_features, operation="median" ) # create a column for the gene bulk_df = ( bulk_df .assign(Metadata_gene_identity=[x.split("_")[0] for x in bulk_df.Metadata_guide_identity]) .query("Metadata_gene_identity != '*'") ) bulk_df = bulk_df.reindex(["Metadata_guide_identity", "Metadata_gene_identity"] + gene_features, axis="columns") print(bulk_df.shape) bulk_df.head()