def load_data(
    plate,
    pycyto_dict,
    cyto_dict,
    level,
    round_decimals,
    well_col="Metadata_Well",
    plate_col="Metadata_Plate",
    sample_col="Metadata_broad_sample",
):
    # Extract file from file dictionary
    try:
        pycyto_file = pycyto_dict[plate][level]
        cyto_file = cyto_dict[plate][level]
    except KeyError:
        raise KeyError(f"Data not found, skipping! {plate}: {level}")

    # Load data
    pycyto_df = pd.read_csv(pycyto_file)
    try:
        cyto_df = pd.read_csv(cyto_file).drop(
            ["Cytoplasm_Parent_Cells", "Cytoplasm_Parent_Nuclei"], axis="columns"
        )
    except KeyError:
        cyto_df = pd.read_csv(cyto_file)

    # Confirm metadata are aligned
    pd.testing.assert_series_equal(pycyto_df.loc[:, well_col], cyto_df.loc[:, well_col])
    pd.testing.assert_series_equal(pycyto_df.loc[:, plate_col], cyto_df.loc[:, plate_col])
    pd.testing.assert_series_equal(pycyto_df.loc[:, sample_col], cyto_df.loc[:, sample_col])

    # Align to CP features only
    pycyto_features = infer_cp_features(pycyto_df)
    cyto_features = infer_cp_features(cyto_df)

    # Features must be the same before feature selection
    if level in ["level_3", "level_4a"]:
        assert set(pycyto_features) == set(cyto_features), "features should be aligned!"

    # Reindex and round data (sorted() gives a deterministic column order;
    # reindexing with a raw set is unsupported in newer pandas versions)
    pycyto_df = pycyto_df.reindex(sorted(set(pycyto_features)), axis="columns").round(
        round_decimals
    )
    cyto_df = cyto_df.reindex(sorted(set(cyto_features)), axis="columns").round(
        round_decimals
    )

    # If we're testing the pycytominer feature selection procedure,
    # align cyto data with pycyto features
    if level == "pycytominer_select":
        cyto_df = cyto_df.reindex(sorted(set(pycyto_features)), axis="columns")

    # Return a tuple of (pycyto data, cyto data) with aligned feature indices
    return (pycyto_df, cyto_df)
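
# Minimal usage sketch. The nested dictionaries map plate -> processing level
# -> profile file; the plate name and file paths below are hypothetical, not
# from the original pipeline.
pycyto_dict = {"SQ00014812": {"level_3": "pycytominer/SQ00014812_augmented.csv.gz"}}
cyto_dict = {"SQ00014812": {"level_3": "cytominer/SQ00014812_augmented.csv.gz"}}

pycyto_df, cyto_df = load_data(
    plate="SQ00014812",
    pycyto_dict=pycyto_dict,
    cyto_dict=cyto_dict,
    level="level_3",
    round_decimals=5,
)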
def pipeline_normalize(self, batch, plate, steps, samples, suffix=None):
    normalize_steps = steps
    output_dir = pathlib.PurePath(".", self.pipeline_output, batch, plate)
    annotate_output_file = pathlib.PurePath(output_dir, f"{plate}_augmented.csv.gz")
    normalize_output_file = pathlib.PurePath(output_dir, f"{plate}_normalized.csv.gz")
    if suffix:
        normalize_output_file = pathlib.PurePath(
            output_dir, f"{plate}_normalized_{suffix}.csv.gz"
        )

    normalization_features = normalize_steps["features"]
    normalization_method = normalize_steps["method"]

    if normalization_features == "infer" and self.noncanonical:
        normalization_features = cyto_utils.infer_cp_features(
            pd.read_csv(annotate_output_file), compartments=self.compartments
        )

    normalize(
        profiles=annotate_output_file,
        features=normalization_features,
        samples=samples,
        method=normalization_method,
        output_file=normalize_output_file,
        compression_options=self.pipeline_options["compression"],
        float_format=self.pipeline_options["float_format"],
    )
def variance_threshold(
    population_df, features="infer", samples="all", freq_cut=0.05, unique_cut=0.01
):
    """
    Exclude features that have low variance (low information content)

    Parameters
    ----------
    population_df : pandas.core.frame.DataFrame or file
        DataFrame that includes metadata and observation features.
    features : list, default "infer"
        List of features present in the population dataframe [default: "infer"]
        if "infer", then assume cell painting features are those that start with
        "Cells_", "Nuclei_", or "Cytoplasm_".
    samples : list or str, default "all"
        List of samples to perform operation on. If "all", use all samples to calculate.
    freq_cut : float, default 0.05
        Ratio (2nd most common feature val / most common).
    unique_cut : float, default 0.01
        Ratio (num unique features / num samples).

    Returns
    -------
    excluded_features : list of str
        List of features to exclude from the population_df.
    """
    assert 0 <= freq_cut <= 1, "freq_cut variable must be between (0 and 1)"
    assert 0 <= unique_cut <= 1, "unique_cut variable must be between (0 and 1)"

    # Subset dataframe
    if samples != "all":
        population_df = population_df.loc[samples, :]

    if features == "infer":
        features = infer_cp_features(population_df)

    population_df = population_df.loc[:, features]

    # Test if excluded for low frequency
    excluded_features_freq = population_df.apply(
        lambda x: calculate_frequency(x, freq_cut), axis="rows"
    )
    excluded_features_freq = excluded_features_freq[
        excluded_features_freq.isna()
    ].index.tolist()

    # Test if excluded for uniqueness
    n = population_df.shape[0]
    num_unique_features = population_df.nunique()

    unique_ratio = num_unique_features / n
    unique_ratio = unique_ratio < unique_cut
    excluded_features_unique = unique_ratio[unique_ratio].index.tolist()

    excluded_features = list(set(excluded_features_freq + excluded_features_unique))
    return excluded_features
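
# Minimal sketch of variance_threshold on a toy dataframe. Assumes the
# function and its helpers (infer_cp_features, calculate_frequency) are
# importable; the column values are made up to trigger the frequency rule.
import pandas as pd

toy_df = pd.DataFrame(
    {
        "Metadata_Well": ["A01", "A02", "A03", "A04"],
        "Cells_constant": [1.0, 1.0, 1.0, 1.0],  # near-zero variance
        "Nuclei_varied": [0.1, 0.5, 0.9, 0.3],   # should be kept
    }
)

excluded = variance_threshold(toy_df, features=["Cells_constant", "Nuclei_varied"])
print(excluded)  # expected to contain "Cells_constant"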
def convert_data(df_dict):
    df = pd.concat(df_dict.values(), ignore_index=True, sort=True).reset_index(drop=True)
    cp_cols = infer_cp_features(df)
    meta_cols = df.drop(cp_cols, axis="columns").columns.tolist()
    return df.reindex(meta_cols + cp_cols, axis="columns")
def load_data(
    y_col="Metadata_CellLine", wt_col="WT", return_meta=False, shuffle_row_order=False
):
    train_file = pathlib.Path("data", "example_train.tsv.gz")
    train_df = pd.read_csv(train_file, sep="\t")

    test_file = pathlib.Path("data", "example_test.tsv.gz")
    test_df = pd.read_csv(test_file, sep="\t")

    if shuffle_row_order:
        train_df = train_df.sample(frac=1).reset_index(drop=True)
        test_df = test_df.sample(frac=1).reset_index(drop=True)

    y_train_df = pd.DataFrame(train_df.loc[:, y_col]).assign(status=0)
    y_train_df.loc[y_train_df.loc[:, y_col] != wt_col, "status"] = 1

    y_test_df = pd.DataFrame(test_df.loc[:, y_col]).assign(status=0)
    y_test_df.loc[y_test_df.loc[:, y_col] != wt_col, "status"] = 1

    cp_features = infer_cp_features(train_df)
    x_train_df = train_df.loc[:, cp_features]
    x_test_df = test_df.loc[:, cp_features]

    if return_meta:
        meta_train_df = train_df.drop(cp_features, axis="columns")
        meta_test_df = test_df.drop(cp_features, axis="columns")
        return x_train_df, y_train_df, meta_train_df, x_test_df, y_test_df, meta_test_df
    else:
        return x_train_df, y_train_df, x_test_df, y_test_df
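
# Minimal usage sketch: binary labels are derived from the cell line column,
# 0 for wild type (wt_col) and 1 otherwise. Assumes the example_train /
# example_test files exist under data/.
x_train_df, y_train_df, x_test_df, y_test_df = load_data(shuffle_row_order=True)
print(y_train_df.status.value_counts())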
def aggregate(
    population_df,
    strata=["Metadata_Plate", "Metadata_Well"],
    features="infer",
    operation="median",
    subset_data_df="none",
):
    """
    Combine population dataframe variables by strata groups using given operation

    Arguments:
    population_df - pandas DataFrame to group and aggregate
    strata - [default: ["Metadata_Plate", "Metadata_Well"]] list indicating the
             columns to groupby and aggregate
    features - [default: "infer"] or list indicating features that should be aggregated
    operation - [default: "median"] a string indicating how the data is aggregated
                currently only supports one of ['mean', 'median']
    subset_data_df - [default: "none"] a pandas dataframe indicating how to subset
                     the input

    Return:
    Pandas DataFrame of aggregated features
    """
    # Check that the operation is supported
    operation = check_aggregate_operation(operation)

    # Subset the data to specified samples
    if isinstance(subset_data_df, pd.DataFrame):
        population_df = subset_data_df.merge(
            population_df, how="left", on=subset_data_df.columns.tolist()
        ).reindex(population_df.columns, axis="columns")

    # Subset dataframe to only specified variables if provided
    strata_df = population_df.loc[:, strata]
    if features == "infer":
        features = infer_cp_features(population_df)
        population_df = population_df.loc[:, features]
    else:
        population_df = population_df.loc[:, features]

    # Fix dtype of input features (they should all be floats!)
    convert_dict = {x: float for x in features}
    population_df = population_df.astype(convert_dict)

    # Merge back metadata used to aggregate by
    population_df = pd.concat([strata_df, population_df], axis="columns")

    # Perform aggregating function
    population_df = population_df.groupby(strata)

    if operation == "median":
        population_df = population_df.median().reset_index()
    else:
        population_df = population_df.mean().reset_index()

    # Aggregated image number and object number do not make sense
    for col in ["ImageNumber", "ObjectNumber"]:
        if col in population_df.columns:
            population_df = population_df.drop([col], axis="columns")

    return population_df
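
# Minimal sketch: collapsing single-cell rows to per-well medians. The toy
# frame is illustrative; check_aggregate_operation and infer_cp_features are
# assumed importable from the surrounding module.
import pandas as pd

toy_df = pd.DataFrame(
    {
        "Metadata_Plate": ["p1"] * 4,
        "Metadata_Well": ["A01", "A01", "A02", "A02"],
        "Cells_area": [10, 20, 30, 50],
    }
)

agg_df = aggregate(toy_df, strata=["Metadata_Plate", "Metadata_Well"])
print(agg_df)  # one row per well with the median Cells_area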
def correlation_threshold(
    population_df, features="infer", samples="all", threshold=0.9, method="pearson"
):
    """
    Exclude features that have correlations above a certain threshold

    Arguments:
    population_df - pandas DataFrame that includes metadata and observation features
    features - a list of features present in the population dataframe [default: "infer"]
               if "infer", then assume cell painting features are those that start with
               "Cells_", "Nuclei_", or "Cytoplasm_"
    samples - list samples to perform operation on [default: "all"]
              - if "all", use all samples to calculate
    threshold - float between (0, 1) to exclude features [default: 0.9]
    method - string indicating which correlation metric to use to test cutoff
             [default: "pearson"]

    Return:
    list of features to exclude from the population_df
    """
    # Check that the input method is supported
    method = check_correlation_method(method)

    assert 0 <= threshold <= 1, "threshold variable must be between (0 and 1)"

    # Subset dataframe and calculate correlation matrix across subset features
    if samples != "all":
        population_df = population_df.loc[samples, :]

    if features == "infer":
        features = infer_cp_features(population_df)

    population_df = population_df.loc[:, features]

    # Get correlation matrix and lower triangle of pairwise correlations in long format
    data_cor_df, pairwise_df = get_pairwise_correlation(
        population_df=population_df, method=method
    )

    # Get absolute sum of correlation across features
    # The lower the index, the less correlation to the full data frame
    # We want to drop features with highest correlation, so drop higher index
    variable_cor_sum = data_cor_df.abs().sum().sort_values().index

    # And subset to only variable combinations that pass the threshold
    pairwise_df = pairwise_df.query("correlation > @threshold")

    # Return an empty list if nothing is over correlation threshold
    if pairwise_df.shape[0] == 0:
        return []

    # Output the excluded features
    excluded = pairwise_df.apply(
        lambda x: determine_high_cor_pair(x, variable_cor_sum), axis="columns"
    )

    return list(set(excluded.tolist()))
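
# Minimal sketch: two perfectly correlated columns, one of which should be
# flagged for removal. Toy data only; the helper functions
# (get_pairwise_correlation, determine_high_cor_pair) are assumed importable.
import pandas as pd

toy_df = pd.DataFrame(
    {
        "Cells_a": [1.0, 2.0, 3.0, 4.0],
        "Cells_b": [2.0, 4.0, 6.0, 8.0],  # identical signal to Cells_a
        "Nuclei_c": [4.0, 1.0, 3.0, 2.0],
    }
)

to_drop = correlation_threshold(toy_df, features=toy_df.columns.tolist(), threshold=0.9)
print(to_drop)  # expected: one of Cells_a / Cells_b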
def test_batch_effect_contribution(df, n_components, pca_columns, model_formula):
    features = infer_cp_features(df)
    meta_features = infer_cp_features(df, metadata=True)

    feature_df = df.loc[:, features]

    pca = decomposition.PCA(n_components=n_components).fit(feature_df)
    pca_batch_df = pca.transform(feature_df)

    pca_batch_df = pd.concat(
        [
            df.loc[:, meta_features],
            pd.DataFrame(pca_batch_df, columns=pca_columns),
        ],
        axis="columns",
    )

    melt_df = pd.melt(
        pca_batch_df,
        id_vars=meta_features,
        value_vars=pca_columns,
        var_name="pca_component",
        value_name="pca_value",
    )

    anova_results = []
    for pca_component in pca_columns:
        subset_melt_df = melt_df.query("pca_component == @pca_component")

        # Setup model
        model = ols(model_formula, data=subset_melt_df).fit()

        # Generate ANOVA table
        anova_table = (
            sm.stats.anova_lm(model, typ=2)
            .reset_index()
            .rename({"index": "factor"}, axis="columns")
            .assign(pca=pca_component)
        )
        anova_results.append(anova_table)

    anova_results = pd.concat(anova_results).reset_index(drop=True).dropna()
    # NOTE: `batch` is not defined in this function; it is expected to exist
    # in the enclosing (notebook) scope when this helper is called
    anova_results = anova_results.assign(
        neg_log_p=-np.log10(anova_results.loc[:, "PR(>F)"]), batch=batch
    )
    anova_results.pca = pd.Categorical(anova_results.pca, categories=pca_columns)
    anova_results = anova_results.assign(
        component_number=[int(x.split("_")[1]) for x in anova_results.pca]
    )
    return anova_results
def variance_threshold(
    population_df, features="infer", samples="all", freq_cut=0.05, unique_cut=0.01
):
    """
    Exclude features that have low variance (low information content)

    Arguments:
    population_df - pandas DataFrame that includes metadata and observation features
    features - a list of features present in the population dataframe [default: "infer"]
               if "infer", then assume cell painting features are those that start with
               "Cells_", "Nuclei_", or "Cytoplasm_"
    samples - list samples to perform operation on [default: "all"]
              - if "all", use all samples to calculate
    freq_cut - float of ratio (second most common feature value / most common)
               [default: 0.05]
    unique_cut - float of ratio (num unique features / num samples) [default: 0.01]

    Return:
    list of features to exclude from the population_df
    """
    assert 0 <= freq_cut <= 1, "freq_cut variable must be between (0 and 1)"
    assert 0 <= unique_cut <= 1, "unique_cut variable must be between (0 and 1)"

    # Subset dataframe
    if samples != "all":
        population_df = population_df.loc[samples, :]

    if features == "infer":
        features = infer_cp_features(population_df)

    population_df = population_df.loc[:, features]

    # Test if excluded for low frequency
    excluded_features_freq = population_df.apply(
        lambda x: calculate_frequency(x, freq_cut), axis="rows"
    )
    excluded_features_freq = excluded_features_freq[
        excluded_features_freq.isna()
    ].index.tolist()

    # Test if excluded for uniqueness
    n = population_df.shape[0]
    num_unique_features = population_df.nunique()

    unique_ratio = num_unique_features / n
    unique_ratio = unique_ratio < unique_cut
    excluded_features_unique = unique_ratio[unique_ratio].index.tolist()

    excluded_features = list(set(excluded_features_freq + excluded_features_unique))
    return excluded_features
def load_data(return_meta=False, shuffle_row_order=False, holdout=False, othertreatment=False):
    output_data_dict = {"train": {}, "test": {}}

    train_file = pathlib.Path("data", "single_cell_train.tsv.gz")
    train_df = pd.read_csv(train_file, sep="\t")

    test_file = pathlib.Path("data", "single_cell_test.tsv.gz")
    test_df = pd.read_csv(test_file, sep="\t")

    if shuffle_row_order:
        train_df = train_df.sample(frac=1).reset_index(drop=True)
        test_df = test_df.sample(frac=1).reset_index(drop=True)

    cp_features = infer_cp_features(train_df)
    output_data_dict["train"]["x"] = train_df.loc[:, cp_features]
    output_data_dict["test"]["x"] = test_df.loc[:, cp_features]

    if holdout:
        output_data_dict["holdout"] = {}
        holdout_file = pathlib.Path("data", "single_cell_holdout.tsv.gz")
        holdout_df = pd.read_csv(holdout_file, sep="\t")
        if shuffle_row_order:
            holdout_df = holdout_df.sample(frac=1).reset_index(drop=True)
        output_data_dict["holdout"]["x"] = holdout_df.loc[:, cp_features]

    if othertreatment:
        output_data_dict["othertreatment"] = {}
        other_file = pathlib.Path("data", "single_cell_othertreatment.tsv.gz")
        other_df = pd.read_csv(other_file, sep="\t")
        if shuffle_row_order:
            other_df = other_df.sample(frac=1).reset_index(drop=True)
        output_data_dict["othertreatment"]["x"] = other_df.loc[:, cp_features]

    if return_meta:
        output_data_dict["train"]["meta"] = train_df.drop(cp_features, axis="columns")
        output_data_dict["test"]["meta"] = test_df.drop(cp_features, axis="columns")
        if holdout:
            output_data_dict["holdout"]["meta"] = holdout_df.drop(
                cp_features, axis="columns"
            )
        if othertreatment:
            output_data_dict["othertreatment"]["meta"] = other_df.drop(
                cp_features, axis="columns"
            )

    return output_data_dict
def transform(df, features="infer", meta_features="infer", operation="zeroone"):
    if features == "infer":
        features = infer_cp_features(df)
    if meta_features == "infer":
        meta_features = infer_cp_features(df, metadata=True)

    feature_df = df.loc[:, features]
    meta_df = df.loc[:, meta_features]

    if operation == "zeroone":
        scaler = sklearn.preprocessing.MinMaxScaler()
        feature_df = pd.DataFrame(
            scaler.fit_transform(feature_df),
            index=feature_df.index,
            columns=feature_df.columns,
        )

    output_df = pd.concat([meta_df, feature_df], axis="columns")
    return output_df
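
# Minimal sketch: min-max scaling feature columns to [0, 1] while carrying
# metadata columns through untouched. Toy data only.
import pandas as pd

toy_df = pd.DataFrame(
    {
        "Metadata_Well": ["A01", "A02", "A03"],
        "Cells_area": [10.0, 20.0, 30.0],
    }
)

scaled_df = transform(toy_df)
print(scaled_df.Cells_area.tolist())  # expected: [0.0, 0.5, 1.0]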
def process_umap(data_df):
    # Prepare UMAP input by removing metadata columns
    metadata_cols = infer_cp_features(data_df, metadata=True)

    metadata_df = data_df.loc[:, metadata_cols]
    umap_data_df = data_df.drop(metadata_cols, axis="columns")

    # Apply UMAP
    reducer = umap.UMAP(random_state=123)
    embedding = reducer.fit_transform(umap_data_df)

    # Setup plotting logic
    embedding_df = pd.DataFrame(embedding, columns=["x", "y"])
    embedding_df = embedding_df.merge(metadata_df, left_index=True, right_index=True)

    return embedding_df
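
# Minimal sketch of process_umap. Requires the umap-learn package; the input
# is assumed to mix Metadata_ columns with compartment feature columns, and
# the random toy data below is purely illustrative.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
toy_df = pd.DataFrame(
    rng.normal(size=(50, 4)), columns=[f"Cells_f{i}" for i in range(4)]
)
toy_df.insert(0, "Metadata_Well", [f"W{i}" for i in range(50)])

embedding_df = process_umap(toy_df)
print(embedding_df.columns.tolist())  # ['x', 'y', 'Metadata_Well']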
def normalize_sc(sc_df, scaler_method="standard"):
    sc_df = sc_df.reset_index(drop=True)
    cp_features = infer_cp_features(sc_df)
    meta_df = sc_df.drop(cp_features, axis="columns")
    meta_df.columns = [
        x if x.startswith("Metadata_") else f"Metadata_{x}" for x in meta_df.columns
    ]

    sc_df = sc_df.loc[:, cp_features]
    if scaler_method == "standard":
        scaler = StandardScaler()

    sc_df = pd.DataFrame(
        scaler.fit_transform(sc_df), index=sc_df.index, columns=sc_df.columns
    )

    sc_df = meta_df.merge(sc_df, left_index=True, right_index=True)
    return sc_df
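
# Minimal sketch: standardizing single-cell features and prefixing metadata
# columns with "Metadata_" when the prefix is missing. Toy data only.
import pandas as pd

toy_sc_df = pd.DataFrame(
    {
        "plate": ["p1", "p1", "p1"],       # will become Metadata_plate
        "Cells_area": [10.0, 20.0, 30.0],
    }
)

norm_df = normalize_sc(toy_sc_df)
print(norm_df.columns.tolist())   # ['Metadata_plate', 'Cells_area']
print(norm_df.Cells_area.mean())  # ~0 after standardization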
def annotate(
    profiles,
    platemap,
    join_on=["Metadata_well_position", "Metadata_Well"],
    output_file="none",
    add_metadata_id_to_platemap=True,
    format_broad_cmap=False,
    clean_cellprofiler=True,
    external_metadata="none",
    external_join_left="none",
    external_join_right="none",
    compression_options=None,
    float_format=None,
    cmap_args={},
):
    """Add metadata to aggregated profiles.

    Parameters
    ----------
    profiles : pandas.core.frame.DataFrame or file
        DataFrame or file path of profiles.
    platemap : pandas.core.frame.DataFrame or file
        Dataframe or file path of platemap metadata.
    join_on : list or str, default: ["Metadata_well_position", "Metadata_Well"]
        Which variables to merge profiles and plate. The first element indicates
        variable(s) in platemap and the second element indicates variable(s) in
        profiles to merge using. Note the setting of `add_metadata_id_to_platemap`.
    output_file : str, optional
        If not specified, will return the annotated profiles. We recommend
        that this output file be suffixed with "_augmented.csv".
    add_metadata_id_to_platemap : bool, default True
        Whether the plate map variables possibly need "Metadata" pre-pended.
    format_broad_cmap : bool, default False
        Whether we need to add columns to make compatible with Broad CMAP naming
        conventions.
    clean_cellprofiler : bool, default True
        Clean specific CellProfiler feature names.
    external_metadata : str, optional
        File with additional metadata information.
    external_join_left : str, optional
        Merge column in the profile metadata.
    external_join_right : str, optional
        Merge column in the external metadata.
    compression_options : str or dict, optional
        Contains compression options as input to
        pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2.
    float_format : str, optional
        Decimal precision to use in writing output file as input to
        pd.DataFrame.to_csv(float_format=float_format). For example, use "%.3g"
        for 3 decimal precision.
    cmap_args : dict, default {}
        Potential keyword arguments for annotate_cmap().
        See cyto_utils/annotate_custom.py for more details.

    Returns
    -------
    annotated : pandas.core.frame.DataFrame, optional
        DataFrame of annotated features. If output_file="none", then return the
        DataFrame. If you specify output_file, then write to file and do not
        return data.
    """
    # Load Data
    profiles = load_profiles(profiles)
    platemap = load_platemap(platemap, add_metadata_id_to_platemap)

    annotated = platemap.merge(
        profiles, left_on=join_on[0], right_on=join_on[1], how="inner"
    ).drop(join_on[0], axis="columns")

    # Add specific Connectivity Map (CMAP) formatting
    if format_broad_cmap:
        annotated = annotate_cmap(annotated, annotate_join_on=join_on[1], **cmap_args)

    if clean_cellprofiler:
        annotated = cp_clean(annotated)

    if not isinstance(external_metadata, pd.DataFrame):
        if external_metadata != "none":
            assert os.path.exists(
                external_metadata
            ), "external metadata at {} does not exist".format(external_metadata)

            external_metadata = pd.read_csv(external_metadata)

    if isinstance(external_metadata, pd.DataFrame):
        external_metadata.columns = [
            "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
            for x in external_metadata.columns
        ]

        annotated = (
            annotated.merge(
                external_metadata,
                left_on=external_join_left,
                right_on=external_join_right,
                how="left",
            )
            .reset_index(drop=True)
            .drop_duplicates()
        )

    # Reorder annotated metadata columns
    meta_cols = infer_cp_features(annotated, metadata=True)
    other_cols = annotated.drop(meta_cols, axis="columns").columns.tolist()

    annotated = annotated.loc[:, meta_cols + other_cols]

    if output_file != "none":
        output(
            df=annotated,
            output_filename=output_file,
            compression_options=compression_options,
            float_format=float_format,
        )
    else:
        return annotated
def test_merge_single_cells():
    sc_merged_df = ap.merge_single_cells()

    # Assert that the image data was merged
    assert all(x in sc_merged_df.columns for x in ["Metadata_Plate", "Metadata_Well"])

    # Assert that metadata columns were renamed appropriately
    for x in ap.full_merge_suffix_rename:
        assert ap.full_merge_suffix_rename[x] == "Metadata_{x}".format(x=x)

    # Perform a manual merge
    manual_merge = cytoplasm_df.merge(
        cells_df,
        left_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Cells"],
        right_on=["TableNumber", "ImageNumber", "ObjectNumber"],
        suffixes=["_cytoplasm", "_cells"],
    ).merge(
        nuclei_df,
        left_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Nuclei"],
        right_on=["TableNumber", "ImageNumber", "ObjectNumber"],
        suffixes=["_cytoplasm", "_nuclei"],
    )

    manual_merge = image_df.merge(manual_merge, on=ap.merge_cols, how="right").rename(
        ap.full_merge_suffix_rename, axis="columns"
    )

    # Confirm that the merge correctly reversed the object number (opposite from Parent)
    assert (
        sc_merged_df.Metadata_ObjectNumber_cytoplasm.tolist()[::-1]
        == sc_merged_df.Metadata_ObjectNumber.tolist()
    )
    assert (
        manual_merge.Metadata_ObjectNumber_cytoplasm.tolist()[::-1]
        == sc_merged_df.Metadata_ObjectNumber.tolist()
    )
    assert (
        manual_merge.Metadata_ObjectNumber_cells.tolist()
        == sc_merged_df.Metadata_ObjectNumber.tolist()
    )

    # Confirm the merge and adding merge options
    for method in ["standardize", "robustize"]:
        for samples in ["all", "Metadata_ImageNumber == 'x'"]:
            for features in ["infer", ["Cytoplasm_a", "Cells_a"]]:

                norm_method_df = ap.merge_single_cells(
                    single_cell_normalize=True,
                    normalize_args={
                        "method": method,
                        "samples": samples,
                        "features": features,
                    },
                )

                manual_merge_normalize = normalize(
                    manual_merge, method=method, samples=samples, features=features
                )

                pd.testing.assert_frame_equal(
                    norm_method_df.sort_index(axis=1),
                    manual_merge_normalize.sort_index(axis=1),
                )

    # Test non-canonical compartment merging
    new_sc_merge_df = ap_new.merge_single_cells()

    assert sum(new_sc_merge_df.columns.str.startswith("New")) == 4
    assert (
        new_compartment_df.ObjectNumber.tolist()[::-1]
        == new_sc_merge_df.Metadata_ObjectNumber_new.tolist()
    )

    norm_new_method_df = ap_new.merge_single_cells(
        single_cell_normalize=True,
        normalize_args={
            "method": "standardize",
            "samples": "all",
            "features": "infer",
        },
    )

    norm_new_method_no_feature_infer_df = ap_new.merge_single_cells(
        single_cell_normalize=True,
        normalize_args={
            "method": "standardize",
            "samples": "all",
        },
    )

    default_feature_infer_df = ap_new.merge_single_cells(single_cell_normalize=True)

    pd.testing.assert_frame_equal(norm_new_method_df, default_feature_infer_df)
    pd.testing.assert_frame_equal(
        norm_new_method_df, norm_new_method_no_feature_infer_df
    )

    new_compartment_cols = infer_cp_features(
        new_compartment_df, compartments=ap_new.compartments
    )

    traditional_norm_df = normalize(
        ap_new.image_df.merge(new_compartment_df, on=ap.merge_cols),
        features=new_compartment_cols,
        samples="all",
        method="standardize",
    )

    pd.testing.assert_frame_equal(
        norm_new_method_df.loc[:, new_compartment_cols].abs().describe(),
        traditional_norm_df.loc[:, new_compartment_cols].abs().describe(),
    )
barcode_col = "Metadata_pert_name"
gene_col = "Metadata_gene_name"

replicate_group_grit = {"replicate_id": barcode_col, "group_id": gene_col}

control_group_cut = ["Chr2", "Luc", "LacZ"]

control_barcodes = (
    df.loc[
        df[replicate_group_grit["group_id"]].isin(control_group_cut),
        replicate_group_grit["replicate_id"],
    ]
    .unique()
    .tolist()
)

control_barcodes


# In[5]:


all_features = infer_cp_features(df, compartments=compartments)
meta_features = infer_cp_features(df, metadata=True)

meta_features


# In[6]:


grit_compartment_results = []
for cell_line in df.Metadata_cell_line.unique():
    for compartment in compartments:
        compartment_features = infer_cp_features(df, compartments=compartment)
        for drop in [True, False]:
            if drop:
                subset_df = df.drop(compartment_features, axis="columns")
def normalize(
    profiles,
    features="infer",
    meta_features="infer",
    samples="all",
    method="standardize",
    output_file="none",
    compression=None,
    float_format=None,
    whiten_center=True,
    whiten_method="ZCA",
):
    """
    Normalize features

    Arguments:
    profiles - either pandas DataFrame or a file that stores profile data
    features - list of cell painting features [default: "infer"]
               if "infer", then assume cell painting features are those that start
               with "Cells", "Nuclei", or "Cytoplasm"
    meta_features - if specified, then output these with specified features
                    [default: "infer"]
    samples - string indicating which metadata column and values to use to subset
              the control samples are often used here [default: 'all']
              the format of this variable will be used in a pd.query() function. An
              example is "Metadata_treatment == 'control'" (include all quotes)
    method - string indicating how the dataframe will be normalized
             [default: 'standardize']
    output_file - [default: "none"] if provided, will write annotated profiles to file
                  if not specified, will return the annotated profiles. We recommend
                  that this output file be suffixed with "_normalized.csv".
    compression - the mechanism to compress [default: None]
    float_format - decimal precision to use in writing output file [default: None]
                   For example, use "%.3g" for 3 decimal precision.
    whiten_center - if data should be centered before whitening transform
                    [default: True] (only used if method = "whiten")
    whiten_method - the type of whitening normalization used [default: 'ZCA']
                    (only used if method = "whiten")

    Return:
    A normalized DataFrame
    """
    # Load Data
    profiles = load_profiles(profiles)

    # Define which scaler to use
    method = method.lower()

    avail_methods = ["standardize", "robustize", "mad_robustize", "whiten"]
    assert method in avail_methods, "operation must be one {}".format(avail_methods)

    if method == "standardize":
        scaler = StandardScaler()
    elif method == "robustize":
        scaler = RobustScaler()
    elif method == "mad_robustize":
        scaler = RobustMAD()
    elif method == "whiten":
        scaler = Whiten(center=whiten_center, method=whiten_method)

    if features == "infer":
        features = infer_cp_features(profiles)

    # Separate out the features and meta
    feature_df = profiles.loc[:, features]
    if meta_features == "infer":
        meta_features = infer_cp_features(profiles, metadata=True)

    meta_df = profiles.loc[:, meta_features]

    # Fit the sklearn scaler
    if samples == "all":
        fitted_scaler = scaler.fit(feature_df)
    else:
        # Subset to only the features measured in the sample query
        fitted_scaler = scaler.fit(profiles.query(samples).loc[:, features])

    # Scale the feature dataframe
    feature_df = pd.DataFrame(
        fitted_scaler.transform(feature_df),
        columns=feature_df.columns,
        index=feature_df.index,
    )

    normalized = meta_df.merge(feature_df, left_index=True, right_index=True)

    if output_file != "none":
        output(
            df=normalized,
            output_filename=output_file,
            compression=compression,
            float_format=float_format,
        )
    else:
        return normalized
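
# Minimal usage sketch (hypothetical file name and metadata column): fit the
# standardization scaler on DMSO control wells only, then apply it to the
# whole plate.
normalized_df = normalize(
    profiles="profiles/plate1_augmented.csv.gz",
    method="standardize",
    samples="Metadata_broad_sample == 'DMSO'",
)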
plate_files = batch_dict["plate_files"]
plates = batch_dict["plates"]
for plate in plates:
    print("Now auditing... Batch: {}; Plate: {}".format(batch, plate))

    audit_output_dir = os.path.join(output_dir, batch, plate)
    os.makedirs(audit_output_dir, exist_ok=True)

    figure_output_dir = os.path.join(figure_dir, batch, plate)
    os.makedirs(figure_output_dir, exist_ok=True)

    audit_output_file = os.path.join(audit_output_dir, "{}_audit.csv".format(plate))

    df = pd.read_csv(plate_files[plate])

    # Determine feature class
    features = infer_cp_features(df)
    meta_features = infer_cp_features(df, metadata=True)

    # Calculate and process pairwise similarity matrix
    audit_df = metric_melt(
        df=df,
        features=features,
        metadata_features=meta_features,
        similarity_metric="pearson",
        eval_metric="replicate_reproducibility",
    )

    audit_df = assign_replicates(
        similarity_melted_df=audit_df, replicate_groups=audit_cols
    )

    # What is 95% of the non replicate null distribution
    cutoff = audit_df.query(
def annotate(
    profiles,
    platemap,
    cell_id="unknown",
    join_on=["Metadata_well_position", "Metadata_Well"],
    output_file="none",
    add_metadata_id_to_platemap=True,
    format_broad_cmap=False,
    perturbation_mode="none",
    external_metadata="none",
    external_join_left="none",
    external_join_right="none",
    compression=None,
    float_format=None,
):
    """
    Add metadata to aggregated profiles

    Arguments:
    profiles - either pandas DataFrame or a file that stores profile data
    platemap - either pandas DataFrame or a file that stores platemap metadata
    cell_id - [default: "unknown"] provide a string to annotate cell id column
    join_on - list of length two indicating which variables to merge profiles and plate
              [default: ["Metadata_well_position", "Metadata_Well"]]. The first element
              indicates variable(s) in platemap and the second element indicates
              variable(s) in profiles to merge using.
              Note the setting of `add_metadata_id_to_platemap`
    output_file - [default: "none"] if provided, will write annotated profiles to file
                  if not specified, will return the annotated profiles. We recommend
                  that this output file be suffixed with "_augmented.csv".
    add_metadata_id_to_platemap - boolean if the platemap variables should be recoded
    format_broad_cmap - [default: False] boolean if we need to add columns to make
                        compatible with Broad CMAP naming conventions.
    perturbation_mode - [default: "none"] - either "chemical", "genetic" or "none"
                        and only active if format_broad_cmap == True
    external_metadata - [default: "none"] a string indicating a file with additional
                        metadata information
    external_join_left - [default: "none"] the merge column in the profile metadata
    external_join_right - [default: "none"] the merge column in the external metadata
    compression - the mechanism to compress [default: None]
    float_format - decimal precision to use in writing output file [default: None]
                   For example, use "%.3g" for 3 decimal precision.

    Return:
    Pandas DataFrame of annotated profiles or written to file
    """
    # Load Data
    profiles = load_profiles(profiles)
    platemap = load_platemap(platemap, add_metadata_id_to_platemap)

    annotated = platemap.merge(
        profiles, left_on=join_on[0], right_on=join_on[1], how="inner"
    ).drop(join_on[0], axis="columns")

    # Add specific Connectivity Map (CMAP) formatting
    if format_broad_cmap:

        pert_opts = ["none", "chemical", "genetic"]
        assert (
            perturbation_mode in pert_opts
        ), "perturbation mode must be one of {}".format(pert_opts)

        assert (
            "Metadata_broad_sample" in annotated.columns
        ), "Are you sure this is a CMAP file? 'Metadata_broad_sample column not found.'"

        annotated = annotated.assign(
            Metadata_pert_id=annotated.Metadata_broad_sample.str.extract(
                r"(BRD[-N][A-Z0-9]+)"
            ),
            Metadata_pert_mfc_id=annotated.Metadata_broad_sample,
            Metadata_pert_well=annotated.loc[:, join_on[1]],
            Metadata_pert_id_vendor="",
        )

        if "Metadata_pert_iname" in annotated.columns:
            annotated = annotated.assign(
                Metadata_pert_mfc_desc=annotated.Metadata_pert_iname,
                Metadata_pert_name=annotated.Metadata_pert_iname,
            )

        if "Metadata_cell_id" not in annotated.columns:
            annotated = annotated.assign(Metadata_cell_id=cell_id)

        if perturbation_mode == "chemical":
            annotated = annotated.assign(
                Metadata_broad_sample_type=[
                    "control" if x in ["DMSO", np.nan] else "trt"
                    for x in annotated.Metadata_broad_sample
                ]
            )

            # Generate Metadata_broad_sample column
            annotated.loc[
                annotated.Metadata_broad_sample_type == "control",
                "Metadata_broad_sample",
            ] = "DMSO"
            annotated.loc[
                annotated.Metadata_broad_sample == "empty", "Metadata_broad_sample_type"
            ] = "empty"

            if "Metadata_mmoles_per_liter" in annotated.columns:
                annotated.loc[
                    annotated.Metadata_broad_sample_type == "control",
                    "Metadata_mmoles_per_liter",
                ] = 0

            if "Metadata_solvent" in annotated.columns:
                annotated = annotated.assign(
                    Metadata_pert_vehicle=annotated.Metadata_solvent
                )
            if "Metadata_mg_per_ml" in annotated.columns:
                annotated.loc[
                    annotated.Metadata_broad_sample_type == "control",
                    "Metadata_mg_per_ml",
                ] = 0

        if perturbation_mode == "genetic":
            if "Metadata_pert_name" in annotated.columns:
                annotated = annotated.assign(
                    Metadata_broad_sample_type=[
                        "control" if x == "EMPTY" else "trt"
                        for x in annotated.Metadata_pert_name
                    ]
                )

        if "Metadata_broad_sample_type" in annotated.columns:
            annotated = annotated.assign(
                Metadata_pert_type=annotated.Metadata_broad_sample_type
            )
        else:
            annotated = annotated.assign(
                Metadata_pert_type="", Metadata_broad_sample_type=""
            )

    # Add optional external metadata
    if not isinstance(external_metadata, pd.DataFrame):
        if external_metadata != "none":
            assert os.path.exists(
                external_metadata
            ), "external metadata at {} does not exist".format(external_metadata)

            external_metadata = pd.read_csv(external_metadata)

    if isinstance(external_metadata, pd.DataFrame):
        external_metadata.columns = [
            "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
            for x in external_metadata.columns
        ]

        annotated = (
            annotated.merge(
                external_metadata,
                left_on=external_join_left,
                right_on=external_join_right,
                how="left",
            )
            .reset_index(drop=True)
            .drop_duplicates()
        )

    # Reorder annotated metadata columns
    meta_cols = infer_cp_features(annotated, metadata=True)
    other_cols = annotated.drop(meta_cols, axis="columns").columns.tolist()

    annotated = annotated.loc[:, meta_cols + other_cols]

    if output_file != "none":
        output(
            df=annotated,
            output_filename=output_file,
            compression=compression,
            float_format=float_format,
        )
    else:
        return annotated
def write_gct(
    profiles,
    output_file,
    features="infer",
    meta_features="infer",
    feature_metadata="none",
    version="#1.3",
):
    """
    Convert profiles to a .gct file

    Arguments:
    profiles - either pandas DataFrame or a file that stores profile data
    output_file - the name of the gct file to save processed data to
    features - a list of features present in the population dataframe [default: "infer"]
               if "infer", then assume cell painting features are those that start with
               "Cells_", "Nuclei_", or "Cytoplasm_"
    meta_features - if specified, then output these values in the gct file
                    [default: "infer"]
    feature_metadata - pandas DataFrame linking features to additional metadata
                       [default: "none"]

    Return:
    Writes a gct file to output_file
    """
    # Note, only version 1.3 is currently supported
    assert version == "#1.3", "Only version #1.3 is currently supported."

    # Step 1: Create first two rows of data
    if features == "infer":
        features = infer_cp_features(profiles)

    feature_df = profiles.loc[:, features].reset_index(drop=True).transpose()

    # Separate out metadata features
    if meta_features == "infer":
        meta_features = infer_cp_features(profiles, metadata=True)

    metadata_df = profiles.loc[:, meta_features]

    # Step 2: Get the sample metadata portion of the output file
    metadata_part = metadata_df.transpose()
    metadata_part.columns = ["SAMPLE_{}".format(x) for x in metadata_part.columns]
    metadata_part = (
        metadata_part.transpose()
        .reset_index()
        .rename({"index": "id"}, axis="columns")
        .transpose()
    )
    metadata_part.index = [x.replace("Metadata_", "") for x in metadata_part.index]

    nrow_feature, ncol_features = feature_df.shape
    _, ncol_metadata = metadata_df.shape

    # Step 3: Compile feature metadata
    full_df = pd.concat([metadata_part, feature_df], axis="rows")
    if isinstance(feature_metadata, pd.DataFrame):
        nrow_metadata = feature_metadata.shape[1]

        assert (
            "id" in feature_metadata.index.tolist()
        ), "make sure feature metadata has row named 'id' that stores feature metadata names!"

        full_df = feature_metadata.merge(
            full_df, how="right", left_index=True, right_index=True
        )
    else:
        feature_metadata = (
            ["cp_feature_name"] + [np.nan] * ncol_metadata + feature_df.index.tolist()
        )
        nrow_metadata = 1
        full_df.insert(0, column="feature_metadata", value=feature_metadata)

    full_df = full_df.reset_index()

    # Step 4: Compile all data dimensions
    data_dimensions = [nrow_feature, ncol_features, nrow_metadata, ncol_metadata]

    # Step 5: Write output gct file
    with open(output_file, "w", newline="") as gctfile:
        gctwriter = csv.writer(gctfile, delimiter="\t")
        gctwriter.writerow([version])
        gctwriter.writerow(data_dimensions)
        for feature, row in full_df.iterrows():
            gctwriter.writerow(row)
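
# Minimal usage sketch: export a profiles DataFrame (the `profiles_df`
# variable here is hypothetical) to GCT version 1.3 for downstream tools
# such as Morpheus.
write_gct(profiles=profiles_df, output_file="profiles/plate1.gct")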
]).reset_index(drop=True)

data_missing_df = pd.concat(
    [
        pd.DataFrame(
            {"g": "a", "Cells_x": [1, 3, 8, np.nan], "Nuclei_y": [5, np.nan, 3, 1]}
        ),
        pd.DataFrame(
            {"g": "b", "Cells_x": [1, 3, np.nan, 5], "Nuclei_y": [np.nan, 8, 3, 1]}
        ),
    ]
).reset_index(drop=True)

features = infer_cp_features(data_df)
dtype_convert_dict = {x: float for x in features}


def test_aggregate_median_allvar():
    """
    Testing aggregate pycytominer function
    """
    aggregate_result = aggregate(
        population_df=data_df, strata=["g"], features="infer", operation="median"
    )

    expected_result = pd.concat(
        [
            pd.DataFrame(
                {"g": "a",
batches


# In[4]:


batch_data = {}
all_clones = list()
profile_count_list = list()
for batch in batches:
    print("Now processing... {}".format(batch))
    df, batch_count, treatment_count = process_counts(batch, profile_dir=profile_dir)

    batch_data[batch] = {
        "dataframe": df,
        "metafeatures": infer_cp_features(df, metadata=True),
        "batch_count": batch_count,
        "treatment_count": treatment_count,
    }

    all_clones += treatment_count.Metadata_clone.unique().tolist()
    profile_count_list.append(
        treatment_count.loc[
            :, ["Metadata_clone", "Metadata_treatment", "profile_count"]
        ]
    )


# In[5]:


sample_count_df = (
    pd.DataFrame(
        pd.concat(profile_count_list, axis="rows")
        .fillna("DMSO")
        .reset_index(drop=True)
        .groupby(
            ["Metadata_clone",
# Output file info
output_dir = pathlib.Path("embeddings")
batch1_output_file = pathlib.Path(f"{output_dir}/cellpainting_embeddings_batch1.tsv.gz")
batch2_output_file = pathlib.Path(f"{output_dir}/cellpainting_embeddings_batch2.tsv.gz")


# In[4]:


# Load cell painting profiles
file = pathlib.Path(
    "cellpainting_lvl4_cpd_replicate_datasets", "cp_level4_cpd_replicates.csv.gz"
)
df = pd.read_csv(file, low_memory=False)

cp_features = infer_cp_features(df)
meta_features = infer_cp_features(df, metadata=True) + [
    "broad_id",
    "pert_iname",
    "moa",
    "replicate_name",
]

# Transform PCA to top 50 components
n_components = 50
pca = PCA(n_components=n_components)
pca_df = pca.fit_transform(df.loc[:, cp_features])
pca_df = pd.DataFrame(pca_df)
pca_df.columns = [f"PCA_{x}" for x in range(0, n_components)]

print(pca_df.shape)
pca_df.head()


# ## UMAP - Batch 1
def pipeline_feature_select(self, steps, suffix=None):
    feature_select_steps = steps
    pipeline_output = self.pipeline["output_dir"]

    level = feature_select_steps["level"]
    gct = feature_select_steps["gct"]
    feature_select_operations = feature_select_steps["operations"]
    feature_select_features = feature_select_steps["features"]

    all_plates_df = pd.DataFrame()

    for batch in self.profile_config:
        batch_df = pd.DataFrame()
        for plate in self.profile_config[batch]:
            output_dir = pathlib.PurePath(".", pipeline_output, batch, plate)
            if suffix:
                normalize_output_file = pathlib.PurePath(
                    output_dir, f"{plate}_normalized_{suffix}.csv.gz"
                )
                feature_select_output_file_plate = pathlib.PurePath(
                    output_dir,
                    f"{plate}_normalized_feature_select_{suffix}_plate.csv.gz",
                )
            else:
                normalize_output_file = pathlib.PurePath(
                    output_dir, f"{plate}_normalized.csv.gz"
                )
                feature_select_output_file_plate = pathlib.PurePath(
                    output_dir, f"{plate}_normalized_feature_select_plate.csv.gz"
                )

            if feature_select_features == "infer" and self.noncanonical:
                feature_select_features = cyto_utils.infer_cp_features(
                    pd.read_csv(normalize_output_file),
                    compartments=self.compartments,
                )

            df = pd.read_csv(normalize_output_file).assign(Metadata_batch=batch)

            if level == "plate":
                df = df.drop(columns=["Metadata_batch"])
                feature_select(
                    profiles=df,
                    features=feature_select_features,
                    operation=feature_select_operations,
                    output_file=feature_select_output_file_plate,
                    compression_options=self.pipeline_options["compression"],
                    float_format=self.pipeline_options["float_format"],
                )
            elif level == "batch":
                batch_df = concat_dataframes(batch_df, df)
            elif level == "all":
                all_plates_df = concat_dataframes(all_plates_df, df)

        if level == "batch":
            fs_df = feature_select(
                profiles=batch_df,
                features=feature_select_features,
                operation=feature_select_operations,
            )
            for plate in self.profile_config[batch]:
                output_dir = pathlib.PurePath(".", pipeline_output, batch, plate)
                if suffix:
                    feature_select_output_file_batch = pathlib.PurePath(
                        output_dir,
                        f"{plate}_normalized_feature_select_{suffix}_batch.csv.gz",
                    )
                else:
                    feature_select_output_file_batch = pathlib.PurePath(
                        output_dir,
                        f"{plate}_normalized_feature_select_batch.csv.gz",
                    )

                if feature_select_features == "infer" and self.noncanonical:
                    feature_select_features = cyto_utils.infer_cp_features(
                        batch_df, compartments=self.compartments
                    )

                df = fs_df.query("Metadata_Plate==@plate").reset_index(drop=True)
                df = df.drop(columns=["Metadata_batch"])
                cyto_utils.output(
                    output_filename=feature_select_output_file_batch,
                    df=df,
                    compression_options=self.pipeline_options["compression"],
                    float_format=self.pipeline_options["float_format"],
                )

            if gct:
                create_gct_directories(batch)
                if suffix:
                    stacked_file = pathlib.PurePath(
                        ".",
                        "gct",
                        batch,
                        f"{batch}_normalized_feature_select_{suffix}_batch.csv.gz",
                    )
                    gct_file = pathlib.PurePath(
                        ".",
                        "gct",
                        batch,
                        f"{batch}_normalized_feature_select_{suffix}_batch.gct",
                    )
                else:
                    stacked_file = pathlib.PurePath(
                        ".",
                        "gct",
                        batch,
                        f"{batch}_normalized_feature_select_batch.csv.gz",
                    )
                    gct_file = pathlib.PurePath(
                        ".",
                        "gct",
                        batch,
                        f"{batch}_normalized_feature_select_batch.gct",
                    )
                cyto_utils.output(
                    output_filename=stacked_file,
                    df=fs_df,
                    compression_options=self.pipeline_options["compression"],
                    float_format=self.pipeline_options["float_format"],
                )
                write_gct(profiles=fs_df, output_file=gct_file)

    if level == "all":
        fs_df = feature_select(
            profiles=all_plates_df,
            features=feature_select_features,
            operation=feature_select_operations,
        )
        for batch in self.profile_config:
            fs_batch_df = fs_df.loc[fs_df.Metadata_batch == batch].reset_index(
                drop=True
            )
            for plate in self.profile_config[batch]:
                output_dir = pathlib.PurePath(".", pipeline_output, batch, plate)
                if suffix:
                    feature_select_output_file_all = pathlib.PurePath(
                        output_dir,
                        f"{plate}_normalized_feature_select_{suffix}_all.csv.gz",
                    )
                else:
                    feature_select_output_file_all = pathlib.PurePath(
                        output_dir, f"{plate}_normalized_feature_select_all.csv.gz"
                    )

                if feature_select_features == "infer" and self.noncanonical:
                    feature_select_features = cyto_utils.infer_cp_features(
                        all_plates_df, compartments=self.compartments
                    )

                df = fs_batch_df.query("Metadata_Plate==@plate").reset_index(drop=True)
                df = df.drop(columns=["Metadata_batch"])
                cyto_utils.output(
                    output_filename=feature_select_output_file_all,
                    df=df,
                    compression_options=self.pipeline_options["compression"],
                    float_format=self.pipeline_options["float_format"],
                )

            if gct:
                create_gct_directories(batch)
                if suffix:
                    stacked_file = pathlib.PurePath(
                        ".",
                        "gct",
                        batch,
                        f"{batch}_normalized_feature_select_{suffix}_all.csv.gz",
                    )
                    gct_file = pathlib.PurePath(
                        ".",
                        "gct",
                        batch,
                        f"{batch}_normalized_feature_select_{suffix}_all.gct",
                    )
                else:
                    stacked_file = pathlib.PurePath(
                        ".",
                        "gct",
                        batch,
                        f"{batch}_normalized_feature_select_all.csv.gz",
                    )
                    gct_file = pathlib.PurePath(
                        ".",
                        "gct",
                        batch,
                        f"{batch}_normalized_feature_select_all.gct",
                    )
                cyto_utils.output(
                    output_filename=stacked_file,
                    df=fs_batch_df,
                    compression_options=self.pipeline_options["compression"],
                    float_format=self.pipeline_options["float_format"],
                )
                write_gct(profiles=fs_batch_df, output_file=gct_file)
def aggregate(
    population_df,
    strata=["Metadata_Plate", "Metadata_Well"],
    features="infer",
    operation="median",
    output_file="none",
    compute_object_count=False,
    object_feature="ObjectNumber",
    subset_data_df="none",
    compression_options=None,
    float_format=None,
):
    """Combine population dataframe variables by strata groups using given operation.

    Parameters
    ----------
    population_df : pandas.core.frame.DataFrame
        DataFrame to group and aggregate.
    strata : list of str, default ["Metadata_Plate", "Metadata_Well"]
        Columns to groupby and aggregate.
    features : list of str, default "infer"
        List of features that should be aggregated.
    operation : str, default "median"
        How the data is aggregated. Currently only supports one of ['mean', 'median'].
    output_file : str or file handle, optional
        If provided, will write aggregated profiles to file. If not specified, will
        return the aggregated profiles. We recommend naming the file based on the
        plate name.
    compute_object_count : bool, default False
        Whether or not to compute object counts.
    object_feature : str, default "ObjectNumber"
        Object number feature. Only used if compute_object_count=True.
    subset_data_df : pandas.core.frame.DataFrame
        How to subset the input.
    compression_options : str, optional
        The mechanism to compress.
    float_format : str, optional
        Decimal precision to use in writing output file.

    Returns
    -------
    pandas.core.frame.DataFrame
        DataFrame of aggregated features.
    """
    # Check that the operation is supported
    operation = check_aggregate_operation(operation)

    # Subset the data to specified samples
    if isinstance(subset_data_df, pd.DataFrame):
        population_df = subset_data_df.merge(
            population_df, how="left", on=subset_data_df.columns.tolist()
        ).reindex(population_df.columns, axis="columns")

    # Subset dataframe to only specified variables if provided
    strata_df = population_df.loc[:, strata]

    # Only extract single object column in preparation for count
    if compute_object_count:
        count_object_df = population_df.loc[:, np.union1d(strata, [object_feature])]
        count_object_df = (
            count_object_df.groupby(strata)[object_feature]
            .count()
            .reset_index()
            .rename(columns={f"{object_feature}": "Metadata_Object_Count"})
        )

    if features == "infer":
        features = infer_cp_features(population_df)
        population_df = population_df.loc[:, features]
    else:
        population_df = population_df.loc[:, features]

    # Fix dtype of input features (they should all be floats!)
    convert_dict = {x: float for x in features}
    population_df = population_df.astype(convert_dict)

    # Merge back metadata used to aggregate by
    population_df = pd.concat([strata_df, population_df], axis="columns")

    # Perform aggregating function
    population_df = population_df.groupby(strata, dropna=False)

    if operation == "median":
        population_df = population_df.median().reset_index()
    else:
        population_df = population_df.mean().reset_index()

    # Compute objects counts
    if compute_object_count:
        population_df = count_object_df.merge(population_df, on=strata, how="right")

    # Aggregated image number and object number do not make sense
    for col in ["ImageNumber", "ObjectNumber"]:
        if col in population_df.columns:
            population_df = population_df.drop([col], axis="columns")

    if output_file != "none":
        output(
            df=population_df,
            output_filename=output_file,
            compression_options=compression_options,
            float_format=float_format,
        )

    return population_df
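
# Minimal sketch: per-well aggregation with object counts from single-cell
# rows. Toy data; each row stands in for one segmented object.
import pandas as pd

toy_sc_df = pd.DataFrame(
    {
        "Metadata_Plate": ["p1"] * 4,
        "Metadata_Well": ["A01", "A01", "A02", "A02"],
        "ObjectNumber": [1, 2, 1, 2],
        "Cells_area": [10.0, 20.0, 30.0, 50.0],
    }
)

agg_df = aggregate(toy_sc_df, compute_object_count=True)
print(agg_df)  # Metadata_Object_Count == 2 per well, plus median Cells_area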
def pipeline_quality_control(self, steps):
    quality_control_steps = steps["operations"]
    pipeline_output = self.pipeline["output_dir"]

    summary_column_order = [
        "Batch_Name",
        "Plate_Name",
        "Well_Count",
        "Images_per_site",
        "Sites_per_well_Median",
        "Sites_per_well_mad",
    ]

    qc_dir = pathlib.PurePath(".", "quality_control")
    if not os.path.isdir(pathlib.PurePath(qc_dir)):
        os.mkdir(qc_dir)

    for step in quality_control_steps:
        if step == "summary":
            output_dir = pathlib.PurePath(".", "quality_control", "summary")
            if not os.path.isdir(pathlib.PurePath(output_dir)):
                os.mkdir(output_dir)
            output_file = pathlib.PurePath(output_dir, "summary.tsv")
            if os.path.isfile(output_file):
                summary = pd.read_csv(output_file, sep="\t")
            else:
                summary = pd.DataFrame()
            for batch in self.profile_config:
                for plate in self.profile_config[batch]:
                    input_file = pathlib.PurePath(
                        ".", "load_data_csv", batch, plate, "load_data.csv.gz"
                    )
                    df = pd.read_csv(input_file).assign(Metadata_batch=batch)
                    site_df = (
                        df.groupby(["Metadata_Row", "Metadata_Col"])
                        .Metadata_Site.count()
                        .reset_index()
                        .Metadata_Site
                    )
                    image_count = len(df.columns[df.columns.str.startswith("FileName")])
                    summary = summary.append(
                        {
                            "Batch_Name": batch,
                            "Plate_Name": plate,
                            "Well_Count": site_df.count(),
                            "Images_per_site": image_count,
                            "Sites_per_well_Median": site_df.median(),
                            "Sites_per_well_mad": "%.3f" % site_df.mad(),
                        },
                        ignore_index=True,
                    )

            summary["Well_Count"] = summary["Well_Count"].astype(int)
            summary["Images_per_site"] = summary["Images_per_site"].astype(int)
            summary["Sites_per_well_Median"] = summary["Sites_per_well_Median"].astype(
                int
            )
            summary = summary.drop_duplicates(
                subset=["Batch_Name", "Plate_Name"], keep="last"
            ).sort_values(by=["Batch_Name", "Plate_Name"])
            summary[summary_column_order].to_csv(output_file, sep="\t", index=False)
        elif step == "heatmap":
            output_dir = pathlib.PurePath(".", "quality_control", "heatmap")
            if not os.path.isdir(pathlib.PurePath(output_dir)):
                os.mkdir(output_dir)
            for batch in self.profile_config:
                for plate in self.profile_config[batch]:
                    input_file = pathlib.PurePath(
                        ".",
                        pipeline_output,
                        batch,
                        plate,
                        f"{plate}_augmented.csv.gz",
                    )
                    df = (
                        pd.read_csv(input_file)
                        .assign(Metadata_Row=lambda x: x.Metadata_Well.str[0:1])
                        .assign(Metadata_Col=lambda x: x.Metadata_Well.str[1:])
                    )
                    if "Metadata_Object_Count" in df.columns:
                        cell_count_feature = "Metadata_Object_Count"
                    else:
                        cell_count_feature = "Cytoplasm_Number_Object_Number"
                    df = df[["Metadata_Row", "Metadata_Col", cell_count_feature]]
                    df_pivot = df.pivot(
                        "Metadata_Row", "Metadata_Col", cell_count_feature
                    )
                    fig = px.imshow(df_pivot, color_continuous_scale="blues")
                    fig.update_layout(
                        title=f"Plate: {plate}, Feature: {cell_count_feature}",
                        xaxis=dict(title="", side="top"),
                        yaxis=dict(title=""),
                    )
                    fig.update_traces(xgap=1, ygap=1)
                    if not os.path.isdir(pathlib.PurePath(output_dir, batch)):
                        os.mkdir(pathlib.PurePath(output_dir, batch))
                    if not os.path.isdir(pathlib.PurePath(output_dir, batch, plate)):
                        os.mkdir(pathlib.PurePath(output_dir, batch, plate))
                    output_file = f"{output_dir}/{batch}/{plate}/{plate}_cell_count.png"
                    fig.write_image(output_file, width=640, height=480, scale=2)

                    if os.path.isfile(
                        pathlib.PurePath(
                            ".",
                            pipeline_output,
                            batch,
                            plate,
                            f"{plate}_normalized_feature_select_negcon_all.csv.gz",
                        )
                    ):
                        input_file = pathlib.PurePath(
                            ".",
                            pipeline_output,
                            batch,
                            plate,
                            f"{plate}_normalized_feature_select_negcon_all.csv.gz",
                        )
                    elif os.path.isfile(
                        pathlib.PurePath(
                            ".",
                            pipeline_output,
                            batch,
                            plate,
                            f"{plate}_normalized_feature_select_negcon_batch.csv.gz",
                        )
                    ):
                        input_file = pathlib.PurePath(
                            ".",
                            pipeline_output,
                            batch,
                            plate,
                            f"{plate}_normalized_feature_select_negcon_batch.csv.gz",
                        )
                    elif os.path.isfile(
                        pathlib.PurePath(
                            ".",
                            pipeline_output,
                            batch,
                            plate,
                            f"{plate}_normalized_feature_select_negcon_plate.csv.gz",
                        )
                    ):
                        input_file = pathlib.PurePath(
                            ".",
                            pipeline_output,
                            batch,
                            plate,
                            f"{plate}_normalized_feature_select_negcon_plate.csv.gz",
                        )
                    else:
                        continue

                    df = pd.read_csv(input_file)
                    profiles = df[cyto_utils.infer_cp_features(df)]
                    corr_matrix = np.corrcoef(profiles)
                    corr_matrix_df = pd.DataFrame(
                        corr_matrix,
                        columns=list(df.Metadata_Well),
                        index=list(df.Metadata_Well),
                    )
                    fig = px.imshow(corr_matrix_df, color_continuous_scale="BlueRed")
                    fig.update_layout(
                        title=f"Plate: {plate}, Correlation all vs. all",
                        xaxis=dict(title="Wells"),
                        yaxis=dict(title="Wells"),
                    )
                    output_file = f"{output_dir}/{batch}/{plate}/{plate}_correlation.png"
                    fig.write_image(output_file, width=640, height=480, scale=2)

                    corr_df = (
                        corr_matrix_df.stack()
                        .reset_index()
                        .rename(
                            columns={
                                "level_0": "Well_Row",
                                "level_1": "Well_Col",
                                0: "correlation",
                            }
                        )
                        .assign(Row=lambda x: x.Well_Row.str[0:1])
                        .assign(Col=lambda x: x.Well_Row.str[1:])
                    )
                    corr_df["same_row_col"] = corr_df.apply(
                        lambda x: str(x.Row) in str(x.Well_Col)
                        or str(x.Col) in str(x.Well_Col),
                        axis=1,
                    )
                    wells = list(df.Metadata_Well)
                    table_df = pd.DataFrame()
                    for well in wells:
                        signal = list(
                            corr_df.loc[
                                (corr_df.Well_Row == well) & (corr_df.same_row_col)
                            ]["correlation"]
                        )
                        null = list(
                            corr_df.loc[
                                (corr_df.Well_Row == well)
                                & (corr_df.same_row_col == False)
                            ]["correlation"]
                        )
                        perc_95 = np.nanpercentile(null, 95)
                        # Compare against the null cutoff elementwise; a bare
                        # list > float comparison would raise a TypeError
                        above_threshold = np.array(signal) > perc_95
                        value = np.mean(above_threshold.astype(float))
                        table_df = table_df.append(
                            {
                                "Metadata_Row": well[0:1],
                                "Metadata_Col": well[1:],
                                "value": value,
                            },
                            ignore_index=True,
                        )
                    df_pivot = table_df.pivot("Metadata_Row", "Metadata_Col", "value")
                    fig = px.imshow(df_pivot, color_continuous_scale="blues")
                    fig.update_layout(
                        title=f"Plate: {plate}, Position effect",
                        xaxis=dict(title="", side="top"),
                        yaxis=dict(title=""),
                    )
                    fig.update_traces(xgap=1, ygap=1)
                    output_file = (
                        f"{output_dir}/{batch}/{plate}/{plate}_position_effect.png"
                    )
                    fig.write_image(output_file, width=640, height=480, scale=2)
from cytominer_eval.transform import metric_melt
from cytominer_eval.operations.util import assign_replicates


# In[2]:


output_dir = "results"

file = pathlib.Path("data/cell_health_merged_feature_select.csv.gz")
cell_health_df = pd.read_csv(file)

print(cell_health_df.shape)
cell_health_df.head()


# In[3]:


features = infer_cp_features(cell_health_df)
meta_features = infer_cp_features(cell_health_df, metadata=True)

similarity_metric = "pearson"
operation = "percent_strong"

replicate_groups = ["Metadata_cell_line", "Metadata_gene_name", "Metadata_pert_name"]
control_ids = ["Chr2", "Luc", "LacZ"]


# In[4]:


# Melt the input profiles to long format
similarity_melted_df = metric_melt(
print(feature_df.shape)
feature_df.head()


# In[6]:


# Perform spherize transform
for file in data_files:
    # Extract plate from file name
    plate = str(file).split("/")[-1].split("_")[0]
    print(f"Now processing {plate}...")

    # Load data and apply feature selection
    df = pd.read_csv(file).reindex(feature_df.index, axis="columns")

    # Get feature names
    metadata_cols = ["Image_Metadata_Well"] + infer_cp_features(df, metadata=True)
    feature_cols = infer_cp_features(df, compartments=["Cells", "Cytoplasm", "Nuclei"])

    output_file = pathlib.Path(f"{data_dir}/{plate}_{output_file_suffix}")

    # Apply spherize transformation and output files
    normalize(
        profiles=df,
        features=feature_cols,
        meta_features=metadata_cols,
        method="spherize",
        spherize_method="ZCA-cor",
        spherize_center=True,
        output_file=output_file,
    )
# In[17]:


# Other data
other_df = pd.concat(other_dict_df).sample(frac=1).reset_index(drop=True)
other_df = normalize_sc(other_df, scaler_method=scaler_method)

print(other_df.shape)


# ## Apply Feature Selection

# In[18]:


meta_features = infer_cp_features(train_df, metadata=True)
meta_features


# In[19]:


train_df = feature_select(
    train_df,
    operation=feature_select_opts,
    na_cutoff=na_cutoff,
    corr_threshold=corr_threshold,
)

selected_features = infer_cp_features(train_df)
reindex_features = meta_features + selected_features
def aggregate_deep(self):
    """
    Main function of this class. Aggregates the profiles into a pandas dataframe.

    For each key in file_aggregate, the profiles are loaded, concatenated and then
    aggregated. If files are missing, we throw a warning but continue the code.
    After aggregation, the metadata is concatenated back onto the dataframe.

    Returns
    -------
    df_out : pandas.DataFrame
        DataFrame with all metadata and the feature space. This is the input to any
        further pycytominer or pycytominer-eval processing.
    """
    if not hasattr(self, "file_aggregate"):
        self.setup_aggregate()

    self.aggregated_profiles = []
    self.aggregate_merge_col = f"Metadata_{self.aggregate_on.capitalize()}_Position"

    # Iterates over all sites, wells or plates
    for metadata_level in self.file_aggregate:
        # uses custom load function to create df with metadata and profiles
        arr = [load_npz(x) for x in self.file_aggregate[metadata_level]["files"]]
        # empty dataframes from missing files are deleted
        arr = [x for x in arr if not x.empty]
        # if no files were found there is a mismatch between the index and the
        # output files
        if not len(arr):
            warnings.warn(
                f"No files for the key {metadata_level} could be found.\nThis program will continue, but be aware that this might induce errors!"
            )
            continue
        df = pd.concat(arr)

        # extract metadata prior to aggregation
        meta_df = pd.DataFrame()
        metadata_cols = infer_cp_features(df, metadata=True)
        profiles = [x for x in df.columns.tolist() if x not in metadata_cols]

        # If all rows have the same Metadata information, that value is valid
        # for the aggregated df
        for col in metadata_cols:
            if len(df[col].unique()) == 1:
                meta_df[col] = [df[col].unique()[0]]

        # perform the aggregation
        df = df.assign(Metadata_Aggregate_On=self.aggregate_on)
        df = aggregate.aggregate(
            population_df=df,
            strata="Metadata_Aggregate_On",
            features=profiles,
            operation=self.aggregate_operation,
        ).reset_index(drop=True)

        # add the aggregation level as a column
        df.loc[:, self.aggregate_merge_col] = metadata_level
        # concatenate the metadata back onto the aggregated profile
        df = pd.concat([df, meta_df], axis=1)

        # save metalevel file
        if self.output_file != "none":
            if not os.path.exists(self.output_file):
                os.mkdir(self.output_file)
            file_path = os.path.join(
                self.output_file, metadata_level.replace("/", "_")
            )
            df.to_csv(f"{file_path}.csv", index=False)
        self.aggregated_profiles.append(df)

    # Concatenate all of the above created profiles
    self.aggregated_profiles = pd.concat(
        [x for x in self.aggregated_profiles]
    ).reset_index(drop=True)

    # clean and reindex columns
    self.aggregated_profiles.columns = [
        str(x) for x in self.aggregated_profiles.columns
    ]
    meta_features = infer_cp_features(self.aggregated_profiles, metadata=True)
    reindex_profiles = [str(x) for x in profiles]
    self.aggregated_profiles = self.aggregated_profiles.reindex(
        meta_features + reindex_profiles, axis="columns"
    )

    # If columns have NaN values from concatenation, drop these
    self.aggregated_profiles.dropna(axis="columns", inplace=True)

    df_out = self.aggregated_profiles
    return df_out