def load_data(
    plate,
    pycyto_dict,
    cyto_dict,
    level,
    round_decimals,
    well_col="Metadata_Well",
    plate_col="Metadata_Plate",
    sample_col="Metadata_broad_sample",
):
    # Extract file from file dictionary
    try:
        pycyto_file = pycyto_dict[plate][level]
        cyto_file = cyto_dict[plate][level]
    except KeyError:
        raise KeyError(f"Data not found, skipping! {plate}: {level}")

    # Load data
    pycyto_df = pd.read_csv(pycyto_file)
    try:
        cyto_df = pd.read_csv(cyto_file).drop(
            ["Cytoplasm_Parent_Cells", "Cytoplasm_Parent_Nuclei"],
            axis="columns")
    except KeyError:
        cyto_df = pd.read_csv(cyto_file)

    # Confirm metadata are aligned
    pd.testing.assert_series_equal(pycyto_df.loc[:, well_col],
                                   cyto_df.loc[:, well_col])
    pd.testing.assert_series_equal(pycyto_df.loc[:, plate_col],
                                   cyto_df.loc[:, plate_col])
    pd.testing.assert_series_equal(pycyto_df.loc[:, sample_col],
                                   cyto_df.loc[:, sample_col])

    # Align to CP Features only
    pycyto_features = infer_cp_features(pycyto_df)
    cyto_features = infer_cp_features(cyto_df)

    # Features must be the same before feature selection
    if level in ["level_3", "level_4a"]:
        assert set(pycyto_features) == set(
            cyto_features), "features should be aligned!"

    # Reindex and round data
    # Use sorted lists so the reindexed column order is deterministic
    pycyto_df = pycyto_df.reindex(sorted(set(pycyto_features)),
                                  axis="columns").round(round_decimals)
    cyto_df = cyto_df.reindex(sorted(set(cyto_features)),
                              axis="columns").round(round_decimals)

    # If we're testing pycytominer feature selection procedure,
    # align cyto data with pycyto features
    if level == "pycytominer_select":
        cyto_df = cyto_df.reindex(sorted(set(pycyto_features)), axis="columns")

    # Return a tuple of (pycyto data, cyto data) with aligned feature indices
    return (pycyto_df, cyto_df)
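# A hedged usage sketch of load_data above. The nested file dictionaries and CSV
# paths below are hypothetical; in practice each dictionary maps plate -> processing
# level -> profile file path. Assumes pandas (pd) and pycytominer's infer_cp_features
# are imported as in the source module.
#
# pycyto_dict = {"PLATE_A": {"level_3": "profiles/pycytominer/PLATE_A_level_3.csv.gz"}}
# cyto_dict = {"PLATE_A": {"level_3": "profiles/cytominer/PLATE_A_level_3.csv.gz"}}
#
# pycyto_df, cyto_df = load_data(
#     plate="PLATE_A",
#     pycyto_dict=pycyto_dict,
#     cyto_dict=cyto_dict,
#     level="level_3",
#     round_decimals=5,
# )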
    def pipeline_normalize(self, batch, plate, steps, samples, suffix=None):
        normalize_steps = steps
        output_dir = pathlib.PurePath(".", self.pipeline_output, batch, plate)
        annotate_output_file = pathlib.PurePath(output_dir,
                                                f"{plate}_augmented.csv.gz")
        normalize_output_file = pathlib.PurePath(output_dir,
                                                 f"{plate}_normalized.csv.gz")
        if suffix:
            normalize_output_file = pathlib.PurePath(
                output_dir, f"{plate}_normalized_{suffix}.csv.gz")

        normalization_features = normalize_steps["features"]
        normalization_method = normalize_steps["method"]

        if normalization_features == "infer" and self.noncanonical:
            normalization_features = cyto_utils.infer_cp_features(
                pd.read_csv(annotate_output_file),
                compartments=self.compartments)

        normalize(
            profiles=annotate_output_file,
            features=normalization_features,
            samples=samples,
            method=normalization_method,
            output_file=normalize_output_file,
            compression_options=self.pipeline_options["compression"],
            float_format=self.pipeline_options["float_format"],
        )
Example #3
def variance_threshold(population_df,
                       features="infer",
                       samples="all",
                       freq_cut=0.05,
                       unique_cut=0.01):
    """
    Exclude features that have low variance (low information content)

    Parameters
    ----------
    population_df : pandas.core.frame.DataFrame or file
        DataFrame that includes metadata and observation features.
    features : list, default "infer"
        List of features present in the population dataframe. If "infer", then
        assume cell painting features are those that start with "Cells_",
        "Nuclei_", or "Cytoplasm_".
    samples : list or str, default "all"
        List of samples to perform operation on. If "all", use all samples to calculate.
    freq_cut : float, default 0.05
        Ratio (2nd most common feature value / most common).
    unique_cut : float, default 0.01
        Ratio (num unique feature values / num samples).

    Returns
    -------
    excluded_features : list of str
         List of features to exclude from the population_df.

    """

    assert 0 <= freq_cut <= 1, "freq_cut variable must be between (0 and 1)"
    assert 0 <= unique_cut <= 1, "unique_cut variable must be between (0 and 1)"

    # Subset dataframe
    if samples != "all":
        population_df = population_df.loc[samples, :]

    if features == "infer":
        features = infer_cp_features(population_df)

    population_df = population_df.loc[:, features]

    # Test if excluded for low frequency
    excluded_features_freq = population_df.apply(
        lambda x: calculate_frequency(x, freq_cut), axis="rows")

    excluded_features_freq = excluded_features_freq[
        excluded_features_freq.isna()].index.tolist()

    # Test if excluded for uniqueness
    n = population_df.shape[0]
    num_unique_features = population_df.nunique()

    unique_ratio = num_unique_features / n
    unique_ratio = unique_ratio < unique_cut
    excluded_features_unique = unique_ratio[unique_ratio].index.tolist()

    excluded_features = list(
        set(excluded_features_freq + excluded_features_unique))
    return excluded_features
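# A minimal, runnable sketch of variance_threshold above, assuming the helpers it
# relies on (infer_cp_features, calculate_frequency) are available as in the source
# module. The constant feature should be flagged for exclusion.
import pandas as pd

example_df = pd.DataFrame({
    "Metadata_Well": ["A01", "A02", "A03", "A04"],
    "Cells_constant": [1.0, 1.0, 1.0, 1.0],  # carries no information
    "Cells_variable": [0.1, 0.5, 0.9, 1.3],  # informative feature
})

flagged_features = variance_threshold(example_df, features="infer")
print(flagged_features)  # expected: ["Cells_constant"]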
def convert_data(df_dict):
    df = pd.concat(df_dict.values(), ignore_index=True,
                   sort=True).reset_index(drop=True)
    cp_cols = infer_cp_features(df)
    meta_cols = df.drop(cp_cols, axis="columns").columns.tolist()

    return df.reindex(meta_cols + cp_cols, axis="columns")
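# A small, runnable sketch of convert_data above: concatenate a dict of profile
# DataFrames and move metadata columns ahead of the CellProfiler features. Assumes
# infer_cp_features is available as in the source module.
import pandas as pd

example_df_dict = {
    "batch_one": pd.DataFrame({"Cells_a": [1.0, 2.0], "Metadata_Plate": ["p1", "p1"]}),
    "batch_two": pd.DataFrame({"Cells_a": [3.0, 4.0], "Metadata_Plate": ["p2", "p2"]}),
}

combined_df = convert_data(example_df_dict)
print(combined_df.columns.tolist())  # expected: ["Metadata_Plate", "Cells_a"]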
Example #5
def load_data(y_col="Metadata_CellLine",
              wt_col="WT",
              return_meta=False,
              shuffle_row_order=False):
    train_file = pathlib.Path("data", "example_train.tsv.gz")
    train_df = pd.read_csv(train_file, sep="\t")

    test_file = pathlib.Path("data", "example_test.tsv.gz")
    test_df = pd.read_csv(test_file, sep="\t")

    if shuffle_row_order:
        train_df = train_df.sample(frac=1).reset_index(drop=True)
        test_df = test_df.sample(frac=1).reset_index(drop=True)

    y_train_df = pd.DataFrame(train_df.loc[:, y_col]).assign(status=0)
    y_train_df.loc[y_train_df.loc[:, y_col] != wt_col, "status"] = 1

    y_test_df = pd.DataFrame(test_df.loc[:, y_col]).assign(status=0)
    y_test_df.loc[y_test_df.loc[:, y_col] != wt_col, "status"] = 1

    cp_features = infer_cp_features(train_df)
    x_train_df = train_df.loc[:, cp_features]
    x_test_df = test_df.loc[:, cp_features]

    if return_meta:
        meta_train_df = train_df.drop(cp_features, axis="columns")
        meta_test_df = test_df.drop(cp_features, axis="columns")
        return x_train_df, y_train_df, meta_train_df, x_test_df, y_test_df, meta_test_df
    else:
        return x_train_df, y_train_df, x_test_df, y_test_df
Example #6
def aggregate(
    population_df,
    strata=["Metadata_Plate", "Metadata_Well"],
    features="infer",
    operation="median",
    subset_data_df="none",
):
    """
    Combine population dataframe variables by strata groups using given operation

    Arguments:
    population_df - pandas DataFrame to group and aggregate
    strata - [default: ["Metadata_Plate", "Metadata_Well"]] list indicating the columns to groupby and aggregate
    features - [default: "infer"] or list indicating features that should be aggregated
    operation - [default: "median"] a string indicating how the data is aggregated
                currently only supports one of ['mean', 'median']
    subset_data_df - [default: "none"] a pandas dataframe indicating how to subset the input

    Return:
    Pandas DataFrame of aggregated features
    """
    # Check that the operation is supported
    operation = check_aggregate_operation(operation)

    # Subset the data to specified samples
    if isinstance(subset_data_df, pd.DataFrame):
        population_df = subset_data_df.merge(
            population_df, how="left",
            on=subset_data_df.columns.tolist()).reindex(population_df.columns,
                                                        axis="columns")

    # Subset dataframe to only specified variables if provided
    strata_df = population_df.loc[:, strata]
    if features == "infer":
        features = infer_cp_features(population_df)
        population_df = population_df.loc[:, features]
    else:
        population_df = population_df.loc[:, features]

    # Fix dtype of input features (they should all be floats!)
    convert_dict = {x: float for x in features}
    population_df = population_df.astype(convert_dict)

    # Merge back metadata used to aggregate by
    population_df = pd.concat([strata_df, population_df], axis="columns")

    # Perform aggregating function
    population_df = population_df.groupby(strata)

    if operation == "median":
        population_df = population_df.median().reset_index()
    else:
        population_df = population_df.mean().reset_index()

    # Aggregated image number and object number do not make sense
    for col in ["ImageNumber", "ObjectNumber"]:
        if col in population_df.columns:
            population_df = population_df.drop([col], axis="columns")

    return population_df
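# A hedged usage sketch of the aggregate function above: collapse per-object rows
# into per-well median profiles. Assumes the helpers used inside
# (check_aggregate_operation, infer_cp_features) are available as in the source module.
import pandas as pd

single_cell_df = pd.DataFrame({
    "Metadata_Plate": ["plate1"] * 4,
    "Metadata_Well": ["A01", "A01", "A02", "A02"],
    "Cells_area": [10.0, 20.0, 30.0, 50.0],
})

well_profiles = aggregate(single_cell_df,
                          strata=["Metadata_Plate", "Metadata_Well"],
                          operation="median")
print(well_profiles)  # expected medians: A01 -> 15.0, A02 -> 40.0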
Example #7
def correlation_threshold(population_df,
                          features="infer",
                          samples="all",
                          threshold=0.9,
                          method="pearson"):
    """
    Exclude features that have correlations above a certain threshold

    Arguments:
    population_df - pandas DataFrame that includes metadata and observation features
    features - a list of features present in the population dataframe [default: "infer"]
               if "infer", then assume cell painting features are those that start with
               "Cells_", "Nuclei_", or "Cytoplasm_"
    samples - list of samples to perform operation on
              [default: "all"] - if "all", use all samples to calculate
    threshold - float between (0, 1) to exclude features [default: 0.9]
    method - string indicating which correlation metric to use to test cutoff
             [default: "pearson"]

    Return:
    list of features to exclude from the population_df
    """

    # Check that the input method is supported
    method = check_correlation_method(method)

    assert 0 <= threshold <= 1, "threshold variable must be between (0 and 1)"

    # Subset dataframe and calculate correlation matrix across subset features
    if samples != "all":
        population_df = population_df.loc[samples, :]

    if features == "infer":
        features = infer_cp_features(population_df)

    population_df = population_df.loc[:, features]

    # Get correlation matrix and lower triangle of pairwise correlations in long format
    data_cor_df, pairwise_df = get_pairwise_correlation(
        population_df=population_df, method=method)

    # Get absolute sum of correlation across features
    # The lower the index, the less correlation to the full data frame
    # We want to drop features with highest correlation, so drop higher index
    variable_cor_sum = data_cor_df.abs().sum().sort_values().index

    # And subset to only variable combinations that pass the threshold
    pairwise_df = pairwise_df.query("correlation > @threshold")

    # Return an empty list if nothing is over correlation threshold
    if pairwise_df.shape[0] == 0:
        return []

    # Output the excluded features
    excluded = pairwise_df.apply(
        lambda x: determine_high_cor_pair(x, variable_cor_sum), axis="columns")

    return list(set(excluded.tolist()))
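# A hedged usage sketch of correlation_threshold above: one member of a perfectly
# correlated feature pair should be flagged for removal. Assumes the helpers used
# inside (check_correlation_method, infer_cp_features, get_pairwise_correlation,
# determine_high_cor_pair) are available as in the source module.
import pandas as pd

example_df = pd.DataFrame({
    "Metadata_Well": ["A01", "A02", "A03", "A04"],
    "Cells_a": [1.0, 2.0, 3.0, 4.0],
    "Cells_b": [2.0, 4.0, 6.0, 8.0],   # perfectly correlated with Cells_a
    "Nuclei_c": [1.0, 0.0, 5.0, 2.0],  # largely uncorrelated
})

to_exclude = correlation_threshold(example_df, threshold=0.9, method="pearson")
print(to_exclude)  # expected: one of ["Cells_a"] or ["Cells_b"]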
def test_batch_effect_contribution(df, n_components, pca_columns,
                                   model_formula):
    features = infer_cp_features(df)
    meta_features = infer_cp_features(df, metadata=True)

    feature_df = df.loc[:, features]

    pca = decomposition.PCA(n_components=n_components).fit(feature_df)
    pca_batch_df = pca.transform(feature_df)

    pca_batch_df = pd.concat([
        df.loc[:, meta_features],
        pd.DataFrame(pca_batch_df, columns=pca_columns),
    ],
                             axis="columns")

    melt_df = pd.melt(pca_batch_df,
                      id_vars=meta_features,
                      value_vars=pca_columns,
                      var_name="pca_component",
                      value_name="pca_value")

    anova_results = []
    for pca_component in pca_columns:
        subset_melt_df = melt_df.query("pca_component == @pca_component")

        # Setup model
        model = ols(model_formula, data=subset_melt_df).fit()

        # Generate ANOVA table
        anova_table = (sm.stats.anova_lm(model, typ=2).reset_index().rename(
            {
                "index": "factor"
            }, axis="columns").assign(pca=pca_component))
        anova_results.append(anova_table)

    anova_results = pd.concat(anova_results).reset_index(drop=True).dropna()
    # Note: `batch` is not a parameter of this function; it is assumed to be
    # defined in the enclosing (notebook) scope when this is called
    anova_results = anova_results.assign(
        neg_log_p=-np.log10(anova_results.loc[:, "PR(>F)"]), batch=batch)

    anova_results.pca = pd.Categorical(anova_results.pca,
                                       categories=pca_columns)
    anova_results = anova_results.assign(
        component_number=[int(x.split("_")[1]) for x in anova_results.pca])
    return anova_results
Example #9
def variance_threshold(population_df,
                       features="infer",
                       samples="all",
                       freq_cut=0.05,
                       unique_cut=0.01):
    """
    Exclude features that have low variance (low information content)

    Arguments:
    population_df - pandas DataFrame that includes metadata and observation features
    features - a list of features present in the population dataframe [default: "infer"]
               if "infer", then assume cell painting features are those that start with
               "Cells_", "Nuclei_", or "Cytoplasm_"
    samples - list of samples to perform operation on
              [default: "all"] - if "all", use all samples to calculate
    freq_cut - float of ratio (second most common feature value / most common) [default: 0.05]
    unique_cut - float of ratio (num unique feature values / num samples) [default: 0.01]

    Return:
    list of features to exclude from the population_df
    """

    assert 0 <= freq_cut <= 1, "freq_cut variable must be between (0 and 1)"
    assert 0 <= unique_cut <= 1, "unique_cut variable must be between (0 and 1)"

    # Subset dataframe
    if samples != "all":
        population_df = population_df.loc[samples, :]

    if features == "infer":
        features = infer_cp_features(population_df)

    population_df = population_df.loc[:, features]

    # Test if excluded for low frequency
    excluded_features_freq = population_df.apply(
        lambda x: calculate_frequency(x, freq_cut), axis="rows")

    excluded_features_freq = excluded_features_freq[
        excluded_features_freq.isna()].index.tolist()

    # Test if excluded for uniqueness
    n = population_df.shape[0]
    num_unique_features = population_df.nunique()

    unique_ratio = num_unique_features / n
    unique_ratio = unique_ratio < unique_cut
    excluded_features_unique = unique_ratio[unique_ratio].index.tolist()

    excluded_features = list(
        set(excluded_features_freq + excluded_features_unique))
    return excluded_features
def load_data(return_meta=False,
              shuffle_row_order=False,
              holdout=False,
              othertreatment=False):
    output_data_dict = {"train": {}, "test": {}}
    train_file = pathlib.Path("data", "single_cell_train.tsv.gz")
    train_df = pd.read_csv(train_file, sep="\t")

    test_file = pathlib.Path("data", "single_cell_test.tsv.gz")
    test_df = pd.read_csv(test_file, sep="\t")

    if shuffle_row_order:
        train_df = train_df.sample(frac=1).reset_index(drop=True)
        test_df = test_df.sample(frac=1).reset_index(drop=True)

    cp_features = infer_cp_features(train_df)
    output_data_dict["train"]["x"] = train_df.loc[:, cp_features]
    output_data_dict["test"]["x"] = test_df.loc[:, cp_features]

    if holdout:
        output_data_dict["holdout"] = {}
        holdout_file = pathlib.Path("data", "single_cell_holdout.tsv.gz")
        holdout_df = pd.read_csv(holdout_file, sep="\t")
        if shuffle_row_order:
            holdout_df = holdout_df.sample(frac=1).reset_index(drop=True)

        output_data_dict["holdout"]["x"] = holdout_df.loc[:, cp_features]

    if othertreatment:
        output_data_dict["othertreatment"] = {}
        other_file = pathlib.Path("data", "single_cell_othertreatment.tsv.gz")
        other_df = pd.read_csv(other_file, sep="\t")
        if shuffle_row_order:
            other_df = other_df.sample(frac=1).reset_index(drop=True)

        output_data_dict["othertreatment"]["x"] = other_df.loc[:, cp_features]

    if return_meta:
        output_data_dict["train"]["meta"] = train_df.drop(cp_features,
                                                          axis="columns")
        output_data_dict["test"]["meta"] = test_df.drop(cp_features,
                                                        axis="columns")

        if holdout:
            output_data_dict["holdout"]["meta"] = holdout_df.drop(
                cp_features, axis="columns")

        if othertreatment:
            output_data_dict["othertreatment"]["meta"] = other_df.drop(
                cp_features, axis="columns")

    return output_data_dict
Example #11
def transform(df,
              features="infer",
              meta_features="infer",
              operation="zeroone"):
    if features == "infer":
        features = infer_cp_features(df)
    if meta_features == "infer":
        meta_features = infer_cp_features(df, metadata=True)

    feature_df = df.loc[:, features]
    meta_df = df.loc[:, meta_features]

    if operation == "zeroone":
        scaler = sklearn.preprocessing.MinMaxScaler()
    else:
        raise ValueError(f"operation '{operation}' is not supported")

    feature_df = pd.DataFrame(
        scaler.fit_transform(feature_df),
        index=feature_df.index,
        columns=feature_df.columns,
    )
    output_df = pd.concat([meta_df, feature_df], axis="columns")
    return output_df
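# A minimal sketch of the transform helper above: min-max scale features to [0, 1]
# while carrying metadata columns through. Assumes sklearn.preprocessing and
# infer_cp_features are importable as in the source module.
import pandas as pd

raw_df = pd.DataFrame({
    "Metadata_Well": ["A01", "A02", "A03"],
    "Cells_intensity": [10.0, 20.0, 30.0],
})

scaled_df = transform(raw_df, operation="zeroone")
print(scaled_df.Cells_intensity.tolist())  # expected: [0.0, 0.5, 1.0]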
def process_umap(data_df):    
    # Prepare UMAP input by removing metadata columns
    metadata_cols = infer_cp_features(data_df, metadata=True)

    metadata_df = data_df.loc[:, metadata_cols]
    umap_data_df = data_df.drop(metadata_cols, axis="columns")
    
    # Apply UMAP
    reducer = umap.UMAP(random_state=123)
    embedding = reducer.fit_transform(umap_data_df)
    
    # Setup plotting logic
    embedding_df = pd.DataFrame(embedding, columns=['x', 'y'])
    embedding_df = embedding_df.merge(metadata_df, left_index=True, right_index=True)
    
    return embedding_df
Example #13
def normalize_sc(sc_df, scaler_method="standard"):
    sc_df = sc_df.reset_index(drop=True)
    cp_features = infer_cp_features(sc_df)
    meta_df = sc_df.drop(cp_features, axis="columns")
    meta_df.columns = [
        x if x.startswith("Metadata_") else f"Metadata_{x}"
        for x in meta_df.columns
    ]
    sc_df = sc_df.loc[:, cp_features]

    if scaler_method == "standard":
        scaler = StandardScaler()
    else:
        raise ValueError(f"scaler_method '{scaler_method}' is not supported")

    sc_df = pd.DataFrame(scaler.fit_transform(sc_df),
                         index=sc_df.index,
                         columns=sc_df.columns)
    sc_df = meta_df.merge(sc_df, left_index=True, right_index=True)
    return sc_df
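# A small sketch of normalize_sc above: z-score single-cell features and prefix any
# bare metadata columns with "Metadata_". Assumes StandardScaler (sklearn) and
# infer_cp_features are importable as in the source module.
import pandas as pd

sc_example_df = pd.DataFrame({
    "plate": ["p1", "p1", "p1"],
    "Cells_area": [10.0, 20.0, 30.0],
})

sc_normalized_df = normalize_sc(sc_example_df, scaler_method="standard")
print(sc_normalized_df.columns.tolist())  # expected: ["Metadata_plate", "Cells_area"]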
Example #14
def annotate(
    profiles,
    platemap,
    join_on=["Metadata_well_position", "Metadata_Well"],
    output_file="none",
    add_metadata_id_to_platemap=True,
    format_broad_cmap=False,
    clean_cellprofiler=True,
    external_metadata="none",
    external_join_left="none",
    external_join_right="none",
    compression_options=None,
    float_format=None,
    cmap_args={},
):
    """Add metadata to aggregated profiles.

    Parameters
    ----------
    profiles : pandas.core.frame.DataFrame or file
        DataFrame or file path of profiles.
    platemap : pandas.core.frame.DataFrame or file
        Dataframe or file path of platemap metadata.
    join_on : list or str, default: ["Metadata_well_position", "Metadata_Well"]
        Which variables to merge profiles and plate. The first element indicates variable(s) in platemap and the second element indicates variable(s) in profiles to merge using. Note the setting of `add_metadata_id_to_platemap`
    output_file : str, optional
       If not specified, will return the annotated profiles. We recommend that this output file be suffixed with "_augmented.csv".
    add_metadata_id_to_platemap : bool, default True
        Whether the plate map variables possibly need "Metadata" pre-pended
    format_broad_cmap : bool, default False
        Whether we need to add columns to make compatible with Broad CMAP naming conventions.
    clean_cellprofiler: bool, default True
        Clean specific CellProfiler feature names.
    external_metadata : str, optional
        File with additional metadata information
    external_join_left : str, optional
        Merge column in the profile metadata.
    external_join_right: str, optional
        Merge column in the external metadata.
    compression_options : str or dict, optional
        Contains compression options as input to
        pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2.
    float_format : str, optional
        Decimal precision to use in writing output file as input to
        pd.DataFrame.to_csv(float_format=float_format). For example, use "%.3g" for 3
        decimal precision.
    cmap_args : dict, default {}
        Potential keyword arguments for annotate_cmap(). See cyto_utils/annotate_custom.py for more details.

    Returns
    -------
    annotated : pandas.core.frame.DataFrame, optional
        DataFrame of annotated features. If output_file="none", then return the
        DataFrame. If you specify output_file, then write to file and do not return
        data.
    """

    # Load Data
    profiles = load_profiles(profiles)
    platemap = load_platemap(platemap, add_metadata_id_to_platemap)

    annotated = platemap.merge(profiles,
                               left_on=join_on[0],
                               right_on=join_on[1],
                               how="inner").drop(join_on[0], axis="columns")

    # Add specific Connectivity Map (CMAP) formatting
    if format_broad_cmap:
        annotated = annotate_cmap(annotated,
                                  annotate_join_on=join_on[1],
                                  **cmap_args)

    if clean_cellprofiler:
        annotated = cp_clean(annotated)

    if not isinstance(external_metadata, pd.DataFrame):
        if external_metadata != "none":
            assert os.path.exists(
                external_metadata
            ), "external metadata at {} does not exist".format(
                external_metadata)

            external_metadata = pd.read_csv(external_metadata)

    if isinstance(external_metadata, pd.DataFrame):
        external_metadata.columns = [
            "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
            for x in external_metadata.columns
        ]

        annotated = (annotated.merge(
            external_metadata,
            left_on=external_join_left,
            right_on=external_join_right,
            how="left",
        ).reset_index(drop=True).drop_duplicates())

    # Reorder annotated metadata columns
    meta_cols = infer_cp_features(annotated, metadata=True)
    other_cols = annotated.drop(meta_cols, axis="columns").columns.tolist()

    annotated = annotated.loc[:, meta_cols + other_cols]

    if output_file != "none":
        output(
            df=annotated,
            output_filename=output_file,
            compression_options=compression_options,
            float_format=float_format,
        )
    else:
        return annotated
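# A hedged usage sketch of the annotate function above: join well-level profiles to a
# plate map. Column names here are illustrative. Assumes the loaders and helpers used
# inside (load_profiles, load_platemap, cp_clean, infer_cp_features, output) are
# available as in the source module; both arguments may be DataFrames, so no files
# are required.
import pandas as pd

profiles_df = pd.DataFrame({
    "Metadata_Well": ["A01", "A02"],
    "Cells_area": [10.0, 20.0],
})
platemap_df = pd.DataFrame({
    "well_position": ["A01", "A02"],
    "treatment": ["DMSO", "compound_x"],
})

# add_metadata_id_to_platemap=True (the default) prepends "Metadata_" to plate map
# columns, so "well_position" becomes "Metadata_well_position" before the merge
annotated_df = annotate(profiles=profiles_df, platemap=platemap_df)
print(annotated_df.columns.tolist())  # metadata columns first, then features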
Example #15
def test_merge_single_cells():
    sc_merged_df = ap.merge_single_cells()

    # Assert that the image data was merged
    assert all(x in sc_merged_df.columns
               for x in ["Metadata_Plate", "Metadata_Well"])

    # Assert that metadata columns were renamed appropriately
    for x in ap.full_merge_suffix_rename:
        assert ap.full_merge_suffix_rename[x] == "Metadata_{x}".format(x=x)

    # Perform a manual merge
    manual_merge = cytoplasm_df.merge(
        cells_df,
        left_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Cells"],
        right_on=["TableNumber", "ImageNumber", "ObjectNumber"],
        suffixes=["_cytoplasm", "_cells"],
    ).merge(
        nuclei_df,
        left_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Nuclei"],
        right_on=["TableNumber", "ImageNumber", "ObjectNumber"],
        suffixes=["_cytoplasm", "_nuclei"],
    )

    manual_merge = image_df.merge(manual_merge, on=ap.merge_cols,
                                  how="right").rename(
                                      ap.full_merge_suffix_rename,
                                      axis="columns")

    # Confirm that the merge correctly reversed the object number (opposite from Parent)
    assert (sc_merged_df.Metadata_ObjectNumber_cytoplasm.tolist()[::-1] ==
            sc_merged_df.Metadata_ObjectNumber.tolist())
    assert (manual_merge.Metadata_ObjectNumber_cytoplasm.tolist()[::-1] ==
            sc_merged_df.Metadata_ObjectNumber.tolist())
    assert (manual_merge.Metadata_ObjectNumber_cytoplasm.tolist()[::-1] ==
            sc_merged_df.Metadata_ObjectNumber.tolist())
    assert (manual_merge.Metadata_ObjectNumber_cells.tolist() ==
            sc_merged_df.Metadata_ObjectNumber.tolist())

    # Confirm the merge and adding merge options
    for method in ["standardize", "robustize"]:
        for samples in ["all", "Metadata_ImageNumber == 'x'"]:
            for features in ["infer", ["Cytoplasm_a", "Cells_a"]]:

                norm_method_df = ap.merge_single_cells(
                    single_cell_normalize=True,
                    normalize_args={
                        "method": method,
                        "samples": samples,
                        "features": features,
                    },
                )

                manual_merge_normalize = normalize(manual_merge,
                                                   method=method,
                                                   samples=samples,
                                                   features=features)

                pd.testing.assert_frame_equal(
                    norm_method_df.sort_index(axis=1),
                    manual_merge_normalize.sort_index(axis=1),
                )

    # Test non-canonical compartment merging
    new_sc_merge_df = ap_new.merge_single_cells()

    assert sum(new_sc_merge_df.columns.str.startswith("New")) == 4
    assert (new_compartment_df.ObjectNumber.tolist()[::-1] ==
            new_sc_merge_df.Metadata_ObjectNumber_new.tolist())

    norm_new_method_df = ap_new.merge_single_cells(
        single_cell_normalize=True,
        normalize_args={
            "method": "standardize",
            "samples": "all",
            "features": "infer",
        },
    )

    norm_new_method_no_feature_infer_df = ap_new.merge_single_cells(
        single_cell_normalize=True,
        normalize_args={
            "method": "standardize",
            "samples": "all",
        },
    )

    default_feature_infer_df = ap_new.merge_single_cells(
        single_cell_normalize=True)

    pd.testing.assert_frame_equal(norm_new_method_df, default_feature_infer_df)
    pd.testing.assert_frame_equal(norm_new_method_df,
                                  norm_new_method_no_feature_infer_df)

    new_compartment_cols = infer_cp_features(new_compartment_df,
                                             compartments=ap_new.compartments)
    traditional_norm_df = normalize(
        ap_new.image_df.merge(new_compartment_df, on=ap.merge_cols),
        features=new_compartment_cols,
        samples="all",
        method="standardize",
    )

    pd.testing.assert_frame_equal(
        norm_new_method_df.loc[:, new_compartment_cols].abs().describe(),
        traditional_norm_df.loc[:, new_compartment_cols].abs().describe(),
    )
Example #16
barcode_col = "Metadata_pert_name"
gene_col = "Metadata_gene_name"

replicate_group_grit = {"replicate_id": barcode_col, "group_id": gene_col}

control_group_cut = ["Chr2", "Luc", "LacZ"]

control_barcodes = (
    df.loc[df[replicate_group_grit["group_id"]].isin(control_group_cut),
           replicate_group_grit["replicate_id"], ].unique().tolist())

control_barcodes

# In[5]:

all_features = infer_cp_features(df, compartments=compartments)
meta_features = infer_cp_features(df, metadata=True)

meta_features

# In[6]:

grit_compartment_results = []
for cell_line in df.Metadata_cell_line.unique():
    for compartment in compartments:
        compartment_features = infer_cp_features(df, compartments=compartment)

        for drop in [True, False]:

            if drop:
                subset_df = df.drop(compartment_features, axis="columns")
Example #17
def normalize(
    profiles,
    features="infer",
    meta_features="infer",
    samples="all",
    method="standardize",
    output_file="none",
    compression=None,
    float_format=None,
    whiten_center=True,
    whiten_method="ZCA",
):
    """
    Normalize features

    Arguments:
    profiles - either pandas DataFrame or a file that stores profile data
    features - list of cell painting features [default: "infer"]
               if "infer", then assume cell painting features are those that do not
               start with "Cells", "Nuclei", or "Cytoplasm"
    meta_features - if specified, then output these with specified features
                    [default: "infer"]
    samples - string indicating which metadata column and values to use to subset;
              the control samples are often used here [default: 'all'].
              The string is passed to a pd.query() call. An
              example is "Metadata_treatment == 'control'" (include all quotes)
    method - string indicating how the dataframe will be normalized
             [default: 'standardize']
    output_file - [default: "none"] if provided, will write annotated profiles to file
                  if not specified, will return the annotated profiles. We recommend
                  that this output file be suffixed with "_normalized.csv".
    compression - the mechanism to compress [default: None]
    float_format - decimal precision to use in writing output file [default: None]
                       For example, use "%.3g" for 3 decimal precision.
    whiten_center - if data should be centered before whitening transform [default: True]
                    (only used if method = "whiten")
    whiten_method - the type of whitening normalization used [default: 'ZCA']
                    (only used if method = "whiten")

    Return:
    A normalized DataFrame
    """

    # Load Data
    profiles = load_profiles(profiles)

    # Define which scaler to use
    method = method.lower()

    avail_methods = ["standardize", "robustize", "mad_robustize", "whiten"]
    assert method in avail_methods, "method must be one of {}".format(
        avail_methods)

    if method == "standardize":
        scaler = StandardScaler()
    elif method == "robustize":
        scaler = RobustScaler()
    elif method == "mad_robustize":
        scaler = RobustMAD()
    elif method == "whiten":
        scaler = Whiten(center=whiten_center, method=whiten_method)

    if features == "infer":
        features = infer_cp_features(profiles)

    # Separate out the features and meta
    feature_df = profiles.loc[:, features]
    if meta_features == "infer":
        meta_features = infer_cp_features(profiles, metadata=True)

    meta_df = profiles.loc[:, meta_features]

    # Fit the sklearn scaler
    if samples == "all":
        fitted_scaler = scaler.fit(feature_df)
    else:
        # Subset to only the features measured in the sample query
        fitted_scaler = scaler.fit(profiles.query(samples).loc[:, features])

    # Scale the feature dataframe
    feature_df = pd.DataFrame(
        fitted_scaler.transform(feature_df),
        columns=feature_df.columns,
        index=feature_df.index,
    )

    normalized = meta_df.merge(feature_df, left_index=True, right_index=True)

    if output_file != "none":
        output(
            df=normalized,
            output_filename=output_file,
            compression=compression,
            float_format=float_format,
        )
    else:
        return normalized
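# A hedged usage sketch of the normalize function above: standardize features while
# fitting the scaler on control wells only. Assumes the helpers used inside
# (load_profiles, infer_cp_features, the scaler classes, output) are available as in
# the source module; profiles may be passed directly as a DataFrame.
import pandas as pd

profiles_df = pd.DataFrame({
    "Metadata_Well": ["A01", "A02", "A03", "A04"],
    "Metadata_treatment": ["control", "control", "drug", "drug"],
    "Cells_area": [10.0, 12.0, 30.0, 40.0],
})

normalized_df = normalize(
    profiles=profiles_df,
    method="standardize",
    samples="Metadata_treatment == 'control'",
)
print(normalized_df)  # features are z-scores relative to the control wells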
    plate_files = batch_dict["plate_files"]
    plates = batch_dict["plates"]
    for plate in plates:
        print("Now auditing... Batch: {}; Plate: {}".format(batch, plate))
        audit_output_dir = os.path.join(output_dir, batch, plate)
        os.makedirs(audit_output_dir, exist_ok=True)

        figure_output_dir = os.path.join(figure_dir, batch, plate)
        os.makedirs(figure_output_dir, exist_ok=True)

        audit_output_file = os.path.join(audit_output_dir,
                                         "{}_audit.csv".format(plate))
        df = pd.read_csv(plate_files[plate])

        # Determine feature class
        features = infer_cp_features(df)
        meta_features = infer_cp_features(df, metadata=True)

        # Calculate and process pairwise similarity matrix
        audit_df = metric_melt(
            df=df,
            features=features,
            metadata_features=meta_features,
            similarity_metric="pearson",
            eval_metric="replicate_reproducibility",
        )

        audit_df = assign_replicates(similarity_melted_df=audit_df,
                                     replicate_groups=audit_cols)
        # What is 95% of the non replicate null distribution
        cutoff = audit_df.query(
Example #19
def annotate(
    profiles,
    platemap,
    cell_id="unknown",
    join_on=["Metadata_well_position", "Metadata_Well"],
    output_file="none",
    add_metadata_id_to_platemap=True,
    format_broad_cmap=False,
    perturbation_mode="none",
    external_metadata="none",
    external_join_left="none",
    external_join_right="none",
    compression=None,
    float_format=None,
):
    """
    Annotate profiles with plate map and other experimental metadata

    Arguments:
    profiles - either pandas DataFrame or a file that stores profile data
    platemap - either pandas DataFrame or a file that stores platemap metadata
    cell_id - [default: "unknown"] provide a string to annotate cell id column
    join_on - list of length two indicating which variables to merge profiles and plate
              [default: ["Metadata_well_position", "Metadata_Well"]]. The first element
              indicates variable(s) in platemap and the second element indicates
              variable(s) in profiles to merge using.
              Note the setting of `add_metadata_id_to_platemap`
    output_file - [default: "none"] if provided, will write annotated profiles to file
                  if not specified, will return the annotated profiles. We recommend
                  that this output file be suffixed with "_augmented.csv".
    add_metadata_id_to_platemap - boolean if the platemap variables should be recoded
    format_broad_cmap - [default: False] boolean if we need to add columns to make
                        compatible with Broad CMAP naming conventions.
    perturbation_mode - [default: "none"] - either "chemical", "genetic" or "none" and only
                        active if format_broad_cmap == True
    external_metadata - [default: "none"] a string indicating a file with additional
                        metadata information
    external_join_left - [default: "none"] the merge column in the profile metadata
    external_join_right - [default: "none"] the merge column in the external metadata
    compression - the mechanism to compress [default: None]
    float_format - decimal precision to use in writing output file [default: None]
                       For example, use "%.3g" for 3 decimal precision.

    Return:
    Pandas DataFrame of annotated profiles or written to file
    """

    # Load Data
    profiles = load_profiles(profiles)
    platemap = load_platemap(platemap, add_metadata_id_to_platemap)

    annotated = platemap.merge(profiles,
                               left_on=join_on[0],
                               right_on=join_on[1],
                               how="inner").drop(join_on[0], axis="columns")

    if format_broad_cmap:

        pert_opts = ["none", "chemical", "genetic"]
        assert (perturbation_mode in pert_opts
                ), "perturbation mode must be one of {}".format(pert_opts)

        assert (
            "Metadata_broad_sample" in annotated.columns
        ), "Are you sure this is a CMAP file? 'Metadata_broad_sample column not found.'"

        annotated = annotated.assign(
            Metadata_pert_id=annotated.Metadata_broad_sample.str.extract(
                r"(BRD[-N][A-Z0-9]+)"),
            Metadata_pert_mfc_id=annotated.Metadata_broad_sample,
            Metadata_pert_well=annotated.loc[:, join_on[1]],
            Metadata_pert_id_vendor="",
        )

        if "Metadata_pert_iname" in annotated.columns:
            annotated = annotated.assign(
                Metadata_pert_mfc_desc=annotated.Metadata_pert_iname,
                Metadata_pert_name=annotated.Metadata_pert_iname,
            )

        if "Metadata_cell_id" not in annotated.columns:
            annotated = annotated.assign(Metadata_cell_id=cell_id)

        if perturbation_mode == "chemical":
            annotated = annotated.assign(Metadata_broad_sample_type=[
                "control" if x in ["DMSO", np.nan] else "trt"
                for x in annotated.Metadata_broad_sample
            ])

            # Generate Metadata_broad_sample column
            annotated.loc[annotated.Metadata_broad_sample_type == "control",
                          "Metadata_broad_sample", ] = "DMSO"
            annotated.loc[annotated.Metadata_broad_sample == "empty",
                          "Metadata_broad_sample_type"] = "empty"

            if "Metadata_mmoles_per_liter" in annotated.columns:
                annotated.loc[annotated.Metadata_broad_sample_type ==
                              "control", "Metadata_mmoles_per_liter", ] = 0

            if "Metadata_solvent" in annotated.columns:
                annotated = annotated.assign(
                    Metadata_pert_vehicle=annotated.Metadata_solvent)
            if "Metadata_mg_per_ml" in annotated.columns:
                annotated.loc[annotated.Metadata_broad_sample_type ==
                              "control", "Metadata_mg_per_ml", ] = 0

        if perturbation_mode == "genetic":
            if "Metadata_pert_name" in annotated.columns:
                annotated = annotated.assign(Metadata_broad_sample_type=[
                    "control" if x == "EMPTY" else "trt"
                    for x in annotated.Metadata_pert_name
                ])

        if "Metadata_broad_sample_type" in annotated.columns:
            annotated = annotated.assign(
                Metadata_pert_type=annotated.Metadata_broad_sample_type)
        else:
            annotated = annotated.assign(Metadata_pert_type="",
                                         Metadata_broad_sample_type="")

    # Add specific Connectivity Map (CMAP) formatting
    if not isinstance(external_metadata, pd.DataFrame):
        if external_metadata != "none":
            assert os.path.exists(
                external_metadata
            ), "external metadata at {} does not exist".format(
                external_metadata)

            external_metadata = pd.read_csv(external_metadata)

    if isinstance(external_metadata, pd.DataFrame):
        external_metadata.columns = [
            "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
            for x in external_metadata.columns
        ]

        annotated = (annotated.merge(
            external_metadata,
            left_on=external_join_left,
            right_on=external_join_right,
            how="left",
        ).reset_index(drop=True).drop_duplicates())

    # Reorder annotated metadata columns
    meta_cols = infer_cp_features(annotated, metadata=True)
    other_cols = annotated.drop(meta_cols, axis="columns").columns.tolist()

    annotated = annotated.loc[:, meta_cols + other_cols]

    if output_file != "none":
        output(
            df=annotated,
            output_filename=output_file,
            compression=compression,
            float_format=float_format,
        )
    else:
        return annotated
Example #20
def write_gct(
    profiles,
    output_file,
    features="infer",
    meta_features="infer",
    feature_metadata="none",
    version="#1.3",
):
    """
    Convert profiles to a .gct file

    Arguments:
    profiles - either pandas DataFrame or a file that stores profile data
    output_file - the name of the gct file to save processed data to
    features - a list of features present in the population dataframe [default: "infer"]
               if "infer", then assume cell painting features are those that start with
               "Cells_", "Nuclei_", or "Cytoplasm_"
    meta_features - if specified, then output these values in the gct file
         [default: "infer"]
    feature_metadata - pandas DataFrame linking features to additional metadata [default: "none"]

    Return:
    None; the processed profiles are written to output_file in .gct format
    """

    # Note, only version 1.3 is currently supported
    assert version == "#1.3", "Only version #1.3 is currently supported."

    # Step 1: Create first two rows of data
    if features == "infer":
        features = infer_cp_features(profiles)
    feature_df = profiles.loc[:, features].reset_index(drop=True).transpose()

    # Separate out metadata features
    if meta_features == "infer":
        meta_features = infer_cp_features(profiles, metadata=True)
    metadata_df = profiles.loc[:, meta_features]

    # Step 2: Get the sample metadata portion of the output file
    metadata_part = metadata_df.transpose()
    metadata_part.columns = ["SAMPLE_{}".format(x) for x in metadata_part.columns]
    metadata_part = (
        metadata_part.transpose()
        .reset_index()
        .rename({"index": "id"}, axis="columns")
        .transpose()
    )
    metadata_part.index = [x.replace("Metadata_", "") for x in metadata_part.index]

    nrow_feature, ncol_features = feature_df.shape
    _, ncol_metadata = metadata_df.shape

    # Step 3: Compile feature metadata
    full_df = pd.concat([metadata_part, feature_df], axis="rows")
    if isinstance(feature_metadata, pd.DataFrame):
        nrow_metadata = feature_metadata.shape[1]
        assert (
            "id" in feature_metadata.index.tolist()
        ), "make sure feature metadata has row named 'id' that stores feature metadata names!"
        full_df = feature_metadata.merge(
            full_df, how="right", left_index=True, right_index=True
        )
    else:
        feature_metadata = (
            ["cp_feature_name"] + [np.nan] * ncol_metadata + feature_df.index.tolist()
        )
        nrow_metadata = 1
        full_df.insert(0, column="feature_metadata", value=feature_metadata)
    full_df = full_df.reset_index()

    # Step 4: Compile all data dimensions
    data_dimensions = [nrow_feature, ncol_features, nrow_metadata, ncol_metadata]

    # Step 5: Write output gct file
    with open(output_file, "w", newline="") as gctfile:
        gctwriter = csv.writer(gctfile, delimiter="\t")
        gctwriter.writerow([version])
        gctwriter.writerow(data_dimensions)
        for feature, row in full_df.iterrows():
            gctwriter.writerow(row)
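# A minimal sketch of write_gct above: write a tiny profile DataFrame to a version
# #1.3 .gct file. The output path is illustrative. Assumes csv, numpy (np), and
# infer_cp_features are importable as in the source module.
import pandas as pd

gct_input_df = pd.DataFrame({
    "Metadata_Plate": ["plate1", "plate1"],
    "Metadata_Well": ["A01", "A02"],
    "Cells_area": [10.0, 20.0],
    "Nuclei_area": [5.0, 6.0],
})

write_gct(profiles=gct_input_df, output_file="example_profiles.gct")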
Example #21
]).reset_index(drop=True)

data_missing_df = pd.concat([
    pd.DataFrame({
        "g": "a",
        "Cells_x": [1, 3, 8, np.nan],
        "Nuclei_y": [5, np.nan, 3, 1]
    }),
    pd.DataFrame({
        "g": "b",
        "Cells_x": [1, 3, np.nan, 5],
        "Nuclei_y": [np.nan, 8, 3, 1]
    }),
]).reset_index(drop=True)

features = infer_cp_features(data_df)
dtype_convert_dict = {x: float for x in features}


def test_aggregate_median_allvar():
    """
    Testing aggregate pycytominer function
    """
    aggregate_result = aggregate(population_df=data_df,
                                 strata=["g"],
                                 features="infer",
                                 operation="median")

    expected_result = pd.concat([
        pd.DataFrame({
            "g": "a",
Example #22
batches

# In[4]:

batch_data = {}
all_clones = list()
profile_count_list = list()
for batch in batches:
    print("Now processing... {}".format(batch))
    df, batch_count, treatment_count = process_counts(batch,
                                                      profile_dir=profile_dir)

    batch_data[batch] = {
        "dataframe": df,
        "metafeatures": infer_cp_features(df, metadata=True),
        "batch_count": batch_count,
        "treatment_count": treatment_count
    }

    all_clones += treatment_count.Metadata_clone.unique().tolist()
    profile_count_list.append(
        treatment_count.
        loc[:, ["Metadata_clone", "Metadata_treatment", "profile_count"]])

# In[5]:

sample_count_df = (pd.DataFrame(
    pd.concat(profile_count_list,
              axis="rows").fillna("DMSO").reset_index(drop=True).groupby(
                  ["Metadata_clone",
Example #23
# Output file info
output_dir = pathlib.Path("embeddings")
batch1_output_file = pathlib.Path(
    f"{output_dir}/cellpainting_embeddings_batch1.tsv.gz")
batch2_output_file = pathlib.Path(
    f"{output_dir}/cellpainting_embeddings_batch2.tsv.gz")

# In[4]:

# Load cell painting profiles
file = pathlib.Path("cellpainting_lvl4_cpd_replicate_datasets",
                    "cp_level4_cpd_replicates.csv.gz")
df = pd.read_csv(file, low_memory=False)

cp_features = infer_cp_features(df)
meta_features = infer_cp_features(
    df, metadata=True) + ["broad_id", "pert_iname", "moa", "replicate_name"]

# Transform PCA to top 50 components
n_components = 50
pca = PCA(n_components=n_components)

pca_df = pca.fit_transform(df.loc[:, cp_features])
pca_df = pd.DataFrame(pca_df)
pca_df.columns = [f"PCA_{x}" for x in range(0, n_components)]

print(pca_df.shape)
pca_df.head()

# ## UMAP - Batch 1
    def pipeline_feature_select(self, steps, suffix=None):
        feature_select_steps = steps
        pipeline_output = self.pipeline["output_dir"]

        level = feature_select_steps["level"]
        gct = feature_select_steps["gct"]
        feature_select_operations = feature_select_steps["operations"]
        feature_select_features = feature_select_steps["features"]

        all_plates_df = pd.DataFrame()

        for batch in self.profile_config:
            batch_df = pd.DataFrame()
            for plate in self.profile_config[batch]:
                output_dir = pathlib.PurePath(".", pipeline_output, batch,
                                              plate)
                if suffix:
                    normalize_output_file = pathlib.PurePath(
                        output_dir, f"{plate}_normalized_{suffix}.csv.gz")
                    feature_select_output_file_plate = pathlib.PurePath(
                        output_dir,
                        f"{plate}_normalized_feature_select_{suffix}_plate.csv.gz",
                    )
                else:
                    normalize_output_file = pathlib.PurePath(
                        output_dir, f"{plate}_normalized.csv.gz")
                    feature_select_output_file_plate = pathlib.PurePath(
                        output_dir,
                        f"{plate}_normalized_feature_select_plate.csv.gz")
                if feature_select_features == "infer" and self.noncanonical:
                    feature_select_features = cyto_utils.infer_cp_features(
                        pd.read_csv(normalize_output_file),
                        compartments=self.compartments,
                    )

                df = pd.read_csv(normalize_output_file).assign(
                    Metadata_batch=batch)

                if level == "plate":
                    df = df.drop(columns=["Metadata_batch"])
                    feature_select(
                        profiles=df,
                        features=feature_select_features,
                        operation=feature_select_operations,
                        output_file=feature_select_output_file_plate,
                        compression_options=self.
                        pipeline_options["compression"],
                        float_format=self.pipeline_options["float_format"],
                    )
                elif level == "batch":
                    batch_df = concat_dataframes(batch_df, df)
                elif level == "all":
                    all_plates_df = concat_dataframes(all_plates_df, df)

            if level == "batch":
                fs_df = feature_select(
                    profiles=batch_df,
                    features=feature_select_features,
                    operation=feature_select_operations,
                )
                for plate in self.profile_config[batch]:
                    output_dir = pathlib.PurePath(".", pipeline_output, batch,
                                                  plate)
                    if suffix:
                        feature_select_output_file_batch = pathlib.PurePath(
                            output_dir,
                            f"{plate}_normalized_feature_select_{suffix}_batch.csv.gz",
                        )
                    else:
                        feature_select_output_file_batch = pathlib.PurePath(
                            output_dir,
                            f"{plate}_normalized_feature_select_batch.csv.gz",
                        )
                    if feature_select_features == "infer" and self.noncanonical:
                        feature_select_features = cyto_utils.infer_cp_features(
                            batch_df, compartments=self.compartments)

                    df = fs_df.query("Metadata_Plate==@plate").reset_index(
                        drop=True)
                    df = df.drop(columns=["Metadata_batch"])

                    cyto_utils.output(
                        output_filename=feature_select_output_file_batch,
                        df=df,
                        compression_options=self.
                        pipeline_options["compression"],
                        float_format=self.pipeline_options["float_format"],
                    )

                if gct:
                    create_gct_directories(batch)
                    if suffix:
                        stacked_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_{suffix}_batch.csv.gz",
                        )
                        gct_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_{suffix}_batch.gct",
                        )
                    else:
                        stacked_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_batch.csv.gz",
                        )
                        gct_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_batch.gct",
                        )
                    cyto_utils.output(
                        output_filename=stacked_file,
                        df=fs_df,
                        compression_options=self.
                        pipeline_options["compression"],
                        float_format=self.pipeline_options["float_format"],
                    )
                    write_gct(profiles=fs_df, output_file=gct_file)

        if level == "all":
            fs_df = feature_select(
                profiles=all_plates_df,
                features=feature_select_features,
                operation=feature_select_operations,
            )
            for batch in self.profile_config:
                fs_batch_df = fs_df.loc[fs_df.Metadata_batch ==
                                        batch].reset_index(drop=True)
                for plate in self.profile_config[batch]:
                    output_dir = pathlib.PurePath(".", pipeline_output, batch,
                                                  plate)
                    if suffix:
                        feature_select_output_file_all = pathlib.PurePath(
                            output_dir,
                            f"{plate}_normalized_feature_select_{suffix}_all.csv.gz",
                        )
                    else:
                        feature_select_output_file_all = pathlib.PurePath(
                            output_dir,
                            f"{plate}_normalized_feature_select_all.csv.gz")
                    if feature_select_features == "infer" and self.noncanonical:
                        feature_select_features = cyto_utils.infer_cp_features(
                            all_plates_df, compartments=self.compartments)

                    df = fs_batch_df.query(
                        "Metadata_Plate==@plate").reset_index(drop=True)

                    df = df.drop(columns=["Metadata_batch"])

                    cyto_utils.output(
                        output_filename=feature_select_output_file_all,
                        df=df,
                        compression_options=self.
                        pipeline_options["compression"],
                        float_format=self.pipeline_options["float_format"],
                    )

                if gct:
                    create_gct_directories(batch)
                    if suffix:
                        stacked_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_{suffix}_all.csv.gz",
                        )
                        gct_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_{suffix}_all.gct",
                        )
                    else:
                        stacked_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_all.csv.gz",
                        )
                        gct_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_all.gct",
                        )
                    cyto_utils.output(
                        output_filename=stacked_file,
                        df=fs_batch_df,
                        compression_options=self.
                        pipeline_options["compression"],
                        float_format=self.pipeline_options["float_format"],
                    )
                    write_gct(profiles=fs_batch_df, output_file=gct_file)
Example #25
def aggregate(
    population_df,
    strata=["Metadata_Plate", "Metadata_Well"],
    features="infer",
    operation="median",
    output_file="none",
    compute_object_count=False,
    object_feature="ObjectNumber",
    subset_data_df="none",
    compression_options=None,
    float_format=None,
):
    """Combine population dataframe variables by strata groups using given operation.

    Parameters
    ----------
    population_df : pandas.core.frame.DataFrame
        DataFrame to group and aggregate.
    strata : list of str, default ["Metadata_Plate", "Metadata_Well"]
        Columns to groupby and aggregate.
    features : list of str, default "infer"
        List of features that should be aggregated.
    operation : str, default "median"
        How the data is aggregated. Currently only supports one of ['mean', 'median'].
    output_file : str or file handle, optional
        If provided, will write aggregated profiles to file. If not specified, will return the aggregated profiles.
        We recommend naming the file based on the plate name.
    compute_object_count : bool, default False
        Whether or not to compute object counts.
    object_feature : str, default "ObjectNumber"
        Object number feature. Only used if compute_object_count=True.
    subset_data_df : pandas.core.frame.DataFrame, default "none"
        DataFrame used to subset the input profiles (merged on its columns) before aggregation.
    compression_options : str, optional
        The mechanism to compress.
    float_format : str, optional
        Decimal precision to use in writing output file.

    Returns
    -------
    pandas.core.frame.DataFrame
        DataFrame of aggregated features.

    """

    # Check that the operation is supported
    operation = check_aggregate_operation(operation)

    # Subset the data to specified samples
    if isinstance(subset_data_df, pd.DataFrame):
        population_df = subset_data_df.merge(
            population_df, how="left",
            on=subset_data_df.columns.tolist()).reindex(population_df.columns,
                                                        axis="columns")

    # Subset dataframe to only specified variables if provided
    strata_df = population_df.loc[:, strata]

    # Only extract single object column in preparation for count
    if compute_object_count:
        count_object_df = population_df.loc[:, np.union1d(strata, [object_feature])]
        count_object_df = (
            count_object_df.groupby(strata)[object_feature]
            .count()
            .reset_index()
            .rename(columns={f"{object_feature}": "Metadata_Object_Count"})
        )

    if features == "infer":
        features = infer_cp_features(population_df)

    population_df = population_df.loc[:, features]

    # Fix dtype of input features (they should all be floats!)
    convert_dict = {x: float for x in features}
    population_df = population_df.astype(convert_dict)

    # Merge back metadata used to aggregate by
    population_df = pd.concat([strata_df, population_df], axis="columns")

    # Perform aggregating function
    population_df = population_df.groupby(strata, dropna=False)

    if operation == "median":
        population_df = population_df.median().reset_index()
    else:
        population_df = population_df.mean().reset_index()

    # Compute objects counts
    if compute_object_count:
        population_df = count_object_df.merge(population_df,
                                              on=strata,
                                              how="right")

    # Aggregated image number and object number do not make sense
    for col in ["ImageNumber", "ObjectNumber"]:
        if col in population_df.columns:
            population_df = population_df.drop([col], axis="columns")

    if output_file != "none":
        output(
            df=population_df,
            output_filename=output_file,
            compression_options=compression_options,
            float_format=float_format,
        )

    return population_df
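
# A minimal usage sketch of aggregate() on a toy single-cell table (synthetic data
# for illustration only; assumes the module-level imports used by aggregate(),
# i.e. pandas as pd, numpy as np, and infer_cp_features).
example_df = pd.DataFrame(
    {
        "Metadata_Plate": ["plate1"] * 4,
        "Metadata_Well": ["A01", "A01", "A02", "A02"],
        "ObjectNumber": [1, 2, 1, 2],
        "Cells_AreaShape_Area": [100.0, 120.0, 80.0, 90.0],
    }
)
well_profiles = aggregate(
    population_df=example_df,
    strata=["Metadata_Plate", "Metadata_Well"],
    features="infer",
    operation="median",
    compute_object_count=True,
)
# well_profiles has one row per well: the median of each CellProfiler feature
# plus a Metadata_Object_Count column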
    def pipeline_quality_control(self, steps):
        quality_control_steps = steps["operations"]
        pipeline_output = self.pipeline["output_dir"]

        summary_column_order = [
            "Batch_Name",
            "Plate_Name",
            "Well_Count",
            "Images_per_site",
            "Sites_per_well_Median",
            "Sites_per_well_mad",
        ]

        qc_dir = pathlib.PurePath(".", "quality_control")
        if not os.path.isdir(pathlib.PurePath(qc_dir)):
            os.mkdir(qc_dir)

        for step in quality_control_steps:
            if step == "summary":
                output_dir = pathlib.PurePath(".", "quality_control",
                                              "summary")
                if not os.path.isdir(pathlib.PurePath(output_dir)):
                    os.mkdir(output_dir)
                output_file = pathlib.PurePath(output_dir, "summary.tsv")
                if os.path.isfile(output_file):
                    summary = pd.read_csv(output_file, sep="\t")
                else:
                    summary = pd.DataFrame()
                for batch in self.profile_config:
                    for plate in self.profile_config[batch]:
                        input_file = pathlib.PurePath(".", "load_data_csv",
                                                      batch, plate,
                                                      "load_data.csv.gz")
                        df = pd.read_csv(input_file).assign(
                            Metadata_batch=batch)

                        site_df = (df.groupby([
                            "Metadata_Row", "Metadata_Col"
                        ]).Metadata_Site.count().reset_index().Metadata_Site)
                        image_count = len(
                            df.columns[df.columns.str.startswith("FileName")])

                        summary = summary.append(
                            {
                                "Batch_Name": batch,
                                "Plate_Name": plate,
                                "Well_Count": site_df.count(),
                                "Images_per_site": image_count,
                                "Sites_per_well_Median": site_df.median(),
                                "Sites_per_well_mad": "%.3f" % site_df.mad(),
                            },
                            ignore_index=True,
                        )

                summary["Well_Count"] = summary["Well_Count"].astype(int)
                summary["Images_per_site"] = summary["Images_per_site"].astype(
                    int)
                summary["Sites_per_well_Median"] = summary[
                    "Sites_per_well_Median"].astype(int)

                summary = summary.drop_duplicates(
                    subset=["Batch_Name", "Plate_Name"],
                    keep="last").sort_values(by=["Batch_Name", "Plate_Name"])

                summary[summary_column_order].to_csv(output_file,
                                                     sep="\t",
                                                     index=False)
            elif step == "heatmap":
                output_dir = pathlib.PurePath(".", "quality_control",
                                              "heatmap")
                if not os.path.isdir(pathlib.PurePath(output_dir)):
                    os.mkdir(output_dir)
                for batch in self.profile_config:
                    for plate in self.profile_config[batch]:
                        input_file = pathlib.PurePath(
                            ".",
                            pipeline_output,
                            batch,
                            plate,
                            f"{plate}_augmented.csv.gz",
                        )
                        df = (pd.read_csv(input_file).assign(
                            Metadata_Row=lambda x: x.Metadata_Well.str[0:1]
                        ).assign(
                            Metadata_Col=lambda x: x.Metadata_Well.str[1:]))
                        if "Metadata_Object_Count" in df.columns:
                            cell_count_feature = "Metadata_Object_Count"
                        else:
                            cell_count_feature = "Cytoplasm_Number_Object_Number"

                        df = df[[
                            "Metadata_Row", "Metadata_Col", cell_count_feature
                        ]]
                        df_pivot = df.pivot(
                            index="Metadata_Row",
                            columns="Metadata_Col",
                            values=cell_count_feature,
                        )

                        fig = px.imshow(df_pivot,
                                        color_continuous_scale="blues")
                        fig.update_layout(
                            title=f"Plate: {plate}, Feature: {cell_count_feature}",
                            xaxis=dict(title="", side="top"),
                            yaxis=dict(title=""),
                        )
                        fig.update_traces(xgap=1, ygap=1)

                        if not os.path.isdir(
                                pathlib.PurePath(output_dir, batch)):
                            os.mkdir(pathlib.PurePath(output_dir, batch))
                        if not os.path.isdir(
                                pathlib.PurePath(output_dir, batch, plate)):
                            os.mkdir(pathlib.PurePath(output_dir, batch,
                                                      plate))

                        output_file = (
                            f"{output_dir}/{batch}/{plate}/{plate}_cell_count.png"
                        )
                        fig.write_image(output_file,
                                        width=640,
                                        height=480,
                                        scale=2)

                        # Use the first negcon feature-selected profile found,
                        # checking the "all", "batch", and "plate" suffixes in turn
                        for scope in ("all", "batch", "plate"):
                            candidate_file = pathlib.PurePath(
                                ".",
                                pipeline_output,
                                batch,
                                plate,
                                f"{plate}_normalized_feature_select_negcon_{scope}.csv.gz",
                            )
                            if os.path.isfile(candidate_file):
                                input_file = candidate_file
                                break
                        else:
                            continue

                        df = pd.read_csv(input_file)
                        profiles = df[cyto_utils.infer_cp_features(df)]

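                        # Pearson correlation of each well profile against every other
                        # well profile (np.corrcoef treats rows as variables), giving a
                        # wells x wells matrix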
                        corr_matrix = np.corrcoef(profiles)

                        corr_matrix_df = pd.DataFrame(
                            corr_matrix,
                            columns=list(df.Metadata_Well),
                            index=list(df.Metadata_Well),
                        )

                        fig = px.imshow(corr_matrix_df,
                                        color_continuous_scale="BlueRed")
                        fig.update_layout(
                            title=f"Plate: {plate}, Correlation all vs. all",
                            xaxis=dict(title="Wells"),
                            yaxis=dict(title="Wells"),
                        )
                        output_file = (
                            f"{output_dir}/{batch}/{plate}/{plate}_correlation.png"
                        )
                        fig.write_image(output_file,
                                        width=640,
                                        height=480,
                                        scale=2)

                        corr_df = (
                            corr_matrix_df.stack()
                            .reset_index()
                            .rename(
                                columns={
                                    "level_0": "Well_Row",
                                    "level_1": "Well_Col",
                                    0: "correlation",
                                }
                            )
                            .assign(Row=lambda x: x.Well_Row.str[0:1])
                            .assign(Col=lambda x: x.Well_Row.str[1:])
                        )

                        corr_df["same_row_col"] = corr_df.apply(
                            lambda x: str(x.Row) in str(x.Well_Col) or str(
                                x.Col) in str(x.Well_Col),
                            axis=1,
                        )

                        wells = list(df.Metadata_Well)
                        table_df = pd.DataFrame()

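                        # For each well, "signal" holds its correlation to wells that
                        # share its row or column and "null" holds its correlation to
                        # all other wells; the position-effect value is the fraction
                        # of signal correlations above the null's 95th percentile.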
                        for well in wells:
                            signal = list(corr_df.loc[
                                (corr_df.Well_Row == well)
                                & (corr_df.same_row_col)]["correlation"])
                            null = list(
                                corr_df.loc[(corr_df.Well_Row == well)
                                            & (corr_df.same_row_col == False)]
                                ["correlation"])

                            perc_95 = np.nanpercentile(null, 95)
                            above_threshold = signal > perc_95
                            value = np.mean(above_threshold.astype(float))

                            table_df = table_df.append(
                                {
                                    "Metadata_Row": well[0:1],
                                    "Metadata_Col": well[1:],
                                    "value": value,
                                },
                                ignore_index=True,
                            )

                        df_pivot = table_df.pivot(
                            index="Metadata_Row",
                            columns="Metadata_Col",
                            values="value",
                        )

                        fig = px.imshow(df_pivot,
                                        color_continuous_scale="blues")
                        fig.update_layout(
                            title=f"Plate: {plate}, Position effect",
                            xaxis=dict(title="", side="top"),
                            yaxis=dict(title=""),
                        )
                        fig.update_traces(xgap=1, ygap=1)

                        output_file = (
                            f"{output_dir}/{batch}/{plate}/{plate}_position_effect.png"
                        )
                        fig.write_image(output_file,
                                        width=640,
                                        height=480,
                                        scale=2)
Example #27
from cytominer_eval.transform import metric_melt
from cytominer_eval.operations.util import assign_replicates

# In[2]:

output_dir = "results"

file = pathlib.Path("data/cell_health_merged_feature_select.csv.gz")
cell_health_df = pd.read_csv(file)

print(cell_health_df.shape)
cell_health_df.head()

# In[3]:

features = infer_cp_features(cell_health_df)
meta_features = infer_cp_features(cell_health_df, metadata=True)

similarity_metric = "pearson"
operation = "percent_strong"

replicate_groups = [
    "Metadata_cell_line", "Metadata_gene_name", "Metadata_pert_name"
]

control_ids = ["Chr2", "Luc", "LacZ"]

# In[4]:

# Melt the input profiles to long format
# (arguments below are assumed from the variables defined above)
similarity_melted_df = metric_melt(
    df=cell_health_df,
    features=features,
    metadata_features=meta_features,
    similarity_metric=similarity_metric,
    eval_metric=operation,
)
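
# A hedged sketch of the usual next step with the imported assign_replicates helper:
# flag which pairwise comparisons are replicates of the same perturbation. The keyword
# arguments here are assumptions, not taken from the original notebook.
similarity_melted_df = assign_replicates(
    similarity_melted_df=similarity_melted_df,
    replicate_groups=replicate_groups,
)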
print(feature_df.shape)
feature_df.head()

# In[6]:

# Perform spherize transform
for file in data_files:
    # Extract plate from file name
    plate = str(file).split("/")[-1].split("_")[0]
    print(f"Now processing {plate}...")

    # Load data and apply feature selection
    df = pd.read_csv(file).reindex(feature_df.index, axis="columns")

    # Get feature names
    metadata_cols = ["Image_Metadata_Well"] + infer_cp_features(df,
                                                                metadata=True)
    feature_cols = infer_cp_features(
        df, compartments=["Cells", "Cytoplasm", "Nuclei"])

    output_file = pathlib.Path(f"{data_dir}/{plate}_{output_file_suffix}")

    # Apply spherize transformation and output files
    normalize(profiles=df,
              features=feature_cols,
              meta_features=metadata_cols,
              method="spherize",
              spherize_method="ZCA-cor",
              spherize_center=True,
              output_file=output_file)
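
# A quick sanity check (a sketch, not part of the original notebook): with the default
# samples="all", a ZCA-cor spherize transform should leave the transformed features
# with near-zero off-diagonal covariance. Assumes numpy is imported as np.
spherized_df = pd.read_csv(output_file)
spherized_features = infer_cp_features(
    spherized_df, compartments=["Cells", "Cytoplasm", "Nuclei"])
feature_cov = np.cov(spherized_df.loc[:, spherized_features].transpose())
off_diagonal = feature_cov - np.diag(np.diag(feature_cov))
print(f"Mean |off-diagonal covariance|: {np.abs(off_diagonal).mean():.4f}")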
Example #29
# In[17]:


# Other data
other_df = pd.concat(other_dict_df).sample(frac=1).reset_index(drop=True)
other_df = normalize_sc(other_df, scaler_method=scaler_method)

print(other_df.shape)


# ## Apply Feature Selection

# In[18]:


meta_features = infer_cp_features(train_df, metadata=True)
meta_features


# In[19]:


train_df = feature_select(
    train_df,
    operation=feature_select_opts,
    na_cutoff=na_cutoff,
    corr_threshold=corr_threshold
)

selected_features = infer_cp_features(train_df)
reindex_features = meta_features + selected_features
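
# Hedged sketch of the typical follow-up (not taken from the original notebook):
# carry the training-set feature selection over to the held-out data by reindexing
# both frames with the same metadata + selected feature columns.
train_df = train_df.reindex(reindex_features, axis="columns")
other_df = other_df.reindex(reindex_features, axis="columns")
print(train_df.shape, other_df.shape)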
Example #30
    def aggregate_deep(self):
        """
        Main function of this class. Aggregates the profiles into a pandas dataframe.

        For each key in file_aggregate, the profiles are loaded, concatenated and then aggregated.
        If files are missing, we throw a warning but continue the code.
        After aggregation, the metadata is concatenated back onto the dataframe.

        Returns
        -------
        df_out : pandas.dataframe
            dataframe with all metadata and the feature space.
            This is the input to any further pycytominer or pycytominer-eval processing
        """
        if not hasattr(self, "file_aggregate"):
            self.setup_aggregate()

        self.aggregated_profiles = []
        self.aggregate_merge_col = f"Metadata_{self.aggregate_on.capitalize()}_Position"

        # Iterates over all sites, wells or plates
        for metadata_level in self.file_aggregate:
            # uses custom load function to create df with metadata and profiles
            arr = [load_npz(x) for x in self.file_aggregate[metadata_level]["files"]]
            # empty dataframes from missing files are deleted
            arr = [x for x in arr if not x.empty]
            # if no files were found, there is a mismatch between the index and the output files
            if not len(arr):
                warnings.warn(
                    f"No files for the key {metadata_level} could be found.\nThis program will continue, but be aware that this might induce errors!"
                )
                continue
            df = pd.concat(arr)

            # extract metadata prior to aggregation
            meta_df = pd.DataFrame()
            metadata_cols = infer_cp_features(df, metadata=True)
            profiles = [x for x in df.columns.tolist() if x not in metadata_cols]

            # If all rows share the same value for a metadata column, that value carries over to the aggregated df
            for col in metadata_cols:
                if len(df[col].unique()) == 1:
                    meta_df[col] = [df[col].unique()[0]]

            # perform the aggregation
            df = df.assign(Metadata_Aggregate_On=self.aggregate_on)
            df = aggregate.aggregate(
                population_df=df,
                strata="Metadata_Aggregate_On",
                features=profiles,
                operation=self.aggregate_operation,
            ).reset_index(drop=True)

            # add the aggregation level as a column
            df.loc[:, self.aggregate_merge_col] = metadata_level
            # concatenate the metadata back onto the aggregated profile
            df = pd.concat([df, meta_df], axis=1)

            # save metalevel file
            if self.output_file != "none":
                if not os.path.exists(self.output_file):
                    os.mkdir(self.output_file)
                file_path = os.path.join(
                    self.output_file, metadata_level.replace("/", "_")
                )
                df.to_csv(f"{file_path}.csv", index=False)
            self.aggregated_profiles.append(df)

        # Concatenate all of the above created profiles
        self.aggregated_profiles = pd.concat(self.aggregated_profiles).reset_index(
            drop=True
        )

        # clean and reindex columns
        self.aggregated_profiles.columns = [
            str(x) for x in self.aggregated_profiles.columns
        ]
        meta_features = infer_cp_features(self.aggregated_profiles, metadata=True)
        reindex_profiles = [str(x) for x in profiles]
        self.aggregated_profiles = self.aggregated_profiles.reindex(
            meta_features + reindex_profiles, axis="columns"
        )

        # If Columns have NaN values from concatenation, drop these
        self.aggregated_profiles.dropna(axis="columns", inplace=True)

        df_out = self.aggregated_profiles
        return df_out
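
# A minimal, self-contained illustration (separate from the class above) of the
# metadata collapse rule used in aggregate_deep(): a metadata column is carried
# into the aggregated profile only when every row in the group shares one value.
import pandas as pd

demo_df = pd.DataFrame(
    {
        "Metadata_Plate": ["plate1", "plate1", "plate1"],
        "Metadata_Site": [1, 2, 3],
        "Cells_AreaShape_Area": [100.0, 110.0, 90.0],
    }
)
demo_meta_df = pd.DataFrame()
for col in ["Metadata_Plate", "Metadata_Site"]:
    if len(demo_df[col].unique()) == 1:
        demo_meta_df[col] = [demo_df[col].unique()[0]]
print(demo_meta_df)  # keeps Metadata_Plate only; Metadata_Site differs across rows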