Example #1
def test_load_profiles():

    # Loading from a plain CSV file should round-trip the original DataFrame
    profiles = load_profiles(output_data_file)
    pd.testing.assert_frame_equal(data_df, profiles)

    # A gzip-compressed file should load identically
    profiles_gzip = load_profiles(output_data_gzip_file)
    pd.testing.assert_frame_equal(data_df, profiles_gzip)

    # Platemap loading without the metadata ID prefix should also round-trip
    platemap = load_platemap(output_data_comma_file, add_metadata_id=False)
    pd.testing.assert_frame_equal(data_df, platemap)

    # Passing an existing DataFrame should return it unchanged
    profiles_from_frame = load_profiles(data_df)
    pd.testing.assert_frame_equal(data_df, profiles_from_frame)
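The test assumes module-level fixtures (data_df, the output_data_* paths, and the pycytominer loaders) defined elsewhere in the test module. A minimal sketch of that setup, with hypothetical paths:

import os
import tempfile

import pandas as pd
from pycytominer.cyto_utils import load_profiles, load_platemap

# Hypothetical fixture data mirroring what the test expects
tmpdir = tempfile.gettempdir()
output_data_file = os.path.join(tmpdir, "test_data.csv")
output_data_gzip_file = os.path.join(tmpdir, "test_data.csv.gz")
output_data_comma_file = os.path.join(tmpdir, "test_data_comma.csv")

data_df = pd.DataFrame(
    {"Metadata_Well": ["A01", "A02"], "Cells_x": [0.1, 0.2]}
)

data_df.to_csv(output_data_file, index=False)
data_df.to_csv(output_data_gzip_file, index=False, compression="gzip")
data_df.to_csv(output_data_comma_file, index=False)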
Example #2
def normalize(
    profiles,
    features="infer",
    meta_features="infer",
    samples="all",
    method="standardize",
    output_file="none",
    compression=None,
    float_format=None,
    whiten_center=True,
    whiten_method="ZCA",
):
    """
    Normalize features

    Arguments:
    profiles - either pandas DataFrame or a file that stores profile data
    features - list of cell painting features [default: "infer"]
               if "infer", then assume cell painting features are those that
               start with "Cells", "Nuclei", or "Cytoplasm"
    meta_features - if specified, then output these with specified features
                    [default: "infer"]
    samples - string indicating which samples to subset when fitting the scaler;
              control samples are often used here [default: 'all']
              the string is passed to a pd.query() call, so format it accordingly. An
              example is "Metadata_treatment == 'control'" (include all quotes)
    method - string indicating how the dataframe will be normalized
             [default: 'standardize']
    output_file - [default: "none"] if provided, will write normalized profiles to file
                  if not specified, will return the normalized profiles. We recommend
                  that this output file be suffixed with "_normalized.csv".
    compression - the mechanism to compress [default: None]
    float_format - decimal precision to use in writing output file [default: None]
                   For example, use "%.3g" for 3 decimal precision.
    whiten_center - if data should be centered before whitening transform [default: True]
                    (only used if method = "whiten")
    whiten_method - the type of whitening normalization used [default: 'ZCA']
                    (only used if method = "whiten")

    Return:
    A normalized DataFrame
    """

    # Load Data
    profiles = load_profiles(profiles)

    # Define which scaler to use
    method = method.lower()

    avail_methods = ["standardize", "robustize", "mad_robustize", "whiten"]
    assert method in avail_methods, "method must be one of {}".format(
        avail_methods)

    if method == "standardize":
        scaler = StandardScaler()
    elif method == "robustize":
        scaler = RobustScaler()
    elif method == "mad_robustize":
        scaler = RobustMAD()
    elif method == "whiten":
        scaler = Whiten(center=whiten_center, method=whiten_method)

    if features == "infer":
        features = infer_cp_features(profiles)

    # Separate out the features and meta
    feature_df = profiles.loc[:, features]
    if meta_features == "infer":
        meta_features = infer_cp_features(profiles, metadata=True)

    meta_df = profiles.loc[:, meta_features]

    # Fit the sklearn scaler
    if samples == "all":
        fitted_scaler = scaler.fit(feature_df)
    else:
        # Subset to only the features measured in the sample query
        fitted_scaler = scaler.fit(profiles.query(samples).loc[:, features])

    # Scale the feature dataframe
    feature_df = pd.DataFrame(
        fitted_scaler.transform(feature_df),
        columns=feature_df.columns,
        index=feature_df.index,
    )

    normalized = meta_df.merge(feature_df, left_index=True, right_index=True)

    if output_file != "none":
        output(
            df=normalized,
            output_filename=output_file,
            compression=compression,
            float_format=float_format,
        )
    else:
        return normalized
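A minimal usage sketch for this version (hypothetical column names; assumes the function and its dependencies are imported as above):

import pandas as pd

# Hypothetical profile data: one metadata column plus two features
data_df = pd.DataFrame(
    {
        "Metadata_treatment": ["control", "control", "drug", "drug"],
        "Cells_a": [1.0, 2.0, 8.0, 4.0],
        "Nuclei_b": [3.0, 1.0, 7.0, 5.0],
    }
)

# Standardize all features, fitting the scaler on control wells only
normalized_df = normalize(
    profiles=data_df,
    samples="Metadata_treatment == 'control'",
    method="standardize",
)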
Example #3
def annotate(
    profiles,
    platemap,
    cell_id="unknown",
    join_on=["Metadata_well_position", "Metadata_Well"],
    output_file="none",
    add_metadata_id_to_platemap=True,
    format_broad_cmap=False,
    perturbation_mode="none",
    external_metadata="none",
    external_join_left="none",
    external_join_right="none",
    compression=None,
    float_format=None,
):
    """
    Add metadata to aggregated profiles

    Arguments:
    profiles - either pandas DataFrame or a file that stores profile data
    platemap - either pandas DataFrame or a file that stores platemap metadata
    cell_id - [default: "unknown"] provide a string to annotate cell id column
    join_on - list of length two indicating which variables to use to merge profiles and platemap
              [default: ["Metadata_well_position", "Metadata_Well"]]. The first element
              indicates variable(s) in platemap and the second element indicates
              variable(s) in profiles to merge using.
              Note the setting of `add_metadata_id_to_platemap`
    output_file - [default: "none"] if provided, will write annotated profiles to file
                  if not specified, will return the annotated profiles. We recommend
                  that this output file be suffixed with "_augmented.csv".
    add_metadata_id_to_platemap - [default: True] boolean if the platemap variables possibly need "Metadata" pre-pended
    format_broad_cmap - [default: False] boolean if we need to add columns to make
                        compatible with Broad CMAP naming conventions.
    perturbation_mode - [default: "none"] either "chemical", "genetic", or "none"; only
                        active if format_broad_cmap == True
    external_metadata - [default: "none"] a string indicating a file with additional
                        metadata information
    external_join_left - [default: "none"] the merge column in the profile metadata
    external_join_right - [default: "none"] the merge column in the external metadata
    compression - the mechanism to compress [default: None]
    float_format - decimal precision to use in writing output file [default: None]
                       For example, use "%.3g" for 3 decimal precision.

    Return:
    Pandas DataFrame of annotated profiles or written to file
    """

    # Load Data
    profiles = load_profiles(profiles)
    platemap = load_platemap(platemap, add_metadata_id_to_platemap)

    annotated = platemap.merge(profiles,
                               left_on=join_on[0],
                               right_on=join_on[1],
                               how="inner").drop(join_on[0], axis="columns")

    if format_broad_cmap:

        pert_opts = ["none", "chemical", "genetic"]
        assert (perturbation_mode in pert_opts
                ), "perturbation mode must be one of {}".format(pert_opts)

        assert (
            "Metadata_broad_sample" in annotated.columns
        ), "Are you sure this is a CMAP file? 'Metadata_broad_sample' column not found."

        annotated = annotated.assign(
            Metadata_pert_id=annotated.Metadata_broad_sample.str.extract(
                r"(BRD[-N][A-Z0-9]+)"),
            Metadata_pert_mfc_id=annotated.Metadata_broad_sample,
            Metadata_pert_well=annotated.loc[:, join_on[1]],
            Metadata_pert_id_vendor="",
        )

        if "Metadata_pert_iname" in annotated.columns:
            annotated = annotated.assign(
                Metadata_pert_mfc_desc=annotated.Metadata_pert_iname,
                Metadata_pert_name=annotated.Metadata_pert_iname,
            )

        if "Metadata_cell_id" not in annotated.columns:
            annotated = annotated.assign(Metadata_cell_id=cell_id)

        if perturbation_mode == "chemical":
            annotated = annotated.assign(Metadata_broad_sample_type=[
                "control" if x in ["DMSO", np.nan] else "trt"
                for x in annotated.Metadata_broad_sample
            ])

            # Recode Metadata_broad_sample for control samples
            annotated.loc[annotated.Metadata_broad_sample_type == "control",
                          "Metadata_broad_sample", ] = "DMSO"
            annotated.loc[annotated.Metadata_broad_sample == "empty",
                          "Metadata_broad_sample_type"] = "empty"

            if "Metadata_mmoles_per_liter" in annotated.columns:
                annotated.loc[annotated.Metadata_broad_sample_type ==
                              "control", "Metadata_mmoles_per_liter", ] = 0

            if "Metadata_solvent" in annotated.columns:
                annotated = annotated.assign(
                    Metadata_pert_vehicle=annotated.Metadata_solvent)
            if "Metadata_mg_per_ml" in annotated.columns:
                annotated.loc[annotated.Metadata_broad_sample_type ==
                              "control", "Metadata_mg_per_ml", ] = 0

        if perturbation_mode == "genetic":
            if "Metadata_pert_name" in annotated.columns:
                annotated = annotated.assign(Metadata_broad_sample_type=[
                    "control" if x == "EMPTY" else "trt"
                    for x in annotated.Metadata_pert_name
                ])

        if "Metadata_broad_sample_type" in annotated.columns:
            annotated = annotated.assign(
                Metadata_pert_type=annotated.Metadata_broad_sample_type)
        else:
            annotated = annotated.assign(Metadata_pert_type="",
                                         Metadata_broad_sample_type="")

    # Add external metadata, if provided
    if not isinstance(external_metadata, pd.DataFrame):
        if external_metadata != "none":
            assert os.path.exists(
                external_metadata
            ), "external metadata at {} does not exist".format(
                external_metadata)

            external_metadata = pd.read_csv(external_metadata)

    if isinstance(external_metadata, pd.DataFrame):
        external_metadata.columns = [
            "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
            for x in external_metadata.columns
        ]

        annotated = (annotated.merge(
            external_metadata,
            left_on=external_join_left,
            right_on=external_join_right,
            how="left",
        ).reset_index(drop=True).drop_duplicates())

    # Reorder annotated metadata columns
    meta_cols = infer_cp_features(annotated, metadata=True)
    other_cols = annotated.drop(meta_cols, axis="columns").columns.tolist()

    annotated = annotated.loc[:, meta_cols + other_cols]

    if output_file != "none":
        output(
            df=annotated,
            output_filename=output_file,
            compression=compression,
            float_format=float_format,
        )
    else:
        return annotated
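A brief usage sketch for this version (hypothetical column names), applying the Broad CMAP chemical-perturbation conventions:

import pandas as pd

# Hypothetical aggregated profiles and plate map
profiles_df = pd.DataFrame(
    {"Metadata_Well": ["A01", "A02"], "Cells_x": [0.1, 0.2]}
)
platemap_df = pd.DataFrame(
    {
        "Metadata_well_position": ["A01", "A02"],
        "Metadata_broad_sample": ["DMSO", "BRD-K12345678"],
    }
)

# Annotate and add CMAP columns; DMSO wells become "control"
annotated_df = annotate(
    profiles=profiles_df,
    platemap=platemap_df,
    format_broad_cmap=True,
    perturbation_mode="chemical",
)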
Example #4
def annotate(
    profiles,
    platemap,
    join_on=["Metadata_well_position", "Metadata_Well"],
    output_file="none",
    add_metadata_id_to_platemap=True,
    format_broad_cmap=False,
    clean_cellprofiler=True,
    external_metadata="none",
    external_join_left="none",
    external_join_right="none",
    compression_options=None,
    float_format=None,
    cmap_args={},
):
    """Add metadata to aggregated profiles.

    Parameters
    ----------
    profiles : pandas.core.frame.DataFrame or file
        DataFrame or file path of profiles.
    platemap : pandas.core.frame.DataFrame or file
        Dataframe or file path of platemap metadata.
    join_on : list or str, default ["Metadata_well_position", "Metadata_Well"]
        Which variables to use to merge the profiles and platemap. The first element indicates variable(s) in platemap and the second element indicates variable(s) in profiles to merge using. Note the setting of `add_metadata_id_to_platemap`.
    output_file : str, optional
        If provided, will write annotated profiles to file. If not specified, will return the annotated profiles. We recommend that this output file be suffixed with "_augmented.csv".
    add_metadata_id_to_platemap : bool, default True
        Whether the plate map variables possibly need "Metadata" pre-pended
    format_broad_cmap : bool, default False
        Whether we need to add columns to make compatible with Broad CMAP naming conventions.
    clean_cellprofiler : bool, default True
        Clean specific CellProfiler feature names.
    external_metadata : str, optional
        File with additional metadata information
    external_join_left : str, optional
        Merge column in the profile metadata.
    external_join_right : str, optional
        Merge column in the external metadata.
    compression_options : str or dict, optional
        Contains compression options as input to
        pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2.
    float_format : str, optional
        Decimal precision to use in writing output file as input to
        pd.DataFrame.to_csv(float_format=float_format). For example, use "%.3g" for 3
        decimal precision.
    cmap_args : dict, default {}
        Potential keyword arguments for annotate_cmap(). See cyto_utils/annotate_custom.py for more details.

    Returns
    -------
    annotated : pandas.core.frame.DataFrame, optional
        DataFrame of annotated features. If output_file="none", then return the
        DataFrame. If you specify output_file, then write to file and do not return
        data.
    """

    # Load Data
    profiles = load_profiles(profiles)
    platemap = load_platemap(platemap, add_metadata_id_to_platemap)

    annotated = platemap.merge(profiles,
                               left_on=join_on[0],
                               right_on=join_on[1],
                               how="inner").drop(join_on[0], axis="columns")

    # Add specific Connectivity Map (CMAP) formatting
    if format_broad_cmap:
        annotated = annotate_cmap(annotated,
                                  annotate_join_on=join_on[1],
                                  **cmap_args)

    if clean_cellprofiler:
        annotated = cp_clean(annotated)

    if not isinstance(external_metadata, pd.DataFrame):
        if external_metadata != "none":
            assert os.path.exists(
                external_metadata
            ), "external metadata at {} does not exist".format(
                external_metadata)

            external_metadata = pd.read_csv(external_metadata)

    if isinstance(external_metadata, pd.DataFrame):
        external_metadata.columns = [
            "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
            for x in external_metadata.columns
        ]

        annotated = (annotated.merge(
            external_metadata,
            left_on=external_join_left,
            right_on=external_join_right,
            how="left",
        ).reset_index(drop=True).drop_duplicates())

    # Reorder annotated metadata columns
    meta_cols = infer_cp_features(annotated, metadata=True)
    other_cols = annotated.drop(meta_cols, axis="columns").columns.tolist()

    annotated = annotated.loc[:, meta_cols + other_cols]

    if output_file != "none":
        output(
            df=annotated,
            output_filename=output_file,
            compression_options=compression_options,
            float_format=float_format,
        )
    else:
        return annotated
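A sketch for this version (hypothetical column names), merging an external metadata table after the platemap join:

import pandas as pd

profiles_df = pd.DataFrame(
    {"Metadata_Well": ["A01", "A02"], "Cells_x": [0.1, 0.2]}
)
platemap_df = pd.DataFrame(
    {
        "Metadata_well_position": ["A01", "A02"],
        "Metadata_broad_sample": ["DMSO", "BRD-K12345678"],
    }
)
external_df = pd.DataFrame(
    {
        "Metadata_broad_sample": ["BRD-K12345678"],
        "moa": ["inhibitor"],  # renamed to Metadata_moa before merging
    }
)

annotated_df = annotate(
    profiles=profiles_df,
    platemap=platemap_df,
    external_metadata=external_df,
    external_join_left="Metadata_broad_sample",
    external_join_right="Metadata_broad_sample",
)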
Example #5
def feature_select(
    profiles,
    features="infer",
    samples="all",
    operation="variance_threshold",
    output_file="none",
    na_cutoff=0.05,
    corr_threshold=0.9,
    corr_method="pearson",
    freq_cut=0.05,
    unique_cut=0.1,
    compression=None,
    float_format=None,
    blocklist_file=None,
    outlier_cutoff=15,
):
    """
    Performs feature selection based on the given operation

    Arguments:
    profiles - either pandas DataFrame or a file that stores profile data
    features - list of cell painting features [default: "infer"]
               if "infer", then assume cell painting features are those that start with
               "Cells", "Nuclei", or "Cytoplasm"
    samples - if provided, a list of samples to perform the operation on
              [default: "all"] - if "all", use all samples in the calculation
    operation - str or list of given operations to perform on input profiles
    output_file - [default: "none"] if provided, will write feature selected profiles
                  to file; if not specified, will return the feature selected profiles.
                  We recommend that this output file be suffixed with
                  "_normalized_variable_selected.csv".
    na_cutoff - proportion of missing values in a column to tolerate before removing
                [default: 0.05]
    corr_threshold - float between (0, 1) to exclude features above [default: 0.9]
    freq_cut - float of ratio (2nd most common feature val / most common) [default: 0.05]
    unique_cut - float of ratio (num unique features / num samples) [default: 0.1]
    compression - the mechanism to compress [default: None]
    float_format - decimal precision to use in writing output file [default: None]
                   For example, use "%.3g" for 3 decimal precision.
    blocklist_file - file location of dataframe with features to exclude [default: None]
                     Note that if "blocklist" in operation then will remove standard
                     blocklist
    outlier_cutoff - the threshold at which the maximum or minimum value of a feature
                     across a full experiment is excluded [default: 15]. Note that this
                     procedure is typically applied after normalization (the default is
                     suitable for normalized data).
    """
    all_ops = [
        "variance_threshold",
        "correlation_threshold",
        "drop_na_columns",
        "blocklist",
        "drop_outliers",
    ]

    # Make sure the user provides a supported operation
    if isinstance(operation, list):
        assert all([x in all_ops for x in operation
                    ]), "Some operation(s) {} not supported. Choose {}".format(
                        operation, all_ops)
    elif isinstance(operation, str):
        assert operation in all_ops, "{} not supported. Choose {}".format(
            operation, all_ops)
        operation = operation.split()
    else:
        raise ValueError("Operation must be a list or string")

    # Load Data
    profiles = load_profiles(profiles)

    if features == "infer":
        features = infer_cp_features(profiles)

    excluded_features = []
    for op in operation:
        if op == "variance_threshold":
            exclude = variance_threshold(
                population_df=profiles,
                features=features,
                samples=samples,
                freq_cut=freq_cut,
                unique_cut=unique_cut,
            )
        elif op == "drop_na_columns":
            exclude = get_na_columns(
                population_df=profiles,
                features=features,
                samples=samples,
                cutoff=na_cutoff,
            )
        elif op == "correlation_threshold":
            exclude = correlation_threshold(
                population_df=profiles,
                features=features,
                samples=samples,
                threshold=corr_threshold,
                method=corr_method,
            )
        elif op == "blocklist":
            if blocklist_file:
                exclude = get_blocklist_features(population_df=profiles,
                                                 blocklist_file=blocklist_file)
            else:
                exclude = get_blocklist_features(population_df=profiles)
        elif op == "drop_outliers":
            exclude = drop_outlier_features(
                population_df=profiles,
                features=features,
                samples=samples,
                outlier_cutoff=outlier_cutoff,
            )

        excluded_features += exclude

    excluded_features = list(set(excluded_features))

    selected_df = profiles.drop(excluded_features, axis="columns")

    if output_file != "none":
        output(
            df=selected_df,
            output_filename=output_file,
            compression=compression,
            float_format=float_format,
        )
    else:
        return selected_df
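A usage sketch chaining multiple operations (hypothetical data; na_cutoff=0.0 drops any feature with missing values):

import pandas as pd

# Hypothetical normalized profiles
profiles_df = pd.DataFrame(
    {
        "Metadata_Well": ["A01", "A02", "A03"],
        "Cells_x": [0.1, 0.2, 0.3],
        "Nuclei_y": [0.1, 0.2, None],  # contains a missing value
    }
)

selected_df = feature_select(
    profiles=profiles_df,
    operation=["drop_na_columns", "variance_threshold"],
    na_cutoff=0.0,
)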
Example #6
def feature_select(
    profiles,
    features="infer",
    image_features=False,
    samples="all",
    operation="variance_threshold",
    output_file="none",
    na_cutoff=0.05,
    corr_threshold=0.9,
    corr_method="pearson",
    freq_cut=0.05,
    unique_cut=0.1,
    compression_options=None,
    float_format=None,
    blocklist_file=None,
    outlier_cutoff=15,
    noise_removal_perturb_groups=None,
    noise_removal_stdev_cutoff=None,
):
    """Performs feature selection based on the given operation.

    Parameters
    ----------
    profiles : pandas.core.frame.DataFrame or file
        DataFrame or file of profiles.
    features : list
        A list of strings corresponding to feature measurement column names in the
        `profiles` DataFrame. All features listed must be found in `profiles`.
        Defaults to "infer". If "infer", then assume cell painting features are those
        prefixed with "Cells", "Nuclei", or "Cytoplasm".
    image_features : bool, default False
        Whether the profiles contain image features.
    samples : list or str, default "all"
        Samples to provide operation on.
    operation : list of str or str, default "variance_threshold"
        Operations to perform on the input profiles.
    output_file : str, optional
        If provided, will write feature selected profiles to file. If not specified,
        will return the feature selected profiles as output. We recommend that this
        output file be suffixed with "_normalized_variable_selected.csv".
    na_cutoff : float, default 0.05
        Proportion of missing values in a column to tolerate before removing.
    corr_threshold : float, default 0.9
        Value between (0, 1); if any two features are correlated above this threshold,
        one of the pair is excluded.
    corr_method : str, default "pearson"
        Correlation type to compute. Allowed methods are "spearman", "kendall" and "pearson".
    freq_cut : float, default 0.05
        Ratio (2nd most common feature val / most common).
    unique_cut : float, default 0.1
        Ratio (num unique features / num samples).
    compression_options : str or dict, optional
        Contains compression options as input to
        pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2.
    float_format : str, optional
        Decimal precision to use in writing output file as input to
        pd.DataFrame.to_csv(float_format=float_format). For example, use "%.3g" for 3
        decimal precision.
    blocklist_file : str, optional
        File location of dataframe with features to exclude. Note that if "blocklist"
        is in operation and no file is provided, the standard blocklist is used.
    outlier_cutoff : float, default 15
        The threshold at which the maximum or minimum value of a feature across a full
        experiment is excluded. Note that this procedure is typically applied after
        normalization (the default is suitable for normalized data).
    noise_removal_perturb_groups : str or list of str, optional
        Perturbation groups corresponding to rows in profiles or the name of the
        metadata column containing this information.
    noise_removal_stdev_cutoff : float, optional
        Maximum mean feature standard deviation to be kept for noise removal, grouped
        by the identity of the perturbation from perturb_list. The data must already be
        normalized so that this cutoff can apply to all columns.

    Returns
    -------
    selected_df : pandas.core.frame.DataFrame, optional
        The feature selected profile DataFrame. If output_file="none", then return the
        DataFrame. If you specify output_file, then write to file and do not return
        data.

    """

    all_ops = [
        "variance_threshold",
        "correlation_threshold",
        "drop_na_columns",
        "blocklist",
        "drop_outliers",
        "noise_removal",
    ]

    # Make sure the user provides a supported operation
    if isinstance(operation, list):
        assert all([x in all_ops for x in operation
                    ]), "Some operation(s) {} not supported. Choose {}".format(
                        operation, all_ops)
    elif isinstance(operation, str):
        assert operation in all_ops, "{} not supported. Choose {}".format(
            operation, all_ops)
        operation = operation.split()
    else:
        raise ValueError("Operation must be a list or string")

    # Load Data
    profiles = load_profiles(profiles)

    if features == "infer":
        features = infer_cp_features(profiles, image_features=image_features)

    excluded_features = []
    for op in operation:
        if op == "variance_threshold":
            exclude = variance_threshold(
                population_df=profiles,
                features=features,
                samples=samples,
                freq_cut=freq_cut,
                unique_cut=unique_cut,
            )
        elif op == "drop_na_columns":
            exclude = get_na_columns(
                population_df=profiles,
                features=features,
                samples=samples,
                cutoff=na_cutoff,
            )
        elif op == "correlation_threshold":
            exclude = correlation_threshold(
                population_df=profiles,
                features=features,
                samples=samples,
                threshold=corr_threshold,
                method=corr_method,
            )
        elif op == "blocklist":
            if blocklist_file:
                exclude = get_blocklist_features(population_df=profiles,
                                                 blocklist_file=blocklist_file)
            else:
                exclude = get_blocklist_features(population_df=profiles)
        elif op == "drop_outliers":
            exclude = drop_outlier_features(
                population_df=profiles,
                features=features,
                samples=samples,
                outlier_cutoff=outlier_cutoff,
            )
        elif op == "noise_removal":
            exclude = noise_removal(
                population_df=profiles,
                features=features,
                noise_removal_perturb_groups=noise_removal_perturb_groups,
                noise_removal_stdev_cutoff=noise_removal_stdev_cutoff,
            )
        excluded_features += exclude

    excluded_features = list(set(excluded_features))

    selected_df = profiles.drop(excluded_features, axis="columns")

    if output_file != "none":
        output(
            df=selected_df,
            output_filename=output_file,
            compression_options=compression_options,
            float_format=float_format,
        )
    else:
        return selected_df
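A sketch of the noise_removal operation (hypothetical data; assumes replicate profiles are already normalized):

import pandas as pd

# Hypothetical normalized replicate profiles
profiles_df = pd.DataFrame(
    {
        "Metadata_perturbation": ["drugA", "drugA", "drugB", "drugB"],
        "Cells_x": [0.1, 0.2, -0.1, 0.0],
        "Nuclei_y": [2.0, -2.0, 1.5, -1.8],  # noisy across replicates
    }
)

# Drop features whose mean within-perturbation stdev exceeds the cutoff
selected_df = feature_select(
    profiles=profiles_df,
    features=["Cells_x", "Nuclei_y"],
    operation="noise_removal",
    noise_removal_perturb_groups="Metadata_perturbation",
    noise_removal_stdev_cutoff=1.0,
)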
Example #7
def consensus(
    profiles,
    replicate_columns=["Metadata_Plate", "Metadata_Well"],
    operation="median",
    features="infer",
    output_file="none",
    compression_options=None,
    float_format=None,
    modz_args={"method": "spearman"},
):
    """Form level 5 consensus profile data.

    :param profiles: A file or pandas DataFrame of profile data
    :type profiles: str or pandas.DataFrame
    :param replicate_columns: Metadata columns indicating which replicates to collapse, defaults to ["Metadata_Plate", "Metadata_Well"]
    :type replicate_columns: list
    :param operation: The method used to form consensus profiles, defaults to "median"
    :type operation: str
    :param features: The features to collapse, defaults to "infer"
    :type features: str, list
    :param output_file: If specified, the location to write the file, defaults to "none"
    :type output_file: str
    :param modz_args: Additional custom arguments passed as kwargs if operation="modz". See pycytominer.cyto_utils.modz for more details.
    :type modz_args: dict
    :param compression_options: the method to compress output data, defaults to None. See pycytominer.cyto_utils.output.py for options
    :type compression_options: str
    :param float_format: decimal precision to use in writing output file, defaults to None. For example, use "%.3g" for 3 decimal precision.
    :type float_format: str

    :Example:

    import pandas as pd
    from pycytominer import consensus

    data_df = pd.concat(
        [
            pd.DataFrame(
                {
                    "Metadata_Plate": "X",
                    "Metadata_Well": "a",
                    "Cells_x": [0.1, 0.3, 0.8],
                    "Nuclei_y": [0.5, 0.3, 0.1],
                }
            ),
            pd.DataFrame(
                {
                    "Metadata_Plate": "X",
                    "Metadata_Well": "b",
                    "Cells_x": [0.4, 0.2, -0.5],
                    "Nuclei_y": [-0.8, 1.2, -0.5],
                }
            ),
        ]
    ).reset_index(drop=True)

    consensus_df = consensus(
        profiles=data_df,
        replicate_columns=["Metadata_Plate", "Metadata_Well"],
        operation="median",
        features="infer",
        output_file="none",
    )
    """
    # Confirm that the operation is supported
    check_consensus_operation(operation)

    # Load Data
    profiles = load_profiles(profiles)

    if operation == "modz":
        consensus_df = modz(population_df=profiles,
                            replicate_columns=replicate_columns,
                            features=features,
                            **modz_args)
    else:
        consensus_df = aggregate(
            population_df=profiles,
            strata=replicate_columns,
            features=features,
            operation=operation,
            subset_data_df="none",
        )

    if output_file != "none":
        output(
            df=consensus_df,
            output_filename=output_file,
            compression_options=compression_options,
            float_format=float_format,
        )
    else:
        return consensus_df
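For the modz operation, a brief sketch (reusing data_df from the docstring example above):

# Weighted consensus using the modified z-score (modz) procedure
modz_df = consensus(
    profiles=data_df,
    replicate_columns=["Metadata_Plate", "Metadata_Well"],
    operation="modz",
    modz_args={"method": "spearman"},
)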
Example #8
def annotate(
    profiles,
    platemap,
    join_on=["Metadata_well_position", "Metadata_Well"],
    output_file="none",
    add_metadata_id_to_platemap=True,
    format_broad_cmap=False,
    clean_cellprofiler=True,
    external_metadata="none",
    external_join_left="none",
    external_join_right="none",
    compression_options=None,
    float_format=None,
    cmap_args={},
):
    """
    Add metadata to aggregated profiles

    Arguments:
    profiles - either pandas DataFrame or a file that stores profile data
    platemap - either pandas DataFrame or a file that stores platemap metadata
    join_on - list of length two indicating which variables to use to merge profiles and platemap
              [default: ["Metadata_well_position", "Metadata_Well"]]. The first element
              indicates variable(s) in platemap and the second element indicates
              variable(s) in profiles to merge using.
              Note the setting of `add_metadata_id_to_platemap`
    output_file - [default: "none"] if provided, will write annotated profiles to file
                  if not specified, will return the annotated profiles. We recommend
                  that this output file be suffixed with "_augmented.csv".
    add_metadata_id_to_platemap - [default: True] boolean if the platemap variables possibly need "Metadata" pre-pended
    format_broad_cmap - [default: False] boolean if we need to add columns to make
                        compatible with Broad CMAP naming conventions.
    external_metadata - [default: "none"] a string indicating a file with additional
                        metadata information
    external_join_left - [default: "none"] the merge column in the profile metadata
    external_join_right - [default: "none"] the merge column in the external metadata
    compression_options - the mechanism to compress [default: None]. See cyto_utils/output.py for options.
    float_format - decimal precision to use in writing output file [default: None]
                   For example, use "%.3g" for 3 decimal precision.
    cmap_args - [default: {}] - potential keyword arguments for annotate_cmap().
                See cyto_utils/annotate_cmap.py for more details.

    Return:
    Pandas DataFrame of annotated profiles or written to file
    """

    # Load Data
    profiles = load_profiles(profiles)
    platemap = load_platemap(platemap, add_metadata_id_to_platemap)

    annotated = platemap.merge(profiles,
                               left_on=join_on[0],
                               right_on=join_on[1],
                               how="inner").drop(join_on[0], axis="columns")

    # Add specific Connectivity Map (CMAP) formatting
    if format_broad_cmap:
        annotated = annotate_cmap(annotated,
                                  annotate_join_on=join_on[1],
                                  **cmap_args)

    if clean_cellprofiler:
        annotated = cp_clean(annotated)

    if not isinstance(external_metadata, pd.DataFrame):
        if external_metadata != "none":
            assert os.path.exists(
                external_metadata
            ), "external metadata at {} does not exist".format(
                external_metadata)

            external_metadata = pd.read_csv(external_metadata)

    if isinstance(external_metadata, pd.DataFrame):
        external_metadata.columns = [
            "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
            for x in external_metadata.columns
        ]

        annotated = (annotated.merge(
            external_metadata,
            left_on=external_join_left,
            right_on=external_join_right,
            how="left",
        ).reset_index(drop=True).drop_duplicates())

    # Reorder annotated metadata columns
    meta_cols = infer_cp_features(annotated, metadata=True)
    other_cols = annotated.drop(meta_cols, axis="columns").columns.tolist()

    annotated = annotated.loc[:, meta_cols + other_cols]

    if output_file != "none":
        output(
            df=annotated,
            output_filename=output_file,
            compression_options=compression_options,
            float_format=float_format,
        )
    else:
        return annotated
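A sketch passing custom keyword arguments through cmap_args (reusing the frames from the earlier annotate sketches; the key shown mirrors annotate_cmap()'s perturbation handling but is an assumption here):

annotated_df = annotate(
    profiles=profiles_df,
    platemap=platemap_df,
    format_broad_cmap=True,
    cmap_args={"perturbation_mode": "chemical"},  # assumed annotate_cmap() kwarg
)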
Example #9
def normalize(
    profiles,
    features="infer",
    meta_features="infer",
    samples="all",
    method="standardize",
    output_file="none",
    compression_options=None,
    float_format=None,
    spherize_center=True,
    spherize_method="ZCA-cor",
    spherize_epsilon=1e-6,
):
    """Normalize profiling features

    Parameters
    ----------
    profiles : {pandas.DataFrame, path}
        Either a pandas DataFrame or a file that stores profile data
    features : list
        A list of strings corresponding to feature measurement column names in the
        `profiles` DataFrame. All features listed must be found in `profiles`.
        Defaults to "infer". If "infer", then assume cell painting features are those
        prefixed with "Cells", "Nuclei", or "Cytoplasm".
    meta_features : list
        A list of strings corresponding to metadata column names in the `profiles`
        DataFrame. All features listed must be found in `profiles`. Defaults to "infer".
        If "infer", then assume metadata features are those prefixed with "Metadata"
    samples : str
        The metadata column values to use as a normalization reference. We often use
        control samples. The string is passed to pd.query(), so structure it
        accordingly. An example is "Metadata_treatment == 'control'" (include all
        quotes). Defaults to "all".
    method : str
        How to normalize the dataframe. Defaults to "standardize". Check avail_methods
        for available normalization methods.
    output_file : str
        If provided, will write normalized profiles to file. If not specified, will
        return the normalized profiles as output. We recommend that this output file be
        suffixed with "_normalized.csv". Defaults to "none".
    compression_options : {dict, None}
        Contain compression options as input to
        pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2.
        Defaults to None.
    float_format : {str, None}
        Decimal precision to use in writing output file as input to
        pd.DataFrame.to_csv(float_format=float_format). For example, use "%.3g" for 3
        decimal precision. Defaults to None.
    spherize_center : bool
        If the function should center data before sphering (aka whitening). The
        function only uses this variable if method = "spherize". Defaults to True.
    spherize_method : str
        The sphering (aka whitening) normalization selection. The function only uses
        this variable if method = "spherize". Defaults to "ZCA-corr". See
        :py:func:`pycytominer.operations.transform` for available spherize methods.
    spherize_epsilon : float
        The sphering (aka whitening) fudge factor parameter. The function only uses
        this variable if method = "spherize". Defaults 1e-6.

    Returns
    -------
    pd.DataFrame or None
        The normalized profile DataFrame. If output_file="none", then return the
        DataFrame. If you specify output_file, then write to file and do not return
        data.

    Examples
    --------
    import pandas as pd
    from pycytominer import normalize

    data_df = pd.DataFrame(
        {
            "Metadata_plate": ["a", "a", "a", "a", "b", "b", "b", "b"],
            "Metadata_treatment": [
                "drug",
                "drug",
                "control",
                "control",
                "drug",
                "drug",
                "control",
                "control",
            ],
            "x": [1, 2, 8, 2, 5, 5, 5, 1],
            "y": [3, 1, 7, 4, 5, 9, 6, 1],
            "z": [1, 8, 2, 5, 6, 22, 2, 2],
            "zz": [14, 46, 1, 6, 30, 100, 2, 2],
        }
    ).reset_index(drop=True)

    normalized_df = normalize(
        profiles=data_df,
        features=["x", "y", "z", "zz"],
        meta_features="infer",
        samples="Metadata_treatment == 'control'",
        method="standardize"
    )
    """

    # Load Data
    profiles = load_profiles(profiles)

    # Define which scaler to use
    method = method.lower()

    avail_methods = ["standardize", "robustize", "mad_robustize", "spherize"]
    assert method in avail_methods, "method must be one of {}".format(avail_methods)

    if method == "standardize":
        scaler = StandardScaler()
    elif method == "robustize":
        scaler = RobustScaler()
    elif method == "mad_robustize":
        scaler = RobustMAD()
    elif method == "spherize":
        scaler = Spherize(
            center=spherize_center, method=spherize_method, epsilon=spherize_epsilon
        )

    if features == "infer":
        features = infer_cp_features(profiles)

    # Separate out the features and meta
    feature_df = profiles.loc[:, features]
    if meta_features == "infer":
        meta_features = infer_cp_features(profiles, metadata=True)

    meta_df = profiles.loc[:, meta_features]

    # Fit the sklearn scaler
    if samples == "all":
        fitted_scaler = scaler.fit(feature_df)
    else:
        # Subset to only the features measured in the sample query
        fitted_scaler = scaler.fit(profiles.query(samples).loc[:, features])

    # Scale the feature dataframe
    feature_df = pd.DataFrame(
        fitted_scaler.transform(feature_df),
        columns=feature_df.columns,
        index=feature_df.index,
    )

    normalized = meta_df.merge(feature_df, left_index=True, right_index=True)

    if output_file != "none":
        output(
            df=normalized,
            output_filename=output_file,
            compression_options=compression_options,
            float_format=float_format,
        )
    else:
        return normalized
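A sketch of the spherize method (reusing data_df from the docstring example above):

# Sphere (whiten) the features, fitting on control wells only
spherized_df = normalize(
    profiles=data_df,
    features=["x", "y", "z", "zz"],
    samples="Metadata_treatment == 'control'",
    method="spherize",
    spherize_method="ZCA-cor",
)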