Example #1
    def aggregate_profiles(
        self,
        compute_subsample="False",
        output_file="none",
        compression=None,
        float_format=None,
    ):
        """
        Aggregate and merge compartments. This is the primary entry to this class.

        Arguments:
        compute_subsample - [default: False] boolean if subsample should be computed.
                            NOTE: Must be specified to perform subsampling. Will not
                            apply subsetting if set to False even if subsample is
                            initialized
        output_file - [default: "none"] if provided, will write aggregated profiles to file
                  if not specified, will return the aggregated profiles. We recommend
                  that this output file be suffixed with "_augmented.csv".
        compression - the mechanism to compress [default: None]
        float_format - decimal precision to use in writing output file [default: None]
                           For example, use "%.3g" for 3 decimal precision.

        Return:
        if output_file is set, then write to file; otherwise, return the aggregated DataFrame
        """

        if output_file != "none":
            self.set_output_file(output_file)

        aggregated = (self.aggregate_compartment(
            compartment="cells", compute_subsample=compute_subsample).merge(
                self.aggregate_compartment(compartment="cytoplasm"),
                on=self.strata,
                how="inner",
            ).merge(
                self.aggregate_compartment(compartment="nuclei"),
                on=self.strata,
                how="inner",
            ))

        self.is_aggregated = True

        if self.output_file != "none":
            output(
                df=aggregated,
                output_filename=self.output_file,
                compression=compression,
                float_format=float_format,
            )
        else:
            return aggregated
Example #2
    def aggregate_profiles(
        self,
        compute_subsample=False,
        output_file="none",
        compression_options=None,
        float_format=None,
        n_aggregation_memory_strata=1,
    ):
        """Aggregate and merge compartments. This is the primary entry to this class.

        Parameters
        ----------
        compute_subsample : bool, default False
            Whether or not to compute subsample. Must be set to True to perform
            subsampling; no subsetting is applied when False, even if subsample is
            initialized.
        output_file : str, optional
            The name of a file to output. We recommend that, if provided, the output file be suffixed with "_augmented".
        compression_options : str, optional
            Compression arguments as input to pandas.to_csv() with pandas version >= 1.2.
        float_format : str, optional
            Decimal precision to use in writing output file.
        n_aggregation_memory_strata : int, default 1
            Number of unique strata to pull from the database into working memory
            at once.  Typically 1 is fastest.  A larger number uses more memory.

        Returns
        -------
        pandas.core.frame.DataFrame
            Aggregated profiles if output_file="none"; otherwise, writes to file and returns nothing.

        """

        if output_file != "none":
            self.set_output_file(output_file)

        for compartment_idx, compartment in enumerate(self.compartments):
            if compartment_idx == 0:
                aggregated = self.aggregate_compartment(
                    compartment=compartment,
                    compute_subsample=compute_subsample,
                    compute_counts=True,
                    add_image_features=self.add_image_features,
                    n_aggregation_memory_strata=n_aggregation_memory_strata,
                )
            else:
                aggregated = aggregated.merge(
                    self.aggregate_compartment(
                        compartment=compartment,
                        n_aggregation_memory_strata=n_aggregation_memory_strata,
                    ),
                    on=self.strata,
                    how="inner",
                )

        self.is_aggregated = True

        if self.output_file != "none":
            output(
                df=aggregated,
                output_filename=self.output_file,
                compression_options=compression_options,
                float_format=float_format,
            )
        else:
            return aggregated
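
A minimal usage sketch of the method above. It assumes the enclosing class is constructed as in Example #16 (there called AggregateProfiles); the path and strata are illustrative placeholders.

# Hypothetical usage; the class name and paths are placeholders.
ap = AggregateProfiles(
    "backend/batch1/plate1/plate1.sqlite",
    strata=["Metadata_Plate", "Metadata_Well"],
)

# Return the aggregated dataframe directly...
aggregated_df = ap.aggregate_profiles(compute_subsample=False)

# ...or write it to disk by supplying output_file.
ap.aggregate_profiles(
    output_file="plate1_aggregated.csv.gz",
    compression_options={"method": "gzip"},
    float_format="%.5g",
)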
Example #3
    def merge_single_cells(
        self,
        compute_subsample=False,
        sc_output_file="none",
        compression_options=None,
        float_format=None,
        single_cell_normalize=False,
        normalize_args=None,
    ):
        """Given the linking columns, merge single cell data. Normalization is also supported.

        Parameters
        ----------
        compute_subsample : bool, default False
            Whether or not to compute subsample.
        sc_output_file : str, optional
            The name of a file to output.
        compression_options : str, optional
            Compression arguments as input to pandas.to_csv() with pandas version >= 1.2.
        float_format : str, optional
            Decimal precision to use in writing output file.
        single_cell_normalize : bool, default False
            Whether or not to normalize the single cell data.
        normalize_args : dict, optional
            Additional arguments passed as input to pycytominer.normalize().

        Returns
        -------
        pandas.core.frame.DataFrame
            Merged single cell dataframe if sc_output_file="none"; otherwise, writes to file and returns nothing.

        """

        # Load the single cell dataframe by merging on the specific linking columns
        sc_df = ""
        linking_check_cols = []
        merge_suffix_rename = []
        for left_compartment in self.compartment_linking_cols:
            for right_compartment in self.compartment_linking_cols[left_compartment]:
                # Make sure only one merge per combination occurs
                linking_check = "-".join(sorted([left_compartment, right_compartment]))
                if linking_check in linking_check_cols:
                    continue

                # Specify how to indicate merge suffixes
                merge_suffix = [
                    "_{comp_l}".format(comp_l=left_compartment),
                    "_{comp_r}".format(comp_r=right_compartment),
                ]
                merge_suffix_rename += merge_suffix
                left_link_col = self.compartment_linking_cols[left_compartment][
                    right_compartment
                ]
                right_link_col = self.compartment_linking_cols[right_compartment][
                    left_compartment
                ]

                if sc_df is None:
                    initial_df = self.load_compartment(compartment=left_compartment)

                    if compute_subsample:
                        # Sample cells proportionally by self.strata
                        self.get_subsample(df=initial_df, rename_col=False)

                        subset_logic_df = self.subset_data_df.drop(
                            self.image_df.columns, axis="columns"
                        )

                        initial_df = subset_logic_df.merge(
                            initial_df, how="left", on=subset_logic_df.columns.tolist()
                        ).reindex(initial_df.columns, axis="columns")

                    sc_df = initial_df.merge(
                        self.load_compartment(compartment=right_compartment),
                        left_on=self.merge_cols + [left_link_col],
                        right_on=self.merge_cols + [right_link_col],
                        suffixes=merge_suffix,
                    )
                else:
                    sc_df = sc_df.merge(
                        self.load_compartment(compartment=right_compartment),
                        left_on=self.merge_cols + [left_link_col],
                        right_on=self.merge_cols + [right_link_col],
                        suffixes=merge_suffix,
                    )

                linking_check_cols.append(linking_check)

        # Add metadata prefix to merged suffixes
        full_merge_suffix_rename = []
        full_merge_suffix_original = []
        for col_name in self.merge_cols + list(self.linking_col_rename.keys()):
            full_merge_suffix_original.append(col_name)
            full_merge_suffix_rename.append("Metadata_{x}".format(x=col_name))

        for col_name in self.merge_cols + list(self.linking_col_rename.keys()):
            for suffix in set(merge_suffix_rename):
                full_merge_suffix_original.append("{x}{y}".format(x=col_name, y=suffix))
                full_merge_suffix_rename.append(
                    "Metadata_{x}{y}".format(x=col_name, y=suffix)
                )

        self.full_merge_suffix_rename = dict(
            zip(full_merge_suffix_original, full_merge_suffix_rename)
        )

        # Add image data to single cell dataframe
        if not self.load_image_data:
            self.load_image()
            self.load_image_data = True

        sc_df = (
            self.image_df.merge(sc_df, on=self.merge_cols, how="right")
            .rename(self.linking_col_rename, axis="columns")
            .rename(self.full_merge_suffix_rename, axis="columns")
        )
        if single_cell_normalize:
            # Inferring features is tricky with non-canonical data
            if normalize_args is None:
                normalize_args = {}
                features = infer_cp_features(sc_df, compartments=self.compartments)
            elif "features" not in normalize_args:
                features = infer_cp_features(sc_df, compartments=self.compartments)
            elif normalize_args["features"] == "infer":
                features = infer_cp_features(sc_df, compartments=self.compartments)
            else:
                features = normalize_args["features"]

            normalize_args["features"] = features

            sc_df = normalize(profiles=sc_df, **normalize_args)

        if sc_output_file != "none":
            output(
                df=sc_df,
                output_filename=sc_output_file,
                compression_options=compression_options,
                float_format=float_format,
            )
        else:
            return sc_df
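
A hedged usage sketch of merge_single_cells; `sc` stands for an instance of the enclosing class (constructed elsewhere), and normalize_args mirrors the pycytominer.normalize() signature shown in Example #5.

# Hypothetical usage; `sc` is an instance of the enclosing single-cell class.
sc_df = sc.merge_single_cells(
    single_cell_normalize=True,
    normalize_args={"method": "standardize", "features": "infer"},
)

# Or write directly to disk instead of returning a dataframe.
sc.merge_single_cells(
    sc_output_file="plate1_single_cell.csv.gz",
    compression_options={"method": "gzip"},
    float_format="%.5g",
)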
Example #4
def feature_select(
    profiles,
    features="infer",
    samples="all",
    operation="variance_threshold",
    output_file="none",
    na_cutoff=0.05,
    corr_threshold=0.9,
    corr_method="pearson",
    freq_cut=0.05,
    unique_cut=0.1,
    compression=None,
    float_format=None,
    blocklist_file=None,
    outlier_cutoff=15,
):
    """
    Performs feature selection based on the given operation

    Arguments:
    profiles - either pandas DataFrame or a file that stores profile data
    features - list of cell painting features [default: "infer"]
               if "infer", then assume cell painting features are those that start with
               "Cells", "Nuclei", or "Cytoplasm"
    samples - list of samples to perform the operation on [default: "all"]
              if "all", use all samples in the calculation
    operation - str or list of given operations to perform on input profiles
    output_file - [default: "none"] if provided, will write feature-selected profiles
                  to file; if not specified, will return the profiles. We recommend
                  that this output file be suffixed with
                  "_normalized_variable_selected.csv".
    na_cutoff - proportion of missing values in a column to tolerate before removing
                [default: 0.05]
    corr_threshold - float between (0, 1) to exclude features above [default: 0.9]
    corr_method - correlation method to use, as in pandas.DataFrame.corr()
                  [default: "pearson"]
    freq_cut - float of ratio (2nd most common feature val / most common) [default: 0.05]
    unique_cut - float of ratio (num unique features / num samples) [default: 0.1]
    compression - the mechanism to compress [default: None]
    float_format - decimal precision to use in writing output file [default: None]
                   For example, use "%.3g" for 3 decimal precision.
    blocklist_file - file location of dataframe with features to exclude [default: None]
                     Note that if "blocklist" in operation then will remove standard
                     blocklist
    outlier_cutoff - the threshold at which the maximum or minimum value of a feature
                     across a full experiment is excluded [default: 15]. Note that this
                     procedure is typically applied (and therefore the default is
                     suitable) for after normalization.
    """
    all_ops = [
        "variance_threshold",
        "correlation_threshold",
        "drop_na_columns",
        "blocklist",
        "drop_outliers",
    ]

    # Make sure the user provides a supported operation
    if isinstance(operation, list):
        assert all([x in all_ops for x in operation
                    ]), "Some operation(s) {} not supported. Choose {}".format(
                        operation, all_ops)
    elif isinstance(operation, str):
        assert operation in all_ops, "{} not supported. Choose {}".format(
            operation, all_ops)
        operation = operation.split()
    else:
        raise ValueError("Operation must be a list or string")

    # Load Data
    profiles = load_profiles(profiles)

    if features == "infer":
        features = infer_cp_features(profiles)

    excluded_features = []
    for op in operation:
        if op == "variance_threshold":
            exclude = variance_threshold(
                population_df=profiles,
                features=features,
                samples=samples,
                freq_cut=freq_cut,
                unique_cut=unique_cut,
            )
        elif op == "drop_na_columns":
            exclude = get_na_columns(
                population_df=profiles,
                features=features,
                samples=samples,
                cutoff=na_cutoff,
            )
        elif op == "correlation_threshold":
            exclude = correlation_threshold(
                population_df=profiles,
                features=features,
                samples=samples,
                threshold=corr_threshold,
                method=corr_method,
            )
        elif op == "blocklist":
            if blocklist_file:
                exclude = get_blocklist_features(population_df=profiles,
                                                 blocklist_file=blocklist_file)
            else:
                exclude = get_blocklist_features(population_df=profiles)
        elif op == "drop_outliers":
            exclude = drop_outlier_features(
                population_df=profiles,
                features=features,
                samples=samples,
                outlier_cutoff=outlier_cutoff,
            )

        excluded_features += exclude

    excluded_features = list(set(excluded_features))

    selected_df = profiles.drop(excluded_features, axis="columns")

    if output_file != "none":
        output(
            df=selected_df,
            output_filename=output_file,
            compression=compression,
            float_format=float_format,
        )
    else:
        return selected_df
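
A short usage sketch of feature_select; the operation names come from the all_ops list above, and the input path is a placeholder.

import pandas as pd

# Placeholder input: any normalized profile file or dataframe works here.
profiles_df = pd.read_csv("plate1_normalized.csv.gz")

selected_df = feature_select(
    profiles=profiles_df,
    features="infer",
    operation=["variance_threshold", "correlation_threshold", "drop_na_columns"],
    na_cutoff=0.05,
    corr_threshold=0.9,
)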
Example #5
def normalize(
    profiles,
    features="infer",
    meta_features="infer",
    samples="all",
    method="standardize",
    output_file="none",
    compression_options=None,
    float_format=None,
    spherize_center=True,
    spherize_method="ZCA-cor",
    spherize_epsilon=1e-6,
):
    """Normalize profiling features

    Parameters
    ----------
    profiles : {pandas.DataFrame, path}
        Either a pandas DataFrame or a file that stores profile data
    features : list
        A list of strings corresponding to feature measurement column names in the
        `profiles` DataFrame. All features listed must be found in `profiles`.
        Defaults to "infer". If "infer", then assume cell painting features are those
        prefixed with "Cells", "Nuclei", or "Cytoplasm".
    meta_features : list
        A list of strings corresponding to metadata column names in the `profiles`
        DataFrame. All features listed must be found in `profiles`. Defaults to "infer".
        If "infer", then assume metadata features are those prefixed with "Metadata"
    samples : str
        The metadata column values to use as a normalization reference. We often use
        control samples. The function passes this string to pd.query(), so you should
        structure samples accordingly. An example is
        "Metadata_treatment == 'control'" (include all quotes). Defaults to "all".
    method : str
        How to normalize the dataframe. Defaults to "standardize". Check avail_methods
        for available normalization methods.
    output_file : str
        If provided, will write annotated profiles to file. If not specified, will
        return the normalized profiles as output. We recommend that this output file be
        suffixed with "_normalized.csv". Defaults to "none".
    compression_options : {dict, None}
        Contain compression options as input to
        pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2.
        Defaults to None.
    float_format : {str, None}
        Decimal precision to use in writing output file as input to
        pd.DataFrame.to_csv(float_format=float_format). For example, use "%.3g" for 3
        decimal precision. Defaults to None.
    spherize_center : bool
        If the function should center data before sphering (aka whitening). The
        function only uses this variable if method = "spherize". Defaults to True.
    spherize_method : str
        The sphering (aka whitening) normalization selection. The function only uses
        this variable if method = "spherize". Defaults to "ZCA-cor". See
        :py:func:`pycytominer.operations.transform` for available spherize methods.
    spherize_epsilon : float
        The sphering (aka whitening) fudge factor parameter. The function only uses
        this variable if method = "spherize". Defaults to 1e-6.

    Returns
    -------
    pd.DataFrame or None
        The normalized profile DataFrame. If output_file="none", then return the
        DataFrame. If you specify output_file, then write to file and do not return
        data.

    Examples
    --------
    import pandas as pd
    from pycytominer import normalize

    data_df = pd.DataFrame(
        {
            "Metadata_plate": ["a", "a", "a", "a", "b", "b", "b", "b"],
            "Metadata_treatment": [
                "drug",
                "drug",
                "control",
                "control",
                "drug",
                "drug",
                "control",
                "control",
            ],
            "x": [1, 2, 8, 2, 5, 5, 5, 1],
            "y": [3, 1, 7, 4, 5, 9, 6, 1],
            "z": [1, 8, 2, 5, 6, 22, 2, 2],
            "zz": [14, 46, 1, 6, 30, 100, 2, 2],
        }
    ).reset_index(drop=True)

    normalized_df = normalize(
        profiles=data_df,
        features=["x", "y", "z", "zz"],
        meta_features="infer",
        samples="Metadata_treatment == 'control'",
        method="standardize"
    )
    """

    # Load Data
    profiles = load_profiles(profiles)

    # Define which scaler to use
    method = method.lower()

    avail_methods = ["standardize", "robustize", "mad_robustize", "spherize"]
    assert method in avail_methods, "method must be one of {}".format(avail_methods)

    if method == "standardize":
        scaler = StandardScaler()
    elif method == "robustize":
        scaler = RobustScaler()
    elif method == "mad_robustize":
        scaler = RobustMAD()
    elif method == "spherize":
        scaler = Spherize(
            center=spherize_center, method=spherize_method, epsilon=spherize_epsilon
        )

    if features == "infer":
        features = infer_cp_features(profiles)

    # Separate out the features and meta
    feature_df = profiles.loc[:, features]
    if meta_features == "infer":
        meta_features = infer_cp_features(profiles, metadata=True)

    meta_df = profiles.loc[:, meta_features]

    # Fit the sklearn scaler
    if samples == "all":
        fitted_scaler = scaler.fit(feature_df)
    else:
        # Subset to only the features measured in the sample query
        fitted_scaler = scaler.fit(profiles.query(samples).loc[:, features])

    # Scale the feature dataframe
    feature_df = pd.DataFrame(
        fitted_scaler.transform(feature_df),
        columns=feature_df.columns,
        index=feature_df.index,
    )

    normalized = meta_df.merge(feature_df, left_index=True, right_index=True)

    if output_file != "none":
        output(
            df=normalized,
            output_filename=output_file,
            compression_options=compression_options,
            float_format=float_format,
        )
    else:
        return normalized
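
Building on the docstring example above, a sketch of the "spherize" method, the only path that uses the spherize_* parameters; data_df is the dataframe from the docstring example.

# Sphering (whitening) fit on the control wells only.
spherized_df = normalize(
    profiles=data_df,
    features=["x", "y", "z", "zz"],
    samples="Metadata_treatment == 'control'",
    method="spherize",
    spherize_method="ZCA-cor",
    spherize_epsilon=1e-6,
)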
Example #6
    plate_file = plate_files[plate]
    output_file = pathlib.Path(f"{sc_dir}/{plate}_normalized_featureselected.csv.gz")

    # Print progress to the console
    print(f"Now performing feature selection for... {plate_file}")
    sc_df = pd.read_csv(plate_file, low_memory=False)
    print("Before feature selection:")
    print(sc_df.shape)
    
    sc_df = feature_select(
        profiles=sc_df,
        operation=feature_select_operations,
        na_cutoff=na_cutoff,
    )
    
    print("After feature selection:")
    print(sc_df.shape)
    
    # Output file to disk
    output(
        df=sc_df,
        output_filename=output_file,
        sep=",",
        float_format="%.5f",
        compression_options=compression_options,
    )

    print("Done.")
    print("\n\n")

Example #7
def normalize(
    profiles,
    features="infer",
    meta_features="infer",
    samples="all",
    method="standardize",
    output_file="none",
    compression=None,
    float_format=None,
    whiten_center=True,
    whiten_method="ZCA",
):
    """
    Normalize features

    Arguments:
    profiles - either pandas DataFrame or a file that stores profile data
    features - list of cell painting features [default: "infer"]
               if "infer", then assume cell painting features are those that do not
               start with "Cells", "Nuclei", or "Cytoplasm"
    meta_features - if specified, then output these with specified features
                    [default: "infer"]
    samples - string indicating which metadata column and values to use to subset;
              control samples are often used here [default: 'all']
              the format of this variable will be used in a pd.query() function. An
              example is "Metadata_treatment == 'control'" (include all quotes)
    method - string indicating how the dataframe will be normalized
             [default: 'standardize']
    output_file - [default: "none"] if provided, will write normalized profiles to file
                  if not specified, will return the normalized profiles. We recommend
                  that this output file be suffixed with "_normalized.csv".
    compression - the mechanism to compress [default: None]
    float_format - decimal precision to use in writing output file [default: None]
                       For example, use "%.3g" for 3 decimal precision.
    whiten_center - if data should be centered before whitening transform [default: True]
                    (only used if method = "whiten")
    whiten_method - the type of whitening normalization used [default: 'ZCA']
                    (only used if method = "whiten")

    Return:
    A normalized DataFrame
    """

    # Load Data
    profiles = load_profiles(profiles)

    # Define which scaler to use
    method = method.lower()

    avail_methods = ["standardize", "robustize", "mad_robustize", "whiten"]
    assert method in avail_methods, "method must be one of {}".format(
        avail_methods)

    if method == "standardize":
        scaler = StandardScaler()
    elif method == "robustize":
        scaler = RobustScaler()
    elif method == "mad_robustize":
        scaler = RobustMAD()
    elif method == "whiten":
        scaler = Whiten(center=whiten_center, method=whiten_method)

    if features == "infer":
        features = infer_cp_features(profiles)

    # Separate out the features and meta
    feature_df = profiles.loc[:, features]
    if meta_features == "infer":
        meta_features = infer_cp_features(profiles, metadata=True)

    meta_df = profiles.loc[:, meta_features]

    # Fit the sklearn scaler
    if samples == "all":
        fitted_scaler = scaler.fit(feature_df)
    else:
        # Subset to only the features measured in the sample query
        fitted_scaler = scaler.fit(profiles.query(samples).loc[:, features])

    # Scale the feature dataframe
    feature_df = pd.DataFrame(
        fitted_scaler.transform(feature_df),
        columns=feature_df.columns,
        index=feature_df.index,
    )

    normalized = meta_df.merge(feature_df, left_index=True, right_index=True)

    if output_file != "none":
        output(
            df=normalized,
            output_filename=output_file,
            compression=compression,
            float_format=float_format,
        )
    else:
        return normalized
Example #8
features = infer_cp_features(df)
meta_features = infer_cp_features(df, metadata=True)

print(df.shape)
df.head(2)


# In[4]:


# Output feature selected file
output_file = pathlib.Path("data/cell_health_merged_feature_select.csv.gz")

output(
    df=df,
    output_filename=output_file,
    sep=",",
    compression_options={"method": "gzip", "mtime": 1},
)


# In[5]:


# Define cell health constants
barcode_col = "Metadata_pert_name"
gene_col = "Metadata_gene_name"

replicate_group_grit = {"replicate_id": barcode_col, "group_id": gene_col}

control_group_cut = ["Chr2", "Luc", "LacZ"]
control_group_pert = ["EMPTY"]
Example #9
# ## Apply normalization, feature select, and output data

# In[12]:

normalized_df = normalize(merged_df,
                          features="infer",
                          meta_features="infer",
                          samples="all",
                          method="standardize")

# In[13]:

feature_select_df = feature_select(
    normalized_df,
    features="infer",
    operation=feature_select_opts,
    output_file="none",
    na_cutoff=na_cutoff,
    corr_threshold=corr_threshold,
)

print(feature_select_df.shape)
feature_select_df.head()

# In[14]:

output_filename = pathlib.Path(
    f"data/{batch}/{plate}_singlecell_normalized_feature_select.csv.gz")
output(feature_select_df, output_filename, compression="gzip", float_format="%.5g")
Example #10
def consensus(
    profiles,
    replicate_columns=["Metadata_Plate", "Metadata_Well"],
    operation="median",
    features="infer",
    output_file="none",
    compression_options=None,
    float_format=None,
    modz_args={"method": "spearman"},
):
    """Form level 5 consensus profile data.

    :param profiles: A file or pandas DataFrame of profile data
    :type profiles: str, pandas.DataFrame
    :param replicate_columns: Metadata columns indicating which replicates to collapse, defaults to ["Metadata_Plate", "Metadata_Well"]
    :type replicate_columns: list
    :param operation: The method used to form consensus profiles, defaults to "median"
    :type operation: str
    :param features: The features to collapse, defaults to "infer"
    :type features: str, list
    :param output_file: If specified, the location to write the file, defaults to "none"
    :type output_file: str
    :param modz_args: Additional custom arguments passed as kwargs if operation="modz". See pycytominer.cyto_utils.modz for more details.
    :type modz_args: dict
    :param compression_options: the method to compress output data, defaults to None. See pycytominer.cyto_utils.output.py for options
    :type compression_options: str
    :param float_format: decimal precision to use in writing output file, defaults to None. For example, use "%.3g" for 3 decimal precision.
    :type float_format: str

    :Example:

    import pandas as pd
    from pycytominer import consensus

    data_df = pd.concat(
        [
            pd.DataFrame(
                {
                    "Metadata_Plate": "X",
                    "Metadata_Well": "a",
                    "Cells_x": [0.1, 0.3, 0.8],
                    "Nuclei_y": [0.5, 0.3, 0.1],
                }
            ),
            pd.DataFrame(
                {
                    "Metadata_Plate": "X",
                    "Metadata_Well": "b",
                    "Cells_x": [0.4, 0.2, -0.5],
                    "Nuclei_y": [-0.8, 1.2, -0.5],
                }
            ),
        ]
    ).reset_index(drop=True)

    consensus_df = consensus(
        profiles=data_df,
        replicate_columns=["Metadata_Plate", "Metadata_Well"],
        operation="median",
        features="infer",
        output_file="none",
    )
    """
    # Confirm that the operation is supported
    check_consensus_operation(operation)

    # Load Data
    profiles = load_profiles(profiles)

    if operation == "modz":
        consensus_df = modz(population_df=profiles,
                            replicate_columns=replicate_columns,
                            features=features,
                            **modz_args)
    else:
        consensus_df = aggregate(
            population_df=profiles,
            strata=replicate_columns,
            features=features,
            operation=operation,
            subset_data_df="none",
        )

    if output_file != "none":
        output(
            df=consensus_df,
            output_filename=output_file,
            compression_options=compression_options,
            float_format=float_format,
        )
    else:
        return consensus_df
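
A sketch of the modz path, reusing data_df from the docstring example; modz_args is forwarded to pycytominer.cyto_utils.modz as noted above.

# Consensus via modified z-score (modz) aggregation.
modz_consensus_df = consensus(
    profiles=data_df,
    replicate_columns=["Metadata_Plate", "Metadata_Well"],
    operation="modz",
    modz_args={"method": "spearman"},
)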
Example #11
            consensus_file = pathlib.Path(batch, consensus_file)

            consensus_df = all_consensus_dfs[batch][norm_strat][operation][
                "no_feat_select"
            ]

            print(
                f"  Now Writing: Feature selection: No; Consensus Operation: {operation}; Norm Strategy: {norm_strat}"
            )
            print(f"  File: {consensus_file}")
            print(consensus_df.shape)

            output(
                df=consensus_df,
                output_filename=consensus_file,
                sep=",",
                float_format=float_format,
                compression_options=compression_options,
            )

            # With feature selection
            consensus_feat_df = all_consensus_dfs[batch][norm_strat][operation][
                "feat_select"
            ]

            consensus_feat_file = (
                f"{batch}_consensus_{operation}_feature_select{file_suffix}"
            )
            consensus_feat_file = pathlib.Path(batch, consensus_feat_file)

            print(
Example #12
        print(
            f"Now aggregating by {aggregate_level}...with operation: {aggregate_operation}"
        )
        logging.info(
            f"Aggregating by {aggregate_level}...with operation: {aggregate_operation}"
        )

        aggregate_df = aggregate(
            population_df=single_cell_df,
            strata=aggregate_columns,
            features=aggregate_features,
            operation=aggregate_operation,
        )

        # Define a dataset specific file
        aggregate_dataset_file = pathlib.Path(
            aggregate_output_dir,
            aggregate_output_file.name.replace(".csv.gz",
                                               f"_{data_split_site}.csv.gz"),
        )

        output(
            aggregate_df,
            output_filename=aggregate_dataset_file,
            compression_options=compression,
            float_format=float_format,
        )
print("Finished 1.aggregate.")
logging.info(f"Finished 1.aggregate.")
Example #13
def annotate(
    profiles,
    platemap,
    join_on=["Metadata_well_position", "Metadata_Well"],
    output_file="none",
    add_metadata_id_to_platemap=True,
    format_broad_cmap=False,
    clean_cellprofiler=True,
    external_metadata="none",
    external_join_left="none",
    external_join_right="none",
    compression_options=None,
    float_format=None,
    cmap_args={},
):
    """
    Add metadata to aggregated profiles

    Arguments:
    profiles - either pandas DataFrame or a file that stores profile data
    platemap - either pandas DataFrame or a file that stores platemap metadata
    join_on - list of length two indicating which variables to use to merge the
              profiles and platemap
              [default: ["Metadata_well_position", "Metadata_Well"]]. The first element
              indicates variable(s) in platemap and the second element indicates
              variable(s) in profiles to merge using.
              Note the setting of `add_metadata_id_to_platemap`
    output_file - [default: "none"] if provided, will write annotated profiles to file
                  if not specified, will return the annotated profiles. We recommend
                  that this output file be suffixed with "_augmented.csv".
    add_metadata_id_to_platemap - [default: True] boolean if the platemap variables possibly need "Metadata" prepended
    format_broad_cmap - [default: False] boolean if we need to add columns to make
                        compatible with Broad CMAP naming conventions.
    external_metadata - [default: "none"] a string indicating a file with additional
                        metadata information
    external_join_left - [default: "none"] the merge column in the profile metadata
    external_join_right - [default: "none"] the merge column in the external metadata
    compression_options - the mechanism to compress [default: None] See cyto_utils/output.py for options.
    float_format - decimal precision to use in writing output file [default: None]
                       For example, use "%.3g" for 3 decimal precision.
    cmap_args - [default: {}] - potential keyword arguments for annotate_cmap().
                See cyto_utils/annotate_cmap.py for more details.

    Return:
    Pandas DataFrame of annotated profiles or written to file
    """

    # Load Data
    profiles = load_profiles(profiles)
    platemap = load_platemap(platemap, add_metadata_id_to_platemap)

    annotated = platemap.merge(profiles,
                               left_on=join_on[0],
                               right_on=join_on[1],
                               how="inner").drop(join_on[0], axis="columns")

    # Add specific Connectivity Map (CMAP) formatting
    if format_broad_cmap:
        annotated = annotate_cmap(annotated,
                                  annotate_join_on=join_on[1],
                                  **cmap_args)

    if clean_cellprofiler:
        annotated = cp_clean(annotated)

    if not isinstance(external_metadata, pd.DataFrame):
        if external_metadata != "none":
            assert os.path.exists(
                external_metadata
            ), "external metadata at {} does not exist".format(
                external_metadata)

            external_metadata = pd.read_csv(external_metadata)

    if isinstance(external_metadata, pd.DataFrame):
        external_metadata.columns = [
            "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
            for x in external_metadata.columns
        ]

        annotated = (annotated.merge(
            external_metadata,
            left_on=external_join_left,
            right_on=external_join_right,
            how="left",
        ).reset_index(drop=True).drop_duplicates())

    # Reorder annotated metadata columns
    meta_cols = infer_cp_features(annotated, metadata=True)
    other_cols = annotated.drop(meta_cols, axis="columns").columns.tolist()

    annotated = annotated.loc[:, meta_cols + other_cols]

    if output_file != "none":
        output(
            df=annotated,
            output_filename=output_file,
            compression_options=compression_options,
            float_format=float_format,
        )
    else:
        return annotated
Example #14
def annotate(
    profiles,
    platemap,
    join_on=["Metadata_well_position", "Metadata_Well"],
    output_file="none",
    add_metadata_id_to_platemap=True,
    format_broad_cmap=False,
    clean_cellprofiler=True,
    external_metadata="none",
    external_join_left="none",
    external_join_right="none",
    compression_options=None,
    float_format=None,
    cmap_args={},
):
    """Add metadata to aggregated profiles.

    Parameters
    ----------
    profiles : pandas.core.frame.DataFrame or file
        DataFrame or file path of profiles.
    platemap : pandas.core.frame.DataFrame or file
        Dataframe or file path of platemap metadata.
    join_on : list or str, default: ["Metadata_well_position", "Metadata_Well"]
        Which variables to use to merge the profiles and platemap. The first element indicates variable(s) in platemap and the second element indicates variable(s) in profiles to merge using. Note the setting of `add_metadata_id_to_platemap`.
    output_file : str, optional
       If not specified, will return the annotated profiles. We recommend that this output file be suffixed with "_augmented.csv".
    add_metadata_id_to_platemap : bool, default True
        Whether the plate map variables possibly need "Metadata" prepended
    format_broad_cmap : bool, default False
        Whether we need to add columns to make compatible with Broad CMAP naming conventions.
    clean_cellprofiler : bool, default True
        Clean specific CellProfiler feature names.
    external_metadata : str, optional
        File with additional metadata information
    external_join_left : str, optional
        Merge column in the profile metadata.
    external_join_right : str, optional
        Merge column in the external metadata.
    compression_options : str or dict, optional
        Contains compression options as input to
        pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2.
    float_format : str, optional
        Decimal precision to use in writing output file as input to
        pd.DataFrame.to_csv(float_format=float_format). For example, use "%.3g" for 3
        decimal precision.
    cmap_args : dict, default {}
        Potential keyword arguments for annotate_cmap(). See cyto_utils/annotate_custom.py for more details.

    Returns
    -------
    annotated : pandas.core.frame.DataFrame, optional
        DataFrame of annotated features. If output_file="none", then return the
        DataFrame. If you specify output_file, then write to file and do not return
        data.
    """

    # Load Data
    profiles = load_profiles(profiles)
    platemap = load_platemap(platemap, add_metadata_id_to_platemap)

    annotated = platemap.merge(profiles,
                               left_on=join_on[0],
                               right_on=join_on[1],
                               how="inner").drop(join_on[0], axis="columns")

    # Add specific Connectivity Map (CMAP) formatting
    if format_broad_cmap:
        annotated = annotate_cmap(annotated,
                                  annotate_join_on=join_on[1],
                                  **cmap_args)

    if clean_cellprofiler:
        annotated = cp_clean(annotated)

    if not isinstance(external_metadata, pd.DataFrame):
        if external_metadata != "none":
            assert os.path.exists(
                external_metadata
            ), "external metadata at {} does not exist".format(
                external_metadata)

            external_metadata = pd.read_csv(external_metadata)

    if isinstance(external_metadata, pd.DataFrame):
        external_metadata.columns = [
            "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
            for x in external_metadata.columns
        ]

        annotated = (annotated.merge(
            external_metadata,
            left_on=external_join_left,
            right_on=external_join_right,
            how="left",
        ).reset_index(drop=True).drop_duplicates())

    # Reorder annotated metadata columns
    meta_cols = infer_cp_features(annotated, metadata=True)
    other_cols = annotated.drop(meta_cols, axis="columns").columns.tolist()

    annotated = annotated.loc[:, meta_cols + other_cols]

    if output_file != "none":
        output(
            df=annotated,
            output_filename=output_file,
            compression_options=compression_options,
            float_format=float_format,
        )
    else:
        return annotated
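
A hedged usage sketch of annotate; plate_map_df is assumed to be loaded as in Example #16, and the external metadata file and join columns are placeholders.

annotated_df = annotate(
    profiles="plate1_aggregated.csv.gz",
    platemap=plate_map_df,  # loaded elsewhere, e.g. pd.read_csv(..., sep="\t")
    join_on=["Metadata_well_position", "Metadata_Well"],
    external_metadata="moa_annotations.csv",      # placeholder file
    external_join_left="Metadata_broad_sample",   # placeholder column
    external_join_right="Metadata_broad_sample",  # placeholder column
)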
Example #15
    def aggregate_profiles(
        self,
        compute_subsample=False,
        output_file="none",
        compression_options=None,
        float_format=None,
        aggregate_args=None,
    ):
        """Aggregate and merge compartments. This is the primary entry to this class.

        Parameters
        ----------
        compute_subsample : bool, default False
            Whether or not to compute subsample. Must be set to True to perform
            subsampling; no subsetting is applied when False, even if subsample is
            initialized.
        output_file : str, optional
            The name of a file to output. We recommend that, if provided, the output file be suffixed with "_augmented".
        compression_options : str, optional
            Compression arguments as input to pandas.to_csv() with pandas version >= 1.2.
        float_format : str, optional
            Decimal precision to use in writing output file.
        aggregate_args : dict, optional
            Additional arguments passed as input to pycytominer.aggregate().

        Returns
        -------
        pandas.core.frame.DataFrame
            Aggregated profiles if output_file="none"; otherwise, writes to file and returns nothing.

        """

        if output_file != "none":
            self.set_output_file(output_file)

        for compartment_idx, compartment in enumerate(self.compartments):
            if compartment_idx == 0:
                aggregated = self.aggregate_compartment(
                    compartment=compartment,
                    compute_subsample=compute_subsample,
                    compute_counts=True,
                )
            else:
                aggregated = aggregated.merge(
                    self.aggregate_compartment(compartment=compartment),
                    on=self.strata,
                    how="inner",
                )

        self.is_aggregated = True

        if self.output_file != "none":
            output(
                df=aggregated,
                output_filename=self.output_file,
                compression_options=compression_options,
                float_format=float_format,
            )
        else:
            return aggregated
Example #16
def process_profile(sql_file, batch, plate, pipeline):
    """
    Given batch details and a pipeline, process morphology profiles
    """
    assert batch in sql_file, "batch {} not recognized in sql file {}".format(
        batch, sql_file)
    assert plate in sql_file, "plate {} not recognized in sql file {}".format(
        plate, sql_file)

    # Set output directory information
    pipeline_output = pipeline["output_dir"]
    output_dir = os.path.join(pipeline_output, batch, plate)
    os.makedirs(output_dir, exist_ok=True)

    # Set output file information
    aggregate_out_file = os.path.join(output_dir, "{}.csv.gz".format(plate))
    annotate_out_file = os.path.join(output_dir,
                                     "{}_augmented.csv.gz".format(plate))
    normalize_out_file = os.path.join(output_dir,
                                      "{}_normalized.csv.gz".format(plate))
    feature_out_file = os.path.join(
        output_dir, "{}_normalized_feature_selected.csv.gz".format(plate))

    # Load pipeline options
    compression = process_pipeline(pipeline["options"], option="compression")
    sc_float_format = process_pipeline(pipeline["options"],
                                       option="sc_float_format")
    samples = process_pipeline(pipeline["options"], option="samples")

    # Load and setup platemap info
    workspace_dir = pipeline["workspace_dir"]
    batch_dir = os.path.join(workspace_dir, "backend", batch)
    metadata_dir = os.path.join(workspace_dir, "metadata", batch)

    barcode_plate_map_file = os.path.join(metadata_dir,
                                          sorted(os.listdir(metadata_dir))[0])
    barcode_plate_map_df = pd.read_csv(barcode_plate_map_file)
    plate_map_name = barcode_plate_map_df.query(
        "Assay_Plate_Barcode == @plate").Plate_Map_Name.values[0]
    plate_map_file = os.path.join(metadata_dir, "platemap",
                                  "{}.txt".format(plate_map_name))
    plate_map_df = pd.read_csv(plate_map_file, sep="\t")
    plate_map_df.columns = [
        "Metadata_{}".format(x) if not x.startswith("Metadata_") else x
        for x in plate_map_df.columns
    ]
    platemap_well_column = pipeline["platemap_well_column"]

    # Process Bulk profiles
    # Step 1: Aggregate
    aggregate_steps = pipeline["aggregate"]
    aggregate_features = aggregate_steps["features"]
    aggregate_operation = aggregate_steps["method"]
    aggregate_plate_column = aggregate_steps["plate_column"]
    aggregate_well_column = aggregate_steps["well_column"]

    strata = [aggregate_plate_column, aggregate_well_column]

    if "site_column" in aggregate_steps:
        aggregate_site_column = aggregate_steps["site_column"]
        strata += [aggregate_site_column]

    if aggregate_steps["perform"]:
        ap = AggregateProfiles(
            sql_file,
            strata=strata,
            features=aggregate_features,
            operation=aggregate_operation,
        )

        ap.aggregate_profiles(output_file=aggregate_out_file,
                              compression=compression)

    if pipeline["count"]["perform"]:
        if not aggregate_steps["perform"]:
            ap = AggregateProfiles(
                sql_file,
                strata=strata,
                features=aggregate_features,
                operation=aggregate_operation,
            )
        count_dir = pipeline["count"]["output_dir"]
        os.makedirs(count_dir, exist_ok=True)

        cell_count_file = os.path.join(
            count_dir, "{}_{}_cell_count.tsv".format(batch, plate))

        cell_count_df = ap.count_cells()

        cell_count_df = cell_count_df.merge(
            plate_map_df,
            left_on=aggregate_well_column,
            right_on=platemap_well_column,
        ).drop(platemap_well_column, axis="columns")

        cell_count_df.to_csv(cell_count_file, sep="\t", index=False)

    # Annotate Profiles
    annotate_steps = pipeline["annotate"]
    annotate_well_column = annotate_steps["well_column"]
    if annotate_steps["perform"]:
        annotate(
            profiles=aggregate_out_file,
            platemap=plate_map_df,
            join_on=[platemap_well_column, annotate_well_column],
            output_file=annotate_out_file,
            compression=compression,
        )

    # Normalize Profiles
    normalize_steps = pipeline["normalize"]
    norm_features = normalize_steps["features"]
    norm_method = normalize_steps["method"]
    if normalize_steps["perform"]:
        normalize(
            profiles=annotate_out_file,
            features=norm_features,
            samples=samples,
            method=norm_method,
            output_file=normalize_out_file,
            compression=compression,
        )

    # Apply feature selection
    feature_select_steps = pipeline["feature_select"]
    feature_select_operations = feature_select_steps["operations"]
    feature_select_features = feature_select_steps["features"]
    if feature_select_steps["perform"]:
        feature_select(
            profiles=normalize_out_file,
            features=feature_select_features,
            samples=samples,
            operation=feature_select_operations,
            output_file=feature_out_file,
            compression=compression,
            corr_threshold=0.9,
            corr_method="pearson",
        )

    sc_steps = pipeline["single_cell"]
    if sc_steps["perform"]:
        if not aggregate_steps["perform"]:
            ap = AggregateProfiles(
                sql_file,
                strata=strata,
                features=aggregate_features,
                operation=aggregate_operation,
            )

        # Load cells
        query = "select * from cells"
        cell_df = pd.read_sql(sql=query, con=ap.conn)

        # Load cytoplasm
        query = "select * from cytoplasm"
        cytoplasm_df = pd.read_sql(sql=query, con=ap.conn)

        # Load nuclei
        query = "select * from nuclei"
        nuclei_df = pd.read_sql(sql=query, con=ap.conn)

        # Merge single cells together
        sc_merged_df = (cell_df.merge(
            cytoplasm_df.drop("ObjectNumber", axis="columns"),
            left_on=["TableNumber", "ImageNumber", "ObjectNumber"],
            right_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Cells"],
            how="inner",
        ).drop("ObjectNumber", axis="columns").merge(
            nuclei_df,
            left_on=["TableNumber", "ImageNumber", "Cytoplasm_Parent_Nuclei"],
            right_on=["TableNumber", "ImageNumber", "ObjectNumber"],
            how="inner",
        ))

        # Merge image data info
        sc_merged_df = ap.image_df.merge(sc_merged_df,
                                         how="right",
                                         on=ap.merge_cols)

        # Make sure column names are correctly prefixed
        prefix = ["Metadata", "Cells", "Cytoplasm", "Nuclei"]
        cols = []
        for col in sc_merged_df.columns:
            if any([col.startswith(x) for x in prefix]):
                cols.append(col)
            else:
                cols.append(f"Metadata_{col}")
        sc_merged_df.columns = cols

        sc_merged_df = annotate(
            profiles=sc_merged_df,
            platemap=plate_map_df,
            join_on=[platemap_well_column, annotate_well_column],
            output_file="none",
        )

        if sc_steps["normalize"]:
            sc_merged_df = normalize(
                profiles=sc_merged_df,
                features=norm_features,
                samples=samples,
                method=norm_method,
                output_file="none",
            )

        if sc_steps["feature_select"]:
            sc_merged_df = feature_select(
                profiles=sc_merged_df,
                features=feature_select_features,
                samples=samples,
                operation=feature_select_operations,
                output_file="none",
                corr_threshold=0.9,
                corr_method="pearson",
            )

        sc_pipeline_output = pipeline["sc_output_dir"]
        sc_output_dir = os.path.join(sc_pipeline_output, batch, plate)
        os.makedirs(sc_output_dir, exist_ok=True)

        # Set output file information
        sc_out_file = os.path.join(sc_output_dir,
                                   "{}_single_cell.csv.gz".format(plate))
        output(
            df=sc_merged_df,
            output_filename=sc_out_file,
            compression="gzip",
            float_format=sc_float_format,
        )
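
process_profile pulls many keys out of the pipeline mapping; the sketch below reconstructs the shape those accesses imply. The keys are taken from the function body, while every value is an illustrative placeholder.

# Inferred from the keys accessed in process_profile(); values are placeholders.
pipeline = {
    "output_dir": "profiles",
    "sc_output_dir": "single_cell_profiles",
    "workspace_dir": "workspace",
    "platemap_well_column": "Metadata_well_position",
    "options": {"compression": "gzip", "sc_float_format": "%.5g", "samples": "all"},
    "aggregate": {
        "perform": True,
        "features": "infer",
        "method": "median",
        "plate_column": "Metadata_Plate",
        "well_column": "Metadata_Well",
        # optional: "site_column" adds a site level to the strata
    },
    "count": {"perform": True, "output_dir": "cell_counts"},
    "annotate": {"perform": True, "well_column": "Metadata_Well"},
    "normalize": {"perform": True, "features": "infer", "method": "mad_robustize"},
    "feature_select": {
        "perform": True,
        "features": "infer",
        "operations": ["variance_threshold", "correlation_threshold"],
    },
    "single_cell": {"perform": False, "normalize": True, "feature_select": True},
}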
Example #17
# Add dose recoding information
anno_df = anno_df.assign(
    Metadata_dose_recode=(anno_df.Metadata_mmoles_per_liter.apply(
        lambda x: recode_dose(x, primary_dose_mapping, return_level=True))))

# Reorder columns
metadata_cols = cyto_utils.infer_cp_features(anno_df, metadata=True)
cp_cols = cyto_utils.infer_cp_features(anno_df)
reindex_cols = metadata_cols + cp_cols
anno_df = anno_df.reindex(reindex_cols, axis="columns")

# Output annotated file
cyto_utils.output(
    df=anno_df,
    output_filename=anno_file,
    float_format=float_format,
    compression_options=compression,
)

# Normalize Profiles (DMSO Control) - Level 4A Data
norm_dmso_file = pathlib.PurePath(output_dir,
                                  f"{plate_name}_normalized_dmso.csv.gz")
normalize(
    profiles=anno_df,
    samples="Metadata_broad_sample == 'DMSO'",
    method=norm_method,
    output_file=norm_dmso_file,
    float_format=float_format,
    compression_options=compression,
)
Example #18
    def pipeline_feature_select(self, steps, suffix=None):
        feature_select_steps = steps
        pipeline_output = self.pipeline["output_dir"]

        level = feature_select_steps["level"]
        gct = feature_select_steps["gct"]
        feature_select_operations = feature_select_steps["operations"]
        feature_select_features = feature_select_steps["features"]

        all_plates_df = pd.DataFrame()

        for batch in self.profile_config:
            batch_df = pd.DataFrame()
            for plate in self.profile_config[batch]:
                output_dir = pathlib.PurePath(".", pipeline_output, batch,
                                              plate)
                if suffix:
                    normalize_output_file = pathlib.PurePath(
                        output_dir, f"{plate}_normalized_{suffix}.csv.gz")
                    feature_select_output_file_plate = pathlib.PurePath(
                        output_dir,
                        f"{plate}_normalized_feature_select_{suffix}_plate.csv.gz",
                    )
                else:
                    normalize_output_file = pathlib.PurePath(
                        output_dir, f"{plate}_normalized.csv.gz")
                    feature_select_output_file_plate = pathlib.PurePath(
                        output_dir,
                        f"{plate}_normalized_feature_select_plate.csv.gz")
                if feature_select_features == "infer" and self.noncanonical:
                    feature_select_features = cyto_utils.infer_cp_features(
                        pd.read_csv(normalize_output_file),
                        compartments=self.compartments,
                    )

                df = pd.read_csv(normalize_output_file).assign(
                    Metadata_batch=batch)

                if level == "plate":
                    df = df.drop(columns=["Metadata_batch"])
                    feature_select(
                        profiles=df,
                        features=feature_select_features,
                        operation=feature_select_operations,
                        output_file=feature_select_output_file_plate,
                        compression_options=self.pipeline_options["compression"],
                        float_format=self.pipeline_options["float_format"],
                    )
                elif level == "batch":
                    batch_df = concat_dataframes(batch_df, df)
                elif level == "all":
                    all_plates_df = concat_dataframes(all_plates_df, df)

            if level == "batch":
                fs_df = feature_select(
                    profiles=batch_df,
                    features=feature_select_features,
                    operation=feature_select_operations,
                )
                for plate in self.profile_config[batch]:
                    output_dir = pathlib.PurePath(".", pipeline_output, batch,
                                                  plate)
                    if suffix:
                        feature_select_output_file_batch = pathlib.PurePath(
                            output_dir,
                            f"{plate}_normalized_feature_select_{suffix}_batch.csv.gz",
                        )
                    else:
                        feature_select_output_file_batch = pathlib.PurePath(
                            output_dir,
                            f"{plate}_normalized_feature_select_batch.csv.gz",
                        )
                    if feature_select_features == "infer" and self.noncanonical:
                        feature_select_features = cyto_utils.infer_cp_features(
                            batch_df, compartments=self.compartments)

                    df = fs_df.query("Metadata_Plate==@plate").reset_index(
                        drop=True)
                    df = df.drop(columns=["Metadata_batch"])

                    cyto_utils.output(
                        output_filename=feature_select_output_file_batch,
                        df=df,
                        compression_options=self.pipeline_options["compression"],
                        float_format=self.pipeline_options["float_format"],
                    )

                if gct:
                    create_gct_directories(batch)
                    if suffix:
                        stacked_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_{suffix}_batch.csv.gz",
                        )
                        gct_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_{suffix}_batch.gct",
                        )
                    else:
                        stacked_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_batch.csv.gz",
                        )
                        gct_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_batch.gct",
                        )
                    cyto_utils.output(
                        output_filename=stacked_file,
                        df=fs_df,
                        compression_options=self.pipeline_options["compression"],
                        float_format=self.pipeline_options["float_format"],
                    )
                    write_gct(profiles=fs_df, output_file=gct_file)

        if level == "all":
            fs_df = feature_select(
                profiles=all_plates_df,
                features=feature_select_features,
                operation=feature_select_operations,
            )
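            # Split the jointly selected profiles back into per-batch, per-plate files.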
            for batch in self.profile_config:
                fs_batch_df = fs_df.loc[fs_df.Metadata_batch == batch].reset_index(
                    drop=True)
                for plate in self.profile_config[batch]:
                    output_dir = pathlib.PurePath(".", pipeline_output, batch,
                                                  plate)
                    if suffix:
                        feature_select_output_file_all = pathlib.PurePath(
                            output_dir,
                            f"{plate}_normalized_feature_select_{suffix}_all.csv.gz",
                        )
                    else:
                        feature_select_output_file_all = pathlib.PurePath(
                            output_dir,
                            f"{plate}_normalized_feature_select_all.csv.gz")
                    if feature_select_features == "infer" and self.noncanonical:
                        feature_select_features = cyto_utils.infer_cp_features(
                            all_plates_df, compartments=self.compartments)

                    df = fs_batch_df.query(
                        "Metadata_Plate==@plate").reset_index(drop=True)

                    df = df.drop(columns=["Metadata_batch"])

                    cyto_utils.output(
                        output_filename=feature_select_output_file_all,
                        df=df,
                        compression_options=self.pipeline_options["compression"],
                        float_format=self.pipeline_options["float_format"],
                    )

                if gct:
                    create_gct_directories(batch)
                    if suffix:
                        stacked_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_{suffix}_all.csv.gz",
                        )
                        gct_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_{suffix}_all.gct",
                        )
                    else:
                        stacked_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_all.csv.gz",
                        )
                        gct_file = pathlib.PurePath(
                            ".",
                            "gct",
                            batch,
                            f"{batch}_normalized_feature_select_all.gct",
                        )
                    cyto_utils.output(
                        output_filename=stacked_file,
                        df=fs_batch_df,
                        compression_options=self.pipeline_options["compression"],
                        float_format=self.pipeline_options["float_format"],
                    )
                    write_gct(profiles=fs_batch_df, output_file=gct_file)
        else:
            warnings.warn(
                f"{site_file} does not exist. There must have been an error in processing."
            )

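    # Combine the per-site single-cell dataframes into one table before aggregating.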
    single_cell_df = pd.concat(single_cell_df,
                               axis="rows").reset_index(drop=True)

# Perform the aggregation based on the defined levels and columns
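# For reference, aggregate_levels (defined earlier, not shown here) is assumed to map
# a level name to its groupby columns, e.g. (illustrative values only):
#     aggregate_levels = {
#         "well": ["Metadata_Plate", "Metadata_Well"],
#         "site": ["Metadata_Plate", "Metadata_Well", "Metadata_Site"],
#     }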
aggregate_output_dir.mkdir(parents=True, exist_ok=True)
for aggregate_level, aggregate_columns in aggregate_levels.items():
    aggregate_output_file = aggregate_output_files[aggregate_level]

    print(
        f"Now aggregating by {aggregate_level}...with operation: {aggregate_operation}"
    )

    aggregate_df = aggregate(
        population_df=single_cell_df,
        strata=aggregate_columns,
        features=aggregate_features,
        operation=aggregate_operation,
    )

    output(
        aggregate_df,
        output_filename=aggregate_output_file,
        compression=compression,
        float_format=float_format,
    )
Example #20
            "unknown")

        # Set a timepoint variable only for batch 1
        if batch == "2016_04_01_a549_48hr_batch1":
            spherized_df = spherized_df.assign(Metadata_time_point="48H")

        for operation in operations:
            output_file = pathlib.Path(
                f"{output_dir}/{batch}{spherized_string}{norm_strat}_consensus_{operation}.csv.gz"
            )
            print(f"    with consensus operation: {operation}")

            spherized_consensus_df = consensus(
                profiles=spherized_df,
                replicate_columns=replicate_cols,
                operation=operation,
                features=features,
            )
            print(spherized_consensus_df.shape)

            output(
                df=spherized_consensus_df,
                output_filename=output_file,
                sep=",",
                float_format=float_format,
                compression_options=compression_options,
            )
            print("    Done.")

    print("Batch done.\n")
Example #21
def feature_select(
    profiles,
    features="infer",
    image_features=False,
    samples="all",
    operation="variance_threshold",
    output_file="none",
    na_cutoff=0.05,
    corr_threshold=0.9,
    corr_method="pearson",
    freq_cut=0.05,
    unique_cut=0.1,
    compression_options=None,
    float_format=None,
    blocklist_file=None,
    outlier_cutoff=15,
    noise_removal_perturb_groups=None,
    noise_removal_stdev_cutoff=None,
):
    """Performs feature selection based on the given operation.

    Parameters
    ----------
    profiles : pandas.core.frame.DataFrame or file
        DataFrame or file of profiles.
    features : list
        A list of strings corresponding to feature measurement column names in the
        `profiles` DataFrame. All features listed must be found in `profiles`.
        Defaults to "infer". If "infer", then assume cell painting features are those
        prefixed with "Cells", "Nuclei", or "Cytoplasm".
    image_features : bool, default False
        Whether the profiles contain image features.
    samples : list or str, default "all"
        Samples to perform the operation on.
    operation : list of str or str, default "variance_threshold"
        Operations to perform on the input profiles.
    output_file : str, optional
        If provided, will write the feature-selected profiles to file. If not specified,
        will return the feature-selected profiles as output. We recommend that this
        output file be suffixed with "_normalized_variable_selected.csv".
    na_cutoff : float, default 0.05
        Proportion of missing values in a column to tolerate before removing.
    corr_threshold : float, default 0.9
        Value between (0, 1); if any two features are correlated above this threshold,
        one of them is excluded.
    corr_method : str, default "pearson"
        Correlation type to compute. Allowed methods are "spearman", "kendall" and "pearson".
    freq_cut : float, default 0.05
        Ratio (second most common feature value / most common feature value).
    unique_cut : float, default 0.1
        Ratio (number of unique feature values / number of samples).
    compression_options : str or dict, optional
        Contains compression options as input to
        pd.DataFrame.to_csv(compression=compression_options). pandas version >= 1.2.
    float_format : str, optional
        Decimal precision to use in writing output file as input to
        pd.DataFrame.to_csv(float_format=float_format). For example, use "%.3g" for 3
        decimal precision.
    blocklist_file : str, optional
        File location of a dataframe with features to exclude. Note that if "blocklist"
        is in operation and no file is provided, the standard blocklist is used.
    outlier_cutoff : float, default 15
        The threshold at which the maximum or minimum value of a feature across a full
        experiment is excluded. Note that this procedure is typically applied (and
        therefore the default is suitable) after normalization.
    noise_removal_perturb_groups : str or list of str, optional
        Perturbation groups corresponding to rows in profiles, or the name of the
        metadata column containing this information.
    noise_removal_stdev_cutoff : float, optional
        Maximum mean feature standard deviation to be kept for noise removal, grouped by
        the identity of the perturbation from perturb_list. The data must already be
        normalized so that this cutoff can apply to all columns.

    Returns
    -------
    selected_df : pandas.core.frame.DataFrame, optional
        The feature selected profile DataFrame. If output_file="none", then return the
        DataFrame. If you specify output_file, then write to file and do not return
        data.

    """

    all_ops = [
        "variance_threshold",
        "correlation_threshold",
        "drop_na_columns",
        "blocklist",
        "drop_outliers",
        "noise_removal",
    ]

    # Make sure the user provides a supported operation
    if isinstance(operation, list):
        assert all(
            x in all_ops for x in operation
        ), "Some operation(s) {} not supported. Choose {}".format(operation, all_ops)
    elif isinstance(operation, str):
        assert operation in all_ops, "{} not supported. Choose {}".format(
            operation, all_ops)
        operation = operation.split()
    else:
        raise ValueError("Operation must be a list or string")

    # Load Data
    profiles = load_profiles(profiles)

    if features == "infer":
        features = infer_cp_features(profiles, image_features=image_features)

    excluded_features = []
    for op in operation:
        if op == "variance_threshold":
            exclude = variance_threshold(
                population_df=profiles,
                features=features,
                samples=samples,
                freq_cut=freq_cut,
                unique_cut=unique_cut,
            )
        elif op == "drop_na_columns":
            exclude = get_na_columns(
                population_df=profiles,
                features=features,
                samples=samples,
                cutoff=na_cutoff,
            )
        elif op == "correlation_threshold":
            exclude = correlation_threshold(
                population_df=profiles,
                features=features,
                samples=samples,
                threshold=corr_threshold,
                method=corr_method,
            )
        elif op == "blocklist":
            if blocklist_file:
                exclude = get_blocklist_features(population_df=profiles,
                                                 blocklist_file=blocklist_file)
            else:
                exclude = get_blocklist_features(population_df=profiles)
        elif op == "drop_outliers":
            exclude = drop_outlier_features(
                population_df=profiles,
                features=features,
                samples=samples,
                outlier_cutoff=outlier_cutoff,
            )
        elif op == "noise_removal":
            exclude = noise_removal(
                population_df=profiles,
                features=features,
                noise_removal_perturb_groups=noise_removal_perturb_groups,
                noise_removal_stdev_cutoff=noise_removal_stdev_cutoff,
            )
        excluded_features += exclude

    excluded_features = list(set(excluded_features))

    selected_df = profiles.drop(excluded_features, axis="columns")

    if output_file != "none":
        output(
            df=selected_df,
            output_filename=output_file,
            compression_options=compression_options,
            float_format=float_format,
        )
    else:
        return selected_df
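
A minimal usage sketch (illustrative, not part of the library source above): the file
name is an assumption, and the operation list simply chains three of the supported
operations.

import pandas as pd

# Hypothetical input: any normalized profile table saved as CSV.
profiles = pd.read_csv("plate_normalized.csv.gz")

selected_df = feature_select(
    profiles=profiles,
    features="infer",
    operation=["variance_threshold", "drop_na_columns", "correlation_threshold"],
    na_cutoff=0.05,
    corr_threshold=0.9,
)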
Example #22
def aggregate(
    population_df,
    strata=["Metadata_Plate", "Metadata_Well"],
    features="infer",
    operation="median",
    output_file="none",
    compute_object_count=False,
    object_feature="ObjectNumber",
    subset_data_df="none",
    compression_options=None,
    float_format=None,
):
    """Combine population dataframe variables by strata groups using given operation.

    Parameters
    ----------
    population_df : pandas.core.frame.DataFrame
        DataFrame to group and aggregate.
    strata : list of str, default ["Metadata_Plate", "Metadata_Well"]
        Columns to groupby and aggregate.
    features : list of str, default "all"
        List of features that should be aggregated.
    operation : str, default "median"
        How the data is aggregated. Currently only supports one of ['mean', 'median'].
    output_file : str or file handle, optional
        If provided, will write aggregated profiles to file. If not specified, will return the aggregated profiles.
        We recommend naming the file based on the plate name.
    compute_object_count : bool, default False
        Whether or not to compute object counts.
    object_feature : str, default "ObjectNumber"
        Object number feature. Only used if compute_object_count=True.
    subset_data_df : pandas.core.frame.DataFrame, default "none"
        A DataFrame of rows used to subset the input before aggregation.
    compression_options : str, optional
        The mechanism to compress.
    float_format : str, optional
        Decimal precision to use in writing output file.

    Returns
    -------
    pandas.core.frame.DataFrame
        DataFrame of aggregated features.

    """

    # Check that the operation is supported
    operation = check_aggregate_operation(operation)

    # Subset the data to specified samples
    if isinstance(subset_data_df, pd.DataFrame):
        population_df = subset_data_df.merge(
            population_df, how="left", on=subset_data_df.columns.tolist()
        ).reindex(population_df.columns, axis="columns")

    # Subset dataframe to only specified variables if provided
    strata_df = population_df.loc[:, strata]

    # Only extract single object column in preparation for count
    if compute_object_count:
        count_object_df = population_df.loc[:, np.union1d(strata, [object_feature])]
        count_object_df = (
            count_object_df.groupby(strata)[object_feature]
            .count()
            .reset_index()
            .rename(columns={f"{object_feature}": "Metadata_Object_Count"})
        )

    if features == "infer":
        features = infer_cp_features(population_df)
        population_df = population_df.loc[:, features]
    else:
        population_df = population_df.loc[:, features]

    # Fix dtype of input features (they should all be floats!)
    convert_dict = {x: float for x in features}
    population_df = population_df.astype(convert_dict)

    # Merge back metadata used to aggregate by
    population_df = pd.concat([strata_df, population_df], axis="columns")

    # Perform aggregating function
    population_df = population_df.groupby(strata, dropna=False)

    if operation == "median":
        population_df = population_df.median().reset_index()
    else:
        population_df = population_df.mean().reset_index()

    # Compute objects counts
    if compute_object_count:
        population_df = count_object_df.merge(population_df,
                                              on=strata,
                                              how="right")

    # Aggregated image number and object number do not make sense
    for col in ["ImageNumber", "ObjectNumber"]:
        if col in population_df.columns:
            population_df = population_df.drop([col], axis="columns")

    if output_file != "none":
        output(
            df=population_df,
            output_filename=output_file,
            compression_options=compression_options,
            float_format=float_format,
        )
    else:
        return population_df

    return population_df
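
A minimal usage sketch (illustrative, not part of the library source above): the toy
single-cell dataframe is an assumption.

import pandas as pd

# Hypothetical input: three single cells across two wells of one plate.
single_cells = pd.DataFrame({
    "Metadata_Plate": ["p1", "p1", "p1"],
    "Metadata_Well": ["A01", "A01", "B02"],
    "ObjectNumber": [1, 2, 1],
    "Cells_AreaShape_Area": [100.0, 120.0, 90.0],
})

# Collapse to per-well medians and record how many objects each well contained.
well_df = aggregate(
    population_df=single_cells,
    strata=["Metadata_Plate", "Metadata_Well"],
    features=["Cells_AreaShape_Area"],
    operation="median",
    compute_object_count=True,
)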
                                        na_cutoff=na_cut)
        else:
            profile_df = feature_select(profiles=profile_df,
                                        operation=feature_select_ops,
                                        na_cutoff=na_cut,
                                        corr_threshold=corr_threshold,
                                        blocklist_file=full_blocklist_file)

        # Step 2: Spherize transform
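        # 2017_12_05_Batch2 spans multiple cell lines and time points, so it is
        # spherized within each (cell line, time point) group against its DMSO
        # wells; all other batches are spherized as one population.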
        if batch == "2017_12_05_Batch2":
            spherize_df = (
                profile_df.groupby(["Metadata_cell_line", "Metadata_time_point"])
                .apply(lambda x: normalize(
                    profiles=x,
                    features="infer",
                    meta_features="infer",
                    samples="Metadata_broad_sample == 'DMSO'",
                    method="spherize",
                ))
            )
        else:
            spherize_df = normalize(profiles=profile_df,
                                    features="infer",
                                    meta_features="infer",
                                    samples="Metadata_broad_sample == 'DMSO'",
                                    method="spherize")

        print(spherize_df.shape)
        spherize_df.head()

        # Step 3: Output profiles
        output(df=spherize_df, output_filename=output_file)