Example #1
def find_correlation(data, threshold=0.9):
    """
    Given a numeric pd.DataFrame, this will find highly correlated features,
    and return a list of features to remove.

    Parameters
    ----------
    data : pandas DataFrame
        DataFrame
    threshold : float
        correlation threshold; will remove one of each pair of features
        with a correlation greater than this value

    Returns
    -------
    select_flat : list
        list of column names to be removed
    """
    corr_mat = data[utils.get_featuredata(data)].corr()
    # keep only the lower triangle (excluding the diagonal) so each
    # correlated pair is counted once
    corr_mat.loc[:, :] = np.tril(corr_mat, k=-1)
    already_in = set()
    result = []
    for col in corr_mat:
        # columns whose correlation with `col` exceeds the threshold
        perfect_corr = corr_mat[col][corr_mat[col] > threshold].index.tolist()
        if perfect_corr and col not in already_in:
            already_in.update(set(perfect_corr))
            perfect_corr.append(col)
            result.append(perfect_corr)
    # keep the first column of each correlated group; mark the rest for removal
    select_nested = [f[1:] for f in result]
    select_flat = [i for j in select_nested for i in j]
    return select_flat
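
A minimal usage sketch (toy data; find_correlation relies on utils.get_featuredata returning the non-metadata columns, as the tests below demonstrate):

import pandas as pd

# colB duplicates colA, so the pair is perfectly correlated
df = pd.DataFrame({"colA": [1, 2, 3, 4],
                   "colB": [1, 2, 3, 4],
                   "colC": [4, 1, 3, 2],
                   "Metadata_well": ["A1", "A2", "B1", "B2"]})
to_drop = find_correlation(df, threshold=0.9)  # one of the pair, e.g. ["colA"]
df_reduced = df.drop(columns=to_drop)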
Example #2
def featuredata(self):
    """return featuredata"""
    featuredata_cols = utils.get_featuredata(self, self.metadata_string,
                                             self.prefix)
    return DataFrame(self[featuredata_cols],
                     metadata_string=self.metadata_string,
                     prefix=self.prefix)
Example #3
def test_get_featuredata_simple():
    x = [1,2,3,4]
    y = [4,3,2,1]
    z = [1,2,3,4]
    columns = ["colA", "colB", "Metadata_A"]
    test_df = pd.DataFrame(list(zip(x, y, z)), columns=columns)
    cols = utils.get_featuredata(test_df)
    assert cols == ["colA", "colB"]
Example #4
def test_get_featuredata_middle_prefix():
    x = [1,2,3,4]
    y = [4,3,2,1]
    z = [1,2,3,4]
    a = [4,3,5,1]
    columns = ["colA", "colB", "something_Metadata", "Metadata_A"]
    test_df = pd.DataFrame(list(zip(x, y, z, a)), columns=columns)
    cols = utils.get_featuredata(test_df, prefix=True)
    assert cols == ["colA", "colB", "something_Metadata"]
Example #5
def test_get_featuredata_different_case():
    x = [1,2,3,4]
    y = [4,3,2,1]
    z = [1,2,3,4]
    a = [4,3,5,1]
    columns = ["colA", "colB", "metadata_A", "Metadata_A"]
    test_df = pd.DataFrame(list(zip(x, y, z, a)), columns=columns)
    cols = utils.get_featuredata(test_df, metadata_string="metadata")
    assert cols == ["colA", "colB", "Metadata_A"]
Example #6
def test_get_feature_data_prefix_options():
    x = [1,2,3,4]
    y = [4,3,2,1]
    z = [1,2,3,4]
    a = [4,3,5,1]
    columns = ["colA", "colB", "something_Metadata", "Metadata_A"]
    test_df = pd.DataFrame(list(zip(x, y, z, a)), columns=columns)
    out = utils.get_featuredata(test_df, prefix=False)
    ans = ["colA", "colB"]
    assert out == ans
Example #7
def test_feature_importance_returns_colnames():
    x, y = make_classification(n_samples=100, n_features=10, n_informative=2)
    x = pd.DataFrame(x)
    x.columns = ["x" + str(i) for i in range(1, 11)]
    x["Metadata_compound"] = ["pos", "neg"] * 50
    out = feature_selection.feature_importance(
        data=x, neg_cmpd="neg", pos_cmpd="pos", compound_col="Metadata_compound"
    )
    f_names, importances = list(zip(*out))
    feature_col_names = utils.get_featuredata(x)
    assert list(f_names) == list(feature_col_names)
Example #8
def get_outlier_index(data, method="values", sigma=6, adjust=True, **kwargs):
    """
    Returns index of outlying row(s)

    Parameters
    ----------
    data : pandas DataFrame
        DataFrame
    method : string (default="values")
        either 'values', which applies Hampel's robust outlier test to the
        feature values, or 'ImageQuality', which uses the ImageQuality
        metrics FocusScore and PowerLogLogSlope.
    sigma : int (default=6)
        number of median absolute deviations away from the sample median
        that defines an outlier.
    adjust : boolean (default=True)
        if True, adjusts the sigma value to account for multiple
        measurements: `sigma_adj = sigma * n_feature_columns`
    **kwargs : additional arguments to utils.get_featuredata

    Returns
    -------
    bad_index : list
        list of row index/indices to remove
    """
    if not isinstance(data, pd.DataFrame):
        raise ValueError("not a pandas DataFrame")
    accepted_methods = ["values", "ImageQuality"]
    if method not in accepted_methods:
        raise ValueError("invalid argument. Options: values, ImageQuality")
    if method == "values":
        feature_cols = utils.get_featuredata(data, **kwargs)
        # FIXME really crude correction
        if adjust:
            sigma = sigma * len(feature_cols)
        hampel_out = data[feature_cols].apply(stats.hampel, sigma=sigma)
        hampel_abs = hampel_out.apply(lambda x: sum(abs(x)), axis=1)
        return hampel_abs[hampel_abs > 0].index.tolist()
    if method == "ImageQuality":
        qc_cols = utils.get_image_quality(data)
        # find bad images with FocusScore
        focus_cols = [col for col in qc_cols if "FocusScore" in col]
        hampel_focus = data[focus_cols].apply(stats.hampel, sigma=sigma)
        focus_sum = hampel_focus.apply(np.sum, axis=1)
        focus_bad = focus_sum[focus_sum < 0].index.tolist()
        # find bad images with PowerLogLogSlope
        plls_cols = [col for col in qc_cols if "PowerLogLogSlope" in col]
        hampel_plls = data[plls_cols].apply(stats.hampel, sigma=sigma)
        plls_sum = hampel_plls.apply(np.sum, axis=1)
        plls_bad = plls_sum[plls_sum < 0].index.tolist()
        bad_index = list(set(focus_bad + plls_bad))
        return bad_index
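
A hedged usage sketch; df is a hypothetical DataFrame and the module's own stats.hampel helper is assumed to be importable:

# flag and drop outlying rows based on the raw feature values
bad_rows = get_outlier_index(df, method="values", sigma=6, adjust=True)
df_clean = df.drop(index=bad_rows)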
Example #9
def _check_featuredata(data, on, metadata_string, prefix):
    """
    Check feature data is numerical
    """
    feature_cols = utils.get_featuredata(data, metadata_string, prefix)
    cols_to_check = [col for col in feature_cols if col not in [on]]
    df_to_check = data[cols_to_check]
    is_number = np.vectorize(lambda x: np.issubdtype(x, np.number))
    numeric_mask = is_number(df_to_check.dtypes)
    if not numeric_mask.all():
        # report the offending column name(s)
        nn_cols = df_to_check.columns[~numeric_mask].tolist()
        err_msg = "non-numeric featuredata column(s): {}".format(nn_cols)
        raise ValueError(err_msg)
Example #10
def find_replicate_var(data, grouping, sorted_by_var=True):
    """
    Return within replicate variance of featuredata
    """
    variances = []
    feature_cols = utils.get_featuredata(data)
    for _, group in data.groupby(grouping):
        variance = group[feature_cols].var()
        variances.append(variance)
    # average the per-group variances feature-wise (one value per column),
    # so the result lines up with feature_cols in the zip below
    var_mean = np.nanmean(np.vstack(variances), axis=0)
    feature_var = list(zip(feature_cols, var_mean))
    if sorted_by_var:
        feature_var.sort(key=lambda x: x[1])
    return feature_var
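
A usage sketch, assuming replicates share a compound label (column name illustrative):

# rank features by mean within-replicate variance, lowest (most
# reproducible) first
feature_var = find_replicate_var(df, grouping="Metadata_compound")
most_reproducible = [name for name, _ in feature_var[:10]]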
Example #11
def s_normalise(data, plate_id, compound="Metadata_compound",
                neg_compound="DMSO", method="subtract",
                metadata_string="Metadata_", prefix=True):
    """
    Normalise values against negative controls values per plate.

    Parameters
    ----------
    data : pandas DataFrame
        DataFrame
    plate_id : string
        column containing plate ID/label
    compound : string (default="Metadata_compound")
        column containing compound name/ID
    neg_compound : string (default="DMSO")
        name of negative control compound in compound col
    method : string (default="subtract")
        method to normalise against the negative control, either "subtract"
        or "divide"
    metadata_string : string (default="Metadata_")
        string used to identify metadata columns, passed to
        utils.get_featuredata
    prefix : boolean (default=True)
        if True, only columns beginning with metadata_string are treated as
        metadata; passed to utils.get_featuredata

    Returns
    -------
    df_out : pandas DataFrame
        DataFrame of normalised feature values
    """
    valid_methods = ["subtract", "divide"]
    if method not in valid_methods:
        raise ValueError("Invalid method, options: {}".format(valid_methods))
    # check there are some negative controls on each plate
    _check_control(data, plate_id, compound, neg_compound)
    # identify feature columns
    f_cols = utils.get_featuredata(data, metadata_string, prefix)
    # dataframe for output
    df_out = pd.DataFrame()
    # group by plate
    grouped = data.groupby(plate_id, as_index=False)
    # calculate the average negative control values for each plate
    for _, group in grouped:
        dmso_med_ = group[group[compound] == neg_compound]
        dmso_med = dmso_med_[f_cols].median()
        if method == "subtract":
            group[f_cols] = group[f_cols].sub(dmso_med)
        if method == "divide":
            group[f_cols] = group[f_cols].divide(dmso_med)
        # concatenate group to overall dataframe
        df_out = pd.concat([df_out, group])
    # check we have not lost any rows
    assert data.shape == df_out.shape
    return df_out
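
A usage sketch; the plate and compound column names are illustrative:

# per-plate normalisation against the DMSO wells
normed = s_normalise(df, plate_id="Metadata_plate",
                     compound="Metadata_compound",
                     neg_compound="DMSO", method="subtract")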
Example #12
def p_normalise(data, plate_id, compound="Metadata_compound",
                neg_compound="DMSO", n_jobs=-1, method="subtraction",
                metadata_string="Metadata_", prefix=True):
    """
    parallelised version of normalise, currently only works with subtraction
    normalisation.
    """
    _check_control(data, plate_id, compound, neg_compound)
    if n_jobs < 0:
        # use all available cpu cores
        n_jobs = multiprocessing.cpu_count()
    f_cols = utils.get_featuredata(data, metadata_string, prefix)
    grouped = data.groupby(plate_id, as_index=False)
    return _apply_parallel(grouped_df=grouped, func=_norm_group,
                           neg_compound=neg_compound, compound=compound,
                           f_cols=f_cols, n_jobs=n_jobs, method=method)
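
Usage mirrors s_normalise; a sketch with illustrative column names:

# parallelised across plates; n_jobs=-1 uses all available CPU cores
normed = p_normalise(df, plate_id="Metadata_plate",
                     compound="Metadata_compound",
                     neg_compound="DMSO", n_jobs=-1)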
Example #13
def robust_normalise(data, plate_id, compound="Metadata_compound",
                     neg_compound="DMSO", metadata_string="Metadata_",
                     prefix=True):
    """
    Method used in the Carpenter lab. Subtracts each plate's median
    negative-control feature value from the treatment feature values and
    divides by the negative control's median absolute deviation.

    Parameters
    ----------
    data : pandas DataFrame
        DataFrame
    plate_id : string
        column containing plate ID/label
    compound : string (default="Metadata_compound")
        column containing compound name/ID
    neg_compound : string (default="DMSO")
        name of negative control compound in compound col
    metadata_string : string (default="Metadata_")
        string used to identify metadata columns, passed to
        utils.get_featuredata
    prefix : boolean (default=True)
        if True, only columns beginning with metadata_string are treated as
        metadata; passed to utils.get_featuredata

    Returns
    -------
    df_out : pandas DataFrame
        DataFrame of normalised feature values
    """
    _check_control(data, plate_id, compound, neg_compound)
    f_cols = utils.get_featuredata(data, metadata_string, prefix)
    grouped = data.groupby(plate_id, as_index=False)
    df_out = pd.DataFrame()
    # calculate the average negative control values per plate_id
    for _, group in grouped:
        # find the median and mad dmso value for each plate
        dmso_vals = group[group[compound] == neg_compound]
        dmso_med = dmso_vals[f_cols].median().values
        dmso_mad = dmso_vals[f_cols].apply(stats.mad, axis=0).values
        assert len(dmso_med) == group[f_cols].shape[1]
        # subtract each row of the group by that group's DMSO values
        group[f_cols] = group[f_cols].sub(dmso_med)
        # divide by the MAD of the negative control; 1.4826 is the constant
        # that scales a MAD to the standard deviation under normality
        group[f_cols] = group[f_cols].apply(lambda x: (x/dmso_mad)*1.4826, axis=1)
        # concatenate group to overall dataframe
        df_out = pd.concat([df_out, group])
    # check we have not lost any rows
    assert data.shape == df_out.shape
    return df_out
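
A usage sketch (illustrative column names); per feature, the result is in effect a robust z-score against each plate's negative controls:

normed = robust_normalise(df, plate_id="Metadata_plate",
                          compound="Metadata_compound",
                          neg_compound="DMSO")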
Example #14
def _split_classes(data, neg_cmpd, pos_cmpd, compound_col):
    """
    Internal function used to separate featuredata and compound labels for
    classification.

    Parameters
    ----------
    data : pandas DataFrame
    neg_cmpd : string
        name of negative control in compound_col
    pos_cmpd : string
        name of positive control in compound_col
    compound_col : string
        name of column in df that contains compound labels

    Returns
    -------
    classes : list
        [X, Y], where X is the DataFrame containing the feature columns,
        and Y is the list of integers matching the positive or negative
        controls.
    """

    if not isinstance(data, pd.DataFrame):
        raise ValueError("data is not a pandas DataFrame")
    if compound_col not in data.columns:
        raise ValueError("{} is not a column in data".format(compound_col))
    if neg_cmpd not in data[compound_col].tolist():
        raise ValueError("{} is not in column {}".format(
            neg_cmpd, compound_col))
    if pos_cmpd not in data[compound_col].tolist():
        raise ValueError("{} is not in column {}".format(
            pos_cmpd, compound_col))
    # split data into just positive and negative controls
    controls = [neg_cmpd, pos_cmpd]
    df_cntrl = data[data[compound_col].isin(controls)].copy()
    # convert compound labels to integers; pd.Categorical assigns codes in
    # sorted category order, so neg_cmpd=0 / pos_cmpd=1 only holds when the
    # negative control name sorts before the positive one
    cntrl_int = pd.Categorical(df_cntrl[compound_col]).codes.tolist()
    df_cntrl[compound_col] = cntrl_int
    # select just feature data
    X = df_cntrl[utils.get_featuredata(df_cntrl)]
    Y = df_cntrl[compound_col].tolist()
    return [X, Y]
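
A usage sketch; the positive-control name here is hypothetical:

# featuredata matrix and integer labels for the two control classes
X, Y = _split_classes(df, neg_cmpd="DMSO", pos_cmpd="staurosporine",
                      compound_col="Metadata_compound")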
Example #15
def scale_features(data, metadata_string="Metadata_", prefix=True):
    """
    Scale and centre features with a z-score

    Parameters
    ----------
    data : pandas DataFrame
        DataFrame
    metadata_string : string (default="Metadata_")
        string used to identify metadata columns, passed to
        utils.get_featuredata/get_metadata
    prefix : boolean (default=True)
        if True, only columns beginning with metadata_string are treated as
        metadata

    Returns
    -------
    scaled : pandas DataFrame
        DataFrame of the same dimensions as data, with scaled feature values
    """
    data_columns = data.columns.tolist()
    feature_data = data[utils.get_featuredata(data, metadata_string, prefix)]
    metadata = data[utils.get_metadata(data, metadata_string, prefix)]
    scaled_featuredata = feature_data.apply(z_score)
    scaled_both = pd.concat([scaled_featuredata, metadata], axis=1)
    # return columns to original order
    scaled_both = scaled_both[data_columns]
    return scaled_both
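
The z_score helper applied above is not shown in this listing; a minimal sketch of what it is assumed to do (column-wise standardisation, NaN-aware):

import numpy as np

def z_score(x):
    """Assumed helper: standardise a Series to zero mean and unit variance."""
    return (x - np.nanmean(x)) / np.nanstd(x)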
Example #16
def find_low_var(data, threshold=1e-5):
    """
    Return column names of feature columns with zero or very low variance

    Parameters
    ----------
    data : pandas DataFrame
        DataFrame
    threshold : float
        low variance threshold

    Returns
    -------
    columns : list
        list of columns to remove
    """
    if not isinstance(data, pd.DataFrame):
        raise ValueError("not a pandas DataFrame")
    var = data[utils.get_featuredata(data)].var(axis=0)
    below_thresh = var[var <= threshold].index.tolist()
    # all-NaN columns have NaN variance, so they are collected separately
    is_nan = utils.is_all_nan(data)
    columns = list(below_thresh) + list(is_nan)
    return columns
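
A one-line usage sketch:

# drop near-constant and all-NaN feature columns
df_reduced = df.drop(columns=find_low_var(df, threshold=1e-5))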
Example #17
def featurecols(self):
    """return a list of featuredata column names"""
    return utils.get_featuredata(self, self.metadata_string, self.prefix)