예제 #1
0
def test_get_image_quality_no_im_qc_cols():
    """A frame whose only columns are "x" and "y" must raise ValueError."""
    rows = [(1, 2), (2, 3), (3, 4), (4, 5)]
    frame = pd.DataFrame(rows, columns=["x", "y"])
    with pytest.raises(ValueError):
        utils.get_image_quality(frame)
예제 #2
0
def test_get_image_quality_fails_non_dataframe():
    """Handing get_image_quality a plain list must raise ValueError."""
    # Build a frame that does contain an ImageQuality column, then pull that
    # column out as an ordinary Python list to use as the invalid argument.
    frame = pd.DataFrame(
        [(1, 2, 2), (2, 4, 5), (3, 1, 1)],
        columns=["vals", "ImageQuality_test", "other"],
    )
    plain_list = frame["ImageQuality_test"].tolist()
    with pytest.raises(ValueError):
        utils.get_image_quality(plain_list)
예제 #3
0
def test_get_image_quality():
    """Only the "ImageQuality_test" column is reported for this frame."""
    # Three columns, exactly one of which carries the ImageQuality label.
    frame = pd.DataFrame(
        [(1, 2, 2), (2, 4, 5), (3, 1, 1)],
        columns=["vals", "ImageQuality_test", "other"],
    )
    found = utils.get_image_quality(frame)
    print(found)
    expected = ["ImageQuality_test"]
    assert found == expected
예제 #4
0
def test_get_image_quality_not_beginning():
    """Columns are reported even when "ImageQuality" is not a prefix."""
    # "Cells_ImageQuality" has the label at the end of the name rather than
    # the start; both labelled columns are expected back, in column order.
    column_names = ["vals", "ImageQuality_test", "Cells_ImageQuality"]
    frame = pd.DataFrame(
        [(1, 2, 2), (2, 4, 5), (3, 1, 1)],
        columns=column_names,
    )
    found = utils.get_image_quality(frame)
    assert found == column_names[1:]
예제 #5
0
파일: outliers.py 프로젝트: Swarchal/morar
def get_outlier_index(data, method="values", sigma=6, adjust=True, **kwargs):
    """
    Return the index labels of rows flagged as outliers.

    Parameters
    ----------
    data : pandas DataFrame
        Table to screen for outlying rows.
    method : string (default="values")
        "values" runs ``stats.hampel`` on every feature column reported by
        ``utils.get_featuredata``; "ImageQuality" runs it on the columns
        reported by ``utils.get_image_quality`` whose names contain
        "FocusScore" or "PowerLogLogSlope".
    sigma : int (default=6)
        number of median absolute deviations away from the sample median to
        define an outlier; forwarded to ``stats.hampel``.
    adjust : boolean (default=True)
        Only used with method="values". If true, sigma is multiplied by the
        number of feature columns: `sigma_adj = sigma * n_feature_columns`.
    **kwargs : additional arguments to utils.get_featuredata
        Only used with method="values".

    Returns
    -------
    bad_index : list
        list of row index/indices to remove. For "values" these are rows
        whose summed absolute hampel output is above zero. For
        "ImageQuality" these are rows whose summed hampel output is below
        zero for either metric; duplicates are dropped through a set, so
        the order of the list is not guaranteed.

    Raises
    ------
    ValueError
        If `data` is not a pandas DataFrame, or `method` is not one of
        "values" / "ImageQuality".
    """
    if not isinstance(data, pd.DataFrame):
        raise ValueError("not a pandas DataFrame")
    if method not in ("values", "ImageQuality"):
        raise ValueError("invalid argument. Options: values, ImageQuality")

    if method == "values":
        columns = utils.get_featuredata(data, **kwargs)
        # FIXME really crude correction
        n_sigma = sigma * len(columns) if adjust else sigma
        flags = data[columns].apply(stats.hampel, sigma=n_sigma)
        row_totals = flags.apply(lambda row: sum(abs(row)), axis=1)
        outlying = row_totals > 0
        return row_totals[outlying].index.tolist()

    if method == "ImageQuality":
        qc_names = utils.get_image_quality(data)

        def _negative_rows(keyword):
            # Rows whose hampel outputs, summed over every QC column
            # containing `keyword`, come out negative.
            matching = [name for name in qc_names if keyword in name]
            flags = data[matching].apply(stats.hampel, sigma=sigma)
            totals = flags.apply(np.sum, axis=1)
            return totals[totals < 0].index.tolist()

        # FocusScore is evaluated first, then PowerLogLogSlope.
        flagged = set(
            _negative_rows("FocusScore") + _negative_rows("PowerLogLogSlope")
        )
        return list(flagged)
예제 #6
0
def get_outlier_index(data, method="values", sigma=6, adjust=True, **kwargs):
    """
    Return the index labels of rows flagged as outliers.

    Parameters
    ----------
    data : pandas DataFrame
        Table to screen for outlying rows.
    method : string (default="values")
        "values" applies ``stats.hampel`` to each feature column returned
        by ``utils.get_featuredata``; "ImageQuality" applies it to the
        columns returned by ``utils.get_image_quality`` whose names contain
        "FocusScore" or "PowerLogLogSlope".
    sigma : int (default=6)
        number of median absolute deviations away from the sample median to
        define an outlier; passed on to ``stats.hampel``.
    adjust : boolean (default=True)
        Only used with method="values". If true, sigma is scaled by the
        number of feature columns: `sigma_adj = sigma * n_feature_columns`.
    **kwargs : additional arguments to utils.get_featuredata
        Only used with method="values".

    Returns
    -------
    bad_index : list
        list of row index/indices to remove. With "values", a row is listed
        when its summed absolute hampel output exceeds zero. With
        "ImageQuality", a row is listed when its summed hampel output is
        negative for either metric; the list is built from a set, so its
        order is not guaranteed.

    Raises
    ------
    ValueError
        If `data` is not a pandas DataFrame, or `method` is not one of
        "values" / "ImageQuality".
    """
    if not isinstance(data, pd.DataFrame):
        raise ValueError("not a pandas DataFrame")
    known_methods = ("values", "ImageQuality")
    if method not in known_methods:
        raise ValueError("invalid argument. Options: values, ImageQuality")

    if method == "values":
        feature_names = utils.get_featuredata(data, **kwargs)
        # FIXME really crude correction
        threshold = sigma
        if adjust:
            threshold = sigma * len(feature_names)
        scores = data[feature_names].apply(stats.hampel, sigma=threshold)
        per_row = scores.apply(lambda row: sum(abs(row)), axis=1)
        return per_row[per_row > 0].index.tolist()

    if method == "ImageQuality":
        qc_names = utils.get_image_quality(data)
        flagged = []
        # Screen each image-quality metric in turn; a negative row total
        # marks the row as bad for that metric.
        for keyword in ("FocusScore", "PowerLogLogSlope"):
            matching = [name for name in qc_names if keyword in name]
            scores = data[matching].apply(stats.hampel, sigma=sigma)
            totals = scores.apply(np.sum, axis=1)
            flagged.extend(totals[totals < 0].index.tolist())
        # A row can fail both metrics, so collapse duplicates.
        return list(set(flagged))