Пример #1
0
def test_feature_select_blocklist():
    """
    Testing feature_select with the "blocklist" operation

    Both the inferred feature set and the explicit list of all columns are
    expected to yield a frame holding only the "y" and "zz" columns.
    """
    input_columns = {
        "Nuclei_Correlation_Manders_AGP_DNA": [1, 3, 8, 5, 2, 2],
        "y": [1, 2, 8, 5, 2, 1],
        "Nuclei_Correlation_RWC_ER_RNA": [9, 3, 8, 9, 2, 9],
        "zz": [0, -3, 8, 9, 6, 9],
    }
    data_blocklist_df = pd.DataFrame(input_columns).reset_index(drop=True)

    # The same two columns are expected to remain in both scenarios
    surviving_df = pd.DataFrame({
        "y": [1, 2, 8, 5, 2, 1],
        "zz": [0, -3, 8, 9, 6, 9],
    })

    # Scenario 1: let feature_select infer which columns are features
    inferred_selection = feature_select(
        data_blocklist_df, features="infer", operation="blocklist"
    )
    pd.testing.assert_frame_equal(inferred_selection, surviving_df)

    # Scenario 2: name every column as a feature explicitly
    explicit_selection = feature_select(
        data_blocklist_df,
        features=data_blocklist_df.columns.tolist(),
        operation="blocklist",
    )
    pd.testing.assert_frame_equal(explicit_selection, surviving_df)
Пример #2
0
def test_feature_select_get_na_columns():
    """
    Testing feature_select with the "drop_na_columns" operation

    Runs the operation on data_na_df with the default cutoff, with
    na_cutoff=1 and with na_cutoff=0.3, and compares each output frame
    against the expected set of remaining columns.
    """

    def drop_na(**cutoff_options):
        # Every scenario uses all columns of data_na_df as features
        return feature_select(
            data_na_df,
            features=data_na_df.columns.tolist(),
            operation="drop_na_columns",
            **cutoff_options,
        )

    # Default cutoff: only the "yy" column is expected to remain
    default_selection = drop_na()
    pd.testing.assert_frame_equal(
        default_selection, pd.DataFrame({"yy": [1, 2, 8, 10, 2, 100]})
    )

    # na_cutoff=1: the output is expected to equal the input frame
    keep_all_selection = drop_na(na_cutoff=1)
    pd.testing.assert_frame_equal(keep_all_selection, data_na_df)

    # na_cutoff=0.3: four columns are expected to remain
    partial_selection = drop_na(na_cutoff=0.3)
    nan = np.nan
    partial_expected = pd.DataFrame({
        "x": [nan, 3, 8, 5, 2, 2],
        "xx": [nan, 3, 8, 5, 2, 2],
        "yy": [1, 2, 8, 10, 2, 100],
        "z": [9, 3, 8, 9, 2, nan],
    })
    pd.testing.assert_frame_equal(partial_selection, partial_expected)
Пример #3
0
def test_feature_select_get_na_columns_feature_infer():
    """
    Testing feature_select "drop_na_columns" with inferred vs. explicit features

    With features="infer" the expected output still contains "Metadata_x";
    when every column is passed explicitly as a feature, "Metadata_x" is
    expected to be absent from the output.
    """
    nan = np.nan

    # Columns expected in the output of both calls, in output order
    shared_columns = {
        "Cytoplasm_xx": [nan, 3, 8, 5, 2, 2],
        "Nuclei_yy": [1, 2, 8, 10, 2, 100],
        "Cytoplasm_z": [9, 3, 8, 9, 2, nan],
    }
    metadata_column = {"Metadata_x": [nan, nan, 8, nan, 2, nan]}

    inferred_selection = feature_select(
        data_feature_infer_df,
        features="infer",
        operation="drop_na_columns",
        na_cutoff=0.3,
    )
    # "Metadata_x" leads the expected column order
    inferred_expected = pd.DataFrame({**metadata_column, **shared_columns})
    pd.testing.assert_frame_equal(inferred_selection, inferred_expected)

    explicit_selection = feature_select(
        data_feature_infer_df,
        features=data_feature_infer_df.columns.tolist(),
        operation="drop_na_columns",
        na_cutoff=0.3,
    )
    explicit_expected = pd.DataFrame(shared_columns)
    pd.testing.assert_frame_equal(explicit_selection, explicit_expected)
Пример #4
0
def test_feature_select_all():
    """
    Testing feature_select with several operations chained together

    Covers three scenarios on a frame derived from data_unique_test_df:
    an in-memory run of "drop_na_columns" + "correlation_threshold", the
    same run reading the profiles from a CSV file and writing the result
    to an output file, and an in-memory run that additionally applies
    "variance_threshold".

    The file round trip uses a private temporary directory that is removed
    when the scenario finishes, so no files are left behind in (or shared
    through) the system-wide temporary directory.
    """
    data_all_test_df = data_unique_test_df.assign(zz=a_feature)
    # Presumably column 4 is the newly added "zz" column and column 1 is "b";
    # verify against the data_unique_test_df fixture
    data_all_test_df.iloc[1, 4] = 2
    data_all_test_df.iloc[list(range(50)), 1] = np.nan

    all_operations_base = ["drop_na_columns", "correlation_threshold"]

    result = feature_select(
        profiles=data_all_test_df,
        features=data_all_test_df.columns.tolist(),
        operation=all_operations_base,
        corr_threshold=0.7,
    )
    expected_result = pd.DataFrame({
        "c": c_feature,
        "d": d_feature,
        "zz": a_feature
    }).reset_index(drop=True)
    # Mirror the single-cell edit made to the input's "zz" column above
    expected_result.iloc[1, 2] = 2
    pd.testing.assert_frame_equal(result, expected_result)

    # Round trip through files inside a self-cleaning temporary directory
    with tempfile.TemporaryDirectory() as scratch_dir:
        # Write file to output
        data_file = os.path.join(scratch_dir, "test_feature_select.csv")
        data_all_test_df.to_csv(data_file, index=False, sep=",")
        out_file = os.path.join(scratch_dir, "test_feature_select_out.csv")
        _ = feature_select(
            profiles=data_file,
            features=data_all_test_df.columns.tolist(),
            operation=["drop_na_columns", "correlation_threshold"],
            corr_threshold=0.7,
            output_file=out_file,
        )
        # Read the output back before the directory is deleted
        from_file_result = pd.read_csv(out_file)
    pd.testing.assert_frame_equal(from_file_result, expected_result)

    result = feature_select(
        profiles=data_all_test_df,
        features=data_all_test_df.columns.tolist(),
        operation=[
            "drop_na_columns", "correlation_threshold", "variance_threshold"
        ],
        corr_threshold=0.7,
    )
    expected_result = pd.DataFrame({
        "c": c_feature,
        "d": d_feature
    }).reset_index(drop=True)
    pd.testing.assert_frame_equal(result, expected_result)
Пример #5
0
def test_feature_select_variance_threshold():
    """
    Testing feature_select with the "variance_threshold" operation

    First runs the operation alone with unique_cut=0.01, then combines it
    with "drop_na_columns" in both orders on a copy of the data whose
    second column has its first 50 rows set to NaN.
    """
    variance_only = feature_select(
        data_unique_test_df,
        features=data_unique_test_df.columns.tolist(),
        operation="variance_threshold",
        unique_cut=0.01,
    )
    variance_only_expected = pd.DataFrame(
        {"b": b_feature, "c": c_feature, "d": d_feature}
    ).reset_index(drop=True)
    pd.testing.assert_frame_equal(variance_only, variance_only_expected)

    # The expected output is the same regardless of the operation order
    operation_orders = (
        ["drop_na_columns", "variance_threshold"],
        ["variance_threshold", "drop_na_columns"],
    )
    for operation_order in operation_orders:
        # Start each scenario from a fresh copy with NaNs injected
        na_data_unique_test_df = data_unique_test_df.copy()
        na_data_unique_test_df.iloc[list(range(50)), 1] = np.nan

        combined = feature_select(
            na_data_unique_test_df,
            features=na_data_unique_test_df.columns.tolist(),
            operation=operation_order,
        )
        combined_expected = pd.DataFrame(
            {"c": c_feature, "d": d_feature}
        ).reset_index(drop=True)
        pd.testing.assert_frame_equal(combined, combined_expected)
Пример #6
0
def test_feature_select_drop_outlier():
    """
    Testing feature_select with the "drop_outliers" operation

    Checks the default cutoff, an explicit outlier_cutoff=30, and a run
    restricted to two named features that is expected to return the input
    frame unchanged.
    """
    # Default cutoff with inferred features: two columns are expected to go
    default_selection = feature_select(
        data_outlier_df,
        features="infer",
        operation="drop_outliers",
    )
    default_expected = data_outlier_df.drop(columns=["Cells_zz", "Nuclei_z"])
    pd.testing.assert_frame_equal(default_selection, default_expected)

    # outlier_cutoff=30: only "Cells_zz" is expected to be removed
    cutoff_selection = feature_select(
        data_outlier_df,
        features="infer",
        operation="drop_outliers",
        outlier_cutoff=30,
    )
    cutoff_expected = data_outlier_df.drop(columns=["Cells_zz"])
    pd.testing.assert_frame_equal(cutoff_selection, cutoff_expected)

    # Only two features considered: the output should equal the input
    restricted_features = ["Cells_x", "Cytoplasm_y"]
    restricted_selection = feature_select(
        data_outlier_df,
        features=restricted_features,
        operation="drop_outliers",
    )
    pd.testing.assert_frame_equal(restricted_selection, data_outlier_df)
Пример #7
0
def test_feature_select_correlation_threshold():
    """
    Testing feature_select with the "correlation_threshold" operation

    Runs the operation alone on data_df, then together with
    "drop_na_columns" on a copy of data_df that has one NaN injected.
    """
    # Operation on its own: the "y" column is expected to be removed
    correlation_only = feature_select(
        data_df,
        features=data_df.columns.tolist(),
        operation="correlation_threshold",
    )
    pd.testing.assert_frame_equal(
        correlation_only, data_df.drop(columns=["y"])
    )

    # Inject a single NaN into the first row of the third column
    data_cor_thresh_na_df = data_df.copy()
    data_cor_thresh_na_df.iloc[0, 2] = np.nan

    with_na_drop = feature_select(
        data_cor_thresh_na_df,
        features=data_cor_thresh_na_df.columns.tolist(),
        operation=["drop_na_columns", "correlation_threshold"],
    )
    # Expected output is built from the unmodified data_df
    pd.testing.assert_frame_equal(
        with_na_drop, data_df.drop(columns=["z", "x"])
    )
Пример #8
0
def test_feature_select_compress():
    """
    Testing feature_select output written with gzip compression

    Writes the "drop_na_columns" result for data_na_df to a ".csv.gz"
    file and checks that reading it back gives the expected single
    "yy" column.
    """
    # NOTE(review): tmpdir is not defined in this function; presumably a
    # module-level temporary directory path - confirm
    compress_file = os.path.join(tmpdir, "test_feature_select_compress.csv.gz")

    # The return value is not needed; only the written file is inspected
    feature_select(
        data_na_df,
        features=data_na_df.columns.tolist(),
        operation="drop_na_columns",
        output_file=compress_file,
        compression="gzip",
    )

    expected_from_file = pd.DataFrame({"yy": [1, 2, 8, 10, 2, 100]})
    read_back = pd.read_csv(compress_file)
    pd.testing.assert_frame_equal(read_back, expected_from_file)
Пример #9
0
# ## Perform Feature Selection

# In[3]:

# Candidate features are the train_df columns whose names start with one of
# these prefixes (str.startswith accepts a tuple of options)
eb = ("actin", "DNA", "dist", "nuclear")
features = [column for column in train_df.columns if column.startswith(eb)]

# In[4]:

# Apply the three selection operations to the candidate features
train_feature_select_df = feature_select(
    profiles=train_df,
    features=features,
    operation=["drop_na_columns", "variance_threshold", "correlation_threshold"],
    na_cutoff=0.01,
    corr_threshold=0.95,
    corr_method="pearson",
    freq_cut=0.01,
    unique_cut=0.001,
)

# In[5]:

# Feature columns (by the same prefixes) that remain after selection
selected_features = [
    column
    for column in train_feature_select_df.columns
    if column.startswith(eb)
]

print(train_feature_select_df.shape)
train_feature_select_df.head()