예제 #1
0
def test_explain_smoke_titanic():
    """Smoke test: explain() runs on a SimpleClassifier, a bare tree, and a pipeline."""
    df = clean(load_titanic())
    # 1) dabl's own SimpleClassifier, fitted straight on the cleaned frame.
    simple = SimpleClassifier().fit(df, target_col='survived')
    explain(simple)
    # 2) A plain sklearn estimator on explicitly preprocessed features.
    features = df.drop("survived", axis=1)
    target = df.survived
    prep = EasyPreprocessor()
    transformed = prep.fit_transform(features)
    clf = DecisionTreeClassifier().fit(transformed, target)
    explain(clf, feature_names=prep.get_feature_names())
    # 3) A full preprocessing + model pipeline.
    pipeline = make_pipeline(EasyPreprocessor(), LogisticRegression())
    pipeline.fit(features, target)
    explain(pipeline, feature_names=pipeline[0].get_feature_names())
예제 #2
0
def test_explain_titanic_val(model):
    """Run explain() on a fitted pipeline both with and without a validation set."""
    # add multi-class
    # add regression
    df = clean(load_titanic())
    features = df.drop("survived", axis=1)
    target = df.survived
    split = train_test_split(features, target, stratify=target,
                             random_state=42)
    X_train, X_val, y_train, y_val = split
    estimator = make_pipeline(EasyPreprocessor(), model)
    estimator.fit(X_train, y_train)
    # without validation set
    explain(estimator, feature_names=features.columns)
    # with validation set
    explain(estimator, X_val, y_val, feature_names=features.columns)
예제 #3
0
def preprocess_data(df, sample=20000):
    """Down-sample *df* to at most *sample* rows and clean it with dabl.

    Returns the ``(cleaned_frame, detected_types)`` pair produced by
    ``dabl.clean(..., return_types=True)``.
    """
    n_rows = len(df) if len(df) < sample else sample
    subset = df.sample(n=n_rows, random_state=42)
    return dabl.clean(subset, return_types=True, verbose=3)
def run(csv_metadata, list_files, output_folder):
    """Fit dabl regressors on each 'money' column of one CSV and dump scores.

    Parameters
    ----------
    csv_metadata : tuple
        Pair ``(csv_id, metadata_dict)``; the dict presumably is
        csv-detective output with ``encoding``, ``separator`` and
        ``columns`` keys — verify against the caller.
    list_files : sequence of pathlib.Path
        Candidate CSV paths; the first one whose string contains ``csv_id``
        is read.
    output_folder : pathlib.Path
        Folder where the per-file ``<csv_id>_dabl.csv`` summary is written.

    Returns
    -------
    pathlib.Path or None
        Path of the written summary; ``None`` when the CSV could not be
        read; implicit ``None`` (bare ``return``) when no model produced
        any result.
    """
    csv_id = csv_metadata[0]
    csv_metadata = csv_metadata[1]

    tqdm.write(f"\nTreating {csv_id} file")

    # Find the full path for the csv_id: first file whose path contains it.
    # mask.index(True) raises ValueError if no file matches — intentional?
    mask = [csv_id in str(f) for f in list_files]
    csv_file_path = list_files[mask.index(True)]

    dabl_analysis_path = (output_folder /
                          (csv_id + '_dabl')).with_suffix('.csv')
    # NOTE(review): the "already analyzed" early-exit below is disabled by
    # turning it into a bare string literal (dead code) — re-enable or
    # delete it deliberately.
    """
    if dabl_analysis_path.exists():
        tqdm.write(f"File {csv_id} already analyzed: {dabl_analysis_path} already exists")
        return dabl_analysis_path"""
    result_list = []

    # Fall back to a hard-coded CSV dialect when the metadata dict is
    # missing or has at most one entry.
    if csv_metadata and len(csv_metadata) > 1:
        encoding = csv_metadata["encoding"]
        sep = csv_metadata["separator"]
    else:
        encoding = "latin-1"  # because why not
        sep = ";"
    csv_detective_columns = []  # currently unused: the drop below is commented out
    if "columns" in csv_metadata:
        # keep columns that are not boolean
        csv_detective_columns = [
            k.strip('"') for k, v in csv_metadata['columns'].items()
            if "booleen" not in v
        ]
    try:
        # NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and
        # removed in 2.0 — confirm the pinned pandas version, or migrate to
        # on_bad_lines="skip".
        data: pd.DataFrame = pd.read_csv(csv_file_path.as_posix(),
                                         encoding=encoding,
                                         sep=sep,
                                         error_bad_lines=False)
        # remove csv_detective columns
        #data = data.drop(csv_detective_columns, axis=1)
        # TODO change this as now the columns are not in the same order

        data_clean, data_types = dabl.clean(data, return_types=True, verbose=3)
        # dabl.detect_types(data)
        # One regression model per detected money column.
        money_variables = csv_metadata['columns']['money']
        for target_col in money_variables:
            try:
                # Drop rows where the target itself is missing.
                data_clean_no_nan = data_clean[data_clean[target_col].notna()]
                if len(data_clean_no_nan
                       ) < 100:  # less than 100 examples is too few examples
                    continue
                print(f"Building models with target variable: {target_col}")
                sc = dabl.SimpleRegressor(random_state=42).fit(
                    data_clean_no_nan, target_col=target_col)
                # Feature names come from the preprocessing step inside the
                # fitted estimator pipeline.
                features_names = sc.est_.steps[0][1].get_feature_names()
                inner_dict = {
                    "csv_id": csv_id,
                    "task": "regression",
                    "algorithm": sc.current_best_.name,
                    "target_col": target_col,
                    "nb_features": len(features_names),
                    "features_names": "|".join(features_names),
                    "nb_classes": len(data[target_col].unique()),
                    "nb_lines": data_clean_no_nan.shape[0],
                    "date": today,  # NOTE(review): `today` is a module-level global — confirm it is defined
                }

                inner_dict.update(sc.current_best_.to_dict())
                # Also record the mean over the best model's score dict.
                inner_dict.update({
                    "avg_scores":
                    np.mean(list(sc.current_best_.to_dict().values()))
                })
                result_list.append(inner_dict)
            except Exception as e:
                # Best-effort: one failing target column must not abort the
                # whole file.
                tqdm.write(
                    f"Could not analyze file {csv_id} with target col {target_col}. Error {str(e)}"
                )
    except Exception as e:
        tqdm.write(f"Could not analyze file {csv_id}. Error: {e}")
        return None
    if not result_list:
        return
    result_df = pd.DataFrame(result_list)
    with open(dabl_analysis_path, "w") as filo:
        result_df.to_csv(filo, header=True, index=False)
    return dabl_analysis_path
예제 #5
0
# Notebook export: quick dabl exploration of the `save_within_48` label.
# `bdf` and `final_feature_list` are defined earlier in the notebook.
bdf.save_within_48.value_counts()

# ## Extract and do DABL

# In[ ]:

import dabl

# In[ ]:

# Restrict to the curated feature columns.
feature_df = bdf[final_feature_list]

# In[ ]:

# Let dabl infer and clean column types first.
dabl_data = dabl.clean(feature_df)

# In[ ]:

# Overview plots of every feature against the target column.
dabl.plot(dabl_data, target_col='save_within_48')

# In[ ]:

# Split into features X and target Y for model fitting.
X = dabl_data.drop("save_within_48", axis=1)
Y = dabl_data.save_within_48

# In[ ]:

# Encode/scale the features for downstream estimators.
preprocessor = dabl.EasyPreprocessor()
X_trans = preprocessor.fit_transform(X)