Exemplo n.º 1
0
def test_datamation_groupby_multiple():
    """Check multi-key groupby aggregations on a DatamationFrame.

    Exercises ``mean``, ``sum`` and ``prod`` grouped by ``Degree`` and
    ``Work`` on the small salary data, then ``mean`` grouped by ``species``,
    ``island`` and ``sex`` on the penguins data. For every result it checks
    that the expected operation names are recorded, that exactly two states
    are kept with the first one equal to the input frame, and that the
    aggregated values match the reference numbers.

    Floating-point aggregates are compared with ``approx`` rather than
    ``==`` so the test does not depend on the exact summation order used by
    the underlying numeric library.
    """
    df = small_salary().df
    df = DatamationFrame(df)

    # Group by Degree, Work
    mean = df.groupby(['Degree', 'Work']).mean()

    assert "groupby" in mean.operations
    assert "mean" in mean.operations

    assert len(mean.states) == 2
    assert df.equals(mean.states[0])

    assert mean.Salary.Masters.Academia == approx(84.0298831968801)
    assert mean.Salary.Masters.Industry == approx(91.22576155606282)
    assert mean.Salary.PhD.Academia == approx(85.55796571969728)
    assert mean.Salary.PhD.Industry == approx(93.08335885824636)

    # sum (bound to ``total`` so the builtin ``sum`` is not shadowed)
    total = df.groupby(['Degree', 'Work']).sum()

    assert "groupby" in total.operations
    assert "sum" in total.operations

    assert len(total.states) == 2
    assert df.equals(total.states[0])

    assert total.Salary.Masters.Academia == approx(840.2988319688011)
    assert total.Salary.Masters.Industry == approx(5655.997216475895)
    assert total.Salary.PhD.Academia == approx(1540.043382954551)
    assert total.Salary.PhD.Industry == approx(930.8335885824636)

    # product
    product = df.groupby(['Degree', 'Work']).prod()

    assert "groupby" in product.operations
    assert "product" in product.operations

    assert len(product.states) == 2
    assert df.equals(product.states[0])

    # approx uses a relative tolerance by default, so it scales with these
    # very large magnitudes.
    assert product.Salary.Masters.Academia == approx(1.753532557780977e+19)
    assert product.Salary.Masters.Industry == approx(3.3602152421057308e+121)
    assert product.Salary.PhD.Academia == approx(6.027761935702164e+34)
    assert product.Salary.PhD.Industry == approx(4.8818435443657834e+19)

    # Group by species, island, sex
    df = DatamationFrame(load_penguins())
    mean = df.groupby(['species', 'island', 'sex']).mean()

    assert "groupby" in mean.operations
    assert "mean" in mean.operations

    assert len(mean.states) == 2
    assert df.equals(mean.states[0])

    assert mean.bill_length_mm.Adelie.Biscoe.male == approx(40.5909090909091)
    assert mean.bill_length_mm.Adelie.Biscoe.female == approx(
        37.35909090909092)
Exemplo n.º 2
0
def _load_penguins():
    """Load the penguins data as a ``(features, labels)`` pair.

    Prints a progress message, imports ``palmerpenguins`` lazily, loads the
    dataset with ``return_X_y=True``, prints the head of the features and
    returns both parts.
    """
    print("🐧 loading penguins...")
    # Imported here rather than at module level, after the progress message.
    from palmerpenguins import load_penguins

    features, labels = load_penguins(return_X_y=True)
    print(features.head())
    return features, labels
Exemplo n.º 3
0
def train():
    """Fit the penguin classifiers and save each one under ``model/``.

    Loads the penguins data with ``return_X_y=True`` and ``drop_na=True``,
    fits a random forest and a decision tree (both ``max_depth=3``,
    ``random_state=0``) on it, and writes each fitted model to
    ``model/<name>.pkl`` with ``joblib.dump``, printing a line per saved
    model.

    The ``model`` directory is created if it does not exist yet, so the
    function also works on a fresh checkout.
    """
    import os

    data, target = load_penguins(return_X_y=True, drop_na=True)

    models = {
        "rf": RandomForestClassifier(max_depth=3, random_state=0),
        "dt": DecisionTreeClassifier(max_depth=3, random_state=0),
    }

    # joblib.dump opens the target path directly and does not create missing
    # parent directories, so make sure the output directory exists first.
    os.makedirs("model", exist_ok=True)

    for name, model in models.items():
        model.fit(data, target)
        joblib.dump(model, f"model/{name}.pkl")
        print(f"save model: {name}")
Exemplo n.º 4
0
def data_sourcing():
    """
    Return the project's source data.

    This template function uses the Palmer Penguins dataset as a
    placeholder. Replace it with your own code to import your project's
    data.

    The returned frame is restricted to the four measurement columns plus
    ``sex`` and ``species``, in that order.
    """
    selected_columns = [
        "bill_length_mm",
        "bill_depth_mm",
        "flipper_length_mm",
        "body_mass_g",
        "sex",
        "species",
    ]

    penguins = palmerpenguins.load_penguins()
    return penguins[selected_columns]
Exemplo n.º 5
0
def input_dataframe():
    """Return the penguins dataset exactly as ``load_penguins()`` yields it."""
    return load_penguins()
Exemplo n.º 6
0
def penguins_data():
    """Return the penguins dataset after passing it through ``simplify``."""
    raw_penguins = load_penguins()
    simplified = simplify(raw_penguins)
    return simplified