示例#1
0
def to_categorical(
    training_data: pd.DataFrame, test_data: pd.DataFrame
) -> "dict[str, pd.DataFrame]":
    """Binary-encode every object-dtype (categorical) column of both frames.

    The encoder is fit on ``training_data`` only and then applied to
    ``test_data``, so both frames end up with the same encoded columns.

    Args:
        training_data: Frame used to fit the ``BinaryEncoder``.
        test_data: Frame transformed with the already-fit encoder.

    Returns:
        Dict with keys ``train_data_categorical`` and
        ``test_data_categorical`` holding the encoded frames.
        (BUGFIX: the annotation previously claimed a tuple was returned.)
    """
    # Object-dtype columns are the categorical ones to encode.
    categorical_columns_list = list(training_data.columns[training_data.dtypes==object])
    # BUGFIX: handle_unknown was the invalid string "inpute" (a typo of
    # "impute", which itself is deprecated); "value" is the supported
    # option that maps categories unseen at fit time to a reserved value
    # instead of raising.
    ce_be = BinaryEncoder(cols=categorical_columns_list, handle_unknown="value")
    training_data_ce_binary = ce_be.fit_transform(training_data)
    test_data_ce_binary = ce_be.transform(test_data)

    return dict(train_data_categorical=training_data_ce_binary,
                test_data_categorical=test_data_ce_binary)
示例#2
0
#ordinal encoding
from sklearn.preprocessing import OrdinalEncoder
ord1 = OrdinalEncoder()
# NOTE(review): this call fits on a list wrapping the whole column (one
# "sample" per value of the wrong orientation); the fit_transform on the
# next line refits the encoder correctly anyway, so this line looks
# redundant — confirm and remove.
ord1.fit([df['ord_2']])
df["ord_2"] = ord1.fit_transform(df[["ord_2"]])
df.head(10)
dnew = df.copy()
#ordinal encoding through mapping
# Explicit rank order for the temperature categories (Cold < Warm < Hot).
temp_dict = {'Cold': 1, 'Warm': 2, 'Hot': 3}
dnew['Ord_2_encod'] = dnew.ord_2.map(temp_dict)
dnew = dnew.drop(['ord_2'], axis=1)

#Binary encoding
# NOTE(review): ord_2 was already ordinal-encoded to numbers above, so
# the binary encoding below operates on those numeric codes — verify
# this is intended rather than encoding the original string categories.
from category_encoders import BinaryEncoder
encoder = BinaryEncoder(cols=['ord_2'])
newdata = encoder.fit_transform(df['ord_2'])
# Append the generated ord_2_* bit columns and drop the source column.
df = pd.concat([df, newdata], axis=1)
df = df.drop(['ord_2'], axis=1)
df.head(10)

#Hash encoding
# Hash each nom_0 string into a fixed-width (3-column) feature vector.
from sklearn.feature_extraction import FeatureHasher
h = FeatureHasher(n_features=3, input_type='string')
hashed_Feature = h.fit_transform(df['nom_0'])
# FeatureHasher returns a sparse matrix; densify before concatenating.
hashed_Feature = hashed_Feature.toarray()
df = pd.concat([df, pd.DataFrame(hashed_Feature)], axis=1)
df.head(10)

# Hard-coded binary target for the 10 demo rows; True allows duplicates.
df.insert(6, "Target", [0, 1, 1, 0, 0, 1, 0, 0, 0, 1], True)

#mean Encoding /Target encoding
示例#3
0
def predict():
    """Render the predicted view count for the song described by the form.

    Reads the six form fields (Name, Genre, Comments, Likes, Popularity,
    Followers), retrains a stacking ensemble on ``data.csv`` on every
    request, predicts the view count for the submitted song, and renders
    the result into ``index.html``.
    """
    # Form values arrive in submission order as strings.
    features = list(request.form.values())

    features = np.array(features)
    features = features.reshape(1, 6)
    features = pd.DataFrame(data=features,
                            columns=[
                                'Name', 'Genre', 'Comments', 'Likes',
                                'Popularity', 'Followers'
                            ])
    df = pd.read_csv('data.csv')
    # Cast the numeric columns (both training data and the request row).
    cv = {'Comments': int, 'Likes': int, 'Popularity': int, 'Followers': int}
    df = df.astype(cv)
    features = features.astype(cv)

    # Drop implausible rows where views are lower than an engagement
    # metric.  BUGFIX: the original also passed axis=1 here, which is
    # ignored (and misleading) when index= is given.
    df.drop(index=df[df['Views'] < df['Likes']].index, inplace=True)
    df.drop(index=df[df['Views'] < df['Comments']].index, inplace=True)
    df.drop(index=df[df['Views'] < df['Popularity']].index, inplace=True)

    # Remove extreme outliers: any row with a value beyond 3*IQR on any
    # column.  (A dead 1.5*IQR expression whose result was discarded has
    # been removed.)
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    df = df[~((df < (Q1 - 3 * IQR)) | (df > (Q3 + 3 * IQR))).any(axis=1)]

    # Identifier / free-text columns carry no predictive signal here.
    df = df.drop(
        columns=['Unique_ID', 'Country', 'Song_Name', 'Timestamp', 'index'])

    y = df['Views']
    df = df.drop(columns=['Views'])

    # Fit the encoder on the training frame, then apply it to the request
    # row.  NOTE(review): `features` has different columns than `df`
    # (e.g. 'Name', 'Genre') — verify the encoder tolerates this.
    be = BinaryEncoder()
    df = be.fit_transform(df)
    f = be.transform(features)

    X = df.iloc[:, :]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=0)

    # Three base regressors stacked under a random-forest meta-regressor.
    rg1 = AdaBoostRegressor()
    rg1.fit(X_train, y_train)

    rg2 = GradientBoostingRegressor(n_estimators=300, learning_rate=0.1)
    rg2.fit(X_train, y_train)

    rg3 = RandomForestRegressor(random_state=0, n_estimators=20, max_depth=15)
    rg3.fit(X_train, y_train)

    rg6 = StackingRegressor([rg1, rg2], meta_regressor=rg3)
    rg6.fit(X_train, y_train)

    f = f.iloc[:, :]
    y_pred = rg6.predict(f)

    # View counts are whole numbers.
    y_pred = y_pred.astype(int)

    # BUGFIX: user-facing message previously read "Numberb of Views".
    return render_template(
        'index.html', prediction_text='Number of Views is {}'.format(y_pred))
示例#4
0
def binaryEncoding(df, column):
    """Return *df* with *column* replaced by its binary-encoded bit columns.

    A fresh ``BinaryEncoder`` is fit on the frame itself, so this helper
    is meant for one-off encoding of a single frame (not for reusing the
    fit on a separate test set).
    """
    from category_encoders import BinaryEncoder

    binariser = BinaryEncoder(cols=[column])
    return binariser.fit_transform(df)
# Short aliases for the pandas/numpy types used in annotations below.
DataFrame = pd.core.frame.DataFrame
Series = pd.core.series.Series
Array = np.ndarray
# An imputer is any callable mapping a frame to a (filled-in) frame.
Imputer = Callable[[DataFrame], DataFrame]
nan = np.nan

# Load the data set; 'name' is dropped, 'status_group' is the target.
df = pd.read_csv('data.csv').drop(['name'], axis=1)
X_ = df.drop('status_group', axis=1)
y = df.status_group

be = BinaryEncoder()
# Number of principal components to keep.
FEATS = 70

pca = PCA(n_components=FEATS)

# Pipeline: binary-encode categoricals -> standardize -> project onto
# the first FEATS principal components.
vals = pca.fit_transform(StandardScaler().fit_transform(be.fit_transform(X_)))

# Rebuild a frame with columns pc1..pc70 (aligned to y's index) and
# attach the target as column 'y'.
X = pd.DataFrame(vals,
                 columns=[f"pc{k+1}" for k in range(FEATS)],
                 index=y.index).assign(y=y)

def mcar_goblin(dat: DataFrame, ratio: float) -> DataFrame:
    ''' Simulate MCAR with bernoulli '''
    def ident_or_nan(x: float) -> float:
        ''' if heads, replace value with nan. if tails, identity '''
        coin = bernoulli(ratio)
        if coin.rvs() == 1:
            return nan
        else:
            return x
                        nrows=500)
    # Sample of the test set; -1 is this dataset's missing-value marker.
    # (`config` and `train` come from earlier, truncated lines.)
    test = pd.read_csv(os.path.join(config["input_path"], "test.csv"),
                       na_values=-1,
                       nrows=500)

    # In train: column 1 is the label, columns 2+ are features.
    # In test: columns 1+ are features (column 0 is presumably an id).
    train_feature, train_label = train.iloc[:,
                                            2:].copy(), train.iloc[:,
                                                                   1].copy()
    test_feature = test.iloc[:, 1:].copy()
    # Free the full frames early to keep memory down.
    del train, test

    # Drop the ps_calc_* block of columns from both splits.
    train_feature = train_feature[[
        col for col in train_feature.columns if not col.startswith("ps_calc_")
    ]]
    test_feature = test_feature[train_feature.columns]

    # Numeric columns: everything without a "_bin"/"_cat" suffix.
    ncs = [
        col for col in train_feature.columns
        if not col.endswith(("_bin", "_cat"))
    ]
    # Categorical/binary columns carry the "_bin"/"_cat" suffix.
    ccs = [
        col for col in train_feature.columns if col.endswith(("_bin", "_cat"))
    ]

    eet = EntityEmbeddingTree(numeric_columns=ncs, categorical_columns=ccs)
    eet.fit(X=train_feature, y=train_label)

    # Binary-encode the tree-transformed features and report the
    # resulting shapes for both splits.
    encoder = BinaryEncoder()
    print(encoder.fit_transform(eet.transform(X=train_feature)).shape)
    print(encoder.transform(eet.transform(X=test_feature)).shape)