示例#1
0
文件: myfuncs.py 项目: ezbc/airbnb
def fit_categorical_labels(df_train, df_test, df_labels,
        fit_type='regressor', fit_framework='theanets', labels_list=None):
    """Fit an estimator to each label column and predict it for ``df_test``.

    One estimator object is built for the requested framework / fit type and
    is then re-fit once per column of ``df_labels``. The per-column
    predictions are assembled into a DataFrame that is passed, together with
    ``labels_list``, to ``gather_dummy_predictions``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training features; its column names are given to the estimator as
        the feature list.
    df_test : pandas.DataFrame
        Features to predict on.
    df_labels : pandas.DataFrame
        One column per target; each column is fit and predicted separately.
    fit_type : str
        ``'regressor'`` or ``'classifier'`` (the latter only with
        ``fit_framework='sklearn'``).
    fit_framework : str
        One of ``'sklearn'``, ``'neural'``, ``'xgboost'``, ``'theanets'``.
    labels_list : optional
        Forwarded unchanged to ``gather_dummy_predictions``.

    Returns
    -------
    The value returned by ``gather_dummy_predictions`` (presumably the
    per-column predictions collapsed back to categorical labels -- confirm
    against that helper).

    Raises
    ------
    ValueError
        If the ``fit_framework`` / ``fit_type`` combination is not supported.
    """
    # Validate before importing or touching the data so that an unsupported
    # combination always fails with ValueError, rather than with an unbound
    # estimator variable further down.
    supported_combos = (
        ('sklearn', 'classifier'),
        ('sklearn', 'regressor'),
        ('neural', 'regressor'),
        ('xgboost', 'regressor'),
        ('theanets', 'regressor'),
    )
    if (fit_framework, fit_type) not in supported_combos:
        raise ValueError(
            'No correct combo of fit_type and fit_framework found: '
            'fit_type={0!r}, fit_framework={1!r}'.format(fit_type,
                                                         fit_framework))

    features = df_train.columns.values

    # Import only the backend that was requested, so the other optional
    # backends do not have to be installed.
    if fit_framework == 'sklearn':
        # Gradient boosting with default settings.
        if fit_type == 'classifier':
            from rep.estimators import SklearnClassifier
            from sklearn.ensemble import GradientBoostingClassifier
            estimator = SklearnClassifier(GradientBoostingClassifier(),
                                          features=features)
        else:
            from rep.estimators import SklearnRegressor
            from sklearn.ensemble import GradientBoostingRegressor
            estimator = SklearnRegressor(GradientBoostingRegressor(),
                                         features=features)
    elif fit_framework == 'neural':
        from rep.estimators.neurolab import NeurolabRegressor
        estimator = NeurolabRegressor(features=features)
    elif fit_framework == 'xgboost':
        from rep.estimators import XGBoostRegressor
        estimator = XGBoostRegressor(features=features)
    else:
        # Only 'theanets' remains after the validation above.
        from rep.estimators.theanets import TheanetsRegressor
        estimator = TheanetsRegressor(features=features)

    label_columns = df_labels.columns.values
    prediction_array = np.empty((len(df_test), len(label_columns)))
    for i, column in enumerate(label_columns):
        # Fit on the training set against this single label column.
        estimator.fit(df_train, df_labels[column])

        # Store this column's predictions for the test set.
        prediction_array[:, i] = np.squeeze(estimator.predict(df_test))

    df_predict = pd.DataFrame(prediction_array, columns=label_columns)
    df_predict = gather_dummy_predictions(df_predict, labels_list)

    return df_predict