# Example script: time the DataPot detect / fit / transform stages on the
# job-salary dataset, then cross-validate an XGBoost classifier that predicts
# which half of the salary distribution a row falls into.
from __future__ import print_function

import sys
import bz2
import time

import xgboost as xgb
import pandas as pd
from sklearn.model_selection import cross_val_score

import datapot as dp
from datapot.datasets import load_job_salary

data = load_job_salary()

datapot = dp.DataPot()

# Each stage is timed on its own so the cost of detection, fitting and
# transformation can be compared.
t0 = time.time()
datapot.detect(data)
print('detect time:', time.time() - t0)

t0 = time.time()
datapot.fit(data, verbose=True)
print('fit time:', time.time() - t0)

t0 = time.time()
df = datapot.transform(data)
print('transform time:', time.time() - t0)

# Features: every transformed column except the target and the 'Id' column.
X = df.drop(['SalaryNormalized', 'Id'], axis=1)

# Target: two quantile-based bins of the salary (q=2), encoded as 0 / 1.
# labels=False makes qcut return the integer bin codes directly as an array,
# so no conversion from a Categorical is needed.
y = pd.qcut(df['SalaryNormalized'].values, q=2, labels=False)

# 5-fold cross-validation with the classifier's default settings.
model = xgb.XGBClassifier()
cv_score = cross_val_score(model, X, y, cv=5)
# Example script: fit a DataPot on a handful of JSON records, transform them
# into a numeric DataFrame, and set up an XGBoost regressor for the 'rating'
# column.
from sklearn.model_selection import cross_val_score
import xgboost as xgb

import datapot as dp

# Each record is a JSON-encoded string, passed to DataPot as-is.
dummy_data = [
    '{"name": "Gilbert", "wins": [3, 4, 12], "rating": 32}',
    '{"name": "Alexa", "wins": [1, 2, 5, 7], "rating": 24}',
    '{"name": "May", "wins": [], "rating": 1240}',
    '{"name": "Deloise", "wins": [6, 8, 9, 10, 11], "rating": 25}',
]

# create DataPot instance
data = dp.DataPot()
print(data)

# fit it with data
data.fit(dummy_data)
print(data)
print(data.fields())

# apply transformers
df = data.transform(dummy_data, drop_non_numerical=True)
print(df)

# we are going to predict rating
y = df['rating']
X = df.drop('rating', axis=1)

# evaluate prediction score using xgboost
model = xgb.XGBRegressor()