def transform(self, X, y=None):
    """Reduce ``X`` to the last observed point of its ``record_type == 0`` rows.

    Args:
        X: Table indexable by column name and by boolean mask; must have a
            ``record_type`` column.
        y: Unused; accepted so the signature also takes a target argument.

    Returns:
        Whatever ``classes.get_last_observed_point`` returns for the
        filtered rows.
    """
    # Keep only the rows whose record_type is 0 before delegating.
    keep_mask = X['record_type'] == 0
    observed_rows = X[keep_mask]
    return classes.get_last_observed_point(observed_rows)
import classes

# Training script: build one-hot encoded predictors and responses from the
# training split and fit a random forest that predicts option G.
#
# NOTE(review): `logging`, `OneHotEncoder` and `RandomForestClassifier` are
# used below, but their imports are not visible in this part of the file --
# confirm they are imported elsewhere.
logger = logging.getLogger('allstate')

logger.info("Loading data")
data = classes.get_train_data()
train, test = classes.train_test_split(data)

# Feature G has the lowest accuracy, so let's predict G from the observed
# options. Note that the predictor columns selected below are A-G, i.e. the
# G value on the record_type == 0 rows is itself used as a feature.
# G takes values of 1, 2, 3, and 4, so the response is one-hot encoded.
logger.info("Transforming data")

# Transform the data into something that we can give to a learning algorithm.
# Response: column G on the record_type == 1 rows.
train_y = train.loc[train['record_type'] == 1, 'G']

# Predictors: columns A-G on the record_type == 0 rows, reduced by
# classes.get_last_observed_point.
train_data = train.loc[
    train['record_type'] == 0,
    ['customer_ID', 'shopping_pt', 'record_type'] + list('ABCDEFG'),
]
train_x = classes.get_last_observed_point(train_data)[list('ABCDEFG')]
# NOTE(review): nothing here explicitly aligns the rows of train_x with the
# rows of train_y; this presumably relies on get_last_observed_point returning
# one row per customer in the same order as the record_type == 1 rows --
# confirm.

# Responses need to be encoded to binary columns.
# train_y is a pandas Series; Series.reshape is no longer available in pandas,
# so reshape the underlying ndarray into a single column instead.
y_encoder = OneHotEncoder()
train_y = y_encoder.fit_transform(train_y.values.reshape(-1, 1)).toarray()

# train_x is a df with columns A-G.
# Encode each column of train_x as a one-hot binary column.
f_encoder = OneHotEncoder()
est = RandomForestClassifier(n_estimators=150, verbose=3, oob_score=True)
train_x = f_encoder.fit_transform(train_x).toarray()

# OOB score is 0.93
logger.info("Training classifier")
est.fit(train_x, train_y)

logger.info("Transforming test data")