import logging
from datetime import datetime

import numpy as np
from scipy import sparse
from sklearn import ensemble, linear_model, naive_bayes

# IOUtility, misc_utils, SEED and OneHotEncoder are project-local; note that
# the OneHotEncoder used here must support partial_fit (scikit-learn's
# OneHotEncoder does not), so it is presumably a custom implementation.


class Preprocessor:
    def __init__(self):
        self.io_utils = IOUtility()
        self.settings = self.io_utils.get_settings()
        self.profile = self.io_utils.get_profile()
        self.enc = None

    def load_encoder(self):
        """
        Loads the encoder object from the cache, provided 'cache_encoder'
        is True in the settings. If caching is disabled or no cached
        encoder exists, a new encoder is created, trained and cached.
        """
        encoder_cache_file_name = "encoder.cache"
        if self.settings['cache']['cache_encoder']:
            self.enc = self.io_utils.load_from_cache(encoder_cache_file_name)
        if self.enc is None:
            logging.debug("Encoder is not cached or cache is disabled")
            self.enc = self.__train_encoder(encoder_cache_file_name)
        else:
            logging.debug("Successfully loaded encoder from cache")
        return self.enc

    def encode(self, x):
        return self.enc.transform(x)

    def __train_encoder(self, encoder_cache_file_name):
        """
        Trains and caches the encoder in mini-batch mode.

        :param encoder_cache_file_name: file name under which the trained
            encoder is cached
        :return: trained encoder object
        """
        model = self.io_utils.get_model()
        _use_cols = misc_utils.get_cols_to_load(
            labels=False, load_int_cols=False, load_cat_cols=True,
            ignore_features=model['ignore_features'])
        train_chunks = self.io_utils.load_train_set(
            use_cols=_use_cols,
            converters=misc_utils.get_converters(_use_cols),
            chunk_size=1500000)
        enc = OneHotEncoder()
        start = datetime.now()
        logging.debug("Training Encoder...")
        for i, chunk in enumerate(train_chunks):
            # .values replaces the long-deprecated DataFrame.as_matrix()
            enc.partial_fit(chunk.values)
            logging.debug("Trained encoder with chunk %d\t%s"
                          % (i + 1, str(datetime.now() - start)))
        logging.debug("Finished training encoder in \t%s"
                      % str(datetime.now() - start))
        self.io_utils.save_to_cache(enc, encoder_cache_file_name)
        return enc
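# For reference, a minimal sketch of the kind of encoder the pipeline above
# assumes: one that builds its per-column category vocabulary incrementally
# via partial_fit, so the full train set never has to fit in memory. This is
# a hypothetical stand-in, not the repository's actual OneHotEncoder.


class StreamingOneHotEncoder:
    """Sketch of a one-hot encoder with mini-batch (partial_fit) training."""

    def __init__(self):
        self.vocab = None  # one {category -> local column index} dict per feature

    def partial_fit(self, X):
        # X: 2-D array of categorical values, one row per sample
        if self.vocab is None:
            self.vocab = [dict() for _ in range(X.shape[1])]
        for j in range(X.shape[1]):
            for value in X[:, j]:
                self.vocab[j].setdefault(value, len(self.vocab[j]))
        return self

    def transform(self, X):
        # Column offsets: feature j's categories occupy a contiguous block
        offsets = [0]
        for v in self.vocab:
            offsets.append(offsets[-1] + len(v))
        out = sparse.lil_matrix((X.shape[0], offsets[-1]))
        for i in range(X.shape[0]):
            for j in range(X.shape[1]):
                col = self.vocab[j].get(X[i, j])
                if col is not None:  # categories unseen in training map to all-zeros
                    out[i, offsets[j] + col] = 1
        return out.tocsr()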
class Classifier:
    def __init__(self, preprocessor):
        self.preprocessor = preprocessor
        self.io_utils = IOUtility()
        self.settings = self.io_utils.get_settings()
        self.models = {
            "SGDC": linear_model.SGDClassifier,
            "BNB": naive_bayes.BernoulliNB,
            "MNB": naive_bayes.MultinomialNB,
            "RF": ensemble.RandomForestClassifier,
            "ET": ensemble.ExtraTreesClassifier,
            "ADA": ensemble.AdaBoostClassifier,
            "GB": ensemble.GradientBoostingClassifier}

    def run(self):
        model, model_name, model_settings = self.__load_model()
        self.train(model, model_settings['ignore_features'])
        log_loss_score = self.predict(model, model_settings['ignore_features'])
        logging.info("Log loss score for model '%s' is '%f'"
                     % (model_name, log_loss_score))

    def train(self, model, ignore_features):
        logging.debug("Loading train set...")
        _use_cols = misc_utils.get_cols_to_load(
            labels=True, load_int_cols=True, load_cat_cols=True,
            ignore_features=ignore_features)
        train_df = self.io_utils.load_train_set(
            use_cols=_use_cols,
            converters=misc_utils.get_converters(_use_cols),
            chunk_size=self.settings['chunk_size'])
        start = datetime.now()
        logging.debug("Training Model...")
        if self.settings['chunk_size'] is not None:
            # Chunked training requires a model with partial_fit support
            # (SGDC, BNB, MNB); the tree ensembles must be trained in one go
            # with chunk_size set to None.
            for i, chunk in enumerate(train_df):
                logging.debug("Processing chunk %d\t%s"
                              % (i + 1, str(datetime.now() - start)))
                self.__process_train_chunk(chunk, model, ignore_features)
        else:
            self.__process_train_chunk(train_df, model, ignore_features)
        logging.debug("Finished training in \t%s" % str(datetime.now() - start))

    def predict(self, model, ignore_features):
        logging.debug("Loading test set...")
        _use_cols = misc_utils.get_cols_to_load(
            labels=True, load_int_cols=True, load_cat_cols=True,
            ignore_features=ignore_features)
        test_df = self.io_utils.load_validation_set(
            use_cols=_use_cols,
            converters=misc_utils.get_converters(_use_cols),
            chunk_size=self.settings['chunk_size'])
        start = datetime.now()
        logging.debug("Predicting probabilities...")
        actual_labels = []
        predicted_labels = []
        if self.settings['chunk_size'] is not None:
            for i, chunk in enumerate(test_df):
                logging.debug("Processing chunk %d\t%s"
                              % (i + 1, str(datetime.now() - start)))
                y, predictions = self.__process_test_chunk(chunk, model,
                                                           ignore_features)
                actual_labels += y.tolist()
                predicted_labels += self.__cap_predictions(predictions)
        else:
            y, predictions = self.__process_test_chunk(test_df, model,
                                                       ignore_features)
            actual_labels += y.tolist()
            predicted_labels += self.__cap_predictions(predictions)
        logging.debug("Finished predicting in \t%s"
                      % str(datetime.now() - start))
        return self.evaluate_model(actual_labels, predicted_labels)

    def __process_test_chunk(self, chunk, model, ignore_features):
        y = chunk['Label']
        X = self.__preprocess(chunk, ignore_features)
        predictions = model.predict_proba(X)
        return y, predictions

    def __process_train_chunk(self, chunk, model, ignore_features):
        y = chunk['Label']
        X = self.__preprocess(chunk, ignore_features)
        if hasattr(model, 'partial_fit'):
            model.partial_fit(X, y, classes=[0, 1])
        else:
            # Tree ensembles have no partial_fit; fit on the whole frame.
            # Only valid when chunk_size is None, i.e. chunk is the full set.
            model.fit(X, y)

    def __preprocess(self, chunk, ignore_features):
        # Integer features go straight into a sparse matrix; categorical
        # features are one-hot encoded, then the two blocks are stacked
        # side by side into a single sparse design matrix.
        int_features = sparse.coo_matrix(
            chunk[misc_utils.get_integer_cols(
                ignore_features=ignore_features)]).tocsr()
        cat_features = chunk[misc_utils.get_categorical_cols(
            ignore_features=ignore_features)].values
        cat_features = self.preprocessor.encode(cat_features)
        X = sparse.hstack((int_features, cat_features))
        return X

    def __load_model(self):
        model_name = self.settings['model']
        model_id = model_name.split('_')[0]
        model = self.models[model_id](random_state=SEED)
        model_settings = self.io_utils.get_model(model_name)
        model_params = model_settings["params"]
        model.set_params(**model_params)
        return model, model_name, model_settings

    @staticmethod
    def evaluate_model(actual_labels, predicted_labels):
        def log_loss(act, pred):
            # Clip predictions away from 0 and 1 so the logs stay finite.
            epsilon = 1e-15
            act = np.asarray(act)
            pred = np.clip(np.asarray(pred), epsilon, 1 - epsilon)
            ll = np.sum(act * np.log(pred) + (1 - act) * np.log(1 - pred))
            return -ll / len(act)

        return log_loss(actual_labels, predicted_labels)

    @staticmethod
    def __cap_predictions(preds):
        # Keep only the positive-class probability (column 1 of predict_proba)
        # and cap it to [0.02, 0.98] to bound the log-loss penalty for
        # confidently wrong predictions.
        return [min(max(p[1], 0.02), 0.98) for p in preds]
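# A minimal driver wiring the two classes together. This is a sketch; the
# actual entry point, settings file layout and logging configuration live
# elsewhere in this repository.

if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    preprocessor = Preprocessor()
    preprocessor.load_encoder()      # load from cache, or train and cache
    classifier = Classifier(preprocessor)
    classifier.run()                 # train, predict, log the log-loss score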