def predict(self, model, ignore_features):
    """
    Loads the validation set, collects actual and predicted labels and evaluates them
    :param model: model object handed to the test chunk processing step
    :param ignore_features: features passed on to column selection and chunk processing
    :return: whatever evaluate_model returns for the collected labels
    """
    logging.debug("Loading test set...")
    columns = misc_utils.get_cols_to_load(labels=True, load_int_cols=True, load_cat_cols=True,
                                          ignore_features=ignore_features)
    converters = misc_utils.get_converters(columns)
    chunk_size = self.settings['chunk_size']
    test_df = self.io_utils.load_validation_set(use_cols=columns, converters=converters,
                                                chunk_size=chunk_size)

    started_at = datetime.now()
    logging.debug("Predicting probabilities...")

    actual_labels = []
    predicted_labels = []

    def collect(frame):
        # Shared by the chunked and the single-frame path: process one frame and
        # append its labels and capped predictions to the running lists.
        y, predictions = self.__process_test_chunk(frame, model, ignore_features)
        actual_labels.extend(y.tolist())
        predicted_labels.extend(self.__cap_predictions(predictions))

    if chunk_size is not None:
        # With a chunk size configured the loader result is iterated chunk by chunk.
        for chunk_no, chunk in enumerate(test_df, start=1):
            logging.debug("Processing chunk %d\t%s" % (chunk_no, str(datetime.now() - started_at)))
            collect(chunk)
    else:
        # Without a chunk size the loader result is processed in a single pass.
        collect(test_df)

    logging.debug("Finished predicting in \t%s" % str(datetime.now() - started_at))
    return self.evaluate_model(actual_labels, predicted_labels)
def train(self, model, ignore_features):
    """
    Loads the train set and hands it to the train chunk processing step,
    either chunk by chunk or as a whole
    :param model: model object handed to the train chunk processing step
    :param ignore_features: features passed on to column selection and chunk processing
    :return: None
    """
    logging.debug("Loading train set...")
    columns = misc_utils.get_cols_to_load(labels=True, load_int_cols=True, load_cat_cols=True,
                                          ignore_features=ignore_features)
    converters = misc_utils.get_converters(columns)
    chunk_size = self.settings['chunk_size']
    train_df = self.io_utils.load_train_set(use_cols=columns, converters=converters,
                                            chunk_size=chunk_size)

    started_at = datetime.now()
    logging.debug("Training Model...")

    if chunk_size is None:
        # No chunk size configured: the loader result is processed in one call.
        self.__process_train_chunk(train_df, model, ignore_features)
    else:
        # Chunk size configured: iterate the loader result and log progress per chunk.
        for chunk_no, chunk in enumerate(train_df, start=1):
            logging.debug("Processing chunk %d\t%s" % (chunk_no, str(datetime.now() - started_at)))
            self.__process_train_chunk(chunk, model, ignore_features)

    logging.debug("Finished training in \t%s" % str(datetime.now() - started_at))
def __train_encoder(self, encoder_cache_file_name, chunk_size=1500000):
    """
    Trains and caches the encoder in mini batch mode
    :param encoder_cache_file_name: name passed to io_utils.save_to_cache for the trained encoder
    :param chunk_size: chunk size passed to io_utils.load_train_set; defaults to the
                       previously hard-coded value of 1500000
    :return: trained encoder object
    """
    model = self.io_utils.get_model()
    # Only the categorical columns are loaded (no labels, no integer columns).
    _use_cols = misc_utils.get_cols_to_load(labels=False, load_int_cols=False, load_cat_cols=True,
                                            ignore_features=model['ignore_features'])
    train_chunks = self.io_utils.load_train_set(use_cols=_use_cols,
                                                converters=misc_utils.get_converters(_use_cols),
                                                chunk_size=chunk_size)

    enc = OneHotEncoder()
    start = datetime.now()
    logging.debug("Training Encoder...")

    for chunk_no, chunk in enumerate(train_chunks, start=1):
        # NOTE(review): assumes each chunk is a pandas DataFrame - confirm against
        # load_train_set. DataFrame.as_matrix() was removed in pandas 1.0; .values
        # returns the same array and is available on old and new pandas alike.
        enc.partial_fit(chunk.values)
        # Lazy %-args: the message is only formatted when DEBUG logging is enabled.
        logging.debug("Trained encoder with chunk %d\t%s", chunk_no, datetime.now() - start)

    logging.debug("Finished training encoder in \t%s", datetime.now() - start)

    self.io_utils.save_to_cache(enc, encoder_cache_file_name)
    return enc