Example #1
    def predict(self, model, ignore_features):
        logging.debug("Loading test set...")
        # Load label, integer, and categorical columns, skipping the ignored features.
        _use_cols = misc_utils.get_cols_to_load(labels=True, load_int_cols=True, load_cat_cols=True,
                                                ignore_features=ignore_features)
        test_df = self.io_utils.load_validation_set(use_cols=_use_cols,
                                                    converters=misc_utils.get_converters(_use_cols),
                                                    chunk_size=self.settings['chunk_size'])
        start = datetime.now()
        logging.debug("Predicting probabilities...")
        actual_labels = []
        predicted_labels = []
        
        # With a chunk size configured the loader yields DataFrame chunks;
        # otherwise the whole frame is processed in a single pass.
        if self.settings['chunk_size'] is not None:
            for i, chunk in enumerate(test_df):
                logging.debug("Processing chunk %d\t%s" % (i + 1, str(datetime.now() - start)))
                y, predictions = self.__process_test_chunk(chunk, model, ignore_features)
                actual_labels += y.tolist()
                predicted_labels += self.__cap_predictions(predictions)
        else:
            y, predictions = self.__process_test_chunk(test_df, model, ignore_features)
            actual_labels += y.tolist()
            predicted_labels += self.__cap_predictions(predictions)

        logging.debug("Finished predicting in \t%s" % str(datetime.now() - start))
        return self.evaluate_model(actual_labels, predicted_labels)
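
The private helpers __process_test_chunk, __cap_predictions, and evaluate_model are not shown in this example. Below is a minimal, hypothetical standalone sketch of the capping step, assuming the predictions are probabilities that should be clipped away from 0 and 1 before a log-loss style evaluation; the epsilon value is an assumption, not taken from the source.

import numpy as np

def cap_predictions(predictions, eps=1e-6):
    """Hypothetical standalone version of __cap_predictions: clip probabilities
    into (eps, 1 - eps) so a downstream log-loss never evaluates log(0)."""
    return np.clip(np.asarray(predictions, dtype=float), eps, 1.0 - eps).tolist()
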
Example #2
    def train(self, model, ignore_features):
        logging.debug("Loading train set...")
        # Load label, integer, and categorical columns, skipping the ignored features.
        _use_cols = misc_utils.get_cols_to_load(labels=True, load_int_cols=True, load_cat_cols=True,
                                                ignore_features=ignore_features)
        train_df = self.io_utils.load_train_set(use_cols=_use_cols, converters=misc_utils.get_converters(_use_cols),
                                                chunk_size=self.settings['chunk_size'])

        start = datetime.now()
        logging.debug("Training Model...")
        
        # Train incrementally on each chunk when a chunk size is configured;
        # otherwise train on the full DataFrame at once.
        if self.settings['chunk_size'] is not None:
            for i, chunk in enumerate(train_df):
                logging.debug("Processing chunk %d\t%s" % (i + 1, str(datetime.now() - start)))
                self.__process_train_chunk(chunk, model, ignore_features)
        else:
            self.__process_train_chunk(train_df, model, ignore_features)
        
        logging.debug("Finished training in \t%s" % str(datetime.now() - start))
Example #3
    def __train_encoder(self, encoder_cache_file_name):
        """
        Trains and caches the encoder in mini batch mode
        :param encoder_cache_file_name:
        :return: trained encoder object
        """

        model = self.io_utils.get_model()
        # Only the categorical columns are needed to fit the encoder.
        _use_cols = misc_utils.get_cols_to_load(labels=False, load_int_cols=False, load_cat_cols=True,
                                                ignore_features=model['ignore_features'])
        train_chunks = self.io_utils.load_train_set(use_cols=_use_cols, converters=misc_utils.get_converters(_use_cols),
                                                    chunk_size=1500000)
        enc = OneHotEncoder()
        start = datetime.now()

        logging.debug("Training Encoder...")
        # Incrementally fit the encoder on the categorical values of each chunk.
        for i, chunk in enumerate(train_chunks):
            # .values replaces the removed DataFrame.as_matrix()
            enc.partial_fit(chunk.values)
            logging.debug("Trained encoder with chunk %d\t%s" % (i + 1, str(datetime.now() - start)))
        logging.debug("Finished training encoder in \t%s" % str(datetime.now() - start))

        self.io_utils.save_to_cache(enc, encoder_cache_file_name)
        return enc
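
Note that the OneHotEncoder used above must come from an implementation that supports partial_fit, which the standard scikit-learn encoder does not. The following standalone sketch expresses the same streaming idea with only pandas and scikit-learn: collect each column's distinct values chunk by chunk, then build an encoder with those categories fixed. The file path, column list, and chunk size below are hypothetical.

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

def train_encoder_in_chunks(csv_path, cat_cols, chunk_size=1500000):
    """Sketch: stream the file in chunks, collect the distinct values of each
    categorical column, then fit a OneHotEncoder with those categories fixed."""
    categories = {col: set() for col in cat_cols}
    for chunk in pd.read_csv(csv_path, usecols=cat_cols, dtype=str, chunksize=chunk_size):
        for col in cat_cols:
            categories[col].update(chunk[col].dropna().unique())
    enc = OneHotEncoder(categories=[sorted(categories[col]) for col in cat_cols],
                        handle_unknown="ignore")
    # With categories given explicitly, fitting on a single valid row is enough
    # to materialise the category mapping.
    enc.fit([[sorted(categories[col])[0] for col in cat_cols]])
    return enc
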