Пример #1
0
class Preprocessor:
    """Owns the encoder used to transform feature data.

    The encoder is either restored from the project's cache or trained
    chunk by chunk on the training set and then cached. Settings, profile
    and all I/O are obtained through ``IOUtility``.
    """

    def __init__(self):
        """Create the I/O helper and read settings and profile through it."""
        self.io_utils = IOUtility()
        self.settings = self.io_utils.get_settings()
        self.profile = self.io_utils.get_profile()
        # Set by load_encoder(); stays None until an encoder is available.
        self.enc = None

    def load_encoder(self):
        """
        Loads the encoder object from cache when 'cache_encoder' is True in settings.
        If caching is disabled or nothing is cached, a new encoder is trained and cached.

        :return: the encoder now stored in ``self.enc``
        """
        encoder_cache_file_name = "encoder.cache"
        if self.settings['cache']['cache_encoder']:
            self.enc = self.io_utils.load_from_cache(encoder_cache_file_name)
        if self.enc is None:
            logging.debug("Encoder is not cached or cache is disabled")
            self.enc = self.__train_encoder(encoder_cache_file_name)
        else:
            logging.debug("Successfully loaded encoder from cache")
        return self.enc

    def encode(self, x):
        """
        Transforms ``x`` with the encoder.

        If no encoder has been loaded yet, ``load_encoder()`` is invoked first
        instead of failing on a ``None`` encoder.

        :param x: data passed unchanged to the encoder's ``transform``
        :return: whatever the encoder's ``transform`` returns
        """
        if self.enc is None:
            self.load_encoder()
        return self.enc.transform(x)

    def __train_encoder(self, encoder_cache_file_name):
        """
        Trains and caches the encoder in mini batch mode
        :param encoder_cache_file_name: name under which the trained encoder is saved to cache
        :return: trained encoder object
        """
        model = self.io_utils.get_model()
        _use_cols = misc_utils.get_cols_to_load(labels=False, load_int_cols=False, load_cat_cols=True,
                                                ignore_features=model['ignore_features'])
        train_chunks = self.io_utils.load_train_set(use_cols=_use_cols, converters=misc_utils.get_converters(_use_cols),
                                                    chunk_size=1500000)
        # NOTE(review): the encoder must expose partial_fit(); presumably a
        # project-specific OneHotEncoder -- confirm against the file's imports.
        enc = OneHotEncoder()
        start = datetime.now()

        logging.debug("Training Encoder...")
        for chunk_number, chunk in enumerate(train_chunks, 1):
            # DataFrame.as_matrix() was removed in pandas 1.0; .values yields the
            # same ndarray and is available in old and new pandas versions.
            enc.partial_fit(chunk.values)
            logging.debug("Trained encoder with chunk %d\t%s", chunk_number, datetime.now() - start)
        logging.debug("Finished training encoder in \t%s", datetime.now() - start)

        self.io_utils.save_to_cache(enc, encoder_cache_file_name)
        return enc