Example #1
import logging
from datetime import datetime

# IOUtility, misc_utils, and the mini-batch-capable OneHotEncoder are project-level
# helpers; their import paths are not shown in the original example.


class Preprocessor:
    def __init__(self):
        self.io_utils = IOUtility()
        self.settings = self.io_utils.get_settings()
        self.profile = self.io_utils.get_profile()
        self.enc = None

    def load_encoder(self):
        """
        Loads OneHotEncoder object if present in cache provided 'cache_encoder' is True in settings.
        If any of the above condition is false, it creates a new encoder object, trains it and caches it.
        """
        encoder_cache_file_name = "encoder.cache"
        if self.settings['cache']['cache_encoder']:
            self.enc = self.io_utils.load_from_cache(encoder_cache_file_name)
        if self.enc is None:
            logging.debug("Encoder is not cached or cache is disabled")
            self.enc = self.__train_encoder(encoder_cache_file_name)
        else:
            logging.debug("Successfully loaded encoder from cache")
        return self.enc

    def encode(self, x):
        return self.enc.transform(x)

    def __train_encoder(self, encoder_cache_file_name):
        """
        Trains and caches the encoder in mini batch mode
        :param encoder_cache_file_name:
        :return: trained encoder object
        """

        model = self.io_utils.get_model()
        # Load only the categorical columns of the training set, in chunks of 1.5M rows.
        _use_cols = misc_utils.get_cols_to_load(labels=False, load_int_cols=False, load_cat_cols=True,
                                                ignore_features=model['ignore_features'])
        train_chunks = self.io_utils.load_train_set(use_cols=_use_cols, converters=misc_utils.get_converters(_use_cols),
                                                    chunk_size=1500000)
        enc = OneHotEncoder()
        start = datetime.now()

        logging.debug("Training Encoder...")
        for i, chunk in enumerate(train_chunks):
            enc.partial_fit(chunk.to_numpy())  # DataFrame.as_matrix() was removed in pandas 1.0
            logging.debug("Trained encoder with chunk %d\t%s" % (i+1, str(datetime.now() - start)))
        logging.debug("Finished training encoder in \t%s" % str(datetime.now() - start))

        self.io_utils.save_to_cache(enc, encoder_cache_file_name)
        return enc
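
A minimal usage sketch for the class above, assuming the project's IOUtility and settings are configured as in the example; the sample rows passed to encode() are purely illustrative.

preprocessor = Preprocessor()
preprocessor.load_encoder()  # returns a cached encoder or trains a new one in mini batches
cat_rows = [["cat_a1", "cat_b2"], ["cat_a3", "cat_b1"]]  # hypothetical categorical values
encoded = preprocessor.encode(cat_rows)  # sparse one-hot representation
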
Example #2
import logging
from datetime import datetime

import numpy as np
from scipy import sparse
from sklearn import ensemble, linear_model, naive_bayes

# IOUtility, misc_utils, SEED, and the Preprocessor passed into the constructor are
# project-level objects; their import paths are not shown in the original example.


class Classifier:
    def __init__(self, preprocessor):
        self.preprocessor = preprocessor
        self.io_utils = IOUtility()
        self.settings = self.io_utils.get_settings()
        self.models = {
                "SGDC": linear_model.SGDClassifier,
                "BNB": naive_bayes.BernoulliNB,
                "MNB": naive_bayes.MultinomialNB,
                "RF": ensemble.RandomForestClassifier,
                "ET": ensemble.ExtraTreesClassifier,
                "ADA": ensemble.AdaBoostClassifier,
                "GB": ensemble.GradientBoostingClassifier}

    def run(self):
        model, model_name, model_settings = self.__load_model()
        self.train(model, model_settings['ignore_features'])
        log_loss_score = self.predict(model, model_settings['ignore_features'])
        logging.info("Log loss score for model '%s' is '%f'" % (model_name, log_loss_score))

    def train(self, model, ignore_features):
        logging.debug("Loading train set...")
        _use_cols = misc_utils.get_cols_to_load(labels=True, load_int_cols=True, load_cat_cols=True,
                                                ignore_features=ignore_features)
        train_df = self.io_utils.load_train_set(use_cols=_use_cols, converters=misc_utils.get_converters(_use_cols),
                                                chunk_size=self.settings['chunk_size'])

        start = datetime.now()
        logging.debug("Training Model...")
        
        if self.settings['chunk_size'] is not None:
            for i, chunk in enumerate(train_df):
                logging.debug("Processing chunk %d\t%s" % (i + 1, str(datetime.now() - start)))
                self.__process_train_chunk(chunk, model, ignore_features)
        else:
            self.__process_train_chunk(train_df, model, ignore_features)
        
        logging.debug("Finished training in \t%s" % str(datetime.now() - start))

    def predict(self, model, ignore_features):
        logging.debug("Loading test set...")
        _use_cols = misc_utils.get_cols_to_load(labels=True, load_int_cols=True, load_cat_cols=True,
                                                ignore_features=ignore_features)
        test_df = self.io_utils.load_validation_set(use_cols=_use_cols,
                                                     converters=misc_utils.get_converters(_use_cols),
                                                     chunk_size=self.settings['chunk_size'])
        start = datetime.now()
        logging.debug("Predicting probabilities...")
        actual_labels = []
        predicted_labels = []
        
        if self.settings['chunk_size'] is not None:
            for i, chunk in enumerate(test_df):
                logging.debug("Processing chunk %d\t%s" % (i + 1, str(datetime.now() - start)))
                y, predictions = self.__process_test_chunk(chunk, model, ignore_features)
                actual_labels += y.tolist()
                predicted_labels += self.__cap_predictions(predictions)
        else:
            y, predictions = self.__process_test_chunk(test_df, model, ignore_features)
            actual_labels += y.tolist()
            predicted_labels += self.__cap_predictions(predictions)

        logging.debug("Finished predicting in \t%s" % str(datetime.now() - start))
        return self.evaluate_model(actual_labels, predicted_labels)

    def __process_test_chunk(self, chunk, model, ignore_features):
        y = chunk['Label']
        X = self.__preprocess(chunk, ignore_features)
        predictions = model.predict_proba(X)
        return y, predictions

    def __process_train_chunk(self, chunk, model, ignore_features):
        y = chunk['Label']
        X = self.__preprocess(chunk, ignore_features)
        # partial_fit requires an incremental learner (e.g. SGDClassifier, BernoulliNB,
        # MultinomialNB); passing classes on every call is fine as long as it stays consistent.
        model.partial_fit(X, y, classes=[0, 1])

    def __preprocess(self, chunk, ignore_features):
        # Integer columns go in as a sparse CSR matrix; categorical columns are one-hot
        # encoded by the shared Preprocessor, then both are stacked horizontally.
        int_features = sparse.coo_matrix(chunk[misc_utils.get_integer_cols(ignore_features=ignore_features)]).tocsr()
        cat_features = chunk[misc_utils.get_categorical_cols(ignore_features=ignore_features)].to_numpy()
        cat_features = self.preprocessor.encode(cat_features)
        X = sparse.hstack((int_features, cat_features))
        return X

    def __load_model(self):
        # settings['model'] is expected to start with one of the keys in self.models,
        # e.g. 'SGDC' or 'SGDC_<variant>'. Note that the naive Bayes estimators do not
        # accept random_state, so this constructor call only suits the other models.
        model_name = self.settings['model']
        model_id = model_name.split('_')[0]
        model = self.models[model_id](random_state=SEED)

        model_settings = self.io_utils.get_model(model_name)
        model_params = model_settings["params"]
        model.set_params(**model_params)
        return model, model_name, model_settings

    @staticmethod
    def evaluate_model(actual_labels, predicted_labels):
        def log_loss(act, pred):
            # Binary log loss: -(1/N) * sum(y*log(p) + (1-y)*log(1-p)), with the
            # predictions clipped away from 0 and 1 so log() never sees an exact 0.
            epsilon = 1e-15
            act = np.asarray(act)
            pred = np.clip(np.asarray(pred), epsilon, 1 - epsilon)
            ll = np.sum(act * np.log(pred) + (1 - act) * np.log(1 - pred))
            return -ll / len(act)

        return log_loss(actual_labels, predicted_labels)

    @staticmethod
    def __cap_predictions(preds):
        # Keep only the positive-class probability (column 1 of predict_proba output)
        # and clamp it to [0.02, 0.98] so extreme mispredictions do not dominate the log loss.
        return_val = []
        for p in preds:
            prob = min(max(p[1], 0.02), 0.98)
            return_val.append(prob)
        return return_val
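
A minimal end-to-end sketch of how the two examples appear to fit together, assuming the settings file selects one of the incremental models (e.g. "SGDC") and that logging is configured; this driver code is illustrative and not part of the original examples.

logging.basicConfig(level=logging.DEBUG)

preprocessor = Preprocessor()
preprocessor.load_encoder()            # one-hot encoder, loaded from cache or freshly trained
classifier = Classifier(preprocessor)
classifier.run()                       # trains in chunks, predicts on the validation set, logs the log loss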