Пример #1
0
 def run(self):
     """Fit a cross-validated logistic regression for ``self.category_name``.

     Reads the training CSV at ``self.input_file_path``, loads the pickled
     features ``<csv name>_<category>_features.pkl`` from
     ``self.input_features_path``, fits ``LogisticRegressionCV`` over a fixed
     grid of ``C`` values and pickles the fitted model to
     ``self.output().path``.

     The selected ``C`` value(s), the mean cross-validation score of every
     grid point and the model artefact are then sent to MLflow on a
     best-effort basis: any tracking failure is logged and does not fail
     the task.
     """
     logger.info(f'Reading data from {self.input_file_path}')
     data_df = pd.read_csv(self.input_file_path)
     try_mkdir(self.output_artefact_path)

     # "some/dir/train.csv" -> "train"; basename also copes with the
     # platform's own path separator.
     features_file_name = os.path.basename(self.input_file_path).split('.csv')[0]
     features_path = os.path.join(
         self.input_features_path,
         f'{features_file_name}_{self.category_name}_features.pkl')
     # NOTE: unpickling executes arbitrary code; only load trusted artefacts.
     with open(features_path, 'rb') as f:
         features = pickle.load(f)

     C_s = [0.01, 0.1, 1, 10, 100]
     # Cs is passed by keyword: recent scikit-learn releases reject
     # positional constructor arguments.
     # NOTE(review): make_scorer(roc_auc_score) with default settings scores
     # hard class predictions; scoring='roc_auc' would rank by decision
     # scores instead -- confirm which one is intended before changing it.
     model = LogisticRegressionCV(Cs=C_s,
                                  cv=5,
                                  n_jobs=-1,
                                  max_iter=1000,
                                  scoring=make_scorer(roc_auc_score))

     logger.info(f"Fitting lr for category {self.category_name}")
     model.fit(features, data_df[self.category_name])

     with open(self.output().path, 'wb') as f:
         logger.info(f"Saving predictor locally for category {self.category_name}")
         pickle.dump(model, f)

     try:
         mlflow.set_experiment(f'/lr_category_{self.category_name}')
         with mlflow.start_run():
             logger.info("Sending cv parameters and scores to ML Flow")
             # C_ holds the selected C value(s), one entry per fitted class.
             for i, c in enumerate(model.C_):
                 mlflow.log_param(f'C_{i}', c)

             # Log the mean CV score of every grid point. Iterating over
             # model.C_ here would only cover the first len(model.C_) grid
             # values (a single one for a binary target).
             # Indexed as [fold, grid point]; the key 1 assumes the positive
             # class label is 1 -- TODO confirm.
             cv_scores = model.scores_[1]
             for i, c in enumerate(C_s):
                 mlflow.log_metric(f"mean_roc_auc_C_{c}", np.mean(cv_scores[:, i]))

             logger.info("Sending model artifact to ML Flow")
             mlflow.log_artifact(self.output().path)
     except Exception as e:
         # Best-effort tracking: report and carry on. The exception is
         # interpolated into the message itself; a bare extra positional
         # argument with no placeholder would never be rendered.
         logger.error(f"Something went wrong while trying to use MLFlow tracking: {e}")
Пример #2
0
def _download_first_run_artifact(client, experiment_name, artifact_path,
                                 dst_path):
    """Download ``artifact_path`` from the first listed run of an experiment.

    Args:
        client: an ``mlflow.tracking.MlflowClient``.
        experiment_name: name of the MLflow experiment to look up.
        artifact_path: artifact path relative to the run's artifact root.
        dst_path: local directory the artifact is downloaded into.

    Returns:
        Whatever ``client.download_artifacts`` returns.

    Raises:
        LookupError: if no experiment with that name exists.
        IndexError: if the experiment has no runs.
    """
    experiment = client.get_experiment_by_name(experiment_name)
    if experiment is None:
        raise LookupError(
            f"MLflow experiment {experiment_name!r} does not exist")

    run_infos = client.list_run_infos(experiment.experiment_id)
    if not run_infos:
        raise IndexError(f"MLflow experiment {experiment_name!r} has no runs")

    # Uses the first run returned by the tracking server (presumably the
    # most recent one -- confirm against the MLflow version in use).
    return client.download_artifacts(run_infos[0].run_id, artifact_path,
                                     dst_path)


def get_artefacts(categories=('toxic', 'severe_toxic', 'obscene', 'threat',
                              'insult', 'identity_hate')):
    """Fetch the fitted featurizers and models from MLflow to local folders.

    Downloads ``tfidf_vectorizer.pkl`` from the ``/tfidf`` experiment and,
    for every category, ``mnb_featurizer_<category>.pkl`` from
    ``/mnb_category_<category>`` (both into the featurizers folder) and
    ``<category>_lr.pkl`` from ``/lr_category_<category>`` (into the models
    folder). Target folders come from ``globalconfig()`` and are created
    with ``try_mkdir`` first.

    Args:
        categories: category names to fetch artefacts for; defaults to the
            six labels that were previously hard-coded here.

    Raises:
        LookupError: if one of the experiments does not exist.
        IndexError: if one of the experiments has no runs.
    """
    client = mlflow.tracking.MlflowClient()

    config = globalconfig()
    featurizers_dir = config.featurizers_artefacts_folder
    models_dir = config.model_artefacts_folder

    try_mkdir(featurizers_dir)
    try_mkdir(models_dir)

    _download_first_run_artifact(client, '/tfidf', 'tfidf_vectorizer.pkl',
                                 featurizers_dir)

    for category in categories:
        _download_first_run_artifact(client, f'/mnb_category_{category}',
                                     f'mnb_featurizer_{category}.pkl',
                                     featurizers_dir)
        _download_first_run_artifact(client, f'/lr_category_{category}',
                                     f'{category}_lr.pkl', models_dir)
Пример #3
0
    def run(self):
        """Transform the input comments with the stored featurizer.

        Loads ``mnb_featurizer_<category>.pkl`` from
        ``self.input_artefact_path`` via ``MNBFeaturizer.load``, applies it
        to the ``comment_text`` column of the CSV at
        ``self.input_file_path`` and pickles the result to
        ``self.output().path`` (after making sure ``self.data_output_path``
        exists).
        """
        logger.info("Generating features")
        logger.info("Reading data at {}".format(self.input_file_path))
        frame = pd.read_csv(self.input_file_path)

        # Locate and load the featurizer fitted for this category.
        featurizer_path = os.path.join(
            self.input_artefact_path,
            f'mnb_featurizer_{self.category_name}.pkl',
        )
        mnb_featurizer = MNBFeaturizer.load(featurizer_path)

        logger.info('Transofrming data')
        comments = frame['comment_text']
        features = mnb_featurizer.transform(comments)

        # Persist the transformed features for downstream tasks.
        try_mkdir(self.data_output_path)
        with open(self.output().path, 'wb') as sink:
            pickle.dump(features, sink)
Пример #4
0
    def run(self):
        """Fit and store the MNB featurizer for ``self.category_name``.

        Unpickles the object produced by the required task (used here as the
        TF-IDF vectorizer), wraps it in ``MNBFeaturizer``, fits it on the
        ``comment_text`` column against the ``self.category_name`` column of
        the CSV at ``self.input_file_path`` and saves it to
        ``self.output().path``.

        The saved artefact is then logged to the MLflow experiment
        ``/mnb_category_<category>`` on a best-effort basis: any tracking
        failure is logged and does not fail the task.
        """
        data_df = pd.read_csv(self.input_file_path)

        # NOTE: unpickling executes arbitrary code; only load trusted artefacts.
        with open(self.requires().output().path, 'rb') as f:
            tfidf = pickle.load(f)

        featurizer = MNBFeaturizer(tfidf)
        logger.info("Fitting MNB for category {}".format(self.category_name))
        featurizer.fit(data_df['comment_text'], data_df[self.category_name])
        try_mkdir(self.artefact_output_path)
        featurizer.save(self.output().path)

        try:
            mlflow.set_experiment(f'/mnb_category_{self.category_name}')
            with mlflow.start_run():
                logger.info("Sending MNB artefact to MLFlow")
                mlflow.log_artifact(self.output().path)
        except Exception as e:
            # Best-effort tracking: report and carry on. The exception is
            # interpolated into the message itself; a bare extra positional
            # argument with no placeholder would never be rendered.
            logger.error(
                f"Something went wrong while trying to use MLFlow tracking: {e}")
Пример #5
0
    def run(self):
        """Score the batch CSV with every per-category model.

        Reads the CSV at ``self.input_batch_data`` and, for each category,
        loads the pickled features from the task input
        ``generated_test_features_<category>`` and the pickled model
        ``<category>_lr.pkl`` from the configured model artefacts folder.
        The second column of ``predict_proba`` is stored under the category
        name, and the resulting ``id`` + category columns are written to
        ``self.output().path`` as CSV without the index.
        """
        logger.info(f'Reading data from {self.input_batch_data}')
        data_df = pd.read_csv(self.input_batch_data)
        # Explicit copy: the prediction columns are added to an independent
        # frame instead of a slice of data_df, which would trigger pandas'
        # SettingWithCopyWarning.
        pred_df = data_df[['id']].copy()
        try_mkdir(self.output_prediction_path)

        # Loop-invariant lookups, resolved once.
        model_dir = globalconfig().model_artefacts_folder
        task_inputs = self.input()

        for category in [
                "toxic", "severe_toxic", "obscene", "threat", "insult",
                "identity_hate"
        ]:
            filename = task_inputs[f'generated_test_features_{category}'].path
            # NOTE: unpickling executes arbitrary code; only load trusted
            # artefacts.
            with open(filename, 'rb') as f:
                features = pickle.load(f)

            with open(os.path.join(model_dir, f'{category}_lr.pkl'),
                      'rb') as f:
                lr = pickle.load(f)

            # Column 1 of predict_proba is the probability of the second
            # class in lr.classes_ (presumably the positive label -- confirm).
            pred_df[category] = lr.predict_proba(features)[:, 1]

        pred_df.to_csv(self.output().path, index=False)
Пример #6
0
    def run(self):
        """Fit a TF-IDF vectorizer on ``comment_text`` and store it.

        Reads the CSV at ``self.input_file_path``, builds a
        ``TfidfVectorizer`` whose tokenizer is the ``encode`` method of a
        ``BPEmb`` instance, fits it on the ``comment_text`` column and
        pickles it to ``self.output().path``.

        The pickled artefact is then logged to the MLflow experiment
        ``/tfidf`` on a best-effort basis: any tracking failure is logged
        and does not fail the task.
        """
        logger.info('Reading data from {}'.format(self.input_file_path))
        data_df = pd.read_csv(self.input_file_path)
        # BPEmb files are cached under ./bpemb_cache, relative to the
        # current working directory.
        bpemb_en = BPEmb(lang="en",
                         dim=50,
                         vs=200000,
                         cache_dir='./bpemb_cache')
        tfidf = TfidfVectorizer(tokenizer=bpemb_en.encode)
        logger.info("Fitting tfidf")
        tfidf.fit(data_df['comment_text'])

        try_mkdir(self.artefact_output_path)
        with open(self.output().path, 'wb') as f:
            pickle.dump(tfidf, f)

        try:
            mlflow.set_experiment('/tfidf')
            with mlflow.start_run():
                logger.info("Sending tfidf artefact to MLFlow")
                mlflow.log_artifact(self.output().path)
        except Exception as e:
            # Best-effort tracking: report and carry on. The exception is
            # interpolated into the message itself; a bare extra positional
            # argument with no placeholder would never be rendered.
            logger.error(
                f"Something went wrong while trying to use MLFlow tracking: {e}")