Пример #1
0
class PreparationTask(luigi.Task):
    """Luigi task that cleans the ``comment_text`` column of a CSV file.

    The file at ``input_df_file`` is read with pandas, line breaks in
    ``comment_text`` are replaced by spaces, and the result is written to
    ``output_df_folder`` as ``<name>_prepared<ext>``.
    """
    input_df_file = luigi.Parameter(globalconfig().train_data_path)
    output_df_folder = luigi.Parameter(globalconfig().preprocessed_data_folder)

    def clean_df(self, df, column_name):
        """Replace missing values and line breaks in one column.

        Note that ``df`` is modified in place; the same object is returned.

        Arguments:
            df {pd.DataFrame} -- data to process
            column_name {string} -- dataFrame column to clean

        Returns:
            pd.DataFrame -- processed dataframe
        """
        df[column_name] = df[column_name].fillna('').str.replace('\n', ' ')
        return df

    def output(self):
        """Return the local target ``<output_df_folder>/<name>_prepared<ext>``."""
        # splitext copes with names that contain several dots or none at all;
        # unpacking the result of ``split('.')`` raised ValueError for those.
        base_name = os.path.basename(self.input_df_file)
        stem, extension = os.path.splitext(base_name)
        # Kept as an attribute for backward compatibility with code reading it.
        self.output_file_name = f'{stem}_prepared{extension}'
        return luigi.LocalTarget(
            os.path.join(self.output_df_folder, self.output_file_name))

    def run(self):
        """Read the input CSV, clean ``comment_text`` and write the result."""
        df = pd.read_csv(self.input_df_file)
        logger.info('processing {}'.format(self.input_df_file))
        df_clean = self.clean_df(df, 'comment_text')

        # Resolve the target once and take the name from it, so the log line
        # does not depend on output() having been called earlier.
        target = self.output()
        logger.info('writing {} to {}'.format(os.path.basename(target.path),
                                              self.output_df_folder))
        with target.open('w') as f:
            df_clean.to_csv(f, index=False, encoding='utf-8')
Пример #2
0
def get_artefacts():
    """Download the pickled featurizers and models from MLflow.

    For the ``/tfidf`` experiment and, per category, the
    ``/mnb_category_<category>`` and ``/lr_category_<category>`` experiments,
    the artifact of the first run returned by ``list_run_infos`` is downloaded
    into the featurizer or model artefact folder from ``globalconfig()``.
    """
    tracking = mlflow.tracking.MlflowClient()

    try_mkdir(globalconfig().featurizers_artefacts_folder)
    try_mkdir(globalconfig().model_artefacts_folder)

    def first_run_id(experiment_name):
        # The first entry of the run list for the named experiment is used.
        experiment = tracking.get_experiment_by_name(experiment_name)
        run_infos = tracking.list_run_infos(experiment.experiment_id)
        return run_infos[0].run_id

    tfidf_run = first_run_id('/tfidf')
    tracking.download_artifacts(
        tfidf_run,
        'tfidf_vectorizer.pkl',
        globalconfig().featurizers_artefacts_folder)

    categories = ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
                  'identity_hate')
    for name in categories:
        mnb_run = first_run_id(f'/mnb_category_{name}')
        tracking.download_artifacts(
            mnb_run,
            f'mnb_featurizer_{name}.pkl',
            globalconfig().featurizers_artefacts_folder)

        lr_run = first_run_id(f'/lr_category_{name}')
        tracking.download_artifacts(
            lr_run,
            f'{name}_lr.pkl',
            globalconfig().model_artefacts_folder)
Пример #3
0
class TrainLogRegAllWrapperTask(luigi.WrapperTask):
    """Wrapper task that requires one ``TrainLogRegTask`` per category."""
    input_file_path = luigi.Parameter(default='./data/prepared/train_prepared.csv')
    input_features_path = luigi.Parameter(default=globalconfig().featurized_data_folder)
    output_artefact_path = luigi.Parameter(default=globalconfig().model_artefacts_folder)

    def requires(self):
        """Yield a ``TrainLogRegTask`` for each of the six categories."""
        categories = ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
                      'identity_hate')
        for name in categories:
            # Every task gets this wrapper's paths; only the category differs.
            task_kwargs = {
                'input_file_path': self.input_file_path,
                'input_features_path': self.input_features_path,
                'output_artefact_path': self.output_artefact_path,
                'category_name': name,
            }
            yield TrainLogRegTask(**task_kwargs)
Пример #4
0
 def build_pipeline(self, category: str) -> Pipeline:
     """Assemble the prediction pipeline for one category.

     Unpickles ``mnb_featurizer_<category>.pkl`` from the featurizer artefact
     folder and ``<category>_lr.pkl`` from the model artefact folder, then
     chains them as the ``'mnb'`` and ``'lr'`` steps of a ``Pipeline``.

     Note: the files are read with ``pickle``, so they must come from a
     trusted source.
     """
     def unpickle(folder, file_name):
         # Read one pickled artefact from ``folder``.
         with open(os.path.join(folder, file_name), 'rb') as handle:
             return pickle.load(handle)

     steps = [
         ('mnb', unpickle(globalconfig().featurizers_artefacts_folder,
                          f'mnb_featurizer_{category}.pkl')),
         ('lr', unpickle(globalconfig().model_artefacts_folder,
                         f'{category}_lr.pkl')),
     ]
     return Pipeline(steps)
Пример #5
0
class TrainLogRegTask(luigi.Task):
    """Train a cross-validated logistic regression for one category.

    The task reads the labels from ``input_file_path``, the pickled features
    from ``input_features_path``, fits a ``LogisticRegressionCV`` and pickles
    the fitted model to ``<output_artefact_path>/<category_name>_lr.pkl``.
    Parameters, CV scores and the model file are then sent to MLflow on a
    best-effort basis.
    """
    input_file_path = luigi.Parameter(default='./data/prepared/train_prepared.csv')
    input_features_path = luigi.Parameter(globalconfig().featurized_data_folder)
    category_name = luigi.Parameter()
    output_artefact_path = luigi.Parameter(globalconfig().model_artefacts_folder)

    def requires(self):
        """Depend on the generated features for this category."""
        return GenerateMNBFeaturesTask(input_file_path=self.input_file_path,
                                       input_artefact_path=globalconfig().featurizers_artefacts_folder,
                                       data_output_path=globalconfig().featurized_data_folder,
                                       category_name=self.category_name)

    def output(self):
        """Return the local target of the pickled model."""
        output_path = os.path.join(self.output_artefact_path, f'{self.category_name}_lr.pkl')
        return luigi.LocalTarget(output_path)

    def run(self):
        """Fit the model, save it locally and log it to MLflow."""
        logger.info(f'Reading data from {self.input_file_path}')
        data_df = pd.read_csv(self.input_file_path)
        try_mkdir(self.output_artefact_path)

        # The features file name mirrors the naming used by
        # GenerateMNBFeaturesTask.output().
        features_file_name = self.input_file_path.split('/')[-1].split('.csv')[0]
        features_path = os.path.join(
            self.input_features_path,
            f'{features_file_name}_{self.category_name}_features.pkl')
        with open(features_path, 'rb') as f:
            features = pickle.load(f)

        C_s = [0.01, 0.1, 1, 10, 100]
        # ``Cs`` is passed by keyword: recent scikit-learn versions reject it
        # positionally. ``scoring='roc_auc'`` scores the decision values;
        # ``make_scorer(roc_auc_score)`` scored hard ``predict`` labels, which
        # is not a ROC AUC over ranking scores.
        model = LogisticRegressionCV(Cs=C_s,
                                     cv=5,
                                     n_jobs=-1,
                                     max_iter=1000,
                                     scoring='roc_auc')

        logger.info(f"Fitting lr for category {self.category_name}")
        model.fit(features, data_df[self.category_name])

        with open(self.output().path, 'wb') as f:
            logger.info(f"Saving predictor locally for category {self.category_name}")
            pickle.dump(model, f)

        # MLflow tracking is best-effort: a failure is logged, not raised,
        # because the model has already been saved locally.
        try:
            mlflow.set_experiment(f'/lr_category_{self.category_name}')
            with mlflow.start_run():
                logger.info("Sending cv parameters and scores to ML Flow")
                # Selected regularisation strength(s), one per fitted class.
                for i, c in enumerate(model.C_):
                    mlflow.log_param(f'C_{i}', c)

                # scores_[1] holds the CV scores for label 1 (folds x Cs grid
                # per the scikit-learn docs). Average over folds and log one
                # metric per grid value; iterating over ``model.C_`` only ever
                # logged the first grid value for a binary target.
                mean_scores = np.mean(model.scores_[1], axis=0)
                for c_value, mean_score in zip(C_s, mean_scores):
                    mlflow.log_metric(f"mean_roc_auc_C_{c_value}", mean_score)

                logger.info("Sending model artifact to ML Flow")
                mlflow.log_artifact(self.output().path)
        except Exception as e:
            # The exception needs a %s placeholder; without one the logging
            # call itself fails to format and the message is lost.
            logger.exception(
                "Something went wrong while trying to use MLFlow tracking: %s", e)
Пример #6
0
class GenerateFeaturesWrapperTask(luigi.WrapperTask):
    """Wrapper task that requires one ``GenerateMNBFeaturesTask`` per category."""
    input_file_path = luigi.Parameter(default='./data/prepared/train_prepared.csv')
    input_artefact_path = luigi.Parameter(
        default=globalconfig().featurizers_artefacts_folder)
    data_output_path = luigi.Parameter(
        default=globalconfig().featurized_data_folder)

    def requires(self):
        """Yield a ``GenerateMNBFeaturesTask`` for each of the six categories."""
        categories = ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
                      'identity_hate')
        for name in categories:
            # Every task gets this wrapper's paths; only the category differs.
            task_kwargs = {
                'input_file_path': self.input_file_path,
                'input_artefact_path': self.input_artefact_path,
                'data_output_path': self.data_output_path,
                'category_name': name,
            }
            yield GenerateMNBFeaturesTask(**task_kwargs)
Пример #7
0
    def requires(self):
        """Return the tasks this task depends on, keyed by name.

        The dictionary holds, in this order: ``'prepared_test'`` (preparation
        of ``input_batch_data``), one ``'generated_test_features_<category>'``
        entry per category, and ``'trained_log_reg'`` (the wrapper task that
        trains all logistic regressions).
        """
        prepared = PreparationTask(
            input_df_file=self.input_batch_data,
            output_df_folder=globalconfig().preprocessed_data_folder)

        dependencies = {'prepared_test': prepared}

        categories = ("toxic", "severe_toxic", "obscene", "threat", "insult",
                      "identity_hate")
        # Features are generated from the output of the preparation task.
        dependencies.update(
            (f'generated_test_features_{name}',
             GenerateMNBFeaturesTask(
                 input_file_path=prepared.output().path,
                 input_artefact_path=globalconfig().featurizers_artefacts_folder,
                 data_output_path=globalconfig().featurized_data_folder,
                 category_name=name))
            for name in categories)

        dependencies['trained_log_reg'] = TrainLogRegAllWrapperTask(
            input_file_path=globalconfig().prepared_train_data_path,
            input_features_path=globalconfig().featurized_data_folder,
            output_artefact_path=globalconfig().model_artefacts_folder)

        return dependencies
Пример #8
0
class TrainTfidfTask(luigi.Task):
    """Fit a TF-IDF vectorizer on ``comment_text`` and pickle it.

    The vectorizer tokenizes with a ``BPEmb`` encoder and is written to
    ``<artefact_output_path>/tfidf_vectorizer.pkl``. The file is then sent to
    the ``/tfidf`` MLflow experiment on a best-effort basis.
    """
    input_file_path = luigi.Parameter(globalconfig().prepared_train_data_path)
    artefact_output_path = luigi.Parameter(
        globalconfig().featurizers_artefacts_folder)

    def requires(self):
        """Depend on the preparation of the configured training data."""
        # NOTE(review): the dependency is built from globalconfig(), not from
        # ``input_file_path`` - confirm that both point to the same data.
        return PreparationTask(
            input_df_file=globalconfig().train_data_path,
            output_df_folder=globalconfig().preprocessed_data_folder)

    def output(self):
        """Return the local target of the pickled vectorizer."""
        output_name = os.path.join(self.artefact_output_path,
                                   'tfidf_vectorizer.pkl')
        return luigi.LocalTarget(output_name)

    def run(self):
        """Fit the vectorizer, save it locally and log it to MLflow."""
        logger.info('Reading data from {}'.format(self.input_file_path))
        data_df = pd.read_csv(self.input_file_path)
        bpemb_en = BPEmb(lang="en",
                         dim=50,
                         vs=200000,
                         cache_dir='./bpemb_cache')
        tfidf = TfidfVectorizer(tokenizer=bpemb_en.encode)
        logger.info("Fitting tfidf")
        tfidf.fit(data_df['comment_text'])

        try_mkdir(self.artefact_output_path)
        with open(self.output().path, 'wb') as f:
            pickle.dump(tfidf, f)

        # MLflow tracking is best-effort: a failure is logged, not raised,
        # because the vectorizer has already been saved locally.
        try:
            mlflow.set_experiment('/tfidf')
            with mlflow.start_run():
                logger.info("Sending tfidf artefact to MLFlow")
                mlflow.log_artifact(self.output().path)
        except Exception as e:
            # The exception needs a %s placeholder; without one the logging
            # call itself fails to format and the message is lost.
            logger.exception(
                "Something went wrong while trying to use MLFlow tracking: %s",
                e)
Пример #9
0
class GenerateMNBFeaturesTask(luigi.Task):
    """Transform ``comment_text`` with a category's MNB featurizer.

    The featurizer ``mnb_featurizer_<category_name>.pkl`` is loaded from
    ``input_artefact_path`` and the transformed data is pickled to
    ``<data_output_path>/<input name>_<category_name>_features.pkl``.
    """
    input_file_path = luigi.Parameter(default='./data/prepared/train_prepared.csv')
    input_artefact_path = luigi.Parameter(
        default=globalconfig().featurizers_artefacts_folder)
    data_output_path = luigi.Parameter(
        default=globalconfig().featurized_data_folder)
    category_name = luigi.Parameter()

    def requires(self):
        """Depend on the trained MNB featurizer for this category."""
        train_path = globalconfig().prepared_train_data_path
        return TrainMNBTask(input_file_path=train_path,
                            artefact_output_path=self.input_artefact_path,
                            category_name=self.category_name)

    def output(self):
        """Return the local target of the pickled features."""
        # Last '/'-separated component, cut at the first '.csv'.
        last_component = self.input_file_path.rsplit('/', 1)[-1]
        stem = last_component.partition('.csv')[0]
        features_file = f'{stem}_{self.category_name}_features.pkl'
        return luigi.LocalTarget(
            os.path.join(self.data_output_path, features_file))

    def run(self):
        """Load the data and the featurizer, transform and pickle the result."""
        logger.info("Generating features")
        logger.info("Reading data at {}".format(self.input_file_path))
        frame = pd.read_csv(self.input_file_path)

        featurizer_file = f'mnb_featurizer_{self.category_name}.pkl'
        featurizer = MNBFeaturizer.load(
            os.path.join(self.input_artefact_path, featurizer_file))

        logger.info('Transofrming data')
        features = featurizer.transform(frame['comment_text'])

        try_mkdir(self.data_output_path)
        with open(self.output().path, 'wb') as sink:
            pickle.dump(features, sink)
Пример #10
0
class TrainMNBTask(luigi.Task):
    """Fit an ``MNBFeaturizer`` for one category on top of the TF-IDF model.

    The fitted featurizer is saved to
    ``<artefact_output_path>/mnb_featurizer_<category_name>.pkl`` and then
    sent to the ``/mnb_category_<category_name>`` MLflow experiment on a
    best-effort basis.
    """
    input_file_path = luigi.Parameter(globalconfig().prepared_train_data_path)
    artefact_output_path = luigi.Parameter(
        globalconfig().featurizers_artefacts_folder)
    category_name = luigi.Parameter()

    def requires(self):
        """Depend on the trained TF-IDF vectorizer."""
        return TrainTfidfTask(input_file_path=self.input_file_path,
                              artefact_output_path=self.artefact_output_path)

    def output(self):
        """Return the local target of the saved featurizer."""
        output_name = os.path.join(self.artefact_output_path,
                                   f'mnb_featurizer_{self.category_name}.pkl')
        return luigi.LocalTarget(output_name)

    def run(self):
        """Fit the featurizer, save it locally and log it to MLflow."""
        data_df = pd.read_csv(self.input_file_path)

        # self.input() is the output of requires(), i.e. the pickled TF-IDF.
        with open(self.input().path, 'rb') as f:
            tfidf = pickle.load(f)

        featurizer = MNBFeaturizer(tfidf)
        logger.info("Fitting MNB for category {}".format(self.category_name))
        featurizer.fit(data_df['comment_text'], data_df[self.category_name])
        try_mkdir(self.artefact_output_path)
        featurizer.save(self.output().path)

        # MLflow tracking is best-effort: a failure is logged, not raised,
        # because the featurizer has already been saved locally.
        try:
            mlflow.set_experiment(f'/mnb_category_{self.category_name}')
            with mlflow.start_run():
                logger.info("Sending MNB artefact to MLFlow")
                mlflow.log_artifact(self.output().path)
        except Exception as e:
            # The exception needs a %s placeholder; without one the logging
            # call itself fails to format and the message is lost.
            logger.exception(
                "Something went wrong while trying to use MLFlow tracking: %s",
                e)
Пример #11
0
    def run(self):
        """Predict the probability of every category for the batch data.

        Reads ``input_batch_data``, loads the generated features and the
        pickled logistic regression of each category, and writes a CSV with
        the ``id`` column plus one probability column per category.
        """
        logger.info(f'Reading data from {self.input_batch_data}')
        data_df = pd.read_csv(self.input_batch_data)
        # Copy the slice: columns are added below, and assigning into a slice
        # of ``data_df`` triggers pandas' SettingWithCopyWarning.
        pred_df = data_df[['id']].copy()
        try_mkdir(self.output_prediction_path)

        # Resolve the inputs once instead of rebuilding them per category.
        inputs = self.input()

        for category in [
                "toxic", "severe_toxic", "obscene", "threat", "insult",
                "identity_hate"
        ]:
            filename = inputs[f'generated_test_features_{category}'].path
            with open(filename, 'rb') as f:
                features = pickle.load(f)

            with open(
                    os.path.join(globalconfig().model_artefacts_folder,
                                 f'{category}_lr.pkl'), 'rb') as f:
                lr = pickle.load(f)

            # Column 1 of predict_proba is taken as the category probability.
            pred = lr.predict_proba(features)[:, 1]
            pred_df[category] = pred

        pred_df.to_csv(self.output().path, index=False)
Пример #12
0
 def requires(self):
     """Depend on ``PreparationTask`` for the configured training data.

     Both the input file and the output folder come from ``globalconfig()``.
     """
     source_file = globalconfig().train_data_path
     target_folder = globalconfig().preprocessed_data_folder
     return PreparationTask(input_df_file=source_file,
                            output_df_folder=target_folder)
Пример #13
0
 def requires(self):
     """Depend on ``TrainMNBTask`` for this task's category.

     The training file comes from ``globalconfig()``; the artefact folder and
     the category are taken from this task's own parameters.
     """
     train_path = globalconfig().prepared_train_data_path
     task_kwargs = {
         'input_file_path': train_path,
         'artefact_output_path': self.input_artefact_path,
         'category_name': self.category_name,
     }
     return TrainMNBTask(**task_kwargs)
Пример #14
0
class PredictLogRegTask(luigi.Task):
    """Predict the probability of every category for a batch of data.

    The task prepares ``input_batch_data``, generates the features of each
    category, requires the trained logistic regressions, and writes
    ``<output_prediction_path>/<batch name>_prediction.csv``.
    """
    input_batch_data = luigi.Parameter(default=globalconfig().test_data_path)
    output_prediction_path = luigi.Parameter(
        default=globalconfig().output_prediction_path)

    # Categories predicted by this task; shared by requires() and run() so
    # the two cannot drift apart.
    _CATEGORIES = ("toxic", "severe_toxic", "obscene", "threat", "insult",
                   "identity_hate")

    def output(self):
        """Return the local target of the prediction CSV."""
        batch_name = self.input_batch_data.split('/')[-1].split('.csv')[0]
        output_path = os.path.join(self.output_prediction_path,
                                   batch_name + '_prediction.csv')

        return luigi.LocalTarget(output_path)

    def requires(self):
        """Return the tasks this task depends on, keyed by name.

        The dictionary holds ``'prepared_test'``, one
        ``'generated_test_features_<category>'`` entry per category, and
        ``'trained_log_reg'``.
        """
        test_preparation = PreparationTask(
            input_df_file=self.input_batch_data,
            output_df_folder=globalconfig().preprocessed_data_folder)

        requirements_dict = {'prepared_test': test_preparation}

        for category in self._CATEGORIES:
            requirements_dict[
                f'generated_test_features_{category}'] = GenerateMNBFeaturesTask(
                    input_file_path=test_preparation.output().path,
                    input_artefact_path=globalconfig(
                    ).featurizers_artefacts_folder,
                    data_output_path=globalconfig().featurized_data_folder,
                    category_name=category)

        requirements_dict['trained_log_reg'] = TrainLogRegAllWrapperTask(
            input_file_path=globalconfig().prepared_train_data_path,
            input_features_path=globalconfig().featurized_data_folder,
            output_artefact_path=globalconfig().model_artefacts_folder)

        return requirements_dict

    def run(self):
        """Load features and models per category and write the predictions."""
        logger.info(f'Reading data from {self.input_batch_data}')
        data_df = pd.read_csv(self.input_batch_data)
        # Copy the slice: columns are added below, and assigning into a slice
        # of ``data_df`` triggers pandas' SettingWithCopyWarning.
        pred_df = data_df[['id']].copy()
        try_mkdir(self.output_prediction_path)

        # Resolve the inputs once instead of rebuilding them per category.
        inputs = self.input()

        for category in self._CATEGORIES:
            filename = inputs[f'generated_test_features_{category}'].path
            with open(filename, 'rb') as f:
                features = pickle.load(f)

            with open(
                    os.path.join(globalconfig().model_artefacts_folder,
                                 f'{category}_lr.pkl'), 'rb') as f:
                lr = pickle.load(f)

            # Column 1 of predict_proba is taken as the category probability.
            pred = lr.predict_proba(features)[:, 1]
            pred_df[category] = pred

        pred_df.to_csv(self.output().path, index=False)
Пример #15
0
 def requires(self):
     """Depend on ``GenerateMNBFeaturesTask`` for this task's category.

     The input file and the category come from this task's parameters; the
     featurizer and output folders are read from ``globalconfig()``.
     """
     featurizers_folder = globalconfig().featurizers_artefacts_folder
     features_folder = globalconfig().featurized_data_folder
     return GenerateMNBFeaturesTask(
         input_file_path=self.input_file_path,
         input_artefact_path=featurizers_folder,
         data_output_path=features_folder,
         category_name=self.category_name)