def run(self):
    """Fit a cross-validated logistic regression for ``self.category_name``.

    Steps:
      1. Read the training CSV at ``self.input_file_path``.
      2. Load the pickled features stored in ``self.input_features_path``
         under ``<csv stem>_<category>_features.pkl``.
      3. Fit ``LogisticRegressionCV`` over a fixed grid of ``C`` values with
         5-fold cross-validation, using the ``self.category_name`` column as
         the target.
      4. Pickle the fitted model to ``self.output().path``.
      5. Best effort: record the selected ``C``, the mean cross-validation
         score of every candidate ``C`` and the model file in MLflow. Any
         MLflow failure is logged and swallowed so the task still succeeds.
    """
    logger.info(f'Reading data from {self.input_file_path}')
    data_df = pd.read_csv(self.input_file_path)
    try_mkdir(self.output_artefact_path)

    # The features file is named after the CSV stem (text before ".csv").
    features_file_name = self.input_file_path.split('/')[-1].split('.csv')[0]
    features_path = os.path.join(
        self.input_features_path,
        f'{features_file_name}_{self.category_name}_features.pkl')
    with open(features_path, 'rb') as f:
        features = pickle.load(f)

    C_s = [0.01, 0.1, 1, 10, 100]
    # Cs is passed by keyword: recent scikit-learn releases make the
    # constructor arguments keyword-only.
    # NOTE(review): make_scorer(roc_auc_score) with default arguments scores
    # the output of predict() rather than probabilities; scoring='roc_auc'
    # is probably what was intended -- confirm before changing, because it
    # alters which C gets selected.
    model = LogisticRegressionCV(Cs=C_s,
                                 cv=5,
                                 n_jobs=-1,
                                 max_iter=1000,
                                 scoring=make_scorer(roc_auc_score))
    logger.info(f"Fitting lr for category {self.category_name}")
    model.fit(features, data_df[self.category_name])

    with open(self.output().path, 'wb') as f:
        logger.info(
            f"Saving predictor locally for category {self.category_name}")
        pickle.dump(model, f)

    try:
        mlflow.set_experiment(f'/lr_category_{self.category_name}')
        with mlflow.start_run():
            logger.info("Sending cv parameters and scores to ML Flow")
            # model.C_ holds the selected C (one entry per fitted class).
            for i, c in enumerate(model.C_):
                mlflow.log_param(f'C_{i}', c)
            # scores_[1] is indexed [fold, candidate C]; log the mean over
            # folds for *every* candidate, not only the first one.
            fold_scores = model.scores_[1]
            for i, c in enumerate(C_s):
                mlflow.log_metric(f"mean_roc_auc_C_{c}",
                                  np.mean(fold_scores[:, i]))
            logger.info("Sending model artifact to ML Flow")
            mlflow.log_artifact(self.output().path)
    except Exception as e:
        # Tracking is best effort. The exception is interpolated into the
        # message itself: passing it as an extra positional argument without
        # a placeholder means it is never rendered.
        logger.error(
            f"Something went wrong while trying to use MLFlow tracking: {e}")
def get_artefacts():
    """Download the pickled featurizers and models from MLflow.

    Creates the featurizer and model artefact folders named in
    ``globalconfig()``, then downloads:

    * ``tfidf_vectorizer.pkl`` from the ``/tfidf`` experiment,
    * ``mnb_featurizer_<category>.pkl`` from ``/mnb_category_<category>``,
    * ``<category>_lr.pkl`` from ``/lr_category_<category>``,

    for each of the six categories. Each artefact is taken from the first
    run that ``list_run_infos`` returns for the experiment.
    """
    client = mlflow.tracking.MlflowClient()

    def first_run_id(experiment_name):
        # Resolve experiment name -> experiment id -> id of the first run
        # in the listing.
        experiment = client.get_experiment_by_name(experiment_name)
        run_infos = client.list_run_infos(experiment.experiment_id)
        return run_infos[0].run_id

    try_mkdir(globalconfig().featurizers_artefacts_folder)
    try_mkdir(globalconfig().model_artefacts_folder)

    client.download_artifacts(
        first_run_id('/tfidf'),
        'tfidf_vectorizer.pkl',
        globalconfig().featurizers_artefacts_folder)

    categories = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
        'identity_hate'
    ]
    for category in categories:
        # Per-category featurizer first, then the matching classifier.
        client.download_artifacts(
            first_run_id(f'/mnb_category_{category}'),
            f'mnb_featurizer_{category}.pkl',
            globalconfig().featurizers_artefacts_folder)
        client.download_artifacts(
            first_run_id(f'/lr_category_{category}'),
            f'{category}_lr.pkl',
            globalconfig().model_artefacts_folder)
def run(self):
    """Transform the comments of a CSV with a fitted MNB featurizer.

    Reads ``self.input_file_path``, loads
    ``mnb_featurizer_<category>.pkl`` from ``self.input_artefact_path``,
    transforms the ``comment_text`` column and pickles the result to
    ``self.output().path`` (after making sure ``self.data_output_path``
    exists).
    """
    logger.info("Generating features")
    logger.info("Reading data at {}".format(self.input_file_path))
    frame = pd.read_csv(self.input_file_path)

    mnb = MNBFeaturizer.load(
        os.path.join(self.input_artefact_path,
                     f'mnb_featurizer_{self.category_name}.pkl'))

    logger.info('Transofrming data')
    comments = frame['comment_text']
    feature_matrix = mnb.transform(comments)

    try_mkdir(self.data_output_path)
    with open(self.output().path, 'wb') as sink:
        pickle.dump(feature_matrix, sink)
def run(self):
    """Fit and save the MNB featurizer for ``self.category_name``.

    Reads the training CSV at ``self.input_file_path``, unpickles the tfidf
    object produced by the required task, wraps it in an ``MNBFeaturizer``
    and fits it on the ``comment_text`` column against the
    ``self.category_name`` column. The fitted featurizer is saved to
    ``self.output().path`` and, best effort, uploaded to the
    ``/mnb_category_<category>`` MLflow experiment. MLflow failures are
    logged and swallowed.
    """
    data_df = pd.read_csv(self.input_file_path)

    with open(self.requires().output().path, 'rb') as f:
        tfidf = pickle.load(f)

    featurizer = MNBFeaturizer(tfidf)
    logger.info("Fitting MNB for category {}".format(self.category_name))
    featurizer.fit(data_df['comment_text'], data_df[self.category_name])

    try_mkdir(self.artefact_output_path)
    featurizer.save(self.output().path)

    try:
        mlflow.set_experiment(f'/mnb_category_{self.category_name}')
        with mlflow.start_run():
            logger.info("Sending MNB artefact to MLFlow")
            mlflow.log_artifact(self.output().path)
    except Exception as e:
        # Tracking is best effort. The exception is interpolated into the
        # message itself: passing it as an extra positional argument without
        # a placeholder means it is never rendered.
        logger.error(
            f"Something went wrong while trying to use MLFlow tracking: {e}")
def run(self):
    """Score a batch CSV with every per-category logistic regression.

    Reads ``self.input_batch_data``, and for each of the six categories
    loads the pickled features from the task input named
    ``generated_test_features_<category>`` and the pickled model
    ``<category>_lr.pkl`` from ``globalconfig().model_artefacts_folder``.
    Column 1 of ``predict_proba`` is stored under the category name, and the
    resulting frame (``id`` plus one column per category) is written to
    ``self.output().path`` without the index.
    """
    logger.info(f'Reading data from {self.input_batch_data}')
    data_df = pd.read_csv(self.input_batch_data)
    # Explicit copy: adding columns to a frame sliced out of data_df would
    # otherwise trigger pandas' SettingWithCopyWarning.
    pred_df = data_df[['id']].copy()
    try_mkdir(self.output_prediction_path)

    # Loop-invariant lookups, resolved once.
    model_folder = globalconfig().model_artefacts_folder
    inputs = self.input()

    for category in [
            "toxic", "severe_toxic", "obscene", "threat", "insult",
            "identity_hate"
    ]:
        filename = inputs[f'generated_test_features_{category}'].path
        with open(filename, 'rb') as f:
            features = pickle.load(f)
        with open(os.path.join(model_folder, f'{category}_lr.pkl'),
                  'rb') as f:
            lr = pickle.load(f)
        # Second predict_proba column -- presumably the positive class.
        pred_df[category] = lr.predict_proba(features)[:, 1]

    pred_df.to_csv(self.output().path, index=False)
def run(self):
    """Fit a tfidf vectorizer on the comments and save it.

    Reads the CSV at ``self.input_file_path`` and fits a ``TfidfVectorizer``
    on the ``comment_text`` column, tokenising with the ``encode`` method of
    an English ``BPEmb`` model (dim=50, vocabulary size 200000, cached in
    ``./bpemb_cache``). The fitted vectorizer is pickled to
    ``self.output().path`` and, best effort, uploaded to the ``/tfidf``
    MLflow experiment. MLflow failures are logged and swallowed.
    """
    logger.info('Reading data from {}'.format(self.input_file_path))
    data_df = pd.read_csv(self.input_file_path)

    bpemb_en = BPEmb(lang="en",
                     dim=50,
                     vs=200000,
                     cache_dir='./bpemb_cache')
    tfidf = TfidfVectorizer(tokenizer=bpemb_en.encode)
    logger.info("Fitting tfidf")
    tfidf.fit(data_df['comment_text'])

    try_mkdir(self.artefact_output_path)
    with open(self.output().path, 'wb') as f:
        pickle.dump(tfidf, f)

    try:
        mlflow.set_experiment('/tfidf')
        with mlflow.start_run():
            logger.info("Sending tfidf artefact to MLFlow")
            mlflow.log_artifact(self.output().path)
    except Exception as e:
        # Tracking is best effort. The exception is interpolated into the
        # message itself: passing it as an extra positional argument without
        # a placeholder means it is never rendered.
        logger.error(
            f"Something went wrong while trying to use MLFlow tracking: {e}")