            # Load the page text; a page whose loader raises or returns None is
            # recorded in bad_pages and skipped.
            # NOTE(review): the bare `except:` also traps KeyboardInterrupt and
            # SystemExit - consider narrowing it to the loader's real errors.
            try:
                doc = load_text(p)
            except:
                bad_pages.append(p)
                continue
            if doc is None:
                bad_pages.append(p)
                continue

            # Same policy for the info box: any failure or a None result marks
            # the page as bad and moves on to the next one.
            try:
                infobox = load_info_box(p)
            except:
                bad_pages.append(p)
                continue
            if infobox is None:
                bad_pages.append(p)
                continue

            # Both loads succeeded: keep page id, text and info box in step so
            # the three lists stay aligned by position.
            page_index.append(p)
            content_lst.append(doc)
            infoboxes_lst.append(infobox)

        # Build the result from the three aligned lists and save it to this
        # task's output path.
        # NOTE(review): assumed to run once after the loop - confirm nesting.
        df = self._get_df(page_index, content_lst, infoboxes_lst)
        save_data(df, self.output().path)


if __name__ == '__main__':
    luigi.run_task(DataExtractionTask())
return vocab def run(self): full_df = self.get_task_inputs() vocab = self.__get_vocabulary(full_df['text']) tokenized_text = full_df['text'].apply( lambda text: ' '.join(self.tokenize_doc(text, vocab=vocab))) vectorizer = TfidfVectorizer() vectorizer.fit(tokenized_text) transformed_array = vectorizer.transform(tokenized_text) tokenized_df = pd.DataFrame(transformed_array.toarray(), index=full_df.index, columns=vectorizer.get_feature_names()) save_data(tokenized_df, self.output().path) # if self.config['preprocess']['is_data_dataframe']: # tokenized_df = pd.DataFrame(index=full_df.index) # for col, value in zip(vectorizer.get_feature_names(), transformed_array.toarray().T): # tokenized_df[col] = value # save_data(tokenized_df, self.output().path) # else: # save_data(transformed_array, self.output().path) if __name__ == '__main__': luigi.run_task(DataTokenizationTask())
mone = (prediction[j] * p_ij[i, j] + (1 - prediction[j]) * (p_ij[i, i] - p_ij[i, j])) mechane = mone + (1 - prediction[i]) * ( prediction[j] * (p_ij[j, j] - p_ij[i, j]) + (1 - prediction[j]) * (1 - p_ij[i, i] - p_ij[i, j] + p_ij[i, j])) p_i_new += prediction[i] * (mone / mechane) Z += (mone / mechane) p_i_new /= Z priored_prediction.append(p_i_new) return priored_prediction def run(self): inputs = self.get_task_inputs() y_pred = inputs['y_pred'] p_ij = inputs['p_ij'] E_ij = inputs['E_ij'] y_pred_after_prior = np.array( [self._run_prior_on_prediction(p_ij, E_ij, p) for p in y_pred]) save_data(y_pred_after_prior, self.output().path) if __name__ == '__main__': luigi.run_task(QuestionsPredictionsAfterPriorTask())
from questions_model.choose_best_model import QuestionsModelSelectionTask
from utils.utils import *


class QuestionsMakePredictionsTask(luigi.Task):
    """Fit the selected model on the training split and predict the test split.

    The class probabilities returned by ``predict_proba`` for ``X_test`` are
    handed to ``self.save``.
    """

    def requires(self):
        """Return the upstream tasks: the data set and the selected model."""
        return dict(data=CreateDataSetTask(),
                    best_model=QuestionsModelSelectionTask())

    def output(self):
        """Return the local target ``y_pred.pickle`` under the experiment dir."""
        target_path = get_file_path("y_pred.pickle", self.config['exp_dir'])
        return luigi.LocalTarget(target_path)

    def run(self):
        """Train the chosen model and save its test-set probabilities."""
        task_inputs = self.get_task_inputs()
        dataset: DataSet = task_inputs['data']
        estimator = task_inputs['best_model']

        test_features = dataset.X_test
        train_features, train_labels = dataset.train_data

        fitted = estimator.fit(train_features, train_labels)
        self.save(fitted.predict_proba(test_features))


if __name__ == '__main__':
    luigi.run_task(QuestionsMakePredictionsTask())
return # For DEBUG - smaller subcategories if self.config['debug']['DEBUG']: subcategories = subcategories[:20] for cat in subcategories: print(f"Entered to Wikipedia Category : {cat['title']}") self.__get_category(cat['title']) def run(self): self.pages = set() if self.config['extraction']['subcache']: try: self.pages = set( read_data(get_file_path(self.output_path, 'subcache')).split('\n')) except: self.pages = set( read_data(get_file_path('full_df.pickle', 'subcache')).index) self.__get_category(self.config['extraction']['initial_category']) save_data('\n'.join(self.pages), self.output().path) if __name__ == '__main__': luigi.run_task(WikipediaListExtractionTask())
import numpy as np

import utils.luigi_wrapper as luigi
from preprocess.data_tokenization import DataTokenizationTask
from utils.utils import *


class FeatureSelectionTask(luigi.Task):
    """Keep a random subset of the feature columns of the tokenized data."""

    # Maximum number of feature columns to keep.
    _MAX_FEATURES = 1000

    def requires(self):
        """Return the upstream tokenization task."""
        return DataTokenizationTask()

    def output(self):
        """Return the local target ``final_data.pickle`` in the data folder."""
        return luigi.LocalTarget(get_file_path('final_data.pickle', 'data'))

    def run(self):
        """Sample distinct feature columns at random and save the reduced data.

        At most ``_MAX_FEATURES`` columns are drawn without replacement, so no
        column is selected twice and inputs with fewer columns are kept whole
        (in shuffled column order) instead of being padded with duplicates.
        """
        X = self.get_task_inputs()
        n_available = X.shape[1]
        feature_indices = np.random.choice(
            n_available,
            size=min(self._MAX_FEATURES, n_available),
            replace=False)

        # `X[:, idx]` is positional indexing for arrays / sparse matrices only;
        # a pandas DataFrame needs `.iloc` for the same column selection.
        if hasattr(X, 'iloc'):
            X = X.iloc[:, feature_indices]
        else:
            X = X[:, feature_indices]

        save_data(X, self.output().path)


if __name__ == '__main__':
    luigi.run_task(FeatureSelectionTask())
import utils.luigi_wrapper as luigi
from utils.utils import *


class TestTask(luigi.Task):
    """Minimal task that declares a single integer parameter ``x``."""

    x = luigi.luigi.IntParameter()


if __name__ == '__main__':
    # Build the task first, then read the scheduler setting, matching the
    # order in which the arguments were originally evaluated.
    task = TestTask()
    use_local_scheduler = get_from_config('luigi_local_scheduler')
    luigi.run_task(task,
                   local_scheduler=use_local_scheduler,
                   delete_all=False)
self._plot_all_ROCs(fpr, tpr, roc_auc, lw) self._set_ROC_axis(lw, self.config['visualization']['plot_all_ROCs']) f = plt.gcf() plt.show() return f def run(self): inputs = self.get_task_inputs() data: DataSet = inputs['data'] y_pred = inputs['y_pred'] index_test = data._arr_indices_test self.y_true = data.y_test self.y_pred = y_pred[index_test] # metrics self.print_metrics(self.y_true, self.y_pred) # get ROC auc's fpr, tpr, roc_auc = self.calculate_ROCs() # get ROC curve plots f = self.plot_ROCs(fpr, tpr, roc_auc) f.savefig(self.output().path) # self.task_done() # General TODO - add prior for questions if __name__ == '__main__': luigi.run_task(PlotROCTask())