예제 #1
0
            try:
                doc = load_text(p)
            except:
                bad_pages.append(p)
                continue

            if doc is None:
                bad_pages.append(p)
                continue

            try:
                infobox = load_info_box(p)
            except:
                bad_pages.append(p)
                continue

            if infobox is None:
                bad_pages.append(p)
                continue

            page_index.append(p)
            content_lst.append(doc)
            infoboxes_lst.append(infobox)

        df = self._get_df(page_index, content_lst, infoboxes_lst)
        save_data(df, self.output().path)


# Script entry point: when this module is executed directly, hand a
# DataExtractionTask instance to luigi.run_task.
if __name__ == '__main__':
    luigi.run_task(DataExtractionTask())
        return vocab

    def run(self):
        """Build a TF-IDF matrix from the input texts and save it as a DataFrame.

        Reads the task input, derives a vocabulary from its ``'text'`` column,
        re-tokenizes every document against that vocabulary, fits a
        ``TfidfVectorizer`` on the re-joined tokens and stores the dense
        TF-IDF matrix at the task's output path. Rows keep the index of the
        input; columns are the vectorizer's feature names.
        """
        full_df = self.get_task_inputs()
        texts = full_df['text']

        vocab = self.__get_vocabulary(texts)

        # The tokens are joined back with single spaces so that every
        # document is handed to the vectorizer as one string.
        tokenized_text = texts.apply(
            lambda text: ' '.join(self.tokenize_doc(text, vocab=vocab)))

        vectorizer = TfidfVectorizer()
        transformed_array = vectorizer.fit_transform(tokenized_text)

        # get_feature_names() was deprecated in scikit-learn 1.0 and removed
        # in 1.2; prefer its replacement and fall back on older releases.
        try:
            feature_names = vectorizer.get_feature_names_out()
        except AttributeError:
            feature_names = vectorizer.get_feature_names()

        tokenized_df = pd.DataFrame(transformed_array.toarray(),
                                    index=full_df.index,
                                    columns=feature_names)
        save_data(tokenized_df, self.output().path)

        # if self.config['preprocess']['is_data_dataframe']:
        #     tokenized_df = pd.DataFrame(index=full_df.index)
        #     for col, value in zip(vectorizer.get_feature_names(), transformed_array.toarray().T):
        #         tokenized_df[col] = value
        #     save_data(tokenized_df, self.output().path)
        # else:
        #     save_data(transformed_array, self.output().path)


# Script entry point: when this module is executed directly, hand a
# DataTokenizationTask instance to luigi.run_task.
if __name__ == '__main__':
    luigi.run_task(DataTokenizationTask())
                mone = (prediction[j] * p_ij[i, j] + (1 - prediction[j]) *
                        (p_ij[i, i] - p_ij[i, j]))
                mechane = mone + (1 - prediction[i]) * (
                    prediction[j] * (p_ij[j, j] - p_ij[i, j]) +
                    (1 - prediction[j]) *
                    (1 - p_ij[i, i] - p_ij[i, j] + p_ij[i, j]))

                p_i_new += prediction[i] * (mone / mechane)
                Z += (mone / mechane)

            p_i_new /= Z

            priored_prediction.append(p_i_new)
        return priored_prediction

    def run(self):
        """Apply the prior to every prediction and save the adjusted array.

        Reads ``y_pred``, ``p_ij`` and ``E_ij`` from the task inputs, passes
        each element of ``y_pred`` through ``_run_prior_on_prediction``
        together with ``p_ij`` and ``E_ij``, and writes the collected results
        as a NumPy array to the task's output path.
        """
        task_inputs = self.get_task_inputs()
        raw_predictions = task_inputs['y_pred']
        prior_p = task_inputs['p_ij']
        prior_e = task_inputs['E_ij']

        adjusted = []
        for single_prediction in raw_predictions:
            adjusted.append(
                self._run_prior_on_prediction(prior_p, prior_e,
                                              single_prediction))

        save_data(np.array(adjusted), self.output().path)


# Script entry point: when this module is executed directly, hand a
# QuestionsPredictionsAfterPriorTask instance to luigi.run_task.
if __name__ == '__main__':
    luigi.run_task(QuestionsPredictionsAfterPriorTask())
예제 #4
0
from questions_model.choose_best_model import QuestionsModelSelectionTask
from utils.utils import *


class QuestionsMakePredictionsTask(luigi.Task):
    """Fit the selected model on the training data and store its test predictions."""

    def requires(self):
        """Return the upstream tasks, keyed by the names ``run`` looks up."""
        return {
            'data': CreateDataSetTask(),
            'best_model': QuestionsModelSelectionTask()
        }

    def output(self):
        """Return the local target for ``y_pred.pickle`` under ``config['exp_dir']``."""
        # Plain string literal: the former f-string had no placeholders.
        return luigi.LocalTarget(
            get_file_path("y_pred.pickle", self.config['exp_dir']))

    def run(self):
        """Train the chosen model and save ``predict_proba`` output for ``X_test``."""
        inputs = self.get_task_inputs()
        data: DataSet = inputs['data']
        best_model = inputs['best_model']
        X_test = data.X_test
        X_train, y_train = data.train_data

        # The object returned by fit() is the one used for prediction.
        model = best_model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)

        self.save(y_pred)


# Script entry point: when this module is executed directly, hand a
# QuestionsMakePredictionsTask instance to luigi.run_task.
if __name__ == '__main__':
    luigi.run_task(QuestionsMakePredictionsTask())
            return

        # For DEBUG - smaller subcategories
        if self.config['debug']['DEBUG']:
            subcategories = subcategories[:20]

        for cat in subcategories:
            print(f"Entered to Wikipedia Category : {cat['title']}")
            self.__get_category(cat['title'])

    def run(self):
        """Collect pages starting from the configured category and save them.

        ``self.pages`` starts as an empty set. When
        ``config['extraction']['subcache']`` is truthy it is seeded from a
        cached copy first: the newline-separated data read from the
        ``'subcache'`` location of ``self.output_path`` or, if that fails, the
        index of the cached ``full_df.pickle``. ``__get_category`` is then
        called on ``config['extraction']['initial_category']`` and the
        contents of ``self.pages`` are written, newline-joined, to the task's
        output path.
        """
        self.pages = set()

        if self.config['extraction']['subcache']:
            try:
                cached_listing = read_data(
                    get_file_path(self.output_path, 'subcache'))
                self.pages = set(cached_listing.split('\n'))
            except Exception:
                # Best-effort fallback to the cached data frame. Catching
                # Exception rather than using a bare ``except`` lets
                # KeyboardInterrupt and SystemExit propagate.
                self.pages = set(
                    read_data(get_file_path('full_df.pickle',
                                            'subcache')).index)

        # NOTE(review): presumably __get_category adds the pages it finds to
        # self.pages; confirm against its definition.
        self.__get_category(self.config['extraction']['initial_category'])

        save_data('\n'.join(self.pages), self.output().path)


# Script entry point: when this module is executed directly, hand a
# WikipediaListExtractionTask instance to luigi.run_task.
if __name__ == '__main__':
    luigi.run_task(WikipediaListExtractionTask())
import numpy as np

import utils.luigi_wrapper as luigi
from preprocess.data_tokenization import DataTokenizationTask
from utils.utils import *


class FeatureSelectionTask(luigi.Task):
    """Keep a random subset of the feature columns produced by tokenization."""

    # Upper bound on the number of feature columns that are kept.
    MAX_FEATURES = 1000

    def requires(self):
        """Return the upstream task whose output is reduced here."""
        return DataTokenizationTask()

    def output(self):
        """Return the local target for ``final_data.pickle`` in ``'data'``."""
        return luigi.LocalTarget(get_file_path('final_data.pickle', 'data'))

    def run(self):
        """Pick up to ``MAX_FEATURES`` distinct random columns and save them.

        Columns are drawn without replacement, so no feature appears twice in
        the result. When the input has fewer than ``MAX_FEATURES`` columns,
        all of them are kept (in random order).
        """
        X = self.get_task_inputs()

        n_available = X.shape[1]
        n_keep = min(self.MAX_FEATURES, n_available)

        # replace=False: np.random.choice samples WITH replacement by default,
        # which would select the same feature column several times.
        feature_indices = np.random.choice(n_available,
                                           size=n_keep,
                                           replace=False)

        # A pandas DataFrame cannot be sliced with X[:, idx]; use positional
        # .iloc when it is available and plain array indexing otherwise.
        if hasattr(X, 'iloc'):
            X = X.iloc[:, feature_indices]
        else:
            X = X[:, feature_indices]

        save_data(X, self.output().path)


# Script entry point: when this module is executed directly, hand a
# FeatureSelectionTask instance to luigi.run_task.
if __name__ == '__main__':
    luigi.run_task(FeatureSelectionTask())
예제 #7
0
import utils.luigi_wrapper as luigi
from utils.utils import *


# Minimal task that declares a single integer parameter and nothing else.
class TestTask(luigi.Task):
    # Integer parameter, declared without a default value.
    # NOTE(review): `luigi` here is utils.luigi_wrapper; `luigi.luigi` is
    # presumably the underlying luigi package re-exported by it - confirm.
    x = luigi.luigi.IntParameter()


# Script entry point: when this module is executed directly, hand a TestTask
# instance to luigi.run_task, with the scheduler choice read from the
# 'luigi_local_scheduler' config entry and delete_all disabled.
# NOTE(review): TestTask() is built without a value for `x`, which is declared
# without a default - this likely fails at construction time; confirm.
if __name__ == '__main__':
    luigi.run_task(TestTask(), local_scheduler=get_from_config('luigi_local_scheduler'), delete_all=False)
예제 #8
0
            self._plot_all_ROCs(fpr, tpr, roc_auc, lw)
        self._set_ROC_axis(lw, self.config['visualization']['plot_all_ROCs'])
        f = plt.gcf()
        plt.show()
        return f

    def run(self):
        """Print metrics for the test rows, plot the ROC curves and save the figure.

        Stores the test labels in ``self.y_true`` and the matching rows of the
        predictions in ``self.y_pred``, prints the metrics for them, computes
        the ROC data, plots it and writes the resulting figure to the task's
        output path.
        """
        task_inputs = self.get_task_inputs()
        dataset: DataSet = task_inputs['data']
        all_predictions = task_inputs['y_pred']
        test_rows = dataset._arr_indices_test

        # Only the rows listed in the data set's test indices are evaluated.
        self.y_true = dataset.y_test
        self.y_pred = all_predictions[test_rows]

        # Metrics.
        self.print_metrics(self.y_true, self.y_pred)

        # ROC data first, then the curves drawn from it.
        false_pos_rate, true_pos_rate, auc_scores = self.calculate_ROCs()
        roc_figure = self.plot_ROCs(false_pos_rate, true_pos_rate, auc_scores)

        roc_figure.savefig(self.output().path)
        # self.task_done()


# General TODO - add prior for questions
# Script entry point: when this module is executed directly, hand a
# PlotROCTask instance to luigi.run_task.
if __name__ == '__main__':
    luigi.run_task(PlotROCTask())