Example #1
0
File: pipeline.py — Project: theJasonFan/qb
    def run(self):
        """Publish the mapped QANTA question set as a SQLite database."""
        # Load the question list out of the mapped JSON dataset.
        with open(QANTA_MAPPED_DATASET_PATH) as dataset_file:
            questions = json.load(dataset_file)['questions']

        # Build the database under a scratch name first, then move it into
        # place so readers never observe a half-written database file.
        scratch_db = get_tmp_filename()
        questions_to_sqlite(questions, scratch_db)
        shell(f'mv {scratch_db} {QANTA_SQL_DATASET_PATH}')
Example #2
0
File: pipeline.py — Project: Pinafore/qb
    def run(self):
        """Convert the mapped QANTA JSON dataset into the SQLite dataset.

        Reads questions from QANTA_MAPPED_DATASET_PATH, writes them into a
        temporary SQLite file, then moves that file to
        QANTA_SQL_DATASET_PATH so the destination only ever holds a
        complete database.
        """
        with open(QANTA_MAPPED_DATASET_PATH) as f:
            qanta_questions = json.load(f)['questions']

        tmp_db = get_tmp_filename()
        questions_to_sqlite(qanta_questions, tmp_db)
        shell(f'mv {tmp_db} {QANTA_SQL_DATASET_PATH}')
Example #3
0
File: cli.py — Project: nhatsmrt/qb
def slurm(
    partition,
    qos,
    mem_per_cpu,
    max_time,
    nodelist,
    cpus_per_task,
    luigi_module,
    luigi_task,
):
    """Render the SLURM sbatch template for a luigi task and submit it.

    The rendered script is written to a temporary file, handed to
    `sbatch`, and the temporary file is removed afterwards.
    """
    jinja_env = Environment(loader=PackageLoader("qanta", "slurm/templates"))
    script_template = jinja_env.get_template("luigi-template.sh.jinja2")
    render_context = dict(
        luigi_module=luigi_module,
        luigi_task=luigi_task,
        partition=partition,
        qos=qos,
        mem_per_cpu=mem_per_cpu,
        max_time=max_time,
        nodelist=nodelist,
        cpus_per_task=cpus_per_task,
    )
    script_path = get_tmp_filename()
    with open(script_path, "w") as script_file:
        script_file.write(script_template.render(render_context))
    shell(f"sbatch {script_path}")
    shell(f"rm -f {script_path}")
Example #4
0
File: elmo.py — Project: ymedhat95/qb
    def train(self, training_data: TrainingData) -> None:
        """Train the ELMo-based classifier on quiz-bowl data.

        Preprocesses and batchifies the dataset, optimizes the classifier
        head plus the ELMo scalar-mix weights with Adam, and checkpoints
        the best model (by validation accuracy) to a temporary ``.pt``
        file until a stopping criterion fires.
        """
        x_train, y_train, x_val, y_val, vocab, class_to_i, i_to_class = preprocess_dataset(
            training_data)
        self.class_to_i = class_to_i
        self.i_to_class = i_to_class

        log.info('Batchifying data')
        train_batches = batchify(x_train, y_train, shuffle=True)
        val_batches = batchify(x_val, y_val, shuffle=False)
        self.model = ElmoModel(len(i_to_class), dropout=self.dropout)
        if CUDA:
            self.model = self.model.cuda()
        log.info(f'Parameters:\n{self.parameters()}')
        log.info(f'Model:\n{self.model}')
        # Only the classifier head and the ELMo scalar-mix weights are
        # handed to the optimizer; no other model parameters are updated.
        parameters = list(self.model.classifier.parameters())
        for mix in self.model.elmo._scalar_mixes:
            parameters.extend(list(mix.parameters()))
        self.optimizer = Adam(parameters)
        self.criterion = nn.CrossEntropyLoss()
        # mode='max': the scheduler monitors validation accuracy, where
        # higher is better, and cuts the LR after `patience` flat epochs.
        self.scheduler = lr_scheduler.ReduceLROnPlateau(self.optimizer,
                                                        patience=5,
                                                        verbose=True,
                                                        mode='max')
        temp_prefix = get_tmp_filename()
        self.model_file = f'{temp_prefix}.pt'
        manager = TrainingManager([
            BaseLogger(log_func=log.info),
            TerminateOnNaN(),
            EarlyStopping(monitor='test_acc', patience=10, verbose=1),
            MaxEpochStopping(100),
            ModelCheckpoint(create_save_model(self.model),
                            self.model_file,
                            monitor='test_acc')
        ])
        log.info('Starting training')
        # NOTE(review): `epoch` is incremented but never read in this block.
        epoch = 0
        while True:
            self.model.train()
            train_acc, train_loss, train_time = self.run_epoch(train_batches)
            # Reshuffle batch order between epochs.
            random.shuffle(train_batches)

            self.model.eval()
            test_acc, test_loss, test_time = self.run_epoch(val_batches,
                                                            train=False)

            # The manager aggregates the epoch stats and reports whether
            # any stopping criterion (NaN, early stop, max epochs) fired.
            stop_training, reasons = manager.instruct(train_time, train_loss,
                                                      train_acc, test_time,
                                                      test_loss, test_acc)

            if stop_training:
                log.info(' '.join(reasons))
                break
            else:
                # Step the LR scheduler on validation accuracy.
                self.scheduler.step(test_acc)
            epoch += 1
Example #5
0
File: elmo.py — Project: Pinafore/qb
    def train(self, training_data: TrainingData) -> None:
        """Train the ELMo-based classifier on quiz-bowl data.

        Only the classifier head and the ELMo scalar-mix weights are
        optimized (with Adam); the best model by validation accuracy is
        checkpointed to a temporary ``.pt`` file until a stopping
        criterion fires.
        """
        x_train, y_train, x_val, y_val, vocab, class_to_i, i_to_class = preprocess_dataset(training_data)
        self.class_to_i = class_to_i
        self.i_to_class = i_to_class

        log.info('Batchifying data')
        train_batches = batchify(x_train, y_train, shuffle=True)
        val_batches = batchify(x_val, y_val, shuffle=False)
        self.model = ElmoModel(len(i_to_class), dropout=self.dropout)
        if CUDA:
            self.model = self.model.cuda()
        log.info(f'Parameters:\n{self.parameters()}')
        log.info(f'Model:\n{self.model}')
        # Collect only the parameter groups that should be trained.
        parameters = list(self.model.classifier.parameters())
        for mix in self.model.elmo._scalar_mixes:
            parameters.extend(list(mix.parameters()))
        self.optimizer = Adam(parameters)
        self.criterion = nn.CrossEntropyLoss()
        # mode='max': the scheduler tracks validation accuracy (higher is better).
        self.scheduler = lr_scheduler.ReduceLROnPlateau(self.optimizer, patience=5, verbose=True, mode='max')
        temp_prefix = get_tmp_filename()
        self.model_file = f'{temp_prefix}.pt'
        manager = TrainingManager([
            BaseLogger(log_func=log.info), TerminateOnNaN(), EarlyStopping(monitor='test_acc', patience=10, verbose=1),
            MaxEpochStopping(100), ModelCheckpoint(create_save_model(self.model), self.model_file, monitor='test_acc')
        ])
        log.info('Starting training')
        # NOTE(review): `epoch` is incremented but never read in this block.
        epoch = 0
        while True:
            self.model.train()
            train_acc, train_loss, train_time = self.run_epoch(train_batches)
            # Reshuffle batch order between epochs.
            random.shuffle(train_batches)

            self.model.eval()
            test_acc, test_loss, test_time = self.run_epoch(val_batches, train=False)

            # The manager reports whether any stopping criterion fired.
            stop_training, reasons = manager.instruct(
                train_time, train_loss, train_acc,
                test_time, test_loss, test_acc
            )

            if stop_training:
                log.info(' '.join(reasons))
                break
            else:
                # Step the LR scheduler on validation accuracy.
                self.scheduler.step(test_acc)
            epoch += 1
Example #6
0
File: cli.py — Project: Pinafore/qb
def slurm(partition, qos, mem_per_cpu, max_time, nodelist, cpus_per_task, luigi_module, luigi_task):
    """Render the luigi SLURM template and submit it with sbatch.

    The rendered sbatch script is written to a temporary file, submitted,
    and then deleted.
    """
    env = Environment(loader=PackageLoader('qanta', 'slurm/templates'))
    template = env.get_template('luigi-template.sh.jinja2')
    sbatch_script = template.render({
        'luigi_module': luigi_module,
        'luigi_task': luigi_task,
        'partition': partition,
        'qos': qos,
        'mem_per_cpu': mem_per_cpu,
        'max_time': max_time,
        'nodelist': nodelist,
        'cpus_per_task': cpus_per_task
    })
    tmp_file = get_tmp_filename()
    with open(tmp_file, 'w') as f:
        f.write(sbatch_script)
    shell(f'sbatch {tmp_file}')
    shell(f'rm -f {tmp_file}')
Example #7
0
def slurm(partition, qos, mem_per_cpu, max_time, nodelist, cpus_per_task,
          luigi_module, luigi_task):
    """Render the luigi SLURM sbatch template and submit it.

    Writes the rendered script to a temporary file, runs `sbatch` on it,
    then deletes the temporary file.
    """
    jinja_env = Environment(loader=PackageLoader('qanta', 'slurm/templates'))
    sbatch_template = jinja_env.get_template('luigi-template.sh.jinja2')
    rendered = sbatch_template.render({
        'luigi_module': luigi_module,
        'luigi_task': luigi_task,
        'partition': partition,
        'qos': qos,
        'mem_per_cpu': mem_per_cpu,
        'max_time': max_time,
        'nodelist': nodelist,
        'cpus_per_task': cpus_per_task,
    })
    script_path = get_tmp_filename()
    with open(script_path, 'w') as script_file:
        script_file.write(rendered)
    shell(f'sbatch {script_path}')
    shell(f'rm -f {script_path}')
Example #8
0
File: rnn.py — Project: npow/qb
    def train(self, training_data):
        """Train the RNN guesser on the Quiz Bowl dataset.

        Loads torchtext-style iterators, builds an RnnModel over the text
        field's vocabulary, and trains with Adam until a TrainingManager
        stopping criterion fires, checkpointing the best model (by
        validation accuracy) to a temporary ``.pt`` file.
        """
        log.info('Loading Quiz Bowl dataset')
        train_iter, val_iter, dev_iter = QuizBowl.iters(
            batch_size=self.batch_size,
            lower=self.lowercase,
            use_wiki=self.use_wiki,
            n_wiki_sentences=self.n_wiki_sentences,
            replace_title_mentions=self.wiki_title_replace_token,
            sort_within_batch=True)
        log.info(f'Training Data={len(training_data[0])}')
        log.info(f'N Train={len(train_iter.dataset.examples)}')
        log.info(f'N Test={len(val_iter.dataset.examples)}')
        # Keep the dataset fields needed later for inference/serialization.
        fields: Dict[str, Field] = train_iter.dataset.fields
        self.page_field = fields['page']
        self.n_classes = len(self.ans_to_i)
        self.qanta_id_field = fields['qanta_id']
        self.emb_dim = 300

        self.text_field = fields['text']
        log.info(f'Text Vocab={len(self.text_field.vocab)}')

        log.info('Initializing Model')
        self.model = RnnModel(self.n_classes,
                              text_field=self.text_field,
                              emb_dim=self.emb_dim,
                              n_hidden_units=self.n_hidden_units,
                              n_hidden_layers=self.n_hidden_layers,
                              nn_dropout=self.nn_dropout)
        if CUDA:
            self.model = self.model.cuda()
        log.info(f'Parameters:\n{self.parameters()}')
        log.info(f'Model:\n{self.model}')
        self.optimizer = Adam(self.model.parameters())
        self.criterion = nn.CrossEntropyLoss()
        # mode='max': the scheduler monitors validation accuracy and cuts
        # the LR after `patience` epochs without improvement.
        self.scheduler = lr_scheduler.ReduceLROnPlateau(self.optimizer,
                                                        patience=5,
                                                        verbose=True,
                                                        mode='max')

        temp_prefix = get_tmp_filename()
        self.model_file = f'{temp_prefix}.pt'
        manager = TrainingManager([
            BaseLogger(log_func=log.info),
            TerminateOnNaN(),
            EarlyStopping(monitor='test_acc', patience=10, verbose=1),
            MaxEpochStopping(100),
            ModelCheckpoint(create_save_model(self.model),
                            self.model_file,
                            monitor='test_acc')
        ])

        log.info('Starting training')

        # NOTE(review): `epoch` is incremented but never read in this block.
        epoch = 0
        while True:
            self.model.train()
            train_acc, train_loss, train_time = self.run_epoch(train_iter)

            self.model.eval()
            test_acc, test_loss, test_time = self.run_epoch(val_iter)

            # The manager reports whether any stopping criterion fired.
            stop_training, reasons = manager.instruct(train_time, train_loss,
                                                      train_acc, test_time,
                                                      test_loss, test_acc)

            if stop_training:
                log.info(' '.join(reasons))
                break
            else:
                # Step the LR scheduler on validation accuracy.
                self.scheduler.step(test_acc)
            epoch += 1
Example #9
0
    def train(self, training_data: TrainingData) -> None:
        """Train a Vowpal Wabbit multiclass model on quiz-bowl questions.

        Flattens (question, answer) pairs into per-sentence VW-format
        examples, writes them to a temporary file, and shells out to `vw`
        with the configured hyperparameters. The temporary training file
        and VW's cache of it are removed even if `vw` fails.

        Raises:
            ValueError: if neither (or both XOR semantics are violated for)
                multiclass_online_trees / multiclass_one_against_all.
        """
        log.info(f"Config:\n{pformat(self.parameters())}")

        # Validate the multiclass configuration up front: the original code
        # checked this only after writing the temporary training file, so a
        # bad config raised ValueError and leaked an orphaned temp file.
        if self.multiclass_online_trees:
            multiclass_flag = "--log_multi"
        elif self.multiclass_one_against_all:
            multiclass_flag = "--oaa"
        else:
            raise ValueError(
                "The options multiclass_one_against_all and multiclass_online_trees are XOR"
            )

        questions = training_data[0]
        answers = training_data[1]

        # Flatten each question into one training row per sentence.
        x_data = []
        y_data = []
        for q, ans in zip(questions, answers):
            for sent in q:
                x_data.append(sent)
                y_data.append(ans)

        # VW multiclass labels are 1-based integers.
        label_set = set(answers)
        self.label_to_i = {label: i for i, label in enumerate(label_set, 1)}
        self.i_to_label = {i: label for label, i in self.label_to_i.items()}
        self.max_label = len(self.label_to_i)

        # Write shuffled examples in VW text format: "<label> |words <features>".
        temp_dir = get_tmp_dir()
        with tempfile.NamedTemporaryFile("w", delete=False, dir=temp_dir) as f:
            file_name = f.name
            zipped = list(zip(x_data, y_data))
            random.shuffle(zipped)
            for x, y in zipped:
                features = format_question(x)
                label = self.label_to_i[y]
                f.write("{label} |words {features}\n".format(
                    label=label, features=features))

        self.model_file = get_tmp_filename()
        options = [
            "vw",
            "-k",
            f"{multiclass_flag}",
            f"{self.max_label}",
            f"-d {file_name}",
            f"-f {self.model_file}.vw",
            "--loss_function logistic",
            "-c",
            f"--passes {self.passes}",
            f"-b {self.bits}",
            f"-l {self.learning_rate}",
            f"--decay_learning_rate {self.decay_learning_rate}",
            f"--random_seed {self.random_seed}",
        ]

        for n in self.ngrams:
            options.append(f"--ngram {n}")

        for n in self.skips:
            options.append(f"--skips {n}")

        # Regularization flags are only passed when non-zero.
        if self.l1 != 0:
            options.append(f"--l1 {self.l1}")

        if self.l2 != 0:
            options.append(f"--l2 {self.l2}")

        command = " ".join(options)
        log.info(f"Running:\n{command}")

        try:
            shell(command)
        finally:
            # Always clean up the training file and VW's cache of it.
            shell(f"rm -f {file_name} {file_name}.cache")
Example #10
0
File: vw.py — Project: Pinafore/qb
    def train(self, training_data: TrainingData) -> None:
        """Train a Vowpal Wabbit multiclass model on quiz-bowl questions.

        Flattens (question, answer) pairs into per-sentence VW-format
        examples in a temporary file, then shells out to `vw` with the
        configured hyperparameters; the temporary training file and its
        VW cache are removed even if `vw` fails.
        """
        log.info(f'Config:\n{pformat(self.parameters())}')
        questions = training_data[0]
        answers = training_data[1]

        # Flatten each question into one training row per sentence.
        x_data = []
        y_data = []
        for q, ans in zip(questions, answers):
            for sent in q:
                x_data.append(sent)
                y_data.append(ans)

        # VW multiclass labels are 1-based integers.
        label_set = set(answers)
        self.label_to_i = {label: i for i, label in enumerate(label_set, 1)}
        self.i_to_label = {i: label for label, i in self.label_to_i.items()}
        self.max_label = len(self.label_to_i)

        # Write shuffled examples in VW text format: "<label> |words <features>".
        temp_dir = get_tmp_dir()
        with tempfile.NamedTemporaryFile('w', delete=False, dir=temp_dir) as f:
            file_name = f.name
            zipped = list(zip(x_data, y_data))
            random.shuffle(zipped)
            for x, y in zipped:
                features = format_question(x)
                label = self.label_to_i[y]
                f.write('{label} |words {features}\n'.format(label=label, features=features))

        # NOTE(review): raising here leaks the temp file written above —
        # this validation could run before any work is done.
        if self.multiclass_online_trees:
            multiclass_flag = '--log_multi'
        elif self.multiclass_one_against_all:
            multiclass_flag = '--oaa'
        else:
            raise ValueError('The options multiclass_one_against_all and multiclass_online_trees are XOR')

        self.model_file = get_tmp_filename()
        options = [
            'vw',
            '-k',
            f'{multiclass_flag}',
            f'{self.max_label}',
            f'-d {file_name}',
            f'-f {self.model_file}.vw',
            '--loss_function logistic',
            '-c',
            f'--passes {self.passes}',
            f'-b {self.bits}',
            f'-l {self.learning_rate}',
            f'--decay_learning_rate {self.decay_learning_rate}',
            f'--random_seed {self.random_seed}'
        ]

        for n in self.ngrams:
            options.append(f'--ngram {n}')

        for n in self.skips:
            options.append(f'--skips {n}')

        # Regularization flags are only passed when non-zero.
        if self.l1 != 0:
            options.append(f'--l1 {self.l1}')

        if self.l2 != 0:
            options.append(f'--l2 {self.l2}')

        command = ' '.join(options)
        log.info(f'Running:\n{command}')

        try:
            shell(command)
        finally:
            # Always clean up the training file and VW's cache of it.
            shell(f'rm -f {file_name} {file_name}.cache')
Example #11
0
File: pipeline.py — Project: Pinafore/qb
 def run(self):
     """Download self.url and place the result at self.path."""
     download_target = get_tmp_filename()
     # Fetch into a scratch file first so self.path only ever holds a
     # complete download; the trailing rm is a no-op safety cleanup.
     shell(f'wget {self.url} -O {download_target}')
     shell(f'mv {download_target} {self.path}')
     shell(f'rm -f {download_target}')
Example #12
0
    def train(self, training_data):
        """Train the DAN guesser on the Quiz Bowl dataset.

        Loads torchtext-style iterators (optionally with combined/uni/bi/
        trigram fields), builds a DanModel, and trains with Adam until a
        TrainingManager stopping criterion fires, checkpointing the best
        model (by validation accuracy) to a temporary ``.pt`` file.
        """
        log.info("Loading Quiz Bowl dataset")
        train_iter, val_iter, dev_iter = QuizBowl.iters(
            batch_size=self.batch_size,
            lower=self.lowercase,
            use_wiki=self.use_wiki,
            n_wiki_sentences=self.n_wiki_sentences,
            replace_title_mentions=self.wiki_title_replace_token,
            combined_ngrams=self.combined_ngrams,
            unigrams=self.unigrams,
            bigrams=self.bigrams,
            trigrams=self.trigrams,
            combined_max_vocab_size=self.combined_max_vocab_size,
            unigram_max_vocab_size=self.unigram_max_vocab_size,
            bigram_max_vocab_size=self.bigram_max_vocab_size,
            trigram_max_vocab_size=self.trigram_max_vocab_size,
        )
        log.info(f"N Train={len(train_iter.dataset.examples)}")
        log.info(f"N Test={len(val_iter.dataset.examples)}")
        # Keep the dataset fields needed later for inference/serialization.
        fields: Dict[str, Field] = train_iter.dataset.fields
        self.page_field = fields["page"]
        self.n_classes = len(self.ans_to_i)
        self.qanta_id_field = fields["qanta_id"]
        self.emb_dim = 300

        # Only the fields that were actually configured are present.
        if "text" in fields:
            self.text_field = fields["text"]
            log.info(f"Text Vocab={len(self.text_field.vocab)}")
        if "unigram" in fields:
            self.unigram_field = fields["unigram"]
            log.info(f"Unigram Vocab={len(self.unigram_field.vocab)}")
        if "bigram" in fields:
            self.bigram_field = fields["bigram"]
            log.info(f"Bigram Vocab={len(self.bigram_field.vocab)}")
        if "trigram" in fields:
            self.trigram_field = fields["trigram"]
            log.info(f"Trigram Vocab={len(self.trigram_field.vocab)}")

        log.info("Initializing Model")
        self.model = DanModel(
            self.n_classes,
            text_field=self.text_field,
            unigram_field=self.unigram_field,
            bigram_field=self.bigram_field,
            trigram_field=self.trigram_field,
            emb_dim=self.emb_dim,
            n_hidden_units=self.n_hidden_units,
            n_hidden_layers=self.n_hidden_layers,
            nn_dropout=self.nn_dropout,
            pooling=self.pooling,
        )
        if CUDA:
            self.model = self.model.cuda()
        log.info(f"Parameters:\n{self.parameters()}")
        log.info(f"Model:\n{self.model}")
        self.optimizer = Adam(self.model.parameters())
        self.criterion = nn.CrossEntropyLoss()
        # mode="max": the scheduler monitors validation accuracy and cuts
        # the LR after `patience` epochs without improvement.
        self.scheduler = lr_scheduler.ReduceLROnPlateau(self.optimizer,
                                                        patience=5,
                                                        verbose=True,
                                                        mode="max")

        temp_prefix = get_tmp_filename()
        self.model_file = f"{temp_prefix}.pt"
        manager = TrainingManager([
            BaseLogger(log_func=log.info),
            TerminateOnNaN(),
            EarlyStopping(monitor="test_acc", patience=10, verbose=1),
            MaxEpochStopping(100),
            ModelCheckpoint(create_save_model(self.model),
                            self.model_file,
                            monitor="test_acc"),
        ])

        log.info("Starting training")

        # NOTE(review): `epoch` is incremented but never read in this block.
        epoch = 0
        while True:
            self.model.train()
            train_acc, train_loss, train_time = self.run_epoch(train_iter)

            self.model.eval()
            test_acc, test_loss, test_time = self.run_epoch(val_iter)

            # The manager reports whether any stopping criterion fired.
            stop_training, reasons = manager.instruct(train_time, train_loss,
                                                      train_acc, test_time,
                                                      test_loss, test_acc)

            if stop_training:
                log.info(" ".join(reasons))
                break
            else:
                # Step the LR scheduler on validation accuracy.
                self.scheduler.step(test_acc)
            epoch += 1
Example #13
0
File: pipeline.py — Project: theJasonFan/qb
 def run(self):
     """Download self.url to self.path.

     Fetches into a temporary file first, then moves it into place so
     self.path never holds a partial download; the final rm is a no-op
     cleanup in case the move did not happen.
     """
     tmp_file = get_tmp_filename()
     shell(f'wget {self.url} -O {tmp_file}')
     shell(f'mv {tmp_file} {self.path}')
     shell(f'rm -f {tmp_file}')
Example #14
0
File: dan.py — Project: Pinafore/qb
    def train(self, training_data):
        """Train the DAN guesser on the Quiz Bowl dataset.

        Loads torchtext-style iterators (optionally with combined/uni/bi/
        trigram fields), builds a DanModel, and trains with Adam until a
        TrainingManager stopping criterion fires, checkpointing the best
        model (by validation accuracy) to a temporary ``.pt`` file.
        """
        log.info('Loading Quiz Bowl dataset')
        train_iter, val_iter, dev_iter = QuizBowl.iters(
            batch_size=self.batch_size, lower=self.lowercase,
            use_wiki=self.use_wiki, n_wiki_sentences=self.n_wiki_sentences,
            replace_title_mentions=self.wiki_title_replace_token,
            combined_ngrams=self.combined_ngrams, unigrams=self.unigrams, bigrams=self.bigrams, trigrams=self.trigrams,
            combined_max_vocab_size=self.combined_max_vocab_size,
            unigram_max_vocab_size=self.unigram_max_vocab_size,
            bigram_max_vocab_size=self.bigram_max_vocab_size,
            trigram_max_vocab_size=self.trigram_max_vocab_size
        )
        log.info(f'N Train={len(train_iter.dataset.examples)}')
        log.info(f'N Test={len(val_iter.dataset.examples)}')
        # Keep the dataset fields needed later for inference/serialization.
        fields: Dict[str, Field] = train_iter.dataset.fields
        self.page_field = fields['page']
        self.n_classes = len(self.ans_to_i)
        self.qanta_id_field = fields['qanta_id']
        self.emb_dim = 300

        # Only the fields that were actually configured are present.
        if 'text' in fields:
            self.text_field = fields['text']
            log.info(f'Text Vocab={len(self.text_field.vocab)}')
        if 'unigram' in fields:
            self.unigram_field = fields['unigram']
            log.info(f'Unigram Vocab={len(self.unigram_field.vocab)}')
        if 'bigram' in fields:
            self.bigram_field = fields['bigram']
            log.info(f'Bigram Vocab={len(self.bigram_field.vocab)}')
        if 'trigram' in fields:
            self.trigram_field = fields['trigram']
            log.info(f'Trigram Vocab={len(self.trigram_field.vocab)}')

        log.info('Initializing Model')
        self.model = DanModel(
            self.n_classes,
            text_field=self.text_field,
            unigram_field=self.unigram_field, bigram_field=self.bigram_field, trigram_field=self.trigram_field,
            emb_dim=self.emb_dim,
            n_hidden_units=self.n_hidden_units, n_hidden_layers=self.n_hidden_layers,
            nn_dropout=self.nn_dropout,
            pooling=self.pooling
        )
        if CUDA:
            self.model = self.model.cuda()
        log.info(f'Parameters:\n{self.parameters()}')
        log.info(f'Model:\n{self.model}')
        self.optimizer = Adam(self.model.parameters())
        self.criterion = nn.CrossEntropyLoss()
        # mode='max': the scheduler tracks validation accuracy (higher is better).
        self.scheduler = lr_scheduler.ReduceLROnPlateau(self.optimizer, patience=5, verbose=True, mode='max')

        temp_prefix = get_tmp_filename()
        self.model_file = f'{temp_prefix}.pt'
        manager = TrainingManager([
            BaseLogger(log_func=log.info), TerminateOnNaN(), EarlyStopping(monitor='test_acc', patience=10, verbose=1),
            MaxEpochStopping(100), ModelCheckpoint(create_save_model(self.model), self.model_file, monitor='test_acc')
        ])

        log.info('Starting training')

        # NOTE(review): `epoch` is incremented but never read in this block.
        epoch = 0
        while True:
            self.model.train()
            train_acc, train_loss, train_time = self.run_epoch(train_iter)

            self.model.eval()
            test_acc, test_loss, test_time = self.run_epoch(val_iter)

            # The manager reports whether any stopping criterion fired.
            stop_training, reasons = manager.instruct(
                train_time, train_loss, train_acc,
                test_time, test_loss, test_acc
            )

            if stop_training:
                log.info(' '.join(reasons))
                break
            else:
                # Step the LR scheduler on validation accuracy.
                self.scheduler.step(test_acc)
            epoch += 1
Example #15
0
File: pipeline.py — Project: NPSDC/qb
 def run(self):
     """Fetch self.url and move the downloaded file to self.path."""
     staging_path = get_tmp_filename()
     # Download to a staging file so self.path only ever holds a complete
     # file; the trailing rm is a no-op safety cleanup after the move.
     shell(f"wget {self.url} -O {staging_path}")
     shell(f"mv {staging_path} {self.path}")
     shell(f"rm -f {staging_path}")