Example #1
def get_dataset_from_path(path_dataset, vocab=None, shrink=1,
                          char_based=False):
    # Load the raw train/test splits from the dataset directory
    train = read_first_col_is_label_format(
        os.path.join(path_dataset, 'train'), char_based=char_based)
    test = read_first_col_is_label_format(
        os.path.join(path_dataset, 'test'), char_based=char_based)

    # Map tokens to integer ids using the given vocabulary
    train = transform_to_array(train, vocab)
    test = transform_to_array(test, vocab)

    return train, test, vocab
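read_first_col_is_label_format is not shown above; judging by its name, it presumably parses lines whose first whitespace-separated column is the class label. A minimal sketch under that assumption (the delimiter and label type are guesses):

def read_first_col_is_label_format(path, char_based=False):
    # Assumed line format: "<label> <text ...>"; tokens are characters when
    # char_based is True, whitespace-separated words otherwise.
    dataset = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            label, _, text = line.strip().partition(' ')
            tokens = list(text) if char_based else text.split()
            dataset.append((tokens, int(label)))
    return dataset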
Example #2
def get_dataset_from_path(path_dataset,
                          vocab=None,
                          shrink=1,
                          char_based=False):
    train = read_other_dataset(os.path.join(path_dataset, 'train'),
                               shrink=shrink,
                               char_based=char_based)
    test = read_other_dataset(os.path.join(path_dataset, 'test'),
                              shrink=shrink,
                              char_based=char_based)

    train = transform_to_array(train, vocab)
    test = transform_to_array(test, vocab)

    return train, test, vocab
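A hypothetical call of get_dataset_from_path, assuming path_dataset contains plain 'train' and 'test' files and word_to_id is a token-to-id dict (for example the embeddings.vocabulary.dic_words_ids used in Example #5):

word_to_id = {'<unk>': 0, 'the': 1, 'movie': 2}  # toy vocabulary
train, test, vocab = get_dataset_from_path(
    'data/my_dataset', vocab=word_to_id, shrink=1, char_based=False)
print(len(train), len(test), len(vocab))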
Example #3
def predict(model, sentence):
    # model is packed as a (classifier, vocab, setup) tuple
    model, vocab, setup = model
    sentence = sentence.strip()
    text = nlp_utils.normalize_text(sentence)
    words = nlp_utils.split_text(text, char_based=setup['char_based'])
    xs = nlp_utils.transform_to_array([words], vocab, with_label=False)
    xs = nlp_utils.convert_seq(xs, device=-1, with_label=False)  # TODO: use GPU
    with chainer.using_config('train', False), chainer.no_backprop_mode():
        prob = model.predict(xs, softmax=True)[0]
    answer = int(model.xp.argmax(prob))
    score = float(prob[answer])
    return answer, score
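A hypothetical use of predict, assuming clf is a trained nets.TextClassifier, vocab is the dict saved to vocab.json and setup is the settings dict saved to args.json (both written in Example #5):

packed = (clf, vocab, setup)  # the tuple predict() expects
label, score = predict(packed, 'This movie was surprisingly good.')
print('predicted class {} with probability {:.3f}'.format(label, score))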
Example #4
def get_vectors(model, sentences):
    # model is the same packed (classifier, vocab, setup) tuple as in predict()
    model, vocab, setup = model
    vectors = []
    for sentence in sentences:
        sentence = sentence.strip()
        text = nlp_utils.normalize_text(sentence)
        words = nlp_utils.split_text(text, char_based=setup['char_based'])
        xs = nlp_utils.transform_to_array([words], vocab, with_label=False)
        xs = nlp_utils.convert_seq(xs, device=-1,
                                   with_label=False)  # TODO: use GPU
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            vector = model.encoder(xs)
            vectors.append(vector.data[0])
    vectors = numpy.asarray(vectors)
    return vectors
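A hypothetical use of get_vectors with the same packed tuple, e.g. comparing two sentences by the cosine similarity of their encoder outputs:

import numpy

vecs = get_vectors((clf, vocab, setup), ['a great film', 'an awful film'])
a, b = vecs
cosine = float(numpy.dot(a, b) /
               (numpy.linalg.norm(a) * numpy.linalg.norm(b)))
print('cosine similarity: {:.3f}'.format(cosine))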
Example #5
    def get_result(self,
                   embeddings,
                   path_dataset,
                   path_output='/tmp/text_classification/'):
        self.out = path_output
        self.unit = embeddings.matrix.shape[1]

        if not os.path.isdir(path_output):
            os.makedirs(path_output)

        # TODO: move this to protonn ds management
        self.path_dataset = path_dataset
        # if self.path_dataset == 'dbpedia':
        #     train, test, vocab = text_datasets.get_dbpedia(
        #         char_based=self.char_based,
        #         vocab=embeddings.vocabulary.dic_words_ids,
        #         shrink=self.shrink)
        # elif self.path_dataset.startswith('imdb.'):
        #     train, test, vocab = text_datasets.get_imdb(
        #         fine_grained=self.path_dataset.endswith('.fine'),
        #         char_based=self.char_based,
        #         vocab=embeddings.vocabulary.dic_words_ids,
        #         shrink=self.shrink)
        # elif self.path_dataset in ['TREC', 'stsa.binary', 'stsa.fine',
        #                            'custrev', 'mpqa', 'rt-polarity', 'subj']:
        #     train, test, vocab = text_datasets.get_other_text_dataset(
        #         self.path_dataset,
        #         char_based=self.char_based,
        #         vocab=embeddings.vocabulary.dic_words_ids,
        #         shrink=self.shrink)
        # else:  # finally, if the file is not downloadable, load from a local path
        print(path_dataset)
        path_adapter = os.path.join(path_dataset, "adapter.py")
        if os.path.isfile(path_adapter):
            spec = importlib.util.spec_from_file_location(
                "ds_adapter", path_adapter)
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
            adapter = module.Adapter()
            train, test, _ = adapter.read()
            vocab = embeddings.vocabulary.dic_words_ids
            train = nlp_utils.transform_to_array(train, vocab)
            test = nlp_utils.transform_to_array(test, vocab)

            # exit(0)
        else:
            train, test, vocab = text_datasets.get_dataset_from_path(
                path_dataset,
                vocab=embeddings.vocabulary.dic_words_ids,
                char_based=self.char_based,
                shrink=self.shrink)

        print('# cnt train samples: {}'.format(len(train)))
        print('# cnt test  samples: {}'.format(len(test)))
        print('# size vocab: {}'.format(len(vocab)))
        n_class = len(set([int(d[1]) for d in train]))
        print('# cnt classes: {}'.format(n_class))
        # print(train[0])
        # exit(0)

        train_iter = chainer.iterators.SerialIterator(train, self.batchsize)
        test_iter = chainer.iterators.SerialIterator(test,
                                                     self.batchsize,
                                                     repeat=False,
                                                     shuffle=False)

        # Setup a model
        if self.model == 'rnn':
            Encoder = nets.RNNEncoder
        elif self.model == 'cnn':
            Encoder = nets.CNNEncoder
        elif self.model == 'bow':
            Encoder = nets.BOWMLPEncoder
        else:
            raise ValueError('unknown model type: {}'.format(self.model))
        encoder = Encoder(n_layers=self.layer,
                          n_vocab=len(vocab),
                          n_units=self.unit,
                          dropout=self.dropout,
                          wv=embeddings.matrix)
        model = nets.TextClassifier(encoder, n_class)
        if self.gpu >= 0:
            # Make a specified GPU current
            chainer.backends.cuda.get_device_from_id(self.gpu).use()
            model.to_gpu()  # Copy the model to the GPU

        # Setup an optimizer
        optimizer = chainer.optimizers.Adam()
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.WeightDecay(1e-4))

        # Set up a trainer
        updater = training.StandardUpdater(train_iter,
                                           optimizer,
                                           converter=nlp_utils.convert_seq,
                                           device=self.gpu)
        trainer = training.Trainer(updater, (self.epoch, 'epoch'),
                                   out=self.out)

        # Evaluate the model with the test dataset for each epoch
        trainer.extend(
            extensions.Evaluator(test_iter,
                                 model,
                                 converter=nlp_utils.convert_seq,
                                 device=self.gpu))

        # Take a best snapshot
        record_trigger = training.triggers.MaxValueTrigger(
            'validation/main/accuracy', (1, 'epoch'))
        trainer.extend(extensions.snapshot_object(model, 'best_model.npz'),
                       trigger=record_trigger)

        # Write a log of evaluation statistics for each epoch
        trainer.extend(extensions.LogReport())
        trainer.extend(
            extensions.PrintReport([
                'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
                'validation/main/accuracy', 'elapsed_time'
            ]))

        # Print a progress bar to stdout
        trainer.extend(extensions.ProgressBar())

        # Save vocabulary and model's setting
        if not os.path.isdir(self.out):
            os.mkdir(self.out)
        vocab_path = os.path.join(self.out, 'vocab.json')
        with open(vocab_path, 'w') as f:
            json.dump(vocab, f)
        model_path = os.path.join(self.out, 'best_model.npz')
        experiment_setup = self.__dict__
        # TODO: move all this to the parent class
        experiment_setup['task'] = "text classification"
        experiment_setup['vocab_path'] = vocab_path
        experiment_setup['model_path'] = model_path
        experiment_setup['n_class'] = n_class
        experiment_setup['datetime'] = self.current_datetime
        with open(os.path.join(self.out, 'args.json'), 'w') as f:
            json.dump(self.__dict__, f)

        # Run the training
        trainer.run()

        result = {}
        result['experiment_setup'] = experiment_setup
        result['experiment_setup']['default_measurement'] = 'accuracy'
        result['experiment_setup']['dataset'] = os.path.basename(
            os.path.normpath(path_dataset))
        result['experiment_setup']['method'] = self.model
        result['experiment_setup']['embeddings'] = embeddings.metadata
        result['log'] = load_json(os.path.join(self.out, 'log'))

        # TODO: old version was returning last test value, make a footnote
        # result['result'] = {"accuracy": result['log'][-1]['validation/main/accuracy']}
        accuracy = max(entry["validation/main/accuracy"]
                       for entry in result['log'])
        result['result'] = {"accuracy": accuracy}
        return [result]
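The importlib branch in get_result expects an adapter.py inside path_dataset defining an Adapter class whose read() returns (train, test, _), where train and test are lists of (tokens, label) pairs compatible with nlp_utils.transform_to_array. A minimal sketch of such an adapter; the tab-separated train.tsv/test.tsv file names are assumptions:

# adapter.py (hypothetical), placed next to the dataset files
import os


class Adapter:
    def _read_split(self, split):
        path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            split + '.tsv')
        samples = []
        with open(path, encoding='utf-8') as f:
            for line in f:
                label, text = line.rstrip('\n').split('\t', 1)
                samples.append((text.split(), int(label)))
        return samples

    def read(self):
        # The caller ignores the third value and supplies its own vocabulary.
        return self._read_split('train'), self._read_split('test'), None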