def load_model(model_path, wv):
    # `model_path` points at the args.json written by get_result(); it holds
    # the experiment setup, including paths to the vocab and trained weights.
    setup = json.load(open(model_path))
    vocab = json.load(open(setup['vocab_path']))
    n_class = setup['n_class']

    # Setup a model
    if setup['model'] == 'rnn':
        Encoder = nets.RNNEncoder
    elif setup['model'] == 'cnn':
        Encoder = nets.CNNEncoder
    elif setup['model'] == 'bow':
        Encoder = nets.BOWMLPEncoder
    encoder = Encoder(n_layers=setup['layer'],
                      n_vocab=len(vocab),
                      n_units=setup['unit'],
                      dropout=setup['dropout'],
                      wv=wv)
    model = nets.TextClassifier(encoder, n_class)
    chainer.serializers.load_npz(setup['model_path'], model)

    gpu = -1  # TODO: gpu id is hard-coded; make it configurable
    if gpu >= 0:
        # Make a specified GPU current
        chainer.backends.cuda.get_device_from_id(gpu).use()
        model.to_gpu()  # Copy the model to the GPU
    return model, vocab, setup
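
# Minimal usage sketch (illustrative, not part of the original module): the
# helper below and its argument names are assumptions. `path_args_json` is
# the args.json that get_result() writes, and `wv` is the same word-vector
# matrix the encoder was trained with.
def _demo_load_model(path_args_json, wv):
    model, vocab, setup = load_model(path_args_json, wv)
    # `setup` mirrors the dumped experiment settings, e.g. setup['model']
    # is one of {'rnn', 'cnn', 'bow'} and setup['n_class'] the label count.
    print(setup['model'], setup['n_class'], len(vocab))
    return model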

def get_result(self, embs, path_dataset,
               path_output='/tmp/text_classification/'):
    self.out = path_output
    self.unit = embs.matrix.shape[1]
    if not os.path.isdir(path_output):
        os.makedirs(path_output)

    # Load a dataset
    self.path_dataset = path_dataset
    if self.path_dataset == 'dbpedia':
        train, test, vocab = text_datasets.get_dbpedia(
            char_based=self.char_based,
            vocab=embs.vocabulary.dic_words_ids,
            shrink=self.shrink)
    elif self.path_dataset.startswith('imdb.'):
        train, test, vocab = text_datasets.get_imdb(
            fine_grained=self.path_dataset.endswith('.fine'),
            char_based=self.char_based,
            vocab=embs.vocabulary.dic_words_ids,
            shrink=self.shrink)
    elif self.path_dataset in ['TREC', 'stsa.binary', 'stsa.fine',
                               'custrev', 'mpqa', 'rt-polarity', 'subj']:
        train, test, vocab = text_datasets.get_other_text_dataset(
            self.path_dataset,
            char_based=self.char_based,
            vocab=embs.vocabulary.dic_words_ids,
            shrink=self.shrink)
    else:
        # Finally, if the dataset is not downloadable, load it from a
        # local path.
        train, test, vocab = text_datasets.get_dataset_from_path(
            path_dataset,
            vocab=embs.vocabulary.dic_words_ids,
            char_based=self.char_based,
            shrink=self.shrink)
    print('# train data: {}'.format(len(train)))
    print('# test data: {}'.format(len(test)))
    print('# vocab: {}'.format(len(vocab)))
    n_class = len(set([int(d[1]) for d in train]))
    print('# class: {}'.format(n_class))

    train_iter = chainer.iterators.SerialIterator(train, self.batchsize)
    test_iter = chainer.iterators.SerialIterator(test, self.batchsize,
                                                 repeat=False, shuffle=False)

    # Setup a model
    if self.model == 'rnn':
        Encoder = nets.RNNEncoder
    elif self.model == 'cnn':
        Encoder = nets.CNNEncoder
    elif self.model == 'bow':
        Encoder = nets.BOWMLPEncoder
    encoder = Encoder(n_layers=self.layer,
                      n_vocab=len(vocab),
                      n_units=self.unit,
                      dropout=self.dropout,
                      wv=embs.matrix)
    model = nets.TextClassifier(encoder, n_class)
    if self.gpu >= 0:
        # Make a specified GPU current
        chainer.backends.cuda.get_device_from_id(self.gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(1e-4))

    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer,
                                       converter=convert_seq,
                                       device=self.gpu)
    trainer = training.Trainer(updater, (self.epoch, 'epoch'), out=self.out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(
        extensions.Evaluator(test_iter, model,
                             converter=convert_seq, device=self.gpu))

    # Take a best snapshot
    record_trigger = training.triggers.MaxValueTrigger(
        'validation/main/accuracy', (1, 'epoch'))
    trainer.extend(extensions.snapshot_object(model, 'best_model.npz'),
                   trigger=record_trigger)

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss',
            'main/accuracy', 'validation/main/accuracy', 'elapsed_time'
        ]))

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())

    # Save vocabulary and model's setting
    if not os.path.isdir(self.out):
        os.mkdir(self.out)
    vocab_path = os.path.join(self.out, 'vocab.json')
    with open(vocab_path, 'w') as f:
        json.dump(vocab, f)
    model_path = os.path.join(self.out, 'best_model.npz')
    experiment_setup = self.__dict__
    experiment_setup['vocab_path'] = vocab_path
    experiment_setup['model_path'] = model_path
    experiment_setup['n_class'] = n_class
    experiment_setup['datetime'] = self.current_datetime
    with open(os.path.join(self.out, 'args.json'), 'w') as f:
        json.dump(self.__dict__, f)

    # Run the training
    trainer.run()

    result = {}
    result['experiment_setup'] = experiment_setup
    result['log'] = load_json(os.path.join(self.out, 'log'))
    result['result'] = result['log'][-1]['validation/main/accuracy']
    return result
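
# A minimal stand-in for the `embs` argument (a sketch under assumptions;
# the real object is constructed elsewhere in the repo). get_result() only
# relies on .matrix (a numpy array with one row of n_units floats per word)
# and .vocabulary.dic_words_ids (a word -> row-index dict).
class _DemoEmbeddings:
    class _Vocabulary:
        def __init__(self, words):
            self.dic_words_ids = {w: i for i, w in enumerate(words)}

    def __init__(self, words, matrix):
        self.matrix = matrix  # shape: (len(words), n_units)
        self.vocabulary = self._Vocabulary(words)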

def get_result(self, embeddings, path_dataset,
               path_output='/tmp/text_classification/'):
    self.out = path_output
    self.unit = embeddings.matrix.shape[1]
    if not os.path.isdir(path_output):
        os.makedirs(path_output)

    # TODO: move this to protonn ds management
    self.path_dataset = path_dataset
    # if self.path_dataset == 'dbpedia':
    #     train, test, vocab = text_datasets.get_dbpedia(
    #         char_based=self.char_based,
    #         vocab=embeddings.vocabulary.dic_words_ids,
    #         shrink=self.shrink)
    # elif self.path_dataset.startswith('imdb.'):
    #     train, test, vocab = text_datasets.get_imdb(
    #         fine_grained=self.path_dataset.endswith('.fine'),
    #         char_based=self.char_based,
    #         vocab=embeddings.vocabulary.dic_words_ids,
    #         shrink=self.shrink)
    # elif self.path_dataset in ['TREC', 'stsa.binary', 'stsa.fine',
    #                            'custrev', 'mpqa', 'rt-polarity', 'subj']:
    #     train, test, vocab = text_datasets.get_other_text_dataset(
    #         self.path_dataset,
    #         char_based=self.char_based,
    #         vocab=embeddings.vocabulary.dic_words_ids,
    #         shrink=self.shrink)
    # else:

    # Finally, if the dataset is not downloadable, load it from a local
    # path, either through a dataset-specific adapter.py or directly.
    print(path_dataset)
    path_adapter = os.path.join(path_dataset, "adapter.py")
    if os.path.isfile(path_adapter):
        spec = importlib.util.spec_from_file_location(
            "ds_adapter", path_adapter)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        adapter = module.Adapter()
        train, test, _ = adapter.read()
        vocab = embeddings.vocabulary.dic_words_ids
        train = nlp_utils.transform_to_array(train, vocab)
        test = nlp_utils.transform_to_array(test, vocab)
    else:
        train, test, vocab = text_datasets.get_dataset_from_path(
            path_dataset,
            vocab=embeddings.vocabulary.dic_words_ids,
            char_based=self.char_based,
            shrink=self.shrink)
    print('# cnt train samples: {}'.format(len(train)))
    print('# cnt test samples: {}'.format(len(test)))
    print('# size vocab: {}'.format(len(vocab)))
    n_class = len(set([int(d[1]) for d in train]))
    print('# cnt classes: {}'.format(n_class))

    train_iter = chainer.iterators.SerialIterator(train, self.batchsize)
    test_iter = chainer.iterators.SerialIterator(test, self.batchsize,
                                                 repeat=False, shuffle=False)

    # Setup a model
    if self.model == 'rnn':
        Encoder = nets.RNNEncoder
    elif self.model == 'cnn':
        Encoder = nets.CNNEncoder
    elif self.model == 'bow':
        Encoder = nets.BOWMLPEncoder
    encoder = Encoder(n_layers=self.layer,
                      n_vocab=len(vocab),
                      n_units=self.unit,
                      dropout=self.dropout,
                      wv=embeddings.matrix)
    model = nets.TextClassifier(encoder, n_class)
    if self.gpu >= 0:
        # Make a specified GPU current
        chainer.backends.cuda.get_device_from_id(self.gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(1e-4))

    # Set up a trainer
    updater = training.StandardUpdater(train_iter, optimizer,
                                       converter=nlp_utils.convert_seq,
                                       device=self.gpu)
    trainer = training.Trainer(updater, (self.epoch, 'epoch'), out=self.out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(
        extensions.Evaluator(test_iter, model,
                             converter=nlp_utils.convert_seq,
                             device=self.gpu))

    # Take a best snapshot
    record_trigger = training.triggers.MaxValueTrigger(
        'validation/main/accuracy', (1, 'epoch'))
    trainer.extend(extensions.snapshot_object(model, 'best_model.npz'),
                   trigger=record_trigger)

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())
    trainer.extend(
        extensions.PrintReport([
            'epoch', 'main/loss', 'validation/main/loss',
            'main/accuracy', 'validation/main/accuracy', 'elapsed_time'
        ]))

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())

    # Save vocabulary and model's setting
    if not os.path.isdir(self.out):
        os.mkdir(self.out)
    vocab_path = os.path.join(self.out, 'vocab.json')
    with open(vocab_path, 'w') as f:
        json.dump(vocab, f)
    model_path = os.path.join(self.out, 'best_model.npz')
    experiment_setup = self.__dict__
    # TODO: move all this to the parent class
    experiment_setup['task'] = "text classification"
    experiment_setup['vocab_path'] = vocab_path
    experiment_setup['model_path'] = model_path
    experiment_setup['n_class'] = n_class
    experiment_setup['datetime'] = self.current_datetime
    with open(os.path.join(self.out, 'args.json'), 'w') as f:
        json.dump(self.__dict__, f)

    # Run the training
    trainer.run()

    result = {}
    result['experiment_setup'] = experiment_setup
    result['experiment_setup']['default_measurement'] = 'accuracy'
    result['experiment_setup']['dataset'] = os.path.basename(
        os.path.normpath(path_dataset))
    result['experiment_setup']['method'] = self.model
    result['experiment_setup']['embeddings'] = embeddings.metadata
    result['log'] = load_json(os.path.join(self.out, 'log'))
    # TODO: the old version returned the last epoch's test value; note this
    # in a footnote.
    # result['result'] = {"accuracy": result['log'][-1]['validation/main/accuracy']}
    accuracy = max(_["validation/main/accuracy"] for _ in result['log'])
    result['result'] = {"accuracy": accuracy}
    return [result]
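
# A minimal sketch of the per-dataset adapter.py contract consumed above
# (inferred from this code, not a documented API, so treat the exact sample
# format as an assumption): the dataset directory may ship an adapter.py
# exposing an Adapter class whose read() returns (train, test, extra), where
# train and test are sequences of (tokens, label) pairs that
# nlp_utils.transform_to_array can map through the embeddings vocabulary;
# the third value is ignored here. This class would live in
# <dataset_dir>/adapter.py, not in this module.
class Adapter:
    def read(self):
        train = [(['a', 'good', 'movie'], 1),
                 (['a', 'bad', 'movie'], 0)]
        test = [(['good'], 1)]
        return train, test, None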