def get_dataset_from_path(path_dataset, vocab=None, shrink=1, char_based=False):
    """Load a train/test split stored in first-column-is-label format.

    Expects two files under ``path_dataset``: ``train`` and ``test``.

    NOTE(review): ``shrink`` is accepted but never forwarded to the reader
    here (unlike the sibling variant that calls ``read_other_dataset``) —
    confirm whether subsampling was intended for this format.
    NOTE(review): this file defines ``get_dataset_from_path`` twice; a later
    definition with the same name shadows this one at import time.

    :param path_dataset: directory containing 'train' and 'test' files
    :param vocab: token->id mapping passed through to transform_to_array
    :param shrink: unused in this variant (see note above)
    :param char_based: tokenize by characters instead of words
    :return: (train, test, vocab) with samples converted to arrays
    """
    path_train = os.path.join(path_dataset, 'train')
    path_test = os.path.join(path_dataset, 'test')
    raw_train = read_first_col_is_label_format(path_train, char_based=char_based)
    raw_test = read_first_col_is_label_format(path_test, char_based=char_based)
    return (transform_to_array(raw_train, vocab),
            transform_to_array(raw_test, vocab),
            vocab)
def get_dataset_from_path(path_dataset, vocab=None, shrink=1, char_based=False):
    """Load a train/test split via ``read_other_dataset``.

    Expects two files under ``path_dataset``: ``train`` and ``test``.

    NOTE(review): this redefines ``get_dataset_from_path`` and shadows the
    earlier first-column-is-label variant in the same module — likely one of
    the two should be renamed.

    :param path_dataset: directory containing 'train' and 'test' files
    :param vocab: token->id mapping passed through to transform_to_array
    :param shrink: subsampling factor forwarded to read_other_dataset
    :param char_based: tokenize by characters instead of words
    :return: (train, test, vocab) with samples converted to arrays
    """
    converted = []
    for split_name in ('train', 'test'):
        raw = read_other_dataset(
            os.path.join(path_dataset, split_name),
            shrink=shrink, char_based=char_based)
        converted.append(transform_to_array(raw, vocab))
    train, test = converted
    return train, test, vocab
def predict(model, sentence):
    """Classify a single sentence with a trained text classifier.

    :param model: a (classifier, vocab, setup) triple as produced at
        load time; ``setup`` must carry the 'char_based' flag used
        during training
    :param sentence: raw input text (leading/trailing whitespace ignored)
    :return: (answer, score) — predicted class index and its softmax
        probability as plain Python int/float
    """
    classifier, vocab, setup = model
    tokens = nlp_utils.split_text(
        nlp_utils.normalize_text(sentence.strip()),
        char_based=setup['char_based'])
    batch = nlp_utils.transform_to_array([tokens], vocab, with_label=False)
    # device=-1 keeps inference on CPU; todo use GPU
    batch = nlp_utils.convert_seq(batch, device=-1, with_label=False)
    # inference mode: no dropout, no gradient bookkeeping
    with chainer.using_config('train', False), chainer.no_backprop_mode():
        prob = classifier.predict(batch, softmax=True)[0]
    answer = int(classifier.xp.argmax(prob))
    return answer, float(prob[answer])
def get_vectors(model, sentences):
    """Encode a batch of sentences into fixed-size vectors.

    Each sentence is normalized, tokenized, and passed through the
    classifier's encoder one at a time; the encoder outputs are stacked
    into a single numpy array.

    :param model: a (classifier, vocab, setup) triple; ``setup`` must
        carry the 'char_based' flag used during training
    :param sentences: iterable of raw text strings
    :return: numpy array of shape (len(sentences), encoder_dim)
    """
    classifier, vocab, setup = model
    char_based = setup['char_based']
    encoded_rows = []
    for raw in sentences:
        tokens = nlp_utils.split_text(
            nlp_utils.normalize_text(raw.strip()),
            char_based=char_based)
        batch = nlp_utils.transform_to_array([tokens], vocab, with_label=False)
        # device=-1 keeps inference on CPU; todo use GPU
        batch = nlp_utils.convert_seq(batch, device=-1, with_label=False)
        # inference mode: no dropout, no gradient bookkeeping
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            encoded = classifier.encoder(batch)
        encoded_rows.append(encoded.data[0])
    return numpy.asarray(encoded_rows)
def get_result(self, embeddings, path_dataset, path_output='/tmp/text_classification/'):
    """Train a text classifier on the given dataset with the given embeddings
    and return the evaluation result.

    :param embeddings: embedding object exposing ``.matrix`` (numpy-like,
        rows are word vectors), ``.vocabulary.dic_words_ids`` (word->id dict)
        and ``.metadata``
    :param path_dataset: local dataset directory; if it contains an
        ``adapter.py`` file, that adapter is used to read the data,
        otherwise ``text_datasets.get_dataset_from_path`` is used
    :param path_output: directory for training artifacts (model snapshot,
        vocab, logs)
    :return: a single-element list with a result dict containing
        'experiment_setup', 'log', and 'result' (best validation accuracy)
    """
    self.out = path_output
    # embedding dimensionality defines the encoder unit size
    self.unit = embeddings.matrix.shape[1]
    if not os.path.isdir(path_output):
        os.makedirs(path_output)
    # TODO: move this to protonn ds management
    self.path_dataset = path_dataset
    # if self.path_dataset == 'dbpedia':
    #     train, test, vocab = text_datasets.get_dbpedia(
    #         char_based=self.char_based,
    #         vocab=embeddings.vocabulary.dic_words_ids,
    #         shrink=self.shrink)
    # elif self.path_dataset.startswith('imdb.'):
    #     train, test, vocab = text_datasets.get_imdb(
    #         fine_grained=self.path_dataset.endswith('.fine'),
    #         char_based=self.char_based,
    #         vocab=embeddings.vocabulary.dic_words_ids,
    #         shrink=self.shrink)
    # elif self.path_dataset in ['TREC', 'stsa.binary', 'stsa.fine',
    #                            'custrev', 'mpqa', 'rt-polarity', 'subj']:
    #     train, test, vocab = text_datasets.get_other_text_dataset(
    #         self.path_dataset,
    #         char_based=self.char_based,
    #         vocab=embeddings.vocabulary.dic_words_ids,
    #         shrink=self.shrink)
    # else:  # finallly, if file is not downloadable, load from local path
    print(path_dataset)
    # A dataset may ship its own reader as `adapter.py`; load it dynamically.
    path_adapter = os.path.join(path_dataset, "adapter.py")
    if os.path.isfile(path_adapter):
        spec = importlib.util.spec_from_file_location(
            "ds_adapter", path_adapter)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        adapter = module.Adapter()
        # adapter.read() is expected to return (train, test, _) with the
        # third element ignored; vocab always comes from the embeddings
        train, test, _ = adapter.read()
        vocab = embeddings.vocabulary.dic_words_ids
        train = nlp_utils.transform_to_array(train, vocab)
        test = nlp_utils.transform_to_array(test, vocab)
        # exit(0)
    else:
        train, test, vocab = text_datasets.get_dataset_from_path(
            path_dataset,
            vocab=embeddings.vocabulary.dic_words_ids,
            char_based=self.char_based,
            shrink=self.shrink)
    print('# cnt train samples: {}'.format(len(train)))
    print('# cnt test samples: {}'.format(len(test)))
    print('# size vocab: {}'.format(len(vocab)))
    # number of classes is inferred from the labels present in train;
    # NOTE(review): assumes every class occurs in the training split
    n_class = len(set([int(d[1]) for d in train]))
    print('# cnt classes: {}'.format(n_class))
    # print(train[0])
    # exit(0)
    train_iter = chainer.iterators.SerialIterator(train, self.batchsize)
    test_iter = chainer.iterators.SerialIterator(test, self.batchsize,
                                                 repeat=False, shuffle=False)

    # Setup a model
    # NOTE(review): no else-branch — an unknown self.model would raise
    # NameError on Encoder below
    if self.model == 'rnn':
        Encoder = nets.RNNEncoder
    elif self.model == 'cnn':
        Encoder = nets.CNNEncoder
    elif self.model == 'bow':
        Encoder = nets.BOWMLPEncoder
    encoder = Encoder(n_layers=self.layer, n_vocab=len(vocab),
                      n_units=self.unit, dropout=self.dropout,
                      wv=embeddings.matrix)
    model = nets.TextClassifier(encoder, n_class)
    if self.gpu >= 0:
        # Make a specified GPU current
        chainer.backends.cuda.get_device_from_id(self.gpu).use()
        model.to_gpu()  # Copy the model to the GPU

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(1e-4))

    # Set up a trainer
    updater = training.StandardUpdater(
        train_iter, optimizer,
        converter=nlp_utils.convert_seq, device=self.gpu)
    trainer = training.Trainer(updater, (self.epoch, 'epoch'), out=self.out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(
        test_iter, model,
        converter=nlp_utils.convert_seq, device=self.gpu))

    # Take a best snapshot (by validation accuracy, checked every epoch)
    record_trigger = training.triggers.MaxValueTrigger(
        'validation/main/accuracy', (1, 'epoch'))
    trainer.extend(extensions.snapshot_object(model, 'best_model.npz'),
                   trigger=record_trigger)

    # Write a log of evaluation statistics for each epoch
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.PrintReport([
        'epoch', 'main/loss', 'validation/main/loss',
        'main/accuracy', 'validation/main/accuracy', 'elapsed_time'
    ]))

    # Print a progress bar to stdout
    trainer.extend(extensions.ProgressBar())

    # Save vocabulary and model's setting
    # NOTE(review): self.out was already created via makedirs above, so
    # this isdir/mkdir check is redundant
    if not os.path.isdir(self.out):
        os.mkdir(self.out)
    vocab_path = os.path.join(self.out, 'vocab.json')
    with open(vocab_path, 'w') as f:
        json.dump(vocab, f)
    model_path = os.path.join(self.out, 'best_model.npz')
    # NOTE(review): experiment_setup ALIASES self.__dict__ (no copy) —
    # every key written below is also written onto the instance, and the
    # json.dump of self.__dict__ therefore includes them
    experiment_setup = self.__dict__
    # TODO: move all this to the parent class
    experiment_setup['task'] = "text classification"
    experiment_setup['vocab_path'] = vocab_path
    experiment_setup['model_path'] = model_path
    experiment_setup['n_class'] = n_class
    experiment_setup['datetime'] = self.current_datetime
    with open(os.path.join(self.out, 'args.json'), 'w') as f:
        json.dump(self.__dict__, f)

    # Run the training
    trainer.run()

    result = {}
    result['experiment_setup'] = experiment_setup
    result['experiment_setup']['default_measurement'] = 'accuracy'
    result['experiment_setup']['dataset'] = os.path.basename(
        os.path.normpath(path_dataset))
    result['experiment_setup']['method'] = self.model
    result['experiment_setup']['embeddings'] = embeddings.metadata
    result['log'] = load_json(os.path.join(self.out, 'log'))
    # TODO: old version was returning last test value, make a footnote
    # result['result'] = {"accuracy": result['log'][-1]['validation/main/accuracy']}
    # best validation accuracy across all epochs
    accuracy = max(_["validation/main/accuracy"] for _ in result['log'])
    result['result'] = {"accuracy": accuracy}
    return [result]