def get_scores(config, task, model_path, word_dict_path, label_dict_path, input_path,
               lower_case=True, allow_new_words=True, replace_vocab=True):
  with Timer('Data loading'):
    print('Task: {}'.format(task))
    print('Allow new words in test data: {}. Lower case words: {}'.format(
        allow_new_words, lower_case))

    # Load word and tag dictionary
    word_dict = Dictionary(unknown_token=UNKNOWN_TOKEN)
    label_dict = Dictionary()
    word_dict.load(word_dict_path)
    label_dict.load(label_dict_path)
    data = TaggerData(config, [], [], word_dict, label_dict, None, None)

    # Load test data.
    if task == 'srl':
      test_sentences, emb_inits, emb_shapes = reader.get_srl_test_data(
          input_path, config, data.word_dict, data.label_dict, lower_case,
          allow_new_words)
    else:
      test_sentences, emb_inits, emb_shapes = reader.get_postag_test_data(
          input_path, config, data.word_dict, data.label_dict, lower_case,
          allow_new_words)
    print('Read {} sentences.'.format(len(test_sentences)))

    # Add pre-trained embeddings for new words in the test data.
    # if allow_new_words:
    data.embedding_shapes = emb_shapes
    data.embeddings = emb_inits

    # Batching.
    test_data = data.get_test_data(test_sentences, batch_size=config.dev_batch_size)

  with Timer('Model building and loading'):
    model = BiLSTMTaggerModel(data, config=config, fast_predict=True)
    model.load(model_path, word_dict, replace_vocab)
    dist_function = model.get_distribution_function()

  with Timer('Running model'):
    scores = None
    for i, batched_tensor in enumerate(test_data):
      x, _, num_tokens, weights = batched_tensor
      p, sc = dist_function(x, weights)
      scores = numpy.concatenate((scores, sc), axis=0) if i > 0 else sc

  return scores, data, test_sentences, test_data
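
# Usage sketch for get_scores (a minimal example, not from this repo: the
# 'conll2012' paths and the sample input file below are assumptions):
#
#   config = configuration.get_config('conll2012/config')
#   scores, data, sentences, batches = get_scores(
#       config, 'srl',
#       model_path='conll2012/model.npz',
#       word_dict_path='conll2012/word_dict',
#       label_dict_path='conll2012/label_dict',
#       input_path='sample.txt')
#
# scores is the per-batch score arrays returned by the distribution function,
# concatenated along axis 0.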
def get_scores_ctx(config, task, model_path, gemb_model_path, word_dict_path,
                   label_dict_path, input_path):
  with Timer('Data loading'):
    print('Task: {}'.format(task))
    allow_new_words = False
    print('Allow new words in test data: {}'.format(allow_new_words))

    # Load word and tag dictionary
    word_dict = Dictionary(unknown_token=UNKNOWN_TOKEN)
    label_dict = Dictionary()
    word_dict.load(word_dict_path)
    label_dict.load(label_dict_path)
    data = TaggerData(config, [], [], word_dict, label_dict, None, None)

    # Load test data.
    if task == 'srl':
      test_sentences, emb_inits, emb_shapes = reader.get_srl_test_data_gemb(
          input_path, config, data.word_dict, data.label_dict, allow_new_words)
    else:
      test_sentences, emb_inits, emb_shapes = reader.get_postag_test_data(
          input_path, config, data.word_dict, data.label_dict, allow_new_words)
    print('Read {} sentences.'.format(len(test_sentences)))

    # Add pre-trained embeddings for new words in the test data.
    # if allow_new_words:
    data.embedding_shapes = emb_shapes
    data.embeddings = emb_inits

    # Batching.
    test_data = data.get_ctx_gemb_test_data(
        test_sentences, batch_size=config.dev_batch_size)

  with Timer('Model building and loading'):
    model = BiLSTMTaggerModel(data, config=config, fast_predict=True)
    model.load(model_path)
    model.add_ctx_gemb()
    model.gemb.load(gemb_model_path)
    ctx_emb_function = model.get_ctx_emb_function()
    dist_function = model.get_distribution_by_gemb_function()

  with Timer('Running model'):
    scores = None
    for i, batched_tensor in enumerate(test_data):
      x, _, oov_pos, num_tokens, weights = batched_tensor  # weights is the mask
      oov_pos = oov_pos[0]  # batch size must be 1
      gembedding, inputs_0 = ctx_emb_function(x, weights, oov_pos)
      inputs_0_new = replace_with_gemb(inputs_0, gembedding, oov_pos)
      p, sc = dist_function(inputs_0_new, weights)
      scores = numpy.concatenate((scores, sc), axis=0) if i > 0 else sc

  return scores, data, test_sentences, test_data
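
# Usage sketch for get_scores_ctx. It mirrors get_scores but additionally
# loads a trained GEMB module, which predicts embeddings for OOV positions
# from context; those predictions replace the original inputs via
# replace_with_gemb before scoring. The paths below are illustrative
# assumptions. Note that config.dev_batch_size must be 1 here, since the
# loop above unpacks oov_pos[0].
#
#   scores, data, sentences, batches = get_scores_ctx(
#       config, 'srl',
#       model_path='conll2012/model.npz',
#       gemb_model_path='conll2012/gemb.npz',
#       word_dict_path='conll2012/word_dict',
#       label_dict_path='conll2012/label_dict',
#       input_path='sample.txt')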
def load_model(model_path, model_type):
  config = configuration.get_config(os.path.join(model_path, 'config'))

  # Load word and tag dictionary
  word_dict = Dictionary(unknown_token=UNKNOWN_TOKEN)
  label_dict = Dictionary()
  word_dict.load(os.path.join(model_path, 'word_dict'))
  label_dict.load(os.path.join(model_path, 'label_dict'))
  data = TaggerData(config, [], [], word_dict, label_dict, None, None)

  if model_type == 'srl':
    test_sentences, emb_inits, emb_shapes = reader.get_srl_test_data(
        None, config, data.word_dict, data.label_dict, False)
  else:
    test_sentences, emb_inits, emb_shapes = reader.get_postag_test_data(
        None, config, data.word_dict, data.label_dict, False)

  data.embedding_shapes = emb_shapes
  data.embeddings = emb_inits
  model = BiLSTMTaggerModel(data, config=config, fast_predict=True)
  model.load(os.path.join(model_path, 'model.npz'))
  return model, data
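
# Usage sketch for load_model. It expects a model directory laid out the way
# train_tagger writes it (config, word_dict, label_dict, plus a model.npz
# checkpoint); the directory name is an assumption:
#
#   model, data = load_model('conll2012', 'srl')
#   dist_function = model.get_distribution_function()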
def train_tagger(args):
  config = configuration.get_config(args.config)
  i = 0
  global_step = 0
  epoch = 0
  train_loss = 0.0

  with Timer('Data loading'):
    vocab_path = args.vocab if args.vocab != '' else None
    label_path = args.labels if args.labels != '' else None
    gold_props_path = args.gold if args.gold != '' else None

    print('Task: {}'.format(args.task))
    if args.task == 'srl':
      # Data and evaluator for SRL.
      data = TaggerData(
          config,
          *reader.get_srl_data(config, args.train, args.dev, vocab_path, label_path))
      evaluator = SRLEvaluator(
          data.get_development_data(),
          data.label_dict,
          gold_props_file=gold_props_path,
          use_se_marker=config.use_se_marker,
          pred_props_file=None,
          word_dict=data.word_dict)
    else:
      # Data and evaluator for PropId.
      data = TaggerData(
          config,
          *reader.get_postag_data(config, args.train, args.dev, vocab_path, label_path))
      evaluator = PropIdEvaluator(data.get_development_data(), data.label_dict)

    batched_dev_data = data.get_development_data(batch_size=config.dev_batch_size)
    print('Dev data has {} batches.'.format(len(batched_dev_data)))

  with Timer('Preparation'):
    if not os.path.isdir(args.model):
      print('Directory {} does not exist. Creating new.'.format(args.model))
      os.makedirs(args.model)
    elif len(os.listdir(args.model)) > 0:
      print('[WARNING] Log directory {} is not empty, previous checkpoints '
            'might be overwritten'.format(args.model))
    shutil.copyfile(args.config, os.path.join(args.model, 'config'))
    # Save word and label dict to model directory.
    data.word_dict.save(os.path.join(args.model, 'word_dict'))
    data.label_dict.save(os.path.join(args.model, 'label_dict'))
    writer = open(os.path.join(args.model, 'checkpoints.tsv'), 'w')
    writer.write('step\tdatetime\tdev_loss\tdev_accuracy\tbest_dev_accuracy\n')

  with Timer('Building model'):
    model = BiLSTMTaggerModel(data, config=config)
    for param in model.params:
      print(param, param.name, param.shape.eval())
    loss_function = model.get_loss_function()
    eval_function = model.get_eval_function()
    # Save an initial copy of the model.
    model.save(os.path.join(args.model, 'model'))

  while epoch < config.max_epochs:
    with Timer("Epoch%d" % epoch) as timer:
      train_data = data.get_training_data(include_last_batch=True)
      for batched_tensor in train_data:
        x, y, _, weights = batched_tensor
        loss = loss_function(x, weights, y)
        train_loss += loss
        i += 1
        global_step += 1
        if i % 400 == 0:
          timer.tick('{} training steps, loss={:.3f}'.format(i, train_loss / i))

    train_loss = train_loss / i
    print('Epoch {}, steps={}, loss={:.3f}'.format(epoch, i, train_loss))
    i = 0
    epoch += 1
    train_loss = 0.0

    if epoch % config.checkpoint_every_x_epochs == 0:
      with Timer('Evaluation'):
        evaluate_tagger(model, eval_function, batched_dev_data, evaluator,
                        writer, global_step)

  # Done. :)
  writer.close()
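
# Hedged sketch of a command-line entry point for train_tagger. The flag
# names mirror the attributes the function reads (args.config, args.model,
# args.train, args.dev, args.task, args.vocab, args.labels, args.gold);
# the defaults and help strings are assumptions, not the project's
# documented CLI.
if __name__ == '__main__':
  import argparse
  parser = argparse.ArgumentParser(description='Train a BiLSTM tagger.')
  parser.add_argument('--config', required=True, help='Path to config file.')
  parser.add_argument('--model', required=True, help='Output model directory.')
  parser.add_argument('--train', required=True, help='Training data path.')
  parser.add_argument('--dev', required=True, help='Development data path.')
  parser.add_argument('--task', default='srl', choices=['srl', 'propid'])
  parser.add_argument('--vocab', default='', help='Optional vocab path.')
  parser.add_argument('--labels', default='', help='Optional label dict path.')
  parser.add_argument('--gold', default='', help='Optional gold props path.')
  train_tagger(parser.parse_args())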