tag_scheme = parameters['tag_scheme']

# Load sentences
train_sentences = loader.load_sentences(opts.train, lower, zeros)
dev_sentences = loader.load_sentences(opts.dev, lower, zeros)
test_sentences = loader.load_sentences(opts.test, lower, zeros)

# Use selected tagging scheme (IOB / IOBES)
update_tag_scheme(train_sentences, tag_scheme)
update_tag_scheme(dev_sentences, tag_scheme)
update_tag_scheme(test_sentences, tag_scheme)

# Create a dictionary / mapping of words
# If we use pretrained embeddings, we add them to the dictionary.
if parameters['pre_emb']:
    dico_words_train = word_mapping(train_sentences, lower)[0]
    dico_words, word_to_id, id_to_word = augment_with_pretrained(
        dico_words_train.copy(),
        parameters['pre_emb'],
        list(itertools.chain.from_iterable(
            [[w[0] for w in s] for s in dev_sentences + test_sentences])
        ) if not parameters['all_emb'] else None
    )
else:
    dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
    dico_words_train = dico_words

# Create a dictionary and a mapping for chars / tags
dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)
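# --- Illustrative sketch (not taken from the repo above) ---
# update_tag_scheme rewrites the gold labels in place when 'iobes' is selected. A minimal
# IOB2 -> IOBES conversion could look like this; the helper name and the assumption that
# the input is already valid IOB2 are mine.
def iob2_to_iobes_sketch(tags):
    new_tags = []
    for i, tag in enumerate(tags):
        if tag == 'O':
            new_tags.append(tag)
        elif tag.startswith('B-'):
            # single-token entities become S-
            if i + 1 < len(tags) and tags[i + 1].startswith('I-'):
                new_tags.append(tag)
            else:
                new_tags.append('S-' + tag[2:])
        elif tag.startswith('I-'):
            # entity-final tokens become E-
            if i + 1 < len(tags) and tags[i + 1].startswith('I-'):
                new_tags.append(tag)
            else:
                new_tags.append('E-' + tag[2:])
        else:
            raise ValueError('Invalid IOB2 tag: %s' % tag)
    return new_tags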
tag_scheme = parameters['tag_scheme']

# Load sentences
train_sentences = loader.load_sentences(opts.train, zeros)
dev_sentences = loader.load_sentences(opts.dev, zeros)
test_sentences = loader.load_sentences(opts.test, zeros)

# Use selected tagging scheme (IOB / IOBES)
update_tag_scheme(train_sentences, tag_scheme)
update_tag_scheme(dev_sentences, tag_scheme)
update_tag_scheme(test_sentences, tag_scheme)

# Create a dictionary / mapping of words
# If we use pretrained embeddings, we add them to the dictionary.
if parameters['pre_emb']:
    dico_words_train = word_mapping(train_sentences, lower)[0]
    dico_words, word_to_id, id_to_word = augment_with_pretrained(
        dico_words_train.copy(),
        parameters['pre_emb'],
        list(itertools.chain.from_iterable(
            [[w[0] for w in s] for s in dev_sentences + test_sentences])
        ) if not parameters['all_emb'] else None
    )
else:
    dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
    dico_words_train = dico_words

# Create a dictionary and a mapping for chars / tags
dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)
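# --- Illustrative sketch (not the actual loader.py code) ---
# augment_with_pretrained is assumed to add words that have a pretrained vector (and,
# unless all_emb is set, also occur in dev/test) to the training dictionary, so they get
# their own embedding row instead of collapsing onto the unknown word. The text embedding
# file format (one word + vector per line) is an assumption.
import codecs

def augment_with_pretrained_sketch(dictionary, emb_path, words=None):
    pretrained = set()
    with codecs.open(emb_path, 'r', 'utf-8') as f:
        for line in f:
            parts = line.rstrip().split()
            if parts:
                pretrained.add(parts[0])
    # words is None -> keep every pretrained word; otherwise only dev/test words
    candidates = pretrained if words is None else [w for w in words if w in pretrained]
    for word in candidates:
        if word not in dictionary:
            dictionary[word] = 0
    return dictionary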
def runModelInLoop(dropout, char_dim, char_lstm_dim, word_dim, word_lstm_dim):
    # Results file
    resultsPath = "/Users/Ehsan/Documents/Ehsan_General/HMQ/HMQ_Projects/DNR2/COLING-2016-Code/i2b2-2010/results/"
    for u_dropout in dropout:
        for v_char_dim in char_dim:
            for w_char_lstm_dim in char_lstm_dim:
                for x_word_dim in word_dim:
                    for y_word_lstm_dim in word_lstm_dim:
                        for dataset in datasets:
                            print "+++++++++++++++"
                            print u_dropout, v_char_dim, w_char_lstm_dim, x_word_dim, y_word_lstm_dim, dataset
                            parameters['dropout'] = u_dropout
                            parameters['char_dim'] = v_char_dim
                            parameters['char_lstm_dim'] = w_char_lstm_dim
                            parameters['word_dim'] = x_word_dim
                            parameters['word_lstm_dim'] = y_word_lstm_dim

                            # If the dataset is i2b2-2010, use its predefined paths
                            if dataset == "i2b2-2010":
                                opts.train = i2b2BasePath + "train.txt"
                                opts.dev = i2b2BasePath + "dev.txt"
                                opts.test = i2b2BasePath + "test.txt"
                                resultsFile = resultsPath + "i2b2_2010_Results.txt"

                            # Initialize model
                            model = Model(parameters=parameters, models_path=models_path)
                            print "Model location: %s" % model.model_path

                            # Data parameters
                            lower = parameters['lower']
                            zeros = parameters['zeros']
                            tag_scheme = parameters['tag_scheme']

                            # Load sentences
                            train_sentences = loader.load_sentences(opts.train, lower, zeros)
                            dev_sentences = loader.load_sentences(opts.dev, lower, zeros)
                            test_sentences = loader.load_sentences(opts.test, lower, zeros)

                            # Use selected tagging scheme (IOB / IOBES)
                            update_tag_scheme(train_sentences, tag_scheme)
                            update_tag_scheme(dev_sentences, tag_scheme)
                            update_tag_scheme(test_sentences, tag_scheme)

                            # Create a dictionary / mapping of words
                            # If we use pretrained embeddings, we add them to the dictionary.
                            if parameters['pre_emb']:
                                dico_words_train = word_mapping(train_sentences, lower)[0]
                                dico_words, word_to_id, id_to_word = augment_with_pretrained(
                                    dico_words_train.copy(),
                                    parameters['pre_emb'],
                                    list(itertools.chain.from_iterable(
                                        [[w[0] for w in s] for s in dev_sentences + test_sentences])
                                    ) if not parameters['all_emb'] else None
                                )
                            else:
                                dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
                                dico_words_train = dico_words

                            # Create a dictionary and a mapping for chars / tags
                            dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
                            dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

                            print "Calling prepare_dataset..."

                            # Index data
                            train_data = prepare_dataset(
                                train_sentences, word_to_id, char_to_id, tag_to_id, lower
                            )
                            dev_data = prepare_dataset(
                                dev_sentences, word_to_id, char_to_id, tag_to_id, lower
                            )
                            test_data = prepare_dataset(
                                test_sentences, word_to_id, char_to_id, tag_to_id, lower
                            )

                            print "%i / %i / %i sentences in train / dev / test." % (
                                len(train_data), len(dev_data), len(test_data))

                            # Save the mappings to disk
                            print 'Saving the mappings to disk...'
                            model.save_mappings(id_to_word, id_to_char, id_to_tag)

                            # Build the model
                            f_train, f_eval = model.build(**parameters)

                            # Reload previous model values
                            if opts.reload:
                                print 'Reloading previous model...'
                                model.reload()

                            # Train network
                            # Words seen only once in training; create_input uses this set
                            # to occasionally replace them with the unknown-word id.
                            singletons = set([word_to_id[k] for k, v
                                              in dico_words_train.items() if v == 1])
                            n_epochs = 2      # number of epochs over the training set
                            freq_eval = 1000  # evaluate on dev every freq_eval steps
                            best_dev = -np.inf
                            best_test = -np.inf
                            count = 0
                            for epoch in xrange(n_epochs):
                                epoch_costs = []
                                print "Starting epoch %i..." % epoch
                                for i, index in enumerate(np.random.permutation(len(train_data))):
                                    count += 1
                                    input = create_input(train_data[index], parameters, True, singletons)
                                    new_cost = f_train(*input)
                                    epoch_costs.append(new_cost)
                                    # if i % 50 == 0 and i > 0:
                                    #     print "%i, cost average: %f" % (i, np.mean(epoch_costs[-50:]))
                                    if count % freq_eval == 0:
                                        dev_score = evaluate(parameters, f_eval, dev_sentences,
                                                             dev_data, id_to_tag, dico_tags)
                                        test_score = evaluate(parameters, f_eval, test_sentences,
                                                              test_data, id_to_tag, dico_tags)
                                        print "Score on dev: %.5f" % dev_score
                                        print "Score on test: %.5f" % test_score
                                        if dev_score > best_dev:
                                            best_dev = dev_score
                                            print "New best score on dev: " + str(best_dev)
                                            # print "Saving model to disk..."
                                            # model.save()
                                        if test_score > best_test:
                                            best_test = test_score
                                            print "New best score on test: " + str(best_test)
                                print "Epoch %i done. Average cost: %f" % (epoch, np.mean(epoch_costs))

                            # Write the best dev and test scores to the results file
                            del model
                            with open(resultsFile, 'a') as f:
                                f.write("dropout: " + str(parameters['dropout']) +
                                        " | char_dim: " + str(parameters['char_dim']) +
                                        " | char_lstm_dim: " + str(parameters['char_lstm_dim']) +
                                        " | word_dim: " + str(parameters['word_dim']) +
                                        " | word_lstm_dim: " + str(parameters['word_lstm_dim']) +
                                        " | Best Dev Score: " + str(best_dev) +
                                        " | Best Test Score: " + str(best_test) + "\n")
    return
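# --- Illustrative sketch ---
# create_input(..., singletons) above relies on the set of words that occur exactly once in
# the training data. A common trick (and the assumption here) is to swap such a word for the
# <UNK> id with some probability, so the unknown-word embedding actually gets trained.
import numpy as np

def insert_singletons_sketch(word_ids, singletons, unk_id, p=0.5):
    new_ids = []
    for wid in word_ids:
        if wid in singletons and np.random.uniform() < p:
            new_ids.append(unk_id)
        else:
            new_ids.append(wid)
    return new_ids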
lower = parameters["lower"] zeros = parameters["zeros"] tag_scheme = parameters["tag_scheme"] train_sentences = loader.load_sentences(opts.train, lower, zeros) dev_sentences = loader.load_sentences(opts.dev, lower, zeros) test_sentences = loader.load_sentences(opts.test, lower, zeros) test_train_sentences = loader.load_sentences(opts.test_train, lower, zeros) loader.update_tag_scheme(train_sentences, tag_scheme) loader.update_tag_scheme(dev_sentences, tag_scheme) loader.update_tag_scheme(test_sentences, tag_scheme) loader.update_tag_scheme(test_train_sentences, tag_scheme) dico_words_train = loader.word_mapping(train_sentences, lower)[0] dico_words, word_to_id, id_to_word = loader.augment_with_pretrained( dico_words_train.copy(), parameters["pre_emb"], list(itertools.chain.from_iterable([[w[0] for w in s] for s in dev_sentences + test_sentences])) if not parameters["all_emb"] else None, ) dico_chars, char_to_id, id_to_char = loader.char_mapping(train_sentences) dico_tags, tag_to_id, id_to_tag = loader.tag_mapping(train_sentences) train_data = loader.prepare_dataset(train_sentences, word_to_id, char_to_id, tag_to_id, lower) dev_data = loader.prepare_dataset(dev_sentences, word_to_id, char_to_id, tag_to_id, lower) test_data = loader.prepare_dataset(test_sentences, word_to_id, char_to_id, tag_to_id, lower) test_train_data = loader.prepare_dataset(test_train_sentences, word_to_id, char_to_id, tag_to_id, lower)
def train(self, n_epochs=100, freq_eval=1000, verbose=True, eval_test_set=False):
    """
    :param n_epochs: number of epochs over the training set
    :param freq_eval: evaluate on dev every freq_eval steps
    :return: Saves the model with the best F1-score, evaluated on the dev set
    """
    # Initialize model
    model = Model(parameters=self.parameters, models_path=models_path)
    print("Model location: %s" % model.model_path)

    # Data parameters
    lower = self.parameters['lower']
    zeros = self.parameters['zeros']
    tag_scheme = self.parameters['tag_scheme']

    # Load sentences
    train_sentences = loader.load_sentences(self.parameters['train'], lower, zeros)
    dev_sentences = loader.load_sentences(self.parameters['dev'], lower, zeros)
    test_sentences = loader.load_sentences(self.parameters['test'], lower, zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, tag_scheme)
    update_tag_scheme(dev_sentences, tag_scheme)
    update_tag_scheme(test_sentences, tag_scheme)

    # Create a dictionary / mapping of words
    # If we use pretrained embeddings, we add them to the dictionary.
    if self.parameters['pre_emb']:
        dico_words_train = word_mapping(train_sentences, lower)[0]
        dico_words, word_to_id, id_to_word = augment_with_pretrained(
            dico_words_train.copy(),
            self.parameters['pre_emb'],
            list(itertools.chain.from_iterable(
                [[w[0] for w in s] for s in dev_sentences + test_sentences]))
            if not self.parameters['all_emb'] else None)
    else:
        dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
        dico_words_train = dico_words

    # Create a dictionary and a mapping for chars / tags
    dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
    dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

    # Index data
    train_data = prepare_dataset(train_sentences, word_to_id, char_to_id, tag_to_id, lower)
    dev_data = prepare_dataset(dev_sentences, word_to_id, char_to_id, tag_to_id, lower)
    test_data = prepare_dataset(test_sentences, word_to_id, char_to_id, tag_to_id, lower)

    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    # Save the mappings to disk
    print('Saving the mappings to disk...')
    model.save_mappings(id_to_word, id_to_char, id_to_tag)

    # Build the model
    f_train, f_eval = model.build(**self.parameters)

    # Reload previous model values
    if self.parameters['reload']:
        print('Reloading previous model...')
        model.reload()

    #
    # Train network
    #
    singletons = set([word_to_id[k] for k, v in dico_words_train.items() if v == 1])
    best_dev = -np.inf
    best_test = -np.inf
    count = 0
    for epoch in range(n_epochs):
        epoch_costs = []
        print("Starting epoch %i at..." % epoch, time.ctime())
        for i, index in enumerate(np.random.permutation(len(train_data))):
            count += 1
            input = create_input(train_data[index], self.parameters, True, singletons)
            new_cost = f_train(*input)
            epoch_costs.append(new_cost)
            if i % 50 == 0 and i > 0 and verbose:
                print("%i, cost average: %f" % (i, np.mean(epoch_costs[-50:])))
            if count % freq_eval == 0:
                dev_score = evaluate(self.parameters, f_eval, dev_sentences,
                                     dev_data, id_to_tag, verbose=verbose)
                if eval_test_set:
                    test_score = evaluate(self.parameters, f_eval, test_sentences,
                                          test_data, id_to_tag, verbose=verbose)
                print("Score on dev: %.5f" % dev_score)
                if eval_test_set:
                    print("Score on test: %.5f" % test_score)
                if dev_score > best_dev:
                    best_dev = dev_score
                    print("New best score on dev.")
                    print("Saving model to disk...")
                    model.save()
                if eval_test_set:
                    if test_score > best_test:
                        best_test = test_score
                        print("New best score on test.")
        print("Epoch %i done. Average cost: %f. Ended at..." %
              (epoch, np.mean(epoch_costs)), time.ctime())
    return best_dev
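# --- Illustrative sketch ---
# evaluate() above returns a single score used for model selection. These scripts reference
# an external CoNLL evaluation script; the metric it reports boils down to an entity-level
# F1 over (type, start, end) spans, roughly like this:
def span_f1_sketch(gold_spans, pred_spans):
    gold, pred = set(gold_spans), set(pred_spans)
    tp = len(gold & pred)
    precision = tp / float(len(pred)) if pred else 0.0
    recall = tp / float(len(gold)) if gold else 0.0
    if precision + recall == 0.0:
        return 0.0
    return 2.0 * precision * recall / (precision + recall)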
def main(argv=None):  # pylint: disable=unused-argument
    # if tf.gfile.Exists(FLAGS.eval_dir):
    #     tf.gfile.DeleteRecursively(FLAGS.eval_dir)
    # tf.gfile.MakeDirs(FLAGS.eval_dir)

    # Read parameters from command line
    opts = read_args(evaluation=True)

    # Parse parameters
    parameters = form_parameters_dict(opts)

    # Check parameters validity
    assert os.path.isfile(opts.train)
    assert os.path.isfile(opts.dev)
    assert os.path.isfile(opts.test)
    assert parameters['char_dim'] > 0 or parameters['word_dim'] > 0
    assert 0. <= parameters['dropout'] < 1.0
    assert parameters['t_s'] in ['iob', 'iobes']
    assert not parameters['all_emb'] or parameters['pre_emb']
    assert not parameters['pre_emb'] or parameters['word_dim'] > 0
    assert not parameters['pre_emb'] or os.path.isfile(parameters['pre_emb'])

    # Check evaluation script / folders
    if not os.path.isfile(eval_script):
        raise Exception('CoNLL evaluation script not found at "%s"' % eval_script)
    if not os.path.exists(eval_temp):
        os.makedirs(eval_temp)
    if not os.path.exists(models_path):
        os.makedirs(models_path)

    event_logs_path = os.path.join(eval_temp, "eval_logs")
    # if not os.path.exists(event_logs_path):
    #     os.makedirs(event_logs_path)

    # Initialize model
    model = MainTaggerModel(parameters=parameters, models_path=models_path,
                            overwrite_mappings=opts.overwrite_mappings)
    print "MainTaggerModel location: %s" % model.model_path

    # Data parameters
    lower = parameters['lower']
    zeros = parameters['zeros']
    tag_scheme = parameters['t_s']

    max_sentence_lengths = {}
    max_word_lengths = {}

    # Load sentences
    train_sentences, max_sentence_lengths['train'], max_word_lengths['train'] = \
        loader.load_sentences(opts.train, lower, zeros)
    dev_sentences, max_sentence_lengths['dev'], max_word_lengths['dev'] = \
        loader.load_sentences(opts.dev, lower, zeros)
    test_sentences, max_sentence_lengths['test'], max_word_lengths['test'] = \
        loader.load_sentences(opts.test, lower, zeros)

    global_max_sentence_length, global_max_char_length = \
        calculate_global_maxes(max_sentence_lengths, max_word_lengths)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, tag_scheme)
    update_tag_scheme(dev_sentences, tag_scheme)
    update_tag_scheme(test_sentences, tag_scheme)

    # Create a dictionary / mapping of words
    # If we use pretrained embeddings, we add them to the dictionary.
    if parameters['pre_emb']:
        dico_words_train = word_mapping(train_sentences, lower)[0]
        dico_words, word_to_id, id_to_word = augment_with_pretrained(
            dico_words_train.copy(),
            parameters['pre_emb'],
            list(itertools.chain.from_iterable(
                [[w[0] for w in s] for s in dev_sentences + test_sentences])
            ) if not parameters['all_emb'] else None
        )
    else:
        dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
        dico_words_train = dico_words

    # Create a dictionary and a mapping for chars / tags
    dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
    dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

    if opts.overwrite_mappings:
        print 'Saving the mappings to disk...'
        model.save_mappings(id_to_word, id_to_char, id_to_tag)

    model.reload_mappings()

    # Index data
    train_buckets, train_stats, train_unique_words = prepare_dataset(
        train_sentences, word_to_id, char_to_id, tag_to_id,
        global_max_sentence_length, global_max_char_length, lower
    )
    dev_buckets, dev_stats, dev_unique_words = prepare_dataset(
        dev_sentences, word_to_id, char_to_id, tag_to_id,
        global_max_sentence_length, global_max_char_length, lower
    )
    test_buckets, test_stats, test_unique_words = prepare_dataset(
        test_sentences, word_to_id, char_to_id, tag_to_id,
        global_max_sentence_length, global_max_char_length, lower
    )

    print "%i / %i / %i sentences in train / dev / test." % (
        len(train_stats), len(dev_stats), len(test_stats))
    print "%i / %i / %i words in train / dev / test." % (
        sum([x[0] for x in train_stats]),
        sum([x[0] for x in dev_stats]),
        sum([x[0] for x in test_stats]))
    print "%i / %i / %i longest sentences in train / dev / test." % (
        max([x[0] for x in train_stats]),
        max([x[0] for x in dev_stats]),
        max([x[0] for x in test_stats]))
    print "%i / %i / %i shortest sentences in train / dev / test." % (
        min([x[0] for x in train_stats]),
        min([x[0] for x in dev_stats]),
        min([x[0] for x in test_stats]))

    for i, label in [[2, 'char']]:
        print "%i / %i / %i total %s in train / dev / test." % (
            sum([sum(x[i]) for x in train_stats]),
            sum([sum(x[i]) for x in dev_stats]),
            sum([sum(x[i]) for x in test_stats]),
            label)
        print "%i / %i / %i max. %s lengths in train / dev / test." % (
            max([max(x[i]) for x in train_stats]),
            max([max(x[i]) for x in dev_stats]),
            max([max(x[i]) for x in test_stats]),
            label)
        print "%i / %i / %i min. %s lengths in train / dev / test." % (
            min([min(x[i]) for x in train_stats]),
            min([min(x[i]) for x in dev_stats]),
            min([min(x[i]) for x in test_stats]),
            label)

    print "Max. sentence lengths: %s" % max_sentence_lengths
    print "Max. char lengths: %s" % max_word_lengths

    for label, bin_stats, n_unique_words in [
            ['train', train_stats, train_unique_words],
            ['dev', dev_stats, dev_unique_words],
            ['test', test_stats, test_unique_words]]:
        int32_items = len(train_stats) * (
            max_sentence_lengths[label] * (5 + max_word_lengths[label]) + 1)
        float32_items = n_unique_words * parameters['word_dim']
        total_size = int32_items + float32_items
        logging.info("Input ids size of the %s dataset is %d" % (label, int32_items))
        logging.info("Word embeddings (unique: %d) size of the %s dataset is %d" % (
            n_unique_words, label, float32_items))
        logging.info("Total size of the %s dataset is %d" % (label, total_size))

    batch_size = 5

    # Build the model
    cost, train_step, tag_scores, tag_ids, word_ids, \
        crf_transition_params, sentence_lengths, enqueue_op, placeholders = model.build(
            max_sentence_length_scalar=global_max_sentence_length,
            max_word_length_scalar=global_max_char_length,
            batch_size_scalar=batch_size,
            **parameters)

    FLAGS = tf.app.flags.FLAGS
    tf.app.flags.DEFINE_string('eval_dir', event_logs_path,
                               """Directory where to write event logs.""")
    tf.app.flags.DEFINE_string('eval_data', 'test',
                               """Either 'test' or 'train_eval'.""")
    tf.app.flags.DEFINE_string('checkpoint_dir', model.model_path,
                               """Directory where to read model checkpoints.""")
    tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 5,
                                """How often to run the eval.""")
    tf.app.flags.DEFINE_integer('num_examples', 10000,
                                """Number of examples to run.""")
    tf.app.flags.DEFINE_boolean('run_once', False,
                                """Whether to run eval only once.""")

    evaluate(model, dev_buckets, test_buckets, FLAGS, opts, id_to_tag, batch_size,
             placeholders, enqueue_op, tag_scores, tag_ids, word_ids,
             crf_transition_params, sentence_lengths, FLAGS.eval_dir, tag_scheme)
print opts.train

assert os.path.isfile(opts.train)
assert os.path.isfile(opts.dev)
assert os.path.isfile(opts.test)

# Data parameters
lower = parameters['lower']
zeros = parameters['zeros']
pos = parameters['use_pos']

# Load sentences
train_sentences = loader.load_sentences(opts.train)
dev_sentences = loader.load_sentences(opts.dev)
test_sentences = loader.load_sentences(opts.test)

dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower, zeros)
dico_words_train = dico_words

# Create a dictionary and a mapping for words / tags
dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

# Index data
train_data = prepare_dataset(train_sentences, word_to_id, char_to_id, tag_to_id, lower, zeros)
dev_data = prepare_dataset(dev_sentences, word_to_id, char_to_id, tag_to_id, lower, zeros)
test_data = prepare_dataset(test_sentences, word_to_id, char_to_id, tag_to_id, lower, zeros)

## Write to CRFPP feature file
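# --- Illustrative sketch ---
# The CRFPP export hinted at above would emit one token per line with tab-separated columns
# (word, optional POS, gold tag) and a blank line between sentences, which is the input
# format CRF++ expects. The exact column layout here is an assumption and has to match the
# CRF++ feature template used.
import codecs

def write_crfpp_file_sketch(sentences, path, use_pos=False):
    with codecs.open(path, 'w', 'utf-8') as f:
        for s in sentences:
            for w in s:
                if use_pos:
                    f.write("%s\t%s\t%s\n" % (w[0], w[1], w[-1]))
                else:
                    f.write("%s\t%s\n" % (w[0], w[-1]))
            f.write("\n")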