def post(self): """ Parse multiple string and return the associated entity for each token in each string. """ args = self.parser.parse_args() ref_strings = args.get('strings') tokens = [[[token] for token in ref_string.split(" ")] for ref_string in ref_strings] data = prepare_dataset(tokens, current_app.word_to_id, current_app.char_to_id, {}, current_app.model.parameters['lower'], True) tagged = [] for index, datum in enumerate(data): model_inputs = create_input(datum, current_app.model.parameters, False) y_pred = np.array(current_app.inference[1](*model_inputs))[1:-1] tags = [ current_app.model.id_to_tag[y_pred[i]] for i in range(len(y_pred)) ] tagged.append([ Entity(term=term, entity=entity) for term, entity in zip(ref_strings[index].split(" "), tags) ]) response = ParseBatchResponse(reference_strings=ref_strings, data=tagged) return response
def training_loop():
    taskB_out_label = []
    for i, file in enumerate(taskB_in):
        output = []
        for j, sent in enumerate(file):
            with torch.no_grad():
                precheck_sent = utils.create_input(
                    taskB_in[i][j].split(), tokenizer).to(device)
                sent_out_label = model(precheck_sent)[1]
                sent_str_label = [biluo_decode[t] for t in sent_out_label]
                output.append(sent_str_label)
        taskB_out_label.append(output)

    true_pos = 0
    total_ph_pred = 0
    for i, file in enumerate(taskB_label):
        file_true_pos, file_total_ph_pred = eval_func(
            taskB_out_label[i],
            [["O"] + s.split() + ["O"] for s in taskB_label[i]])
        true_pos += file_true_pos
        total_ph_pred += file_total_ph_pred

    precision = 0
    recall = 0
    F1score = 0
    if total_ph_pred != 0:
        precision = true_pos / total_ph_pred
    if total_phrases_truth != 0:
        recall = true_pos / total_phrases_truth
    if (precision + recall) != 0:
        F1score = 2 * precision * recall / (precision + recall)
    print("Precision : {} | Recall : {} | F1 Score : {}".format(
        precision, recall, F1score))
def post(self): """ Parse a single string and return the associated entity for each token in the string. """ args = self.parser.parse_args() ref_string = args.get('string') if ref_string is None or ref_string == "": # Hackish way as reqparse can't catch empty string abort(400, description='string is empty or not provided.') tokens = ref_string.split(" ") data = prepare_dataset([[[token] for token in tokens]], current_app.word_to_id, current_app.char_to_id, {}, current_app.model.parameters['lower'], True) model_inputs = create_input(data[0], current_app.model.parameters, False) y_pred = np.array(current_app.inference[1](*model_inputs))[1:-1] tags = [ current_app.model.id_to_tag[y_pred[i]] for i in range(len(y_pred)) ] response = ParseResponse(reference_string=ref_string, data=[ Entity(term=term, entity=entity) for term, entity in zip(tokens, tags) ]) return response
def next_batch(self, bz):
    index = np.random.choice(len(self.memory), bz)
    memory = [self.memory[i] for i in index]
    state = [translate_state(i.get("state")) for i in memory]
    state_next = [translate_state(i.get("state_next")) for i in memory]
    action = [i.get("action") for i in memory]
    finish = [int(i.get("finish")) for i in memory]
    reward = [i.get("reward") for i in memory]
    result = {
        "state": create_input(state),
        "state_next": create_input(state_next),
        "action": action,
        "finish": finish,
        "reward": reward
    }
    return result
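For context, a batch sampled this way is typically consumed by a temporal-difference update. A minimal sketch of computing one-step TD targets from the returned dictionary, assuming PyTorch and a precomputed q_next tensor (both are assumptions; the class above does not fix a framework):

import torch

def td_targets(batch, q_next, gamma=0.99):
    # batch comes from next_batch(); q_next holds max_a Q(s', a) per sample,
    # however the downstream network computes it from create_input's output.
    reward = torch.as_tensor(batch["reward"], dtype=torch.float32)
    finish = torch.as_tensor(batch["finish"], dtype=torch.float32)
    # Terminal transitions (finish == 1) get no bootstrap term.
    return reward + gamma * q_next * (1.0 - finish)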
def validation_loop():
    valid_taskB_out_label = []
    for i, file in enumerate(valid_taskB_in):
        output = []
        for j, sent in enumerate(file):
            with torch.no_grad():
                precheck_sent = utils.create_input(
                    valid_taskB_in[i][j].split(), tokenizer).to(device)
                sent_out_label = model(precheck_sent)[1]
                sent_str_label = [biluo_decode[t] for t in sent_out_label]
                output.append(sent_str_label)
        valid_taskB_out_label.append(output)

    valid_true_pos = 0
    valid_total_ph_pred = 0
    for i, file in enumerate(valid_taskB_label):
        valid_file_true_pos, valid_file_total_ph_pred = eval_func(
            valid_taskB_out_label[i],
            [["O"] + s.split() + ["O"] for s in valid_taskB_label[i]])
        valid_true_pos += valid_file_true_pos
        valid_total_ph_pred += valid_file_total_ph_pred

    valid_precision = 0
    valid_recall = 0
    valid_F1score = 0
    if valid_total_ph_pred != 0:
        valid_precision = valid_true_pos / valid_total_ph_pred
    if valid_total_phrases_truth != 0:
        valid_recall = valid_true_pos / valid_total_phrases_truth
    if (valid_precision + valid_recall) != 0:
        valid_F1score = (2 * valid_precision * valid_recall
                         / (valid_precision + valid_recall))
    print("Precision : {} | Recall : {} | F1 Score : {}".format(
        valid_precision, valid_recall, valid_F1score))
def prepare_dataset(sentences, word_to_id, char_to_id, gazetteer_list,
                    brown_dict, tag_to_id, l1_model, l1_f_eval, lower=False):
    """
    Prepare the dataset. Return a list of dictionaries containing:
        - word indexes
        - word char indexes
        - tag indexes
    """
    def f(x):
        return x.lower() if lower else x

    data = []
    for s in sentences:
        str_words = [w[0] for w in s]
        words = [word_to_id[f(w) if f(w) in word_to_id else '<UNK>']
                 for w in str_words]
        # Skip characters that are not in the training set
        chars = [[char_to_id[c] for c in w if c in char_to_id]
                 for w in str_words]
        caps = [cap_feature(w) for w in str_words]
        gazetteer = [gazetteer_feature(w, gazetteer_list) for w in str_words]
        brown = [brown_feature(w, brown_dict) for w in str_words]
        sent = {
            'str_words': str_words,
            'words': words,
            'chars': chars,
            'caps': caps,
            'gazetteer': gazetteer,
            'brown': brown,
        }
        if l1_model is not None:
            input = create_input(sent, l1_model.parameters, False)
            try:
                if l1_model.parameters['crf']:
                    y_preds = np.array(l1_f_eval(*input))[1:-1]
                else:
                    y_preds = l1_f_eval(*input).argmax(axis=1)
                y_preds = [l1_model.id_to_tag[y_pred] for y_pred in y_preds]
            except Exception:
                # Fall back to all-O predictions if the L1 model fails
                y_preds = ["O"] * len(str_words)
            sent['pred'] = [0 if y_pred == "O" else 1 for y_pred in y_preds]
        tags = [tag_to_id[w[-1]] for w in s]
        sent['tags'] = tags
        data.append(sent)
    return data
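A toy invocation sketch for this prepare_dataset, assuming CoNLL-style (token, tag) sentences and that the helpers from the same module (cap_feature, gazetteer_feature, brown_feature) are importable; the literal mappings below are illustrative stand-ins, not real vocabularies:

sentences = [[["John", "B-PER"], ["lives", "O"], ["here", "O"]]]
word_to_id = {"john": 0, "lives": 1, "here": 2, "<UNK>": 3}
char_to_id = {c: i for i, c in enumerate("abcdefghijklmnopqrstuvwxyz")}
tag_to_id = {"B-PER": 0, "O": 1}

# With l1_model=None, no first-level predictions are attached.
data = prepare_dataset(sentences, word_to_id, char_to_id,
                       gazetteer_list={}, brown_dict={}, tag_to_id=tag_to_id,
                       l1_model=None, l1_f_eval=None, lower=True)
print(data[0]['words'], data[0]['tags'])  # e.g. [0, 1, 2] [0, 1, 1]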
def tag():
    if request.method == 'POST':
        data = request.get_json()
        text = data['text']
        if data['split_sentences']:
            sentences = split_sentences(text)
        else:
            sentences = text
        if data['tokenize'] or data['split_sentences']:
            tokenized_sentences = [tokenize(s) for s in sentences]
        else:
            tokenized_sentences = text
        count = 0
        output = []
        for words in tokenized_sentences:
            if len(words) == 0:
                continue
            # Lowercase sentence (the original referenced an undefined
            # 'line' here; the loop variable is 'words')
            if model.parameters['lower']:
                words = [w.lower() for w in words]
            # Replace all digits with zeros
            if model.parameters['zeros']:
                words = [zero_digits(w) for w in words]
            # Prepare input
            sentence = prepare_sentence(words, word_to_id, char_to_id,
                                        lower=model.parameters['lower'])
            input = create_input(sentence, model.parameters, False)
            # Decoding
            if model.parameters['crf']:
                y_preds = np.array(f_eval(*input))[1:-1]
            else:
                y_preds = f_eval(*input).argmax(axis=1)
            y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
            # Output tags in the IOB2 format
            if model.parameters['tag_scheme'] == 'iobes':
                y_preds = iobes_iob(y_preds)
            # Write tags
            assert len(y_preds) == len(words), \
                "Predictions have different length than sentence. Something went wrong."
            output.append(list(zip(words, y_preds)))
            count += 1
            if count % 100 == 0:
                logging.info(count)
        return jsonify(output)
def create_input(self):
    imgs = []
    wides = []
    for i, box in enumerate(self.all_after_row_connect):
        result = Box_Cell(box.bbox, self.img, self.types[i])
        if result.img_wide > self.max_wide:
            self.max_wide = result.img_wide
        wides.append(result.img_wide)
        imgs.append(result.normal_img)
        self.results.append(result)
    inputs, wides = utils.create_input(imgs, self.max_wide, wides)
    return inputs, wides
def tag(model, line):
    # Load existing model
    print("Loading model...")
    model = Model(model_path=model)
    parameters = model.parameters

    # Load reverse mappings
    word_to_id, char_to_id, tag_to_id = [
        {v: k for k, v in x.items()}
        for x in [model.id_to_word, model.id_to_char, model.id_to_tag]
    ]

    # Load the model
    _, f_eval = model.build(training=False, **parameters)
    model.reload()

    start = time.time()
    print('Tagging...')
    words_ini = line.rstrip().split()
    # Replace all digits with zeros
    if parameters['zeros']:
        line = zero_digits(line)
    words = line.rstrip().split()
    # Prepare input
    sentence = prepare_sentence(words, word_to_id, char_to_id,
                                lower=parameters['lower'])
    input = create_input(sentence, parameters, False)
    # Decoding
    if parameters['crf']:
        y_preds = np.array(f_eval(*input))[1:-1]
    else:
        y_preds = f_eval(*input).argmax(axis=1)
    y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
    # Output tags in the IOB2 format
    if parameters['tag_scheme'] == 'iobes':
        y_preds = iobes_iob(y_preds)
    # Write tags
    assert len(y_preds) == len(words)
    print('---- sentence tagged in %.4fs ----' % (time.time() - start))
    return ' '.join(w + '__' + str(y) for w, y in zip(words_ini, y_preds))
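A usage sketch for the standalone tagger above; the model path and the exact tags are illustrative, while the `word__TAG` output format follows directly from the return statement:

result = tag("models/english_ner", "Barack Obama visited Paris in 2009")
print(result)
# e.g. "Barack__B-PER Obama__I-PER visited__O Paris__B-LOC in__O 2009__O"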
def create_input(self):
    imgs = []
    wides = []

    def add_img(result):
        if result.img_wide > self.max_wide:
            self.max_wide = result.img_wide
        wides.append(result.img_wide)
        imgs.append(result.normal_img)

    for result in self.all_after_row_connect:
        add_img(result)
    inputs, wides = utils.create_input(imgs, self.max_wide, wides)
    return inputs, wides
def tag_document(doc, parameters, model, f_eval, word_to_id, char_to_id):
    count = 0
    all_ypreds = list()
    all_tokens = list()
    for line in doc.sentences:
        toks_text = [x.orth_ for x in line.tokens]
        # line = ' '.join(toks_text)
        # WL edit: used to be 'if line', was crashing on '\n' lines
        if toks_text:
            # Lowercase sentence
            if parameters['lower']:
                toks_text = [t.lower() for t in toks_text]
            # Replace all digits with zeros
            if parameters['zeros']:
                toks_text = [zero_digits(t) for t in toks_text]
            # Prepare input
            sentence = prepare_sentence(toks_text, word_to_id, char_to_id,
                                        lower=parameters['lower'])
            input = create_input(sentence, parameters, False)
            # Decoding
            if parameters['crf']:
                y_preds = np.array(f_eval(*input))[1:-1]
            else:
                y_preds = f_eval(*input).argmax(axis=1)
            y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
            # Output tags in the IOB2 format
            if parameters['tag_scheme'] == 'iobes':
                y_preds = iobes_iob(y_preds)
            # Write tags
            assert len(y_preds) == len(toks_text)
            # Strip IOB prefixes
            y_preds = [x.split('-')[-1] for x in y_preds]
            all_ypreds.append(y_preds)
            all_tokens.append(toks_text)
        count += 1
        if count % 100 == 0:
            print count
    return (all_ypreds, all_tokens)
def prepare_sentence(str_words, word_to_id, char_to_id, gazetteer_list={},
                     brown_dict={}, l1_model=None, l1_f_eval=None,
                     lower=False):
    """
    Prepare a sentence for evaluation.
    """
    def f(x):
        return x.lower() if lower else x

    words = [word_to_id[f(w) if f(w) in word_to_id else '<UNK>']
             for w in str_words]
    chars = [[char_to_id[c] for c in w if c in char_to_id] for w in str_words]
    caps = [cap_feature(w) for w in str_words]
    gazetteer = [gazetteer_feature(w, gazetteer_list) for w in str_words]
    brown = [brown_feature(w, brown_dict) for w in str_words]
    sent = {
        'str_words': str_words,
        'words': words,
        'chars': chars,
        'caps': caps,
        'gazetteer': gazetteer,
        'brown': brown
    }
    if l1_model is not None:
        input = create_input(sent, l1_model.parameters, False)
        try:
            if l1_model.parameters['crf']:
                y_preds = np.array(l1_f_eval(*input))[1:-1]
            else:
                y_preds = l1_f_eval(*input).argmax(axis=1)
            y_preds = [l1_model.id_to_tag[y_pred] for y_pred in y_preds]
        except Exception:
            # Fall back to all-O predictions if the L1 model fails
            y_preds = ["O"] * len(str_words)
        sent['pred'] = [0 if y_pred == "O" else 1 for y_pred in y_preds]
    return sent
def predicts(self, line):
    if line:
        # Save original bigrams
        bigram_sent = self.to_bigram(line, 0).strip().split()
        # Replace all digits with zeros
        line = zero_digits(line)
        input_seq = self.to_bigram(line, 0).strip().split()
        # Prepare input
        sentence = prepare_sentence(input_seq, self.word_to_id,
                                    self.char_to_id,
                                    lower=self.parameters['lower'])
        input = create_input(sentence, self.parameters, False)
        if self.parameters['crf']:
            y_preds = np.array(self.f_eval(*input))[1:-1]
        else:
            y_preds = self.f_eval(*input).argmax(axis=1)
        tags = [self.id_to_tag[y_pred] for y_pred in y_preds]
        # Output tags in the IOB2 format
        if self.parameters['tag_scheme'] == 'iobes':
            tags = iobes_iob(tags)
        print(tags)
        # Make output form
        out_form = ""
        unigram_sent = self.bigrams_to_unigrams(bigram_sent)
        for i in range(len(tags)):
            if tags[i].startswith('B'):
                out_form += '<' + unigram_sent[i]
            elif tags[i].startswith('I'):
                if i == len(tags) - 1:
                    out_form += unigram_sent[i] + ':' + tags[i][2:] + '>'
                elif tags[i + 1] == 'O':
                    out_form += unigram_sent[i] + ':' + tags[i][2:] + '>'
                else:
                    out_form += unigram_sent[i]
            else:
                out_form += unigram_sent[i]
        return out_form
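From the loop that builds out_form, a B- token opens a span with '<' and the last I- token of the span closes it with ':LABEL>'. A hedged usage sketch, assuming a wrapper object that exposes predicts() (the class name, model path, and input are all hypothetical):

tagger = BigramTagger(model_path="models/ner")  # hypothetical wrapper
print(tagger.predicts("a sentence mentioning New York"))
# A B-LOC/I-LOC span over "New York" would render as "<New York:LOC>".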
def parseString(self, string):
    # TODO: to be consumed by web-service
    test_file = "test_file"
    file = open(test_file, 'w')
    file.write('\n'.join(string.encode('utf-8').split()))
    file.close()
    test_sentences = load_sentences(test_file, self.lower, self.zeros)
    data = self.prepare_dataset(test_sentences)
    result = ''
    for citation in data:
        input = create_input(citation, self.model.parameters, False)
        y_pred = np.array(self.f[1](*input))[1:-1]
        tags = []
        for i in xrange(len(y_pred)):
            tags.append(self.model.id_to_tag[y_pred[i]])
        for num, word in enumerate(string.encode('utf-8').split()):
            # print word.decode('utf-8') + '\t' + tags[num]
            result += word.decode('utf-8') + '\t' + tags[num] + '\n'
    return result
def evaluate(ctx, model, env, rounds=5, print_action=False, save=None):
    env.reset_env()
    for epoch in range(rounds):
        env.reset_env()
        done = 0
        step = 0
        while not done:
            step += 1
            data = create_input([translate_state(env.map.state())])
            data = [torch.FloatTensor(i).to(ctx) for i in data]
            pred = model.forward(data)
            action = int(torch.argmax(pred).cpu().numpy())
            old, new, reward, done = env.step(action)
            if print_action:
                print(pred, reward, env.map.battery)
            if save is not None:
                img = Image.fromarray(env.map.render(), 'RGB')
                pred = [str(x)[0:5] for x in pred.detach().numpy().tolist()[0]]
                filename = ("torch-" + str(epoch) + "-" + str(step) + "-"
                            + str(reward) + "-" + "_".join(pred) + ".jpg")
                img.save(save + "/" + filename)
    return env.detect_rate
def evaluate(ctx, model, env, rounds=5, print_action=False, save=None):
    for epoch in range(rounds):
        env.reset_env()
        done = 0
        step = 0
        while not done:
            step += 1
            data = create_input([translate_state(env.map.state())])
            data = [nd.array(i, ctx=ctx) for i in data]
            pred = model(data)
            action = int(nd.argmax(pred, axis=1).asnumpy()[0])
            old, new, reward, done = env.step(action)
            if print_action:
                print(pred, reward, env.map.battery)
            if save is not None:
                img = Image.fromarray(
                    env.map.grid.render(20, env.map.agent_pos,
                                        env.map.agent_dir), 'RGB')
                pred = [str(x)[0:5] for x in pred.asnumpy().tolist()[0]]
                filename = (str(epoch) + "-" + str(step) + "-" + str(reward)
                            + "-" + "_".join(pred) + ".jpg")
                img.save(save + "/" + filename)
    return env.detect_rate
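A hedged invocation sketch for the MXNet variant above, assuming env and model are already constructed and that env.detect_rate is a list of per-round detection rates (an assumption; the function only returns the attribute):

import mxnet as mx

detect_rate = evaluate(mx.cpu(), model, env, rounds=10,
                       print_action=False, save=None)
print("mean detect rate:", sum(detect_rate) / max(len(detect_rate), 1))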
def evaluate(parameters, f_eval, raw_sentences, parsed_sentences,
             id_to_tag, input_file_path):
    """
    Evaluate current model using CoNLL script.
    """
    predictions = []
    for raw_sentence, data in zip(raw_sentences, parsed_sentences):
        input = create_input(data, parameters, False)
        if parameters['crf']:
            y_preds = np.array(f_eval(*input))[1:-1]
        else:
            y_preds = f_eval(*input).argmax(axis=1)
        p_tags = [id_to_tag[y_pred] for y_pred in y_preds]
        if parameters['tag_scheme'] == 'iobes':
            p_tags = iobes_iob(p_tags)
        for i, y_pred in enumerate(y_preds):
            new_line = "%s %s" % (raw_sentence[i][0], p_tags[i])
            predictions.append(new_line)
        predictions.append("")
    output_path = os.path.join(
        opts.output, os.path.basename(input_file_path[:-4] + "_Tagged.txt"))
    with codecs.open(output_path, 'w', 'utf8') as f:
        f.write("\n".join(predictions))
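A sketch of calling this evaluate; opts.output is a module-level options object in the original, and the file paths here are illustrative. Note the [:-4] slice assumes a three-letter extension, so "data/test.txt" yields a file named test_Tagged.txt:

evaluate(model.parameters, f_eval, test_sentences, test_data,
         model.id_to_tag, "data/test.txt")
# Writes <opts.output>/test_Tagged.txt, one "token TAG" pair per line,
# with a blank line between sentences.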
        if tags[i][0] == 'O':
            if len(preTag):
                res.append("</" + preTag + ">")
                preTag = ""
            res.append(sentence[i])
    if len(preTag):
        res.append("</" + preTag + ">")
    return res

print 'Tagging...'
for line in test_data:
    # Prepare input
    input = create_input(line, parameters, False,
                         useAttend=parameters['useAttend'])
    words = line['str_words']
    # Decoding
    if parameters['crf']:
        y_preds = np.array(f_eval(*input))
    else:
        y_preds = f_eval(*input).argmax(axis=1)
    y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
    # Output tags in the IOB2 format
    if parameters['tag_scheme'] == 'iobes':
        y_preds = iobes_iob(y_preds)
    # Write tags
    assert len(y_preds) == len(words)
    # print words
with codecs.open(opts.input, 'r', 'utf-8') as f_input:
    count = 0
    for line in f_input:
        words_ini = line.rstrip().split()
        if line:
            # Lowercase sentence
            if parameters['lower']:
                line = line.lower()
            # Replace all digits with zeros
            if parameters['zeros']:
                line = zero_digits(line)
            words = line.rstrip().split()
            # Prepare input
            sentence = prepare_sentence(words, word_to_id, char_to_id,
                                        lower=parameters['lower'])
            input = create_input(sentence, parameters, False)
            # Decoding
            if parameters['crf']:
                y_preds = np.array(f_eval(*input))[1:-1]
            else:
                y_preds = f_eval(*input).argmax(axis=1)
            y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
            # Output tags in the IOB2 format
            if parameters['tag_scheme'] == 'iobes':
                y_preds = iobes_iob(y_preds)
            # Write tags
            assert len(y_preds) == len(words)
            if opts.outputFormat == 'json':
                f_output.write(json.dumps({
                    "text": ' '.join(words),
                    "ranges": iob_ranges(y_preds)
                }))
            else:
#
# Train network
#
singletons = set([word_to_id[k] for k, v in dico_words_train.items()
                  if v == 1])
n_epochs = 100  # number of epochs over the training set
freq_eval = 1000  # evaluate on dev every freq_eval steps
best_dev = -np.inf
best_test = -np.inf
count = 0
for epoch in xrange(n_epochs):
    epoch_costs = []
    print "Starting epoch %i..." % epoch
    for i, index in enumerate(np.random.permutation(len(train_data))):
        count += 1
        input = create_input(train_data[index], parameters, True, singletons)
        new_cost = f_train(*input)
        epoch_costs.append(new_cost)
        if i % 50 == 0 and i > 0:
            print "%i, cost average: %f" % (i, np.mean(epoch_costs[-50:]))
        if count % freq_eval == 0:
            dev_score = evaluate(parameters, f_eval, dev_sentences,
                                 dev_data, id_to_tag, dico_tags)
            test_score = evaluate(parameters, f_eval, test_sentences,
                                  test_data, id_to_tag, dico_tags)
            print "Score on dev: %.5f" % dev_score
            print "Score on test: %.5f" % test_score
            if dev_score > best_dev:
                best_dev = dev_score
                print "New best score on dev."
                print "Saving model to disk..."
for line in f_input:
    words_ini = line.rstrip().split()
    if line:
        # Lowercase sentence
        if parameters['lower']:
            line = line.lower()
        # Replace all digits with zeros
        if parameters['zeros']:
            line = zero_digits(line)
        words = line.rstrip().split()
        # Prepare input
        sentence = prepare_sentence(words, word_to_id, char_to_id,
                                    lower=parameters['lower'])
        input = create_input(sentence, parameters, False)
        # Decoding
        if parameters['crf']:
            y_preds = np.array(f_eval(*input))[1:-1]
        else:
            y_preds = f_eval(*input).argmax(axis=1)
        y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
        # Output tags in the IOB2 format
        if parameters['tag_scheme'] == 'iobes':
            y_preds = iobes_iob(y_preds)
        # Write tags
        assert len(y_preds) == len(words)
        if opts.outputFormat == 'json':
            f_output.write(
                json.dumps({
def run_tagging(model, f_eval, parameters, word_to_id, char_to_id, tag_to_id,
                opts_input="", opts_output="", opts_delimiter="__",
                opts_outputFormat=""):
    # Check parameters validity
    assert opts_delimiter
    assert os.path.isfile(opts_input)

    # Set environment to use GPU
    f_output = codecs.open(opts_output, 'w', 'utf-8')
    start = time.time()
    logger.info('Tagging...')
    with codecs.open(opts_input, 'r', 'utf-8') as f_input:
        count = 0
        for line in f_input:
            words_ini = line.rstrip().split()
            if line:
                # Lowercase sentence
                if parameters['lower']:
                    line = line.lower()
                # Replace all digits with zeros
                if parameters['zeros']:
                    line = zero_digits(line)
                words = line.rstrip().split()
                # Prepare input
                sentence = prepare_sentence(words, word_to_id, char_to_id,
                                            lower=parameters['lower'])
                input = create_input(sentence, parameters, False)
                # Decoding
                if parameters['crf']:
                    y_preds = np.array(f_eval(*input))[1:-1]
                else:
                    y_preds = f_eval(*input).argmax(axis=1)
                y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
                # Output tags in the IOB2 format
                if parameters['tag_scheme'] == 'iobes':
                    y_preds = iobes_iob(y_preds)
                # Write tags
                assert len(y_preds) == len(words)
                if opts_outputFormat == 'json':
                    f_output.write(json.dumps({
                        "text": ' '.join(words),
                        "ranges": iob_ranges(y_preds)
                    }))
                else:
                    f_output.write('%s\n' % ' '.join(
                        '%s%s%s' % (w, opts_delimiter, y)
                        for w, y in zip(words_ini, y_preds)))
            else:
                f_output.write('\n')
            count += 1
            # if count % 100 == 0:
            #     logger.info(count)
    logger.info('---- %i lines tagged in %.4fs ----' %
                (count, time.time() - start))
    f_output.close()
    logger.info(opts_output)
    logger.info("")
    return opts_output + " has been tagged!"

# def main():
#     logger.info("executed")
#
# if __name__ == '__main__':
#     main()
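A usage sketch for run_tagging, assuming a model and its compiled f_eval are already loaded (the file names are illustrative):

msg = run_tagging(model, f_eval, parameters,
                  word_to_id, char_to_id, tag_to_id,
                  opts_input="input.txt", opts_output="tagged.txt",
                  opts_delimiter="__", opts_outputFormat="")
print(msg)  # "tagged.txt has been tagged!"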
#
# Train network
#
count = 0
fvs = []
words = []
for i, index in enumerate(np.random.permutation(len(train_data))):
    fv = []
    if train_data[index]['str_words'][0] in words:
        continue
    count += 1
    words.append(train_data[index]['str_words'][0])
    input = create_input(train_data[index], parameters, False)
    # Get gradients vector from model
    grads = f_eval(*input)
    grads_rev = f_eval_rev(*input)
    # Concatenate all gradients
    for grad in grads_rev:
        for g in grad:
            try:
                for s in g:
                    fv.append(s)
            except:
                fv.append(g)
    for grad in grads:
string = raw_input("Enter the citation string: ") strings = [string] test_file = "test_file" if os.path.exists(test_file): os.remove(test_file) file = open(test_file, 'a') for string in strings: file.write('\n'.join(string.split()) + '\n') file.close() test_sentences = load_sentences(test_file, lower, zeros) data = prepare_dataset(test_sentences, word_to_id, char_to_id, {}, lower, True) for citation in data: inputs = create_input(citation, model.parameters, False) y_pred = np.array(f[1](*inputs))[1:-1] tags = [model.id_to_tag[y_pred[i]] for i in range(len(y_pred))] output = [ w + '\t' + tags[i] for i, w in enumerate(citation['str_words']) ] if opts.run == 'file': with closing(open(output_file, 'w')) as fh: fh.write('\n'.join(output)) else: print('\n'.join(output)) if opts.run == 'file':
best_test = -np.inf
count = 0
# costfile = ('./cost_vec_' + str(parameters['word_dim']) + '_'
#             + str(parameters['word_hidden_dim'])
#             + str(parameters['L2_reg']) + '.txt')
# fw = codecs.open(costfile, 'w', 'utf-8')
# fw.write('epoch\t\ttrain_loss\t\tdev_loss\t\ttest_loss\t\tdev_F1\t\ttest_F1\n')
F1_file = ('./vec_' + opts.tagger + "gaze_" + opts.dictionary
           + str(parameters['use_gaze'])
           + "char" + str(parameters['char_dim'])
           + "_" + str(parameters['char_hidden_dim'])
           + "_word" + str(parameters['word_dim'])
           + "_" + str(parameters['word_hidden_dim'])
           + 'taggerhidden' + str(parameters['tagger_hidden_dim']) + '.txt')
fw = codecs.open(F1_file, 'w', 'utf-8')
fw.write("epoch\t\tdev_F1\t\ttest_F1\n")
for epoch in xrange(n_epochs):
    epoch_costs = []
    print "Starting epoch %i..." % epoch
    for i, index in enumerate(np.random.permutation(len(train_data))):
        count += 1
        input = create_input(train_data[index], parameters, True,
                             use_gaze, pos, singletons)
        new_cost = f_train(*input)
        epoch_costs.append(new_cost)
        if i % 50 == 0 and i > 0:
            print "%i, cost average: %f" % (i, np.mean(epoch_costs[-50:]))
        if count % freq_eval == 0:
            # train_score = evaluate(parameters, f_eval, train_sentences,
            #                        train_data, id_to_tag, dico_tags)
            # train_cost = []
            # for i, data in enumerate(train_data):
            #     input = create_input(data, parameters, True, use_gaze, pos, singletons)
            #     train_cost.append(f_plot_cost(*input))
            # dev_cost = []
            # for i, data in enumerate(dev_data):
            #     input = create_input(data, parameters, True, use_gaze, pos, singletons)
    hidden_layers=parameters['hidden_layer'],
    padding=parameters['padding'],
    max_seq_len=max_seq_len,
    train_size=len(train_data))

if parameters['reload']:
    gramcnn.load(models_path, model_name)

for epoch in xrange(n_epochs):
    epoch_costs = []
    print "Starting epoch %i..." % epoch
    for i, index in enumerate(np.random.permutation(len(train_data))):
        inputs, word_len = create_input(train_data[index], parameters, True,
                                        singletons,
                                        padding=parameters['padding'],
                                        max_seq_len=max_seq_len,
                                        use_pts=parameters['pts'])
        assert inputs['char_for']
        assert inputs['word']
        assert inputs['label']
        # break
        if len(inputs['label']) == 1:
            continue
        train_loss = []
        temp = []
        temp.append(word_len)
        batch_loss = gramcnn.train(inputs, temp)
def runModelInLoop(dropout, char_dim, char_lstm_dim, word_dim, word_lstm_dim):
    # Results file
    resultsPath = "/Users/Ehsan/Documents/Ehsan_General/HMQ/HMQ_Projects/DNR2/COLING-2016-Code/i2b2-2010/results/"
    for u_dropout in dropout:
        for v_char_dim in char_dim:
            for w_char_lstm_dim in char_lstm_dim:
                for x_word_dim in word_dim:
                    for y_word_lstm_dim in word_lstm_dim:
                        for dataset in datasets:
                            print "+++++++++++++++"
                            print u_dropout, v_char_dim, w_char_lstm_dim, x_word_dim, y_word_lstm_dim, dataset
                            parameters['dropout'] = u_dropout
                            parameters['char_dim'] = v_char_dim
                            parameters['char_lstm_dim'] = w_char_lstm_dim
                            parameters['word_dim'] = x_word_dim
                            parameters['word_lstm_dim'] = y_word_lstm_dim
                            # If dataset is i2b2-2010, assign the predefined paths
                            if dataset == "i2b2-2010":
                                opts.train = i2b2BasePath + "train.txt"
                                opts.dev = i2b2BasePath + "dev.txt"
                                opts.test = i2b2BasePath + "test.txt"
                                resultsFile = resultsPath + "i2b2_2010_Results.txt"
                            # Initialize model
                            model = Model(parameters=parameters, models_path=models_path)
                            print "Model location: %s" % model.model_path
                            # Data parameters
                            lower = parameters['lower']
                            zeros = parameters['zeros']
                            tag_scheme = parameters['tag_scheme']
                            # Load sentences
                            train_sentences = loader.load_sentences(opts.train, lower, zeros)
                            dev_sentences = loader.load_sentences(opts.dev, lower, zeros)
                            test_sentences = loader.load_sentences(opts.test, lower, zeros)
                            # Use selected tagging scheme (IOB / IOBES)
                            update_tag_scheme(train_sentences, tag_scheme)
                            update_tag_scheme(dev_sentences, tag_scheme)
                            update_tag_scheme(test_sentences, tag_scheme)
                            # Create a dictionary / mapping of words.
                            # If we use pretrained embeddings, we add them to the dictionary.
                            if parameters['pre_emb']:
                                dico_words_train = word_mapping(train_sentences, lower)[0]
                                dico_words, word_to_id, id_to_word = augment_with_pretrained(
                                    dico_words_train.copy(),
                                    parameters['pre_emb'],
                                    list(itertools.chain.from_iterable(
                                        [[w[0] for w in s] for s in dev_sentences + test_sentences])
                                    ) if not parameters['all_emb'] else None
                                )
                            else:
                                dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
                                dico_words_train = dico_words
                            # Create a dictionary and a mapping for words / POS tags / tags
                            dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
                            dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)
                            print "Calling the prepare_dataset :--"
                            # Index data
                            train_data = prepare_dataset(train_sentences, word_to_id,
                                                         char_to_id, tag_to_id, lower)
                            dev_data = prepare_dataset(dev_sentences, word_to_id,
                                                       char_to_id, tag_to_id, lower)
                            test_data = prepare_dataset(test_sentences, word_to_id,
                                                        char_to_id, tag_to_id, lower)
                            print "%i / %i / %i sentences in train / dev / test." % (
                                len(train_data), len(dev_data), len(test_data))
                            # Save the mappings to disk
                            print 'Saving the mappings to disk...'
                            model.save_mappings(id_to_word, id_to_char, id_to_tag)
                            # Build the model
                            f_train, f_eval = model.build(**parameters)
                            # Reload previous model values
                            if opts.reload:
                                print 'Reloading previous model...'
                                model.reload()
                            #
                            # Train network
                            #
                            singletons = set([word_to_id[k]
                                              for k, v in dico_words_train.items() if v == 1])
                            n_epochs = 2  # number of epochs over the training set
                            freq_eval = 1000  # evaluate on dev every freq_eval steps
                            best_dev = -np.inf
                            best_test = -np.inf
                            count = 0
                            for epoch in xrange(n_epochs):
                                epoch_costs = []
                                print "Starting epoch %i..." % epoch
                                for i, index in enumerate(np.random.permutation(len(train_data))):
                                    count += 1
                                    input = create_input(train_data[index], parameters, True, singletons)
                                    new_cost = f_train(*input)
                                    epoch_costs.append(new_cost)
                                    # if i % 50 == 0 and i > 0:
                                    #     print "%i, cost average: %f" % (i, np.mean(epoch_costs[-50:]))
                                    if count % freq_eval == 0:
                                        dev_score = evaluate(parameters, f_eval, dev_sentences,
                                                             dev_data, id_to_tag, dico_tags)
                                        test_score = evaluate(parameters, f_eval, test_sentences,
                                                              test_data, id_to_tag, dico_tags)
                                        print "Score on dev: %.5f" % dev_score
                                        print "Score on test: %.5f" % test_score
                                        if dev_score > best_dev:
                                            best_dev = dev_score
                                            print "New best score on dev." + str(best_dev)
                                            # print "Saving model to disk..."
                                            # model.save()
                                        if test_score > best_test:
                                            best_test = test_score
                                            print "New best score on test." + str(best_test)
                                # print "Config values used are : "
                                print "Epoch %i done. Average cost: %f" % (epoch, np.mean(epoch_costs))
                            # Write the best dev and test scores to the file
                            del model
                            with open(resultsFile, 'a') as f:
                                f.write("dropout: " + str(parameters['dropout'])
                                        + "| char_dim: |" + str(parameters['char_dim'])
                                        + "| char_lstm_dim: " + str(parameters['char_lstm_dim'])
                                        + " word_dim: " + str(parameters['word_dim'])
                                        + " |word_lstm_dim: " + str(parameters['word_lstm_dim'])
                                        + " | Best Dev Score: " + str(best_dev)
                                        + " | Best Test Score: " + str(best_test) + "\n")
    return
def train(self, n_epochs=100, freq_eval=1000, verbose=True,
          eval_test_set=False):
    """
    :param n_epochs: number of epochs over the training set
    :param freq_eval: evaluate on dev every freq_eval steps
    :return: Saves the model with the best F1-Score, evaluated on the dev set
    """
    # Initialize model
    model = Model(parameters=self.parameters, models_path=models_path)
    print("Model location: %s" % model.model_path)

    # Data parameters
    lower = self.parameters['lower']
    zeros = self.parameters['zeros']
    tag_scheme = self.parameters['tag_scheme']

    # Load sentences
    train_sentences = loader.load_sentences(self.parameters['train'], lower, zeros)
    dev_sentences = loader.load_sentences(self.parameters['dev'], lower, zeros)
    test_sentences = loader.load_sentences(self.parameters['test'], lower, zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, tag_scheme)
    update_tag_scheme(dev_sentences, tag_scheme)
    update_tag_scheme(test_sentences, tag_scheme)

    # Create a dictionary / mapping of words.
    # If we use pretrained embeddings, we add them to the dictionary.
    if self.parameters['pre_emb']:
        dico_words_train = word_mapping(train_sentences, lower)[0]
        dico_words, word_to_id, id_to_word = augment_with_pretrained(
            dico_words_train.copy(), self.parameters['pre_emb'],
            list(itertools.chain.from_iterable(
                [[w[0] for w in s] for s in dev_sentences + test_sentences]))
            if not self.parameters['all_emb'] else None)
    else:
        dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
        dico_words_train = dico_words

    # Create a dictionary and a mapping for words / POS tags / tags
    dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
    dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

    # Index data
    train_data = prepare_dataset(train_sentences, word_to_id, char_to_id,
                                 tag_to_id, lower)
    dev_data = prepare_dataset(dev_sentences, word_to_id, char_to_id,
                               tag_to_id, lower)
    test_data = prepare_dataset(test_sentences, word_to_id, char_to_id,
                                tag_to_id, lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    # Save the mappings to disk
    print('Saving the mappings to disk...')
    model.save_mappings(id_to_word, id_to_char, id_to_tag)

    # Build the model
    f_train, f_eval = model.build(**self.parameters)

    # Reload previous model values
    if self.parameters['reload']:
        print('Reloading previous model...')
        model.reload()

    #
    # Train network
    #
    singletons = set(
        [word_to_id[k] for k, v in dico_words_train.items() if v == 1])
    best_dev = -np.inf
    best_test = -np.inf
    count = 0
    for epoch in range(n_epochs):
        epoch_costs = []
        print("Starting epoch %i at..." % epoch, time.ctime())
        for i, index in enumerate(np.random.permutation(len(train_data))):
            count += 1
            input = create_input(train_data[index], self.parameters, True,
                                 singletons)
            new_cost = f_train(*input)
            epoch_costs.append(new_cost)
            if i % 50 == 0 and i > 0 and verbose:
                print("%i, cost average: %f" % (i, np.mean(epoch_costs[-50:])))
            if count % freq_eval == 0:
                dev_score = evaluate(self.parameters, f_eval, dev_sentences,
                                     dev_data, id_to_tag, verbose=verbose)
                if eval_test_set:
                    test_score = evaluate(self.parameters, f_eval,
                                          test_sentences, test_data,
                                          id_to_tag, verbose=verbose)
                print("Score on dev: %.5f" % dev_score)
                if eval_test_set:
                    print("Score on test: %.5f" % test_score)
                if dev_score > best_dev:
                    best_dev = dev_score
                    print("New best score on dev.")
                    print("Saving model to disk...")
                    model.save()
                if eval_test_set:
                    if test_score > best_test:
                        best_test = test_score
                        print("New best score on test.")
        print("Epoch %i done. Average cost: %f. Ended at..." %
              (epoch, np.mean(epoch_costs)), time.ctime())
    return best_dev
for phase in ['train', 'dev', 'test'][:]:
    if phase == 'train':
        optimizer = exp_lr_scheduler(optimizer_ft, epoch,
                                     **lr_method_parameters)
        model.train(True)  # Set model to training mode
        random.shuffle(dataset[phase])
    else:
        model.train(False)  # Set model to evaluate mode
    epoch_loss = []
    # Iterate over data.
    preds = []
    for i in range(0, len(dataset[phase]), batch_size):
        inputs, seq_index_mapping, char_index_mapping, seq_len, char_len = \
            create_input(dataset[phase][i:i + batch_size], parameters)
        # forward
        outputs, loss = model.forward(inputs, seq_len, char_len,
                                      char_index_mapping)
        try:
            epoch_loss.append(loss.data[0])
        except AttributeError:
            pass
        # backward + optimize only if in training phase
        if phase == 'train':
            # zero the parameter gradients
            optimizer.zero_grad()
            loss.backward()
if valid_total_ph_pred != 0:
    valid_precision = valid_true_pos / valid_total_ph_pred
if valid_total_phrases_truth != 0:
    valid_recall = valid_true_pos / valid_total_phrases_truth
if (valid_precision + valid_recall) != 0:
    valid_F1score = (2 * valid_precision * valid_recall
                     / (valid_precision + valid_recall))
print("Precision : {} | Recall : {} | F1 Score : {}".format(
    valid_precision, valid_recall, valid_F1score))

# Main Training Loop
with torch.no_grad():
    precheck_sent = utils.create_input(taskB_in[0][0].split(),
                                       tokenizer).to(device)
    precheck_tags = torch.tensor(
        [4] + [biluo_code[t] for t in taskB_label[0][0].split()] + [4],
        dtype=torch.long).to(device)

print("Checkpoint reached! Starting model training......")
for epoch in range(4):
    start = time.time()
    model.train()
    for i, file in enumerate(taskB_in):
        try:
            if i % 10 == 0:
                print(f"done with {i} of {len(taskB_in)}")
            for j, sent in enumerate(file):
#
# Train network
#
n_epochs = 50  # number of epochs over the training set
freq_eval = 500  # evaluate on dev every freq_eval steps
best_dev = -np.inf
best_test = -np.inf
count = 0
for epoch in xrange(n_epochs):
    epoch_costs = []
    print "Starting epoch %i..." % epoch
    for i, index in enumerate(np.random.permutation(len(train_data))):
        count += 1
        input = create_input(train_data[index], parameters, True,
                             False if pos_tag == 0 else True)
        new_cost = f_train(*input)
        epoch_costs.append(new_cost)
        if i % 50 == 0 and i > 0:
            print "%i, cost average: %f" % (i, np.mean(epoch_costs[-50:]))
        if count % freq_eval == 0:
            dev_score, pred_dev = evaluate_scope(parameters, model.model_path,
                                                 f_eval, dev_data, id_to_y,
                                                 False if pos_tag == 0 else True)
            if dev_score > best_dev:
                best_dev = dev_score
                print "New best score on dev."
                print "Saving model to disk..."
                model.save()
                # Store predictions to disk
                output_predDEV = os.path.join(model.model_path,
                                              "best_dev.output")
                with codecs.open(output_predDEV, 'w', 'utf8') as f:
                    f.write("\n".join(pred_dev))
def ner():
    global model
    global f_eval
    global parameters
    global word_to_id
    global char_to_id
    global tag_to_id

    model_name = request.json["model"]
    words = request.json["words"]
    begin_end = request.json["begin_end"]

    if model is None:
        # Model loading
        print("Loading model " + model_name + "..")
        model = Model(model_path="models/" + models[model_name])
        parameters = model.parameters
        # Load reverse mappings
        word_to_id, char_to_id, tag_to_id = [
            {v: k for k, v in x.items()}
            for x in [model.id_to_word, model.id_to_char, model.id_to_tag]
        ]
        # Load the model
        _, f_eval = model.build(training=False, **parameters)
        model.reload()
    # else:
    #     parameters = model.parameters
    #     word_to_id, char_to_id, tag_to_id = [
    #         {v: k for k, v in x.items()}
    #         for x in [model.id_to_word, model.id_to_char, model.id_to_tag]
    #     ]

    # Lowercase sentence
    if parameters['lower']:
        words = [w.lower() for w in words]
    # Replace all digits with zeros
    if parameters['zeros']:
        words = [zero_digits(w) for w in words]
    words = [w if not w.isupper() else w.title() for w in words]

    # Prepare input
    sentence = prepare_sentence(words, word_to_id, char_to_id,
                                lower=parameters['lower'])
    input = create_input(sentence, parameters, False)
    # Decoding
    if parameters['crf']:
        y_preds = np.array(f_eval(*input))[1:-1]
    else:
        y_preds = f_eval(*input).argmax(axis=1)
    y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
    # Output tags in the IOB2 format
    if parameters['tag_scheme'] == 'iobes':
        y_preds = iobes_iob(y_preds)
    # Write tags
    assert len(y_preds) == len(words)  # TODO: remove assert?

    ents = [{"start_char": b, "end_char": e, "label": label}
            for (b, e), label in zip(begin_end, y_preds) if label != "O"]
    return json.dumps({"ents": ents})
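A client-side sketch for the endpoint above, assuming it is routed at /ner on a local server (the route and port are assumptions); begin_end carries the character offsets the caller already knows for each token:

import requests

payload = {
    "model": "english",
    "words": ["Barack", "Obama", "visited", "Paris"],
    "begin_end": [[0, 6], [7, 12], [13, 20], [21, 26]],
}
resp = requests.post("http://localhost:5000/ner", json=payload)
print(resp.json())
# e.g. {"ents": [{"start_char": 0, "end_char": 6, "label": "B-PER"}, ...]}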
def test_inference_performance():
    from sklearn.metrics import f1_score
    from torchtext.datasets import SequenceTaggingDataset
    from torchtext.data import Field, NestedField

    WORD = Field(init_token='<bos>', eos_token='<eos>')
    CHAR_NESTING = Field(tokenize=list, init_token='<bos>', eos_token='<eos>')
    CHAR = NestedField(CHAR_NESTING, init_token='<bos>', eos_token='<eos>')
    ENTITY = Field(init_token='<bos>', eos_token='<eos>')

    data_file = tempfile.NamedTemporaryFile(delete=True)
    # TODO: needs to be decoded in Python 3
    data_file.write(requests.get(CORA_URL).content)

    fields = ([(('text', 'char'), (WORD, CHAR))]
              + [(None, None)] * 22
              + [('entity', ENTITY)])
    dataset = SequenceTaggingDataset(data_file.name, fields, separator=" ")

    model = Model(model_path='models/neuralParsCit')
    model.parameters['pre_emb'] = os.path.join(os.getcwd(),
                                               'vectors_with_unk.kv')
    f = model.build(training=False, **model.parameters)
    model.reload()

    word_to_id = {v: i for i, v in model.id_to_word.items()}
    char_to_id = {v: i for i, v in model.id_to_char.items()}
    tag_to_id = {tag: i for i, tag in model.id_to_tag.items()}

    tf = tempfile.NamedTemporaryFile(delete=False)
    tf.write("\n\n".join(["\n".join(example.text)
                          for example in dataset.examples]))
    tf.close()

    train_sentences = load_sentences(tf.name, model.parameters['lower'],
                                     model.parameters['zeros'])
    train_inputs = prepare_dataset(train_sentences, word_to_id, char_to_id,
                                   model.parameters['lower'], True)

    preds = []
    for citation in train_inputs:
        inputs = create_input(citation, model.parameters, False)
        y_pred = np.array(f[1](*inputs))[1:-1]
        preds.append([(w, y_pred[i])
                      for i, w in enumerate(citation['str_words'])])

    assert len(preds) == len(dataset.examples)

    results = []
    for P, T in zip(preds, dataset.examples):
        for p, t in zip(P, zip(T.text, T.entity)):
            results.append((p[1], tag_to_id[t[1]]))

    pred, true = zip(*results)
    eval_metrics = {
        'micro_f1': f1_score(true, pred, average='micro'),
        'macro_f1': f1_score(true, pred, average='macro')
    }
    data_file.close()
    assert eval_metrics == pytest.approx({
        'macro_f1': 0.984,
        'micro_f1': 0.993
    }, abs=0.001)