def tag():
    """Tag the text of a POST request and return the result as JSON.

    The JSON request body is read for three keys: ``text``,
    ``split_sentences`` and ``tokenize``.  The response is a JSON list
    with one entry per non-empty sentence, each entry being a list of
    ``(word, tag)`` pairs in which ``word`` is the token as received.

    ``model``, ``f_eval``, ``word_to_id`` and ``char_to_id`` are not
    defined here and must exist at module level.  For any request method
    other than POST the function falls through and returns ``None``.

    Raises:
        AssertionError: if the number of predicted tags differs from the
            number of words of a sentence.
    """
    if request.method == 'POST':
        data = request.get_json()
        text = data['text']

        if data['split_sentences']:
            sentences = split_sentences(text)
        else:
            # NOTE(review): ``text`` is iterated as-is below; presumably the
            # client sends a list of sentences in this mode -- confirm.
            sentences = text

        if data['tokenize'] or data['split_sentences']:
            tokenized_sentences = [tokenize(s) for s in sentences]
        else:
            # NOTE(review): presumably already a list of token lists here.
            tokenized_sentences = text

        parameters = model.parameters
        count = 0
        output = []
        for words in tokenized_sentences:
            if len(words) == 0:
                continue

            # Normalise a copy of the tokens for the model only.  The
            # previous code transformed an undefined name ``line`` here,
            # which raised NameError whenever 'lower' or 'zeros' was set.
            model_words = list(words)
            # Lowercase sentence
            if parameters['lower']:
                model_words = [w.lower() for w in model_words]
            # Replace all digits with zeros
            if parameters['zeros']:
                model_words = [zero_digits(w) for w in model_words]

            # Prepare input
            sentence = prepare_sentence(model_words, word_to_id, char_to_id,
                                        lower=parameters['lower'])
            model_input = create_input(sentence, parameters, False)

            # Decoding
            if parameters['crf']:
                # The first and last decoded positions are dropped.
                y_preds = np.array(f_eval(*model_input))[1:-1]
            else:
                y_preds = f_eval(*model_input).argmax(axis=1)
            y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]

            # Output tags in the IOB2 format
            if parameters['tag_scheme'] == 'iobes':
                y_preds = iobes_iob(y_preds)

            # Write tags
            assert len(y_preds) == len(
                words
            ), "Predictions have different length than sentence. Something went wrong."
            output.append(list(zip(words, y_preds)))

            count += 1
            if count % 100 == 0:
                logging.info(count)

        return jsonify(output)
def extract_predictions_from_raw_text(model_path, tokens, pos):
    """Load the model at ``model_path`` and tag one tokenised sentence.

    Args:
        model_path: passed to ``Model(model_path=...)``.
        tokens: sequence of token strings; they are joined with single
            spaces, optionally lowercased / digit-normalised, and split
            again on single spaces.
        pos: per-token values stored under the ``'pos'`` key of the
            model input; must be as long as ``tokens``.

    Returns:
        ``(tokens, tags)`` where ``tokens`` are the normalised tokens and
        ``tags`` the predictions after ``resolve_inconsistencies``.

    Raises:
        AssertionError: if tokens, ``pos`` and the derived ortho features
            do not all have the same length.
    """
    def invert(mapping):
        # Turn an id -> item mapping into item -> id.
        return {item: idx for idx, item in mapping.items()}

    model = Model(model_path=model_path)
    parameters = model.parameters
    if 'language_model' not in parameters:
        parameters['language_model'] = False

    # Reverse mappings (the tag one is built but not used further on)
    word_to_id, char_to_id, _tag_to_id = [
        invert(m)
        for m in (model.id_to_word, model.id_to_char, model.id_to_tag)
    ]
    pos_to_id, ortho_to_id, segment_to_id = [
        invert(m)
        for m in (model.id_to_pos, model.id_to_ortho, model.id_to_segment)
    ]
    word_to_id_1 = invert(model.id_to_word_1)

    # Build the evaluation function, then reload the model
    _, f_eval = model.build(training=False, **parameters)
    model.reload()
    id_to_tag = model.id_to_tag

    # Normalise on the joined string, then split back into tokens
    normalized = ' '.join(tokens)
    if parameters['lower']:
        normalized = normalized.lower()
    if parameters['zeros']:
        normalized = zero_digits(normalized)
    tokens = normalized.split(' ')

    ortho = [get_ortho_feature(token) for token in tokens]
    assert len(tokens) == len(pos) == len(ortho)
    features = {'words': tokens, 'pos': pos, 'ortho': ortho}

    sentence = prepare_sentence(features, word_to_id, char_to_id,
                                pos_to_id, ortho_to_id, segment_to_id,
                                word_to_id_1, lower=parameters['lower'])
    model_input = create_input(sentence, parameters, add_label=False)

    # Decoding; with a CRF the first and last positions are dropped
    if parameters['crf']:
        tag_ids = np.array(f_eval(*model_input))[1:-1]
    else:
        tag_ids = f_eval(*model_input).argmax(axis=1)
    tags = [id_to_tag[tag_id] for tag_id in tag_ids]

    # Output tags in the IOB2 format
    if parameters['tag_scheme'] == 'iobes':
        tags = iobes_iob(tags)
    tags = resolve_inconsistencies(tags)
    return tokens, tags
def tag(model, line):
    """Load a model from disk and tag a single whitespace-separated line.

    Args:
        model: path handed to ``Model(model_path=...)``; the name is then
            rebound to the loaded model object.
        line: the sentence to tag.

    Returns:
        The original words, each suffixed with ``'__'`` and its predicted
        tag, joined by single spaces.

    Raises:
        AssertionError: if the number of predictions differs from the
            number of words.
    """
    def invert(mapping):
        # Turn an id -> item mapping into item -> id.
        return {item: idx for idx, item in mapping.items()}

    # Load existing model
    print("Loading model...")
    model = Model(model_path=model)
    parameters = model.parameters

    # Reverse mappings (the tag one is built but not used further on)
    word_to_id, char_to_id, _tag_to_id = [
        invert(m)
        for m in (model.id_to_word, model.id_to_char, model.id_to_tag)
    ]

    # Build the evaluation function, then reload the model
    _, f_eval = model.build(training=False, **parameters)
    model.reload()

    started = time.time()
    print('Tagging...')

    # Words as typed are kept for the output; the model may see zeroed digits
    original_words = line.rstrip().split()
    if parameters['zeros']:
        line = zero_digits(line)
    words = line.rstrip().split()

    sentence = prepare_sentence(words, word_to_id, char_to_id,
                                lower=parameters['lower'])
    model_input = create_input(sentence, parameters, False)

    # Decoding; with a CRF the first and last positions are dropped
    if parameters['crf']:
        tag_ids = np.array(f_eval(*model_input))[1:-1]
    else:
        tag_ids = f_eval(*model_input).argmax(axis=1)
    tags = [model.id_to_tag[tag_id] for tag_id in tag_ids]

    # Output tags in the IOB2 format
    if parameters['tag_scheme'] == 'iobes':
        tags = iobes_iob(tags)

    assert len(tags) == len(words)

    print('---- sentence tagged in %.4fs ----' % (time.time() - started))

    tagged = [word + '__' + str(label)
              for word, label in zip(original_words, tags)]
    return ' '.join(tagged)
def tag_document(doc, parameters, model, f_eval, word_to_id, char_to_id):
    """Tag every non-empty sentence of ``doc``.

    Args:
        doc: object whose ``sentences`` attribute is iterable; each
            sentence exposes ``tokens``, and each token an ``orth_`` text.
        parameters: model parameters; the keys ``'lower'``, ``'zeros'``,
            ``'crf'`` and ``'tag_scheme'`` are read.
        model: provides ``id_to_tag`` for decoding predictions.
        f_eval: evaluation function called with the prepared input.
        word_to_id: word -> id mapping passed to ``prepare_sentence``.
        char_to_id: char -> id mapping passed to ``prepare_sentence``.

    Returns:
        ``(all_ypreds, all_tokens)``: two parallel lists with one entry
        per non-empty sentence.  Tags have everything up to the last
        ``'-'`` removed; tokens are the (possibly lowercased and
        digit-zeroed) texts that were fed to the model.

    Raises:
        AssertionError: if the number of predictions differs from the
            number of tokens of a sentence.
    """
    count = 0
    all_ypreds = []
    all_tokens = []
    for line in doc.sentences:
        toks_text = [x.orth_ for x in line.tokens]
        # Sentences without tokens (e.g. bare newlines) are not tagged.
        if toks_text:
            # Lowercase sentence
            if parameters['lower']:
                toks_text = [tok.lower() for tok in toks_text]
            # Replace all digits with zeros
            if parameters['zeros']:
                toks_text = [zero_digits(tok) for tok in toks_text]

            # Prepare input
            sentence = prepare_sentence(toks_text, word_to_id, char_to_id,
                                        lower=parameters['lower'])
            model_input = create_input(sentence, parameters, False)

            # Decoding; with a CRF the first and last positions are dropped
            if parameters['crf']:
                y_preds = np.array(f_eval(*model_input))[1:-1]
            else:
                y_preds = f_eval(*model_input).argmax(axis=1)
            y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]

            # Output tags in the IOB2 format
            if parameters['tag_scheme'] == 'iobes':
                y_preds = iobes_iob(y_preds)

            assert len(y_preds) == len(toks_text)

            # Strip IOB prefixes
            y_preds = [x.split('-')[-1] for x in y_preds]
            all_ypreds.append(y_preds)
            all_tokens.append(toks_text)

        count += 1
        if count % 100 == 0:
            # Call form prints the same text on Python 2 and Python 3;
            # the former ``print count`` statement only parsed on Python 2.
            print(count)

    return (all_ypreds, all_tokens)
def predicts(self, line):
    """Tag ``line`` and return it with entities wrapped in ``<...:TYPE>``.

    The input is converted to bigrams twice: once untouched (used to
    rebuild the characters shown in the output) and once after replacing
    digits with zeros (used as model input).

    Args:
        line: the text to tag.

    Returns:
        The output string, in which every entity is rendered as
        ``<chars:TYPE>`` and all other units are copied unchanged, or
        ``None`` when ``line`` is empty / falsy.
    """
    if not line:
        return None

    # Save original bigrams
    bigram_sent = self.to_bigram(line, 0).strip().split()

    # Replace all digits with zeros
    line = zero_digits(line)
    input_seq = self.to_bigram(line, 0).strip().split()

    # Prepare input
    sentence = prepare_sentence(input_seq, self.word_to_id, self.char_to_id,
                                lower=self.parameters['lower'])
    model_input = create_input(sentence, self.parameters, False)

    # Decoding; with a CRF the first and last positions are dropped
    if self.parameters['crf']:
        y_preds = np.array(self.f_eval(*model_input))[1:-1]
    else:
        y_preds = self.f_eval(*model_input).argmax(axis=1)
    tags = [self.id_to_tag[y_pred] for y_pred in y_preds]

    # Output tags in the IOB2 format
    if self.parameters['tag_scheme'] == 'iobes':
        tags = iobes_iob(tags)
    print(tags)

    # Make output form
    unigram_sent = self.bigrams_to_unigrams(bigram_sent)
    last = len(tags) - 1
    pieces = []
    for i, tag in enumerate(tags):
        piece = unigram_sent[i]
        if tag.startswith('B'):
            piece = '<' + piece
        # An entity ends wherever the next tag does not continue it.  The
        # previous check (an 'I' tag followed by exactly 'O', or last) left
        # single-unit entities and entities followed directly by a new 'B'
        # without their closing ':TYPE>'.
        in_entity = tag.startswith(('B', 'I'))
        if in_entity and (i == last or not tags[i + 1].startswith('I')):
            piece += ':' + tag[2:] + '>'
        pieces.append(piece)
    return ''.join(pieces)
def NER_for_sentence(sentence):
    """Clean ``sentence``, run the module-level model and pair words with tags.

    The sentence is passed through ``utils.remove_numbers``,
    ``utils.remove_punctua`` and ``utils.remove_whitespace`` and then split
    on whitespace.  ``model``, ``word_to_id``, ``char_to_id`` and
    ``tag_to_id`` are not defined here and must exist at module level.

    Args:
        sentence: raw input text.

    Returns:
        A list of ``(word, tag)`` tuples, one per word that survives
        cleaning; an empty list when no word survives.
    """
    sentence = utils.remove_numbers(sentence)
    sentence = utils.remove_punctua(sentence)
    sentence = utils.remove_whitespace(sentence)
    str_words = sentence.split()
    if not str_words:
        # Nothing left to tag; without this guard ``max()`` below raises
        # ValueError on an empty sequence.
        return []

    data = loader.prepare_sentence(str_words, word_to_id, char_to_id,
                                   tag_to_id)
    sentence_in = torch.tensor(data['words'], dtype=torch.long)
    cap_in = torch.tensor(data['caps'], dtype=torch.long)

    # Per-word character sequences, longest first
    chars2 = data['chars']
    chars2_sorted = sorted(chars2, key=len, reverse=True)

    # d maps a position in the sorted list to a position in the original
    # list.  Each original word claims the first still-unclaimed sorted
    # slot holding an equal sequence; stopping after the claim gives the
    # same mapping as scanning on while checking ``d.values()``.
    d = {}
    for i, ci in enumerate(chars2):
        for j, cj in enumerate(chars2_sorted):
            if ci == cj and j not in d:
                d[j] = i
                break

    # Zero-padded matrix with one row per word, in sorted order
    chars2_length = [len(w) for w in chars2_sorted]
    char_maxl = max(chars2_length)
    chars2_mask = np.zeros((len(chars2_sorted), char_maxl), dtype='int')
    for i, c in enumerate(chars2_sorted):
        chars2_mask[i, :chars2_length[i]] = c
    chars2_mask = torch.tensor(chars2_mask, dtype=torch.long)

    # NOTE(review): the second return value is treated as one tag id per
    # word -- confirm against the model definition.
    _, out = model(sentence_in, chars2_mask, cap_in, chars2_length, d)

    id_to_tag = {idx: name for name, idx in tag_to_id.items()}
    tags = [id_to_tag[tag_id.item()] for tag_id in out]
    return list(zip(str_words, tags))
print 'Tagging...' with codecs.open(opts.input, 'r', 'utf-8') as f_input: count = 0 for line in f_input: words_ini = line.rstrip().split() if line: # Lowercase sentence if parameters['lower']: line = line.lower() # Replace all digits with zeros if parameters['zeros']: line = zero_digits(line) words = line.rstrip().split() # Prepare input sentence = prepare_sentence(words, word_to_id, char_to_id, lower=parameters['lower']) input = create_input(sentence, parameters, False) # Decoding if parameters['crf']: y_preds = np.array(f_eval(*input))[1:-1] else: y_preds = f_eval(*input).argmax(axis=1) y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds] # Output tags in the IOB2 format if parameters['tag_scheme'] == 'iobes': y_preds = iobes_iob(y_preds) # Write tags assert len(y_preds) == len(words) if opts.outputFormat == 'json':
def ner():
    """Tag the words of a JSON request and return the entities as JSON.

    The request JSON is read for ``model`` (a key into the module-level
    ``models`` table), ``words`` (list of tokens) and ``begin_end`` (one
    ``(begin, end)`` pair per token).  The model is loaded on the first
    call and kept in module-level globals.

    NOTE(review): once a model is loaded, the ``model`` field of later
    requests is ignored and the first model keeps being used -- confirm
    this is intended.

    Returns:
        A JSON string ``{"ents": [...]}`` with one
        ``{"start_char", "end_char", "label"}`` object for every token
        whose predicted label is not ``"O"``.

    Raises:
        AssertionError: if the number of predictions differs from the
            number of words.
    """
    global model
    global f_eval
    global parameters
    global word_to_id
    global char_to_id
    global tag_to_id

    model_name = request.json["model"]
    words = request.json["words"]
    begin_end = request.json["begin_end"]

    if model is None:
        # Model loading.  The call form prints the same text on Python 2
        # and Python 3; the former print statement only parsed on Python 2.
        print("Loading model " + model_name + "..")
        model = Model(model_path="models/" + models[model_name])
        parameters = model.parameters

        # Load reverse mappings
        word_to_id, char_to_id, tag_to_id = [
            {v: k for k, v in x.items()}
            for x in [model.id_to_word, model.id_to_char, model.id_to_tag]
        ]

        # Load the model
        _, f_eval = model.build(training=False, **parameters)
        model.reload()

    # Lowercase sentence
    if parameters['lower']:
        words = [w.lower() for w in words]
    # Replace all digits with zeros
    if parameters['zeros']:
        words = [zero_digits(w) for w in words]
    # Fully upper-case tokens are title-cased before tagging
    words = [w if not w.isupper() else w.title() for w in words]

    # Prepare input
    sentence = prepare_sentence(words, word_to_id, char_to_id,
                                lower=parameters['lower'])
    model_input = create_input(sentence, parameters, False)

    # Decoding; with a CRF the first and last positions are dropped
    if parameters['crf']:
        y_preds = np.array(f_eval(*model_input))[1:-1]
    else:
        y_preds = f_eval(*model_input).argmax(axis=1)
    y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]

    # Output tags in the IOB2 format
    if parameters['tag_scheme'] == 'iobes':
        y_preds = iobes_iob(y_preds)

    assert len(y_preds) == len(words)  # TODO:remove assert?

    ents = [{
        "start_char": b,
        "end_char": e,
        "label": label
    } for (b, e), label in zip(begin_end, y_preds) if label != "O"]

    return json.dumps({"ents": ents})
def run_tagging(model,
                f_eval,
                parameters,
                word_to_id,
                char_to_id,
                tag_to_id,
                opts_input="",
                opts_output="",
                opts_delimiter="__",
                opts_outputFormat=""):
    """Tag every line of ``opts_input`` and write the result to ``opts_output``.

    Args:
        model: provides ``id_to_tag`` for decoding predictions.
        f_eval: evaluation function called with the prepared input.
        parameters: model parameters; the keys ``'lower'``, ``'zeros'``,
            ``'crf'`` and ``'tag_scheme'`` are read.
        word_to_id: word -> id mapping passed to ``prepare_sentence``.
        char_to_id: char -> id mapping passed to ``prepare_sentence``.
        tag_to_id: accepted for interface compatibility; not used here.
        opts_input: path of the UTF-8 input file, one sentence per line.
        opts_output: path of the UTF-8 output file (overwritten).
        opts_delimiter: separator written between a word and its tag.
        opts_outputFormat: ``'json'`` writes one JSON object per sentence
            (with no separator between objects); anything else writes
            ``word<delimiter>tag`` tokens, one sentence per line.

    Returns:
        ``opts_output + " has been tagged!"``.

    Raises:
        AssertionError: if ``opts_delimiter`` is empty, ``opts_input`` is
            not a file, or a prediction count differs from the word count.
    """
    # Check parameters validity
    assert opts_delimiter
    assert os.path.isfile(opts_input)

    start = time.time()
    logger.info('Tagging...')
    count = 0
    # Both files are context-managed so the output is closed even when
    # tagging raises.  The output is opened first, as before.
    with codecs.open(opts_output, 'w', 'utf-8') as f_output, \
            codecs.open(opts_input, 'r', 'utf-8') as f_input:
        for line in f_input:
            words_ini = line.rstrip().split()
            # Lines read from a file keep their newline, so ``if line`` was
            # always true and blank lines were sent through the tagger
            # instead of reaching the branch that echoes an empty line.
            if words_ini:
                # Lowercase sentence
                if parameters['lower']:
                    line = line.lower()
                # Replace all digits with zeros
                if parameters['zeros']:
                    line = zero_digits(line)
                words = line.rstrip().split()

                # Prepare input
                sentence = prepare_sentence(words, word_to_id, char_to_id,
                                            lower=parameters['lower'])
                model_input = create_input(sentence, parameters, False)

                # Decoding; with a CRF the first and last positions
                # are dropped
                if parameters['crf']:
                    y_preds = np.array(f_eval(*model_input))[1:-1]
                else:
                    y_preds = f_eval(*model_input).argmax(axis=1)
                y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]

                # Output tags in the IOB2 format
                if parameters['tag_scheme'] == 'iobes':
                    y_preds = iobes_iob(y_preds)

                # Write tags
                assert len(y_preds) == len(words)
                if opts_outputFormat == 'json':
                    f_output.write(
                        json.dumps({
                            "text": ' '.join(words),
                            "ranges": iob_ranges(y_preds)
                        }))
                else:
                    # The words as they appeared in the input are written,
                    # not the normalised ones.
                    f_output.write(
                        '%s\n' % ' '.join('%s%s%s' % (w, opts_delimiter, y)
                                          for w, y in zip(words_ini, y_preds)))
            else:
                f_output.write('\n')
            count += 1

    logger.info('---- %i lines tagged in %.4fs ----' %
                (count, time.time() - start))
    logger.info(opts_output)
    logger.info("")
    return opts_output + " has been tagged!"
v: k for k, v in x.items() } for x in [ model.id_to_word, model.id_to_slb, model.id_to_char, model.id_to_tag, model.id_to_pos ]] id_to_tag = model.id_to_tag # Load the model _, f_eval = model.build(training=False, **parameters) model.reload() start = time.time() print 'Running...NER' test_data = prepare_sentence(test_sentences, word_to_id, slb_to_id, char_to_id, pos_to_id) gazette_dict = make_gazette_to_dic(dict_path) gazette_dict_for, gazette_dict_len = dict(), dict() with open(dict_path, 'r') as f: for line in f.readlines(): line = line.strip().split('\t') words, tag = line[0], line[1] if len(words) > 3: gazette_dict_len[words] = len(words) gazette_dict_for[words] = tag gazette_dict_len = sorted(gazette_dict_len.iteritems(), key=itemgetter(1), reverse=True)
print 'Tagging...' with codecs.open(opts.input, 'r', 'utf-8') as f_input: count = 0 for line in f_input: words_ini = line.rstrip().split() if line: # Lowercase sentence if parameters['lower']: line = line.lower() # Replace all digits with zeros if parameters['zeros']: line = zero_digits(line) words = line.rstrip().split() # Prepare input sentence = prepare_sentence(words, word_to_id, char_to_id, lower=parameters['lower']) input = create_input(sentence, parameters, False) # Decoding if parameters['crf']: y_preds = np.array(f_eval(*input))[1:-1] else: y_preds = f_eval(*input).argmax(axis=1) y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds] # Output tags in the IOB2 format if parameters['tag_scheme'] == 'iobes': y_preds = iobes_iob(y_preds) # Write tags assert len(y_preds) == len(words) if opts.outputFormat == 'json': f_output.write(json.dumps({ "text": ' '.join(words), "ranges": iob_ranges(y_preds) }))
def extract_tagger_predictions(model_path,
                               span_path,
                               output_path=None,
                               f_eval=None,
                               parameters=None,
                               return_raw_predictions=False):
    """Run the tagger over the pickled documents at ``span_path``.

    Args:
        model_path: model directory.  Without ``f_eval`` the model is
            loaded from it; with ``f_eval`` only ``mappings.pkl`` inside
            it is read.
        span_path: pickle file mapping a document name to its sentences,
            each sentence being a list of dicts with the keys ``'word'``,
            ``'start'``, ``'end'`` and ``'pos'``.
        output_path: handed to ``write_predictions`` when raw predictions
            are not requested.
        f_eval: ready-made evaluation function; when falsy one is built.
        parameters: model parameters to use together with ``f_eval``;
            replaced by the loaded model's parameters otherwise.
        return_raw_predictions: when true, return per-document words,
            tags and offsets instead of writing extracted entities.

    Returns:
        The predictions dict when ``return_raw_predictions`` is true,
        otherwise ``None`` (predictions go to ``write_predictions``).

    Raises:
        AssertionError: if ``span_path`` does not exist, or a sentence's
            normalised words and its offsets differ in length.
    """
    def invert(mapping):
        # Turn an id -> item mapping into item -> id.
        return {item: idx for idx, item in mapping.items()}

    assert file_exists(span_path)
    # NOTE(review): read_pickle presumably unpickles the file; only feed
    # it trusted paths -- confirm.
    documents = read_pickle(span_path)

    if not f_eval:
        model = Model(model_path=model_path)
        parameters = model.parameters
        if 'language_model' not in parameters:
            parameters['language_model'] = False

        # Reverse mappings (the tag one is built but not used further on)
        word_to_id, char_to_id, _tag_to_id = [
            invert(m)
            for m in (model.id_to_word, model.id_to_char, model.id_to_tag)
        ]
        pos_to_id, ortho_to_id, segment_to_id = [
            invert(m)
            for m in (model.id_to_pos, model.id_to_ortho,
                      model.id_to_segment)
        ]
        word_to_id_1 = invert(model.id_to_word_1)

        # Build the evaluation function, then reload the model
        _, f_eval = model.build(training=False, **parameters)
        model.reload()
        id_to_tag = model.id_to_tag
    else:
        # An evaluation function was supplied: only the mappings are read
        mappings = read_pickle(join_path(model_path, 'mappings.pkl'))
        id_to_word = mappings['id_to_word']
        id_to_char = mappings['id_to_char']
        id_to_tag = mappings['id_to_tag']
        id_to_pos = mappings['id_to_pos']
        id_to_ortho = mappings['id_to_ortho']
        id_to_segment = mappings['id_to_segment']
        id_to_word_1 = mappings['id_to_word_1']

        word_to_id, char_to_id, _tag_to_id = [
            invert(m) for m in (id_to_word, id_to_char, id_to_tag)
        ]
        pos_to_id, ortho_to_id, segment_to_id = [
            invert(m) for m in (id_to_pos, id_to_ortho, id_to_segment)
        ]
        word_to_id_1 = invert(id_to_word_1)

    predictions = {}
    docs_count = 0
    for doc_name, sentences in documents.items():
        for spans in sentences:
            words = [span['word'] for span in spans]
            start = [span['start'] for span in spans]
            end = [span['end'] for span in spans]
            pos = [span['pos'] for span in spans]
            ortho = [get_ortho_feature(word) for word in words]

            # The model input keeps the words as stored in the spans; the
            # normalised copy below is used for the assertion and output.
            features = {
                'words': words,
                'pos': pos,
                'ortho': ortho,
                'doc_names': [doc_name] * len(words)
            }

            normalized = ' '.join(words)
            if parameters['lower']:
                normalized = normalized.lower()
            # Replace all digits with zeros
            if parameters['zeros']:
                normalized = zero_digits(normalized)
            words = normalized.split(' ')
            assert len(words) == len(start) == len(end)

            # Prepare input
            sentence = prepare_sentence(features, word_to_id, char_to_id,
                                        pos_to_id, ortho_to_id,
                                        segment_to_id, word_to_id_1,
                                        lower=parameters['lower'])
            model_input = create_input(sentence, parameters, add_label=False)

            # Decoding; with a CRF the first and last positions are dropped
            if parameters['crf']:
                tag_ids = np.array(f_eval(*model_input))[1:-1]
            else:
                tag_ids = f_eval(*model_input).argmax(axis=1)
            tags = [id_to_tag[tag_id] for tag_id in tag_ids]

            # Output tags in the IOB2 format
            if parameters['tag_scheme'] == 'iobes':
                tags = iobes_iob(tags)

            if return_raw_predictions:
                if doc_name not in predictions:
                    predictions[doc_name] = {
                        'words': [],
                        'tags': [],
                        'start': [],
                        'end': []
                    }
                record = predictions[doc_name]
                record['words'].append(words)
                record['tags'].append(tags)
                record['start'].append(start)
                record['end'].append(end)
            else:
                tags = resolve_inconsistencies(tags)
                entities = extract_entities(words, tags, start, end)
                if doc_name not in predictions:
                    predictions[doc_name] = []
                if len(entities) > 0:
                    predictions[doc_name] += entities

        docs_count += 1
        if docs_count % 100 == 0:
            print('{} documents processed'.format(docs_count))

    if return_raw_predictions:
        return predictions
    write_predictions(output_path, predictions)