def main():
    """Restore the trained NER model and run prediction over the test set."""
    # load configuration and rebuild the saved model
    cfg = Config()
    ner = NERModel(cfg)
    ner.build()
    ner.restore_session(cfg.dir_model)

    # assemble the CoNLL-format test dataset
    test_set = CoNLLDataset(cfg.filename_test, cfg.processing_word,
                            cfg.processing_tag, cfg.max_iter)

    # evaluate and interact
    ner.predict(test_set)
def train_active(train, dev, test, select, config, modename):
    """Train a NER model, evaluate it, and score the selection pool.

    Args:
        train: training dataset.
        dev: development dataset used during training.
        test: held-out test dataset.
        select: pool of candidate samples for active learning; each item's
            first element is the sentence passed to ``model.predict``.
        config: model configuration (provides ``dir_model``).
        modename: suffix appended to the ``"test"`` evaluation mode label.

    Returns:
        List with one score per sample in ``select`` (the ``output[1][0]``
        value of ``model.predict``); empty list when the pool is empty.
    """
    model = NERModel(config)
    model.build()

    print("Start training model...")
    print("Training size ", len(train))
    model.train(train, dev)

    # restore the best checkpoint saved during training
    model.restore_session(config.dir_model)

    print("===Evaluating on test set:===")
    model.evaluate(test, "test" + modename)

    print("Selecting samples for active learning...")
    if not select:
        return []
    # score each candidate sentence; output[1][0] presumably holds the
    # model's uncertainty/confidence for the sentence — TODO confirm
    return [model.predict(sent[0])[1][0] for sent in select]
def main(args):
    """Tag a single command-line sentence with the restored NER model."""
    # configuration and model restore
    cfg = Config()
    ner = NERModel(cfg)
    ner.build()
    ner.restore_session(cfg.dir_model)

    # print the predicted tags for the supplied sentence
    print(ner.predict(args.sentence))
def main():
    """Tag every passage of the QA4IE benchmark files with the NER model.

    For each benchmark file ``<name>.json`` this writes ``<name>.ner`` with
    one predicted tag per token, passages separated by ``#<index>`` headers.
    """
    config = Config()

    model = NERModel(config)
    model.build()
    model.restore_session(config.dir_model)

    path = "data-sequence-tagging/QA4IE-benchmark/"
    file_name_list = [
        "ie_test/0-400/ie_test.span",
        "seq/0-400/dev.seq", "seq/0-400/test.seq", "seq/0-400/train.seq",
        "seq/400-700/dev.seq", "seq/400-700/test.seq", "seq/400-700/train.seq",
        "seq/700-/dev.seq", "seq/700-/test.seq", "seq/700-/train.seq",
        "span/0-400/dev.span", "span/0-400/test.span", "span/0-400/train.span",
        "span/400-700/dev.span", "span/400-700/test.span", "span/400-700/train.span",
        "span/700-/dev.span", "span/700-/test.span", "span/700-/train.span"
    ]

    for file_name in file_name_list:
        # context managers guarantee both files are closed even if
        # json parsing or model.predict raises
        with open(path + file_name + ".json", 'r') as ifs, \
                open(path + file_name + ".ner", 'w') as ofs:
            dataset = json.load(ifs)
            for index, passage in enumerate(dataset['data']):
                # start of one passage
                ofs.write('#' + str(index) + "\n\n")
                for paragraph in passage['paragraphs']:
                    word_list = paragraph['context'].split(' ')
                    preds = model.predict(word_list)
                    ofs.write('\n'.join(preds) + '\n\n')
                ofs.write('\n')
        print("successfully predict " + file_name + '\n')
# ANSI colour per entity class: PESSOA blue, TEMPO green, LOCAL yellow,
# ORGANIZACAO red, JURISPRUDENCIA magenta, LEGISLACAO cyan; "O" is uncoloured.
bcolors = {
    "PESSOA": '\033[94m',
    "TEMPO": '\033[92m',
    "LOCAL": '\033[93m',
    "ORGANIZACAO": '\033[91m',
    "JURISPRUDENCIA": '\033[35m',
    "LEGISLACAO": '\033[36m',
    "ENDC": '\033[0m',
    "O": ""
}

# restore the trained model from its saved checkpoint
config = Config()
model = NERModel(config)
model.build()
model.restore_session(config.dir_model)

# interactive loop: read a Portuguese sentence, tag it, echo it colourised
while True:
    sentence = input("Escreva frase a ser analisada: ")
    tokens = word_tokenize(sentence, language='portuguese')
    preds = model.predict(tokens)
    for i, token in enumerate(tokens):
        # strip the BIOES prefix so the tag maps onto a colour key
        if preds[i].startswith(('B-', 'I-', 'E-', 'S-')):
            preds[i] = preds[i][2:]
        print(bcolors[preds[i]] + token + bcolors["ENDC"], end=' ')
    print('\n')
class _EntityBase: def __init__(self, load_lstm): import sys if load_lstm: sys.path.append('/home/rbshaffer/sequence_tagging') from model.ner_model import NERModel from model.config import Config config = Config() # build model self.model = NERModel(config) self.model.build() self.model.restore_session(config.dir_model) def get_chunks(self, parsed): return [] def do_entity_extraction(self, parsed): """ Somewhat complex function to actually do the entity extraction. """ import networkx as nx import textwrap from numpy import mean chunks = self.get_chunks(parsed) def total_edge_count(count_obj, total_counter=0): """ Sub-function to calculate total number of edges in a container. """ if count_obj: ceiling = count_obj.pop(count_obj.keys()[0]) total_counter += sum([min(ceiling, count_obj[c]) for c in count_obj]) total_counter = total_edge_count(count_obj, total_counter) return total_counter def observed_edge_count(raw_obj): """ Sub-function to calculate the observed number of edges in a container. """ observed_counter = 0 for chunk_obj in raw_obj: chunk_entities = {e: chunk_obj.count(e) for e in set(chunk_obj)} observed_counter += total_edge_count(chunk_entities) return observed_counter # container to store all entities extracted, for matching use in-string # maybe consider shifting this inside the loop to only match in-chunk? 
# though note that the output generator currently depends on this all_entities = [] # output container out = [] # iterate over units of analysis, as defined in country-specific functions for chunk in chunks: entity_strings = [] sentences = self.process_doc(chunk) for sent in sentences: entities = [] tags = self.model.predict(sent) for i, t in enumerate(tags): if t == 'B-MISC': entities.append([sent[i]]) elif t == 'I-MISC' and len(entities) > 0: # this condition shouldn't be necessary - need to figure out why this is happening entities[-1].append(sent[i]) new_entities = [' '.join(e) for e in entities] new_entities = ['\n'.join(textwrap.wrap(e.strip(), 20)) for e in new_entities] entity_strings += new_entities all_entities += new_entities out.append(entity_strings) # get the actual output entities_count = {e: all_entities.count(e) for e in set(all_entities)} out = [[e for e in row if e in entities_count] for row in out] edges = {} for chunk in out: if len(set(chunk)) > 1: entities = list(set(chunk)) for i in range(len(entities)): for j in range(i+1, len(entities)): e1 = entities[i] e2 = entities[j] if (e1, e2) in edges: edges[(e1, e2)] += min(chunk.count(e1), chunk.count(e2)) elif (e2, e1) in edges: edges[(e2, e1)] += min(chunk.count(e1), chunk.count(e2)) else: edges[(e1, e2)] = min(chunk.count(e1), chunk.count(e2)) edges = [k + (w,) for k, w in edges.iteritems()] if entities_count: graph = nx.Graph() for u, v, w in edges: graph.add_edge(u, v, weight=w) degree = list(graph.degree(weight='weight').values()) if degree: average_degree = mean(list(graph.degree(weight='weight').values())) else: average_degree = 0 # count_zeroes? 
try: clustering_coeff = nx.average_clustering(graph, weight='weight', count_zeros=True) except ZeroDivisionError: clustering_coeff = 0 else: graph = None clustering_coeff = None average_degree = None total_nodes = len(set(all_entities)) total_edges = sum([e[2] for e in edges]) return {'graph': graph, 'edges': edges, 'total_nodes': total_nodes, 'clustering': clustering_coeff, 'total_edges': total_edges, 'average_degree': average_degree} @staticmethod def process_doc(document): sentences = _nltk.sent_tokenize(document) sentences = [_nltk.word_tokenize(sent) for sent in sentences] return sentences