def _evaluate_for_train_valid(self):
    """Score the current model on the train and validation evaluation loaders.

    Runs ``evaluate`` on ``self.data_loader['valid_train']`` and
    ``self.data_loader['valid_valid']``, pairs the stored gold labels with
    the lengths returned by ``evaluate`` via ``handy_tool``, flattens both
    predictions and labels, and computes accuracy and F1 per split.

    Returns:
        train_acc, train_f1, valid_acc, valid_f1
    """
    # Predictions (and the per-sample lengths evaluate() reports) per split.
    pred_train, len_train = evaluate(
        model=self.model,
        data_loader=self.data_loader['valid_train'],
        device=self.device)
    pred_valid, len_valid = evaluate(
        model=self.model,
        data_loader=self.data_loader['valid_valid'],
        device=self.device)

    # Gold labels come from the loader dict rather than from the label files.
    # NOTE(review): handy_tool presumably trims each label sequence to the
    # evaluated length -- confirm against its definition.
    gold_train = handy_tool(self.data_loader['train_label'], len_train)
    gold_valid = handy_tool(self.data_loader['valid_label'], len_valid)

    # Collapse the per-sample sequences into flat lists for scoring.
    pred_train = self.flatten(pred_train)
    pred_valid = self.flatten(pred_valid)
    gold_train = self.flatten(gold_train)
    gold_valid = self.flatten(gold_valid)

    train_acc, train_f1 = calculate_accuracy_f1(gold_train, pred_train)
    valid_acc, valid_f1 = calculate_accuracy_f1(gold_valid, pred_valid)
    return train_acc, train_f1, valid_acc, valid_f1
def bert_classification(self, content):
    """Tag ``content`` with the model and return the words of its 'bms' entities.

    The text is written to a one-row CSV under ``data/`` (named after the
    current timestamp), loaded back through ``self.data.load_file``, run
    through ``evaluate``, and the predicted tags are converted to entities
    with ``result_to_json``.

    Args:
        content: the raw text to analyse.

    Returns:
        ``{"answer": [...]}`` where the list holds the ``'word'`` of every
        entity whose ``'type'`` is ``'bms'``.
    """
    logger.info('1:{}'.format(content))

    # DataFrame.append() was removed in pandas 2.0; build the one-row frame
    # directly instead of appending to an empty frame.
    df = pandas.DataFrame([{'content': content}])
    filename = "data/{}.csv".format(time.time())
    df.to_csv(filename, index=False, columns=['content'])

    test_set, sc_list, _ = self.data.load_file(filename, train=False)

    # Map the id sequences back to token strings for entity extraction.
    token_list = [
        self.data.tokenizer.convert_ids_to_tokens(ids) for ids in sc_list
    ]

    data_loader_test = DataLoader(test_set,
                                  batch_size=self.config.batch_size,
                                  shuffle=False)

    # Evaluate
    answer_list, length_list = evaluate(self.model,
                                        data_loader_test,
                                        self.device,
                                        isTest=True)

    # NOTE(review): handy_tool presumably trims each token sequence to the
    # length reported by evaluate() -- confirm against its definition.
    mod_tokens_list = handy_tool(token_list, length_list)
    result = [
        result_to_json(tokens, tags)
        for tokens, tags in zip(mod_tokens_list, answer_list)
    ]

    entities = self.flatten([item['entities'] for item in result])
    amount_entities = [
        entity['word'] for entity in entities if entity['type'] == 'bms'
    ]
    return {"answer": amount_entities}
def bert_classification(self, content):
    """Tag ``content`` line by line and return labelled entity strings.

    The text is split with ``self.split``, each piece is stored as a
    ``{"text": ...}`` record in a JSON payload written under ``data/``,
    and that file is fed through ``self.data.load_file`` and ``evaluate``.

    Args:
        content: the raw text to analyse.

    Returns:
        ``{"answer": [...]}`` where each element is
        ``"<word>-<all_type_dic[type]>"`` for every entity whose type is
        not ``'s'``, in prediction order.
    """
    logger.info('1:{}'.format(content))

    # NOTE(review): the payload is JSON although the path ends in ".csv";
    # presumably load_file does not depend on the extension -- confirm.
    filename = "data/{}.csv".format(time.time())
    payload = []
    for segment in self.split(content):
        payload.append({"text": segment})
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, indent=4)

    test_set, sc_list, label_list = self.data.load_file(filename, train=False)
    loader = DataLoader(
        test_set, batch_size=self.config.batch_size, shuffle=False)

    # Evaluate
    answer_list, length_list = evaluate(
        self.model, loader, self.device, isTest=True)

    # Map the id sequences back to token strings for entity extraction.
    tokenizer = self.data.tokenizer
    token_list = [tokenizer.convert_ids_to_tokens(ids) for ids in sc_list]

    mod_tokens_list = handy_tool(token_list, length_list)
    parsed = [
        result_to_json(tokens, tags)
        for tokens, tags in zip(mod_tokens_list, answer_list)
    ]

    # Keep every entity except type 's', rendered as "<word>-<label>".
    entity_list = [
        entity['word'] + "-" + all_type_dic[entity['type']]
        for item in parsed
        for entity in item['entities']
        if entity['type'] != 's'
    ]
    return {"answer": entity_list}
def bert_classification(self, content):
    """Tag ``content`` line by line and return all predicted entities.

    The text is split with ``self.split``, written as a one-column CSV
    (``content``) under ``data/``, loaded through ``self.data.load_file``
    and run through ``evaluate``. Entities are extracted from the
    ``row_list`` returned by ``load_file`` rather than from tokenizer output.

    Args:
        content: the raw text to analyse.

    Returns:
        ``{"data": [...]}`` with the entities of all lines flattened into
        one list via ``self.flatten``.
    """
    logger.info('1:{}'.format(content))

    rows = [{'content': segment} for segment in self.split(content)]
    frame = pandas.DataFrame(rows)
    filename = "data/{}.csv".format(time.time())
    frame.to_csv(filename, index=False, columns=['content'])

    test_set, sc_list, label_list, row_list = self.data.load_file(
        filename, train=False)

    loader = DataLoader(test_set,
                        batch_size=self.config.batch_size,
                        shuffle=False)

    # Evaluate
    answer_list, length_list = evaluate(self.model,
                                        loader,
                                        self.device,
                                        isTest=True)

    # NOTE(review): handy_tool presumably trims each row to the length
    # reported by evaluate() -- confirm against its definition.
    trimmed_rows = handy_tool(row_list, length_list)
    parsed = []
    for row, tags in zip(trimmed_rows, answer_list):
        parsed.append(result_to_json(row, tags))

    per_line_entities = [item['entities'] for item in parsed]
    return {"data": self.flatten(per_line_entities)}
def bert_classification(self, content):
    """Tag ``content`` line by line and return the entities of each line.

    The text is split with ``self.split``; every piece becomes an
    ``{"id": index, "text": piece}`` record in a JSON file written under
    ``log/`` (named after the current timestamp). That file is loaded
    through ``self.data.load_file`` and run through ``evaluate``.

    Args:
        content: the raw text to analyse.

    Returns:
        ``{"data": [...]}`` holding one ``'entities'`` list per evaluated
        sample (not flattened).
    """
    logger.info('1:{}'.format(content))

    rows = [
        {"id": i, 'text': line}
        for i, line in enumerate(self.split(content))
    ]
    filename = "log/{}.json".format(time.time())
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(rows, f, ensure_ascii=False, indent=2)

    test_set, sc_list, _ = self.data.load_file(filename, train=False)

    # Map the id sequences back to token strings for entity extraction.
    token_list = [
        self.data.tokenizer.convert_ids_to_tokens(ids) for ids in sc_list
    ]

    data_loader_test = DataLoader(test_set,
                                  batch_size=self.config.batch_size,
                                  shuffle=False)

    # Evaluate
    answer_list, length_list = evaluate(self.model,
                                        data_loader_test,
                                        self.device,
                                        isTest=True)

    # NOTE(review): handy_tool presumably trims each token sequence to the
    # length reported by evaluate() -- confirm against its definition.
    mod_tokens_list = handy_tool(token_list, length_list)
    result = [
        result_to_json(tokens, tags)
        for tokens, tags in zip(mod_tokens_list, answer_list)
    ]

    entity_result = [res["entities"] for res in result]
    return {"data": entity_result}
def main(out_file='output/result.json',
         model_config='config/rnn_config.json'):
    """Test the model on the configured test set on 1 GPU or CPU.

    The test set is read from ``config.test_file_path`` and the model
    weights and vocabulary from ``config.model_path``. For every test
    sample the predicted entities (all types except ``'s'``) are written
    as de-duplicated ``"<word>-<type>"`` strings.

    Args:
        out_file: path of the JSON file the results are written to.
        model_config: path of the JSON model config file.
    """
    # 0. Load config
    with open(model_config) as fin:
        config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # 1. Load data
    data = Data(vocab_file=os.path.join(config.model_path, 'vocab.txt'),
                max_seq_len=config.max_seq_len,
                model_type=config.model_type,
                config=config)
    test_set, sc_list, _ = data.load_file(config.test_file_path, train=False)

    # Map the id sequences back to token strings for entity extraction.
    token_list = [data.tokenizer.convert_ids_to_tokens(ids) for ids in sc_list]

    data_loader_test = DataLoader(test_set,
                                  batch_size=config.batch_size,
                                  shuffle=False)

    # 2. Load model
    model = MODEL_MAP[config.model_type](config)
    model = load_torch_model(model,
                             model_path=os.path.join(config.model_path,
                                                     'model.bin'))
    model.to(device)

    # 3. Evaluate
    answer_list, length_list = evaluate(model,
                                        data_loader_test,
                                        device,
                                        isTest=True)

    # Read the sample ids under a context manager so the handle is closed.
    with open(config.test_file_path, 'r', encoding='utf-8') as fin:
        test_json = json.load(fin)
    id_list = [sample['id'] for sample in test_json]

    # NOTE(review): handy_tool presumably trims each token sequence to the
    # length reported by evaluate() -- confirm against its definition.
    mod_tokens_list = handy_tool(token_list, length_list)
    result = [
        result_to_json(tokens, tags)
        for tokens, tags in zip(mod_tokens_list, answer_list)
    ]

    # 4. Write answers to file
    result_list = []
    for sample_id, parsed in zip(id_list, result):
        words = [
            d['word'] + "-" + d['type']
            for d in parsed['entities']
            if d['type'] != 's'
        ]
        # dict.fromkeys drops duplicates while keeping first-seen order.
        result_list.append({
            'id': sample_id,
            'entities': list(dict.fromkeys(words)),
        })

    # Open the output only once the results exist, so a failure above does
    # not leave a truncated file behind.
    with open(out_file, 'w', encoding='utf8') as fout:
        json.dump(result_list, fout, ensure_ascii=False, indent=4)