def __init__(self):
    """Load config, data pipeline and model for inference.

    NOTE(review): reads ``model_config`` as a free variable -- it is not a
    parameter here (the sibling ``__init__`` takes it as an argument), so it
    must be a module-level global; confirm it is defined.
    """
    logger.info("...")
    # 0. Load config as an attribute-accessible namespace tree.
    with open(model_config) as fin:
        self.config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))
    # Prefer GPU when available.
    if torch.cuda.is_available():
        self.device = torch.device('cuda')
    else:
        self.device = torch.device('cpu')
    # 1. Load data pipeline (tokenizer/vocab bound to the model directory).
    self.data = Data(vocab_file=os.path.join(self.config.model_path, 'vocab.txt'),
                     max_seq_len=self.config.max_seq_len,
                     model_type=self.config.model_type,
                     config=self.config)
    # 2. Instantiate the network and restore its trained weights.
    self.model = MODEL_MAP[self.config.model_type](self.config)
    self.model = load_torch_model(
        self.model,
        model_path=os.path.join(self.config.model_path, 'model.bin'))
    self.model.to(self.device)
    # Fix: switch to eval mode for inference (freezes dropout/batch-norm),
    # consistent with the other predictor __init__ in this file.
    self.model.eval()
    logger.info("###")
def __init__(self, model_config='sfzyzb/config/bert_config-l.json'):
    """Initialise the predictor: parse config, build data pipeline, load model."""
    # 0. Parse the JSON config into a SimpleNamespace tree.
    with open(model_config) as fin:
        config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))
    # Pick the compute device: GPU when present, otherwise CPU.
    self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    # 1. Data pipeline bound to the model's vocabulary.
    self.data = Data(vocab_file=os.path.join(config.model_path, 'vocab.txt'),
                     max_seq_len=config.max_seq_len,
                     model_type=config.model_type,
                     config=config)
    # 2. Instantiate the network and restore its trained weights.
    self.model = MODEL_MAP[config.model_type](config)
    self.model = load_torch_model(
        self.model, model_path=os.path.join(config.model_path, 'model.bin'))
    self.model.to(self.device)
    self.config = config
    # Inference only: freeze dropout / batch-norm behaviour.
    self.model.eval()
def main(in_file='/data/SMP-CAIL2020-test1.csv',
         temp_file="data/para_content_test.csv",
         out_file='/output/result1.csv',
         model_config='config/robert3_config.json'):
    """Test model for given test set on 1 GPU or CPU.

    Args:
        in_file: JSON-lines file to be tested (one record per line with 'id')
        temp_file: intermediate CSV of (para, content) rows
        out_file: output JSON-lines file of {'id', 'summary'} records
        model_config: config file
    """
    # 0. Load config
    with open(model_config) as fin:
        config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    # 0.1 Collect record ids in input order; id_dict maps row index -> id.
    id_list = []
    with open(in_file, 'r', encoding='utf-8') as fin:
        for line in fin:
            sents = json.loads(line.strip())
            id_list.append(sents['id'])
    id_dict = dict(zip(range(len(id_list)), id_list))

    # 1. Load data
    data = Data(vocab_file=os.path.join(config.model_path, 'vocab.txt'),
                max_seq_len=config.max_seq_len,
                model_type=config.model_type,
                config=config)
    test_set = data.load_file(temp_file, train=False)
    data_loader_test = DataLoader(test_set, batch_size=config.batch_size, shuffle=False)

    # 2. Load model
    model = MODEL_MAP[config.model_type](config)
    model = load_torch_model(model, model_path=os.path.join(config.model_path, 'model.bin'))
    model.to(device)

    # 3. Evaluate: decode the generated token ids back to text.
    answer_list = evaluate(model, data_loader_test, device)
    token_list = [data.tokenizer.decode(ids, skip_special_tokens=True)
                  for ids in answer_list]

    # 4. Concatenate per-paragraph summaries under their record id and write
    #    one JSON object per line.
    para_list = pd.read_csv(temp_file)['para'].to_list()
    summary_dict = dict(zip(id_dict.values(), [""] * len(id_dict)))
    # Fix: renamed the loop variables -- the original shadowed builtin `id`
    # and carried a 'sumamry' typo. `remove` presumably strips unwanted
    # characters; defined elsewhere in the project.
    for para_id, summary in zip(para_list, token_list):
        summary_dict[id_dict[para_id]] += remove(summary).replace(" ", "")
    with open(out_file, 'w', encoding='utf8') as fout:
        for record_id, summary in summary_dict.items():
            fout.write(json.dumps({'id': record_id, 'summary': summary},
                                  ensure_ascii=False) + '\n')
def main(in_file='/data/SMP-CAIL2020-test1.csv', out_file='/output/result1.csv',
         model_config='config/bert_config.json'):
    """Test model for given test set on 1 GPU or CPU.

    Args:
        in_file: JSON-lines file to be tested
        out_file: output CSV of sentences the model kept
        model_config: config file
    """
    # 0. Load config
    with open(model_config) as fin:
        config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    # 0.1 Preprocess the JSON-lines input into (para_id, sentence) rows.
    tag_sents = []
    para_id = 0
    with open(in_file, 'r', encoding='utf-8') as fin:
        for line in fin:
            sents = json.loads(line.strip())
            for item in sents['text']:
                tag_sents.append((para_id, item['sentence']))
            para_id += 1
    # Fix: use the `pd` alias consistently -- this call used the bare
    # `pandas` name while the rest of the function (and file) uses `pd`,
    # which raises NameError when only `import pandas as pd` is present.
    df = pd.DataFrame(tag_sents, columns=['para', 'content'])
    df.to_csv("data/para_content_test.csv", columns=['para', 'content'], index=False)

    # 1. Load data
    data = Data(vocab_file=os.path.join(config.model_path, 'vocab.txt'),
                max_seq_len=config.max_seq_len,
                model_type=config.model_type,
                config=config)
    test_set = data.load_file("data/para_content_test.csv", train=False)
    data_loader_test = DataLoader(test_set, batch_size=config.batch_size, shuffle=False)

    # 2. Load model
    model = MODEL_MAP[config.model_type](config)
    model = load_torch_model(model, model_path=os.path.join(config.model_path, 'model.bin'))
    model.to(device)

    # 3. Evaluate: one binary relevance flag per sentence row.
    answer_list = evaluate(model, data_loader_test, device)

    # 4. Keep only the rows the model flagged and write them out.
    df = pd.read_csv("data/para_content_test.csv")
    idcontent_list = list(df.itertuples(index=False))
    filter_list = [row for row, keep in zip(idcontent_list, answer_list) if keep]
    df = pd.DataFrame(filter_list, columns=['para', 'content'])
    df.to_csv(out_file, columns=['para', 'content'], index=False)
def main(in_file='/input/', out_file='/output/result.txt', model_config='config/bert_config.json'): """Test model for given test set on 1 GPU or CPU. Args: in_file: file to be tested out_file: output file model_config: config file """ # 0. Load config with open(model_config) as fin: config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d)) if torch.cuda.is_available(): device = torch.device('cuda') # device = torch.device('cpu') else: device = torch.device('cpu') # 1. Load data data = Data(vocab_file=os.path.join(config.model_path, 'vocab.txt'), max_seq_len=config.max_seq_len, model_type=config.model_type, config=config) # 1.1 preprocess '/input/' to 'test.csv' file. preprocess(in_file, TEMPFILE) test_set = data.load_file(TEMPFILE, train=False) data_loader_test = DataLoader(test_set, batch_size=config.batch_size, shuffle=False) # 2. Load model model = MODEL_MAP[config.model_type](config) model = load_torch_model(model, model_path=os.path.join(config.model_path, 'model.bin')) model.to(device) # 3. Evaluate answer_list = evaluatex(model, data_loader_test, device) # 4. Write answers to file id_list = pd.read_csv(TEMPFILE)['id'].tolist() result = {} for i, j in zip(id_list, answer_list): if i not in result.keys(): counter = 0 result[i] = [] if j == '1': result[i].append(chr(ord('A') + counter)) counter += 1 json.dump(result, open(out_file, "w", encoding="utf8"), indent=2, ensure_ascii=False, sort_keys=True)
def main(out_file='output/result.json', model_config='config/rnn_config.json'):
    """Test model for given test set on 1 GPU or CPU.

    Args:
        out_file: output file (one comma-joined profile line per user)
        model_config: config file (also provides test_file_path)
    """
    # 0. Load config
    with open(model_config) as fin:
        config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    # 1. Load user-log test data (path comes from the config file).
    data = Data()
    test_set = data.load_user_log(config.test_file_path)
    data_loader_test = DataLoader(test_set, batch_size=config.batch_size, shuffle=False)
    # 2. Load model
    model = MODEL_MAP[config.model_type](config)
    model = load_torch_model(model, model_path=os.path.join(config.model_path, 'model.bin'))
    model.to(device)
    # 3. Evaluate: multi-label prediction per user (second return unused).
    answer_list, _ = evaluate(model, data_loader_test, device, isTest=True)
    # 4. Write answers to file: map each set flag to its profile code.
    # Fix: dropped the unused local `flatten` helper and the stray debug
    # `print(user_profile)` that spammed stdout for every user.
    with open(out_file, 'w', encoding='utf8') as fout:
        for line in answer_list:
            user_profile = [all_code_dic[all_types[i]]
                            for i, e in enumerate(line) if e]
            fout.write(",".join(user_profile) + "\n")
def main(in_file='data/f_test.csv', out_file='/output/result1.csv', model_config='config/bert_config.json'):
    """Test model for given test set on 1 GPU or CPU.

    Args:
        in_file: file to be tested
        out_file: output file
        model_config: config file
    """
    # 0. Parse config into an attribute-style namespace.
    with open(model_config) as fin:
        config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    # 1. Build the data pipeline; train=True so gold labels come back in id_list.
    data = Data(vocab_file=os.path.join(config.model_path, 'vocab.txt'),
                max_seq_len=config.max_seq_len,
                model_type=config.model_type,
                config=config)
    test_set, id_list = data.load_file(in_file, train=True)
    assert len(test_set) == len(id_list)
    data_loader_test = DataLoader(test_set, batch_size=config.batch_size, shuffle=False)
    # 2. Instantiate the network and restore its trained weights.
    model = MODEL_MAP[config.model_type](config)
    model = load_torch_model(model, model_path=os.path.join(config.model_path, 'model.bin'))
    model.to(device)
    # 3. Predict with labels available so metrics can be computed.
    answer_list = evaluate(model, data_loader_test, device, has_label=True)
    # 4. Score predictions and report micro/macro metrics.
    result = single_label_accuracy(answer_list, id_list, config.num_classes, [])
    metrics = gen_micro_macro_result(result)
    print(metrics)
def main(in_file='/data/SMP-CAIL2020-test1.csv', out_file='/output/result1.csv',
         model_config='config/bert_config.json'):
    """Test model for given test set on 1 GPU or CPU.

    Args:
        in_file: file to be tested (CSV with an 'id' column)
        out_file: output CSV of (id, answer) rows
        model_config: config file
    """
    # 0. Load config
    with open(model_config) as fin:
        config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))
    # Fix: both branches selected the CPU (leftover debug edit -- the CUDA
    # line was commented out), making the availability check a no-op and
    # contradicting the docstring. Restored GPU use, consistent with the
    # other entry points in this file.
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    # 1. Load data
    data = Data(vocab_file=os.path.join(config.model_path, 'vocab.txt'),
                max_seq_len=config.max_seq_len,
                model_type=config.model_type,
                config=config)
    test_set = data.load_file(in_file, train=False)
    data_loader_test = DataLoader(test_set, batch_size=config.batch_size, shuffle=False)
    # 2. Load model
    model = MODEL_MAP[config.model_type](config)
    model = load_torch_model(model, model_path=os.path.join(config.model_path, 'model.bin'))
    model.to(device)
    # 3. Evaluate
    answer_list = evaluate(model, data_loader_test, device)
    # 4. Write answers keyed by the input file's id column (explicit encoding).
    id_list = pd.read_csv(in_file)['id'].tolist()
    with open(out_file, 'w', encoding='utf-8') as fout:
        fout.write('id,answer\n')
        for record_id, answer in zip(id_list, answer_list):
            fout.write(str(record_id) + ',' + str(answer) + '\n')
def __init__(self):
    """Load NER vocabularies, config, data pipeline and model for inference.

    NOTE(review): reads the free variables ``args`` and ``model_config`` --
    neither is a parameter here, so both must be module-level globals;
    confirm they are defined before this class is instantiated.
    """
    logger.info("...")
    # Character/tag vocabularies for the sequence-labelling model, stored as
    # JSON (the commented lines show an earlier pickle-based format).
    with open(os.path.join(args.model_folder, 'money_maps.json'), "r") as f:
        # with open(FLAGS.map_file, "rb") as f:
        self.char_to_id, self.id_to_char, self.tag_to_id, self.id_to_tag = json.load(f)
        # pickle.load(f)
    print('json file loaded')
    # 0. Load config as an attribute-accessible namespace tree.
    with open(model_config) as fin:
        self.config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))
    # Pin to the first GPU when available.
    if torch.cuda.is_available():
        self.device = torch.device('cuda:0')
    else:
        self.device = torch.device('cpu')
    # 1. Load data pipeline (tokenizer/vocab bound to the model directory).
    self.data = Data(vocab_file=os.path.join(self.config.model_path, 'vocab.txt'),
                     max_seq_len=self.config.max_seq_len,
                     model_type=self.config.model_type,
                     config=self.config)
    # 2. Load model; weights are mapped onto the chosen device at load time.
    self.model = MODEL_MAP[self.config.model_type](self.config)
    self.model = load_torch_model(
        self.model,
        model_path=os.path.join(self.config.model_path, 'model.bin'),
        device=self.device)
    self.model.to(self.device)
    logger.info("###")
def main(out_file='output/result.json', model_config='config/rnn_config.json'):
    """Test NER model for given test set on 1 GPU or CPU.

    Args:
        out_file: output JSON file of {'id', 'entities'} records
        model_config: config file (also provides test_file_path)
    """
    # 0. Load config
    with open(model_config) as fin:
        config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    # 1. Load data; keep the raw token strings to reconstruct entities later.
    data = Data(vocab_file=os.path.join(config.model_path, 'vocab.txt'),
                max_seq_len=config.max_seq_len,
                model_type=config.model_type,
                config=config)
    test_set, sc_list, label_list = data.load_file(config.test_file_path, train=False)
    token_list = [data.tokenizer.convert_ids_to_tokens(ids) for ids in sc_list]
    data_loader_test = DataLoader(test_set, batch_size=config.batch_size, shuffle=False)

    # 2. Load model
    model = MODEL_MAP[config.model_type](config)
    model = load_torch_model(model, model_path=os.path.join(config.model_path, 'model.bin'))
    model.to(device)

    # 3. Evaluate: per-sequence tag predictions plus true sequence lengths.
    # Fix: removed the large blocks of commented-out scoring/debug code and
    # the unused local `flatten` helper.
    answer_list, length_list = evaluate(model, data_loader_test, device, isTest=True)

    # Fix: close the test file deterministically (was json.load(open(...))).
    with open(config.test_file_path, 'r', encoding='utf-8') as fin:
        test_json = json.load(fin)
    id_list = [item['id'] for item in test_json]

    # Trim padded tokens to each sequence's real length, then convert the
    # (tokens, tags) pairs into entity dictionaries.
    mod_tokens_list = handy_tool(token_list, length_list)
    result = [result_to_json(t, s) for t, s in zip(mod_tokens_list, answer_list)]

    # 4. Write answers to file: unique "word-type" strings per record,
    # skipping single-token 's' entities.
    with open(out_file, 'w', encoding='utf8') as fout:
        result_list = []
        for record_id, item in zip(id_list, result):
            words = [d['word'] + "-" + d['type']
                     for d in item['entities'] if d['type'] != 's']
            # Deduplicate while keeping first-seen order.
            unique_words = list(dict.fromkeys(words))
            result_list.append({'id': record_id, 'entities': unique_words})
        json.dump(result_list, fout, ensure_ascii=False, indent=4)
def main(in_folder='data/test', out_file='output/result.json',
         model_config='config/roberta3_bert_config.json', isValidOrTest=True):
    """Test model for given test set on 1 GPU or CPU.

    Args:
        in_folder: folder of raw inputs (used only when isValidOrTest=False)
        out_file: output file (used only when isValidOrTest=False)
        model_config: config file
        isValidOrTest: True -> score labelled data; False -> predict and dump
    """
    # 0. Load config
    with open(model_config) as fin:
        config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    # 1. Load data
    data = Data(vocab_file=os.path.join(config.model_path, 'vocab.txt'),
                max_seq_len=config.max_seq_len,
                model_type=config.model_type,
                config=config)
    if isValidOrTest:
        # Labelled evaluation data; `imagebits` is part of the stored tuple
        # but unused here.
        imagebits, filenames, labels = read_joblib("data/test.data")
        exam_file = "data/test.data"
    else:
        filenames = preprocess(in_folder, "data/exam.data")
        exam_file = "data/exam.data"
    test_set = data.load_file(exam_file, train=False)
    data_loader_test = DataLoader(test_set, batch_size=config.batch_size, shuffle=False)
    # 2. Load model
    model = MODEL_MAP[config.model_type](config)
    model = load_torch_model(model, model_path=os.path.join(config.model_path, 'model.bin'))
    model.to(device)
    # 3. Evaluate: top-5 class indices per input.
    # Fix: dropped the debug print(answer_list) and the dead commented code.
    answer_list = evaluatetop5(model, data_loader_test, device)
    # 4. Either report accuracy (labelled) or write predictions (unlabelled).
    if isValidOrTest:
        total = len(filenames)
        correct_top1 = 0
        correct_top5 = 0
        for i in range(len(filenames)):
            if int(labels[i]) == answer_list[i][0]:
                correct_top1 += 1
            if int(labels[i]) in answer_list[i]:
                correct_top5 += 1
        print('ACC-T1:', correct_top1 * 100.0 / total, "%\nACC-T5",
              correct_top5 * 100.0 / total, "%")
    else:
        # Fix: the original built pred_result from `labels` unconditionally,
        # raising NameError in this branch where no labels exist. Predictions
        # are now dumped without the (unavailable) gold labels.
        pred_result = [{filenames[i]: answer_list[i]}
                       for i in range(len(filenames))]
        with open(out_file, 'w') as fout:
            json.dump(pred_result, fout, ensure_ascii=False, indent=4)