예제 #1
0
    def __init__(self):
        """Load inference config, data helper and trained model.

        NOTE(review): ``model_config`` is read below but is neither a
        parameter nor defined in this snippet — presumably a module-level
        constant or closure variable; confirm before reuse.
        """
        logger.info("...")
        # 0. Load config; object_hook turns every JSON object into a
        # SimpleNamespace so values are reachable as attributes.
        with open(model_config) as fin:
            self.config = json.load(fin,
                                    object_hook=lambda d: SimpleNamespace(**d))
        # Prefer the GPU when one is available.
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')
        # 1. Load data: tokenizer/dataset helper built from the model vocab.

        self.data = Data(vocab_file=os.path.join(self.config.model_path,
                                                 'vocab.txt'),
                         max_seq_len=self.config.max_seq_len,
                         model_type=self.config.model_type,
                         config=self.config)

        # 2. Load model: instantiate by configured type, restore weights
        # from model.bin, then move to the selected device.
        self.model = MODEL_MAP[self.config.model_type](self.config)
        self.model = load_torch_model(self.model,
                                      model_path=os.path.join(
                                          self.config.model_path, 'model.bin'))
        self.model.to(self.device)
        logger.info("###")
예제 #2
0
    def __init__(self, model_config='sfzyzb/config/bert_config-l.json'):
        """Load the JSON config, data helper and trained model for inference.

        Args:
            model_config: path to the model JSON config file.
        """
        # Parse the config; object_hook maps each JSON object to a
        # SimpleNamespace for attribute-style access.
        with open(model_config) as config_file:
            config = json.load(
                config_file, object_hook=lambda d: SimpleNamespace(**d))
        # Run on the GPU when one is available, otherwise on the CPU.
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')

        # Tokenization / dataset helper built from the model vocabulary.
        self.data = Data(
            vocab_file=os.path.join(config.model_path, 'vocab.txt'),
            max_seq_len=config.max_seq_len,
            model_type=config.model_type,
            config=config)

        # Instantiate the configured network, restore its trained weights
        # and freeze it in evaluation mode.
        network = MODEL_MAP[config.model_type](config)
        network = load_torch_model(
            network,
            model_path=os.path.join(config.model_path, 'model.bin'))
        network.to(self.device)
        self.model = network
        self.config = config
        self.model.eval()
예제 #3
0
파일: main.py 프로젝트: yueyedeai/CAIL
def main(in_file='/data/SMP-CAIL2020-test1.csv',
         temp_file="data/para_content_test.csv",
         out_file='/output/result1.csv',
         model_config='config/robert3_config.json'):
    """Test model for given test set on 1 GPU or CPU.

    Args:
        in_file: JSON-lines input with one object per line, each carrying
            an ``id`` field
        temp_file: CSV from the preprocessing step; its ``para`` column
            maps each row back to an example index
        out_file: JSON-lines output with ``{'id', 'summary'}`` records
        model_config: path to the model JSON config file
    """
    # 0. Load config; SimpleNamespace gives attribute-style access.
    with open(model_config) as fin:
        config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # 0.1 Collect example ids in input order; index -> id lets us map the
    # per-paragraph rows of temp_file back to their source example.
    id_list = []
    with open(in_file, 'r', encoding='utf-8') as fin:
        for line in fin:
            sents = json.loads(line.strip())
            id_list.append(sents['id'])
    id_dict = dict(enumerate(id_list))

    # 1. Load data
    data = Data(vocab_file=os.path.join(config.model_path, 'vocab.txt'),
                max_seq_len=config.max_seq_len,
                model_type=config.model_type, config=config)
    test_set = data.load_file(temp_file, train=False)
    data_loader_test = DataLoader(
        test_set, batch_size=config.batch_size, shuffle=False)
    # 2. Load model
    model = MODEL_MAP[config.model_type](config)
    model = load_torch_model(
        model, model_path=os.path.join(config.model_path, 'model.bin'))
    model.to(device)
    # 3. Evaluate, then decode predicted token ids back to text.
    answer_list = evaluate(model, data_loader_test, device)
    token_list = [data.tokenizer.decode(line, skip_special_tokens=True)
                  for line in answer_list]
    # 4. Concatenate per-paragraph summaries per example and write answers.
    para_list = pd.read_csv(temp_file)['para'].to_list()
    summary_dict = dict(zip(id_dict.values(), [""] * len(id_dict)))

    for para_id, summary in zip(para_list, token_list):
        # ``remove`` presumably strips markup/noise — defined elsewhere.
        summary_dict[id_dict[para_id]] += remove(summary).replace(" ", "")

    with open(out_file, 'w', encoding='utf8') as fout:
        # (fixed local typo ``sumamry`` -> ``summary``; also stop shadowing
        # the builtin ``id``)
        for example_id, summary in summary_dict.items():
            fout.write(json.dumps({'id': example_id, 'summary': summary},
                                  ensure_ascii=False) + '\n')
예제 #4
0
def main(in_file='/data/SMP-CAIL2020-test1.csv',
         out_file='/output/result1.csv',
         model_config='config/bert_config.json'):
    """Test model for given test set on 1 GPU or CPU.

    Args:
        in_file: JSON-lines file; each line has a ``text`` list of
            ``{'sentence': ...}`` items
        out_file: output CSV containing the sentences the model kept
        model_config: path to the model JSON config file
    """
    # Intermediate CSV shared between preprocessing and inference
    # (was repeated as a literal three times below).
    temp_file = "data/para_content_test.csv"
    # 0. Load config
    with open(model_config) as fin:
        config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # 0.1 Preprocess: flatten every sentence, tagged with its paragraph id.
    tag_sents = []
    with open(in_file, 'r', encoding='utf-8') as fin:
        for para_id, line in enumerate(fin):
            sents = json.loads(line.strip())
            for item in sents['text']:
                tag_sents.append((para_id, item['sentence']))
    # Use the ``pd`` alias consistently (the original mixed ``pandas``
    # and ``pd`` in one function).
    pd.DataFrame(tag_sents, columns=['para', 'content']).to_csv(
        temp_file, columns=['para', 'content'], index=False)

    # 1. Load data
    data = Data(vocab_file=os.path.join(config.model_path, 'vocab.txt'),
                max_seq_len=config.max_seq_len,
                model_type=config.model_type,
                config=config)
    test_set = data.load_file(temp_file, train=False)
    data_loader_test = DataLoader(test_set,
                                  batch_size=config.batch_size,
                                  shuffle=False)
    # 2. Load model
    model = MODEL_MAP[config.model_type](config)
    model = load_torch_model(model,
                             model_path=os.path.join(config.model_path,
                                                     'model.bin'))
    model.to(device)
    # 3. Evaluate: one truthy/falsy label per sentence row.
    answer_list = evaluate(model, data_loader_test, device)
    # 4. Keep only positively-labelled rows and write them out.
    df = pd.read_csv(temp_file)
    idcontent_list = list(df.itertuples(index=False))
    filter_list = [row for row, label in zip(idcontent_list, answer_list)
                   if label]
    pd.DataFrame(filter_list, columns=['para', 'content']).to_csv(
        out_file, columns=['para', 'content'], index=False)
예제 #5
0
def main(in_file='/input/',
         out_file='/output/result.txt',
         model_config='config/bert_config.json'):
    """Test model for given test set on 1 GPU or CPU.

    Args:
        in_file: input folder to be preprocessed and tested
        out_file: output JSON file mapping each id to its selected options
        model_config: path to the model JSON config file
    """
    # 0. Load config
    with open(model_config) as fin:
        config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # 1. Load data
    data = Data(vocab_file=os.path.join(config.model_path, 'vocab.txt'),
                max_seq_len=config.max_seq_len,
                model_type=config.model_type,
                config=config)

    # 1.1 preprocess '/input/' to 'test.csv' file.
    preprocess(in_file, TEMPFILE)
    test_set = data.load_file(TEMPFILE, train=False)
    data_loader_test = DataLoader(test_set,
                                  batch_size=config.batch_size,
                                  shuffle=False)
    # 2. Load model
    model = MODEL_MAP[config.model_type](config)
    model = load_torch_model(model,
                             model_path=os.path.join(config.model_path,
                                                     'model.bin'))
    model.to(device)
    # 3. Evaluate
    answer_list = evaluatex(model, data_loader_test, device)
    # 4. Group answers by question id. Rows for one question are assumed
    # consecutive; option letters follow row position (A, B, C, ...) and
    # an answer of '1' marks that option as selected.
    id_list = pd.read_csv(TEMPFILE)['id'].tolist()
    result = {}
    counter = 0  # row position within the current question (was unbound
                 # before the loop in the original)
    for question_id, answer in zip(id_list, answer_list):
        if question_id not in result:  # idiomatic: no ``.keys()``
            counter = 0
            result[question_id] = []
        if answer == '1':
            result[question_id].append(chr(ord('A') + counter))
        counter += 1
    # Close the output file deterministically instead of leaking the
    # handle (the original passed a bare ``open`` into json.dump).
    with open(out_file, "w", encoding="utf8") as fout:
        json.dump(result,
                  fout,
                  indent=2,
                  ensure_ascii=False,
                  sort_keys=True)
예제 #6
0
파일: main.py 프로젝트: ShenDezhou/CAIL2021
def main(out_file='output/result.json', model_config='config/rnn_config.json'):
    """Test model for given test set on 1 GPU or CPU.

    Args:
        out_file: output file; one comma-joined user profile per line
        model_config: path to the model JSON config file
    """
    # 0. Load config
    with open(model_config) as fin:
        config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # 1. Load data
    data = Data()
    test_set = data.load_user_log(config.test_file_path)
    data_loader_test = DataLoader(test_set,
                                  batch_size=config.batch_size,
                                  shuffle=False)
    # 2. Load model
    model = MODEL_MAP[config.model_type](config)
    model = load_torch_model(model,
                             model_path=os.path.join(config.model_path,
                                                     'model.bin'))
    model.to(device)
    # 3. Evaluate
    answer_list, _ = evaluate(model, data_loader_test, device, isTest=True)

    # 4. Write answers to file. Each prediction is a multi-hot vector over
    # profile types; translate set bits to their code names.
    # (Removed the unused ``flatten`` helper and a leftover debug print.)
    with open(out_file, 'w', encoding='utf8') as fout:
        for line in answer_list:
            user_profile = [all_code_dic[all_types[i]]
                            for i, flag in enumerate(line) if flag]
            fout.write(",".join(user_profile) + "\n")
예제 #7
0
def main(in_file='data/f_test.csv',
         out_file='/output/result1.csv',
         model_config='config/bert_config.json'):
    """Evaluate model on a labelled test set and print micro/macro metrics.

    Args:
        in_file: labelled file to be tested
        out_file: unused; kept for signature compatibility with the other
            test entry points
        model_config: path to the model JSON config file
    """
    # 0. Load config
    with open(model_config) as fin:
        config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # 1. Load data (train=True so gold labels/ids come back with the set).
    data = Data(vocab_file=os.path.join(config.model_path, 'vocab.txt'),
                max_seq_len=config.max_seq_len,
                model_type=config.model_type,
                config=config)
    test_set, id_list = data.load_file(in_file, train=True)
    assert len(test_set) == len(id_list)
    data_loader_test = DataLoader(test_set,
                                  batch_size=config.batch_size,
                                  shuffle=False)
    # 2. Load model
    model = MODEL_MAP[config.model_type](config)
    model = load_torch_model(model,
                             model_path=os.path.join(config.model_path,
                                                     'model.bin'))
    model.to(device)
    # 3. Evaluate
    answer_list = evaluate(model, data_loader_test, device, has_label=True)
    # 4. Score predictions against gold labels and report metrics.
    # (The redundant ``result = []`` pre-assignment is folded into the call.)
    result = single_label_accuracy(answer_list, id_list, config.num_classes,
                                   [])
    metrics = gen_micro_macro_result(result)
    print(metrics)
예제 #8
0
def main(in_file='/data/SMP-CAIL2020-test1.csv',
         out_file='/output/result1.csv',
         model_config='config/bert_config.json'):
    """Test model for given test set on 1 GPU or CPU.

    Args:
        in_file: CSV file to be tested; must contain an ``id`` column
        out_file: output CSV written as ``id,answer`` rows
        model_config: path to the model JSON config file
    """
    # 0. Load config
    with open(model_config) as fin:
        config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))
    # BUG FIX: the CUDA branch previously assigned the CPU device (a debug
    # leftover), so a GPU was never used even when available.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # 1. Load data
    data = Data(vocab_file=os.path.join(config.model_path, 'vocab.txt'),
                max_seq_len=config.max_seq_len,
                model_type=config.model_type,
                config=config)
    test_set = data.load_file(in_file, train=False)
    data_loader_test = DataLoader(test_set,
                                  batch_size=config.batch_size,
                                  shuffle=False)
    # 2. Load model
    model = MODEL_MAP[config.model_type](config)
    model = load_torch_model(model,
                             model_path=os.path.join(config.model_path,
                                                     'model.bin'))
    model.to(device)
    # 3. Evaluate
    answer_list = evaluate(model, data_loader_test, device)
    # 4. Write one ``id,answer`` row per input example.
    id_list = pd.read_csv(in_file)['id'].tolist()
    with open(out_file, 'w') as fout:
        fout.write('id,answer\n')
        for example_id, answer in zip(id_list, answer_list):
            fout.write(str(example_id) + ',' + str(answer) + '\n')
예제 #9
0
    def __init__(self):
        """Load char/tag vocabulary maps, config, data helper and model.

        NOTE(review): ``args`` and ``model_config`` are read below but are
        neither parameters nor defined in this snippet — presumably
        module-level globals; confirm before reuse.
        """
        logger.info("...")
        # Vocabulary maps were migrated from a pickle (FLAGS.map_file) to
        # JSON; the load yields the four mapping dicts in a fixed order.
        with open(os.path.join(args.model_folder,'money_maps.json'), "r") as f:  # with open(FLAGS.map_file, "rb") as f:
            self.char_to_id, self.id_to_char, self.tag_to_id, self.id_to_tag = json.load(f)  # pickle.load(f)
            print('json file loaded')
        # 0. Load config; SimpleNamespace gives attribute-style access.
        with open(model_config) as fin:
            self.config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))
        # Pin to the first GPU when available.
        if torch.cuda.is_available():
            self.device = torch.device('cuda:0')
        else:
            self.device = torch.device('cpu')
        # 1. Load data
        self.data = Data(vocab_file=os.path.join(self.config.model_path, 'vocab.txt'),
                         max_seq_len=self.config.max_seq_len,
                         model_type=self.config.model_type, config=self.config)

        # 2. Load model: instantiate by configured type, restore weights
        # (this variant passes the target device to the loader), move over.
        self.model = MODEL_MAP[self.config.model_type](self.config)
        self.model = load_torch_model(
            self.model, model_path=os.path.join(self.config.model_path, 'model.bin'), device=self.device)
        self.model.to(self.device)
        logger.info("###")
예제 #10
0
def main(out_file='output/result.json', model_config='config/rnn_config.json'):
    """Run NER on the configured test file and write extracted entities.

    Args:
        out_file: output JSON file with ``{'id', 'entities'}`` records
        model_config: path to the model JSON config file (supplies
            ``test_file_path``)
    """
    # 0. Load config
    with open(model_config) as fin:
        config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # 1. Load data
    data = Data(vocab_file=os.path.join(config.model_path, 'vocab.txt'),
                max_seq_len=config.max_seq_len,
                model_type=config.model_type,
                config=config)
    test_set, sc_list, label_list = data.load_file(config.test_file_path,
                                                   train=False)

    # Recover readable tokens for each input sequence so predicted tag
    # spans can be mapped back to text.
    token_list = [data.tokenizer.convert_ids_to_tokens(line)
                  for line in sc_list]

    data_loader_test = DataLoader(test_set,
                                  batch_size=config.batch_size,
                                  shuffle=False)
    # 2. Load model
    model = MODEL_MAP[config.model_type](config)
    model = load_torch_model(model,
                             model_path=os.path.join(config.model_path,
                                                     'model.bin'))
    model.to(device)
    # 3. Evaluate: predictions plus true (unpadded) sequence lengths.
    answer_list, length_list = evaluate(model,
                                        data_loader_test,
                                        device,
                                        isTest=True)

    # (Removed two blocks of commented-out code and the unused ``flatten``
    # helper; close the test-file handle deterministically.)
    with open(config.test_file_path, 'r', encoding='utf-8') as ftest:
        test_json = json.load(ftest)
    id_list = [item['id'] for item in test_json]

    # Trim padding from the token sequences, then pair tokens with tags.
    mod_tokens_list = handy_tool(token_list, length_list)
    result = [
        result_to_json(t, s) for t, s in zip(mod_tokens_list, answer_list)
    ]

    # 4. Write answers to file: deduplicate "word-type" entries per example
    # while preserving first-seen order (dict.fromkeys keeps order).
    with open(out_file, 'w', encoding='utf8') as fout:
        result_list = []
        for example_id, item in zip(id_list, result):
            words = [
                d['word'] + "-" + d['type'] for d in item['entities']
                if d['type'] != 's'
            ]
            result_list.append({
                'id': example_id,
                'entities': list(dict.fromkeys(words)),
            })
        json.dump(result_list, fout, ensure_ascii=False, indent=4)
예제 #11
0
파일: test.py 프로젝트: ShenDezhou/CAIL2021
def main(in_folder='data/test',
         out_file='output/result.json',
         model_config='config/roberta3_bert_config.json',
         isValidOrTest=True):
    """Test model for given test set on 1 GPU or CPU.

    Args:
        in_folder: folder with raw examples (used when isValidOrTest=False)
        out_file: output JSON file (written when isValidOrTest=False)
        model_config: path to the model JSON config file
        isValidOrTest: True -> score against labelled validation data,
            False -> run on ``in_folder`` and dump predictions
    """
    # 0. Load config
    with open(model_config) as fin:
        config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d))
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # 1. Load data
    data = Data(vocab_file=os.path.join(config.model_path, 'vocab.txt'),
                max_seq_len=config.max_seq_len,
                model_type=config.model_type,
                config=config)

    if isValidOrTest:
        imagebits, filenames, labels = read_joblib("data/test.data")
        exam_file = "data/test.data"
    else:
        filenames = preprocess(in_folder, "data/exam.data")
        exam_file = "data/exam.data"
    test_set = data.load_file(exam_file, train=False)
    data_loader_test = DataLoader(test_set,
                                  batch_size=config.batch_size,
                                  shuffle=False)
    # 2. Load model
    model = MODEL_MAP[config.model_type](config)
    model = load_torch_model(model,
                             model_path=os.path.join(config.model_path,
                                                     'model.bin'))
    model.to(device)
    # 3. Evaluate: top-5 class predictions per input.
    answer_list = evaluatetop5(model, data_loader_test, device)
    # 4. Report accuracy (validation) or write predictions (test).
    if isValidOrTest:
        # BUG FIX: the labelled result list was previously built outside
        # this branch and crashed with NameError on ``labels`` whenever
        # isValidOrTest was False.
        total = len(filenames)
        correct_top1 = 0
        correct_top5 = 0
        for label, answers in zip(labels, answer_list):
            if int(label) == answers[0]:
                correct_top1 += 1
            if int(label) in answers:
                correct_top5 += 1
        print('ACC-T1:', correct_top1 * 100.0 / total, "%\nACC-T5",
              correct_top5 * 100.0 / total, "%")
    else:
        # Test/export path has no gold labels: dump filename -> top-5 list.
        pred_result = dict(zip(filenames, answer_list))
        with open(out_file, 'w') as fout:
            json.dump(pred_result, fout, ensure_ascii=False, indent=4)