Example #1
    model_config = {
        # (keys above this point are truncated in the original example)
        'dropout_prob': 0,
        'lstm_layer_num': 1,
        'num_labels': 45
    }

    mymodel = REL_BLSTM_CRF(config=model_config, show_param=True)

    ###===========================================================
    ###model parameter test
    ###===========================================================
    ##pass

    ###===========================================================
    ###trial training -- train_part
    ###===========================================================
    data_set = AutoKGDataset('./d1/')
    train_dataset = data_set.train_dataset[:20]
    eval_dataset = data_set.dev_dataset[:10]
    # train_dataset = data_set.train_dataset
    # eval_dataset = data_set.dev_dataset
    os.makedirs('result', exist_ok=True)
    data_loader = KGDataLoader(data_set, rebuild=False, temp_dir='result/')
    # print(data_loader.embedding_info_dicts['entity_type_dict'])

    print(data_loader.embedding_info_dicts['label_location_dict'])
    show_metadata(data_loader.metadata_)

    print('start_tags:', data_loader.rel_seq_map_dict[data_loader.START_TAG])
    print('end_tags:', data_loader.rel_seq_map_dict[data_loader.END_TAG])

    train_param = {
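        # Truncated here in the original listing. The keys below are a hedged,
        # hypothetical completion -- EPOCH/batch_size/learning_rate are
        # illustrative names only, not recovered from the source.
        'EPOCH': 5,             # hypothetical number of training epochs
        'batch_size': 4,        # hypothetical batch size
        'learning_rate': 1e-3,  # hypothetical optimizer step size
    }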
Example #2
        # for rel, v in temp.items():
        #     print(rel)
        #     print(v[0])
        #     print(v[1])
        return temp


if __name__ == '__main__':

    import os

    # load_bert_pretrained_dict()
    result_dir = './result/'
    data_set = AutoKGDataset('./data/d4/')
    train_dataset = data_set.train_dataset[:200]

    os.makedirs(result_dir, exist_ok=True)

    data_loader = KGDataLoader2(data_set, rebuild=False, temp_dir=result_dir)
    show_dict_info(data_loader)
    
    # train_data_mat_dict = data_loader.transform_rel(train_dataset, istest=False, ratio=0)
    train_data_mat_dict = data_loader.transform(train_dataset, istest=False, data_type='rel', ratio=0)
    data_generator = Batch_Generator(train_data_mat_dict, batch_size=4, data_type='rel', isshuffle=True)
    # # data_generator = Batch_Generator(train_data_mat_dict, batch_size=4, data_type='ent', isshuffle=True)
    
    pred = data_loader.transform_back(train_data_mat_dict, data_type='rel')
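
    # Hedged sketch: pull a few batches to sanity-check the pipeline. The exact
    # layout of each yielded batch is an assumption -- it depends on how
    # KGDataLoader2.transform packs the matrices, which this listing does not show.
    for step, batch in enumerate(data_generator):
        print(f'batch {step}:', type(batch))
        if step >= 2:
            break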
Example #3
def main():
    LOGGER.info("===== Start program")
    LOGGER.info('===== Initialize args')
    args = _parse_args()
    _init_python_path(args)

    LOGGER.info(f'===== task mode: {args.mode}')

    ##load the dataset
    dataset = AutoKGDataset(args.dataset_dir)  
    # show_metadata(dataset.metadata_)

    LOGGER.info('===== Load metadata')
    LOGGER.info(f'===== use_cuda: {args.use_cuda}')
    metadata = dataset.get_metadata()
    args.time_budget = metadata.get('time_budget', args.time_budget)
    LOGGER.info(f'Time budget: {args.time_budget}')

    # data_loader = KGDataLoader1(dataset, rebuild=False, temp_dir=args.result_dir) ##For model REL_BLSTM_CRF
    # data_loader = KGDataLoader2(dataset, rebuild=False, temp_dir=args.result_dir) ##For novel tagging model
    data_loader = KGDataLoader3(dataset, rebuild=False, temp_dir=args.result_dir)   ##For model BERT_Hierarchical
    print('max sentence length: ', data_loader.sentence_max_len)

    # show_dict_info(data_loader)
    # print(data_loader.entity_type_dict)
    # print(list(data_loader.rel_seq_map_dict))

    ## Reload model
    model_params = {
        # 'embedding_dim' : 768,
        # 'hidden_dim' : 64,       
        'n_ent_tags' : len(data_loader.ent_seq_map_dict),  
        # 'n_rel_tags' : len(data_loader.rel_seq_map_dict),  
        'n_rels' : len(data_loader.relation_type_dict),
        'n_words' : len(data_loader.character_location_dict),
        'use_cuda':args.use_cuda,
        'dropout_prob': 0,
        'lstm_layer_num': 1  
    }

    ##TODO:
    ##!!! requires KGDataLoader3
    mymodel = BERT_Hierarchical(model_params, show_param=True)  

    if args.use_cuda:
        train_dataset = dataset.train_dataset
        test_dataset = dataset.test_dataset
        eval_dataset = dataset.dev_dataset
    else:
        train_dataset = dataset.train_dataset[:CPU_TRAIN]
        test_dataset = dataset.test_dataset
        eval_dataset = dataset.dev_dataset[:CPU_EVAL]      

    test_dataset_final = dataset._read_dataset(
        os.path.join(args.answer_dir, 'test.solution')
    )
    test_dataset_final = dataset.check_repeat_sentence(test_dataset_final)  ## remove repeated sentences

    
    if args.mode == 'train':
        LOGGER.info('===== Start Train')
        _train(mymodel, args, data_loader, train_dataset=train_dataset, eval_dataset=eval_dataset, RELOAD_MODEL='model_test.p', use_cuda=args.use_cuda)

        LOGGER.info('===== Start Eval')
        _eval(mymodel, args, data_loader, data_set=test_dataset_final, RELOAD_MODEL='model_test.p', use_cuda=args.use_cuda)

    if args.mode == 'eval':
        LOGGER.info('===== Start Eval')
        _eval(mymodel, args, data_loader, data_set=eval_dataset, RELOAD_MODEL='model_lr_0.01.p', use_cuda=args.use_cuda)

    if args.mode == 'predict':
        LOGGER.info('===== Start Predict')
        _predict(mymodel, args, data_loader, data_set=test_dataset, RELOAD_MODEL='model_test.p', use_cuda=args.use_cuda)


def sequence_mask(lens, max_len):
    # Reconstructed as a standalone helper: the original listing spliced this
    # body into main() without its signature, so the function name and the
    # batch_size line are inferred from the code itself.
    batch_size = lens.size(0)
    ranges = torch.arange(0, max_len).long()  # (max_len)
    if lens.is_cuda:
        ranges = ranges.cuda()
    ranges = ranges.unsqueeze(0).expand(batch_size, max_len)  # (batch_size, max_len)
    lens_exp = lens.unsqueeze(1).expand_as(ranges)  # (batch_size, max_len)
    mask = ranges < lens_exp  # True where the position is inside the sequence
    return mask
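
# Hedged usage sketch for the reconstructed sequence_mask helper above,
# assuming torch is imported as elsewhere in this listing:
#   lens = torch.tensor([3, 1, 4])
#   sequence_mask(lens, max_len=5)
#   -> tensor([[ True,  True,  True, False, False],
#              [ True, False, False, False, False],
#              [ True,  True,  True,  True, False]])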


if __name__ == '__main__':

    ###===========================================================
    ###trial training
    ###===========================================================
    data_set = AutoKGDataset('./data/newdata2/d10')
    train_dataset = data_set.train_dataset
    eval_dataset = data_set.dev_dataset
    # train_dataset = data_set.train_dataset
    # eval_dataset = data_set.dev_dataset

    os.makedirs('result', exist_ok=True)
    data_loader = KGDataLoader3(data_set, rebuild=False, temp_dir='result/')

    model_config = {
        'embedding_dim': 768,
        'n_rels': len(data_loader.relation_type_dict),
        'use_cuda': 1,
        'dropout_prob': 0,
    }
    mymodel = BERT_Hierarchical(model_config, show_param=True)
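
    # Hedged sanity check, assuming BERT_Hierarchical is a torch.nn.Module (as
    # its use_cuda flag suggests): count the trainable parameters after
    # construction.
    n_trainable = sum(p.numel() for p in mymodel.parameters() if p.requires_grad)
    print('trainable parameters:', n_trainable)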
Example #5
def main():
    LOGGER.info("===== Start program")
    LOGGER.info('===== Initialize args')
    args = _parse_args()
    _init_python_path(args)

    LOGGER.info(f'===== task mode: {args.mode}')

    ##load the dataset
    dataset = AutoKGDataset(args.dataset_dir)
    # show_metadata(dataset.metadata_)

    LOGGER.info('===== Load metadata')
    LOGGER.info(f'===== use_cuda: {args.use_cuda}')
    metadata = dataset.get_metadata()
    args.time_budget = metadata.get('time_budget', args.time_budget)
    LOGGER.info(f'Time budget: {args.time_budget}')

    data_loader = KGDataLoader(dataset,
                               rebuild=False,
                               temp_dir=args.result_dir)
    show_dict_info(data_loader)
    print(data_loader.sentence_max_len)

    ## Reload model
    model_params = {
        # 'embedding_dim' : 768,
        # 'hidden_dim' : 64,
        'n_ent_tags': len(data_loader.ent_seq_map_dict),
        'n_rel_tags': len(data_loader.rel_seq_map_dict),
        'n_rels': len(data_loader.label_location_dict) + 1,
        'n_words': len(data_loader.character_location_dict),
        'use_cuda': args.use_cuda,
        'dropout_prob': 0,
        'lstm_layer_num': 1,
        # 'start_ent_idx': data_loader.ent_seq_map_dict[data_loader.START_TAG],  ## <start> tag index for entity tag seq
        # 'end_ent_idx': data_loader.ent_seq_map_dict[data_loader.END_TAG],  ## <end> tag index for entity tag seq
        # 'start_rel_idx': data_loader.rel_seq_map_dict[data_loader.START_TAG],  ## <start> tag index for relation tag seq
        # 'end_rel_idx': data_loader.rel_seq_map_dict[data_loader.END_TAG],  ## <end> tag index for relation tag seq
    }

    ##TODO:
    ### Models with a hand-written CRF: results are about the same, so they are
    ### dropped for now to keep the code simple, not used now ========
    ## mymodel = BLSTM_CRF(model_params, show_param=True)
    ## mymodel = BERT_LSTM_CRF(model_params, show_param=True)
    ## mymodel = BERT_CRF(model_params, show_param=True)
    ## mymodel = REL_BLSTM_CRF(model_params, show_param=True)  ##relation extraction model, not used

    ### Mainly-used comparison models ============================
    # mymodel = BASELINE(model_params, show_param=True)
    # mymodel = BERT_MLP(model_params, show_param=True)
    mymodel = BERT_MLP2(model_params, show_param=True)
    # mymodel = BERT_CRF2(model_params, show_param=True)
    # mymodel = BERT_LSTM_CRF2(model_params, show_param=True)

    if args.use_cuda:
        train_dataset = dataset.train_dataset
        test_dataset = dataset.test_dataset
        eval_dataset = dataset.dev_dataset
    else:
        train_dataset = dataset.train_dataset[:CPU_TRAIN]
        test_dataset = dataset.test_dataset
        eval_dataset = dataset.dev_dataset[:CPU_EVAL]

    test_dataset_final = dataset._read_dataset(
        os.path.join(args.answer_dir, 'test.solution'))
    test_dataset_final = dataset.check_repeat_sentence(
        test_dataset_final)  ## remove repeated sentences

    if args.mode == 'train':
        LOGGER.info('===== Start Train')
        _train(mymodel,
               args,
               data_loader,
               train_dataset=train_dataset,
               eval_dataset=eval_dataset,
               RELOAD_MODEL='model_test.p',
               use_cuda=args.use_cuda)

        LOGGER.info('===== Start Eval')
        _eval(mymodel,
              args,
              data_loader,
              data_set=test_dataset_final,
              RELOAD_MODEL='model_test.p',
              use_cuda=args.use_cuda)

    if args.mode == 'eval':
        LOGGER.info('===== Start Eval')
        _eval(mymodel,
              args,
              data_loader,
              data_set=eval_dataset,
              RELOAD_MODEL='model_test.p',
              use_cuda=args.use_cuda)

    if args.mode == 'predict':
        LOGGER.info('===== Start Predict')
        _predict(mymodel,
                 args,
                 data_loader,
                 data_set=test_dataset,
                 RELOAD_MODEL='model_test.p',
                 use_cuda=args.use_cuda)
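

if __name__ == '__main__':
    # Conventional entry-point guard -- assumed here, not shown in the original
    # listing. _parse_args() defines the real CLI, which this snippet omits.
    main()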