Пример #1
0
    print("Char emb:", char_emb)
    print("Bichar emb:", bichar_emb)
    print("Gaz file:", gaz_file)
    if status == 'train':
        print("Model saved to:", save_model_dir)
    # 立即把stdout缓存内容输出
    sys.stdout.flush()

    if status == 'train':
        data = Data()
        data.model_name = model_name
        data.HP_gpu = gpu
        data.use_bichar = conf_dict['use_bichar']
        data.HP_batch_size = conf_dict['HP_batch_size']  # 1
        data.HP_iteration = conf_dict['HP_iteration']  # 100
        data.HP_lr = conf_dict['HP_lr']  # 0.015
        data.HP_lr_decay = conf_dict['HP_lr_decay']  # 0.5
        data.HP_hidden_dim = conf_dict['HP_hidden_dim']
        data.MAX_SENTENCE_LENGTH = conf_dict['MAX_SENTENCE_LENGTH']
        data.HP_lstm_layer = conf_dict['HP_lstm_layer']
        data_initialization(data, gaz_file, train_file, dev_file, test_file)

        if data.model_name in ['CNN_model', 'LSTM_model']:
            data.generate_instance_with_gaz_2(train_file, 'train')
            data.generate_instance_with_gaz_2(dev_file, 'dev')
            data.generate_instance_with_gaz_2(test_file, 'test')
        elif data.model_name in ['WC-LSTM_model']:
            data.generate_instance_with_gaz_3(train_file, 'train')
            data.generate_instance_with_gaz_3(dev_file, 'dev')
            data.generate_instance_with_gaz_3(test_file, 'test')
        else:
Пример #2
0
    data.model_dir = args.savemodel
    data.dset_dir = args.savedset
    print("aaa", data.dset_dir)
    status = args.status.lower()
    save_model_dir = args.savemodel
    data.HP_gpu = torch.cuda.is_available()
    print("Seed num:", seed_num)
    data.number_normalized = True
    data.word_emb_dir = "../data/glove.6B.100d.txt"

    if status == 'train':
        print("MODEL: train")
        data_initialization(data)
        data.use_char = True
        data.HP_batch_size = 10
        data.HP_lr = 0.015
        data.char_seq_feature = "CNN"
        data.generate_instance('train')
        data.generate_instance('dev')
        data.generate_instance('test')
        data.build_pretrain_emb()
        train(data)
    elif status == 'decode':
        print("MODEL: decode")
        data.load(data.dset_dir)
        data.raw_dir = args.raw
        data.decode_dir = args.output
        data.load_model_dir = args.loadmodel
        data.show_data_summary()
        data.generate_instance('raw')
        print("nbest: %s" % (data.nbest))
Пример #3
0
    print("Seg: ", seg)
    print("Train file:", train_file)
    print("Dev file:", dev_file)
    print("Test file:", test_file)
    print("Raw file:", raw_file)
    print("Char emb:", char_emb)
    print("Bichar emb:", bichar_emb)
    print("Gaz file:", gaz_file)
    if status == 'train':
        print("Model saved to:", save_model_dir)
    sys.stdout.flush()

    if status == 'train':
        data = Data()
        data.HP_gpu = gpu
        data.HP_lr = float(lr)
        data.HP_use_char = False
        data.HP_batch_size = 1
        data.use_bigram = False if bichar_emb is None else True
        data.gaz_dropout = 0.5
        data.norm_gaz_emb = False
        data.HP_fix_gaz_emb = False
        data.MAX_SENTENCE_LENGTH = maxlen
        data_initialization(data, gaz_file, train_file, dev_file, test_file,
                            word_sense_map_file)
        data_build_gold(data, train_gold, dev_gold, test_gold)
        data.generate_instance_with_gaz(train_file, 'train')
        data.generate_instance_with_gaz(dev_file, 'dev')
        data.generate_instance_with_gaz(test_file, 'test')
        data.build_word_pretrain_emb(char_emb)
        data.build_biword_pretrain_emb(bichar_emb)
def main():
    """CLI entry point for NCRF++: parse arguments, configure a ``Data``
    object, and dispatch to training or decoding.

    Configuration comes from either command-line flags (when ``--config``
    is the sentinel string ``'None'``) or a configuration file read via
    ``data.read_config``. A few flags (``--lr``, ``--batch-size``,
    ``--cpu``, ``--model-prefix``, ``--output-tsv``) override either
    source afterwards.
    """
    parser = argparse.ArgumentParser(description='Tuning with NCRF++')
    parser.add_argument('--config', help='Configuration File', default='None')
    parser.add_argument('--wordemb',
                        help='Embedding for words',
                        default='None')
    parser.add_argument('--charemb',
                        help='Embedding for chars',
                        default='None')
    parser.add_argument('--status',
                        choices=['train', 'decode'],
                        help='update algorithm',
                        default='train')
    parser.add_argument('--savemodel',
                        default="data/model/saved_model.lstmcrf.")
    parser.add_argument('--savedset', help='Dir of saved data setting')
    parser.add_argument('--train', default="data/conll03/train.bmes")
    parser.add_argument('--dev', default="data/conll03/dev.bmes")
    parser.add_argument('--test', default="data/conll03/test.bmes")
    parser.add_argument('--seg', default="True")
    parser.add_argument('--random-seed', type=int, default=42)
    parser.add_argument('--lr', type=float)
    parser.add_argument('--batch-size', type=int)
    parser.add_argument('--raw')
    parser.add_argument('--loadmodel')
    parser.add_argument('--output')
    parser.add_argument('--output-tsv')
    parser.add_argument('--model-prefix')
    parser.add_argument('--cpu', action='store_true')

    args = parser.parse_args()

    # Seed every RNG in play (python, torch, numpy) for reproducibility.
    seed_num = args.random_seed
    random.seed(seed_num)
    torch.manual_seed(seed_num)
    np.random.seed(seed_num)

    data = Data()
    data.random_seed = seed_num
    data.HP_gpu = torch.cuda.is_available()
    if args.config == 'None':
        # No config file: take everything from command-line flags.
        data.train_dir = args.train
        data.dev_dir = args.dev
        data.test_dir = args.test
        data.model_dir = args.savemodel
        data.dset_dir = args.savedset
        print("Save dset directory:", data.dset_dir)
        save_model_dir = args.savemodel
        data.word_emb_dir = args.wordemb
        data.char_emb_dir = args.charemb
        # --seg arrives as a string; anything other than "true"
        # (case-insensitive) disables segmentation-style evaluation.
        data.seg = args.seg.lower() == 'true'
        print("Seed num:", seed_num)
    else:
        data.read_config(args.config)
    # Explicit presence checks: a truthiness test would silently drop an
    # explicit --lr 0.0 or --batch-size 0 override.
    if args.lr is not None:
        data.HP_lr = args.lr
    if args.batch_size is not None:
        data.HP_batch_size = args.batch_size
    data.output_tsv_path = args.output_tsv
    if args.cpu:
        data.HP_gpu = False
    if args.model_prefix:
        data.model_dir = args.model_prefix

    # data.show_data_summary()
    # NOTE(review): data.status is presumably defaulted by Data() or set by
    # read_config — it is never assigned from args.status here; confirm.
    status = data.status.lower()
    print("Seed num:", seed_num)

    if status == 'train':
        print("MODEL: train")
        data_initialization(data)
        data.generate_instance('train')
        data.generate_instance('dev')
        data.generate_instance('test')
        data.build_pretrain_emb()
        train(data)
    elif status == 'decode':
        print("MODEL: decode")
        data.load(data.dset_dir)
        # Mirror the sentinel guard used above: only re-read the config
        # file when one was actually supplied, otherwise read_config would
        # try to open a file literally named "None".
        if args.config != 'None':
            data.read_config(args.config)
        print(data.raw_dir)
        # exit(0)
        data.show_data_summary()
        data.generate_instance('raw')
        print("nbest: %s" % (data.nbest))
        decode_results, pred_scores = load_model_decode(data, 'raw')
        if data.nbest and not data.sentence_classification:
            data.write_nbest_decoded_results(decode_results, pred_scores,
                                             'raw')
        else:
            data.write_decoded_results(decode_results, 'raw')
    else:
        # Only 'train' and 'decode' are implemented (and argparse choices
        # only admit those), so list exactly the supported modes.
        print(
            "Invalid argument! Please use valid arguments! (train/decode)"
        )
Пример #5
0
    logger.info("Train file:" + train_file)
    logger.info("Dev file:" + dev_file)
    logger.info("Test file:" + test_file)
    logger.info("Char emb:" + char_emb)
    logger.info("Bichar emb:" + bichar_emb)
    logger.info("Gaz file:" + gaz_file)
    logger.info("Save dir:" + save_dir)
    sys.stdout.flush()

    if args.status == 'train':
        data = Data()
        data.HP_use_char = False
        data.use_bigram = True  # ner: False, cws: True
        data.gaz_dropout = args.gaz_dropout
        data.HP_lr = args.HP_lr  # cws
        data.HP_dropout = args.HP_dropout  # cws
        data.HP_use_glyph = args.HP_use_glyph
        data.HP_glyph_ratio = args.HP_glyph_ratio
        data.HP_font_channels = args.HP_font_channels
        data.HP_glyph_highway = args.HP_glyph_highway
        data.HP_glyph_embsize = args.HP_glyph_embsize
        data.HP_glyph_output_size = args.HP_glyph_output_size
        data.HP_glyph_dropout = args.HP_glyph_dropout
        data.HP_glyph_cnn_dropout = args.HP_glyph_cnn_dropout

        data.HP_iteration = 50  # cws
        data.norm_gaz_emb = True  # ner: False, cws: True

        data.HP_fix_gaz_emb = False
        data_initialization(data, gaz_file, train_file, dev_file, test_file)
Пример #6
0
            data.HP_use_posi = args.use_posi
            data.HP_rethink_iter = args.rethink_iter

        else:
            data = Data()
            data.HP_gpu = gpu
            data.HP_batch_size = args.batch_size
            data.HP_num_layer = args.num_layer
            data.HP_iteration = args.num_iter
            data.use_bigram = args.use_biword
            data.gaz_dropout = 0.5
            data.norm_gaz_emb = False
            data.HP_fix_gaz_emb = False
            data.label_comment = args.labelcomment
            data.result_file = args.resultfile
            data.HP_lr = args.lr
            data.HP_hidden_dim = args.hidden_dim
            data.HP_use_posi = args.use_posi
            data.HP_rethink_iter = args.rethink_iter
            data_initialization(data, gaz_file, train_file, dev_file,
                                test_file)
            data.generate_instance_with_gaz(train_file, 'train')
            data.generate_instance_with_gaz(dev_file, 'dev')
            data.generate_instance_with_gaz(test_file, 'test')
            data.build_word_pretrain_emb(char_emb)
            data.build_biword_pretrain_emb(bichar_emb)
            data.build_gaz_pretrain_emb(gaz_file)

            print('Dumping data')
            with open(save_data_name, 'wb') as f:
                pickle.dump(data, f)
Пример #7
0
 data.model_dir = args.savemodel
 data.dset_dir = args.savedset
 print("aaa",data.dset_dir)
 status = args.status.lower()
 save_model_dir = args.savemodel
 data.HP_gpu = torch.cuda.is_available()
 print("Seed num:",seed_num)
 data.number_normalized = True
 data.word_emb_dir = "../data/glove.6B.100d.txt"
 
 if status == 'train':
     print("MODEL: train")
     data_initialization(data)
     data.use_char = True
     data.HP_batch_size = 10
     data.HP_lr = 0.015
     data.char_seq_feature = "CNN"
     data.generate_instance('train')
     data.generate_instance('dev')
     data.generate_instance('test')
     data.build_pretrain_emb()
     train(data)
 elif status == 'decode':   
     print("MODEL: decode")
     data.load(data.dset_dir)    
     data.raw_dir = args.raw
     data.decode_dir = args.output
     data.load_model_dir = args.loadmodel
     data.show_data_summary()
     data.generate_instance('raw')
     print("nbest: %s"%(data.nbest))
Пример #8
0
    
    logger.info("Train file:" + train_file)
    logger.info("Dev file:" + dev_file)
    logger.info("Test file:" + test_file)
    logger.info("Char emb:" + char_emb)
    logger.info("Bichar emb:" + bichar_emb)
    logger.info("Gaz file:" + gaz_file)
    logger.info("Save dir:" + save_dir)
    sys.stdout.flush()
    
    if args.status == 'train':
        data = Data()
        data.HP_use_char = False
        data.use_bigram = False if 'NER' in args.name else True  # ner: False, cws: True
        data.gaz_dropout = args.gaz_dropout
        data.HP_lr = 0.015 if 'NER' in args.name else 0.01
        data.HP_dropout = args.HP_dropout
        data.HP_use_glyph = args.HP_use_glyph
        data.HP_glyph_ratio = args.HP_glyph_ratio
        data.HP_font_channels = args.HP_font_channels
        data.HP_glyph_highway = args.HP_glyph_highway
        data.HP_glyph_embsize = args.HP_glyph_embsize
        data.HP_glyph_output_size = args.HP_glyph_output_size
        data.HP_glyph_dropout = args.HP_glyph_dropout
        data.HP_glyph_cnn_dropout = args.HP_glyph_cnn_dropout
        data.HP_glyph_batchnorm = args.HP_glyph_batchnorm
        data.HP_glyph_layernorm = args.HP_glyph_layernorm
        data.norm_gaz_emb = False if 'NER' in args.name else True  # ner: False, cws: True

        data.HP_fix_gaz_emb = False
        data_initialization(data, gaz_file, train_file, dev_file, test_file)
Пример #9
0
            data_initialization(data, train_file, dev_file, test_file)

            data.generate_instance(train_file, 'train')
            data.generate_instance(dev_file, 'dev')
            data.generate_instance(test_file, 'test')
            data.HP_gpu = gpu
            model = CWS(data)
            model.load_state_dict(torch.load(model_dir))
            train(model, data, save_model_dir, seg)
        else:
            print('new train parameter')
            data = Data()
            data.HP_gpu = gpu
            data.HP_batch_size = 1
            data.use_bigram = True
            data.HP_lr = 1e-2
            data.HP_dropout = 0.4
            data.HP_iteration = 100
            data_initialization(data, train_file, dev_file, test_file)

            data.generate_instance(train_file, 'train')
            data.generate_instance(dev_file, 'dev')
            data.generate_instance(test_file, 'test')
            data.build_word_pretrain_emb(char_emb)
            data.build_biword_pretrain_emb(bichar_emb)
            # data.build_word_vec_100()
            # attention
            data.cross_domain = cross_domain
            data.cross_test = cross_test
            data.use_attention = use_attention
            data.use_san = use_san