示例#1
0
def __pretrain_bertnrdj(dataset, n_labels, seq_length, n_steps, batch_size,
                        dropout, n_layers, load_model_file, dst_model_file):
    """Run the pretraining stage of a BertNRDJ model for ``dataset``.

    Configures logging, builds a ``robert.Robert`` model (constructed with
    ``is_train=False``) from the dataset's BERT checkpoint, loads the
    validation aspect/opinion term lists, and finally calls
    ``BertNRDJ.pretrain`` with the dataset's pretraining TFRecord files.

    ``cur_script_name``, ``str_today`` and ``hidden_size_lstm`` are not
    defined in this function; they are read from the enclosing scope.

    Args:
        dataset: Key into ``config.DATA_FILES``. ``'se14l'`` selects the
            Amazon train/valid index file; any other value selects the
            Yelp one.
        n_labels: Passed to both ``robert.Robert`` and ``BertNRDJ``.
        seq_length: Forwarded to ``BertNRDJ.pretrain`` (the Robert model
            itself is built with ``config.BERT_SEQ_LEN``).
        n_steps: Forwarded to ``BertNRDJ.pretrain``.
        batch_size: Passed to the ``BertNRDJ`` constructor and to
            ``pretrain``.
        dropout: Forwarded to ``BertNRDJ.pretrain``.
        n_layers: Passed to ``BertNRDJ`` as ``n_lstm_layers``.
        load_model_file: Passed to ``BertNRDJ`` as ``model_file``.
        dst_model_file: Passed to ``pretrain`` as ``save_file``.
    """
    log_path = 'log/{}-pre-bertnrdj-{}-{}.log'.format(
        cur_script_name, utils.get_machine_name(), str_today)
    init_logging(log_path, mode='a', to_stdout=True)

    files = config.DATA_FILES[dataset]

    print('init robert ...')
    bert_cfg = bertmodel.BertConfig.from_json_file(config.BERT_CONFIG_FILE)
    robert_model = robert.Robert(
        bert_cfg,
        n_labels=n_labels,
        seq_length=config.BERT_SEQ_LEN,
        is_train=False,
        init_checkpoint=files['bert_init_checkpoint'])
    print('done')

    # 'se14l' uses the Amazon index file; every other dataset uses Yelp.
    if dataset == 'se14l':
        tv_idxs_file = os.path.join(
            config.RES_DIR, 'amazon/laptops-reivews-sent-tok-text-tvidxs.txt')
    else:
        tv_idxs_file = os.path.join(
            config.RES_DIR,
            'yelp/eng-part/yelp-rest-sents-r9-tok-eng-p0_04-tvidxs.txt')

    print('loading data ...')
    # Only the validation indices are used below; the training half of the
    # split is discarded.
    _, idxs_valid = datautils.load_train_valid_idxs(tv_idxs_file)
    logging.info('{} valid samples'.format(len(idxs_valid)))
    aspect_terms_valid = __load_terms_list(
        idxs_valid, files['pretrain_aspect_terms_file'])
    opinion_terms_valid = __load_terms_list(
        idxs_valid, files['pretrain_opinion_terms_file'])
    print('done')

    model = BertNRDJ(
        n_labels,
        config.BERT_EMBED_DIM,
        hidden_size_lstm=hidden_size_lstm,
        batch_size=batch_size,
        model_file=load_model_file,
        n_lstm_layers=n_layers)

    pretrain_kwargs = {
        'robert_model': robert_model,
        'train_aspect_tfrec_file': files['pretrain_train_aspect_tfrec_file'],
        'valid_aspect_tfrec_file': files['pretrain_valid_aspect_tfrec_file'],
        'train_opinion_tfrec_file': files['pretrain_train_opinion_tfrec_file'],
        'valid_opinion_tfrec_file': files['pretrain_valid_opinion_tfrec_file'],
        'valid_tokens_file': files['pretrain_valid_token_file'],
        'seq_length': seq_length,
        'valid_aspect_terms_list': aspect_terms_valid,
        'valid_opinion_terms_list': opinion_terms_valid,
        'n_steps': n_steps,
        'batch_size': batch_size,
        'dropout': dropout,
        'save_file': dst_model_file,
    }
    model.pretrain(**pretrain_kwargs)
示例#2
0
def __train_bertnrdj(dataset, n_labels, batch_size, model_file, dropout,
                     n_epochs, learning_rate, start_eval_epoch, n_layers):
    """Train a BertNRDJ model on ``dataset`` and evaluate on valid/test data.

    Configures logging, loads the validation and test data through
    ``bldatautils``, builds a ``robert.Robert`` model (constructed with
    ``is_train=False``), and calls ``BertNRDJ.train`` with the dataset's
    TFRecord files.

    ``cur_script_name``, ``str_today`` and ``hidden_size_lstm`` are not
    defined in this function; they are read from the enclosing scope.

    Args:
        dataset: Key into ``config.DATA_FILES``.
        n_labels: Passed to both ``robert.Robert`` and ``BertNRDJ``.
        batch_size: Passed to the ``BertNRDJ`` constructor (and logged).
        model_file: Passed to ``BertNRDJ`` as ``model_file``.
        dropout: Forwarded to ``BertNRDJ.train`` (and logged).
        n_epochs: Forwarded to ``BertNRDJ.train``.
        learning_rate: Forwarded to ``BertNRDJ.train`` as ``lr`` (and
            logged).
        start_eval_epoch: Forwarded to ``BertNRDJ.train``.
        n_layers: Passed to ``BertNRDJ`` as ``n_lstm_layers``.
    """
    log_path = 'log/{}-bertnrdj-{}-{}.log'.format(
        cur_script_name, utils.get_machine_name(), str_today)
    init_logging(log_path, mode='a', to_stdout=True)

    files = config.DATA_FILES[dataset]

    n_train, data_valid = bldatautils.load_train_data_bert_ol(
        files['train_sents_file'],
        files['train_valid_split_file'],
        files['bert_valid_tokens_file'])
    data_test = bldatautils.load_test_data_bert_ol(
        files['test_sents_file'], files['bert_test_tokens_file'])

    bert_cfg = bertmodel.BertConfig.from_json_file(config.BERT_CONFIG_FILE)
    robert_model = robert.Robert(
        bert_cfg,
        n_labels=n_labels,
        seq_length=config.BERT_SEQ_LEN,
        is_train=False,
        init_checkpoint=files['bert_init_checkpoint'])

    hyper_msg = 'batch_size={}, learning_rate={}, dropout={}'.format(
        batch_size, learning_rate, dropout)
    logging.info(hyper_msg)

    model = BertNRDJ(
        n_labels,
        config.BERT_EMBED_DIM,
        hidden_size_lstm=hidden_size_lstm,
        batch_size=batch_size,
        model_file=model_file,
        n_lstm_layers=n_layers)
    model.train(
        robert_model=robert_model,
        train_tfrec_file=files['train_tfrecord_file'],
        valid_tfrec_file=files['valid_tfrecord_file'],
        test_tfrec_file=files['test_tfrecord_file'],
        seq_length=config.BERT_SEQ_LEN,
        n_train=n_train,
        data_valid=data_valid,
        data_test=data_test,
        dropout=dropout,
        # NOTE(review): keyword is spelled `start_eval_spoch`; presumably
        # that matches the parameter name in BertNRDJ.train -- confirm
        # before renaming.
        start_eval_spoch=start_eval_epoch,
        n_epochs=n_epochs,
        lr=learning_rate,
    )
示例#3
0
def __train_bertlstm_ol(dataset):
    """Train a BertLSTMCRF model via ``train_ol`` on ``dataset``.

    Configures a dated log file, loads the validation and test data through
    ``bldatautils``, builds a ``robert.Robert`` model (constructed with
    ``is_train=False``), and calls ``BertLSTMCRF.train_ol`` with the
    dataset's TFRecord files. Hyperparameters are fixed inside the function.

    ``cur_script_name`` is not defined in this function; it is read from
    the enclosing scope.

    Args:
        dataset: Key into ``config.DATA_FILES``.
    """
    today = datetime.date.today().strftime('%y-%m-%d')
    log_path = 'log/{}-bertlstmcrfol-{}.log'.format(cur_script_name, today)
    init_logging(log_path, mode='a', to_stdout=True)

    # Fixed hyperparameters for this run.
    n_labels, lstm_hidden = 5, 200
    batch_size, n_epochs = 16, 100
    dropout = 0.5

    files = config.DATA_FILES[dataset]

    n_train, data_valid = bldatautils.load_train_data_bert_ol(
        files['train_sents_file'],
        files['train_valid_split_file'],
        files['bert_valid_tokens_file'])
    data_test = bldatautils.load_test_data_bert_ol(
        files['test_sents_file'], files['bert_test_tokens_file'])

    bert_cfg = bertmodel.BertConfig.from_json_file(config.BERT_CONFIG_FILE)
    robert_model = robert.Robert(
        bert_cfg,
        n_labels=n_labels,
        seq_length=config.BERT_SEQ_LEN,
        is_train=False,
        init_checkpoint=files['bert_init_checkpoint'])

    model = BertLSTMCRF(
        n_labels,
        config.BERT_EMBED_DIM,
        hidden_size_lstm=lstm_hidden,
        batch_size=batch_size)
    model.train_ol(
        robert_model=robert_model,
        train_tfrec_file=files['train_tfrecord_file'],
        valid_tfrec_file=files['valid_tfrecord_file'],
        test_tfrec_file=files['test_tfrecord_file'],
        seq_length=config.BERT_SEQ_LEN,
        n_train=n_train,
        data_valid=data_valid,
        data_test=data_test,
        n_epochs=n_epochs,
        dropout=dropout)
示例#4
0
def __train_bertlstm_ol(dataset='se15r'):
    """Train a BertLSTMCRF model via ``train_ol`` on ``dataset``.

    Configures a dated log file, loads the validation and test data through
    ``bldatautils``, builds a ``robert.Robert`` model (constructed with
    ``is_train=False``), and calls ``BertLSTMCRF.train_ol`` with the
    dataset's TFRecord files.

    Args:
        dataset: Key into ``config.DATA_FILES``. Defaults to ``'se15r'``,
            the value that used to be hard-coded, so calling the function
            with no arguments behaves exactly as before. Pass another key
            (e.g. ``'se14r'``) to train on a different dataset without
            editing this function.
    """
    str_today = datetime.date.today().strftime('%y-%m-%d')
    init_logging('log/bertlstmcrfol3-{}.log'.format(str_today),
                 mode='a',
                 to_stdout=True)

    n_labels = 5

    dataset_files = config.DATA_FILES[dataset]

    n_train, data_valid = bldatautils.load_train_data_bert_ol(
        dataset_files['train_sents_file'],
        dataset_files['train_valid_split_file'],
        dataset_files['bert_valid_tokens_file'])
    data_test = bldatautils.load_test_data_bert_ol(
        dataset_files['test_sents_file'],
        dataset_files['bert_test_tokens_file'])

    bert_config = bertmodel.BertConfig.from_json_file(config.BERT_CONFIG_FILE)
    bm = robert.Robert(bert_config,
                       n_labels=n_labels,
                       seq_length=config.BERT_SEQ_LEN,
                       is_train=False,
                       init_checkpoint=dataset_files['bert_init_checkpoint'])

    # LSTM hidden size and batch size are fixed for this entry point.
    lstmcrf = BertLSTMCRF(n_labels,
                          config.BERT_EMBED_DIM,
                          hidden_size_lstm=500,
                          batch_size=5)
    lstmcrf.train_ol(robert_model=bm,
                     train_tfrec_file=dataset_files['train_tfrecord_file'],
                     valid_tfrec_file=dataset_files['valid_tfrecord_file'],
                     test_tfrec_file=dataset_files['test_tfrecord_file'],
                     seq_length=config.BERT_SEQ_LEN,
                     n_train=n_train,
                     data_valid=data_valid,
                     data_test=data_test)