Example #1
def __pretrain_bertnrdj(dataset, n_labels, seq_length, n_steps, batch_size,
                        dropout, n_layers, load_model_file, dst_model_file):
    init_logging('log/{}-pre-bertnrdj-{}-{}.log'.format(
        cur_script_name, utils.get_machine_name(), str_today),
                 mode='a',
                 to_stdout=True)

    dataset_files = config.DATA_FILES[dataset]

    print('init robert ...')
    bert_config = bertmodel.BertConfig.from_json_file(config.BERT_CONFIG_FILE)
    robert_model = robert.Robert(
        bert_config,
        n_labels=n_labels,
        seq_length=config.BERT_SEQ_LEN,
        is_train=False,
        init_checkpoint=dataset_files['bert_init_checkpoint'])
    print('done')

    yelp_tv_idxs_file = os.path.join(
        config.RES_DIR,
        'yelp/eng-part/yelp-rest-sents-r9-tok-eng-p0_04-tvidxs.txt')
    amazon_tv_idxs_file = os.path.join(
        config.RES_DIR, 'amazon/laptops-reivews-sent-tok-text-tvidxs.txt')
    tv_idxs_file = amazon_tv_idxs_file if dataset == 'se14l' else yelp_tv_idxs_file
    print('loading data ...')
    idxs_train, idxs_valid = datautils.load_train_valid_idxs(tv_idxs_file)
    logging.info('{} valid samples'.format(len(idxs_valid)))
    # idxs_valid = set(idxs_valid)
    valid_aspect_terms_list = __load_terms_list(
        idxs_valid, dataset_files['pretrain_aspect_terms_file'])
    valid_opinion_terms_list = __load_terms_list(
        idxs_valid, dataset_files['pretrain_opinion_terms_file'])
    print('done')

    bertnrdj_model = BertNRDJ(n_labels,
                              config.BERT_EMBED_DIM,
                              hidden_size_lstm=hidden_size_lstm,
                              batch_size=batch_size,
                              model_file=load_model_file,
                              n_lstm_layers=n_layers)
    bertnrdj_model.pretrain(
        robert_model=robert_model,
        train_aspect_tfrec_file=dataset_files[
            'pretrain_train_aspect_tfrec_file'],
        valid_aspect_tfrec_file=dataset_files[
            'pretrain_valid_aspect_tfrec_file'],
        train_opinion_tfrec_file=dataset_files[
            'pretrain_train_opinion_tfrec_file'],
        valid_opinion_tfrec_file=dataset_files[
            'pretrain_valid_opinion_tfrec_file'],
        valid_tokens_file=dataset_files['pretrain_valid_token_file'],
        seq_length=seq_length,
        valid_aspect_terms_list=valid_aspect_terms_list,
        valid_opinion_terms_list=valid_opinion_terms_list,
        n_steps=n_steps,
        batch_size=batch_size,
        dropout=dropout,
        save_file=dst_model_file)
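
The helper __load_terms_list is called above but not shown in this excerpt. Below is a minimal sketch of what it plausibly does, inferred only from the call site; the one-line-per-sentence, comma-separated file format is an assumption, not something confirmed by the source.

def __load_terms_list(sample_idxs, terms_file):
    # Assumption: line i of terms_file lists the terms of sentence i,
    # separated by commas. Keep only the lines whose index is in
    # sample_idxs (the validation split above).
    idx_set = set(sample_idxs)
    terms_list = list()
    with open(terms_file, encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i in idx_set:
                terms = [t.strip() for t in line.strip().split(',') if t.strip()]
                terms_list.append(terms)
    return terms_list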
Example #2
def __train(word_vecs_file, train_tok_texts_file, train_sents_file,
            train_valid_split_file, test_tok_texts_file, test_sents_file,
            load_model_file, task):
    init_logging('log/{}-train-{}-{}.log'.format(
        os.path.splitext(os.path.basename(__file__))[0],
        utils.get_machine_name(), str_today),
                 mode='a',
                 to_stdout=True)

    dst_aspects_file, dst_opinions_file = None, None

    # n_train = 1000
    n_train = -1  # -1: use all training samples
    n_tags = 5
    batch_size = 64
    lr = 0.001
    share_lstm = False

    logging.info(word_vecs_file)
    logging.info('load model {}'.format(load_model_file))
    logging.info(test_sents_file)

    print('loading data ...')
    vocab, word_vecs_matrix = datautils.load_word_vecs(word_vecs_file)

    logging.info('word vec dim: {}, n_words={}'.format(
        word_vecs_matrix.shape[1], word_vecs_matrix.shape[0]))
    train_data, valid_data, test_data = modelutils.get_data_semeval(
        train_sents_file, train_tok_texts_file, train_valid_split_file,
        test_sents_file, test_tok_texts_file, vocab, n_train, task)
    print('done')

    # run training five times and average the test F1 to smooth run-to-run variance
    test_f1s = list()
    for i in range(5):
        logging.info('turn {}'.format(i))
        model = RINANTE(n_tags,
                        word_vecs_matrix,
                        share_lstm,
                        hidden_size_lstm=hidden_size_lstm,
                        model_file=load_model_file,
                        batch_size=batch_size,
                        lamb=lamb)
        test_af1, _ = model.train(train_data,
                                  valid_data,
                                  test_data,
                                  vocab,
                                  n_epochs=n_epochs,
                                  lr=lr,
                                  dst_aspects_file=dst_aspects_file,
                                  dst_opinions_file=dst_opinions_file)
        test_f1s.append(test_af1)
        logging.info('r={} test_f1={:.4f}'.format(i, test_af1))
        tf.reset_default_graph()  # clear the TF1 graph before building the next model
    logging.info('avg_test_f1={:.4f}'.format(sum(test_f1s) / len(test_f1s)))
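
All six examples lean on module-level names that the excerpts omit (str_today, cur_script_name, hidden_size_lstm, n_epochs, lamb, init_logging), plus repo-local modules such as config, utils, datautils and the model classes. A plausible reconstruction of the shared stdlib part follows; every value here is an assumption rather than something taken from the source.

import datetime
import logging
import os

str_today = datetime.date.today().strftime('%y%m%d')  # assumed date format
cur_script_name = os.path.splitext(os.path.basename(__file__))[0]  # mirrors Examples 2/3/5
hidden_size_lstm = 100  # assumed LSTM hidden size
n_epochs = 100          # assumed epoch count
lamb = 0.001            # assumed loss-weighting hyperparameter

def init_logging(log_file, mode='a', to_stdout=True):
    # Minimal stand-in consistent with how the examples call it:
    # log to a file, optionally mirroring records to stdout.
    handlers = [logging.FileHandler(log_file, mode=mode)]
    if to_stdout:
        handlers.append(logging.StreamHandler())
    logging.basicConfig(level=logging.INFO, handlers=handlers,
                        format='%(asctime)s %(message)s')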
Example #3
def __train_nrdj(word_vecs_file, train_tok_texts_file, train_sents_file,
                 train_valid_split_file, test_tok_texts_file, test_sents_file,
                 load_model_file, task):
    init_logging('log/{}-train-{}-{}.log'.format(
        os.path.splitext(os.path.basename(__file__))[0],
        utils.get_machine_name(), str_today),
                 mode='a',
                 to_stdout=True)

    dst_aspects_file = '/home/hldai/data/aspect/semeval14/nrdj-aspects.txt'
    dst_opinions_file = '/home/hldai/data/aspect/semeval14/nrdj-opinions.txt'
    # dst_aspects_file, dst_opinions_file = None, None

    # n_train = 1000
    n_train = -1
    label_opinions = True
    # label_opinions = False
    n_tags = 5 if label_opinions else 3
    # n_tags = 5 if task == 'train' else 3
    batch_size = 20
    lr = 0.001
    share_lstm = False

    logging.info(word_vecs_file)
    logging.info('load model {}'.format(load_model_file))
    logging.info(test_sents_file)

    print('loading data ...')
    vocab, word_vecs_matrix = __load_word_vecs(word_vecs_file)

    logging.info('word vec dim: {}, n_words={}'.format(
        word_vecs_matrix.shape[1], word_vecs_matrix.shape[0]))
    train_data, valid_data, test_data = datautils.get_data_semeval(
        train_sents_file, train_tok_texts_file, train_valid_split_file,
        test_sents_file, test_tok_texts_file, vocab, n_train, label_opinions)
    print('done')

    nrdj = NeuRuleDoubleJoint(n_tags,
                              word_vecs_matrix,
                              share_lstm,
                              hidden_size_lstm=hidden_size_lstm,
                              model_file=load_model_file,
                              batch_size=batch_size)
    nrdj.train(train_data,
               valid_data,
               test_data,
               vocab,
               n_epochs=n_epochs,
               lr=lr,
               dst_aspects_file=dst_aspects_file,
               dst_opinions_file=dst_opinions_file)
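
The two destination paths at the top of this example are hard-coded to one user's home directory. A small, assumed refactor that keeps the same files but makes the base directory configurable:

import os

# Assumed environment variable; falls back to the original location.
base_dir = os.environ.get('ASPECT_DATA_DIR', '/home/hldai/data')
dst_aspects_file = os.path.join(base_dir, 'aspect/semeval14/nrdj-aspects.txt')
dst_opinions_file = os.path.join(base_dir, 'aspect/semeval14/nrdj-opinions.txt')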
Example #4
def __train_bertnrdj(dataset, n_labels, batch_size, model_file, dropout,
                     n_epochs, learning_rate, start_eval_epoch, n_layers):
    init_logging('log/{}-bertnrdj-{}-{}.log'.format(cur_script_name,
                                                    utils.get_machine_name(),
                                                    str_today),
                 mode='a',
                 to_stdout=True)

    dataset_files = config.DATA_FILES[dataset]

    n_train, data_valid = bldatautils.load_train_data_bert_ol(
        dataset_files['train_sents_file'],
        dataset_files['train_valid_split_file'],
        dataset_files['bert_valid_tokens_file'])
    data_test = bldatautils.load_test_data_bert_ol(
        dataset_files['test_sents_file'],
        dataset_files['bert_test_tokens_file'])

    bert_config = bertmodel.BertConfig.from_json_file(config.BERT_CONFIG_FILE)
    bm = robert.Robert(bert_config,
                       n_labels=n_labels,
                       seq_length=config.BERT_SEQ_LEN,
                       is_train=False,
                       init_checkpoint=dataset_files['bert_init_checkpoint'])

    logging.info('batch_size={}, learning_rate={}, dropout={}'.format(
        batch_size, learning_rate, dropout))

    # model_file = dataset_files['pretrained_bertnrdj_file']
    # model_file = None
    bertnrdj_model = BertNRDJ(n_labels,
                              config.BERT_EMBED_DIM,
                              hidden_size_lstm=hidden_size_lstm,
                              batch_size=batch_size,
                              model_file=model_file,
                              n_lstm_layers=n_layers)
    bertnrdj_model.train(
        robert_model=bm,
        train_tfrec_file=dataset_files['train_tfrecord_file'],
        valid_tfrec_file=dataset_files['valid_tfrecord_file'],
        test_tfrec_file=dataset_files['test_tfrecord_file'],
        seq_length=config.BERT_SEQ_LEN,
        n_train=n_train,
        data_valid=data_valid,
        data_test=data_test,
        dropout=dropout,
        start_eval_epoch=start_eval_epoch,
        n_epochs=n_epochs,
        lr=learning_rate,
    )
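
A hypothetical invocation of __train_bertnrdj. The dataset key 'se14l' appears in Example 1; all other values are illustrative assumptions.

__train_bertnrdj(
    dataset='se14l',
    n_labels=5,            # assumed tag-set size
    batch_size=16,
    model_file=None,       # or a pretrained file, as the commented-out
                           # lines in the function suggest
    dropout=0.5,
    n_epochs=50,
    learning_rate=3e-5,    # assumed BERT-style fine-tuning rate
    start_eval_epoch=0,
    n_layers=2)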
Example #5
def __train(word_vecs_file, train_tok_texts_file, train_sents_file,
            train_valid_split_file, test_tok_texts_file, test_sents_file,
            load_model_file, task):
    init_logging('log/{}-train-{}-{}.log'.format(
        os.path.splitext(os.path.basename(__file__))[0],
        utils.get_machine_name(), str_today),
                 mode='a',
                 to_stdout=True)

    dst_aspects_file, dst_opinions_file = None, None

    # n_train = 1000
    n_train = -1
    n_tags = 5
    batch_size = 32
    lr = 0.001
    share_lstm = False

    logging.info(word_vecs_file)
    logging.info('load model {}'.format(load_model_file))
    logging.info(test_sents_file)

    print('loading data ...')
    vocab, word_vecs_matrix = datautils.load_word_vecs(word_vecs_file)

    logging.info('word vec dim: {}, n_words={}'.format(
        word_vecs_matrix.shape[1], word_vecs_matrix.shape[0]))
    train_data, valid_data, test_data = modelutils.get_data_semeval(
        train_sents_file, train_tok_texts_file, train_valid_split_file,
        test_sents_file, test_tok_texts_file, vocab, n_train, task)
    print('done')

    model = RINANTE(n_tags,
                    word_vecs_matrix,
                    share_lstm,
                    hidden_size_lstm=hidden_size_lstm,
                    model_file=load_model_file,
                    batch_size=batch_size,
                    lamb=lamb)
    model.train(train_data,
                valid_data,
                test_data,
                vocab,
                n_epochs=n_epochs,
                lr=lr,
                dst_aspects_file=dst_aspects_file,
                dst_opinions_file=dst_opinions_file)
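
This is the single-run counterpart of Example 2 (batch size 32 instead of 64, no averaging loop). A hypothetical call, with every path a placeholder:

__train(
    word_vecs_file='data/word-vecs.pkl',
    train_tok_texts_file='data/train-tok-texts.txt',
    train_sents_file='data/train-sents.json',
    train_valid_split_file='data/train-valid-split.txt',
    test_tok_texts_file='data/test-tok-texts.txt',
    test_sents_file='data/test-sents.json',
    load_model_file=None,  # train from scratch
    task='train')          # forwarded to modelutils.get_data_semeval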
Example #6
def __pre_train_nrdj(word_vecs_file,
                     tok_texts_file,
                     aspect_terms_file,
                     opinion_terms_file,
                     dst_model_file,
                     task,
                     lamb,
                     lstm_l2,
                     train_word_embeddings=False,
                     load_model_file=None):
    init_logging('log/{}-pre-{}-{}.log'.format(
        os.path.splitext(os.path.basename(__file__))[0],
        utils.get_machine_name(), str_today),
                 mode='a',
                 to_stdout=True)

    # n_train = 1000
    n_train = -1
    label_opinions = True
    # label_opinions = False
    n_tags = 5 if label_opinions else 3
    # n_tags = 5 if task == 'train' else 3
    batch_size = 32
    lr = 0.001
    share_lstm = False

    logging.info(word_vecs_file)
    logging.info(aspect_terms_file)
    logging.info(opinion_terms_file)
    logging.info('dst: {}'.format(dst_model_file))

    print('loading data ...')
    vocab, word_vecs_matrix = __load_word_vecs(word_vecs_file)

    # train_data_src1, valid_data_src1 = __get_data_amazon(vocab, config.AMAZON_TERMS_TRUE1_FILE)
    # train_data_src1, valid_data_src1 = __get_data_amazon(vocab, config.AMAZON_TERMS_TRUE3_FILE)
    train_data_src1, valid_data_src1 = datautils.get_data_amazon(
        vocab, aspect_terms_file, tok_texts_file, 'aspect')
    train_data_src2, valid_data_src2 = datautils.get_data_amazon(
        vocab, opinion_terms_file, tok_texts_file, 'opinion')
    print('done')
    logging.info('train_word_embeddings={} lstm_l2={}'.format(
        train_word_embeddings, lstm_l2))

    nrdj = NeuRuleDoubleJoint(n_tags,
                              word_vecs_matrix,
                              share_lstm,
                              hidden_size_lstm=hidden_size_lstm,
                              train_word_embeddings=train_word_embeddings,
                              lamb=lamb,
                              lstm_l2_src=lstm_l2,
                              model_file=load_model_file,
                              batch_size=batch_size)

    nrdj.pre_train(train_data_src1,
                   valid_data_src1,
                   train_data_src2,
                   valid_data_src2,
                   vocab,
                   n_epochs=50,
                   lr=lr,
                   save_file=dst_model_file)
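
A hypothetical pretraining call whose saved model could then be passed to Example 3 as load_model_file; all file names are placeholders:

__pre_train_nrdj(
    word_vecs_file='data/word-vecs.pkl',
    tok_texts_file='data/amazon/laptops-tok-texts.txt',
    aspect_terms_file='data/amazon/rule-aspect-terms.txt',
    opinion_terms_file='data/amazon/rule-opinion-terms.txt',
    dst_model_file='data/models/nrdj-pretrained',
    task='pretrain',       # accepted but unused in the body above
    lamb=0.001,
    lstm_l2=0.001,
    train_word_embeddings=False,
    load_model_file=None)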