Example #1
def __train_lstmcrf(word_vecs_file, train_tok_texts_file, train_sents_file, train_valid_split_file, test_tok_texts_file,
                    test_sents_file, load_model_file, task, error_file=None):
    init_logging('log/nr-{}.log'.format(str_today), mode='a', to_stdout=True)

    n_tags = 5 if task == 'both' else 3

    print('loading data ...')
    with open(word_vecs_file, 'rb') as f:
        vocab, word_vecs_matrix = pickle.load(f)

    save_model_file = None
    train_data, valid_data, test_data = datautils.get_data_semeval(
        train_sents_file, train_tok_texts_file, train_valid_split_file,
        test_sents_file, test_tok_texts_file,
        vocab, -1, task)

    # train_data, valid_data = __get_data_semeval(vocab, -1)
    # train_data, valid_data = __get_data_amazon(vocab, config.AMAZON_TERMS_TRUE1_FILE)
    # train_data, valid_data = __get_data_amazon(vocab, config.AMAZON_TERMS_TRUE2_FILE)
    print('done')

    # lstmcrf = LSTMCRF(n_tags, word_vecs_matrix, hidden_size_lstm=hidden_size_lstm)
    lstmcrf = LSTMCRF(n_tags, word_vecs_matrix, hidden_size_lstm=hidden_size_lstm, model_file=load_model_file)
    # print(valid_data.aspects_true_list)
    lstmcrf.train(train_data.word_idxs_list, train_data.labels_list, valid_data.word_idxs_list,
                  valid_data.labels_list, vocab, valid_data.tok_texts, valid_data.aspects_true_list,
                  valid_data.opinions_true_list,
                  n_epochs=n_epochs, save_file=save_model_file, error_file=error_file)
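A recurring detail across these examples is n_tags = 5 if task == 'both' else 3. A minimal sketch of the BIO tag inventories that plausibly sit behind those counts (the exact label strings are an assumption, not taken from this code):

# Single-task extraction (aspect only or opinion only): plain BIO -> 3 tags.
single_task_tags = ['O', 'B', 'I']
# Joint aspect+opinion extraction ('both'): separate B/I per term type -> 5 tags.
joint_task_tags = ['O', 'B-A', 'I-A', 'B-O', 'I-O']
assert len(single_task_tags) == 3 and len(joint_task_tags) == 5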
Example #2
def __train_dlc(word_vecs_file, train_tok_texts_file, train_sents_file, train_valid_split_file, test_tok_texts_file,
                test_sents_file):
    init_logging('log/dlc-jtrain2-{}.log'.format(str_today), mode='a', to_stdout=True)

    # n_train = 1000
    n_train = -1
    label_opinions = True
    # label_opinions = False
    n_tags = 5 if label_opinions else 3
    # n_tags = 5 if task == 'train' else 3
    batch_size = 10
    lr = 0.001
    share_lstm = False

    print('loading data ...')
    with open(word_vecs_file, 'rb') as f:
        vocab, word_vecs_matrix = pickle.load(f)
    logging.info('word vec dim: {}, n_words={}'.format(word_vecs_matrix.shape[1], word_vecs_matrix.shape[0]))

    train_data_src1, valid_data_src1 = datautils.get_data_amazon(
        vocab, pre_aspect_terms_file, pre_tok_texts_file, 'aspect')
    train_data_src2, valid_data_src2 = datautils.get_data_amazon(
        vocab, pre_opinion_terms_file, pre_tok_texts_file, 'opinion')

    train_data, valid_data, test_data = datautils.get_data_semeval(
        train_sents_file, train_tok_texts_file, train_valid_split_file, test_sents_file, test_tok_texts_file,
        vocab, n_train, label_opinions)
    print('done')

    dlc = DSLSTMCRF(word_vecs_matrix, hidden_size_lstm=hidden_size_lstm,
                    model_file=None, batch_size=batch_size)

    dlc.joint_train(train_data_src1, valid_data_src1, train_data_src2, valid_data_src2,
                    train_data, valid_data, test_data, n_epochs=n_epochs, lr=lr)
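Several of these examples unpickle a (vocab, word_vecs_matrix) pair from word_vecs_file. A self-contained sketch that writes a toy file of the same shape (the repo's actual embedding-building script is not shown in this excerpt):

import pickle
import numpy as np

vocab = ['the', 'screen', 'is', 'great']  # toy vocabulary
word_vecs_matrix = np.random.rand(len(vocab), 100).astype(np.float32)
with open('word_vecs_toy.pkl', 'wb') as f:
    # mirrors the `vocab, word_vecs_matrix = pickle.load(f)` reads above
    pickle.dump((vocab, word_vecs_matrix), f)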
Example #3
def __pre_train_nrdj(word_vecs_file,
                     tok_texts_file,
                     aspect_terms_file,
                     opinion_terms_file,
                     dst_model_file,
                     task,
                     load_model_file=None):
    init_logging('log/nrdj-pre-{}.log'.format(str_today),
                 mode='a',
                 to_stdout=True)

    # n_train = 1000
    n_train = -1
    label_opinions = True
    # label_opinions = False
    n_tags = 5 if label_opinions else 3
    # n_tags = 5 if task == 'train' else 3
    batch_size = 20
    lr = 0.001
    share_lstm = False

    print('loading data ...')
    with open(word_vecs_file, 'rb') as f:
        vocab, word_vecs_matrix = pickle.load(f)

    # train_data_src1, valid_data_src1 = __get_data_amazon(vocab, config.AMAZON_TERMS_TRUE1_FILE)
    # train_data_src1, valid_data_src1 = __get_data_amazon(vocab, config.AMAZON_TERMS_TRUE3_FILE)
    train_data_src1, valid_data_src1 = datautils.get_data_amazon(
        vocab, aspect_terms_file, tok_texts_file, 'aspect')
    train_data_src2, valid_data_src2 = datautils.get_data_amazon(
        vocab, opinion_terms_file, tok_texts_file, 'opinion')
    print('done')

    nrdj = NeuRuleDoubleJoint(n_tags,
                              word_vecs_matrix,
                              share_lstm,
                              hidden_size_lstm=hidden_size_lstm,
                              model_file=load_model_file,
                              batch_size=batch_size)

    nrj_train_data_src1 = nrj_train_data_src2 = None
    # if train_mode != 'target-only':
    # nrj_train_data_src1 = NeuRuleDoubleJoint.TrainData(
    #     train_data_src1.word_idxs_list, train_data_src1.labels_list, valid_data_src1.word_idxs_list,
    #     valid_data_src1.labels_list, valid_data_src1.tok_texts, valid_data_src1.aspects_true_list, None
    # )
    # nrj_train_data_src2 = NeuRuleDoubleJoint.TrainData(
    #     train_data_src2.word_idxs_list, train_data_src2.labels_list, valid_data_src2.word_idxs_list,
    #     valid_data_src2.labels_list, valid_data_src2.tok_texts, None,
    #     valid_data_src2.opinions_true_list
    # )

    nrdj.pre_train(train_data_src1,
                   valid_data_src1,
                   train_data_src2,
                   valid_data_src2,
                   vocab,
                   n_epochs=30,
                   lr=lr,
                   save_file=dst_model_file)
Example #4
def __pretrain_lstmcrf(word_vecs_file, pre_tok_texts_file, pre_aspect_terms_file, pre_opinion_terms_file,
                       dst_model_file, task):
    init_logging('log/nr-pre-{}.log'.format(str_today), mode='a', to_stdout=True)

    n_tags = 5 if task == 'both' else 3

    print('loading data ...')
    with open(word_vecs_file, 'rb') as f:
        vocab, word_vecs_matrix = pickle.load(f)

    load_model_file = None
    save_model_file = dst_model_file
    if task == 'both':
        train_data, valid_data = datautils.get_data_amazon_ao(
            vocab, pre_aspect_terms_file, pre_opinion_terms_file, pre_tok_texts_file)
    elif task == 'aspect':
        train_data, valid_data = datautils.get_data_amazon(
            vocab, pre_aspect_terms_file, pre_tok_texts_file, task)
    else:
        train_data, valid_data = datautils.get_data_amazon(
            vocab, pre_opinion_terms_file, pre_tok_texts_file, task)
    print('done')

    # lstmcrf = LSTMCRF(n_tags, word_vecs_matrix, hidden_size_lstm=hidden_size_lstm)
    lstmcrf = LSTMCRF(n_tags, word_vecs_matrix, hidden_size_lstm=hidden_size_lstm, model_file=load_model_file)
    # print(valid_data.aspects_true_list)
    lstmcrf.train(train_data.word_idxs_list, train_data.labels_list, valid_data.word_idxs_list,
                  valid_data.labels_list, vocab, valid_data.tok_texts, valid_data.aspects_true_list,
                  valid_data.opinions_true_list, n_epochs=n_epochs, save_file=save_model_file)
Example #5
def __pretrain_bertnrdj(dataset, n_labels, seq_length, n_steps, batch_size,
                        dropout, n_layers, load_model_file, dst_model_file):
    init_logging('log/{}-pre-bertnrdj-{}-{}.log'.format(
        cur_script_name, utils.get_machine_name(), str_today),
                 mode='a',
                 to_stdout=True)

    dataset_files = config.DATA_FILES[dataset]

    print('init robert ...')
    bert_config = bertmodel.BertConfig.from_json_file(config.BERT_CONFIG_FILE)
    robert_model = robert.Robert(
        bert_config,
        n_labels=n_labels,
        seq_length=config.BERT_SEQ_LEN,
        is_train=False,
        init_checkpoint=dataset_files['bert_init_checkpoint'])
    print('done')

    yelp_tv_idxs_file = os.path.join(
        config.RES_DIR,
        'yelp/eng-part/yelp-rest-sents-r9-tok-eng-p0_04-tvidxs.txt')
    amazon_tv_idxs_file = os.path.join(
        config.RES_DIR, 'amazon/laptops-reivews-sent-tok-text-tvidxs.txt')
    tv_idxs_file = amazon_tv_idxs_file if dataset == 'se14l' else yelp_tv_idxs_file
    print('loading data ...')
    idxs_train, idxs_valid = datautils.load_train_valid_idxs(tv_idxs_file)
    logging.info('{} valid samples'.format(len(idxs_valid)))
    # idxs_valid = set(idxs_valid)
    valid_aspect_terms_list = __load_terms_list(
        idxs_valid, dataset_files['pretrain_aspect_terms_file'])
    valid_opinion_terms_list = __load_terms_list(
        idxs_valid, dataset_files['pretrain_opinion_terms_file'])
    print('done')

    bertnrdj_model = BertNRDJ(n_labels,
                              config.BERT_EMBED_DIM,
                              hidden_size_lstm=hidden_size_lstm,
                              batch_size=batch_size,
                              model_file=load_model_file,
                              n_lstm_layers=n_layers)
    bertnrdj_model.pretrain(
        robert_model=robert_model,
        train_aspect_tfrec_file=dataset_files[
            'pretrain_train_aspect_tfrec_file'],
        valid_aspect_tfrec_file=dataset_files[
            'pretrain_valid_aspect_tfrec_file'],
        train_opinion_tfrec_file=dataset_files[
            'pretrain_train_opinion_tfrec_file'],
        valid_opinion_tfrec_file=dataset_files[
            'pretrain_valid_opinion_tfrec_file'],
        valid_tokens_file=dataset_files['pretrain_valid_token_file'],
        seq_length=seq_length,
        valid_aspect_terms_list=valid_aspect_terms_list,
        valid_opinion_terms_list=valid_opinion_terms_list,
        n_steps=n_steps,
        batch_size=batch_size,
        dropout=dropout,
        save_file=dst_model_file)
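The split in this example comes from a *-tvidxs.txt file read by datautils.load_train_valid_idxs. A self-contained reader for one plausible layout, two lines of space-separated integer indices (the format is an assumption, not documented in this excerpt):

def load_train_valid_idxs_sketch(filename):
    # Assumed layout: line 1 holds train sample indices, line 2 holds
    # validation sample indices, both as space-separated integers.
    with open(filename, encoding='utf-8') as f:
        idxs_train = [int(s) for s in f.readline().split()]
        idxs_valid = [int(s) for s in f.readline().split()]
    return idxs_train, idxs_valid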
Example #6
def __train_bert():
    str_today = datetime.date.today().strftime('%y-%m-%d')
    init_logging('log/bertlstmcrf3-{}.log'.format(str_today),
                 mode='a',
                 to_stdout=True)

    # dataset = 'se14r'
    dataset = 'se15r'

    dataset_files = config.DATA_FILES[dataset]
    if dataset == 'se14l':
        bert_embed_file_train = os.path.join(
            config.SE14_DIR, 'laptops/laptops_train_texts_tok_bert.txt')
        bert_embed_file_test = os.path.join(
            config.SE14_DIR, 'laptops/laptops_test_texts_tok_bert.txt')
        # dst_aspects_file = 'd:/data/aspect/semeval14/lstmcrf-aspects.txt'
        # dst_opinions_file = 'd:/data/aspect/semeval14/lstmcrf-opinions.txt'
    elif dataset == 'se14r':
        bert_embed_file_train = os.path.join(
            config.SE14_DIR,
            'restaurants/restaurants_train_texts_tok_bert.txt')
        bert_embed_file_test = os.path.join(
            config.SE14_DIR, 'restaurants/restaurants_test_texts_tok_bert.txt')
    else:
        bert_embed_file_train = os.path.join(
            config.SE15_DIR,
            'restaurants/restaurants_train_texts_tok_bert.txt')
        bert_embed_file_test = os.path.join(
            config.SE15_DIR, 'restaurants/restaurants_test_texts_tok_bert.txt')

    print('loading data ...')
    data_train, data_valid = bldatautils.load_train_data_bert(
        bert_embed_file_train, dataset_files['train_sents_file'],
        dataset_files['train_valid_split_file'])
    data_test = bldatautils.load_valid_data_bert(
        bert_embed_file_test, dataset_files['test_sents_file'])
    print('done')

    word_embed_dim = len(data_train.word_embed_seqs[0][0])
    n_tags = 5
    n_epochs = 100
    lr = 0.001

    # with open(word_vecs_file, 'rb') as f:
    #     vocab, word_vecs_matrix = pickle.load(f)

    logging.info(dataset_files['test_sents_file'])
    logging.info('token_embed_dim={}'.format(word_embed_dim))

    save_model_file = None
    lstmcrf = BertLSTMCRF(n_tags,
                          word_embed_dim,
                          hidden_size_lstm=500,
                          batch_size=5)
    lstmcrf.train(data_train, data_valid, data_test, n_epochs=n_epochs, lr=lr)
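word_embed_dim = len(data_train.word_embed_seqs[0][0]) relies on a sentences -> tokens -> vector nesting. A toy illustration of that assumed structure:

# One sentence with two tokens, each a 3-dimensional embedding.
word_embed_seqs = [[[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]]
word_embed_dim = len(word_embed_seqs[0][0])
assert word_embed_dim == 3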
Example #7
def __train(word_vecs_file, train_tok_texts_file, train_sents_file,
            train_valid_split_file, test_tok_texts_file, test_sents_file,
            load_model_file, task):
    init_logging('log/{}-train-{}-{}.log'.format(
        os.path.splitext(os.path.basename(__file__))[0],
        utils.get_machine_name(), str_today),
                 mode='a',
                 to_stdout=True)

    dst_aspects_file, dst_opinions_file = None, None

    # n_train = 1000
    n_train = -1
    n_tags = 5
    batch_size = 64
    lr = 0.001
    share_lstm = False

    logging.info(word_vecs_file)
    logging.info('load model {}'.format(load_model_file))
    logging.info(test_sents_file)

    print('loading data ...')
    vocab, word_vecs_matrix = datautils.load_word_vecs(word_vecs_file)

    logging.info('word vec dim: {}, n_words={}'.format(
        word_vecs_matrix.shape[1], word_vecs_matrix.shape[0]))
    train_data, valid_data, test_data = modelutils.get_data_semeval(
        train_sents_file, train_tok_texts_file, train_valid_split_file,
        test_sents_file, test_tok_texts_file, vocab, n_train, task)
    print('done')

    test_f1s = list()
    for i in range(5):
        logging.info('turn {}'.format(i))
        model = RINANTE(n_tags,
                        word_vecs_matrix,
                        share_lstm,
                        hidden_size_lstm=hidden_size_lstm,
                        model_file=load_model_file,
                        batch_size=batch_size,
                        lamb=lamb)
        test_af1, _ = model.train(train_data,
                                  valid_data,
                                  test_data,
                                  vocab,
                                  n_epochs=n_epochs,
                                  lr=lr,
                                  dst_aspects_file=dst_aspects_file,
                                  dst_opinions_file=dst_opinions_file)
        test_f1s.append(test_af1)
        logging.info('r={} test_f1={:.4f}'.format(i, test_af1))
        tf.reset_default_graph()
    logging.info('avg_test_f1={:.4f}'.format(sum(test_f1s) / len(test_f1s)))
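This example averages test F1 over five runs, resetting the TensorFlow graph between rounds. A tiny sketch of the same reporting with a standard deviation added, which makes run-to-run variance visible (scores are placeholders):

import numpy as np

test_f1s = [0.71, 0.73, 0.70, 0.72, 0.74]  # placeholder per-run scores
print('avg_test_f1={:.4f} std={:.4f}'.format(
    float(np.mean(test_f1s)), float(np.std(test_f1s))))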
Example #8
def __train_nrdj(word_vecs_file, train_tok_texts_file, train_sents_file,
                 train_valid_split_file, test_tok_texts_file, test_sents_file,
                 load_model_file, task):
    init_logging('log/{}-train-{}-{}.log'.format(
        os.path.splitext(os.path.basename(__file__))[0],
        utils.get_machine_name(), str_today),
                 mode='a',
                 to_stdout=True)

    dst_aspects_file = '/home/hldai/data/aspect/semeval14/nrdj-aspects.txt'
    dst_opinions_file = '/home/hldai/data/aspect/semeval14/nrdj-opinions.txt'
    # dst_aspects_file, dst_opinions_file = None, None

    # n_train = 1000
    n_train = -1
    label_opinions = True
    # label_opinions = False
    n_tags = 5 if label_opinions else 3
    # n_tags = 5 if task == 'train' else 3
    batch_size = 20
    lr = 0.001
    share_lstm = False

    logging.info(word_vecs_file)
    logging.info('load model {}'.format(load_model_file))
    logging.info(test_sents_file)

    print('loading data ...')
    vocab, word_vecs_matrix = __load_word_vecs(word_vecs_file)

    logging.info('word vec dim: {}, n_words={}'.format(
        word_vecs_matrix.shape[1], word_vecs_matrix.shape[0]))
    train_data, valid_data, test_data = datautils.get_data_semeval(
        train_sents_file, train_tok_texts_file, train_valid_split_file,
        test_sents_file, test_tok_texts_file, vocab, n_train, label_opinions)
    print('done')

    nrdj = NeuRuleDoubleJoint(n_tags,
                              word_vecs_matrix,
                              share_lstm,
                              hidden_size_lstm=hidden_size_lstm,
                              model_file=load_model_file,
                              batch_size=batch_size)
    nrdj.train(train_data,
               valid_data,
               test_data,
               vocab,
               n_epochs=n_epochs,
               lr=lr,
               dst_aspects_file=dst_aspects_file,
               dst_opinions_file=dst_opinions_file)
Example #9
def __train_bertnrdj(dataset, n_labels, batch_size, model_file, dropout,
                     n_epochs, learning_rate, start_eval_epoch, n_layers):
    init_logging('log/{}-bertnrdj-{}-{}.log'.format(cur_script_name,
                                                    utils.get_machine_name(),
                                                    str_today),
                 mode='a',
                 to_stdout=True)

    dataset_files = config.DATA_FILES[dataset]

    n_train, data_valid = bldatautils.load_train_data_bert_ol(
        dataset_files['train_sents_file'],
        dataset_files['train_valid_split_file'],
        dataset_files['bert_valid_tokens_file'])
    data_test = bldatautils.load_test_data_bert_ol(
        dataset_files['test_sents_file'],
        dataset_files['bert_test_tokens_file'])

    bert_config = bertmodel.BertConfig.from_json_file(config.BERT_CONFIG_FILE)
    bm = robert.Robert(bert_config,
                       n_labels=n_labels,
                       seq_length=config.BERT_SEQ_LEN,
                       is_train=False,
                       init_checkpoint=dataset_files['bert_init_checkpoint'])

    logging.info('batch_size={}, learning_rate={}, dropout={}'.format(
        batch_size, learning_rate, dropout))

    # model_file = dataset_files['pretrained_bertnrdj_file']
    # model_file = None
    bertnrdj_model = BertNRDJ(n_labels,
                              config.BERT_EMBED_DIM,
                              hidden_size_lstm=hidden_size_lstm,
                              batch_size=batch_size,
                              model_file=model_file,
                              n_lstm_layers=n_layers)
    bertnrdj_model.train(
        robert_model=bm,
        train_tfrec_file=dataset_files['train_tfrecord_file'],
        valid_tfrec_file=dataset_files['valid_tfrecord_file'],
        test_tfrec_file=dataset_files['test_tfrecord_file'],
        seq_length=config.BERT_SEQ_LEN,
        n_train=n_train,
        data_valid=data_valid,
        data_test=data_test,
        dropout=dropout,
        start_eval_epoch=start_eval_epoch,
        n_epochs=n_epochs,
        lr=learning_rate,
    )
Example #10
def __train(word_vecs_file, train_tok_texts_file, train_sents_file,
            train_valid_split_file, test_tok_texts_file, test_sents_file,
            load_model_file, task):
    init_logging('log/{}-train-{}-{}.log'.format(
        os.path.splitext(os.path.basename(__file__))[0],
        utils.get_machine_name(), str_today),
                 mode='a',
                 to_stdout=True)

    dst_aspects_file, dst_opinions_file = None, None

    # n_train = 1000
    n_train = -1
    n_tags = 5
    batch_size = 32
    lr = 0.001
    share_lstm = False

    logging.info(word_vecs_file)
    logging.info('load model {}'.format(load_model_file))
    logging.info(test_sents_file)

    print('loading data ...')
    vocab, word_vecs_matrix = datautils.load_word_vecs(word_vecs_file)

    logging.info('word vec dim: {}, n_words={}'.format(
        word_vecs_matrix.shape[1], word_vecs_matrix.shape[0]))
    train_data, valid_data, test_data = modelutils.get_data_semeval(
        train_sents_file, train_tok_texts_file, train_valid_split_file,
        test_sents_file, test_tok_texts_file, vocab, n_train, task)
    print('done')

    model = RINANTE(n_tags,
                    word_vecs_matrix,
                    share_lstm,
                    hidden_size_lstm=hidden_size_lstm,
                    model_file=load_model_file,
                    batch_size=batch_size,
                    lamb=lamb)
    model.train(train_data,
                valid_data,
                test_data,
                vocab,
                n_epochs=n_epochs,
                lr=lr,
                dst_aspects_file=dst_aspects_file,
                dst_opinions_file=dst_opinions_file)
Example #11
def __train_bertlstm_ol(dataset):
    str_today = datetime.date.today().strftime('%y-%m-%d')
    init_logging('log/{}-bertlstmcrfol-{}.log'.format(cur_script_name,
                                                      str_today),
                 mode='a',
                 to_stdout=True)

    n_labels = 5
    hidden_size_lstm = 200
    batch_size = 16
    n_epochs = 100
    dropout = 0.5

    dataset_files = config.DATA_FILES[dataset]

    n_train, data_valid = bldatautils.load_train_data_bert_ol(
        dataset_files['train_sents_file'],
        dataset_files['train_valid_split_file'],
        dataset_files['bert_valid_tokens_file'])
    data_test = bldatautils.load_test_data_bert_ol(
        dataset_files['test_sents_file'],
        dataset_files['bert_test_tokens_file'])

    bert_config = bertmodel.BertConfig.from_json_file(config.BERT_CONFIG_FILE)
    bm = robert.Robert(bert_config,
                       n_labels=n_labels,
                       seq_length=config.BERT_SEQ_LEN,
                       is_train=False,
                       init_checkpoint=dataset_files['bert_init_checkpoint'])

    lstmcrf = BertLSTMCRF(n_labels,
                          config.BERT_EMBED_DIM,
                          hidden_size_lstm=hidden_size_lstm,
                          batch_size=batch_size)
    lstmcrf.train_ol(robert_model=bm,
                     train_tfrec_file=dataset_files['train_tfrecord_file'],
                     valid_tfrec_file=dataset_files['valid_tfrecord_file'],
                     test_tfrec_file=dataset_files['test_tfrecord_file'],
                     seq_length=config.BERT_SEQ_LEN,
                     n_train=n_train,
                     data_valid=data_valid,
                     data_test=data_test,
                     n_epochs=n_epochs,
                     dropout=dropout)
Example #12
def __train_nrdj(word_vecs_file, train_tok_texts_file, train_sents_file,
                 train_valid_split_file, test_tok_texts_file, test_sents_file,
                 load_model_file, task):
    init_logging('log/nrdj-train-ns1-{}.log'.format(str_today),
                 mode='a',
                 to_stdout=True)

    # n_train = 1000
    n_train = -1
    label_opinions = True
    # label_opinions = False
    n_tags = 5 if label_opinions else 3
    # n_tags = 5 if task == 'train' else 3
    batch_size = 10
    lr = 0.001
    share_lstm = False

    print('loading data ...')
    with open(word_vecs_file, 'rb') as f:
        vocab, word_vecs_matrix = pickle.load(f)
    logging.info('word vec dim: {}, n_words={}'.format(
        word_vecs_matrix.shape[1], word_vecs_matrix.shape[0]))
    train_data, valid_data, test_data = datautils.get_data_semeval(
        train_sents_file, train_tok_texts_file, train_valid_split_file,
        test_sents_file, test_tok_texts_file, vocab, n_train, label_opinions)
    print('done')

    nrdj = NeuRuleDoubleJoint(n_tags,
                              word_vecs_matrix,
                              share_lstm,
                              hidden_size_lstm=hidden_size_lstm,
                              model_file=load_model_file,
                              batch_size=batch_size)

    nrdj.train(train_data,
               valid_data,
               test_data,
               vocab,
               n_epochs=n_epochs,
               lr=lr)
Example #13
def __train_lstmcrf_manual_feat():
    init_logging('log/nrmf-{}.log'.format(str_today), mode='a', to_stdout=True)
    hidden_size_lstm = 100
    n_epochs = 200
    n_tags = 5

    train_aspect_rule_result_file = 'd:/data/aspect/semeval14/laptops/laptops-train-aspect-rule-result.txt'
    train_opinion_rule_result_file = 'd:/data/aspect/semeval14/laptops/laptops-train-opinion-rule-result.txt'
    valid_aspect_rule_result_file = 'd:/data/aspect/semeval14/laptops/laptops-test-aspect-rule-result.txt'
    valid_opinion_rule_result_file = 'd:/data/aspect/semeval14/laptops/laptops-test-opinion-rule-result.txt'

    print('loading data ...')
    with open(config.SE14_LAPTOP_GLOVE_WORD_VEC_FILE, 'rb') as f:
        vocab, word_vecs_matrix = pickle.load(f)
    train_data, valid_data = datautils.get_data_semeval(
        config.SE14_LAPTOP_TRAIN_SENTS_FILE, config.SE14_LAPTOP_TRAIN_TOK_TEXTS_FILE,
        config.SE14_LAPTOP_TEST_SENTS_FILE, config.SE14_LAPTOP_TEST_TOK_TEXTS_FILE,
        vocab, -1, 'both')

    train_aspect_feat_list = __get_manual_feat(
        config.SE14_LAPTOP_TRAIN_TOK_TEXTS_FILE, train_aspect_rule_result_file)
    train_opinion_feat_list = __get_manual_feat(
        config.SE14_LAPTOP_TRAIN_TOK_TEXTS_FILE, train_opinion_rule_result_file)
    train_feat_list = __merge_feat_list(train_aspect_feat_list, train_opinion_feat_list)

    valid_aspect_feat_list = __get_manual_feat(
        config.SE14_LAPTOP_TEST_TOK_TEXTS_FILE, valid_aspect_rule_result_file)
    valid_opinion_feat_list = __get_manual_feat(
        config.SE14_LAPTOP_TEST_TOK_TEXTS_FILE, valid_opinion_rule_result_file)
    valid_feat_list = __merge_feat_list(valid_aspect_feat_list, valid_opinion_feat_list)

    manual_feat_len = train_feat_list[0].shape[1]
    print('manual feat len: {}'.format(manual_feat_len))
    lstmcrf = LSTMCRF(n_tags, word_vecs_matrix, hidden_size_lstm=hidden_size_lstm, manual_feat_len=manual_feat_len)
    # print(valid_data.aspects_true_list)
    lstmcrf.train(train_data.word_idxs_list, train_data.labels_list, valid_data.word_idxs_list,
                  valid_data.labels_list, vocab, valid_data.tok_texts, valid_data.aspects_true_list,
                  valid_data.opinions_true_list, train_feat_list=train_feat_list, valid_feat_list=valid_feat_list,
                  n_epochs=n_epochs)
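manual_feat_len is read from train_feat_list[0].shape[1], so each sentence's features form a tokens x feature_dim matrix. A toy sketch of what a merge like __merge_feat_list plausibly does, concatenating aspect-rule and opinion-rule features along the feature axis (an assumption about its behavior):

import numpy as np

# One sentence of 4 tokens: 2 aspect-rule features and 3 opinion-rule features.
aspect_feat_list = [np.zeros((4, 2), dtype=np.float32)]
opinion_feat_list = [np.ones((4, 3), dtype=np.float32)]
merged_feat_list = [np.concatenate(pair, axis=1)
                    for pair in zip(aspect_feat_list, opinion_feat_list)]
assert merged_feat_list[0].shape == (4, 5)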
Example #14
def __train_bertlstm_ol():
    str_today = datetime.date.today().strftime('%y-%m-%d')
    init_logging('log/bertlstmcrfol3-{}.log'.format(str_today),
                 mode='a',
                 to_stdout=True)

    # dataset = 'se14r'
    dataset = 'se15r'
    n_labels = 5

    dataset_files = config.DATA_FILES[dataset]

    n_train, data_valid = bldatautils.load_train_data_bert_ol(
        dataset_files['train_sents_file'],
        dataset_files['train_valid_split_file'],
        dataset_files['bert_valid_tokens_file'])
    data_test = bldatautils.load_test_data_bert_ol(
        dataset_files['test_sents_file'],
        dataset_files['bert_test_tokens_file'])

    bert_config = bertmodel.BertConfig.from_json_file(config.BERT_CONFIG_FILE)
    bm = robert.Robert(bert_config,
                       n_labels=n_labels,
                       seq_length=config.BERT_SEQ_LEN,
                       is_train=False,
                       init_checkpoint=dataset_files['bert_init_checkpoint'])

    lstmcrf = BertLSTMCRF(n_labels,
                          config.BERT_EMBED_DIM,
                          hidden_size_lstm=500,
                          batch_size=5)
    lstmcrf.train_ol(robert_model=bm,
                     train_tfrec_file=dataset_files['train_tfrecord_file'],
                     valid_tfrec_file=dataset_files['valid_tfrecord_file'],
                     test_tfrec_file=dataset_files['test_tfrecord_file'],
                     seq_length=config.BERT_SEQ_LEN,
                     n_train=n_train,
                     data_valid=data_valid,
                     data_test=data_test)
Example #15
import datetime
import os
import numpy as np
import logging
from utils import utils
from utils.loggingutils import init_logging
from models.linearrank import LinearRank
import config

str_today = datetime.date.today().strftime('%y-%m-%d')
init_logging('log/flat-yelp-{}.log'.format(str_today),
             mode='a',
             to_stdout=True)


def __run_linearrank(training_instances_file, val_linked_mentions_file,
                     test_linked_mentions_file):
    flat_model = LinearRank(training_instances_file, val_linked_mentions_file,
                            test_linked_mentions_file,
                            config.YELP_CANDIDATES_FILE, cand_feat_files,
                            config.YELP_MENTION_ID_IDX_FILE,
                            config.YELP_BIZ_ID_TO_IDX_FILE, mention_feat_files,
                            biz_feat_files, learning_rate, n_epochs, l2_reg,
                            batch_size)
    acc_list = list()
    for i in range(n_rounds):
        print('Round {}'.format(i))
        acc = flat_model.train()
        acc_list.append(acc)

    avg_best_acc = sum(acc_list) / len(acc_list)
    logging.info('avg_best_acc={:.4f}'.format(avg_best_acc))
Example #16
def __train_neurule_double_joint():
    init_logging('log/nrdj-{}.log'.format(str_today), mode='a', to_stdout=True)

    # n_train = 1000
    n_train = -1
    # task = 'pretrain'
    task = 'train'
    label_opinions = True
    n_tags = 5 if label_opinions else 3
    # n_tags = 5 if task == 'train' else 3
    batch_size = 20
    lr = 0.001
    share_lstm = False
    train_mode = 'target-only'

    print('loading data ...')
    with open(config.SE14_LAPTOP_GLOVE_WORD_VEC_FILE, 'rb') as f:
        vocab, word_vecs_matrix = pickle.load(f)
    train_data_tar, valid_data_tar = datautils.get_data_semeval(
        config.SE14_LAPTOP_TRAIN_SENTS_FILE, config.SE14_LAPTOP_TRAIN_TOK_TEXTS_FILE,
        config.SE14_LAPTOP_TEST_SENTS_FILE, config.SE14_LAPTOP_TEST_TOK_TEXTS_FILE,
        vocab, n_train, label_opinions)
    # train_data_src1, valid_data_src1 = __get_data_amazon(vocab, config.AMAZON_TERMS_TRUE1_FILE)
    # train_data_src1, valid_data_src1 = __get_data_amazon(vocab, config.AMAZON_TERMS_TRUE3_FILE)
    train_data_src1, valid_data_src1 = datautils.get_data_amazon(
        vocab, config.AMAZON_TERMS_TRUE2_FILE, config.AMAZON_TOK_TEXTS_FILE, 'aspect')
    train_data_src2, valid_data_src2 = datautils.get_data_amazon(
        vocab, config.AMAZON_TERMS_TRUE4_FILE, config.AMAZON_TOK_TEXTS_FILE, 'opinion')
    rule_model_file = config.LAPTOP_NRDJ_RULE_MODEL_FILE if task == 'train' else None
    # rule_model_file = None
    pretrain_model_file = config.LAPTOP_NRDJ_RULE_MODEL_FILE
    save_model_file = config.LAPTOP_NRDJ_RULE_MODEL_FILE
    print('done')

    nrdj = NeuRuleDoubleJoint(n_tags, word_vecs_matrix, share_lstm,
                              hidden_size_lstm=hidden_size_lstm,
                              model_file=rule_model_file)

    nrj_train_data_src1 = nrj_train_data_src2 = None
    # if train_mode != 'target-only':
    nrj_train_data_src1 = NeuRuleDoubleJoint.TrainData(
        train_data_src1.word_idxs_list, train_data_src1.labels_list, valid_data_src1.word_idxs_list,
        valid_data_src1.labels_list, valid_data_src1.tok_texts, valid_data_src1.aspects_true_list, None
    )
    nrj_train_data_src2 = NeuRuleDoubleJoint.TrainData(
        train_data_src2.word_idxs_list, train_data_src2.labels_list, valid_data_src2.word_idxs_list,
        valid_data_src2.labels_list, valid_data_src2.tok_texts, None,
        valid_data_src2.opinions_true_list
    )

    nrj_train_data_tar = NeuRuleDoubleJoint.TrainData(
        train_data_tar.word_idxs_list, train_data_tar.labels_list, valid_data_tar.word_idxs_list,
        valid_data_tar.labels_list, valid_data_tar.tok_texts, valid_data_tar.aspects_true_list,
        valid_data_tar.opinions_true_list
    )

    if task == 'pretrain':
        nrdj.pre_train(nrj_train_data_src1, nrj_train_data_src2, vocab, n_epochs=n_epochs, lr=lr,
                       save_file=pretrain_model_file)
    if task == 'train':
        nrdj.train(nrj_train_data_src1, nrj_train_data_src2, nrj_train_data_tar, vocab, train_mode,
                   n_epochs=n_epochs, lr=lr)
Example #17
def __pre_train_nrdj(word_vecs_file,
                     tok_texts_file,
                     aspect_terms_file,
                     opinion_terms_file,
                     dst_model_file,
                     task,
                     lamb,
                     lstm_l2,
                     train_word_embeddings=False,
                     load_model_file=None):
    init_logging('log/{}-pre-{}-{}.log'.format(
        os.path.splitext(os.path.basename(__file__))[0],
        utils.get_machine_name(), str_today),
                 mode='a',
                 to_stdout=True)

    # n_train = 1000
    n_train = -1
    label_opinions = True
    # label_opinions = False
    n_tags = 5 if label_opinions else 3
    # n_tags = 5 if task == 'train' else 3
    batch_size = 32
    lr = 0.001
    share_lstm = False

    logging.info(word_vecs_file)
    logging.info(aspect_terms_file)
    logging.info(opinion_terms_file)
    logging.info('dst: {}'.format(dst_model_file))

    print('loading data ...')
    vocab, word_vecs_matrix = __load_word_vecs(word_vecs_file)

    # train_data_src1, valid_data_src1 = __get_data_amazon(vocab, config.AMAZON_TERMS_TRUE1_FILE)
    # train_data_src1, valid_data_src1 = __get_data_amazon(vocab, config.AMAZON_TERMS_TRUE3_FILE)
    train_data_src1, valid_data_src1 = datautils.get_data_amazon(
        vocab, aspect_terms_file, tok_texts_file, 'aspect')
    train_data_src2, valid_data_src2 = datautils.get_data_amazon(
        vocab, opinion_terms_file, tok_texts_file, 'opinion')
    print('done')
    logging.info('train_word_embeddings={} lstm_l2={}'.format(
        train_word_embeddings, lstm_l2))

    nrdj = NeuRuleDoubleJoint(n_tags,
                              word_vecs_matrix,
                              share_lstm,
                              hidden_size_lstm=hidden_size_lstm,
                              train_word_embeddings=train_word_embeddings,
                              lamb=lamb,
                              lstm_l2_src=lstm_l2,
                              model_file=load_model_file,
                              batch_size=batch_size)

    nrdj.pre_train(train_data_src1,
                   valid_data_src1,
                   train_data_src2,
                   valid_data_src2,
                   vocab,
                   n_epochs=50,
                   lr=lr,
                   save_file=dst_model_file)
Example #18
import pickle
from models.ncrfae import NeuCRFAutoEncoder
import config
from utils.loggingutils import init_logging
from utils import datautils
import logging
import datetime


if __name__ == '__main__':
    str_today = datetime.date.today().strftime('%y-%m-%d')
    init_logging('log/ncrfae-train-{}.log'.format(str_today), mode='a', to_stdout=True)

    n_tags = 5
    n_train = -1
    label_opinions = True

    dataset = 'se14l'
    # dataset = 'se14r'
    dataset_files = config.DATA_FILES[dataset]

    word_vecs_file = dataset_files['word_vecs_file']
    logging.info('word_vec_file: {}'.format(word_vecs_file))
    logging.info(dataset_files['test_sents_file'])
    print('loading data ...')
    with open(word_vecs_file, 'rb') as f:
        vocab, word_vecs_matrix = pickle.load(f)
        # print(vocab)

    word_idx_dict = {w: i + 1 for i, w in enumerate(vocab)}
    unlabeled_word_seqs = datautils.read_sents_to_word_idx_seqs(
Example #19
def __train_nrdj_deep_restaurant_pr():
    init_logging('log/nrdj-deep-restaurant-{}.log'.format(str_today),
                 mode='a',
                 to_stdout=True)

    # n_train = 1000
    n_train = -1
    task = 'train'
    label_task = 'aspect'
    n_tags = 5 if label_task == 'both' else 3
    # n_tags = 5 if task == 'train' else 3
    batch_size = 20
    hidden_size_lstm = 100
    n_epochs = 500
    lr = 0.001
    share_lstm = True
    # load_pretrained_model = True
    load_pretrained_model = False
    # train_mode = 'target-only'
    train_mode = 'all'

    aspect_terms_p_file = 'd:/data/aspect/semeval14/restaurant/yelp-aspect-rule-result-p.txt'
    aspect_terms_r_file = 'd:/data/aspect/semeval14/restaurant/yelp-aspect-rule-result-r.txt'
    # opinion_terms_file = 'd:/data/aspect/semeval14/restaurant/yelp-opinion-rule-result.txt'
    yelp_tok_texts_file = 'd:/data/res/yelp-review-eng-tok-sents-round-9.txt'
    rule_model_file = 'd:/data/aspect/semeval14/tf-model/drest/yelp-nrdj.ckpl'
    # rule_model_file = None

    load_model_file = None
    if task == 'train' and load_pretrained_model:
        load_model_file = rule_model_file
    # save_model_file = None if task == 'train' else rule_model_file
    save_model_file = rule_model_file if task == 'pretrain' else None

    print('loading data ...')
    with open(config.SE14_REST_GLOVE_WORD_VEC_FILE, 'rb') as f:
        vocab, word_vecs_matrix = pickle.load(f)
    train_data_tar, valid_data_tar = datautils.get_data_semeval(
        config.SE14_REST_TRAIN_SENTS_FILE,
        config.SE14_REST_TRAIN_TOK_TEXTS_FILE,
        config.SE14_REST_TEST_SENTS_FILE, config.SE14_REST_TEST_TOK_TEXTS_FILE,
        vocab, n_train, label_task)
    # train_data_src1, valid_data_src1 = __get_data_amazon(vocab, config.AMAZON_TERMS_TRUE1_FILE)
    # train_data_src1, valid_data_src1 = __get_data_amazon(vocab, config.AMAZON_TERMS_TRUE3_FILE)
    train_data_src1, valid_data_src1 = datautils.get_data_amazon(
        vocab, aspect_terms_p_file, yelp_tok_texts_file, 'aspect')
    # train_data_src2, valid_data_src2 = datautils.get_data_amazon(
    #     vocab, aspect_terms_r_file, yelp_tok_texts_file, 'opinion')
    train_data_src2, valid_data_src2 = datautils.get_data_amazon(
        vocab, aspect_terms_r_file, yelp_tok_texts_file, 'aspect')
    # train_data_src2, valid_data_src2 = datautils.get_data_amazon(
    #     vocab, opinion_terms_file, yelp_tok_texts_file, 'opinion')
    print('done')

    nrdj = NeuRuleDoubleJointDeep(n_tags,
                                  word_vecs_matrix,
                                  share_lstm,
                                  hidden_size_lstm=hidden_size_lstm,
                                  model_file=load_model_file)

    nrj_train_data_src1 = nrj_train_data_src2 = None
    # if train_mode != 'target-only':
    nrj_train_data_src1 = NeuRuleDoubleJointDeep.TrainData(
        train_data_src1.word_idxs_list, train_data_src1.labels_list,
        valid_data_src1.word_idxs_list, valid_data_src1.labels_list,
        valid_data_src1.tok_texts, valid_data_src1.aspects_true_list, None)
    nrj_train_data_src2 = NeuRuleDoubleJointDeep.TrainData(
        train_data_src2.word_idxs_list, train_data_src2.labels_list,
        valid_data_src2.word_idxs_list, valid_data_src2.labels_list,
        valid_data_src2.tok_texts, valid_data_src2.aspects_true_list, None)

    nrj_train_data_tar = NeuRuleDoubleJointDeep.TrainData(
        train_data_tar.word_idxs_list, train_data_tar.labels_list,
        valid_data_tar.word_idxs_list, valid_data_tar.labels_list,
        valid_data_tar.tok_texts, valid_data_tar.aspects_true_list,
        valid_data_tar.opinions_true_list)
    nrdj.train(nrj_train_data_src1,
               nrj_train_data_src2,
               nrj_train_data_tar,
               vocab,
               train_mode,
               n_epochs=n_epochs,
               lr=lr)
Example #20
def __gen_yelp_path_count_feat(mentions_file, mention_id_idx_file, path_strs,
                               for_pra, dst_file):
    mentions = utils.load_json_objs(mentions_file)
    mention_ids = {m['mention_id'] for m in mentions}
    mention_candidates = utils.load_candidates_for_mentions(config.YELP_CANDIDATES_FILE, mention_ids)
    mention_id_to_idx = utils.load_id_to_idx(mention_id_idx_file)
    biz_id_to_idx = utils.load_id_to_idx(config.YELP_BIZ_ID_TO_IDX_FILE)
    if for_pra:
        commuting_matrix_files = [os.path.join(
            config.YELP_DATA_DIR, 'network/{}_norm.txt'.format(s)) for s in path_strs]
    else:
        commuting_matrix_files = [os.path.join(config.YELP_DATA_DIR, 'network/{}.txt'.format(s)) for s in path_strs]
    gen_path_count_feats_file(config.YELP_DATA_INFO_FILE, mention_candidates, mention_id_to_idx, biz_id_to_idx,
                              commuting_matrix_files, for_pra, dst_file)


if __name__ == '__main__':
    init_logging('log/pc_feature_gen.log', mode='a', to_stdout=True)

    yelp_data_info_file = os.path.join(config.YELP_DATA_DIR, 'dataset-info.json')
    yelp_candidates_file = os.path.join(config.YELP_DATA_DIR, 'dataset/candidates.json')
    yelp_cs_candidates_file = os.path.join(config.YELP_DATA_DIR, 'casestudy/cs-mention-candidates.txt')

    # path_strs = ['MRURB']
    # path_strs = ['MRURBRURB']
    path_strs = ['MRUURB']
    tag = 'pc'
    # tag = 'rw'
    NORM_THRES = 100
    yelp_path_count_feat_file = os.path.join(
        config.LOCAL_DATA_DIR, '{}_features_{}.txt'.format(tag, path_strs[0]))
    __gen_yelp_path_count_feat(config.YELP_ALL_LINKED_MENTIONS_FILE, config.YELP_MENTION_ID_IDX_FILE, path_strs,
                               False, yelp_path_count_feat_file)
Example #21
    for params in params_list:
        lr, lamb1, lamb2, lamb3, alpha1, alpha2, alpha3 = params
        if lamb1 == 0.01 and lamb2 == 0.01:
            continue
        mr = MarchRec(N_USERS + 1, N_ITEMS + 1, k, n_epoch, batch_size, lr,
                      alpha1, alpha2, alpha3, lamb1, lamb2, lamb3)
        mr.fit(entries_train, entries_val, entries_test)


# method = 'pmf'
# method = 'biased_svd'
method = 'dmf'

str_today = datetime.date.today().strftime('%y-%m-%d')
init_logging('log/{}-{}.log'.format(method, str_today), to_stdout=True)

split_id = 1
train_file = os.path.join(DATADIR, 'u{}_train.txt'.format(split_id))
val_file = os.path.join(DATADIR, 'u{}_val.txt'.format(split_id))
test_file = os.path.join(DATADIR, 'u{}.test'.format(split_id))

if method == 'pmf':
    __run_pmf(train_file, val_file, test_file)
if method == 'biased_svd':
    __run_biased_svd(train_file, val_file, test_file)
if method == 'dmf':
    __run_dmf(train_file, val_file, test_file)
# __run_mlp(train_file, val_file)
# __run_march(train_file, val_file, test_file)
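The trailing if method == ... chain admits a dispatch-table form; a sketch, assuming the three runners share the (train_file, val_file, test_file) signature used above:

runners = {
    'pmf': __run_pmf,
    'biased_svd': __run_biased_svd,
    'dmf': __run_dmf,
}
runners[method](train_file, val_file, test_file)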