Exemplo n.º 1
0
def __train_lstmcrf(word_vecs_file, train_tok_texts_file, train_sents_file, train_valid_split_file, test_tok_texts_file,
                    test_sents_file, load_model_file, task, error_file=None):
    """Train an LSTMCRF tagger on SemEval data.

    The word vectors are unpickled from ``word_vecs_file`` as a
    ``(vocab, word_vecs_matrix)`` pair, the data splits come from
    ``datautils.get_data_semeval`` and the model is built with
    ``model_file=load_model_file``.

    The tagger uses 5 tags when ``task == 'both'`` and 3 tags otherwise.
    ``save_file`` is always passed as ``None`` to ``LSTMCRF.train``;
    ``error_file`` is forwarded unchanged.
    """
    init_logging('log/nr-{}.log'.format(str_today), mode='a', to_stdout=True)

    if task == 'both':
        n_tags = 5
    else:
        n_tags = 3

    print('loading data ...')
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(word_vecs_file, 'rb') as vec_fin:
        vocab, word_vecs_matrix = pickle.load(vec_fin)

    # The test split is returned by the loader but is not used in this function.
    train_data, valid_data, test_data = datautils.get_data_semeval(
        train_sents_file, train_tok_texts_file, train_valid_split_file,
        test_sents_file, test_tok_texts_file, vocab, -1, task)
    print('done')

    lstmcrf = LSTMCRF(
        n_tags, word_vecs_matrix,
        hidden_size_lstm=hidden_size_lstm, model_file=load_model_file)

    save_model_file = None
    lstmcrf.train(
        train_data.word_idxs_list, train_data.labels_list,
        valid_data.word_idxs_list, valid_data.labels_list,
        vocab, valid_data.tok_texts,
        valid_data.aspects_true_list, valid_data.opinions_true_list,
        n_epochs=n_epochs, save_file=save_model_file, error_file=error_file)
Exemplo n.º 2
0
def __train_dlc(word_vecs_file, train_tok_texts_file, train_sents_file, train_valid_split_file, test_tok_texts_file,
                test_sents_file):
    """Jointly train a DSLSTMCRF model on two source datasets and the SemEval data.

    The word vectors are unpickled from ``word_vecs_file`` as a
    ``(vocab, word_vecs_matrix)`` pair. Two source datasets are loaded with
    ``datautils.get_data_amazon`` (one labelled 'aspect', one 'opinion') from
    the module-level ``pre_*`` file names, and the train/valid/test splits
    come from ``datautils.get_data_semeval``. Everything is then handed to
    ``DSLSTMCRF.joint_train``.

    The model is always created with ``model_file=None``.
    """
    init_logging('log/dlc-jtrain2-{}.log'.format(str_today), mode='a', to_stdout=True)

    n_train = -1  # presumably "use every training sentence" -- TODO confirm in datautils
    label_opinions = True
    batch_size = 10
    lr = 0.001

    print('loading data ...')
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(word_vecs_file, 'rb') as f:
        vocab, word_vecs_matrix = pickle.load(f)
    logging.info('word vec dim: {}, n_words={}'.format(word_vecs_matrix.shape[1], word_vecs_matrix.shape[0]))

    train_data_src1, valid_data_src1 = datautils.get_data_amazon(
        vocab, pre_aspect_terms_file, pre_tok_texts_file, 'aspect')
    train_data_src2, valid_data_src2 = datautils.get_data_amazon(
        vocab, pre_opinion_terms_file, pre_tok_texts_file, 'opinion')

    train_data, valid_data, test_data = datautils.get_data_semeval(
        train_sents_file, train_tok_texts_file, train_valid_split_file, test_sents_file, test_tok_texts_file,
        vocab, n_train, label_opinions)
    print('done')

    dlc = DSLSTMCRF(word_vecs_matrix, hidden_size_lstm=hidden_size_lstm,
                    model_file=None, batch_size=batch_size)

    dlc.joint_train(train_data_src1, valid_data_src1, train_data_src2, valid_data_src2,
                    train_data, valid_data, test_data, n_epochs=n_epochs, lr=lr)
Exemplo n.º 3
0
def __train_nrdj(word_vecs_file, train_tok_texts_file, train_sents_file,
                 train_valid_split_file, test_tok_texts_file, test_sents_file,
                 load_model_file, task):
    """Train a NeuRuleDoubleJoint model on SemEval data and write term files.

    The log file name is built from this script's base name, the machine
    name and ``str_today``. Word vectors are obtained through
    ``__load_word_vecs`` and the model is created with
    ``model_file=load_model_file``. The two hard-coded ``nrdj-*.txt`` paths
    are passed to ``train`` as ``dst_aspects_file`` / ``dst_opinions_file``.

    ``task`` is accepted but not read in this function.
    """
    init_logging(
        'log/{}-train-{}-{}.log'.format(
            os.path.splitext(os.path.basename(__file__))[0],
            utils.get_machine_name(), str_today),
        mode='a', to_stdout=True)

    # Output files handed to the trainer.
    dst_aspects_file = '/home/hldai/data/aspect/semeval14/nrdj-aspects.txt'
    dst_opinions_file = '/home/hldai/data/aspect/semeval14/nrdj-opinions.txt'

    # Fixed training configuration.
    n_train = -1
    label_opinions = True
    if label_opinions:
        n_tags = 5
    else:
        n_tags = 3
    batch_size = 20
    lr = 0.001
    share_lstm = False

    logging.info(word_vecs_file)
    logging.info('load model {}'.format(load_model_file))
    logging.info(test_sents_file)

    print('loading data ...')
    vocab, word_vecs_matrix = __load_word_vecs(word_vecs_file)

    n_words, vec_dim = word_vecs_matrix.shape[0], word_vecs_matrix.shape[1]
    logging.info('word vec dim: {}, n_words={}'.format(vec_dim, n_words))

    train_data, valid_data, test_data = datautils.get_data_semeval(
        train_sents_file, train_tok_texts_file, train_valid_split_file,
        test_sents_file, test_tok_texts_file, vocab, n_train, label_opinions)
    print('done')

    nrdj = NeuRuleDoubleJoint(
        n_tags, word_vecs_matrix, share_lstm,
        hidden_size_lstm=hidden_size_lstm,
        model_file=load_model_file,
        batch_size=batch_size)
    nrdj.train(
        train_data, valid_data, test_data, vocab,
        n_epochs=n_epochs, lr=lr,
        dst_aspects_file=dst_aspects_file,
        dst_opinions_file=dst_opinions_file)
Exemplo n.º 4
0
def __train_nrdj(word_vecs_file, train_tok_texts_file, train_sents_file,
                 train_valid_split_file, test_tok_texts_file, test_sents_file,
                 load_model_file, task):
    """Train a NeuRuleDoubleJoint model on the SemEval train/valid/test splits.

    Word vectors are unpickled from ``word_vecs_file`` as a
    ``(vocab, word_vecs_matrix)`` pair; the model is created with
    ``model_file=load_model_file`` and trained with a batch size of 10 and
    a learning rate of 0.001.

    ``task`` is accepted but not read in this function.
    """
    log_name = 'log/nrdj-train-ns1-{}.log'.format(str_today)
    init_logging(log_name, mode='a', to_stdout=True)

    # Fixed training configuration.
    n_train = -1
    label_opinions = True
    n_tags = 3 if not label_opinions else 5
    batch_size = 10
    lr = 0.001
    share_lstm = False

    print('loading data ...')
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(word_vecs_file, 'rb') as vec_fin:
        vocab, word_vecs_matrix = pickle.load(vec_fin)
    logging.info('word vec dim: {}, n_words={}'.format(
        word_vecs_matrix.shape[1], word_vecs_matrix.shape[0]))

    train_data, valid_data, test_data = datautils.get_data_semeval(
        train_sents_file, train_tok_texts_file, train_valid_split_file,
        test_sents_file, test_tok_texts_file, vocab, n_train, label_opinions)
    print('done')

    model_kwargs = dict(
        hidden_size_lstm=hidden_size_lstm,
        model_file=load_model_file,
        batch_size=batch_size)
    nrdj = NeuRuleDoubleJoint(n_tags, word_vecs_matrix, share_lstm, **model_kwargs)

    nrdj.train(train_data, valid_data, test_data, vocab, n_epochs=n_epochs, lr=lr)
Exemplo n.º 5
0
def __train_lstmcrf_manual_feat():
    """Train an LSTMCRF tagger on the SemEval-14 laptop data with manual features.

    For each split the aspect and opinion rule-result files are turned into
    feature lists by ``__get_manual_feat`` and combined with
    ``__merge_feat_list``. The merged lists are passed to ``LSTMCRF.train``
    as ``train_feat_list`` / ``valid_feat_list``, and the model is told the
    per-token feature length through ``manual_feat_len``.

    The validation features are built from the laptop *test* files.
    """
    init_logging('log/nrmf-{}.log'.format(str_today), mode='a', to_stdout=True)
    hidden_size_lstm = 100
    n_epochs = 200
    n_tags = 5

    train_aspect_rule_result_file = 'd:/data/aspect/semeval14/laptops/laptops-train-aspect-rule-result.txt'
    train_opinion_rule_result_file = 'd:/data/aspect/semeval14/laptops/laptops-train-opinion-rule-result.txt'
    valid_aspect_rule_result_file = 'd:/data/aspect/semeval14/laptops/laptops-test-aspect-rule-result.txt'
    valid_opinion_rule_result_file = 'd:/data/aspect/semeval14/laptops/laptops-test-opinion-rule-result.txt'

    def build_feat_list(tok_texts_file, aspect_result_file, opinion_result_file):
        # Aspect features first, then opinion features, then merge the two.
        aspect_feats = __get_manual_feat(tok_texts_file, aspect_result_file)
        opinion_feats = __get_manual_feat(tok_texts_file, opinion_result_file)
        return __merge_feat_list(aspect_feats, opinion_feats)

    print('loading data ...')
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(config.SE14_LAPTOP_GLOVE_WORD_VEC_FILE, 'rb') as vec_fin:
        vocab, word_vecs_matrix = pickle.load(vec_fin)
    train_data, valid_data = datautils.get_data_semeval(
        config.SE14_LAPTOP_TRAIN_SENTS_FILE, config.SE14_LAPTOP_TRAIN_TOK_TEXTS_FILE,
        config.SE14_LAPTOP_TEST_SENTS_FILE, config.SE14_LAPTOP_TEST_TOK_TEXTS_FILE,
        vocab, -1, 'both')

    train_feat_list = build_feat_list(
        config.SE14_LAPTOP_TRAIN_TOK_TEXTS_FILE,
        train_aspect_rule_result_file, train_opinion_rule_result_file)
    valid_feat_list = build_feat_list(
        config.SE14_LAPTOP_TEST_TOK_TEXTS_FILE,
        valid_aspect_rule_result_file, valid_opinion_rule_result_file)

    # Feature width is read from the second axis of the first training entry.
    manual_feat_len = train_feat_list[0].shape[1]
    print('manual feat len: {}'.format(manual_feat_len))

    lstmcrf = LSTMCRF(
        n_tags, word_vecs_matrix,
        hidden_size_lstm=hidden_size_lstm, manual_feat_len=manual_feat_len)
    lstmcrf.train(
        train_data.word_idxs_list, train_data.labels_list,
        valid_data.word_idxs_list, valid_data.labels_list,
        vocab, valid_data.tok_texts,
        valid_data.aspects_true_list, valid_data.opinions_true_list,
        train_feat_list=train_feat_list, valid_feat_list=valid_feat_list,
        n_epochs=n_epochs)
Exemplo n.º 6
0
def __train_neurule_double_joint():
    """Pre-train or train a NeuRuleDoubleJoint model for the SemEval-14 laptop data.

    The local ``task`` switch selects the stage:

    * ``'pretrain'`` -- the model is created with ``model_file=None`` and
      ``pre_train`` is run on the two source datasets, saving to
      ``config.LAPTOP_NRDJ_RULE_MODEL_FILE``.
    * ``'train'`` -- the model is created from
      ``config.LAPTOP_NRDJ_RULE_MODEL_FILE`` and ``train`` is run on the two
      source datasets plus the target dataset using ``train_mode``.

    The source datasets come from ``datautils.get_data_amazon`` (one
    labelled 'aspect', one 'opinion'); the target dataset comes from
    ``datautils.get_data_semeval`` on the laptop train/test files.
    """
    init_logging('log/nrdj-{}.log'.format(str_today), mode='a', to_stdout=True)

    n_train = -1  # presumably "use every training sentence" -- TODO confirm in datautils
    task = 'train'  # set to 'pretrain' to run the pre-training stage instead
    label_opinions = True
    n_tags = 5 if label_opinions else 3
    lr = 0.001
    share_lstm = False
    train_mode = 'target-only'

    print('loading data ...')
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(config.SE14_LAPTOP_GLOVE_WORD_VEC_FILE, 'rb') as f:
        vocab, word_vecs_matrix = pickle.load(f)
    train_data_tar, valid_data_tar = datautils.get_data_semeval(
        config.SE14_LAPTOP_TRAIN_SENTS_FILE, config.SE14_LAPTOP_TRAIN_TOK_TEXTS_FILE,
        config.SE14_LAPTOP_TEST_SENTS_FILE, config.SE14_LAPTOP_TEST_TOK_TEXTS_FILE,
        vocab, n_train, label_opinions)
    train_data_src1, valid_data_src1 = datautils.get_data_amazon(
        vocab, config.AMAZON_TERMS_TRUE2_FILE, config.AMAZON_TOK_TEXTS_FILE, 'aspect')
    train_data_src2, valid_data_src2 = datautils.get_data_amazon(
        vocab, config.AMAZON_TERMS_TRUE4_FILE, config.AMAZON_TOK_TEXTS_FILE, 'opinion')
    # Only the 'train' stage starts from a previously saved model.
    rule_model_file = config.LAPTOP_NRDJ_RULE_MODEL_FILE if task == 'train' else None
    pretrain_model_file = config.LAPTOP_NRDJ_RULE_MODEL_FILE
    print('done')

    # NOTE(review): the original defined batch_size = 20 but never passed it
    # to the model; that unused local was removed, so the constructor call
    # below is unchanged.
    nrdj = NeuRuleDoubleJoint(n_tags, word_vecs_matrix, share_lstm,
                              hidden_size_lstm=hidden_size_lstm,
                              model_file=rule_model_file)

    # Source 1 has aspect ground truth only; source 2 has opinion ground truth only.
    nrj_train_data_src1 = NeuRuleDoubleJoint.TrainData(
        train_data_src1.word_idxs_list, train_data_src1.labels_list, valid_data_src1.word_idxs_list,
        valid_data_src1.labels_list, valid_data_src1.tok_texts, valid_data_src1.aspects_true_list, None
    )
    nrj_train_data_src2 = NeuRuleDoubleJoint.TrainData(
        train_data_src2.word_idxs_list, train_data_src2.labels_list, valid_data_src2.word_idxs_list,
        valid_data_src2.labels_list, valid_data_src2.tok_texts, None,
        valid_data_src2.opinions_true_list
    )

    nrj_train_data_tar = NeuRuleDoubleJoint.TrainData(
        train_data_tar.word_idxs_list, train_data_tar.labels_list, valid_data_tar.word_idxs_list,
        valid_data_tar.labels_list, valid_data_tar.tok_texts, valid_data_tar.aspects_true_list,
        valid_data_tar.opinions_true_list
    )

    if task == 'pretrain':
        nrdj.pre_train(nrj_train_data_src1, nrj_train_data_src2, vocab, n_epochs=n_epochs, lr=lr,
                       save_file=pretrain_model_file)
    elif task == 'train':
        nrdj.train(nrj_train_data_src1, nrj_train_data_src2, nrj_train_data_tar, vocab, train_mode,
                   n_epochs=n_epochs, lr=lr)
Exemplo n.º 7
0
def __train_nrdj_deep_restaurant_pr():
    """Train a NeuRuleDoubleJointDeep aspect tagger for the SemEval-14 restaurant data.

    Both source datasets are loaded with ``datautils.get_data_amazon`` using
    the 'aspect' label type: source 1 from the ``-p`` rule-result file and
    source 2 from the ``-r`` rule-result file, over the same Yelp token
    file. The target dataset comes from ``datautils.get_data_semeval`` on
    the restaurant train/test files with ``label_task``.

    The model is created from ``rule_model_file`` only when
    ``task == 'train'`` and ``load_pretrained_model`` is true; with the
    current settings it is created with ``model_file=None``.
    """
    init_logging('log/nrdj-deep-restaurant-{}.log'.format(str_today),
                 mode='a',
                 to_stdout=True)

    n_train = -1  # presumably "use every training sentence" -- TODO confirm in datautils
    task = 'train'
    label_task = 'aspect'
    n_tags = 5 if label_task == 'both' else 3
    hidden_size_lstm = 100
    n_epochs = 500
    lr = 0.001
    share_lstm = True
    load_pretrained_model = False
    train_mode = 'all'

    aspect_terms_p_file = 'd:/data/aspect/semeval14/restaurant/yelp-aspect-rule-result-p.txt'
    aspect_terms_r_file = 'd:/data/aspect/semeval14/restaurant/yelp-aspect-rule-result-r.txt'
    yelp_tok_texts_file = 'd:/data/res/yelp-review-eng-tok-sents-round-9.txt'
    rule_model_file = 'd:/data/aspect/semeval14/tf-model/drest/yelp-nrdj.ckpl'

    # Start from the saved rule model only when explicitly requested.
    load_model_file = None
    if task == 'train' and load_pretrained_model:
        load_model_file = rule_model_file

    print('loading data ...')
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(config.SE14_REST_GLOVE_WORD_VEC_FILE, 'rb') as f:
        vocab, word_vecs_matrix = pickle.load(f)
    train_data_tar, valid_data_tar = datautils.get_data_semeval(
        config.SE14_REST_TRAIN_SENTS_FILE,
        config.SE14_REST_TRAIN_TOK_TEXTS_FILE,
        config.SE14_REST_TEST_SENTS_FILE, config.SE14_REST_TEST_TOK_TEXTS_FILE,
        vocab, n_train, label_task)
    train_data_src1, valid_data_src1 = datautils.get_data_amazon(
        vocab, aspect_terms_p_file, yelp_tok_texts_file, 'aspect')
    train_data_src2, valid_data_src2 = datautils.get_data_amazon(
        vocab, aspect_terms_r_file, yelp_tok_texts_file, 'aspect')
    print('done')

    # NOTE(review): the original defined batch_size = 20 and save_model_file
    # but used neither; both unused locals were removed, so the calls below
    # are unchanged.
    nrdj = NeuRuleDoubleJointDeep(n_tags,
                                  word_vecs_matrix,
                                  share_lstm,
                                  hidden_size_lstm=hidden_size_lstm,
                                  model_file=load_model_file)

    # Both sources carry aspect ground truth only (the opinions slot is None).
    nrj_train_data_src1 = NeuRuleDoubleJointDeep.TrainData(
        train_data_src1.word_idxs_list, train_data_src1.labels_list,
        valid_data_src1.word_idxs_list, valid_data_src1.labels_list,
        valid_data_src1.tok_texts, valid_data_src1.aspects_true_list, None)
    nrj_train_data_src2 = NeuRuleDoubleJointDeep.TrainData(
        train_data_src2.word_idxs_list, train_data_src2.labels_list,
        valid_data_src2.word_idxs_list, valid_data_src2.labels_list,
        valid_data_src2.tok_texts, valid_data_src2.aspects_true_list, None)

    nrj_train_data_tar = NeuRuleDoubleJointDeep.TrainData(
        train_data_tar.word_idxs_list, train_data_tar.labels_list,
        valid_data_tar.word_idxs_list, valid_data_tar.labels_list,
        valid_data_tar.tok_texts, valid_data_tar.aspects_true_list,
        valid_data_tar.opinions_true_list)
    nrdj.train(nrj_train_data_src1,
               nrj_train_data_src2,
               nrj_train_data_tar,
               vocab,
               train_mode,
               n_epochs=n_epochs,
               lr=lr)
Exemplo n.º 8
0
    # NOTE(review): the enclosing `def` line is not visible in this excerpt;
    # `dataset_files`, `n_train`, `label_opinions` and `n_tags` are defined
    # outside the lines shown here.
    word_vecs_file = dataset_files['word_vecs_file']
    logging.info('word_vec_file: {}'.format(word_vecs_file))
    logging.info(dataset_files['test_sents_file'])
    print('loading data ...')
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(word_vecs_file, 'rb') as f:
        vocab, word_vecs_matrix = pickle.load(f)
        # print(vocab)

    # Map every vocabulary word to its position plus one
    # (presumably index 0 is reserved, e.g. for padding/unknown -- TODO confirm).
    word_idx_dict = {w: i + 1 for i, w in enumerate(vocab)}
    unlabeled_word_seqs = datautils.read_sents_to_word_idx_seqs(
        dataset_files['unlabeled_tok_sents_file'], word_idx_dict)
    print(len(unlabeled_word_seqs), 'unsupervised sents')

    # n_unlabeled_sents_used = 1000
    n_unlabeled_sents_used = len(unlabeled_word_seqs)
    n_unlabeled_samples_per_iter = 1000
    # NOTE(review): `unsupervised_word_seqs` is not used in the visible lines;
    # the train() call below receives the full `unlabeled_word_seqs` instead,
    # so lowering `n_unlabeled_sents_used` would have no effect -- confirm intent.
    unsupervised_word_seqs = unlabeled_word_seqs[:n_unlabeled_sents_used]
    logging.info('{} unsupervised sents used.'.format(n_unlabeled_sents_used))

    # Labelled train/valid/test splits.
    train_data, valid_data, test_data = datautils.get_data_semeval(
        dataset_files['train_sents_file'], dataset_files['train_tok_texts_file'],
        dataset_files['train_valid_split_file'],
        dataset_files['test_sents_file'], dataset_files['test_tok_texts_file'],
        vocab, n_train, label_opinions)
    # Separate batch sizes are given via batch_size_l=4 and batch_size_u=16
    # (presumably labelled / unlabelled batches -- TODO confirm).
    ncrfae = NeuCRFAutoEncoder(n_tags, word_vecs_matrix, batch_size_l=4, batch_size_u=16, lr_method='adam')
    # ncrfae.test_model(train_data)
    ncrfae.train(
        data_train=train_data, data_valid=valid_data, data_test=test_data,
        unlabeled_word_seqs=unlabeled_word_seqs, n_unlabeled_samples_per_iter=n_unlabeled_samples_per_iter,
        n_epochs=500, lr=0.001)