def __train_lstmcrf(word_vecs_file, train_tok_texts_file, train_sents_file, train_valid_split_file,
                    test_tok_texts_file, test_sents_file, load_model_file, task, error_file=None):
    """Build an ``LSTMCRF`` model and train it on the SemEval train/valid split.

    Args:
        word_vecs_file: Pickle file holding a ``(vocab, word_vecs_matrix)`` pair.
        train_tok_texts_file: Forwarded to ``datautils.get_data_semeval``.
        train_sents_file: Forwarded to ``datautils.get_data_semeval``.
        train_valid_split_file: Forwarded to ``datautils.get_data_semeval``.
        test_tok_texts_file: Forwarded to ``datautils.get_data_semeval``.
        test_sents_file: Forwarded to ``datautils.get_data_semeval``.
        load_model_file: Handed to ``LSTMCRF`` as ``model_file``.
        task: Forwarded to the data loader; ``'both'`` selects 5 tags, any
            other value selects 3.
        error_file: Forwarded unchanged to ``LSTMCRF.train``.

    The names ``str_today``, ``hidden_size_lstm`` and ``n_epochs`` are not
    defined here and are read from the enclosing module.
    """
    log_path = 'log/nr-{}.log'.format(str_today)
    init_logging(log_path, mode='a', to_stdout=True)

    if task == 'both':
        n_tags = 5
    else:
        n_tags = 3

    print('loading data ...')
    with open(word_vecs_file, 'rb') as vecs_in:
        vocab, word_vecs_matrix = pickle.load(vecs_in)

    # The loader returns three splits; the test split is not used below.
    train_data, valid_data, _ = datautils.get_data_semeval(
        train_sents_file, train_tok_texts_file, train_valid_split_file,
        test_sents_file, test_tok_texts_file,
        vocab, -1, task)
    print('done')

    # No save path is handed to train().
    save_model_file = None

    tagger = LSTMCRF(
        n_tags, word_vecs_matrix,
        hidden_size_lstm=hidden_size_lstm,
        model_file=load_model_file)
    tagger.train(
        train_data.word_idxs_list,
        train_data.labels_list,
        valid_data.word_idxs_list,
        valid_data.labels_list,
        vocab,
        valid_data.tok_texts,
        valid_data.aspects_true_list,
        valid_data.opinions_true_list,
        n_epochs=n_epochs,
        save_file=save_model_file,
        error_file=error_file)
def __train_dlc(word_vecs_file, train_tok_texts_file, train_sents_file, train_valid_split_file,
                test_tok_texts_file, test_sents_file):
    """Jointly train a ``DSLSTMCRF`` on two auxiliary datasets plus SemEval data.

    The two auxiliary datasets are loaded with ``datautils.get_data_amazon``
    (one labelled ``'aspect'``, one labelled ``'opinion'``); the SemEval
    train/valid/test splits come from ``datautils.get_data_semeval``.

    Args:
        word_vecs_file: Pickle file holding a ``(vocab, word_vecs_matrix)`` pair.
        train_tok_texts_file: Forwarded to ``datautils.get_data_semeval``.
        train_sents_file: Forwarded to ``datautils.get_data_semeval``.
        train_valid_split_file: Forwarded to ``datautils.get_data_semeval``.
        test_tok_texts_file: Forwarded to ``datautils.get_data_semeval``.
        test_sents_file: Forwarded to ``datautils.get_data_semeval``.

    Besides ``str_today``, ``hidden_size_lstm`` and ``n_epochs``, this function
    reads ``pre_aspect_terms_file``, ``pre_opinion_terms_file`` and
    ``pre_tok_texts_file``, none of which are parameters; they must be
    resolvable as module-level names when the function runs.
    """
    init_logging('log/dlc-jtrain2-{}.log'.format(str_today), mode='a', to_stdout=True)

    # NOTE(review): -1 presumably means "use every training sentence" (a
    # commented-out alternative in the original used 1000) - confirm in datautils.
    n_train = -1
    label_opinions = True
    batch_size = 10
    lr = 0.001

    print('loading data ...')
    with open(word_vecs_file, 'rb') as f:
        vocab, word_vecs_matrix = pickle.load(f)
    logging.info('word vec dim: {}, n_words={}'.format(word_vecs_matrix.shape[1], word_vecs_matrix.shape[0]))

    train_data_src1, valid_data_src1 = datautils.get_data_amazon(
        vocab, pre_aspect_terms_file, pre_tok_texts_file, 'aspect')
    train_data_src2, valid_data_src2 = datautils.get_data_amazon(
        vocab, pre_opinion_terms_file, pre_tok_texts_file, 'opinion')
    train_data, valid_data, test_data = datautils.get_data_semeval(
        train_sents_file, train_tok_texts_file, train_valid_split_file,
        test_sents_file, test_tok_texts_file,
        vocab, n_train, label_opinions)
    print('done')

    dlc = DSLSTMCRF(word_vecs_matrix, hidden_size_lstm=hidden_size_lstm, model_file=None, batch_size=batch_size)
    dlc.joint_train(
        train_data_src1, valid_data_src1,
        train_data_src2, valid_data_src2,
        train_data, valid_data, test_data,
        n_epochs=n_epochs, lr=lr)
def __train_nrdj(word_vecs_file, train_tok_texts_file, train_sents_file, train_valid_split_file,
                 test_tok_texts_file, test_sents_file, load_model_file, task,
                 dst_aspects_file='/home/hldai/data/aspect/semeval14/nrdj-aspects.txt',
                 dst_opinions_file='/home/hldai/data/aspect/semeval14/nrdj-opinions.txt'):
    """Train a ``NeuRuleDoubleJoint`` model on the SemEval train/valid/test splits.

    The log file name is built from this source file's base name, the value of
    ``utils.get_machine_name()`` and the module-level ``str_today``.

    Args:
        word_vecs_file: Passed to ``__load_word_vecs`` to obtain
            ``(vocab, word_vecs_matrix)``.
        train_tok_texts_file: Forwarded to ``datautils.get_data_semeval``.
        train_sents_file: Forwarded to ``datautils.get_data_semeval``.
        train_valid_split_file: Forwarded to ``datautils.get_data_semeval``.
        test_tok_texts_file: Forwarded to ``datautils.get_data_semeval``.
        test_sents_file: Forwarded to ``datautils.get_data_semeval``.
        load_model_file: Handed to ``NeuRuleDoubleJoint`` as ``model_file``.
        task: Not referenced in the body; kept so existing callers keep working.
        dst_aspects_file: Forwarded to ``NeuRuleDoubleJoint.train``. The default
            is the path that used to be hard-coded in this function.
        dst_opinions_file: Forwarded to ``NeuRuleDoubleJoint.train``. The
            default is the path that used to be hard-coded in this function.

    NOTE(review): the destination files are presumably where extracted terms
    are written, and ``None`` presumably disables that output (the original
    carried a commented-out ``None, None`` alternative) - confirm against
    ``NeuRuleDoubleJoint.train``.

    ``hidden_size_lstm`` and ``n_epochs`` are read from the enclosing module.
    """
    init_logging('log/{}-train-{}-{}.log'.format(
        os.path.splitext(os.path.basename(__file__))[0], utils.get_machine_name(), str_today),
        mode='a', to_stdout=True)

    # NOTE(review): -1 presumably means "use every training sentence" - confirm
    # in datautils.
    n_train = -1
    label_opinions = True
    n_tags = 5 if label_opinions else 3
    batch_size = 20
    lr = 0.001
    share_lstm = False

    logging.info(word_vecs_file)
    logging.info('load model {}'.format(load_model_file))
    logging.info(test_sents_file)

    print('loading data ...')
    vocab, word_vecs_matrix = __load_word_vecs(word_vecs_file)
    logging.info('word vec dim: {}, n_words={}'.format(
        word_vecs_matrix.shape[1], word_vecs_matrix.shape[0]))
    train_data, valid_data, test_data = datautils.get_data_semeval(
        train_sents_file, train_tok_texts_file, train_valid_split_file,
        test_sents_file, test_tok_texts_file,
        vocab, n_train, label_opinions)
    print('done')

    nrdj = NeuRuleDoubleJoint(
        n_tags, word_vecs_matrix, share_lstm,
        hidden_size_lstm=hidden_size_lstm, model_file=load_model_file, batch_size=batch_size)
    nrdj.train(
        train_data, valid_data, test_data, vocab,
        n_epochs=n_epochs, lr=lr,
        dst_aspects_file=dst_aspects_file, dst_opinions_file=dst_opinions_file)
def __train_nrdj(word_vecs_file, train_tok_texts_file, train_sents_file, train_valid_split_file,
                 test_tok_texts_file, test_sents_file, load_model_file, task):
    """Train a ``NeuRuleDoubleJoint`` model on the SemEval train/valid/test splits.

    Args:
        word_vecs_file: Pickle file holding a ``(vocab, word_vecs_matrix)`` pair.
        train_tok_texts_file: Forwarded to ``datautils.get_data_semeval``.
        train_sents_file: Forwarded to ``datautils.get_data_semeval``.
        train_valid_split_file: Forwarded to ``datautils.get_data_semeval``.
        test_tok_texts_file: Forwarded to ``datautils.get_data_semeval``.
        test_sents_file: Forwarded to ``datautils.get_data_semeval``.
        load_model_file: Handed to ``NeuRuleDoubleJoint`` as ``model_file``.
        task: Not referenced in the body.

    ``str_today``, ``hidden_size_lstm`` and ``n_epochs`` are read from the
    enclosing module.
    """
    log_path = 'log/nrdj-train-ns1-{}.log'.format(str_today)
    init_logging(log_path, mode='a', to_stdout=True)

    # Fixed run configuration.
    n_train = -1
    label_opinions = True
    share_lstm = False
    batch_size = 10
    lr = 0.001
    if label_opinions:
        n_tags = 5
    else:
        n_tags = 3

    print('loading data ...')
    with open(word_vecs_file, 'rb') as vecs_in:
        vocab, word_vecs_matrix = pickle.load(vecs_in)
    vec_dim = word_vecs_matrix.shape[1]
    n_words = word_vecs_matrix.shape[0]
    logging.info('word vec dim: {}, n_words={}'.format(vec_dim, n_words))

    train_data, valid_data, test_data = datautils.get_data_semeval(
        train_sents_file, train_tok_texts_file, train_valid_split_file,
        test_sents_file, test_tok_texts_file,
        vocab, n_train, label_opinions)
    print('done')

    model = NeuRuleDoubleJoint(
        n_tags, word_vecs_matrix, share_lstm,
        hidden_size_lstm=hidden_size_lstm,
        model_file=load_model_file,
        batch_size=batch_size)
    model.train(train_data, valid_data, test_data, vocab, n_epochs=n_epochs, lr=lr)
def __train_lstmcrf_manual_feat():
    """Train an ``LSTMCRF`` on SE14 laptop data with extra per-sentence features.

    The additional features are built by ``__get_manual_feat`` from rule-result
    files (aspect and opinion) and combined with ``__merge_feat_list``. The
    "valid" split used here is loaded from the SE14 laptop *test* files.

    ``str_today`` is read from the enclosing module; ``hidden_size_lstm`` and
    ``n_epochs`` are set locally.
    """
    init_logging('log/nrmf-{}.log'.format(str_today), mode='a', to_stdout=True)

    hidden_size_lstm = 100
    n_epochs = 200
    n_tags = 5

    train_aspect_rule_result_file = 'd:/data/aspect/semeval14/laptops/laptops-train-aspect-rule-result.txt'
    train_opinion_rule_result_file = 'd:/data/aspect/semeval14/laptops/laptops-train-opinion-rule-result.txt'
    valid_aspect_rule_result_file = 'd:/data/aspect/semeval14/laptops/laptops-test-aspect-rule-result.txt'
    valid_opinion_rule_result_file = 'd:/data/aspect/semeval14/laptops/laptops-test-opinion-rule-result.txt'

    def _load_feats(tok_texts_file, aspect_result_file, opinion_result_file):
        # Aspect features first, then opinion features, then merge the two.
        aspect_feats = __get_manual_feat(tok_texts_file, aspect_result_file)
        opinion_feats = __get_manual_feat(tok_texts_file, opinion_result_file)
        return __merge_feat_list(aspect_feats, opinion_feats)

    print('loading data ...')
    with open(config.SE14_LAPTOP_GLOVE_WORD_VEC_FILE, 'rb') as vecs_in:
        vocab, word_vecs_matrix = pickle.load(vecs_in)

    train_data, valid_data = datautils.get_data_semeval(
        config.SE14_LAPTOP_TRAIN_SENTS_FILE,
        config.SE14_LAPTOP_TRAIN_TOK_TEXTS_FILE,
        config.SE14_LAPTOP_TEST_SENTS_FILE,
        config.SE14_LAPTOP_TEST_TOK_TEXTS_FILE,
        vocab, -1, 'both')

    train_feat_list = _load_feats(
        config.SE14_LAPTOP_TRAIN_TOK_TEXTS_FILE,
        train_aspect_rule_result_file,
        train_opinion_rule_result_file)
    valid_feat_list = _load_feats(
        config.SE14_LAPTOP_TEST_TOK_TEXTS_FILE,
        valid_aspect_rule_result_file,
        valid_opinion_rule_result_file)

    # Feature width is read from the first training entry.
    manual_feat_len = train_feat_list[0].shape[1]
    print('manual feat len: {}'.format(manual_feat_len))

    tagger = LSTMCRF(
        n_tags, word_vecs_matrix,
        hidden_size_lstm=hidden_size_lstm,
        manual_feat_len=manual_feat_len)
    tagger.train(
        train_data.word_idxs_list,
        train_data.labels_list,
        valid_data.word_idxs_list,
        valid_data.labels_list,
        vocab,
        valid_data.tok_texts,
        valid_data.aspects_true_list,
        valid_data.opinions_true_list,
        train_feat_list=train_feat_list,
        valid_feat_list=valid_feat_list,
        n_epochs=n_epochs)
def __train_neurule_double_joint():
    """Pre-train or train a ``NeuRuleDoubleJoint`` model for the SE14 laptop data.

    Three datasets are loaded: the SE14 laptop target data
    (``datautils.get_data_semeval``) and two source datasets from
    ``datautils.get_data_amazon`` (one labelled ``'aspect'``, one labelled
    ``'opinion'``). Each is wrapped in ``NeuRuleDoubleJoint.TrainData``.

    The local ``task`` switch selects the entry point:
      * ``'pretrain'`` calls ``pre_train`` on the two source datasets and
        passes ``config.LAPTOP_NRDJ_RULE_MODEL_FILE`` as ``save_file``;
      * ``'train'`` constructs the model with that same file as ``model_file``
        and calls ``train`` on all three datasets.

    ``str_today``, ``hidden_size_lstm`` and ``n_epochs`` are read from the
    enclosing module.
    """
    init_logging('log/nrdj-{}.log'.format(str_today), mode='a', to_stdout=True)

    # NOTE(review): -1 presumably means "use every training sentence" - confirm
    # in datautils.
    n_train = -1
    task = 'train'  # switch to 'pretrain' to run the pre-training path instead
    label_opinions = True
    n_tags = 5 if label_opinions else 3
    lr = 0.001
    share_lstm = False
    train_mode = 'target-only'

    print('loading data ...')
    with open(config.SE14_LAPTOP_GLOVE_WORD_VEC_FILE, 'rb') as f:
        vocab, word_vecs_matrix = pickle.load(f)

    train_data_tar, valid_data_tar = datautils.get_data_semeval(
        config.SE14_LAPTOP_TRAIN_SENTS_FILE, config.SE14_LAPTOP_TRAIN_TOK_TEXTS_FILE,
        config.SE14_LAPTOP_TEST_SENTS_FILE, config.SE14_LAPTOP_TEST_TOK_TEXTS_FILE,
        vocab, n_train, label_opinions)
    train_data_src1, valid_data_src1 = datautils.get_data_amazon(
        vocab, config.AMAZON_TERMS_TRUE2_FILE, config.AMAZON_TOK_TEXTS_FILE, 'aspect')
    train_data_src2, valid_data_src2 = datautils.get_data_amazon(
        vocab, config.AMAZON_TERMS_TRUE4_FILE, config.AMAZON_TOK_TEXTS_FILE, 'opinion')

    # An existing model file is only handed to the constructor when training.
    rule_model_file = config.LAPTOP_NRDJ_RULE_MODEL_FILE if task == 'train' else None
    pretrain_model_file = config.LAPTOP_NRDJ_RULE_MODEL_FILE
    print('done')

    nrdj = NeuRuleDoubleJoint(n_tags, word_vecs_matrix, share_lstm,
                              hidden_size_lstm=hidden_size_lstm, model_file=rule_model_file)

    # The source datasets are always wrapped, whatever train_mode is.
    # Source 1 supplies only aspect ground truth, source 2 only opinion ground truth.
    nrj_train_data_src1 = NeuRuleDoubleJoint.TrainData(
        train_data_src1.word_idxs_list, train_data_src1.labels_list,
        valid_data_src1.word_idxs_list, valid_data_src1.labels_list,
        valid_data_src1.tok_texts, valid_data_src1.aspects_true_list, None
    )
    nrj_train_data_src2 = NeuRuleDoubleJoint.TrainData(
        train_data_src2.word_idxs_list, train_data_src2.labels_list,
        valid_data_src2.word_idxs_list, valid_data_src2.labels_list,
        valid_data_src2.tok_texts, None, valid_data_src2.opinions_true_list
    )
    nrj_train_data_tar = NeuRuleDoubleJoint.TrainData(
        train_data_tar.word_idxs_list, train_data_tar.labels_list,
        valid_data_tar.word_idxs_list, valid_data_tar.labels_list,
        valid_data_tar.tok_texts, valid_data_tar.aspects_true_list, valid_data_tar.opinions_true_list
    )

    if task == 'pretrain':
        nrdj.pre_train(nrj_train_data_src1, nrj_train_data_src2, vocab,
                       n_epochs=n_epochs, lr=lr, save_file=pretrain_model_file)
    elif task == 'train':
        nrdj.train(nrj_train_data_src1, nrj_train_data_src2, nrj_train_data_tar, vocab, train_mode,
                   n_epochs=n_epochs, lr=lr)
def __train_nrdj_deep_restaurant_pr():
    """Train a ``NeuRuleDoubleJointDeep`` model for the SE14 restaurant data.

    The target data comes from ``datautils.get_data_semeval`` on the SE14
    restaurant files. Both source datasets are loaded with
    ``datautils.get_data_amazon`` from the same Yelp tokenized-text file and
    are both labelled ``'aspect'``; they differ only in the rule-result file
    used (the ``-p`` and ``-r`` variants).

    ``str_today`` is read from the enclosing module; ``hidden_size_lstm`` and
    ``n_epochs`` are set locally.
    """
    init_logging('log/nrdj-deep-restaurant-{}.log'.format(str_today), mode='a', to_stdout=True)

    # NOTE(review): -1 presumably means "use every training sentence" - confirm
    # in datautils.
    n_train = -1
    task = 'train'
    label_task = 'aspect'
    n_tags = 5 if label_task == 'both' else 3
    hidden_size_lstm = 100
    n_epochs = 500
    lr = 0.001
    share_lstm = True
    load_pretrained_model = False
    train_mode = 'all'  # the original also listed 'target-only' as an alternative

    aspect_terms_p_file = 'd:/data/aspect/semeval14/restaurant/yelp-aspect-rule-result-p.txt'
    aspect_terms_r_file = 'd:/data/aspect/semeval14/restaurant/yelp-aspect-rule-result-r.txt'
    yelp_tok_texts_file = 'd:/data/res/yelp-review-eng-tok-sents-round-9.txt'
    rule_model_file = 'd:/data/aspect/semeval14/tf-model/drest/yelp-nrdj.ckpl'

    # A stored model is only loaded when training with load_pretrained_model on.
    load_model_file = rule_model_file if (task == 'train' and load_pretrained_model) else None
    # NOTE(review): no save path is passed to train() below; an earlier local
    # that selected rule_model_file for a 'pretrain' task was never used.
    # Confirm whether the trained model is meant to be persisted.

    print('loading data ...')
    with open(config.SE14_REST_GLOVE_WORD_VEC_FILE, 'rb') as f:
        vocab, word_vecs_matrix = pickle.load(f)

    train_data_tar, valid_data_tar = datautils.get_data_semeval(
        config.SE14_REST_TRAIN_SENTS_FILE, config.SE14_REST_TRAIN_TOK_TEXTS_FILE,
        config.SE14_REST_TEST_SENTS_FILE, config.SE14_REST_TEST_TOK_TEXTS_FILE,
        vocab, n_train, label_task)
    train_data_src1, valid_data_src1 = datautils.get_data_amazon(
        vocab, aspect_terms_p_file, yelp_tok_texts_file, 'aspect')
    train_data_src2, valid_data_src2 = datautils.get_data_amazon(
        vocab, aspect_terms_r_file, yelp_tok_texts_file, 'aspect')
    print('done')

    nrdj = NeuRuleDoubleJointDeep(n_tags, word_vecs_matrix, share_lstm,
                                  hidden_size_lstm=hidden_size_lstm, model_file=load_model_file)

    # Both source datasets carry aspect ground truth only (opinions slot is None).
    nrj_train_data_src1 = NeuRuleDoubleJointDeep.TrainData(
        train_data_src1.word_idxs_list, train_data_src1.labels_list,
        valid_data_src1.word_idxs_list, valid_data_src1.labels_list,
        valid_data_src1.tok_texts, valid_data_src1.aspects_true_list, None)
    nrj_train_data_src2 = NeuRuleDoubleJointDeep.TrainData(
        train_data_src2.word_idxs_list, train_data_src2.labels_list,
        valid_data_src2.word_idxs_list, valid_data_src2.labels_list,
        valid_data_src2.tok_texts, valid_data_src2.aspects_true_list, None)
    nrj_train_data_tar = NeuRuleDoubleJointDeep.TrainData(
        train_data_tar.word_idxs_list, train_data_tar.labels_list,
        valid_data_tar.word_idxs_list, valid_data_tar.labels_list,
        valid_data_tar.tok_texts, valid_data_tar.aspects_true_list, valid_data_tar.opinions_true_list)

    nrdj.train(nrj_train_data_src1, nrj_train_data_src2, nrj_train_data_tar, vocab, train_mode,
               n_epochs=n_epochs, lr=lr)
word_vecs_file = dataset_files['word_vecs_file'] logging.info('word_vec_file: {}'.format(word_vecs_file)) logging.info(dataset_files['test_sents_file']) print('loading data ...') with open(word_vecs_file, 'rb') as f: vocab, word_vecs_matrix = pickle.load(f) # print(vocab) word_idx_dict = {w: i + 1 for i, w in enumerate(vocab)} unlabeled_word_seqs = datautils.read_sents_to_word_idx_seqs( dataset_files['unlabeled_tok_sents_file'], word_idx_dict) print(len(unlabeled_word_seqs), 'unsupervised sents') # n_unlabeled_sents_used = 1000 n_unlabeled_sents_used = len(unlabeled_word_seqs) n_unlabeled_samples_per_iter = 1000 unsupervised_word_seqs = unlabeled_word_seqs[:n_unlabeled_sents_used] logging.info('{} unsupervised sents used.'.format(n_unlabeled_sents_used)) train_data, valid_data, test_data = datautils.get_data_semeval( dataset_files['train_sents_file'], dataset_files['train_tok_texts_file'], dataset_files['train_valid_split_file'], dataset_files['test_sents_file'], dataset_files['test_tok_texts_file'], vocab, n_train, label_opinions) ncrfae = NeuCRFAutoEncoder(n_tags, word_vecs_matrix, batch_size_l=4, batch_size_u=16, lr_method='adam') # ncrfae.test_model(train_data) ncrfae.train( data_train=train_data, data_valid=valid_data, data_test=test_data, unlabeled_word_seqs=unlabeled_word_seqs, n_unlabeled_samples_per_iter=n_unlabeled_samples_per_iter, n_epochs=500, lr=0.001)