def __init__(self, args):
    super().__init__()
    train_file = args.train_file
    vocab_file = args.vocab_file
    train_sens = data_utils.load_sentences(train_file, skip_invalid=True)
    word2id, id2word, label2id, id2label = data_utils.load_vocab(train_sens, vocab_file)
    data_utils.gen_ids(train_sens, word2id, label2id, 100)
    train_full_tensors = data_utils.make_full_tensors(train_sens)
    raw_x = train_full_tensors[0]
    x_length = train_full_tensors[1]
    x_labels = train_full_tensors[2]
    x_labels_true = np.array([id2label[t] for t in x_labels])
    n_train = int(len(raw_x) * 1)  # use all samples for training; the test split is empty
    self.train_x, self.test_x = raw_x[:n_train], raw_x[n_train:]
    self.train_length_x, self.test_length_x = x_length[:n_train], x_length[n_train:]
    self.train_y, self.test_y = x_labels[:n_train], x_labels[n_train:]
    self.gt_label = x_labels_true
    self.raw_q = ["".join(s.raw_tokens) for s in train_sens]
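# Usage sketch (not part of the original file): instantiating ExpDataset directly with an
# argparse.Namespace instead of parsed CLI flags. The module name "dataset" and the file
# paths below are assumptions for illustration only.
from argparse import Namespace

import dataset  # assumed to be the module defining ExpDataset (see __init__ above)

args = Namespace(train_file="./data/train.txt", vocab_file="./vocab")
exp_data = dataset.ExpDataset(args)
print(exp_data.train_x.shape, exp_data.train_y.shape, len(exp_data.raw_q))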
def train():
    # load data
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    if not os.path.isfile(FLAGS.map_file):
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])))
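# Minimal sketch of what augment_with_pretrained is assumed to do (not the repository's
# actual implementation): extend the training character dictionary with characters that
# have a pretrained embedding, so test-time characters are not all mapped to <UNK>.
import codecs

def augment_with_pretrained_sketch(dico_train, emb_path, chars):
    # characters that appear in the pretrained embedding file
    pretrained = set()
    with codecs.open(emb_path, "r", "utf-8") as f:
        for line in f:
            pretrained.add(line.rstrip().split()[0])
    # add characters from the given list that have an embedding but were unseen in training
    for ch in chars:
        if ch in pretrained and ch not in dico_train:
            dico_train[ch] = 0
    # rebuild the id mappings, most frequent characters first
    sorted_items = sorted(dico_train.items(), key=lambda x: (-x[1], x[0]))
    id_to_char = {i: item[0] for i, item in enumerate(sorted_items)}
    char_to_id = {ch: i for i, ch in id_to_char.items()}
    return dico_train, char_to_id, id_to_char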
parser.add_argument("--init_checkpoint", type=str, default='') parser.add_argument("--train_file", type=str, default="") parser.add_argument("--batch_size", type=int, default=32) # customized cluster centers file path, pass either of params 'external_cluster_center' or 'n_clusters' parser.add_argument("--external_cluster_center", type=str, default="") # number of clusters (init with kmeans) parser.add_argument("--n_clusters", type=int, default=20) parser.add_argument("--epochs", type=int, default=50) parser.add_argument("--warmup_steps", type=int, default=1000) parser.add_argument("--learning_rate", type=float, default=0.01) # DEC model q distribution param, alpha=1 in paper parser.add_argument("--alpha", type=int, default=1) parser.add_argument("--layer_num", type=int, default=1) parser.add_argument("--token_num", type=int, default=7820) parser.add_argument("--lstm_dim", type=int, default=500) parser.add_argument("--embedding_dim", type=int, default=1000) parser.add_argument("--vocab_file", type=str, default="./vocab") parser.add_argument("--model_save_dir", type=str, default="./saved_model") args = parser.parse_args() word2id, id2word = data_utils.load_vocab_file(args.vocab_file) trainset_size = len( data_utils.load_sentences(args.train_file, skip_invalid=True)) other_arg_dict = {} other_arg_dict['token_num'] = len(word2id) other_arg_dict['trainset_size'] = trainset_size exp_data = dataset.ExpDataset(args) dec_model = dec_model.DEC(args, other_arg_dict) train(exp_data, dec_model, args)
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.zeros)

    # apply the chosen tagging scheme (IOB / IOBES)
    train_sentences = update_tag_scheme(train_sentences, FLAGS.tag_schema)
    dev_sentences = update_tag_scheme(dev_sentences, FLAGS.tag_schema)
    test_sentences = update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if they do not exist
    if not os.path.exists(FLAGS.map_file):
        if FLAGS.pre_emb:
            char_to_id, _ = char_mapping(train_sentences)
            char_to_id, id_to_char = augment_with_pretrained(char_to_id, 'wiki_100.utf8')
        else:
            char_to_id, id_to_char = char_mapping(train_sentences)
        tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, 'wb') as f:
            cPickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f, cPickle.HIGHEST_PROTOCOL)
    else:
        with open(FLAGS.map_file, 'rb') as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = cPickle.load(f)

    # prepare data: get collections of lists containing indices
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, True)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, True)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, True)
    print "%i %i %i sentences in train / dev / test." % (len(train_data), len(dev_data), len(test_data))

    if not FLAGS.pre_emb:
        pre_emb = None
    else:
        pre_emb = load_word2vec(FLAGS.pre_emb_file, char_to_id, FLAGS.char_dim)
        print "init embedding shape: (%d,%d)" % (pre_emb.shape[0], pre_emb.shape[1])

    train_manager = BatchManager(train_data, FLAGS.batch_size, True)
    dev_manager = BatchManager(dev_data, FLAGS.batch_size, False)
    test_manager = BatchManager(test_data, FLAGS.batch_size, False)

    config = BasicModelConfig(FLAGS, len(char_to_id), len(tag_to_id), 4)
    tfConfig = tf.ConfigProto()
    tfConfig.gpu_options.per_process_gpu_memory_fraction = FLAGS.memory_usage
    with tf.Session(config=tfConfig) as sess:
        print "Train started!"
        model = BasicModel(config, pre_emb)
        saver = tf.train.Saver()

        # tensorboard
        if not os.path.exists(FLAGS.summaries_dir):
            os.mkdir(FLAGS.summaries_dir)
        merged = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter(os.path.join(FLAGS.summaries_dir, FLAGS.model_name, "train"), sess.graph)
        test_writer = tf.summary.FileWriter(os.path.join(FLAGS.summaries_dir, FLAGS.model_name, "test"), sess.graph)

        # load a previously trained model or create a new one
        if not os.path.exists(FLAGS.checkpoints):
            os.mkdir(FLAGS.checkpoints)
        model_name = os.path.join(FLAGS.checkpoints, FLAGS.model_name)
        ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoints)
        if ckpt and ckpt.model_checkpoint_path:
            print "restore from previously trained model: %s" % FLAGS.model_name
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            sess.run(tf.global_variables_initializer())

        def evaluate(sess, model, manager):
            strings = []
            predicts = []
            goldens = []
            bar = ProgressBar(max_value=manager.num_batch)
            for batch in bar(manager.iter_batch()):
                batch_string, batch_predict, batch_golden = model.evaluate_step(sess, batch)
                strings.extend(batch_string)
                predicts.extend(batch_predict)
                goldens.extend(batch_golden)
            return strings, predicts, goldens

        best_eval_f1 = 0
        noimpro_num = 0
        for i in range(FLAGS.max_epoch):
            # train
            train_loss = []
            bar = ProgressBar(max_value=train_manager.num_batch)
            for step, batch in bar(enumerate(train_manager.iter_batch())):
                batch.append(merged)
                summary, global_step, batch_loss = model.train_step(sess, batch, FLAGS.dropout_keep)
                # add summary to tensorboard
                train_writer.add_summary(summary, global_step)
                train_loss.append(batch_loss)
            print "Epoch %d Train loss is %.4f" % (i + 1, np.mean(train_loss))

            # dev
            strings, predicts, goldens = evaluate(sess, model, dev_manager)
            eval_f1 = report_results(strings, predicts, goldens, id_to_char, id_to_tag, 'outputs/dev')
            if eval_f1 > best_eval_f1:
                best_eval_f1 = eval_f1
                noimpro_num = 0
                saver.save(sess, model_name)
            else:
                noimpro_num += 1
            print "Epoch %d Best eval f1: %.6f" % (i + 1, best_eval_f1)

            # test
            strings, predicts, goldens = evaluate(sess, model, test_manager)
            test_f1 = report_results(strings, predicts, goldens, id_to_char, id_to_tag, 'outputs/test', True)

            # early stopping
            if noimpro_num >= 3:
                print "Early stop! Final F1 score on test data: %.6f" % test_f1
                break
            print
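# Hedged sketch of what report_results is assumed to compute above: entity-level F1 from
# IOBES tag sequences, comparing predicted and gold (start, end, type) spans. This is an
# illustration, not the repository's implementation.
def extract_spans(tags):
    spans, start = set(), None
    for i, tag in enumerate(tags):
        if tag.startswith('B-'):
            start = i
        elif tag.startswith('S-'):
            spans.add((i, i, tag[2:]))
            start = None
        elif tag.startswith('E-') and start is not None:
            spans.add((start, i, tag[2:]))
            start = None
        elif tag == 'O':
            start = None
    return spans

def entity_f1(pred_seqs, gold_seqs):
    tp, n_pred, n_gold = 0, 0, 0
    for pred, gold in zip(pred_seqs, gold_seqs):
        p, g = extract_spans(pred), extract_spans(gold)
        tp += len(p & g)
        n_pred += len(p)
        n_gold += len(g)
    precision = float(tp) / n_pred if n_pred else 0.0
    recall = float(tp) / n_gold if n_gold else 0.0
    return 2 * precision * recall / (precision + recall) if precision + recall else 0.0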
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # choose the tag scheme (IOB / IOBES); IOBES is used by default
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    if not os.path.isfile(FLAGS.map_file):
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)
        # create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            # e.g. {'S-LOC': 10, 'E-LOC': 3, 'B-ORG': 4, 'S-PER': 11, 'S-ORG': 12, 'O': 0,
            #       'E-ORG': 5, 'I-LOC': 6, 'I-PER': 7, 'I-ORG': 1, 'B-PER': 8, 'B-LOC': 2, 'E-PER': 9}
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # convert sentences to index sequences
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), len(dev_data), len(test_data)))

    # pad short sequences with zeros
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # GPU settings
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                    # report the average loss every 100 steps, then reset it
                    loss = []
            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
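# Hedged sketch of the IOB-to-IOBES conversion that update_tag_scheme above is assumed to
# apply to each sentence: single-token entities become S-*, entity-final tokens become E-*.
def iob_to_iobes(tags):
    new_tags = []
    for i, tag in enumerate(tags):
        if tag == 'O':
            new_tags.append(tag)
        elif tag.startswith('B-'):
            if i + 1 < len(tags) and tags[i + 1].startswith('I-'):
                new_tags.append(tag)  # entity continues on the next token
            else:
                new_tags.append(tag.replace('B-', 'S-'))  # single-token entity
        elif tag.startswith('I-'):
            if i + 1 < len(tags) and tags[i + 1].startswith('I-'):
                new_tags.append(tag)  # entity continues on the next token
            else:
                new_tags.append(tag.replace('I-', 'E-'))  # entity ends here
        else:
            raise ValueError('Invalid IOB tag: %s' % tag)
    return new_tags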
result_path = os.path.join("result")

# path for data
# train_file = os.path.join("data", "example.train")
# dev_file = os.path.join("data", "example.dev")
# path for data_medicine_three
# train_file = os.path.join("data", "example_medicine_three.train")
# dev_file = os.path.join("data", "example_medicine_three.dev")
# path for data_medicine_all
train_file = os.path.join("data", "example_medicine_all.train")
dev_file = os.path.join("data", "example_medicine_all.dev")
emb_file = os.path.join("data", "wiki_100.utf8")  # path for the pre-trained embeddings

# load data and get sentences
train_sentences = load_sentences(train_file)
dev_sentences = load_sentences(dev_file)
# print(train_sentences[5], '\n', dev_sentences[5], '\n', dev_sentences[5])

# build the tag mapping dictionaries
tag_index, id_to_tag = get_tag_index(train_sentences)
print("tag_index:", tag_index, len(tag_index))
print("id_to_tag:", id_to_tag, len(id_to_tag))

# prepare data: convert sentences to index sequences and build the word-to-ID mapping
train_data = prepare_data(train_sentences, seg_dim, tag_index)
# print("train_data:", "\n", train_data[0][2], "\n", train_data[1], "\n", train_data[2][2])
dev_data = prepare_data(dev_sentences, seg_dim, tag_index)

# get the word mapping dictionary
word_index = train_data[1]
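# Hedged sketch of what get_tag_index above is assumed to do: collect the tag set from the
# training sentences (each sentence a list of [char, tag] pairs) and build both mappings.
def get_tag_index_sketch(sentences):
    tags = sorted({token[-1] for sentence in sentences for token in sentence})
    tag_index = {tag: i for i, tag in enumerate(tags)}
    id_to_tag = {i: tag for tag, i in tag_index.items()}
    return tag_index, id_to_tag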
def get_encoded_data(args):
    global agree_words, disagree_words, agree_indices, disagree_indices

    # %% load data
    # load sentence data
    sents, labels = load_sentences(domain=args.domain)
    # load sentiment lexicon
    lexicon = load_lexicon()
    pos_words = [word for word in lexicon if lexicon[word] == 1]
    neg_words = [word for word in lexicon if lexicon[word] == 0]
    lex_labels = [1] * len(pos_words) + [0] * len(neg_words)
    lex_word_seqs = pos_words + neg_words
    # load document data
    mdsd_domain = 'dvd' if args.domain == 'dvds' else args.domain
    doc_texts, doc_labels, _ = load_documents(domains=(mdsd_domain,))  # just one domain, ignore domain labels

    ## build vocabulary
    counter = Counter()
    word_seqs = []
    doc_word_seqs = []
    doc_word_sseqs = []
    # tokenize to words
    for sent in sents:
        word_seqs.append(my_tokenize(sent))  # [[w1, w2, ...], ...]
    for doc in doc_texts:
        doc_word_seqs.append(my_tokenize(doc))
        sent_seqs = []
        for sent in sent_tokenize(doc):
            sent_seqs.append(my_tokenize(sent))
        doc_word_sseqs.append(sent_seqs)  # [[[w11, w12, ...], [w21, w22, ...], ...], ...]
    # stat and index
    lens = []
    doc_lens = []
    doc_sentlens = []
    doc_wordlens = []
    for word_seq in word_seqs:
        counter.update(word_seq)
        lens.append(len(word_seq))
    for word in lexicon.keys():
        counter.update([word])
    for doc_word_seq in doc_word_seqs:
        # counter.update(doc_word_seq)
        doc_lens.append(len(doc_word_seq))
    for sent_seqs in doc_word_sseqs:
        doc_sentlens.append(len(sent_seqs))
        for sent_seq in sent_seqs:
            counter.update(sent_seq)
            doc_wordlens.append(len(sent_seq))
    percentage = 98
    maxlen = int(np.percentile(lens, percentage))
    doc_maxlen_sent = int(np.percentile(doc_sentlens, percentage))  # max sent per doc
    doc_maxlen_word = int(np.percentile(doc_wordlens, percentage))  # max word per sent
    doc_maxlen_word = max(maxlen, doc_maxlen_word)
    # the vocabulary
    min_freq = 3
    word2index = dict()
    idx = 2  # start from 2, 0 as <PAD>, 1 as <OOV>
    for word_count in counter.most_common():
        if word_count[1] >= min_freq or word_count[0] in lexicon:
            word2index[word_count[0]] = idx
            idx += 1
    n_words = len(word2index) + 2
    print('words:', len(word2index))
    print('[agree] words:')
    for word in agree_words:
        if word in word2index:
            agree_indices.add(word2index[word])
            print(' -', word, word2index[word])
    print('[disagree] words:')
    for word in disagree_words:
        if word in word2index:
            disagree_indices.add(word2index[word])
            print(' -', word, word2index[word])
    print('agree: {}\ndisagree: {}'.format(agree_indices, disagree_indices))

    # %% data encoding ====================================================================
    # sent data, and CV version
    seqs = []
    for words in word_seqs:
        seqs.append([word2index.get(word, 1) for word in words])
    padded_seqs_bak = pad_sequences(seqs, maxlen=doc_maxlen_word, padding='post', truncating='post')
    labels_bak = np.asarray(labels, dtype=int)
    print('sent:', padded_seqs_bak.shape, labels_bak.shape)
    # CV-fold split for sentence data
    kf = StratifiedKFold(n_splits=CV, shuffle=True)
    padded_seqs_trains = dict()
    padded_seqs_tests = dict()
    labels_trains = dict()
    labels_tests = dict()
    print('{} fold train/test splitting'.format(CV))
    for cv, (train_idx, test_idx) in enumerate(kf.split(padded_seqs_bak, labels_bak)):
        padded_seqs_trains[cv] = padded_seqs_bak[train_idx]
        padded_seqs_tests[cv] = padded_seqs_bak[test_idx]
        labels_trains[cv] = labels_bak[train_idx]
        labels_tests[cv] = labels_bak[test_idx]
    # lex data
    lex_seqs = []
    for word in lex_word_seqs:
        lex_seqs.append([word2index.get(word, 1)])
    lex_padded_seqs = pad_sequences(lex_seqs, maxlen=1, padding='post', truncating='post')
    lex_labels = np.asarray(lex_labels, dtype=int)
    print(' - lex (all):', lex_padded_seqs.shape, lex_labels.shape)
    # doc data (hierarchical), padding from word to sent
    n_samples = len(doc_word_sseqs)
    doc_padded_seqs = np.zeros(shape=(n_samples, doc_maxlen_sent, doc_maxlen_word), dtype=int)
    for i, sseq_1doc in enumerate(doc_word_sseqs):
        for j, seq_1doc in enumerate(sseq_1doc):
            if j < doc_maxlen_sent:
                for k, word in enumerate(seq_1doc):
                    if k < doc_maxlen_word:
                        doc_padded_seqs[i, j, k] = word2index.get(word, 1)
    doc_labels = np.asarray(doc_labels, dtype=int)
    print(' - doc (all):', doc_padded_seqs.shape, doc_labels.shape)
    # relation data for doc (internal sents) (agree & disagree)
    count_agree, count_disagree = 0, 0
    doc_rel_padded_seqs = np.zeros(shape=(n_samples, doc_maxlen_sent), dtype=int)
    for i in range(0, n_samples):
        for j in range(1, doc_maxlen_sent):
            if doc_padded_seqs[i, j, 0] in agree_indices:
                doc_rel_padded_seqs[i, j] = 1
                count_agree += 1
            if doc_padded_seqs[i, j, 0] in disagree_indices:
                doc_rel_padded_seqs[i, j] = -1
                count_disagree += 1
    print(' - doc sent-rel (all):', doc_rel_padded_seqs.shape)
    print(' - doc sent-rel (all): agree: {}, disagree: {}'.format(count_agree, count_disagree))

    ## sub-sample from lexicon and documents
    print('sub-sampling:')
    # doc data sub-sample
    n_samples = len(padded_seqs_trains[0]) + len(padded_seqs_tests[0])
    doc_padded_seqs, doc_rel_padded_seqs, doc_labels = balanced_subsample3(
        doc_padded_seqs, doc_rel_padded_seqs, doc_labels, subsample_num=n_samples)
    doc_padded_seqs = np.asarray(doc_padded_seqs)
    doc_labels = np.asarray(doc_labels, dtype=int)
    print(' - doc (sampled):', doc_padded_seqs.shape, doc_labels.shape)
    # lex data sub-sample
    lex_padded_seqs, lex_labels = balanced_subsample2(lex_padded_seqs, lex_labels, subsample_num=n_samples)
    lex_padded_seqs = np.asarray(lex_padded_seqs)
    lex_labels = np.asarray(lex_labels, dtype=int)
    print(' - lex (sampled):', lex_padded_seqs.shape, lex_labels.shape)

    ddata = {
        'n_samples': n_samples,
        'n_words': n_words,
        'doc_maxlen_word': doc_maxlen_word,
        'doc_maxlen_sent': doc_maxlen_sent,
        'word2index': word2index,
        'padded_seqs_trains': padded_seqs_trains,
        'labels_trains': labels_trains,
        'padded_seqs_tests': padded_seqs_tests,
        'labels_tests': labels_tests,
        'lex_padded_seqs': lex_padded_seqs,
        'lex_labels': lex_labels,
        'doc_padded_seqs': doc_padded_seqs,
        'doc_labels': doc_labels,
        'doc_rel_padded_seqs': doc_rel_padded_seqs,
    }
    return ddata
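# Hedged sketch of what balanced_subsample2 above is assumed to do: draw an (approximately)
# equal number of items per class so the result is balanced with about subsample_num rows;
# balanced_subsample3 would do the same while keeping a third aligned array in sync.
import numpy as np

def balanced_subsample2_sketch(xs, ys, subsample_num):
    classes = np.unique(ys)
    per_class = subsample_num // len(classes)
    keep = []
    for c in classes:
        idx = np.where(ys == c)[0]
        replace = len(idx) < per_class  # sample with replacement only if the class is too small
        keep.extend(np.random.choice(idx, size=per_class, replace=replace))
    keep = np.random.permutation(np.asarray(keep, dtype=int))
    return xs[keep], ys[keep]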