def _write_question_pairs(path, columns, lines, qids):
    """Write one tab-separated question-pair table, header row first.

    Args:
        path: destination file path.
        columns: the six header names.
        lines: raw ``label<TAB>question1<TAB>question2`` lines.
        qids: mapping from question text to its integer id.
    """
    row_format = "{}\t{}\t{}\t{}\t{}\t{}\n"
    with open(path, 'wt', encoding='utf-8') as f:
        f.write(row_format.format(*columns))
        for i, line in enumerate(lines):
            fields = line.strip().split('\t')
            is_duplicate, q1, q2 = fields[0], fields[1], fields[2]
            f.write(row_format.format(i, qids[q1], qids[q2], q1, q2, is_duplicate))


def to_csv():
    """Convert the train and test question-pair files into id-annotated tables.

    Every distinct question found in either file receives an integer id
    starting at 1. Ids are handed out in first-seen order (train lines
    first, then test lines), so repeated runs over the same input produce
    the same ids. The original drew them from ``set`` iteration order,
    which changes between interpreter runs under hash randomisation.

    Writes ``train.csv`` and ``test.csv`` (tab-separated despite the name)
    with the columns id, qid1, qid2, question1, question2, is_duplicate.
    """
    train_path = "../input/processing/spanish_train_dedup.txt"
    test_path = "../input/processing/test_b.txt"
    columns = ['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate']
    train = read_data(train_path)
    test = read_data(test_path)

    qids = {}
    for line in train + test:
        fields = line.strip().split('\t')
        for question in (fields[1], fields[2]):
            # setdefault leaves an already-numbered question untouched.
            qids.setdefault(question, len(qids) + 1)

    _write_question_pairs('../input/processing/train.csv', columns, train, qids)
    _write_question_pairs('../input/processing/test.csv', columns, test, qids)
def build_spanish_character_vocab():
    """Count character frequencies over the Spanish train and test files.

    Each input line must hold exactly three tab-separated fields; the
    characters of the second and third fields are counted. The result is
    written to ``../input/char_vocab.txt`` as ``char<TAB>count`` rows,
    most frequent first (ties keep first-seen order).

    Raises:
        AssertionError: if a line does not split into exactly three fields.
    """
    from collections import Counter

    print('build spanish character vocab')
    train_path = '../input/processing/spanish_train_dedup.txt'
    test_path = '../input/processing/test.txt'
    train = read_data(train_path)
    test = read_data(test_path)

    vocabs = Counter()
    for data in [train, test]:
        for line in data:
            fields = line.strip().split('\t')
            assert len(fields) == 3, len(fields)
            vocabs.update(fields[1] + fields[2])

    # Write to char_vocab.txt. The context manager closes the file even if
    # a write fails; the original used a bare open()/close() pair.
    with open('../input/char_vocab.txt', 'wt', encoding='utf-8') as fw:
        for char, count in vocabs.most_common():
            fw.write("{}\t{}\n".format(char, count))
def concat_english_spanish_vocab():
    """Join the Spanish and English vocabulary files into one list.

    Spanish entries come first, followed by the English ones; the combined
    list is saved to the multi-task-learning vocabulary file.
    """
    english_entries = read_data("../input/english_word_vocabs.txt")
    spanish_entries = read_data("../input/words.txt")
    save_data(
        path='../input/processing/multi_task_learn/all_vocab.txt',
        data=spanish_entries + english_entries,
    )
def processing_data_1_step():
    """Split the raw corpora into an English part and a Spanish part (no shuffle).

    Both training files hold five tab-separated columns with the label in
    column 4. In the English training file, columns 0 and 2 go to the
    English output and columns 1 and 3 to the Spanish output; in the
    Spanish training file the roles are swapped. Each output row is
    ``label<TAB>sentence1<TAB>sentence2``.

    The test file's first two columns are written with a constant label
    of 0.
    """
    process_base_path = '../input/processing/'
    base_path = '../input/'
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(process_base_path, exist_ok=True)

    train_en = base_path + 'cikm_english_train_20180516.txt'
    train_sp = base_path + 'cikm_spanish_train_20180516.txt'
    english_file = process_base_path + 'english.txt'
    spanish_file = process_base_path + 'spanish.txt'
    test_file = process_base_path + 'test_b_no_process.txt'
    row = '{}\t{}\t{}\n'

    # Separate the English and Spanish sentences of the English train file.
    ens = []
    sps = []
    for line in read_data(train_en):
        cols = line.strip().split('\t')
        ens.append(row.format(cols[4], cols[0], cols[2]))
        sps.append(row.format(cols[4], cols[1], cols[3]))

    # Same for the Spanish train file, where the column roles are swapped.
    for line in read_data(train_sp):
        cols = line.strip().split('\t')
        sps.append(row.format(cols[4], cols[0], cols[2]))
        ens.append(row.format(cols[4], cols[1], cols[3]))

    # Save the separated English and Spanish files.
    save_data(english_file, data=ens)
    save_data(spanish_file, data=sps)

    print(u'对测试数据进行预处理,所有的label均设置为0')
    test_path = base_path + 'cikm_test_b_20180730.txt'
    test_rows = []
    for line in read_data(test_path):
        cols = line.strip().split('\t')
        test_rows.append(row.format(0, cols[0], cols[1]))
    save_data(test_file, test_rows)
    print('Done')
def split_train_valid(split_rate, is_use_real=False):
    """Split the Spanish training file into a train part and a validation part.

    The validation part is always the tail of the file.

    Args:
        split_rate: fraction of lines used for validation when
            ``is_use_real`` is false.
        is_use_real: when true, ignore ``split_rate``, hold out the last
            1400 lines, and shuffle the remaining training lines.
    """
    train_file = config.spanish_train_path
    train_data_file = '../input/processing/train_data.txt'
    valid_data_file = '../input/processing/valid_data.txt'
    train = read_data(train_file)

    valid_size = 1400 if is_use_real else int(split_rate * len(train))
    # Slice by an explicit index. With the original ``train[:-valid_size]``
    # a size of 0 gave an empty training set and put every line into the
    # validation set, because ``-0 == 0``.
    valid_size = min(max(valid_size, 0), len(train))
    split_at = len(train) - valid_size
    train_data = train[:split_at]
    valid_data = train[split_at:]

    if is_use_real:
        random.shuffle(train_data)
    save_data(train_data_file, train_data)
    save_data(valid_data_file, valid_data)
def embedding_batch(sess, model, vocab):
    """Export encoder states of the training sentences for the embedding projector.

    Registers an ``item_embedding`` tensor with the projector, writes a
    metadata file with one row per distinct training sentence, runs every
    training batch through ``model.step_encoder_decoder``, and saves the
    states selected for the distinct sentences to a checkpoint.

    Args:
        sess: session used to run the model and the final assignment.
        model: pre-built model providing ``get_batch``,
            ``step_encoder_decoder`` and ``global_step``.
        vocab: indexable mapping from token id to token text.
    """
    checkpoint_path = os.path.join(FLAGS.embedding_dir, FLAGS.embedding_model)
    summary_writer = tf.summary.FileWriter(FLAGS.embedding_dir, sess.graph)
    projector_config = projector.ProjectorConfig()
    embed = projector_config.embeddings.add()
    embed.tensor_name = 'item_embedding'
    embed.metadata_path = data_utils.get_metadata_set_path(FLAGS.embedding_dir)
    projector.visualize_embeddings(summary_writer, projector_config)

    train_ids_path = (data_utils.get_train_set_path(FLAGS.train_dir)
                      + ("_ids%d" % FLAGS.vocab_size))
    train_set = data_utils.read_data(train_ids_path, max_size=500000)

    # Flatten all buckets, then turn the id sequence of each entry's first
    # element into token text.
    sentences = []
    for bucket_id in xrange(len(_buckets)):
        sentences.extend(train_set[bucket_id])
    token_rows = [[vocab[token_id] for token_id in pair[0]]
                  for pair in sentences]

    # Keyed by sentence text: a repeated sentence keeps the row index of
    # its last occurrence.
    last_row_by_text = {}
    for row_index, tokens in enumerate(token_rows):
        last_row_by_text[" ".join(tokens)] = row_index
    distinct_items = last_row_by_text.items()
    indices = [row_index for _, row_index in distinct_items]

    with open(embed.metadata_path, 'w+') as item_file:
        item_file.write('id\tchar\n')
        for item_id, (text, _) in enumerate(distinct_items):
            item_file.write('{}\t{}\n'.format(item_id, text))
    print('metadata file created')

    # Collect states batch by batch, bucket by bucket (the same order in
    # which the sentences were flattened above).
    state_list = []
    for bucket_id in xrange(len(_buckets)):
        begin = 0
        while begin < len(train_set[bucket_id]):
            batch = train_set[bucket_id][begin: begin + FLAGS.batch_size]
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                batch, bucket_id, False)
            states, _ = model.step_encoder_decoder(
                sess, encoder_inputs, decoder_inputs, target_weights,
                bucket_id, True)
            state_list.append(states)
            begin += FLAGS.batch_size

    embedding_states = np.concatenate(state_list, axis=0)[indices]
    item_embedding = tf.get_variable(
        embed.tensor_name, [len(distinct_items), FLAGS.size])
    sess.run(item_embedding.assign(embedding_states))
    tf.train.Saver([item_embedding]).save(
        sess, checkpoint_path, global_step=model.global_step)
def build_spanish_vocab(min_freq=2):
    """Extract every word from the Spanish files and write the vocabulary.

    Words from the test file are included as well. Words are ordered by
    descending frequency, words seen fewer than ``min_freq`` times are
    dropped, and ``<PAD>`` and ``<UNK>`` are placed first. The result goes
    to ``../input/words.txt``, one word per line.

    Args:
        min_freq: minimum number of occurrences for a word to be kept.
    """
    print('build spanish vocab')
    basepath = '../input/processing/'
    corpus_paths = (basepath + 'spanish_train.txt', basepath + 'test_b.txt')

    words = defaultdict(int)
    for corpus_path in corpus_paths:
        for line in read_data(corpus_path):
            # Column 0 is skipped; the remaining columns are sentences.
            for sentence in line.split('\t')[1:]:
                for token in re.split(r' +', sentence.strip()):
                    words[token] += 1
    print(len(words))

    # Descending order by count.
    ranked = sorted(words.items(), key=lambda item: -item[1])
    vocabulary = ['<PAD>', '<UNK>']
    vocabulary.extend(word for word, count in ranked if count >= min_freq)

    with open('../input/words.txt', 'wt', encoding='utf-8') as f:
        for entry in vocabulary:
            f.write(entry + '\n')
    print('build spanish vocab done!')
def run():
    """Evaluate the k-nearest-neighbour classifier on iris for k = 1..5.

    For each k, every test row is classified from its first four values
    and compared with its ``species`` entry; the accuracy, rounded to two
    decimals, is printed.
    """
    df_data = data_utils.read_data("iris")
    df_train, df_test = train_test_split(df_data)

    for k in range(1, 6):
        hits = 0
        for _, row in df_test.iterrows():
            features = row.values[:4]
            expected = row["species"]
            if knn(features, df_train, k) == expected:
                hits += 1
        acc = round(hits / len(df_test), 2)
        print(f"k: {k}, accuracy: {acc}")
def build_vocab(self):
    """Build the word and character vocabularies from the data files.

    Reads the train and dev files (plus the test file when one is set),
    takes the first element of every example as its sentence, and builds
    both vocabularies from those sentences. Maximum and average sentence
    and word lengths are printed as a side effect; with no sentences or no
    tokens the averages are reported as 0 instead of raising
    ``ZeroDivisionError``.

    Returns:
        A ``(word_vocab, char_vocab)`` tuple.
    """
    if self.test_file is None:
        print('test_file is None')
        file_list = [self.train_file, self.dev_file]
    else:
        file_list = [self.train_file, self.dev_file, self.test_file]

    examples = data_utils.read_data(file_list)
    sents = [example[0] for example in examples]

    word_vocab = data_utils.build_word_vocab(sents, self.threshold)
    char_vocab = data_utils.build_char_vocab(sents)

    # Report the maximum and average sentence length.
    sent_lens = [len(sent) for sent in sents]
    max_sent_len = max(sent_lens, default=0)
    avg_sent_len = sum(sent_lens) / len(sents) if sents else 0
    print('task: max_sent_len: {}'.format(max_sent_len))
    print('task: avg_sent_len: {}'.format(avg_sent_len))

    # Report the maximum and average word length, averaged over all tokens.
    word_lens = [len(word) for sent in sents for word in sent]
    max_word_len = max(word_lens, default=0)
    total_len = sum(sent_lens)
    avg_word_len = sum(word_lens) / total_len if total_len else 0
    print('task: max_word_len: {}'.format(max_word_len))
    print('task: avg_word_len: {}'.format(avg_word_len))

    return word_vocab, char_vocab
def de_duplicate():
    """Remove repeated lines from the processed Spanish training file.

    Two lines count as the same when they are equal after dropping their
    first character (presumably a one-character label; confirm against the
    file format). The first occurrence is kept; later ones are written to
    a separate file.
    """
    source_path = '../input/processing/spanish_train.txt'
    seen_keys = set()
    unique_rows = []
    repeated_rows = []

    for row in read_data(source_path):
        key = row[1:]
        if key in seen_keys:
            repeated_rows.append(row)
            continue
        unique_rows.append(row)
        seen_keys.add(key)

    save_data('../input/processing/dumplicate_spanish_data.txt', repeated_rows)
    save_data('../input/processing/spanish_train_dedup.txt', unique_rows)
def build_data(self, data_file):
    """Convert a data file into the padded arrays fed to the model.

    Stores the results on the instance: ``examples``, ``input_x``,
    ``input_x_char``, ``x_len``, ``x_char_len`` and ``y``.

    Args:
        data_file: file passed to ``data_utils.read_data``; each example
            provides the sentence at index 0 and the category at index 1.
    """
    self.examples = data_utils.read_data(data_file)

    labels = []
    word_rows = []
    char_rows = []
    sent_lens = []
    for example in self.examples:
        tokens = example[0]
        label = self.config.category2id[example[1]]
        char = data_utils.char_to_matrix(tokens, self.char_vocab)
        sent = data_utils.sent_to_index(tokens, self.word_vocab)
        labels.append(
            data_utils.onehot_vectorize(label, self.config.num_class))
        # Some sentences are empty; substitute a length-8 vector of ones
        # (the original comment says 8 stands for the average length).
        if len(sent) == 0:
            sent = np.ones(8)
        word_rows.append(sent)
        char_rows.append(char)
        sent_lens.append(min(len(sent), self.max_sent_len))

    input_x = data_utils.pad_2d_matrix(word_rows, self.max_sent_len)
    input_x_char = data_utils.pad_3d_tensor(
        char_rows, self.max_sent_len, self.max_word_len)

    # NOTE(review): the lengths are measured on the padded tensor, so they
    # may all equal the padded width. Confirm against pad_3d_tensor.
    char_lens = [
        [min(len(word), self.max_word_len) for word in input_x_char[i]]
        for i in range(len(input_x_char))
    ]

    self.input_x = np.array(input_x, dtype=np.int32)
    self.input_x_char = np.array(input_x_char, dtype=np.int32)
    self.x_len = np.array(sent_lens, dtype=np.int32)
    self.x_char_len = np.array(char_lens, dtype=np.int32)
    self.y = np.array(labels, dtype=np.float32)
def train_model(config):
    """Load the data, size the vocabularies and train the seq2seq model.

    Args:
        config: experiment settings mapping; ``encoder_vocab``,
            ``decoder_vocab`` and ``image_size`` are overwritten here.
    """
    print('[INFO] Preparing data for experiment: {}'.format(
        config['experiment']))

    (SRC, TRG, train_data, image_train_data, valid_data, test_data,
     encoder_embeddings_matrix,
     decoder_embeddings_matrix) = data_utils.read_data()

    x_train, y_train = train_data.src, train_data.trg
    x_val, y_val = valid_data.src, valid_data.trg
    # The test split is read but not used by this function.
    x_test, y_test = test_data.src, test_data.trg

    # Vocabulary sizes follow the loaded vocabularies.
    config['encoder_vocab'] = len(SRC.vocab)
    config['decoder_vocab'] = len(TRG.vocab)
    config['image_size'] = 32

    model = VarSeq2SeqDetAttnModel(
        config,
        encoder_embeddings_matrix,
        decoder_embeddings_matrix,
        input_word_index=SRC.vocab,
        output_word_index=TRG.vocab,
    )
    # NOTE(review): y_val is passed twice; confirm against model.train's
    # signature.
    model.train(x_train, image_train_data, y_train, x_val, y_val, y_val)
def tenfold():
    """Shuffle the training file and write it out as ten fold files.

    Folds 0-8 each hold ``int(0.1 * size)`` lines; fold 9 takes whatever
    is left. Files are named ``train_<i>.txt`` inside the ``10fold``
    directory.
    """
    base_path = '../input/processing/'
    train_file = base_path + 'train.txt'
    save_path = base_path + '10fold'
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    train = read_data(train_file)
    fold_size = int(0.1 * len(train))
    random.shuffle(train)

    for fold in range(10):
        start = fold * fold_size
        # The last fold is open-ended so that no line is dropped.
        stop = start + fold_size if fold < 9 else None
        fold_path = "{}/train_{}.txt".format(save_path, fold)
        save_data(fold_path, train[start:stop])
def five_fold():
    """Write the Spanish training file out as five fold files, unshuffled.

    Folds 0-3 each hold ``int(0.2 * size)`` lines; fold 4 takes whatever
    is left. Files are named ``train_<i>.txt`` inside the ``5fold``
    directory.
    """
    base_path = '../input/processing/'
    train_file = config.spanish_train_path
    save_path = base_path + '5fold'
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    train = read_data(train_file)
    fold_size = int(0.2 * len(train))

    for fold in range(5):
        start = fold * fold_size
        # The last fold is open-ended so that no line is dropped.
        stop = start + fold_size if fold < 4 else None
        fold_path = "{}/train_{}.txt".format(save_path, fold)
        save_data(fold_path, train[start:stop])
def build_english_vocab(min_freq=3):
    """Build the English word vocabulary from the processed training file.

    Words are counted over every column after the first, ordered by
    descending frequency, and written one per line to
    ``../input/english_word_vocabs.txt``. Words seen fewer than
    ``min_freq`` times are dropped.

    Args:
        min_freq: minimum number of occurrences for a word to be kept.
    """
    print('build english word vocab')
    basepath = '../input/processing/'
    train_data = basepath + 'english_train.txt'

    words = defaultdict(int)
    for line in read_data(train_data):
        for seq in line.split('\t')[1:]:
            for w in re.split(r' +', seq.strip()):
                words[w] += 1

    # Descending order by count.
    words_sorted_count = sorted(words.items(), key=lambda x: -x[1])
    words_dict_list = [w for w, count in words_sorted_count
                       if count >= min_freq]
    with open('../input/english_word_vocabs.txt', 'wt', encoding='utf-8') as f:
        for word in words_dict_list:
            f.write(word + '\n')
    # The original message said "spanish", copied from the Spanish builder.
    print('build english vocab done!')
from model.model import NERModel
from utils.data_utils import (Batch, convert_dataset, create_vocab, read_data, save_vocab, segment_vocab, update_tag_scheme, add_external_words)
from utils.evaluate import evaluate
from utils.train_utils import get_config_proto

# Script entry point: prepares the vocabularies and the id-converted
# training data for the NER model.
# NOTE(review): `os` is used below but is not imported in the visible
# lines; it is presumably imported earlier in the file. Confirm.
if __name__ == "__main__":
    DATA_DIR = "/data/xueyou/fashion/data/"
    checkpoint_dir = '/data/xueyou/ner/category_ner_lstm_dim128_0208/'
    # os.mkdir creates only the last path component, so the parent
    # directory must already exist.
    if not os.path.exists(checkpoint_dir):
        os.mkdir(checkpoint_dir)
    # Read the training data.
    train_files = [os.path.join(DATA_DIR,"category.ner.ac.train.txt")]
    train_data = read_data(train_files,lower=True)
    # Convert the tags to the IOBES scheme.
    update_tag_scheme(train_data)
    # Create the vocabularies from the training data.
    word_vocab,tag_vocab = create_vocab(train_data, lower_case=True, min_cnt = 2)
    # This rebinds the imported function name `segment_vocab` to its
    # return value, so the function cannot be called again after this line.
    segment_vocab = segment_vocab()
    # Save the vocabularies into the checkpoint directory.
    save_vocab(word_vocab, os.path.join(checkpoint_dir,"word.vocab"))
    save_vocab(tag_vocab,os.path.join(checkpoint_dir,"tag.vocab"))
    save_vocab(segment_vocab,os.path.join(checkpoint_dir,"seg.vocab"))
    # Convert the words, tags and segments into ids.
    train_data = convert_dataset(train_data, word_vocab, tag_vocab, segment_vocab)
epoch_num = 300 for e in range(1, epoch_num + 1): mse, w_g, b_g = grad_step(w, b) w = w - (lr_w * w_g) b = b - (lr_b * b_g) if not e % 10: print(f"epoch: {e}, mse:{mse:.2f}, w: {w:.2f}, b: {b:.2f}") print(f"w: {w:.2f}, b: {b:.2f}") plt.scatter(df_data["cp"], df_data["cp_new"], s=20, c="green", alpha=0.5) line_x = np.linspace(0, 630) line_y = np.array(b + w * line_x) plt.plot(line_x, line_y.T, color='red') plt.text(200, 1080, f"y={b:.2f}+{w:.2f}*x", rotation=30, fontsize=14, fontstyle="italic") plt.xlabel('cp') plt.ylabel('cp_new') plt.title('Gradient Descent Result') plt.show() if __name__ == '__main__': df_data = data_utils.read_data("pokemon") gradient_descent_method(df_data)
def _clean_labeled_pairs(lines, clean_text, nlp_client):
    """Apply ``clean_text`` to both sentences of each ``label<TAB>s1<TAB>s2`` line."""
    processed = []
    for line in tqdm(lines):
        fields = line.strip().split('\t')
        # Show the offending fields in the assertion message; the original
        # passed ``print(line)``, which made the message ``None``.
        assert len(fields) == 3, fields
        fields[1] = clean_text(fields[1], nlp_client)
        fields[2] = clean_text(fields[2], nlp_client)
        processed.append("{}\t{}\t{}\n".format(
            fields[0], fields[1], fields[2]))
    return processed


def processing_data_2_step():
    """Run text preprocessing over the split English, Spanish and test files.

    Reads ``english.txt``, ``spanish.txt`` and ``test_b_no_process.txt``
    from the processing directory, cleans both sentences of every line
    with the language-specific routine, and writes ``english_train.txt``,
    ``spanish_train.txt`` and ``test_b.txt``. The test file is processed
    with the Spanish routine.

    Raises:
        AssertionError: if a line does not split into exactly three fields.
    """
    process_base_path = '../input/processing/'
    english_file = process_base_path + 'english.txt'
    spanish_file = process_base_path + 'spanish.txt'
    test_file = process_base_path + 'test_b_no_process.txt'
    english_processing_file = process_base_path + 'english_train.txt'
    spanish_processing_file = process_base_path + 'spanish_train.txt'
    test_processing_file = process_base_path + 'test_b.txt'

    english_core_nlp = StanfordCoreNLP(core_nlp_path, lang='en')
    try:
        with StanfordCoreNLP(core_nlp_path, lang='es') as client:
            englishs = read_data(english_file)
            spanishs = read_data(spanish_file)

            # English
            save_data(
                english_processing_file,
                _clean_labeled_pairs(
                    englishs, text_processing_english, english_core_nlp))

            # Spanish
            save_data(
                spanish_processing_file,
                _clean_labeled_pairs(
                    spanishs, text_processing_spanish, client))

            # Test data
            test = read_data(test_file)
            save_data(
                test_processing_file,
                _clean_labeled_pairs(test, text_processing_spanish, client))
    finally:
        # Close the English client even when a step above fails; the
        # original closed it only on the success path.
        english_core_nlp.close()
    print('Done')
def test(exp_settings):
    """Score the test set with the model and write out the ranked lists.

    Loads and pads the test data, creates the model, runs every batch
    through ``model.step``, logs the merged summary, and passes the
    collected scores to ``output_ranklist``.

    Args:
        exp_settings: experiment settings mapping; ``max_candidate_num``
            is overwritten with the test set's rank list size.
    """
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Load test data.
        print("Reading data in %s" % FLAGS.data_dir)
        test_set = read_data(FLAGS.data_dir, FLAGS.test_data_prefix,
                             FLAGS.max_list_cutoff)
        # Preprocessing uses the training feed class and its
        # hyper-parameters, as in the original.
        find_class(exp_settings['train_input_feed']).preprocess_data(
            test_set, exp_settings['train_input_hparams'], exp_settings)
        exp_settings['max_candidate_num'] = test_set.rank_list_size
        test_set.pad(exp_settings['max_candidate_num'])

        # Create model and load parameters.
        model = create_model(sess, exp_settings, test_set, True)

        # Create input feed.
        test_input_feed = find_class(exp_settings['test_input_feed'])(
            model, FLAGS.batch_size, exp_settings['test_input_hparams'], sess)

        test_writer = tf.summary.FileWriter(FLAGS.model_dir + '/test_log')
        try:
            rerank_scores = []
            summary_list = []
            batch_size_list = []

            # Start testing; `it` is the index of the next list to score.
            it = 0
            while it < len(test_set.initial_list):
                input_feed, info_map = test_input_feed.get_next_batch(
                    it, test_set, check_validation=False)
                _, output_logits, summary = model.step(sess, input_feed, True)
                summary_list.append(summary)
                batch_len = len(info_map['input_list'])
                batch_size_list.append(batch_len)
                # Keep only as many score rows as the batch has lists.
                rerank_scores.extend(
                    output_logits[x] for x in range(batch_len))
                it += batch_len
                print("Testing {:.0%} finished".format(
                    float(it) / len(test_set.initial_list)),
                    end="\r", flush=True)
            print("\n[Done]")

            test_summary = merge_TFSummary(summary_list, batch_size_list)
            test_writer.add_summary(test_summary, it)
        finally:
            # Closing flushes pending events to disk; the original never
            # closed the writer.
            test_writer.close()

        cprint(
            "[Eval]: %s" % (' '.join([
                '%s: %.3f' % (x.tag, x.simple_value)
                for x in test_summary.value
            ])), 'green')

        # The original also built sorted index lists here that nothing
        # read; that dead computation is dropped.
        if not os.path.exists(FLAGS.output_dir):
            os.makedirs(FLAGS.output_dir)
        output_ranklist(test_set, rerank_scores, FLAGS.output_dir,
                        FLAGS.test_data_prefix)
    return
def train(exp_settings): # Prepare data. print("Reading data in %s" % FLAGS.data_dir) train_set = read_data(FLAGS.data_dir, FLAGS.train_data_prefix, FLAGS.max_list_cutoff) # cprint(train_set, 'green') # <utils.data_utils.Raw_data object at 0x7f9347482d00> find_class(exp_settings['train_input_feed']).preprocess_data( train_set, exp_settings['train_input_hparams'], exp_settings) valid_set = read_data(FLAGS.data_dir, FLAGS.valid_data_prefix, FLAGS.max_list_cutoff) find_class(exp_settings['train_input_feed']).preprocess_data( valid_set, exp_settings['train_input_hparams'], exp_settings) print("Train Rank list size %d" % train_set.rank_list_size) # 9 print("Valid Rank list size %d" % valid_set.rank_list_size) # 9 exp_settings['max_candidate_num'] = max(train_set.rank_list_size, valid_set.rank_list_size) test_set = None if FLAGS.test_while_train: test_set = read_data(FLAGS.data_dir, FLAGS.test_data_prefix, FLAGS.max_list_cutoff) find_class(exp_settings['train_input_feed']).preprocess_data( test_set, exp_settings['train_input_hparams'], exp_settings) print("Test Rank list size %d" % test_set.rank_list_size) exp_settings['max_candidate_num'] = max( test_set.rank_list_size, exp_settings['max_candidate_num']) test_set.pad(exp_settings['max_candidate_num']) if 'selection_bias_cutoff' not in exp_settings: # check if there is a limit on the number of items per training query. exp_settings[ 'selection_bias_cutoff'] = FLAGS.selection_bias_cutoff if FLAGS.selection_bias_cutoff > 0 else exp_settings[ 'max_candidate_num'] exp_settings['selection_bias_cutoff'] = min( exp_settings['selection_bias_cutoff'], exp_settings['max_candidate_num']) print( 'Users can only see the top %d documents for each query in training.' 
% exp_settings['selection_bias_cutoff']) # Pad data train_set.pad(exp_settings['max_candidate_num']) valid_set.pad(exp_settings['max_candidate_num']) config = tf.ConfigProto() config.gpu_options.allow_growth = True with tf.Session(config=config) as sess: # tf.get_variable_scope().reuse_variables() # zcr --> useless for the error `ValueError: Variable dnn_W_0 does not exist, or was not created with tf.get_variable(). Did you mean to set reuse=tf.AUTO_REUSE in VarScope?` # Create model based on the input layer. print("Creating model...") model = create_model(sess, exp_settings, train_set, False) #model.print_info() # Create data feed train_input_feed = find_class(exp_settings['train_input_feed'])( model, FLAGS.batch_size, exp_settings['train_input_hparams'], sess) valid_input_feed = find_class(exp_settings['valid_input_feed'])( model, FLAGS.batch_size, exp_settings['valid_input_hparams'], sess) test_input_feed = None if FLAGS.test_while_train: test_input_feed = find_class(exp_settings['test_input_feed'])( model, FLAGS.batch_size, exp_settings['test_input_hparams'], sess) # Create tensorboard summarizations. train_writer = tf.summary.FileWriter( os.path.join(FLAGS.model_dir, 'train_log'), sess.graph) valid_writer = tf.summary.FileWriter( os.path.join(FLAGS.model_dir, 'valid_log')) test_writer = None if FLAGS.test_while_train: test_writer = tf.summary.FileWriter( os.path.join(FLAGS.model_dir, 'test_log')) # This is the training loop. step_time, loss = 0.0, 0.0 current_step = 0 previous_losses = [] best_perf = None while True: # Get a batch and make a step. 
start_time = time.time() input_feed, info_map = train_input_feed.get_batch( train_set, check_validation=True) # cprint('input_feed: {}'.format(input_feed), 'green') # cprint('info_map: {}'.format(info_map), 'green') # cprint('len(info_map[rank_list_idxs]): {}'.format(len(info_map['rank_list_idxs'])), 'green') # 256 # cprint('len(info_map[input_list]): {}'.format(len(info_map['input_list'])), 'green') # 256 # cprint('len(info_map[click_list]): {}'.format(len(info_map['click_list'])), 'green') # 256 # cprint('len(info_map[letor_features]): {}'.format(len(info_map['letor_features'])), 'green') # 1479 cprint( 'info_map[rank_list_idxs]: {}'.format( info_map['rank_list_idxs']), 'green') ''' [12, 7, 17, 0, 0, 13, 9, 8, 10, 4, 18, 8, 10, 6, 5, 15, 14, 10, 6, 3, 16, 1, 10, 0, 18, 1, 19, 15, 3, 2, 18, 7, 6, 8, 13, 4, 11, 11, 5, 2, 10, 1, 19, 2, 14, 6, 18, 14, 9, 1, 5, 11, 19, 4, 6, 12, 15, 11, 19, 9, 15, 3, 4, 16, 6, 6, 7, 0, 10, 17, 4, 14, 8, 14, 10, 8, 13, 6, 14, 17, 4, 1, 6, 1, 7, 0, 15, 3, 14, 4, 6, 6, 17, 19, 7, 3, 7, 7, 14, 18, 0, 16, 14, 16, 10, 9, 15, 6, 0, 12, 17, 9, 4, 2, 16, 17, 10, 16, 4, 2, 12, 12, 13, 14, 4, 17, 6, 9, 1, 3, 12, 19, 17, 10, 3, 4, 15, 19, 17, 0, 5, 10, 19, 8, 7, 4, 17, 17, 0, 12, 14, 7, 9, 0, 6, 10, 12, 15, 2, 5, 19, 7, 19, 16, 6, 2, 11, 1, 17, 3, 1, 10, 9, 0, 16, 12, 17, 19, 12, 1, 1, 18, 3, 19, 12, 13, 16, 4, 1, 2, 19, 15, 3, 12, 2, 12, 9, 18, 5, 13, 13, 2, 4, 10, 6, 4, 4, 9, 0, 0, 6, 15, 1, 11, 1, 15, 19, 8, 19, 3, 9, 1, 19, 4, 14, 18, 13, 0, 8, 11, 6, 17, 1, 18, 16, 14, 14, 14, 4, 13, 13, 4, 3, 8, 6, 14, 3, 19, 2, 2, 19, 0, 2, 9, 18, 0] ''' cprint('info_map[input_list]: {}'.format(info_map['input_list']), 'green') ''' 是一个list的list,内部list中的item个数为9 [[0, 1, 2, 3, 4, 5, 6, 1487, 1487], [7, 8, 9, 10, 11, 12, 13, 1487, 1487], [14, 15, 16, 17, 18, 19, 20, 1487, 1487], [21, 22, 23, 24, 1487, 1487, 1487, 1487, 1487], [25, 26, 27, 28, 1487, 1487, 1487, 1487, 1487], [29, 30, 31, 32, 33, 34, 1487, 1487, 1487], [35, 36, 37, 1487, 1487, 1487, 1487, 1487, 
1487], [38, 39, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [40, 41, 42, 43, 44, 45, 46, 47, 48], [49, 50, 51, 52, 53, 1487, 1487, 1487, 1487], [54, 55, 56, 1487, 1487, 1487, 1487, 1487, 1487], [57, 58, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [59, 60, 61, 62, 63, 64, 65, 66, 67], [68, 69, 70, 71, 72, 73, 74, 1487, 1487], [75, 76, 77, 78, 1487, 1487, 1487, 1487, 1487], [79, 80, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [81, 82, 83, 84, 85, 86, 87, 88, 89], [90, 91, 92, 93, 94, 95, 96, 97, 98], [99, 100, 101, 102, 103, 104, 105, 1487, 1487], [106, 107, 108, 1487, 1487, 1487, 1487, 1487, 1487], [109, 110, 111, 112, 113, 114, 115, 116, 1487], [117, 118, 119, 120, 1487, 1487, 1487, 1487, 1487], [121, 122, 123, 124, 125, 126, 127, 128, 129], [130, 131, 132, 133, 1487, 1487, 1487, 1487, 1487], [134, 135, 136, 1487, 1487, 1487, 1487, 1487, 1487], [137, 138, 139, 140, 1487, 1487, 1487, 1487, 1487], [141, 142, 143, 144, 145, 146, 147, 148, 149], [150, 151, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [152, 153, 154, 1487, 1487, 1487, 1487, 1487, 1487], [155, 156, 157, 158, 159, 160, 161, 162, 163], [164, 165, 166, 1487, 1487, 1487, 1487, 1487, 1487], [167, 168, 169, 170, 171, 172, 173, 1487, 1487], [174, 175, 176, 177, 178, 179, 180, 1487, 1487], [181, 182, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [183, 184, 185, 186, 187, 188, 1487, 1487, 1487], [189, 190, 191, 192, 193, 1487, 1487, 1487, 1487], [194, 195, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [196, 197, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [198, 199, 200, 201, 1487, 1487, 1487, 1487, 1487], [202, 203, 204, 205, 206, 207, 208, 209, 210], [211, 212, 213, 214, 215, 216, 217, 218, 219], [220, 221, 222, 223, 1487, 1487, 1487, 1487, 1487], [224, 225, 226, 227, 228, 229, 230, 231, 232], [233, 234, 235, 236, 237, 238, 239, 240, 241], [242, 243, 244, 245, 246, 247, 248, 249, 250], [251, 252, 253, 254, 255, 256, 257, 1487, 1487], [258, 259, 260, 1487, 1487, 1487, 1487, 1487, 1487], [261, 262, 263, 264, 265, 266, 267, 
268, 269], [270, 271, 272, 1487, 1487, 1487, 1487, 1487, 1487], [273, 274, 275, 276, 1487, 1487, 1487, 1487, 1487], [277, 278, 279, 280, 1487, 1487, 1487, 1487, 1487], [281, 282, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [283, 284, 285, 286, 287, 288, 289, 290, 291], [292, 293, 294, 295, 296, 1487, 1487, 1487, 1487], [297, 298, 299, 300, 301, 302, 303, 1487, 1487], [304, 305, 306, 307, 308, 309, 310, 1487, 1487], [311, 312, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [313, 314, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [315, 316, 317, 318, 319, 320, 321, 322, 323], [324, 325, 326, 1487, 1487, 1487, 1487, 1487, 1487], [327, 328, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [329, 330, 331, 1487, 1487, 1487, 1487, 1487, 1487], [332, 333, 334, 335, 336, 1487, 1487, 1487, 1487], [337, 338, 339, 340, 341, 342, 343, 344, 1487], [345, 346, 347, 348, 349, 350, 351, 1487, 1487], [352, 353, 354, 355, 356, 357, 358, 1487, 1487], [359, 360, 361, 362, 363, 364, 365, 1487, 1487], [366, 367, 368, 369, 1487, 1487, 1487, 1487, 1487], [370, 371, 372, 373, 374, 375, 376, 377, 378], [379, 380, 381, 382, 383, 384, 385, 1487, 1487], [386, 387, 388, 389, 390, 1487, 1487, 1487, 1487], [391, 392, 393, 394, 395, 396, 397, 398, 399], [400, 401, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [402, 403, 404, 405, 406, 407, 408, 409, 410], [411, 412, 413, 414, 415, 416, 417, 418, 419], [420, 421, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [422, 423, 424, 425, 426, 427, 1487, 1487, 1487], [428, 429, 430, 431, 432, 433, 434, 1487, 1487], [435, 436, 437, 438, 439, 440, 441, 442, 443], [444, 445, 446, 447, 448, 449, 450, 1487, 1487], [451, 452, 453, 454, 455, 1487, 1487, 1487, 1487], [456, 457, 458, 459, 1487, 1487, 1487, 1487, 1487], [460, 461, 462, 463, 464, 465, 466, 1487, 1487], [467, 468, 469, 470, 1487, 1487, 1487, 1487, 1487], [471, 472, 473, 474, 475, 476, 477, 1487, 1487], [478, 479, 480, 481, 1487, 1487, 1487, 1487, 1487], [482, 483, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [484, 485, 486, 
1487, 1487, 1487, 1487, 1487, 1487], [487, 488, 489, 490, 491, 492, 493, 494, 495], [496, 497, 498, 499, 500, 1487, 1487, 1487, 1487], [501, 502, 503, 504, 505, 506, 507, 1487, 1487], [508, 509, 510, 511, 512, 513, 514, 1487, 1487], [515, 516, 517, 518, 519, 520, 521, 1487, 1487], [522, 523, 524, 525, 526, 527, 528, 529, 530], [531, 532, 533, 534, 535, 536, 537, 1487, 1487], [538, 539, 540, 1487, 1487, 1487, 1487, 1487, 1487], [541, 542, 543, 544, 545, 546, 547, 1487, 1487], [548, 549, 550, 551, 552, 553, 554, 1487, 1487], [555, 556, 557, 558, 559, 560, 561, 562, 563], [564, 565, 566, 1487, 1487, 1487, 1487, 1487, 1487], [567, 568, 569, 570, 1487, 1487, 1487, 1487, 1487], [571, 572, 573, 574, 575, 576, 577, 578, 1487], [579, 580, 581, 582, 583, 584, 585, 586, 587], [588, 589, 590, 591, 592, 593, 594, 595, 1487], [596, 597, 598, 599, 600, 601, 602, 603, 604], [605, 606, 607, 1487, 1487, 1487, 1487, 1487, 1487], [608, 609, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [610, 611, 612, 613, 614, 615, 616, 1487, 1487], [617, 618, 619, 620, 1487, 1487, 1487, 1487, 1487], [621, 622, 623, 624, 625, 626, 627, 1487, 1487], [628, 629, 630, 631, 632, 633, 634, 1487, 1487], [635, 636, 637, 1487, 1487, 1487, 1487, 1487, 1487], [638, 639, 640, 641, 642, 1487, 1487, 1487, 1487], [643, 644, 645, 646, 647, 648, 649, 650, 651], [652, 653, 654, 655, 656, 657, 658, 659, 1487], [660, 661, 662, 663, 664, 665, 666, 1487, 1487], [667, 668, 669, 670, 671, 672, 673, 674, 675], [676, 677, 678, 679, 680, 681, 682, 683, 1487], [684, 685, 686, 687, 688, 1487, 1487, 1487, 1487], [689, 690, 691, 692, 693, 694, 695, 696, 697], [698, 699, 700, 701, 702, 703, 704, 1487, 1487], [705, 706, 707, 708, 709, 710, 711, 1487, 1487], [712, 713, 714, 715, 716, 717, 1487, 1487, 1487], [718, 719, 720, 721, 722, 723, 724, 725, 726], [727, 728, 729, 730, 731, 1487, 1487, 1487, 1487], [732, 733, 734, 735, 736, 737, 738, 1487, 1487], [739, 740, 741, 742, 743, 744, 745, 1487, 1487], [746, 747, 748, 1487, 1487, 1487, 
1487, 1487, 1487], [749, 750, 751, 752, 1487, 1487, 1487, 1487, 1487], [753, 754, 755, 1487, 1487, 1487, 1487, 1487, 1487], [756, 757, 758, 759, 760, 761, 762, 1487, 1487], [763, 764, 765, 766, 767, 768, 769, 770, 771], [772, 773, 774, 775, 776, 777, 778, 1487, 1487], [779, 780, 781, 782, 783, 784, 785, 786, 787], [788, 789, 790, 1487, 1487, 1487, 1487, 1487, 1487], [791, 792, 793, 794, 795, 1487, 1487, 1487, 1487], [796, 797, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [798, 799, 800, 801, 802, 803, 804, 805, 806], [807, 808, 809, 810, 811, 812, 813, 1487, 1487], [814, 815, 816, 817, 1487, 1487, 1487, 1487, 1487], [818, 819, 820, 821, 1487, 1487, 1487, 1487, 1487], [822, 823, 824, 825, 826, 827, 828, 829, 830], [831, 832, 833, 834, 835, 836, 837, 838, 839], [840, 841, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [842, 843, 844, 845, 846, 847, 848, 1487, 1487], [849, 850, 851, 852, 853, 1487, 1487, 1487, 1487], [854, 855, 856, 857, 858, 859, 860, 1487, 1487], [861, 862, 863, 864, 865, 866, 867, 1487, 1487], [868, 869, 870, 871, 1487, 1487, 1487, 1487, 1487], [872, 873, 874, 875, 876, 877, 878, 1487, 1487], [879, 880, 881, 882, 883, 884, 885, 886, 887], [888, 889, 890, 891, 892, 893, 894, 1487, 1487], [895, 896, 897, 1487, 1487, 1487, 1487, 1487, 1487], [898, 899, 900, 901, 1487, 1487, 1487, 1487, 1487], [902, 903, 904, 905, 906, 907, 908, 1487, 1487], [909, 910, 911, 912, 913, 914, 915, 916, 917], [918, 919, 920, 921, 922, 923, 924, 1487, 1487], [925, 926, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [927, 928, 929, 930, 931, 932, 933, 934, 935], [936, 937, 938, 939, 1487, 1487, 1487, 1487, 1487], [940, 941, 942, 943, 944, 945, 946, 947, 948], [949, 950, 951, 952, 953, 954, 955, 1487, 1487], [956, 957, 958, 959, 960, 961, 962, 963, 964], [965, 966, 967, 968, 969, 970, 971, 972, 1487], [973, 974, 975, 976, 977, 978, 979, 1487, 1487], [980, 981, 982, 983, 984, 985, 986, 987, 988], [989, 990, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [991, 992, 993, 994, 1487, 1487, 1487, 
1487, 1487], [995, 996, 997, 998, 999, 1000, 1001, 1487, 1487], [1002, 1003, 1004, 1487, 1487, 1487, 1487, 1487, 1487], [1005, 1006, 1007, 1008, 1487, 1487, 1487, 1487, 1487], [1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017], [1018, 1019, 1020, 1487, 1487, 1487, 1487, 1487, 1487], [1021, 1022, 1023, 1024, 1487, 1487, 1487, 1487, 1487], [1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1487], [1033, 1034, 1035, 1036, 1037, 1038, 1039, 1487, 1487], [1040, 1041, 1042, 1043, 1044, 1045, 1046, 1487, 1487], [1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055], [1056, 1057, 1058, 1059, 1060, 1061, 1062, 1487, 1487], [1063, 1064, 1065, 1066, 1487, 1487, 1487, 1487, 1487], [1067, 1068, 1069, 1070, 1487, 1487, 1487, 1487, 1487], [1071, 1072, 1073, 1487, 1487, 1487, 1487, 1487, 1487], [1074, 1075, 1076, 1487, 1487, 1487, 1487, 1487, 1487], [1077, 1078, 1079, 1080, 1081, 1082, 1083, 1084, 1085], [1086, 1087, 1088, 1089, 1090, 1091, 1092, 1487, 1487], [1093, 1094, 1095, 1096, 1097, 1098, 1487, 1487, 1487], [1099, 1100, 1101, 1102, 1103, 1104, 1105, 1106, 1487], [1107, 1108, 1109, 1110, 1111, 1487, 1487, 1487, 1487], [1112, 1113, 1114, 1115, 1487, 1487, 1487, 1487, 1487], [1116, 1117, 1118, 1119, 1120, 1121, 1122, 1123, 1124], [1125, 1126, 1127, 1128, 1129, 1130, 1131, 1132, 1133], [1134, 1135, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [1136, 1137, 1138, 1487, 1487, 1487, 1487, 1487, 1487], [1139, 1140, 1141, 1142, 1143, 1144, 1145, 1487, 1487], [1146, 1147, 1148, 1149, 1150, 1151, 1152, 1153, 1154], [1155, 1156, 1157, 1158, 1159, 1160, 1161, 1487, 1487], [1162, 1163, 1164, 1487, 1487, 1487, 1487, 1487, 1487], [1165, 1166, 1167, 1487, 1487, 1487, 1487, 1487, 1487], [1168, 1169, 1170, 1171, 1487, 1487, 1487, 1487, 1487], [1172, 1173, 1174, 1175, 1176, 1177, 1487, 1487, 1487], [1178, 1179, 1180, 1181, 1182, 1183, 1487, 1487, 1487], [1184, 1185, 1186, 1187, 1188, 1189, 1190, 1191, 1192], [1193, 1194, 1195, 1196, 1197, 1487, 1487, 1487, 1487], [1198, 1199, 1200, 1201, 1202, 
1203, 1204, 1205, 1206], [1207, 1208, 1209, 1210, 1211, 1212, 1213, 1487, 1487], [1214, 1215, 1216, 1217, 1218, 1487, 1487, 1487, 1487], [1219, 1220, 1221, 1222, 1223, 1487, 1487, 1487, 1487], [1224, 1225, 1226, 1487, 1487, 1487, 1487, 1487, 1487], [1227, 1228, 1229, 1230, 1487, 1487, 1487, 1487, 1487], [1231, 1232, 1233, 1234, 1487, 1487, 1487, 1487, 1487], [1235, 1236, 1237, 1238, 1239, 1240, 1241, 1487, 1487], [1242, 1243, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [1244, 1245, 1246, 1247, 1487, 1487, 1487, 1487, 1487], [1248, 1249, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [1250, 1251, 1252, 1253, 1487, 1487, 1487, 1487, 1487], [1254, 1255, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [1256, 1257, 1258, 1259, 1260, 1261, 1262, 1263, 1264], [1265, 1266, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [1267, 1268, 1269, 1270, 1271, 1272, 1273, 1274, 1275], [1276, 1277, 1278, 1487, 1487, 1487, 1487, 1487, 1487], [1279, 1280, 1281, 1487, 1487, 1487, 1487, 1487, 1487], [1282, 1283, 1284, 1285, 1487, 1487, 1487, 1487, 1487], [1286, 1287, 1288, 1289, 1290, 1291, 1292, 1293, 1294], [1295, 1296, 1297, 1298, 1299, 1487, 1487, 1487, 1487], [1300, 1301, 1302, 1303, 1304, 1305, 1306, 1307, 1308], [1309, 1310, 1311, 1487, 1487, 1487, 1487, 1487, 1487], [1312, 1313, 1314, 1315, 1316, 1317, 1487, 1487, 1487], [1318, 1319, 1320, 1321, 1487, 1487, 1487, 1487, 1487], [1322, 1323, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [1324, 1325, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [1326, 1327, 1328, 1329, 1330, 1331, 1332, 1487, 1487], [1333, 1334, 1335, 1336, 1337, 1338, 1339, 1487, 1487], [1340, 1341, 1342, 1343, 1487, 1487, 1487, 1487, 1487], [1344, 1345, 1346, 1487, 1487, 1487, 1487, 1487, 1487], [1347, 1348, 1349, 1350, 1351, 1352, 1353, 1354, 1487], [1355, 1356, 1357, 1358, 1359, 1360, 1361, 1362, 1363], [1364, 1365, 1366, 1367, 1368, 1369, 1370, 1371, 1372], [1373, 1374, 1375, 1376, 1377, 1378, 1379, 1380, 1381], [1382, 1383, 1384, 1385, 1386, 1487, 1487, 1487, 1487], [1387, 1388, 
1389, 1390, 1391, 1392, 1487, 1487, 1487], [1393, 1394, 1395, 1396, 1397, 1398, 1487, 1487, 1487], [1399, 1400, 1401, 1402, 1403, 1487, 1487, 1487, 1487], [1404, 1405, 1406, 1487, 1487, 1487, 1487, 1487, 1487], [1407, 1408, 1487, 1487, 1487, 1487, 1487, 1487, 1487], [1409, 1410, 1411, 1412, 1413, 1414, 1415, 1487, 1487], [1416, 1417, 1418, 1419, 1420, 1421, 1422, 1423, 1424], [1425, 1426, 1427, 1487, 1487, 1487, 1487, 1487, 1487], [1428, 1429, 1430, 1431, 1432, 1433, 1434, 1435, 1436], [1437, 1438, 1439, 1440, 1441, 1442, 1443, 1444, 1445], [1446, 1447, 1448, 1449, 1450, 1451, 1452, 1453, 1454], [1455, 1456, 1457, 1458, 1459, 1460, 1461, 1462, 1463], [1464, 1465, 1466, 1467, 1487, 1487, 1487, 1487, 1487], [1468, 1469, 1470, 1471, 1472, 1473, 1474, 1475, 1476], [1477, 1478, 1479, 1487, 1487, 1487, 1487, 1487, 1487], [1480, 1481, 1482, 1487, 1487, 1487, 1487, 1487, 1487], [1483, 1484, 1485, 1486, 1487, 1487, 1487, 1487, 1487]] ''' cprint('info_map[click_list]: {}'.format(info_map['click_list']), 'green') ''' 是一个list的list,内部list中的item个数为9, 数值为1的表示点击,为0的表示不点击。 [[0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 1, 0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 1, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 1, 1, 0, 0, 0, 0, 0, 
0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 1, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 1, 1, 0, 0, 0, 0, 0], [0, 1, 0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 1], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 1, 1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 
0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 1, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0], [1, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 1, 1, 0, 0, 0, 0], [1, 0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 1, 0], [1, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 
0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 1, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 1, 0, 0, 0], [0, 1, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 
0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 1, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0]] ''' exit() step_loss, _, summary = model.step(sess, input_feed, False) step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint loss += step_loss / FLAGS.steps_per_checkpoint current_step += 1 train_writer.add_summary(summary, model.global_step.eval()) # Once in a while, we save checkpoint, print statistics, and run evals. if current_step % FLAGS.steps_per_checkpoint == 0: # Print statistics for the previous epoch. cprint( "global step {} learning rate {:.4f} step-time {:.2f} loss {:.4f}" .format(model.global_step.eval(), model.learning_rate.eval(), step_time, loss), 'green') previous_losses.append(loss) # Validate model def validate_model(data_set, data_input_feed): it = 0 count_batch = 0.0 summary_list = [] batch_size_list = [] while it < len(data_set.initial_list): input_feed, info_map = data_input_feed.get_next_batch( it, data_set, check_validation=False) _, _, summary = model.step(sess, input_feed, True) summary_list.append(summary) batch_size_list.append(len(info_map['input_list'])) it += batch_size_list[-1] count_batch += 1.0 return merge_TFSummary(summary_list, batch_size_list) valid_summary = validate_model(valid_set, valid_input_feed) valid_writer.add_summary(valid_summary, model.global_step.eval()) cprint( "[Valid]: %s" % (' '.join([ '%s: %.3f' % (x.tag, x.simple_value) for x in valid_summary.value ])), 'green') if FLAGS.test_while_train: test_summary = validate_model(test_set, test_input_feed) test_writer.add_summary(test_summary, 
model.global_step.eval()) cprint( "[Test]: %s" % (' '.join([ '%s:%.3f' % (x.tag, x.simple_value) for x in test_summary.value ])), 'green') # Save checkpoint if the objective metric on the validation set is better if "objective_metric" in exp_settings: for x in valid_summary.value: if x.tag == exp_settings["objective_metric"]: if current_step >= FLAGS.start_saving_iteration: if best_perf == None or best_perf < x.simple_value: checkpoint_path = os.path.join( FLAGS.model_dir, "%s.ckpt" % exp_settings['learning_algorithm']) model.saver.save( sess, checkpoint_path, global_step=model.global_step) best_perf = x.simple_value print('Save model, valid %s:%.3f' % (x.tag, best_perf)) break # Save checkpoint if there is no objective metic if best_perf == None and current_step > FLAGS.start_saving_iteration: checkpoint_path = os.path.join( FLAGS.model_dir, "%s.ckpt" % exp_settings['learning_algorithm']) model.saver.save(sess, checkpoint_path, global_step=model.global_step) if loss == float('inf'): break step_time, loss = 0.0, 0.0 sys.stdout.flush() if FLAGS.max_train_iteration > 0 and current_step > FLAGS.max_train_iteration: break
def infer(config, test_bleu=True):
    """Restore a trained PCGN model and decode the configured inference file.

    A ``PCGNModel`` is built in ``'infer'`` mode, the checkpoint under
    ``<workspace>/nn_models/`` is restored, and the inference data is decoded
    in consecutive slices of ``batch_size`` examples.  Only complete batches
    are decoded (``len(infer_dataset) // batch_size`` steps), so a trailing
    remainder smaller than ``batch_size`` is not processed.

    The first decoded candidate of every example is written, one per line, to
    ``<output_path>prediction.txt``.  When ``test_bleu`` is true the source
    blog, reference comment, user description and user features are collected
    next to each prediction, BLEU-2 is computed with ``calc_bleu2``, a record
    is appended to ``<workspace>/nn_models/bleu.txt`` and the collected table
    is saved as ``<output_path>prediction.csv``.

    Args:
        config: Nested mapping.  This function reads ``workspace``, ``Name``,
            ``embeddings`` (``vocab_size``, ``embed_size``), ``inference``
            (``vocab_file``, ``output_path``) and ``training``
            (``gpu_fraction``, ``gpu_id``) directly; the rest is read by
            ``get_pcgn_model_config`` / ``get_pcgn_infer_config``.
        test_bleu: Whether to compute BLEU-2 and write the detailed CSV.

    Raises:
        ValueError: If no checkpoint could be restored.
    """
    work_space = config["workspace"]
    name = config["Name"]

    # Construct or load embeddings
    print("Initializing embeddings ...")
    vocab_size = config["embeddings"]["vocab_size"]
    embed_size = config["embeddings"]["embed_size"]
    vocab_file = config["inference"]["vocab_file"]

    # Build the model
    (
        encode_num_layers,
        encode_num_units,
        encode_cell_type,
        encode_bidir,
        attn_num_units,
        decode_num_layers,
        decode_num_units,
        decode_cell_type,
        use_user_feat,
        use_gate_memory,
        use_user_desc,
        use_blog_user_coattn,
        use_external_desc_express,
        use_external_feat_express,
        user_feat_dim,
        user_feat_unit,
        user_feat_mem_unit,
        desc_rnn_unit,
        desc_attn_num_units,
        user_map_unit,
    ) = get_pcgn_model_config(config)

    (infer_file, batch_size, is_beam_search, beam_size,
     infer_source_max_length, infer_target_max_length,
     infer_desc_max_length, infer_max_iter, output_path,
     gpu_fraction, gpu_id) = get_pcgn_infer_config(config)

    print("Building model architecture ...")
    pcg_model = PCGNModel(
        mode='infer',
        model_name=name,
        vocab_size=vocab_size,
        embedding_size=embed_size,
        encode_num_layers=encode_num_layers,
        encode_num_units=encode_num_units,
        encode_cell_type=encode_cell_type,
        encode_bidir=encode_bidir,
        attn_num_units=attn_num_units,
        decode_num_layers=decode_num_layers,
        decode_num_units=decode_num_units,
        decode_cell_type=decode_cell_type,
        use_user_feat=use_user_feat,
        use_gate_memory=use_gate_memory,
        use_user_desc=use_user_desc,
        use_blog_user_coattn=use_blog_user_coattn,
        use_external_desc_express=use_external_desc_express,
        use_external_feat_express=use_external_feat_express,
        user_feat_dim=user_feat_dim,
        user_feat_unit=user_feat_unit,
        user_feat_mem_unit=user_feat_mem_unit,
        desc_rnn_unit=desc_rnn_unit,
        desc_attn_num_units=desc_attn_num_units,
        user_map_unit=user_map_unit,
        batch_size=batch_size,
        beam_search=is_beam_search,
        beam_size=beam_size,
        infer_max_iter=infer_max_iter,
        target_max_length=infer_target_max_length,
    )
    print("\tDone.")

    logdir = '%s/nn_models/' % work_space

    # Set up session
    # NOTE: the GPU settings returned by get_pcgn_infer_config above are
    # deliberately left as the original code had them: overwritten by the
    # values of the "training" section.
    gpu_fraction = config["training"]["gpu_fraction"]
    gpu_id = config["training"]["gpu_id"]
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction,
                                visible_device_list=gpu_id,
                                allow_growth=True)
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                            gpu_options=gpu_options))
    # The session owns device memory; release it however inference ends.
    try:
        init = tf.global_variables_initializer()
        sess.run(init)

        try:
            saved_global_step = load_model(pcg_model.saver, sess, logdir)
            if saved_global_step is None:
                raise ValueError("Cannot find the checkpoint to restore from.")
        except Exception:
            print("Something went wrong while restoring checkpoint. ")
            raise

        # ##### Inference #####
        # Load data
        print("Loading inference data ...")

        # Load vocabularies.
        vocab_table, reverse_vocab_table = create_vocab_tables(vocab_file)

        infer_dataset = read_data(infer_file)
        print(' # infer data:', len(infer_dataset))
        print("\tDone.")

        # Inference
        print("Start inferring ...")
        # Only whole batches are decoded; see the docstring.
        infer_step = len(infer_dataset) // batch_size
        preds = []
        # Per-batch frames are collected and concatenated once after the loop
        # instead of re-copying a growing DataFrame on every step.
        result_frames = []
        for ith in range(infer_step):
            print('step:', ith)
            start = ith * batch_size
            end = (ith + 1) * batch_size
            batch = get_pcgn_batch(infer_dataset[start:end], 'infer', -1,
                                   infer_source_max_length,
                                   infer_target_max_length,
                                   infer_desc_max_length)
            result = pcg_model.infer(sess, batch)

            # Keep only the first candidate (index 0 on axis 1) of each
            # example; the remaining candidates result[:, k, :] are ignored.
            result1 = batch_token_to_str(result[:, 0, :], reverse_vocab_table)
            preds += list(result1)

            if test_bleu:
                blog = batch_token_to_str(batch[0], reverse_vocab_table)
                cmt = batch_token_to_str(batch[2], reverse_vocab_table)
                desc = batch_token_to_str(batch[6], reverse_vocab_table)
                feat_df = featinds2df(batch[8])

                df_result = pd.DataFrame({
                    'Blog': blog,
                    'Comment': cmt,
                    'Individual_Description': desc,
                    'Prediction': result1,
                })
                df_result = pd.concat([df_result, feat_df], axis=1)
                result_frames.append(df_result)

        final_result = pd.concat(result_frames) if result_frames else pd.DataFrame()

        # Written as UTF-8 explicitly so the text file does not depend on the
        # platform's default encoding (the CSV below is UTF-8 by default).
        out_path = config["inference"]["output_path"] + 'prediction' + '.txt'
        with open(out_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(preds))

        if test_bleu:
            bleu2 = calc_bleu2(final_result['Prediction'].values,
                               final_result['Comment'].values)
            print('test bleu:', bleu2)
            bleurecord = 'test_size:{}\trestore_step:{}\n'.format(
                str(int(infer_step * batch_size)), str(saved_global_step))
            bleurecord += 'bleu2:{}\n\n'.format(str(bleu2[0]))
            with open(logdir + 'bleu.txt', 'a') as f:
                f.write(bleurecord)

            # The detailed table only has content when test_bleu is set.
            out_path = config["inference"]["output_path"] + 'prediction' + '.csv'
            final_result.to_csv(out_path, index=False)
        print("\tDone.")
    finally:
        sess.close()
type=str, help='Path to file containing paths to the data to use.') parser.add_argument('--data_root', default='data/repos', type=str, help='Path root folder containing the cloned repositories.') parser.add_argument('--out_path', default='data', type=str, help='Path to save vocabulary object.') parser.add_argument('--oov_threshold', default=20, type=int, help='Ignore words that appear less than this many times.') args = parser.parse_args() def build_vocab(dataset, oov_threshold): counter = collections.Counter(itertools.chain(itertools.chain(*dataset))) count_pairs = sorted(counter.items(), key=lambda x: -x[1]) count_pairs = (p for p in count_pairs if p[1] > oov_threshold) words, _ = list(zip(*count_pairs)) word_to_id = dict(zip(words, range(2, len(words) + 2))) word_to_id[du.OOV_TOKEN] = du.OOV_IDX word_to_id[du.PAD_TOKEN] = du.PAD_IDX return word_to_id if __name__ == "__main__": data_for_vocab = args.vocab_data data_root = args.data_root data = du.read_data(data_for_vocab, data_root) print("Loaded {} files".format(len(data))) vocab = build_vocab(data, args.oov_threshold) print("Vocabulary size is:", len(vocab)) out_file = args.out_path + "/vocab.p" pickle.dump(vocab, open(out_file, "wb"), protocol=pickle.HIGHEST_PROTOCOL)
def train():
    """Train a query2vec model.

    Prepares the data found under ``FLAGS.train_dir``, creates the model via
    ``model_helper.create_model`` and then loops forever: each iteration picks
    a bucket at random (proportionally to its share of the training data),
    runs one training step and, every ``FLAGS.steps_per_checkpoint`` steps,
    evaluates one batch per non-empty development bucket and possibly saves a
    checkpoint.  The loop has no exit condition in this function.
    """
    # Prepare train data.
    print("Preparing Seq2seq Model in %s" % FLAGS.train_dir)
    train_data, test_data, _ = data_utils.prepare_data(FLAGS.train_dir, FLAGS.vocab_size)
    checkpoint_path = os.path.join(FLAGS.train_dir, FLAGS.seq2seq_model)
    print("Loading training data from %s" % train_data)
    print("Loading development data from %s" % test_data)
    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options, intra_op_parallelism_threads=20)) as sess:
        # Create model.
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        with tf.device("/gpu:0"):
            model = model_helper.create_model(sess, False)

        # Read data into buckets and compute their sizes.
        print("Reading development and training data (limit: %d)." % FLAGS.max_train_data_size)
        test_set = data_utils.read_data(test_data)
        train_set = data_utils.read_data(train_data, max_size=FLAGS.max_train_data_size)
        # NOTE(review): `_buckets` and `xrange` are not defined in this block;
        # presumably module-level names — confirm.
        train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
        train_total_size = float(sum(train_bucket_sizes))

        # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
        # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
        # the size of the i-th training bucket, as used later.
        train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size for i in xrange(len(train_bucket_sizes))]

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        # Best (lowest) eval perplexity seen so far for each bucket.
        prev_loss = [1000000] * len(_buckets)
        train_writer = tf.summary.FileWriter(os.path.join("summary/train"), sess.graph)
        test_writer = tf.summary.FileWriter(os.path.join("summary/test"), sess.graph)
        while True:
            # Choose a bucket according to data distribution. We pick a random number
            # in [0, 1] and use the corresponding interval in train_buckets_scale.
            random_number_01 = np.random.random_sample()
            bucket_id = min([i for i in xrange(len(train_buckets_scale)) if train_buckets_scale[i] > random_number_01])

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                train_set[bucket_id], bucket_id)
            summaries, _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, False)
            # Both accumulators are running averages over one checkpoint interval.
            step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
            loss += step_loss / FLAGS.steps_per_checkpoint
            current_step += 1
            if current_step % FLAGS.steps_per_summary == 0:
                train_writer.add_summary(summaries, current_step)
                train_writer.flush()
                print('Step: %s' % current_step)

            # Once in a while, we save checkpoint, print statistics, and run evals.
            if current_step % FLAGS.steps_per_checkpoint == 0:
                # Print statistics for the previous epoch.
                # Losses of 300 or more are reported as infinite perplexity
                # instead of being exponentiated.
                perplexity = math.exp(loss) if loss < 300 else float('inf')
                print("global step %d learning rate %.4f step-time %.2f perplexity "
                      "%.2f" % (model.global_step.eval(), model.learning_rate.eval(), step_time, perplexity))
                # Decrease learning rate if no improvement was seen over last 3 times.
                if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                    sess.run(model.learning_rate_decay_op)
                previous_losses.append(loss)
                # Zero timer and loss for the next interval (the checkpoint
                # itself is written further down, after the evals).
                step_time, loss = 0.0, 0.0
                # Run evals on development set and print their perplexity.
                # `count` is the number of buckets whose perplexity improved.
                count = 0
                for bucket_id in xrange(len(_buckets)):
                    if len(test_set[bucket_id]) == 0:
                        print(" eval: empty bucket %d" % (bucket_id))
                        continue
                    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                        test_set[bucket_id], bucket_id)
                    summaries, _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True)
                    test_writer.add_summary(summaries, current_step)
                    eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf')
                    if eval_ppx < prev_loss[bucket_id]:
                        prev_loss[bucket_id] = eval_ppx
                        count += 1
                    print(" eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))
                # Save only when more than a third of the buckets improved.
                if count > len(_buckets) / 3:
                    print("saving model...")
                    model.saver.save(sess, checkpoint_path, global_step=model.global_step)
                sys.stdout.flush()
                test_writer.flush()
# Build and persist the vocabularies used by the NER model.
TRAIN_DATA_DIR = '/data/public/NER/ner/'
#external_words_fname = '/data/xueyou/ner/sogou.words.txt'
checkpoint_dir = '/data/xueyou/ner/ner_lstm_dim256_no_external_words_0201/'

# makedirs (rather than mkdir) also creates missing parent directories, so a
# fresh machine without the parent folder does not fail here.
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

#add_external_words(external_words_fname)

# read training data
train_files = [
    os.path.join(DATA_DIR, "example.train"),
    os.path.join(TRAIN_DATA_DIR, "people.199801.tagged.txt"),
    os.path.join(TRAIN_DATA_DIR, "boson_nlp.tagged.txt")
]
train_data = read_data(train_files, lower=True)

# convert tags to iobes
update_tag_scheme(train_data)

# create vocab from training data
word_vocab, tag_vocab = create_vocab(train_data, lower_case=True, min_cnt=2)
# NOTE: this rebinds the name `segment_vocab` from the factory function to its
# result, so the function cannot be called again after this line.
segment_vocab = segment_vocab()

# save vocab
save_vocab(word_vocab, os.path.join(checkpoint_dir, "word.vocab"))
save_vocab(tag_vocab, os.path.join(checkpoint_dir, "tag.vocab"))
save_vocab(segment_vocab, os.path.join(checkpoint_dir, "seg.vocab"))
def main():
    """Train the configured classifier and keep the best-scoring model.

    Reads the train/eval data named by ``Config``, (re)creates the word and
    tag vocabulary files, converts both datasets to ids, and trains for
    ``config.epoch`` epochs.  After each epoch the model is evaluated on the
    training and evaluation batches, and ``model.save_model()`` is called
    whenever the evaluation accuracy exceeds the best value seen so far.

    If ``<checkpoint_dir>/config.pkl`` exists it replaces the freshly built
    config (after the data has already been prepared with the fresh one);
    otherwise the fresh config is pickled to that path.
    """
    parser = argparse.ArgumentParser()
    add_argument(parser)
    # The parsed values are not used below; the call is kept so command-line
    # validation and --help still work.
    parser.parse_args()

    config = Config()
    train_data = read_data(config.train_data_files, config.model)
    # Try down-sampling the negative samples.
    # train_data = sample(train_data)
    eval_data = read_data(config.eval_data_files, config.model)
    # train_data_sen = read_data_sen("data/data_tech.train")
    # eval_data_sen = read_data_sen("data/data_tech.eval")

    # The vocabulary of the pretrained word vectors is used as the model's
    # vocabulary.
    create_vocab_from_pretrained_w2v(config.w2v_path, config.word_vocab_file)
    create_tag_vocab_from_data(train_data, config.tag_vocab_file)

    word2id, id2word = read_vocab(config.word_vocab_file)
    tag2id, id2tag = read_vocab(config.tag_vocab_file)

    # convert word into ids
    train_data = convert_dataset(train_data, word2id, tag2id,
                                 config.sentence_length, config.num_classes,
                                 config.model)
    # train_data_sen = convert_dataset_sen(train_data_sen, word2id, tag2id, config.num_classes, one_hot_label=True)
    print(train_data[0])
    eval_data = convert_dataset(eval_data, word2id, tag2id,
                                config.sentence_length, config.num_classes,
                                config.model)
    # eval_data_sen = convert_dataset_sen(eval_data_sen, word2id, tag2id, config.num_classes, one_hot_label=True)
    print("train_data size: {0}".format(len(train_data)))

    # Reuse a previously saved config if there is one, otherwise save ours.
    # Files are opened via `with` so the handles are closed deterministically.
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # safe as long as checkpoint_dir is trusted.
    config_path = os.path.join(config.checkpoint_dir, "config.pkl")
    if os.path.exists(config_path):
        with open(config_path, 'rb') as config_file:
            config = pickle.load(config_file)
    else:
        with open(config_path, 'wb') as config_file:
            pickle.dump(config, config_file)

    with tf.Session(config=get_config_proto(
            log_device_placement=False)) as sess:
        model = get_model(config.model, config, sess)
        model.build()
        model.init()

        batch_manager = Batch_self_attention(train_data, config.batch_size)
        batch_manager_eval = Batch_self_attention(eval_data, config.batch_size)
        # batch_manager = Batch(train_data, config.batch_size)
        # batch_manager_eval = Batch(eval_data, config.batch_size)

        epoches = config.epoch
        max_acc = 0
        for i in range(epoches):
            for batch in batch_manager.next_batch():
                # Each batch is a sequence of per-example tuples; zip(*batch)
                # turns it into one positional argument per field.
                loss, accuracy, global_step = model.train_one_step(*zip(
                    *batch))
                # key_shape, query_shape = model.test(*zip(*batch))
                # print(key_shape, query_shape)
                # break

            # NOTE(review): evaluation is placed once per epoch here; the
            # nesting was reconstructed from a flattened source — confirm.
            train_accuracy = evaluate(model, batch_manager)
            eval_accuracy = evaluate(model, batch_manager_eval)
            # train_accuracy = evaluate_attention(model, train_data_sen, id2tag)
            # eval_accuracy = evaluate_attention(model, eval_data_sen, id2tag)
            print("epoch - {0} step - {1} loss - {2} train_accuracy - {3} eval_accuracy - {4}"
                  .format(i, global_step, loss, train_accuracy, eval_accuracy))

            # Keep only checkpoints that improve the evaluation accuracy.
            if max_acc < eval_accuracy:
                max_acc = eval_accuracy
                model.save_model()
def main(config):
    """Train a PCGN model according to ``config``.

    Sets up the workspace, reads the training and development data, builds a
    ``PCGNModel`` in ``'train'`` mode, restores the latest checkpoint from
    ``<workspace>/nn_models/`` if there is one and then trains, periodically
    logging train/dev perplexity and saving checkpoints.  On exit (normal,
    Ctrl-C or error) a final checkpoint is written if any step ran after the
    last save.

    Args:
        config: Nested mapping.  This function reads ``workspace``,
            ``tf_board``, ``Name`` and ``embeddings`` (``vocab_size``,
            ``embed_size``) directly; the rest is read by
            ``get_pcgn_model_config`` / ``get_pcgn_training_config``.
    """
    # set up workspace
    work_space = config["workspace"]
    tf_board = config["tf_board"]
    setup_workpath(work_space)
    name = config["Name"]

    # Construct or load embeddings
    print("Initializing embeddings ...")
    vocab_size = config["embeddings"]["vocab_size"]
    embed_size = config["embeddings"]["embed_size"]

    # Build the model and compute losses
    (
        encode_num_layers,
        encode_num_units,
        encode_cell_type,
        encode_bidir,
        attn_num_units,
        decode_num_layers,
        decode_num_units,
        decode_cell_type,
        use_user_feat,
        use_gate_memory,
        use_user_desc,
        use_blog_user_coattn,
        use_external_desc_express,
        use_external_feat_express,
        user_feat_dim,
        user_feat_unit,
        user_feat_mem_unit,
        desc_rnn_unit,
        desc_attn_num_units,
        user_map_unit,
    ) = get_pcgn_model_config(config)

    (
        train_file,
        dev_file,
        source_max_length,
        target_max_length,
        desc_max_length,
        gpu_fraction,
        gpu_id,
        train_steps,
        checkpoint_every,
        print_every,
        batch_size,
        is_beam_search,
        beam_size,
        infer_max_iter,
        l2_regularize,
        learning_rate,
        max_checkpoints,
        max_gradient_norm,
    ) = get_pcgn_training_config(config)

    train_set = read_data(train_file)
    print(' # train data:', len(train_set))
    dev_set = read_data(dev_file)
    print(' # dev data:', len(dev_set))

    print("Building model architecture ")
    pcg_model = PCGNModel(
        mode='train',
        model_name=name,
        vocab_size=vocab_size,
        embedding_size=embed_size,
        encode_num_layers=encode_num_layers,
        encode_num_units=encode_num_units,
        encode_cell_type=encode_cell_type,
        encode_bidir=encode_bidir,
        attn_num_units=attn_num_units,
        decode_num_layers=decode_num_layers,
        decode_num_units=decode_num_units,
        decode_cell_type=decode_cell_type,
        use_user_feat=use_user_feat,
        use_gate_memory=use_gate_memory,
        use_user_desc=use_user_desc,
        use_blog_user_coattn=use_blog_user_coattn,
        use_external_desc_express=use_external_desc_express,
        use_external_feat_express=use_external_feat_express,
        user_feat_dim=user_feat_dim,
        user_feat_unit=user_feat_unit,
        user_feat_mem_unit=user_feat_mem_unit,
        desc_rnn_unit=desc_rnn_unit,
        desc_attn_num_units=desc_attn_num_units,
        user_map_unit=user_map_unit,
        batch_size=batch_size,
        beam_search=is_beam_search,
        beam_size=beam_size,
        infer_max_iter=infer_max_iter,
        target_max_length=target_max_length,
        l2_regularize=l2_regularize,
        learning_rate=learning_rate,
        max_to_keep=max_checkpoints,
        max_gradient_norm=max_gradient_norm,
    )
    print("\tDone.")

    logdir = '%s/nn_models/' % work_space

    # Set up session
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction,
                                visible_device_list=gpu_id,
                                allow_growth=True)
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                            gpu_options=gpu_options))
    init = tf.global_variables_initializer()
    sess.run(init)

    # tensorboard
    # NOTE(review): `use_tensorboard` is not defined in this function;
    # presumably a module-level flag — confirm.
    if use_tensorboard:
        train_writer = tf.summary.FileWriter(tf_board + 'train/', sess.graph)
        test_writer = tf.summary.FileWriter(tf_board + 'test/', sess.graph)

    try:
        saved_global_step = load_model(pcg_model.saver, sess, logdir)
        if saved_global_step is None:
            # No checkpoint yet: start so that the first step is 0.
            saved_global_step = -1
    except Exception:
        print("Something went wrong while restoring checkpoint. "
              "Training is terminated to avoid the overwriting.")
        raise

    # ##### Training #####
    last_saved_step = saved_global_step
    num_steps = saved_global_step + train_steps
    steps = []
    previous_losses = []
    lr = pcg_model.learning_rate
    # Number of most-recent losses compared against the 5 newest ones when
    # deciding whether to decay the learning rate.
    lr_decay_step = 10
    # Guarded copy for the epoch computation so a dataset smaller than one
    # batch does not cause a division by zero.
    steps_per_epoch = max(1, len(train_set) // batch_size)

    print("Start training ...")
    print('steps per epoch:', len(train_set) // batch_size)

    # Pre-bind `step` so the `finally` clause is safe even when the loop
    # below executes zero iterations or is interrupted before its first one.
    step = saved_global_step
    try:
        # NOTE: range(saved + 1, saved + train_steps) runs train_steps - 1
        # iterations; kept as in the original.
        for step in range(saved_global_step + 1, num_steps):
            start_time = time.time()

            batch = get_pcgn_batch(train_set, 'train', batch_size,
                                   source_max_length, target_max_length,
                                   desc_max_length)
            loss_value = pcg_model.train(sess, batch)
            previous_losses.append(loss_value)

            # Every 500 steps: decay if the mean of the last 5 losses is not
            # below the mean of the lr_decay_step losses before them.
            if (step % 500 == 0
                    and len(previous_losses) - 5 > lr_decay_step
                    and np.mean(previous_losses[-5:]) >= np.mean(previous_losses[-lr_decay_step - 5:-5])):
                lr = pcg_model.learning_rate
                if lr > 0.00001:
                    decayed_lr = lr * 0.9
                    pcg_model.learning_rate = decayed_lr
                    print('learning rate decay:', decayed_lr)
                    # Keep the value shown in the progress line in sync with
                    # the rate that was just set.
                    lr = decayed_lr

            duration = (time.time() - start_time)
            if step % print_every == 0 and step != 0:
                # train perplexity
                t_perp = pcg_model.compute_perplexity(sess, batch)
                if use_tensorboard:
                    add_summary(train_writer, step, 'train perplexity', t_perp)

                # eval perplexity
                dev_str = ""
                if dev_set is not None:
                    eval_batch = get_pcgn_batch(dev_set, 'train', batch_size,
                                                source_max_length,
                                                target_max_length,
                                                desc_max_length)
                    eval_perp = pcg_model.compute_perplexity(sess, eval_batch)
                    with open(logdir + 'eval_perp.txt', 'a', encoding='utf-8') as f:
                        f.write('{}\t{}\n'.format(str(step), str(eval_perp)))

                    if use_tensorboard:
                        add_summary(test_writer, step, 'eval perplexity', eval_perp)
                    dev_str += "val_prep: {:.3f}\n".format(eval_perp)

                steps.append(step)
                ep = step // steps_per_epoch
                info = 'epoch {:d}, step {:d},lr:{:.5f}, loss = {:.6f},perp: {:.3f}\n{}({:.3f} sec/step)'
                print(info.format(ep, step, lr, loss_value, t_perp, dev_str, duration))

            if step % checkpoint_every == 0:
                save_model(pcg_model.saver, sess, logdir, step)
                last_saved_step = step

    except KeyboardInterrupt:
        # Introduce a line break after ^C so save message is on its own line.
        print()

    finally:
        # Persist any progress made since the last periodic checkpoint.
        if step > last_saved_step:
            save_model(pcg_model.saver, sess, logdir, step)
from config import Config
from utils.data_utils import (
    read_data,
    write_scores,
    write_predictions,
    get_segment,
    build_dictionary,
    remove_low_words,
)
from models.BowModel import BowModel
from models.BocModel import BocModel
from models.Ensemble import Ensemble

# Script entry point: read the train and dev sets, segment them, filter words
# through a dictionary built from both splits, score the dev set with an
# ensemble of BowModel and BocModel, and write the scores to the result file.
if __name__ == "__main__":
    config = Config()

    # Read both splits before segmenting either one.
    raw_train = read_data(config.train_set_file_name)
    raw_dev = read_data(config.dev_set_file_name)

    # Segmentation.
    segmented_train = get_segment(raw_train)
    segmented_dev = get_segment(raw_dev)

    # The dictionary is built over both splits using the configured
    # low/high frequency settings, then applied to each split in turn.
    dictionary = build_dictionary(
        [segmented_train, segmented_dev],
        config.low_frequency,
        config.high_frequency,
    )
    train_set, dev_set = [
        remove_low_words(split, dictionary)
        for split in (segmented_train, segmented_dev)
    ]

    # Only the dev set is scored; the ensemble combines one BowModel and one
    # BocModel, both constructed from the same config.
    members = [BowModel(config), BocModel(config)]
    ensemble_model = Ensemble(config, members)
    scores = ensemble_model.test(dev_set)

    # Raw scores are written out; write_predictions is imported but unused here.
    write_scores(scores, config.result_file_name)
nnlm_embedder = hub.load(config['tf_hub_model']) batch_size = config['batch_size'] print('Processing Train Data...') train_data, train_tables, in_domain_test = process_train_data( config, nnlm_embedder, config['train_data'], config['train_tables']) train_batches = create_train_batches(train_data, train_tables, config) train_samples_batches = create_samples_batches(train_data, batch_size) train_tables_batches = create_tables_batches(train_tables, config) if config['use_in_domain_test']: in_domain_test_batches = create_samples_batches(in_domain_test, batch_size) print('Processing Dev Data...') dev_data, dev_tables, _ = read_data(config['dev_data'], config['dev_tables'], config['real_proxy_token']) dev_samples_batches = create_samples_batches(dev_data, batch_size) dev_tables_batches = create_tables_batches(dev_tables, config) print('Processing Test Data...') test_data, test_tables, _ = read_data(config['test_data'], config['test_tables'], config['real_proxy_token']) test_samples_batches = create_samples_batches(test_data, batch_size) test_tables_batches = create_tables_batches(test_tables, config) all_data = { 'train_batches': train_batches, 'dev_samples_batches': dev_samples_batches, 'dev_tables_batches': dev_tables_batches, 'test_samples_batches': test_samples_batches,