def _make_data_processor(self):
    processors = {
        "ubuntu": UbuntuProcessor,
    }
    data_dir = self.hparams.data_dir
    self.processor = processors[self.hparams.task_name](self.hparams)

    self.train_examples, self.train_knowledge_examples, self.train_similar_examples = \
        self.processor.get_train_examples(data_dir)
    self.valid_examples, self.valid_knowledge_examples, self.valid_similar_examples = \
        self.processor.get_dev_examples(data_dir)
    self.test_examples, self.test_knowledge_examples, self.test_similar_examples = \
        self.processor.get_test_examples(data_dir)

    self.label_list = self.processor.get_labels()
    self.tokenizer = tokenization.FullTokenizer(self.hparams.vocab_dir,
                                                self.hparams.do_lower_case)
    self.processor.data_process_feature(self.hparams, self.tokenizer)

    # Total optimizer steps over all epochs; the first 10% are used for
    # learning-rate warmup.
    self.num_train_steps = int(
        len(self.train_examples) / self.hparams.train_batch_size
        * self.hparams.num_epochs)
    self.warmup_proportion = 0.1
    self.num_warmup_steps = int(self.num_train_steps * self.warmup_proportion)
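# A minimal sketch of how the schedule values computed above are typically
# consumed. create_optimizer follows the google-research/bert optimization
# module's signature; the import path and the 2e-5 learning rate are
# illustrative assumptions, not values taken from this repository.
from bert_model import optimization  # assumed module layout


def _make_train_op(self, loss):
    # Linear warmup over num_warmup_steps, then linear decay to zero by
    # num_train_steps, as in the original BERT training setup.
    return optimization.create_optimizer(
        loss=loss,
        init_lr=2e-5,  # illustrative learning rate
        num_train_steps=self.num_train_steps,
        num_warmup_steps=self.num_warmup_steps,
        use_tpu=False)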
def __init__(self):
    self.data_dir = ("/mnt/raid5/taesun/data/ubuntu_corpus_v1/ubuntu_data/"
                     "bert_multi_turn_negative_1/bert_%s.pickle") % "train"
    self.bert_vocab_file = "/mnt/raid5/shared/bert/uncased_L-12_H-768_A-12/vocab.txt"
    self.data_l = []
    self.load_data_dir()
    self.tokenizer = tokenization.FullTokenizer(self.bert_vocab_file, True)

    # Token-length statistics, filled in by get_sentence_statistics().
    self.max_dialog_context = 0
    self.max_response = 0
    self.max_utterance = 0
    self.avg_dialog_context = 0
    self.avg_response = 0
    self.avg_utterance = 0
    self.min_dialog_context = 10000
    self.min_response = 10000
    self.min_utterance = 10000

    self.get_sentence_statistics()

    print("=" * 200)
    print("Final Stat Info")
    print("avg_dialog_context", self.avg_dialog_context)
    print("avg_response", self.avg_response)
    print("max_dialog_context", self.max_dialog_context)
    print("max_response", self.max_response)
    print("min_dialog_context", self.min_dialog_context)
    print("min_response", self.min_response)
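# A possible shape for the get_sentence_statistics call above: a hypothetical
# sketch, assuming self.data_l holds (dialog_context, response, label) triples
# like the pickles built elsewhere in this project; the real implementation
# may differ.
def get_sentence_statistics(self):
    context_lens, response_lens = [], []
    for dialog_context, response, _ in self.data_l:
        context_lens.append(len(self.tokenizer.tokenize(dialog_context)))
        response_lens.append(len(self.tokenizer.tokenize(response)))

    self.avg_dialog_context = sum(context_lens) / len(context_lens)
    self.avg_response = sum(response_lens) / len(response_lens)
    self.max_dialog_context = max(context_lens)
    self.max_response = max(response_lens)
    self.min_dialog_context = min(context_lens)
    self.min_response = min(response_lens)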
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info("*** Reading from input files ***")
    for input_file in input_files:
        tf.logging.info("  %s", input_file)

    rng = random.Random(FLAGS.random_seed)
    instances = create_training_instances(
        input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
        FLAGS.short_seq_prob, FLAGS.masked_lm_prob,
        FLAGS.max_predictions_per_seq, rng)

    output_files = FLAGS.output_file.split(",")
    tf.logging.info("*** Writing to output files ***")
    for output_file in output_files:
        tf.logging.info("  %s", output_file)

    write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
                                    FLAGS.max_predictions_per_seq, output_files)
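# Example invocation of this entry point; these flag names match the public
# create_pretraining_data.py flags in google-research/bert, and the paths are
# placeholders:
#
#   python create_pretraining_data.py \
#     --input_file=./sample_text.txt \
#     --output_file=/tmp/tf_examples.tfrecord \
#     --vocab_file=./uncased_L-12_H-768_A-12/vocab.txt \
#     --do_lower_case=True \
#     --max_seq_length=128 \
#     --max_predictions_per_seq=20 \
#     --masked_lm_prob=0.15 \
#     --random_seed=12345 \
#     --dupe_factor=5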
def __init__(self, hparams):
    self.hparams = hparams
    self.bert_config = modeling_base.BertConfig.from_json_file(
        self.hparams.bert_config_dir)
    self.tokenizer = tokenization.FullTokenizer(self.hparams.vocab_dir,
                                                self.hparams.do_lower_case)
    self._make_data_processor()
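# A quick sketch of what the two objects built above expose, assuming the
# standard google-research/bert APIs (BertConfig fields, WordPiece tokenizer);
# the paths and text are illustrative.
config = modeling_base.BertConfig.from_json_file("bert_config.json")
print(config.hidden_size, config.num_hidden_layers)  # 768, 12 for BERT-base

tok = tokenization.FullTokenizer("vocab.txt", do_lower_case=True)
print(tok.convert_tokens_to_ids(tok.tokenize("hello world")))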
def __init__(self, batch_size=cf.batch_size):
    self.mode = None
    self.max_seq_length = cf.max_seq_length
    self.tokenizer = tokenization.FullTokenizer(vocab_file=cf.vocab_file,
                                                do_lower_case=True)
    self.batch_size = batch_size
    self.estimator = None
    self.processor = SimProcessor()  # processor class that loads the train/test data
    # tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    tf.logging.set_verbosity(tf.logging.INFO)
def _make_data_processor(self):
    processors = {
        "fnc": FNCProcessor,
    }
    self.tokenizer = tokenization.FullTokenizer(self.hparams.vocab_dir,
                                                self.hparams.do_lower_case)
    data_dir = self.hparams.data_dir
    self.processor = processors[self.hparams.task_name](self.hparams,
                                                        self.tokenizer)

    self.train_examples, self.label_dict = self.processor.get_train_examples(
        data_dir)
    self.test_examples = self.processor.get_test_examples(data_dir)
    self.label_list = self.processor.get_labels()

    self.num_train_steps = int(
        len(self.train_examples) / self.hparams.train_batch_size
        * self.hparams.num_epochs)
    self.warmup_proportion = 0.1
    self.num_warmup_steps = int(self.num_train_steps * self.warmup_proportion)
def __init__(self, args):
    self.max_len = args.max_len
    self.bert_config_path = args.bert_config_path
    self.bert_path = args.bert_path
    self.loginfo = get_logger(args.log_path)
    self.export_model_path = args.export_model_path
    self.batch_size = args.batch_size
    self.epochs = args.epochs
    self.is_train = args.is_train
    self.config = tf.ConfigProto(allow_soft_placement=True)
    self.config.gpu_options.per_process_gpu_memory_fraction = 0.4  # use at most 40% of GPU memory
    self.tokenize = tokenization.FullTokenizer(args.vocab_path, do_lower_case=True)
    self.num_train_steps = args.num_train_steps
    self.num_warmup_steps = args.num_warmup_steps
    self.init_lr = args.init_lr
    self.restore_on_train = args.restore_on_train
    self.isload = args.isload
    self.rate = args.keep_rate
    self.version = args.version
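# The ConfigProto above only takes effect once it is handed to a session, e.g.
#
#   with tf.Session(config=self.config) as sess:
#       sess.run(tf.global_variables_initializer())
#
# allow_soft_placement lets TF fall back to CPU for ops without a GPU kernel,
# and per_process_gpu_memory_fraction caps this process at ~40% of GPU memory.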
import os

import numpy as np
import pandas as pd

from bert_model import tokenization

abs_path = os.path.abspath('.')
file_path = os.path.dirname(abs_path)
os.sys.path.append(file_path)

train = pd.read_csv('../data/train/classify/train.csv', sep='\t', encoding='utf-8', header=0)
valid = pd.read_csv('../data/train/classify/valid.csv', sep='\t', encoding='utf-8', header=0)
test = pd.read_csv('../data/test/test.csv', sep='\t', encoding='utf-8', header=0)

# Flatten each DataFrame into (label, query, reply, seq_id) tuples.
train = list(zip(train.label, train['query'], train.reply, train.seq_id))
valid = list(zip(valid.label, valid['query'], valid.reply, valid.seq_id))
test = list(zip(test.label, test['query'], test.reply, test.seq_id))

tokenize = tokenization.FullTokenizer('./bert_model/chinese_L-12_H-768_A-12/vocab.txt',
                                      do_lower_case=True)

# Sanity-check the shapes of the first training batch. batch_yield is project
# code defined elsewhere (a hypothetical sketch follows after these checks).
for input_ids, mask_ids, type_ids, labels, seq_ids in batch_yield(
        train, batch_size=64, tokenize=tokenize):
    input_ids = np.array(input_ids)
    labels = np.array(labels)
    seq_ids = np.array(seq_ids)
    print(input_ids.shape)
    print(labels.shape)
    print(seq_ids.shape)
    print('=' * 10)
    break

# Same check for the first validation batch.
for input_ids, mask_ids, type_ids, labels, seq_ids in batch_yield(
        valid, batch_size=64, tokenize=tokenize):
    input_ids = np.array(input_ids)
    labels = np.array(labels)
    seq_ids = np.array(seq_ids)
    print(input_ids.shape)
    break
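# batch_yield is defined elsewhere in this project; the stand-in below is a
# hypothetical sketch of the contract the loops above rely on (per-batch
# input_ids / mask_ids / type_ids / labels / seq_ids), built from the standard
# FullTokenizer API. The name batch_yield_sketch and the max_seq_len parameter
# are assumptions.
def batch_yield_sketch(data, batch_size, tokenize, max_seq_len=128):
    for start in range(0, len(data), batch_size):
        input_ids, mask_ids, type_ids, labels, seq_ids = [], [], [], [], []
        for label, query, reply, seq_id in data[start:start + batch_size]:
            # [CLS] query [SEP] reply [SEP], truncated and zero-padded.
            q = tokenize.tokenize(str(query))
            r = tokenize.tokenize(str(reply))
            tokens = (["[CLS]"] + q + ["[SEP]"] + r + ["[SEP]"])[:max_seq_len]
            ids = tokenize.convert_tokens_to_ids(tokens)
            mask = [1] * len(ids) + [0] * (max_seq_len - len(ids))
            types = ([0] * (len(q) + 2) + [1] * (len(r) + 1))[:max_seq_len]
            types = types + [0] * (max_seq_len - len(types))
            input_ids.append(ids + [0] * (max_seq_len - len(ids)))
            mask_ids.append(mask)
            type_ids.append(types)
            labels.append(label)
            seq_ids.append(seq_id)
        yield input_ids, mask_ids, type_ids, labels, seq_ids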
# -----------------------------------------------------------------------------
import pickle

import numpy as np


def sentence_heatmap(score_sent_mat, dialog, response):
    # Max-pool the score matrix over dialog tokens, sharpen with a temperature
    # factor of 25, and normalize into a distribution over response tokens.
    hm_sent_mat = softmax(np.max(score_sent_mat, axis=0) * 25, dim=-1)
    print(response)
    print(list(hm_sent_mat))


def softmax(x, dim=-1):
    """Compute softmax values for each set of scores in x."""
    exp_x = np.exp(x)
    sum_exp_x = np.sum(exp_x, axis=dim)
    sf = exp_x / np.expand_dims(sum_exp_x, axis=dim)
    return sf


if __name__ == '__main__':
    tokenizer = tokenization.FullTokenizer(
        "/mnt/raid5/shared/bert/tensorflow/uncased_L-12_H-768_A-12/vocab.txt", True)

    with open("./attention_score_%s.pickle" % idx, "rb") as frb_handle:
        dialog, response, raw_dialog, raw_response, sequence_rep = pickle.load(frb_handle)

    dialog_len = len(dialog)
    response_len = len(response)
    dialog_rep = np.array(sequence_rep[0:dialog_len])              # (dialog_len, 768)
    response_rep = np.array(sequence_rep[280:280 + response_len])  # (response_len, 768)

    dialog_merged_embeddings = merge_subtokens(
        [" ".join(raw_dialog)], tokenizer, np.expand_dims(dialog_rep, 0), is_cls=True)
    response_merged_embeddings = merge_subtokens(
        [" ".join(raw_response)], tokenizer, np.expand_dims(response_rep, 0))

    dialog_sentence, response_sentence = [], []
    dialog_sent_vec, response_sent_vec = [], []
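# Note: the softmax above exponentiates raw scores directly, which can overflow
# for large inputs (the *25 temperature scaling makes this more likely). A
# numerically stable variant subtracts the per-axis max first; the result is
# identical because softmax is invariant to shifting its inputs.
def stable_softmax(x, dim=-1):
    shifted = x - np.max(x, axis=dim, keepdims=True)
    exp_x = np.exp(shifted)
    return exp_x / np.sum(exp_x, axis=dim, keepdims=True)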
import os
import pickle
import random


def make_bert_multi_turn_data_pickle(num_negative_samples=5):
    from bert_model import tokenization

    tokenizer = tokenization.FullTokenizer(
        "/mnt/raid5/shared/bert/tensorflow/uncased_L-12_H-768_A-12/vocab.txt", True)
    dialog_turn_num = 10
    orig_path = "/mnt/raid5/taesun/data/ubuntu_corpus_v1/ubuntu_data/%s.txt"
    data_path = "bert_%s_eot_none.pickle"
    file_dir = "/mnt/raid5/taesun/data/ubuntu_corpus_v1/ubuntu_data/bert_dialog_turn_stat_len"
    data_type = ["test"]

    if not os.path.exists(file_dir):
        os.makedirs(file_dir)

    for t in data_type:
        print(t + " data is loading now...")
        curr_idx = 0
        stat_utt_len = 0
        with open(os.path.join(file_dir, data_path % t), "wb") as fw_handle:
            dialog_data_l, candidates_pool = get_dialog_dataset(orig_path % t, is_eot=True)
            print(len(dialog_data_l))
            print("candidates_pool", len(candidates_pool))
            current_ground_truth = ""
            print(dialog_data_l[0])

            for idx, dialog_data in enumerate(dialog_data_l):
                utterances = dialog_data[0]
                response = dialog_data[1][0]
                label = str(dialog_data[2])
                # Only ground-truth pairs survive this skip, which leaves the
                # label == "0" negative-sampling branch below inactive in this
                # configuration.
                if label == "0":
                    continue

                dialog_context = ""
                utt_count = 0
                for utt in utterances:
                    dialog_context += utt
                    utt_count += 1
                if utt_count != dialog_turn_num:
                    continue
                stat_utt_len += len(tokenizer.tokenize(dialog_context))

                if t in ["test", "valid"]:
                    pickle.dump([dialog_context, response, label], fw_handle)
                    curr_idx += 1
                    continue

                # pos : neg ==> 1 : 1
                if num_negative_samples == 1:
                    pickle.dump([dialog_context, response, label], fw_handle)
                    curr_idx += 1
                else:
                    if label == "1":
                        current_ground_truth = response
                        pickle.dump([dialog_context, response, label], fw_handle)
                        curr_idx += 1

                    # negative samples
                    if label == "0":
                        for post_idx in range(1, num_negative_samples + 1):
                            try:
                                neg_sample = dialog_data_l[idx + 2 * post_idx][1][0]
                            except IndexError:
                                print(idx, ":", idx + 2 * post_idx, "index Error")
                                # Fall back to a random candidate, excluding the
                                # current response and the last ground truth.
                                neg_sample = random.sample(
                                    list(candidates_pool
                                         - {response, current_ground_truth}), 1)[0]
                            finally:
                                pickle.dump([dialog_context, neg_sample, label],
                                            fw_handle)
                                curr_idx += 1

                if curr_idx % 10000 == 0:
                    print(str(curr_idx) + " data has been saved now...")
                    print(dialog_context)

        print(str(t), stat_utt_len, curr_idx)
        print(t + " data pickle save complete")
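# The writer above appends one pickled [dialog_context, response, label] record
# per example to a single file, so a reader has to call pickle.load repeatedly
# until EOF. A minimal read-back sketch:
def read_pickle_records(path):
    records = []
    with open(path, "rb") as fr_handle:
        while True:
            try:
                records.append(pickle.load(fr_handle))
            except EOFError:
                break
    return records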