def get_wholeword_label_str(input_ids, config=None, tokenizer=None):
    """
    Get whole-word label_str from input_ids.

    Args:
        input_ids: Tensor(batch_size, seq_length), indices of the input text.
        config: GPT2Config, config of the GPT2 model; if not given, this function
            creates a MockConfig from the shape of input_ids, optional.
        tokenizer: GPT2Tokenizer; if not given, it is created using the default
            settings in utils.tokenization, optional.

    Returns:
        label_str: [str], last-word strings used as labels for the LAMBADA task.
    """
    if tokenizer is None:
        tokenizer = Tokenizer()
    if config is None:
        config = MockConfig()
        config.batch_size = input_ids.shape[0]
        config.seq_length = input_ids.shape[1]
        config.vocab_size = tokenizer.vocab_size

    # lastword_range is a list of tuples: [..., (start_position_i, end_position_i), ...]
    lastword_range = get_lastword_range(input_ids, config, tokenizer=tokenizer)

    # input_ids is shifted right by one step because every first token is <BOS>
    ids = input_ids[:, 1:].asnumpy()
    label_ids = [id_[index[0]:index[1]].tolist() for index, id_ in zip(lastword_range, ids)]

    # use GPT2Tokenizer to decode
    label_str = [tokenizer.decode(label_id) for label_id in label_ids]

    return label_str
def main(config):
    print(config)

    list_of_tokens = []
    if config.is_tokenized:
        # read tokens
        with open(config.corpus, 'r', encoding='utf8') as reader:
            for li, line in enumerate(reader):
                list_of_tokens += line.strip().split()
    else:
        # select tokenizer
        if config.tokenizer == 'mecab':
            from konlpy.tag import Mecab
            tokenizer = Tokenizer(tokenization_fn=Mecab().morphs)

        # tokenization & read tokens
        with open(config.corpus, 'r', encoding='utf8') as reader:
            for li, line in enumerate(reader):
                list_of_tokens += tokenizer.tokenize(line.strip())

    # build vocabulary
    vocab = Vocab(list_of_tokens=list_of_tokens,
                  unk_token=config.unk_token,
                  pad_token=config.pad_token,
                  bos_token=config.bos_token,
                  eos_token=config.eos_token,
                  min_freq=config.min_freq,
                  lower=config.lower)
    vocab.build()
    print('Vocabulary size: ', len(vocab))

    # save vocabulary
    with open(config.vocab, 'wb') as writer:
        pickle.dump(vocab, writer)
    print('Vocabulary saved to', config.vocab)
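The pickled vocabulary written by this script is reloaded elsewhere in this collection with a plain pickle.load; a minimal sketch of the round trip, assuming config.vocab points at the file saved above:

import pickle

# Reload the vocabulary saved by main(); the Vocab object round-trips via pickle.
with open(config.vocab, 'rb') as reader:
    vocab = pickle.load(reader)
print('Reloaded vocabulary size:', len(vocab))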
def lemmatize(self, stop_words=None):
    tokenizer = Tokenizer(stop_words=stop_words)
    lemmatizer = Lemmatizer(stop_words=stop_words)
    self.lemmatized_queries = dict()
    for q_id in self.queries.dict.keys():
        q = self.queries.get(q_id)
        tok_q = tokenizer.fit_transform(q)
        lem_q = lemmatizer.fit_transform(tok_q)
        self.lemmatized_queries[int(q_id)] = lem_q
def get_lastword_range(input_ids, config=None, tokenizer=None):
    """
    Get the range of the tokenized last word in input_ids.

    Args:
        input_ids: Tensor(batch_size, seq_length).
        config: GPT2Config, config of the GPT2 model; if not given, this function
            creates a MockConfig from the shape of input_ids, optional.
        tokenizer: GPT2Tokenizer; if not given, it is created using the default
            settings in utils.tokenization, optional.

    Returns:
        lastword_range: list(tuple), start and end positions of the last word of
            each text in the string list, used to select the tokenized last-word
            indices in the logits: lastword_logits --> logits[batch_index, start:end, ::]
    """
    if tokenizer is None:
        tokenizer = Tokenizer()
    if config is None:
        config = MockConfig()
        config.batch_size = input_ids.shape[0]
        config.seq_length = input_ids.shape[1]

    string_list = extract_string_from_tensor(input_ids, mode='single', tokenizer=tokenizer, config=config)
    # prefix, _ = split_by_last_word(string_list)
    prefix = split_by_last_word(string_list)
    lastword_range = _get_lastword_range(prefix, string_list, tokenizer)

    return lastword_range
def main(args):
    print(args)

    # Load tokenizer
    if args.tokenizer == 'sentencepiece':
        tokenizer = PretrainedTokenizer(pretrained_model=args.pretrained_model, vocab_file=args.vocab_file)
    else:
        tokenizer = TOKENIZER_CLASSES[args.tokenizer]()
        tokenizer = Tokenizer(tokenizer=tokenizer, vocab_file=args.vocab_file)

    # Build DataLoader
    train_dataset = create_examples(args, tokenizer, mode='train')
    test_dataset = create_examples(args, tokenizer, mode='test')
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=True)

    # Build Trainer
    trainer = Trainer(args, train_loader, test_loader, tokenizer)

    # Train & Validate
    for epoch in range(1, args.epochs + 1):
        trainer.train(epoch)
        trainer.validate(epoch)
        trainer.save(epoch, args.output_model_prefix)
def main(args):
    print(args)

    # Load tokenizer
    if args.tokenizer == 'sentencepiece':
        tokenizer = PretrainedTokenizer(pretrained_model=args.pretrained_model, vocab_file=args.vocab_file)
    else:
        tokenizer = TOKENIZER_CLASSES[args.tokenizer]()
        tokenizer = Tokenizer(tokenizer=tokenizer, vocab_file=args.vocab_file)

    # Load model
    device = 'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu'
    model = torch.load(args.model).to(device)
    model.eval()

    # Make input
    text = ('I have to admit, I got so emotional all throughout the movie. '
            'And some parts brought me to tears. The cast was phenomenal and '
            'I think every superhero got to have their spotlight.')
    tokens = tokenizer.tokenize(text)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    padding_length = args.max_seq_len - len(input_ids)
    input_ids = input_ids + ([tokenizer.pad_token_id] * padding_length)
    input_ids = torch.tensor(input_ids).unsqueeze(0).to(device)

    print('--------------------------------------------------------')
    print('tokens: {}'.format(tokens))
    print('input_ids: {}'.format(input_ids))
    print('|input_ids|: {}'.format(input_ids.size()))
    print('--------------------------------------------------------')

    # Inference
    output, attention_weights = model(input_ids)
    print('class: {}'.format(output.argmax(dim=1)))
def add(self, string: str, tokenizer: Tokenizer):
    """
    Adds the given string to the trie. The tokenizer is used so that we're robust
    to nuances in whitespace and punctuation. Use the same tokenizer throughout.
    """
    # TODO: Make the tokenizer a class variable.
    self._add(" ".join(tokenizer.strings(string)))
def load_queries(queries_filename):
    file = open(queries_filename, 'r')
    queries = {}
    vocab = Vocab()
    tokenizer = Tokenizer()
    lemmatizer = Lemmatizer()
    for l in file.readlines():
        l = l.replace('\n', '')
        l_arr = l.split('\t')
        q = Query()
        q.id = int(l_arr[0])
        q_text = l_arr[1]
        q_syn_text = ''
        if len(l_arr) > 2:
            q_syn_text = l_arr[2]
        q.text = q_text + ' ' + q_syn_text
        q.tokens = lemmatizer.fit_transform(tokenizer.fit_transform(q_text))
        q.synonim_tokens = lemmatizer.fit_transform(tokenizer.fit_transform(q_syn_text))
        queries[q.id] = q
    file.close()

    # create vocab
    for q_id in queries.keys():
        q = queries[q_id]
        tokens = q.tokens + q.synonim_tokens
        vocab.add_phrase(tuple(q.tokens))
        for tkn in tokens:
            vocab.add1(tkn)
        grams, inv_grams, gap_grams = get_ngrams(tokens, 2, inverted=True, with_gap=True)
        for g in grams + inv_grams + gap_grams:
            vocab.add2(g)
    return queries, vocab
def _get_lastword_range(prefix, stringlist, tokenizer=None):
    """
    Get the range of the tokenized last word in label_ids.

    Args:
        prefix: list(str), texts with the last word removed (the "prefix"), as strings.
        stringlist: list(str), the full texts, same as in split_by_last_word.
        tokenizer: GPT2Tokenizer; if not given, it is created using the default
            settings in utils.tokenization, optional.

    Returns:
        lastword_range: list(tuple), start and end positions of the last word of
            each text in stringlist, used to select the tokenized last-word indices
            in the logits: lastword_logits --> logits[batch_index, start:end, ::]
    """
    if tokenizer is None:
        tokenizer = Tokenizer()
        print('[WARNING] parameter: tokenizer is missing in utils.lambada_utils.last_word_index, '
              'using Tokenizer() as the default tokenizer')

    prefix_ids_len = [len(tokenizer.encode(prefix_str)) for prefix_str in prefix]  # +1 for including bos
    full_ids_len = [len(tokenizer.encode(full_str)) for full_str in stringlist]    # +1 for including bos

    # lastword_range = [(prefix_length, full_length) for prefix_length, full_length in zip(prefix_ids_len, full_ids_len)]
    lastword_range_ = [(prefix_length, full_length)
                       for prefix_length, full_length in zip(prefix_ids_len, full_ids_len)]

    lastword_range = []
    for i in range(len(lastword_range_)):
        full_ids = tokenizer.encode(stringlist[i])
        last_prefix_id = tokenizer.encode(prefix[i])[-1]
        range_left = prefix_ids_len[i]
        # search backwards for the last prefix token to locate where the last word starts
        for j in range(len(full_ids) - 2, 0, -1):
            if full_ids[j] == last_prefix_id:
                range_left = j + 1
                break
        lastword_range.append((range_left, lastword_range_[i][1]))

    return lastword_range
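As a purely illustrative sketch of how those (start, end) pairs behave, here is a hypothetical whitespace toy_encode standing in for tokenizer.encode (no BOS handling): the prefix length marks where the last word starts and the full length marks where it ends.

# Illustrative only: toy_encode is a hypothetical stand-in for GPT2Tokenizer.encode.
def toy_encode(text):
    return text.split()

full = "the cat sat on the mat"
prefix = "the cat sat on the"      # full text with its last word removed
start = len(toy_encode(prefix))    # 5
end = len(toy_encode(full))        # 6
assert toy_encode(full)[start:end] == ["mat"]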
def main(args):
    logger.info(f"Args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    spm_path = os.path.join('spm', args.spm, "spm.model")
    args.sample = parse_sample_options(args.sample)

    logger.info(f"Loading tokenizer from {spm_path}")
    tokenizer = Tokenizer(spm_path)
    args.ntoken = ntoken = len(tokenizer)
    logger.info(f"  Vocabulary size: {ntoken}")

    logger.info("Reading dataset")
    data = {}
    for x in ['train', 'valid', 'test']:
        data[x] = read_data(os.path.join(args.data_dir, f"{x}.query.txt"), min_len=args.min_len)
        logger.info(f"  Number of {x:>5s} data: {len(data[x]):8d}")

    logger.info("Preparing model and optimizer")
    config = LMConfig(ntoken, args.ninp, args.nhid, args.nlayers,
                      args.dropouti, args.dropoutr, args.dropouth, args.dropouto)
    model = LanguageModel(config).to(device)
    params = get_params(model)
    logger.info(f"  Number of model parameters: {sum(p.numel() for p in params)}")
    optimizer = torch.optim.Adam(params)

    if args.resume:
        logger.info(f"Loading model from {args.resume}")
        model_load(args.resume, model, optimizer)
        model = model.to(device)

    if n_gpu > 1:
        logger.info(f"Making model as data parallel")
        model = torch.nn.DataParallel(model, dim=1)

    train(model, optimizer, tokenizer, data['train'], data['valid'], args)
    test(model, tokenizer, data['test'], args)
def main(args):
    logger.info(f"Args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    spm_path = os.path.join('spm', args.spm, "spm.model")
    logger.info(f"Loading tokenizer from {spm_path}")
    tokenizer = Tokenizer(spm_path)
    args.ntoken = ntoken = len(tokenizer)
    args.branching_factor = min([args.branching_factor, args.ntoken])
    logger.info(f"  Vocab size: {ntoken}")

    n_queries_str = f"{f'only {args.n_queries} samples' if args.n_queries else 'all'} queries from"
    logger.info(f"Reading a dataset ({n_queries_str} test.query.txt)")
    seen_set = set(read_data(os.path.join(args.data_dir, "train.query.txt"), min_len=args.min_len))
    test_data = read_data(os.path.join(args.data_dir, "test.query.txt"), min_len=args.min_len)
    if args.n_queries:
        random.seed(args.seed)
        test_data = random.sample(test_data, args.n_queries)
    n_seen_test_data = len([x for x in test_data if x in seen_set])
    n_unseen_test_data = len(test_data) - n_seen_test_data
    logger.info(
        f"  Number of test data: {len(test_data):8d} (seen {n_seen_test_data}, unseen {n_unseen_test_data})"
    )

    logger.info(f"Loading model from {args.model_dir}")
    model = model_load(args.model_dir)
    model = model.to(device)

    logger.info('Generation starts!')
    with torch.no_grad():
        generate(model, tokenizer, test_data, args,
                 seen_set=seen_set, calc_mrl=args.calc_mrl)
def __init__(self, data_file, meta_info_file, vocab_file, max_seq_length, max_label_num=10, **kwargs):
    super(MetaIntentDataset, self).__init__(data_file, **kwargs)
    self.tokenizer = Tokenizer(backend="bert", vocab_file=vocab_file)
    self.max_seq_length = max_seq_length
    self.max_label_num = max_label_num

    with io.open(meta_info_file) as f:
        meta_info_json = eval(json.load(f))['data']

    self.task_to_idx = dict()
    self.task_to_label_mapping = dict()
    self.task_to_label_features = dict()
    self.label_to_memory_id = {"PAD": 0}
    for task_label_info in meta_info_json:
        # labels contained in this task
        labels = task_label_info["labelMap"]
        label_map = {label: idx for idx, label in enumerate(labels)}
        # task_key: task name
        task_key = task_label_info["taskKey"]
        self.task_to_idx[task_key] = len(self.task_to_idx)
        self.task_to_label_mapping[task_key] = label_map
        for label in labels:
            # Note: different tasks may reuse the same label name; within the same
            # dataset an identical label name is assumed to carry the same meaning.
            if label not in self.label_to_memory_id:
                self.label_to_memory_id[label] = len(self.label_to_memory_id)
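As a reading aid, a hypothetical meta_info_file payload consistent with the fields accessed above (a top-level "data" list whose items carry "taskKey" and "labelMap"); the real schema may differ:

# Hypothetical example of the decoded meta-info structure; the file itself holds a
# JSON-encoded repr of this dict, hence the eval(json.load(f)) double decoding above.
meta_info = {
    "data": [
        {"taskKey": "weather_intent", "labelMap": ["ask_weather", "ask_temperature"]},
        {"taskKey": "music_intent", "labelMap": ["play_song", "pause_song"]},
    ]
}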
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument("--train_folder", default=None, type=str, help="QA folder for training. E.g., train") parser.add_argument("--dev_folder", default=None, type=str, help="QA folder for dev. E.g., dev") parser.add_argument("--test_folder", default=None, type=str, help="QA folder for test. E.g., test") parser.add_argument("--vocab_file", default=None, type=str, help="Vocab txt for vocabulary") parser.add_argument("--KB_file", default=None, type=str, help="KB json for question answering") parser.add_argument("--M2N_file", default=None, type=str, help="mid2name json for question answering") parser.add_argument("--QUERY_file", default=None, type=str, help="query json for recording searched queries") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written") # Other parameters parser.add_argument("--load_model", default=None, type=str, help="The pre-trained model to load") parser.add_argument("--save_model", default='BaseSave', type=str, help="The name that the models save as") parser.add_argument("--config", default='config/base_config.json', help="The config of base model") parser.add_argument("--num_train_epochs", default=20, type=int, help="The epoches of training") parser.add_argument("--do_train", default=1, type=int, help="Whether to run training") parser.add_argument("--do_eval", default=1, type=int, help= "Whether to run eval") parser.add_argument("--train_batch_size", default=1, type=int, help="Total batch size for training") parser.add_argument("--eval_batch_size", default=1, type=int, help="Total batch size for eval") parser.add_argument("--learning_rate", default=5e-6, type=float, help="Total number of training epoches to perform") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for.") parser.add_argument("--seed", default=123, type=int, help="random seeed for initialization") parser.add_argument("--gpu_id", default=1, type=int, help="id of gpu") parser.add_argument("--top_k", default=1, type=int, help="retrieve top k relation path during prediction") parser.add_argument("--max_hop_num", default=1, type=int, help="maximum hop number") parser.add_argument("--do_policy_gradient", default=1, type=int, help="Whether to train with policy gradient. 
1: use policy gradient; 2: use maximum likelihood with beam") args = parser.parse_args() if torch.cuda.is_available(): logger.info("cuda {} is available".format(args.gpu_id)) device = torch.device("cuda", args.gpu_id) # n_gpu = 1 else: device = None logger.info("cuda is unavailable") random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir, exist_ok=True) load_model_file = args.load_model+".bin" if args.load_model else None save_model_file = os.path.join(args.output_dir, args.save_model+".bin") if args.save_model else os.path.join(args.output_dir, "base_model.bin") save_eval_cp_file = os.path.join(args.output_dir, args.save_model+"_predcp.txt") save_eval_file = os.path.join(args.output_dir, args.save_model+".txt") save_kb_cache = os.path.join(os.path.dirname(args.KB_file), "kb_cache.json") save_m2n_cache = os.path.join(os.path.dirname(args.M2N_file), "m2n_cache.json") save_query_cache = os.path.join(os.path.dirname(args.QUERY_file), "query_cache.json") tokenizer = Tokenizer(args.vocab_file) KB = {} if args.do_eval == 2 else convert_json_to_load(Load_KB_Files(args.KB_file)) if args.KB_file else None M2N = {} if args.do_eval == 2 else Load_KB_Files(args.M2N_file) QUERY = set() if args.do_eval == 2 else set(Load_KB_Files(args.QUERY_file)) config = ModelConfig.from_json_file(args.config) policy = Policy(config, tokenizer.vocab, device) if load_model_file and os.path.exists(load_model_file): model_dic = torch.load(load_model_file, map_location='cpu') policy.load_state_dict(model_dic, strict=True) print("successfully load pre-trained model ...") elif config.method in ['Bert']: model_dic = torch.load('config/pytorch_model.bin', map_location='cpu') model_dic = {re.sub('bert', 'ranker', k): v for k, v in model_dic.items()} model_dic['ranker.embeddings.token_type_embeddings.weight'] = torch.cat([model_dic['ranker.embeddings.token_type_embeddings.weight'], model_dic['ranker.embeddings.token_type_embeddings.weight'][1:]], 0) if config.method in ['Bert_tmp']: model_dic.update({re.sub('encoder', 'KBencoder', k): v for k, v in model_dic.items() if re.search('encoder', k)}) policy.load_state_dict(model_dic, strict=False) print("successfully load Bert model ...") else: print("successfully initialize model ...") #print(policy.ranker.decoder.weight.data); exit() if args.gpu_id: policy.to(device) global_step, max_eval_reward, t_total = 0, -0.1, 0 if args.do_eval: dev_instances = create_instances(input_file=args.dev_folder, tokenizer=tokenizer) test_instances = create_instances(input_file=args.test_folder, tokenizer=tokenizer) logger.info("***** Loading evaluation *****") logger.info(" Num dev examples = %d", len(dev_instances)) logger.info(" Num test examples = %d", len(test_instances)) logger.info(" Batch size = %s", args.eval_batch_size) if args.do_train: train_instances = create_instances(input_file=args.train_folder, tokenizer=tokenizer) logger.info("***** Loading training ******") logger.info(" Num examples = %d" , len(train_instances)) logger.info(" Batch size = %s", args.train_batch_size) t_total = len(train_instances)*args.num_train_epochs # Prepare optimizer # param_optimizer = list(policy.named_parameters()) # param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] # no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] # optimizer_grouped_parameters = [ # {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 
0.01}, # {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} # ] # optimizer = BertAdam(optimizer_grouped_parameters, # lr=args.learning_rate, # warmup=args.warmup_proportion, # t_total=t_total) param_optimizer = list(policy.parameters()) optimizer = optim.Adam(param_optimizer, lr=args.learning_rate) args.num_train_epochs = 1 if not args.do_train else args.num_train_epochs for epoch in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss, tr_LM_loss, tr_reward, tr_reward_boundary, hop1_tr_reward, nb_tr_examples, nb_tr_steps, query_num = 0., 0., 0., 0., 0, 0, 0, 0. if args.do_train: policy.train() if args.do_eval == 2: train_instances = train_instances[:1] random.shuffle(train_instances) for step, batch in enumerate(train_instances[:5000]): #print(step) done, skip_forward = False, False time, _total_losses = 0, 0 while time < args.max_hop_num: # Retrieve graphs based on the current graph cp, ts, tn, ty_n, su_n, ye_n, an_n, hn, RAs, mcl, qr_n, done = retrieve_KB(batch, KB, QUERY, M2N, tokenizer, config.method, time = time, is_train=True, save_model=args.save_model) query_num += qr_n if len(cp) == 0: skip_forward = True; break # When there is no candidate paths for the question, skip ready_batch = select_field(batch.question, cp, ts, tn, ty_n, su_n, ye_n, an_n, hn, RAs, mcl, is_train=True, method=config.method, save_model=args.save_model) if args.gpu_id: ready_batch = tuple(t.to(device) for t in ready_batch) # Step through environment using chosen action _logits, _losses = policy(ready_batch, None) _total_losses += _losses if _losses else 0 logits = _logits.cpu().data.numpy() if args.gpu_id else _logits.data.numpy() adjust_F1s = torch.tensor(batch.current_F1s, dtype=torch.float).view(1, -1) F1s = torch.tensor(batch.F1s, dtype=torch.float).view(1, -1) if args.gpu_id: _adjust_F1s, _F1s = adjust_F1s.to(device), F1s.to(device) if torch.isnan(_logits).any() or (_logits.size()!= _adjust_F1s.size()): skip_forward = True; break # When there is a bug, skip _action, _adjust_loss = select_action(policy, _logits, adjust_F1s = _adjust_F1s, F1s = _F1s, is_train=True, is_reinforce=args.do_policy_gradient, epoch=epoch) #True if args.do_policy_gradient ==2: loss= update_policy_immediately(_adjust_loss, optimizer) action = _action.cpu().data.numpy() if args.gpu_id else _action.data.numpy() eval_metric = 'GraphAcc' if (time==0 and tokenizer.dataset in ['CWQ']) else 'AnsAcc' if (tokenizer.dataset in ['FBQ']) else 'F1Text' if (tokenizer.dataset in ['CQ']) else 'F1' reward, _, done, _, _ = generate_F1(logits, action, batch, time = time, is_train=True, eval_metric=eval_metric, M2N=M2N) if time== 0 and tokenizer.dataset in ['CWQ']: hop1_tr_reward += np.mean(reward) update_train_instance(batch, action) # Save reward policy.reward_episode.append(reward) if done: break # When the best path in the previous iteration is same as the best path in current iteration time += 1 #if np.max(batch.orig_F1s) > reward: print(np.max(batch.orig_F1s)); print(reward); exit() # Used to determine when the environment is solved. 
if not skip_forward: if args.do_policy_gradient != 2: lr_this_step = args.learning_rate * warmup_linear(global_step/t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step loss = update_policy(_adjust_loss, policy, optimizer, batch, device = device, LM_loss = _total_losses, is_reinforce=args.do_policy_gradient) tr_loss += loss if _total_losses: tr_LM_loss += _total_losses.item() tr_reward_boundary += np.max(batch.orig_F1s) tr_reward += np.mean(reward) nb_tr_examples += 1 nb_tr_steps += 1 global_step += 1 policy.reset() batch.reset() if (step + 1) % 5000 == 0: print('trained %s instances ...' %step) # model_to_save = policy.module if hasattr(policy, 'module') else policy # torch.save(model_to_save.state_dict(), save_model_file) # Save_KB_Files(convert_json_to_save(KB), save_kb_cache) # Save_KB_Files(M2N, save_m2n_cache) # Save_KB_Files(list(QUERY), save_query_cache) if args.do_eval: policy.eval() eval_reward, nb_eval_steps, nb_eval_examples = 0, 0, 0 if args.do_eval == 2: dev_instances = dev_instances[:1] for eval_step, batch in enumerate(dev_instances): done, skip_forward, pred_cp = False, False, '' time = 0 #print(eval_step) while time < args.max_hop_num: time1 = mytime.time() cp, ts, tn, ty_n, su_n, ye_n, an_n, hn, RAs, mcl, qr_n, done = retrieve_KB(batch, KB, QUERY, M2N, tokenizer, config.method, time = time) query_num += qr_n if len(cp) == 0: skip_forward = True; break ready_batch = select_field(batch.question, cp, ts, tn, ty_n, su_n, ye_n, an_n, hn, RAs, mcl, method=config.method) if args.gpu_id: ready_batch = tuple(t.to(device) for t in ready_batch) # Step through environment using chosen action with torch.no_grad(): _logits, _ = policy(ready_batch, None) logits = _logits.cpu().data.numpy() if args.gpu_id else _logits.data.numpy() _action, _ = select_action(policy, _logits, is_train=False, k=args.top_k) action = _action.cpu().data.numpy() if args.gpu_id else _action.data.numpy() eval_metric = 'AnsAcc' if (tokenizer.dataset in ['FBQ']) else 'F1Text' if (tokenizer.dataset in ['CQ']) else 'F1' reward, pred_cp, done, _, _ = generate_F1(logits, action, batch, time = time, is_train = False, eval_metric=eval_metric, M2N=M2N) update_train_instance(batch, action) if done: break time += 1 if not skip_forward: eval_reward += np.mean(reward) nb_eval_examples += 1 nb_eval_steps += 1 batch.reset() #print(logits); exit() result = {'training loss': tr_loss/np.max([nb_tr_examples, 1.e-10]), 'training reward': tr_reward/np.max([nb_tr_examples, 1.e-10]), 'dev reward': eval_reward/np.max([nb_eval_examples, 1.e-10])} if tokenizer.dataset in ['CWQ', 'WBQ']: result['train reward boundary'] = tr_reward_boundary/np.max([nb_tr_examples, 1.e-10]) if tokenizer.dataset in ['CWQ']: result['training hop1 acc'] = hop1_tr_reward/np.max([nb_tr_examples, 1.e-10]) if 'LM' in config.method: result['training LM loss'] = tr_LM_loss/np.max([nb_tr_examples, 1.e-10]) eval_reward = eval_reward/np.max([nb_eval_examples, 1.e-10]) if eval_reward >= max_eval_reward: max_eval_reward = eval_reward if args.do_eval == 2: test_instances = test_instances[:1] eval_reward, nb_eval_steps, nb_eval_examples, eval_pred_cps, eval_pred_top_ans, eval_reward_boundary = 0, 0, 0, [], [], 0 for eval_step, batch in enumerate(test_instances): #[328:329] done, skip_forward, pred_cp = False, False, '' time, reward, top_pred_ans = 0, [0], defaultdict(int) #print(eval_step) while time < args.max_hop_num: time1 = mytime.time() cp, ts, tn, ty_n, su_n, ye_n, an_n, hn, RAs, mcl, qr_n, done = 
retrieve_KB(batch, KB, QUERY, M2N, tokenizer, config.method, time = time) query_num += qr_n if len(cp) == 0: skip_forward = True break ready_batch = select_field(batch.question, cp, ts, tn, ty_n, su_n, ye_n, an_n, hn, RAs, mcl, method=config.method) if args.gpu_id: ready_batch = tuple(t.to(device) for t in ready_batch) # Step through environment using chosen action with torch.no_grad(): _logits, _ = policy(ready_batch, None) _logits = F.softmax(_logits, 1) logits = _logits.cpu().data.numpy() if args.gpu_id else _logits.data.numpy() adjust_F1s = torch.tensor(batch.current_F1s, dtype=torch.float).view(1, -1) if args.gpu_id: _adjust_F1s = adjust_F1s.to(device) _action, _ = select_action(policy, _logits, is_train=False, k=args.top_k, time = time) # adjust_F1s = _adjust_F1s, if time < 2 else None action = _action.cpu().data.numpy() if args.gpu_id else _action.data.numpy() eval_metric = 'AnsAcc' if (tokenizer.dataset in ['FBQ']) else 'F1Text' if (tokenizer.dataset in ['CQ']) else 'Hits1' if (tokenizer.dataset in ['CWQ']) else 'F1' reward, pred_cp, done, pred_ans, top_pred_ans = generate_F1(logits, action, batch, time = time, is_train = False, eval_metric=eval_metric, M2N=M2N, top_pred_ans=top_pred_ans) update_train_instance(batch, action) if done: break time += 1 #if len(pred_cp.split(' ')) < 2: print(eval_step); exit() eval_pred_cps += [re.sub('\n', '', '%s\t%s\t%s\t%s' %(eval_step+1, pred_cp, reward, '\t'.join(pred_ans)))] eval_pred_top_ans += [top_pred_ans] #print(top_pred_ans) if not skip_forward: #if np.max(batch.orig_F1s) > np.mean(reward): print(batch.orig_F1s); print(reward); print(eval_step); exit() eval_reward += np.mean(reward) eval_reward_boundary += np.max(batch.orig_F1s) nb_eval_examples += 1 nb_eval_steps += 1 batch.reset() result['test reward'] = eval_reward/np.max([nb_eval_examples, 1.e-10]) result['query times'] = '%s (save model) %s' %(query_num, mask_weight) if args.do_eval == 2: print(result); exit() if tokenizer.dataset in ['CWQ', 'WBQ', 'CQ']: result['test reward boundary'] = eval_reward_boundary/np.max([nb_eval_examples, 1.e-10]) g = open(save_eval_cp_file, "w") g.write('\n'.join(eval_pred_cps)) g.close() if eval_pred_top_ans: g = open(re.sub('.txt$', '.json', save_eval_cp_file), "w") for top_pred_ans in eval_pred_top_ans: json.dump(top_pred_ans, g) g.write('\n') g.close() if args.do_train: '''save the model and some kb cache''' model_to_save = policy.module if hasattr(policy, 'module') else policy torch.save(model_to_save.state_dict(), save_model_file) Save_KB_Files(convert_json_to_save(KB), save_kb_cache) Save_KB_Files(M2N, save_m2n_cache) Save_KB_Files(list(QUERY), save_query_cache) with open(save_eval_file, "a") as writer: logger.info("***** Eval results (%s)*****" %epoch) writer.write("***** Eval results (%s)*****\n" %epoch) for key in sorted(result.keys()): logger.info(" %s=%s", key, str(result[key])) writer.write("%s=%s \n" %(key, str(result[key])))
def train():
    tf.logging.set_verbosity(tf.logging.INFO)
    do_train = True
    do_eval = True
    do_test = True
    max_seq_length = 50
    batch_size = 256
    epochs = 200
    warmup_proportion = 0.1
    log_steps = 500
    model_save_dir = './save'
    train_data_dir = './train_data'
    raw_data_dir = './data'

    tokenizer = Tokenizer()
    create_vocab(raw_data_dir, train_data_dir, tokenizer)
    train_examples = get_train_examples(raw_data_dir, 'train')
    # print(len(train_examples))
    train_file = os.path.join(train_data_dir, "train.tf_record")
    file_based_convert_examples_to_features(train_examples, train_data_dir, max_seq_length, tokenizer, train_file)

    config = basic_model_classify.basic_config()
    config.vocab_size = len(tokenizer.vocab)
    config.max_length = max_seq_length
    config.n_tags = len(tokenizer.tags)
    config.batch_size = batch_size
    config.test = False

    num_train_steps = int(len(train_examples) / batch_size * epochs)
    num_warmup_steps = int(num_train_steps * warmup_proportion)
    # _trainining_hooks=_log()
    # _trainining_hooks=None
    model_fn = model_fn_builder(config=config,
                                num_labels=config.n_tags,
                                learning_rate=config.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps)

    session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    session_config.gpu_options.allow_growth = True
    run_config = tf.estimator.RunConfig(model_dir=model_save_dir,
                                        log_step_count_steps=log_steps,
                                        session_config=session_config)
    estimater = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=model_save_dir,
                                       params={'batch_size': config.batch_size},
                                       config=run_config)

    if do_train:
        tf.logging.info("train examples length:{}".format(len(train_examples)))
        tf.logging.info("train total steps:{}".format(num_train_steps))
        input_fn = file_based_input_fn_builder(train_file, config.max_length, True, True)
        estimater.train(input_fn, steps=num_train_steps)

    if do_eval:
        eval_examples = get_train_examples(raw_data_dir, 'test')
        tf.logging.info("eval examples length:{}".format(len(eval_examples)))
        eval_file = os.path.join(train_data_dir, "test.tf_record")
        file_based_convert_examples_to_features(eval_examples, train_data_dir, max_seq_length, tokenizer, eval_file)
        input_fn = file_based_input_fn_builder(eval_file, config.max_length, False, False)
        num_eval_steps = int(len(eval_examples) / batch_size)
        tf.logging.info("eval total steps:{}".format(num_eval_steps))
        result = estimater.evaluate(input_fn, steps=num_eval_steps)
        output_eval_file = os.path.join('./', "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if do_test:
        test_examples = get_train_examples(raw_data_dir, 'test')
        tf.logging.info("test examples length:{}".format(len(test_examples)))
        test_file = os.path.join(train_data_dir, "test.tf_record")
        file_based_convert_examples_to_features(test_examples, train_data_dir, max_seq_length, tokenizer, test_file)
        input_fn = file_based_input_fn_builder(test_file, config.max_length, False, False)
        num_test_steps = int(len(test_examples) / batch_size)
        result = estimater.predict(input_fn)
        result = [i for i in result]
        true = [i['real_label'] for i in result]
        false = [i['pre_label'] for i in result]
        with open("test_tmp.txt", 'w') as f:
            res = classification_report(true, false)
            print(res)
            f.write(res)
bleu_cmd += ["-lc"] bleu_cmd += [reference_file.name] try: bleu_out = subprocess.check_output(bleu_cmd, stdin=read_pred, stderr=subprocess.STDOUT) bleu_out = bleu_out.decode("utf-8") bleu_score = re.search(r"BLEU = (.+?),", bleu_out).group(1) bleu_score = float(bleu_score) except subprocess.CalledProcessError as error: if error.output is not None: print("multi-bleu.perl script returned non-zero exit code") print(error.output) bleu_score = np.float32(0.0) # Close temp files hypothesis_file.close() reference_file.close() return bleu_score if __name__ == "__main__": from tokenization import Tokenizer tokenizer = Tokenizer( vocab_file='./src/utils/pretrain-data/gpt2-vocab.json', merge_file='./src/utils/pretrain-data/gpt2-merges.txt') b = BLEU(tokenizer) b.update(['I am his fathers.', 'You are here.'], ['I am his father.', 'I am here.']) print(b.bleu, type(b.bleu))
if __name__ == '__main__':
    config = argparser()
    print(config)

    # Select tokenizer
    config.tokenizer = config.tokenizer.lower()
    if config.tokenizer == TOKENIZER[0]:
        from nltk.tokenize import word_tokenize
        tokenization_fn = word_tokenize
    elif config.tokenizer == TOKENIZER[1]:
        from konlpy.tag import Mecab
        tokenization_fn = Mecab().morphs
    tokenizer = Tokenizer(tokenization_fn=tokenization_fn,
                          is_sentence=config.is_sentence,
                          max_seq_length=config.max_seq_length)

    # Tokenization & read tokens
    list_of_tokens = []
    with open(config.corpus, 'r', encoding='utf-8', errors='ignore') as reader:
        for li, line in enumerate(reader):
            text = ' '.join(line.split('\t')[1:]).strip()
            list_of_tokens += tokenizer.tokenize(text)

    # Build vocabulary
    vocab = Vocab(list_of_tokens=list_of_tokens,
                  unk_token=config.unk_token,
                  pad_token=config.pad_token,
                  bos_token=config.bos_token,
                  eos_token=config.eos_token,
def create_tfrecords(params, write_remainder=True, write_every_n_files=1, resume_from_checkpoint=True, display_pbar=True):
    # iterates through files in input_dir, splitting into <args.chunk_size> chunks and
    # saving a tfrecords file every <args.files_per> chunks.
    files, args, process_no = params

    if args.wwm:
        print("WWM Masking ON")
        enc = WWMTokenizer(args.seq_len)
    else:
        print("No WWM Masking")
        enc = Tokenizer()

    # init metadata
    discarded_files = 0
    files_processed = 0
    tfrecord_count = 0
    pbar = tqdm(desc=f"Writing TFRecord Files to {args.output_dir}. Parsed 0 input files. files_written ",
                disable=not display_pbar)
    checkpoint_path = f"{args.output_dir}/processed_files.txt"

    input_ids_to_prepend = []
    labels_to_prepend = []
    input_ids_list_array = []
    labels_list_array = []
    files_processed_list = []
    for f in files:
        # Read in most updated list of processed files & skip if already processed
        resume_files_processed = read_checkpoint(checkpoint_path, resume_from_checkpoint)
        if f in resume_files_processed:
            continue

        for input_ids_list, labels_list in archive_to_tokens(f, enc, args):
            # input_ids_list is a whole file chunked in lists of seq_len
            files_processed += 1

            # if the last chunk < chunk size, but > minimum_size, take it and
            # append it to the beginning of the next file
            n_tokens = len(input_ids_list[-1])
            if n_tokens < args.seq_len:
                input_ids_last = input_ids_list.pop(-1)
                labels_last = labels_list.pop(-1)
                if n_tokens >= args.minimum_size:
                    input_ids_to_prepend.extend(input_ids_last)
                    labels_to_prepend.extend(labels_last)
                else:
                    discarded_files += 1

            if len(input_ids_to_prepend) >= args.seq_len:
                # if length of data_to_prepend becomes greater than chunk size,
                # add concatted files to tokenized files
                input_ids_list_array.append(input_ids_to_prepend[:args.seq_len])
                input_ids_to_prepend = input_ids_to_prepend[args.seq_len:]
                labels_list_array.append(labels_to_prepend[:args.seq_len])
                labels_to_prepend = labels_to_prepend[args.seq_len:]

            # add tokenized files > chunk size to main array
            input_ids_list_array.extend(input_ids_list)
            labels_list_array.extend(labels_list)

            if len(labels_list_array) >= args.files_per * write_every_n_files:
                # write every n files
                _tfrecord_count, input_ids_remainder, labels_remainder = write_files(
                    input_ids_list_array, labels_list_array,
                    files_per=args.files_per, output_dir=args.output_dir, out_name=args.name,
                    start_no=tfrecord_count, process_no=process_no)
                pbar.update(_tfrecord_count - tfrecord_count)  # update progress bar
                pbar.set_description(f"Writing TFRecord Files to {args.output_dir}. Parsed {files_processed} input files. files_written ")
                tfrecord_count = _tfrecord_count
                # add remaining files to next chunk
                input_ids_list_array = input_ids_remainder if input_ids_remainder is not None else []
                labels_list_array = labels_remainder if labels_remainder is not None else []
                with open(f"{checkpoint_path}", "a") as myfile:
                    for x in files_processed_list:
                        myfile.write(f"{x}, ")
                files_processed_list = []

        # Save the file names to skip next time if not doing all in one go
        files_processed_list.append(f)

    if len(labels_list_array) >= args.files_per:
        # also write at end
        _tfrecord_count, input_ids_remainder, labels_remainder = write_files(
            input_ids_list_array, labels_list_array,
            files_per=args.files_per, output_dir=args.output_dir, out_name=args.name,
            start_no=tfrecord_count, process_no=process_no)
        pbar.update(_tfrecord_count - tfrecord_count)
        pbar.set_description(f"Writing TFRecord Files to {args.output_dir}. Parsed {files_processed} input files. files_written ")
        tfrecord_count = _tfrecord_count
        with open(f"{checkpoint_path}", "a") as myfile:
            for x in files_processed_list:
                myfile.write(f"{x}, ")
        files_processed_list = []
    else:
        # add remaining to remainder
        input_ids_remainder = input_ids_list_array
        labels_remainder = labels_list_array

    if write_remainder:
        # write out the remaining files even if there's less than files_per
        write_files(input_ids_list_array, labels_list_array, files_per=args.files_per,
                    output_dir=args.output_dir, out_name=args.name,
                    start_no=tfrecord_count, write_remainder=True)

    successful_files = files_processed - discarded_files
    return {"discarded": discarded_files, "processed": files_processed, "successful": successful_files}
                                       max_para_len=512, max_char_len=32, total_vocab=total_vocab)
    dev_examples = example_wordpiece(dev_examples, max_ques_len=64, max_para_len=512, max_char_len=32,
                                     total_vocab=total_vocab)
    # test_examples = example_wordpiece(test_examples, max_ques_len=64, max_para_len=512, max_char_len=32,
    #                                   total_vocab=total_vocab)

    # word_counter = search_words(train_examples + dev_examples + test_examples)
    word_counter = search_words(train_examples + dev_examples)
    word_counter = filter_words(word_counter, min_count=18)
    tokenization = Tokenizer(
        vocab_file='/home/liwei/data/Tencent_AILab_ChineseEmbedding.txt',
        word_counter=word_counter)

    if not os.path.exists('dataset/preprocessed_data/'):
        os.makedirs('dataset/preprocessed_data/')
    with open('dataset/preprocessed_data/vocab.json', 'w') as w:
        json.dump(tokenization.vocab, w, indent=4)
    np.save('dataset/preprocessed_data/embedding_mat.npy', tokenization.embedding)

    examples_to_features(train_examples, type='train', is_training=True,
                         max_para_len=512, tokenization=tokenization)
    examples_to_features(dev_examples, type='dev',
        # Save model
        torch.save(model.state_dict(), '{}_lm{}.pth'.format(config.model_type.lower(), epoch))


if __name__ == '__main__':
    config = argparser()
    print(config)

    # Load vocabulary
    import pickle
    with open(config.vocab, 'rb') as reader:
        vocab = pickle.load(reader)

    # Select tokenizer
    if config.tokenizer == 'mecab':
        from konlpy.tag import Mecab
        tokenizer = Tokenizer(tokenization_fn=Mecab().morphs,
                              vocab=vocab,
                              max_seq_length=config.max_seq_len)

    # Build dataloader
    train_loader = DataLoader(dataset=Corpus(corpus_path=config.train_corpus,
                                             tokenizer=tokenizer,
                                             model_type=config.model_type,
                                             cuda=config.cuda),
                              batch_size=config.batch_size,
                              shuffle=config.shuffle,
                              drop_last=True)
    if config.test_corpus:
        test_loader = DataLoader(dataset=Corpus(corpus_path=config.test_corpus,
                                                tokenizer=tokenizer,
                                                model_type=config.model_type,
                                                cuda=config.cuda),
                                 batch_size=config.batch_size,
            os.makedirs(os.path.join(args["data_path"], "temp"))
        train_df.to_pickle(train_pre_path)
        val_df.to_pickle(val_pre_path)
        test_df.to_pickle(test_pre_path)
    else:
        train_df = pd.read_pickle(train_pre_path)
        val_df = pd.read_pickle(val_pre_path)
        test_df = pd.read_pickle(test_pre_path)

    ## get data and train columns
    data_column = list(set(train_df.columns) - set(args["targets"]))[0]

    if run_tokenization:
        ## do tokenization
        print("Tokenize")
        tokenizer = Tokenizer(tokenizeStr=tokenizer_model[0],
                              fasttextFile=args["fasttext_file"],
                              doLower=args["doLower"])
        train_df[data_column] = tokenizer.fit_transform(train_df[data_column])
        val_df[data_column] = tokenizer.transform(val_df[data_column])
        test_df[data_column] = tokenizer.transform(test_df[data_column])

        ## save the preprocessed data
        if not os.path.exists(os.path.join(args["data_path"], "temp")):
            os.makedirs(os.path.join(args["data_path"], "temp"))
        train_df.to_pickle(train_tok_path)
        val_df.to_pickle(val_tok_path)
        test_df.to_pickle(test_tok_path)
    else:
        train_df = pd.read_pickle(train_tok_path)
        val_df = pd.read_pickle(val_tok_path)
import os
import json

import responder

from tokenization import Tokenizer

env = os.environ
DEBUG = env['DEBUG'] in ['1', 'True', 'true']
LANG = env.get('LANG')
MECAB_ARGS = env.get('MECAB_ARGS')

api = responder.API(debug=DEBUG)
tokenizer = Tokenizer(lang=LANG, mecab_args=MECAB_ARGS)


@api.route("/")
async def tokenize(req, resp):
    body = await req.text
    texts = json.loads(body)
    docs = [tokenizer.tokenize(text) for text in texts]
    resp.media = dict(data=docs)


if __name__ == "__main__":
    api.run()
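A minimal client sketch for the service above, assuming it runs on responder's default port (5042); the handler json.loads() the raw body, so the request body is a JSON-encoded list of strings:

import json
import requests  # assumed available; any HTTP client works

texts = ["これはペンです。", "Tokenize me, please."]
resp = requests.post("http://localhost:5042/", data=json.dumps(texts))
print(resp.json()["data"])  # one token list per input text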
import yaml
from yaml import Loader

from tokenization import Tokenizer

tokenizer = Tokenizer(False)


def load_scenarios():
    with open('../config/scenarios.yml', 'r') as f:
        data = yaml.load(f.read(), Loader=Loader)
    scenarios = []
    for scenario in data.values():
        scenario['triggers'] = [tokenizer.transform(trig) for trig in scenario['trigger'].split('|')]
        del scenario['trigger']
        scenario['responses'] = scenario['response'].split('|')
        del scenario['response']
        scenarios.append(scenario)
    return scenarios
    # Load vocabulary
    with open(config.vocab, 'rb') as reader:
        vocab = pickle.load(reader)

    # Select tokenizer
    config.tokenizer = config.tokenizer.lower()
    if config.tokenizer == TOKENIZER[0]:
        from nltk.tokenize import word_tokenize
        tokenization_fn = word_tokenize
    elif config.tokenizer == TOKENIZER[1]:
        from konlpy.tag import Mecab
        tokenization_fn = Mecab().morphs
    tokenizer = Tokenizer(tokenization_fn=tokenization_fn,
                          vocab=vocab,
                          is_sentence=config.is_sentence,
                          max_seq_length=config.max_seq_length)

    # Build dataloader
    train_dataset = Corpus(corpus_path=config.train_corpus, tokenizer=tokenizer, cuda=config.cuda)
    valid_dataset = Corpus(corpus_path=config.valid_corpus, tokenizer=tokenizer, cuda=config.cuda)
    train_loader = DataLoader(dataset=train_dataset, batch_size=config.batch_size, shuffle=config.shuffle)
    valid_loader = DataLoader(dataset=valid_dataset, batch_size=config.batch_size, shuffle=config.shuffle)
        if not os.path.exists(os.path.join(args["data_path"], "temp")):
            os.makedirs(os.path.join(args["data_path"], "temp"))
        np.save(train_pre_path.format("data"), train_data, allow_pickle=True)
        np.save(val_pre_path.format("data"), val_data, allow_pickle=True)
        np.save(test_pre_path.format("data"), test_data, allow_pickle=True)
        np.save(train_pre_path.format("target"), train_target, allow_pickle=True)
        np.save(val_pre_path.format("target"), val_target, allow_pickle=True)
        np.save(test_pre_path.format("target"), test_target, allow_pickle=True)

    if run_tokenization:
        ## do tokenization
        print("Tokenize")
        tokenizer = Tokenizer(args=tokenizer_model,
                              fasttextFile=args["fasttext_file"],
                              doLower=args["doLower"])
        train_data = tokenizer.fit_transform(train_data)
        val_data = tokenizer.transform(val_data)
        test_data = tokenizer.transform(test_data)

        ## save the preprocessed data
        if not os.path.exists(os.path.join(args["data_path"], "temp")):
            os.makedirs(os.path.join(args["data_path"], "temp"))
        if sparse.issparse(train_data):
            sparse.save_npz(train_tok_path.format("data"), train_data)
        else:
            np.save(train_tok_path.format("data"), train_data)
        np.save(train_tok_path.format("target"), train_target)
        if sparse.issparse(val_data):
            sparse.save_npz(val_tok_path.format("data"), val_data)
    preprocessor = Preprocessor(doLower=args["doLower"],
                                doLemmatization=args["doLemmatization"],
                                removeStopWords=args["removeStopWords"],
                                doSpellingCorrection=args["doSpellingCorrection"],
                                removeNewLine=args["removeNewLine"],
                                removePunctuation=args["removePunctuation"],
                                removeHtmlTags=args["removeHtmlTags"],
                                minTextLength=args["minTextLength"])
    predict_df["processed"] = preprocessor.fit_transform(predict_df["text_german"])
    predict_df = predict_df.dropna(subset=["processed"], axis=0)

    print("Tokenize")
    tokenizer = Tokenizer(tokenizeStr=preperation_technique,
                          ngram=preperation_ngram,
                          fasttextFile=args["fasttext_file"],
                          doLower=args["doLower"])
    predict_df["processed"] = tokenizer.fit_transform(predict_df["processed"])

    ## for testing purposes
    # train_df = train_df.sample(100)
    # val_df = val_df.sample(20)
    # test_df = test_df.sample(20)

    ## apply the model
    labels = [
        "price_pos", "price_neg", "quality_pos", "quality_neg",
        "restaurant_pos", "restaurant_neg", "food_pos", "food_neg",
        "drinks_pos", "drinks_neg", "ambience_pos", "ambience_neg",
        "service_pos", "service_neg"
    ]
from tokenization import Tokenizer
from flask import Flask, request
import json
import numpy as np

from loadScenarios import load_scenarios
from comparaison import compare_tokens

app = Flask(__name__)
tokenizer = Tokenizer(using_stopwords=False)
scenarios = load_scenarios()
threshold = 0.9


@app.route('/', methods=['POST', 'GET'])
def api():
    args = dict(request.form)
    message = args['content']
    message = tokenizer.transform(message)
    print(message)

    response = "I don't understand ..."
    if len(message) == 0:
        return response

    max_similarity = 0
    for scenario in scenarios:
        similarity = compare_tokens(message, scenario['triggers'])
        print(similarity, scenario['responses'])
        if similarity > max_similarity and similarity > threshold:
            response = scenario['responses'][np.random.randint(
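A small sketch of exercising this endpoint, assuming the app runs locally on Flask's default port 5000; the handler reads the user message from the content form field:

import requests  # assumed available

# Send the message as form data, matching request.form['content'] above.
reply = requests.post("http://localhost:5000/", data={"content": "Hello there!"})
print(reply.text)  # the bot's response string, or "I don't understand ..."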
# coding=utf-8
import os
import re
import numpy as np
import random
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from tokenization import Tokenizer

tokenizer = Tokenizer()

input_file = "trec06p/label/index"
data_dir = "trec06p/data"
num_labels = 2
M = 2
alpha = 0.01
seed = 888
random.seed(seed)

cand_nums = [21, 24, 21, 6]
features_num = len(cand_nums)
feature_weights = [3, 0, 5, 0]
train_set_rate = 0.05

from_features = [
    '[UNK]', 'hotmail', 'lingo', 'gmail', 'yahoo', 'aol', '0451', 'iname',
    'singnet', 'www.loveinfashion', 'o-himesama', 'aries.livedoor', 'oh-oku',
    'msn', 'paypal', 'tc.fluke', 'ey', 'specialdevices', 'buta-gori',
    'plan9.bell-labs', 'halcyon'
def main():
    parser = argparse.ArgumentParser()
    # model structure
    parser.add_argument('--rnncell', type=str, default='LSTM')
    parser.add_argument('--emsize', type=int, default=200)
    parser.add_argument('--nhid', type=int, default=600)
    parser.add_argument('--outsize', type=int, default=400)
    parser.add_argument('--nlayers', type=int, default=2)
    parser.add_argument('--bidirec', action='store_true')
    parser.add_argument('--autoenc', action='store_true')
    parser.add_argument('--forget-bias', type=float, default=False)
    parser.add_argument('--decoder-bias', action='store_true')
    # data
    parser.add_argument('--corpus', type=str, default='guten')
    parser.add_argument('--min-len', type=int, default=10)
    parser.add_argument('--max-len', type=int, default=80)
    # vocabulary
    parser.add_argument('--vocab', type=str, default=None)
    parser.add_argument('--lower', action='store_true')
    parser.add_argument('--min-cnt', type=int, default=6)
    # training
    parser.add_argument('--dropout', type=float, default=0.1)
    parser.add_argument('--seed', type=int, default=3333)
    parser.add_argument('--batch-size', type=int, default=20)
    parser.add_argument('--eval-batch-size', type=int, default=10)
    # optimizer
    parser.add_argument('--optim', type=str, default='SGD')
    parser.add_argument('--lr', type=float, default=.5)
    parser.add_argument('--clip', type=float, default=5.0)
    parser.add_argument('--decay-after', type=int, default=5)
    parser.add_argument('--decay-rate', type=float, default=0.5)
    parser.add_argument('--decay-period', type=int, default=1)
    parser.add_argument('--epochs', type=int, default=10)
    # save and log
    parser.add_argument('--save-dir', type=str, default='train/noname')
    parser.add_argument('--log-interval', type=int, default=10000)
    parser.add_argument('--eval-interval', type=int, default=10000)
    parser.add_argument('--save-all', action='store_false')
    parser.add_argument('--save-period', type=int, default=1)
    args = parser.parse_args()

    logger.debug("Running {}".format(__file__))
    if not os.path.exists(args.save_dir):
        logger.debug("Creating directory at {}".format(args.save_dir))
        os.makedirs(args.save_dir)
    args_path = os.path.join(args.save_dir, 'args.json')
    with open(args_path, 'w') as f:
        logger.debug("Saving arguments at {}".format(args_path))
        json.dump(vars(args), f, indent=2)
    log_path = os.path.join(args.save_dir, 'log.txt')
    file_handler = logging.FileHandler(log_path, mode='w')
    file_handler.setLevel(logging.INFO)
    logger.addHandler(file_handler)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Set the random seed manually for reproducibility.
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    # Use pre-built vocabulary if it exists
    if args.vocab and os.path.exists(args.vocab):
        vocab = load_vocab(args.vocab)
        update = False
    else:
        vocab = Vocabulary()
        update = True
    tokenizer = Tokenizer(vocab, args.lower)

    tr_txts = get_txts(args.corpus, 'train')
    va_txts = get_txts(args.corpus, 'valid')
    tr_input = LineInput(tr_txts, tokenizer, update, args.min_len, args.max_len)
    va_input = LineInput(va_txts, tokenizer, update, args.min_len, args.max_len)
    va_batches = va_input.batchify(args.eval_batch_size, False)
    if update:
        vocab.build_from_counter(args.min_cnt)
    logger.debug("Built vocab of size {}".format(len(vocab)))

    # Build the model
    model = WordRNN(len(vocab), len(vocab), args.rnncell, args.emsize,
                    args.outsize, args.nhid, args.nlayers, args.bidirec,
                    args.autoenc, args.decoder_bias, args.forget_bias, args.dropout)
    logger.debug(model)
    model.to(device)
    learnables = list(filter(lambda p: p.requires_grad, model.parameters()))
    optimizer = getattr(optim, args.optim)(learnables, lr=args.lr)

    save_vocab(vocab, os.path.join(args.save_dir, 'vocab.txt'))
    model_path = os.path.join(args.save_dir, 'model.pt')

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        # Loop over epochs.
        best_val_loss = None
        logger.info('-' * 79)
        for epoch in range(1, args.epochs + 1):
            epoch_start_time = time.time()
            tr_batches = tr_input.batchify(args.batch_size, True)
            train(model, tr_batches, learnables, optimizer, device, args)
            val_loss = evaluate(model, va_batches, device)
            logger.info('-' * 79)
            logger.info('| end of epoch {:2d} | time: {:5.2f}s '
                        '| valid loss {:5.2f} | valid ppl {:8.2f} |'.format(
                            epoch, (time.time() - epoch_start_time),
                            val_loss, math.exp(val_loss)))
            logger.info('-' * 79)
            updated_best = not best_val_loss or val_loss < best_val_loss
            if epoch >= args.decay_after > 0:
                if (epoch - args.decay_after) % args.decay_period == 0:
                    for group in optimizer.param_groups:
                        group['lr'] *= args.decay_rate
            if (epoch % args.save_period == 0) and (updated_best or args.save_all):
                if args.save_all:
                    model_path = os.path.join(args.save_dir, 'ep{}.pt'.format(epoch))
                torch.save(model.state_dict(), model_path)
                if updated_best:
                    best_val_loss = val_loss
        logger.debug("Completed training and saved to {}".format(args.save_dir))
    except KeyboardInterrupt:
        logger.debug('-' * 79)
        logger.debug("Exiting from training early")