def get(self):
    """Run one batch of a job and finish the job once no batches remain.

    Loads the batch named by the ``batchid`` request parameter, stores a
    ``User`` entity (with randomly generated placeholder friends) for every
    batch user that is not stored yet, deletes the batch and, when no other
    batch is left under the parent job, marks the job as done and enqueues
    the ``/tasks/FinishJob`` task.
    """
    batchid = self.request.get('batchid')
    batch = db.get(batchid)

    new_users = []
    for user in batch.users:
        # Per the db.Query.filter documentation the operator is separated
        # from the property name by a space; written as "nickname=" the whole
        # token is treated as the property name and existing users are never
        # found.
        if User.all().filter("nickname =", user).count() > 0:
            continue
        # TODO query FF for the users prescriptions
        n = random.randint(1, 10)
        friends = ["user%02d" % random.randint(0, 19) for _ in range(n)]
        new_users.append(User(nickname=user, friends=friends))
    db.put(new_users)

    # Remember the parent (job) key before the batch entity is deleted.
    key = db.Key(str(batch.parent_key()))
    batch.delete()

    # Count the batches that are still stored under the same job.
    nbatches = Batch.all(keys_only=True).ancestor(key).count()
    logging.info("nbatches=%d" % nbatches)
    if nbatches == 0:
        job = db.get(key)
        job.done = True
        job.put()
        task = Task(method="GET", url="/tasks/FinishJob?jobid=%s" % job.key())
        task.add()
def evaluate_epoch_sage(epoch, eval_type='valid'):
    """Log the mean language-model loss and perplexity on a held-out split.

    Args:
        epoch: Epoch number, used only in the log header.
        eval_type: ``'valid'`` evaluates ``valid['text']``; any other value
            evaluates ``test['text']``.
    """
    import torch  # local import: only needed for the no_grad() guard below

    # initialize
    logger.info('\n{} : Epoch {}'.format(eval_type.upper(), epoch))
    model.eval()
    text = valid['text'] if eval_type == 'valid' else test['text']
    all_costs = []

    # No backward pass happens here, so skip autograd bookkeeping.
    with torch.no_grad():
        for stidx in range(0, len(text[0]), params.bptt_size):
            # prepare batch: one extra column so inputs and shifted targets
            # can both be cut from the same window
            text_batch = text[:, stidx:stidx + params.bptt_size + 1]
            b = Batch(text_batch, [], encoder['_pad_'])

            # model forward (the sememe model returns an extra output that is
            # not needed for the LM loss)
            if params.sememe:
                text_y_hat, _ = model(b, clf=False, lm=True)
            else:
                text_y_hat = model(b, clf=False, lm=True)

            # loss
            loss = model.compute_lm_loss(text_y_hat, b.text_y, b.text_loss_mask)
            all_costs.append(loss.data.item())

    mean_cost = np.mean(all_costs)
    logger.info('loss {}; perplexity: {}'.format(
        round(mean_cost, 2),
        round(np.exp(mean_cost), 2),
    ))
def batch_train(self, data, train_batches, responses=None):
    ''' Train Model for a Batch of Input Data

    Shuffles a copy of ``train_batches`` and fits the model on each entry.
    The first element of an entry is a tag and the remaining elements are the
    indices handed to ``Batch``; ``responses`` is only passed on for entries
    tagged ``2``. Entries without indices are skipped.

    Returns:
        The summed total loss over all fitted batches.
    '''
    batches = train_batches.copy()
    np.random.shuffle(batches)

    total_cost = 0.0  # Total Loss
    total_seq = 0.0   # Sequence Loss
    total_pgen = 0.0  # Pgen Loss

    for indecies in batches:
        idx = indecies[0]
        indecies = indecies[1:]
        if len(indecies) == 0:
            continue

        batch_responses = responses if idx == 2 else None
        batch_entry = Batch(data, indecies, args, glob, batch_responses, train=True)

        cost_t, seq_loss, pgen_loss = self.model.fit(batch_entry)
        total_seq += seq_loss
        total_pgen += pgen_loss
        total_cost += cost_t

    print('\nTotal L:{:.2f}, Sequence L:{:.2f}, P-Gen L:{:.2f}'.format(
        total_cost, total_seq, total_pgen))
    return total_cost
def get(self):
    """Create a job for the requested room and fan it out into batch tasks.

    Reads the ``room`` request parameter. Without a room name the index
    template is rendered with an error message and nothing else happens.
    Otherwise a ``Job`` is stored, its users are split into ``Batch``
    entities of ``BATCHSIZE`` users, one ``/tasks/RunBatch`` task is enqueued
    per batch, and the client is redirected to the wait page of the job.
    """
    path = os.path.join(os.path.dirname(__file__), 'index.html')
    room = self.request.get('room')
    # Test for falsiness rather than ``is None`` so that an empty parameter
    # is rejected as well, and stop here instead of creating a job for it.
    if not room:
        self.response.out.write(template.render(path, {"body": "Need a room name"}))
        return

    # TODO: query for users from friendfeed
    users = ['user%02d' % i for i in range(20)]
    job = Job(roomname=room, done=False, users=users, score=[], ready=False)
    job.put()

    # TODO: Tune BATCHSIZE
    BATCHSIZE = 4
    for start in range(0, len(users), BATCHSIZE):
        batch = Batch(parent=job, users=users[start:start + BATCHSIZE], done=False)
        batch.put()
        task = Task(method="GET", url="/tasks/RunBatch?batchid=%s" % batch.key())
        task.add()
    self.redirect("/wait?jobid=%s" % job.key())
def play(config):
    """Summarize text typed on the console, forever.

    Builds the vocabulary, restores the model weights stored under the
    ``'model'`` key of the checkpoint at ``config.test_from`` (loaded onto the
    CPU) and then repeatedly prompts for an article, decodes it with beam
    search and prints the resulting summary. The loop has no exit condition.
    """
    vocabulary = Vocab(config.vocab_file, config.vocab_size)
    checkpoint = torch.load(config.test_from, map_location='cpu')
    summarizer = Model(config, is_eval=True)
    summarizer.load_state_dict(checkpoint['model'])
    searcher = BeamSearch(summarizer, config, 'intractive')

    while True:
        raw_text = input('Input something to summarize:')
        tokens = word_tokenize(raw_text.lower())
        example = Example(
            config, vocabulary, {'article': tokens, 'abstract': 'for test'})
        # The same example is repeated once per beam.
        batch = Batch([example] * config.beam_size)
        hypothesis = searcher.beam_search(batch)
        summary = searcher.get_summary(hypothesis, batch)
        print('Summary decoded:')
        print(summary)
def evaluate_epoch_csu(epoch, eval_type='valid'):
    """Classify one sentence read from the console and print its label name.

    Neither ``epoch`` nor ``eval_type`` is used. Returns ``None`` without
    printing anything when the entered line is blank.
    """
    empty_labels = np.array([[]])
    sentence = input('>>> ').strip()
    if not sentence:
        return

    # Map tokens to ids (unknown tokens fall back to '_unk_') and keep at
    # most params.cut_down_len of them; the result is a single-row array.
    token_ids = [encoder.get(tok, encoder['_unk_'])
                 for tok in sentence.split()[:params.cut_down_len]]
    text = np.array([token_ids])

    model.eval()

    predictions, gold = [], []
    step = params.batch_size
    for start in range(0, len(text), step):
        rows = text[start:start + step].tolist()
        padded = pad_batch(rows, encoder, pad_start_end=True)
        label_rows = empty_labels[start:start + step]
        batch = Batch(padded, label_rows, encoder['_pad_'])

        logits = model(batch, clf=True, lm=False)

        # index of the highest-scoring class for every row
        best = logits.max(1)[1].data.cpu().numpy().astype(float)
        predictions.extend(best.tolist())
        gold.extend(label_rows.tolist())

    predictions = np.array(predictions)
    gold = np.array(gold)

    id2label = {idx: name for name, idx in params.label2id.items()}
    print(id2label[int(predictions[0])])
def evaluate_epoch_csu(epoch, eval_type='valid'):
    """Evaluate classification accuracy and dump the per-example hits.

    ``eval_type`` selects the split: ``'train'``, ``'valid'``, or anything
    else for the test split (for which the 'headline' dataset falls back to
    the validation data). The 0/1 correctness vector is saved to
    ``exp/adv/acc.<runid>.npy`` and the mean accuracy is logged.
    """
    logger.info('\n{} : Epoch {}'.format(eval_type.upper(), epoch))
    model.eval()

    # pick the split; data is evaluated in its stored order (no shuffle)
    if eval_type == 'train':
        split = train
    elif eval_type == 'valid' or params.dataset == 'headline':
        split = valid
    else:
        split = test
    text, label = split['text'], split['label']

    predicted, gold = [], []
    step = params.batch_size
    for start in range(0, len(text), step):
        padded = pad_batch(text[start:start + step].tolist(), encoder,
                           pad_start_end=True)
        gold_chunk = label[start:start + step]
        batch = Batch(padded, gold_chunk, encoder['_pad_'])

        logits = model(batch, clf=True, lm=False)

        # index of the highest-scoring class for every row
        best = logits.max(1)[1].data.cpu().numpy().astype(float)
        predicted.extend(best.tolist())
        gold.extend(gold_chunk.tolist())

    predicted = np.array(predicted)
    gold = np.array(gold)

    A = (predicted == gold).astype(float)
    acc = A.mean()

    runid = params.inputdir.replace('exp/', '').replace('/', '-')
    save_path = 'exp/adv/acc.%s.npy' % runid
    print('Saved ACC to:', save_path)
    np.save(save_path, A)

    logger.info('{}; acc {}'.format(epoch, round(acc, 3)))
def train_epoch_psvg(epoch):
    """Train the language model for one pass over ``train['text']``.

    The text is consumed in windows of ``params.bptt_size`` columns. Every
    ``params.log_interval`` steps the windowed mean loss, its perplexity, the
    current learning rate and the embedding norm are logged. The whole model
    object is saved to ``params.outputdir`` when the pass is finished.
    """
    logger.info('\nTRAINING : Epoch {}'.format(epoch))
    model.train()

    corpus = train['text']
    window = params.bptt_size
    recent_losses = []

    for stidx in range(0, len(corpus[0]), window):
        # one extra column so inputs and shifted targets share the slice
        chunk = corpus[:, stidx:stidx + window + 1]
        b = Batch(chunk, [], encoder['_pad_'])

        # forward + loss
        prediction = model(b, clf=False, lm=True)
        loss = model.compute_lm_loss(prediction, b.text_y, b.text_loss_mask)
        recent_losses.append(loss.data.item())

        # backward + optimizer step
        model_opt.optimizer.zero_grad()
        loss.backward()
        model_opt.step()

        # log and reset
        if len(recent_losses) != params.log_interval:
            continue
        mean_loss = np.mean(recent_losses)
        message = '{}; loss {}; perplexity: {}; lr {}; embed_norm: {}'.format(
            stidx,
            round(mean_loss, 2),
            round(np.exp(mean_loss), 2),
            model_opt.rate(),
            model.tgt_embed[0].lut.weight.data.norm(),
        )
        logger.info(message)
        recent_losses = []

    # save
    checkpoint_name = "model-{}.pickle".format(epoch)
    torch.save(model, os.path.join(params.outputdir, checkpoint_name))
def evaluate_epoch_csu(epoch, eval_type='valid', acc_path='/home/anonymous/acc.npy'):
    """Evaluate classification accuracy and dump the per-example hits.

    Args:
        epoch: Epoch number, used only for logging.
        eval_type: ``'train'`` or ``'valid'``; any other value selects the
            test split.
        acc_path: Where the 0/1 correctness vector is written with
            ``np.save``. Defaults to the location that used to be hard-coded,
            so existing callers behave as before.
    """
    # initialize
    logger.info('\n{} : Epoch {}'.format(eval_type.upper(), epoch))
    model.eval()

    # data without shuffle
    if eval_type == 'train':
        text, label = train['text'], train['label']
    elif eval_type == 'valid':
        text, label = valid['text'], valid['label']
    else:
        text, label = test['text'], test['label']

    valid_preds, valid_labels = [], []
    for stidx in range(0, len(text), params.batch_size):
        # prepare batch
        text_batch = pad_batch(text[stidx:stidx + params.batch_size].tolist(),
                               encoder, pad_start_end=True)
        label_batch = label[stidx:stidx + params.batch_size]
        b = Batch(text_batch, label_batch, encoder['_pad_'])

        # model forward
        clf_output = model(b, clf=True, lm=False)

        # evaluation: index of the highest-scoring class for every row
        pred = clf_output.max(1)[1].data.cpu().numpy().astype(float)
        valid_preds.extend(pred.tolist())
        valid_labels.extend(label_batch.tolist())

    valid_preds, valid_labels = np.array(valid_preds), np.array(valid_labels)

    # per-example correctness (1.0 / 0.0); saved before it is averaged
    hits = (valid_preds == valid_labels).astype(float)
    np.save(acc_path, hits)
    acc = hits.mean()

    logger.info('{}; acc {}'.format(epoch, round(acc, 3)))
def train_epoch_sage(epoch):
    """Train the language model (optionally with sememe prediction) for one epoch.

    Walks over ``train['text']`` in windows of ``params.bptt_size`` columns.
    With ``params.sememe`` set, the total loss is the LM loss plus a
    multi-label sememe prediction loss; otherwise only the LM loss is used.
    Gradients are clipped to ``params.max_norm`` before every optimizer step
    and the whole model object is saved when the epoch is finished.
    """
    # initialize
    logger.info('\nTRAINING : Epoch {}'.format(epoch))
    model.train()
    all_costs = []
    all_costs_sp = []
    text = train['text']
    for stidx in range(0, len(text[0]), params.bptt_size):
        # prepare batch
        # text_batch = pad_batch(text[:, stidx: stidx + params.bptt_size].tolist(), encoder, pad_start_end=False)
        # one extra column so inputs and shifted targets share the slice
        text_batch = text[:, stidx:stidx + params.bptt_size + 1]
        # the window without its last column; used to look up sememe targets
        text_batch2 = text_batch[:, :-1]
        b = Batch(text_batch, [], encoder['_pad_'])
        # model forward
        if params.sememe:
            text_y_hat, sememe_y_hat = model(b, clf=False, lm=True)
            # look up the sememe row of every token id and restore the
            # (rows, columns, -1) layout of text_batch2
            # NOTE(review): .cuda() assumes a GPU is available -- confirm
            sememe_y = torch.FloatTensor(
                word2sememe[text_batch2.reshape(-1)].reshape(
                    [text_batch2.shape[0], text_batch2.shape[1], -1])).cuda()
            loss_lm = model.compute_lm_loss(text_y_hat, b.text_y, b.text_loss_mask)
            loss_sp = model.compute_clf_loss(sememe_y_hat, sememe_y, multilabel=True)
            loss = loss_lm + loss_sp
            # only the LM part goes into the loss/perplexity statistics
            all_costs.append(loss_lm.data.item())
            all_costs_sp.append(loss_sp.data.item())
        else:
            text_y_hat = model(b, clf=False, lm=True)
            # loss
            loss = model.compute_lm_loss(text_y_hat, b.text_y, b.text_loss_mask)
            all_costs.append(loss.data.item())
        # backward
        model_opt.optimizer.zero_grad()
        # model_opt.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), params.max_norm)
        # optimizer step
        model_opt.step()
        # log and reset
        if len(all_costs) == params.log_interval:
            logger.info(
                '{}; loss {}; perplexity: {}; lr {}; embed_norm: {}'.format(
                    stidx,
                    round(np.mean(all_costs), 2),
                    round(np.exp(np.mean(all_costs)), 2),
                    params.lr,  #model_opt.rate(),
                    0  #model.tgt_embed.weight.data.norm()
                ))
            all_costs = []
            # NOTE(review): all_costs_sp is never reset, so this is the mean
            # sememe loss since the start of the epoch, not of the last window
            if params.sememe:
                logger.info('sp loss: {}'.format(np.mean(all_costs_sp)))
    # save
    torch.save(model, os.path.join(params.outputdir, "model-{}.pickle".format(epoch)))
def train_epoch_csu(epoch):
    """Train the classifier for one epoch over the shuffled training set.

    The classification loss is always used. When ``params.lm_coef`` is
    non-zero the LM loss (and, with ``params.sememe``, the multi-label sememe
    loss) is added, each scaled by ``params.lm_coef``. Gradients are clipped
    to ``params.max_norm`` and the whole model object is saved at the end.
    """
    # initialize
    logger.info('\nTRAINING : Epoch {}'.format(epoch))
    model.train()
    all_costs, all_accs = [], []
    # shuffle the data
    permutation = np.random.permutation(len(train['text']))
    text = train['text'][permutation]
    label = train['label'][permutation]
    print('TRAIN DATA', len(text))
    for stidx in range(0, len(text), params.batch_size):
        # prepare batch
        text_batch = pad_batch(text[stidx:stidx + params.batch_size].tolist(), encoder, pad_start_end=True)
        # the padded batch without its last column; used to look up sememe targets
        text_batch2 = text_batch[:, :-1]
        label_batch = label[stidx:stidx + params.batch_size]
        b = Batch(text_batch, label_batch, encoder['_pad_'])
        # model forward
        if params.lm_coef == 0.:
            clf_output = model(b, clf=True, lm=False)
        else:
            if params.sememe:
                clf_output, (text_y_hat, sememe_y_hat) = model(b, clf=True, lm=True)
            else:
                clf_output, text_y_hat = model(b, clf=True, lm=True)
        # evaluation: index of the highest-scoring class for every row
        pred = clf_output.max(1)[1].data.cpu().numpy().astype(float)
        acc = (pred == label_batch).astype(float).mean()
        loss = model.compute_clf_loss(clf_output, b.label)
        if params.lm_coef != 0.0:
            lm_loss = model.compute_lm_loss(text_y_hat, b.text_y, b.text_loss_mask)
            loss += params.lm_coef * lm_loss
            # sememe_y_hat only exists when the LM head ran, hence the nesting
            if params.sememe:
                # look up the sememe row of every token id and restore the
                # (rows, columns, -1) layout of text_batch2
                # NOTE(review): .cuda() assumes a GPU is available -- confirm
                sememe_y = torch.FloatTensor(
                    word2sememe[text_batch2.reshape(-1)].reshape(
                        [text_batch2.shape[0], text_batch2.shape[1], -1])).cuda()
                sp_loss = model.compute_clf_loss(sememe_y_hat, sememe_y, multilabel=True)
                loss += params.lm_coef * sp_loss
        all_costs.append(loss.data.item())
        all_accs.append(acc)
        # backward
        model_opt.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), params.max_norm)
        # optimizer step
        model_opt.step()
        # log and reset
        if len(all_costs) == params.log_interval:
            logger.info('{}; loss {}; acc {}; lr {}; embed_norm {}'.format(
                stidx,
                round(np.mean(all_costs), 2),
                round(np.mean(all_accs), 3),
                params.lr,  #model_opt.rate(),
                model.tgt_embed[0].lut.weight.data.norm()))
            all_costs, all_accs = [], []
    # save
    torch.save(model, os.path.join(params.outputdir,
                                   "model-{}.pickle".format(epoch)))
def _prune_to_hierarchy(preds, parents, id2label):
    """Zero, in place, every label whose chain of known ancestors contains a 0.

    Args:
        preds: 2-D prediction array (one row per example, one column per
            label), modified in place.
        parents: Mapping from a label name to the name of its parent.
        id2label: Label names in column order.
    """
    label2id = {name: idx for idx, name in enumerate(id2label)}
    for i in range(preds.shape[0]):
        last_pred_i = preds[i].copy()
        # repeat until a full pass over the labels changes nothing
        while True:
            for j in range(preds.shape[1]):
                now = id2label[j]
                while now in parents:
                    now = parents[now]
                    if now not in label2id:
                        # ancestor is not a predicted label: stop climbing
                        break
                    if preds[i, label2id[now]] == 0:
                        preds[i, j] = 0.
                        break
            if (preds[i] == last_pred_i).all():
                break
            last_pred_i = preds[i].copy()


def evaluate_epoch_csu(epoch, eval_type='valid'):
    """Evaluate the multi-label classifier and log EM / P / R / F1.

    Sigmoid scores are thresholded at 0.5. The raw scores are saved to
    ``<params.outputdir>/scores-<epoch>.npy``. With ``params.hierachical``
    set, predictions are made consistent with the label hierarchy read from
    ``data/parents.json`` and ``data/labels.json`` before scoring.

    Args:
        epoch: Epoch number, used for logging and the score file name.
        eval_type: ``'train'`` or ``'valid'``; any other value selects the
            test split.
    """
    # initialize
    logger.info('\n{} : Epoch {}'.format(eval_type.upper(), epoch))
    model.eval()

    # data without shuffle
    if eval_type == 'train':
        text, label = train['text'], train['label']
    elif eval_type == 'valid':
        text, label = valid['text'], valid['label']
    else:
        text, label = test['text'], test['label']

    valid_scores, valid_preds, valid_labels = [], [], []
    # No backward pass happens here, so skip autograd bookkeeping.
    with torch.no_grad():
        for stidx in range(0, len(text), params.batch_size):
            # prepare batch
            text_batch = pad_batch(text[stidx:stidx + params.batch_size].tolist(),
                                   encoder, pad_start_end=True)
            label_batch = label[stidx:stidx + params.batch_size]
            b = Batch(text_batch, label_batch, encoder['_pad_'])

            # model forward
            clf_output = model(b, clf=True, lm=False)

            # evaluation
            score = torch.sigmoid(clf_output).data.cpu().numpy()
            pred = (score > 0.5).astype(float)
            valid_scores.extend(score.tolist())
            valid_preds.extend(pred.tolist())
            valid_labels.extend(label_batch.tolist())

    valid_scores = np.array(valid_scores)
    valid_preds = np.array(valid_preds)
    valid_labels = np.array(valid_labels)
    np.save('{}/scores-{}.npy'.format(params.outputdir, epoch), valid_scores)

    if params.hierachical:
        # context managers close the files instead of leaking the handles
        with open('data/parents.json') as parents_file:
            parents = json.load(parents_file)
        with open('data/labels.json') as labels_file:
            id2label = json.load(labels_file)
        _prune_to_hierarchy(valid_preds, parents, id2label)

    em = metrics.accuracy_score(valid_labels, valid_preds)
    p, r, f1, s = metrics.precision_recall_fscore_support(
        valid_labels, valid_preds, average='weighted')
    logger.info('{}; em {}; p {}; r {}; f1 {}'.format(
        epoch, round(em, 3), round(p, 3), round(r, 3), round(f1, 3)
    ))
def train_epoch_csu(epoch):
    """Train the multi-label classifier for one epoch on shuffled data.

    Per batch the sigmoid outputs are thresholded at 0.5 to track exact
    match, weighted precision, recall and F1. The loss is the (optionally
    hierarchical) classification loss plus, when ``params.lm_coef`` is
    non-zero, the LM loss scaled by that coefficient. Statistics are logged
    every ``params.log_interval`` steps and the whole model object is saved
    at the end of the epoch.
    """
    logger.info('\nTRAINING : Epoch {}'.format(epoch))
    model.train()

    losses, em_scores, precisions, recalls, f1_scores = [], [], [], [], []

    # shuffle the data
    order = np.random.permutation(len(train['text']))
    text = train['text'][order]
    label = train['label'][order]

    step = params.batch_size
    for stidx in range(0, len(text), step):
        # prepare batch
        padded = pad_batch(text[stidx:stidx + step].tolist(), encoder,
                           pad_start_end=True)
        label_batch = label[stidx:stidx + step]
        b = Batch(padded, label_batch, encoder['_pad_'])

        # model forward; the LM head only runs when its loss is weighted in
        if params.lm_coef == 0.:
            clf_output = model(b, clf=True, lm=False)
        else:
            clf_output, text_y_hat = model(b, clf=True, lm=True)

        # evaluation
        pred = (torch.sigmoid(clf_output) > 0.5).data.cpu().numpy().astype(float)
        em_scores.append(metrics.accuracy_score(label_batch, pred))
        p, r, f1, _ = metrics.precision_recall_fscore_support(
            label_batch, pred, average='weighted')
        precisions.append(p)
        recalls.append(r)
        f1_scores.append(f1)

        # loss
        if params.hierachical:
            loss = model.compute_hierachical_loss(clf_output, b.label)
        else:
            loss = model.compute_clf_loss(clf_output, b.label)
        if params.lm_coef != 0.0:
            lm_loss = model.compute_lm_loss(text_y_hat, b.text_y, b.text_loss_mask)
            loss += params.lm_coef * lm_loss
        losses.append(loss.data.item())

        # backward + optimizer step
        model_opt.optimizer.zero_grad()
        loss.backward()
        model_opt.step()

        # log and reset
        if len(losses) != params.log_interval:
            continue
        message = '{}; loss {}; em {}; p {}; r {}; f1 {}; lr {}; embed_norm {}'.format(
            stidx,
            round(np.mean(losses), 2),
            round(np.mean(em_scores), 3),
            round(np.mean(precisions), 3),
            round(np.mean(recalls), 3),
            round(np.mean(f1_scores), 3),
            model_opt.rate(),
            model.tgt_embed[0].lut.weight.data.norm(),
        )
        logger.info(message)
        losses, em_scores, precisions, recalls, f1_scores = [], [], [], [], []

    # save
    checkpoint_name = "model-{}.pickle".format(epoch)
    torch.save(model, os.path.join(params.outputdir, checkpoint_name))
def batch_train_api(self, data, batches, rl_data, train=True, output=False, epoch_str="", use_gold=False):
    ''' Train Model for a Batch of Input Data

    For every batch entry (first element is a tag, the rest are the indices
    handed to ``Batch``) API actions are predicted unless ``args.rl_mode`` is
    ``"GT"``, rewards are computed with ``calculate_reward`` and, when
    ``train`` is set and the mode is not ``"GT"``, the model is fitted on the
    resulting actions and rewards.

    When ``output`` and ``args.rl`` are set and the mode is not ``'GT'``, a
    log file ``logs/api/<run_id>/<epoch_str>.log`` is opened and handed to
    ``calculate_reward``.

    Returns:
        ``(total_reward, perfect_query_ratio, valid_query_ratio,
        db_results_map)`` where ``db_results_map`` maps dialog ids to their
        DB results. Both ratios are ``0.0`` when no entries were processed.
    '''
    log_file = None
    if output and args.rl and args.rl_mode != 'GT':
        dirName = 'logs/api/' + self.run_id
        if not os.path.exists(dirName):
            # makedirs also creates missing parents such as 'logs/api'
            os.makedirs(dirName)
        log_file = open(dirName + '/' + epoch_str + '.log', 'w+')

    total_cost = 0.0
    total_reward = 0.0
    total_entries_sum = 0.0
    valid_entries_sum = 0.0
    perfect_match_entries_sum = 0.0
    db_results_map = {}

    try:
        for indecies in batches:
            # drop the leading tag; only the indices are needed here
            indecies = indecies[1:]
            batch_entry = Batch(data, indecies, args, glob, train=train)

            # dont run api_predict if we have to just append Ground Truth
            if args.rl_mode == "GT":
                actions = None
                pred_action_lengths = None
            elif args.beam:
                parent_ids, predict_ids = self.model.api_predict(batch_entry)
                actions = calculate_beam_result(parent_ids, predict_ids,
                                                args.max_api_length)
                pred_action_lengths = None
            else:
                preds, pred_action_lengths = self.model.api_predict(batch_entry)
                actions = pad_to_answer_size(list(preds), args.max_api_length, True)

            db_results, batched_actions_and_rewards, high_probable_rewards, total_entries, valid_entries, perfect_match_entries = \
                calculate_reward(glob, actions, pred_action_lengths, batch_entry,
                                 rl_data, self.db_engine, self.model, args, data,
                                 out_file=log_file, mode=args.rl_mode,
                                 epoch_str=epoch_str, use_gold=use_gold)

            total_entries_sum += total_entries
            valid_entries_sum += valid_entries
            perfect_match_entries_sum += perfect_match_entries
            total_reward += high_probable_rewards

            # dont run api_fit if are in Ground Truth mode
            if train and args.rl_mode != "GT":
                for actions_and_rewards in batched_actions_and_rewards:
                    total_cost += self.model.api_fit(batch_entry, actions_and_rewards)

            for dialog_id, response in zip(batch_entry.dialog_ids, db_results):
                db_results_map[dialog_id] = response
    finally:
        # close the log even when prediction or reward computation fails
        if log_file is not None:
            log_file.close()

    # guard against an empty run instead of raising ZeroDivisionError
    if total_entries_sum:
        valid_query_ratio = float(valid_entries_sum) / float(total_entries_sum)
        perfect_query_ratio = float(perfect_match_entries_sum) / float(
            total_entries_sum)
    else:
        valid_query_ratio = 0.0
        perfect_query_ratio = 0.0

    return total_reward, perfect_query_ratio, valid_query_ratio, db_results_map
def batch_predict(self, data, batches, rl_data, output=False, epoch_str=""):
    ''' Get Predictions for Input Data batchwise

    ``batches`` holds three batch lists: pre-API, API and post-API. Only the
    pre and post lists are used here. Each batch entry starts with a tag that
    is dropped; the remaining elements are the indices handed to ``Batch``.
    Post-API batches additionally receive ``data.responses``.

    Predictions are only made when ``args.rl`` is set and the model phase is
    at least 2; otherwise an all-zero metrics dict is returned. ``rl_data``
    is accepted for interface compatibility and is not used.

    Returns:
        The metrics returned by ``evaluate``, or a dict with the keys
        ``'bleu'``, ``'acc'``, ``'dialog'``, ``'f1'`` and ``'comp'`` set to
        zero values.
    '''
    if not (args.rl and self.model.phase >= 2):
        return {
            'bleu': 0.0,
            'acc': 0.0,
            'dialog': 0.0,
            'f1': ((0.0, 0.0, 0.0), (0.0, 0.0, 0.0), (0.0, 0.0, 0.0)),
            'comp': 0.0,
        }

    batches_pre = batches[0]
    batches_post = batches[2]

    predictions = []
    dialog_ids = []
    turn_ids = []
    entities = []
    oov_words = []
    golds = []
    stories = []
    readable_answers = []

    # entries at or beyond this position come from the post-API batches
    post_index = len(batches_pre)
    for i, indecies in enumerate(batches_pre + batches_post):
        indecies = indecies[1:]
        if len(indecies) == 0:
            continue

        # Get predictions
        responses = None if i < post_index else data.responses
        data_batch = Batch(data, indecies, args, glob, responses)

        # Store prediction outputs
        if args.simple_beam:
            parent_ids, predict_ids = self.model.predict(data_batch)
            actions = calculate_beam_result(
                parent_ids, predict_ids, glob['candidate_sentence_size'])
            # keep only the first entry of every action set
            predictions.extend(action[0] for action in actions)
        else:
            preds = self.model.predict(data_batch)
            predictions += pad_to_answer_size(
                list(preds), glob['candidate_sentence_size'])

        dialog_ids += data_batch.dialog_ids
        readable_answers += data_batch.readable_answers
        turn_ids += data_batch.turn_ids
        entities += data_batch.entities
        oov_words += data_batch.oov_words
        golds += data_batch.answers
        stories += data_batch.readable_stories

    # Evaluate metrics
    return evaluate(args, glob, predictions, golds, stories, entities,
                    dialog_ids, turn_ids, readable_answers, oov_words,
                    self.db_engine, out=output, run_id=self.run_id,
                    epoch_str=epoch_str)