def epoch_reinforce_train(model, optimizer, batch_size, sql_data, table_data, db_path):
    engine = DBEngine(db_path)
    model.train()
    perm = np.random.permutation(len(sql_data))
    cum_reward = 0.0
    st = 0
    while st < len(sql_data):
        ed = st + batch_size if st + batch_size < len(perm) else len(perm)
        q_seq, col_seq, col_num, ans_seq, query_seq, gt_whr_seq, raw_data = \
            to_batch_seq(sql_data, table_data, perm, st, ed, ret_vis_data=True)
        g_s = model.generate_g_s(q_seq, col_seq, query_seq)
        raw_q_seq = [x[0] for x in raw_data]    # original questions
        raw_col_seq = [x[1] for x in raw_data]  # original column headers
        query_gt, table_ids = to_batch_query(sql_data, perm, st, ed)
        score = model.forward(q_seq, col_seq, col_num, (True, True, True), reinforce=True)
        clasif_queries = model.gen_query(score, q_seq, col_seq, raw_q_seq, raw_col_seq,
                                         (True, True, True), reinforce=True)

        # Reward: -2 if the predicted query fails to execute, -1 if it executes
        # but returns a different result than the ground truth, +1 if it matches.
        rewards = []
        for sql_gt, sql_pred, tid in zip(query_gt, clasif_queries, table_ids):
            ret_gt = engine.execute(tid, sql_gt['sel'], sql_gt['agg'], sql_gt['conds'])
            try:
                ret_pred = engine.execute(tid, sql_pred['sel'], sql_pred['agg'], sql_pred['conds'])
            except Exception:
                ret_pred = None
            if ret_pred is None:
                rewards.append(-2)
            elif ret_pred != ret_gt:
                rewards.append(-1)
            else:
                rewards.append(1)
        cum_reward += sum(rewards)

        optimizer.zero_grad()
        model.reinforce_backward(score, rewards)
        optimizer.step()
        st = ed
    return cum_reward / len(sql_data)
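# Hypothetical usage sketch (not part of the original code): driving the
# REINFORCE epoch above from a training script. load_dataset, build_model, and
# the db path are assumed placeholder helpers; only the call signature of
# epoch_reinforce_train comes from the function above.
import torch

if __name__ == '__main__':
    sql_data, table_data = load_dataset('train')   # assumed helper
    model = build_model()                          # assumed helper returning a Seq2SQL-style model
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    for epoch in range(10):
        avg_reward = epoch_reinforce_train(model, optimizer, batch_size=64,
                                           sql_data=sql_data, table_data=table_data,
                                           db_path='data/train.db')  # assumed path
        print('epoch {}: avg reward {:.4f}'.format(epoch, avg_reward))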
def epoch_acc(model, batch_size, sql_data, table_data, db_path):
    engine = DBEngine(db_path)
    model.eval()
    perm = list(range(len(sql_data)))
    badcase = 0
    one_acc_num, tot_acc_num, ex_acc_num = 0.0, 0.0, 0.0
    for st in tqdm(range(len(sql_data) // batch_size + 1)):
        ed = (st + 1) * batch_size if (st + 1) * batch_size < len(perm) else len(perm)
        st = st * batch_size
        q_seq, gt_sel_num, col_seq, col_num, ans_seq, gt_cond_seq, raw_data = \
            to_batch_seq(sql_data, table_data, perm, st, ed, ret_vis_data=True)
        # q_seq: char-based sequence of the question
        # gt_sel_num: number of selected columns and aggregation functions (newly added field)
        # col_seq: char-based column names
        # col_num: number of headers in one table
        # ans_seq: (sel, number of conds, sel list in conds, op list in conds)
        # gt_cond_seq: ground truth of conditions
        # raw_data: original question, headers, sql
        query_gt, table_ids = to_batch_query(sql_data, perm, st, ed)
        # query_gt: ground truth of the sql (data['sql']), containing sel, agg,
        # conds: {sel, op, value}
        raw_q_seq = [x[0] for x in raw_data]  # original question
        try:
            score = model.forward(q_seq, col_seq, col_num)
            pred_queries = model.gen_query(score, q_seq, col_seq, raw_q_seq)
            # generate the predicted format
            one_err, tot_err = model.check_acc(raw_data, pred_queries, query_gt)
        except Exception:
            badcase += 1
            print('badcase', badcase)
            continue
        one_acc_num += (ed - st - one_err)
        tot_acc_num += (ed - st - tot_err)

        # Execution accuracy
        for sql_gt, sql_pred, tid in zip(query_gt, pred_queries, table_ids):
            ret_gt = engine.execute(tid, sql_gt['sel'], sql_gt['agg'],
                                    sql_gt['conds'], sql_gt['cond_conn_op'])
            try:
                ret_pred = engine.execute(tid, sql_pred['sel'], sql_pred['agg'],
                                          sql_pred['conds'], sql_pred['cond_conn_op'])
            except Exception:
                ret_pred = None
            ex_acc_num += (ret_gt == ret_pred)
    return (one_acc_num / len(sql_data),
            tot_acc_num / len(sql_data),
            ex_acc_num / len(sql_data))
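# Hypothetical sketch (not part of the original code): evaluating a trained
# model with epoch_acc. The load_dataset helper, model variable, and db path
# are assumed placeholders; the three returned values are the two logical-form
# accuracies tracked above (from one_acc_num and tot_acc_num) plus the
# execution accuracy.
if __name__ == '__main__':
    dev_sql, dev_table = load_dataset('dev')   # assumed helper
    one_acc, tot_acc, ex_acc = epoch_acc(model, 64, dev_sql, dev_table, 'data/val.db')
    print('one acc: {:.3f}, total acc: {:.3f}, execution acc: {:.3f}'.format(
        one_acc, tot_acc, ex_acc))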
def epoch_exec_acc(model, batch_size, sql_data, table_data, db_path):
    engine = DBEngine(db_path)
    model.eval()
    perm = list(range(len(sql_data)))
    tot_acc_num = 0.0
    acc_of_log = 0.0
    st = 0
    while st < len(sql_data):
        ed = st + batch_size if st + batch_size < len(perm) else len(perm)
        q_seq, col_seq, col_num, ans_seq, query_seq, gt_cond_seq, raw_data = \
            to_batch_seq(sql_data, table_data, perm, st, ed, ret_vis_data=True)
        raw_q_seq = [x[0] for x in raw_data]
        raw_col_seq = [x[1] for x in raw_data]
        gt_where_seq = model.generate_gt_where_seq(q_seq, col_seq, query_seq)
        query_gt, table_ids = to_batch_query(sql_data, perm, st, ed)
        gt_sel_seq = [x[1] for x in ans_seq]
        score = model.forward(q_seq, col_seq, col_num,
                              (True, True, True), gt_sel=gt_sel_seq)
        pred_queries = model.gen_query(score, q_seq, col_seq, raw_q_seq,
                                       raw_col_seq, (True, True, True))

        # A prediction is correct if it executes to the same result as the ground truth.
        for sql_gt, sql_pred, tid in zip(query_gt, pred_queries, table_ids):
            try:
                ret_gt = engine.execute(tid, sql_gt['sel'], sql_gt['agg'], sql_gt['conds'])
            except Exception:
                ret_gt = None
            try:
                ret_pred = engine.execute(tid, sql_pred['sel'], sql_pred['agg'], sql_pred['conds'])
            except Exception:
                ret_pred = None
            tot_acc_num += (ret_gt == ret_pred)
        st = ed
    return tot_acc_num / len(sql_data)
def execute_accuracy(query_gt, pred_queries, table_ids, db_path, sql_data):
    """
    Execution accuracy: a prediction counts as correct as long as executing
    the predicted SQL returns the same result as executing the ground truth.
    """
    engine = DBEngine(db_path)
    ex_acc_num = 0
    for sql_gt, sql_pred, tid in zip(query_gt, pred_queries, table_ids):
        ret_gt = engine.execute(tid, sql_gt['sel'], sql_gt['agg'],
                                sql_gt['conds'], sql_gt['cond_conn_op'])
        try:
            ret_pred = engine.execute(tid, sql_pred['sel'], sql_pred['agg'],
                                      sql_pred['conds'], sql_pred['cond_conn_op'])
        except Exception as e:
            print(e)
            ret_pred = None
        ex_acc_num += (ret_gt == ret_pred)
    print('\nexecute acc is {}'.format(ex_acc_num / len(sql_data)))
    return ex_acc_num / len(sql_data)
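# Hypothetical sketch (not part of the original code): computing execution
# accuracy for a single prediction. The query dicts, table id, and db path are
# illustrative placeholders in the TableQA-style format used above
# (conds = [column_index, operator_index, value]).
if __name__ == '__main__':
    query_gt = [{'sel': [0], 'agg': [0], 'conds': [[1, 2, '北京']], 'cond_conn_op': 0}]
    pred_queries = [{'sel': [0], 'agg': [0], 'conds': [[1, 2, '北京']], 'cond_conn_op': 0}]
    table_ids = ['some_table_id']                  # assumed table id
    acc = execute_accuracy(query_gt, pred_queries, table_ids,
                           db_path='data/val.db',  # assumed path
                           sql_data=query_gt)      # only len(sql_data) is used for averaging
    # acc == 1.0 when every predicted query returns the same rows as its ground truth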
def parse_sql():
    table_id = request.json['table_id']
    question = request.json['question']
    flag_childfind = 0
    if table_id == 'device':
        matchObj = re.search(r'最(.*)手机是', question, re.M | re.I)
        if matchObj:
            str_match = matchObj.group()
            str_mat_tmp = str_match.replace('手机', '') + '多少'
            question = question.replace(str_match, str_mat_tmp)
            flag_childfind = 1
            key_col_index = 5
    if table_id == 'telbill':
        matchObj = re.search(r'最(.*)账期是', question, re.M | re.I)
        if matchObj:
            str_match = matchObj.group()
            str_mat_tmp = str_match.replace('账期', '') + '多少'
            question = question.replace(str_match, str_mat_tmp)
            flag_childfind = 1
            key_col_index = 0

    test_json_line = '{"question": "' + question + '","table_id": "' + table_id + '"}'
    test_data = read_line(test_json_line, test_tables)
    print(test_json_line)
    test_dataseq = DataSequence(data=test_data,
                                tokenizer=query_tokenizer,
                                label_encoder=label_encoder,
                                is_train=False,
                                shuffle_header=False,
                                max_len=160,
                                shuffle=False,
                                batch_size=1)
    header_lens = np.sum(test_dataseq[0]['input_header_mask'], axis=-1)

    model = models['stage1']
    model2 = models['stage2']
    with graph.as_default():
        preds_cond_conn_op, preds_sel_agg, preds_cond_op = model.predict_on_batch(
            test_dataseq[0])
        sql = outputs_to_sqls(preds_cond_conn_op, preds_sel_agg, preds_cond_op,
                              header_lens, test_dataseq.label_encoder)
        te_qc_pairs = QuestionCondPairsDataset(
            test_data,
            candidate_extractor=CandidateCondsExtractor(share_candidates=True),
            has_label=False,
            model_1_outputs=sql)
        te_qc_pairs_seq = QuestionCondPairsDataseq(te_qc_pairs, tokenizer,
                                                   sampler=FullSampler(),
                                                   shuffle=False, batch_size=1)
        te_result = model2.predict_generator(te_qc_pairs_seq, verbose=1)

    task2_result = merge_result(te_qc_pairs, te_result, threshold=0.995)
    cond = list(task2_result.get(0, []))
    sql[0]['conds'] = cond

    engine = DBEngine()
    # table_id = json.loads(test_json_line)['table_id']
    header = test_tables.__getitem__(table_id)._df.columns.values.tolist()
    sql_gen = engine.execute(table_id, sql[0]['sel'], sql[0]['agg'],
                             sql[0]['conds'], sql[0]['cond_conn_op'], header)
    if flag_childfind == 1 and sql[0]['agg'][0] > 0:
        # print(sql[0]['sel'])
        header_index = int(sql[0]['sel'][0])
        childcol = header[header_index]
        key_col = header[key_col_index]
        sql_gen = ('select ' + key_col + ' from Table_' + table_id +
                   ' where ' + childcol + '=( ' + sql_gen + ' )')
    return jsonify({'task': sql_gen})
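# Hypothetical client-side sketch (not part of the original code): calling the
# Flask handler above over HTTP. The host, port, and '/parse_sql' route are
# assumed placeholders; the handler itself only reads 'table_id' and 'question'
# from the JSON body and returns the generated SQL under the 'task' key.
import requests

resp = requests.post('http://localhost:5000/parse_sql',   # assumed URL
                     json={'table_id': 'device',
                           'question': '最便宜的手机是多少钱'})
print(resp.json()['task'])  # generated SQL string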
header_lens = np.sum(test_dataseq[0]['input_header_mask'], axis=-1)
preds_cond_conn_op, preds_sel_agg, preds_cond_op = model.predict_on_batch(
    test_dataseq[0])
sql = outputs_to_sqls(preds_cond_conn_op, preds_sel_agg, preds_cond_op,
                      header_lens, test_dataseq.label_encoder)

te_qc_pairs = QuestionCondPairsDataset(
    test_data,
    candidate_extractor=CandidateCondsExtractor(share_candidates=True),
    has_label=False,
    model_1_outputs=sql)
te_qc_pairs_seq = QuestionCondPairsDataseq(te_qc_pairs, tokenizer,
                                           sampler=FullSampler(),
                                           shuffle=False, batch_size=1)
te_result = model2.predict_generator(te_qc_pairs_seq, verbose=1)

task2_result = merge_result(te_qc_pairs, te_result, threshold=0.995)
cond = list(task2_result.get(0, []))
sql[0]['conds'] = cond

engine = DBEngine()
table_id = json.loads(test_json_line)['table_id']
header = test_tables.__getitem__(table_id)._df.columns.values.tolist()
print(engine.execute(table_id, sql[0]['sel'], sql[0]['agg'],
                     sql[0]['conds'], sql[0]['cond_conn_op'], header))
# print(engine.execute(sql_json['table_id'], sql_json['sql']['sel'], sql_json['sql']['agg'],
#                      sql_json['sql']['conds'], sql_json['sql']['cond_conn_op']))
def epoch_reinforce_train(model, optimizer, batch_size, sql_dataloader):
    """
    :param model: (Seq2SQL) model to train
    :param optimizer: optimizer object
    :param batch_size: (int) number of examples per batch
    :param sql_dataloader: (DataLoader) yields batches of training examples;
        its dataset carries TRAIN_DB, the path to the table db file
    """
    engine = DBEngine(sql_dataloader.dataset.TRAIN_DB)  # init database
    model.train()  # set model in training mode
    cum_reward = 0.0
    for batch_idx, sql_data in enumerate(sql_dataloader):
        # Where clauses of the examples with tokens replaced by their token ids
        gt_where_batch = model.generate_gt_where_seq(sql_data['question_tokens'],
                                                     sql_data['column_headers'],
                                                     sql_data['query_tokens'])
        raw_q_batch = [x[0] for x in sql_data['question_raw']]    # questions
        raw_col_batch = [x[1] for x in sql_data['question_raw']]  # column headers
        gt_sel_batch = [x[1] for x in sql_data['sql_query']]      # selector ids
        table_ids = sql_data['table_id']
        gt_sql_entry = sql_data['sql_entry']

        score = model.forward(q=sql_data['question_tokens'],
                              col=sql_data['column_headers'],
                              col_num=sql_data['column_num'],
                              pred_entry=(True, True, True),
                              reinforce=True,
                              gt_sel=gt_sel_batch)
        loss = model.loss(score, sql_data['sql_query'], (True, True, False), gt_where_batch)
        pred_queries = model.gen_query(score, sql_data['question_tokens'],
                                       sql_data['column_headers'],
                                       raw_q_batch, raw_col_batch,
                                       (True, True, True), reinforce=True)

        # Reward: -2 if the predicted query fails to execute, -1 if it executes
        # but returns the wrong result, +3 if it matches the ground truth.
        rewards = []
        for sql_gt, sql_pred, tid in zip(gt_sql_entry, pred_queries, table_ids):
            ret_gt = engine.execute(tid, sql_gt['sel'], sql_gt['agg'], sql_gt['conds'])
            try:
                ret_pred = engine.execute(tid, sql_pred['sel'], sql_pred['agg'], sql_pred['conds'])
            except Exception:
                ret_pred = None
            if ret_pred is None:
                rewards.append(-2)
            elif ret_pred != ret_gt:
                rewards.append(-1)
            else:
                rewards.append(3)
        cum_reward += sum(rewards)

        lossrl = model.reinforce_backward(score[2], rewards, optimizer)
        loss_batch = loss + lossrl

        # Optimization step, batch-wise
        optimizer.zero_grad()
        loss_batch.backward(torch.FloatTensor([1]))
        optimizer.step()

        # Optimization step, example-wise (alternative):
        # for l in loss_batch:
        #     optimizer.zero_grad()
        #     l.backward(retain_graph=True)
        #     optimizer.step()

    print("Avg RL Loss for Epoch's last batch: {}. Avg CE Loss: {}".format(
        loss_batch.data[0], loss.data[0]))
    return cum_reward / len(sql_dataloader.dataset)
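# Hypothetical sketch (not part of the original code): driving the
# DataLoader-based REINFORCE epoch above. SQLDataset and its collate function
# are assumed placeholders for whatever dataset wrapper exposes TRAIN_DB and
# the batch fields used in the function; only epoch_reinforce_train's signature
# comes from the code above.
import torch
from torch.utils.data import DataLoader

if __name__ == '__main__':
    train_set = SQLDataset(split='train')                     # assumed dataset wrapper
    train_loader = DataLoader(train_set, batch_size=64, shuffle=True,
                              collate_fn=train_set.collate)   # assumed collate fn
    model = build_model()                                      # assumed helper
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    for epoch in range(10):
        avg_reward = epoch_reinforce_train(model, optimizer, 64, train_loader)
        print('epoch {}: avg reward {:.4f}'.format(epoch, avg_reward))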
header_lens = np.sum(test_dataseq[0]['input_header_mask'], axis=-1)
preds_cond_conn_op, preds_sel_agg, preds_cond_op = model.predict_on_batch(
    test_dataseq[0])
sql = outputs_to_sqls(preds_cond_conn_op, preds_sel_agg, preds_cond_op,
                      header_lens, test_dataseq.label_encoder)

te_qc_pairs = QuestionCondPairsDataset(
    test_data,
    candidate_extractor=CandidateCondsExtractor(share_candidates=True),
    has_label=False,
    model_1_outputs=sql)
te_qc_pairs_seq = QuestionCondPairsDataseq(te_qc_pairs, tokenizer,
                                           sampler=FullSampler(),
                                           shuffle=False, batch_size=1)
te_result = model2.predict_generator(te_qc_pairs_seq, verbose=1)

task2_result = merge_result(te_qc_pairs, te_result, threshold=0.995)
cond = list(task2_result.get(0, []))
sql[0]['conds'] = cond

engine = DBEngine()
print(engine.execute(json.loads(test_json_line)['table_id'],
                     sql[0]['sel'], sql[0]['agg'],
                     sql[0]['conds'], sql[0]['cond_conn_op']))
print(sql)

# In[ ]: