Example #1
def epoch_reinforce_train(model, optimizer, batch_size, sql_data, table_data,
                          db_path):
    engine = DBEngine(db_path)  # execution engine over the table DB

    model.train()  # set model to training mode
    perm = np.random.permutation(len(sql_data))
    cum_reward = 0.0
    st = 0
    while st < len(sql_data):
        ed = min(st + batch_size, len(perm))

        q_seq, col_seq, col_num, ans_seq, query_seq, gt_whr_seq, raw_data =\
            to_batch_seq(sql_data, table_data, perm, st, ed, ret_vis_data=True)
        g_s = model.generate_g_s(q_seq, col_seq, query_seq)
        raw_q_seq = [x[0] for x in raw_data]
        raw_col_seq = [x[1] for x in raw_data]
        query_gt, table_ids = to_batch_query(sql_data, perm, st, ed)
        score = model.forward(q_seq,
                              col_seq,
                              col_num, (True, True, True),
                              reinforce=True)
        pred_queries = model.gen_query(score,
                                       q_seq,
                                       col_seq,
                                       raw_q_seq,
                                       raw_col_seq, (True, True, True),
                                       reinforce=True)

        rewards = []
        for sql_gt, sql_pred, tid in zip(query_gt, pred_queries, table_ids):
            ret_gt = engine.execute(tid, sql_gt['sel'], sql_gt['agg'],
                                    sql_gt['conds'])
            try:
                ret_pred = engine.execute(tid, sql_pred['sel'],
                                          sql_pred['agg'], sql_pred['conds'])
            except Exception:
                ret_pred = None

            if ret_pred is None:
                rewards.append(-2)
            elif ret_pred != ret_gt:
                rewards.append(-1)
            else:
                rewards.append(1)

        cum_reward += sum(rewards)
        optimizer.zero_grad()
        model.reinforce_backward(score, rewards)
        optimizer.step()

        st = ed

    return cum_reward / len(sql_data)
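
For orientation, here is a minimal sketch of how an epoch function like this might be driven from an outer training loop. Everything below except epoch_reinforce_train itself (load_dataset, build_model, TRAIN_DB, the hyper-parameters) is a hypothetical stand-in, not a name from the original code:

import torch

sql_data, table_data = load_dataset('train')  # assumed data loader
model = build_model()                         # assumed model factory
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(10):
    avg_reward = epoch_reinforce_train(model, optimizer, 64,
                                       sql_data, table_data, TRAIN_DB)
    print('epoch {}: avg reward {:.4f}'.format(epoch, avg_reward))
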
Example #2
def epoch_acc(model, batch_size, sql_data, table_data, db_path):
    engine = DBEngine(db_path)
    model.eval()
    perm = list(range(len(sql_data)))
    badcase = 0
    one_acc_num, tot_acc_num, ex_acc_num = 0.0, 0.0, 0.0
    for batch_idx in tqdm(range(len(sql_data) // batch_size + 1)):
        st = batch_idx * batch_size
        ed = min(st + batch_size, len(perm))
        if st >= ed:  # skip the empty tail batch when the split is exact
            break
        q_seq, gt_sel_num, col_seq, col_num, ans_seq, gt_cond_seq, raw_data = \
            to_batch_seq(sql_data, table_data, perm, st, ed, ret_vis_data=True)
        # q_seq: char-based question sequence
        # gt_sel_num: number of selected columns / aggregation functions (newly added field)
        # col_seq: char-based column names
        # col_num: number of headers in one table
        # ans_seq: (sel, number of conds, sel list in conds, op list in conds)
        # gt_cond_seq: ground truth of the conditions
        # raw_data: original question, headers, sql
        query_gt, table_ids = to_batch_query(sql_data, perm, st, ed)
        # query_gt: ground truth of sql, data['sql'], containing sel, agg,
        # conds:{sel, op, value}
        raw_q_seq = [x[0] for x in raw_data]  # original question
        try:
            score = model.forward(q_seq, col_seq, col_num)
            pred_queries = model.gen_query(score, q_seq, col_seq, raw_q_seq)
            # generate predicted format
            one_err, tot_err = model.check_acc(raw_data, pred_queries,
                                               query_gt)
        except Exception:
            badcase += 1
            print('badcase', badcase)
            continue
        one_acc_num += (ed - st - one_err)
        tot_acc_num += (ed - st - tot_err)

        # Execution Accuracy
        for sql_gt, sql_pred, tid in zip(query_gt, pred_queries, table_ids):
            ret_gt = engine.execute(tid, sql_gt['sel'], sql_gt['agg'],
                                    sql_gt['conds'], sql_gt['cond_conn_op'])
            try:
                ret_pred = engine.execute(tid, sql_pred['sel'],
                                          sql_pred['agg'], sql_pred['conds'],
                                          sql_pred['cond_conn_op'])
            except BaseException:
                ret_pred = None
            ex_acc_num += (ret_gt == ret_pred)
    return (one_acc_num / len(sql_data), tot_acc_num / len(sql_data),
            ex_acc_num / len(sql_data))
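
A hedged usage sketch for the three ratios returned above: per-component accuracy, full logical-form accuracy, and execution accuracy. dev_sql, dev_tables and DEV_DB are illustrative placeholders for a held-out split:

one_acc, lf_acc, ex_acc = epoch_acc(model, 64, dev_sql, dev_tables, DEV_DB)
print('component acc {:.3f} | logical-form acc {:.3f} | execution acc {:.3f}'
      .format(one_acc, lf_acc, ex_acc))
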
Example #3
def epoch_exec_acc(model, batch_size, sql_data, table_data, db_path):
    engine = DBEngine(db_path)

    model.eval()
    perm = list(range(len(sql_data)))
    tot_acc_num = 0.0
    st = 0
    while st < len(sql_data):
        ed = min(st + batch_size, len(perm))
        q_seq, col_seq, col_num, ans_seq, query_seq, gt_cond_seq, raw_data = \
            to_batch_seq(sql_data, table_data, perm, st, ed, ret_vis_data=True)
        raw_q_seq = [x[0] for x in raw_data]
        raw_col_seq = [x[1] for x in raw_data]
        gt_where_seq = model.generate_gt_where_seq(q_seq, col_seq, query_seq)
        query_gt, table_ids = to_batch_query(sql_data, perm, st, ed)
        gt_sel_seq = [x[1] for x in ans_seq]
        score = model.forward(q_seq,
                              col_seq,
                              col_num, (True, True, True),
                              gt_sel=gt_sel_seq)
        pred_queries = model.gen_query(score, q_seq, col_seq, raw_q_seq,
                                       raw_col_seq, (True, True, True))

        for sql_gt, sql_pred, tid in zip(query_gt, pred_queries, table_ids):
            try:
                ret_gt = engine.execute(tid, sql_gt['sel'], sql_gt['agg'],
                                        sql_gt['conds'])
            except Exception:
                ret_gt = None
            try:
                ret_pred = engine.execute(tid, sql_pred['sel'],
                                          sql_pred['agg'], sql_pred['conds'])
            except Exception:
                ret_pred = None
            tot_acc_num += (ret_gt == ret_pred)

        st = ed

    return tot_acc_num / len(sql_data)
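
The loop body above reduces to the following self-contained predicate: run both queries, map any execution failure to None, and count a match, including the degenerate case where both sides fail. exec_match is an illustrative restatement, not a helper from the original code:

def exec_match(engine, tid, sql_gt, sql_pred):
    def run(sql):
        try:
            return engine.execute(tid, sql['sel'], sql['agg'], sql['conds'])
        except Exception:
            return None
    # None == None: two failing queries count as a match, exactly as in
    # the accumulation above.
    return run(sql_gt) == run(sql_pred)
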
Example #4
def execute_accuracy(query_gt, pred_queries, table_ids, db_path, sql_data):
    """
        Execution Accuracy 执行精确性,只要sql的执行结果一致就行

    """
    engine = DBEngine(db_path)
    ex_acc_num = 0
    for sql_gt, sql_pred, tid in zip(query_gt, pred_queries, table_ids):
        ret_gt = engine.execute(tid, sql_gt['sel'], sql_gt['agg'],
                                sql_gt['conds'], sql_gt['cond_conn_op'])

        try:
            ret_pred = engine.execute(tid, sql_pred['sel'], sql_pred['agg'],
                                      sql_pred['conds'],
                                      sql_pred['cond_conn_op'])
        except Exception as e:
            print(e)
            ret_pred = None
        ex_acc_num += (ret_gt == ret_pred)
    print('\nexecute acc is {}'.format(ex_acc_num / len(sql_data)))
    return ex_acc_num / len(sql_data)
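
Assumed usage, with all inputs as parallel lists. Note the design choice: unlike Example #3, the ground-truth execution is left outside the try/except, so a malformed gold query raises instead of silently counting as a mismatch.

# Illustrative call; query_gt, pred_queries, table_ids, DB_PATH and
# sql_data are assumed to come from the surrounding evaluation code.
acc = execute_accuracy(query_gt, pred_queries, table_ids, DB_PATH, sql_data)
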
Example #5
def parse_sql():
    table_id = request.json['table_id']
    question = request.json['question']

    flag_childfind = 0
    key_col_index = -1  # only meaningful when flag_childfind is set below
    if table_id == 'device':
        matchObj = re.search(r'最(.*)手机是', question, re.M | re.I)
        if matchObj:
            str_match = matchObj.group()
            str_mat_tmp = str_match.replace('手机', '') + '多少'
            question = question.replace(str_match, str_mat_tmp)
            flag_childfind = 1
            key_col_index = 5

    if table_id == 'telbill':
        matchObj = re.search(r'最(.*)账期是', question, re.M | re.I)
        if matchObj:
            str_match = matchObj.group()
            str_mat_tmp = str_match.replace('账期', '') + '多少'
            question = question.replace(str_match, str_mat_tmp)
            flag_childfind = 1
            key_col_index = 0

    test_json_line = json.dumps({'question': question, 'table_id': table_id},
                                ensure_ascii=False)
    test_data = read_line(test_json_line, test_tables)
    print(test_json_line)
    test_dataseq = DataSequence(data=test_data,
                                tokenizer=query_tokenizer,
                                label_encoder=label_encoder,
                                is_train=False,
                                shuffle_header=False,
                                max_len=160,
                                shuffle=False,
                                batch_size=1)

    header_lens = np.sum(test_dataseq[0]['input_header_mask'], axis=-1)
    model = models['stage1']
    model2 = models['stage2']
    with graph.as_default():
        preds_cond_conn_op, preds_sel_agg, preds_cond_op = model.predict_on_batch(
            test_dataseq[0])
        sql = outputs_to_sqls(preds_cond_conn_op, preds_sel_agg, preds_cond_op,
                              header_lens, test_dataseq.label_encoder)
        te_qc_pairs = QuestionCondPairsDataset(
            test_data,
            candidate_extractor=CandidateCondsExtractor(share_candidates=True),
            has_label=False,
            model_1_outputs=sql)

        te_qc_pairs_seq = QuestionCondPairsDataseq(te_qc_pairs,
                                                   tokenizer,
                                                   sampler=FullSampler(),
                                                   shuffle=False,
                                                   batch_size=1)
        te_result = model2.predict_generator(te_qc_pairs_seq, verbose=1)

    task2_result = merge_result(te_qc_pairs, te_result, threshold=0.995)
    cond = list(task2_result.get(0, []))
    sql[0]['conds'] = cond

    engine = DBEngine()
    header = test_tables[table_id]._df.columns.values.tolist()
    sql_gen = engine.execute(table_id, sql[0]['sel'], sql[0]['agg'],
                             sql[0]['conds'], sql[0]['cond_conn_op'], header)

    if flag_childfind == 1 and sql[0]['agg'][0] > 0:
        header_index = int(sql[0]['sel'][0])

        childcol = header[header_index]
        key_col = header[key_col_index]
        sql_gen = 'select {} from Table_{} where {} = ( {} )'.format(
            key_col, table_id, childcol, sql_gen)

    return jsonify({'task': sql_gen})
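
A client-side sketch for exercising this handler, assuming it is registered on a Flask app under a route such as /parse_sql (the route decorator is outside the snippet); the URL, port, and payload values are assumptions:

import requests

resp = requests.post('http://localhost:5000/parse_sql',
                     json={'table_id': 'device',
                           'question': '最贵的手机是哪个'})
print(resp.json()['task'])  # the generated (and possibly rewritten) SQL
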
Example #6
header_lens = np.sum(test_dataseq[0]['input_header_mask'], axis=-1)
preds_cond_conn_op, preds_sel_agg, preds_cond_op = model.predict_on_batch(
    test_dataseq[0])
sql = outputs_to_sqls(preds_cond_conn_op, preds_sel_agg, preds_cond_op,
                      header_lens, test_dataseq.label_encoder)
te_qc_pairs = QuestionCondPairsDataset(
    test_data,
    candidate_extractor=CandidateCondsExtractor(share_candidates=True),
    has_label=False,
    model_1_outputs=sql)

te_qc_pairs_seq = QuestionCondPairsDataseq(te_qc_pairs,
                                           tokenizer,
                                           sampler=FullSampler(),
                                           shuffle=False,
                                           batch_size=1)
te_result = model2.predict_generator(te_qc_pairs_seq, verbose=1)

task2_result = merge_result(te_qc_pairs, te_result, threshold=0.995)
cond = list(task2_result.get(0, []))
sql[0]['conds'] = cond

engine = DBEngine()
table_id = json.loads(test_json_line)['table_id']
header = test_tables[table_id]._df.columns.values.tolist()
print(
    engine.execute(table_id, sql[0]['sel'], sql[0]['agg'], sql[0]['conds'],
                   sql[0]['cond_conn_op'], header))
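
merge_result presumably keeps, per question, the stage-2 condition candidates whose score clears the 0.995 threshold. A generic sketch of that kind of filter, with illustrative attribute names (question_idx, cond) that may not match the real QuestionCondPairsDataset:

def merge_by_threshold(pairs, scores, threshold=0.995):
    # Illustrative only; the real merge_result may differ.
    merged = {}
    for pair, score in zip(pairs, scores):
        if float(score) >= threshold:
            merged.setdefault(pair.question_idx, set()).add(pair.cond)
    return merged
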
Example #7
def epoch_reinforce_train(model, optimizer, batch_size, sql_dataloader):
    """
    :param model: (Seq2SQL class)
    :param optimizer: (optimizer object)
    :param batch_size: (int)
    :param sql_data: (list) each entry is a dict containing one training example.
                    Dict includes table_id for relevant table
    :param table_data (dict) table data dict with keys as table_id's
    :param db_path: (str) path to the table db file
    """

    engine = DBEngine(sql_dataloader.dataset.TRAIN_DB)  # init database engine
    model.train()  # set model in training mode
    cum_reward = 0.0

    for batch_idx, sql_data in enumerate(sql_dataloader):
        # WHERE clauses of the examples, with tokens replaced by token ids
        gt_where_batch = model.generate_gt_where_seq(
            sql_data['question_tokens'], sql_data['column_headers'],
            sql_data['query_tokens'])
        raw_q_batch = [x[0] for x in sql_data['question_raw']]    # raw questions
        raw_col_batch = [x[1] for x in sql_data['question_raw']]  # raw column headers
        gt_sel_batch = [x[1] for x in sql_data['sql_query']]      # selected-column ids
        table_ids = sql_data['table_id']
        gt_sql_entry = sql_data['sql_entry']
        score = model.forward(q=sql_data['question_tokens'],
                              col=sql_data['column_headers'],
                              col_num=sql_data['column_num'],
                              pred_entry=(True, True, True),
                              reinforce=True,
                              gt_sel=gt_sel_batch)
        loss = model.loss(score, sql_data['sql_query'], (True, True, False),
                          gt_where_batch)
        pred_queries = model.gen_query(score,
                                       sql_data['question_tokens'],
                                       sql_data['column_headers'],
                                       raw_q_batch,
                                       raw_col_batch, (True, True, True),
                                       reinforce=True)

        rewards = []
        for (sql_gt, sql_pred, tid) in zip(gt_sql_entry, pred_queries,
                                           table_ids):
            ret_gt = engine.execute(tid, sql_gt['sel'], sql_gt['agg'],
                                    sql_gt['conds'])
            try:
                ret_pred = engine.execute(tid, sql_pred['sel'],
                                          sql_pred['agg'], sql_pred['conds'])
            except Exception:
                ret_pred = None

            if ret_pred is None:
                rewards.append(-2)
            elif ret_pred != ret_gt:
                rewards.append(-1)
            else:
                rewards.append(3)

        cum_reward += sum(rewards)

        lossrl = model.reinforce_backward(score[2], rewards, optimizer)
        # print("Avg RL Loss for batch: {}".format(lossrl.mean()))
        loss_batch = loss + lossrl

        # Optimization step batch-wise
        optimizer.zero_grad()
        loss_batch.backward()
        optimizer.step()

        # Optimization step example-wise
        # for l in loss_batch:
        #     optimizer.zero_grad()
        #     l.backward(retain_graph=True)
        #     optimizer.step()

    print("Avg RL Loss for Epoch's last batch: {}. Avg CE Loss: {}".format(
        loss_batch.data[0], loss.data[0]))
    return cum_reward / len(sql_dataloader.dataset)
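
For completeness, a minimal self-contained sketch of the REINFORCE surrogate that a method like reinforce_backward typically computes: scale each example's summed log-probability of its sampled query tokens by the example's reward and average. All names here are illustrative:

import torch

def reinforce_loss(log_probs, rewards):
    # log_probs: (batch,) summed log-probabilities of the sampled tokens
    # rewards:   per-example scalars, e.g. the -2/-1/+3 scheme above
    r = torch.tensor(rewards, dtype=log_probs.dtype, device=log_probs.device)
    return -(r * log_probs).mean()  # gradient = REINFORCE estimate

# e.g.  loss_rl = reinforce_loss(sampled_log_probs, rewards)
#       loss_rl.backward()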