def predict(self, output_dict, arguments, helper):
    """
    Inference by raw_feature

    * Args:
        output_dict: model's output dictionary
        arguments: arguments dictionary consisting of user_input
        helper: dictionary for helping get answer

    * Returns:
        query: generated SQL query
        execute_result: execution result of the generated query
    """
    output_dict["table_id"] = arguments["table_id"]
    output_dict["tokenized_question"] = helper["tokenized_question"]

    prediction = self.generate_queries(output_dict)[0]
    pred_query = Query.from_dict(prediction["query"], ordered=True)

    dbengine = DBEngine(arguments["db_path"])
    try:
        pred_execute_result = dbengine.execute_query(
            prediction["table_id"], pred_query, lower=True)
    except IndexError as e:
        pred_execute_result = str(e)

    return {
        "query": str(pred_query),
        "execute_result": pred_execute_result,
    }
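# A minimal usage sketch for predict() above; everything here is
# illustrative rather than part of the real API: `model` stands for a
# trained model exposing this method, and the table_id / db_path values
# are placeholders.
#
#   output_dict = model(features)
#   result = model.predict(
#       output_dict,
#       arguments={"table_id": "1-10015132-11", "db_path": "data/wikisql/dev.db"},
#       helper={"tokenized_question": ["who", "is", "the", "player"]},
#   )
#   result["query"]           # generated SQL string
#   result["execute_result"]  # rows returned by DBEngine, or an error string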
def evaluate(labels, predictions, db_path, ordered=True):  # pragma: no cover
    """ labels and predictions: dictionary {data_uid: sql_data, ...} """
    engine = DBEngine(db_path)

    exact_match, grades = [], []
    for data_uid in predictions:
        eg = labels[data_uid]
        ep = predictions[data_uid]

        qg = eg["sql_query"]
        gold = eg["execution_result"]

        pred = ep.get("error", None)
        qp = None
        if not ep.get("error", None):
            try:
                qp = Query.from_dict(ep["query"], ordered=ordered)
                pred = engine.execute_query(ep["table_id"], qp, lower=True)
            except Exception as e:
                pred = repr(e)

        correct = pred == gold  # execution accuracy
        match = qp == qg        # logical form (exact match) accuracy
        grades.append(correct)
        exact_match.append(match)

    return {
        "ex_accuracy": sum(grades) / len(grades) * 100.0,
        "lf_accuracy": sum(exact_match) / len(exact_match) * 100.0,
    }
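# A hedged sketch of calling evaluate() (not part of the codebase): both
# dicts are keyed by data_uid; labels carry the gold Query object and its
# execution result, predictions carry the decoded query dict in WikiSQL
# format (agg, sel, conds). The db path, uid, and values are placeholders.
def _evaluate_example():  # pragma: no cover
    gold_sql = {"agg": 0, "sel": 5, "conds": [[3, 0, "SG"]]}
    labels = {
        "uid-0": {
            "sql_query": Query.from_dict(gold_sql, ordered=True),
            "execution_result": ["fred couples"],
        }
    }
    predictions = {"uid-0": {"table_id": "1-10015132-11", "query": gold_sql}}
    metrics = evaluate(labels, predictions, db_path="data/wikisql/dev.db")
    print(metrics)  # -> {"ex_accuracy": ..., "lf_accuracy": ...}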
def _read(self, file_path, data_type=None):
    file_path = self.data_handler.read(file_path, return_path=True)
    file_path = Path(file_path)

    data_dir = file_path.parent
    file_name = file_path.stem

    db_path = data_dir / f"{file_name}.db"
    table_path = data_dir / f"{file_name}.tables.jsonl"
    self.dbengine = DBEngine(db_path)

    helper = {"file_path": file_path, "db_path": db_path, "examples": {}}
    features, labels = [], []

    sql_datas, table_data = self.load_data(file_path, table_path, data_type=data_type)
    for sql_data in tqdm(sql_datas, desc=data_type):
        question = sql_data["question"]
        table_id = sql_data["table_id"]

        column_headers = table_data[table_id]["header"]
        feature_row = {"column": column_headers, "question": question}

        data_uid = str(uuid.uuid1())
        conditions_value_position = self.get_coditions_value_position(
            sql_data["question"], [x[2] for x in sql_data["sql"]["conds"]]
        )

        sql_query = Query.from_dict(sql_data["sql"], ordered=True)
        execution_result = self.dbengine.execute_query(table_id, sql_query, lower=True)

        label_row = {
            "id": data_uid,
            "table_id": table_id,
            "tokenized_question": self.word_tokenizer.tokenize(question),
            "aggregator_idx": sql_data["sql"]["agg"],
            "select_column_idx": sql_data["sql"]["sel"],
            "conditions_num": len(sql_data["sql"]["conds"]),
            "conditions_column_idx": [x[0] for x in sql_data["sql"]["conds"]],
            "conditions_operator_idx": [x[1] for x in sql_data["sql"]["conds"]],
            "conditions_value_string": [str(x[2]) for x in sql_data["sql"]["conds"]],
            "conditions_value_position": conditions_value_position,
            "sql_query": sql_query,
            "execution_result": execution_result,
        }

        features.append(feature_row)
        labels.append(label_row)

        helper["examples"][data_uid] = {
            "question": question,
            "sql_query": sql_query,
            "execution_result": execution_result,
        }

        if self.is_test and len(labels) == 10:
            break

    return make_batch(features, labels), helper
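# For orientation, one raw WikiSQL example and the label fields _read()
# derives from it; the WikiSQL "sql" format encodes the aggregator index,
# the selected column index, and conditions as
# [column_idx, operator_idx, value] triples. Values here are illustrative.
_example_sql_data = {
    "question": "Who is the player from Sweden?",
    "table_id": "1-10015132-11",
    "sql": {"agg": 0, "sel": 0, "conds": [[5, 0, "Sweden"]]},
}
# resulting label fields (among others):
#   aggregator_idx=0, select_column_idx=0, conditions_num=1,
#   conditions_column_idx=[5], conditions_operator_idx=[0],
#   conditions_value_string=["Sweden"]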
def print_examples(self, index, inputs, predictions):
    """
    Print evaluation examples

    * Args:
        index: data index
        inputs: mini-batch inputs
        predictions: prediction dictionary consisting of
            - key: 'id' (question id)
            - value: dictionary consisting of
                table_id, query (agg, sel, conds)

    * Returns:
        print(Context, Question, Answers and Predict)
    """
    data_index = inputs["labels"]["data_idx"][index].item()
    data_id = self._dataset.get_id(data_index)

    helper = self._dataset.helper
    question = helper["examples"][data_id]["question"]
    label = self._dataset.get_ground_truth(data_id)

    dbengine = DBEngine(helper["db_path"])

    prediction = predictions[data_id]
    pred_query = Query.from_dict(prediction["query"], ordered=True)
    pred_execute_result = dbengine.execute_query(prediction["table_id"], pred_query, lower=True)

    print("- Question:", question)
    print("- Answers:")
    print("    SQL Query:      ", label["sql_query"])
    print("    Execute Results:", label["execution_result"])
    print("- Predict:")
    print("    SQL Query:      ", pred_query)
    print("    Execute Results:", pred_execute_result)
    print("-" * 30)
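# Illustrative console output of print_examples() (all values made up):
#
#   - Question: Who is the player from Sweden?
#   - Answers:
#       SQL Query:       SELECT player FROM 1-10015132-11 WHERE col5 = sweden
#       Execute Results: ['daniel alfredsson']
#   - Predict:
#       SQL Query:       SELECT player FROM 1-10015132-11 WHERE col5 = sweden
#       Execute Results: ['daniel alfredsson']
#   ------------------------------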
if __name__ == "__main__":  # pragma: no cover
    parser = ArgumentParser()
    parser.add_argument("source_file", help="source file for the prediction")
    parser.add_argument("db_file", help="source database for the prediction")
    parser.add_argument("pred_file", help="predictions by the model")
    parser.add_argument(
        "--ordered",
        action="store_true",
        help="whether the exact match should consider the order of conditions",
    )
    args = parser.parse_args()

    engine = DBEngine(args.db_file)
    exact_match = []
    with open(args.source_file) as fs, open(args.pred_file) as fp:
        grades = []
        for ls, lp in tqdm(zip(fs, fp), total=count_lines(args.source_file)):
            eg = json.loads(ls)
            ep = json.loads(lp)

            qg = Query.from_dict(eg["sql"], ordered=args.ordered)
            gold = engine.execute_query(eg["table_id"], qg, lower=True)

            pred = ep.get("error", None)
            qp = None
            if not ep.get("error", None):
                try:
                    qp = Query.from_dict(ep["query"], ordered=args.ordered)
                    pred = engine.execute_query(eg["table_id"], qp, lower=True)
                except Exception as e:
                    pred = repr(e)

            grades.append(pred == gold)
            exact_match.append(qp == qg)

    print(json.dumps({
        "ex_accuracy": sum(grades) / len(grades),
        "lf_accuracy": sum(exact_match) / len(exact_match),
    }, indent=2))
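# Example invocation of this script (file names illustrative):
#   python evaluate.py data/dev.jsonl data/dev.db data/dev_predictions.jsonl --ordered
# Each line of pred_file is a JSON object carrying either an "error" field
# or a "query" dict in the WikiSQL sql format, mirroring what the loop
# above reads via ep.get("error") and ep["query"].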
class WikiSQLReader(DataReader):
    """
    WikiSQL DataReader (http://arxiv.org/abs/1709.00103)

    * Args:
        file_paths: .json file paths (train and dev)
        tokenizers: defined tokenizers config (char/word)
    """

    def __init__(self, file_paths, tokenizers, context_max_length=None, is_test=None):
        super(WikiSQLReader, self).__init__(file_paths, WikiSQLDataset)
        self.is_test = is_test

        self.text_columns = ["column", "question"]

        if "word" not in tokenizers:
            raise ValueError("WordTokenizer is required. Define an English WordTokenizer.")
        self.word_tokenizer = tokenizers["word"]
        self.dbengine = None

    @overrides
    def _read(self, file_path, data_type=None):
        file_path = self.data_handler.read(file_path, return_path=True)
        file_path = Path(file_path)

        data_dir = file_path.parent
        file_name = file_path.stem

        db_path = data_dir / f"{file_name}.db"
        table_path = data_dir / f"{file_name}.tables.jsonl"
        self.dbengine = DBEngine(db_path)

        helper = Helper(**{
            "file_path": file_path,
            "db_path": db_path,
        })

        features, labels = [], []
        sql_datas, table_data = self.load_data(file_path, table_path, data_type=data_type)

        for sql_data in tqdm(sql_datas, desc=data_type):
            question = sql_data["question"]
            table_id = sql_data["table_id"]

            column_headers = table_data[table_id]["header"]
            feature_row = {"column": column_headers, "question": question}

            data_uid = str(uuid.uuid1())
            conditions_value_position = self.get_coditions_value_position(
                sql_data["question"], [x[2] for x in sql_data["sql"]["conds"]])

            sql_query = Query.from_dict(sql_data["sql"], ordered=True)
            execution_result = self.dbengine.execute_query(table_id, sql_query, lower=True)

            label_row = {
                "id": data_uid,
                "table_id": table_id,
                "tokenized_question": self.word_tokenizer.tokenize(question),
                "aggregator_idx": sql_data["sql"]["agg"],
                "select_column_idx": sql_data["sql"]["sel"],
                "conditions_num": len(sql_data["sql"]["conds"]),
                "conditions_column_idx": [x[0] for x in sql_data["sql"]["conds"]],
                "conditions_operator_idx": [x[1] for x in sql_data["sql"]["conds"]],
                "conditions_value_string": [str(x[2]) for x in sql_data["sql"]["conds"]],
                "conditions_value_position": conditions_value_position,
                "sql_query": sql_query,
                "execution_result": execution_result,
            }

            features.append(feature_row)
            labels.append(label_row)

            helper.set_example(data_uid, {
                "question": question,
                "sql_query": sql_query,
                "execution_result": execution_result,
            })

            if self.is_test and len(labels) == 10:
                break

        return utils.make_batch(features, labels), helper.to_dict()

    @overrides
    def read_one_example(self, inputs):
        """ inputs keys: question, column, db_path, table_id """
        question_text = inputs["question"]
        helper = {
            "tokenized_question": self.word_tokenizer.tokenize(question_text)
        }
        return inputs, helper

    def load_data(self, sql_path, table_path, data_type=None):
        sql_data = []
        table_data = {}

        logger.info(f"Loading data from {sql_path}")
        with open(sql_path) as inf:
            for line in tqdm(inf, desc=f"sql_{data_type}"):
                sql = json.loads(line.strip())
                sql_data.append(sql)

        logger.info(f"Loading data from {table_path}")
        with open(table_path) as inf:
            for line in tqdm(inf, desc=f"table_{data_type}"):
                tab = json.loads(line.strip())
                table_data[tab["id"]] = tab

        for sql in sql_data:
            assert sql["table_id"] in table_data
        return sql_data, table_data

    def get_coditions_value_position(self, question, values):
        tokenized_question = self.word_tokenizer.tokenize(question.lower())
        tokenized_values = [
            self.word_tokenizer.tokenize(str(value).lower()) for value in values
        ]

        START_TOKEN, END_TOKEN = "<BEG>", "<END>"

        token_to_index = {START_TOKEN: 0}
        for token in tokenized_question:
            token_to_index[token] = len(token_to_index)
        token_to_index[END_TOKEN] = len(token_to_index)

        position_tokens = []
        for value in tokenized_values:
            position_token = [token_to_index[START_TOKEN]]
            for token in value:
                if token in token_to_index:
                    position_token.append(token_to_index[token])
                else:
                    # fall back to substring match against question tokens
                    for i in range(len(tokenized_question)):
                        q_token = tokenized_question[i]
                        if token in q_token:
                            position_token.append(token_to_index[q_token])
            position_token.append(token_to_index[END_TOKEN])

            # every condition value must be found somewhere in the question
            assert len(position_token) != 2
            position_tokens.append(position_token)

        return position_tokens
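# A worked example of get_coditions_value_position, assuming a plain
# whitespace word tokenizer (the tokens are hypothetical). For the question
# "who won in 1999 ?" the index map becomes
#   {"<BEG>": 0, "who": 1, "won": 2, "in": 3, "1999": 4, "?": 5, "<END>": 6}
# and the condition value "1999" yields the pointer sequence [0, 4, 6]:
# <BEG>, the question position of "1999", then <END>. The assert above
# rejects values whose tokens never appear in the question at all.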