def test():
    """Print one sample query rendered as text in three different ways.

    The same ``Query`` object is printed (1) on its own, (2) through a
    ``Table`` loaded from the SQLite database at ``data/train.db``, and
    (3) through a ``Table`` built in memory with an explicit header list.
    The function only prints; it returns nothing.
    """
    # convert query dict to text (without correct column references)
    details = {"sel": 5, "conds": [[3, 0, "SOUTH AUSTRALIA"]], "agg": 0}
    test_str = Query(details["sel"], details["agg"], details["conds"])
    print(test_str)

    db = records.Database('sqlite:///data/train.db')
    try:
        conn = db.get_connection()
        try:
            # convert query dict to text with table reference (still does not
            # give the correct columns) because header is not supplied
            table = Table.from_db(conn, "1-1000181-1")
            print(table.query_str(test_str))
        finally:
            # Release the connection even if the lookup or rendering fails.
            conn.close()
    finally:
        # Dispose of the database handle so it is not leaked.
        db.close()

    # convert query dict to text with table reference after supplying headers
    table_data = {
        "id": "1-1000181-1",
        "header": [
            "State/territory",
            "Text/background colour",
            "Format",
            "Current slogan",
            "Current series",
            "Notes",
        ],
        "types": [],
        "rows": [],
    }
    t = Table(
        table_data["id"],
        table_data["header"],
        table_data["types"],
        table_data["rows"],
    )
    print(t.query_str(test_str))
def toQueryStr(file_name, table_arr, type=0, test_batch_size=1000):
    """Write ``<CODESPLIT>``-delimited question/query examples to a text file.

    Reads ``DATA_DIR/<file_name>.jsonl`` (one JSON object per line with the
    keys ``question``, ``sql`` and ``table_id``), shuffles the lines with a
    fixed seed, and turns each line into one example string of the form
    ``label<CODESPLIT>nothing<CODESPLIT>nothing<CODESPLIT>question<CODESPLIT>query``.

    For every example a random label in ``{0, 1}`` is drawn. With label
    ``0`` the question is replaced by the question of a randomly chosen
    line, while the query text is kept; with label ``1`` the original
    question/query pair is kept.

    Args:
        file_name: Base name (without ``.jsonl``) of the input file inside
            ``DATA_DIR``.
        table_arr: Iterable of table objects exposing ``table_id`` and
            ``query_str(query)``. When several share a ``table_id`` the
            first one is used.
        type: ``0`` writes ``train.txt``; any other value writes
            ``valid.txt``. (The name shadows the builtin but is kept so
            existing keyword callers continue to work.)
        test_batch_size: Number of lines per batch. A trailing batch that
            is shorter than this is dropped, together with everything
            after it.

    Returns:
        None. The examples are written, newline-separated and UTF-8
        encoded, to ``DATA_DIR/train_valid/wiki_sql/<train|valid>.txt``.
    """
    path = os.path.join(DATA_DIR, '{}.jsonl'.format(file_name))
    print(path)
    # Read with the same encoding that is used for the output file.
    with open(path, 'r', encoding='utf-8') as pf:
        data = pf.readlines()

    idxs = np.arange(len(data))
    # Builtin ``object`` is used because the ``np.object`` alias no longer
    # exists in current NumPy releases.
    data = np.array(data, dtype=object)

    np.random.seed(0)  # set random seed so that random things are reproducible
    np.random.shuffle(idxs)
    data = data[idxs]
    batched_data = chunked(data, test_batch_size)

    # Index the tables once instead of scanning table_arr for every line.
    # setdefault keeps the first table seen for each id, matching a
    # first-match linear scan.
    tables_by_id = {}
    for candidate in table_arr:
        tables_by_id.setdefault(candidate.table_id, candidate)

    print("start processing")
    examples = []
    for batch_data in batched_data:
        if len(batch_data) < test_batch_size:
            break  # the last batch is smaller than the others, exclude.
        for raw_line in batch_data:
            # json.loads takes no ``encoding`` argument on current Python;
            # the lines are already decoded text.
            line = json.loads(raw_line)
            doc_token = line['question']
            code_arr = line['sql']
            query = Query(code_arr['sel'], code_arr['agg'], code_arr['conds'])

            # A question whose table is not in table_arr keeps an empty
            # query string and is still emitted.
            table = tables_by_id.get(line['table_id'])
            code_token = table.query_str(query) if table is not None else ''

            # NOTE: the number and order of np.random calls determines the
            # seeded output, so exactly one label draw happens per example
            # and one extra draw per label-0 example.
            label = np.random.randint(2)
            if label == 0:
                random_line_num = np.random.randint(len(data))
                doc_token = json.loads(data[random_line_num])['question']

            examples.append('<CODESPLIT>'.join(
                (str(label), "nothing", "nothing", doc_token, code_token)
            ))

    data_path = os.path.join(DATA_DIR, 'train_valid/wiki_sql')
    os.makedirs(data_path, exist_ok=True)
    output_file_name = 'train.txt' if type == 0 else 'valid.txt'
    file_path = os.path.join(data_path, output_file_name)
    print(file_path)
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(examples))