def build_nonredundant_query_split(jsons, save_loc, max_questions=None, keep_variables=False):
    """Build train/dev/test datasets with one randomly chosen question per query.

    Each input dict contributes a single (question, sql) pair: one sentence is
    sampled at random per query, which is what makes the split "nonredundant".

    Args:
        jsons: iterable of dicts, each with keys "query-split", "sql" (list,
            first entry used), "variables", and "sentences".
        save_loc: location passed through to save_datasets().
        max_questions: unused here (kept for signature parity with
            build_query_split, which caps sentences per query).
        keep_variables: if True, keep placeholder variables in the SQL and
            question text instead of substituting concrete values.
    """
    import random  # function-scope import, hoisted from mid-body

    datasets = {}
    for json_dict in jsons:
        split = json_dict["query-split"]
        # Entries explicitly marked "exclude" are dropped entirely.
        if split == "exclude":
            continue
        if split not in datasets:
            datasets[split] = []
        query = json_dict["sql"][0]
        sql_vars = json_dict['variables']
        # Sample exactly one sentence per query -> nonredundant dataset.
        sentence = random.choice(json_dict["sentences"])
        text, variables, _ = extract_sentence_fields(sentence)
        if keep_variables:
            sql = query
            question = text
        else:
            # Substitute concrete variable values into both SQL and question.
            sql, question = read_new_as_old.insert_variables(
                query, sql_vars, text, variables)
        sql = tokenise(sql)
        question = preprocess_text(question)
        datasets[split].append((question, sql))

    # print() function for Python 3 compatibility, consistent with
    # build_query_split in this file.
    print("Nonredundant query split:")
    for k, v in sorted(datasets.items()):
        print("\t%s: %d" % (k, len(v)))
    save_datasets(datasets, save_loc)
def build_query_split(jsons, save_loc, max_questions=None, keep_variables=False):
    """Build train/dev/test datasets keeping (up to max_questions) sentences per query.

    Each input dict contributes one (question, sql) pair per retained sentence,
    all assigned to the split named by its "query-split" field.

    Args:
        jsons: iterable of dicts, each with keys "query-split", "sql" (list,
            first entry used), "variables", and "sentences".
        save_loc: location passed through to save_datasets().
        max_questions: if set, cap the number of sentences used per query.
        keep_variables: if True, keep placeholder variables in the SQL and
            question text instead of substituting concrete values.
    """
    datasets = {}
    for json_dict in jsons:
        split = json_dict["query-split"]
        # Entries explicitly marked "exclude" are dropped entirely.
        if split == "exclude":
            continue
        if split not in datasets:
            datasets[split] = []

        # Only the first SQL variant is used (was a pointless one-element loop).
        query = json_dict["sql"][0]
        sql_vars = json_dict['variables']
        sentences = json_dict["sentences"]
        if max_questions and max_questions < len(sentences):
            sentences = sentences[:max_questions]
        for sentence in sentences:
            text, variables, _ = extract_sentence_fields(sentence)
            if keep_variables:
                sql = query
                question = text
            else:
                # Substitute concrete variable values into both SQL and question.
                sql, question = read_new_as_old.insert_variables(
                    query, sql_vars, text, variables)
            sql = tokenise(sql)
            question = preprocess_text(question)
            datasets[split].append((question, sql))

    print("Query split:")
    for k, v in sorted(datasets.items()):
        print("\t%s: %d" % (k, len(v)))
    save_datasets(datasets, save_loc)