def main(unused_argv):
  """Runs NQG tokenization over every (source, target) pair in the input TSV.

  Reads examples from FLAGS.input, processes each source with
  `nqg_tokenization.process_source` and each target with
  `nqg_tokenization.process_target`, and writes the tokenized pairs to
  FLAGS.output.
  """
  examples = tsv_utils.read_tsv(FLAGS.input)
  # Comprehension instead of a manual append loop: pure construction.
  new_examples = [
      (nqg_tokenization.process_source(source),
       nqg_tokenization.process_target(target))
      for source, target in examples
  ]
  tsv_utils.write_tsv(new_examples, FLAGS.output)
def main(unused_argv):
  """Randomly partitions the input examples into two output TSV files.

  Shuffles with FLAGS.seed for reproducibility; the first
  FLAGS.num_examples_1 shuffled examples go to FLAGS.output_1 and the
  remainder to FLAGS.output_2.
  """
  examples = tsv_utils.read_tsv(FLAGS.input)
  random.seed(FLAGS.seed)
  random.shuffle(examples)
  split_idx = FLAGS.num_examples_1
  tsv_utils.write_tsv(examples[:split_idx], FLAGS.output_1)
  tsv_utils.write_tsv(examples[split_idx:], FLAGS.output_2)
def main(unused_argv):
  """Splits input examples into two sets grouped by template.

  Delegates to `template_utils.split_by_template` using
  `spider_template_fn` to compute each example's template, then writes
  the two resulting sets to FLAGS.output_1 and FLAGS.output_2.
  """
  examples = tsv_utils.read_tsv(FLAGS.input)
  split_1, split_2 = template_utils.split_by_template(
      examples,
      template_fn=spider_template_fn,
      max_num_examples_1=FLAGS.max_num_examples_1,
      seed=FLAGS.seed)
  tsv_utils.write_tsv(split_1, FLAGS.output_1)
  tsv_utils.write_tsv(split_2, FLAGS.output_2)
def main(unused_argv):
  """Splits examples by whitespace-token length of the source or target.

  Sorts all examples ascending by the number of space-separated tokens in
  the target (if FLAGS.use_target) or the source, writes the shortest
  FLAGS.num_examples to FLAGS.output_1 and the rest to FLAGS.output_2.
  """
  examples = tsv_utils.read_tsv(FLAGS.input)
  # The original duplicated the sorted(...) call in both branches, differing
  # only in the tuple index; select the index once instead.
  key_index = 1 if FLAGS.use_target else 0
  sorted_examples = sorted(
      examples, key=lambda example: len(example[key_index].split(" ")))
  tsv_utils.write_tsv(sorted_examples[:FLAGS.num_examples], FLAGS.output_1)
  tsv_utils.write_tsv(sorted_examples[FLAGS.num_examples:], FLAGS.output_2)
def main(unused_argv):
  """Writes one TSV file per split, resolving split ids to input examples.

  `load_splits()` is expected to map a split name to a list of example
  ids, where ids index into the examples read from FLAGS.input. Each
  split is written to FLAGS.output_dir/<split>.tsv in split-id order.
  """
  splits = load_splits()
  examples = tsv_utils.read_tsv(FLAGS.input)
  example_id_to_example = dict(enumerate(examples))
  for split, split_ids in splits.items():
    # Renamed from `examples` in the original, which shadowed the full
    # example list read above.
    split_examples = [example_id_to_example[split_id] for split_id in split_ids]
    filename = os.path.join(FLAGS.output_dir, "%s.tsv" % split)
    tsv_utils.write_tsv(split_examples, filename)
def main(unused_argv):
  """Appends a serialized DB schema to each source and lowercases examples.

  Builds a db_id -> schema-string map from FLAGS.tables, then for every
  (source, target) pair in FLAGS.input, looks up the database id from the
  leading "<db_id>:" token of the source, appends that schema string to
  the source, and writes the lowercased pairs to FLAGS.output.
  """
  tables_json = load_json(FLAGS.tables)
  db_id_to_schema_string = {
      table_json["db_id"].lower(): _get_schema_string(table_json)
      for table_json in tables_json
  }
  examples = tsv_utils.read_tsv(FLAGS.input)
  new_examples = []
  for source, target in examples:
    # First token of the source is "<db_id>:".
    db_id = source.split()[0].rstrip(":")
    new_source = source + db_id_to_schema_string[db_id]
    new_examples.append((new_source.lower(), target.lower()))
  tsv_utils.write_tsv(new_examples, FLAGS.output)
def main(unused_argv):
  """Generates a TMCD-style split of the input examples.

  Starts from a seeded random split of size FLAGS.num_examples_1, then
  swaps examples between the two sets via `mcd_utils.swap_examples` to
  satisfy the atom constraint and maximize compound divergence, and
  writes the final sets to FLAGS.output_1 and FLAGS.output_2.
  """
  examples = tsv_utils.read_tsv(FLAGS.input)
  # First, randomly split examples.
  random.seed(FLAGS.seed)
  random.shuffle(examples)
  split_idx = FLAGS.num_examples_1
  examples_1 = examples[:split_idx]
  examples_2 = examples[split_idx:]
  # Swap examples to meet atom constraint and maximize compound divergence.
  examples_1, examples_2 = mcd_utils.swap_examples(
      examples_1,
      examples_2,
      get_compounds_fn=tmcd_utils.get_example_compounds,
      get_atoms_fn=tmcd_utils.get_example_atoms,
      max_iterations=1000,
      max_divergence=None)
  tsv_utils.write_tsv(examples_1, FLAGS.output_1)
  tsv_utils.write_tsv(examples_2, FLAGS.output_2)
def main(unused_argv):
  """Converts Spider JSON examples to lowercased (source, target) TSV rows.

  Keeps only examples whose database appears in
  `database_constants.DATABASES`, prepends "<db_id>: " to each question,
  normalizes whitespace in each query, and writes the lowercased pairs
  to FLAGS.output.
  """
  examples_json = load_json(FLAGS.examples)
  examples = []
  for example_json in examples_json:
    database = example_json["db_id"]
    # Skip if database not in set of databases with >= 50 examples.
    if database not in database_constants.DATABASES:
      continue
    # Prepend database.
    source = "%s: %s" % (database, example_json["question"])
    target = normalize_whitespace(example_json["query"])
    examples.append((source.lower(), target.lower()))
  tsv_utils.write_tsv(examples, FLAGS.output)
def main(unused_argv):
  """Loads examples from FLAGS.input and writes them as TSV to FLAGS.output."""
  tsv_utils.write_tsv(load_examples(FLAGS.input), FLAGS.output)
def main(unused_argv):
  """Generates examples via `get_examples` and writes them to FLAGS.output."""
  tsv_utils.write_tsv(get_examples(), FLAGS.output)
def main(unused_argv):
  """Pairs FLAGS.source with FLAGS.target and writes the TSV to FLAGS.output."""
  tsv_utils.write_tsv(read_examples(FLAGS.source, FLAGS.target), FLAGS.output)