# Module-level setup. These imports are not part of the original excerpt; the
# absl/TensorFlow import paths below are assumptions (the usual choices for
# this pattern), and pp_util refers to the project's preprocessing helpers.
import random

from absl import app
from absl import flags
from absl import logging
import tqdm

from tensorflow.compat.v1 import gfile

import preprocess_util as pp_util  # assumption: project-local helper module

FLAGS = flags.FLAGS


def main(_):
  random.seed(FLAGS.random_seed)
  task_name = FLAGS.task_name.lower()

  with gfile.Open(FLAGS.input_path, "r") as f:
    sents_data = f.read().strip().split("\n")

  with gfile.Open(FLAGS.thief_dataset, "r") as f:
    thief_data = f.read().strip().split("\n")

  header = sents_data[0]
  sents_data = sents_data[1:]

  vocab, probs = pp_util.build_vocab(
      thief_data,
      task_name="list-sentences",
      vocab_mode=FLAGS.vocab_mode,
      vocab_path=FLAGS.vocab_path)
  vocab_dict = {x: i for i, x in enumerate(vocab)}

  output_data = []

  if FLAGS.dataset_size:
    # Replicate or truncate the input so that each augmentation pass sees
    # exactly FLAGS.dataset_size points.
    points_remaining = FLAGS.dataset_size
    new_sents_data = []
    while points_remaining > len(sents_data):
      new_sents_data.extend([x for x in sents_data])
      points_remaining = points_remaining - len(sents_data)
    new_sents_data.extend([x for x in sents_data[:points_remaining]])
    sents_data = new_sents_data

  for _ in range(FLAGS.augmentations):
    for sent in tqdm.tqdm(sents_data):
      data_point_parts = sent.split("\t")

      if FLAGS.scheme.startswith("random_ed_k_"):
        premise_ind, hypo_ind = pp_util.task_input_indices[task_name]
        # Sample a random sentence from the thief dataset as the premise.
        new_premise = pp_util.sample_thief_data(
            thief_data,
            sanitize=FLAGS.sanitize_samples,
            vocab=vocab,
            vocab_dict=vocab_dict).split()
        data_point_parts[premise_ind] = pp_util.detokenize(new_premise)
        # Starting from the premise, make multiple ed1 changes to form the
        # hypothesis.
        new_premise = pp_util.token_replace(
            token_list=new_premise,
            vocab=vocab,
            probs=None,
            num_changes=FLAGS.ed1_changes)
        data_point_parts[hypo_ind] = pp_util.detokenize(new_premise)

      elif FLAGS.scheme.startswith("random_ed_all_"):
        # Checked before the generic "random_" branch below, since these
        # scheme names share its prefix.
        premise_ind, hypo_ind = pp_util.task_input_indices[task_name]
        # Sample a random sentence from the thief dataset as the premise.
        new_premise = pp_util.sample_thief_data(
            thief_data,
            sanitize=FLAGS.sanitize_samples,
            vocab=vocab,
            vocab_dict=vocab_dict).split()
        data_point_parts[premise_ind] = pp_util.detokenize(new_premise)
        # Starting from the premise, make multiple ed1 changes to form the
        # hypothesis. First, randomly sample the type of change to be made.
        change_type = random.choice(["replace", "drop", "add", "random"])
        # Next, randomly sample the number of ed1 changes to be made;
        # FLAGS.ed1_changes is the upper bound.
        num_changes = random.choice(
            [i for i in range(1, FLAGS.ed1_changes + 1)])
        if change_type == "drop" and num_changes >= len(new_premise):
          change_type = random.choice(["replace", "add"])
        if change_type == "replace":
          new_premise = pp_util.token_replace(
              token_list=new_premise,
              vocab=vocab,
              probs=probs,
              num_changes=num_changes)
        elif change_type == "drop":
          new_premise = pp_util.token_drop(
              token_list=new_premise, num_changes=num_changes)
        elif change_type == "add":
          new_premise = pp_util.token_add(
              token_list=new_premise,
              vocab=vocab,
              probs=probs,
              scheme=FLAGS.scheme,
              num_changes=num_changes)
        elif change_type == "random":
          # In the random mode, just sample another sentence from the corpus.
          new_premise = pp_util.sample_thief_data(
              thief_data,
              sanitize=FLAGS.sanitize_samples,
              vocab=vocab,
              vocab_dict=vocab_dict).split()
        data_point_parts[hypo_ind] = pp_util.detokenize(new_premise)

      elif FLAGS.scheme.startswith("random_"):
        # For every index having textual input, do a random replacement.
        for index in pp_util.task_input_indices[task_name]:
          # Sample a random sentence from the thief dataset.
          new_sent = pp_util.sample_thief_data(
              thief_data,
              sanitize=FLAGS.sanitize_samples,
              vocab=vocab,
              vocab_dict=vocab_dict).split()
          data_point_parts[index] = pp_util.detokenize(new_sent)

      elif FLAGS.scheme.startswith("shuffle_"):
        # Only a valid scheme for pairwise datasets.
        premise_ind, hypo_ind = pp_util.task_input_indices[task_name]
        # Sample a random sentence from the thief dataset as the premise.
        new_premise = pp_util.sample_thief_data(
            thief_data,
            sanitize=FLAGS.sanitize_samples,
            vocab=vocab,
            vocab_dict=vocab_dict).split()
        data_point_parts[premise_ind] = pp_util.detokenize(new_premise)
        # Shuffle the premise words to form the hypothesis.
        random.shuffle(new_premise)
        data_point_parts[hypo_ind] = pp_util.detokenize(new_premise)

      # Once all sentences have been replaced, add the data point to the
      # corpus.
      output_data.append("\t".join(data_point_parts))

  logging.info("Final dataset size = %d", len(output_data))

  output_data = [header] + output_data

  with gfile.Open(FLAGS.output_path, "w") as f:
    f.write("\n".join(output_data) + "\n")

  return
def main(_):
  random.seed(FLAGS.random_seed)
  task_name = FLAGS.task_name.lower()

  with gfile.Open(FLAGS.input_path, "r") as f:
    sents_data = f.read().strip().split("\n")

  header = sents_data[0]
  sents_data = sents_data[1:]

  if FLAGS.thief_dataset:
    with gfile.Open(FLAGS.thief_dataset, "r") as f:
      thief_data = f.read().strip().split("\n")
    vocab, probs = pp_util.build_vocab(
        sents_data=thief_data,
        task_name="list-sentences",
        vocab_mode=FLAGS.vocab_mode,
        vocab_path=FLAGS.vocab_path)
    thief_lengths_pool = pp_util.get_lengths_pool(thief_data)
  else:
    vocab, probs = pp_util.build_vocab(
        sents_data=sents_data,
        task_name=task_name,
        vocab_mode=FLAGS.vocab_mode,
        vocab_path=FLAGS.vocab_path)
    thief_lengths_pool = None

  output_data = []

  if FLAGS.dataset_size:
    # Replicate or truncate the input so that each augmentation pass sees
    # exactly FLAGS.dataset_size points.
    points_remaining = FLAGS.dataset_size
    new_sents_data = []
    while points_remaining > len(sents_data):
      new_sents_data.extend([x for x in sents_data])
      points_remaining = points_remaining - len(sents_data)
    new_sents_data.extend([x for x in sents_data[:points_remaining]])
    sents_data = new_sents_data

  for _ in range(FLAGS.augmentations):
    for sent in tqdm.tqdm(sents_data):
      data_point_parts = sent.split("\t")

      if FLAGS.scheme.startswith("random_ed_k_"):
        # Only relevant for pairwise text classification tasks.
        premise_ind, hypo_ind = pp_util.task_input_indices[task_name]
        # Randomly choose a premise.
        original_premise = data_point_parts[premise_ind].split()
        new_len = pp_util.get_length(
            original_sequence=original_premise,
            thief_lengths_pool=thief_lengths_pool,
            lengths_scheme=FLAGS.lengths_scheme)
        # Randomly sample a word for every position in the premise.
        new_premise = pp_util.sample_next_sequence(
            vocab=vocab, probs=probs, seq_length=new_len, scheme=FLAGS.scheme)
        data_point_parts[premise_ind] = pp_util.detokenize(
            new_premise, FLAGS.vocab_mode)
        # Starting from the premise, make multiple ed1 changes to form the
        # hypothesis.
        new_premise = pp_util.token_replace(
            token_list=new_premise,
            vocab=vocab,
            probs=probs,
            num_changes=FLAGS.ed1_changes)
        data_point_parts[hypo_ind] = pp_util.detokenize(
            new_premise, FLAGS.vocab_mode)

      elif FLAGS.scheme.startswith("random_"):
        # For every index having textual input, do a random replacement.
        for index in pp_util.task_input_indices[task_name]:
          original_sent = data_point_parts[index].split()
          new_len = pp_util.get_length(
              original_sequence=original_sent,
              thief_lengths_pool=thief_lengths_pool,
              lengths_scheme=FLAGS.lengths_scheme)
          # Randomly sample a word for every position in the sentence.
          new_sent = pp_util.sample_next_sequence(
              vocab=vocab, probs=probs, seq_length=new_len,
              scheme=FLAGS.scheme)
          data_point_parts[index] = pp_util.detokenize(
              new_sent, FLAGS.vocab_mode)

      elif FLAGS.scheme.startswith("shuffle_"):
        # Only relevant for pairwise text classification tasks.
        premise_ind, hypo_ind = pp_util.task_input_indices[task_name]
        # Randomly choose a premise.
        original_premise = data_point_parts[premise_ind].split()
        # Sample lengths according to a thief dataset or uniform random
        # sampling.
        new_len = pp_util.get_length(
            original_sequence=original_premise,
            thief_lengths_pool=thief_lengths_pool,
            lengths_scheme=FLAGS.lengths_scheme)
        # Randomly sample a word for every position in the premise.
        new_premise = pp_util.sample_next_sequence(
            vocab=vocab, probs=probs, seq_length=new_len, scheme=FLAGS.scheme)
        data_point_parts[premise_ind] = pp_util.detokenize(
            new_premise, FLAGS.vocab_mode)
        # Shuffle the premise words to form the hypothesis.
        random.shuffle(new_premise)
        data_point_parts[hypo_ind] = pp_util.detokenize(
            new_premise, FLAGS.vocab_mode)

      # Once all sentences have been replaced, add the data point to the
      # corpus.
      output_data.append("\t".join(data_point_parts))

  output_data = [header] + output_data

  with gfile.Open(FLAGS.output_path, "w") as f:
    f.write("\n".join(output_data) + "\n")

  return
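# Standard absl entry point (assumed; not part of the original excerpt). An
# illustrative invocation follows, with placeholder file names and flag values:
#
#   python preprocess_dataset.py \
#     --input_path=mnli_train.tsv \
#     --thief_dataset=wiki_sentences.txt \
#     --scheme=random_ed_k_uniform \
#     --ed1_changes=3 \
#     --output_path=synthetic_queries.tsv
if __name__ == "__main__":
  app.run(main)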