import csv
import json
import os

import numpy as np

# `args`, `DATA_ATTRS`, `TASK_DICT`, `TOKENIZER` and the helpers used below
# (data_expand, parse_single_real_data, get_model_dir, write_extra_data) are
# module-level objects defined elsewhere in the repository.


def get_real_data(task, train_extra_data, accum=True, encode=True):
    """Sample real training examples from previous tasks for replay.

    The sampled examples are dumped to a CSV in the previous task's model
    directory and, if `encode` is True, also appended (tokenized) to
    `train_extra_data`.
    """
    task_idx = args.tasks.index(task)
    gen_size = DATA_ATTRS[task]["train"]["data_size"]
    if accum:
        # Replay from all previous tasks, splitting the sampling budget evenly.
        prev_tasks = args.tasks[:task_idx]
        gen_size = int(np.ceil(gen_size * args.gen_lm_sample_percentage)) // len(prev_tasks)
    else:
        # Replay only from the immediately preceding task.
        prev_tasks = [args.tasks[task_idx - 1]]
        gen_size = int(gen_size * args.gen_lm_sample_percentage)

    datum = []
    for prev_task in prev_tasks:
        with open(TASK_DICT[prev_task]["train"], "r") as f:
            data = data_expand(json.load(f)["data"])
        # Draw `gen_size` examples (with replacement) from the previous task.
        indices = np.random.choice(range(len(data)), gen_size)
        for i in indices:
            d = parse_single_real_data(data[i], prev_task)
            datum.append(d)
            if encode:
                train_extra_data.append(TOKENIZER.encode(d))

    model_dir = get_model_dir([prev_task])
    dump_path = os.path.join(model_dir, "real.csv")
    write_extra_data(dump_path, datum)
    return dump_path
def read_extra_data(gen_path, train_extra_data):
    """Load previously dumped extra examples from `gen_path` and append their
    token ids to `train_extra_data`."""
    with open(gen_path, "r") as lm_file:
        reader = csv.reader(lm_file, delimiter=",")
        next(reader)  # skip the header row
        for row in reader:
            row = TOKENIZER.encode(row[0].strip())
            train_extra_data.append(row)
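# Hypothetical usage sketch (assumption, not part of the original module):
# sample real examples from earlier tasks before training the current one,
# then reload the same dump later. `get_real_data` both appends tokenized
# samples to `train_extra_data` and returns the CSV path it wrote, so
# `read_extra_data` can reproduce the same extra data in a later run.
# The task name "squad2" is only illustrative.
#
#   train_extra_data = []
#   real_path = get_real_data("squad2", train_extra_data, accum=True)
#   ...
#   read_extra_data(real_path, train_extra_data)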
def parallel_tokenization(self, d):
    """Tokenize one document and its QA pairs into training examples.

    Returns the list of parsed examples and the length of the longest
    (concatenated) answer sequence in the document. This is a method of the
    dataset class, which provides `pad_token`, `gen_token`, and `parse_example`.
    """
    examples = []
    context = TOKENIZER.encode(d["context"])
    max_a_len = 0
    for i3, qa in enumerate(d["qas"]):
        question = TOKENIZER.encode(qa["question"])
        raw_answers = qa["answers"]
        if len(raw_answers) == 0:
            # Unanswerable questions get an empty answer string.
            assert qa["is_impossible"]
            raw_answers.append({"text": ""})
        answer = []
        for i, raw_answer in enumerate(raw_answers):
            # Concatenate all reference answers, separated by the pad token.
            answer.extend(TOKENIZER.encode(raw_answer["text"]))
            if i != len(raw_answers) - 1:
                answer.append(self.pad_token)
        max_a_len = max(max_a_len, len(answer))
        examples.append(self.parse_example(
            self.gen_token, context, question, answer,
            qa.get("id", 0 if not args.test_training_set else d["pid"] + "_%d" % i3),
        ))
    return examples, max_a_len
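# Illustrative note (assumption, not stated in the original source): for a QA
# item with two reference answers, the encoded answer sequence has the layout
#
#   TOKENIZER.encode(answer_1) + [self.pad_token] + TOKENIZER.encode(answer_2)
#
# and max_a_len records the longest such sequence in the document, which the
# caller can use to budget the total sequence length of the built examples.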