def get_train_examples(self):
    """Build training examples from the train input file and its answer file."""
    input_lines = read_file_lines(self.train_path, strip_lines=True)
    answer_lines = read_file_lines(self.path_dict["train_ans"], strip_lines=True)
    return self._create_examples(lines=input_lines, ans_lines=answer_lines, set_type="train")
def get_val_examples(self):
    """Build validation examples from the val input file and its answer file."""
    input_lines = read_file_lines(self.val_path, strip_lines=True)
    answer_lines = read_file_lines(self.path_dict["val_ans"], strip_lines=True)
    return self._create_examples(lines=input_lines, ans_lines=answer_lines, set_type="val")
def _get_examples(self, phase):
    """Load the English and non-English examples for *phase*, English first."""
    examples = []
    for path_key, english_flag in (("eng", True), ("other", False)):
        examples.extend(
            self._create_examples(
                lines=read_file_lines(self.path_dict[phase][path_key]),
                is_english=english_flag,
                set_type=phase,
            )
        )
    return examples
def get_val_examples(self):
    """Return validation examples: the English set followed by the other-language set."""
    combined = []
    for path_key, english_flag in (("eng", True), ("other", False)):
        combined += self._create_examples(
            lines=read_file_lines(self.path_dict[path_key]),
            is_english=english_flag,
            set_type="val",
        )
    return combined
def download_mutual_plus_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    """Download MuTual-plus, flatten each split into one JSONL file, and write a task config.

    Each split is distributed as many small per-dialogue text files. They are downloaded
    one by one, their lines concatenated into ``<phase>.jsonl``, and the per-file copies
    are deleted once the JSONL is written.
    """
    file_counts = {"train": 7088, "dev": 886, "test": 886}
    os.makedirs(task_data_path, exist_ok=True)
    for phase in file_counts:
        os.makedirs(task_data_path + "/" + phase, exist_ok=True)
    for phase, count in file_counts.items():
        collected_lines = []
        # Files are numbered 1..count on the remote side.
        for file_number in range(1, count + 1):
            file_name = f"{phase}_{file_number}.txt"
            target_path = os.path.join(task_data_path, phase, file_name)
            download_utils.download_file(
                f"https://raw.githubusercontent.com/Nealcly/MuTual/"
                f"master/data/mutual_plus/{phase}/{file_name}",
                target_path,
            )
            collected_lines.extend(py_io.read_file_lines(target_path))
        py_io.write_jsonl(collected_lines, os.path.join(task_data_path, phase + ".jsonl"))
        # The individual downloaded files are no longer needed.
        shutil.rmtree(os.path.join(task_data_path, phase))
    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "train.jsonl"),
                "val": os.path.join(task_data_path, "dev.jsonl"),
                "test": os.path.join(task_data_path, "test.jsonl"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )
def _create_examples(cls, data_path, set_type):
    """Parse a CoNLL-style token/POS file into Examples.

    Sentences are separated by blank lines; each non-blank line is
    ``token<TAB>pos`` (for the test split, a line may carry the token only,
    in which case ``pos`` is recorded as None).

    Args:
        data_path: path to the tab-separated token/POS file.
        set_type: split name ("train"/"val"/"test"); used as the guid prefix.

    Returns:
        list of Example, one per sentence.
    """
    curr_token_list, curr_pos_list = [], []
    data_lines = read_file_lines(data_path, "r", encoding="utf-8")
    examples = []
    idx = 0  # running sentence index within this split
    for data_line in data_lines:
        data_line = data_line.strip()
        if data_line:
            if set_type == "test":
                # Test lines may lack the POS column.
                line_tokens = data_line.split("\t")
                if len(line_tokens) == 2:
                    token, pos = line_tokens
                else:
                    token, pos = data_line, None
            else:
                token, pos = data_line.split("\t")
            curr_token_list.append(token)
            curr_pos_list.append(pos)
        else:
            # Blank line ends the current sentence.
            examples.append(
                Example(
                    guid="%s-%s" % (set_type, idx),
                    tokens=curr_token_list,
                    pos_list=curr_pos_list,
                )
            )
            idx += 1
            curr_token_list, curr_pos_list = [], []
    if curr_token_list:
        # Flush a trailing sentence when the file does not end with a blank line.
        # BUG FIX: guid previously used (idx, idx), yielding e.g. "3-3" instead of
        # the "<set_type>-<idx>" form produced by the main loop.
        examples.append(
            Example(
                guid="%s-%s" % (set_type, idx),
                tokens=curr_token_list,
                pos_list=curr_pos_list,
            )
        )
    return examples
def get_val_examples(self):
    """Pair each validation input (JSON line) with its label line and build examples."""
    paired = zip(
        read_json_lines(self.val_path),
        read_file_lines(self.path_dict["val_labels"], strip_lines=True),
    )
    return self._create_examples(lines=paired, set_type="val")
def _create_examples(cls, data_path, idx_path, set_type):
    """Parse a token/POS file with a parallel per-line sentence-index file into Examples.

    ``data_path`` holds ``token<TAB>pos`` lines with blank lines between
    sentences; ``idx_path`` holds one sentence index per corresponding line.
    All index values within a sentence must agree (enforced via get_all_same),
    and that shared index becomes the guid suffix.

    Args:
        data_path: path to the tab-separated token/POS file.
        idx_path: path to the parallel file of integer sentence indices.
        set_type: split name ("train"/"val"/"test"); used as the guid prefix.

    Returns:
        list of Example, one per sentence.
    """
    curr_token_list, curr_pos_list, idx_ls = [], [], []
    data_lines = read_file_lines(data_path, "r", encoding="utf-8")
    idx_lines = read_file_lines(idx_path, "r", encoding="utf-8")
    examples = []
    for data_line, idx_line in zip_equal(data_lines, idx_lines):
        data_line, idx_line = data_line.strip(), idx_line.strip()
        # A blank data line must align with a blank index line.
        assert bool(data_line) == bool(idx_line)
        if data_line:
            if set_type == "test":
                # Test lines may lack the POS column.
                line_tokens = data_line.split("\t")
                if len(line_tokens) == 2:
                    token, pos = line_tokens
                else:
                    token, pos = data_line, None
            else:
                token, pos = data_line.split("\t")
            curr_token_list.append(token)
            curr_pos_list.append(pos)
            idx_ls.append(int(idx_line))
        else:
            # Blank line ends the current sentence.
            idx = get_all_same(idx_ls)
            examples.append(
                Example(
                    guid="%s-%s" % (set_type, idx),
                    tokens=curr_token_list,
                    pos_list=curr_pos_list,
                )
            )
            curr_token_list, curr_pos_list, idx_ls = [], [], []
    if curr_token_list:
        # Flush a trailing sentence when the file does not end with a blank line.
        idx = get_all_same(idx_ls)
        # BUG FIX: guid previously used (idx, idx); use the set_type prefix for
        # consistency with the guids built in the main loop.
        examples.append(
            Example(
                guid="%s-%s" % (set_type, idx),
                tokens=curr_token_list,
                pos_list=curr_pos_list,
            )
        )
    return examples
def _read_labels(cls, path):
    """Read one integer label per line from *path*."""
    return [int(raw_line.strip()) for raw_line in read_file_lines(path)]
def get_test_examples(self):
    """Create examples for the test split."""
    test_lines = read_file_lines(self.test_path)
    return self._create_examples(lines=test_lines, set_type="test")
def get_val_examples(self):
    """Create examples for the validation split."""
    val_lines = read_file_lines(self.val_path)
    return self._create_examples(lines=val_lines, set_type="val")
def get_train_examples(self):
    """Create examples for the training split."""
    train_lines = read_file_lines(self.train_path)
    return self._create_examples(lines=train_lines, set_type="train")
def get_test_examples(self):
    """Create test examples; the test split ships without an answer file."""
    test_lines = read_file_lines(self.test_path, strip_lines=True)
    return self._create_examples(lines=test_lines, ans_lines=None, set_type="test")
def download_tatoeba_data_and_write_config(task_data_base_path: str, task_config_base_path: str):
    """Download the Tatoeba retrieval data (via the LASER repo) and write per-language configs.

    For every non-English language, this creates a task directory containing the
    non-English sentences, the (alphabetically re-sorted) English sentences, and a
    labels file recording each English sentence's original line index, then writes
    a ``tatoeba_<lang>_config.json`` pointing at those files.
    """
    # Work in a temp dir; the full LASER repo zip is downloaded and deleted at the end.
    tatoeba_temp_path = py_io.create_dir(task_data_base_path, "tatoeba_temp")
    download_utils.download_and_unzip(
        "https://github.com/facebookresearch/LASER/archive/master.zip",
        tatoeba_temp_path,
    )
    # Maps the ISO 639-3 codes used by the LASER file names to two-letter codes
    # used for the jiant task names.
    languages_dict = {
        "afr": "af",
        "ara": "ar",
        "bul": "bg",
        "ben": "bn",
        "deu": "de",
        "ell": "el",
        "spa": "es",
        "est": "et",
        "eus": "eu",
        "pes": "fa",
        "fin": "fi",
        "fra": "fr",
        "heb": "he",
        "hin": "hi",
        "hun": "hu",
        "ind": "id",
        "ita": "it",
        "jpn": "ja",
        "jav": "jv",
        "kat": "ka",
        "kaz": "kk",
        "kor": "ko",
        "mal": "ml",
        "mar": "mr",
        "nld": "nl",
        "por": "pt",
        "rus": "ru",
        "swh": "sw",
        "tam": "ta",
        "tel": "te",
        "tha": "th",
        "tgl": "tl",
        "tur": "tr",
        "urd": "ur",
        "vie": "vi",
        "cmn": "zh",
        "eng": "en",
    }
    raw_base_path = os.path.join(tatoeba_temp_path, "LASER-master", "data", "tatoeba", "v1")
    for full_lang, lang in languages_dict.items():
        task_name = f"tatoeba_{lang}"
        # English is only ever the retrieval target, never its own task.
        if lang == "en":
            continue
        task_data_path = py_io.create_dir(task_data_base_path, task_name)
        eng_src = os.path.join(raw_base_path, f"tatoeba.{full_lang}-eng.eng")
        other_src = os.path.join(raw_base_path, f"tatoeba.{full_lang}-eng.{full_lang}")
        eng_out = os.path.join(task_data_path, f"{lang}-en.en")
        other_out = os.path.join(task_data_path, f"{lang}-en.{lang}")
        labels_out = os.path.join(task_data_path, f"{lang}-en.labels")
        tgts = [line.strip() for line in py_io.read_file_lines(eng_src)]
        # NOTE(review): this rename is unconditional, unlike the lock-guarded,
        # skip-if-exists English/labels writes below — a re-run after the temp dir
        # is gone would fail here; confirm whether re-runs are expected.
        os.rename(src=other_src, dst=other_out)
        idx = range(len(tgts))
        data = zip(tgts, idx)
        # Tatoeba is a retrieval dataset where you have a set of sentences in English and another
        # set in another language, and you need to match them. It also doesn't have training
        # data, so it's pretty much evaluation only. However, the dataset is distributed with the
        # sentences in order, i.e. the retrieval pairing is the sentence order.
        #
        # The XTREME authors intentionally scramble the order by sorting one of the two
        # sets alphabetically. We're following their recipe, but also retaining the labels for
        # internal scoring.
        with py_io.get_lock(eng_out):
            with py_io.get_lock(labels_out):
                if os.path.exists(eng_out) and os.path.exists(labels_out):
                    logger.info('Skip writing to %s since it already exists.', eng_out)
                    logger.info('Skip writing to %s since it already exists.', labels_out)
                else:
                    # Sort English sentences alphabetically; each label line records
                    # the sentence's original index so the true pairing is recoverable.
                    with open(eng_out, "w") as ftgt, open(labels_out, "w") as flabels:
                        for t, i in sorted(data, key=lambda x: x[0]):
                            ftgt.write(f"{t}\n")
                            flabels.write(f"{i}\n")
        py_io.write_json(
            data={
                "task": "tatoeba",
                "paths": {"eng": eng_out, "other": other_out, "labels_path": labels_out},
                "kwargs": {"language": lang},
                "name": task_name,
            },
            path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
            skip_if_exists=True,
        )
    shutil.rmtree(tatoeba_temp_path)