def read(self, data_path: str, split: str) -> Iterable[DataExample]:
    """Yield sentence-pair examples from a tab-separated split file.

    Each data row carries two sentences, a real-valued relatedness score
    and an entailment label; the header row is skipped.  Sentences are
    normalized before being handed to ``self.create_example``.
    """
    normalizer = TextNormalizer()
    split_path = self.get_split_path(data_path, split)
    with open(split_path, "r", encoding="utf-8") as tsv_file:
        rows = iter(tsv_file)
        next(rows, None)  # discard the header line
        for row in rows:
            columns = row.split("\t")
            sentence_a = normalizer.process(columns[1].strip())
            sentence_b = normalizer.process(columns[2].strip())
            score = float(columns[3].strip())
            entailment = columns[4].strip()
            yield self.create_example(sentence_a, sentence_b, score, entailment)
def read(self, data_path: str, split: str) -> Iterable[DataExample]:
    """Yield labelled single-sentence examples from a whitespace-separated file.

    Non-train splits are mapped to their ``out-<split>`` file name.  The last
    whitespace token of each line is the label; the rest is the text.  Three
    tokenised-apart Polish enclitic particles ("em", "śmy", "m") are re-joined
    to the preceding word before normalization.
    """
    if split != "train":
        split = f"out-{split}"
    input_path = self.get_split_path(data_path, split)
    normalizer = TextNormalizer()
    with open(input_path, "r", encoding="utf-8") as input_file:
        for raw_line in input_file:
            tokens = raw_line.split()
            label = tokens[-1]
            sentence = " ".join(tokens[:-1])
            # Re-attach split-off enclitics: " em " -> "em ", etc.
            for particle in (" em ", " śmy ", " m "):
                sentence = sentence.replace(particle, particle.lstrip())
            sentence = normalizer.process(sentence)
            yield DataExample(sentence, label)
def read(self, data_path: str, split: str) -> Iterable[DataExample]:
    """Yield examples from a pair of parallel files: one with texts, one with tags.

    The split name "train" maps to the on-disk prefix "training".  Line *i* of
    the text file pairs with line *i* of the tags file; the anonymization
    placeholder is rewritten to the model-facing user token after
    normalization.
    """
    split_name = "training" if split == "train" else split
    pattern = "{}_set_clean_only_{}.txt"
    task_dir = os.path.join(data_path, self._spec.task_path())
    text_path = os.path.join(task_dir, pattern.format(split_name, "text"))
    tags_path = os.path.join(task_dir, pattern.format(split_name, "tags"))
    normalizer = TextNormalizer(detokenize=False)
    with open(text_path, "r", encoding="utf-8") as text_file, open(
            tags_path, "r", encoding="utf-8") as tags_file:
        text_lines = text_file.readlines()
        tags_lines = tags_file.readlines()
        # Both files must describe the same number of examples.
        assert len(text_lines) == len(tags_lines)
        for raw_text, raw_tag in zip(text_lines, tags_lines):
            sentence = normalizer.process(raw_text.strip())
            sentence = sentence.replace("@anonymized_account", "@ użytkownik")
            yield DataExample(sentence, raw_tag.strip())
def read_simple(self, data_path: str, split: str, separator: str = " ",
                label_first: bool = True, normalize: bool = True):
    """Yield examples from a simple two-column file, one example per line.

    Each line holds a label and a text separated by *separator* (split with
    ``maxsplit=1`` so the text may itself contain the separator).  With
    ``label_first`` the first column is the label, otherwise the second.
    ``normalize`` runs the text through :class:`TextNormalizer`.

    :param data_path: root directory of the task data
    :param split: dataset split name resolved via ``self.get_split_path``
    :param separator: column separator (default: single space)
    :param label_first: whether the label precedes the text on each line
    :param normalize: whether to normalize the text column
    """
    label_idx = 0 if label_first else 1
    text_idx = 1 if label_first else 0
    input_path = self.get_split_path(data_path, split)
    # PEP 8 (E731): build the normalizer once instead of rebinding a lambda.
    normalizer = TextNormalizer() if normalize else None
    with open(input_path, "r", encoding="utf-8") as input_file:
        for line in input_file:
            values = line.split(sep=separator, maxsplit=1)
            # Bug fix: strip the label as well — when the label is the second
            # column (label_first=False) it previously kept the trailing
            # newline of the line, producing labels like "positive\n".
            label = values[label_idx].strip()
            text = values[text_idx].strip()
            if normalizer is not None:
                text = normalizer.process(text)
            yield DataExample(text, label)
def normalizer(self) -> TextNormalizer:
    """Return a text normalizer with detokenization disabled (keeps original spacing)."""
    return TextNormalizer(detokenize=False)
def normalizer(self) -> TextNormalizer:
    """Return a text normalizer with its default settings."""
    return TextNormalizer()
def normalizer(self) -> TextNormalizer:
    """Return an English-language normalizer with detokenization disabled."""
    return TextNormalizer(detokenize=False, lang="en")