Example #1
 def read(self, data_path: str, split: str) -> Iterable[DataExample]:
     input_path = self.get_split_path(data_path, split)
     normalizer = TextNormalizer()
     with open(input_path, "r", encoding="utf-8") as input_file:
         for idx, line in enumerate(input_file):
             if idx == 0:
                 continue  # skip the TSV header row
             values = line.split("\t")  # columns 1-4: sentence pair, relatedness score, entailment label
             input1: str = normalizer.process(values[1].strip())
             input2: str = normalizer.process(values[2].strip())
             relatedness: float = float(values[3].strip())
             entailment: str = values[4].strip()
             yield self.create_example(input1, input2, relatedness, entailment)
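A self-contained sketch of the same TSV parsing on an in-memory sample; the header and data row are invented to match the column indices the reader uses, and TextNormalizer is replaced by plain stripping:

 import io

 # Invented two-line sample: a header row plus one data row.
 sample = ("pair_ID\tsentence_A\tsentence_B\trelatedness_score\tentailment_judgment\n"
           "1\tKot śpi na macie.\tKot leży na macie.\t4.5\tNEUTRAL\n")
 for idx, line in enumerate(io.StringIO(sample)):
     if idx == 0:
         continue  # skip the header, exactly as read() does
     values = line.split("\t")
     print(values[1].strip(), "|", values[2].strip(),
           "|", float(values[3].strip()), "|", values[4].strip())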
Example #2
 def read(self, data_path: str, split: str) -> Iterable[DataExample]:
     split = split if split == "train" else f"out-{split}"
     path = self.get_split_path(data_path, split)
     normalizer = TextNormalizer()
     with open(path, "r", encoding="utf-8") as input_file:
         for line in input_file:
             words = line.split()
             label = words[-1]
             text = " ".join(words[0:-1])
             text = text.replace(" em ", "em ").replace(" śmy ",
                                                        "śmy ").replace(
                                                            " m ", "m ")
             text = normalizer.process(text)
             yield DataExample(text, label)
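The chained replace() calls glue the detached endings back onto the preceding word; a quick self-contained check with an invented sentence:

 # Invented sample; " em " and " śmy " get re-attached to the word before them.
 text = "widział em , że poszli śmy do domu"
 text = text.replace(" em ", "em ").replace(" śmy ", "śmy ").replace(" m ", "m ")
 print(text)  # widziałem , że poszliśmy do domu

Note that an ending at the very end of the text is not caught, since each pattern requires a trailing space.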
Example #3
 def read(self, data_path: str, split: str) -> Iterable[DataExample]:
     split_name = "training" if split == "train" else split
     file_pattern = "{}_set_clean_only_{}.txt"
     text_path = os.path.join(data_path, self._spec.task_path(),
                              file_pattern.format(split_name, "text"))
     tags_path = os.path.join(data_path, self._spec.task_path(),
                              file_pattern.format(split_name, "tags"))
     normalizer = TextNormalizer(detokenize=False)
     with open(text_path, "r", encoding="utf-8") as text_file, \
             open(tags_path, "r", encoding="utf-8") as tags_file:
         text_lines = text_file.readlines()
         tags_lines = tags_file.readlines()
         # the two files are parallel: line i of tags labels line i of text
         assert len(text_lines) == len(tags_lines)
         for text_line, tags_line in zip(text_lines, tags_lines):
             text = normalizer.process(text_line.strip())
             text = text.replace("@anonymized_account", "@ użytkownik")
             label = tags_line.strip()
             yield DataExample(text, label)
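A minimal sketch of the parallel text/tags layout, using in-memory files instead of real paths (the sample lines are invented):

 import io

 # Two invented parallel "files": line i of tags labels line i of text.
 text_file = io.StringIO("pierwszy tweet\ndrugi tweet\n")
 tags_file = io.StringIO("0\n1\n")
 text_lines = text_file.readlines()
 tags_lines = tags_file.readlines()
 assert len(text_lines) == len(tags_lines)
 for text_line, tags_line in zip(text_lines, tags_lines):
     print(text_line.strip(), "->", tags_line.strip())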
Example #4
 def read_simple(self,
                 data_path: str,
                 split: str,
                 separator: str = " ",
                 label_first: bool = True,
                 normalize: bool = True) -> Iterable[DataExample]:
     label_idx = 0 if label_first else 1
     text_idx = 1 if label_first else 0
     input_path = self.get_split_path(data_path, split)
     if normalize:
         normalizer = TextNormalizer()
         normalize_func = normalizer.process
     else:
         normalize_func = lambda val: val  # identity when normalization is off
     with open(input_path, "r", encoding="utf-8") as input_file:
         for line in input_file:
             values = line.split(sep=separator, maxsplit=1)
             label = values[label_idx].strip()  # strip the trailing newline when the label comes last
             text = values[text_idx].strip()
             text = normalize_func(text)
             yield DataExample(text, label)
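maxsplit=1 is what lets the text itself contain the separator: only the first occurrence splits label from text. A self-contained check with an invented line, assuming label_first=True:

 # Invented input line in "label<space>text" form.
 line = "__label__positive świetny film , polecam\n"
 values = line.split(sep=" ", maxsplit=1)
 print(values[0])          # __label__positive
 print(values[1].strip())  # świetny film , polecam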
Example #5
 def normalizer(self) -> TextNormalizer:
     # keep the input tokenized as-is (no detokenization)
     return TextNormalizer(detokenize=False)
Example #6
 def normalizer(self) -> TextNormalizer:
     # normalizer with default settings
     return TextNormalizer()
Example #7
 def normalizer(self) -> TextNormalizer:
     # English input; keep tokenization as-is
     return TextNormalizer(detokenize=False, lang="en")
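Examples #5 through #7 all follow one pattern: each task overrides a normalizer() hook to pick its configuration. A hypothetical sketch of that arrangement; the class names and base class are invented, and only the TextNormalizer arguments come from the excerpts above:

 class BaseTask:
     def normalizer(self) -> TextNormalizer:
         return TextNormalizer()  # default settings

 class TokenizedTask(BaseTask):
     def normalizer(self) -> TextNormalizer:
         return TextNormalizer(detokenize=False)  # keep original tokenization

 class EnglishTask(BaseTask):
     def normalizer(self) -> TextNormalizer:
         return TextNormalizer(detokenize=False, lang="en")  # English rules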