Пример #1
0
    def split_data(self, test_max_size: int = None, source_vocab_size: int = None, target_vocab_size: int = None,
                   shuffle=True, take_parallel_data=True, take_corpus_instances=False, legacy_test=None):
        """
        Partition the parallel corpus into training pairs and test pairs.

        The frequency state of both sides is reset first. Candidate pairs come from
        ``_get_vocab_data`` when a vocabulary limit is given, otherwise from zipping
        ``self.source.data`` with ``self.target.data``. Pairs with an empty side are dropped.

        Without ``legacy_test``, a pair goes to the test set when ``_can_go_test`` accepts it and
        the test set still holds fewer than ``test_max_size`` pairs; every such pair is also passed
        to ``_update_filters``. With ``legacy_test``, the test set is built from it up front, nothing
        else is added to it, and pairs whose preprocessed source sentence is found in
        ``test.source.phrases_freq`` are left out of the training set.

        :param test_max_size: int = max examples on test data; when None or not a number,
            ``len(self.source.data) - self.n_train`` is used instead
        :param source_vocab_size: int = restrict most common vocab
        :param target_vocab_size: int = restrict most common vocab
        :param shuffle: bool = randomize the candidate pairs before splitting
        :param take_parallel_data: bool = if exactly False, return the columns of ``get_cols(train)``
            followed by the columns of ``get_cols(test)`` (e.g. x_train, y_train, x_test, y_test)
        :param take_corpus_instances: bool = if exactly True, pass train and test through
            ``load_from_parallel_data`` before returning them
        :param legacy_test: List[Tuple[str,str]] = parallel data to use as the test set
        :return: ``(train, test)``, or the flattened column tuple when ``take_parallel_data`` is False
        """
        self.source.reset_freq()
        self.target.reset_freq()

        has_legacy = legacy_test is not None
        train_pairs = []
        if has_legacy:
            held_out = Corpus(*self.distinct_from_parallel(legacy_test), source_name=self.source_language,
                              target_name=self.target_language)
        else:
            held_out = []

        # Fall back to "everything beyond the training quota" when no usable limit was given.
        if test_max_size is None or not isinstance(test_max_size, (int, float)):
            test_max_size = len(self.source.data) - self.n_train

        if source_vocab_size is None and target_vocab_size is None:
            candidates = list(zip(self.source.data, self.target.data))
        else:
            candidates = list(self._get_vocab_data(source_vocab_size=source_vocab_size,
                                                   target_vocab_size=target_vocab_size))

        if shuffle:
            random.shuffle(candidates)

        for src_sentence, tgt_sentence in candidates:
            # Skip blank lines on either side.
            if src_sentence == '' or tgt_sentence == '':
                continue
            # Keep sentences of the supplied test set out of the training data.
            if has_legacy and self.source.preprocess(src_sentence) in held_out.source.phrases_freq:
                continue
            # _can_go_test is evaluated first on purpose: the evaluation order of the original is kept.
            goes_to_test = (self._can_go_test(src_sentence, tgt_sentence)
                            and len(held_out) < test_max_size) and not has_legacy
            if goes_to_test:
                held_out.append([src_sentence, tgt_sentence])
                self._update_filters(src_sentence, tgt_sentence)
            else:
                train_pairs.append([src_sentence, tgt_sentence])

        if take_parallel_data is False:
            return (*get_cols(train_pairs), *get_cols(held_out))
        if take_corpus_instances is True:
            train_pairs = self.load_from_parallel_data(train_pairs, self.source_language, self.target_language)
            held_out = self.load_from_parallel_data(held_out, self.source_language, self.target_language)
        return train_pairs, held_out
Пример #2
0
 def __getitem__(self, item):
     """
     Look up a column by name, a ``(lines, col)`` selection, or defer to the parent lookup.

     :param item: str = column name that must be present in ``self.cols``;
                  tuple = ``(lines, col)``, where ``col`` is a column index and ``lines`` indexes into that column;
                  anything else is delegated to ``super().__getitem__``
     :return: the selected column, the selection inside a column, or the parent's result
     :raises ValueError: unknown column name, or ``col`` given as a tuple
     :raises AssertionError: ``col`` is greater than the last column index
     """
     if isinstance(item, str):
         if item not in self.cols:
             raise ValueError(f'Column name <{item}> not in {self.cols}')
         return get_cols(self.data)[self.cols.index(item)]
     if isinstance(item, tuple):
         lines, col = item
         if isinstance(col, tuple):
             raise ValueError("Isn't Possible.")
         # Raised explicitly rather than through an `assert` statement: asserts are removed
         # under `python -O`, which would silently drop this bound check. The AssertionError
         # type and its payload are kept so existing callers observe the same exception.
         if not col <= self.n_cols - 1:
             raise AssertionError(ValueError(
                     f"Invalid Column. Choice available index {list(range(self.n_cols))}"))
         return get_cols(self.data)[col][lines]
     return super().__getitem__(item)
Пример #3
0
 def to_dict(self):
     """
     Build a dictionary keyed by the names in ``self.cols``.

     Each name is paired, in order, with the matching entry of ``get_cols(self.lines)``.

     :return: dict = column name -> column content
     """
     names = self.cols
     columns = get_cols(self.lines)
     return {name: column for name, column in zip(names, columns)}
Пример #4
0
 def to_dict(self):
     """
     Build a dictionary keyed by the names in ``self.cols``.

     Each name is paired, in order, with the matching entry of ``get_cols(self.data)``.

     :return: dict = column name -> column content
     """
     names = self.cols
     columns = get_cols(self.data)
     return {name: column for name, column in zip(names, columns)}
Пример #5
0
 def distinct_from_parallel(cls, data):
     """
     Return ``get_cols(data)`` for the given parallel data.

     Thin wrapper: ``data`` is handed to ``get_cols`` and the result is returned unchanged.

     :param data: parallel data (presumably a sequence of (source, target) pairs -- confirm against callers)
     :return: whatever ``get_cols`` produces for ``data``
     """
     columns = get_cols(data)
     return columns