import os
import random
from functools import reduce

# NOTE: create_dir, load_pkl, the data_dir constant, and the news_commentary
# corpus loader are project helpers defined elsewhere in the repo and are
# assumed to be imported at module level.


def __init__(self, data_params=None, tokenizer_pl=None, _tokenizer_dir='only_news_commentary'):
    # initialize variables (None defaults avoid the mutable-default-argument pitfall)
    self.__data_params = data_params or {}
    self.__tokenizer_pl = tokenizer_pl or []

    self.tokenizer_dir = f'{_tokenizer_dir}_{self.__data_params["vocab_size"]}'
    self.tokenizer_path = os.path.join(
        create_dir(data_dir, 'tokenizer', self.tokenizer_dir), 'tokenizer.pkl')

    # if a cached tokenizer already exists, there is nothing to build
    if os.path.isfile(self.tokenizer_path):
        return

    # load data from files
    data = news_commentary.zh_en()
    data = self.__split_data(data, 0., self.NMT_TRAIN_RATIO)
    data = reduce(lambda x, y: x + y, data)

    # shuffle the data
    random.seed(self.RANDOM_STATE)
    random.shuffle(data)

    # fit the tokenizer on the source and target sides of the corpus
    self.__tokenizer_src, self.__tokenizer_tar = list(zip(*data))
    self.get_tokenizer()
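
# Usage sketch for the constructor above. The class name TokenizerLoader and
# the empty pipeline are illustrative assumptions, not names from the repo;
# only vocab_size is actually required by the code shown.
#
#     loader = TokenizerLoader(data_params={'vocab_size': 60000},
#                              tokenizer_pl=[],
#                              _tokenizer_dir='only_news_commentary')
#
# The tokenizer is cached under data_dir/tokenizer/only_news_commentary_60000/,
# so a second construction with the same arguments returns early.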
def __init__(self, start_ratio=0.0, end_ratio=0.8, sample_rate=1.0):
    # load data from files
    data = news_commentary.zh_en()
    data = self.__split_data(data, start_ratio, end_ratio)
    data = reduce(lambda x, y: x + y, data)

    # shuffle the data
    random.seed(self.RANDOM_STATE)
    random.shuffle(data)

    # down-sample if the data set is too big (simulates a low-resource setting)
    data = self.sample_data(data, sample_rate)

    self.__src_data, self.__tar_data = list(zip(*data))
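
# A minimal sketch of sample_data as it is used above, assuming it keeps a
# random fraction of the sentence pairs; the repo's actual implementation
# may round or sample differently.
def sample_data(self, data, sample_rate=1.0):
    if sample_rate >= 1.0:
        return data
    # random.seed(self.RANDOM_STATE) was already called by the caller,
    # so the subsample is reproducible across runs
    return random.sample(data, int(len(data) * sample_rate))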
def __init__(self, start_ratio=0.0, end_ratio=0.98, _sample_rate=1.0,
             data_params=None, tokenizer_pl=None, encoder_pl=None,
             _tokenizer_dir='cdlm', _dataset='cdlm'):
    # initialize variables (None defaults avoid the mutable-default-argument pitfall)
    self.__data_params = data_params or {}
    self.__tokenizer_pl = tokenizer_pl or []
    self.__encoder_pl = encoder_pl or []
    self.__sample_rate = _sample_rate
    self.__tokenizer_path = os.path.join(
        create_dir(data_dir, 'tokenizer', _tokenizer_dir), 'tokenizer.pkl')
    self.__processed_dir_path = create_dir(data_dir, 'preprocessed', _dataset)

    # load data from files
    data = news_commentary.zh_en()
    data = self.__split_data(data, 0., self.NMT_TRAIN_RATIO)

    # shuffle the data
    random.seed(self.RANDOM_STATE)
    random.shuffle(data)

    # load a cached tokenizer if one exists; otherwise fit a new one
    # on the full training portion of the corpus
    if os.path.isfile(self.__tokenizer_path):
        self.__tokenizer = load_pkl(self.__tokenizer_path)
    else:
        tmp_data = reduce(lambda x, y: x + y, data)
        self.__tokenizer_src, self.__tokenizer_tar = list(zip(*tmp_data))
        self.get_tokenizer()

    # get the data set (train or validation or test)
    data = self.__split_data(data, start_ratio, end_ratio)
    data = reduce(lambda x, y: x + y, data)

    self.gen_preprocessed_data(data, self.BATCH_SIZE_PER_FILE)
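
# A minimal sketch of the gen_preprocessed_data step invoked above, assuming
# it runs the encoder pipeline over fixed-size batches and writes one shard
# file per batch into the preprocessed directory. The stage signature and the
# write_pkl helper are assumptions, not the repo's actual code.
def gen_preprocessed_data(self, data, batch_size_per_file):
    for i in range(0, len(data), batch_size_per_file):
        batch = data[i:i + batch_size_per_file]
        for step in self.__encoder_pl:
            batch = step(batch, self.__tokenizer)  # assumed stage signature
        file_path = os.path.join(self.__processed_dir_path,
                                 f'{i // batch_size_per_file}.pkl')
        write_pkl(file_path, batch)  # hypothetical pickle-writing helper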
def __init__(self, start_ratio=0.0, end_ratio=0.98, _dataset='cdlm'):
    # initialize variables
    self.__processed_dir_path = create_dir(data_dir, 'un_preprocessed', _dataset)

    # load data from files
    data = news_commentary.zh_en()
    data = self.__split_data(data, 0., self.NMT_TRAIN_RATIO)

    # shuffle the groups before slicing
    random.seed(self.RANDOM_STATE)
    random.shuffle(data)

    # get the data set (train or validation or test)
    data = self.__split_data(data, start_ratio, end_ratio)
    data = reduce(lambda x, y: x + y, data)

    # shuffle again after flattening into sentence pairs
    random.seed(self.RANDOM_STATE)
    random.shuffle(data)

    self.gen_data(data, self.BATCH_SIZE_PER_FILE)
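
# Usage sketch for the ratio arguments above. The class name is illustrative;
# the point is that one constructor yields train, validation, and test shards
# purely through contiguous, non-overlapping ratio ranges, e.g. the first 98%
# for training and the remaining 2% split between validation and test.
#
#     train_set = UnPreprocessedLoader(start_ratio=0.0, end_ratio=0.98)
#     val_set = UnPreprocessedLoader(start_ratio=0.98, end_ratio=0.99)
#     test_set = UnPreprocessedLoader(start_ratio=0.99, end_ratio=1.0)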
def __load_from_news_commentary(self):
    data = news_commentary.zh_en()
    data = self.__split_data(data, 0., self.NMT_TRAIN_RATIO_NEWS_COMMENTARY)
    return reduce(lambda x, y: x + y, data)
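
# A minimal sketch of __split_data as it is used throughout this file,
# assuming the corpus arrives as a list of groups of (zh, en) pairs and the
# ratios select a contiguous slice of those groups, which callers then
# flatten with reduce. The real helper may round or partition differently.
def __split_data(self, data, start_ratio, end_ratio):
    start = int(len(data) * start_ratio)
    end = int(len(data) * end_ratio)
    return data[start:end]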