def prepare_sequence(self, sentences, sos=False, eos=False, add_noise=False):
    """Prepare inputs to feed the model"""

    n_sen = list(sentences).copy()
    sos = self.tokenizer.SOS_TK if sos else ""
    eos = self.tokenizer.EOS_TK if eos else ""

    for i in range(len(n_sen)):
        # optionally corrupt the sentence before encoding
        if add_noise:
            n_sen[i] = pp.add_noise([n_sen[i]], self.tokenizer.maxlen)[0]

        # truncate to leave room for SOS/EOS tokens, encode, then right-pad to maxlen
        n_sen[i] = self.tokenizer.encode(sos + n_sen[i][:self.tokenizer.maxlen - 2] + eos)
        n_sen[i] = np.pad(n_sen[i], (0, self.tokenizer.maxlen - len(n_sen[i])))

        if self.one_hot_process:
            n_sen[i] = self.tokenizer.encode_one_hot(n_sen[i])

    return np.asarray(n_sen, dtype=np.int16)
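
# Illustrative usage sketch only (names below are assumptions, not from this file):
# given `dtgen`, an instance of the generator class that owns prepare_sequence above,
# a noisy input batch and its clean target batch could be built as
#
#   x = dtgen.prepare_sequence(batch_sentences, sos=True, eos=True, add_noise=True)
#   y = dtgen.prepare_sequence(batch_sentences, sos=True, eos=True)
#
# each call returns a (batch, maxlen) int16 array, or one-hot encoded arrays
# when `one_hot_process` is enabled.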
raw_path = os.path.join("..", "raw")
data_path = os.path.join("..", "data")
source_path = os.path.join(data_path, f"{args.source}.txt")
output_path = os.path.join("..", "output", args.source, args.mode)
target_path = os.path.join(output_path, "checkpoint_weights.hdf5")

max_text_length = 128
charset_base = string.printable[:95]
charset_special = """ÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝàáâãäåçèéêëìíîïñòóôõöùúûüý"""

if args.transform:
    data = Dataset(source=os.path.join(raw_path, args.source))
    data.read_lines(maxlen=max_text_length)

    valid_noised = pp.add_noise(data.dataset['valid'], max_text_length)
    test_noised = pp.add_noise(data.dataset['test'], max_text_length)

    valid_metrics = ev.ocr_metrics(ground_truth=data.dataset['valid'], data=valid_noised)

    info = "\n".join([
        f"####",
        f"#### {args.source} partitions (number of sentences)",
        f"####",
        f"#### Total: {data.size['total']}",
        f"####",
        f"#### Train: {data.size['train']}",
        f"#### Validation: {data.size['valid']}",
        f"####\n",
        f"#### Validation Error Rate:",