import json

from keras.preprocessing.text import Tokenizer


def tokenizerFromJson(json_string):
    """Parses a JSON tokenizer configuration string and returns a tokenizer instance.

    # Arguments
        json_string: JSON string encoding a tokenizer configuration.

    # Returns
        A Keras Tokenizer instance.
    """
    tokenizer_config = json.loads(json_string)
    config = tokenizer_config.get('config')

    word_counts = json.loads(config.pop('word_counts'))
    word_docs = json.loads(config.pop('word_docs'))
    index_docs = json.loads(config.pop('index_docs'))
    # Integer keys get converted to strings by json.dumps(), so cast them back.
    index_docs = {int(k): v for k, v in index_docs.items()}
    index_word = json.loads(config.pop('index_word'))
    index_word = {int(k): v for k, v in index_word.items()}
    word_index = json.loads(config.pop('word_index'))

    tokenizer = Tokenizer(**config)
    tokenizer.word_counts = word_counts
    tokenizer.word_docs = word_docs
    tokenizer.index_docs = index_docs
    tokenizer.word_index = word_index
    tokenizer.index_word = index_word

    return tokenizer
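# Minimal round-trip sketch for the helper above, assuming the standard Keras
# Tokenizer.to_json() serializer. The sample texts and the _example_* helper
# name are illustrative only, not part of the original module.
def _example_tokenizer_round_trip():
    texts = ['the cat sat on the mat', 'the dog sat on the log']
    source = Tokenizer()
    source.fit_on_texts(texts)

    json_string = source.to_json()            # config + vocab as one JSON string
    restored = tokenizerFromJson(json_string)

    assert restored.word_index == source.word_index
    assert restored.texts_to_sequences(texts) == source.texts_to_sequences(texts)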
def tokenize_sequences(self):
    # Character-level tokenizer over the amino-acid alphabet; the gap character
    # '-' is mapped to index 0, matching the padding value. Assumes numpy (np),
    # the Keras Tokenizer and amino_acid_alphabet are imported at module level.
    tk = Tokenizer(num_words=None, char_level=True, lower=False)
    tk.word_index = {aa: i + 1 for i, aa in enumerate(amino_acid_alphabet)}
    tk.word_index['-'] = 0
    tk.index_word = {i: aa for aa, i in tk.word_index.items()}

    self.df['sequence_tokenized'] = list(
        tk.texts_to_sequences(self.df['sequence'].tolist()))
    self.df['sequence_tokenized'] = self.df.apply(
        lambda row: np.array(row['sequence_tokenized'], dtype=np.uint8),
        axis=1)

    self.tokenizer = tk
    self.alphabet_size = len(self.tokenizer.word_index)
    return self
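# Standalone sketch of the same char-level encoding, runnable outside the class.
# The 20-letter alphabet, the sample sequence and the _example_* helper name are
# assumptions for illustration.
import numpy as np
from keras.preprocessing.text import Tokenizer


def _example_encode_protein_sequence():
    amino_acid_alphabet = 'ACDEFGHIKLMNPQRSTVWY'  # assumed 20-letter alphabet
    tk = Tokenizer(num_words=None, char_level=True, lower=False)
    tk.word_index = {aa: i + 1 for i, aa in enumerate(amino_acid_alphabet)}
    tk.word_index['-'] = 0                        # gap shares index 0 with padding
    tk.index_word = {i: aa for aa, i in tk.word_index.items()}

    encoded = np.array(tk.texts_to_sequences(['ACD-Y'])[0], dtype=np.uint8)
    return encoded  # expected: array([1, 2, 3, 0, 20], dtype=uint8)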
def _create_dataset(self):
    # Split the cleaned text/summary pairs, then tokenize and pad both sides.
    # Vocabularies and padded arrays are cached on disk (Dataset._PREPARED_*)
    # so repeated runs skip fitting and re-encoding. Assumes os, json, numpy
    # (np), Tokenizer, pad_sequences and train_test_split are imported at
    # module level.
    datapoint_train, datapoint_test, label_train, label_test = \
        train_test_split(self._data['cleaned_text'],
                         self._data['cleaned_summary'],
                         test_size=0.1, random_state=0, shuffle=True)

    datapoint_tokenizer = Tokenizer()
    label_tokenizer = Tokenizer()

    # Restore the datapoint vocabulary from cache if present, otherwise fit it.
    if os.path.exists(Dataset._PREPARED_TOKEN_DATA):
        with open(Dataset._PREPARED_TOKEN_DATA, 'r') as fp:
            datapoint_tokenizer.word_index = json.load(fp)
        datapoint_tokenizer.index_word = {
            i: word for word, i in datapoint_tokenizer.word_index.items()}
    else:
        datapoint_tokenizer.fit_on_texts(list(datapoint_train))
        with open(Dataset._PREPARED_TOKEN_DATA, 'w') as fp:
            json.dump(datapoint_tokenizer.word_index, fp)

    # Restore the label vocabulary from cache if present, otherwise fit it.
    if os.path.exists(Dataset._PREPARED_TOKEN_LABEL):
        with open(Dataset._PREPARED_TOKEN_LABEL, 'r') as fp:
            label_tokenizer.word_index = json.load(fp)
        label_tokenizer.index_word = {
            i: word for word, i in label_tokenizer.word_index.items()}
    else:
        label_tokenizer.fit_on_texts(list(label_train))
        with open(Dataset._PREPARED_TOKEN_LABEL, 'w') as fp:
            json.dump(label_tokenizer.word_index, fp)

    self.max_len_datapoint = 80

    if os.path.exists(Dataset._PREPARED_TRAIN_DATA):
        self.datapoint_train = np.load(Dataset._PREPARED_TRAIN_DATA)
    else:
        self.datapoint_train = pad_sequences(
            datapoint_tokenizer.texts_to_sequences(datapoint_train),
            maxlen=self.max_len_datapoint, padding='post')
        np.save(Dataset._PREPARED_TRAIN_DATA, self.datapoint_train)

    if os.path.exists(Dataset._PREPARED_TEST_DATA):
        self.datapoint_test = np.load(Dataset._PREPARED_TEST_DATA)
    else:
        self.datapoint_test = pad_sequences(
            datapoint_tokenizer.texts_to_sequences(datapoint_test),
            maxlen=self.max_len_datapoint, padding='post')
        np.save(Dataset._PREPARED_TEST_DATA, self.datapoint_test)

    self.datapoint_vocab_size = len(datapoint_tokenizer.word_index) + 1
    self.datapoint_tokenizer = datapoint_tokenizer
    self.label_tokenizer = label_tokenizer

    self.max_len_label = 10

    if os.path.exists(Dataset._PREPARED_TRAIN_LABEL):
        self.label_train = np.load(Dataset._PREPARED_TRAIN_LABEL)
    else:
        self.label_train = pad_sequences(
            label_tokenizer.texts_to_sequences(label_train),
            maxlen=self.max_len_label, padding='post')
        np.save(Dataset._PREPARED_TRAIN_LABEL, self.label_train)

    if os.path.exists(Dataset._PREPARED_TEST_LABEL):
        self.label_test = np.load(Dataset._PREPARED_TEST_LABEL)
    else:
        self.label_test = pad_sequences(
            label_tokenizer.texts_to_sequences(label_test),
            maxlen=self.max_len_label, padding='post')
        np.save(Dataset._PREPARED_TEST_LABEL, self.label_test)

    self.label_vocab_size = len(label_tokenizer.word_index) + 1
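# Sketch of decoding a padded label row back to words via index_word; this is
# why index_word is rebuilt whenever word_index is restored from the cached
# JSON. The sample summary text and the _example_* helper name are assumptions
# for illustration.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


def _example_decode_padded_labels():
    label_tokenizer = Tokenizer()
    label_tokenizer.fit_on_texts(['short summary of the text'])

    padded = pad_sequences(
        label_tokenizer.texts_to_sequences(['summary of the text']),
        maxlen=10, padding='post')

    # Index 0 is reserved for padding, so skip it when decoding.
    return ' '.join(label_tokenizer.index_word[i]
                    for i in padded[0] if i != 0)   # -> 'summary of the text'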