import json

from keras.preprocessing.text import Tokenizer


def tokenizerFromJson(json_string):
    """Parses a JSON tokenizer configuration string and returns a
    tokenizer instance.
    # Arguments
        json_string: JSON string encoding a tokenizer configuration.
    # Returns
        A Keras Tokenizer instance.
    """
    tokenizer_config = json.loads(json_string)
    config = tokenizer_config.get('config')
    config = tokenizer_config.get('config')

    word_counts = json.loads(config.pop('word_counts'))
    word_docs = json.loads(config.pop('word_docs'))
    index_docs = json.loads(config.pop('index_docs'))
    # Integer indexing gets converted to strings with json.dumps()
    index_docs = {int(k): v for k, v in index_docs.items()}
    index_word = json.loads(config.pop('index_word'))
    index_word = {int(k): v for k, v in index_word.items()}
    word_index = json.loads(config.pop('word_index'))

    tokenizer = Tokenizer(**config)
    tokenizer.word_counts = word_counts
    tokenizer.word_docs = word_docs
    tokenizer.index_docs = index_docs
    tokenizer.word_index = word_index
    tokenizer.index_word = index_word

    return tokenizer
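
# A minimal round-trip sketch for the function above, pairing it with Keras'
# built-in Tokenizer.to_json() serializer; the toy corpus is made up purely
# for illustration.
texts = ['the cat sat on the mat', 'the dog ate my homework']

original = Tokenizer(num_words=100, oov_token='<unk>')
original.fit_on_texts(texts)

# to_json() emits the JSON structure that tokenizerFromJson() parses back.
restored = tokenizerFromJson(original.to_json())

assert restored.word_index == original.word_index
assert restored.texts_to_sequences(texts) == original.texts_to_sequences(texts)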
Example 2
def tokenize_sequences(self):
    # Character-level tokenizer with a fixed vocabulary built from the
    # module-level amino_acid_alphabet; '-' (gap) is mapped to index 0,
    # the same index used for padding.
    tk = Tokenizer(num_words=None, char_level=True, lower=False)
    tk.word_index = {aa: i + 1 for i, aa in enumerate(amino_acid_alphabet)}
    tk.word_index['-'] = 0
    tk.index_word = {i: aa for aa, i in tk.word_index.items()}
    self.df['sequence_tokenized'] = list(
        tk.texts_to_sequences(self.df['sequence'].tolist()))
    # Store each tokenized sequence as a compact uint8 array.
    self.df['sequence_tokenized'] = self.df.apply(
        lambda row: np.array(row['sequence_tokenized'], dtype=np.uint8),
        axis=1)
    self.tokenizer = tk
    self.alphabet_size = len(self.tokenizer.word_index)
    return self
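
# Hedged sanity-check sketch for the character-level setup above. The real
# `amino_acid_alphabet` lives elsewhere in the source module; a plausible
# stand-in (the canonical 20 one-letter residue codes) is assumed here purely
# to make the example self-contained.
from keras.preprocessing.text import Tokenizer

amino_acid_alphabet = 'ACDEFGHIKLMNPQRSTVWY'  # stand-in alphabet (assumption)

tk = Tokenizer(num_words=None, char_level=True, lower=False)
tk.word_index = {aa: i + 1 for i, aa in enumerate(amino_acid_alphabet)}
tk.word_index['-'] = 0          # gap character shares index 0 with padding
tk.index_word = {i: aa for aa, i in tk.word_index.items()}

print(tk.texts_to_sequences(['ACD-']))   # [[1, 2, 3, 0]] with this stand-in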
    def _create_dataset(self):
        # 90/10 train/test split of (cleaned_text, cleaned_summary) pairs.
        datapoint_train, datapoint_test, label_train, label_test = \
            train_test_split(self._data['cleaned_text'],
                             self._data['cleaned_summary'],
                             test_size=0.1, random_state=0, shuffle=True)

        datapoint_tokenizer = Tokenizer()
        label_tokenizer = Tokenizer()

        # Reuse a cached word_index if one exists; otherwise fit the
        # tokenizer on the training texts and cache the mapping.
        if os.path.exists(Dataset._PREPARED_TOKEN_DATA):
            with open(Dataset._PREPARED_TOKEN_DATA, 'r') as fp:
                datapoint_tokenizer.word_index = json.load(fp)
            datapoint_tokenizer.index_word = {
                i: word
                for word, i in datapoint_tokenizer.word_index.items()
            }
        else:
            datapoint_tokenizer.fit_on_texts(list(datapoint_train))
            with open(Dataset._PREPARED_TOKEN_DATA, 'w') as fp:
                json.dump(datapoint_tokenizer.word_index, fp)

        # Same caching scheme for the label (summary) tokenizer.
        if os.path.exists(Dataset._PREPARED_TOKEN_LABEL):
            with open(Dataset._PREPARED_TOKEN_LABEL, 'r') as fp:
                label_tokenizer.word_index = json.load(fp)
            label_tokenizer.index_word = {
                i: word for word, i in label_tokenizer.word_index.items()
            }
        else:
            label_tokenizer.fit_on_texts(list(label_train))
            with open(Dataset._PREPARED_TOKEN_LABEL, 'w') as fp:
                json.dump(label_tokenizer.word_index, fp)

        # Inputs are padded/truncated to a fixed length of 80 tokens; the
        # padded arrays below are cached on disk and reused on later runs.
        self.max_len_datapoint = 80

        if os.path.exists(Dataset._PREPARED_TRAIN_DATA):
            self.datapoint_train = np.load(Dataset._PREPARED_TRAIN_DATA)
        else:
            self.datapoint_train = pad_sequences(
                datapoint_tokenizer.texts_to_sequences(datapoint_train),
                maxlen=self.max_len_datapoint,
                padding='post')
            np.save(Dataset._PREPARED_TRAIN_DATA, self.datapoint_train)

        if os.path.exists(Dataset._PREPARED_TEST_DATA):
            self.datapoint_test = np.load(Dataset._PREPARED_TEST_DATA)
        else:
            self.datapoint_test = pad_sequences(
                datapoint_tokenizer.texts_to_sequences(datapoint_test),
                maxlen=self.max_len_datapoint,
                padding='post')
            np.save(Dataset._PREPARED_TEST_DATA, self.datapoint_test)

        self.datapoint_vocab_size = len(datapoint_tokenizer.word_index) + 1
        self.datapoint_tokenizer = datapoint_tokenizer
        self.label_tokenizer = label_tokenizer

        # Labels (summaries) are padded/truncated to 10 tokens.
        self.max_len_label = 10
        if os.path.exists(Dataset._PREPARED_TRAIN_LABEL):
            self.label_train = np.load(Dataset._PREPARED_TRAIN_LABEL)
        else:
            self.label_train = pad_sequences(
                label_tokenizer.texts_to_sequences(label_train),
                maxlen=self.max_len_label,
                padding='post')
            np.save(Dataset._PREPARED_TRAIN_LABEL, self.label_train)

        if os.path.exists(Dataset._PREPARED_TEST_LABEL):
            self.label_test = np.load(Dataset._PREPARED_TEST_LABEL)
        else:
            self.label_test = pad_sequences(
                label_tokenizer.texts_to_sequences(label_test),
                maxlen=self.max_len_label,
                padding='post')
            np.save(Dataset._PREPARED_TEST_LABEL, self.label_test)

        self.label_vocab_size = len(label_tokenizer.word_index) + 1
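
# Hedged usage sketch: `inspect_dataset` is a hypothetical helper, assuming
# `ds` is a Dataset instance on which _create_dataset() has already run
# (construction details depend on the rest of the class and are omitted).
# The attribute names are the ones set in the method above.
def inspect_dataset(ds):
    x_train, y_train = ds.datapoint_train, ds.label_train  # shapes (N, 80), (N, 10)
    print('input vocab size:', ds.datapoint_vocab_size,
          '| label vocab size:', ds.label_vocab_size)
    # Padded rows can be mapped back to words for a quick check; index 0
    # (padding) has no entry in index_word and is silently skipped.
    print(ds.label_tokenizer.sequences_to_texts(y_train[:3].tolist()))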