Example #1
0
def tokenizer_from_json(json_string):
    """Restore a Keras ``Tokenizer`` from its JSON representation.

    # Arguments
        json_string: JSON string encoding a tokenizer configuration.
    # Returns
        A Keras Tokenizer instance
    """
    config = json.loads(json_string).get('config')

    # The vocabulary dicts are stored as nested JSON strings inside the
    # config; decode each one before handing the rest to the constructor.
    word_counts = json.loads(config.pop('word_counts'))
    word_docs = json.loads(config.pop('word_docs'))
    raw_index_docs = json.loads(config.pop('index_docs'))
    raw_index_word = json.loads(config.pop('index_word'))
    word_index = json.loads(config.pop('word_index'))

    # json.dumps() turns integer dict keys into strings; cast them back
    # so index-based lookups keep working.
    index_docs = {int(idx): docs for idx, docs in raw_index_docs.items()}
    index_word = {int(idx): word for idx, word in raw_index_word.items()}

    tokenizer = Tokenizer(**config)
    tokenizer.word_counts = word_counts
    tokenizer.word_docs = word_docs
    tokenizer.index_docs = index_docs
    tokenizer.word_index = word_index
    tokenizer.index_word = index_word

    return tokenizer
Example #2
0
 def loadTokenzier(self, directory):
     """Load a tokenizer saved as JSON and set vocabulary bookkeeping.

     Side effects: sets ``self.START_TOKEN``, ``self.END_TOKEN`` and
     ``self.VOCAB_SIZE`` from the loaded vocabulary size.

     # Arguments
         directory: path of the JSON file holding the tokenizer state.
     # Returns
         The reconstructed Tokenizer instance.
     """
     with open(directory, encoding='UTF-8-sig') as fh:
         state = json.load(fh)

     tk = Tokenizer()
     # Copy every saved attribute straight onto the fresh tokenizer.
     for attr, value in state.items():
         setattr(tk, attr, value)

     vocab_size = len(tk.word_index) + 1
     # Reserve two extra ids right after the vocabulary for the
     # start-of-sequence and end-of-sequence tokens.
     self.START_TOKEN = [vocab_size]
     self.END_TOKEN = [vocab_size + 1]
     self.VOCAB_SIZE = vocab_size + 2

     # JSON serialization stringifies the integer keys of index_word;
     # restore them to ints so id -> word lookups work when decoding
     # tokens back into text.
     tk.index_word = {int(idx): word for idx, word in tk.index_word.items()}
     return tk
Example #3
0
def tokenizer_from_json(json_string):
    """Reconstruct a Keras Tokenizer from its JSON config string.

    The vocabulary dictionaries are stored as nested JSON strings inside
    the config, so each one is decoded separately and attached to a
    freshly constructed Tokenizer.
    """
    config = json.loads(json_string).get('config')

    # Pop the serialized vocabulary fields out of the constructor config
    # and decode each nested JSON string.
    vocab = {
        field: json.loads(config.pop(field))
        for field in ('word_counts', 'word_docs', 'index_docs',
                      'index_word', 'word_index')
    }

    tokenizer = Tokenizer(**config)
    tokenizer.word_counts = vocab['word_counts']
    tokenizer.word_docs = vocab['word_docs']
    # json.dumps() converts integer keys to strings; cast them back.
    tokenizer.index_docs = {int(k): v for k, v in vocab['index_docs'].items()}
    tokenizer.index_word = {int(k): v for k, v in vocab['index_word'].items()}
    tokenizer.word_index = vocab['word_index']

    return tokenizer
Example #4
0
    word_counts = json.loads(config.pop('word_counts'))
    word_docs   = json.loads(config.pop('word_docs'))
    index_docs  = json.loads(config.pop('index_docs'))
    
    # Integer indexing gets converted to strings with json.dumps()
    index_docs = {int(k): v for k, v in index_docs.items()}
    index_word = json.loads(config.pop('index_word'))
    index_word = {int(k): v for k, v in index_word.items()}
    word_index = json.loads(config.pop('word_index'))
​
    tokenizer             = Tokenizer(**config)
    tokenizer.word_counts = word_counts
    tokenizer.word_docs   = word_docs
    tokenizer.index_docs  = index_docs
    tokenizer.word_index  = word_index
    tokenizer.index_word  = index_word
​
​
    return tokenizer
​
​
​
​
​
def create_tf_example_row(input_row):
​
    # convert to string
    password = str(input_row[0])
​
    # create tf example
    tf_example = tf.train.Example(features=tf.train.Features(feature={