def tokenizer_from_json(json_string): """Parses a JSON tokenizer configuration file and returns a tokenizer instance. # Arguments json_string: JSON string encoding a tokenizer configuration. # Returns A Keras Tokenizer instance """ tokenizer_config = json.loads(json_string) config = tokenizer_config.get('config') word_counts = json.loads(config.pop('word_counts')) word_docs = json.loads(config.pop('word_docs')) index_docs = json.loads(config.pop('index_docs')) # Integer indexing gets converted to strings with json.dumps() index_docs = {int(k): v for k, v in index_docs.items()} index_word = json.loads(config.pop('index_word')) index_word = {int(k): v for k, v in index_word.items()} word_index = json.loads(config.pop('word_index')) tokenizer = Tokenizer(**config) tokenizer.word_counts = word_counts tokenizer.word_docs = word_docs tokenizer.index_docs = index_docs tokenizer.word_index = word_index tokenizer.index_word = index_word return tokenizer
def load_tokenizer_from_file(filename): tokenizer = Tokenizer() with open(filename, 'r') as infile: tokenizer_data = json.load(infile) tokenizer.word_counts = OrderedDict(tokenizer_data['word_counts']) tokenizer.word_docs = tokenizer_data['word_docs'] tokenizer.word_index = tokenizer_data['word_index'] tokenizer.document_count = tokenizer_data['document_count'] tokenizer.index_docs = tokenizer_data['index_docs'] return tokenizer
def tokenizer_from_json(json_string): tokenizer_config = json.loads(json_string) config = tokenizer_config.get('config') word_counts = json.loads(config.pop('word_counts')) word_docs = json.loads(config.pop('word_docs')) index_docs = json.loads(config.pop('index_docs')) # Integer indexing gets converted to strings with json.dumps() index_docs = {int(k): v for k, v in index_docs.items()} index_word = json.loads(config.pop('index_word')) index_word = {int(k): v for k, v in index_word.items()} word_index = json.loads(config.pop('word_index')) tokenizer = Tokenizer(**config) tokenizer.word_counts = word_counts tokenizer.word_docs = word_docs tokenizer.index_docs = index_docs tokenizer.word_index = word_index tokenizer.index_word = index_word return tokenizer
tokenizer_config = json.loads(json_string) config = tokenizer_config.get('config') word_counts = json.loads(config.pop('word_counts')) word_docs = json.loads(config.pop('word_docs')) index_docs = json.loads(config.pop('index_docs')) # Integer indexing gets converted to strings with json.dumps() index_docs = {int(k): v for k, v in index_docs.items()} index_word = json.loads(config.pop('index_word')) index_word = {int(k): v for k, v in index_word.items()} word_index = json.loads(config.pop('word_index')) tokenizer = Tokenizer(**config) tokenizer.word_counts = word_counts tokenizer.word_docs = word_docs tokenizer.index_docs = index_docs tokenizer.word_index = word_index tokenizer.index_word = index_word return tokenizer def create_tf_example_row(input_row): # convert to string