def _create_embeddings(self, embeddings_set, vocabs):
    """Build one embedding model per vocabulary key.

    Reads ``unif``, ``keep_unused``, ``word_embeddings``, ``charsz`` and
    ``extended_embed_info`` from ``self.config_params``.

    :param embeddings_set: Mapping from an embedding label to a dict with
        ``'file'`` and ``'dsz'`` entries and an optional ``'sha1'`` entry.
    :param vocabs: Mapping from a feature key (``'word'``, ``'char'`` or an
        extended feature name) to that feature's vocabulary.
    :return: A tuple ``(embeddings, out_vocabs)``. ``embeddings`` maps each
        key to its embedding model and ``out_vocabs`` maps the same key to
        that model's ``vocab`` attribute.
    :raises Exception: If ``vocabs`` contains a key other than ``'word'`` or
        ``'char'`` that has no entry in ``extended_embed_info``.
    """
    unif = self.config_params['unif']
    keep_unused = self.config_params.get('keep_unused', False)

    # Bound unconditionally: the 'char' branch, the extended-feature loop and
    # the final vocab extraction all use it even when there is no 'word' key.
    embeddings = {}

    if 'word' in vocabs:
        embeddings_section = self.config_params['word_embeddings']
        embed_label = embeddings_section.get('label', None)
        if embed_label is not None:
            # A label selects an entry of the embeddings index to load from file.
            embed_entry = embeddings_set[embed_label]
            embed_file = embed_entry['file']
            embed_dsz = embed_entry['dsz']
            embed_sha1 = embed_entry.get('sha1', None)
            embeddings['word'] = Task._create_embeddings_from_file(
                embed_file, embed_dsz, embed_sha1,
                self.data_download_cache, vocabs['word'],
                unif=unif, keep_unused=keep_unused)
        else:
            # No label: random initialization with the configured dimension.
            dsz = embeddings_section['dsz']
            embeddings['word'] = baseline.RandomInitVecModel(
                dsz, vocabs['word'], unif_weight=unif)

    if 'char' in vocabs:
        # Character embeddings are only created for a positive 'charsz'.
        if self.config_params.get('charsz', -1) > 0:
            embeddings['char'] = baseline.RandomInitVecModel(
                self.config_params['charsz'], vocabs['char'], unif_weight=unif)

    extended_embed_info = self.config_params.get('extended_embed_info', {})
    for key, vocab in vocabs.items():
        if key in extended_embed_info:
            print('Adding extended feature embeddings {}'.format(key))
            ext_embed = extended_embed_info[key].get("embedding")
            ext_emb_dsz = extended_embed_info[key].get("dsz", None)
            if ext_embed is not None:
                # The reader class is picked from the file extension.
                EmbeddingT = baseline.GloVeModel if ext_embed.endswith('.txt') else baseline.Word2VecModel
                print("using {} to read external embedding file {}".format(EmbeddingT, ext_embed))
                embeddings[key] = EmbeddingT(ext_embed, known_vocab=vocab,
                                             unif_weight=unif, keep_unused=False)
            else:
                print("randomly initializing external feature with dimension {}".format(ext_emb_dsz))
                embeddings[key] = baseline.RandomInitVecModel(
                    ext_emb_dsz, vocab, unif_weight=unif)
        elif key not in ['word', 'char']:
            # The message names the config key that is actually consulted above.
            raise Exception("Error: must specify a field '{}' in 'extended_embed_info' dictionary for embedding dim size".format(key))

    out_vocabs = {key: value.vocab for key, value in embeddings.items()}
    return embeddings, out_vocabs
def _initialize_embedding(self, dimensions_size, vocab):
    """Create a ``baseline.RandomInitVecModel`` for ``vocab``.

    :param dimensions_size: Passed as the first positional argument.
    :param vocab: Passed as the second positional argument.
    :return: The constructed model; ``False`` is passed as the third
        positional argument.
    """
    model = baseline.RandomInitVecModel(
        dimensions_size,
        vocab,
        False,
    )
    return model
def _run(self, sess, model_file, embeddings_set, output_dir, model_version):
    """Rebuild the classifier on a serialized-example input and export it.

    The graph takes a string placeholder named ``tf_example``, parses
    ``FIELD_NAME`` out of it, maps every entry through the function returned
    by ``self._preproc_post_creator()`` and feeds the result to the model.
    The export registers two signatures: ``'predict_text'`` and the default
    serving (classification) signature.

    :param sess: Session handed to the model and to ``self.restore_model``.
    :param model_file: Passed to ``read_vocab``, ``load_labels`` and
        ``restore_model``.
    :param embeddings_set: Mapping from embedding label to a dict holding
        ``'dsz'``.
    :param output_dir: Base export directory.
    :param model_version: Stringified and appended to ``output_dir`` to form
        the export path.
    """
    self.word2index, vocab = ClassifyTensorFlowExporter.read_vocab(model_file)
    labels = self.load_labels(model_file)

    # Make the TF example, network input
    serialized_tf_example = tf.placeholder(tf.string, name='tf_example')
    feature_configs = {
        FIELD_NAME: tf.FixedLenFeature(shape=[], dtype=tf.string),
    }
    tf_example = tf.parse_example(serialized_tf_example, feature_configs)
    raw_posts = tf_example[FIELD_NAME]

    dense = tf.map_fn(self._preproc_post_creator(), raw_posts, dtype=tf.int64)

    word_embeddings = self.task.config_params["word_embeddings"]
    dsz = embeddings_set[word_embeddings["label"]]["dsz"]
    # Randomly initialized table of the configured size.
    # NOTE(review): presumably overwritten by restore_model() below - confirm.
    init_vectors = baseline.RandomInitVecModel(dsz, vocab, False)
    print(len(init_vectors.weights), len(vocab), init_vectors.vsz)

    # Work on a shallow copy so the graph tensors and the session are not
    # written back into the task's shared configuration dictionary.
    model_params = dict(self.task.config_params["model"])
    model_params["x"] = dense
    model_params["pkeep"] = 1
    model_params["sess"] = sess
    print(model_params)
    model = baseline.tf.classify.create_model({'word': init_vectors}, labels, **model_params)

    softmax_output = tf.nn.softmax(model.logits)
    # k == number of labels, so top_k yields a score for every class.
    values, indices = tf.nn.top_k(softmax_output, len(labels))
    class_tensor = tf.constant(model.labels)
    table = tf.contrib.lookup.index_to_string_table_from_tensor(class_tensor)
    classes = table.lookup(tf.to_int64(indices))

    self.restore_model(sess, model_file)

    output_path = os.path.join(tf.compat.as_bytes(output_dir),
                               tf.compat.as_bytes(str(model_version)))
    print('Exporting trained model to %s' % output_path)
    builder = tf.saved_model.builder.SavedModelBuilder(output_path)

    # Build the signature_def_map.
    classify_inputs_tensor = tf.saved_model.utils.build_tensor_info(serialized_tf_example)
    classes_output_tensor = tf.saved_model.utils.build_tensor_info(classes)
    scores_output_tensor = tf.saved_model.utils.build_tensor_info(values)

    classification_signature = (
        tf.saved_model.signature_def_utils.build_signature_def(
            inputs={
                tf.saved_model.signature_constants.CLASSIFY_INPUTS:
                    classify_inputs_tensor
            },
            outputs={
                tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES:
                    classes_output_tensor,
                tf.saved_model.signature_constants.CLASSIFY_OUTPUT_SCORES:
                    scores_output_tensor
            },
            method_name=tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME))

    predict_inputs_tensor = tf.saved_model.utils.build_tensor_info(raw_posts)
    prediction_signature = (
        tf.saved_model.signature_def_utils.build_signature_def(
            inputs={'tokens': predict_inputs_tensor},
            outputs={
                'classes': classes_output_tensor,
                'scores': scores_output_tensor
            },
            method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME))

    # The label lookup table must be initialized when the export is loaded.
    legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
    builder.add_meta_graph_and_variables(
        sess, [tf.saved_model.tag_constants.SERVING],
        signature_def_map={
            'predict_text':
                prediction_signature,
            tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                classification_signature,
        },
        legacy_init_op=legacy_init_op)

    builder.save()
    print('Successfully exported model to %s' % output_dir)
def _run(self, sess, model_file, embeddings_set, output_dir, model_version):
    """Rebuild the tagger on a serialized-example input and export it.

    The graph takes a string placeholder named ``tf_example``, parses
    ``FIELD_NAME`` out of it and maps every entry through the function
    returned by ``self._preproc_post_creator()`` to obtain word ids, char ids
    and lengths. The export registers two signatures: ``'tag_text'`` and the
    default serving (classification) signature.

    Side effects on ``self``: sets ``word2index``, ``char2index``,
    ``lchars`` and ``upchars_lut``.

    :param sess: Session handed to the model and to ``self.restore_model``.
    :param model_file: Passed to ``read_vocab``, ``load_labels`` and
        ``restore_model``.
    :param embeddings_set: Mapping from embedding label to a dict holding
        ``'dsz'``.
    :param output_dir: Base export directory.
    :param model_version: Stringified and appended to ``output_dir`` to form
        the export path.
    """
    self.word2index, vocab_word = TaggerTensorFlowExporter.read_vocab(model_file, 'word')
    self.char2index, vocab_char = TaggerTensorFlowExporter.read_vocab(model_file, 'char')

    # Constants/lookup over ASCII 'A'-'Z' (65..90) and 'a'-'z' (97..122).
    # NOTE(review): presumably consumed by the preprocessing function created
    # below, so they are set up before it is built - confirm.
    upchars = tf.constant([chr(i) for i in range(65, 91)])
    self.lchars = tf.constant([chr(i) for i in range(97, 123)])
    self.upchars_lut = tf.contrib.lookup.index_table_from_tensor(
        mapping=upchars, num_oov_buckets=1, default_value=-1)

    labels = self.load_labels(model_file)

    # Make the TF example, network input
    serialized_tf_example = tf.placeholder(tf.string, name='tf_example')
    feature_configs = {
        FIELD_NAME: tf.FixedLenFeature(shape=[], dtype=tf.string),
    }
    tf_example = tf.parse_example(serialized_tf_example, feature_configs)
    raw_posts = tf_example[FIELD_NAME]

    # Run for each post
    x, xch, lengths = tf.map_fn(self._preproc_post_creator(), raw_posts,
                                dtype=(tf.int64, tf.int64, tf.int32),
                                back_prop=False)

    word_embeddings = self.task.config_params["word_embeddings"]
    dsz = embeddings_set[word_embeddings["label"]]["dsz"]
    char_dsz = self.task.config_params["charsz"]
    # Randomly initialized tables of the configured sizes.
    # NOTE(review): presumably overwritten by restore_model() below - confirm.
    init_word_vectors = baseline.RandomInitVecModel(dsz, vocab_word, False)
    init_char_vectors = baseline.RandomInitVecModel(char_dsz, vocab_char, False)
    embeddings = {
        'word': init_word_vectors,
        'char': init_char_vectors,
    }

    # WARNING: This can be a bug if the user defaults the values (-1)
    # for conll, the mxlen=124, for idr, the mxlen is forced to a max BPTT
    # for twpos, the mxlen=38
    # this should probably be fixed by serializing the mxlen of the model
    # or rereading it from the tensor from file
    mxlen = self.task.config_params['preproc']['mxlen']
    mxwlen = self.task.config_params['preproc']['mxwlen']

    # Work on a shallow copy so the graph tensors and the session are not
    # written back into the task's shared configuration dictionary.
    model_params = dict(self.task.config_params["model"])
    model_params["x"] = x
    model_params["xch"] = xch
    model_params["lengths"] = lengths
    model_params["pkeep"] = 1
    model_params["sess"] = sess
    model_params["maxs"] = mxlen
    model_params["maxw"] = mxwlen
    print(model_params)
    model = baseline.tf.tagger.create_model(labels, embeddings, **model_params)
    model.create_loss()

    softmax_output = tf.nn.softmax(model.probs)
    values, indices = tf.nn.top_k(softmax_output, 1)

    if model.crf is True:
        # With a CRF the tag indices come from decoding, not from top_k.
        indices, _ = tf.contrib.crf.crf_decode(
            model.probs, model.A, tf.constant([mxlen]))  ## We are assuming the batchsz is 1 here

    # Invert the label -> index mapping into an index-ordered list of names.
    list_of_labels = [''] * len(labels)
    for label, idval in labels.items():
        list_of_labels[idval] = label

    class_tensor = tf.constant(list_of_labels)
    table = tf.contrib.lookup.index_to_string_table_from_tensor(class_tensor)
    classes = table.lookup(tf.to_int64(indices))

    self.restore_model(sess, model_file)

    output_path = os.path.join(tf.compat.as_bytes(output_dir),
                               tf.compat.as_bytes(str(model_version)))
    print('Exporting trained model to %s' % output_path)
    builder = tf.saved_model.builder.SavedModelBuilder(output_path)

    # Build the signature_def_map.
    classify_inputs_tensor = tf.saved_model.utils.build_tensor_info(serialized_tf_example)
    classes_output_tensor = tf.saved_model.utils.build_tensor_info(classes)
    scores_output_tensor = tf.saved_model.utils.build_tensor_info(values)

    classification_signature = (
        tf.saved_model.signature_def_utils.build_signature_def(
            inputs={
                tf.saved_model.signature_constants.CLASSIFY_INPUTS:
                    classify_inputs_tensor
            },
            outputs={
                tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES:
                    classes_output_tensor,
                tf.saved_model.signature_constants.CLASSIFY_OUTPUT_SCORES:
                    scores_output_tensor
            },
            method_name=tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME))

    predict_inputs_tensor = tf.saved_model.utils.build_tensor_info(raw_posts)
    prediction_signature = (
        tf.saved_model.signature_def_utils.build_signature_def(
            inputs={'tokens': predict_inputs_tensor},
            outputs={
                'classes': classes_output_tensor,
                'scores': scores_output_tensor
            },
            method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME))

    # The lookup tables must be initialized when the export is loaded.
    legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
    builder.add_meta_graph_and_variables(
        sess, [tf.saved_model.tag_constants.SERVING],
        signature_def_map={
            'tag_text':
                prediction_signature,
            tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                classification_signature,
        },
        legacy_init_op=legacy_init_op)

    builder.save()
    print('Successfully exported model to %s' % output_dir)