def print_ckpt_tensor_name(checkpoint_path):
    """Print every tensor name and shape found in the checkpoint directory."""
    num_tensor = 0
    ckpt = 'ckpt'
    checkpoint_name = None
    os.chdir(checkpoint_path)
    # Derive the checkpoint prefix (e.g. "model.ckpt-1000") from any file containing "ckpt".
    for each_file in os.listdir(os.curdir):
        if ckpt in each_file:
            checkpoint_name = each_file.split(ckpt)[0] + ckpt + each_file.split(ckpt)[1].split('.')[0]
            break
    if checkpoint_name is None:
        return
    model_reader = pywrap_tensorflow.NewCheckpointReader(checkpoint_name)
    var_dict = model_reader.get_variable_to_shape_map()
    for key in var_dict:
        num_tensor += 1
        print(key + " " + str(model_reader.get_tensor(key).shape))
        if key == 'bert/encoder/layer_2/attention/self/qkv_weight' \
                or key == 'bert/embeddings/word_embeddings' \
                or key == 'bert/pooler/dense/kernel':
            print(model_reader.get_tensor(key))
    print(num_tensor)
def get_embedding():
    model_dir = '/home/zpl/Model_rnn/whole_256_3/model_longtime' + '-%d' % 16
    reader = pywrap_tensorflow.NewCheckpointReader(model_dir)
    var_to_shape_map = reader.get_variable_to_shape_map()
    for key in var_to_shape_map.keys():
        # print(var_to_shape_map[key])
        if key == 'model/embedding':
            # print("tensor_name: ", key)
            embedding = reader.get_tensor(key)
            return embedding
def convert_compare_ipu_gpu(ckpt_a, ckpt_b):
    graph = tf.Graph()
    reader_a = pywrap_tensorflow.NewCheckpointReader(ckpt_a)
    reader_b = pywrap_tensorflow.NewCheckpointReader(ckpt_b)
    var_to_shape_map_a = reader_a.get_variable_to_shape_map()
    var_to_shape_map_b = reader_b.get_variable_to_shape_map()
    with graph.as_default():
        sess = tf.Session()
        for tensor_name in var_to_shape_map_a:
            try:
                tensor_value_a = reader_a.get_tensor(tensor_name)
                tensor_value_b = reader_b.get_tensor(tensor_name)
                # Report tensors whose values differ between the two checkpoints.
                if not np.array_equal(tensor_value_a, tensor_value_b):
                    print(tensor_name)
            except Exception:
                print("Not found tensor:{}".format(tensor_name))
        print("finish compare!")
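# Hedged sketch (not part of the original utilities, paths and the function name are
# hypothetical): compare two checkpoints with a tolerance and report per-tensor maximum
# absolute differences instead of a plain equality check. Only uses the same
# pywrap_tensorflow reader API and numpy calls already used in this file.
def compare_ckpt_values(ckpt_a, ckpt_b, atol=1e-6):
    reader_a = pywrap_tensorflow.NewCheckpointReader(ckpt_a)
    reader_b = pywrap_tensorflow.NewCheckpointReader(ckpt_b)
    common = set(reader_a.get_variable_to_shape_map()) & set(reader_b.get_variable_to_shape_map())
    for name in sorted(common):
        a, b = reader_a.get_tensor(name), reader_b.get_tensor(name)
        if a.shape != b.shape:
            print("shape mismatch: {} {} vs {}".format(name, a.shape, b.shape))
        elif not np.allclose(a, b, atol=atol):
            print("value mismatch: {} max abs diff {}".format(name, np.abs(a - b).max()))

# Example call with hypothetical paths:
# compare_ckpt_values("/path/to/ipu/model.ckpt-100", "/path/to/gpu/model.ckpt-100")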
def print_vars(data_type=np.float16):
    checkpoint_name = 'ckpt_noshuffDIEN3'
    curent_dir = os.path.abspath(os.path.join(os.getcwd(), "..")) + '/'
    out_dir = curent_dir + "model-F16" + '/'
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    reader = pywrap_tensorflow.NewCheckpointReader(args.ckpt)
    var_to_map = reader.get_variable_to_shape_map()
    val_f = {}
    for key, dim in var_to_map.items():
        val_f[key.strip(":0")] = tf.Variable(reader.get_tensor(key).astype(data_type))

    # Dump parameters before the conversion.
    param_log_origin = ''
    for key in var_to_map:
        param_log_origin += "tensor_name: " + key + " shape:" + str(reader.get_tensor(key).shape) + "\r\n"
        param_log_origin += str(reader.get_tensor(key)) + "\r\n"
    writer = open(out_dir + 'Param-' + str(reader.get_tensor(key).dtype) + '.txt', 'w', encoding="utf-8")
    writer.write(param_log_origin)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        new_saver = tf.train.import_meta_graph(args.ckpt + '.meta')
        new_saver.restore(sess, args.ckpt)
        saver = tf.train.Saver(val_f)
        saver.save(sess, out_dir + checkpoint_name)

    # Dump parameters after the conversion.
    reader_convert = pywrap_tensorflow.NewCheckpointReader(out_dir + checkpoint_name)
    var_to_map_convert = reader_convert.get_variable_to_shape_map()
    param_log_convert = ''
    for item in var_to_map_convert:
        param_log_convert += "tensor_name: " + item + " shape:" + str(reader_convert.get_tensor(item).shape) + "\r\n"
        param_log_convert += str(reader_convert.get_tensor(item)) + "\r\n"
    writer = open(out_dir + 'Param-' + str(reader_convert.get_tensor(item).dtype) + '.txt', 'w', encoding="utf-8")
    writer.write(param_log_convert)

    print("Convert Finish!")
    print("Save to path:" + out_dir)
def get_embeding(
        path="/home/xihuaiwen/chinese/CLUE_B/baselines/models/bert/prev_trained_model/chinese_L-12_H-768_A-12/gc_ckpt/model.ckpt-525000"
):
    graph = tf.Graph()
    reader = pywrap_tensorflow.NewCheckpointReader(path)
    var_to_shape_map = reader.get_variable_to_shape_map()
    with graph.as_default():
        sess = tf.Session()
        for key in var_to_shape_map:
            if "adam" not in key and "Momentum" not in key:
                if 'word_embeddings' in key:
                    val = reader.get_tensor(key)
                    return val
def load_initializers_from_checkpoint(checkpoint_path):
    initializers = {}
    reader = pywrap_tensorflow.NewCheckpointReader(checkpoint_path)
    var_to_map = reader.get_variable_to_shape_map()
    for key, dim in var_to_map.items():
        if key == 'global_step':
            continue
        # if reader.get_tensor(key).dtype.name == 'float16':
        #     int_data = np.asarray(reader.get_tensor(key), np.int32)
        #     np_weight = int_data.view(dtype=np.float16).reshape(dim)
        # else:
        np_weight = reader.get_tensor(key)
        initializers[key] = np_weight
    return initializers
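# Hedged usage sketch (the checkpoint path is hypothetical): seed freshly created tf.Variables
# from the numpy arrays returned by load_initializers_from_checkpoint, using the TF1 Session
# API already used throughout this file.
def example_use_initializers():
    initializers = load_initializers_from_checkpoint("/path/to/model.ckpt-1000")
    with tf.Graph().as_default(), tf.Session() as sess:
        variables = {
            name: tf.Variable(value, name=name) for name, value in initializers.items()
        }
        sess.run(tf.global_variables_initializer())
        # The variables now hold the checkpoint weights and can be saved or inspected.
        for name in sorted(variables):
            print(name, sess.run(variables[name]).shape)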
def print_ckpt_tensor_name(checkpoint_path):
    ckpt = '.ckpt'
    checkpoint_name = None
    os.chdir(checkpoint_path)
    for each_file in os.listdir(os.curdir):
        if ckpt in each_file:
            checkpoint_name = each_file.split(ckpt)[0] + ckpt
            break
    if checkpoint_name is None:
        return
    model_reader = pywrap_tensorflow.NewCheckpointReader(checkpoint_name)
    var_dict = model_reader.get_variable_to_shape_map()
    for key in var_dict:
        print(key)
        if key == 'bert/encoder/layer_1/attention/output/dense/bias':
            print(model_reader.get_tensor(key))
def convert_ckpt_to_fp16(ckpt_file: str) -> tf.train.Saver:
    """Convert checkpoint to fp16 weights and return saver.

    Args:
        ckpt_file: Path to checkpoint file.

    Returns:
        tf.train.Saver object initialized with dictionary of fp16 variables.
    """
    # Strip .data-xxxx-xxxx
    if not ckpt_file.endswith(".ckpt"):
        ckpt_file = ckpt_file.rsplit('.', 1)[0]
    reader = pywrap_tensorflow.NewCheckpointReader(ckpt_file)
    var_to_map = reader.get_variable_to_shape_map()
    val_f16 = {}
    for key, _ in var_to_map.items():
        val_f16[key.strip(":0")] = tf.Variable(reader.get_tensor(key).astype(np.float16))
    saver = tf.train.Saver(val_f16)
    return saver
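# Hedged usage sketch (input/output paths are hypothetical): write out the fp16 checkpoint
# produced by convert_ckpt_to_fp16 with the same TF1 Session workflow used elsewhere here.
def example_write_fp16_ckpt():
    with tf.Graph().as_default():
        # Builds fp16 tf.Variables from the source checkpoint and returns a Saver over them.
        saver = convert_ckpt_to_fp16("/path/to/model.ckpt")
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            saver.save(sess, "/path/to/model_fp16.ckpt")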
def convert_ckpt_to_fp(checkpoint_path, data_type=np.float16):
    """Convert checkpoint to fp weights and return saver.

    Args:
        init_checkpoint: Path to checkpoint file.
        data_type: np.float16, np.float32, np.float64
    """
    ckpt = '.ckpt'
    sync_file = []
    checkpoint_name = None
    os.chdir(checkpoint_path)
    for each_file in os.listdir(os.curdir):
        if ckpt in each_file:
            checkpoint_name = each_file.split(ckpt)[0] + ckpt
            break
    if checkpoint_name is None:
        return
    curent_dir = os.path.abspath(os.path.join(os.getcwd(), "..")) + '/'
    out_dir = curent_dir + checkpoint_path + "-F16" + '/'
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    # Copy vocab/config files alongside the converted checkpoint.
    for each_file in os.listdir(os.curdir):
        ext = os.path.splitext(each_file)[1]
        if ext in ['.txt', '.json']:
            copyfile(curent_dir + checkpoint_path + '/' + each_file, out_dir + each_file)
    reader = pywrap_tensorflow.NewCheckpointReader(checkpoint_name)
    var_to_map = reader.get_variable_to_shape_map()
    val_f = {}
    for key, dim in var_to_map.items():
        val_f[key.strip(":0")] = tf.Variable(reader.get_tensor(key).astype(data_type))
        '''
        if 'word_embeddings' in key:
            temp = reader.get_tensor(key)[:2896, :]
            val_f[key.strip(":0")] = tf.Variable(temp.astype(data_type))  # 119547
        if 'dense' in key:
            if len(dim) > 1:
                need_split_dim1 = False
                need_split_dim2 = False
                need_split_dim1 = True if dim[0] == 3072 else False
                need_split_dim2 = True if dim[1] == 3072 else False
                if need_split_dim1:
                    temp = reader.get_tensor(key)[:2048, :]
                    val_f[key.strip(":0")] = tf.Variable(temp.astype(data_type))
                elif need_split_dim2:
                    temp = reader.get_tensor(key)[:, :2048]
                    val_f[key.strip(":0")] = tf.Variable(temp.astype(data_type))
                elif need_split_dim1 and need_split_dim2:
                    temp = reader.get_tensor(key)[:2048, :2048]
                    val_f[key.strip(":0")] = tf.Variable(temp.astype(data_type))
            else:
                if dim[0] == 3072:
                    temp = reader.get_tensor(key)[:2048]
                    val_f[key.strip(":0")] = tf.Variable(temp.astype(data_type))
        '''

    # Dump parameters before the conversion.
    param_log_origin = ''
    for key in var_to_map:
        param_log_origin += "tensor_name: " + key + " shape:" + str(reader.get_tensor(key).shape) + "\r\n"
        param_log_origin += str(reader.get_tensor(key)) + "\r\n"
    writer = open(out_dir + 'Param-' + str(reader.get_tensor(key).dtype) + '.txt', 'w', encoding="utf-8")
    writer.write(param_log_origin)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        new_saver = tf.train.import_meta_graph(curent_dir + checkpoint_path + '/' + checkpoint_name + '.meta')
        new_saver.restore(sess, curent_dir + checkpoint_path + '/' + checkpoint_name)
        saver = tf.train.Saver(val_f)
        saver.save(sess, out_dir + checkpoint_name)

    # Dump parameters after the conversion.
    reader_convert = pywrap_tensorflow.NewCheckpointReader(out_dir + checkpoint_name)
    var_to_map_convert = reader_convert.get_variable_to_shape_map()
    param_log_convert = ''
    for item in var_to_map_convert:
        param_log_convert += "tensor_name: " + item + " shape:" + str(reader_convert.get_tensor(item).shape) + "\r\n"
        param_log_convert += str(reader_convert.get_tensor(item)) + "\r\n"
    writer = open(out_dir + 'Param-' + str(reader_convert.get_tensor(item).dtype) + '.txt', 'w', encoding="utf-8")
    writer.write(param_log_convert)

    print("Convert Finish!")
    print("Save to path:" + out_dir)
if os.path.isfile(config_para["test"]):
    dt_sentences = dev_sentences + test_sentences
else:
    dt_sentences = dev_sentences

if 'bin' in parameters['pre_emb']:
    wordmodel = gensim.models.KeyedVectors.load_word2vec_format(
        parameters['pre_emb'], binary=True)
else:
    wordmodel = gensim.models.KeyedVectors.load_word2vec_format(
        parameters['pre_emb'], binary=False)

# Load the BioBERT word embedding table and its vocabulary.
bert_word_embedding = None
word_index_dic = {}
reader = pywrap_tensorflow.NewCheckpointReader(parameters['bio_bert_embedding'])
bert_word_embedding = reader.get_tensor("bert/embeddings/word_embeddings")
with open(parameters['bio_bert_vocab'], "r", encoding="utf8") as f:
    vocab = f.readlines()
for index, i in enumerate(vocab):
    word_index_dic[i.strip()] = index

# Create a dictionary / mapping of words.
# If we use pretrained embeddings, we add them to the dictionary.
word_to_id = []
char_to_id = []
pt_to_id = []
tag_to_id = []
if not parameters['reload']:
    if parameters['pre_emb']:
        # mapping of words, frequency decreasing
def convert_research_ckpt_to_apps(
    ckpt_file,
    output_dir,
    num_embed_split,
    vocab_size,
    use_attention_bias,
    use_qkv_bias,
    use_cls_layer,
    baseline,
    dtype,
):
    saved_variables = []
    split_embeddings = []

    def add_variable(old_tensor, new_tensor):
        logging.info(f"{old_tensor} -> {new_tensor}")
        saved_variables.append(new_tensor)

    graph = tf.Graph()
    reader = pywrap_tensorflow.NewCheckpointReader(ckpt_file)
    var_to_shape_map = reader.get_variable_to_shape_map()
    with graph.as_default():
        sess = tf.compat.v1.Session()
        for old_tensor_name in sorted(var_to_shape_map):
            # Filter out the optimizer variables
            if 'global_step' in old_tensor_name:
                continue
            if filter_optimizer(old_tensor_name):
                continue
            if not use_cls_layer and "transform" in old_tensor_name:
                logging.info("Discarding dense layer before MLM loss.")
                continue
            if not use_attention_bias and "output/dense/bias" in old_tensor_name:
                logging.info("Discarding attention biases.")
                continue
            this_tensor_dtype = dtype
            tensor_value = tf.cast(reader.get_tensor(old_tensor_name), dtype=this_tensor_dtype)
            new_name = old_tensor_name
            if new_name.startswith('all/'):
                new_name = new_name[4:]
            if "Norm" in new_name:
                new_name = new_name.replace("GroupNorm", "LayerNorm")
            if "/layer_" in new_name and "encoder" in new_name:
                if new_name.endswith('weight') and '/dwconv/' not in new_name:
                    new_name = new_name.replace("weight", "dense/kernel")
                if new_name.endswith('bias'):
                    new_name = new_name.replace("bias", "dense/bias")
                if '/boom' in new_name:
                    new_name = new_name.replace("boom", "feed_forward_")
                if '/up/' in new_name:
                    new_name = new_name.replace("/up/", "/intermediate/")
                if '/down/' in new_name:
                    new_name = new_name.replace("/down/", "/output/")
                if '/mixer/' in new_name:
                    new_name = new_name.replace("/mixer/", "/output/mixer/")
                if baseline:
                    new_name = new_name.replace("/feed_forward_", "")
                    new_name = new_name.replace("/postnorm/", "/output/")
                if '/conv/' in new_name:
                    new_name = new_name.replace("/conv/", "/convolution/")
                if "/pre/" in new_name:
                    # Split the gated linear unit weights into values and gates.
                    hidden_dim = tensor_value.shape[-1] // 2
                    values = tensor_value[..., :hidden_dim]
                    gates = tensor_value[..., hidden_dim:]
                    values_name = new_name.replace("/pre/", "/pre/glu/values/")
                    gates_name = new_name.replace("/pre/", "/pre/glu/gates/")
                    values_var = tf.Variable(values, name=values_name)
                    gates_var = tf.Variable(gates, name=gates_name)
                    add_variable(old_tensor_name, values_var)
                    add_variable(old_tensor_name, gates_var)
                    continue
                if '/attention/' in new_name:
                    new_name = new_name.replace("attention/output", "attention/projection")
                    new_name = new_name.replace("/qkv/", "/self/")
                    if '/self/' in new_name:
                        new_name = new_name.replace("dense/bias", "bias")
                    if 'self/dense/kernel' in new_name:
                        # Split the fused QKV kernel into separate query/key/value kernels.
                        hidden_dim = tensor_value.shape[-1] // 3
                        q = tensor_value[..., :hidden_dim]
                        k = tensor_value[..., hidden_dim:2 * hidden_dim]
                        v = tensor_value[..., 2 * hidden_dim:3 * hidden_dim]
                        q_name = new_name.replace("/dense/kernel", "/query/kernel")
                        k_name = new_name.replace("/dense/kernel", "/key/kernel")
                        v_name = new_name.replace("/dense/kernel", "/value/kernel")
                        q_var = tf.Variable(q, name=q_name)
                        add_variable(old_tensor_name, q_var)
                        k_var = tf.Variable(k, name=k_name)
                        add_variable(old_tensor_name, k_var)
                        v_var = tf.Variable(v, name=v_name)
                        add_variable(old_tensor_name, v_var)
                        continue
                    if baseline:
                        new_name = new_name.replace("/postnorm/", "/projection/")
            elif 'encoder/post_layers' in new_name:
                new_name = new_name.replace("bert/encoder/post_layers/", "")
            elif "word_embeddings" in new_name:
                split_match = re.search(r"/s\d/", new_name)
                if split_match and num_embed_split == 1:
                    split_embeddings.append(tensor_value)
                    continue
                else:
                    pass
            new_var = tf.Variable(tensor_value, name=new_name)
            add_variable(old_tensor_name, new_var)

        if split_embeddings and num_embed_split == 1:
            # Merge the embedding shards back into a single word embedding table.
            merged_embedding = tf.concat(split_embeddings, axis=0)
            new_var = tf.Variable(merged_embedding, name="bert/embeddings/word_embeddings")
            add_variable(split_embeddings[0], new_var)

        sess.run(tf.compat.v1.global_variables_initializer())
        saver = tf.compat.v1.train.Saver()
        _dir_name, ckpt_name = os.path.split(ckpt_file)
        output_file = os.path.join(output_dir, ckpt_name)
        saver.save(sess, output_file)
        num_params = np.sum([np.prod(v.shape) for v in saved_variables])
        print(f"Number of parameters saved: {num_params}")
def convert_gc_ckpt_to_google(ckpt_file, output_dir=None, include_qkv_bias=False, dtype=tf.float32):
    graph = tf.Graph()
    dir_name, ckpt_name = os.path.split(os.path.abspath(ckpt_file))
    reader = pywrap_tensorflow.NewCheckpointReader(ckpt_file)
    var_to_shape_map = reader.get_variable_to_shape_map()
    with graph.as_default():
        sess = tf.Session()
        num_hidden_layers = 0
        word_embeddings = []
        new_variables = []
        keep_variables = []
        for tensor_name in var_to_shape_map:
            logging.info(f"Loading {tensor_name}")
            # Filter the optimizer variables
            if filter_optimizer(tensor_name):
                continue
            tensor_value = tf.cast(reader.get_tensor(tensor_name), dtype=dtype)
            if "word_embeddings" in tensor_name:
                word_embeddings.append(tensor_name)
            elif "attention" in tensor_name:
                layer_idx = int(tensor_name.split("/")[2].split("_")[-1])
                num_hidden_layers = max(layer_idx, num_hidden_layers)
                if "qkv_bias" in tensor_name and include_qkv_bias:
                    # Split the fused QKV bias into query/key/value biases.
                    hidden_size = tensor_value.shape[0] // 3
                    query_bias = tensor_value[:hidden_size]
                    key_bias = tensor_value[hidden_size:2 * hidden_size]
                    value_bias = tensor_value[2 * hidden_size:]
                    qb = tf.Variable(query_bias, name=tensor_name.replace("qkv_bias", "query/bias"))
                    kb = tf.Variable(key_bias, name=tensor_name.replace("qkv_bias", "key/bias"))
                    vb = tf.Variable(value_bias, name=tensor_name.replace("qkv_bias", "value/bias"))
                    new_variables.extend([qb, kb, vb])
                # rename projection to output
                elif "projection" in tensor_name:
                    new_name = tensor_name.replace("projection", "output")
                    proj = tf.Variable(tensor_value, name=new_name)
                    new_variables.append(proj)
            else:
                var = tf.get_variable(tensor_name, shape=tensor_value.shape, dtype=dtype)
                keep_variables.append(var)

        # Combine split embeddings
        word_embeddings = np.sort(word_embeddings)
        embeddings_vals = [reader.get_tensor(k) for k in word_embeddings]
        unit_embeddings = np.vstack(embeddings_vals)
        logging.debug(f"Concated word_embeddings shape: {unit_embeddings.shape}")
        we = tf.Variable(
            unit_embeddings,
            dtype=dtype,
            shape=unit_embeddings.shape,
            name="bert/embeddings/word_embeddings",
        )
        new_variables.append(we)
        saved_variables = new_variables + keep_variables
        sess.run(tf.compat.v1.global_variables_initializer())
        saver = tf.compat.v1.train.Saver(var_list=saved_variables)
        output_file = os.path.join(output_dir, ckpt_name)
        saver.save(sess, output_file)
        print("Saved to :" + output_file)
def convert_google_ckpt_to_gc(
    ckpt_file,
    output_dir,
    num_embed_split,
    vocab_size,
    use_attention_bias,
    use_qkv_bias,
    use_cls_layer,
    dtype,
):
    saved_variables = []

    def add_variable(old_tensor, new_tensor):
        logging.info(f"{old_tensor} -> {new_tensor}")
        saved_variables.append(new_tensor)

    graph = tf.Graph()
    reader = pywrap_tensorflow.NewCheckpointReader(ckpt_file)
    var_to_shape_map = reader.get_variable_to_shape_map()
    with graph.as_default():
        sess = tf.compat.v1.Session()
        for tensor_name in sorted(var_to_shape_map):
            # Filter out the optimizer variables
            if filter_optimizer(tensor_name):
                continue
            if not use_cls_layer and "transform" in tensor_name:
                logging.info("Discarding dense layer before MLM loss.")
                continue
            if not use_attention_bias and "output/dense/bias" in tensor_name:
                logging.info("Discarding attention biases.")
                continue
            this_tensor_dtype = dtype
            if "cls/squad/" in tensor_name:
                # Keep SQuAD output dense layer weights as float32
                this_tensor_dtype = tf.float32
                tensor_value = tf.cast(reader.get_tensor(tensor_name), dtype=this_tensor_dtype)
            else:
                # Cast all other tensors to the required precision.
                tensor_value = tf.cast(reader.get_tensor(tensor_name), dtype=this_tensor_dtype)
            if "word_embeddings" in tensor_name and num_embed_split > 1:
                # Split word_embeddings when num_split>1
                logging.info(f"Splitting word embeddings into {num_embed_split} splits.")
                word_embeddings = truncate_vocab(tensor_value, vocab_size)
                hidden_size = np.shape(word_embeddings)[1]
                assert vocab_size % num_embed_split == 0
                size_per_slice = int(vocab_size / num_embed_split)
                for i in range(num_embed_split):
                    start_idx = i * size_per_slice
                    end_idx = (i + 1) * size_per_slice
                    we_pieces = tf.Variable(
                        word_embeddings[start_idx:end_idx, :],
                        shape=(size_per_slice, hidden_size),
                        name=f"bert/embeddings/s{i}/word_embeddings",
                    )
                    add_variable(tensor_name, we_pieces)
            # Truncate word embeddings to vocab_size
            elif "word_embeddings" in tensor_name:
                full_word_embeddings = tf.Variable(
                    truncate_vocab(tensor_value, vocab_size), name=tensor_name)
                add_variable(tensor_name, full_word_embeddings)
            # Rename tensor
            elif "attention/output" in tensor_name:
                new_name = tensor_name.replace("attention/output", "attention/projection")
                proj = tf.Variable(tensor_value, name=new_name)
                add_variable(tensor_name, proj)
            elif is_qkv_tensor(tensor_name):
                # We will process self-attention parameters outside the loop
                continue
            else:
                others_var = tf.Variable(tensor_value, name=tensor_name)
                add_variable(tensor_name, others_var)

        # Concatenate or split QKV
        layer_re = re.compile('.*/layer_([0-9]+)/.*')
        matches = [layer_re.match(k) for k in var_to_shape_map.keys()]
        num_hidden_layers = max([int(m.group(1)) for m in matches if m is not None]) + 1
        logging.info("Concatenate query, key, value layers into one.")
        for i in range(num_hidden_layers):
            layer_name = f"bert/encoder/layer_{i}/attention/self"
            # Combine query,key,value to qkv_weight
            qkv_weight = []
            qkv_bias = []
            for name in ["query", "key", "value"]:
                weight_name = layer_name + f"/{name}/kernel"
                bias_name = layer_name + f"/{name}/bias"
                weight = tf.cast(reader.get_tensor(weight_name), dtype=dtype)
                bias = tf.cast(reader.get_tensor(bias_name), dtype=dtype)
                add_variable(weight_name, tf.Variable(weight, name=weight_name))
                # The QKV bias is always concatenated
                qkv_bias.append(bias)
            if use_qkv_bias:
                qkv_bias = tf.concat(qkv_bias, axis=0)
                qkv_b = tf.Variable(qkv_bias, shape=qkv_bias.shape, name=layer_name + "/qkv_bias")
                add_variable("qkv_bias", qkv_b)

        sess.run(tf.compat.v1.global_variables_initializer())
        saver = tf.compat.v1.train.Saver()
        _dir_name, ckpt_name = os.path.split(ckpt_file)
        output_file = os.path.join(output_dir, ckpt_name)
        saver.save(sess, output_file)
        num_params = np.sum([np.prod(v.shape) for v in saved_variables])
        print(f"Number of parameters saved: {num_params}")
def convert_gc_ckpt_to_google(ckpt_file, output_dir=None, include_qkv_bias=False, dtype=tf.float32):
    """
    Convert a GC bert checkpoint to the original Google checkpoint layout:
    1. combine `word_embeddings` if split
    2. rename scope `bert/encoder/layer_x/attention/projection/` to
       `bert/encoder/layer_x/attention/output/`
    3. add back attention_projection_bias
    4. split `qkv_weight` into query, key, value, and add the corresponding biases
    5. rename `GroupNorm` to `LayerNorm`
    6. add back the dense layer before the mlm loss

    Args:
        ckpt_file: str, GC checkpoint.
        output_dir: str, Path to save the converted Google-format checkpoint.
        include_qkv_bias: bool, whether the attention layers carry bias weights.
        dtype: tf.float32 or tf.float16, type of tensor in the output ckpt file.

    Returns:
        None
    """
    graph = tf.Graph()
    dir_name, ckpt_name = os.path.split(os.path.abspath(ckpt_file))
    if not output_dir:
        output_dir = os.path.join(dir_name, "google_ckpt")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    reader = pywrap_tensorflow.NewCheckpointReader(ckpt_file)
    var_to_shape_map = reader.get_variable_to_shape_map()
    with graph.as_default():
        sess = tf.Session()
        num_hidden_layers = 0
        optimizer_names = ["adam", "Momentum", "lamb"]  # optimizer weights
        word_embeddings = []
        new_variables = []
        keep_variables = []
        for tensor_name in var_to_shape_map:
            # logger.info(f"Load {tensor_name}......")
            # Filter the optimizer variables
            if filter_optimizer(tensor_name, optimizer_names):
                continue
            tensor_value = tf.cast(reader.get_tensor(tensor_name), dtype=dtype)
            if tensor_name == 'bert/encoder/layer_0/intermediate/dense/kernel' \
                    or tensor_name == 'bert/pooler/dense/kernel':
                print(tensor_name)
                print(tensor_value)
            # logger.info(f"Shape is {tensor_value.shape}")
            if "word_embeddings" in tensor_name:
                word_embeddings.append(tensor_name)
            elif "attention" in tensor_name:
                layer_idx = int(tensor_name.split("/")[2].split("_")[-1])
                num_hidden_layers = max(layer_idx, num_hidden_layers)
                # Split query, key, value.
                if "qkv_weight" in tensor_name:
                    hidden_size = tensor_value.shape[1] // 3
                    query = tensor_value[:, :hidden_size]
                    key = tensor_value[:, hidden_size:2 * hidden_size]
                    value = tensor_value[:, 2 * hidden_size:]
                    qw = tf.Variable(query, name=tensor_name.replace("qkv_weight", "query/kernel"))
                    kw = tf.Variable(key, name=tensor_name.replace("qkv_weight", "key/kernel"))
                    vw = tf.Variable(value, name=tensor_name.replace("qkv_weight", "value/kernel"))
                    new_variables.extend([qw, kw, vw])
                elif "qkv_bias" in tensor_name and include_qkv_bias:
                    hidden_size = tensor_value.shape[0] // 3
                    query_bias = tensor_value[:hidden_size]
                    key_bias = tensor_value[hidden_size:2 * hidden_size]
                    value_bias = tensor_value[2 * hidden_size:]
                    qb = tf.Variable(query_bias, name=tensor_name.replace("qkv_bias", "query/bias"))
                    kb = tf.Variable(key_bias, name=tensor_name.replace("qkv_bias", "key/bias"))
                    vb = tf.Variable(value_bias, name=tensor_name.replace("qkv_bias", "value/bias"))
                    new_variables.extend([qb, kb, vb])
                # rename projection to output
                elif "projection" in tensor_name:
                    # logger.debug("Rename projection......")
                    new_name = tensor_name.replace("projection", "output")
                    if "GroupNorm" in tensor_name:
                        # logger.debug("Rename GroupNorm in attention ......")
                        new_name = new_name.replace("GroupNorm", "LayerNorm")
                    proj = tf.Variable(tensor_value, name=new_name)
                    new_variables.append(proj)
            # rename other GroupNorm
            elif "GroupNorm" in tensor_name:
                # logger.debug("Rename GroupNorm ......")
                gn = tf.Variable(tensor_value, name=tensor_name.replace("GroupNorm", "LayerNorm"))
                new_variables.append(gn)
            else:
                var = tf.get_variable(tensor_name, shape=tensor_value.shape, dtype=dtype)
                # var = tf.Variable(tensor_value, name=tensor_name)
                keep_variables.append(var)

        # Combine split embeddings
        word_embeddings = np.sort(word_embeddings)
        embeddings_vals = [reader.get_tensor(k) for k in word_embeddings]
        unit_embeddings = np.vstack(embeddings_vals)
        # logger.debug(f"Concated word_embeddings shape: {unit_embeddings.shape}")
        we = tf.Variable(
            unit_embeddings,
            dtype=dtype,
            shape=unit_embeddings.shape,
            name="bert/embeddings/word_embeddings",
        )
        new_variables.append(we)
        saved_variables = new_variables + keep_variables
        # logger.info("Finish concat word embeddings.")
        sess.run(tf.compat.v1.global_variables_initializer())
        saver = tf.compat.v1.train.Saver()
        output_file = os.path.join(output_dir, ckpt_name)
        saver.save(sess, output_file)
def convert_ipu_ckpt_to_gc(ckpt_file,
                           output_dir=None,
                           num_embed_split=1,
                           vocab_size=30400,
                           use_attention_bias=False,
                           use_qkv_bias=False,
                           use_cls_layer=False,
                           dtype=tf.float16,
                           label_num=1):
    """
    Convert an original Google checkpoint to a GC bert checkpoint.
    There are several differences between our GC bert and the original Google bert:
    1. gc_bert does not have attention_probs_dropout_prob
    2. gc_bert does not have an mlm projection layer
    3. gc_bert does not have attention_projection_bias
    4. rename scope `bert/encoder/layer_x/attention/output/` to
       `bert/encoder/layer_x/attention/projection/`
    5. combine the query, key, value layers into qkv_weight and qkv_bias layers.
       These changes may change behaviour with the lamb optimizer, so the
       optimizer has been modified.
    6. In some cases, gc_bert supports word embedding splits and renames the scope
       to `bert/embeddings/s{i}/word_embeddings`.

    Args:
        ckpt_file: str, Google checkpoint.
        output_dir: str, Path to save the converted GC checkpoint.
        num_embed_split: int, number of word embedding splits.
            Only used when loading the original Google checkpoint.
        vocab_size: int, vocabulary size. GC bert cuts the original 30522 down to
            30400 for better performance.
        use_attention_bias: bool, whether to use attention bias. Defaults to False.
        use_qkv_bias: bool, whether to use bias in the qkv layers. Defaults to False.
        use_cls_layer: bool, whether to use a dense layer before the mlm loss.
            Defaults to False.
        dtype: tf.float32 or tf.float16, type of tensor in the output ckpt file.
            Only used when loading the original Google checkpoint.
        label_num: int, number of labels for the added classification head.

    Returns:
        None
    """
    graph = tf.Graph()
    dir_name, ckpt_name = os.path.split(ckpt_file)
    if not output_dir:
        output_dir = os.path.join(dir_name, "gc_ckpt")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    reader = pywrap_tensorflow.NewCheckpointReader(ckpt_file)
    var_to_shape_map = reader.get_variable_to_shape_map()
    with graph.as_default():
        sess = tf.Session()
        num_hidden_layers = 0
        optimizer_names = ["adam", "Momentum", "lamb"]  # optimizer weights
        qkv_layers = defaultdict(dict)
        saved_variables = []
        emb_list = []
        for tensor_name in var_to_shape_map:
            # Filter the optimizer variables
            if filter_optimizer(tensor_name, optimizer_names):
                continue
            if not use_cls_layer and "transform" in tensor_name:
                # print("Abandon dense layer before mlm loss.")
                continue
            if not use_attention_bias and "output/dense/bias" in tensor_name:
                # print("Abandon attention bias")
                continue
            tensor_value = tf.cast(reader.get_tensor(tensor_name), dtype=dtype)
            if "word_embeddings" in tensor_name:
                emb_list.append(tensor_name)
                # split word_embeddings when num_split>1
                '''
                word_embeddings = tensor_value[:vocab_size, :]
                hidden_size = np.shape(word_embeddings)[1]
                assert vocab_size % num_embed_split == 0
                size_per_slice = int(vocab_size / num_embed_split)
                for i in range(num_embed_split):
                    start_idx = i * size_per_slice
                    end_idx = (i+1) * size_per_slice
                    we_pieces = tf.Variable(
                        word_embeddings[start_idx:end_idx, :],
                        shape=(size_per_slice, hidden_size),
                        name=f"bert/embeddings/s{i}/word_embeddings")
                    saved_variables.append(we_pieces)
                '''
            # Rename tensor
            elif "attention/output" in tensor_name:
                new_name = tensor_name.replace("attention/output", "attention/projection")
                if "LayerNorm" in tensor_name:
                    new_name = new_name.replace("LayerNorm", "GroupNorm")
                proj = tf.Variable(tensor_value, name=new_name)
                saved_variables.append(proj)
            elif "LayerNorm" in tensor_name:
                ln = tf.Variable(tensor_value, name=tensor_name.replace("LayerNorm", "GroupNorm"))
                saved_variables.append(ln)
            # Find query, key, value.
            elif "query" in tensor_name or \
                    "key" in tensor_name or \
                    "value" in tensor_name:
                layer_idx = int(tensor_name.split("/")[2].split("_")[1])  # get the layer_{i}
                num_hidden_layers = max(layer_idx, num_hidden_layers)
                qkv_layers[layer_idx][tensor_name] = tensor_value
            else:
                others_var = tf.Variable(tensor_value, name=tensor_name)
                saved_variables.append(others_var)

        print("Start to combine query,key,value layers to qkv layer...")
        print("Start to combine word embedding ...")
        word_embeddings = np.sort(emb_list)
        embeding_vals = [reader.get_tensor(key) for key in word_embeddings]
        unit_embedding = np.vstack(embeding_vals)
        # unit_embedding = get_embeding()
        # word_embedding = tf.concat(emb_list, axis=0)
        word = tf.Variable(unit_embedding,
                           shape=unit_embedding.shape,
                           name="bert/embeddings/word_embeddings",
                           dtype=tf.float16)
        saved_variables.append(word)
        '''
        for i in range(num_hidden_layers+1):
            layer_name = f"bert/encoder/layer_{i}/attention/self"
            # Combine query,key,value to qkv_weight
            layer_tensors = qkv_layers[i]
            qkv_weight = []
            qkv_bias = []
            for name in ["query", "key", "value"]:
                weight_name = layer_name + f"/{name}/kernel"
                bias_name = layer_name + f"/{name}/bias"
                qkv_weight.append(layer_tensors[weight_name])
                qkv_bias.append(layer_tensors[bias_name])
            qkv_weight = tf.concat(qkv_weight, axis=1)
            qkv = tf.Variable(qkv_weight,
                              shape=qkv_weight.shape,
                              name=layer_name+"/qkv_weight")
            saved_variables.append(qkv)
            if use_qkv_bias:
                qkv_bias = tf.concat(qkv_bias, axis=0)
                qkv_b = tf.Variable(qkv_bias,
                                    shape=qkv_bias.shape,
                                    name=layer_name+"/qkv_bias")
                saved_variables.append(qkv_b)
            else:
                print(f"Abandon QKV bias in layer_{i}")
        '''
        # Classification head added on top of the converted encoder.
        loss_weight = tf.get_variable(
            shape=(label_num, 768),
            dtype=tf.float16,
            initializer=tf.truncated_normal_initializer(stddev=0.02),
            name="output_weights")
        saved_variables.append(loss_weight)
        loss_bias = tf.get_variable(shape=(label_num, ),
                                    dtype=tf.float16,
                                    initializer=tf.zeros_initializer(),
                                    name="output_bias")
        saved_variables.append(loss_bias)
        sess.run(tf.compat.v1.global_variables_initializer())
        saver = tf.compat.v1.train.Saver()
        output_file = os.path.join(output_dir, ckpt_name)
        saver.save(sess, output_file)
        print("Save to :" + output_file)
def visualization(f, cfmtrx, maxpnt, tsne, weights_hist):
    pred_and_label = np.empty([0, 2], dtype=np.int32)
    max_idx = np.empty([0, 512], dtype=np.int32)
    global_feature = np.empty([0, 512])

    # CONCAT data from all files
    for fn in range(len(DUMP_FILES)):
        dump_file = DUMP_FILES[fn]
        log_string('V---- %d/%d -----' % (fn + 1, len(DUMP_FILES)))
        # load dump file
        fin = h5py.File(os.path.join(f['dump_dir'], dump_file), 'r')
        # concatenate
        pred_and_label = np.concatenate((pred_and_label, fin['pred_and_label'][:]), axis=0)
        max_idx = np.concatenate((max_idx, fin['max_idx'][:]), axis=0)
        global_feature = np.concatenate((global_feature, fin['global_feature'][:]), axis=0)
        fin.close()
    log_string('pred_and_label {}'.format(pred_and_label.shape))  # (N, 2)
    log_string('max_idx {}'.format(max_idx.shape))  # (N, C)
    log_string('global_feature {}'.format(global_feature.shape))  # (N, 512)

    # PLOT confusion matrix
    if cfmtrx:
        log_string('PLOT confusion matrix')
        cmat = np.zeros([f['num_class'], f['num_class']])
        for i in range(pred_and_label.shape[0]):
            pred_val = pred_and_label[i][0]
            true_val = pred_and_label[i][1]
            cmat[true_val, pred_val] += 1
        plot_confusion_matrix(cmat,
                              class_names=label_modelnet.keys(),
                              normalize=True,
                              title='')

    # PLOT max point
    if maxpnt:
        log_string('PLOT maximum point')
        fdump = h5py.File(os.path.join(f['dump_dir'], DUMP_FILES[0]), 'r')
        ftest = h5py.File(os.path.join(f['dataset_path'], TEST_FILES[0]), 'r')
        max_idx = fdump['max_idx'][:]
        points = ftest['data'][:, 0:f['num_point'], :]
        assert max_idx.shape[0] == points.shape[0]
        # randomly choose samples to show
        shows = np.random.random_integers(0, max_idx.shape[0], 20)
        pc_list = []
        for s in range(shows.shape[0]):
            i = shows[s]
            pc = points[i, :, :]
            pidx = np.unique(max_idx[i, :])
            color_tab = np.full((f['num_point']), 35)
            color_tab[pidx] = 99
            plot_point_cloud(pc, color_tab)
        fdump.close()
        ftest.close()

    # PLOT T-SNE
    if tsne:
        log_string('PLOT T-SNE')
        tlabel = []
        tfeature = []
        for i in range(pred_and_label.shape[0]):
            if pred_and_label[i][0] == pred_and_label[i][1]:
                tlabel.append(pred_and_label[i][0])
                tfeature.append(global_feature[i])
        tlabel = np.array(tlabel)
        tfeature = np.array(tfeature)
        log_string('tlabel {}'.format(tlabel.shape))  # (N,)
        log_string('tfeature {}'.format(tfeature.shape))  # (N, C)
        plot_TSNE(tlabel, tfeature, f['num_class'])

    # PLOT weights histogram
    if weights_hist:
        import tensorflow as tf
        from tensorflow import pywrap_tensorflow
        checkpoint_file = tf.train.latest_checkpoint(f['model_path'])
        # Read data from checkpoint file
        reader = pywrap_tensorflow.NewCheckpointReader(checkpoint_file)
        var_to_shape_map = reader.get_variable_to_shape_map()

        def plot_weights_hist(ax, data):
            data = np.abs(data)
            data = np.squeeze(data)
            data = np.sum(data, axis=1, keepdims=False)
            data = data / np.sum(data)
            print(data.shape)
            s = [
                '$x$', '$y$', '$z$', '$x^2$', '$y^2$', '$z^2$', '$x^3$', '$y^3$',
                '$z^3$', '$\overline{X}$', '$\overline{Y}$', '$\overline{Z}$',
                '$\overline{X^2}$', '$\overline{Y^2}$', '$\overline{Z^2}$',
                '$\overline{X^3}$', '$\overline{Y^3}$', '$\overline{Z^3}$',
                '$xy$', '$yz$', '$zx$', '$x^2y$', '$y^2z$', '$z^2x$', '$x^2z$',
                '$y^2x$', '$z^2y$', '$l2$', '$d_x$', '$d_y$', '$d_z$',
                '$\\theta_x$', '$\\theta_y$', '$\\theta_z$'
            ]
            ax.bar(s, data)

        # Print tensor name and values
        fig = plt.figure()
        ax1 = fig.add_subplot(211)
        ax1.set_xlabel('(a)')
        ax2 = fig.add_subplot(212)
        ax2.set_xlabel('(b)')
        for key in var_to_shape_map:
            if key == 'LinearCombLayer/conv2d_128_pc/weights':
                print(key, reader.get_tensor(key).shape)
                plot_weights_hist(ax1, reader.get_tensor(key))
            if key == 'LinearCombLayer/conv2d_128_nn/weights':
                print(key, reader.get_tensor(key).shape)
                plot_weights_hist(ax2, reader.get_tensor(key))
        # plt.tight_layout()
        plt.show()