def prepare_config_and_inputs(self): input_values = tf.cast( ids_tensor([self.batch_size, self.seq_length], 32768), tf.float32) / 32768.0 attention_mask = tf.ones_like(input_values) config = HubertConfig( hidden_size=self.hidden_size, feat_extract_norm=self.feat_extract_norm, feat_extract_dropout=self.feat_extract_dropout, feat_extract_activation=self.feat_extract_activation, conv_dim=self.conv_dim, conv_stride=self.conv_stride, conv_kernel=self.conv_kernel, conv_bias=self.conv_bias, num_conv_pos_embeddings=self.num_conv_pos_embeddings, num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, num_hidden_layers=self.num_hidden_layers, num_attention_heads=self.num_attention_heads, hidden_dropout_prob=self.hidden_dropout_prob, intermediate_size=self.intermediate_size, layer_norm_eps=self.layer_norm_eps, hidden_act=self.hidden_act, initializer_range=self.initializer_range, vocab_size=self.vocab_size, do_stable_layer_norm=self.do_stable_layer_norm, ) return config, input_values, attention_mask
def convert_hubert_checkpoint(pytorch_dump_folder_path, config_path=None): """ Copy/paste/tweak model's weights to transformers design. """ model = distilhubert().model.model if config_path is not None: config = HubertConfig.from_pretrained(config_path) else: config = convert_config(model) model = model.eval() feature_extractor = Wav2Vec2FeatureExtractor( feature_size=1, sampling_rate=16000, padding_value=0, do_normalize=False, return_attention_mask=False, ) hf_model = HubertModel(config) recursively_load_weights(model, hf_model) feature_extractor.save_pretrained(pytorch_dump_folder_path) hf_model.save_pretrained(pytorch_dump_folder_path)
def convert_s3prl_checkpoint(base_model_name, config_path, checkpoint_path, model_dump_path): """ Copy/paste/tweak model's weights to transformers design. """ checkpoint = torch.load(checkpoint_path, map_location="cpu") if checkpoint["Config"]["downstream_expert"]["modelrc"][ "select"] not in SUPPORTED_MODELS: raise NotImplementedError( f"The supported s3prl models are {SUPPORTED_MODELS}") downstream_dict = checkpoint["Downstream"] hf_congfig = HubertConfig.from_pretrained(config_path) hf_model = HubertForSequenceClassification.from_pretrained( base_model_name, config=hf_congfig) hf_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( base_model_name, return_attention_mask=True, do_normalize=False) if hf_congfig.use_weighted_layer_sum: hf_model.layer_weights.data = checkpoint["Featurizer"]["weights"] hf_model.projector.weight.data = downstream_dict["projector.weight"] hf_model.projector.bias.data = downstream_dict["projector.bias"] hf_model.classifier.weight.data = downstream_dict[ "model.post_net.linear.weight"] hf_model.classifier.bias.data = downstream_dict[ "model.post_net.linear.bias"] hf_feature_extractor.save_pretrained(model_dump_path) hf_model.save_pretrained(model_dump_path)
def get_config(self): return HubertConfig( hidden_size=self.hidden_size, feat_extract_norm=self.feat_extract_norm, feat_extract_dropout=self.feat_extract_dropout, feat_extract_activation=self.feat_extract_activation, conv_dim=self.conv_dim, conv_stride=self.conv_stride, conv_kernel=self.conv_kernel, conv_bias=self.conv_bias, num_conv_pos_embeddings=self.num_conv_pos_embeddings, num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups, num_hidden_layers=self.num_hidden_layers, num_attention_heads=self.num_attention_heads, hidden_dropout_prob=self.hidden_dropout_prob, intermediate_size=self.intermediate_size, layer_norm_eps=self.layer_norm_eps, hidden_act=self.hidden_act, initializer_range=self.initializer_range, vocab_size=self.vocab_size, )
def convert_hubert_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True): """ Copy/paste/tweak model's weights to transformers design. """ if config_path is not None: config = HubertConfig.from_pretrained(config_path) else: config = HubertConfig() if is_finetuned: if dict_path: target_dict = Dictionary.load(dict_path) # important change bos & pad token id since CTC symbol is <pad> and # not <s> as in fairseq config.bos_token_id = target_dict.pad_index config.pad_token_id = target_dict.bos_index config.eos_token_id = target_dict.eos_index config.vocab_size = len(target_dict.symbols) vocab_path = os.path.join(pytorch_dump_folder_path, "vocab.json") if not os.path.isdir(pytorch_dump_folder_path): logger.error( "--pytorch_dump_folder_path ({}) should be a directory". format(pytorch_dump_folder_path)) return os.makedirs(pytorch_dump_folder_path, exist_ok=True) with open(vocab_path, "w", encoding="utf-8") as vocab_handle: json.dump(target_dict.indices, vocab_handle) tokenizer = Wav2Vec2CTCTokenizer( vocab_path, unk_token=target_dict.unk_word, pad_token=target_dict.pad_word, bos_token=target_dict.bos_word, eos_token=target_dict.eos_word, word_delimiter_token="|", do_lower_case=False, ) return_attention_mask = True if config.feat_extract_norm == "layer" else False feature_extractor = Wav2Vec2FeatureExtractor( feature_size=1, sampling_rate=16000, padding_value=0, do_normalize=True, return_attention_mask=return_attention_mask, ) processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) processor.save_pretrained(pytorch_dump_folder_path) hf_wav2vec = HubertForCTC(config) else: hf_wav2vec = HubertModel(config) if is_finetuned: model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task( [checkpoint_path], arg_overrides={"data": "/".join(dict_path.split("/")[:-1])}) else: model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task( [checkpoint_path]) model = model[0].eval() recursively_load_weights(model, hf_wav2vec, is_finetuned) hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
def convert_config(model): config = HubertConfig() fs_config = model.config config.activation_dropout = fs_config.activation_dropout config.apply_spec_augment = False config.attention_dropout = fs_config.attention_dropout config.conv_bias = False conv_layers = eval(fs_config.extractor_conv_feature_layers) config.conv_dim = [x[0] for x in conv_layers] config.conv_kernel = [x[1] for x in conv_layers] config.conv_stride = [x[2] for x in conv_layers] config.feat_extract_activation = "gelu" config.feat_extract_norm = "layer" if fs_config.extractor_mode == "layer_norm" else "group" config.feat_proj_layer_norm = False config.feat_proj_dropout = 0.0 config.final_dropout = 0.0 config.hidden_act = fs_config.activation_fn config.hidden_dropout = fs_config.dropout config.hidden_size = fs_config.encoder_embed_dim config.initializer_range = 0.02 config.intermediate_size = fs_config.encoder_ffn_embed_dim config.layer_norm_eps = 1e-5 config.layerdrop = 0.0 config.num_attention_heads = fs_config.encoder_attention_heads config.num_conv_pos_embedding_groups = fs_config.conv_pos_groups config.num_conv_pos_embeddings = fs_config.conv_pos config.num_feat_extract_layers = len(conv_layers) config.num_hidden_layers = fs_config.encoder_layers return config