import fairseq
from transformers import UniSpeechSatConfig, UniSpeechSatForCTC, UniSpeechSatForPreTraining


def convert_unispeech_sat_checkpoint(
    checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True
):
    """
    Copy/paste/tweak model's weights to transformers design.
    """
    if config_path is not None:
        config = UniSpeechSatConfig.from_pretrained(config_path)
    else:
        config = UniSpeechSatConfig()

    dict_path = ""

    if is_finetuned:
        hf_wav2vec = UniSpeechSatForCTC(config)
    else:
        hf_wav2vec = UniSpeechSatForPreTraining(config)

    model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
        [checkpoint_path], arg_overrides={"data": "/".join(dict_path.split("/")[:-1])}
    )
    model = model[0].eval()

    # recursively_load_weights is the mapping helper defined elsewhere in the
    # conversion script; it copies each fairseq tensor into the HF module.
    recursively_load_weights(model, hf_wav2vec)

    hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
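
A hedged usage sketch for the function above; the checkpoint and output paths are hypothetical placeholders, and the call assumes fairseq and the transformers imports shown earlier are installed:

# Hypothetical paths, shown only for illustration; point checkpoint_path at a
# real fairseq UniSpeechSat checkpoint before running.
convert_unispeech_sat_checkpoint(
    checkpoint_path="./unispeech_sat_base_plus.pt",
    pytorch_dump_folder_path="./unispeech-sat-converted",
    config_path=None,      # fall back to the default UniSpeechSatConfig
    is_finetuned=False,    # export UniSpeechSatForPreTraining rather than ForCTC
)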
Example #2
def get_config(self):
    return UniSpeechSatConfig(
        hidden_size=self.hidden_size,
        feat_extract_norm=self.feat_extract_norm,
        feat_extract_dropout=self.feat_extract_dropout,
        feat_extract_activation=self.feat_extract_activation,
        conv_dim=self.conv_dim,
        conv_stride=self.conv_stride,
        conv_kernel=self.conv_kernel,
        conv_bias=self.conv_bias,
        num_conv_pos_embeddings=self.num_conv_pos_embeddings,
        num_conv_pos_embedding_groups=self.num_conv_pos_embedding_groups,
        mask_time_prob=self.mask_time_prob,
        mask_time_length=self.mask_time_length,
        num_hidden_layers=self.num_hidden_layers,
        num_attention_heads=self.num_attention_heads,
        hidden_dropout_prob=self.hidden_dropout_prob,
        intermediate_size=self.intermediate_size,
        layer_norm_eps=self.layer_norm_eps,
        hidden_act=self.hidden_act,
        initializer_range=self.initializer_range,
        vocab_size=self.vocab_size,
        tdnn_dim=self.tdnn_dim,
        tdnn_kernel=self.tdnn_kernel,
        tdnn_dilation=self.tdnn_dilation,
        xvector_output_dim=self.xvector_output_dim,
    )
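
get_config above shrinks UniSpeechSatConfig down for unit tests, pulling each value from attributes on the tester instance. As a self-contained illustration of the same idea, here is a minimal sketch that builds a tiny config and runs a forward pass; all sizes are arbitrary small values chosen only to keep the example fast, not values taken from the tester above:

import torch
from transformers import UniSpeechSatConfig, UniSpeechSatModel

# Deliberately tiny, hypothetical sizes; two conv layers and two
# transformer layers are enough to exercise the full forward path.
config = UniSpeechSatConfig(
    hidden_size=32,
    num_hidden_layers=2,
    num_attention_heads=2,
    intermediate_size=64,
    conv_dim=(32, 32),
    conv_stride=(4, 4),
    conv_kernel=(8, 8),
    num_conv_pos_embeddings=16,
    num_conv_pos_embedding_groups=2,
)
model = UniSpeechSatModel(config).eval()

input_values = torch.randn(1, 16000)  # one second of fake 16 kHz audio
with torch.no_grad():
    hidden_states = model(input_values).last_hidden_state
print(hidden_states.shape)  # (batch, frames, hidden_size)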
import torch
from transformers import UniSpeechSatConfig, Wav2Vec2FeatureExtractor


def convert_s3prl_checkpoint(base_model_name, config_path, checkpoint_path,
                             model_dump_path):
    """
    Copy/paste/tweak model's weights to transformers design.
    """
    # Load the S3PRL checkpoint on CPU; the downstream head lives under the
    # "Downstream" key and the learned layer weights under "Featurizer".
    checkpoint = torch.load(checkpoint_path, map_location="cpu")

    downstream_dict = checkpoint["Downstream"]

    hf_config = UniSpeechSatConfig.from_pretrained(config_path)
    hf_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
        base_model_name, return_attention_mask=True, do_normalize=False)

    # Dispatch on the architecture recorded in the config; the convert_*
    # helpers, defined elsewhere in the conversion script, rebuild the
    # matching HF head from the downstream weights.
    arch = hf_config.architectures[0]
    if arch.endswith("ForSequenceClassification"):
        hf_model = convert_classification(base_model_name, hf_config,
                                          downstream_dict)
    elif arch.endswith("ForAudioFrameClassification"):
        hf_model = convert_diarization(base_model_name, hf_config,
                                       downstream_dict)
    elif arch.endswith("ForXVector"):
        hf_model = convert_xvector(base_model_name, hf_config, downstream_dict)
    else:
        raise NotImplementedError(
            f"S3PRL weights conversion is not supported for {arch}")

    # Copy the learned weighting over hidden layers from the S3PRL
    # featurizer into the HF model.
    if hf_config.use_weighted_layer_sum:
        hf_model.layer_weights.data = checkpoint["Featurizer"]["weights"]

    hf_feature_extractor.save_pretrained(model_dump_path)
    hf_model.save_pretrained(model_dump_path)
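
Finally, a hedged invocation sketch for convert_s3prl_checkpoint; the upstream script typically exposes this through an argparse CLI, and every name below is a placeholder:

# Hypothetical arguments, for illustration only; substitute a real S3PRL
# checkpoint and a config whose `architectures` entry names one of the
# supported heads (ForSequenceClassification, ForAudioFrameClassification,
# or ForXVector).
convert_s3prl_checkpoint(
    base_model_name="microsoft/unispeech-sat-base",  # assumed Hub model id
    config_path="./config.json",
    checkpoint_path="./s3prl_downstream.ckpt",
    model_dump_path="./unispeech-sat-s3prl-converted",
)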