def convert_fairseq_s2t_checkpoint_to_tfms(checkpoint_path, pytorch_dump_folder_path):
    """Convert a fairseq speech-to-text checkpoint and save it via ``save_pretrained``.

    The checkpoint is loaded on CPU, its ``"args"`` entry is used to build a
    ``Speech2TextConfig``, and its ``"model"`` entry is loaded (non-strictly)
    into a freshly constructed ``Speech2TextForConditionalGeneration``.

    Args:
        checkpoint_path: Location of the fairseq checkpoint, handed to ``torch.load``.
        pytorch_dump_folder_path: Destination handed to ``model.save_pretrained``.

    Raises:
        ValueError: If loading the state dict reports missing keys other than
            ``encoder.embed_positions.weights`` / ``decoder.embed_positions.weights``.
    """
    # NOTE(review): torch.load unpickles the file -- only use with trusted checkpoints.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    fairseq_args = checkpoint["args"]
    weights = checkpoint["model"]

    # Grab the output projection before the helpers below touch `weights`
    # (they are called for their in-place effect; presumably they drop or
    # rename this key -- verify against their definitions).
    lm_head_weights = weights["decoder.output_projection.weight"]
    remove_ignore_keys_(weights)
    rename_keys(weights)

    vocab_size = weights["decoder.embed_tokens.weight"].shape[0]
    tie_embeds = fairseq_args.share_decoder_input_output_embed
    # `conv_kernel_sizes` is stored as a comma-separated string, e.g. "5,5".
    conv_kernel_sizes = list(map(int, fairseq_args.conv_kernel_sizes.split(",")))

    config = Speech2TextConfig(
        vocab_size=vocab_size,
        max_source_positions=fairseq_args.max_source_positions,
        max_target_positions=fairseq_args.max_target_positions,
        encoder_layers=fairseq_args.encoder_layers,
        decoder_layers=fairseq_args.decoder_layers,
        encoder_attention_heads=fairseq_args.encoder_attention_heads,
        decoder_attention_heads=fairseq_args.decoder_attention_heads,
        encoder_ffn_dim=fairseq_args.encoder_ffn_embed_dim,
        decoder_ffn_dim=fairseq_args.decoder_ffn_embed_dim,
        d_model=fairseq_args.encoder_embed_dim,
        dropout=fairseq_args.dropout,
        attention_dropout=fairseq_args.attention_dropout,
        activation_dropout=fairseq_args.activation_dropout,
        activation_function="relu",
        num_conv_layers=len(conv_kernel_sizes),
        conv_channels=fairseq_args.conv_channels,
        conv_kernel_sizes=conv_kernel_sizes,
        input_feat_per_channel=fairseq_args.input_feat_per_channel,
        input_channels=fairseq_args.input_channels,
        tie_word_embeddings=tie_embeds,
        # Hard-coded generation-related settings passed along to the config.
        num_beams=5,
        max_length=200,
        use_cache=True,
        decoder_start_token_id=2,
        early_stopping=True,
    )

    model = Speech2TextForConditionalGeneration(config)
    # Non-strict load: the unexpected keys are deliberately ignored, the
    # missing ones are validated against a small allow-list below.
    missing_keys, _unexpected_keys = model.model.load_state_dict(weights, strict=False)

    tolerated_missing = {
        "encoder.embed_positions.weights",
        "decoder.embed_positions.weights",
    }
    if set(missing_keys) - tolerated_missing:
        raise ValueError(
            "Only `encoder.embed_positions.weights` and `decoder.embed_positions.weights` are allowed to be missing,"
            f" but all the following weights are missing {missing_keys}"
        )

    if not tie_embeds:
        # Untied head: reuse the projection weights captured from the checkpoint.
        model.lm_head.weight.data = lm_head_weights
    else:
        # Tied head: build the LM head from the decoder token embeddings.
        model.lm_head = make_linear_from_emb(model.model.decoder.embed_tokens)

    model.save_pretrained(pytorch_dump_folder_path)
def test_generate_fp16(self):
    """Smoke-test ``generate`` on the tester's inputs, in half precision on CUDA.

    Runs one ``generate`` call with the attention mask and one sampling
    beam-search call without it; the outputs are not inspected, so the test
    only checks that neither call raises.
    """
    config, inputs = self.model_tester.prepare_config_and_inputs()
    features, mask = inputs["input_features"], inputs["attention_mask"]

    model = Speech2TextForConditionalGeneration(config)
    model = model.eval().to(torch_device)

    # Only cast to fp16 when running on CUDA; other devices keep full precision.
    if torch_device == "cuda":
        features = features.half()
        model.half()

    model.generate(features, attention_mask=mask)

    sampling_options = {
        "num_beams": 4,
        "do_sample": True,
        "early_stopping": False,
        "num_return_sequences": 3,
    }
    model.generate(features, **sampling_options)