@classmethod
def build_model(cls, args, task):
    """Build a new model instance."""
    # make sure all arguments are present in older models
    base_lm_architecture(args)

    if getattr(args, "max_target_positions", None) is not None:
        max_target_positions = args.max_target_positions
    else:
        max_target_positions = getattr(
            args, "tokens_per_sample", DEFAULT_MAX_TARGET_POSITIONS
        )

    def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim):
        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        embed_dict = utils.parse_embedding(embed_path)
        utils.print_embed_overlap(embed_dict, dictionary)
        return utils.load_embedding(embed_dict, dictionary, embed_tokens)

    # pick the dictionary this LM is trained over: the word-level one if
    # requested, otherwise the task's target (ASR) or source (LM) dictionary
    if args.is_wordlm and hasattr(task, "word_dictionary"):
        dictionary = task.word_dictionary
    elif isinstance(task, SpeechRecognitionEspressoTask):
        dictionary = task.target_dictionary
    else:
        dictionary = task.source_dictionary

    # separate decoder input embeddings
    pretrained_decoder_embed = None
    if args.decoder_embed_path:
        pretrained_decoder_embed = load_pretrained_embedding_from_file(
            args.decoder_embed_path, dictionary, args.decoder_embed_dim
        )

    # one last double check of parameter combinations
    if args.share_embed and (args.decoder_embed_dim != args.decoder_out_embed_dim):
        raise ValueError(
            "--share-embed requires --decoder-embed-dim to match "
            "--decoder-out-embed-dim"
        )
    # only a pretrained embedding can be frozen; without --decoder-embed-path
    # there is no embedding object here to freeze
    if args.decoder_freeze_embed and pretrained_decoder_embed is not None:
        pretrained_decoder_embed.weight.requires_grad = False

    decoder = SpeechLSTMDecoder(
        dictionary=dictionary,
        embed_dim=args.decoder_embed_dim,
        hidden_size=args.decoder_hidden_size,
        out_embed_dim=args.decoder_out_embed_dim,
        num_layers=args.decoder_layers,
        dropout_in=args.decoder_dropout_in,
        dropout_out=args.decoder_dropout_out,
        attn_type=None,  # no attention: the decoder is used as a pure LM
        encoder_output_units=0,
        pretrained_embed=pretrained_decoder_embed,
        share_input_output_embed=args.share_embed,
        adaptive_softmax_cutoff=(
            utils.eval_str_list(args.adaptive_softmax_cutoff, type=int)
            if args.criterion == "adaptive_loss" else None
        ),
        max_target_positions=max_target_positions,
    )
    return cls(decoder, args)
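# `build_model` above relies on a module-level `Embedding` helper to create the
# input embedding table before any pretrained vectors are loaded into it. That
# helper is not part of this excerpt; below is a minimal sketch consistent with
# fairseq's LSTM models (the 0.1 init scale is an assumption, not a value
# confirmed by this file).
import torch.nn as nn


def Embedding(num_embeddings, embedding_dim, padding_idx):
    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    # small uniform init, then zero the padding row so the pad token
    # contributes nothing to the decoder input
    nn.init.uniform_(m.weight, -0.1, 0.1)
    nn.init.constant_(m.weight[padding_idx], 0)
    return m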
@classmethod
def build_decoder(cls, cfg, tgt_dict, embed_tokens):
    return SpeechLSTMDecoder(
        tgt_dict,
        embed_dim=cfg.decoder.embed_dim,
        hidden_size=cfg.decoder.hidden_size,
        out_embed_dim=cfg.decoder.hidden_size,
        num_layers=cfg.decoder.layers,
        dropout_in=cfg.decoder.dropout_in,
        dropout_out=cfg.decoder.dropout_out,
        residual=cfg.decoder.residual,
        pretrained_embed=embed_tokens,
        share_input_output_embed=True,  # disallow fc_out in decoder
        max_target_positions=DEFAULT_MAX_TARGET_POSITIONS,
    )
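# For context, a factory like `build_decoder` is normally called from the
# model's `build_model`, which constructs the shared embedding first and passes
# it in. A hypothetical sketch of that wiring follows; the `build_encoder`
# companion factory and the `cls(encoder, decoder)` constructor signature are
# assumptions for illustration, not code from this repo.
@classmethod
def build_model(cls, cfg, task):
    tgt_dict = task.target_dictionary
    # shared input/output embedding, matching share_input_output_embed=True above
    embed_tokens = Embedding(len(tgt_dict), cfg.decoder.embed_dim, tgt_dict.pad())
    encoder = cls.build_encoder(cfg, task)  # assumed companion factory
    decoder = cls.build_decoder(cfg, tgt_dict, embed_tokens)
    return cls(encoder, decoder)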