def __init__(self, idims, odim, args):
    """Set up the multi-encoder E2E model from parsed options.

    Args:
        idims (list): list of the number of an input feature dim.
        odim (int): The number of output vocab.
        args (Namespace): arguments

    """
    super(E2E, self).__init__()
    torch.nn.Module.__init__(self)

    # mtlalpha must lie in the closed interval [0.0, 1.0]
    self.mtlalpha = args.mtlalpha
    assert 0.0 <= self.mtlalpha <= 1.0, "mtlalpha should be [0.0, 1.0]"

    self.verbose = args.verbose
    # NOTE: for self.build method
    args.char_list = getattr(args, "char_list", None)
    self.char_list = args.char_list
    self.outdir = args.outdir
    self.space = args.sym_space
    self.blank = args.sym_blank
    self.reporter = Reporter()
    self.num_encs = args.num_encs
    self.share_ctc = args.share_ctc

    # sos and eos share one ID: the last index of the output vocabulary
    last_id = odim - 1
    self.sos = last_id
    self.eos = last_id

    # subsample info
    self.subsample_list = get_subsample(args, mode="asr", arch="rnn_mulenc")

    # label smoothing info (only when the training json is actually present)
    smoothing_dist = None
    if args.lsm_type and os.path.isfile(args.train_json):
        logging.info("Use label smoothing with " + args.lsm_type)
        smoothing_dist = label_smoothing_dist(
            odim, args.lsm_type, transcript=args.train_json
        )

    # speech translation related; getattr keeps compatibility with old args
    self.replace_sos = getattr(args, "replace_sos", False)

    self.frontend = None

    # sub-networks are created in the order: encoder, ctc, attention, decoder
    self.enc = encoder_for(args, idims, self.subsample_list)
    self.ctc = ctc_for(args, odim)
    self.att = att_for(args)
    # the hierarchical attention network is appended to the attention list
    self.att.append(att_for(args, han_mode=True))
    self.dec = decoder_for(
        args, odim, self.sos, self.eos, self.att, smoothing_dist
    )

    if not (args.mtlalpha > 0 and self.num_encs > 1):
        self.weights_ctc_dec = [1.0]
        self.weights_ctc_train = [1.0]
    else:
        # weights-ctc,
        # e.g. ctc_loss = w_1*ctc_1_loss + w_2 * ctc_2_loss + w_N * ctc_N_loss
        train_weights = args.weights_ctc_train
        dec_weights = args.weights_ctc_dec
        # normalize so that each weight set sums to one
        self.weights_ctc_train = train_weights / np.sum(train_weights)
        self.weights_ctc_dec = dec_weights / np.sum(dec_weights)
        logging.info(
            "ctc weights (training during training): "
            + " ".join(str(w) for w in self.weights_ctc_train)
        )
        logging.info(
            "ctc weights (decoding during training): "
            + " ".join(str(w) for w in self.weights_ctc_dec)
        )

    # weight initialization
    self.init_like_chainer()

    # options for beam search, only needed when CER/WER reporting is on
    if not (args.report_cer or args.report_wer):
        self.report_cer = False
        self.report_wer = False
    else:
        self.recog_args = argparse.Namespace(
            beam_size=args.beam_size,
            penalty=args.penalty,
            ctc_weight=args.ctc_weight,
            maxlenratio=args.maxlenratio,
            minlenratio=args.minlenratio,
            lm_weight=args.lm_weight,
            rnnlm=args.rnnlm,
            nbest=args.nbest,
            space=args.sym_space,
            blank=args.sym_blank,
            tgt_lang=False,
            ctc_weights_dec=self.weights_ctc_dec,
        )
        self.report_cer = args.report_cer
        self.report_wer = args.report_wer

    self.rnnlm = None
    self.logzero = -10000000000.0
    self.loss = None
    self.acc = None
def __init__(self, idim, odim, mono_odim, args, ignore_id=-1):
    """Construct an E2E object with two posterior heads.

    :param int idim: dimension of inputs
    :param int odim: dimension of outputs
    :param int mono_odim: output dimension of the second (``mono``) head
    :param Namespace args: argument Namespace containing options
    :param int ignore_id: stored as ``self.ignore_id``
    :raises NotImplementedError: if ``args.initializer`` is not one of
        ``lecun``, ``orthogonal`` or ``xavier``
    """
    super(E2E, self).__init__()
    torch.nn.Module.__init__(self)

    self.mtlalpha = args.mtlalpha
    assert 0.0 <= self.mtlalpha <= 1.0, "mtlalpha should be [0.0, 1.0]"
    self.etype = args.etype
    self.verbose = args.verbose
    # NOTE: for self.build method
    self.outdir = args.outdir

    # target matching system organization
    self.oversampling = args.oversampling
    self.residual = args.residual
    self.outer = args.outer
    # both heads project from eprojs to (vocab size * oversampling)
    self.poster = torch.nn.Linear(args.eprojs, odim * self.oversampling)
    self.poster_mono = torch.nn.Linear(
        args.eprojs, mono_odim * self.oversampling
    )

    # sos and eos share the last index of the respective vocabulary
    self.sos = self.eos = odim - 1
    self.sos_mono = self.eos_mono = mono_odim - 1
    self.odim = odim
    self.mono_odim = mono_odim
    self.ignore_id = ignore_id

    self.subsample = get_subsample(args, mode="asr", arch="rnn")
    self.reporter = Reporter()

    # label smoothing info; the resulting distribution is not passed to any
    # sub-module in this constructor, but the call (and its log line) is kept
    smoothing_dist = None
    if args.lsm_type and os.path.isfile(args.train_json):
        logging.info("Use label smoothing with " + args.lsm_type)
        smoothing_dist = label_smoothing_dist(
            odim, args.lsm_type, transcript=args.train_json
        )

    # use getattr to keep compatibility
    if not getattr(args, "use_frontend", False):
        self.frontend = None
    else:
        self.frontend = frontend_for(args, idim)
        self.feature_transform = feature_transform_for(args, (idim - 1) * 2)
        # the encoder then receives n_mels-dimensional input
        idim = args.n_mels

    # encoder
    self.enc = encoder_for(args, idim, self.subsample)
    # ctc
    self.ctc = ctc_for(args, odim)

    # weight initialization: map the option value to the method to call
    init_method_names = {
        "lecun": "init_like_chainer",
        "orthogonal": "init_orthogonal",
        "xavier": "init_xavier",
    }
    if args.initializer not in init_method_names:
        raise NotImplementedError("unknown initializer: " + args.initializer)
    getattr(self, init_method_names[args.initializer])()

    self.error_calculator = None
    if args.report_cer or args.report_wer:
        self.error_calculator = ErrorCalculator(
            args.char_list,
            args.sym_space,
            args.sym_blank,
            args.report_cer,
            args.report_wer,
        )

    self.rnnlm = None
    self.logzero = -10000000000.0
    self.loss = None
    self.acc = None
def __init__(self, idim, odim, args):
    """Construct the multi-speaker E2E model.

    Args:
        idim (int): input dimension handed to ``encoder_for``.
        odim (int): output vocabulary size; ``odim - 1`` is used as sos/eos.
        args (Namespace): argument Namespace containing options.

    """
    super(E2E, self).__init__()
    self.mtlalpha = args.mtlalpha
    assert 0.0 <= self.mtlalpha <= 1.0, "mtlalpha should be [0.0, 1.0]"
    self.etype = args.etype
    self.verbose = args.verbose
    self.char_list = args.char_list
    self.outdir = args.outdir
    self.reporter = Reporter()
    self.num_spkrs = args.num_spkrs
    self.spa = args.spa
    self.pit = PIT(self.num_spkrs)

    # below means the last number becomes eos/sos ID
    # note that sos/eos IDs are identical
    self.sos = odim - 1
    self.eos = odim - 1

    # subsample info
    # +1 means input (+1) and layers outputs (args.elayer_sd + args.elayers)
    n_slots = args.elayers_sd + args.elayers + 1
    # builtin int instead of np.int: the alias was removed in NumPy 1.24
    subsample = np.ones(n_slots, dtype=int)
    if args.etype.endswith("p") and not args.etype.startswith("vgg"):
        # copy at most n_slots factors from the "_"-separated option string
        for j, factor in enumerate(args.subsample.split("_")[:n_slots]):
            subsample[j] = int(factor)
    else:
        logging.warning(
            'Subsampling is not performed for vgg*. It is performed in max pooling layers at CNN.'
        )
    logging.info('subsample: ' + ' '.join([str(x) for x in subsample]))
    self.subsample = subsample

    # label smoothing info
    if args.lsm_type:
        logging.info("Use label smoothing with " + args.lsm_type)
        labeldist = label_smoothing_dist(
            odim, args.lsm_type, transcript=args.train_json
        )
    else:
        labeldist = None

    # encoder
    self.enc = encoder_for(args, idim, self.subsample)
    # ctc
    self.ctc = ctc_for(args, odim, reduce=False)
    # attention: one per speaker when args.spa is set, otherwise a single one
    num_att = self.num_spkrs if args.spa else 1
    self.att = att_for(args, num_att)
    # decoder
    self.dec = decoder_for(args, odim, self.sos, self.eos, self.att, labeldist)

    # weight initialization
    self.init_like_chainer()

    # options for beam search; args may not carry report_cer at all
    if 'report_cer' in vars(args) and (args.report_cer or args.report_wer):
        recog_args = {
            'beam_size': args.beam_size,
            'penalty': args.penalty,
            'ctc_weight': args.ctc_weight,
            'maxlenratio': args.maxlenratio,
            'minlenratio': args.minlenratio,
            'lm_weight': args.lm_weight,
            'rnnlm': args.rnnlm,
            'nbest': args.nbest,
            'space': args.sym_space,
            'blank': args.sym_blank,
        }
        self.recog_args = argparse.Namespace(**recog_args)
        self.report_cer = args.report_cer
        self.report_wer = args.report_wer
    else:
        self.report_cer = False
        self.report_wer = False
    self.rnnlm = None

    self.logzero = -10000000000.0
    self.loss = None
    self.acc = None
def __init__(self, idims, odim, args):
    """Initialize this class with python-level args.

    Args:
        idims (list): list of the number of an input feature dim.
        odim (int): The number of output vocab.
        args (Namespace): arguments

    """
    super(E2E, self).__init__()
    torch.nn.Module.__init__(self)
    self.mtlalpha = args.mtlalpha
    assert 0.0 <= self.mtlalpha <= 1.0, "mtlalpha should be [0.0, 1.0]"
    self.verbose = args.verbose
    # NOTE: for self.build method
    args.char_list = getattr(args, "char_list", None)
    self.char_list = args.char_list
    self.outdir = args.outdir
    self.space = args.sym_space
    self.blank = args.sym_blank
    self.reporter = Reporter()
    self.num_encs = args.num_encs
    self.share_ctc = args.share_ctc

    # below means the last number becomes eos/sos ID
    # note that sos/eos IDs are identical
    self.sos = odim - 1
    self.eos = odim - 1

    # subsample info: one array per encoder, read from the per-encoder
    # entries of args.elayers / args.etype / args.subsample
    self.subsample_list = []
    for idx in range(self.num_encs):
        # +1 means input (+1) and layers outputs (args.elayer)
        n_slots = args.elayers[idx] + 1
        # builtin int instead of np.int: the alias was removed in NumPy 1.24
        subsample = np.ones(n_slots, dtype=int)
        etype = args.etype[idx]
        if etype.endswith("p") and not etype.startswith("vgg"):
            # copy at most n_slots factors from the "_"-separated string
            factors = args.subsample[idx].split("_")[:n_slots]
            for j, factor in enumerate(factors):
                subsample[j] = int(factor)
        else:
            logging.warning(
                'Encoder {}: Subsampling is not performed for vgg*. '
                'It is performed in max pooling layers at CNN.'.format(
                    idx + 1))
        logging.info('subsample: ' + ' '.join([str(x) for x in subsample]))
        self.subsample_list.append(subsample)

    # label smoothing info
    if args.lsm_type and os.path.isfile(args.train_json):
        logging.info("Use label smoothing with " + args.lsm_type)
        labeldist = label_smoothing_dist(
            odim, args.lsm_type, transcript=args.train_json
        )
    else:
        labeldist = None

    # speech translation related
    # use getattr to keep compatibility
    self.replace_sos = getattr(args, "replace_sos", False)

    self.frontend = None

    # encoder
    self.enc = encoder_for(args, idims, self.subsample_list)
    # ctc
    self.ctc = ctc_for(args, odim)
    # attention
    self.att = att_for(args)
    # hierarchical attention network
    han = att_for(args, han_mode=True)
    self.att.append(han)
    # decoder
    self.dec = decoder_for(args, odim, self.sos, self.eos, self.att, labeldist)

    if args.mtlalpha > 0 and self.num_encs > 1:
        # weights-ctc,
        # e.g. ctc_loss = w_1*ctc_1_loss + w_2 * ctc_2_loss + w_N * ctc_N_loss
        # both weight sets are normalized to sum to one
        self.weights_ctc_train = args.weights_ctc_train / np.sum(
            args.weights_ctc_train)
        self.weights_ctc_dec = args.weights_ctc_dec / np.sum(
            args.weights_ctc_dec)
        logging.info('ctc weights (training during training): ' +
                     ' '.join([str(x) for x in self.weights_ctc_train]))
        logging.info('ctc weights (decoding during training): ' +
                     ' '.join([str(x) for x in self.weights_ctc_dec]))
    else:
        self.weights_ctc_dec = [1.0]
        self.weights_ctc_train = [1.0]

    # weight initialization
    self.init_like_chainer()

    # options for beam search
    if args.report_cer or args.report_wer:
        recog_args = {
            'beam_size': args.beam_size,
            'penalty': args.penalty,
            'ctc_weight': args.ctc_weight,
            'maxlenratio': args.maxlenratio,
            'minlenratio': args.minlenratio,
            'lm_weight': args.lm_weight,
            'rnnlm': args.rnnlm,
            'nbest': args.nbest,
            'space': args.sym_space,
            'blank': args.sym_blank,
            'tgt_lang': False,
            'ctc_weights_dec': self.weights_ctc_dec,
        }
        self.recog_args = argparse.Namespace(**recog_args)
        self.report_cer = args.report_cer
        self.report_wer = args.report_wer
    else:
        self.report_cer = False
        self.report_wer = False
    self.rnnlm = None

    self.logzero = -10000000000.0
    self.loss = None
    self.acc = None
def __init__(self, idim, odim, args):
    """Construct an attention/CTC E2E model with an extra NLL loss module.

    Args:
        idim (int): input dimension; replaced by ``args.n_mels`` for the
            encoder when ``args.use_frontend`` is set.
        odim (int): output vocabulary size; ``odim - 1`` is used as sos/eos.
        args (Namespace): argument Namespace containing options.

    """
    super(E2E, self).__init__()
    torch.nn.Module.__init__(self)
    self.mtlalpha = args.mtlalpha
    assert 0.0 <= self.mtlalpha <= 1.0, "mtlalpha should be [0.0, 1.0]"
    self.etype = args.etype
    self.verbose = args.verbose
    # NOTE: for self.build method
    args.char_list = getattr(args, "char_list", None)
    self.char_list = args.char_list
    self.outdir = args.outdir
    self.space = args.sym_space
    self.blank = args.sym_blank
    # self.oracle_length = args.oracle_length
    self.reporter = Reporter()

    # below means the last number becomes eos/sos ID
    # note that sos/eos IDs are identical
    self.sos = odim - 1
    self.eos = odim - 1

    # subsample info
    # +1 means input (+1) and layers outputs (args.elayer)
    n_slots = args.elayers + 1
    # builtin int instead of np.int: the alias was removed in NumPy 1.24
    subsample = np.ones(n_slots, dtype=int)
    if args.etype.endswith("p") and not args.etype.startswith("vgg"):
        # copy at most n_slots factors from the "_"-separated option string
        for j, factor in enumerate(args.subsample.split("_")[:n_slots]):
            subsample[j] = int(factor)
    else:
        logging.warning(
            'Subsampling is not performed for vgg*. It is performed in max pooling layers at CNN.'
        )
    logging.info('subsample: ' + ' '.join([str(x) for x in subsample]))
    self.subsample = subsample

    # label smoothing info
    if args.lsm_type and os.path.isfile(args.train_json):
        logging.info("Use label smoothing with " + args.lsm_type)
        labeldist = label_smoothing_dist(
            odim, args.lsm_type, transcript=args.train_json
        )
    else:
        labeldist = None

    # speech translation related
    # use getattr to keep compatibility
    self.replace_sos = getattr(args, "replace_sos", False)

    # use getattr to keep compatibility
    if getattr(args, "use_frontend", False):
        # Relative importing because of using python3 syntax
        from espnet.nets.pytorch_backend.frontends.feature_transform \
            import feature_transform_for
        from espnet.nets.pytorch_backend.frontends.frontend \
            import frontend_for

        self.frontend = frontend_for(args, idim)
        self.feature_transform = feature_transform_for(args, (idim - 1) * 2)
        # the encoder then receives n_mels-dimensional input
        idim = args.n_mels
    else:
        self.frontend = None

    # encoder
    self.enc = encoder_for(args, idim, self.subsample)
    # ctc
    self.ctc = ctc_for(args, odim)
    # attention
    self.att = att_for(args)
    # decoder
    self.dec = decoder_for(args, odim, self.sos, self.eos, self.att, labeldist)

    # weight initialization
    self.init_like_chainer()

    # options for beam search
    if args.report_cer or args.report_wer:
        recog_args = {
            'beam_size': args.beam_size,
            'penalty': args.penalty,
            'ctc_weight': args.ctc_weight,
            'maxlenratio': args.maxlenratio,
            'minlenratio': args.minlenratio,
            'lm_weight': args.lm_weight,
            'rnnlm': args.rnnlm,
            'nbest': args.nbest,
            'space': args.sym_space,
            'blank': args.sym_blank,
            'tgt_lang': False,
            'sampling': args.sampling,
        }
        self.recog_args = argparse.Namespace(**recog_args)
        self.report_cer = args.report_cer
        self.report_wer = args.report_wer
    else:
        self.report_cer = False
        self.report_wer = False
    self.rnnlm = None

    self.logzero = -10000000000.0
    self.loss = None
    self.acc = None
    self.loss_nll = torch.nn.NLLLoss()
def __init__(self, idim, odim, args):
    """Build the attention/CTC E2E model from parsed options.

    :param int idim: dimension of inputs
    :param int odim: dimension of outputs
    :param Namespace args: argument Namespace containing options
    """
    super(E2E, self).__init__()
    torch.nn.Module.__init__(self)

    # fill missing arguments for compatibility
    args = fill_missing_args(args, self.add_arguments)

    self.mtlalpha = args.mtlalpha
    assert 0.0 <= self.mtlalpha <= 1.0, "mtlalpha should be [0.0, 1.0]"
    self.etype = args.etype
    self.verbose = args.verbose
    # NOTE: for self.build method
    args.char_list = getattr(args, "char_list", None)
    self.char_list = args.char_list
    self.outdir = args.outdir
    self.space = args.sym_space
    self.blank = args.sym_blank
    self.reporter = Reporter()

    # sos and eos share one ID: the last index of the output vocabulary
    last_id = odim - 1
    self.sos = last_id
    self.eos = last_id

    # subsample info
    self.subsample = get_subsample(args, mode="asr", arch="rnn")

    # label smoothing info (only when the training json is actually present)
    smoothing_dist = None
    if args.lsm_type and os.path.isfile(args.train_json):
        logging.info("Use label smoothing with " + args.lsm_type)
        smoothing_dist = label_smoothing_dist(
            odim, args.lsm_type, transcript=args.train_json
        )

    # use getattr to keep compatibility
    if not getattr(args, "use_frontend", False):
        self.frontend = None
    else:
        self.frontend = frontend_for(args, idim)
        self.feature_transform = feature_transform_for(args, (idim - 1) * 2)
        # the encoder then receives n_mels-dimensional input
        idim = args.n_mels

    # sub-networks are created in the order: encoder, ctc, attention, decoder
    self.enc = encoder_for(args, idim, self.subsample)
    self.ctc = ctc_for(args, odim)
    self.att = att_for(args)
    self.dec = decoder_for(
        args, odim, self.sos, self.eos, self.att, smoothing_dist
    )

    # weight initialization
    self.init_like_chainer()

    # options for beam search, only needed when CER/WER reporting is on
    if not (args.report_cer or args.report_wer):
        self.report_cer = False
        self.report_wer = False
    else:
        self.recog_args = argparse.Namespace(
            beam_size=args.beam_size,
            penalty=args.penalty,
            ctc_weight=args.ctc_weight,
            maxlenratio=args.maxlenratio,
            minlenratio=args.minlenratio,
            lm_weight=args.lm_weight,
            rnnlm=args.rnnlm,
            nbest=args.nbest,
            space=args.sym_space,
            blank=args.sym_blank,
        )
        self.report_cer = args.report_cer
        self.report_wer = args.report_wer

    self.rnnlm = None
    self.logzero = -10000000000.0
    self.loss = None
    self.acc = None
def __init__(self, idim, odim, args):
    """Initialize multi-speaker modules.

    Args:
        idim (int): dimension of inputs
        odim (int): dimension of outputs
        args (Namespace): argument Namespace containing options

    """
    torch.nn.Module.__init__(self)
    self.mtlalpha = args.mtlalpha
    assert 0.0 <= self.mtlalpha <= 1.0, "mtlalpha should be [0.0, 1.0]"
    self.rnnt_mode = args.rnnt_mode
    self.etype = args.etype
    self.verbose = args.verbose
    self.char_list = args.char_list
    self.outdir = args.outdir
    self.reporter = Reporter()
    self.num_spkrs = args.num_spkrs
    self.spa = args.spa
    self.pit = PIT(self.num_spkrs)

    # below means the last number becomes eos/sos ID
    # note that sos/eos IDs are identical
    self.sos = odim - 1
    self.eos = odim - 1

    # subsample info
    # +1 means input (+1) and layers outputs (args.elayer_sd + args.elayers)
    n_slots = args.elayers_sd + args.elayers + 1
    # builtin int instead of np.int: the alias was removed in NumPy 1.24
    subsample = np.ones(n_slots, dtype=int)
    if args.etype.endswith("p") and not args.etype.startswith("vgg"):
        # copy at most n_slots factors from the "_"-separated option string
        for j, factor in enumerate(args.subsample.split("_")[:n_slots]):
            subsample[j] = int(factor)
    else:
        logging.warning(
            'Subsampling is not performed for vgg*. It is performed in max pooling layers at CNN.'
        )
    logging.info('subsample: ' + ' '.join([str(x) for x in subsample]))
    self.subsample = subsample

    # label smoothing info; the result is not passed to any sub-module in
    # this constructor, but the call (and its log line) is kept as-is
    if args.lsm_type and os.path.isfile(args.train_json):
        logging.info("Use label smoothing with " + args.lsm_type)
        labeldist = label_smoothing_dist(
            odim, args.lsm_type, transcript=args.train_json
        )
    else:
        labeldist = None

    # use getattr to keep compatibility
    if getattr(args, "use_frontend", False):
        # Relative importing because of using python3 syntax
        from espnet.nets.pytorch_backend.frontends.feature_transform \
            import feature_transform_for
        from espnet.nets.pytorch_backend.frontends.frontend \
            import frontend_for

        self.frontend = frontend_for(args, idim)
        self.feature_transform = feature_transform_for(args, (idim - 1) * 2)
        # the encoder then receives n_mels-dimensional input
        idim = args.n_mels
    else:
        self.frontend = None

    # encoder
    self.enc = encoder_for(args, idim, self.subsample)
    # ctc
    self.ctc = ctc_for(args, odim, reduce=False)

    if args.rnnt_mode == 'rnnt-att':
        # attention: one per speaker when args.spa is set, otherwise one
        num_att = self.num_spkrs if args.spa else 1
        self.att = att_for(args, num_att)
        # decoder
        self.dec = decoder_for(args, odim, self.att)
    else:
        # prediction
        self.dec = decoder_for(args, odim)

    # weight initialization
    self.init_like_chainer()

    # options for beam search; args may not carry report_cer at all
    if 'report_cer' in vars(args) and (args.report_cer or args.report_wer):
        recog_args = {
            'beam_size': args.beam_size,
            'penalty': args.penalty,
            'ctc_weight': args.ctc_weight,
            'maxlenratio': args.maxlenratio,
            'minlenratio': args.minlenratio,
            'lm_weight': args.lm_weight,
            'rnnlm': args.rnnlm,
            'nbest': args.nbest,
            'space': args.sym_space,
            'blank': args.sym_blank,
        }
        self.recog_args = argparse.Namespace(**recog_args)
        self.report_cer = args.report_cer
        self.report_wer = args.report_wer
    else:
        self.report_cer = False
        self.report_wer = False
    self.rnnlm = None

    self.logzero = -10000000000.0
    self.loss = None
    self.acc = None
def __init__(self, idim, odim, args):
    """Build the multi-speaker E2E module from parsed options.

    Args:
        idim (int): input dimension; replaced by ``args.n_mels`` for the
            encoder when ``args.use_frontend`` is set.
        odim (int): output vocabulary size; ``odim - 1`` is used as sos/eos.
        args (Namespace): argument Namespace containing options.

    """
    torch.nn.Module.__init__(self)

    self.mtlalpha = args.mtlalpha
    assert 0.0 <= self.mtlalpha <= 1.0, "mtlalpha should be [0.0, 1.0]"
    self.etype = args.etype
    self.verbose = args.verbose
    self.char_list = args.char_list
    self.outdir = args.outdir
    self.reporter = Reporter()
    self.num_spkrs = args.num_spkrs
    self.spa = args.spa
    self.pit = PIT(self.num_spkrs)

    # sos and eos share one ID: the last index of the output vocabulary
    last_id = odim - 1
    self.sos = last_id
    self.eos = last_id

    # subsample info
    self.subsample = get_subsample(args, mode='asr', arch='rnn_mix')

    # label smoothing info (only when the training json is actually present)
    smoothing_dist = None
    if args.lsm_type and os.path.isfile(args.train_json):
        logging.info("Use label smoothing with " + args.lsm_type)
        smoothing_dist = label_smoothing_dist(
            odim, args.lsm_type, transcript=args.train_json
        )

    # use getattr to keep compatibility
    if not getattr(args, "use_frontend", False):
        self.frontend = None
    else:
        # Relative importing because of using python3 syntax
        from espnet.nets.pytorch_backend.frontends.feature_transform \
            import feature_transform_for
        from espnet.nets.pytorch_backend.frontends.frontend \
            import frontend_for

        self.frontend = frontend_for(args, idim)
        self.feature_transform = feature_transform_for(args, (idim - 1) * 2)
        # the encoder then receives n_mels-dimensional input
        idim = args.n_mels

    # sub-networks are created in the order: encoder, ctc, attention, decoder
    self.enc = encoder_for(args, idim, self.subsample)
    self.ctc = ctc_for(args, odim, reduce=False)
    # one attention per speaker when args.spa is set, otherwise a single one
    if args.spa:
        attention_count = self.num_spkrs
    else:
        attention_count = 1
    self.att = att_for(args, attention_count)
    self.dec = decoder_for(
        args, odim, self.sos, self.eos, self.att, smoothing_dist
    )

    # weight initialization
    self.init_like_chainer()

    # options for beam search; args may not carry report_cer at all
    report_errors = 'report_cer' in vars(args) and (
        args.report_cer or args.report_wer
    )
    if not report_errors:
        self.report_cer = False
        self.report_wer = False
    else:
        self.recog_args = argparse.Namespace(
            beam_size=args.beam_size,
            penalty=args.penalty,
            ctc_weight=args.ctc_weight,
            maxlenratio=args.maxlenratio,
            minlenratio=args.minlenratio,
            lm_weight=args.lm_weight,
            rnnlm=args.rnnlm,
            nbest=args.nbest,
            space=args.sym_space,
            blank=args.sym_blank,
        )
        self.report_cer = args.report_cer
        self.report_wer = args.report_wer

    self.rnnlm = None
    self.logzero = -10000000000.0
    self.loss = None
    self.acc = None
def __init__(self, idim, odim, args, asr_model=None, mt_model=None):
    """Construct an E2E model, optionally seeded from pre-trained models.

    Args:
        idim (int): input dimension; replaced by ``args.n_mels`` for the
            encoder when ``args.use_frontend`` is set.
        odim (int): output vocabulary size; ``odim - 1`` is used as sos/eos.
        args (Namespace): argument Namespace containing options.
        asr_model (torch.nn.Module, optional): if given, its parameters
            whose name contains ``enc.enc`` overwrite the same-named,
            same-sized parameters of this model.
        mt_model (torch.nn.Module, optional): if given, its parameters
            whose name contains ``dec.`` or ``att`` overwrite the
            same-named, same-sized parameters of this model.

    """
    super(E2E, self).__init__()
    torch.nn.Module.__init__(self)
    self.mtlalpha = args.mtlalpha
    assert 0.0 <= self.mtlalpha <= 1.0, "mtlalpha should be [0.0, 1.0]"
    self.etype = args.etype
    self.verbose = args.verbose
    self.char_list = args.char_list
    self.outdir = args.outdir
    self.space = args.sym_space
    self.blank = args.sym_blank
    self.reporter = Reporter()

    # below means the last number becomes eos/sos ID
    # note that sos/eos IDs are identical
    self.sos = odim - 1
    self.eos = odim - 1

    # subsample info
    # +1 means input (+1) and layers outputs (args.elayer)
    n_slots = args.elayers + 1
    # builtin int instead of np.int: the alias was removed in NumPy 1.24
    subsample = np.ones(n_slots, dtype=int)
    if args.etype.endswith("p") and not args.etype.startswith("vgg"):
        # copy at most n_slots factors from the "_"-separated option string
        for j, factor in enumerate(args.subsample.split("_")[:n_slots]):
            subsample[j] = int(factor)
    else:
        logging.warning(
            'Subsampling is not performed for vgg*. It is performed in max pooling layers at CNN.')
    logging.info('subsample: ' + ' '.join([str(x) for x in subsample]))
    self.subsample = subsample

    # label smoothing info
    if args.lsm_type and os.path.isfile(args.train_json):
        logging.info("Use label smoothing with " + args.lsm_type)
        labeldist = label_smoothing_dist(
            odim, args.lsm_type, transcript=args.train_json
        )
    else:
        labeldist = None

    # speech translation related
    self.replace_sos = args.replace_sos

    if args.use_frontend:
        # Relative importing because of using python3 syntax
        from espnet.nets.pytorch_backend.frontends.feature_transform \
            import feature_transform_for
        from espnet.nets.pytorch_backend.frontends.frontend \
            import frontend_for

        self.frontend = frontend_for(args, idim)
        self.feature_transform = feature_transform_for(args, (idim - 1) * 2)
        # the encoder then receives n_mels-dimensional input
        idim = args.n_mels
    else:
        self.frontend = None

    # encoder
    self.enc = encoder_for(args, idim, self.subsample)
    # ctc
    self.ctc = ctc_for(args, odim)
    # attention
    self.att = att_for(args)
    # decoder
    self.dec = decoder_for(args, odim, self.sos, self.eos, self.att, labeldist)

    # weight initialization
    self.init_like_chainer()

    # pre-training w/ ASR encoder and NMT decoder; this runs after the
    # initialization above so the copied weights are not re-initialized
    if asr_model is not None:
        param_dict = dict(asr_model.named_parameters())
        for n, p in self.named_parameters():
            # overwrite the encoder
            if n in param_dict and p.size() == param_dict[n].size():
                if 'enc.enc' in n:
                    p.data = param_dict[n].data
                    logging.warning('Overwrite %s' % n)
    if mt_model is not None:
        param_dict = dict(mt_model.named_parameters())
        for n, p in self.named_parameters():
            # overwrite the decoder
            if n in param_dict and p.size() == param_dict[n].size():
                if 'dec.' in n or 'att' in n:
                    p.data = param_dict[n].data
                    logging.warning('Overwrite %s' % n)

    # options for beam search
    if args.report_cer or args.report_wer:
        recog_args = {
            'beam_size': args.beam_size,
            'penalty': args.penalty,
            'ctc_weight': args.ctc_weight,
            'maxlenratio': args.maxlenratio,
            'minlenratio': args.minlenratio,
            'lm_weight': args.lm_weight,
            'rnnlm': args.rnnlm,
            'nbest': args.nbest,
            'space': args.sym_space,
            'blank': args.sym_blank,
            'tgt_lang': False,
        }
        self.recog_args = argparse.Namespace(**recog_args)
        self.report_cer = args.report_cer
        self.report_wer = args.report_wer
    else:
        self.report_cer = False
        self.report_wer = False
    self.rnnlm = None

    self.logzero = -10000000000.0
    self.loss = None
    self.acc = None
def __init__(self, idim, odim, args):
    """Construct an E2E object with an optional meeting knowledge base.

    :param int idim: dimension of inputs
    :param int odim: dimension of outputs
    :param Namespace args: argument Namespace containing options
    """
    super(E2E, self).__init__()
    torch.nn.Module.__init__(self)

    # fill missing arguments for compatibility
    args = fill_missing_args(args, self.add_arguments)

    self.mtlalpha = args.mtlalpha
    assert 0.0 <= self.mtlalpha <= 1.0, "mtlalpha should be [0.0, 1.0]"
    self.etype = args.etype
    self.verbose = args.verbose
    # NOTE: for self.build method
    args.char_list = getattr(args, "char_list", None)
    self.char_list = args.char_list
    self.outdir = args.outdir
    self.space = args.sym_space
    self.blank = args.sym_blank
    self.reporter = Reporter()

    # below means the last number becomes eos/sos ID
    # note that sos/eos IDs are identical
    self.sos = odim - 1
    self.eos = odim - 1

    # gs534 - word vocab
    # hack here for bpe flag: more than 100 output units is treated as BPE.
    # char_list may be None (see the getattr default above), in which case
    # the flag is False instead of len(None) raising TypeError.
    bpe = self.char_list is not None and len(self.char_list) > 100
    self.vocabulary = (
        Vocabulary(args.dictfile, bpe) if args.dictfile != '' else None
    )

    # gs534 - create lexicon tree
    self.meeting_KB = None
    self.n_KBs = getattr(args, 'dynamicKBs', 0)
    if args.meetingKB and args.meetingpath != '':
        first_split = os.path.join(args.meetingpath, 'split_0')
        if self.n_KBs == 0 or not os.path.isdir(first_split):
            # single KB built directly from meetingpath
            self.meeting_KB = KBmeeting(
                self.vocabulary, args.meetingpath, args.char_list, bpe
            )
        else:
            # arrange multiple KBs, one per split_<i> sub-directory
            self.meeting_KB = [
                KBmeeting(
                    self.vocabulary,
                    os.path.join(args.meetingpath, 'split_{}'.format(i)),
                    args.char_list,
                    bpe,
                )
                for i in range(self.n_KBs)
            ]

    # subsample info
    self.subsample = get_subsample(args, mode="asr", arch="rnn")

    # label smoothing info
    if args.lsm_type and os.path.isfile(args.train_json):
        logging.info("Use label smoothing with " + args.lsm_type)
        labeldist = label_smoothing_dist(
            odim, args.lsm_type, transcript=args.train_json
        )
    else:
        labeldist = None

    # use getattr to keep compatibility
    if getattr(args, "use_frontend", False):
        self.frontend = frontend_for(args, idim)
        self.feature_transform = feature_transform_for(args, (idim - 1) * 2)
        # the encoder then receives n_mels-dimensional input
        idim = args.n_mels
    else:
        self.frontend = None

    # encoder
    self.enc = encoder_for(args, idim, self.subsample)
    # ctc
    self.ctc = ctc_for(args, odim)
    # attention
    self.att = att_for(args)
    # decoder; with multiple KBs only the first one is handed over here
    if isinstance(self.meeting_KB, list):
        decoder_kb = self.meeting_KB[0]
    else:
        decoder_kb = self.meeting_KB
    self.dec = decoder_for(
        args, odim, self.sos, self.eos, self.att, labeldist,
        meetingKB=decoder_kb,
    )

    # weight initialization
    self.init_from = getattr(args, 'init_full_model', None)
    self.init_like_chainer()

    # options for beam search
    if args.report_cer or args.report_wer:
        recog_args = {
            "beam_size": args.beam_size,
            "penalty": args.penalty,
            "ctc_weight": args.ctc_weight,
            "maxlenratio": args.maxlenratio,
            "minlenratio": args.minlenratio,
            "lm_weight": args.lm_weight,
            "rnnlm": args.rnnlm,
            "nbest": args.nbest,
            "space": args.sym_space,
            "blank": args.sym_blank,
        }
        self.recog_args = argparse.Namespace(**recog_args)
        self.report_cer = args.report_cer
        self.report_wer = args.report_wer
    else:
        self.report_cer = False
        self.report_wer = False
    self.rnnlm = None

    self.logzero = -10000000000.0
    self.loss = None
    self.acc = None
def __init__(self, idim, odim, args, ignore_id=-1, blank_id=0, training=True):
    """Construct an E2E object for transducer model.

    Args:
        idim (int): input dimension handed to the encoder.
        odim (int): output dimension; ``odim - 1`` is used as sos/eos.
        args (Namespace): argument Namespace containing options.
        ignore_id (int): stored as ``self.ignore_id`` and passed to the
            auxiliary label-smoothing loss.
        blank_id (int): stored as ``self.blank_id`` and passed to the RNN
            decoder and the transducer loss.
        training (bool): when false, the loss, error calculator and all
            auxiliary modules are not created.

    Raises:
        ValueError: if a custom encoder/decoder type is requested without
            the matching ``enc_block_arch`` / ``dec_block_arch``.

    """
    torch.nn.Module.__init__(self)

    args = fill_missing_args(args, self.add_arguments)

    self.is_rnnt = True
    self.transducer_weight = args.transducer_weight

    # auxiliary objectives are only active while training
    self.use_aux_task = bool(args.aux_task_type is not None and training)

    self.use_aux_ctc = args.aux_ctc and training
    self.aux_ctc_weight = args.aux_ctc_weight

    self.use_aux_cross_entropy = args.aux_cross_entropy and training
    self.aux_cross_entropy_weight = args.aux_cross_entropy_weight

    if self.use_aux_task:
        # upper bound handed to valid_aux_task_layer_list: number of
        # encoder layers minus one
        n_layers = (
            (len(args.enc_block_arch) * args.enc_block_repeat - 1)
            if args.enc_block_arch is not None
            else (args.elayers - 1)
        )

        aux_task_layer_list = valid_aux_task_layer_list(
            args.aux_task_layer_list,
            n_layers,
        )
    else:
        aux_task_layer_list = []

    if "custom" in args.etype:
        if args.enc_block_arch is None:
            # trailing spaces keep the concatenated message readable
            raise ValueError(
                "When specifying custom encoder type, --enc-block-arch "
                "should also be specified in training config. See "
                "egs/vivos/asr1/conf/transducer/train_*.yaml for more info."
            )

        self.subsample = get_subsample(args, mode="asr", arch="transformer")

        self.encoder = CustomEncoder(
            idim,
            args.enc_block_arch,
            input_layer=args.custom_enc_input_layer,
            repeat_block=args.enc_block_repeat,
            self_attn_type=args.custom_enc_self_attn_type,
            positional_encoding_type=args.custom_enc_positional_encoding_type,
            positionwise_activation_type=args.custom_enc_pw_activation_type,
            conv_mod_activation_type=args.custom_enc_conv_mod_activation_type,
            aux_task_layer_list=aux_task_layer_list,
        )
        encoder_out = self.encoder.enc_out

        self.most_dom_list = args.enc_block_arch[:]
    else:
        self.subsample = get_subsample(args, mode="asr", arch="rnn-t")

        self.enc = encoder_for(
            args,
            idim,
            self.subsample,
            aux_task_layer_list=aux_task_layer_list,
        )
        encoder_out = args.eprojs

    if "custom" in args.dtype:
        if args.dec_block_arch is None:
            # trailing spaces keep the concatenated message readable
            raise ValueError(
                "When specifying custom decoder type, --dec-block-arch "
                "should also be specified in training config. See "
                "egs/vivos/asr1/conf/transducer/train_*.yaml for more info."
            )

        self.decoder = CustomDecoder(
            odim,
            args.dec_block_arch,
            input_layer=args.custom_dec_input_layer,
            repeat_block=args.dec_block_repeat,
            positionwise_activation_type=args.custom_dec_pw_activation_type,
            dropout_rate_embed=args.dropout_rate_embed_decoder,
        )
        decoder_out = self.decoder.dunits

        # collect the block definitions of every custom component
        if "custom" in args.etype:
            self.most_dom_list += args.dec_block_arch[:]
        else:
            self.most_dom_list = args.dec_block_arch[:]
    else:
        self.dec = DecoderRNNT(
            odim,
            args.dtype,
            args.dlayers,
            args.dunits,
            blank_id,
            args.dec_embed_dim,
            args.dropout_rate_decoder,
            args.dropout_rate_embed_decoder,
        )
        decoder_out = args.dunits

    self.joint_network = JointNetwork(
        odim, encoder_out, decoder_out, args.joint_dim, args.joint_activation_type
    )

    if hasattr(self, "most_dom_list"):
        # NOTE(review): the (value, count) pairs are sorted by value, so this
        # picks the largest "d_hidden", not the most frequent one - confirm
        # that this matches the intent of the attribute name.
        self.most_dom_dim = sorted(
            Counter(
                d["d_hidden"] for d in self.most_dom_list if "d_hidden" in d
            ).most_common(),
            key=lambda x: x[0],
            reverse=True,
        )[0][0]

    self.etype = args.etype
    self.dtype = args.dtype

    self.sos = odim - 1
    self.eos = odim - 1
    self.blank_id = blank_id
    self.ignore_id = ignore_id

    self.space = args.sym_space
    self.blank = args.sym_blank

    self.odim = odim

    self.reporter = Reporter()

    self.error_calculator = None

    self.default_parameters(args)

    if training:
        self.criterion = TransLoss(args.trans_type, self.blank_id)

        # same membership test as the construction branch above, so the
        # attribute that was actually created is the one that is read
        decoder = self.decoder if "custom" in self.dtype else self.dec

        if args.report_cer or args.report_wer:
            self.error_calculator = ErrorCalculator(
                decoder,
                self.joint_network,
                args.char_list,
                args.sym_space,
                args.sym_blank,
                args.report_cer,
                args.report_wer,
            )

        if self.use_aux_task:
            self.auxiliary_task = AuxiliaryTask(
                decoder,
                self.joint_network,
                self.criterion,
                args.aux_task_type,
                args.aux_task_weight,
                encoder_out,
                args.joint_dim,
            )

        if self.use_aux_ctc:
            self.aux_ctc = ctc_for(
                Namespace(
                    num_encs=1,
                    eprojs=encoder_out,
                    dropout_rate=args.aux_ctc_dropout_rate,
                    ctc_type="warpctc",
                ),
                odim,
            )

        if self.use_aux_cross_entropy:
            self.aux_decoder_output = torch.nn.Linear(decoder_out, odim)

            self.aux_cross_entropy = LabelSmoothingLoss(
                odim, ignore_id, args.aux_cross_entropy_smoothing
            )

    self.loss = None
    self.rnnlm = None
def __init__(self, idim, odim, args):
    """Construct an E2E model that uses a chunk-configured ``Encoder``.

    Args:
        idim (int): input dimension; replaced by ``args.n_mels`` for the
            encoder when ``args.use_frontend`` is set.
        odim (int): output vocabulary size; ``odim - 1`` is used as sos/eos.
        args (Namespace): argument Namespace containing options.

    """
    torch.nn.Module.__init__(self)
    self.mtlalpha = args.mtlalpha
    assert 0.0 <= self.mtlalpha <= 1.0, "mtlalpha should be [0.0, 1.0]"
    self.etype = args.etype
    self.verbose = args.verbose
    self.char_list = args.char_list
    self.outdir = args.outdir
    self.space = args.sym_space
    # self.space = -1
    self.blank = args.sym_blank
    self.reporter = Reporter()

    # below means the last number becomes eos/sos ID
    # note that sos/eos IDs are identical
    self.sos = odim - 1
    self.eos = odim - 1

    # subsample info
    # +1 means input (+1) and layers outputs (args.elayer)
    n_slots = args.elayers + 1
    # builtin int instead of np.int: the alias was removed in NumPy 1.24
    subsample = np.ones(n_slots, dtype=int)
    if args.etype.endswith("p") and not args.etype.startswith("vgg"):
        # copy at most n_slots factors from the "_"-separated option string
        for j, factor in enumerate(args.subsample.split("_")[:n_slots]):
            subsample[j] = int(factor)
    else:
        logging.warning(
            'Subsampling is not performed for vgg*. It is performed in max pooling layers at CNN.'
        )
    logging.info('subsample: ' + ' '.join([str(x) for x in subsample]))
    # stored on the instance; not passed to the Encoder built below
    self.subsample = subsample

    # label smoothing info
    if args.lsm_type:
        logging.info("Use label smoothing with " + args.lsm_type)
        labeldist = label_smoothing_dist(
            odim, args.lsm_type, transcript=args.train_json
        )
    else:
        labeldist = None

    if args.use_frontend:
        # Relative importing because of using python3 syntax
        from espnet.nets.pytorch_backend.frontends.feature_transform \
            import feature_transform_for
        from espnet.nets.pytorch_backend.frontends.frontend \
            import frontend_for

        self.frontend = frontend_for(args, idim)
        self.feature_transform = feature_transform_for(args, (idim - 1) * 2)
        # the encoder then receives n_mels-dimensional input
        idim = args.n_mels
    else:
        self.frontend = None

    # encoder
    # self.enc = encoder_for(args, idim, self.subsample)
    self.encoder = Encoder(
        idim=idim,
        center_len=args.transformer_encoder_center_chunk_len,
        left_len=args.transformer_encoder_left_chunk_len,
        hop_len=args.transformer_encoder_hop_len,
        right_len=args.transformer_encoder_right_chunk_len,
        abs_pos=args.transformer_encoder_abs_embed,
        rel_pos=args.transformer_encoder_rel_embed,
        use_mem=args.transformer_encoder_use_memory,
        attention_dim=args.adim,
        attention_heads=args.aheads,
        linear_units=args.eunits,
        num_blocks=args.elayers,
        input_layer=args.transformer_input_layer,
        dropout_rate=args.dropout_rate,
        # the positional dropout reuses the general dropout rate
        positional_dropout_rate=args.dropout_rate,
        attention_dropout_rate=args.transformer_attn_dropout_rate,
    )
    # ctc
    self.ctc = ctc_for(args, odim)
    # attention
    self.att = att_for(args)
    # decoder
    self.dec = decoder_for(args, odim, self.sos, self.eos, self.att, labeldist)

    # weight initialization
    self.init_like_chainer()

    # options for beam search
    if args.report_cer or args.report_wer:
        recog_args = {
            'beam_size': args.beam_size,
            'penalty': args.penalty,
            'ctc_weight': args.ctc_weight,
            'maxlenratio': args.maxlenratio,
            'minlenratio': args.minlenratio,
            'lm_weight': args.lm_weight,
            'rnnlm': args.rnnlm,
            'nbest': args.nbest,
            'space': args.sym_space,
            'blank': args.sym_blank,
        }
        self.recog_args = argparse.Namespace(**recog_args)
        self.report_cer = args.report_cer
        self.report_wer = args.report_wer
    else:
        self.report_cer = False
        self.report_wer = False
    self.rnnlm = None

    self.logzero = -10000000000.0
    self.loss = None
    self.acc = None