def __init__(self, input_dim, output_dim, context=[0], affine_type="tdnn", **options):
    super(ReluBatchNormTdnnLayerR, self).__init__()

    affine_options = {
        "bias": True,
        "groups": 1,
        "norm_w": False,
        "norm_f": False
    }
    affine_options = utils.assign_params_dict(affine_options, options)

    # Note: relu-bn is registered (and applied) before the affine in this variant.
    self.add_relu_bn(input_dim, options=options)

    if affine_type == "tdnn":
        self.affine = TdnnAffine(input_dim, output_dim, context=context, **affine_options)
    else:
        self.affine = ChunkSeparationAffine(input_dim, output_dim, context=context, **affine_options)
def __init__(self, input_dim, output_dim, context=[0], affine_type="tdnn", **options):
    super(ReluBatchNormTdnnLayer, self).__init__()

    affine_options = {
        "bias": True,
        "groups": 1,
        "norm_w": False,
        "norm_f": False
    }
    affine_options = utils.assign_params_dict(affine_options, options)

    # Keep the registration order affine -> add_relu_bn so the structure is
    # printed correctly by print(model), e.g.:
    # (tdnn1): ReluBatchNormTdnnLayer(
    #   (affine): TdnnAffine()
    #   (activation): ReLU()
    #   (batchnorm): BatchNorm1d(512, eps=1e-05, momentum=0.5, affine=False, track_running_stats=True)
    # )
    if affine_type == "tdnn":
        self.affine = TdnnAffine(input_dim, output_dim, context=context, **affine_options)
    else:
        self.affine = ChunkSeparationAffine(input_dim, output_dim, context=context, **affine_options)

    self.add_relu_bn(output_dim, options=options)
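# Illustration (a hedged sketch, not part of the layer above): PyTorch prints
# submodules in attribute-assignment order, which is why the affine must be
# registered before the relu/bn pair to get the printout shown in the comment.
import torch

class _OrderDemo(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.affine = torch.nn.Linear(4, 4)       # registered first
        self.activation = torch.nn.ReLU()         # registered second
        self.batchnorm = torch.nn.BatchNorm1d(4)  # registered third

print(_OrderDemo())  # prints (affine) -> (activation) -> (batchnorm)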
def get_augmentation(aug=None, aug_params={}):
    default_aug_params = {
        "frequency": 0.2,
        "frame": 0.,
        "rows": 1,
        "cols": 0,
        "random_rows": False,
        "random_cols": False
    }
    aug_params = utils.assign_params_dict(default_aug_params, aug_params)

    if aug is None or aug == "" or aug is False:
        return None
    elif aug == "specaugment":
        return SpecAugment(frequency=aug_params["frequency"], frame=aug_params["frame"],
                           rows=aug_params["rows"], cols=aug_params["cols"],
                           random_rows=aug_params["random_rows"],
                           random_cols=aug_params["random_cols"])
    elif aug == "cutout":
        raise NotImplementedError
    else:
        raise TypeError("Do not support {} augmentation.".format(aug))
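# Hedged usage sketch: assuming SpecAugment is a callable nn.Module over
# [batch, feats-dim, frames] features (layout inferred from the TDNN layers in
# this file), a training script might use get_augmentation like this:
import torch

aug = get_augmentation("specaugment", {"frequency": 0.2, "frame": 0.1})
feats = torch.randn(8, 80, 200)  # fake batch: [batch, feats-dim, frames]
if aug is not None:  # aug=None / "" / False returns None, so guard the call.
    feats = aug(feats)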
def __init__(self, package, stop_early=False):
    default_elements = {
        "data": None,
        "model": None,
        "optimizer": None,
        "lr_scheduler": None
    }

    default_params = {
        "model_dir": "",
        "model_blueprint": "",
        "exist_model": "",
        "start_epoch": 0,
        "epochs": 10,
        "use_gpu": True,
        "gpu_id": "",
        "benchmark": True,
        "max_change": 10.0,
        "compute_accuracy": True,
        "compute_valid_accuracy": True,
        "compute_one_batch_valid": False,
        "suffix": "params",
        "nan_debug": False,
        "use_tensorboard": True,
        "mixed_prec": False
    }

    elements, params = package
    self.elements = utils.assign_params_dict(default_elements, elements)
    self.params = utils.assign_params_dict(default_params, params, support_unknow=True)

    assert self.elements["data"] is not None
    assert self.elements["model"] is not None
    assert self.elements["optimizer"] is not None
    assert self.params["model_dir"] != ""
    assert self.params["model_blueprint"] != ""

    self.elements["model_forward"] = self.elements["model"]
    self.params["start_epoch"] = max(0, self.params["start_epoch"])

    if self.params["mixed_prec"] is True:
        self.scaler = GradScaler()

    self.stop_early = stop_early  # To do.

    # (current_epoch, current_iter, num_batches_per_epoch)
    self.training_point = (self.params["start_epoch"], 0, self.elements["data"].num_batch_train)
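# A minimal stand-in sketching the merge semantics assumed for
# utils.assign_params_dict above: user-supplied keys override defaults, and
# unknown keys are rejected unless support_unknow=True. The real helper also
# handles force_check and nested dicts; this is illustrative only.
def _assign_params_dict_sketch(default_params, params, support_unknow=False):
    merged = dict(default_params)
    for key, value in params.items():
        if key not in default_params and not support_unknow:
            raise KeyError("Unknown param: {}".format(key))
        merged[key] = value
    return merged

assert _assign_params_dict_sketch({"epochs": 10}, {"epochs": 21})["epochs"] == 21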
def add_relu_bn(self, output_dim=None, options: dict = {}):
    default_params = {
        "bn-relu": False,
        "nonlinearity": 'relu',
        "nonlinearity_params": {"inplace": True, "negative_slope": 0.01},
        "bn": True,
        "bn_params": {"momentum": 0.1, "affine": True, "track_running_stats": True},
        "special_init": True,
        "mode": 'fan_out'
    }
    default_params = utils.assign_params_dict(default_params, options)

    # This 'if-else' keeps the correct order when printing the model.
    # torch.nn.Sequential is not used because I do not want an extra layer
    # wrapper: the structure should stay tdnn1.affine rather than
    # tdnn1.layers.affine or tdnn1.layers[0], etc.
    if not default_params["bn-relu"]:
        # ReLU-BN.
        # For speaker recognition, relu-bn seems better than bn-relu, and bn
        # without affine (scale and shift) is also better than bn with affine.
        self.after_forward = self._relu_bn_forward
        self.activation = Nonlinearity(default_params["nonlinearity"],
                                       **default_params["nonlinearity_params"])
        if default_params["bn"]:
            self.batchnorm = torch.nn.BatchNorm1d(output_dim, **default_params["bn_params"])
    else:
        # BN-ReLU.
        self.after_forward = self._bn_relu_forward
        if default_params["bn"]:
            self.batchnorm = torch.nn.BatchNorm1d(output_dim, **default_params["bn_params"])
        self.activation = Nonlinearity(default_params["nonlinearity"],
                                       **default_params["nonlinearity_params"])

    if default_params["special_init"] and self.affine is not None:
        if default_params["nonlinearity"] in ["relu", "leaky_relu", "tanh", "sigmoid"]:
            # Before special_init, another initialization has already been done
            # in TdnnAffine; it is equivalent to calling
            # torch.nn.init.normal_(self.affine.weight, 0., 0.01) here.
            torch.nn.init.kaiming_uniform_(self.affine.weight, a=0,
                                           mode=default_params["mode"],
                                           nonlinearity=default_params["nonlinearity"])
        else:
            torch.nn.init.xavier_normal_(self.affine.weight, gain=1.0)
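# A hedged sketch of the two forward helpers selected above. The real
# _relu_bn_forward/_bn_relu_forward live on the layer class and are not shown
# here; these stand-ins only show the intended op order, skipping whichever
# module was not built.
def _relu_bn_forward_sketch(layer, x):
    if layer.activation is not None:
        x = layer.activation(x)
    if layer.batchnorm is not None:
        x = layer.batchnorm(x)
    return x

def _bn_relu_forward_sketch(layer, x):
    if layer.batchnorm is not None:
        x = layer.batchnorm(x)
    if layer.activation is not None:
        x = layer.activation(x)
    return x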
def __init__(self, optimizer, params: dict = {}):
    # Suggested weight_decay: 1e-4 for l2 regularization (sgd, adam) and
    # 1e-1 for decoupled weight decay (sgdw, adamw, radam, ralamb, adamod, etc.).
    default_params = {
        "name": "warmR",
        "1cycle.learn_rate": 0.001,
        "warmR.T_max": 10,
        "warmR.T_mult": 1,
        "warmR.factor": 1.0,
        "warmR.eta_min": 4e-8,
        "warmR.log_decay": False,
        "warmR.lr_decay_step": 1,
        "reduceP.metric": 'valid_acc',
        "reduceP.check_interval": 0,
        "reduceP.factor": 0.1,
        "reduceP.patience": 10,
        "reduceP.threshold": 0.0001,
        "reduceP.cooldown": 0,
        "reduceP.min_lr": 0
    }

    used_params = utils.assign_params_dict(default_params, params,
                                           force_check=False, support_unknow=True)
    split_params = utils.split_params(used_params)

    if isinstance(optimizer, Lookahead):
        base_optimizer = optimizer.optimizer
    else:
        base_optimizer = optimizer

    self.name = split_params["public"]["name"]

    if self.name == "1cycle":
        # To do.
        self.lr_scheduler = optim.lr_scheduler.OneCycleLR(base_optimizer, **split_params["1cycle"])
    elif self.name == "warmR":
        T_max = split_params["warmR"].pop("T_max")
        self.lr_decay_step = split_params["warmR"].pop("lr_decay_step")
        self.lr_scheduler = CosineAnnealingWarmRestarts(base_optimizer, T_max, **split_params["warmR"])
    elif self.name == "reduceP":
        self.check_interval = split_params["reduceP"].pop("check_interval")
        self.metric = split_params["reduceP"].pop("metric")
        if self.metric == "valid_acc":
            mode = "max"
        elif self.metric == "valid_loss":
            mode = "min"
        else:
            raise ValueError("Do not support {} metric for ReduceLROnPlateau strategy.".format(self.metric))
        self.lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(base_optimizer, mode=mode,
                                                                 **split_params["reduceP"])
        self.init = False
        if utils.use_horovod():
            raise TypeError("Do not support ReduceLROnPlateau for multi-gpu of Horovod now.")
    else:
        raise ValueError("Do not support {0} lr_scheduler now.".format(self.name))
def init(self, inputs_dim, num_targets, aug_dropout=0.2, training=True,
         extracted_embedding="far", tdnn_layer_params={}):
    default_tdnn_layer_params = {
        "nonlinearity": 'relu',
        "bn-relu": False,
        "bn": True,
        "bn_params": {"momentum": 0.1, "affine": True, "track_running_stats": True}
    }
    tdnn_layer_params = utils.assign_params_dict(default_tdnn_layer_params, tdnn_layer_params)

    # Var.
    self.extracted_embedding = extracted_embedding

    # Nnet.
    self.aug_dropout = torch.nn.Dropout2d(p=aug_dropout) if aug_dropout > 0 else None

    self.tdnn1 = ReluBatchNormTdnnLayer(inputs_dim, 512, [-2, -1, 0, 1, 2], **tdnn_layer_params)
    self.tdnn2 = ReluBatchNormTdnnLayer(512, 512, [-2, 0, 2], **tdnn_layer_params)
    self.tdnn3 = ReluBatchNormTdnnLayer(512, 512, [-3, 0, 3], **tdnn_layer_params)
    self.tdnn4 = ReluBatchNormTdnnLayer(512, 512, **tdnn_layer_params)
    self.tdnn5 = ReluBatchNormTdnnLayer(512, 1500, **tdnn_layer_params)
    self.stats = StatisticsPooling(1500, stddev=True)
    self.tdnn6 = ReluBatchNormTdnnLayer(self.stats.get_output_dim(), 512, **tdnn_layer_params)
    self.tdnn7 = ReluBatchNormTdnnLayer(512, 512, **tdnn_layer_params)

    # The loss is not needed when extracting embeddings.
    if training:
        self.loss = SoftmaxLoss(512, num_targets)

    # An example of using transfer learning without initializing loss.affine parameters.
    self.transform_keys = ["tdnn1", "tdnn2", "tdnn3", "tdnn4", "tdnn5",
                           "stats", "tdnn6", "tdnn7"]
def get_dropout_from_wrapper(p=0., dropout_params={}):
    assert 0. <= p < 1.

    default_dropout_params = {
        "type": "default",  # default | random | alpha | context | noise
        "start_p": 0.,
        "dim": 2,
        "method": "normal",
        "continuous": False,
        "inplace": True,
        "frequency": 0.2,
        "frame": 0.2,
        "rows": 1,
        "cols": 1,
        "random_rows": False,
        "random_cols": False
    }
    dropout_params = utils.assign_params_dict(default_dropout_params, dropout_params)
    name = dropout_params["type"]

    if p == 0:
        return None

    if name == "default":
        return get_default_dropout(p=p, dim=dropout_params["dim"], inplace=dropout_params["inplace"])
    elif name == "random":
        return RandomDropout(p=p, start_p=dropout_params["start_p"], dim=dropout_params["dim"],
                             method=dropout_params["method"], inplace=dropout_params["inplace"])
    elif name == "alpha":
        return torch.nn.AlphaDropout(p=p, inplace=dropout_params["inplace"])
    elif name == "context":
        return ContextDropout(p=p)
    elif name == "noise":
        return NoiseDropout(p=p, dim=dropout_params["dim"], method=dropout_params["method"],
                            continuous=dropout_params["continuous"], inplace=dropout_params["inplace"])
    else:
        raise TypeError("Do not support {} dropout in current wrapper.".format(name))
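# Hedged usage sketch, assuming get_default_dropout maps dim=2 to an
# nn.Dropout2d-style module applied on [batch, channels, frames] tensors.
# p=0 returns None, so callers guard the call:
import torch

drop = get_dropout_from_wrapper(0.2, {"type": "default", "dim": 2})
x = torch.randn(8, 512, 100)  # fake hidden features: [batch, channels, frames]
if drop is not None:
    x = drop(x)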
def __init__(self, channels, context=[0], bias=False, scale=4, inplace=True,
             affine_type="tdnn-affine", bn_params={}):
    super().__init__()

    default_bn_params = {"momentum": 0.1, "affine": True, "track_running_stats": True}
    bn_params = utils.assign_params_dict(default_bn_params, bn_params)

    assert channels % scale == 0, "{} % {} != 0".format(channels, scale)

    self.scale = scale
    self.width = channels // scale
    # Res2Net-style: the first split is passed through unchanged, so only
    # scale-1 convs are needed (except for the degenerate scale == 1 case).
    self.nums = scale if scale == 1 else scale - 1

    self.convs = []
    self.bns = []
    for i in range(self.nums):
        self.convs.append(ReluBatchNormTdnnLayer(self.width, self.width, context, affine_type,
                                                 bias=bias, nonlinearity="", bn=False))
        self.bns.append(nn.BatchNorm1d(self.width, **bn_params))
    self.convs = nn.ModuleList(self.convs)
    self.bns = nn.ModuleList(self.bns)

    self.relu = nn.ReLU(inplace=inplace)
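# A hedged sketch of the Res2Net-style forward this __init__ prepares for (the
# actual forward is not shown here): split the channel dim into `scale` groups
# of `width`, run a hierarchical residual through the nums conv+bn pairs, and
# pass the last group through untouched when scale > 1.
import torch

def _res2_forward_sketch(block, x):
    spx = torch.split(x, block.width, dim=1)  # scale groups of width channels
    out = None
    for i in range(block.nums):
        sp = spx[i] if i == 0 else sp + spx[i]  # hierarchical residual
        sp = block.relu(block.bns[i](block.convs[i](sp)))
        out = sp if i == 0 else torch.cat((out, sp), dim=1)
    if block.scale != 1:
        out = torch.cat((out, spx[block.nums]), dim=1)  # pass-through group
    return out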
def __init__(self, optimizer, params: dict = {}):
    # Suggested weight_decay: 1e-4 for l2 regularization (sgd, adam) and
    # 1e-1 for decoupled weight decay (sgdw, adamw, radam, ralamb, adamod, etc.).
    default_params = {
        "name": "warmR",
        "1cycle.learn_rate": 0.001,
        "warmR.T_max": 10,
        "warmR.T_mult": 1,
        "warmR.factor": 1.0,
        "warmR.eta_min": 4e-8,
        "warmR.log_decay": False,
        "warmR.lr_decay_step": 1
    }

    used_params = utils.assign_params_dict(default_params, params,
                                           force_check=False, support_unknow=True)
    split_params = utils.split_params(used_params)

    if isinstance(optimizer, Lookahead):
        base_optimizer = optimizer.optimizer
    else:
        base_optimizer = optimizer

    self.name = split_params["public"]["name"]

    if self.name == "1cycle":
        # To do.
        self.lr_scheduler = optim.lr_scheduler.OneCycleLR(base_optimizer, **split_params["1cycle"])
    elif self.name == "warmR":
        T_max = split_params["warmR"].pop("T_max")
        self.lr_decay_step = split_params["warmR"].pop("lr_decay_step")
        self.lr_scheduler = CosineAnnealingWarmRestarts(base_optimizer, T_max, **split_params["warmR"])
    else:
        raise ValueError("Do not support {0} lr_scheduler now.".format(self.name))
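# Standalone illustration of the warm-restart schedule the "warmR" branch
# wraps, using torch's built-in CosineAnnealingWarmRestarts. The repo's
# CosineAnnealingWarmRestarts additionally supports factor/log_decay, which
# the built-in scheduler does not have.
import torch
from torch import optim as torch_optim

_m = torch.nn.Linear(4, 4)
_opt = torch_optim.SGD(_m.parameters(), lr=0.001)
_sched = torch_optim.lr_scheduler.CosineAnnealingWarmRestarts(_opt, T_0=10, T_mult=1, eta_min=4e-8)
for _epoch in range(30):
    _opt.step()
    _sched.step()  # lr anneals to eta_min and restarts every T_0 epochs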
def __init__(self, trainer):
    default_params = {
        "report_times_every_epoch": None,
        "report_interval_iters": 100,
        "record_file": "train.csv",
        "use_tensorboard": False
    }

    self.trainer = trainer
    default_params = utils.assign_params_dict(default_params, self.trainer.params)

    if default_params["report_times_every_epoch"] is not None:
        self.report_interval_iters = max(
            1, self.trainer.training_point[2] // default_params["report_times_every_epoch"])
    else:
        self.report_interval_iters = default_params["report_interval_iters"]

    if not self.trainer.params["debug"] and default_params["use_tensorboard"]:
        # from tensorboardX import SummaryWriter
        from torch.utils.tensorboard import SummaryWriter

        self.board_writer = SummaryWriter("{}/log/tensorboard".format(
            self.trainer.params["model_dir"]))
    else:
        self.board_writer = None

    self.epochs = self.trainer.params["epochs"]

    self.optimizer = self.trainer.elements["optimizer"]
    # For an optimizer wrapper such as Lookahead, unwrap to the base optimizer.
    # None is the default value of getattr here.
    if getattr(self.optimizer, "optimizer", None) is not None:
        self.optimizer = self.optimizer.optimizer

    self.device = "[{0}]".format(utils.get_device(self.trainer.elements["model"]))

    self.record_value = []
    self.start_write_log = False

    if not self.trainer.params["debug"] and default_params["record_file"] != "" \
            and default_params["record_file"] is not None:
        self.record_file = "{0}/log/{1}".format(self.trainer.params["model_dir"],
                                                default_params["record_file"])

        if self.trainer.params["start_epoch"] > 0:
            # The case of recovering training: append to train.csv.
            self.start_write_log = True
        elif os.path.exists(self.record_file):
            # Back up to avoid clearing the loss log when re-running the same launcher.
            bk_file = "{0}.backup.{1}".format(
                self.record_file,
                time.strftime('%Y-%m-%d_%H:%M:%S', time.localtime(time.time())))
            shutil.move(self.record_file, bk_file)
    else:
        self.record_file = None

    # A format to show progress.
    # Do not use progressbar.Bar(marker="\x1b[32m█\x1b[39m") or
    # progressbar.SimpleProgress(format='%(value_s)s/%(max_value_s)s')
    # to avoid an overly long string.
    widgets = [
        progressbar.Percentage(format='%(percentage)3.2f%%'), " | ",
        "Epoch:", progressbar.Variable('current_epoch', format='{formatted_value}', width=0, precision=0),
        "/{0}, ".format(self.epochs),
        "Iter:", progressbar.Variable('current_iter', format='{formatted_value}', width=0, precision=0),
        "/{0}".format(self.trainer.training_point[2]),
        " (", progressbar.Timer(format='ELA: %(elapsed)s'), ", ", progressbar.AdaptiveETA(), ")"
    ]

    # Total number of iters.
    max_value = self.trainer.params["epochs"] * self.trainer.training_point[2]
    self.bar = progressbar.ProgressBar(max_value=max_value, widgets=widgets, redirect_stdout=True)

    # Use a separate process for updates.
    self.queue = Queue()
    self.process = Process(target=self._update, daemon=True)
    self.process.start()
def init(self, inputs_dim, num_targets, num_phones, extend=False, skip_connection=False,
         mixup=False, mixup_alpha=1.0, specaugment=False, specaugment_params={},
         aug_dropout=0., context_dropout=0., hidden_dropout=0., dropout_params={},
         SE=False, se_ratio=4, tdnn_layer_params={}, tdnn6=True, tdnn7_params={},
         pooling="statistics", pooling_params={}, margin_loss=False, margin_loss_params={},
         use_step=False, step_params={}, transfer_from="softmax_loss",
         training=True, extracted_embedding="far", mt_alpha=0.1):

    ## Params.
    default_dropout_params = {
        "type": "default",  # default | random
        "start_p": 0.,
        "dim": 2,
        "method": "uniform",  # uniform | normal
        "continuous": False,
        "inplace": True
    }

    default_tdnn_layer_params = {
        "nonlinearity": 'relu', "nonlinearity_params": {"inplace": True},
        "bn-relu": False,
        "bn": True,
        "bn_params": {"momentum": 0.5, "affine": False, "track_running_stats": True}
    }

    default_pooling_params = {
        "num_nodes": 1500,
        "num_head": 1,
        "share": True,
        "affine_layers": 1,
        "hidden_size": 64,
        "context": [0],
        "stddev": True,
        "temperature": False,
        "fixed": True
    }

    default_margin_loss_params = {
        "method": "am", "m": 0.2, "feature_normalize": True, "s": 30,
        "double": False, "mhe_loss": False, "mhe_w": 0.01,
        "inter_loss": 0., "ring_loss": 0., "curricular": False
    }

    default_step_params = {
        "T": None,
        "m": False, "lambda_0": 0, "lambda_b": 1000, "alpha": 5, "gamma": 1e-4,
        "s": False, "s_tuple": (30, 12), "s_list": None,
        "t": False, "t_tuple": (0.5, 1.2),
        "p": False, "p_tuple": (0.5, 0.1)
    }

    dropout_params = utils.assign_params_dict(default_dropout_params, dropout_params)
    tdnn_layer_params = utils.assign_params_dict(default_tdnn_layer_params, tdnn_layer_params)
    # If a param is not specified, default it w.r.t. tdnn_layer_params.
    tdnn7_params = utils.assign_params_dict(tdnn_layer_params, tdnn7_params)
    pooling_params = utils.assign_params_dict(default_pooling_params, pooling_params)
    margin_loss_params = utils.assign_params_dict(default_margin_loss_params, margin_loss_params)
    step_params = utils.assign_params_dict(default_step_params, step_params)

    ## Var.
    self.skip_connection = skip_connection
    self.use_step = use_step
    self.step_params = step_params
    self.extracted_embedding = extracted_embedding  # For extracting.
    self.mt_alpha = mt_alpha

    ## Nnet.
    # Head
    self.mixup = Mixup(alpha=mixup_alpha) if mixup else None
    self.specaugment = SpecAugment(**specaugment_params) if specaugment else None
    self.aug_dropout = get_dropout_from_wrapper(aug_dropout, dropout_params)
    self.context_dropout = ContextDropout(p=context_dropout) if context_dropout > 0 else None
    self.hidden_dropout = get_dropout_from_wrapper(hidden_dropout, dropout_params)

    # Frame level
    self.tdnn1 = ReluBatchNormTdnnLayer(inputs_dim, 512, [-2, -1, 0, 1, 2], **tdnn_layer_params)
    self.se1 = SEBlock(512, ratio=se_ratio) if SE else None
    self.ex_tdnn1 = ReluBatchNormTdnnLayer(512, 512, **tdnn_layer_params) if extend else None
    self.tdnn2 = ReluBatchNormTdnnLayer(512, 512, [-2, 0, 2], **tdnn_layer_params)
    self.se2 = SEBlock(512, ratio=se_ratio) if SE else None
    self.ex_tdnn2 = ReluBatchNormTdnnLayer(512, 512, **tdnn_layer_params) if extend else None
    self.tdnn3 = ReluBatchNormTdnnLayer(512, 512, [-3, 0, 3], **tdnn_layer_params)
    self.se3 = SEBlock(512, ratio=se_ratio) if SE else None
    self.ex_tdnn3 = ReluBatchNormTdnnLayer(512, 512, **tdnn_layer_params) if extend else None
    self.ex_tdnn4 = ReluBatchNormTdnnLayer(512, 512, [-4, 0, 4], **tdnn_layer_params) if extend else None
    self.se4 = SEBlock(512, ratio=se_ratio) if SE and extend else None
    self.ex_tdnn5 = ReluBatchNormTdnnLayer(512, 512, **tdnn_layer_params) if extend else None
    self.tdnn4 = ReluBatchNormTdnnLayer(512, 512, **tdnn_layer_params)

    num_nodes = pooling_params.pop("num_nodes")
    self.tdnn5 = ReluBatchNormTdnnLayer(512, num_nodes, **tdnn_layer_params)

    # Zheng Li 2021-06-08: phonetic branch for multi-task learning.
    self.phonetic_tdnn5 = ReluBatchNormTdnnLayer(512, 512, **tdnn_layer_params)
    self.phonetic_tdnn6 = ReluBatchNormTdnnLayer(512, 512, **tdnn_layer_params)
    self.phonetic_tdnn7 = ReluBatchNormTdnnLayer(512, 512, **tdnn_layer_params)

    # Pooling
    stddev = pooling_params.pop("stddev")
    if pooling == "lde":
        self.stats = LDEPooling(num_nodes, c_num=pooling_params["num_head"])
    elif pooling == "attentive":
        self.stats = AttentiveStatisticsPooling(num_nodes,
                                                affine_layers=pooling_params["affine_layers"],
                                                hidden_size=pooling_params["hidden_size"],
                                                context=pooling_params["context"],
                                                stddev=stddev)
    elif pooling == "multi-head":
        self.stats = MultiHeadAttentionPooling(num_nodes, stddev=stddev, **pooling_params)
    elif pooling == "multi-resolution":
        self.stats = MultiResolutionMultiHeadAttentionPooling(num_nodes, **pooling_params)
    else:
        self.stats = StatisticsPooling(num_nodes, stddev=stddev)

    stats_dim = self.stats.get_output_dim()

    # Segment level
    if tdnn6:
        self.tdnn6 = ReluBatchNormTdnnLayer(stats_dim, 512, **tdnn_layer_params)
        tdnn7_dim = 512
    else:
        self.tdnn6 = None
        tdnn7_dim = stats_dim

    if tdnn7_params["nonlinearity"] == "default":
        tdnn7_params["nonlinearity"] = tdnn_layer_params["nonlinearity"]
    self.tdnn7 = ReluBatchNormTdnnLayer(tdnn7_dim, 512, **tdnn7_params)

    # Loss
    # The loss is not needed when extracting embeddings.
    if training:
        if margin_loss:
            self.loss = MarginSoftmaxLoss(512, num_targets, **margin_loss_params)
            # Zheng Li 2021-06-08
            self.loss_spk = MarginSoftmaxLoss(512, num_targets, **margin_loss_params)
            self.loss_phone = SoftmaxLoss_frame_phone_fix(512, num_phones)
        else:
            # Zheng Li 2021-06-08
            self.loss_spk = SoftmaxLoss(512, num_targets)
            self.loss_phone = SoftmaxLoss_frame_phone_fix(512, num_phones)

        self.wrapper_loss = MixupLoss(self.loss, self.mixup) if mixup else None

        # An example of using transfer learning without initializing loss.affine parameters.
        self.transform_keys = ["tdnn1", "tdnn2", "tdnn3", "tdnn4", "tdnn5", "stats", "tdnn6", "tdnn7",
                               "ex_tdnn1", "ex_tdnn2", "ex_tdnn3", "ex_tdnn4", "ex_tdnn5",
                               "se1", "se2", "se3", "se4", "loss"]

        if margin_loss and transfer_from == "softmax_loss":
            # For softmax_loss to am_softmax_loss.
            self.rename_transform_keys = {"loss.affine.weight": "loss.weight"}
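# A hedged guess at how mt_alpha is consumed in the (unshown) training step of
# this multi-task model: a weighted sum of the speaker and phonetic
# objectives. Treat this as an illustration, not the repo's actual formula.
def _multitask_loss_sketch(loss_spk_value, loss_phone_value, mt_alpha=0.1):
    # Speaker loss dominates; the phone branch contributes a small auxiliary term.
    return loss_spk_value + mt_alpha * loss_phone_value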
def init(self, inputs_dim, num_targets, aug_dropout=0., tail_dropout=0., training=True,
         extracted_embedding="near", resnet_params={}, pooling="statistics", pooling_params={},
         fc1=False, fc1_params={}, fc2_params={}, margin_loss=False, margin_loss_params={},
         use_step=False, step_params={}, transfer_from="softmax_loss"):

    ## Params.
    default_resnet_params = {
        "head_conv": True, "head_conv_params": {"kernel_size": 3, "stride": 1, "padding": 1},
        "head_maxpool": False, "head_maxpool_params": {"kernel_size": 3, "stride": 1, "padding": 1},
        "block": "BasicBlock",
        "layers": [3, 4, 6, 3],
        "planes": [32, 64, 128, 256],  # a.k.a. channels.
        "convXd": 2,
        "norm_layer_params": {"momentum": 0.5, "affine": True},
        "full_pre_activation": True,
        "zero_init_residual": False
    }

    default_pooling_params = {
        "num_head": 1,
        "hidden_size": 64,
        "share": True,
        "affine_layers": 1,
        "context": [0],
        "stddev": True,
        "temperature": False,
        "fixed": True
    }

    default_fc_params = {
        "nonlinearity": 'relu', "nonlinearity_params": {"inplace": True},
        "bn-relu": False,
        "bn": True,
        "bn_params": {"momentum": 0.5, "affine": True, "track_running_stats": True}
    }

    default_margin_loss_params = {
        "method": "am", "m": 0.2, "feature_normalize": True, "s": 30,
        "mhe_loss": False, "mhe_w": 0.01
    }

    default_step_params = {
        "T": None,
        "m": False, "lambda_0": 0, "lambda_b": 1000, "alpha": 5, "gamma": 1e-4,
        "s": False, "s_tuple": (30, 12), "s_list": None,
        "t": False, "t_tuple": (0.5, 1.2),
        "p": False, "p_tuple": (0.5, 0.1)
    }

    resnet_params = utils.assign_params_dict(default_resnet_params, resnet_params)
    pooling_params = utils.assign_params_dict(default_pooling_params, pooling_params)
    fc1_params = utils.assign_params_dict(default_fc_params, fc1_params)
    fc2_params = utils.assign_params_dict(default_fc_params, fc2_params)
    margin_loss_params = utils.assign_params_dict(default_margin_loss_params, margin_loss_params)
    step_params = utils.assign_params_dict(default_step_params, step_params)

    ## Var.
    self.extracted_embedding = extracted_embedding  # Only "near" here.
    self.use_step = use_step
    self.step_params = step_params
    self.convXd = resnet_params["convXd"]

    ## Nnet.
    self.aug_dropout = torch.nn.Dropout2d(p=aug_dropout) if aug_dropout > 0 else None

    # [batch, 1, feats-dim, frames] for 2d and [batch, feats-dim, frames] for 1d.
    # The channel/plane axis should always be dim 1 of the tensor (0-indexed).
    inplanes = 1 if self.convXd == 2 else inputs_dim
    self.resnet = ResNet(inplanes, **resnet_params)

    # This is just ceiling division of inputs_dim by the downsample multiple.
    resnet_output_dim = (inputs_dim + self.resnet.get_downsample_multiple() - 1) \
        // self.resnet.get_downsample_multiple() * self.resnet.get_output_planes() \
        if self.convXd == 2 else self.resnet.get_output_planes()

    # Pooling
    stddev = pooling_params.pop("stddev")
    if pooling == "lde":
        self.stats = LDEPooling(resnet_output_dim, c_num=pooling_params["num_head"])
    elif pooling == "attentive":
        self.stats = AttentiveStatisticsPooling(resnet_output_dim,
                                                hidden_size=pooling_params["hidden_size"],
                                                context=pooling_params["context"],
                                                stddev=stddev)
    elif pooling == "multi-head":
        self.stats = MultiHeadAttentionPooling(resnet_output_dim, stddev=stddev, **pooling_params)
    elif pooling == "multi-resolution":
        self.stats = MultiResolutionMultiHeadAttentionPooling(resnet_output_dim, **pooling_params)
    else:
        self.stats = StatisticsPooling(resnet_output_dim, stddev=stddev)

    self.fc1 = ReluBatchNormTdnnLayer(self.stats.get_output_dim(), resnet_params["planes"][3],
                                      **fc1_params) if fc1 else None

    if fc1:
        fc2_in_dim = resnet_params["planes"][3]
    else:
        fc2_in_dim = self.stats.get_output_dim()

    self.fc2 = ReluBatchNormTdnnLayer(fc2_in_dim, resnet_params["planes"][3], **fc2_params)

    self.tail_dropout = torch.nn.Dropout2d(p=tail_dropout) if tail_dropout > 0 else None

    ## The loss is not needed when extracting embeddings.
    if training:
        if margin_loss:
            self.loss = MarginSoftmaxLoss(resnet_params["planes"][3], num_targets, **margin_loss_params)
        else:
            self.loss = SoftmaxLoss(resnet_params["planes"][3], num_targets)

        # An example of using transfer learning without initializing loss.affine parameters.
        self.transform_keys = ["resnet", "stats", "fc1", "fc2"]

        if margin_loss and transfer_from == "softmax_loss":
            # For softmax_loss to am_softmax_loss.
            self.rename_transform_keys = {"loss.affine.weight": "loss.weight"}
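# Worked example of the ceiling division above: with inputs_dim=80, a total
# downsample multiple of 8 and 256 output planes (the default planes[3]), a
# 2D resnet gives ceil(80 / 8) * 256 = 10 * 256 = 2560 pooling features.
_inputs_dim, _multiple, _planes = 80, 8, 256
assert (_inputs_dim + _multiple - 1) // _multiple * _planes == 2560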
def get_optimizer(model, params: dict = {}):
    # Suggested weight_decay: 1e-4 for l2 regularization (sgd, adam) and
    # 1e-1 for decoupled weight decay (sgdw, adamw, radam, ralamb, adamod, etc.).
    default_params = {
        "name": "adamW",
        "learn_rate": 0.001,
        "beta1": 0.9,
        "beta2": 0.999,
        "beta3": 0.999,
        "weight_decay": 1e-4,
        "lookahead.k": 5,
        "lookahead.alpha": 0.,
        "gc": False
    }

    used_params = utils.assign_params_dict(default_params, params)

    # Base params
    name = used_params["name"]
    learn_rate = used_params["learn_rate"]
    beta1 = used_params["beta1"]
    beta2 = used_params["beta2"]
    beta3 = used_params["beta3"]
    weight_decay = used_params["weight_decay"]
    gc = used_params["gc"]

    extra_params = {}

    # Gradient centralization:
    # Yong, H., Huang, J., Hua, X., & Zhang, L. (2020). Gradient Centralization:
    # A New Optimization Technique for Deep Neural Networks. arXiv e-prints, arXiv:2004.01461.
    # Retrieved from https://ui.adsabs.harvard.edu/abs/2020arXiv200401461Y
    # Github: https://github.com/Yonghongwei/Gradient-Centralization
    if gc:
        # This list is specified by the developer.
        default_support_gc_list = ["adamW", "ralamb"]
        if name not in default_support_gc_list:
            raise TypeError("Optimizer {} does not support gradient centralization (GC) now.".format(name))
        extra_params["gc"] = True

    # Select optimizer
    if name == "sgd":
        base_optimizer = optim.SGD(model.parameters(), lr=learn_rate, momentum=beta1,
                                   weight_decay=weight_decay)
    elif name == "sgdW":
        base_optimizer = SGDW(model.parameters(), lr=learn_rate, momentum=beta1,
                              weight_decay=weight_decay)
    elif name == "adam":
        base_optimizer = optim.Adam(model.parameters(), lr=learn_rate, betas=(beta1, beta2),
                                    weight_decay=weight_decay)
    elif name == "adamW":
        base_optimizer = AdamW(model.parameters(), lr=learn_rate, betas=(beta1, beta2),
                               weight_decay=weight_decay, **extra_params)
    elif name == "radam":
        base_optimizer = RAdam(model.parameters(), lr=learn_rate, betas=(beta1, beta2),
                               weight_decay=weight_decay)
    elif name == "ralamb":
        base_optimizer = Ralamb(model.parameters(), lr=learn_rate, betas=(beta1, beta2),
                                weight_decay=weight_decay, **extra_params)
    elif name == "adamod":
        base_optimizer = AdaMod(model.parameters(), lr=learn_rate, betas=(beta1, beta2),
                                beta3=beta3, weight_decay=weight_decay)
    elif name == "novograd":
        base_optimizer = Novograd(model.parameters(), lr=learn_rate, betas=(beta1, beta2),
                                  weight_decay=weight_decay)
    else:
        raise ValueError("Do not support {0} optimizer now.".format(name))

    # Use alpha to decide whether to wrap the base optimizer with Lookahead.
    if used_params["lookahead.alpha"] > 0:
        logger.info("Use lookahead optimizer with alpha={} and k={}".format(
            used_params["lookahead.alpha"], used_params["lookahead.k"]))
        optimizer = Lookahead(base_optimizer, k=used_params["lookahead.k"],
                              alpha=used_params["lookahead.alpha"])
    else:
        optimizer = base_optimizer

    return optimizer
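# Hedged usage sketch, assuming this function is importable from the repo:
# AdamW with decoupled weight decay, wrapped by Lookahead because
# lookahead.alpha > 0.
import torch

_model = torch.nn.Linear(4, 4)  # any nn.Module
_optimizer = get_optimizer(_model, {"name": "adamW", "learn_rate": 1e-3,
                                    "weight_decay": 1e-1,
                                    "lookahead.k": 5, "lookahead.alpha": 0.5})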
def __init__(self, optimizer, params: dict = {}):
    # Suggested weight_decay: 1e-4 for l2 regularization (sgd, adam) and
    # 1e-1 for decoupled weight decay (sgdw, adamw, radam, ralamb, adamod, etc.).
    default_params = {
        "name": "warmR",

        "cyclic.max_lr": 1e-3,
        "cyclic.base_lr": 1e-8,
        "cyclic.step_size_up": 2e4,
        "cyclic.step_size_down": None,
        "cyclic.mode": 'triangular2',
        "cyclic.gamma": 1.0,
        "cyclic.scale_fn": None,
        "cyclic.scale_mode": 'cycle',
        "cyclic.cycle_momentum": False,
        "cyclic.base_momentum": 0.8,
        "cyclic.max_momentum": 0.9,

        "1cycle.learn_rate": 0.001,
        "1cycle.total_steps": None,
        "1cycle.epochs": None,
        "1cycle.steps_per_epoch": None,
        "1cycle.pct_start": 0.3,
        "1cycle.anneal_strategy": 'linear',
        "1cycle.cycle_momentum": False,
        "1cycle.base_momentum": 0.85,
        "1cycle.max_momentum": 0.95,
        "1cycle.div_factor": 25.0,
        "1cycle.final_div_factor": 10000.0,

        "warmR.T_max": 10,
        "warmR.T_mult": 1,
        "warmR.factor": 1.0,
        "warmR.eta_min": 4e-8,
        "warmR.log_decay": False,
        "warmR.lr_decay_step": 1,

        "reduceP.metric": 'valid_acc',
        "reduceP.check_interval": 0,
        "reduceP.factor": 0.5,
        "reduceP.patience": 10,
        "reduceP.threshold": 0.0001,
        "reduceP.cooldown": 0,
        "reduceP.min_lr": 0.
    }

    used_params = utils.assign_params_dict(default_params, params,
                                           force_check=False, support_unknow=True)
    split_params = utils.split_params(used_params)

    if isinstance(optimizer, Lookahead):
        base_optimizer = optimizer.optimizer
    else:
        base_optimizer = optimizer

    self.name = split_params["public"]["name"]

    if self.name == "cyclic":
        base_lr = split_params["cyclic"].pop("base_lr")
        max_lr = split_params["cyclic"].pop("max_lr")
        self.lr_scheduler = torch.optim.lr_scheduler.CyclicLR(base_optimizer, base_lr, max_lr,
                                                              **split_params["cyclic"])
    elif self.name == "1cycle":
        max_lr = split_params["1cycle"].pop("learn_rate")
        self.lr_scheduler = optim.lr_scheduler.OneCycleLR(base_optimizer, max_lr,
                                                          **split_params["1cycle"])
    elif self.name == "warmR":
        T_max = split_params["warmR"].pop("T_max")
        self.lr_decay_step = split_params["warmR"].pop("lr_decay_step")
        self.lr_scheduler = CosineAnnealingWarmRestarts(base_optimizer, T_max, **split_params["warmR"])
    elif self.name == "reduceP":
        self.check_interval = split_params["reduceP"].pop("check_interval")
        self.metric = split_params["reduceP"].pop("metric")
        self.min_lr = split_params["reduceP"]["min_lr"]
        if self.metric == "valid_acc":
            mode = "max"
        elif self.metric == "valid_loss":
            mode = "min"
        else:
            raise ValueError("Do not support {} metric for ReduceLROnPlateau strategy.".format(self.metric))
        self.lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(base_optimizer, mode=mode,
                                                                 **split_params["reduceP"])
        self.init = False
        if utils.use_horovod():
            raise TypeError("Do not support ReduceLROnPlateau for multi-gpu of Horovod now.")
    else:
        raise ValueError("Do not support {0} lr_scheduler now.".format(self.name))
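# A minimal stand-in sketching the prefix splitting assumed for
# utils.split_params above: "warmR.T_max" lands in split["warmR"]["T_max"] and
# unprefixed keys such as "name" land in split["public"]. Illustrative only.
def _split_params_sketch(params):
    split = {"public": {}}
    for key, value in params.items():
        if "." in key:
            prefix, subkey = key.split(".", 1)
            split.setdefault(prefix, {})[subkey] = value
        else:
            split["public"][key] = value
    return split

assert _split_params_sketch({"name": "warmR", "warmR.T_max": 10}) == \
    {"public": {"name": "warmR"}, "warmR": {"T_max": 10}}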
def init(self, inputs_dim, num_targets, mixup=False, mixup_alpha=1.0,
         specaugment=False, specaugment_params={},
         aug_dropout=0., context_dropout=0., hidden_dropout=0., dropout_params={},
         xvector_params={}, pooling="statistics", pooling_params={}, fc_params={},
         margin_loss=False, margin_loss_params={}, use_step=False, step_params={},
         transfer_from="softmax_loss", training=True):

    ## Params.
    default_dropout_params = {
        "type": "default",  # default | random
        "start_p": 0.,
        "dim": 2,
        "method": "uniform",  # uniform | normal
        "continuous": False,
        "inplace": True
    }

    default_xvector_params = {
        "init_dim": 128,
        "layers": [6, 12],
        "growth_rate": 64,
        "bn_scale": 2,
        "nonlinearity": "relu",
        "memory_efficient": True
    }

    default_pooling_params = {
        "num_head": 1,
        "hidden_size": 64,
        "share": True,
        "affine_layers": 1,
        "context": [0],
        "stddev": True,
        "temperature": False,
        "fixed": True
    }

    default_fc_params = {
        "nonlinearity": 'relu', "nonlinearity_params": {"inplace": True},
        "bn-relu": False,
        "bn": True,
        "bn_params": {"momentum": 0.5, "affine": True, "track_running_stats": True}
    }

    default_margin_loss_params = {
        "method": "am", "m": 0.2, "feature_normalize": True, "s": 30,
        "double": False, "mhe_loss": False, "mhe_w": 0.01,
        "inter_loss": 0., "ring_loss": 0., "curricular": False
    }

    default_step_params = {
        "T": None,
        "m": False, "lambda_0": 0, "lambda_b": 1000, "alpha": 5, "gamma": 1e-4,
        "s": False, "s_tuple": (30, 12), "s_list": None,
        "t": False, "t_tuple": (0.5, 1.2),
        "p": False, "p_tuple": (0.5, 0.1)
    }

    dropout_params = utils.assign_params_dict(default_dropout_params, dropout_params)
    xvector_params = utils.assign_params_dict(default_xvector_params, xvector_params)
    pooling_params = utils.assign_params_dict(default_pooling_params, pooling_params)
    fc_params = utils.assign_params_dict(default_fc_params, fc_params)
    margin_loss_params = utils.assign_params_dict(default_margin_loss_params, margin_loss_params)
    step_params = utils.assign_params_dict(default_step_params, step_params)

    ## Var.
    self.use_step = use_step
    self.step_params = step_params

    ## Nnet.
    # Head
    self.mixup = Mixup(alpha=mixup_alpha) if mixup else None
    self.specaugment = SpecAugment(**specaugment_params) if specaugment else None
    self.aug_dropout = get_dropout_from_wrapper(aug_dropout, dropout_params)
    self.context_dropout = ContextDropout(p=context_dropout) if context_dropout > 0 else None
    self.hidden_dropout = get_dropout_from_wrapper(hidden_dropout, dropout_params)

    # Frame level
    in_dim = xvector_params["init_dim"]
    layers = xvector_params["layers"]
    out_dim = xvector_params["growth_rate"]
    bn_dim = out_dim * xvector_params["bn_scale"]
    nonlinearity = xvector_params["nonlinearity"]
    memory_efficient = xvector_params["memory_efficient"]

    options = {"bias": False, "bn-relu": True}

    self.tdnn = ReluBatchNormTdnnLayer(inputs_dim, in_dim, [-2, -1, 0, 1, 2],
                                       nonlinearity=nonlinearity, **options)

    self.dense_block1 = DTdnnBlock(layers[0], in_dim, out_dim, bn_dim, [-1, 0, 1],
                                   memory_efficient, nonlinearity=nonlinearity, **options)
    in_dim += layers[0] * out_dim
    self.transit1 = ReluBatchNormTdnnLayerR(in_dim, in_dim // 2, nonlinearity=nonlinearity, **options)
    in_dim //= 2

    self.dense_block2 = DTdnnBlock(layers[1], in_dim, out_dim, bn_dim, [-3, 0, 3],
                                   memory_efficient, nonlinearity=nonlinearity, **options)
    in_dim += layers[1] * out_dim
    self.transit2 = ReluBatchNormTdnnLayerR(in_dim, in_dim // 2, nonlinearity=nonlinearity, **options)
    in_dim //= 2

    # Pooling
    stddev = pooling_params.pop("stddev")
    if pooling == "lde":
        self.stats = LDEPooling(in_dim, c_num=pooling_params["num_head"])
    elif pooling == "attentive":
        self.stats = AttentiveStatisticsPooling(in_dim,
                                                affine_layers=pooling_params["affine_layers"],
                                                hidden_size=pooling_params["hidden_size"],
                                                context=pooling_params["context"],
                                                stddev=stddev)
    elif pooling == "multi-head":
        self.stats = MultiHeadAttentionPooling(in_dim, stddev=stddev, **pooling_params)
    elif pooling == "multi-resolution":
        self.stats = MultiResolutionMultiHeadAttentionPooling(in_dim, **pooling_params)
    else:
        self.stats = StatisticsPooling(in_dim, stddev=stddev)

    # Segment level
    self.fc = ReluBatchNormTdnnLayer(self.stats.get_output_dim(), 512, **fc_params)

    # Loss
    # The loss is not needed when extracting embeddings.
    if training:
        if margin_loss:
            self.loss = MarginSoftmaxLoss(512, num_targets, **margin_loss_params)
        else:
            self.loss = SoftmaxLoss(512, num_targets)

        self.wrapper_loss = MixupLoss(self.loss, self.mixup) if mixup else None

        # An example of using transfer learning without initializing loss.affine parameters.
        # Note: the keys must match the module attributes above
        # (dense_block1/dense_block2, not block1/block2).
        self.transform_keys = ["tdnn", "dense_block1", "transit1", "dense_block2", "transit2",
                               "stats", "fc", "loss"]

        if margin_loss and transfer_from == "softmax_loss":
            # For softmax_loss to am_softmax_loss.
            self.rename_transform_keys = {"loss.affine.weight": "loss.weight"}
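# Worked example of the dense-growth bookkeeping above with the defaults
# (init_dim=128, layers=[6, 12], growth_rate=64): each block adds
# layers[i] * growth_rate channels and each transit layer halves the width.
_in_dim = 128
_in_dim += 6 * 64   # dense_block1 -> 512
_in_dim //= 2       # transit1     -> 256
_in_dim += 12 * 64  # dense_block2 -> 1024
_in_dim //= 2       # transit2     -> 512
assert _in_dim == 512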
def init(self, inputs_dim, num_targets, channels=512, embd_dim=192,
         aug_dropout=0., tail_dropout=0., training=True, extracted_embedding="near",
         mixup=False, mixup_alpha=1.0, pooling="ecpa-attentive", pooling_params={},
         fc1=False, fc1_params={}, fc2_params={},
         margin_loss=True, margin_loss_params={},
         use_step=False, step_params={}, transfer_from="softmax_loss"):

    default_pooling_params = {
        "num_head": 1,
        "hidden_size": 64,
        "share": True,
        "affine_layers": 1,
        "context": [0],
        "stddev": True,
        "temperature": False,
        "fixed": True
    }

    default_fc_params = {
        "nonlinearity": 'relu', "nonlinearity_params": {"inplace": True},
        "bn-relu": False,
        "bn": True,
        "bn_params": {"momentum": 0.5, "affine": True, "track_running_stats": True}
    }

    default_margin_loss_params = {
        "method": "am", "m": 0.2, "feature_normalize": True, "s": 30,
        "double": False, "mhe_loss": False, "mhe_w": 0.01,
        "inter_loss": 0., "ring_loss": 0., "curricular": False
    }

    default_step_params = {
        "T": None,
        "m": False, "lambda_0": 0, "lambda_b": 1000, "alpha": 5, "gamma": 1e-4,
        "s": False, "s_tuple": (30, 12), "s_list": None,
        "t": False, "t_tuple": (0.5, 1.2),
        "p": False, "p_tuple": (0.5, 0.1)
    }

    self.use_step = use_step
    self.step_params = step_params
    self.extracted_embedding = extracted_embedding

    pooling_params = utils.assign_params_dict(default_pooling_params, pooling_params)
    fc1_params = utils.assign_params_dict(default_fc_params, fc1_params)
    fc2_params = utils.assign_params_dict(default_fc_params, fc2_params)
    margin_loss_params = utils.assign_params_dict(default_margin_loss_params, margin_loss_params)
    step_params = utils.assign_params_dict(default_step_params, step_params)

    self.mixup = Mixup(alpha=mixup_alpha) if mixup else None

    self.layer1 = Conv1dReluBn(inputs_dim, channels, kernel_size=5, padding=2)
    self.layer2 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=2, dilation=2, scale=8)
    self.layer3 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=3, dilation=3, scale=8)
    self.layer4 = SE_Res2Block(channels, kernel_size=3, stride=1, padding=4, dilation=4, scale=8)

    cat_channels = channels * 3
    self.conv = nn.Conv1d(cat_channels, cat_channels, kernel_size=1)
    self.bn_conv = nn.BatchNorm1d(cat_channels)

    # Pooling
    stddev = pooling_params.pop("stddev")
    if pooling == "attentive":
        self.stats = AttentiveStatisticsPooling(cat_channels,
                                                hidden_size=pooling_params["hidden_size"],
                                                context=pooling_params["context"],
                                                stddev=stddev)
        self.bn_stats = nn.BatchNorm1d(cat_channels * 2)
        self.fc1 = ReluBatchNormTdnnLayer(cat_channels * 2, embd_dim, **fc1_params) if fc1 else None
    elif pooling == "ecpa-attentive":
        self.stats = AttentiveStatsPool(cat_channels, 128)
        self.bn_stats = nn.BatchNorm1d(cat_channels * 2)
        self.fc1 = ReluBatchNormTdnnLayer(cat_channels * 2, embd_dim, **fc1_params) if fc1 else None
    elif pooling == "multi-head":
        self.stats = MultiHeadAttentionPooling(cat_channels, stddev=stddev, **pooling_params)
        self.bn_stats = nn.BatchNorm1d(cat_channels * 2)
        self.fc1 = ReluBatchNormTdnnLayer(cat_channels * 2, embd_dim, **fc1_params) if fc1 else None
    elif pooling == "global-multi":
        self.stats = GlobalMultiHeadAttentionPooling(cat_channels, stddev=stddev, **pooling_params)
        self.bn_stats = nn.BatchNorm1d(cat_channels * 2 * pooling_params["num_head"])
        self.fc1 = ReluBatchNormTdnnLayer(cat_channels * 2 * pooling_params["num_head"],
                                          embd_dim, **fc1_params) if fc1 else None
    elif pooling == "multi-resolution":
        self.stats = MultiResolutionMultiHeadAttentionPooling(cat_channels, **pooling_params)
        self.bn_stats = nn.BatchNorm1d(cat_channels * 2 * pooling_params["num_head"])
        self.fc1 = ReluBatchNormTdnnLayer(cat_channels * 2 * pooling_params["num_head"],
                                          embd_dim, **fc1_params) if fc1 else None
    else:
        self.stats = StatisticsPooling(cat_channels, stddev=stddev)
        self.bn_stats = nn.BatchNorm1d(cat_channels * 2)
        self.fc1 = ReluBatchNormTdnnLayer(cat_channels * 2, embd_dim, **fc1_params) if fc1 else None

    if fc1:
        fc2_in_dim = embd_dim
    else:
        fc2_in_dim = cat_channels * 2

    self.fc2 = ReluBatchNormTdnnLayer(fc2_in_dim, embd_dim, **fc2_params)
    self.tail_dropout = torch.nn.Dropout2d(p=tail_dropout) if tail_dropout > 0 else None

    # Loss
    # The loss is not needed when extracting embeddings.
    if training:
        if margin_loss:
            self.loss = MarginSoftmaxLoss(embd_dim, num_targets, **margin_loss_params)
        else:
            self.loss = SoftmaxLoss(embd_dim, num_targets)
            # self.loss = AngleLoss(embd_dim, num_targets)

        self.wrapper_loss = MixupLoss(self.loss, self.mixup) if mixup else None

        # An example of using transfer learning without initializing loss.affine parameters.
        self.transform_keys = ["layer2", "layer3", "layer4", "conv", "stats", "fc1", "fc2"]

        if margin_loss and transfer_from == "softmax_loss":
            # For softmax_loss to am_softmax_loss.
            self.rename_transform_keys = {"loss.affine.weight": "loss.weight"}
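# A hedged sketch of the multi-layer feature aggregation this init prepares
# for (the actual forward is not shown here): the outputs of layer2/3/4 are
# concatenated on the channel dim, giving cat_channels = channels * 3 before
# the 1x1 conv and pooling.
import torch

def _mfa_concat_sketch(x1, x2, x3):
    # x1/x2/x3: [batch, channels, frames] outputs of layer2, layer3, layer4.
    return torch.cat([x1, x2, x3], dim=1)  # [batch, channels * 3, frames]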
def init(self, inputs_dim, num_targets, channels=512, emb_dim=192,
         tdnn_layer_params={}, layer5_params={}, layer6=False, layer7_params={},
         margin_loss=False, margin_loss_params={}, pooling="statistics",
         use_step=False, step_params={}, training=True, extracted_embedding="near"):

    default_tdnn_layer_params = {
        "affine_type": 'tdnn-affine',
        "nonlinearity": 'relu', "nonlinearity_params": {"inplace": True},
        "bn-relu": False,
        "bn": True,
        "bn_params": {"momentum": 0.5, "affine": False, "track_running_stats": True}
    }

    default_layer5_params = {"nonlinearity": 'relu', "bn": False}
    default_layer7_params = {"nonlinearity": '', "bn": True}

    default_margin_loss_params = {
        "method": "am", "m": 0.2, "feature_normalize": True, "s": 30,
        "double": False, "mhe_loss": False, "mhe_w": 0.01,
        "inter_loss": 0., "ring_loss": 0., "curricular": False
    }

    default_step_params = {
        "T": None,
        "m": False, "lambda_0": 0, "lambda_b": 1000, "alpha": 5, "gamma": 1e-4,
        "s": False, "s_tuple": (30, 12), "s_list": None,
        "t": False, "t_tuple": (0.5, 1.2),
        "p": False, "p_tuple": (0.5, 0.1)
    }

    tdnn_layer_params = utils.assign_params_dict(default_tdnn_layer_params, tdnn_layer_params)
    # Fill layer5/layer7 params first with their own defaults, then with the
    # shared tdnn_layer_params defaults.
    layer5_params = utils.assign_params_dict(default_layer5_params, layer5_params)
    layer5_params = utils.assign_params_dict(default_tdnn_layer_params, layer5_params)
    layer7_params = utils.assign_params_dict(default_layer7_params, layer7_params)
    layer7_params = utils.assign_params_dict(default_tdnn_layer_params, layer7_params)
    margin_loss_params = utils.assign_params_dict(default_margin_loss_params, margin_loss_params)
    step_params = utils.assign_params_dict(default_step_params, step_params)

    self.use_step = use_step
    self.step_params = step_params
    self.extracted_embedding = extracted_embedding  # For extracting.

    self.layer1 = ReluBatchNormTdnnLayer(inputs_dim, channels, [-2, -1, 0, 1, 2], **tdnn_layer_params)
    # SE_Res2Block args here: channels, context, scale, tdnn_layer_params.
    self.layer2 = SE_Res2Block(channels, [-2, 0, 2], 8, tdnn_layer_params)
    self.layer3 = SE_Res2Block(channels, [-3, 0, 3], 8, tdnn_layer_params)
    self.layer4 = SE_Res2Block(channels, [-4, 0, 4], 8, tdnn_layer_params)

    cat_channels = channels * 3
    self.layer5 = ReluBatchNormTdnnLayer(cat_channels, cat_channels, **layer5_params)

    if pooling == "attention":
        self.pooling = AttentiveStatsPool(cat_channels, 128, tdnn_layer_params["affine_type"])
    else:
        self.pooling = StatisticsPooling(cat_channels, stddev=True)
    # self.bn1 = nn.BatchNorm1d(cat_channels * 2, **tdnn_layer_params["bn_params"])

    # Segment level
    if layer6:
        self.layer6 = ReluBatchNormTdnnLayer(cat_channels * 2, 512, **tdnn_layer_params)
        layer7_dim = 512
    else:
        self.layer6 = None
        layer7_dim = cat_channels * 2

    self.layer7 = ReluBatchNormTdnnLayer(layer7_dim, emb_dim, **layer7_params)

    for m in self.modules():
        if isinstance(m, nn.Conv1d):
            nn.init.kaiming_uniform_(m.weight, mode='fan_out', nonlinearity='relu')

    if training:
        if margin_loss:
            self.loss = MarginSoftmaxLoss(emb_dim, num_targets, **margin_loss_params)
        else:
            self.loss = SoftmaxLoss(emb_dim, num_targets, affine_type=tdnn_layer_params["affine_type"])
def init(self, inputs_dim, num_targets, extend=False, skip_connection=False,
         aug_dropout=0., context_dropout=0., hidden_dropout=0., dropout_params={},
         SE=False, se_ratio=4, tdnn_layer_params={}, tdnn6=True, tdnn7_params={},
         attentive_pooling=False, attentive_pooling_params={"hidden_size": 64, "stddev_attention": False},
         LDE_pooling=False, LDE_pooling_params={"c_num": 64, "nodes": 128},
         focal_loss=False, focal_loss_params={"gamma": 2},
         margin_loss=False, margin_loss_params={},
         use_step=False, step_params={}, transfer_from="softmax_loss",
         training=True, extracted_embedding="far"):

    ## Params.
    default_dropout_params = {
        "type": "default",  # default | random
        "start_p": 0.,
        "dim": 2,
        "method": "uniform",  # uniform | normal
        "continuous": False,
        "inplace": True
    }

    default_tdnn_layer_params = {
        "nonlinearity": 'relu', "nonlinearity_params": {"inplace": True},
        "bn-relu": False,
        "bn": True,
        "bn_params": {"momentum": 0.5, "affine": False, "track_running_stats": True}
    }

    default_margin_loss_params = {
        "method": "am", "m": 0.2, "feature_normalize": True, "s": 30,
        "double": False, "mhe_loss": False, "mhe_w": 0.01,
        "inter_loss": 0., "ring_loss": 0., "curricular": False
    }

    default_step_params = {
        "T": None,
        "m": False, "lambda_0": 0, "lambda_b": 1000, "alpha": 5, "gamma": 1e-4,
        "s": False, "s_tuple": (30, 12), "s_list": None,
        "t": False, "t_tuple": (0.5, 1.2),
        "p": False, "p_tuple": (0.5, 0.1)
    }

    dropout_params = utils.assign_params_dict(default_dropout_params, dropout_params)
    tdnn_layer_params = utils.assign_params_dict(default_tdnn_layer_params, tdnn_layer_params)
    # If a param is not specified, default it w.r.t. tdnn_layer_params.
    tdnn7_params = utils.assign_params_dict(tdnn_layer_params, tdnn7_params)
    margin_loss_params = utils.assign_params_dict(default_margin_loss_params, margin_loss_params)
    step_params = utils.assign_params_dict(default_step_params, step_params)

    ## Var.
    self.skip_connection = skip_connection
    self.use_step = use_step
    self.step_params = step_params
    self.extracted_embedding = extracted_embedding  # For extracting.

    ## Nnet.
    # Head
    self.aug_dropout = get_dropout_from_wrapper(aug_dropout, dropout_params)
    self.context_dropout = ContextDropout(p=context_dropout) if context_dropout > 0 else None
    self.hidden_dropout = get_dropout_from_wrapper(hidden_dropout, dropout_params)

    # Frame level
    self.tdnn1 = ReluBatchNormTdnnLayer(inputs_dim, 512, [-2, -1, 0, 1, 2], **tdnn_layer_params)
    self.se1 = SEBlock(512, ratio=se_ratio) if SE else None
    self.ex_tdnn1 = ReluBatchNormTdnnLayer(512, 512, **tdnn_layer_params) if extend else None
    self.tdnn2 = ReluBatchNormTdnnLayer(512, 512, [-2, 0, 2], **tdnn_layer_params)
    self.se2 = SEBlock(512, ratio=se_ratio) if SE else None
    self.ex_tdnn2 = ReluBatchNormTdnnLayer(512, 512, **tdnn_layer_params) if extend else None
    self.tdnn3 = ReluBatchNormTdnnLayer(512, 512, [-3, 0, 3], **tdnn_layer_params)
    self.se3 = SEBlock(512, ratio=se_ratio) if SE else None
    self.ex_tdnn3 = ReluBatchNormTdnnLayer(512, 512, **tdnn_layer_params) if extend else None
    self.ex_tdnn4 = ReluBatchNormTdnnLayer(512, 512, [-4, 0, 4], **tdnn_layer_params) if extend else None
    self.se4 = SEBlock(512, ratio=se_ratio) if SE and extend else None
    self.ex_tdnn5 = ReluBatchNormTdnnLayer(512, 512, **tdnn_layer_params) if extend else None
    self.tdnn4 = ReluBatchNormTdnnLayer(512, 512, **tdnn_layer_params)

    nodes = LDE_pooling_params.pop("nodes") if LDE_pooling else 1500
    self.tdnn5 = ReluBatchNormTdnnLayer(512, nodes, **tdnn_layer_params)

    # Pooling
    if LDE_pooling:
        self.stats = LDEPooling(nodes, **LDE_pooling_params)
    elif attentive_pooling:
        self.stats = AttentiveStatisticsPooling(nodes, **attentive_pooling_params, stddev=True)
    else:
        self.stats = StatisticsPooling(nodes, stddev=True)

    stats_dim = self.stats.get_output_dim()

    # Segment level
    if tdnn6:
        self.tdnn6 = ReluBatchNormTdnnLayer(stats_dim, 512, **tdnn_layer_params)
        tdnn7_dim = 512
    else:
        self.tdnn6 = None
        tdnn7_dim = stats_dim

    if tdnn7_params["nonlinearity"] == "default":
        tdnn7_params["nonlinearity"] = tdnn_layer_params["nonlinearity"]
    self.tdnn7 = ReluBatchNormTdnnLayer(tdnn7_dim, 512, **tdnn7_params)

    # Loss
    # The loss is not needed when extracting embeddings.
    if training:
        if margin_loss:
            self.loss = MarginSoftmaxLoss(512, num_targets, **margin_loss_params)
        elif focal_loss:
            self.loss = FocalLoss(512, num_targets, **focal_loss_params)
        else:
            self.loss = SoftmaxLoss(512, num_targets)

        # An example of using transfer learning without initializing loss.affine parameters.
        self.transform_keys = ["tdnn1", "tdnn2", "tdnn3", "tdnn4", "tdnn5", "stats", "tdnn6", "tdnn7",
                               "ex_tdnn1", "ex_tdnn2", "ex_tdnn3", "ex_tdnn4", "ex_tdnn5",
                               "se1", "se2", "se3", "se4", "loss"]

        if margin_loss and transfer_from == "softmax_loss":
            # For softmax_loss to am_softmax_loss.
            self.rename_transform_keys = {"loss.affine.weight": "loss.weight"}