def __init__(self, lr=0.0001, model="alexnet50", nOut=512, encoder_type='SAP', normalize=True, trainfunc='contrastive', **kwargs):
    """Build the speaker embedding network, its training loss and optimiser.

    Args:
        lr: Adam learning rate.
        model: name of the encoder module under ``models/``; the module must
            expose a class with the same name.
        nOut: embedding dimension produced by the encoder.
        encoder_type: pooling type forwarded to the encoder (e.g. 'SAP').
        normalize: unused in this variant; kept for interface compatibility.
        trainfunc: training objective. Only 'angleproto' and 'proto' are
            supported here. NOTE(review): the default 'contrastive' is NOT
            handled and raises ValueError — callers must pass an explicit
            trainfunc.
        **kwargs: ignored extra options.

    Raises:
        ValueError: if ``trainfunc`` is not a supported loss.
    """
    super(SpeakerNet, self).__init__()
    argsdict = {'nOut': nOut, 'encoder_type': encoder_type}

    # getattr() is the idiomatic spelling of module.__getattribute__(model).
    SpeakerNetModel = getattr(importlib.import_module('models.' + model), model)
    self.__S__ = SpeakerNetModel(**argsdict).cuda()

    # Each branch selects the loss module and whether embeddings are
    # L2-normalised at train/test time.
    if trainfunc == 'angleproto':
        self.__L__ = AngleProtoLoss().cuda()
        self.__train_normalize__ = True
        self.__test_normalize__ = True
    elif trainfunc == 'proto':
        self.__L__ = ProtoLoss().cuda()
        self.__train_normalize__ = False
        self.__test_normalize__ = False
    else:
        raise ValueError('Undefined loss.')

    # The loss module may carry trainable parameters, so optimise both.
    self.__optimizer__ = torch.optim.Adam(
        list(self.__S__.parameters()) + list(self.__L__.parameters()), lr=lr)

    # 40-dim mel front end: 16 kHz, 512-point FFT, 400-sample (25 ms)
    # window, 160-sample (10 ms) hop.
    self.torchfb = transforms.MelSpectrogram(
        sample_rate=16000, n_fft=512, win_length=400, hop_length=160,
        f_min=0.0, f_max=8000, pad=0, n_mels=40).cuda()
    self.instancenorm = nn.InstanceNorm1d(40).cuda()

    print('Initialised network with nOut %d encoder_type %s' % (nOut, encoder_type))
def __init__(self, device, max_frames, lr=0.0001, margin=1, scale=1, hard_rank=0, hard_prob=0, model="alexnet50", nOut=512, nSpeakers=1000, optimizer='adam', encoder_type='SAP', normalize=True, trainfunc='contrastive', **kwargs):
    """Build the speaker network on ``device`` with the chosen loss and optimiser.

    Args:
        device: torch device the encoder and loss modules are moved to.
        max_frames: frames per training segment; stored as ``__max_frames__``.
        lr: learning rate for the optimiser.
        margin, scale: hyper-parameters for the margin-softmax and pairwise losses.
        hard_rank, hard_prob: hard-negative mining options for the pairwise losses.
        model: name of the encoder module under ``models/``; the module must
            expose a class with the same name.
        nOut: embedding dimension.
        nSpeakers: number of classes for the classification losses.
        optimizer: 'adam' or 'sgd'.
        encoder_type: pooling type forwarded to the encoder (e.g. 'SAP').
        normalize: unused in this variant; kept for interface compatibility.
        trainfunc: training objective; one of the branches below.
        **kwargs: ignored extra options.

    Raises:
        ValueError: on an unknown ``trainfunc`` or ``optimizer``.
    """
    super(SpeakerNet, self).__init__()
    self.device = device
    argsdict = {'nOut': nOut, 'encoder_type': encoder_type}

    # getattr() is the idiomatic spelling of module.__getattribute__(model).
    # @TODO make cuda optional in order to train on dev machines w/o GPUs
    SpeakerNetModel = getattr(importlib.import_module('models.' + model), model)
    self.__S__ = SpeakerNetModel(**argsdict).to(self.device)

    # Each branch selects the loss module and whether embeddings are
    # L2-normalised at train/test time.
    if trainfunc == 'angleproto':
        self.__L__ = AngleProtoLoss().to(self.device)
        self.__train_normalize__ = True
        self.__test_normalize__ = True
    elif trainfunc == 'ge2e':
        self.__L__ = GE2ELoss().to(self.device)
        self.__train_normalize__ = True
        self.__test_normalize__ = True
    elif trainfunc == 'amsoftmax':
        self.__L__ = AMSoftmax(in_feats=nOut, n_classes=nSpeakers, m=margin, s=scale).to(self.device)
        self.__train_normalize__ = False
        self.__test_normalize__ = True
    elif trainfunc == 'aamsoftmax':
        self.__L__ = AAMSoftmax(in_feats=nOut, n_classes=nSpeakers, m=margin, s=scale).to(self.device)
        self.__train_normalize__ = False
        self.__test_normalize__ = True
    elif trainfunc == 'softmax':
        self.__L__ = SoftmaxLoss(in_feats=nOut, n_classes=nSpeakers).to(self.device)
        self.__train_normalize__ = False
        self.__test_normalize__ = True
    elif trainfunc == 'proto':
        self.__L__ = ProtoLoss().to(self.device)
        self.__train_normalize__ = False
        self.__test_normalize__ = False
    elif trainfunc == 'triplet':
        self.__L__ = PairwiseLoss(loss_func='triplet', hard_rank=hard_rank, hard_prob=hard_prob, margin=margin).to(self.device)
        self.__train_normalize__ = True
        self.__test_normalize__ = True
    elif trainfunc == 'contrastive':
        self.__L__ = PairwiseLoss(loss_func='contrastive', hard_rank=hard_rank, hard_prob=hard_prob, margin=margin).to(self.device)
        self.__train_normalize__ = True
        self.__test_normalize__ = True
    else:
        raise ValueError('Undefined loss.')

    # self.parameters() covers both the encoder and any loss parameters.
    if optimizer == 'adam':
        self.__optimizer__ = torch.optim.Adam(self.parameters(), lr=lr)
    elif optimizer == 'sgd':
        self.__optimizer__ = torch.optim.SGD(self.parameters(), lr=lr, momentum=0.9, weight_decay=5e-5)
    else:
        raise ValueError('Undefined optimizer.')

    self.__max_frames__ = max_frames
def __init__(self, max_frames, lr=0.0001, margin=1, scale=1, hard_rank=0, hard_prob=0, model="alexnet50", nOut=512, nSpeakers=1000, optimizer='adam', encoder_type='SAP', normalize=True, trainfunc='contrastive', **kwargs):
    """Set up the encoder (looked up in module globals), loss and optimiser.

    The chosen ``trainfunc`` determines both the loss module and whether
    embeddings are L2-normalised during training and evaluation.
    Raises ValueError for an unknown ``trainfunc`` or ``optimizer``.
    """
    super(SpeakerNet, self).__init__()

    model_args = {'nOut': nOut, 'encoder_type': encoder_type}
    # The encoder class is expected to be present in this module's globals.
    self.__S__ = globals()[model](**model_args).cuda()

    # trainfunc -> (loss factory, normalise at train time, normalise at test time)
    loss_table = {
        'angleproto': (lambda: AngleProtoLoss(), True, True),
        'ge2e': (lambda: GE2ELoss(), True, True),
        'amsoftmax': (lambda: AMSoftmax(in_feats=nOut, n_classes=nSpeakers, m=margin, s=scale), False, True),
        'aamsoftmax': (lambda: AAMSoftmax(in_feats=nOut, n_classes=nSpeakers, m=margin, s=scale), False, True),
        'softmax': (lambda: SoftmaxLoss(in_feats=nOut, n_classes=nSpeakers), False, True),
        'proto': (lambda: ProtoLoss(), False, False),
        'triplet': (lambda: PairwiseLoss(loss_func='triplet', hard_rank=hard_rank, hard_prob=hard_prob, margin=margin), True, True),
        'contrastive': (lambda: PairwiseLoss(loss_func='contrastive', hard_rank=hard_rank, hard_prob=hard_prob, margin=margin), True, True),
    }
    if trainfunc not in loss_table:
        raise ValueError('Undefined loss.')
    make_loss, self.__train_normalize__, self.__test_normalize__ = loss_table[trainfunc]
    self.__L__ = make_loss().cuda()

    if optimizer == 'adam':
        self.__optimizer__ = torch.optim.Adam(self.parameters(), lr=lr)
    elif optimizer == 'sgd':
        self.__optimizer__ = torch.optim.SGD(self.parameters(), lr=lr, momentum=0.9, weight_decay=5e-5)
    else:
        raise ValueError('Undefined optimizer.')

    self.__max_frames__ = max_frames
class SpeakerNet(nn.Module):
    """Speaker embedding network: encoder + metric-learning loss + mel front end.

    ``__S__`` is the encoder, ``__L__`` the loss module; both are trained
    jointly with a single Adam optimiser (``__optimizer__``).
    """

    def __init__(self, lr=0.0001, model="alexnet50", nOut=512, encoder_type='SAP', normalize=True, trainfunc='contrastive', **kwargs):
        """Build the encoder, loss, optimiser and mel-spectrogram front end.

        Args:
            lr: Adam learning rate.
            model: name of the encoder module under ``models/``; the module
                must expose a class with the same name.
            nOut: embedding dimension produced by the encoder.
            encoder_type: pooling type forwarded to the encoder (e.g. 'SAP').
            normalize: unused in this variant; kept for interface compatibility.
            trainfunc: training objective; only 'angleproto' and 'proto' are
                supported. NOTE(review): the default 'contrastive' is NOT
                handled and raises ValueError.
            **kwargs: ignored extra options.

        Raises:
            ValueError: if ``trainfunc`` is not a supported loss.
        """
        super(SpeakerNet, self).__init__()
        argsdict = {'nOut': nOut, 'encoder_type': encoder_type}

        # getattr() is the idiomatic spelling of module.__getattribute__(model).
        SpeakerNetModel = getattr(importlib.import_module('models.' + model), model)
        self.__S__ = SpeakerNetModel(**argsdict).cuda()

        if trainfunc == 'angleproto':
            self.__L__ = AngleProtoLoss().cuda()
            self.__train_normalize__ = True
            self.__test_normalize__ = True
        elif trainfunc == 'proto':
            self.__L__ = ProtoLoss().cuda()
            self.__train_normalize__ = False
            self.__test_normalize__ = False
        else:
            raise ValueError('Undefined loss.')

        # The loss module may carry trainable parameters, so optimise both.
        self.__optimizer__ = torch.optim.Adam(
            list(self.__S__.parameters()) + list(self.__L__.parameters()), lr=lr)

        # 40-dim mel front end: 16 kHz, 512-point FFT, 400-sample (25 ms)
        # window, 160-sample (10 ms) hop.
        self.torchfb = transforms.MelSpectrogram(
            sample_rate=16000, n_fft=512, win_length=400, hop_length=160,
            f_min=0.0, f_max=8000, pad=0, n_mels=40).cuda()
        self.instancenorm = nn.InstanceNorm1d(40).cuda()

        print('Initialised network with nOut %d encoder_type %s' % (nOut, encoder_type))

    ## ===== ===== ===== ===== ===== ===== ===== =====
    ## Train network
    ## ===== ===== ===== ===== ===== ===== ===== =====

    def train_network(self, loader):
        """Run one training pass over ``loader``.

        Each batch is split into its utterance groups, embedded by the
        encoder (optionally L2-normalised), and scored by the loss module.

        Returns:
            (mean loss, mean EER/accuracy) over the processed batches.
        """
        self.train()

        stepsize = loader.batch_size
        counter = 0
        index = 0
        loss = 0
        top1 = 0  # EER or accuracy, whichever the loss reports
        # NOTE(review): removed an unused CrossEntropyLoss criterion and an
        # unused conf_labels tensor that the original allocated here.

        tstart = time.time()

        for data in loader:
            self.zero_grad()

            # (batch, group, ...) -> (group, batch, 1, ...): one forward
            # pass per utterance group.
            data = data.transpose(0, 1).unsqueeze(2)

            feat = []
            for inp in data:
                outp = self.__S__.forward(torch.FloatTensor(inp).cuda())
                if self.__train_normalize__:
                    outp = F.normalize(outp, p=2, dim=1)
                feat.append(outp)
            feat = torch.stack(feat, dim=1).squeeze()

            # The loss computes its own targets from the group structure.
            nloss, prec1 = self.__L__.forward(feat, None)

            loss += nloss.detach().cpu()
            top1 += prec1
            counter += 1
            index += stepsize

            nloss.backward()
            self.__optimizer__.step()

            telapsed = time.time() - tstart
            tstart = time.time()

            sys.stdout.write("\rProcessing (%d) " % (index))
            sys.stdout.write("Loss %f EER/TAcc %2.3f%% - %.2f Hz " %
                             (loss / counter, top1 / counter, stepsize / telapsed))
            sys.stdout.flush()

        sys.stdout.write("\n")

        return (loss / counter, top1 / counter)

    ## ===== ===== ===== ===== ===== ===== ===== =====
    ## Evaluate from list
    ## ===== ===== ===== ===== ===== ===== ===== =====

    def evaluateFromList(self, listfilename, print_interval=100, test_path='', num_eval=10, eval_frames=200):
        """Score all trial pairs listed in ``listfilename``.

        Each list line is ``[label] enroll_file test_file``; a random 0/1
        label is substituted when the label column is missing. Embeddings
        are computed once per unique file, then pairs are scored by
        negative mean pairwise distance.

        Returns:
            (all_scores, all_labels, all_trials) — parallel lists per trial.
        """
        print('Evaluating with NumEval %d EvalFrames %d Normalize %s' %
              (num_eval, eval_frames, self.__test_normalize__))

        self.eval()

        lines = []
        files = []
        feats = {}
        tstart = time.time()

        ## Read all trial lines, collecting every referenced file.
        with open(listfilename) as listfile:
            for line in listfile:
                data = line.split()

                ## Append random label if missing.
                if len(data) == 2:
                    data = [random.randint(0, 1)] + data

                files.append(data[1])
                files.append(data[2])
                lines.append(line)

        setfiles = list(set(files))
        setfiles.sort()

        ## Embed each unique file once.
        for idx, file in enumerate(setfiles):
            inp1 = torch.FloatTensor(
                loadWAV(os.path.join(test_path, file), eval_frames,
                        evalmode=True, num_eval=num_eval)).cuda()

            with torch.no_grad():
                # Log-mel features (epsilon guards log(0)), instance-normed,
                # with a channel dim added for the encoder.
                feat = self.torchfb(inp1) + 1e-6
                feat = self.instancenorm(feat.log()).unsqueeze(1).detach()
                ref_feat = self.__S__.forward(feat).detach().cpu()

            feats[file] = ref_feat

            if idx % print_interval == 0:
                telapsed = time.time() - tstart
                sys.stdout.write("\rReading %d of %d: %.2f Hz, embedding size %d" %
                                 (idx, len(setfiles), idx / telapsed, ref_feat.size()[1]))

        print('')

        all_scores = []
        all_labels = []
        all_trials = []
        tstart = time.time()

        ## Score every trial pair.
        for idx, line in enumerate(lines):
            data = line.split()

            ## Append random label if missing.
            if len(data) == 2:
                data = [random.randint(0, 1)] + data

            ref_feat = feats[data[1]].cuda()
            com_feat = feats[data[2]].cuda()

            if self.__test_normalize__:
                ref_feat = F.normalize(ref_feat, p=2, dim=1)
                com_feat = F.normalize(com_feat, p=2, dim=1)

            # All-pairs distance between the num_eval crops of each side;
            # higher score (less distance) means more similar.
            dist = F.pairwise_distance(
                ref_feat.unsqueeze(-1),
                com_feat.unsqueeze(-1).transpose(0, 2)).detach().cpu().numpy()
            score = -1 * numpy.mean(dist)

            all_scores.append(score)
            all_labels.append(int(data[0]))
            all_trials.append(data[1] + " " + data[2])

            if idx % print_interval == 0:
                telapsed = time.time() - tstart
                sys.stdout.write("\rComputing %d of %d: %.2f Hz" %
                                 (idx, len(lines), idx / telapsed))
                sys.stdout.flush()

        print('\n')

        return (all_scores, all_labels, all_trials)

    ## ===== ===== ===== ===== ===== ===== ===== =====
    ## Update learning rate
    ## ===== ===== ===== ===== ===== ===== ===== =====

    def updateLearningRate(self, alpha):
        """Multiply every param group's learning rate by ``alpha``.

        Returns:
            The list of new learning rates, one per param group.
        """
        learning_rate = []
        for param_group in self.__optimizer__.param_groups:
            param_group['lr'] = param_group['lr'] * alpha
            learning_rate.append(param_group['lr'])
        return learning_rate

    ## ===== ===== ===== ===== ===== ===== ===== =====
    ## Save parameters
    ## ===== ===== ===== ===== ===== ===== ===== =====

    def saveParameters(self, path):
        """Serialise the full state dict (encoder + loss) to ``path``."""
        torch.save(self.state_dict(), path)

    ## ===== ===== ===== ===== ===== ===== ===== =====
    ## Load parameters
    ## ===== ===== ===== ===== ===== ===== ===== =====

    def loadParameters(self, path):
        """Load a checkpoint from ``path``, tolerating partial mismatches.

        Keys prefixed with "module." (DataParallel checkpoints) are retried
        without the prefix; missing or size-mismatched parameters are
        reported and skipped rather than raising.
        """
        self_state = self.state_dict()
        loaded_state = torch.load(path)

        for name, param in loaded_state.items():
            origname = name
            if name not in self_state:
                # Retry without a DataParallel "module." prefix.
                name = name.replace("module.", "")
                if name not in self_state:
                    print("%s is not in the model." % origname)
                    continue

            if self_state[name].size() != loaded_state[origname].size():
                print("Wrong parameter length: %s, model: %s, loaded: %s" %
                      (origname, self_state[name].size(), loaded_state[origname].size()))
                continue

            self_state[name].copy_(param)