def main():
    """Build or restore the speaker model, create its optimizer and run training.

    When ``opt.resume`` is set, the checkpoint at ``opt.works_dir/opt.resume``
    supplies the learning rate and the step counter, and the model is restored
    through the selected class's ``load_model``. Otherwise a new model is
    constructed from ``opt``. The model class is chosen from
    ``opt.seq_training`` and ``opt.model_type``.

    Raises:
        Exception: if the checkpoint file does not exist, or if
            ``opt.model_type`` is neither ``'lstm'`` nor ``'cnn'``.
    """
    checkpoint_path = None
    if opt.resume:
        checkpoint_path = os.path.join(opt.works_dir, opt.resume)
        if not os.path.isfile(checkpoint_path):
            raise Exception("no checkpoint found at {}".format(checkpoint_path))
        # The identity map_location hands back each storage as deserialized,
        # so nothing is moved to another device at load time.
        package = torch.load(checkpoint_path,
                             map_location=lambda storage, loc: storage)
        opt.lr = package.get('learning_rate', opt.lr)
        # The stored step count is decremented by one before use.
        opt.total_steps = int(package.get('total_steps', 0)) - 1
        print('total_steps is {}'.format(opt.total_steps))

    # Resolve the model class once; only the class actually selected is
    # looked up, exactly as in a plain if/elif ladder.
    sequence_mode = opt.seq_training == 'true'
    if opt.model_type == 'lstm':
        model_cls = DeepSpeakerSeqModel if sequence_mode else DeepSpeakerModel
    elif opt.model_type == 'cnn':
        model_cls = DeepSpeakerCnnSeqModel if sequence_mode else DeepSpeakerCnnModel
    else:
        raise Exception('wrong model_type {}'.format(opt.model_type))

    if checkpoint_path is None:
        model = model_cls(opt)
    else:
        model = model_cls.load_model(checkpoint_path, 'state_dict')
        logging.info('Loading model {}'.format(checkpoint_path))

    # Dump the architecture and the shape of every entry in the state dict.
    print(model)
    for entry_name, entry_tensor in model.state_dict().items():
        print(entry_name, entry_tensor.shape)

    model.to(device)
    optimizer = optim.Adam(
        model.parameters(),
        lr=opt.lr,
        betas=(opt.beta1, 0.999),
        weight_decay=opt.weight_decay,
    )
    train(opt, model, optimizer)
def main():
    """Create the model, optionally restore its weights, then enroll and test.

    If ``args.resume`` is set and names an existing file, the ``'state_dict'``
    entry of that checkpoint is loaded into the model. A missing file is only
    reported; enrollment and testing still run with the freshly built model.
    """
    model = DeepSpeakerModel(embedding_size=256, num_classes=10)

    if args.resume:
        if not os.path.isfile(args.resume):
            print('=> no checkpoint found at {}'.format(args.resume))
        else:
            print('=> loading checkpoint {}'.format(args.resume))
            checkpoint = torch.load(args.resume)
            weights = checkpoint['state_dict']
            model.load_state_dict(weights)

    # Debug aid kept from the original author:
    # print(calculateOneEmbedding('/home/zinzin/Documents/pytorch/deepspeaker-pytorch/data/test_set/dnl/s1/t1/s1_t1_1.wav', model))

    # Enrollment produces the embeddings that the test step consumes.
    embeddings = enrollment(model)
    test(model, embeddings)
def main():
    """Train and evaluate ``DeepSpeakerModel``, optionally resuming from a checkpoint.

    Reads the module-level ``args``, ``train_dir``, ``test_dir``, ``kwargs``,
    ``num_features`` and ``c``. Epochs run from ``args.start_epoch`` (replaced
    by the checkpoint's ``'epoch'`` when resuming) for ``args.epochs``
    iterations. With ``args.test_only`` set, the first loop iteration only
    evaluates and then returns.
    """
    # Flag for viewing the training images and the anchor-negative /
    # anchor-positive distances; it is not read anywhere in this function.
    test_display_triplet_distance = False

    # Print the experiment configuration.
    print('\nparsed options:\n{}\n'.format(vars(args)))
    class_count = len(train_dir.classes)
    print('\nNumber of Classes:\n{}\n'.format(class_count))

    # Instantiate the model.
    # TODO(xin): IMPORTANT load num_classes from checkpoint
    model = DeepSpeakerModel(
        embedding_size=args.embedding_size,
        num_classes=class_count,
        feature_dim=num_features,
        frame_dim=c.NUM_FRAMES,
    )
    if args.cuda:
        model.cuda()

    # NOTE(review): the summary is assumed to run regardless of args.cuda;
    # confirm against the original layout.
    from torchsummary import summary
    summary(model, (1, c.NUM_FRAMES, c.NUM_FEATURES))

    # For more detailed information on the model:
    # print(model)

    optimizer = create_optimizer(model, args.lr)

    # Optionally resume from a checkpoint; a missing file is only reported.
    if args.resume:
        if not os.path.isfile(args.resume):
            print('=> no checkpoint found at {}'.format(args.resume))
        else:
            print('=> loading checkpoint {}'.format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])

    first_epoch = args.start_epoch
    last_epoch = first_epoch + args.epochs

    make_loader = torch.utils.data.DataLoader
    train_loader = make_loader(train_dir, batch_size=args.batch_size,
                               shuffle=False, **kwargs)
    test_loader = make_loader(test_dir, batch_size=args.test_batch_size,
                              shuffle=False, **kwargs)

    for epoch in range(first_epoch, last_epoch):
        if args.test_only:
            # Evaluation-only mode: one pass over the test set, then stop.
            test(test_loader, model, epoch)
            return

        train(train_loader, model, optimizer, epoch)
        test(test_loader, model, epoch)
def main(libri_dir=c.DATASET_DIR):
    """Train ``DeepSpeakerModel`` with a triplet margin loss for 100 epochs.

    Args:
        libri_dir: directory handed to ``data_catalog`` to locate the fbank
            feature ``.npy`` files.

    After every epoch the model's state dict is written to
    ``checkpoint_<epoch>.pt`` in the current working directory.
    """
    print('Looking for fbank features [.npy] files in {}.'.format(libri_dir))
    libri = data_catalog(libri_dir)
    # Catalog layout, as noted by the original author:
    #                                                        filename  speaker_id
    #   0  audio/LibriSpeechSamples/train-clean-100-npy/1-100-0001.npy           1
    #   1  audio/LibriSpeechSamples/train-clean-100-npy/1-100-0002.npy           1
    unique_speakers = libri['speaker_id'].unique()  # 251 speaker
    transform = transforms.Compose([transforms.ToTensor()])

    train_dir = stochastic_mini_batch(libri)
    train_loader = DataLoader(train_dir, batch_size=c.BATCH_SIZE, shuffle=True)

    model = DeepSpeakerModel(embedding_size=c.EMBEDDING_SIZE,
                             num_classes=c.NUM_SPEAKERS)
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0)
    model.cuda()
    summary(model, input_size=(1, 160, 64))

    for epoch in range(100):
        model.train()
        for _step, batch in tqdm(enumerate(train_loader)):
            # Each batch carries three inputs followed by their three labels;
            # the labels are not used by the triplet loss below.
            data_a, data_p, data_n, _label_a, _label_p, _label_n = batch

            # Cast to float32, move to the GPU and wrap for autograd.
            data_a, data_p, data_n = (
                Variable(part.type(torch.FloatTensor).cuda())
                for part in (data_a, data_p, data_n)
            )

            out_a = model(data_a)
            out_p = model(data_p)
            out_n = model(data_n)
            triplet_loss = TripletMarginLoss(0.2).forward(out_a, out_p, out_n)

            # Compute the gradient and update the weights.
            optimizer.zero_grad()
            triplet_loss.backward()
            optimizer.step()

            # NOTE(review): per-batch loss reporting is assumed here; confirm
            # against the original layout.
            print('selected_triplet_loss', triplet_loss.data)

        print("epoch:", epoch)
        torch.save(model.state_dict(), "checkpoint_{}.pt".format(epoch))
def main():
    """Train ``DeepSpeakerModel`` for ``args.epochs`` epochs, optionally resuming.

    Reads the module-level ``args``, ``train_dir``, ``kwargs`` and ``LOG_DIR``.
    If ``args.resume`` names an existing file, the starting epoch, the model
    weights and the optimizer state are restored from it. A missing file is
    only reported and training starts from ``args.start_epoch`` as given.
    """
    # When True, the anchor-negative / anchor-positive distances on the
    # training images are displayed after each epoch.
    test_display_triplet_distance = False

    # Print the experiment configuration.
    print('\nparsed options:\n{}\n'.format(vars(args)))
    print('\nNumber of Classes:\n{}\n'.format(len(train_dir.classes)))

    # Instantiate the model.
    model = DeepSpeakerModel(embedding_size=args.embedding_size,
                             num_classes=len(train_dir.classes))
    if args.cuda:
        model.cuda()

    optimizer = create_optimizer(model, args.lr)

    # Optionally resume from a checkpoint.
    if args.resume:
        if os.path.isfile(args.resume):
            print('=> loading checkpoint {}'.format(args.resume))
            # Deserialize the checkpoint once and reuse it for every field;
            # loading the same file a second time only costs I/O and memory.
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        else:
            print('=> no checkpoint found at {}'.format(args.resume))

    start = args.start_epoch
    end = start + args.epochs

    train_loader = torch.utils.data.DataLoader(train_dir,
                                               batch_size=args.batch_size,
                                               shuffle=False, **kwargs)

    for epoch in range(start, end):
        # Only training runs here; the per-epoch test call is disabled.
        train(train_loader, model, optimizer, epoch)

        if test_display_triplet_distance:
            display_triplet_distance(model, train_loader,
                                     LOG_DIR + "/train_{}".format(epoch))
import os

import torch
from torch.autograd import Variable

from model import DeepSpeakerModel

# Smoke test: push one random batch through a freshly constructed model and
# print whatever it returns.
#
# Disabled alternative that fed real audio features instead of random data:
# _, _, rawInput = get_feature(os.path.dirname(os.path.abspath(__file__)) + '/s2_n_8_9.wav')
# print('Raw input: ', rawInput)
# if torch.cuda.is_available():
#     tensor = torch.from_numpy(rawInput).type(torch.FloatTensor).cuda()
#     cnnModel = DeepSpeakerModel(256, 1)
# else:
#     tensor = torch.from_numpy(rawInput).type(torch.FloatTensor)
#     cnnModel = DeepSpeakerModel(256, 1)
# input = Variable(tensor, requires_grad=True)
# print('Input: ', input)

# The model is built before the random batch is drawn so that the random
# number generator is consumed in the same order as before.
cnnModel = DeepSpeakerModel(256, 1)

_random_batch = torch.randn(128, 1, 64, 32).type(torch.FloatTensor)
input = Variable(_random_batch, requires_grad=True)

output = cnnModel(input)
print('OUTPUT: ', output)
import options as opt
from data_load import Mydataset
from model import DeepSpeakerModel
from audio_fbank import read_mfcc, sample_from_mfcc
import tensorflow as tf
import tqdm

if __name__ == '__main__':
    # Seed the CPU generator and every CUDA generator with the same value.
    torch.manual_seed(55)
    torch.cuda.manual_seed_all(55)

    model = DeepSpeakerModel(
        embedding_size=opt.embedding_size,
        num_classes=opt.classes,
    ).cuda()
    writer = SummaryWriter()

    if hasattr(opt, 'weights'):
        pretrained_dict = torch.load(opt.weights)
        model_dict = model.state_dict()

        # Keep only the pretrained entries whose name exists in the current
        # model and whose size matches the model's entry of that name.
        pretrained_dict = {
            name: tensor
            for name, tensor in pretrained_dict.items()
            if name in model_dict and tensor.size() == model_dict[name].size()
        }

        # Names of model entries that the filtered checkpoint does not cover.
        missed_params = [
            name for name in model_dict if name not in pretrained_dict
        ]
def load_model(ckp_path, embedding_size=512, num_classes=251):
    """Build a ``DeepSpeakerModel`` and load its weights from a checkpoint file.

    Args:
        ckp_path: path of the checkpoint file read with ``torch.load``.
        embedding_size: value passed to the model constructor as
            ``embedding_size``. Defaults to 512, the previously hard-coded value.
        num_classes: value passed to the model constructor as ``num_classes``.
            Defaults to 251, the previously hard-coded value.

    Returns:
        The model with the checkpoint's weights loaded.

    The file may hold either a bare state dict or a dict that wraps it under
    the ``'state_dict'`` key. Tensors are mapped to the CPU while loading.
    """
    model = DeepSpeakerModel(embedding_size=embedding_size,
                             num_classes=num_classes)
    checkpoint = torch.load(ckp_path, map_location='cpu')

    # Full training checkpoints store the weights under 'state_dict' next to
    # other entries; unwrap that layout so both kinds of file are accepted.
    if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
        checkpoint = checkpoint['state_dict']

    model.load_state_dict(checkpoint)
    return model
pin_memory=True) logging.info("Building dataset Sucessed") ## Building Model ## logging.info("Building Model") opt.in_size = val_dataset.in_size if opt.seq_training == 'true': if opt.model_type == 'lstm' or opt.model_type == 'blstmp': model = DeepSpeakerSeqModel(opt) elif opt.model_type == 'cnn': model = DeepSpeakerCnnSeqModel(opt) else: raise Exception('wrong model_type {}'.format(opt.model_type)) else: if opt.model_type == 'lstm' or opt.model_type == 'blstmp': model = DeepSpeakerModel(opt) elif opt.model_type == 'cnn': model = DeepSpeakerCnnModel(opt) else: raise Exception('wrong model_type {}'.format(opt.model_type)) if opt.resume: model, opt.steps = load(model, opt.resume) else: raise Exception('wrong opt.resume {}'.format(opt.resume)) model.to(opt.device) print(model) logging.info("Building Model Sucessed")