def state_stack(net, audio):
    """Stack per-layer activations of the bottom and top speech encoders.

    Returns a tensor arranged as batch x length x layer x feature.
    """
    from vg.scorer import testing
    with testing(net):
        bottom = net.SpeechImage.SpeechEncoderBottom.states(audio)
        top = net.SpeechImage.SpeechEncoderTop.states(bottom[-1])
        # Concatenate along the layer axis, then move it after batch/length.
        stacked = torch.cat([bottom, top], dim=0).permute(1, 2, 0, 3)
        return stacked
def experiment(net, data, prov, model_config, run_config):
    """Train `net` on batches from `data`, logging costs and evaluation.

    Per epoch: trains over all batches, periodically prints validation
    loss, checkpoints to model.<epoch>.pkl, and appends an evaluation
    record (RSA, paraphrase retrieval, optionally speaker id) to
    result.json. The final model is saved as model.pkl.

    Fixes applied: the removed `Variable(..., volatile=True)` idiom is
    replaced by `torch.no_grad()`, and `loss.data[0]` — which raises
    IndexError on 0-dim tensors in modern PyTorch — by `loss.item()`,
    matching the other experiment() variants in this file.
    """
    def valid_loss(task):
        # Per-batch validation costs, computed without building a graph.
        result = []
        with torch.no_grad():
            for item in data.iter_valid_batches():
                args = [torch.from_numpy(x).cuda() for x in task.args(item)]
                result.append(task.test_cost(*args).data.cpu().numpy())
        return result

    net.cuda()
    net.train()
    scorer = Scorer(prov,
                    dict(split='val',
                         tokenize=lambda x: x['audio'],
                         batch_size=data.batch_size))
    net.optimizer.zero_grad()
    last_epoch = 0
    with open("result.json", "w") as out:
        for epoch in range(last_epoch + 1, run_config['epochs'] + 1):
            costs = Counter()
            net.train()
            for _j, item in enumerate(data.iter_train_batches()):
                j = _j + 1  # 1-based batch counter
                name = "Aud"
                task = net
                # Report the speaker only when the batch is homogeneous.
                spk = item['speaker'][0] if len(set(item['speaker'])) == 1 else 'MIXED'
                args = [torch.from_numpy(x).cuda() for x in task.args(item)]
                # Closure-style step: the optimizer re-evaluates task.step.
                loss = task.optimizer.step(lambda: task.step(*args))
                costs += Counter({'cost': loss.item(), 'N': 1})
                print(epoch, j, j * data.batch_size, name, spk, "train",
                      "".join([str(costs['cost'] / costs['N'])]))
                if j % run_config['validate_period'] == 0:
                    print(epoch, j, 0, name, spk, "valid",
                          "".join([str(numpy.mean(valid_loss(task)))]))
                sys.stdout.flush()
            torch.save(net, "model.{}.pkl".format(epoch))
            with testing(net):
                result = dict(epoch=epoch,
                              rsa=scorer.rsa_image(net),
                              para=scorer.retrieval_para(net))
                # speaker_id evaluation is on by default, opt-out via config.
                if run_config.get('speaker_id', True):
                    result['speaker_id'] = scorer.speaker_id(net)
                out.write(json.dumps(result))
                out.write("\n")
                out.flush()
    torch.save(net, "model.pkl")
def valid_loss(net, name, task, data):
    """Collect per-batch test costs of `task` on data[name]'s validation split."""
    losses = []
    with testing(net):
        for batch in data[name].iter_valid_batches():
            inputs = [torch.autograd.Variable(torch.from_numpy(a)).cuda()
                      for a in task.args(batch)]
            losses.append(task.test_cost(*inputs).data.cpu().numpy())
    return losses
def predict_beam(self, audio, audio_len, beam_size):
    """Beam-search decode character strings from a batch of audio."""
    with testing(self):
        encoded = self.SpeechEncoderBottom(audio, audio_len)
        hyps = self.TextDecoder.beam_search(encoded, beam_size)
        decoded = []
        for seq in hyps:
            # Truncate at the first end-of-sequence symbol, if any.
            i_eos = (seq == self.mapper.END_ID).nonzero()[0]
            end = i_eos[0] if i_eos.shape[0] > 0 else seq.shape[0]
            chars = [self.mapper.ids.from_id(tok.item()) for tok in seq[:end]]
            decoded.append(''.join(chars))
        return decoded
def predict(self, audio):
    """Encode audio through the bottom then top speech encoders, in testing mode."""
    with testing(self):
        encoded = self.SpeechImage.SpeechEncoderTop(
            self.SpeechImage.SpeechEncoderBottom(audio))
        return encoded
def encode_images(self, images):
    """Embed image features with the image encoder, in testing mode."""
    with testing(self):
        embedded = self.SpeechImage.ImageEncoder(images)
        return embedded
def predict(self, text):
    """Encode text through the bottom then top text encoders, in testing mode."""
    with testing(self):
        encoded = self.TextEncoderTop(self.TextEncoderBottom(text))
        return encoded
def test_cost(self, *args):
    """Evaluate cost(*args) with the model switched to testing mode."""
    with testing(self):
        value = self.cost(*args)
        return value
def predict(self, audio, audio_len):
    """Run the forward pass in testing mode and map logits to predictions."""
    with testing(self):
        logits, _state = self.forward(audio, audio_len)
        return self.logits2pred(logits.detach().cpu())
def predict(self, speech):
    """Apply the model to speech in testing mode."""
    with testing(self):
        output = self(speech)
        return output
def predict(self, speech):
    """Switch to eval mode and apply the model to speech, in testing mode."""
    with testing(self):
        self.eval()
        return self(speech)
def test_cost(self, beg, end):
    """Compute cost(beg, end) with the model in eval/testing mode."""
    with testing(self):
        self.eval()
        return self.cost(beg, end)
def experiment(net, data, run_config):
    """Jointly train the Text and Image tasks of `net` on paired batch streams.

    Per epoch: iterates train batches of both datasets in lockstep, steps
    each configured task, logs running average train cost and periodic
    validation loss, checkpoints to model.<epoch>.pkl, and appends an
    evaluation record (RSA, retrieval, speaker id) to result.json. The
    final model is saved as model.pkl.

    Fixes applied: the removed `Variable(..., volatile=True)` idiom is
    replaced by `torch.no_grad()`, `loss.data[0]` (IndexError on 0-dim
    tensors in modern PyTorch) by `loss.item()`, and the deprecated
    `nn.utils.clip_grad_norm` by `clip_grad_norm_`, matching the other
    experiment() variants in this file.
    """
    def valid_loss(name, task):
        # Per-batch lists of validation cost components, without autograd.
        result = []
        with torch.no_grad():
            for item in data[name].iter_valid_batches():
                args = [torch.from_numpy(x).cuda() for x in task.args(item)]
                result.append([x.data.cpu().numpy()
                               for x in task.test_cost(*args)])
        return result

    net.cuda()
    net.train()
    scorer = Scorer(data['Image'].provider,
                    dict(split='val',
                         tokenize=lambda x: x['audio'],
                         batch_size=data['Image'].batch_size))
    last_epoch = 0
    for _, task in run_config['tasks']:
        task.optimizer.zero_grad()
    with open("result.json", "w") as out:
        for epoch in range(last_epoch + 1, run_config['epochs'] + 1):
            costs = dict(Text=Counter(), Image=Counter())
            for _j, items in enumerate(
                    zip(data['Image'].iter_train_batches(reshuffle=True),
                        data['Text'].iter_train_batches(reshuffle=True))):
                j = _j + 1  # 1-based batch counter
                item = dict(Image=items[0], Text=items[1])
                for name, task in run_config['tasks']:
                    # Report the speaker only when the batch is homogeneous.
                    spk = item[name]['speaker'][0] if len(
                        set(item[name]['speaker'])) == 1 else 'MIXED'
                    args = [torch.from_numpy(x).cuda()
                            for x in task.args(item[name])]
                    loss = task.cost(*args)
                    task.optimizer.zero_grad()
                    loss.backward()
                    nn.utils.clip_grad_norm_(task.parameters(),
                                             task.config['max_norm'])
                    task.optimizer.step()
                    costs[name] += Counter({'cost': loss.item(), 'N': 1})
                    print(epoch, j, j * data[name].batch_size, name, spk,
                          "train",
                          "".join([str(costs[name]['cost'] /
                                       costs[name]['N'])]))
                    if j % run_config['validate_period'] == 0:
                        loss = valid_loss(name, task)
                        print(epoch, j, 0, name, "VALID", "valid",
                              "".join([str(numpy.mean(loss))]))
                    sys.stdout.flush()
            torch.save(net, "model.{}.pkl".format(epoch))
            with testing(net):
                result = dict(epoch=epoch,
                              rsa=scorer.rsa_image(net),
                              retrieval=scorer.retrieval(net),
                              speaker_id=scorer.speaker_id(net))
                out.write(json.dumps(result))
                out.write("\n")
                out.flush()
    torch.save(net, "model.pkl")
def predict(self, audio):
    """Encode audio with the Image task's speech encoder, in testing mode."""
    with testing(self):
        encoded = self.Image.SpeechEncoder(audio)
        return encoded
def layer_states(net, audio):
    """Return the bottom speech-encoder activations for `audio`, in testing mode."""
    from vg.scorer import testing
    with testing(net):
        activations = net.SpeechImage.SpeechEncoderBottom(audio)
        return activations
def experiment(net, data, run_config):
    """Train `net` on `data`, tracking CER/WER and keeping the best-WER model.

    Per epoch: trains every configured task over all batches, periodically
    prints validation loss, evaluates CER/WER with run_config['Scorer'],
    and appends the result to result.json. The model state_dict is saved
    whenever WER improves; otherwise the best weights are reloaded and,
    if configured, the optimizer's Adam epsilon is decayed. The final
    full model is saved at the end for inference.
    """
    net.cuda()
    net.train()
    scorer = run_config['Scorer']
    last_epoch = 0
    # Output locations; optionally redirected under run_config['save_path'].
    result_fpath = "result.json"
    model_fpath_tmpl = "model.{}.pkl"
    model_fpath = "model.pkl"
    if run_config['debug']:
        wdump_fpath = "weights.csv"
    if run_config['save_path'] is not None:
        result_fpath = os.path.join(run_config['save_path'], result_fpath)
        model_fpath_tmpl = os.path.join(run_config['save_path'],
                                        model_fpath_tmpl)
        model_fpath = os.path.join(run_config['save_path'], model_fpath)
        if run_config['debug']:
            wdump_fpath = os.path.join(run_config['save_path'], wdump_fpath)
            # NOTE(review): wdump appears to be opened only on this path;
            # looks like debug=True with save_path=None would leave it
            # unbound at wdump.write below — confirm intended.
            wdump = open(wdump_fpath, "w")
    for _, task in run_config['tasks']:
        task.optimizer.zero_grad()
    with open(result_fpath, "w") as out:
        if run_config['debug']:
            t = time.time()  # wall-clock reference for per-phase timing
        best_wer = None
        for epoch in range(last_epoch + 1, run_config['epochs'] + 1):
            # FIXME: avoid end of epoch with small batch?
            for _j, item in enumerate(data.iter_train_batches(reshuffle=True)):
                j = _j + 1  # 1-based batch counter
                for name, task in run_config['tasks']:
                    # Report the speaker only when the batch is homogeneous.
                    spkr = item['speaker']
                    spkr = spkr[0] if len(set(spkr)) == 1 else 'MIXED'
                    args = task.args(item)
                    args = [torch.from_numpy(x).cuda() for x in args]
                    loss = task.cost(*args)
                    task.optimizer.zero_grad()
                    loss.backward()
                    nn.utils.clip_grad_norm_(task.parameters(),
                                             task.config['max_norm'])
                    task.optimizer.step()
                    print(epoch, j, j * data.batch_size, spkr, "train",
                          str(loss.item()))
                    if j % run_config['validate_period'] == 0:
                        loss = valid_loss(net, task, data)
                        print(epoch, j, 0, "VALID", "valid",
                              str(np.mean(loss)))
                    # Dump weights for debugging
                    if run_config['debug']:
                        # First scalar of each parameter tensor, one CSV row
                        # per batch — a cheap drift/NaN check.
                        weights = [
                            str(p.view(-1)[0].item())
                            for p in task.parameters()
                        ]
                        wdump.write(",".join(weights))
                        wdump.write("\n")
                        wdump.flush()
                    sys.stdout.flush()
            #torch.save(net, model_fpath_tmpl.format(epoch))
            if run_config['debug']:
                t2 = time.time()
                print("Elapsed time: {:3f}".format(t2 - t))
                t = t2
            with testing(net):
                scorer.set_net(net)
                result = dict(epoch=epoch, cer=scorer.cer(), wer=scorer.wer())
                cer = result['cer']['CER']
                wer = result['wer']['WER']
                print(epoch, j, 0, "CER", "valid", cer, "WER", "valid", wer)
                out.write(json.dumps(result))
                out.write("\n")
                out.flush()
            # Save best model
            if best_wer is None or wer < best_wer:
                torch.save(net.state_dict(), model_fpath)
                best_wer = wer
            else:
                # No improvement: roll back to the best weights so far.
                net.load_state_dict(torch.load(model_fpath))
                if 'epsilon_decay' in run_config.keys():
                    # Shrink Adam's eps after a non-improving epoch.
                    for p in net.SpeechTranscriber.optimizer.param_groups:
                        p["eps"] *= run_config['epsilon_decay']
                        print('Epsilon decay - new value: ', p["eps"])
            if run_config['debug']:
                t2 = time.time()
                print("Elapsed time: {:3f}".format(t2 - t))
                t = t2
    if run_config['debug']:
        wdump.close()
    # Save full model for inference
    torch.save(net, model_fpath)
def experiment(net, data, run_config):
    """Jointly train the four speech/text/image tasks of `net`.

    Per epoch: iterates the four datasets' train batches in lockstep,
    steps each configured task, logs running average train cost and
    periodic validation loss, checkpoints to model.<epoch>.pkl, and
    appends an evaluation record (RSA, retrieval, speaker id) to
    result.json. The final model is saved as model.pkl.

    Fix applied: `loss.data.item()` replaced by the canonical
    `loss.item()` (legacy `.data` access), consistent with the other
    experiment() variants in this file.
    """
    net.cuda()
    net.train()
    scorer = run_config['Scorer']
    last_epoch = 0
    for _, task in run_config['tasks']:
        task.optimizer.zero_grad()
    with open("result.json", "w") as out:
        for epoch in range(last_epoch + 1, run_config['epochs'] + 1):
            costs = dict(SpeechText=Counter(),
                         SpeechCorrText=Counter(),
                         SpeechImage=Counter(),
                         TextImage=Counter())
            for _j, items in enumerate(
                    zip(data['SpeechImage'].iter_train_batches(reshuffle=True),
                        data['SpeechText'].iter_train_batches(reshuffle=True),
                        data['SpeechCorrText'].iter_train_batches(
                            reshuffle=True),
                        data['TextImage'].iter_train_batches(reshuffle=True))):
                j = _j + 1  # 1-based batch counter
                item = dict(SpeechImage=items[0],
                            SpeechText=items[1],
                            SpeechCorrText=items[2],
                            TextImage=items[3])
                for name, task in run_config['tasks']:
                    # Report the speaker only when the batch is homogeneous.
                    spk = item[name]['speaker'][0] if len(
                        set(item[name]['speaker'])) == 1 else 'MIXED'
                    args = task.args(item[name])
                    args = [torch.from_numpy(x).cuda() for x in args]
                    loss = task.cost(*args)
                    task.optimizer.zero_grad()
                    loss.backward()
                    nn.utils.clip_grad_norm_(task.parameters(),
                                             task.config['max_norm'])
                    task.optimizer.step()
                    costs[name] += Counter({'cost': loss.item(), 'N': 1})
                    print(epoch, j, j * data[name].batch_size, name, spk,
                          "train",
                          "".join([str(costs[name]['cost'] /
                                       costs[name]['N'])]))
                    if j % run_config['validate_period'] == 0:
                        loss = valid_loss(net, name, task, data)
                        print(epoch, j, 0, name, "VALID", "valid",
                              "".join([str(numpy.mean(loss))]))
                    sys.stdout.flush()
            torch.save(net, "model.{}.pkl".format(epoch))
            with testing(net):
                result = dict(epoch=epoch,
                              rsa=scorer.rsa_image(net),
                              retrieval=scorer.retrieval(net),
                              speaker_id=scorer.speaker_id(net))
                out.write(json.dumps(result))
                out.write("\n")
                out.flush()
    torch.save(net, "model.pkl")
def test_cost(self, speech1_prev, speech1, speech2, speech3_prev, speech3):
    """Encode the middle utterance and score it against its context, in eval mode."""
    with testing(self):
        self.eval()
        middle = self(speech2)
        return self.cost(speech1_prev, speech1, middle, speech3_prev, speech3)