def nograd_loss(args):
    # forward-only loss for one datapoint; `args` is a packed (model, datapoint) tuple
    model, datapoint = args
    states = None
    loss = 0
    with no_grad():
        for inp, lbl in datapoint:
            out, states = respond_to(model, inp, states)
            loss += sequence_loss(lbl, out, do_grad=False)
    return loss
def grad_loss(args):
    # loss plus accumulated gradients for one datapoint; `args` is a packed (model, datapoint) tuple
    model, datapoint = args
    states = None
    loss = 0
    grads = [zeros(param.size()) for layer in model for param in layer._asdict().values()]
    for inp, lbl in datapoint:
        out, states = respond_to(model, inp, states)
        states = [state.detach() for state in states]
        loss += sequence_loss(lbl, out)
        grads = [e1 + e2 for e1, e2 in zip(grads, collect_grads(model))]
    return grads, loss
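# The single packed `args` tuple taken by nograd_loss / grad_loss suggests they are meant to
# be mapped over worker processes. Below is only a sketch of such a driver, assuming CPU
# tensors and a plain multiprocessing.Pool; the repo's actual parallel path
# (process_batch_parallel / process_data_onebatch) is defined elsewhere and may differ.
# `pooled_nograd_losses` is not an original name.
def pooled_nograd_losses(model, batch, workers=4):
    from multiprocessing import Pool
    with Pool(workers) as pool:
        return pool.map(nograd_loss, [(model, datapoint) for datapoint in batch])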
def respond_to(model, sequences, training_run=True, extra_steps=0):
    responses = [[] for _ in range(len(sequences))]
    loss = 0
    convolver, enc, dec, deconvolver = model
    hann_w = hann() if not config.use_gpu else hann().cuda()
    ihann_w = ihann() if not config.use_gpu else ihann().cuda()

    # with no_grad():
    #     # print(convolver[0].w.size(), hann().size())
    #     convolver[0].w *= hann_w
    #     # deconvolver[0].w *= ihann(deconvolver[0].w)

    for i, sequence in enumerate(sequences):
        # print(f'seq{i}/{len(sequences)}')
        # print('in size:', sequence.size(), 'conv_w size:', convolver[0].w.unsqueeze(1).size())

        # frame the raw sequence with a Hann-windowed learned convolution
        sequence = conv1d(sequence, (convolver[0].w * hann_w).unsqueeze(1), stride=config.frame_stride)
        sequence = transpose(sequence, 1, 2)
        sequence /= config.frame_len
        # print('conved size:', sequence.size())

        # make key,query from all here.. => the transformer stuff
        for t in range(sequence.size(1) - 1):
            # curr_inp = sequence[:, t:t+1, :]
            prev_inps = sequence[:, :t + 1, :]
            lbl = sequence[:, t + 1:t + 2, :]
            # note: (t + 1) is grouped so both entries are positions normalized by max_T
            positions = Tensor([[(t + 1) / config.max_T, i / config.max_T] for i in range(t + 1)]).view(1, -1, 2)
            if config.use_gpu:
                positions = positions.cuda()
            # print(f'{t}/{sequence.size(1)}')
            # print('t:', t, ', prev inps size:', prev_inps.size(), 'curr inp size:', curr_inp.size())
            # todo: hmmmm..
            # inp = cat([prev_inps, curr_inp.repeat(1, t + 1, 1)], -1)
            inp = cat([prev_inps, positions], -1)
            # if config.seq_force_ratio != 1 and t >= 2:
            #     seq_force_ratio = config.seq_force_ratio ** t
            #     inp *= seq_force_ratio
            #     inp +=
            # print('inp size:', inp.size())
            enced = prop_model(enc, inp)
            # print('enced size:', enced.size())
            attn_inp = (softmax(enced, 1) * prev_inps).sum(1)
            # print('attnded size:', attn_inp.size())
            deced = prop_model(dec, attn_inp)
            loss += sequence_loss(lbl, deced)
            responses[-1].append(deced)
            # input('halt here..')

    if training_run:
        loss.backward()
        return float(loss)

    else:
        # print('seq size', sequence.size(1), 'hm resps', len(responses[-1]))
        if len(sequences) == 1:
            # free-running generation: feed the model's own responses back in for extra_steps steps
            for t_extra in range(extra_steps):
                t = sequence.size(1) + t_extra - 1
                # print(f't extra: {t}')
                curr_inp = responses[-1][t - 1]
                # print(sequence[:, :, :].size(), stack(responses[-1][sequence.size(1) - 1 - 1:], 1).size())
                prev_inps = cat([
                    sequence[:, :-1, :],
                    stack(responses[-1][sequence.size(1) - 1 - 1:], 1)
                ], 1)
                inp = cat([prev_inps, curr_inp.repeat(1, t + 1, 1)], -1)
                # print(inp.size())
                enced = prop_model(enc, inp)
                # print('enced size:', enced.size())
                attn_inp = (softmax(enced, 1) * prev_inps).sum(1)
                # print('attnded size:', attn_inp.size())
                deced = prop_model(dec, attn_inp)
                responses[-1].append(deced)

        responses = responses[-1]
        responses = [(deconvolver[0].w * resp).sum(1) for resp in responses]
        responses = [resp * ihann_w for resp in responses]
        hm_windows = (len(sequence) - config.frame_len // config.frame_stride) + 1
        responses = []  # todo: stitch together responses here..
        responses = Tensor(responses).view(1, 1, -1)

        return float(loss), responses
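# A possible shape for the overlap-add stitching marked "todo: stitch together responses"
# inside respond_to(); kept separate so the original behaviour is untouched. Only a sketch,
# assuming `frames` is a list of 1-D response tensors of length frame_len produced with hop
# size frame_stride; `overlap_add_stitch` is not an original name, and plain averaging is
# used here instead of a proper window-normalized overlap-add.
def overlap_add_stitch(frames, frame_len, frame_stride):
    import torch
    total_len = (len(frames) - 1) * frame_stride + frame_len
    out = torch.zeros(total_len)
    norm = torch.zeros(total_len)
    for k, frame in enumerate(frames):
        start = k * frame_stride
        out[start:start + frame_len] += frame.reshape(-1)[:frame_len]
        norm[start:start + frame_len] += 1
    return out / norm.clamp(min=1)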
def main(model=None):

    print(f'readying model & data @ {now()}')

    data = load_data()
    if not data:
        save_data(preprocess())
        data = load_data()

    if not model:
        if not config.fresh_model:
            model = load_model()
        if not model:
            model = make_model()
            save_model(model)
            model = load_model()
            print('created ', end='')
        else:
            print('loaded ', end='')
    print(f'model: {describe_model(model)}')

    print(f'total files: {len(data)}, ', end='')
    data, data_dev = split_dataset(data)
    if config.batch_size > len(data):
        config.batch_size = len(data)
    elif config.batch_size == -1:
        config.batch_size = len(data_dev)
    print(f'train: {len(data)}, dev: {len(data_dev)}, batch size: {config.batch_size}')
    print(f'hm train: {sum(len(datapoint) for datapoint in data)}, '
          f'hm dev: {sum(len(datapoint) for datapoint in data_dev)}, '
          f'learning rate: {config.learning_rate}, '
          f'optimizer: {config.optimizer}, '
          f'\ntraining for {config.hm_epochs} epochs.. ', end='\n')

    one_batch = (config.batch_size == len(data)) or (config.train_combined and config.train_parallel)
    config.shuffle_epoch &= not one_batch
    window_slide_multiplier = config.hm_bars_grouped // config.hm_bars_slide
    if config.ckp_save_epochs == -1:
        config.ckp_save_epochs = range(config.hm_epochs)

    data_losss, dev_losss = [], []
    if config.initialize_loss:
        print(f'initializing losses @ {now()}', flush=True)
        if not one_batch:
            data_losss.append(dev_loss(model, data))
        dev_losss.append(dev_loss(model, data_dev))
        print(f'initial losses: {data_losss, dev_losss}')

    print(f'training started @ {now()}', flush=True)

    for ep in range(config.hm_epochs):
        loss = 0

        if config.train_parallel and config.train_combined:
            l, g = process_data_onebatch(model, data)
            loss += l
            give_grads(model, g)
            batch_size = sum(sum(len(inp) * window_slide_multiplier for inp, lbl in datapoint) for datapoint in data)
            if config.optimizer == 'sgd':
                sgd(model, batch_size=batch_size)
            else:
                adaptive_sgd(model, ep, batch_size=batch_size)

        else:
            for i, batch in enumerate(batchify(data)):
                if config.disp_batches:
                    print(f'\tbatch {i}, {sum(len(datapoint) for datapoint in batch)}', end='', flush=True)
                batch_size = sum(sum(len(inp) * window_slide_multiplier for inp, lbl in datapoint) for datapoint in batch)

                if config.train_parallel:
                    l, g = process_batch_parallel(model, batch)
                    loss += l
                    give_grads(model, g)
                elif config.train_combined:
                    loss += process_batch_combined(model, batch)
                else:
                    for j, datapoint in enumerate(batch):
                        states = None
                        for k, (inp, lbl) in enumerate(datapoint):
                            out, states = respond_to(model, inp, states)
                            states = [state.detach() for state in states]
                            loss += sequence_loss(lbl, out)

                if config.optimizer == 'sgd':
                    sgd(model, batch_size=batch_size)
                else:
                    adaptive_sgd(model, ep, batch_size=batch_size)

                if config.disp_batches:
                    print(f', completed @ {now()}', flush=True)

        loss /= sum(sum(len(inp) * window_slide_multiplier for inp, lbl in datapoint) for datapoint in data)

        data_losss.append(loss)
        dev_losss.append(dev_loss(model, data_dev))

        print(f'epoch {ep}, loss {loss}, dev loss {dev_losss[-1]}, completed @ {now()}', flush=True)

        if ep in config.ckp_save_epochs:
            save_model(model, f'{config.model_save_path}_ckp{ep}')

    data_losss.append(dev_loss(model, data))
    dev_losss.append(dev_loss(model, data_dev))
    print(f'final losses: {[data_losss[-1], dev_losss[-1]]}')

    print(f'training ended @ {now()}', flush=True)

    plot(data_losss)
    show()
    plot(dev_losss)
    show()

    if config.overwrite_model or input(f'Save model as {config.model_save_path}? (y/n): ').lower() == 'y':
        save_model(load_model(), config.model_save_path + '_prev')
        save_model(model)

    return model, [data_losss, dev_losss]
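# For reference only: a sketch of the kind of batching main() expects from batchify(). The
# real batchify() lives elsewhere in the repo and may group or shuffle differently; this
# version just assumes config.batch_size-sized chunks with optional per-epoch shuffling.
# `batchify_sketch` is not an original name.
def batchify_sketch(data):
    from random import shuffle
    data = list(data)
    if config.shuffle_epoch:
        shuffle(data)
    return [data[i:i + config.batch_size] for i in range(0, len(data), config.batch_size)]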
def process_batch_combined(model, batch, training_run=True):

    batch = deepcopy(batch)
    loss = 0
    zero_states = empty_states(model, len(batch))
    all_states = deepcopy(zero_states)

    window_slide_ratio = config.hm_bars_slide / config.hm_bars_grouped
    teacher_ratio = config.hm_bars_teacher / config.hm_bars_grouped

    # pad every datapoint to the same number of (inp, lbl) pairs
    max_inplbls = max(len(datapoint) for datapoint in batch)
    for datapoint in batch:
        hm = max_inplbls - len(datapoint)
        if hm:
            datapoint.extend([None] * hm)

    has_remaining_inplbl = list(range(len(batch)))

    for ctr_inplbl in range(max_inplbls):
        # print('\t', f'{ctr_inplbl}/{max_inplbls}', now(), flush=True)

        has_remaining_inplbl = [i for i in has_remaining_inplbl if batch[i][ctr_inplbl] is not None]
        inplbls_slice = [batch[i][ctr_inplbl] for i in has_remaining_inplbl]

        # pad every input in this slice to the same length
        max_inplen = max(len(inp) for inp, lbl in inplbls_slice)
        for inp, lbl in inplbls_slice:
            hm = max_inplen - len(inp)
            if hm:
                inp.extend([None] * hm)

        all_inps = [batch[i][ctr_inplbl][0] for i in has_remaining_inplbl]
        all_lbls = [batch[i][ctr_inplbl][1] for i in has_remaining_inplbl]

        states_transfers_to = [int((len(inp) + 1) * window_slide_ratio) for inp, lbl in inplbls_slice]
        states_to_transfer = deepcopy(zero_states)

        teacher_up_to = [int((len(inp) + 1) * teacher_ratio) for inp, lbl in inplbls_slice]

        # all_outs = []

        has_remaining_inp = list(has_remaining_inplbl)
        has_remaining_inp_ = range(len(has_remaining_inplbl))

        for t in range(max_inplen):

            has_remaining_inp = [i for i, ii in zip(has_remaining_inp, has_remaining_inp_) if all_inps[ii][t] is not None]
            links_to_prev = [has_remaining_inp_.index(i) for i in [has_remaining_inplbl.index(i) for i in has_remaining_inp]]
            has_remaining_inp_ = [has_remaining_inplbl.index(i) for i in has_remaining_inp]

            # inps = cat([all_inps[i][t] for i in has_remaining_inp_], dim=0)
            # lbls = cat([all_lbls[i][t] for i in has_remaining_inp_], dim=0)

            # teacher forcing up to teacher_up_to[i]; after that, feed back the model's own previous output
            inps = cat([all_inps[i][t] if t <= teacher_up_to[i]
                        else outs[links_to_prev[ii]:links_to_prev[ii] + 1, :config.timestep_size]
                        for ii, i in enumerate(has_remaining_inp_)], dim=0)
            lbls = cat([all_lbls[i][t] for i in has_remaining_inp_], dim=0)

            states = [stack([row for i, row in enumerate(layer_state) if i in has_remaining_inp])
                      for layer_state in all_states]

            # start = time()

            outs, states = prop_model_nocircuit(model, states, inps)

            for layer_state, state in zip(all_states, states):
                for ii, i in enumerate(has_remaining_inp):
                    layer_state[i] = state[ii]

            # compare against a 1-indexed step count for the window-slide state hand-off
            t += 1
            for i in has_remaining_inp_:
                if t == states_transfers_to[i]:
                    for layer_state, transfer_state in zip(all_states, states_to_transfer):
                        transfer_state[i] = layer_state[i].detach()

            # nnt = time() - start

            # TODO: start a thread with this prop circuit + its loss part?
            # start = time()

            if not config.act_classical_rnn:
                # propagate through the circuit path, then fold statevector components
                # beyond timestep_size into the last output slot
                outs = prop_circuits(outs, inps)
                outs_ = outs[:, :config.timestep_size]
                for i in range(config.timestep_size, config.statevec_size):
                    outs_[:, -1] += outs[:, i]
                outs = outs_
            # else:
            #     outs = softmax(outs, dim=1)
            #     # outs = outs / outs.sum()

            # cct = time() - start

            # print('circuit out', flush=True); show_it = 7
            # print(circ_outs[show_it])
            # print('extra qiskit answer', flush=True)
            # from circuit import make_circuit, run_circuit
            # arg2 = inps[show_it]
            # arg1 = outs[show_it]
            # results = run_circuit(make_circuit(arg1.detach().numpy(), arg2.detach().numpy()), backend='state', display_job_status=False)
            # result_final = list(abs(result)**2 for result in results)
            # print(result_final)
            # input('Halt..')
            # from circuit import prop_circuit
            # print('extra extra answers..')
            # print('theoretical:')
            # print(prop_circuit(arg1, arg2))
            # print('experimental:')
            # print(prop_circuit(arg1, arg2, mode='experimental'))
            # input("HALT!")

            # print(f'> training times for t {t}/{max_inplen}*{max_inplbls}: {nnt} - {cct} ;; {cct/nnt}')
            # input("continue to next it.. ?")

            # all_outs.append(circ_outs)

            loss += sequence_loss(lbls, outs, do_stack=False)

            # for i, layer in enumerate(model):
            #     for l in layer._fields:
            #         g = getattr(layer, l).grad
            #         if g is not None:
            #             if g.sum() == 0: print(f'Zero grad at layer {i} {l}')
            #             else: print(f'layer {i} {l} norm: {g.norm()}, sum: {g.sum()}, abs-sum: {g.abs().sum()}')
            #         else: print(f'No grad at layer {i} {l} !')
            # input('Halt !')

        all_states = states_to_transfer

    if training_run:
        loss.backward()

    return float(loss)
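# The loop in process_batch_combined() that folds statevector components beyond
# config.timestep_size into the last output slot can also be written as one tensor op.
# A sketch only; `fold_statevec` is not an original name, and `outs` is assumed to have
# shape (batch, config.statevec_size).
def fold_statevec(outs, timestep_size):
    folded = outs[:, :timestep_size].clone()
    folded[:, -1] += outs[:, timestep_size:].sum(dim=1)
    return folded


# A minimal entry point, assuming this module is the training script that is run directly;
# if the repo already has its own __main__ guard elsewhere, that one takes precedence.
if __name__ == '__main__':
    main()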