Example #1
def nograd_loss(args):

    model, datapoint = args

    states = None
    loss = 0

    # forward-only evaluation: accumulate the loss over the (input, label)
    # windows of one datapoint without building a computation graph
    with no_grad():

        for inp, lbl in datapoint:

            out, states = respond_to(model, inp, states)

            loss += sequence_loss(lbl, out, do_grad=False)

    return loss
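The single (model, datapoint) tuple argument suggests nograd_loss is meant to be mapped over a worker pool when computing evaluation loss. A minimal dispatch sketch, assuming a standard multiprocessing.Pool and picklable arguments; the dev_loss_parallel name is hypothetical:

from multiprocessing import Pool

def dev_loss_parallel(model, data, hm_workers=4):
    # map each datapoint to a worker; every worker returns its summed loss
    with Pool(hm_workers) as pool:
        losses = pool.map(nograd_loss, [(model, datapoint) for datapoint in data])
    # average over datapoints
    return sum(losses) / len(losses)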
Example #2
def grad_loss(args):

    model, datapoint = args

    states = None
    loss = 0

    # per-parameter gradient accumulators, zipped against collect_grads(model) below
    grads = [zeros(param.size()) for layer in model for param in layer._asdict().values()]

    for inp,lbl in datapoint:

        out, states = respond_to(model, inp, states)

        # cut the graph between windows so backprop stays within one window
        states = [state.detach() for state in states]

        loss += sequence_loss(lbl, out)
        grads = [e1 + e2 for e1, e2 in zip(grads, collect_grads(model))]

    return grads, loss
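grad_loss returns per-parameter gradient sums alongside the loss, so a parent process can combine results from several workers before a single optimizer step. A minimal reduction sketch, assuming each worker returned a (grads, loss) pair in the same parameter order; the reduce_worker_results name is hypothetical:

def reduce_worker_results(results):
    # element-wise sum of gradients and scalar sum of losses across workers
    total_grads, total_loss = None, 0
    for grads, loss in results:
        total_loss += loss
        total_grads = grads if total_grads is None else [g1 + g2 for g1, g2 in zip(total_grads, grads)]
    return total_grads, total_loss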
Example #3
def respond_to(model, sequences, training_run=True, extra_steps=0):

    responses = [[] for _ in range(len(sequences))]
    loss = 0

    # model is a (convolver, encoder, decoder, deconvolver) tuple
    convolver, enc, dec, deconvolver = model

    # analysis / synthesis (Hann) windows, on GPU if configured
    hann_w = hann() if not config.use_gpu else hann().cuda()
    ihann_w = ihann() if not config.use_gpu else ihann().cuda()

    for i, sequence in enumerate(sequences):

        # frame the raw sequence: strided, Hann-windowed 1d convolution,
        # then put the frame index on dim 1 and normalise by the frame length
        sequence = conv1d(sequence, (convolver[0].w * hann_w).unsqueeze(1),
                          stride=config.frame_stride)
        sequence = transpose(sequence, 1, 2)
        sequence /= config.frame_len

        # make key,query from all here.. => the transformer stuff

        for t in range(sequence.size(1) - 1):

            # frames up to t are the context; frame t+1 is the prediction target
            prev_inps = sequence[:, :t + 1, :]
            lbl = sequence[:, t + 1:t + 2, :]

            # per-frame positional features: (target position, frame position),
            # both scaled by max_T
            positions = Tensor([[(t + 1) / config.max_T, i / config.max_T]
                                for i in range(t + 1)]).view(1, -1, 2)
            if config.use_gpu: positions = positions.cuda()

            # concatenate the positions onto each context frame
            inp = cat([prev_inps, positions], -1)

            enced = prop_model(enc, inp)

            # attend over the context frames with the encoder's scores,
            # then decode the pooled context into the next-frame prediction
            attn_inp = (softmax(enced, 1) * prev_inps).sum(1)

            deced = prop_model(dec, attn_inp)

            loss += sequence_loss(lbl, deced)

            responses[-1].append(deced)

    if training_run:
        loss.backward()
        return float(loss)

    else:

        if len(sequences) == 1:

            # free-running continuation: keep predicting past the end of the
            # input, feeding each prediction back in as the current frame
            for t_extra in range(extra_steps):
                t = sequence.size(1) + t_extra - 1

                curr_inp = responses[-1][t - 1]

                prev_inps = cat([
                    sequence[:, :-1, :],
                    stack(responses[-1][sequence.size(1) - 1 - 1:], 1)
                ], 1)

                inp = cat([prev_inps, curr_inp.repeat(1, t + 1, 1)], -1)

                enced = prop_model(enc, inp)

                attn_inp = (softmax(enced, 1) * prev_inps).sum(1)

                deced = prop_model(dec, attn_inp)

                responses[-1].append(deced)

            # synthesis: project each response frame back through the
            # deconvolver and the inverse Hann window (overlap-add of the
            # windows is still unfinished)
            responses = responses[-1]
            responses = [(deconvolver[0].w * resp).sum(1)
                         for resp in responses]
            responses = [resp * ihann_w for resp in responses]
            hm_windows = (len(sequence) -
                          config.frame_len // config.frame_stride) + 1

            responses = []  # todo: stitch together responses here..
            responses = Tensor(responses).view(1, 1, -1)

        return float(loss), responses
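The core step of respond_to above is an attention pool: the encoder outputs are softmax-normalised over the time axis and used as weights for a sum over the previous frames. A self-contained sketch of just that operation, assuming PyTorch tensors shaped (batch, time, features):

import torch

def attention_pool(scores, prev_frames):
    # scores, prev_frames: (batch, time, features)
    weights = torch.softmax(scores, dim=1)      # normalise over the time axis
    return (weights * prev_frames).sum(dim=1)   # weighted sum -> (batch, features)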
Example #4
def main(model=None):

    print(f'readying model & data @ {now()}')

    data = load_data()
    if not data:
        save_data(preprocess())
        data = load_data()

    if not model:
        # reuse a saved model unless a fresh one is requested;
        # otherwise create, save and reload one
        if not config.fresh_model:
            model = load_model()
        if not model:
            model = make_model()
            save_model(model)
            model = load_model()
            print('created ',end='')
        else: print('loaded ',end='')
        print(f'model: {describe_model(model)}')

    print(f'total files: {len(data)}, ',end='')

    data, data_dev = split_dataset(data)

    # clamp the batch size to the train-set size; -1 means "use the dev-set size"
    if config.batch_size > len(data):
        config.batch_size = len(data)
    elif config.batch_size == -1:
        config.batch_size = len(data_dev)

    print(f'train: {len(data)}, dev: {len(data_dev)}, batch size: {config.batch_size}')

    print(f'hm train: {sum(len(datapoint) for datapoint in data)}, '
          f'hm dev: {sum(len(datapoint) for datapoint in data_dev)}, '
          f'learning rate: {config.learning_rate}, '
          f'optimizer: {config.optimizer}, '
          f'\ntraining for {config.hm_epochs} epochs.. ',end='\n')

    # a single batch covering the whole train set (or one combined+parallel
    # pass) makes per-epoch shuffling pointless
    one_batch = (config.batch_size == len(data)) or (config.train_combined and config.train_parallel)
    config.shuffle_epoch &= not one_batch
    window_slide_multiplier = config.hm_bars_grouped // config.hm_bars_slide
    if config.ckp_save_epochs == -1: config.ckp_save_epochs = range(config.hm_epochs)

    data_losss, dev_losss = [], []

    if config.initialize_loss:

        print(f'initializing losses @ {now()}', flush=True)
        if not one_batch:
            data_losss.append(dev_loss(model,data))
        dev_losss.append(dev_loss(model,data_dev))
        print(f'initial losses: {data_losss, dev_losss}')

    print(f'training started @ {now()}', flush=True)

    for ep in range(config.hm_epochs):

        loss = 0

        if config.train_parallel and config.train_combined:
            # one pass over the entire train set, gradients collected from the workers
            l, g = process_data_onebatch(model, data)
            loss += l
            give_grads(model, g)
            batch_size = sum(sum(len(inp) * window_slide_multiplier for inp, lbl in datapoint) for datapoint in data)
            sgd(model, batch_size=batch_size) if config.optimizer == 'sgd' else adaptive_sgd(model, ep, batch_size=batch_size)

        else:
            for i,batch in enumerate(batchify(data)):

                if config.disp_batches:
                    print(f'\tbatch {i}, {sum(len(datapoint) for datapoint in batch)}', end='', flush=True)

                batch_size = sum(sum(len(inp)*window_slide_multiplier for inp,lbl in datapoint) for datapoint in batch)

                if config.train_parallel:
                    l,g = process_batch_parallel(model,batch)
                    loss += l
                    give_grads(model,g)

                elif config.train_combined:
                    loss += process_batch_combined(model, batch)

                else:
                    # plain sequential training: one datapoint at a time,
                    # states detached between windows
                    for j,datapoint in enumerate(batch):
                        states = None
                        for k,(inp,lbl) in enumerate(datapoint):
                            out, states = respond_to(model, inp, states)
                            states = [state.detach() for state in states]
                            loss += sequence_loss(lbl,out)

                sgd(model,batch_size=batch_size) if config.optimizer == 'sgd' else adaptive_sgd(model,ep,batch_size=batch_size)

                if config.disp_batches:
                    print(f', completed @ {now()}' ,flush=True)

        # average the epoch loss over the total number of windows seen
        loss /= sum(sum(len(inp)*window_slide_multiplier for inp,lbl in datapoint) for datapoint in data)

        data_losss.append(loss)
        dev_losss.append(dev_loss(model,data_dev))
        
        print(f'epoch {ep}, loss {loss}, dev loss {dev_losss[-1]}, completed @ {now()}', flush=True)

        if ep in config.ckp_save_epochs:
            save_model(model,f'{config.model_save_path}_ckp{ep}')

    data_losss.append(dev_loss(model,data))
    dev_losss.append(dev_loss(model,data_dev))

    print(f'final losses: {[data_losss[-1],dev_losss[-1]]}')

    print(f'training ended @ {now()}', flush=True)

    plot(data_losss)
    show()
    plot(dev_losss)
    show()

    if config.overwrite_model or input(f'Save model as {config.model_save_path}? (y/n): ').lower() == 'y':
        save_model(load_model(),config.model_save_path+'_prev')
        save_model(model)

    return model, [data_losss, dev_losss]
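main relies on a batchify helper that is not shown in these examples. A hypothetical sketch consistent with how the loop consumes it, assuming config.shuffle_epoch and config.batch_size as used above:

from random import shuffle

def batchify(data):
    # optionally reshuffle once per epoch, then yield consecutive slices
    if config.shuffle_epoch:
        shuffle(data)
    for i in range(0, len(data), config.batch_size):
        yield data[i:i + config.batch_size]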
Example #5
def process_batch_combined(model,batch,training_run=True):

    batch = deepcopy(batch)  # work on a copy; the padding below mutates the datapoints

    loss = 0

    zero_states = empty_states(model,len(batch))

    all_states = deepcopy(zero_states)

    window_slide_ratio = config.hm_bars_slide/config.hm_bars_grouped
    teacher_ratio = config.hm_bars_teacher/config.hm_bars_grouped

    max_inplbls = max(len(datapoint) for datapoint in batch)

    # pad shorter datapoints with None so every entry has max_inplbls slots
    for datapoint in batch:
        hm = max_inplbls-len(datapoint)
        if hm:
            datapoint.extend([None]*hm)

    has_remaining_inplbl = list(range(len(batch)))

    for ctr_inplbl in range(max_inplbls):

        # keep only the batch entries that still have an (inp, lbl) pair here
        has_remaining_inplbl = [i for i in has_remaining_inplbl if batch[i][ctr_inplbl] is not None]

        inplbls_slice = [batch[i][ctr_inplbl] for i in has_remaining_inplbl]

        max_inplen = max(len(inp) for inp,lbl in inplbls_slice)

        # pad shorter inputs with None up to the longest input in this slice
        for inp,lbl in inplbls_slice:
            hm = max_inplen-len(inp)
            if hm:
                inp.extend([None]*hm)

        all_inps = [batch[i][ctr_inplbl][0] for i in has_remaining_inplbl]
        all_lbls = [batch[i][ctr_inplbl][1] for i in has_remaining_inplbl]

        # the step at which each entry's state is snapshot for the next window
        # (slide overlap), and the step up to which teacher forcing applies
        states_transfers_to = [int((len(inp)+1)*window_slide_ratio) for inp,lbl in inplbls_slice]
        states_to_transfer = deepcopy(zero_states)

        teacher_up_to = [int((len(inp)+1)*teacher_ratio) for inp,lbl in inplbls_slice]

        has_remaining_inp = list(has_remaining_inplbl)
        has_remaining_inp_ = range(len(has_remaining_inplbl))

        for t in range(max_inplen):

            # drop entries whose input ran out at this timestep, keeping index
            # maps from the active set back into all_inps / the previous outs
            has_remaining_inp = [i for i,ii in zip(has_remaining_inp,has_remaining_inp_) if all_inps[ii][t] is not None]

            links_to_prev = [has_remaining_inp_.index(i) for i in [has_remaining_inplbl.index(i) for i in has_remaining_inp]]

            has_remaining_inp_ = [has_remaining_inplbl.index(i) for i in has_remaining_inp]

            # teacher-force up to teacher_up_to[i]; afterwards feed back the
            # model's own previous output for that row
            inps = cat([all_inps[i][t] if t <= teacher_up_to[i] else outs[links_to_prev[ii]:links_to_prev[ii]+1,:config.timestep_size] for ii,i in enumerate(has_remaining_inp_)], dim=0)
            lbls = cat([all_lbls[i][t] for i in has_remaining_inp_], dim=0)

            # gather only the active rows of every layer's state
            states = [stack([row for i,row in enumerate(layer_state) if i in has_remaining_inp]) for layer_state in all_states]

            outs, states = prop_model_nocircuit(model, states, inps)

            # write the updated states back into the full-batch state tensors
            for layer_state, state in zip(all_states, states):
                for ii,i in enumerate(has_remaining_inp):
                    layer_state[i] = state[ii]

            # snapshot (detached) states for rows whose slide point is this step
            t += 1
            for i in has_remaining_inp_:
                if t == states_transfers_to[i]:
                    for layer_state, transfer_state in zip(all_states, states_to_transfer):
                        transfer_state[i] = layer_state[i].detach()

            # TODO: start a thread with this prop circuit + its loss part?

            if not config.act_classical_rnn:

                outs = prop_circuits(outs, inps)

                # fold the statevector dimensions beyond timestep_size into the
                # last timestep channel so outs matches the label width
                outs_ = outs[:,:config.timestep_size]
                for i in range(config.timestep_size,config.statevec_size):
                    outs_[:,-1] += outs[:,i]
                outs = outs_

            loss += sequence_loss(lbls,outs,do_stack=False)

        # the next window starts from the detached snapshot states
        all_states = states_to_transfer

    if training_run:
        loss.backward()

    return float(loss)
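The bookkeeping above hinges on None-padding: datapoints (and their inputs) are padded to a common length, and at every step only the rows whose current element exists stay active. A minimal sketch of that masking idea in isolation, with hypothetical names:

def step_active(padded, active, t):
    # keep only the rows whose t-th element is real (not padding)
    return [i for i in active if padded[i][t] is not None]

# usage sketch:
#   active = list(range(len(padded)))
#   for t in range(max_len):
#       active = step_active(padded, active, t)
#       ... process only the active rows ...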