def test(model, args):
    preds = []
    labels = []
    imp_indexes = []
    metrics = ['group_auc']
    test_file = os.path.join(args.data_dir, args.test_data_file)
    feature_file = os.path.join(args.data_dir, args.feature_file)
    history_file = os.path.join(args.data_dir, args.history_file)
    if 'last' in args.field:
        abs_file = os.path.join(args.data_dir, args.abs_file)
    else:
        abs_file = ''
    iterator = NewsIterator(batch_size=1,
                            npratio=-1,
                            feature_file=feature_file,
                            history_file=history_file,
                            abs_file=abs_file,
                            field=args.field,
                            fp16=True)
    print('test...')
    cudaid = 0
    step = 0
    with torch.no_grad():
        data_batch = iterator.load_test_data_from_file(test_file, None)
        batch_t = 0
        for imp_index, user_index, his_id, candidate_id, label, _ in data_batch:
            batch_t += len(candidate_id)
            his_id = his_id.cuda(cudaid)
            candidate_id = candidate_id.cuda(cudaid)
            logit = model(his_id, candidate_id, None, mode='validation')
            logit = list(np.reshape(np.array(logit.cpu()), -1))
            label = list(np.reshape(np.array(label), -1))
            imp_index = list(np.reshape(np.array(imp_index), -1))

            assert len(imp_index) == 1
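            # batch_size is 1 here, so repeating the single impression id once
            # per candidate keeps logits, labels, and impression ids aligned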
            imp_index = imp_index * len(logit)

            assert len(logit) == len(label)
            assert len(logit) == len(imp_index)
            assert np.sum(np.array(label)) != 0

            labels.extend(label)
            preds.extend(logit)
            imp_indexes.extend(imp_index)
            step += 1
            if step % 100 == 0:
                print('all data: ', len(labels))

    group_labels, group_preds = group_labels_func(labels, preds, imp_indexes)
    res = cal_metric(group_labels, group_preds, metrics)
    return res['group_auc']
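
These examples assume a shared set of module-level imports and helpers that are not shown. A minimal sketch of what they appear to be, judging from usage (the repo-local modules named in comments are assumptions):

import math
import os
import random

import numpy as np
import torch
import torch.nn as nn
import torch.distributed as dist
from torch.utils.tensorboard import SummaryWriter

# Repo-local helpers, assumed: batching, metric grouping/scoring, LR schedule.
# from data_iterator import NewsIterator
# from metrics import group_labels_func, cal_metric

# Only the fp16 / distributed variants below need these.
# import apex
# from apex import amp
# from apex.parallel import DistributedDataParallel as DDP
# import pynvml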
Example #2
def test(model, args):
    preds = []
    labels = []
    imp_indexes = []
    metrics = ['group_auc']
    cudaid = 0
    test_file = os.path.join(args.data_dir, args.test_data_file)
    feature_file = os.path.join(args.data_dir, args.feature_file)
    iterator = NewsIterator(batch_size=900,
                            npratio=-1,
                            feature_file=feature_file,
                            field=args.field)
    print('test...')
    with torch.no_grad():
        data_batch = iterator.load_data_from_file(test_file)
        batch_t = 0
        for imp_index, user_index, his_id, candidate_id, label in data_batch:
            batch_t += len(candidate_id)
            his_id = his_id.cuda(cudaid)
            candidate_id = candidate_id.cuda(cudaid)
            logit = model(his_id, candidate_id, None, mode='validation')
            logit = list(np.reshape(np.array(logit.cpu()), -1))
            label = list(np.reshape(np.array(label), -1))
            imp_index = list(np.reshape(np.array(imp_index), -1))

            labels.extend(label)
            preds.extend(logit)
            imp_indexes.extend(imp_index)
            print('all data: ', len(labels))

    group_labels, group_preds = group_labels_func(labels, preds, imp_indexes)
    res = cal_metric(group_labels, group_preds, metrics)
    return res['group_auc']
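
group_labels_func and cal_metric are not shown in these examples. A minimal sketch of the grouped AUC they presumably compute together, i.e. bucket predictions by impression id and average the per-impression ROC AUC (the function name and the single-class guard are illustrative):

from collections import defaultdict

import numpy as np
from sklearn.metrics import roc_auc_score

def group_auc(labels, preds, imp_indexes):
    """Mean ROC AUC over impressions (one impression = one page view)."""
    groups = defaultdict(lambda: ([], []))
    for label, pred, imp in zip(labels, preds, imp_indexes):
        groups[imp][0].append(label)
        groups[imp][1].append(pred)
    # AUC is undefined when an impression has only one class of labels.
    aucs = [roc_auc_score(ls, ps)
            for ls, ps in groups.values() if len(set(ls)) > 1]
    return float(np.mean(aucs))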
Example #3
def test(cudaid, args, model):  # valid
    dist.init_process_group(
        backend='nccl',
        init_method='env://',
        world_size=args.size,
        rank=cudaid)

    model.eval()
    test_file = os.path.join(args.data_dir, args.data_file)
    feature_file = os.path.join(args.data_dir, args.feature_file)
    history_file = os.path.join(args.data_dir, args.history_file)
    if 'last' in args.field:
        abs_file = os.path.join(args.data_dir, args.abs_file)
    else:
        abs_file = ''

    w = open(os.path.join(args.data_dir, args.log_file + str(cudaid)), 'w')
    iterator = NewsIterator(batch_size=args.gpu_size,
                            npratio=-1,
                            feature_file=feature_file,
                            history_file=history_file,
                            abs_file=abs_file,
                            field=args.field,
                            fp16=True)
    print('test...')
    with torch.no_grad():
        data_batch = iterator.load_test_data_from_file(test_file, args.can_length)
        batch_t = 0
        for imp_index, user_index, his_id, candidate_id, label, can_len in data_batch:
            batch_t += len(candidate_id)
            his_id = his_id.cuda(cudaid)
            candidate_id = candidate_id.cuda(cudaid)
            logit = model.predict(his_id, candidate_id)

            logit = np.array(logit.cpu())
            imp_index = np.reshape(np.array(imp_index), -1)
            assert len(imp_index) == len(logit)

            for i in range(len(imp_index)):
                for j in range(can_len[i][0]):
                    w.write('imp_index: ' + str(imp_index[i]) + ' logit: ' +
                            str(logit[i][j]) + '\n')
                    print('imp_index: ' + str(imp_index[i]) + ' logit: ' +
                          str(logit[i][j]))
            print('imp_index: ', imp_index[-1])
    w.close()
Example #4
def train(cudaid, args, model):

    dist.init_process_group(
        backend='nccl',
        init_method='env://',
        world_size=args.size,
        rank=cudaid)

    random.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    print('params: ', " T_warm: ", T_warm, " all_iteration: ", all_iteration,
          " lr: ", lr)
    print('rank: ', cudaid)
    torch.cuda.set_device(cudaid)
    model.cuda(cudaid)

    accumulation_steps = int(args.batch_size / args.size / args.gpu_size)
    optimizer = apex.optimizers.FusedLAMB(model.parameters(),
                                          lr=lr,
                                          betas=(0.9, 0.98),
                                          eps=1e-6,
                                          weight_decay=0.0,
                                          max_grad_norm=1.0)
    model, optimizer = amp.initialize(model, optimizer, opt_level='O2')
    model = DDP(model)
    

    accum_batch_loss = 0
    iterator = NewsIterator(batch_size=args.gpu_size,
                            npratio=4,
                            feature_file=os.path.join(args.data_dir,
                                                      args.feature_file),
                            field=args.field)
    train_file = os.path.join(args.data_dir, args.data_file)
    print('train...', args.field)
    if cudaid == 0:
        writer = SummaryWriter(os.path.join(args.data_dir, args.log_file))
    model.train()
    batch_t = 0
    iteration = 0
    step = 0
    best_score = -1

    for epoch in range(0, 10):
        all_loss = 0
        all_batch = 0
        data_batch = iterator.load_data_from_file(train_file, cudaid, args.size)
        for imp_index, user_index, his_id, candidate_id, label in data_batch:
            batch_t += 1
            assert candidate_id.shape[1] == 2
            his_id = his_id.cuda(cudaid)
            candidate_id = candidate_id.cuda(cudaid)
            label = label.cuda(cudaid)
            loss = model(his_id, candidate_id, label)

            sample_size = candidate_id.shape[0]
            loss = loss.sum() / sample_size / math.log(2)
            
            accum_batch_loss += float(loss)

            all_loss += float(loss)
            all_batch += 1

            loss = loss / accumulation_steps
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()

            if batch_t % accumulation_steps == 0:

                iteration += 1
                adjust_learning_rate(optimizer, iteration)
                optimizer.step()
                optimizer.zero_grad()
                if cudaid == 0:
                    print(' batch_t: ', batch_t, ' iteration: ', iteration,
                          ' epoch: ', epoch, ' accum_batch_loss: ',
                          accum_batch_loss / accumulation_steps, ' lr: ',
                          optimizer.param_groups[0]['lr'])
                    writer.add_scalar('Loss/train',
                                      accum_batch_loss / accumulation_steps,
                                      iteration)
                    writer.add_scalar('Ltr/train',
                                      optimizer.param_groups[0]['lr'],
                                      iteration)
                accum_batch_loss = 0
                if iteration % 2 == 0 and cudaid == 0:
                    torch.cuda.empty_cache()
                    model.eval()
                    auc = test(model, args)
                    print(auc)
                    writer.add_scalar('auc/valid', auc, step)
                    step += 1
                    if auc > best_score:
                        torch.save(
                            model.state_dict(),
                            os.path.join(args.save_dir,
                                         'Plain_robert_dot_best.pkl'))
                        best_score = auc
                        print('best score: ', best_score)
                    torch.cuda.empty_cache()
                    model.train()
        
        if cudaid == 0:
            torch.save(
                model.state_dict(),
                os.path.join(args.save_dir,
                             'Plain_robert_dot' + str(epoch) + '.pkl'))
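
T_warm, all_iteration, lr, and adjust_learning_rate are module-level globals defined outside these snippets. A plausible linear warmup-then-decay schedule consistent with how they are printed and called (an assumption, not necessarily the repository's exact schedule; the constants are illustrative):

T_warm = 5000          # warmup iterations
all_iteration = 60000  # iteration at which the LR reaches zero
lr = 1e-4              # peak learning rate

def adjust_learning_rate(optimizer, iteration):
    """Linear warmup to lr over T_warm steps, then linear decay to zero."""
    if iteration <= T_warm:
        cur_lr = lr * iteration / T_warm
    else:
        cur_lr = lr * max(0.0, (all_iteration - iteration) /
                          (all_iteration - T_warm))
    for group in optimizer.param_groups:
        group['lr'] = cur_lr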
Example #5
def train(model, optimizer, args):

    print('params: ', " T_warm: ", T_warm, " all_iteration: ", all_iteration,
          " lr: ", lr)
    cuda_list = range(args.size)
    cudaid = 0
    accumulation_steps = int(args.batch_size / args.size / 8)
    model = nn.DataParallel(model, device_ids=cuda_list)
    accum_batch_loss = 0
    iterator = NewsIterator(batch_size=8 * args.size,
                            npratio=4,
                            feature_file=os.path.join(args.data_dir,
                                                      args.feature_file))
    train_file = os.path.join(args.data_dir, args.data_file)
    batch_t = 0
    iteration = 0
    print('train...', cuda_list)
    writer = SummaryWriter(os.path.join(args.data_dir, args.log_file))

    model.train()
    for epoch in range(0, 10):
        all_loss = 0
        all_batch = 0
        data_batch = iterator.load_data_from_file(train_file)
        for imp_index, user_index, his_id, candidate_id, label in data_batch:
            batch_t += 1
            assert candidate_id.shape[1] == 2
            his_id = his_id.cuda(cudaid)
            candidate_id = candidate_id.cuda(cudaid)
            label = label.cuda(cudaid)
            loss, sample_size = model(his_id, candidate_id, label)

            sample_size = float(sample_size.sum())
            loss = loss.sum() / sample_size / math.log(2)

            accum_batch_loss += float(loss)

            all_loss += float(loss)
            all_batch += 1

            loss = loss / accumulation_steps
            loss.backward()

            if (batch_t) % accumulation_steps == 0:
                iteration += 1
                adjust_learning_rate(optimizer, iteration)

                optimizer.step()
                optimizer.zero_grad()
                print(' batch_t: ', batch_t, ' iteration: ', iteration,
                      ' epoch: ', epoch, ' accum_batch_loss: ',
                      accum_batch_loss / accumulation_steps, ' lr: ',
                      optimizer.param_groups[0]['lr'])
                writer.add_scalar('Loss/train',
                                  accum_batch_loss / accumulation_steps,
                                  iteration)
                accum_batch_loss = 0
        torch.save(
            model.state_dict(),
            os.path.join(args.save_dir,
                         'Plain_robert_dot' + str(epoch) + '.pkl'))
Example #6
def train(model, optimizer, args):

    print('params: ', " T_warm: ", T_warm, " all_iteration: ", all_iteration,
          " lr: ", lr)
    cuda_list = range(args.size)
    accumulation_steps = int(args.batch_size / args.size / args.gpu_size)
    cudaid = 0
    model = torch.nn.DataParallel(model)
    accum_batch_loss = 0
    iterator = NewsIterator(batch_size=args.gpu_size * args.size,
                            npratio=4,
                            feature_file=os.path.join(args.data_dir,
                                                      args.feature_file),
                            field=args.field)
    train_file = os.path.join(args.data_dir, args.data_file)
    batch_t = 0
    iteration = 0
    print('train...', cuda_list)
    writer = SummaryWriter(os.path.join(args.data_dir, args.log_file))
    model.train()
    step = 0
    best_score = -1

    for epoch in range(0, 10):
        all_loss = 0
        all_batch = 0
        data_batch = iterator.load_data_from_file(train_file)
        for imp_index, user_index, his_id, candidate_id, label in data_batch:
            batch_t += 1
            assert candidate_id.shape[1] == 2
            his_id = his_id.cuda(cudaid)
            candidate_id = candidate_id.cuda(cudaid)
            label = label.cuda(cudaid)
            loss = model(his_id, candidate_id, label)

            sample_size = candidate_id.shape[0]
            loss = loss.sum() / sample_size / math.log(2)

            accum_batch_loss += float(loss)

            all_loss += float(loss)
            all_batch += 1

            loss = loss / accumulation_steps
            loss.backward()

            if (batch_t) % accumulation_steps == 0:

                iteration += 1
                adjust_learning_rate(optimizer, iteration)
                optimizer.step()
                optimizer.zero_grad()
                print(' batch_t: ', batch_t, ' iteration: ', iteration,
                      ' epoch: ', epoch, ' accum_batch_loss: ',
                      accum_batch_loss / accumulation_steps, ' lr: ',
                      optimizer.param_groups[0]['lr'])
                writer.add_scalar('Loss/train',
                                  accum_batch_loss / accumulation_steps,
                                  iteration)
                writer.add_scalar('Ltr/train', optimizer.param_groups[0]['lr'],
                                  iteration)
                accum_batch_loss = 0
                if iteration % 2 == 0:
                    torch.cuda.empty_cache()
                    model.eval()
                    auc = test(model, args)
                    print(auc)
                    writer.add_scalar('auc/valid', auc, step)
                    step += 1
                    if auc > best_score:
                        torch.save(
                            model.state_dict(),
                            os.path.join(args.save_dir,
                                         'Plain_robert_dot_best.pkl'))
                        best_score = auc
                        print('best score: ', best_score)
                    torch.cuda.empty_cache()
                    model.train()
        torch.save(
            model.state_dict(),
            os.path.join(args.save_dir,
                         'Plain_robert_dot' + str(epoch) + '.pkl'))
Example #7
def train(cudaid, args, model):
    pynvml.nvmlInit()
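    # NVML is initialized so GPU memory usage can be queried while training
    # (the query itself is not performed in this snippet).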
    dist.init_process_group(backend='nccl',
                            init_method='env://',
                            world_size=args.size,
                            rank=cudaid)

    random.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    print('params: ', " T_warm: ", T_warm, " all_iteration: ", all_iteration,
          " lr: ", lr)
    #cuda_list=range(args.size)
    print('rank: ', cudaid)
    torch.cuda.set_device(cudaid)
    model.cuda(cudaid)

    accumulation_steps = int(args.batch_size / args.size / args.gpu_size)
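    # Gradient accumulation: args.batch_size is the global effective batch,
    # args.size the number of GPUs, args.gpu_size the per-GPU micro-batch;
    # e.g. batch_size=128 on 4 GPUs with gpu_size=8 accumulates 4 micro-steps.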
    #optimizer = torch.optim.Adam(model.parameters(), lr=lr,betas=(0.9,0.98),eps=1e-6,weight_decay=0.0)
    optimizer = apex.optimizers.FusedLAMB(model.parameters(),
                                          lr=lr,
                                          betas=(0.9, 0.98),
                                          eps=1e-6,
                                          weight_decay=0.0,
                                          max_grad_norm=1.0)

    model, optimizer = amp.initialize(model, optimizer, opt_level='O2')
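    # Apex amp 'O2' runs the model in mixed precision with dynamic loss
    # scaling; upstream PyTorch now recommends torch.cuda.amp instead.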

    model = DDP(model)

    accum_batch_loss = 0
    history_file = os.path.join(args.data_dir, args.history_file)
    if 'last' in args.field:
        abs_file = os.path.join(args.data_dir, args.abs_file)
    else:
        abs_file = ''
    iterator = NewsIterator(batch_size=args.gpu_size,
                            npratio=4,
                            feature_file=os.path.join(args.data_dir,
                                                      args.feature_file),
                            history_file=history_file,
                            abs_file=abs_file,
                            field=args.field,
                            fp16=True)
    train_file = os.path.join(args.data_dir, args.data_file)

    print('train...', args.field)
    if cudaid == 0:
        writer = SummaryWriter(os.path.join(args.data_dir, args.log_file))
    model.train()

    iteration = args.iteration
    batch_t = args.batch_t
    step = int(iteration / 500) + 1
    best_score = args.best_score

    start_pos = None  # args.batch_t*args.gpu_size; if this is not 0, be sure to apply a modulo (%)

    for epoch in range(args.epoch, 12):
        all_loss = 0
        all_batch = 0
        if epoch != args.epoch:
            data_batch = iterator.load_data_from_file(train_file, cudaid,
                                                      args.size)
        else:
            data_batch = iterator.load_data_from_file(train_file, cudaid,
                                                      args.size, start_pos)
        print('load ok...')
        for imp_index, user_index, his_id, candidate_id, label in data_batch:
            batch_t += 1
            assert candidate_id.shape[1] == 2

            his_id = his_id.cuda(cudaid)
            candidate_id = candidate_id.cuda(cudaid)
            label = label.cuda(cudaid)
            loss = model(his_id, candidate_id, label)

            sample_size = candidate_id.shape[0]
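            # .sum()/sample_size averages the loss over the batch; dividing by
            # math.log(2) rescales it from nats to bits (base-2 log-loss).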
            loss = loss.sum() / sample_size / math.log(2)

            accum_batch_loss += float(loss)

            all_loss += float(loss)
            all_batch += 1

            loss = loss / accumulation_steps

            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()

            if (batch_t) % accumulation_steps == 0:

                iteration += 1
                adjust_learning_rate(optimizer, iteration)
                optimizer.step()
                optimizer.zero_grad()
                if cudaid == 0:
                    print(' batch_t: ', batch_t, ' iteration: ', iteration,
                          ' epoch: ', epoch, ' accum_batch_loss: ',
                          accum_batch_loss / accumulation_steps, ' lr: ',
                          optimizer.param_groups[0]['lr'])
                    writer.add_scalar('Loss/train',
                                      accum_batch_loss / accumulation_steps,
                                      iteration)
                    writer.add_scalar('Ltr/train',
                                      optimizer.param_groups[0]['lr'],
                                      iteration)
                accum_batch_loss = 0
                if iteration % 500 == 0 and cudaid == 0:
                    torch.cuda.empty_cache()
                    model.eval()
                    auc = test(model, args)
                    print(auc)
                    writer.add_scalar('auc/valid', auc, step)
                    step += 1
                    if auc > best_score:
                        torch.save(
                            model.state_dict(),
                            os.path.join(args.save_dir,
                                         'Plain_robert_dot_best.pkl'))
                        best_score = auc
                        print('best score: ', best_score)
                    torch.cuda.empty_cache()
                    model.train()

        if cudaid == 0:
            torch.save(
                model.state_dict(),
                os.path.join(args.save_dir,
                             'Plain_robert_dot' + str(epoch) + '.pkl'))
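
The train/test variants that take cudaid as their first argument and call dist.init_process_group(init_method='env://') are shaped for torch.multiprocessing.spawn, which passes the process rank as the first positional argument. A minimal launch sketch under that assumption (the address/port values are placeholders, and args/model are built by the caller):

import os
import torch.multiprocessing as mp

if __name__ == '__main__':
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    os.environ.setdefault('MASTER_PORT', '29500')
    # Each spawned process receives its rank as `cudaid`.
    mp.spawn(train, args=(args, model), nprocs=args.size, join=True)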
Example #8
def test(model, args):  # valid
    model.eval()
    test_file = os.path.join(args.data_dir, args.data_file)
    cudaid = args.cudaid
    w = open(os.path.join(args.data_dir, args.log_file), 'w')
    feature_file = os.path.join(args.data_dir, args.feature_file)
    iterator = NewsIterator(batch_size=args.gpu_size,
                            npratio=-1,
                            feature_file=feature_file,
                            field=args.field)
    print('test...')
    with torch.no_grad():
        data_batch = iterator.load_test_data_from_file(test_file,
                                                       args.can_length)
        batch_t = 0
        for imp_index, user_index, his_id, candidate_id, label, can_len in data_batch:
            batch_t += len(candidate_id)
            his_id = his_id.cuda(cudaid)
            candidate_id = candidate_id.cuda(cudaid)
            logit = model.predict(his_id, candidate_id)
            logit = np.array(logit.cpu())
            imp_index = np.reshape(np.array(imp_index), -1)
            assert len(imp_index) == len(logit)

            for i in range(len(imp_index)):
                for j in range(can_len[i][0]):
                    assert len(label[i]) == can_len[i][0]
                    w.write('imp_index: ' + str(imp_index[i]) + ' logit: ' +
                            str(logit[i][j]) + ' label: ' + str(label[i][j]) +
                            '\n')
            print('imp_index: ', imp_index[-1])
    w.close()
Example #9
def train(model, optimizer, args):

    print('params: ', " T_warm: ", T_warm, " all_iteration: ", all_iteration,
          " lr: ", lr)
    cuda_list = range(args.size)
    cudaid = 0
    accumulation_steps = int(args.batch_size / args.size / 8)
    accum_batch_loss = 0
    iterator = NewsIterator(batch_size=8 * args.size,
                            npratio=4,
                            feature_file=os.path.join(args.data_dir,
                                                      args.feature_file),
                            field=args.field)
    train_file = os.path.join(args.data_dir, args.data_file)
    batch_t = 0
    iteration = 0
    print('train...', cuda_list)
    writer = SummaryWriter(os.path.join(args.data_dir, args.log_file))

    model.train()
    for epoch in range(0, 10):
        all_loss = 0
        all_batch = 0
        data_batch = iterator.load_data_from_file(train_file)
        for imp_index, user_index, his_id, candidate_id, label in data_batch:
            batch_t += 1
            assert candidate_id.shape[1] == 2
            his_id = his_id.cuda(cudaid)
            candidate_id = candidate_id.cuda(cudaid)
            label = label.cuda(cudaid)
            loss, sample_size = model(his_id, candidate_id, label)

            sample_size = float(sample_size.sum())
            loss = loss.sum() / sample_size / math.log(2)

            accum_batch_loss += float(loss)

            all_loss += float(loss)
            all_batch += 1

            loss = loss / accumulation_steps
            loss.backward()

            if (batch_t) % accumulation_steps == 0:

                iteration += 1
                adjust_learning_rate(optimizer, iteration)

                optimizer.step()
                optimizer.zero_grad()
                print(' batch_t: ', batch_t, ' iteration: ', iteration,
                      ' epoch: ', epoch, ' accum_batch_loss: ',
                      accum_batch_loss / accumulation_steps, ' lr: ',
                      optimizer.param_groups[0]['lr'])
                writer.add_scalar('Loss/train',
                                  accum_batch_loss / accumulation_steps,
                                  iteration)
                accum_batch_loss = 0
        torch.save(
            model.state_dict(),
            os.path.join(args.save_dir,
                         'Plain_robert_dot' + str(epoch) + '.pkl'))
Example #10
def test(model, args, cudaid):
    preds = np.array([])
    labels = np.array([])
    imp_indexes = np.array([])
    metrics = ['group_auc']
    test_file = os.path.join(args.data_dir, args.test_data_file)
    if args.test_feature_file is not None:
        feature_file = os.path.join(args.data_dir, args.test_feature_file)
    else:
        feature_file = os.path.join(args.data_dir, args.feature_file)
    iterator = NewsIterator(batch_size=1,
                            npratio=-1,
                            feature_file=feature_file,
                            field=args.field,
                            fp16=True)
    print('test...')
    step = 0
    with torch.no_grad():
        data_batch = iterator.load_test_data_from_file(test_file,
                                                       None,
                                                       rank=cudaid,
                                                       size=args.size)
        batch_t = 0
        for imp_index, user_index, his_id, candidate_id, label, _ in data_batch:
            batch_t += len(candidate_id)
            his_id = his_id.cuda(cudaid)
            candidate_id = candidate_id.cuda(cudaid)
            logit = model(his_id, candidate_id, None, mode='validation')

            logit = np.reshape(np.array(logit.data.cpu()), -1)
            label = np.reshape(np.array(label), -1)

            assert len(imp_index) == 1
            imp_index = np.repeat(imp_index, len(logit))

            assert len(logit) == len(label), (len(logit), len(label))
            assert len(logit) == len(imp_index)
            assert np.sum(label) != 0

            labels = np.concatenate((labels, label), axis=0)
            preds = np.concatenate((preds, logit), axis=0)
            imp_indexes = np.concatenate((imp_indexes, imp_index), axis=0)
            step += 1
            if step % 100 == 0:
                print('all data: ', len(labels), cudaid)

    return labels, preds, imp_indexes
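
This last variant shards the test file across ranks and returns raw per-rank arrays rather than a score. A sketch of how a caller might gather the shards and score them with the helpers from the earlier examples (the gathering step is an assumption; any collective that concatenates per-rank arrays would do):

labels, preds, imp_indexes = test(model, args, cudaid)
shards = [None] * args.size
dist.all_gather_object(shards, (labels, preds, imp_indexes))
if cudaid == 0:
    labels = np.concatenate([s[0] for s in shards])
    preds = np.concatenate([s[1] for s in shards])
    imp_indexes = np.concatenate([s[2] for s in shards])
    group_labels, group_preds = group_labels_func(labels, preds, imp_indexes)
    print(cal_metric(group_labels, group_preds, ['group_auc']))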