Example #1
File: data.py  Project: rciric/InfoBiGANs
def labeled_image_set(filename, shuffle=True):
    # read "image_path label" pairs, one per line
    with open(filename, 'r') as f:
        lines = f.readlines()
    data = []
    for line in lines:
        path, label = line.split()
        data.append((path, int(label)))
    if shuffle:
        data = list_shuffle(data)
    return data
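A note on list_shuffle: it is not part of this listing. A minimal sketch consistent with how it is called in these examples (a single list in Examples #1 and #3, several parallel lists in Example #4) could look like this; the two-mode behavior is an assumption:

import random

def list_shuffle(*lists):
    # one argument: return a shuffled copy of that list
    if len(lists) == 1:
        items = list(lists[0])
        random.shuffle(items)
        return items
    # several arguments: shuffle the parallel lists with one shared permutation
    zipped = list(zip(*lists))
    random.shuffle(zipped)
    return [list(t) for t in zip(*zipped)]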
Example #2
File: data.py  Project: rciric/InfoBiGANs
 def __init__(self,
              root,
              data,
              batch_size,
              minor_size,
              crop_size,
              crop_mode='random',
              num_workers=1,
              labels=None,
              max_images=None):
     assert crop_mode in self.crop_modes
     if labels is not None:
         label_set = set(labels)
         assert len(labels) > 0
         labels = list(label_set)
         labels.sort()
         print('Keeping %d labels: %s' % (len(labels), labels))
         # remap the N labels to the 0:(N-1) range
         label_to_index = {l: i for i, l in enumerate(labels)}
         data = [(image, label_to_index[l]) for image, l in data
                 if l in label_set]
     if (max_images is not None) and (len(data) > max_images):
         print('Shrinking dataset from %d images to %d images' %
               (len(data), max_images))
         data = list_shuffle(data)
         data = data[:max_images]
     # stash every constructor argument as an instance attribute
     self.__dict__.update(
         {k: v
          for k, v in locals().items() if k != 'self'})
     self.batch_indices = np.arange(batch_size)
     self.reset_data()
     self.pool = Pool(num_workers)
     self.map_result = None
     # sort crop sizes largest to smallest
     crop_size = sorted(crop_size, reverse=True)
     self.image_shapes = [(3, c, c) for c in crop_size]
     self.crop_shapes = [(c, c) for c in crop_size]
     self.out_data = [
         np.zeros((batch_size, ) + s, dtype=np.uint8)
         for s in self.image_shapes
     ]
     self.out_label = np.zeros(batch_size, dtype=np.int32)
     kwargs = dict(root=self.root,
                   minor_size=self.minor_size,
                   crop_shapes=self.crop_shapes,
                   crop_random=(crop_mode == 'random'))
     self.get_image = functools.partial(get_image, **kwargs)
     self.start_prefetch()
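Neither start_prefetch nor get_image appears in this listing. One plausible sketch of the prefetch step, assuming get_image accepts a single (filename, label) pair and that a batch wraps around the shuffled data, is:

 def start_prefetch(self):
     # hypothetical: dispatch the next batch_size (filename, label) pairs to
     # the worker pool; results are collected later via self.map_result.get()
     batch = [self.data[(self.index + i) % len(self.data)]
              for i in self.batch_indices]
     self.map_result = self.pool.map_async(self.get_image, batch)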
Example #3
File: data.py  Project: rciric/InfoBiGANs
 def reset_data(self):
     # reshuffle the dataset and restart iteration from the beginning
     self.data = list_shuffle(self.data)
     self.index = 0
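reset_data pairs with the index that advances through the shuffled data. A hypothetical consumer illustrating that contract (next_batch is not shown in this listing and is assumed):

 def next_batch(self):
     # hypothetical: reshuffle and restart once the remaining items cannot
     # fill a whole batch, then hand out the next contiguous slice
     if self.index + self.batch_size > len(self.data):
         self.reset_data()
     batch = self.data[self.index:self.index + self.batch_size]
     self.index += self.batch_size
     return batch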
Example #4
def train(config_path, resume=True):

    # Load the parameters
    param_dict, rep_param_dict = load_params(config_path)

    # use cuda flag
    use_cuda = True
    """
    the tranining directory
    """
    # load data
    # each MQ2007 split S1..S5 provides a training pickle and a test pickle;
    # build the per-split file lists once instead of repeating the glob call
    split_dirs = {
        i: "{}/MQ2007/S{}/".format(param_dict["data_base_path"], i)
        for i in range(1, 6)
    }
    train_files_by_split = {
        i: glob.glob("{}/data0.pkl".format(d)) for i, d in split_dirs.items()
    }
    test_files_by_split = {
        i: glob.glob("{}/testdata0.pkl".format(d)) for i, d in split_dirs.items()
    }

    fold = param_dict["fold"]
    model_base_path = param_dict['model_base_path']
    model_name_str = param_dict['model_name_str']
    q_len = param_dict["q_len"]
    d_len = param_dict["d_len"]

    # fold k trains on three consecutive splits and validates on the next one
    fold_splits = {
        1: ([1, 2, 3], 4),
        2: ([2, 3, 4], 5),
        3: ([3, 4, 5], 1),
        4: ([4, 5, 1], 2),
        5: ([5, 1, 2], 3),
    }
    if fold not in fold_splits:
        raise ValueError("wrong fold num {}".format(fold))
    train_splits, test_split = fold_splits[fold]
    train_files = sum((train_files_by_split[i] for i in train_splits), [])
    # glob returns a list; each test split holds a single pickle, so take the path string
    test_files = test_files_by_split[test_split][0]
    rel_path = '{}/{}/tmp/test/S{}.qrels'.format(model_base_path,
                                                 model_name_str, test_split)
    """
    Build the model
    """
    emb_size = param_dict['emb_size']
    num_heads = param_dict['num_heads']
    kernel_size = rep_param_dict['kernel_size']
    filt_size = rep_param_dict['filt_size']
    vocab_size = param_dict['vocab_size']
    output_dim = rep_param_dict['output_dim']
    hidden_size = param_dict['hidden_size']
    batch_size = param_dict['batch_size']
    preemb = param_dict['preemb']
    emb_path = param_dict['emb_path']
    hinge_margin = param_dict['hinge_margin']

    model = Attention(emb_size=emb_size,
                      query_length=q_len,
                      doc_length=d_len,
                      num_heads=num_heads,
                      kernel_size=kernel_size,
                      filter_size=filt_size,
                      vocab_size=vocab_size,
                      dropout=0.0,
                      qrep_dim=output_dim,
                      hidden_size=hidden_size,
                      batch_size=batch_size,
                      preemb=preemb,
                      emb_path=emb_path)

    if use_cuda:
        model.cuda()
    # optimizer
    optimizer = optim.Adam(model.parameters(),
                           lr=param_dict['learning_rate'],
                           betas=(param_dict['beta1'], param_dict['beta2']),
                           weight_decay=param_dict['alpha'])
    # loss func (note: this criterion is defined but unused below;
    # the training loop calls hinge_loss directly instead)
    loss = nn.MarginRankingLoss(margin=hinge_margin, size_average=True)
    # experiment
    print("Experiment")

    # append to the logs when resuming, otherwise start them fresh;
    # the third argument makes the files line-buffered
    log_mode = 'a+' if resume else 'w+'
    f_log = open(
        '{}/{}/logs/training_log.txt'.format(model_base_path, model_name_str),
        log_mode, 1)
    valid_log = open(
        '{}/{}/logs/valid_log.txt'.format(model_base_path, model_name_str),
        log_mode, 1)

    # model_file
    model_file = '{}/{}/saves/model_file'.format(model_base_path,
                                                 model_name_str)
    """
    TRAINING
    """

    # define the parameters
    n_epoch = param_dict['n_epoch']
    # init best validation MAP value
    best_MAP = 0.0
    best_NDCG1 = 0.0
    batch_count_tr = 0
    # restore saved parameters if resuming training
    if resume:
        model.load_state_dict(torch.load(model_file))
        with open(
                '{}/{}/saves/best_MAP.pkl'.format(model_base_path,
                                                  model_name_str),
                'rb') as f_MAP:
            best_MAP = pickle.load(f_MAP)
        print("loaded model, and resume training now")

    # the validation set does not change during training, so load it once
    valid_data = load_dataset(test_files)

    for epoch in range(1, n_epoch + 1):
        '''load data'''
        for f in train_files:
            data = load_dataset(f)
            print("loaded {}".format(f))
            '''prepare data'''
            [Q, D_pos, D_neg, L] = pair_data_generator(data, q_len)
            '''shuffle data'''
            train_data = list_shuffle(Q, D_pos, D_neg, L)
            '''training func'''

            num_batch = len(train_data[0]) // batch_size
            for batch_count in range(num_batch):
                start = batch_size * batch_count
                end = batch_size * (batch_count + 1)
                Q = train_data[0][start:end]
                D_pos = train_data[1][start:end]
                D_neg = train_data[2][start:end]
                L = train_data[3][start:end]
                # pre-0.4 PyTorch Variable API: pad each sequence batch to a
                # fixed length, wrap it as an input tensor, then move it to
                # the GPU only once
                Q = Variable(torch.LongTensor(
                    pad_batch_list(Q, max_len=q_len, padding_id=0)),
                             requires_grad=False)
                D_pos = Variable(torch.LongTensor(
                    pad_batch_list(D_pos, max_len=d_len, padding_id=0)),
                                 requires_grad=False)
                D_neg = Variable(torch.LongTensor(
                    pad_batch_list(D_neg, max_len=d_len, padding_id=0)),
                                 requires_grad=False)
                L = Variable(torch.FloatTensor(L), requires_grad=False)
                if use_cuda:
                    Q, D_pos, D_neg = Q.cuda(), D_pos.cuda(), D_neg.cuda()
                    L = L.cuda()

                # run on this batch
                optimizer.zero_grad()
                t1 = time.time()

                q_mask, d_pos_mask, d_neg_mask = model.generate_mask(
                    Q, D_pos, D_neg)
                """
                need to do the modification i the model.py
                """
                S_pos, S_neg = model(Q, D_pos, D_neg, q_mask, d_pos_mask,
                                     d_neg_mask)
                # note: the margin here is fixed at 1.0 rather than taken
                # from the configured hinge_margin
                Loss = hinge_loss(S_pos, S_neg, 1.0)
                Loss.backward()
                optimizer.step()
                t2 = time.time()
                batch_count_tr += 1
                print("epoch {} batch {} training cost: {} using {}s" \
                .format(epoch, batch_count+1, Loss.data[0], t2-t1))
                f_log.write("epoch {} batch {} training cost: {}, using {}s".
                            format(epoch, batch_count + 1, Loss.data[0], t2 -
                                   t1) + '\n')
                """
                evaluate part
                """
                if batch_count_tr % 20 == 0:
                    if valid_data is not None:
                        MAP, NDCGs = evaluate(config_path,
                                              model,
                                              valid_data,
                                              rel_path,
                                              mode="valid")
                        print(MAP, NDCGs)
                        valid_log.write(
                            "epoch {}, batch {}, MAP: {}, NDCGs: {} {} {} {}\n".
                            format(epoch, batch_count + 1, MAP,
                                   NDCGs[1][0], NDCGs[1][1], NDCGs[1][2],
                                   NDCGs[1][3]))
                        if MAP > best_MAP:  # save this best model
                            best_MAP = MAP
                            with open(
                                    '{}/{}/saves/best_MAP.pkl'.format(
                                        model_base_path, model_name_str),
                                    'wb') as f_MAP:
                                pickle.dump(best_MAP, f_MAP)
                            # save the model parameters whenever validation
                            # MAP improves
                            model_file = '{}/{}/saves/model_file'.format(
                                model_base_path, model_name_str)
                            torch.save(model.state_dict(), model_file)
                            print("successfully saved model to the path {}".
                                  format(model_file))

                        valid_log.write("{} {} {} {}".format(
                            NDCGs[1][0], NDCGs[1][1], NDCGs[1][2],
                            NDCGs[1][3]))
                        valid_log.write(" MAP: {}".format(MAP))
                        valid_log.write('\n')
    f_log.close()
    valid_log.close()
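Both hinge_loss and pad_batch_list are imported from elsewhere in the project. Minimal sketches consistent with their use above follow; the exact project implementations may differ. hinge_loss mirrors nn.MarginRankingLoss with a target of 1, and pad_batch_list right-pads token-id sequences:

import torch

def hinge_loss(S_pos, S_neg, margin=1.0):
    # mean over the batch of max(0, margin - (S_pos - S_neg)); zero once the
    # positive document outscores the negative one by at least the margin
    return torch.clamp(margin - (S_pos - S_neg), min=0).mean()

def pad_batch_list(batch, max_len, padding_id=0):
    # truncate each token-id sequence to max_len, then right-pad with padding_id
    return [seq[:max_len] + [padding_id] * (max_len - len(seq[:max_len]))
            for seq in batch]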