import functools
from multiprocessing import Pool

import numpy as np


def labeled_image_set(filename, shuffle=True):
    """Read (image path, integer label) pairs from a whitespace-separated text file."""
    with open(filename, 'r') as f:
        lines = f.readlines()
    data = []
    for line in lines:
        # use distinct names so the file handle `f` is not shadowed
        path, label = line.split()
        data.append((path, int(label)))
    if shuffle:
        data = list_shuffle(data)
    return data
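# NOTE: list_shuffle is not defined in this excerpt. Below is a minimal
# sketch, assuming it shuffles a single list (as above) or several parallel
# lists in unison (as in train() further down); the project's real helper
# may differ.
import random

def list_shuffle(*lists):
    # draw one permutation and apply it to every list, keeping them aligned
    order = list(range(len(lists[0])))
    random.shuffle(order)
    shuffled = [[lst[i] for i in order] for lst in lists]
    return shuffled[0] if len(lists) == 1 else shuffled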
def __init__(self, root, data, batch_size, minor_size, crop_size,
             crop_mode='random', num_workers=1, labels=None, max_images=None):
    assert crop_mode in self.crop_modes
    if labels is not None:
        assert len(labels) > 0
        label_set = set(labels)
        labels = sorted(label_set)
        print('Keeping %d labels: %s' % (len(labels), labels))
        # remap the N kept labels to the 0:(N-1) range
        label_to_index = {l: i for i, l in enumerate(labels)}
        data = [(image, label_to_index[l])
                for image, l in data if l in label_set]
    if (max_images is not None) and (len(data) > max_images):
        print('Shrinking dataset from %d images to %d images'
              % (len(data), max_images))
        data = list_shuffle(data)
        data = data[:max_images]
    # stash every constructor argument (and local) as an attribute
    self.__dict__.update(
        {k: v for k, v in locals().items() if k != 'self'})
    self.batch_indices = np.arange(batch_size)
    self.reset_data()
    self.pool = Pool(num_workers)
    self.map_result = None
    # sort crop sizes largest to smallest
    crop_size = sorted(crop_size, reverse=True)
    self.image_shapes = [(3, c, c) for c in crop_size]
    self.crop_shapes = [(c, c) for c in crop_size]
    self.out_data = [
        np.zeros((batch_size, ) + s, dtype=np.uint8)
        for s in self.image_shapes
    ]
    self.out_label = np.zeros(batch_size, dtype=np.int32)
    kwargs = dict(root=self.root, minor_size=self.minor_size,
                  crop_shapes=self.crop_shapes,
                  crop_random=(crop_mode == 'random'))
    self.get_image = functools.partial(get_image, **kwargs)
    self.start_prefetch()
def reset_data(self):
    self.data = list_shuffle(self.data)
    self.index = 0
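# Hypothetical usage of the loader above. The enclosing class name is not
# shown in this excerpt, so `BatchLoader` and all paths/values here are
# placeholders rather than the project's actual API:
#
#   data = labeled_image_set('train_labels.txt')   # [(path, label), ...]
#   loader = BatchLoader(root='/data/images', data=data, batch_size=64,
#                        minor_size=256, crop_size=[227, 224],
#                        crop_mode='random', num_workers=4)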
import glob
import pickle
import time

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

# load_params, load_dataset, pair_data_generator, evaluate, Attention,
# list_shuffle, pad_batch_list and hinge_loss are project-local helpers
# defined elsewhere in the repo.


def train(config_path, resume=True):
    # Load the parameters
    param_dict, rep_param_dict = load_params(config_path)
    # use cuda flag
    use_cuda = True

    """ the training directories """
    # load data; train and test pickles live in the same split directories,
    # differing only by filename (data0.pkl vs. testdata0.pkl)
    TRAIN_DIR01 = "{}/MQ2007/S1/".format(param_dict["data_base_path"])
    TRAIN_DIR02 = "{}/MQ2007/S2/".format(param_dict["data_base_path"])
    TRAIN_DIR03 = "{}/MQ2007/S3/".format(param_dict["data_base_path"])
    TRAIN_DIR04 = "{}/MQ2007/S4/".format(param_dict["data_base_path"])
    TRAIN_DIR05 = "{}/MQ2007/S5/".format(param_dict["data_base_path"])
    TEST_DIR01 = '{}/MQ2007/S1/'.format(param_dict["data_base_path"])
    TEST_DIR02 = '{}/MQ2007/S2/'.format(param_dict["data_base_path"])
    TEST_DIR03 = '{}/MQ2007/S3/'.format(param_dict["data_base_path"])
    TEST_DIR04 = '{}/MQ2007/S4/'.format(param_dict["data_base_path"])
    TEST_DIR05 = '{}/MQ2007/S5/'.format(param_dict["data_base_path"])
    train_files01 = glob.glob("{}/data0.pkl".format(TRAIN_DIR01))
    train_files02 = glob.glob("{}/data0.pkl".format(TRAIN_DIR02))
    train_files03 = glob.glob("{}/data0.pkl".format(TRAIN_DIR03))
    train_files04 = glob.glob("{}/data0.pkl".format(TRAIN_DIR04))
    train_files05 = glob.glob("{}/data0.pkl".format(TRAIN_DIR05))
    test_files01 = glob.glob("{}/testdata0.pkl".format(TEST_DIR01))
    test_files02 = glob.glob("{}/testdata0.pkl".format(TEST_DIR02))
    test_files03 = glob.glob("{}/testdata0.pkl".format(TEST_DIR03))
    test_files04 = glob.glob("{}/testdata0.pkl".format(TEST_DIR04))
    test_files05 = glob.glob("{}/testdata0.pkl".format(TEST_DIR05))

    fold = param_dict["fold"]
    model_base_path = param_dict['model_base_path']
    model_name_str = param_dict['model_name_str']
    q_len = param_dict["q_len"]
    d_len = param_dict["d_len"]

    # each fold trains on three splits and tests on a held-out split
    if fold == 1:
        train_files = train_files01 + train_files02 + train_files03
        test_files = test_files04[0]  # glob returns a path list; take the str
        rel_path = '{}/{}/tmp/test/S4.qrels'.format(model_base_path,
                                                    model_name_str)
    elif fold == 2:
        train_files = train_files02 + train_files03 + train_files04
        test_files = test_files05[0]
        rel_path = '{}/{}/tmp/test/S5.qrels'.format(model_base_path,
                                                    model_name_str)
    elif fold == 3:
        train_files = train_files03 + train_files04 + train_files05
        test_files = test_files01[0]
        rel_path = '{}/{}/tmp/test/S1.qrels'.format(model_base_path,
                                                    model_name_str)
    elif fold == 4:
        train_files = train_files04 + train_files05 + train_files01
        test_files = test_files02[0]
        rel_path = '{}/{}/tmp/test/S2.qrels'.format(model_base_path,
                                                    model_name_str)
    elif fold == 5:
        train_files = train_files05 + train_files01 + train_files02
        test_files = test_files03[0]
        rel_path = '{}/{}/tmp/test/S3.qrels'.format(model_base_path,
                                                    model_name_str)
    else:
        raise ValueError("wrong fold num {}".format(fold))

    """ Build the model """
    emb_size = param_dict['emb_size']
    num_heads = param_dict['num_heads']
    kernel_size = rep_param_dict['kernel_size']
    filt_size = rep_param_dict['filt_size']
    vocab_size = param_dict['vocab_size']
    output_dim = rep_param_dict['output_dim']
    hidden_size = param_dict['hidden_size']
    batch_size = param_dict['batch_size']
    preemb = param_dict['preemb']
    emb_path = param_dict['emb_path']
    hinge_margin = param_dict['hinge_margin']
    model = Attention(emb_size=emb_size, query_length=q_len,
                      doc_length=d_len, num_heads=num_heads,
                      kernel_size=kernel_size, filter_size=filt_size,
                      vocab_size=vocab_size, dropout=0.0,
                      qrep_dim=output_dim, hidden_size=hidden_size,
                      batch_size=batch_size, preemb=preemb,
                      emb_path=emb_path)
    if use_cuda:
        model.cuda()

    # optimizer
    optimizer = optim.Adam(model.parameters(),
                           lr=param_dict['learning_rate'],
                           betas=(param_dict['beta1'], param_dict['beta2']),
                           weight_decay=param_dict['alpha'])
    # loss func (constructed here, though the loop below calls the
    # project's hinge_loss helper instead)
    loss = nn.MarginRankingLoss(margin=hinge_margin, size_average=True)

    # experiment logs: truncate on a fresh run, append when resuming
    print("Experiment")
    if not resume:
        f_log = open('{}/{}/logs/training_log.txt'.format(
            model_base_path, model_name_str), 'w+', 1)
        valid_log = open('{}/{}/logs/valid_log.txt'.format(
            model_base_path, model_name_str), 'w+', 1)
    else:
        f_log = open('{}/{}/logs/training_log.txt'.format(
            model_base_path, model_name_str), 'a+', 1)
        valid_log = open('{}/{}/logs/valid_log.txt'.format(
            model_base_path, model_name_str), 'a+', 1)
    # model file
    model_file = '{}/{}/saves/model_file'.format(model_base_path,
                                                 model_name_str)

    """ TRAINING """
    # define the parameters
    n_epoch = param_dict['n_epoch']
    # init best validation MAP value
    best_MAP = 0.0
    best_NDCG1 = 0.0
    batch_count_tr = 0
    # restore saved parameters if resuming
    if resume:
        model.load_state_dict(torch.load(model_file))
        with open('{}/{}/saves/best_MAP.pkl'.format(
                model_base_path, model_name_str), 'rb') as f_MAP:
            best_MAP = pickle.load(f_MAP)
        print("loaded model, and resume training now")

    for epoch in range(1, n_epoch + 1):
        '''load data'''
        for f in train_files:
            data = load_dataset(f)
            print("loaded {}".format(f))
            '''prepare data'''
            [Q, D_pos, D_neg, L] = pair_data_generator(data, q_len)
            valid_data = load_dataset(test_files)
            '''shuffle data'''
            train_data = list_shuffle(Q, D_pos, D_neg, L)
            '''training func'''
            num_batch = len(train_data[0]) // batch_size
            for batch_count in range(num_batch):
                lo = batch_size * batch_count
                hi = batch_size * (batch_count + 1)
                Q = train_data[0][lo:hi]
                D_pos = train_data[1][lo:hi]
                D_neg = train_data[2][lo:hi]
                L = train_data[3][lo:hi]
                # pad to fixed length, wrap as Variables, and move to GPU
                Q = Variable(torch.LongTensor(
                    pad_batch_list(Q, max_len=q_len, padding_id=0)),
                    requires_grad=False)
                D_pos = Variable(torch.LongTensor(
                    pad_batch_list(D_pos, max_len=d_len, padding_id=0)),
                    requires_grad=False)
                D_neg = Variable(torch.LongTensor(
                    pad_batch_list(D_neg, max_len=d_len, padding_id=0)),
                    requires_grad=False)
                L = Variable(torch.FloatTensor(L), requires_grad=False)
                if use_cuda:
                    Q, D_pos, D_neg, L = (Q.cuda(), D_pos.cuda(),
                                          D_neg.cuda(), L.cuda())
                # run on this batch
                optimizer.zero_grad()
                t1 = time.time()
                q_mask, d_pos_mask, d_neg_mask = model.generate_mask(
                    Q, D_pos, D_neg)
                """ need to do the modification in the model.py """
                S_pos, S_neg = model(Q, D_pos, D_neg,
                                     q_mask, d_pos_mask, d_neg_mask)
                Loss = hinge_loss(S_pos, S_neg, 1.0)
                Loss.backward()
                optimizer.step()
                t2 = time.time()
                batch_count_tr += 1
                print("epoch {} batch {} training cost: {} using {}s"
                      .format(epoch, batch_count + 1, Loss.data[0], t2 - t1))
                f_log.write("epoch {} batch {} training cost: {}, using {}s"
                            .format(epoch, batch_count + 1,
                                    Loss.data[0], t2 - t1) + '\n')

                """ evaluate part """
                # validate every 20 training batches
                if batch_count_tr % 20 == 0:
                    if valid_data is not None:
                        MAP, NDCGs = evaluate(config_path, model, valid_data,
                                              rel_path, mode="valid")
                        print(MAP, NDCGs)
                        valid_log.write(
                            "epoch {}, batch {}, MAP: {}, NDCGs: {} {} {} {}".
                            format(epoch, batch_count + 1, MAP,
                                   NDCGs[1][0], NDCGs[1][1],
                                   NDCGs[1][2], NDCGs[1][3]))
                        if MAP > best_MAP:
                            # save the best model seen so far
                            best_MAP = MAP
                            with open('{}/{}/saves/best_MAP.pkl'.format(
                                    model_base_path, model_name_str),
                                      'wb') as f_MAP:
                                pickle.dump(best_MAP, f_MAP)
                            # save model params
                            model_file = '{}/{}/saves/model_file'.format(
                                model_base_path, model_name_str)
                            torch.save(model.state_dict(), model_file)
                            print("successfully saved model to the path {}"
                                  .format(model_file))
                            valid_log.write("{} {} {} {}".format(
                                NDCGs[1][0], NDCGs[1][1],
                                NDCGs[1][2], NDCGs[1][3]))
                            valid_log.write(" MAP: {}".format(MAP))
                            valid_log.write('\n')
    f_log.close()
    valid_log.close()
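# The two helpers below are called in train() but defined elsewhere in the
# project; these are minimal sketches consistent with how they are used
# above, not the actual implementations.

def pad_batch_list(seqs, max_len, padding_id=0):
    # truncate each token-id sequence to max_len, right-pad with padding_id
    return [s[:max_len] + [padding_id] * max(0, max_len - len(s))
            for s in seqs]

def hinge_loss(S_pos, S_neg, margin):
    # pairwise ranking hinge: mean(max(0, margin - S_pos + S_neg));
    # equivalent to nn.MarginRankingLoss(margin) with target y = +1
    return torch.clamp(margin - S_pos + S_neg, min=0.0).mean()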