def train_model(self):
    gen_batch_index = DataIterator(np.arange(self.users_num), batch_size=self.batchSize_G, shuffle=True)
    dis_batch_index = DataIterator(np.arange(self.users_num), batch_size=self.batchSize_D, shuffle=True)
    totalEpochs = self.epochs
    totalEpochs = int(totalEpochs / self.step_G)
    for epoch in range(totalEpochs):
        train_matrix, ZR_matrix, PM_matrix = self.get_train_data()
        # training discriminator
        for d_epoch in range(self.step_D):
            for idx in dis_batch_index:
                train_data = train_matrix[idx].toarray()
                train_mask = PM_matrix[idx].toarray()
                feed = {self.realData: train_data,
                        self.mask: train_mask,
                        self.condition: train_data}
                self.sess.run(self.trainer_D, feed_dict=feed)
        # training generator
        for g_epoch in range(self.step_G):
            for idx in gen_batch_index:
                train_data = train_matrix[idx].toarray()
                train_z_mask = ZR_matrix[idx].toarray()
                train_p_mask = PM_matrix[idx].toarray()
                feed = {self.realData: train_data,
                        self.condition: train_data,
                        self.mask: train_p_mask,
                        self.G_ZR_dims: train_z_mask}
                self.sess.run(self.trainer_G, feed_dict=feed)
            result = self.evaluate_model()
            print("%d_G:\t%s" % (epoch * self.step_G + g_epoch, result))
def evaluate(self, model):
    # B: batch size
    # N: the number of items
    test_users = DataIterator(list(self.user_pos_test.keys()), batch_size=2048, shuffle=False, drop_last=False)
    batch_result = []
    for batch_users in test_users:
        ranking_score = model.predict_for_eval(batch_users)  # (B, N)
        # set the ranking scores of training items to -inf so that
        # training items are sorted to the end of the ranking list
        for idx, user in enumerate(batch_users):
            train_items = self.user_pos_train[user]
            ranking_score[idx][train_items] = -np.inf
        test_items = []
        for user in batch_users:
            # 'dtype=np.intc' ensures the correct integer type for the C++ backend
            u_items = np.array(self.user_pos_test[user], dtype=np.intc, copy=True)
            test_items.append(u_items)
        result = eval_score_matrix(ranking_score, test_items, top_k=50, thread_num=None)  # (B, k*metric_num)
        batch_result.append(result)
    # concatenate the batch results into a matrix and average over users
    all_user_result = np.concatenate(batch_result, axis=0)
    final_result = np.mean(all_user_result, axis=0)
    return final_result
def train_model(self):
    self.logger.info(self.evaluator.metrics_info())
    for epoch in range(self.epochs):
        users, pos_items, neg_items = csr_to_pairwise(self.train_matrix, neg_num=self.neg_num, fold_neg=True)
        data = DataIterator(users, pos_items, neg_items, batch_size=self.batch_size, shuffle=True)
        for batch_users, batch_pos_items, batch_neg_items in data:
            feed = {self.user_h: batch_users,
                    self.pos_item_h: batch_pos_items,
                    self.neg_item_h: batch_neg_items}
            self.sess.run(self.update, feed_dict=feed)
        result = self.evaluate_model()
        self.logger.info("epoch %d:\t%s" % (epoch, result))
        # save params every 50 epochs
        if (epoch + 1) % 50 == 0:
            params = self.sess.run(self.parameters)
            file_name = "%s_d=%d_e=%d_dnsbpr.pkl" % (self.dataset.name, self.factors_num, epoch)
            with open(file_name, "wb") as fout:
                pickle.dump(params, fout)
def __init__(self, train_data, test_data, batch_size=32, randomize=True, n_tasks=5):
    self.it = DataIterator(train_data, test_data, batch_size, randomize, n_tasks=n_tasks)
    self.train_x = self.it.train_x
    self.train_y = self.it.train_y
    self.test_x = self.it.test_x
    self.test_y = self.it.test_y
    self.i = 0
    self.batch_size = batch_size
    self.n_tasks = n_tasks
    assert n_tasks == 5
    print("labels are 0/1, 2/3, 4/5, 6/7, 8/9")
    self.generate_tasks([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]])
    self.img_fn = self.it.img_fn
    self.reshape_dims = (28 * 28, )
    self.switch_task(0)
def predict_for_eval(self, users):
    user_iter = DataIterator(users, batch_size=1024, shuffle=False, drop_last=False)
    all_ratings = []
    for bat_users in user_iter:
        tmp_rating = self.sess.run(self.pre_logits, feed_dict={self.user_h: bat_users})
        all_ratings.extend(tmp_rating)
    all_ratings = np.array(all_ratings, dtype=np.float32)
    return all_ratings
def train_model(self):
    self.logger.info(self.evaluator.metrics_info())
    for epoch in range(self.epochs):
        users, pos_items, neg_items = csr_to_pairwise(self.train_matrix, neg_num=self.dns, fold_neg=True)
        data = DataIterator(users, pos_items, neg_items, batch_size=self.batch_size, shuffle=True)
        for user, pos_item, neg_item in data:
            feed = {self.user_h: user,
                    self.pos_item_h: pos_item,
                    self.neg_item_h: neg_item}
            self.sess.run(self.update, feed_dict=feed)
        result = self.evaluate_model()
        self.logger.info("epoch %d:\t%s" % (epoch, result))
def predict_for_eval(self, users):
    users_iter = DataIterator(users, batch_size=1024, shuffle=False)
    all_ratings = []
    for batch in users_iter:
        eval_data = self.train_matrix[batch].toarray()
        tmp_rating = self.sess.run(self.G_output, feed_dict={self.condition: eval_data})
        all_ratings.extend(tmp_rating)
    all_ratings = np.array(all_ratings, dtype=np.float32)
    if self.mode == "itemBased":
        all_ratings = np.transpose(all_ratings)
    return all_ratings
def get_train_data(self):
    users_list = []
    items_list = []
    for user, items in self.user_pos_train.items():
        users_list.extend([user] * len(items))
        items_list.extend(items)
    dataloader = DataIterator(users_list, items_list, batch_size=self.batch_size, shuffle=True)
    return dataloader
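# A minimal sketch (not from the original source) of consuming the DataIterator returned by
# get_train_data(): it is assumed to yield aligned (users, items) batches as constructed above;
# 'model' and 'train_step' are hypothetical placeholders.
def run_one_epoch(model, train_step):
    for bat_users, bat_items in model.get_train_data():
        train_step(bat_users, bat_items)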
def predict_for_eval(self, users):
    users = DataIterator(users, batch_size=1024, shuffle=False, drop_last=False)
    all_ratings = []
    for bat_user in users:
        eval_data = self.train_matrix[bat_user].toarray()
        feed = {self.uid_ph: bat_user, self.input_ph: eval_data}
        tmp_rating = self.sess.run(self.output, feed_dict=feed)
        all_ratings.extend(tmp_rating)
    all_ratings = np.array(all_ratings, dtype=np.float32)
    return all_ratings
def get_train_data(self):
    users, pos_items, neg_items = csr_to_pairwise(self.train_matrix, neg_num=1, fold_neg=False)
    users_list, items_list, labels_list = [], [], []
    # positive examples
    users_list.extend(users)
    items_list.extend(pos_items)
    labels_list.extend([1] * len(pos_items))
    # negative examples
    users_list.extend(users)
    items_list.extend(neg_items)
    labels_list.extend([0] * len(neg_items))
    dataloader = DataIterator(users_list, items_list, labels_list, batch_size=self.batch_size, shuffle=True)
    return dataloader
def _pre_training(self):
    # pretrain
    self.logger.info("Pre-training")
    for epoch in range(self.adv_epoch):
        users, pos_items, neg_items = csr_to_pairwise(self.train_matrix, neg_num=self.dns, fold_neg=True)
        data = DataIterator(users, pos_items, neg_items, batch_size=self.batch_size, shuffle=True)
        for user_input, item_input_pos, item_dns_list in data:
            feed_dict = {self.user_input: user_input,
                         self.item_input_pos: item_input_pos,
                         self.item_input_neg: item_dns_list}
            self.sess.run(self.bpr_optimizer, feed_dict)
        result = self.evaluate_model()
        self.logger.info("%d:\t%s" % (epoch, result))
def _adversarial_training(self):
    # adversarial training
    self.logger.info("Adversarial training")
    for epoch in range(self.adv_epoch, self.epochs):
        users, pos_items, neg_items = csr_to_pairwise(self.train_matrix, neg_num=1, fold_neg=True)
        data = DataIterator(users, pos_items, neg_items, batch_size=self.batch_size, shuffle=True)
        for user_input, item_input_pos, item_input_neg in data:
            feed_dict = {self.user_input: user_input,
                         self.item_input_pos: item_input_pos,
                         self.item_input_neg: item_input_neg,
                         self.steps: epoch}
            self.sess.run([self.update_P, self.update_Q], feed_dict)
            self.sess.run(self.amf_optimizer, feed_dict)
        result = self.evaluate_model()
        self.logger.info("%d:\t%s" % (epoch, result))
def train_model(self):
    data = DataIterator(np.arange(self.users_num), batch_size=self.batch_size, shuffle=True)
    for epoch in range(self.epochs):
        corrupt_input, mask = self.get_train_data()
        for bat_user in data:
            train_data = corrupt_input[bat_user].toarray()
            train_mask = mask[bat_user].toarray()
            labels = self.train_matrix[bat_user].toarray()
            feed = {self.uid_ph: bat_user,
                    self.input_ph: train_data,
                    self.mask_ph: train_mask,
                    self.label_ph: labels}
            self.sess.run(self.update_opt, feed_dict=feed)
        result = self.evaluate_model()
        self.logger.info("epoch %d:\t%s" % (epoch, result))
    # print(len(true_label_list))
    # print(pred_label_list)
    # print(true_label_list)
    # save_weight = np.array([1.03283219, 0.97672083, 0.94315084])
    # pred_logit_list = save_weight * np.array(pred_logit_list)
    # pred_label_list = np.argmax(pred_logit_list, axis=1) - 1
    # dev_df = pd.read_csv(config.data_process + 'processed_data/new_dev_df.csv', encoding='utf_8_sig')
    # dev_df['pred_label'] = pred_label_list
    # dev_df.to_csv(config.data_process + 'compare_result.csv', index=False, encoding='utf_8_sig')


if __name__ == '__main__':
    config = Config()
    vocab_file = config.vocab_file
    do_lower_case = False
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

    print('Predicting test.txt..........')
    dev_iter = DataIterator(config.batch_size,
                            data_file=config.data_process + 'processed_data/new_dev_df.csv',
                            use_bert=config.use_bert,
                            seq_length=config.sequence_length,
                            is_test=True,
                            tokenizer=tokenizer)
    # print('Predicting dev.txt..........')
    # dev_iter = DataIterator(config.batch_size, data_file=result_data_dir + 'dev.txt', use_bert=config.use_bert,
    #                         seq_length=config.sequence_length, is_test=True, tokenizer=tokenizer)
    set_test(dev_iter, config.checkpoint_path)
    final_ev_dict[key] = dict()
    for e_key in ev.keys():
        final_ev_dict[key][e_key] = round(sum(ev[e_key]) / len(ev[e_key]), 4)
    # print(final_ev_dict)
    for key in final_ev_dict.keys():
        ev_p = final_ev_dict[key]['precision']
        ev_r = final_ev_dict[key]['recall']
        ev_f1 = final_ev_dict[key]['f1-score']
        print(key, ev_p, ev_r, ev_f1)

    f1 = f1 / len(target_names)
    precision = precision / len(target_names)
    recall = recall / len(target_names)
    print('{:.4f} {:.4f} {:.4f}'.format(precision, recall, f1))

    with open(result_data_dir + 'result.json', 'w', encoding='utf-8') as f:
        json.dump(pred_answer, f, ensure_ascii=False)


if __name__ == '__main__':
    config = Config()
    tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=config.model_path,
                                              do_lower_case=True,
                                              never_split=["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"])

    print('Predicting test.txt..........')
    dev_iter = DataIterator(config.batch_size,
                            config.processed_data + 'dev.txt',
                            pretrainning_model=config.pretrainning_model,
                            seq_length=config.sequence_length,
                            is_test=True,
                            tokenizer=tokenizer)
    set_test(dev_iter, config.checkpoint_path)
def comput_p(true_list, pred_list):
    c = 0
    for i in range(len(true_list)):
        if true_list[i] == pred_list[i]:
            c += 1
    return c / len(true_list)


if __name__ == '__main__':
    config = Config()
    vocab_file = config.vocab_file  # general-purpose vocabulary
    do_lower_case = True
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
    re_tokenzier = Tokenizer(vocab_file, do_lower_case)

    train_iter = DataIterator(config.batch_size,
                              data_file=config.process + 'train.csv',
                              use_bert=config.use_bert,
                              tokenizer=tokenizer,
                              seq_length=config.sequence_length,
                              config=config)
    dev_iter = DataIterator(config.batch_size,
                            data_file=config.process + 'dev.csv',
                            use_bert=config.use_bert,
                            tokenizer=tokenizer,
                            seq_length=config.sequence_length,
                            is_test=True,
                            config=config)
    train(train_iter, dev_iter, config)  # fold2 new_answer
    questionlen_list.extend(querylen_list)
    allmaping_list.extend(mapping_list)
    context_list.extend(text_list)
    cls_prob_list.extend(pred_c)

    pred_answer, C = refind_answer(predict_df, all_uid_list, start_prob_list, end_prob_list,
                                   questionlen_list, allmaping_list, context_list, cls_prob_list)
    predict_df['answer'] = pred_answer
    predict_df.to_csv(config.processed_data + 'result_dev.csv')


if __name__ == '__main__':
    config = Config()
    tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=config.model_path,
                                              do_lower_case=True,
                                              never_split=["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"])

    test_iter = DataIterator(config.batch_size,
                             data_file=config.processed_data + 'test.csv',
                             config=config,
                             tokenizer=tokenizer)
    predict_file = config.processed_data + 'NCPPolicies_test.csv'
    print('Predicting {}..........'.format(str(predict_file)))
    test_df = pd.read_csv(predict_file, sep='\t', error_bad_lines=False)
    set_test(test_iter, config.checkpoint_path, test_df)
    :return:
    """
    # compute the max of each row
    row_max = x.max(axis=axis)
    # subtract the per-row max from every element; otherwise exp(x) may overflow and produce inf
    row_max = row_max.reshape(-1, 1)
    x = x - row_max
    # compute the exponentials and normalize
    x_exp = np.exp(x)
    x_sum = np.sum(x_exp, axis=axis, keepdims=True)
    s = x_exp / x_sum
    return s


if __name__ == '__main__':
    config = Config()
    tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=config.model_path,
                                              do_lower_case=False,
                                              never_split=["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"])

    train_iter = DataIterator(config.batch_size,
                              data_file=config.processed_data + 'train.csv',
                              config=config,
                              tokenizer=tokenizer)
    dev_iter = DataIterator(config.batch_size,
                            data_file=config.processed_data + 'dev.csv',
                            config=config,
                            tokenizer=tokenizer)
    train(train_iter, dev_iter, config=config)
def prepare_data_mlc(gold_fraction, corruption_prob, corruption_type, args):
    from load_corrupted_data import CIFAR10, CIFAR100

    mean = [x / 255 for x in [125.3, 123.0, 113.9]]
    std = [x / 255 for x in [63.0, 62.1, 66.7]]

    train_transform = transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32, padding=4),
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])
    test_transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize(mean, std)])

    # since cifar10 and cifar100 have no official validation split, use gold as valid also
    if args.dataset == 'cifar10':
        train_data_gold = CIFAR10(args.data_path, True, True, gold_fraction, corruption_prob,
                                  args.corruption_type, transform=train_transform, download=True,
                                  distinguish_gold=False, seed=args.seed)
        train_data_silver = CIFAR10(args.data_path, True, False, gold_fraction, corruption_prob,
                                    args.corruption_type, transform=train_transform, download=True,
                                    shuffle_indices=train_data_gold.shuffle_indices, seed=args.seed,
                                    distinguish_gold=False, weaklabel=args.weaklabel)  # note here for the change
        train_data_gold_deterministic = CIFAR10(args.data_path, True, True, gold_fraction, corruption_prob,
                                                args.corruption_type, transform=test_transform, download=True,
                                                shuffle_indices=train_data_gold.shuffle_indices,
                                                distinguish_gold=False, seed=args.seed)
        test_data = CIFAR10(args.data_path, train=False, transform=test_transform, download=True,
                            distinguish_gold=False, seed=args.seed)
        # same as gold
        valid_data = CIFAR10(args.data_path, True, True, gold_fraction, corruption_prob,
                             args.corruption_type, transform=train_transform, download=True,
                             distinguish_gold=False, seed=args.seed)
        num_classes = 10

    elif args.dataset == 'cifar100':
        train_data_gold = CIFAR100(args.data_path, True, True, gold_fraction, corruption_prob,
                                   args.corruption_type, transform=train_transform, download=True,
                                   distinguish_gold=False, seed=args.seed)
        train_data_silver = CIFAR100(args.data_path, True, False, gold_fraction, corruption_prob,
                                     args.corruption_type, transform=train_transform, download=True,
                                     shuffle_indices=train_data_gold.shuffle_indices, seed=args.seed,
                                     distinguish_gold=False, weaklabel=args.weaklabel)  # note the weaklabel arg
        train_data_gold_deterministic = CIFAR100(args.data_path, True, True, gold_fraction, corruption_prob,
                                                 args.corruption_type, transform=test_transform, download=True,
                                                 shuffle_indices=train_data_gold.shuffle_indices,
                                                 distinguish_gold=False, seed=args.seed)
        test_data = CIFAR100(args.data_path, train=False, transform=test_transform, download=True,
                             distinguish_gold=False, seed=args.seed)
        # same as gold
        valid_data = CIFAR100(args.data_path, True, True, gold_fraction, corruption_prob,
                              args.corruption_type, transform=train_transform, download=True,
                              distinguish_gold=False, seed=args.seed)
        num_classes = 100

    gold_sampler = None
    silver_sampler = None
    valid_sampler = None
    test_sampler = None
    batch_size = args.bs

    train_gold_loader = DataIterator(
        torch.utils.data.DataLoader(train_data_gold, batch_size=batch_size, shuffle=(gold_sampler is None),
                                    num_workers=args.prefetch, pin_memory=True, sampler=gold_sampler))
    train_silver_loader = torch.utils.data.DataLoader(train_data_silver, batch_size=batch_size,
                                                      shuffle=(silver_sampler is None),
                                                      num_workers=args.prefetch, pin_memory=True,
                                                      sampler=silver_sampler)
    valid_loader = torch.utils.data.DataLoader(valid_data, batch_size=batch_size,
                                               shuffle=(valid_sampler is None),
                                               num_workers=args.prefetch, pin_memory=True,
                                               sampler=valid_sampler)
    test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size,
                                              shuffle=(test_sampler is None),
                                              num_workers=args.prefetch, pin_memory=True,
                                              sampler=test_sampler)

    return train_gold_loader, train_silver_loader, valid_loader, test_loader, num_classes
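# Sketch (not from the original source): only the gold loader is wrapped in DataIterator, so a
# training loop can draw a gold batch on demand with next() while stepping through the silver
# loader epoch-wise. 'meta_step' and 'train_step' are hypothetical callables supplied by the
# surrounding training code; DataIterator is assumed to yield batches indefinitely, as in the
# next()-based usage in RunConfig below.
def sketch_training_loop(net, train_gold_loader, train_silver_loader, meta_step, train_step):
    for silver_images, silver_labels in train_silver_loader:
        gold_images, gold_labels = train_gold_loader.next()  # gold batch sampled on demand
        meta_step(net, gold_images, gold_labels)
        train_step(net, silver_images, silver_labels)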
""" if __name__ == '__main__': config = Config() # 得到emb矩阵 print('loading word2vec mat1...') embeddings_1 = gensim.models.KeyedVectors.load_word2vec_format(config.model_path + 'w2v_model/category_256.txt', binary=False) print('loading word2vec mat2...') embeddings_2 = gensim.models.KeyedVectors.load_word2vec_format(config.model_path + 'w2v_model/industry_256.txt', binary=False) print('loading word2vec mat3...') embeddings_3 = gensim.models.KeyedVectors.load_word2vec_format(config.model_path + 'w2v_model/product_256.txt', binary=False) print('loading word2vec mat4...') embeddings_4 = gensim.models.KeyedVectors.load_word2vec_format(config.model_path + 'w2v_model/advertiser_256.txt', binary=False) print('loading word2vec mat5...') embeddings_5 = gensim.models.KeyedVectors.load_word2vec_format(config.model_path + 'w2v_model/creative_256.txt', binary=False) print('model_loadding done') # dev_iter = DataIterator(config.test_batch_size, data_file=config.data_processed + 'new_test_{}_{}.csv'.format(start,end), # use_bert=config.use_bert,seq_length=config.sequence_length, is_test=True, tokenizer=tokenizer) dev_iter = DataIterator(config.batch_size, embeddings_1, embeddings_2, embeddings_3, embeddings_4, embeddings_5, data_file=config.corpus_path + 'test.csv', seq_length=config.sequence_length,config=config) set_test(dev_iter, config.checkpoint_path)
class RunConfig:

    def __init__(self, train_dir, test_dir, n_epochs, init_lr, gpu, local_rank,
                 lr_schedule_type, lr_schedule_param, dataset, train_batch_size,
                 test_batch_size, valid_size, opt_type, opt_param, weight_decay,
                 label_smoothing, no_decay_keys, model_init, init_div_groups,
                 validation_frequency, print_frequency, train_iters):
        self.n_epochs = n_epochs
        self.init_lr = init_lr
        self.lr_schedule_type = lr_schedule_type
        self.lr_schedule_param = lr_schedule_param
        self.dataset = dataset
        self.train_batch_size = train_batch_size
        self.test_batch_size = test_batch_size
        self.valid_size = valid_size
        self.train_dir = train_dir
        self.test_dir = test_dir
        self.opt_type = opt_type
        self.opt_param = opt_param
        self.weight_decay = weight_decay
        self.label_smoothing = label_smoothing
        self.no_decay_keys = no_decay_keys
        self.model_init = model_init
        self.init_div_groups = init_div_groups
        self.validation_frequency = validation_frequency
        self.print_frequency = print_frequency

        self._data_provider = None
        self._train_iter, self._valid_iter, self._test_iter = None, None, None
        self.train_iters = train_iters
        self.val_iters = self.valid_size // self.test_batch_size
        print('test_batch_size={}, valid_size={}, val_iters={}'.format(
            self.test_batch_size, self.valid_size, self.val_iters))

        # Prepare data
        train_loader = get_train_dataloader(self.train_dir, self.train_batch_size, shuffle=True)
        self.train_dataprovider = DataIterator(train_loader)
        val_loader = get_val_dataloader(self.test_dir, self.test_batch_size)
        self.val_dataprovider = DataIterator(val_loader)

        self.data_shape = (3, 224, 224)
        self.n_classes = 1000
        self.gpu = gpu

    @property
    def config(self):
        config = {}
        for key in self.__dict__:
            if not key.startswith('_'):
                config[key] = self.__dict__[key]
        return config

    def copy(self):
        return RunConfig(**self.config)

    """ learning rate """

    def _calc_learning_rate(self, epoch, batch=0, nBatch=None):
        if self.lr_schedule_type == 'cosine':
            T_total = self.n_epochs * nBatch
            T_cur = epoch * nBatch + batch
            lr = 0.5 * self.init_lr * (1 + math.cos(math.pi * T_cur / T_total))
        else:
            raise ValueError('do not support: %s' % self.lr_schedule_type)
        return lr

    def adjust_learning_rate(self, optimizer, epoch, batch=0, nBatch=None):
        """ adjust learning rate of a given optimizer and return the new learning rate """
        new_lr = self._calc_learning_rate(epoch, batch, nBatch)
        for param_group in optimizer.param_groups:
            param_group['lr'] = new_lr
        return new_lr

    """ data provider """

    @property
    def data_config(self):
        raise NotImplementedError

    @property
    def data_provider(self):
        if self._data_provider is None:
            if self.dataset == 'imagenet':
                self._data_provider = self.train_dataprovider
            else:
                raise ValueError('do not support: %s' % self.dataset)
        return self._data_provider

    @data_provider.setter
    def data_provider(self, val):
        self._data_provider = val

    @property
    def train_loader(self):
        return self.train_dataprovider

    @property
    def valid_loader(self):
        return self.val_dataprovider

    @property
    def test_loader(self):
        return self.val_dataprovider

    @property
    def train_next_batch(self):
        try:
            images, labels = self.train_loader.next()
            images = Variable(images, requires_grad=False)
            labels = Variable(labels, requires_grad=False)
        except StopIteration:
            assert False, 'error'
        return images, labels

    @property
    def valid_next_batch(self):
        try:
            images, labels = self.val_dataprovider.next()
            images = Variable(images, requires_grad=False)
            labels = Variable(labels, requires_grad=False)
        except StopIteration:
            assert False, 'error'
        return images, labels

    @property
    def test_next_batch(self):
        try:
            images, labels = self.val_dataprovider.next()
            images = Variable(images, requires_grad=False)
            labels = Variable(labels, requires_grad=False)
        except StopIteration:
            assert False, 'error'
        return images, labels

    """ optimizer """

    def build_optimizer(self, net_params):
        if self.opt_type == 'sgd':
            opt_param = {} if self.opt_param is None else self.opt_param
            momentum, nesterov = opt_param.get('momentum', 0.9), opt_param.get('nesterov', True)
            if self.no_decay_keys:
                optimizer = torch.optim.SGD([
                    {'params': net_params[0], 'weight_decay': self.weight_decay},
                    {'params': net_params[1], 'weight_decay': 0},
                ], lr=self.init_lr, momentum=momentum, nesterov=nesterov)
            else:
                optimizer = torch.optim.SGD(net_params, self.init_lr, momentum=momentum,
                                            nesterov=nesterov, weight_decay=self.weight_decay)
        else:
            raise NotImplementedError
        return optimizer
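# A minimal sketch (not from the original source) of driving RunConfig: build_optimizer is called
# once (assuming no_decay_keys is not set, so model.parameters() can be passed directly), and
# adjust_learning_rate is called once per batch so the cosine schedule decays the learning rate
# from init_lr toward 0 over n_epochs. 'train_one_batch' is a hypothetical callable.
def sketch_run(run_config, model, nBatch, train_one_batch):
    optimizer = run_config.build_optimizer(model.parameters())
    for epoch in range(run_config.n_epochs):
        for batch in range(nBatch):
            run_config.adjust_learning_rate(optimizer, epoch, batch, nBatch)
            images, labels = run_config.train_next_batch  # property returning the next training batch
            train_one_batch(model, optimizer, images, labels)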
    re.write(R)

    n_df = pd.DataFrame()
    id = list(pred_answer_dict.keys())
    answer = list(pred_answer_dict.values())
    n_df['id'] = id
    n_df['answer'] = answer
    no_df = n_df[n_df['answer'] == '']
    print('{} questions have no answer found'.format(len(no_df)))
    no_df.to_csv('/'.join(config.checkpoint_path.split('/')[:-1]) + '/no_answer.csv',
                 index=False, encoding='utf_8_sig')


if __name__ == '__main__':
    config = Config()
    vocab_file = config.vocab_file
    do_lower_case = False
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

    print('Predicting test.txt..........')
    dev_iter = DataIterator(config.test_batch_size,
                            # data_file=Config().data + 'test_mrc.csv',
                            config.process + 'test.csv',
                            use_bert=config.use_bert,
                            seq_length=config.sequence_length,
                            is_test=True,
                            tokenizer=tokenizer,
                            config=config)
    set_test(dev_iter, config.checkpoint_path)
    age_auc = accuracy_score(true_age_list, pred_age_list)
    gender_auc = accuracy_score(true_gender_list, pred_gender_list)
    print('focal_auc {}, age_auc {}, gender_auc {}'.format(age_auc + gender_auc, age_auc, gender_auc))
    return age_auc, gender_auc


if __name__ == '__main__':
    config = Config()
    vocab_file = config.vocab_file  # general-purpose vocabulary
    do_lower_case = False
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

    train_iter = DataIterator(config.batch_size,
                              data_file=config.data_processed + 'new_train.csv',
                              use_bert=config.use_bert,
                              tokenizer=tokenizer,
                              seq_length=config.sequence_length)
    dev_iter = DataIterator(config.batch_size,
                            data_file=config.data_processed + 'new_dev.csv',
                            use_bert=config.use_bert,
                            tokenizer=tokenizer,
                            seq_length=config.sequence_length,
                            is_test=True)
    train(train_iter, dev_iter, config)
def prepare_data_mwnet(gold_fraction, corruption_prob, corruption_type, args):
    from load_corrupted_data_mlg import CIFAR10, CIFAR100

    normalize = transforms.Normalize(mean=[x / 255.0 for x in [125.3, 123.0, 113.9]],
                                     std=[x / 255.0 for x in [63.0, 62.1, 66.7]])

    if True:  # no augment as used by mwnet
        train_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Lambda(lambda x: F.pad(x.unsqueeze(0), (4, 4, 4, 4), mode='reflect').squeeze()),
            transforms.ToPILImage(),
            transforms.RandomCrop(32),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    else:
        train_transform = transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ])
    test_transform = transforms.Compose([transforms.ToTensor(), normalize])

    args.num_meta = int(50000 * gold_fraction)

    if args.dataset == 'cifar10':
        num_classes = 10
        train_data_meta = CIFAR10(root=args.data_path, train=True, meta=True, num_meta=args.num_meta,
                                  corruption_prob=corruption_prob, corruption_type=args.corruption_type,
                                  transform=train_transform, download=True)
        train_data = CIFAR10(root=args.data_path, train=True, meta=False, num_meta=args.num_meta,
                             corruption_prob=corruption_prob, corruption_type=args.corruption_type,
                             transform=train_transform, download=True, seed=args.seed)
        test_data = CIFAR10(root=args.data_path, train=False, transform=test_transform, download=True)
        valid_data = CIFAR10(root=args.data_path, train=True, meta=True, num_meta=args.num_meta,
                             corruption_prob=corruption_prob, corruption_type=args.corruption_type,
                             transform=train_transform, download=True)
    elif args.dataset == 'cifar100':
        num_classes = 100
        train_data_meta = CIFAR100(root=args.data_path, train=True, meta=True, num_meta=args.num_meta,
                                   corruption_prob=corruption_prob, corruption_type=args.corruption_type,
                                   transform=train_transform, download=True)
        train_data = CIFAR100(root=args.data_path, train=True, meta=False, num_meta=args.num_meta,
                              corruption_prob=corruption_prob, corruption_type=args.corruption_type,
                              transform=train_transform, download=True, seed=args.seed)
        test_data = CIFAR100(root=args.data_path, train=False, transform=test_transform, download=True)
        valid_data = CIFAR100(root=args.data_path, train=True, meta=True, num_meta=args.num_meta,
                              corruption_prob=corruption_prob, corruption_type=args.corruption_type,
                              transform=train_transform, download=True)

    train_gold_loader = DataIterator(
        torch.utils.data.DataLoader(train_data_meta, batch_size=args.bs, shuffle=True,
                                    num_workers=args.prefetch, pin_memory=True))
    train_silver_loader = torch.utils.data.DataLoader(train_data, batch_size=args.bs, shuffle=True,
                                                      num_workers=args.prefetch, pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(valid_data, batch_size=args.bs, shuffle=True,
                                               num_workers=args.prefetch, pin_memory=True)
    test_loader = torch.utils.data.DataLoader(test_data, batch_size=args.bs, shuffle=False,
                                              num_workers=args.prefetch, pin_memory=True)

    return train_gold_loader, train_silver_loader, valid_loader, test_loader, num_classes
def prepare_data(args):
    num_classes = 14

    # resnet recommended normalization
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    # transform
    # Note: rescaling to 224 and center-cropping already processed in img folders
    transform = transforms.Compose([
        transforms.ToTensor(),  # to [0, 1]
        normalize
    ])

    train_data_gold = torchvision.datasets.ImageFolder('data/clothing1M/clean_train', transform=transform)
    train_data_silver = torchvision.datasets.ImageFolder('data/clothing1M/noisy_train', transform=transform)
    val_data = torchvision.datasets.ImageFolder('data/clothing1M/clean_val', transform=transform)
    test_data = torchvision.datasets.ImageFolder('data/clothing1M/clean_test', transform=transform)

    # fix class idx to match the class name
    _fix_cls_to_idx(train_data_gold)
    _fix_cls_to_idx(train_data_silver)
    _fix_cls_to_idx(val_data)
    _fix_cls_to_idx(test_data)

    gold_sampler = None
    silver_sampler = None
    val_sampler = None
    test_sampler = None
    batch_size = args.bs

    train_gold_loader = DataIterator(
        torch.utils.data.DataLoader(train_data_gold, batch_size=batch_size, shuffle=(gold_sampler is None),
                                    num_workers=args.prefetch, pin_memory=True, sampler=gold_sampler))
    train_silver_loader = torch.utils.data.DataLoader(train_data_silver, batch_size=batch_size,
                                                      shuffle=(silver_sampler is None),
                                                      num_workers=args.prefetch, pin_memory=True,
                                                      sampler=silver_sampler)
    val_loader = torch.utils.data.DataLoader(val_data, batch_size=batch_size, shuffle=(val_sampler is None),
                                             num_workers=args.prefetch, pin_memory=True, sampler=val_sampler)
    test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=(test_sampler is None),
                                              num_workers=args.prefetch, pin_memory=True, sampler=test_sampler)

    return train_gold_loader, train_silver_loader, val_loader, test_loader, num_classes
    cls_pre = np.argmax(cls_probs, axis=-1)
    pred_label_list += list(cls_pre)
    print(len(pred_label_list))
    # print(pred_label_list)
    # print(true_label_list)

    test_result_pd = pd.read_csv(config.base_dir + 'dev.csv', encoding='utf8')
    test_result_pd['pred'] = pred_label_list
    true_list = test_result_pd['num_label'].tolist()

    from sklearn.metrics import f1_score
    F1 = f1_score(true_list, pred_label_list, average='micro')
    print('F1:', F1)
    test_result_pd.to_csv(config.base_dir + 'result.csv', index=False, encoding='utf-8')


if __name__ == '__main__':
    config = Config()
    vocab_file = config.vocab_file
    do_lower_case = False
    tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=config.model_path,
                                              do_lower_case=True,
                                              never_split=["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"])

    print('Predicting test.txt..........')
    dev_iter = DataIterator(config.batch_size,
                            config.base_dir + 'dev.csv',
                            use_bert=config.use_bert,
                            seq_length=config.sequence_length,
                            is_test=True,
                            tokenizer=tokenizer)
    set_test(dev_iter, config.checkpoint_path)
        'y_pred_text': ldct_list_text
    }
    df = pd.DataFrame(dict_data)
    precision, recall, f1 = get_P_R_F(df)
    print('precision: {}, recall {}, f1 {}'.format(precision, recall, f1))
    return precision, recall


if __name__ == '__main__':
    config = Config()
    vocab_file = config.vocab_file  # general-purpose vocabulary
    do_lower_case = False
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

    train_iter = DataIterator(config.batch_size,
                              data_file=result_data_dir + 'train.txt',
                              use_bert=config.use_bert,
                              tokenizer=tokenizer,
                              seq_length=config.sequence_length)
    print('GET!!')
    dev_iter = DataIterator(config.batch_size,
                            data_file=result_data_dir + 'dev.txt',
                            use_bert=config.use_bert,
                            tokenizer=tokenizer,
                            seq_length=config.sequence_length,
                            is_test=True)
    train(train_iter, dev_iter, config)
class Seq2Seq:
    """Encoder-Decoder model with Luong attention, stacking and residual links."""

    def __init__(self, indexer, trainPairs, trainLens, testPairs, testLens,
                 batchSize=5, hiddenSize=10, nLayers=2, dropout=0.1, residual=True,
                 lr=1e-4, enforcingRatio=0.5, clip=5.0, resultSavePath='mscoco/results.txt'):
        """
        Args:
            indexer: an Indexer object.
            trainPairs, testPairs: each is a list of pairs of word index list.
            trainLens, testLens: each is a list of pairs of length of word index list.
            batchSize: int. (default=5)
            hiddenSize: int. (default=10)
            nLayers: number of GRU stacking layers. (default=2)
            dropout: dropout rate. (default=0.1)
            residual: boolean, whether to establish residual links. (default=True)
            lr: learning rate, float. (default=1e-4 with Adam)
            enforcingRatio: the percentage of teacher-enforced training. (default=0.5)
            clip: gradient clip cap, float. (default=5.0)
            resultSavePath: (input,prediction,target) sentence triples file path.
        """
        self.indexer = indexer
        self.trainIter = DataIterator(indexer, trainPairs, trainLens)
        self.testIter = DataIterator(indexer, testPairs, testLens)
        self.batchSize = batchSize
        self.hiddenSize = hiddenSize
        self.nLayers = nLayers
        self.dropout = dropout
        self.residual = residual
        self.lr = lr
        self.enforcingRatio = enforcingRatio
        self.clip = clip
        self.resultSavePath = resultSavePath
        self._build_model()

    def _build_model(self):
        """Specify computational graph."""
        self.encoder = EncoderRNN(self.indexer.size, self.hiddenSize,
                                  nLayers=self.nLayers, dropout=self.dropout)
        self.decoder = LuongDecoderRNN(self.hiddenSize, self.indexer.size,
                                       nLayers=self.nLayers, dropout=self.dropout,
                                       residual=self.residual)
        self.encoderOptim = optim.Adam(self.encoder.parameters(), self.lr)
        self.decoderOptim = optim.Adam(self.decoder.parameters(), self.lr)

    def _model_config(self):
        return 'Vocab Size = ' + str(self.indexer.size) + '\n' + \
               'Train/Test Size = ' + str(self.trainIter.size) + '/' + str(self.testIter.size) + '\n' + \
               'batchSize = ' + str(self.batchSize) + '; hiddenSize = ' + str(self.hiddenSize) + '\n' + \
               'nLayers = ' + str(self.nLayers) + '; dropout = ' + str(self.dropout) + '\n' + \
               'residual = ' + str(self.residual) + '; learning rate = ' + str(self.lr) + '\n' + \
               'teacher enforce ratio = ' + str(self.enforcingRatio) + '; clip = ' + str(self.clip) + '\n'

    def _train_step(self):
        """One step of training."""
        inputs, inputsLen, targets, targetsLen = self.trainIter.random_batch(self.batchSize)
        self.encoderOptim.zero_grad()
        self.decoderOptim.zero_grad()
        loss = 0

        # Run encoder
        encoderHidden = None
        encoderOutput, encoderHidden = self.encoder(inputs, inputsLen, encoderHidden)

        # Run decoder
        decoderInput = Variable(torch.LongTensor([self.indexer.get_index('SOS')] * self.batchSize))
        decoderContext = Variable(torch.zeros(self.batchSize, self.decoder.hiddenSize))
        decoderHidden = encoderHidden
        enforce = random.random() < self.enforcingRatio
        maxTargetLen = max(targetsLen)
        decoderOutputAll = Variable(torch.zeros(maxTargetLen, self.batchSize,
                                                self.decoder.outputSize))  # <mt-max,bc,vocab>
        for di in range(maxTargetLen):
            decoderOutput, decoderHidden, decoderContext, attentionWeights = self.decoder(
                decoderInput, decoderHidden, decoderContext, encoderOutput)
            decoderOutputAll[di] = decoderOutput
            if enforce:
                decoderInput = targets[di]  # <== targets is <mt,bc>
            else:
                topValues, topIndices = decoderOutput.data.topk(1)  # <bc,1>
                decoderInput = Variable(topIndices.squeeze())  # <bc,1> -> <bc,>

        # Sequence cross entropy
        loss = batch_cross_entropy(decoderOutputAll.transpose(0, 1).contiguous(),
                                   targets.transpose(0, 1).contiguous(),
                                   targetsLen, self.batchSize)

        # Backprop
        loss.backward()
        torch.nn.utils.clip_grad_norm(self.encoder.parameters(), self.clip)
        torch.nn.utils.clip_grad_norm(self.decoder.parameters(), self.clip)
        self.encoderOptim.step()
        self.decoderOptim.step()

        return loss.data[0] / targetsLen

    def train(self, nEpochs=1, epochSize=100, printEvery=5):
        """Train on loaded data upon construction.

        Args:
            nEpochs: number of epochs.
            epochSize: number of batches trained in an epoch.
            printEvery: frequency of results report.
        """
        averageLoss = 0
        start = time.time()
        for e in range(nEpochs):
            epochLoss = 0
            for step in range(epochSize):
                loss = self._train_step()
                if step != 0 and step % printEvery == 0:
                    print("Step %d average loss = %.4f (time: %.2f)" %
                          (step,
                           loss.mean(),  # batch mean.
                           time.time() - start))
                    start = time.time()
                epochLoss += loss.mean()
            epochLoss /= epochSize
            averageLoss += epochLoss
            print("\nEpoch %d loss = %.4f\n" % (e + 1, epochLoss))
        averageLoss /= nEpochs
        print("\nGrand average loss = %.4f\n" % averageLoss)

    def _clear_special_tokens(self, words):
        """Clear all the PAD, UNK, SOS, EOS to avoid inflated BLEU.

        Args:
            words: a list of tokens.

        Returns:
            a list of tokens which are not special tokens.
        """
        return [word for word in words
                if word not in set(["PAD", "UNK", "SOS", "EOS"])]

    def evaluate_pair(self, predWords, targetWords):
        """Compute the BLEU score of a prediction given a reference.

        Args:
            predWords: predicted words (a list of strings).
            targetWords: reference, same type as predWords.

        Returns:
            The BLEU score (uses nltk.translate.bleu_score.sentence_bleu).
        """
        return bleu([self._clear_special_tokens(targetWords)],
                    self._clear_special_tokens(predWords))

    def evaluate_random(self, size, saveResults, printResults=True):
        """Randomly evaluate samples from the test set (which is loaded upon construction).

        Args:
            size: number of samples evaluated (as a single batch).
            printResults: print input, prediction and gold translation to console. (default=True)

        Returns:
            The average BLEU score in the batch.
        """
        inputs, inputsLen, targets, targetsLen = self.testIter.random_batch(size)

        # Run encoder
        encoderHidden = None
        encoderOutput, encoderHidden = self.encoder(inputs, inputsLen, encoderHidden)

        # Run decoder
        decoderInput = Variable(torch.LongTensor([self.indexer.get_index('SOS')] * size))
        decoderContext = Variable(torch.zeros(size, self.decoder.hiddenSize))
        decoderHidden = encoderHidden
        maxTargetLen = max(targetsLen)
        predictions = []
        for di in range(maxTargetLen):
            decoderOutput, decoderHidden, decoderContext, attentionWeights = self.decoder(
                decoderInput, decoderHidden, decoderContext, encoderOutput)
            topValues, topIndices = decoderOutput.data.topk(1)  # <bc,1>
            decoderInput = Variable(topIndices.squeeze())  # <bc,1> -> <bc,>
            predictions.append(topIndices.view(-1).numpy())

        inputs = inputs.data.numpy().transpose()
        predictions = np.array(predictions).transpose()  # <mt,bc> -> <bc,mt>
        targets = targets.data.numpy().transpose()

        bleuList = []
        results = []
        for i, (input, pred, target) in enumerate(zip(inputs, predictions, targets)):
            predWords = self.indexer.to_words(pred)
            targetWords = self.indexer.to_words(target)
            bleuCurr = self.evaluate_pair(predWords, targetWords)
            bleuList.append(bleuCurr)
            inputSent = self.indexer.to_sent(input)
            predSent = self.indexer.to_sent(pred)
            targetSent = self.indexer.to_sent(target)
            results.append([inputSent, predSent, targetSent])
            if printResults:
                print("Example %d" % (i + 1))
                print("INPUT >> %s" % inputSent)
                print("PRED >> %s" % predSent)
                print("TRUE >> %s" % targetSent)
                print("[BLEU] %.2f\n" % bleuCurr)
        averageBleu = np.mean(bleuList)
        if saveResults:
            return averageBleu, results
        return averageBleu

    def evaluate(self, nBatches=10, saveResults=True):
        """Randomly evaluate a given number of batches.

        Args:
            nBatches: the number of random batches to be evaluated.
        """
        averageBleuList = []
        for i in range(nBatches):
            if saveResults:
                averageBleu, results = self.evaluate_random(self.batchSize, saveResults,
                                                            printResults=False)
                averageBleuList.append(averageBleu)
                with open(self.resultSavePath, 'a') as f:
                    if i == 0:
                        f.write(self._model_config())
                    for input, pred, target in results:
                        f.write('INPUT >> ' + input + '\n')
                        f.write('PRED >> ' + pred + '\n')
                        f.write('TARGET >> ' + target + '\n\n')
            else:
                averageBleuList.append(self.evaluate_random(self.batchSize, saveResults,
                                                            printResults=False))
        print("Average BLEU score over %d examples is %.4f" %
              (self.batchSize * nBatches, np.mean(averageBleuList)))

    def evaluate_given(self, sent, maxLen=20):
        """Evaluate a given sentence.

        Args:
            sent: a single string. OOVs are treated as UNKs.
            maxLen: the max number of decoding steps.
        """
        sent = sent.split()
        sentCode = [self.indexer.get_index(word, add=False) for word in sent]
        if any(i == -1 for i in sentCode):
            raise Exception("This sentence contains out of vocabulary words!")
        input = Variable(torch.LongTensor(sentCode)).view(-1, 1)
        inputLen = np.array([len(sentCode)])

        # Run encoder
        encoderHidden = None
        encoderOutput, encoderHidden = self.encoder(input, inputLen, encoderHidden)

        # Run decoder
        decoderInput = Variable(torch.LongTensor([self.indexer.get_index('SOS')] * 1))
        decoderContext = Variable(torch.zeros(1, self.decoder.hiddenSize))
        decoderHidden = encoderHidden
        pred = []
        for di in range(maxLen):
            decoderOutput, decoderHidden, decoderContext, attentionWeights = self.decoder(
                decoderInput, decoderHidden, decoderContext, encoderOutput)
            topValues, topIndices = decoderOutput.data.topk(1)  # <bc,1>
            decoderInput = Variable(topIndices.squeeze())  # <bc,1> -> <bc,>
            predIndex = topIndices.view(-1).numpy()[0]
            if predIndex == self.indexer.get_index('EOS'):
                break
            pred.append(predIndex)
        print("INPUT >> %s" % ' '.join(sent))
        print("PRED >> %s\n" % ' '.join(self.indexer.to_words(pred)))