def main(**kwargs):
    opt.parse(kwargs)
    model = getattr(models, opt.model)(opt).cuda().eval()
    if opt.model_path is not None:
        model.load(opt.model_path)
    # re-apply the command-line overrides after loading the checkpoint
    opt.parse(kwargs)
    model = model.eval()

    test_data_title, test_data_content, index2qid, labels = load_data(type_=opt.type_)
    Num = len(test_data_title)
    print("Num: ", Num)

    # per-sample probabilities over all 25556 labels
    result = np.zeros((Num, 25556))
    # run inference batch by batch; the last batch may be smaller than batch_size
    for start in tqdm.tqdm(range(0, Num, opt.batch_size)):
        end = min(start + opt.batch_size, Num)
        title = np.array(test_data_title[start:end])
        content = np.array(test_data_content[start:end])
        result[start:end, :] = dotest(model, title, content)

    t.save(t.from_numpy(result).float(), opt.result_path)

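# NOTE (editor's sketch, not part of the original code): `dotest` is not defined in this
# section. It is assumed to run one forward pass on a numpy batch and return the
# per-class probabilities as a numpy array of shape (batch, 25556). A minimal version,
# assuming the old-PyTorch Variable API used elsewhere in these scripts, might be:
def dotest(model, title, content):
    title = Variable(t.from_numpy(title).long().cuda(), volatile=True)
    content = Variable(t.from_numpy(content).long().cuda(), volatile=True)
    score = model(title, content)
    probs = t.nn.functional.sigmoid(score)  # multi-label probability per class
    return probs.data.cpu().numpy()
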
def main(**kwargs):
    opt.parse(kwargs)
    files = glob.glob(opt.test_result_path + '/*.Pre')
    l = []
    for file in files:
        print(file)
        l.append(t.load(file))

    # pairwise L1 distance between the per-class precision vectors (upper triangle only)
    loss = [[0 for _ in range(len(l))] for _ in range(len(l))]
    for i in range(len(l)):
        for j in range(len(l)):
            s = 0
            if i < j:
                for _ in range(opt.num_classes):
                    s += abs(l[i][_] - l[j][_])
                loss[i][j] = s
    print(loss)

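# NOTE (editor's sketch, not part of the original code): the pairwise L1 distances above
# can be computed in one shot, assuming every loaded `.Pre` file is a 1-D tensor of
# length opt.num_classes and a torch version that supports broadcasting:
def pairwise_l1(tensors):
    m = t.stack([x.float() for x in tensors])              # (num_files, num_classes)
    return (m.unsqueeze(0) - m.unsqueeze(1)).abs().sum(2)  # (num_files, num_files)
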
def main(**kwargs):
    maxTop = 1
    weight = 1
    opt.parse(kwargs)
    outfile = opt.ensemble_outfile
    test_data_path = opt.test_data_path
    single_model_score_file = opt.single_model_score_file
    pth_data_path = opt.inpath
    index2qid = np.load(test_data_path)['index2qid'].item()

    files = glob.glob(pth_data_path + '/*.pth')
    print(files)
    modle_kind_name = 'baseline_greedy'

    # single-model validation scores: one "<checkpoint>,<score>" row per file
    single_score = {}
    fin = open(single_model_score_file, 'r', encoding='utf8')
    reader = csv.reader(fin)
    single_model_score_file_data = list(reader)
    for _ in range(len(single_model_score_file_data)):
        single_score[single_model_score_file_data[_][0].split('/')[-1].strip()] = \
            float(single_model_score_file_data[_][1])
    print(single_score)

    # header row: the checkpoint file names
    csv_line = []
    single_score_of_files_rank = []
    for file in files:
        csv_line.append(os.path.split(file)[1])
        single_score_of_files_rank.append(
            single_score[os.path.split(file)[1][::-1].split('.', 1)[1][::-1]])
    f = open(outfile, 'a+', encoding='utf-8')
    writer = csv.writer(f)
    writer.writerow(csv_line)
    f.close()

    # second row: their single-model scores; scoreDic maps file index -> score
    scoreDic = {}
    tNum = len(files)
    dNum = 0
    csv_line = []
    for _ in range(len(files)):
        this_score = single_score[files[_].split('/')[-1].strip().replace('.pth', '')]
        scoreDic[str(_)] = this_score
        csv_line.append(this_score)
    f = open(outfile, 'a+', encoding='utf-8')
    writer = csv.writer(f)
    writer.writerow(csv_line)
    f.close()

    # greedy ensembling without labels: scoring against labels is disabled here
    # (test data), and scoreDic_new is never refilled, so only the first turn runs
    turn = 1
    maxTurn = tNum
    scoreDic_new = scoreDic
    while turn < maxTurn:
        if len(scoreDic_new) == 0:
            break
        scoreDic = scoreDic_new
        scoreDic_new = {}
        rankscorelist = sorted(scoreDic.items(), key=lambda d: d[1], reverse=True)
        maxscore = float(rankscorelist[0][1])
        print(rankscorelist)
        top = 0
        while top < maxTop and top < len(rankscorelist):
            donelist = [int(_) for _ in rankscorelist[top][0].split('.')]
            print(donelist)
            selfscore = rankscorelist[top][1]
            for i in range(tNum):
                if i not in donelist:
                    print(i)
                    print(os.path.join(
                        opt.inpath,
                        modle_kind_name + '.ensembel' + '_' + rankscorelist[top][0]))
                    if turn == 1:
                        r = weight * t.load(files[donelist[0]])
                    else:
                        r = weight * t.load(os.path.join(
                            opt.inpath,
                            modle_kind_name + '.ensembel' + '_' + rankscorelist[top][0]))
                    r += t.load(files[i])
                    result = r.topk(5, 1)[1]  # top-5 predictions (unused without labels)
                    t.save(r, os.path.join(
                        opt.inpath,
                        modle_kind_name + '.ensembel' + '_' + rankscorelist[top][0] + '.' + str(i)))
            top += 1
        turn += 1

def main(**kwargs):
    maxTop = 4
    weight = 3
    opt.parse(kwargs)
    outfile = opt.ensemble_outfile
    label_path = opt.labels_path
    test_data_path = opt.test_data_path
    single_model_score_file = opt.single_model_score_file
    pth_data_path = opt.inpath
    modle_kind_name = 'ensembled'

    index2qid = np.load(test_data_path)['index2qid'].item()
    with open(label_path) as f:
        labels_info = json.load(f)
    qid2label = labels_info['d']
    label2qid = labels_info['id2label']

    files = glob.glob(pth_data_path + '/*.pth')
    print(files)

    single_score = {}
    fin = open(single_model_score_file, 'r', encoding='utf8')
    reader = csv.reader(fin)
    single_model_score_file_data = list(reader)
    for _ in range(len(single_model_score_file_data)):
        single_score[single_model_score_file_data[_][0].split('/')[-1].strip()] = \
            float(single_model_score_file_data[_][1])
    print(single_score)

    csv_line = []
    single_score_of_files_rank = []
    for file in files:
        csv_line.append(os.path.split(file)[1])
        single_score_of_files_rank.append(
            single_score[os.path.split(file)[1][::-1].split('.', 1)[1][::-1]])
    csv_line.append('ensemble_all')
    f = open(outfile, 'a+', encoding='utf-8')
    writer = csv.writer(f)
    writer.writerow(csv_line)
    f.close()

    tNum = len(files)
    dNum = 0
    r = 0
    csv_line = []
    for file in files:
        dNum += 1
        print('dealing %d/%d checkpoint' % (dNum, tNum))
        r += t.load(file)
        this_score = str(single_score[file.split('/')[-1].strip().replace('.pth', '')])
        print(this_score)
        csv_line.append(this_score)
    print('counting')
    r = r / tNum
    t.save(r, os.path.join(opt.inpath, modle_kind_name + '.all'))

    true_labels = []
    for ii in range(len(r)):
        true_labels.append(qid2label[index2qid[ii]])
    result = r.topk(5, 1)[1]
    predict_label_and_marked_label_list = [
        [_1, _2] for _1, _2 in zip(result, true_labels)
    ]
    score, _, _, ss = get_score(predict_label_and_marked_label_list)
    print(score)
    csv_line.append(str(score))
    f = open(outfile, 'a+', encoding='utf-8')
    writer = csv.writer(f)
    writer.writerow(csv_line)
    f.close()
    print('p1')

    csv_line = []
    csv_line.append('')
    for _ in files:
        csv_line.append(_)
    f = open(outfile, 'a+', encoding='utf-8')
    writer = csv.writer(f)
    writer.writerow(csv_line)
    f.close()
    print('p2')

    scoreDic = {}
    print(tNum)
    for i in range(tNum):
        csv_line = []
        csv_line.append(files[i])
        for j in range(tNum):
            print('p3')
            if i != j:
                print('dealing: %d, %d' % (i, j))
                r = t.load(files[i]) * weight
                r += t.load(files[j])
                print('finish load')
                true_labels = []
                for ii in range(len(r)):
                    true_labels.append(qid2label[index2qid[ii]])
                result = r.topk(5, 1)[1]
                predict_label_and_marked_label_list = [
                    [_1, _2] for _1, _2 in zip(result, true_labels)
                ]
                score, _, _, ss = get_score(predict_label_and_marked_label_list)
                print(score)
                csv_line.append(str(score))
                if len(scoreDic) <= maxTop or sorted(
                        scoreDic.items(), key=lambda d: d[1],
                        reverse=True)[maxTop - 1][1] < score:
                    t.save(r, os.path.join(
                        opt.inpath,
                        modle_kind_name + '.ensembel' + '_' + str(i) + '.' + str(j)))
                    scoreDic[str(i) + '.' + str(j)] = score
            else:
                csv_line.append('')
        f = open(outfile, 'a+', encoding='utf-8')
        writer = csv.writer(f)
        writer.writerow(csv_line)
        f.close()
    print(scoreDic)

    turn = 2
    maxTurn = tNum
    scoreDic_new = scoreDic
    while turn < maxTurn:
        scoreDic = scoreDic_new
        scoreDic_new = {}
        rankscorelist = sorted(scoreDic.items(), key=lambda d: d[1], reverse=True)
        print(rankscorelist)
        top = 0
        while top < maxTop and top < len(rankscorelist):
            donelist = [int(_) for _ in rankscorelist[top][0].split('.')]
            print(donelist)
            selfscore = rankscorelist[top][1]
            for i in range(tNum):
                if i not in donelist:
                    print(i)
                    print(os.path.join(
                        opt.inpath,
                        modle_kind_name + '.ensembel' + '_' + rankscorelist[top][0]))
                    r = weight * t.load(os.path.join(
                        opt.inpath,
                        modle_kind_name + '.ensembel' + '_' + rankscorelist[top][0]))
                    r += t.load(files[i])
                    true_labels = []
                    for ii in range(len(r)):
                        true_labels.append(qid2label[index2qid[ii]])
                    result = r.topk(5, 1)[1]
                    predict_label_and_marked_label_list = [
                        [_1, _2] for _1, _2 in zip(result, true_labels)
                    ]
                    score, _, _, ss = get_score(predict_label_and_marked_label_list)
                    print(score)
                    f = open(outfile, 'a+', encoding='utf-8')
                    writer = csv.writer(f)
                    writer.writerow([rankscorelist[top][0], str(i), str(score)])
                    f.close()
                    if (score > selfscore and score > single_score_of_files_rank[i]) and (
                            len(scoreDic_new) <= maxTop or sorted(
                                scoreDic_new.items(), key=lambda d: d[1],
                                reverse=True)[maxTop - 1][1] < score):
                        t.save(r, os.path.join(
                            opt.inpath,
                            modle_kind_name + '.ensembel' + '_' + rankscorelist[top][0]
                            + '.' + str(i)))
                        scoreDic_new[rankscorelist[top][0] + '.' + str(i)] = score
            top += 1
        turn += 1

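# NOTE (editor's summary, not part of the original code): the greedy search above saves
# each candidate ensemble as "<modle_kind_name>.ensembel_<i>.<j>[.<k>...]", where the
# dotted suffix lists the indices (into `files`) of the member checkpoints. The current
# ensemble is re-weighted by `weight` each time a new member is added, and scoreDic_new
# keeps only candidates that beat both their parent ensemble's score and the new member's
# single-model score while ranking among the maxTop best.
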
def main(**kwargs):
    opt.parse(kwargs)
    files = glob.glob(opt.test_model_path + '/*')
    total = len(files)
    now = 0
    for file in files:
        now += 1
        print('now: ' + str(now) + '/' + str(total))
        print(file)
        if not os.path.isfile(file):
            print('is path ')
            continue
        filepath, file = os.path.split(file)
        # checkpoint names look like "<ModelKind>_..._<type>_<suffix>"
        cuts = file.split('_')
        modelkind = cuts[0]
        if modelkind == 'CNNText' and cuts[1] == 'inception':
            modelkind += '_inception'
        modeltype = cuts[-2]
        opt.model_path = os.path.join(opt.test_model_path, file)
        opt.type_ = modeltype
        try:
            model = getattr(models, modelkind)(opt).cuda().eval()
            if opt.model_path is not None:
                model.load(opt.model_path)
            # re-apply the command-line overrides after loading the checkpoint
            opt.parse(kwargs)
            opt.type_ = modeltype
            opt.result_path = os.path.join(opt.test_result_path, file) + '.pth'
            model = model.eval()

            test_data_title, test_data_content, index2qid, labels = load_data(type_=opt.type_)
            Num = len(test_data_title)
            result = np.zeros((Num, 25556))
            # run inference batch by batch; the last batch may be smaller than batch_size
            for start in tqdm.tqdm(range(0, Num, opt.batch_size)):
                end = min(start + opt.batch_size, Num)
                title = np.array(test_data_title[start:end])
                content = np.array(test_data_content[start:end])
                result[start:end, :] = dotest(model, title, content)

            print('save')
            print(opt.result_path)
            t.save(t.from_numpy(result).float(), opt.result_path)

            # move the processed checkpoint into "<dir>dealed" and log it
            if not os.path.exists(os.path.join(filepath + 'dealed')):
                print('no file')
                os.makedirs(os.path.join(filepath + 'dealed'))
            print('move from -- to --')
            print(os.path.join(filepath, file))
            print(os.path.join(filepath + 'dealed'))
            shutil.move(os.path.join(filepath, file), os.path.join(filepath + 'dealed'))
            f = open('finish.getprobability.log', 'a', encoding='utf-8')
            f.write(file + '\n')
            f.close()
        except Exception as e:
            f = open('error.getprobability.log', 'a', encoding='utf-8')
            f.write(file + '\n')
            f.close()

def main(**kwargs):
    print('***************A')
    opt.parse(kwargs)
    files = glob.glob(opt.test_model_path + '/*')
    total = len(files)
    now = 0
    outfile = opt.ps_outfile
    label_path = opt.labels_path
    test_data_path = opt.test_data_path
    index2qid = np.load(test_data_path)['index2qid'].item()
    with open(label_path) as f:
        labels_info = json.load(f)
    qid2label = labels_info['d']

    for file in files:
        try:
            now += 1
            print('now: ' + str(now) + '/' + str(total))
            print(file)
            if not os.path.isfile(file):
                print('is path ')
                continue
            filepath, file = os.path.split(file)
            # checkpoint names look like "<ModelKind>_..._<type>_<suffix>"
            cuts = file.split('_')
            modelkind = cuts[0]
            if modelkind == 'CNNText' and cuts[1] == 'inception':
                modelkind += '_inception'
            modeltype = cuts[-2]
            opt.model_path = os.path.join(opt.test_model_path, file)
            opt.type_ = modeltype

            model = getattr(models, modelkind)(opt).cuda().eval()
            if opt.model_path is not None:
                model.load(opt.model_path)
            # re-apply the command-line overrides after loading the checkpoint
            opt.parse(kwargs)
            opt.type_ = modeltype
            opt.result_path = os.path.join(opt.test_result_path, file)
            if not os.path.exists(opt.test_result_path):
                print('no dir: ' + opt.test_result_path)
                os.makedirs(opt.test_result_path)
            model = model.eval()

            test_data_title, test_data_content, index2qid, labels = load_data(type_=opt.type_)
            Num = len(test_data_title)
            result = np.zeros((Num, 25556))
            # run inference batch by batch; the last batch may be smaller than batch_size
            for start in tqdm.tqdm(range(0, Num, opt.batch_size)):
                end = min(start + opt.batch_size, Num)
                title = np.array(test_data_title[start:end])
                content = np.array(test_data_content[start:end])
                result[start:end, :] = dotest(model, title, content)

            r = t.from_numpy(result).float()
            if opt.save_test_result:
                t.save(r, opt.result_path + '.pth')

            true_labels = []
            for ii in range(len(r)):
                true_labels.append(qid2label[index2qid[ii]])
            tmp = r
            result = tmp.topk(opt.visible_top_num, 1)[1]
            predict_label_and_marked_label_list = [
                [_1, _2] for _1, _2 in zip(result, true_labels)
            ]

            if opt.save_top_result:
                print('doing top_result')
                top_result_raw = tmp.topk(opt.visible_top_num, 1)
                top_result = top_result_raw[1]
                t.save(top_result_raw, opt.result_path + '.topPrId' + str(opt.visible_top_num))
                _, _, _, _, precision_classes_num = get_score_topk(
                    predict_label_and_marked_label_list, opt.visible_top_num, opt.num_classes)
                kk = t.Tensor(precision_classes_num)
                t.save(kk, opt.result_path + '.top' + str(opt.visible_top_num) + '.Pre')
                print('saved top_result')

            result = tmp.topk(5, 1)[1]
            predict_label_and_marked_label_list = [
                [_1, _2] for _1, _2 in zip(result, true_labels)
            ]
            score, _, _, ss = get_score(predict_label_and_marked_label_list)
            print(score)
            print(ss)
            f = open(outfile, 'a', encoding='utf-8')
            writer = csv.writer(f)
            writer.writerow([file, str(score)])
            f.close()

            # move the processed checkpoint into "<dir>dealed" and log it
            if not os.path.exists(os.path.join(filepath + 'dealed')):
                print('no dir: ' + os.path.join(filepath + 'dealed'))
                os.makedirs(os.path.join(filepath + 'dealed'))
            print('move from -- to --')
            print(os.path.join(filepath, file))
            print(os.path.join(filepath + 'dealed'))
            shutil.move(os.path.join(filepath, file), os.path.join(filepath + 'dealed'))
            f = open('finish.getprobability.log', 'a', encoding='utf-8')
            f.write(file + '\n')
            f.close()
        except Exception as e:
            print(file)
            print(e)

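# NOTE (editor's note, not part of the original code): the "<result>.top<N>.Pre" tensors
# of per-class precision saved above are the '*.Pre' files that the pairwise-distance
# script earlier in this section loads, to compare how differently the checkpoints
# behave per class.
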
def main(**kwargs):
    opt.parse(kwargs)
    outfile = opt.ensemble_outfile
    label_path = opt.labels_path
    test_data_path = opt.test_data_path
    single_model_score_file = opt.single_model_score_file
    pth_data_path = opt.inpath
    index2qid = np.load(test_data_path)['index2qid'].item()
    with open(label_path) as f:
        labels_info = json.load(f)
    qid2label = labels_info['d']
    label2qid = labels_info['id2label']

    files = glob.glob(pth_data_path + '/*.pth')

    # single-model validation scores: one "<checkpoint>,<score>" row per file
    single_score = {}
    fin = open(single_model_score_file, 'r', encoding='utf8')
    reader = csv.reader(fin)
    single_model_score_file_data = list(reader)
    for _ in range(len(single_model_score_file_data)):
        single_score[single_model_score_file_data[_][0].split('/')[-1].strip()] = \
            float(single_model_score_file_data[_][1])

    # group checkpoint files by model kind (everything before the last '_')
    model_kind_list = []
    model_kind_dict = {}
    for i in range(len(files)):
        filename = os.path.split(files[i])[1]
        model_kind = filename[::-1].split('_', 1)[1][::-1].strip()
        if model_kind in model_kind_list:
            model_kind_dict[model_kind].append(files[i])
        else:
            model_kind_list.append(model_kind)
            model_kind_dict[model_kind] = []
            model_kind_dict[model_kind].append(files[i])

    same_modle_ensemble_file_list = []
    same_modle_ensemble_score_dict = {}
    print('model_kind_list: ')
    print(model_kind_list)

    # 1. sum the checkpoints of each model kind and score the per-kind ensemble
    t_m_num = len(model_kind_list)
    d_m_num = 0
    for deal_model_list in model_kind_list:
        print('dealing %s model' % (deal_model_list))
        d_m_num += 1
        r = 0
        csv_line = []
        t_c_num = len(model_kind_dict[deal_model_list])
        d_c_num = 0
        for file in model_kind_dict[deal_model_list]:
            csv_line.append(file.split('/')[-1].strip())
        csv_line.append(deal_model_list + '.ensembel')
        f = open(outfile, 'a+', encoding='utf-8')
        writer = csv.writer(f)
        writer.writerow(csv_line)
        f.close()

        csv_line = []
        for file in model_kind_dict[deal_model_list]:
            d_c_num += 1
            print('dealing %d/%d model: %d/%d checkpoint' % (d_m_num, t_m_num, d_c_num, t_c_num))
            r += t.load(file)
            this_score = str(single_score[file.split('/')[-1].strip().replace('.pth', '')])
            print(this_score)
            csv_line.append(this_score)
        print('counting')
        t.save(r, os.path.join(opt.inpath, deal_model_list + '.ensembel'))

        true_labels = []
        for ii in range(len(r)):
            true_labels.append(qid2label[index2qid[ii]])
        result = r.topk(5, 1)[1]
        predict_label_and_marked_label_list = [
            [_1, _2] for _1, _2 in zip(result, true_labels)
        ]
        score, _, _, ss = get_score(predict_label_and_marked_label_list)
        print(score)
        csv_line.append(str(score))
        f = open(outfile, 'a+', encoding='utf-8')
        writer = csv.writer(f)
        writer.writerow(csv_line)
        f.close()
        same_modle_ensemble_file_list.append(deal_model_list + '.ensembel')
        same_modle_ensemble_score_dict[deal_model_list + '.ensembel'] = score

    # 2. score every pairwise combination of the per-kind ensembles
    csv_line = []
    csv_line.append('')
    for _ in same_modle_ensemble_file_list:
        csv_line.append(_)
    f = open(outfile, 'a+', encoding='utf-8')
    writer = csv.writer(f)
    writer.writerow(csv_line)
    f.close()

    for i in range(len(same_modle_ensemble_file_list)):
        for j in range(len(same_modle_ensemble_file_list)):
            csv_line = []
            csv_line.append(same_modle_ensemble_file_list[i])
            if i > j:
                print('dealing: %d, %d' % (i, j))
                r = t.load(os.path.join(opt.inpath, same_modle_ensemble_file_list[i]))
                r += t.load(os.path.join(opt.inpath, same_modle_ensemble_file_list[j]))
                true_labels = []
                for ii in range(len(r)):
                    true_labels.append(qid2label[index2qid[ii]])
                result = r.topk(5, 1)[1]
                predict_label_and_marked_label_list = [
                    [_1, _2] for _1, _2 in zip(result, true_labels)
                ]
                score, _, _, ss = get_score(predict_label_and_marked_label_list)
                print(score)
                csv_line.append(str(score))
            else:
                csv_line.append('')
            f = open(outfile, 'a+', encoding='utf-8')
            writer = csv.writer(f)
            writer.writerow(csv_line)
            f.close()

def main(**kwargs):
    '''
    Training entry point.
    '''
    opt.parse(kwargs, print_=False)
    if opt.debug:
        import ipdb
        ipdb.set_trace()

    model = getattr(models, opt.model)(opt).cuda()
    if opt.model_path:
        model.load(opt.model_path)
    print(model)
    opt.parse(kwargs, print_=True)

    pre_loss = 1.0
    lr, lr2 = opt.lr, opt.lr2
    loss_function = getattr(models, opt.loss)()

    dataset = ZhihuData(opt.train_data_path, opt.labels_path, type_=opt.type_, augument=opt.augument)
    dataloader = data.DataLoader(dataset,
                                 batch_size=opt.batch_size,
                                 shuffle=opt.shuffle,
                                 num_workers=opt.num_workers,
                                 pin_memory=True)

    optimizer = model.get_optimizer(lr, opt.lr2, opt.weight_decay)
    loss_meter = tnt.meter.AverageValueMeter()
    score_meter = tnt.meter.AverageValueMeter()
    best_score = 0

    for epoch in range(opt.max_epoch):
        loss_meter.reset()
        score_meter.reset()
        for ii, ((title, content), label) in tqdm.tqdm(enumerate(dataloader)):
            # train and update the parameters
            title, content, label = Variable(title.cuda()), Variable(content.cuda()), Variable(label.cuda())
            optimizer.zero_grad()
            score = model(title, content)
            loss = loss_function(score, opt.weight * label.float())
            loss_meter.add(loss.data[0])
            loss.backward()
            optimizer.step()

            if ii % opt.plot_every == opt.plot_every - 1:
                # drop into the debugger if the debug file exists
                if os.path.exists(opt.debug_file):
                    import ipdb
                    ipdb.set_trace()

                # compute the training score on the current batch
                predict = score.data.topk(5, dim=1)[1].cpu().tolist()
                true_target = label.data.float().cpu().topk(5, dim=1)
                true_index = true_target[1][:, :5]
                true_label = true_target[0][:, :5]
                predict_label_and_marked_label_list = []
                for jj in range(label.size(0)):
                    true_index_ = true_index[jj]
                    true_label_ = true_label[jj]
                    true = true_index_[true_label_ > 0]
                    predict_label_and_marked_label_list.append((predict[jj], true.tolist()))
                score_, prec_, recall_, _ss = get_score(predict_label_and_marked_label_list)
                score_meter.add(score_)
                print('prec:%s,recall:%s,score:%s,a:%s' % (prec_, recall_, score_, _ss))
                print('scores', score_meter.value()[0])
                print('loss', loss_meter.value()[0])
                k = t.randperm(label.size(0))[0]

            if ii % opt.decay_every == opt.decay_every - 1:
                # evaluate on the validation set and adjust the learning rate accordingly
                del loss
                scores, prec_, recall_, _ss = val(model, dataset)
                if scores > best_score:
                    best_score = scores
                    best_path = model.save(name=str(scores), new=True)
                else:
                    try:
                        model.save(name=str(scores), new=True)
                    except Exception as e:
                        print(e)

                if scores < best_score:
                    model.load(best_path, change_opt=False)
                    lr = lr * opt.lr_decay
                    lr2 = 2e-4 if lr2 == 0 else lr2 * 0.8
                    optimizer = model.get_optimizer(lr, lr2, 0)

                pre_loss = loss_meter.value()[0]
                loss_meter.reset()
                score_meter.reset()

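# NOTE (editor's sketch, not part of the original code): `val` is not defined in this
# section. It is assumed to evaluate the model on the validation split and return the
# same (score, precision, recall, ss) tuple that get_score produces; the
# dataset.train(False/True) calls are an assumption about the ZhihuData API.
# A minimal version might look like:
def val(model, dataset):
    dataset.train(False)   # assumed API: switch ZhihuData to its validation split
    model.eval()
    dataloader = data.DataLoader(dataset, batch_size=opt.batch_size, shuffle=False,
                                 num_workers=opt.num_workers, pin_memory=True)
    predict_label_and_marked_label_list = []
    for (title, content), label in tqdm.tqdm(dataloader):
        title = Variable(title.cuda(), volatile=True)
        content = Variable(content.cuda(), volatile=True)
        score = model(title, content)
        predict = score.data.topk(5, dim=1)[1].cpu().tolist()
        true_target = label.float().topk(5, dim=1)
        for jj in range(label.size(0)):
            true = true_target[1][jj][true_target[0][jj] > 0]
            predict_label_and_marked_label_list.append((predict[jj], true.tolist()))
    dataset.train(True)    # assumed API: switch back to the training split
    model.train()
    return get_score(predict_label_and_marked_label_list)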