def test_threshold_detector_returns_expected_results(path_to_config_threshold,
                                                     path_to_output_reference,
                                                     make_tmp_folder):
    util.seed(0)
    yass.set_config(path_to_config_threshold, make_tmp_folder)

    (standardized_path,
     standardized_params,
     whiten_filter) = preprocess.run(output_directory=make_tmp_folder)

    clear = detect.run(standardized_path, standardized_params, whiten_filter)

    path_to_clear = path.join(path_to_output_reference,
                              'detect_threshold_clear.npy')
    ReferenceTesting.assert_array_equal(clear, path_to_clear)
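# The ReferenceTesting helper used above is not defined in this snippet. A
# minimal sketch of what it might look like, assuming the reference file is a
# stored .npy array and numpy.testing performs the comparison (everything
# beyond "load and compare" is an assumption, not the original implementation):
import numpy as np


class ReferenceTesting:
    """Compares freshly computed arrays against stored reference files."""

    @staticmethod
    def assert_array_equal(arr, path_to_reference):
        # Load the previously saved reference array and require an exact match.
        reference = np.load(path_to_reference)
        np.testing.assert_array_equal(arr, reference)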
def main():
    cfg = util.readConfig(util.gcfg())

    # Use the configured seed if one is set, otherwise generate a fresh one.
    cseed = 0
    if cfg[MAIN][SEED] == '':
        cseed = util.seed()
    else:
        cseed = float(cfg[MAIN][SEED])
    random.seed(cseed)

    util.loadCSV(cfg[AGENT][CSV_FILE])

    lg = log(cfg, cseed, util.gcfg())
    util.renderHead(cfg)

    best = None
    for i in range(0, int(cfg[MAIN][RUNS])):
        lg.sep(i)
        nbest = run(cfg, i, lg)

        # Determine if our new potential best is better; this just uses the
        # average of the two fitness values versus the bad opponents.
        if best is None or nbest.fit > best.fit:
            if best is not None:
                best.delete()
            best = nbest
        lg.best(best)

    lg.absBestFinish(cfg, best)
    lg.wrapUp(best)
    print("\n")
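# The seed-selection logic above (use the configured seed when present,
# otherwise generate one so the run can still be reproduced from the log) can
# be read as a small standalone pattern. This is an illustrative sketch, not
# the original util module; generate_seed() stands in for whatever util.seed()
# does when called without arguments:
import random
import time


def generate_seed():
    # Hypothetical stand-in: derive a seed from the wall clock.
    return int(time.time() * 1000) % (2 ** 31)


def resolve_seed(configured):
    """Return the configured seed if non-empty, else a freshly generated one."""
    cseed = float(configured) if configured != '' else generate_seed()
    random.seed(cseed)
    return cseed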
def main():
    if len(sys.argv) != 4:
        print 'USAGE:'
        print 'python -m scoop devel.py [cloneID] [clusterDir] [outputDir]'
        print 'see devel_config.py'
        return

    cloneID = sys.argv[1]
    clusterDir = sys.argv[2]; assert clusterDir[-1]=='/',"should be ended with '/'"
    baseOutDir = sys.argv[3]; assert baseOutDir[-1]!='/',"should NOT be ended with '/'"

    clfParam = None
    method = cfg['method']
    if method=='esvm':
        from esvm_config import config as clfParam
    elif method=='psvm':
        from psvm_config import config as clfParam
    else:
        print 'FATAL: unknown method'
        return

    outDir = os.path.join(baseOutDir,'devel-'+os.path.basename(baseOutDir))
    if not(os.path.isdir(baseOutDir)):
        os.makedirs(baseOutDir)
    if not(os.path.isdir(outDir)):
        os.makedirs(outDir)

    ## Load data ###################################################################################
    dataLog = {}; dataLogFpath = os.path.join(outDir,'data_log_'+os.path.basename(baseOutDir)+'.json')
    dataset = clusterDir.split('/')[-2].split('-')[-1]; dataLog['dataset'] = dataset
    datasetParams = dataset.split('#')
    assert datasetParams[0]=='yamanishi'

    xyDevFpath = os.path.join(baseOutDir,'_'.join(['xdev','ydev','xrel','yrel']+datasetParams)+'.h5')
    if os.path.exists(xyDevFpath):
        print 'loading data from PREVIOUS...'
        with h5py.File(xyDevFpath,'r') as f:
            xdev = f['xdev'][:]
            ydev = f['ydev'][:]
            xrel = f['xrel'][:]
            yrel = f['yrel'][:]
            xrelraw = f['xrelraw'][:]
        with open(dataLogFpath,'r') as f:
            dataLog = yaml.load(f)
    else:
        print 'loading data FRESHLY...'
        print 'loading cluster result...'
        nUnlabels = []
        statFnames = [i for i in os.listdir(clusterDir) if 'labels_stat.json' in i]
        for i in statFnames:
            with open(os.path.join(clusterDir,i),'r') as f:
                stat = yaml.load(f)
            nUnlabels.append(stat['0'])

        # use the cluster with minimum numbers of unlabeled samples
        metric = '_'.join(statFnames[ nUnlabels.index(min(nUnlabels)) ].split('_')[0:2])
        dataLog['metric'] = metric

        connFpath = os.path.join(clusterDir,metric+'_labels.pkl')
        with open(connFpath,'r') as f:
            data = pickle.load(f)

        ##
        print 'getting devel and release data...'
        xraw = []; yraw = []
        for k,v in data.iteritems():
            for vv in v:
                xraw.append(vv)
                yraw.append(k)

        devIdx = [i for i in range(len(xraw)) if yraw[i]!=0]
        xdev = [xraw[i] for i in devIdx]
        ydev = [yraw[i] for i in devIdx]

        relIdx = [i for i in range(len(xraw)) if yraw[i]==0]
        xrel = [xraw[i] for i in relIdx]
        yrel = [yraw[i] for i in relIdx]

        dataLog['nDevel'] = len(devIdx); dataLog['nData'] = len(yraw)
        dataLog['rDevel:Data'] = dataLog['nDevel']/float(dataLog['nData'])
        dataLog['nDevel(+)'] = len( [i for i in ydev if i==1] ); assert dataLog['nDevel(+)']!=0
        dataLog['nDevel(-)'] = len( [i for i in ydev if i==-1] ); assert dataLog['nDevel(-)']!=0
        dataLog['rDevel(+):Devel'] = float(dataLog['nDevel(+)'])/dataLog['nDevel']
        dataLog['rDevel(-):Devel'] = float(dataLog['nDevel(-)'])/dataLog['nDevel']
        dataLog['rDevel(+):(-)'] = float(dataLog['nDevel(+)'])/float(dataLog['nDevel(-)'])
        dataLog['nRelease'] = len(relIdx); dataLog['rRelease:Data'] = dataLog['nRelease']/float(dataLog['nData'])

        ##
        print 'loading com, pro feature...'
        krFpath = os.path.join(cfg['datasetDir'],datasetParams[0],'feature',
                               'klekotaroth','klekotaroth-'+datasetParams[1]+'.h5')
        aacFpath = os.path.join(cfg['datasetDir'],datasetParams[0],'feature',
                                'amino-acid-composition','amino-acid-composition-'+datasetParams[1]+'.h5')

        krDict = {}; aacDict = {}
        with h5py.File(krFpath, 'r') as f:
            for com in [str(i) for i in f.keys()]:
                krDict[com] = f[com][:]
        with h5py.File(aacFpath, 'r') as f:
            for pro in [str(i) for i in f.keys()]:
                aacDict[pro] = f[pro][:]
                # aacDict[pro] = list( fu.map(lambda x: float('%.2f'%(x)),f[pro][:]) ) # rounding

        comFeaLenOri = len(krDict.values()[0])
        proFeaLenOri = len(aacDict.values()[0])

        ##
        msg = 'extract (com,pro) feature... dims: '+str(comFeaLenOri)+','+str(proFeaLenOri)
        msg += ' of '+str(len(ydev))+' and '+str(len(yrel))
        print msg
        sh.setConst(krDict=krDict)
        sh.setConst(aacDict=aacDict)
        xdevf = list( fu.map(cutil.extractComProFea,xdev) )
        xrelf = list( fu.map(cutil.extractComProFea,xrel) )

        ##
        xyDevList = cutil.divideSamples(xdevf,ydev,cfg['smoteBatchSize'])
        if cfg['maxNumberOfSmoteBatch'] != 0:
            xyDevList = xyDevList[0:cfg['maxNumberOfSmoteBatch']]

        smoteSeed = util.seed(); dataLog['smoteSeed'] = smoteSeed
        sh.setConst(smoteSeed=smoteSeed)

        print 'resampling via Smote FRESHLY... '+str(len(xyDevList))+' smote(s)'+' on '+str(len(ydev))
        smoteTic = time.time()

        xdevfr = []; ydevr = []
        xydevfrList = list( fu.map(ensembleSmote,xyDevList) )
        for xdevfri,ydevri in xydevfrList:
            for x in xdevfri:
                xdevfr.append(x.tolist())
            for y in ydevri:
                ydevr.append(y)
        assert len(xdevfr)==len(ydevr),'len(xdevfr)!=len(ydevr)'

        dataLog['nSmote'] = len(xyDevList)
        dataLog['nDevelResampled'] = len(ydevr)
        dataLog['rDevelResampled:Data'] = dataLog['nDevelResampled']/float(dataLog['nData'])
        dataLog['nDevelResampled(+)'] = len( [i for i in ydevr if i==1] )
        dataLog['nDevelResampled(-)'] = len( [i for i in ydevr if i==-1] )
        dataLog['rDevelResampled(+):DevelResampled'] = dataLog['nDevelResampled(+)']/float(dataLog['nDevelResampled'])
        dataLog['rDevelResampled(-):DevelResampled'] = dataLog['nDevelResampled(-)']/float(dataLog['nDevelResampled'])
        dataLog['rDevelResampled(+):(-)'] = dataLog['nDevelResampled(+)']/float(dataLog['nDevelResampled(-)'])
        dataLog['timeSMOTE'] = str(time.time()-smoteTic)

        ##
        print 'update xdev,ydev,xrel... '+str(np.asarray(xdevfr).shape)
        xrelraw = xrel[:] # raw: feature is NOT extracted
        xrel = xrelf[:]
        xdev = xdevfr[:]
        ydev = ydevr[:]

        print 'writing updated xdev,ydev and xrel,yrel...'
        with h5py.File(xyDevFpath,'w') as f:
            f.create_dataset('xdev',data=xdev,dtype=np.float32)
            f.create_dataset('ydev',data=ydev,dtype=np.int8)
            f.create_dataset('xrel',data=xrel,dtype=np.float32)
            f.create_dataset('yrel',data=yrel,dtype=np.int8)
            f.create_dataset('xrelraw',data=xrelraw)

        print 'writing dataLog...'
        dataLog['nCom'] = len(krDict)
        dataLog['nPro'] = len(aacDict)
        with open(dataLogFpath,'w') as f:
            json.dump(dataLog,f,indent=2,sort_keys=True)

    ## TUNE+TRAIN+TEST #############################################################################
    devLog = {}
    devSeed = util.seed(); dataLog['devSeed'] = devSeed
    tag = '_'.join([method+'#'+cloneID,dataset,util.tag()])

    ## split devel dataset
    msg = ' '.join(['devel',dataset,cloneID])
    xtr,xte,ytr,yte = tts(xdev,ydev,test_size=cfg['testSize'],
                          random_state=devSeed,stratify=ydev)

    if cfg['maxTestingSamples']>0:
        chosenIdx = np.random.randint(len(xte),size=cfg['maxTestingSamples'])
        xte = [xte[i] for i in chosenIdx]; yte = [yte[i] for i in chosenIdx]

    devLog['nTraining'] = len(xtr)
    devLog['nTraining(+)'] = len([i for i in ytr if i==1])
    devLog['nTraining(-)'] = len([i for i in ytr if i==-1])
    devLog['rTraining(+):(-)'] = devLog['nTraining(+)']/float(devLog['nTraining(-)'])
    devLog['rTraining:Devel'] = devLog['nTraining']/float(dataLog['nDevelResampled'])
    devLog['nTesting'] = len(xte)
    devLog['nTesting(+)'] = len([i for i in yte if i==1])
    devLog['nTesting(-)'] = len([i for i in yte if i==-1])
    devLog['rTesting(+):(-)'] = devLog['nTesting(+)']/float(devLog['nTesting(-)'])
    devLog['rTesting:Devel'] = devLog['nTesting']/float(dataLog['nDevelResampled'])

    ## tuning
    clf = None
    if method=='esvm':
        clf = eSVM(simMat=None)
    elif method=='psvm':
        clf = svm.SVC(kernel=clfParam['kernel'],probability=True)

    ## training
    print msg+': fitting nTr= '+str(len(ytr))
    trTic = time.time()

    if method=='esvm':
        clf.fit(xtr,ytr)
        devLog['labels'] = clf.labels()
        devLog['nSVM'] = clf.nSVM()
        devLog['xtrDimAllBatches'] = clf.xtrDimAllBatches()
    elif method=='psvm':
        # note: clfParam holds the kernel setting; cfg['method'] is the method name string
        if clfParam['kernel']=='precomputed':
            assert False
            # simMatTr = cutil.makeComProKernelMatFromSimMat(xtr,xtr,simMat)
            # clf.fit(simMatTr,ytr)
        else:
            clf.fit(xtr,ytr)
        devLog['labels'] = clf.classes_.tolist()
    devLog['timeTraining'] = str(time.time()-trTic)

    ## testing
    print msg+': predicting nTe= '+str(len(yte))
    teTic = time.time()

    if method=='esvm':
        ypred,yscore = clf.predict(xte)
    elif method=='psvm':
        if clfParam['kernel']=='precomputed':
            assert False
            # simMatTe = cutil.makeComProKernelMatFromSimMat(xte,xtr,simMat)
            # ypred = clf.predict(simMatTe)
            # yscore = clf.predict_proba(simMatTe)
        else:
            ypred = clf.predict(xte)
            yscore = clf.predict_proba(xte)
            yscore = [max(i.tolist()) for i in yscore]
    devLog['timeTesting'] = str(time.time()-teTic)

    ## TEST RELEASE ################################################################################
    print msg+': predicting RELEASE n= '+str(len(yrel))
    relTic = time.time()

    if method=='esvm':
        yrel,yrelscore = clf.predict(xrel)
    elif method=='psvm':
        if clfParam['kernel']=='precomputed':
            assert False
            # simMatTe = cutil.makeComProKernelMatFromSimMat(xrel,xtr,simMat)
            # yrel = clf.predict(simMatTe)
            # yrelscore = clf.predict_proba(simMatTe)
        else:
            yrel = clf.predict(xrel)
            yrelscore = clf.predict_proba(xrel)
            yrelscore = [max(i.tolist()) for i in yrelscore]
    devLog['timeRelease'] = str(time.time()-relTic)

    ## WRITE RESULT ################################################################################
    result = {'yte':yte,'ypred':ypred,'yscore':yscore,
              'xrelraw':xrelraw,'yrel':yrel,'yrelscore':yrelscore}

    print 'writing prediction...'
    with h5py.File(os.path.join(outDir,'result_'+tag+'.h5'),'w') as f:
        for k,v in result.iteritems():
            if 'raw' in k:
                f.create_dataset(k,data=v)
            else:
                dt = np.int8
                if 'score' in k:
                    dt = np.float32
                f.create_dataset(k,data=v,dtype=dt)

    ##
    print 'writing devLog...'
    devLog['clfParam'] = clfParam
    devLog['devParam'] = cfg
    with open(os.path.join(outDir,'devLog_'+tag+'.json'),'w') as f:
        json.dump(devLog,f,indent=2,sort_keys=True)
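# A sketch of how the result file written above could be read back for later
# analysis. The reader below is illustrative and not part of the original
# pipeline; the path is whatever 'result_'+tag+'.h5' resolved to at write time:
import h5py


def load_result(fpath):
    """Load the prediction results written by main() into a plain dict."""
    result = {}
    with h5py.File(fpath, 'r') as f:
        for k in f.keys():
            result[k] = f[k][:]
    return result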
def main():
    # seed random number generator engine
    util.seed(0)

    # set up sheep parameters
    Sheep.set_gene_size(32)
    Sheep.prop.repr_age = 8
    Sheep.prop.threshold = 3
    Sheep.prop.mut_rate = 2
    Sheep.prop.N_max = 1000
    Sheep.prop.N_init = 1000

    # set up bear parameters
    Bear.set_gene_size(32)
    Bear.prop.repr_age = 8
    Bear.prop.threshold = 3
    Bear.prop.mut_rate = 2
    Bear.prop.N_max = 1000
    Bear.prop.N_init = 1000

    # generate initial population
    pop = []
    for i in range(Sheep.prop.N_init):
        pop.append(Sheep(Sheep.random_age()))
        Sheep.prop.N_t += 1
    for i in range(Bear.prop.N_init):
        pop.append(Bear(Bear.random_age()))
        Bear.prop.N_t += 1

    # prepare output file
    with open("py_pennaLV_week03.txt", "w") as ofs:
        ofs.write("time sheep bear\n")
        ofs.write("#param seed {} ".format(util.seed()))
        ofs.write("N_init {} ".format(Sheep.prop.N_init))
        ofs.write("N_max {} ".format(Sheep.prop.N_max))
        ofs.write("gene_size {} ".format(Sheep.prop.gene_size))
        ofs.write("repr_age {} ".format(Sheep.prop.repr_age))
        ofs.write("mut_rate {} ".format(Sheep.prop.mut_rate))
        ofs.write("threshold {}\n".format(Sheep.prop.threshold))
        ofs.write("b_N_init {} ".format(Bear.prop.N_init))
        ofs.write("b_N_max {} ".format(Bear.prop.N_max))
        ofs.write("b_gene_size {} ".format(Bear.prop.gene_size))
        ofs.write("b_repr_age {} ".format(Bear.prop.repr_age))
        ofs.write("b_mut_rate {} ".format(Bear.prop.mut_rate))
        ofs.write("b_threshold {}\n".format(Bear.prop.threshold))

        # run simulation
        for gen in range(300):
            for s in pop[:]:
                if not s.progress():
                    pop.remove(s)
                    s.prop.N_t -= 1
                elif s.adult():
                    pop.insert(0, s.make_child())
                    s.prop.N_t += 1
            #~ print("{} {} {}".format(gen, Sheep.prop.N_t, Bear.prop.N_t))
            ofs.write("{} {} {}\n".format(gen, Sheep.prop.N_t, Bear.prop.N_t))

    print("total sheep: {}".format(Sheep.prop.N_t))
    print("total bear: {}".format(Bear.prop.N_t))
    print(pop[-1])
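# The loop above relies on progress() returning False when an animal dies this
# step and adult() gating reproduction. The Sheep/Bear classes are not shown,
# so the sketch below fills them in under standard Penna-model assumptions
# (bad-mutation years, a death threshold, a Verhulst crowding factor); the
# genome representation and class layout here are assumptions, not the
# original code:
import random


class Prop(object):
    """Population-level parameters mirroring the values set in main()."""
    gene_size = 32
    repr_age = 8
    threshold = 3
    mut_rate = 2
    N_max = 1000
    N_init = 1000
    N_t = 0


class Animal(object):
    prop = Prop

    def __init__(self, age):
        self.age = age
        # Years at which an inherited "bad mutation" becomes active.
        self.bad_years = set(random.randrange(self.prop.gene_size)
                             for _ in range(self.prop.mut_rate))

    def progress(self):
        """Age one step; return False if the animal dies this step."""
        self.age += 1
        active_bad = len([y for y in self.bad_years if y <= self.age])
        # Verhulst factor: random death pressure as N_t approaches N_max.
        crowded = random.random() < self.prop.N_t / float(self.prop.N_max)
        return (active_bad < self.prop.threshold
                and self.age < self.prop.gene_size
                and not crowded)

    def adult(self):
        return self.age >= self.prop.repr_age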
def train(self, writer=None, batch_size=8, lr=1 * 10**-5, num_epochs=3, seed=None):
    max_grad_norm = 1.0
    test_losses = []
    should_stop = early_stopping.ConsecutiveNonImprovment(3)

    self.training_parameters.append({
        "batch_size": batch_size,
        "lr": lr,
        "num_epochs": num_epochs,
        "seed": seed,
        "base_model": self.base_model,
    })

    self.optimizer = AdamW(self.classifier.parameters(), lr=lr)
    self.mixed_precision_setup()

    if self.multi_gpu and type(self.classifier) != torch.nn.DataParallel:
        self.classifier = torch.nn.DataParallel(self.classifier)

    scheduler = self.warumup_cooldown_scheduler(self.optimizer, num_epochs, batch_size)

    for epoch in range(num_epochs):
        # Re-seed each epoch so results do not depend on whether tests run
        # between epochs.
        if seed is not None:
            util.seed(seed + epoch)

        print(f"Starting training epoch {epoch + 1}/{num_epochs}")
        self.classifier.train()
        loader = DataLoader(self.train_dataset, batch_size=batch_size,
                            collate_fn=PaddedBatch, shuffle=True)
        for batch in tqdm(loader):
            self.optimizer.zero_grad()
            loss, logits = self.classifier(
                batch.token_ids.cuda(),
                token_type_ids=batch.sequence_ids.cuda(),
                attention_mask=batch.input_mask.cuda(),
                labels=batch.labels.float().cuda(),
            )
            self.num_batches += 1
            if writer:
                writer.add_scalar("cross entropy loss per batch",
                                  loss.mean(), self.num_batches)
            if self.mixed_precision:
                with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                    scaled_loss.mean().backward()
                torch.nn.utils.clip_grad_norm_(
                    amp.master_params(self.optimizer), max_grad_norm)
            else:
                loss.mean().backward()
                torch.nn.utils.clip_grad_norm_(
                    self.classifier.parameters(), max_grad_norm)
            self.optimizer.step()
            scheduler.step()

        util.seed_for_testing()
        test_losses = self.run_test_loss_report(test_losses, epoch, writer)

        # Early stopping when test loss is no longer improving
        if should_stop(test_losses):
            print("Test loss no longer improving, stopping!")
            print(f"(losses were {test_losses})")
            best_epoch = sorted(enumerate(test_losses), key=lambda kv: kv[1])[0][0]
            self.early_stopped_at = best_epoch
            self.load_epoch_model(best_epoch)
            return

    del loader
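# should_stop above is a callable that flags when the test loss has not
# improved for N consecutive epochs. A minimal sketch of such a predicate; the
# real early_stopping module may differ, and the contract assumed here is
# "called with the full loss history, returns True to stop":
class ConsecutiveNonImprovment:
    def __init__(self, patience):
        self.patience = patience

    def __call__(self, losses):
        # Not enough history yet to judge.
        if len(losses) <= self.patience:
            return False
        best_before = min(losses[:-self.patience])
        # Stop if none of the last `patience` losses beat the earlier best.
        return all(l >= best_before for l in losses[-self.patience:])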
def setup():
    seed(0)
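# Several snippets above call a seed()/util.seed() helper both to set seeds
# and, when called without arguments, to obtain one (freshly generated or the
# one already in use; the exact contract varies between the codebases these
# snippets come from). A sketch of such a dual-purpose helper, assuming
# Python's random and numpy are the RNG sources to seed; the original util
# modules are not shown:
import random
import time

import numpy as np

_last_seed = None


def seed(value=None):
    """Seed the RNGs; without an argument, return the seed in use,
    generating a fresh one if the helper has never been called."""
    global _last_seed
    if value is None:
        if _last_seed is not None:
            return _last_seed  # report the seed already in use
        value = int(time.time() * 1000) % (2 ** 31)  # generate a fresh one
    _last_seed = value
    random.seed(value)
    np.random.seed(int(value) % (2 ** 32))
    return value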