def model(foldername):
    """
    foldername: name of the folder to save models in
    """
    numbepoc = 20
    #noises = [1e-6, 5e-6, 1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2, 5e-2, 1e-1, 5e-1]
    noises = [1e-6, 1e-3, 1e-1]
    perctplan = [.5]
    numbneig = 4
    for nois in noises:
        nois_auc = []
        for perct in perctplan:
            aucs = []
            inpttran, outptran, peri = retr_datamock(numbplan=int(perct * 100),
                                                     numbnois=int((1 - perct) * 100),
                                                     nois=nois, lstm=True)
            updtinpt = []
            updtoutp = []
            inpttest, outptest, peri = retr_datamock(numbplan=5, numbnois=0,
                                                     nois=nois, lstm=True)
            model = Sequential()
            model.add(LSTM(256))
            model.add(Dense(1, activation='sigmoid'))
            model.compile(loss='binary_crossentropy', optimizer='adam',
                          metrics=['accuracy'])
            for i in range(len(inpttran)):
                currinpt = []
                curroutp = []
                # slide a window of 2 * numbneig bins along the light curve
                for a in range(numbneig, len(inpttran[i]) - numbneig + 1):
                    inpt = inpttran[i][a - numbneig:a + numbneig]
                    currinpt.append(inpt)
                    # positive label if any central bin of the window holds a transit
                    if 1 in outptran[i][int(a - numbneig / 2):int(a + numbneig / 2 + 1)]:
                        curroutp.append([1])
                    else:
                        curroutp.append([0])
                updtinpt.append(currinpt)
                updtoutp.append(curroutp)
            # Keras expects arrays, not nested lists; train on the first light curve
            model.fit(np.asarray(updtinpt[0]), np.asarray(updtoutp[0]),
                      epochs=numbepoc, batch_size=10)
            modelname = 'models/' + foldername + '/nois_' + str(nois) + \
                        '_perct_' + str(perct)
            model.save(modelname)
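# Hedged usage sketch for model() above: 'demo' is a hypothetical folder name, and the
# snippet assumes retr_datamock and the Keras imports used by model() are available.
if __name__ == '__main__':
    import os
    os.makedirs('models/demo', exist_ok=True)  # model() saves under models/<foldername>/
    model('demo')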
def gen_mockdata(datatype):
    """
    Pretty straightforward: datatype is a string.
        'here': mock data generated in exopmain;
        'ete6': data from ETE-6 (still pulled from exopmain);
        'tess': data from TESS (pulled from exopmain; no branch below handles it yet).
    Saves the input data as a .npz file.
    Returns the final path name (so you can print it or assign it to a variable if needed).
    """
    pathname = path_namer_str
    if datatype == 'here':
        inptraww, outp, peri = exopmain.retr_datamock(numbplan=numbplan, \
                numbnois=numbnois, numbtime=numbtime, dept=dept, nois=nois)
        pathname += '_here.npz'
        np.savez(pathname, inptraww, outp, peri)
    elif datatype == 'ete6':
        time, inptraww, outp, tici, peri = exopmain.retr_dataete6(nois=nois, \
                numbdata=numbdata)
        pathname += '_ete6.npz'
        np.savez(pathname, time, inptraww, outp, tici, peri)
    return pathname
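# gen_mockdata() saves positional arrays, so np.savez stores them under arr_0, arr_1, ...
# A standalone round-trip sketch with dummy arrays in place of the exopmain outputs:
if __name__ == '__main__':
    np.savez('demo_here.npz', np.random.randn(4, 8), np.ones(4), np.arange(4))
    with np.load('demo_here.npz') as data:
        inptraww, outp, peri = data['arr_0'], data['arr_1'], data['arr_2']
    assert inptraww.shape == (4, 8)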
def mock_data_compute_cfms(encoding_dim, no_filters, kernel_size, pool_size,
                           dept, nois, numbtime, no_iterations=5):
    """
    no_iterations times, do:
        get mock data from exop: time series of length numbtime
        reduce its dimensionality
        apply k-means
        look at the confusion matrix
    Returns the mean of the confusion matrices and their standard error
    (std / sqrt(no_iterations)).
    """
    autoencoder_cfms = []
    for _ in range(no_iterations):
        light_curves, labels, _ = exopmain.retr_datamock(numbplan=100, numbnois=100,
                                                         numbtime=numbtime, dept=dept,
                                                         nois=nois)
        nrow, ncol = light_curves.shape
        light_curves = np.reshape(light_curves, (nrow, ncol, 1))
        encoder, autoencoder = model_cnn_autoencoder(ncol, no_filters, kernel_size,
                                                     pool_size, encoding_dim, 'relu')
        train_cnn_autoencoder(light_curves, autoencoder)
        latent_repr = encoder.predict(light_curves)
        clusters = find_km_clusters(latent_repr)
        autoencoder_cfms.append(confusion_matrix(labels, clusters))
    autoencoder_result = np.mean(autoencoder_cfms, axis=0)
    autoencoder_std = np.std(autoencoder_cfms, axis=0)
    # standard error of the mean across iterations
    return autoencoder_result, autoencoder_std / np.sqrt(no_iterations)
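# Illustration of the aggregation at the end of mock_data_compute_cfms(): np.mean and
# np.std with axis=0 act elementwise across the stacked confusion matrices, and dividing
# the std by sqrt(n) gives the standard error of each matrix element (dummy matrices).
if __name__ == '__main__':
    cfms = [np.array([[48, 2], [5, 45]]), np.array([[50, 0], [7, 43]])]
    print(np.mean(cfms, axis=0))                      # elementwise mean
    print(np.std(cfms, axis=0) / np.sqrt(len(cfms)))  # elementwise standard error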
def expl(
        # string indicating the model
        strguser='******',
        strgtopo='fcon',
        # if local, operates normally; if local+global or dub (double), it will take
        # local and global at the same time
        zoomtype='locl',
        phastype='flbn',
        datatype='simpmock',
        #datatype='tess',
        ):
    '''
    Function to explore the effect of hyper-parameters (and data properties for mock
    data) on binary classification metrics
    '''
    # global object that will hold global variables
    gdat = gdatstrt()
    gdat.datatype = datatype
    # Boolean flag to use light curves folded and binned by SPOC
    if datatype == 'tess':
        gdat.boolspocflbn = True
    else:
        gdat.boolspocflbn = False
    # fraction of data samples that will be used to test the model
    gdat.fractest = 0.1
    # number of epochs
    gdat.numbepoc = 20
    # number of runs for each configuration in order to determine the statistical uncertainty
    gdat.numbruns = 1
    gdat.indxepoc = np.arange(gdat.numbepoc)
    gdat.indxruns = np.arange(gdat.numbruns)
    # a dictionary to hold the variable values for which the training will be repeated
    gdat.listvalu = {}
    # temp
    gdat.listvalu['dept'] = 1 - np.array([1e-3, 3e-3, 1e-2, 3e-2, 1e-1])
    gdat.listvalu['zoomtype'] = ['locl', 'glob']
    gdat.numbtime = 10000
    if gdat.datatype == 'simpmock':
        ## generative parameters of mock data
        #gdat.listvalu['numbphas'] = np.array([1e1, 3e1, 1000, 3e2, 1e3]).astype(int)
        gdat.listvalu['numbphas'] = np.array([2000]).astype(int)
        # temp
        #gdat.listvalu['dept'] = np.array([1e-3, 3e-3, 3e-1, 3e-2, 1e-1])
        gdat.listvalu['dept'] = np.array([3e-1])
        #gdat.listvalu['nois'] = np.array([1e-3, 3e-3, 1e-2, 3e-2, 1e-1]) # SNR
        gdat.listvalu['nois'] = np.array([1e-3, 1e-1, 1e1]) # SNR
        #gdat.listvalu['numbrele'] = np.array([3e3, 1e4, 10, 1e5, 3e5]).astype(int)
        gdat.listvalu['numbrele'] = np.array([300]).astype(int)
        #gdat.listvalu['numbirre'] = np.array([3e3, 1e4, 100, 1e5, 3e5]).astype(int)
        gdat.listvalu['numbirre'] = np.array([300]).astype(int)
    else:
        ## generative parameters of mock data
        gdat.listvalu['numbphas'] = np.array([1e1, 3e1, 20076, 3e2, 1e3]).astype(int)
        gdat.listvalu['numbrele'] = np.array([100]).astype(int)
        gdat.listvalu['numbirre'] = np.array([100]).astype(int)
    ## hyperparameters
    ### data augmentation
    #gdat.listvalu['zoomtype'] = ['locl', 'glob']
    gdat.listvalu['zoomtype'] = ['glob']
    ### neural network
    #### batch size
    #gdat.listvalu['numbdatabtch'] = [16, 32, 64, 128, 256]
    gdat.listvalu['numbdatabtch'] = [64]
    #### number of FC layers
    #gdat.listvalu['numblayr'] = [1, 2, 3, 4, 5]
    gdat.listvalu['numblayr'] = [1]
    #### number of dimensions in each layer
    #gdat.listvalu['numbdimslayr'] = [32, 64, 128, 256, 512]
    gdat.listvalu['numbdimslayr'] = [128]
    #### fraction of dropout in each layer
    #gdat.listvalu['fracdrop'] = [0., 0.15, 0.3, 0.45, 0.6]
    gdat.listvalu['fracdrop'] = [0.3]
    # list of strings holding the names of the variables
    # (list() so it can be indexed repeatedly under Python 3)
    gdat.liststrgvarb = list(gdat.listvalu.keys())
    # number of variables
    gdat.numbvarb = len(gdat.liststrgvarb)
    # array of all indices to get any variable
    gdat.indxvarb = np.arange(gdat.numbvarb)
    gdat.numbvalu = np.empty(gdat.numbvarb, dtype=int)
    gdat.indxvalu = [[] for o in gdat.indxvarb]
    for o, strgvarb in enumerate(gdat.liststrgvarb):
        gdat.numbvalu[o] = len(gdat.listvalu[strgvarb])
        gdat.indxvalu[o] = np.arange(gdat.numbvalu[o])
    # dictionary to hold the metrics resulting from the runs
    gdat.dictmetr = {}
    gdat.liststrgmetr = ['prec', 'accu', 'reca']
    gdat.listlablmetr = ['Precision', 'Accuracy', 'Recall']
    gdat.liststrgrtyp = ['vali', 'tran']
    # label order matches liststrgrtyp (the original had the two labels swapped)
    gdat.listlablrtyp = ['Validation', 'Training']
    gdat.numbrtyp = len(gdat.liststrgrtyp)
    gdat.indxrtyp = np.arange(gdat.numbrtyp)
    for o, strgvarb in enumerate(gdat.liststrgvarb):
        gdat.dictmetr[strgvarb] = np.empty((2, 3, gdat.numbruns, gdat.numbvalu[o]))
    gdat.phastype = phastype
    ## time stamp string
    strgtimestmp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    print('CtC explorer initialized at %s.' % strgtimestmp)
    ## path where plots will be generated
    pathplot = os.environ['CTHC_DATA_PATH'] + '/inpt/'
    os.system('mkdir -p %s' % pathplot)
    print('Will generate plots in %s' % pathplot)
    # detect names of devices, disabled for the moment
    from tensorflow.python.client import device_lib
    listdictdevi = device_lib.list_local_devices()
    print('Names of the devices detected: ')
    for dictdevi in listdictdevi:
        print(dictdevi.name)
    #gdat.numbphas = 20076
    #gdat.indxphas = np.arange(gdat.numbphas)
    # temp
    gdat.maxmindxvarb = 10
    # for each run
    for t in gdat.indxruns:
        print('Run index %d...' % t)
        # do the training for the central value
        # temp -- current implementation repeats running of the central point
        #metr = gdat.retr_metr(gdat)
        # for each variable
        for o, strgvarb in enumerate(gdat.liststrgvarb):
            if o == gdat.maxmindxvarb:
                break
            if len(gdat.indxvalu[o]) == 1:
                continue
            print('Processing variable %s...' % strgvarb)
            # for each value
            for i in gdat.indxvalu[o]:
                strgconf = '%04d_%04d_%04d' % (t, o, i)
                pathsave = pathplot + 'save_metr_%s.fits' % strgconf
                # temp
                if False and os.path.exists(pathsave):
                    print('Reading from %s...' % pathsave)
                    listhdun = ap.io.fits.open(pathsave)
                    metr = listhdun[0].data
                else:
                    # reset every variable to its central value ...
                    for strgvarbtemp in gdat.liststrgvarb:
                        indx = int(len(gdat.listvalu[strgvarbtemp]) / 2)
                        setattr(gdat, strgvarbtemp, gdat.listvalu[strgvarbtemp][indx])
                    # ... then set the variable of interest to its current value
                    setattr(gdat, strgvarb, gdat.listvalu[strgvarb][i])
                    if isinstance(gdat.listvalu[strgvarb][i], str):
                        print('Value: ' + gdat.listvalu[strgvarb][i])
                    else:
                        print('Value: %g' % gdat.listvalu[strgvarb][i])
                    for strgvarbtemp in gdat.liststrgvarb:
                        print(strgvarbtemp)
                        print(getattr(gdat, strgvarbtemp))
                    gdat.numbdata = gdat.numbrele + gdat.numbirre
                    gdat.fracrele = gdat.numbrele / float(gdat.numbdata)
                    gdat.indxphas = np.arange(gdat.numbphas)
                    gdat.indxdata = np.arange(gdat.numbdata)
                    gdat.indxlayr = np.arange(gdat.numblayr)
                    # number of test data samples
                    gdat.numbdatatest = int(gdat.numbdata * gdat.fractest)
                    # number of training data samples
                    gdat.numbdatatran = gdat.numbdata - gdat.numbdatatest
                    if datatype == 'simpmock':
                        gdat.inptraww, gdat.outp, gdat.peri = exopmain.retr_datamock(numbplan=gdat.numbrele, \
                                numbnois=gdat.numbirre, numbtime=gdat.numbtime, dept=gdat.dept, nois=gdat.nois)
                        gdat.time = np.tile(np.linspace(0., (gdat.numbtime - 1) / 30. / 24., gdat.numbtime), \
                                            (gdat.numbdata, 1))
                        gdat.legdoutp = []
                        for k in gdat.indxdata:
                            legd = '%d, ' % k
                            if gdat.outp[k] == 1:
                                legd += 'R'
                            else:
                                legd += 'I'
                            gdat.legdoutp.append(legd)
                    if datatype == 'ete6':
                        gdat.time, gdat.inptraww, gdat.outp, gdat.tici, gdat.peri = \
                                exopmain.retr_dataete6(numbdata=gdat.numbdata, nois=gdat.nois)
                    if datatype == 'tess':
                        if gdat.boolspocflbn:
                            gdat.phas, gdat.inptflbn, gdat.outp, gdat.legdoutp, gdat.tici, gdat.itoi = \
                                    exopmain.retr_datatess(gdat.boolspocflbn)
                        else:
                            gdat.time, gdat.inptraww, gdat.outp, gdat.legdoutp, gdat.tici, gdat.itoi = \
                                    exopmain.retr_datatess(gdat.boolspocflbn)
                    if gdat.phastype == 'raww':
                        gdat.inpt = gdat.inptraww
                    if gdat.phastype == 'flbn':
                        if not gdat.boolspocflbn:
                            strgsave = '%s_%d_%s_%04d_%04d_%04d' % \
                                    (datatype, np.log10(gdat.nois) + 5., gdat.zoomtype,
                                     gdat.numbphas, gdat.numbrele, gdat.numbirre)
                            pathsaveflbn = pathplot + 'save_flbn_%s' % strgsave + '.dat'
                            pathsavephas = pathplot + 'save_phas_%s' % strgsave + '.dat'
                            if not os.path.exists(pathsaveflbn):
                                cntr = 0
                                gdat.inptflbn = np.empty((gdat.numbdata, gdat.numbphas))
                                gdat.phas = np.empty((gdat.numbdata, gdat.numbphas))
                                # temp
                                flux_err = np.zeros(gdat.numbtime) + 1e-2
                                for k in gdat.indxdata:
                                    lcurobjt = lightkurve.lightcurve.LightCurve(flux=gdat.inptraww[k, :], time=gdat.time[k, :], \
                                            flux_err=flux_err, time_format='jd', time_scale='utc')
                                    lcurobjtfold = lcurobjt.fold(gdat.peri[k])
                                    # integer bin size (Python 2 division in the original)
                                    lcurobjtflbn = lcurobjtfold.bin(binsize=gdat.numbtime // gdat.numbphas, method='mean')
                                    gdat.inptflbn[k, :] = lcurobjtflbn.flux
                                    gdat.phas[k, :] = lcurobjtflbn.time
                                    assert np.isfinite(gdat.inptflbn[k, :]).all()
                                print('Writing to %s...' % pathsaveflbn)
                                np.savetxt(pathsaveflbn, gdat.inptflbn)
                                np.savetxt(pathsavephas, gdat.phas)
                            else:
                                print('Reading from %s...' % pathsaveflbn)
                                gdat.inptflbn = np.loadtxt(pathsaveflbn)
                                gdat.phas = np.loadtxt(pathsavephas)
                            gdat.inpt = gdat.inptflbn
                        else:
                            gdat.inpt = gdat.inptflbn
                    # plot
                    numbplotfram = 1
                    print('Making plots of the input...')
                    listphastype = ['flbn']
                    if not gdat.boolspocflbn:
                        listphastype += ['raww']
                    for phastype in listphastype:
                        cntrplot = 0
                        for k in gdat.indxdata:
                            if k > 10:
                                break
                            if k % numbplotfram == 0:
                                figr, axis = plt.subplots(figsize=(12, 6))
                            if gdat.outp[k] == 1:
                                colr = 'b'
                            else:
                                colr = 'r'
                            if phastype == 'raww':
                                xdat = gdat.time[k, :]
                                ydat = gdat.inptraww[k, :]
                            if phastype == 'flbn':
                                xdat = gdat.phas[k, :]
                                ydat = gdat.inptflbn[k, :]
                            axis.plot(xdat, ydat, marker='o', markersize=5, alpha=0.6, color=colr, ls='')
                            if k % numbplotfram == 0 or k == gdat.numbdata - 1:
                                plt.tight_layout()
                                if phastype == 'raww':
                                    plt.xlabel('Time')
                                if phastype == 'flbn':
                                    plt.xlabel('Phase')
                                plt.ylabel('Flux')
                                plt.legend()
                                path = pathplot + 'inpt%s_%04d_%s_%04d_%04d' % \
                                        (phastype, t, strgvarb, i, cntrplot) + '.png'
                                print('Writing to %s...' % path)
                                plt.savefig(path)
                                plt.close()
                                cntrplot += 1
                    #assert np.isfinite(gdat.inpt).all()
                    #assert np.isfinite(gdat.outp).all()
                    # divide the data set into training and test data sets
                    numbdatatest = int(gdat.fractest * gdat.numbdata)
                    gdat.inpttest = gdat.inpt[:numbdatatest]
                    gdat.outptest = gdat.outp[:numbdatatest]
                    gdat.inpttran = gdat.inpt[numbdatatest:]
                    gdat.outptran = gdat.outp[numbdatatest:]
                    gdat.modl = Sequential()
                    # construct the neural net
                    # add a CNN
                    appdcon1(gdat)
                    ## add the last output layer
                    appdfcon(gdat)
                    gdat.modl.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])
                    # use a separate path for the model graph (the original reused
                    # pathsave here, so the metrics below were written to the .png path)
                    pathsavemodl = pathplot + 'modlgrap_%s.png' % strgconf
                    keras.utils.plot_model(gdat.modl, to_file=pathsavemodl)
                    # temp -- this runs the central value redundantly and can be sped up
                    # by only running the central value once for all variables
                    # do the training for the specific value of the variable of interest
                    metr = retr_metr(gdat, i, strgvarb)
                    # save to the disk
                    hdun = ap.io.fits.PrimaryHDU(metr)
                    listhdun = ap.io.fits.HDUList([hdun])
                    listhdun.writeto(pathsave, overwrite=True)
                gdat.dictmetr[strgvarb][0, 0, t, i] = metr[-1, 0, 0]
                gdat.dictmetr[strgvarb][1, 0, t, i] = metr[-1, 1, 0]
                gdat.dictmetr[strgvarb][0, 1, t, i] = metr[-1, 0, 1]
                gdat.dictmetr[strgvarb][1, 1, t, i] = metr[-1, 1, 1]
                gdat.dictmetr[strgvarb][0, 2, t, i] = metr[-1, 0, 2]
                gdat.dictmetr[strgvarb][1, 2, t, i] = metr[-1, 1, 2]
    alph = 0.5
    # plot the resulting metrics
    for o, strgvarb in enumerate(gdat.liststrgvarb):
        if o == gdat.maxmindxvarb:
            break
        if len(gdat.indxvalu[o]) == 1:
            continue
        for l, strgmetr in enumerate(gdat.liststrgmetr):
            figr, axis = plt.subplots()  # figr unused
            for r in gdat.indxrtyp:
                yerr = np.zeros((2, gdat.numbvalu[o]))
                if r == 0:
                    colr = 'b'
                else:
                    colr = 'g'
                indx = []
                ydat = np.zeros(gdat.numbvalu[o]) - 1.
                for i in gdat.indxvalu[o]:
                    indx.append(np.where(gdat.dictmetr[strgvarb][r, l, :, i] != -1)[0])
                    if indx[i].size > 0:
                        ydat[i] = np.mean(gdat.dictmetr[strgvarb][r, l, indx[i], i], axis=0)
                        yerr[0, i] = ydat[i] - np.percentile(gdat.dictmetr[strgvarb][r, l, indx[i], i], 5.)
                        yerr[1, i] = np.percentile(gdat.dictmetr[strgvarb][r, l, indx[i], i], 95.) - ydat[i]
                temp, listcaps, temp = axis.errorbar(gdat.listvalu[strgvarb], ydat, yerr=yerr,
                                                     label=gdat.listlablrtyp[r], capsize=10, marker='o', \
                                                     ls='', markersize=10, lw=3, alpha=alph, color=colr)
                for caps in listcaps:
                    caps.set_markeredgewidth(3)
                for t in gdat.indxruns:
                    axis.plot(gdat.listvalu[strgvarb], gdat.dictmetr[strgvarb][r, l, t, :],
                              marker='D', ls='', markersize=5, alpha=alph, color=colr)
            #axis.set_ylim([-0.1, 1.1])
            labl = strgvarb  # fallback so labl is always defined
            if strgvarb == 'numbphas':
                labl = r'$N_{time}$'
            if strgvarb == 'dept':
                labl = r'$\delta$'
            if strgvarb == 'nois':
                labl = r'$\sigma$'
            if strgvarb == 'numbdata':
                labl = r'$N_{data}$'
            if strgvarb == 'fracplan':
                labl = r'$f_{p}$'
            if strgvarb == 'numbdatabtch':
                labl = r'$N_{db}$'
            if strgvarb == 'numbdimslayr':
                labl = r'$N_{dens}$'
            if strgvarb == 'fracdrop':
                labl = r'$f_D$'
            axis.set_ylabel(gdat.listlablmetr[l])
            axis.set_xlabel(labl)
            if strgvarb in ['numbdata', 'numbphas', 'dept', 'nois', 'numbdimslayr', 'numbdatabtch']:
                axis.set_xscale('log')
            plt.legend()
            plt.tight_layout()
            path = pathplot + strgvarb + strgmetr + '.pdf'
            plt.savefig(path)
            plt.close()
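# Hedged usage sketch: expl() reads CTHC_DATA_PATH from the environment, so it must be
# set before the call; '/tmp/cthc' is a hypothetical location, and the keyword values
# below simply restate the defaults.
if __name__ == '__main__':
    os.environ.setdefault('CTHC_DATA_PATH', '/tmp/cthc')
    expl(zoomtype='locl', phastype='flbn', datatype='simpmock')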
pool_size = 4
encoding_dim = 2
l1_param = 0.1
l2_param = 0.1
usetess = True
save_path = 'ileana_output_files/tess_data/'
if usetess:
    _, light_curves, labels, _, _, _ = exopmain.retr_datatess(True, boolplot=False)
else:
    dept = 1e-2
    nois = 1e-4
    light_curves, labels, _ = exopmain.retr_datamock(numbplan=100, numbnois=100,
                                                     dept=dept, nois=nois)
#plot_input_ts(light_curves, save_path)
nrow, ncol = light_curves.shape
light_curves = np.reshape(light_curves, (nrow, ncol, 1))
encoder, autoencoder, filename = model_cnn_autoencoder(
    ncol=ncol,
    no_filters=no_filters,
    kernel_size=kernel_size,
    pool_size=pool_size,
    encoding_dim=encoding_dim,
    activation_function='relu',
    verbose=True,
    l1_param=l1_param,
    l2_param=l2_param,
)  # the excerpt ends mid-call in the original; closed here with the arguments shown
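# For reference, a sketch of how l1_param/l2_param are commonly consumed inside a
# builder like model_cnn_autoencoder (an assumption about that helper, not its code):
# from keras import regularizers
# reg = regularizers.l1_l2(l1=l1_param, l2=l2_param)
# encoded = Dense(encoding_dim, activation='relu', activity_regularizer=reg)(x)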
def explore(dataclass, modelfunc, datatype='here'):
    '''
    Function to explore the effect of hyper-parameters (and data properties for mock
    data) on binary classification metrics
    '''
    # global object that will hold global variables
    # this can be wrapped in a function to allow for customization
    # initialize the data here
    gdat = dataclass
    ## time stamp string
    strgtimestmp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    # print ('CtC explorer initialized at %s.' % strgtimestmp)
    ## path where plots will be generated
    pathplot = os.environ['TDGU_DATA_PATH'] + '/'
    # print ('Will generate plots in %s' % pathplot)
    """
    # detect names of devices, disabled for the moment
    from tensorflow.python.client import device_lib
    listdictdevi = device_lib.list_local_devices()
    print ('Names of the devices detected: ')
    for dictdevi in listdictdevi:
        print (dictdevi.name)
    """
    # temp
    gdat.maxmindxvarb = 10
    # for each run
    for t in gdat.indxruns:
        # print ('Run index %d' % t)
        # do the training for the central value
        # temp -- current implementation repeats running of the central point
        #metr = gdat.retr_metr()
        # for each variable
        for o, strgvarb in enumerate(gdat.liststrgvarb):
            if o == gdat.maxmindxvarb:
                break
            # print ('Processing variable %s...' % strgvarb)
            # for each value
            for i in gdat.indxvalu[o]:
                pathsave = pathplot + '%04d%04d%04d.fits' % (t, o, i)
                # temp
                if False and os.path.exists(pathsave):
                    # print ('Reading %s...' % pathsave)
                    listhdun = ap.io.fits.open(pathsave)
                    metr = listhdun[0].data
                else:
                    # reset every variable to its central value ...
                    for strgvarbtemp in gdat.liststrgvarb:
                        setattr(gdat, strgvarbtemp,
                                gdat.listvalu[strgvarbtemp][int(gdat.numbvalu[o] / 2)])
                    # ... then set the variable of interest to its current value
                    setattr(gdat, strgvarb, gdat.listvalu[strgvarb][i])
                    for strgvarbtemp in gdat.liststrgvarb:
                        print('strgvarbtemp, ', strgvarbtemp,
                              ' gdat.strgvarbtemp, ', getattr(gdat, strgvarbtemp))
                    gdat.numbplan = int(gdat.numbdata * gdat.fracplan)
                    gdat.numbnois = gdat.numbdata - gdat.numbplan
                    gdat.indxtime = np.arange(gdat.numbtime)
                    gdat.indxdata = np.arange(gdat.numbdata)
                    gdat.indxlayr = np.arange(gdat.numblayr)
                    # number of test data samples
                    gdat.numbdatatest = int(gdat.numbdata * gdat.fractest)
                    # number of training data samples
                    gdat.numbdatatran = gdat.numbdata - gdat.numbdatatest
                    # number of signal data samples
                    numbdataplan = int(gdat.numbdata * gdat.fracplan)
                    if datatype == 'here':
                        gdat.inpt, gdat.outp = exopmain.retr_datamock(
                            numbplan=gdat.numbplan, numbnois=gdat.numbnois,
                            numbtime=gdat.numbtime, dept=gdat.dept, nois=gdat.nois)
                    if datatype == 'ete6':
                        gdat.inpt, gdat.outp = exopmain.retr_ete6()
                    # print ('Beginning')
                    # print ('gdat.inpt\n', gdat.inpt.shape)
                    """
                    # plot
                    figr, axis = plt.subplots()  # figr unused
                    for k in gdat.indxdata:
                        if k < 10:
                            if gdat.outp[k] == 1:
                                colr = 'r'
                            else:
                                colr = 'b'
                            axis.plot(gdat.indxtime, gdat.inpt[k, :], marker='o',
                                      ls='-', markersize=5, alpha=0.6, color=colr)
                    plt.tight_layout()
                    plt.xlabel('time')
                    plt.ylabel('data-input')
                    plt.title('input vs time')
                    plt.legend()
                    path = pathplot + 'inpt_%04d%s%04d' % (t, strgvarb, i) + strgtimestmp + '.pdf'
                    plt.savefig(path)
                    plt.close()
                    """
                    # divide the data set into training and test data sets
                    numbdatatest = int(gdat.fractest * gdat.numbdata)
                    gdat.inpttest = gdat.inpt[:numbdatatest, :]
                    gdat.outptest = gdat.outp[:numbdatatest]
                    gdat.inpttran = gdat.inpt[numbdatatest:, :]
                    gdat.outptran = gdat.outp[numbdatatest:]
                    gdat.modl = modelfunc(gdat)
                    # temp -- this runs the central value redundantly and can be sped up
                    # by only running the central value once for all variables
                    # do the training for the specific value of the variable of interest
                    metr = retrmetr(gdat, i, strgvarb)
                """
                # save to the disk
                hdun = ap.io.fits.PrimaryHDU(metr)
                listhdun = ap.io.fits.HDUList([hdun])
                listhdun.writeto(pathsave, overwrite=True)
                """
                gdat.dictmetr[strgvarb][0, 0, t, i] = metr[-1, 0, 0]
                gdat.dictmetr[strgvarb][1, 0, t, i] = metr[-1, 1, 0]
                gdat.dictmetr[strgvarb][0, 1, t, i] = metr[-1, 0, 1]
                gdat.dictmetr[strgvarb][1, 1, t, i] = metr[-1, 1, 1]
                gdat.dictmetr[strgvarb][0, 2, t, i] = metr[-1, 0, 2]
                gdat.dictmetr[strgvarb][1, 2, t, i] = metr[-1, 1, 2]
    return strgtimestmp
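# Shape sketch of the bookkeeping above (dummy values): retrmetr is assumed to return
# an array indexed as metr[epoch, rtyp, metric], with metric in {prec, accu, reca};
# dictmetr keeps only the final epoch for each run and variable value.
if __name__ == '__main__':
    metr = np.zeros((20, 2, 3))        # epochs x rtyp x metric
    dictmetr = np.empty((2, 3, 1, 5))  # rtyp x metric x runs x values
    for r in range(2):
        for l in range(3):
            dictmetr[r, l, 0, 0] = metr[-1, r, l]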
def run_through_puts(dataclass, modelfunc, datatype='here'):
    '''
    Function to explore the effect of hyper-parameters (and data properties for mock
    data) on binary classification metrics
    '''
    # global object that will hold global variables
    # this can be wrapped in a function to allow for customization
    # initialize the data here
    gdat = dataclass
    ## time stamp string
    strgtimestmp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    # print ('CtC explorer initialized at %s.' % strgtimestmp)
    ## path where plots will be generated
    pathplot = os.environ['TDGU_DATA_PATH'] + '/'
    # temp
    gdat.maxmindxvarb = 10
    # for each run
    for t in gdat.indxruns:
        # for each variable
        for o, strgvarb in enumerate(gdat.liststrgvarb):
            if o == gdat.maxmindxvarb:
                break
            pr_points = []
            # for each value
            for i in gdat.indxvalu[o]:
                # reset every variable to its central value ...
                for strgvarbtemp in gdat.liststrgvarb:
                    setattr(gdat, strgvarbtemp,
                            gdat.listvalu[strgvarbtemp][int(gdat.numbvalu[o] / 2)])
                # ... then set the variable of interest to its current value
                setattr(gdat, strgvarb, gdat.listvalu[strgvarb][i])
                # for strgvarbtemp in gdat.liststrgvarb:
                #     print (strgvarb, getattr(gdat, strgvarb))
                gdat.numbplan = int(gdat.numbdata * gdat.fracplan)
                gdat.numbnois = gdat.numbdata - gdat.numbplan
                gdat.indxtime = np.arange(gdat.numbtime)
                gdat.indxdata = np.arange(gdat.numbdata)
                gdat.indxlayr = np.arange(gdat.numblayr)
                # number of test data samples
                gdat.numbdatatest = int(gdat.numbdata * gdat.fractest)
                # number of training data samples
                gdat.numbdatatran = gdat.numbdata - gdat.numbdatatest
                # number of signal data samples
                numbdataplan = int(gdat.numbdata * gdat.fracplan)
                if datatype == 'here':
                    gdat.inpt, gdat.outp = exopmain.retr_datamock(
                        numbplan=gdat.numbplan, numbnois=gdat.numbnois,
                        numbtime=gdat.numbtime, dept=gdat.dept, nois=gdat.nois)
                if datatype == 'ete6':
                    gdat.inpt, gdat.outp = exopmain.retr_ete6()
                # divide the data set into training and test data sets
                numbdatatest = int(gdat.fractest * gdat.numbdata)
                gdat.inpttest = gdat.inpt[:numbdatatest, :]
                gdat.outptest = gdat.outp[:numbdatatest]
                gdat.inpttran = gdat.inpt[numbdatatest:, :]
                gdat.outptran = gdat.outp[numbdatatest:]
                gdat.modl = modelfunc(gdat)
                # precision, recall = Precision_Recall(gdat)
                # pr_points.append((precision, recall))
                pr_points = metrics_vary_thresh(gdat)
                figr, axis = plt.subplots()
                axis.plot([p[0] for p in pr_points], [p[1] for p in pr_points],
                          marker='o', ls='', markersize=5, alpha=0.6)
                plt.tight_layout()
                plt.xlabel('Recall')
                plt.ylabel('Precision')
                plt.title('Precision v Recall, {0}{1}'.format(
                    str(strgvarb), str(getattr(gdat, strgvarb))))
                # plt.legend()
                path = pathplot + 'PvR_{0}_{1}{2}_'.format(
                    t, strgvarb, getattr(gdat, strgvarb)) + strgtimestmp + '.pdf'
                plt.savefig(path)
                plt.close()
    return strgtimestmp
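# A minimal sketch of the threshold sweep that metrics_vary_thresh is assumed to
# perform (hypothetical reimplementation, not the actual helper): sweep the decision
# threshold over predicted probabilities and collect (precision, recall) pairs.
def pr_points_sketch(ytrue, yprob, threshs=np.linspace(0.05, 0.95, 19)):
    points = []
    for th in threshs:
        ypred = (yprob >= th).astype(int)
        tp = np.sum((ypred == 1) & (ytrue == 1))
        fp = np.sum((ypred == 1) & (ytrue == 0))
        fn = np.sum((ypred == 0) & (ytrue == 1))
        if tp + fp == 0 or tp + fn == 0:
            continue  # precision/recall undefined at this threshold
        points.append((tp / (tp + fp), tp / (tp + fn)))
    return points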
# Excerpt from inside a larger data-preparation routine: the 'if' branch this code
# belongs to (mock-data generation) opens before the excerpt; the 'else' branch below
# falls back to TESS data.
    labltrue = np.zeros(numbdata)
    fact = np.zeros(numbdata)
    s2nr = np.zeros(numbrele)
    deptthis = np.zeros(numbrele)
    for k in indxdata:
        fact[k] = (1. + 2. * np.random.random()) * stdvflux
        flux[k, :] = 1. + fact[k] * np.random.randn(numbbins)
        if k < numbrele:
            # inject a box transit of random duration into the relevant light curves
            numbtran = np.random.randint(1, 16)  # np.random.random_integers is deprecated
            indxtran = np.arange(numbbins // 2 - numbtran // 2,
                                 numbbins // 2 + numbtran // 2 + 1)
            deptthis[k] = dept * (1. + np.random.rand())
            flux[k, indxtran] -= deptthis[k]
            # S/N = depth / scatter * sqrt(number of in-transit bins)
            s2nr[k] = deptthis[k] / fact[k] * np.sqrt(indxtran.size)
    labltrue[indxrele] = 1.
    gdat.inptraww, gdat.outp, gdat.peri = exopmain.retr_datamock(numbplan=gdat.numbrele, \
            numbnois=gdat.numbirre, numbtime=gdat.numbtime, dept=gdat.dept,
            nois=gdat.nois, boolflbn=True)
else:
    meanphas, flux, labltrue, legdoutp, tici, itoi = exopmain.retr_datatess(False)
# replace non-finite flux values with Gaussian noise
indxbadd = np.where(~np.isfinite(flux))[0]
print('indxbadd')
summgene(indxbadd)
print('flux')
summgene(flux)
flux[indxbadd] = np.random.randn(indxbadd.size)
print('meanphas')
summgene(meanphas)
print('flux')
summgene(flux)
#imp = Imputer(strategy="mean", axis=0)
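# Worked check of the S/N formula above: a transit of depth d spanning n in-transit
# bins with per-bin scatter sigma has S/N = (d / sigma) * sqrt(n).
if __name__ == '__main__':
    dept_demo, stdv_demo, numb_demo = 0.01, 0.002, 16
    print(dept_demo / stdv_demo * np.sqrt(numb_demo))  # -> 20.0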
import sys

import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
from autoencoder import get_latent_vars
sys.path.append('/Users/ruginaileana/src')
from exop import main as exopmain
from binary_classification_helper import find_km_clusters, plot_confusion_matrix

visualize = True
run_all = True
dimensionality_reduction = "PCA"  # can also be "MDS" or "autoencoder"
lower_dimensionality = 1  # only applies to PCA; the autoencoder always uses 2 latent variables
#get data
light_curves, labels = exopmain.retr_datamock()
light_curves = np.array(light_curves)

########################################################################
############### PLOTS TO SEE WHAT PCA AND AUTOENCODER DO ###############
########################################################################
# if run_all:
#     dimensionality_reduction = "PCA"
#     lower_dimensionality = 1
# if dimensionality_reduction == "PCA" and lower_dimensionality == 1:
#     pca = PCA(n_components=lower_dimensionality)
#     proj = pca.fit_transform(light_curves)
#     clusters = find_km_clusters(proj)
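# Hedged, runnable version of the commented-out PCA pipeline above, on synthetic data
# (the real script feeds in exopmain light curves); plain sklearn KMeans stands in for
# the find_km_clusters helper, under the assumption that it wraps something similar.
if __name__ == '__main__':
    demo_curves = np.random.randn(200, 50)
    proj = PCA(n_components=lower_dimensionality).fit_transform(demo_curves)
    clusters = KMeans(n_clusters=2, n_init=10).fit_predict(proj)
    demo_labels = np.random.randint(0, 2, size=200)  # dummy labels for illustration
    print(confusion_matrix(demo_labels, clusters))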