def test_empty_csv(self, tmpdir):
    file_name = 'empty.csv'
    csv_file = tmpdir.join(file_name)
    csv_file.ensure()
    with pytest.raises(Exception):
        utils.load_table(str(csv_file), 'empty')

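# A hedged companion sketch (not from the original suite): the happy path for
# the same utils.load_table(path, table_name) call, using the tmpdir pattern
# shown above. The CSV header and row here are made up for illustration.
def test_simple_csv(self, tmpdir):
    csv_file = tmpdir.join('simple.csv')
    csv_file.write('a,b\n1,2\n')
    utils.load_table(str(csv_file), 'simple')
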
def load_table(self, tmpdir, csv_str, table_name, **kwargs):
    file_name = '{}.csv'.format(table_name)
    csv_file = tmpdir.join(file_name)
    csv_file.write(csv_str)
    utils.load_table(str(csv_file), table_name, **kwargs)
    self.meta.reflect()

def apify(filename, tablename=None):
    tablename = tablename or utils.get_name(filename)
    logger.info('Importing {0} to table {1}'.format(filename, tablename))
    utils.drop_table(tablename)
    utils.load_table(filename, tablename)
    utils.index_table(tablename, config.CASE_INSENSITIVE)
    logger.info('Finished importing {0}'.format(filename))

def gather_fit_data(fit_count, maxfev):
    fn_format = "fits.fit_count={fit_count}_maxfev={maxfev}.txt"
    try:
        return utils.load_table(fn_format.format(**locals()))
    except OSError:
        pass
    d = utils.filter_preferred_ml(utils.load_all())
    d = d[~d["method"].isin(["imsrg[f]+eom[n]"])]
    results = []
    with multiprocessing.Pool(4) as p:
        results = p.map(
            functools.partial(gather_fit_data_inner,
                              fit_count=fit_count,
                              maxfev=maxfev),
            tuple(d.groupby(
                ["label", "interaction", "num_filled", "freq", "method"])))
    d = pd.DataFrame.from_records(itertools.chain(*results))
    print("{} fits failed, out of {}".format(
        (d["fit_method"] == "fixedab").sum(), len(d)))
    # fit_count=5:
    #   maxfev=default: 198 fits failed, out of 2247
    #   maxfev=10k: 40 fits failed, out of 2248
    #   maxfev=100k: 0 fits failed
    cols = """
    interaction
    label
    freq
    num_filled
    method
    best_chisq
    best_coefficient
    best_coefficient_err
    best_constant
    best_constant_err
    best_fixedab_constant_err
    best_exponent
    best_exponent_err
    best_fit_method
    best_fit_stop
    chisq
    coefficient
    coefficient_err
    constant
    constant_err
    fixedab_constant_err
    exponent
    exponent_err
    fit_method
    fit_stop
    rel_discrep
    rel_discrep_err
    rel_dist
    rel_dist_err
    """.split()
    assert len(d.columns) == len(cols)
    utils.save_table(fn_format.format(**locals()), d[cols])
    return d

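# A minimal self-contained sketch of the Pool/partial-over-groupby pattern used
# in gather_fit_data above, with made-up data; `_inner` stands in for the real
# gather_fit_data_inner, which is not shown in this snippet.
import functools
import itertools
import multiprocessing

import pandas as pd

def _inner(group, scale=1):
    # DataFrame.groupby iterates as (key, subframe) pairs, which is why the
    # worker receives a tuple rather than a bare frame.
    key, frame = group
    return [{"key": key, "total": scale * frame["x"].sum()}]

if __name__ == "__main__":
    df = pd.DataFrame({"key": ["a", "a", "b"], "x": [1, 2, 3]})
    with multiprocessing.Pool(2) as p:
        results = p.map(functools.partial(_inner, scale=10),
                        tuple(df.groupby("key")))
    print(pd.DataFrame.from_records(itertools.chain(*results)))
    #   key  total
    # 0   a     30
    # 1   b     30
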
def _compute_denseSeq(self, det_file):
    '''
    Function for computing all dense sequences given the detections.
    Only used during initialization (__init__).

    Input:
        det_file: <str>, name of and path to the '.csv' file with all
            detections (required field names:
            {'videos', 'frames', 'x1', 'y1', 'x2', 'y2'})
    Output:
        self.seq_dense: <list of dictionaries, size: #videos>
            contains one dictionary per video with all information needed
            for loading the sequences during the training/testing phase
    '''
    # the detections are given in an excel or csv file
    detections = load_table(det_file, asDict=False)
    self.videos = np.unique(detections['videos'].values)
    self.N_vids = len(self.videos)
    self.seq_dense = []  # one dictionary per video
    for v in range(self.N_vids):
        frames, coords, group, orig_size = get_vid_info(
            detections, self.videos[v], self.crops_saved)
        # some detections in between might be missing, so find the gaps
        gap = np.where(((frames[1:] - frames[:-1]) > 1) * 1)[0]
        if len(gap) == 0:
            # all frames are consecutive
            connectSeq = [range(len(frames))]
        else:
            # get all connected sequences
            connectSeq = [range(gap[0] + 1)]  # first sequence
            connectSeq.extend([
                range(gap[i] + 1, gap[i + 1] + 1)
                for i in range(len(gap) - 1)
            ])  # all sequences in between
            connectSeq.extend([range(gap[-1] + 1, len(frames))])  # last sequence
        # create the dense sequences
        idx_dense = []
        for i in range(len(connectSeq)):
            seq = connectSeq[i]
            #idx_dense.extend([range(seq[i], seq[i]+self.F) for i in range(len(seq)-self.F+1)])
            idx_dense.extend([
                range(seq[i], seq[i] + self.skip_frames * self.F,
                      self.skip_frames)
                for i in range(
                    len(seq) - (self.F * self.skip_frames) + self.skip_frames)
            ])
        # save all information for the dense sequences
        idx_dense = np.array(idx_dense)
        frames_dense = frames[idx_dense]
        coords_dense = coords[idx_dense]
        group_dense = group[idx_dense]
        seq_dense_v = {
            'video': self.videos[v],
            'group': group_dense,
            'frames': frames_dense,
            'coords': coords_dense,
            'orig_size': orig_size
        }
        self.seq_dense.append(seq_dense_v)

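# A small numpy illustration of the gap-splitting step in _compute_denseSeq:
# frames with missing detections in between are broken into runs of
# consecutive indices (the example frame numbers are made up).
import numpy as np

frames = np.array([3, 4, 5, 9, 10, 11, 12, 20])
gap = np.where((frames[1:] - frames[:-1]) > 1)[0]
if len(gap) == 0:
    connectSeq = [range(len(frames))]
else:
    connectSeq = [range(gap[0] + 1)]
    connectSeq.extend([range(gap[i] + 1, gap[i + 1] + 1)
                       for i in range(len(gap) - 1)])
    connectSeq.extend([range(gap[-1] + 1, len(frames))])
print([list(frames[list(s)]) for s in connectSeq])
# -> [[3, 4, 5], [9, 10, 11, 12], [20]]
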
def load(self):
    logger.info('###################### load data #######################')
    for pair in self.pairs:
        logger.info('[load] =====%s====' % pair)
        dst_table = pair['dst_table']
        table_file_path = os.path.join(self.data_dir, dst_table + '.bin')
        tmp_table = 'tmp_' + dst_table + '_for_sync'
        # load data
        utils.load_table(
            self.dst_db, self.dst_cursor, tmp_table, table_file_path)
        logger.info('[load] from [file:{file_path}] to '
                    '[db:{db}-table:{table}]'.format(
                        file_path=table_file_path,
                        db=self.dst_db,
                        table=tmp_table))
        # postload
        custom_table = self.customs.get(dst_table, None)
        if custom_table and getattr(custom_table, 'postload', None):
            custom_table.postload(self.dst_cursor)

def apify(filename, tablename=None):
    try:
        filenames = glob.glob(filename, recursive=True)
    except TypeError:  # recursive glob in Python 3.5+ only
        filenames = glob.glob(filename)
    if len(filenames) > 1 and tablename:
        raise Exception("Can't specify a `tablename` for >1 file")
    for filename in filenames:
        _tablename = tablename or utils.get_name(filename)
        logger.info('Importing {0} to table {1}'.format(filename, _tablename))
        try:
            utils.drop_table(_tablename)
        except sa.exc.OperationalError as e:
            logger.debug(
                'DROP TABLE {} failed, may not exist?'.format(_tablename))
            logger.debug(str(e))
        try:
            utils.load_table(filename, _tablename)
        except Exception as e:
            logger.error('Failed to load table from file {}'.format(filename))
            logger.error(str(e))
        logger.info('Finished importing {0}'.format(filename))

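# Hypothetical usage sketch for the glob-aware apify above; the paths are
# invented. A recursive pattern imports one table per matching file, and
# `tablename` is only legal when the pattern matches a single file.
apify('data/**/*.csv')                        # table names via utils.get_name
apify('data/cities.csv', tablename='cities')  # explicit name, single file
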
def setup(self, workpath, datapath, jss, datapathtype, format_index):
    self.jss = jss
    self.dirname = workpath
    self.datapath = datapath
    self.datapathtype = datapathtype
    self.data_format = self.namespace['data_format'][format_index]
    # setup job center
    self.JobCenter = jobc.JobCenter(self.jss, self.dirname,
                                    self.data_format, self)
    # load table
    self.process_data = utils.load_table(
        os.path.join(self.dirname, self.namespace['project_structure'][0]))
    # load table change log, if one exists
    self.rawdata_changelog = utils.load_changelog(
        os.path.join(self.dirname, self.namespace['project_structure'][0]))
    # set up tag_buffer
    self.tag_buffer = {}
    for assm in self.namespace['process_assignments']:
        self.tag_buffer[assm] = {}
    # write jss to UI
    if self.jss is not None:
        self.ui.comboBox_2.addItem(self.jss)
    # setup all tabWidgets
    # setup process
    for assm in self.namespace['process_assignments']:
        self.ui.comboBox.addItem(assm)
    self.ui.tableWidget.horizontalHeader().setResizeMode(
        QtGui.QHeaderView.Interactive)
    self.ui.label_73.setText(utils.fmt_process_status(self.data_format))
    self.ui.pushButton_2.setVisible(False)
    # setup classify
    for decp in self.namespace['classify_decomp']:
        self.ui.comboBox_3.addItem(decp)
    self.ui.lineEdit.setText(os.path.join(
        os.path.join(self.dirname, self.namespace['project_structure'][0]),
        self.namespace['process_HF']))
    self.ui.lineEdit_2.setText("Hits/data")
    self.ui.widget_12.setVisible(False)
    self.ui.widget_11.setVisible(False)
    # setup merge
    for sym in self.namespace['merge_sym']:
        self.ui.comboBox_5.addItem(sym)
    # setup phasing
    for method in self.namespace['phasing_method']:
        self.ui.comboBox_9.addItem(method)
        self.ui.comboBox_10.addItem(method)
        self.ui.comboBox_11.addItem(method)
    # setup monitors
    self.table_monitor = None
    # draw table
    self.refresh_table()

def __init__(self, opt):
    self.data_path = opt.Paths['img']
    table = load_table(opt.Paths['detections'], asDict=False)
    #np.load(opt.Paths['dic'], encoding="latin1").item()
    self.info = {'videos': table['videos'].values,
                 'frames': table['frames'].values}

    ### Removing missing crops
    # detections_iterator = tqdm(zip(self.info['videos'], self.info['frames']),
    #                            desc='Validate detections')
    # selection = [os.path.exists('%s%s/%06d.jpg' % (self.data_path, v, f))
    #              for i, (v, f) in enumerate(detections_iterator)]
    # self.info['videos'] = self.info['videos'][selection]
    # self.info['frames'] = self.info['frames'][selection]

    ### Determine length of our training set
    uni_videos = np.unique(self.info['videos'])
    if opt.Training['size'] is None:
        self.length = len(self.info['frames'])
    else:
        per_video = int(opt.Training['size'] / len(uni_videos))
        #self.length = min(opt.Training['size'], len(self.info['frames']))
        selection = [np.where(self.info['videos'] == v)[0][:per_video]
                     for v in uni_videos]
        #np.random.permutation(len(self.info['frames']))
        selection = np.concatenate(selection)
        self.info['videos'] = self.info['videos'][selection]
        self.info['frames'] = self.info['frames'][selection]
        self.length = len(self.info['frames'])

    ### Load fc6 features into the RAM
    #self.fc6 = np.zeros((self.length, opt.Network['feature_size']))
    data_iter = tqdm(uni_videos, position=2)
    data_iter.set_description('Load Posture Representation')
    self.fc6 = []
    for i, v in enumerate(data_iter):
        frames = self.info['frames'][self.info['videos'] == v]
        features_file = np.load(opt.Paths['fc6'] + v + '.npz')
        selection = [np.where(features_file['frames'] == frame)[0][0]
                     for frame in frames if frame in features_file['frames']]
        self.fc6.append(features_file['fc6'][selection])
        # self.fc6.append(features_file['fc6'])
    self.fc6 = np.concatenate(self.fc6, 0)
    assert len(self.fc6.shape) == 2, 'Features not properly concatenated'
    assert self.fc6.shape[0] == self.info['videos'].shape[0], \
        'Wrong number of features loaded: %d - %d' % (
            self.info['videos'].shape[0], self.fc6.shape[0])

def __init__(self, opt):
    self.data_path = opt.Paths['img']
    self.frame_path = opt.Paths['frame_path']
    table = load_table(opt.Paths['detections'], asDict=False)
    self.info = {'videos': table['videos'].values,
                 'frames': table['frames'].values}

    ### Determine length of our training set
    uni_videos = np.unique(self.info['videos'])
    if opt.Training['size'] is None:
        self.length = len(self.info['frames'])
    else:
        per_video = int(opt.Training['size'] / len(uni_videos))
        #self.length = min(opt.Training['size'], len(self.info['frames']))
        selection = [np.where(self.info['videos'] == v)[0][:per_video]
                     for v in uni_videos]
        #np.random.permutation(len(self.info['frames']))
        selection = np.concatenate(selection)
        self.info['videos'] = self.info['videos'][selection]
        self.info['frames'] = self.info['frames'][selection]
        self.length = len(self.info['frames'])

    ### Load fc6 features into the RAM
    data_iter = tqdm(uni_videos, position=2)
    data_iter.set_description('Load Posture Representation')
    self.fc6 = []
    for i, v in enumerate(data_iter):
        frames = self.info['frames'][self.info['videos'] == v]
        features_file = np.load(opt.Paths['fc6'] + v + '.npz')
        selection = [np.where(features_file['frames'] == frame)[0][0]
                     for frame in frames if frame in features_file['frames']]
        self.fc6.append(features_file['fc6'][selection])
        # self.fc6.append(features_file['fc6'])
    self.fc6 = np.concatenate(self.fc6, 0)
    assert len(self.fc6.shape) == 2, 'Features not properly concatenated'
    assert self.fc6.shape[0] == self.info['videos'].shape[0], \
        'Wrong number of features loaded: %d - %d' % (
            self.info['videos'].shape[0], self.fc6.shape[0])

def load_full_fit_data(fit_count=DEFAULT_FIT_COUNT, maxfev=DEFAULT_MAXFEV):
    '''Load fit data from file if available. Otherwise calculate the fits.'''
    fn = "fit_data.fit_count={fit_count}_maxfev={maxfev}.txt".format(
        **locals())
    try:
        return utils.load_table(fn)
    except OSError:
        pass
    sys.stderr.write("Fit data has not yet been calculated. "
                     "This may take a few minutes...\n")
    sys.stderr.flush()
    d = utils.filter_preferred_ml(utils.load_all())
    d = d[~d["method"].isin(["imsrg[f]+eom[n]"])]
    with multiprocessing.Pool(4) as p:
        results_s, missing_num_shells = zip(*p.map(
            functools.partial(
                gather_fit_data, fit_count=fit_count, maxfev=maxfev),
            tuple(d.groupby(
                ["label", "interaction", "num_filled", "freq", "method"]))))
    results = itertools.chain(*results_s)
    missing_fn = ("fits_missing_points."
                  "fit_count={fit_count}_maxfev={maxfev}.log".format(
                      **locals()))
    utils.save_table(missing_fn.format(**locals()),
                     pd.DataFrame.from_records(missing_num_shells))
    sys.stderr.write("Missing data points logged to: {}\n".format(missing_fn))
    sys.stderr.flush()
    d = pd.DataFrame.from_records(results)
    num_failed = (d["fit_method"] == "fixedab").sum()
    if num_failed:
        sys.stderr.write("{} out of {} fits failed\n".format(
            num_failed, len(d)))
        sys.stderr.flush()
    utils.save_table(fn, d)
    return d

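# A generic sketch of the load-or-compute caching idiom used by
# load_full_fit_data above, assuming a plain pandas CSV round-trip instead of
# utils.load_table/save_table (whose on-disk format is not shown here).
import pandas as pd

def cached_table(fn, compute):
    # Serve the cached file if present; otherwise compute, cache, and return.
    try:
        return pd.read_csv(fn)
    except OSError:  # FileNotFoundError is a subclass of OSError
        d = compute()
        d.to_csv(fn, index=False)
        return d
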
        D[j, :] = d[:100 * k]
    return D, I


# if __name__ == "__main__":
############################################
# 1. Load Features
############################################
results_fold = '%s/similarity/nearestNeighbor/%s/' % (cfg.results_path,
                                                      args.feature_type)
if not os.path.exists(results_fold):
    os.makedirs(results_fold)

#videos = load_table(cfg.video_path+'/vid_list.csv', index=None, asDict=True).values()
#if len(videos) > 1: videos = videos[1].values()
detections = load_table(cfg.detection_file, asDict=False)
uni_videos = np.unique(detections['videos'].values)

if 'fc6' in args.feature_type or 'fc7' in args.feature_type:
    print('Chosen features: "%s". Compute %i nearest neighbors of %i '
          'randomly chosen postures. The results will be saved in "%s".'
          % (args.feature_type, args.nn_per_query, args.number_of_queries,
             results_fold))
elif 'lstm' in args.feature_type:
    print('Chosen features: "%s". Compute %i nearest neighbors of %i '
          'randomly chosen sequences. The results will be saved in "%s".'
          % (args.feature_type, args.nn_per_query, args.number_of_queries,
             results_fold))
else:
    raise ValueError(
        'Chosen features (%s) are not available. Please choose "fc6", "fc7" '
        'or "fc6fc7" for posture features or "lstm" for sequence features.'
        % args.feature_type)

video_files = []
for root, dirnames, filenames in os.walk(cfg.video_path):
    video_files.extend(glob(root + "/*." + cfg.video_format))

for vid in video_files:
    vid_list = vid_list.append(
        {'videos': vid[len(cfg.video_path):-(len(cfg.video_format) + 1)]},
        ignore_index=True)
vid_list.to_csv(cfg.video_path + '/vid_list.csv')

############################################
# 1. Extract the frames
############################################
# load the cropping coordinates in dictionary format
frames_crop = load_table(cfg.frames_crop, index='videos')

# find all video files in the given directory
video_files = []
for root, dirnames, filenames in os.walk(cfg.video_path):
    video_files.extend(glob(root + "/*." + cfg.video_format))

N = len(video_files)
print('Number of found videos: %i \n Start extracting frames...' % N)
for v in trange(N):
    vid_fullPath = video_files[v]
    vid = vid_fullPath[len(cfg.video_path):-(len(cfg.video_format) + 1)]
    # define output folder

def plot(fit_count=5, log=False, maxfev=0,
         plot_type="scatter", stat="err", hf=False):
    dorig = utils.filter_preferred_ml(utils.load_all())
    d_good = utils.load_table("fits_good.txt")
    d_good = d_good.groupby(
        ["interaction", "label", "freq", "num_filled", "method"]).first()
    d = gather_fit_data(fit_count=fit_count, maxfev=maxfev)
    d = d[d["interaction"] == "normal"]
    # d = d[d["label"] == "add"]
    # d = d[d["method"] == "imsrg"]
    # d = d[d["num_filled"] == 5]
    # d = d[d["freq"] == 0.1]
    d = d[d["fit_method"] != "fixedab"]
    doriggrps = dorig.groupby(
        ["interaction", "label", "freq", "num_filled", "method"])
    d["rel_constant_err"] = d["constant_err"] / d["constant"]
    d["rel_best_constant_err"] = d["best_constant_err"] / d["best_constant"]
    d["label_is_ground"] = d["label"] == "ground"
    d["good"] = d.apply(lambda r: d_good.loc[
        (r["interaction"], r["label"], r["freq"],
         r["num_filled"], r["method"])]["good"], axis=1)
    d["rel_chi"] = d["chisq"]**.5 / d["constant"]
    d["rel_reduced_chi"] = d["rel_chi"] / (fit_count - 3)
    d["rel_best_chisq"] = d["best_chisq"]**.5 / d["best_constant"]
    d["rel_best_reduced_chisq"] = d["rel_best_chisq"] / (fit_count - 3)
    d = d[(d["rel_best_reduced_chisq"] < 1e-6)]
    d["fixedab_with_hf"] = (
        ((d["fit_method"] == "fixedab") ==
         (d["method"].isin(["hf", "hf+qdpt3"]))) |
        (d["fit_method"] == "full"))
    color_col = "method"
    bin_transform = TRANSFORM_ID
    bin_transform = TRANSFORM_LOG_ABS
    if color_col in ["exponent", "rel_dist", "rel_constant_err",
                     "rel_best_constant_err", "rel_best_chisq",
                     "rel_chi", "rel_reduced_chi", "chi_ratio"]:
        num_bins = 16
        d = d.replace([np.inf, -np.inf], np.nan).dropna(subset=[color_col])
        binf = bin_transform[0]
        bininvf = bin_transform[1]
        color_bins = pd.cut(binf(abs(d[color_col])), num_bins)
        d["color_bin_start"] = color_bins.map(
            lambda bin: bininvf(parse_bin(bin)[0]))
        d["color_bin_stop"] = color_bins.map(
            lambda bin: bininvf(parse_bin(bin)[1]))
        color_bin_cols = ["color_bin_start", "color_bin_stop"]
    else:
        color_bin_cols = [color_col]
    max_bins = len(d[color_bin_cols[0]].unique())
    fig, ax = plt.subplots()

    def on_pick_event(event):
        x = list(event.artist.get_xdata()[event.ind])[0]
        y = list(event.artist.get_ydata()[event.ind])[0]
        sel = d[(abs(d["x"] - x) <= 1e-20) & (abs(d["y"] - y) <= 1e-20)]
        print(sel.transpose().to_csv())
        if len(sel) != 1:
            print('>> not found <<')
            return
        sel = sel.iloc[0]
        grp = doriggrps.get_group((sel["interaction"], sel["label"],
                                   sel["freq"], sel["num_filled"],
                                   sel["method"]))
        fig, ax = plt.subplots(2)
        ax[0].plot(grp["num_shells"], grp["energy"], "x")
        fit_start = sel["fit_stop"] + 1 - fit_count
        ax[0].axvspan(fit_start, sel["fit_stop"], color="#cccccc")
        xs = np.linspace(grp["num_shells"].min(), grp["num_shells"].max())
        ax[0].plot(xs, sel["coefficient"] * xs ** sel["exponent"]
                   + sel["constant"])
        subgrp = grp[grp["num_shells"].between(
            fit_start - 0.1, sel["fit_stop"] + 0.1)]
        last_constant = sel["constant"]
        last_constant_err = sel["constant_err"]

        def random_weight(count):
            weights = np.zeros(count)
            for i in range(count):
                weights[np.random.randint(0, count)] += 1
            return weights

        p0 = [sel["coefficient"], sel["exponent"], sel["constant"]]
        p = p0
        x = subgrp["num_shells"]
        y = subgrp["energy"]
        constants = []
        constants.append(p[2])
        print(f"x = np.array({list(x)})")
        print(f"y = np.array({list(y)})")
        ax[1].plot(x, (p[0] * x ** p[1] + p[2] - y), "-x")
        ax[1].axhline(0.0, linestyle=":")
        for i in range(10):
            count = len(x)
            weights = random_weight(count) + 1e-99
            if sum(weights > 0.1) <= 3:
                # can't fit with this few points
                continue
            try:
                p, cov = scipy.optimize.curve_fit(
                    lambda x, a, b, c: a * x ** b + c,
                    x, y, sigma=1.0 / weights ** 0.5,
                    p0=p0, maxfev=100000)
            except RuntimeError as e:
                print(e)
                continue
            chisq = np.average((p[0] * x ** p[1] + p[2] - y) ** 2,
                               weights=weights) * len(x)
            constant = p[2]
            constant_err = cov[2, 2] ** 0.5
            constants.append(p[2])
            last_constant = constant
            last_constant_err = constant_err
        print("result", np.mean(constants), np.std(constants))
        print("rel", np.std(constants) / np.mean(constants))
        ax[0].set_ylim([max(ax[0].get_ylim()[0], 0.0),
                        min(ax[0].get_ylim()[1], np.max(y))])
        ax[0].plot(xs, p[0] * xs ** p[1] + p[2], ":", color="lime")

    fig.canvas.mpl_connect("pick_event", on_pick_event)
    d["quality"] = np.log10(d["constant_err"] / d["chisq"]**0.5)
    # hf has unique behaviors (what about mp2?)
    if hf:
        d = d[d["method"] == "hf"]
    else:
        d = d[d["method"] != "hf"]
    if stat == "err":
        d = d[d["quality"] > 0]
    d["y"] = d["rel_discrep"] / d["rel_constant_err"]
    if stat == "hessian":
        d["x"] = d["quality"]
    elif stat == "err":
        d["x"] = np.log10(d["rel_constant_err"])
    else:
        assert False
    d["y_err"] = d["rel_discrep_err"] / d["rel_discrep"] * d["y"]
    d = d[(d["rel_constant_err"] > 0) & (d["rel_constant_err"] < np.inf)]
    if plot_type == "contour":
        if hf:
            hf_suffix = "HF"
        else:
            hf_suffix = "non-HF"
        # range = mesh; lim = view
        if stat == "err":
            if hf:
                nx = 20
                ny = 20
                xrange = (-7, -1)
                yrange = (-5, 5)
                xlim = (-6, -2)
                ylim = (-4, 4)
                title = ("discrepancy vs fit uncertainty "
                         f"({hf_suffix})")
            else:
                nx = 20
                ny = 40
                xrange = (-7, -1)
                yrange = (-50, 50)
                xlim = (-6, -2)
                ylim = (-40, 40)
                title = ("discrepancy vs fit uncertainty "
                         f"(filtered: Q > 0, {hf_suffix})")
            xlabel = r"$\log_{10}\left(\frac{\sigma_c}{c}\right)$"
            ylabel = r"$\frac{\varepsilon}{\sigma_c}$"
        elif stat == "hessian":
            if hf:
                nx = 20
                ny = 40
                xrange = (-0.6, 1.5)
                yrange = (-10, 10)
                xlim = (-0.4, 1.1)
                ylim = (-10, 10)
            else:
                nx = 20
                ny = 40
                xrange = (-0.6, 2.1)
                yrange = (-200, 200)
                xlim = (-0.4, 1.7)
                ylim = (-150, 150)
            title = ("spread of “actual” discrepancy vs quality "
                     f"({hf_suffix})")
            xlabel = (r"$Q = \log_{10}\left(\frac{\sigma_c}"
                      r"{\sqrt{\mathtt{RSS}}}\right)$")
            ylabel = r"$\frac{\varepsilon}{\sigma_c}$"
        else:
            assert False
        ax.plot(d["x"], d["y"], "o", markersize=1, picker=3,
                color="white", markeredgewidth=0)
        dx = (xrange[1] - xrange[0]) / (nx - 1)
        h, x, y = np.histogram2d(
            d["x"], d["y"], bins=(nx, ny - 1),
            range=((xrange[0] - 0.5 * dx, xrange[1] + 0.5 * dx), yrange))
        ch = np.concatenate([np.zeros((nx, 1)), np.cumsum(h, axis=1)], axis=1)
        z = ch / ch[..., -1, np.newaxis]
        x, y = np.meshgrid(0.5 * (x[1:] + x[:-1]), y, indexing="ij")
        levels = np.linspace(-2.0, 2.0, 5)
        levels = 0.5 + 0.5 * scipy.special.erf(2.0 ** -0.5 * levels)
        ax.axhline(-1.0, linestyle="--", color="white", linewidth=0.5)
        ax.axhline(1.0, linestyle="--", color="white", linewidth=0.5)
        ax.contour(x, y, z, levels=levels, colors="white", alpha=0.3)
        cs = ax.contourf(x, y, z, levels=np.linspace(0.0, 1.0, 300),
                         # note: translucent cmaps tend to cause artifacts
                         cmap=CMAP_FOLDED_VIRIDIS, linestyle=":")
        for c in cs.collections:
            # http://stackoverflow.com/a/32911283
            c.set_edgecolor("face")
        fig.colorbar(cs)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.set_xlim(xlim)
        ax.set_ylim(ylim)
    elif plot_type == "scatter":
        cmap = utils.CMAP_RAINBOW
        for i, (bin, g) in enumerate(sorted(d.groupby(color_bin_cols))):
            color = cmap(float(i) / max_bins)
            ax.plot(g["x"], g["y"], ".", label=str(bin), color=color,
                    markersize=10, markeredgewidth=0, alpha=0.5, picker=2)
        ax.legend()
    fn = f"fit-predictiveness-{plot_type}-{fit_count}-{stat}-{hf}"
    fig.tight_layout()
    utils.savefig(fig, fn)

#!/usr/bin/env python3
import functools, os, sys
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import utils

utils.init(__file__)

d = utils.load_table("compare_rel_slopes.txt")

# we want to use (interaction, freq, num_particles) as an axis,
# but matplotlib doesn't let us do that,
# so we have to use this awful hack
x_labels = [
    "'_sigmaA=0.5_sigmaB=4.0', 1.0, 6",
    "'_sigmaA=0.5_sigmaB=4.0', 1.0, 12",
    "'_sigmaA=0.5_sigmaB=4.0', 1.0, 20",
    "'_sigmaA=0.5_sigmaB=4.0', 0.28, 6",
    "'_sigmaA=0.5_sigmaB=4.0', 0.28, 12",
    "'_sigmaA=0.5_sigmaB=4.0', 0.28, 20",
    "'_sigmaA=0.5', 1.0, 6",
    "'_sigmaA=0.5', 1.0, 12",
    "'_sigmaA=0.5', 1.0, 20",
    "'_sigmaA=0.5', 0.28, 6",
    "'_sigmaA=0.5', 0.28, 12",
    "'_sigmaA=0.5', 0.28, 20",
    "'_sigmaB=4.0', 1.0, 6",
    "'_sigmaB=4.0', 1.0, 12",
    "'_sigmaB=4.0', 1.0, 20",
    "'_sigmaB=4.0', 0.28, 6",

#!/usr/bin/env python3
import functools, os
import matplotlib.pyplot as plt
import pandas as pd
import utils

utils.init(__file__)

d = utils.load_table("imsrg-qdpt/dat_arenergy_by_ml.txt")
d = d[["num_shells", "num_filled", "freq", "ml", "label", "energy"]]
d["method"] = "qdpt"
dq = d

d = utils.load_table("EOM_IMSRG_qd_attached.dat")
d["energy"] = d["E(N+1)-E(N)"]
d = d[["shells", "filled", "ML", "omega", "energy"]]
d = d.rename(columns={
    "shells": "num_shells",
    "filled": "num_filled",
    "ML": "ml",
    "omega": "freq",
})
d["label"] = "add"
d["method"] = "eom"
dea = d

d = utils.load_table("EOM_IMSRG_qd_removed.dat")
d["energy"] = -d["E(N-1)-E(N)"]
d = d[["shells", "filled", "ML", "omega", "energy"]]
d = d.rename(columns={
    "shells": "num_shells",

            is_multimaster=args.multimaster)
    elif args.obj_type == 'volume':
        utils.autosetup_replica_table_multithread(
            volume_path=args.path,
            replica_parent=args.replica,
            num_replica=args.numreplica,
            is_multimaster=args.multimaster)
    else:
        logging.error('Unrecognized object. Cannot create.')
        sys.exit(-1)
elif args.cmd_name == 'load':
    logging.debug('Load command')
    if args.obj_type == 'table':
        utils.load_table(table_name=args.path,
                         num_cfs=args.numcfs,
                         num_cols=args.numcols,
                         num_rows=args.numrows,
                         is_json=args.json)
    elif args.obj_type == 'volume':
        utils.load_volume_tables_multithread(volume_path=args.path,
                                             num_cfs=args.numcfs,
                                             num_cols=args.numcols,
                                             num_rows=args.numrows,
                                             is_json=args.json)
    else:
        logging.error('Unrecognized object. Cannot load.')
        sys.exit(-1)
elif args.cmd_name == 'replstatus':
    logging.debug('Replica status tracking')
    if args.obj_type == 'table':
        utils.get_replica_status(table_name=args.path, fields=args.filter)

def plot(fit_count=fits.DEFAULT_FIT_COUNT, log=False,
         maxfev=fits.DEFAULT_MAXFEV,
         plot_type="scatter", stat="err", hf=False):
    dorig = utils.filter_preferred_ml(utils.load_all())
    d_good = utils.load_table("fits_good.txt")
    d_good = d_good.groupby(
        ["interaction", "label", "freq", "num_filled", "method"]).first()
    d = fits.load_predictive_data(fit_count=fit_count, maxfev=maxfev)
    d = d[d["interaction"] == "normal"]
    # d = d[d["label"] == "add"]
    # d = d[d["method"] == "imsrg"]
    # d = d[d["num_filled"] == 5]
    # d = d[d["freq"] == 0.1]
    d = d[d["fit_method"] != "fixedab"]
    doriggrps = dorig.groupby(
        ["interaction", "label", "freq", "num_filled", "method"])
    d["rel_constant_err"] = d["constant_err"] / d["constant"]
    d["rel_best_constant_err"] = d["best_constant_err"] / d["best_constant"]
    d["label_is_ground"] = d["label"] == "ground"
    d["good"] = d.apply(functools.partial(is_good, d_good), axis=1)
    d["rel_chi"] = d["chisq"]**.5 / d["constant"]
    d["rel_reduced_chi"] = d["rel_chi"] / (fit_count - 3)
    d["rel_best_chisq"] = d["best_chisq"]**.5 / d["best_constant"]
    d["rel_best_reduced_chisq"] = d["rel_best_chisq"] / (fit_count - 3)
    d = d[(d["rel_best_reduced_chisq"] < 1e-6)]
    d["fixedab_with_hf"] = (
        ((d["fit_method"] == "fixedab") ==
         (d["method"].isin(["hf", "hf+qdpt3"]))) |
        (d["fit_method"] == "full"))
    color_col = "method"
    bin_transform = TRANSFORM_ID
    bin_transform = TRANSFORM_LOG_ABS
    if color_col in ["exponent", "rel_dist", "rel_constant_err",
                     "rel_best_constant_err", "rel_best_chisq",
                     "rel_chi", "rel_reduced_chi", "chi_ratio"]:
        num_bins = 16
        d = d.replace([np.inf, -np.inf], np.nan).dropna(subset=[color_col])
        binf = bin_transform[0]
        bininvf = bin_transform[1]
        color_bins = pd.cut(binf(abs(d[color_col])), num_bins)
        d["color_bin_start"] = color_bins.map(
            lambda bin: bininvf(parse_bin(bin)[0]))
        d["color_bin_stop"] = color_bins.map(
            lambda bin: bininvf(parse_bin(bin)[1]))
        color_bin_cols = ["color_bin_start", "color_bin_stop"]
    else:
        color_bin_cols = [color_col]
    max_bins = len(d[color_bin_cols[0]].unique())
    fig, ax = plt.subplots()

    def on_pick_event(event):
        x = list(event.artist.get_xdata()[event.ind])[0]
        y = list(event.artist.get_ydata()[event.ind])[0]
        sel = d[(abs(d["x"] - x) <= 1e-20) & (abs(d["y"] - y) <= 1e-20)]
        print(sel.transpose().to_csv())
        if len(sel) != 1:
            print('>> not found <<')
            return
        sel = sel.iloc[0]
        grp = doriggrps.get_group((sel["interaction"], sel["label"],
                                   sel["freq"], sel["num_filled"],
                                   sel["method"]))
        fig, ax = plt.subplots(2)
        ax[0].plot(grp["num_shells"], grp["energy"], "x")
        fit_start = sel["fit_stop"] + 1 - fit_count
        ax[0].axvspan(fit_start, sel["fit_stop"], color="#cccccc")
        xs = np.linspace(grp["num_shells"].min(), grp["num_shells"].max())
        ax[0].plot(xs, sel["coefficient"] * xs ** sel["exponent"]
                   + sel["constant"])
        subgrp = grp[grp["num_shells"].between(
            fit_start - 0.1, sel["fit_stop"] + 0.1)]
        last_constant = sel["constant"]
        last_constant_err = sel["constant_err"]

        def random_weight(count):
            weights = np.zeros(count)
            for i in range(count):
                weights[np.random.randint(0, count)] += 1
            return weights

        p0 = [sel["coefficient"], sel["exponent"], sel["constant"]]
        p = p0
        x = subgrp["num_shells"]
        y = subgrp["energy"]
        constants = []
        constants.append(p[2])
        print(f"x = np.array({list(x)})")
        print(f"y = np.array({list(y)})")
        ax[1].plot(x, (p[0] * x ** p[1] + p[2] - y), "-x")
        ax[1].axhline(0.0, linestyle=":")
        for i in range(10):
            count = len(x)
            weights = random_weight(count) + 1e-99
            if sum(weights > 0.1) <= 3:
                # can't fit with this few points
                continue
            try:
                p, cov = scipy.optimize.curve_fit(
                    lambda x, a, b, c: a * x ** b + c,
                    x, y, sigma=1.0 / weights ** 0.5,
                    p0=p0, maxfev=100000)
            except RuntimeError as e:
                print(e)
                continue
            chisq = np.average((p[0] * x ** p[1] + p[2] - y) ** 2,
                               weights=weights) * len(x)
            constant = p[2]
            constant_err = cov[2, 2] ** 0.5
            constants.append(p[2])
            last_constant = constant
            last_constant_err = constant_err
        print("result", np.mean(constants), np.std(constants))
        print("rel", np.std(constants) / np.mean(constants))
        ax[0].set_ylim([max(ax[0].get_ylim()[0], 0.0),
                        min(ax[0].get_ylim()[1], np.max(y))])
        ax[0].plot(xs, p[0] * xs ** p[1] + p[2], ":", color="lime")

    fig.canvas.mpl_connect("pick_event", on_pick_event)
    d["quality"] = np.log10(d["constant_err"] / d["chisq"]**0.5)
    # hf has unique behaviors (what about mp2?)
    if hf:
        d = d[d["method"] == "hf"]
    else:
        d = d[d["method"] != "hf"]
    if stat == "err":
        d = d[d["quality"] > 0]
    d["y"] = d["rel_discrep"] / d["rel_constant_err"]
    if stat == "hessian":
        d["x"] = d["quality"]
    elif stat == "err":
        d["x"] = np.log10(d["rel_constant_err"])
    else:
        assert False
    d["y_err"] = d["rel_discrep_err"] / d["rel_discrep"] * d["y"]
    d = d[(d["rel_constant_err"] > 0) & (d["rel_constant_err"] < np.inf)]
    if plot_type == "contour":
        if hf:
            hf_suffix = "HF"
        else:
            hf_suffix = "non-HF"
        # range = mesh; lim = view
        if stat == "err":
            if hf:
                nx = 20
                ny = 20
                xrange = (-7, -1)
                yrange = (-5, 5)
                xlim = (-6, -2)
                ylim = (-4, 4)
                title = ("discrepancy vs fit uncertainty "
                         f"({hf_suffix})")
            else:
                nx = 20
                ny = 40
                xrange = (-7, -1)
                yrange = (-50, 50)
                xlim = (-6, -2)
                ylim = (-40, 40)
                title = ("discrepancy vs fit uncertainty "
                         f"(filtered: Q > 0, {hf_suffix})")
            xlabel = r"$\log_{10}\left(\frac{\sigma_c}{c}\right)$"
            ylabel = r"$\frac{\varepsilon}{\sigma_c}$"
        elif stat == "hessian":
            if hf:
                nx = 20
                ny = 40
                xrange = (-0.6, 1.5)
                yrange = (-10, 10)
                xlim = (-0.4, 1.1)
                ylim = (-10, 10)
            else:
                nx = 20
                ny = 40
                xrange = (-0.6, 2.1)
                yrange = (-200, 200)
                xlim = (-0.4, 1.7)
                ylim = (-150, 150)
            title = ("spread of “actual” discrepancy vs quality "
                     f"({hf_suffix})")
            xlabel = (r"$Q = \log_{10}\left(\frac{\sigma_c}"
                      r"{\sqrt{\mathtt{RSS}}}\right)$")
            ylabel = r"$\frac{\varepsilon}{\sigma_c}$"
        else:
            assert False
        ax.plot(d["x"], d["y"], "o", markersize=1, picker=3,
                color="white", markeredgewidth=0)
        dx = (xrange[1] - xrange[0]) / (nx - 1)
        h, x, y = np.histogram2d(
            d["x"], d["y"], bins=(nx, ny - 1),
            range=((xrange[0] - 0.5 * dx, xrange[1] + 0.5 * dx), yrange))
        ch = np.concatenate([np.zeros((nx, 1)), np.cumsum(h, axis=1)], axis=1)
        z = ch / ch[..., -1, np.newaxis]
        x, y = np.meshgrid(0.5 * (x[1:] + x[:-1]), y, indexing="ij")
        levels = np.linspace(-2.0, 2.0, 5)
        levels = 0.5 + 0.5 * scipy.special.erf(2.0 ** -0.5 * levels)
        ax.axhline(-1.0, linestyle="--", color="white", linewidth=0.5)
        ax.axhline(1.0, linestyle="--", color="white", linewidth=0.5)
        ax.contour(x, y, z, levels=levels, colors="white", alpha=0.3)
        cs = ax.contourf(x, y, z, levels=np.linspace(0.0, 1.0, 300),
                         # note: translucent cmaps tend to cause artifacts
                         cmap=CMAP_FOLDED_VIRIDIS, linestyle=":")
        for c in cs.collections:
            # http://stackoverflow.com/a/32911283
            c.set_edgecolor("face")
        fig.colorbar(cs)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.set_xlim(xlim)
        ax.set_ylim(ylim)
    elif plot_type == "scatter":
        cmap = utils.CMAP_RAINBOW
        for i, (bin, g) in enumerate(sorted(d.groupby(color_bin_cols))):
            color = cmap(float(i) / max_bins)
            ax.plot(g["x"], g["y"], ".", label=str(bin), color=color,
                    markersize=10, markeredgewidth=0, alpha=0.5, picker=2)
        ax.legend()
    fn = f"fit-predictiveness-{plot_type}-{fit_count}-{stat}-{hf}"
    fig.tight_layout()
    utils.savefig(fig, fn)

#!/usr/bin/env python3
import os, re, sys
sys.path.insert(1, os.path.join(os.path.dirname(__file__), ".."))
import utils

fn = re.match(r"(.*)-postprocess\.py", __file__).group(1) + ".txt"
d = utils.load_table(fn)
# canonicalization can introduce duplicates, in addition to whatever
# duplicates already exist in the file
d["p"] = d["p"].map(utils.canonicalize_p)
d = utils.check_fun_dep(d, ["interaction", "num_shells", "num_filled",
                            "freq", "method", "p", "term_id"],
                        {"correction": 1e-7},
                        combiner=utils.rightmost_combiner)
d = d.sort_values(["interaction", "num_shells", "num_filled",
                   "freq", "method", "p", "term_id", "correction"])
with open(fn, "w") as f:
    f.write("""
# term_ids 3 and 4: QDPT2
# term_ids 5 to 22: QDPT3
#
# Functional dependencies:
#
#   * (num_shells, num_filled, freq, method, p, term_id) -> correction
#
"""[1:])
    utils.save_table(f, d)

# fill Isomer table
# todo: retention time / figure
isomer = swgdrug[['formula']]
isomer.insert(1, 'retention_time', np.nan)  # where do we get this from?
isomer.insert(2, 'figure', np.nan)  # where do we get this from?
isomer.to_sql('Isomer', con=config.db_connection, index=False,
              if_exists='append')

# fill Label table
label = swgdrug[['formula', 'name']]
label.insert(2, 'preference', np.nan)
label.insert(3, 'isomer_id', np.nan)
isomer_table = load_table('Isomer')
# Loop over the label rows and look up the matching Isomer id by formula.
# For now we can only match on formula, and several isomers can share the
# same formula, so we take the first match and then remove it from
# isomer_table so it cannot be matched again.
for index, row in label.iterrows():
    formula, name = row['formula'], row['name']
    try:
        id_match = isomer_table[isomer_table['formula'] == formula].iloc[0]['id']
    except IndexError:
        raise ValueError(
            "Formula '{}' does not exist in Isomer table".format(formula))
    isomer_table = isomer_table[isomer_table.id != id_match]
    label.at[index, 'isomer_id'] = id_match

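# A tiny pandas demonstration (made-up rows) of the match-first-then-remove
# strategy above: duplicate formulas consume distinct Isomer ids.
import pandas as pd

isomer_table = pd.DataFrame({'id': [1, 2, 3],
                             'formula': ['C6H6', 'C6H6', 'C2H6O']})
for formula in ['C6H6', 'C6H6']:
    id_match = isomer_table[isomer_table['formula'] == formula].iloc[0]['id']
    isomer_table = isomer_table[isomer_table.id != id_match]
    print(formula, '->', id_match)  # C6H6 -> 1, then C6H6 -> 2
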
#!/usr/bin/env python3
import os, re, sys
sys.path.insert(1, os.path.join(os.path.dirname(__file__), ".."))
import utils

fn = re.match(r"(.*)-postprocess\.py", __file__).group(1) + ".txt"
d = utils.load_table(fn)
d = utils.check_fun_dep(
    d, ["interaction", "num_shells", "num_filled", "freq", "method"],
    {"energy": 2e-5},
    combiner=utils.rightmost_combiner)
d = d.sort_values(
    ["interaction", "num_shells", "num_filled", "freq", "method", "energy"])
with open(fn, "w") as f:
    f.write("""
# Functional dependencies:
#
#   * (interaction, num_shells, num_filled, freq, method) -> energy
#
"""[1:])
    utils.save_table(f, d)

#!/usr/bin/env python3
import itertools, sys
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import utils

with utils.plot(__file__):
    d = utils.load_table("compare_rel_slopes.txt", na_filter=False)
    # we want to use (num_particles, freq) as an axis,
    # but matplotlib doesn't let us do that,
    # so we have to use this awful hack
    interactions = [
        ("", r"$(0, \infty)$"),
        ("_sigmaB=4.0", r"$(0, 4)$"),
        ("_sigmaA=0.5", r"$(\frac{1}{2}, \infty)$"),
        ("_sigmaA=0.5_sigmaB=4.0", r"$(\frac{1}{2}, 4)$"),
    ]
    interaction_colors = {
        "": "#a883e4",
        "_sigmaB=4.0": "#951c16",
        "_sigmaA=0.5": "#005e93",
        "_sigmaA=0.5_sigmaB=4.0": "#1e1e1e",
    }
    interaction_markers = {
        "": "o",
        "_sigmaB=4.0": "X",
        "_sigmaA=0.5": "o",
        "_sigmaA=0.5_sigmaB=4.0": "x",
    }

def get_model(input_dim, hid_nodes=16):
    #X_train, y_train, X_test, y_test = split_data(df)
    model = kr.models.Sequential()
    #model.add(kr.layers.Dense(hid_nodes, input_dim=2 * (end - start + 1), activation='relu'))
    model.add(kr.layers.Dense(hid_nodes, input_dim=input_dim,
                              activation='relu'))
    model.add(kr.layers.Dense(1, activation='linear'))
    model.compile(loss='mean_squared_error', optimizer='adam',
                  metrics=['mse'])
    return model


# Kumpula
df_n = load_table(neighbor, 2017)[features]
X_n, y_n = add_shifted_features(df_n, features, start=start, end=end,
                                step=step, forecast_step=forecast_step)
#df_n = df_n.drop(features, axis=1)
df_t = load_table(target_area, 2017)[features]
X_t, y_t = add_shifted_features(df_t, features, start=start, end=end,
                                step=step, forecast_step=forecast_step)
y = y_t[target_feat]
#df_t = df_t.drop(features, axis=1)
X = pd.merge(X_t, X_n, on='Date', suffixes=['_' + neighbor, '_' + target_area])
#X_data = df_All
X_train, y_train, X_test, y_test = split_data(X, y, n=n_preds)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

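# Hypothetical training sketch for get_model above, assuming keras is imported
# as `kr` and the X_train/y_train split produced by split_data; the epoch and
# batch settings are illustrative only.
model = get_model(input_dim=X_train.shape[1])
model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)
print(model.evaluate(X_test, y_test, verbose=0))  # [loss, mse]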