def _regress_chrom(chrom_to_do):
    """Applies _regress_tfam() to all of the transcript families on a chromosome"""
    chrom_orfs = pd.read_hdf(opts.orfstore, 'all_orfs', mode='r',
                             where="chrom == %r and tstop > 0 and tcoord > 0" % chrom_to_do,
                             columns=['orfname', 'tfam', 'tid', 'tcoord', 'tstop', 'AAlen', 'chrom', 'gcoord',
                                      'gstop', 'strand', 'codon', 'orftype', 'annot_start', 'annot_stop'])
    # tcoord > 0 removes ORFs where the first codon is an NTG, to avoid an indexing error
    # Those ORFs would never get called anyway since they couldn't possibly have any reads at their start codon

    if restrictbystartfilenames:
        restrictedstarts = pd.DataFrame()
        for (restrictbystart, minw) in zip(restrictbystartfilenames, opts.minwstart):
            restrictedstarts = restrictedstarts.append(
                pd.read_hdf(restrictbystart, 'start_strengths', mode='r',
                            where="(chrom == %r) & (W_start > minw)" % chrom_to_do,
                            columns=['tfam', 'chrom', 'gcoord', 'strand']),
                ignore_index=True).drop_duplicates()
        chrom_orfs = chrom_orfs.merge(restrictedstarts)  # inner merge acts as a filter

    if chrom_orfs.empty:
        if opts.verbose > 1:
            logprint('No ORFs found on %s' % chrom_to_do)
        return failure_return

    inbams = [pysam.Samfile(infile, 'rb') for infile in opts.bamfiles]
    gnd = HashedReadBAMGenomeArray(inbams, ReadKeyMapFactory(Pdict, read_length_nmis))

    res = tuple([pd.concat(res_dfs) for res_dfs in
                 zip(*[_regress_tfam(tfam_set, gnd) for (tfam, tfam_set) in chrom_orfs.groupby('tfam')])])

    for inbam in inbams:
        inbam.close()

    if opts.verbose > 1:
        logprint('%s complete' % chrom_to_do)
    return res
def create_subset(dest_store, dest_skims, maxZone, households_sample_size=0):
    dest_store_path = os.path.join(dest_data_dir, dest_store)
    dest_skims_path = os.path.join(dest_data_dir, dest_skims)

    print('land_use_taz')
    df = pd.read_hdf(source_store, 'land_use_taz')
    df = df[df.index <= maxZone]
    df.to_hdf(dest_store_path, 'land_use_taz')
    del df

    print('households')
    hh_df = pd.read_hdf(source_store, 'households')
    hh_df = hh_df[hh_df.TAZ <= maxZone]
    if households_sample_size:
        hh_df = hh_df.take(np.random.choice(len(hh_df), size=households_sample_size, replace=False))
    hh_df.to_hdf(dest_store_path, 'households')

    print('persons')
    per_df = pd.read_hdf(source_store, 'persons')
    per_df = per_df[per_df.household_id.isin(hh_df.index)]
    per_df.to_hdf(dest_store_path, 'persons')

    # process all skims
    skims = omx.open_file(source_skims)
    skims_out = omx.open_file(dest_skims_path, 'w')

    skimsToProcess = skims.list_matrices()
    for skimName in skimsToProcess:
        print(skimName)
        skims_out[skimName] = skims[skimName][0:maxZone, 0:maxZone]
        skims_out[skimName].attrs.TITLE = ''  # remove funny character for OMX viewer
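# A minimal usage sketch for create_subset(), assuming the module-level
# source_store, source_skims and dest_data_dir used above are configured
# elsewhere; the file names and the 25-zone cutoff below are hypothetical
# placeholders, not part of the original script.
if __name__ == '__main__':
    create_subset(dest_store='mtc_asim_subset.h5',
                  dest_skims='skims_subset.omx',
                  maxZone=25,
                  households_sample_size=1000)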
def GetFiles():
    '''
    Get All the files with relevant tickers
    :return:
    '''
    flist = sorted(os.listdir('Z:/TAQ/TAQHDF5/'))
    for ff in flist:
        if ff.replace('taq_', '')[:4] >= '2001' and ff.replace('taq_', '')[:4] < '2014':
            print "Downloading..."
            t0 = datetime.datetime.now()
            #ff = 'taq_20131231.h5'
            path = "Z:/TAQ/TAQHDF5/" + ff
            df = pd.read_hdf(path, 'Trades')
            ind = pd.read_hdf(path, 'TradeIndex')
            ind['end'] = np.cumsum(ind['count'])
            symlist = 'AAPL AXP BA CAT CSCO CVX DD DIS GE GS HD IBM INTC JNJ JOM KO MCD MMM MRK MSFT NKE PFE PG TRV UNH UTX V VZ WMT XOM'.split(' ')
            ind['ticker'] = [str(j).strip() for j in ind['ticker']]
            ind = ind[ind['ticker'].isin(symlist)].reset_index(drop=True)
            ran = np.array([range(start, end) for start, end in zip(ind['start'], ind['end'])])
            ran = [item for sublist in ran for item in sublist]
            df = df[df.index.isin(ran)]
            df['time'] = pd.to_datetime(df['utcsec'], unit='s')
            for i in ind.index:
                start = int(ind.loc[i, 'start'])
                end = int(ind.loc[i, 'end'])
                df.loc[start:end, 'sym'] = ind.loc[i, 'ticker']
            df.to_csv('data/taq/' + ff.replace('taq_', '').replace('.h5', '') + '.csv',
                      columns=['time', 'price', 'sym'], index=False)
            print datetime.datetime.now() - t0
def from_analysis_file(data_set, analysis_file):
    dg = DriftingGratings(data_set)

    try:
        dg.populate_stimulus_table()

        dg._sweep_response = pd.read_hdf(analysis_file, "analysis/sweep_response_dg")
        dg._mean_sweep_response = pd.read_hdf(analysis_file, "analysis/mean_sweep_response_dg")
        dg._peak = pd.read_hdf(analysis_file, "analysis/peak")

        with h5py.File(analysis_file, "r") as f:
            dg._response = f["analysis/response_dg"].value
            dg._binned_dx_sp = f["analysis/binned_dx_sp"].value
            dg._binned_cells_sp = f["analysis/binned_cells_sp"].value
            dg._binned_dx_vis = f["analysis/binned_dx_vis"].value
            dg._binned_cells_vis = f["analysis/binned_cells_vis"].value

            if "analysis/noise_corr_dg" in f:
                dg.noise_correlation = f["analysis/noise_corr_dg"].value
            if "analysis/signal_corr_dg" in f:
                dg.signal_correlation = f["analysis/signal_corr_dg"].value
            if "analysis/rep_similarity_dg" in f:
                dg.representational_similarity = f["analysis/rep_similarity_dg"].value
    except Exception as e:
        raise MissingStimulusException(e.args)

    return dg
def merge_temp_databases(id_obs, store, file):
    store.append('events', pd.read_hdf(os.path.join(PATH.TMP_FOLDER, file), 'events'),
                 data_columns=['Pulse', 'SAP', 'BEAM', 'DM', 'Time'])
    meta_data = pd.read_hdf(os.path.join(PATH.TMP_FOLDER, file), 'meta_data')
    meta_data.reset_index(inplace=True, drop=True)
    meta_data['version'] = args.vers
    store.append('meta_data', meta_data)
    os.remove(os.path.join(PATH.TMP_FOLDER, file))
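# Illustrative only: one way merge_temp_databases() might be driven, assuming
# PATH.TMP_FOLDER and args.vers are set up by the surrounding module; the store
# name and the temporary file names below are hypothetical.
with pd.HDFStore('SinglePulses.hdf5') as store:
    for tmp_file in ['beam_0_tmp.h5', 'beam_1_tmp.h5']:
        merge_temp_databases(id_obs='obs001', store=store, file=tmp_file)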
def read_test_train(train_size):
    print("Load train.csv")
    train = pd.read_hdf("../modified_data/train_original.csv.hdf", 'table')
    null_count = train.isnull().sum().sum()
    if null_count > 0:
        print('Nans:', null_count)
        cols = train.isnull().any(axis=0)
        print(cols[cols == True])
        rows = train.isnull().any(axis=1)
        print(rows[rows == True])
        print('NANs in train, please check it!')
        exit()

    split = round((1 - train_size) * len(train.index))
    train = train[split:]

    print("Load test.csv")
    test = pd.read_hdf("../modified_data/test.hdf", 'table')
    null_count = test.isnull().sum().sum()
    if null_count > 0:
        print('Nans:', null_count)
        cols = test.isnull().any(axis=0)
        print(cols[cols == True])
        print('NANs in test, please check it!')
        exit()

    features = get_features(train, test)
    return train, test, features
def __init__(self, name='unnamed', description=''):
    self._datafile = '%s/Genes.h5' % dataDirectory()
    self._dataframe = pandas.read_hdf(self._datafile, 'data')
    self._metadata = pandas.read_hdf(self._datafile, 'metadata')
    self.name = name
    self.description = description
def test_to_hdf():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])
    a = dd.from_pandas(df, 2)

    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data')
        out = pd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out[:])

    with tmpfile('h5') as fn:
        a.x.to_hdf(fn, '/data')
        out = pd.read_hdf(fn, '/data')
        tm.assert_series_equal(df.x, out[:])

    a = dd.from_pandas(df, 1)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data')
        out = pd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out[:])

    # test compute = False
    with tmpfile('h5') as fn:
        r = a.to_hdf(fn, '/data', compute=False)
        r.compute()
        out = pd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out[:])
def bbq_sauce_piquant(): with recipes_db: cur = recipes_db.cursor() cur.execute("SELECT * FROM Recipe_IDs") query_results = cur.fetchall() for result in query_results: if 'bbq' in result[1].lower(): r_id = result[0] print r_id, result[1] break # grab ingredients for this recipe cur.execute("SELECT Ingredient FROM Ingredient_List WHERE ID = " + str(r_id)) query_results = cur.fetchall() ingredients = [] for row in query_results: ingredients.append(row[0]) # find recommended ingredients flavors = pd.read_hdf(data_dir + 'flavor_profiles_nnls.h5', 'df') piquant_ingredients = flavors['type'] == 'piquant' graph = pd.read_hdf(data_dir + 'ingredient_graph.h5', 'df') # get distances between ingredients in recipe and potential additions graph = graph.ix[piquant_ingredients][ingredients] graph_distances = graph.sum(axis=1) recommended_ingredients = list(np.sort(graph_distances)[::-1][:5].index) print 'Recipe ingredients:' print ingredients print '' print 'Recommended Piquant ingredients:' print recommended_ingredients
def get_dt_deep(self, compute=False):
    filename = os.path.join(self.datadir, 'dt_deep{}.h5'.format(self.kwarg_tag))
    # Recompute if explicitly requested, or if there is no cached file yet.
    compute = compute or not os.path.exists(filename)

    if not compute:
        try:
            dt_deep = pd.read_hdf(filename, 'dt_deep')
        except Exception:
            compute = True

    if compute:
        # need grid to work with first
        df = self.get_df()

        # Make bucket for derivative to go in
        df['dt_deep'] = np.nan

        # Compute derivative for each (feh, age) isochrone, and fill in
        for f, m in tqdm(itertools.product(*df.index.levels[:2]),
                         total=len(list(itertools.product(*df.index.levels[:2]))),
                         desc='Computing dt/deep'):
            subdf = df.loc[f, m]
            log_age = np.log10(subdf['star_age'])
            deriv = np.gradient(log_age, subdf['eep'])
            subdf.loc[:, 'dt_deep'] = deriv

        df.dt_deep.to_hdf(filename, 'dt_deep')
        dt_deep = pd.read_hdf(filename, 'dt_deep')

    return dt_deep
def get_dtypes(self, where=None):
    if not where:
        filename = self.get_store_filename("dtypes")
        print filename
        return read_hdf(filename, self.get_store_key())
    else:
        return read_hdf(self.get_store_filename("dtypes"), self.get_store_key(), where=where)
def main(fname, blpath, odir, year, month): print "Applying EI Rules 1 and 2." hdf_filepath = odir + "/%s_%s_store_df.h5" % (year, month) print "LOOKING for HDF file at location ", hdf_filepath if os.path.exists(hdf_filepath): print "READING HDF" ei_df = pd.read_hdf(hdf_filepath, 'ei_df') bl_df = pd.read_hdf(hdf_filepath, 'bl_df') else: ei_df = pd.read_csv(fname, header=0, sep=";", converters=converters, names=cols, quotechar="'", decimal=",") ei_df, bl_df = bl_prepare(ei_df, blpath) print "Doing setup..." ei_df, bl_df = setup(ei_df, bl_df) print "SAVING HDF to", hdf_filepath ei_df.to_hdf(hdf_filepath, 'ei_df') bl_df.to_hdf(hdf_filepath, 'bl_df') print "Entering rule 1..." ei_df = rule1(ei_df, bl_df, RECEIVER) ei_df = rule1(ei_df, bl_df, SENDER) print "Entering rule 2..." ei_df = rule2(ei_df) print ei_df output_values = ["purchase_value", "remit_value", "transfer_value", "devolution_value", "icms_credit_value", "remit_value", "tax", "icms_tax", "transportation_cost", "year", "month"] output_name = "%s_%s" % (year,month) print "Making tables..." ymsrp = make_table(ei_df, "srp", output_values, odir, output_name, year=year, month=month)
def _evaluate_data(self, feature_set_name): x_train = pd.read_hdf(os.path.join(self._data_dir, feature_set_name, 'train_train_features.hf5'), 'data') y_train = pd.read_hdf(os.path.join(self._data_dir, 'train_train_y.hf5'), 'data') x_validation = pd.read_hdf(os.path.join(self._data_dir, feature_set_name, 'train_validation_features.hf5'), 'data') y_validation = pd.read_hdf(os.path.join(self._data_dir, 'train_validation_y.hf5'), 'data') dtrain = xgb.DMatrix(x_train, y_train['target'], missing=-1) dtest = xgb.DMatrix(x_validation, y_validation['target'], missing=-1) results = {} for i_seed in range(0, self.NB_SEEDS): evals_result = {} params = {'bst:max_depth': self._max_depth, 'bst:eta': self._eta, 'objective': 'binary:logistic', 'colsample_bytree': self._col_sample, 'subsample': self._sub_sample, 'min_child_weight': self._min_child_weight, 'eval_metric': 'auc', 'silent': 1, 'nthread': 16, 'seed': i_seed} eval_list = [(dtest, 'eval')] bst = xgb.train(params, dtrain, self._nb_rounds, eval_list, evals_result=evals_result, verbose_eval=False, early_stopping_rounds=200) results[i_seed] = evals_result['eval'][-1] save_model(bst, evals_result, params, x_train.columns.tolist(), feature_set_name, 'full_evaluation', os.path.join(self._data_dir, 'Models')) print 'Seed {} => {}'.format(i_seed, results[i_seed]) return results
def load_hdf(cls, filename, path=''):
    data = pd.read_hdf(filename, '{}/data'.format(path))
    t = np.array(data['t'])
    f = np.array(data['f'])
    mask = np.array(data['mask'])
    new = cls(t, f, mask=mask)

    acorr = pd.read_hdf(filename, '{}/acorr'.format(path))
    new._lag = np.array(acorr['lag'])
    new._ac = np.array(acorr['ac'])

    pgram = pd.read_hdf(filename, '{}/pgram'.format(path))
    new._pers = np.array(pgram['period'])
    new._pgram = np.array(pgram['pgram'])

    #store.close()

    i = 1
    has_sub = True
    new.subseries = {}
    while has_sub:
        try:
            name = 'sub{}'.format(i)
            new.subseries[name] = cls.load_hdf(filename, path='{}/{}'.format(path, name))
        except KeyError:
            has_sub = False
        i += 1

    return new
def from_analysis_file(data_set, analysis_file, stimulus):
    lsn = LocallySparseNoise(data_set, stimulus)

    lsn.populate_stimulus_table()

    if stimulus == stimulus_info.LOCALLY_SPARSE_NOISE:
        stimulus_suffix = stimulus_info.LOCALLY_SPARSE_NOISE_SHORT
    elif stimulus == stimulus_info.LOCALLY_SPARSE_NOISE_4DEG:
        stimulus_suffix = stimulus_info.LOCALLY_SPARSE_NOISE_4DEG_SHORT
    elif stimulus == stimulus_info.LOCALLY_SPARSE_NOISE_8DEG:
        stimulus_suffix = stimulus_info.LOCALLY_SPARSE_NOISE_8DEG_SHORT

    try:
        with h5py.File(analysis_file, "r") as f:
            k = "analysis/mean_response_%s" % stimulus_suffix
            if k in f:
                lsn._mean_response = f[k].value

        lsn._sweep_response = pd.read_hdf(analysis_file, "analysis/sweep_response_%s" % stimulus_suffix)
        lsn._mean_sweep_response = pd.read_hdf(analysis_file, "analysis/mean_sweep_response_%s" % stimulus_suffix)

        with h5py.File(analysis_file, "r") as f:
            lsn._cell_index_receptive_field_analysis_data = \
                LocallySparseNoise.read_cell_index_receptive_field_analysis(f, stimulus)
    except Exception as e:
        raise MissingStimulusException(e.args)

    return lsn
def main(opts, flgs): if not opts['training_hdf']: opts['training_hdf'] = opts['hdf'] df = pnd.read_hdf(opts['hdf'], str(opts['data'])) data = df[df['non_null_cells'] > int(opts['area_size'])] if opts['training_kchk'] and opts['training_ychk']: Kchk = pnd.read_hdf(opts['training_hdf'], str(opts['training_kchk'])) ychk = pnd.read_hdf(opts['training_hdf'], str(opts['training_ychk'])) itr = extract_itr(Kchk, ychk, int(opts['training_number'])) else: with open(opts['training_json'], 'r') as fp: tr = json.load(fp) check_classification(tr) itr, Kchk, ychk = extract_training(tr, data, int(opts['training_number'])) conf = imp.load_source("conf", opts['training_conf']) mls = getattr(conf, opts['training_mls']) key = None if opts['training_key'] == '' else opts['training_key'] tdata, tKchk = transform(opts['transform'], data, Kchk) tK_chk, y_chk = tKchk.loc[itr], ychk.loc[itr] mls_classification(tdata, tK_chk, y_chk, mls, hdf=opts['hdf'], out_class=opts['out_class'], key=key)
def from_analysis_file(data_set, analysis_file, movie_name):
    nm = NaturalMovie(data_set, movie_name)
    nm.populate_stimulus_table()

    # TODO: deal with this properly
    suffix_map = {
        stiminfo.NATURAL_MOVIE_ONE: '_' + stiminfo.NATURAL_MOVIE_ONE_SHORT,
        stiminfo.NATURAL_MOVIE_TWO: '_' + stiminfo.NATURAL_MOVIE_TWO_SHORT,
        stiminfo.NATURAL_MOVIE_THREE: '_' + stiminfo.NATURAL_MOVIE_THREE_SHORT
    }

    try:
        suffix = suffix_map[movie_name]

        nm._sweep_response = pd.read_hdf(analysis_file, "analysis/sweep_response" + suffix)
        nm._peak = pd.read_hdf(analysis_file, "analysis/peak")

        with h5py.File(analysis_file, "r") as f:
            nm._binned_dx_sp = f["analysis/binned_dx_sp"].value
            nm._binned_cells_sp = f["analysis/binned_cells_sp"].value
            nm._binned_dx_vis = f["analysis/binned_dx_vis"].value
            nm._binned_cells_vis = f["analysis/binned_cells_vis"].value
    except Exception as e:
        raise MissingStimulusException(e.args)

    return nm
def from_analysis_file(data_set, analysis_file):
    ns = NaturalScenes(data_set)
    ns.populate_stimulus_table()

    try:
        ns._sweep_response = pd.read_hdf(analysis_file, "analysis/sweep_response_ns")
        ns._mean_sweep_response = pd.read_hdf(analysis_file, "analysis/mean_sweep_response_ns")
        ns._peak = pd.read_hdf(analysis_file, "analysis/peak")

        with h5py.File(analysis_file, "r") as f:
            ns._response = f["analysis/response_ns"].value
            ns._binned_dx_sp = f["analysis/binned_dx_sp"].value
            ns._binned_cells_sp = f["analysis/binned_cells_sp"].value
            ns._binned_dx_vis = f["analysis/binned_dx_vis"].value
            ns._binned_cells_vis = f["analysis/binned_cells_vis"].value

            if "analysis/noise_corr_ns" in f:
                ns.noise_correlation = f["analysis/noise_corr_ns"].value
            if "analysis/signal_corr_ns" in f:
                ns.signal_correlation = f["analysis/signal_corr_ns"].value
            if "analysis/rep_similarity_ns" in f:
                ns.representational_similarity = f["analysis/rep_similarity_ns"].value
    except Exception as e:
        raise MissingStimulusException(e.args)

    return ns
def read_grid(hdf_fname):
    """
    Load the grid information from hdf

    Parameters
    ----------
    hdf_fname: str
        filename and path to the HDF file

    Returns
    -------
    wavelength : astropy.units.Quantity
    meta : pandas.Series
    index : pandas.DataFrame
    fluxes : astropy.units.Quantity
    """
    logger.info('Reading index')
    index = pd.read_hdf(hdf_fname, 'index')
    meta = pd.read_hdf(hdf_fname, 'meta')
    logger.info('Discovered columns {0}'.format(', '.join(meta['parameters'])))

    with h5py.File(hdf_fname) as fh:
        logger.info('Reading Fluxes')
        fluxes = fh['fluxes'].__array__()

    logger.info('Fluxes shape {0}'.format(fluxes.shape))
    flux_unit = u.Unit(meta['flux_unit'])

    wavelength = pd.read_hdf(hdf_fname, 'wavelength').values[:, 0]
    wavelength = u.Quantity(wavelength, meta['wavelength_unit'])

    return wavelength, meta, index, fluxes * flux_unit
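# Minimal usage sketch for read_grid(); 'spectral_grid.h5' is a hypothetical
# file name, assumed to contain the 'index', 'meta', 'wavelength' and 'fluxes'
# nodes that the function expects.
wavelength, meta, index, fluxes = read_grid('spectral_grid.h5')
print(wavelength.shape, fluxes.shape)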
def load_pascal(VOCyear='VOC2012', force=False, args=None): """ Load all the annotations, including object bounding boxes. Loads XML data in args['num_workers'] threads using joblib.Parallel. Warning: this takes a few minutes to load from scratch! """ if args is None: # TODO: set this to number of cores on machine args = {'num_workers': 8} cache_filename = \ vislab.config['paths']['shared_data'] + \ '/pascal_{}_dfs.h5'.format(VOCyear) if not force and os.path.exists(cache_filename): images_df = pd.read_hdf(cache_filename, 'images_df') objects_df = pd.read_hdf(cache_filename, 'objects_df') return images_df, objects_df # Load all annotation file data (should take < 30 s). annotation_filenames = glob.glob( vislab.config['paths'][VOCyear] + '/Annotations/*.xml') images_df, objects_df = load_annotation_files( annotation_filenames, args['num_workers']) # Get the split information. splits_dir = vislab.config['paths'][VOCyear] + '/ImageSets/Main' images_df['_split'] = None for split in ['train', 'val', 'test']: split_filename = splits_dir + '/{}.txt'.format(split) if not os.path.exists(split_filename): print("{} split does not exist".format(split)) continue with open(split_filename) as f: inds = [x.strip() for x in f.readlines()] safe_inds = set(inds).intersection(images_df.index) images_df['_split'].ix[safe_inds] = split # Drop images without a split (VOC2007 images in the VOC2012 set). images_df = images_df.dropna(subset=['_split']) # Generate image filenames images_df['_filename'] = images_df.apply( lambda r: get_image_filename_for_id(r.name, VOCyear), axis=1) # Drop corresponding images in the objects_df. objects_df = objects_df.ix[images_df.index] # Propagate split info to objects_df objects_df['split'] = np.repeat( images_df['_split'].values, images_df['_num_objects'].values) # Make sure that all labels are either True or False. images_df = images_df.fillna(False) images_df.to_hdf(cache_filename, 'images_df', mode='w') objects_df.to_hdf(cache_filename, 'objects_df', mode='a') return images_df, objects_df
def merge(settings, overwrite=False): """ Merges interviews over time by household. Parameters ---------- settings : JSON settings file overwrite : bool whether to overwrite existing files Returns ------- None (IO) """ STORE_FMT = 'm%Y_%m' store_path = settings['monthly_store'] start = settings['date_start'] end = settings['date_end'] all_months = pd.date_range(start=start, end=end, freq='m') if overwrite: logger.info("Merging for {}".format(all_months)) else: with pd.get_store(settings['merged_store']) as store: cached = set(store.keys()) all_m = set([x.strftime('/' + STORE_FMT) for x in all_months]) logger.info("Using cached for {}".format(cached & all_m)) new = all_m - cached all_months = filter(lambda x: x.strftime('/' + STORE_FMT) in new, all_months) for m0 in all_months: months = (x.strftime('cpsm%Y-%m') for x in m.make_months(m0.strftime('%Y-%m-%d'))) months = enumerate(months, 1) mis, month = next(months) df0 = pd.read_hdf(store_path, key=month).query('HRMIS == @mis') match_funcs = [m.match_age, m.match_sex, m.match_race] dfs = [df0] for mis, month in months: try: dfn = pd.read_hdf(store_path, key=month).query('HRMIS == @mis') dfs.append(m.match(df0, dfn, match_funcs)) except KeyError: msg = "The panel for {} has no monthly data file for {}" logger.warn(msg.format(m0, month)) continue df = m.merge(dfs) df = df.sort_index() df = m.make_wave_id(df) store_key = df['wave_id'].iloc[0].strftime(STORE_FMT) df.to_hdf(settings["merged_store"], store_key) logger.info("Added merged {} to {}".format(store_key, settings['merged_store']))
def import_data():
    def add_columns(df, df_i):
        df['ingredients_clean'] = ing_utils.get_ings_by_product(df, df_i)
        df['num_ingredients'] = df['ingredients_clean'].apply(len)
        df['hier'] = df[['aisle', 'shelf', 'food_category']].values.tolist()

    df = pd.read_hdf('../foodessentials/products.h5', 'products')
    df_i = pd.read_hdf('../foodessentials/ingredients.h5', 'ingredients')
    add_columns(df, df_i)
    return df, df_i
def calcem():
    # execute this function in EM calculation directory
    process = subprocess.check_output(['export_band.py', '-wbe'])
    vb = pd.read_hdf('vb.h5', 'vb').as_matrix()[:5][:, 1:3]
    cb = pd.read_hdf('cb.h5', 'cb').as_matrix()[:5][:, 1:3]
    os.remove('vb.h5')
    os.remove('cb.h5')
    mh, err_mh = fit_em(vb)
    me, err_me = fit_em(cb)
    return me, mh, err_me, err_mh
def to_database(scenario=' ', rng=range(0, 0), urbansim_connection=get_connection_string("configs/dbconfig.yml", 'urbansim_database'), default_schema='urbansim_output'): """ df_name: Required parameter, is the name of the table that will be read from the H5 file, Also first half of the table name to be stored in the database urbansim_connection: sql connection, default is for urbansim_database year: year of information to be caputured, should be pass the same range as simulation period minus first and last year. defalut_schema: The schema name under which to save the data, default is urbansim_output """ conn = psycopg2.connect(database="urbansim", user="******", password="******", host="socioeca8", port="5432") cursor = conn.cursor() t = (scenario,) cursor.execute('SELECT scenario_id FROM urbansim_output.parent_scenario WHERE scenario_name=%s', t) scenario_id = cursor.fetchone() cursor.execute('SELECT parent_scenario_id FROM urbansim_output.parent_scenario WHERE scenario_name=%s', t) parent_scenario_id = cursor.fetchone() conn.close() for year in rng: if year == 0 and scenario_id[0] == 1: for x in ['parcels', 'buildings', 'jobs']: print 'exporting ' + x + str(year) + ' ' + str(scenario_id[0]) df = pd.read_hdf('data\\results.h5', 'base/' + x) df['parent_scenario_id'] = parent_scenario_id[0] df.to_sql(x + '_base', urbansim_connection, schema=default_schema, if_exists='append') elif year == rng[len(rng)-1]: for x in ['buildings', 'feasibility', 'jobs']: print 'exporting ' + x + str(year) + ' ' + str(scenario_id[0]) df = pd.read_hdf('data\\results.h5', str(year) + '/' + x) if x == 'feasibility': df = df['residential'] df.rename(columns={'total_sqft': 'total_sqft_existing_bldgs'}, inplace=True) df = df[(df.addl_units > 0) or (df.non_residential_sqft > 0)] df['existing_units'] = np.where(df['new_built_units'] == 0, df['total_residential_units'], \ df['total_residential_units'] - df['addl_units']) elif x == 'buildings': df = df[df.new_bldg == 1] df.sch_dev = df.sch_dev.astype(int) df.new_bldg = df.new_bldg.astype(int) elif x == 'jobs': df = df[df.index > get_max_job_id()] df['year'] = year df['scenario_id'] = scenario_id[0] df['parent_scenario_id'] = parent_scenario_id[0] df.to_sql(x, urbansim_connection, schema=default_schema, if_exists='append')
def tick_data_convert_dates_single(TCKR, directory=None): """ Input: single ticker in format 'TICKER.X', where X is netfonds exchange letter (N:NYSE,O:NASDAQ,A:AMEX) Combines all tickdata files for the ticker in the directory, default = current. """ start_dir = os.getcwd() #save start dir so we can revert back at the end of program if directory==None: directory = start_dir os.chdir(directory) #get list of files for ticker = TCKR files = os.path.isfile(TCKR+'.combined.h5') if not files: print 'Error: '+ TCKR+'.combined.h5' + ' not found' return 1 size1 = os.path.getsize(TCKR+'.combined.h5') df = pd.read_hdf(TCKR+'.combined.h5', 'dataframe') os.remove(TCKR+'.combined.h5') # if 'time'in df.columns.values: # df.index = pd.to_datetime(df['time']) # del df['time'] # print TCKR + ' deleted time' # if 'daysecs' in df.columns.values: # del df['daysecs'] # print TCKR + ' deleted daysecs' # if 'timeopen' in df.columns.values: # del df['timeopen'] # print TCKR + ' deleted timeopen' # if 'timeclose' in df.columns.values: # del df['timeclose'] # print TCKR + ' deleted timeclose' # if 'date' in df.columns.values: # del df['date'] # print TCKR + ' deleted date' # # df.index = pd.to_datetime(df.index) # print TCKR + ' converted index to timeseries' store = pd.HDFStore(TCKR+'.combined.h5') store.append('dataframe', df, format='table', complib='blosc', complevel=9, expectedrows=len(df)) store.close() #df.to_hdf(TCKR+'.combined.h5', 'dataframe', mode='w',format='table',complib='blosc', complevel=9) size2 = os.path.getsize(TCKR+'.combined.h5') print TCKR + 'wrote to hdf file. size change=' +str(float(size2)/float(size1)) df2=pd.read_hdf(TCKR+'.combined.h5', 'dataframe') (df2==df).all() if (df2.index==df.index).all(): print TCKR + ' Indexes match!' os.chdir(start_dir) return 0
def __init__(self, features_dir, train_features_filename='train_features.hf5',
             test_features_filename='test_features.hf5', train_y_filename='train_y.hf5'):
    self.features_dir = features_dir
    self._train_features_filename = train_features_filename
    self._test_features_filename = test_features_filename
    self._train_y_filename = train_y_filename

    # Load features
    self._train_features = pd.read_hdf(os.path.join(features_dir, train_features_filename), 'data')
    self._test_features = pd.read_hdf(os.path.join(features_dir, test_features_filename), 'data')
    self._train_y = pd.read_hdf(os.path.join(features_dir, train_y_filename), 'data')
def load_dataset(force=False): cache_filename = vislab.config['paths']['shared_data'] + '/inria_dfs.h5' if not force and os.path.exists(cache_filename): images_df = pd.read_hdf(cache_filename, 'images_df') objects_df = pd.read_hdf(cache_filename, 'objects_df') return images_df, objects_df objects_dfs = [] images_dfs = [] for split in ['Train', 'Test']: # Load object data. anno_filenames = [ _.strip() for _ in open('{}/{}/annotations.lst'.format(dirname, split)).readlines() ] objects_df = pd.concat(( parse_annotation(anno_filename) for anno_filename in anno_filenames )) # Construct images_df from the objects data. grouped = objects_df.groupby(level=0) images_df = pd.DataFrame() images_df['filename'] = objects_df.groupby(level=0).first()['filename'] images_df[['filename', 'width', 'height']] = grouped.first()[ ['filename', 'width', 'height']] # We know that all objects are PASperson, but let's count them. images_df['PASperson'] = True images_df['num_objects'] = grouped.count()['class'] # Load negative examples and append to the images_df. neg_filenames, neg_image_ids = map(list, zip(*[ (_.strip(), _.strip().split('/')[-1][:-4]) for _ in open('{}/{}/neg.lst'.format(dirname, split)).readlines() ])) neg_images_df = pd.DataFrame(index=neg_image_ids) neg_images_df['filename'] = neg_filenames neg_images_df['PASperson'] = False neg_images_df['num_objects'] = 0 images_df = images_df.append(neg_images_df) objects_df['split'] = split images_df['split'] = split objects_dfs.append(objects_df) images_dfs.append(images_df) objects_df = pd.concat(objects_dfs) images_df = pd.concat(images_dfs) images_df.to_hdf(cache_filename, 'images_df', mode='w') objects_df.to_hdf(cache_filename, 'objects_df', mode='a') return images_df, objects_df
def restore_db():
    filenames = glob.glob('phd_store*.h5')
    data_df = pd.read_hdf(filenames[0], 'author_df')
    for filename in filenames[1:]:
        temp = pd.read_hdf(filename, 'author_df')
        data_df = data_df.append(temp, ignore_index=True)

    # make a dataframe that is just US astro PhDs
    astro_df = data_df[(data_df['nonUS'] == False) & (data_df['astroPublication'] == True)]
    return astro_df, data_df
def load_DB():
    meta_data = pd.read_hdf(PATH.DB, 'meta_data')
    pulses = pd.read_hdf(PATH.DB, 'pulses')
    cands = pd.read_hdf(PATH.DB, 'candidates')
    cands = cands[cands.main_cand == 0]
    cands.sort_values('Sigma', inplace=True, ascending=False)
    cands = cands.groupby('BEAM').head(10)
    cands = cands.head(50)
    cands = cands[((cands.N_pulses == 1) & (cands.Sigma > 10.)) |
                  ((cands.N_pulses > 1) & (cands.Sigma > 16.))]
    cands.sort_values('Sigma', inplace=True, ascending=False)
    return meta_data, pulses, cands
def load_params(self):
    """
    """
    self.params_matrix = pd.read_hdf(self.ref_path, 'params_matrix')
    self.paramtree = pd.read_hdf(self.ref_path, 'params')
    self.paramtree = ParamTree(df=self.paramtree)
    self.measuretree = pd.read_hdf(self.ref_path, 'measures')
    self.measuretree = ParamTree(df=self.measuretree, adimentionalized=False)
import numpy as np import pandas as pd import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import Axes3D dataloc = '/home/jacob/research/velas/vela2b/vela27/a0.490/' dataloc = '/mnt/cluster/abs/cgm/vela2b/vela27/a0.490/' boxfile = '{0:s}vela2b-27_GZa0.490.h5'.format(dataloc) d = pd.read_hdf(boxfile, 'data') loT, hiT = 10**4, 10**4.5 loN, hiN = 10**-5, 10**-4.5 print len(d) cloudInds = ((d['temperature'] < hiT) & (d['temperature'] > loT) & (d['density'] < hiN) & (d['density'] > loN) & (d['x'] < 0) & (d['z'] > 0) & (np.abs(d['y']) < 300)) cloud = d[cloudInds] loc = cloud[['x', 'y', 'z']] locMat = loc.as_matrix() datamean = loc.mean(axis=0).as_matrix() uu, dd, vv = np.linalg.svd(locMat, full_matrices=True) print uu print dd print vv
def get_trial_response_df(self):
    tdf = pd.read_hdf(self.trial_response_df_path, key='df')
    tdf.reset_index(inplace=True)
    tdf.drop(columns=['cell_roi_id'], inplace=True)
    return tdf
import numpy as np import pandas as pd import matplotlib.pyplot as plt np.set_printoptions(edgeitems=1000) dat = (np.load('ssi63shitl10hr.npy')) print(int(dat[0, -1])) exit() df = pd.read_hdf('../../flight-data/ssi63.h5') [print(k) for k in df.keys()] dfn = df[[ 'raw_pressure_1', 'raw_pressure_2', 'raw_pressure_3', 'raw_pressure_4', 'raw_temp_1', 'raw_temp_2', 'raw_temp_3', 'raw_temp_4', 'lat_gps', 'long_gps', 'altitude_gps', 'heading_gps', 'speed_gps', 'num_sats_gps' ]] del df dfn = dfn.iloc[:20 * 60 * 60 * 10] dfn.reindex((dfn.index - dfn.index[0]) / np.timedelta64(1, 's')) data = np.hstack( (((dfn.index - dfn.index[0]) / np.timedelta64(1, 's')).values.reshape( -1, 1), dfn.values)) np.save('ssi63shitl10hr', data)
def ColumnCleaner(filer, key):
    framer1 = pd.read_hdf(filer, key)
    framer1 = framer1.drop(['Run', 'Event', 'SubEvent', 'SubEventStream', 'exists'], axis=1)
    cols1 = framer1.columns.tolist()
    framer1.columns = [key + '_%s' % cols1[i] for i in range(0, len(cols1[:]))]
    return framer1
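# Illustrative call to ColumnCleaner(); the file name and HDF key below are
# hypothetical placeholders for a table store containing the columns dropped above.
cleaned = ColumnCleaner('sim_output.h5', 'MCPrimary')
print(cleaned.columns.tolist())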
animalLists = [['adap042', 'adap043'], ['adap044', 'adap046']] labels = ['low_freq_go_left', 'low_freq_go_right'] tuningIntensities = [40, 50, 60, 70] plotAll = True qualityThreshold = 3 #2 maxZThreshold = 2 ISIcutoff = 0.02 for label,animalList in zip(labels, animalLists): # -- Make composite celldb -- # allMiceDfs = [] for animal in animalList: databaseFullPath = os.path.join(settings.DATABASE_PATH, '{}_database.h5'.format(animal)) key = 'head_fixed' celldbThisMouse = pd.read_hdf(databaseFullPath, key=key) allMiceDfs.append(celldbThisMouse) celldb = pd.concat(allMiceDfs, ignore_index=True) # -- Plot histogram of responsive freqs by hemi (!only for those TTs that are in striatum!) -- # goodQualCells = celldb.query("isiViolations<{} and shapeQuality>{} and astrRegion!='undetermined'".format(ISIcutoff, qualityThreshold)) maxZscore = goodQualCells.ZscoreEachIntensity.apply(lambda x : np.max(np.abs(x))) goodRespCells=goodQualCells[maxZscore >= maxZThreshold] # -- Plot reports -- # outputDir = '/home/languo/data/ephys/head_fixed_astr/all_mice/responsive_freqs_by_hemi/' if not os.path.exists(outputDir): os.mkdir(outputDir)
This script assigns all the other galaxy properties like baryonic mass and effective radius to each galaxy in the mock catalog """ from progressbar import ProgressBar import pandas as pd import numpy as np ### Reading text files Mr_vpeak_catalog = pd.read_csv('../data/SHAM_parallel.csv', \ delimiter='\t', header=None, \ names=['vpeak','M_r']) eco_obs_catalog = pd.read_csv('../data/gal_Lr_Mb_Re.txt',\ delimiter='\s+',header=None,skiprows=2,\ names=['M_r','logmbary','Re']) halocat_galcat_merged = pd.read_hdf('../data/halo_gal_Vishnu_Rockstar_macc.h5',\ key='halocat_galcat_merged') colnames = halocat_galcat_merged.columns Mr_vpeak_catalog = Mr_vpeak_catalog.sort_values('M_r') eco_obs_catalog = eco_obs_catalog.loc[eco_obs_catalog.Re.values >= 0] eco_obs_catalog = eco_obs_catalog.sort_values('M_r') pbar = ProgressBar() nearest_match_idx_arr = [] mbary_arr = [] re_arr = [] np.random.seed(0) for mag_value in pbar(Mr_vpeak_catalog.M_r.values): diff_arr = np.abs(eco_obs_catalog.M_r.values - mag_value) nearest_match_idx = np.where(diff_arr == diff_arr.min())[0] if len(nearest_match_idx) > 1:
def plot_ROC(prediction, pinfo, ensemble=1, label_type=None, output_png=None, output_tex=None, output_csv=None): # Convert the inputs to the correct format if type(prediction) is list: prediction = ''.join(prediction) if type(pinfo) is list: pinfo = ''.join(pinfo) if type(ensemble) is list: ensemble = int(ensemble[0]) # ensemble = ''.join(ensemble) if type(output_png) is list: output_png = ''.join(output_png) if type(output_csv) is list: output_csv = ''.join(output_csv) if type(output_tex) is list: output_tex = ''.join(output_tex) if type(label_type) is list: label_type = ''.join(label_type) # Read the inputs prediction = pd.read_hdf(prediction) if label_type is None: # Assume we want to have the first key label_type = prediction.keys()[0] N_1 = len(prediction[label_type].Y_train[0]) N_2 = len(prediction[label_type].Y_test[0]) # Determine the predicted score per patient print('Determining score per patient.') y_truths, y_scores, _, _ = plot_SVM(prediction, pinfo, label_type, show_plots=False, alpha=0.95, ensemble=ensemble, output='decision') # Plot the ROC with confidence intervals print("Plotting the ROC with confidence intervals.") plot = 'default' f, fpr, tpr = plot_ROC_CIc(y_truths, y_scores, N_1, N_2) if plot == 'default': plot = '' # Save the outputs if output_png is not None: f.savefig(output_png) print(("ROC saved as {} !").format(output_png)) if output_tex is not None: tikz_save(output_tex) print(("ROC saved as {} !").format(output_tex)) # Save ROC values as JSON if output_csv is not None: with open(output_csv, 'wb') as csv_file: writer = csv.writer(csv_file) writer.writerow(['FPR', 'TPR']) for i in range(0, len(fpr)): data = [str(fpr[i]), str(tpr[i])] writer.writerow(data) print(("ROC saved as {} !").format(output_csv)) return f, fpr, tpr
import _pickle as cPickle import matplotlib.cm as cm import os import matplotlib.gridspec as gridspec ############################################################################################################### # TO LOAD ############################################################################################################### data_directory = '/mnt/DataGuillaume/MergedData/' datasets = np.loadtxt(data_directory + 'datasets_ThalHpc.list', delimiter='\n', dtype=str, comments='#') # WHICH NEURONS mappings = pd.read_hdf("/mnt/DataGuillaume/MergedData/MAPPING_NUCLEUS.h5") firing_rate = pd.read_hdf("/mnt/DataGuillaume/MergedData/FIRING_RATE_ALL.h5") hd_index = mappings.index[np.where(mappings['hd'] == 1)[0]] hd_index = hd_index[np.where((firing_rate.loc[hd_index] > 1.0).all(axis=1))[0]] # SWR MODULATION swr_mod, swr_ses = loadSWRMod( '/mnt/DataGuillaume/MergedData/SWR_THAL_corr.pickle', datasets, return_index=True) nbins = 400 binsize = 5 times = np.arange(0, binsize * (nbins + 1), binsize) - (nbins * binsize) / 2 swr = pd.DataFrame(columns=swr_ses, index=times, data=gaussFilt(swr_mod, (1, )).transpose())
import numpy as np import pandas, cctbx, scitbx from dials.array_family import flex from cctbx import sgtbx, crystal # load the data! print "loading data" data_f = "/reg/d/psdm/cxi/cxid9114/res/dermen/reflection_2colorspec.hdf5" df = pandas.read_hdf( data_f,"reflections") sg = sgtbx.space_group(" P 4nw 2abw") Symm = crystal.symmetry( unit_cell=(79,79,38,90,90,90), space_group=sg) print "querying" df = df.query("BnotA") #df = df.query("intens2 < 5000") print "hkl" hkls = tuple( map( tuple, df[['hB','kB','lB']].values)) intens = np.ascontiguousarray(df.intens5.values) data = flex.double(intens) sigmas = flex.double( np.sqrt(intens)) mil_idx = flex.miller_index(hkls) mill_set = cctbx.miller.set( crystal_symmetry=Symm, indices=mil_idx, anomalous_flag=True) mill_ar = cctbx.miller.array(mill_set, data=data, sigmas=sigmas)\ .set_observation_type_xray_intensity()
def test_trigger_type_in_dl1_params():
    from lstchain.io.io import dl1_params_lstcam_key
    params = pd.read_hdf(dl1_file, key=dl1_params_lstcam_key)
    assert 'trigger_type' in params.columns
from __future__ import print_function from statsmodels.compat import iteritems, cStringIO import numpy as np import pandas as pd sio = cStringIO.StringIO() c = pd.read_hdf('kpss_critical_values.h5', 'c') ct = pd.read_hdf('kpss_critical_values.h5', 'ct') data = {'c': c, 'ct': ct} for k, v in iteritems(data): n = v.shape[0] selected = np.zeros((n, 1), dtype=np.bool) selected[0] = True selected[-1] = True selected[v.index == 10.0] = True selected[v.index == 5.0] = True selected[v.index == 2.5] = True selected[v.index == 1.0] = True max_diff = 1.0 while max_diff > 0.05: xp = np.squeeze(v[selected].values) yp = np.asarray(v[selected].index, dtype=np.float64) x = np.squeeze(v.values) y = np.asarray(v.index, dtype=np.float64) yi = np.interp(x, xp, yp) abs_diff = np.abs(y - yi) max_diff = np.max(abs_diff) if max_diff > 0.05: selected[np.where(abs_diff == max_diff)] = True
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils

# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

# Constants
FILE_PATH_TRAIN = "./input/train.h5"
# Validation Size
TEST_SIZE = 0.2

# read files
training_data = pd.read_hdf(FILE_PATH_TRAIN, "train")  # training data

# extracting the x-values
x_values_training = training_data.copy()
x_values_training = x_values_training.drop(labels=['y'], axis=1)
x_component_training = x_values_training.values

# extracting the y-values
y_component_training = training_data['y'].values

# training the scaler
scaler = StandardScaler(with_mean=True, with_std=True)
scaler = scaler.fit(x_component_training)

# scaling the training and test data
import pandas as pd
import numpy as np
import sklearn
from sklearn import neural_network
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

train_labeled = pd.read_hdf("train_labeled.h5", "train")
train_unlabeled = pd.read_hdf("train_unlabeled.h5", "train")
test = pd.read_hdf("test.h5", "test")

train_labeled = np.array(train_labeled)
features = train_labeled[:, 1:129]
labels = train_labeled[:, 0]
print(labels)
print(features)
print(labels.size)
print(features.size)

nn = neural_network.MLPClassifier(hidden_layer_sizes=(256,), activation='relu', solver='adam',
                                  alpha=1, learning_rate='constant', learning_rate_init=0.0001, power_t=0.5,
########## Buffer L T21 @3mM MgCl2 + 150 NaCl
### exp200
labels.extend(['Cy3b_R1-36#:R1s1-8:40nM_L21_exp200_p038uW'])
labels.extend(['Cy3b_R1-36#:R1s1-8:40nM_L21_exp200_p114uW'])
labels.extend(['Cy3b_R1-36#:R1s1-8:40nM_L21_exp200_p250uW'])

dir_names = [
    '/fs/pool/pool-schwille-paint/Analysis/p06.SP-tracking/immobile/tracking-handle/z.datalog'
] * len(labels)

############################################################## Read in data
#### Load & Sorting
path = [
    os.path.join(dir_names[i], labels[i] + '_stats.h5')
    for i in range(0, len(labels))
]

X = pd.concat([pd.read_hdf(p, key='result') for p in path])
X.sort_index(axis=0, ascending=True, inplace=True)
X.sort_index(axis=1, ascending=True, inplace=True)

#%%
############################################################## Quick selection
field = 'Tn=1e+00'
istrue = (X.power == 38) & (X.exp == 400)
Xred = X.loc[istrue, field]

#%%
exp = 200
buffers = ['B', 'L', 'L21']
colors = ['r', 'b', 'k', 'magenta']

############################################################## Plotting half
def get_pwi(self, segment, source, sink): pwi = self.segment_stores[segment].loc[source, sink] return pwi def save_output(self): nx.write_gpickle(self.G, '{0} Graph with PWIs.pkl'.format(self.handle)) def run(self): for sc, sk, d in self.G.edges(data=True): for seg, val in d['segments'].items(): pwi = self.get_pwi(seg, sc, sk) self.G.edge[sc][sk]['segments'][seg] = pwi print(self.G.edge[sc][sk]) self.save_output() if __name__ == '__main__': handle = sys.argv[1] segment_stores = dict() for i in range(1, 9): print('Getting segment {0} store.'.format(i)) segment_stores[i] = pd.read_hdf( '{0} Thresholded Segment Affmats.h5'.format(handle), key='segment{0}'.format(i)) gc = GraphPWIFinder(handle, segment_stores) gc.run()
def main(args, vers=None): # Load Events from HDF5 database of .singlepulse file try: events = pd.read_hdf(args.filename, 'events') with pd.HDFStore(args.filename) as store: db_keys = store.keys() file_type = 'hdf5' except (IOError, HDF5ExtError) as e: events = Events.Loader(args) file_type = 'sp' # Select events within the defined ranges if args.SNR_min is not None: events = events[events.Sigma >= args.SNR_min] if events.empty: print "No events found. Exiting" return # Load meta data if args.meta_data is not None: meta_data = Events.meta_data_Loader(args.meta_data) if vers is not None: meta_data['version'] = vers meta_data['File'] = os.path.basename(args.filename) if args.no_store is not None: meta_data.to_hdf(args.store_name, 'meta_data') elif file_type == 'hdf5': if '/meta_data' in db_keys: meta_data = pd.read_hdf(args.filename, 'meta_data') else: meta_data = None # Load Pulses if not args.no_search: pulses = Pulses.Loader(events, args) elif file_type == 'hdf5': if '/pulses' in db_keys: pulses = pd.read_hdf(args.filename, 'pulses') else: print "Pulses not present in the database. Exiting" return else: print "Events have been loaded and stored into the HDF5 file. Exiting" return if not args.no_filter: pulses = pulses[pulses.Rank == 0] # Select pulses within the defined ranges if args.t_range is not None: pulses = pulses[(pulses.Time >= args.t_range[0]) & (pulses.Time <= args.t_range[1])] if args.DM_range is not None: pulses = pulses[(pulses.DM >= args.DM_range[0]) & (pulses.DM <= args.DM_range[1])] if args.SNR_min is not None: pulses = pulses[pulses.Sigma >= args.SNR_peak_min] if args.N_min is not None: pulses = pulses[pulses.N_events >= args.N_min] if pulses.empty: print "No pulses found. Exiting" return # Load Candidates cands = Candidates.Loader(pulses, args) if not args.no_search or not args.no_store: cands.to_hdf(args.store_name, 'candidates') cands = cands[cands.main_cand == 0] if cands.empty: print "No candidates found. Exiting" return if cands.shape[0] > 100: print "{} candidates found, only the brightest 100 will be processed.".format( cands.shape[0]) cands = cands.head(100) cands = cands[((cands.N_pulses == 1) & (cands.Sigma >= args.single_cand_SNR)) | ((cands.N_pulses > 1) & (cands.Sigma >= args.multiple_cand_SNR))] cands.sort_values('Sigma', inplace=True, ascending=False) #Produce the output if not args.no_plot: LSPplot.output(args, events, pulses, cands, meta_data) return
args = parser.parse_args()

# Grep all database files
from pathlib import Path

files = []
for path in Path(args.path).rglob('*.hdf'):
    files.append(path)
for path in Path(args.path).rglob('*.hd5'):
    files.append(path)

if len(files) == 0:
    print('No new files found')
    exit()

for f in files:
    # target output
    desc = f  # .relative_to(args.path)
    output = f'{desc.parent}/{desc.stem}.parquet'

    # Original
    print(f'Converting from: {f}')
    df = pd.read_hdf(f)

    # execute
    print(f'Converting to: {output}')
    df.to_parquet(output)

    # FOLD
    print(f'RM: {f}')
    os.remove(f)
def load_df(dirpath, filename, varname=None):
    varname = filename if varname is None else varname
    fn = os.path.join(dirpath, filename)
    return read_hdf(fn, varname)
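# Hypothetical example of calling load_df(); the directory, file name and node
# name are placeholders and assume the file stores its table under 'results'.
df = load_df('/tmp/output', 'results.h5', varname='results')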
model.add(Dense(units=output_size)) model.add(Activation(activ_func)) model.compile(loss=loss, optimizer=optimizer) return model """ if __name__ == "__main__": with open('tickerlist.pkl', 'rb') as f: tickerlist = pickle.load(f) train, test, X_train, X_test, y_train, y_test = [], [], [], [], [], [] for symbol in tickerlist: #symbol = 'ADABTC' print(symbol) df = pd.read_hdf("added_params/" + symbol + ".h5") df = df.drop(['time'], axis=1) df = df.dropna() l_train, l_test, l_X_train, l_X_test, l_y_train, l_y_test = prepare_data_high( df, target_col='high', window_len=window_len, zero_base=zero_base, test_size=test_size) if len(l_y_test) > 10: symbolhighdata = {} symbolhighdata['l_train'] = l_train symbolhighdata['l_test'] = l_test symbolhighdata['l_X_train'] = l_X_train
def main(infold, mns_path, epsg_code, outfile, titre): """Main plotting function.""" files = {} for file in infold.iterdir(): files.update({file.name: file}) # Extract the info nb = list(files.keys())[0].split("_")[-1].split(".")[0] width = list(files.keys())[0].split("_")[-3] azi = list(files.keys())[0].split("_")[-5] # Open gdal raster MNS_data, MNS_gt, MNS_ds = open_large_raster(str(mns_path)) # Open transect data_tr = {} for fname, pth in files.items(): if "transect" in fname: with open(pth, "rb") as f: transects = pickle.load(f) else: data_tr.update({"_".join(fname.split("_")[0:2]): pd.read_hdf(pth)}) # Get very approximate center of transects midishline = transects[int(len(transects) / 2)] mid_point = midishline.interpolate(0.5, normalized=True) midpoint_buffer = mid_point.buffer(midishline.length / 2) envelope = midpoint_buffer.envelope # Turn interactive plotting off plt.ioff() # Create figure fig = plt.figure(figsize=(15.4, 6.6)) fig.suptitle(titre) # Epsg proj_code = ccrs.epsg(epsg_code) # 2 by 2 grid gs = GridSpec(ncols=3, nrows=2, figure=fig, width_ratios=[0.1, 1.5, 4]) ax = plt.subplot(gs[0, 1], projection=proj_code) ax1 = plt.subplot(gs[:, 0]) ax2 = plt.subplot(gs[-1, 1], projection=proj_code) ax3 = plt.subplot(gs[:, -1]) # AX mns_masked = np.ma.masked_where(MNS_data < 0, MNS_data) extent = ( MNS_gt[0], MNS_gt[0] + MNS_ds.RasterXSize * MNS_gt[1], MNS_gt[3] + MNS_ds.RasterYSize * MNS_gt[5], MNS_gt[3], ) ax.imshow( mns_masked, extent=extent, origin="upper", cmap="gist_earth" ) ax.plot( [midishline.coords[0][0], midishline.coords[-1][0]], [midishline.coords[0][1], midishline.coords[-1][1]], linestyle="-", color="red", linewidth=1, ) norm = Normalize(vmin=np.min(mns_masked), vmax=np.max(mns_masked)) cbar = ColorbarBase( ax1, cmap=plt.get_cmap("gist_earth"), norm=norm, orientation="vertical" ) cbar.ax.yaxis.set_label_position("left") cbar.ax.set_ylabel("Altitude / m") # AX2 ax2.imshow( mns_masked, extent=extent, origin="upper", cmap="gist_earth" ) for line in transects: ax2.plot( [line.coords[0][0], line.coords[-1][0]], [line.coords[0][1], line.coords[-1][1]], linestyle="-", color="black", alpha=0.6, linewidth=0.5, ) ax2.set_extent( [ envelope.bounds[0], envelope.bounds[2], envelope.bounds[1], envelope.bounds[-1], ], crs=ccrs.epsg(epsg_code), ) ax2.set_title("Zoom on transects", y=-0.2) # AX3 # Plot MNT/ MNS ground data_tr["MNT_solnu"].T.plot( ax=ax3, color="sienna", alpha=0.1, legend=False ) data_tr["MNT_solnu"].T.mean(axis=1).plot( ax=ax3, color="sienna", legend=True, label="Mean summer DTM" ) data_tr["MNS_solnu"].T.plot( ax=ax3, color="lightgreen", alpha=0.1, legend=False ) data_tr["MNS_solnu"].T.mean(axis=1).plot( ax=ax3, color="lightgreen", legend=True, label="Mean summer DSM" ) # Plot MNS neige data_tr["MNS_neige"].T.plot( ax=ax3, color="midnightblue", alpha=0.2, legend=False ) data_tr["MNS_neige"].T.mean(axis=1).plot( ax=ax3, color="midnightblue", legend=True, label="Mean winter DSM" ) ax3.set_title( "Azimuth: %s°, Width: %sm, # of transects: %s" % (azi, width, nb) ) ax3.set_xlabel("Distance along transect / m") ax3.set_ylabel("Altitude / m") ax3.set_xlim(0, midishline.length) ax3.set_ylim( np.nanmin(data_tr["MNT_solnu"].T.mean(axis=1)) - 5, np.nanmax(data_tr["MNS_neige"].T.mean(axis=1)) + 5, ) ax3.xaxis.set_major_locator(MultipleLocator(10)) ax3.xaxis.set_minor_locator(MultipleLocator(5)) ax3.yaxis.set_major_locator(MultipleLocator(1)) ax3.yaxis.set_minor_locator(MultipleLocator(0.5)) ax3.xaxis.set_ticks_position("both") ax3.yaxis.set_ticks_position("both") 
ax3.tick_params(direction="inout", which="both") fig.savefig(infold.joinpath(outfile), bbox_inches="tight", dpi=300)
def read_hdf_data_psi(path = 'premix_data', key='of_tables', in_labels=['zeta','f','pv'], labels = ['T'], scaler = None): # read in the hdf5 file # AND COMPUTE PSI OF THE MIXTURE try: df = pd.read_hdf(path,key=key) except: print('Check the data path and key') # read the molar weigths with open('molar_weights.json', 'r') as fp: molar_weights = json.load(fp) # read in the order of the species names with open('GRI_species_order') as f: all_species = f.read().splitlines() # numpy array of species molar weights molar_weights_np = np.array([molar_weights[s] for s in all_species]) molar_weights_np = molar_weights_np/ 1000 # conversion from g to kg! This is needed for OpenFOAM T_vector = df['T'].as_matrix() # convert to ndarray gri_mass_frac = df[all_species].as_matrix() # COMPUTE THE CORRECT PSI VALUE R_universal = 8.314459 psi_list = [] print('Starting to compute psi ... ') # iterate over all rows for index in range(0,df.shape[0]): R_m = R_universal * sum(gri_mass_frac[index,:] / molar_weights_np) #df['psi'].iloc[index] = 1 / (R_m * row['T']) psi_list.append(1/(R_m * T_vector[index])) # print(index) # hand back the data to df df['psi'] = psi_list print('Done with psi!\n') input_df=df[in_labels] if scaler=='MinMax': in_scaler = preprocessing.MinMaxScaler() out_scaler = preprocessing.MinMaxScaler() elif scaler=='Standard': in_scaler = preprocessing.StandardScaler() out_scaler = preprocessing.StandardScaler() else: raise ValueError('Only possible scalers are: MinMax or Standard.') input_np = in_scaler.fit_transform(input_df) label_df=df[labels] label_np = out_scaler.fit_transform(label_df) print('\n*******************************') print('The scaler is %s\n' % scaler) print('This is the order of the labels:') [print(f) for f in labels] print('*******************************\n') return input_np, label_np, df, in_scaler, out_scaler
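# Sketch of how read_hdf_data_psi() might be invoked; the path and key are the
# function's own defaults, and choosing the 'Standard' scaler here is an assumed
# example, not prescribed by the original code.
X, y, df_full, in_scaler, out_scaler = read_hdf_data_psi(
    path='premix_data', key='of_tables',
    in_labels=['zeta', 'f', 'pv'], labels=['T'],
    scaler='Standard')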
def get_extended_stimulus_presentations_df(self):
    return pd.read_hdf(self.extended_stimulus_presentations_df_path, key='df')
def load_equities():
    return pd.read_hdf(custom_data_path / 'stooq.h5', 'jp/equities')
def test_write_dl2_dataframe():
    from lstchain.tests.test_lstchain import dl2_file, test_dir
    from lstchain.io.io import dl2_params_lstcam_key
    dl2 = pd.read_hdf(dl2_file, key=dl2_params_lstcam_key)
    from lstchain.io import write_dl2_dataframe
    write_dl2_dataframe(dl2, os.path.join(test_dir, 'dl2_test.h5'))
import pandas as pd
import urllib.request

# Read from h5
df = pd.read_hdf('./02_io_tools/hdfstore.h5', 'd1')
print(df.head())

# Create JSON
df.to_json('./02_io_tools/example_json.json')

# Read JSON
df2 = pd.read_json('./02_io_tools/example_json.json')
print(df2.head())

# Request market history from the Bittrex public API and read it into a DataFrame
get_market_history_json = urllib.request.urlopen(
    'https://api.bittrex.com/api/v1.1/public/getmarkethistory?market=USD-BTC'
).read()
get_market_history_df = pd.read_json(get_market_history_json)
print(get_market_history_df)
def AnalyzeVideosSession(video_dir): """ DeepLabCut Toolbox https://github.com/AlexEMG/DeepLabCut A Mathis, [email protected] M Mathis, [email protected] This script analyzes videos based on a trained network (as specified in myconfig_analysis.py) You need tensorflow for evaluation. Run by: python3 AnalyzeVideosSession.py video_dir Functionalized by Adam S. Lowet, 10/25/19 """ #################################################### # Dependencies #################################################### import os.path import sys subfolder = os.getcwd().split('analysis-tools')[0] sys.path.append(subfolder) # add parent directory: (where nnet & config are!) sys.path.append(os.path.join(subfolder, "pose-tensorflow")) sys.path.append(os.path.join(subfolder, "config")) from myconfig_analysis import cropping, Task, date, \ trainingsFraction, resnet, snapshotindex, shuffle,x1, x2, y1, y2, videotype, storedata_as_csv # Deep-cut dependencies from config import load_config from nnet import predict from dataset.pose_dataset import data_to_input # Dependencies for video: import pickle # import matplotlib.pyplot as plt import imageio from skimage.util import img_as_ubyte from moviepy.editor import VideoFileClip import skimage import skimage.color import time import pandas as pd import numpy as np import os from tqdm import tqdm def getpose(image, cfg, outputs, outall=False): ''' Adapted from DeeperCut, see pose-tensorflow folder''' image_batch = data_to_input(skimage.color.gray2rgb(image)) outputs_np = sess.run(outputs, feed_dict={inputs: image_batch}) scmap, locref = predict.extract_cnn_output(outputs_np, cfg) pose = predict.argmax_pose_predict(scmap, locref, cfg.stride) if outall: return scmap, locref, pose else: return pose #################################################### # Loading data, and defining model folder #################################################### basefolder = os.path.join('..', '..', 'pose-tensorflow', 'models') modelfolder = os.path.join( basefolder, Task + str(date) + '-trainset' + str(int(trainingsFraction * 100)) + 'shuffle' + str(shuffle)) cfg = load_config(os.path.join(modelfolder, 'test', "pose_cfg.yaml")) ################################################## # Load and setup CNN part detector ################################################## # Check which snapshots are available and sort them by # iterations Snapshots = np.array([ fn.split('.')[0] for fn in os.listdir(os.path.join(modelfolder, 'train')) if "index" in fn ]) increasing_indices = np.argsort([int(m.split('-')[1]) for m in Snapshots]) Snapshots = Snapshots[increasing_indices] print(modelfolder) print(Snapshots) ################################################## # Compute predictions over images ################################################## # Check if data already was generated: cfg['init_weights'] = os.path.join(modelfolder, 'train', Snapshots[snapshotindex]) # Name for scorer: trainingsiterations = (cfg['init_weights'].split('/')[-1]).split('-')[-1] # Name for scorer: scorer = 'DeepCut' + "_resnet" + str(resnet) + "_" + Task + str( date) + 'shuffle' + str(shuffle) + '_' + str(trainingsiterations) cfg['init_weights'] = os.path.join(modelfolder, 'train', Snapshots[snapshotindex]) sess, inputs, outputs = predict.setup_pose_prediction(cfg) pdindex = pd.MultiIndex.from_product( [[scorer], cfg['all_joints_names'], ['x', 'y', 'likelihood']], names=['scorer', 'bodyparts', 'coords']) ################################################## # Datafolder ################################################## # 
video_dir='../videos/' #where your folder with videos is. frame_buffer = 10 os.chdir(video_dir) videos = np.sort([fn for fn in os.listdir(os.curdir) if (videotype in fn)]) print("Starting ", video_dir, videos) for video in videos: dataname = video.split('.')[0] + scorer + '.h5' try: # Attempt to load data... pd.read_hdf(dataname) print("Video already analyzed!", dataname) except FileNotFoundError: print("Loading ", video) clip = VideoFileClip(video) ny, nx = clip.size # dimensions of frame (height, width) fps = clip.fps #nframes = np.sum(1 for j in clip.iter_frames()) #this is slow (but accurate) nframes_approx = int( np.ceil(clip.duration * clip.fps) + frame_buffer) # this will overestimage number of frames (see https://github.com/AlexEMG/DeepLabCut/issues/9) This is especially a problem # for high frame rates and long durations due to rounding errors (as Rich Warren found). Later we crop the result (line 187) if cropping: clip = clip.crop(y1=y1, y2=y2, x1=x1, x2=x2) # one might want to adjust print("Duration of video [s]: ", clip.duration, ", recorded with ", fps, "fps!") print("Overall # of frames: ", nframes_approx, "with cropped frame dimensions: ", clip.size) start = time.time() PredicteData = np.zeros( (nframes_approx, 3 * len(cfg['all_joints_names']))) clip.reader.initialize() print("Starting to extract posture") for index in tqdm(range(nframes_approx)): #image = img_as_ubyte(clip.get_frame(index * 1. / fps)) image = img_as_ubyte(clip.reader.read_frame()) # Thanks to Rick Warren for the following snipplet: # if close to end of video, start checking whether two adjacent frames are identical # this should only happen when moviepy has reached the final frame # if two adjacent frames are identical, terminate the loop if index == int(nframes_approx - frame_buffer * 2): last_image = image elif index > int(nframes_approx - frame_buffer * 2): if (image == last_image).all(): nframes = index print("Detected frames: ", nframes) break else: last_image = image pose = getpose(image, cfg, outputs) PredicteData[index, :] = pose.flatten( ) # NOTE: thereby cfg['all_joints_names'] should be same order as bodyparts! stop = time.time() dictionary = { "start": start, "stop": stop, "run_duration": stop - start, "Scorer": scorer, "config file": cfg, "fps": fps, "frame_dimensions": (ny, nx), "nframes": nframes } metadata = {'data': dictionary} print("Saving results...") DataMachine = pd.DataFrame( PredicteData[:nframes, :], columns=pdindex, index=range( nframes)) #slice pose data to have same # as # of frames. DataMachine.to_hdf(dataname, 'df_with_missing', format='table', mode='w') if storedata_as_csv: DataMachine.to_csv(video.split('.')[0] + scorer + '.csv') with open( dataname.split('.')[0] + 'includingmetadata.pickle', 'wb') as f: pickle.dump(metadata, f, pickle.HIGHEST_PROTOCOL)
def load_data(): return pd.read_hdf("./raw_data/data.hdf", "master")
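# --- Usage sketch (added) ---
# Hypothetical call to load_data(); it assumes ./raw_data/data.hdf with key
# "master" exists, exactly as hard-coded in the function above.
df = load_data()
print(df.shape)
print(df.head())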
import os.path as op

import numpy as np
import pandas as pd

from sklearn.model_selection import ShuffleSplit, RepeatedKFold
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import RidgeCV

# `cfg` is the project's configuration module (it provides derivative_path);
# it is not shown in this snippet.

input_path = "/storage/inria/agramfor/camcan_derivatives"

bands = [
    'alpha', 'beta_high', 'beta_low', 'delta', 'gamma_high', 'gamma_lo',
    'gamma_mid', 'low', 'theta'
]

# assemble matrices
data = list()
for band in bands:
    data.append(
        pd.read_hdf(op.join(input_path, f'mne_source_power_diag-{band}.h5'),
                    'mne_power_diag'))
data = pd.concat(data, axis=1)

subjects = data.index.values

# restrict to the subjects used in the previous NIPS submission
# (i.e. drop the subjects that were added afterwards)
new_subjects = ['CC510256', 'CC520197', 'CC610051', 'CC121795', 'CC410182']
mask = ~np.in1d(subjects, new_subjects)
subjects = subjects[mask]
X = data.values[mask]

participants_fname = op.join(cfg.derivative_path, "participants.csv")
participants = pd.read_csv(participants_fname)
y = participants.set_index('Observations').age.loc[subjects].values
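# --- Model-fitting sketch (added) ---
# The fragment above assembles X (band-wise source-power features) and y (age)
# but stops before fitting. Below is a minimal sketch of how the imported
# RidgeCV / make_pipeline / RepeatedKFold pieces might be wired together; the
# alpha grid, the StandardScaler, and the CV settings are assumptions, not
# necessarily the authors' exact choices.
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

model = make_pipeline(
    StandardScaler(),
    RidgeCV(alphas=np.logspace(-3, 5, 100)))
cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=42)
scores = cross_val_score(model, X, y, cv=cv,
                         scoring='neg_mean_absolute_error', n_jobs=-1)
print(f"MAE: {-scores.mean():.2f} +/- {scores.std():.2f} years")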
def ExtractFramesbasedonPreselection(
    Index,
    extractionalgorithm,
    data,
    video,
    cfg,
    config,
    opencv=True,
    cluster_resizewidth=30,
    cluster_color=False,
    savelabeled=True,
    with_annotations=True,
):
    from deeplabcut.create_project import add

    start = cfg["start"]
    stop = cfg["stop"]
    numframes2extract = cfg["numframes2pick"]
    bodyparts = auxiliaryfunctions.IntersectionofBodyPartsandOnesGivenbyUser(
        cfg, "all")

    videofolder = str(Path(video).parents[0])
    vname = str(Path(video).stem)
    tmpfolder = os.path.join(cfg["project_path"], "labeled-data", vname)
    if os.path.isdir(tmpfolder):
        print("Frames from video", vname, " already extracted (more will be added)!")
    else:
        auxiliaryfunctions.attempttomakefolder(tmpfolder, recursive=True)

    nframes = len(data)
    print("Loading video...")
    if opencv:
        vid = VideoWriter(video)
        fps = vid.fps
        duration = vid.calc_duration()
    else:
        from moviepy.editor import VideoFileClip

        clip = VideoFileClip(video)
        fps = clip.fps
        duration = clip.duration

    if cfg["cropping"]:  # one might want to adjust
        coords = (cfg["x1"], cfg["x2"], cfg["y1"], cfg["y2"])
    else:
        coords = None

    print("Duration of video [s]: ", duration, ", recorded @ ", fps, "fps!")
    print("Overall # of frames: ", nframes, "with (cropped) frame dimensions: ")
    if extractionalgorithm == "uniform":
        if opencv:
            frames2pick = frameselectiontools.UniformFramescv2(
                vid, numframes2extract, start, stop, Index)
        else:
            frames2pick = frameselectiontools.UniformFrames(
                clip, numframes2extract, start, stop, Index)
    elif extractionalgorithm == "kmeans":
        if opencv:
            frames2pick = frameselectiontools.KmeansbasedFrameselectioncv2(
                vid,
                numframes2extract,
                start,
                stop,
                cfg["cropping"],
                coords,
                Index,
                resizewidth=cluster_resizewidth,
                color=cluster_color,
            )
        else:
            if cfg["cropping"]:
                clip = clip.crop(y1=cfg["y1"], y2=cfg["y2"], x1=cfg["x1"], x2=cfg["x2"])
            frames2pick = frameselectiontools.KmeansbasedFrameselection(
                clip,
                numframes2extract,
                start,
                stop,
                Index,
                resizewidth=cluster_resizewidth,
                color=cluster_color,
            )
    else:
        print(
            "Please implement this method yourself! Currently the options are 'kmeans', 'jump', 'uniform'."
        )
        frames2pick = []

    # Extract frames + frames with plotted labels and store them in a folder (with name derived from video name) under labeled-data
    print("Let's select frame indices:", frames2pick)
    colors = visualization.get_cmap(len(bodyparts), cfg["colormap"])
    strwidth = int(np.ceil(np.log10(nframes)))  # width for strings
    for index in frames2pick:  ##tqdm(range(0,nframes,10)):
        if opencv:
            PlottingSingleFramecv2(
                vid,
                cfg["cropping"],
                coords,
                data,
                bodyparts,
                tmpfolder,
                index,
                cfg["dotsize"],
                cfg["pcutoff"],
                cfg["alphavalue"],
                colors,
                strwidth,
                savelabeled,
            )
        else:
            PlottingSingleFrame(
                clip,
                data,
                bodyparts,
                tmpfolder,
                index,
                cfg["dotsize"],
                cfg["pcutoff"],
                cfg["alphavalue"],
                colors,
                strwidth,
                savelabeled,
            )
        plt.close("all")

    # close videos
    if opencv:
        vid.close()
    else:
        clip.close()
        del clip

    # Extract annotations based on DeepLabCut and store in the folder (with name derived from video name) under labeled-data
    if len(frames2pick) > 0:
        try:
            if cfg["cropping"]:
                add.add_new_videos(
                    config, [video],
                    coords=[coords])  # make sure you pass coords as a list
            else:
                add.add_new_videos(config, [video], coords=None)
        except:  # can we make a catch here? - in fact we should drop indices from DataCombined if they are in CollectedData.. [ideal behavior; currently this is pretty unlikely]
            print(
                "AUTOMATIC ADDING OF VIDEO TO CONFIG FILE FAILED! You need to do this manually for including it in the config.yaml file!"
) print("Videopath:", video, "Coordinates for cropping:", coords) pass if with_annotations: machinefile = os.path.join( tmpfolder, "machinelabels-iter" + str(cfg["iteration"]) + ".h5") if isinstance(data, pd.DataFrame): df = data.loc[frames2pick] df.index = [ os.path.join( "labeled-data", vname, "img" + str(index).zfill(strwidth) + ".png", ) for index in df.index ] # exchange index number by file names. elif isinstance(data, dict): idx = [ os.path.join( "labeled-data", vname, "img" + str(index).zfill(strwidth) + ".png", ) for index in frames2pick ] filename = os.path.join(str(tmpfolder), f"CollectedData_{cfg['scorer']}.h5") try: df_temp = pd.read_hdf(filename, "df_with_missing") columns = df_temp.columns except FileNotFoundError: columns = pd.MultiIndex.from_product( [ [cfg["scorer"]], cfg["individuals"], cfg["multianimalbodyparts"], ["x", "y"], ], names=["scorer", "individuals", "bodyparts", "coords"], ) if cfg["uniquebodyparts"]: columns2 = pd.MultiIndex.from_product( [ [cfg["scorer"]], ["single"], cfg["uniquebodyparts"], ["x", "y"], ], names=[ "scorer", "individuals", "bodyparts", "coords" ], ) df_temp = pd.concat(( pd.DataFrame(columns=columns), pd.DataFrame(columns=columns2), )) columns = df_temp.columns array = np.full((len(frames2pick), len(columns)), np.nan) for i, index in enumerate(frames2pick): data_temp = data.get(index) if data_temp is not None: vals = np.concatenate(data_temp)[:, :2].flatten() array[i, :len(vals)] = vals df = pd.DataFrame(array, index=idx, columns=columns) else: return if Path(machinefile).is_file(): Data = pd.read_hdf(machinefile, "df_with_missing") DataCombined = pd.concat([Data, df]) # drop duplicate labels: DataCombined = DataCombined[~DataCombined.index.duplicated( keep="first")] DataCombined.to_hdf(machinefile, key="df_with_missing", mode="w") DataCombined.to_csv( os.path.join(tmpfolder, "machinelabels.csv") ) # this is always the most current one (as reading is from h5) else: df.to_hdf(machinefile, key="df_with_missing", mode="w") df.to_csv(os.path.join(tmpfolder, "machinelabels.csv")) print( "The outlier frames are extracted. They are stored in the subdirectory labeled-data\%s." % vname) print( "Once you extracted frames for all videos, use 'refine_labels' to manually correct the labels." ) else: print("No frames were extracted.")
def timegrid_one_batch(configs): batch_id = configs["batch_id"] with open(configs["pid_batch_file"], 'rb') as fp: obj = pickle.load(fp) batch_to_lst = obj["batch_to_lst"] batches = list(sorted(batch_to_lst.keys())) batch_idxs = batch_to_lst[batch_id] first_write = True create_static = configs["create_static"] create_dynamic = configs["create_dynamic"] print("Dispatched batch {} with {} patients".format( batch_id, len(batch_idxs))) for pidx, pid in enumerate(batch_idxs): if (pidx + 1) % 10 == 0: print("Progress in batch {}: {}/{}".format(batch_id, pidx + 1, len(batch_idxs))) if create_static: static_extractor = eicu_static_tf.StaticExtractor() df_pat = pd.read_hdf(configs["input_patient_table"], mode='r', where="patientunitstayid={}".format(pid)) df_adm = pd.read_hdf(configs["input_admission_table"], mode='r', where="patientunitstayid={}".format(pid)) df_aav = pd.read_hdf(configs["input_apache_aps_var_table"], mode='r', where="patientunitstayid={}".format(pid)) df_apr = pd.read_hdf(configs["input_apache_patient_result_table"], mode='r', where="patientunitstayid={}".format(pid)) df_apv = pd.read_hdf(configs["input_apache_pred_var_table"], mode='r', where="patientunitstayid={}".format(pid)) df_static = static_extractor.transform(df_pat, df_adm, df_aav, df_apr, df_apv, pid=pid) if create_dynamic: lab_vars = [] with open(configs["selected_lab_vars"], 'r') as fp: csv_fp = csv.reader(fp, delimiter='\t') next(csv_fp) for lab_name in csv_fp: lab_vars.append(lab_name[0].strip()) grid_model = eicu_tf_impute.Timegridder() grid_model.set_selected_lab_vars(lab_vars) quantile_fp = open(configs["quantile_dict"], mode='r') var_quantile_dict = json.load(quantile_fp) grid_model.set_quantile_dict(var_quantile_dict) quantile_fp.close() df_lab = pd.read_hdf(configs["input_lab_table"], mode='r', where="patientunitstayid={}".format(pid)) df_vs = pd.read_hdf(configs["input_vital_periodic_table"], mode='r', where="patientunitstayid={}".format(pid)) df_avs = pd.read_hdf(configs["input_vital_aperiodic_table"], mode='r', where="patientunitstayid={}".format(pid)) df_out = grid_model.transform(df_lab, df_vs, df_avs, pid=pid) if first_write: if create_dynamic: df_out.to_hdf(os.path.join(configs["output_dynamic_dir"], "batch_{}.h5".format(batch_id)), configs["output_dset_id"], append=False, data_columns=["patientunitstayid"], mode='w', format="table", complevel=configs["hdf_comp_level"], complib=configs["hdf_comp_alg"]) if create_static: df_static.to_hdf(os.path.join(configs["output_static_dir"], "batch_{}.h5".format(batch_id)), configs["output_dset_id"], append=False, data_columns=["patientunitstayid"], mode='w', format="table", complevel=configs["hdf_comp_level"], complib=configs["hdf_comp_alg"]) else: if create_dynamic: df_out.to_hdf(os.path.join(configs["output_dynamic_dir"], "batch_{}.h5".format(batch_id)), configs["output_dset_id"], append=True, data_columns=["patientunitstayid"], mode='a', format="table", complevel=configs["hdf_comp_level"], complib=configs["hdf_comp_alg"]) if create_static: df_static.to_hdf(os.path.join(configs["output_static_dir"], "batch_{}.h5".format(batch_id)), configs["output_dset_id"], append=True, data_columns=["patientunitstayid"], mode='a', format="table", complevel=configs["hdf_comp_level"], complib=configs["hdf_comp_alg"]) first_write = False
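# --- Configuration sketch (added; hypothetical paths) ---
# Every key below is one that timegrid_one_batch() actually reads; the file
# locations, dataset id, and compression settings are placeholder assumptions.
example_configs = {
    "batch_id": 0,
    "pid_batch_file": "/data/eicu/pid_batches.pickle",
    "create_static": True,
    "create_dynamic": True,
    "input_patient_table": "/data/eicu/hdf/patient.h5",
    "input_admission_table": "/data/eicu/hdf/admissiondx.h5",
    "input_apache_aps_var_table": "/data/eicu/hdf/apacheApsVar.h5",
    "input_apache_patient_result_table": "/data/eicu/hdf/apachePatientResult.h5",
    "input_apache_pred_var_table": "/data/eicu/hdf/apachePredVar.h5",
    "input_lab_table": "/data/eicu/hdf/lab.h5",
    "input_vital_periodic_table": "/data/eicu/hdf/vitalPeriodic.h5",
    "input_vital_aperiodic_table": "/data/eicu/hdf/vitalAperiodic.h5",
    "selected_lab_vars": "/data/eicu/selected_lab_vars.tsv",
    "quantile_dict": "/data/eicu/var_quantiles.json",
    "output_dynamic_dir": "/data/eicu/timegrid/dynamic",
    "output_static_dir": "/data/eicu/timegrid/static",
    "output_dset_id": "data",
    "hdf_comp_level": 5,
    "hdf_comp_alg": "blosc:lz4",
}

timegrid_one_batch(example_configs)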