def addtoQuilt(df_new, name):
    from quilt.data.nmduarte import gdelt3

    # Load the existing package node that the new data will be appended to.
    if name == "data_with_news":
        d = gdelt3.data.data_with_news()
    else:
        d = gdelt3.data.events()

    # Convert the incoming Spark DataFrame to pandas so it can be appended.
    df_new2 = df_new.toPandas()
    print("Appending:", df_new2.count())
    print("original:", type(d))
    print("new:", type(df_new2))

    d = d.append(df_new2)
    print("TOTAL:", d.count())

    # Rebuild and push the updated subpackage.
    quilt.build("nmduarte/gdelt3/data/" + name, d)
    quilt.push("nmduarte/gdelt3/data/" + name, is_public=True, is_team=False)
def _build_file_as_package(self, filepath: Union[str, pathlib.Path], package_name: str) -> str:
    # enforce types
    checks.check_types(filepath, [str, pathlib.Path])
    checks.check_types(package_name, str)
    checks.check_file_exists(filepath)

    # convert types
    filepath = pathlib.Path(filepath)
    filepath = filepath.expanduser()
    filepath = filepath.resolve()

    # construct manifest
    load = {}
    load["file"] = str(filepath)
    load["transform"] = "id"
    contents = {"load": load}
    node = {"contents": contents}

    # write temporary manifest
    temp_write_loc = pathlib.Path(os.getcwd())
    temp_write_loc /= "single_file.yml"
    with open(temp_write_loc, "w") as write_out:
        yaml.dump(node, write_out, default_flow_style=False)

    # create quilt node
    full_package_name = self.storage_user + "/" + package_name
    quilt.build(full_package_name, str(temp_write_loc))

    # remove the temp file
    os.remove(temp_write_loc)

    return full_package_name
def df_to_quilt(df, path):
    parts = path.split('/')
    assert len(parts) > 2
    root_pkg = '/'.join(parts[0:2])
    try:
        quilt.install(root_pkg, force=True)
    except Exception:
        pass

    object_encoding = {}
    df = df.copy()
    for col, dtype in df.dtypes.iteritems():
        if dtype.name in ('Int8', 'Int32'):
            object_encoding[col] = 'int32'
            df[col] = df[col].astype(object)
        else:
            object_encoding[col] = 'infer'

    with tempfile.NamedTemporaryFile(suffix='.parquet') as f:
        print('writing to %s' % f.name)
        fastparquet.write(f.name, df, compression='snappy', object_encoding=object_encoding)
        print('build')
        quilt.build(path, f.name)
        print('push')
        quilt.push(root_pkg, is_public=True)
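# Hypothetical usage sketch for df_to_quilt above: the package handle and the
# dataframe are made up, and the nullable Int32 column is only there to
# exercise the object_encoding branch. Assumes the legacy quilt 2.x API and
# fastparquet are installed.
import pandas as pd

example_df = pd.DataFrame({"value": pd.array([1, 2, None], dtype="Int32")})
df_to_quilt(example_df, "someuser/example_pkg/data/values")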
def update_quilt_datasets():
    import quilt
    from quilt.data.jyrjola import karttahel

    df = get_buildings()
    karttahel._set(['buildings'], df)
    quilt.build('jyrjola/karttahel', karttahel)
    quilt.push('jyrjola/karttahel', is_public=True)
def _get_root_node(self):
    store, package = self.find_package()
    if not package:
        quilt.build(self.package_name)
        store, package = self.find_package()
    root_node = _from_core_node(store, package)
    return root_node
def update(self, df):
    root_node = self._get_root_node()
    df, meta = pint_df_to_quilt(df)
    root_node._set([self.sub_path], df)
    data_node = getattr(root_node, self.sub_path)
    data_node._meta.update(meta)
    quilt.build(self.package_name, root_node)
def update_quilt_datasets():
    QUILT_TARGET = 'jyrjola/statfi'
    from quilt.data.jyrjola import statfi as node
    import quilt
    import requests_cache
    requests_cache.install_cache()

    df = get_fuel_classification()
    df.to_csv('fuel_classification.csv')
    print(df)
    exit()

    # NOTE: the exit() above makes the quilt update below unreachable.
    node._set(['fuel_classification'], df)
    quilt.build(QUILT_TARGET, node)
    quilt.push(QUILT_TARGET, is_public=True)
def refresh_pxweb_datasets():
    import requests_cache
    requests_cache.install_cache()

    api = PXWebAPI('http://trafi2.stat.fi/PXWeb', 'fi')
    # print(api.list_topics('TraFi/Ensirekisteroinnit'))
    # exit()

    for path, table in PXWEB_TABLES:
        print(path, table)
        pxf = api.get_table('%s/%s.px' % (path, table))
        table = 'tf%s' % table
        root_node = update_node_from_pcaxis(QUILT_DATASET, table, pxf)
        quilt.build(QUILT_DATASET, root_node)

    quilt.push(QUILT_DATASET, is_public=True)
def merge(self, df):
    root_node = self._get_root_node()
    data_node = getattr(root_node, self.sub_path, None)
    if data_node is None:
        return self.update(df)

    old_df = quilt_to_pint_df(data_node)
    merged = old_df.append(df).sort_index()
    # Remove duplicate rows
    merged = merged[~merged.index.duplicated(keep='first')]

    merged, meta = pint_df_to_quilt(merged)
    root_node._set([self.sub_path], merged)
    data_node = getattr(root_node, self.sub_path)
    data_node._meta.update(meta)
    quilt.build(self.package_name, root_node)
    return merged
def update_pkg(
    df: pd.DataFrame,
    user: str,
    package: str,
    readme: Optional[str] = None,
    hash_key=None,
):
    r"""Build and push a single-dataframe quilt package.

    Parameters
    ----------
    df
        Dataframe to store under the package's ``df`` node.
    user
        Quilt user (or team) that owns the package.
    package
        Package name.
    readme
        Optional README contents for the package.
    hash_key
        Optional hash passed through to ``quilt.push``.

    Returns
    -------
    None
    """
    pkg_path = f'{user}/{package}'
    quilt.build(pkg_path, quilt.nodes.GroupNode(dict(author='@hudlrd')))
    quilt.build(f'{pkg_path}/df', quilt.nodes.DataNode(None, None, df, {}))

    # TODO: warn the user if readme is not provided
    if readme is not None:
        with NamedTemporaryFile() as tmp:
            tmp.write(readme.encode('UTF-8'))
            tmp.flush()
            quilt.build(f'{pkg_path}/README', tmp.name)

    quilt.login()
    quilt.push(pkg_path, is_public=True, hash=hash_key)
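# Hypothetical usage sketch for update_pkg above: the user name, package name,
# and dataframe are made up, and quilt.login() will prompt for credentials
# before the push.
import pandas as pd

example_df = pd.DataFrame({"score": [0.1, 0.2, 0.3]})
update_pkg(
    example_df,
    user="someuser",
    package="example_pkg",
    readme="# Example package\nBuilt for illustration only.",
)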
def upload_to_quilt(spark, schemas_dic):
    """
    Upload data to quilt, appending it to the already existing data.

    :param spark: Spark session
    :param schemas_dic: dictionary of Spark schemas, keyed by dataset name
    :return: None
    """
    # remove old data and get new one
    logging.info("Installing quilt gdelt data...")
    quilt.rm("nmduarte/gdelt", force=True)
    quilt.install("nmduarte/gdelt", force=True)

    from quilt.data.nmduarte import gdelt

    # get the old data from quilt
    logging.info("Getting data from quilt...")
    events_from_quilt = gdelt.events()
    mentions_from_quilt = gdelt.mentions()
    news_from_quilt = gdelt.news()

    # transform the data into dataframes so it can be appended
    logging.info("Creating dataframes from quilt data...")
    events_from_quilt_df = spark.createDataFrame(events_from_quilt, schema=schemas_dic['events2'])
    mentions_from_quilt_df = spark.createDataFrame(mentions_from_quilt, schema=schemas_dic['mentions'])
    news_from_quilt_df = spark.createDataFrame(news_from_quilt, schema=schemas_dic['news'])

    # new data from the last 15 minutes
    logging.info("Reading last 15min data from S3...")
    mentions_df = tools.read_from_s3_enriched(spark, "mentions", schemas_dic['mentions'], cmd_opts.date)
    events_df = tools.read_from_s3_enriched(spark, "events", schemas_dic['events2'], cmd_opts.date)
    news_df = tools.read_from_s3_enriched(spark, "news", schemas_dic['news'], cmd_opts.date)

    # concatenate already existing data with new data
    logging.info("Appending data to old quilt data...")
    mentions_concat = mentions_from_quilt_df.union(mentions_df)
    events_concat = events_from_quilt_df.union(events_df)
    news_concat = news_from_quilt_df.union(news_df)

    # build the 3 packages
    logging.info("Building quilt packages...")
    quilt.build("nmduarte/gdelt/mentions", mentions_concat.toPandas())
    quilt.build("nmduarte/gdelt/events", events_concat.toPandas())
    quilt.build("nmduarte/gdelt/news", news_concat.toPandas())

    # push the 3 packages
    logging.info("Pushing quilt info...")
    quilt.push("nmduarte/gdelt/mentions", is_public=True, is_team=False)
    quilt.push("nmduarte/gdelt/events", is_public=True, is_team=False)
    quilt.push("nmduarte/gdelt/news", is_public=True, is_team=False)
def update_node_from_pcaxis(root_node_or_path, sub_path, px_file):
    assert '/' not in sub_path

    if isinstance(root_node_or_path, str):
        root_path = root_node_or_path
        root_mod_path = root_path.replace('/', '.')
        try:
            root_node = importlib.import_module('quilt.data.%s' % root_mod_path)
        except ImportError:
            quilt.build(root_path)
            root_node = importlib.import_module('quilt.data.%s' % root_mod_path)
    else:
        root_node = root_node_or_path

    df = px_file.to_df(melt=True, dropna=True)
    root_node._set([sub_path], df)

    meta = dict(px_file.meta)
    for key, val in meta.items():
        if isinstance(val, collections.OrderedDict):
            meta[key] = dict(val)
        elif isinstance(val, datetime):
            meta[key] = val.isoformat()

    try:
        import json
        json.dumps(meta, sort_keys=True)
    except Exception:
        from pprint import pprint
        pprint(meta)
        raise

    getattr(root_node, sub_path)._meta['pxmeta'] = meta
    return root_node
def uploadToQuilt(spark):
    # Read the enriched data from S3, write each dataset out as CSV, then
    # build and push the quilt packages from those directories.
    print("Getting schemas..")
    events_schema, mentions_schema, news_schema, events_schema2 = set_schemas()

    # mentions data
    print("Getting mention data..")
    mentions_df = read_from_s3_enriched(spark, "mentions", mentions_schema, cmd_opts.date)
    mentions_df.show()
    mentions_df.write.csv("tmp_data/mentions", header="true", mode="overwrite")

    events_df = read_from_s3_enriched(spark, "events", events_schema2, cmd_opts.date)
    events_df.write.csv("tmp_data/events", header="true", mode="overwrite")

    news_df = read_from_s3_enriched(spark, "news", news_schema, cmd_opts.date)
    news_df.write.csv("tmp_data/news", header="true", mode="overwrite")

    # build the 3 packages from the CSV directories
    quilt.build("nmduarte/gdelt_mentions", "tmp_data/mentions")
    quilt.build("nmduarte/gdelt_events", "tmp_data/events")
    quilt.build("nmduarte/gdelt_news", "tmp_data/news")

    # push the 3 packages
    quilt.push("nmduarte/gdelt_mentions", is_public=True, is_team=False)
    quilt.push("nmduarte/gdelt_events", is_public=True, is_team=False)
    quilt.push("nmduarte/gdelt_news", is_public=True, is_team=False)
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from util import adjust_inflation, convert_gdf

try:
    from quilt.data.spatialucr import census
except ImportError:
    warn("Fetching data. This should only happen once")
    quilt.install("spatialucr/census")
    quilt.install("spatialucr/census_cartographic")
    from quilt.data.spatialucr import census

try:
    from quilt.data.geosnap_data import data_store
except ImportError:
    quilt.build("geosnap_data/data_store")
    from quilt.data.geosnap_data import data_store


class Bunch(dict):
    """A dict with attribute-access."""

    def __getattr__(self, key):
        try:
            return self.__getitem__(key)
        except KeyError:
            raise AttributeError(key)

    def __setattr__(self, key, value):
        self.__setitem__(key, value)

    def __dir__(self):
t = time.time()
CopyImages(folds, allImageIDs, dataSource, packagePath, logPath, imagePrefix)
copyTime = time.time() - t
print('######## Copy Completed #########')

FilterAndSavePandasTable(folds, allImageIDs, annList, packagePath, logPath)
print('######## Panda Table Generated #########')

GenerateREADME(packagePath + 'README.md', classDescription)

if os.path.exists(packagePath + 'build.yml'):
    os.remove(packagePath + 'build.yml')

t = time.time()
quilt.generate(packagePath)
quilt.build(quiltUser + '/' + classDescription, packagePath + 'build.yml')

pkgNode = quilt.load(quiltUser + '/' + classDescription)
pkgNode._meta['trainable'] = classID in oi.classes_trainable().values
pkgNode._meta['labelName'] = classDescription
numImages = GetNumImages(folds, allImageIDs)
pkgNode._meta['image_count'] = numImages

GenerateImageMetadata(folds, allImageIDs, pkgNode, annList, logPath, imagePrefix)
print('######## Image Metadata Generated #########')

quilt.build(quiltUser + '/' + classDescription, pkgNode)
print('######## New Package Generated #########')
np.save(file="model/train/W2_data.npy", arr=measuresvelocity) np.save(file="model/train/W3_data.npy", arr=accumulation) np.save(file="model/train/X_data.npy", arr=lores) np.save(file="model/train/Y_data.npy", arr=hires) # %% [markdown] # ### Quilt # # Login -> Build -> Push # %% quilt.login() # %% # Tiled datasets for training neural network quilt.build(package="weiji14/deepbedmap/model/train/W1_data", path=rema) quilt.build(package="weiji14/deepbedmap/model/train/W2_data", path=measuresvelocity) quilt.build(package="weiji14/deepbedmap/model/train/W3_data", path=accumulation) quilt.build(package="weiji14/deepbedmap/model/train/X_data", path=lores) quilt.build(package="weiji14/deepbedmap/model/train/Y_data", path=hires) # %% # Original datasets for neural network predictions on bigger area quilt.build(package="weiji14/deepbedmap/lowres/bedmap2_bed", path="lowres/bedmap2_bed.tif") quilt.build( package="weiji14/deepbedmap/misc/REMA_100m_dem_filled", path="misc/REMA_100m_dem_filled.tif", )
print("Done!") return X_tile, W1_tile, W2_tile, W3_tile # %% X_tile, W1_tile, W2_tile, W3_tile = get_deepbedmap_model_inputs( window_bound=window_bound) print(X_tile.shape, W1_tile.shape, W2_tile.shape, W3_tile.shape) # Build quilt package for datasets covering our test region reupload = False if reupload == True: bounds_str = "_".join(str(int(b)) for b in (window_bound)).replace("-", "m") quilt.build(package=f"weiji14/deepbedmap/model/test/{bounds_str}/W1_tile", path=W1_tile) quilt.build(package=f"weiji14/deepbedmap/model/test/{bounds_str}/W2_tile", path=W2_tile) quilt.build(package=f"weiji14/deepbedmap/model/test/{bounds_str}/W3_tile", path=W3_tile) quilt.build(package=f"weiji14/deepbedmap/model/test/{bounds_str}/X_tile", path=X_tile) quilt.push(package=f"weiji14/deepbedmap/model/test/{bounds_str}", is_public=True) # %% def subplot(directive: str, row: int = None, col: int = None, **kwargs): """Thin wrapper around https://docs.generic-mapping-tools.org/latest/subplot.html""" with gmt.clib.Session() as lib: rowcol = "" # default is blank, e.g. when directive == "end"
        ST_Length(ST_Intersection(way, (SELECT way FROM munigeom))) AS length,
        ST_Transform(ST_SetSRID(way, 3067), 4326) AS way
    FROM planet_osm_line
    WHERE (highway='cycleway' OR tags ? 'cycleway')
        AND ST_Intersects(way, (SELECT way FROM munigeom))""" % (muni_sql, col_sql)

    df = gpd.GeoDataFrame.from_postgis(sql, con, geom_col='way')
    return df


if __name__ == '__main__':
    import sys
    import quilt
    from quilt.data.jyrjola import osm

    data_date = datetime.strptime(sys.argv[1], '%Y-%m-%d').date()

    print("Executing SQL...")
    df = get_bike_lanes('helsinki')
    df['date'] = data_date
    # Quilt is unable to store geometry data, so drop the geometry
    # column for now.
    df.drop('way', inplace=True, axis=1)
    print("%d rows received, total length %d km" % (len(df), df['length'].sum() / 1000))

    old_df = osm.helsinki_bike_lanes()
    quilt.build('jyrjola/osm/helsinki_bike_lanes', old_df.append(df))
    # quilt.push('jyrjola/osm', is_public=True)
def update_quilt(quilt_path):
    import os
    import glob
    import settings

    def upload_px_dataset(root_node, file):
        fname = os.path.splitext(os.path.basename(file))[0]
        if 'hginseutu' not in fname.lower() and 'UM' not in fname and 'hki' not in fname.lower():
            return

        print(fname)

        if re.match('^[0-9]', fname):
            # If the name begins with a number, prefix it with a letter
            # to make it a legal Python identifier.
            fname = 'z' + fname
        fname = fname.replace('-', '_').lower()

        content = open(file, 'r', encoding='windows-1252').read()
        parser = PxParser()
        try:
            file = parser.parse(content)
        except Exception as e:
            print(e)
            return

        now = datetime.now()
        # if 'last_updated' not in file.meta or (now - file.meta['last_updated']) > timedelta(days=2 * 365):
        #     return

        print("\t%s" % file.meta['contents'])

        if root_node:
            quilt_target = root_node
        else:
            quilt_target = quilt_path

        node = update_node_from_pcaxis(quilt_target, fname, file)
        return node

    SKIP_FILES = []
    data_dir = os.path.join(settings.DATA_DIR, 'aluesarjat')
    files = glob.glob('%s/*.px' % data_dir)
    skip_until = None
    root_node = None
    for file in files:
        if 'A01S_HKI_Rak' not in file:
            continue
        if skip_until:
            if skip_until not in file:
                continue
            skip_until = None

        skip = False
        for sf in SKIP_FILES:
            if sf in file:
                skip = True
                break
        if skip:
            continue

        ret = upload_px_dataset(root_node, file)
        if ret:
            root_node = ret

    assert root_node
    quilt.build(quilt_path, root_node)
    quilt.push(quilt_path, is_public=True)
if __name__ == '__main__':
    refresh_pxweb_datasets()
    exit()

    import quilt

    try:
        pass
        # quilt.install('jyrjola/traficom', force=True)
    except Exception:
        pass

    FORCE_QUARTERS = ['2019q3']

    # quilt.push('jyrjola/traficom', is_public=True)
    from quilt.data.jyrjola import traficom  # noqa

    for url, quarter in VEHICLE_URLS:
        print(url)
        dataset_name = 'vehicle_register_%s' % quarter
        if dataset_name in traficom._keys() and quarter not in FORCE_QUARTERS:
            print('skipping')
            continue
        outfn = '/tmp/out-%s.pq' % quarter
        if not os.path.exists(outfn):
            fetch_road_vehicle_register(url.split('/')[-1], quarter, outfn)
        print('build')
        quilt.build('jyrjola/traficom/%s' % dataset_name, path='/tmp/out-%s.pq' % quarter)
        print('push')
        quilt.push('jyrjola/traficom', is_public=True)
def read_ncdb(filepath):
    """
    Read & store data from Geolytics's Neighborhood Change Database.

    Parameters
    ----------
    filepath : str
        location of the input CSV file extracted from your Geolytics DVD

    Returns
    -------
    pandas.DataFrame

    """
    ncdb_vars = dictionary["ncdb"].dropna()[1:].values

    names = []
    for name in ncdb_vars:
        for suffix in ['7', '8', '9', '0', '1', '2']:
            names.append(name + suffix)
    names.append('GEO2010')

    c = pd.read_csv(filepath, nrows=1).columns
    c = pd.Series(c.values)

    keep = []
    for i, col in c.items():
        for name in names:
            if col.startswith(name):
                keep.append(col)

    df = pd.read_csv(
        filepath,
        usecols=keep,
        engine='c',
        na_values=["", " ", 99999, -999],
        converters={
            "GEO2010": str,
            "COUNTY": str,
            "COUSUB": str,
            "DIVISION": str,
            "REGION": str,
            "STATE": str,
        },
    )

    cols = df.columns
    fixed = []
    for col in cols:
        if col.endswith("D"):
            fixed.append("D" + col[:-1])
        elif col.endswith("N"):
            fixed.append("N" + col[:-1])
        elif col.endswith("1A"):
            fixed.append(col[:-2] + "2")

    orig = []
    for col in cols:
        if col.endswith("D"):
            orig.append(col)
        elif col.endswith("N"):
            orig.append(col)
        elif col.endswith("1A"):
            orig.append(col)

    renamer = dict(zip(orig, fixed))
    df.rename(renamer, axis="columns", inplace=True)
    df = df[df.columns[df.columns.isin(names)]]

    df = pd.wide_to_long(
        df, stubnames=ncdb_vars, i="GEO2010", j="year", suffix="(7|8|9|0|1|2)"
    ).reset_index()

    df["year"] = df["year"].replace({
        7: 1970,
        8: 1980,
        9: 1990,
        0: 2000,
        1: 2010,
        2: 2010
    })
    df = df.groupby(["GEO2010", "year"]).first()

    mapper = dict(zip(dictionary.ncdb, dictionary.variable))

    df.reset_index(inplace=True)
    df = df.rename(mapper, axis="columns")
    df = df.set_index("geoid")

    for row in dictionary['formula'].dropna().tolist():
        try:
            df.eval(row, inplace=True)
        except:
            warn('Unable to compute ' + str(row))

    df = df.round(0)

    keeps = df.columns[df.columns.isin(dictionary['variable'].tolist() + ['year'])]
    df = df[keeps]
    df = df.loc[df.n_total_pop != 0]

    data_store._set(['ncdb'], df)
    quilt.build("geosnap_data/data_store", data_store)
) print("Done!") return X_tile, W1_tile, W2_tile, W3_tile # %% X_tile, W1_tile, W2_tile, W3_tile = get_deepbedmap_model_inputs( window_bound=window_bound ) print(X_tile.shape, W1_tile.shape, W2_tile.shape, W3_tile.shape) # Build quilt package for datasets covering our test region reupload = False if reupload == True: quilt.build(package="weiji14/deepbedmap/model/test/W1_tile", path=W1_tile) quilt.build(package="weiji14/deepbedmap/model/test/W2_tile", path=W2_tile) quilt.build(package="weiji14/deepbedmap/model/test/W3_tile", path=W3_tile) quilt.build(package="weiji14/deepbedmap/model/test/X_tile", path=X_tile) quilt.push(package="weiji14/deepbedmap/model/test", is_public=True) # %% def plot_3d_view( img: np.ndarray, ax: matplotlib.axes._subplots.Axes, elev: int = 60, azim: int = 330, z_minmax: tuple = None, title: str = None, zlabel: str = None,
def read_ltdb(sample, fullcount):
    """
    Read & store data from Brown's Longitudinal Tract Database (LTDB).

    Parameters
    ----------
    sample : str
        file path of the zip file containing the standard Sample CSV files
        downloaded from
        https://s4.ad.brown.edu/projects/diversity/Researcher/LTBDDload/Default.aspx
    fullcount: str
        file path of the zip file containing the standard Fullcount CSV files
        downloaded from
        https://s4.ad.brown.edu/projects/diversity/Researcher/LTBDDload/Default.aspx

    Returns
    -------
    pandas.DataFrame

    """
    sample_zip = zipfile.ZipFile(sample)
    fullcount_zip = zipfile.ZipFile(fullcount)

    def _ltdb_reader(path, file, year, dropcols=None):
        df = pd.read_csv(
            path.open(file),
            na_values=["", " ", 99999, -999],
            converters={
                0: str,
                "placefp10": str
            },
            low_memory=False,
            encoding="latin1",
        )

        if dropcols:
            df.drop(dropcols, axis=1, inplace=True)
        df.columns = df.columns.str.lower()
        names = df.columns.values.tolist()
        names[0] = "geoid"
        newlist = []

        # ignoring the first 4 columns, remove year suffix from column names
        for name in names[4:]:
            newlist.append(name[:-2])
        colnames = names[:4] + newlist
        df.columns = colnames

        # prepend a 0 when FIPS is too short
        df["geoid"] = df["geoid"].str.rjust(11, "0")
        df.set_index("geoid", inplace=True)

        df["year"] = year

        inflate_cols = [
            "mhmval", "mrent", "incpc", "hinc", "hincw", "hincb", "hinch", "hinca"
        ]
        inflate_available = list(set(df.columns).intersection(set(inflate_cols)))
        if len(inflate_available):
            # try:
            df = adjust_inflation(df, inflate_available, year)
            # except KeyError:
            #     # half the dfs don't have these variables
            #     pass
        return df

    # read in Brown's LTDB data, both the sample and fullcount files for each
    # year population, housing units & occupied housing units appear in both
    # "sample" and "fullcount" files -- currently drop sample and keep fullcount
    sample70 = _ltdb_reader(
        sample_zip,
        "ltdb_std_all_sample/ltdb_std_1970_sample.csv",
        dropcols=["POP70SP1", "HU70SP", "OHU70SP"],
        year=1970,
    )
    fullcount70 = _ltdb_reader(fullcount_zip, "LTDB_Std_1970_fullcount.csv", year=1970)

    sample80 = _ltdb_reader(
        sample_zip,
        "ltdb_std_all_sample/ltdb_std_1980_sample.csv",
        dropcols=["pop80sf3", "pop80sf4", "hu80sp", "ohu80sp"],
        year=1980,
    )
    fullcount80 = _ltdb_reader(fullcount_zip, "LTDB_Std_1980_fullcount.csv", year=1980)

    sample90 = _ltdb_reader(
        sample_zip,
        "ltdb_std_all_sample/ltdb_std_1990_sample.csv",
        dropcols=["POP90SF3", "POP90SF4", "HU90SP", "OHU90SP"],
        year=1990,
    )
    fullcount90 = _ltdb_reader(fullcount_zip, "LTDB_Std_1990_fullcount.csv", year=1990)

    sample00 = _ltdb_reader(
        sample_zip,
        "ltdb_std_all_sample/ltdb_std_2000_sample.csv",
        dropcols=["POP00SF3", "HU00SP", "OHU00SP"],
        year=2000,
    )
    fullcount00 = _ltdb_reader(fullcount_zip, "LTDB_Std_2000_fullcount.csv", year=2000)

    sample10 = _ltdb_reader(sample_zip, "ltdb_std_all_sample/ltdb_std_2010_sample.csv", year=2010)

    # join the sample and fullcount variables into a single df for the year
    ltdb_1970 = sample70.drop(columns=['year']).join(fullcount70.iloc[:, 7:], how="left")
    ltdb_1980 = sample80.drop(columns=['year']).join(fullcount80.iloc[:, 7:], how="left")
    ltdb_1990 = sample90.drop(columns=['year']).join(fullcount90.iloc[:, 7:], how="left")
    ltdb_2000 = sample00.drop(columns=['year']).join(fullcount00.iloc[:, 7:], how="left")
    ltdb_2010 = sample10

    df = pd.concat([ltdb_1970, ltdb_1980, ltdb_1990, ltdb_2000, ltdb_2010], sort=True)

    renamer = dict(zip(dictionary['ltdb'].tolist(), dictionary['variable'].tolist()))
    df.rename(renamer, axis="columns", inplace=True)

    # compute additional variables from lookup table
    for row in dictionary['formula'].dropna().tolist():
        df.eval(row, inplace=True)

    keeps = df.columns[df.columns.isin(dictionary['variable'].tolist() + ['year'])]
    df = df[keeps]

    data_store._set(['ltdb'], df)
    quilt.build("geosnap_data/data_store", data_store)
def build_and_push(package, df):
    quilt.build('%s/%s/%s' % (USER, PACKAGE_BASE, package), df)
    quilt.push('%s/%s/%s' % (USER, PACKAGE_BASE, package), is_public=True)
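# Hypothetical usage sketch for build_and_push above: USER and PACKAGE_BASE
# are module-level constants in the original source; the values and the
# dataframe here are made up for illustration only.
import pandas as pd

USER = "someuser"          # made-up account name
PACKAGE_BASE = "examples"  # made-up package root

build_and_push("monthly_totals", pd.DataFrame({"total": [10, 20, 30]}))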