def run_colmap_new(args):
    """Build a column map for all resources matching the given colmap name.

    Collects the headers of every matching resource, computes the union of
    their normalized (alt) column names, and either prints the resulting
    table or writes it to ``colmap-<name>.csv``.
    """
    m = MetapackCliMemo(args, downloader)

    resources = get_resources(m)

    if not resources:
        err(f"No resources found with colmap name '{m.args.colmap_name}'")

    # Union of alt column names across all resources, in first-seen order,
    # plus the raw header list of each resource.
    col_index = []
    headers = []
    for r in resources:
        h = r.headers
        for c in h:
            alt = alt_col_name(c)
            if alt not in col_index:
                col_index.append(alt)
        headers.append(h)

    # For each resource, a row of source column names positioned where their
    # alt name sits in col_index; None where the resource lacks that column.
    positions = {alt: i for i, alt in enumerate(col_index)}
    data = [col_index]
    for header in headers:
        row = [None] * len(col_index)
        for c in header:
            row[positions[alt_col_name(c)]] = c
        data.append(row)

    # zip transposes rows into columns: one output row per alt column name.
    t = [['index'] + [r.name for r in resources]] + list(zip(*data))

    path = Path(f"colmap-{m.args.colmap_name}.csv")

    if m.args.print:
        from tabulate import tabulate
        prt(tabulate(t[1:], headers=t[0]))
    else:
        if path.exists() and not m.args.force:
            err(f"Col map file '{str(path)}' already exists. Use -f to overwrite")
        else:
            with path.open('w') as f:
                csv.writer(f).writerows(t)
            prt(f"Wrote {str(path)}")
def process_schema(doc, resource, df):
    """Add schema entries to a metatab doc from a dataframe.

    Runs the TypeIntuiter over *df* and creates a ``Table`` term (with one
    ``Column`` child per dataframe column) in the doc's ``Schema`` section.
    Does nothing if a table for the resource's schema name already exists.

    :param doc: metatab document to update
    :param resource: resource term whose schema is being built
    :param df: pandas DataFrame backing the resource
    :return: the new ``Table`` term, or ``None`` if skipped or on source error
    """
    from rowgenerators import SourceError
    from requests.exceptions import ConnectionError
    from metapack.cli.core import extract_path_name, type_map
    from metapack_build.core import alt_col_name
    from tableintuit import TypeIntuiter
    from rowgenerators.generator.python import PandasDataframeSource
    from appurl import parse_app_url

    # Ensure the Schema section exists (EAFP: look it up, create on miss).
    try:
        doc['Schema']
    except KeyError:
        doc.new_section('Schema', ['DataType', 'Altname', 'Description'])

    schema_name = resource.get_value('schema', resource.get_value('name'))

    schema_term = doc.find_first(term='Table', value=schema_name, section='Schema')

    if schema_term:
        logger.info("Found table for '{}'; skipping".format(schema_name))
        return

    path, name = extract_path_name(resource.url)

    logger.info("Processing {}".format(resource.url))

    si = PandasDataframeSource(parse_app_url(resource.url), df, cache=doc._cache, )

    try:
        ti = TypeIntuiter().run(si)
    except SourceError as e:
        # logger.warn() is deprecated; use warning()
        logger.warning("Failed to process '{}'; {}".format(path, e))
        return
    except ConnectionError as e:
        logger.warning("Failed to download '{}'; {}".format(path, e))
        return

    table = doc['Schema'].new_term('Table', schema_name)

    logger.info("Adding table '{}' to metatab schema".format(schema_name))

    for i, c in enumerate(ti.to_rows()):
        raw_alt_name = alt_col_name(c['header'], i)
        # Only record an altname when it actually differs from the header.
        alt_name = raw_alt_name if raw_alt_name != c['header'] else ''

        # new_child attaches the Column to the table; no need to keep the
        # returned term (the original bound it to an unused local).
        table.new_child('Column', c['header'],
                        datatype=type_map.get(c['resolved_type'], c['resolved_type']),
                        altname=alt_name,
                        description=df[c['header']].description
                        if hasattr(df, 'description') and df[c['header']].description else ''
                        )

    return table
def add_dataframe(df, name, pkg=None, description=''):
    """Add a dataframe to a source package.

    Pass in either the name of the dataframe, or the dataframe. If the
    dataframe is passed in, the name will be the dataframe's variable name.
    The function will re-write the source package with the new resource.

    :param df: pandas DataFrame to add as a resource
    :param name: resource name (used in the resource URL fragment)
    :param pkg: target package; defaults to ``open_source_package()``
    :param description: description applied to the Datafile and Table terms
    """
    from warnings import warn
    from metapack.cli.core import type_map
    from metapack_build.core import alt_col_name
    import numpy as np

    if name is None or df is None:
        # BUG FIX: the original referenced an undefined variable 'ref' here,
        # which raised NameError instead of warning.
        warn("Did not find dataframe for reference '{}' ".format(name))
        return

    pkg = pkg or open_source_package()

    resource_ref = 'file:' + get_notebook_rel_path(pkg) + '#' + name

    t = pkg.find_first('Root.Datafile', value=resource_ref)
    col_props = {}

    if t:
        print("Datafile exists for url '{}', deleting".format(resource_ref))

        if t.schema_term:
            # Remember the old column properties so they can be re-applied.
            col_props = {c['name']: c for c in t.columns()}
            pkg.remove_term(t.schema_term)

        pkg.remove_term(t)

    t = pkg['Resources'].new_term('Root.Datafile', resource_ref, name=name,
                                  description=description)

    st = pkg['Schema'].new_term('Table', t.schema_name, description=description)

    # NOTE: loop variable renamed from 'name' (it shadowed the parameter).
    for i, col_name in enumerate(df.columns):

        props = col_props.get(col_name, {})

        try:
            # np.asscalar() was removed in NumPy 1.23; scalar.item() is the
            # documented replacement and yields the same native Python type.
            native_type = type(df[col_name].dtype.type(0).item()).__name__
        except ValueError:
            native_type = df[col_name].dtype.name
        except AttributeError:
            native_type = type(df[col_name][0]).__name__

        # These are regenerated below; don't let stale values leak through.
        for pn in 'datatype name pos header'.split():
            if pn in props:
                del props[pn]

        if 'altname' in props:
            altname = props['altname']
            del props['altname']
        else:
            raw_alt_name = alt_col_name(col_name, i)
            altname = raw_alt_name if raw_alt_name != col_name else ''

        col = df[col_name]

        if hasattr(col, 'description'):  # custom property
            props['description'] = col.description

        t = st.new_child('Column', col_name,
                         datatype=type_map.get(native_type, native_type),
                         altname=altname,
                         **props)

    pkg.write_csv()
def rebuild_schema(doc, r, df):
    """Rebuild the schema for a resource based on a dataframe.

    Removes the existing Column children of the resource's Table term (if
    any) and regenerates them from the dataframe's index and columns,
    carrying over any extra properties from the old columns.

    :param doc: metatab document holding the schema
    :param r: resource term, or a resource name
    :param df: pandas DataFrame to derive the schema from
    """
    import numpy as np

    # Re-get the resource in the doc, since it may be different.
    try:
        r = doc.resource(r.name)
    except AttributeError:
        # Maybe r is actually a resource name
        r = doc.resource(r)

    def alt_col_name(name, i):
        # Normalize a column name to a lowercase identifier-safe form.
        import re

        if not name:
            return 'col{}'.format(i)

        # BUG FIX: regexes are now raw strings; '[^\w_]' in a plain string
        # is an invalid escape sequence on modern Python.
        return re.sub(r'_+', '_', re.sub(r'[^\w_]', '_', str(name)).lower()).rstrip('_')

    df_types = {
        np.dtype('O'): 'text',
        np.dtype('int64'): 'integer',
        np.dtype('float64'): 'number'
    }

    try:
        df_index_frame = df.index.to_frame()
    except AttributeError:
        df_index_frame = None

    def get_col_dtype(c):
        # Map a column (or index level) name to a metatab datatype string.
        c = str(c)

        try:
            return df_types[df[c].dtype]
        except KeyError:  # Maybe it is in the index?
            pass

        try:
            return df_types[df_index_frame[c].dtype]
        except TypeError:  # Maybe not a multi-index
            pass

        if c == 'id' or c == df.index.name:
            return df_types[df.index.dtype]

        return 'unknown'

    columns = []
    schema_term = r.schema_term[0]

    if schema_term:
        # Capture the old columns' properties before clearing the children.
        old_cols = {c['name'].value: c.properties for c in schema_term.children}
        for c in schema_term.children:
            schema_term.remove_child(c)
        schema_term.children = []
    else:
        old_cols = {}
        schema_term = doc['Schema'].new_term('Table', r.schema_name)

    index_names = [n if n else "id" for n in df.index.names]

    for i, col in enumerate(index_names + list(df.columns)):
        alt = alt_col_name(col, i)  # computed once (original called it twice)
        acn = alt if alt != str(col) else ''

        d = {'name': col, 'datatype': get_col_dtype(col), 'altname': acn}

        # Find the old column entry this one corresponds to, if any.
        if col in old_cols:
            lookup_name = col
        elif acn in old_cols:
            lookup_name = acn
        else:
            lookup_name = None

        # BUG FIX: the original wrapped this merge in a redundant loop over
        # schema_term.properties that clobbered its own k, v and repeated
        # identical work; it also used a truthiness test ('if lookup_name')
        # that silently skipped falsy column names such as the integer 0.
        if lookup_name is not None:
            for k, v in old_cols[lookup_name].items():
                if k != 'name' and v:
                    d[k] = v

        columns.append(d)

    for c in columns:
        name = c.pop('name')
        datatype = c.pop('datatype')
        altname = c.pop('altname')

        schema_term.new_child('Column', name, datatype=datatype, altname=altname, **c)
def convert_col(v):
    """Return the normalized (alt) column name for *v*."""
    # Replacing '_' may be necessary for some datasets to ensure that
    # similar columns from different datasets are aligned.
    normalized = alt_col_name(v, 0)  # .replace('_', '')
    return normalized