def __init__(self, meta, source_path=None):
    """Prepare a staging-table description and locate the source data.

    :param meta: record from MetaTable
    :param source_path: path of source file on local filesystem;
                        if None, look for data at a remote URL instead
    :raises PlenarioETLError: if the source file cannot be retrieved
    """
    # Just the info about column names we usually need
    self.dataset = meta.meta_tuple()
    self.name = 's_' + self.dataset.name

    # Get the Columns to construct our table
    try:
        # Can we just grab columns from an existing table?
        self.cols = self._from_ingested(meta.column_info())
        # If we can, update the corresponding meta table with these
        # column names and types. We expect column names to be filled
        # in by users, but this is helpful in test cases / backend
        # submission.
        if meta.column_names is None:
            cols = {col.name: str(col.type) for col in self.cols
                    if col.name not in {u'geom', u'point_date', u'hash'}}
            meta.column_names = json.dumps(cols)
    except NoSuchTableError:
        # No ingested table yet; columns will be inferred from the file later.
        self.cols = None

    # Retrieve the source file
    try:
        if source_path:
            # Local ingest
            self.file_helper = ETLFile(source_path=source_path)
        else:
            # Remote ingest
            self.file_helper = ETLFile(source_url=meta.source_url)
    except Exception as e:
        # Chain the original exception so its traceback is preserved.
        raise PlenarioETLError(e) from e
def __init__(self, meta, source_path=None):
    """Prepare a staging-table description and locate the source data.

    :param meta: record from MetaTable
    :param source_path: path of source file on local filesystem;
                        if None, look for data at a remote URL instead
    :raises PlenarioETLError: if the source file cannot be retrieved
    """
    logger.info('Begin.')
    # Lazy %-style arguments: the values are only formatted if the
    # message is actually emitted at this log level.
    logger.info('meta: %s', meta)
    logger.info('source_path: %s', source_path)

    # Just the info about column names we usually need
    self.dataset = meta.meta_tuple()
    self.name = 's_' + self.dataset.name

    # Get the Columns to construct our table
    try:
        # Can we just grab columns from an existing table?
        self.cols = self._from_ingested(meta.column_info())
    except NoSuchTableError:
        # No ingested table yet; columns will be inferred from the file later.
        self.cols = None

    # Retrieve the source file
    try:
        if source_path:
            # Local ingest
            self.file_helper = ETLFile(source_path=source_path)
        else:
            # Remote ingest
            self.file_helper = ETLFile(source_url=meta.source_url)
    except Exception as e:
        # Chain the original exception so its traceback is preserved.
        raise PlenarioETLError(e) from e
def _make_table(self, f): """ Create a table and fill it with CSV data. :param f: Open file handle pointing to start of CSV :return: populated table """ # Persist an empty table eagerly # so that we can access it when we drop down to a raw connection. # Be paranoid and remove the table if one by this name already exists. table = Table(self.name, MetaData(), *self.cols, extend_existing=True) self._drop() table.create(bind=postgres_engine) # Fill in the columns we expect from the CSV. names = ['"' + c.name + '"' for c in self.cols] copy_st = "COPY {t_name} ({cols}) FROM STDIN " \ "WITH (FORMAT CSV, HEADER TRUE, DELIMITER ',')".\ format(t_name=self.name, cols=', '.join(names)) # In order to issue a COPY, we need to drop down to the psycopg2 DBAPI. conn = postgres_engine.raw_connection() try: with conn.cursor() as cursor: f.seek(0) cursor.copy_expert(copy_st, f) conn.commit() return table except Exception as e: # When the bulk copy fails on _any_ row, # roll back the entire operation. raise PlenarioETLError(e) finally: conn.close()
def __enter__(self):
    """
    Add a table (prefixed with n_) to the database with one record for
    each record found in the staging table with a hash not present in
    the existing table. If there are no such records, do nothing.

    :return: self, so the populated new-records table is available
             inside the ``with`` block
    :raises PlenarioETLError: if the table cannot be created or filled
    """
    # create n_table with point_date, geom, and id columns
    s = self.staging
    e = self.existing
    d = self.dataset

    derived_dates = func.cast(s.c[d.date], TIMESTAMP).label('point_date')
    derived_geoms = self._geom_col()
    cols_to_insert = [s.c['hash'], derived_dates, derived_geoms]

    # Select the hash and the columns we're deriving from the staging table.
    sel = select(cols_to_insert)

    # And limit our results to records
    # whose hashes aren't already present in the existing table.
    # (LEFT OUTER JOIN + IS NULL keeps only unmatched staging rows.)
    sel = sel.select_from(s.outerjoin(e, s.c['hash'] == e.c['hash'])).\
        where(e.c['hash'].is_(None))

    # Drop the table first out of healthy paranoia
    self._drop()
    try:
        self.table.create(bind=postgres_engine)
    except Exception as exc:
        # Don't reuse `e` here: it already names the existing table.
        raise PlenarioETLError(
            repr(exc) + '\nCould not create table n_' + d.name) from exc

    ins = self.table.insert().from_select(cols_to_insert, sel)
    # Populate it with records from our select statement.
    try:
        postgres_engine.execute(ins)
    except Exception as exc:
        raise PlenarioETLError(repr(exc) + '\n' + str(sel)) from exc
    else:
        # Would be nice to check if we have new records or not right here.
        return self
def ingest(self):
    """Build the shapefile's table from its zipped source.

    The caller is responsible for cleaning up the created table.

    :return: SQLAlchemy Table for the new table
    :raises PlenarioETLError: if the source is not a valid zip or the
        shapefile import fails
    """
    with ETLFile(source_url=self.url, source_path=self.path) as helper:
        # Unpack the archive and hand it to the shapefile importer.
        try:
            with zipfile.ZipFile(helper.handle) as shapefile_zip:
                import_shapefile(shapefile_zip=shapefile_zip,
                                 table_name=self.name)
        except zipfile.BadZipfile:
            raise PlenarioETLError("Source file was not a valid .zip")
        except ShapefileError as e:
            msg = "Failed to import shapefile.\n{}".format(repr(e))
            raise PlenarioETLError(msg)

    # Deduplicate + key the rows, then return a reflected view of the table.
    add_unique_hash(self.name)
    return reflect(self.name)
def insert(self):
    """
    Join with the staging table to insert complete records into
    the existing table.

    :raises PlenarioETLError: if the insert fails, or if malformed
        geometries cannot be nulled out afterwards
    """
    # Columns derived during staging that must be carried over.
    derived_cols = [c for c in self.table.c
                    if c.name in {'geom', 'point_date'}]
    # Plain copy of the staging columns (list(), not a comprehension).
    staging_cols = list(self.staging.c)
    sel_cols = staging_cols + derived_cols

    sel = select(sel_cols).where(self.staging.c.hash == self.table.c.hash)
    ins = self.existing.insert().from_select(sel_cols, sel)
    try:
        postgres_engine.execute(ins)
    except Exception as e:
        raise PlenarioETLError(
            repr(e) + '\n Failed on statement: ' + str(ins)) from e

    try:
        _null_malformed_geoms(self.existing)
    except Exception as e:
        raise PlenarioETLError(
            repr(e) +
            '\n Failed to null out geoms with (0,0) geocoding') from e
def add(self):
    """Ingest the shapefile and mark its metadata record as ingested.

    :raises PlenarioETLError: if the dataset was already ingested
    """
    if self.meta.is_ingested:
        raise PlenarioETLError(
            "Table {} has already been ingested.".format(self.table_name))

    new = HashedShape(self.table_name, self.source_url, self.source_path)
    try:
        new.ingest()
        self.meta.update_after_ingest()
        session.commit()
    except BaseException:
        # In case ingestion failed partway through,
        # be sure to leave no trace.
        # BaseException (instead of a bare `except:`) keeps the identical
        # catch-everything-then-reraise behavior while staying explicit.
        new.drop()
        raise
def _hash_update(staging, existing):
    """Sync ``existing`` with ``staging`` by hash.

    Deletes rows from ``existing`` whose hashes no longer appear in
    ``staging``, then inserts staging rows whose hashes are new.

    :param staging: freshly loaded staging Table
    :param existing: Table currently serving the dataset
    :raises PlenarioETLError: if the insert fails
    """
    delete_absent_hashes(staging.name, existing.name)

    # Select all records from staging
    # whose hashes are not present in existing.
    # (LEFT OUTER JOIN + IS NULL keeps only unmatched staging rows.)
    join_condition = staging.c['hash'] == existing.c['hash']
    sel = staging.select()\
        .select_from(staging.outerjoin(existing, join_condition)).\
        where(existing.c['hash'].is_(None))

    # Insert those into existing
    col_names = [col.name for col in existing.columns]
    ins = existing.insert().from_select(col_names, sel)
    try:
        engine.execute(ins)
    except Exception as e:
        raise PlenarioETLError(repr(e) + '\n' + str(sel)) from e
def __enter__(self):
    """
    Create the staging table. Will be named s_[dataset_name]
    """
    with self.file_helper as helper:
        # Without column metadata from an already-ingested table,
        # fall back to inferring column types from the file itself.
        if not self.cols:
            self.cols = self._from_inference(helper.handle)

        # Build the table from the CSV, add the hash key, then
        # re-reflect so the new hash column shows up on self.table.
        try:
            self.table = self._make_table(helper.handle)
            add_unique_hash(self.table.name)
            self.table = Table(self.name, MetaData(),
                               autoload_with=engine,
                               extend_existing=True)
        except Exception as e:
            raise PlenarioETLError(e)
        return self
def __enter__(self):
    """Create the staging table. Will be named s_[dataset_name]"""
    logger.info('Begin.')
    with self.file_helper as helper:
        # Re-open the downloaded file in text mode so downstream code
        # reads decoded unicode rather than raw bytes.
        # NOTE(review): this handle is never explicitly closed —
        # consider a `with` block; left open here because _make_table
        # still needs it. Confirm before tightening.
        text_handle = open(helper.handle.name, "rt", encoding='utf-8')
        self.cols = self._from_inference(text_handle)

        # Grab the handle to build a table from the CSV
        try:
            self.table = self._make_table(text_handle)
            add_unique_hash(self.table.name)
            # Re-reflect so the freshly added hash column is visible.
            self.table = Table(
                self.name,
                postgres_base.metadata,
                autoload_with=postgres_engine,
                extend_existing=True
            )
            # Log completion *before* returning; in the original this
            # statement sat after the return/raise and was unreachable.
            logger.info('End.')
            return self
        except Exception as e:
            raise PlenarioETLError(e) from e
def _geom_col(self):
    """
    Derive selectable with a PostGIS point in 4326 projection
    derived from either the latitude and longitude columns
    or single location column
    """
    staging, dataset = self.staging, self.dataset

    # Prefer separate lat/lon columns; fall back to a combined
    # location column; otherwise geometry cannot be derived.
    if dataset.lat and dataset.lon:
        # Assume latitude and longitude columns are both numeric types.
        point = func.ST_Point(staging.c[dataset.lon],
                              staging.c[dataset.lat])
        return func.ST_SetSRID(point, 4326).label('geom')

    if dataset.loc:
        return func.point_from_loc(staging.c[dataset.loc]).label('geom')

    raise PlenarioETLError(
        'Staging table does not have geometry information.')
def _add_unique_hash(table_name):
    """
    Adds an md5 hash column of the preexisting columns
    and removes duplicate rows from a table.

    :param table_name: Name of table to add hash to.
    :raises PlenarioETLError: if the table rewrite fails.
    """
    # Rebuild the table via a temporary copy: SELECT DISTINCT collapses
    # duplicate rows, and md5 over the full row tuple gives each
    # surviving row a deterministic key usable as the primary key.
    # NOTE(review): the scratch table name is fixed ('temp'), so two
    # concurrent ingests would collide — confirm this only runs serially.
    add_hash = '''
    DROP TABLE IF EXISTS temp;
    CREATE TABLE temp AS
      SELECT DISTINCT *,
             md5(CAST(("{table_name}".*)AS text)) AS hash
        FROM "{table_name}";
    DROP TABLE "{table_name}";
    ALTER TABLE temp RENAME TO "{table_name}";
    ALTER TABLE "{table_name}" ADD PRIMARY KEY (hash);
    '''.format(table_name=table_name)

    try:
        engine.execute(add_hash)
    except Exception as e:
        raise PlenarioETLError(
            repr(e) + '\n Failed to deduplicate with ' + add_hash)