def dat_crime(fpath=None): # Step Zero: Create dat_crime table raw_crime(fpath=fpath) dedupe_crime() src_crime() src_crime_table = Table('src_chicago_crimes_all', Base.metadata, autoload=True, autoload_with=engine, extend_existing=True) dat_crime_table = crime_table('dat_chicago_crimes_all', Base.metadata) dat_crime_table.append_column(Column('chicago_crimes_all_row_id', Integer, primary_key=True)) dat_crime_table.append_column(Column('start_date', TIMESTAMP, server_default=text('CURRENT_TIMESTAMP'))) dat_crime_table.append_column(Column('end_date', TIMESTAMP, server_default=text('NULL'))) dat_crime_table.append_column(Column('current_flag', Boolean, server_default=text('TRUE'))) dat_crime_table.append_constraint(UniqueConstraint('id', 'start_date')) dat_crime_table.create(bind=engine, checkfirst=True) new_cols = ['start_date', 'end_date', 'current_flag', 'chicago_crimes_all_row_id'] dat_ins = dat_crime_table.insert()\ .from_select( [c for c in dat_crime_table.columns.keys() if c not in new_cols], select([c for c in src_crime_table.columns]) ) conn = engine.contextual_connect() res = conn.execute(dat_ins) cols = crime_master_cols(dat_crime_table) master_ins = MasterTable.insert()\ .from_select( [c for c in MasterTable.columns.keys() if c != 'master_row_id'], select(cols)\ .select_from(dat_crime_table) ) conn = engine.contextual_connect() res = conn.execute(master_ins) return 'DAT crime created'
def chg_crime(): # Step Seven: Find updates dat_crime_table = Table('dat_chicago_crimes_all', Base.metadata, autoload=True, autoload_with=engine, extend_existing=True) src_crime_table = Table('src_chicago_crimes_all', Base.metadata, autoload=True, autoload_with=engine, extend_existing=True) chg_crime_table = Table('chg_chicago_crimes_all', Base.metadata, Column('id', Integer, primary_key=True), extend_existing=True) chg_crime_table.drop(bind=engine, checkfirst=True) chg_crime_table.create(bind=engine) src_cols = [c for c in src_crime_table.columns if c.name not in ['id', 'start_date', 'end_date']] dat_cols = [c for c in dat_crime_table.columns if c.name not in ['id', 'start_date', 'end_date']] and_args = [] for s, d in zip(src_cols, dat_cols): ors = or_(s != None, d != None) ands = and_(ors, s != d) and_args.append(ands) ins = chg_crime_table.insert()\ .from_select( ['id'], select([src_crime_table.c.id])\ .select_from(src_crime_table.join(dat_crime_table, src_crime_table.c.id == dat_crime_table.c.id))\ .where(or_( and_(dat_crime_table.c.current_flag == True, and_(or_(src_crime_table.c.id != None, dat_crime_table.c.id != None), src_crime_table.c.id != dat_crime_table.c.id)), *and_args)) ) conn = engine.contextual_connect() conn.execute(ins) return 'Changes found'
def update_dat_crimes(): # Step Five: Update Main Crime table dat_crime_table = Table('dat_chicago_crimes_all', Base.metadata, autoload=True, autoload_with=engine, extend_existing=True) src_crime_table = Table('src_chicago_crimes_all', Base.metadata, autoload=True, autoload_with=engine, extend_existing=True) try: new_crime_table = Table('new_chicago_crimes_all', Base.metadata, autoload=True, autoload_with=engine, extend_existing=True) except NoSuchTableError: return None excluded_cols = ['end_date', 'current_flag', 'chicago_crimes_all_row_id'] dat_cols = [c for c in dat_crime_table.columns.keys() if c not in excluded_cols] excluded_cols.append('start_date') src_cols = [c for c in src_crime_table.columns if c.name not in excluded_cols] src_cols.append(text("'%s' AS start_date" % datetime.now().strftime('%Y-%m-%d'))) ins = dat_crime_table.insert()\ .from_select( dat_cols, select(src_cols)\ .select_from(src_crime_table.join(new_crime_table, src_crime_table.c.id == new_crime_table.c.id)) ) conn = engine.contextual_connect() conn.execute(ins) return 'Crime Table updated'
def _add_location(self, span=None): """ Add latitude and longitude from weather station into observations table """ start_day, end_day = calendar.monthrange(self.current_year, self.current_month) range_start = '%s-%s-%s' % (self.current_year, self.current_month, 1) range_end = '%s-%s-%s' % (self.current_year, self.current_month, end_day) date_col = 'date' table_name = 'dat_weather_observations_%s' % span if span == 'hourly': date_col = 'datetime' upd = text(""" UPDATE %s SET longitude = subq.longitude, latitude = subq.latitude FROM ( SELECT wban_code, st_x(location) as longitude, st_y(location) as latitude FROM weather_stations ) as subq WHERE %s.wban_code = subq.wban_code AND %s.%s <= :range_end AND %s.%s >= :range_start AND %s.longitude IS NULL AND %s.latitude IS NULL """ % (table_name, table_name, table_name, date_col, table_name, date_col, table_name, table_name) ) conn = engine.contextual_connect() conn.execute(upd, range_start=range_start, range_end=range_end)
def new_crime(): # Step Four: Find New Crimes dat_crime_table = Table('dat_chicago_crimes_all', Base.metadata, autoload=True, autoload_with=engine, extend_existing=True) src_crime_table = Table('src_chicago_crimes_all', Base.metadata, autoload=True, autoload_with=engine, extend_existing=True) new_crime_table = Table('new_chicago_crimes_all', Base.metadata, Column('id', Integer, primary_key=True), extend_existing=True) new_crime_table.drop(bind=engine, checkfirst=True) new_crime_table.create(bind=engine) ins = new_crime_table.insert()\ .from_select( ['id'], select([src_crime_table.c.id])\ .select_from(src_crime_table.join(dat_crime_table, src_crime_table.c.id == dat_crime_table.c.id, isouter=True))\ .where(dat_crime_table.c.chicago_crimes_all_row_id == None) ) conn = engine.contextual_connect() try: conn.execute(ins) return 'New records found' except TypeError: # No new records return None
def _add_location(self, span=None): """ Add latitude and longitude from weather station into observations table """ start_day, end_day = calendar.monthrange(self.current_year, self.current_month) range_start = '%s-%s-%s' % (self.current_year, self.current_month, 1) range_end = '%s-%s-%s' % (self.current_year, self.current_month, end_day) date_col = 'date' table_name = 'dat_weather_observations_%s' % span if span == 'hourly': date_col = 'datetime' upd = text(""" UPDATE %s SET longitude = subq.longitude, latitude = subq.latitude FROM ( SELECT wban_code, st_x(location) as longitude, st_y(location) as latitude FROM weather_stations ) as subq WHERE %s.wban_code = subq.wban_code AND %s.%s <= :range_end AND %s.%s >= :range_start AND %s.longitude IS NULL AND %s.latitude IS NULL """ % (table_name, table_name, table_name, date_col, table_name, date_col, table_name, table_name)) conn = engine.contextual_connect() conn.execute(upd, range_start=range_start, range_end=range_end)
def update_master_current_flag(): # Step Eight: Update end_date and current_flag in master table dat_crime_table = Table('dat_chicago_crimes_all', Base.metadata, autoload=True, autoload_with=engine, extend_existing=True) update = MasterTable.update()\ .values(current_flag=False, end_date=datetime.now().strftime('%Y-%m-%d'))\ .where(MasterTable.c.dataset_row_id == dat_crime_table.c.chicago_crimes_all_row_id)\ .where(dat_crime_table.c.current_flag==False)\ .where(dat_crime_table.c.end_date==date.today()) conn = engine.contextual_connect() conn.execute(update) return 'Master table current flag updated'
def update_crime_current_flag(): # Step Seven: Update end_date and current_flag in crime table dat_crime_table = Table('dat_chicago_crimes_all', Base.metadata, autoload=True, autoload_with=engine, extend_existing=True) chg_crime_table = Table('chg_chicago_crimes_all', Base.metadata, autoload=True, autoload_with=engine, extend_existing=True) update = dat_crime_table.update()\ .values(current_flag=False, end_date=datetime.now().strftime('%Y-%m-%d'))\ .where(dat_crime_table.c.id==chg_crime_table.c.id)\ .where(dat_crime_table.c.current_flag == True) conn = engine.contextual_connect() conn.execute(update) return 'Crime table current flag updated'
def _update(self, span=None): new_table = Table('new_weather_observations_%s' % span, Base.metadata, Column('wban_code', String(5)), keep_existing=True) dat_table = getattr(self, '%s_table' % span) src_table = getattr(self, 'src_%s_table' % span) from_sel_cols = ['wban_code'] if span == 'daily': from_sel_cols.append('date') src_date_col = src_table.c.date dat_date_col = dat_table.c.date new_table.append_column(Column('date', Date)) new_date_col = new_table.c.date elif span == 'hourly': from_sel_cols.append('datetime') src_date_col = src_table.c.datetime dat_date_col = dat_table.c.datetime new_table.append_column(Column('datetime', DateTime)) new_date_col = new_table.c.datetime new_table.drop(engine, checkfirst=True) new_table.create(engine) ins = new_table.insert()\ .from_select(from_sel_cols, select([src_table.c.wban_code, src_date_col])\ .select_from(src_table.join(dat_table, and_(src_table.c.wban_code == dat_table.c.wban_code, src_date_col == dat_date_col), isouter=True) ).where(dat_table.c.id == None) ) conn = engine.contextual_connect() try: conn.execute(ins) new = True except TypeError: new = False if new: ins = dat_table.insert()\ .from_select([c for c in dat_table.columns if c.name != 'id'], select([c for c in src_table.columns])\ .select_from(src_table.join(new_table, and_(src_table.c.wban_code == new_table.c.wban_code, src_date_col == new_date_col) )) ) conn.execute(ins)
def src_crime(): # Step Three: Create New table with unique ids raw_crime_table = Table('raw_chicago_crimes_all', Base.metadata, autoload=True, autoload_with=engine, extend_existing=True) dedupe_crime_table = Table('dedup_chicago_crimes_all', Base.metadata, autoload=True, autoload_with=engine, extend_existing=True) src_crime_table = crime_table('src_chicago_crimes_all', Base.metadata) src_crime_table.drop(bind=engine, checkfirst=True) src_crime_table.create(bind=engine) ins = src_crime_table.insert()\ .from_select( src_crime_table.columns.keys(), select([c for c in raw_crime_table.columns if c.name != 'dup_row_id'])\ .where(raw_crime_table.c.dup_row_id == dedupe_crime_table.c.dup_row_id) ) conn = engine.contextual_connect() conn.execute(ins) return 'Source table created'
def dedupe_crime(): # Step Two: Find duplicate records by ID raw_crime_table = Table('raw_chicago_crimes_all', Base.metadata, autoload=True, autoload_with=engine, extend_existing=True) dedupe_crime_table = Table('dedup_chicago_crimes_all', Base.metadata, Column('dup_row_id', Integer, primary_key=True), extend_existing=True) dedupe_crime_table.drop(bind=engine, checkfirst=True) dedupe_crime_table.create(bind=engine) ins = dedupe_crime_table.insert()\ .from_select( ['dup_row_id'], select([func.max(raw_crime_table.c.dup_row_id)])\ .group_by(raw_crime_table.c.id) ) conn = engine.contextual_connect() res = conn.execute(ins) return 'Raw crime deduplicated'
def delete_dataset(self, source_url_hash): md = session.query(MetaTable).get(source_url_hash) try: dat_table = Table( "dat_%s" % md.dataset_name, Base.metadata, autoload=True, autoload_with=engine, keep_existing=True ) dat_table.drop(engine, checkfirst=True) except NoSuchTableError: pass master_table = MasterTable.__table__ delete = master_table.delete().where(master_table.c.dataset_name == md.dataset_name) conn = engine.contextual_connect() try: conn.execute(delete) session.delete(md) session.commit() except InternalError, e: raise delete_dataset.retry(exc=e)
def delete_dataset(self, source_url_hash): md = session.query(MetaTable).get(source_url_hash) try: dat_table = Table('dat_%s' % md.dataset_name, Base.metadata, autoload=True, autoload_with=engine, keep_existing=True) dat_table.drop(engine, checkfirst=True) except NoSuchTableError: pass master_table = MasterTable.__table__ delete = master_table.delete()\ .where(master_table.c.dataset_name == md.dataset_name) conn = engine.contextual_connect() try: conn.execute(delete) session.delete(md) session.commit() except InternalError, e: raise delete_dataset.retry(exc=e)
def update_master(): # Step Six: Update Master table dat_crime_table = Table('dat_chicago_crimes_all', Base.metadata, autoload=True, autoload_with=engine, extend_existing=True) try: new_crime_table = Table('new_chicago_crimes_all', Base.metadata, autoload=True, autoload_with=engine, extend_existing=True) except NoSuchTableError: return None cols = crime_master_cols(dat_crime_table) ins = MasterTable.insert()\ .from_select( [c for c in MasterTable.columns.keys() if c != 'master_row_id'], select(cols)\ .select_from(dat_crime_table.join(new_crime_table, dat_crime_table.c.id == new_crime_table.c.id)) ) conn = engine.contextual_connect() conn.execute(ins) return 'Master updated'