Пример #1
0
def dat_crime(fpath=None):
    # Step Zero: Create dat_crime table
    raw_crime(fpath=fpath)
    dedupe_crime()
    src_crime()
    src_crime_table = Table('src_chicago_crimes_all', Base.metadata, 
        autoload=True, autoload_with=engine, extend_existing=True)
    dat_crime_table = crime_table('dat_chicago_crimes_all', Base.metadata)
    dat_crime_table.append_column(Column('chicago_crimes_all_row_id', Integer, primary_key=True))
    dat_crime_table.append_column(Column('start_date', TIMESTAMP, server_default=text('CURRENT_TIMESTAMP')))
    dat_crime_table.append_column(Column('end_date', TIMESTAMP, server_default=text('NULL')))
    dat_crime_table.append_column(Column('current_flag', Boolean, server_default=text('TRUE')))
    dat_crime_table.append_constraint(UniqueConstraint('id', 'start_date'))
    dat_crime_table.create(bind=engine, checkfirst=True)
    new_cols = ['start_date', 'end_date', 'current_flag', 'chicago_crimes_all_row_id']
    dat_ins = dat_crime_table.insert()\
        .from_select(
            [c for c in dat_crime_table.columns.keys() if c not in new_cols],
            select([c for c in src_crime_table.columns])
        )
    conn = engine.contextual_connect()
    res = conn.execute(dat_ins)
    cols = crime_master_cols(dat_crime_table)
    master_ins = MasterTable.insert()\
        .from_select(
            [c for c in MasterTable.columns.keys() if c != 'master_row_id'],
            select(cols)\
                .select_from(dat_crime_table)
        )
    conn = engine.contextual_connect()
    res = conn.execute(master_ins)
    return 'DAT crime created'
Пример #2
0
def chg_crime():
    # Step Seven: Find updates
    dat_crime_table = Table('dat_chicago_crimes_all', Base.metadata, 
        autoload=True, autoload_with=engine, extend_existing=True)
    src_crime_table = Table('src_chicago_crimes_all', Base.metadata, 
        autoload=True, autoload_with=engine, extend_existing=True)
    chg_crime_table = Table('chg_chicago_crimes_all', Base.metadata, 
        Column('id', Integer, primary_key=True),
        extend_existing=True)
    chg_crime_table.drop(bind=engine, checkfirst=True)
    chg_crime_table.create(bind=engine)
    src_cols = [c for c in src_crime_table.columns if c.name not in ['id', 'start_date', 'end_date']]
    dat_cols = [c for c in dat_crime_table.columns if c.name not in ['id', 'start_date', 'end_date']]
    and_args = []
    for s, d in zip(src_cols, dat_cols):
        ors = or_(s != None, d != None)
        ands = and_(ors, s != d)
        and_args.append(ands)
    ins = chg_crime_table.insert()\
          .from_select(
              ['id'],
              select([src_crime_table.c.id])\
                  .select_from(src_crime_table.join(dat_crime_table,
                      src_crime_table.c.id == dat_crime_table.c.id))\
                  .where(or_(
                          and_(dat_crime_table.c.current_flag == True, 
                                and_(or_(src_crime_table.c.id != None, dat_crime_table.c.id != None), 
                                src_crime_table.c.id != dat_crime_table.c.id)),
                          *and_args))
          )
    conn = engine.contextual_connect()
    conn.execute(ins)
    return 'Changes found'
Пример #3
0
def update_dat_crimes():
    # Step Five: Update Main Crime table
    dat_crime_table = Table('dat_chicago_crimes_all', Base.metadata, 
        autoload=True, autoload_with=engine, extend_existing=True)
    src_crime_table = Table('src_chicago_crimes_all', Base.metadata, 
        autoload=True, autoload_with=engine, extend_existing=True)
    try:
        new_crime_table = Table('new_chicago_crimes_all', Base.metadata, 
            autoload=True, autoload_with=engine, extend_existing=True)
    except NoSuchTableError:
        return None
    excluded_cols = ['end_date', 'current_flag', 'chicago_crimes_all_row_id']
    dat_cols = [c for c in dat_crime_table.columns.keys() if c not in excluded_cols]
    excluded_cols.append('start_date')
    src_cols = [c for c in src_crime_table.columns if c.name not in excluded_cols]
    src_cols.append(text("'%s' AS start_date" % datetime.now().strftime('%Y-%m-%d')))
    ins = dat_crime_table.insert()\
        .from_select(
            dat_cols,
            select(src_cols)\
                .select_from(src_crime_table.join(new_crime_table,
                    src_crime_table.c.id == new_crime_table.c.id))
        )
    conn = engine.contextual_connect()
    conn.execute(ins)
    return 'Crime Table updated'
Пример #4
0
 def _add_location(self, span=None):
     """ 
     Add latitude and longitude from weather station into observations table
     """
     start_day, end_day = calendar.monthrange(self.current_year, self.current_month)
     range_start = '%s-%s-%s' % (self.current_year, self.current_month, 1)
     range_end = '%s-%s-%s' % (self.current_year, self.current_month, end_day)
     date_col = 'date'
     table_name = 'dat_weather_observations_%s' % span
     if span == 'hourly':
         date_col = 'datetime'
     upd = text("""
         UPDATE %s SET 
             longitude = subq.longitude,
             latitude = subq.latitude
         FROM (
             SELECT 
                 wban_code,
                 st_x(location) as longitude,
                 st_y(location) as latitude
             FROM weather_stations
         ) as subq
         WHERE %s.wban_code = subq.wban_code
            AND %s.%s <= :range_end
            AND %s.%s >= :range_start
            AND %s.longitude IS NULL
            AND %s.latitude IS NULL
     """ % (table_name, table_name, 
            table_name, date_col,
            table_name, date_col,
            table_name, table_name)
     )
     conn = engine.contextual_connect()
     conn.execute(upd, range_start=range_start, range_end=range_end)
Пример #5
0
def new_crime():
    # Step Four: Find New Crimes
    dat_crime_table = Table('dat_chicago_crimes_all', Base.metadata, 
        autoload=True, autoload_with=engine, extend_existing=True)
    src_crime_table = Table('src_chicago_crimes_all', Base.metadata, 
        autoload=True, autoload_with=engine, extend_existing=True)
    new_crime_table = Table('new_chicago_crimes_all', Base.metadata, 
        Column('id', Integer, primary_key=True),
        extend_existing=True)
    new_crime_table.drop(bind=engine, checkfirst=True)
    new_crime_table.create(bind=engine)
    ins = new_crime_table.insert()\
        .from_select(
            ['id'],
            select([src_crime_table.c.id])\
                .select_from(src_crime_table.join(dat_crime_table, 
                    src_crime_table.c.id == dat_crime_table.c.id, isouter=True))\
                .where(dat_crime_table.c.chicago_crimes_all_row_id == None)
        )
    conn = engine.contextual_connect()
    try:
        conn.execute(ins)
        return 'New records found'
    except TypeError:
        # No new records
        return None
Пример #6
0
 def _add_location(self, span=None):
     """ 
     Add latitude and longitude from weather station into observations table
     """
     start_day, end_day = calendar.monthrange(self.current_year,
                                              self.current_month)
     range_start = '%s-%s-%s' % (self.current_year, self.current_month, 1)
     range_end = '%s-%s-%s' % (self.current_year, self.current_month,
                               end_day)
     date_col = 'date'
     table_name = 'dat_weather_observations_%s' % span
     if span == 'hourly':
         date_col = 'datetime'
     upd = text("""
         UPDATE %s SET 
             longitude = subq.longitude,
             latitude = subq.latitude
         FROM (
             SELECT 
                 wban_code,
                 st_x(location) as longitude,
                 st_y(location) as latitude
             FROM weather_stations
         ) as subq
         WHERE %s.wban_code = subq.wban_code
            AND %s.%s <= :range_end
            AND %s.%s >= :range_start
            AND %s.longitude IS NULL
            AND %s.latitude IS NULL
     """ % (table_name, table_name, table_name, date_col, table_name,
            date_col, table_name, table_name))
     conn = engine.contextual_connect()
     conn.execute(upd, range_start=range_start, range_end=range_end)
Пример #7
0
def update_master_current_flag():
    # Step Eight: Update end_date and current_flag in master table
    dat_crime_table = Table('dat_chicago_crimes_all', Base.metadata, 
        autoload=True, autoload_with=engine, extend_existing=True)
    update = MasterTable.update()\
        .values(current_flag=False, end_date=datetime.now().strftime('%Y-%m-%d'))\
        .where(MasterTable.c.dataset_row_id == dat_crime_table.c.chicago_crimes_all_row_id)\
        .where(dat_crime_table.c.current_flag==False)\
        .where(dat_crime_table.c.end_date==date.today())
    conn = engine.contextual_connect()
    conn.execute(update)
    return 'Master table current flag updated'
Пример #8
0
def update_crime_current_flag():
    # Step Seven: Update end_date and current_flag in crime table
    dat_crime_table = Table('dat_chicago_crimes_all', Base.metadata, 
        autoload=True, autoload_with=engine, extend_existing=True)
    chg_crime_table = Table('chg_chicago_crimes_all', Base.metadata, 
        autoload=True, autoload_with=engine, extend_existing=True)
    update = dat_crime_table.update()\
        .values(current_flag=False, end_date=datetime.now().strftime('%Y-%m-%d'))\
        .where(dat_crime_table.c.id==chg_crime_table.c.id)\
        .where(dat_crime_table.c.current_flag == True)
    conn = engine.contextual_connect()
    conn.execute(update)
    return 'Crime table current flag updated'
Пример #9
0
 def _update(self, span=None):
     new_table = Table('new_weather_observations_%s' % span,
                       Base.metadata,
                       Column('wban_code', String(5)),
                       keep_existing=True)
     dat_table = getattr(self, '%s_table' % span)
     src_table = getattr(self, 'src_%s_table' % span)
     from_sel_cols = ['wban_code']
     if span == 'daily':
         from_sel_cols.append('date')
         src_date_col = src_table.c.date
         dat_date_col = dat_table.c.date
         new_table.append_column(Column('date', Date))
         new_date_col = new_table.c.date
     elif span == 'hourly':
         from_sel_cols.append('datetime')
         src_date_col = src_table.c.datetime
         dat_date_col = dat_table.c.datetime
         new_table.append_column(Column('datetime', DateTime))
         new_date_col = new_table.c.datetime
     new_table.drop(engine, checkfirst=True)
     new_table.create(engine)
     ins = new_table.insert()\
             .from_select(from_sel_cols,
                 select([src_table.c.wban_code, src_date_col])\
                     .select_from(src_table.join(dat_table,
                         and_(src_table.c.wban_code == dat_table.c.wban_code,
                              src_date_col == dat_date_col),
                         isouter=True)
                 ).where(dat_table.c.id == None)
             )
     conn = engine.contextual_connect()
     try:
         conn.execute(ins)
         new = True
     except TypeError:
         new = False
     if new:
         ins = dat_table.insert()\
                 .from_select([c for c in dat_table.columns if c.name != 'id'],
                     select([c for c in src_table.columns])\
                         .select_from(src_table.join(new_table,
                             and_(src_table.c.wban_code == new_table.c.wban_code,
                                  src_date_col == new_date_col)
                         ))
                 )
         conn.execute(ins)
Пример #10
0
 def _update(self, span=None):
     new_table = Table('new_weather_observations_%s' % span, Base.metadata,
                       Column('wban_code', String(5)), keep_existing=True)
     dat_table = getattr(self, '%s_table' % span)
     src_table = getattr(self, 'src_%s_table' % span)
     from_sel_cols = ['wban_code']
     if span == 'daily':
         from_sel_cols.append('date')
         src_date_col = src_table.c.date
         dat_date_col = dat_table.c.date
         new_table.append_column(Column('date', Date))
         new_date_col = new_table.c.date
     elif span == 'hourly':
         from_sel_cols.append('datetime')
         src_date_col = src_table.c.datetime
         dat_date_col = dat_table.c.datetime
         new_table.append_column(Column('datetime', DateTime))
         new_date_col = new_table.c.datetime
     new_table.drop(engine, checkfirst=True)
     new_table.create(engine)
     ins = new_table.insert()\
             .from_select(from_sel_cols, 
                 select([src_table.c.wban_code, src_date_col])\
                     .select_from(src_table.join(dat_table,
                         and_(src_table.c.wban_code == dat_table.c.wban_code,
                              src_date_col == dat_date_col),
                         isouter=True)
                 ).where(dat_table.c.id == None)
             )
     conn = engine.contextual_connect()
     try:
         conn.execute(ins)
         new = True
     except TypeError:
         new = False
     if new:
         ins = dat_table.insert()\
                 .from_select([c for c in dat_table.columns if c.name != 'id'], 
                     select([c for c in src_table.columns])\
                         .select_from(src_table.join(new_table,
                             and_(src_table.c.wban_code == new_table.c.wban_code,
                                  src_date_col == new_date_col)
                         ))
                 )
         conn.execute(ins)
Пример #11
0
def src_crime():
    # Step Three: Create New table with unique ids
    raw_crime_table = Table('raw_chicago_crimes_all', Base.metadata, 
        autoload=True, autoload_with=engine, extend_existing=True)
    dedupe_crime_table = Table('dedup_chicago_crimes_all', Base.metadata, 
        autoload=True, autoload_with=engine, extend_existing=True)
    src_crime_table = crime_table('src_chicago_crimes_all', Base.metadata)
    src_crime_table.drop(bind=engine, checkfirst=True)
    src_crime_table.create(bind=engine)
    ins = src_crime_table.insert()\
        .from_select(
            src_crime_table.columns.keys(),
            select([c for c in raw_crime_table.columns if c.name != 'dup_row_id'])\
                .where(raw_crime_table.c.dup_row_id == dedupe_crime_table.c.dup_row_id)
        )
    conn = engine.contextual_connect()
    conn.execute(ins)
    return 'Source table created'
Пример #12
0
def dedupe_crime():
    # Step Two: Find duplicate records by ID
    raw_crime_table = Table('raw_chicago_crimes_all', Base.metadata, 
        autoload=True, autoload_with=engine, extend_existing=True)
    dedupe_crime_table = Table('dedup_chicago_crimes_all', Base.metadata,
        Column('dup_row_id', Integer, primary_key=True),
        extend_existing=True)
    dedupe_crime_table.drop(bind=engine, checkfirst=True)
    dedupe_crime_table.create(bind=engine)
    ins = dedupe_crime_table.insert()\
        .from_select(
            ['dup_row_id'], 
            select([func.max(raw_crime_table.c.dup_row_id)])\
            .group_by(raw_crime_table.c.id)
        )
    conn = engine.contextual_connect()
    res = conn.execute(ins)
    return 'Raw crime deduplicated'
Пример #13
0
def delete_dataset(self, source_url_hash):
    md = session.query(MetaTable).get(source_url_hash)
    try:
        dat_table = Table(
            "dat_%s" % md.dataset_name, Base.metadata, autoload=True, autoload_with=engine, keep_existing=True
        )
        dat_table.drop(engine, checkfirst=True)
    except NoSuchTableError:
        pass
    master_table = MasterTable.__table__
    delete = master_table.delete().where(master_table.c.dataset_name == md.dataset_name)
    conn = engine.contextual_connect()
    try:
        conn.execute(delete)
        session.delete(md)
        session.commit()
    except InternalError, e:
        raise delete_dataset.retry(exc=e)
Пример #14
0
def delete_dataset(self, source_url_hash):
    md = session.query(MetaTable).get(source_url_hash)
    try:
        dat_table = Table('dat_%s' % md.dataset_name, Base.metadata, 
            autoload=True, autoload_with=engine, keep_existing=True)
        dat_table.drop(engine, checkfirst=True)
    except NoSuchTableError:
        pass
    master_table = MasterTable.__table__
    delete = master_table.delete()\
        .where(master_table.c.dataset_name == md.dataset_name)
    conn = engine.contextual_connect()
    try:
        conn.execute(delete)
        session.delete(md)
        session.commit()
    except InternalError, e:
        raise delete_dataset.retry(exc=e)
Пример #15
0
def update_master():
    # Step Six: Update Master table
    dat_crime_table = Table('dat_chicago_crimes_all', Base.metadata, 
        autoload=True, autoload_with=engine, extend_existing=True)
    try:
        new_crime_table = Table('new_chicago_crimes_all', Base.metadata, 
            autoload=True, autoload_with=engine, extend_existing=True)
    except NoSuchTableError:
        return None
    cols = crime_master_cols(dat_crime_table)
    ins = MasterTable.insert()\
        .from_select(
            [c for c in MasterTable.columns.keys() if c != 'master_row_id'],
            select(cols)\
                .select_from(dat_crime_table.join(new_crime_table, 
                    dat_crime_table.c.id == new_crime_table.c.id))
        )
    conn = engine.contextual_connect()
    conn.execute(ins)
    return 'Master updated'