def startup():
    """Startup fixture: make database connections and define tables to ignore"""
    new_db_map = {
        'ais-api-broad': 'engine_broad',
        'ais-api-market': 'engine_market',
    }
    proc = subprocess.Popen(
        ['bash', '-c', '. ../../../bin/eb_env_utils.sh; get_prod_env'],
        stdout=subprocess.PIPE)
    output = proc.stdout.read()
    old_prod_env = output.rstrip().decode('utf-8')
    old_db = datum.connect(config['DATABASES'][new_db_map[old_prod_env]])
    new_db = datum.connect(config['DATABASES']['engine'])
    unused_tables = ('spatial_ref_sys', 'alembic_version', 'multiple_seg_line',
                     'service_area_diff', 'address_zip', 'zip_range',
                     'dor_parcel_address_analysis')
    changed_tables = ()
    ignore_tables = unused_tables + changed_tables
    return {
        'new_db': new_db,
        'old_db': old_db,
        'unused_tables': unused_tables,
        'changed_tables': changed_tables,
        'ignore_tables': ignore_tables,
    }
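# Hypothetical usage sketch (assumption, not from the source): a test that
# consumes startup() as a pytest fixture and compares per-table row counts
# between the old and new engine databases, skipping ignore_tables. Assumes
# the databases are Postgres (pg_tables) and, mirroring the execute()[0]
# pattern used elsewhere in these scripts, that single-column queries come
# back as plain values.
def test_no_table_lost_rows(startup):
    new_db = startup['new_db']
    old_db = startup['old_db']
    tables_stmt = "select tablename from pg_tables where schemaname = 'public'"
    shared = set(new_db.execute(tables_stmt)) & set(old_db.execute(tables_stmt))
    for table in shared - set(startup['ignore_tables']):
        new_count = new_db.execute('select count(*) from {}'.format(table))[0]
        old_count = old_db.execute('select count(*) from {}'.format(table))[0]
        assert new_count >= old_count, '{} lost rows'.format(table)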
def get_conn(self):
    if self.conn is None:
        params = self.get_connection(self.db_conn_id)
        if params.conn_type not in self.SCHEMAS:
            raise AirflowException(
                'Could not create Datum connection for connection type {}'
                .format(params.conn_type))
        logging.info('Establishing connection to {}'.format(self.db_conn_id))
        conn_string = self.get_conn_str()
        self.conn = datum.connect(conn_string)
    return self.conn
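# Hypothetical companion sketch (assumption, not from the source): one way the
# get_conn_str() called above could build a DSN from the Airflow connection
# record. SCHEMAS is assumed to map conn_type to a URL scheme (e.g.
# {'postgres': 'postgresql'}); the exact DSN format datum expects is also an
# assumption here.
def get_conn_str(self):
    params = self.get_connection(self.db_conn_id)
    return '{}://{}:{}@{}:{}/{}'.format(
        self.SCHEMAS[params.conn_type], params.login, params.password,
        params.host, params.port or 5432, params.schema)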
from datetime import datetime

import datum

from ais import app
from ais.models import Address
from ais.util import parity_for_num, parity_for_range
from passyunk.parser import PassyunkParser

# DEV
# import traceback
# from pprint import pprint

print('Starting...')
start = datetime.now()

config = app.config
Parser = config['PARSER']
parser_tags = config['ADDRESSES']['parser_tags']
sources = config['ADDRESSES']['sources']
db = datum.connect(config['DATABASES']['engine'])
address_table = db['address']
address_tag_table = db['address_tag']
source_address_table = db['source_address']
address_link_table = db['address_link']
street_segment_table = db['street_segment']
address_street_table = db['address_street']
true_range_view_name = 'true_range'

# TODO: something more elegant here.
true_range_select_stmt = '''
    select
        coalesce(r.seg_id, l.seg_id) as seg_id,
        r.low as true_right_from,
        r.high as true_right_to,
        l.low as true_left_from,
from datetime import datetime

from passyunk.data import DIRS_STD, SUFFIXES_STD

import datum

from ais import app
from ais.models import Address
from ais.util import parity_for_num, parity_for_range

# DEV
from pprint import pprint
import traceback

start = datetime.now()
print('Starting...')

"""SET UP"""

config = app.config
db = datum.connect(config['DATABASES']['engine'])

source_def = config['BASE_DATA_SOURCES']['parcels']['dor']
source_db_name = source_def['db']
source_db_url = config['DATABASES'][source_db_name]
source_db = datum.connect(source_db_url)
source_field_map = source_def['field_map']
source_table_name = source_def['table']
source_table = source_db[source_table_name]
source_geom_field = source_table.geom_field
field_map = source_def['field_map']

street_table = db['street_segment']
parcel_table = db['dor_parcel']
parcel_error_table = db['dor_parcel_error']
parcel_error_polygon_table = db['dor_parcel_error_polygon']
import sys

import datum

from ais import app

# DEV
import traceback
from pprint import pprint

"""SET UP"""

config = app.config
source_def = config['BASE_DATA_SOURCES']['curbs']
source_db = datum.connect(config['DATABASES'][source_def['db']])
source_table = source_db[source_def['table']]
field_map = source_def['field_map']
db = datum.connect(config['DATABASES']['engine'])
curb_table = db['curb']
parcel_curb_table = db['parcel_curb']

"""MAIN"""

# print('Dropping parcel-curb view...')
# db.drop_mview('parcel_curb')

print('Dropping indexes...')
curb_table.drop_index('curb_id')
parcel_curb_table.drop_index('curb_id')
parcel_curb_table.drop_index('parcel_source', 'parcel_row_id')

print('Deleting existing curbs...')
curb_table.delete()

print('Reading curbs from source...')
source_rows = source_table.read()
def sync(date, alerts, verbose):
    status = 'ERROR'
    with warnings.catch_warnings(record=True) as w:
        try:
            if verbose:
                console_handler = logging.StreamHandler()
                console_handler.setLevel(logging.DEBUG)
                console_handler.setFormatter(formatter)
                logger.addHandler(console_handler)

            logger.info('Starting...')
            start = arrow.now()

            # Connect to Salesforce
            sf = Salesforce(username=SF_USER,
                            password=SF_PASSWORD,
                            security_token=SF_TOKEN)

            # Connect to database
            dest_db = datum.connect(DEST_DB_DSN)
            dest_tbl = dest_db[DEST_TABLE]
            tmp_tbl = dest_db[DEST_TEMP_TABLE]

            logger.info('Truncating temp table...')
            tmp_tbl.delete()

            sf_query = SF_QUERY

            # If a start date was passed in, handle it.
            if date:
                warnings.warn('Fetched records for {} only'.format(date))
                try:
                    date_comps = [int(x) for x in date.split('-')]
                    start_date = arrow.get(date_obj(*date_comps), 'US/Eastern') \
                        .to('Etc/UTC')
                except ValueError:
                    raise HandledError('Date parameter is invalid')
                end_date = start_date.replace(days=1)
                sf_query += ' AND (LastModifiedDate >= {})'.format(start_date)
                sf_query += ' AND (LastModifiedDate < {})'.format(end_date)
            # Otherwise, grab the last updated date from the DB.
            else:
                logger.info('Getting last updated date...')
                start_date_str = dest_db.execute(
                    'select max({}) from {}'
                    .format(DEST_UPDATED_FIELD, DEST_TABLE))[0]
                start_date = arrow.get(start_date_str, 'US/Eastern').to('Etc/UTC')
                sf_query += ' AND (LastModifiedDate > {})'.format(
                    start_date.isoformat())

            logger.info('Fetching new records from Salesforce...')
            try:
                sf_rows = sf.query_all(sf_query)['records']
            except SalesforceMalformedRequest:
                raise HandledError('Could not query Salesforce')

            logger.info('Processing rows...')
            rows = [process_row(sf_row, FIELD_MAP) for sf_row in sf_rows]

            logger.info('Writing to temp table...')
            tmp_tbl.write(rows)

            logger.info('Deleting updated records...')
            update_count = dest_db.execute(DEL_STMT)
            add_count = len(rows) - update_count

            logger.info('Appending new records...')
            dest_tbl.write(rows)

            # We should have added and updated at least 1 record
            if add_count == 0:
                warnings.warn('No records added')
            if update_count == 0:
                warnings.warn('No records updated')

            # TODO this check was causing an obscure httplib error
            # (essentially, timing out) so disabling it for now
            # Check count against Salesforce
            # sf_count = sf.query_all(SF_COUNT_QUERY)['totalSize']
            # db_count = dest_tbl.count()
            # if sf_count != db_count:
            #     warnings.warn('Salesforce has {} rows, database has {}'
            #                   .format(sf_count, db_count))

            # If we got here, it was successful.
            status = 'SUCCESS'
            logger.info('Ran successfully. Added {}, updated {}.'
                        .format(add_count, update_count))
        except Exception:
            logger.exception('Unhandled error')
        finally:
            if alerts:
                msg = '[311] {} - {}'.format(__file__, status)
                if status == 'SUCCESS':
                    msg += ' - {} added, {} updated'.format(add_count, update_count)
                if len(w) > 0:
                    msg += ' - {}.'.format('; '.join(str(x.message) for x in w))

                # Try to post to Slack
                try:
                    slack = Slacker(SLACK_TOKEN)
                    slack.chat.post_message(SLACK_CHANNEL, msg)
                except Exception as e:
                    logger.error('Could not post to Slack. '
                                 'The message was:\n\n{}\n\n'
                                 'The error was:\n\n{}'.format(msg, e))
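# Hypothetical sketch of the process_row() helper used above (assumption, not
# from the source): project a Salesforce record onto destination columns via
# a {dest_field: sf_field} FIELD_MAP; the map's direction is an assumption.
# simple_salesforce records carry an 'attributes' metadata key, which this
# projection drops implicitly.
def process_row(sf_row, field_map):
    return {dest: sf_row.get(src) for dest, src in field_map.items()}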
from datetime import datetime

import datum

from ais import app

start = datetime.now()
print('Starting...')

''' SET UP '''

config = app.config
Parser = config['PARSER']
parser = Parser()
db = datum.connect(config['DATABASES']['engine'])
WRITE_OUT = True

geocode_table = db['geocode']
address_tag_table = db['address_tag']
geocode_tag_map = {
    'pwd_parcel_id': (1, 3, 7),
    'dor_parcel_id': (2, 4, 8),
}
new_geocode_rows = []

print('Reading geocode rows...')
geocode_map = {}
geocode_rows = geocode_table.read()

print('Mapping geocode rows...')
for geocode_row in geocode_rows:
    street_address = geocode_row['street_address']
    if street_address not in geocode_map:
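        # Hypothetical continuation (assumption — the source snippet cuts off
        # mid-loop): group geocode rows into a list per street address.
        geocode_map[street_address] = []
    geocode_map[street_address].append(geocode_row)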
import sys
import csv
from datetime import datetime
# import logging

import datum

from common import process_row
from config import *
from pprint import pprint

start = datetime.now()
print('Starting...')

dest_db = datum.connect(DEST_DB_DSN)
dest_table = dest_db[DEST_TABLE]

print('Dropping existing rows...')
dest_table.delete()

file_path = sys.argv[1]
with open(file_path) as f:
    reader = csv.DictReader(f)
    reader_rows = list(reader)

print('Reading...')
# dest_rows = [process_row(row, FIELD_MAP) for row in reader_rows[5:7]]
dest_rows = [process_row(row, FIELD_MAP) for row in reader_rows]
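# Likely next step, shown as a sketch (assumption — the source snippet cuts
# off here): bulk-write the processed rows with datum's table.write(), the
# same call the Salesforce sync above uses, and report elapsed time.
dest_table.write(dest_rows)
print('Wrote {} rows in {}'.format(len(dest_rows), datetime.now() - start))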
def db_conn(db_conn_string):
    db = datum.connect(db_conn_string)
    yield db
    db.save()
    db.close()
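# Hypothetical usage sketch (assumption, not from the source): consuming the
# db_conn generator with pytest. pytest resumes the generator after the test,
# so save()/close() serve as teardown. The DSN and table name are placeholders.
import pytest

@pytest.fixture
def engine_db():
    yield from db_conn('postgresql://user:password@localhost/ais_engine')

def test_curb_table_not_empty(engine_db):
    assert len(engine_db['curb'].read()) > 0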
from datetime import datetime

import datum

from ais import app
from ais.models import Address

# DEV
import traceback
from pprint import pprint

print('Starting...')
start = datetime.now()

# TODO: This should probably make a DB query for each address, rather than
# chunking into street names. Getting hard to manage.

"""SET UP"""

config = app.config
db = datum.connect(config['DATABASES']['engine'])
tag_fields = config['ADDRESS_SUMMARY']['tag_fields']
geocode_table = db['geocode']
address_table = db['address']
max_values = config['ADDRESS_SUMMARY']['max_values']
geocode_types = config['ADDRESS_SUMMARY']['geocode_types']
geocode_priority_map = config['ADDRESS_SUMMARY']['geocode_priority']
# geocode_types_on_curb = config['ADDRESS_SUMMARY']['geocode_types_on_curb']
geocode_types_in_street = config['ADDRESS_SUMMARY']['geocode_types_in_street']
tag_table = db['address_tag']
link_table = db['address_link']
address_summary_table = db['address_summary']

# DEV
WRITE_OUT = True
from datetime import datetime

import datum

from ais import app
from ais.models import Address

# DEV
from pprint import pprint
import traceback

start = datetime.now()
print('Starting...')

"""SET UP"""

config = app.config
db = datum.connect(config['DATABASES']['engine'])
parcel_table = db['pwd_parcel']
parcel_geom_field = parcel_table.geom_field

source_def = config['BASE_DATA_SOURCES']['parcels']['pwd']
source_db_name = source_def['db']
source_db_url = config['DATABASES'][source_db_name]
source_db = datum.connect(source_db_url)
source_field_map = source_def['field_map']
source_table_name = source_def['table']
source_table = source_db[source_table_name]
source_geom_field = source_table.geom_field

# Read in OPA account nums and addresses
opa_source_def = config['BASE_DATA_SOURCES']['opa_owners']
import csv
from copy import deepcopy
from datetime import datetime

import datum

from ais import app
from ais.models import Address

# DEV
import traceback
from pprint import pprint

start = datetime.now()

"""SET UP"""

config = app.config
db = datum.connect(config['DATABASES']['engine'])
source_db = datum.connect(config['DATABASES']['gis'])
# source_table = source_db['usps_zip4s']
source_table = source_db['vw_usps_zip4s_ais']

field_map = {
    'usps_id': 'updatekey',
    'address_low': 'addrlow',
    'address_high': 'addrhigh',
    'address_oeb': 'addroeb',
    'street_predir': 'streetpre',
    'street_name': 'streetname',
    'street_suffix': 'streetsuff',
    'street_postdir': 'streetpost',
    'unit_type': 'addrsecondaryabbr',
    'unit_low': 'addrsecondarylow',
    'unit_high': 'addrsecondaryhigh',