Example #1
def get_sidewalk_data(**kwargs):
    """Get sidewalk condition data from DB."""
    sw_query = general.file_to_string('./sql/sidewalk_insp.sql', __file__)
    sw_conn = MsSqlHook(mssql_conn_id='streets_cg_sql')

    df = sw_conn.get_pandas_df(sw_query)

    # Rename columns we're keeping
    df = df.rename(columns={
        'sap_id': 'seg_id',
        'legacy_id': 'geojoin_id',
        'inspectiondate': 'oci_date',
        'rating': 'oci_desc',
        'condition': 'oci'
    })

    df = df.drop(['cgLastModified', 'MaxInspect', 'MaxMod'], axis=1)

    # Write csv
    logging.info('Writing {} rows'.format(df.shape[0]))

    general.pos_write_csv(
        df, cond_file, date_format=conf['date_format_ymd'])

    return "Successfully wrote prod file"
Example #2
def get_sidewalk_data(**kwargs):
    """Get sidewalk condition data from DB."""
    sw_query = general.file_to_string('./sql/sidewalk_insp.sql', __file__)
    sw_conn = MsSqlHook(mssql_conn_id='streets_cg_sql')

    df = sw_conn.get_pandas_df(sw_query)

    # Rename columns we're keeping
    df = df.rename(
        columns={
            'sap_id': 'seg_id',
            'legacy_id': 'geojoin_id',
            'inspectiondate': 'oci_date',
            'rating': 'oci_desc',
            'condition': 'oci'
        })

    df = df.drop(['cgLastModified', 'MaxInspect', 'MaxMod'], axis=1)

    # Write csv
    logging.info('Writing {} rows'.format(df.shape[0]))

    general.pos_write_csv(df, cond_file, date_format=conf['date_format_ymd'])

    return "Successfully wrote prod file"
def get_pts_violations():
    """Get violations from pts, creates temp file."""
    logging.info('Retrieving PTS violations.')
    db = cx_Oracle.connect(credentials)
    sql = general.file_to_string('./sql/pts_sw.sql', __file__)
    df = pd.read_sql_query(sql, db)

    general.pos_write_csv(df, temp_file_pts, date_format='%Y-%m-%dT%H:%M:%S%z')

    return "Successfully wrote {} records for dsd_pts violations file".format(
        df.shape[0])
Example #4
def get_active_businesses():
    """Query DB for 'Active Businesses' and save data to temp."""
    logging.info('Retrieving business tax license data')
    db = cx_Oracle.connect(credentials)
    sql = general.file_to_string('./sql/ttcs_biz.sql', __file__)
    df = pd.read_sql_query(sql, db)
    df_rows = df.shape[0]
    logging.info('Query returned {} results'.format(df_rows))
    general.pos_write_csv(df,
                          temp_all,
                          date_format=conf['date_format_ymd_hms'])

    return 'Successfully retrieved active businesses data.'
Example #5
def get_special_events():
    """Get special events from DB."""
    se_query = general.file_to_string('./sql/special_events.sql', __file__)
    se_conn = MsSqlHook(mssql_conn_id='special_events_sql')
    df = se_conn.get_pandas_df(se_query)
    df['event_id'] = pd.to_numeric(df['event_id'],
                                   errors='coerce',
                                   downcast='integer')

    general.pos_write_csv(df,
                          temp_file,
                          date_format=conf['date_format_ymd_hms'])

    return "Retrieved special events to temp file."
Example #6
def get_active_businesses():
    """Query DB for 'Active Businesses' and save data to temp."""
    logging.info('Retrieving business tax license data')
    db = cx_Oracle.connect(credentials)
    sql = general.file_to_string('./sql/ttcs_biz.sql', __file__)
    df = pd.read_sql_query(sql, db)
    df_rows = df.shape[0]
    logging.info('Query returned {} results'.format(df_rows))
    general.pos_write_csv(
        df,
        temp_all,
        date_format=conf['date_format_ymd_hms'])

    return 'Successfully retrieved active businesses data.'
Example #7
def get_permits_files():
    """Query DB for 'permits' and save data to temp directory."""
    logging.info('Retrieving permits data.')
    db = cx_Oracle.connect(credentials)
    sql = general.file_to_string('./sql/pts.sql', __file__)
    sql += "WHERE a.issue_dt >= TO_DATE('"+str(year)+"-JAN-01', 'YYYY-MON-DD') AND a.issue_dt < TO_DATE('"+str(year+1)+"-JAN-01', 'YYYY-MON-DD')"
    df = pd.read_sql_query(sql, db)
    logging.info('Query returned {} results for {}'.format(df.shape[0],year))
    general.pos_write_csv(
        df,
        temp_permits,
        date_format=conf['date_format_ymd_hms'])

    return 'Successfully retrieved permits data.'
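
Splicing `year` into the SQL string is safe only because the DAG controls the value; cx_Oracle also supports named bind variables, which sidestep quoting entirely. A hedged sketch of the same filter (the bind names are illustrative):

# Sketch: same date-range filter, but with Oracle bind variables.
sql = general.file_to_string('./sql/pts.sql', __file__)
sql += ("WHERE a.issue_dt >= TO_DATE(:date_start, 'YYYY-MON-DD') "
        "AND a.issue_dt < TO_DATE(:date_end, 'YYYY-MON-DD')")
df = pd.read_sql_query(sql, db, params={
    'date_start': '{}-JAN-01'.format(year),
    'date_end': '{}-JAN-01'.format(year + 1),
})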
Example #8
def get_streets_paving_data():
    """Get streets paving data from DB."""
    pv_query = general.file_to_string('./sql/pavement_ex.sql', __file__)
    pv_conn = MsSqlHook(mssql_conn_id='streets_cg_sql')

    df = pv_conn.get_pandas_df(pv_query)

    results = df.shape[0]

    general.pos_write_csv(df, temp_file)

    return f"Successfully wrote temp file with {results} records"
def get_claims_data():
    """Query an oracle database"""
    logging.info('Retrieving data from Oracle database')
    # This requires that otherwise optional credentials variable
    db = cx_Oracle.connect(credentials)
    # Create a sql file containing query for the database
    # Save this file in a sql folder at the same level as the jobs file
    sql = general.file_to_string('./sql/claimstat_tsw.sql', __file__)
    df = pd.read_sql_query(sql, db)
    logging.info(f'Query returned {df.shape[0]} results')

    general.pos_write_csv(df, "{}/claimstat_raw.csv".format(tmp))

    return 'Successfully retrieved Oracle data.'
Example #10
def get_fd_data(**kwargs):
    """Get fire department data from Data Base."""

    logging.info("Get fire department data from CAD archive")

    fd_query = general.file_to_string('./sql/fd.sql', __file__)
    fd_conn = MsSqlHook(mssql_conn_id='fire_department')

    logging.info("Read data to panda DataFrame")

    df = fd_conn.get_pandas_df(fd_query)
    # Write csv
    logging.info('Writing ' + prod_file)
    general.pos_write_csv(df,
                          prod_file,
                          date_format=conf['date_format_ymd_hms'])

    return "Successfully wrote prod file at " + prod_file
Example #11
def get_fd_data(**kwargs):
    """Get fire department data from the database."""
    logging.info("Get fire department data from CAD archive")

    fd_query = general.file_to_string('./sql/fd.sql', __file__)
    fd_conn = MsSqlHook(mssql_conn_id='fire_department')

    logging.info("Reading data into pandas DataFrame")

    df = fd_conn.get_pandas_df(fd_query)
    # Write csv
    logging.info('Writing ' + prod_file)
    general.pos_write_csv(
        df, prod_file, date_format=conf['date_format_ymd_hms'])

    return "Successfully wrote prod file at " + prod_file
Example #12
def get_cip_data(**kwargs):
    """Get CIP data from Data Base."""

    logging.info("Get CIP data from Oracle DataBase")

    cip_query = general.file_to_string('./sql/cip.sql', __file__)
    cip_conn = cx_Oracle.connect(credentials)

    logging.info("Read data to Panda DataFrame")
    df = pd.read_sql_query(cip_query, cip_conn)

    rows = df.shape[0]

    # Write csv
    logging.info('Writing {} rows to prod'.format(rows))
    general.pos_write_csv(df,
                          prod_file,
                          date_format=conf['date_format_ymd_hms'])

    return "Successfully wrote prod file"
Example #13
def get_sidewalk_gis(**kwargs):
    """ Get sidewalk geodatabase from shared drive"""

    sde_query = general.file_to_string('./sql/sidewalk_sde.sql', __file__)
    sde_conn = MsSqlHook(mssql_conn_id='atlas_sde')

    #sde_server = conf['sde_server']
    #sde_user = conf['sde_user']
    #sde_pw = conf['sde_pw']

    #sde_conn = pymssql.connect(sde_server, sde_user, sde_pw, 'sdw')
    #query = "SELECT *, [Shape].STAsText() as geom FROM SDW.IAMSD.SIDEWALK"

    df = sde_conn.get_pandas_df(sde_query)

    df.columns = [x.lower() for x in df.columns]

    df = df.drop('shape', axis=1)

    df = df.rename(
        columns={
            'sapid': 'seg_id',
            'cdcode': 'council',
            'cpcode': 'comm_plan',
            'legacy_id': 'geojoin_id',
            'iamfloc': 'fun_loc_id',
            'loc_descr': 'loc_desc'
        })

    df = df.fillna('')

    rows = df.shape[0]
    logging.info('Processed {} rows'.format(rows))

    geospatial.df2shp(df=df,
                      folder=prod_dir,
                      layername=layername,
                      dtypes=dtypes,
                      gtype=gtype,
                      epsg=2230)

    return "Converted table to shapefile"
Example #14
def get_cip_data(**kwargs):
    """Get CIP data from the database."""
    logging.info("Get CIP data from Oracle database")

    cip_query = general.file_to_string('./sql/cip.sql', __file__)
    cip_conn = cx_Oracle.connect(credentials)

    logging.info("Reading data into pandas DataFrame")
    df = pd.read_sql_query(cip_query, cip_conn)

    rows = df.shape[0]

    # Write csv
    logging.info('Writing {} rows to prod'.format(rows))
    general.pos_write_csv(
        df, prod_file, date_format=conf['date_format_ymd_hms'])

    return "Successfully wrote prod file"
Example #15
def get_fd_data(**kwargs):
    """Get fire department data from Data Base."""

    logging.info("Get fire department data from Data Base")
    fd_query = general.file_to_string('./sql/fd_pn.sql', __file__)
    fd_conn = MsSqlHook(mssql_conn_id='fire_department')

    logging.info("Read data to panda DataFrame")
    df = fd_conn.get_pandas_df(fd_query)

    df = df.rename(
        columns={
            'city': 'address_city',
            'response_month': 'month_response',
            'response_year': 'year_response'
        })

    logging.info("Writing {} rows to prod".format(df.shape[0]))

    # Write csv
    general.pos_write_csv(df, prod_file)
    return "Successfully wrote prod file"
Example #16
def get_onbase_test():
    """Get tables from OnBase."""
    logging.info('Getting files from onbase')
    for root, dirs, files in os.walk('./poseidon/dags/city_docs/sql/onbase'):
        for name in files:
            logging.info('Querying for '+name)
            path = './sql/onbase/{}'.format(name)
            query_string = general.file_to_string(path, __file__)
            logging.info('Connecting to MS Database')
            onbase_conn = MsSqlHook(mssql_conn_id='onbase_test_sql')
            logging.info('Reading data to Pandas DataFrame')
            df = onbase_conn.get_pandas_df(query_string)
            table_type = name[0:-4]

            logging.info('Correcting title column')
            df['TITLE'] = fix_title(df[['TITLE','OBJECT_NAME']])

            save_path = '{0}/onbase_test_{1}.csv'.format(conf['prod_data_dir'], table_type)
            logging.info('Writing production file')
            general.pos_write_csv(df, save_path)

    return "Successfully retrieved OnBase tables"
Example #17
def get_onbase():
    """Get tables from OnBase."""
    logging.info('Getting files from onbase')
    for root, dirs, files in os.walk('./poseidon/dags/city_docs/sql/onbase'):
        for name in files:
            logging.info('Querying for '+name)
            path = './sql/onbase/{}'.format(name)
            query_string = general.file_to_string(path, __file__)
            logging.info('Connecting to MS Database')
            onbase_conn = MsSqlHook(mssql_conn_id='onbase_sql')
            logging.info('Reading data to Pandas DataFrame')
            df = onbase_conn.get_pandas_df(query_string)
            table_type = name[0:-4]

            logging.info('Correcting title column')
            df['TITLE'] = fix_title(df[['TITLE','OBJECT_NAME']])

            save_path = '{0}/onbase_{1}.csv'.format(conf['prod_data_dir'], table_type)
            logging.info('Writing production file')
            general.pos_write_csv(df, save_path)

    return "Successfully retrieved OnBase tables"
def get_indicator_bacteria_tests(date_start='01-JAN-2014', date_end='15-JUN-2017', **kwargs):
    """Get indicator bacteria test results from the database."""
    # In test mode, pull only the last year so the query returns quickly.
    if kwargs.get('test_mode'):
        logging.warning("RUNNING IN TEST MODE, PULLING LAST YEAR ONLY!!!!")
        date_start = (kwargs['execution_date'] - timedelta(days=365)).strftime('%d-%b-%Y')

    db = cx_Oracle.connect(conf['oracle_wpl'])

    logging.info("Starting Indicator Bac Tests: " + date_start + " to " + date_end)


    jzn_1_q = string.Template(general.file_to_string('./sql/jzn1.sql', __file__))\
        .substitute(ds=date_start, de=date_end)
    jzn_2_q = string.Template(general.file_to_string('./sql/jzn2.sql', __file__))\
        .substitute(ds=date_start, de=date_end)
    jzn_3_q = string.Template(general.file_to_string('./sql/jzn3.sql', __file__))\
        .substitute(ds=date_start, de=date_end)
    jzn_4_q = string.Template(general.file_to_string('./sql/jzn4.sql', __file__))\
        .substitute(ds=date_start, de=date_end)

    logging.info("Reading JZN1")
    jzn_1 = pd.read_sql_query(jzn_1_q, db, coerce_float=True, index_col='F_FIELD_RECORD')
    jzn_1.F_VALUE = pd.to_numeric(jzn_1.F_VALUE, errors='coerce')
    jzn_1 = jzn_1[jzn_1.F_VALUE.notnull()]

    logging.info("Reading JZN2")
    jzn_2 = pd.read_sql_query(jzn_2_q, db, coerce_float=True, index_col='F_FIELD_RECORD')
    jzn_2.F_VALUE = pd.to_numeric(jzn_2.F_VALUE, errors='coerce')
    jzn_2 = jzn_2[jzn_2.F_VALUE.notnull()]

    logging.info("Reading JZN3")
    jzn_3 = pd.read_sql_query(jzn_3_q, db, coerce_float=True, index_col='F_FIELD_RECORD')
    jzn_3.F_VALUE = pd.to_numeric(jzn_3.F_VALUE, errors='coerce')
    jzn_3 = jzn_3[jzn_3.F_VALUE.notnull()]

    logging.info("Reading JZN4")
    jzn_4 = pd.read_sql_query(jzn_4_q, db, coerce_float=True, index_col='F_FIELD_RECORD')
    jzn_4.F_VALUE = pd.to_numeric(jzn_4.F_VALUE, errors='coerce')
    jzn_4 = jzn_4[jzn_4.F_VALUE.notnull()]

    jn_1 = jzn_1.rename(columns={
        'SOURCE':'V5_SOURCE',
        'SAMPLE_DATE':'V5_SAMPLE_DATE',
        'SAMPLE_ID':'V5_SAMPLE_ID',
        'F_VALUE':'V5_CL2_TOTAL',
        'L_VALUE':'V5_T_COLIFORM'
    }).filter(like='V5',axis=1)

    jn_2 = jzn_2.rename(columns={
        'L_VALUE':'V5_E_COLI'
    }).filter(like='V5',axis=1)

    jn_3 = jzn_3.rename(columns={
        'F_QUAL':'V5_TEMP_PART1',
        'F_VALUE':'V5_TEMP_PART2'
    }).filter(like='V5',axis=1)

    jn_4 = jzn_4.rename(columns={
        'F_QUAL':'V5_PH_PART1',
        'F_VALUE':'V5_PH_PART2'
    }).filter(like='V5',axis=1)

    df = jn_1.join([jn_2, jn_3, jn_4], how='inner')

    df = df.rename(columns={
        'V5_PH_PART2':'V5_PH',
        'V5_TEMP_PART2':'V5_TEMPERATURE',
    })

    del df['V5_PH_PART1']
    del df['V5_TEMP_PART1']

    df = df.rename(columns=lambda x: re.sub(r'V5_', '', x))
    df.index.rename(name='FR_NUM', inplace=True)

    new_file_path = conf['prod_data_dir'] + '/indicator_bacteria_tests_datasd.csv'
    logging.info("Writing to " + new_file_path)
    df.to_csv(new_file_path,
              index=True,
              encoding='utf-8',
              doublequote=True,
              date_format=conf['date_format_ymd'])

    return "Indicator bacteria tests written to " + new_file_path
Example #19
def get_streets_paving_data(mode='sdif', **kwargs):
    """Get streets paving data from DB."""
    pv_query = general.file_to_string('./sql/pavement_ex.sql', __file__)
    pv_conn = MsSqlHook(mssql_conn_id='streets_cg_sql')

    moratorium_string = "Post Construction"
    phone_UTLY = "858-627-3200"
    phone_OTHER = "619-527-7500"
    TSW_PM = "*****@*****.**"
    UTLY_PM = "*****@*****.**"
    ACT_OVERLAY_CONCRETE_PM = "*****@*****.**"
    ACT_SLURRY_SERIES_PM = "*****@*****.**"

    # Different String for imcat mode.
    if mode == 'imcat':
        moratorium_string = "Post-Construction"

    df = pv_conn.get_pandas_df(pv_query)

    for i in [
            'seg_id', 'rd_seg_id', 'wo_id', 'wo_name', 'wo_status',
            'wo_proj_type', 'job_activity', 'seg_func_class'
    ]:

        df[i] = df[i].astype(str)

    df['job_completed_cbox'] = df['job_completed_cbox'].astype(bool)

    # Backfill - set all fields to mora
    df.loc[df.wo_status.str.contains(
        'post construction|moratorium|post-construction',
        regex=True,
        case=False), "wo_status"] = moratorium_string

    # Remove Records w/o A Completed Date ONLY in the UTLY and TSW work order
    # IMCAT ONLY
    if mode == 'imcat':
        df = df.query('not '\
                    + '((wo_id == "UTLY" & job_end_dt.isnull()) '\
                    + 'or (wo_id == "TSW" & job_end_dt.isnull()))')

        # Remove empty activities (IMCAT ONLY)
        df = df.query('not '\
                    + '(job_activity.isnull() '\
                    + '| job_activity == "" '\
                    + '| job_activity == "None")')

    # Remove Data Entry
    # Remove mill / pave
    # Remove Structure Widening
    # Remove Patching
    if mode == 'imcat':
        remove_search = 'data entry|mill|structure wid|patching'
    else:
        remove_search = 'data entry|structure wid|patching'

    df = df[~(df.job_activity.str.contains(
        remove_search, regex=True, case=False))]

    # Search Strings
    concrete_search = "panel rep|pcc - reconstruc"
    slurry_search = "surface treatment|scrub seal|cape seal|central mix"
    overlay_search = "resurfacing|overlay|ac overlay|mill|ac - overlay|ac - ac overlay|ac - reconstruct|ac inlay"

    # Set Proj Type to NA
    df['wo_proj_type'] = None
    # Concrete
    df.loc[df.job_activity.str.contains(
        concrete_search, regex=True, case=False), 'wo_proj_type'] = 'Concrete'
    # Slurry
    df.loc[df.job_activity.str.contains(
        slurry_search, regex=True, case=False), 'wo_proj_type'] = 'Slurry'
    # Overlay
    df.loc[df.job_activity.str.contains(
        overlay_search, regex=True, case=False), 'wo_proj_type'] = 'Overlay'

    # Remove All Records over 5 Years Old;
    #pv <- pv[(as.Date(pv$job_end_dt) > (today() - years(5))) | is.na(pv$job_end_dt),]

    # Create ref dates
    #today = kwargs['execution_date']
    today = general.today()
    five_yrs_ago = today.replace(year=(today.year - 5))
    three_yrs_ago = today.replace(year=(today.year - 3))

    # Remove records
    df = df[(df.job_end_dt > five_yrs_ago) | (df.job_end_dt.isnull())]

    # Remove Slurry Records > 3 Years Old
    # IMCAT ONLY
    if mode == 'imcat':
        mask = ~((df.wo_proj_type == 'Slurry') &
                 (df.job_end_dt < three_yrs_ago))
        df = df[mask]

    # Create a feature for completed jobs
    df['final_job_completion_state'] = False
    #pv[(!is.na(pv$job_end_dt) & pv$job_completed_cbox == 1), "final_job_completion_state"] <- 1

    df.loc[df.job_end_dt.notnull(), "final_job_completion_state"] = True

    # Set all completed jobs to Moratorium status
    df.loc[df.final_job_completion_state == True,
           "wo_status"] = moratorium_string

    # Set Dates in The future for TSW work orders as Construction.
    mask = (df.wo_id == 'TSW') & \
           (df.job_end_dt.notnull()) & \
           (df.job_end_dt > today)

    df.loc[mask, "wo_status"] = "Construction"

    # Set Phone # For UTLY
    df.loc[df.wo_id == 'UTLY', 'wo_pm_phone'] = phone_UTLY

    # Set Phone # for Everything else
    df.loc[df.wo_id != 'UTLY', 'wo_pm_phone'] = phone_OTHER

    # Set PM for UTLY
    df.loc[df.wo_id == 'UTLY', 'wo_pm'] = UTLY_PM

    # Set PM for TSW
    df.loc[df.wo_id == 'TSW', 'wo_pm'] = TSW_PM

    # Set PM for Overlay / Concrete
    #mask = (df.wo_proj_type == 'Overlay') | (df.wo_proj_type == 'Concrete') & (df.wo_pm.isnull())
    mask = (df.wo_pm.isnull()) & ((df.wo_proj_type == 'Overlay') |
                                  (df.wo_proj_type == 'Concrete'))
    df.loc[mask, 'wo_pm'] = ACT_OVERLAY_CONCRETE_PM

    # Set PM for Slurry / Series
    mask = (df.wo_pm.isnull()) & ((df.wo_proj_type == 'Slurry') |
                                  (df.wo_proj_type == 'Series Circuit'))
    df.loc[mask, 'wo_pm'] = ACT_SLURRY_SERIES_PM

    # Spot Unknown
    mask = (df.job_activity.isnull()) | (df.job_activity == 'None') | (df.job_activity == '') \
        | (df.wo_proj_type.isnull()) | (df.wo_proj_type == 'None') | (df.wo_proj_type == '') \
        | (df.wo_status.isnull()) | (df.wo_status == 'None') | (df.wo_status == '')

    spot_unknown = df[mask]

    logging.info('Found {} records with no activity, type or status'.format(
        spot_unknown.shape[0]))

    # Remove unknown
    df = df[~mask]

    # Sort by job end date time
    df = df.sort_values(by='job_end_dt', na_position='last', ascending=False)

    # Deduplicate by seg_id, keeping the most recent record per segment.
    # Known issue: collapsing to one record per segment can drop valid rows.
    df = df.drop_duplicates('seg_id', keep='first')

    # Rename columns we're keeping
    df = df.rename(columns={
        'pve_id': 'PVE_ID',
        'seg_id': 'SEG_ID',
        'wo_id': 'PROJECTID',
        'wo_name': 'TITLE',
        'wo_pm': 'PM',
        'wo_pm_phone': 'PM_PHONE',
        'wo_design_start_dt': 'START',
        'wo_design_end_dt': 'END',
        'wo_resident_engineer': 'RESIDENT_ENGINEER',
        'job_end_dt': 'MORATORIUM',
        'wo_status': 'STATUS',
        'wo_proj_type': 'TYPE',
        'seg_length_ft': 'LENGTH',
        'seg_width_ft': 'WIDTH',
    })

    # Keep only the renamed (uppercase) columns:
    df = df.filter(regex="^[A-Z0-9]")

    # Remove START and END for projects in moratorium:
    df.loc[df.STATUS == moratorium_string, ['START', 'END']] = None

    # For IMCAT uppercase status
    if mode == 'imcat':
        df['STATUS'] = df['STATUS'].str.upper()

    # Write csv
    logging.info('Writing ' + str(df.shape[0]) + ' rows in mode ' + mode)
    general.pos_write_csv(
        df, prod_file[mode], date_format=conf['date_format_ymd_hms'])
    return "Successfully wrote prod file at " + prod_file[mode]