def make_permits_features(con, n_months, max_dist):
    """
    Make permits features.

    NOTE(review): a later function in this file re-defines
    ``make_permits_features``; at import time that later definition
    shadows this one, so this wrapper is effectively dead code.
    Confirm which implementation is intended to survive.

    Input:
        con: connection to postgres database. "set schema ..." must have
            been called on this connection to select the correct schema
            from which to load inspections.
        n_months: time-window parameter forwarded to the helper queries.
        max_dist: distance parameter forwarded to the helper queries.

    Output:
        A pandas dataframe, with one row per inspection and one column
        per feature.
    """
    dataset = 'permits'
    date_column = 'issueddate'

    # Get the time window for which you can generate features
    min_insp, max_insp = check_date_boundaries(con, n_months, dataset,
                                               date_column)

    # Build (or re-use) the inspection-to-nearby-permits linkage table
    make_inspections_address_nmonths_table(con, dataset, date_column,
                                           min_insp, max_insp,
                                           n_months=n_months,
                                           max_dist=max_dist, load=False)

    logger.info('Computing distance features for {}'.format(dataset))

    freq = group_and_count_from_db(con, dataset, n_months, max_dist)

    # Rename columns to avoid spaces and capital letters
    freq.columns = format_column_names(freq.columns)

    return freq
def make_crime_features(con, n_months, max_dist):
    """
    Make crime features.

    Builds, entirely in SQL: (1) a pruning table of the 15 most frequent
    "combined" offense codes (extracted from the `orc` column with a
    regex), then (2) per-(parcel_id, inspection_date) counts of nearby
    crime events, both total and per offense level, pivoted into one
    column per level via colpivot().

    Input:
        con: connection to postgres database. "set schema ..." must have
            been called on this connection to select the correct schema
            from which to load inspections.
        n_months: time-window size; interpolated into intermediate table
            names and used to select the insp2crime linkage table.
        max_dist: distance parameter; interpolated into intermediate
            table names and used to select the insp2crime linkage table.

    Output:
        A pandas dataframe, with one row per inspection and one column
        per feature.

    NOTE(review): this function uses the colpivot() SQL function but,
    unlike the sibling feature builders (fire, permits, sales), does not
    call load_colpivot(con) itself -- confirm colpivot is guaranteed to
    be loaded before this runs.
    """
    dataset = 'crime'
    date_column = 'occurred_on'

    # Get the time window for which you can generate features
    min_insp, max_insp = check_date_boundaries(con, n_months, dataset,
                                               date_column)

    make_inspections_address_nmonths_table(con, dataset, date_column,
                                           min_insp, max_insp,
                                           n_months=n_months,
                                           max_dist=max_dist, load=False)

    logger.info('Computing distance features for {}'.format(dataset))

    # keep only the max_rnum most frequent offense levels; everything
    # else gets lumped into 'other'
    max_rnum = 15
    coalescemissing = "'missing'"  # pre-quoted: spliced verbatim into SQL

    # make a table of the more general offense frequencies so we can prune them
    # also include a column with an array of corresponding detailed levels
    # NOTE(review): the regex ' \((\w*)\) ' lives in a non-raw Python
    # string; it works because Python passes unknown escapes through
    # unchanged, but a raw string would be safer.
    query = """
        DROP TABLE IF EXISTS public.frequentcrimes_orc;
        CREATE TABLE public.frequentcrimes_orc AS (
            WITH t as (
                SELECT coalesce(substring(orc from ' \((\w*)\) '),
                                {coalescemissing}) as orc_combined,
                       array_agg(distinct orc) as all_orcs,
                       count(*) as count
                FROM public.crime
                GROUP BY orc_combined
                ORDER BY count desc
            )
            SELECT row_number() OVER () as rnum,
                   t.orc_combined,
                   t.all_orcs,
                   CASE WHEN row_number() OVER () <= {rnum}
                        THEN t.orc_combined
                        ELSE 'other'
                   END AS level
            FROM t
        );""".format(rnum=max_rnum, coalescemissing=coalescemissing)

    cur = con.cursor()
    cur.execute(query)
    con.commit()

    query = """
        DROP TABLE IF EXISTS crimefeatures1_{n_months}months_{max_dist}m;
        DROP TABLE IF EXISTS joinedcrime_{n_months}months_{max_dist}m;

        -- join the inspections and crime
        CREATE TEMP TABLE joinedcrime_{n_months}months_{max_dist}m
        ON COMMIT DROP AS
            SELECT parcel_id, inspection_date,
                   coalesce(substring(event.orc from ' \((\w*)\) '),
                            {coalescemissing}) as orc_combined
            FROM insp2crime_{n_months}months_{max_dist}m i2e
            LEFT JOIN LATERAL (
                SELECT * FROM public.crime s where s.id=i2e.id
            ) event
            ON true
        ;
        CREATE INDEX ON joinedcrime_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- make the simple features
        CREATE TEMP TABLE crimefeatures1_{n_months}months_{max_dist}m
        ON COMMIT DROP AS
            SELECT parcel_id, inspection_date, count(*) as total
            FROM joinedcrime_{n_months}months_{max_dist}m event
            GROUP BY parcel_id, inspection_date;
        CREATE INDEX ON crimefeatures1_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- make the categorical (dummified) features
        CREATE TEMP TABLE crimefeatures2_{n_months}months_{max_dist}m
        ON COMMIT DROP AS
            -- restrict crime levels to the 15 most common ones,
            -- using the tables of frequency counts for these levels
            -- that we created earlier
            -- also make sure all 15 levels appear
            SELECT t2.parcel_id, t2.inspection_date,
                   'orc_combined_'||t2.level AS categ,
                   coalesce(t1.count,0) as count
            FROM (SELECT parcel_id, inspection_date, ft.level,
                         count(*) as count
                  FROM joinedcrime_{n_months}months_{max_dist}m event
                  LEFT JOIN public.frequentcrimes_orc ft
                  ON ft.orc_combined = event.orc_combined
                  GROUP BY parcel_id, inspection_date, ft.level
                 ) t1
            RIGHT JOIN (SELECT parcel_id, inspection_date, ft.level
                        FROM parcels_inspections
                        JOIN (select distinct level
                              from public.frequentcrimes_orc) ft
                        ON true
                       ) t2
            USING (parcel_id, inspection_date,level)
        ;
        CREATE INDEX ON crimefeatures2_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- Now call the pivot function to create columns with the
        -- different fire types
        SELECT colpivot('crimepivot_{n_months}months_{max_dist}m',
                        'select * from crimefeatures2_{n_months}months_{max_dist}m',
                        array['parcel_id','inspection_date'],
                        array['categ'],
                        'coalesce(#.count,0)',
                        null
        );
        CREATE INDEX ON crimepivot_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- still need to 'save' the tables into a permanent table
        DROP TABLE IF EXISTS crimefeatures_{n_months}months_{max_dist}m;
        CREATE TABLE crimefeatures_{n_months}months_{max_dist}m AS
            SELECT * FROM crimefeatures1_{n_months}months_{max_dist}m
            JOIN crimepivot_{n_months}months_{max_dist}m
            USING (parcel_id, inspection_date)
        ;
    """.format(n_months=str(n_months),
               max_dist=str(max_dist),
               coalescemissing=coalescemissing)

    cur.execute(query)
    con.commit()

    # fetch the data
    query = """
        SELECT * FROM crimefeatures_{n_months}months_{max_dist}m;
    """.format(n_months=str(n_months), max_dist=str(max_dist))
    df = pd.read_sql(query, con, index_col=['parcel_id', 'inspection_date'])

    # clean up the column names
    df.columns = map(lambda x: x.replace(' ', '_').lower(), df.columns)
    df.columns = map(
        lambda x: ''.join(c for c in x if c.isalnum() or c == '_'),
        df.columns)

    # drop the last interim table
    query = 'drop table crimefeatures_{n_months}months_{max_dist}m'.format(
        n_months=str(n_months), max_dist=str(max_dist))
    cur.execute(query)
    con.commit()

    return df
def make_three11_features(con, n_months, max_dist):
    """
    Make three11 (311 service request) features.

    Builds, in SQL: per-(parcel_id, inspection_date) counts of nearby
    311 requests (total, web-request share) and per-service_code counts
    restricted to the 15 most frequent codes, pivoted into one column
    per code via colpivot().

    Input:
        con: connection to postgres database. "set schema ..." must have
            been called on this connection to select the correct schema
            from which to load inspections.
        n_months: time-window size; interpolated into intermediate table
            names and used to select the insp2three11 linkage table.
        max_dist: distance parameter; interpolated into intermediate
            table names and used to select the insp2three11 linkage table.

    Output:
        A pandas dataframe, with one row per inspection and one column
        per feature.
    """
    dataset = 'three11'
    date_column = 'requested_datetime'

    # Get the time window for which you can generate features
    min_insp, max_insp = check_date_boundaries(con, n_months, dataset,
                                               date_column)

    # NOTE: three11 links via lat/long, not address, unlike most siblings
    make_inspections_latlong_nmonths_table(con, dataset, date_column,
                                           min_insp, max_insp,
                                           n_months=n_months,
                                           max_dist=max_dist, load=False)

    max_rnum = 15

    logger.info('Computing distance features for {}'.format(dataset))

    coalescemissing = "'missing'"  # needs to be double-quoted cause SQL-injection

    # frequent service_codes, so we can prune them (there are too many)
    # NOTE(review): this call spells the keyword `coalesceto=`, while the
    # permits/sales/fire builders spell it `coalesce_to=` -- one of the
    # two spellings must be a TypeError against the helper's real
    # signature; confirm which and unify.
    make_table_of_frequent_codes(
        con, col='service_code',
        intable='public.three11',
        outtable='public.frequentthree11_service_code',
        rnum=max_rnum,
        coalesceto=coalescemissing)

    cur = con.cursor()
    query = """
        DROP TABLE IF EXISTS three11features1_{n_months}months_{max_dist}m;
        DROP TABLE IF EXISTS joinedthree11_{n_months}months_{max_dist}m;

        -- join the inspections and three11
        CREATE TEMP TABLE joinedthree11_{n_months}months_{max_dist}m
        ON COMMIT DROP AS
            SELECT parcel_id, inspection_date,
                   agency_responsible, status,
                   coalesce(service_code,{coalescemissing}) as service_code,
                   CASE WHEN description='Request entered through the Web. Refer to Intake Questions for further description.'
                        THEN 1 ELSE 0
                   END AS webrequest
            FROM insp2three11_{n_months}months_{max_dist}m i2e
            LEFT JOIN LATERAL (
                SELECT * FROM public.three11 s where s.id=i2e.id
            ) event
            ON true
        ;
        CREATE INDEX ON joinedthree11_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- make the simple features
        CREATE TEMP TABLE three11features1_{n_months}months_{max_dist}m
        ON COMMIT DROP AS
            SELECT parcel_id, inspection_date,
                   sum(webrequest) as sum_webrequest,
                   avg(webrequest) as avg_webrequest,
                   count(*) as total
            FROM joinedthree11_{n_months}months_{max_dist}m event
            GROUP BY parcel_id, inspection_date;
        CREATE INDEX ON three11features1_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- make the categorical (dummified) features
        CREATE TEMP TABLE three11features2_{n_months}months_{max_dist}m
        ON COMMIT DROP AS
            -- restrict three11 levels to the 15 most common ones,
            -- using the tables of frequency counts for these levels
            -- that we created earlier
            -- also make sure all levels always appear
            SELECT t2.parcel_id, t2.inspection_date,
                   'service_code_'||t2.level AS categ,
                   coalesce(t1.count,0) as count
            FROM (SELECT parcel_id, inspection_date, ft.level,
                         count(*) as count
                  FROM joinedthree11_{n_months}months_{max_dist}m event
                  LEFT JOIN public.frequentthree11_service_code ft
                  ON ft.raw_level = event.service_code
                  GROUP BY parcel_id, inspection_date, ft.level
                 ) t1
            RIGHT JOIN (SELECT parcel_id, inspection_date, ft.level
                        FROM parcels_inspections
                        JOIN (select distinct level
                              from public.frequentthree11_service_code) ft
                        ON true
                       ) t2
            USING (parcel_id, inspection_date,level)
        ;
        CREATE INDEX ON three11features2_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- Now call the pivot function to create columns with the
        -- different fire types
        SELECT colpivot('three11pivot_{n_months}months_{max_dist}m',
                        'select * from three11features2_{n_months}months_{max_dist}m',
                        array['parcel_id','inspection_date'],
                        array['categ'],
                        'coalesce(#.count,0)',
                        null
        );
        CREATE INDEX ON three11pivot_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- still need to 'save' the tables into a permanent table
        DROP TABLE IF EXISTS three11features_{n_months}months_{max_dist}m;
        CREATE TABLE three11features_{n_months}months_{max_dist}m AS
            SELECT * FROM three11features1_{n_months}months_{max_dist}m
            JOIN three11pivot_{n_months}months_{max_dist}m
            USING (parcel_id, inspection_date)
        ;
    """.format(n_months=str(n_months),
               max_dist=str(max_dist),
               coalescemissing=coalescemissing)

    cur.execute(query)
    con.commit()

    # fetch the data
    query = """
        SELECT * FROM three11features_{n_months}months_{max_dist}m;
    """.format(n_months=str(n_months), max_dist=str(max_dist))
    df = pd.read_sql(query, con, index_col=['parcel_id', 'inspection_date'])

    # clean up the column names
    df.columns = map(lambda x: x.replace(' ', '_').lower(), df.columns)
    df.columns = map(
        lambda x: ''.join(c for c in x if c.isalnum() or c == '_'),
        df.columns)

    # drop the last interim table
    query = 'drop table three11features_{n_months}months_{max_dist}m'.format(
        n_months=str(n_months), max_dist=str(max_dist))
    cur.execute(query)
    con.commit()

    return df
def make_permits_features(con, n_months, max_dist):
    """
    Make permits features.

    Builds, in SQL: per-(parcel_id, inspection_date) aggregates over
    nearby building permits (counts, date-interval averages, cost/size
    averages) plus dummified counts for five categorical permit columns
    (restricted to their 15 most frequent levels), pivoted into one
    column per level via colpivot().

    Fixes vs previous revision:
      * added DROP TABLE IF EXISTS before the final CREATE TABLE so
        re-runs don't fail (matches the crime/three11/sales builders);
      * removed an unused `insp2tablename` local and a duplicate
        `con.cursor()` call.

    Input:
        con: connection to postgres database. "set schema ..." must have
            been called on this connection to select the correct schema
            from which to load inspections.
        n_months: time-window size; interpolated into intermediate table
            names and used to select the insp2permits linkage table.
        max_dist: distance parameter; interpolated into intermediate
            table names and used to select the insp2permits linkage table.

    Output:
        A pandas dataframe, with one row per inspection and one column
        per feature.
    """
    dataset = 'permits'
    date_column = 'issueddate'

    # add the colpivot function to our Postgres schema
    load_colpivot(con)

    # Get the time window for which you can generate features
    min_insp, max_insp = check_date_boundaries(con, n_months, dataset,
                                               date_column)

    make_inspections_address_nmonths_table(con, dataset, date_column,
                                           min_insp, max_insp,
                                           n_months=n_months,
                                           max_dist=max_dist, load=False)

    logger.info('Computing distance features for {}'.format(dataset))

    cur = con.cursor()

    # create a table of the most common levels for each categorical
    # permit column, so we can limit the pivot later to the 15 most
    # common types of uses
    cols = [
        'proposeduse', 'statuscurrent', 'workclass', 'permitclass',
        'permittype'
    ]
    coalescemissing = "'missing'"
    for col in cols:
        make_table_of_frequent_codes(
            con, col=col,
            intable='public.permits',
            outtable='public.frequentpermit_%s' % col,
            rnum=15,
            coalesce_to=coalescemissing)

    # Template producing, for one categorical column, a
    # (parcel_id, inspection_date, categ, count) table restricted to
    # the column's frequent levels; the per-column pieces get
    # UNION ALL'ed together and pivoted below.
    unionall_template = """
        SELECT parcel_id, inspection_date,
               '{col}_'||coalesce(t2.level,{coalescemissing}) as categ,
               coalesce(t1.count, 0) as count
        FROM (
            SELECT parcel_id, inspection_date, fs.level, count(*) as count
            FROM joinedpermits_{n_months}months_{max_dist}m event
            LEFT JOIN public.frequentpermit_{col} fs
            ON fs.raw_level = coalesce(event.{col},{coalescemissing})
            GROUP BY parcel_id, inspection_date, fs.level
        ) t1
        RIGHT JOIN (
            SELECT parcel_id, inspection_date, t.level
            FROM parcels_inspections
            JOIN ( SELECT distinct level
                   FROM public.frequentpermit_{col} ) t
            ON true
        ) t2
        USING (parcel_id, inspection_date, level)
    """

    # first column stands alone; the rest are appended as UNION ALL
    unionall_statements = unionall_template.format(
        col=cols[0],
        n_months=str(n_months),
        max_dist=str(max_dist),
        coalescemissing=coalescemissing) + \
        '\n'.join([
            'UNION ALL ( %s )' % unionall_template.format(
                col=col,
                n_months=str(n_months),
                max_dist=str(max_dist),
                coalescemissing=coalescemissing)
            for col in cols[1:]
        ])

    query = """
        DROP TABLE IF EXISTS permitfeatures1_{n_months}months_{max_dist}m;

        CREATE TEMP TABLE permitfeatures1_{n_months}months_{max_dist}m
        ON COMMIT DROP AS
            SELECT parcel_id, inspection_date,
                   count(*) as total,
                   avg(completeddate-applieddate) as avg_days_applied_to_completed,
                   avg(completeddate-issueddate) as avg_days_issued_to_completed,
                   avg(issueddate-applieddate) as avg_days_applied_to_issued,
                   avg(expiresdate-issueddate) as avg_days_issued_to_expires,
                   avg(expiresdate-completeddate) as avg_days_completed_to_expires,
                   avg(CASE WHEN issueddate IS NOT NULL THEN 1 ELSE 0 END) as avg_issued,
                   avg(CASE WHEN completeddate IS NOT NULL THEN 1 ELSE 0 END) as avg_completed,
                   avg(CASE WHEN expiresdate IS NOT NULL THEN 1 ELSE 0 END) as avg_expires,
                   avg(totalsqft) as avg_sqft,
                   avg(estprojectcostdec) as avg_estcost,
                   avg(units) as avg_units,
                   avg(CASE WHEN coissueddate IS NOT NULL THEN 1 ELSE 0 END) as avg_is_coissued,
                   avg(substring(fee from 2)::real) as avg_fee,
                   avg(CASE WHEN companyname='OWNER' THEN 1 ELSE 0 END) as avg_owner_is_company
            FROM insp2permits_{n_months}months_{max_dist}m i2e
            LEFT JOIN public.permits event
            USING (id)
            GROUP BY parcel_id, inspection_date;
        CREATE INDEX ON permitfeatures1_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- make the categorical (dummified) features
        CREATE TEMP TABLE joinedpermits_{n_months}months_{max_dist}m
        ON COMMIT DROP AS
            SELECT parcel_id, inspection_date, event.*
            FROM insp2permits_{n_months}months_{max_dist}m i2e
            LEFT JOIN LATERAL (
                SELECT * FROM public.permits s where s.id=i2e.id
            ) event
            ON true
        ;
        CREATE INDEX ON joinedpermits_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- Join the permits with the inspections; then concatenate the
        -- inspections and the various categorical variables (we'll pivot later)
        CREATE TEMP TABLE permitfeatures2_{n_months}months_{max_dist}m
        ON COMMIT DROP AS
        {unionall_statements};
        CREATE INDEX ON permitfeatures2_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- Now call the pivot function to create columns with the
        -- different fire types
        SELECT colpivot('permitpivot_{n_months}months_{max_dist}m',
                        'select * from permitfeatures2_{n_months}months_{max_dist}m',
                        array['parcel_id','inspection_date'],
                        array['categ'],
                        'coalesce(#.count,0)',
                        null
        );
        CREATE INDEX ON permitpivot_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- still need to 'save' the tables into a permanent table;
        -- drop any stale copy first so re-runs don't fail
        DROP TABLE IF EXISTS permitfeatures_{n_months}months_{max_dist}m;
        CREATE TABLE permitfeatures_{n_months}months_{max_dist}m AS
            SELECT * FROM permitfeatures1_{n_months}months_{max_dist}m
            JOIN permitpivot_{n_months}months_{max_dist}m
            USING (parcel_id, inspection_date)
        ;
    """.format(n_months=str(n_months),
               max_dist=str(max_dist),
               unionall_statements=unionall_statements)

    cur.execute(query)
    con.commit()

    # fetch the data
    query = """
        SELECT * FROM permitfeatures_{n_months}months_{max_dist}m;
    """.format(n_months=str(n_months), max_dist=str(max_dist))
    df = pd.read_sql(query, con, index_col=['parcel_id', 'inspection_date'])

    # clean up the column names
    df.columns = map(lambda x: x.replace(' ', '_').lower(), df.columns)
    df.columns = map(
        lambda x: ''.join(c for c in x if c.isalnum() or c == '_'),
        df.columns)

    # drop the last interim table
    query = 'drop table permitfeatures_{n_months}months_{max_dist}m'.format(
        n_months=str(n_months), max_dist=str(max_dist))
    cur.execute(query)
    con.commit()

    return df
def make_sales_features(con, n_months, max_dist):
    """
    Make sales features.

    Builds, in SQL: per-(parcel_id, inspection_date) averages over
    numeric sales columns for nearby sales, plus dummified counts for
    several categorical columns (restricted to their 15 most frequent
    levels, with use_code handled specially because it is an int),
    pivoted into one column per level via colpivot().

    Input:
        con: connection to postgres database. "set schema ..." must have
            been called on this connection to select the correct schema
            from which to load inspections.
        n_months: time-window size; interpolated into intermediate table
            names and used to select the insp2sales linkage table.
        max_dist: distance parameter; interpolated into intermediate
            table names and used to select the insp2sales linkage table.

    Output:
        A pandas dataframe, with one row per inspection and one column
        per feature.
    """
    dataset = 'sales'
    date_column = 'date_of_sale'

    # NOTE(review): insp2tablename is computed but never used below --
    # candidate for removal.
    insp2tablename = ('insp2{dataset}_{n_months}months'
                      '_{max_dist}m').format(dataset='sales',
                                             n_months=str(n_months),
                                             max_dist=str(max_dist))

    # add the colpivot function to our Postgres schema
    load_colpivot(con)

    # Get the time window for which you can generate features
    min_insp, max_insp = check_date_boundaries(con, n_months, dataset,
                                               date_column)

    make_inspections_address_nmonths_table(con, dataset, date_column,
                                           min_insp, max_insp,
                                           n_months=n_months,
                                           max_dist=max_dist, load=False)

    logger.info('Computing distance features for {}'.format(dataset))

    # there are several columns that we need to prune in terms of codes;
    # thus, make tables of value counts
    rnum = 15
    coalescemissing_use_code = "11111"  # use_code is an int, so hack this
    coalescemissing = "'missing'"

    to_dummify_columns = [
        'instrument_type', 'garage_type', 'style', 'grade',
        'exterior_wall_type', 'basement', 'heating', 'air_conditioning'
    ]
    for col in to_dummify_columns:
        make_table_of_frequent_codes(
            con, col=col,
            intable='public.sales',
            outtable='public.frequentsales_%s' % col,
            rnum=rnum,
            coalesce_to=coalescemissing)

    # use_code needs special treatment because it's an int
    # NOTE(review): this call spells the keyword `coalesceto=`, while the
    # loop just above uses `coalesce_to=` against the same helper -- one
    # of the two spellings must be a TypeError; confirm which and unify.
    make_table_of_frequent_codes(
        con, col='use_code',
        intable='public.sales',
        outtable='public.frequentsales_use_code',
        coalesceto=coalescemissing_use_code,
        rnum=rnum,
        to_other="9999")

    cur = con.cursor()

    # let's generate all the 'simple' features we might want;
    # each column will be named similar to 'avg_total_rooms'
    coltemplate = "{fun}({col}) AS {fun}_{col}"
    cols = [
        'number_of_parcels', 'appraisal_area', 'total_sales_records',
        'sale_price', 'total_rooms', 'full_bath', 'half_bath',
        'fireplaces', 'garage_capacity',
        'num_stories', 'year_built', 'finished_sq_ft',
        'total_finish_area', 'first_floor_area', 'half_floor_area',
        'finished_basement'
    ]
    funs = [
        'avg'
    ]  # ,'sum','min','max','stddev'] # could do more, but probably not necessary
    featureselects = ',\n'.join(
        coltemplate.format(fun=f, col=c)
        for f, c in itertools.product(funs, cols))

    # This is a template for a pivot table. In the sales table, we have
    # several categorical columns. We need to pivot these into columns,
    # with counts grouped by parcel_id and inspection_date.
    # As a first step, we make a table for each categorical column that
    # we want to pivot. Each such table has columns
    # (parcel_id, inspection_date, categ, count), where categ is the
    # level of our categorical column, and count is the number of times
    # that level appears for index (parcel_id, inspection_date). (We
    # create a new level for 'null' rows.)
    # Here, we just define a template for this table query; we'll use it
    # below. {col} will be the categorical column name;
    # joinedsales_Xmonths_Ym a join between sales and
    # insp2sales_Xmonths_Ym.
    unionall_template = """
        SELECT parcel_id, inspection_date,
               '{col}_'||coalesce(t2.level,{coalescemissing}) as categ,
               coalesce(t1.count, 0) as count
        FROM (
            SELECT parcel_id, inspection_date, fs.level, count(*) as count
            FROM joinedsales_{n_months}months_{max_dist}m event
            LEFT JOIN public.frequentsales_{col} fs
            ON fs.raw_level = coalesce(event.{col},{coalescemissing})
            GROUP BY parcel_id, inspection_date, fs.level
        ) t1
        RIGHT JOIN (
            SELECT parcel_id, inspection_date, t.level
            FROM parcels_inspections
            JOIN ( SELECT distinct level
                   FROM public.frequentsales_{col} ) t
            ON true
        ) t2
        USING (parcel_id, inspection_date, level)
    """

    unionall_statements = '\n'.join([
        'UNION ALL ( %s )' % unionall_template.format(
            col=col,
            n_months=str(n_months),
            max_dist=str(max_dist),
            coalescemissing=coalescemissing)
        for col in to_dummify_columns
    ])

    query = """
        DROP TABLE IF EXISTS salesfeatures1_{n_months}months_{max_dist}m;
        DROP TABLE IF EXISTS joinedsales_{n_months}months_{max_dist}m;

        -- join the inspections and sales
        CREATE TEMP TABLE joinedsales_{n_months}months_{max_dist}m
        ON COMMIT DROP AS
            SELECT parcel_id, inspection_date, event.*
            FROM insp2sales_{n_months}months_{max_dist}m i2e
            LEFT JOIN LATERAL (
                SELECT * FROM public.sales s where s.id=i2e.id
            ) event
            ON true
        ;
        CREATE INDEX ON joinedsales_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- make the simple features
        CREATE TEMP TABLE salesfeatures1_{n_months}months_{max_dist}m
        ON COMMIT DROP AS
            SELECT parcel_id, inspection_date,
                   count(*) as total,
                   {featureselects}
            FROM joinedsales_{n_months}months_{max_dist}m event
            GROUP BY parcel_id, inspection_date;
        CREATE INDEX ON salesfeatures1_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- make the categorical (dummified) features
        CREATE TEMP TABLE salesfeatures2_{n_months}months_{max_dist}m
        ON COMMIT DROP AS
            -- now, we have a few columns with too many levels; we
            -- restrict these levels to the 15 most common ones, using
            -- the tables of frequency counts for these levels that we
            -- created earlier
            -- use_code is special, as it's an int (and we want it as varchar)
            SELECT parcel_id, inspection_date,
                   'use_code_'||coalesce(t2.level::varchar,'missing') as categ,
                   coalesce(t1.count, 0) as count
            FROM (
                SELECT parcel_id, inspection_date, fs.level, count(*) as count
                FROM joinedsales_{n_months}months_{max_dist}m event
                LEFT JOIN public.frequentsales_use_code fs
                ON fs.raw_level = coalesce(event.use_code,{coalescemissing_use_code})
                GROUP BY parcel_id, inspection_date, fs.level
            ) t1
            RIGHT JOIN (
                SELECT parcel_id, inspection_date, t.level
                FROM parcels_inspections
                JOIN ( SELECT distinct level
                       FROM public.frequentsales_use_code ) t
                ON true
            ) t2
            USING (parcel_id, inspection_date, level)
            {unionall_statements}
            -- these are all the columns that we defined above
        ;
        CREATE INDEX ON salesfeatures2_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- Now call the pivot function to create columns with the
        -- different fire types
        SELECT colpivot('salespivot_{n_months}months_{max_dist}m',
                        'select * from salesfeatures2_{n_months}months_{max_dist}m',
                        array['parcel_id','inspection_date'],
                        array['categ'],
                        'coalesce(#.count,0)',
                        null
        );
        CREATE INDEX ON salespivot_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- still need to 'save' the tables into a permanent table
        DROP TABLE IF EXISTS salesfeatures_{n_months}months_{max_dist}m;
        CREATE TABLE salesfeatures_{n_months}months_{max_dist}m AS
            SELECT * FROM salesfeatures1_{n_months}months_{max_dist}m
            JOIN salespivot_{n_months}months_{max_dist}m
            USING (parcel_id, inspection_date)
        ;
    """.format(n_months=str(n_months),
               max_dist=str(max_dist),
               featureselects=featureselects,
               coalescemissing_use_code=coalescemissing_use_code,
               unionall_statements=unionall_statements)

    cur.execute(query)
    con.commit()

    # fetch the data
    query = """
        SELECT * FROM salesfeatures_{n_months}months_{max_dist}m;
    """.format(n_months=str(n_months), max_dist=str(max_dist))
    df = pd.read_sql(query, con, index_col=['parcel_id', 'inspection_date'])

    # clean up the column names
    df.columns = map(lambda x: x.replace(' ', '_').lower(), df.columns)
    df.columns = map(
        lambda x: ''.join(c for c in x if c.isalnum() or c == '_'),
        df.columns)

    # drop the last interim table
    query = 'drop table salesfeatures_{n_months}months_{max_dist}m'.format(
        n_months=str(n_months), max_dist=str(max_dist))
    cur.execute(query)
    con.commit()

    return df
def make_inspections_features(con, n_months, max_dist):
    """
    Make inspections features.

    First ensures a per-parcel count of nearby houses exists
    (insp2houses_{max_dist}m), then counts nearby past inspection events
    per (parcel_id, inspection_date, event type), both raw and
    regularized by the number of nearby houses, and pivots the events
    into columns via colpivot().

    Fix vs previous revision: uses the module-level `logger` (as every
    sibling feature builder does) instead of the root logger via
    `logging.info`.

    Input:
        con: connection to postgres database. "set schema ..." must have
            been called on this connection to select the correct schema
            from which to load inspections.
        n_months: time-window size, spliced into the SQL interval and
            intermediate table names.
        max_dist: distance parameter, spliced into the ST_DWithin radius
            (multiplied by 3.281 -- presumably a meters-to-feet
            conversion; confirm units of the geometry columns) and into
            table names.

    Output:
        A pandas dataframe, with one row per inspection and one column
        per feature.
    """
    dataset = 'inspections_views.events_parcel_id'
    date_column = 'date'

    ## ------------------------------------------------------------------------
    ## Make the parcel_id-to-nearby-houses table, if it's not there yet.
    ## ------------------------------------------------------------------------

    query = """
        CREATE TABLE insp2houses_{max_dist}m AS
            SELECT feature_y.parcel_id, count(*) as parcels
            FROM (
                SELECT t.parcel_id, p.geom
                FROM (SELECT DISTINCT parcel_id FROM parcels_inspections) t
                LEFT JOIN shape_files.parcels_cincy p
                ON t.parcel_id=p.parcelid
            ) feature_y
            LEFT JOIN shape_files.parcels_cincy parcels
            ON ST_DWithin(feature_y.geom, parcels.geom,
                          {max_dist}*3.281::double precision)
            AND feature_y.parcel_id <> parcels.parcelid
            GROUP BY feature_y.parcel_id
        ;
        CREATE INDEX ON insp2houses_{max_dist}m (parcel_id);
    """.format(max_dist=max_dist)

    # Create a cursor
    cur = con.cursor()

    # Get the current schema
    cur.execute('SELECT current_schema;')
    current_schema = cur.fetchone()[0]

    # Build the table name
    table_name = 'insp2houses_{max_dist}m'.format(max_dist=max_dist)

    # check if table already exists in current schema;
    # if not, create it
    if table_name not in tables_in_schema(current_schema):
        logger.info("Table %s does not exist yet, generating." % table_name)
        cur.execute(query)
    else:
        logger.info("Table %s already exists, skipping." % table_name)
    con.commit()

    ## ------------------------------------------------------------------------
    ## Make the table of nearby events, and the features.
    ## ------------------------------------------------------------------------

    # Get the time window for which you can generate features
    min_insp, max_insp = check_date_boundaries(con, n_months, dataset,
                                               date_column)

    query = """
        DROP TABLE IF EXISTS inspfeatures1_{n_months}months_{max_dist}m;
        CREATE TEMP TABLE inspfeatures1_{n_months}months_{max_dist}m
        ON COMMIT DROP AS
            SELECT t2.parcel_id, t2.inspection_date, t2.event,
                   coalesce(t1.count, 0) as count,
                   (coalesce(t1.count, 0)+1.0) / (coalesce(t2.parcels,0)+5.0)
                       as regularized_count_per_houses
            FROM (
                SELECT feature_y.parcel_id, feature_y.inspection_date,
                       coalesce(realinspections.event,'missing') as event,
                       count(*) as count
                FROM (
                    SELECT t.*, p.geom, ih.parcels
                    FROM parcels_inspections t
                    LEFT JOIN shape_files.parcels_cincy p
                    ON t.parcel_id=p.parcelid
                    LEFT JOIN insp2houses_{max_dist}m ih
                    USING (parcel_id)
                ) feature_y
                JOIN (
                    SELECT insp.*, p.geom
                    FROM inspections_views.events_parcel_id insp
                    JOIN shape_files.parcels_cincy p
                    ON insp.parcel_no=p.parcelid
                ) realinspections
                ON realinspections.date < feature_y.inspection_date
                AND (feature_y.inspection_date - '{n_months} month'::interval)
                    <= realinspections.date
                AND ST_DWithin(feature_y.geom, realinspections.geom,
                               {max_dist}*3.281::double precision)
                WHERE feature_y.inspection_date
                    BETWEEN '{min_date}' AND '{max_date}'
                GROUP BY feature_y.parcel_id, feature_y.inspection_date,
                         realinspections.event
            ) t1
            RIGHT JOIN (SELECT parcel_id, inspection_date, ft.event, parcels
                        FROM parcels_inspections
                        JOIN (select distinct coalesce(event,'missing') as event
                              from inspections_views.events_parcel_id) ft
                        ON true
                        JOIN insp2houses_{max_dist}m USING (parcel_id)
                       ) t2
            USING (parcel_id, inspection_date, event)
        ;

        CREATE TEMP TABLE inspfeatures2_{n_months}months_{max_dist}m
        ON COMMIT DROP AS (
            SELECT parcel_id, inspection_date, event, count
            FROM inspfeatures1_{n_months}months_{max_dist}m
            UNION ALL (
                SELECT parcel_id, inspection_date,
                       event||'_per_houses' as event,
                       regularized_count_per_houses AS count
                FROM inspfeatures1_{n_months}months_{max_dist}m
            )
        );
        CREATE INDEX ON inspfeatures2_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- Now call the pivot function to create columns with the
        -- different inspection events
        SELECT colpivot('insppivot_{n_months}months_{max_dist}m',
                        'select * from inspfeatures2_{n_months}months_{max_dist}m',
                        array['parcel_id','inspection_date'],
                        array['event'],
                        '#.count',
                        null
        );
        -- Note: Not coalescing the counts, as the _per_houses shouldn't be
        -- set to 0. We'll have to leave it to later imputation.
        CREATE INDEX ON insppivot_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- still need to 'save' the tables into a permanent table
        DROP TABLE IF EXISTS inspfeatures_{n_months}months_{max_dist}m;
        CREATE TABLE inspfeatures_{n_months}months_{max_dist}m AS
            SELECT * FROM insppivot_{n_months}months_{max_dist}m ip1
        ;
    """.format(n_months=str(n_months),
               max_dist=max_dist,
               min_date=str(min_insp),
               max_date=str(max_insp))

    cur.execute(query)
    con.commit()

    # fetch the data
    query = """
        SELECT * FROM inspfeatures_{n_months}months_{max_dist}m;
    """.format(n_months=str(n_months), max_dist=max_dist)
    df = pd.read_sql(query, con, index_col=['parcel_id', 'inspection_date'])

    # clean up the column names
    df.columns = map(lambda x: x.replace(' ', '_').lower(), df.columns)
    df.columns = map(
        lambda x: ''.join(c for c in x if c.isalnum() or c == '_'),
        df.columns)

    # drop the last interim table
    query = 'drop table inspfeatures_{n_months}months_{max_dist}m'.format(
        n_months=str(n_months), max_dist=str(max_dist))
    cur.execute(query)
    con.commit()

    return df
def make_fire_features(con, n_months, max_dist):
    """
    Make Fire features.

    Builds, in SQL: per-(parcel_id, inspection_date) counts of nearby
    fire incidents per incident type (restricted to the 15 most frequent
    types, pivoted into columns via colpivot()) plus simple aggregates
    of the unit clear time.

    Fix vs previous revision: the exception log used ``e.message``,
    which was removed in Python 3 (and deprecated since 2.6); it now
    uses ``str(e)``, which works on both Python 2 and 3.

    Input:
        con: connection to postgres database. "set schema ..." must have
            been called on this connection to select the correct schema
            from which to load inspections.
        n_months: time-window size; interpolated into intermediate table
            names and used to select the insp2fire linkage table.
        max_dist: distance parameter; interpolated into intermediate
            table names and used to select the insp2fire linkage table.

    Output:
        A pandas dataframe, with one row per inspection and one column
        per feature.
    """
    dataset = 'fire'
    date_column = 'incident_date'
    coalescemissing = "'missing'"

    # Get the time window for which you can generate features
    min_insp, max_insp = check_date_boundaries(con, n_months, dataset,
                                               date_column)

    make_inspections_address_nmonths_table(con, dataset, date_column,
                                           min_insp, max_insp,
                                           n_months=n_months,
                                           max_dist=max_dist, load=False)

    logger.info('Computing distance features for {}'.format(dataset))

    insp2tablename = ('insp2{dataset}_{n_months}months'
                      '_{max_dist}m').format(dataset='fire',
                                             n_months=str(n_months),
                                             max_dist=str(max_dist))

    # add the colpivot function to our Postgres schema
    load_colpivot(con)

    cur = con.cursor()

    # create a table of the most common fire types,
    # so we can limit the pivot later to the 15 most common
    # types of incidents
    make_table_of_frequent_codes(con, col='incident_type_desc',
                                 intable='public.fire',
                                 outtable='public.frequentfiretypes',
                                 coalesce_to=coalescemissing,
                                 rnum=15)

    # also make sure that the fire data has an index on the description,
    # as we want to join on it
    query = """
        CREATE INDEX firetype_idx ON public.fire (incident_type_desc);
    """
    try:
        cur.execute(query)
        con.commit()
    except (InternalError, ProgrammingError) as e:
        # index probably exists already; log and keep going
        logger.warning("Catching Exception: " + str(e))
        logger.warning(" - CONTINUING, NOT RE-RUNNING firetype_idx QUERY.....")
        con.rollback()

    # now on to the actual feature generation
    # NOTE(review): the final CREATE TABLE firefeatures_... selects FROM
    # a table of the same name -- this relies on colpivot's TEMP table
    # shadowing the new permanent table during name lookup (pg_temp is
    # searched first). Confirm, and consider giving the pivot output a
    # distinct name as the sibling builders do.
    query = """
        DROP TABLE IF EXISTS firefeatures_{n_months}months_{max_dist}m;

        -- link parcels and events within the right radius
        CREATE TEMP TABLE joinedtable ON COMMIT DROP AS
            SELECT parcel_id, inspection_date, event.*
            FROM insp2fire_{n_months}months_{max_dist}m i2e
            LEFT JOIN LATERAL (
                SELECT * FROM public.fire s where s.id=i2e.id
            ) event
            ON true
        ;
        CREATE INDEX ON joinedtable (parcel_id, inspection_date);

        -- group by inspections and fire types (we'll pivot later)
        -- make sure to include all types
        CREATE TEMP TABLE firetypes_{n_months}months_{max_dist}m
        ON COMMIT DROP AS (
            SELECT t2.parcel_id, t2.inspection_date,
                   'incident_type_'||t2.level AS incident_type_desc,
                   coalesce(t1.count, 0) as count
            FROM (
                SELECT parcel_id, inspection_date,
                       frequentfires.level, count(*) as count
                FROM joinedtable event
                LEFT JOIN public.frequentfiretypes frequentfires
                ON frequentfires.raw_level = coalesce(event.incident_type_desc,
                                                      {coalescemissing})
                GROUP BY parcel_id, inspection_date, frequentfires.level
            ) t1
            RIGHT JOIN (
                SELECT parcel_id, inspection_date, ft.level
                FROM parcels_inspections
                JOIN (SELECT DISTINCT level FROM public.frequentfiretypes) ft
                ON true
            ) t2
            USING (parcel_id, inspection_date, level)
        );
        CREATE INDEX ON firetypes_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- Now call the pivot function to create columns with the
        -- different fire types
        SELECT colpivot('firefeatures_{n_months}months_{max_dist}m',
                        'select * from firetypes_{n_months}months_{max_dist}m',
                        array['parcel_id','inspection_date'],
                        array['incident_type_desc'],
                        'coalesce(#.count,0)',
                        null
        );
        CREATE INDEX ON firefeatures_{n_months}months_{max_dist}m
            (parcel_id,inspection_date);

        -- now we do some simple features
        DROP TABLE IF EXISTS firefeatures2_{n_months}months_{max_dist}m;
        CREATE TEMP TABLE firefeatures2_{n_months}months_{max_dist}m
        ON COMMIT DROP AS (
            SELECT parcel_id, inspection_date,
                   count(*) as total, -- note that total includes the non-frequent incident types
                   avg( extract(epoch from event.unit_clear_date_time-event.alarm_date_time)::int/60 ) as avg_clear_time_minutes,
                   max( extract(epoch from event.unit_clear_date_time-event.alarm_date_time)::int/60 ) as max_clear_time_minutes,
                   min( extract(epoch from event.unit_clear_date_time-event.alarm_date_time)::int/60 ) as min_clear_time_minutes,
                   stddev( extract(epoch from event.unit_clear_date_time-event.alarm_date_time)::int/60 ) as stddev_clear_time_minutes
            FROM joinedtable event
            GROUP BY parcel_id, inspection_date
        );
        CREATE INDEX ON firefeatures2_{n_months}months_{max_dist}m
            (parcel_id,inspection_date);

        -- The pivot function only creates a temp table,
        -- so we still need to save it into a proper table.
        -- Also, this is a good time to join in the other
        -- features we want.
        CREATE TABLE firefeatures_{n_months}months_{max_dist}m AS
            SELECT * FROM firefeatures_{n_months}months_{max_dist}m
            JOIN firefeatures2_{n_months}months_{max_dist}m
            USING (parcel_id, inspection_date)
        ;
    """.format(insp2tablename=insp2tablename,
               n_months=str(n_months),
               max_dist=str(max_dist),
               coalescemissing=coalescemissing)

    cur.execute(query)
    con.commit()

    query = """
        SELECT * FROM firefeatures_{n_months}months_{max_dist}m;
    """.format(n_months=str(n_months), max_dist=str(max_dist))

    # fetch the data
    df = pd.read_sql(query, con, index_col=['parcel_id', 'inspection_date'])

    # clean up the column names
    df.columns = map(lambda x: x.replace(' ', '_').lower(), df.columns)
    df.columns = map(
        lambda x: ''.join(c for c in x if c.isalnum() or c == '_'),
        df.columns)

    # drop the last interim table
    query = 'drop table firefeatures_{n_months}months_{max_dist}m'.format(
        n_months=str(n_months), max_dist=str(max_dist))
    cur.execute(query)
    con.commit()

    return df