# Example #1
def make_permits_features(con, n_months, max_dist):
    """
    Make permits features

    Input:
    db_connection: connection to postgres database.
                   "set schema ..." must have been called on this connection
                   to select the correct schema from which to load inspections

    Output:
    A pandas dataframe, with one row per inspection and one column per feature.
    """
    dataset = 'permits'
    date_column = 'issueddate'

    # Determine the time window for which features can be generated
    min_insp, max_insp = check_date_boundaries(con, n_months, dataset,
                                               date_column)

    make_inspections_address_nmonths_table(con,
                                           dataset,
                                           date_column,
                                           min_insp,
                                           max_insp,
                                           n_months=n_months,
                                           max_dist=max_dist,
                                           load=False)

    logger.info('Computing distance features for {}'.format(dataset))

    # One row per (parcel, inspection) with counts of nearby permit events
    features = group_and_count_from_db(con, dataset, n_months, max_dist)

    # Normalize column names: no spaces, no capital letters
    features.columns = format_column_names(features.columns)
    return features
# Example #2
def make_sales_features(con, n_months, max_dist):
    """
    Make sales features

    Input:
    db_connection: connection to postgres database.
                   "set schema ..." must have been called on this connection
                   to select the correct schema from which to load inspections

    Output:
    A pandas dataframe, with one row per inspection and one column per feature.
    """
    dataset = 'sales'
    date_column = 'datesale'

    make_inspections_address_nmonths_table(con, dataset, date_column,
                                           n_months=n_months,
                                           max_dist=max_dist,
                                           load=False)

    logger.info('Computing distance features for {}'.format(dataset))

    # One row per (parcel, inspection) with counts of nearby sale events
    features = group_and_count_from_db(con, dataset, n_months, max_dist)

    # Normalize column names: no spaces, no capital letters
    features.columns = format_column_names(features.columns)
    return features
# Example #3
def make_crime_features(con, n_months, max_dist):
    """
    Make crime features

    Input:
    db_connection: connection to postgres database.
                   "set schema ..." must have been called on this connection
                   to select the correct schema from which to load inspections

    Output:
    A pandas dataframe, with one row per inspection and one column per feature.
    """
    dataset = 'crime'
    date_column = 'date_reported'

    # NOTE(review): unlike the permits variant, this version does not call
    # check_date_boundaries before building the insp2crime table -- confirm
    # the table builder handles the date window itself.
    make_inspections_address_nmonths_table(con, dataset, date_column,
                                                n_months=n_months,
                                                max_dist=max_dist,
                                                load=False)
    logger.info('Computing distance features for {}'.format(dataset))
    # One row per (parcel, inspection) with counts of nearby crime events
    freq = group_and_count_from_db(con, dataset, n_months, max_dist)
    #Rename columns to avoid spaces and capital letters
    freq.columns = format_column_names(freq.columns)
    return freq
# Example #4
def make_crime_features(con, n_months, max_dist):
    """
    Make crime features

    Input:
    db_connection: connection to postgres database.
                   "set schema ..." must have been called on this connection
                   to select the correct schema from which to load inspections

    Output:
    A pandas dataframe, with one row per inspection and one column per feature.
    """
    dataset = 'crime'
    date_column = 'occurred_on'

    #Get the time window for which you can generate features
    min_insp, max_insp = check_date_boundaries(con, n_months, dataset,
                                               date_column)

    make_inspections_address_nmonths_table(con,
                                           dataset,
                                           date_column,
                                           min_insp,
                                           max_insp,
                                           n_months=n_months,
                                           max_dist=max_dist,
                                           load=False)

    logger.info('Computing distance features for {}'.format(dataset))

    # Keep only the 15 most frequent combined offense codes; everything else
    # gets lumped into an 'other' level below.
    max_rnum = 15
    # SQL literal (including the quotes) substituted for NULL offense codes
    coalescemissing = "'missing'"

    # make a table of the more general offense frequencies so we can prune them
    # also include a column with an array of corresponding detailed levels
    # NOTE: the pattern ' \((\w*)\) ' sits in a non-raw Python string; the
    # backslashes are passed through unchanged to Postgres, which extracts a
    # parenthesized word from the orc column.
    query = """
        DROP TABLE IF EXISTS public.frequentcrimes_orc;
        CREATE TABLE public.frequentcrimes_orc AS (
        WITH t as (
        SELECT coalesce(substring(orc from ' \((\w*)\) '), {coalescemissing}) as orc_combined,
               array_agg(distinct orc) as all_orcs,
               count(*) as count
        FROM public.crime
        GROUP BY orc_combined
        ORDER BY count desc
        )
        SELECT 
            row_number() OVER () as rnum,
            t.orc_combined,
            t.all_orcs,
            CASE WHEN row_number() OVER () <= {rnum} THEN t.orc_combined
            ELSE 'other' END AS level
        FROM t
        );""".format(rnum=max_rnum, coalescemissing=coalescemissing)

    cur = con.cursor()
    cur.execute(query)
    con.commit()

    # Main feature query: join inspections to nearby crime events, compute a
    # simple total count, then dummified counts per (pruned) offense level,
    # pivot the levels into columns, and save the result permanently.
    query = """
        DROP TABLE IF EXISTS crimefeatures1_{n_months}months_{max_dist}m;
       
        DROP TABLE IF EXISTS joinedcrime_{n_months}months_{max_dist}m;

        -- join the inspections and crime
        CREATE TEMP TABLE joinedcrime_{n_months}months_{max_dist}m ON COMMIT DROP AS
            SELECT parcel_id, inspection_date,
                   coalesce(substring(event.orc from ' \((\w*)\) '), {coalescemissing}) as orc_combined
            FROM insp2crime_{n_months}months_{max_dist}m i2e
            LEFT JOIN LATERAL (
                SELECT * FROM public.crime s where s.id=i2e.id
            ) event
            ON true
        ;
        CREATE INDEX ON joinedcrime_{n_months}months_{max_dist}m (parcel_id, inspection_date);
        
        -- make the simple features
        CREATE TEMP TABLE crimefeatures1_{n_months}months_{max_dist}m ON COMMIT DROP AS
            SELECT 
                parcel_id,
                inspection_date,
                count(*) as total
            FROM joinedcrime_{n_months}months_{max_dist}m event
            GROUP BY parcel_id, inspection_date;
        CREATE INDEX ON crimefeatures1_{n_months}months_{max_dist}m (parcel_id, inspection_date);

        -- make the categorical (dummified) features 
        CREATE TEMP TABLE crimefeatures2_{n_months}months_{max_dist}m ON COMMIT DROP AS

            -- restrict crime levels to the 15 most common ones,
            -- using the tables of frequency counts for these levels that we created earlier
            -- also make sure all 15 levels appear

            SELECT 
                t2.parcel_id, t2.inspection_date,
                'orc_combined_'||t2.level AS categ,
                coalesce(t1.count,0) as count   
             FROM
             (SELECT parcel_id, inspection_date,
                     ft.level,
                     count(*) as count
              FROM joinedcrime_{n_months}months_{max_dist}m event
              LEFT JOIN public.frequentcrimes_orc ft
              ON ft.orc_combined = event.orc_combined
              GROUP BY parcel_id, inspection_date, ft.level
             ) t1
             RIGHT JOIN
             (SELECT parcel_id, inspection_date, ft.level 
                 FROM parcels_inspections
                 JOIN 
                     (select distinct level from public.frequentcrimes_orc) ft
                 ON true
             ) t2
             USING (parcel_id, inspection_date,level)
        ;

        CREATE INDEX ON crimefeatures2_{n_months}months_{max_dist}m (parcel_id, inspection_date);

        -- Now call the pivot function to create columns with the 
        -- different fire types
        SELECT colpivot('crimepivot_{n_months}months_{max_dist}m',
                        'select * from crimefeatures2_{n_months}months_{max_dist}m',
                        array['parcel_id','inspection_date'],
                        array['categ'],
                        'coalesce(#.count,0)',
                        null
        );
        CREATE INDEX ON crimepivot_{n_months}months_{max_dist}m (parcel_id, inspection_date);

        -- still need to 'save' the tables into a permanent table
        DROP TABLE IF EXISTS crimefeatures_{n_months}months_{max_dist}m;
        CREATE TABLE crimefeatures_{n_months}months_{max_dist}m AS
            SELECT * FROM crimefeatures1_{n_months}months_{max_dist}m
            JOIN crimepivot_{n_months}months_{max_dist}m
            USING (parcel_id, inspection_date)
        ;
    """.format(n_months=str(n_months),
               max_dist=str(max_dist),
               coalescemissing=coalescemissing)

    cur.execute(query)
    con.commit()

    # fetch the data
    query = """
        SELECT * FROM crimefeatures_{n_months}months_{max_dist}m;
    """.format(n_months=str(n_months), max_dist=str(max_dist))

    df = pd.read_sql(query, con, index_col=['parcel_id', 'inspection_date'])

    # clean up the column names: lowercase, underscores, alphanumerics only
    df.columns = map(lambda x: x.replace(' ', '_').lower(), df.columns)
    df.columns = map(
        lambda x: ''.join(c for c in x if c.isalnum() or c == '_'), df.columns)

    # drop the last interim table
    query = 'drop table crimefeatures_{n_months}months_{max_dist}m'.format(
        n_months=str(n_months), max_dist=str(max_dist))
    cur.execute(query)
    con.commit()

    return df
# Example #5
def make_permits_features(con, n_months, max_dist):
    """
    Make permits features

    Input:
    db_connection: connection to postgres database.
                   "set schema ..." must have been called on this connection
                   to select the correct schema from which to load inspections

    Output:
    A pandas dataframe, with one row per inspection and one column per feature.
    """
    dataset = 'permits'
    date_column = 'issueddate'

    # Install the colpivot() helper function into Postgres (used below)
    load_colpivot(con)

    #Get the time window for which you can generate features
    min_insp, max_insp = check_date_boundaries(con, n_months, dataset,
                                               date_column)

    make_inspections_address_nmonths_table(con,
                                           dataset,
                                           date_column,
                                           min_insp,
                                           max_insp,
                                           n_months=n_months,
                                           max_dist=max_dist,
                                           load=False)

    logger.info('Computing distance features for {}'.format(dataset))

    cur = con.cursor()

    # NOTE(review): insp2tablename is never used in this function -- the SQL
    # below spells out insp2permits_{n_months}months_{max_dist}m directly.
    insp2tablename = ('insp2{dataset}_{n_months}months'
                      '_{max_dist}m').format(dataset='permits',
                                             n_months=str(n_months),
                                             max_dist=str(max_dist))

    # create a table of the most common proposeduse types,
    # so we can limit the pivot later to the 15 most common
    # types of uses
    cols = [
        'proposeduse', 'statuscurrent', 'workclass', 'permitclass',
        'permittype'
    ]

    # SQL literal (including the quotes) substituted for NULL values
    coalescemissing = "'missing'"

    # One frequency table per categorical column, keeping the top 15 levels
    for col in cols:
        make_table_of_frequent_codes(con,
                                     col=col,
                                     intable='public.permits',
                                     outtable='public.frequentpermit_%s' % col,
                                     rnum=15,
                                     coalesce_to=coalescemissing)

    # Template producing (parcel_id, inspection_date, categ, count) rows for
    # one categorical column, with counts per pruned level; the RIGHT JOIN
    # guarantees every inspection appears for every level (count 0 if absent).
    unionall_template = """
        SELECT parcel_id, inspection_date, 
              '{col}_'||coalesce(t2.level,{coalescemissing}) as categ,
              coalesce(t1.count, 0) as count
        FROM (
            SELECT parcel_id, inspection_date,
                   fs.level,
                   count(*) as count
            FROM joinedpermits_{n_months}months_{max_dist}m event
            LEFT JOIN public.frequentpermit_{col} fs
            ON fs.raw_level = coalesce(event.{col},{coalescemissing})
            GROUP BY parcel_id, inspection_date, fs.level
        ) t1
        RIGHT JOIN (
            SELECT parcel_id, inspection_date, t.level
            FROM parcels_inspections
            JOIN ( SELECT distinct level FROM public.frequentpermit_{col} ) t
            ON true
        ) t2
        USING (parcel_id, inspection_date, level)
        """

    # Instantiate the template once per column and UNION ALL them together
    unionall_statements = unionall_template.format(col=cols[0],
                                                  n_months=str(n_months),
                                                  max_dist=str(max_dist),
                                                  coalescemissing=coalescemissing
                                                  ) + \
                          '\n'.join([
                            'UNION ALL ( %s )'%unionall_template.format(col=col,
                                                                        n_months=str(n_months),
                                                                        max_dist=str(max_dist),
                                                                        coalescemissing=coalescemissing
                                                                        )
                            for col in cols[1:]
                            ])

    # NOTE(review): redundant -- a cursor was already created above
    cur = con.cursor()
    query = """
        DROP TABLE IF EXISTS permitfeatures1_{n_months}months_{max_dist}m;

        CREATE TEMP TABLE permitfeatures1_{n_months}months_{max_dist}m ON COMMIT DROP AS
            SELECT 
                parcel_id,
                inspection_date,
                count(*) as total,
                avg(completeddate-applieddate) as avg_days_applied_to_completed,
                avg(completeddate-issueddate) as avg_days_issued_to_completed,
                avg(issueddate-applieddate) as avg_days_applied_to_issued,
                avg(expiresdate-issueddate) as avg_days_issued_to_expires,
                avg(expiresdate-completeddate) as avg_days_completed_to_expires,
                avg(CASE WHEN issueddate IS NOT NULL THEN 1 ELSE 0 END) as avg_issued,
                avg(CASE WHEN completeddate IS NOT NULL THEN 1 ELSE 0 END) as avg_completed,
                avg(CASE WHEN expiresdate IS NOT NULL THEN 1 ELSE 0 END) as avg_expires,
                avg(totalsqft) as avg_sqft,
                avg(estprojectcostdec) as avg_estcost,
                avg(units) as avg_units,
                avg(CASE WHEN coissueddate IS NOT NULL THEN 1 ELSE 0 END) as avg_is_coissued,
                avg(substring(fee from 2)::real) as avg_fee,
                avg(CASE WHEN companyname='OWNER' THEN 1 ELSE 0 END) as avg_owner_is_company
            FROM insp2permits_{n_months}months_{max_dist}m i2e
            LEFT JOIN public.permits event USING (id)
            GROUP BY parcel_id, inspection_date;
        CREATE INDEX ON permitfeatures1_{n_months}months_{max_dist}m (parcel_id, inspection_date);

        -- make the categorical (dummified) features 
        CREATE TEMP TABLE joinedpermits_{n_months}months_{max_dist}m ON COMMIT DROP AS
            SELECT parcel_id, inspection_date, event.* 
            FROM insp2permits_{n_months}months_{max_dist}m i2e
            LEFT JOIN LATERAL (
                SELECT * FROM public.permits s where s.id=i2e.id
            ) event
            ON true
        ;
        CREATE INDEX ON joinedpermits_{n_months}months_{max_dist}m (parcel_id, inspection_date);

        -- Join the permits with the inspections; then concatenate the 
        -- inspections and the various categorical variables (we'll pivot later)
        
        CREATE TEMP TABLE permitfeatures2_{n_months}months_{max_dist}m ON COMMIT DROP AS

            {unionall_statements};

        CREATE INDEX ON permitfeatures2_{n_months}months_{max_dist}m (parcel_id, inspection_date);

        -- Now call the pivot function to create columns with the 
        -- different fire types
        SELECT colpivot('permitpivot_{n_months}months_{max_dist}m',
                        'select * from permitfeatures2_{n_months}months_{max_dist}m',
                        array['parcel_id','inspection_date'],
                        array['categ'],
                        'coalesce(#.count,0)',
                        null
        );
        CREATE INDEX ON permitpivot_{n_months}months_{max_dist}m (parcel_id, inspection_date);

        -- still need to 'save' the tables into a permanent table
        CREATE TABLE permitfeatures_{n_months}months_{max_dist}m AS
            SELECT * FROM permitfeatures1_{n_months}months_{max_dist}m
            JOIN permitpivot_{n_months}months_{max_dist}m
            USING (parcel_id, inspection_date)
        ;
    """.format(n_months=str(n_months),
               max_dist=str(max_dist),
               unionall_statements=unionall_statements)

    # NOTE(review): unlike the sales/crime variants there is no
    # "DROP TABLE IF EXISTS permitfeatures_..." before the final CREATE TABLE
    # above -- this will fail if the table already exists. Confirm intended.
    cur.execute(query)
    con.commit()

    # fetch the data
    query = """
        SELECT * FROM permitfeatures_{n_months}months_{max_dist}m;
    """.format(n_months=str(n_months), max_dist=str(max_dist))

    df = pd.read_sql(query, con, index_col=['parcel_id', 'inspection_date'])

    # clean up the column names: lowercase, underscores, alphanumerics only
    df.columns = map(lambda x: x.replace(' ', '_').lower(), df.columns)
    df.columns = map(
        lambda x: ''.join(c for c in x if c.isalnum() or c == '_'), df.columns)

    # drop the last interim table
    query = 'drop table permitfeatures_{n_months}months_{max_dist}m'.format(
        n_months=str(n_months), max_dist=str(max_dist))
    cur.execute(query)
    con.commit()

    return df
def make_sales_features(con, n_months, max_dist):
    """
    Make sales features

    Input:
    db_connection: connection to postgres database.
                   "set schema ..." must have been called on this connection
                   to select the correct schema from which to load inspections

    Output:
    A pandas dataframe, with one row per inspection and one column per feature.
    """
    dataset = 'sales'
    date_column = 'date_of_sale'

    # Install the colpivot() helper function into Postgres (used below)
    load_colpivot(con)

    #Get the time window for which you can generate features
    min_insp, max_insp = check_date_boundaries(con, n_months, dataset,
                                               date_column)

    make_inspections_address_nmonths_table(con,
                                           dataset,
                                           date_column,
                                           min_insp,
                                           max_insp,
                                           n_months=n_months,
                                           max_dist=max_dist,
                                           load=False)

    logger.info('Computing distance features for {}'.format(dataset))

    # there are several columns that we need to prune in terms of codes;
    # thus, make tables of value counts, keeping only the top `rnum` levels
    rnum = 15

    coalescemissing_use_code = "11111"  # use_code is an int, so hack this
    # SQL literal (including the quotes) substituted for NULL values
    coalescemissing = "'missing'"

    to_dummify_columns = [
        'instrument_type', 'garage_type', 'style', 'grade',
        'exterior_wall_type', 'basement', 'heating', 'air_conditioning'
    ]

    for col in to_dummify_columns:
        make_table_of_frequent_codes(con,
                                     col=col,
                                     intable='public.sales',
                                     outtable='public.frequentsales_%s' % col,
                                     rnum=rnum,
                                     coalesce_to=coalescemissing)

    # use_code needs special treatment because it's an int
    # FIX: keyword was misspelled `coalesceto=`; every other call site of
    # make_table_of_frequent_codes in this module uses `coalesce_to=`.
    make_table_of_frequent_codes(con,
                                 col='use_code',
                                 intable='public.sales',
                                 outtable='public.frequentsales_use_code',
                                 coalesce_to=coalescemissing_use_code,
                                 rnum=rnum,
                                 to_other="9999")

    cur = con.cursor()

    # let's generate all the 'simple' features we might want;
    # each column will be named similar to 'avg_total_rooms'
    coltemplate = "{fun}({col}) AS {fun}_{col}"
    cols = [
        'number_of_parcels', 'appraisal_area', 'total_sales_records',
        'sale_price', 'total_rooms', 'full_bath', 'half_bath', 'fireplaces',
        'garage_capacity', 'num_stories', 'year_built', 'finished_sq_ft',
        'total_finish_area', 'first_floor_area', 'half_floor_area',
        'finished_basement'
    ]
    funs = [
        'avg'
    ]  # ,'sum','min','max','stddev'] # could do more, but probably not necessary
    featureselects = ',\n'.join(
        coltemplate.format(fun=f, col=c)
        for f, c in itertools.product(funs, cols))

    # This is a template for a pivot table. In the sales table, we have several categorical columns.
    # We need to pivot these into columns, with counts grouped by parcel_id and inspection_date.
    # As a first step, we take make a table for each categorical column that we want to pivot.
    # Each such table has columns (parcel_id, inspection_date, categ, count), where categ is
    # the level of our categorical column, and count is the number of times that level appears
    # for index (parcel_id, inspection_date). (We create a new level for 'null' rows.)
    # Here, we just define a template for this table query; we'll use it below.
    # {col} will be the categorical column name; joinedsales_Xmonths_Ym a join between sales and
    # insp2sales_Xmonths_Ym.
    unionall_template = """
        SELECT parcel_id, inspection_date, 
              '{col}_'||coalesce(t2.level,{coalescemissing}) as categ,
              coalesce(t1.count, 0) as count
        FROM (
            SELECT parcel_id, inspection_date,
                   fs.level,
                   count(*) as count
            FROM joinedsales_{n_months}months_{max_dist}m event
            LEFT JOIN public.frequentsales_{col} fs
            ON fs.raw_level = coalesce(event.{col},{coalescemissing})
            GROUP BY parcel_id, inspection_date, fs.level
        ) t1
        RIGHT JOIN (
            SELECT parcel_id, inspection_date, t.level
            FROM parcels_inspections
            JOIN ( SELECT distinct level FROM public.frequentsales_{col} ) t
            ON true
        ) t2
        USING (parcel_id, inspection_date, level)
        """

    # Instantiate the template once per categorical column; these get appended
    # (UNION ALL) after the hand-written use_code query below.
    unionall_statements = '\n'.join([
        'UNION ALL ( %s )' %
        unionall_template.format(col=col,
                                 n_months=str(n_months),
                                 max_dist=str(max_dist),
                                 coalescemissing=coalescemissing)
        for col in to_dummify_columns
    ])

    query = """
        DROP TABLE IF EXISTS salesfeatures1_{n_months}months_{max_dist}m;
       
        DROP TABLE IF EXISTS joinedsales_{n_months}months_{max_dist}m;

        -- join the inspections and sales
        CREATE TEMP TABLE joinedsales_{n_months}months_{max_dist}m ON COMMIT DROP AS
            SELECT parcel_id, inspection_date, event.* 
            FROM insp2sales_{n_months}months_{max_dist}m i2e
            LEFT JOIN LATERAL (
                SELECT * FROM public.sales s where s.id=i2e.id
            ) event
            ON true
        ;
        CREATE INDEX ON joinedsales_{n_months}months_{max_dist}m (parcel_id, inspection_date);

        -- make the simple features
        CREATE TEMP TABLE salesfeatures1_{n_months}months_{max_dist}m ON COMMIT DROP AS
            SELECT 
                parcel_id,
                inspection_date,
                count(*) as total,
                {featureselects}
            FROM joinedsales_{n_months}months_{max_dist}m event
            GROUP BY parcel_id, inspection_date;
        CREATE INDEX ON salesfeatures1_{n_months}months_{max_dist}m (parcel_id, inspection_date);

        -- make the categorical (dummified) features 
        CREATE TEMP TABLE salesfeatures2_{n_months}months_{max_dist}m ON COMMIT DROP AS
        
        -- now, we have a few columns with too many levels; we restrict these levels to the 15 most common ones,
        -- using the tables of frequency counts for these levels that we created earlier

        -- use_code is special, as it's an int (and we want it as varchar)
        SELECT parcel_id, inspection_date, 
              'use_code_'||coalesce(t2.level::varchar,'missing') as categ,
              coalesce(t1.count, 0) as count
        FROM (
            SELECT parcel_id, inspection_date,
                   fs.level,
                   count(*) as count
            FROM joinedsales_{n_months}months_{max_dist}m event
            LEFT JOIN public.frequentsales_use_code fs
            ON fs.raw_level = coalesce(event.use_code,{coalescemissing_use_code})
            GROUP BY parcel_id, inspection_date, fs.level
        ) t1
        RIGHT JOIN (
            SELECT parcel_id, inspection_date, t.level
            FROM parcels_inspections
            JOIN ( SELECT distinct level FROM public.frequentsales_use_code ) t
            ON true
        ) t2
        USING (parcel_id, inspection_date, level)

        {unionall_statements} -- these are all the columns that we defined above
        ;
        
        CREATE INDEX ON salesfeatures2_{n_months}months_{max_dist}m (parcel_id, inspection_date);

        -- Now call the pivot function to create columns with the 
        -- different fire types
        SELECT colpivot('salespivot_{n_months}months_{max_dist}m',
                        'select * from salesfeatures2_{n_months}months_{max_dist}m',
                        array['parcel_id','inspection_date'],
                        array['categ'],
                        'coalesce(#.count,0)',
                        null
        );
        CREATE INDEX ON salespivot_{n_months}months_{max_dist}m (parcel_id, inspection_date);

        -- still need to 'save' the tables into a permanent table
        DROP TABLE IF EXISTS salesfeatures_{n_months}months_{max_dist}m;
        CREATE TABLE salesfeatures_{n_months}months_{max_dist}m AS
            SELECT * FROM salesfeatures1_{n_months}months_{max_dist}m
            JOIN salespivot_{n_months}months_{max_dist}m
            USING (parcel_id, inspection_date)
        ;
    """.format(n_months=str(n_months),
               max_dist=str(max_dist),
               featureselects=featureselects,
               coalescemissing_use_code=coalescemissing_use_code,
               unionall_statements=unionall_statements)

    cur.execute(query)
    con.commit()

    # fetch the data
    query = """
        SELECT * FROM salesfeatures_{n_months}months_{max_dist}m;
    """.format(n_months=str(n_months), max_dist=str(max_dist))

    df = pd.read_sql(query, con, index_col=['parcel_id', 'inspection_date'])

    # clean up the column names: lowercase, underscores, alphanumerics only
    df.columns = map(lambda x: x.replace(' ', '_').lower(), df.columns)
    df.columns = map(
        lambda x: ''.join(c for c in x if c.isalnum() or c == '_'), df.columns)

    # drop the last interim table
    query = 'drop table salesfeatures_{n_months}months_{max_dist}m'.format(
        n_months=str(n_months), max_dist=str(max_dist))
    cur.execute(query)
    con.commit()

    return df
def make_fire_features(con, n_months, max_dist):
    """
    Make Fire features

    Input:
    db_connection: connection to postgres database.
                   "set schema ..." must have been called on this connection
                   to select the correct schema from which to load inspections

    Output:
    A pandas dataframe, with one row per inspection and one column per feature.
    """
    dataset = 'fire'
    date_column = 'incident_date'
    # SQL literal (including the quotes) substituted for NULL incident types
    coalescemissing = "'missing'"

    #Get the time window for which you can generate features
    min_insp, max_insp = check_date_boundaries(con, n_months, dataset,
                                               date_column)

    make_inspections_address_nmonths_table(con,
                                           dataset,
                                           date_column,
                                           min_insp,
                                           max_insp,
                                           n_months=n_months,
                                           max_dist=max_dist,
                                           load=False)

    logger.info('Computing distance features for {}'.format(dataset))

    insp2tablename = ('insp2{dataset}_{n_months}months'
                      '_{max_dist}m').format(dataset='fire',
                                             n_months=str(n_months),
                                             max_dist=str(max_dist))

    # add the colpivot function to our Postgres schema
    load_colpivot(con)

    cur = con.cursor()

    # create a table of the most common fire types,
    # so we can limit the pivot later to the 15 most common
    # types of incidents
    make_table_of_frequent_codes(con,
                                 col='incident_type_desc',
                                 intable='public.fire',
                                 outtable='public.frequentfiretypes',
                                 coalesce_to=coalescemissing,
                                 rnum=15)

    # also make sure that the fire data has an index on the description,
    # as we want to join on it
    query = """
        CREATE INDEX firetype_idx ON public.fire (incident_type_desc);
    """
    try:
        cur.execute(query)
        con.commit()
    except (InternalError, ProgrammingError) as e:
        # FIX: `e.message` is Python-2-only; str(e) works on both 2 and 3.
        # Expected when the index already exists: log and keep going.
        logger.warning("Catching Exception: " + str(e))
        logger.warning(" - CONTINUING, NOT RE-RUNNING firetype_idx QUERY.....")
        con.rollback()

    # now on to the actual feature generation
    # NOTE(review): colpivot() below creates a TEMP table named
    # firefeatures_..., and the final CREATE TABLE reuses that same name for
    # the permanent copy -- presumably relying on Postgres schema resolution
    # (pg_temp vs. default schema); confirm before changing.
    query = """
        DROP TABLE IF EXISTS firefeatures_{n_months}months_{max_dist}m;

        -- link parcels and events within the right radius
        CREATE TEMP TABLE joinedtable ON COMMIT DROP AS
            SELECT parcel_id, inspection_date, event.* 
            FROM insp2fire_{n_months}months_{max_dist}m i2e
            LEFT JOIN LATERAL (
                SELECT * FROM public.fire s where s.id=i2e.id
            ) event
            ON true
        ;
        CREATE INDEX ON joinedtable (parcel_id, inspection_date);

        -- group by inspections and fire types (we'll pivot later)
        -- make sure to include all types
        CREATE TEMP TABLE firetypes_{n_months}months_{max_dist}m ON COMMIT DROP AS (

            SELECT t2.parcel_id, t2.inspection_date,
                   'incident_type_'||t2.level AS incident_type_desc,
                   coalesce(t1.count, 0) as count
            FROM ( SELECT parcel_id, inspection_date,
                       frequentfires.level,
                       count(*) as count
                   FROM joinedtable event
                   LEFT JOIN public.frequentfiretypes frequentfires
                   ON frequentfires.raw_level = coalesce(event.incident_type_desc, {coalescemissing})
                   GROUP BY parcel_id, inspection_date, frequentfires.level
            ) t1
            RIGHT JOIN (
                SELECT parcel_id, inspection_date, ft.level
                FROM parcels_inspections
                JOIN (SELECT DISTINCT level FROM public.frequentfiretypes) ft
                ON true
            ) t2
            USING (parcel_id, inspection_date, level)
        );

        CREATE INDEX ON firetypes_{n_months}months_{max_dist}m (parcel_id, inspection_date);

        -- Now call the pivot function to create columns with the 
        -- different fire types
        SELECT colpivot('firefeatures_{n_months}months_{max_dist}m',
                        'select * from firetypes_{n_months}months_{max_dist}m',
                        array['parcel_id','inspection_date'],
                        array['incident_type_desc'],
                        'coalesce(#.count,0)',
                        null
        );
        CREATE INDEX ON firefeatures_{n_months}months_{max_dist}m (parcel_id,inspection_date);

        -- now we do some simple features
        DROP TABLE IF EXISTS firefeatures2_{n_months}months_{max_dist}m;

        CREATE TEMP TABLE firefeatures2_{n_months}months_{max_dist}m ON COMMIT DROP AS (
            SELECT parcel_id, inspection_date,
                count(*) as total, -- note that total includes the non-frequent incident types
                avg(
                   extract(epoch from event.unit_clear_date_time-event.alarm_date_time)::int/60
                ) as avg_clear_time_minutes,
                max(
                   extract(epoch from event.unit_clear_date_time-event.alarm_date_time)::int/60
                ) as max_clear_time_minutes,
                min(
                   extract(epoch from event.unit_clear_date_time-event.alarm_date_time)::int/60
                ) as min_clear_time_minutes,
                stddev(
                   extract(epoch from event.unit_clear_date_time-event.alarm_date_time)::int/60
                ) as stddev_clear_time_minutes
            FROM joinedtable event
            GROUP BY parcel_id, inspection_date
        ); 
        CREATE INDEX ON firefeatures2_{n_months}months_{max_dist}m (parcel_id,inspection_date);

        -- The pivot function only creates a temp table,
        -- so we still need to save it into a proper table.
        -- Also, this is a good time to join in the other 
        -- features we want.
        CREATE TABLE firefeatures_{n_months}months_{max_dist}m AS
            SELECT * FROM firefeatures_{n_months}months_{max_dist}m
            JOIN firefeatures2_{n_months}months_{max_dist}m
            USING (parcel_id, inspection_date)
        ;
        """.format(insp2tablename=insp2tablename,
                   n_months=str(n_months),
                   max_dist=str(max_dist),
                   coalescemissing=coalescemissing)

    cur.execute(query)
    con.commit()

    query = """
        SELECT * FROM firefeatures_{n_months}months_{max_dist}m;
    """.format(n_months=str(n_months), max_dist=str(max_dist))

    # fetch the data
    df = pd.read_sql(query, con, index_col=['parcel_id', 'inspection_date'])

    # clean up the column names: lowercase, underscores, alphanumerics only
    df.columns = map(lambda x: x.replace(' ', '_').lower(), df.columns)
    df.columns = map(
        lambda x: ''.join(c for c in x if c.isalnum() or c == '_'), df.columns)

    # drop the last interim table
    query = 'drop table firefeatures_{n_months}months_{max_dist}m'.format(
        n_months=str(n_months), max_dist=str(max_dist))
    cur.execute(query)
    con.commit()

    return df