def construct_geo_name(geo_feature_obj, geo_name_obj):
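    """
    Build the distinct geo feature names ('<geo_feature>_<feature_type>')
    for the default features; persisting the results is currently disabled,
    so the function only reports how many names were generated.
    """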

    try:
        # filter geographic data to the default features, excluding exempt feature types

        geo_data = session.query(geo_feature_obj) \
            .filter(geo_feature_obj.geo_feature.in_(DEFAULT['geo_features'])) \
            .filter(~geo_feature_obj.feature_type.in_(DEFAULT['exempt_types'])).subquery()

        geo_name = session.query(
            func.concat(geo_data.c.geo_feature, '_',
                        geo_data.c.feature_type).label('name'),
            geo_data.c.geo_feature,
            geo_data.c.feature_type).distinct().order_by('name').all()

        obj_results = [
            geo_name_obj(name=item[0],
                         geo_feature=item[1],
                         feature_type=item[2]) for item in geo_name
        ]
        # session.add_all(obj_results)
        # session.commit()

        print('Generated {} Geo Names.'.format(len(geo_name)))
        return

    except Exception as e:
        print(e)
        exit(-1)
def gen_geo_vector(geo_obj, geo_name_obj, grid_list):
    """
    load geographic data and construct the geographic vector

    return:
        geo_vector: (n_loc, n_geo_features)
        geo_name_list: a list of geographic feature names
    """

    geo_data = session.query(geo_obj.data) \
        .filter(geo_obj.gid.in_(grid_list)) \
        .order_by(geo_obj.gid).all()

    n_geo_features = len(geo_data[0][0])
    geo_vector = np.array(geo_data).reshape(len(grid_list), n_geo_features)
    print('The shape of geographic vector = {}.'.format(geo_vector.shape))

    geo_name_df = pd.read_sql(
        session.query(geo_name_obj).statement, session.bind)
    geo_name_list = list(geo_name_df['name'])

    if len(geo_name_list) != n_geo_features:
        print('Mismatch: {} geo feature names vs. {} features per location!'.format(
            len(geo_name_list), n_geo_features))

    return geo_vector, geo_name_list
def crop_osm(osm_table, bounding_box):
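    """
    Return a subquery of OSM geometries (with their fclass) that intersect
    the bounding box, or all rows with a non-null fclass when no bounding
    box is given.
    """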

    if bounding_box is not None:
        # use isnot(None): a bare "is not None" is evaluated by Python and never reaches SQL
        return session.query(osm_table.wkb_geometry, osm_table.fclass) \
            .filter(func.ST_Intersects(osm_table.wkb_geometry, bounding_box)) \
            .filter(osm_table.fclass.isnot(None)).subquery()
    else:
        return session.query(osm_table.wkb_geometry, osm_table.fclass) \
            .filter(osm_table.fclass.isnot(None)).subquery()
def construct_geo_vector(**kwargs):
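    """
    For every grid cell, assemble its geo feature values (missing features
    filled with 0.0) plus its lon/lat into one vector and store a row per
    location.
    """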

    geo_feature_obj = kwargs['geo_feature_obj']
    coord_obj = kwargs['coord_obj']
    geo_vector_obj = kwargs['geo_vector_obj']
    geo_name_obj = kwargs['geo_name_obj']

    locations = sorted([i[0] for i in session.query(coord_obj.gid).all()])
    geo_name_df = pd.read_sql(
        session.query(geo_name_obj.name).statement, session.bind)

    try:
        for loc in locations:

            geo_data_sql = session.query(geo_feature_obj.value, func.concat(
                geo_feature_obj.geo_feature, '_', geo_feature_obj.feature_type).label('name')) \
                .filter(geo_feature_obj.gid == loc).statement

            geo_data_df = pd.read_sql(geo_data_sql, session.bind)
            geo_data = geo_name_df.merge(geo_data_df, on='name', how='left')
            geo_data = geo_data['value'].fillna(0.0)

            coord = session.query(
                coord_obj.lon,
                coord_obj.lat).filter(coord_obj.gid == loc).first()
            obj_result = geo_vector_obj(gid=loc,
                                        data=list(geo_data) + list(coord))

            session.add(obj_result)
            session.commit()

            if loc % 1000 == 0:
                print('Geo Vector {} has finished.'.format(loc))

        # add lon and lat to the geo feature names
        obj_results = [
            geo_name_obj(name='lon',
                         geo_feature='location',
                         feature_type='lon'),
            geo_name_obj(name='lat',
                         geo_feature='location',
                         feature_type='lat')
        ]
        # session.add_all(obj_results)
        # session.commit()

        return

    except Exception as e:
        print(e)
        exit(-1)
def main(pm_obj, coord_obj, **kwargs):
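    """
    Split the PM2.5 locations observed in [min_time, max_time) into train,
    val, and test sets using lon/lat-based labels, plot the split, and
    return the three sorted location lists.
    """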

    # compute the number of time points in the period
    min_time, max_time = kwargs.get('min_time', '2018-01-01'), kwargs.get('max_time', '2018-02-01')
    time_list = pd.date_range(start=min_time, end=max_time, closed='left', freq='1H')

    # query all the pm locations that have more than 1% of the period's hours observed
    pm_locations = session.query(pm_obj.gid) \
        .filter(pm_obj.timestamp >= min_time) \
        .filter(pm_obj.timestamp < max_time) \
        .group_by(pm_obj.gid).having(func.count(pm_obj.gid) > 0.01 * len(time_list)).all()
    pm_locations = [i[0] for i in pm_locations]
    print('Number of pm2.5 locations = {}.'.format(len(pm_locations)))

    # query the coordinates of the locations
    coord_df = pd.read_sql(session.query(coord_obj.gid, coord_obj.lon, coord_obj.lat).statement, session.bind)
    grid_coordinates = coord_df[coord_df['gid'].isin(pm_locations)]

    grid_list = list(grid_coordinates['gid'])
    coordinate_arr = grid_coordinates[['lon', 'lat']].values
    location_set = LocationSet(grid_list, coordinate_arr)

    label_dict = gen_labels_with_lon_lat(location_set)

    if min_time == '2018-01-01':
        train_loc, val_loc, test_loc = gen_train_val_test(label_dict, train_radio=0.6, val_radio=0.1)
    else:
        train_loc, val_loc, test_loc = gen_train_val_test(label_dict, train_radio=0.6, val_radio=0.2)

    print(len(train_loc), train_loc)
    print(len(val_loc), val_loc)
    print(len(test_loc), test_loc)

    fig, ax = plt.subplots()
    x = [location_set.location_dict[i].lon for i in train_loc]
    y = [location_set.location_dict[i].lat for i in train_loc]
    ax.scatter(x, y, c='r', label='train')

    x = [location_set.location_dict[i].lon for i in val_loc]
    y = [location_set.location_dict[i].lat for i in val_loc]
    ax.scatter(x, y, c='b', label='val')

    x = [location_set.location_dict[i].lon for i in test_loc]
    y = [location_set.location_dict[i].lat for i in test_loc]
    ax.scatter(x, y, c='g', label='test')

    plt.legend()
    plt.show()

    return sorted(train_loc), sorted(val_loc), sorted(test_loc)
def one_time_prediction(time, epa_geo_vector, fishnet_geo_vector, **kwargs):
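    """
    Train a random forest on the EPA PM2.5 observations available at `time`
    and predict concentrations for every fishnet grid cell; skip the time
    point if three or fewer observations exist.
    """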

    air_data = session.query(epa_obj.station_id, epa_obj.date_observed,
                             epa_obj.concentration).filter(
                                 epa_obj.date_observed == time,
                                 epa_obj.parameter_name == 'PM2.5').all()

    if len(air_data) <= 3:
        return None

    air_df = pd.DataFrame(air_data, columns=AIR_COLUMN_SET)
    air_df = air_df.groupby(by=[KEY_COL, TIME_COL], as_index=False).mean()
    air_df = air_df[air_df[VALUE_COL] > 0]

    locations = list(air_df[KEY_COL])
    y_train = np.array(air_df[VALUE_COL])
    x_train = np.array(epa_geo_vector[locations])
    x_test = np.array(fishnet_geo_vector)

    rf_tree_num = kwargs.get('rf_regression_tree_num', 300)
    rf_tree_depth = kwargs.get('rf_regression_tree_depth', 10)
    model = random_forest_regressor(x_train.T, y_train.T, rf_tree_num,
                                    rf_tree_depth)

    prediction = model.predict(x_test.T)
    gids = list(fishnet_geo_vector.columns)
    write_res(gids, prediction, time)
def gen_label_mat(pm_obj, time_list, mapping_mat):
    """
    construct the label matrix; grids without a label are filled with NaN.

    return:
        pm_mat: (n_times, n_output=1, n_rows, n_cols)
    """

    min_time, max_time = time_list[0], time_list[-1]
    pm_query_sql = session.query(pm_obj.gid, pm_obj.timestamp, pm_obj.pm25) \
        .filter(pm_obj.timestamp >= min_time) \
        .filter(pm_obj.timestamp <= max_time) \
        .order_by(pm_obj.gid)

    pm_data = pd.read_sql(pm_query_sql.statement, session.bind)

    pm_mat_list = []
    for t in time_list:
        this_pm_data = pm_data[pm_data['timestamp'] == t]
        this_pm_grids = list(this_pm_data['gid'])
        this_pm_data = np.array(this_pm_data['pm25']).reshape((1, 1, -1))
        this_pm_mat = gen_grid_data(this_pm_data, this_pm_grids, mapping_mat)
        pm_mat_list.append(this_pm_mat)

    pm_mat = np.vstack(pm_mat_list)
    print('The shape of PM matrix = {}.'.format(pm_mat.shape))
    return pm_mat
def main(old_meo_obj, target_meo_obj):
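    """
    Rebuild the target meo table by interpolating the old table's features
    onto a complete hourly, timezone-aware time index.
    """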

    meo_features = [
        'temperature', 'dew_point', 'humidity', 'pressure', 'wind_speed',
        'wind_bearing', 'cloud_cover', 'visibility'
    ]

    max_time = session.query(func.max(
        old_meo_obj.timestamp)).scalar().strftime('%Y-%m-%d %H:%M:%S')
    min_time = session.query(func.min(
        old_meo_obj.timestamp)).scalar().strftime('%Y-%m-%d %H:%M:%S')
    tz = pytz.timezone('America/Los_Angeles')
    time_df = pd.date_range(start=min_time, end=max_time, freq='1H')
    time_list = sorted(list(set([tz.localize(x) for x in time_df])))
    print(len(time_list))
    """ !!! Be careful, create table would overwrite the original table """
    create_table(target_meo_obj)
    interpolate_time(old_meo_obj, target_meo_obj, time_list, meo_features)
def main(config):
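    """
    Aggregate raw sensor readings into one hourly record per
    (sensor_id, channel), filtering outliers before averaging.
    """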

    start_time = config['START_TIME']
    end_time = config['END_TIME']
    tz = pytz.timezone('America/Los_Angeles')
    time_list = pd.date_range(start=start_time, end=end_time, freq='H')
    time_list = [tz.localize(x) for x in time_list]
    table_obj = config['TABLE']
    new_table_obj = config['NEW_TABLE']

    fields = ['pm1_atm', 'pm2_5_atm', 'pm10_atm', 'pm1_cf_1', 'pm2_5_cf_1', 'pm10_cf_1', 'p_0_3um_cnt', 'p_0_5um_cnt',
               'p_1_0um_cnt', 'p_2_5um_cnt', 'p_5um_cnt', 'p_10um_cnt', 'rssi', 'temperature', 'humidity']

    for i, t in enumerate(time_list[:-1]):

        sql_statement = session.query(table_obj).filter(table_obj.timestamp >= time_list[i], table_obj.timestamp < time_list[i + 1])
        df = pd.read_sql(sql_statement.statement, session.bind)[['sensor_id', 'channel'] + fields]

        if len(df) == 0:
            continue

        def preprocessing(x):
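            # keep values within one standard deviation of the column mean;
            # out-of-range values become 0 and are then mapped to NaN before
            # averaging (note: legitimate zero readings are dropped as well)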
            x_mean, x_std = x.mean(skipna=True), x.std(skipna=True)
            x_left, x_right = x_mean - x_std, x_mean + x_std
            new_x = ((x >= x_left) & (x <= x_right)) * x
            new_x = new_x.replace({0: np.nan})
            return new_x.mean(skipna=True)

        agg_df = df.groupby(['sensor_id', 'channel']).apply(lambda x: preprocessing(x[fields])).round(5)
        agg_df = agg_df[fields].reset_index()
        agg_df = agg_df.replace({np.nan: None})

        agg_data = []
        for _, row in agg_df.iterrows():
            agg_data_obj = new_table_obj(
                sensor_id=row['sensor_id'],
                channel=row['channel'],
                timestamp=time_list[i],
                pm1_atm=row['pm1_atm'],
                pm2_5_atm=row['pm2_5_atm'],
                pm10_atm=row['pm10_atm'],
                pm1_cf_1=row['pm1_cf_1'],
                pm2_5_cf_1=row['pm2_5_cf_1'],
                pm10_cf_1=row['pm10_cf_1'],
                p_0_3um_cnt=row['p_0_3um_cnt'],
                p_0_5um_cnt=row['p_0_5um_cnt'],
                p_1_0um_cnt=row['p_1_0um_cnt'],
                p_2_5um_cnt=row['p_2_5um_cnt'],
                p_5um_cnt=row['p_5um_cnt'],
                p_10um_cnt=row['p_10um_cnt'],
                rssi=row['rssi'],
                temperature=row['temperature'],
                humidity=row['humidity'])
            agg_data.append(agg_data_obj)
        session.add_all(agg_data)
        session.commit()
    print('Finish one table.')
def get_locations(location_table_obj):
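    """Return the channel and ThingSpeak IDs/read keys for every sensor."""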
    locations = session.query(location_table_obj.sensor_id,
                              location_table_obj.parent_id,
                              location_table_obj.channel,
                              location_table_obj.thingspeak_primary_id,
                              location_table_obj.thingspeak_primary_id_read_key,
                              location_table_obj.thingspeak_second_id,
                              location_table_obj.thingspeak_second_id_read_key).all()
    return locations
def load_geo_data(geo_obj, loc_type=None, loc_obj=None):
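    """
    Load the geo feature rows and, when a location table is supplied,
    append lon/lat as extra 'location' features before building the geo
    vector.
    """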

    geo = session.query(geo_obj.gid, geo_obj.geo_feature, geo_obj.feature_type,
                        geo_obj.buffer_size, geo_obj.value).all()

    if loc_type == 'station_id' and loc_obj is not None:
        geo += session.query(loc_obj.station_id, literal('location'),
                             literal('lon'), literal(0), loc_obj.lon).all()
        geo += session.query(loc_obj.station_id, literal('location'),
                             literal('lat'), literal(0), loc_obj.lat).all()
    if loc_type == 'gid' and loc_obj is not None:
        geo += session.query(loc_obj.gid, literal('location'),
                             literal('lon'), literal(0), loc_obj.lon).all()
        geo += session.query(loc_obj.gid, literal('location'),
                             literal('lat'), literal(0), loc_obj.lat).all()

    geo_df = pd.DataFrame(geo, columns=GEO_COLUMN_SET)
    geo_vector = construct_geo_vector(geo_df)
    return geo_df, geo_vector
def request_target_locations(location_obj):
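    """Return {gid: (lon, lat)} for all target locations, wrapped in a status dict."""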

    results = session.query(location_obj).with_entities(
        location_obj.gid, location_obj.lon, location_obj.lat).all()

    loc_dict = {}

    if not results:
        return {'status': -1, 'msg': 'No target locations.', 'data': loc_dict}
    else:
        for res in results:
            loc_dict[res.gid] = (res.lon, res.lat)
        return {'status': 1, 'msg': '', 'data': loc_dict}
def generate_grids(config, area=None):
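    """
    Lay a regular grid of resolution-sized cells over the bounding box (via
    the self-defined Psql function makegrid_2d) and compute each cell's
    centroid in both WGS84 and the projected CRS given by `epsg`.
    """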

    bounding_box = WKTElement(config['BOUNDING_BOX'], srid=4326)
    grid_obj = config['GRID_OBJ']
    resolution = config['RESOLUTION']
    epsg = config['EPSG']

    try:

        grids = session.query(func.ST_Dump(
            func.makegrid_2d(bounding_box, resolution, resolution)).geom.label('geom')  # self-defined function in Psql
        ).subquery()

        # using the boundary to crop the area
        # if config['AREA'] == 'los_angeles':
        #     grids = session.query(grids.c.geom) \
        #         .filter(func.ST_Intersects(LosAngelesCountyBoundary.wkb_geometry, grids.c.geom)).subquery()

        results = session.query(
            func.row_number().over().label('gid'),
            func.ST_Centroid(grids.c.geom).label('centroid'),
            func.ST_X(func.ST_Centroid(grids.c.geom)).label('lon'),
            func.ST_Y(func.ST_Centroid(grids.c.geom)).label('lat'),
            grids.c.geom,
            func.ST_X(func.ST_Transform(func.ST_Centroid(grids.c.geom), epsg)).label('lon_proj'),
            func.ST_Y(func.ST_Transform(func.ST_Centroid(grids.c.geom), epsg)).label('lat_proj')).all()

        obj_results = []
        for res in results:
            obj_results.append(grid_obj(gid=res[0], centroid=res[1], lon=res[2], lat=res[3],
                                        geom=res[4], lon_proj=res[5], lat_proj=res[6]))

        # session.add_all(obj_results)
        # session.commit()
        return

    except Exception as e:
        print(e)
        exit(-1)
def find_station_info():
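    """
    Return a dict mapping (lon, lat) to station_id for all known EPA
    stations, together with the largest station_id seen, or None if the
    table is empty.
    """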

    search_results = session.query(LosAngelesEPALocation)\
        .with_entities(LosAngelesEPALocation.station_id, LosAngelesEPALocation.lon, LosAngelesEPALocation.lat).all()

    if not search_results:
        return None
    else:
        station_dict = {}
        max_station_id = 0
        for res in search_results:
            station_dict[(res.lon, res.lat)] = res.station_id
            if res.station_id > max_station_id:
                max_station_id = res.station_id
        return station_dict, max_station_id
def interpolate_time(old_obj, target_obj, time_list, features):
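    """
    For each location, align its records to the full hourly time index,
    linearly interpolate the missing values, and write one target row per
    timestamp.
    """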

    try:
        time_df = pd.DataFrame(time_list,
                               columns=['timestamp']).set_index(['timestamp'])
        locations = session.query(old_obj.gid).distinct(old_obj.gid).all()
        locations = sorted([loc[0] for loc in locations])

        for loc in locations:

            data = session.query(old_obj.timestamp, *features).filter(old_obj.gid == loc)\
                .order_by(old_obj.timestamp).all()

            df = pd.DataFrame(data, columns=['timestamp'] +
                              features).set_index(['timestamp'])
            df = df.loc[~df.index.duplicated(
                keep='first')]  # remove the potential duplicates in index
            df = df.join(time_df, how='right').sort_index()
            # df['wind_bearing'] = df['wind_bearing'].apply(lambda x: x - 360 if x > 180 else x)

            inter_data = df.interpolate(method='linear').reset_index()

            obj_results = [
                target_obj(gid=loc, timestamp=dt[0], data=dt[1:])
                for dt in inter_data.values.tolist()
            ]
            session.add_all(obj_results)
            session.commit()

            print('Location {} has finished. {} records have been generated.'.
                  format(loc, len(inter_data)))
        return

    except Exception as e:
        print(e)
        exit(-1)
def gen_meo_vector(meo_obj, time_list, grid_list):
    """
    load weather data and construct the meo vector

    return:
        meo_vector: (n_times, n_loc, n_meo_features)
    """

    min_time, max_time = time_list[0], time_list[-1]
    n_times, n_loc = len(time_list), len(grid_list)

    meo_data = session.query(meo_obj.data) \
        .filter(meo_obj.timestamp >= min_time) \
        .filter(meo_obj.timestamp <= max_time) \
        .filter(meo_obj.gid.in_(grid_list)) \
        .order_by(meo_obj.timestamp, meo_obj.gid).all()

    n_meo_features = len(meo_data[0][0])
    meo_vector = np.array(meo_data).reshape((n_times, n_loc, n_meo_features))

    print('The shape of meo vector = {}.'.format(meo_vector.shape))
    return meo_vector
def load_air_data(air_obj, start_time, end_time):
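    """
    Load PM2.5 observations in [start_time, end_time), drop duplicates,
    non-positive values, and sparsely observed locations, and return the
    result as a time series (None if the query is empty).
    """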

    air_data = session.query(air_obj.station_id, air_obj.date_observed,
                             air_obj.concentration).filter(
                                 air_obj.date_observed >= start_time,
                                 air_obj.date_observed < end_time,
                                 air_obj.parameter_name == 'PM2.5').all()

    if len(air_data) < 1:
        return None
    """ remove duplicates and negative values """
    air_df = pd.DataFrame(air_data, columns=AIR_COLUMN_SET)
    air_df = air_df.groupby(by=[KEY_COL, TIME_COL], as_index=False).mean()
    air_df = air_df[air_df[VALUE_COL] > 0]
    """ remove the locations with too few observation """
    n_obs = air_df.groupby(KEY_COL).size().reset_index(name='n')
    rm_loc = list(
        n_obs[n_obs['n'] < int(0.6 *
                               time_diff(start_time, end_time))][KEY_COL])
    air_df = air_df[~air_df[KEY_COL].isin(rm_loc)]
    """ construct timeseries """
    air_time_series = construct_time_series(air_df)
    return air_time_series
def main(input_obj):
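    """
    Assemble the label, dynamic (meo + time), and static (geo) matrices on
    the grid and save them to a compressed .npz file.
    """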

    pm_obj = input_obj['pm_obj']
    meo_obj = input_obj['meo_obj']
    geo_obj = input_obj['geo_obj']
    geo_name_obj = input_obj['geo_name_obj']
    coord_obj = input_obj['coord_obj']

    # load mapping matrix
    mapping_mat = np.load(input_obj['mapping_mat_file'])['mat']

    # load grids
    coord_df = pd.read_sql(
        session.query(coord_obj.gid, coord_obj.lon, coord_obj.lat).statement,
        session.bind)
    grid_list = list(coord_df['gid'])
    print('Number of grids = {}.'.format(len(grid_list)))

    # get time list
    min_time, max_time = input_obj['min_time'], input_obj['max_time']
    tz = pytz.timezone('America/Los_Angeles')
    time_list = pd.date_range(start=min_time,
                              end=max_time,
                              closed='left',
                              freq='1H')
    time_list = sorted(list(set([tz.localize(x) for x in time_list])))
    print('Data from {} to {}.'.format(min_time, max_time))
    print('Number of time points = {}.'.format(len(time_list)))

    # generate label data
    print('...Generating label data...')
    label_mat = gen_label_mat(pm_obj, time_list, mapping_mat)

    # generate dynamic data
    print('...Generating dynamic data...')
    meo_vector = gen_meo_vector(meo_obj, time_list, grid_list)
    time_vector = gen_time_vector(time_list, grid_list)

    dynamic_vector = np.concatenate([meo_vector, time_vector], axis=-1)

    # convert to feature matrix
    dynamic_mat = dynamic_vector.swapaxes(
        1, 2)  # (n_times, n_loc, n_features) => (n_times, n_features, n_loc)
    dynamic_mat = gen_grid_data(dynamic_mat, grid_list, mapping_mat)
    print('The shape of dynamic matrix = {}.'.format(dynamic_mat.shape))

    # generate static data
    print('...Generating static data...')
    geo_vector, geo_name_list = gen_geo_vector(geo_obj, geo_name_obj,
                                               grid_list)
    geo_vector = geo_vector.reshape(1, geo_vector.shape[0],
                                    geo_vector.shape[1])

    # convert to feature matrix
    static_mat = geo_vector.swapaxes(
        1, 2)  # (1, n_loc, n_features) => (1, n_features, n_loc)
    static_mat = gen_grid_data(static_mat, grid_list, mapping_mat)
    print('The shape of static matrix = {}.'.format(static_mat.shape))

    # combine static vector and dynamic vector
    # arr = np.expand_dims(geo_vector, axis=0)
    # arr = np.repeat(arr, len(time_list), axis=0)
    # feature_vector = np.concatenate([feature_vector, arr], axis=-1)

    np.savez_compressed(os.path.join(input_obj['output_file']),
                        label_mat=label_mat,
                        dynamic_mat=dynamic_mat,
                        static_mat=static_mat,
                        dynamic_features=np.array([
                            'temperature', 'dew_point', 'humidity', 'pressure',
                            'wind_speed', 'wind_direction', 'cloud_cover',
                            'visibility', 'hourofday', 'dayofweek', 'day',
                            'month', 'dayofyear'
                        ]),
                        static_features=np.array(geo_name_list),
                        mapping_mat=mapping_mat)
def compute_features_from_osm(config):
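    """
    For every OSM layer, intersect its geometries with the grid and
    aggregate per cell and fclass: area for polygons, length for lines,
    and count for points.
    """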

    osm_tables = config['OSM']
    bounding_box = WKTElement(config['BOUNDING_BOX'], srid=4326)
    grid_obj = config['GRID_OBJ']
    geo_feature_obj = config['GEO_FEATURE_OBJ']

    try:
        for feature_name, osm_table in osm_tables.items():
            geo_feature_type = osm_table.wkb_geometry.type.geometry_type
            cropped_osm = crop_osm(
                osm_table,
                bounding_box)  # crop the OSM data with a bounding box

            sub_query = session.query(grid_obj.gid, cropped_osm.c.fclass,
                                      func.ST_GeogFromWKB(
                                          func.ST_Intersection(grid_obj.geom, cropped_osm.c.wkb_geometry))
                                      .label('intersection')) \
                .filter(func.ST_Intersects(grid_obj.geom, cropped_osm.c.wkb_geometry)).subquery()

            results = []
            if geo_feature_type == 'MULTIPOLYGON':
                results = session.query(sub_query.c.gid.label('gid'),
                                        sub_query.c.fclass.label('feature_type'),
                                        literal(feature_name).label('geo_feature'),
                                        func.SUM(func.ST_AREA(sub_query.c.intersection)).label('value'),
                                        literal('area').label('measurement')) \
                    .group_by(sub_query.c.gid, sub_query.c.fclass).all()

            elif geo_feature_type == 'MULTILINESTRING':
                results = session.query(sub_query.c.gid.label('gid'),
                                        sub_query.c.fclass.label('feature_type'),
                                        literal(feature_name).label('geo_feature'),
                                        func.SUM(func.ST_LENGTH(sub_query.c.intersection)).label('value'),
                                        literal('length').label('measurement')) \
                    .group_by(sub_query.c.gid, sub_query.c.fclass).all()

            elif geo_feature_type == 'POINT':
                results = session.query(sub_query.c.gid.label('gid'),
                                        sub_query.c.fclass.label('feature_type'),
                                        literal(feature_name).label('geo_feature'),
                                        func.COUNT(sub_query.c.intersection).label('value'),
                                        literal('count').label('measurement')) \
                    .group_by(sub_query.c.gid, sub_query.c.fclass).all()

            else:
                pass

            obj_results = []
            for res in results:
                obj_results.append(
                    geo_feature_obj(gid=res[0],
                                    feature_type=res[1],
                                    geo_feature=res[2],
                                    value=res[3],
                                    measurement=res[4]))
            # session.add_all(obj_results)
            # session.commit()
            print('{} has finished.'.format(feature_name))

        return

    except Exception as e:
        print(e)
        exit(-1)
def gen_matrix(coord_obj):
    """
        generate a matrix as
        mat = array([[6917, 6918, 6919, ..., 6990, 6991, 6992],
                     [6841, 6842, 6843, ..., 6914, 6915, 6916],
                     [6765, 6766, 6767, ..., 6838, 6839, 6840],
                     ...,
                     [153, 154, 155, ..., 226, 227, 228],
                     [77, 78, 79, ..., 150, 151, 152],
                     [1, 2, 3, ..., 74, 75, 76]])
    """

    coord_df = pd.read_sql(
        session.query(coord_obj.gid, coord_obj.lon, coord_obj.lat).statement,
        session.bind)
    coord_df = coord_df.round(10)
    coord_dict = {(row[1], row[2]): int(row[0])
                  for row in coord_df.values.tolist()}

    lat_list = sorted(coord_df['lat'].drop_duplicates())
    lat_dict = coord_df[[
        'lon', 'lat'
    ]].groupby('lat')['lon'].apply(lambda x: sorted(x)).to_dict()

    n_rows = len(lat_list)
    n_cols = min([len(v) for k, v in lat_dict.items()])
    """ find neighbors ["left", "right", "up", "down"] for the gid """
    neighbors = {}
    for idx, row in coord_df.iterrows():
        gid, this_lon, this_lat = int(row['gid']), row['lon'], row['lat']
        neighbors[gid] = {}
        neighbors[gid]['left'] = get_horizontal_neighbor(this_lon,
                                                         this_lat,
                                                         lat_dict,
                                                         coord_dict,
                                                         direction=-1)
        neighbors[gid]['right'] = get_horizontal_neighbor(this_lon,
                                                          this_lat,
                                                          lat_dict,
                                                          coord_dict,
                                                          direction=1)
        neighbors[gid]['up'] = get_vertical_neighbor(this_lon,
                                                     this_lat,
                                                     lat_list,
                                                     lat_dict,
                                                     coord_dict,
                                                     direction=1)
        neighbors[gid]['down'] = get_vertical_neighbor(this_lon,
                                                       this_lat,
                                                       lat_list,
                                                       lat_dict,
                                                       coord_dict,
                                                       direction=-1)
    """ convert neighbors to the matrix """
    mat = np.full((n_rows, n_cols), -1)
    curr_gid = min(coord_dict.values())
    curr_row = curr_gid

    for i in range(n_rows - 1, -1, -1):
        for j in range(n_cols):
            mat[i][j] = curr_gid
            if j < n_cols - 1:
                curr_gid = neighbors[curr_gid]['right']
            else:
                curr_gid = neighbors[curr_row]['up']
                curr_row = curr_gid

    return mat
def main(pm_obj, coord_obj, method, **kwargs):
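    """
    Split the PM2.5 locations into train, val, and test sets, with the
    labelling method selectable ('db_k_means', 'k_means', or lon/lat by
    default), plot the split, and return the three sorted lists.
    """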

    # compute the number of time points in the period
    min_time, max_time = kwargs.get('min_time', '2018-01-01'), kwargs.get(
        'max_time', '2018-02-01')
    time_list = pd.date_range(start=min_time,
                              end=max_time,
                              closed='left',
                              freq='1H')

    # query all the pm locations that have more than 1% of the period's hours observed
    pm_locations = session.query(pm_obj.gid) \
        .filter(pm_obj.timestamp >= min_time) \
        .filter(pm_obj.timestamp < max_time) \
        .group_by(pm_obj.gid).having(func.count(pm_obj.gid) > 0.01 * len(time_list)).all()
    pm_locations = [i[0] for i in pm_locations]
    print('Number of pm2.5 locations = {}.'.format(len(pm_locations)))

    # query the coordinates of the locations
    coord_df = pd.read_sql(
        session.query(coord_obj.gid, coord_obj.lon, coord_obj.lat).statement,
        session.bind)
    grid_coordinates = coord_df[coord_df['gid'].isin(pm_locations)]

    grid_list = list(grid_coordinates['gid'])
    coordinate_arr = grid_coordinates[['lon', 'lat']].values
    location_set = LocationSet(grid_list, coordinate_arr)

    if method == 'db_k_means':
        label_dict = gen_labels_with_db_k_means(location_set)

    elif method == 'k_means':
        label_dict = {}
        pass

    else:
        label_dict = gen_labels_with_lon_lat(location_set)

    train_loc, val_loc, test_loc = gen_train_val_test(label_dict)

    if method == 'db_k_means':
        # finally get the train, val, and test locations for the original locations
        def extract_loc(input_loc):
            output_loc = []
            for this_loc in input_loc:
                output_loc += [int(i) for i in str(this_loc.gid).split('_')]
            return output_loc

        train_loc, val_loc, test_loc = extract_loc(train_loc), extract_loc(
            val_loc), extract_loc(test_loc)

    print(len(train_loc), train_loc)
    print(len(val_loc), val_loc)
    print(len(test_loc), test_loc)

    # plot final results
    all_location_info = {'gid': [], 'lon': [], 'lat': [], 'c': []}
    for locs, color in [(train_loc, 'r'), (val_loc, 'b'), (test_loc, 'g')]:
        all_location_info['gid'] += [
            location_set.location_dict[i].gid for i in locs
        ]
        all_location_info['lon'] += [
            location_set.location_dict[i].lon for i in locs
        ]
        all_location_info['lat'] += [
            location_set.location_dict[i].lat for i in locs
        ]
        all_location_info['c'] += [color] * len(locs)

    plt.scatter(all_location_info['lon'],
                all_location_info['lat'],
                c=all_location_info['c'])
    plt.show()

    return sorted(train_loc), sorted(val_loc), sorted(test_loc)