def construct_geo_name(geo_feature_obj, geo_name_obj):
    """Build the distinct geographic feature names from the feature table.

    A name has the form ``<geo_feature>_<feature_type>``.  Only rows whose
    ``geo_feature`` is listed in ``DEFAULT['geo_features']`` and whose
    ``feature_type`` is not listed in ``DEFAULT['exempt_types']`` contribute.

    The ``geo_name_obj`` records are instantiated but not persisted (the
    add/commit step is disabled), so the function only reports how many
    names were generated.

    Args:
        geo_feature_obj: mapped class exposing ``geo_feature`` and
            ``feature_type`` columns.
        geo_name_obj: class instantiated with ``name``, ``geo_feature`` and
            ``feature_type`` keyword arguments.

    On any exception the error is printed and ``exit(-1)`` is called.
    """
    try:
        # Restrict the feature table to the configured features / types.
        wanted_feature = geo_feature_obj.geo_feature.in_(DEFAULT['geo_features'])
        exempt_type = geo_feature_obj.feature_type.in_(DEFAULT['exempt_types'])
        filtered = session.query(geo_feature_obj) \
            .filter(wanted_feature) \
            .filter(~exempt_type) \
            .subquery()

        name_column = func.concat(
            filtered.c.geo_feature, '_', filtered.c.feature_type).label('name')
        name_rows = session.query(
            name_column, filtered.c.geo_feature, filtered.c.feature_type) \
            .distinct().order_by('name').all()

        obj_results = []
        for name, geo_feature, feature_type in name_rows:
            obj_results.append(
                geo_name_obj(name=name, geo_feature=geo_feature,
                             feature_type=feature_type))

        # Persistence is currently disabled:
        # session.add_all(obj_results)
        # session.commit()
        print('Generated {} Geo Names.'.format(len(name_rows)))
    except Exception as e:
        print(e)
        exit(-1)
def gen_geo_vector(geo_obj, geo_name_obj, grid_list):
    """Load the geographic data and build the geographic vector.

    Args:
        geo_obj: mapped class exposing ``gid`` and ``data`` columns.
        geo_name_obj: mapped class whose table has a ``name`` column.
        grid_list: grid ids to load.

    Returns:
        geo_vector: array reshaped to (len(grid_list), n_geo_features)
        geo_name_list: a list of geographic feature names

    A warning is printed (nothing is raised) when the number of names
    differs from the number of features per location.
    """
    vector_query = session.query(geo_obj.data) \
        .filter(geo_obj.gid.in_(grid_list)) \
        .order_by(geo_obj.gid)
    vector_rows = vector_query.all()

    # Feature count is taken from the first row's ``data`` value.
    n_geo_features = len(vector_rows[0][0])
    n_loc = len(grid_list)
    geo_vector = np.array(vector_rows).reshape(n_loc, n_geo_features)
    print('The shape of geographic vector = {}.'.format(geo_vector.shape))

    name_statement = session.query(geo_name_obj).statement
    geo_name_df = pd.read_sql(name_statement, session.bind)
    geo_name_list = list(geo_name_df['name'])

    if n_geo_features != len(geo_name_list):
        print('Something wrong with the geographic feature vector!')
    return geo_vector, geo_name_list
def crop_osm(osm_table, bounding_box):
    """Build a subquery of OSM geometries that have a non-NULL ``fclass``.

    Args:
        osm_table: mapped class exposing ``wkb_geometry`` and ``fclass``.
        bounding_box: geometry used to crop the table with
            ``ST_Intersects``; when ``None`` no spatial filter is applied.

    Returns:
        A subquery with the columns ``wkb_geometry`` and ``fclass``.
    """
    query = session.query(osm_table.wkb_geometry, osm_table.fclass)
    if bounding_box is not None:
        query = query.filter(
            func.ST_Intersects(osm_table.wkb_geometry, bounding_box))
    # ``osm_table.fclass is not None`` is evaluated by Python (a column object
    # is never None), so it always passed ``True`` to filter() and no NULL
    # check reached the database.  ``isnot(None)`` renders ``IS NOT NULL``.
    return query.filter(osm_table.fclass.isnot(None)).subquery()
def construct_geo_vector(**kwargs):
    """Build and store one geographic feature vector per location.

    For every ``gid`` of ``coord_obj`` (in ascending order) the feature
    values are aligned to the names in ``geo_name_obj`` with a left merge,
    missing values are filled with ``0.0``, and the location's ``lon`` and
    ``lat`` are appended.  Each vector is added and committed individually.

    Keyword Args:
        geo_feature_obj: mapped class with ``gid``, ``geo_feature``,
            ``feature_type`` and ``value`` columns.
        coord_obj: mapped class with ``gid``, ``lon`` and ``lat`` columns.
        geo_vector_obj: class instantiated with ``gid`` and ``data``.
        geo_name_obj: mapped class with a ``name`` column.

    On any exception inside the loop the error is printed and ``exit(-1)``
    is called.
    """
    geo_feature_obj = kwargs['geo_feature_obj']
    coord_obj = kwargs['coord_obj']
    geo_vector_obj = kwargs['geo_vector_obj']
    geo_name_obj = kwargs['geo_name_obj']

    gid_rows = session.query(coord_obj.gid).all()
    locations = sorted(row[0] for row in gid_rows)
    name_statement = session.query(geo_name_obj.name).statement
    geo_name_df = pd.read_sql(name_statement, session.bind)

    try:
        for loc in locations:
            name_column = func.concat(
                geo_feature_obj.geo_feature, '_',
                geo_feature_obj.feature_type).label('name')
            feature_query = session.query(geo_feature_obj.value, name_column) \
                .filter(geo_feature_obj.gid == loc)
            feature_df = pd.read_sql(feature_query.statement, session.bind)

            # Left merge keeps one row per known name, in name-table order.
            aligned = geo_name_df.merge(feature_df, on='name', how='left')
            values = aligned['value'].fillna(0.0)

            coord = session.query(coord_obj.lon, coord_obj.lat) \
                .filter(coord_obj.gid == loc).first()
            vector = list(values) + list(coord)

            session.add(geo_vector_obj(gid=loc, data=vector))
            session.commit()

            if loc % 1000 == 0:
                print('Geo Vector {} has finished.'.format(len(vector)))

        # The lon / lat entries for the feature-name table are built here,
        # but persisting them is currently disabled.
        obj_results = [
            geo_name_obj(name=axis, geo_feature='location', feature_type=axis)
            for axis in ('lon', 'lat')
        ]
        # session.add_all(obj_results)
        # session.commit()
    except Exception as e:
        print(e)
        exit(-1)
def main(pm_obj, coord_obj, **kwargs):
    """Split the PM2.5 locations into train / validation / test sets.

    Locations are kept when their number of observations in
    ``[min_time, max_time)`` exceeds 1% of the hourly time points of that
    period.  They are labelled with ``gen_labels_with_lon_lat``, split with
    ``gen_train_val_test`` and shown in a scatter plot.

    Args:
        pm_obj: mapped class exposing ``gid`` and ``timestamp``.
        coord_obj: mapped class exposing ``gid``, ``lon`` and ``lat``.

    Keyword Args:
        min_time: inclusive start of the period (default ``'2018-01-01'``).
        max_time: exclusive end of the period (default ``'2018-02-01'``).

    Returns:
        Three sorted lists: train, validation and test locations.
    """
    # compute the number of time points in the period
    min_time = kwargs.get('min_time', '2018-01-01')
    max_time = kwargs.get('max_time', '2018-02-01')
    # ``date_range(closed='left')`` was removed in pandas 2.0 (replaced by
    # ``inclusive``).  Dropping the end point explicitly gives the same
    # left-closed range on every pandas version.
    time_list = pd.date_range(start=min_time, end=max_time, freq='1H')
    time_list = time_list[time_list < pd.Timestamp(max_time)]

    # query all the pm locations that have a certain number of observations
    pm_locations = session.query(pm_obj.gid) \
        .filter(pm_obj.timestamp >= min_time) \
        .filter(pm_obj.timestamp < max_time) \
        .group_by(pm_obj.gid) \
        .having(func.count(pm_obj.gid) > 0.01 * len(time_list)).all()
    pm_locations = [row[0] for row in pm_locations]
    print('Number of pm2.5 locations = {}.'.format(len(pm_locations)))

    # query the coordinates of the locations
    coord_df = pd.read_sql(
        session.query(coord_obj.gid, coord_obj.lon, coord_obj.lat).statement,
        session.bind)
    grid_coordinates = coord_df[coord_df['gid'].isin(pm_locations)]
    grid_list = list(grid_coordinates['gid'])
    coordinate_arr = grid_coordinates[['lon', 'lat']].values

    location_set = LocationSet(grid_list, coordinate_arr)
    label_dict = gen_labels_with_lon_lat(location_set)

    # The default period uses a smaller validation share than other periods.
    val_ratio = 0.1 if min_time == '2018-01-01' else 0.2
    train_loc, val_loc, test_loc = gen_train_val_test(
        label_dict, train_radio=0.6, val_radio=val_ratio)

    print(len(train_loc), train_loc)
    print(len(val_loc), val_loc)
    print(len(test_loc), test_loc)

    _, ax = plt.subplots()
    splits = ((train_loc, 'r', 'train'),
              (val_loc, 'b', 'val'),
              (test_loc, 'g', 'test'))
    for locs, color, label in splits:
        lons = [location_set.location_dict[i].lon for i in locs]
        lats = [location_set.location_dict[i].lat for i in locs]
        ax.scatter(lons, lats, c=color, label=label)
    plt.legend()
    plt.show()

    return sorted(train_loc), sorted(val_loc), sorted(test_loc)
def one_time_prediction(time, epa_geo_vector, fishnet_geo_vector, **kwargs):
    """Fit a random forest on one time step's PM2.5 data and write predictions.

    Args:
        time: value compared for equality with ``epa_obj.date_observed``.
        epa_geo_vector: indexable by a list of station ids; the selection is
            transposed before training (presumably a DataFrame with one
            column per station - confirm against the caller).
        fishnet_geo_vector: object with ``.columns``; transposed before
            prediction, its columns are passed on as the output ids.

    Keyword Args:
        rf_regression_tree_num: forwarded to ``random_forest_regressor``
            (default 300).
        rf_regression_tree_depth: forwarded to ``random_forest_regressor``
            (default 10).

    Returns:
        ``None``.  Nothing is written when 3 or fewer observations are found.
    """
    observations = session.query(
        epa_obj.station_id, epa_obj.date_observed, epa_obj.concentration
    ).filter(
        epa_obj.date_observed == time,
        epa_obj.parameter_name == 'PM2.5',
    ).all()

    if not len(observations) > 3:
        return None

    air_df = pd.DataFrame(observations, columns=AIR_COLUMN_SET)
    # Average duplicate (key, time) rows, then keep positive values only.
    air_df = air_df.groupby(by=[KEY_COL, TIME_COL], as_index=False).mean()
    positive = air_df[VALUE_COL] > 0
    air_df = air_df[positive]

    locations = list(air_df[KEY_COL])
    y_train = np.array(air_df[VALUE_COL])
    x_train = np.array(epa_geo_vector[locations])
    x_test = np.array(fishnet_geo_vector)

    n_trees = kwargs.get('rf_regression_tree_num', 300)
    max_depth = kwargs.get('rf_regression_tree_depth', 10)
    model = random_forest_regressor(x_train.T, y_train.T, n_trees, max_depth)
    prediction = model.predict(x_test.T)

    write_res(list(fishnet_geo_vector.columns), prediction, time)
def gen_label_mat(pm_obj, time_list, mapping_mat):
    """Construct the label matrix, one slice per entry of ``time_list``.

    Per the original note, grids without a label are filled with NaN; that
    filling is done by ``gen_grid_data``, which is not visible here.

    Args:
        pm_obj: mapped class exposing ``gid``, ``timestamp`` and ``pm25``.
        time_list: time points; the first and last entries bound the query
            (both inclusive).
        mapping_mat: forwarded unchanged to ``gen_grid_data``.

    Returns:
        pm_mat: (n_times, n_output=1, n_rows, n_cols) per the original
        docstring - the trailing dimensions come from ``gen_grid_data``.
    """
    first_time, last_time = time_list[0], time_list[-1]
    pm_query = session.query(pm_obj.gid, pm_obj.timestamp, pm_obj.pm25) \
        .filter(pm_obj.timestamp >= first_time) \
        .filter(pm_obj.timestamp <= last_time) \
        .order_by(pm_obj.gid)
    pm_data = pd.read_sql(pm_query.statement, session.bind)

    def grid_for(t):
        # Observations of a single time point, laid out on the grid.
        at_t = pm_data[pm_data['timestamp'] == t]
        grids = list(at_t['gid'])
        values = np.array(at_t['pm25']).reshape((1, 1, -1))
        return gen_grid_data(values, grids, mapping_mat)

    pm_mat = np.vstack([grid_for(t) for t in time_list])
    print('The shape of PM matrix = {}.'.format(pm_mat.shape))
    return pm_mat
def main(old_meo_obj, target_meo_obj):
    """Interpolate the weather table onto an hourly Los Angeles time grid.

    The hourly grid spans the minimum to the maximum ``timestamp`` of
    ``old_meo_obj``.  The target table is created, then filled by
    ``interpolate_time``.

    Args:
        old_meo_obj: mapped class of the source weather table.
        target_meo_obj: mapped class of the table to create and fill.
    """
    meo_features = [
        'temperature', 'dew_point', 'humidity', 'pressure',
        'wind_speed', 'wind_bearing', 'cloud_cover', 'visibility',
    ]

    stamp_format = '%Y-%m-%d %H:%M:%S'
    latest = session.query(func.max(old_meo_obj.timestamp)).scalar()
    max_time = latest.strftime(stamp_format)
    earliest = session.query(func.min(old_meo_obj.timestamp)).scalar()
    min_time = earliest.strftime(stamp_format)

    tz = pytz.timezone('America/Los_Angeles')
    hourly = pd.date_range(start=min_time, end=max_time, freq='1H')
    # A set removes localized timestamps that compare equal.
    time_list = sorted({tz.localize(stamp) for stamp in hourly})
    print(len(time_list))

    # !!! Be careful: per the original warning, create_table overwrites the
    # original table.
    create_table(target_meo_obj)
    interpolate_time(old_meo_obj, target_meo_obj, time_list, meo_features)
def main(config):
    """Aggregate raw sensor readings into hourly rows of ``config['NEW_TABLE']``.

    For every hourly window between ``config['START_TIME']`` and
    ``config['END_TIME']`` (localized to America/Los_Angeles) the rows of
    ``config['TABLE']`` are grouped by ``(sensor_id, channel)``.  Each field
    is reduced to an outlier-trimmed mean rounded to 5 decimals, and one
    record per group is stored with the window start as its timestamp.
    Windows without rows are skipped.

    Args:
        config: mapping with the keys ``START_TIME``, ``END_TIME``,
            ``TABLE`` and ``NEW_TABLE``.
    """
    start_time = config['START_TIME']
    end_time = config['END_TIME']
    tz = pytz.timezone('America/Los_Angeles')
    hourly = pd.date_range(start=start_time, end=end_time, freq='H')
    time_list = [tz.localize(stamp) for stamp in hourly]
    table_obj = config['TABLE']
    new_table_obj = config['NEW_TABLE']

    key_columns = ['sensor_id', 'channel']
    fields = [
        'pm1_atm', 'pm2_5_atm', 'pm10_atm',
        'pm1_cf_1', 'pm2_5_cf_1', 'pm10_cf_1',
        'p_0_3um_cnt', 'p_0_5um_cnt', 'p_1_0um_cnt',
        'p_2_5um_cnt', 'p_5um_cnt', 'p_10um_cnt',
        'rssi', 'temperature', 'humidity',
    ]

    def trimmed_mean(values):
        # Mean of the values lying within one standard deviation of the mean.
        # Out-of-range values are zeroed by the mask and every zero is then
        # turned into NaN, so genuine zero readings are excluded as well.
        center, spread = values.mean(skipna=True), values.std(skipna=True)
        lower, upper = center - spread, center + spread
        kept = ((values >= lower) & (values <= upper)) * values
        kept = kept.replace({0: np.nan})
        return kept.mean(skipna=True)

    for window_start, window_end in zip(time_list, time_list[1:]):
        window_query = session.query(table_obj).filter(
            table_obj.timestamp >= window_start,
            table_obj.timestamp < window_end)
        df = pd.read_sql(
            window_query.statement, session.bind)[key_columns + fields]
        if df.empty:
            continue

        agg_df = df.groupby(key_columns) \
            .apply(lambda group: trimmed_mean(group[fields])) \
            .round(5)
        agg_df = agg_df[fields].reset_index()
        # NaN becomes None before the values are handed to the table class.
        agg_df = agg_df.replace({np.nan: None})

        agg_data = [
            new_table_obj(
                sensor_id=row['sensor_id'],
                channel=row['channel'],
                timestamp=window_start,
                **{name: row[name] for name in fields})
            for _, row in agg_df.iterrows()
        ]
        session.add_all(agg_data)
        session.commit()

    print('Finish one table.')
def get_locations(location_table_obj):
    """Return every row of the location table's sensor / ThingSpeak columns.

    Each row holds, in order: ``sensor_id``, ``parent_id``, ``channel``,
    ``thingspeak_primary_id``, ``thingspeak_primary_id_read_key``,
    ``thingspeak_second_id`` and ``thingspeak_second_id_read_key``.
    """
    columns = (
        location_table_obj.sensor_id,
        location_table_obj.parent_id,
        location_table_obj.channel,
        location_table_obj.thingspeak_primary_id,
        location_table_obj.thingspeak_primary_id_read_key,
        location_table_obj.thingspeak_second_id,
        location_table_obj.thingspeak_second_id_read_key,
    )
    return session.query(*columns).all()
def load_geo_data(geo_obj, loc_type=None, loc_obj=None):
    """Load geographic features, optionally extended with lon / lat rows.

    Args:
        geo_obj: mapped class exposing ``gid``, ``geo_feature``,
            ``feature_type``, ``buffer_size`` and ``value``.
        loc_type: ``'station_id'`` or ``'gid'`` - the name of the key column
            of ``loc_obj``.  Any other value adds no location rows.
        loc_obj: mapped class exposing the key column plus ``lon`` and
            ``lat``; ignored when ``None``.

    Returns:
        ``(geo_df, geo_vector)``: the rows as a DataFrame with the columns
        ``GEO_COLUMN_SET``, and the result of ``construct_geo_vector(geo_df)``.
    """
    geo = session.query(geo_obj.gid, geo_obj.geo_feature, geo_obj.feature_type,
                        geo_obj.buffer_size, geo_obj.value).all()

    if loc_obj is not None and loc_type in ('station_id', 'gid'):
        # The 'gid' branch used to be a copy of the 'station_id' branch and
        # still selected ``loc_obj.station_id``; select the column that
        # ``loc_type`` actually names.
        key_column = getattr(loc_obj, loc_type)
        for axis, coord_column in (('lon', loc_obj.lon), ('lat', loc_obj.lat)):
            geo += session.query(key_column, literal('location'), literal(axis),
                                 literal(0), coord_column).all()

    geo_df = pd.DataFrame(geo, columns=GEO_COLUMN_SET)
    geo_vector = construct_geo_vector(geo_df)
    return geo_df, geo_vector
def request_target_locations(location_obj):
    """Fetch all target locations as a ``gid -> (lon, lat)`` mapping.

    Returns:
        A dict with the keys ``status``, ``msg`` and ``data``.  ``status`` is
        ``1`` with the mapping in ``data`` when rows exist; otherwise it is
        ``-1`` with an explanatory ``msg`` and an empty ``data`` dict.
    """
    rows = session.query(location_obj).with_entities(
        location_obj.gid, location_obj.lon, location_obj.lat).all()

    if not rows:
        return {'status': -1, 'msg': 'No target locations.', 'data': {}}

    loc_dict = {row.gid: (row.lon, row.lat) for row in rows}
    return {'status': 1, 'msg': '', 'data': loc_dict}
def generate_grids(config, area=None):
    """Generate a regular grid over the bounding box and build grid records.

    Each cell receives a row-number ``gid``, its centroid, the centroid's
    lon / lat, the cell geometry, and the centroid coordinates transformed
    to ``config['EPSG']``.  The ``GRID_OBJ`` records are instantiated but
    not persisted (the add/commit step is disabled).

    Args:
        config: mapping with the keys ``BOUNDING_BOX`` (WKT, used with SRID
            4326), ``GRID_OBJ``, ``RESOLUTION`` and ``EPSG``.
        area: unused.

    On any exception the error is printed and ``exit(-1)`` is called.
    """
    bounding_box = WKTElement(config['BOUNDING_BOX'], srid=4326)
    grid_obj = config['GRID_OBJ']
    resolution = config['RESOLUTION']
    epsg = config['EPSG']

    try:
        # makegrid_2d is a self-defined function in Psql.
        cells = func.ST_Dump(
            func.makegrid_2d(bounding_box, resolution, resolution))
        grids = session.query(cells.geom.label('geom')).subquery()

        # Cropping the grid with an area boundary (e.g. the Los Angeles
        # county boundary) is currently disabled.

        def centroid():
            return func.ST_Centroid(grids.c.geom)

        def projected_centroid():
            return func.ST_Transform(centroid(), epsg)

        results = session.query(
            func.row_number().over().label('gid'),
            centroid().label('centroid'),
            func.ST_X(centroid()).label('lon'),
            func.ST_Y(centroid()).label('lat'),
            grids.c.geom,
            func.ST_X(projected_centroid()).label('lon_proj'),
            func.ST_Y(projected_centroid()).label('lat_proj'),
        ).all()

        obj_results = [
            grid_obj(gid=res[0], centroid=res[1], lon=res[2], lat=res[3],
                     geom=res[4], lon_proj=res[5], lat_proj=res[6])
            for res in results
        ]
        # Persistence is currently disabled:
        # session.add_all(obj_results)
        # session.commit()
    except Exception as e:
        print(e)
        exit(-1)
def find_station_info():
    """Look up the known EPA stations of ``LosAngelesEPALocation``.

    Returns:
        ``None`` when the table has no rows; otherwise a tuple
        ``(station_dict, max_station_id)`` where ``station_dict`` maps
        ``(lon, lat)`` to ``station_id`` and ``max_station_id`` is the
        largest station id seen (never below 0).
    """
    station = LosAngelesEPALocation
    search_results = session.query(station) \
        .with_entities(station.station_id, station.lon, station.lat).all()

    if not search_results:
        return None

    # Later rows win when two stations share the same coordinates.
    station_dict = {(res.lon, res.lat): res.station_id
                    for res in search_results}
    max_station_id = max([0] + [res.station_id for res in search_results])
    return station_dict, max_station_id
def interpolate_time(old_obj, target_obj, time_list, features):
    """Linearly interpolate each location's features onto ``time_list``.

    For every distinct ``gid`` of ``old_obj`` the feature rows are indexed
    by timestamp, de-duplicated (first occurrence kept), right-joined onto
    the requested time points and linearly interpolated.  One ``target_obj``
    record per time point is added, with a commit after each location.

    Args:
        old_obj: mapped class exposing ``gid`` and ``timestamp``.
        target_obj: class instantiated with ``gid``, ``timestamp``, ``data``.
        time_list: the time points to produce.
        features: list of column names, passed to the query after
            ``old_obj.timestamp``.

    On any exception the error is printed and ``exit(-1)`` is called.
    """
    try:
        time_df = pd.DataFrame(
            time_list, columns=['timestamp']).set_index(['timestamp'])

        gid_rows = session.query(old_obj.gid).distinct(old_obj.gid).all()
        locations = sorted(row[0] for row in gid_rows)

        columns = ['timestamp'] + features
        for loc in locations:
            rows = session.query(old_obj.timestamp, *features) \
                .filter(old_obj.gid == loc) \
                .order_by(old_obj.timestamp).all()
            df = pd.DataFrame(rows, columns=columns).set_index(['timestamp'])

            # remove the potential duplicates in index
            is_duplicate = df.index.duplicated(keep='first')
            df = df.loc[~is_duplicate]

            # The right join keeps exactly the requested time points.
            df = df.join(time_df, how='right').sort_index()
            inter_data = df.interpolate(method='linear').reset_index()

            obj_results = []
            for timestamp, *values in inter_data.values.tolist():
                obj_results.append(
                    target_obj(gid=loc, timestamp=timestamp, data=values))
            session.add_all(obj_results)
            session.commit()

            print('Location {} has finished. {} records has been generated.'.
                  format(loc, len(inter_data)))
    except Exception as e:
        print(e)
        exit(-1)
def gen_meo_vector(meo_obj, time_list, grid_list):
    """Load weather data and construct the meo vector.

    Rows are ordered by timestamp, then gid, and reshaped without any check
    that every (time, grid) pair is present.

    Args:
        meo_obj: mapped class exposing ``gid``, ``timestamp`` and ``data``.
        time_list: time points; first and last bound the query (inclusive).
        grid_list: grid ids to load.

    Returns:
        meo_vector: (n_times, n_loc, n_meo_features)
    """
    first_time, last_time = time_list[0], time_list[-1]
    leading_shape = (len(time_list), len(grid_list))

    meo_query = session.query(meo_obj.data) \
        .filter(meo_obj.timestamp >= first_time) \
        .filter(meo_obj.timestamp <= last_time) \
        .filter(meo_obj.gid.in_(grid_list)) \
        .order_by(meo_obj.timestamp, meo_obj.gid)
    meo_rows = meo_query.all()

    # Feature count is taken from the first row's ``data`` value.
    n_meo_features = len(meo_rows[0][0])
    meo_vector = np.array(meo_rows).reshape(leading_shape + (n_meo_features,))
    print('The shape of meo vector = {}.'.format(meo_vector.shape))
    return meo_vector
def load_air_data(air_obj, start_time, end_time):
    """Load PM2.5 observations in ``[start_time, end_time)`` as time series.

    Args:
        air_obj: mapped class exposing ``station_id``, ``date_observed``,
            ``concentration`` and ``parameter_name``.
        start_time: inclusive lower bound on ``date_observed``.
        end_time: exclusive upper bound on ``date_observed``.

    Returns:
        ``None`` when no observation matches; otherwise the result of
        ``construct_time_series`` on the cleaned observations.
    """
    rows = session.query(
        air_obj.station_id, air_obj.date_observed, air_obj.concentration
    ).filter(
        air_obj.date_observed >= start_time,
        air_obj.date_observed < end_time,
        air_obj.parameter_name == 'PM2.5',
    ).all()
    if not rows:
        return None

    # remove duplicates (averaged per key / time) and non-positive values
    air_df = pd.DataFrame(rows, columns=AIR_COLUMN_SET)
    air_df = air_df.groupby(by=[KEY_COL, TIME_COL], as_index=False).mean()
    air_df = air_df[air_df[VALUE_COL] > 0]

    # remove the locations with too few observations: fewer than 60% of
    # time_diff(start_time, end_time)
    n_obs = air_df.groupby(KEY_COL).size().reset_index(name='n')
    min_obs = int(0.6 * time_diff(start_time, end_time))
    sparse_locations = list(n_obs[n_obs['n'] < min_obs][KEY_COL])
    air_df = air_df[~air_df[KEY_COL].isin(sparse_locations)]

    # construct timeseries
    return construct_time_series(air_df)
def main(input_obj):
    """Assemble label, dynamic and static matrices and save them as ``.npz``.

    Args:
        input_obj: mapping with the keys ``pm_obj``, ``meo_obj``,
            ``geo_obj``, ``geo_name_obj``, ``coord_obj`` (mapped classes),
            ``mapping_mat_file`` (an ``.npz`` file holding a ``mat`` array),
            ``min_time`` / ``max_time`` (period, end exclusive) and
            ``output_file`` (destination path).

    The archive contains ``label_mat``, ``dynamic_mat``, ``static_mat``,
    ``dynamic_features``, ``static_features`` and ``mapping_mat``.
    """
    pm_obj = input_obj['pm_obj']
    meo_obj = input_obj['meo_obj']
    geo_obj = input_obj['geo_obj']
    geo_name_obj = input_obj['geo_name_obj']
    coord_obj = input_obj['coord_obj']

    # load mapping matrix
    mapping_mat = np.load(input_obj['mapping_mat_file'])['mat']

    # load grids
    coord_df = pd.read_sql(
        session.query(coord_obj.gid, coord_obj.lon, coord_obj.lat).statement,
        session.bind)
    grid_list = list(coord_df['gid'])
    print('Number of grids = {}.'.format(len(grid_list)))

    # get time list
    min_time, max_time = input_obj['min_time'], input_obj['max_time']
    tz = pytz.timezone('America/Los_Angeles')
    # ``date_range(closed='left')`` was removed in pandas 2.0 (replaced by
    # ``inclusive``).  Dropping the end point explicitly gives the same
    # left-closed range on every pandas version.
    time_list = pd.date_range(start=min_time, end=max_time, freq='1H')
    time_list = time_list[time_list < pd.Timestamp(max_time)]
    # A set removes localized timestamps that compare equal.
    time_list = sorted({tz.localize(x) for x in time_list})
    print('Data from {} to {}.'.format(min_time, max_time))
    print('Number of time points = {}.'.format(len(time_list)))

    # generate label data
    print('...Generating label data...')
    label_mat = gen_label_mat(pm_obj, time_list, mapping_mat)

    # generate dynamic data
    print('...Generating dynamic data...')
    meo_vector = gen_meo_vector(meo_obj, time_list, grid_list)
    time_vector = gen_time_vector(time_list, grid_list)
    dynamic_vector = np.concatenate([meo_vector, time_vector], axis=-1)

    # convert to feature matrix:
    # (n_times, n_loc, n_features) => (n_times, n_features, n_loc)
    dynamic_mat = dynamic_vector.swapaxes(1, 2)
    dynamic_mat = gen_grid_data(dynamic_mat, grid_list, mapping_mat)
    print('The shape of dynamic matrix = {}.'.format(dynamic_mat.shape))

    # generate static data
    print('...Generating static data...')
    geo_vector, geo_name_list = gen_geo_vector(geo_obj, geo_name_obj, grid_list)
    geo_vector = geo_vector.reshape(1, geo_vector.shape[0], geo_vector.shape[1])

    # convert to feature matrix:
    # (1, n_loc, n_features) => (1, n_features, n_loc)
    static_mat = geo_vector.swapaxes(1, 2)
    static_mat = gen_grid_data(static_mat, grid_list, mapping_mat)
    print('The shape of static matrix = {}.'.format(static_mat.shape))

    # Static and dynamic features are stored separately, not combined.
    dynamic_features = np.array([
        'temperature', 'dew_point', 'humidity', 'pressure', 'wind_speed',
        'wind_direction', 'cloud_cover', 'visibility', 'hourofday',
        'dayofweek', 'day', 'month', 'dayofyear'
    ])
    np.savez_compressed(os.path.join(input_obj['output_file']),
                        label_mat=label_mat,
                        dynamic_mat=dynamic_mat,
                        static_mat=static_mat,
                        dynamic_features=dynamic_features,
                        static_features=np.array(geo_name_list),
                        mapping_mat=mapping_mat)
def compute_features_from_osm(config):
    """Compute per-grid geographic feature values from the OSM tables.

    Every table in ``config['OSM']`` is cropped to the bounding box and
    intersected with the grid cells.  The geometry type of the table's
    ``wkb_geometry`` column selects the measurement, grouped by grid and
    ``fclass``:

    * ``MULTIPOLYGON``    -> summed intersection area (``'area'``)
    * ``MULTILINESTRING`` -> summed intersection length (``'length'``)
    * ``POINT``           -> number of intersections (``'count'``)

    Other geometry types produce no results.  The ``GEO_FEATURE_OBJ``
    records are instantiated but not persisted (add/commit is disabled).

    Args:
        config: mapping with the keys ``OSM`` (feature name -> table),
            ``BOUNDING_BOX`` (WKT, used with SRID 4326), ``GRID_OBJ`` and
            ``GEO_FEATURE_OBJ``.

    On any exception the error is printed and ``exit(-1)`` is called.
    """
    osm_tables = config['OSM']
    bounding_box = WKTElement(config['BOUNDING_BOX'], srid=4326)
    grid_obj = config['GRID_OBJ']
    geo_feature_obj = config['GEO_FEATURE_OBJ']

    # geometry type -> (aggregate over the intersection, measurement label)
    aggregations = {
        'MULTIPOLYGON': (lambda col: func.SUM(func.ST_AREA(col)), 'area'),
        'MULTILINESTRING': (lambda col: func.SUM(func.ST_LENGTH(col)), 'length'),
        'POINT': (lambda col: func.COUNT(col), 'count'),
    }

    try:
        for feature_name, osm_table in osm_tables.items():
            geo_feature_type = osm_table.wkb_geometry.type.geometry_type
            # crop the OSM data with a bounding box
            cropped_osm = crop_osm(osm_table, bounding_box)

            intersection = func.ST_GeogFromWKB(
                func.ST_Intersection(grid_obj.geom, cropped_osm.c.wkb_geometry)
            ).label('intersection')
            sub_query = session.query(
                grid_obj.gid, cropped_osm.c.fclass, intersection) \
                .filter(func.ST_Intersects(grid_obj.geom,
                                           cropped_osm.c.wkb_geometry)) \
                .subquery()

            results = []
            if geo_feature_type in aggregations:
                aggregate, measurement = aggregations[geo_feature_type]
                results = session.query(
                    sub_query.c.gid.label('gid'),
                    sub_query.c.fclass.label('feature_type'),
                    literal(feature_name).label('geo_feature'),
                    aggregate(sub_query.c.intersection).label('value'),
                    literal(measurement).label('measurement')) \
                    .group_by(sub_query.c.gid, sub_query.c.fclass).all()

            obj_results = [
                geo_feature_obj(gid=res[0], feature_type=res[1],
                                geo_feature=res[2], value=res[3],
                                measurement=res[4])
                for res in results
            ]
            # Persistence is currently disabled:
            # session.add_all(obj_results)
            # session.commit()
            print('{} has finished'.format(feature_name))
    except Exception as e:
        print(e)
        exit(-1)
def gen_matrix(coord_obj):
    """Generate the matrix of grid ids laid out by position.

    Example of the result:

        mat = array([[6917, 6918, 6919, ..., 6990, 6991, 6992],
                     [6841, 6842, 6843, ..., 6914, 6915, 6916],
                     [6765, 6766, 6767, ..., 6838, 6839, 6840],
                     ...,
                     [153, 154, 155, ..., 226, 227, 228],
                     [77, 78, 79, ..., 150, 151, 152],
                     [1, 2, 3, ..., 74, 75, 76]])

    The bottom row starts at the smallest gid.  Each row is filled by
    following "right" neighbors, and the next row up starts at the "up"
    neighbor of the current row's first cell.

    Args:
        coord_obj: mapped class exposing ``gid``, ``lon`` and ``lat``.

    Returns:
        Array of shape (number of distinct latitudes, smallest number of
        longitudes found on any latitude), initialised with -1.
    """
    statement = session.query(
        coord_obj.gid, coord_obj.lon, coord_obj.lat).statement
    coord_df = pd.read_sql(statement, session.bind).round(10)

    coord_dict = {}
    for gid, lon, lat in coord_df.values.tolist():
        coord_dict[(lon, lat)] = int(gid)

    lat_list = sorted(coord_df['lat'].drop_duplicates())
    lat_dict = coord_df[['lon', 'lat']] \
        .groupby('lat')['lon'].apply(lambda lons: sorted(lons)).to_dict()
    n_rows = len(lat_list)
    n_cols = min(len(lons) for lons in lat_dict.values())

    # find neighbors ["left", "right", "up", "down"] for the gid
    neighbors = {}
    for _, row in coord_df.iterrows():
        gid, this_lon, this_lat = int(row['gid']), row['lon'], row['lat']
        neighbors[gid] = {
            'left': get_horizontal_neighbor(
                this_lon, this_lat, lat_dict, coord_dict, direction=-1),
            'right': get_horizontal_neighbor(
                this_lon, this_lat, lat_dict, coord_dict, direction=1),
            'up': get_vertical_neighbor(
                this_lon, this_lat, lat_list, lat_dict, coord_dict,
                direction=1),
            'down': get_vertical_neighbor(
                this_lon, this_lat, lat_list, lat_dict, coord_dict,
                direction=-1),
        }

    # convert neighbors to the matrix, filling from the bottom row upwards
    mat = np.full(([n_rows, n_cols]), -1)
    curr_gid = min(coord_dict.values())
    row_start = curr_gid
    last_col = n_cols - 1
    for i in reversed(range(n_rows)):
        for j in range(n_cols):
            mat[i][j] = curr_gid
            if j < last_col:
                curr_gid = neighbors[curr_gid]['right']
            else:
                # End of the row: continue above the row's first cell.
                curr_gid = neighbors[row_start]['up']
                row_start = curr_gid
    return mat
def main(pm_obj, coord_obj, method, **kwargs):
    """Split the PM2.5 locations into train / validation / test sets.

    Locations are kept when their number of observations in
    ``[min_time, max_time)`` exceeds 1% of the hourly time points of that
    period.  They are labelled according to ``method``, split with
    ``gen_train_val_test`` and shown in a scatter plot (train red,
    validation blue, test green).

    Args:
        pm_obj: mapped class exposing ``gid`` and ``timestamp``.
        coord_obj: mapped class exposing ``gid``, ``lon`` and ``lat``.
        method: ``'db_k_means'`` labels with ``gen_labels_with_db_k_means``;
            ``'k_means'`` is a placeholder that passes an empty label dict
            on; anything else labels with ``gen_labels_with_lon_lat``.

    Keyword Args:
        min_time: inclusive start of the period (default ``'2018-01-01'``).
        max_time: exclusive end of the period (default ``'2018-02-01'``).

    Returns:
        Three sorted lists: train, validation and test locations.
    """
    # compute the number of time points in the period
    min_time = kwargs.get('min_time', '2018-01-01')
    max_time = kwargs.get('max_time', '2018-02-01')
    # ``date_range(closed='left')`` was removed in pandas 2.0 (replaced by
    # ``inclusive``).  Dropping the end point explicitly gives the same
    # left-closed range on every pandas version.
    time_list = pd.date_range(start=min_time, end=max_time, freq='1H')
    time_list = time_list[time_list < pd.Timestamp(max_time)]

    # query all the pm locations that have a certain number of observations
    pm_locations = session.query(pm_obj.gid) \
        .filter(pm_obj.timestamp >= min_time) \
        .filter(pm_obj.timestamp < max_time) \
        .group_by(pm_obj.gid) \
        .having(func.count(pm_obj.gid) > 0.01 * len(time_list)).all()
    pm_locations = [row[0] for row in pm_locations]
    print('Number of pm2.5 locations = {}.'.format(len(pm_locations)))

    # query the coordinates of the locations
    coord_df = pd.read_sql(
        session.query(coord_obj.gid, coord_obj.lon, coord_obj.lat).statement,
        session.bind)
    grid_coordinates = coord_df[coord_df['gid'].isin(pm_locations)]
    grid_list = list(grid_coordinates['gid'])
    coordinate_arr = grid_coordinates[['lon', 'lat']].values
    location_set = LocationSet(grid_list, coordinate_arr)

    if method == 'db_k_means':
        label_dict = gen_labels_with_db_k_means(location_set)
    elif method == 'k_means':
        # Not implemented: an empty label dict is passed on.
        label_dict = {}
    else:
        label_dict = gen_labels_with_lon_lat(location_set)

    train_loc, val_loc, test_loc = gen_train_val_test(label_dict)

    if method == 'db_k_means':
        # finally get the train, val, and test locations for the original
        # locations: each split entry's ``gid`` is a '_'-joined id string
        def extract_loc(input_loc):
            output_loc = []
            for this_loc in input_loc:
                output_loc += [int(i) for i in str(this_loc.gid).split('_')]
            return output_loc

        train_loc = extract_loc(train_loc)
        val_loc = extract_loc(val_loc)
        test_loc = extract_loc(test_loc)

    print(len(train_loc), train_loc)
    print(len(val_loc), val_loc)
    print(len(test_loc), test_loc)

    # plot final results
    lons, lats, colors = [], [], []
    for locs, color in ((train_loc, 'r'), (val_loc, 'b'), (test_loc, 'g')):
        lons += [location_set.location_dict[i].lon for i in locs]
        lats += [location_set.location_dict[i].lat for i in locs]
        colors += [color] * len(locs)
    plt.scatter(lons, lats, c=colors)
    plt.show()

    return sorted(train_loc), sorted(val_loc), sorted(test_loc)