def foshan_data(site, start_date, end_date=None, gas='pm2d5'):
    '''Load fixed sensor data collected from Foshan.

    Parameters
    ----------
    site: string or list
        The station name of queried stations like "体育馆岗亭", or a list of such.
        Specifically, if given 'ALL', all available sites will be used.
    start_date: string
        The start time (inclusive) like "2018-11-01".
    end_date: string, default: same as start_date
        The end time (inclusive) like "2018-11-30".
    gas: string or list, default: 'pm2d5'
        The gas type to be collected.

    Return
    ------
    DataPack
        The loaded data that can be used by apdt.
    '''
    # Parameter check
    data_path = _config['foshan_data_path']
    air = gas  # To Do: support a list of gas types.
    if site == 'ALL':
        site = sorted(os.listdir(data_path))
    elif type(site) is str:
        site = [site]
    site = [x if x.endswith('.xls') else x + '.xls' for x in site]
    if start_date < '2018-11-01' or start_date > '2018-11-30':
        raise Exception("Foshan data range between '2018-11-01'-'2018-11-30'")
    if end_date is None:
        end_date = start_date
    if end_date < '2018-11-01' or end_date > '2018-11-30':
        raise Exception("Foshan data range between '2018-11-01'-'2018-11-30'")
    if end_date < start_date:
        raise Exception('end_date should not be earlier than start_date.')

    datalist = []
    location = []
    for id, target in enumerate(site):
        data = pd.read_excel(data_path + target, index_col=0)
        dat = data[[air] + ['lon', 'lat']].dropna()
        dat = dat.reset_index().rename(columns={
            'index': 'datetime',
            air: 'data0',
            'lon': 'lon',
            'lat': 'lat'
        }).set_index('datetime')
        # Align to a 1-minute grid and clip to the requested date range.
        dat = dat.resample("T").asfreq()
        dat = dat[dat.index >= start_date + ' 00:00:00']
        dat = dat[dat.index <= end_date + ' 23:59:59']
        # nanidx = dat.index[np.isnan(dat[air])]
        dat = dat.dropna()
        dat = dat.resample("T").bfill()
        # dat["isnan"] = 0
        # dat["isnan"].loc[nanidx] = 1
        dat['site_id'] = 'foshan' + str(id)
        dat['site_name'] = target.split('.')[0]
        dat["date"] = dat.index.map(lambda x: x.date())
        datalist.append(dat)
        location.append([
            'foshan' + str(id),
            target.split('.')[0], dat['lon'][0], dat['lat'][0]
        ])
    datalist = pd.concat(datalist)
    location = pd.DataFrame(location,
                            columns=['site_id', 'site_name', 'lon', 'lat'])

    artifact = DataPack()
    artifact.raw_data = datalist
    artifact.data = datalist.copy()
    artifact.site_info = location
    artifact.data_type = gas
    artifact.sample_unit = 'M'
    artifact.tag.append('fixed-location')
    artifact.tag.append('time-aligned')
    return artifact
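
# Usage sketch (illustrative, not part of the original module): load one day of
# PM2.5 from a single Foshan station. The station name is an example from the
# docstring, and the call assumes _config['foshan_data_path'] points at the
# directory of per-site .xls files.
def _example_foshan_data():
    pack = foshan_data(site='体育馆岗亭', start_date='2018-11-01')
    # Minute-resolution readings, one row per timestamp, with site metadata.
    print(pack.data.head())
    # Station table: site_id, site_name, lon, lat.
    print(pack.site_info)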
def load_nms(site, start_date, end_date=None, gas='pm2d5', par=True):
    '''Load national monitoring stations (NMS) data.

    Parameters
    ----------
    site: string or list
        The station ID of queried stations like "1001A", or a list of such.
        Specifically, if given 'ALL', all available sites will be used.
    start_date: string
        The start time (inclusive) like "2017-01-01".
    end_date: string, default: same as start_date
        The end time (inclusive) like "2018-12-31".
    gas: string or list, default: 'pm2d5'
        The gas type to be collected.
    par: bool, default True
        Whether to use parallel processing.

    Return
    ------
    DataPack
        The loaded data that can be used by apdt.
    '''
    worker_num = _config['subthread_max_num']
    data_path = _config['nms_data_path']
    location = pd.read_csv(data_path + 'site_location.csv')
    if site == 'ALL':
        site = list(location['监测点编码'])
    elif type(site) is str:
        site = [site]
    gas = {'pm2d5': 'PM2.5', 'pm10': 'PM10', 'aqi': 'AQI', 'so2': 'SO2',
           'no2': 'NO2', 'o3': 'O3', 'co': 'CO', 'co2': 'CO2'}[gas]
    if end_date is None:
        end_date = start_date
    location = location[[
        location.iloc[i, 0] in site for i in range(location.shape[0])
    ]]
    location['监测点名称'] = location['城市'] + '-' + location['监测点名称']
    location = location.rename(columns={
        '监测点编码': 'site_id',
        '监测点名称': 'site_name',
        '经度': 'lon',
        '纬度': 'lat'
    })
    location = location[['site_id', 'site_name', 'lat', 'lon']]

    # Collect from file
    start_time = time.mktime((time.strptime(start_date + ' 12:00:00',
                                            '%Y-%m-%d %H:%M:%S')))
    end_time = time.mktime((time.strptime(end_date + ' 12:00:00',
                                          '%Y-%m-%d %H:%M:%S')))
    template = pd.DataFrame({'hour': [i for i in range(24)]})

    def worker(time_now, end_time):
        # Each worker walks its assigned time range one daily CSV at a time.
        data_list = []
        while time_now < end_time:
            file = data_path + 'site' + \
                str(time.gmtime(time_now).tm_year) + \
                '/china_sites_' + \
                time.strftime('%Y%m%d', time.gmtime(time_now)) + '.csv'
            try:
                data = pd.read_csv(file)
                data = data.reindex(
                    columns=list(set(site).union(set(data.columns))))
                data = data.loc[data['type'] == gas][['hour'] + site]
                data = template.merge(data, on='hour', how='left')
                data['hour'] = time.strftime('%Y%m%d', time.gmtime(
                    time_now)) + ' ' + data['hour'].astype(str) + ':00:00'
                data['hour'] = pd.to_datetime(data['hour'])
                data = data.rename(columns={
                    'hour': 'datetime'
                }).set_index('datetime')
                data = data.stack(dropna=False).reset_index(
                    level=1, drop=False).rename(columns={
                        'level_1': 'site_id',
                        0: 'data0'
                    })
                data_list.append(data)
            except:
                pass
            time_now += 24 * 3600
        return data_list

    if par:
        workers = []
        data_list = []
        time_list = [start_time] + [
            int((end_time - start_time) / 3600 / 24 / worker_num) * k * 24 *
            3600 + start_time for k in range(1, worker_num)
        ] + [end_time + 1]
        for i in range(worker_num):
            workers.append(SubThread(worker, (time_list[i], time_list[i + 1])))
            workers[-1].start()
        for i in range(worker_num):
            workers[i].join()
            data_list = data_list + workers[i].get_result()
    else:
        data_list = worker(start_time, end_time)

    data_list = pd.concat(data_list)
    data_list = data_list.reset_index().merge(
        location, on='site_id', how='left').set_index('datetime')
    data_list = data_list.dropna(subset=['lat', 'lon'])
    location = location.dropna(subset=['lat', 'lon'])

    artifact = DataPack()
    artifact.raw_data = data_list
    artifact.data = data_list.copy()
    artifact.site_info = location
    artifact.data_type = [gas]
    artifact.sample_unit = 'H'
    artifact.tag.append('fixed-location')
    return artifact
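
# Usage sketch (illustrative, not part of the original module): load a week of
# hourly PM2.5 for two NMS stations. The station IDs are examples, and the call
# assumes _config['nms_data_path'] points at the per-year china_sites_*.csv
# archives described above.
def _example_load_nms():
    pack = load_nms(site=['1001A', '1002A'],
                    start_date='2017-01-01',
                    end_date='2017-01-07',
                    gas='pm2d5',
                    par=False)  # par=False avoids spawning SubThread workers.
    # Hourly rows indexed by datetime, stacked over the requested stations.
    print(pack.data.head())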
def load_weather(site, start_date, end_date=None, feature='temperature',
                 **kwarg):
    '''Load national monitoring stations (NMS) weather data.

    Parameters
    ----------
    site: string or list
        The station ID of queried stations like "1001A", or a list of such.
    start_date: string
        The start time (inclusive) like "2017-01-01".
    end_date: string, default: same as start_date
        The end time (inclusive) like "2018-12-31".
    feature: string or list, default: 'temperature'
        The data type to be collected.

    Return
    ------
    DataPack
        The loaded data that can be used by apdt.

    To Do
    -----
    Deal with alias.
    Deal with unit transform.
    Deal with data download.
    Deal with coordinates input.
    Deal with multi-thread.
    Deal with site='ALL'.
    '''
    if 'enable_warning' not in kwarg.keys():
        kwarg['enable_warning'] = True
    if 'warning_threshold' not in kwarg.keys():
        kwarg['warning_threshold'] = 0.2

    data_path = _config['nms_data_path']
    location = pd.read_csv(data_path + 'site_location.csv')
    if site == 'ALL':
        # site = list(location['监测点编码'])
        print('all site not supported yet')
        return None
    elif type(site) is str:
        site = [site]
    if end_date is None:
        end_date = start_date
    if type(feature) is str:
        feature = [feature]

    # Map user-facing aliases (English and pinyin) onto canonical feature names.
    alia_dict = {}
    alia_dict.update(
        {x: 'temperature' for x in ['temperature', 'temp', 'tmp', 'wendu']})
    alia_dict.update(
        {x: 'humidity' for x in ['humidity', 'hmd', 'hum', 'shidu']})
    alia_dict.update(
        {x: 'windSpeed' for x in ['windSpeed', 'speed', 'spd', 'fengsu']})
    alia_dict.update({
        x: 'windBearing'
        for x in ['windBearing', 'direction', 'angel', 'fengxiang']
    })
    alia_dict.update(
        {x: 'visibility' for x in ['visibility', 'kejiandu', 'keshidu']})
    alia_dict.update(
        {x: 'pressure' for x in ['pressure', 'press', 'yali', 'qiya']})
    for i, x in enumerate(feature):
        if x not in alia_dict.keys():
            raise Exception(x + ' is not supported.')
        feature[i] = alia_dict[x]

    location = location[[
        location.iloc[i, 0] in site for i in range(location.shape[0])
    ]]
    location['监测点名称'] = location['城市'] + '-' + location['监测点名称']
    location = location.rename(columns={
        '监测点编码': 'site_id',
        '监测点名称': 'site_name',
        '经度': 'lon',
        '纬度': 'lat'
    })
    location = location[['site_id', 'site_name', 'lat', 'lon']]

    data_bag = []
    for site_id in site:
        # Collect from file
        start_time = time.mktime((time.strptime(start_date + ' 12:00:00',
                                                '%Y-%m-%d %H:%M:%S')))
        end_time = time.mktime((time.strptime(end_date + ' 12:00:00',
                                              '%Y-%m-%d %H:%M:%S')))
        time_now = start_time
        stamp_bias = time.mktime((time.strptime(start_date + ' 00:00:00',
                                                '%Y-%m-%d %H:%M:%S')))
        data_list = [[] for _ in range(len(feature))]
        time_stamp = [[] for _ in range(len(feature))]
        while time_now <= end_time:
            file = 'data/weather/' + site_id + '/' + time.strftime(
                '%Y-%m-%d', time.gmtime(time_now)) + '.json'
            with open(file) as f:
                data = json.load(f)
            if 'hourly' in data.keys():
                for k in range(len(data['hourly']['data'])):
                    _stamp = int(
                        (data['hourly']['data'][k]['time'] - stamp_bias) /
                        3600)
                    _data = data['hourly']['data'][k]
                    for m in range(len(feature)):
                        try:
                            data_list[m].append(_data[feature[m]])
                            time_stamp[m].append(_stamp)
                        except:
                            pass
            time_now += 24 * 3600

        # Linear interpolation onto a regular hourly grid.
        Length = int((end_time - start_time) // 3600 + 24)
        data_ratio = np.array(
            [len(data_list[m]) for m in range(len(feature))]) / Length
        if np.any(data_ratio <
                  kwarg['warning_threshold']) and kwarg['enable_warning']:
            print(__file__ + ' Runtime Warning: ')
            print(
                'Too many missing values in site ' + site_id +
                ' weather data, the returned interpolated data may not be meaningful: ',
                data_ratio)
        data_interp = []
        for i in range(len(feature)):
            if len(data_list[i]) > 0:
                data_interp.append(
                    np.interp(np.arange(Length), time_stamp[i], data_list[i]))
            else:
                data_interp.append(np.zeros(Length))
        data = np.stack(data_interp, -1)
        data_table = pd.DataFrame(
            data=data,
            columns=['data' + str(i) for i in range(len(feature))])
        data_table['site_id'] = site_id
        data_table['datetime'] = pd.date_range(start=start_date,
                                               periods=Length,
                                               freq='H')
        data_bag.append(data_table)

    data_bag = pd.concat(data_bag)
    data_bag = data_bag.merge(location, on='site_id',
                              how='left').set_index('datetime')
    data_bag = data_bag.dropna(subset=['lat', 'lon'])
    location = location.dropna(subset=['lat', 'lon'])

    artifact = DataPack()
    artifact.raw_data = data_bag
    artifact.data = data_bag.copy()
    artifact.site_info = location
    artifact.data_type = feature
    artifact.sample_unit = 'H'
    artifact.tag.append('fixed-location')
    artifact.tag.append('time-aligned')
    return artifact
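
# Usage sketch (illustrative, not part of the original module): load temperature
# and humidity for one station, silencing the missing-data warning. Assumes the
# per-day JSON files exist under data/weather/<site_id>/ as read by load_weather.
def _example_load_weather():
    pack = load_weather(site='1001A',
                        start_date='2017-01-01',
                        end_date='2017-01-07',
                        feature=['temperature', 'humidity'],
                        enable_warning=False)
    # data0 / data1 hold the interpolated hourly temperature and humidity series.
    print(pack.data.head())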
def gp_data(time_length,
            site_num,
            dimension=1,
            kernel_weight=None,
            noise_level=None,
            seed=None,
            freq='H'):
    '''Generate fake PM2.5-like data for a quick start using a Gaussian process.

    Parameters
    ----------
    - time_length: int. How many hours the data should cover.
    - site_num: int. How many sites the data should have.
    - dimension: int, default 1. How many independent samples each ST-point has.
    - kernel_weight: list of three float numbers, default [1.0, 1.0, 1.0], the
        relative variance of three components: long-term trend, short-term
        fluctuation and periodic wave; or a list of six float numbers, where the
        additional 3 are treated as the length_scale parameters of the three
        kernels (0-1, default 1).
    - noise_level: float, default 0.01. The white noise added to the kernel;
        note that this is necessary for long-time generation.
    - seed: int. The random seed.
    - freq: str. The frequency of the time stamp. 'H' for hours (default) and
        'T' for minutes.

    Return
    ------
    DataPack

    Issue
    -----
    '''
    if seed is not None:
        np.random.seed(seed)
    if kernel_weight is None:
        kernel_weight = [[1.0, 1.0, 1.0]]
    if np.array(kernel_weight).ndim == 1:
        kernel_weight = [kernel_weight]
    if noise_level is None:
        noise_level = 0.01
    if len(kernel_weight[0]) == 3:
        kernel_weight = [
            kernel_weight[i] + [1.0, 1.0, 1.0]
            for i in range(len(kernel_weight))
        ]

    # Decrease this number if this program gets stuck.
    generation_step = 1000
    xs = np.arange(generation_step * 2).reshape((generation_step * 2, 1))
    if len(kernel_weight) == 1:
        # Case: only one parameter set is given; site_num is treated as a new dimension.
        k1 = gp.kernels.RBF(length_scale=50.0 + 100.0 * kernel_weight[0][3])
        k2 = gp.kernels.Matern(length_scale=20.0 + 10.0 * kernel_weight[0][4],
                               nu=0.5)
        k3 = gp.kernels.ExpSineSquared(length_scale=1,
                                       periodicity=100 +
                                       200 * kernel_weight[0][5])
        kw = gp.kernels.WhiteKernel(noise_level=noise_level)
        k = kernel_weight[0][0] * k1 + kernel_weight[0][
            1] * k2 + kernel_weight[0][2] * k3 + kw
        C = k(xs)
        C_11 = C[:generation_step, :generation_step]
        C_11_inv = np.linalg.inv(C_11)
        C_21 = C[generation_step:, :generation_step]
        u, s, _ = np.linalg.svd(C_11)
        us = np.matmul(u, np.diag(np.sqrt(s)))
        sample = np.matmul(
            us, np.random.randn(generation_step, site_num * dimension))
        C_cond = C_11 - np.matmul(np.matmul(C_21.T, C_11_inv), C_21)
        u, s, _ = np.linalg.svd(C_cond)
        us = np.matmul(u, np.diag(np.sqrt(s)))
        time_now = generation_step
        sample_list = [sample]
        # Extend the series block by block by sampling from the conditional
        # Gaussian given the previous block.
        while time_now < time_length:
            mu_cond = 0 + np.matmul(np.matmul(C_21, C_11_inv), sample)
            sample = np.matmul(
                us, np.random.randn(generation_step, site_num * dimension))
            sample = sample + mu_cond
            sample_list.append(sample)
            time_now = time_now + generation_step
        sample_list = np.concatenate(sample_list)[:time_length].reshape(
            (time_length * site_num, dimension))
    else:
        # Case: a list of parameter sets is given; generate site_num samples one by one.
        sample_list_all = []
        for i in range(site_num):
            k1 = gp.kernels.RBF(length_scale=50.0 +
                                100.0 * kernel_weight[i][3])
            k2 = gp.kernels.Matern(length_scale=20.0 +
                                   10.0 * kernel_weight[i][4],
                                   nu=0.5)
            k3 = gp.kernels.ExpSineSquared(length_scale=1,
                                           periodicity=100 +
                                           200 * kernel_weight[i][5])
            kw = gp.kernels.WhiteKernel(noise_level=noise_level)
            k = kernel_weight[i][0] * k1 + kernel_weight[i][
                1] * k2 + kernel_weight[i][2] * k3 + kw
            C = k(xs)
            C_11 = C[:generation_step, :generation_step]
            C_11_inv = np.linalg.inv(C_11)
            C_21 = C[generation_step:, :generation_step]
            u, s, _ = np.linalg.svd(C_11)
            us = np.matmul(u, np.diag(np.sqrt(s)))
            sample = np.matmul(
                us, np.random.randn(generation_step, 1 * dimension))
            C_cond = C_11 - np.matmul(np.matmul(C_21.T, C_11_inv), C_21)
            u, s, _ = np.linalg.svd(C_cond)
            us = np.matmul(u, np.diag(np.sqrt(s)))
            time_now = generation_step
            sample_list = [sample]
            while time_now < time_length:
                mu_cond = 0 + np.matmul(np.matmul(C_21, C_11_inv), sample)
                sample = np.matmul(
                    us, np.random.randn(generation_step, 1 * dimension))
                sample = sample + mu_cond
                sample_list.append(sample)
                time_now = time_now + generation_step
            sample_list = np.concatenate(sample_list)[:time_length].reshape(
                (time_length * 1, dimension))
            sample_list_all.append(sample_list)
        sample_list = np.stack(sample_list_all, 1).reshape(
            (time_length * site_num, dimension))

    datetime_list = pd.date_range(start='2000-1-1',
                                  periods=time_length,
                                  freq=freq)
    site_name_list = ['virtual_site' + str(i) for i in range(site_num)]
    idx = pd.MultiIndex.from_product([datetime_list, site_name_list],
                                     names=('datetime', 'site_name'))
    data = pd.DataFrame(index=idx,
                        columns=['data' + str(i) for i in range(dimension)],
                        data=sample_list)
    data = data.reset_index()
    site_list = pd.DataFrame(data=np.random.randn(site_num, 2) +
                             np.array([39.8673, 116.3660]),
                             columns=['lat', 'lon'])
    site_list['site_name'] = site_name_list
    site_list['site_id'] = ['V' + str(i).zfill(4) for i in range(site_num)]
    data = data.merge(site_list, how='left', on='site_name')
    data = data.set_index('datetime')

    datapack = DataPack()
    datapack.raw_data = data
    datapack.data = data.copy()
    datapack.site_info = site_list
    datapack.data_type = ['virtual_type_' + str(i) for i in range(dimension)]
    datapack.sample_unit = freq
    datapack.tag.append('fixed-location')
    datapack.tag.append('time-aligned')
    datapack.time_length = time_length
    datapack.site_num = site_num
    return datapack
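
# Usage sketch (illustrative, not part of the original module): generate two
# weeks of hourly data for 5 virtual sites with a fixed seed, emphasising the
# periodic component via the third kernel weight.
def _example_gp_data():
    pack = gp_data(time_length=24 * 14,
                   site_num=5,
                   kernel_weight=[1.0, 0.5, 2.0],
                   seed=0)
    # One row per (datetime, site); data0 is the simulated PM2.5-like series.
    print(pack.data.head())
    print(pack.site_info)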