Example #1
import os

import pandas as pd

# _config and DataPack are module-level names defined elsewhere in the apdt package.
def foshan_data(site, start_date, end_date=None, gas='pm2d5'):
    '''Load fixed sensor data collected from Foshan.
    Parameters
    ----------
        site: string or list
            The station name of the queried station, like "体育馆岗亭", or a list of such names. Specifically, if given 'ALL', all available sites will be used.
        start_date: string
            The start date (inclusive), like "2018-11-01".
        end_date: string, default: same as start_date
            The end date (inclusive), like "2018-11-30".
        gas: string or list, default: 'pm2d5'
            The gas type to be collected.
    Returns
    -------
        DataPack
            The loaded data that can be used by apdt.
    '''
    # Parameter check
    data_path = _config['foshan_data_path']
    air = gas  # TODO: support a list of gas types.

    if site == 'ALL':
        site = sorted(os.listdir(data_path))
    elif type(site) is str:
        site = [site]
    site = [x if x.endswith('.xls') else x + '.xls' for x in site]

    if start_date < '2018-11-01' or start_date > '2018-11-30':
        raise Exception("Foshan data ranges from '2018-11-01' to '2018-11-30'.")
    if end_date is None:
        end_date = start_date
    if end_date < '2018-11-01' or end_date > '2018-11-30':
        raise Exception("Foshan data ranges from '2018-11-01' to '2018-11-30'.")
    if end_date < start_date:
        raise Exception('end_date should not be earlier than start_date.')

    datalist = []
    location = []

    for site_idx, target in enumerate(site):
        data = pd.read_excel(data_path + target, index_col=0)
        dat = data[[air] + ['lon', 'lat']].dropna()
        dat = dat.reset_index().rename(columns={
            'index': 'datetime',
            air: 'data0'
        }).set_index('datetime')
        dat = dat.resample("T").asfreq()
        dat = dat[dat.index >= start_date + ' 00:00:00']
        dat = dat[dat.index <= end_date + ' 23:59:59']
        dat = dat.dropna()
        dat = dat.resample("T").bfill()
        # dat["isnan"]=0
        # dat["isnan"].loc[nanidx]=1
        dat['site_id'] = 'foshan' + str(site_idx)
        dat['site_name'] = target.split('.')[0]
        dat["date"] = dat.index.map(lambda x: x.date())

        datalist.append(dat)
        location.append([
            'foshan' + str(site_idx),
            target.split('.')[0], dat['lon'][0], dat['lat'][0]
        ])

    datalist = pd.concat(datalist)
    location = pd.DataFrame(location,
                            columns=['site_id', 'site_name', 'lon', 'lat'])
    artifact = DataPack()
    artifact.raw_data = datalist
    artifact.data = datalist.copy()
    artifact.site_info = location
    artifact.data_type = gas
    artifact.sample_unit = 'M'
    artifact.tag.append('fixed-location')
    artifact.tag.append('time-aligned')
    return artifact
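A minimal usage sketch (not part of apdt itself): it assumes apdt is configured so that _config['foshan_data_path'] points at a directory of per-station .xls files, and reuses the station name from the docstring.

pack = foshan_data('体育馆岗亭', '2018-11-01', end_date='2018-11-07')
print(pack.site_info)             # one row per station: site_id, site_name, lon, lat
print(pack.data['data0'].head())  # minute-resolution pm2d5 readings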
Example #2
import time

import pandas as pd

# _config, DataPack and SubThread are module-level names defined elsewhere in apdt.
def load_nms(site, start_date, end_date=None, gas='pm2d5', par=True):
    '''Load national monitoring stations (NMS) data.
    Parameters
    ----------
        site: string or list
            The station ID of the queried station, like "1001A", or a list of such IDs. Specifically, if given 'ALL', all available sites will be used.
        start_date: string
            The start date (inclusive), like "2017-01-01".
        end_date: string, default: same as start_date
            The end date (inclusive), like "2018-12-31".
        gas: string or list, default: 'pm2d5'
            The gas type to be collected.
        par: bool, default True
            Whether to use parallel processing.
    Returns
    -------
        DataPack
            The loaded data that can be used by apdt.
    '''
    worker_num = _config['subthread_max_num']
    data_path = _config['nms_data_path']
    location = pd.read_csv(data_path + 'site_location.csv')

    if site == 'ALL':
        site = list(location['监测点编码'])
    elif type(site) is str:
        site = [site]
    gas = {'pm2d5': 'PM2.5', 'pm10': 'PM10', 'aqi': 'AQI', 'so2': 'SO2',
           'no2': 'NO2', 'o3': 'O3', 'co': 'CO', 'co2': 'CO2'}[gas]
    if end_date is None:
        end_date = start_date

    location = location[location['监测点编码'].isin(site)]
    location['监测点名称'] = location['城市'] + '-' + location['监测点名称']
    location = location.rename(columns={
        '监测点编码': 'site_id',
        '监测点名称': 'site_name',
        '经度': 'lon',
        '纬度': 'lat'
    })
    location = location[['site_id', 'site_name', 'lat', 'lon']]

    # Collect from file
    start_time = time.mktime((time.strptime(start_date + ' 12:00:00',
                                            '%Y-%m-%d %H:%M:%S')))
    end_time = time.mktime((time.strptime(end_date + ' 12:00:00',
                                          '%Y-%m-%d %H:%M:%S')))
    template = pd.DataFrame({'hour': [i for i in range(24)]})

    def worker(time_now, end_time):
        data_list = []
        while time_now < end_time:
            file = data_path + 'site' + \
                    str(time.gmtime(time_now).tm_year) + \
                    '/china_sites_' + \
                    time.strftime('%Y%m%d', time.gmtime(time_now)) + '.csv'
            try:
                data = pd.read_csv(file)
                data = data.reindex(
                    columns=list(set(site).union(set(data.columns))))
                data = data.loc[data['type'] == gas][['hour'] + site]
                data = template.merge(data, on='hour', how='left')
                data['hour'] = time.strftime('%Y%m%d', time.gmtime(
                    time_now)) + ' ' + data['hour'].astype(str) + ':00:00'
                data['hour'] = pd.to_datetime(data['hour'])
                data = data.rename(columns={
                    'hour': 'datetime'
                }).set_index('datetime')
                data = data.stack(dropna=False).reset_index(
                    level=1, drop=False).rename(columns={
                        'level_1': 'site_id',
                        0: 'data0'
                    })
                data_list.append(data)
            except Exception:
                # Skip days whose data file is missing or malformed.
                pass
            time_now += 24 * 3600
        return data_list

    if par:
        workers = []
        data_list = []
        time_list = [start_time] + [
            int((end_time - start_time) / 3600 / 24 / worker_num) * k * 24 *
            3600 + start_time for k in range(1, worker_num)
        ] + [end_time + 1]
        for i in range(worker_num):
            workers.append(SubThread(worker, (time_list[i], time_list[i + 1])))
            workers[-1].start()
        for i in range(worker_num):
            workers[i].join()
            data_list = data_list + workers[i].get_result()
    else:
        data_list = worker(start_time, end_time)
    data_list = pd.concat(data_list)
    data_list = data_list.reset_index().merge(location,
                                              on='site_id',
                                              how='left').set_index('datetime')
    data_list = data_list.dropna(subset=['lat', 'lon'])
    location = location.dropna(subset=['lat', 'lon'])

    artifact = DataPack()
    artifact.raw_data = data_list
    artifact.data = data_list.copy()
    artifact.site_info = location
    artifact.data_type = [gas]
    artifact.sample_unit = 'H'
    artifact.tag.append('fixed-location')

    return artifact
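A minimal usage sketch, assuming _config['nms_data_path'] contains site_location.csv and the per-year site<YYYY>/china_sites_<YYYYMMDD>.csv files; the station IDs here are illustrative.

pack = load_nms(['1001A', '1002A'], '2017-01-01', end_date='2017-01-31', par=False)
print(pack.sample_unit)  # 'H': hourly samples
print(pack.data.head())  # long format: one row per (datetime, site_id) pair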
Example #3
import json
import time

import numpy as np
import pandas as pd

# _config and DataPack are module-level names defined elsewhere in apdt.
def load_weather(site,
                 start_date,
                 end_date=None,
                 feature='temperature',
                 **kwarg):
    '''Load national monitoring stations (NMS) weather data.
    Parameters
    ----------
        site: string or list
            The station ID of the queried station, like "1001A", or a list of such IDs.
        start_date: string
            The start date (inclusive), like "2017-01-01".
        end_date: string, default: same as start_date
            The end date (inclusive), like "2018-12-31".
        feature: string or list, default: 'temperature'
            The data type to be collected.
    Returns
    -------
        DataPack
            The loaded data that can be used by apdt.
    To Do
    -----
        Deal with aliases.
        Deal with unit transforms.
        Deal with data download.
        Deal with coordinate input.
        Deal with multi-threading.
        Deal with site='ALL'.
    '''

    kwarg.setdefault('enable_warning', True)
    kwarg.setdefault('warning_threshold', 0.2)

    data_path = _config['nms_data_path']
    location = pd.read_csv(data_path + 'site_location.csv')

    if site == 'ALL':
        # site = list(location['监测点编码'])
        print("site='ALL' is not supported yet.")
        return None
    elif type(site) is str:
        site = [site]
    if end_date is None:
        end_date = start_date
    if type(feature) is str:
        feature = [feature]

    alia_dict = {}
    alia_dict.update(
        {x: 'temperature'
         for x in ['temperature', 'temp', 'tmp', 'wendu']})
    alia_dict.update(
        {x: 'humidity'
         for x in ['humidity', 'hmd', 'hum', 'shidu']})
    alia_dict.update(
        {x: 'windSpeed'
         for x in ['windSpeed', 'speed', 'spd', 'fengsu']})
    alia_dict.update({
        x: 'windBearing'
        # 'angel' is kept as a historical alias alongside the intended 'angle'.
        for x in ['windBearing', 'direction', 'angle', 'angel', 'fengxiang']
    })
    alia_dict.update(
        {x: 'visibility'
         for x in ['visibility', 'kejiandu', 'keshidu']})
    alia_dict.update(
        {x: 'pressure'
         for x in ['pressure', 'press', 'yali', 'qiya']})
    for i, x in enumerate(feature):
        if x not in alia_dict.keys():
            raise Exception(x + ' is not supported.')
        feature[i] = alia_dict[x]

    location = location[location['监测点编码'].isin(site)]
    location['监测点名称'] = location['城市'] + '-' + location['监测点名称']
    location = location.rename(columns={
        '监测点编码': 'site_id',
        '监测点名称': 'site_name',
        '经度': 'lon',
        '纬度': 'lat'
    })
    location = location[['site_id', 'site_name', 'lat', 'lon']]

    data_bag = []
    for site_id in site:
        # Collect from file
        start_time = time.mktime((time.strptime(start_date + ' 12:00:00',
                                                '%Y-%m-%d %H:%M:%S')))
        end_time = time.mktime((time.strptime(end_date + ' 12:00:00',
                                              '%Y-%m-%d %H:%M:%S')))
        time_now = start_time
        stamp_bias = time.mktime((time.strptime(start_date + ' 00:00:00',
                                                '%Y-%m-%d %H:%M:%S')))

        data_list = [[] for _ in range(len(feature))]
        time_stamp = [[] for _ in range(len(feature))]
        while time_now <= end_time:
            file = 'data/weather/' + site_id + '/' + time.strftime(
                '%Y-%m-%d', time.gmtime(time_now)) + '.json'
            with open(file) as f:
                data = json.load(f)
                if 'hourly' in data.keys():
                    for k in range(len(data['hourly']['data'])):
                        _stamp = int(
                            (data['hourly']['data'][k]['time'] - stamp_bias) /
                            3600)
                        _data = data['hourly']['data'][k]
                        for m in range(len(feature)):
                            try:
                                data_list[m].append(_data[feature[m]])
                                time_stamp[m].append(_stamp)
                            except KeyError:
                                # This hour's record lacks this feature; skip it.
                                pass
            time_now += 24 * 3600

        # Linear interpolation
        Length = int((end_time - start_time) // 3600 + 24)
        data_ratio = np.array([len(data_list[m])
                               for m in range(len(feature))]) / Length
        if np.any(data_ratio < kwarg['warning_threshold']
                  ) and kwarg['enable_warning']:
            print(__file__ + ' Runtime Warning: ')
            print(
                'Too many missing values in site ' + site_id +
                ' weather data; the returned interpolated data may not be meaningful: ',
                data_ratio)
        data_interp = []
        for i in range(len(feature)):
            if len(data_list[i]) > 0:
                data_interp.append(
                    np.interp(np.arange(Length), time_stamp[i], data_list[i]))
            else:
                data_interp.append(np.zeros(Length))
        data = np.stack(data_interp, -1)

        data_table = pd.DataFrame(
            data=data, columns=['data' + str(i) for i in range(len(feature))])
        data_table['site_id'] = site_id
        data_table['datetime'] = pd.date_range(start=start_date,
                                               periods=Length,
                                               freq='H')
        data_bag.append(data_table)

    data_bag = pd.concat(data_bag)
    data_bag = data_bag.merge(location, on='site_id',
                              how='left').set_index('datetime')
    data_bag = data_bag.dropna(subset=['lat', 'lon'])
    location = location.dropna(subset=['lat', 'lon'])

    artifact = DataPack()
    artifact.raw_data = data_bag
    artifact.data = data_bag.copy()
    artifact.site_info = location
    artifact.data_type = feature
    artifact.sample_unit = 'H'
    artifact.tag.append('fixed-location')
    artifact.tag.append('time-aligned')

    return artifact
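A minimal usage sketch, assuming the per-site JSON files exist under data/weather/<site_id>/<YYYY-MM-DD>.json as the loader expects; note that alias strings such as 'temp' and 'spd' are resolved to canonical feature names.

pack = load_weather('1001A', '2017-01-01', end_date='2017-01-07',
                    feature=['temp', 'spd'], warning_threshold=0.5)
print(pack.data_type)  # ['temperature', 'windSpeed'] after alias resolution
print(pack.data[['data0', 'data1']].describe())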
Example #4
import numpy as np
import pandas as pd
import sklearn.gaussian_process as gp

# DataPack is a module-level name defined elsewhere in apdt.
def gp_data(time_length,
            site_num,
            dimension=1,
            kernel_weight=None,
            noise_level=None,
            seed=None,
            freq='H'):
    '''Generate fake PM2.5-like data for a quick start using a Gaussian process.
    Parameters
    ----------
        - time_length: int. How many hours of data to generate.
        - site_num: int. How many sites the data should have.
        - dimension: int, default 1. How many independent samples each ST-point has.
        - kernel_weight: list of three floats, default [1.0, 1.0, 1.0], the relative variances of three components: long-term trend, short-term fluctuation and periodic wave; or a list of six floats, where the additional three are treated as length_scale parameters of the three kernels (0-1, default 1.0).
        - noise_level: float, default 0.01, the white noise added to the kernel; note that this is necessary for long-time generation.
        - seed: int. The random seed.
        - freq: str. The frequency of the time stamps: 'H' for hours (default) and 'T' for minutes.
    Returns
    -------
        DataPack
    '''
    if seed is not None:
        np.random.seed(seed)
    if kernel_weight is None:
        kernel_weight = [[1.0, 1.0, 1.0]]
    if np.array(kernel_weight).ndim == 1:
        kernel_weight = [kernel_weight]
    if noise_level is None:
        noise_level = 0.01
    if len(kernel_weight[0]) == 3:
        kernel_weight = [
            kernel_weight[i] + [1.0, 1.0, 1.0]
            for i in range(len(kernel_weight))
        ]

    # Decrease this number if the program gets stuck.
    generation_step = 1000

    xs = np.arange(generation_step * 2).reshape((generation_step * 2, 1))

    if len(kernel_weight) == 1:
        # Case: a single parameter set is given; all sites share one kernel and
        # are drawn as independent columns.
        k1 = gp.kernels.RBF(length_scale=50.0 + 100.0 * kernel_weight[0][3])
        k2 = gp.kernels.Matern(length_scale=20.0 + 10.0 * kernel_weight[0][4],
                               nu=0.5)
        k3 = gp.kernels.ExpSineSquared(length_scale=1,
                                       periodicity=100 +
                                       200 * kernel_weight[0][5])
        kw = gp.kernels.WhiteKernel(noise_level=noise_level)
        k = kernel_weight[0][0] * k1 + kernel_weight[0][
            1] * k2 + kernel_weight[0][2] * k3 + kw
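        # Chunked GP sampling: draw the first generation_step points from
        # N(0, C_11) via an SVD square root, then repeatedly extend the series
        # with the conditional Gaussian
        #   x_next | x_prev ~ N(C_21 C_11^{-1} x_prev, C_11 - C_21 C_11^{-1} C_21^T).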
        C = k(xs)
        C_11 = C[:generation_step, :generation_step]
        C_11_inv = np.linalg.inv(C_11)
        C_21 = C[generation_step:, :generation_step]
        u, s, _ = np.linalg.svd(C_11)
        us = np.matmul(u, np.diag(np.sqrt(s)))
        sample = np.matmul(
            us, np.random.randn(generation_step, site_num * dimension))
        # Conditional covariance of the next chunk given the previous one.
        C_cond = C_11 - np.matmul(np.matmul(C_21, C_11_inv), C_21.T)
        u, s, _ = np.linalg.svd(C_cond)
        us = np.matmul(u, np.diag(np.sqrt(s)))
        time_now = generation_step
        sample_list = [sample]
        while time_now < time_length:
            mu_cond = 0 + np.matmul(np.matmul(C_21, C_11_inv), sample)
            sample = np.matmul(
                us, np.random.randn(generation_step, site_num * dimension))
            sample = sample + mu_cond
            sample_list.append(sample)
            time_now = time_now + generation_step
        sample_list = np.concatenate(sample_list)[:time_length].reshape(
            (time_length * site_num, dimension))
    else:
        # Case: a list of parameter sets is given; generate each site's sample
        # separately with its own kernel, using the same chunked scheme as above.
        sample_list_all = []
        for i in range(site_num):
            k1 = gp.kernels.RBF(length_scale=50.0 +
                                100.0 * kernel_weight[i][3])
            k2 = gp.kernels.Matern(length_scale=20.0 +
                                   10.0 * kernel_weight[i][4],
                                   nu=0.5)
            k3 = gp.kernels.ExpSineSquared(length_scale=1,
                                           periodicity=100 +
                                           200 * kernel_weight[i][5])
            kw = gp.kernels.WhiteKernel(noise_level=noise_level)
            k = kernel_weight[i][0] * k1 + kernel_weight[i][
                1] * k2 + kernel_weight[i][2] * k3 + kw
            C = k(xs)
            C_11 = C[:generation_step, :generation_step]
            C_11_inv = np.linalg.inv(C_11)
            C_21 = C[generation_step:, :generation_step]
            u, s, _ = np.linalg.svd(C_11)
            us = np.matmul(u, np.diag(np.sqrt(s)))
            sample = np.matmul(us,
                               np.random.randn(generation_step, 1 * dimension))
            C_cond = C_11 - np.matmul(np.matmul(C_21, C_11_inv), C_21.T)
            u, s, _ = np.linalg.svd(C_cond)
            us = np.matmul(u, np.diag(np.sqrt(s)))
            time_now = generation_step
            sample_list = [sample]
            while time_now < time_length:
                mu_cond = 0 + np.matmul(np.matmul(C_21, C_11_inv), sample)
                sample = np.matmul(
                    us, np.random.randn(generation_step, 1 * dimension))
                sample = sample + mu_cond
                sample_list.append(sample)
                time_now = time_now + generation_step
            sample_list = np.concatenate(sample_list)[:time_length].reshape(
                (time_length * 1, dimension))
            sample_list_all.append(sample_list)
        sample_list = np.stack(sample_list_all, 1).reshape(
            (time_length * site_num, dimension))

    datetime_list = pd.date_range(start='2000-1-1',
                                  periods=time_length,
                                  freq=freq)
    site_name_list = ['virtual_site' + str(i) for i in range(site_num)]
    idx = pd.MultiIndex.from_product([datetime_list, site_name_list],
                                     names=('datetime', 'site_name'))
    data = pd.DataFrame(index=idx,
                        columns=['data' + str(i) for i in range(dimension)],
                        data=sample_list)
    data = data.reset_index()
    site_list = pd.DataFrame(data=np.random.randn(site_num, 2) +
                             np.array([39.8673, 116.3660]),
                             columns=['lat', 'lon'])
    site_list['site_name'] = site_name_list
    site_list['site_id'] = ['V' + str(i).zfill(4) for i in range(site_num)]
    data = data.merge(site_list, how='left', on='site_name')
    data = data.set_index('datetime')

    datapack = DataPack()
    datapack.raw_data = data
    datapack.data = data.copy()
    datapack.site_info = site_list
    datapack.data_type = ['virtual_type_' + str(i) for i in range(dimension)]
    datapack.sample_unit = freq
    datapack.tag.append('fixed-location')
    datapack.tag.append('time-aligned')
    datapack.time_length = time_length
    datapack.site_num = site_num

    return datapack
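A minimal usage sketch; unlike the loaders above, gp_data needs no external files, so this runs as-is once numpy, pandas and scikit-learn are installed.

pack = gp_data(time_length=48, site_num=3, seed=0)
print(pack.data.shape)  # 144 rows: 48 hours x 3 sites
print(pack.site_info)   # random locations scattered around (39.8673, 116.3660)
# A stronger short-term component relative to the long-term trend:
pack2 = gp_data(500, 5, kernel_weight=[0.2, 2.0, 1.0], seed=1)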