Example #1
0
def read_data_sets(sites=('中山', '古亭', '士林', '松山', '萬華'),
                   date_range=('2014', '2015'),
                   feature_selection=('PM2.5',),
                   beginning='1/1',
                   finish='12/31',
                   path=root_path + 'dataset/',
                   update=False):
    """Construct per-hour feature vectors from the pollution data.

    Each output vector starts with six cyclic time features (day of year,
    day of week and time of day, each encoded as an (x, y) pair by
    ``data_coordinate_angle``) followed by one entry per (site, feature)
    pair.  Missing or negative measurements become the string 'NaN'.

    Args:
        sites: station names whose measurements are concatenated, in order.
        date_range: first and last year (strings) to read, inclusive.
        feature_selection: pollutant names extracted per site.
        beginning: 'M/D' start date within the first year.
        finish: 'M/D' end date within the last year.
        path: dataset root directory passed through to ``data_reader``.
        update: force ``data_reader`` to rebuild from raw files.

    Returns:
        list of per-hour feature vectors covering the requested span.
    """
    y_d_h_data = data_reader(int(date_range[0]), int(date_range[-1]), path,
                             update)

    # Parse the duration bounds once instead of re-slicing every iteration.
    begin_month, begin_day = (int(part) for part in beginning.split('/'))
    finish_month, finish_day = (int(part) for part in finish.split('/'))

    num_of_missing = 0.
    total_number = 0.
    feature_vector_set = []
    for each_year in date_range:
        print('%s .. ok' % each_year)
        for month in range(1, 13):
            # Skip months outside [beginning, finish].
            if each_year == date_range[0] and month < begin_month:
                continue
            elif each_year == date_range[-1] and month > finish_month:
                continue

            # Days in this month; leap years are detected by the presence
            # of a '2/29' key in that year's data.
            if month == 2:
                days = 29 if '2/29' in y_d_h_data[each_year] else 28
            elif month in (4, 6, 9, 11):
                days = 30
            else:
                days = 31

            for day in range(1, days + 1):
                each_date = '%d/%d' % (month, day)

                # Skip days outside [beginning, finish].
                if (each_year == date_range[0] and month == begin_month
                        and day < begin_day):
                    continue
                elif (each_year == date_range[-1] and month == finish_month
                      and day > finish_day):
                    continue

                if 'pollution' not in y_d_h_data[each_year][each_date]:
                    print('Data of pollution missing: %s/%s' %
                          (each_year, each_date))
                    continue

                for each_hour in range(24):
                    feature_vector = []
                    # Cyclic time features, each mapped onto the unit circle.
                    feature_vector += data_coordinate_angle(
                        time_to_angle('%s/%s' %
                                      (each_year, each_date))[-1])  # day of year
                    feature_vector += data_coordinate_angle(
                        return_weekday(int(each_year), month,
                                       day))  # day of week
                    feature_vector += data_coordinate_angle(
                        float(each_hour) / 24 * 360)  # time of day

                    day_pollution = y_d_h_data[each_year][each_date][
                        'pollution']
                    for site in sites:
                        if site not in day_pollution:
                            # Whole site missing for this day: one 'NaN'
                            # placeholder per selected feature.
                            for feature_elem in feature_selection:
                                feature_vector.append('NaN')
                                num_of_missing += 1
                                total_number += 1
                            continue
                        for feature_elem in feature_selection:
                            try:
                                feature = float(
                                    day_pollution[site][each_hour]
                                    [pollution_to_pollution_no(feature_elem)])
                            except (KeyError, IndexError, TypeError,
                                    ValueError):
                                # Entry absent or not numeric (e.g. '-');
                                # treat as a negative (invalid) reading.
                                feature = -1.
                            if feature < 0:
                                # Negative readings mark sensor errors.
                                feature_vector.append('NaN')
                                num_of_missing += 1
                            else:
                                feature_vector.append(feature)
                            total_number += 1
                    feature_vector_set.append(feature_vector)

    print('Missing rate: %.5f' % (num_of_missing / total_number))
    return feature_vector_set
Example #2
0
def data_reader(start_year,
                last_year,
                path=root_path + 'dataset/',
                update=False):
    """Load pollution and weather data for the years [start_year, last_year].

    Per-year cPickle caches under ``path + 'cPickle/'`` are tried first
    (unless ``update`` is True).  When no cache was found at all, the raw
    files under ``Data_of_Air_Pollution/`` and ``Data_of_Weather/`` are
    parsed from scratch, written back to the cache, and returned.

    Returns:
        dict keyed as ``y_d_h_data[year][date]['pollution'|'weather'][site]``,
        each leaf a 24-entry list (one per hour) of per-property value
        lists, with '-' marking a missing value.
    """
    y_d_h_data = dict()
    # NOTE(review): "not_exit" looks like a typo for "not_exist"; it counts
    # how many requested years had a cache file on disk.
    not_exit_flag = 0
    # Try the per-year pickle caches first; skipped entirely when a
    # rebuild from raw data is requested.
    while (start_year != last_year + 1) and (not update):
        if os.path.exists(path + 'cPickle/pollution_and_weather_data_' +
                          str(start_year)):
            not_exit_flag += 1
            print('Reading %d data by cPickle .. ' % start_year)
            fr = open(
                path + 'cPickle/pollution_and_weather_data_' + str(start_year),
                'rb')
            y_d_h_data[str(start_year)] = cPickle.load(fr)
            fr.close()
        start_year += 1

    # NOTE(review): returns as soon as ANY year was cached, so requested
    # years without a cache file are silently absent -- confirm intended.
    if not_exit_flag > 0:
        return y_d_h_data

    elif not_exit_flag == 0:
        print('Start from reading raw data.')
        # feature_vector = []
        y_d_h_data = dict(
        )  # years, days and hours, then pollution and weather data

        # ---------- pollution ----------
        pollution_data_files = []  # multi-files
        num_pollution_property = 21  # pollutant columns reserved per hour

        # --- csv ---
        # csv_pollution_data = []
        load_all(pollution_data_files, path + 'Data_of_Air_Pollution/')

        # data pre-processing : format
        # Rows are accumulated into one 24-hour vector per (date, site);
        # keep_date tracks the day currently being built up.
        keep_date = ''
        pollution_vector_one_day = []

        for single_file_pollution_data in pollution_data_files:
            for line in single_file_pollution_data:
                if line == single_file_pollution_data[0]:
                    None  # header row -- skip (deliberate no-op)
                else:
                    # Normalize the date to 'YYYY/M/D' without zero padding.
                    if line[0].find('-') != -1:
                        line[0] = line[0].replace(
                            '-0', '/')  # 2008-01-01 -> 2008/1/1
                        line[0] = line[0].replace(
                            '-', '/')  # 2008-10-12 -> 2008/10/12
                    if line[0].find('/0') != -1:
                        line[0] = line[0].replace('/0',
                                                  '/')  # 2010/01/01 ->2010/1/1

                    year = line[0][:line[0].find('/')]
                    date = line[0][line[0].find('/') + 1:]
                    # check/create year dict., ex: 2016, 2015
                    if not (year in y_d_h_data):
                        y_d_h_data[year] = dict()
                    # check/create date dict., ex: 01/01, 10/31
                    if not (date in y_d_h_data[year]):
                        y_d_h_data[year][date] = dict()
                    # pollution sites dict.
                    if not ('pollution' in y_d_h_data[year][date]):
                        y_d_h_data[year][date]['pollution'] = dict()

                    print(line[:3])  # progress/debug output

                    if keep_date != line[0]:  # a new day
                        # Flush the finished day's vector before starting the
                        # new one (only when still within the same year).
                        if keep_date != '' and (keep_date[:keep_date.find('/')]
                                                == year):
                            y_d_h_data[keep_date[:keep_date.find('/')]][
                                keep_date[keep_date.find('/') +
                                          1:]]['pollution'][line[
                                              1]] = pollution_vector_one_day
                            pollution_vector_one_day = []
                        elif keep_date != '':
                            pollution_vector_one_day = []

                        keep_date = line[0]

                        # Reserve 'num_pollution_property' entries for data, and take '-' to mean missing value
                        for each_hour in np.arange(24):
                            pollution_vector_one_day.append([
                                '-' for i in np.arange(num_pollution_property)
                            ])

                    for each_hour in np.arange(24):
                        # The first three elements are date, sites and kind of pollution
                        try:
                            pollution_vector_one_day[each_hour][
                                pollution_to_pollution_no(line[2].replace(
                                    ' ', ''))] = line[3 + each_hour]
                        except:
                            # Row shorter than 24 hourly columns (or an
                            # unrecognized pollutant) -- stop filling it.
                            break

                if line == single_file_pollution_data[
                        -1]:  # the last recorded day of this file
                    y_d_h_data[year][date]['pollution'][
                        line[1]] = pollution_vector_one_day

        print(
            '--------------------------------------------------------------------------------------'
        )

        # ---------- weather ----------
        num_weather_property = 13  # weather parameter columns per hour
        # num_sites = 449

        weather_data = []
        load_all(weather_data, path + 'Data_of_Weather/')

        for file_i in np.arange(len(weather_data)):
            # sorting by date -> site -> param_code
            for line_j in np.arange(len(weather_data[file_i])):
                [year, _, date, angle] = time_to_angle(
                    weather_data[file_i][line_j][2].replace(' 00:00:00', ''))

                # Append a sortable numeric key: year + fraction of the year.
                format_day_order = angle / 360.
                weather_data[file_i][line_j].append(
                    int(year) + format_day_order)
                weather_data[file_i][line_j][0] = int(
                    weather_data[file_i][line_j][0])

            # Sort by the appended date key, then site id, then param code.
            weather_data[file_i] = sorted(
                weather_data[file_i],
                key=itemgetter(len(weather_data[file_i][line_j]) - 1, 0, 1))

            print('Sorted complete.')

            keep_date = ''
            keep_site = ''
            for line_j in np.arange(len(weather_data[file_i])):
                # a new site
                if weather_data[file_i][line_j][0] != keep_site:
                    # Flush the previous site's 24-hour vector.
                    if keep_site != '':
                        y_d_h_data[year][date]['weather'][
                            keep_site] = weather_vector
                        print(year + '/' + date + ': site- %s' % keep_site)
                    keep_site = weather_data[file_i][line_j][0]

                    weather_vector = []

                # a new day
                if weather_data[file_i][line_j][2].replace(' 00:00:00',
                                                           '') != keep_date:
                    # Flush the previous day's vector before switching days.
                    if keep_date != '' and len(weather_vector) != 0:
                        y_d_h_data[year][date]['weather'][
                            keep_site] = weather_vector
                        print(year + '/' + date + ': site- %s' % keep_site)

                    keep_date = weather_data[file_i][line_j][2].replace(
                        ' 00:00:00', '')
                    year = keep_date[:keep_date.find('/')]
                    date = keep_date[keep_date.find('/') + 1:]

                    # check/create year dict., ex: 2016, 2015
                    if not (year in y_d_h_data):
                        y_d_h_data[year] = dict()
                    # check/create date dict., ex: 1/1, 10/31
                    if not (date in y_d_h_data[year]):
                        y_d_h_data[year][date] = dict()
                    # weather sites dict.
                    if not ('weather' in y_d_h_data[year][date]):
                        y_d_h_data[year][date]['weather'] = dict()

                    weather_vector = []

                # Initiate weather_vector, when 'a new day' or 'a new site'.
                if len(weather_vector) == 0:
                    for each_hour in np.arange(24):
                        weather_vector.append(
                            ['-' for i in np.arange(num_weather_property)])

                # collecting data
                for each_hour in np.arange(24):
                    weather_vector[each_hour][param_code_to_param_code_no(weather_data[file_i][line_j][1])] \
                        = weather_data[file_i][line_j][3+each_hour]  # the first three element mean site, param_code and date

                if line_j == len(weather_data[file_i]) - 1:  # the last day
                    y_d_h_data[year][date]['weather'][
                        keep_site] = weather_vector
            print('----')

        # Cache each year separately so later runs can skip raw parsing.
        print('Saving .. ')
        for years in y_d_h_data.keys():
            fw1 = open(path + 'cPickle/pollution_and_weather_data_' + years,
                       'wb')
            cPickle.dump(y_d_h_data[years], fw1)
            fw1.close()

        print('Saved.')

        return y_d_h_data
Example #3
0
def read_data_map(path, site, feature_selection, date_range=(2014, 2015),
                  beginning='1/1', finish='12/31', update=False):
    """Construct per-hour feature tensors laid out on the site grid map.

    For each hour a tensor of shape ``site.shape + (6 + F + 1,)`` is built,
    where F = len(feature_selection): channels 0..5 hold cyclic time
    features, channels 6..5+F the selected pollutants, and the last two
    channels the (x, y) decomposition of 'WIND_DIREC'.  Missing or negative
    measurements become NaN.

    Args:
        path: dataset root directory passed through to ``data_reader``.
        site: grid descriptor; assumed to expose ``.shape`` (2-D grid size)
            and ``.adj_map`` mapping site name -> (row, col) -- TODO confirm.
        feature_selection: pollutant names to extract per site.
        date_range: first and last year to read, inclusive.
        beginning: 'M/D' start date within the first year.
        finish: 'M/D' end date within the last year.
        update: force ``data_reader`` to rebuild from raw files.

    Returns:
        np.ndarray stacking one tensor per hour in the requested span.
    """
    # BUGFIX: argument order now matches the data_reader(start_year,
    # last_year, path, update) signature defined in this module.
    y_d_h_data = data_reader(int(date_range[0]), int(date_range[-1]), path,
                             update)

    # Parse the duration bounds once instead of re-slicing every iteration.
    begin_month, begin_day = (int(part) for part in beginning.split('/'))
    finish_month, finish_day = (int(part) for part in finish.split('/'))

    num_of_missing = 0.
    total_number = 0.
    feature_tensor_list = []
    num_features = len(feature_selection)

    for year in date_range:
        print('%s .. ok' % year)

        for month in range(1, 13):
            # Skip months outside [beginning, finish].
            if year == int(date_range[0]) and month < begin_month:
                continue
            elif year == int(date_range[-1]) and month > finish_month:
                continue

            # Days in this month; leap years are detected by the presence
            # of a '2/29' key in that year's data.
            if month == 2:
                days = 29 if '2/29' in y_d_h_data[str(year)] else 28
            elif month in (4, 6, 9, 11):
                days = 30
            else:
                days = 31

            for day in range(1, days + 1):
                each_date = '%d/%d' % (month, day)

                # Skip days outside [beginning, finish].
                if (year == int(date_range[0]) and month == begin_month
                        and day < begin_day):
                    continue
                elif (year == int(date_range[-1]) and month == finish_month
                      and day > finish_day):
                    continue

                if 'pollution' not in y_d_h_data[str(year)][each_date]:
                    print('Data of pollution missing: %s/%s' % (year, each_date))
                    continue

                day_pollution = y_d_h_data[str(year)][each_date]['pollution']
                for each_hour in range(24):
                    # Cyclic time features, each mapped onto the unit circle.
                    time_feature = []
                    time_feature += convert_polar_to_cartesian(
                        time_to_angle('%s/%s' % (year, each_date))[-1])  # day of year
                    time_feature += convert_polar_to_cartesian(
                        return_weekday(int(year), month, day))  # day of week
                    time_feature += convert_polar_to_cartesian(
                        float(each_hour) / 24 * 360)  # time of day

                    feature_tensor = np.zeros(
                        shape=site.shape + (6 + num_features + 1,),
                        dtype=float)

                    for site_name, map_index in site.adj_map.items():
                        row, col = map_index[0], map_index[1]

                        # Channels 0..5: shared time features.
                        feature_tensor[row, col, 0:6] = np.array(time_feature)

                        if site_name not in day_pollution:
                            # BUGFIX: was 6:(5+F), which left the last
                            # feature channel and the wind-y channel at 0.0
                            # (indistinguishable from valid data); mark every
                            # data channel as missing instead.
                            feature_tensor[row, col, 6:] = np.nan
                            num_of_missing += num_features
                            total_number += num_features
                            continue

                        for feature_index, feature_elem in enumerate(
                                feature_selection, start=1):
                            try:
                                feature = float(
                                    day_pollution[site_name][each_hour]
                                    [pollution_to_pollution_no(feature_elem)])
                            except (KeyError, IndexError, TypeError,
                                    ValueError):
                                # Entry absent or not numeric (e.g. '-');
                                # treat as a negative (invalid) reading.
                                feature = -1.
                            channel = 5 + feature_index
                            if feature_elem == 'WIND_DIREC':
                                # Wind direction is stored as an (x, y) pair
                                # in the final two channels.
                                if feature < 0:
                                    feature_tensor[row, col, -2] = np.nan
                                    feature_tensor[row, col, -1] = np.nan
                                    num_of_missing += 1
                                else:
                                    xy_coord = convert_polar_to_cartesian(
                                        feature)
                                    feature_tensor[row, col, -2] = xy_coord[0]
                                    feature_tensor[row, col, -1] = xy_coord[1]
                            else:
                                if feature < 0:
                                    feature_tensor[row, col, channel] = np.nan
                                    num_of_missing += 1
                                else:
                                    feature_tensor[row, col, channel] = feature
                            total_number += 1

                        feature_tensor_list.append(feature_tensor)

    print('Missing rate: %.5f' % (num_of_missing / total_number))
    return np.array(feature_tensor_list)