def csv_ld_fill():
    aq_location, aq_dicts = ld_raw_fetch.load_aq_original()
    for aq_name in aq_location:
        aq_dict = aq_dicts[aq_name]
        start_dt_o, end_dt_o = datetime.strptime(list(aq_dict.keys())[0], format_string), \
            datetime.strptime(list(aq_dict.keys())[-1], format_string)

        # First, fill missing timestamps with all-None rows
        for dt_o in tools.per_delta(start_dt_o, end_dt_o, timedelta(hours=1)):
            dt_s = format_ld_dt_string(dt_o)
            try:
                data = aq_dict[dt_s]
            except KeyError:
                aq_dict[dt_s] = [None] * 3

        # Drop negative readings, and re-insert every key so iteration order stays chronological
        for dt_o in tools.per_delta(start_dt_o, end_dt_o, timedelta(hours=1)):
            dt_s = format_ld_dt_string(dt_o)
            data = aq_dict[dt_s]
            for column in range(len(data)):
                if data[column] is not None and data[column] < 0:
                    data[column] = None
            del aq_dict[dt_s]
            aq_dict[dt_s] = data

        start_dt_o += timedelta(hours=1)
        end_dt_o -= timedelta(hours=1)
        count = 0

        # Then fill a value when only a single hour is missing, using the mean of its neighbours;
        # implausibly large readings in columns 1 and 2 are treated as missing as well
        for dt_o in tools.per_delta(start_dt_o, end_dt_o, timedelta(hours=1)):
            dt_s = format_ld_dt_string(dt_o)
            data = aq_dict[dt_s]
            previous = aq_dict[format_ld_dt_string(dt_o - timedelta(hours=1))]
            following = aq_dict[format_ld_dt_string(dt_o + timedelta(hours=1))]
            for column in range(len(data)):
                if (data[column] is None) or \
                        (column == 1 and (data[column] > 200)) or (column == 2 and (data[column] > 300)):
                    if previous[column] is not None and following[column] is not None:
                        data[column] = (previous[column] + following[column]) / 2
                        count += 1
                    else:
                        data[column] = None
            del aq_dict[dt_s]
            aq_dict[dt_s] = data
        print("Filled data in ", aq_name, ": ", count, sep='')

        # Write into csv
        with open("../data_ld_m/aq/" + aq_name + ".csv", "w", newline='') as file:
            writer = csv.writer(file, delimiter=',')
            for dt_s in aq_dict.keys():
                dt_s_m = datetime.strptime(dt_s, format_string).strftime(format_string_m)
                writer.writerow([dt_s_m] + aq_dict[dt_s])
            file.flush()
def csv_bj_fill():
    aq_location, aq_dicts = bj_raw_fetch.load_aq_original()
    for aq_name in aq_location:
        aq_dict = aq_dicts[aq_name]
        start_dt_o, end_dt_o = datetime.strptime(list(aq_dict.keys())[0], format_string_m), \
            datetime.strptime(list(aq_dict.keys())[-1], format_string_m)

        # First, fill missing timestamps with all-None rows
        for dt_o in tools.per_delta(start_dt_o, end_dt_o, timedelta(hours=1)):
            dt_s = dt_o.strftime(format_string_m)
            try:
                data = aq_dict[dt_s]
            except KeyError:
                aq_dict[dt_s] = [None] * 6

        start_dt_o += timedelta(hours=1)
        end_dt_o -= timedelta(hours=1)
        count = 0

        # Then fill a value when only a single hour is missing, using the mean of its neighbours
        for dt_o in tools.per_delta(start_dt_o, end_dt_o, timedelta(hours=1)):
            dt_s = dt_o.strftime(format_string_m)
            data = aq_dict[dt_s]
            previous = aq_dict[(dt_o - timedelta(hours=1)).strftime(format_string_m)]
            following = aq_dict[(dt_o + timedelta(hours=1)).strftime(format_string_m)]
            for column in range(len(data)):
                if data[column] is None:
                    if previous[column] is not None and following[column] is not None:
                        data[column] = (previous[column] + following[column]) / 2
                        count += 1
            # Re-insert the key so iteration order stays chronological
            del aq_dict[dt_s]
            aq_dict[dt_s] = data
        print("Filled data in ", aq_name, ": ", count, sep='')

        # Write into csv
        with open("../data_m/aq/" + aq_name + ".csv", "w", newline='') as file:
            writer = csv.writer(file, delimiter=',')
            for dt_s in aq_dict.keys():
                writer.writerow([dt_s] + aq_dict[dt_s])
            file.flush()
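# Both fill routines above rely on two helpers defined elsewhere in the project: tools.per_delta,
# which iterates timestamps, and format_ld_dt_string, which renders the London-style timestamp
# ("2018/3/30 22:00", without zero padding). The sketch below is only an assumption about their
# behaviour, not the project's actual implementation; the names per_delta_sketch and
# format_ld_dt_string_sketch are hypothetical, and the inclusive end point is assumed.
from datetime import datetime, timedelta


def per_delta_sketch(start, end, delta):
    # Yield datetime objects from start to end (inclusive), stepping by delta.
    current = start
    while current <= end:
        yield current
        current += delta


def format_ld_dt_string_sketch(dt_obj):
    # Build "YYYY/M/D H:00" by hand, since strftime cannot portably drop zero padding.
    return "{}/{}/{} {}:00".format(dt_obj.year, dt_obj.month, dt_obj.day, dt_obj.hour)


# Example: the hours between two London-format timestamps.
# for dt in per_delta_sketch(datetime(2018, 3, 30, 20), datetime(2018, 3, 30, 23), timedelta(hours=1)):
#     print(format_ld_dt_string_sketch(dt))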
def get_time_string(start_time_s, end_time_s, time_delta=timedelta(hours=1)):
    time_string_array = []
    start_time = datetime.strptime(start_time_s, format_string)
    end_time = datetime.strptime(end_time_s, format_string)
    for time in per_delta(start_time, end_time, time_delta):
        time_string_array.append(time.strftime(format_string))
    return time_string_array
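# Example use of get_time_string. The concrete value of format_string is defined elsewhere in the
# project; here it is assumed to be "%Y-%m-%d %H:%M" purely for illustration. With that assumption:
#     get_time_string("2018-04-01 00:00", "2018-04-01 02:00")
# would return ['2018-04-01 00:00', '2018-04-01 01:00', '2018-04-01 02:00'],
# i.e. one string per step with both endpoints included.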
def export_data(read_start_string, read_end_string, export_start_string=None, export_end_string=None, use_fill=True):
    start_string, end_string = read_start_string, read_end_string
    global aq_location, grid_location, aq_dicts, grid_dicts
    aq_location, grid_location, aq_dicts, grid_dicts = ld_raw_fetch.load_all(start_string, end_string)
    if use_fill:
        aq_dicts = ld_raw_fetch.load_filled_dicts(start_string, end_string)
    if export_start_string is not None:
        start_string, end_string = export_start_string, export_end_string

    h5_file = h5py.File("../data_ld/tradition_export/traditional_ld_{}_{}.h5".format(start_string, end_string), "w")
    print("\nFetching data to export...")
    for aq_name in aq_location.keys():
        start_datetime, end_datetime = datetime.strptime(start_string, format_string_2), \
            datetime.strptime(end_string, format_string_2)

        last_valid_dt_object = None
        data_to_write = []
        for dt_object_day in per_delta(start_datetime, end_datetime, timedelta(days=1)):
            have_valid = False
            data_matrix = []
            for dt_object in per_delta(dt_object_day - timedelta(hours=23), dt_object_day, timedelta(hours=1)):
                try:
                    row = list()
                    dt_string = dt_object.strftime(format_string)
                    # Timestamp, weekday index and a weekend flag (0 for Mon-Fri, 1 for Sat/Sun)
                    row += [dt_object.timestamp()] + \
                           [dt_object.weekday()] + \
                           [[1, 0][dt_object.weekday() in range(5)]]
                    # + [[0, 1][dt_object.date in holiday_array]]
                    row += aq_dicts[aq_name][dt_string]
                    nearest_grid = get_nearest(aq_name)
                    row += grid_dicts[nearest_grid][dt_string]

                    # other_aq = copy.copy(aq_location)
                    # del other_aq[aq_name]
                    #
                    # factor_dict = dict()
                    # for other_aq_id in other_aq.keys():
                    #     factor = cal_affect_factor(other_aq_id, aq_name, dt_string)
                    #     factor_dict[other_aq_id] = factor
                    # sorted_factor_dict = sorted(factor_dict.items(), key=operator.itemgetter(1), reverse=True)
                    # valid = False
                    # other_aq_row = [None] * 2
                    # for other_aq_id, factor in sorted_factor_dict:
                    #     if factor < 0:
                    #         valid = False
                    #         break
                    #     try:
                    #         other_aq_row = aq_dicts[other_aq_id][dt_string]
                    #         valid = True
                    #     except KeyError:
                    #         valid = False
                    #     if valid:
                    #         row += [factor] + other_aq_row
                    #         break
                    # if not valid:
                    #     raise KeyError("Data loss here")

                    data_matrix.append(row)
                    have_valid = True
                except KeyError:
                    have_valid = False
                    break
            if have_valid:
                last_valid_dt_object = dt_object_day
                data_to_write = data_matrix

        if last_valid_dt_object is not None:
            print("{} last valid data - {}".format(aq_name, last_valid_dt_object.strftime(format_string_2)))
            h5_file.create_dataset(aq_name, data=np.asarray(data_to_write))
        else:
            print("{} has no valid data".format(aq_name))
    h5_file.flush()
    h5_file.close()
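# export_data above attaches the weather of the nearest grid point via get_nearest, which is
# defined elsewhere in the project. Below is a plausible sketch under the assumption that the
# module-level aq_location and grid_location dictionaries map station/grid names to
# (longitude, latitude) pairs; the name get_nearest_sketch is hypothetical.
def get_nearest_sketch(aq_name):
    aq_lng, aq_lat = aq_location[aq_name]
    nearest_name, nearest_dist = None, float("inf")
    for grid_name, (grid_lng, grid_lat) in grid_location.items():
        # Squared Euclidean distance in degrees is enough for ranking nearby grid points.
        dist = (aq_lng - grid_lng) ** 2 + (aq_lat - grid_lat) ** 2
        if dist < nearest_dist:
            nearest_name, nearest_dist = grid_name, dist
    return nearest_name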
def fill_api_data(city, data_type, start_str, end_str, fill_range=3):
    directory = "data_{}_api/{}/{}_{}".format(city, data_type, start_str, end_str)
    location = location_dict[city][data_type]
    data_dicts = dict()
    errors = []
    column_start = {"aq": 1, "meo": 2}[data_type]
    start_obj, end_obj = datetime.strptime(start_str, format_string[1]), \
        datetime.strptime(end_str, format_string[1])

    modified_directory = "data_{}_api_m/{}/{}_{}".format(city, data_type, start_str, end_str)
    if not os.path.exists(modified_directory):
        os.makedirs(modified_directory)
    else:
        shutil.rmtree(modified_directory)
        os.makedirs(modified_directory)

    for location_name in location.keys():
        filled_count = 0
        data_dict = dict()
        with open("{}/{}.csv".format(directory, location_name), "r") as csv_file:
            reader = csv.reader(csv_file, delimiter=',')
            for row in reader:
                if data_type == "aq":
                    data_dict[row[0]] = list(
                        map(float_m, row[column_start:data_column_scope[data_type][city] + column_start]))
                elif data_type == "meo":
                    data_dict[row[0]] = list(
                        map(float_zero, row[column_start:data_column_scope[data_type][city] + column_start]))

        # Fill timestamp loss with None
        for dt_obj in tools.per_delta(start_obj, end_obj, timedelta(hours=1)):
            dt_str = dt_obj.strftime(format_string[0])
            try:
                data_dict[dt_str]
            except KeyError:
                data_dict[dt_str] = [None] * data_column_scope[data_type][city]

        # Fill data if possible
        dt_obj = start_obj
        while dt_obj < end_obj:
            dt_obj += timedelta(hours=1)
            dt_str = dt_obj.strftime(format_string[0])
            current_data = data_dict[dt_str]
            for column in range(data_column_scope[data_type][city]):
                try:
                    if current_data[column] is None:
                        # Found a None value, count the length of the run of missing data
                        count = 1
                        while True:
                            if data_dict[(dt_obj + timedelta(hours=count)).
                                         strftime(format_string[0])][column] is None:
                                count += 1
                            else:
                                break
                        if count > fill_range:
                            raise KeyError("Too much data is lost.")
                        start_value = data_dict[(dt_obj - timedelta(hours=1)).
                                                strftime(format_string[0])][column]
                        if start_value is None:
                            raise KeyError("Data is empty in the first row.")
                        end_value = data_dict[(dt_obj + timedelta(hours=count)).
                                              strftime(format_string[0])][column]
                        # Linear interpolation across the gap
                        gradient = (end_value - start_value) / (count + 1)
                        for i in range(count):
                            data_dict[(dt_obj + timedelta(hours=i)).
                                      strftime(format_string[0])][column] = start_value + (i + 1) * gradient
                            filled_count += 1
                except KeyError as e:
                    errors.append(e)
                    continue

        data_dicts[location_name] = data_dict
        sorted_data_matrix = sorted(data_dict.items(), key=operator.itemgetter(0))
        with open("{}/{}.csv".format(modified_directory, location_name), "w", newline='') as csv_file:
            writer = csv.writer(csv_file, delimiter=',')
            for dt_str, data in sorted_data_matrix:
                writer.writerow([dt_str] + data)
            csv_file.flush()
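# fill_api_data closes gaps of up to fill_range consecutive missing hours by linear interpolation
# between the last value before the gap and the first value after it. The helper below is a
# self-contained sketch of that rule on a plain list; the name interpolate_gaps is hypothetical
# and it is not part of the project.
def interpolate_gaps(values, fill_range=3):
    # Replace runs of None no longer than fill_range when both neighbours are known.
    filled = list(values)
    i = 0
    while i < len(filled):
        if filled[i] is None:
            j = i
            while j < len(filled) and filled[j] is None:
                j += 1
            gap = j - i
            if 0 < i and j < len(filled) and gap <= fill_range:
                start_value, end_value = filled[i - 1], filled[j]
                gradient = (end_value - start_value) / (gap + 1)
                for k in range(gap):
                    filled[i + k] = start_value + (k + 1) * gradient
            i = j
        else:
            i += 1
    return filled


# Example: a two-hour gap is filled, a gap at the list boundary is left untouched.
# interpolate_gaps([None, 10.0, None, None, 22.0])  ->  [None, 10.0, 14.0, 18.0, 22.0]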
                                       Bar('=', '[', ']'), ' ', Percentage()])
valid_count = 0

# Validate the near grid matrix algorithm
# plt.figure()
# plt.title(aq_name)
# plt.plot(aq_location[aq_name][0], aq_location[aq_name][1], '.')
# plt.plot(grid_coor_array[:, 0], grid_coor_array[:, 1], '.')
# plt.show()

# Export data from start to end
predict_matrix = []
dt_int_array = []
for dt_object_day in per_delta(start_datetime, end_datetime, timedelta(hours=24)):
    for dt_object in per_delta(dt_object_day, dt_object_day + timedelta(hours=2), timedelta(hours=1)):
        aggregate += 1
        bar.update(aggregate)
        dt_string = dt_object.strftime(format_string)

        # Fetch history and prediction data, checking data validity at the same time
        predict = check_valid(aq_name, dt_object)
        if predict is None:
            continue

        # Append this hour's data to the per-day data
        predict_matrix.append(predict)
        dt_int_array.append(int(ti.mktime(dt_object.timetuple())))
valid_count = 0
near_grids, grid_coor_array = get_grids(aq_name, grid_edge_length)

# Validate the near grid matrix algorithm
# plt.figure()
# plt.title(aq_name)
# plt.plot(aq_location[aq_name][0], aq_location[aq_name][1], '.')
# plt.plot(grid_coor_array[:, 0], grid_coor_array[:, 1], '.')
# plt.show()

grid_matrix = []
history_matrix = []
predict_matrix = []
dt_int_array = []
fake_forecast_matrix = []
for dt_object in per_delta(start_datetime, end_datetime, timedelta(hours=1)):
    aggregate += 1
    bar.update(aggregate)
    dt_string = dt_object.strftime(format_string)

    # Fetch history and prediction data, checking data validity at the same time
    aq_matrix, predict, near_grid_data, fake_forecast_data = check_valid(aq_name, dt_object, time_span)
    if aq_matrix is None:
        continue

    grid_matrix.append(near_grid_data)
    history_matrix.append(aq_matrix)
    predict_matrix.append(predict)
    dt_int_array.append(dt_object.timestamp())
    fake_forecast_matrix.append(fake_forecast_data)
    valid_count += 1
def export_data(city, read_start_string, read_end_string, export_start_string, export_end_string, use_fill):
    start_string, end_string = read_start_string, read_end_string
    global aq_location, grid_location, grid_dicts, aq_dicts, forecast_directory, export_directory
    forecast_directory = forecast_directory_dict[city]
    export_directory = export_directory_dict[city]
    if city == "bj":
        aq_location, grid_location, aq_dicts, grid_dicts = bj_raw_fetch.load_all(start_string, end_string)
        if use_fill:
            aq_dicts = bj_raw_fetch.load_filled_dicts(start_string, end_string)
    elif city == "ld":
        aq_location, grid_location, aq_dicts, grid_dicts = ld_raw_fetch.load_all(start_string, end_string)
        if use_fill:
            aq_dicts = ld_raw_fetch.load_filled_dicts(start_string, end_string)

    if export_start_string is None:
        start_string, end_string = read_start_string, read_end_string
    else:
        start_string, end_string = export_start_string, export_end_string
    start_datetime, end_datetime = datetime.strptime(start_string, format_string_2), \
        datetime.strptime(end_string, format_string_2)

    data_dir = export_directory.format(start_string, end_string)
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    print("\nExporting to {}".format(data_dir))

    for aq_name in aq_location.keys():
        # if aq_name not in ["KF1"]:
        #     continue
        valid_count = 0
        near_grids, grid_coor_array = get_grids(aq_name, grid_circ)

        # Export data from start to end, keeping only the last valid day
        last_valid_dt_object = None
        # Placeholder initialisation; overwritten by the last valid day below
        grid_matrix, history_matrix, dt_int_array, forecast_matrix = tuple([1] * 4)
        for dt_object in per_delta(start_datetime, end_datetime, timedelta(hours=24)):
            # Fetch history and prediction data, checking data validity at the same time
            aq_matrix, near_grid_data, forecast_data, predict = check_valid(aq_name, dt_object, near_grids)
            if aq_matrix is None:
                continue

            # Keep this day's data; later valid days overwrite earlier ones
            grid_matrix = [near_grid_data]
            history_matrix = [aq_matrix]
            dt_int_array = [dt_object.timestamp()]
            forecast_matrix = [forecast_data]
            valid_count += 1
            last_valid_dt_object = dt_object

        if last_valid_dt_object is not None:
            h5_file = h5py.File("{}/{}.h5".format(data_dir, aq_name), "w")
            h5_file.create_dataset("grid", data=np.asarray(grid_matrix))
            h5_file.create_dataset("history", data=np.asarray(history_matrix))
            h5_file.create_dataset("timestep", data=np.asarray(dt_int_array))
            h5_file.create_dataset("weather_forecast", data=np.asarray(forecast_matrix))
            h5_file.flush()
            h5_file.close()
            print("{} - Have data, last valid {}".format(aq_name, last_valid_dt_object.strftime(format_string_2)))
        else:
            print("{} - No valid data".format(aq_name))
def export_data(city, read_start_string, read_end_string, export_start_string, export_end_string,
                use_fill, use_history, export_train):
    start_string, end_string = read_start_string, read_end_string
    global ci
    ci = city
    global aq_location, grid_location, aq_dicts, grid_dicts, df
    if use_history:
        if city == "ld":
            aq_location, grid_location, aq_dicts, grid_dicts = ld_raw_fetch.load_all_history()
            df = load_data.load_directory_data(load_data.history_data_directory[city]["aq"],
                                               load_data.data_header_dict[city]["aq"], drop=["no2"])
        elif city == "bj":
            aq_location, grid_location, aq_dicts, grid_dicts = bj_raw_fetch.load_all_history()
            df = load_data.load_directory_data(load_data.history_data_directory[city]["aq"],
                                               load_data.data_header_dict[city]["aq"])
    else:
        if city == "ld":
            aq_location, grid_location, aq_dicts, grid_dicts = ld_raw_fetch.load_all(start_string, end_string)
            df = load_data.load_directory_data([load_data.filled_data_directory[city],
                                                load_data.history_data_directory[city]["aq"]],
                                               load_data.data_header_dict[city]["aq"], drop=["no2"])
        elif city == "bj":
            aq_location, grid_location, aq_dicts, grid_dicts = bj_raw_fetch.load_all(start_string, end_string)
            df = load_data.load_directory_data([load_data.filled_data_directory[city],
                                                load_data.history_data_directory[city]["aq"]],
                                               load_data.data_header_dict[city]["aq"])

    if use_fill:
        if city == "ld":
            aq_dicts = ld_raw_fetch.load_filled_dicts(start_string, end_string)
        elif city == "bj":
            aq_dicts = bj_raw_fetch.load_filled_dicts(start_string, end_string)

    global export_predict
    export_predict = export_train

    if export_start_string is not None:
        start_string, end_string = export_start_string, export_end_string
    start_datetime, end_datetime = datetime.strptime(start_string, format_string_2), \
        datetime.strptime(end_string, format_string_2)

    diff = end_datetime - start_datetime
    days, seconds = diff.days, diff.seconds
    delta_time = int(days * 24 + seconds // 3600)
    if export_train:
        delta_time = int(delta_time / 24)

    directory = ""
    if export_train:
        if city == "ld":
            directory = "../data_ld/tradition_train/{}_{}".format(start_string, end_string)
        elif city == "bj":
            directory = "../data/tradition_train/{}_{}".format(start_string, end_string)
    else:
        if city == "ld":
            directory = "../data_ld/tradition_predict/{}_{}".format(start_string, end_string)
        elif city == "bj":
            directory = "../data/tradition_predict/{}_{}".format(start_string, end_string)
    if not os.path.exists(directory):
        os.makedirs(directory)
    print("\nExporting to {}".format(directory))

    # out_file = open("out{}_{}.txt".format(start_string, end_string), "w")
    for aq_name in aq_location.keys():
        # if aq_name not in ["KF1"]:
        #     continue
        timestamp_matrix, history_aq, history_meo, forecast, predict_aq, statistic = [], [], [], [], [], []
        if export_train:
            aggregate = 0
            valid = 0
            for dt_object in per_delta(start_datetime, end_datetime, timedelta(hours=24)):
                aggregate += 1
                if aggregate % 10 == 0:
                    print("\t{} exported %3.2f%%".format(aq_name) % (100 * aggregate / delta_time))
                    # out_file.write("{} exported %3.2f%%\n".format(aq_name) % (100 * aggregate / delta_time))
                    # out_file.flush()

                history_aq_matrix, history_meo_matrix, forecast_matrix, predict_matrix, \
                    weekday, weekend, timestamp, statistic_matrix = check_valid(aq_name, dt_object)
                if history_aq_matrix is None:
                    continue

                timestamp_matrix.append([timestamp, weekday, weekend])
                history_aq.append(history_aq_matrix)
                history_meo.append(history_meo_matrix)
                forecast.append(forecast_matrix)
                predict_aq.append(predict_matrix)
                statistic.append(statistic_matrix)
                valid += 1

            h5_file = h5py.File("{}/{}.h5".format(directory, aq_name), "w")
            h5_file.create_dataset("timestamp", data=np.array(timestamp_matrix))
            h5_file.create_dataset("history_aq", data=np.array(history_aq))
            h5_file.create_dataset("history_meo", data=np.array(history_meo))
            h5_file.create_dataset("forecast", data=np.array(forecast))
            h5_file.create_dataset("predict_aq", data=np.array(predict_aq))
            h5_file.create_dataset("statistic", data=np.array(statistic))
            h5_file.flush()
            h5_file.close()
            print("{} finished, valid {}".format(aq_name, valid))
            sleep(0.1)
        else:
            last_valid_dt = None
            for dt_object in per_delta(start_datetime, end_datetime, timedelta(hours=24)):
                history_aq_matrix, history_meo_matrix, forecast_matrix, predict_matrix, \
                    weekday, weekend, timestamp, statistic_matrix = check_valid(aq_name, dt_object)
                if history_aq_matrix is None:
                    continue

                # Keep only the last valid day for prediction export
                timestamp_matrix = [[timestamp, weekday, weekend]]
                history_aq = [history_aq_matrix]
                history_meo = [history_meo_matrix]
                forecast = [forecast_matrix]
                predict_aq = [predict_matrix]
                statistic = [statistic_matrix]
                last_valid_dt = dt_object

            if last_valid_dt is not None:
                h5_file = h5py.File("{}/{}.h5".format(directory, aq_name), "w")
                h5_file.create_dataset("timestamp", data=np.array(timestamp_matrix))
                h5_file.create_dataset("history_aq", data=np.array(history_aq))
                h5_file.create_dataset("history_meo", data=np.array(history_meo))
                h5_file.create_dataset("forecast", data=np.array(forecast))
                h5_file.create_dataset("predict_aq", data=np.array(predict_aq))
                h5_file.create_dataset("statistic", data=np.array(statistic))
                h5_file.flush()
                h5_file.close()
                print("{} last valid {}".format(aq_name, last_valid_dt.strftime(format_string_2)))
            else:
                print("{} no valid data".format(aq_name))
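# The per-station HDF5 files written above can be read back with h5py. The dataset names
# ("timestamp", "history_aq", "history_meo", "forecast", "predict_aq", "statistic") mirror the
# create_dataset calls in export_data; the directory and station name in the example are
# placeholders, and the helper name load_exported_station is hypothetical.
import h5py
import numpy as np


def load_exported_station(directory, aq_name):
    # Return the exported arrays for one station as a dict of numpy arrays.
    with h5py.File("{}/{}.h5".format(directory, aq_name), "r") as h5_file:
        return {key: np.asarray(h5_file[key]) for key in h5_file.keys()}


# Example (paths are illustrative only):
# data = load_exported_station("../data/tradition_train/2017-01-01_2018-01-31", "aotizhongxin_aq")
# print(data["history_aq"].shape)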