def check_request_actual_status(index_in_db, request, request_db):
    req_id = int(request[REQUEST_ID_FIELD])
    res = rc.get_status(req_id)
    if res['status'].lower() == 'error':
        logger.info("Request {} has failed".format(req_id))
        request_db.loc[index_in_db, REQUEST_STATUS_FIELD] = RequestStatus.FAILED.value
    elif res['status'] == 'ok':
        request_status = res['result']['status']
        logger.info("Status of request {0} is {1}".format(req_id, request_status))
        if request_status == RequestStatus.ERROR.value:
            request_db.loc[index_in_db, REQUEST_STATUS_FIELD] = RequestStatus.ERROR.value
            logger.info(res)
        elif request_status == RequestStatus.COMPLETED.value:
            request_db.loc[index_in_db, REQUEST_STATUS_FIELD] = RequestStatus.COMPLETED.value
    else:
        logger.error("Unhandled request status: {0} for request {1}".format(res['status'], req_id))
    request_db.to_csv(REQ_ID_PATH)


def download_request(req_id: str, target_dir):
    logger.info("Downloading files from request {0} into {1}".format(req_id, target_dir))
    try:
        download(req_id, target_dir)
    except Exception as e:
        logger.error("Downloading failed", exc_info=True)
        raise e


def prepare_requests(**kwargs):
    if kwargs['input_file'] is not None:
        params_to_fetch = read_params_from_input_file(kwargs['input_file'])
        logger.info(f"Preparing requests for {len(params_to_fetch)} parameters...")
    else:
        params_to_fetch = [{PARAM_FIELD: kwargs['gfs_parameter'],
                            "level": kwargs['gfs_level'],
                            HOURS_TYPE_FIELD: kwargs[HOURS_TYPE_FIELD]}]

    if kwargs["bulk"]:
        prepare_bulk_region_request(params_to_fetch, **kwargs)
    else:
        prepare_points_request(params_to_fetch, **kwargs)


def process_csv_files():
    logger.info("Processing csv files...")
    for root, dirs, filenames in os.walk("download/csv"):
        for dir_with_csvs in dirs:
            # directory names are expected to encode the point coordinates, e.g. "52_2-21_0"
            latlon_search = re.search(r'(\d+(_\d)?)-(\d+(_\d)?)', dir_with_csvs)
            latitude = latlon_search.group(1)
            longitude = latlon_search.group(3)
            prepare_final_csvs_from_csvs(dir_with_csvs, latitude, longitude,
                                         datetime.datetime(2015, 1, 15))
        # only top-level directories
        break


def print_db_stats(db):
    not_completed = db[db[REQUEST_STATUS_FIELD] == RequestStatus.SENT.value]
    logger.info("{} requests are pending".format(len(not_completed)))
    completed = db[db[REQUEST_STATUS_FIELD] == RequestStatus.COMPLETED.value]
    logger.info("{} requests are completed".format(len(completed)))
    downloaded = db[db[REQUEST_STATUS_FIELD] == RequestStatus.DOWNLOADED.value]
    logger.info("{} requests are downloaded, but not processed yet".format(len(downloaded)))
    finished = db[db[REQUEST_STATUS_FIELD] == RequestStatus.FINISHED.value]
    logger.info("{} requests are already processed".format(len(finished)))
    failed = db[db[REQUEST_STATUS_FIELD] == RequestStatus.FAILED.value]
    logger.info("{} requests have failed".format(len(failed)))


def process_netCDF_files_to_csv():
    logger.info("Processing netCDF files...")
    netCDF_dir = "download/netCDF"
    for root, param_dirs, filenames in os.walk(netCDF_dir):
        for param_dir in param_dirs:
            logger.info(f"Processing {param_dir} parameter...")
            for root, level_dirs, filenames in os.walk(os.path.join(netCDF_dir, param_dir)):
                for level_dir in level_dirs:
                    logger.info(f"Processing {level_dir} level...")
                    files_in_directory = [
                        f for f in os.listdir(os.path.join(netCDF_dir, param_dir, level_dir))
                        if isfile(join(netCDF_dir, param_dir, level_dir, f))
                    ]
                    for file in tqdm.tqdm(files_in_directory):
                        file_name_match = re.match(RAW_NETCDF_FILENAME_REGEX, file)
                        if file_name_match is not None:
                            process_netCDF_file_to_csv(file, param_dir, level_dir)
                # do not take subdirs
                break
        # do not take subdirs
        break
    logger.info("Processing done.")


def prepare_and_start_processor(**kwargs):
    if not kwargs['send_only']:
        prepare_requests(**kwargs)
    try:
        logger.info("Scheduling sender job.")
        job = schedule.every(60).minutes.do(send_prepared_requests, kwargs)
        job.run()  # run once right away instead of waiting for the first interval
    except Exception as e:
        logger.error(e, exc_info=True)
    while True:
        schedule.run_pending()
        time.sleep(60)


def prepare_coordinates(coords_data):
    """
    Round GFS coordinates for the provided data and filter out duplicates.

    :param coords_data: pandas DataFrame with NLAT, SLAT, WLON and ELON columns
    :return: DataFrame with coordinates rounded to one decimal place and duplicate rows dropped
    """
    coordinates = coords_data.apply(lambda x: [round(x[NLAT_FIELD], 1),
                                               round(x[SLAT_FIELD], 1),
                                               round(x[WLON_FIELD], 1),
                                               round(x[ELON_FIELD], 1)], axis=1)
    coords_data[[NLAT_FIELD, SLAT_FIELD, WLON_FIELD, ELON_FIELD]] = [x for x in coordinates]

    before_duplicates_filter = len(coords_data)
    coords_data = coords_data.drop_duplicates(subset=[NLAT_FIELD, SLAT_FIELD, WLON_FIELD, ELON_FIELD])
    num_duplicates = before_duplicates_filter - len(coords_data)
    logger.info("Removed {} duplicate rows".format(num_duplicates))
    return coords_data


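# Illustrative usage sketch (not part of the original code); the sample values below are
# assumptions. Two bounding boxes that round to the same one-decimal coordinates collapse
# into a single row:
#
#   sample = pd.DataFrame([
#       {NLAT_FIELD: 52.23, SLAT_FIELD: 52.17, WLON_FIELD: 20.94, ELON_FIELD: 21.01},
#       {NLAT_FIELD: 52.231, SLAT_FIELD: 52.169, WLON_FIELD: 20.942, ELON_FIELD: 21.012},
#   ])
#   prepare_coordinates(sample)  # -> 1 row, one duplicate removed

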
def find_coordinates(path, output_file_name="city_geo.csv"):
    location_list = pd.read_csv(path, encoding="ISO-8859-1",
                                names=['file_id', 'city_name', 'meteo_code'])
    city_list = location_list["city_name"].to_list()
    geolocator = Nominatim(user_agent='gfs_fetch_processor')
    geo_list = []
    logger.info("Downloading coordinates for provided cities")
    for city_name in tqdm(city_list):
        geo = geolocator.geocode(city_name)
        if geo:
            geo_list.append([city_name, geo.latitude, geo.longitude])
        # cities that Nominatim cannot resolve are silently skipped
    path_to_write = os.path.join("../city_coordinates", output_file_name)
    data = pd.DataFrame(geo_list, columns=["city_name", "latitude", "longitude"])
    data.to_csv(path_to_write)
    return data


def save_request_to_pseudo_db(request_type: RequestType, request_status: RequestStatus, **kwargs):
    if not os.path.isfile(REQ_ID_PATH):
        pseudo_db = pd.DataFrame(columns=[REQUEST_ID_FIELD, REQUEST_TYPE_FIELD, REQUEST_STATUS_FIELD,
                                          NLAT_FIELD, SLAT_FIELD, WLON_FIELD, ELON_FIELD,
                                          PARAM_FIELD, LEVEL_FIELD, HOURS_TYPE_FIELD])
    else:
        pseudo_db = pd.read_csv(REQ_ID_PATH, index_col=[0])

    new_request = {REQUEST_ID_FIELD: kwargs[REQUEST_ID_FIELD],
                   REQUEST_TYPE_FIELD: request_type.value,
                   REQUEST_STATUS_FIELD: request_status.value,
                   NLAT_FIELD: kwargs[NLAT_FIELD],
                   SLAT_FIELD: kwargs[SLAT_FIELD],
                   WLON_FIELD: kwargs[WLON_FIELD],
                   ELON_FIELD: kwargs[ELON_FIELD],
                   PARAM_FIELD: kwargs[PARAM_FIELD],
                   LEVEL_FIELD: kwargs[LEVEL_FIELD],
                   HOURS_TYPE_FIELD: kwargs[HOURS_TYPE_FIELD]}
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent replacement
    pseudo_db = pd.concat([pseudo_db, pd.DataFrame([new_request])], ignore_index=True)
    logger.info(f"Saving a new request of type {request_type} for coords "
                f"(lat: {kwargs[NLAT_FIELD]}-{kwargs[SLAT_FIELD]}, "
                f"lon: {kwargs[WLON_FIELD]}-{kwargs[ELON_FIELD]}), "
                f"param {kwargs[PARAM_FIELD]}, level {kwargs[LEVEL_FIELD]}, "
                f"hours_type {kwargs[HOURS_TYPE_FIELD]}...")
    pseudo_db.to_csv(REQ_ID_PATH)


def send_prepared_requests(kwargs):
    start_date = datetime.strptime(kwargs["start_date"], '%Y-%m-%d %H:%M')
    end_date = datetime.strptime(kwargs["end_date"], '%Y-%m-%d %H:%M')
    request_db = pd.read_csv(REQ_ID_PATH, index_col=0)
    requests_to_send = request_db[request_db[REQUEST_STATUS_FIELD] == RequestStatus.PENDING.value]

    for index, request in requests_to_send.iterrows():
        nlat, slat, elon, wlon = request[[NLAT_FIELD, SLAT_FIELD, ELON_FIELD, WLON_FIELD]]
        request_type = request[REQUEST_TYPE_FIELD]
        param = request[PARAM_FIELD]
        level = request[LEVEL_FIELD]
        hours_type = request[HOURS_TYPE_FIELD]

        product = generate_product_description(kwargs['forecast_start'], kwargs['forecast_end'],
                                                hours_type=hours_type)
        template = build_template(nlat, slat, elon, wlon, start_date, end_date, param, product, level,
                                  'csv' if request_type == RequestType.POINT.value else 'netCDF')
        response = submit_json(template)
        if response['status'] == 'ok':
            request_id = response['result']['request_id']
            request_db.loc[index, REQUEST_STATUS_FIELD] = RequestStatus.SENT.value
            request_db.loc[index, REQUEST_ID_FIELD] = str(int(request_id))
        else:
            logger.info("RDA has returned an error.")
            if response['status'] == 'error' and TOO_MANY_REQUESTS in response['messages']:
                logger.info("Too many requests. Remaining requests will be sent on the next scheduler trigger.")
                break
            request_db.loc[index, REQUEST_STATUS_FIELD] = RequestStatus.FAILED.value
            logger.info(response)

    request_db.to_csv(REQ_ID_PATH)
    logger.info("Sending requests done. Waiting for next scheduler trigger.")


def extract_files_from_tar(download_target_path, extract_target_path, file_type: str, tidy=False):
    tars = [
        f for f in os.listdir(download_target_path)
        if os.path.isfile(os.path.join(download_target_path, f)) and f.endswith("tar")
    ]
    logger.info("Unpacking {0} tars into {1} directory".format(len(tars), extract_target_path))
    for file in tars:
        tar_path = os.path.join(download_target_path, file)
        tar = tarfile.open(tar_path, "r:")
        tar.extractall(extract_target_path)
        tar.close()
        if tidy:
            # remove the archive itself (path, not the TarFile object)
            os.remove(tar_path)

    if file_type == "csv":
        new_file_pattern = re.compile(RAW_CSV_FILENAME_WITH_REQUEST_REGEX)
        for file in [f for f in os.listdir(extract_target_path) if new_file_pattern.match(f)]:
            # remove the request number from the filename
            final_csv_name = re.sub(RAW_CSV_FILENAME_WITH_REQUEST_REGEX, r"\1\5", file)
            os.replace(os.path.join(extract_target_path, file),
                       os.path.join(extract_target_path, final_csv_name))
    else:
        new_file_pattern = re.compile(RAW_NETCDF_FILENAME_WITH_REQUEST_REGEX)
        for file in [f for f in os.listdir(extract_target_path) if new_file_pattern.match(f)]:
            # remove the request number from the filename
            final_netcdf_name = re.sub(RAW_NETCDF_FILENAME_WITH_REQUEST_REGEX, r"\1\5", file)
            os.replace(os.path.join(extract_target_path, file),
                       os.path.join(extract_target_path, final_netcdf_name))


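# Note on the renaming above (illustrative; the regex constants are defined elsewhere in the repo):
# the substitution r"\1\5" is assumed to keep the first and last capture groups of the filename and
# drop the middle groups that hold the RDA request number, e.g. a hypothetical
# "TMP_HTGL_5.123456.2015011500.csv" would become "TMP_HTGL_5.2015011500.csv". The exact group
# layout depends on RAW_CSV_FILENAME_WITH_REQUEST_REGEX / RAW_NETCDF_FILENAME_WITH_REQUEST_REGEX.

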
def processor(purge_requests: bool, tidy: bool):
    logger.info("Starting rda download processor")
    request_db = read_pseudo_rda_request_db()
    print_db_stats(request_db)

    logger.info("Checking actual status of pending requests...")
    not_completed = request_db[request_db[REQUEST_STATUS_FIELD] == RequestStatus.SENT.value]
    for index, request in not_completed.iterrows():
        check_request_actual_status(index, request, request_db)

    completed = request_db[request_db[REQUEST_STATUS_FIELD] == RequestStatus.COMPLETED.value]
    for index, request in completed.iterrows():
        download_completed_request(index, request, request_db)

    ready_for_unpacking = request_db[request_db[REQUEST_STATUS_FIELD] == RequestStatus.DOWNLOADED.value]
    for index, request in ready_for_unpacking.iterrows():
        process_tars(index, request, request_db, tidy)

    request_db.to_csv(REQ_ID_PATH)

    if purge_requests:
        done_statuses = [RequestStatus.ERROR.value, RequestStatus.FAILED.value,
                         RequestStatus.DOWNLOADED.value, RequestStatus.FINISHED.value]
        done_requests = request_db[request_db[REQUEST_STATUS_FIELD].isin(done_statuses)]
        for index, request in done_requests.iterrows():
            purge(str(int(request[REQUEST_ID_FIELD])))
            request_db.loc[index, REQUEST_STATUS_FIELD] = RequestStatus.PURGED.value
        request_db.to_csv(REQ_ID_PATH)

    logger.info("Done. Waiting for next scheduler trigger.")


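# Request lifecycle driven by processor() above (the DOWNLOADED/FINISHED transitions are inferred
# from download_completed_request and process_tars, which are defined elsewhere in this module):
#   PENDING -> SENT (send_prepared_requests) -> COMPLETED / ERROR / FAILED (check_request_actual_status)
#   COMPLETED -> DOWNLOADED (download_completed_request) -> FINISHED (process_tars)
#   ERROR / FAILED / DOWNLOADED / FINISHED -> PURGED when processor runs with purge_requests=True.

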
def process_netCDF_files_to_npy(output_dir: str):
    for param in GFS_PARAMETERS:
        logger.info(f"Converting parameter {param['name']} {param['level']}")
        # hard-coded geographic extent; the Coords arguments appear to follow the
        # nlat/slat/wlon/elon convention used elsewhere in this module
        process_to_numpy_array(param, Coords(56, 48, 13, 26), output_dir)


def purge(req_id: str):
    logger.info("Purging request {}".format(req_id))
    rc.purge_request(req_id)
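

# Hypothetical driver sketch (commented out; not part of the original module). It mirrors the
# scheduling pattern of prepare_and_start_processor above; the interval and flag values are
# assumptions for illustration only.
#
#   if __name__ == "__main__":
#       schedule.every(30).minutes.do(processor, purge_requests=False, tidy=True)
#       processor(purge_requests=False, tidy=True)  # run once immediately
#       while True:
#           schedule.run_pending()
#           time.sleep(60)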