def _download_single_tc(self, year, tc, tc_dir, data_link_text):
    url = self.CONFIG['hwind']['url']
    tc_url = f'{url}/{tc.name.lower()}{year}.html'
    tc_dir = f'{tc_dir}{tc.basin}/{year}/{tc.name}/'
    os.makedirs(tc_dir, exist_ok=True)

    page = requests.get(tc_url)
    data = page.text
    soup = BeautifulSoup(data, features='lxml')
    anchors = soup.find_all('a', text=data_link_text)
    if not len(anchors):
        return False

    url_prefix = self.CONFIG['hwind']['data_url_prefix']
    for a in anchors:
        href = a.get("href")
        filename = href.split('/')[-1]
        file_url = f'{url_prefix}{href}'
        file_path = f'{tc_dir}{filename}'

        # Skip files whose datetime falls outside the specified period
        if not utils.check_period(self._get_dt_of_hwind_file(href),
                                  self.period):
            continue
        utils.download(file_url, file_path)

    return True
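
# NOTE: `utils.check_period` and `self._get_dt_of_hwind_file` are project
# helpers whose implementations are not shown in this section.  The sketch
# below only illustrates the semantics assumed by the calls above (a date
# or datetime falling within the closed interval period[0]..period[1]);
# it is a hypothetical stand-in, not the project's actual implementation.
def _example_check_period(dt, period):
    """Illustrative only: True if dt lies within [period[0], period[1]]."""
    import datetime
    start, end = period[0], period[1]
    # Promote plain dates so they can be compared against datetimes
    if isinstance(dt, datetime.date) and not isinstance(dt,
                                                        datetime.datetime):
        dt = datetime.datetime.combine(dt, datetime.time.min)
    return start <= dt <= end
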
def _extract_year_hurr_file_path(self, read_all=False):
    data_root_dir = self.SFMR_CONFIG['dirs']['hurr']
    self.year_hurr_file_path = dict()

    for year in self.year_hurr.keys():
        if not len(self.year_hurr[year]):
            continue
        self.year_hurr_file_path[year] = dict()

        for hurr in self.year_hurr[year]:
            spec_data_dir = '{0}{1}/{2}/'.format(data_root_dir, year, hurr)
            try:
                filenames = [
                    f for f in os.listdir(spec_data_dir)
                    if f.endswith('.nc')
                ]
            except FileNotFoundError:
                # Skip hurricanes whose data directory does not exist,
                # otherwise `filenames` would be undefined below
                continue
            if not len(filenames):
                continue

            self.year_hurr_file_path[year][hurr] = []
            for file in filenames:
                # The date is encoded in the 8 characters right after
                # 'SFMR' in the file name
                date_ = datetime.datetime.strptime(
                    file.split('SFMR')[1][:8] + '000000',
                    '%Y%m%d%H%M%S').date()
                if not read_all and utils.check_period(date_, self.period):
                    self.year_hurr_file_path[year][hurr].append(
                        spec_data_dir + file)
                if read_all:
                    self.year_hurr_file_path[year][hurr].append(
                        spec_data_dir + file)
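
# For reference, the date extraction above assumes the file name carries a
# 4-digit-year date right after 'SFMR'.  The helper below is a minimal
# standalone sketch of that parsing step on a hypothetical name such as
# 'NOAA_SFMR20110524US123.nc'; the name and helper are illustrative only.
def _example_sfmr_file_date(filename):
    """Illustrative only: parse the date encoded after 'SFMR'."""
    import datetime
    date_str = filename.split('SFMR')[1][:8]
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()

# e.g. _example_sfmr_file_date('NOAA_SFMR20110524US123.nc')
#      -> datetime.date(2011, 5, 24)
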
def _create_year_tc(self):
    # Structure of dictionary year_tc:
    # year -> list of tc_info (basin, sid and name)
    # Return existing year_tc
    year_tc_path = self.CONFIG['hwind']['pickle']['year_tc']

    # Create year_tc
    year_tc = dict()
    url = self.CONFIG['hwind']['url']
    # Get page according to url
    page = requests.get(url)
    data = page.text
    soup = BeautifulSoup(data, features='lxml')
    mydivs = soup.find_all('div', class_='legacy-year')

    for year in self.years:
        year_tc[year] = []
        if year <= 1994:
            year_name = '1994 & Earlier'
        elif year > 2013:
            continue
        else:
            year_name = str(year)

        try:
            div_year = soup.find_all('div', class_='legacy-year',
                                     text=year_name)[0]
        except Exception as msg:
            self.logger.error(f'Error occurred when extracting year div '
                              f'from {url}: {msg}')
            continue

        strong_basins = div_year.find_parent('div', class_='row').\
                find_next_sibling('div').find_all('strong')
        for item in strong_basins:
            basin = self.CONFIG['hwind']['basin_map'][item.text]
            anchors = item.find_parent('div', class_='col-sm-4').\
                    find_all('a')

            for a in anchors:
                tc_url = a.get('href')
                tc_page = requests.get(tc_url)
                tc_data = tc_page.text
                tc_soup = BeautifulSoup(tc_data, features='lxml')
                gridded_anchors = tc_soup.find_all(
                    'a',
                    text=self.CONFIG['hwind']['data_link_text']['gridded'])
                if not len(gridded_anchors):
                    continue

                url_prefix = self.CONFIG['hwind']['data_url_prefix']
                # Count gridded data files whose datetime is in period
                tc_data_in_period_count = 0
                for g_a in gridded_anchors:
                    if utils.check_period(
                            self._get_dt_of_hwind_file(g_a.get('href')),
                            self.period):
                        tc_data_in_period_count += 1
                if not tc_data_in_period_count:
                    continue

                tc_info = TCInfo(year, basin, a.text)
                self._find_sid(tc_info)
                year_tc[year].append(tc_info)

    os.makedirs(os.path.dirname(year_tc_path), exist_ok=True)
    with open(year_tc_path, 'wb') as file:
        pickle.dump(year_tc, file)

    return year_tc
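
# The comment above mentions returning an existing year_tc, while the method
# as shown always rebuilds the dictionary and re-pickles it to
# `year_tc_path`.  The helper below is a hedged sketch of how a cached copy
# could be reused; it is an assumption for illustration, not code from the
# original source.
def _example_load_cached_year_tc(year_tc_path):
    """Illustrative only: load a previously pickled year_tc if present."""
    import os
    import pickle
    if os.path.exists(year_tc_path):
        with open(year_tc_path, 'rb') as file:
            return pickle.load(file)
    return None
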
def _read_detail(self, basin, region_restriction, vars, storm_num,
                 date_time_num, have_read, info):
    """Read detail of IBTrACS data."""
    total = storm_num
    # List to record all details
    tc_list = []
    IBTrACSTable = self.create_tc_table(basin)
    season_check_offset = self.CONFIG['ibtracs']['season_check_offset']

    for i in range(storm_num):
        print(f'\r{info} {i+1}/{total}', end='')

        # Season is not just the year, so to ensure we skip the loop
        # correctly when checking season, we need to apply an offset
        # to the season check
        if int(vars['season'][i]) < (self.period[0].year
                                     - season_check_offset):
            continue
        if int(vars['season'][i]) > (self.period[1].year
                                     + season_check_offset):
            continue

        # Skip this TC if the datetime of its last record is earlier
        # than the start date of the period, or the datetime of its
        # first record is later than the end date of the period
        iso_times = vars['iso_time'][i]
        not_masked_count = np.count_nonzero(iso_times.count(1))
        if not not_masked_count:
            self.logger.debug(f'Skipping No.{i+1} TC because its '
                              f'iso_time field is all masked')
            continue

        last_iso_time = iso_times[not_masked_count - 1]
        last_datetime = datetime.datetime.strptime(
            last_iso_time.tostring().decode('utf-8'),
            '%Y-%m-%d %H:%M:%S')
        if last_datetime < self.period[0]:
            self.logger.debug(f'Skipping No.{i+1} TC because its '
                              f'last datetime is earlier than the '
                              f'starting datetime of the period: '
                              f'{last_datetime}')
            continue

        first_iso_time = iso_times[0]
        first_datetime = datetime.datetime.strptime(
            first_iso_time.tostring().decode('utf-8'),
            '%Y-%m-%d %H:%M:%S')
        if first_datetime > self.period[1]:
            self.logger.debug(f'Skipping No.{i+1} TC because its '
                              f'first datetime is later than the '
                              f'ending datetime of the period: '
                              f'{first_datetime}')
            continue

        self.logger.debug(f'Reading No.{i+1} TC which lived from '
                          f'{first_datetime} to {last_datetime}')

        sid = vars['sid'][i].tostring().decode('utf-8')
        name = vars['name'][i]
        name = name[name.mask == False].tostring().decode('utf-8')

        for j in range(date_time_num):
            row = IBTrACSTable()

            # Read ISO time and check whether the record is in period
            iso_time = vars['iso_time'][i][j]
            if iso_time[0] is MASKED:
                break
            iso_time_str = iso_time.tostring().decode('utf-8')
            row.date_time = datetime.datetime.strptime(
                iso_time_str, '%Y-%m-%d %H:%M:%S')
            if not utils.check_period(row.date_time, self.period):
                continue

            # Insert rows which have been read into the TC table
            # until the next unread month is found
            # year, month = row.date_time.year, row.date_time.month
            # if not have_read[year][month]:
            #     if len(tc_list):
            #         utils.bulk_insert_avoid_duplicate_unique(
            #             tc_list,
            #             self.CONFIG['database']['batch_size']['insert'],
            #             IBTrACSTable, ['sid_date_time'], self.session,
            #             check_self=True)
            #         tc_list = []
            #     self.logger.debug(f'Reading WMO records of '
            #                       f'{year}-{str(month).zfill(2)}')
            #     have_read[year][month] = True

            # Read basin of TC
            row.basin = vars['basin'][i][j].tostring().decode('utf-8')

            # Read latitude, longitude, minimal central pressure and
            # maximum sustained wind speed from the official WMO agency
            lat = vars['lat'][i][j]
            lon = (vars['lon'][i][j] + 360) % 360
            if lat is MASKED or lon is MASKED:
                continue
            if region_restriction:
                if (lat < self.lat1 or lat > self.lat2
                        or lon < self.lon1 or lon > self.lon2):
                    continue
            pres = vars['wmo_pres'][i][j]
            wind = vars['wmo_wind'][i][j]
            # if pres is MASKED or wind is MASKED:
            #     continue

            # Set attributes of row
            row.sid = sid
            if name != 'NOT_NAMED':
                row.name = name
            row.lat = float(lat)
            row.lon = float(lon)
            row.pres = int(pres) if pres is not MASKED else None
            row.wind = int(wind) if wind is not MASKED else None
            row.sid_date_time = f'{sid}_{row.date_time}'

            # Average radius of 34/50/64 knot winds in four
            # directions (ne, se, sw, nw) from three agencies
            # (bom, reunion, usa)
            dirs = ['ne', 'se', 'sw', 'nw']
            radii = dict()
            for r in ['r34', 'r50', 'r64']:
                radii[r] = dict()
                for d in range(4):
                    radii[r][d] = []
                    for a in ['bom', 'reunion', 'usa']:
                        r_d_a = vars[f'{a}_{r}'][i][j][d]
                        if r_d_a is not MASKED:
                            radii[r][d].append(int(r_d_a))
                    if len(radii[r][d]):
                        setattr(row, f'{r}_{dirs[d]}',
                                int(sum(radii[r][d]) / len(radii[r][d])))

            tc_list.append(row)

    if len(tc_list):
        utils.bulk_insert_avoid_duplicate_unique(
            tc_list,
            self.CONFIG['database']['batch_size']['insert'],
            IBTrACSTable, ['sid_date_time'], self.session,
            check_self=True)

    utils.delete_last_lines()
    print('Done')
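
# The multi-agency radii averaging above can be hard to follow inside the
# nested loops.  The standalone sketch below shows the same idea for a
# single wind threshold and quadrant: average whatever agency values are
# available and report nothing when no agency has a value.  The function
# name and the use of None instead of masked values are hypothetical.
def _example_average_radii(agency_values):
    """Illustrative only: average available (non-None) radii, else None."""
    available = [int(v) for v in agency_values if v is not None]
    if not available:
        return None
    return int(sum(available) / len(available))

# e.g. _example_average_radii([30, None, 40]) -> 35
#      _example_average_radii([None, None, None]) -> None
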
def get_one_hurricane_brief_info(self, hurricane_sfmr_url):
    brief_info = []

    try:
        page = requests.get(hurricane_sfmr_url)
        data = page.text
        soup = bs4.BeautifulSoup(data, features='lxml')
        anchors = soup.find_all('a')
        filename_suffix = '.nc'
    except Exception as msg:
        exit(msg)

    possible_names = self.SFMR_CONFIG['possible_names']
    error_files = self.SFMR_CONFIG['error_files']
    wrong_name_correction = self.SFMR_CONFIG['files_with_wrong_name']

    for link in anchors:
        href = link.get('href')
        # Only consider hrefs of NetCDF files
        if href.endswith(filename_suffix):
            try:
                split_name = None
                # Extract file name
                filename = href.split('/')[-1]
                if filename in error_files:
                    self.logger.warning(f'[Skip] Error file {href}')
                    continue
                if href in wrong_name_correction.keys():
                    filename = wrong_name_correction[href]
                    self.logger.warning(
                        f"""[Correct] wrong name from """
                        f"""{href.split('/')[-1]} to {filename}""")

                for name in possible_names:
                    if name in filename:
                        tail_half = filename.split(name)[1]
                        split_name = name
                        break

                # There may be a NetCDF name format like
                # 'USAF_SFMR0809221638.nc' from
                # 'https://www.aoml.noaa.gov/hrd'
                # '/Storm_pages/kyle2008/sfmr.html'.
                # It is very annoying and there seems to be no simple
                # rule for this problem, because it is hard to
                # distinguish 'SFMR20110536' from 'SFMR20110524':
                # the first one is a case like kyle2008 whose actual
                # date is 2020/11/05, while the second one is a
                # normal case whose actual date is 2011/05/24.
                # Before 2020, the following rule may work.
                if (tail_half.startswith('20')
                        or tail_half.startswith('199')):
                    date_str = tail_half[:8]
                    date_ = datetime.date(int(date_str[:4]),
                                          int(date_str[4:6]),
                                          int(date_str[6:]))
                else:
                    date_str = tail_half[:6]
                    date_ = datetime.date(int(f'20{date_str[:2]}'),
                                          int(date_str[2:4]),
                                          int(date_str[4:]))
                    # Insert the century into the file name so that it
                    # matches the 4-digit-year format
                    filename = (f"""{filename.split(split_name)[0]}"""
                                f"""{split_name}20"""
                                f"""{filename.split(split_name)[1]}""")
            except Exception as msg:
                exit(msg)

            if not utils.check_period(date_, self.period):
                continue

            info = SFMRDetail()
            info.hurr_name = hurricane_sfmr_url.split('/')[-2][:-4]
            info.filename = filename.replace(split_name, 'SFMR')
            info.file_url = href
            brief_info.append(info)

    return brief_info
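
# The branchy date heuristic above is easier to see on concrete names.  The
# sketch below replays the same rule on the part of the file name after
# 'SFMR': a 4-digit-year tail such as '20110524...' and a 2-digit-year tail
# such as '0809221638' (as in 'USAF_SFMR0809221638.nc' on the kyle2008
# page).  The helper name is hypothetical.
def _example_sfmr_date_heuristic(tail_half):
    """Illustrative only: parse the date from the part after 'SFMR'."""
    import datetime
    if tail_half.startswith('20') or tail_half.startswith('199'):
        # 4-digit year: 'yyyymmdd...'
        return datetime.date(int(tail_half[:4]), int(tail_half[4:6]),
                             int(tail_half[6:8]))
    # 2-digit year: 'yymmdd...', assumed to fall in the 2000s
    return datetime.date(int(f'20{tail_half[:2]}'), int(tail_half[2:4]),
                         int(tail_half[4:6]))

# e.g. _example_sfmr_date_heuristic('20110524US123') -> 2011-05-24
#      _example_sfmr_date_heuristic('0809221638')    -> 2008-09-22
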
def _download_sfmr_data(self):
    """Download SFMR data of hurricanes.

    Parameters
    ----------
    None
        Nothing is required by this function.

    Returns
    -------
    None.

    """
    self.logger.info(self.SFMR_CONFIG['prompt']['info']['download_hurr'])
    utils.set_format_custom_text(self.SFMR_CONFIG['data_name_length'])
    suffix = '.nc'
    save_root_dir = self.SFMR_CONFIG['dirs']['hurr']
    os.makedirs(save_root_dir, exist_ok=True)

    total = 0
    count = 0
    for year in self.year_hurr.keys():
        total += len(self.year_hurr[year])

    for year in self.year_hurr.keys():
        hurrs = list(self.year_hurr[year])

        for hurr in hurrs:
            count += 1
            info = (f'Download SFMR data of hurricane {hurr} '
                    f'in {year}')
            self.logger.debug(info)
            if count > 1:
                utils.delete_last_lines()
            print(f'\r{info} ({count}/{total})', end='')

            # Create directory to store SFMR files
            dir_path = f'{save_root_dir}{year}/{hurr}/'
            os.makedirs(dir_path, exist_ok=True)

            # Generate keyword used to compose the url
            keyword = f'{hurr}{year}'
            url = (f'{self.SFMR_CONFIG["urls"]["prefix"]}'
                   f'{keyword}'
                   f'{self.SFMR_CONFIG["urls"]["suffix"]}')

            # Get page according to url
            page = requests.get(url)
            data = page.text
            soup = bs4.BeautifulSoup(data, features='lxml')
            anchors = soup.find_all('a')

            for link in anchors:
                href = link.get('href')
                # Find href of NetCDF file
                if href.endswith(suffix):
                    # Extract file name
                    filename = href.split('/')[-1]
                    tail_half = filename.split('SFMR')[1]
                    try:
                        # There may be a NetCDF name format like
                        # 'USAF_SFMR0809221638.nc' from
                        # 'https://www.aoml.noaa.gov/hrd'
                        # '/Storm_pages/kyle2008/sfmr.html'.
                        # It is very annoying and there seems to be
                        # no simple rule for this problem, because it
                        # is hard to distinguish 'SFMR20110536' from
                        # 'SFMR20110524': the first one is a case
                        # like kyle2008 whose actual date is
                        # 2020/11/05, while the second one is a
                        # normal case whose actual date is
                        # 2011/05/24.
                        # Before 2020, the following rule may work.
                        if (tail_half.startswith('20')
                                or tail_half.startswith('199')):
                            date_str = tail_half[:8]
                            date_ = datetime.date(int(date_str[:4]),
                                                  int(date_str[4:6]),
                                                  int(date_str[6:]))
                        else:
                            date_str = tail_half[:6]
                            date_ = datetime.date(
                                int(f'20{date_str[:2]}'),
                                int(date_str[2:4]),
                                int(date_str[4:]))
                            # Insert the century into the file name
                            # so that it matches the 4-digit-year
                            # format
                            filename = (
                                f'{filename.split("SFMR")[0]}SFMR20'
                                f'{filename.split("SFMR")[1]}')
                    except Exception as msg:
                        exit(msg)

                    if not utils.check_period(date_, self.period):
                        continue
                    file_path = dir_path + filename
                    utils.download(href, file_path)

    utils.delete_last_lines()
    print('Done')