Example #1
    def _download_single_tc(self, year, tc, tc_dir, data_link_text):
        url = self.CONFIG['hwind']['url']
        tc_url = f'{url}/{tc.name.lower()}{year}.html'
        tc_dir = f'{tc_dir}{tc.basin}/{year}/{tc.name}/'
        os.makedirs(tc_dir, exist_ok=True)

        page = requests.get(tc_url)
        data = page.text
        soup = BeautifulSoup(data, features='lxml')
        anchors = soup.find_all('a', text=data_link_text)
        if not len(anchors):
            return False

        url_prefix = self.CONFIG['hwind']['data_url_prefix']
        for a in anchors:
            href = a.get("href")
            filename = href.split('/')[-1]
            file_url = f'{url_prefix}{href}'
            file_path = f'{tc_dir}{filename}'

            if not utils.check_period(self._get_dt_of_hwind_file(href),
                                      self.period):
                continue

            utils.download(file_url, file_path)

        return True
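
All of these examples gate downloads and reads with utils.check_period, which is never shown here. A minimal sketch of what such a helper might look like, assuming self.period is an inclusive (start, end) pair of datetime objects and that plain dates are compared against the dates of those bounds:

    import datetime

    def check_period(temporal, period):
        # Hypothetical re-implementation for illustration only; the real
        # utils.check_period is not part of these examples.
        start, end = period
        if (isinstance(temporal, datetime.date)
                and not isinstance(temporal, datetime.datetime)):
            start, end = start.date(), end.date()
        return start <= temporal <= end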
Example #2
    def _extract_year_hurr_file_path(self, read_all=False):
        data_root_dir = self.SFMR_CONFIG['dirs']['hurr']
        self.year_hurr_file_path = dict()

        for year in self.year_hurr.keys():
            if not len(self.year_hurr[year]):
                continue
            self.year_hurr_file_path[year] = dict()

            for hurr in self.year_hurr[year]:
                spec_data_dir = '{0}{1}/{2}/'.format(data_root_dir, year, hurr)
                try:
                    filenames = [
                        f for f in os.listdir(spec_data_dir)
                        if f.endswith('.nc')
                    ]
                except FileNotFoundError:
                    # Skip hurricanes whose data directory is missing;
                    # a bare `pass` here would leave `filenames`
                    # undefined on the next line
                    continue
                if not len(filenames):
                    continue

                self.year_hurr_file_path[year][hurr] = []

                for file in filenames:
                    date_ = datetime.datetime.strptime(
                        file.split('SFMR')[1][:8] + '000000',
                        '%Y%m%d%H%M%S').date()

                    if read_all or utils.check_period(date_, self.period):
                        self.year_hurr_file_path[year][hurr].append(
                            spec_data_dir + file)
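
After _extract_year_hurr_file_path runs, self.year_hurr_file_path maps year -> hurricane name -> list of NetCDF file paths. An illustration of the resulting shape (the year, hurricane name and file name below are made up):

    year_hurr_file_path = {
        2011: {
            'IRENE': [
                '/path/to/sfmr/2011/IRENE/SFMR20110824.nc',
            ],
        },
    }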
Example #3
    def _create_year_tc(self):
        # Structure of dictionary year_tc
        # year - tc_info (basin, sid and name)

        # Path of the pickle file that caches year_tc
        year_tc_path = self.CONFIG['hwind']['pickle']['year_tc']

        # Create year_tc
        year_tc = dict()
        url = self.CONFIG['hwind']['url']

        # Get page according to url
        page = requests.get(url)
        data = page.text
        soup = BeautifulSoup(data, features='lxml')
        mydivs = soup.find_all('div', class_='legacy-year')

        for year in self.years:
            year_tc[year] = []
            if year <= 1994:
                year_name = '1994 &amp; Earlier'
            elif year > 2013:
                continue
            else:
                year_name = str(year)
            try:
                div_year = soup.find_all('div',
                                         class_='legacy-year',
                                         text=year_name)[0]
            except Exception as msg:
                self.logger.error(
                    f'Error occurred when extracting year div from {url}')
                continue
            strong_basins = div_year.find_parent('div', class_='row').\
                    find_next_sibling('div').find_all('strong')
            for item in strong_basins:
                basin = self.CONFIG['hwind']['basin_map'][item.text]
                anchors = item.find_parent('div', class_='col-sm-4').\
                        find_all('a')
                for a in anchors:
                    tc_url = a.get('href')
                    tc_page = requests.get(tc_url)
                    tc_data = tc_page.text
                    tc_soup = BeautifulSoup(tc_data, features='lxml')

                    gridded_anchors = tc_soup.find_all(
                        'a',
                        text=self.CONFIG['hwind']['data_link_text']['gridded'])
                    if not len(gridded_anchors):
                        continue

                    url_prefix = self.CONFIG['hwind']['data_url_prefix']
                    tc_data_in_period_count = 0
                    for g_a in gridded_anchors:
                        if utils.check_period(
                                self._get_dt_of_hwind_file(g_a.get('href')),
                                self.period):
                            tc_data_in_period_count += 1

                    if not tc_data_in_period_count:
                        continue

                    tc_info = TCInfo(year, basin, a.text)
                    self._find_sid(tc_info)

                    year_tc[year].append(tc_info)

        os.makedirs(os.path.dirname(year_tc_path), exist_ok=True)
        with open(year_tc_path, 'wb') as file:
            pickle.dump(year_tc, file)

        return year_tc
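
Because year_tc is pickled at the end, later runs can reload it instead of re-crawling the HWind index page. A short sketch, assuming the same CONFIG path and the TCInfo attributes (basin, sid, name) used above:

    import pickle

    # Assumes this runs inside the same class, so self.CONFIG is available.
    with open(self.CONFIG['hwind']['pickle']['year_tc'], 'rb') as file:
        year_tc = pickle.load(file)

    for year, tc_infos in year_tc.items():
        for tc in tc_infos:
            print(year, tc.basin, tc.sid, tc.name)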
Example #4
File: ibtracs.py  Project: Neo-101/R2S
    def _read_detail(self, basin, region_restriction, vars, storm_num,
                     date_time_num, have_read, info):
        """Read detail of IBTrACS data.

        """
        total = storm_num
        # List to record all details
        tc_list = []
        IBTrACSTable = self.create_tc_table(basin)

        season_check_offset = self.CONFIG['ibtracs']['season_check_offset']
        for i in range(storm_num):
            print(f'\r{info} {i+1}/{total}', end='')
            # The season is not always the same as the calendar year,
            # so apply an offset on both ends when filtering storms by
            # season to avoid skipping them incorrectly
            if int(vars['season'][i]) < (self.period[0].year -
                                         season_check_offset):
                continue
            if int(vars['season'][i]) > (self.period[1].year +
                                         season_check_offset):
                continue

            # Skip this storm if its last record is earlier than the
            # start of the period, or its first record is later than
            # the end of the period
            iso_times = vars['iso_time'][i]
            not_masked_count = np.count_nonzero(iso_times.count(1))

            if not not_masked_count:
                self.logger.debug((f'Skipping No.{i+1} TC because its ' +
                                   f'iso_time field is all masked'))
                continue

            last_iso_time = iso_times[not_masked_count - 1]
            last_datetime = datetime.datetime.strptime(
                last_iso_time.tostring().decode('utf-8'), '%Y-%m-%d %H:%M:%S')
            if last_datetime < self.period[0]:
                self.logger.debug(
                    (f'Skipping No.{i+1} TC because its ' +
                     f'last datetime is earlier than ' +
                     f'starting datetime of period: ' + f'{last_datetime}'))
                continue

            first_iso_time = iso_times[0]
            first_datetime = datetime.datetime.strptime(
                first_iso_time.tostring().decode('utf-8'), '%Y-%m-%d %H:%M:%S')
            if first_datetime > self.period[1]:
                self.logger.debug(
                    (f'Skipping No.{i+1} TC because its ' +
                     f'first datetime is later than ' +
                     f'ending datetime of period: ' + f'{first_datetime}'))
                continue

            self.logger.debug((f'Reading No.{i+1} TC which lived from ' +
                               f'{first_datetime} to {last_datetime}'))

            sid = vars['sid'][i].tostring().decode('utf-8')
            name = vars['name'][i]
            name = name[name.mask == False].tostring().decode('utf-8')

            for j in range(date_time_num):
                row = IBTrACSTable()

                # Read ISO time and check whether record is in period
                iso_time = vars['iso_time'][i][j]
                if iso_time[0] is MASKED:
                    break

                iso_time_str = iso_time.tostring().decode('utf-8')
                row.date_time = datetime.datetime.strptime(
                    iso_time_str, '%Y-%m-%d %H:%M:%S')
                if not utils.check_period(row.date_time, self.period):
                    continue

                # Insert rows which have read to TC table until
                # find next unread month
                # year, month = row.date_time.year, row.date_time.month
                # if not have_read[year][month]:
                #     if len(tc_list):
                #         utils.bulk_insert_avoid_duplicate_unique(
                #             tc_list, self.CONFIG['database']\
                #             ['batch_size']['insert'],
                #             IBTrACSTable, ['sid_date_time'], self.session,
                #             check_self=True)
                #         tc_list = []
                #     self.logger.debug((f'Reading WMO records of '
                #                       + f'{year}-{str(month).zfill(2)}'))
                #     have_read[year][month] = True

                # Read basin of TC
                row.basin = vars['basin'][i][j].tostring().decode('utf-8')

                # Read latitude, longitude, minimum central pressure and
                # maximum sustained wind speed from the official WMO agency
                lat = vars['lat'][i][j]
                lon = (vars['lon'][i][j] + 360) % 360
                # breakpoint()
                if lat is MASKED or lon is MASKED:
                    continue
                if region_restriction:
                    if (lat < self.lat1 or lat > self.lat2 or lon < self.lon1
                            or lon > self.lon2):
                        continue

                pres = vars['wmo_pres'][i][j]
                wind = vars['wmo_wind'][i][j]
                # if pres is MASKED or wind is MASKED:
                #     continue

                # Set attributes of row
                row.sid = sid
                if name != 'NOT_NAMED':
                    row.name = name
                row.lat = float(lat)
                row.lon = float(lon)
                row.pres = int(pres) if pres is not MASKED else None
                row.wind = int(wind) if wind is not MASKED else None
                row.sid_date_time = f'{sid}_{row.date_time}'

                # Average radius of 34/50/64 knot winds in four
                # directions (ne, se, sw, nw) from three agencies
                # (bom, reunion, usa)
                dirs = ['ne', 'se', 'sw', 'nw']
                radii = dict()
                for r in ['r34', 'r50', 'r64']:
                    radii[r] = dict()
                    for d in range(4):
                        radii[r][d] = []
                        for a in ['bom', 'reunion', 'usa']:
                            r_d_a = vars[f'{a}_{r}'][i][j][d]
                            if r_d_a is not MASKED:
                                radii[r][d].append(int(r_d_a))
                        if len(radii[r][d]):
                            setattr(row, f'{r}_{dirs[d]}',
                                    int(sum(radii[r][d]) / len(radii[r][d])))

                tc_list.append(row)
                # breakpoint()

        if len(tc_list):
            utils.bulk_insert_avoid_duplicate_unique(
                tc_list,
                self.CONFIG['database']['batch_size']['insert'],
                IBTrACSTable, ['sid_date_time'], self.session,
                check_self=True)

        utils.delete_last_lines()
        print('Done')
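
The wind-radii block near the end averages each quadrant's radius over whichever of the three agencies (bom, reunion, usa) report it. A standalone illustration of that averaging with made-up values, assuming MASKED is numpy.ma.masked as in the function above:

    import numpy.ma as ma

    MASKED = ma.masked
    reports = {'bom': MASKED, 'reunion': 60, 'usa': 70}  # r34, 'ne' quadrant

    values = [int(v) for v in reports.values() if v is not MASKED]
    r34_ne = int(sum(values) / len(values)) if values else None
    print(r34_ne)  # 65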
Example #5
    def get_one_hurricane_brief_info(self, hurricane_sfmr_url):
        brief_info = []

        try:
            page = requests.get(hurricane_sfmr_url)
            data = page.text
            soup = bs4.BeautifulSoup(data, features='lxml')
            anchors = soup.find_all('a')
            filename_suffix = '.nc'
        except Exception as msg:
            breakpoint()
            exit(msg)

        possible_names = self.SFMR_CONFIG['possible_names']
        error_files = self.SFMR_CONFIG['error_files']
        wrong_name_correction = self.SFMR_CONFIG['files_with_wrong_name']

        for link in anchors:
            href = link.get('href')
            # Find href of netcdf file
            if href.endswith(filename_suffix):
                try:
                    split_name = None
                    # Extract file name
                    filename = href.split('/')[-1]
                    if filename in error_files:
                        self.logger.warning(f'[Skip] Error file {href}')
                        continue
                    if href in wrong_name_correction.keys():
                        filename = wrong_name_correction[href]
                        self.logger.warning(
                            f"""[Correct] wrong name from """
                            f"""{href.split('/')[-1]} to {filename}""")

                    for name in possible_names:
                        if name in filename:
                            tail_half = filename.split(name)[1]
                            split_name = name
                            break
                    # There may be NetCDF names in a format like
                    # 'USAF_SFMR0809221638.nc' from
                    # 'https://www.aoml.noaa.gov/hrd'
                    # '/Storm_pages/kyle2008/sfmr.html'.
                    # This is annoying and there seems to be no simple
                    # rule for detecting it, because it is hard to
                    # distinguish 'SFMR20110536' from 'SFMR20110524'.
                    # The first is the kyle2008-style case and its
                    # actual date is 2020/11/05; the second is a normal
                    # case and its actual date is 2011/05/24.
                    # Before 2020, the following rule should work.
                    if (tail_half.startswith('20')
                            or tail_half.startswith('199')):
                        date_str = tail_half[:8]
                        date_ = datetime.date(int(date_str[:4]),
                                              int(date_str[4:6]),
                                              int(date_str[6:]))
                    else:
                        date_str = tail_half[:6]
                        date_ = datetime.date(int(f'20{date_str[:2]}'),
                                              int(date_str[2:4]),
                                              int(date_str[4:]))
                        filename = (f"""{filename.split(split_name)[0]}"""
                                    f"""{split_name}20"""
                                    f"""{filename.split(split_name)[1]}""")
                except Exception as msg:
                    breakpoint()
                    exit(msg)
                if not utils.check_period(date_, self.period):
                    continue

                info = SFMRDetail()

                info.hurr_name = hurricane_sfmr_url.split('/')[-2][:-4]
                info.filename = filename.replace(split_name, 'SFMR')
                info.file_url = href

                brief_info.append(info)

        return brief_info
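
The date heuristic inside the try block (a tail starting with '20' or '199' carries a four-digit year, otherwise a two-digit year with an implied '20' century) can be pulled out and tested on its own. A hedged sketch with a hypothetical helper name:

    import datetime

    def parse_sfmr_date(tail_half):
        # Hypothetical helper mirroring the rule used above.
        if tail_half.startswith('20') or tail_half.startswith('199'):
            date_str = tail_half[:8]                      # yyyymmdd
            return datetime.date(int(date_str[:4]), int(date_str[4:6]),
                                 int(date_str[6:]))
        date_str = tail_half[:6]                          # yymmdd
        return datetime.date(int(f'20{date_str[:2]}'), int(date_str[2:4]),
                             int(date_str[4:]))

    print(parse_sfmr_date('20110524US001.nc'))  # 2011-05-24
    print(parse_sfmr_date('0809221638.nc'))     # 2008-09-22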
Example #6
    def _download_sfmr_data(self):
        """Download SFMR data of hurricanes.

        Parameters
        ----------
        None
            Nothing is required by this function.

        Returns
        -------
        None
            NetCDF files whose dates fall within the period are
            downloaded as a side effect; nothing is returned.

        """
        self.logger.info(
            self.SFMR_CONFIG['prompt']['info']['download_hurr'])
        utils.set_format_custom_text(self.SFMR_CONFIG['data_name_length'])
        suffix = '.nc'
        save_root_dir = self.SFMR_CONFIG['dirs']['hurr']
        os.makedirs(save_root_dir, exist_ok=True)

        total = 0
        count = 0
        for year in self.year_hurr.keys():
            total += len(self.year_hurr[year])

        for year in self.year_hurr.keys():
            hurrs = list(self.year_hurr[year])
            for hurr in hurrs:
                count += 1
                info = (f'Download SFMR data of hurricane {hurr} ' +
                        f'in {year}')
                self.logger.debug(info)
                if count > 1:
                    utils.delete_last_lines()
                print(f'\r{info} ({count}/{total})', end='')

                # Create directory to store SFMR files
                dir_path = f'{save_root_dir}{year}/{hurr}/'
                os.makedirs(dir_path, exist_ok=True)
                # Generate keyword used to compose the URL
                keyword = f'{hurr}{year}'
                url = (f'{self.SFMR_CONFIG["urls"]["prefix"]}{keyword}'
                       f'{self.SFMR_CONFIG["urls"]["suffix"]}')
                # Get page according to url
                page = requests.get(url)
                data = page.text
                soup = bs4.BeautifulSoup(data, features='lxml')
                anchors = soup.find_all('a')

                # Download NetCDF files whose dates fall within the period
                for link in anchors:
                    href = link.get('href')
                    # Find href of netcdf file
                    if href.endswith(suffix):
                        # Extract file name
                        filename = href.split('/')[-1]
                        tail_half = filename.split('SFMR')[1]
                        try:
                            # There may be NetCDF names in a format like
                            # 'USAF_SFMR0809221638.nc' from
                            # 'https://www.aoml.noaa.gov/hrd'
                            # '/Storm_pages/kyle2008/sfmr.html'.
                            # This is annoying and there seems to be no
                            # simple rule for detecting it, because it
                            # is hard to distinguish 'SFMR20110536'
                            # from 'SFMR20110524'. The first is the
                            # kyle2008-style case and its actual date
                            # is 2020/11/05; the second is a normal
                            # case and its actual date is 2011/05/24.
                            # Before 2020, the following rule should work.
                            if (tail_half.startswith('20')
                                    or tail_half.startswith('199')):
                                date_str = tail_half[:8]
                                date_ = datetime.date(int(date_str[:4]),
                                                      int(date_str[4:6]),
                                                      int(date_str[6:]))
                            else:
                                date_str = tail_half[:6]
                                date_ = datetime.date(int(f'20{date_str[:2]}'),
                                                      int(date_str[2:4]),
                                                      int(date_str[4:]))
                                filename = (
                                    f'{filename.split("SFMR")[0]}SFMR20' +
                                    f'{filename.split("SFMR")[1]}')
                        except Exception as msg:
                            breakpoint()
                            exit(msg)
                        if not utils.check_period(date_, self.period):
                            continue
                        file_path = dir_path + filename

                        utils.download(href, file_path)

        utils.delete_last_lines()
        print('Done')
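
The per-hurricane page URL is built from a configured prefix and suffix around a '{hurricane}{year}' keyword. An illustration with assumed prefix and suffix values (the real ones come from self.SFMR_CONFIG['urls'] and are not shown in these examples); the result matches the URL mentioned in the comments above:

    prefix = 'https://www.aoml.noaa.gov/hrd/Storm_pages/'  # assumed value
    suffix = '/sfmr.html'                                  # assumed value
    hurr, year = 'kyle', 2008
    url = f'{prefix}{hurr}{year}{suffix}'
    # -> 'https://www.aoml.noaa.gov/hrd/Storm_pages/kyle2008/sfmr.html'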