def _open_data_file(url, data_dir):
    """Return an open (binary) file handle for a data file, downloading it
    if necessary or otherwise reusing a previously downloaded copy.
    """
    local_path = os.path.join(data_dir, url.rsplit('/', 1)[-1])
    return util.open_file_for_url(
        url, local_path, check_modified=True, use_bytes=True)
def _open_data_file(url, data_dir):
    """Return an open file handle for a data file, downloading it if
    necessary or otherwise reusing a previously downloaded copy.
    """
    local_path = os.path.join(data_dir, url.rsplit('/', 1)[-1])
    return util.open_file_for_url(url, local_path, check_modified=True)
def _get_element_data(element, by_state, use_file, location_names):
    """Download (or reuse) the data file for *element* and parse it into a
    dataframe of values.
    """
    url = _get_url(element, by_state)
    cache_path = os.path.join(CIRS_DIR, url.rsplit('/', 1)[-1])
    with util.open_file_for_url(url, cache_path, use_file=use_file) as data_file:
        return _parse_values(data_file, by_state, location_names, element)
def get_stations():
    """Return a dict of available stations, scraped from the station-list
    page's ``fld_station`` select element.
    """
    cache_path = os.path.join(USACE_RIVERGAGES_DIR,
                              'datamining_field_list.cfm')
    with util.open_file_for_url(URL, cache_path, use_bytes=True) as response:
        option_tags = BeautifulSoup(response).find(
            'select', id='fld_station').find_all('option')
    return _parse_options(option_tags)
def get_stations():
    """Return a dict of available stations, scraped from the station-list
    page's ``fld_station`` select element.
    """
    cache_path = os.path.join(USACE_RIVERGAGES_DIR,
                              'datamining_field_list.cfm')
    with util.open_file_for_url(URL, cache_path) as response:
        option_tags = BeautifulSoup(response).find(
            'select', id='fld_station').find_all('option')
    return _parse_options(option_tags)
def get_stations(fips=None, country=None, state=None, start=None, end=None,
                 update=True):
    """Retrieve information on the set of available GSOD stations.

    Parameters
    ----------
    fips : {``None``, str, or iterable}
        If specified, results will be limited to stations with matching fips
        codes.
    country : {``None``, str, or iterable}
        If specified, results will be limited to stations with matching
        country codes.
    state : {``None``, str, or iterable}
        If specified, results will be limited to stations with matching state
        codes.
    start : ``None`` or date (see :ref:`dates-and-times`)
        If specified, results will be limited to stations which have data
        after this start date.
    end : ``None`` or date (see :ref:`dates-and-times`)
        If specified, results will be limited to stations which have data
        before this end date.
    update : bool
        Accepted for interface compatibility.
        NOTE(review): this argument is currently unused by the visible code —
        confirm whether it should be forwarded to the download helper.

    Returns
    -------
    stations_dict : dict
        A dict with station codes keyed to station information dicts.
    """
    start_date = util.convert_date(start) if start else None
    end_date = util.convert_date(end) if end else None

    # allow single codes to be passed as bare strings
    if isinstance(fips, basestring):
        fips = [fips]
    if isinstance(country, basestring):
        country = [country]
    if isinstance(state, basestring):
        state = [state]

    stations_url = 'http://www1.ncdc.noaa.gov/pub/data/gsod/ish-history.csv'
    with util.open_file_for_url(stations_url, NCDC_GSOD_STATIONS_FILE) as f:
        reader = csv.DictReader(f)

        if fips is None and country is None and state is None \
                and start is None and end is None:
            # no filters requested: take every row
            rows = reader
        else:
            start_str = start_date.strftime('%Y%m%d') if start_date else None
            end_str = end_date.strftime('%Y%m%d') if end_date else None
            rows = [
                row for row in reader
                if _passes_row_filter(row, fips=fips, country=country,
                                      state=state, start_str=start_str,
                                      end_str=end_str)
            ]

        # materialize the dict inside the with block — reader is lazy and
        # needs the file to still be open
        stations = dict(
            (_station_code(row), _process_station(row)) for row in rows)
    return stations
def get_stations():
    """Fetches a list of station codes and descriptions.

    Returns
    -------
    stations_dict : dict
        a python dict with station codes mapped to station information
    """
    stations_url = 'http://www.swt-wc.usace.army.mil/shefids.htm'
    cache_path = os.path.join(USACE_SWTWC_DIR, 'shefids.htm')
    with util.open_file_for_url(stations_url, cache_path) as response:
        # the station list lives inside a single <pre> element as links
        anchors = BeautifulSoup(response).find('pre').find_all('a')
    parsed = [_parse_station_link(anchor) for anchor in anchors]
    return dict((station['code'], station) for station in parsed)
def get_station_data(station_code, date=None, as_dataframe=False):
    """Fetches data for a station at a given date.

    Parameters
    ----------
    station_code: str
        The station code to fetch data for. A list of stations can be
        retrieved with ``get_stations()``
    date : ``None`` or date (see :ref:`dates-and-times`)
        The date of the data to be queried. If date is ``None`` (default),
        then data for the current day is retreived.
    as_dataframe : bool
        This determines what format values are returned as. If ``False``
        (default), the values dict will be a dict with timestamps as keys
        mapped to a dict of gauge variables and values. If ``True`` then
        the values dict will be a pandas.DataFrame object containing the
        equivalent information.

    Returns
    -------
    data_dict : dict
        A dict containing station information and values.
    """
    station_dict = {}
    if date is None:
        # the site serves today's data under a special 'current' page name;
        # year is needed later because timestamps in the report omit it
        date_str = 'current'
        year = datetime.date.today().year
    else:
        date = util.convert_date(date)
        date_str = date.strftime('%Y%m%d')
        year = date.year
    filename = '%s.%s.html' % (station_code, date_str)
    data_url = 'http://www.swt-wc.usace.army.mil/webdata/gagedata/' + filename
    path = os.path.join(USACE_SWTWC_DIR, filename)
    with util.open_file_for_url(data_url, path) as f:
        soup = BeautifulSoup(f)
        # all report content is inside a single <pre> tag; absence means the
        # station/date combination has no data
        pre = soup.find('pre')
        if pre is None:
            error_msg = 'no data could be found for station code %(station_code)s and date %(date)s (url: %(data_url)s)' % {
                'date': date,
                'data_url': data_url,
                'station_code': station_code,
            }
            raise ValueError(error_msg)
        # copy the report text into a StringIO so it can be consumed
        # line-by-line like a file
        sio = StringIO.StringIO(str(pre.text.strip()))

        # line 1: station code followed by a free-text description
        # (first 8 characters are a label prefix and are skipped)
        first_line = sio.readline()
        split = first_line[8:].strip().split()
        station_dict['code'] = split[0]
        station_dict['description'] = ' '.join(split[1:])

        # line 2: "<label>: <station type>"
        second_line = sio.readline()
        station_dict['station_type'] = second_line.strip().split(':')[1].strip()

        # subsequent "key: value" lines are free-form notes; the notes block
        # ends at the first line without a colon
        notes = []
        while 1:
            next_line = sio.readline()
            if ':' in next_line:
                notes.append(next_line.strip())
            else:
                break
        if len(notes):
            station_dict['notes'] = '\n'.join(notes)

        # three header rows describe the data columns (names, units, data
        # sources); each is fixed-width with a 15-char row label followed by
        # 10-char columns
        variable_names = _split_line(sio.readline()[15:], 10)
        variable_units = _split_line(sio.readline()[15:], 10)
        variable_sources = _split_line(sio.readline()[15:], 10)
        station_dict['variables'] = dict([(name, {
            'unit': unit,
            'source': source
        }) for name, unit, source in zip(variable_names, variable_units,
                                         variable_sources)])

        # timezone appears on its own line wrapped in parentheses
        station_dict['timezone'] = sio.readline().strip().strip('()')

        # remaining lines are fixed-width data rows: 15-char datetime column
        # then one 10-char column per variable; '----' marks missing values
        column_names = ['datetime'] + variable_names
        widths = [15] + ([10] * len(variable_names))
        converters = dict([(variable_name,
                            lambda x: float(x) if x != '----' else np.nan)
                           for variable_name in variable_names])
        # timestamps in the report lack a year, so attach the one determined
        # above
        date_parser = lambda x: _convert_datetime(x, year)
        dataframe = pandas.read_fwf(
            sio, names=column_names, widths=widths, index_col=['datetime'],
            na_values=['----'], converters=converters, parse_dates=True,
            date_parser=date_parser)

        # parse out rows that are all nans (e.g. end of "current" page)
        dataframe = dataframe[~np.isnan(dataframe.T.sum())]

        if as_dataframe:
            station_dict['values'] = dataframe
        else:
            station_dict['values'] = util.dict_from_dataframe(dataframe)

    return station_dict
def get_stations(country=None, state=None, start=None, end=None, update=True):
    """Retrieve information on the set of available stations.

    Parameters
    ----------
    country : {``None``, str, or iterable}
        If specified, results will be limited to stations with matching
        country codes.
    state : {``None``, str, or iterable}
        If specified, results will be limited to stations with matching state
        codes.
    start : ``None`` or date (see :ref:`dates-and-times`)
        If specified, results will be limited to stations which have data
        after this start date.
    end : ``None`` or date (see :ref:`dates-and-times`)
        If specified, results will be limited to stations which have data
        before this end date.
    update : bool
        If ``True`` (default), check for a newer copy of the stations file
        and download if it is newer the previously downloaded copy. If
        ``False``, then a new stations file will only be downloaded if a
        previously downloaded file cannot be found.
        NOTE(review): this argument is currently unused by the visible code —
        confirm whether it should be forwarded to the download helper.

    Returns
    -------
    stations_dict : dict
        A dict with USAF-WBAN codes keyed to station information dicts.
    """
    start_date = util.convert_date(start) if start else None
    end_date = util.convert_date(end) if end else None

    # allow single codes to be passed as bare strings
    if isinstance(country, basestring):
        country = [country]
    if isinstance(state, basestring):
        state = [state]

    stations_url = 'http://www1.ncdc.noaa.gov/pub/data/noaa/isd-history.csv'
    # NOTE(review): the ISD history file is cached under a GSOD-named
    # constant — confirm this is intentional (the two files may collide)
    with util.open_file_for_url(stations_url, NCDC_GSOD_STATIONS_FILE) as f:
        reader = csv.DictReader(f)

        if country is None and state is None and start is None and end is None:
            # no filters requested: take every row
            rows = reader
        else:
            start_str = start_date.strftime('%Y%m%d') if start_date else None
            end_str = end_date.strftime('%Y%m%d') if end_date else None
            rows = [
                row for row in reader
                if _passes_row_filter(row, country=country, state=state,
                                      start_str=start_str, end_str=end_str)
            ]

        # materialize the dict inside the with block — reader is lazy and
        # needs the file to still be open
        stations = dict(
            (_station_code(row), _process_station(row)) for row in rows)
    return stations
def get_station_data(station_code, date=None, as_dataframe=False):
    """Fetches data for a station at a given date.

    Parameters
    ----------
    station_code: str
        The station code to fetch data for. A list of stations can be
        retrieved with ``get_stations()``
    date : ``None`` or date (see :ref:`dates-and-times`)
        The date of the data to be queried. If date is ``None`` (default),
        then data for the current day is retreived.
    as_dataframe : bool
        This determines what format values are returned as. If ``False``
        (default), the values dict will be a dict with timestamps as keys
        mapped to a dict of gauge variables and values. If ``True`` then
        the values dict will be a pandas.DataFrame object containing the
        equivalent information.

    Returns
    -------
    data_dict : dict
        A dict containing station information and values.
    """
    station_dict = {}
    if date is None:
        # the site serves today's data under a special 'current' page name;
        # year is needed later because timestamps in the report omit it
        date_str = 'current'
        year = datetime.date.today().year
    else:
        date = util.convert_date(date)
        date_str = date.strftime('%Y%m%d')
        year = date.year
    filename = '%s.%s.html' % (station_code, date_str)
    data_url = 'http://www.swt-wc.usace.army.mil/webdata/gagedata/' + filename
    path = os.path.join(USACE_SWTWC_DIR, filename)
    with util.open_file_for_url(data_url, path) as f:
        soup = BeautifulSoup(f)
        # all report content is inside a single <pre> tag; absence means the
        # station/date combination has no data
        pre = soup.find('pre')
        if pre is None:
            error_msg = 'no data could be found for station code %(station_code)s and date %(date)s (url: %(data_url)s)' % {
                'date': date,
                'data_url': data_url,
                'station_code': station_code,
            }
            raise ValueError(error_msg)
        # copy the report text into a StringIO so it can be consumed
        # line-by-line like a file
        sio = StringIO.StringIO(str(pre.text.strip()))

        # line 1: station code followed by a free-text description
        # (first 8 characters are a label prefix and are skipped)
        first_line = sio.readline()
        split = first_line[8:].strip().split()
        station_dict['code'] = split[0]
        station_dict['description'] = ' '.join(split[1:])

        # line 2: "<label>: <station type>"
        second_line = sio.readline()
        station_dict['station_type'] = second_line.strip().split(':')[1].strip()

        # subsequent "key: value" lines are free-form notes; the notes block
        # ends at the first line without a colon
        notes = []
        while 1:
            next_line = sio.readline()
            if ':' in next_line:
                notes.append(next_line.strip())
            else:
                break
        if len(notes):
            station_dict['notes'] = '\n'.join(notes)

        # three header rows describe the data columns (names, units, data
        # sources); each is fixed-width with a 15-char row label followed by
        # 10-char columns
        variable_names = _split_line(sio.readline()[15:], 10)
        variable_units = _split_line(sio.readline()[15:], 10)
        variable_sources = _split_line(sio.readline()[15:], 10)
        station_dict['variables'] = dict([(name, {
            'unit': unit,
            'source': source
        }) for name, unit, source in zip(variable_names, variable_units,
                                         variable_sources)])

        # timezone appears on its own line wrapped in parentheses
        station_dict['timezone'] = sio.readline().strip().strip('()')

        # remaining lines are fixed-width data rows: 15-char datetime column
        # then one 10-char column per variable; '----' marks missing values
        column_names = ['datetime'] + variable_names
        widths = [15] + ([10] * len(variable_names))
        converters = dict([(variable_name,
                            lambda x: float(x) if x != '----' else np.nan)
                           for variable_name in variable_names])
        # timestamps in the report lack a year, so attach the one determined
        # above
        date_parser = lambda x: _convert_datetime(x, year)
        dataframe = pandas.read_fwf(sio, names=column_names, widths=widths,
                                    index_col=['datetime'],
                                    na_values=['----'],
                                    converters=converters, parse_dates=True,
                                    date_parser=date_parser)

        # parse out rows that are all nans (e.g. end of "current" page)
        dataframe = dataframe[~np.isnan(dataframe.T.sum())]

        if as_dataframe:
            station_dict['values'] = dataframe
        else:
            station_dict['values'] = util.dict_from_dataframe(dataframe)

    return station_dict