Пример #1
0
def get_data(county=None, start=None, end=None, as_dataframe=False, data_dir=None):
    """Retrieves data.

    Parameters
    ----------
    county : ``None`` or str
        If specified, results will be limited to the county corresponding to the
        given 5-character Texas county fips code i.e. 48???.
    end : ``None`` or date (see :ref:`dates-and-times`)
        Results will be limited to data on or before this date. Default is the
        current date.
    start : ``None`` or date (see :ref:`dates-and-times`)
        Results will be limited to data on or after this date. Default is the
        start of the calendar year for the end date.
    as_dataframe: bool
        If ``False`` (default), a dict with a nested set of dicts will be
        returned with data indexed by 5-character Texas county FIPS code. If ``True``
        then a pandas.DataFrame object will be returned.  The pandas dataframe
        is used internally, so setting this to ``True`` is a little bit faster
        as it skips a serialization step.
    data_dir : ``None`` or directory path
        Directory for holding downloaded data files. If no path is provided
        (default), then a user-specific directory for holding application data
        will be used (the directory will depend on the platform/operating
        system).


    Returns
    -------
    data : dict or pandas.Dataframe
        A dict or pandas.DataFrame representing the data. See the
        ``as_dataframe`` parameter for more.
    """
    # Resolve the date window: end defaults to today, start defaults to
    # January 1 of the end date's year.
    if end is None:
        end_date = datetime.date.today()
    else:
        end_date = util.convert_date(end)
    if start is None:
        start_date = datetime.date(end_date.year, 1, 1)
    else:
        start_date = util.convert_date(start)
    if data_dir is None:
        data_dir = os.path.join(util.get_ulmo_dir(), 'twc/kbdi')

    # One dataframe per day in the range, stacked into a single frame.
    df = pandas.concat([
        _date_dataframe(date, data_dir)
        for date in pandas.period_range(start_date, end_date, freq='D')
    ], ignore_index=True)
    # Attach FIPS codes by joining on county name, then drop the redundant
    # 'name' column from the lookup table.
    fips_df = _fips_dataframe()
    df = pandas.merge(df, fips_df, left_on='county', right_on='name')
    del df['name']

    if county:
        df = df[df['fips'] == county]

    if as_dataframe:
        return df
    else:
        return _as_data_dict(df)
Пример #2
0
def get_stations(fips=None, country=None, state=None, start=None, end=None, update=True):
    """Retrieve information on the set of available stations.

    Parameters
    ----------
    fips : ``None``, str, or iterable of str
        If specified, results are limited to stations with matching FIPS codes.
    country : ``None``, str, or iterable of str
        If specified, results are limited to stations with matching country codes.
    state : ``None``, str, or iterable of str
        If specified, results are limited to stations with matching state codes.
    start : ``None`` or date
        If specified, results are limited to stations with data after this date.
    end : ``None`` or date
        If specified, results are limited to stations with data before this date.
    update : bool
        Not referenced in this body; presumably controls re-download of the
        stations file via ``util.open_file_for_url`` -- TODO confirm.

    Returns
    -------
    stations : dict
        A dict with station codes keyed to station information dicts.
    """

    if start:
        start_date = util.convert_date(start)
    else:
        start_date = None
    if end:
        end_date = util.convert_date(end)
    else:
        end_date = None

    # Normalize scalar filter arguments into single-element lists.
    if isinstance(fips, basestring):
        fips = [fips]
    if isinstance(country, basestring):
        country = [country]
    if isinstance(state, basestring):
        state = [state]

    stations_url = 'http://www1.ncdc.noaa.gov/pub/data/gsod/ish-history.csv'
    with util.open_file_for_url(stations_url, NCDC_GSOD_STATIONS_FILE) as f:
        reader = csv.DictReader(f)

        # With no filters at all, stream the reader directly; otherwise
        # pre-format the date bounds once and filter row by row.
        if fips is None and country is None and state is None \
                and start is None and end is None:
            rows = reader
        else:
            if start_date is None:
                start_str = None
            else:
                start_str = start_date.strftime('%Y%m%d')
            if end_date is None:
                end_str = None
            else:
                end_str = end_date.strftime('%Y%m%d')
            rows = [
                row for row in reader
                if _passes_row_filter(row, fips=fips, country=country,
                    state=state, start_str=start_str, end_str=end_str)
            ]

        stations = dict([
            (_station_code(row), _process_station(row))
            for row in rows
        ])
    return stations
Пример #3
0
def get_all_files(start=None, end=None):
    """Download and read every GSOD file for each year in the date range.

    ``start`` defaults to NCDC_GSOD_START_DATE and ``end`` to today; only the
    year components of the resolved dates are used.
    """
    start_date = util.convert_date(start) if start else NCDC_GSOD_START_DATE
    end_date = util.convert_date(end) if end else datetime.date.today()

    for year in range(start_date.year, end_date.year + 1):
        tar_path = _get_gsod_file(year)
        with _open_tarfile(tar_path, 'r:') as gsod_tar:
            # Member names look like './<station>-<suffix>'; strip the leading
            # './' and the trailing '-<suffix>' to recover the station code.
            station_codes = [
                member.split('./')[-1].rsplit('-', 1)[0]
                for member in gsod_tar.getnames()
                if len(member) > 1
            ]
            for station in station_codes:
                _read_gsod_file(gsod_tar, station, year)
Пример #4
0
def test_convert_date_from_datetime():
    """convert_date should drop the time component of a datetime."""
    expected = datetime.date(2011, 12, 31)
    inputs = (
        datetime.datetime(2011, 12, 31, 20),
        datetime.datetime(2011, 12, 31, 0, 0, 0),
        datetime.datetime(2011, 12, 31, 23, 59, 59),
    )
    for value in inputs:
        assert util.convert_date(value) == expected
Пример #5
0
def test_convert_date_from_date():
    """A date passed to convert_date should come back equal."""
    for given in (datetime.date(2011, 12, 31),
                  datetime.date(2012, 2, 29),
                  datetime.date(2013, 1, 1)):
        assert util.convert_date(given) == given
Пример #6
0
def test_convert_date_from_datetime():
    """Datetimes at any time of day should map to the same calendar date."""
    cases = [
        (datetime.datetime(2011, 12, 31, 20), datetime.date(2011, 12, 31)),
        (datetime.datetime(2011, 12, 31, 0, 0, 0), datetime.date(2011, 12, 31)),
        (datetime.datetime(2011, 12, 31, 23, 59, 59), datetime.date(2011, 12, 31)),
    ]
    for given, expected in cases:
        result = util.convert_date(given)
        assert result == expected
Пример #7
0
def get_station_data(station_code,
                     parameter,
                     start=None,
                     end=None,
                     min_value=None,
                     max_value=None):
    """Fetch values for one station/parameter by POSTing the remote search form.

    Parameters
    ----------
    station_code : str
        Station to query; also sent as the ``sid`` query-string parameter.
    parameter : str
        Parameter code to query.
    start, end : ``None`` or date (see ``util.convert_date``)
        Date range; defaults to DEFAULT_START_DATE through today.
    min_value, max_value : ``None`` or number
        Value range filter; defaults to -9000000/+9000000.

    Returns
    -------
    dict
        Key/value pairs parsed from the rows of the returned HTML table.
    """

    # +/-9000000 act as "no limit" sentinels for the remote form's value filter.
    if min_value is None:
        min_value = -9000000
    if max_value is None:
        max_value = 9000000
    if start is None:
        start_date = DEFAULT_START_DATE
    else:
        start_date = util.convert_date(start)
    if end is None:
        end_date = datetime.date.today()
    else:
        end_date = util.convert_date(end)

    start_date_str = _format_date(start_date)
    end_date_str = _format_date(end_date)

    # Field names mirror the remote HTML form; 'hdn_excel' is sent empty,
    # presumably to request the non-Excel response -- TODO confirm.
    form_data = {
        'fld_station': station_code,
        'fld_parameter': parameter,
        'fld_from': min_value,
        'fld_to': max_value,
        'fld_fromdate': start_date_str,
        'fld_todate': end_date_str,
        'hdn_excel': '',
    }

    req = requests.post(URL, params=dict(sid=station_code), data=form_data)
    soup = BeautifulSoup(req.content)
    # The data table is the last nested <table>; the first two <tr> rows are
    # headers and are skipped.
    data_table = soup.find('table').find_all('table')[-1]

    return dict(
        [_parse_value(value_tr) for value_tr in data_table.find_all('tr')[2:]])
Пример #8
0
def test_convert_date_from_string():
    """ISO and US-style date strings should parse to the expected dates."""
    leap_day = datetime.date(2012, 2, 29)
    cases = [
        ("2011-12-31", datetime.date(2011, 12, 31)),
        ("12/31/2011", datetime.date(2011, 12, 31)),
        ("2012-02-29", leap_day),
        ("2012-2-29", leap_day),
        ("2/29/2012", leap_day),
        ("02/29/2012", leap_day),
        ("2013-01-01", datetime.date(2013, 1, 1)),
    ]
    for text, expected in cases:
        assert util.convert_date(text) == expected
Пример #9
0
def test_convert_date_from_string():
    """Both ISO and US-style strings should convert to the correct date."""
    cases = {
        '2011-12-31': datetime.date(2011, 12, 31),
        '12/31/2011': datetime.date(2011, 12, 31),
        '2012-02-29': datetime.date(2012, 2, 29),
        '2012-2-29': datetime.date(2012, 2, 29),
        '2/29/2012': datetime.date(2012, 2, 29),
        '02/29/2012': datetime.date(2012, 2, 29),
        '2013-01-01': datetime.date(2013, 1, 1),
    }
    for text, expected in cases.items():
        assert util.convert_date(text) == expected
Пример #10
0
def get_station_data(station_code, parameter, start=None, end=None,
        min_value=None, max_value=None):
    """Fetch values for one station/parameter by POSTing the remote search form.

    Parameters
    ----------
    station_code : str
        Station to query; also sent as the ``sid`` query-string parameter.
    parameter : str
        Parameter code to query.
    start, end : ``None`` or date (see ``util.convert_date``)
        Date range; defaults to DEFAULT_START_DATE through today.
    min_value, max_value : ``None`` or number
        Value range filter; defaults to -9000000/+9000000.

    Returns
    -------
    dict
        Key/value pairs parsed from the rows of the returned HTML table.
    """

    # +/-9000000 act as "no limit" sentinels for the remote form's value filter.
    if min_value is None:
        min_value = -9000000
    if max_value is None:
        max_value = 9000000
    if start is None:
        start_date = DEFAULT_START_DATE
    else:
        start_date = util.convert_date(start)
    if end is None:
        end_date = datetime.date.today()
    else:
        end_date = util.convert_date(end)

    start_date_str = _format_date(start_date)
    end_date_str = _format_date(end_date)

    # Field names mirror the remote HTML form; 'hdn_excel' is sent empty,
    # presumably to request the non-Excel response -- TODO confirm.
    form_data = {
        'fld_station': station_code,
        'fld_parameter': parameter,
        'fld_from': min_value,
        'fld_to': max_value,
        'fld_fromdate': start_date_str,
        'fld_todate': end_date_str,
        'hdn_excel': '',
    }

    req = requests.post(URL, params=dict(sid=station_code), data=form_data)
    soup = BeautifulSoup(req.text)
    # The data table is the last nested <table>; the first two <tr> rows are
    # headers and are skipped.
    data_table = soup.find('table').find_all('table')[-1]

    return dict([
        _parse_value(value_tr)
        for value_tr in data_table.find_all('tr')[2:]
    ])
Пример #11
0
def get_station_data(station_code, date=None, as_dataframe=False):
    """Fetches data for a station at a given date.


    Parameters
    ----------
    station_code: str
        The station code to fetch data for. A list of stations can be retrieved with
        ``get_stations()``
    date : ``None`` or date (see :ref:`dates-and-times`)
        The date of the data to be queried. If date is ``None`` (default), then
        data for the current day is retrieved.
    as_dataframe : bool
        This determines what format values are returned as. If ``False``
        (default), the values dict will be a dict with timestamps as keys mapped
        to a dict of gauge variables and values. If ``True`` then the values
        dict will be a pandas.DataFrame object containing the equivalent
        information.


    Returns
    -------
    data_dict : dict
        A dict containing station information and values.
    """

    station_dict = {}
    # 'current' selects the live page; otherwise the filename embeds YYYYMMDD.
    # The year is carried separately because the data rows omit it.
    if date is None:
        date_str = 'current'
        year = datetime.date.today().year
    else:
        date = util.convert_date(date)
        date_str = date.strftime('%Y%m%d')
        year = date.year

    filename = '%s.%s.html' % (station_code, date_str)
    data_url = 'http://www.swt-wc.usace.army.mil/webdata/gagedata/' + filename
    path = os.path.join(USACE_SWTWC_DIR, filename)

    with util.open_file_for_url(data_url, path) as f:
        soup = BeautifulSoup(f)
        # All station data lives inside a single <pre> element; its absence
        # means there is no data for this station/date combination.
        pre = soup.find('pre')
        if pre is None:
            error_msg = 'no data could be found for station code %(station_code)s and date %(date)s (url: %(data_url)s)' % {
                'date': date,
                'data_url': data_url,
                'station_code': station_code,
            }
            raise ValueError(error_msg)
        # NOTE(review): StringIO.StringIO implies Python 2 -- confirm runtime.
        sio = StringIO.StringIO(str(pre.text.strip()))

    # First line: 8-character prefix, then station code and free-text
    # description.
    first_line = sio.readline()
    split = first_line[8:].strip().split()

    station_dict['code'] = split[0]
    station_dict['description'] = ' '.join(split[1:])

    # Second line: 'label: station type'.
    second_line = sio.readline()
    station_dict['station_type'] = second_line.strip().split(':')[1].strip()

    notes = []

    # Header note lines all contain a colon; the first line without one ends
    # the notes section.
    while 1:
        next_line = sio.readline()
        if ':' in next_line:
            notes.append(next_line.strip())
        else:
            break

    if len(notes):
        station_dict['notes'] = '\n'.join(notes)

    # Three fixed-width header rows (15-char label prefix, 10-char columns):
    # variable names, their units, and their data sources.
    variable_names = _split_line(sio.readline()[15:], 10)
    variable_units = _split_line(sio.readline()[15:], 10)
    variable_sources = _split_line(sio.readline()[15:], 10)

    station_dict['variables'] = dict([(name, {
        'unit': unit,
        'source': source
    }) for name, unit, source in zip(variable_names, variable_units,
                                     variable_sources)])

    station_dict['timezone'] = sio.readline().strip().strip('()')
    column_names = ['datetime'] + variable_names
    widths = [15] + ([10] * len(variable_names))
    # '----' marks missing values in the fixed-width data table.
    converters = dict([(variable_name,
                        lambda x: float(x) if x != '----' else np.nan)
                       for variable_name in variable_names])
    # Row timestamps lack a year, so attach the one resolved above.
    date_parser = lambda x: _convert_datetime(x, year)
    dataframe = pandas.read_fwf(
        sio,
        names=column_names,
        widths=widths,
        index_col=['datetime'],
        na_values=['----'],
        converters=converters,
        parse_dates=True,
        date_parser=date_parser)

    # parse out rows that are all nans (e.g. end of "current" page)
    dataframe = dataframe[~np.isnan(dataframe.T.sum())]

    if as_dataframe:
        station_dict['values'] = dataframe
    else:
        station_dict['values'] = util.dict_from_dataframe(dataframe)

    return station_dict
Пример #12
0
def get_data(station_ids=None,
             sensor_ids=None,
             resolutions=None,
             start=None,
             end=None):
    """
    Downloads data for a set of CDEC station and sensor ids. If either is not
    provided, all available data will be downloaded. Be really careful with
    choosing hourly resolution as the data sets are big, and CDEC's servers
    are slow as molasses in winter.


    Usage example::

        from ulmo import cdec
        dat = cdec.historical.get_data(['PRA'],resolutions=['daily'])

    Parameters
    ----------
    station_ids : iterable of strings or ``None``

    sensor_ids : iterable of integers or ``None``
        check out  or use the ``get_sensors()`` function to see a list of
        available sensor numbers

    resolutions : iterable of strings or ``None``
        Possible values are 'event', 'hourly', 'daily', and 'monthly' but not
        all of these time resolutions are available at every station.

    start : ``None`` or date (see ``util.convert_date``)
        Start of the date range to request; defaults to DEFAULT_START_DATE.

    end : ``None`` or date (see ``util.convert_date``)
        End of the date range to request; defaults to DEFAULT_END_DATE.


    Returns
    -------
    dict : a python dict
        a python dict with site codes as keys. Values will be nested dicts
        containing all of the sensor/resolution combinations.
    """

    if start is None:
        start_date = util.convert_date(DEFAULT_START_DATE)
    else:
        start_date = util.convert_date(start)
    if end is None:
        end_date = util.convert_date(DEFAULT_END_DATE)
    else:
        end_date = util.convert_date(end)

    start_date_str = _format_date(start_date)
    end_date_str = _format_date(end_date)

    # No explicit station list means "every known station".
    if station_ids is None:
        station_ids = get_stations().index

    sensors = get_station_sensors(station_ids, sensor_ids, resolutions)

    d = {}

    # One raw download per station/sensor/resolution combination.
    for station_id, sensor_list in list(sensors.items()):
        station_data = {}

        for index, row in sensor_list.iterrows():
            # NOTE(review): .ix is deprecated in modern pandas (.loc is the
            # replacement) -- confirm the pandas version before changing.
            res = row.ix['resolution']
            var = row.ix['variable']
            sensor_id = row.ix['sensor_id']
            station_data[var] = _download_raw(station_id, sensor_id,
                                              _res_to_dur_code(res),
                                              start_date_str, end_date_str)

        d[station_id] = station_data

    return d
Пример #13
0
def get_data(station_codes, start=None, end=None, parameters=None):
    """Retrieves data for a set of stations.


    Parameters
    ----------
    station_codes : str or list
        Single station code or iterable of station codes to retrieve data for.
    start : ``None`` or date (see :ref:`dates-and-times`)
        If specified, data are limited to values after this date.
    end : ``None`` or date (see :ref:`dates-and-times`)
        If specified, data are limited to values before this date.
    parameters : ``None``, str or list
        If specified, data are limited to this set of parameter codes.


    Returns
    -------
    data_dict : dict
        Dict with station codes keyed to lists of value dicts.
    """
    if start:
        start_date = util.convert_date(start)
    else:
        start_date = NCDC_GSOD_START_DATE
    if end:
        end_date = util.convert_date(end)
    else:
        end_date = datetime.date.today()
    # Normalize scalar arguments into lists.
    if isinstance(parameters, basestring):
        parameters = [parameters]
    if parameters and 'date' not in parameters:
        # add date to list of parameters if it's not there already
        parameters.insert(0, 'date')
    if isinstance(station_codes, basestring):
        station_codes = [station_codes]

    # note: opening tar files and parsing the headers and such is a relatively
    # lengthy operation so you don't want to do it too often, hence try to
    # grab all stations at the same time per tarfile
    data_dict = dict([(station_code, None) for station_code in station_codes])

    for year in range(start_date.year, end_date.year + 1):
        tar_path = _get_gsod_file(year)
        with tarfile.open(tar_path, 'r:') as gsod_tar:
            # Member names look like './<station>-<suffix>'; recover the codes.
            stations_in_file = [
                name.split('./')[-1].rsplit('-', 1)[0]
                for name in gsod_tar.getnames() if len(name) > 1]
            if station_codes:
                stations = list(set(station_codes) & set(stations_in_file))
            else:
                stations = stations_in_file
            for station in stations:
                year_data = _read_gsod_file(gsod_tar, station, year)
                if parameters:
                    year_data = _subset_record_array(year_data, parameters)
                if year_data is not None:
                    # apply date ranges if they exist
                    if start_date or end_date:
                        mask = np.ones(len(year_data), dtype=bool)
                        if start_date:
                            mask = mask & (year_data['date'] >= start_date)
                        if end_date:
                            mask = mask & (year_data['date'] <= end_date)
                        year_data = year_data[mask]

                    if data_dict[station] is not None:
                        # XXX: this could be more efficient for large numbers
                        # of years with a list comprehension or generator
                        data_dict[station] = np.append(data_dict[station], year_data)
                    else:
                        data_dict[station] = year_data
    # Convert each accumulated record array into a list of value dicts.
    for station, data_array in data_dict.items():
        if data_dict[station] is not None:
            data_dict[station] = _record_array_to_value_dicts(data_array)
    return data_dict
Пример #14
0
def get_data(state=None, climate_division=None, start=None, end=None,
             as_dataframe=False):
    """Retrieves data.


    Parameters
    ----------
    state : ``None`` or str
        If specified, results will be limited to the state corresponding to the
        given 2-character state code.
    climate_division : ``None`` or int
        If specified, results will be limited to the climate division.
    start : ``None`` or date (see :ref:`dates-and-times`)
        Results will be limited to those after the given date. Default is the
        start of the current calendar year.
    end : ``None`` or date (see :ref:`dates-and-times`)
        If specified, results will be limited to data before this date.
    as_dataframe: bool
        If ``False`` (default), a dict with a nested set of dicts will be
        returned with data indexed by state, then climate division. If ``True``
        then a pandas.DataFrame object will be returned.  The pandas dataframe
        is used internally, so setting this to ``True`` is a little bit faster
        as it skips a serialization step.


    Returns
    -------
    data : dict or pandas.Dataframe
        A dict or pandas.DataFrame representing the data. See the
        ``as_dataframe`` parameter for more.
    """
    # Normalize the optional date arguments, then fill in defaults: end
    # defaults to today, start to January 1 of the end date's year.
    if start is not None:
        start_date = util.convert_date(start)
    else:
        start_date = None
    if end is not None:
        end_date = util.convert_date(end)
    else:
        end_date = None

    if not end_date:
        end_date = datetime.date.today()
    if not start_date:
        start_date = datetime.date(end_date.year, 1, 1)

    start_year, start_week = _week_number(start_date)
    end_year, end_week = _week_number(end_date)

    if state:
        state_code = STATE_CODES.get(state.upper())
    else:
        state_code = None

    data = None
    for year in range(start_year, end_year + 1):
        url = _get_data_url(year)
        format_type = _get_data_format(year)
        with _open_data_file(url) as data_file:
            year_data = _parse_data_file(data_file, format_type, year)

        if state_code:
            year_data = year_data[year_data['state_code'] == state_code]
        if climate_division:
            year_data = year_data[year_data['climate_division'] == climate_division]

        year_data = _reindex_data(year_data)

        if data is None:
            data = year_data
        else:
            # some data are duplicated (e.g. final data from 2011 stretches into
            # prelim data of 2012), so just take those that are new
            # NOTE(review): index difference via `-` and DataFrame.append/.ix
            # are removed in modern pandas -- confirm the pinned version.
            append_index = year_data.index - data.index
            if len(append_index):
                data = data.append(year_data.ix[append_index])

    # restrict results to date range
    period_index = pandas.PeriodIndex(data['period'])
    periods_in_range = (period_index >= start_date) & (period_index <= end_date)
    data = data[periods_in_range]

    # this does what data.reset_index() should do, but at least as of 0.10.1, that sets
    # will cast period objects to ints
    data.index = np.arange(len(data))
    if as_dataframe:
        return data
    else:
        return _as_data_dict(data)
Пример #15
0
def get_historical_data(site_code, start=None, end=None, as_dataframe=False):
    """Fetches historical water-quality data for a site.

    Parameters
    ----------
    site_code: str or int
        The site code to fetch data for. A list of sites can be retrieved with
        ``get_sites()``
    start : ``None`` or date (see :ref:`dates-and-times`)
        If specified, results are limited to data on or after this date.
    end : ``None`` or date (see :ref:`dates-and-times`)
        If specified, results are limited to data on or before this date.
    as_dataframe : bool
        This determines what format values are returned as. If ``False``
        (default), a list of record dicts is returned. If ``True`` then a
        pandas.DataFrame object containing the equivalent information is
        returned.

    Returns
    -------
    data : list of dicts or pandas.DataFrame
        The parsed site values. See the ``as_dataframe`` parameter for more.

    Raises
    ------
    TypeError
        If ``site_code`` is neither a str nor an int.
    """

    if isinstance(site_code, (str)):
        pass
    elif isinstance(site_code, (int)):
        site_code = str(site_code)
    else:
        log.error("Unsure of the site_code parameter type. \
                Try string or int")
        # bug fix: a bare `raise` here has no active exception to re-raise and
        # itself blows up with an unrelated RuntimeError; raise an explicit,
        # catchable error instead.
        raise TypeError('site_code must be a str or int, got %r'
                        % type(site_code))

    waterquality_url = "http://waterquality.lcra.org/parameter.aspx?qrySite=%s" % site_code
    waterquality_url2 = 'http://waterquality.lcra.org/events.aspx'

    initial_request = requests.get(waterquality_url)
    initialsoup = BeautifulSoup(initial_request.content, 'html.parser')

    # Collect the site values exposed by the multi-select widget on the page.
    sitevals = [statag.get('value', None)
        for statag in initialsoup.findAll(id="multiple")
        if statag.get('value', None)]

    result = _make_next_request(waterquality_url2, 
                                initial_request, 
                                {'multiple': sitevals,
                                'site': site_code})

    soup = BeautifulSoup(result.content, 'html.parser')

    gridview = soup.find(id="GridView1")

    results = []

    headers = [head.text for head in gridview.findAll('th')]

    #uses \xa0 for blank

    # Skip header-only rows (no <td> cells) and zip each data row against the
    # column headers.
    for row in gridview.findAll('tr'):
        vals = [_parse_val(aux.text) for aux in row.findAll('td')]
        if len(vals) == 0:
            continue
        results.append(dict(zip(headers, vals)))

    data = _create_dataframe(results)

    # Apply the optional date-range slice against the dataframe's index.
    if start and not data.empty:
        data = data.ix[util.convert_date(start):]

    if end and not data.empty:
        data = data.ix[:util.convert_date(end)]

    if as_dataframe:
        return data
    else:
        return data.to_dict(orient='records')
Пример #16
0
def get_stations(country=None, state=None, start=None, end=None, update=True):
    """Retrieve information on the set of available stations.


    Parameters
    ----------
    country : {``None``, str, or iterable}
        If specified, results will be limited to stations with matching country
        codes.
    state : {``None``, str, or iterable}
        If specified, results will be limited to stations with matching state
        codes.
    start : ``None`` or date (see :ref:`dates-and-times`)
        If specified, results will be limited to stations which have data after
        this start date.
    end : ``None`` or date (see :ref:`dates-and-times`)
        If specified, results will be limited to stations which have data before
        this end date.
    update : bool
        If ``True`` (default), check for a newer copy of the stations file and
        download if it is newer the previously downloaded copy. If ``False``,
        then a new stations file will only be downloaded if a previously
        downloaded file cannot be found.


    Returns
    -------
    stations_dict : dict
        A dict with USAF-WBAN codes keyed to station information dicts.
    """
    if start:
        start_date = util.convert_date(start)
    else:
        start_date = None
    if end:
        end_date = util.convert_date(end)
    else:
        end_date = None

    # Normalize scalar filter arguments into single-element lists.
    if isinstance(country, basestring):
        country = [country]
    if isinstance(state, basestring):
        state = [state]

    # NOTE(review): `update` is documented above but not referenced in this
    # body; presumably handled inside util.open_file_for_url -- confirm.
    stations_url = 'http://www1.ncdc.noaa.gov/pub/data/noaa/isd-history.csv'
    with util.open_file_for_url(stations_url, NCDC_GSOD_STATIONS_FILE) as f:
        reader = csv.DictReader(f)

        # With no filters at all, stream the reader directly; otherwise
        # pre-format the date bounds once and filter row by row.
        if country is None and state is None and start is None and end is None:
            rows = reader
        else:
            if start_date is None:
                start_str = None
            else:
                start_str = start_date.strftime('%Y%m%d')
            if end_date is None:
                end_str = None
            else:
                end_str = end_date.strftime('%Y%m%d')
            rows = [
                row for row in reader
                if _passes_row_filter(row, country=country, state=state,
                    start_str=start_str, end_str=end_str)
            ]

        stations = dict([
            (_station_code(row), _process_station(row))
            for row in rows
        ])
    return stations
Пример #17
0
def get_station_data(station_code, date=None, as_dataframe=False):
    """Fetches data for a station at a given date.


    Parameters
    ----------
    station_code: str
        The station code to fetch data for. A list of stations can be retrieved with
        ``get_stations()``
    date : ``None`` or date (see :ref:`dates-and-times`)
        The date of the data to be queried. If date is ``None`` (default), then
        data for the current day is retrieved.
    as_dataframe : bool
        This determines what format values are returned as. If ``False``
        (default), the values dict will be a dict with timestamps as keys mapped
        to a dict of gauge variables and values. If ``True`` then the values
        dict will be a pandas.DataFrame object containing the equivalent
        information.


    Returns
    -------
    data_dict : dict
        A dict containing station information and values.
    """

    station_dict = {}
    # 'current' selects the live page; otherwise the filename embeds YYYYMMDD.
    # The year is carried separately because the data rows omit it.
    if date is None:
        date_str = 'current'
        year = datetime.date.today().year
    else:
        date = util.convert_date(date)
        date_str = date.strftime('%Y%m%d')
        year = date.year

    filename = '%s.%s.html' % (station_code, date_str)
    data_url = 'http://www.swt-wc.usace.army.mil/webdata/gagedata/' + filename
    path = os.path.join(USACE_SWTWC_DIR, filename)

    with util.open_file_for_url(data_url, path) as f:
        soup = BeautifulSoup(f)
        # All station data lives inside a single <pre> element; its absence
        # means there is no data for this station/date combination.
        pre = soup.find('pre')
        if pre is None:
            error_msg = 'no data could be found for station code %(station_code)s and date %(date)s (url: %(data_url)s)' % {
                'date': date,
                'data_url': data_url,
                'station_code': station_code,
            }
            raise ValueError(error_msg)
        # NOTE(review): StringIO.StringIO implies Python 2 -- confirm runtime.
        sio = StringIO.StringIO(str(pre.text.strip()))

    # First line: 8-character prefix, then station code and free-text
    # description.
    first_line = sio.readline()
    split = first_line[8:].strip().split()

    station_dict['code'] = split[0]
    station_dict['description'] = ' '.join(split[1:])

    # Second line: 'label: station type'.
    second_line = sio.readline()
    station_dict['station_type'] = second_line.strip().split(':')[1].strip()

    notes = []

    # Header note lines all contain a colon; the first line without one ends
    # the notes section.
    while 1:
        next_line = sio.readline()
        if ':' in next_line:
            notes.append(next_line.strip())
        else:
            break

    if len(notes):
        station_dict['notes'] = '\n'.join(notes)

    # Three fixed-width header rows (15-char label prefix, 10-char columns):
    # variable names, their units, and their data sources.
    variable_names = _split_line(sio.readline()[15:], 10)
    variable_units = _split_line(sio.readline()[15:], 10)
    variable_sources = _split_line(sio.readline()[15:], 10)

    station_dict['variables'] = dict([(name, {
        'unit': unit,
        'source': source
    }) for name, unit, source in zip(variable_names, variable_units,
                                     variable_sources)])

    station_dict['timezone'] = sio.readline().strip().strip('()')
    column_names = ['datetime'] + variable_names
    widths = [15] + ([10] * len(variable_names))
    # '----' marks missing values in the fixed-width data table.
    converters = dict([(variable_name, lambda x: float(x)
                        if x != '----' else np.nan)
                       for variable_name in variable_names])
    # Row timestamps lack a year, so attach the one resolved above.
    date_parser = lambda x: _convert_datetime(x, year)
    dataframe = pandas.read_fwf(sio,
                                names=column_names,
                                widths=widths,
                                index_col=['datetime'],
                                na_values=['----'],
                                converters=converters,
                                parse_dates=True,
                                date_parser=date_parser)

    # parse out rows that are all nans (e.g. end of "current" page)
    dataframe = dataframe[~np.isnan(dataframe.T.sum())]

    if as_dataframe:
        station_dict['values'] = dataframe
    else:
        station_dict['values'] = util.dict_from_dataframe(dataframe)

    return station_dict
Пример #18
0
def get_data(county=None,
             start=None,
             end=None,
             as_dataframe=False,
             data_dir=None):
    """Retrieves data.

    Parameters
    ----------
    county : ``None`` or str
        If specified, results will be limited to the county corresponding to the
        given 5-character Texas county fips code i.e. 48???.
    end : ``None`` or date (see :ref:`dates-and-times`)
        Results will be limited to data on or before this date. Default is the
        current date.
    start : ``None`` or date (see :ref:`dates-and-times`)
        Results will be limited to data on or after this date. Default is the
        start of the calendar year for the end date.
    as_dataframe: bool
        If ``False`` (default), a dict with a nested set of dicts will be
        returned with data indexed by 5-character Texas county FIPS code. If ``True``
        then a pandas.DataFrame object will be returned.  The pandas dataframe
        is used internally, so setting this to ``True`` is a little bit faster
        as it skips a serialization step.
    data_dir : ``None`` or directory path
        Directory for holding downloaded data files. If no path is provided
        (default), then a user-specific directory for holding application data
        will be used (the directory will depend on the platform/operating
        system).


    Returns
    -------
    data : dict or pandas.Dataframe
        A dict or pandas.DataFrame representing the data. See the
        ``as_dataframe`` parameter for more.
    """
    # Resolve the date window: end defaults to today, start defaults to
    # January 1 of the end date's year.
    if end is None:
        end_date = datetime.date.today()
    else:
        end_date = util.convert_date(end)
    if start is None:
        start_date = datetime.date(end_date.year, 1, 1)
    else:
        start_date = util.convert_date(start)
    if data_dir is None:
        data_dir = os.path.join(util.get_ulmo_dir(), 'twc/kbdi')

    # One dataframe per day in the range, stacked into a single frame.
    df = pandas.concat([
        _date_dataframe(date, data_dir)
        for date in pandas.period_range(start_date, end_date, freq='D')
    ],
                       ignore_index=True)
    # Attach FIPS codes by joining on county name, then drop the redundant
    # 'name' column from the lookup table.
    fips_df = _fips_dataframe()
    df = pandas.merge(df, fips_df, left_on='county', right_on='name')
    del df['name']

    if county:
        df = df[df['fips'] == county]

    if as_dataframe:
        return df
    else:
        return _as_data_dict(df)
Пример #19
0
def test_convert_date_from_date():
    """util.convert_date must pass datetime.date inputs through unchanged."""
    for expected in (
        datetime.date(2011, 12, 31),  # year boundary
        datetime.date(2012, 2, 29),   # leap day
        datetime.date(2013, 1, 1),
    ):
        assert util.convert_date(expected) == expected
Пример #20
0
def get_data(station_ids=None, sensor_ids=None, resolutions=None, start=None, end=None):
    """
    Downloads data for a set of CDEC station and sensor ids. If either is not
    provided, all available data will be downloaded. Be really careful with
    choosing hourly resolution as the data sets are big, and CDEC's servers
    are slow as molasses in winter.


    Usage example::

        from ulmo import cdec
        dat = cdec.historical.get_data(['PRA'],resolutions=['daily'])

    Parameters
    ----------
    station_ids : iterable of strings or ``None``

    sensor_ids : iterable of integers or ``None``
        check out  or use the ``get_sensors()`` function to see a list of
        available sensor numbers

    resolutions : iterable of strings or ``None``
        Possible values are 'event', 'hourly', 'daily', and 'monthly' but not
        all of these time resolutions are available at every station.

    start : ``None`` or date (see :ref:`dates-and-times`)
        If ``None`` (default), ``DEFAULT_START_DATE`` is used.

    end : ``None`` or date (see :ref:`dates-and-times`)
        If ``None`` (default), ``DEFAULT_END_DATE`` is used.


    Returns
    -------
    dict : a python dict
        a python dict with site codes as keys. Values will be nested dicts
        containing all of the sensor/resolution combinations.
    """

    if start is None:
        start_date = util.convert_date(DEFAULT_START_DATE)
    else:
        start_date = util.convert_date(start)
    if end is None:
        end_date = util.convert_date(DEFAULT_END_DATE)
    else:
        end_date = util.convert_date(end)

    start_date_str = _format_date(start_date)
    end_date_str = _format_date(end_date)

    if station_ids is None:
        station_ids = get_stations().index

    sensors = get_station_sensors(station_ids, sensor_ids, resolutions)

    d = {}

    for station_id, sensor_list in sensors.items():
        station_data = {}

        for _, row in sensor_list.iterrows():
            # NOTE: the removed pandas ``.ix`` accessor was replaced with plain
            # label-based indexing, which is what ``.ix`` did for string labels.
            res = row['resolution']
            var = row['variable']
            sensor_id = row['sensor_id']
            station_data[var] = _download_raw(
                station_id, sensor_id, _res_to_dur_code(res),
                start_date_str, end_date_str)

        d[station_id] = station_data

    return d
Пример #21
0
def get_historical_data(site_code, start=None, end=None, as_dataframe=False):
    """Fetches historical water-quality data for a site.

    Parameters
    ----------
    site_code : str or int
        The site code to fetch data for. A list of sites can be retrieved with
        ``get_sites()``. Integers are converted to strings.
    start : ``None`` or date (see :ref:`dates-and-times`)
        If provided, results are limited to data on or after this date.
    end : ``None`` or date (see :ref:`dates-and-times`)
        If provided, results are limited to data on or before this date.
    as_dataframe : bool
        This determines what format values are returned as. If ``False``
        (default), a list of record dicts is returned. If ``True`` then the
        values will be returned as a pandas.DataFrame object containing the
        equivalent information.

    Returns
    -------
    data_dict : dict or pandas.DataFrame
        A dict containing site information and values.

    Raises
    ------
    TypeError
        If ``site_code`` is neither a string nor an integer.
    """

    if isinstance(site_code, str):
        pass
    elif isinstance(site_code, int):
        site_code = str(site_code)
    else:
        log.error("Unsure of the site_code parameter type. \
                Try string or int")
        # The original bare ``raise`` here had no active exception and would
        # itself crash with ``RuntimeError: No active exception to re-raise``.
        raise TypeError("site_code must be a str or int")

    waterquality_url = "http://waterquality.lcra.org/parameter.aspx?qrySite=%s" % site_code
    waterquality_url2 = 'http://waterquality.lcra.org/events.aspx'

    initial_request = requests.get(waterquality_url)
    initialsoup = BeautifulSoup(initial_request.content, 'html.parser')

    # Collect the station values from every element with id="multiple",
    # skipping tags that have no value.
    sitevals = [
        statag.get('value', None)
        for statag in initialsoup.findAll(id="multiple")
        if statag.get('value', None)
    ]

    result = _make_next_request(waterquality_url2, initial_request, {
        'multiple': sitevals,
        'site': site_code
    })

    soup = BeautifulSoup(result.content, 'html.parser')

    gridview = soup.find(id="GridView1")

    results = []

    headers = [head.text for head in gridview.findAll('th')]

    # uses \xa0 for blank

    for row in gridview.findAll('tr'):
        vals = [_parse_val(aux.text) for aux in row.findAll('td')]
        if len(vals) == 0:
            # header-only rows have no <td> cells
            continue
        results.append(dict(zip(headers, vals)))

    data = _create_dataframe(results)

    # ``.loc`` replaces the removed pandas ``.ix`` accessor; the index is
    # assumed to be date-like so label slicing bounds the date range.
    if start and not data.empty:
        data = data.loc[util.convert_date(start):]

    if end and not data.empty:
        data = data.loc[:util.convert_date(end)]

    if as_dataframe:
        return data
    else:
        return data.to_dict(orient='records')
Пример #22
0
def _parse_value(value_tr):
    """Convert a value table row (<tr> with two <td> cells) into a
    (date, float) tuple."""
    date_cell, value_cell = value_tr.find_all('td')
    parsed_date = util.convert_date(date_cell.text)
    return (parsed_date, float(value_cell.text))
Пример #23
0
def _parse_value(value_tr):
    """Parse one table row into a (date, value) pair.

    Expects exactly two <td> cells: a date string and a numeric value.
    """
    cells = value_tr.find_all('td')
    return (util.convert_date(cells[0].text), float(cells[1].text))