Example #1
    def parse(self, stations, years, fields):
        ''' Pass in some stations and years. For convenience (and to match CRN's data output)
        we'll only deal with data in complete years. '''
        # First make sure we've got the data locally:
        self._download(stations, years)

        # Fields we know we don't care about:
        useless_fields = ['WBANNO', 'UTC_DATE', 'UTC_TIME', 'LST_DATE', 'LST_TIME', 'CRX_VN', 'SUR_TEMP_TYPE']

        doc = DataObjectCollection()
        for station in stations:
            do = DataObject()
            for field in self.fields:
                # Skip fields the caller didn't ask for, or (if no explicit
                # field list was given) the known-useless ones.
                if (fields and field not in fields) or (not fields and field in useless_fields):
                    continue
                do[field] = TimeSeries([])

            for year in years:
                with open(self.storage_dir + self._filename(station, year)) as f:
                    for line in f:
                        values = line.split()
                        do.append(values, self.fields)
            for ts in do.values():
                ts.replace_data(interpolate_forward_backward(ts, missing_values))
            doc.append(do)
        return doc
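interpolate_forward_backward and missing_values come from elsewhere in this project and aren't shown on this page. As a rough sketch of the contract the loop above relies on (an assumption, not the project's actual implementation), a fill-based version might look like this:

def interpolate_forward_backward(ts, missing_values):
    ''' Hypothetical sketch: carry the last good value forward over sentinel
    "missing" readings, then back-fill any leading gap from the first good
    value. Returns a plain list suitable for TimeSeries.replace_data(). '''
    data = list(ts)
    last_good = None
    for i, v in enumerate(data):
        if v in missing_values:
            if last_good is not None:
                data[i] = last_good
        else:
            last_good = v
    # After the forward pass only a leading run can still be missing;
    # back-fill it from the first good value, if there is one.
    first = next((i for i, v in enumerate(data) if v not in missing_values), None)
    if first is not None:
        data[:first] = [data[first]] * first
    return data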
Example #2
    def parse(self, listofdicts):
        doc = DataObjectCollection()
        for curdict in listofdicts:
            do = DataObject()
            for key, val in curdict.items():
                do[key] = val
            doc.append(do)
        return doc
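A hypothetical call, showing the shapes involved (the class name and the dict contents are made up; note this parser stores each value as-is rather than wrapping it in a TimeSeries):

parser = ListOfDictsParser() # hypothetical name for the enclosing class
doc = parser.parse([{'temp': [20.1, 20.3]}, {'temp': [18.9, 19.2]}])
assert doc[0]['temp'] == [20.1, 20.3]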
Example #3
    def parse(self, sineslist):
        doc = DataObjectCollection()
        for sines in sineslist:
            do = DataObject()
            for key, sine in sines.items():
                ts = TimeSeries(sine)
                ts.sample_rate = 1
                do[key] = ts
            doc.append(do)
        return doc
Example #4
    def parse(self, sines):
        doc = DataObjectCollection()
        do = DataObject()
        for key, sine in sines.items():
            ts = TimeSeries(sine)
            ts.sample_rate = 1
            # ts.rangex = (-1, 1)
            do[key] = ts
        doc.append(do)
        return doc
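A hypothetical way to build the sines argument this parse expects: a dict mapping names to lists of samples (SinesParser is a made-up name for the enclosing class):

import math

sines = {'slow': [math.sin(2 * math.pi * i / 100.0) for i in range(400)],
         'fast': [math.sin(2 * math.pi * i / 25.0) for i in range(400)]}
doc = SinesParser().parse(sines) # hypothetical class name
assert doc[0]['slow'].sample_rate == 1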
Example #5
    def parse(self):
        # ast.literal_eval (requires `import ast` at module level) accepts
        # only Python literals, making it safer than a bare eval().
        d = ast.literal_eval(self.file.readline().strip())
        doc = DataObjectCollection(sample_rate=1 / 3.0)
        for octant in d:
            do = DataObject()
            for varname, values in octant.items():
                ts = TimeSeries(values)
                do[varname] = ts
            doc.append(do)
        return doc
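The first line of self.file is expected to hold a Python literal: a list of dicts (one per octant) mapping variable names to lists of values. A made-up example of such a line:

[{'temp': [10.0, 10.5, 11.0], 'salinity': [35.1, 35.0, 34.9]},
 {'temp': [9.5, 9.8, 10.1], 'salinity': [35.3, 35.2, 35.2]}]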
Example #6
def test_doc_imposes_sample_rate():
    # create a DO
    do1 = DataObject()

    # create a DOC and put the DO in it
    doc = DataObjectCollection(sample_rate=60)
    doc.append(do1)

    retrieved_do = doc[0]
    assert(retrieved_do.sample_rate == 60)
Example #7
def test_datamapper_1():
    # create a TimeSeries and stick something in it
    ts1 = TimeSeries(['datapoint'], sample_rate=60)

    # create a DO and put the TS in it
    do1 = DataObject()
    do1['somedata'] = ts1
    assert list(do1.keys()) == ['somedata']

    # create a DOC and put the DO in it
    doc = DataObjectCollection()
    doc.append(do1)

    # dig down through the levels and get the datapoint we originally inserted
    timeseries = doc[0]['somedata']
    datapoint = timeseries[0]
    assert(datapoint == 'datapoint')
Example #8
def test_do_imposes_sample_rate():
    # create a TimeSeries
    ts1 = TimeSeries(['datapoint'])

    # create a DO and put the TS in it
    do1 = DataObject(sample_rate=60)
    do1['somed'] = ts1

    # test the sample_rate of the TimeSeries (which it should derive from the DO)
    assert(do1['somed'].sample_rate == 60)

    # while we're at it, make sure that it percolates up to the DOC
    doc = DataObjectCollection([do1])
    assert doc.sample_rate == 60, str(doc.sample_rate) + ' is not 60.'
Example #9
    def parse(self, input_filename, num_buoys=4, criterion_function=record_length,
              interpolation_function=interpolate_forward_backward,
              start=None, end=None, maxlines=None, print_heap=False):
        ''' Parse a file from the Global Drifter buoy program. Keeps the num_buoys buoys that most
        closely match the criterion function (e.g. longest record, closest to some latitude, closest
        to some lat/long pair). Each buoy becomes a DataObject.
        '''

        ''' Metadata for global drifter program:
        VE and VN are eastward and northward velocity. SPD is speed. Last 3 are variance. Do I care about any of them?
             ID     MM  DD   YY       LAT      LON       TEMP      VE        VN        SPD     VAR. LAT   VAR. LON  VAR. TEMP
                                                 Deg C    CM/S      CM/S       CM/S
        Note: file is very large (2+ GB) 
        Files can be obtained from ftp://ftp.aoml.noaa.gov/phod/pub/buoydata/
            and must be gunzipped despite the odd .dat-gz suffix. '''
        column_names = 'ID     MM  DD   YY       LAT      LON       TEMP      VE        VN        SPD     VAR_LAT   VAR_LON  VAR_TEMP'.split()

        def _getDataObject():
            ''' Convenience method to return a DataObject initialized to fit the buoy data. '''
            do = DataObject(metadata={'buoy_id': buoy_id}) # (re)set again before each push
            for key in ['LAT', 'LON', 'TEMP']:
                do[key] = TimeSeries([])
            return do

        def _push_to_heap(data, curdata):
            # Make sure curdata isn't empty:
            ts = list(curdata.values())[0]
            if not ts: return

            heapindex = criterion_function(curdata)
            if len(data) >= num_buoys:
                popped = heappushpop(data, (heapindex, curdata))
                if print_heap and heapindex != popped[0]:
                    print('pushing', heapindex)
                    print('popping', popped[0])
                    print()
                    print('now:')
                    for v in data:
                        print('  ', v[0])
                    print()
            else: # Still building our heap to the size we want
                heappush(data, (heapindex, curdata))

        with open(input_filename) as input_file:
            data = [] # treat as heapq
            buoy_id = None
            curdata = _getDataObject()

            for i, line in enumerate(input_file):
                if maxlines and i > maxlines: break

                splitline = line.split()
                if not splitline: continue # blank line

                new_id = splitline[0] # buoy_id for this line
                if new_id != buoy_id: # Have we moved on to a new buoy?
                    if curdata:
                        curdata.metadata['buoy_id'] = buoy_id
                        _push_to_heap(data, curdata)
                    buoy_id = new_id
                    curdata = _getDataObject()

                # Start by stuffing all the data for this observation into a dict:
                temp_data_dict = {}
                for col, val in enumerate(splitline): # don't shadow the line counter i
                    column_name = column_names[col]
                    temp_data_dict[column_name] = val

                # But we don't want to save all of it (there's a bunch of stuff we don't care
                # about). So we pick through it for the stuff we want, parsing and transforming
                # as necessary. Right now they're all strings.

                # Date/time first
                # Day of month plus time of day is represented like: 3.75 (3rd day, 3/4 of the way through)
                day_time = float(temp_data_dict['DD'])
                day = int(day_time)
                percent_of_day = day_time - day
                hour = int(24 * percent_of_day) # leaves us with 0, 6, 12, or 18
                year = int(temp_data_dict['YY'])
                month = int(temp_data_dict['MM'])
                date_time = datetime(year, month, day, hour)

                if start and date_time < start: continue
                if end   and date_time > end: continue

                # preserve first and last datetimes
                if 'start' not in curdata.metadata: curdata.metadata['start'] = date_time
                curdata.metadata['end'] = date_time

                curdata['LAT'].append(float(temp_data_dict['LAT']))
                curdata['LON'].append(float(temp_data_dict['LON']))
                curdata['TEMP'].append(float(temp_data_dict['TEMP']))

            # We hit EOF; push the current data
            curdata.metadata['buoy_id'] = buoy_id
            _push_to_heap(data, curdata)

            doc = DataObjectCollection(sample_rate=1.0 / 360) # 1 sample per six hours
            for _, do in data: # _ is the heap index
                doc.append(do)
            try:
                v = list(doc[0].values())[0]
                if not v: return None
            except IndexError:
                return None # Saner to return None than an empty DOC

            # interpolate
            for do in doc:
                for ts in do.values():
                    ts.replace_data(interpolation_function(ts, missing_values))
            return doc
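record_length, the default criterion_function, isn't shown on this page. Given how it's used above (called on a DataObject, with its result ordered on a min-heap so the smallest entry is evicted first), a minimal sketch might look like this (an assumption, not the project's actual code):

def record_length(do):
    ''' Hypothetical criterion: rank a buoy by the number of observations in
    its record. Longer records score higher, so heappushpop evicts the
    shortest record once the heap holds num_buoys entries. '''
    return len(list(do.values())[0])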