Example #1
def merge_collections(origin, deriv, merge, dfkeys=DFILE_KEY_COLS):
    """Match up files in two archives and apply the merge function to them."""
    # Only merge files into the ones already present in origin. Warn if any
    # files from deriv are not used
    merged_dfc = DataFileCollection()
    dfile_map = map_collections(origin, deriv)
    for odfile, ddfile, dfkey in dfile_map:
        log.info(u'Merging files for key {0}'.format(dfkey))
        try:
            merged_dfc.append(merge(odfile, ddfile))
        except ValueError as err:
            log.error(
                u'Unable to merge datafiles for {0}: {1}'.format(dfkey, err))
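
A minimal usage sketch for merge_collections, mirroring the merger pattern in Example #4 below; origin_dfc, deriv_dfc, and the parameter lists are illustrative stand-ins:

def merger(odfile, ddfile):
    # Align rows on CTDPRS and take NITRAT/NITRIT values from the derivative.
    return merge_datafiles(odfile, ddfile, ['CTDPRS'], ['NITRAT', 'NITRIT'])

merged_dfc = merge_collections(origin_dfc, deriv_dfc, merger)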
Example #2
    def test_map_collections_keep_origin_files(self):
        """When merging collections, make sure to keep origin's files.

        If files from origin were not mapped to deriv's files, keep them in the
        final product.

        Warn if deriv file not in origin collection.

        """
        odfc = DataFileCollection()
        ddfc = DataFileCollection()

        df0 = DataFile()
        df0.globals['EXPOCODE'] = 'a'
        df0.globals['STNNBR'] = 1
        df0.globals['CASTNO'] = 1
        odfc.append(df0)

        df1 = DataFile()
        df1.globals['EXPOCODE'] = 'b'
        df1.globals['STNNBR'] = 1
        df1.globals['CASTNO'] = 1
        ddfc.append(df1)

        dfile_map = map_collections(odfc, ddfc)

        self.assertEqual(dfile_map, [(df0, df0, ('a', 1, 1))])
        lines = [
            "Origin file key ('a', 1, 1) is not present in derivative collection.",
            "Derivative file key ('b', 1, 1) is not present in origin collection.",
        ]
        self.assertTrue(self.ensure_lines(lines))
Example #3
def _multi_file(reader, files, output, **kwargs):
    dfc = DataFileCollection()
    for f in files:
        d = DataFile()
        reader.read(d, f, **kwargs)
        dfc.files.append(d)
    if output is not sys.stdout:
        output = open(output, 'w')
    ctdzipex.write(dfc, output)
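
A hedged usage sketch for _multi_file; it assumes the single-file CTD Exchange reader is importable as libcchdo.formats.ctd.exchange, and the input file names are made up:

import sys
from libcchdo.formats.ctd import exchange as ctdex

# Read two hypothetical single-cast CTD Exchange files and write one
# CTD Zip Exchange to stdout.
_multi_file(ctdex, ['a_00101_ct1.csv', 'a_00102_ct1.csv'], sys.stdout)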
Example #4
    def test_merge_collections(self):
        """When merging collections, map files, then merge mapped files.

        """
        odfc = DataFileCollection()
        ddfc = DataFileCollection()

        df0 = DataFile()
        df0.globals['EXPOCODE'] = 'a'
        df0.globals['STNNBR'] = 1
        df0.globals['CASTNO'] = 1
        df0.create_columns(['CTDPRS', 'NITRAT', 'NITRIT'])
        df0['CTDPRS'].append(1, 2)
        df0['CTDPRS'].append(2, 2)
        df0['NITRAT'].append(10, 2)
        df0['NITRAT'].append(11, 2)
        df0['NITRIT'].append(10, 2)
        df0['NITRIT'].append(11, 2)
        odfc.append(df0)

        df1 = DataFile()
        df1.globals['EXPOCODE'] = 'a'
        df1.globals['STNNBR'] = 1
        df1.globals['CASTNO'] = 1
        df1.create_columns(['CTDPRS', 'NITRAT', 'NITRIT'])
        df1['CTDPRS'].append(1, 2)
        df1['CTDPRS'].append(3, 2)
        df1['NITRAT'].append(20, 2)
        df1['NITRAT'].append(21, 2)
        df1['NITRIT'].append(10, 2)
        df1['NITRIT'].append(11, 2)
        ddfc.append(df1)

        def merger(origin, deriv):
            return merge_datafiles(origin, deriv, ['CTDPRS'],
                                   ['NITRAT', 'NITRIT'])

        merged_dfc = merge_collections(odfc, ddfc, merger)

        self.assertEqual(merged_dfc.files[0]['CTDPRS'].values, [1, 2])
        self.assertEqual(merged_dfc.files[0]['NITRAT'].values, [20, 11])
        self.assertEqual(merged_dfc.files[0]['NITRIT'].values, [10, 11])

        lines = [
            # df1 has a different CTDPRS record (3)
            'Key (3,) does not exist in origin from derivative rows',
            # NITRIT columns are the same
            "Instructed to merge parameters that are not different: ['NITRIT']"
        ]
        self.assertTrue(self.ensure_lines(lines))
Example #5
def guess_ftype_dftype_format(fileobj, file_type=None, file_name=None):
    """Return a tuple of guessed file type, Datafile or DatafileCollection, and 
    the format module.

    """
    from libcchdo.model.datafile import (
        DataFile, SummaryFile, DataFileCollection)
    file_type = guess_file_type_from_file(fileobj, file_type, file_name)
    if 'zip' in file_type or file_type.startswith('archive'):
        dfile = DataFileCollection()
    elif file_type.startswith('sum'):
        dfile = SummaryFile()
    else:
        dfile = DataFile()
    format_module = guess_format_module(fileobj, file_type)
    return (file_type, dfile, format_module)
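
A sketch of consuming the returned triple; the read(dfile, fileobj) call assumes the guessed format module follows the same read convention used elsewhere in these examples, and the file name is made up:

with open('a_hy1.csv') as fileobj:
    file_type, dfile, format_module = guess_ftype_dftype_format(
        fileobj, file_name='a_hy1.csv')
    format_module.read(dfile, fileobj)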
Example #6
def split_on_cast(dfile):
    """Split a DataFile that has multiple casts into a DataFileCollection.

    Splits are done based on station cast. Each cast is a new 'file'.

    """
    coll = DataFileCollection()

    file_parameters = dfile.parameter_mnemonics_woce()

    current_file = copy(dfile)

    expocodes = dfile['EXPOCODE']
    stations = dfile['STNNBR']
    casts = dfile['CASTNO']

    expocode = expocodes[0]
    station = stations[0]
    cast = casts[0]
    for i in range(len(dfile)):
        # Check if this row is a new measurement location
        if expocodes[i] != expocode or \
           stations[i] != station or \
           casts[i] != cast:
            current_file.check_and_replace_parameters()
            coll.append(current_file)
            current_file = copy(dfile)
        expocode = expocodes[i]
        station = stations[i]
        cast = casts[i]

        # Put the current row in the current dfile
        for p in file_parameters:
            source_col = dfile[p]
            value = source_col[i]
            try:
                flag_woce = source_col.flags_woce[i]
            except IndexError:
                flag_woce = None
            try:
                flag_igoss = source_col.flags_igoss[i]
            except IndexError:
                flag_igoss = None
            current_file[p].append(value, flag_woce, flag_igoss)

    current_file.check_and_replace_parameters()
    coll.append(current_file)

    return coll
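
A short sketch, assuming dfile is a multi-cast DataFile that has already been read; each resulting 'file' still carries EXPOCODE, STNNBR, and CASTNO as columns:

coll = split_on_cast(dfile)
for cast_file in coll.files:
    # One station/cast per file; peek at the first row of the key columns.
    print cast_file['STNNBR'][0], cast_file['CASTNO'][0], len(cast_file)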
Example #7
def australian_navy_ctd(args):
    """Download and convert Australian Navy CTD data."""
    from pydap.client import open_url
    from libcchdo.thredds import crawl
    from libcchdo.formats.ctd.zip import exchange as ctdzipex
    from libcchdo.formats.zip import write as zwrite

    dfcs = []

    cf_param_to_cchdo_param = {
        'sea_water_pressure': 'CTDPRS',
        'sea_water_temperature': 'CTDTMP',
        'sea_water_practical_salinity': 'CTDSAL',
    }
    ignored_qc_flags = [
        'time_qc_flag',
        'position_qc_flag',
    ]
    qc_conventions = {
        'Proposed IODE qc scheme March 2012': {
            1: 2,  # good
            2: 5,  # not_evaluated_or_unknown
            3: 3,  # suspect
            4: 4,  # bad
            9: 9,  # missing
        },
    }

    dfc = DataFileCollection()
    catalog = "http://www.metoc.gov.au/thredds/catalog/RAN_CTD_DATA/catalog.xml"
    for url in crawl(catalog):
        df = DataFile()

        log.info(u'Reading %s', url)
        dset = open_url(url)
        vars = dset.keys()
        for vname in vars:
            var = dset[vname]
            attrs = var.attributes
            if 'standard_name' in attrs:
                std_name = attrs['standard_name']
                if std_name == 'time':
                    df.globals['_DATETIME'] = \
                        datetime(1950, 1, 1) + timedelta(var[:])
                elif std_name == 'latitude':
                    df.globals['LATITUDE'] = var[:]
                elif std_name == 'longitude':
                    df.globals['LONGITUDE'] = var[:]
                elif std_name in cf_param_to_cchdo_param:
                    cparam = cf_param_to_cchdo_param[std_name]
                    if '_FillValue' in attrs:
                        fill_value = attrs['_FillValue']
                        values = []
                        for x in var[:]:
                            if equal_with_epsilon(x, fill_value):
                                values.append(None)
                            else:
                                values.append(x)
                    else:
                        values = var[:]

                    try:
                        df[cparam].values = values
                    except KeyError:
                        df[cparam] = Column(cparam)
                        df[cparam].values = values
                elif 'status_flag' in std_name:
                    flagged_param = std_name.replace('status_flag', '').strip()
                    cparam = cf_param_to_cchdo_param[flagged_param]
                    qc_convention = attrs['quality_control_convention']
                    if qc_convention in qc_conventions:
                        qc_map = qc_conventions[qc_convention]
                        df[cparam].flags_woce = [qc_map[x] for x in var[:]]
                else:
                    log.debug('unhandled standard_name %s', std_name)
            elif ('long_name' in attrs
                  and attrs['long_name'] == 'profile identifier'):
                profile_id = var[:]
                cruise_id = profile_id / 10**4
                profile_id = profile_id - cruise_id * 10**4
                df.globals['EXPOCODE'] = str(cruise_id)
                df.globals['STNNBR'] = str(profile_id)
                df.globals['CASTNO'] = str(1)
            elif vname in ignored_qc_flags:
                df.globals['_' + vname] = var[:]
            elif (vname.endswith('whole_profile_flag')
                  or vname.endswith('sd_test')):
                pass
            else:
                log.debug('unhandled variable %s', vname)

        # attach new file to appropriate collection
        if dfc.files:
            if dfc.files[0].globals['EXPOCODE'] != df.globals['EXPOCODE']:
                dfcs.append(dfc)
                dfc = DataFileCollection()
        dfc.append(df)

    with closing(args.output) as out_file:
        # Python 2 has no nonlocal, so keep the fallback counter in a list that
        # get_filename can mutate.
        next_id = [0]

        def get_filename(dfc):
            try:
                return '{0}_ct1.zip'.format(dfc.files[0].globals['EXPOCODE'])
            except IndexError:
                next_id[0] += 1
                return '{0}_ct1.zip'.format(next_id[0])

        zwrite(dfcs, out_file, ctdzipex, get_filename)
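
A minimal invocation sketch for the subcommand above, assuming args only needs a closable, writable output attribute; the output file name is made up:

from argparse import Namespace

australian_navy_ctd(Namespace(output=open('ran_ctd_zips.zip', 'wb')))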
Example #8
    def test_read(self):
        self.datafile = DataFileCollection()
        botzipnc.read(self.datafile, self.infile)
        self.assertTrue(True)
Example #9
    def test_read(self):
        self.datafile = DataFileCollection()
        ctdzipwoce.read(self.datafile, self.infile)
        self.assertTrue(True)
Example #10
def report_argo_ctd_index(args):
    """Generates an Argo style index file of all CTD profiles.

    http://www.usgodae.org/pub/outgoing/argo/ar_index_global_prof.txt
    file,date,latitude,longitude,ocean,profiler_type,institution,date_update
    aoml/13857/profiles/R13857_001.nc,19970729200300,0.267,-16.032,A,845,AO,20080918131927

    """
    directories = []
    with closing(lsession()) as session:
        dirs = session.query(Document).filter(Document.FileType == 'Directory').all()
        for directory in dirs:
            if 'Queue' in directory.FileName:
                continue
            if 'ExpoCode' not in directory.Files:
                continue
            directories.append(directory)

    sftp = SFTP()
    sftp.connect(get_datadir_hostname())
    aftp = AFTP(sftp)

    argo_index = ArgoIndexFile()
    for directory in directories:
        ctd_files = {}
        files = directory.Files.split('\n')
        for fname in files:
            if fname.endswith('ct1.zip'):
                ctd_files['exchange'] = fname
            elif fname.endswith('nc_ctd.zip'):
                ctd_files['netcdf'] = fname
            elif fname.endswith('ct.zip'):
                ctd_files['woce'] = fname
            elif fname.endswith('su.txt'):
                ctd_files['wocesum'] = fname

        if not ctd_files:
            continue
        
        try:
            precedent_format = _pick_precedent_ctd_format(ctd_files.keys())
        except ValueError:
            continue

        cruise_dir = directory.FileName
        ctd_file = ctd_files[precedent_format]
        path = os.path.join(cruise_dir, ctd_file)

        log.debug(path)
        try:
            mtime = aftp.mtime(path)
            mtime = mtime.strftime('%Y%m%d%H%M%S')
        except IOError:
            log.error(u'Could not open file {0}'.format(path))
            # Fall back to an empty mtime rather than reusing a stale value.
            mtime = ''
        if precedent_format == 'exchange':
            files = DataFileCollection()
            with aftp.dl(path) as fff:
                if fff is None:
                    log.error(u'Could not find file {0}'.format(path))
                    continue
                log.setLevel(ERROR)
                try:
                    ctdzipex.read(files, fff, header_only=True)
                except (ValueError, InvalidOperation):
                    log.error(u'Unable to read {0}'.format(path))
                log.setLevel(DEBUG)

            for ctdfile in files:
                fpath = path + '#' + ctdfile.globals['_FILENAME']
                date = ctdfile.globals['_DATETIME']
                if date is None:
                    date = ''
                else:
                    date = date.strftime('%Y%m%d%H%M%S')
                lat = ctdfile.globals['LATITUDE']
                lon = ctdfile.globals['LONGITUDE']
                ocean = ''
                profiler_type = ''
                inst = ''
                argo_index.append(ArgoIndexProfile(
                    fpath, date, lat, lon, ocean, profiler_type, inst, mtime
                ))
        elif precedent_format == 'netcdf':
            # TODO currently there aren't any files that have netcdf precedent
            args.output.write('netcdf!!!' + path + '\n')
        elif precedent_format == 'wocesum':
            sumfile = SummaryFile()
            path = os.path.join(get_datadir_root(), path)
            with aftp.dl(path) as fff:
                if fff is None:
                    log.error(u'Could not find file {0}'.format(path))
                    continue
                log.setLevel(ERROR)
                wocesum.read(sumfile, fff)
                log.setLevel(DEBUG)

            for iii in range(len(sumfile)):
                fpath = path + '#' + str(iii)
                date = sumfile['_DATETIME'][iii]
                if date is None:
                    date = ''
                else:
                    date = date.strftime('%Y%m%d%H%M%S')
                lat = sumfile['LATITUDE'][iii]
                lon = sumfile['LONGITUDE'][iii]
                ocean = ''
                profiler_type = ''
                inst = ''
                argo_index.append(ArgoIndexProfile(
                    fpath, date, lat, lon, ocean, profiler_type, inst, mtime
                ))
        else:
            raise ValueError(u'Unknown format {0}'.format(precedent_format))

    args.output.write(str(argo_index))
Example #11
def split(dfile, expocode):
    """Split a Pangea DataFile into a DataFileCollection.

    """
    lines = [line[1:] for line in dfile.globals['header'].split('\n')[1:-2]]
    metadata = _parse_data_description(lines)
    event_metas = {}
    for meta in metadata['Event(s)']:
        event_metas[meta[''][0].split()[0]] = meta

    dfc = DataFileCollection()
    cur_event = None
    cur_file = None
    for rowi in range(len(dfile)):
        event = dfile['_EVENT'][rowi]
        if event != cur_event:
            cur_event = event
            cur_file = copy(dfile)

            try:
                event_meta = event_metas[event]
            except KeyError:
                log.error(
                    u'Unable to get event metadata for event {0}'.format(event))
                # Use an empty mapping so the metadata lookups below fall
                # through to their KeyError handlers.
                event_meta = {}

            sect, stncast = event.split('/')
            stn, cast = stncast.split('-')
            cur_file.globals['SECT_ID'] = sect
            cur_file.globals['STNNBR'] = stn
            cur_file.globals['CASTNO'] = cast
            try:
                cur_file.globals['DEPTH'] = int(
                    float(event_meta['ELEVATION'][1:-2]))
            except KeyError:
                pass
            try:
                cur_file.globals['LATITUDE'] = float(event_meta['LATITUDE'])
            except KeyError:
                pass
            try:
                cur_file.globals['LONGITUDE'] = float(event_meta['LONGITUDE'])
            except KeyError:
                pass
            try:
                cur_file.globals['_DATETIME'] = _parse_datetime(
                    event_meta['DATE/TIME'])
            except KeyError:
                pass
            cur_file.globals['EXPOCODE'] = expocode

            del cur_file.columns['LATITUDE']
            del cur_file.columns['LONGITUDE']
            del cur_file.columns['_DATETIME']
            del cur_file.columns['_EVENT']

            dfc.append(cur_file)

        for key, col in cur_file.columns.items():
            try:
                source_col = dfile[key]
            except KeyError:
                log.error(u'Missing column {0}'.format(key))
                continue
            try:
                flag_woce = source_col.flags_woce[rowi]
            except (KeyError, IndexError):
                flag_woce = None
            try:
                flag_igoss = source_col.flags_igoss[rowi]
            except (KeyError, IndexError):
                flag_igoss = None
            col.append(source_col.values[rowi], flag_woce, flag_igoss)
    return dfc
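
The _EVENT values are expected to look like 'SECT/STN-CAST'; a small sketch under that assumption (the event label and expocode below are made up):

event = 'PS71/101-2'                  # hypothetical Pangaea event label
sect, stncast = event.split('/')      # 'PS71', '101-2'
stn, cast = stncast.split('-')        # '101', '2'

dfc = split(dfile, '06AQ20071128')    # dfile: a Pangea DataFile already read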
def read(self, handle, metadata=None):
    """How to read a Bottle Bermuda Atlantic Time-Series Study file.

    This function reads bats_bottle.txt.

    Arguments:
    self - (special case, see NOTE) dictionary
    handle - file-like object containing bats_bottle.txt
    metadata - (optional) BATS cruise metadata to be used to find port dates

    NOTE: The result for this method is a special case. The bottle file format
    contains the entire BATS holdings while the internal data format splits data
    up by cruise. Because timeseries data is stored as one file per cruise, the
    end result is a dictionary with cruise_ids as keys to DataFileCollections
    (cruises) containing DataFiles (casts).

    """
    sections = _read_header_sections(self, handle)
    _read_variables(self, handle)
    parameters = _get_variables(self, handle, sections)

    # Add DON because of a note in the Variables list stating that DON is
    # reported for TON prior to BATS 121.
    parameters.append(['DON', None, 'umol/kg'])

    manual_parameters = [
        ['BTLNBR', ''],
        ['_DATETIME', ''],
        ['LATITUDE', ''],
        ['LONGITUDE', ''],
        ['_ACTUAL_DEPTH', 'METERS'],
    ]
    columns = [x[0] for x in manual_parameters]
    units = [x[1] for x in manual_parameters]

    s = None
    for i, (var, d, u) in enumerate(parameters):
        if var == 'Depth':
            s = i + 1
            continue
        # Only want to add parameters after Depth. The others were done manually.
        if s is None:
            continue
        try:
            var = bats_to_param[var]
        except KeyError:
            pass
        columns.append(var)
        units.append(u)

    template_df = DataFile()
    template_df.create_columns(columns, units)
    template_df.check_and_replace_parameters(convert=False)

    for sec, lines in sections.items():
        if sec == 'Variables list':
            continue
        if sec != 'Comments':
            continue
        template_df.globals['_{0}'.format(sec)] = '\n'.join(lines)

    df = None
    params_auto = parameters[s:]
    dfi = 0
    for i, l in enumerate(handle):
        parts = l.split()

        id = parts[0]
        (cruise_type, type_id, cruise_num, cruise_id, cast_type, cast_id,
         nisk_id) = _parse_bats_id(id)
        ship = _ship_from_cruise_num(cruise_num)
        if not ship:
            ship = 'R/V Atlantic Explorer'

        if (df is None or df.globals['_OS_ID'] != cruise_id
                or df.globals['STNNBR'] != cruise_type
                or df.globals['CASTNO'] != cast_id):
            if df is not None:
                # Done reading one cast. Finalize it.
                log.info(u'finalizing cast {0} {1} {2}'.format(
                    df.globals['_OS_ID'], df.globals['STNNBR'],
                    df.globals['CASTNO']))
                try:
                    meta = metadata[cruise_id]
                    port_date = meta['dates'][0]
                except (TypeError, KeyError):
                    port_date = None
                if not port_date:
                    port_date = min(df['_DATETIME'])
                df.globals['EXPOCODE'] = create_expocode(
                    ship_code(ship, raise_on_unknown=False), port_date)
                log.info(df.globals['EXPOCODE'])
                df.globals['DEPTH'] = max(df['_ACTUAL_DEPTH'])
                collapse_globals(df, ['_DATETIME', 'LATITUDE', 'LONGITUDE'])
                # Normalize all the parameter column lengths. There may be
                # columns that did not get data written to them so make sure
                # they are just as long as the rest
                length = len(df)
                for c in df.columns.values():
                    c.set_length(length)
                try:
                    dfc = self[df.globals['_OS_ID']]
                except KeyError:
                    dfc = self[df.globals['_OS_ID']] = DataFileCollection()
                dfc.files.append(df)
                dfi = 0

            # Create a new cast
            df = copy(template_df)
            df.globals['SECT_ID'] = BATS_SECT_ID
            df.globals['_SHIP'] = ship
            df.globals['_OS_ID'] = cruise_id
            df.globals['STNNBR'] = cruise_type
            df.globals['CASTNO'] = cast_id

        df['BTLNBR'].set(dfi, nisk_id)

        dt_ascii = datetime.strptime(parts[1] + parts[3], '%Y%m%d%H%M')
        dt_deci = bats_time_to_dt(parts[2])
        #if dt_ascii != dt_deci:
        #    log.warn(
        #        u'Dates differ on data row {0}: {5} {1!r}={2} '
        #        '{3!r}={4}'.format(i, parts[1] + parts[3], dt_ascii, parts[2],
        #                           dt_deci, dt_deci - dt_ascii))
        df['_DATETIME'].set(dfi, dt_ascii)

        df['LATITUDE'].set(dfi, Decimal(parts[4]))
        df['LONGITUDE'].set(dfi, Decimal(correct_longitude(parts[5])))
        df['_ACTUAL_DEPTH'].set_check_range(dfi, Decimal(parts[6]))

        parts_auto = parts[s:]
        for p, v in zip(params_auto, parts_auto):
            param = p[0]
            try:
                param = bats_to_param[param]
            except KeyError:
                pass
            if cruise_num < 121 and param == 'TON':
                param = 'DON'

            if (equal_with_epsilon(v, -9) or equal_with_epsilon(v, -9.9)
                    or equal_with_epsilon(v, -9.99)):
                df[param].set_check_range(dfi, None)
            # TODO determine whether -10 is just bad formatting for -9.9
            elif equal_with_epsilon(v, -10):
                #log.warn(u'Possible missing data value {0}'.format(v))
                df[param].set_check_range(dfi, None)
            elif v == 0:
                log.warn(u'Data under detection limit, set flag to '
                         'WOCE water sample questionable measurement')
                df[param].set_check_range(dfi, None, flag=3)
            else:
                df[param].set_check_range(dfi, Decimal(v))

        dfi += 1
        # Since this is a super long file that contains multiple cruises and
        # casts, as the file is processed it is split apart into
        # DataFileCollection(s) (one per cruise) containing DataFile objects
        # for each cast.
        if i % 100 == 0:
            log.info(u'processed {0} lines'.format(i))
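
A usage sketch consistent with the NOTE in the docstring above: self is a plain dict keyed by cruise id and handle is the open bats_bottle.txt (path is illustrative):

cruises = {}
with open('bats_bottle.txt') as handle:
    read(cruises, handle)
for cruise_id, dfc in cruises.items():
    log.info(u'{0}: {1} casts'.format(cruise_id, len(dfc.files)))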