Exemplo n.º 1
0
    def __init__(self, y1, m1, y2, m2):
        """
        Set up per-month arrays spanning the inclusive range (y1, m1) to (y2, m2).

        :param y1: year of the first month in the range
        :param m1: month of the first month in the range
        :param y2: year of the last month in the range
        :param m2: month of the last month in the range
        """
        # Number of calendar months in the inclusive span.
        self.n = 12 * (y2 - y1) + m2 - m1 + 1

        self.years = np.zeros(self.n)
        self.months = np.zeros(self.n)
        self.counter = np.zeros(self.n)

        # Record the year and month of each slot, in generator order.
        for slot, (year, month) in enumerate(qc.year_month_gen(y1, m1, y2, m2)):
            self.years[slot] = year
            self.months[slot] = month
Exemplo n.º 2
0
def main(argv):
    '''
    tracking_qc.py invoked by typing::

    python2.7 tracking_qc.py -config configuration.txt -id "SHIPNAME" 

    This quality controls data for the chosen ID (which will be end-padded with spaces). The location 
    of the data and the locations of the climatology files are all to be specified in the configuration files.
    '''

    print '########################'
    print 'Running tracking_qc'
    print '########################'

    parser = argparse.ArgumentParser(
        description='Marine QC system, main program')
    parser.add_argument('-config',
                        type=str,
                        default='configuration.txt',
                        help='name of config file')
    parser.add_argument('-id', type=str, help='ID to read in and process')
    args = parser.parse_args()

    id = args.id
    while len(id) < 9:
        id += ' '

    print "running on ICOADS, this is not a test!"

    print 'Input file is ', args.config
    print 'Running for ID ', id
    print ''

    config = ConfigParser.ConfigParser()
    config.read(args.config)
    icoads_dir = config.get('Directories', 'ICOADS_dir')
    out_dir = config.get('Directories', 'out_dir')

    with open(config.get('Files', 'parameter_file'), 'r') as f:
        parameters = json.load(f)

    print 'ICOADS directory =', icoads_dir
    print 'Output to', out_dir
    print ''

    v = ex.Voyage()

    for year, month in qc.year_month_gen(1985, 1, 2014, 12):

        sy = str(year)
        sm = "%02d" % (month)

        filename = out_dir + '/' + sy + '/' + sm + '/Variables_' + sy + sm + '_' + id + '_standard.csv'
        print(filename)
        #try top open file containing ID data
        try:
            with open(filename, 'r') as csvfile:

                #get the headers from the CSV file and sort them out: strip trailing carriage return, split by commas, fix duplicates
                headers = csvfile.readline()
                headers = headers[:-1]
                headers = headers.split(',')
                headers[
                    11] = 'AT_anom'  #THIS IS AN AWFUL BODGE, HEADERS SHOULD BE UNIQUE GRRRR
                headers[
                    13] = 'SST_anom'  #THIS IS AN AWFUL BODGE, HEADERS SHOULD BE UNIQUE GRRRR

                #now read the rest of the CSV file using the headers as a dictionary. Need to add OSTIA information as "ext" information in the rep
                reader = csv.DictReader(csvfile, fieldnames=headers)
                for line in reader:
                    rep = ex.MarineReportQC(easy_imma(line))
                    rep.setext('OSTIA', line['OSTIA'])
                    rep.setext('ICE', line['ICE'])
                    rep.setext('BGVAR', line['BGVAR'])

                    v.add_report(rep)
#Now do soemthing if it goes wrong
        except:
            print("Something went wrong. Does the file " + filename +
                  " exist?")

    print ""
    print "read in " + str(len(v)) + " reports from the good ship " + id

    #all the data now read in do something with it. This isn't very exciting, but we can...print out all the obs
    for rep in v.rep_feed():
        print rep.getvar('ID'), rep.getvar('YR'), rep.getvar('MO'), rep.getvar(
            'LAT'), rep.getvar('LON')


#Or, we can run a positional track check on the combined track...
    v.track_check(parameters['track_check'])
    #and print out the results
    for rep in v.rep_feed():
        print rep.get_qc('POS', 'trk')
Exemplo n.º 3
0
def main(argv):
    """
    This program reads in data from ICOADS.3.0.0/ICOADS.3.0.1 and applies quality control processes to it, flagging data
    as good or bad according to a set of different criteria. Optionally it will replace drifting buoy SST data in
    ICOADS.3.0.1 with drifter data taken from the GDBC portal.

    The first step of the process is to read in various SST and MAT climatologies from file. These are 1degree latitude
    by 1 degree longitude by 73 pentad fields in NetCDF format.

    The program then loops over all specified years and months reads in the data needed to QC that month and then
    does the QC. There are three stages in the QC

    basic QC - this proceeds one observation at a time. Checks are relatively simple and detect gross errors

    track check - this works on Voyages consisting of all the observations from a single ship (or at least a single ID)
    and identifies observations which make for an implausible ship track

    buddy check - this works on Decks which are large collections of observations and compares observations to their
    neighbours
    """

    print('########################')
    print('Running make_and_full_qc')
    print('########################')

    parser = argparse.ArgumentParser(
        description='Marine QC system, main program')
    parser.add_argument('-config',
                        type=str,
                        default='configuration.txt',
                        help='name of config file')
    parser.add_argument('-year1',
                        type=int,
                        default=1850,
                        help='First year for processing')
    parser.add_argument('-year2',
                        type=int,
                        default=1850,
                        help='Final year for processing')
    parser.add_argument('-month1',
                        type=int,
                        default=1,
                        help='First month for processing')
    parser.add_argument('-month2',
                        type=int,
                        default=1,
                        help='Final month for processing')
    parser.add_argument('-tracking',
                        action='store_true',
                        help='perform tracking QC')
    args = parser.parse_args()

    inputfile = args.config
    year1 = args.year1
    year2 = args.year2
    month1 = args.month1
    month2 = args.month2
    tracking = args.tracking

    print("running on ICOADS, this is not a test!")

    print('Input file is {}'.format(inputfile))
    print('Running from {} {} to {} {}'.format(month1, year1, month2, year2))
    print('')

    config = ConfigParser.ConfigParser()
    config.read(inputfile)
    icoads_dir = config.get('Directories', 'ICOADS_dir')
    out_dir = config.get('Directories', 'out_dir')
    bad_id_file = config.get('Files', 'IDs_to_exclude')
    version = config.get('Icoads', 'icoads_version')

    print('ICOADS directory = {}'.format(icoads_dir))
    print('ICOADS version = {}'.format(version))
    print('Output to {}'.format(out_dir))
    print('List of bad IDs = {}'.format(bad_id_file))
    print('Parameter file = {}'.format(config.get('Files', 'parameter_file')))
    print('')

    ids_to_exclude = bf.process_bad_id_file(bad_id_file)

    # read in the climatology files used by the buddy checks
    sst_pentad_stdev = clim.Climatology.from_filename(
        config.get('Climatologies', 'Old_SST_stdev_climatology'), 'sst')

    sst_stdev_1 = clim.Climatology.from_filename(
        config.get('Climatologies', 'SST_buddy_one_box_to_buddy_avg'), 'sst')
    sst_stdev_2 = clim.Climatology.from_filename(
        config.get('Climatologies', 'SST_buddy_one_ob_to_box_avg'), 'sst')
    sst_stdev_3 = clim.Climatology.from_filename(
        config.get('Climatologies', 'SST_buddy_avg_sampling'), 'sst')

    with open(config.get('Files', 'parameter_file'), 'r') as f:
        parameters = json.load(f)

    print("Reading climatologies from parameter file")
    climlib = ex.ClimatologyLibrary()
    for entry in parameters['climatologies']:
        print("{} {}".format(entry[0], entry[1]))
        climlib.add_field(entry[0], entry[1],
                          clim.Climatology.from_filename(entry[2], entry[3]))

    # QC one output month at a time
    for year, month in qc.year_month_gen(year1, month1, year2, month2):

        print("{} {}".format(year, month))

        last_year, last_month = qc.last_month_was(year, month)
        next_year, next_month = qc.next_month_is(year, month)

        reps = ex.Deck()
        count = 0
        lastday = -99

        # read the target month plus the month either side so the track and
        # buddy checks have context at the month boundaries
        for readyear, readmonth in qc.year_month_gen(last_year, last_month,
                                                     next_year, next_month):

            print("{} {}".format(readyear, readmonth))

            ostia_bg_var = None
            if tracking:
                ostia_bg_var = clim.Climatology.from_filename(
                    config.get('Climatologies',
                               qc.season(readmonth) + '_ostia_background'),
                    'bg_var')

            filename = bf.icoads_filename_from_stub(
                parameters['icoads_dir'], parameters['icoads_filenames'],
                readyear, readmonth)
            try:
                icoads_file = gzip.open(filename, "r")
            except IOError:
                print("no ICOADS file for {} {}".format(readyear, readmonth))
                continue

            rec = IMMA()

            for line in icoads_file:

                try:
                    rec.readstr(line)
                    readob = True
                # fixed: was a bare "except:", which also swallowed
                # KeyboardInterrupt/SystemExit; any parse failure just marks
                # the ob as unread
                except Exception:
                    readob = False
                    print("Rejected ob {}".format(line))

                # keep only readable obs from the month being read whose ID
                # is not on the exclusion list
                if (not (rec.data['ID'] in ids_to_exclude) and readob
                        and rec.data['YR'] == readyear
                        and rec.data['MO'] == readmonth):

                    rep = ex.MarineReportQC(rec)
                    del rec

                    # if day has changed then read in OSTIA field if available and append SST and sea-ice fraction
                    # to the observation metadata
                    # NOTE(review): the cache key is the day-of-month only;
                    # an ob in a new month with the same DY as lastday would
                    # reuse the previous month's field - confirm intended
                    if tracking and readyear >= 1985 and rep.getvar(
                            'DY') is not None:
                        if rep.getvar('DY') != lastday:
                            lastday = rep.getvar('DY')
                            y_year, y_month, y_day = qc.yesterday(
                                readyear, readmonth, lastday)

                            ofname = bf.get_background_filename(
                                parameters['background_dir'],
                                parameters['background_filenames'], y_year,
                                y_month, y_day)

                            climlib.add_field(
                                'OSTIA', 'background',
                                clim.Climatology.from_filename(
                                    ofname, 'analysed_sst'))
                            climlib.add_field(
                                'OSTIA', 'ice',
                                clim.Climatology.from_filename(
                                    ofname, 'sea_ice_fraction'))

                        rep_clim = climlib.get_field(
                            'OSTIA', 'background').get_value_ostia(
                                rep.lat(), rep.lon())
                        # convert the OSTIA background from Kelvin to Celsius
                        if rep_clim is not None:
                            rep_clim -= 273.15

                        rep.setext('OSTIA', rep_clim)
                        rep.setext(
                            'ICE',
                            climlib.get_field('OSTIA', 'ice').get_value_ostia(
                                rep.lat(), rep.lon()))
                        rep.setext(
                            'BGVAR',
                            ostia_bg_var.get_value_mds_style(
                                rep.lat(), rep.lon(), rep.getvar('MO'),
                                rep.getvar('DY')))

                    # attach the climatological normal for each variable
                    for varname in ['SST']:
                        rep_clim = climlib.get_field(
                            varname, 'mean').get_value_mds_style(
                                rep.lat(), rep.lon(), rep.getvar('MO'),
                                rep.getvar('DY'))
                        rep.add_climate_variable(varname, rep_clim)

                    rep.perform_base_qc(parameters)
                    rep.set_qc(
                        'POS', 'month_match',
                        qc.month_match(year, month, rep.getvar('YR'),
                                       rep.getvar('MO')))

                    reps.append(rep)
                    count += 1

                # fresh record for the next line (the old one was consumed by
                # MarineReportQC when the ob was accepted)
                rec = IMMA()

            icoads_file.close()

        print("Read {} ICOADS records".format(count))

        # filter the obs into passes and fails of basic positional QC
        filt = ex.QC_filter()
        filt.add_qc_filter('POS', 'date', 0)
        filt.add_qc_filter('POS', 'time', 0)
        filt.add_qc_filter('POS', 'pos', 0)
        filt.add_qc_filter('POS', 'blklst', 0)

        reps.add_filter(filt)

        # track check the passes one ship at a time
        count_ships = 0
        for one_ship in reps.get_one_platform_at_a_time():

            one_ship.track_check(parameters['track_check'])
            one_ship.iquam_track_check(parameters['IQUAM_track_check'])
            one_ship.spike_check(parameters['IQUAM_spike_check'])
            one_ship.find_saturated_runs(parameters['saturated_runs'])
            one_ship.find_multiple_rounded_values(
                parameters['multiple_rounded_values'])

            for varname in ['SST']:
                one_ship.find_repeated_values(
                    parameters['find_repeated_values'], intype=varname)

            count_ships += 1

        print("Track checked {} ships".format(count_ships))

        # SST buddy check: only obs passing positional, track and basic SST QC
        filt = ex.QC_filter()
        filt.add_qc_filter('POS', 'is780', 0)
        filt.add_qc_filter('POS', 'date', 0)
        filt.add_qc_filter('POS', 'time', 0)
        filt.add_qc_filter('POS', 'pos', 0)
        filt.add_qc_filter('POS', 'blklst', 0)
        filt.add_qc_filter('POS', 'trk', 0)
        filt.add_qc_filter('SST', 'noval', 0)
        filt.add_qc_filter('SST', 'freez', 0)
        filt.add_qc_filter('SST', 'clim', 0)
        filt.add_qc_filter('SST', 'nonorm', 0)

        reps.add_filter(filt)

        reps.bayesian_buddy_check('SST', sst_stdev_1, sst_stdev_2, sst_stdev_3,
                                  parameters)
        reps.mds_buddy_check('SST', sst_pentad_stdev,
                             parameters['mds_buddy_check'])

        extdir = bf.safe_make_dir(out_dir, year, month)

        if tracking:
            # set QC for output by ID - buoys only and passes base SST QC
            filt = ex.QC_filter()
            filt.add_qc_filter('POS', 'month_match', 1)
            filt.add_qc_filter('POS', 'isdrifter', 1)

            reps.add_filter(filt)

            idfile = open(extdir + '/ID_file.txt', 'w')
            for one_ship in reps.get_one_platform_at_a_time():

                if len(one_ship) > 0:
                    thisid = one_ship.getrep(0).getvar('ID')
                    if thisid is not None:
                        idfile.write(thisid + ',' + ex.safe_filename(thisid) +
                                     '\n')
                        one_ship.write_output(parameters['runid'], extdir,
                                              year, month)
            idfile.close()

        # free the month's deck before reading the next one
        del reps
0
def main(argv):
    """
    This program reads in data from ICOADS.2.5.1 and applies quality control processes to it, flagging data as
    good or bad according to a set of different criteria.

    The first step of the process is to read in various SST and MAT climatologies from file. These are 1degree latitude
    by 1 degree longitude by 73 pentad fields in NetCDF format.

    The program then loops over all specified years and months reads in the data needed to QC that month and then
    does the QC. There are three stages in the QC

    basic QC - this proceeds one observation at a time. Checks are relatively simple and detect gross errors

    track check - this works on Voyages consisting of all the observations from a single ship (or at least a single ID)
    and identifies observations which make for an implausible ship track

    buddy check - this works on Decks which are large collections of observations and compares observations to their
    neighbours
    """

    print('########################')
    print('Running make_and_full_qc')
    print('########################')

    parser = argparse.ArgumentParser(
        description='Marine QC system, main program')
    parser.add_argument('-config',
                        type=str,
                        default='configuration.txt',
                        help='name of config file')
    parser.add_argument('-tracking',
                        action='store_true',
                        help='perform tracking QC')
    parser.add_argument('-jobs',
                        type=str,
                        default='jobs.json',
                        help='name of job file')
    parser.add_argument('-job_index', type=int, default=0, help='job index')

    args = parser.parse_args()

    inputfile = args.config
    jobfile = args.jobs
    # job indices on the command line are 1-based; the jobs list is 0-based
    jobindex = args.job_index - 1
    tracking = args.tracking

    with open(jobfile) as fp:
        jobs = json.load(fp)

    year1 = jobs['jobs'][jobindex]['year1']
    year2 = jobs['jobs'][jobindex]['year2']
    month1 = jobs['jobs'][jobindex]['month1']
    month2 = jobs['jobs'][jobindex]['month2']
    input_schema = jobs['schema']
    code_tables = jobs['code_tables']

    verbose = True  # need set to read as arg in future

    print('Input file is {}'.format(inputfile))
    print('Running from {} {} to {} {}'.format(month1, year1, month2, year2))
    print('')

    config = ConfigParser.ConfigParser()
    config.read(inputfile)
    icoads_dir = config.get('Directories', 'ICOADS_dir')
    out_dir = config.get('Directories', 'out_dir')
    bad_id_file = config.get('Files', 'IDs_to_exclude')
    version = config.get('Icoads', 'icoads_version')

    print('ICOADS directory = {}'.format(icoads_dir))
    print('ICOADS version = {}'.format(version))
    print('Output to {}'.format(out_dir))
    print('List of bad IDs = {}'.format(bad_id_file))
    print('Parameter file = {}'.format(config.get('Files', 'parameter_file')))
    print('')

    ids_to_exclude = bf.process_bad_id_file(bad_id_file)

    # read in the climatology files used by the buddy checks
    sst_pentad_stdev = clim.Climatology.from_filename(
        config.get('Climatologies', 'Old_SST_stdev_climatology'), 'sst')

    sst_stdev_1 = clim.Climatology.from_filename(
        config.get('Climatologies', 'SST_buddy_one_box_to_buddy_avg'), 'sst')
    sst_stdev_2 = clim.Climatology.from_filename(
        config.get('Climatologies', 'SST_buddy_one_ob_to_box_avg'), 'sst')
    sst_stdev_3 = clim.Climatology.from_filename(
        config.get('Climatologies', 'SST_buddy_avg_sampling'), 'sst')

    with open(config.get('Files', 'parameter_file'), 'r') as f:
        parameters = json.load(f)

    # read in high resolution SST climatology file
    for entry in parameters['hires_climatologies']:
        if entry[0] == 'SST' and entry[1] == 'mean':
            sst_climatology_file = entry[2]
            print("hires sst climatology file {}".format(sst_climatology_file))

    climlib = ex.ClimatologyLibrary()
    climlib.add_field(
        'SST', 'mean',
        clim.Climatology.from_filename(sst_climatology_file, 'temperature'))

    # QC one output month at a time
    for year, month in qc.year_month_gen(year1, month1, year2, month2):

        print("{} {}".format(year, month))

        last_year, last_month = qc.last_month_was(year, month)
        next_year, next_month = qc.next_month_is(year, month)

        reps = ex.Deck()
        count = 0

        # read the target month plus the month either side so the track and
        # buddy checks have context at the month boundaries
        for readyear, readmonth in qc.year_month_gen(last_year, last_month,
                                                     next_year, next_month):

            print("{} {}".format(readyear, readmonth))

            filename = icoads_dir + '{:4d}-{:02d}.psv'.format(
                readyear, readmonth)
            # YR|MO|DY|HR|LAT|LON|DS|VS|ID|AT|SST|DPT|DCK|SLP|SID|PT|UID|W|D|IRF|bad_data|outfile
            imma_obj = pd.read_csv(filename,
                                   sep='|',
                                   header=None,
                                   names=[
                                       'YR', 'MO', 'DY', 'HR', 'LAT', 'LON',
                                       'DS', 'VS', 'ID', 'AT', 'SST', 'DPT',
                                       'DCK', 'SLP', 'SID', 'PT', 'UID', 'W',
                                       'D', 'IRF', 'bad_data', 'outfile'
                                   ],
                                   low_memory=False)

        # NOTE(review): Series.replace matches whole values, not substrings;
        # if the intent was to strip embedded spaces from IDs this needs
        # .str.replace - confirm against the input data
            imma_obj['ID'].replace(' ', '', inplace=True)
            imma_obj = imma_obj.sort_values(['YR', 'MO', 'DY', 'HR', 'ID'],
                                            axis=0,
                                            ascending=True)
            imma_obj = imma_obj.reset_index(drop=True)

            data_index = imma_obj.index

            rec = IMMA()

            for idx in data_index:
                # set missing values to None
                for k, v in imma_obj.loc[idx, ].to_dict().items():
                    rec.data[k] = to_none(v)

                # keep only obs from the month being read, with a valid day,
                # whose ID is not on the exclusion list
                if (not (rec.data['ID'] in ids_to_exclude)
                        and rec.data['YR'] == readyear
                        and rec.data['MO'] == readmonth
                        and rec.data['DY'] is not None):

                    rep = ex.MarineReportQC(rec)
                    del rec

                    # fixed: this processing was previously dedented outside
                    # the guard above, so filtered-out rows re-processed a
                    # stale rep (or raised NameError on the first row)
                    rep_clim = climlib.get_field('SST', 'mean').get_value(
                        rep.lat(), rep.lon(), rep.getvar('MO'),
                        rep.getvar('DY'))
                    rep.add_climate_variable('SST', rep_clim)

                    rep.perform_base_sst_qc(parameters)
                    rep.set_qc(
                        'POS', 'month_match',
                        qc.month_match(year, month, rep.getvar('YR'),
                                       rep.getvar('MO')))

                    reps.append(rep)
                    count += 1

                # fresh record for the next row (the old one was consumed by
                # MarineReportQC when the ob was accepted)
                rec = IMMA()

        print("Read {} ICOADS records".format(count))

        # filter the obs into passes and fails of basic positional QC
        filt = ex.QC_filter()
        filt.add_qc_filter('POS', 'date', 0)
        filt.add_qc_filter('POS', 'time', 0)
        filt.add_qc_filter('POS', 'pos', 0)
        filt.add_qc_filter('POS', 'blklst', 0)

        reps.add_filter(filt)

        # track check the passes one ship at a time
        count_ships = 0
        for one_ship in reps.get_one_platform_at_a_time():
            # corrections applied can move reports between months; corrections
            # are currently applied after reading, so re-sort each voyage
            one_ship.sort()
            one_ship.track_check(parameters['track_check'])
            one_ship.find_repeated_values(parameters['find_repeated_values'],
                                          intype='SST')
            count_ships += 1

        print("Track checked {} ships".format(count_ships))

        # SST buddy check: only obs passing positional, track and basic SST QC
        filt = ex.QC_filter()
        filt.add_qc_filter('POS', 'is780', 0)
        filt.add_qc_filter('POS', 'date', 0)
        filt.add_qc_filter('POS', 'time', 0)
        filt.add_qc_filter('POS', 'pos', 0)
        filt.add_qc_filter('POS', 'blklst', 0)
        filt.add_qc_filter('POS', 'trk', 0)
        filt.add_qc_filter('SST', 'noval', 0)
        filt.add_qc_filter('SST', 'freez', 0)
        filt.add_qc_filter('SST', 'clim', 0)
        filt.add_qc_filter('SST', 'nonorm', 0)

        reps.add_filter(filt)

        reps.bayesian_buddy_check('SST', sst_stdev_1, sst_stdev_2, sst_stdev_3,
                                  parameters)
        reps.mds_buddy_check('SST', sst_pentad_stdev,
                             parameters['mds_buddy_check'])

        extdir = bf.safe_make_dir(out_dir, year, month)

        varnames_to_print = {
            'SST': [
                'bud', 'clim', 'nonorm', 'freez', 'noval', 'nbud', 'bbud',
                'rep', 'spike', 'hardlimit'
            ]
        }

        reps.write_qc('hires_' + parameters['runid'], extdir, year, month,
                      varnames_to_print)

        if tracking:
            # set QC for output by ID - buoys only and passes base SST QC
            filt = ex.QC_filter()
            filt.add_qc_filter('POS', 'month_match', 1)
            filt.add_qc_filter('POS', 'isdrifter', 1)

            reps.add_filter(filt)

            idfile = open(extdir + '/ID_file.txt', 'w')
            for one_ship in reps.get_one_platform_at_a_time():

                if len(one_ship) > 0:
                    thisid = one_ship.getrep(0).getvar('ID')
                    if thisid is not None:
                        idfile.write(thisid + ',' + ex.safe_filename(thisid) +
                                     '\n')
                        one_ship.write_qc('hires_' + parameters['runid'],
                                          extdir, year, month,
                                          varnames_to_print)
            idfile.close()

        # free the month's deck before reading the next one
        del reps
Exemplo n.º 5
0
def main(argv):
    """
    Calls the tracking qc checks for a specified drifting buoy

    Invoked as::

      python tracking_qc.py -config configuration.txt -id BUOYID -yr1 YEAR -mn1 MONTH -yr2 YEAR2 -mn2 MONTH2
      -edge EDGE -runmonthid RUNID

    Inputs

    -config
      specifies the location of the configuration file.

    -id
      ID of the buoy to which tracking QC will be applied

    -yr1
      year of the first month to QC

    -mn1
      month of the first month to QC

    -yr2
      year of the last month to QC

    -mn2
      month of the last month to QC

    -edge
      specific type of edge case, one of: new, regular, start_edge_case, end_edge_case

    -runmonthid
      used to label special directories for start and end edge cases. This is of the form YYYYMM-YYYYMM

    This quality controls drifter data for the chosen ID (which will be end-padded with spaces) over the
    specified time range. The time range should specify a single complete drifter record. The location of the 
    input data and the location of the qc-parameters file are specified in the configuration file. The qc-parameters 
    file specifies the input parameters used by the various tracking checks. Input data are from the marine QC system. 
    These are in 'per-ID per-month' csv format with observation variables, basic QC flags and SST QC flags stored in 
    separate files and linkable via observation UID. 

    A drifting buoy record is first assembled from the input data files and stored as a :class:`.Voyage` of 
    :class:`.MarineReport` s. This record is then passed to the various tracking QC checks. Some observations that
    fail basic or SST QC are not passed to the tracking QC checks and will not receive tracking QC flags. 
    Which observations are filtered out is dependent on tracking QC check.

    Output is written to a file in the track_out_dir specified in the configuration file. Where it is written depends on the
    EDGE flag (EDGE can be 'new', 'regular', 'start_edge_case' or 'end_edge_case'). The RUNID is intended to label
    the directories to which edge cases are sent. It should be of the form YYYMM-YYYMM specifying the start and end
    dates for which the overall QC was run.

    UserWarning is raised for problems with the input files.
    AssertionError is raised if inputs (parameters or MarineReport data) to a QC check are invalid
    """

    parser = argparse.ArgumentParser(description='Marine QC system, main program')
    parser.add_argument('-config', type=str, default='configuration.txt', help='name of config file')
    parser.add_argument('-id', type=str, help='ID to read in and process')
    parser.add_argument('-yr1', type=int, help='First year of data for drifting buoy')
    parser.add_argument('-mn1', type=int, help='First month of data for drifting buoy')
    parser.add_argument('-yr2', type=int, help='Last year of data for drifting buoy')
    parser.add_argument('-mn2', type=int, help='Last month of data for drifting buoy')
    parser.add_argument('-edge', nargs='+', help='list of edge case descriptors')
    parser.add_argument('-runmonthid', type=str, default='',
                        help='string for tagging directories should be of form YYYYMM-YYYYMM')
    args = parser.parse_args()

    edge = args.edge
    runmonthid = args.runmonthid

    oldqc = False # this can be used to switch in the old versions of the aground and speed checks

    # pad the buoy ID with trailing spaces to the fixed 9-character width
    # used in the input/output file names
    target_id = args.id
    while len(target_id) < 9:
        target_id += ' '

    print('Running track QC for ID {}'.format(target_id))
    print('')
    print("Type of case: {}".format(edge[0]))
    print("Specific run id from wrapper script: {}".format(runmonthid))

    config = ConfigParser.ConfigParser()
    config.read(args.config)
    out_dir = config.get('Directories', 'out_dir')
    track_out_dir = config.get('Directories', 'track_out_dir')

    print("{}".format(out_dir))

    with open(config.get('Files', 'parameter_file'), 'r') as f:
        parameters = json.load(f)

    rep_list = []  # this will store input data as MarineReports

    # running count of reports read so far; used as the starting index when
    # aligning each month's QC-flag files with rep_list by row position
    count = 0
    for year, month in qc.year_month_gen(args.yr1, args.mn1, args.yr2, args.mn2):

        sy = str(year)
        sm = "{:02}".format(month)

        # input data files
        filename = "{0}/{1}/{2}/Variables_{1}{2}_{3}_{4}.csv".format(out_dir, sy, sm, target_id, parameters['runid'])
        posqc_filename = "{0}/{1}/{2}/POS_qc_{1}{2}_{3}_{4}.csv".format(out_dir, sy, sm, target_id, parameters['runid'])
        sstqc_filename = "{0}/{1}/{2}/SST_qc_{1}{2}_{3}_{4}.csv".format(out_dir, sy, sm, target_id, parameters['runid'])

        # check if any data exists for this month before continuing
        # NOTE(review): the month is skipped only when *all three* files are
        # absent; a partial set falls through to the consistency check below
        # and raises UserWarning there
        if not (os.path.isfile(filename) or
                os.path.isfile(posqc_filename) or
                os.path.isfile(sstqc_filename)):
            continue

        print('reading data for: {}/{}'.format(sy, sm))
        # check all files exist, have data and have same amount of data before proceeding
        file_fail = False
        filelines = []
        for infile in [filename, posqc_filename, sstqc_filename]:
            try:
                with open(infile, 'r') as file:
                    # count lines so the three files can be cross-checked for
                    # equal length (they are matched row-by-row later)
                    linecount = 0
                    for line in file:
                        linecount += 1
                    filelines.append(linecount)
                    if linecount == 0:
                        message = 'empty file: ' + infile
                        file_fail = True
                    if linecount == 1:
                        print('only header in {}'.format(infile))
            except IOError:
                message = 'could not open ' + infile
                file_fail = True
        if not all(x == filelines[0] for x in filelines):
            message = 'file lengths do not match'
            file_fail = True
        if file_fail:
            raise UserWarning('problem with files for {}/{}: '.format(sy, sm), message)

        # read in ID data
        try:
            with open(filename, 'r') as csvfile:
                # get the headers from the CSV file and sort them out:
                # strip trailing carriage return, split by commas, fix duplicates
                headers = csvfile.readline()
                headers = headers[:-1]
                headers = headers.split(',')
                headers[11] = 'AT_anom'  # THIS IS AN AWFUL BODGE, HEADERS SHOULD BE UNIQUE GRRRR
                headers[13] = 'SST_anom'  # THIS IS AN AWFUL BODGE, HEADERS SHOULD BE UNIQUE GRRRR

                # now read the rest of the CSV file using the headers as a dictionary.
                # Need to add OSTIA information as "ext" information in the rep
                reader = csv.DictReader(csvfile, fieldnames=headers)
                nrep = 0
                for line in reader:
                    rep = ex.MarineReportQC(EasyImma(line))
                    # variables not in the Extended_IMMA.py VARLIST are not added by the above step,
                    # so OSTIA, ICE and BGVAR variables now need adding manually
                    # NOTE(review): csv.DictReader yields strings; the value is
                    # None only when the column is missing from a short row —
                    # an empty-string value would make float() raise (caught below)
                    rep.setext('OSTIA', None if line['OSTIA'] is None else float(line['OSTIA']))
                    rep.setext('ICE', None if line['ICE'] is None else float(line['ICE']))
                    rep.setext('BGVAR', None if line['BGVAR'] is None else float(line['BGVAR']))
                    rep_list.append(rep)
                    nrep += 1
        except Exception as error:
            print("Something went wrong populating report list")
            raise

        # now read in basic qc data
        try:
            with open(posqc_filename, 'r') as csvfile:
                # get the headers from the CSV file and sort them out:
                # strip trailing carriage return, split by commas, fix duplicates
                headers = csvfile.readline()
                headers = headers[:-1]
                headers = headers.split(',')

                # now read the rest of the CSV file using the headers as a dictionary.
                # QC rows are matched to reports positionally (indx), with the
                # UID column cross-checked to catch any misalignment
                reader = csv.DictReader(csvfile, fieldnames=headers)
                indx = count
                for line in reader:
                    for key in line:
                        if key == 'UID':
                            uid = rep_list[indx].getvar('UID')
                            if line[key].strip() != uid.strip():
                                raise UserWarning("UIDs don't match: {0}-{1}".format(line[key], uid))
                        else:
                            rep_list[indx].set_qc('POS', key, int(line[key]))
                    indx += 1
        except Exception as error:
            print("Something went wrong adding basic qc to report_list")
            raise

        # now read in sst qc data
        try:
            with open(sstqc_filename, 'r') as csvfile:
                # get the headers from the CSV file and sort them out:
                # strip trailing carriage return, split by commas, fix duplicates
                headers = csvfile.readline()
                headers = headers[:-1]
                headers = headers.split(',')

                # now read the rest of the CSV file using the headers as a dictionary.
                reader = csv.DictReader(csvfile, fieldnames=headers)
                indx = count
                for line in reader:
                    for key in line:
                        if key == 'UID':
                            uid = rep_list[indx].getvar('UID')
                            if line[key].strip() != uid.strip():
                                raise UserWarning("UIDs don't match: {0}-{1}".format(line[key], uid))
                        else:
                            rep_list[indx].set_qc('SST', key, int(line[key]))
                    indx += 1
        except Exception as error:
            print("Something went wrong adding sst qc to report_list")
            raise

        # advance the positional index past this month's reports
        count += nrep

    if len(rep_list) == 0:
        raise UserWarning('no data for buoy ' + target_id)
    print("")
    print("read in {} reports from the buoy {}".format(len(rep_list), target_id))

    # now perform various tracking qc checks
    # note that any obs filtered out ahead of track qc won't receive qc flags
    # get_qc(trackflag) will return 9 for these obs.

    # ---aground QC---

    # pre-filter obs
    filt = ex.QC_filter()
    filt.add_qc_filter('POS', 'isbuoy', 1)  # should already be applied, but just in case
    filt.add_qc_filter('POS', 'date', 0)
    filt.add_qc_filter('POS', 'time', 0)
    filt.add_qc_filter('POS', 'pos', 0)
    filt.add_qc_filter('POS', 'blklst', 0)  # includes rejection of (lon,lat)=(0,0)
    v_filt = ex.Voyage()
    for rep in rep_list:
        if filt.test_report(rep) == 0:
            v_filt.add_report(rep)
    v_filt.sort()  # sort in time
    print("passing {} to aground check".format(len(v_filt)))
    if oldqc:
        v_filt.buoy_aground_check(parameters['buoy_aground_check'],
                                  False)  # raises AssertionError if check inputs are invalid
    else:
        v_filt.new_buoy_aground_check(parameters['new_buoy_aground_check'],
                                      False)  # raises AssertionError if check inputs are invalid

    # ---picked up QC---

    filt = ex.QC_filter()
    filt.add_qc_filter('POS', 'isbuoy', 1)  # should already be applied, but just in case
    filt.add_qc_filter('POS', 'date', 0)
    filt.add_qc_filter('POS', 'time', 0)
    filt.add_qc_filter('POS', 'pos', 0)
    filt.add_qc_filter('POS', 'blklst', 0)  # includes rejection of (lon,lat)=(0,0)
    if oldqc:
        filt.add_qc_filter('POS', 'iquam_track', 0)  # NOTE only use this for original buoy speed check
    v_filt = ex.Voyage()
    for rep in rep_list:
        if filt.test_report(rep) == 0:
            v_filt.add_report(rep)
    v_filt.sort()  # sort in time
    print("passing {} to picked-up check".format(len(v_filt)))
    if oldqc:
        v_filt.buoy_speed_check(parameters['buoy_speed_check'],
                                False)  # raises AssertionError if check inputs are invalid
    else:
        v_filt.new_buoy_speed_check(parameters['IQUAM_track_check'], parameters['new_buoy_speed_check'],
                                    False)  # raises AssertionError if check inputs are invalid

    # ---sst tail QC---

    filt = ex.QC_filter()
    filt.add_qc_filter('POS', 'isbuoy', 1)  # should already be applied, but just in case
    filt.add_qc_filter('POS', 'date', 0)
    filt.add_qc_filter('POS', 'time', 0)
    filt.add_qc_filter('POS', 'pos', 0)
    filt.add_qc_filter('POS', 'blklst', 0)  # includes rejection of (lon,lat)=(0,0)
    filt.add_qc_filter('SST', 'clim', 0)
    filt.add_qc_filter('SST', 'nonorm', 0)
    filt.add_qc_filter('SST', 'freez', 0)
    filt.add_qc_filter('SST', 'noval', 0)
    filt.add_qc_filter('SST', 'rep', 0)  # flags repeated value obs where these are >70% of record (set in parameters)
    filt.add_qc_filter('SST', 'hardlimit', 0)  # limits are -5.0 and 45.0 for SST set in parameters
    filt.add_qc_filter('POS', 'drf_agr', 0)  # remove obs failing preceding track checks
    filt.add_qc_filter('POS', 'drf_spd', 0)  # remove obs failing preceding track checks
    v_filt = ex.Voyage()
    for rep in rep_list:
        # bbud < 4 keeps obs that passed (or were untested by) the bayesian buddy check
        if filt.test_report(rep) == 0 and rep.get_qc('SST', 'bbud') < 4:
            v_filt.add_report(rep)
    v_filt.sort()  # sort in time
    print("passing {} to tail check".format(len(v_filt)))
    v_filt.buoy_tail_check(parameters['buoy_tail_check'], False)  # raises AssertionError if check inputs are invalid

    # ---sst biased or noisy buoy QC---

    filt = ex.QC_filter()
    filt.add_qc_filter('POS', 'isbuoy', 1)  # should already be applied, but just in case
    filt.add_qc_filter('POS', 'date', 0)
    filt.add_qc_filter('POS', 'time', 0)
    filt.add_qc_filter('POS', 'pos', 0)
    filt.add_qc_filter('POS', 'blklst', 0)  # includes rejection of (lon,lat)=(0,0)
    filt.add_qc_filter('SST', 'clim', 0)
    filt.add_qc_filter('SST', 'nonorm', 0)
    filt.add_qc_filter('SST', 'freez', 0)
    filt.add_qc_filter('SST', 'noval', 0)
    filt.add_qc_filter('SST', 'rep', 0)  # flags repeated value obs where these are >70% of record (set in parameters)
    filt.add_qc_filter('SST', 'hardlimit', 0)  # limits are -5.0 and 45.0 for SST set in parameters
    filt.add_qc_filter('POS', 'drf_agr', 0)  # remove obs failing preceding track checks
    filt.add_qc_filter('POS', 'drf_spd', 0)  # remove obs failing preceding track checks
    filt.add_qc_filter('SST', 'drf_tail1', 0)  # remove obs failing preceding track checks
    filt.add_qc_filter('SST', 'drf_tail2', 0)  # remove obs failing preceding track checks
    v_filt = ex.Voyage()
    for rep in rep_list:
        if filt.test_report(rep) == 0 and rep.get_qc('SST', 'bbud') < 4:
            v_filt.add_report(rep)
    v_filt.sort()  # sort in time
    print("passing {} to biased-noisy check".format(len(v_filt)))
    v_filt.buoy_bias_noise_check(parameters['buoy_bias_noise_check'],
                                 False)  # raises AssertionError if check inputs are invalid

    # return voyage with track QC flags
    voy = ex.Voyage()
    for rep in rep_list:
        voy.add_report(rep)
    voy.sort()  # sort in time

    # write out the QC outcomes for this chunk for this ID args.yr1,args.mn1,args.yr2,args.mn2)
    # stored in directory corresponding to last month in the chunk
    if 'new' in edge or 'regular' in edge:
        extdir = safe_make_tracking_dir(track_out_dir, args.yr2, args.mn2)
        if oldqc:
            voy.write_tracking_output(parameters['runid']+'oldqc', extdir, args.yr2, args.mn2)
        else:
            voy.write_tracking_output(parameters['runid'], extdir, args.yr2, args.mn2)

    if 'start_edge_case' in edge:
        extdir = safe_make_edge_dir(track_out_dir, args.yr2, args.mn2, 'start_edge_case', runmonthid)
        if oldqc:
            voy.write_tracking_output(parameters['runid']+'oldqc', extdir, args.yr2, args.mn2)
        else:
            voy.write_tracking_output(parameters['runid'], extdir, args.yr2, args.mn2)

    if 'end_edge_case' in edge:
        extdir = safe_make_edge_dir(track_out_dir, args.yr2, args.mn2, 'end_edge_case', runmonthid)
        if oldqc:
            voy.write_tracking_output(parameters['runid']+'oldqc', extdir, args.yr2, args.mn2)
        else:
            voy.write_tracking_output(parameters['runid'], extdir, args.yr2, args.mn2)
Exemplo n.º 6
0
def main(argv):
    """
    This program reads in data from ICOADS.3.0.0/ICOADS.3.0.1 and applies quality control processes to it, flagging data
    as good or bad according to a set of different criteria. Optionally it will replace drifting buoy SST data in
    ICOADS.3.0.1 with drifter data taken from the GDBC portal.

    The first step of the process is to read in various SST and MAT climatologies from file. These are 1degree latitude
    by 1 degree longitude by 73 pentad fields in NetCDF format.

    The program then loops over all specified years and months reads in the data needed to QC that month and then
    does the QC. There are three stages in the QC

    basic QC - this proceeds one observation at a time. Checks are relatively simple and detect gross errors

    track check - this works on Voyages consisting of all the observations from a single ship (or at least a single ID)
    and identifies observations which make for an implausible ship track

    buddy check - this works on Decks which are large collections of observations and compares observations to their
    neighbours
    """

    print('########################')
    print('Running make_and_full_qc')
    print('########################')

    parser = argparse.ArgumentParser(
        description='Marine QC system, main program')
    parser.add_argument('-config',
                        type=str,
                        default='configuration.txt',
                        help='name of config file')
    parser.add_argument('-tracking',
                        action='store_true',
                        help='perform tracking QC')
    parser.add_argument('-jobs',
                        type=str,
                        default='jobs.json',
                        help='name of job file')
    parser.add_argument('-job_index', type=int, default=0, help='job index')

    args = parser.parse_args()

    inputfile = args.config
    jobfile = args.jobs
    jobindex = args.job_index - 1  # command-line job indices are 1-based; the jobs list is 0-based
    tracking = args.tracking

    with open(jobfile) as fp:
        jobs = json.load(fp)

    # the job file entry defines the year/month range this run will process
    year1 = jobs['jobs'][jobindex]['year1']
    year2 = jobs['jobs'][jobindex]['year2']
    month1 = jobs['jobs'][jobindex]['month1']
    month2 = jobs['jobs'][jobindex]['month2']

    verbose = True  # need set to read as arg in future

    print("running on ICOADS, this is not a test!")

    print('Input file is {}'.format(inputfile))
    print('Running from {} {} to {} {}'.format(month1, year1, month2, year2))
    print('')

    config = configparser.ConfigParser()
    config.read(inputfile)
    icoads_dir = config.get('Directories', 'ICOADS_dir')
    out_dir = config.get('Directories', 'out_dir')
    bad_id_file = config.get('Files', 'IDs_to_exclude')
    version = config.get('Icoads', 'icoads_version')

    print('ICOADS directory = {}'.format(icoads_dir))
    print('ICOADS version = {}'.format(version))
    print('Output to {}'.format(out_dir))
    print('List of bad IDs = {}'.format(bad_id_file))
    print('Parameter file = {}'.format(config.get('Files', 'parameter_file')))
    print('')

    ids_to_exclude = bf.process_bad_id_file(bad_id_file)

    # read in climatology files
    sst_pentad_stdev = clim.Climatology.from_filename(
        config.get('Climatologies', 'Old_SST_stdev_climatology'), 'sst')

    sst_stdev_1 = clim.Climatology.from_filename(
        config.get('Climatologies', 'SST_buddy_one_box_to_buddy_avg'), 'sst')
    sst_stdev_2 = clim.Climatology.from_filename(
        config.get('Climatologies', 'SST_buddy_one_ob_to_box_avg'), 'sst')
    sst_stdev_3 = clim.Climatology.from_filename(
        config.get('Climatologies', 'SST_buddy_avg_sampling'), 'sst')

    with open(config.get('Files', 'parameter_file'), 'r') as f:
        parameters = json.load(f)

    print("Reading climatologies from parameter file")
    climlib = ex.ClimatologyLibrary()
    for entry in parameters['climatologies']:
        print("{} {}".format(entry[0], entry[1]))
        climlib.add_field(entry[0], entry[1],
                          clim.Climatology.from_filename(entry[2], entry[3]))

    # main loop: QC one target month at a time
    for year, month in qc.year_month_gen(year1, month1, year2, month2):

        print("INFO({}): {} {}".format(
            datetime.now().time().isoformat(timespec='milliseconds'), year,
            month))

        # track and buddy checks need neighbouring data, so read a
        # three-month window centred on the target month
        last_year, last_month = qc.last_month_was(year, month)
        next_year, next_month = qc.next_month_is(year, month)

        reps = ex.Deck()
        count = 0
        lastday = -99  # sentinel so the first report triggers an OSTIA field load

        for readyear, readmonth in qc.year_month_gen(last_year, last_month,
                                                     next_year, next_month):

            print("INFO({}): {} {}".format(
                datetime.now().time().isoformat(timespec='milliseconds'),
                readyear, readmonth))

            # seasonal OSTIA background-variance field, only needed for tracking QC
            ostia_bg_var = None
            if tracking:
                ostia_bg_var = clim.Climatology.from_filename(
                    config.get('Climatologies',
                               qc.season(readmonth) + '_ostia_background'),
                    'bg_var')

            filename = icoads_dir + '{:4d}-{:02d}.psv'.format(
                readyear, readmonth)

            imma_obj = pd.read_csv(filename,
                                   sep='|',
                                   header=None,
                                   names=[
                                       'YR', 'MO', 'DY', 'HR', 'LAT', 'LON',
                                       'DS', 'VS', 'ID', 'AT', 'SST', 'DPT',
                                       'DCK', 'SLP', 'SID', 'PT', 'UID', 'W',
                                       'D', 'IRF', 'bad_data', 'outfile'
                                   ],
                                   low_memory=False)

            # replace ' ' in ID field with '' (corrections introduce bug)
            imma_obj['ID'].replace(' ', '', inplace=True)
            imma_obj = imma_obj.sort_values(['YR', 'MO', 'DY', 'HR', 'ID'],
                                            axis=0,
                                            ascending=True)
            imma_obj = imma_obj.reset_index(drop=True)

            data_index = imma_obj.index

            # build an IMMA record from each dataframe row in turn
            rec = IMMA()
            print('INFO({}): Data read, applying first QC'.format(
                datetime.now().time().isoformat(timespec='milliseconds')))
            dyb_count = 0
            for idx in data_index:
                # set missing values to None
                for k, v in imma_obj.loc[idx, ].to_dict().items():
                    rec.data[k] = to_none(v)
                readob = True  # NOTE(review): always True here; appears vestigial — confirm
                if (not (rec.data['ID'] in ids_to_exclude) and readob
                        and rec.data['YR'] == readyear
                        and rec.data['MO'] == readmonth
                        and rec.data['DY'] is not None):

                    rep = ex.MarineReportQC(rec)
                    del rec

                    rep.setvar('AT2', rep.getvar('AT'))

                    # if day has changed then read in OSTIA field if available and append SST and sea-ice fraction
                    # to the observation metadata
                    if tracking and readyear >= 1985 and rep.getvar(
                            'DY') is not None:
                        if rep.getvar('DY') != lastday:
                            lastday = rep.getvar('DY')
                            y_year, y_month, y_day = qc.yesterday(
                                readyear, readmonth, lastday)

                            #                            ofname = ostia_filename(ostia_dir, y_year, y_month, y_day)
                            ofname = bf.get_background_filename(
                                parameters['background_dir'],
                                parameters['background_filenames'], y_year,
                                y_month, y_day)

                            climlib.add_field(
                                'OSTIA', 'background',
                                clim.Climatology.from_filename(
                                    ofname, 'analysed_sst'))
                            climlib.add_field(
                                'OSTIA', 'ice',
                                clim.Climatology.from_filename(
                                    ofname, 'sea_ice_fraction'))

                        rep_clim = climlib.get_field(
                            'OSTIA', 'background').get_value_ostia(
                                rep.lat(), rep.lon())
                        # OSTIA background is in Kelvin; convert to degC
                        if rep_clim is not None:
                            rep_clim -= 273.15

                        rep.setext('OSTIA', rep_clim)
                        rep.setext(
                            'ICE',
                            climlib.get_field('OSTIA', 'ice').get_value_ostia(
                                rep.lat(), rep.lon()))
                        rep.setext(
                            'BGVAR',
                            ostia_bg_var.get_value_mds_style(
                                rep.lat(), rep.lon(), rep.getvar('MO'),
                                rep.getvar('DY')))

                    # attach climatological means (and stdevs where available)
                    # needed by the climatology-based QC checks
                    for varname in ['SST', 'AT']:
                        rep_clim = climlib.get_field(
                            varname, 'mean').get_value_mds_style(
                                rep.lat(), rep.lon(), rep.getvar('MO'),
                                rep.getvar('DY'))
                        rep.add_climate_variable(varname, rep_clim)

                    for varname in ['SLP2', 'SHU', 'CRH', 'CWB', 'DPD']:
                        rep_clim = climlib.get_field(varname,
                                                     'mean').get_value(
                                                         rep.lat(), rep.lon(),
                                                         rep.getvar('MO'),
                                                         rep.getvar('DY'))
                        rep.add_climate_variable(varname, rep_clim)

                    for varname in ['DPT', 'AT2', 'SLP']:
                        rep_clim = climlib.get_field(varname,
                                                     'mean').get_value(
                                                         rep.lat(), rep.lon(),
                                                         rep.getvar('MO'),
                                                         rep.getvar('DY'))
                        rep_stdev = climlib.get_field(varname,
                                                      'stdev').get_value(
                                                          rep.lat(), rep.lon(),
                                                          rep.getvar('MO'),
                                                          rep.getvar('DY'))
                        rep.add_climate_variable(varname, rep_clim, rep_stdev)

                    rep.calculate_humidity_variables(
                        ['SHU', 'VAP', 'CRH', 'CWB', 'DPD'])

                    rep.perform_base_qc(parameters)
                    # record whether this ob falls in the target month; used
                    # later to restrict tracking output to the target month
                    rep.set_qc(
                        'POS', 'month_match',
                        qc.month_match(year, month, rep.getvar('YR'),
                                       rep.getvar('MO')))

                    reps.append(rep)
                    count += 1

                rec = IMMA()
                dyb_count += 1
                if dyb_count % 1000 == 0:
                    print('INFO({}): {} out of {} processed'.format(
                        datetime.now().time().isoformat(
                            timespec='milliseconds'), dyb_count,
                        imma_obj.index.size))

                # icoads_file.close()

    print("INFO({}): Read {} ICOADS records".format(
        datetime.now().time().isoformat(timespec='milliseconds'), count))

    # filter the obs into passes and fails of basic positional QC
    filt = ex.QC_filter()
    filt.add_qc_filter('POS', 'date', 0)
    filt.add_qc_filter('POS', 'time', 0)
    filt.add_qc_filter('POS', 'pos', 0)
    filt.add_qc_filter('POS', 'blklst', 0)

    reps.add_filter(filt)

    if verbose:
        print('INFO ({}) .... Track checking individual ships'.format(
            datetime.now().time().isoformat(timespec='milliseconds')))

        # track check the passes one ship at a time
    count_ships = 0
    for one_ship in reps.get_one_platform_at_a_time():
        one_ship.track_check(parameters['track_check'])
        one_ship.iquam_track_check(parameters['IQUAM_track_check'])
        one_ship.spike_check(parameters['IQUAM_spike_check'])
        one_ship.find_saturated_runs(parameters['saturated_runs'])
        one_ship.find_multiple_rounded_values(
            parameters['multiple_rounded_values'])

        for varname in ['SST', 'AT', 'AT2', 'DPT', 'SLP']:
            one_ship.find_repeated_values(parameters['find_repeated_values'],
                                          intype=varname)

        count_ships += 1

    print("Track checked {} ships".format(count_ships))

    if verbose:
        print('INFO ({}) .... Applying buddy checks'.format(
            datetime.now().time().isoformat(timespec='milliseconds')))
    if verbose:
        print('INFO ({}) ........ SST'.format(
            datetime.now().time().isoformat(timespec='milliseconds')))
        # SST buddy check
    filt = ex.QC_filter()
    filt.add_qc_filter('POS', 'is780', 0)
    filt.add_qc_filter('POS', 'date', 0)
    filt.add_qc_filter('POS', 'time', 0)
    filt.add_qc_filter('POS', 'pos', 0)
    filt.add_qc_filter('POS', 'blklst', 0)
    filt.add_qc_filter('POS', 'trk', 0)
    filt.add_qc_filter('SST', 'noval', 0)
    filt.add_qc_filter('SST', 'freez', 0)
    filt.add_qc_filter('SST', 'clim', 0)
    filt.add_qc_filter('SST', 'nonorm', 0)

    reps.add_filter(filt)

    reps.bayesian_buddy_check('SST', sst_stdev_1, sst_stdev_2, sst_stdev_3,
                              parameters)
    reps.mds_buddy_check('SST', sst_pentad_stdev,
                         parameters['mds_buddy_check'])

    if verbose:
        print('INFO ({}) ........ NMAT'.format(
            datetime.now().time().isoformat(timespec='milliseconds')))
        # NMAT buddy check
    filt = ex.QC_filter()
    filt.add_qc_filter('POS', 'isship', 1)  # only do ships mat_blacklist
    filt.add_qc_filter('AT', 'mat_blacklist', 0)
    filt.add_qc_filter('POS', 'date', 0)
    filt.add_qc_filter('POS', 'time', 0)
    filt.add_qc_filter('POS', 'pos', 0)
    filt.add_qc_filter('POS', 'blklst', 0)
    filt.add_qc_filter('POS', 'trk', 0)
    filt.add_qc_filter('POS', 'day', 0)
    filt.add_qc_filter('AT', 'noval', 0)
    filt.add_qc_filter('AT', 'clim', 0)
    filt.add_qc_filter('AT', 'nonorm', 0)

    reps.add_filter(filt)

    reps.bayesian_buddy_check('AT', sst_stdev_1, sst_stdev_2, sst_stdev_3,
                              parameters)
    reps.mds_buddy_check('AT', sst_pentad_stdev, parameters['mds_buddy_check'])

    # DPT buddy check #NB no day check for this one
    filt = ex.QC_filter()
    filt.add_qc_filter('DPT', 'hum_blacklist', 0)
    filt.add_qc_filter('POS', 'date', 0)
    filt.add_qc_filter('POS', 'time', 0)
    filt.add_qc_filter('POS', 'pos', 0)
    filt.add_qc_filter('POS', 'blklst', 0)
    filt.add_qc_filter('POS', 'trk', 0)
    filt.add_qc_filter('DPT', 'noval', 0)
    filt.add_qc_filter('DPT', 'clim', 0)
    filt.add_qc_filter('DPT', 'nonorm', 0)

    reps.add_filter(filt)

    reps.mds_buddy_check('DPT', climlib.get_field('DPT', 'stdev'),
                         parameters['mds_buddy_check'])

    if verbose:
        print('INFO ({}) ........ SLP'.format(
            datetime.now().time().isoformat(timespec='milliseconds')))
        # SLP buddy check
    filt = ex.QC_filter()
    filt.add_qc_filter('POS', 'date', 0)
    filt.add_qc_filter('POS', 'time', 0)
    filt.add_qc_filter('POS', 'pos', 0)
    filt.add_qc_filter('POS', 'blklst', 0)
    filt.add_qc_filter('POS', 'trk', 0)
    filt.add_qc_filter('SLP', 'noval', 0)
    filt.add_qc_filter('SLP', 'clim', 0)
    filt.add_qc_filter('SLP', 'nonorm', 0)

    reps.add_filter(filt)

    reps.mds_buddy_check('SLP', climlib.get_field('SLP', 'stdev'),
                         parameters['slp_buddy_check'])

    # NOTE(review): year/month here are the loop variables from the main loop
    # above, i.e. the last month processed — output is written once per run
    extdir = bf.safe_make_dir(out_dir, year, month)
    reps.write_output(parameters['runid'], extdir, year, month)

    if tracking:

        if verbose:
            print('INFO ({}) .... Tracking'.format(
                datetime.now().time().isoformat(timespec='milliseconds')))

            # set QC for output by ID - buoys only and passes base SST QC
        filt = ex.QC_filter()
        filt.add_qc_filter('POS', 'month_match', 1)
        filt.add_qc_filter('POS', 'isdrifter', 1)

        reps.add_filter(filt)

        # per-platform output plus an index file mapping raw IDs to the
        # filesystem-safe names used for the per-ID files
        idfile = open(extdir + '/ID_file.txt', 'w')
        for one_ship in reps.get_one_platform_at_a_time():

            if len(one_ship) > 0:
                thisid = one_ship.getrep(0).getvar('ID')
                if thisid is not None:
                    idfile.write(thisid + ',' + ex.safe_filename(thisid) +
                                 '\n')
                    one_ship.write_output(parameters['runid'], extdir, year,
                                          month)
        idfile.close()

    del reps
Exemplo n.º 7
0
def main(argv):
    '''
    This program reads in data from ICOADS.2.5.1 and applies quality control processes to it, flagging data as 
    good or bad according to a set of different criteria.

    The first step of the process is to read in various SST and MAT climatologies from file. These are 1degree latitude 
    by 1 degree longitude by 73 pentad fields in NetCDF format.
    
    The program then loops over all specified years and months reads in the data needed to QC that month and then 
    does the QC. There are three stages in the QC
    
    basic QC - this proceeds one observation at a time. Checks are relatively simple and detect gross errors
    
    track check - this works on Voyages consisting of all the observations from a single ship (or at least a single ID) 
    and identifies observations which make for an implausible ship track
    
    buddy check - this works on Decks which are large collections of observations and compares observations to their neighbours
    '''
    
    print '########################'
    print 'Running make_and_full_qc'
    print '########################'

    # Command line: config file plus an inclusive year/month range to process.
    parser = argparse.ArgumentParser(description='Marine QC system, main program')
    parser.add_argument('-config', type=str, default='configuration.txt', help='name of config file')
    parser.add_argument('-year1', type=int, default=1850, help='First year for processing')
    parser.add_argument('-year2', type=int, default=1850, help='Final year for processing')
    parser.add_argument('-month1', type=int, default=1, help='First month for processing')
    parser.add_argument('-month2', type=int, default=1, help='Final month for processing')
    parser.add_argument('-test', action='store_true', help='run test suite')
    args = parser.parse_args() 

    inputfile = args.config
    year1 = args.year1
    year2 = args.year2
    month1 = args.month1
    month2 = args.month2
    Test = args.test  # NOTE(review): parsed but never used in this function

    print 'Input file is ', inputfile
    print 'Running from ', month1, year1, ' to ', month2, year2
    print ''

    config = ConfigParser.ConfigParser()    
    config.read(inputfile)

    # NOTE(review): hard-coded climatology path; it is not read from the config
    # file like the other inputs below -- confirm this is intentional
    sst_climatology_file  = '/project/mds/HADISST2/OIv2_clim_MDS_6190_0.25x0.25xdaily_365.nc'

    icoads_dir = config.get('Directories', 'ICOADS_dir')
    out_dir = config.get('Directories', 'out_dir')
    bad_id_file = config.get('Files', 'IDs_to_exclude')
    version = config.get('Icoads', 'icoads_version')

    print 'ICOADS directory =', icoads_dir
    print 'ICOADS version =', version
    print 'List of bad IDs =', bad_id_file 
    print ''

    ids_to_exclude = process_bad_id_file(bad_id_file)

#read in climatology files
    sst_pentad_stdev = clim.Climatology.from_filename(config.get('Climatologies', 'Old_SST_stdev_climatology'), 'sst')

    # three stdev fields needed by the bayesian buddy check below
    sst_stdev_1 = clim.Climatology.from_filename(config.get('Climatologies', 'SST_buddy_one_box_to_buddy_avg'), 'sst')
    sst_stdev_2 = clim.Climatology.from_filename(config.get('Climatologies', 'SST_buddy_one_ob_to_box_avg'), 'sst')
    sst_stdev_3 = clim.Climatology.from_filename(config.get('Climatologies', 'SST_buddy_avg_sampling'), 'sst')

    with open(config.get('Files','parameter_file'), 'r') as f:
        parameters = json.load(f)

    climlib = ex.ClimatologyLibrary()
    climlib.add_field('SST', 'mean', clim.Climatology.from_filename(sst_climatology_file, 'temperature'))

    for year, month in qc.year_month_gen(year1, month1, year2, month2):

        print year, month

        # QC of one month also needs the obs from the month either side
        last_year, last_month = qc.last_month_was(year, month)
        next_year, next_month = qc.next_month_is(year, month)

        reps = ex.Deck()
        count = 0
        count2 = 0  # NOTE(review): never incremented; appears unused

        for readyear, readmonth in qc.year_month_gen(last_year, 
                                                     last_month, 
                                                     next_year, 
                                                     next_month):

            print readyear, readmonth
            syr = str(readyear)
            smn = "%02d" % (readmonth)

            filename = icoads_filename(icoads_dir, readyear, 
                                       readmonth, version)

            try:
                icoads_file = gzip.open(filename, "r")
            except IOError:
                # a missing input month is not fatal; skip it and carry on
                print "no ICOADS file ",filename," for ", readyear, readmonth
                continue

            rec = IMMA()

            for line in icoads_file:

                # some IMMA records contain control characters; trap parse
                # failures so a single bad record does not abort the month
                try:
                    rec.readstr(line)
                    readob = True
                except:
                    readob = False
                    print "Rejected ob", line
                    
#if this is not on the exclusion list, readable and not a buoy in the NRT runs
                # NOTE(review): rec.data['ID'] is evaluated before readob, so a
                # partially-parsed record lacking an 'ID' key could raise here
                # -- confirm IMMA() always initialises 'ID'
                if (not(rec.data['ID'] in ids_to_exclude) and 
                    readob and
                    rec.data['YR'] == readyear and
                    rec.data['MO'] == readmonth):

                    rep = ex.MarineReportQC(rec)
                    del rec
                    # attach the climatological SST for this ob's location/date
                    # then run the basic per-observation SST QC checks
                    rep_clim = climlib.get_field('SST', 'mean').get_value(rep.lat(), rep.lon(), rep.getvar('MO'), rep.getvar('DY')) 
                    rep.add_climate_variable('SST', rep_clim)
                    rep.perform_base_sst_qc(parameters)
                    reps.append(rep)
                    count += 1
                rec = IMMA()  # fresh record object for the next input line
            icoads_file.close()

        print "Read ", count, " ICOADS records"

#filter the obs into passes and fails of basic positional QC        
        filt = ex.QC_filter()
        filt.add_qc_filter('POS', 'date',   0)
        filt.add_qc_filter('POS', 'time',   0)
        filt.add_qc_filter('POS', 'pos',    0)
        filt.add_qc_filter('POS', 'blklst', 0)
         
        reps.add_filter(filt)

#track check the passes one ship at a time
        count_ships = 0
        for one_ship in reps.get_one_platform_at_a_time():

            one_ship.track_check(parameters['track_check'])
            one_ship.find_repeated_values(parameters['find_repeated_values'], intype='SST')
            count_ships += 1

        print "Track checked ", count_ships, " ships"

#SST buddy check
        # buddy checks only consider obs that passed all positional/track QC
        # and have a valid, in-range SST with a climatological normal
        filt = ex.QC_filter()
        filt.add_qc_filter('POS', 'is780',  0)
        filt.add_qc_filter('POS', 'date',   0)
        filt.add_qc_filter('POS', 'time',   0)
        filt.add_qc_filter('POS', 'pos',    0)
        filt.add_qc_filter('POS', 'blklst', 0)
        filt.add_qc_filter('POS', 'trk',    0)
        filt.add_qc_filter('SST', 'noval',  0)
        filt.add_qc_filter('SST', 'freez',  0)
        filt.add_qc_filter('SST', 'clim',   0)
        filt.add_qc_filter('SST', 'nonorm', 0)

        reps.add_filter(filt)

        reps.bayesian_buddy_check('SST', sst_stdev_1, sst_stdev_2, sst_stdev_3, parameters)
        reps.mds_buddy_check('SST', sst_pentad_stdev, parameters['mds_buddy_check'])

        # QC flags written out per variable for the candidate month only
        varnames_to_print = {'SST':['bud', 'clim', 'nonorm', 'freez', 'noval', 'nbud', 'bbud', 'rep', 'spike', 'hardlimit']}
        
        reps.write_qc('hires_'+parameters['runid'], out_dir, year, month, varnames_to_print)

        del reps  # free the three months of observations before the next loop
# --- Exemplo n.º 8 (scraped example separator; score: 0) ---
def main(argv):
    '''
    This is the program that runs the base QC on data in the data base (created by Make_DB.py. The checks are the simpler 
    checks, which can be performed on an observation-by-observation basis.
    '''
    
    print '###############'
    print 'Running base_qc'
    print '###############'
    
    inputfile = 'configuration.txt'
    month1 = 1
    month2 = 12

    try:
        opts, args = getopt.getopt(argv,"hi:",
                                   ["ifile=",
                                    "year1=",
                                    "year2=",
                                    "month1=",
                                    "month2="])
    except getopt.GetoptError:
        print 'Usage Make_DB.py -i <configuration_file> '+\
        '--year1 <start year> --year2 <end year>'+\
        '--month1 <start month> --month2 <end month>'
        sys.exit(2)

    inputfile, year1, year2, month1, month2 = qc.get_arguments(opts)

    print 'Input file is ', inputfile
    print 'Running from ',year1,' to ',year2
    print 'Running from ',month1,' to ',month2
    print ''

    config = qc.get_config(inputfile)

    data_base_host        = config['data_base_host']
    data_base_name        = config['data_base_name'] 

    print 'Data base host =', data_base_host
    print 'Data base name =', data_base_name
    print ''

#connect to data base	
    connection = MySQLdb.connect(host=data_base_host, 
                                 user='******',
                                 db=data_base_name)

    for years,months in qc.year_month_gen(year1, month1, year2, month2):

        print '\nRunning Base QC for',years,months

        cursor = connection.cursor()
        cursor2 = connection.cursor()

        syr = str(years)
        
        '''set up a QC filter and use it to extract obs from the database direct into MarineReport format'''
        filter = db.Quality_Control_Filter()
        filter.year = years
        filter.month = months

        t0 = time.time()
        reps = db.get_marine_report_from_db(cursor,years,filter)
        t1 = time.time()
        total_time = t1-t0
        print "read",total_time

        '''For each report, do all the basic QC checks then update the QC flags in the data base'''
        for rep in reps:

            rep.bad_position = qc.position_check(rep.lat, rep.lon)
            rep.bad_date = qc.date_check(rep.year, rep.month, rep.day, rep.hour)
            if rep.bad_position == 0 and rep.bad_date == 0:
                rep.day_check = qc.day_test(rep.year,rep.month,rep.day,rep.hour,rep.lat,rep.lon)
            else:
                rep.day_check = 1

            rep.no_sst = qc.value_check(rep.sst)
            rep.sst_below_freezing = qc.sst_freeze_check(rep.sst, 0.0)
            rep.sst_climatology_fail = qc.climatology_check(rep.sst,rep.sst_norm,8.0)
            rep.no_sst_normal = qc.no_normal_check(rep.sst_norm)
            
            rep.no_mat = qc.value_check(rep.mat)
            rep.mat_climatology_fail = qc.climatology_check(rep.mat,rep.mat_norm,10.0)
            rep.no_mat_normal = qc.no_normal_check(rep.mat_norm)
            
            rep.blacklist = qc.blacklist(rep.id, rep.dck, rep.year, rep.lat, rep.lon)
            
        t15 = time.time()
        print "qcd",t15-t1
        for rep in reps:
            result = db.update_db_basic_qc_flags(rep,years,cursor2)
            
        t2 = time.time()
        print "added to db",t2-t15
        '''Commit the changes then print a summary'''
        connection.commit()
        #db.report_qc_counts(cursor,years,months)
        t3 = time.time()
        print "commited",t3-t2

    connection.close()

    print "All Done :)"
def main(argv):
    '''
    For input year range, extract and print obs from the database.

    Joins the per-year marinereports, base_qc, sst_qc, mat_qc and extra_qc tables
    on uid, one month at a time, and writes one formatted report per line to a
    hard-coded output file per month.
    '''
    inputfile = 'configuration.txt'
    
    try:
        opts, args = getopt.getopt(argv, 
                                   "hi:", 
                                   ["ifile=", 
                                    "year1=", 
                                    "year2="])
    except getopt.GetoptError:
        print 'Usage Make_DB.py -i <configuration_file>'+\
        ' --year1 <start year> --year2 <end year>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'Usage Make_DB.py -i <configuration_file> '+\
            '--year1 <start year> --year2 <end year>'
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        # NOTE(review): "-x"/"-y" are not in the short-option string ("hi:"),
        # so only the long forms --year1/--year2 can ever reach these branches
        elif opt in ("-x", "--year1"):
            try:
                year1 = int(arg)
            except:
                sys.exit("Failed: year1 not an integer")
        elif opt in ("-y", "--year2"):
            try:
                year2 = int(arg)
            except:
                sys.exit("Failed: year2 not an integer")

    # NOTE(review): year1/year2 are unbound here (NameError) if --year1/--year2
    # were not supplied on the command line
    print 'Input file is ', inputfile
    print 'Running from ', year1, ' to ', year2
    print ''

    config = qc.get_config(inputfile)

    data_base_host        = config['data_base_host']
    data_base_name        = config['data_base_name'] 

    print 'Data base host =', data_base_host
    print 'Data base name =', data_base_name

    #connect to data base    
    connection = MySQLdb.connect(host=data_base_host, 
                                 user='******',
                                 db=data_base_name)
    #need two cursors, one for reading and one for making QC changes
    cursor = connection.cursor()

    # all twelve months of every year in the range
    for years, months in qc.year_month_gen(year1, 1, year2, 12):
        
        print years, months
        
        syr = str(years)
        smn = "%02d" % (months,)

        print syr+smn

        # NOTE(review): hard-coded, user-specific output path
        outfile = open('/data/local/hadjj/ICOADS.2.5.1/blobs_'+syr+smn+'.txt','w')
        
        # join the per-year report and QC tables on uid; column order here
        # must match the positional indexing of `rows` below
        sql_request = 'SELECT \
        marinereports'+syr+'.id, \
        marinereports'+syr+'.lat, \
        marinereports'+syr+'.lon, \
        marinereports'+syr+'.sst, \
        marinereports'+syr+'.mat, \
        marinereports'+syr+'.year, \
        marinereports'+syr+'.month, \
        marinereports'+syr+'.day, \
        marinereports'+syr+'.hour, \
        marinereports'+syr+'.icoads_ds, \
        marinereports'+syr+'.icoads_vs, \
        marinereports'+syr+'.uid, \
        base_qc'+syr+'.bad_position , \
        base_qc'+syr+'.bad_date , \
        base_qc'+syr+'.bad_track , \
        sst_qc'+syr+'.no_sst , \
        sst_qc'+syr+'.sst_below_freezing , \
        sst_qc'+syr+'.sst_climatology_fail , \
        sst_qc'+syr+'.no_sst_normal , \
        sst_qc'+syr+'.sst_buddy_fail, \
        mat_qc'+syr+'.no_mat , \
        mat_qc'+syr+'.mat_climatology_fail, \
        mat_qc'+syr+'.no_mat_normal , \
        mat_qc'+syr+'.mat_buddy_fail,  \
        marinereports'+syr+'.dck, \
        marinereports'+syr+'.sid, \
        base_qc'+syr+'.day_check, \
        base_qc'+syr+'.blacklist, \
        base_qc'+syr+'.fewsome_check, \
        extra_qc'+syr+'.new_track_check, \
        extra_qc'+syr+'.bayesian_sst_buddy_check \
        FROM marinereports'+syr+' \
        INNER JOIN base_qc'+syr+' ON \
        marinereports'+syr+'.uid = base_qc'+syr+'.uid \
        INNER JOIN sst_qc'+syr+ ' ON \
        marinereports'+syr+'.uid = sst_qc'+syr+'.uid \
        INNER JOIN mat_qc'+syr+ ' ON \
        marinereports'+syr+'.uid = mat_qc'+syr+'.uid \
        INNER JOIN extra_qc'+syr+ ' ON \
        marinereports'+syr+'.uid = extra_qc'+syr+'.uid \
        WHERE marinereports'+syr+'.month = '+str(months)

        reps = []
        cursor.execute(sql_request)
        numrows = cursor.rowcount
        
        # rebuild a MarineReport from each row: columns 0-11 are the report
        # fields, 12+ are the QC flags set as attributes afterwards
        for i in range(numrows):
            rows = cursor.fetchone()
            rep = qc.MarineReport(rows[0], rows[1], rows[2], 
                                  rows[3], rows[4], rows[5],
                                  rows[6], rows[7], rows[8], 
                                  rows[9], rows[10], rows[11])

            rep.bad_position = rows[12]
            rep.bad_time = rows[13]
            rep.bad_track = rows[14]
            
            rep.no_sst = rows[15]
            rep.sst_below_freezing = rows[16]
            rep.sst_climatology_fail = rows[17]
            rep.no_sst_normal = rows[18]
            rep.sst_buddy_fail = rows[19]
            
            rep.no_mat = rows[20]
            rep.mat_climatology_fail = rows[21]
            rep.no_mat_normal = rows[22]
            rep.mat_buddy_fail = rows[23]
            
            rep.dck = rows[24]
            rep.sid = rows[25]
            
            rep.day_check = rows[26]
            rep.blacklist = rows[27]
            
            rep.fewsome_check = rows[28]
            rep.new_track_check = rows[29]
            rep.bayesian_sst_buddy_check = rows[30]
            
            reps.append(rep)

        # ordering relies on MarineReport's comparison methods -- presumably
        # chronological; verify against the MarineReport class
        reps.sort()
        for rep in reps:
            outfile.write(rep.print_report())
        
        outfile.close()
        
        print "out ", years, months
        
    connection.close()
def main(argv):
    '''
    The buddy check compares observations to other nearby observations. If the observation differs 
    substantially from the neighbour-average, the observation will be rejected.

    Runs the MDS buddy check for SST and MAT for every month in the input year
    range, reading candidate obs (plus roughly a month either side) from the
    data base and writing the resulting buddy-fail flags back.
    '''

    print '###################'
    print 'Running buddy_check'
    print '###################'
    
    inputfile = 'configuration.txt'

    try:
        opts, args = getopt.getopt(argv, "hi:", 
                                   ["ifile=", 
                                    "year1=", 
                                    "year2="])
    except getopt.GetoptError:
        print 'Usage Make_DB.py -i <configuration_file> '+\
        '--year1 <start year> --year2 <end year>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'test.py -i <inputfile> -o <outputfile>'
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        # NOTE(review): "-x"/"-y" are not in the short-option string ("hi:"),
        # so only the long forms --year1/--year2 can ever reach these branches
        elif opt in ("-x", "--year1"):
            try:
                year1 = int(arg)
            except:
                sys.exit("Failed: year1 not an integer")
        elif opt in ("-y", "--year2"):
            try:
                year2 = int(arg)
            except:
                sys.exit("Failed: year2 not an integer")

    print 'Input file is ', inputfile
    print 'Running from ', year1, ' to ', year2
    print ''

    config = qc.get_config(inputfile)

    sst_climatology_file  = config['SST_climatology'] 
    nmat_climatology_file = config['MAT_climatology'] 
    icoads_dir            = config['ICOADS_dir'] 
    sst_stdev_climatology_file  = config['Old_SST_stdev_climatology']
    data_base_host        = config['data_base_host']
    data_base_name        = config['data_base_name'] 

    print 'Data base host =', data_base_host
    print 'Data base name =', data_base_name
    print 'SST climatology =', sst_climatology_file
    print 'NMAT climatology =', nmat_climatology_file
    print 'ICOADS directory =', icoads_dir
    print ''

#read in the pentad climatology of standard deviations
    climatology = Dataset(sst_stdev_climatology_file)
    sst_pentad_stdev = climatology.variables['sst'][:]

    connection = MySQLdb.connect(host=data_base_host, 
                                 user='******',
                                 db=data_base_name)
    cursor  = connection.cursor() #read
    cursor2 = connection.cursor() #write
    
    # all twelve months of every year in the range
    for years, months in qc.year_month_gen(year1, 1, year2, 12):

#want to get a month either side of the 
#target month, which may be in different years
        last_year, last_month = qc.last_month_was(years, months)
        next_year, next_month = qc.next_month_is(years, months)
        
        print years, months
        
        first_year = min([last_year, years, next_year])
        final_year = max([last_year, years, next_year])
        
        # clamp to the period covered by the data base tables
        if first_year < 1850:
            first_year = 1850
        if final_year > 2014:
            final_year = 2014

#first and last julian days are +- approximately one month
        month_lengths = qc.month_lengths(years)
        jul1 = qc.jul_day(years, months, 1)-25
        jul2 = qc.jul_day(years, months, month_lengths[months-1])+25
        
        for check_variable in ['SST','MAT']:
        
            # gather obs from every year overlapping the julian-day window,
            # keeping only those that passed the relevant base QC
            reps = []
            for yyy in range(first_year, final_year+1):
                
                qcfilter = db.Quality_Control_Filter()
                qcfilter.jul1 = jul1
                qcfilter.jul2 = jul2
                qcfilter.set_multiple_qc_flags_to_pass(['bad_position',
                                                        'bad_date',
                                                        'blacklist'])
                
                if check_variable == 'SST':
                    qcfilter.set_multiple_qc_flags_to_pass(['no_sst',
                                                            'sst_below_freezing',
                                                            'no_sst_normal',
                                                            'sst_climatology_fail'])
                elif check_variable == 'MAT':
                    qcfilter.set_multiple_qc_flags_to_pass(['no_mat',
                                                            'no_mat_normal',
                                                            'mat_climatology_fail'])
                else:
                    print "no such type ", check_variable
                    assert False

                sql_request = db.build_sql_query(yyy, qcfilter)
                
                cursor.execute(sql_request)
                numrows = cursor.rowcount

                for i in range(numrows):
                    rows = cursor.fetchone()
                    rep = qc.MarineReport.report_from_array(rows)
                    reps.append(rep)

            print len(reps)," observations read in"

#Do the buddy check
            # NOTE(review): the SST stdev field is also used for the MAT buddy
            # check -- confirm no MAT-specific stdev climatology is intended
            if check_variable == 'SST':
                qcs = qc_buddy_check.mds_buddy_check(reps, 
                                                     sst_pentad_stdev, 
                                                     'SST')
            elif check_variable == 'MAT':
                qcs = qc_buddy_check.mds_buddy_check(reps, 
                                                     sst_pentad_stdev, 
                                                     'MAT')
            else:
                print "no such type ", check_variable
                assert False

#put updated QC flags into data base
            # only the candidate month's flags are written; the obs from the
            # neighbouring months were read purely to serve as buddies
            for rep in reps:
                if rep.month == months:
                    if check_variable == 'SST':
                        result = db.update_db_qc_single_flag(rep,
                                                             rep.sst_buddy_fail,
                                                             'sst_qc',
                                                             'sst_buddy_fail',
                                                             years,
                                                             cursor2)
                    elif check_variable == 'MAT':
                        result = db.update_db_qc_single_flag(rep,
                                                             rep.mat_buddy_fail,
                                                             'mat_qc',
                                                             'mat_buddy_fail',
                                                             years,
                                                             cursor2)
                    else:
                        print "no such type ", check_variable
                        assert False

            print "Of "+str(len(qcs))+" observations "+\
            str(np.sum(qcs))+" failed "+check_variable+\
            " buddy check"

        connection.commit() #Each month
        #db.report_qc_counts(cursor, years, months)

    connection.close()

    
    print "All Done :)"
def main(argv):
    '''
    This program builds the marine data base which will be used to store the subset of ICOADS used in QC and 
    other data processing. The current version reads in IMMA1 data from ICOADS.2.5.1 and the UID is used as the 
    primary key for the data base so that it can be easily matched to individual obs if need be.
    
    #KW added para
    The database is now just a set of ascii files for each year/month. Later it may be the SQL database.

    The first step of the process is to read in the SST and MAT climatologies from file. These are 1degree latitude 
    by 1 degree longitude by 73 pentad fields in NetCDF format. The data are read into numpy arrays.

    Next a connection is made to the data base, which may or may not already exist. If it does not exist, a database 
    will be created.
    
    The program then loops over all years and months and DROPs existing tables for each year if they already exist and 
    then recreates them. It then loops over all months in the year, opens the appropriate IMMA file and reads in 
    the data one observation at a time.
    '''
    
    print '########################'
    print 'Running make_and_full_qc'
    print '########################'
    
    inputfile = 'configuration.txt'
    month1 = 1
    month2 = 1
    year1 = 1880
    year2 = 1880
# KW Querying second instance of inputfile - I have commented this out for now    
#    inputfile = 'configuration_local.txt'
    
    try:
        opts, args = getopt.getopt(argv, "hi:", 
                                   ["ifile=", 
                                    "year1=", 
                                    "year2=",
                                    "month1=",
                                    "month2="])
    except getopt.GetoptError:
# KW changed Make_DB.py to make_and_full_qc.py
        print 'Usage make_and_full_qc.py -i <configuration_file> '+\
        '--year1 <start year> --year2 <end year> '+\
        '--month1 <start month> --month2 <end month>'
        sys.exit(2)

    inputfile, year1, year2, month1, month2 = qc.get_arguments(opts)

    print 'Input file is ', inputfile
    print 'Running from ', year1, ' to ', year2
    print ''

    config = qc.get_config(inputfile)

# KW Added a 'switch' to tell the code whether to run in HadISDH only (HadISDHSwitch == True) mode or 
# full mode (HadISDHSwitch == False)
    HadISDHSwitch = config['HadISDHSwitch']

    sst_climatology_file  = config['SST_climatology'] 
    nmat_climatology_file = config['MAT_climatology'] 
# KW Added climatology files for the humidity variables 
    at_climatology_file  = config['AT_climatology']
    dpt_climatology_file  = config['DPT_climatology']
    shu_climatology_file  = config['SHU_climatology']
    vap_climatology_file  = config['VAP_climatology']
    crh_climatology_file  = config['CRH_climatology']
    cwb_climatology_file  = config['CWB_climatology']
    dpd_climatology_file  = config['DPD_climatology']
# KW Added climatology file for the SLP which is needed if no SLP ob exists, or if it has failed qc - or if we choose to derive humidity using climatological P (which we have)
    slp_climatology_file  = config['SLP_climatology']
    icoads_dir            = config['ICOADS_dir'] 
#KW Added the 'recent' ICOADS dir for files 2015+
    recent_icoads_dir            = config['RECENT_ICOADS_dir'] 
    bad_id_file           = config['IDs_to_exclude']
# KW added an item for the database dir to write out the QC'd ascii data to - hijacking SQL data_base_dir for now
    data_base_dir	  = config['data_base_dir']
# KW added an item as a suffix for the output file name to note which iteration we're on
    output_suffix         = config['output_suffix']    

# KW Noting this is set to read the OLD SST stdevs - nothing reads in the newer OSTIA one yet.       
    sst_stdev_climatology_file  = config['Old_SST_stdev_climatology']
    
    sst_stdev_1_file = config['SST_buddy_one_box_to_buddy_avg']
    sst_stdev_2_file = config['SST_buddy_one_ob_to_box_avg']
    sst_stdev_3_file = config['SST_buddy_avg_sampling']

# KW added standard deviation files for AT and DPT - for MDSKate_buddy_check
    at_stdev_climatology_file  = config['AT_stdev_climatology']
    dpt_stdev_climatology_file  = config['DPT_stdev_climatology']
    
# KW Added a look for hardwired limits passed through the config file or set to None
    if ('HardLimits' in config): 
	HardLimit = np.float(config['HardLimits'])
    else:
        HardLimit = None	   
    print "This is the provided HardLimit: ",HardLimit
    #pdb.set_trace()

    print 'SST climatology =', sst_climatology_file
    print 'NMAT climatology =', nmat_climatology_file
# KW Added climatology files for the humidity variables 
    print 'DPT climatology =', dpt_climatology_file
    print 'SHU climatology =', shu_climatology_file
    print 'VAP climatology =', vap_climatology_file
    print 'CRH climatology =', crh_climatology_file
    print 'CWB climatology =', cwb_climatology_file
    print 'DPD climatology =', dpd_climatology_file
## KW Added climatology files for SLP for calculation of humidity variables if no good quality SLP ob exists
    print 'SLP climatology =', slp_climatology_file
    print 'ICOADS directory =', icoads_dir
# KW added 'recent' icoads dir
    print 'RECENT ICOADS directory =', recent_icoads_dir
    print 'List of bad IDs =', bad_id_file 
# KW added an item for the database dir to write out the QC'd ascii data to - hijacking SQL data_base_dir for now
    print 'QCd Database directory =', data_base_dir 
    print 'QCd File Suffix =', output_suffix 
    print ''

    ids_to_exclude = process_bad_id_file(bad_id_file)

#read in climatology files
    climsst = read_climatology(sst_climatology_file, 'sst')
    climnmat = read_climatology(nmat_climatology_file, 'nmat')
# KW Added climatology read in files for the humidity variables
    climat = read_climatology(at_climatology_file, 't2m_clims')
    climdpt = read_climatology(dpt_climatology_file, 'td2m_clims')
    climshu = read_climatology(shu_climatology_file, 'q2m_clims')
    climvap = read_climatology(vap_climatology_file, 'e2m_clims')
    climcrh = read_climatology(crh_climatology_file, 'rh2m_clims')
    climcwb = read_climatology(cwb_climatology_file, 'tw2m_clims')
    climdpd = read_climatology(dpd_climatology_file, 'dpd2m_clims')
## KW Added climatology read in files for SLP for calculating humidity variabls if no SLP value exists
    climslp = read_climatology(slp_climatology_file, 'p2m_clims')

# KW Note that if this points to OLD_SST_stdev_climatology then it is a 73,180,360 array whereas the SST_stdev_climatology file is just 180,360
    sst_pentad_stdev = read_climatology(sst_stdev_climatology_file, 'sst')
    
    sst_stdev_1 = read_climatology(sst_stdev_1_file, 'sst')
    sst_stdev_2 = read_climatology(sst_stdev_2_file, 'sst')
    sst_stdev_3 = read_climatology(sst_stdev_3_file, 'sst')

# KW added standard deviation files for AT and DPT - for MDSKate_buddy_check
    at_pentad_stdev = read_climatology(at_stdev_climatology_file, 't2m_stdevs')
    dpt_pentad_stdev = read_climatology(dpt_stdev_climatology_file, 'td2m_stdevs')
    
    print 'Read climatology files'

    tim00 = time.time()

    for year, month in qc.year_month_gen(year1, month1, year2, month2):

        tim0 = time.time()

        print year, month

        last_year, last_month = qc.last_month_was(year, month)
        next_year, next_month = qc.next_month_is(year, month)

        if last_year < 1850:
            last_year = 1850 # KW don't understand why last year forced to be 1850 yet
            last_month = 1

        print last_year, last_month, next_year, next_month

        reps = ex.Deck()
        count = 0

# KW This takes a long time to read in each year/month and process
# For every candidate year/month the year/month before and after are also read in
# Can we store the candidate year/month and following year/month for the next loop?
# Hopefully there will be enough memory on spice
# HOWEVER - IF WE RUN MANY YEARS IN PARALELL THEN OK TO READ IN EACH TIME

        for readyear, readmonth in qc.year_month_gen(last_year, 
                                                     last_month, 
                                                     next_year, 
                                                     next_month):

            print readyear, readmonth

            syr = str(readyear)
            smn = "%02d" % (readmonth)

# KW THIS BIT IS FOR 2.5.0/1    
#            filename = icoads_dir+'/R2.5.1.'+syr+'.'+smn+'.gz'
# KW FOUND A BUG - changed 'year' to 'readyear' below because it was trying to 
# read R2.5.2.2007.12.gz because 'year'=2008, 'month'=1
# KW Now added a catch for 'recent' years - at present this is anything from 2015 onwards - data only available in IMMA (not IMMA2) format - no UID!
#            if ((readyear > 2007) & (readyear < 2015)):
#                filename = icoads_dir+'/R2.5.2.'+syr+'.'+smn+'.gz'
#            if (readyear >= 2015):
#                filename = recent_icoads_dir+'/IMMA.'+syr+'.'+smn+'.gz'
# KW THIS BIT IS FOR 3.0.0/1
            filename = icoads_dir+'/IMMA1_R3.0.0_'+syr+'-'+smn+'.gz'
            if (readyear >= 2015):
                filename = recent_icoads_dir+'/IMMA1_R3.0.1_'+syr+'-'+smn+'.gz'
    
            icoads_file = gzip.open(filename,"r")

# KW Noted that this creates an object of whole month of IMMA data separated into all available parameters from all available attachments
# The rec.read bit later could be speeded up by ignoring the attachments we are not interested in in the first place?    
# The rec object has a .data dictionary of all variables (see IMMA2.py for variable IDs/keys
            rec = IMMA()
   
            EOF = False
    
            while not(EOF):

#need to wrap the read in an exception catching thingy 
#because there are some IMMA records which contain control 
#characters
                try:
                    result = rec.read(icoads_file)
                    if result == None:
                        EOF = True
                        # KW are we sure this isn't doing anything silly later when rec is overwritten with a new rec - could
			# this overwrite ids_to_exclude[0]?
			rec.data['ID'] = ids_to_exclude[0]
                except:
                    rec.data['ID'] = ids_to_exclude[0]


                if not(rec.data['ID'] in ids_to_exclude):

#strip everything out of the IMMA record except what we # KW (Kate Robert and John)# need
# KW this should work for both IMMA and IMMA1 e.g. C4 (IMMA) and C7 (IMMA1) use same 'key's so it 'should' find
# them because both are encoded in IMMA2.py
		    keys = []
                    for key in rec.data:
                        keys.append(key)
                    for key in keys:
# KW Added quite a few things in here - assume these don't have to be all from attachment 0 because UID isn't
# Assume they don't have to be in a particular order either
# I've put them in the order they appear in the attachments
# See: RequiredIMMAColumnsforHadISDH.xlsx
# Only a few of these will be written out but they are useful in the QC and bias adjustment process
# May remove some of these later if they are not useful - to save time/memory
#                        if not(key in ['YR','MO','DY','HR','LAT','LON',
#                                       'SST','AT','DCK','ID','PT','SI',
#                                       'SIM','DS','VS','SLP','UID','SID']):
                        if not(key in ['YR','MO','DY','HR','LAT','LON',
				       'DS','VS','II','ID','C1',
				       'DI','D','WI','W','VI','VV','SLP',
				       'IT','AT','WBTI','WBT','DPTI','DPT','SI','SST',
				       'DCK','SID','PT','DUPS',
				       'COR','TOB','TOT','EOT','TOH','EOH',
				       'SIM','LOV','HOP','HOT','HOB','HOA','SMF',
				       'UID']):
                            if key in rec.data: del rec.data[key]
# KW So I've noticed that if one of the listed keys above isn't in the ob then a data['key'] isn't
# set up (makes sense!) so when I come to print them later it all goes to pot
# So, I loop through the non-core0 keys here to add blank keys where they are missing
# KW Added 'UID' to this list because it is not present in the RECENT_ICOADS (2015+)
		    for inkey in ['DUPS','COR','TOB','TOT','EOT',
		                  'TOH','EOH','SIM','LOV','HOP','HOT','HOB','HOA','SMF','UID']:
		        if not(inkey in keys):
			    #print("Missing key: ",inkey)
			    rec.data[inkey] = None
			    					
                    rep = ex.MarineReport(rec)
                    del rec

#************HadISDH ONLY*******************************
# KW Added a catch here to check the platform type and whether there is both a T (AT) and DPT  present.
# Only keep the ob if it is from a ship (0,1,2,3,4,5) or moored platform/buoy (6,8,9,10,15) and has 
# AT and DPT present.
# This may not be desirable for a full run but should save time/memory for HadISDH
# If HadISDHSwitch == True then the ob needs to pass the test else all obs are processed
# No QC performed yet so cannot call get_qc - qc.value_check returns 0 if present and 1 if noval
# Previously I had also pulled through PT=14 but this can be a coastal or island station - so not what we want.
# KW Oct 2016 - I've now decided that future runs should NOT include any platforms. We don't have height
# info and they can vary from <10 to >200m so its just too screwy
#		    if (not (HadISDHSwitch)) | ((rep.data['PT']  in [0,1,2,3,4,5,6,8,9,10,15]) & 
		    if (not (HadISDHSwitch)) | ((rep.data['PT']  in [0,1,2,3,4,5,6,8]) & 
		                                (qc.value_check(rep.getvar('AT')) == 0) & 
						(qc.value_check(rep.getvar('DPT')) == 0)):

# KW TESTED: WORKS IF VALUES ARE BLANK AT LEAST
# KW CHECK THAT THIS KICKS OUT OBS WITH REPORTED MISSING VALUES (e.g. -99.9 or 99.9) FOR AT or DPT		    
#*******************************************************

# KW Call my rep.setvar routine that I built into the MarineReport in Extended_IMMA.py
# Use this to add blank var containers for the humidity variables that are calculated 
# later
                        rep.setvar(['SHU','VAP','CRH','CWB','DPD'])

# KW Get climatologies for slp to calculate humidity values if no good quality qc ob exists
                        rep_slp_clim = get_clim(rep, climslp)
			#print('SLP: ',rep_slp_clim)
			#if (count == 10):
			#    pdb.set_trace()
                        rep.add_climate_variable('SLP', rep_slp_clim)

# KW Calculate humidity variables here - so we can then kick out anything really silly e.g. RH>150
# Very silly values can cause longer line lengths at output which is an extra problem for post processing
# For the longer term these could be set to missing but we just want to focus on 'good' humidity obs for now
# Use my new routine as part of the Extended_IMMA MarineReport class rep.calcvar() 
# This routine returns values as None if there is no climslp or if RH is < 0 or > 150.
                        rep.calcvar(['SHU','VAP','CRH','CWB','DPD'])
			
# Now we have the checker for very silly values - which will just break the loop
# No RH - means that there is either an AT or DPT missing
# RH must be between 0 and 150
# AT must be between -80 and 65
# DPT must be between -80 and 65
# SHU must be greater than 0.0
# Inadvertently, this kicks out any ob for which no climatology is available - the ones that would later fail pos or date checks
# Later on - we may change this to just set the humidity values to missing rather than delete the ob. SST might be ok after all.
                        if (rep.getvar('CRH') == None):
#			    print('Found a SILLINESS ',rep.getvar('AT'),rep.getvar('DPT'))
#			    pdb.set_trace()
			    # delete the rep to keep things tidy
			    del rep
			    # create a new rec because we're skipping the end of the WHILE loop
			    rec = IMMA()
			    continue
                        if ((rep.getvar('CRH') <= 0.0) | (rep.getvar('CRH') > 150.0)):
#			    print('Found a SILLINESS ',rep.getvar('AT'),rep.getvar('DPT'))
#			    pdb.set_trace()
			    # delete the rep to keep things tidy
			    del rep
			    # create a new rec because we're skipping the end of the WHILE loop
			    rec = IMMA()
			    continue
                        if ((rep.getvar('AT') < -80.) | (rep.getvar('AT') > 65.)):
#			    print('Found a SILLINESS ',rep.getvar('AT'),rep.getvar('DPT'))
#			    pdb.set_trace()
			    # delete the rep to keep things tidy
			    del rep
			    # create a new rec because we're skipping the end of the WHILE loop
			    rec = IMMA()
			    continue
                        if ((rep.getvar('DPT') < -80.) | (rep.getvar('DPT') > 65.)):
#			    print('Found a SILLINESS ',rep.getvar('AT'),rep.getvar('DPT'))
#			    pdb.set_trace()
			    # delete the rep to keep things tidy
			    del rep
			    # create a new rec because we're skipping the end of the WHILE loop
			    rec = IMMA()
			    continue
                        if (rep.getvar('SHU') <= 0.0):
#			    print('Found a SILLINESS ',rep.getvar('AT'),rep.getvar('DPT'))
#			    pdb.set_trace()
			    # delete the rep to keep things tidy
			    del rep
			    # create a new rec because we're skipping the end of the WHILE loop
			    rec = IMMA()
			    continue
					
# Get climatologies for all variables (for outlier test and anomaly creation [done in buddy check and for final print out] - if AT or DPT are missing (None) then do not carry on processing that variable
# If we're using OBSclims then there are missing data which will be returned as None (NOT A STRING!!!)
# KW Added bit to find and store climatological stdev for AT and DPT - for outlier test 
                        rep_sst_clim = get_clim(rep, climsst)
                        rep.add_climate_variable('SST', rep_sst_clim)

# KW Set to read in ERA (or OBS+ERA) clim file for AT (not NMAT)
#                        rep_mat_clim = get_clim(rep, climnmat)
                        rep_mat_clim = get_clim(rep, climat)
                        rep_mat_stdev = get_clim(rep, at_pentad_stdev)
			#print(rep_mat_clim,rep_mat_stdev)
			#pdb.set_trace()
## KW added to test clim value pulled out
#			print(rep.getvar('UID'),rep.getvar('AT'),rep_mat_clim,rep.getnorm('AT'))			
#			print(rep.getvar('UID'),rep.getvar('AT'),rep_mat_stdev,rep.getstdev('AT'))			
#			if (count == 10):
#			    pdb.set_trace() 
## KW This seems to be pulling out the correct climatological value 		    
                        if ((rep_mat_clim == None) | (rep_mat_stdev == None)):
			    del rep
			    # create a new rec because we're skipping the end of the WHILE loop
			    rec = IMMA()
			    continue
			else:			
                            rep.add_climate_variable('AT', rep_mat_clim)
                            rep.add_stdev_variable('AT', rep_mat_stdev)

                        rep_dpt_clim = get_clim(rep, climdpt)
                        rep_dpt_stdev = get_clim(rep, dpt_pentad_stdev)
                        if ((rep_dpt_clim == None) | (rep_dpt_stdev == None)):
			    del rep
			    rec = IMMA()
			    continue
			else:			
                            rep.add_climate_variable('DPT', rep_dpt_clim)
                            rep.add_stdev_variable('DPT', rep_dpt_stdev)

                        rep_shu_clim = get_clim(rep, climshu)
                        if (rep_shu_clim == None) : # if there is no SHU then either an AT or DPT would be missing I think so loop shoudld already be stopped
			    del rep
			    rec = IMMA()
			    continue
			else:			
                            rep.add_climate_variable('SHU', rep_shu_clim)

			rep_vap_clim = get_clim(rep, climvap)
                        if (rep_vap_clim == None) : # if there is no SHU then either an AT or DPT would be missing I think so loop shoudld already be stopped
			    del rep
			    rec = IMMA()
			    continue
			else:			
                            rep.add_climate_variable('VAP', rep_vap_clim)

		        rep_crh_clim = get_clim(rep, climcrh)
                        if (rep_crh_clim == None) : # if there is no SHU then either an AT or DPT would be missing I think so loop shoudld already be stopped
			    del rep
			    rec = IMMA()
			    continue
			else:			
                            rep.add_climate_variable('CRH', rep_crh_clim)

			rep_cwb_clim = get_clim(rep, climcwb)
                        if (rep_cwb_clim == None) : # if there is no SHU then either an AT or DPT would be missing I think so loop shoudld already be stopped
			    del rep
			    rec = IMMA()
			    continue
			else:			
                            rep.add_climate_variable('CWB', rep_cwb_clim)

			rep_dpd_clim = get_clim(rep, climdpd)
                        if (rep_dpd_clim == None) : # if there is no SHU then either an AT or DPT would be missing I think so loop shoudld already be stopped
			    del rep
			    rec = IMMA()
			    continue
			else:			
                            rep.add_climate_variable('DPD', rep_dpd_clim)
					
#Deck 701 has a whole bunch of otherwise good obs with missing Hours.
#Set to 0000UTC and recalculate the ob time
                        if (rep.getvar('DCK') == 701 and 
                            rep.getvar('YR') < 1860 and 
                            rep.getvar('HR') == None):
                            rep.data['HR'] = 0
                            rep.calculate_dt()

# KW Added a HardLimit variable that has to be passed to the base_qc_report
                        #rep = base_qc_report(rep)
                        rep = base_qc_report(rep,HardLimit)

#			print(rep.getvar('ID'),rep.getvar('AT'),rep.getvar('DPT'),rep.getvar('SHU'),rep.getvar('CRH'),rep.getvar('VAP'))
#                        pdb.set_trace()

                        reps.append(rep)
                        count += 1

                rec = IMMA()

            icoads_file.close()

        tim1 = time.time()
        print count, " obs read and base QC ", tim1-tim0
        
#filter the obs into passes and fails of basic positional QC        
# KW NOtes that this uses the month before and after to apply track check - and so actually spends time applying
# track check to the month before and month after too, which will then be ignored and redone later, with its following month
# Is there scope to save effort here by only checking the candidate month while still passing the surrounding months for info
        reps.sort()
        filt = ex.QC_filter()
        filt.add_qc_filter('POS', 'date',   0)
        filt.add_qc_filter('POS', 'pos',    0)
        filt.add_qc_filter('POS', 'blklst', 0)
        passes, reps = filt.split_reports(reps)
        passes.sort()

        tim2 = time.time()
        print "obs filtered and sorted in ", tim2-tim1, len(reps)+len(passes)

# KW So in here we could put some kind of parsing loop to say that if you are looping through more than one month
# then you could save the candidate and previous month

# KW ALSO NOW ONLY CARRY ON WITH THOSE OBS THAT PASS BASE QC (date, pos, blacklist)
# KW commented out the following:
##all fails pass track check 
#        reps.set_qc('POS', 'trk', 0)
#        reps.set_qc('POS', 'few', 0)
#        reps.set_qc('SST', 'rep', 0)
#        reps.set_qc('AT',  'rep', 0)
## KW Added for DPT
#        reps.set_qc('DPT',  'rep', 0)
#	reps.set_qc('DPT', 'repsat', 0)
# KW End of commenting out
# KW now clear and reset reps so that it gets overwritten and filled with only passes
        del reps
	reps = ex.Deck()

#track check the passes one ship at a time
        for one_ship in passes.get_one_ship_at_a_time():
            one_ship.track_check()
# KW I don't think we need to spend time doing this for SST so have commented out
#            one_ship.find_repeated_values(threshold=0.7, intype='SST')
# KW FOr AT and DPT this procedure now also looks at the proportion of obs in a track (>20 obs - same as rep value check) that have .0 precision
# Where >=50% obs end in .0 the ATround or DPTround flag is set to 1
            one_ship.find_repeated_values(threshold=0.7, intype='AT')
# KW Added for DPT
# KW For DPT this QC procedure now also searches for persistant streaks of 100% RH (AT == DPT) and flags repsat
            one_ship.find_repeated_values(threshold=0.7, intype='DPT')

            for rep in one_ship.rep_feed():
                rep.reset_ext()
                reps.append(rep)

        del passes

        reps.sort()

        tim3 = time.time()
        print "obs track checked in ", tim3-tim2, len(reps)

#*******************************
# KW Commented out for now to save time on debug
##SST buddy check
## KW NOtes that this uses the month before and after to apply track check - and so actually spends time applying
## track check to the month before and month after too, which will then be ignored and redone later, with its following month
## Is there scope to save effort here by only checking the candidate month while still passing the surrounding months for info
#        filt = ex.QC_filter()
#        filt.add_qc_filter('POS', 'date',   0)
#        filt.add_qc_filter('POS', 'pos',    0)
#        filt.add_qc_filter('POS', 'blklst', 0)
#        filt.add_qc_filter('POS', 'trk',    0)
#        filt.add_qc_filter('SST', 'noval',  0)
#        filt.add_qc_filter('SST', 'freez',  0)
#        filt.add_qc_filter('SST', 'clim',   0)
#        filt.add_qc_filter('SST', 'nonorm', 0)
#
## KW Notes splitting marine obs into passes and fails
#        passes, reps = filt.split_reports(reps)
#
## KW Thinks this only buddy checks those obs that pass the filter of QC above
#        passes.bayesian_buddy_check('SST', sst_stdev_1, sst_stdev_2, sst_stdev_3)
#        passes.mds_buddy_check('SST', sst_pentad_stdev)
#
#******************************************
## KW Thinks all fails obs that do not pass teh QC filter above are not buddy checked - they are set to 0
## which means pass but should not be used later because they fail one of the other basic checks
#        reps.set_qc('SST', 'bbud', 0)
#        reps.set_qc('SST', 'bud',  0)

#****************************************
# KW Commented out to save time
#        for i in range(0, len(passes)):
#            rep = passes.pop(0)
#            reps.append(rep)
#
#        del passes
#
#        reps.sort()
#****************************************
        tim4 = time.time()
        print "obs SST buddy checked in ", tim4-tim3, len(reps)

#NMAT buddy check
# KW NOtes that this uses the month before and after to apply track check - and so actually spends time applying
# track check to the month before and month after too, which will then be ignored and redone later, with its following month
# Is there scope to save effort here by only checking the candidate month while still passing the surrounding months for info?
# For now I've made mdsKATE_buddy_check which only applies actual check to candidate month and year. It also uses actual pentad
# for that time of year rather than the average pentad stdev.
        filt = ex.QC_filter()
## KW Commented out date/pos/blklst as these have already been filtered out
#        filt.add_qc_filter('POS', 'date',   0)
#        filt.add_qc_filter('POS', 'pos',    0)
#        filt.add_qc_filter('POS', 'blklst', 0)
        filt.add_qc_filter('POS', 'trk',    0)
# KW commented out because we want to try to use all obs for AT and SPT
#        filt.add_qc_filter('POS', 'day',    0)
# KW Commented out because we've already filtered so that only present obs are retained
#        filt.add_qc_filter('AT',  'noval',  0)
        filt.add_qc_filter('AT',  'clim',   0)
        filt.add_qc_filter('AT',  'nonorm', 0)
# KW Notes that 'reps' are those obs that have failed one of the tests in the filter above
        passes, reps = filt.split_reports(reps)

# KW Notes that passes is an object containing a months worth of marine obs that pass (flag=0) for all above filters
# Both the bayesian buddy check and the mds buddy check test for distance to neighbours in space and time and flag
# with a 1 where it is too great/fails.
# KW NOT GOING TO APPLY BAYESIAN BUDDY CHECK BECAUSE WE CAN'T USE IT FOR DPT AND 
# ITS EXPERIMENTAL???
#        passes.bayesian_buddy_check('AT', sst_stdev_1, sst_stdev_2, sst_stdev_3)
# KW Commented out original mds_buddy_check to use mdsKATE_buddy_check instead (like DPT) which uses the seasonal stdev
# rather than the average and only applies buddy check to candidate month
# ALSO = we now use clim T stdevs from ERA (will eventually be obs+ERA combo?)
#        passes.mds_buddy_check('AT', sst_pentad_stdev)
# KW Added a HardLimit variable that has to be passed to mdsKATE_buddy_check for the stdev multiplier
        passes.mdsKATE_buddy_check('AT',  at_pentad_stdev, year, month, HardLimit)

# KW - all fails (reps) are set to have a flag of 0 which means to pass the buddy checks.because there is no point in spending
# further time buddy checking them, same as for track checks
# KW NOT GOING TO APPLY BAYESIAN BUDDY CHECK BECAUSE WE CAN'T USE IT FOR DPT AND 
# ITS EXPERIMENTAL???
#        reps.set_qc('AT', 'bbud', 8)
        reps.set_qc('AT', 'bud', 8)

        for i in range(0, len(passes)):
            rep = passes.pop(0)
            reps.append(rep)

        del passes

        reps.sort()

        tim5 = time.time()
        print "obs MAT buddy checked in ", tim5-tim4, len(reps)

# Don't think we need to set - if its not set it will be 9!
## KW Added buddy check for DPT - NOT RUNNING BAYESIAN BECAUSE WE DON'T HAVE APPROPRIATE DATA - SET FLAG TO 8!
#        reps.set_qc('DPT', 'bbud', 8)

#DPT buddy check
# KW NOtes that this uses the month before and after to apply track check - and so actually spends time applying
# track check to the month before and month after too, which will then be ignored and redone later, with its following month
# Is there scope to save effort here by only checking the candidate month while still passing the surrounding months for info
        filt = ex.QC_filter()
# KW commented out date, pos, blklst because we've already got rid of those that fail these
#        filt.add_qc_filter('POS', 'date',   0)
#        filt.add_qc_filter('POS', 'pos',    0)
#        filt.add_qc_filter('POS', 'blklst', 0)
        filt.add_qc_filter('POS', 'trk',    0)
# KW Commented out day because we want to try to use all obs for DPT and AT
#        filt.add_qc_filter('POS', 'day',    0) # Hmmm so only checking the nightime obs
# KW Commented out because we've already filtered so that only present obs are retained
#        filt.add_qc_filter('DPT',  'noval',  0)
        filt.add_qc_filter('DPT',  'clim',   0)
# KW commented out nonorm because there will always be a norm (if using ERA or combo ERA+obs)
#        filt.add_qc_filter('DPT',  'nonorm', 0) # KW could change this to ERANorm when we have actual climatologies from data - more useful because there always will be a norm from ERA
# KW Notes that 'reps' are those obs that have failed one of the tests in the filter above
        passes, reps = filt.split_reports(reps)

# KW Notes that passes is an object containing a months worth of marine obs that pass (flag=0) for all above filters
# Both the bayesian buddy check and the mds buddy check test for distance to neighbours in space and time and flag
# with a 1 where it is too great/fails.
#        passes.bayesian_buddy_check('DPT', sst_stdev_1, sst_stdev_2, sst_stdev_3)
#        passes.mds_buddy_check('DPT', dpt_pentad_stdev)
# KW Added a HardLimit variable that has to be passed to mdsKATE_buddy_check for the stdev multiplier
# KW Using Kate's version of MDS buddy check now which has a stdev for each pentad and only checks candidate month
        passes.mdsKATE_buddy_check('DPT', dpt_pentad_stdev, year, month, HardLimit)

# KW - all fails (reps) are set to have a flag of 0 which means to pass the buddy checks.because there is no point in spending
# further time buddy checking them, same as for track checks
#        reps.set_qc('DPT', 'bbud', 8)
        reps.set_qc('DPT', 'bud', 8) # KW set as 8 for now

        for i in range(0, len(passes)):
            rep = passes.pop(0)
            reps.append(rep)

        del passes

        reps.sort()

        tim6 = time.time()
        print "obs DPT buddy checked in ", tim6-tim5, len(reps)

        syr = str(year)
        smn = "%02d" % (month)
# KW changed outfile from icoards_dir to data_base_dir so that it writes to a different place to where the original 
# data are stored - don't want to mess with John's working version.
        outfile = open(data_base_dir+'/new_suite_'+syr+smn+'_'+output_suffix+'.txt', 'w')
        for rep in reps.reps:
            if rep.data['YR'] == year and rep.data['MO'] == month:
                outfile.write(rep.print_report())
        outfile.close()

        del reps

        tim11 = time.time()
        print year, " so far in ", tim11-tim00
Exemplo n.º 12
0
def main(argv):
    """
    Tracking_QC_wrapper.py

    script to control the running of the tracking QC::

      python Tracking_QC_wrapper.py -config configuration.txt -gap 3 -yr1 1985 -yr2 2005 -mn1 1 -mn2 12 -edge new

    Reads in files containing list of IDs for each month and decides when to quality control the observations:

    Inputs

    -config
      specifies the location of the configuration file.

    -gap
      specifies the gap in months that must separate chunks of data

    -yr1
      year of start month.

    -mn1
      month of start month.

    -yr2
      year of end month.

    -mn2
      month of end month.

    -edge
      specifies how different cases should be treated. 'all' will run QC for all chunks separated by "gap" months of
      data; 'standard' will run for all chunks except for those that start or end fewer than "gap" months from the start
      or end of the series; 'new' will run only those chunks that have a gap of exactly "gap" months from the end of the
      series; 'noend' will run for all chunks except for those that end fewer than "gap" months from the end of the series.

    The four "edge" cases allow for running in different modes. In principle, 'standard' will QC everything that will
    not change from the addition of data to the start or end of the series. It is intended for running all the
    historical QC in preparation for monthly updates. The flag 'new' can be used for real time updates to only QC those
    IDs that have not been eligible for QC in earlier months and have an appropriate gap at the end of the series. The
    flag 'all' will QC everything, including chunks at the start and end of the series which may change with extra data
    appended to either end of the series. The 'noend' flag can be used to QC everything that will not change from addition
    of data to the end of the series, which may be more appropriate ahead of monthly updates.

    Note that adding extra data in the middle of the series is liable to change all QC outcomes regardless of whether QC
    was run in 'all', 'standard', 'noend' or 'new' configurations.
    """

    parser = argparse.ArgumentParser(
        description='Marine QC system, main program')
    parser.add_argument('-config',
                        type=str,
                        default='configuration.txt',
                        help='name of config file')
    parser.add_argument('-gap',
                        type=int,
                        default=3,
                        help='gap of -gap months needed to trigger QC of ID')
    parser.add_argument('-yr1',
                        type=int,
                        default=1985,
                        help='first year to analyse')
    parser.add_argument('-yr2',
                        type=int,
                        default=2019,
                        help='last year to analyse')
    parser.add_argument('-mn1',
                        type=int,
                        default=1,
                        help='first month to analyse in first year')
    parser.add_argument('-mn2',
                        type=int,
                        default=12,
                        help='last month to analyse in last year')
    parser.add_argument('-edge',
                        type=str,
                        default='standard',
                        help='How to deal with edge cases')

    args = parser.parse_args()

    inputfile = args.config
    y1 = args.yr1
    y2 = args.yr2
    m1 = args.mn1
    m2 = args.mn2
    gap = args.gap
    edge = args.edge

    runmonthid = "{}{:02}-{}{:02}".format(y1, m1, y2, m2)

    if edge not in ['standard', 'all', 'new', 'noend']:
        raise Exception("edge not one of 'standard', 'all', 'new' or 'noend'")

    config = ConfigParser.ConfigParser()
    config.read(inputfile)
    out_dir = config.get('Directories', 'out_dir')
    track_out_dir = config.get('Directories', 'track_out_dir')

    with open(config.get('Files', 'parameter_file'), 'r') as f:
        parameters = json.load(f)

    # establish full list of IDs to QC
    id_dictionary = {}

    for year, month in qc.year_month_gen(y1, m1, y2, m2):
        # create directory and file names for the ID list
        extdir = safe_dir(out_dir, year, month)
        idfile = open(extdir + '/ID_file.txt', 'r')

        for line in idfile:
            line = line.rstrip("\n")
            columns = line.split(',')
            if columns[0] in id_dictionary:
                id_dictionary[columns[0]].setym(year, month, 1)
            else:
                id_dictionary[columns[0]] = ym.YMCounter(y1, m1, y2, m2)
                id_dictionary[columns[0]].setym(year, month, 1)

        idfile.close()

    for targetid in id_dictionary:

        g = id_dictionary[targetid]
        print(targetid, g.counter)

        for yy1, mm1, yy2, mm2, cl in g.yield_start_and_end_dates(gap):

            if edge == 'all':
                print('Submit', yy1, mm1, yy2, mm2, cl)
                write_submission(inputfile, targetid, yy1, mm1, yy2, mm2, cl,
                                 runmonthid, track_out_dir,
                                 parameters['runid'])

            if edge == 'standard':
                if 'regular' in cl:
                    print('Submit', yy1, mm1, yy2, mm2, cl)
                    write_submission(inputfile, targetid, yy1, mm1, yy2, mm2,
                                     cl, runmonthid, track_out_dir,
                                     parameters['runid'])
                else:
                    print('Ignore', yy1, mm1, yy2, mm2, cl)

            if edge == 'new':
                if 'new' in cl:
                    print('Submit', yy1, mm1, yy2, mm2, cl)
                    write_submission(inputfile, targetid, yy1, mm1, yy2, mm2,
                                     cl, runmonthid, track_out_dir,
                                     parameters['runid'])
                else:
                    print('Ignore', yy1, mm1, yy2, mm2, cl)

            if edge == 'noend':
                if 'regular' in cl or 'start_edge_case' in cl:
                    print('Submit', yy1, mm1, yy2, mm2, cl)
                    write_submission(inputfile, targetid, yy1, mm1, yy2, mm2,
                                     cl, runmonthid, track_out_dir,
                                     parameters['runid'])
                else:
                    print('Ignore', yy1, mm1, yy2, mm2, cl)

        print()
def main(argv):
    
    '''
    The new track check program. First the program gets a list of all unique IDs in the month 
    that is to be track checked. It then reads in three months of data at a time: the month 
    you want to track check, a month before and a month after. For each unique ID, the track 
    check is run.
    
    Track check comprises as set of related tests
    
    This program checks positional data for individual ships and buoys for internal consistency; 
    checking reported positions against positions calculated using reported speeds and directions.
    
    The obs are sorted by call-sign then date. Obs can only be checked if they have a valid call-sign 
    that is unique to one ship or buoy, so obs with no call-sign or with the generic call-signs 'SHIP' 
    or 'PLAT' are passed unchecked. The call-sign '0102' was apparently shared by several ships, so obs 
    with this call-sign are also passed unchecked.
    '''
    
    print '###################'
    print 'Running New Track Check'
    print '###################'
    
    inputfile = 'configuration.txt'
    
    try:
        opts, args = getopt.getopt(argv, 
                                   "hi:", 
                                   ["ifile=", 
                                    "year1=", 
                                    "year2="])
    except getopt.GetoptError:
        print 'Usage Make_DB.py -i <configuration_file>'+\
        ' --year1 <start year> --year2 <end year>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'test.py -i <inputfile> -o <outputfile>'
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-x", "--year1"):
            try:
                year1 = int(arg)
            except:
                sys.exit("Failed: year1 not an integer")
        elif opt in ("-y", "--year2"):
            try:
                year2 = int(arg)
            except:
                sys.exit("Failed: year2 not an integer")
                
    print 'Input file is ', inputfile
    print 'Running from ', year1, ' to ', year2
    print ''
    
    config = qc.get_config(inputfile)
    
    data_base_host        = config['data_base_host']
    data_base_name        = config['data_base_name'] 

    print 'Data base host =', data_base_host
    print 'Data base name =', data_base_name
 
    print ''

    connection = MySQLdb.connect(host=data_base_host, 
                                 user='******',
                                 db=data_base_name)

    #need two cursors, one for reading and one for making QC changes
    cursor = connection.cursor()
    cursor2 = connection.cursor()
    
    t00 = time.time()
    
    for years, months in qc.year_month_gen(year1, 1, year2, 12):
    
    #want to get a month either side of the target month, 
    #which may be in different years
        last_year, last_month = qc.last_month_was(years, months)
        next_year, next_month = qc.next_month_is(years, months)
        
        print years, months
    
        t0 = time.time()
        
        first_year = min([last_year, years, next_year])
        final_year = max([last_year, years, next_year])
    
        if first_year < 1850:
            first_year = 1850
        if final_year > 1990:
            final_year = 1990
    
    #first and last julian days are +- approximately one month
        month_lengths = qc.month_lengths(years)
        jul1 = qc.jul_day(years, months, 1)-10
        jul2 = qc.jul_day(years, months, month_lengths[months-1])+10
        
        '''Get all unique IDs for this month and fill a dictionary 
        with all the distinct ids that we want to QC as keys and an 
        empty Voyage for each key'''            
        allids = db.get_unique_ids(cursor, years, months)
        reps = {}
        for idrows in allids:
            thisid = idrows[0]
            reps[thisid] = qc.Voyage()
        
        t1 = time.time()
        print "got all IDs ",t1-t0
        
    #extract all data for this month and a month either side
        for yyy in range(first_year, final_year+1):
            
            '''
            Build filter for extracting data from data base and then extract. 
            In this case, we want observations between jul1 and jul2 which pass 
            the base QC checks. 
            '''
            qcfilter = db.Quality_Control_Filter()
            qcfilter.jul1 = jul1
            qcfilter.jul2 = jul2
            qcfilter.set_multiple_qc_flags_to_pass(['bad_position',
                                                    'bad_date',
                                                    'blacklist'])
            
            sql_request = db.build_sql_query(yyy, qcfilter)
            
            cursor.execute(sql_request)
            numrows = cursor.rowcount

    #put each ob into the dictionary if there is a key for it
            for i in range(numrows):
                rows = cursor.fetchone()
                rep = qc.ExtendedMarineReport.report_from_array(rows)
                if rep.id in reps:
                    reps[rep.id].add_report(rep)

        t2 = time.time()
        print "read all obs from DB",t2-t1

    #loop over all the distinct callsigns, extract the obs 
    #where the callsign matches and track check them
        for idrows in allids:
            thisid = idrows[0]
            matches = reps[thisid]
            matches.sort()

#run improved track check with spherical geometry etc.
            mqcs = qc_new_track_check.mds_full_track_check(matches)
            matches.find_repeated_values()

            for rep in matches.reps:
                if rep.month == months:
                    result = db.update_db_qc_single_flag(rep,rep.bad_track,
                                                         'extra_qc',
                                                         'bayesian_track_check',
                                                         years,cursor2)
                    result = db.update_db_qc_single_flag(rep,rep.repeated_value,
                                                         'extra_qc',
                                                         'repeated_value',
                                                         years,cursor2)

            split_matches = qc.split_generic_callsign(matches)

            for split in split_matches:
                qcs = qc_new_track_check.mds_full_track_check(split)

#update QC in the data base but only for the target month
                for i, rep in enumerate(split.reps):
                    if rep.month == months:
                        result = db.update_db_qc_single_flag(rep,
                                                             qcs[i],
                                                             'extra_qc',
                                                             'new_track_check',
                                                             years,
                                                             cursor2)
                        result = db.update_db_qc_single_flag(rep,
                                                             rep.fewsome_check,
                                                             'base_qc',
                                                             'fewsome_check',
                                                             years,
                                                             cursor2)

        connection.commit()

        t3 = time.time()
        print "done ",t3-t2

        #db.report_qc_counts(cursor, years, months)
    
    connection.close()
    
    print "All Done :)"