Example #1
def internal_checks(station_info,
                    restart_id="",
                    end_id="",
                    second=False,
                    all_checks=True,
                    duplicate=False,
                    odd=False,
                    frequent=False,
                    diurnal=False,
                    gap=False,
                    records=False,
                    streaks=False,
                    climatological=False,
                    spike=False,
                    humidity=False,
                    cloud=False,
                    variance=False,
                    winds=False,
                    diagnostics=False,
                    plots=False):
    '''
    Run through internal checks on list of stations passed
    
    :param list station_info: list of lists - [[ID, lat, lon, elev]] - strings
    :param str restart_id: which station to start on
    :param str end_id: which station to end on
    :param bool second: do the second run 

    :param bool all_checks: run all the checks

    :param bool duplicate/odd/frequent/diurnal/gap/records/streaks/
                climatological/spike/humidity/cloud/variance/winds: run each test separately
    :param bool diagnostics: print extra material to screen
    :param bool plots: create plots from each test [many files if all stations/all tests]

    '''
    first = not second

    if all_checks:
        duplicate = True
        odd = True
        frequent = True
        diurnal = True
        gap = True
        records = True
        streaks = True
        climatological = True
        spike = True
        humidity = True
        cloud = True
        variance = True
        winds = True
    else:
        print "single tests selected"

    qc_code_version = subprocess.check_output(['svnversion']).strip()

    # sort truncated run
    startindex = 0
    if restart_id != "":
        startindex, = np.where(station_info[:, 0] == restart_id)

    if end_id != "":
        endindex, = np.where(station_info[:, 0] == end_id)
        if endindex != len(station_info) - 1:
            station_info = station_info[startindex:endindex + 1]
        else:
            station_info = station_info[startindex:]
    else:
        station_info = station_info[startindex:]

    for st, stat in enumerate(station_info):

        # if st%100 != 0: continue # do every nth station

        print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S")
        print "{:35s} {:d}/{:d}".format("Station Number : ", st + 1,
                                        len(station_info))
        print "{:35s} {}".format("Station Identifier :", stat[0])

        if plots or diagnostics:
            logfile = ""
        else:
            if first:
                logfile = file(LOG_OUTFILE_LOCS + stat[0] + '.log', 'w')
            elif second:
                logfile = file(LOG_OUTFILE_LOCS + stat[0] + '.log',
                               'a')  # append to file if second iteration.
            logfile.write(
                dt.datetime.strftime(dt.datetime.now(),
                                     "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write("Internal Checks\n")
            logfile.write("{:35s} {}\n".format("Station Identifier :",
                                               stat[0]))

        process_start_time = time.time()

        station = utils.Station(stat[0], float(stat[1]), float(stat[2]),
                                float(stat[3]))

        # latitude and longitude check

        if np.abs(station.lat) > 90.:
            if plots or diagnostics:
                print "{} {} {} {} {} {} {}\n".format(\
                        station.id,"Latitude Check",DATASTART.year, DATAEND.year,"All", "Unphysical latitude {}".format(station.lat))
            else:
                logfile.write("{} {} {} {} {} {} {}\n".format(\
                        station.id,"Latitude Check",DATASTART.year, DATAEND.year,"All", "Unphysical latitude {}".format(station.lat)))
                logfile.close()

            continue

        if np.abs(station.lon) > 180.:
            if plots or diagnostics:
                print "{} {} {} {} {} {} {}\n".format(\
                    station.id,"Longitude Check",DATASTART.year, DATAEND.year,"All", "Unphysical longitude {}".format(station.lon))
            else:
                logfile.write("{} {} {} {} {} {} {}\n".format(\
                        station.id,"Longitude Check",DATASTART.year, DATAEND.year,"All", "Unphysical longitude {}".format(station.lon)))
                logfile.close()
            continue

        # if running through the first time
        if first:

            if os.path.exists(
                    os.path.join(NETCDF_DATA_LOCS, station.id + ".nc.gz")):
                # if gzip file, unzip here
                subprocess.call([
                    "gunzip",
                    os.path.join(NETCDF_DATA_LOCS, station.id + ".nc.gz")
                ])
                time.sleep(5)  # make sure it is unzipped before proceeding

            # read in the data
            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + ".nc"),
                       station,
                       process_vars,
                       opt_var_list=carry_thru_vars,
                       diagnostics=diagnostics)

            if plots or diagnostics:
                print "{:35s}  {}\n".format("Total station record size :",
                                            len(station.time.data))
            else:
                logfile.write("{:35s}  {}\n".format(
                    "Total station record size :", len(station.time.data)))

            match_to_compress = utils.create_fulltimes(station, process_vars,
                                                       DATASTART, DATAEND,
                                                       carry_thru_vars)

            station.qc_flags = np.zeros(
                [len(station.time.data),
                 69])  # changed to include updated wind tests

            # get reporting accuracies and frequencies.

            for var in process_vars:

                st_var = getattr(station, var)
                st_var.reporting_stats = utils.monthly_reporting_statistics(
                    st_var, DATASTART, DATAEND)

        # or if second pass through?
        elif second:
            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc"),
                       station,
                       process_vars,
                       opt_var_list=carry_thru_vars,
                       diagnostics=diagnostics)
            print "{:35s}  {}\n".format("Total station record size :",
                                        len(station.time.data))

            match_to_compress = utils.create_fulltimes(station, process_vars,
                                                       DATASTART, DATAEND,
                                                       carry_thru_vars)

        # Add history text to netcdf file
        # Reporting Changes - TODO

        # Duplicate months - check on temperature ONLY
        if duplicate:
            qc_tests.duplicate_months.dmc(station, ['temperatures'],
                                          process_vars, [0],
                                          DATASTART,
                                          DATAEND,
                                          logfile,
                                          diagnostics=diagnostics,
                                          plots=plots)

        # Odd Clusters
        if odd:
            qc_tests.odd_cluster.occ(
                station, ['temperatures', 'dewpoints', 'windspeeds', 'slp'],
                [54, 55, 56, 57],
                DATASTART,
                logfile,
                diagnostics=diagnostics,
                plots=plots,
                second=second)
            utils.apply_windspeed_flags_to_winddir(station,
                                                   diagnostics=diagnostics)

        # Frequent Values
        if frequent:
            qc_tests.frequent_values.fvc(station,
                                         ['temperatures', 'dewpoints', 'slp'],
                                         [1, 2, 3],
                                         DATASTART,
                                         DATAEND,
                                         logfile,
                                         diagnostics=diagnostics,
                                         plots=plots)

        # Diurnal Cycle
        if diurnal:
            if np.abs(station.lat) <= 60.:
                qc_tests.diurnal_cycle.dcc(station, ['temperatures'],
                                           process_vars, [4],
                                           logfile,
                                           diagnostics=diagnostics,
                                           plots=plots)

            else:
                if plots or diagnostics:
                    print "Diurnal Cycle Check not run as station latitude ({}) > 60\n".format(
                        station.lat)
                else:
                    logfile.write(
                        "Diurnal Cycle Check not run as station latitude ({}) > 60\n"
                        .format(station.lat))

        # Distributional Gap
        if gap:
            qc_tests.distributional_gap.dgc(
                station, ['temperatures', 'dewpoints', 'slp'], [5, 6, 7],
                DATASTART,
                DATAEND,
                logfile,
                diagnostics=diagnostics,
                plots=plots,
                GH=True)

        # Records
        if records:
            qc_tests.records.krc(
                station, ['temperatures', 'dewpoints', 'windspeeds', 'slp'],
                [8, 9, 10, 11],
                logfile,
                diagnostics=diagnostics,
                plots=plots)
            utils.apply_windspeed_flags_to_winddir(station,
                                                   diagnostics=diagnostics)

        # Streaks and Repetitions
        if streaks:
            qc_tests.streaks.rsc(
                station,
                ['temperatures', 'dewpoints', 'windspeeds', 'slp', 'winddirs'],
                [[12, 16, 20], [13, 17, 21], [14, 18, 22], [15, 19, 23],
                 [66, 67, 68]],
                DATASTART,
                DATAEND,
                logfile,
                diagnostics=diagnostics,
                plots=plots)
            utils.apply_windspeed_flags_to_winddir(station,
                                                   diagnostics=diagnostics)

        # Climatological Outlier
        if climatological:
            qc_tests.climatological.coc(station, ['temperatures', 'dewpoints'],
                                        [24, 25],
                                        DATASTART,
                                        DATAEND,
                                        logfile,
                                        diagnostics=diagnostics,
                                        plots=plots)
            # column 26 kept spare for slp

        # Spike
        if spike:
            qc_tests.spike.sc(
                station, ['temperatures', 'dewpoints', 'slp', 'windspeeds'],
                [27, 28, 29, 65],
                DATASTART,
                DATAEND,
                logfile,
                diagnostics=diagnostics,
                plots=plots,
                second=second)
            utils.apply_windspeed_flags_to_winddir(station,
                                                   diagnostics=diagnostics)

        # Humidity cross checks
        if humidity:
            qc_tests.humidity.hcc(station, [30, 31, 32],
                                  DATASTART,
                                  DATAEND,
                                  logfile,
                                  diagnostics=diagnostics,
                                  plots=plots)

        # Cloud cross check
        if cloud:
            qc_tests.clouds.ccc(station, [33, 34, 35, 36, 37, 38, 39, 40],
                                logfile,
                                diagnostics=diagnostics,
                                plots=plots)

        # Variance
        if variance:
            qc_tests.variance.evc(
                station, ['temperatures', 'dewpoints', 'slp', 'windspeeds'],
                [58, 59, 60, 61],
                DATASTART,
                DATAEND,
                logfile,
                diagnostics=diagnostics,
                plots=plots)
            utils.apply_windspeed_flags_to_winddir(station,
                                                   diagnostics=diagnostics)

        # Winds
        if winds:
            qc_tests.winds.wdc(station, [62, 63, 64],
                               DATASTART,
                               DATAEND,
                               logfile,
                               diagnostics=diagnostics,
                               plots=plots)

        # are flags actually applied?

        if diagnostics or plots: raw_input("stop")

        # write to file
        if first:
            ncdfp.write(os.path.join(NETCDF_DATA_LOCS,
                                     station.id + "_internal.nc"),
                        station,
                        process_vars,
                        os.path.join(INPUT_FILE_LOCS, 'attributes.dat'),
                        opt_var_list=carry_thru_vars,
                        compressed=match_to_compress,
                        processing_date='',
                        qc_code_version=qc_code_version)
            # gzip the raw file
            subprocess.call(
                ["gzip",
                 os.path.join(NETCDF_DATA_LOCS, station.id + ".nc")])

        elif second:

            ncdfp.write(os.path.join(NETCDF_DATA_LOCS,
                                     station.id + "_internal2.nc"),
                        station,
                        process_vars,
                        os.path.join(INPUT_FILE_LOCS, 'attributes.dat'),
                        opt_var_list=carry_thru_vars,
                        compressed=match_to_compress,
                        processing_date='',
                        qc_code_version=qc_code_version)
            # gzip the raw file
            subprocess.call([
                "gzip",
                os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc")
            ])

        # logfile is an empty string in plots/diagnostics mode, so only
        # write and close it when it is a real file
        if plots or diagnostics:
            print dt.datetime.strftime(dt.datetime.now(),
                                       "%A, %d %B %Y, %H:%M:%S")
            print "processing took {:4.0f}s\n\n".format(time.time() -
                                                        process_start_time)
        else:
            logfile.write(
                dt.datetime.strftime(dt.datetime.now(),
                                     "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write(
                "processing took {:4.0f}s\n\n".format(time.time() -
                                                      process_start_time))
            logfile.close()

    print "Internal Checks completed\n"

    return  # internal_checks
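
A minimal driver for the function above (a sketch only), assuming the
whitespace-separated station list (ID lat lon elev) that Example #6 reads with
np.genfromtxt; the station IDs shown are illustrative:

station_info = np.genfromtxt(os.path.join(INPUT_FILE_LOCS, STATION_LIST),
                             dtype=(str))

# first pass: every test on every station, logging to file
internal_checks(station_info, all_checks=True)

# second pass over a subset, running only the spike test with output
# to screen (reads the *_mask.nc files written after the neighbour checks)
internal_checks(station_info, restart_id="010010-99999",
                end_id="030000-99999", second=True,
                all_checks=False, spike=True, diagnostics=True)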
Example #2
def neighbour_checks(station_info, restart_id = "", end_id = "", distances=np.array([]), angles=np.array([]), second = False, masking = False, doZip=False, plots = False, diagnostics = False):
    """
    Run through neighbour checks on list of stations passed
    
    :param list station_info: list of lists - [[ID, lat, lon, elev]] - strings
    :param str restart_id: which station to start on
    :param str end_id: which station to end on
    :param array distances: array of distances between station pairs
    :param array angles: array of angles between station pairs
    :param bool second: do the second run
    :param bool masking: apply the flags to the data to mask the observations.
    :param bool doZip: gzip the raw files once all stations are processed
    :param bool plots: create plots from each test
    :param bool diagnostics: print extra material to screen

    """
    first = not second

    qc_code_version = subprocess.check_output(['svnversion']).strip()

    # if distances and angles not calculated, then do so
    if (len(distances) == 0) or (len(angles) == 0):
        print "calculating distances and bearings matrix"
        distances, angles = get_distances_angles(station_info)

    # extract before truncate the array
    neighbour_elevations = np.array(station_info[:,3], dtype=float) 
    neighbour_ids        = np.array(station_info[:,0])
    neighbour_info       = np.array(station_info[:,:])

    # sort truncated run
    startindex = 0
    if restart_id != "":
        startindex, = np.where(station_info[:,0] == restart_id)


    if end_id != "":
        endindex, = np.where(station_info[:,0] == end_id)
        if endindex != len(station_info) -1:
            station_info = station_info[startindex: endindex+1]
            distances = distances[startindex:endindex+1,:]
            angles = angles[startindex:endindex+1,:]
        else:
            station_info = station_info[startindex:]
            distances = distances[startindex:,:]
            angles = angles[startindex:,:]
    else:
        station_info = station_info[startindex:]
        distances = distances[startindex:,:]
        angles = angles[startindex:,:]
        

    # process each neighbour
    for st, stat in enumerate(station_info):       

        print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S")
        print "Neighbour Check"
        print "{:35s} {}".format("Station Identifier :", stat[0])

        if not plots and not diagnostics:
            logfile = file(LOG_OUTFILE_LOCS+stat[0]+'.log','a') # append to file if second iteration.
            logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write("Neighbour Check\n")
            logfile.write("{:35s} {}\n".format("Station Identifier :", stat[0]))
        else:
            logfile = ""

        process_start_time = time.time()

        station = utils.Station(stat[0], float(stat[1]), float(stat[2]), float(stat[3]))

        # if running through the first time
        if first:

            if os.path.exists(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc.gz")):
                # if gzip file, unzip here
                subprocess.call(["gunzip",os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc.gz")])
                time.sleep(5) # make sure it is unzipped before proceeding

            # read in the data
            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc"), station, process_vars, carry_thru_vars, diagnostics = diagnostics)

            if plots or diagnostics:
                print "{:35s}  {}\n".format("Total station record size :",len(station.time.data))
            else:
                logfile.write("{:35s}  {}\n".format("Total station record size :",len(station.time.data)))

            match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars)

        # or if second pass through?
        elif second:
            if os.path.exists(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc.gz")):
                # if gzip file, unzip here
                subprocess.call(["gunzip",os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc.gz")])
                time.sleep(5) # make sure it is unzipped before proceeding

            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc"), station, process_vars, carry_thru_vars, diagnostics = diagnostics)
            if plots or diagnostics:
                print "{:35s}  {}\n".format("Total station record size :",len(station.time.data))
            else:
                logfile.write("{:35s}  {}\n".format("Total station record size :",len(station.time.data)))

            match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars)


        # select neighbours
        neighbour_distances  = distances[st,:]
        neighbour_bearings   = angles[st,:]

        # have to add in start index so that can use location in distance file.
        # neighbours = n_utils.get_neighbours(st+startindex, np.float(stat[3]), neighbour_distances, neighbour_bearings, neighbour_elevations)

        # return all neighbours up to a limit from the distance and elevation offsets (500km and 300m respectively)
        neighbours, neighbour_quadrants = n_utils.get_all_neighbours(st+startindex, np.float(stat[3]), neighbour_distances, neighbour_bearings, neighbour_elevations)

        if plots or diagnostics:
            print "{:14s} {:10s} {:10s}".format("Neighbour","Distance","Elevation")
            for n in neighbours:
                print "{:14s} {:10.1f} {:10.1f}".format(neighbour_ids[n],neighbour_distances[n],neighbour_elevations[n])

        else:
            logfile.write("{:14s} {:10s} {:10s}\n".format("Neighbour","Distance","Elevation"))
            for n in neighbours:
                logfile.write("{:14s} {:10.1f} {:10.1f}\n".format(neighbour_ids[n],neighbour_distances[n],neighbour_elevations[n]))

        # if sufficient neighbours
        if len(neighbours) >= 3:

            for variable, col in FLAG_OUTLIER_DICT.items():
                # NOTE - this requires multiple reads of the same file
                #      but does make it easier to understand and code

                st_var = getattr(station, variable)

                if plots or diagnostics:
                    print "Length of {} record: {}".format(variable, len(st_var.data.compressed()))
                else:
                    logfile.write("Length of {} record: {}\n".format(variable, len(st_var.data.compressed())))

                
                if len(st_var.data.compressed()) > 0:

                    final_neighbours = n_utils.select_neighbours(station, variable, neighbour_info[neighbours], neighbours, neighbour_distances[neighbours], neighbour_quadrants, NETCDF_DATA_LOCS, DATASTART, DATAEND, logfile, second = second, diagnostics = diagnostics, plots = plots)


                    # now read in final set of neighbours and process

                    neigh_flags = np.zeros(len(station.time.data)) # count up how many neighbours think this obs is bad
                    neigh_count = np.zeros(len(station.time.data)) # number of neighbours at each time stamp
                    dpd_flags = np.zeros(len(station.time.data)) # accumulated dewpoint depression flags (column 31) from neighbours
                    reporting_accuracies = np.zeros(len(neighbours)) # reporting accuracy of each neighbour

                    all_data = np.ma.zeros([len(final_neighbours), len(station.time.data)]) # store all the neighbour values

                    for nn, nn_loc in enumerate(final_neighbours):

                        neigh_details = neighbour_info[nn_loc]
                        neigh = utils.Station(neigh_details[0], float(neigh_details[1]), float(neigh_details[2]), float(neigh_details[3]))

                        if first:
                            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, neigh.id + "_internal.nc"), neigh, [variable], diagnostics = diagnostics, read_input_station_id = False)
                        elif second:
                            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, neigh.id + "_internal2.nc"), neigh, [variable], diagnostics = diagnostics, read_input_station_id = False)

                        dummy = utils.create_fulltimes(neigh, [variable], DATASTART, DATAEND, [], do_input_station_id = False)

                        all_data[nn, :] = utils.apply_filter_flags(getattr(neigh, variable))

                        if diagnostics:
                            print neigh_details

                        n_utils.detect(station, neigh, variable, neigh_flags, neigh_count, DATASTART, DATAEND, distance = neighbour_distances[nn_loc], diagnostics = diagnostics, plots = plots)

                        reporting_accuracies[nn] = utils.reporting_accuracy(getattr(neigh,variable).data)

                        dpd_flags += neigh.qc_flags[:,31]
                    # gone through all neighbours


                    # if at least 2/3 of neighbours have flagged this point (and at least 3 neighbours)
                    some_flags, = np.where(neigh_flags > 0)            
                    outlier_locs, = np.where(np.logical_and((neigh_count[some_flags] >= 3),(neigh_flags[some_flags].astype("float")/neigh_count[some_flags] > 2./3.)))
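                    # e.g. at a timestamp where 4 neighbours report and 3 of
                    # them flag the observation, 3/4 > 2/3 so it is marked as
                    # an outlier; timestamps with fewer than 3 reporting
                    # neighbours carry too little evidence and are set to -1
                    # just below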

                    # flag where < 3 neighbours
                    locs = np.where(neigh_count[some_flags] < 3)
                    station.qc_flags[some_flags[locs], col] = -1

                    if len(outlier_locs) >= 1:
                        station.qc_flags[some_flags[outlier_locs], col] = 1

                        # print number flagged and copy into attribute
                        if plots or diagnostics:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs), noWrite = True)
                        else:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs))
                        st_var = getattr(station, variable)
                        st_var.flags[some_flags[outlier_locs]] = 1

                    else:
                        if plots or diagnostics:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs), noWrite = True)
                        else:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs))


                    if plots:
                        n_utils.plot_outlier(station, variable, some_flags[outlier_locs], all_data, DATASTART)

                    # unflagging using neighbours
                    n_utils.do_unflagging(station, variable, all_data, reporting_accuracies, neigh_count, dpd_flags, FLAG_COL_DICT, DATASTART, logfile, plots = plots, diagnostics = diagnostics)

                else:
                    if plots or diagnostics:
                        print "No observations to assess for {}".format(variable)
                    else:
                        logfile.write("No observations to assess for {}\n".format(variable))
                    


            # variable loop
        else:
            if plots or diagnostics:
                print "Fewer than 3 neighbours"
            else:
                logfile.write("Fewer than 3 neighbours\n")

        print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S")
        print "processing took {:4.0f}s\n\n".format(time.time() - process_start_time)

        # end of neighbour check
        utils.append_history(station, "Neighbour Outlier Check")

        # clean up months 

        qc_tests.clean_up.clu(station, ["temperatures","dewpoints","windspeeds","winddirs","slp"], [44,45,46,47,48], FLAG_COL_DICT, DATASTART, DATAEND, logfile, plots = plots)


        if diagnostics or plots: raw_input("stop")

        # masking (at least call from here - optional call from internal?)

        # write to file
        if first:
            ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_external.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS,'attributes.dat'), opt_var_list = carry_thru_vars, compressed = match_to_compress, processing_date = '', qc_code_version = qc_code_version)
            # gzip the raw file
        elif second:
            ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_external2.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS,'attributes.dat'), opt_var_list = carry_thru_vars, compressed = match_to_compress, processing_date = '', qc_code_version = qc_code_version)
            # gzip the raw file
 

        # masking - apply the flags and copy masked data to flagged_obs attribute
        if masking:

            station = utils.mask(station, process_vars, logfile)

        # write to file
            if first:
                ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS,'attributes.dat'), opt_var_list = carry_thru_vars, compressed = match_to_compress, processing_date = '', qc_code_version = qc_code_version)
            elif second:
                ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask2.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS,'attributes.dat'), opt_var_list = carry_thru_vars, compressed = match_to_compress, processing_date = '', qc_code_version = qc_code_version)

        if plots or diagnostics:
            print "Masking completed\n"
            print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n")
            print "processing took {:4.0f}s\n\n".format(time.time() - process_start_time)
        else:
            logfile.write("Masking completed\n")
            logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write("processing took {:4.0f}s\n\n".format(time.time() - process_start_time))
            logfile.close()
            
    # gzip up all the raw files
    if doZip:
        for st, stat in enumerate(station_info):       
            if first:
                subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_internal.nc")])
                if masking:
                    subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_external.nc")])
                    subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_mask.nc")])

            elif second:
                subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_internal2.nc")])
                if masking:
                    subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_external2.nc")])
                    subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_mask2.nc")])

    print "Neighbour Checks completed\n"

    return # neighbour_checks 
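
A calling sketch for the function above, under the same station-list
assumption as the driver in Example #1; distances and angles are optional and
are recomputed via get_distances_angles when not supplied:

station_info = np.genfromtxt(os.path.join(INPUT_FILE_LOCS, STATION_LIST),
                             dtype=(str))

# first pass, applying the flags so that the *_mask.nc files are written
neighbour_checks(station_info, masking=True)

# second pass, reusing precomputed matrices and zipping the outputs
distances, angles = get_distances_angles(station_info)
neighbour_checks(station_info, distances=distances, angles=angles,
                 second=True, masking=True, doZip=True)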
Example #3
    if stat[0] == input_id:

        # set up station
        station = utils.Station(stat[0], float(stat[1]), float(stat[2]),
                                float(stat[3]))
        break
else:
    sys.exit(0)

# read attributes and qc_flags
ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_external.nc"),
           station,
           process_vars, [],
           read_qc_flags=True)

match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART,
                                           DATAEND, [])

# nyears x 12 months
month_start_locs = np.array(utils.month_starts(DATASTART,
                                               DATAEND)).reshape(-1, 12)

# which years
years = DATASTART.year + np.arange(month_start_locs.shape[0])
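# note (an assumed reading of utils.month_starts): month_start_locs[y, m]
# holds the index into the padded hourly record of the first hour of month
# m+1 in year DATASTART.year + y, so month_start_locs[0, 0] is 0, the very
# start of the record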

# find which year and test to plot
year_loc, = np.where(years == year)
test_loc, = np.where(qc_test == test)[0]

# and get the plot range
if year != DATAEND.year - 1:
    plot_range = (month_start_locs[year_loc, 0], month_start_locs[year_loc + 1,
                                                                  0])
else:
    plot_range = (month_start_locs[year_loc, 0], -1)  # misses last hour
Example #4

for st,stat in enumerate(station_info):  

    if stat[0] == input_id:

        # set up station
        station = utils.Station(stat[0], float(stat[1]), float(stat[2]), float(stat[3]))
        break
else:
    sys.exit(0)

# read attributes and qc_flags
ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_external.nc"), station, process_vars, [])

match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, [])

# nyears x 12 months
month_start_locs = np.array(utils.month_starts(DATASTART, DATAEND)).reshape(-1,12)

# which years
years = DATASTART.year + np.arange(month_start_locs.shape[0])

for year in range(DATASTART.year, DATAEND.year):

    year_loc, = np.where(years == year)

    if year != DATAEND.year - 1:
        plot_range = (month_start_locs[year_loc,0], month_start_locs[year_loc+1,0])
    else:
        plot_range = (month_start_locs[year_loc,0], -1) # misses last hour
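
The resulting (start, end) pair indexes straight into the station's padded
hourly arrays; a hypothetical use of one year's slice (the variable names
below are illustrative):

    start, end = plot_range
    times_for_year = station.time.data[start:end]
    temps_for_year = station.temperatures.data[start:end]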
Example #5
def neighbour_checks(station_info, restart_id = "", end_id = "", distances=np.array([]), angles=np.array([]), second = False, masking = False, doZip=False, plots = False, diagnostics = False):
    """
    Run through neighbour checks on list of stations passed
    
    :param list station_info: list of lists - [[ID, lat, lon, elev]] - strings
    :param str restart_id: which station to start on
    :param str end_id: which station to end on
    :param array distances: array of distances between station pairs
    :param array angles: array of angles between station pairs
    :param bool second: do the second run
    :param bool masking: apply the flags to the data to mask the observations.
    :param bool doZip: gzip the raw files once all stations are processed
    :param bool plots: create plots from each test
    :param bool diagnostics: print extra material to screen

    """
    first = not second

    qc_code_version = subprocess.check_output(['svnversion']).strip()

    # if distances and angles not calculated, then do so
    if (len(distances) == 0) or (len(angles) == 0):
        print "calculating distances and bearings matrix"
        distances, angles = get_distances_angles(station_info)

    # extract before truncate the array
    neighbour_elevations = np.array(station_info[:,3], dtype=float) 
    neighbour_ids        = np.array(station_info[:,0])
    neighbour_info       = np.array(station_info[:,:])

    # sort truncated run
    startindex = 0
    if restart_id != "":
        startindex, = np.where(station_info[:,0] == restart_id)


    if end_id != "":
        endindex, = np.where(station_info[:,0] == end_id)
        if endindex != len(station_info) -1:
            station_info = station_info[startindex: endindex+1]
            distances = distances[startindex:endindex+1,:]
            angles = angles[startindex:endindex+1,:]
        else:
            station_info = station_info[startindex:]
            distances = distances[startindex:,:]
            angles = angles[startindex:,:]
    else:
        station_info = station_info[startindex:]
        distances = distances[startindex:,:]
        angles = angles[startindex:,:]
        

    # process each neighbour
    for st, stat in enumerate(station_info):       

        print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S")
        print "Neighbour Check"
        print "{:35s} {}".format("Station Identifier :", stat[0])

        if not plots and not diagnostics:
            logfile = file(LOG_OUTFILE_LOCS+stat[0]+'.log','a') # append to file if second iteration.
            logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write("Neighbour Check\n")
            logfile.write("{:35s} {}\n".format("Station Identifier :", stat[0]))
        else:
            logfile = ""

        process_start_time = time.time()

        station = utils.Station(stat[0], float(stat[1]), float(stat[2]), float(stat[3]))

        # if running through the first time
        if first:

            if os.path.exists(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc.gz")):
                # if gzip file, unzip here
                subprocess.call(["gunzip",os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc.gz")])
                time.sleep(5) # make sure it is unzipped before proceeding

            # read in the data
            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal.nc"), station, process_vars, carry_thru_vars, diagnostics = diagnostics)

            if plots or diagnostics:
                print "{:35s}  {}\n".format("Total station record size :",len(station.time.data))
            else:
                logfile.write("{:35s}  {}\n".format("Total station record size :",len(station.time.data)))

            match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars)

        # or if second pass through?
        elif second:
            if os.path.exists(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc.gz")):
                # if gzip file, unzip here
                subprocess.call(["gunzip",os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc.gz")])
                time.sleep(5) # make sure it is unzipped before proceeding

            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_internal2.nc"), station, process_vars, carry_thru_vars, diagnostics = diagnostics)
            if plots or diagnostics:
                print "{:35s}  {}\n".format("Total station record size :",len(station.time.data))
            else:
                logfile.write("{:35s}  {}\n".format("Total station record size :",len(station.time.data)))

            match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars)


        # select neighbours
        neighbour_distances  = distances[st,:]
        neighbour_bearings   = angles[st,:]

        # have to add in start index so that can use location in distance file.
        # neighbours = n_utils.get_neighbours(st+startindex, np.float(stat[3]), neighbour_distances, neighbour_bearings, neighbour_elevations)

        # return all neighbours up to a limit from the distance and elevation offsets (500km and 300m respectively)
        neighbours, neighbour_quadrants = n_utils.get_all_neighbours(st+startindex, np.float(stat[3]), neighbour_distances, neighbour_bearings, neighbour_elevations)

        if plots or diagnostics:
            print "{:14s} {:10s} {:10s}".format("Neighbour","Distance","Elevation")
            for n in neighbours:
                print "{:14s} {:10.1f} {:10.1f}".format(neighbour_ids[n],neighbour_distances[n],neighbour_elevations[n])

        else:
            logfile.write("{:14s} {:10s} {:10s}\n".format("Neighbour","Distance","Elevation"))
            for n in neighbours:
                logfile.write("{:14s} {:10.1f} {:10.1f}\n".format(neighbour_ids[n],neighbour_distances[n],neighbour_elevations[n]))

        # if sufficient neighbours
        if len(neighbours) >= 3:

            for variable, col in FLAG_OUTLIER_DICT.items():
                # NOTE - this requires multiple reads of the same file
                #      but does make it easier to understand and code

                st_var = getattr(station, variable)

                if plots or diagnostics:
                    print "Length of {} record: {}".format(variable, len(st_var.data.compressed()))
                else:
                    logfile.write("Length of {} record: {}\n".format(variable, len(st_var.data.compressed())))

                
                if len(st_var.data.compressed()) > 0:

                    final_neighbours = n_utils.select_neighbours(station, variable, neighbour_info[neighbours], neighbours, neighbour_distances[neighbours], neighbour_quadrants, NETCDF_DATA_LOCS, DATASTART, DATAEND, logfile, second = second, diagnostics = diagnostics, plots = plots)


                    # now read in final set of neighbours and process

                    neigh_flags = np.zeros(len(station.time.data)) # count up how many neighbours think this obs is bad
                    neigh_count = np.zeros(len(station.time.data)) # number of neighbours at each time stamp
                    dpd_flags = np.zeros(len(station.time.data)) # accumulated dewpoint depression flags (column 31) from neighbours
                    reporting_accuracies = np.zeros(len(neighbours)) # reporting accuracy of each neighbour

                    all_data = np.ma.zeros([len(final_neighbours), len(station.time.data)]) # store all the neighbour values

                    for nn, nn_loc in enumerate(final_neighbours):

                        neigh_details = neighbour_info[nn_loc]
                        neigh = utils.Station(neigh_details[0], float(neigh_details[1]), float(neigh_details[2]), float(neigh_details[3]))

                        if first:
                            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, neigh.id + "_internal.nc"), neigh, [variable], diagnostics = diagnostics, read_input_station_id = False)
                        elif second:
                            ncdfp.read(os.path.join(NETCDF_DATA_LOCS, neigh.id + "_internal2.nc"), neigh, [variable], diagnostics = diagnostics, read_input_station_id = False)

                        dummy = utils.create_fulltimes(neigh, [variable], DATASTART, DATAEND, [], do_input_station_id = False)

                        all_data[nn, :] = utils.apply_filter_flags(getattr(neigh, variable))

                        if diagnostics:
                            print neigh_details

                        n_utils.detect(station, neigh, variable, neigh_flags, neigh_count, DATASTART, DATAEND, distance = neighbour_distances[nn_loc], diagnostics = diagnostics, plots = plots)

                        reporting_accuracies[nn] = utils.reporting_accuracy(getattr(neigh,variable).data)

                        dpd_flags += neigh.qc_flags[:,31]
                    # gone through all neighbours


                    # if at least 2/3 of neighbours have flagged this point (and at least 3 neighbours)
                    some_flags, = np.where(neigh_flags > 0)            
                    outlier_locs, = np.where(np.logical_and((neigh_count[some_flags] >= 3),(neigh_flags[some_flags].astype("float")/neigh_count[some_flags] > 2./3.)))

                    # flag where < 3 neighbours
                    locs = np.where(neigh_count[some_flags] < 3)
                    station.qc_flags[some_flags[locs], col] = -1

                    if len(outlier_locs) >= 1:
                        station.qc_flags[some_flags[outlier_locs], col] = 1

                        # print number flagged and copy into attribute
                        if plots or diagnostics:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs), noWrite = True)
                        else:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs))
                        st_var = getattr(station, variable)
                        st_var.flags[some_flags[outlier_locs]] = 1

                    else:
                        if plots or diagnostics:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs), noWrite = True)
                        else:
                            utils.print_flagged_obs_number(logfile, "Neighbour", variable, len(outlier_locs))


                    if plots:
                        n_utils.plot_outlier(station, variable, some_flags[outlier_locs], all_data, DATASTART)

                    # unflagging using neighbours
                    n_utils.do_unflagging(station, variable, all_data, reporting_accuracies, neigh_count, dpd_flags, FLAG_COL_DICT, DATASTART, logfile, plots = plots, diagnostics = diagnostics)

                else:
                    if plots or diagnostics:
                        print "No observations to assess for {}".format(variable)
                    else:
                        logfile.write("No observations to assess for {}\n".format(variable))
                    

            # variable loop
        else:
            if plots or diagnostics:
                print "Fewer than 3 neighbours"
            else:
                logfile.write("Fewer than 3 neighbours\n")

        print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S")
        print "processing took {:4.0f}s\n\n".format(time.time() - process_start_time)

        # end of neighbour check
        utils.append_history(station, "Neighbour Outlier Check")
        
        # clean up months 

        qc_tests.clean_up.clu(station, ["temperatures","dewpoints","slp","windspeeds","winddirs"], [44,45,46,47,48], FLAG_COL_DICT, DATASTART, DATAEND, logfile, plots = plots, diagnostics = diagnostics)


        if diagnostics or plots: raw_input("stop")

        # masking (at least call from here - optional call from internal?)

        # write to file
        if first:
            ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_external.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS,'attributes.dat'), opt_var_list = carry_thru_vars, compressed = match_to_compress, processing_date = '', qc_code_version = qc_code_version)
            # gzip the raw file
        elif second:
            ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_external2.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS,'attributes.dat'), opt_var_list = carry_thru_vars, compressed = match_to_compress, processing_date = '', qc_code_version = qc_code_version)
            # gzip the raw file
 

        # masking - apply the flags and copy masked data to flagged_obs attribute
        if masking:

            station = utils.mask(station, process_vars, logfile, FLAG_COL_DICT)

            # write to file
            if first:
                ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS,'attributes.dat'), opt_var_list = carry_thru_vars, compressed = match_to_compress, processing_date = '', qc_code_version = qc_code_version)
            elif second:
                ncdfp.write(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask2.nc"), station, process_vars, os.path.join(INPUT_FILE_LOCS,'attributes.dat'), opt_var_list = carry_thru_vars, compressed = match_to_compress, processing_date = '', qc_code_version = qc_code_version)

        if plots or diagnostics:
            print "Masking completed\n"
            print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n")
            print "processing took {:4.0f}s\n\n".format(time.time() - process_start_time)
        else:
            logfile.write("Masking completed\n")
            logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write("processing took {:4.0f}s\n\n".format(time.time() - process_start_time))
            logfile.close()
            
    # looped through all stations

    # gzip up all the raw files
    if doZip:
        for st, stat in enumerate(station_info):       
            if first:
                subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_internal.nc")])
                if masking:
                    subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_external.nc")])
                    subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_mask.nc")])

            elif second:
                subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_internal2.nc")])
                if masking:
                    subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_external2.nc")])
                    subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, stat[0]+"_mask2.nc")])

    print "Neighbour Checks completed\n"

    return # neighbour_checks 
Example #6
def internal_checks(restart_id = "", end_id = "",
                    all_checks = True,
                    duplicate = False,
                    odd = False,
                    frequent = False,
                    diurnal = False,
                    gap = False,
                    records = False,
                    streaks = False,
                    climatological = False,
                    spike = False,
                    humidity = False,
                    cloud = False,
                    variance = False, 
                    winds = False, 
                    pressure = False,
                    precipitation = False,
                    diagnostics = False,
                    plots = False,
                    doMonth = False):
    '''
    Run through internal checks on list of stations passed
    
    :param str restart_id: which station to start on
    :param str end_id: which station to end on

    :param bool all_checks: run all the checks

    :param bool duplicate/odd/frequent/diurnal/gap/records/streaks/climatological/spike/humidity/cloud/variance/winds/pressure/precipitation: run each test separately
    :param bool diagnostics: print extra material to screen
    :param bool plots: create plots from each test [many files if all stations/all tests]
    :param bool doMonth: a monthly append process

    '''

    if all_checks:
        duplicate = True
        odd = True
        frequent = True
        diurnal = True
        gap = True
        records = True
        streaks = True
        climatological = True
        spike = True
        humidity = True
        cloud = True
        variance = True
        winds = True
        pressure = True
        precipitation = True
    else:
        print "single tests selected"
        
#    qc_code_version = subprocess.check_output(['svnversion']).strip()
    qc_code_version = subprocess.check_output(['svn', 'info', 'file:///home/h05/rdunn/svn/hadisd_py_qc/branches/monthly/'])
    for line in qc_code_version.split("\n"):
        if line.split(":")[0] == "Revision":
            qc_code_version = line.split(":")[1]
            break

        
    # get station information
    try:
        station_info = np.genfromtxt(os.path.join(INPUT_FILE_LOCS, STATION_LIST), dtype=(str))
    except IOError:
        print "station list not found"
        sys.exit()

    # sort truncated run
    startindex = [0]
    if restart_id != "":
        startindex, = np.where(station_info[:,0] == restart_id)


    if end_id != "":
        endindex, = np.where(station_info[:,0] == end_id)
        if endindex != len(station_info) -1:
            station_info = station_info[startindex[0]: endindex[0]+1]
        else:
            station_info = station_info[startindex[0]:]
    else:
        station_info = station_info[startindex[0]:]
        

    for st,stat in enumerate(station_info):     

        # if st%100 != 0: continue # do every nth station
  
        print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S")
        print "{:35s} {:d}/{:d}".format("Station Number : ", st + 1, len(station_info))
        print "{:35s} {}".format("Station Identifier :", stat[0])
        if doMonth: print "Running with incomplete final year"

        # set up the log file
        logfile = file(LOG_OUTFILE_LOCS+stat[0]+'.log','w')
        logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
        logfile.write("Internal Checks\n")
        logfile.write("{:35s} {}\n".format("Station Identifier :", stat[0]))

        process_start_time = time.time()

        station = utils.Station(stat[0], float(stat[1]), float(stat[2]), float(stat[3]))

        # latitude and longitude check
        if np.abs(station.lat) > 90.:
            if plots or diagnostics:
                print "{} {} {} {} {} {} {}\n".format(\
                        station.id,"Latitude Check",DATASTART.year, DATAEND.year,"All", "Unphysical latitude {}".format(station.lat))
            else:
                logfile.write("{} {} {} {} {} {} {}\n".format(\
                        station.id,"Latitude Check",DATASTART.year, DATAEND.year,"All", "Unphysical latitude {}".format(station.lat)))
                logfile.close()

            continue

        # check if station longitude outside of bounds
        if np.abs(station.lon) > 180.:       
            if plots or diagnostics:
                print "{} {} {} {} {} {} {}\n".format(\
                    station.id,"Longitude Check",DATASTART.year, DATAEND.year,"All", "Unphysical longitude {}".format(station.lon))
            else:
                logfile.write("{} {} {} {} {} {} {}\n".format(\
                        station.id,"Longitude Check",DATASTART.year, DATAEND.year,"All", "Unphysical longitude {}".format(station.lon)))
                logfile.close()
            continue

        # check if file is zipped
        if os.path.exists(os.path.join(NETCDF_DATA_LOCS, "hadisd.{}_19310101-{}_{}_raw.nc.gz".format(LONG_VERSION, END_TIME, station.id))):
            # if gzip file, unzip here
            subprocess.call(["gunzip",os.path.join(NETCDF_DATA_LOCS, "hadisd.{}_19310101-{}_{}_raw.nc.gz".format(LONG_VERSION, END_TIME, station.id))])
            time.sleep(5) # make sure it is unzipped before proceeding

        # read in the data
        ncdfp.read(os.path.join(NETCDF_DATA_LOCS, "hadisd.{}_19310101-{}_{}_raw.nc".format(LONG_VERSION, END_TIME, station.id)), station, process_vars, opt_var_list = carry_thru_vars, diagnostics = diagnostics)

        if plots or diagnostics:
            print "{:35s}  {}\n".format("Total station record size :",len(station.time.data))
        else:
            logfile.write("{:35s}  {}\n".format("Total station record size :",len(station.time.data)))

        match_to_compress = utils.create_fulltimes(station, process_vars, DATASTART, DATAEND, carry_thru_vars)

        station.qc_flags = np.zeros([len(station.time.data),71]) # changed to include updated wind tests, station level pressure & precipitation

        # get reporting accuracies and frequencies.

        for var in process_vars:

            st_var = getattr(station, var)
            st_var.reporting_stats = utils.monthly_reporting_statistics(st_var, DATASTART, DATAEND)


        # Add history text to netcdf file
        # Reporting Changes - TODO

        # Duplicate months - check on temperature ONLY
        if duplicate:
            # no change as result of incomplete year
            qc_tests.duplicate_months.dmc(station, ['temperatures'], process_vars, [0], DATASTART, DATAEND, logfile, diagnostics = diagnostics, plots = plots)

        # Odd Clusters
        if odd:
            # no change as result of incomplete year
            qc_tests.odd_cluster.occ(station,['temperatures','dewpoints','windspeeds','slp'], [54,55,56,57], DATASTART, logfile, diagnostics = diagnostics, plots = plots)
            utils.apply_flags_from_A_to_B(station, "slp", "stnlp", diagnostics = diagnostics)
            utils.apply_flags_from_A_to_B(station, "windspeeds", "winddirs", diagnostics = diagnostics)

        # Frequent Values
        if frequent:
            qc_tests.frequent_values.fvc(station, ['temperatures', 'dewpoints','slp'], [1,2,3], DATASTART, DATAEND, logfile, diagnostics = diagnostics, plots = plots, doMonth = doMonth)
            utils.apply_flags_from_A_to_B(station, "slp", "stnlp", diagnostics = diagnostics)

        # Diurnal Cycle 
        if diurnal:
            if np.abs(station.lat) <= 60.:
                qc_tests.diurnal_cycle.dcc(station, ['temperatures'], process_vars, [4], DATASTART, DATAEND, logfile, diagnostics = diagnostics, plots = plots, doMonth = doMonth)
                
            else:
                if plots or diagnostics:
                    print "Diurnal Cycle Check not run as station latitude ({}) > 60\n".format(station.lat)
                else:
                    logfile.write("Diurnal Cycle Check not run as station latitude ({}) > 60\n".format(station.lat))

        # Distributional Gap
        if gap:
            qc_tests.distributional_gap.dgc(station, ['temperatures','dewpoints','slp'], [5,6,7], DATASTART, DATAEND, logfile, diagnostics = diagnostics, plots = plots, GH = True, doMonth = doMonth)
            utils.apply_flags_from_A_to_B(station, "slp", "stnlp", diagnostics = diagnostics)

        # Records 
        if records:
            qc_tests.records.krc(station, ['temperatures','dewpoints','windspeeds','slp'], [8,9,10,11], logfile, diagnostics = diagnostics, plots = plots)
            utils.apply_flags_from_A_to_B(station, "windspeeds", "winddirs", diagnostics = diagnostics)
            utils.apply_flags_from_A_to_B(station, "slp", "stnlp", diagnostics = diagnostics)

        # Streaks and Repetitions 
        if streaks:
            qc_tests.streaks.rsc(station, ['temperatures','dewpoints','windspeeds','slp','winddirs'], [[12,16,20],[13,17,21],[14,18,22],[15,19,23],[66,67,68]], DATASTART, DATAEND, logfile, diagnostics = diagnostics, plots = plots, doMonth = doMonth)
            utils.apply_flags_from_A_to_B(station, "windspeeds", "winddirs", diagnostics = diagnostics)
            utils.apply_flags_from_A_to_B(station, "slp", "stnlp", diagnostics = diagnostics)

        # Climatological Outlier
        if climatological:
            qc_tests.climatological.coc(station, ['temperatures','dewpoints'], [24,25], DATASTART, DATAEND, logfile, diagnostics = diagnostics, plots = plots, doMonth = doMonth)
            # column 26 kept spare for slp

        # Spike
        if spike:
            qc_tests.spike.sc(station, ['temperatures','dewpoints','slp','windspeeds'], [27,28,29,65], DATASTART, DATAEND, logfile, diagnostics = diagnostics, plots = plots, doMonth = doMonth)
            utils.apply_flags_from_A_to_B(station, "windspeeds", "winddirs", diagnostics = diagnostics)
            utils.apply_flags_from_A_to_B(station, "slp", "stnlp", diagnostics = diagnostics)

        # Humidity cross checks
        if humidity:
            qc_tests.humidity.hcc(station, [30,31,32], DATASTART, DATAEND, logfile, diagnostics = diagnostics, plots = plots)

        # Cloud cross check
        if cloud:
            qc_tests.clouds.ccc(station, [33,34,35,36,37,38,39,40], logfile, diagnostics = diagnostics, plots = plots)

        # Variance
        if variance:
            qc_tests.variance.evc(station, ['temperatures','dewpoints','slp','windspeeds'], [58,59,60,61], DATASTART, DATAEND, logfile, diagnostics = diagnostics, plots = plots, doMonth = doMonth) 
            utils.apply_flags_from_A_to_B(station, "windspeeds", "winddirs", diagnostics = diagnostics)
            utils.apply_flags_from_A_to_B(station, "slp", "stnlp", diagnostics = diagnostics)

        # Winds
        if winds:
            qc_tests.winds.wdc(station, [62,63,64], DATASTART, DATAEND, logfile, diagnostics = diagnostics, plots = plots, doMonth = doMonth)

        # Pressure
        if pressure:
            qc_tests.pressure.spc(station, [69], DATASTART, DATAEND, logfile, diagnostics = diagnostics, plots = plots, doMonth = doMonth)

        # Precipitation
        if precipitation:
            qc_tests.precipitation.pcc(station, [70], DATASTART, DATAEND, logfile, diagnostics = diagnostics, plots = plots)


        # are flags actually applied?
        sys.stdout.flush()
        if diagnostics or plots: raw_input("stop")

        # write to file
        ncdfp.write(os.path.join(NETCDF_DATA_LOCS, "hadisd.{}_19310101-{}_{}_internal.nc".format(LONG_VERSION, END_TIME, station.id)), station, process_vars, os.path.join(INPUT_FILE_LOCS,'attributes.dat'), opt_var_list = carry_thru_vars, compressed = match_to_compress, processing_date = dt.datetime.strftime(dt.datetime.now(), "%d-%b-%Y"), qc_code_version = qc_code_version)
        # gzip the raw file
        subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, "hadisd.{}_19310101-{}_{}_raw.nc".format(LONG_VERSION, END_TIME, station.id))])


        logfile.write(dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S\n"))
        logfile.write("processing took {:4.0f}s\n\n".format(time.time() - process_start_time))
        logfile.close()

        # clean up
        gc.collect()

    print "Internal Checks completed\n"

    return # internal_checks
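
Several of the tests above end with utils.apply_flags_from_A_to_B, which copies flags raised on one variable across to a dependent one (slp to stnlp, windspeeds to winddirs) so that both observations are withheld together. The helper itself is not shown in these examples; below is a minimal sketch of the idea, in which a hypothetical per-variable integer `flags` array stands in for whatever storage utils.Station actually uses.

import numpy as np

def apply_flags_from_A_to_B_sketch(station, var_A, var_B, diagnostics=False):
    # hypothetical layout: each variable object carries a same-length
    # integer flags array (0 = unflagged)
    flags_A = getattr(station, var_A).flags
    flags_B = getattr(station, var_B).flags

    # any observation flagged on A but not yet on B is also flagged on B
    newly_flagged, = np.where((flags_A != 0) & (flags_B == 0))
    flags_B[newly_flagged] = 1

    if diagnostics:
        print("{} flags copied from {} to {}".format(len(newly_flagged), var_A, var_B))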
Example #7
def make_hum_heat_vars(station_info,
                       restart_id="",
                       end_id="",
                       diagnostics=False,
                       plots=False):
    """
    Make the humidity and heat-stress variable netCDF files

    Make two sets of output files containing the humidity and heat-stress
    parameters calculated on an hourly basis from the QC'd HadISD data

    :param list station_info: station information list
    :param str restart_id: first station to process
    :param str end_id: last station to process
    :param bool diagnostics: verbose output to screen
    :param bool plots: make plots (placeholder)
    """

    # sort truncated run
    startindex = 0
    if restart_id != "":
        startindex, = np.where(station_info[:, 0] == restart_id)

    if end_id != "":
        endindex, = np.where(station_info[:, 0] == end_id)
        if endindex != len(station_info) - 1:
            station_info = station_info[startindex:endindex + 1]
        else:
            station_info = station_info[startindex:]
    else:
        station_info = station_info[startindex:]

    for st, stat in enumerate(station_info):
        print dt.datetime.strftime(dt.datetime.now(), "%A, %d %B %Y, %H:%M:%S")
        print "{:35s} {:d}/{:d}".format("Station Number : ", st + 1,
                                        len(station_info))
        print "{:35s} {}".format("Station Identifier :", stat[0])

        if plots or diagnostics:
            logfile = ""
        else:
            logfile = file(LOG_OUTFILE_LOCS + stat[0] + '.log', 'a')
            logfile.write(
                dt.datetime.strftime(dt.datetime.now(),
                                     "%A, %d %B %Y, %H:%M:%S\n"))
            logfile.write("Calculating Humidity and Heat Stress variables\n")
            logfile.write("{:35s} {}\n".format("Station Identifier :",
                                               stat[0]))
        process_start_time = time.time()

        station = utils.Station(stat[0], float(stat[1]), float(stat[2]),
                                float(stat[3]))
        if os.path.exists(
                os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc.gz")):
            # if gzip file, unzip here
            subprocess.call([
                "gunzip",
                os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc.gz")
            ])
            time.sleep(5)  # make sure it is unzipped before proceeding

        # read in the data
        ncdfp.read(os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc"),
                   station,
                   process_vars,
                   diagnostics=diagnostics,
                   read_qc_flags=False,
                   read_flagged_obs=False)

        match_to_compress = utils.create_fulltimes(station,
                                                   process_vars,
                                                   DATASTART,
                                                   DATAEND,
                                                   do_qc_flags=False,
                                                   do_flagged_obs=False)

        # run through the calculations; each one should add a new variable to the object.
        """
        1) Use Td and P to get e  [to get es, use T]
        2) Use e, P, Td and T to get Tw
        3) If Tw < 0C, recalculate e w.r.t. ice, and re-obtain Tw - keep both!
        4) Use e and P to calculate q
        5) Use e and es to get rh (using the appropriate es) - or q and qs

        What P to use if there is no measurement?  Using the monthly mean
        probably isn't appropriate in this instance.
        (A worked sketch of steps 1, 4 and 5 follows this example.)
        """

        station = humidity.run_calcs(station, logfile)

        # run through heat stress calculations

        station = heat_stress.run_calcs(station, logfile)

        if diagnostics or plots: raw_input("stop")

        # adjust this to work with the desired output file - will need a separate write function - output humidity in one set, heat indices in another?
        humidity_vars = [
            "temperatures", "dewpoints", "slp", "vapour_pressure",
            "saturation_vapour_pressure", "wetbulb_temperature",
            "specific_humidity", "relative_humidity"
        ]
        ncdfp.write(os.path.join(NETCDF_DATA_LOCS,
                                 station.id + "_humidity.nc"),
                    station,
                    humidity_vars,
                    os.path.join(INPUT_FILE_LOCS, 'attributes.dat'),
                    compressed=match_to_compress,
                    processing_date='',
                    qc_code_version='',
                    write_QC_flags=False,
                    write_flagged_obs=False,
                    least_significant_digit=5)

        heat_stress_vars = [
            "temperatures", "dewpoints", "windspeeds", "THI", "WBGT",
            "humidex", "apparent_t", "heat_index"
        ]
        ncdfp.write(os.path.join(NETCDF_DATA_LOCS,
                                 station.id + "_heat_stress.nc"),
                    station,
                    heat_stress_vars,
                    os.path.join(INPUT_FILE_LOCS, 'attributes.dat'),
                    compressed=match_to_compress,
                    processing_date='',
                    qc_code_version='',
                    write_QC_flags=False,
                    write_flagged_obs=False,
                    least_significant_digit=5)

        # gzip the output and mask files (currently disabled)
        # subprocess.call(["gzip","-f",os.path.join(NETCDF_DATA_LOCS, station.id + "_humidity.nc")])
        # subprocess.call(["gzip","-f",os.path.join(NETCDF_DATA_LOCS, station.id + "_heat_stress.nc")])
        # subprocess.call(["gzip",os.path.join(NETCDF_DATA_LOCS, station.id + "_mask.nc")])

        logfile.write(
            dt.datetime.strftime(dt.datetime.now(),
                                 "%A, %d %B %Y, %H:%M:%S\n"))
        logfile.write(
            "processing took {:4.0f}s\n\n".format(time.time() -
                                                  process_start_time))
        logfile.close()

        print "Humidity and Heat Stress Indices calculated"

    return  # make_hum_heat_vars
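
The numbered note above sets out the calculation chain. As a concrete illustration, here is a worked sketch of steps 1, 4 and 5 using a standard Magnus-type approximation (Bolton 1980 coefficients). The constants here are an assumption for illustration only; humidity.run_calcs may use different formulations, and it also handles the ice and wet-bulb steps.

import numpy as np

def saturation_vapour_pressure(t):
    # es in hPa for temperature t in degrees C, w.r.t. water (Bolton, 1980)
    return 6.112 * np.exp(17.67 * t / (t + 243.5))

def specific_humidity(e, p):
    # q in g/kg from vapour pressure e and pressure p, both in hPa
    return 1000. * (0.622 * e) / (p - 0.378 * e)

t, td, p = 20.0, 12.0, 1013.0             # deg C, deg C, hPa
e = saturation_vapour_pressure(td)        # step 1: e from the dewpoint
es = saturation_vapour_pressure(t)        # ... and es from the temperature
q = specific_humidity(e, p)               # step 4: ~8.7 g/kg
rh = 100. * e / es                        # step 5: ~60 per cent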
Example #8
def select_neighbours(station,
                      variable,
                      neighbour_info,
                      neighbours,
                      neighbour_distances,
                      neighbour_quadrants,
                      data_locs,
                      datastart,
                      dataend,
                      logfile,
                      diagnostics=False,
                      plots=False):
    '''
    From the list of nearby stations, select the ones which will be good neighbours for the test.
    Select on the basis of correlation, overlap of data points and bearing (quadrants).
    
    :param object station: station object
    :param str variable: which variable to process
    :param array neighbour_info: array of ID, lat, lon and elev
    :param array neighbours: which station sequence numbers are the nearby stations
    :param array neighbour_distances: distances to nearby stations
    :param array neighbour_quadrants: bearings to nearby stations (in 90deg bins)
    :param array data_locs: path to data files
    :param datetime datastart: start of data set
    :param datetime dataend: end of data set
    :param file logfile: logfile to store outputs
    :param boolean diagnostics: output diagnostic information
    :param boolean plots: make a plot

    :returns: final_locs - array of station sequence numbers to use.
    '''

    # set up storage arrays
    n_correlations = np.zeros(len(neighbours))
    n_distances = np.zeros(len(neighbours))
    n_quadrants = np.zeros(len(neighbours))
    n_overlaps = np.zeros(len(neighbours))
    combined_score = np.zeros(len(neighbours))

    # get station data
    st_var = getattr(station, variable)
    st_anomalies = hourly_daily_anomalies(st_var.data[:])

    # go through initial list and extract correlations and overlaps
    for nn, nn_loc in enumerate(neighbours):

        n_details = neighbour_info[nn]
        neigh = utils.Station(n_details[0], float(n_details[1]),
                              float(n_details[2]), float(n_details[3]))

        # read in this neighbour's own QC'd data file
        ncdfp.read(os.path.join(
            NETCDF_DATA_LOCS, "hadisd.{}_19310101-{}_{}_internal.nc".format(
                LONG_VERSION, END_TIME, neigh.id)),
                   neigh, [variable],
                   diagnostics=diagnostics,
                   read_input_station_id=False)

        dummy = utils.create_fulltimes(neigh, [variable],
                                       datastart,
                                       dataend, [],
                                       do_input_station_id=False)

        # get the correlations of data to this neighbour
        neigh_var = getattr(neigh, variable)
        neigh_anomalies = hourly_daily_anomalies(neigh_var.data[:])
        # correlation = np.ma.corrcoef(neigh_var.data, st_var.data)[1,0]
        correlation = np.ma.corrcoef(neigh_anomalies, st_anomalies)[1, 0]

        # overlap: fraction of this station's unmasked observations for which
        # the neighbour also has an unmasked value
        overlap = len(
            np.where(
                np.logical_or(neigh_var.data.mask, st_var.data.mask) == False)
            [0]) / float(len(st_var.data.compressed()))

        if not math.isnan(correlation):
            n_correlations[nn] = correlation
            n_overlaps[nn] = overlap
            combined_score[nn] = correlation + overlap
            n_distances[nn] = neighbour_distances[nn]
            n_quadrants[nn] = neighbour_quadrants[nn]

        # clear up to save memory
        del dummy
        del neigh_var
        del neigh_anomalies
        gc.collect()
    # sort in order of the combination of correlation and overlap
    sort_order = np.argsort(combined_score)[::-1]

    # and select the best 10
    # final_selection = neighbours[sort_order][:10]

    # sort out the quadrants

    locs1 = neighbours[sort_order][n_quadrants[sort_order] == 1]
    locs2 = neighbours[sort_order][n_quadrants[sort_order] == 2]
    locs3 = neighbours[sort_order][n_quadrants[sort_order] == 3]
    locs4 = neighbours[sort_order][n_quadrants[sort_order] == 4]

    final_locs = np.concatenate((locs1[:2], locs2[:2], locs3[:2], locs4[:2]),
                                axis=0).reshape(-1)

    # and add the rest in order of combined score
    for index in neighbours[sort_order]:
        if index not in final_locs:
            final_locs = np.append(final_locs, index)

        if len(final_locs) == N_NEIGHBOURS:
            break

    # output table showing distances, correlations, overlaps, the combined score and which ones were selected
    if plots or diagnostics:
        print "{:14s} {:10s} {:10s} {:10s} {:10s} {:10s} {:10s} {:10s}".format(
            "Neighbour", "Distance", "Elevation", "Correl'n", "Overlap",
            "Combined", "Quadrant", "Selected")
    else:
        logfile.write(
            "{:14s} {:10s} {:10s} {:10s} {:10s} {:10s} {:10s} {:10s}\n".format(
                "Neighbour", "Distance", "Elevation", "Correl'n", "Overlap",
                "Combined", "Quadrant", "Selected"))

    selected_correlations = []
    selected_overlaps = []
    for nn, nn_loc in enumerate(neighbours[sort_order]):

        selected = ""
        if nn_loc in final_locs:
            selected = "Y"
            if plots:
                selected_correlations += [n_correlations[sort_order[nn]]]
                selected_overlaps += [n_overlaps[sort_order[nn]]]

        neigh_details = neighbour_info[sort_order][nn]
        if plots or diagnostics:
            print "{:14s} {:10.1f} {:10.1f} {:10.5f} {:10.3f} {:10.3f} {:10.0f} {:10s}".format(
                neigh_details[0], n_distances[sort_order][nn],
                float(neigh_details[3]), n_correlations[sort_order][nn],
                n_overlaps[sort_order][nn], combined_score[sort_order][nn],
                n_quadrants[sort_order][nn], selected)
        else:
            logfile.write(
                "{:14s} {:10.1f} {:10.1f} {:10.5f} {:10.3f} {:10.3f} {:10.0f} {:10s}\n"
                .format(neigh_details[0], n_distances[sort_order][nn],
                        float(neigh_details[3]),
                        n_correlations[sort_order][nn],
                        n_overlaps[sort_order][nn],
                        combined_score[sort_order][nn],
                        n_quadrants[sort_order][nn], selected))

    # plot of correlations and overlaps, with selected stations highlighted
    if plots:
        import matplotlib.pyplot as plt

        plt.clf()
        plt.plot(n_correlations, n_overlaps, 'bo')
        plt.plot(selected_correlations, selected_overlaps, 'ro')
        plt.xlabel("correlations")
        plt.ylabel("data overlap")
        plt.title("{} - {}".format(station.id, variable))
        plt.show()

    return final_locs  # select_neighbours
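
Both this and the following version of select_neighbours correlate hourly_daily_anomalies of the two series rather than the raw data, but the anomaly function itself is not shown. One plausible construction (an assumption, not necessarily the HadISD definition) folds the hourly series into days and removes each day's mean, so the correlation reflects shared sub-daily and synoptic variability rather than the seasonal cycle.

import numpy as np

def hourly_daily_anomalies_sketch(data):
    # data : 1-D masked array of hourly values, length a multiple of 24
    daily = data.reshape(-1, 24)
    daily_means = np.ma.mean(daily, axis=1)
    return (daily - daily_means[:, None]).reshape(-1)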
Example #9
def select_neighbours(station, variable, neighbour_info, neighbours, neighbour_distances, neighbour_quadrants, data_locs, datastart, dataend, logfile, second = False, diagnostics = False, plots = False):
    '''
    From the list of nearby stations, select the ones which will be good neighbours for the test.
    Select on the basis of correlation, overlap of data points and bearing (quadrants).
    
    :param object station: station object
    :param str variable: which variable to process
    :param array neighbour_info: array of ID, lat, lon and elev
    :param array neighbours: which station sequence numbers are the nearby stations
    :param array neighbour_distances: distances to nearby stations
    :param array neighbour_quadrants: bearings to nearby stations (in 90deg bins)
    :param array data_locs: path to data files
    :param datetime datastart: start of data set
    :param datetime dataend: end of data set
    :param file logfile: logfile to store outputs
    :param boolean second: second run through
    :param boolean diagnostics: output diagnostic information
    :param boolean plots: make a plot

    :returns: final_locs - array of station sequence numbers to use.
    '''

    first = not second

    # set up storage arrays
    n_correlations = np.zeros(len(neighbours))
    n_distances = np.zeros(len(neighbours))
    n_quadrants = np.zeros(len(neighbours))
    n_overlaps = np.zeros(len(neighbours))
    combined_score = np.zeros(len(neighbours))

    # get station data
    st_var = getattr(station, variable)
    st_anomalies = hourly_daily_anomalies(st_var.data[:])

    # go through initial list and extract correlations and overlaps
    for nn, nn_loc in enumerate(neighbours):

        n_details = neighbour_info[nn]
        neigh = utils.Station(n_details[0], float(n_details[1]), float(n_details[2]), float(n_details[3]))

        if first:
            ncdfp.read(os.path.join(data_locs, neigh.id + "_internal.nc"), neigh, [variable], diagnostics = diagnostics, read_input_station_id = False)
        elif second:
            ncdfp.read(os.path.join(data_locs, neigh.id + "_internal2.nc"), neigh, [variable], diagnostics = diagnostics, read_input_station_id = False)

        dummy = utils.create_fulltimes(neigh, [variable], datastart, dataend, [], do_input_station_id = False)

        # get the correlations of data to this neighbour
        neigh_var = getattr(neigh, variable)
        neigh_anomalies = hourly_daily_anomalies(neigh_var.data[:])
        # correlation = np.ma.corrcoef(neigh_var.data, st_var.data)[1,0]
        correlation = np.ma.corrcoef(neigh_anomalies, st_anomalies)[1,0]

        overlap = len(np.where(np.logical_or(neigh_var.data.mask, st_var.data.mask) == False)[0])/float(len(st_var.data.compressed()))

        if not math.isnan(correlation):
            n_correlations[nn] = correlation
            n_overlaps[nn] = overlap
            combined_score[nn] = correlation + overlap
            n_distances[nn] = neighbour_distances[nn]
            n_quadrants[nn] = neighbour_quadrants[nn]
            
        # clear up to save memory
        del dummy
        del neigh_var
        del neigh_anomalies
        gc.collect()
    # sort in order of the combination of correlation and overlap
    sort_order = np.argsort(combined_score)[::-1]

    # and select the best 10
    # final_selection = neighbours[sort_order][:10]

    # sort out the quadrants
                    
    locs1 = neighbours[sort_order][n_quadrants[sort_order] == 1]
    locs2 = neighbours[sort_order][n_quadrants[sort_order] == 2]
    locs3 = neighbours[sort_order][n_quadrants[sort_order] == 3]
    locs4 = neighbours[sort_order][n_quadrants[sort_order] == 4]

    final_locs = np.concatenate((locs1[:2], locs2[:2], locs3[:2], locs4[:2]), axis = 0).reshape(-1)

    # and add the rest in order of combined score
    for index in neighbours[sort_order]:
        if index not in final_locs:
            final_locs = np.append(final_locs, index)
            
        if len(final_locs) == N_NEIGHBOURS:
            break

    # output table showing distances, correlations, overlaps, the combined score and which ones were selected
    if plots or diagnostics:
        print "{:14s} {:10s} {:10s} {:10s} {:10s} {:10s} {:10s} {:10s}".format("Neighbour","Distance","Elevation", "Correl'n", "Overlap", "Combined", "Quadrant","Selected")
    else:
        logfile.write("{:14s} {:10s} {:10s} {:10s} {:10s} {:10s} {:10s} {:10s}\n".format("Neighbour","Distance","Elevation", "Correl'n", "Overlap", "Combined", "Quadrant","Selected"))

    selected_correlations = []
    selected_overlaps = []
    for nn, nn_loc in enumerate(neighbours[sort_order]):

        selected = ""
        if nn_loc in final_locs: 
            selected = "Y"
            if plots:
                selected_correlations += [n_correlations[sort_order[nn]]]
                selected_overlaps += [n_overlaps[sort_order[nn]]]


        neigh_details = neighbour_info[sort_order][nn]
        if plots or diagnostics:
            print "{:14s} {:10.1f} {:10.1f} {:10.5f} {:10.3f} {:10.3f} {:10.0f} {:10s}".format(neigh_details[0], n_distances[sort_order][nn], float(neigh_details[3]), n_correlations[sort_order][nn], n_overlaps[sort_order][nn], combined_score[sort_order][nn], n_quadrants[sort_order][nn], selected)
        else:
            logfile.write("{:14s} {:10.1f} {:10.1f} {:10.5f} {:10.3f} {:10.3f} {:10.0f} {:10s}\n".format(neigh_details[0], n_distances[sort_order][nn], float(neigh_details[3]), n_correlations[sort_order][nn], n_overlaps[sort_order][nn], combined_score[sort_order][nn], n_quadrants[sort_order][nn], selected))

            
    # plot of correlations and overlaps, with selected stations highlighted
    if plots:
        import matplotlib.pyplot as plt

        plt.clf()
        plt.plot(n_correlations, n_overlaps, 'bo')
        plt.plot(selected_correlations, selected_overlaps, 'ro')
        plt.xlabel("correlations")
        plt.ylabel("data overlap")
        plt.title("{} - {}".format(station.id, variable))
        plt.show()

    return final_locs # select_neighbours
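
Both versions share the same selection logic: rank the candidates by correlation plus overlap, guarantee up to two neighbours from each 90-degree bearing quadrant, then pad with the best remaining scorers until N_NEIGHBOURS is reached. A self-contained toy run of that logic, with made-up scores and quadrants:

import numpy as np

N_NEIGHBOURS = 10                   # a module-level constant in the real code
neighbours = np.arange(10)          # candidate sequence numbers
combined_score = np.array([0.5, 1.9, 1.2, 0.1, 1.7, 0.8, 1.5, 0.3, 1.1, 0.6])
n_quadrants = np.array([1, 1, 2, 2, 3, 3, 4, 4, 1, 2])

sort_order = np.argsort(combined_score)[::-1]   # best combined score first

# up to two neighbours from each bearing quadrant ...
final_locs = np.concatenate(
    [neighbours[sort_order][n_quadrants[sort_order] == q][:2]
     for q in (1, 2, 3, 4)]).reshape(-1)

# ... then top up in order of combined score
for index in neighbours[sort_order]:
    if index not in final_locs:
        final_locs = np.append(final_locs, index)
    if len(final_locs) == N_NEIGHBOURS:
        break

print(final_locs)                   # [1 8 2 9 4 5 6 7 0 3]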