Пример #1
def record_parameters(profile, bgStdLevels, bgevStdLevels, origLevels,
                      ptLevels, bgLevels):
    """
    Pack the parameter arrays into the enbackground table for consumption
    by the buddy check.

    profile       -- profile object; only profile.uid() is used here.
    bgStdLevels, bgevStdLevels, origLevels, ptLevels, bgLevels
                  -- arrays to store; each is serialised with
                     main.pack_array before being written.

    One row is written with REPLACE INTO, binding six values positionally:
    the profile uid followed by the five packed arrays in the order of the
    parameter list. Returns None.
    """
    # Pack first, in parameter order; the VALUES clause has no column list,
    # so the order of this list is what maps values onto table columns.
    packed = [
        main.pack_array(levels)
        for levels in (bgStdLevels, bgevStdLevels, origLevels, ptLevels,
                       bgLevels)
    ]

    query = "REPLACE INTO enbackground VALUES(?,?,?,?,?,?);"
    main.dbinteract(query, [profile.uid()] + packed)
Пример #2
def process_row(uid, logdir):
    '''
    Run all tests on the indicated database row.

    uid    -- uid of the profile row to process.
    logdir -- directory receiving <uid>.stdout and <uid>.stderr, which
              capture everything printed while this row is processed.

    For every name in testNames the test is run on the profile and its
    packed result is written to the column of the same name in the table
    named by sys.argv[1]. A test that raises is recorded as a single
    passing (False) flag, and a failed database write is only logged, so
    that one failure does not abort the remaining tests. Returns None.
    '''

    # reroute stdout, stderr to separate files for each profile to preserve
    # logs; the previous streams are restored and the log files closed once
    # the row has been processed.
    saved_stdout, saved_stderr = sys.stdout, sys.stderr
    log_base = logdir + "/" + str(uid)
    with open(log_base + ".stdout", "w") as out_log, \
            open(log_base + ".stderr", "w") as err_log:
        sys.stdout, sys.stderr = out_log, err_log
        try:
            # extract profile
            profile = main.get_profile_from_db(uid)

            # mask out error codes in temperature data
            # NOTE(review): no masking call is present at this point in the
            # code as received - confirm whether one is expected here.

            # run tests
            for test in testNames:
                try:
                    result = run(test, [profile], parameterStore)[0]
                except Exception:
                    # single-argument print works as statement or function
                    print('{} exception {}'.format(test, sys.exc_info()))
                    result = np.zeros(1, dtype=bool)

                try:
                    # table and column names cannot be bound as parameters;
                    # the uid can.
                    query = ("UPDATE " + sys.argv[1] + " SET " + test +
                             "=? WHERE uid=?;")
                    main.dbinteract(
                        query, [main.pack_array(result), profile.uid()])
                except Exception:
                    print('db exception {}'.format(sys.exc_info()))
        finally:
            sys.stdout, sys.stderr = saved_stdout, saved_stderr
Пример #3
def test(p, parameters):
    """
    Runs the quality control check on profile p and returns a numpy array
    of quality control decisions with False where the data value has
    passed the check and True where it failed.

    p          -- profile under test; cruise(), uid(), year(), month(),
                  day(), time() and probe_type() are read from it.
    parameters -- dict; parameters["table"] names the database table that
                  holds the profiles and the en_track_check column.

    The result is a one-element boolean array. Every row sharing p's cruise
    and having a complete timestamp is assessed in the same call and each
    row's outcome is written to en_track_check, so a later call for another
    profile on the same track returns the stored value instead of
    recomputing it.
    """
    cruise = p.cruise()
    uid = p.uid()

    # don't bother if cruise == 0 or None, or if timestamp is corrupt
    if (cruise in [0, None]) or (None in [p.year(), p.month(), p.day(), p.time()]):
        return np.zeros(1, dtype=bool)

    # read and write the same table, so that stored results are found again
    table = parameters["table"]

    # don't bother if this has already been analyzed
    command = 'SELECT en_track_check FROM ' + table + ' WHERE uid = ' + str(uid) + ';'
    en_track_result = main.dbinteract(command)
    if en_track_result[0][0] is not None:
        en_track_result = main.unpack_row(en_track_result[0])[0]
        result = np.zeros(1, dtype=bool)
        result[0] = np.any(en_track_result)
        return result

    # some detector types cannot be assessed by this test; do not raise flag.
    if p.probe_type() is None:
        return np.zeros(1, dtype=bool)

    # fetch all profiles on track, sorted chronologically, earliest first (None sorted as highest)
    command = 'SELECT uid, year, month, day, time, lat, long, probe FROM ' + table + ' WHERE cruise = ' + str(cruise) + ' and year is not null and month is not null and day is not null and time is not null ORDER BY year, month, day, time, uid ASC;'
    track_rows = main.dbinteract(command)

    # start all as passing by default; keyed by uid (first column)
    EN_track_results = {row[0]: np.zeros(1, dtype=bool) for row in track_rows}

    # copy the list of headers;
    # remove entries as they are flagged.
    passed_rows = copy.deepcopy(track_rows)
    rejects = findOutlier(passed_rows, EN_track_results)
    while rejects != []:
        # rejects holds positions within the current passed_rows
        rejected = set(rejects)
        passed_rows = [row for index, row in enumerate(passed_rows) if index not in rejected]
        rejects = findOutlier(passed_rows, EN_track_results)

    # if more than half got rejected, reject everyone
    if len(passed_rows) < len(track_rows) / 2:
        for row in track_rows:
            EN_track_results[row[0]][0] = True

    # write all to db
    result = [(main.pack_array(EN_track_results[row[0]]), row[0]) for row in track_rows]
    query = "UPDATE " + table + " SET en_track_check=? WHERE uid=?"
    main.interact_many(query, result)

    return EN_track_results[uid]
Пример #4
Пример #5
def test(p, parameters):
    """
    Runs the quality control check on profile p and returns a numpy array
    of quality control decisions with False where the data value has
    passed the check and True where it failed.

    p          -- profile under test; its country code, cruise(),
                  originator_cruise() and uid() identify the track.
    parameters -- dict; parameters["table"] names the database table that
                  holds the profiles and the en_track_check column.

    The result is a one-element boolean array. Every usable row sharing
    p's cruise, country and originator cruise (with a complete timestamp)
    is assessed in the same call and each row's outcome is written to
    en_track_check, so a later call for another profile on the same track
    returns the stored value instead of recomputing it.
    """

    country = p.primary_header['Country code']
    cruise = p.cruise()
    originator_cruise = p.originator_cruise()
    uid = p.uid()
    # read and write the same table, so that stored results are found again
    table = parameters["table"]

    # don't bother if this has already been analyzed
    command = 'SELECT en_track_check FROM ' + table + ' WHERE uid = ' + str(uid) + ';'
    en_track_result = main.dbinteract(command)
    if en_track_result[0][0] is not None:
        en_track_result = main.unpack_row(en_track_result[0])[0]
        result = np.zeros(1, dtype=bool)
        result[0] = np.any(en_track_result)
        return result

    # make sure this profile makes sense in the track check
    if not assess_usability(p):
        return np.zeros(1, dtype=bool)

    # fetch all profiles on track, sorted chronologically, earliest first (None sorted as highest), then by uid.
    # The text values are bound as parameters rather than spliced into the
    # statement inside double quotes: SQLite resolves a double-quoted token
    # as a column name when one matches, and an embedded quote would break
    # the statement.
    command = 'SELECT uid, year, month, day, time, lat, long, probe, raw FROM ' + table + ' WHERE cruise = ' + str(cruise) + ' and country = ? and ocruise = ? and year is not null and month is not null and day is not null and time is not null ORDER BY year, month, day, time, uid ASC;'
    track_rows = main.dbinteract(command, [str(country), str(originator_cruise)])

    # avoid inappropriate profiles; the first and last characters of the
    # raw column are dropped before assessment
    # (presumably the quotes wrapped around the raw text when stored - verify)
    track_rows = [tr for tr in track_rows if assess_usability_raw(tr[8][1:-1])]

    # start all as passing by default; keyed by uid (first column)
    EN_track_results = {row[0]: np.zeros(1, dtype=bool) for row in track_rows}

    # copy the list of headers;
    # remove entries as they are flagged.
    passed_rows = copy.deepcopy(track_rows)
    rejects = findOutlier(passed_rows, EN_track_results)

    while rejects != []:
        # rejects holds positions within the current passed_rows
        rejected = set(rejects)
        passed_rows = [row for index, row in enumerate(passed_rows) if index not in rejected]
        rejects = findOutlier(passed_rows, EN_track_results)

    # if more than half got rejected, reject everyone
    if len(passed_rows) < len(track_rows) / 2:
        for row in track_rows:
            EN_track_results[row[0]][0] = True

    # write all to db
    result = [(main.pack_array(EN_track_results[row[0]]), row[0]) for row in track_rows]
    query = "UPDATE " + table + " SET en_track_check=? WHERE uid=?"
    main.interact_many(query, result)

    return EN_track_results[uid]
Пример #6
Пример #7
# Build an sqlite table of profiles read from a wod-ascii file.
#
# NOTE(review): this definition has evidently lost lines: the parameter
# list is never closed, the triple-quoted CREATE TABLE string is never
# terminated, and several `if`/`elif` headers have no body. It does not
# parse as shown; the notes below describe only what the visible lines do.
#
# Visible steps:
#   * connect to the sqlite database `outfile` and fetch the QC test names
#     with main.importQC('qctests');
#   * start a CREATE TABLE IF NOT EXISTS statement for `dbtable` with the
#     fixed columns listed, then append one BLOB column per lower-cased
#     test name;
#   * loop over profiles in `infile`, reading each with wod.WodProfile and
#     taking `end - start` characters as its raw text, stored wrapped in
#     single quotes;
#   * check p['uid'] against the `uids` list and run assessProfile on the
#     profile (the bodies of those branches are not visible);
#   * encode the truth array with encodeTruth and pack it, read the country
#     code and originator cruise, and compute `flagged` from the truth
#     values before the first level flagged by CSIRO_wire_break;
#   * count the profile as bad (flagged) or good, then INSERT one row;
#   * print the totals of clean, flagged and all written profiles.
#
# NOTE(review): `outfile`, `dbtable` and `check_originator_flag_type` are
# used but are not among the visible parameters; `months_to_use` is not
# used in the visible body.
# NOTE(review): `raw` is read starting at the position reached after
# parsing the profile (`end`), with no visible seek back to `start`, and
# nothing visible appends to `uids` - confirm against the complete source.
def builddb(infile,
            months_to_use=range(1, 13),

    conn = sqlite3.connect(outfile, isolation_level=None)
    cur = conn.cursor()

    # Identify tests
    testNames = main.importQC('qctests')

    # set up our table
    query = "CREATE TABLE IF NOT EXISTS " + dbtable + """(
                raw text,
                truth BLOB,
                uid integer PRIMARY KEY,
                year integer,
                month integer,
                day integer,
                time real,
                lat real,
                long real,
                country text,
                cruise integer,
                ocruise text,
                probe integer,
                training integer,
                flagged integer,
    for i in range(len(testNames)):
        query += testNames[i].lower() + ' BLOB'
        if i < len(testNames) - 1:
            query += ','
            query += ');'


    # populate table from wod-ascii data
    fid = open(infile)
    uids = []
    good = 0
    bad = 0

    while True:
        # extract profile as wodpy object and raw text
        start = fid.tell()
        profile = wod.WodProfile(fid)
        end = fid.tell()
        raw = fid.read(end - start)
        # set up dictionary for populating query string
        p = profile.npdict()
        p['raw'] = "'" + raw + "'"

        # check for duplicate profiles in raw data
        if p['uid'] in uids:
            if profile.is_last_profile_in_file(fid) == True:

        # skip pathological profiles
        isgood = assessProfile(profile, check_originator_flag_type,
        if not isgood and profile.is_last_profile_in_file(fid) == True:
        elif not isgood:

        # encode temperature error codes into truth array
        truth = encodeTruth(profile)
        p['truth'] = main.pack_array(truth)

        # extract country code
        country = profile.primary_header['Country code']

        # originator cruise
        orig_cruise = profile.originator_cruise()

        # keep tabs on how many good and how many bad profiles have been added to db
        # nowire == index of first wire break level
        wireqc = qctests.CSIRO_wire_break.test(profile, {})
            nowire = list(wireqc).index(True)
            nowire = len(truth)
        # flag only counts if its before the wire break:
        flagged = dbutils.summarize_truth(truth[0:nowire])
        if flagged:
            bad += 1
            good += 1

        query = "INSERT INTO " + dbtable + " (raw, truth, uid, year, month, day, time, lat, long, country, cruise, ocruise, probe, flagged) values (?,?,?,?,?,?,?,?,?,?,?,?,?,?);"
        values = (p['raw'], p['truth'], p['uid'], p['year'], p['month'],
                  p['day'], p['time'], p['latitude'], p['longitude'], country,
                  p['cruise'], orig_cruise, p['probe_type'], int(flagged))
        main.dbinteract(query, values, targetdb=outfile)
        if profile.is_last_profile_in_file(fid) == True:

    print('number of clean profiles written:', good)
    print('number of flagged profiles written:', bad)
    print('total number of profiles written:', good + bad)
Пример #8
# Read QC results from a database table into a pandas dataframe.
#
# NOTE(review): this definition has evidently lost lines: the parameter
# list is never closed, the description below is not delimited as a
# docstring, several expressions are cut mid-call and several branches have
# no body. It does not parse as shown; the comments describe only what the
# visible lines do. `targetdb`, `n_to_extract`, `filter_on_wire_break_test`,
# `filter_on_tests` and `applyparse` are used but are not among the visible
# parameters.
def db_to_df(table,
    Reads the table from targetdb into a pandas dataframe.
    If filter_on_wire_break_test is True, the results from that test are used to exclude
         levels below a wire break from the test results and the wire break test is not returned.
    filter_on_tests is a generalised form of filter_on_wire_break and is used to exclude results; it takes a list of
         [testname, action], where levels failing <testname> are excluded towards the surface (if action is 'up'), towards depth (if action is 'down') and the whole profile deleted (if action is 'remove').
    Set n_to_extract to limit the number of rows extracted to the specified number.

    # what tests are available
    testNames = main.importQC('qctests')

    # connect to database
    conn = sqlite3.connect(targetdb, isolation_level=None)
    cur = conn.cursor()

    # extract matrix of test results and true flags into a dataframe
    # The query selects uid, truth and one lower-cased column per test,
    # restricted to a random sample of n_to_extract uids.
    query = 'SELECT uid, truth'
    for test in testNames:
        query += ', ' + test.lower()
    query += ' FROM ' + table
    query += ' WHERE uid IN (SELECT uid FROM ' + table + ' ORDER BY RANDOM() LIMIT ' + str(
        n_to_extract) + ')'

    # NOTE(review): no cur.execute(query) is visible before this fetch.
    rawresults = cur.fetchall()

    # Rows are converted in slices of `sub` rows; the per-slice frames are
    # combined into df_final at the end of each pass.
    sub = 1000
    df_final = None
    for i in range(math.ceil(len(rawresults) / sub)):
        df = pandas.DataFrame(rawresults[i * sub:(i + 1) *
        df.columns = ['uid', 'Truth'] + testNames
        df = df.astype({'uid': 'int'})
        if filter_on_wire_break_test:
            # Truncate every QC column (and Truth) of each row to the number
            # of levels returned by get_n_levels_before_fail for the wire
            # break column, then drop that column.
            nlevels = get_n_levels_before_fail(df['CSIRO_wire_break'])
            del df['CSIRO_wire_break']  # No use for this now.
            testNames = df.columns[2:].values.tolist()
            # NOTE(review): this loop reuses `i`, the slice index of the
            # enclosing loop, which is tested again by `if i == 0` below.
            for i in range(len(df.index)):
                for j in range(1, len(df.columns)):
                    qc = unpack_qc(df.iloc[i, j])
                    # Some QC tests may return only one value so check for this.
                    if len(qc) > 1:
                        qc = qc[:nlevels[i]]
                    df.iat[i, j] = main.pack_array(qc)

        # filter_on_tests is iterated as a mapping from action name to a
        # collection of test names; row handling depends on the action.
        todrop = set()
        for action in filter_on_tests:
            # Check if the action is relevant.
            if action == 'Optional' or action == 'At least one from group':

            # Initialise variables.
            nlevels = -1
            outcomes = False
            qcresults = []
            for testname in filter_on_tests[action]:
                # NOTE(review): `i` is reused here as the row index as well.
                for i in range(0, len(df.index)):
                    if action == 'Remove above reject':
                        nlevels = get_reversed_n_levels_before_fail(
                    elif action == 'Remove below reject':
                        nlevels = get_n_levels_before_fail([df[testname][i]
                    elif action == 'Remove profile':
                        outcomes = check_for_fail([df[testname][i]])[0]
                    elif action == 'Remove rejected levels':
                        qcresults = unpack_qc_results([df[testname][i]])[0]
                        raise NameError('Unrecognised action: ' + action)

                    if (((action == 'Remove above reject'
                          or action == 'Remove below reject') and nlevels == 0)
                        (action == 'Remove profile' and outcomes == True) or
                        (action == 'Remove rejected levels'
                         and numpy.count_nonzero(qcresults == False) == 0)):
                        # Completely remove a profile if it has no valid levels or if it
                        # has a fail and the action is to remove.
                    elif (action != 'Remove profile'):
                        for j in range(1, len(df.columns)):
                            # Retain only the levels that passed testname.
                            # Some QC tests may return only one value so check for this.
                            qc = unpack_qc(df.iloc[i, j])
                            if len(qc) > 1:
                                if action == 'Remove above reject':
                                    qc = qc[nlevels:]
                                elif action == 'Remove below reject':
                                    qc = qc[:nlevels]
                                elif action == 'Remove rejected levels':
                                    qc = qc[qcresults == False]
                                df.iat[i, j] = main.pack_array(qc)

                del df[testname]  # No need to keep this any longer.
                df.reset_index(inplace=True, drop=True)

        # Drop the rows collected in todrop, renumber the index, and refresh
        # the list of remaining test columns (everything after uid, Truth).
        # NOTE(review): nothing visible adds entries to todrop.
        todrop = list(todrop)
        if len(todrop) > 0:
            df.drop(todrop, inplace=True)
        df.reset_index(inplace=True, drop=True)
        testNames = df.columns[2:].values.tolist()
        if applyparse:
            df[['Truth']] = df[['Truth']].apply(parse_truth)
            df[testNames] = df[testNames].apply(parse)

        # Accumulate this slice into the overall result.
        if i == 0:
            df_final = df
            df_final = pandas.concat([df_final, df])

    return df_final.reset_index(drop=True)
Пример #9
def run_qc(p, suspect):
    """
    Run the spike and step check on profile p.

    p       -- profile object; uid(), n_levels(), z(), t() and latitude()
               are read from it.
    suspect -- boolean. When true, the 'suspect' flags are wanted: a row
               already stored for this uid in the enspikeandstep table is
               returned as is, otherwise the flags are computed and stored
               there. When false, nothing is read from or written to that
               table and the whole-profile rejection rule is applied to
               the result.

    Returns the per-level boolean flags (True marks a flagged level), or
    the stored enspikeandstep value when one exists.
    """

    # check for pre-registered suspect tabulation, if that's what we want:
    if suspect:
        query = 'SELECT suspect FROM enspikeandstep WHERE uid = ?;'
        susp = main.dbinteract(query, [p.uid()])
        if len(susp) > 0:
            return main.unpack_row(susp[0])[0]

    n_levels = p.n_levels()

    # Define an array to hold results.
    qc = np.zeros(n_levels, dtype=bool)

    # Get depth and temperature values from the profile.
    z = p.z()
    t = p.t()

    # Find which levels have data. getmaskarray always gives a full boolean
    # array, also when an array carries no mask at all (np.ma.nomask).
    isTemperature = ~np.ma.getmaskarray(t)
    isDepth = ~np.ma.getmaskarray(z)
    isData = isTemperature & isDepth

    # Array to hold temperature differences between levels and gradients.
    dt, gt = composeDT(t, z, n_levels)

    # Spikes and steps detection.
    for i in range(1, n_levels):
        if i >= 2:
            # need data at this level and at the two levels before it
            if not (isData[i-2] and isData[i-1] and isData[i]):
                continue
            # weight from the position of level i-1 between its neighbours
            # when they are at least 5.0 apart in depth, else a fixed 0.5
            if z[i] - z[i-2] >= 5.0:
                wt1 = (z[i-1] - z[i-2]) / (z[i] - z[i-2])
            else:
                wt1 = 0.5
        else:
            if not (isData[i-1] and isData[i]):
                continue
            wt1 = 0.5

        dTTol = determineDepthTolerance(z[i-1], np.abs(p.latitude()))
        gTTol = 0.05

        # Check for low temperatures in the Tropics.
        # This might be more appropriate to appear in a separate EN regional
        # range check but is included here for now for consistency with the
        # original code.
        if (np.abs(p.latitude()) < 20.0 and z[i-1] < 1000.0 and
                t[i-1] < 1.0):
            dt[i] = np.ma.masked
            if suspect:
                qc[i-1] = True
            # NOTE(review): confirm whether the remaining conditions should
            # be skipped for this level now that dt[i] has been masked.

        qc, dt = conditionA(dt, dTTol, qc, wt1, i, suspect)
        qc, dt = conditionB(dt, dTTol, gTTol, qc, gt, i, suspect)
        qc = conditionC(dt, dTTol, z, qc, t, i, suspect)
    # End of loop over levels.

    # Step or 0.0 at the bottom of a profile.
    if isData[-1] and not np.ma.getmaskarray(dt)[-1]:
        dTTol = determineDepthTolerance(z[-1], np.abs(p.latitude()))
        if np.abs(dt[-1]) > dTTol:
            if suspect:
                qc[-1] = True
    if isTemperature[-1]:
        if t[-1] == 0.0:
            if suspect:
                qc[-1] = True

    # If 4 levels or more than half the profile is rejected then reject all.
    if not suspect:
        nRejects = np.count_nonzero(qc)
        if nRejects >= 4 or nRejects > n_levels / 2:
            qc[:] = True

    # register suspects, if computed, to db
    if suspect:
        query = "REPLACE INTO enspikeandstep VALUES(?,?);"
        main.dbinteract(query, [p.uid(), main.pack_array(qc)])

    return qc
Пример #10
Пример #11
Пример #12
            if profile.is_last_profile_in_file(fid) == True:

        # skip pathological profiles
        isgood = assessProfile(profile)
        if not isgood and profile.is_last_profile_in_file(fid) == True:
        elif not isgood:

        # encode temperature error codes into truth array
        truth = encodeTruth(profile)
        p['truth'] = main.pack_array(truth)

        # keep tabs on how many good and how many bad profiles have been added to db
        # nowire == index of first wire break level
        wireqc = qctests.CSIRO_wire_break.test(profile, {})
            nowire = list(wireqc).index(True)
            nowire = len(truth)
        # flag only counts if its before the wire break:
        flagged = dbutils.summarize_truth(truth[0:nowire])
        if flagged:
            bad += 1
            good += 1