def read_file_to_dataframe(filename):
    """Read a WOD ASCII file and return its profiles as a pandas DataFrame.

    The frame includes columns Temperature, Salinity, Depth, Year, Month,
    Day, Longitude, Latitude, Datetime (one row per profile).

    Parameters
    ----------
    filename : str
        Path to a WOD ASCII file readable by wodpy.

    Returns
    -------
    pandas.DataFrame
    """
    list_data = []
    with open(filename) as file:  # close the handle even on parse errors
        while True:
            profile = wod.WodProfile(file)
            list_data.append(_profile_record(profile))
            # WodProfile advances the handle; stop once the profile just
            # read was the final one in the file.
            if profile.is_last_profile_in_file(file):
                break
    return pd.DataFrame(list_data)


def _profile_record(profile):
    """Extract one wodpy profile into a flat dict (helper for read_file_to_dataframe).

    NOTE: the original implementation omitted 'Datetime' for the last
    profile only; every row now carries it, as the docstring promises.
    Assumes year/month/day are valid for datetime.datetime -- TODO confirm
    for pathological profiles with missing dates.
    """
    year = profile.year()
    month = profile.month()
    day = profile.day()
    return {
        'Year': year,
        'Month': month,
        'Day': day,
        'Longitude': profile.longitude(),
        'Latitude': profile.latitude(),
        'Salinity': profile.s(),
        'Temperature': profile.t(),
        'Depth': profile.z(),
        'Datetime': datetime.datetime(year, month, day),
    }
def setUp(self):
    """Load fixture profiles from the WOD13, IQuOD and pathological test files."""
    # Two consecutive profiles from the WOD13-format fixture. The first is the
    # worked example from pp 124 of
    # http://data.nodc.noaa.gov/woa/WOD/DOC/wodreadme.pdf; the second is an
    # example with missing salinity information.
    classic_fid = open("tests/testData/classic.dat")
    self.classic1 = wod.WodProfile(classic_fid)
    self.classic1_df = self.classic1.df()
    self.classic1_dict = self.classic1.npdict()
    self.classic1_head = self.classic1.header()
    self.classic2 = wod.WodProfile(classic_fid)

    # IQuOD 0.1 format data: a short example (unpacked by hand to validate),
    # followed by an example with some metadata.
    iquod_fid = open("tests/testData/iquod.dat")
    self.iquod1 = wod.WodProfile(iquod_fid)
    self.iquod1_df = self.iquod1.df()
    self.iquod1_dict = self.iquod1.npdict()
    self.iquod1_head = self.iquod1.header()
    self.iquod2 = wod.WodProfile(iquod_fid)
    self.iquod2_df = self.iquod2.df()
    self.iquod2_dict = self.iquod2.npdict()

    # Data with some interesting pathologies.
    path_fid = open("tests/testData/pathological.dat")
    self.path1 = wod.WodProfile(path_fid)
    self.path1_df = self.path1.df()
    self.path1_dict = self.path1.npdict()
    self.path1_head = self.path1.header()
    return
def setUp(self):
    """Prepare profiles and QC-test scaffolding for the test suite."""
    # Read the list of data files, then build lightweight profile stubs
    # (descriptions only, no level data) for every profile they contain.
    filenames = main.readInput('datafiles.json')
    profiles = main.extractProfiles(filenames)

    # identify and import tests
    testNames = main.importQC('qctests')
    testNames.sort()
    for testName in testNames:
        # NOTE(review): exec-based dynamic import; in Python 3, exec inside a
        # function does not reliably bind new local names -- presumably the
        # modules are reached via sys.modules elsewhere. Verify before refactoring.
        exec('from qctests import ' + testName)

    # Set up any keyword arguments needed by tests.
    kwargs = {'profiles': profiles}

    # Accumulators -- presumably filled by code later in this method
    # (continuation not visible in this chunk).
    testResults = []
    testVerbose = []
    trueResults = []
    trueVerbose = []
    firstProfile = True
    delete = []
    currentFile = ''
    self.profiles = []

    # Load the full data for each profile, reusing a single open file handle
    # and seeking to each profile's stored byte offset.
    for iprofile, pinfo in enumerate(profiles):
        # Load the profile data.
        if pinfo.file_name != currentFile:
            # Switch source files: close the previous handle (if any) first.
            if currentFile != '': f.close()
            currentFile = pinfo.file_name
            f = open(currentFile)
        # Seek only when not already positioned at the profile.
        if f.tell() != pinfo.file_position:
            f.seek(pinfo.file_position)
        self.profiles.append(wod.WodProfile(f))
def extractProfiles(filenames):
    '''
    Read all profiles from the files and store in a list. Only the profile
    descriptions are read, not the profile data, in order to avoid using
    too much memory.

    filenames: iterable of paths to WOD ASCII files.
    Returns a list of wod.WodProfile stubs (load_profile_data=False).
    '''
    profiles = []
    for filename in filenames:
        with open(filename) as f:
            # Read stubs until the one just read reports itself as last.
            profiles.append(wod.WodProfile(f, load_profile_data=False))
            while not profiles[-1].is_last_profile_in_file(f):
                profiles.append(wod.WodProfile(f, load_profile_data=False))

    # assert all elements of profiles are WodProfiles
    # (the original message concatenated a WodProfile with a str, which
    # would itself raise TypeError if the assert ever fired)
    for i in profiles:
        assert isinstance(i, wod.WodProfile), '{} is not a WodProfile'.format(i)

    return profiles
def setUp(self):
    """Build fixtures from an artificial profile that triggers the temperature flag.

    The data set the first temperature to 99.9; otherwise it is identical
    to data/example.dat.
    """
    fid = open("tests/testData/example.dat")
    self.demoProfile = wod.WodProfile(fid)
    self.dataframe = self.demoProfile.df()
    self.dictionary = self.demoProfile.npdict()
    self.head = self.demoProfile.header()
    return
def text2wod(raw):
    '''
    Given the raw text of a WOD ASCII profile, return a wodpy object
    representing the same.

    raw: profile content to parse; TemporaryFile is opened in binary mode,
         so this is presumably bytes -- TODO confirm at call sites.
    '''
    # Context manager guarantees the temp file is closed even if parsing
    # raises (the original leaked the handle on exception).
    with tempfile.TemporaryFile() as fProfile:
        # a file-like object containing only the profile from the queried row
        fProfile.write(raw)
        fProfile.seek(0)
        profile = wod.WodProfile(fProfile)
    return profile
def profileData(pinfo, currentFile, f):
    '''
    Takes a profile info stub as returned by extractProfiles and extracts
    the whole profile from file f, reusing the open handle when possible.
    Returns (profile, currentFile, f) so the caller can keep the handle.
    '''
    # The stub lives in a different file than the open handle: swap files.
    if currentFile != pinfo.file_name:
        if currentFile != '':
            f.close()
        currentFile = pinfo.file_name
        f = open(currentFile)
    # Jump to the profile's byte offset unless already positioned there.
    if pinfo.file_position != f.tell():
        f.seek(pinfo.file_position)
    return wod.WodProfile(f), currentFile, f
def get_profiles(ffile, N=-1):
    """Extract profiles from a WOD file.

    ffile: path to a WOD ASCII file.
    N: maximum number of profiles to return; N < 0 (default) means all.

    Returns a list of wod.WodProfile objects. Note the original version
    dropped the final profile of the file (it tested
    is_last_profile_in_file *before* appending); every profile is now kept,
    matching the other readers in this codebase.
    """
    pfs = []
    with open(ffile) as fid:  # close the handle when done
        counter = 0
        while True:
            pf = wod.WodProfile(fid)
            pfs.append(pf)
            counter += 1
            # Stop after the last profile in the file, or once N profiles
            # have been collected (when a non-negative cap was requested).
            if pf.is_last_profile_in_file(fid):
                break
            if N >= 0 and counter == N:
                break
    return pfs
def builddb(infile, check_originator_flag_type=True,
            months_to_use=range(1, 13), outfile='iquod.db', dbtable='iquod'):
    """Populate a sqlite table with profiles parsed from a WOD ASCII file.

    infile: path to the WOD ASCII source file.
    check_originator_flag_type, months_to_use: forwarded to assessProfile
        to decide which profiles are kept.
    outfile: sqlite database file to create/append to.
    dbtable: name of the table to create and fill.

    Side effects: creates the table if absent, inserts one row per unique,
    non-pathological profile, and prints a summary of clean/flagged counts.
    """
    conn = sqlite3.connect(outfile, isolation_level=None)
    cur = conn.cursor()

    # Identify tests
    testNames = main.importQC('qctests')
    testNames.sort()

    # set up our table: fixed metadata columns plus one BLOB column per QC test
    query = "CREATE TABLE IF NOT EXISTS " + dbtable + """(
        raw text,
        truth BLOB,
        uid integer PRIMARY KEY,
        year integer,
        month integer,
        day integer,
        time real,
        lat real,
        long real,
        country text,
        cruise integer,
        ocruise text,
        probe integer,
        training integer,
        flagged integer,
        """
    for i in range(len(testNames)):
        query += testNames[i].lower() + ' BLOB'
        if i < len(testNames) - 1:
            query += ','
        else:
            query += ');'
    cur.execute(query)

    # populate table from wod-ascii data
    fid = open(infile)
    uids = []   # uids seen so far, for duplicate detection
    good = 0    # profiles written with no temperature flag
    bad = 0     # profiles written with at least one flag before a wire break

    while True:
        # extract profile as wodpy object and raw text: record the byte span
        # the parser consumed, then re-read that span verbatim.
        start = fid.tell()
        profile = wod.WodProfile(fid)
        end = fid.tell()
        fid.seek(start)
        raw = fid.read(end - start)
        fid.seek(end)

        # set up dictionary for populating query string
        p = profile.npdict()
        p['raw'] = "'" + raw + "'"

        # check for duplicate profiles in raw data; skip them, but still
        # terminate if the duplicate was the file's final profile.
        if p['uid'] in uids:
            if profile.is_last_profile_in_file(fid) == True:
                break
            else:
                continue
        uids.append(p['uid'])

        # skip pathological profiles (same last-profile termination logic)
        isgood = assessProfile(profile, check_originator_flag_type, months_to_use)
        if not isgood and profile.is_last_profile_in_file(fid) == True:
            break
        elif not isgood:
            continue

        # encode temperature error codes into truth array
        truth = encodeTruth(profile)
        p['truth'] = main.pack_array(truth)

        # extract country code
        country = profile.primary_header['Country code']
        # originator cruise
        orig_cruise = profile.originator_cruise()

        # keep tabs on how many good and how many bad profiles have been added to db
        # nowire == index of first wire break level
        wireqc = qctests.CSIRO_wire_break.test(profile, {})
        try:
            nowire = list(wireqc).index(True)
        except:
            # no wire break found: consider the entire truth array
            nowire = len(truth)
        # flag only counts if its before the wire break:
        flagged = dbutils.summarize_truth(truth[0:nowire])
        if flagged:
            bad += 1
        else:
            good += 1

        query = "INSERT INTO " + dbtable + " (raw, truth, uid, year, month, day, time, lat, long, country, cruise, ocruise, probe, flagged) values (?,?,?,?,?,?,?,?,?,?,?,?,?,?);"
        values = (p['raw'], p['truth'], p['uid'], p['year'], p['month'], p['day'], p['time'], p['latitude'], p['longitude'], country, p['cruise'], orig_cruise, p['probe_type'], int(flagged))
        main.dbinteract(query, values, targetdb=outfile)

        if profile.is_last_profile_in_file(fid) == True:
            break

    conn.commit()
    print('number of clean profiles written:', good)
    print('number of flagged profiles written:', bad)
    print('total number of profiles written:', good + bad)
#filename = '../data/quota_subset.dat' filename = '../../AutoQC_raw/quota/test/chunk.dat' n = 30 fid = open(filename) fid.read() fileSize = fid.tell() chunkSize = int(math.ceil(fileSize / n)) fileNo = 0 start = 0 end = 0 target = open('split-' + str(fileNo) + '.dat', 'w') fid.seek(0) while not (fid.read(1) == ''): #write next chunk to open target fid.seek(end) start = fid.tell() profile = wod.WodProfile(fid) end = fid.tell() fid.seek(start) extract = fid.read(end - start) target.write(extract) #wrap the file and start a new one once we've crossed the max size if target.tell() > chunkSize: target.close() fileNo += 1 target = open('split-' + str(fileNo) + '.dat', 'w')
def main():
    """Build a numpy lookup database of WOD profile info from ocldb* files.

    Command line arguments:
      source_dir  full path to directory containing source data
      dest_dir    (optional) directory where the output array will reside;
                  defaults to ./profile_db/ under the current directory
      wild_card   (optional) wild card string to narrow input files

    Side effects: may create dest_dir, and writes
    cal_wod_profile_info_database.npz there.
    """
    parser = argparse.ArgumentParser(
        description="setup WOD profile lookup database")
    parser.add_argument(
        "source_dir",
        type=str,
        help=
        "full path to directory containing source data (e.g. download folder)")
    parser.add_argument("dest_dir",
                        type=str,
                        nargs='?',
                        help="directory path where output array will reside")
    parser.add_argument("wild_card",
                        type=str,
                        nargs='?',
                        help="wild card string to narrow input files")
    args = parser.parse_args()

    # os.getcwd() returns str; the original shelled out to `pwd`, whose bytes
    # result breaks the str concatenation below under Python 3.
    cur_dir = os.getcwd()
    print("source dir is " + args.source_dir)
    source_dir = args.source_dir  # dir of source data (wod files)
    if args.dest_dir:
        print("dest dir is " + args.dest_dir)
        dest_dir = args.dest_dir
    else:
        print("creating profile_pool dir in current dir\n")
        dest_dir = cur_dir + "/profile_db/"  # where to put database
    if not os.path.isdir(dest_dir):
        # os.makedirs replaces the original os.system("mkdir ...") shell-out.
        os.makedirs(dest_dir)
        print("creating destination directory")

    # use glob to form a list of input files:
    if args.wild_card:
        prof_files = glob.glob(source_dir + '/ocldb' + args.wild_card)
        print(prof_files)
    else:
        prof_files = glob.glob(source_dir + '/ocldb*')
        print(prof_files)
    # prof_files.sort(key=lambda x: [int(x.split('-')[2])]) # no need for sort

    # dbase is the list of profiles that contains profile info; a list is
    # simple to fill and converted to a structured array at the end.
    dbase = []

    # loop over input files, retrieve the necessary info and store it
    print("\nputting together database: list filling loop\n")
    for dafile in prof_files:
        print("\nWorking on file: " + dafile + "\n")
        # Context manager closes each input file (the original leaked handles).
        with open(dafile) as fid:
            # do-while over the profiles in this file: read, keep if usable,
            # stop once the profile just read was the file's last.
            last_prof = False
            while not last_prof:
                profile = wod.WodProfile(fid)
                prof_data, prof_ok = get_prof_data(profile)
                if prof_ok:
                    dbase.append(prof_data)
                last_prof = profile.is_last_profile_in_file(fid)

    # Convert to a structured array; 'O' fields hold per-profile arrays/objects.
    dbase = np.array(dbase,
                     dtype=[("probe_type", '|S21'), ('nlevs', 'int32'),
                            ('year', 'int32'), ('month', 'int32'),
                            ('day', 'int32'), ('date', 'O'),
                            ('lat', 'float32'), ('lon', 'float32'),
                            ('pmin', 'float32'), ('pmax', 'float32'),
                            ('dpm', 'float32'), ('dzm', 'float32'),
                            ("ps_qc", 'int32'), ("pt_qc", 'int32'),
                            ('pres', 'O'), ('sal', 'O'), ('temp', 'O'),
                            ('z', 'O'), ('usal', 'O'), ('utemp', 'O'),
                            ('uz', 'O')])
    np.savez_compressed(dest_dir + "cal_wod_profile_info_database",
                        dbase=dbase)