    day = (date - year*10000 - month*100)
    hour = time // 100
    min = (time - hour*100)

    doy = [datetime.datetime(np.int(year[i]), np.int(month[i]), np.int(day[i]),
                             np.int(hour[i]), np.int(min[i]), 0)
           - datetime.datetime(first_year, 1, 1, 0, 0, 0)
           for i in range(len(year))]

    processed_dates = [doy[i].days + doy[i].seconds/(24.*60.*60.) for i in range(len(doy))]
    return processed_dates

#read obs files
all_files = glob.glob('/work/home/db876/observations/surface/%s/EMEP/*'%(species))
all_files = modules.natsorted(all_files)

#get all refs
ref_list = []
valid_refs = []

for i in range(len(all_files)):
    f = all_files[i]
    f = f.replace("/work/home/db876/observations/surface/%s/EMEP/"%(species), "")
    f = f[:7]
    ref_list.append(f)

refs = set(ref_list)
refs = sorted([i for i in refs])
refs = np.array(refs)
print 'all refs len = ', len(refs)
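#-----------------------------------------------------------------------------
# A minimal standalone sketch of the date conversion above: integer
# year/month/day/hour/minute arrays become fractional days elapsed since
# 1 January of the first year. The function and argument names here are
# illustrative assumptions, not part of the original module.
def to_fractional_days(years, months, days, hours, minutes, first_year):
    import datetime
    ref = datetime.datetime(first_year, 1, 1, 0, 0, 0)
    out = []
    for y, mo, d, h, mi in zip(years, months, days, hours, minutes):
        delta = datetime.datetime(int(y), int(mo), int(d), int(h), int(mi), 0) - ref
        out.append(delta.days + delta.seconds / (24. * 60. * 60.))
    return out

# usage (illustrative): to_fractional_days([2006], [1], [2], [12], [0], 2006) -> [1.5]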
elif species == "NO": data_resolution = 1.1 mol_mass = 30.01 elif species == "NO2": data_resolution = 1.1 mol_mass = 46.0055 elif species == "CO": data_resolution = 100.1 mol_mass = 28.01 # read obs files hourly_files = glob.glob("/work/home/db876/observations/surface/%s/GAW/*.hr*.dat" % (species)) hourly_files = modules.natsorted(hourly_files) daily_files = glob.glob("/work/home/db876/observations/surface/%s/GAW/*.da.*" % (species)) daily_files = modules.natsorted(daily_files) monthly_files = glob.glob("/work/home/db876/observations/surface/%s/GAW/*.mo.*" % (species)) monthly_files = modules.natsorted(monthly_files) # get all refs ref_list_hourly = [] ref_list_daily = [] ref_list_monthly = [] valid_refs = [] for i in range(len(hourly_files)): f = hourly_files[i] f = f.replace("/work/home/db876/observations/surface/%s/GAW/" % (species), "")
def site_iter_process(valid_refs, c): # for r in range(len(valid_refs)): ref = valid_refs[c] print ref # get site instrument for species met_i = met_refs.index(ref) print len(met_refs) print len(met_species) site_species = list(met_species[met_i]) site_instruments = list(met_instruments[met_i]) print site_species print site_instruments mm = site_instruments[site_species.index(species)] site_resolutions = [] data_valid = True s_files = insensitive_glob("/work/home/db876/observations/surface/%s/EANET/*%s.csv" % (species, ref)) site_files = [] for y in year_array: for f in s_files: if str(y)[-2:] in f: site_files.append(f) site_files = modules.natsorted(site_files) if site_files == []: print "No files for ref.\n" years = [] months = [] days = [] hours = [] vals = [] last_year_index = len(site_files) for y in year_array: print "Processing Year %s" % y got_year = False for file in site_files: last_file_split = file.split("/")[-1] if str(y)[2:] in last_file_split: got_year = True break if got_year == False: # fill in data for missing year timedelta_diff = datetime.date(y + 1, 1, 1) - datetime.date(y, 1, 1) ndays_missing = timedelta_diff.days print "ndays missing = ", ndays_missing vals = np.append(vals, [-99999] * (ndays_missing * 24)) continue print file valid = True with open(file, "rb") as f: reader = csv.reader(f, delimiter=",") counter = 0 # get resolution for row in reader: if counter == 0: all_units = row elif counter == 1: file_res = "H" try: hour_index = row.index("Hour") except: file_res = "D" try: day_index = row.index("Day") except: file_res = "M" month_index = row.index("Month") year_index = row.index("Year") try: spec_index = row.index(species.upper()) units = all_units[spec_index] except: valid = False break # make sure each year units are ppb if units != "ppb": print "Units not ppb!" 1 + "a" if counter == 2: if file_res == "H": yyyy = row[year_index] mm = row[month_index] dd = row[day_index] hh = row[hour_index] elif file_res == "D": yyyy = row[year_index] mm = row[month_index] dd = row[day_index] hh = 1 elif file_res == "M": yyyy = row[year_index] mm = row[month_index] dd = 1 hh = 1 start_datetime = datetime.datetime(int(yyyy), int(mm), int(dd), int(hh)) if counter == 3: if file_res == "H": yyyy = row[year_index] mm = row[month_index] dd = row[day_index] hh = row[hour_index] elif file_res == "D": yyyy = row[year_index] mm = row[month_index] dd = row[day_index] hh = 1 elif file_res == "M": yyyy = row[year_index] mm = row[month_index] dd = 1 hh = 1 present_datetime = datetime.datetime(int(yyyy), int(mm), int(dd), int(hh)) time_delt = present_datetime - start_datetime hour_delt = datetime.timedelta(hours=1) day_delt = datetime.timedelta(hours=24) week_delt = datetime.timedelta(hours=24 * 7) month_delt = datetime.timedelta(hours=24 * 28) print time_delt if time_delt < day_delt: print "Hourly Data" file_res = "H" site_resolutions.append(file_res) elif (time_delt > hour_delt) & (time_delt < week_delt): print "Daily Data" file_res = "D" site_resolutions.append(file_res) elif time_delt > week_delt: print "Monthly Data" file_res = "M" site_resolutions.append(file_res) # break # limit files by timeres return if not suitable for output res if output_res == "H": if (file_res == "D") or (file_res == "M"): print "File resolution has to be Minimum Hourly. Skipping" data_valid = False return c, vals, data_valid, -999, -999, -999, "na", "na", "na", "na", "na", -999 elif output_res == "D": if file_res == "M": print "File resolution has to be Minimum Daily. 
Skipping" data_valid = False return c, vals, data_valid, -999, -999, -999, "na", "na", "na", "na", "na", -999 counter += 1 # READ IN DATA if valid == True: with open(file, "rb") as f: reader = csv.reader(f, delimiter=",") counter = 0 for row in reader: if counter >= 2: yyyy = row[year_index] mm = row[month_index] if file_res == "H": try: vals = np.append(vals, np.float64(row[spec_index])) except: vals = np.append(vals, -99999) elif file_res == "D": try: vals = np.append(vals, [np.float64(row[spec_index])] * 24) except: vals = np.append(vals, [-99999] * 24) elif file_res == "M": month_days = monthrange(int(yyyy), int(mm))[1] try: vals = np.append(vals, [np.float64(row[spec_index])] * (month_days * 24)) except: vals = np.append(vals, [-99999] * (month_days * 24)) counter += 1 else: print "Species is not in file header. Skipping Year" timedelta_diff = datetime.date(y + 1, 1, 1) - datetime.date(y, 1, 1) ndays_missing = timedelta_diff.days print "ndays missing = ", ndays_missing vals = np.append(vals, [-99999] * (ndays_missing * 24)) valid_refs_rev.append(ref) i_ref = met_refs.index(ref) tz = np.float64(met_tz[i_ref]) lat = np.float64(met_lats[i_ref]) lon = np.float64(met_lons[i_ref]) alt = np.float64(met_alts[i_ref]) raw_class_name = met_class[i_ref] anthrome_class_name = class_name[i_ref] # check tz is whole number else skip site if (tz % 1) != 0: print "Timezone is not even. Skipping" data_valid = False tz = int(tz) # correct time to UTC if tz < 0: # get rid of values at start and append -99999's at end cut = vals[:tz] for num in range(np.abs(tz)): cut = np.insert(cut, 0, -99999) vals = cut elif tz > 0: # put -99999's at start and get rid of values at end cut = vals[tz:] for num in range(tz): cut = np.append(cut, -99999) vals = cut # do data quality checks full_data, data_valid, data_complete = modules.quality_check_periodic( vals, data_valid, data_resolution, np.float64(alt), grid_dates, start_year, end_year ) # if all site resolutions are same continue, make program exit all_same = all(x == site_resolutions[0] for x in site_resolutions) if all_same == True: pass else: print "Not all files for site have same resolution. Skipping." data_valid = False return c, full_data, data_valid, -999, -999, -999, "na", "na", "na", "na", "na", -999 # set sampling as average st = "average" return ( c, full_data, data_valid, lat, lon, alt, raw_class_name, anthrome_class_name, mm, st, file_res, data_complete, )
elif (species == 'NO'):
    data_resolution = 1.1
    param_code = 42601
elif (species == 'NO2'):
    data_resolution = 1.1
    param_code = 42602
elif species == 'ISOP':
    data_resolution = 0.6
    param_code = 43243

files = glob.glob('/work/home/db876/observations/surface/%s/AQS/*'%(species))
files = modules.natsorted(files)

year_array = np.arange(start_year, end_year+1)

valid_files = []
for i in year_array:
    for f in files:
        if str(i) in f:
            valid_files.append(f)
print valid_files

#create ref lists
all_site_refs = []
test_site_refs = []
uniq_refs = []
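#-----------------------------------------------------------------------------
# The chained elif above sets data_resolution and the AQS param_code per
# species; an equivalent table-driven lookup is sketched here. Only the
# species visible in this excerpt are included, so treat the table itself as
# illustrative rather than complete.
AQS_SPECIES_SETTINGS = {
    'NO':   {'data_resolution': 1.1, 'param_code': 42601},
    'NO2':  {'data_resolution': 1.1, 'param_code': 42602},
    'ISOP': {'data_resolution': 0.6, 'param_code': 43243},
}

# usage (illustrative):
# data_resolution = AQS_SPECIES_SETTINGS[species]['data_resolution']
# param_code = AQS_SPECIES_SETTINGS[species]['param_code']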
    doy = [datetime.datetime(np.int(year[i]), np.int(month[i]), np.int(day[i]),
                             np.int(hour[i]), np.int(min[i]), 0)
           - datetime.datetime(first_year, 1, 1, 0, 0, 0)
           for i in range(len(year))]

    processed_dates = [doy[i].days + doy[i].seconds/(24.*60.*60.) for i in range(len(doy))]
    return processed_dates

#read obs files
all_files = glob.glob('/work/home/db876/observations/surface/%s/EMEP/*' % (species))
all_files = modules.natsorted(all_files)

#get all refs
ref_list = []
valid_refs = []

for i in range(len(all_files)):
    f = all_files[i]
    f = f.replace("/work/home/db876/observations/surface/%s/EMEP/" % (species), "")
    f = f[:7]
    ref_list.append(f)

refs = set(ref_list)
refs = sorted([i for i in refs])
refs = np.array(refs)
start_year = start_years[y]
end_year = end_years[y]
print start_year, end_year

year_array = range(start_year, end_year + 1)

if (species == "O3") or (species == "NO") or (species == "NO2"):
    data_resolution = 1.1

n_years = (end_year - start_year) + 1

# read obs files
all_files = glob.glob("/work/home/db876/observations/surface/%s/EANET/AT*" % (species))
all_files = modules.natsorted(all_files)

ref_list = []
valid_refs = []
valid_refs_rev = []

for i in range(len(all_files)):
    f = all_files[i].replace(".csv", "")
    f = f.replace("/work/home/db876/observations/surface/%s/EANET/" % (species), "")
    f = f[4:]
    f = f.lower()
    ref_list.append(f)

refs = set(ref_list)
refs = [i for i in refs]
refs = sorted(refs)
def site_iter_process(valid_refs, c): #set local counts inv_nometa = 0 inv_anyvaliddata = 0 inv_nokeymeta = 0 inv_resolution = 0 inv_badmeasurementmethod = 0 n_all = 0 n_after_nometa = 0 n_after_flagsandlod = 0 n_after_duplicate = 0 n_after_anyvaliddata = 0 n_after_nokeymeta = 0 n_after_resolution = 0 n_after_badmeasurementmethod = 0 #set local unknown lists unknown_mm_list = [] unknown_mm_refs_list = [] unknown_local_tz_list = [] #for ref_i in range(len(valid_refs)): data_valid = True site_ref = valid_refs[c] print 'Current Ref is = ', site_ref, c s_files = glob.glob( '/work/home/db876/observations/surface/%s/CAPMON/ozon_smpls_%s*' % (species, site_ref)) site_files = [] for y in year_array: for f in s_files: if str(y) in f: site_files.append(f) site_files = modules.natsorted(site_files) yymmdd = [] hhmm = [] vals = [] for file_i in range(len(site_files)): count = 0 meta_start = -99999 start_read_1 = False start_read_2 = False with open(site_files[file_i], 'rb') as f: reader = csv.reader(f, delimiter=',') print site_files[file_i] for row in reader: #print count #break out of loop at bottom of file if (start_read_2 == True) & (row[0] == '*TABLE ENDS'): break #get metadata try: if (row[0] == '*TABLE NAME') & (row[1] == 'Site information'): meta_start = count + 2 except: pass if count == meta_start: siteid_i = row.index('Site ID: standard') sitename_i = row.index('Description') lat_i = row.index('Latitude: decimal degrees') lon_i = row.index('Longitude: decimal degrees') try: alt_i = row.index( 'Ground elevation: above mean sea level') except: alt_i = row.index('Ground altitude') class_i = row.index('Site land use') if count == (meta_start + 6): latitude = row[lat_i] longitude = row[lon_i] altitude = row[alt_i] raw_class_name = row[class_i] site_name = row[sitename_i] #get data if start_read_2 == True: #read dates, times, and vals date = row[8] time = row[9] yymmdd.append(date[:4] + date[5:7] + date[8:]) hhmm.append(time[:2] + time[3:]) quality_code = row[13] #if flag not equal to V0 then make -99999 if quality_code == 'V0': vals = np.append(vals, np.float64(row[12])) else: vals = np.append(vals, -99999) try: if (row[0] == '*TABLE NAME') & (row[1] == 'OZONE_HOURLY'): start_read_1 = True except: pass if (start_read_1 == True) & (row[0] == '*TABLE COLUMN UNITS'): unit = row[12] if (start_read_1 == True) & (row[0] == '*TABLE BEGINS'): start_read_2 = True count += 1 #add to n_obs_all n_all += len(vals) n_after_nometa += len(vals) #convert data less < 0 to -99999 test_inv = vals < 0 vals[test_inv] = -99999 #create max possible grid full_data = np.empty(n_hours) full_data_after_flagsandlod = np.empty(n_hours) big_n_dup_array = np.zeros(n_hours) full_data[:] = -99999 full_data_after_flagsandlod[:] = -99999 #put vals into full grid date_con = np.array(yymmdd).astype(int) time_con = np.array(hhmm).astype(int) #remove data < 1970 and >= 2015 test_inds = (date_con >= 19700101) & (date_con < 20150101) date_con = date_con[test_inds] time_con = time_con[test_inds] vals = vals[test_inds] #set st_big and mm_big st_big = ['continuous'] * len(vals) mm_big = ['ultraviolet photometry'] * len(vals) #get obs valid test = vals != -99999 n_obs_valid = len(vals[test]) n_after_flagsandlod += n_obs_valid #find matching times between actual times and grid of times, return big array of indices of matched indices in grid converted_time = modules.date_process(date_con, time_con, start_year) converted_time = np.round(converted_time, decimals=5) syn_grid_time = np.arange(0, n_days, 1. 
/ 24) syn_grid_time = np.round(syn_grid_time, decimals=5) raw_indices = np.searchsorted(syn_grid_time, converted_time, side='left') vals = np.array(vals) full_data_after_flagsandlod[raw_indices] = vals raw_st = np.copy(st_big) raw_mm = np.copy(mm_big) # test and remove duplicate and overlap points converted_time, vals, mm_big, st_big, na = modules.remove_duplicate_points( site_ref, converted_time, vals, mm_big, st_big, 'blank', output_res) test = vals >= 0 n_obs_valid = int(len(vals[test])) n_after_duplicate += n_obs_valid #find matching times between actual times and grid of times, return big array of indices of matched indices in grid indices = np.searchsorted(syn_grid_time, converted_time, side='left') full_data[indices] = vals #get metadata try: lat = np.float32(latitude) except: lat = 'na' try: lon = np.float32(longitude) except: lon = 'na' try: alt = np.float32(altitude) except: alt = 'na' unit = str(unit) raw_class_name = str(raw_class_name) site_name = str(site_name) country = 'Canada' contact = 'Dave MacTavish, 4905 Dufferin St., Toronto ON, CANADA, M3H 5T4, [email protected]' #set data tz - all CAPMON times are UTC data_tz = 0 all_tz = [data_tz] key_meta = [lat, lon, alt] #set site file resolution file_res = 'H' #get sampling/instrument grids raw_st_grid, p_st_grid, p_st_grid_after_flagsandlod, raw_mm_grid, p_mm_grid, p_mm_grid_after_flagsandlod, unknown_mm_list, unknown_mm_refs_list = modules.sampling_and_instruments_by_processgroup( site_ref, process_group, species, raw_st, raw_mm, full_data_after_flagsandlod, full_data, raw_indices, unknown_mm_list, unknown_mm_refs_list, no2_type) #do quality checks data_valid, full_data, valid_hours_dup, p_st_grid, p_mm_grid, n_all, inv_nometa, n_after_nometa, n_after_flagsandlod, n_after_duplicate, inv_anyvaliddata, n_after_anyvaliddata, inv_nokeymeta, n_after_nokeymeta, inv_resolution, n_after_resolution, inv_badmeasurementmethod, n_after_badmeasurementmethod, exit_r = modules.primary_quality_control( site_ref, species, file_res, no2_type, grid_dates, full_data, big_n_dup_array, 0, raw_st_grid, p_st_grid, p_st_grid_after_flagsandlod, raw_mm_grid, p_mm_grid, p_mm_grid_after_flagsandlod, data_resolution, n_obs_valid, key_meta, n_all, inv_nometa, n_after_nometa, n_after_flagsandlod, n_after_duplicate, inv_anyvaliddata, n_after_anyvaliddata, inv_nokeymeta, n_after_nokeymeta, inv_resolution, n_after_resolution, inv_badmeasurementmethod, n_after_badmeasurementmethod) if data_valid == False: exit_c_list = np.array([ inv_nometa, inv_anyvaliddata, inv_nokeymeta, inv_resolution, inv_badmeasurementmethod ]) n_c_list = np.array([ n_all, n_after_nometa, n_after_flagsandlod, n_after_duplicate, n_after_anyvaliddata, n_after_nokeymeta, n_after_resolution, n_after_badmeasurementmethod ]) unknown_list = [ unknown_mm_list, unknown_mm_refs_list, unknown_local_tz_list ] meta = [ lat, lon, alt, 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na' ] return c, ['na'], ['na'], [ 'na' ], False, meta, exit_c_list, n_c_list, unknown_list, exit_r, np.zeros( 1) #set processed unit p_unit = 'pbbv' #get local timezone try: local_tz_name = tz_root.tzNameAt(lat, lon, forceTZ=True) pytz_obj = pytz.timezone(local_tz_name) datetime_offset = pytz_obj.utcoffset(datetime.datetime(2000, 1, 1)) if datetime_offset < datetime.timedelta(0): local_tz = -(24 - int(datetime_offset.seconds / 60 / 60)) else: local_tz = int(datetime_offset.seconds / 60 / 60) except: local_tz = 'na' print 'TIMEZONE NOT KNOWN, SITE IS %s' % (site_ref) unknown_local_tz_list.append(site_ref) #pack meta meta = [ 
lat, lon, alt, raw_class_name, file_res, unit, p_unit, data_tz, local_tz, site_name, country, contact ] #if blank strings in meta then convert to 'na' for i in range(len(meta)): try: if meta[i].strip() == '': meta[i] = 'na' except: pass print set(raw_st_grid) print set(raw_mm_grid) print set(p_st_grid) print set(p_mm_grid) print meta exit_c_list = np.array([ inv_nometa, inv_anyvaliddata, inv_nokeymeta, inv_resolution, inv_badmeasurementmethod ]) n_c_list = np.array([ n_all, n_after_nometa, n_after_flagsandlod, n_after_duplicate, n_after_anyvaliddata, n_after_nokeymeta, n_after_resolution, n_after_badmeasurementmethod ]) print 'exit counts = ', exit_c_list print 'n obs counts = ', n_c_list unknown_list = [ unknown_mm_list, unknown_mm_refs_list, unknown_local_tz_list ] return c, full_data, p_st_grid, p_mm_grid, data_valid, meta, exit_c_list, n_c_list, unknown_list, 'na', big_n_dup_array
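#-----------------------------------------------------------------------------
# A minimal sketch of the local-timezone lookup used above, assuming the
# tzwhere and pytz packages are available; tz_root in the original is a
# prebuilt tzwhere instance, recreated inside this helper only so the sketch
# is self-contained. Returns 'na' when the timezone cannot be resolved, as the
# original does.
import datetime
import pytz
from tzwhere import tzwhere

def local_utc_offset(lat, lon):
    try:
        tz_root = tzwhere.tzwhere(forceTZ=True)
        tz_name = tz_root.tzNameAt(lat, lon, forceTZ=True)
        offset = pytz.timezone(tz_name).utcoffset(datetime.datetime(2000, 1, 1))
        # timedelta normalises negative offsets (e.g. -5 h -> days=-1, seconds=68400)
        return offset.days * 24 + offset.seconds // 3600
    except Exception:
        return 'na'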
#setup netcdf output
root_grp = Dataset('%s_RADIOSONDES_SHADOZ_%s_%s.nc' % (species, start_year, end_year), 'w')
root_grp.description = 'SHADOZ Radiosondes of %s at sites in ppb - Program written by Dene Bowdalo' % (species)

site_count = 0

#---------------------------------------------------------------------------------
#process SHADOZ data
print '\nProcessing SHADOZ data\n'

files = glob.glob('/work/home/db876/observations/ozonesonde/SHADOZ/*')
files = modules.natsorted(files)

site_names = []

#separate out sites into each location
for i in files:
    i = i.replace('/work/home/db876/observations/ozonesonde/SHADOZ/', '')
    split = i.split('_')
    site_names.append(split[0])

site_names = np.sort(list(set(site_names)))

for site_name in site_names:
    print '\n'
    print site_name

    data_valid = True
exit_resolution_lats = []
exit_resolution_lons = []
exit_resolution_pg = []
exit_badmeasurementmethod_refs = []
exit_badmeasurementmethod_lats = []
exit_badmeasurementmethod_lons = []
exit_badmeasurementmethod_pg = []

unknown_mm = []
unknown_mm_refs = []
unknown_local_tz = []

#read obs files
all_files = glob.glob('/work/home/db876/observations/surface/%s/CAPMON/*' % (species))
all_files = modules.natsorted(all_files)

#get all refs
ref_list = []
valid_refs = []

for i in range(len(all_files)):
    f = all_files[i]
    f = f.replace("/work/home/db876/observations/surface/%s/CAPMON/" % (species), "")
    f = f[11:14]
    ref_list.append(f)

refs = set(ref_list)
refs = [i for i in refs]
refs = sorted(refs)
def site_iter_process(valid_refs,c): #set local counts inv_nometa = 0 inv_anyvaliddata = 0 inv_nokeymeta = 0 inv_resolution = 0 inv_badmeasurementmethod = 0 n_all = 0 n_after_nometa = 0 n_after_flagsandlod = 0 n_after_duplicate = 0 n_after_anyvaliddata = 0 n_after_nokeymeta = 0 n_after_resolution = 0 n_after_badmeasurementmethod = 0 #set local unknown lists unknown_mm_list = [] unknown_mm_refs_list = [] unknown_local_tz_list = [] #read files site at a time #for ref_i in range(len(valid_refs)): site_ref = valid_refs[c] all_latitudes = [] all_longitudes = [] all_altitudes = [] all_unit = [] all_site_name = [] all_country = [] all_contact = [] mm_big = [] meta_valid_list = [] data_valid = True print 'Current Ref is = ', site_ref,c #find if sites have full valid range from start year and finishing in end year s_files = glob.glob('/work/home/db876/observations/surface/%s/EMEP/%s*'%(fname_species,site_ref)) year_files = [file.replace("/work/home/db876/observations/surface/%s/EMEP/"%(fname_species), "") for file in s_files] cut_year_files = [file[8:12] for file in year_files] site_files = [] for y in year_array: for i in range(len(s_files)): if str(y) in cut_year_files[i]: site_files.append(s_files[i]) site_files = modules.natsorted(site_files) #test for duplicate file years, if duplicates break processing file_years = [] for file in site_files: last_file_split = file.split('/')[-1] file_years=np.append(file_years,last_file_split[8:12]) for y in year_array: test = file_years == str(y) if len(file_years[test]) > 1: print 'Site has duplicate files for %s. Breaking processing'%(y) 1+'a' if site_files == []: print 'No valid files for site\n' return #remove daily/monthly files if necessary if output_res == 'H': del_i = [] for i in range(len(site_files)): if '.1d.' in site_files[i]: del_i.append(i) elif '.1mo.' in site_files[i]: del_i.append(i) site_files=np.delete(site_files,del_i) elif output_res == 'HD': del_i = [] for i in range(len(site_files)): if '.1mo.' 
in site_files[i]: del_i.append(i) site_files=np.delete(site_files,del_i) for y in year_array: bad_meta = False got_year = False for file in site_files: last_file_split = file.split('/')[-1] if str(y) in last_file_split[8:12]: got_year = True break if got_year == False: #fill in data for missing year timedelta_diff = datetime.date(y+1, 1, 1) - datetime.date(y, 1, 1) ndays_missing = timedelta_diff.days continue count = 0 with open(file, 'rb') as f: reader = csv.reader(f,delimiter=' ') print file for row in reader: try: row = filter(lambda a: a != '', row) except: pass try: row = filter(lambda a: a != ',', row) except: pass #get start date of file if row[0] == 'Startdate:': data = row[1] s_yyyy = data[:4] s_mm = data[4:6] s_dd = data[6:8] s_hh = data[8:10] s_min = data[10:12] start_datetime = datetime.datetime(int(s_yyyy),1,1,0,0) #get unit if row[0] == 'Unit:': try: if len(row) == 3: unit_part1 = row[1] unit_part2 = row[2] unit = unit_part1+'_'+unit_part2 elif len(row) == 2: unit = row[1] all_unit.append(unit) except: bad_meta = True #get resolution if row[0] == 'Resolution': if row[1] == 'code:': file_res = row[2] print 'Resolution = %s'%file_res #get latitude if row[0] == 'Station': if row[1] == 'latitude:': latitude = row[2] all_latitudes.append(latitude) #get longitude if row[0] == 'Station': if row[1] == 'longitude:': longitude = row[2] all_longitudes.append(longitude) #get altitude if row[0] == 'Station': if row[1] == 'altitude:': altitude = row[2][:-1] all_altitudes.append(altitude) #get site name if row[0] == 'Station': if row[1] == 'name:': site_name = row[2] all_site_name.append(site_name) #get period if row[0] == 'Period': period_code = row[2] #get stats method if row[0] == 'Statistics:': try: st = row[1] + row[2] if st != 'arithmeticmean': print 'Not Arithmetic Mean!' print row[1] print 1+'a' except: print 'Not Arithmetic Mean!' 
print row[1] print 1+'a' #get instrument method and name if row[0] == 'Instrument': if row[1] == 'type:': mm_list = row[2:] if len(mm_list) > 1: site_mm = '' for x in range(len(mm_list)): site_mm = site_mm+mm_list[x]+' ' site_mm = site_mm.strip() else: site_mm = mm_list[0] if row[1] == 'name:': mn_list = row[2:] if len(mn_list) > 1: site_mn = '' for x in range(len(mn_list)): site_mn = site_mn+mn_list[x]+' ' site_mn = site_mn.strip() else: site_mn = mn_list[0] #get method ref if row[0] == 'Method': if row[1] == 'ref:': try: mf_list = row[2:] if len(mf_list) > 1: site_mf = '' for x in range(len(mf_list)): site_mf = site_mf+mf_list[x]+' ' site_mf = site_mf.strip() else: site_mf = mf_list[0] except: site_mf = '' #put together intrument type+instrument_name+method_ref mm = site_mm+site_mn+site_mf #get contact if row[0] == 'Originator:': try: contact_list = row[1:] if len(contact_list) > 1: site_contact = '' for x in range(len(mf_list)): site_contact = site_contact+contact_list[x]+' ' site_contact = site_contact.strip() else: site_contact = site_contact[0] except: site_contact = '' all_contact.append(site_contact) #get country site_country = EMEP_COUNTRIES(file.split('/')[-1][:2]) all_country.append(site_country) if row[0] == 'starttime': skip_n = count+1 if species == 'ISOP': spec_ind = row.index('C5H8') try: flag_ind = row.index('flag_C5H8') except: flag_ind = row.index('flag') else: spec_ind = row.index(species) try: flag_ind = row.index('flag_'+species) except: flag_ind = row.index('flag') count+=1 read = np.loadtxt(file,dtype="f8,f8,f8,f8",skiprows=skip_n,usecols=(0,1,spec_ind,flag_ind),unpack=True) read = np.array(read) times_since_start = read[0,:] endtimes_since_start = read[1,:] conc = read[2,:] conc = np.array(conc).astype('float64') flags = read[3,:] dates = [] times = [] enddates = [] endtimes = [] times_since_start = np.float64(times_since_start) endtimes_since_start = np.float64(endtimes_since_start) for x in range(len(times_since_start)): days_since_start = math.trunc(times_since_start[x]) enddays_since_start = math.trunc(endtimes_since_start[x]) remainder = times_since_start[x] - days_since_start remainder_end = endtimes_since_start[x] - enddays_since_start unrounded_hour = remainder*24 unrounded_hour_end = remainder_end*24 hour = np.round(unrounded_hour) hour_end = np.round(unrounded_hour_end) time_delta = datetime.timedelta(days = days_since_start,hours = hour) time_delta_end = datetime.timedelta(days = enddays_since_start,hours = hour_end) calc_datetime = start_datetime + time_delta calc_datetime_end = start_datetime + time_delta_end calc_yyyymmdd = calc_datetime.strftime("%Y%m%d") calc_hhmm = calc_datetime.strftime("%H%M") end_calc_yyyymmdd = calc_datetime_end.strftime("%Y%m%d") end_calc_hhmm = calc_datetime_end.strftime("%H%M") dates.append(calc_yyyymmdd) times.append(calc_hhmm) enddates.append(end_calc_yyyymmdd) endtimes.append(end_calc_hhmm) conc = np.float64(conc) flags = np.float64(flags) #add to n_obs_all n_all += len(conc) #IF bad_meta == True then set all file vals as nans if bad_meta == True: conc[:] = np.NaN meta_valid_list.append(bad_meta) #DO INLINE INVALID AND FLAG CONVERT to NaN test = conc < 0 conc[test] = np.NaN test = flags != 0 conc[test] = np.NaN #convert units by line (only if value is >= than 0 try: if (unit.lower() != 'ppb') & (unit.lower() != 'ppbv'): if unit == 'ug/m3': #calculate conversion factor from mg/m3 assuming 293K and 1013 hPa - in EU LAW #R/MW*(TEMP0C(K)*TEMP(degC)/P(hPa)/10 conv_fact = 8.3144/mol_mass*(293.)/(1013./10.) 
conc = conv_fact*conc elif unit == 'ug_N/m3': conv_fact = 8.3144/14.00674*(293.)/(1013./10.) conc = conv_fact*conc elif (unit == 'ppm') or (unit == 'ppmv'): conc = conc*1e3 #print 'Converting Units from ppmv to ppbv' elif (unit == 'ppt') or (unit == 'pptv'): conc = conc/1e3 #print 'Converting Units from pptv to ppbv' else: print 'Unknown Unit' 1+'a' except: pass #remove 9.999 from ISOP dataset if species == 'ISOP': test = conc == 9.999 conc[test] = np.NaN #if file resolution is daily or monthly then replicate times after point, to fill hourly data array. count=0 if file_res == '1h': n_dups = np.zeros(len(conc)) elif file_res == '1d': n_dups = [] #if measurement method is flask, then put leave flask measurement in as hourly measurement, the first hour of month file_hours = len(dates) for i in range(file_hours): current_year = int(dates[count][:4]) current_month = int(dates[count][4:6]) current_day = int(dates[count][6:]) current_hh = int(times[count][:2]) current_mm = int(times[count][2:]) next_year = int(enddates[i][:4]) next_month = int(enddates[i][4:6]) next_day = int(enddates[i][6:]) next_hh = int(endtimes[i][:2]) next_mm = int(endtimes[i][2:]) s = datetime.datetime(year = current_year, month = current_month, day = current_day, hour = current_hh, minute = current_mm) e = datetime.datetime(year = next_year, month = next_month, day = next_day, hour = next_hh, minute = next_mm) day_dates = [d.strftime('%Y%m%d') for d in pd.date_range(s,e,freq='H')][1:-1] day_hours = [d.strftime('%H%M') for d in pd.date_range(s,e,freq='H')][1:-1] dates = np.insert(dates,count+1,day_dates) times = np.insert(times,count+1,day_hours) conc = np.insert(conc,count+1,[conc[count]]*len(day_dates)) #append to n duplicated array n_dups=np.append(n_dups,0) n_dups=np.append(n_dups,[1]*len(day_dates)) count +=(len(day_dates)+1) elif file_res == '1mo': n_dups = [] #if measurement method is flask, then put leave flask measurement in as hourly measurement, the first hour of month file_hours = len(dates) for i in range(file_hours): current_year = int(dates[count][:4]) current_month = int(dates[count][4:6]) current_day = int(dates[count][6:]) current_hh = int(times[count][:2]) current_mm = int(times[count][2:]) next_year = int(enddates[i][:4]) next_month = int(enddates[i][4:6]) next_day = int(enddates[i][6:]) next_hh = int(endtimes[i][:2]) next_mm = int(endtimes[i][2:]) s = datetime.datetime(year = current_year, month = current_month, day = current_day, hour = current_hh, minute = current_mm) e = datetime.datetime(year = next_year, month = next_month, day = next_day, hour = next_hh, minute = next_mm) day_dates = [d.strftime('%Y%m%d') for d in pd.date_range(s,e,freq='H')][1:-1] day_hours = [d.strftime('%H%M') for d in pd.date_range(s,e,freq='H')][1:-1] dates = np.insert(dates,count+1,day_dates) times = np.insert(times,count+1,day_hours) conc = np.insert(conc,count+1,[conc[count]]*len(day_dates)) #append to n duplicated array n_dups=np.append(n_dups,0) n_dups=np.append(n_dups,[1]*len(day_dates)) count += (len(day_dates)+1) data = [dates,times,conc,n_dups] #put measurnement methods and into big list len of times mm_big=np.append(mm_big,[mm]*len(dates)) try: big_list = np.hstack((big_list,data)) except: big_list = np.array(data) if (y == year_array[-1]): #get dates and times date_con = big_list[0,:] time_con = big_list[1,:] #get vals vals = np.array(big_list[2,:]).astype('float64') #get n dup array n_dup_array = np.array(big_list[3,:]).astype(float).astype(int) #if all files have missing key meta then exit if all(i == True for i 
in meta_valid_list) == True: inv_nometa += 1 print 'Site Invalid. No Metadata for ref' if no2_type == 'MOLYBDENUM': n_all,inv_nometa,n_after_nometa,n_after_flagsandlod,n_after_duplicate,inv_anyvaliddata,n_obs_after_anyvaliddata,inv_nokeymeta,n_obs_after_nokeymeta,inv_resolution,n_obs_after_resolution,inv_badmeasurementmethod,n_obs_after_badmeasurementmethod = 0,0,0,0,0,0,0,0,0,0,0,0,0 exit_c_list = np.array([inv_nometa,inv_anyvaliddata,inv_nokeymeta,inv_resolution,inv_badmeasurementmethod]) n_c_list = np.array([n_all,n_after_nometa,n_after_flagsandlod,n_after_duplicate,n_after_anyvaliddata,n_after_nokeymeta,n_after_resolution,n_after_badmeasurementmethod]) unknown_list = [unknown_mm_list,unknown_mm_refs_list,unknown_local_tz_list] meta = ['na','na','na','na','na','na','na','na','na','na','na','na'] exit_r = 'nometa' return c,['na'],['na'],['na'],False,meta,exit_c_list,n_c_list,unknown_list,exit_r,np.zeros(1) valid_hours_dup = np.sum(n_dup_array) n_after_nometa += (len(vals)-valid_hours_dup) #delete big list del big_list date_con = np.array(date_con).astype(int) time_con = np.array(time_con).astype(int) #remove data < 1970 and >= 2015 test_inds = (date_con >= 19700101) & (date_con < 20150101) date_con = date_con[test_inds] time_con = time_con[test_inds] vals = vals[test_inds] mm_big = mm_big[test_inds] n_dup_array = n_dup_array[test_inds] #set st_big as 'continuous' st_big = ['continuous']*len(vals) #convert all Nans back to -99999 test = np.isnan(vals) vals[test] = -99999 #get obs valid test = vals >= 0 valid_hours_dup = np.sum(n_dup_array[test]) n_obs_valid = int(len(vals[test]) - valid_hours_dup) n_after_flagsandlod += n_obs_valid #create max possible species grid, measurement method and sampling type grids full_data = np.empty(n_hours) full_data_after_flagsandlod = np.empty(n_hours) big_n_dup_array = np.zeros(n_hours) full_data[:] = -99999 full_data_after_flagsandlod[:] = -99999 #find matching times between actual times and grid of times, return big array of indices of matched indices in grid converted_time = modules.date_process(date_con,time_con,start_year) converted_time = np.round(converted_time,decimals=5) syn_grid_time = np.arange(0,n_days,1./24) syn_grid_time = np.round(syn_grid_time,decimals=5) #find matching times between actual times and grid of times, return big array of indices of matched indices in grid raw_indices = np.searchsorted(syn_grid_time, converted_time, side='left') vals = np.array(vals) full_data_after_flagsandlod[raw_indices] = vals raw_st = np.copy(st_big) raw_mm = np.copy(mm_big) # test and remove duplicate and overlap points converted_time,vals,mm_big,st_big,n_dup_array = modules.remove_duplicate_points(site_ref,converted_time,vals,mm_big,st_big,n_dup_array,output_res) test = vals >= 0 valid_hours_dup = np.sum(n_dup_array[test]) n_obs_valid = int(len(vals[test]) - valid_hours_dup) n_after_duplicate += n_obs_valid #find matching times between actual times and grid of times, return big array of indices of matched indices in grid indices = np.searchsorted(syn_grid_time, converted_time, side='left') full_data[indices] = vals big_n_dup_array[indices] = n_dup_array #get mode of metadata try: lat = np.float32(stats.mode(all_latitudes)[0][0]) except: lat = 'na' try: lon = np.float32(stats.mode(all_longitudes)[0][0]) except: lon = 'na' try: alt = np.float32(stats.mode(all_altitudes)[0][0]) except: alt = 'na' unit = stats.mode(all_unit)[0][0] #remove empty strings from extra meta before mode test try: site_name = stats.mode(filter(None, all_site_name))[0][0] except: 
site_name = 'na' try: country = stats.mode(filter(None, all_country))[0][0] except: country = 'na' try: contact = stats.mode(filter(None, all_contact))[0][0] except: contact = 'na' #set data tz - all EMEP times are UTC data_tz = 0 all_tz = [data_tz] key_meta = [lat,lon,alt] #convert file res to standard format if file_res == '1h': file_res = 'H' elif file_res == '1d': file_res = 'D' elif file_res == '1mo': file_res = 'M' #get sampling/instrument grids raw_st_grid,p_st_grid,p_st_grid_after_flagsandlod,raw_mm_grid,p_mm_grid,p_mm_grid_after_flagsandlod,unknown_mm_list,unknown_mm_refs_list = modules.sampling_and_instruments_by_processgroup(site_ref,process_group,species,raw_st,raw_mm,full_data_after_flagsandlod,full_data,raw_indices,unknown_mm_list,unknown_mm_refs_list,no2_type) #do quality checks data_valid,full_data,valid_hours_dup,p_st_grid,p_mm_grid,n_all,inv_nometa,n_after_nometa,n_after_flagsandlod,n_after_duplicate,inv_anyvaliddata,n_after_anyvaliddata,inv_nokeymeta,n_after_nokeymeta,inv_resolution,n_after_resolution,inv_badmeasurementmethod,n_after_badmeasurementmethod,exit_r = modules.primary_quality_control(site_ref,species,file_res,no2_type,grid_dates,full_data,big_n_dup_array,valid_hours_dup,raw_st_grid,p_st_grid,p_st_grid_after_flagsandlod,raw_mm_grid,p_mm_grid,p_mm_grid_after_flagsandlod,data_resolution,n_obs_valid,key_meta,n_all,inv_nometa,n_after_nometa,n_after_flagsandlod,n_after_duplicate,inv_anyvaliddata,n_after_anyvaliddata,inv_nokeymeta,n_after_nokeymeta,inv_resolution,n_after_resolution,inv_badmeasurementmethod,n_after_badmeasurementmethod) if data_valid == False: exit_c_list = np.array([inv_nometa,inv_anyvaliddata,inv_nokeymeta,inv_resolution,inv_badmeasurementmethod]) n_c_list = np.array([n_all,n_after_nometa,n_after_flagsandlod,n_after_duplicate,n_after_anyvaliddata,n_after_nokeymeta,n_after_resolution,n_after_badmeasurementmethod]) unknown_list = [unknown_mm_list,unknown_mm_refs_list,unknown_local_tz_list] meta = [lat,lon,alt,'na','na','na','na','na','na','na','na','na'] return c,['na'],['na'],['na'],False,meta,exit_c_list,n_c_list,unknown_list,exit_r,np.zeros(1) #set metadata not available as na raw_class_name = 'na' #set processed unit p_unit = 'ppbv' #get local timezone try: local_tz_name = tz_root.tzNameAt(lat,lon,forceTZ=True) pytz_obj = pytz.timezone(local_tz_name) datetime_offset = pytz_obj.utcoffset(datetime.datetime(2000,1,1)) if datetime_offset < datetime.timedelta(0): local_tz = -(24-int(datetime_offset.seconds/60/60)) else: local_tz = int(datetime_offset.seconds/60/60) except: local_tz = 'na' print 'TIMEZONE NOT KNOWN, SITE IS %s'%(site_ref) unknown_local_tz_list.append(site_ref) #pack meta meta = [lat,lon,alt,raw_class_name,file_res,unit,p_unit,data_tz,local_tz,site_name,country,contact] #if blank strings in meta then convert to 'na' for i in range(len(meta)): try: if meta[i].strip() == '': meta[i] = 'na' except: pass print set(raw_st_grid) print set(raw_mm_grid) print set(p_st_grid) print set(p_mm_grid) print meta exit_c_list = np.array([inv_nometa,inv_anyvaliddata,inv_nokeymeta,inv_resolution,inv_badmeasurementmethod]) n_c_list = np.array([n_all,n_after_nometa,n_after_flagsandlod,n_after_duplicate,n_after_anyvaliddata,n_after_nokeymeta,n_after_resolution,n_after_badmeasurementmethod]) print 'exit counts = ', exit_c_list print 'n obs counts = ', n_c_list unknown_list = [unknown_mm_list,unknown_mm_refs_list,unknown_local_tz_list] return c,full_data,p_st_grid,p_mm_grid,data_valid,meta,exit_c_list,n_c_list,unknown_list,'na',big_n_dup_array
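#-----------------------------------------------------------------------------
# A minimal sketch of the mass-concentration to mixing-ratio conversion used
# above: ppbv = conc[ug/m3] * (R*T/P) / M, with R in kPa L mol-1 K-1 so that
# R*T/P is the molar volume in litres per mole. Defaults follow the 293 K /
# 1013 hPa reference used in this file; the function name is illustrative.
def ugm3_to_ppbv(conc_ugm3, mol_mass, temp_k=293.0, press_hpa=1013.0):
    molar_volume = 8.3144 * temp_k / (press_hpa / 10.0)  # litres per mole
    return conc_ugm3 * molar_volume / mol_mass

# e.g. 100 ug/m3 of O3 (mol_mass = 48) is roughly 50 ppbv at 293 K and 1013 hPa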
def site_iter_process(valid_refs,c): #for ref_i in range(len(valid_refs)): data_valid = True site_ref = valid_refs[c] print 'Current Ref is = ', site_ref s_files = glob.glob('/work/home/db876/observations/surface/%s/CAPMON/ozon_smpls_%s*'%(species,site_ref)) site_files = [] for y in year_array: for f in s_files: if str(y) in f: site_files.append(f) site_files = modules.natsorted(site_files) yymmdd = [] hhmm = [] vals = [] #create max possible o3 grid full_data = np.empty(n_hours) full_data[:] = -99999 for file_i in range(len(site_files)): count = 0 meta_start = -99999 start_read_1 = False start_read_2 = False with open(site_files[file_i], 'rb') as f: reader = csv.reader(f,delimiter=',') print site_files[file_i] for row in reader: #print count #break out of loop at bottom of file if (start_read_2 == True) & (row[0] == '*TABLE ENDS'): break #get metadata try: if (row[0] =='*TABLE NAME') & (row[1] == 'Site information'): meta_start = count+2 except: pass if count == meta_start: lat_i = row.index('Latitude: decimal degrees') lon_i = row.index('Longitude: decimal degrees') try: alt_i = row.index('Ground elevation: above mean sea level') except: alt_i = row.index('Ground altitude') class_i = row.index('Site land use') if count == (meta_start+6): latitude = row[lat_i] longitude = row[lon_i] altitude = row[alt_i] raw_class_name = row[class_i] #get data if start_read_2 == True: #read dates, times, and vals date = row[8] time = row[9] yymmdd.append(date[:4]+date[5:7] + date[8:]) hhmm.append(time[:2]+time[3:]) quality_code = row[13] if quality_code == 'V0': vals = np.append(vals,np.float64(row[12])) else: vals = np.append(vals,-99999) try: if (row[0] == '*TABLE NAME') & (row[1] == 'OZONE_HOURLY'): start_read_1 = True except: pass if (start_read_1 == True) & (row[0] == '*TABLE COLUMN UNITS'): unit = row[12] if (start_read_1 == True) & (row[0] == '*TABLE BEGINS'): start_read_2 = True count+=1 #convert all invalids to -99999 test_inv = vals < 0 vals[test_inv] = -99999 #put o3 vals into full grid date_con = np.array(yymmdd).astype(int) time_con = np.array(hhmm).astype(int) #find matching times between actual times and grid of times, return big array of indices of matched indices in grid converted_time = modules.date_process(date_con,time_con,start_year) converted_time = np.round(converted_time,decimals=5) syn_grid_time = np.arange(0,n_days,1./24) syn_grid_time = np.round(syn_grid_time,decimals=5) #find matching times between actual times and grid of times, return big array of indices of matched indices in grid indices = np.searchsorted(syn_grid_time, converted_time, side='left') vals = np.array(vals) #make sure no data is past end year index_test = indices < len(full_data) indices = indices[index_test] vals = vals[index_test] full_data[indices] = vals #get metadata lat = np.float64(latitude) lon = np.float64(longitude) alt = np.float64(altitude) #do data quality checks full_data,data_valid = modules.quality_check_nr(full_data,data_valid,data_resolution,np.float64(altitude),grid_dates,start_year,end_year) #set measurement method mm = 'ultraviolet photometry' #set site file resolution file_res = 'H' #set sampling as average st = 'average' anthrome_class_name = 'na' return c,full_data,data_valid,lat,lon,alt,raw_class_name,anthrome_class_name,mm,st,file_res
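#-----------------------------------------------------------------------------
# A minimal sketch of the grid-matching step used throughout these readers:
# observation times (fractional days since the start year) are mapped onto a
# synthetic hourly grid with np.searchsorted, and values are dropped into a
# -99999-initialised array at the matched positions. Names are illustrative.
import numpy as np

def place_on_hourly_grid(obs_times, obs_vals, n_days):
    grid_time = np.round(np.arange(0, n_days, 1. / 24), decimals=5)
    full_data = np.empty(len(grid_time))
    full_data[:] = -99999
    indices = np.searchsorted(grid_time, np.round(obs_times, decimals=5), side='left')
    keep = indices < len(full_data)  # guard against times past the end year
    full_data[indices[keep]] = np.asarray(obs_vals)[keep]
    return full_data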
def site_iter_process(valid_refs,c): #read files site at a time #for ref_i in range(len(valid_refs)): site_ref = valid_refs[c] all_latitudes = [] all_longitudes = [] all_altitudes = [] all_mm = [] print 'Current Ref is = ', site_ref #find if sites have full valid range from start year and finishing in end year s_files = glob.glob('/work/home/db876/observations/surface/%s/EMEP/%s*'%(species,site_ref)) year_files = [file.replace("/work/home/db876/observations/surface/%s/EMEP/"%(species), "") for file in s_files] cut_year_files = [file[8:12] for file in year_files] site_files = [] for y in year_array: for i in range(len(s_files)): if str(y) in cut_year_files[i]: site_files.append(s_files[i]) site_files = modules.natsorted(site_files) year_files = modules.natsorted(year_files) file_startdate = [] file_height = [] instr_names = [] file_lasttime = [] data_valid = True yyyymmdd = [] hhmm = [] vals = [] flags = [] #create max possible o3 grid full_data = np.empty(n_hours) full_data[:] = -99999 if site_files == []: print 'No valid files for site\n' return for y in year_array: print 'Processing Year %s'%y got_year = False for file in site_files: last_file_split = file.split('/')[-1] if str(y) in last_file_split[8:12]: got_year = True break if got_year == False: #fill in data for missing year timedelta_diff = datetime.date(y+1, 1, 1) - datetime.date(y, 1, 1) ndays_missing = timedelta_diff.days print 'ndays missing = ', ndays_missing continue if data_valid == True: data_start = 9999999 count = 0 start_read = False with open(file, 'rb') as f: read_count = 0 reader = csv.reader(f,delimiter=' ') print file for row in reader: try: row = filter(lambda a: a != '', row) except: pass try: row = filter(lambda a: a != ',', row) except: pass #get start date of file if row[0] == 'Startdate:': data = row[1] s_yyyy = data[:4] s_mm = data[4:6] s_dd = data[6:8] s_hh = data[8:10] s_min = data[10:12] start_datetime = datetime.datetime(int(s_yyyy),1,1,0,0) #get unit if row[0] == 'Unit:': try: unit_part1 = row[1] unit_part2 = row[2] unit = unit_part1+'_'+unit_part2 except: unit = row[1] #get resolution if row[0] == 'Resolution': if row[1] == 'code:': file_res = row[2] print 'Resolution = %s'%file_res if (output_res == 'H'): if (file_res == '1d') or (file_res == '1mo'): print 'File resolution has to be Minimum Hourly. Skipping' data_valid = False return c,full_data,data_valid,-999,-999,-999,'na','na','na','na','na' elif (output_res == 'D'): if (file_res == '1mo'): print 'File resolution has to be Minimum Daily. Skipping' data_valid = False return c,full_data,data_valid,-999,-999,-999,'na','na','na','na','na' #get latitude if row[0] == 'Station': if row[1] == 'latitude:': latitude = row[2] all_latitudes.append(latitude) #get longitude if row[0] == 'Station': if row[1] == 'longitude:': longitude = row[2] all_longitudes.append(longitude) #get altitude if row[0] == 'Station': if row[1] == 'altitude:': altitude = row[2][:-1] all_altitudes.append(altitude) #get period if row[0] == 'Period': period_code = row[2] #get stats method if row[0] == 'Statistics:': try: st = row[1] + row[2] if st != 'arithmeticmean': print 'Not Arithmetic Mean!' print row[1] print 1+'a' except: print 'Not Arithmetic Mean!' 
print row[1] print 1+'a' #get instrument method if row[0] == 'Instrument': if row[1] == 'type:': mm_list = row[2:] if len(mm_list) > 1: site_mm = '' for x in range(len(mm_list)): site_mm = site_mm+mm_list[x]+' ' site_mm = site_mm.strip() else: site_mm = mm_list[0] all_mm.append(site_mm) #get data if start_read == True: #calc dates, times, and take o3 vals time_since_start = np.float64(row[0]) days_since_start = math.trunc(time_since_start) remainder = time_since_start - days_since_start unrounded_hour = remainder*24 hour = np.round(unrounded_hour) time_delta = datetime.timedelta(days = days_since_start,hours = hour) calc_datetime = start_datetime + time_delta calc_yyyymmdd = calc_datetime.strftime("%Y%m%d") calc_hhmm = calc_datetime.strftime("%H%M") line_val = np.float64(row[2]) #convert units by line (only if value is >= than 0 if line_val >= 0: if (unit.lower() != 'ppb') & (unit.lower() != 'ppbv'): if unit == 'ug/m3': #print 'converting units, temp = 20degC' #calculate conversion factor from mg/m3 assuming 20 degC and 1 atm - default for O3 instruments #R/MW*(TEMP0C(K)*TEMP(degC)/P(hPa)/10 conv_fact = 8.3144/mol_mass*(273.15+20)/(1013.25/10) line_val = conv_fact*line_val #print 'Converting Units from ug/m3 20degC to ppbv' elif unit == 'ug_N/m3': conv_fact = 8.3144/mol_mass*(273.15+20)/(1013.25/10) line_val = conv_fact*line_val #print 'Converting Units from ug/Nm3 20degC to ppbv' elif (unit == 'ppm') or (unit == 'ppmv'): line_val = line_val*1e3 #print 'Converting Units from ppmv to ppbv' elif (unit == 'ppt') or (unit == 'pptv'): line_val = line_val/1e3 #print 'Converting Units from pptv to ppbv' else: print 'Unknown Unit' data_valid = False 1+'a' if file_res == '1h': yyyymmdd=np.append(yyyymmdd,calc_yyyymmdd) hhmm=np.append(hhmm,calc_hhmm) vals = np.append(vals,line_val) flags = np.append(flags,np.float64(row[3])) elif file_res == '1d': yyyymmdd=np.append(yyyymmdd,calc_yyyymmdd) hhmm=np.append(hhmm,'0000') vals = np.append(vals,line_val) flags = np.append(flags,np.float64(row[3])) for j in range(1,24): time_delta = datetime.timedelta(days = days_since_start,hours = j) calc_datetime = start_datetime + time_delta vals = np.append(vals,vals[-1]) flags = np.append(flags,flags[-1]) yyyymmdd = np.append(yyyymmdd,calc_datetime.strftime("%Y%m%d")) hhmm = np.append(hhmm,calc_datetime.strftime("%H%M")) elif file_res == '1mo': yyyymmdd=np.append(yyyymmdd,calc_yyyymmdd) hhmm=np.append(hhmm,'0000') vals = np.append(vals,line_val) flags = np.append(flags,np.float64(row[3])) month_days = monthrange(int(yyyymmdd[-1][:4]), int(yyyymmdd[-1][4:6]))[1] for j in range(1,24*month_days): time_delta = datetime.timedelta(days = days_since_start,hours = j) calc_datetime = start_datetime + time_delta vals = np.append(vals,vals[-1]) flags = np.append(flags,flags[-1]) yyyymmdd = np.append(yyyymmdd,calc_datetime.strftime("%Y%m%d")) hhmm = np.append(hhmm,calc_datetime.strftime("%H%M")) if row[0] == 'starttime': start_read = True count+=1 if (y == year_array[-1]): #convert all invalids by flags to -99999 test_inv = flags != 0 if len(test_inv) != 0: vals[test_inv] = -99999 #any values less than zero are -99999 test_inv = vals < 0 if len(test_inv) != 0: vals[test_inv] = -99999 #do additional invalid test, as flags not always correct #test_inv_2 = vals > 300 #vals[test_inv_2] = -99999 #put o3 vals into full grid date_con = np.array(yyyymmdd).astype(int) time_con = np.array(hhmm).astype(int) #find matching times between actual times and grid of times, return big array of indices of matched indices in grid converted_time = 
date_process(date_con,time_con,start_year) converted_time = np.round(converted_time,decimals=5) syn_grid_time = np.arange(0,n_days,1./24) syn_grid_time = np.round(syn_grid_time,decimals=5) #find matching times between actual times and grid of times, return big array of indices of matched indices in grid indices = np.searchsorted(syn_grid_time, converted_time, side='left') vals = np.array(vals) #make sure no data is past end year index_test = indices < len(full_data) indices = indices[index_test] vals = vals[index_test] full_data[indices] = vals #get mode of metadata lat = np.float64(stats.mode(all_latitudes)[0][0]) lon = np.float64(stats.mode(all_longitudes)[0][0]) alt = np.float64(stats.mode(all_altitudes)[0][0]) mm = stats.mode(all_mm)[0][0] #check site is not urban using anthrome map from 2000 anthfile = '/work/home/db876/plotting_tools/core_tools/anthro2_a2000.nc' anthload = Dataset(anthfile) class_valid,anthrome_class_name = modules.anthrome_classify(anthload,[lat],[lon]) if class_valid == 'invalid': data_valid = False print 'Site Invalid, site classed as urban by anthrome map.' #get measurement method if (mm == 'uv_abs') or (mm == 'chemiluminesc') or (mm == 'uv_fluoresc'): if species == 'O3': mm = 'ultraviolet photometry' if (species == 'NO') or (species == 'NO2') or (species == 'CO'): mm = 'chemiluminescence' elif (mm == 'ndir') or (mm == 'infrared_absorption'): mm = 'non-dispersive infrared spectroscopy' elif (mm == 'GC-HgO'): mm = 'gas chromatography reduction gas detection' elif (mm == 'tracegas_monitor'): mm = 'cavity attenuated phase shift spectroscopy' elif (mm == 'filter_1pack') or (mm == 'filter_2pack') or (mm == 'filter_3pack'): if species == 'NO2': mm = 'griess saltzman colorimetric' elif species == 'CO': mm = 'ion chromatography' elif (mm == 'steel_canister'): mm = 'gas chromatography flame ionisation detection' elif (mm == 'online_gc'): mm = 'online gas chromatography' elif (mm == 'glass_sinter') or (mm == 'abs_solution') or (mm == 'filter_abs_solution') or (mm == 'abs_tube') or (mm == 'continuous_colorimetric'): mm = 'griess saltzman colorimetric' elif (mm == 'NaJ_solution'): mm = 'flame ionisation detection' elif (mm == 'doas'): mm = 'differential optical absorption spectrosocopy' elif (mm == 'diffusion_tube'): mm = 'diffusive sampler' elif (mm == 'NA') or (mm == ''): if species == 'O3': mm = 'ultraviolet photometry' if species == 'CO': mm = 'non-dispersive infrared spectroscopy' if species == 'NO2': mm = 'chemiluminescence' if species == 'NO': mm = 'chemiluminescence' if species == 'ISOP': mm = 'gas chromatography flame ionisation detection' else: print mm 1+'a' #do data quality checks full_data,data_valid = modules.quality_check(full_data,data_valid,data_resolution,alt,grid_dates,start_year,end_year) #convert file res to standard format if file_res == '1h': file_res = 'H' elif file_res == '1d': file_res = 'D' elif file_res == '1mo': file_res = 'M' #no raw class so set as na raw_class_name = 'na' #set sampling as average st = 'average' return c,full_data,data_valid,lat,lon,alt,raw_class_name,anthrome_class_name,mm,st,file_res
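#-----------------------------------------------------------------------------
# The long elif chain above normalises raw EMEP instrument codes to standard
# measurement-method names; the species-independent part of that mapping can
# be written as a lookup table, sketched here. Only codes visible above are
# included, and species-dependent codes (e.g. 'uv_abs', 'filter_1pack', 'NA')
# still need the branching logic, so treat this as an illustrative
# restructuring rather than a full replacement.
EMEP_METHOD_MAP = {
    'ndir': 'non-dispersive infrared spectroscopy',
    'infrared_absorption': 'non-dispersive infrared spectroscopy',
    'GC-HgO': 'gas chromatography reduction gas detection',
    'tracegas_monitor': 'cavity attenuated phase shift spectroscopy',
    'steel_canister': 'gas chromatography flame ionisation detection',
    'online_gc': 'online gas chromatography',
    'glass_sinter': 'griess saltzman colorimetric',
    'abs_solution': 'griess saltzman colorimetric',
    'filter_abs_solution': 'griess saltzman colorimetric',
    'abs_tube': 'griess saltzman colorimetric',
    'continuous_colorimetric': 'griess saltzman colorimetric',
    'NaJ_solution': 'flame ionisation detection',
    'doas': 'differential optical absorption spectroscopy',
    'diffusion_tube': 'diffusive sampler',
}

# usage (illustrative): mm = EMEP_METHOD_MAP.get(mm, mm)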
def site_iter_process(valid_refs, c): #set local counts inv_nometa = 0 inv_anyvaliddata = 0 inv_nokeymeta = 0 inv_resolution = 0 inv_badmeasurementmethod = 0 n_all = 0 n_after_nometa = 0 n_after_flagsandlod = 0 n_after_duplicate = 0 n_after_anyvaliddata = 0 n_after_nokeymeta = 0 n_after_resolution = 0 n_after_badmeasurementmethod = 0 #set local unknown lists unknown_mm_list = [] unknown_mm_refs_list = [] unknown_local_tz_list = [] ref = valid_refs[c] print 'ref = ', ref, c #get site instrument for species met_i = file_refs.index(ref) file_name = met_refs[met_i] site_name = met_sitenames[met_i] print site_name site_species = list(met_species[met_i]) print site_species site_instruments = list(met_instruments[met_i]) m_method = site_instruments[site_species.index(species)] site_resolutions = [] data_valid = True s_files = insensitive_glob( '/work/home/db876/observations/surface/%s/EANET/*%s.csv' % (fname_species, file_name)) site_files = [] for y in year_array: for f in s_files: if str(y)[-2:] in f: site_files.append(f) site_files = modules.natsorted(site_files) years = [] months = [] days = [] hours = [] vals = [] yyyymmdd = [] hhmm = [] n_dup_array = [] last_year_index = len(site_files) for y in year_array: got_year = False for file in site_files: last_file_split = file.split('/')[-1] if str(y)[2:] in last_file_split: got_year = True break if got_year == False: timedelta_diff = datetime.date(y + 1, 1, 1) - datetime.date( y, 1, 1) ndays_missing = timedelta_diff.days continue print file valid = True with open(file, 'rb') as f: reader = csv.reader(f, delimiter=',') counter = 0 #get resolution for row in reader: if counter == 0: all_units = row elif counter == 1: file_res = 'H' try: hour_index = row.index('Hour') except: file_res = 'D' try: day_index = row.index('Day') except: file_res = 'M' month_index = row.index('Month') year_index = row.index('Year') try: spec_index = row.index(species.upper()) unit = all_units[spec_index] except: valid = False break #make sure each year units are ppb if unit != 'ppb': print 'Units not ppb!' 
1 + 'a' if counter == 2: if file_res == 'H': yyyy = row[year_index] mm = row[month_index] dd = row[day_index] hh = row[hour_index] elif file_res == 'D': yyyy = row[year_index] mm = row[month_index] dd = row[day_index] hh = 1 elif file_res == 'M': yyyy = row[year_index] mm = row[month_index] dd = 1 hh = 1 start_datetime = datetime.datetime(int(yyyy), int(mm), int(dd), int(hh)) if counter == 3: if file_res == 'H': yyyy = row[year_index] mm = row[month_index] dd = row[day_index] hh = row[hour_index] elif file_res == 'D': yyyy = row[year_index] mm = row[month_index] dd = row[day_index] hh = 1 elif file_res == 'M': yyyy = row[year_index] mm = row[month_index] dd = 1 hh = 1 present_datetime = datetime.datetime( int(yyyy), int(mm), int(dd), int(hh)) time_delt = present_datetime - start_datetime hour_delt = datetime.timedelta(hours=1) day_delt = datetime.timedelta(hours=24) week_delt = datetime.timedelta(hours=24 * 7) month_delt = datetime.timedelta(hours=24 * 28) print time_delt if (time_delt < day_delt): print 'Hourly Data' file_res = 'H' site_resolutions.append(file_res) elif (time_delt > hour_delt) & (time_delt < week_delt): print 'Daily Data' file_res = 'D' site_resolutions.append(file_res) elif (time_delt > week_delt): print 'Monthly Data' file_res = 'M' site_resolutions.append(file_res) counter += 1 #READ IN DATA if valid == True: #limit to sites with hourly date files for, if required if output_res == 'H': if file_res != 'H': print 'Not processing as only want hourly files' continue if output_res == 'HD': if file_res == 'M': print 'Not processing as only want hourly and daily files' continue with open(file, 'rb') as f: reader = csv.reader(f, delimiter=',') counter = 0 val_count = 0 for row in reader: if counter >= 2: yyyy = row[year_index] mm = row[month_index] #add to n_obs_all n_all += 1 n_after_nometa += 1 if file_res == 'H': try: vals = np.append(vals, np.float64(row[spec_index])) except: vals = np.append(vals, -99999) current_datetime = present_datetime + relativedelta( hours=val_count) yyyymmdd.append( current_datetime.strftime("%Y%m%d")) hhmm.append(current_datetime.strftime("%H%M")) n_dup_array = np.append(n_dup_array, 0) elif file_res == 'D': try: vals = np.append( vals, [np.float64(row[spec_index])] * 24) except: vals = np.append(vals, [-99999] * 24) current_datetime = present_datetime + relativedelta( days=val_count) next_datetime = present_datetime + relativedelta( days=val_count + 1) all_datetimes = pd.date_range(current_datetime, next_datetime, freq='H')[:-1] for d in all_datetimes: yyyymmdd.append(d.strftime("%Y%m%d")) hhmm.append(d.strftime("%H%M")) n_dup_array = np.append(n_dup_array, 0) n_dup_array = np.append(n_dup_array, [1] * 23) elif file_res == 'M': month_days = monthrange(int(yyyy), int(mm))[1] try: vals = np.append( vals, [np.float64(row[spec_index])] * (month_days * 24)) except: vals = np.append(vals, [-99999] * (month_days * 24)) current_datetime = present_datetime + relativedelta( months=int(mm) - 1) next_datetime = present_datetime + relativedelta( months=int(mm)) all_datetimes = pd.date_range(current_datetime, next_datetime, freq='H')[:-1] for d in all_datetimes: yyyymmdd.append(d.strftime("%Y%m%d")) hhmm.append(d.strftime("%H%M")) n_dup_array = np.append(n_dup_array, 0) n_dup_array = np.append(n_dup_array, [1] * ((month_days * 24) - 1)) val_count += 1 counter += 1 else: print 'Species is not in file header. 
Skipping Year' timedelta_diff = datetime.date(y + 1, 1, 1) - datetime.date( y, 1, 1) ndays_missing = timedelta_diff.days print 'ndays missing = ', ndays_missing #test if have no data due to not required time resolution, if so exit if len(vals) == 0: n_all, inv_nometa, n_after_nometa, n_after_flagsandlod, n_after_duplicate, inv_anyvaliddata, n_obs_after_anyvaliddata, inv_nokeymeta, n_obs_after_nokeymeta, inv_resolution, n_obs_after_resolution, inv_badmeasurementmethod, n_obs_after_badmeasurementmethod = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 exit_c_list = np.array([ inv_nometa, inv_anyvaliddata, inv_nokeymeta, inv_resolution, inv_badmeasurementmethod ]) n_c_list = np.array([ n_all, n_after_nometa, n_after_flagsandlod, n_after_duplicate, n_after_anyvaliddata, n_after_nokeymeta, n_after_resolution, n_after_badmeasurementmethod ]) unknown_list = [ unknown_mm_list, unknown_mm_refs_list, unknown_local_tz_list ] meta = [ 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na' ] return c, ['na'], ['na'], [ 'na' ], False, meta, exit_c_list, n_c_list, unknown_list, 'nothourly', np.zeros( 0) #create max possible grid full_data = np.empty(n_hours) full_data_after_flagsandlod = np.empty(n_hours) big_n_dup_array = np.zeros(n_hours) full_data[:] = -99999 full_data_after_flagsandlod[:] = -99999 #convert blank values to -99999 test_inv = vals == '' vals[test_inv] = -99999 #convert number invalids to -99999 test_inv = vals < 0 vals[test_inv] = -99999 #if all site resolutions are same continue then take first file_res all_same = all(x == site_resolutions[0] for x in site_resolutions) if all_same == True: file_res = site_resolutions[0] else: #otherwise take lowest frequency res as file_res if 'M' in site_resolutions: file_res = 'M' elif 'D' in site_resolutions: file_res = 'D' else: file_res = 'H' #get meta i_ref = file_refs.index(ref) site_ref = ref data_tz = np.float32(met_tz[i_ref]) all_tz = [data_tz] lat = np.float32(met_lats[i_ref]) lon = np.float32(met_lons[i_ref]) alt = np.float32(met_alts[i_ref]) raw_class_name = met_class[i_ref] country = met_country[i_ref] unit = str(unit) contact = 'Ayako Aoyagi, Asia Center for Air Pollution Research, [email protected]' #adjust dates and times if tz is not equal to 0 tz = int(data_tz) if tz != 0: for i in range(len(yyyymmdd)): #create datetime dt = datetime.datetime(int(yyyymmdd[i][:4]), int(yyyymmdd[i][4:6]), int(yyyymmdd[i][6:]), int(hhmm[i][:2]), int(hhmm[i][2:])) if tz > 0: dt = dt - datetime.timedelta(hours=int(tz)) elif tz < 0: dt = dt + datetime.timedelta(hours=np.abs(int(tz))) yyyymmdd[i] = dt.strftime("%Y%m%d") hhmm[i] = dt.strftime("%H%M") #put vals into full grid date_con = np.array(yyyymmdd).astype(int) time_con = np.array(hhmm).astype(int) #remove data < 1970 and >= 2015 test_inds = (date_con >= 19700101) & (date_con < 20150101) date_con = date_con[test_inds] time_con = time_con[test_inds] vals = vals[test_inds] n_dup_array = n_dup_array[test_inds] #set st_big and mm_big st_big = ['continuous'] * len(vals) mm_big = [m_method] * len(vals) #get obs valid test = vals >= 0 valid_hours_dup = np.sum(n_dup_array[test]) n_obs_valid = int(len(vals[test]) - valid_hours_dup) n_after_flagsandlod += n_obs_valid #find matching times between actual times and grid of times, return big array of indices of matched indices in grid converted_time = modules.date_process(date_con, time_con, start_year) converted_time = np.round(converted_time, decimals=5) syn_grid_time = np.arange(0, n_days, 1. 
/ 24) syn_grid_time = np.round(syn_grid_time, decimals=5) raw_indices = np.searchsorted(syn_grid_time, converted_time, side='left') vals = np.array(vals) full_data_after_flagsandlod[raw_indices] = vals raw_st = np.copy(st_big) raw_mm = np.copy(mm_big) # test and remove duplicate and overlap points converted_time, vals, mm_big, st_big, n_dup_array = modules.remove_duplicate_points( site_ref, converted_time, vals, mm_big, st_big, n_dup_array, output_res) test = vals >= 0 valid_hours_dup = np.sum(n_dup_array[test]) n_obs_valid = int(len(vals[test]) - valid_hours_dup) n_after_duplicate += n_obs_valid #find matching times between actual times and grid of times, return big array of indices of matched indices in grid indices = np.searchsorted(syn_grid_time, converted_time, side='left') full_data[indices] = vals big_n_dup_array[indices] = n_dup_array key_meta = [lat, lon, alt] #get sampling/instrument grids raw_st_grid, p_st_grid, p_st_grid_after_flagsandlod, raw_mm_grid, p_mm_grid, p_mm_grid_after_flagsandlod, unknown_mm_list, unknown_mm_refs_list = modules.sampling_and_instruments_by_processgroup( site_ref, process_group, species, raw_st, raw_mm, full_data_after_flagsandlod, full_data, raw_indices, unknown_mm_list, unknown_mm_refs_list, no2_type) #do quality checks data_valid, full_data, valid_hours_dup, p_st_grid, p_mm_grid, n_all, inv_nometa, n_after_nometa, n_after_flagsandlod, n_after_duplicate, inv_anyvaliddata, n_after_anyvaliddata, inv_nokeymeta, n_after_nokeymeta, inv_resolution, n_after_resolution, inv_badmeasurementmethod, n_after_badmeasurementmethod, exit_r = modules.primary_quality_control( site_ref, species, file_res, no2_type, grid_dates, full_data, big_n_dup_array, valid_hours_dup, raw_st_grid, p_st_grid, p_st_grid_after_flagsandlod, raw_mm_grid, p_mm_grid, p_mm_grid_after_flagsandlod, data_resolution, n_obs_valid, key_meta, n_all, inv_nometa, n_after_nometa, n_after_flagsandlod, n_after_duplicate, inv_anyvaliddata, n_after_anyvaliddata, inv_nokeymeta, n_after_nokeymeta, inv_resolution, n_after_resolution, inv_badmeasurementmethod, n_after_badmeasurementmethod) if data_valid == False: exit_c_list = np.array([ inv_nometa, inv_anyvaliddata, inv_nokeymeta, inv_resolution, inv_badmeasurementmethod ]) n_c_list = np.array([ n_all, n_after_nometa, n_after_flagsandlod, n_after_duplicate, n_after_anyvaliddata, n_after_nokeymeta, n_after_resolution, n_after_badmeasurementmethod ]) unknown_list = [ unknown_mm_list, unknown_mm_refs_list, unknown_local_tz_list ] meta = [ lat, lon, alt, 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na' ] return c, ['na'], ['na'], [ 'na' ], False, meta, exit_c_list, n_c_list, unknown_list, exit_r, np.zeros( 1) #make tz int after checks data_tz = np.float32(data_tz) #set processed unit p_unit = 'pbbv' #get local timezone try: local_tz_name = tz_root.tzNameAt(lat, lon, forceTZ=True) pytz_obj = pytz.timezone(local_tz_name) datetime_offset = pytz_obj.utcoffset(datetime.datetime(2000, 1, 1)) if datetime_offset < datetime.timedelta(0): local_tz = -(24 - int(datetime_offset.seconds / 60 / 60)) else: local_tz = int(datetime_offset.seconds / 60 / 60) except: local_tz = 'na' print 'TIMEZONE NOT KNOWN, SITE IS %s' % (site_ref) unknown_local_tz_list.append(site_ref) #pack meta meta = [ lat, lon, alt, raw_class_name, file_res, unit, p_unit, data_tz, local_tz, site_name, country, contact ] #if blank strings in meta then convert to 'na' for i in range(len(meta)): try: if meta[i].strip() == '': meta[i] = 'na' except: pass print set(raw_st_grid) print set(raw_mm_grid) print 
set(p_st_grid) print set(p_mm_grid) print meta exit_c_list = np.array([ inv_nometa, inv_anyvaliddata, inv_nokeymeta, inv_resolution, inv_badmeasurementmethod ]) n_c_list = np.array([ n_all, n_after_nometa, n_after_flagsandlod, n_after_duplicate, n_after_anyvaliddata, n_after_nokeymeta, n_after_resolution, n_after_badmeasurementmethod ]) print 'exit counts = ', exit_c_list print 'n obs counts = ', n_c_list unknown_list = [ unknown_mm_list, unknown_mm_refs_list, unknown_local_tz_list ] return c, full_data, p_st_grid, p_mm_grid, data_valid, meta, exit_c_list, n_c_list, unknown_list, 'na', big_n_dup_array
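#-----------------------------------------------------------------------------
# A minimal sketch of the local-time to UTC adjustment performed in the
# function above: per-observation "YYYYMMDD" / "HHMM" strings are shifted by
# the site's integer UTC offset with datetime arithmetic (UTC = local - tz).
# Names are illustrative.
import datetime

def shift_timestamps_to_utc(yyyymmdd, hhmm, tz):
    out_dates, out_times = [], []
    for d, t in zip(yyyymmdd, hhmm):
        dt = datetime.datetime(int(d[:4]), int(d[4:6]), int(d[6:]), int(t[:2]), int(t[2:]))
        dt = dt - datetime.timedelta(hours=int(tz))
        out_dates.append(dt.strftime("%Y%m%d"))
        out_times.append(dt.strftime("%H%M"))
    return out_dates, out_times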