def AhfromRH(ds,Ah_out,RH_in,Ta_in):
    """
    Purpose:
     Calculate absolute humidity from relative humidity and air temperature
     and add the result to the data structure as a new series.
     Nothing is calculated when either input series is missing from the
     data structure or when the requested output series already exists.
    Usage:
     qcfunc.AhfromRH(ds,"Ah_HMP_2m","RH_HMP_2m","Ta_HMP_2m")
    Returns:
     1 when the output series was created, 0 when it was not.
    Author: PRI
    Date: September 2015
    """
    # every input series must be present in the data structure
    for required in (RH_in, Ta_in):
        if required in ds.series.keys():
            continue
        log.error("".join([" AhfromRH: Requested series ", required,
                           " not found, ", Ah_out, " not calculated"]))
        return 0
    # an existing output series is never overwritten
    if Ah_out in ds.series.keys():
        log.error("".join([" AhfromRH: Output series ", Ah_out,
                           " already exists, skipping ..."]))
        return 0
    # fetch the inputs as masked arrays; only the RH attributes are used later
    rh, _, rh_attr = qcutils.GetSeriesasMA(ds, RH_in)
    ta, _, _ = qcutils.GetSeriesasMA(ds, Ta_in)
    ah = mf.absolutehumidityfromRH(ta, rh)
    # the output takes its height attribute from the RH series
    description = "Absolute humidity calculated from "+RH_in+" and "+Ta_in
    ah_attr = qcutils.MakeAttributeDictionary(long_name=description,
                                              height=rh_attr["height"],
                                              units="g/m3")
    qcutils.CreateSeries(ds, Ah_out, ah, FList=[RH_in, Ta_in], Attr=ah_attr)
    return 1
def MRfromRH(ds, MR_out, RH_in, Ta_in, ps_in):
    """
    Purpose:
     Calculate H2O mixing ratio from relative humidity, air temperature and
     pressure and add the result to the data structure as a new series.
     Nothing is calculated when any input series is missing from the data
     structure or when the requested output series already exists.
    Returns:
     1 when the output series was created, 0 when it was not.
    """
    # flag templates sized from the record count in the global attributes
    n_records = int(ds.globalattributes["nc_nrecs"])
    flag_unmasked = numpy.zeros(n_records, dtype=numpy.int32)
    flag_masked = numpy.ones(n_records, dtype=numpy.int32)
    # every input series must be present in the data structure
    for required in (RH_in, Ta_in, ps_in):
        if required in ds.series.keys():
            continue
        logger.error("".join([" MRfromRH: Requested series ", required,
                              " not found, ", MR_out, " not calculated"]))
        return 0
    # an existing output series is never overwritten
    if MR_out in ds.series.keys():
        logger.error("".join([" MRfromRH: Output series ", MR_out,
                              " already exists, skipping ..."]))
        return 0
    # RH and Ta give absolute humidity, which is then converted to mmol/mol
    rh, _, rh_attr = pfp_utils.GetSeriesasMA(ds, RH_in)
    ta, _, _ = pfp_utils.GetSeriesasMA(ds, Ta_in)
    ah = pfp_mf.absolutehumidityfromRH(ta, rh)
    ps, _, _ = pfp_utils.GetSeriesasMA(ds, ps_in)
    mixing_ratio = pfp_mf.h2o_mmolpmolfromgpm3(ah, ta, ps)
    # the output takes its height attribute from the RH series
    description = ("H2O mixing ratio calculated from " + RH_in + ", " + Ta_in
                   + " and " + ps_in)
    mr_attr = pfp_utils.MakeAttributeDictionary(long_name=description,
                                                height=rh_attr["height"],
                                                units="mmol/mol")
    # flag is 1 where the result is masked, 0 elsewhere
    is_masked = numpy.ma.getmaskarray(mixing_ratio)
    mr_flag = numpy.where(is_masked, flag_masked, flag_unmasked)
    pfp_utils.CreateSeries(ds, MR_out, mixing_ratio, mr_flag, mr_attr)
    return 1
def AhfromRH(ds, Ah_out, RH_in, Ta_in):
    """
    Purpose:
     Calculate absolute humidity from relative humidity and air temperature
     and add the result to the data structure as a new series.
     Nothing is calculated when either input series is missing from the
     data structure or when the requested output series already exists.
    Usage:
     pfp_func.AhfromRH(ds,"Ah_HMP_2m","RH_HMP_2m","Ta_HMP_2m")
    Returns:
     1 when the output series was created, 0 when it was not.
    Author: PRI
    Date: September 2015
    """
    # flag templates sized from the record count in the global attributes
    n_records = int(ds.globalattributes["nc_nrecs"])
    flag_unmasked = numpy.zeros(n_records, dtype=numpy.int32)
    flag_masked = numpy.ones(n_records, dtype=numpy.int32)
    # every input series must be present in the data structure
    for required in (RH_in, Ta_in):
        if required in ds.series.keys():
            continue
        logger.error("".join([" AhfromRH: Requested series ", required,
                              " not found, ", Ah_out, " not calculated"]))
        return 0
    # an existing output series is never overwritten
    if Ah_out in ds.series.keys():
        logger.error("".join([" AhfromRH: Output series ", Ah_out,
                              " already exists, skipping ..."]))
        return 0
    # fetch the inputs as masked arrays; only the RH attributes are used later
    rh, _, rh_attr = pfp_utils.GetSeriesasMA(ds, RH_in)
    ta, _, _ = pfp_utils.GetSeriesasMA(ds, Ta_in)
    ah = pfp_mf.absolutehumidityfromRH(ta, rh)
    # the output takes its height attribute from the RH series
    description = "Absolute humidity calculated from "+RH_in+" and "+Ta_in
    ah_attr = pfp_utils.MakeAttributeDictionary(long_name=description,
                                                height=rh_attr["height"],
                                                units="g/m3")
    # flag is 1 where the result is masked, 0 elsewhere
    is_masked = numpy.ma.getmaskarray(ah)
    ah_flag = numpy.where(is_masked, flag_masked, flag_unmasked)
    pfp_utils.CreateSeries(ds, Ah_out, ah, ah_flag, ah_attr)
    return 1
def percent_to_mmolpmol(ds, MF_out, RH_in, Ta_in, ps_in):
    """
    Purpose:
     Calculate H2O mole fraction from relative humidity (RH), air
     temperature and pressure.
     The inputs are converted to percent, degC and kPa respectively before
     use.  The output variable is read from the data structure, filled and
     written back, so it is expected to exist already (it is created in
     pfp_ts.DoFunctions()).
    Returns:
     1 when the output variable was updated, 0 when an input series is
     missing from the data structure.
    """
    # flag templates sized from the record count in the global attributes
    n_records = int(ds.globalattributes["nc_nrecs"])
    flag_unmasked = numpy.zeros(n_records, dtype=numpy.int32)
    flag_masked = numpy.ones(n_records, dtype=numpy.int32)
    # every input series must be present in the data structure
    for required in (RH_in, Ta_in, ps_in):
        if required in ds.series.keys():
            continue
        logger.error("".join([" Requested series ", required, " not found, ",
                              MF_out, " not calculated"]))
        return 0
    # relative humidity, converted to percent
    rh_var = pfp_utils.convert_units_func(
        ds, pfp_utils.GetVariable(ds, RH_in), "percent")
    # air temperature, converted to degC
    ta_var = pfp_utils.convert_units_func(
        ds, pfp_utils.GetVariable(ds, Ta_in), "degC")
    # absolute humidity from temperature and relative humidity
    ah = pfp_mf.absolutehumidityfromRH(ta_var["Data"], rh_var["Data"])
    # atmospheric pressure, converted to kPa
    ps_var = pfp_utils.convert_units_func(
        ds, pfp_utils.GetVariable(ds, ps_in), "kPa")
    # fetch the existing output variable and fill it in
    mf_var = pfp_utils.GetVariable(ds, MF_out)
    mf_var["Data"] = pfp_mf.h2o_mmolpmolfromgpm3(ah, ta_var["Data"],
                                                 ps_var["Data"])
    # flag is 1 where the result is masked, 0 elsewhere
    is_masked = numpy.ma.getmaskarray(mf_var["Data"])
    mf_var["Flag"] = numpy.where(is_masked, flag_masked, flag_unmasked)
    mf_var["Attr"]["units"] = "mmol/mol"
    # write the output variable back into the data structure
    pfp_utils.CreateVariable(ds, mf_var)
    return 1
def get_absolutehumidity(ds_60minutes):
    """
    Purpose:
     Calculate absolute humidity from air temperature and relative humidity
     for each of the nine label suffixes "00" to "22" and store each result
     in the data structure as "Ah_<suffix>".
     The flag written with each output is the one returned with the
     matching RH series.
    """
    # suffixes in the same order as a nested i (outer) / j (inner) loop
    suffixes = [str(row)+str(col) for row in range(3) for col in range(3)]
    for suffix in suffixes:
        ta, _, _ = qcutils.GetSeriesasMA(ds_60minutes, "Ta_"+suffix)
        rh, rh_flag, _ = qcutils.GetSeriesasMA(ds_60minutes, "RH_"+suffix)
        ah = mf.absolutehumidityfromRH(ta, rh)
        ah_attr = qcutils.MakeAttributeDictionary(long_name='Absolute humidity',
                                                  units='g/m3',
                                                  standard_name='not defined')
        qcutils.CreateSeries(ds_60minutes, "Ah_"+suffix, ah,
                             Flag=rh_flag, Attr=ah_attr)
def percent_to_gH2Opm3(ds, AH_out, RH_in, Ta_in):
    """
    Purpose:
     Calculate absolute humidity from relative humidity and air
     temperature.
     The inputs are converted to percent and degC respectively before use.
     The output variable is read from the data structure, filled and written
     back, so it is expected to exist already.
     NOTE(review): presumably the output is created by the caller before
     this function runs - confirm.
    Usage:
     percent_to_gH2Opm3(ds,"AH_HMP_2m","RH_HMP_2m","Ta_HMP_2m")
    Returns:
     1 when the output variable was updated, 0 when an input series is
     missing from the data structure.
    Author: PRI
    Date: September 2015
    """
    # flag templates sized from the record count in the global attributes
    n_records = int(ds.globalattributes["nc_nrecs"])
    flag_unmasked = numpy.zeros(n_records, dtype=numpy.int32)
    flag_masked = numpy.ones(n_records, dtype=numpy.int32)
    # every input series must be present in the data structure
    for required in (RH_in, Ta_in):
        if required in ds.series.keys():
            continue
        logger.error("".join([" Requested series ", required, " not found, ",
                              AH_out, " not calculated"]))
        return 0
    # relative humidity, converted to percent
    rh_var = pfp_utils.convert_units_func(
        ds, pfp_utils.GetVariable(ds, RH_in), "percent")
    # air temperature, converted to degC
    ta_var = pfp_utils.convert_units_func(
        ds, pfp_utils.GetVariable(ds, Ta_in), "degC")
    # fetch the existing output variable and fill it in
    ah_var = pfp_utils.GetVariable(ds, AH_out)
    ah_var["Data"] = pfp_mf.absolutehumidityfromRH(ta_var["Data"],
                                                   rh_var["Data"])
    # flag is 1 where the result is masked, 0 elsewhere
    is_masked = numpy.ma.getmaskarray(ah_var["Data"])
    ah_var["Flag"] = numpy.where(is_masked, flag_masked, flag_unmasked)
    ah_var["Attr"]["units"] = "g/m^3"
    # write the output variable back into the data structure
    pfp_utils.CreateVariable(ds, ah_var)
    return 1
def read_isd_file(isd_file_path):
    """
    Purpose:
     Reads an ISD file (gz or uncompressed) and returns the data in a data
     structure.
     Each record is decoded from fixed character columns.  Records whose
     observation code is not in the accepted list are skipped.  Fields that
     can not be decoded as numbers are replaced by the missing value for
     that variable and are masked (flag = 1) in the output.
     Variables written to the data structure: DateTime, Wd, Ws, Ta, Td, ps
     (converted to kPa and adjusted from sea level to station altitude),
     Precip (values outside 0 to 100 are masked), RH, Ah, q and the U and V
     variables returned by qcutils.convert_WSWDtoUV.
    Assumptions:
     The site id is taken from the first two "-" separated parts of the
     file name.
     Altitude, latitude and longitude are taken from the first line of
     the file.
    Usage:
     ds = read_isd_file(isd_file_path)
    Author: PRI
    Date: June 2017
    """
    def _isd_value(line, start, stop, scale, missing):
        # decode one fixed-width numeric field, scaled to physical units;
        # only a failed number conversion falls back to the missing value
        try:
            return float(line[start:stop])/float(scale)
        except ValueError:
            return float(missing)

    isd_file_name = os.path.split(isd_file_path)[1]
    msg = "Reading ISD file "+isd_file_name
    logger.info(msg)
    isd_site_id = isd_file_name.split("-")
    isd_site_id = isd_site_id[0]+"-"+isd_site_id[1]
    # read the file
    # NOTE(review): the gz branch reads in binary mode, so under Python 3 the
    # lines are bytes and would never match the str observation codes below
    # - confirm the target Python version before changing the mode
    if os.path.splitext(isd_file_path)[1] == ".gz":
        with gzip.open(isd_file_path, 'rb') as fp:
            content = fp.readlines()
    else:
        with open(isd_file_path) as fp:
            content = fp.readlines()
    # get a data structure
    ds = qcio.DataStructure()
    # get the site latitude, longitude and altitude
    ds.globalattributes["altitude"] = float(content[0][46:51])
    ds.globalattributes["latitude"] = float(content[0][28:34])/float(1000)
    ds.globalattributes["longitude"] = float(content[0][34:41])/float(1000)
    ds.globalattributes["isd_site_id"] = isd_site_id
    # initialise the intermediate dictionary; "missing_value" is the value
    # stored when a field can not be decoded and is masked out later
    isd = {}
    isd["DateTime"] = {"Data":[],"Flag":[],"Attr":{"long_name":"Datetime","units":"none"}}
    isd["Wd"] = {"Data":[],"Attr":{"long_name":"Wind direction","units":"degrees","missing_value":999}}
    isd["Ws"] = {"Data":[],"Attr":{"long_name":"Wind speed","units":"m/s","missing_value":999.9}}
    isd["Ta"] = {"Data":[],"Attr":{"long_name":"Air temperature","units":"C","missing_value":999.9}}
    isd["Td"] = {"Data":[],"Attr":{"long_name":"Dew point temperature","units":"C","missing_value":999.9}}
    isd["ps"] = {"Data":[],"Attr":{"long_name":"Surface pressure","units":"kPa","missing_value":9999.9}}
    isd["Precip"] = {"Data":[],"Attr":{"long_name":"Precipitation","units":"mm","missing_value":999.9}}
    # define the codes for good data in the ISD file
    OK_obs_code = ["AUTO ","CRN05","CRN15","FM-12","FM-15","FM-16","SY-MT"]
    # iterate over the lines in the file and decode the data
    # NOTE(review): the last line of the file is never decoded - confirm
    # this is deliberate
    for line in content[:-1]:
        # filter out anything other than hourly data
        if line[41:46] not in OK_obs_code:
            continue
        YY = int(line[15:19])
        MM = int(line[19:21])
        DD = int(line[21:23])
        HH = int(line[23:25])
        mm = int(line[25:27])
        dt = datetime.datetime(YY,MM,DD,HH,mm,0)
        isd["DateTime"]["Data"].append(dt)
        # wind direction, degT
        isd["Wd"]["Data"].append(_isd_value(line, 60, 63, 1, 999))
        # wind speed, m/s
        isd["Ws"]["Data"].append(_isd_value(line, 65, 69, 10, 999.9))
        # air temperature, C
        isd["Ta"]["Data"].append(_isd_value(line, 87, 92, 10, 999.9))
        # dew point temperature, C
        isd["Td"]["Data"].append(_isd_value(line, 93, 98, 10, 999.9))
        # sea level pressure, hPa
        isd["ps"]["Data"].append(_isd_value(line, 99, 104, 10, 9999.9))
        # precipitation, mm; only present when the "AA1" section follows
        if line[108:111] == "AA1":
            isd["Precip"]["Data"].append(_isd_value(line, 113, 117, 10, 999.9))
        else:
            isd["Precip"]["Data"].append(float(999.9))
    # add the time zone to the DateTime attributes
    isd["DateTime"]["Attr"]["time_zone"] = "UTC"
    # get the number of records and add this to the global attributes
    nrecs = len(isd["DateTime"]["Data"])
    ds.globalattributes["nc_nrecs"] = str(nrecs)
    # define the QC flags: f0 for unmasked data, f1 for masked data
    f0 = numpy.zeros(nrecs)
    f1 = numpy.ones(nrecs)
    # deal with the datetime first
    variable = {"Label":"DateTime", "Data":numpy.array(isd["DateTime"]["Data"]),
                "Flag":f0, "Attr":isd["DateTime"]["Attr"]}
    qcutils.CreateVariable(ds, variable)
    # get the nominal time step as the most common interval, divided by 60
    dt_delta = qcutils.get_timestep(ds)
    ts = scipy.stats.mode(dt_delta)[0]/60
    ds.globalattributes["time_step"] = ts[0]
    # add the variables to the data structure
    logger.info("Writing data to the data structure")
    labels = [label for label in isd.keys() if label != "DateTime"]
    for label in labels:
        data = numpy.ma.masked_equal(isd[label]["Data"], isd[label]["Attr"]["missing_value"])
        flag = numpy.where(numpy.ma.getmaskarray(data), f1, f0)
        attr = isd[label]["Attr"]
        variable = {"Label":label, "Data":data, "Flag":flag, "Attr":attr}
        qcutils.CreateVariable(ds, variable)
    # hPa to kPa
    ps = qcutils.GetVariable(ds, "ps")
    ps["Data"] = ps["Data"]/float(10)
    # convert sea level pressure to station pressure
    site_altitude = float(ds.globalattributes["altitude"])
    Ta = qcutils.GetVariable(ds, "Ta")
    cfac = numpy.ma.exp((-1*site_altitude)/((Ta["Data"]+273.15)*29.263))
    ps["Data"] = ps["Data"]*cfac
    # the correction masks ps wherever Ta is masked, so refresh the flag to
    # keep it consistent with the data
    ps["Flag"] = numpy.where(numpy.ma.getmaskarray(ps["Data"]), f1, f0)
    ps["Attr"]["long_name"] = ps["Attr"]["long_name"]+", adjusted from sea level to station"
    qcutils.CreateVariable(ds, ps)
    # do precipitation and apply crude limits
    Precip = qcutils.GetVariable(ds, "Precip")
    condition = (Precip["Data"]<0)|(Precip["Data"]>100)
    Precip["Data"] = numpy.ma.masked_where(condition, Precip["Data"])
    Precip["Flag"] = numpy.where(numpy.ma.getmaskarray(Precip["Data"]), f1, f0)
    Precip["Attr"]["RangeCheck_upper"] = 100
    Precip["Attr"]["RangeCheck_lower"] = 0
    qcutils.CreateVariable(ds, Precip)
    # get the humidities from Td
    Ta = qcutils.GetVariable(ds, "Ta")
    Td = qcutils.GetVariable(ds, "Td")
    ps = qcutils.GetVariable(ds, "ps")
    RH = mf.RHfromdewpoint(Td["Data"], Ta["Data"])
    flag = numpy.where(numpy.ma.getmaskarray(RH), f1, f0)
    attr = {"long_name":"Relative humidity", "units":"%"}
    variable = {"Label":"RH", "Data":RH, "Flag":flag, "Attr":attr}
    qcutils.CreateVariable(ds, variable)
    Ah = mf.absolutehumidityfromRH(Ta["Data"], RH)
    flag = numpy.where(numpy.ma.getmaskarray(Ah), f1, f0)
    attr = {"long_name":"Absolute humidity", "units":"g/m3"}
    variable = {"Label":"Ah", "Data":Ah, "Flag":flag, "Attr":attr}
    qcutils.CreateVariable(ds, variable)
    q = mf.specifichumidityfromRH(RH, Ta["Data"], ps["Data"])
    flag = numpy.where(numpy.ma.getmaskarray(q), f1, f0)
    attr = {"long_name":"Specific humidity", "units":"kg/kg"}
    variable = {"Label":"q", "Data":q, "Flag":flag, "Attr":attr}
    qcutils.CreateVariable(ds, variable)
    # get U and V components from wind speed and direction
    Ws = qcutils.GetVariable(ds, "Ws")
    Wd = qcutils.GetVariable(ds, "Wd")
    U, V = qcutils.convert_WSWDtoUV(Ws, Wd)
    qcutils.CreateVariable(ds, U)
    qcutils.CreateVariable(ds, V)
    # add the time variable
    qcutils.get_nctime_from_datetime(ds)
    # return the data
    return ds
VPD_erai_tts = es_erai_tts - e_erai_tts flag = numpy.zeros(len(VPD_erai_tts), dtype=numpy.int32) attr = qcutils.MakeAttributeDictionary( long_name="Vapour pressure deficit", units="kPa") qcutils.CreateSeries(ds_erai, "VPD", VPD_erai_tts, Flag=flag, Attr=attr) RH_erai_tts = float(100) * e_erai_tts / es_erai_tts flag = numpy.zeros(len(RH_erai_tts), dtype=numpy.int32) attr = qcutils.MakeAttributeDictionary(long_name="Relative humidity", units="percent") qcutils.CreateSeries(ds_erai, "RH", RH_erai_tts, Flag=flag, Attr=attr) # get the absolute humidity Ah_erai_tts = mf.absolutehumidityfromRH(Ta_erai_tts, RH_erai_tts) flag = numpy.zeros(len(Ah_erai_tts), dtype=numpy.int32) attr = qcutils.MakeAttributeDictionary(long_name="Absolute humidity", units="g/m3") qcutils.CreateSeries(ds_erai, "Ah", Ah_erai_tts, Flag=flag, Attr=attr) # get the specific humidity q_erai_tts = mf.specifichumidityfromRH(RH_erai_tts, Ta_erai_tts, ps_erai_tts) flag = numpy.zeros(len(q_erai_tts), dtype=numpy.int32) attr = qcutils.MakeAttributeDictionary(long_name="Specific humidity", units="kg/kg") qcutils.CreateSeries(ds_erai, "q", q_erai_tts, Flag=flag, Attr=attr) # Interpolate the 3 hourly boundary layer height to the tower time step # NOTE: ERA-I variables are dimensioned [time,latitude,longitude] Habl_3d = erai_file.variables["blh"][:, :, :]
RH_label = "RH_"+str(i)+str(j) q,f,a = qcutils.GetSeriesasMA(ds_60minutes,q_label) Ta,f,a = qcutils.GetSeriesasMA(ds_60minutes,Ta_label) ps,f,a = qcutils.GetSeriesasMA(ds_60minutes,ps_label) RH = mf.RHfromspecifichumidity(q, Ta, ps) attr = qcutils.MakeAttributeDictionary(long_name='Relative humidity',units='%',standard_name='not defined') qcutils.CreateSeries(ds_60minutes,RH_label,RH,Flag=f,Attr=attr) # absolute humidity from temperature and relative humidity for i in range(0,3): for j in range(0,3): Ta_label = "Ta_"+str(i)+str(j) RH_label = "RH_"+str(i)+str(j) Ah_label = "Ah_"+str(i)+str(j) Ta,f,a = qcutils.GetSeriesasMA(ds_60minutes,Ta_label) RH,f,a = qcutils.GetSeriesasMA(ds_60minutes,RH_label) Ah = mf.absolutehumidityfromRH(Ta, RH) attr = qcutils.MakeAttributeDictionary(long_name='Absolute humidity',units='g/m3',standard_name='not defined') qcutils.CreateSeries(ds_60minutes,Ah_label,Ah,Flag=f,Attr=attr) # soil moisture from kg/m2 to m3/m3 attr = qcutils.GetAttributeDictionary(ds_60minutes,"Sws_00") for i in range(0,3): for j in range(0,3): label = "Sws_"+str(i)+str(j) Sws,f,a = qcutils.GetSeriesasMA(ds_60minutes,label) Sws = Sws/float(100) attr["units"] = "frac" qcutils.CreateSeries(ds_60minutes,label,Sws,Flag=flag_60minutes,Attr=attr) # net radiation and upwelling short and long wave radiation for i in range(0,3): for j in range(0,3): label_Fn = "Fn_"+str(i)+str(j)
def read_isd_file(isd_file_path):
    """
    Purpose:
     Reads an ISD file (gz or uncompressed) and returns the data in a data
     structure.
     Each record is decoded from fixed character columns.  Records whose
     observation code is not in the accepted list are skipped.  Fields that
     can not be decoded as numbers are replaced by a missing value (999,
     999.9 or 9999.9 depending on the variable) and are masked (flag = 1) in
     the output.
     Series written to the data structure: DateTime (localised to UTC), Wd,
     Ws, Ta, Td, ps (converted to kPa and adjusted from sea level to station
     altitude), Precip (values outside 0 to 100 are masked), RH, Ah and q.
    Assumptions:
     The site id is taken from the first two "-" separated parts of the
     file name.
     Altitude, latitude and longitude are taken from the first line of
     the file.
    Usage:
     ds = read_isd_file(isd_file_path)
    Author: PRI
    Date: June 2017
    """
    def _isd_value(line, start, stop, scale, missing):
        # decode one fixed-width numeric field, scaled to physical units;
        # only a failed number conversion falls back to the missing value
        try:
            return float(line[start:stop]) / float(scale)
        except ValueError:
            return float(missing)

    isd_file_name = os.path.split(isd_file_path)[1]
    msg = "Reading ISD file " + isd_file_name
    logger.info(msg)
    isd_site_id = isd_file_name.split("-")
    isd_site_id = isd_site_id[0] + "-" + isd_site_id[1]
    # read the file
    # NOTE(review): the gz branch reads in binary mode, so under Python 3 the
    # lines are bytes and would never match the str observation codes below
    # - confirm the target Python version before changing the mode
    if os.path.splitext(isd_file_path)[1] == ".gz":
        with gzip.open(isd_file_path, 'rb') as fp:
            content = fp.readlines()
    else:
        with open(isd_file_path) as fp:
            content = fp.readlines()
    # get a data structure
    ds = qcio.DataStructure()
    # get the site latitude, longitude and altitude
    ds.globalattributes["altitude"] = float(content[0][46:51])
    ds.globalattributes["latitude"] = float(content[0][28:34]) / float(1000)
    ds.globalattributes["longitude"] = float(content[0][34:41]) / float(1000)
    ds.globalattributes["isd_site_id"] = isd_site_id
    # initialise the data structure with empty series
    series_definitions = [("DateTime", "Datetime", "none"),
                          ("Wd", "Wind direction", "degrees"),
                          ("Ws", "Wind speed", "m/s"),
                          ("Ta", "Air temperature", "C"),
                          ("Td", "Dew point temperature", "C"),
                          ("ps", "Surface pressure", "kPa"),
                          ("Precip", "Precipitation", "mm")]
    for label, long_name, units in series_definitions:
        ds.series[label] = {"Data": [], "Flag": [],
                            "Attr": {"long_name": long_name, "units": units}}
    # define the codes for good data in the ISD file
    OK_obs_code = ["AUTO ", "CRN05", "CRN15", "FM-12", "FM-15", "FM-16",
                   "SY-MT"]
    # iterate over the lines in the file and decode the data
    # NOTE(review): the last line of the file is never decoded - confirm
    # this is deliberate
    for line in content[:-1]:
        # filter out anything other than hourly data
        if line[41:46] not in OK_obs_code:
            continue
        YY = int(line[15:19])
        MM = int(line[19:21])
        DD = int(line[21:23])
        HH = int(line[23:25])
        mm = int(line[25:27])
        dt = datetime.datetime(YY, MM, DD, HH, mm, 0)
        ds.series["DateTime"]["Data"].append(pytz.utc.localize(dt))
        # wind direction, degT
        ds.series["Wd"]["Data"].append(_isd_value(line, 60, 63, 1, 999))
        # wind speed, m/s
        ds.series["Ws"]["Data"].append(_isd_value(line, 65, 69, 10, 999.9))
        # air temperature, C
        ds.series["Ta"]["Data"].append(_isd_value(line, 87, 92, 10, 999.9))
        # dew point temperature, C
        ds.series["Td"]["Data"].append(_isd_value(line, 93, 98, 10, 999.9))
        # sea level pressure, hPa
        ds.series["ps"]["Data"].append(_isd_value(line, 99, 104, 10, 9999.9))
        # precipitation, mm; only present when the "AA1" section follows
        if line[108:111] == "AA1":
            ds.series["Precip"]["Data"].append(
                _isd_value(line, 113, 117, 10, 999.9))
        else:
            ds.series["Precip"]["Data"].append(float(999.9))
    # add the time zone to the DateTime attributes
    ds.series["DateTime"]["Attr"]["time_zone"] = "UTC"
    # define the QC flags: f0 for unmasked data, f1 for masked data
    f0 = numpy.zeros(len(ds.series["DateTime"]["Data"]))
    f1 = numpy.ones(len(ds.series["DateTime"]["Data"]))
    # convert from lists to masked arrays, datetime first
    ds.series["DateTime"]["Data"] = numpy.array(ds.series["DateTime"]["Data"])
    ds.series["DateTime"]["Flag"] = f0
    ds.globalattributes["nc_nrecs"] = len(f0)
    # get the nominal time step as the most common interval, divided by 60
    dt_delta = qcutils.get_timestep(ds)
    ts = scipy.stats.mode(dt_delta)[0] / 60
    ds.globalattributes["time_step"] = ts[0]
    # mask the missing values and flag the masked records
    for label, missing in [("Wd", 999), ("Ws", 999.9),
                           ("Ta", 999.9), ("Td", 999.9)]:
        ds.series[label]["Data"] = numpy.ma.masked_equal(
            ds.series[label]["Data"], missing)
        ds.series[label]["Flag"] = numpy.where(
            numpy.ma.getmaskarray(ds.series[label]["Data"]), f1, f0)
    # hPa to kPa
    ds.series["ps"]["Data"] = numpy.ma.masked_equal(
        ds.series["ps"]["Data"], 9999.9) / float(10)
    # convert sea level pressure to station pressure
    site_altitude = float(ds.globalattributes["altitude"])
    cfac = numpy.ma.exp(
        (-1 * site_altitude) / ((ds.series["Ta"]["Data"] + 273.15) * 29.263))
    ds.series["ps"]["Data"] = ds.series["ps"]["Data"] * cfac
    # the correction masks ps wherever Ta is masked, so the flag is taken
    # from the corrected data to keep it consistent with the mask
    ds.series["ps"]["Flag"] = numpy.where(
        numpy.ma.getmaskarray(ds.series["ps"]["Data"]), f1, f0)
    # do precipitation and apply crude limits
    ds.series["Precip"]["Data"] = numpy.ma.masked_equal(
        ds.series["Precip"]["Data"], 999.9)
    condition = ((ds.series["Precip"]["Data"] < 0) |
                 (ds.series["Precip"]["Data"] > 100))
    ds.series["Precip"]["Data"] = numpy.ma.masked_where(
        condition, ds.series["Precip"]["Data"])
    ds.series["Precip"]["Flag"] = numpy.where(
        numpy.ma.getmaskarray(ds.series["Precip"]["Data"]), f1, f0)
    # get the humidities from Td
    Ta, flag, attr = qcutils.GetSeriesasMA(ds, "Ta")
    Td, flag, attr = qcutils.GetSeriesasMA(ds, "Td")
    ps, flag, attr = qcutils.GetSeriesasMA(ds, "ps")
    RH = mf.RHfromdewpoint(Td, Ta)
    flag = numpy.where(numpy.ma.getmaskarray(RH), f1, f0)
    attr = {"long_name": "Relative humidity", "units": "%"}
    qcutils.CreateSeries(ds, "RH", RH, Flag=flag, Attr=attr)
    Ah = mf.absolutehumidityfromRH(Ta, RH)
    flag = numpy.where(numpy.ma.getmaskarray(Ah), f1, f0)
    attr = {"long_name": "Absolute humidity", "units": "g/m3"}
    qcutils.CreateSeries(ds, "Ah", Ah, Flag=flag, Attr=attr)
    q = mf.specifichumidityfromRH(RH, Ta, ps)
    flag = numpy.where(numpy.ma.getmaskarray(q), f1, f0)
    attr = {"long_name": "Specific humidity", "units": "kg/kg"}
    qcutils.CreateSeries(ds, "q", q, Flag=flag, Attr=attr)
    # return the data
    return ds
#index = [x for x in range(len(ldt_all)) if (ldt_all[x].hour==9) and (ldt_all[x].minute==0)] #precip[index] = accum_24hr[index] # set attributes as appropriate accum_attr["long_name"] = "Precipitation total over time step" accum_attr["units"] = "mm/30 minutes" # put the precipitation per time step back into the data struicture qcutils.CreateSeries(ds_all,output_label,precip,Flag=accum_flag,Attr=accum_attr) # calculate missing humidities RH_list = sorted([x for x in ds_all.series.keys() if ("RH" in x) and ("_QCFlag" not in x)]) Ta_list = sorted([x for x in ds_all.series.keys() if ("Ta" in x) and ("_QCFlag" not in x)]) ps_list = sorted([x for x in ds_all.series.keys() if ("ps" in x) and ("_QCFlag" not in x)]) for RH_label,Ta_label,ps_label in zip(RH_list,Ta_list,ps_list): Ta,f,a = qcutils.GetSeriesasMA(ds_all,Ta_label) RH,f,a = qcutils.GetSeriesasMA(ds_all,RH_label) ps,f,a = qcutils.GetSeriesasMA(ds_all,ps_label) Ah = mf.absolutehumidityfromRH(Ta, RH) attr = qcutils.MakeAttributeDictionary(long_name='Absolute humidity',units='g/m3',standard_name='not defined', bom_id=a["bom_id"],bom_name=a["bom_name"],bom_dist=a["bom_dist"]) qcutils.CreateSeries(ds_all,RH_label.replace("RH","Ah"),Ah,Flag=f,Attr=attr) q = mf.specifichumidityfromRH(RH, Ta, ps) attr = qcutils.MakeAttributeDictionary(long_name='Specific humidity',units='kg/kg',standard_name='not defined', bom_id=a["bom_id"],bom_name=a["bom_name"],bom_dist=a["bom_dist"]) qcutils.CreateSeries(ds_all,RH_label.replace("RH","q"),q,Flag=f,Attr=attr) # now write the data structure to file # OMG, the user may want to overwrite the old data ... if os.path.exists(ncname): # ... but we will save them from themselves! t = time.localtime() rundatetime = datetime.datetime(t[0],t[1],t[2],t[3],t[4],t[5]).strftime("%Y%m%d%H%M") new_ext = "_"+rundatetime+".nc"
def construct_data(ALL_combined, VarToCorrelate, AWSVarToCorrelate, bestAWS_ID, Site_ID, corr_freq, myBaseforResults): #================================================================ # Main code started here #================================================================ #If L3 then change Ta label so the code can run #ALL_combined = ALL_combined.rename(columns={'Ta': 'Ta_EC'}) print "Correlation frequency ", corr_freq #Check for place to put resuolts - does it exist? If not create if not os.path.isdir(myBaseforResults): os.mkdir(myBaseforResults) #Then subdirectories if not os.path.isdir(myBaseforResults+"/AWS"): os.mkdir(myBaseforResults+"/AWS") if not os.path.isdir(myBaseforResults+"/AWS/"+VarToCorrelate): os.mkdir(myBaseforResults+"/AWS/"+VarToCorrelate) mypathforResults=myBaseforResults+"/AWS/"+VarToCorrelate #Do correlation and plots for AWS variables #------------------------------------------- ID1=bestAWS_ID[0] ID2=bestAWS_ID[1] ID3=bestAWS_ID[2] #Initialise the list empty with 4 items Labels=[None]*4 Labels[0]=VarToCorrelate #Flux tower label Labels[1]=AWSVarToCorrelate+"_"+ID1 #AWS first tower label Labels[2]=AWSVarToCorrelate+"_"+ID2 #AWS second tower label Labels[3]=AWSVarToCorrelate+"_"+ID3 #AWS third tower label #AWS_Flux_Correlate_plot_V1.AWS_correlate(ALL_combined,corr_freq,varname,bestAWS_ID) ## Temp statement to read in DAta that we have saved already. Otherwise we process the whole thing again #ALL_combined= pd.DataFrame.load('ALL_combined.df') #Go through the list of VARIABLES of interest and ID's #and make sure each value in each Column is a number so it can be converted # to a float. Then convert to a float so that it can be applied in stats.linregress #Problem is that the series imported into the data frame are obj not float and cant be used in array operations #These are the variables from teh AWS file that we want to import at some stage AWSvariables=['Ta','RH','Tw','DP','WS','Rain','GUST','P'] #Loop through the AWS ID. 
There are 3 AWS files in the ALL_combined dataframe for index1,ID in enumerate(bestAWS_ID): #Loop through the variables list for index2,AWSvariable in enumerate(AWSvariables): tempAWS_label=AWSvariable+'_'+ID newAWS_label=AWSvariable+'_'+ID+'_NEW' ALL_combined[tempAWS_label]=ALL_combined[tempAWS_label].map(is_number) ALL_combined[tempAWS_label]=ALL_combined[tempAWS_label].astype('float64') #Do for the same for current variables in the Flux data (i.e. Ta, Ah, etc). ALL_combined[VarToCorrelate]=ALL_combined[VarToCorrelate].map(is_number) ALL_combined[VarToCorrelate]=ALL_combined[VarToCorrelate].astype('float64') # Do calculations on met variables depending on variable type if VarToCorrelate=='Ah': for index,ID in enumerate(bestAWS_ID): Ta_temp_label='Ta_'+ID RH_temp_label='RH_'+ID #Here call function from Ozflux meteorolgoical functions script ALL_combined[Labels[index+1]]=mf.absolutehumidityfromRH(ALL_combined[Ta_temp_label],ALL_combined[RH_temp_label] ) #Replace -9999 in returned data to nan's. Leave gaps and do regressions. Then fill ALL_combined[Labels[index+1]].replace(-9999,value=np.nan,method='None', inplace=True) if VarToCorrelate=='Ws': for index,ID in enumerate(bestAWS_ID): #Here convert Bom km/h to flux m/s ALL_combined[Labels[index+1]]=ALL_combined[Labels[index+1]]*0.27778 #Replace -9999 in returned data to nan's. Leave gaps and do regressions. Then fill ALL_combined[Labels[index+1]].replace(-9999,value=np.nan,method='None', inplace=True) if VarToCorrelate=='P': for index,ID in enumerate(bestAWS_ID): #Here convert Bom hPa to kPa as per flux ALL_combined[Labels[index+1]]=ALL_combined[Labels[index+1]]/10 #Replace -9999 in returned data to nan's. Leave gaps and do regressions. Then fill ALL_combined[Labels[index+1]].replace(-9999,value=np.nan,method='None', inplace=True) #subset the large dataframe to variables we want for this process. One at a time SubsetDF=ALL_combined[[Labels[0],Labels[1],Labels[2],Labels[3]]] #Get things to report. 
Number of samples in each variable. Number of Nan's #This Gives the SIZE = total records in the by GROUP #The Count gives non Nan values by Group stats_size=SubsetDF.groupby([lambda x: x.year,lambda x: x.month]).size() stats_count=SubsetDF.groupby([lambda x: x.year,lambda x: x.month]).count() #Do FIRST stats correlation #Initialise the list empty with 3 items Statsresults=[] SubsetDF1=SubsetDF[[Labels[0],Labels[1]]] SubsetDF1_withNans = SubsetDF1 SubsetDF1 = SubsetDF1.dropna(how='any') #Check to see what basis we want the correlation to be performed. All, annual or monthly #Use SCiPy stats scipy.stats.linregress(x, y) if corr_freq=='monthly': SubsetDF1_grouped=SubsetDF1.groupby([lambda x: x.year,lambda x: x.month]) SubsetDF1_withNans_grouped=SubsetDF1_withNans.groupby([lambda x: x.year,lambda x: x.month]) Corr_grouped1=SubsetDF1_grouped.apply(lambda x: stats.linregress(x [Labels[1]], x [Labels[0]])) elif corr_freq=='annual': SubsetDF1_grouped=SubsetDF1.groupby([lambda x: x.year]) SubsetDF1_withNans_grouped=SubsetDF1_withNans.groupby([lambda x: x.year]) Corr_grouped1=SubsetDF1_grouped.apply(lambda x: stats.linregress(x [Labels[1]], x [Labels[0]])) else: SubsetDF1_grouped=SubsetDF1 SubsetDF1_withNans_grouped=SubsetDF1_withNans Corr_grouped1=(stats.linregress(SubsetDF1[Labels[1]], SubsetDF1[Labels[0]])) #Use SCiPy stats scipy.stats.linregress(x, y) Corr_all1=stats.linregress(SubsetDF1[Labels[1]], SubsetDF1[Labels[0]]) #Do SECOND stats correlation #Initialise the list empty with 3 items SubsetDF2=SubsetDF[[Labels[0],Labels[2]]] SubsetDF2 = SubsetDF2.dropna(how='any') #Check to see what basis we want the correlation to be performed. 
All, annual or monthly if corr_freq=='monthly': SubsetDF2_grouped=SubsetDF2.groupby([lambda x: x.year,lambda x: x.month]) Corr_grouped2=SubsetDF2_grouped.apply(lambda x: stats.linregress(x [Labels[2]], x [Labels[0]])) elif corr_freq=='annual': SubsetDF2_grouped=SubsetDF2.groupby([lambda x: x.year]) Corr_grouped2=SubsetDF2_grouped.apply(lambda x: stats.linregress(x [Labels[2]], x [Labels[0]])) else: SubsetDF2_grouped=SubsetDF2 Corr_grouped2=stats.linregress(SubsetDF2[Labels[2]], SubsetDF2[Labels[0]]) Corr_all2=stats.linregress(SubsetDF2[Labels[2]], SubsetDF2[Labels[0]]) #Do THIRD stats correlation #Initialise the list empty with 3 items SubsetDF3=SubsetDF[[Labels[0],Labels[3]]] SubsetDF3 = SubsetDF3.dropna(how='any') #Check to see what basis we want the correlation to be performed. All, annual or monthly if corr_freq=='monthly': SubsetDF3_grouped=SubsetDF3.groupby([lambda x: x.year,lambda x: x.month]) Corr_grouped3=SubsetDF3_grouped.apply(lambda x: stats.linregress(x [Labels[3]], x [Labels[0]])) elif corr_freq=='annual': SubsetDF3_grouped=SubsetDF3.groupby([lambda x: x.year]) Corr_grouped3=SubsetDF3_grouped.apply(lambda x: stats.linregress(x [Labels[3]], x [Labels[0]])) else: SubsetDF3_grouped=SubsetDF3 Corr_grouped3=stats.linregress(SubsetDF3[Labels[3]], SubsetDF3[Labels[0]]) Corr_all3=stats.linregress(SubsetDF3[Labels[3]], SubsetDF3[Labels[0]]) #Output Panda DF to list so it can be used to create a tabel in ReportLab #Create list for EACH AWS Site ID and output BY gruop for YEAR and MONTH #totable2=Corr_grouped1.unstack #totable3=Corr_grouped2.unstack #totable4=Corr_grouped3.unstack #Create a Pandas DF of the ALL data stats. A linregression of all data. 
#Returns'slope', 'intercept', 'r_value', 'p_value', 'std_err' #That is then put in a Pandas DF and output to list d1=[Corr_all1,Corr_all2, Corr_all3] Corr_data=pd.DataFrame(d1,columns=['slope', 'intercept', 'r_value', 'p_value', 'std_err']) #Create a DF with just the site ID's with column names ID d2=pd.Series([ID1, ID2, ID3],name='ID') #Join the two DF together d3=Corr_data.join(d2) #Set the DF Index to ID new= d3.set_index('ID') #Sort the DF by best (highest r quared value . This finds the order in which to use sites in the data gapfill. new=new.sort(columns='r_value',ascending=False) #Output Panda DF to list so it can be used to create a tabel in ReportLab #Create list of the Site ID correlations for ALL data. totable1=list(new.T.itertuples()) #Outputs (Tables and PDF) #=============================== #Output PDF Canvas doc1name=(mypathforResults+'/Table Tower and AWS correlation_%s_%s.pdf' % (VarToCorrelate,Site_ID)) doc1 = SimpleDocTemplate(doc1name,pagesize=A4, rightMargin=40,leftMargin=40, topMargin=20,bottomMargin=20) styles=getSampleStyleSheet() width, height = A4 # container for the 'Flowable' objects #Then put objects in there to fill and generate PDF later container1 = [] #Create first table of best ALL data correlations table1=Table(totable1) ptext1 = (' <font size=12>The data ALL data correlation stats for variable %s at AWS site ID %s</font>' % (VarToCorrelate, Site_ID)) ptext1a = (' <font size=12>Sites are %s %s %s</font>' % (ID1,ID2,ID3)) container1.append(Paragraph(ptext1, styles["Normal"])) container1.append(Paragraph(ptext1a, styles["Normal"])) container1.append(Spacer(1, 12)) #table1.setStyle(TableStyle([('BACKGROUND',(0,0),(0,rowsto_output1),colors.green),('BACKGROUND',(1,0),(1,rowsto_output1),colors.yellow)])) #set_column_titles(array('ID','Dist', 'Have Data','Lat','Long','Name','Start', 'End','Elev')) container1.append(table1) container1.append(Spacer(1, 20)) doc1.build(container1) #NOTE# ###### #I couldnt get anything appropriate to pas to 
the table constructor here. So I have commented it out #It works for a DF converted to LIST above for small table #Problem here is we want to pass the DF grouped object. This has to to_list method ##Output PDF Canvas 2 #doc2name=("Table Tower and AWS correlationwith BoM ID %s variable %s at %s.pdf" % (ID1,VarToCorrelate,Site_ID)) #doc2 = SimpleDocTemplate(doc2name,pagesize=A4, rightMargin=40,leftMargin=40, topMargin=20,bottomMargin=20) #styles=getSampleStyleSheet() #width, height = A4 ## container for the 'Flowable' objects ##Then put objects in there to fill and generate PDF later #container2 = [] #ptext2 = (' <font size=12>The data shows a breakdown by YEAR and MONTH of the correlation stats for variable %s at AWS site ID %s</font>' % (VarToCorrelate, Site_ID)) #table2=Table(totable2) #container2.append(Paragraph(ptext2, styles["Normal"])) #container2.append(table2) #container2.append(Spacer(1, 20)) #doc2.build(container2) ##Output PDF Canvas 3 #doc3name=("Table Tower and AWS correlationwith BoM ID %s variable %s at %s.pdf" % (ID2,VarToCorrelate,Site_ID)) #doc3 = SimpleDocTemplate(doc3name,pagesize=A4, rightMargin=40,leftMargin=40, topMargin=20,bottomMargin=20) #styles=getSampleStyleSheet() #width, height = A4 ## container for the 'Flowable' objects ##Then put objects in there to fill and generate PDF later #container3 = [] #ptext3 = (' <font size=12>The data shows a breakdown by YEAR and MONTH of the correlation stats for variable %s at AWS site ID %s</font>' % (VarToCorrelate, Site_ID)) #table3=Table(totable3) #container3.append(Paragraph(ptext3, styles["Normal"])) #container3.append(table3) #container3.append(Spacer(1, 20)) #doc3.build(container3) ##Output PDF Canvas 4 #doc4name=("Table Tower and AWS correlationwith BoM ID %s variable %s at %s.pdf" % (ID3,VarToCorrelate,Site_ID)) #doc4 = SimpleDocTemplate(doc4name,pagesize=A4, rightMargin=40,leftMargin=40, topMargin=20,bottomMargin=20) #styles=getSampleStyleSheet() #width, height = A4 ## container for the 
'Flowable' objects ##Then put objects in there to fill and generate PDF later #container4 = [] #ptext4 = (' <font size=12>The data shows a breakdown by YEAR and MONTH of the correlation stats for variable %s at AWS site ID %s</font>' % (VarToCorrelate, Site_ID)) #table4=Table(totable4) #container4.append(Paragraph(ptext4, styles["Normal"])) #container4.append(table4) #container4.append(Spacer(1, 20)) #doc4.build(container4) # Write output files ################################################################ #when write file at this stage when ALL then the output is just a list and cant be written using the #Panadas file output if corr_freq != 'all': outputfilename1=str((mypathforResults+'/'+'Subset grouped AWS ID %s variable %s site %s.csv' % (ID1, VarToCorrelate, Site_ID))) Corr_grouped1.to_csv(outputfilename1, sep=',') outputfilename2=str((mypathforResults+'/'+'Subset grouped AWS ID %s variable %s site %s.csv' % (ID2, VarToCorrelate, Site_ID))) Corr_grouped2.to_csv(outputfilename2, sep=',') outputfilename3=str((mypathforResults+'/'+'Subset grouped AWS ID %s variable %s site %s.csv' % (ID3, VarToCorrelate, Site_ID))) Corr_grouped3.to_csv(outputfilename3, sep=',') outputfilename4=str((mypathforResults+'/'+'Correlation stats all data variable %s site %s.csv' % ( VarToCorrelate, Site_ID))) new.to_csv(outputfilename4, sep=',') # Produce graphs ################################################################ # Produce ALL data graph first # The DF 'new' has info in this format ID as index then 'slope', 'intercept', 'r_value', 'p_value', 'std_err' #plotdata = pd.DataFrame(SubsetDF) #Test plot using Pandas plots tools #Produce a 4 way correlation plot. 
#fig1 = plt.figure() #scatter_matrix(SubsetDF, alpha=0.2, figsize=(8, 8), diagonal='kde') #plt.show() #Calculate some things n_datapoints=len(SubsetDF) startdate= SubsetDF.index[0] enddate= SubsetDF.index[n_datapoints-1] #fig2 = plt.figure() #plt.scatter(SubsetDF[Labels[0]], SubsetDF[Labels[1]],'yo') ##plt.scatter(SubsetDF[Labels[0]], SubsetDF[Labels[2]],'g+') ##plt.scatter(SubsetDF[Labels[0]], SubsetDF[Labels[3]],'r^') #plt.show() #PLOT Three dta sets on Here #Loop through each ID in the list withing the DF 'new' #Remember that this list has been sorted so top has highest correlation slopetemp=[None]*3 intercepttemp=[None]*3 tempx_line0=[] tempy_line0=[] tempx_line1=[] tempy_line1=[] tempx_line2=[] tempy_line2=[] for index, IDx in enumerate(new.index): #set some temporary varibales to pass plot variables to plotXcolumn=AWSVarToCorrelate+"_"+IDx plotYcolumn=VarToCorrelate #For plotting we want to find tha range of varaible values across all sites and tower to get entire range to plot DFmins= SubsetDF.min() DFmaxs= SubsetDF.max() scale_min= int(min(DFmins[Labels[0]],DFmins[Labels[1]],DFmins[Labels[2]],DFmins[Labels[3]]))-1 scale_max= int(max(DFmaxs[Labels[0]],DFmaxs[Labels[1]],DFmaxs[Labels[2]],DFmaxs[Labels[3]]))+1 #create series to plot line slopetemp[index]=new.lookup([IDx], ['slope']) intercepttemp[index]=new.lookup([IDx], ['intercept']) for increment in range(scale_min,scale_max): if index==0: tempx_line0.append(increment) tempy_line0.append(slopetemp[index]*increment+intercepttemp[index]) elif index==1: tempx_line1.append(increment) tempy_line1.append(slopetemp[index]*increment+intercepttemp[index]) elif index==2: tempx_line2.append(increment) tempy_line2.append(slopetemp[index]*increment+intercepttemp[index]) #Produce these plots in reverse order 2 to 0 so that plot with least correlations is on the bottom if index==2: plt.plot(SubsetDF[plotXcolumn], SubsetDF[plotYcolumn], 'b+',tempx_line2, tempy_line2, ':b' ,label=IDx,linewidth=2) elif index==1: 
plt.plot(SubsetDF[plotXcolumn], SubsetDF[plotYcolumn], 'ro' ,tempx_line1, tempy_line1, '--r',label=IDx,linewidth=2) elif index==0: plt.plot(SubsetDF[plotXcolumn], SubsetDF[plotYcolumn], 'y>',tempx_line0, tempy_line0, '-y',label=IDx,linewidth=2) plt.xlim(scale_min, scale_max) plt.ylim(scale_min, scale_max) #create text for ID and r2 box graphtext1=('Station ID R2 \n' +str(new.index[0]) +" "+ "{0:.3f}".format(float(new.lookup([ID1], ['r_value'])))+"\n" +str(new.index[1]) +" "+ "{0:.3f}".format(float(new.lookup([ID2], ['r_value'])))+"\n" +str(new.index[2]) +" "+ "{0:.3f}".format(float(new.lookup([ID3], ['r_value'])))) #create text for start and end dates graphtext2=('Data start date: '+str(startdate)+'\n' +'End date: '+str(enddate)+'\n' +'Number records: '+str(n_datapoints)) units=' oC' plt.figtext(0.5,0.25,graphtext1, bbox=dict()) plt.figtext(0.5,0.1,graphtext2, bbox=dict()) plt.title('Variable '+VarToCorrelate+ ' for ALL data' + ' at ' +Site_ID) plt.xlabel('BoM AWS stations ' + '('+units+')') plt.ylabel(Labels[0]+ ' ' + '('+units+')') plt.legend(shadow=True, fancybox=True,loc='best') #Output to PDF using PdfPages a backend for MatPlotLib fname_graph=mypathforResults+'/'+'Linear Plot Tower vs BoM AWSs for ALL data - Variable '+VarToCorrelate+ ' at ' +Site_ID+'.pdf' # Create the PdfPages object to which we will save the pages: pdf = PdfPages(fname_graph) savefig(pdf, format='pdf',facecolor='w', edgecolor='w') # note the format='pdf' argument! #show() close() pdf.close() #Now try and produce a panel of graph of 6 ############################################## #This just finds the number of Year and Month groups so that we can use it later for plotting #Where we have 6 graphs to a page. How many graphs do we need? 
subsets=len(SubsetDF1_grouped) if corr_freq=='monthly': for (k1, k2), group in SubsetDF1_grouped: #Calculate some things n_datapoints=len(group) startdate= group.index[0] enddate= group.index[n_datapoints-1] #Set temp variables and lists slopetemp=[] intercepttemp=[] tempx_line=[] tempy_line=[] #set some temporary varibales to pass plot variables to plotXcolumn=VarToCorrelate+"_"+IDx plotYcolumn=VarToCorrelate #For plotting we want to find the range of varaible values across all sites and tower to get entire range to plot DFmins= group.min() DFmaxs= group.max() scale_min= int(min(DFmins[Labels[0]],DFmins[Labels[1]]))-1 scale_max= int(max(DFmaxs[Labels[0]],DFmaxs[Labels[1]]))+1 #create series to plot line #Need to extract the linear regression stats done bygroup earlier slopetemp, intercepttemp, r_valuetemp, p_valuetemp, std_errtemp = stats.linregress(group[Labels[1]],group[Labels[0]]) for increment in range(scale_min,scale_max): tempx_line.append(increment) tempy_line.append(slopetemp*increment+intercepttemp) ## Could work for later pd.merge(df, k1_means, left_on='key1', right_index=True #Produce the plot plt.plot(group[Labels[1]], group[Labels[0]], 'go',tempx_line, tempy_line, ':b' ,label=IDx,linewidth=2) #Set the scale mins and maxs plt.xlim(scale_min, scale_max) plt.ylim(scale_min, scale_max) #create text for ID and r2 box graphtext1= str('intercept ' + str("{0:.2f}".format(intercepttemp) +'\n') + 'slope ' + str("{0:.2f}".format(slopetemp)) +'\n' + 'r value ' + str("{0:.2f}".format(r_valuetemp)) +'\n' + 'p_value ' + str("{0:.2f}".format(p_valuetemp)) +'\n' + 'std_err ' + str("{0:.2f}".format(std_errtemp)) +'\n') #create text for start and end dates graphtext2=('Data start date: '+str(startdate)+'\n' +'End date: '+str(enddate)+'\n' +'Number records: '+str(n_datapoints)) units=' oC' plt.figtext(0.7,0.3,graphtext1, bbox=dict()) plt.figtext(0.5,0.13,graphtext2, bbox=dict()) plt.title('Tower vs Best BoM AWS Year '+ str(k1) +' Month '+ str(k2) +' Variable 
'+VarToCorrelate+ ' at ' +Site_ID +'\n'+FLUXfilename) plt.xlabel('BoM AWS station ' + '('+units+')') plt.ylabel(Labels[0]+ ' ' + '('+units+')') plt.legend(shadow=True, fancybox=True,loc='best') #Output to PDF using PdfPages a backend for MatPlotLib fname_graph=mypathforResults+'/'+'Linear Plot Tower vs BoM AWSs for Year '+ str(k1) +' and Month '+ str(k2) +' - Variable '+VarToCorrelate+ ' at ' +Site_ID+'.pdf' # Create the PdfPages object to which we will save the pages: pdf = PdfPages(fname_graph) savefig(pdf, format='pdf',facecolor='w', edgecolor='w') # note the format='pdf' argument! #show() close() pdf.close() elif corr_freq=='annual': for k1 , group in SubsetDF1.groupby([lambda x: x.year]): #Calculate some things n_datapoints=len(group) startdate= group.index[0] enddate= group.index[n_datapoints-1] #Set temp variables and lists slopetemp=[] intercepttemp=[] tempx_line=[] tempy_line=[] #set some temporary varibales to pass plot variables to plotXcolumn=VarToCorrelate+"_"+IDx plotYcolumn=VarToCorrelate #For plotting we want to find the range of varaible values across all sites and tower to get entire range to plot DFmins= group.min() DFmaxs= group.max() scale_min= int(min(DFmins[Labels[0]],DFmins[Labels[1]]))-1 scale_max= int(max(DFmaxs[Labels[0]],DFmaxs[Labels[1]]))+1 #create series to plot line #Need to extract the linear regression stats done bygroup earlier slopetemp, intercepttemp, r_valuetemp, p_valuetemp, std_errtemp = stats.linregress(group[Labels[1]],group[Labels[0]]) for increment in range(scale_min,scale_max): tempx_line.append(increment) tempy_line.append(slopetemp*increment+intercepttemp) ## Could work for later pd.merge(df, k1_means, left_on='key1', right_index=True #Produce the plot plt.plot(group[Labels[1]], group[Labels[0]], 'go',tempx_line, tempy_line, ':b' ,label=IDx,linewidth=2) #Set the scale mins and maxs plt.xlim(scale_min, scale_max) plt.ylim(scale_min, scale_max) #create text for ID and r2 box graphtext1=str('intercept ' + 
str("{0:.2f}".format(intercepttemp) +'\n') + 'r value ' + str("{0:.2f}".format(r_valuetemp)) +'\n'+'p_value ' + str("{0:.2f}".format(p_valuetemp)) +'\n' + 'std_err ' + str("{0:.2f}".format(std_errtemp)) +'\n') #create text for start and end dates graphtext2=('Data start date: '+str(startdate)+'\n' +'End date: '+str(enddate)+'\n' +'Number records: '+str(n_datapoints)) units=' oC' plt.figtext(0.7,0.3,graphtext1, bbox=dict()) plt.figtext(0.5,0.13,graphtext2, bbox=dict()) plt.title('Tower vs Best BoM AWS Year '+ str(k1) +' Variable '+VarToCorrelate+ ' at ' +Site_ID) plt.xlabel('BoM AWS station ' + '('+units+')') plt.ylabel(Labels[0]+ ' ' + '('+units+')') plt.legend(shadow=True, fancybox=True,loc='best') #Output to PDF using PdfPages a backend for MatPlotLib fname_graph=mypathforResults+'/'+'Linear Plot Tower vs BoM AWSs for Year '+ str(k1) +' - Variable '+VarToCorrelate+ ' at ' +Site_ID+'.pdf' # Create the PdfPages object to which we will save the pages: pdf = PdfPages(fname_graph) savefig(pdf, format='pdf',facecolor='w', edgecolor='w') # note the format='pdf' argument! #show() close() pdf.close() ########################################################### # Do the correlation analysis and APPLY it to the columns ########################################################## #Apply depending on the frequency required # 'slope', 'inter', 'rsqu', 'pval', 'se' # suffix with _all, _yr or _mon to indicate #Suffux with site ID i.e. _045623 #STEP 1. Create Pandas DF for each of the Yearly and Monthly breakdowns to get stats #But statslinreg wont work with Nans so start with DF without Nans. 
#Then later step 2 come back to fill the Nans in the original DF = SubSetDF #Create a new label for pandas df column for the contructed variable (and the QC flag) and column to fill label construct_label=str(VarToCorrelate+"_Con") fill_label=str(VarToCorrelate) #start by copying the existing data from the tower to the construct column, then fill the missing bits SubsetDF[construct_label]=SubsetDF[fill_label] #Also later we need the following columns alrady defined in the DF so here goes for ID in new.index: corr_label_All=str(VarToCorrelate+"_AllCorr"+"_"+ID) SubsetDF[corr_label_All]=zeros corr_label_Ann=str(VarToCorrelate+"_AnnCorr"+"_"+ID) SubsetDF[corr_label_Ann]=zeros corr_label_Mon=str(VarToCorrelate+"_MonCorr"+"_"+ID) SubsetDF[corr_label_Mon]=zeros #Do ALL correlation slope_all={} inter_all={} rsqu_all ={} pval_all ={} se_all={} for ID in new.index: xlabel=str(AWSVarToCorrelate+"_"+ID) ylabel=str(VarToCorrelate) temp=SubsetDF.dropna(how='any') xvalues=temp[xlabel] yvalues=temp[ylabel] slope_all[ID], inter_all[ID], rsqu_all[ID], pval_all[ID], se_all[ID]= stats.linregress(xvalues,yvalues) print "ID : ",ID, slope_all[ID] for key, value in slope_all.iteritems(): print key, value #Do ANNUAL correlation #prepare the datasets #Setup DF grouped by year temp_a1=SubsetDF temp_a2=SubsetDF.dropna(how='any') tempAnnualgrouped=temp_a1.groupby([lambda x: x.year]) tempAnnualgrouped_noNans=temp_a2.groupby([lambda x: x.year]) #loop through the BoM sites Best to worst correlation over 3 sites #setup for a new Pandas datatable to put results in #Create a list fits AnnualStatsList=[] for ID in new.index: xlabel=str(AWSVarToCorrelate+"_"+ID) ylabel=str(VarToCorrelate) for a1 , group in tempAnnualgrouped_noNans: xvalues=group[xlabel] yvalues=group[ylabel] slope_yr, inter_yr, rsqu_yr, pval_yr, se_yr= stats.linregress(xvalues,yvalues) #output to list and later make into DF AnnualStatsList.append([a1,ID, slope_yr, inter_yr, rsqu_yr, pval_yr, se_yr]) 
AnnualStats=pd.DataFrame(AnnualStatsList, columns=['year','ID', 'slope', 'inter', 'rsqu', 'pval', 'se']) #AnnualStats=temp1.set_index(['year','ID']) print AnnualStats.head(5) #Do MONTHLY correlation #prepare the datasets #Setup DF grouped by year temp_m1=SubsetDF temp_m2=SubsetDF.dropna(how='any') tempMonthlyGrouped_noNans = temp_m2.groupby([lambda x: x.year,lambda x: x.month]) #loop through the BoM sites Best to worst correlation over 3 sites #setup for a new Pandas datatable to put results in #Create a list fits MonthlyStatsList=[] for ID in new.index: xlabel=str(AWSVarToCorrelate+"_"+ID) ylabel=str(VarToCorrelate) for (m1 , m2) , group in tempMonthlyGrouped_noNans: xvalues=group[xlabel] yvalues=group[ylabel] slope_yr, inter_yr, rsqu_yr, pval_yr, se_yr= stats.linregress(xvalues,yvalues) #output to list and later make into DF MonthlyStatsList.append([m1, m2 ,ID, slope_yr, inter_yr, rsqu_yr, pval_yr, se_yr]) MonthlyStats=pd.DataFrame(MonthlyStatsList, columns=['year','month','ID', 'slope', 'inter', 'rsqu', 'pval', 'se']) #MonthlyStats=temp2.set_index(['year','month','ID']) print MonthlyStats.head(5) #Now step2 Apply the stats back to original dataframe #===================================================== #Create the series first #Do MONTHLY series for ID in new.index: #print "Applying correlations for Monthly" + ID #for ID in new.index: xlabel=str(AWSVarToCorrelate+"_"+ID) ylabel=str(VarToCorrelate) #Create new variable corr_label=str(VarToCorrelate+"_MonCorr"+"_"+ID) temp_m1[corr_label].iloc[:]=np.nan testshape=len(temp_m1.groupby([lambda x: x.year,lambda x: x.month])) MonCorResults = temp_m1.groupby([lambda x: x.year,lambda x: x.month], group_keys=False, as_index=False).apply(regress_func,xlabel,ylabel,corr_label) #Do a fill of data where one value is missing. This solves the problem of when the AWS data is #60 minutes but other AWS data is 30 minutes. 
Under that case the selected AWS ID oscillates #between station IDs MonCorResults.fillna(method='ffill', limit=2, inplace=True) temp_m1[corr_label]=MonCorResults #Do ALL series for ID in new.index: #print "Applying correlations for ALL" + ID #for ID in new.index: xlabel=str(AWSVarToCorrelate+"_"+ID) ylabel=str(VarToCorrelate) #Create new variable corr_label=str(VarToCorrelate+"_AllCorr"+"_"+ID) temp_m1[corr_label].iloc[:]=np.nan #AllCorResults = temp_m1.apply(regress_func, axis=0, broadcast=False, raw=False, args=(xlabel,ylabel,corr_label)) #Do the regression. Start by subsetting the two columns required. #Then drop any NaN case wise #reset (get rid of the index) and drop the index rather than keeping it as a column #so it can be passed to linregress xnow=temp_m1[[xlabel,ylabel]] xnow=xnow.dropna(how='any') xdata=xnow[xlabel].dropna().reset_index(drop=True) ydata=xnow[ylabel].dropna().reset_index(drop=True) slope, inter, rsqu, pval, se= stats.linregress(xdata,ydata) print "stats:",slope, inter, rsqu, pval, se #Here use the original column to do apply the lin regresssion as #values had been dropped previously temp_m1[corr_label]=slope*temp_m1[xlabel]+inter #Do a fill of data where one value is missing. This solves the problem of when the AWS data is #60 minutes but other AWS data is 30 minutes. Under that case the selected AWS ID oscillates #between station IDs temp_m1[corr_label].fillna(method='ffill', limit=2, inplace=True) #Do ANNUAL series for ID in new.index: #print "Applying correlations for Annual" + ID #for ID in new.index: xlabel=str(AWSVarToCorrelate+"_"+ID) ylabel=str(VarToCorrelate) #Create new variable corr_label=str(VarToCorrelate+"_AnnCorr"+"_"+ID) corr_label_all=str(VarToCorrelate+"_AllCorr"+"_"+ID) temp_m1[corr_label].iloc[:]=np.nan testshape=len(temp_m1.groupby([lambda x: x.year])) #test to see if more than one year. Otherwise this will give an error #Instead use the ALL series instead and just copy. 
#this is calculated in previous block #pdb.set_trace() if testshape > 1: AnnCorResults = temp_m1.groupby([lambda x: x.year]).apply(regress_func,xlabel,ylabel,corr_label) AnnCorResults.index=AnnCorResults.index.droplevel(0) temp_m1[corr_label]=AnnCorResults else: temp_m1[corr_label]=temp_m1[corr_label_all] #Do a fill of data where one value is missing. This solves the problem of when the AWS data is #60 minutes but other AWS data is 30 minutes. Under that case the selected AWS ID oscillates #between station IDs temp_m1[corr_label].fillna(method='ffill', limit=2, inplace=True) ######################################################################################### #STEP3 Now we have created a number of Time series that have we can use for the gap filling #Now apply the gap filling! ########################################################################################## #Apply this based on the frequency variable passed along All, Annual or Monthly #create different columns for each frequency of gap filling ################################################################# #add a column for the constructed QC flag #This will be 1 if valid data from the tower else the AWS ID or construct_flag_label=str(VarToCorrelate+"_Con_QCFlag") temp_m1[construct_flag_label]=np.nan #Set Construct flag equal to 1 to say that EC data was used and QC is OK. Later add to this the AWS ID based on missing data temp_m1[construct_flag_label][((temp_m1[VarToCorrelate])>-50) & ((temp_m1[VarToCorrelate])<100)]=1 if corr_freq=='all': for row in range(0,(temp_m1.shape[0])): # Check to see if the Current row of variable to contrustct a variable is a number. 
# Call function and fill if necessary #Get the column number from the name construct_col_number=temp_m1.columns.get_loc(construct_label) #Use iloc indexing to get the points in the dataframe if is_nan(temp_m1.iloc[row,construct_col_number])==True: #Loop through BoM ID's first #Fill in using each of the ID stations in the list #This should fill using best station in the list. If any Nans then #they should be filled by the next statio, and then the next #Assign the AWS station ID as the flagfor ID in new.index: #Define Flag variable name and column number corr_label_All_Flag=str(VarToCorrelate+"_Con_QCFlag") corr_Flag_col_number=temp_m1.columns.get_loc(corr_label_All_Flag) #Loop through again with Correlate ALL for ID in new.index: corr_label_All=str(VarToCorrelate+"_AllCorr"+"_"+ID) corr_All_col_number=temp_m1.columns.get_loc(corr_label_All) if (is_nan(temp_m1.iloc[row,corr_All_col_number])==False) and (is_nan(temp_m1.iloc[row,construct_col_number])==True): temp_m1.iloc[row,construct_col_number]=temp_m1.iloc[row,corr_All_col_number] temp_m1.iloc[row,corr_Flag_col_number]=999 print "All Stats" print "Mean ", temp_m1[construct_label].mean() print "Count ", temp_m1[construct_label].count() elif corr_freq=='annual': for row in range(0,(temp_m1.shape[0])): # Check to see if the Current row of variable to contrustct a variable is a number. # Call function and fill if necessary #Get the column number from the name construct_col_number=temp_m1.columns.get_loc(construct_label) #Use iloc indexing to get the points in the dataframe if is_nan(temp_m1.iloc[row,construct_col_number])==True: #Loop through BoM ID's first #Fill in using each of the ID stations in the list #This should fill using best station in the list. 
If any Nans then #they should be filled by the next statio, and then the next #Assign the AWS station ID as the flagfor ID in new.index: #Define Flag variable name and column number corr_label_Ann_Flag=str(VarToCorrelate+"_Con_QCFlag") corr_Flag_col_number=temp_m1.columns.get_loc(corr_label_Ann_Flag) #If Construct is missing AND BoM is present then fill and and flag value for ID in new.index: corr_label_Ann=str(VarToCorrelate+"_AnnCorr"+"_"+str(ID)) corr_col_number=temp_m1.columns.get_loc(corr_label_Ann) if (is_nan(temp_m1.iloc[row,corr_col_number])==False) and (is_nan(temp_m1.iloc[row,construct_col_number])==True): temp_m1.iloc[row,construct_col_number]=temp_m1.iloc[row,corr_col_number] temp_m1.iloc[row,corr_Flag_col_number]=100 #If still missing then use ALL fill #Loop through again with Correlate ALL for ID in new.index: if is_nan(temp_m1.iloc[row,construct_col_number])==True: #Create a new variable corr_label_All=str(VarToCorrelate+"_AllCorr"+"_"+ID) corr_All_col_number=temp_m1.columns.get_loc(corr_label_All) temp_m1.iloc[row,construct_col_number]=temp_m1.iloc[row,corr_All_col_number] temp_m1.iloc[row,corr_Flag_col_number]=999 print "Annual Stats" print "Mean ", temp_m1[construct_label].mean() print "Count ", temp_m1[construct_label].count() elif corr_freq=='monthly': for row in range(0,(temp_m1.shape[0])): # Check to see if the Current row of variable to contrustct a variable is a number. # Call function and fill if necessary #Get the column number from the name construct_col_number=temp_m1.columns.get_loc(construct_label) #Use iloc indexing to get the points in the dataframe if is_nan(temp_m1.iloc[row,construct_col_number])==True: #Loop through BoM ID's first #Fill in using each of the ID stations in the list #This should fill using best station in the list. 
If any Nans then #they should be filled by the next statio, and then the next #Assign the AWS station ID as the flagfor ID in new.index: #Define Flag variable name and column number corr_label_Mon_Flag=str(VarToCorrelate+"_Con_QCFlag") corr_Flag_col_number=temp_m1.columns.get_loc(corr_label_Mon_Flag) #If Construct is missing AND BoM is present then fill and and flag value for ID in new.index: corr_label_Mon=str(VarToCorrelate+"_MonCorr"+"_"+str(ID)) corr_col_number=temp_m1.columns.get_loc(corr_label_Mon) if (is_nan(temp_m1.iloc[row,corr_col_number])==False) and (is_nan(temp_m1.iloc[row,construct_col_number])==True): temp_m1.iloc[row,construct_col_number]=temp_m1.iloc[row,corr_col_number] temp_m1.iloc[row,corr_Flag_col_number]=30 #If still missing then use ALL fill #Loop through again with Correlate ALL for ID in new.index: if is_nan(temp_m1.iloc[row,construct_col_number])==True: #Create a new variable corr_label_All=str(VarToCorrelate+"_AllCorr"+"_"+ID) corr_All_col_number=temp_m1.columns.get_loc(corr_label_All) temp_m1.iloc[row,construct_col_number]=temp_m1.iloc[row,corr_All_col_number] temp_m1.iloc[row,corr_Flag_col_number]=999 print "Monthly Stats" print "Mean ", temp_m1[construct_label].mean() print "Count ", temp_m1[construct_label].count() #Do ALL Counts #================================================= #Counts now done in diagnostics if VarToCorrelate=='Ta': yunits='oC' elif VarToCorrelate=='Ah': yunits='g m-3' elif VarToCorrelate=='Ws': yunits='m s-1' elif VarToCorrelate=='Ws_CSAT': yunits='m s-1' elif VarToCorrelate=='P': yunits='kPa' elif VarToCorrelate=='ps': yunits='kPa' else: yunits=' ' #Do some plots. 
Call the routines #Plot the 30 minute data title4plot="Tower and Constructed 30 minute" xdata1=temp_m1[construct_label]; xdata2=temp_m1[fill_label] ylabel=VarToCorrelate ; xlabel="Time" ; xunits="months" plotstuff.plotandPDF2timeseries(title4plot,xdata1,xdata2,xlabel,ylabel,xunits,yunits,Site_ID,VarToCorrelate,mypathforResults) #Plot the monthly title4plot="Tower and Constructed monthly averages" xdata1=temp_m1[construct_label].groupby([lambda x: x.year,lambda x: x.week]).mean() xdata2=temp_m1[fill_label].groupby([lambda x: x.year,lambda x: x.week]).mean() ylabel=VarToCorrelate ; xlabel="Time" ; xunits="months" plotstuff.plotandPDF2timeseries(title4plot,xdata1,xdata2,xlabel,ylabel,xunits,yunits,Site_ID,VarToCorrelate,mypathforResults) ############################################################ # Create a variable that is the best correlated output. OUtput in entirety # ############################################################ #Create a new label for pandas df column for the contructed variable corr_label=str(VarToCorrelate+"_Corr") temp_m1[corr_label]=np.nan for ID in new.index: corr_label_Ann=str(VarToCorrelate+"_AnnCorr"+"_"+str(ID)) temp_m1[corr_label][temp_m1[corr_label].isnull()]=temp_m1[corr_label_Ann] ########################################### # Call function to do climatology gap fill# ########################################### #If there are any values still missing use climatology to gap fill temp_m1=gap_fill_climatology.climatology_monthly_diurnal(temp_m1,VarToCorrelate) #Write out file print "Writing out files for "+VarToCorrelate+" at "+Site_ID+ " at frequency "+ corr_freq fname_results=mypathforResults+'/'+'Results Tower and BoM AWS Variable '+VarToCorrelate+ ' at ' +Site_ID+'.csv' temp_m1.to_csv(fname_results) print "Finished Meteorological Gap filling for "+VarToCorrelate+" at "+Site_ID+ " at frequency "+ corr_freq con_label_Flag=str(VarToCorrelate+"_Con_QCFlag") #return a dataframe with two variables return 
temp_m1[[construct_label,con_label_Flag,corr_label]]