def printHandleJsonValidity(jsonOK, errMsg):
    #print "printHandleJsonValidity():", (jsonOK, errMsg)
    if jsonOK:
        #csvt.printProgress("JSON is OK")
        pass
    else:
        csvt.printProgress("JSON is WRONG!\n%s" % errMsg)
def GetTimeRangeOfData(self):
    '''
    Return json:
    {
        "minDateTime": "2006-01-03 00:10:00 UTC",
        "maxDateTime": "2006-01-30 08:57:00 UTC"
    }
    Note: The actual range is computed in ReadInputCSV().
    '''
    if not self.initialized:
        printProgress("ERROR: Initialize first!")
        return
    return self.csvDataObj.GetTimeRangeOfData()
def FindClosestDateTimeIndex(self, givenDateTime):
    self.givenDateTime = givenDateTime
    closestDateTime = self.FindClosestDateTime(givenDateTime, self.dateTimeArray)
    # np.where(self.dateTimeArray == closestDateTime) gives e.g. (array([2511]),)
    closestDateTimeIndex = np.where(self.dateTimeArray == closestDateTime)[0][0]
    csvT.printProgress("givenDateTime=%s, closestDateTime=%s, closestDateTimeIndex=%d" %
                       (str(givenDateTime), str(closestDateTime), closestDateTimeIndex))
    return closestDateTimeIndex
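
# The FindClosestDateTime() helper used above is defined elsewhere. A minimal
# sketch of what such a helper could look like, ASSUMING dateTimeArray is a
# NumPy datetime64 array (hypothetical; the real implementation may differ):
def _findClosestDateTimeSketch(givenDateTime, dateTimeArray):
    # Absolute time difference to every entry; argmin picks the nearest one.
    deltas = np.abs(dateTimeArray - np.datetime64(givenDateTime))
    return dateTimeArray[np.argmin(deltas)]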
def GetLatLonBBOXOfData(self):
    '''
    Return json:
    {
        "west": 4.0161296310787549,
        "east": 6.8340902374793098,
        "north": 50.304600877017236,
        "south": 48.4668502279617
    }
    Note: The actual range is computed in ReadInputCSV().
    '''
    if not self.initialized:
        printProgress("ERROR: Initialize first!")
        return
    return self.csvDataObj.GetLatLonBBOXOfData()
def FindClosestLonLatPointIndex(self, lon, lat):
    # This works with the 2D array self.lonLatStacked: [ [lon,lat], ... ]
    printProgress("Computing distance %s: to lat-lon point-array [%d];" %
                  (str((lon, lat)), self.arraySize))
    idx = 0
    distArray = np.zeros(self.lonLatStacked.shape[0]).astype(np.float)
    for tupleLL in self.lonLatStacked:  # tupleLL == [lon,lat]
        dist = self.Distance2pointsInLonLat(lon, lat, tupleLL[0], tupleLL[1])
        distArray[idx] = dist
        idx += 1
    minDist = np.min(distArray)
    # np.where(distArray == minDist) gives e.g. (array([26451]),)
    minDistIndex = np.where(distArray == minDist)[0][0]
    printProgress("Minimum distance %s: is %f; [%s] = %s" %
                  (str((lon, lat)), minDist, str(minDistIndex),
                   str(self.lonLatStacked[minDistIndex])))
    return minDistIndex
def GetDataAtIndex(self, stationCode, timeIndex, dataIndex, variableName="precipitation"):
    if dataIndex > self.arraySize:
        self.CLASSPRINT("ERROR: Probing non-existing data!")
        return
    variableFound = (variableName in self.GetListOfExistingData())
    self.GetVariable(variableName)
    if variableFound:
        '''
        self.supportedData = {
            'Precipitation': ['RH', '1hour_validated_rh'],
            'Temperature': ['T_DRYB_10', '10min_validated_t']
        }
        self.elementsDict = {
            'RH': (1, 'RH', 'Precipitation', 'Precipitation AWS', 0.1, 'mm'),
            'T_DRYB_10': (8, 'T_DRYB_10', 'Temperature', 'Air temperature (10-minute)', 0.1, 'degrees C')
        }
        ## (1, 'RH', 'Precipitation', 'Precipitation AWS', 0.1, 'mm')
        ## self.elementsDict['RH'][4] == 0.1 ... scaling
        ## self.elementsDict['RH'][5] ... units
        '''
        closestDateTime = self.dateTimeArray[timeIndex]
        elementName = self.supportedData[variableName][1]
        elementId = self.supportedData[variableName][0]
        dataValues = self.QueryValues(stationCode, elementName, elementId,
                                      getValues=True, getStationName=False, getLonLat=False,
                                      timeRange=(closestDateTime, closestDateTime), limitTo=-1)
        # dataValues == [[datetime.datetime(2010, 7, 14, 16, 30) 193]]
        dataValue = dataValues[0][1] * self.elementsDict[elementId][4]
        dataUnits = self.elementsDict[elementId][5]
        printProgress("givenDateTime=%s, closestDateTimeIndex=%d, query(lon, lat)=%s, "
                      "minDistanceDataIndex=%d, dataValue=%f %s" %
                      (str(self.givenDateTime), timeIndex,
                       str((self.givenLon, self.givenLat)), dataIndex,
                       float(dataValue), dataUnits))
        return dataValue
    else:
        return None
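
# Worked example of the scaling step above, using the values from the comments:
# dataValues[0][1] == 193 is the raw integer stored in the database, and
# self.elementsDict['RH'][4] == 0.1 is its scale factor, so
# dataValue == 193 * 0.1 == 19.3, with units self.elementsDict['RH'][5] == 'mm'.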
def CLASSPRINT(self, *args):
    '''
    Printing debug information ...
    '''
    if self.verbose:
        try:
            pid = "%5d" % (os.getpid())
        except:
            pid = "%5d" % (0)
        try:
            printProgress("[%s:%s] %s.%s() %s" %
                          (pid, csvT.loggerName, self.__class__.__name__,
                           inspect.stack()[1][3], ''.join(args)))
        except:
            # Fall back for non-string arguments.
            printProgress("[%s:%s] %s.%s() %s" %
                          (pid, csvT.loggerName, self.__class__.__name__,
                           inspect.stack()[1][3], ''.join(map(str, args))))
def FindClosestLonLatPointIndex(self, lon, lat):
    self.givenLon = lon
    self.givenLat = lat
    # This works with the 2D array self.lonLatStacked: [ [lon,lat], ... ]
    if self.dataType == "GRID-DATA":
        printProgress("Computing distance %s: to lat-lon-grid [%dx%d]; gridSize=%d" %
                      (str((lon, lat)), self.xDim, self.yDim, self.gridSize))
    elif self.dataType == "STATION-DATA":
        printProgress("Computing distance %s: to lat-lon stations [%d]; gridSize=%d" %
                      (str((lon, lat)), self.lonDim, self.gridSize))
    idx = 0
    distArray = np.zeros(self.lonLatStacked.shape[0]).astype(np.float)
    for tupleLL in self.lonLatStacked:  # tupleLL == [lon,lat]
        dist = self.Distance2pointsInLonLat(lon, lat, tupleLL[0], tupleLL[1])
        distArray[idx] = dist
        idx += 1
    minDist = np.min(distArray)
    # np.where(distArray == minDist) gives e.g. (array([26451]),)
    minDistIndex = np.where(distArray == minDist)[0][0]
    printProgress("Minimum distance %s: is %f; [%s] = %s" %
                  (str((lon, lat)), minDist, str(minDistIndex),
                   str(self.lonLatStacked[minDistIndex])))
    return minDistIndex
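
# The loop above makes one Python-level Distance2pointsInLonLat() call per
# point. A minimal vectorized sketch of the same nearest-neighbour search,
# ASSUMING a haversine-style great-circle distance (the actual
# Distance2pointsInLonLat() implementation is not shown here):
def _findClosestLonLatIndexVectorized(lon, lat, lonLatStacked, earthRadiusKm=6371.0):
    # Haversine distance from (lon, lat) to every [lon, lat] row at once.
    lons = np.radians(lonLatStacked[:, 0])
    lats = np.radians(lonLatStacked[:, 1])
    lon0, lat0 = np.radians(lon), np.radians(lat)
    a = (np.sin((lats - lat0) / 2.0) ** 2 +
         np.cos(lat0) * np.cos(lats) * np.sin((lons - lon0) / 2.0) ** 2)
    dist = 2.0 * earthRadiusKm * np.arcsin(np.sqrt(a))
    return int(np.argmin(dist))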
def ReadJsonConfigurationFromFile(jsonFileName):
    jsonFile = open(jsonFileName)
    jsonLines = jsonFile.readlines()
    jsonFile.close()
    # Note: object_hook takes care of removing unicode u".." where possible
    if not MISSING_LIB_commentjson:
        jsonStr = ''.join(jsonLines)
        if not checkJsonValidity(jsonStr, handlePrintFunc=printHandleJsonValidity,
                                 _hookObject=_decode_dict):
            csvt.printProgress("jsonStr is NOT VALID JSON!\n")
            raise ValueError('JSON-INVALID')
        try:
            w3dxConfigDict = commentjson.loads(jsonStr, encoding=JsonEncodingType,
                                               object_hook=_decode_dict)
        except Exception as eStr:
            exceptionStr = ("Cannot construct JSON from: type=%s;\nException: %s;\ngiven = %s" %
                            (str(type(jsonStr)), str(eStr), enumerateLines(str(jsonStr))))
            (jsonOK, errMsg) = (False, exceptionStr)
            printHandleJsonValidity(jsonOK, errMsg)
            raise ValueError('JSON-INVALID')
    else:
        # In case of the missing library commentjson:
        # remove commented lines from the json text ourselves.
        jsonStr = ''
        for ln in jsonLines:
            ln0 = ln.strip()
            if (len(ln0) > 0) and (ln0[0] != '#') and (ln0 != '\n'):
                jsonStr += ln
        if not checkJsonValidity(jsonStr, handlePrintFunc=printHandleJsonValidity,
                                 _hookObject=_decode_dict):
            csvt.printProgress("jsonStr is NOT VALID JSON!\n")
            raise ValueError('JSON-INVALID')
        try:
            w3dxConfigDict = json.loads(jsonStr, encoding=JsonEncodingType,
                                        object_hook=_decode_dict)
        except Exception as eStr:
            exceptionStr = ("Cannot construct JSON from: type=%s;\nException: %s;\ngiven = %s" %
                            (str(type(jsonStr)), str(eStr), enumerateLines(str(jsonStr))))
            (jsonOK, errMsg) = (False, exceptionStr)
            printHandleJsonValidity(jsonOK, errMsg)
            raise ValueError('JSON-INVALID')
    return w3dxConfigDict
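
# Hedged usage sketch; "job.json" and its contents are illustrative only.
# '#'-prefixed comment lines like the one below are accepted (stripped
# manually when the commentjson library is unavailable):
#
#   # wrangling job description
#   {"datatowrangle": [
#       {"dataURL": "http://opendap.knmi.nl/knmi/thredds/dodsC/DATALAB/hackathon/radarFull2015.nc",
#        "fields": ["precipitation_amount"]}
#   ]}
#
#   try:
#       jobDescDict = ReadJsonConfigurationFromFile("job.json")
#   except ValueError:
#       # Raised as ValueError('JSON-INVALID') on malformed input.
#       jobDescDict = None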
def WrangleWithNetCdfData(self, argsDict):
    '''
    dwp.WrangleWithNetCdfData( { "outputCSV": options.outputCSV } )
    '''
    if not self.initialized:
        printProgress("ERROR: Initialize first!")
        return
    if not "outputCSV" in argsDict:
        printProgress("ERROR: outputCSV must be provided!")
        raise ValueError('MISSING-input-file(s)')
    else:
        self.outputCSV = argsDict["outputCSV"]
        self.csvDataObj.SetOutputCSVFile(self.outputCSV)
    printProgress("*******************************************")
    printProgress("***** Wrangling Processing STARTED. ******")
    printProgress("*******************************************")
    tmpFileName = "./tempFile.csv"
    self.csvDataObj.WriteFullQueryDataToTmpFile(tmpFileName)
    '''
    # Do everything at once:
    self.csvDataObj.ReadFullQueryDataFromTmpFile(tmpFileName, startAtRow=0, readRows=-1)
    self.csvDataObj.WrangleMeteoParameter(parameterName="temperature")
    self.csvDataObj.WrangleMeteoParameter(parameterName="precipitation")
    self.csvDataObj.ProduceOutput(exportLonLat=True)
    '''
    '''
    Job description example:
    {"datatowrangle": [
        {
            "dataURL": "http://opendap.knmi.nl/knmi/thredds/dodsC/DATALAB/hackathon/radarFull2015.nc",
            "fields": ["precipitation_amount"]
        }
    ]}
    '''
    for dataSource in self.jobDescDict["datatowrangle"]:
        url = dataSource["dataURL"]
        ndo = ncdft.ncdfDataObject()
        ndo.SetDataURL(url)
        ndo.OpenMetaData()
        dataSource["ndo"] = ndo
    self.totalNumberOfCSVrows = self.csvDataObj.GetTotalNumberOfCSVrows()
    self.nproc = 1
    #self.percentFraction = 0.01  # 0.01% of 150.000 rows => 10000 files of 15 rows each
    #self.percentFraction = 0.1   # 0.1% of 150.000 rows => 1000 files of 150 rows each
    self.percentFraction = 1      # 1% of 150.000 rows => 100 files of 1500 rows each
    self.percentParts = int(100 / self.percentFraction)
    # Number of rows representing 1% (0.1%, 0.01%) of the total:
    self.processingBulkSize = self.totalNumberOfCSVrows / self.percentParts
    if self.limitTo > 0 and self.limitTo < self.processingBulkSize:
        self.processingBulkSize = self.limitTo
    # Split the temporary request data into bulks.
    bulkNr = 0
    rowsProcessed = 0
    tempFileList = []
    parameterList = ["utc-time", "longitude", "latitude"]
    try:
        while rowsProcessed < self.totalNumberOfCSVrows:
            self.csvDataObj.ReadFullQueryDataFromTmpFile(
                tmpFileName, startAtRow=rowsProcessed, readRows=self.processingBulkSize)
            for dataSource in self.jobDescDict["datatowrangle"]:
                for parameterName in dataSource["fields"]:
                    valueArray = self.WrangleWithNetCdfDataArray(
                        dataSource, parameterName, rowsProcessed)
                    if valueArray is None:
                        raise Exception("NONEXISTENT-variable: %s" % parameterName)
                    self.csvDataObj.meteoDataStore[parameterName] = valueArray
                    parameterList.append(parameterName)
            tmpBulkFileName = "./tempBulkOutputFile%d.csv" % (bulkNr)
            tempFileList.append(tmpBulkFileName)
            self.csvDataObj.ProduceBulkOutput(
                tmpBulkFileName, bulkNr, startAtRow=rowsProcessed,
                readRows=self.processingBulkSize, exportLonLat=True)
            rowsProcessed += self.processingBulkSize
            bulkNr += 1
            #if self.limitTo > 0:
            #    self.callStatusCallback("Calculating. %d of %d records processed" %
            #                            (rowsProcessed, self.limitTo),
            #                            (float(self.processingBulkSize) / float(self.limitTo)) * 100.0)
            #else:
            #    self.callStatusCallback("Calculating. %d of %d records processed" %
            #                            (rowsProcessed, self.totalNumberOfCSVrows),
            #                            self.percentFraction)
            if self.limitTo > 0 and rowsProcessed >= self.limitTo:
                break
    except:
        raise
    finally:
        try:
            self.csvDataObj.WriteCSVHeader(fieldList=parameterList)
            self.csvDataObj.JoinBulkResults(tempFileList)
        except:
            pass
    printProgress("*******************************************")
    printProgress("***** Wrangling Processing FINISHED. ******")
    printProgress("*******************************************")
def ReadInputCSV(self):
    if not self.initialized:
        printProgress("ERROR: Initialize first!")
        return
    self.csvDataObj.ReadInputCSV()
def Initialize(self, argsDict):
    '''
    dwp.Initialize( {
        "inputCSV": options.inputCSV,
        "metaCSV": options.metaCSV,
        "jobDesc": options.jobDesc,
        "logFile": options.outputCSV + ".log",
        "limitTo": options.limitTo,
        "scanOnly": True,
        "verboseLevel": 10   # 0, 1, 10
    } )
    OPTIONAL parameters: limitTo, verboseLevel
    '''
    if not "logFile" in argsDict:
        printProgress("ERROR: logFile must be provided!")
        raise ValueError('MISSING-input-file(s)')
    else:
        self.logFileName = argsDict["logFile"]
        csvT.logFileName = self.logFileName
        csvT.InitializeWranglerLogger(csvT.logFileName)
    if not "inputCSV" in argsDict:
        printProgress("ERROR: inputCSV must be provided!")
        raise ValueError('MISSING-input-file(s)')
    else:
        self.inputCSV = argsDict["inputCSV"]
    if not "metaCSV" in argsDict:
        printProgress("ERROR: metaCSV must be provided!")
        raise ValueError('MISSING-input-file(s)')
    else:
        self.metaCSV = argsDict["metaCSV"]
    if not "jobDesc" in argsDict and not ("scanOnly" in argsDict and argsDict["scanOnly"]):
        printProgress("ERROR: jobDesc must be provided!")
        raise ValueError('MISSING-input-file(s)')
    elif "jobDesc" in argsDict and not ("scanOnly" in argsDict and argsDict["scanOnly"]):
        self.jobDesc = argsDict["jobDesc"]
        self.jobDescDict = jst.ReadJsonConfigurationFromFile(self.jobDesc)
        self.scanOnly = False
    elif "scanOnly" in argsDict and argsDict["scanOnly"]:
        self.jobDesc = None
        self.jobDescDict = None
        self.scanOnly = True
    if not "limitTo" in argsDict:
        # limitTo: OPTIONAL parameter; -1 means no limit (process the whole csv dataset).
        self.limitTo = -1
    else:
        self.limitTo = argsDict["limitTo"]
    if not os.path.exists(self.inputCSV):
        printProgress("ERROR: inputCSV does NOT exist! %s" % (self.inputCSV))
        raise ValueError('MISSING-input-file(s)')
    if "statusCallback" in argsDict:
        self.statusCallback = argsDict["statusCallback"]
    self.csvDataObj = csvT.csvDataObject()
    if "verboseLevel" in argsDict:
        self.SetVerboseLevel(argsDict["verboseLevel"])
        self.csvDataObj.SetVerboseLevel(argsDict["verboseLevel"])
    self.csvDataObj.SetInputCSVFile(self.inputCSV)
    self.csvDataObj.SetInputMetaCSVFile(self.metaCSV)
    if not self.scanOnly:
        self.csvDataObj.SetJobDescriptionFile(self.jobDesc)
    if not os.path.exists("./output"):
        os.makedirs("./output")
    self.csvDataObj.ApplyLimit(self.limitTo)
    self.initialized = True
                  default=False, help="Perform SCAN-ONLY action on CSV data.")
(options, args) = parser.parse_args()

csvT.logFileName = options.logfile[:]

try:
    dwp.Initialize({
        "inputCSV": options.inputCSV,
        "metaCSV": options.metaCSV,
        "jobDesc": options.jobDesc,
        "logFile": options.outputCSV + ".log",
        #"logFile": csvT.logFileName,
        "limitTo": options.limitTo,
        "verboseLevel": options.verboseLevel,
        "scanOnly": options.scanOnly
    })
    dwp.ReadInputCSV()
    if not options.scanOnly:
        dwp.WrangleWithNetCdfData({"outputCSV": options.outputCSV})
    # Possible exceptions raised:
    #   raise ValueError('JSON-INVALID')
    #   raise ValueError('MISSING-input-file(s)')
except ValueError as err:
    printProgress("Caught exception: " + str(err.args))
    sys.exit(1)
sys.exit(0)
def GetDataAtIndex(self, timeIndex, dataIndex, variableName="precipitation_amount"):
    '''
    NOTE: dataIndex is a 1D index into the data. For 2D arrays the 2D index
    has to be computed from the 1D index.

    self.longitudes ... 2D array [256][256]
    self.latitudes  ... 2D array [256][256]
    self.lonLatStacked [256 * 256] of [ lon, lat ]

    print self.metaData.variables['image1_image_data']
    <type 'netCDF4._netCDF4.Variable'>
    uint16 image1_image_data(time, y, x)
        VERSION: 1.2
        grid_mapping: projection
        units: kg m-2
        _FillValue: 65535
        standard_name: precipitation_amount
        comment: Original units are in mm
        scale_factor: 0.01
        add_offset: 0.0
    unlimited dimensions: time
    current shape = (1051776, 256, 256)
    filling on
    '''
    if self.dataType == "GRID-DATA":
        idX = dataIndex % self.xDim
        idY = dataIndex // self.xDim  # integer division: row index
        keylist = self.metaData.variables.keys()
        variableFound = self.GetVariable(variableName)
        if variableFound:
            dataValue = variableFound[timeIndex][idY][idX]  # grid data (time, lat, lon)
            printProgress("givenDateTime=%s, closestDateTimeIndex=%d, query(lon, lat)=%s, "
                          "minDistanceDataIndex=%d, dataValue=%f %s" %
                          (str(self.givenDateTime), timeIndex,
                           str((self.givenLon, self.givenLat)), dataIndex,
                           float(dataValue), variableFound.units))
            return dataValue
        else:
            return None
    elif self.dataType == "STATION-DATA":
        idLon = dataIndex % self.lonDim
        keylist = self.metaData.variables.keys()
        variableFound = self.GetVariable(variableName)
        if variableFound:
            dataValue = variableFound[timeIndex][idLon]  # station data (time, station)
            printProgress("givenDateTime=%s, closestDateTimeIndex=%d, query(lon, lat)=%s, "
                          "minDistanceDataIndex=%d, dataValue=%f %s" %
                          (str(self.givenDateTime), timeIndex,
                           str((self.givenLon, self.givenLat)), dataIndex,
                           float(dataValue), variableFound.units))
            return dataValue
        else:
            return None
    else:
        return None
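
# Worked example of the 1D-to-2D index mapping above, with xDim == 256 as in
# the docstring (dataIndex chosen only for illustration):
#   dataIndex = 300
#   idX = dataIndex % 256    # == 44  (x, column)
#   idY = dataIndex // 256   # == 1   (y, row)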
def QueryValues(self, stationCode, elementName, elementId, getValues=True,
                getStationName=False, getLonLat=False, timeRange=("", ""), limitTo=-1):
    '''
    ## (1, 'RH', 'Precipitation', 'Precipitation AWS', 0.1, 'mm')
    ## elementsDict['RH'][4] == 0.1 ... scaling
    ## elementsDict['RH'][5] ... units

    precipitationDataDeBILT = self.QueryValues(260, '1hour_validated_rh', 'RH',
        getValues=True, getStationName=False, getLonLat=False, timeRange=("",""), limitTo=-1)
    pprint.pprint(list(precipitationDataDeBILT), width=200)

    tempDataDeBILT = self.QueryValues(260, '10min_validated_t', 'T_DRYB_10',
        getValues=True, getStationName=False, getLonLat=False, timeRange=("",""), limitTo=-1)
    pprint.pprint(list(tempDataDeBILT), width=200)
    '''
    db = self.InitDBConnection()
    cur = db.cursor()
    if limitTo == -1:
        LIMIToption = ""
    else:
        LIMIToption = "limit %d" % (int(limitTo))
    if timeRange == ("", ""):
        timeRangeOption = ""
    else:
        # e.g. " a.date >= '2015-01-01 10:30:00' and a.date <= '2015-01-01 12:30:00' and "
        timeRangeOption = " a.date >= '%s' and a.date <= '%s' and " % (timeRange[0], timeRange[1])
    if getLonLat:
        getLonLatOption = "c.longitude, c.latitude, "
    else:
        getLonLatOption = ""
    if getStationName:
        getStationNameOption = "c.name, "
    else:
        getStationNameOption = ""
    if getValues:
        getValuesOption = ", a.value "
    else:
        getValuesOption = ""
    # NOTE: c.type_id = 2 and e.type = 'H' refer to automatic weather-stations.
    queryStr = ("SELECT %s %s a.date %s "
                "FROM %s a, series b, stations c, elements d, types e "
                "WHERE %s c.code = %d and c.type_id = 2 and c.type_id = e.type_id and "
                "a.data_id = b.data_id and b.code = c.code and "
                "b.type_id = c.type_id and d.element_id = b.element_id and "
                "d.element = '%s' and e.type = 'H' %s;" %
                (getStationNameOption, getLonLatOption, getValuesOption,
                 elementName, timeRangeOption, stationCode, elementId, LIMIToption))
    printProgress("MySQL_query: " + queryStr)
    cur.execute(queryStr)
    data = []
    for row in cur.fetchall():
        data.append(list(row))
    db.close()
    dataNP = np.array(data)
    return dataNP
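
# The query above is assembled purely via %-interpolation. A minimal sketch of
# the same station/element filter using MySQLdb-style parameter binding for the
# *values* instead (table names cannot be bound, so elementName must still be
# validated/whitelisted; a sketch only, not the code used above):
def _queryValuesSketch(cur, elementName, stationCode, elementId):
    queryStr = ("SELECT a.date, a.value "
                "FROM %s a, series b, stations c, elements d, types e "
                "WHERE c.code = %%s and c.type_id = 2 and c.type_id = e.type_id and "
                "a.data_id = b.data_id and b.code = c.code and "
                "b.type_id = c.type_id and d.element_id = b.element_id and "
                "d.element = %%s and e.type = 'H'" % elementName)
    cur.execute(queryStr, (stationCode, elementId))
    return cur.fetchall()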