def parseGLTextFile(self, infoDict):
    """ parse a GreenLogger text file """
    dataRecItemCt = 0
    dataRecsAdded = 0
    dataRecsDupSkipped = 0
    self.putTextLinesIntoTmpTable(infoDict)
    # get the metadata header
    stSQL = """SELECT Line FROM tmpLines
        WHERE Line LIKE '{"Instrument identifier":%'
        GROUP BY Line;"""
    mtDat = scidb.curT.execute(stSQL).fetchone()
    dictHd = ast.literal_eval(mtDat['Line'])
    # somewhat redundant to do 'LoggerSerialNumber' here (fn 'assureChannelIsInDB'
    # would fill it in later), but allows putting Model into the DB
    iInstSpecID = scidb.assureItemIsInTableField(dictHd['Model'],
                                                 "InstrumentSpecs", "InstrumentSpec")
    iLoggerID = scidb.assureItemIsInTableField(dictHd['Instrument identifier'],
                                               "Loggers", "LoggerSerialNumber")
    scidb.curD.execute("UPDATE Loggers SET InstrumentSpecID = ? WHERE ID = ?;",
                       (iInstSpecID, iLoggerID))
    # get the hour offset(s); most files will have only one
    stSQL = "SELECT substr(Line, 21, 3) as TZ FROM tmpLines " \
            "WHERE Line LIKE '____-__-__ __:__:__ ___%' GROUP BY TZ;"
    hrOffsets = scidb.curT.execute(stSQL).fetchall()
    for hrOffset in hrOffsets:
        iTimeZoneOffset = int(hrOffset['TZ'])
        # make a dictionary of channel IDs for data lines that have this hour
        # offset; it would differ slightly for different hour offsets
        lCols = dictHd['Columns']  # column metadata; a list of dictionaries
        dictChannels = {}  # the dictionary we will build
        for dictCol in lCols:
            # somewhat redundant to fill these in before calling
            # 'assureChannelIsInDB', but allows assigning sensor device types
            iDeviceTypeID = scidb.assureItemIsInTableField(dictCol['Device'],
                                                           "DeviceSpecs", "DeviceSpec")
            iSensorID = scidb.assureItemIsInTableField(dictCol['Identifier'],
                                                       "Sensors", "SensorSerialNumber")
            scidb.curD.execute("UPDATE Sensors SET DeviceSpecID = ? WHERE ID = ?;",
                               (iDeviceTypeID, iSensorID))
            # build the list that creates the channel; items are:
            # ChannelID, originalCol, Logger, Sensor, dataType, dataUnits, hrOffset, new
            lChannel = [0, dictCol['Order'], dictHd['Instrument identifier'],
                        dictCol['Identifier'], dictCol['DataType'],
                        dictCol['DataUnits'], iTimeZoneOffset, '']
            dictChannels[dictCol['Order']] = lChannel[:]  # the key is the column number
            scidb.assureChannelIsInDB(dictChannels[dictCol['Order']])  # get or create the channel
            iChannelID = dictChannels[dictCol['Order']][0]
            # store the column name as a Series
            iSeriesID = scidb.assureItemIsInTableField(dictCol['Name'],
                                                       "DataSeries", "DataSeriesDescription")
            # tie it to this Channel, to offer later
            stSQLcs = 'INSERT INTO tmpChanSegSeries(ChannelID, SeriesID) VALUES (?, ?);'
            try:
                scidb.curD.execute(stSQLcs, (iChannelID, iSeriesID))
            except sqlite3.IntegrityError:
                pass  # silently ignore duplicates
        # make a list of channel IDs for the set of lines with this HrOffset,
        # for quick lookup; it is indexed by the columns list, and is zero-based
        lCh = []
        for iCol in range(len(lCols)):
            iNomCol = iCol + 1
            if iNomCol in dictChannels:
                lChanSet = dictChannels[iNomCol][:]
                lCh.append(lChanSet[0])
            else:  # does not correspond to a data column
                lCh.append(0)  # placeholder, to make list indexes work right
        # done setting up channels, get data lines
        stSQL = "SELECT ID, Line FROM tmpLines WHERE substr(Line, 21, 3) = ? " \
                "AND Line LIKE '____-__-__ __:__:__ ___%' ORDER BY ID;"
        recs = scidb.curT.execute(stSQL, (hrOffset['TZ'],)).fetchall()
        for rec in recs:
            lData = rec['Line'].split('\t')
            # item zero is the timestamp followed by the timezone offset
            sTimeStamp = lData[0][:-4]  # drop timezone offset, we already have it
            tsAsTime = datetime.datetime.strptime(sTimeStamp, "%Y-%m-%d %H:%M:%S")
            # assign the result; replace() returns a new datetime rather than
            # modifying in place
            tsAsTime = tsAsTime.replace(tzinfo=None)  # make sure it carries no local timezone info
            tsAsTime = tsAsTime + datetime.timedelta(hours=-iTimeZoneOffset)  # convert to UT
            stSQL = "INSERT INTO Data (UTTimestamp, ChannelID, Value) VALUES (?, ?, ?)"
            for iCol in range(len(lData)):
                if iCol > 0:  # an item of data
                    # give some progress diagnostics
                    dataRecItemCt += 1
                    if dataRecItemCt % 100 == 0:
                        self.msgArea.ChangeValue("Line " + str(rec['ID']) +
                            " of " + str(infoDict['lineCt']) + "; " +
                            str(dataRecsAdded) + " records added, " +
                            str(dataRecsDupSkipped) + " duplicates skipped.")
                        wx.Yield()
                    try:  # much faster to try and fail than to test first
                        scidb.curD.execute(stSQL, (tsAsTime, lCh[iCol], lData[iCol]))
                        dataRecsAdded += 1  # count it
                    except sqlite3.IntegrityError:  # item is already in Data table
                        dataRecsDupSkipped += 1  # count but otherwise ignore
                    finally:
                        wx.Yield()
    # finished parsing lines
    infoDict['numNewDataRecsAdded'] = dataRecsAdded
    infoDict['numDupDataRecsSkipped'] = dataRecsDupSkipped
    infoDict['stParseMsg'] = str(infoDict['lineCt']) + " lines processed; " + \
        str(dataRecsAdded) + " data records added to database, " + \
        str(dataRecsDupSkipped) + " duplicates skipped."
    self.msgArea.ChangeValue(infoDict['stParseMsg'])
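# ---------------------------------------------------------------------------
# Illustrative sketch (never called by the parser): what the SQL
# substr(Line, 21, 3) in the queries above extracts, and how a timestamp is
# shifted to universal time. SQLite's substr() is 1-based, so it matches the
# Python slice [20:23]. The data line is hypothetical but follows the
# '____-__-__ __:__:__ ___' shape that the LIKE patterns above select.
def _demoGLTimestampToUT():
    import datetime
    line = "2014-07-01 10:30:00 -08\t21.5\t47.2"  # hypothetical data line
    sTZ = line[20:23]                             # '-08', same as substr(Line, 21, 3)
    iTimeZoneOffset = int(sTZ)
    lData = line.split('\t')
    sTimeStamp = lData[0][:-4]                    # drop ' -08', we already have it
    tsAsTime = datetime.datetime.strptime(sTimeStamp, "%Y-%m-%d %H:%M:%S")
    tsAsTime = tsAsTime + datetime.timedelta(hours=-iTimeZoneOffset)
    print(tsAsTime)                               # 2014-07-01 18:30:00, i.e. UT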
def parseBlueTermFile(self, infoDict):
    """ a BlueTerm log file can contain captured data from a series of daily
        GL text files, possibly even from different loggers """
    dataRecItemCt = 0
    dataRecsAdded = 0
    dataRecsDupSkipped = 0
    self.putTextLinesIntoTmpTable(infoDict)
    # Find the separate data dump segments. Within each one, data will be from
    # only one logger, so we can use the same metadata header throughout.
    # Build a list of 2-tuples; tuple item 0 is the starting ID of the dump
    # segment and tuple item 1 is the ending ID.
    lDmpSegs = []
    stSQL = """SELECT ID, Line FROM tmpLines
        WHERE Line = '{"datadump":"begin"}' OR Line = '{"datadump":"end"}'
        ORDER BY ID;"""
    dmSegs = scidb.curT.execute(stSQL).fetchall()
    # begin or end may be missing due to log interruption; only use segments
    # that have a valid begin/end pair
    idSegStart = None
    for dmSegItm in dmSegs:
        if dmSegItm['Line'] == '{"datadump":"begin"}':
            idSegStart = dmSegItm['ID']
        elif dmSegItm['Line'] == '{"datadump":"end"}':
            if idSegStart is not None:
                lDmpSegs.append((idSegStart, dmSegItm['ID']))
            # whether paired or orphaned, reset and look for the next segment
            idSegStart = None
    print "dump segments:", lDmpSegs
    # being worked on >>>>
    iNumSegs = len(lDmpSegs)
    if iNumSegs == 0:
        infoDict['stParseMsg'] = " No valid data segments in file."
        # the following loop will be skipped
    iCtSeg = 0
    for tSeg in lDmpSegs:
        iCtSeg += 1
        idSegStart, idSegEnd = tSeg
        # get the metadata header; if there are several, they will all be the
        # same within one dump segment
        stSQL = """SELECT Line FROM tmpLines
            WHERE ID > ? AND ID < ?
            AND Line LIKE '{"Instrument identifier":%'
            GROUP BY Line;"""
        mtDat = scidb.curT.execute(stSQL, tSeg).fetchone()
        dictHd = ast.literal_eval(mtDat['Line'])
        # somewhat redundant to do 'LoggerSerialNumber' here (fn
        # 'assureChannelIsInDB' would fill it in later), but allows putting
        # Model into the DB
        iInstSpecID = scidb.assureItemIsInTableField(dictHd['Model'],
                                                     "InstrumentSpecs", "InstrumentSpec")
        iLoggerID = scidb.assureItemIsInTableField(dictHd['Instrument identifier'],
                                                   "Loggers", "LoggerSerialNumber")
        scidb.curD.execute("UPDATE Loggers SET InstrumentSpecID = ? WHERE ID = ?;",
                           (iInstSpecID, iLoggerID))
        # get the hour offset(s); most files will have only one
        stSQL = "SELECT substr(Line, 21, 3) as TZ FROM tmpLines " \
                "WHERE ID > ? AND ID < ? " \
                "AND Line LIKE '____-__-__ __:__:__ ___%' GROUP BY TZ;"
        hrOffsets = scidb.curT.execute(stSQL, tSeg).fetchall()
        for hrOffset in hrOffsets:
            iTimeZoneOffset = int(hrOffset['TZ'])
            # make a dictionary of channel IDs for data lines that have this
            # hour offset; it would differ slightly for different hour offsets
            lCols = dictHd['Columns']  # column metadata; a list of dictionaries
            dictChannels = {}  # the dictionary we will build
            for dictCol in lCols:
                # somewhat redundant to fill these in before calling
                # 'assureChannelIsInDB', but allows assigning sensor device types
                iDeviceTypeID = scidb.assureItemIsInTableField(dictCol['Device'],
                                                               "DeviceSpecs", "DeviceSpec")
                iSensorID = scidb.assureItemIsInTableField(dictCol['Identifier'],
                                                           "Sensors", "SensorSerialNumber")
                scidb.curD.execute("UPDATE Sensors SET DeviceSpecID = ? WHERE ID = ?;",
                                   (iDeviceTypeID, iSensorID))
                # build the list that creates the channel; items are:
                # ChannelID, originalCol, Logger, Sensor, dataType, dataUnits, hrOffset, new
                lChannel = [0, dictCol['Order'], dictHd['Instrument identifier'],
                            dictCol['Identifier'], dictCol['DataType'],
                            dictCol['DataUnits'], iTimeZoneOffset, '']
                dictChannels[dictCol['Order']] = lChannel[:]  # the key is the column number
                scidb.assureChannelIsInDB(dictChannels[dictCol['Order']])  # get or create the channel
                iChannelID = dictChannels[dictCol['Order']][0]
                # store the column name as a Series
                iSeriesID = scidb.assureItemIsInTableField(dictCol['Name'],
                                                           "DataSeries", "DataSeriesDescription")
                # tie it to this Channel, to offer later
                stSQLcs = 'INSERT INTO tmpChanSegSeries(ChannelID, SeriesID) VALUES (?, ?);'
                try:
                    scidb.curD.execute(stSQLcs, (iChannelID, iSeriesID))
                except sqlite3.IntegrityError:
                    pass  # silently ignore duplicates
            # make a list of channel IDs for the set of lines with this
            # HrOffset, for quick lookup; it is indexed by the columns list,
            # and is zero-based
            lCh = []
            for iCol in range(len(lCols)):
                iNomCol = iCol + 1
                if iNomCol in dictChannels:
                    lChanSet = dictChannels[iNomCol][:]
                    lCh.append(lChanSet[0])
                else:  # does not correspond to a data column
                    lCh.append(0)  # placeholder, to make list indexes work right
            # done setting up channels, get data lines
            stSQL = "SELECT ID, Line FROM tmpLines " \
                    "WHERE ID > ? AND ID < ? " \
                    "AND substr(Line, 21, 3) = ? " \
                    "AND Line LIKE '____-__-__ __:__:__ ___%' ORDER BY ID;"
            recs = scidb.curT.execute(stSQL,
                (idSegStart, idSegEnd, hrOffset['TZ'])).fetchall()
            iNumSegLines = len(recs)
            iCtSegLines = 0
            for rec in recs:
                iCtSegLines += 1
                lData = rec['Line'].split('\t')
                # item zero is the timestamp followed by the timezone offset
                sTimeStamp = lData[0][:-4]  # drop timezone offset, we already have it
                tsAsTime = datetime.datetime.strptime(sTimeStamp, "%Y-%m-%d %H:%M:%S")
                # assign the result; replace() returns a new datetime rather
                # than modifying in place
                tsAsTime = tsAsTime.replace(tzinfo=None)  # make sure it carries no local timezone info
                tsAsTime = tsAsTime + datetime.timedelta(hours=-iTimeZoneOffset)  # convert to UT
                stSQL = "INSERT INTO Data (UTTimestamp, ChannelID, Value) VALUES (?, ?, ?)"
                for iCol in range(len(lData)):
                    if iCol > 0:  # an item of data
                        # give some progress diagnostics
                        dataRecItemCt += 1
                        if dataRecItemCt % 100 == 0:
                            self.msgArea.ChangeValue("Segment " + str(iCtSeg) +
                                " of " + str(iNumSegs) + ", HrOffset " +
                                str(iTimeZoneOffset) + ", Line " + str(iCtSegLines) +
                                " of " + str(iNumSegLines) + "; " +
                                str(dataRecsAdded) + " records added, " +
                                str(dataRecsDupSkipped) + " duplicates skipped.")
                            wx.Yield()
                        try:  # much faster to try and fail than to test first
                            scidb.curD.execute(stSQL, (tsAsTime, lCh[iCol], lData[iCol]))
                            dataRecsAdded += 1  # count it
                        except sqlite3.IntegrityError:  # item is already in Data table
                            dataRecsDupSkipped += 1  # count but otherwise ignore
                        finally:
                            wx.Yield()
    # <<<<< being worked on
    # finished parsing lines
    infoDict['numNewDataRecsAdded'] = dataRecsAdded
    infoDict['numDupDataRecsSkipped'] = dataRecsDupSkipped
    infoDict['stParseMsg'] = str(infoDict['lineCt']) + " lines processed; " + \
        str(dataRecsAdded) + " data records added to database, " + \
        str(dataRecsDupSkipped) + " duplicates skipped."
    self.msgArea.ChangeValue(infoDict['stParseMsg'])
def parseHoboWareTextFile(self, infoDict):
    """ parse a data file exported as text by HoboWare """
    sStrip = '" \x0a\x0d'  # characters to strip from parsed items
    # regular expression patterns to find the logger number, the sensor
    # number, and the hour offset
    pLogger = re.compile(r"LGR S/N: (?P<nLogger>\d+)")
    pSensor = re.compile(r"SEN S/N: (?P<nSensor>\d+)")
    pHrOffset = re.compile(r"Time, GMT(?P<sHrOffset>.+)")
    dataRecItemCt = 0
    dataRecsAdded = 0
    dataRecsDupSkipped = 0
    print "About to put lines into temp table"
    self.putTextLinesIntoTmpTable(infoDict)
    print "Finished putting lines into temp table"
    # parse the file, starting with the header line, which is the 1st line in
    # this file format
    scidb.curT.execute("SELECT * FROM tmpLines ORDER BY ID;")
    for rec in scidb.curT:
        if rec["ID"] == 1:
            """
            Build a dictionary of the channels. The keys will be the column
            numbers, because all data files have at least that. Each value
            will be an 8-item list.
            The first item in each list is the primary key in the Channels
            table. The list is first created with this = 0. The rest of the
            list is built up, then the list is sent to a function that looks
            in the database. The function fills in the primary key, either
            existing or newly created. List item 7 will be "new" if the
            record is new, otherwise "existing".
            The list contains the text values of logger serial number, sensor
            serial number, data type, and data units. The function takes care
            of filling in all of these in their respective tables.
            When the dictionary is complete, the calling procedure can quickly
            insert data values into the Data table by pulling list item [0]
            for the dictionary key, which is the column number in the source
            file.
            This loose structure allows some kludgy workarounds for bugs that
            were in some versions of the data files.
            """
            lHdrs = rec["Line"].split("\t")
            # ignore item zero: just a pound sign and possibly three junk
            # characters; item 1 is the hour offset, and the clue to export
            # bugs we need to work around
            sHd = lHdrs[1].strip(sStrip)
            m = pHrOffset.search(sHd)
            if m:
                sTimeOffset = m.group("sHrOffset")
                lTimeOffsetComponents = sTimeOffset.split(":")
                sTimeOffsetHrs = lTimeOffsetComponents[0]
                iHrOffset = int(sTimeOffsetHrs)
            else:
                iHrOffset = 0
            dictChannels = {}
            # list items are: ChannelID, originalCol, Logger, Sensor, dataType,
            # dataUnits, hrOffset, new
            lChannel = [0, 0, "", "", "", "", iHrOffset, ""]
            for iCol in range(len(lHdrs)):  # skip items 0 & 1
                if iCol > 1:  # a header for a data column
                    lChannel[1] = iCol + 1  # stored columns are 1-based
                    sHd = lHdrs[iCol].strip(sStrip)
                    # get the type and units
                    lTypeUnits = sHd.split("(", 2)
                    sTypeUnits = lTypeUnits[0].strip(" ")
                    lTypeUnits = sTypeUnits.split(",")
                    # if there are no units, this is a non-data column like "Bad Battery"
                    if len(lTypeUnits) == 1:
                        lTypeUnits.append("non-data")
                    if lTypeUnits[0]:
                        sType = lTypeUnits[0].strip(" ")
                    else:
                        sType = "(na)"
                    lChannel[4] = sType
                    if lTypeUnits[1]:
                        sUnits = lTypeUnits[1].strip(" ")
                    else:
                        sUnits = "(na)"
                    lChannel[5] = sUnits
                    # get the logger ID and sensor ID
                    m = pLogger.search(sHd)
                    if m:
                        sLoggerID = m.group("nLogger")
                    else:
                        sLoggerID = "(na)"
                    lChannel[2] = sLoggerID
                    m = pSensor.search(sHd)
                    if m:
                        sSensorID = m.group("nSensor")
                    else:
                        sSensorID = "(na)"
                    lChannel[3] = sSensorID
                    dictChannels[iCol + 1] = lChannel[:]
            # gone through all the headers; apply bug workarounds here
            print "Before Channel function"
            for ky in dictChannels.keys():
                print ky, dictChannels[ky][:]
            for ky in dictChannels.keys():
                scidb.assureChannelIsInDB(dictChannels[ky])
            print "After Channel function"
            for ky in dictChannels.keys():
                print ky, dictChannels[ky][:]
            # make a list of channel IDs for the rest of this file, for quick
            # lookup; it is indexed by the columns list, and is zero-based
            lCh = []
            for iCol in range(len(lHdrs)):
                iNomCol = iCol + 1
                if iNomCol in dictChannels:
                    lChanSet = dictChannels[iNomCol][:]
                    lCh.append(lChanSet[0])
                else:  # does not correspond to a data column
                    lCh.append(0)  # placeholder, to make list indexes work right
        else:  # not the 1st (header) line, but a line of data
            lData = rec["Line"].split("\t")
            # ignore item zero, a line number, which is not used
            sTimeStamp = lData[1]
            try:
                tsAsTime = datetime.datetime.strptime(sTimeStamp, "%Y-%m-%d %H:%M:%S")
            except ValueError:
                # Time format is nonstandard. Parsing with wx datetime was
                # tried as a fallback but does not work, perhaps a wx/Python
                # conversion problem; a string e.g. "05/16/10 12:00:00 PM"
                # gives dates in 2106:
                # dt = wx.DateTime()  # uninitialized datetime
                # DateTimeValid = dt.ParseDateTime(sTimeStamp)
                # if DateTimeValid != -1:  # valid datetime
                #     tsAsTime = datetime.datetime.fromtimestamp(dt.GetTicks())
                # else:
                print "unresolvable timestamp:", sTimeStamp
                self.msgArea.ChangeValue("unresolvable timestamp: " + sTimeStamp)
                return
            # assign the result; replace() returns a new datetime rather than
            # modifying in place
            tsAsTime = tsAsTime.replace(tzinfo=None)  # make sure it carries no local timezone info
            tsAsTime = tsAsTime + datetime.timedelta(hours=-iHrOffset)  # convert to UT
            stSQL = "INSERT INTO Data (UTTimestamp, ChannelID, Value) VALUES (?, ?, ?)"
            for iCol in range(len(lData)):
                if iCol > 1:  # an item of data
                    # give some progress diagnostics
                    dataRecItemCt += 1
                    if dataRecItemCt % 100 == 0:
                        self.msgArea.ChangeValue("Line " + str(rec["ID"]) +
                            " of " + str(infoDict["lineCt"]) + "; " +
                            str(dataRecsAdded) + " records added, " +
                            str(dataRecsDupSkipped) + " duplicates skipped.")
                        wx.Yield()
                    try:  # much faster to try and fail than to test first
                        # stripping thousands-separator commas from values is a
                        # hack, but was needed to get this working
                        scidb.curD.execute(stSQL,
                            (tsAsTime, lCh[iCol], lData[iCol].replace(",", "")))
                        dataRecsAdded += 1  # count it
                    except sqlite3.IntegrityError:
                        # distinguish a duplicate record from an invalid Value;
                        # a non-numeric Value gives 'constraint failed' and is
                        # silently ignored
                        err_type, err_value, err_traceback = sys.exc_info()
                        if "not unique" in repr(err_value):  # only count these
                            dataRecsDupSkipped += 1  # count but otherwise ignore
                    finally:
                        wx.Yield()
    # finished parsing lines
    infoDict["numNewDataRecsAdded"] = dataRecsAdded
    infoDict["numDupDataRecsSkipped"] = dataRecsDupSkipped
    infoDict["stParseMsg"] = (str(infoDict["lineCt"]) + " lines processed; " +
        str(dataRecsAdded) + " data records added to database, " +
        str(dataRecsDupSkipped) + " duplicates skipped.")
    self.msgArea.ChangeValue(infoDict["stParseMsg"])