class PostgresInsertOutput(PostgresDbOutput):
    """
    Output by inserting a single record in a Postgres database table.

    Input is a Stetl record (Python dict structure) or a list of records.
    Creates an INSERT for Postgres to insert each single record.
    When the "replace" parameter is True, any existing record keyed by "key"
    is attempted to be UPDATEd first.

    NB a constraint is that the first and each subsequent record needs to
    contain all values, as the INSERT and UPDATE query templates are built
    once from the columns in the first record.

    consumes=[FORMAT.record_array, FORMAT.record]
    """

    # Start attribute config meta
    @Config(ptype=str, required=False, default='public')
    def table(self):
        """
        Table for inserts.
        """
        pass

    @Config(ptype=bool, required=False, default=False)
    def replace(self):
        """
        Replace record if exists?
        """
        pass

    @Config(ptype=str, required=False, default=None)
    def key(self):
        """
        The key column name of the table, required when replacing records.
        """
        pass

    # End attribute config meta

    def __init__(self, configdict, section, consumes=FORMAT.record):
        # NOTE: the ``consumes`` argument is not forwarded; both record
        # formats are always accepted.
        DbOutput.__init__(self, configdict, section, consumes=[FORMAT.record_array, FORMAT.record])
        self.query = None
        self.update_query = None
        self.db = None

    def init(self):
        """Connect to the database once, before any record is written."""
        log.info('Init: connect to DB')
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()

    def exit(self):
        """Disconnect from the database when done."""
        log.info('Exit: disconnect from DB')
        self.db.disconnect()

    def create_query(self, record):
        """
        Build the INSERT query template from the keys of ``record``.

        We assume that all records use the same INSERT keys, e.g.
        INSERT INTO lml_files ("file_name", "file_data") VALUES (%s,%s)

        :param record: dict whose keys are the column names
        :return: query string with one ``%s`` placeholder per column
        """
        columns = ['%s' % k for k in record]
        query = "INSERT INTO %s (%s) VALUES (%s)" % (
            self.cfg.get('table'),
            ",".join(columns),
            ",".join(["%s"] * len(columns)))
        log.info('query is %s', query)
        return query

    def create_update_query(self, record):
        """
        Build the UPDATE query template from the keys of ``record``.

        We assume that all records use the same UPDATE keys. The last
        placeholder is for the value of the ``key`` column in the WHERE clause.

        :param record: dict whose keys are the column names
        :return: query string with one ``%s`` per column plus one for the key
        """
        columns = ['%s ' % k for k in record]
        query = "UPDATE %s SET (%s) = (%s) WHERE %s = %s" % (
            self.cfg.get('table'),
            ",".join(columns),
            ",".join(["%s"] * len(columns)),
            self.key,
            "%s")
        log.info('update query is %s', query)
        return query

    def insert(self, record):
        """
        Write one record: UPDATE first when replacing, INSERT otherwise.

        :param record: dict with the column values, same keys as the first record
        """
        res = 0
        # list() is needed: on Python 3 dict.values() is a view that has no
        # append() and is not an indexable sequence.
        values = list(record.values())

        if self.replace and self.key and self.key in record:
            # Replace option: try UPDATE if existing; the key value is the
            # extra parameter for the WHERE clause.
            res = self.db.execute(self.update_query, values + [record[self.key]])

        if res < 1:
            # Do insert with values from the record dict
            # only if the UPDATE did not report an affected row.
            self.db.execute(self.query, values)

        self.db.commit(close=False)

    def write(self, packet):
        """
        Insert the record or list of records carried by ``packet``.

        :param packet: packet whose ``data`` is a dict or a list of dicts
        :return: the same packet
        """
        # Deal with empty or zero-length data structures (list or dict)
        if packet.data is None or len(packet.data) == 0:
            return packet

        # ASSERT: record data present
        # record is Python dict (single record) or list of Python dict (multiple records)
        record = packet.data
        is_list = isinstance(record, list)

        # The query templates are derived from the first record only
        first_record = record[0] if is_list else record

        # Create INSERT and optional UPDATE query-templates once
        if self.query is None:
            self.query = self.create_query(first_record)

        if self.replace and self.key and not self.update_query:
            self.update_query = self.create_update_query(first_record)

        if isinstance(record, dict):
            # Single record
            self.insert(record)
        elif is_list:
            # Multiple records in list
            for rec in record:
                self.insert(rec)
            log.info('committed %d records' % len(record))

        return packet
class Josene(Device):
    """
    Device implementation registered under the name 'jose'.

    init() loads calibration models and their state from the database into
    SENSOR_DEFS, exit() stores the calibration state back. The remaining
    methods extract and validate raw sensor values.
    """

    def __init__(self):
        Device.__init__(self, 'jose')
        # NOTE(review): these templates are filled in with Python string
        # formatting, not with driver-side parameters; the inserted values must
        # therefore never contain single quotes.
        self.model_query = "SELECT id,parameters,model from calibration_models WHERE predicts = '%s' AND invalid = FALSE ORDER BY timestamp DESC LIMIT 1"
        self.state_query = "SELECT state from calibration_state WHERE process = '%s' AND model_id = %d ORDER BY timestamp DESC LIMIT 1"
        self.state_insert = "INSERT INTO calibration_state (process, model_id, state) VALUES ('%s', %d, '%s')"
        self.sensor_model_names = {
            'co': 'carbon_monoxide__air_',
            'no2': 'nitrogen_dioxide__air_',
            'o3': 'ozone__air_'
        }
        self.config_dict = None
        # Set in init()
        self.process_name = None
        self.db = None

    def init(self, config_dict):
        """
        Connect to the database and load calibration models and state.

        :param config_dict: configuration dict; must contain 'process_name'
                            and is also passed to PostGIS for the connection
        """
        self.config_dict = config_dict
        self.process_name = config_dict['process_name']
        self.db = PostGIS(config_dict)
        self.db.connect()

        ids = dict()
        parameters = dict()
        models = dict()
        state = dict()

        # Query ANN Calibration Model and its State from DB for each calibrated sensor.
        if self.model_query is not None and len(self.sensor_model_names) > 0:
            log.info('Getting calibration models and state from database')
            for gas_name, model_name in self.sensor_model_names.items():
                model_id, param, model = self.query_model(model_name)
                ids[gas_name] = model_id
                parameters[gas_name] = param
                models[gas_name] = model
                state[gas_name] = self.query_state(model_id)
        else:
            log.info('No query for fetching calibration models given or no '
                     'mapping for calibration models to gas components given.')

        # Put Model and State info in the Device definitions.
        for k in ids:
            SENSOR_DEFS[k]['converter_model']['model_id'] = ids[k]

        for k in parameters:
            SENSOR_DEFS[k]['converter_model']['running_mean_weights'] = parameters[k]

        for k in models:
            SENSOR_DEFS[k]['converter_model']['mlp_regressor'] = models[k]

        for k, device_states in state.items():
            for device_id, device_state in device_states.items():
                # Copy the items: values are replaced while looping
                for gas, gas_state in list(device_state.items()):
                    device_state[gas] = RunningMean.from_dict(gas_state)
            SENSOR_DEFS[k]['converter_model']['state'] = device_states

    def exit(self):
        """Save the calibration state of every sensor model and commit."""
        for k in self.sensor_model_names:
            model = SENSOR_DEFS[k]['converter_model']
            self.save_state(model['model_id'], json.dumps(model['state']))

        self.db.commit(close=False)

    def get_sensor_defs(self):
        """Return the SENSOR_DEFS definitions."""
        return SENSOR_DEFS

    def raw_query(self, query_str):
        """
        Execute ``query_str`` and return all fetched rows.

        :param query_str: complete SQL string
        :return: result of ``cursor.fetchall()``
        """
        self.db.execute(query_str)

        db_records = self.db.cursor.fetchall()
        log.info('read recs: %d' % len(db_records))

        return db_records

    def query_model(self, name):
        """
        Fetch the calibration model that predicts ``name``.

        :param name: value for the ``predicts`` column
        :return: tuple (id, parameters, unpickled model), or (None, {}, {})
                 when no row is found
        """
        query = self.model_query % name
        log.info('Getting calibration model with query: %s' % query)
        ret = self.raw_query(query)
        if len(ret) > 0:
            model_id, parameters, model = ret[0]
            # NOTE(review): pickle.loads executes arbitrary code on load; the
            # calibration_models table must only hold trusted content.
            return model_id, parameters, pickle.loads(model)

        log.warn("No model found for %s" % name)
        return None, {}, {}

    def query_state(self, model_id):
        """
        Fetch the stored state for this process and ``model_id``.

        :param model_id: id as returned by query_model(); may be None
        :return: the state column of the first row, or {} when there is none
        """
        if model_id is None:
            # query_model() found no model: the '%d' in the query template
            # cannot be formatted with None.
            log.warn("No state queried: no model id given")
            return {}

        query = self.state_query % (self.process_name, model_id)
        log.info('Getting calibration model state with query: %s' % query)
        ret = self.raw_query(query)
        if len(ret) > 0:
            return ret[0][0]

        log.warn("No state found for model_id=%d" % model_id)
        return {}

    def save_state(self, model_id, state):
        """
        Insert a calibration state row for this process and ``model_id``.

        :param model_id: id of the calibration model; nothing is saved when None
        :param state: state serialized as string
        """
        if model_id is None:
            # Without a model there is no row to attach the state to, and the
            # '%d' placeholders below would raise a TypeError.
            log.warn('Cannot save state for process %s: no model id' % self.process_name)
            return

        insert_query = self.state_insert % (self.process_name, model_id, state)
        log.info('Inserting calibration model state for process %s model_id=%d' %
                 (self.process_name, model_id))

        ret = self.db.execute(insert_query)
        if ret != 1:
            log.warn('Cannot save state for process %s model_id=%d' %
                     (self.process_name, model_id))

    @staticmethod
    def _unpack_audio_bands(val):
        """Split an integer into its three lowest bytes, lowest byte first, as floats."""
        return [float(val & 255), float((val >> 8) & 255), float((val >> 16) & 255)]

    # Get raw sensor value or list of values
    def get_raw_value(self, name, val_dict):
        """
        Look up the raw value for ``name`` in ``val_dict``.

        :param name: field name, or a list of names of which only the first is used
        :param val_dict: dict with raw values
        :return: tuple (value or None, name actually used)
        """
        if isinstance(name, list):
            return self.get_raw_value(name[0], val_dict)

        val = None
        if name in val_dict:
            val = val_dict[name]

        # We may have audio encoded in 3 bands: use the first band.
        # Skip when the value is absent, bit operations on None would fail.
        if val is not None and 'audio' in name:
            val = self._unpack_audio_bands(val)[0]

        return val, name

    # Check for valid sensor value
    def check_value(self, name, val_dict, value=None):
        """
        Validate a value against the min/max in SENSOR_DEFS.

        :param name: field name or list of field names (all must be valid)
        :param val_dict: dict with raw values
        :param value: when given, checked instead of ``val_dict[name]``
        :return: tuple (bool valid, str reason)
        """
        if isinstance(name, list):
            # name is list of names: first failure wins
            for n in name:
                result, reason = self.check_value(n, val_dict, value)
                if result is False:
                    return result, reason
            return True, '%s OK' % name

        # name is single name
        if name not in val_dict and value is None:
            return False, '%s not present' % name

        val = value if value is not None else val_dict[name]
        if val is None:
            return False, '%s is None' % name

        if name not in SENSOR_DEFS:
            return False, '%s not in SENSOR_DEFS' % name

        name_def = SENSOR_DEFS[name]

        # Audio inputs: need to unpack 3 bands and check for decibel vals
        if 'audio' in name:
            bands = self._unpack_audio_bands(val)
            db_min = name_def['min']
            db_max = name_def['max']

            # accumulate outliers
            err_cnt = 0
            msg = ''
            for band_val in bands:
                if band_val < db_min:
                    err_cnt += 1
                    msg += '%s: val(%s) < min(%s)\n' % (name, str(band_val), str(db_min))
                elif band_val > db_max:
                    err_cnt += 1
                    msg += '%s: val(%s) > max(%s)\n' % (name, str(band_val), str(db_max))

            # Only invalid if all bands outside range
            if err_cnt >= len(bands):
                return False, msg

            return True, '%s OK' % name

        if 'min' in name_def and val < name_def['min']:
            return False, '%s: val(%s) < min(%s)' % (name, str(val), str(name_def['min']))

        if 'max' in name_def and val > name_def['max']:
            return False, '%s: val(%s) > max(%s)' % (name, str(val), str(name_def['max']))

        return True, '%s OK' % name

    # Get location as lon, lat
    def get_lon_lat(self, val_dict):
        """
        Convert and validate the location in ``val_dict``.

        :param val_dict: dict expected to hold 's_longitude' and 's_latitude'
        :return: tuple (lon, lat), or (None, None) when absent or invalid
        """
        if 's_longitude' not in val_dict or 's_latitude' not in val_dict:
            return None, None

        lon = SENSOR_DEFS['longitude']['converter'](val_dict['s_longitude'])
        lat = SENSOR_DEFS['latitude']['converter'](val_dict['s_latitude'])

        valid, reason = self.check_value('latitude', val_dict, value=lat)
        if not valid:
            return None, None

        valid, reason = self.check_value('longitude', val_dict, value=lon)
        if not valid:
            return None, None

        return lon, lat
def write(self, packet):
    """
    Insert each feature of the packet's GML document into table gml_objects.

    Every child of the elements named ``self.feature_member_tag`` is stored
    as a blob; when it contains a Point, Polygon, Curve, Surface or
    MultiSurface element, the first one found is stored as geometry too.
    When an insert fails, the connection is committed and reopened, and
    processing continues with the remaining features.

    :param packet: packet whose ``data`` supports ``xpath()`` (an lxml document)
    :return: the same packet
    """
    if packet.data is None:
        return packet

    gml_doc = packet.data
    log.info('inserting features in DB')
    db = PostGIS(self.cfg.get_dict())
    db.connect()

    featureMembers = gml_doc.xpath("//*[local-name() = '%s']/*" % self.feature_member_tag)
    geometry_xpath = (
        ".//*[local-name() = 'Point']"
        "|.//*[local-name() = 'Polygon']"
        "|.//*[local-name() = 'Curve']"
        "|.//*[local-name() = 'Surface']"
        "|.//*[local-name() = 'MultiSurface']")

    count = 0
    gml_ns = None
    for childNode in featureMembers:
        if gml_ns is None:
            # Namespace prefix may be lower or upper case.
            # ('in' instead of dict.has_key(), which Python 3 removed.)
            nsmap = childNode.nsmap
            if 'gml' in nsmap:
                gml_ns = nsmap['gml']
            elif 'GML' in nsmap:
                gml_ns = nsmap['GML']

        gml_id = childNode.get('{%s}id' % gml_ns)
        feature_type_id = self.feature_type_ids[childNode.tag]

        # Use the first GML geometry found, if any
        gmlMembers = childNode.xpath(geometry_xpath)
        geom_str = etree.tostring(gmlMembers[0]) if gmlMembers else None

        blob = etree.tostring(childNode, pretty_print=False, xml_declaration=False, encoding='UTF-8')

        if geom_str is None:
            sql = "INSERT INTO gml_objects(gml_id, ft_type, binary_object) VALUES (%s, %s, %s)"
            parameters = (gml_id, feature_type_id, db.make_bytea(blob))
        else:
            sql = "INSERT INTO gml_objects(gml_id, ft_type, binary_object, gml_bounded_by) VALUES (%s, %s, %s, ST_SetSRID( ST_GeomFromGML(%s),%s) )"
            parameters = (gml_id, feature_type_id, db.make_bytea(blob), geom_str, self.srid)

        if db.execute(sql, parameters) == -1:
            log.error("feat num# = %d error inserting feature blob=%s (but continuing)" % (count, blob))

            # will fail but we will close connection also
            db.commit()

            # proceed with a fresh connection; the count restarts
            log.info('retrying to proceed with remaining features...')
            db = PostGIS(self.cfg.get_dict())
            db.connect()
            count = 0

        count += 1

    exception = db.commit()
    if exception is not None:
        log.error("error in commit")

    log.info("inserted %s features" % count)
    return packet
class PostgresInsertOutput(PostgresDbOutput):
    """
    Output by inserting a single record in a Postgres database table.

    Input is a Stetl record (Python dict structure) or a list of records.
    Creates an INSERT for Postgres to insert each single record.
    When the "replace" parameter is True, any existing record keyed by "key"
    is attempted to be UPDATEd first.

    NB a constraint is that the first and each subsequent record needs to
    contain all values, as the INSERT and UPDATE query templates are built
    once from the columns in the first record.

    consumes=[FORMAT.record_array, FORMAT.record]
    """

    # Start attribute config meta
    @Config(ptype=str, required=False, default='public')
    def table(self):
        """
        Table for inserts.
        """
        pass

    @Config(ptype=bool, required=False, default=False)
    def replace(self):
        """
        Replace record if exists?
        """
        pass

    @Config(ptype=str, required=False, default=None)
    def key(self):
        """
        The key column name of the table, required when replacing records.
        """
        pass

    # End attribute config meta

    def __init__(self, configdict, section, consumes=FORMAT.record):
        # NOTE: the ``consumes`` argument is not forwarded; both record
        # formats are always accepted.
        DbOutput.__init__(self, configdict, section, consumes=[FORMAT.record_array, FORMAT.record])
        self.query = None
        self.update_query = None
        self.db = None

    def init(self):
        """Connect to the database once, before any record is written."""
        log.info('Init: connect to DB')
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()

    def exit(self):
        """Disconnect from the database when done."""
        log.info('Exit: disconnect from DB')
        self.db.disconnect()

    def create_query(self, record):
        """
        Build the INSERT query template from the keys of ``record``.

        We assume that all records use the same INSERT keys, e.g.
        INSERT INTO lml_files ("file_name", "file_data") VALUES (%s,%s)

        :param record: dict whose keys are the column names
        :return: query string with one ``%s`` placeholder per column
        """
        columns = ['%s' % k for k in record]
        query = "INSERT INTO %s (%s) VALUES (%s)" % (
            self.cfg.get('table'),
            ",".join(columns),
            ",".join(["%s"] * len(columns)))
        log.info('query is %s', query)
        return query

    def create_update_query(self, record):
        """
        Build the UPDATE query template from the keys of ``record``.

        We assume that all records use the same UPDATE keys. The last
        placeholder is for the value of the ``key`` column in the WHERE clause.

        :param record: dict whose keys are the column names
        :return: query string with one ``%s`` per column plus one for the key
        """
        columns = ['%s ' % k for k in record]
        query = "UPDATE %s SET (%s) = (%s) WHERE %s = %s" % (
            self.cfg.get('table'),
            ",".join(columns),
            ",".join(["%s"] * len(columns)),
            self.key,
            "%s")
        log.info('update query is %s', query)
        return query

    def insert(self, record):
        """
        Write one record: UPDATE first when replacing, INSERT otherwise.

        :param record: dict with the column values, same keys as the first record
        """
        res = 0
        # list() is needed: on Python 3 dict.values() is a view that has no
        # append() and is not an indexable sequence.
        values = list(record.values())

        if self.replace and self.key and self.key in record:
            # Replace option: try UPDATE if existing; the key value is the
            # extra parameter for the WHERE clause.
            res = self.db.execute(self.update_query, values + [record[self.key]])

        if res < 1:
            # Do insert with values from the record dict
            # only if the UPDATE did not report an affected row.
            self.db.execute(self.query, values)

        self.db.commit(close=False)

    def write(self, packet):
        """
        Insert the record or list of records carried by ``packet``.

        :param packet: packet whose ``data`` is a dict or a list of dicts
        :return: the same packet
        """
        # Deal with empty or zero-length data structures (list or dict)
        if packet.data is None or len(packet.data) == 0:
            return packet

        # ASSERT: record data present
        # record is Python dict (single record) or list of Python dict (multiple records)
        record = packet.data
        is_list = isinstance(record, list)

        # The query templates are derived from the first record only
        first_record = record[0] if is_list else record

        # Create INSERT and optional UPDATE query-templates once
        if self.query is None:
            self.query = self.create_query(first_record)

        if self.replace and self.key and not self.update_query:
            self.update_query = self.create_update_query(first_record)

        if isinstance(record, dict):
            # Single record
            self.insert(record)
        elif is_list:
            # Multiple records in list
            for rec in record:
                self.insert(rec)
            log.info('committed %d records' % len(record))

        return packet
class DeegreeBlobstoreInput(Input):
    """
    Read features from deegree Blobstore DB into an etree doc.

    produces=FORMAT.etree_doc
    """

    # Start attribute config meta
    @Config(ptype=int, required=False, default=10000)
    def max_features_per_doc(self):
        """
        Max features to read from input feature GML stream per internal document.
        """
        pass

    @Config(ptype=str, required=True, default=None)
    def start_container(self):
        """
        Tag that starts container.
        """
        pass

    @Config(ptype=str, required=True, default=None)
    def end_container(self):
        """
        Tag that ends container.
        """
        pass

    @Config(ptype=str, required=False, default=False)
    def start_feature_tag(self):
        """
        XML tag that starts Feature.
        """
        pass

    @Config(ptype=str, required=False, default=None)
    def end_feature_tag(self):
        """
        XML tag that ends Feature.
        """
        pass

    # End attribute config meta

    def __init__(self, configdict, section):
        Input.__init__(self, configdict, section, produces=FORMAT.etree_doc)
        self.cur_feature_blob = None
        self.rowcount = 0

        # Matches local xlink:href references ("#id", '#id' or bare #id).
        # Case-insensitivity is passed as a flag: an inline (?i) that is not at
        # the very start of the pattern is rejected by recent Python versions.
        self.regex_xlink_href = re.compile(
            "\\s*xlink:href\\s*=\\s*(\"#([^\"]*\")|'#[^']*'|(#[^'\">\\s]+))",
            re.IGNORECASE)

        self.db = None
        self.xlink_db = None
        self.cur = None
        self.buffer = None
        self.feature_count = 0
        # gml ids of xlinked features already written to the current buffer
        self.xlink_ids = set()

        # Reusable XML parser
        self.xml_parser = etree.XMLParser(remove_blank_text=True)

    def init(self):
        pass

    def read(self, packet):
        """
        Fill ``packet.data`` with the next document of features.

        On first call the query from config option 'sql' is executed. Each
        call then reads records until ``max_features_per_doc`` features are
        buffered or the records are exhausted.

        :param packet: packet to fill
        :return: the same packet, marked end-of-doc/end-of-stream when all
                 records have been read
        """
        if packet.is_end_of_stream():
            return packet

        if self.db is None:
            # First time read
            log.info("reading records from blobstore..")
            self.db = PostGIS(self.cfg.get_dict())
            self.db.connect()
            sql = self.cfg.get('sql')
            self.rowcount = self.db.execute(sql)
            self.cur = self.db.cursor
            log.info("Read records rowcount=%d" % self.rowcount)

            # Init separate connection to fetch objects referenced by xlink:href
            self.xlink_db = PostGIS(self.cfg.get_dict())
            self.xlink_db.connect()

        # Query active
        while self.cur is not None:
            if self.buffer is None:
                self.buffer = self.init_buf()
                self.buffer.write(self.start_container)

            # Get next blob record
            record = self.cur.fetchone()

            if record is None:
                # End of all records: start closing
                self.buffer.write(self.end_container)
                self.cur = None
                self.db.commit()

                # Only create doc if there are features in the buffer
                if self.feature_count > 0:
                    self.buffer_to_doc(packet)
                packet.set_end_of_doc()
                break

            # New record: embed feature blob in feature tags and write to buffer
            feature_blob = self.write_feature(record)

            # If we have local xlinks: fetch the related features as well and
            # output them within the same document (local href resolvable)
            # TODO: in some cases we may need to be recursive (xlinks in xlinked features...)
            self._write_xlinked_features(feature_blob)

            # Should we output a doc
            if self.feature_count >= self.max_features_per_doc:
                self.buffer.write(self.end_container)
                self.buffer_to_doc(packet)
                break

        if self.cur is None:
            # All records handled: close off
            packet.set_end_of_stream()

        return packet

    def _write_xlinked_features(self, feature_blob):
        """Write features referenced by local xlinks in ``feature_blob`` that are not yet in the buffer."""
        new_ids = []
        for xlink in self.regex_xlink_href.finditer(feature_blob):
            # Remove the surrounding quotes (either kind) and the '#'
            gml_id = xlink.group(1).strip('"\'').strip('#')

            # We don't want multiple occurrences of the same xlinked feature
            if gml_id in self.xlink_ids:
                continue

            self.xlink_ids.add(gml_id)
            new_ids.append(gml_id)

        if not new_ids:
            return

        # Single query for all xlinks; ids come from document content, so they
        # are passed as parameters, never formatted into the SQL text.
        placeholders = ",".join(["%s"] * len(new_ids))
        xlink_sql = "SELECT binary_object from gml_objects where gml_id IN (%s)" % placeholders
        self.xlink_db.execute(xlink_sql, new_ids)

        while True:
            xlink_record = self.xlink_db.cursor.fetchone()
            if xlink_record is None:
                break
            self.write_feature(xlink_record)

    def write_feature(self, record):
        """
        Write the blob in the first column of ``record`` to the buffer,
        enclosed in the feature start/end tags.

        :return: the blob as string
        """
        feature_blob = str(record[0])

        # Write start-tag, blob element, end-tag
        self.buffer.write(self.start_feature_tag)
        self.buffer.write(feature_blob)
        self.buffer.write(self.end_feature_tag)
        self.feature_count += 1
        return feature_blob

    def init_buf(self):
        """Create a new UTF-8 writing buffer and reset the per-document counters."""
        buffer = StringIO()
        buffer = codecs.getwriter("utf8")(buffer)
        self.feature_count = 0
        self.xlink_ids = set()
        return buffer

    def buffer_to_doc(self, packet):
        """
        Parse the buffer into ``packet.data`` and discard the buffer.

        A parse error on an empty buffer is only logged; any other parse
        error is logged and re-raised.
        """
        self.buffer.seek(0)

        try:
            packet.data = etree.parse(self.buffer, self.xml_parser)
        except Exception as e:
            bufStr = self.buffer.getvalue()
            if not bufStr:
                log.info("parse buffer empty: content=[%s]" % bufStr)
            else:
                log.error("error in buffer parsing %s" % str(e))
                raise

        self.buffer.close()
        self.buffer = None
class DeegreeBlobstoreInput(Input):
    """
    Read features from deegree Blobstore DB into an etree doc.

    produces=FORMAT.etree_doc
    """

    # Start attribute config meta
    @Config(ptype=int, required=False, default=10000)
    def max_features_per_doc(self):
        """
        Max features to read from input feature GML stream per internal document.
        """
        pass

    @Config(ptype=str, required=True, default=None)
    def start_container(self):
        """
        Tag that starts container.
        """
        pass

    @Config(ptype=str, required=True, default=None)
    def end_container(self):
        """
        Tag that ends container.
        """
        pass

    @Config(ptype=str, required=False, default=False)
    def start_feature_tag(self):
        """
        XML tag that starts Feature.
        """
        pass

    @Config(ptype=str, required=False, default=None)
    def end_feature_tag(self):
        """
        XML tag that ends Feature.
        """
        pass

    # End attribute config meta

    def __init__(self, configdict, section):
        Input.__init__(self, configdict, section, produces=FORMAT.etree_doc)
        self.cur_feature_blob = None
        self.rowcount = 0

        # Matches local xlink:href references ("#id", '#id' or bare #id).
        # Case-insensitivity is passed as a flag: an inline (?i) that is not at
        # the very start of the pattern is rejected by recent Python versions.
        self.regex_xlink_href = re.compile(
            "\\s*xlink:href\\s*=\\s*(\"#([^\"]*\")|'#[^']*'|(#[^'\">\\s]+))",
            re.IGNORECASE)

        self.db = None
        self.xlink_db = None
        self.cur = None
        self.buffer = None
        self.feature_count = 0
        # gml ids of xlinked features already written to the current buffer
        self.xlink_ids = set()

        # Reusable XML parser
        self.xml_parser = etree.XMLParser(remove_blank_text=True)

    def init(self):
        pass

    def read(self, packet):
        """
        Fill ``packet.data`` with the next document of features.

        On first call the query from config option 'sql' is executed. Each
        call then reads records until ``max_features_per_doc`` features are
        buffered or the records are exhausted.

        :param packet: packet to fill
        :return: the same packet, marked end-of-doc/end-of-stream when all
                 records have been read
        """
        if packet.is_end_of_stream():
            return packet

        if self.db is None:
            # First time read
            log.info("reading records from blobstore..")
            self.db = PostGIS(self.cfg.get_dict())
            self.db.connect()
            sql = self.cfg.get('sql')
            self.rowcount = self.db.execute(sql)
            self.cur = self.db.cursor
            log.info("Read records rowcount=%d" % self.rowcount)

            # Init separate connection to fetch objects referenced by xlink:href
            self.xlink_db = PostGIS(self.cfg.get_dict())
            self.xlink_db.connect()

        # Query active
        while self.cur is not None:
            if self.buffer is None:
                self.buffer = self.init_buf()
                self.buffer.write(self.start_container)

            # Get next blob record
            record = self.cur.fetchone()

            if record is None:
                # End of all records: start closing
                self.buffer.write(self.end_container)
                self.cur = None
                self.db.commit()

                # Only create doc if there are features in the buffer
                if self.feature_count > 0:
                    self.buffer_to_doc(packet)
                packet.set_end_of_doc()
                break

            # New record: embed feature blob in feature tags and write to buffer
            feature_blob = self.write_feature(record)

            # If we have local xlinks: fetch the related features as well and
            # output them within the same document (local href resolvable)
            # TODO: in some cases we may need to be recursive (xlinks in xlinked features...)
            self._write_xlinked_features(feature_blob)

            # Should we output a doc
            if self.feature_count >= self.max_features_per_doc:
                self.buffer.write(self.end_container)
                self.buffer_to_doc(packet)
                break

        if self.cur is None:
            # All records handled: close off
            packet.set_end_of_stream()

        return packet

    def _write_xlinked_features(self, feature_blob):
        """Write features referenced by local xlinks in ``feature_blob`` that are not yet in the buffer."""
        new_ids = []
        for xlink in self.regex_xlink_href.finditer(feature_blob):
            # Remove the surrounding quotes (either kind) and the '#'
            gml_id = xlink.group(1).strip('"\'').strip('#')

            # We don't want multiple occurrences of the same xlinked feature
            if gml_id in self.xlink_ids:
                continue

            self.xlink_ids.add(gml_id)
            new_ids.append(gml_id)

        if not new_ids:
            return

        # Single query for all xlinks; ids come from document content, so they
        # are passed as parameters, never formatted into the SQL text.
        placeholders = ",".join(["%s"] * len(new_ids))
        xlink_sql = "SELECT binary_object from gml_objects where gml_id IN (%s)" % placeholders
        self.xlink_db.execute(xlink_sql, new_ids)

        while True:
            xlink_record = self.xlink_db.cursor.fetchone()
            if xlink_record is None:
                break
            self.write_feature(xlink_record)

    def write_feature(self, record):
        """
        Write the blob in the first column of ``record`` to the buffer,
        enclosed in the feature start/end tags.

        :return: the blob as string
        """
        feature_blob = str(record[0])

        # Write start-tag, blob element, end-tag
        self.buffer.write(self.start_feature_tag)
        self.buffer.write(feature_blob)
        self.buffer.write(self.end_feature_tag)
        self.feature_count += 1
        return feature_blob

    def init_buf(self):
        """Create a new UTF-8 writing buffer and reset the per-document counters."""
        buffer = StringIO()
        buffer = codecs.getwriter("utf8")(buffer)
        self.feature_count = 0
        self.xlink_ids = set()
        return buffer

    def buffer_to_doc(self, packet):
        """
        Parse the buffer into ``packet.data`` and discard the buffer.

        A parse error on an empty buffer is only logged; any other parse
        error is logged and re-raised.
        """
        self.buffer.seek(0)

        try:
            packet.data = etree.parse(self.buffer, self.xml_parser)
        except Exception as e:
            bufStr = self.buffer.getvalue()
            if not bufStr:
                log.info("parse buffer empty: content=[%s]" % bufStr)
            else:
                log.error("error in buffer parsing %s" % str(e))
                raise

        self.buffer.close()
        self.buffer = None
def write(self, packet):
    """
    Insert each feature of the packet's GML document into table gml_objects.

    Every child of the elements named ``self.feature_member_tag`` is stored
    as a blob; when it contains a Point, Polygon, Curve, Surface or
    MultiSurface element, the first one found is stored as geometry too.
    When an insert fails, the connection is committed and reopened, and
    processing continues with the remaining features.

    :param packet: packet whose ``data`` supports ``xpath()`` (an lxml document)
    :return: the same packet
    """
    if packet.data is None:
        return packet

    gml_doc = packet.data
    log.info('inserting features in DB')
    db = PostGIS(self.cfg.get_dict())
    db.connect()

    featureMembers = gml_doc.xpath("//*[local-name() = '%s']/*" % self.feature_member_tag)
    geometry_xpath = (
        ".//*[local-name() = 'Point']"
        "|.//*[local-name() = 'Polygon']"
        "|.//*[local-name() = 'Curve']"
        "|.//*[local-name() = 'Surface']"
        "|.//*[local-name() = 'MultiSurface']")

    count = 0
    gml_ns = None
    for childNode in featureMembers:
        if gml_ns is None:
            # Namespace prefix may be lower or upper case.
            # ('in' instead of dict.has_key(), which Python 3 removed.)
            nsmap = childNode.nsmap
            if 'gml' in nsmap:
                gml_ns = nsmap['gml']
            elif 'GML' in nsmap:
                gml_ns = nsmap['GML']

        gml_id = childNode.get('{%s}id' % gml_ns)
        feature_type_id = self.feature_type_ids[childNode.tag]

        # Use the first GML geometry found, if any
        gmlMembers = childNode.xpath(geometry_xpath)
        geom_str = etree.tostring(gmlMembers[0]) if gmlMembers else None

        blob = etree.tostring(childNode, pretty_print=False, xml_declaration=False, encoding='UTF-8')

        if geom_str is None:
            sql = "INSERT INTO gml_objects(gml_id, ft_type, binary_object) VALUES (%s, %s, %s)"
            parameters = (gml_id, feature_type_id, db.make_bytea(blob))
        else:
            sql = "INSERT INTO gml_objects(gml_id, ft_type, binary_object, gml_bounded_by) VALUES (%s, %s, %s, ST_SetSRID( ST_GeomFromGML(%s),%s) )"
            parameters = (gml_id, feature_type_id, db.make_bytea(blob), geom_str, self.srid)

        if db.execute(sql, parameters) == -1:
            log.error("feat num# = %d error inserting feature blob=%s (but continuing)" % (count, blob))

            # will fail but we will close connection also
            db.commit()

            # proceed with a fresh connection; the count restarts
            log.info('retrying to proceed with remaining features...')
            db = PostGIS(self.cfg.get_dict())
            db.connect()
            count = 0

        count += 1

    exception = db.commit()
    if exception is not None:
        log.error("error in commit")

    log.info("inserted %s features" % count)
    return packet
class WeewxDbInput(SqliteDbInput):
    """
    Input that reads weewx raw archive records from SQLite.

    The id of the last processed record is read from, and written back to,
    a progress table via a separate PostGIS connection.
    """

    def __init__(self, configdict, section):
        SqliteDbInput.__init__(self, configdict, section)
        cfg = self.cfg
        self.progress_query = cfg.get('progress_query')
        self.progress_update = cfg.get('progress_update')

        # One connection to the progress DB for the lifetime of the component
        log.info('Init: connect to Postgres DB')
        progress_db = PostGIS(cfg.get_dict())
        self.progress_db = progress_db
        progress_db.connect()

    def exit(self):
        """Close the progress DB connection."""
        log.info('Exit: disconnect from DB')
        self.progress_db.disconnect()

    def after_chain_invoke(self, packet):
        """
        Called right after entire Component Chain invoke.
        Stores the last processed id, with its local time string, in the
        progress table.
        """
        last_id = self.last_id
        ts_local = time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime(last_id))

        log.info('Updating progress table ts_unix=%d ts_local=%s' % (last_id, ts_local))
        update_sql = self.progress_update % (last_id, ts_local)
        self.progress_db.execute(update_sql)
        self.progress_db.commit(close=False)
        log.info('Update progress table ok')

        return True

    def read(self, packet):
        """
        Read the batch of archive records that follows the last processed id.

        The packet is always marked end-of-stream: one batch per run.
        """
        # Last processed id comes from the 4th column of the progress record
        progress_db = self.progress_db
        progress_db.execute(self.progress_query)
        progress_rec = progress_db.cursor.fetchone()
        self.last_id = progress_rec[3]
        log.info('progress record: %s' % str(progress_rec))

        # Next batch of archive records
        archive_recs = self.do_query(self.query % self.last_id)
        num_recs = len(archive_recs)
        log.info('read archive_recs: %d' % num_recs)

        if num_recs > 0:
            # Remember last id processed for next query
            self.last_id = archive_recs[num_recs - 1].get('dateTime')
            packet.data = archive_recs
        else:
            log.info('Nothing to do. All file_records done')

        # Always stop after batch, otherwise we would continue forever
        packet.set_end_of_stream()
        return packet
class ProgressTracker(Filter):
    """
    Filter to track progress of a stream of processed records.
    Stores progress (last id, last timestamp etc) in Postgres table.
    """

    @Config(ptype=str, required=False, default='localhost')
    def host(self):
        """
        host name or host IP-address, defaults to 'localhost'
        """
        pass

    @Config(ptype=str, required=False, default='5432')
    def port(self):
        """
        port for host, defaults to '5432'
        """
        pass

    @Config(ptype=str, required=False, default='postgres')
    def user(self):
        """
        User name, defaults to 'postgres'
        """
        pass

    @Config(ptype=str, required=False, default='postgres')
    def password(self):
        """
        User password, defaults to 'postgres'
        """
        pass

    @Config(ptype=str, required=False, default='public')
    def schema(self):
        """
        The postgres schema name, defaults to 'public'
        """
        pass

    @Config(ptype=str, required=False, default='progress')
    def table(self):
        """
        Table name, defaults to 'progress'.
        """
        pass

    @Config(ptype=str, required=True)
    def progress_update_query(self):
        """
        Query to update progress; formatted with the tuple (last id, name).

        Required: True
        """
        pass

    @Config(ptype=str, required=True)
    def id_key(self):
        """
        Key to select id from record array

        Required: True
        """
        pass

    @Config(ptype=str, default=None, required=False)
    def name_key(self):
        """
        Key to select name from record array; when not set, all records are
        tracked under the name "all".

        Required: False
        """
        pass

    def __init__(self, config_dict, section):
        Filter.__init__(self, config_dict, section,
                        consumes=[FORMAT.record_array, FORMAT.record],
                        produces=[FORMAT.record_array, FORMAT.record])
        # Maps name -> highest id seen in the last invoke()
        self.last_ids = None
        self.db = None

    def init(self):
        """Connect to the progress database."""
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()

    def invoke(self, packet):
        """
        Record the highest id per name found in the packet's records.

        :param packet: packet whose ``data`` is a record or a list of records
        :return: the same, unmodified packet
        """
        self.last_ids = dict()

        if packet.data is None or packet.is_end_of_doc() or packet.is_end_of_stream():
            log.info("No packet data or end of doc/stream")
            return packet

        record_in = packet.data
        if not isinstance(record_in, list):
            record_in = [record_in]

        for record in record_in:
            # Skip empty records before any key lookup, an empty record has
            # neither the name key nor the id key.
            if len(record) == 0:
                continue

            name = record[self.name_key] if self.name_key is not None else "all"
            new = record[self.id_key]
            self.last_ids[name] = max(self.last_ids.get(name, -1), new)

        log.info("Last ids are: %s", str(self.last_ids))

        return packet

    def after_chain_invoke(self, packet):
        """
        Called right after entire Component Chain invoke.
        Used to update last id of processed file record.
        """
        # Also covers the case that invoke() never ran (last_ids is None)
        if not self.last_ids:
            log.info('No update for progress table')
            return True

        for name in self.last_ids:
            param_tuple = (self.last_ids[name], name)
            log.info('Updating progress table with (id=%d, name=%s)' % param_tuple)
            self.db.execute(self.progress_update_query % param_tuple)
            self.db.commit(close=False)
            log.info('Update progress table ok')

        return True
class PostgresInsertOutput(PostgresDbOutput):
    """
    Output by inserting single record into Postgres database.
    Input is a record (Python dict structure) or a Python list of dicts (records).
    Creates an INSERT for Postgres to insert each single record.

    NB the INSERT query template is built once from the keys of the first
    record; each subsequent record needs to have the same keys.

    consumes=FORMAT.record
    """

    def __init__(self, configdict, section, consumes=FORMAT.record):
        # NOTE: the ``consumes`` argument is not forwarded; both record
        # formats are always accepted.
        DbOutput.__init__(self, configdict, section, consumes=[FORMAT.record_array, FORMAT.record])
        self.query = None
        self.db = None
        self.key = self.cfg.get('key')

    def init(self):
        """Connect to the database once, before any record is written."""
        log.info('Init: connect to DB')
        self.db = PostGIS(self.cfg.get_dict())
        self.db.connect()

    def exit(self):
        """Disconnect from the database when done."""
        log.info('Exit: disconnect from DB')
        self.db.disconnect()

    def create_query(self, record):
        """
        Build the INSERT query template from the keys of ``record``.

        We assume that all records use the same INSERT keys, e.g.
        INSERT INTO lml_files ("file_name", "file_data") VALUES (%s,%s)

        :param record: dict whose keys are the column names
        :return: query string with one ``%s`` placeholder per column
        """
        columns = ['%s' % k for k in record]
        query = "INSERT INTO %s (%s) VALUES (%s)" % (
            self.cfg.get('table'),
            ",".join(columns),
            ",".join(["%s"] * len(columns)))
        log.info('query is %s', query)
        return query

    def write(self, packet):
        """
        Insert the record or list of records carried by ``packet``; each
        record is committed separately.

        :param packet: packet whose ``data`` is a dict or a list of dicts
        :return: the same packet
        """
        # Deal with empty or zero-length data structures (list or dict)
        if packet.data is None or len(packet.data) == 0:
            return packet

        # ASSERT: record data present
        # record is Python dict (single record) or list of Python dict (multiple records)
        record = packet.data
        is_list = isinstance(record, list)

        # Create query once, from the first record
        if self.query is None:
            self.query = self.create_query(record[0] if is_list else record)

        # list(): on Python 3 dict.values() is a view, not an indexable
        # sequence of query parameters.
        if isinstance(record, dict):
            self.db.execute(self.query, list(record.values()))
            self.db.commit(close=False)
        elif is_list:
            for rec in record:
                self.db.execute(self.query, list(rec.values()))
                self.db.commit(close=False)
            log.info('committed %d records' % len(record))

        return packet