def init(self):
    self.ogr = ogr
    # http://trac.osgeo.org/gdal/wiki/PythonGotchas
    self.gdal = gdal
    self.gdal.UseExceptions()
    log.info("Using GDAL/OGR version: %d" % int(gdal.VersionInfo('VERSION_NUM')))

    # GDAL error handler function
    # http://pcjericks.github.io/py-gdalogr-cookbook/gdal_general.html
    def gdal_error_handler(err_class, err_num, err_msg):
        err_type = {
            gdal.CE_None: 'None',
            gdal.CE_Debug: 'Debug',
            gdal.CE_Warning: 'Warning',
            gdal.CE_Failure: 'Failure',
            gdal.CE_Fatal: 'Fatal'
        }
        err_msg = err_msg.replace('\n', ' ')
        err_class = err_type.get(err_class, 'None')
        log.error('Error Number: %s, Type: %s, Msg: %s' % (err_num, err_class, err_msg))

    # install error handler
    self.gdal.PushErrorHandler(gdal_error_handler)

    # Raise a dummy error for testing
    # self.gdal.Error(1, 2, 'test error')

    if self.source_options:
        for k in self.source_options:
            self.gdal.SetConfigOption(k, self.source_options[k])

    # Open OGR data source in read-only mode.
    if self.source_format:
        self.data_source_p = ogr.GetDriverByName(self.source_format).Open(self.data_source, 0)
    else:
        self.data_source_p = self.ogr.Open(self.data_source, 0)

    # Report failure if failed
    if self.data_source_p is None:
        log.error("Cannot open OGR datasource: %s with the following drivers." % Util.safe_string_value(self.data_source))
        for iDriver in range(self.ogr.GetDriverCount()):
            log.info("  ->  " + self.ogr.GetDriver(iDriver).GetName())
        raise Exception()
    else:
        # Open ok: initialize
        self.layer = None
        if self.sql:
            self.layer_count = 1
            self.layer_idx = -1
        else:
            self.layer_count = self.data_source_p.GetLayerCount()
            self.layer_idx = 0
        log.info("Opened OGR source ok: %s layer count=%d" % (Util.safe_string_value(self.data_source), self.layer_count))
def execute_cmd(self, cmd):
    env_vars = Util.string_to_dict(self.env_args, self.env_separator)
    old_environ = os.environ.copy()
    try:
        os.environ.update(env_vars)
        log.info("executing cmd=%s" % Util.safe_string_value(cmd))
        subprocess.call(cmd, shell=True)
        log.info("execute done")
    finally:
        os.environ = old_environ
def read(self, packet):
    if not self.data_source_p:
        log.info("End reading from: %s" % Util.safe_string_value(self.data_source))
        return packet

    if self.layer is None:
        if self.sql and self.layer_idx == -1:
            # PostgreSQL: Layer is gotten via Query
            # http://trac.osgeo.org/postgis/wiki/UsersWikiOGR
            self.layer = self.data_source_p.ExecuteSQL(self.sql)
            self.layer_idx = 0
        elif self.layer_idx < self.layer_count:
            self.layer = self.data_source_p.GetLayer(self.layer_idx)
            self.layer_idx += 1
            if self.layer is None:
                log.error("Could not fetch layer %d" % (self.layer_idx - 1))
                raise Exception()

            log.info("Start reading from OGR Source: %s, Layer: %s" % (Util.safe_string_value(self.data_source), self.layer.GetName()))
        else:
            # No more Layers left: cleanup
            packet.set_end_of_stream()
            log.info("Closing OGR source: %s" % Util.safe_string_value(self.data_source))
            # Destroy not required anymore: http://trac.osgeo.org/gdal/wiki/PythonGotchas
            # self.data_source_p.Destroy()
            self.data_source_p = None
            return packet

    # Return all features from Layer (ogr_feature_array) or next feature (ogr_feature)
    if self.output_format == FORMAT.ogr_feature_array:
        # Assemble all features
        features = list()
        for feature in self.layer:
            features.append(feature)
        packet.data = features
        log.info("End reading all features from Layer: %s count=%d" % (self.layer.GetName(), len(features)))
        packet.set_end_of_doc()
        self.layer = None
    else:
        # Next feature
        feature = self.layer.GetNextFeature()
        if feature:
            packet.data = feature
        else:
            log.info("End reading from Layer: %s" % self.layer.GetName())
            packet.set_end_of_doc()
            self.layer = None

    return packet
def write_end(self, packet):
    # Destroy not required anymore: http://trac.osgeo.org/gdal/wiki/PythonGotchas
    # self.dest_fd.Destroy()
    log.info("End writing to: %s" % Util.safe_string_value(self.dest_data_source))
    self.dest_fd = None
    self.layer = None
    return packet
def write(self, packet):
    # Are we all done?
    if packet.data is None or self.dest_fd is None:
        self.write_end(packet)
        return packet

    if self.layer is None:
        log.info("No Layer, end writing to: %s" % Util.safe_string_value(self.dest_data_source))
        return packet

    # Assume ogr_feature_array input, otherwise convert ogr_feature to list
    if type(packet.data) is list:
        # Write feature collection to OGR Layer output
        for feature in packet.data:
            self.write_feature(feature)
        self.write_end(packet)
    else:
        # Write single feature to OGR Layer output
        if packet.end_of_stream or packet.end_of_doc:
            self.write_end(packet)
            return packet
        self.write_feature(packet.data)

    return packet
def test_make_file_list_depth_search(self):
    # Util.make_file_list
    import sys

    file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data/depth_search_test')
    filename_pattern = 'dummy.gml'

    # Test with depth_search enabled
    depth_search = True
    file_list = Util.make_file_list(file_path, None, filename_pattern, depth_search)
    self.assertEqual(len(file_list), 2)

    # Test with depth_search disabled
    depth_search = False
    file_list = Util.make_file_list(file_path, None, filename_pattern, depth_search)
    self.assertEqual(len(file_list), 1)
def __init__(self, configdict, section):
    StringFilter.__init__(self, configdict, section, consumes=FORMAT.string, produces=FORMAT.string)
    # Convert string to dict: http://stackoverflow.com/a/1248990
    self.format_args_dict = Util.string_to_dict(self.format_args, self.separator)
def __init__(self, configdict, section, produces):
    Input.__init__(self, configdict, section, produces)

    # Create the list of files to be used as input
    self.file_list = Util.make_file_list(self.file_path, None, self.filename_pattern, self.depth_search)
    log.info("file_list=%s" % str(self.file_list))
    if not len(self.file_list):
        raise Exception('File list is empty!!')

    self.file_list_done = []
def __init__(self, configdict, section):
    FileInput.__init__(self, configdict, section, produces=FORMAT.string)
    self.file = None

    # Optional formatting of content according to Python String.format()
    # Input file should have substitutable values like {schema} {foo}
    # format_args should be of the form format_args = schema:test foo:bar
    if self.format_args:
        # Convert string to dict: http://stackoverflow.com/a/1248990
        self.format_args = Util.string_to_dict(self.format_args, ':')
def __init__(self, configdict, section):
    StringFilter.__init__(self, configdict, section, consumes=FORMAT.string, produces=FORMAT.string)

    # Formatting of content according to Python String.format()
    # String should have substitutable values like {schema} {foo}
    # format_args should be of the form format_args = schema:test foo:bar ...
    self.format_args = self.cfg.get('format_args')

    # Convert string to dict: http://stackoverflow.com/a/1248990
    self.format_args_dict = Util.string_to_dict(self.format_args, ':')
def exec_cmd(self):
    log.info("start ogr2ogr cmd = %s" % Util.safe_string_value(repr(self.cmd)))
    self.ogr_process = subprocess.Popen(self.cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    err_line = self.readline_err()
    if err_line:
        log.warning('ogr2ogr: %s ' % err_line)
def __init__(self, configdict, section, produces):
    Input.__init__(self, configdict, section, produces)

    # Create the list of files to be used as input
    self.file_list = Util.make_file_list(self.file_path, None, self.filename_pattern, self.depth_search)
    log.info("file_list=%s" % str(self.file_list))
    if not len(self.file_list):
        raise Exception('File list is empty!!')

    self.cur_file_path = None
    self.file_list_done = []
def execute_cmd(self, cmd):
    env_vars = Util.string_to_dict(self.env_args, self.env_separator)
    old_environ = os.environ.copy()
    try:
        os.environ.update(env_vars)
        log.info("executing cmd=%s" % cmd)
        result = subprocess.check_output(cmd, shell=True)
        log.info("execute done")
        return result
    finally:
        os.environ = old_environ
def parse_args(args_list):
    log.info("Stetl version = %s" % __version__)

    argparser = argparse.ArgumentParser(description='Invoke Stetl')
    argparser.add_argument('-c ', '--config', type=str, help='ETL config file in .ini format',
                           dest='config_file', required=False)
    argparser.add_argument('-s ', '--section', type=str, help='Section in the config file to execute, default is [etl]',
                           dest='config_section', required=False)
    argparser.add_argument('-a ', '--args', type=str,
                           help='Arguments or .properties files to be substituted for symbolic {argN}s in Stetl config file, '
                                'as -a "arg1=foo arg2=bar" and/or -a args.properties, multiple -a options are possible',
                           dest='config_args', required=False, action='append')
    argparser.add_argument('-d ', '--doc', type=str,
                           help='Get component documentation like its configuration parameters, e.g. stetl doc stetl.inputs.fileinput.FileInput',
                           dest='doc_args', required=False)
    argparser.add_argument('-v', '--version', action='store_true',
                           help='Show current version of stetl and exit', required=False)

    args = argparser.parse_args(args_list)

    if args.config_args:
        args_total = dict()
        for arg in args.config_args:
            if os.path.isfile(arg):
                log.info('Found args file at: %s' % arg)
                args_total = Util.merge_two_dicts(args_total, Util.propsfile_to_dict(arg))
            else:
                # Convert string to dict: http://stackoverflow.com/a/1248990
                args_total = Util.merge_two_dicts(args_total, Util.string_to_dict(arg))
        args.config_args = args_total

    return args
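

# --- Usage sketch (illustrative, not part of the Stetl source) ---
# parse_args merges inline "-a key=value" pairs (and optional .properties files)
# into a single dict on args.config_args. The config file name below is hypothetical.
if __name__ == '__main__':
    example = parse_args(['-c', 'etl.cfg', '-a', 'arg1=foo arg2=bar'])
    # example.config_args now holds e.g. {'arg1': 'foo', 'arg2': 'bar'}
    log.info('parsed config_args=%s' % example.config_args)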
def etree_elem2struct(packet, strip_space=True, strip_ns=True, sub=False, attr_prefix='', gml2ogr=True, ogr2json=True):
    """
    :param packet:
    :param strip_space:
    :param strip_ns:
    :param sub:
    :param attr_prefix:
    :param gml2ogr:
    :param ogr2json:
    :return:
    """
    packet.data = Util.elem_to_dict(packet.data, strip_space, strip_ns, sub, attr_prefix, gml2ogr, ogr2json)
    return packet
def process_xml(self, packet):
    while self.context is not None:
        # while not packet.is_end_of_doc():
        try:
            event, elem = self.context.next()
        except (etree.XMLSyntaxError, StopIteration):
            # workaround for etree.XMLSyntaxError https://bugs.launchpad.net/lxml/+bug/1185701
            self.context = None

        if self.context is None:
            # Always end of doc
            # TODO: is this still useful for a non-input component?
            packet.set_end_of_doc()
            log.info("End of doc: %s elem_count=%d" % (self.cur_file_path, self.elem_count))
            return packet

        # Filter out Namespace from the tag
        # this is the easiest way to go for now
        tag = elem.tag.split('}')
        if len(tag) == 2:
            # Namespaced tag: 2nd is tag
            tag = tag[1]
        else:
            # Non-namespaced tag: first
            tag = tag[0]

        if tag in self.element_tags:
            if event == "start":
                # TODO check if deepcopy is the right thing to do here.
                # packet.data = elem
                pass
                # self.root.remove(elem)
            elif event == "end":
                # Delete the element from the tree
                # self.root.clear()
                packet.data = elem
                self.elem_count += 1
                self.root.remove(elem)

                if self.strip_namespaces:
                    packet.data = Util.stripNamespaces(elem).getroot()

        # If there is a next component, let it process
        if self.next:
            # Hand-over data (line, doc whatever) to the next component
            packet.format = self._output_format
            packet = self.next.process(packet)

    return packet
def __init__(self, configdict, section, produces):
    Input.__init__(self, configdict, section, produces)

    # path to file or files: can be a dir or files or even multiple, comma separated
    self.file_path = self.cfg.get('file_path')

    # The filename pattern according to Python glob.glob
    self.filename_pattern = self.cfg.get('filename_pattern', '*.[gxGX][mM][lL]')

    # Recurse into directories ?
    self.depth_search = self.cfg.get_bool('depth_search', False)

    # Create the list of files to be used as input
    self.file_list = Util.make_file_list(self.file_path, None, self.filename_pattern, self.depth_search)
    log.info("file_list=%s" % str(self.file_list))
def process_xml(self, packet):
    while self.context is not None:
        # while not packet.is_end_of_doc():
        try:
            event, elem = next(self.context)
        except (etree.XMLSyntaxError, StopIteration):
            # workaround for etree.XMLSyntaxError https://bugs.launchpad.net/lxml/+bug/1185701
            self.context = None

        if self.context is None:
            # Always end of doc
            # TODO: is this still useful for a non-input component?
            packet.set_end_of_doc()
            log.info("End of doc: %s elem_count=%d" % (self.cur_file_path, self.elem_count))
            return packet

        # Filter out Namespace from the tag
        # this is the easiest way to go for now
        tag = elem.tag.split('}')
        if len(tag) == 2:
            # Namespaced tag: 2nd is tag
            tag = tag[1]
        else:
            # Non-namespaced tag: first
            tag = tag[0]

        if tag in self.element_tags:
            if event == "start":
                pass
            elif event == "end":
                packet.data = deepcopy(elem)
                self.elem_count += 1

                if self.strip_namespaces:
                    packet.data = Util.stripNamespaces(elem).getroot()

                # Clear the element which has been read. Don't clear the root document,
                # since the last element hasn't been processed yet.
                elem.clear()

        # If there is a next component, let it process
        if self.next:
            # Hand-over data (line, doc whatever) to the next component
            packet.format = self._output_format
            packet = self.next.process(packet)

    return packet
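

# --- Context sketch (assumption, not taken from the Stetl source) ---
# The self.context consumed above is typically an lxml incremental parser that
# yields (event, element) pairs, so large documents never need a full DOM.
def _example_make_context(file_path='features.gml'):
    # File name is hypothetical; 'start'/'end' are the events process_xml expects.
    return etree.iterparse(file_path, events=('start', 'end'))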
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# POST data via WFS Transactional protocol (WFS-T).
#
# Author: Just van den Broecke
#
from stetl.component import Config
from stetl.output import Output
from stetl.util import Util
from stetl.packet import FORMAT

import httplib

log = Util.get_log('wfsoutput')


class WFSTOutput(Output):
    """
    Insert features via WFS-T (WFS Transaction) OGC protocol from an etree doc.

    consumes=FORMAT.etree_doc
    """

    # Start attribute config meta

    @Config(ptype=str, required=True, default=None)
    def wfs_host(self):
        """
        Hostname-part of URL e.g. geodata.ngr.nl.
        """
        pass
# -*- coding: utf-8 -*-
#
# Writes the payload of a packet as a string to a file.
# Based on outputs.fileoutput.FileOutput.
#
# Author: Frank Steggink
#
from stetl.component import Config
from stetl.filter import Filter
from stetl.util import Util
from stetl.packet import FORMAT

import os

log = Util.get_log('packetwriter')


class PacketWriter(Filter):
    """
    Writes the payload of a packet as a string to a file.

    consumes=FORMAT.any, produces=FORMAT.string
    """

    # Start attribute config meta

    @Config(ptype=str, default=None, required=True)
    def file_path(self):
        """
        File path to write content to.
        """
        pass
#!/usr/bin/env python
#
# Extracts arrays of etree GML features from a GML etree document.
#
# Author: Just van den Broecke
#
from stetl.util import Util
from stetl.filter import Filter
from stetl.packet import FORMAT

log = Util.get_log('gmlfeatureextractor')


class GmlFeatureExtractor(Filter):
    """
    Extract arrays of GML feature etree elements from etree docs.

    consumes=FORMAT.etree_doc, produces=FORMAT.etree_feature_array
    """

    # XPath query base for extracting features by (non-namespaced, thus local-name) tag name
    xpath_base = "//*[local-name() = '%s']"

    # Constructor
    def __init__(self, configdict, section='gml_feature_extractor'):
        Filter.__init__(self, configdict, section, consumes=FORMAT.etree_doc, produces=FORMAT.etree_feature_array)
        log.info("cfg = %s" % self.cfg.to_string())
        # Build the XPath expression from the configured tag names
        self.feature_tags = self.cfg.get('feature_tags').split(',')
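

# --- Illustrative helper (assumption, not part of the Stetl source) ---
# xpath_base is filled in per configured tag name and evaluated against an lxml
# doc; local-name() makes the match independent of namespace prefixes.
def _example_feature_xpath(tag_name='featureMember'):
    # e.g. doc.xpath(_example_feature_xpath()) returns all elements whose local
    # name is 'featureMember', whatever namespace they are in.
    return GmlFeatureExtractor.xpath_base % tag_name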
# -*- coding: utf-8 -*-
#
# MeasurementsDbInput: Reads SmartEm raw AQ/LML file data from the measurements table and converts it to a recordlist.
#
# Author: Just van den Broecke

from stetl.util import Util, etree
from stetl.inputs.dbinput import PostgresDbInput
from stetl.packet import FORMAT
from stetl.postgis import PostGIS

from datetime import datetime

log = Util.get_log("MeasurementsDbInput")


class MeasurementsDbInput(PostgresDbInput):
    """
    Reads SmartEm raw AQ/LML file data from the measurements table and converts it to a recordlist.
    """

    def __init__(self, configdict, section):
        PostgresDbInput.__init__(self, configdict, section)
        self.progress_query = self.cfg.get('progress_query')
        self.progress_update = self.cfg.get('progress_update')
        self.db = None

    def after_chain_invoke(self, packet):
        """
        Called right after the entire Component Chain invoke.
        Used to update the last id of the processed file record.
        """
#!/usr/bin/env python
#
# Splits a stream of GML lines into etree docs.
#
# Author: Just van den Broecke
#
import codecs

from deprecated.sphinx import deprecated

from stetl.util import Util, etree, StringIO
from stetl.filter import Filter
from stetl.packet import FORMAT

log = Util.get_log('gmlsplitter')


@deprecated(version='1.0.4', reason='Use the more robust XmlElementStreamerFileInput + XmlAssembler instead!!!')
class GmlSplitter(Filter):
    """
    Split a stream of text XML lines into documents. TODO: phase out.

    consumes=FORMAT.xml_line_stream, produces=FORMAT.etree_doc
    """

    def __init__(self, configdict, section='gml_splitter'):
        Filter.__init__(self, configdict, section,
# Output classes for ETL with SensorThings API.
#
# Author: Just van den Broecke
#
from os import path

import requests
import json
import base64

from stetl.util import Util
from stetl.packet import FORMAT
from stetl.component import Config
from stetl.outputs.httpoutput import HttpOutput

log = Util.get_log('staoutput')


class STAOutput(HttpOutput):
    """
    Output via SensorThings API (STA) over plain HTTP using the HttpOutput base class.
    See examples: http://www.sensorup.com/docs/?python

    consumes=FORMAT.record_array
    """

    @Config(ptype=str, default='application/json;charset=UTF-8', required=False)
    def content_type(self):
        """
# -*- coding: utf-8 -*-
#
# Output classes for ETL, databases.
#
# Author: Just van den Broecke
#
from stetl.output import Output
from stetl.util import Util
from stetl.packet import FORMAT
from stetl.component import Config
from stetl.postgis import PostGIS

log = Util.get_log('dboutput')


class DbOutput(Output):
    """
    Output to any database (abstract base class).
    """

    def __init__(self, configdict, section, consumes):
        Output.__init__(self, configdict, section, consumes)

    def write(self, packet):
        return packet


class PostgresDbOutput(DbOutput):
    """
    Output to PostgreSQL database.

    Input is an SQL string.
#!/usr/bin/env python
#
# Converts Stetl Packet FORMATs. This can be used to connect
# Stetl components with different output/input formats.
#
# Author: Just van den Broecke

import json

from stetl.component import Config
from stetl.util import Util, etree
from stetl.filter import Filter
from stetl.packet import FORMAT

log = Util.get_log("formatconverter")


class FormatConverter(Filter):
    """
    Converts (almost) any packet format (if a converter is available).

    consumes=FORMAT.any, produces=FORMAT.any, but the actual formats are changed at
    initialization based on the input and output formats to be converted, via the
    input_format and output_format config parameters.
    """

    # Start attribute config meta
    # Applying Decorator pattern with the Config class to provide
    # read-only config values from the configured properties.

    @Config(ptype=dict, default=None, required=False)
    def converter_args(self):
# -*- coding: utf-8 -*-
#
# Reads an XML file and returns XML elements.
# Based on inputs.fileinput.XmlElementStreamerFileInput.
#
# Author: Frank Steggink
#
from copy import deepcopy

from stetl.component import Config
from stetl.filter import Filter
from stetl.util import Util, etree
from stetl.packet import FORMAT

log = Util.get_log('xmlelementreader')


class XmlElementReader(Filter):
    """
    Extracts XML elements from a file, outputs each feature element in a Packet.
    Parsing is streaming (no internal DOM buildup) so any file size can be handled.
    Use this class for your big GML files!

    consumes=FORMAT.string, produces=FORMAT.etree_element
    """

    # Start attribute config meta

    @Config(ptype=list, default=None, required=True)
    def element_tags(self):
        """
        Comma-separated string of XML (feature) element tag names of the elements that should be extracted
# Packet buffering.
#
# Author: Just van den Broecke

import copy

from stetl.util import Util
from stetl.filter import Filter
from stetl.packet import FORMAT

log = Util.get_log("packetbuffer")


class PacketBuffer(Filter):
    """
    Buffers all incoming Packets, main use is unit-testing to inspect Packets after ETL is done.
    """

    # Constructor
    def __init__(self, configdict, section):
        Filter.__init__(self, configdict, section, consumes=FORMAT.any, produces=FORMAT.any)
        self.packet_list = []

    def invoke(self, packet):
        # Buffer Packet and pass-through, we need a deep copy as Packets may be cleared/reused
        self.packet_list.append(copy.copy(packet))
        return packet
# -*- coding: utf-8 -*-
#
# Input classes for ETL, Files.
#
# Author: Just van den Broecke
#
from stetl.input import Input
from stetl.util import Util, etree
from stetl.packet import FORMAT

log = Util.get_log('fileinput')


class FileInput(Input):
    """
    Abstract base class for specific FileInputs.
    """

    def __init__(self, configdict, section, produces):
        Input.__init__(self, configdict, section, produces)

        # path to file or files: can be a dir or files or even multiple, comma separated
        self.file_path = self.cfg.get('file_path')

        # The filename pattern according to Python glob.glob
        self.filename_pattern = self.cfg.get('filename_pattern', '*.[gxGX][mM][lL]')

        # Recurse into directories ?
        self.depth_search = self.cfg.get_bool('depth_search', False)

        # Create the list of files to be used as input
        self.file_list = Util.make_file_list(self.file_path, None, self.filename_pattern, self.depth_search)
#
# Author: Pieter Marsman - 2016

import sys
import traceback

from stetl.component import Config
from stetl.filter import Filter
from stetl.inputs.dbinput import PostgresDbInput
from stetl.packet import FORMAT
from stetl.util import Util

from dateutil import parser

from sensordefs import *

log = Util.get_log("Extractor")


class ExtractFilter(Filter):
    """
    Filter to consume a single raw record with sensor (single hour) timeseries values
    and extract these for each component. Input is a single timeseries record for a
    single hour with all sensor values for a single device within that hour.
    """

    @Config(ptype=list, default=[], required=True)
    def sensor_names(self):
        """
        The output sensor names to extract.

        Required: True

        Default: []
# Transformation of any input using Python Templating as
# meant in: https://wiki.python.org/moin/Templating.
# A TemplatingFilter is typically configured with a template file.
# The input is typically the Template context: the variables to be substituted.
# The output is a string passed to the next Filter or Output.
#
# Author: Just van den Broecke

from stetl.util import Util, ogr, osr
from stetl.component import Config
from stetl.filter import Filter
from stetl.packet import FORMAT

from string import Template
import os

log = Util.get_log("templatingfilter")


class TemplatingFilter(Filter):
    """
    Abstract base class for specific template-based filters.
    See https://wiki.python.org/moin/Templating
    Subclasses implement a specific template language like Python string.Template, Mako, Genshi, Jinja2.

    consumes=FORMAT.any, produces=FORMAT.string
    """

    # Start attribute config meta
    # Applying Decorator pattern with the Config class to provide
    # read-only config values from the configured properties.
#
# Filter that deals with subfeatures in BGT GML files.
#
# Author: Frank Steggink

import os
from copy import deepcopy

# We need specifically lxml, because of the incremental XML generation
from lxml import etree

from stetl.component import Config
from stetl.filter import Filter
from stetl.packet import FORMAT
from stetl.util import Util

log = Util.get_log("subfeaturehandler")


class SubFeatureHandler(Filter):
    """
    This filter checks whether the data file contains the given parent features.
    If this is the case, the parent feature and subfeatures are split into different features.
    """

    # Start attribute config meta
    # Applying Decorator pattern with the Config class to provide
    # read-only config values from the configured properties.

    @Config(ptype=str, default=None, required=True)
    def temp_file(self):
        """
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Output Components for deegree server storage (www.deegree.org).
#
# Author: Just van den Broecke
#
# NB deegree also supports WFS-T!
#
from stetl.postgis import PostGIS
from stetl.output import Output
from stetl.util import Util, etree
from stetl.packet import FORMAT

import os

log = Util.get_log('deegreeoutput')


class DeegreeBlobstoreOutput(Output):
    """
    Insert features into deegree Blobstore from an etree doc.

    consumes=FORMAT.etree_doc
    """

    def __init__(self, configdict, section):
        Output.__init__(self, configdict, section, consumes=FORMAT.etree_doc)
        self.overwrite = self.cfg.get_bool('overwrite')
        self.srid = self.cfg.get_int('srid', -1)
        self.feature_member_tag = self.cfg.get('feature_member_tag')
        self.feature_type_ids = {}

    def init(self):
# -*- coding: utf-8 -*-
#
# Filter that does nothing, just passes any data through.
#
# Author: Just van den Broecke

from stetl.util import Util
from stetl.filter import Filter
from stetl.packet import FORMAT

log = Util.get_log("nullfilter")


class NullFilter(Filter):
    """
    Pass-through Filter, does nothing. Mainly used in Test Cases.
    """

    # Constructor
    def __init__(self, configdict, section, consumes=FORMAT.any, produces=FORMAT.any):
        Filter.__init__(self, configdict, section, consumes, produces)

    def invoke(self, packet):
        return packet
#!/usr/bin/env python
#
# Transformation of an etree doc with XSLT.
#
# Author: Just van den Broecke

from stetl.component import Config
from stetl.util import Util, etree
from stetl.filter import Filter
from stetl.packet import FORMAT

log = Util.get_log("xsltfilter")


class XsltFilter(Filter):
    """
    Invokes XSLT processor (via lxml) for given XSLT script on an etree doc.

    consumes=FORMAT.etree_doc, produces=FORMAT.etree_doc
    """

    @Config(ptype=str, required=True)
    def script(self):
        """
        Path to XSLT script file.
        """
        pass

    # Constructor
    def __init__(self, configdict, section):
        Filter.__init__(self, configdict,
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Extracts data from a string using a regular expression and generates a record.
#
# Author: Frank Steggink

from stetl.component import Config
from stetl.filter import Filter
from stetl.packet import FORMAT
from stetl.util import Util

import re

log = Util.get_log("regexfilter")


class RegexFilter(Filter):
    """
    Extracts data from a string using a regular expression and returns the named groups as a record.

    consumes=FORMAT.string, produces=FORMAT.record
    """

    # Start attribute config meta
    # Applying Decorator pattern with the Config class to provide
    # read-only config values from the configured properties.

    @Config(ptype=str, default=None, required=True)
    def pattern_string(self):
        """
        Regex pattern string. Should contain named groups.
        """
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Converts Stetl Packet FORMATs. This can be used to connect
# Stetl components with different output/input formats.
#
# Author: Just van den Broecke

from stetl.component import Config
from stetl.util import Util, etree
from stetl.filter import Filter
from stetl.packet import FORMAT

import json

log = Util.get_log("formatconverter")


class FormatConverter(Filter):
    """
    Converts (almost) any packet format (if a converter is available).

    consumes=FORMAT.any, produces=FORMAT.any, but the actual formats are changed at
    initialization based on the input and output formats to be converted, via the
    input_format and output_format config parameters.
    """

    # Start attribute config meta
    # Applying Decorator pattern with the Config class to provide
    # read-only config values from the configured properties.

    @Config(ptype=dict, default=None, required=False)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Input classes for ETL via GDAL OGR.
#
# Author: Just van den Broecke
#
import subprocess

from stetl.component import Config
from stetl.util import Util, gdal, ogr
from stetl.input import Input
from stetl.packet import FORMAT

log = Util.get_log('ogrinput')


class OgrInput(Input):
    """
    Direct GDAL OGR input via the Python OGR wrapper. Via the Python API
    (http://gdal.org/python) an OGR data source is accessed and the Features are
    read from each Layer. Each Layer corresponds to a "doc", so for multi-layer
    sources the 'end-of-doc' flag is set after a Layer has been read.

    This input can read almost any geospatial data format. One can use the features
    directly in a Stetl Filter or use a converter to e.g. convert to GeoJSON structures.

    produces=FORMAT.ogr_feature or FORMAT.ogr_feature_array (all features)
    """

    # Start attribute config meta
    # Applying Decorator pattern with the Config class to provide
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Splits a stream of XML elements into etree docs.
#
# Author: Just van den Broecke
#
from stetl.util import Util, etree
from stetl.filter import Filter
from stetl.packet import FORMAT

log = Util.get_log('xmlassembler')


class XmlAssembler(Filter):
    """
    Split a stream of etree DOM XML elements (usually Features) into etree DOM docs.
    Consumes and buffers elements until max_elements is reached, then produces an etree doc.

    consumes=FORMAT.etree_element_stream, produces=FORMAT.etree_doc
    """

    xpath_base = "//*[local-name() = '%s']"

    # Constructor
    def __init__(self, configdict, section):
        Filter.__init__(self, configdict, section, consumes=FORMAT.etree_element_stream, produces=FORMAT.etree_doc)
# -*- coding: utf-8 -*-
#
# Output to File classes.
#
# Author: Just van den Broecke
#
from stetl.output import Output
from stetl.util import Util
from stetl.packet import FORMAT

import os

log = Util.get_log('fileoutput')


class FileOutput(Output):
    """
    Pretty print XML to file from an etree doc.

    consumes=FORMAT.etree_doc
    """

    def __init__(self, configdict, section):
        Output.__init__(self, configdict, section, consumes=FORMAT.etree_doc)
        log.info("working dir %s" % os.getcwd())

    def write(self, packet):
        if packet.data is None:
            return packet

        file_path = self.cfg.get('file_path')
        return self.write_file(packet, file_path)
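
    # --- Illustrative sketch (assumption, not the actual Stetl write_file implementation) ---
    # write_file is expected to serialize the etree doc to file_path, roughly:
    #
    #   def write_file(self, packet, file_path):
    #       packet.data.write(file_path, pretty_print=True, xml_declaration=True, encoding='utf-8')
    #       return packet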
#!/usr/bin/env python
#
# Extracts data from a string using a regular expression and generates a record.
#
# Author: Frank Steggink

import re

from stetl.component import Config
from stetl.filter import Filter
from stetl.packet import FORMAT
from stetl.util import Util

log = Util.get_log("regexfilter")


class RegexFilter(Filter):
    """
    Extracts data from a string using a regular expression and returns the named groups as a record.

    consumes=FORMAT.string, produces=FORMAT.record
    """

    # Start attribute config meta
    # Applying Decorator pattern with the Config class to provide
    # read-only config values from the configured properties.

    @Config(ptype=str, default=None, required=True)
    def pattern_string(self):
        """
        Regex pattern string. Should contain named groups.
        """
        pass
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Output classes for ETL.
#
# Author: Just van den Broecke
#
from os import sys, path

from stetl.outputs.httpoutput import HttpOutput
from stetl.util import Util
from stetl.packet import FORMAT
from stetl.component import Config

log = Util.get_log('sosoutput')


class SOSTOutput(HttpOutput):
    """
    Output via SOS-T protocol over plain HTTP.

    consumes=FORMAT.record_array
    """

    @Config(ptype=str, default='application/json;charset=UTF-8', required=True)
    def content_type(self):
        """
        The content type (for template).

        Required: True

        Default: application/json;charset=UTF-8
        """
        pass
# -*- coding: utf-8 -*-
#
# String filtering.
#
# Author: Just van den Broecke

from stetl.component import Config
from stetl.util import Util
from stetl.filter import Filter
from stetl.packet import FORMAT

log = Util.get_log("stringfilter")


class StringFilter(Filter):
    """
    Base class for any string filtering.
    """

    # Constructor
    def __init__(self, configdict, section, consumes, produces):
        Filter.__init__(self, configdict, section, consumes, produces)

    def invoke(self, packet):
        if packet.data is None:
            return packet
        return self.filter_string(packet)

    def filter_string(self, packet):
        pass
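

# --- Illustrative sketch (not part of the Stetl source): a minimal StringFilter
# subclass showing the filter_string() override pattern. The class name and
# behavior are assumptions for demonstration only.
class ExampleUpperCaseFilter(StringFilter):
    """
    Uppercases the string payload of each Packet.
    """

    def __init__(self, configdict, section):
        StringFilter.__init__(self, configdict, section, consumes=FORMAT.string, produces=FORMAT.string)

    def filter_string(self, packet):
        packet.data = packet.data.upper()
        return packet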
# -*- coding: utf-8 -*-
#
# Extracts a file from a ZIP file, and saves it as the given file name.
#
# Author: Frank Steggink
#
from stetl.component import Config
from stetl.filter import Filter
from stetl.util import Util
from stetl.packet import FORMAT

log = Util.get_log('zipfileextractor')

BUFFER_SIZE = 1024 * 1024 * 1024


class ZipFileExtractor(Filter):
    """
    Extracts a file from a ZIP file, and saves it as the given file name.

    consumes=FORMAT.record, produces=FORMAT.string
    """

    # Start attribute config meta

    @Config(ptype=str, default=None, required=True)
    def file_path(self):
        """
        File name to write the extracted file to.
        """
        pass
# -*- coding: utf-8 -*-
#
# Writes the payload of a packet as a string to a file.
# Based on outputs.fileoutput.FileOutput.
#
# Author: Frank Steggink
#
from stetl.component import Config
from stetl.filter import Filter
from stetl.util import Util
from stetl.packet import FORMAT

import os

log = Util.get_log('packetwriter')


class PacketWriter(Filter):
    """
    Writes the payload of a packet as a string to a file.

    consumes=FORMAT.any, produces=FORMAT.string
    """

    # Start attribute config meta

    @Config(ptype=str, default=None, required=True)
    def file_path(self):
        """
        File path to write content to.

        Required: True