Example #1
    def __init__(self, verbose=False, quiet=True):

        ETL.__init__(self, verbose=verbose)

        self.read_configfiles()

        self.config["plugins"] = []
Example #2
    def run_etl_groups(cls, logger, data_manager, neo_transactor):
        """This function runs each of the ETL in parallel"""
        etl_time_tracker_list = []
        for etl_group in cls.etl_groups:
            etl_group_start_time = time.time()
            logger.info("Starting ETL group: %s" % etl_group)
            thread_pool = []
            for etl_name in etl_group:
                logger.info("ETL Name: %s" % etl_name)
                config = data_manager.get_config(etl_name)
                if config is not None:
                    etl = cls.etl_dispatch[etl_name](config)
                    process = multiprocessing.Process(target=etl.run_etl)
                    process.start()
                    thread_pool.append(process)
                else:
                    logger.info("No Config found for: %s" % etl_name)
            ETL.wait_for_threads(thread_pool)

            logger.info("Waiting for Queues to sync up")
            neo_transactor.check_for_thread_errors()
            neo_transactor.wait_for_queues()
            etl_elapsed_time = time.time() - etl_group_start_time
            etl_time_message = (
                "Finished ETL group: %s, Elapsed time: %s" %
                (etl_group,
                 time.strftime("%H:%M:%S", time.gmtime(etl_elapsed_time))))
            logger.info(etl_time_message)
            etl_time_tracker_list.append(etl_time_message)

        return etl_time_tracker_list
Example #3
	def export_row_data_to_index(self, data, rownumber):

		parameters = self.config.copy()
		
		# todo: use all configured content plugins, not only this one
		parameters['plugins'] = [
			'enhance_path',
			'enhance_entity_linking',
			'enhance_multilingual',
		]

		etl = ETL()

		try:
			
			etl.process( parameters=parameters, data=data)
		
		# if the user interrupted with the keyboard, respect this and abort
		except KeyboardInterrupt:
			raise KeyboardInterrupt
		except BaseException as e:
			sys.stderr.write( "Exception adding CSV row {} : {}".format(rownumber, e) )

			if 'raise_pluginexception' in self.config:
				if self.config['raise_pluginexception']:
					raise e
Example #4
    def __init__(self, verbose=False):

        ETL.__init__(self, verbose=verbose)

        self.verbose = verbose

        self.read_configfiles()

        # Watched events
        #
        # We need IN_MOVE_SELF to track moved folder paths
        # pyinotify-internally. If omitted, the shell commands
        # mv /docs/src /docs/dest; touch /docs/dest/doc.pdf
        # will produce an IN_MOVED_TO pathname=/docs/dest/ followed by
        # IN_CLOSE_WRITE pathname=/docs/src/doc.pdf
        # whereas we want an IN_CLOSE_WRITE pathname=/docs/dest/doc.pdf
        self.mask = (pyinotify.IN_DELETE
                     | pyinotify.IN_CLOSE_WRITE
                     | pyinotify.IN_MOVED_TO
                     | pyinotify.IN_MOVED_FROM
                     | pyinotify.IN_MOVE_SELF)

        self.watchmanager = pyinotify.WatchManager()  # Watch Manager

        self.handler = EventHandler()

        self.notifier = pyinotify.Notifier(self.watchmanager, self.handler)
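The constructor above only wires up the watch manager, handler, and notifier; below is a minimal sketch of how such a watcher is typically started. The class name ConnectorFiles and the watched path are assumptions for illustration, while add_watch() and loop() are standard pyinotify calls.

# hypothetical startup, assuming the __init__ above belongs to a class named ConnectorFiles
watcher = ConnectorFiles(verbose=True)
# watch a directory tree recursively; auto_add keeps newly created subfolders watched
watcher.watchmanager.add_watch('/docs', watcher.mask, rec=True, auto_add=True)
# blocks and dispatches filesystem events to the EventHandler instance
watcher.notifier.loop()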
Example #5
    def set_configdefaults(self):
        #
        # Standard config
        #
        # Do not edit config here! Overwrite options in /etc/etl/ or /etc/opensemanticsearch/connector-files
        #

        ETL.set_configdefaults(self)

        self.config['force'] = False

        # filename to URI mapping
        self.config['mappings'] = {"/": "file:///"}

        self.config['facet_path_strip_prefix'] = [
            "file://", "http://www.", "https://www.", "http://", "https://"
        ]

        self.config['plugins'] = [
            'enhance_mapping_id',
            'filter_blacklist',
            'filter_file_not_modified',
            'enhance_extract_text_tika_server',
            'enhance_detect_language_tika_server',
            'enhance_contenttype_group',
            'enhance_pst',
            'enhance_csv',
            'enhance_file_mtime',
            'enhance_path',
            'enhance_extract_hashtags',
            'enhance_warc',
            'enhance_zip',
            'clean_title',
            'enhance_multilingual',
        ]

        self.config['blacklist'] = [
            "/etc/opensemanticsearch/blacklist/blacklist-url"
        ]
        self.config['blacklist_prefix'] = [
            "/etc/opensemanticsearch/blacklist/blacklist-url-prefix"
        ]
        self.config['blacklist_suffix'] = [
            "/etc/opensemanticsearch/blacklist/blacklist-url-suffix"
        ]
        self.config['blacklist_regex'] = [
            "/etc/opensemanticsearch/blacklist/blacklist-url-regex"
        ]
        self.config['whitelist'] = [
            "/etc/opensemanticsearch/blacklist/whitelist-url"
        ]
        self.config['whitelist_prefix'] = [
            "/etc/opensemanticsearch/blacklist/whitelist-url-prefix"
        ]
        self.config['whitelist_suffix'] = [
            "/etc/opensemanticsearch/blacklist/whitelist-url-suffix"
        ]
        self.config['whitelist_regex'] = [
            "/etc/opensemanticsearch/blacklist/whitelist-url-regex"
        ]
Example #6
    def export_row_data_to_index(self, data, rownumber):

        parameters = self.config.copy()

        # todo: use all configured content plugins, not only this one
        parameters['plugins'] = [
            'enhance_path',
        ]

        etl = ETL()

        try:

            etl.process(parameters=parameters, data=data)

        # if the user interrupted with the keyboard, respect this and abort
        except KeyboardInterrupt:
            raise KeyboardInterrupt
        except BaseException as e:
            sys.stderr.write("Exception adding CSV row {} : {}".format(
                rownumber, e.message))

            if 'raise_pluginexception' in self.config:
                if self.config['raise_pluginexception']:
                    raise e
Example #7
    def getTargetDdl(self, tableformat, external=True, _ddl=""):
        ddl = _ddl
        databasename = self.metaresultlist[0].target_database
        tablename = self.metaresultlist[0].target_table
        tablelocation = self.metaresultlist[0].target_file_path
        if external is True and ETL.isNullOrEmpty(databasename) is None:
            ddl = f"CREATE EXTERNAL TABLE {tablename}(\n"
        elif external is True and ETL.isNullOrEmpty(databasename) is not None:
            ddl = f"CREATE EXTERNAL TABLE {databasename}.{tablename}(\n"
        elif external is False and ETL.isNullOrEmpty(databasename) is None:
            ddl = f"CREATE TABLE {tablename}(\n"
        elif external is False and ETL.isNullOrEmpty(databasename) is not None:
            ddl = f"CREATE TABLE {databasename}.{tablename}(\n"
        else:
            ddl = f"CREATE EXTERNAL TABLE {databasename}.{tablename}(\n"

        for metares in self.metaresultlist:
            if int(str(metares.src_table_order).strip()) == 0:
                ddl = f"{ddl}`{metares.target_col}` {metares.target_col_datatype},\n"

        ddl = f"{ddl}--End"
        ddl = ddl.strip(',\n--End')
        ddl = f"{ddl}\n)" \
              f"STORED AS {tableformat}"
        if ETL.isNullOrEmpty(tablelocation) is not None and external is True:
            ddl = f"{ddl}\nLOCATION {tablelocation}"
        return ddl
Example #8
	def __init__(self, verbose=False, quiet=True):

		ETL.__init__(self, verbose=verbose)

		self.read_configfiles()

		self.config["plugins"] = []
Example #9
 def count_word(self):
     """
     Count stop words and output statistics.
     """
     word_list = []
     result = ETL().extract_word()
     count, total = 0, len(result)
     for row in result:
         count += 1
         test_id = row[0]
         print(f"{test_id}, {count}/{total}")
         try:
             dump = ETL().extract_cdb(test_id)
             processed = Process(dump).internal_process()
         except (IndexError, UnicodeDecodeError):
             continue
         if "\n\n" in dump:
             exceptions = dump[dump.index("\n\n") + len("\n\n"):]
             try:
                 header = "exception throw location:\n"
                 stack = exceptions[exceptions.index(header) + len(header):]
             except ValueError:
                 continue
             # extract root cause from exceptions
             if dump.count(header) > 1:
                 stack = stack[:stack.index("\n\n")]
             roots = re.findall(r"^\d+:[ ](.+)[ ]at[ ].+", stack, re.M)
             words = self.obtain_word(roots, processed)
             word_list += words
     Log().chart_print(Counter(word_list).most_common(10))
Example #10
class Task:
    def __init__(self, name, question, db_table, row_handler, answer_cql):
        self._name = name
        self._question = question
        self._db_table = db_table
        self._answer_cql = answer_cql
        self._etl = ETL(
            f"{name} ETL",
            [helper.join_path(TEMP_DIR, 'staging.csv')],
            {
                "target": 'Cassandra',
                "table": db_table["table_name"],
                "is_file": False
            },
            [i[0] for i in db_table["cols"]],
            row_handler,
        )

        logger.info(f"{self._name} - Question: {self._question}")

    def _create_table(self):
        table_name, cols, key = self._db_table.values()
        cql_col_types = ', '.join([f"{col[0]} {col[1]}" for col in cols])

        db.execute(
            f"CREATE TABLE IF NOT EXISTS {table_name} ({cql_col_types}, PRIMARY KEY({key}))"
        )

        logger.info(f"{self._name} - Create '{table_name}' table")

    def _get_result(self):
        result = db.fetch(self._answer_cql)

        result_col_name = helper.get_str_between(self._answer_cql, 'SELECT',
                                                 'FROM')[0].strip().split(', ')

        try:
            with open(helper.join_path(RESULT_DIR, f'{self._name}.csv'),
                      'a') as target_file:
                writer = csv.writer(target_file, dialect='Dialect')
                writer.writerow(result_col_name)

                for row in result:
                    writer.writerow(row)
        except IOError as e:
            logger.error(e)
        except:
            logger.error(f"Unexpected error: {sys.exc_info()[0]}")

        logger.info(
            f"{self._name} - Complete generate '{self._name}.csv' file")

    def run(self):
        self._create_table()

        self._etl.run()
        logger.info(f"{self._name} - Complete ETL process")

        self._get_result()
Example #11
    def transform(self):
        # Get Unique source table names for Transformation
        srctables = set()
        for metares in self.model.metaresultlist:
            srctables.add(metares.src_table)

        # For each source table create SourceTable object and assign transform columns
        for srctable in srctables:
            tablemetaresult = self.model.filterMetaResultBySourceTable(
                srctbl=srctable)
            tblinfo: MetaResult = tablemetaresult[0]

            fklist = []

            for item in self.model.datamodel.keys():
                fk_map = self.model.datamodel[item]['fk']
                # skip entries whose foreign-key mapping is None or empty
                if fk_map and srctable in fk_map:
                    fklist.extend(fk_map[srctable]['fk_pk'])

            sourcetable: SourceTable = SourceTable(
                sourcesystem=tblinfo.src_system,
                tablename=tblinfo.src_table,
                pk=self.model.datamodel[tblinfo.src_table]['pk'],
                fk=fklist,
                database=tblinfo.src_database,
                filepath=tblinfo.src_file_path,
                filetype=tblinfo.src_filetype,
                modeltableorder=tblinfo.src_table_order)
            self.sourcetables.append(sourcetable)
            for tbl in tablemetaresult:
                sourcetable.addColumn(
                    name=tbl.src_col,
                    type=tbl.src_col_datatype,
                    pk=(True, False)[tbl.src_key_constraints.__eq__('pk')],
                    udf=tbl.udf,
                    udfargs=tbl.udfarguments,
                    casttype=tbl.target_col_datatype,
                    aliasname=tbl.target_col,
                    filterclause=tbl.src_col_filter,
                    fk={})

            # Read file as dataframe
            sourcetable.readFileFromSource(spark=self.spark)

        ETL.registerAllUDF(sc=self.spark)
        for sourcetable in self.sourcetables:
            sourcetable.applyTransform()

        self.applyJoin()

        self.applyFilters()

        self.applyGroupAggregation()

        self.targetdf.show()
Example #12
	def __init__(self, verbose=False, quiet=True):

		ETL.__init__(self, verbose=verbose)

		self.quiet = quiet

		self.set_configdefaults()
		
		self.read_configfiles()
Example #13
    def applyColTransform(self, query, src_table, src_col, target_col, target_col_datatype, udf, udfarguments):
        if ETL.isNullOrEmpty(udf) is not None and len(udfarguments) != 0:
            query = f"{query} CAST({udf}({src_table}.`{src_col}`, {','.join(udfarguments)}) AS {target_col_datatype}) AS {target_col},"
        elif ETL.isNullOrEmpty(udf) is not None and len(udfarguments) == 0:
            query = f"{query} CAST({udf}({src_table}.`{src_col}`) AS {target_col_datatype}) AS {target_col},"
        else:
            query = f"{query} CAST({src_table}.`{src_col}` AS {target_col_datatype}) AS {target_col},"

        return query
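For illustration, a hedged example of the SQL fragment a single call of the method above would append; the table, column, and UDF names are hypothetical:

    # query = self.applyColTransform("SELECT", "orders", "amount", "amount_usd",
    #                                "decimal(10,2)", "convert_currency", ["EUR"])
    # -> "SELECT CAST(convert_currency(orders.`amount`, EUR) AS decimal(10,2)) AS amount_usd,"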
Example #14
    def _load_and_process_data(self):
        thread_pool = []

        for sub_type in self.data_type_config.get_sub_type_objects():
            process = multiprocessing.Process(target=self._process_sub_type, args=(sub_type,))
            process.start()
            thread_pool.append(process)

        ETL.wait_for_threads(thread_pool)
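ETL.wait_for_threads() appears in several of these examples without its body; the sketch below is only an assumption of what such a helper typically does (join each worker process and surface failures), not the project's actual implementation.

    # assumed shape of the helper, for orientation only
    @staticmethod
    def wait_for_threads(thread_pool):
        # join every worker process and report non-zero exit codes
        for process in thread_pool:
            process.join()
            if process.exitcode:
                print("Worker %s exited with code %s" % (process.name, process.exitcode))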
Example #15
    def __init__(self, verbose=False, quiet=True):

        ETL.__init__(self, verbose=verbose)

        self.quiet = quiet

        self.set_configdefaults()

        self.read_configfiles()
Example #16
    def set_configdefaults(self):
        #
        # Standard config
        #
        # Do not edit config here! Overwrite options in /etc/etl/ or /etc/opensemanticsearch/connector-files
        #

        ETL.set_configdefaults(self)

        self.config['force'] = False
Example #17
    def set_configdefaults(self):
        #
        # Standard config
        #
        # Do not edit config here! Overwrite options in /etc/etl/ or /etc/opensemanticsearch/connector-files
        #

        ETL.set_configdefaults(self)

        self.config['force'] = False
Example #18
 def setUp(self):
     self.etl = ETL()
     data = {
         'customer_id': [1, 1],
         'order_id': [1, 2],
         'order_item_id': [5, 6],
         'num_items': [2, 3],
         'revenue': [90, 50],
         'created_at_date': [datetime(2017, 10, 5),
                             datetime(2017, 10, 12)]
     }
     self.test_df = pd.DataFrame.from_dict(data)
Example #19
    def _load_and_process_data(self):
        thread_pool = []
        ensg_to_gene_primary_id_map = self._get_primary_gene_ids_to_ensembl_ids(
        )

        for sub_type in self.data_type_config.get_sub_type_objects():
            process = multiprocessing.Process(
                target=self._process_sub_type,
                args=(sub_type, ensg_to_gene_primary_id_map))
            process.start()
            thread_pool.append(process)

        ETL.wait_for_threads(thread_pool)
Example #20
def ETI_TEST():
    print("Staring ETL Test  Job !!!!")
    var = Variable()
    var.INPUT_DATA = "/input/employee.csv"
    csvParser = csv_parser(var.INPUT_DATA)

    if csvParser.file_exist(var.INPUT_DATA):
        TABLE_NAME = csvParser.get_table_name(
            var.INPUT_DATA)  # Takes Table name from Filename

        if var.CREATE_TABLE and not var.RELATION:
            ob = ETL(TABLE_NAME, var.INPUT_DATA)
            ob.etl_process(csvParser.check_header(), 20)
Example #21
def main():

    print("Staring ETL Job !!!!")
    var = Variable()
    csvParser = csv_parser(var.INPUT_DATA)

    if csvParser.file_exist(var.INPUT_DATA):
        TABLE_NAME = csvParser.get_table_name(
            var.INPUT_DATA)  # Takes Table name from Filename

        if var.CREATE_TABLE and not var.RELATION:
            ob = ETL(TABLE_NAME, var.INPUT_DATA)
            ob.etl_process(csvParser.check_header())
Example #22
def enrich(plugins, uri, wait=0):

    if wait:
        time.sleep(wait)

    etl = ETL()
    etl.read_configfile('/etc/opensemanticsearch/etl')
    etl.read_configfile('/etc/opensemanticsearch/enhancer-rdf')

    etl.config['plugins'] = plugins.split(',')

    filename = uri

    # if present, remove the protocol prefix file://
    if filename.startswith("file://"):
        filename = filename.replace("file://", '', 1)

    parameters = etl.config.copy()

    parameters['id'] = uri
    parameters['filename'] = filename

    parameters, data = etl.process(parameters=parameters, data={})

    return data
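A minimal usage sketch for enrich(); the plugin names and the file URI below are illustrative assumptions:

# hypothetical call: run two plugins against a local file and inspect the enriched fields
enriched = enrich('enhance_path,enhance_multilingual', 'file:///tmp/example.pdf', wait=2)
print(sorted(enriched.keys()))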
Example #23
File: main.py Project: gijs/stetl
def main():
    """The `main` function, to be called from commandline, like `python src/main.py -c etl.cfg`.

    Args:
       -c  --config <config_file>  the Stetl config file.
       -s  --section <section_name> the section in the Stetl config (ini) file to execute (default is [etl]).
       -a  --args <arglist> substitutable args for symbolic, {arg}, values in Stetl config file, in format "arg1=foo arg2=bar" etc.

    """
    args = parse_args()

    # Do the ETL
    etl = ETL(vars(args), args.config_args)
    etl.run()
Example #24
def main():
    parser = argparse.ArgumentParser(
        description='CLI for the pldb application.')
    parser.add_argument(
        '--update',
        action='store_true',
        help='Download season data and update the JSON file and the database.')
    parser.add_argument(
        '--table',
        action='store_true',
        help=
        'Display the current standings table (calculated from data in the database).'
    )
    parser.add_argument(
        '--club',
        type=str,
        default=None,
        help=
        'Display info for all the matches for the given club in the season.')
    args = parser.parse_args()

    if args.update:
        print("Updating season data...")
        etl = ETL()
        etl.run()
        print("done.")
    elif args.table:
        query = Query()
        table_data = query.table()
        print("#\tClub\tPlayed\tWon\tDrawn\tLost\tGD\tPoints")
        for rank in range(len(table_data)):
            row = table_data[rank]
            print(
                f"{rank + 1}\t{row['club']}\t{row['matches_played']}\t"
                f"{row['wins']}\t{row['draws']}\t{row['losses']}\t{row['goal_diff']}\t"
                f"{row['points']}")
    elif args.club:
        query = Query()
        for match in query.club(args.club):
            kick_time = match['kickoff'] / 1000
            kick_time = datetime.datetime.fromtimestamp(kick_time).strftime(
                "%a %d %b %H:%M")
            if match['status'] == 'C':
                score = f"{match['away_goals']} {match['home_goals']}"
            else:
                score = ' @ '
            print(
                f"{kick_time} {match['away_club']['abbr']} "
                f"{score} {match['home_club']['abbr']} {match['ground']['name']}"
            )
Example #25
def main():
    """The `main` function, to be called from commandline, like `python src/main.py -c etl.cfg`.

    Args:
       -c  --config <config_file>  the Stetl config file.
       -s  --section <section_name> the section in the Stetl config (ini) file to execute (default is [etl]).
       -a  --args <arglist> substitutable args for symbolic, {arg}, values in Stetl config file, in format "arg1=foo arg2=bar" etc.

    """
    args = parse_args()

    # Do the ETL
    etl = ETL(vars(args), args.config_args)
    etl.run()
Example #26
    def __init__(self, verbose=False):

        ETL.__init__(self, verbose=verbose)

        self.verbose = verbose

        self.read_configfiles()

        self.mask = pyinotify.IN_DELETE | pyinotify.IN_CLOSE_WRITE | pyinotify.IN_MOVED_TO | pyinotify.IN_MOVED_FROM  # watched events

        self.watchmanager = pyinotify.WatchManager()  # Watch Manager

        self.handler = EventHandler()

        self.notifier = pyinotify.Notifier(self.watchmanager, self.handler)
Example #27
	def __init__(self, verbose = False ):

		ETL.__init__(self, verbose=verbose)

		self.verbose=verbose

		self.read_configfiles()
		
		self.mask = pyinotify.IN_DELETE | pyinotify.IN_CLOSE_WRITE | pyinotify.IN_MOVED_TO | pyinotify.IN_MOVED_FROM  # watched events

		self.watchmanager = pyinotify.WatchManager()  # Watch Manager

		self.handler = EventHandler()
		
		self.notifier = pyinotify.Notifier(self.watchmanager, self.handler)
Example #28
	def __init__(self, plugins=[], verbose=False):


		ETL.__init__(self, plugins=plugins, verbose=verbose)




		self.read_configfile ('/etc/etl/config')
		self.read_configfile ('/etc/opensemanticsearch/etl')
		self.read_configfile ('/etc/opensemanticsearch/enhancer-rdf')

		self.fields = self.getfieldnames_from_plugins()


		# init exporter	(todo: exporter as extended PySolr)	
		self.export_solr = export_solr.export_solr()


		# init PySolr
		solr_uri = self.config['solr']
		if not solr_uri.endswith('/'):
			solr_uri += '/'
		solr_uri += self.config['index'] 

		self.solr = pysolr.Solr( solr_uri )

		self.threads_max = None

		# if not set explicitly, autodetect the thread count from the number of CPUs
		if not self.threads_max:
			import multiprocessing
			self.threads_max = multiprocessing.cpu_count()
			if self.verbose:
				print ( "Setting threads to count of CPUs: " + str(self.threads_max) )

		self.rows_per_step = 100
		if self.rows_per_step < self.threads_max * 2:
			self.rows_per_step = self.threads_max * 2


		self.work_in_progress = []
		self.delete_from_work_in_progress_lock = threading.Lock() 

		self.delete_from_work_in_progress_after_commit = []
		self.work_in_progress_lock = threading.Lock() 

		self.e_job_done = threading.Event()
Example #29
def run_etl():
	#initialize auth_controller
	auth_controller = AuthController('http://restservice:8001/ETL/api/v1.0/', 'auths')

	#initialize data_controller
	data_controller = DataController('http://restservice:8001/ETL/api/v1.0/')

	#initialize sftp_controller
	sftp_controller = SftpController('sftpserver.pyc.test', 'etluser', 'sftpserver_keys/sftpserver_rsa', 'IntuitPYC')

	#initialize vertica_controller

	vertica_controller = VerticaController('config/vsql.config')

	etl = ETL(auth_controller, data_controller, sftp_controller, vertica_controller)
	etl.run()
Example #30
    def getWhereClauses(self):
        def matchEqualityOperator(expression):
            # translate the filter DSL (eq(), ne(), gt(), gte(), lt(), lte(), in(), notin())
            # into SQL operators; longer names are matched first so e.g. 'gte(' is not
            # swallowed by the 'gt(' branch, and unrecognized expressions pass through unchanged
            expr = str(expression).strip()
            if 'eq(' in expr:
                expr = expr.replace('eq(', '=').replace(')', '', 1)
            elif 'gte(' in expr:
                expr = expr.replace('gte(', '>=').replace(')', '', 1)
            elif 'lte(' in expr:
                expr = expr.replace('lte(', '<=').replace(')', '', 1)
            elif 'gt(' in expr:
                expr = expr.replace('gt(', '>').replace(')', '', 1)
            elif 'lt(' in expr:
                expr = expr.replace('lt(', '<').replace(')', '', 1)
            elif 'notin(' in expr:
                expr = expr.replace('notin(', 'NOT IN').replace(')', '', 1)
            elif 'in(' in expr:
                expr = expr.replace('in(', 'IN').replace(')', '', 1)
            elif 'ne(' in expr:
                expr = expr.replace('ne(', '<>').replace(')', '', 1)
            return expr

        query, joindict = self.joinSQL(self.datamodel, 'purchase', 'product', 'store')

        wherequery = ""
        for metares in self.metaresultlist:
            if metares.src_col_filter and str(metares.src_col_filter).strip():
                wherequery = f"{metares.src_table}.`{metares.src_col}` {matchEqualityOperator(metares.src_col_filter)}"
        if ETL.isNullOrEmpty(wherequery) is not None:
            query = f"{query} WHERE {wherequery}"
        return query
Example #31
    def __init__(self, src_system, src_database, src_table, src_filetype, src_file_path, src_col, src_col_datatype,
                 src_key_constraints,
                 src_col_filter,
                 src_col_aggregator,
                 src_col_aggregator_filter,
                 src_table_order,
                 target_database,
                 target_table,
                 target_filetype,
                 target_file_path, target_col, target_col_datatype, udf="", udfarguments=""):
        self.metacolumnslist = {}
        self.src_system = src_system
        self.src_database = src_database
        self.src_table = src_table
        self.src_filetype = src_filetype
        self.src_file_path = src_file_path
        self.src_col = src_col
        self.src_col_datatype = str(src_col_datatype).lower()
        self.src_key_constraints = str(src_key_constraints).lower()
        self.src_col_filter = src_col_filter
        self.src_col_aggregator = src_col_aggregator
        self.src_col_aggregator_filter = src_col_aggregator_filter
        self.src_table_order = int(str(src_table_order).strip())
        self.target_database = target_database
        self.target_table = target_table
        self.target_filetype = target_filetype
        self.target_file_path = target_file_path
        self.target_col = target_col
        self.target_col_datatype = target_col_datatype
        self.target_col_aggregator = ""
        self.target_col_aggregator_filter = ""
        self.udf = udf
        if ETL.isNullOrEmpty(udfarguments) is not None:
            self.udfarguments = udfarguments.split('|')
        else:
            self.udfarguments = []

        self.metacolumnslist.update({'src_filetype': self.src_filetype})
        self.metacolumnslist.update({'src_system': self.src_system})
        self.metacolumnslist.update({'src_database': self.src_database})
        self.metacolumnslist.update({'src_table': self.src_table})
        self.metacolumnslist.update({'src_file_path': self.src_file_path})
        self.metacolumnslist.update({'src_col': self.src_col})
        self.metacolumnslist.update({'src_col_datatype': self.src_col_datatype})
        self.metacolumnslist.update({'src_key_constraints': self.src_key_constraints})
        self.metacolumnslist.update({'src_col_filter': self.src_col_filter})
        self.metacolumnslist.update({'src_col_aggregator': self.src_col_aggregator})
        self.metacolumnslist.update({'src_col_aggregator_filter': self.src_col_aggregator_filter})
        self.metacolumnslist.update({'src_table_order': self.src_table_order})
        self.metacolumnslist.update({'target_database': self.target_database})
        self.metacolumnslist.update({'target_table': self.target_table})
        self.metacolumnslist.update({'target_filetype': self.target_filetype})
        self.metacolumnslist.update({'target_file_path': self.target_file_path})
        self.metacolumnslist.update({'target_col': self.target_col})
        self.metacolumnslist.update({'target_col_datatype': self.target_col_datatype})
        self.metacolumnslist.update({'target_col_aggregator': self.target_col_aggregator})
        self.metacolumnslist.update({'target_col_aggregator_filter': self.target_col_aggregator_filter})
        self.metacolumnslist.update({'udf': self.udf})
        self.metacolumnslist.update({'udfarguments': self.udfarguments})
Example #32
    def __init__(self, verbose=False, quiet=True):

        ETL.__init__(self, verbose=verbose)

        self.quiet = quiet

        self.set_configdefaults()

        self.read_configfiles()

        # read on what DB or search server software our index is
        export = self.config['export']

        # call delete function of the configured exporter
        module = importlib.import_module(export)
        objectreference = getattr(module, export)
        self.connector = objectreference()
Example #33
    def __init__(self, verbose=False, quiet=True):

        ETL.__init__(self, verbose=verbose)

        self.quiet = quiet

        self.set_configdefaults()

        self.read_configfiles()

        # read on what DB or search server software our index is
        export = self.config['export']

        # call delete function of the configured exporter
        module = importlib.import_module(export)
        objectreference = getattr(module, export)
        self.connector = objectreference()
Example #34
def enrich(plugins, uri, wait=0):
	
	if wait:
		time.sleep(wait)
	
	etl = ETL()
	etl.read_configfile('/etc/opensemanticsearch/etl')
	etl.read_configfile('/etc/opensemanticsearch/enhancer-rdf')
	
	etl.config['plugins'] = plugins.split(',')

	filename = uri

	# if present, remove the protocol prefix file://
	if filename.startswith("file://"):
		filename = filename.replace("file://", '', 1)
	
	parameters = etl.config.copy()
			
	parameters['id'] = uri
	parameters['filename'] = filename
	
	parameters, data = etl.process (parameters=parameters, data={})

	return data
Example #35
    def __init__(self, name, question, db_table, row_handler, answer_cql):
        self._name = name
        self._question = question
        self._db_table = db_table
        self._answer_cql = answer_cql
        self._etl = ETL(
            f"{name} ETL",
            [helper.join_path(TEMP_DIR, 'staging.csv')],
            {
                "target": 'Cassandra',
                "table": db_table["table_name"],
                "is_file": False
            },
            [i[0] for i in db_table["cols"]],
            row_handler,
        )

        logger.info(f"{self._name} - Question: {self._question}")
Example #36
    def _load_and_process_data(self):
        thread_pool = []

        query_tracking_list = multiprocessing.Manager().list()
        for sub_type in self.data_type_config.get_sub_type_objects():
            process = multiprocessing.Process(target=self._process_sub_type,
                                              args=(sub_type,
                                                    query_tracking_list))
            process.start()
            thread_pool.append(process)

        ETL.wait_for_threads(thread_pool)

        queries = []
        for item in query_tracking_list:
            queries.append(item)

        Neo4jTransactor.execute_query_batch(queries)
Example #37
 def update_mapping_new_product_id(self):
     logging.info("Create new product id after pruning dataset")
     _r_map = self.dataset[['PRODUCT_ID', 'NEW_PRODUCT_ID']]
     mm = [_r_map['PRODUCT_ID'].unique(), _r_map['NEW_PRODUCT_ID'].unique()]
     mapping = pd.DataFrame(data=np.array(mm).T,
                            columns=["product_id", "new_product_id"])
     logging.info("Save new product id into db")
     ETL(DATA_PATH, CONFIG_PATH,
         SCHEMA_PATH).insert_new_product_id_table(mapping)
Example #38
    def __init__(self, verbose=False, quiet=True):

        ETL.__init__(self, verbose=verbose)

        self.quiet = quiet

        self.set_configdefaults()

        self.read_configfiles()

        # if not set explicitly, autodetect the thread count from the number of CPUs
        if not self.threads_max:
            import multiprocessing
            self.threads_max = multiprocessing.cpu_count()
            if self.verbose:
                print("Setting threads to count of CPUs: " +
                      str(self.threads_max))

        self.e_job_done = threading.Event()
Example #39
    def _load_and_process_data(self):

        sub_types = []

        for sub_type in self.data_type_config.get_sub_type_objects():
            sub_types.append(sub_type.get_data_provider())

        thread_pool = []

        query_tracking_list = multiprocessing.Manager().list()
        for sub_type in self.data_type_config.get_sub_type_objects():
            process = multiprocessing.Process(target=self._process_sub_type,
                                              args=(sub_type, sub_types,
                                                    query_tracking_list))
            process.start()
            thread_pool.append(process)

        ETL.wait_for_threads(thread_pool)

        queries = []
        for item in query_tracking_list:
            queries.append(item)

        algo_queries = []

        for item in queries:
            if "algorithm" in item[1]:
                algo_queries.append(item)

        main_list = self.get_randomized_list(sub_types)

        for file_set in main_list:
            for pair in file_set:
                for item in queries:
                    if pair[0] + "_" + pair[1] in item[1]:
                        self.logger.debug("Pair: %s Item: %s", pair, item[1])
                        Neo4jTransactor.execute_query_batch([item])

            Neo4jTransactor().wait_for_queues()

        Neo4jTransactor.execute_query_batch(algo_queries)
        self.error_messages()
Example #40
	def set_configdefaults(self):
		#
		# Standard config
		#
		# Do not edit config here! Overwrite options in /etc/etl/ or /etc/opensemanticsearch/connector-files
		#
		
		ETL.set_configdefaults(self)
						
		self.config['force'] = False

		# filename to URI mapping
		self.config['mappings'] = { "/": "file:///" }

		self.config['facet_path_strip_prefix'] = [ "file://", "http://www.", "https://www.", "http://", "https://" ]
		
		self.config['plugins'] = [
			'enhance_mapping_id',
			'filter_blacklist',
			'filter_file_not_modified',
			'enhance_extract_text_tika_server',
			'enhance_detect_language_tika_server',
			'enhance_contenttype_group',
			'enhance_pst',
			'enhance_csv',
			'enhance_file_mtime',
			'enhance_path',
			'enhance_extract_hashtags',
			'enhance_warc',
			'enhance_zip',
			'clean_title',
			'enhance_multilingual',
		]
		
		self.config['blacklist'] = ["/etc/opensemanticsearch/blacklist/blacklist-url"]
		self.config['blacklist_prefix'] = ["/etc/opensemanticsearch/blacklist/blacklist-url-prefix"]
		self.config['blacklist_suffix'] = ["/etc/opensemanticsearch/blacklist/blacklist-url-suffix"]
		self.config['blacklist_regex'] = ["/etc/opensemanticsearch/blacklist/blacklist-url-regex"]
		self.config['whitelist'] = ["/etc/opensemanticsearch/blacklist/whitelist-url"]
		self.config['whitelist_prefix'] = ["/etc/opensemanticsearch/blacklist/whitelist-url-prefix"]
		self.config['whitelist_suffix'] = ["/etc/opensemanticsearch/blacklist/whitelist-url-suffix"]
		self.config['whitelist_regex'] = ["/etc/opensemanticsearch/blacklist/whitelist-url-regex"]
Example #41
def main():
    """The `main` function, to be called from commandline, like `python src/main.py -c etl.cfg`.

    Args:
       -c  --config <config_file>  the Stetl config file.
       -s  --section <section_name> the section in the Stetl config (ini) file to execute (default is [etl]).
       -a  --args <arglist> substitutable args for symbolic, {arg}, values in Stetl config file, in format "arg1=foo arg2=bar" etc.
       -d  --doc <class> Get component documentation like its configuration parameters, e.g. stetl --doc stetl.inputs.fileinput.FileInput
       -h  --help get help info

    """
    args = parse_args()

    if args.config_file:
        # Do the ETL
        etl = ETL(vars(args), args.config_args)
        etl.run()

    elif args.doc_args:
        print_doc(args.doc_args)
    else:
        print('Unknown option, try stetl -h for help')
Example #42
	def etl_graph(self, parameters):

		if self.verbose:
			print("Graph has {} triples.".format(len(self.graph)) )
	
		count_triple = 0
		count_subjects = 0
	
		part_parameters = {}
		part_parameters['plugins'] = []
		part_parameters['export'] = parameters['export']
						
		property2facet = {}
		if 'property2facet' in parameters:
			property2facet = parameters['property2facet']

		etl_processor = ETL()
		etl_processor.verbose = self.verbose
		
		class_properties = []
		class_properties.append(rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'))
		class_properties.append(rdflib.term.URIRef(u'http://www.wikidata.org/prop/direct/P31'))
		# since there can be multiple triples/values for the same property (from different graphs, or the graph may describe an already indexed file/document),
		# do not overwrite the document but add the value to the existing document and the values of the facet/field/property
		part_parameters['add'] = True

		# use SPARQL query with distinct to get subjects only once
		res = self.graph.query(
			"""SELECT DISTINCT ?subject
			WHERE {
			?subject ?predicate ?object .
			}""")
	
		for row in res:

			count_subjects += 1
	
			if self.verbose:
				print( "Importing entity / subject {}".format(count_subjects) )

			# get subject of the concept from first column
			subj = row[0]

			if self.verbose:
				print ( "Processing RDF subject {}".format(subj) )

			part_data = {}
			
			part_data['content_type_group_ss'] = 'Knowledge graph'
			# subject as URI/ID
			part_parameters['id'] = subj
			
			preferred_label = self.get_preferred_label(subject=subj)
			part_data['title_txt'] = preferred_label
			
			count_subject_triple = 0

			# get all triples for this subject
			for pred, obj in self.graph.predicate_objects(subject=subj):

				count_triple += 1
				count_subject_triple += 1

				if self.verbose:
					print( "Importing subjects triple {}".format(count_subject_triple) )
					print( "Predicate / property: {}".format(pred) )
					print( "Object / value: {}".format(obj) )


				try:
					
					# if class add preferredlabel of this entity to facet of its class (RDF rdf:type or Wikidata "instance of" (Property:P31)),
					# so its name (label) will be available in entities view and as filter for faceted search
					
					if pred in class_properties:
						class_facet = str(obj)
						# map class to facet, if mapping for class exist
						if class_facet in property2facet:
							class_facet = property2facet[class_facet]
							if class_facet in parameters['facets']:
								part_data['content_type_ss'] = 'Knowledge graph class {}'.format(parameters['facets'][class_facet]['label'])
						etl.append(data=part_data, facet=class_facet, values=preferred_label)


					#
					# Predicate/property to facet/field
					#

					# set Solr datatype strings so facets not available yet in Solr schema can be inserted automatically (dynamic fields) with right datatype
					
					facet = pred + '_ss'
					facet_uri = facet + '_uri_ss'
					facet_preferred_label_and_uri = facet + '_preflabel_and_uri_ss'
					
					if self.verbose:
						print ( "Facet: {}".format(facet) )

	
					#
					# get values or labels of this object
					#

					values = self.get_values(obj=obj)
					if self.verbose:
						print ( "Values: {}".format(values) )

					# insert or append value (object of triple) to data
					etl.append(data=part_data, facet=facet, values=values)
					

					# if object is reference/URI append URI
					if type(obj) == rdflib.URIRef:
						
						uri = obj
						
						etl.append( data=part_data, facet=facet_uri, values=uri )

						# append mixed field with preferred label and URI of the object for disambiguation of different Entities/IDs/URIs with same names/labels in faceted search
						preferredlabel_and_uri = "{} <{}>".format ( self.get_preferred_label(subject=obj), obj)

					else:
						preferredlabel_and_uri = self.get_preferred_label(subject=obj)
					
					etl.append(data=part_data, facet=facet_preferred_label_and_uri, values=preferredlabel_and_uri)


				except KeyboardInterrupt:
					raise KeyboardInterrupt
	
				except BaseException as e:
					sys.stderr.write( "Exception while triple {} of subject {}: {}\n".format(count_subject_triple, subj, e) )
	
	
			# index triple
			etl_processor.process( part_parameters, part_data)
Example #43
    def run(self):
        print("running")

        etl = ETL()
        etl.run()
Example #44
	def process (self, parameters={}, data={} ):
	
		verbose = False
		if 'verbose' in parameters:
			if parameters['verbose']:	
				verbose = True
		
		if 'id' in data:
			docid = data['id']
		else:
			docid = parameters['id']

		# default classifier
		classifier = 'en_core_web_sm'

		if 'spacy_ner_classifier_default' in parameters:
			classifier = parameters['spacy_ner_classifier_default']

		# set language specific classifier, if configured and document language detected
		if 'spacy_ner_classifiers' in parameters and 'language_s' in data:
			# is a language-specific classifier available for the detected language?
			if data['language_s'] in parameters['spacy_ner_classifiers']:
				classifier = parameters['spacy_ner_classifiers'][data['language_s']]

		analyse_fields = ['content_txt', 'ocr_t', 'ocr_descew_t']

		text = ''
		for field in analyse_fields:
			if field in data:
				text = "{}{}\n".format(text, data[field])


		# extract sentences from text
		url = "http://localhost:8080/sents"
		headers = {'content-type': 'application/json'}
		d = {'text': text, 'model': classifier}

		response = requests.post(url, data=json.dumps(d), headers=headers)
		sentences = response.json()
						
	
		etl = ETL()
		
		sentencenumber = 0
		
		for sentence in sentences:
			
			sentencenumber +=1

			partdocid = docid + '#sentence' + str(sentencenumber)
			
			partparameters = parameters.copy()
			partparameters['plugins'] = ['enhance_path', 'enhance_detect_language_tika_server', 'enhance_entity_linking', 'enhance_multilingual']

			if 'enhance_ner_spacy' in parameters['plugins']:
				partparameters['plugins'].append('enhance_ner_spacy')
			if 'enhance_ner_stanford' in parameters['plugins']:
				partparameters['plugins'].append('enhance_ner_stanford')

			sentencedata = {}
			sentencedata['id'] = partdocid
		
			sentencedata['container_s'] = docid
	
			if 'author_ss' in data:
				sentencedata['author_ss'] = data['author_ss']
				
			sentencedata['content_type_group_ss'] = "Sentence"
			sentencedata['content_type_ss'] = "Sentence"
			sentencedata['content_txt'] = sentence

			# index sentence
			try:	
				partparameters, sentencedata = etl.process( partparameters, sentencedata )
	
			except BaseException as e:
				sys.stderr.write( "Exception adding sentence {} : {}".format(sentencenumber, e) )
		
		data['sentences_i'] = sentencenumber	

		return parameters, data
Example #45
	def segment_pdf_to_pages ( self, parameters={}, data={} ):
	
		verbose = False
		if 'verbose' in parameters:
			if parameters['verbose']:	
				verbose = True
	
		if 'id' in data:
			docid = data['id']
		else:
			docid = parameters['id']
		
		filename = parameters['filename']
				
	
		# defaults, if pdfinfo does not detect them
		pages = 1
		title = 'No title'
		author = None
		
		# get pagecount with pdfinfo command line tool
		pdfinfo = subprocess.check_output(['pdfinfo', '-enc', 'UTF-8', filename])
	
		# decode
		pdfinfo = pdfinfo.decode(encoding='UTF-8')
	
		# get the page count from the pdfinfo result;
		# it is text with one line per parameter
		for line in pdfinfo.splitlines():
			line=line.strip()
			# we want only the line with the pagecount
			if line.startswith('Pages:'):
				pages = int( line.split()[1] )
	
			if line.startswith('Title:'):
				title = line.replace("Title:", '', 1)
				title = title.strip()
	
			if line.startswith('Author:'):
				author = line.replace("Author:", '', 1)
				author = author.strip()
		
		etl = ETL()

		# export and index each page
		for pagenumber in range(1, pages + 1):
	
			if verbose:
				print ("Extracting PDF page {} of {}".format(pagenumber, pages))
			# generate temporary filename
			md5hash = hashlib.md5(filename.encode('utf-8')).hexdigest()
			temp_filename = tempfile.gettempdir() + os.path.sep + "opensemanticetl_pdftotext_" + md5hash + "_" + str(pagenumber)
	
			# call pdftotext to write the text of page into tempfile
			try:
				result = subprocess.check_call(['pdftotext', '-enc', 'UTF-8','-f', str(pagenumber), '-l', str(pagenumber), filename, temp_filename])
			except BaseException as e:
				sys.stderr.write( "Exception extracting text from PDF page {}: {}\n".format(pagenumber, e) )
	
			# read text from tempfile
			with open(temp_filename, "r", encoding="utf-8") as f:
				text = f.read()
			os.remove(temp_filename)

			partdocid = docid + '#page=' + str(pagenumber)
			
			partparameters = parameters.copy()
			partparameters['plugins'] = ['enhance_path', 'enhance_detect_language_tika_server', 'enhance_entity_linking', 'enhance_multilingual']

			if 'enhance_ner_spacy' in parameters['plugins']:
				partparameters['plugins'].append('enhance_ner_spacy')
			if 'enhance_ner_stanford' in parameters['plugins']:
				partparameters['plugins'].append('enhance_ner_stanford')

			pagedata = {}
			pagedata['id'] = partdocid
		
		
			pagedata['page_i'] = pagenumber
			pagedata['pages_i'] = pages
			pagedata['container_s'] = docid
			pagedata['title_txt'] = title
	
			if author:
				pagedata['author_ss'] = author
				
			pagedata['content_type_group_ss'] = "Page"
			pagedata['content_type_ss'] = "PDF page"
			pagedata['content_txt'] = text

			if verbose:
				print ( "Indexing extracted page {}".format(pagenumber) )
	
	
			# index page
			try:	
				partparameters, pagedata = etl.process( partparameters, pagedata)
	
			except BaseException as e:
				sys.stderr.write( "Exception adding PDF page {} : {}".format(pagenumber, e) )
		
		data['pages_i'] = pages
		
		return parameters, data
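For reference, the parsing loop above expects pdfinfo output of roughly this shape (an illustrative excerpt, not captured from a real run):

	# Title:          Annual report
	# Author:         Jane Doe
	# Pages:          12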