def create_connection(configuration: dict,
                      isolation_level: str = None) -> Connection:
    """Create a SQLAlchemy connection for the given configuration.

    :param configuration: dict holding at least 'connection_url'; may also
        hold 'connection_query', a statement executed right after connecting
        (e.g. to set a search path or session options).
    :param isolation_level: optional isolation level passed to the engine.
        When falsy, the engine is created without the parameter so the
        backend's default isolation level applies — check your database's
        documentation, as the default varies between databases.
    :return: an open SQLAlchemy ``Connection``.
    """
    # todo: a user may prefer to run a session that commits data only at the
    # very end of the ETL instead of an auto commit execution at engine level.

    # Build the engine kwargs once instead of duplicating the create_engine
    # call in two branches (the original repeated it for each case).
    engine_kwargs = {'poolclass': NullPool}
    if isolation_level:
        engine_kwargs['isolation_level'] = isolation_level
    engine = create_engine(configuration['connection_url'], **engine_kwargs)

    connection = engine.connect()
    if configuration.get('connection_query') is not None:
        logger.info(f'Running connection query: '
                    f'{configuration["connection_query"]}')
        connection.execute(text(configuration['connection_query']))
    return connection
def render_queries(self) -> None:
    """Log the rendered SQL of every query inside the selected step window."""
    for index, query in enumerate(self.configuration["order"]):
        # Only queries within [from_step_index, to_step_index] are shown.
        if not (self.from_step_index <= index <= self.to_step_index):
            continue
        sql = self.configuration["queries"][query]
        logger.info(f'\n##### Query {str(index + 1)}: {query} #####\n'
                    f'\n{sql}\n')
def run_job(sqlbucket, name, db, fstep, tstep, to_date, from_date, from_days,
            to_days, group, isolation, verbose, rendering, all, edb, args):
    """Run (or render) an ETL project against one or more databases.

    :param sqlbucket: the SQLBucket instance holding connections/projects.
    :param name: project name to load.
    :param db: comma-separated list of connection names to include.
    :param fstep/tstep: first/last step indexes to execute.
    :param to_date/from_date: explicit date bounds, overridden by
        ``to_days``/``from_days`` when those are given.
    :param group: optional query group filter.
    :param isolation: isolation level from the CLI; when ``None`` each
        database may supply its own via its connection variables.
    :param rendering: when truthy, only render the queries instead of running.
    :param all: when truthy and ``db`` is empty, run against every connection.
    :param edb: comma-separated list of connection names to exclude.
    :param args: extra CLI variables parsed into the template context.
    """
    submitted_variables = cli_variables_parser(args)
    # Relative day offsets take precedence over explicit dates.
    if from_days is not None:
        from_date = n_days_ago(int(from_days))
    if to_days is not None:
        to_date = n_days_ago(int(to_days))
    submitted_variables["to"] = to_date
    submitted_variables["from"] = from_date
    logger.info('Variables used')
    logger.info(submitted_variables)

    # included dbs
    dbs = list()
    if db:
        dbs = db.split(',')
    elif all:
        dbs = list(sqlbucket.connections.keys())
    else:
        # No f-string needed: the message has no placeholders (lint F541).
        print("Either parameter db (-b) or (--all) flag is required")

    # excluded dbs
    if edb:
        ex_dbs = edb.split(',')
        dbs = [item for item in dbs if item not in ex_dbs]

    for dbi in dbs:
        connection_variables = sqlbucket.connection_variables[dbi]
        # Bug fix: resolve the isolation level into a per-database local.
        # The original mutated the shared `isolation` variable inside the
        # loop, so one database's configured isolation_level leaked into
        # every subsequent database in the run.
        db_isolation = isolation
        if 'isolation_level' in connection_variables and db_isolation is None:
            db_isolation = connection_variables['isolation_level']

        etl = sqlbucket.load_project(project_name=name,
                                     connection_name=dbi,
                                     variables=submitted_variables)
        if rendering:
            etl.render(from_step=fstep, to_step=tstep, group=group)
        else:
            if db_isolation:
                db_isolation = db_isolation.upper()
            etl.run(from_step=fstep, to_step=tstep, group=group,
                    verbose=verbose, isolation_level=db_isolation)
def run_integrity(sqlbucket, name, db, prefix, verbose, args):
    """Run a project's integrity checks and exit non-zero on failures."""
    variables = cli_variables_parser(args)
    logger.info('Variables used')
    logger.info(variables)
    project = sqlbucket.load_project(project_name=name,
                                     connection_name=db,
                                     variables=variables)
    errors = project.run_integrity(prefix=prefix, verbose=verbose)
    if errors:
        # Distinct exit code lets CI pipelines detect integrity failures.
        sys.exit(3)
def run_project(self) -> None:
    """Execute the project's queries in order, within the selected step range.

    Opens one connection for the whole run, executes each query while
    logging per-query timing, then logs the overall duration.
    """
    self.starting_logs()
    start = datetime.now()
    connection = create_connection(
        self.configuration,
        isolation_level=self.isolation_level
    )
    # Bug fix: the connection was never closed, and a failing query left it
    # open. With NullPool (see create_connection) nothing reclaims it, so
    # always release it in a finally block.
    try:
        for i, query in enumerate(self.configuration["order"]):
            # we skip the queries out of index
            if i < self.from_step_index or i > self.to_step_index:
                continue

            # we run the query and monitor the time it takes
            query_start = datetime.now()
            logger.info(f"Now running query {str(i + 1)}: '{query}'...")
            rendered_query = self.configuration["queries"][query]
            if self.verbose:
                logger.info(f'\n\n{rendered_query}\n')
            connection.execute(text(rendered_query))
            query_end = datetime.now()
            timing = str(query_end - query_start)
            logger.info(f"Query '{query}' successfully executed in {timing}.")
    finally:
        connection.close()
    end = datetime.now()
    self.ending_logs(start, end)
def create_connection(configuration: dict) -> Connection:
    """Create an auto-committing SQLAlchemy connection for the configuration.

    :param configuration: dict holding at least 'connection_url'; may also
        hold 'connection_query', a statement executed right after connecting.
    :return: an open SQLAlchemy ``Connection``.
    """
    # todo: isolation parameter consider possibility to set different isolation
    # level.
    # todo: a user may prefer to run a session that commits data only at the
    # very end of the ETL instead of an auto commit execution at engine level.
    isolation_level = "AUTOCOMMIT"

    # SQLite does not support AUTOCOMMIT as an isolation level, so we fall
    # back to SERIALIZABLE, its default mode.
    # Idiom fix: str.startswith instead of a fragile slice comparison.
    if configuration['connection_url'].startswith('sqlite'):
        isolation_level = 'SERIALIZABLE'

    engine = create_engine(configuration['connection_url'],
                           poolclass=NullPool,
                           isolation_level=isolation_level)
    connection = engine.connect()
    if configuration.get('connection_query') is not None:
        logger.info(f'Running connection query: '
                    f'{configuration["connection_query"]}')
        connection.execute(text(configuration['connection_query']))
    return connection
def starting_logs(self):
    """Log the banner, project/connection names, variables and query list."""
    logger.info(sqlbucket_logo)
    logger.info(
        f"Starting project {self.configuration['project_name'].upper()}"
        f" for connection {self.configuration['connection_name'].upper()}")
    logger.info(f"Variables: {self.configuration['context']}")

    # Collect only the queries inside the selected step window.
    selected = [
        query for position, query in enumerate(self.configuration["order"])
        if self.from_step_index <= position <= self.to_step_index
    ]
    logger.info("\n\nRunning the following queries:"
                "\n\t" + "\n\t".join(selected) + '\n')
def create_project(sqlbucket, name):
    """Scaffold a new project on the given SQLBucket instance and log it."""
    sqlbucket.create_project(name)
    logger.info('Project "{}" successfully created!'.format(name))
def ending_logs(self, start, end):
    """Log the completion message and the total run duration."""
    project = self.configuration['project_name']
    database = self.configuration['connection_name']
    logger.info(f"Project '{project}' "
                f"successfully completed for database "
                f"'{database}'")
    logger.info(f"Project completed in {end - start}")