def run(self) -> str:
    """Run input command as a subprocess command."""
    out = ""
    try:
        out = os.popen(self.cmd + " 2>&1").read()

        if "Error" in out:
            raise RunnerException(
                self.task,
                self.run_id,
                17,
                self.error_msg + ("\n" if out != "" else "") + out,
            )

        RunnerLog(self.task, self.run_id, 17, self.success_msg)

        return out

    # pylint: disable=broad-except
    except BaseException as e:
        raise RunnerException(
            self.task,
            self.run_id,
            17,
            self.error_msg + ("\n" if out != "" else "") + "\n" + str(e),
        )
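# Side note on the " 2>&1" suffix used above: os.popen only captures stdout,
# so redirecting stderr into stdout is what lets error text reach .read().
# A minimal standalone demonstration, assuming a POSIX shell:
#
#     import os
#
#     out_only = os.popen("ls /nonexistent").read()     # '' - error goes to stderr
#     merged = os.popen("ls /nonexistent 2>&1").read()  # 'ls: cannot access ...'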
def web_url(self, url: str) -> str:
    """Get contents of a webpage."""
    try:
        page = requests.get(
            str(url), verify=app.config["HTTP_VERIFY_SSL"]
        )  # noqa: S501
        self.query = page.text
        self.db_type = (
            "mssql"
            if self.task.source_database_conn
            and self.task.source_database_conn.type_id == 2
            else None
        )
        if page.status_code != 200:
            raise ValueError(f"{url} returned bad status: {page.status_code}")

        # save query cache before cleanup.
        if self.run_id or self.refresh_cache:
            self.task.source_cache = self.query
            db.session.commit()

            if self.refresh_cache:
                RunnerLog(
                    self.task, self.run_id, 15, "Source cache manually refreshed."
                )

        # insert params
        return self.cleanup()

    # pylint: disable=broad-except
    except BaseException as e:
        # only use cache if we have a run id. otherwise failures are from code preview.
        if (
            self.run_id
            and self.task.enable_source_cache == 1
            and self.task.source_cache
        ):
            RunnerLog(
                self.task,
                self.run_id,
                15,
                f"Failed to get source from {url}. Using cached query.\nFull trace:\n{e}",
            )

            self.db_type = (
                "mssql"
                if self.task.source_database_conn
                and self.task.source_database_conn.type_id == 2
                else None
            )

            self.query = self.task.source_cache

            return self.cleanup()

        elif (
            self.run_id
            and self.task.enable_source_cache == 1
            and not self.task.source_cache
        ):
            raise RunnerException(
                self.task,
                self.run_id,
                15,
                f"Failed to get source from {url}. Cache enabled, but no cache available.\n{e}",
            )

        else:
            raise RunnerException(
                self.task, self.run_id, 15, f"Failed to get source from {url}.\n{e}"
            )
def __close(self) -> None:
    try:
        self.conn.close()
    except BaseException as e:
        raise RunnerException(
            self.task, self.run_id, 13, f"Failed to close connection.\n{e}"
        )
def __connect(self) -> paramiko.SSHClient:
    try:
        return connect(self.connection)
    except BaseException as e:
        raise RunnerException(
            self.task, self.run_id, 19, f"Failed to connect.\n{e}"
        )
def __connect(self) -> Tuple[Transport, SFTPClient]:
    try:
        return connect(self.connection)
    except ValueError as e:
        raise RunnerException(self.task, self.run_id, 9, str(e))
def __build_env(self) -> None:
    """Build a virtual environment.

    Runs command:

    .. code-block:: console

        virtualenv <path>

    """
    try:
        Cmd(
            task=self.task,
            run_id=self.run_id,
            cmd=f'virtualenv "{self.env_path}"',
            success_msg=f"Environment created.\n{self.env_path}",
            error_msg=f"Failed to create environment.\n{self.env_path}",
        ).shell()

    # pylint: disable=broad-except
    except BaseException as e:
        raise RunnerException(
            self.task,
            self.run_id,
            14,
            f"Failed to build environment.\n{self.base_path}\n{e}",
        )
def string_to_date(self) -> str:
    """Return a complete date string.

    The input string is split into date parts. Each part is individually
    converted to a string, and the string parts are then re-joined.
    """

    def get_repeating_part(parts: List[str]) -> Optional[str]:
        """Find and return the first duplicated part in the list."""
        my_counter: Counter = Counter()
        for part in parts:
            my_counter[part] += 1
            if my_counter[part] > 1:
                return part

        return None

    try:
        parameters = [
            x.group() for x in re.finditer(r"%[a-zA-Z]", self.date_string)
        ]

        parts = []

        # split into parts
        param = get_repeating_part(parameters)
        if param:
            date_string = self.date_string
            while param:
                # split the date string on the first two occurrences of the
                # repeated parameter.
                # ex: 'file_name', '%y%m_stuff', '%y%m_otherstuff_%y%m'
                split_parts = date_string.split(param, 2)

                # join the first two parts and append them to our part list.
                parts.append(param.join(split_parts[:2]))

                # update the date string with our remainder (3rd element in the list).
                date_string = param + split_parts[2]

                # update remaining parameters
                parameters = [
                    x.group() for x in re.finditer(r"%[a-zA-Z]", date_string)
                ]

                param = get_repeating_part(parameters)

            # need to add on the last part, if there are no more duplicate params.
            # pylint: disable=W0120
            else:
                parts.append(date_string)

        else:
            parts.append(self.date_string)

        self.date_string = ("").join([self.get_date_part(part) for part in parts])

        return self.date_string

    # pylint: disable=broad-except
    except BaseException as e:
        raise RunnerException(
            self.task, self.run_id, 17, f"Failed to parse date string.\n{e}"
        )
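# A minimal, self-contained sketch of the repeated-parameter split used in the
# loop above, runnable outside the class. "%y%m" appears twice, so the string
# is split so each part contains the code at most once before formatting:
#
#     date_string = "%y%m_archive_%y%m"
#     param = "%y%m"
#     split_parts = date_string.split(param, 2)  # ['', '_archive_', '']
#     first = param.join(split_parts[:2])        # '%y%m_archive_'
#     remainder = param + split_parts[2]         # '%y%m'
#
# parts -> ['%y%m_archive_', '%y%m']; each is then converted independently.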
def __close(self) -> None:
    try:
        self.session.close()
    except BaseException as e:
        raise RunnerException(
            self.task, self.run_id, 19, f"Failed to disconnect.\n{e}"
        )
def __clean_up(self) -> None:
    # remove the temp folder
    try:
        if Path(self.temp_path).exists():
            shutil.rmtree(self.temp_path)

    # pylint: disable=broad-except
    except BaseException as e:
        raise RunnerException(
            self.task, self.run_id, 8, f"Failed to clean up job.\n{e}"
        )
def save(self, overwrite: int, file_name: str) -> None:
    """Copy a local file to the FTP server."""
    self.__connect()

    try:
        self.conn.cwd(self.connection.path or "/")

    # pylint: disable=broad-except
    except BaseException as e:
        raise RunnerException(
            self.task, self.run_id, 13, f"Failed to change path.\n{e}"
        )

    if overwrite != 1:
        try:
            self.conn.size(file_name)
            RunnerLog(
                self.task,
                self.run_id,
                13,
                "File already exists and will not be loaded.",
            )
            self.__close()
            return

        # pylint: disable=broad-except
        except BaseException:
            # continue if the file does not exist.
            pass

    try:
        with open(str(self.dir.joinpath(file_name)), "rb") as file:
            self.conn.storbinary("STOR " + file_name, file)

        RunnerLog(self.task, self.run_id, 13, "File loaded to server.")

        self.__close()

    # pylint: disable=broad-except
    except BaseException as e:
        raise RunnerException(
            self.task, self.run_id, 13, f"Failed to save file on server.\n{e}"
        )
def source(self, query: Optional[str] = None) -> str:
    """Get task source code."""
    try:
        self.query = query or self.task.source_code or ""
        self.db_type = (
            "mssql"
            if self.task.source_database_conn
            and self.task.source_database_conn.type_id == 2
            else None
        )
        return self.cleanup()

    # pylint: disable=broad-except
    except BaseException as e:
        raise RunnerException(
            self.task, self.run_id, 15, f"Failed to clean source code.\n{e}"
        )
def read(self, file_name: str) -> List[IO[str]]: """Read file contents of network file path. Data is loaded into a temp file. Returns a path or raises an exception. """ try: # if there is a wildcard in the filename if "*" in file_name: RunnerLog(self.task, self.run_id, 10, "Searching for matching files...") # a smb file name can be a path, but listpath # will only list current folder. # we need to split the filename path and iter # through the folders that match. # get the path up to the *. base_dir = str(Path(file_name.split("*")[0]).parent) file_list = [] for _, _, walk_file_list in self._walk(base_dir): for this_file in walk_file_list: if fnmatch.fnmatch(this_file, file_name): file_list.append(this_file) RunnerLog( self.task, self.run_id, 10, "Found %d file%s.\n%s" % ( len(file_list), ("s" if len(file_list) != 1 else ""), "\n".join(file_list), ), ) # if a file was found, try to open. return [self.__load_file(file_name) for file_name in file_list] return [self.__load_file(file_name)] except BaseException as e: raise RunnerException( self.task, self.run_id, 10, f"File failed to load file from server.\n{e}", )
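# Quick illustration of the wildcard matching used above (here and in the FTP
# reader below): fnmatch compares a name against a shell-style pattern, with
# "*" spanning any run of characters, including path separators:
#
#     import fnmatch
#
#     fnmatch.fnmatch("reports/daily_2024.csv", "reports/daily_*.csv")  # True
#     fnmatch.fnmatch("reports/daily_2024.txt", "reports/daily_*.csv")  # False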
def read(self, file_name: str) -> List[IO[str]]: """Read a file from FTP server. Data is loaded into a temp file. Returns a path or raises an exception. """ try: self.conn.cwd(self.__clean_path(self.connection.path or "/")) if "*" in file_name: RunnerLog(self.task, self.run_id, 13, "Searching for matching files...") # get the path up to the * base_dir = str(Path(file_name.split("*")[0]).parent) file_list = [] for _, _, walk_file_list in self._walk(base_dir): for this_file in walk_file_list: if fnmatch.fnmatch(this_file, file_name): file_list.append(this_file) RunnerLog( self.task, self.run_id, 13, "Found %d file%s.\n%s" % ( len(file_list), ("s" if len(file_list) != 1 else ""), "\n".join(file_list), ), ) return [self.__load_file(file_name) for file_name in file_list] return [self.__load_file(file_name)] # pylint: disable=broad-except except BaseException as e: raise RunnerException( self.task, self.run_id, 13, f"File failed to load file from server.\n{e}", )
def __connect(self) -> SMBConnection:
    """Connect to SMB server.

    After making a connection we save it to redis. Next time we need a
    connection we can grab it from redis and attempt to use it. If it is
    no longer connected, we reconnect. Because we want to reuse existing
    connections, we do not close them...
    """
    try:
        return connect(
            str(self.username),
            str(self.password),
            str(self.server_name),
            str(self.server_ip),
        )

    except ValueError as e:
        raise RunnerException(self.task, self.run_id, 10, str(e))
def __run_script(self) -> None:
    try:
        # if data files exist, pass them as parameters.
        cmd = (
            f'"{self.env_path}/bin/python" "{self.job_path}/{self.script}" '
        ) + " ".join([f'"{x.name}"' for x in self.source_files])

        self.output = Cmd(
            task=self.task,
            run_id=self.run_id,
            cmd=cmd,
            success_msg="Script successfully run.",
            error_msg="Failed to run script: " + "\n" + cmd,
        ).shell()

    except BaseException as e:
        raise RunnerException(
            self.task,
            self.run_id,
            14,
            f"Failed to run script.\n{self.base_path}\n{e}",
        )
def save(self, overwrite: int, file_name: str) -> None:
    """Copy a local file to the SFTP server."""
    try:
        self.conn.chdir(self.__clean_path(self.connection.path or "/"))
    except BaseException as e:
        raise RunnerException(
            self.task, self.run_id, 9, f"Failed to change path.\n{e}"
        )

    if overwrite != 1:
        try:
            self.conn.stat(file_name)
            RunnerLog(
                self.task,
                self.run_id,
                9,
                "File already exists and will not be loaded.",
            )
            self.__close()
            return

        except BaseException:
            # continue if the file does not exist.
            pass

    try:
        # some sftp servers do not allow overwrites; the attempt will return
        # a permission error or similar. so we log whether the file already
        # exists to help with debugging.
        try:
            self.conn.stat(file_name)
            RunnerLog(
                self.task,
                self.run_id,
                9,
                "File already exists. Attempting to overwrite.",
            )

        # pylint: disable=broad-except
        except BaseException:
            # continue if the file does not exist.
            pass

        self.conn.put(str(self.dir.joinpath(file_name)), file_name, confirm=True)

        # file is now confirmed on server w/ confirm=True flag
        RunnerLog(
            self.task,
            self.run_id,
            9,
            f"{file_size(self.conn.stat(file_name).st_size or 0)} stored on server as {file_name}.",
        )

        self.__close()

    except BaseException as e:
        raise RunnerException(
            self.task, self.run_id, 9, f"Failed to save file on server.\n{e}"
        )
def gitlab(self, url: str) -> str:
    """Get source code from gitlab using authentication."""
    # pylint: disable=too-many-statements
    if ".git" in str(url):
        return ""

    if url:
        try:
            # convert the "raw" url into an api url
            branch = urllib.parse.quote(
                urllib.parse.unquote(
                    re.findall(r"\/(?:raw|blob)\/(.+?)\/", url)[0]
                ),
                safe="",
            )

            project = urllib.parse.quote(
                urllib.parse.unquote(
                    re.findall(r"\.(?:com|net|org)\/(.+?)\/-", url)[0]
                ),
                safe="",
            )

            file_path = urllib.parse.quote(
                urllib.parse.unquote(
                    re.findall(r"\/(?:raw|blob)\/.+?\/(.+?)$", url)[0]
                ),
                safe="",
            )

            api_url = "%sapi/v4/projects/%s/repository/files/%s/raw?ref=%s" % (
                app.config["GIT_URL"],
                project,
                file_path,
                branch,
            )

            headers = {
                "PRIVATE-TOKEN": app.config["GIT_TOKEN"],
                "Connection": "close",
            }
            page = requests.get(
                api_url, verify=app.config["GIT_VERIFY_SSL"], headers=headers
            )  # noqa: S501

            if page.status_code != 200:
                raise Exception("Failed to get code: " + page.text)

            if url.lower().endswith(".sql"):
                self.query = page.text
                self.db_type = (
                    "mssql"
                    if self.task.source_database_conn
                    and self.task.source_database_conn.type_id == 2
                    else None
                )

                # save query cache before cleanup.
                if self.run_id or self.refresh_cache:
                    self.task.source_cache = self.query
                    db.session.commit()

                    if self.refresh_cache:
                        RunnerLog(
                            self.task,
                            self.run_id,
                            15,
                            "Source cache manually refreshed.",
                        )

                # insert params
                return self.cleanup()

            return (
                page.text
                if not page.text.startswith("<!DOCTYPE")
                else "Visit URL to view code"
            )

        # pylint: disable=broad-except
        except BaseException as e:
            # only use cache if we have a run id. otherwise failures are from code preview.
            if (
                self.run_id
                and self.task.enable_source_cache == 1
                and self.task.source_cache
            ):
                RunnerLog(
                    self.task,
                    self.run_id,
                    15,
                    f"Failed to get source from {url}. Using cached query.\nFull trace:\n{e}",
                )

                self.db_type = (
                    "mssql"
                    if self.task.source_database_conn
                    and self.task.source_database_conn.type_id == 2
                    else None
                )

                self.query = self.task.source_cache

                return self.cleanup()

            elif (
                self.run_id
                and self.task.enable_source_cache == 1
                and not self.task.source_cache
            ):
                raise RunnerException(
                    self.task,
                    self.run_id,
                    15,
                    f"Failed to get source from {url}. Cache enabled, but no cache available.\n{e}",
                )

            else:
                raise RunnerException(
                    self.task,
                    self.run_id,
                    15,
                    f"Failed to get source from {url}.\n{e}",
                )

    raise RunnerException(
        self.task, self.run_id, 15, "No url specified to get source from."
    )
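# Worked example of the raw-URL to API-URL conversion above, with illustrative
# values (the unquote step is omitted here for brevity). A GitLab "blob" link is
# parsed into the pieces the v4 API raw-file endpoint needs:
#
#     import re
#     import urllib.parse
#
#     url = "https://gitlab.example.com/group/project/-/blob/main/jobs/extract.sql"
#     branch = urllib.parse.quote(
#         re.findall(r"\/(?:raw|blob)\/(.+?)\/", url)[0], safe=""
#     )  # 'main'
#     project = urllib.parse.quote(
#         re.findall(r"\.(?:com|net|org)\/(.+?)\/-", url)[0], safe=""
#     )  # 'group%2Fproject'
#     file_path = urllib.parse.quote(
#         re.findall(r"\/(?:raw|blob)\/.+?\/(.+?)$", url)[0], safe=""
#     )  # 'jobs%2Fextract.sql'
#
# giving .../api/v4/projects/group%2Fproject/repository/files/jobs%2Fextract.sql/raw?ref=main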
def __connect(self) -> FTP:
    try:
        return connect(self.connection)
    except ValueError as e:
        raise RunnerException(self.task, self.run_id, 13, str(e))
def run(self) -> None:
    """Run an SSH command.

    First, this makes a connection, then runs the command.

    Some code from https://stackoverflow.com/a/32758464 - thanks!

    Command output is captured and logged.
    """
    self.__connect()

    timeout = 600

    try:
        RunnerLog(
            self.task,
            self.run_id,
            19,
            "Starting command.",
        )

        # pylint: disable=W0612
        stdin, stdout, stderr = self.session.exec_command(  # noqa: S601
            self.command, timeout=timeout
        )

        channel = stdout.channel

        stdin.close()
        channel.shutdown_write()

        stderr_data = b""
        stdout_data = b""

        while (
            not channel.closed
            or channel.recv_ready()
            or channel.recv_stderr_ready()
        ):
            got_chunk = False
            readq, _, _ = select.select([stdout.channel], [], [], timeout)
            for chunk in readq:
                if chunk.recv_ready():
                    stdout_data += stdout.channel.recv(len(chunk.in_buffer))
                    got_chunk = True
                if chunk.recv_stderr_ready():
                    stderr_data += stderr.channel.recv_stderr(
                        len(chunk.in_stderr_buffer)
                    )
                    got_chunk = True

            if (
                not got_chunk
                and stdout.channel.exit_status_ready()
                and not stderr.channel.recv_stderr_ready()
                and not stdout.channel.recv_ready()
            ):
                # indicate that we're not going to read from this channel anymore
                stdout.channel.shutdown_read()

                # close the channel
                stdout.channel.close()

                # exit as remote side is finished and our buffers are empty
                break

            # timeout after a few minutes
            time.sleep(0.01)

        out = stdout_data.decode("utf-8") or "None"
        err = stderr_data.decode("utf-8") or "None"

        if stdout.channel.recv_exit_status() != 0 or stderr_data != b"":
            raise ValueError(f"Command stdout: {out}\nCommand stderr: {err}")

        RunnerLog(
            self.task,
            self.run_id,
            19,
            f"Command output:\n{out}",
        )

    except BaseException as e:
        raise RunnerException(
            self.task, self.run_id, 19, f"Failed to run command.\n{e}"
        )

    self.__close()
def shell(self) -> str:
    """Run input command as a shell command."""
    out = ""
    try:
        out_bytes = subprocess.check_output(
            self.cmd, stderr=subprocess.STDOUT, shell=True
        )
        out = out_bytes.decode("utf-8")

        if "Error" in out:
            raise RunnerException(
                self.task,
                self.run_id,
                17,
                self.error_msg
                + ("\n" if out != "" else "")
                + re.sub(
                    r"(?<=:)([^:]+?)(?=@)",
                    "*****",
                    out,
                    flags=re.IGNORECASE | re.MULTILINE,
                ),
            )

        RunnerLog(
            self.task,
            self.run_id,
            17,
            self.success_msg + (("\n" + out) if out != "" else ""),
        )

        return out

    except subprocess.CalledProcessError as e:
        out = e.output.decode("utf-8")
        raise RunnerException(
            self.task,
            self.run_id,
            17,
            self.error_msg
            + (("\n" + out) if out != "" else "")
            + "\n"
            + re.sub(
                r"(?<=:)([^:]+?)(?=@)",
                "*****",
                str(e),
                flags=re.IGNORECASE | re.MULTILINE,
            ),
        )

    except BaseException as e:
        raise RunnerException(
            self.task,
            self.run_id,
            17,
            "Command failed.\n"
            + (("\n" + out) if out != "" else "")
            + "\n"
            + re.sub(
                r"(?<=:)([^:]+?)(?=@)",
                "*****",
                str(e),
                flags=re.IGNORECASE | re.MULTILINE,
            ),
        )
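# The masking regex above hides credentials embedded in URLs of the form
# scheme://user:password@host before the text is logged. A standalone
# demonstration with illustrative values:
#
#     import re
#
#     masked = re.sub(
#         r"(?<=:)([^:]+?)(?=@)",
#         "*****",
#         "https://deploy:s3cret@git.example.com/repo.git",
#     )
#     # masked -> 'https://deploy:*****@git.example.com/repo.git'
#
# The "https:" colon is skipped because the following "//..." contains another
# colon before any "@"; only the password segment matches.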
def save(self) -> Tuple[str, str, str]:
    """Create and save the file.

    Returns [filename, filepath, filehash] of the final file.
    """
    if (
        self.task.destination_file_name is None
        or self.task.destination_file_name == ""
    ):
        RunnerLog(
            self.task,
            self.run_id,
            11,
            f"No filename specified, {Path(self.data_file.name).name} will be used.",
        )

    if (
        self.task.destination_file_name != ""
        and self.task.destination_file_name is not None
    ):
        # insert params
        self.file_name = self.params.insert_file_params(
            self.task.destination_file_name.strip()
        )

        # parse python dates
        self.file_name = DateParsing(
            self.task, self.run_id, self.file_name
        ).string_to_date()

    else:
        self.file_name = Path(self.data_file.name).name

    # 4 is "other"
    if self.task.destination_file_type_id != 4 and self.task.file_type is not None:
        self.file_name += "." + (self.task.file_type.ext or "csv")

    self.file_path = str(Path(self.base_path).joinpath(self.file_name))

    # if the source name matches the destination name, rename the source
    # and update the tmp file name.
    if self.data_file.name == self.file_path:
        data_file_as_path = Path(self.data_file.name)
        new_data_file_name = str(
            data_file_as_path.parent
            / (data_file_as_path.stem + "_tmp" + data_file_as_path.suffix)
        )
        os.rename(self.data_file.name, new_data_file_name)
        self.data_file.name = new_data_file_name  # type: ignore[misc]

    with open(self.data_file.name, "r", newline="") as data_file:
        reader = csv.reader(data_file)

        with open(self.file_path, mode="w") as myfile:
            # if csv (1) or text (2) and a delimiter is used
            if (
                self.task.destination_file_type_id == 1
                or self.task.destination_file_type_id == 2
                or self.task.destination_file_type_id == 4
            ) and (
                self.task.destination_ignore_delimiter is None
                or self.task.destination_ignore_delimiter != 1
            ):
                wrtr = (
                    csv.writer(
                        myfile,
                        delimiter=str(self.task.destination_file_delimiter)
                        .encode("utf-8")
                        .decode("unicode_escape"),
                        quoting=self.__quote_level(),
                    )
                    if self.task.destination_file_delimiter is not None
                    and len(self.task.destination_file_delimiter) > 0
                    and (
                        self.task.destination_file_type_id == 2
                        or self.task.destination_file_type_id == 4
                    )  # txt or other
                    else csv.writer(
                        myfile,
                        quoting=self.__quote_level(),
                    )
                )
                for row in reader:
                    new_row = [
                        (x.strip('"').strip("'") if isinstance(x, str) else x)
                        for x in row
                    ]

                    if (
                        self.task.destination_file_type_id == 1
                        or self.task.destination_file_type_id == 2
                        or self.task.destination_file_type_id == 4
                    ) and (
                        self.task.destination_file_line_terminator is not None
                        and self.task.destination_file_line_terminator != ""
                    ):
                        new_row.append(self.task.destination_file_line_terminator)

                    wrtr.writerow(new_row)

            # if xlsx (3)
            elif self.task.destination_file_type_id == 3:
                wrtr = csv.writer(
                    myfile,
                    dialect="excel",
                    quoting=self.__quote_level(),
                )
                for row in reader:
                    new_row = [
                        (x.strip('"').strip("'") if isinstance(x, str) else x)
                        for x in row
                    ]
                    wrtr.writerow(new_row)

            else:
                for line in data_file:
                    myfile.write(line)

    RunnerLog(
        self.task,
        self.run_id,
        11,
        f"File {self.file_name} created. Size: {file_size(Path(self.file_path).stat().st_size)}.\n{self.file_path}",
    )

    # encrypt file
    if self.task.file_gpg == 1:
        gpg = gnupg.GPG("/usr/local/bin/gpg")

        # import the key
        keychain = gpg.import_keys(
            em_decrypt(self.task.file_gpg_conn.key, app.config["PASS_KEY"])
        )

        # set it to trusted
        gpg.trust_keys(keychain.fingerprints, "TRUST_ULTIMATE")

        # encrypt file
        with open(self.file_path, "rb") as my_file:
            encrypt_status = gpg.encrypt_file(
                file=my_file,
                recipients=keychain.fingerprints,
                output=self.file_path + ".gpg",
            )

        # remove key
        gpg.delete_keys(keychain.fingerprints)

        # update global file name
        if not encrypt_status.ok:
            raise RunnerException(
                self.task,
                self.run_id,
                11,
                "File failed to encrypt.\n%s\n%s\n%s"
                % (self.file_path, encrypt_status.status, encrypt_status.stderr),
            )

        self.file_path = self.file_path + ".gpg"
        self.file_name = self.file_name + ".gpg"

        RunnerLog(
            self.task,
            self.run_id,
            11,
            "File encrypted.\n%s\n%s\n%s"
            % (self.file_path, encrypt_status.status, encrypt_status.stderr),
        )

    # get file hash... after encrypting
    with open(self.file_path, "rb") as my_file:
        while True:
            chunk = my_file.read(8192)
            if not chunk:
                break
            self.file_hash.update(chunk)

    RunnerLog(
        self.task, self.run_id, 11, f"File md5 hash: {self.file_hash.hexdigest()}"
    )

    # create zip
    if self.task.destination_create_zip == 1:
        self.zip_name = DateParsing(
            self.task, self.run_id, str(self.task.destination_zip_name)
        ).string_to_date()

        # parse params
        self.zip_name = self.params.insert_file_params(self.zip_name)

        self.zip_name = self.zip_name.replace(".zip", "") + ".zip"

        with zipfile.ZipFile(
            str(Path(self.base_path).joinpath(self.zip_name)), "w"
        ) as zip_file:
            zip_file.write(
                self.file_path,
                compress_type=zipfile.ZIP_DEFLATED,
                arcname=self.file_name,
            )

        # now we change all file stuff to our zip.
        self.file_name = self.zip_name
        self.file_path = str(Path(self.base_path).joinpath(self.zip_name))

        RunnerLog(
            self.task, self.run_id, 11, f"ZIP archive created.\n{self.file_path}"
        )

    return self.file_name, self.file_path, self.file_hash.hexdigest()
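# On the delimiter handling in save() above: the setting is stored as plain
# text, so an escape such as "\t" arrives as two characters (backslash + t).
# The encode/decode round trip turns it into the single control character
# csv.writer expects:
#
#     delimiter = "\\t".encode("utf-8").decode("unicode_escape")
#     # delimiter == "\t" (an actual tab, length 1)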
def __init__(self, task_id: int) -> None:
    """Set up class parameters.

    On sequence jobs, only the first enabled job in the sequence should be
    in the scheduler.
    """
    # create an id for the run instance and assign it to the tasks being run.
    my_hash = hashlib.sha256()
    my_hash.update(str(time.time() * 1000).encode("utf-8"))
    self.run_id = my_hash.hexdigest()[:10]

    task = Task.query.filter_by(id=task_id).first()

    self.source_files: List[IO[str]]
    self.output_files: List[str] = []

    print("starting task " + str(task.id))  # noqa: T001
    logging.info(
        "Runner: Starting task: %s, with run: %s",
        str(task.id),
        str(my_hash.hexdigest()[:10]),
    )

    # set status to running
    task.status_id = 1
    task.last_run_job_id = self.run_id
    task.last_run = datetime.datetime.now()
    db.session.commit()

    RunnerLog(task, self.run_id, 8, "Starting task!")

    self.task = task

    # if the monitor fails then cancel the task.
    try:
        system_monitor()
    except ValueError as message:
        raise RunnerException(self.task, self.run_id, 18, message)

    # create temp folder for output
    self.temp_path = Path(
        Path(__file__).parent.parent
        / "temp"
        / sanitize_filename(self.task.project.name)
        / sanitize_filename(self.task.name)
        / self.run_id
    )
    self.temp_path.mkdir(parents=True, exist_ok=True)

    RunnerLog(task, self.run_id, 8, "Loading parameters...")

    self.param_loader = ParamLoader(self.task, self.run_id)

    # load file / run query / etc. to get some sort of data or process something.
    self.query_output_size: Optional[int] = None
    self.source_loader = SourceCode(self.task, self.run_id, self.param_loader)
    self.source_files = []
    self.__get_source()

    # any data post-processing
    if self.task.processing_type_id is not None:
        self.__process()

    # store output
    self.__store_files()

    # send confirmation/error emails
    self.__send_email()

    # any cleanup process. remove file from local storage
    self.__clean_up()

    RunnerLog(self.task, self.run_id, 8, "Completed task!")

    # remove any retry tracking
    redis_client.delete(f"runner_{task_id}_attempt")

    task.status_id = 4
    task.est_duration = (datetime.datetime.now() - task.last_run).total_seconds()

    # if this is a sequence job, trigger the next job.
    if task.project.sequence_tasks == 1:
        task_id_list = [
            x.id
            for x in Task.query.filter_by(enabled=1)
            .filter_by(project_id=task.project_id)
            .order_by(Task.order.asc(), Task.name.asc())  # type: ignore[union-attr]
            .all()
        ]

        # potentially the task was disabled while running and removed from
        # the list. when that happens we should quit.
        if task.id in task_id_list:
            next_task_id = task_id_list[
                task_id_list.index(task.id) + 1 : task_id_list.index(task.id) + 2
            ]

            if next_task_id:
                # trigger next task
                RunnerLog(
                    self.task,
                    self.run_id,
                    8,
                    f"Triggering run of next sequence job: {next_task_id}.",
                )

                next_task = Task.query.filter_by(id=next_task_id[0]).first()

                RunnerLog(
                    next_task,
                    None,
                    8,
                    f"Run triggered by previous sequence job: {task.id}.",
                )

                requests.get(app.config["RUNNER_HOST"] + "/" + str(next_task_id[0]))

            else:
                RunnerLog(self.task, self.run_id, 8, "Sequence completed!")

    task.last_run_job_id = None
    task.last_run = datetime.datetime.now()
    db.session.commit()
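# The sequence slice above is a safe "next element" lookup: it returns a
# one-element list, or an empty list when the task is last in the sequence,
# so no index-out-of-range handling is needed. With illustrative ids:
#
#     task_id_list = [4, 7, 9]
#     i = task_id_list.index(7)
#     task_id_list[i + 1 : i + 2]  # [9]  -> next task to trigger
#     task_id_list[2 + 1 : 2 + 2]  # []   -> no next task, sequence complete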
def __send_email(self) -> None:
    logs = (
        TaskLog.query.filter_by(task_id=self.task.id, job_id=self.run_id)
        .order_by(TaskLog.status_date.desc())  # type: ignore[union-attr]
        .all()
    )

    error_logs = (
        TaskLog.query.filter_by(task_id=self.task.id, job_id=self.run_id, error=1)
        .order_by(TaskLog.status_date)
        .all()
    )

    date = str(datetime.datetime.now())

    # pylint: disable=broad-except
    try:
        template = env.get_template("email/email.html.j2")
    except BaseException as e:
        raise RunnerException(
            self.task, self.run_id, 8, f"Failed to get email template.\n{e}"
        )

    # success email
    if self.task.email_completion == 1 and (
        (len(error_logs) < 1 and self.task.email_error == 1)
        or self.task.email_error != 1
    ):
        RunnerLog(self.task, self.run_id, 8, "Sending completion email.")

        output: List[List[str]] = []
        empty = 0
        attachments: List[str] = []

        if self.task.email_completion_file == 1 and len(self.output_files) > 0:
            for output_file in self.output_files:
                if self.task.email_completion_file_embed == 1:
                    with open(output_file, newline="") as csvfile:
                        output.extend(list(csv.reader(csvfile)))

                # check the attachment file size if the task
                # should not send blank files
                if (
                    self.task.email_completion_dont_send_empty_file == 1
                    and output_file
                    # if query and data is blank, or other types and file is 0
                    and os.path.getsize(output_file) == 0
                ):
                    empty = 1

                attachments.append(output_file)

        if empty == 1:
            RunnerLog(
                self.task,
                self.run_id,
                8,
                "Not sending completion email, file is empty.",
            )
            return

        Smtp(
            task=self.task,
            run_id=self.run_id,
            recipients=self.task.email_completion_recipients,
            subject="Project: %s / Task: %s / Run: %s %s"
            % (
                self.task.project.name,
                self.task.name,
                self.run_id,
                date,
            ),
            message=template.render(
                task=self.task,
                success=1,
                date=date,
                logs=logs,
                output=output,
                host=app.config["WEB_HOST"],
            ),
            short_message=self.task.email_completion_message
            or f"Atlas Hub job {self.task} completed successfully.",
            attachments=attachments,
        )
def __store_files(self) -> None:
    if not self.source_files or len(self.source_files) == 0:
        return

    RunnerLog(
        self.task,
        self.run_id,
        8,
        "Storing output file%s..." % ("s" if len(self.source_files) != 1 else ""),
    )

    for file_counter, this_file in enumerate(self.source_files, 1):
        this_file_size = (
            self.query_output_size
            if self.query_output_size is not None
            else Path(this_file.name).stat().st_size
        )

        # get file name. if no name is specified in the task settings,
        # then use the temp name.
        try:
            file_name, file_path, file_hash = File(
                task=self.task,
                run_id=self.run_id,
                data_file=this_file,
                params=self.param_loader,
            ).save()

        except BaseException as e:
            raise RunnerException(
                self.task, self.run_id, 11, f"Failed to create data file.\n{e}"
            )

        self.output_files.append(file_path)

        if len(self.source_files) > 1:
            RunnerLog(
                self.task,
                self.run_id,
                8,
                f"Storing file {file_counter} of {len(self.source_files)}...",
            )

        # store
        # send to sftp
        if self.task.destination_sftp == 1 and self.task.destination_sftp_conn:
            if (
                self.task.destination_sftp_dont_send_empty_file == 1
                and this_file_size == 0
            ):
                RunnerLog(
                    self.task,
                    self.run_id,
                    8,
                    "Skipping SFTP, file is empty.",
                )
            else:
                Sftp(
                    task=self.task,
                    run_id=self.run_id,
                    connection=self.task.destination_sftp_conn,
                    directory=self.temp_path,
                ).save(
                    overwrite=self.task.destination_sftp_overwrite,
                    file_name=file_name,
                )

        # send to ftp
        if self.task.destination_ftp == 1 and self.task.destination_ftp_conn:
            if (
                self.task.destination_ftp_dont_send_empty_file == 1
                and this_file_size == 0
            ):
                RunnerLog(
                    self.task,
                    self.run_id,
                    8,
                    "Skipping FTP, file is empty.",
                )
            else:
                Ftp(
                    task=self.task,
                    run_id=self.run_id,
                    connection=self.task.destination_ftp_conn,
                    directory=self.temp_path,
                ).save(
                    overwrite=self.task.destination_ftp_overwrite,
                    file_name=file_name,
                )

        # save to smb
        if self.task.destination_smb == 1 and self.task.destination_smb_conn:
            if (
                self.task.destination_smb_dont_send_empty_file == 1
                and this_file_size == 0
            ):
                RunnerLog(
                    self.task,
                    self.run_id,
                    8,
                    "Skipping SMB, file is empty.",
                )
            else:
                Smb(
                    task=self.task,
                    run_id=self.run_id,
                    connection=self.task.destination_smb_conn,
                    directory=self.temp_path,
                ).save(
                    overwrite=self.task.destination_smb_overwrite,
                    file_name=file_name,
                )

        # save historical copy
        smb_path = Smb(
            task=self.task,
            run_id=self.run_id,
            connection=None,  # "default"
            directory=self.temp_path,
        ).save(overwrite=1, file_name=file_name)

        # log file details
        db.session.add(
            TaskFile(
                name=file_name,
                path=smb_path,
                task_id=self.task.id,
                job_id=self.run_id,
                file_hash=file_hash,
                size=file_size(str(os.path.getsize(file_path))),
            )
        )
        db.session.commit()
def __process(self) -> None:
    RunnerLog(self.task, self.run_id, 8, "Starting processing script...")

    # get processing script
    # 1 = smb
    # 2 = sftp
    # 3 = ftp
    # 4 = git url
    # 5 = other url
    # 6 = source code

    processing_script_name = self.temp_path / (self.run_id + ".py")

    my_file = ""
    if (
        self.task.processing_type_id == 1
        and self.task.processing_smb_id is not None
    ):
        file_name = self.param_loader.insert_file_params(
            self.task.processing_smb_file
        )
        file_name = DateParsing(
            task=self.task,
            run_id=self.run_id,
            date_string=file_name,
        ).string_to_date()

        my_file = Path(
            Smb(
                task=self.task,
                run_id=self.run_id,
                directory=self.temp_path,
                connection=self.task.processing_smb_conn,
            )
            .read(file_name)[0]
            .name
        ).read_text("utf8")

    elif (
        self.task.processing_type_id == 2
        and self.task.processing_sftp_id is not None
    ):
        file_name = self.param_loader.insert_file_params(
            self.task.processing_sftp_file
        )
        file_name = DateParsing(
            task=self.task,
            run_id=self.run_id,
            date_string=file_name,
        ).string_to_date()

        my_file = Path(
            Sftp(
                task=self.task,
                run_id=self.run_id,
                connection=self.task.processing_sftp_conn,
                directory=self.temp_path,
            )
            .read(file_name=file_name)[0]
            .name
        ).read_text("utf8")

    elif (
        self.task.processing_type_id == 3
        and self.task.processing_ftp_id is not None
    ):
        file_name = self.param_loader.insert_file_params(
            self.task.processing_ftp_file
        )
        file_name = DateParsing(
            task=self.task,
            run_id=self.run_id,
            date_string=file_name,
        ).string_to_date()

        my_file = Path(
            Ftp(
                task=self.task,
                run_id=self.run_id,
                connection=self.task.processing_ftp_conn,
                directory=self.temp_path,
            )
            .read(file_name=file_name)[0]
            .name
        ).read_text("utf8")

    elif self.task.processing_type_id == 4 and self.task.processing_git is not None:
        # if a dir is specified then download all files
        if (
            self.task.processing_command is not None
            and self.task.processing_command != ""
        ):
            try:
                url = (
                    re.sub(
                        r"(https?://)(.+?)",
                        r"\1<username>:<password>@\2",
                        self.task.processing_git,
                        flags=re.IGNORECASE,
                    )
                    .replace(
                        "<username>",
                        urllib.parse.quote(app.config["GIT_USERNAME"]),
                    )
                    .replace(
                        "<password>",
                        urllib.parse.quote(app.config["GIT_PASSWORD"]),
                    )
                )

                cmd = (
                    "$(which git) clone -q --depth 1 "
                    + '--recurse-submodules --shallow-submodules %s "%s"'
                    % (url, str(self.temp_path))
                )

                Cmd(
                    self.task,
                    self.run_id,
                    cmd,
                    "Repo cloned.",
                    "Failed to clone repo: %s" % (self.task.processing_git,),
                ).shell()

            # pylint: disable=broad-except
            except BaseException:
                raise RunnerException(
                    self.task, self.run_id, 8, "Processor failed to clone repo."
                )

        # otherwise get the py file
        else:
            my_file = self.source_loader.gitlab(self.task.processing_git)

    elif self.task.processing_type_id == 5 and self.task.processing_url is not None:
        if self.task.processing_command is not None:
            try:
                cmd = (
                    "$(which git) clone -q --depth 1 "
                    + '--recurse-submodules --shallow-submodules %s "%s"'
                    % (self.task.processing_url, str(self.temp_path))
                )

                Cmd(
                    task=self.task,
                    run_id=self.run_id,
                    cmd=cmd,
                    success_msg="Repo cloned.",
                    error_msg="Failed to clone repo: %s"
                    % (self.task.processing_url,),
                ).shell()

                processing_script_name = str(self.temp_path) + (
                    self.task.processing_command
                    if self.task.processing_command is not None
                    else ""
                )

            # pylint: disable=broad-except
            except BaseException:
                raise RunnerException(
                    self.task, self.run_id, 8, "Processor failed to clone repo."
                )

        else:
            my_file = self.source_loader.web_url(self.task.processing_url)

    elif (
        self.task.processing_type_id == 6
        and self.task.processing_code is not None
    ):
        my_file = self.task.processing_code

    elif self.task.processing_type_id > 0:
        raise RunnerException(
            self.task,
            self.run_id,
            8,
            "Processing error: not enough information to run a processing script.",
        )

    try:
        if my_file != "" and self.task.processing_type_id > 0:
            Path(processing_script_name).parent.mkdir(parents=True, exist_ok=True)
            with open(processing_script_name, "w") as text_file:
                text_file.write(my_file)

            RunnerLog(self.task, self.run_id, 8, "Processing script created.")

    # pylint: disable=broad-except
    except BaseException as e:
        raise RunnerException(
            self.task, self.run_id, 8, f"Processing script failure:\n{e}"
        )

    # run the processing script
    output = PyProcesser(
        task=self.task,
        run_id=self.run_id,
        directory=self.temp_path,
        source_files=self.source_files,
        script=self.task.processing_command or processing_script_name.name,
    ).run()

    # allow the processor to rename the file
    if output:
        RunnerLog(self.task, self.run_id, 8, f"Processing script output:\n{output}")
        self.data_files = output
def __get_source(self) -> None:
    if self.task.source_type_id == 1:  # sql
        external_db = self.task.source_database_conn
        try:
            RunnerLog(self.task, self.run_id, 8, "Loading query...")
            query = self.__get_query()
        except BaseException as e:
            raise RunnerException(
                self.task, self.run_id, 8, f"Failed to load query.\n{e}"
            )

        RunnerLog(
            self.task, self.run_id, 8, "Starting query run, waiting for results..."
        )

        if external_db.database_type.id == 1:  # postgres
            try:
                self.query_output_size, self.source_files = Postgres(
                    task=self.task,
                    run_id=self.run_id,
                    connection=em_decrypt(
                        external_db.connection_string, app.config["PASS_KEY"]
                    ),
                    timeout=external_db.timeout
                    or app.config["DEFAULT_SQL_TIMEOUT"],
                    directory=self.temp_path,
                ).run(query)

            except ValueError as message:
                raise RunnerException(self.task, self.run_id, 21, message)

            except BaseException as message:
                raise RunnerException(
                    self.task, self.run_id, 21, f"Failed to run query.\n{message}"
                )

        elif external_db.database_type.id == 2:  # mssql
            try:
                self.query_output_size, self.source_files = SqlServer(
                    task=self.task,
                    run_id=self.run_id,
                    connection=em_decrypt(
                        external_db.connection_string, app.config["PASS_KEY"]
                    ),
                    timeout=external_db.timeout
                    or app.config["DEFAULT_SQL_TIMEOUT"],
                    directory=self.temp_path,
                ).run(query)

            except ValueError as message:
                raise RunnerException(self.task, self.run_id, 20, message)

            except BaseException as message:
                raise RunnerException(
                    self.task, self.run_id, 20, f"Failed to run query.\n{message}"
                )

        RunnerLog(
            self.task,
            self.run_id,
            8,
            f"Query completed.\nData file {self.source_files[0].name} created. Data size: {file_size(str(Path(self.source_files[0].name).stat().st_size))}.",
        )

    elif self.task.source_type_id == 2:  # smb file
        file_name = self.param_loader.insert_file_params(self.task.source_smb_file)

        file_name = DateParsing(
            task=self.task,
            run_id=self.run_id,
            date_string=file_name,
        ).string_to_date()

        self.source_files = Smb(
            task=self.task,
            run_id=self.run_id,
            connection=self.task.source_smb_conn,
            directory=self.temp_path,
        ).read(file_name=file_name)

    elif self.task.source_type_id == 3:  # sftp file
        RunnerLog(self.task, self.run_id, 9, "Loading data from server...")

        file_name = self.param_loader.insert_file_params(self.task.source_sftp_file)

        file_name = DateParsing(
            task=self.task,
            run_id=self.run_id,
            date_string=file_name,
        ).string_to_date()

        self.source_files = Sftp(
            task=self.task,
            run_id=self.run_id,
            connection=self.task.source_sftp_conn,
            directory=self.temp_path,
        ).read(file_name=file_name)

    elif self.task.source_type_id == 4:  # ftp file
        RunnerLog(self.task, self.run_id, 13, "Loading data from server...")

        file_name = self.param_loader.insert_file_params(self.task.source_ftp_file)

        file_name = DateParsing(
            task=self.task,
            run_id=self.run_id,
            date_string=file_name,
        ).string_to_date()

        self.source_files = Ftp(
            task=self.task,
            run_id=self.run_id,
            connection=self.task.source_ftp_conn,
            directory=self.temp_path,
        ).read(file_name=file_name)

    elif self.task.source_type_id == 6:  # ssh command
        query = self.__get_query()

        Ssh(
            task=self.task,
            run_id=self.run_id,
            connection=self.task.source_ssh_conn,
            command=query,
        ).run()
def save(self, overwrite: int, file_name: str) -> str:  # type: ignore[return]
    """Load data into network file path, creating the location if it does not exist."""
    try:
        if self.connection is not None:
            dest_path = str(Path(self.connection.path or "").joinpath(file_name))
        else:
            dest_path = str(
                Path(
                    Path(sanitize_filename(self.task.project.name))
                    / sanitize_filename(self.task.name)
                    / sanitize_filename(self.task.last_run_job_id)
                    / file_name
                )
            )

        # the path must be created one folder at a time... the docs say the
        # full path will be created if it doesn't exist, but that doesn't
        # seem to be the case :)
        my_dir = dest_path.split("/")[:-1]

        path_builder = ""
        for my_path in my_dir:
            path_builder += my_path + "/"

            try:
                self.conn.listPath(self.share_name, path_builder)
            # pylint: disable=broad-except
            except OperationFailure:
                self.conn.createDirectory(self.share_name, path_builder)

        # pylint: disable=useless-else-on-loop
        else:
            if overwrite != 1:
                try:
                    # try to get security of the file. if it doesn't exist,
                    # we crash and then can create the file.
                    self.conn.getSecurity(self.share_name, dest_path)
                    RunnerLog(
                        self.task,
                        self.run_id,
                        10,
                        "File already exists and will not be loaded",
                    )
                    return dest_path

                # pylint: disable=broad-except
                except BaseException:
                    pass

            with open(
                str(self.dir.joinpath(file_name)), "rb", buffering=0
            ) as file_obj:
                uploaded_size = self.conn.storeFile(
                    self.share_name, dest_path, file_obj
                )

            server_name = (
                "backup" if self.connection is None else self.connection.server_name
            )

            RunnerLog(
                self.task,
                self.run_id,
                10,
                f"{file_size(uploaded_size)} uploaded to {server_name} server.",
            )

        return dest_path

    # pylint: disable=broad-except
    except BaseException as e:
        raise RunnerException(
            self.task, self.run_id, 10, f"Failed to save file on server.\n{e}"
        )
def __pip_install(self) -> None:
    r"""Get includes from script.

    get import (...)
        (?<=^import)\s+[^\.][^\s]+?\s+?$

    get import (...) as ...
        (?<=^import)\s+[^\.][^\s]+?(?=\s)

    get from (...) import (...)
        (?<=^from)\s+[^\.].+?(?=import)
    """
    try:
        imports = []

        # find all scripts in dir, but not in venv
        paths = list(
            set(Path(self.job_path).rglob("*.py"))
            - set(Path(self.env_path).rglob("*.py"))
        )

        for this_file in paths:
            with open(this_file, "r") as my_file:
                for line in my_file:
                    imports.extend(
                        re.findall(r"(?<=^import)\s+[^\.][^\s]+?\s+?$", line)
                    )
                    imports.extend(
                        re.findall(r"(?<=^from)\s+[^\.].+?(?=import)", line)
                    )
                    imports.extend(
                        re.findall(r"(?<=^import)\s+[^\.][^\s]+?(?=\s)", line)
                    )

        package_map = {"dateutil": "python-dateutil", "smb": "pysmb"}

        # clean list
        imports = [
            str(
                package_map.get(
                    x.strip().split(".")[0], x.strip().split(".")[0]
                )
            )
            for x in imports
            if x.strip() != ""
        ]

        # remove any relative imports
        names = [my_file.stem for my_file in paths]

        imports = list(set(imports) - set(names))

        # remove preinstalled packages from imports
        cmd = f'"{self.env_path}/bin/python" -c "help(\'modules\')"'

        built_in_packages = Cmd(
            task=self.task,
            run_id=self.run_id,
            cmd=cmd,
            success_msg="Python packages loaded.",
            error_msg="Failed to get preloaded packages: " + "\n" + cmd,
        ).shell()

        built_in_packages = built_in_packages.split(
            "Please wait a moment while I gather a list of all available modules..."
        )[1].split("Enter any module name to get more help.")[0]

        cleaned_built_in_packages = [
            this_out.strip()
            for this_out in list(
                chain.from_iterable(
                    [g.split(" ") for g in built_in_packages.split("\n") if g != ""]
                )
            )
            if this_out.strip() != ""
        ]

        # remove default python packages from list
        imports = [
            x.strip()
            for x in imports
            if x not in cleaned_built_in_packages and x.strip()
        ]

        # try to install
        if len(imports) > 0:
            cmd = (
                f'"{self.env_path}/bin/pip" install --disable-pip-version-check --quiet '
                + " ".join([str(x) for x in imports])
            )
            Cmd(
                task=self.task,
                run_id=self.run_id,
                cmd=cmd,
                success_msg="Imports successfully installed: "
                + ", ".join([str(x) for x in imports])
                + " with command: "
                + "\n"
                + cmd,
                error_msg="Failed to install imports with command: " + "\n" + cmd,
            ).shell()

    except BaseException as e:
        raise RunnerException(
            self.task,
            self.run_id,
            14,
            f"Failed to install packages.\n{self.base_path}\n{e}",
        )
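# Quick check of the import-scraping patterns above against sample lines
# (illustrative input; the later .strip().split(".")[0] cleanup maps each
# match to a bare package name):
#
#     import re
#
#     line = "from requests import Session\n"
#     re.findall(r"(?<=^from)\s+[^\.].+?(?=import)", line)    # [' requests ']
#
#     line = "import numpy as np\n"
#     re.findall(r"(?<=^import)\s+[^\.][^\s]+?(?=\s)", line)  # [' numpy']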