def get_pipeline_status(cache_id, name):
    status = status_mgr(f'{FILE_PATH}/tmp')
    status.initialize()
    pipeline_status = status.get(f'./{cache_id}/{name}')

    start_time = None
    finish_time = None
    pipeline_id = None
    status = None
    success = None
    error_log = None
    if pipeline_status and pipeline_status.last_execution:
        last_execution = pipeline_status.last_execution
        start_time = last_execution.start_time
        pipeline_id = last_execution.pipeline_id
        finish_time = last_execution.finish_time
        success = last_execution.success
        error_log = last_execution.error_log
        status = pipeline_status.state()
    return {
        'start_time': start_time,
        'finish_time': finish_time,
        'pipeline_id': pipeline_id,
        'status': status,
        'error_log': error_log,
        'success': success,
    }
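# Usage sketch, not part of the original module: a hypothetical helper that polls
# get_pipeline_status() for a background run. It assumes 'finish_time' stays None
# until the last execution has completed.
import time

def wait_for_pipeline(cache_id, name, poll_interval=1.0, timeout=300):
    deadline = time.time() + timeout
    while time.time() < deadline:
        result = get_pipeline_status(cache_id, name)
        if result['finish_time'] is not None:
            # 'success' and 'error_log' come from the same execution record
            return result
        time.sleep(poll_interval)
    raise TimeoutError(f'pipeline {name} did not finish within {timeout}s')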
def specs(self, argument, root_dir, ignore_missing_deps=False):
    status_manager = status_mgr(root_dir)
    specs = []
    for spec in pipelines(ignore_missing_deps=ignore_missing_deps,
                          root_dir=root_dir,
                          status_manager=status_manager):
        if match_pipeline_id(argument, spec.pipeline_id):
            specs.append(spec)
    return specs
def pipelines(prefixes=None, ignore_missing_deps=False, root_dir='.', status_manager=None):
    specs: Iterator[PipelineSpec] = find_specs(root_dir)
    hasher = HashCalculator()
    if status_manager is None:
        status_manager = status_mgr()
    if prefixes is None:
        prefixes = ('', )
    while specs is not None:
        deferred = []
        found = False
        for spec_ in specs:
            spec: PipelineSpec = spec_
            if not any(spec.pipeline_id.startswith(prefix)
                       for prefix in prefixes):
                continue
            if (spec.pipeline_details is not None and
                    validate_pipeline(spec.pipeline_details,
                                      spec.validation_errors)):
                resolve_processors(spec)
                process_schedules(spec)
                try:
                    hasher.calculate_hash(spec, status_manager, ignore_missing_deps)
                    found = True
                except DependencyMissingException as e_:
                    # Defer specs whose dependencies haven't been hashed yet and
                    # retry them on the next pass
                    e: DependencyMissingException = e_
                    deferred.append((e.spec, e.missing))
                    continue
            yield spec
        if found and len(deferred) > 0:
            # Progress was made this pass, so retry only the deferred specs
            specs = iter((x[0] for x in deferred))
        else:
            # No further progress is possible; flag the remaining specs as invalid
            for spec, missing in deferred:
                spec.validation_errors.append(
                    SpecError('Missing dependency',
                              'Failed to find a dependency: {}'.format(missing)))
                yield spec
            specs = None
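# Consumption sketch (the prefix and root_dir below are hypothetical): iterate the
# generator and report specs that were flagged with validation errors, such as the
# 'Missing dependency' SpecError added when the deferred retry loop gives up.
for spec in pipelines(prefixes=('./my-project',), root_dir='.'):
    if spec.validation_errors:
        print(spec.pipeline_id, 'invalid:', [str(err) for err in spec.validation_errors])
    else:
        print(spec.pipeline_id, 'hash:', spec.cache_hash)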
# -*- coding: utf-8 -*-
import json
import threading
import time

from http.server import HTTPServer, BaseHTTPRequestHandler

from datapackage_pipelines.manager import execute_pipeline, run_pipelines
from datapackage_pipelines.specs.specs import pipelines
from datapackage_pipelines.utilities.execution_id import gen_execution_id
from datapackage_pipelines.status import status_mgr

called_hooks = []
progresses = 0
status = status_mgr()


class SaveHooks(BaseHTTPRequestHandler):

    def do_POST(self):
        global progresses
        content_len = int(self.headers.get('content-length', 0))
        post_body = self.rfile.read(content_len)
        hook = json.loads(post_body)
        if hook['event'] != 'progress':
            called_hooks.append(hook)
        else:
            progresses += 1
        self.send_response(200)
        self.end_headers()
        return
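# Test-harness sketch, not part of the original file: serve SaveHooks on a
# background thread so a pipeline run can POST its hook callbacks to it
# (the port is an arbitrary assumption).
def serve_hooks(port=8000):
    server = HTTPServer(('localhost', port), SaveHooks)
    thread = threading.Thread(target=server.serve_forever, daemon=True)
    thread.start()
    return server  # call server.shutdown() once the hooks have been collected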
def run_pipeline(self, cache_id=None, verbose=False, num_rows=-1, background=False):
    '''
    Starts a thread that runs the datapackage pipelines for this pipeline

    - On fail, return an error message
    - On success, return the datapackage.json contents and the resulting CSV
    - On both fail and success, return a status code and a unique id
      that can be passed back in to this function to use the cache
    - If run in the background, use the static functions get_pipeline_status
      and get_pipeline_data to access the results
    '''
    if not cache_id:
        cache_id = str(uuid.uuid1())

    # We have to check the cache_id value since it's
    # potentially being passed in from the outside
    pattern = re.compile(
        r'^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}'
        r'-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$'
    )
    if not pattern.match(cache_id):
        raise Exception('The unique ID that was provided was not in uuid format')

    ''' IMPORTANT '''
    # If the file structure between this file and the tmp folder
    # ever changes this code must change
    cache_dir = f'{ROOT_DIR}/{cache_id}'
    results_folder = f'{cache_dir}/results'

    # Create the directory and file
    if not os.path.exists(cache_dir):
        start = time.time()
        os.makedirs(cache_dir)
        BcodmoPipeline.log_slow_compute(start, cache_id, 'creating the directories')
    try:
        start = time.time()
        self.save_to_file(f'{cache_dir}/pipeline-spec.yaml.original', steps=self._steps)
        BcodmoPipeline.log_slow_compute(start, cache_id, 'creating the pipeline-spec.original.yaml file')

        # Create a new save step so we can access the data here
        new_save_step = {
            'run': 'dump_to_path',
            'parameters': {
                'out-path': results_folder,
                'temporal_format_property': 'outputFormat',
            }
        }
        new_steps = self._steps + [new_save_step]
        start = time.time()
        self.save_to_file(f'{cache_dir}/pipeline-spec.yaml', steps=new_steps)
        BcodmoPipeline.log_slow_compute(start, cache_id, 'creating the pipeline-spec.yaml file')

        # Remove the results folder
        start = time.time()
        shutil.rmtree(results_folder, ignore_errors=True)
        BcodmoPipeline.log_slow_compute(start, cache_id, 'removing the results folder')

        start = time.time()
        pipeline_id = f'./{cache_id}/{self.name}'
        status = status_mgr(ROOT_DIR)
        status.initialize()
        pipeline_status = status.get(pipeline_id)
        last_execution = pipeline_status.last_execution
        BcodmoPipeline.log_slow_compute(start, cache_id, 'checking the status before creating a thread')
        old_start_time = None
        if last_execution:
            old_start_time = last_execution.start_time

        start = time.time()
        x = threading.Thread(target=self.run_pipeline_thread, args=(cache_id, verbose,), daemon=True)
        BcodmoPipeline.log_slow_compute(start, cache_id, 'creating the thread')
        start = time.time()
        x.start()
        BcodmoPipeline.log_slow_compute(start, cache_id, 'starting the thread')

        if background:
            while True:
                # Loop until the next pipeline has started
                start = time.time()
                status = status_mgr(ROOT_DIR)
                status.initialize()
                pipeline_status = status.get(pipeline_id)
                last_execution = pipeline_status.last_execution
                BcodmoPipeline.log_slow_compute(start, cache_id, 'checking the status after creating the thread')
                if last_execution and last_execution.start_time != old_start_time:
                    break
                if x.is_alive():
                    time.sleep(0.1)
                else:
                    return {
                        'status_code': 1,
                        'cache_id': cache_id,
                        'yaml': self.get_yaml(),
                        'error_text': 'There was an unknown error in starting the pipeline',
                    }
            return {
                'status_code': 0,
                'cache_id': cache_id,
                'yaml': self.get_yaml(),
            }
        else:
            # Join the thread
            x.join()
            status_dict = BcodmoPipeline.get_pipeline_status(cache_id, self.name)
            if status_dict['success']:
                pipeline_data = BcodmoPipeline.get_pipeline_data(cache_id, num_rows)
                return {
                    'status_code': 0,
                    'cache_id': cache_id,
                    'yaml': self.get_yaml(),
                    'datapackage': pipeline_data['datapackage'],
                    'resources': pipeline_data['resources'],
                }
            else:
                return {
                    'status_code': 1,
                    'cache_id': cache_id,
                    'yaml': self.get_yaml(),
                    'error_text': status_dict['error_log'],
                }
    finally:
        try:
            start = time.time()
            # Clean up the directory, deleting old folders
            cur_time = time.time()
            dirs = [
                folder_name for folder_name in os.listdir(f'{FILE_PATH}/tmp')
                if not folder_name.startswith('.')
            ]
            for folder_name in dirs:
                folder = f'{FILE_PATH}/tmp/{folder_name}'
                st = os.stat(folder)
                modified_time = st.st_mtime
                age = cur_time - modified_time
                if age > DAY * 30:
                    shutil.rmtree(folder)
            BcodmoPipeline.log_slow_compute(start, cache_id, 'checking age status of folders after complete')
        except Exception as e:
            logger.info(f'There was an error trying to clean up folder: {str(e)}')
            logger.error(vars(e))
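# Usage sketch (the 'pipeline' instance below is hypothetical): start a background
# run, then follow the docstring's advice and poll the static helpers for the result.
result = pipeline.run_pipeline(background=True)
if result['status_code'] == 0:
    status_dict = BcodmoPipeline.get_pipeline_status(result['cache_id'], pipeline.name)
    if status_dict['success']:
        data = BcodmoPipeline.get_pipeline_data(result['cache_id'], -1)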
def run_pipeline_thread(self, cache_id, verbose):
    cache_dir = f'{ROOT_DIR}/{cache_id}'
    pipeline_spec_path = f'{cache_dir}/pipeline-spec.yaml'
    pipeline_id = f'./{cache_id}/{self.name}'

    dpp_command_path, processor_path = self._get_version_paths(self.version)
    os.environ['DPP_PROCESSOR_PATH'] = processor_path
    try:
        # Activate the correct virtual environment
        start = time.time()
        self._activate_virtualenv(self.version)
        BcodmoPipeline.log_slow_compute(start, cache_id, 'activating the virtualenv')

        # Set the verbose string if necessary
        if verbose:
            command_list = [dpp_command_path, 'run', '--verbose', pipeline_id]
        else:
            command_list = [dpp_command_path, 'run', pipeline_id]

        # Start the dpp process
        start = time.time()
        p = subprocess.Popen(
            command_list,
            stderr=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            cwd=ROOT_DIR,
        )
        BcodmoPipeline.log_slow_compute(start, cache_id, 'creating the process')
        sleep_timer = 1
        start = time.time()
        while p.poll() is None:
            BcodmoPipeline.log_slow_compute(start, cache_id, 'polling the process')
            time.sleep(1)
            if sleep_timer != 5:
                sleep_timer += 1

            # The pipeline-spec.yaml was deleted, need to end the process now
            if not os.path.exists(pipeline_spec_path):
                # Get the children of the dpp process (the dpp slave process)
                children = [child.pid for child in psutil.Process(p.pid).children()]

                # Terminate the parent process
                p.terminate()

                # Terminate all of the children processes
                for child in children:
                    os.kill(child, signal.SIGTERM)

                # Invalidate the pipeline in the dpp backend
                status = status_mgr(ROOT_DIR)
                status.initialize()
                pipeline_status = status.get(pipeline_id)
                if pipeline_status:
                    last_execution = pipeline_status.last_execution
                    if last_execution:
                        last_execution.finish_execution(
                            False,
                            {},
                            ['This pipeline was stopped by laminar'],
                        )

                # One last try
                if p.poll() is None:
                    p.kill()
                break
            start = time.time()
    finally:
        # Deactivate the virtualenv - not sure if this is necessary since it is a thread
        start = time.time()
        self._deactivate_virtualenv()
        BcodmoPipeline.log_slow_compute(start, cache_id, 'deactivating the virtualenv')

        # If the pipeline-spec.yaml file has been deleted since this thread started, the
        # whole cache_id folder should be deleted
        if not os.path.exists(pipeline_spec_path) and os.path.exists(cache_dir):
            shutil.rmtree(cache_dir)
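# Cancellation sketch, inferred from the watcher loop above rather than taken from
# the original code: deleting the generated pipeline-spec.yaml is what makes
# run_pipeline_thread terminate the dpp process and remove the cache directory.
def cancel_pipeline(cache_id):
    spec_path = f'{ROOT_DIR}/{cache_id}/pipeline-spec.yaml'
    if os.path.exists(spec_path):
        os.remove(spec_path)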
def register_all_pipelines(root_dir='.'):
    for spec in pipelines(root_dir=root_dir):
        ps = status_mgr().get(spec.pipeline_id)
        ps.init(spec.pipeline_details,
                spec.source_details,
                spec.validation_errors,
                spec.cache_hash)
        ps.save()
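# Usage sketch (the directory is hypothetical): seed the status backend with an
# entry for every discovered pipeline-spec.yaml before any run has taken place.
register_all_pipelines(root_dir='/path/to/pipelines')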