def run_parallel_combinations(self):
    """Compile every combination from the DB and schedule it on the jobs pool.

    Each combination may be compiled several times (when
    self.multiple_combinations > 1), each repetition getting a unique
    '<id>_<n>' combination id. Combinations that fail to compile are
    recorded as failures and skipped. After all jobs finish, repetition
    averages are computed when applicable.
    """
    logger.info('Start to work on parallel combinations')
    self.parallel_jobs_pool_executor.create_jobs_pool()
    # With a single repetition we keep the original id and skip averaging.
    repeat_combinations = self.multiple_combinations > 1
    for combination_json in self.db.combinations_iterator():
        base_combination = Combination.json_to_obj(combination_json)
        logger.info(LogPhrases.NEW_COMBINATION.format(base_combination.combination_id))
        for repetition in range(self.multiple_combinations):
            if repeat_combinations:
                # Work on a deep copy so every repetition has its own id.
                current = copy.deepcopy(base_combination)
                current.combination_id = f'{current.combination_id}_{repetition}'
                logger.info(f'#{repetition} repetition of {base_combination.combination_id} combination')
            else:
                current = base_combination
            folder_path = self.create_combination_folder(str(current.get_combination_id()))
            try:
                self.parallel_compilation_of_one_combination(current, folder_path)
                self.compile_combination_to_binary(folder_path)
            except Exception as ex:
                # Compilation failed: record the failure and move on.
                logger.info_error(f'Exception at {Compar.__name__}: {ex}')
                logger.debug_error(f'{traceback.format_exc()}')
                self.save_combination_as_failure(current.get_combination_id(), str(ex), folder_path)
                continue
            job = Job(folder_path, current, self.main_file_parameters)
            self.parallel_jobs_pool_executor.run_job_in_thread(self.run_and_save_job, job)
    self.parallel_jobs_pool_executor.wait_and_finish_pool()
    if repeat_combinations:
        self.calculate_multiple_combinations_average()
    logger.info('Finish to work on all the parallel combinations')
예제 #2
0
 def delete_combination(self, combination_id: str):
     """Delete the combination document with the given id from the dynamic DB.

     :param combination_id: the '_id' of the document to remove.
     :return: True on success, False when the delete raised.
     """
     try:
         collection = self.dynamic_db[self.collection_name]
         collection.delete_one({"_id": combination_id})
     except Exception as e:
         logger.info_error(f'Exception at {Database.__name__}: Could not delete combination: {e}')
         logger.debug_error(f'{traceback.format_exc()}')
         return False
     return True
예제 #3
0
 def insert_new_combination_results(self, combination_result: dict):
     """Insert one combination-results document into the dynamic DB.

     :param combination_result: the document to insert (must carry its own '_id').
     :return: True on success, False when the insert raised.
     """
     try:
         self.dynamic_db[self.collection_name].insert_one(combination_result)
     except Exception as e:
         logger.info_error(f'{Database.__name__}: cannot update dynamic DB: {e}')
         logger.debug_error(f'{traceback.format_exc()}')
         return False
     else:
         return True
예제 #4
0
 def get_combination_results(self, combination_id: str):
     """Fetch the results document for *combination_id* from the dynamic DB.

     :param combination_id: the '_id' of the results document.
     :return: the document, or None when it is missing or the lookup raised.
     """
     combination = None
     try:
         combination = self.dynamic_db[self.collection_name].find_one({"_id": combination_id})
     except Exception as e:
         logger.info_error(f'Exception at {Database.__name__}: Could not find results for combination: {e}')
         logger.debug_error(f'{traceback.format_exc()}')
     # FIX: the return was inside `finally`, which silently swallows any
     # in-flight exception (including KeyboardInterrupt raised during the
     # except block). A plain return after the try/except behaves the same
     # on both paths without that hazard.
     return combination
예제 #5
0
 def combinations_iterator(self):
     """Yield combinations from the static DB that do not have results yet.

     Combinations whose results already exist in the dynamic DB are
     skipped. Any DB error is logged and re-raised.
     """
     try:
         for combination in self.static_db[self.collection_name].find():
             # Skip combinations that were already executed.
             if self.combination_has_results(combination['_id']):
                 continue
             yield combination
     except Exception:
         # FIX: the message referred to 'get_next_combination', which is not
         # this method's name — corrected so the log points at the real source.
         logger.info_error(f"Exception at {Database.__name__}: combinations_iterator")
         raise
예제 #6
0
 def run_and_save_job(self, job_obj: Job):
     try:
         job_obj = self.execute_job(job_obj, self.serial_run_time)
     except Exception as ex:
         logger.info_error(f'Exception at {Compar.__name__}: {ex}')
         logger.debug_error(f'{traceback.format_exc()}')
     finally:
         if not self.save_combinations_folders:
             self.__delete_combination_folder(job_obj.get_directory_path())
예제 #7
0
 def __get_collection_name(project_name):
     """Build the per-user collection name for *project_name*.

     When the longer of the two DB namespaces ('<db>.<collection>') would
     exceed the namespace length limit, the collection name is truncated
     to fit and the change is logged.
     """
     collection_name = f"{getpass.getuser()}_{project_name}"
     namespaces = (
         f'{DatabaseConfig.STATIC_DB_NAME}.{collection_name}',
         f'{DatabaseConfig.DYNAMIC_DB_NAME}.{collection_name}',
     )
     longest_namespace = max(namespaces, key=len)
     if len(longest_namespace) > DatabaseConfig.NAMESPACE_LENGTH_LIMIT:
         # Truncate the full namespace, then keep only the collection part
         # (everything after the first dot).
         truncated_name = longest_namespace[:DatabaseConfig.NAMESPACE_LENGTH_LIMIT].split('.')[1]
         logger.info_error(f'DB namespace is too long! (max is {DatabaseConfig.NAMESPACE_LENGTH_LIMIT} characters)')
         logger.info_error(f'The name was changed from {collection_name} to {truncated_name}')
         collection_name = truncated_name
     return collection_name
예제 #8
0
 def __remove_bswap_function(file_path: str):
     """Strip the generated `__bswap_64` helper function from *file_path*.

     The file is rewritten in place; any error is logged and swallowed.

     :param file_path: path of the source file to clean.
     """
     # [^\}]* already crosses newlines on its own; DOTALL kept for clarity.
     bswap_regex = re.compile(r'static __uint64_t __bswap_64[^\}]*\}',
                              flags=re.DOTALL)
     try:
         with open(file_path, 'r+') as f:
             content = f.read()
             # BUG FIX: `.match()` only anchors at the very start of the
             # string, so the helper was never detected (and never removed)
             # unless it happened to be the first bytes of the file.
             # `.search()` finds it anywhere in the file.
             if bswap_regex.search(content):
                 content = bswap_regex.sub('', content)
                 f.seek(0)
                 f.write(content)
                 f.truncate()
     except Exception as e:
         logger.info_error(f'Exception at {Par4all.__name__}: {e}')
         logger.debug_error(f'{traceback.format_exc()}')
예제 #9
0
 def get_combination_from_static_db(self, combination_id: str):
     """Fetch a combination document from the static DB by id.

     The serial combination is not stored in the DB, so it is synthesized
     and returned directly.

     :param combination_id: the '_id' of the combination to fetch.
     :return: the document, or None when missing or when the lookup raised.
     """
     if combination_id == self.SERIAL_COMBINATION_ID:
         # The serial run has no stored combination; build it on the fly.
         return {
             "_id": Database.SERIAL_COMBINATION_ID,
             "compiler_name": Database.SERIAL_COMBINATION_ID,
             "parameters": {
                 "omp_rtl_params": [],
                 "omp_directives_params": [],
                 "compilation_params": []
             }
         }
     combination = None
     try:
         combination = self.static_db[self.collection_name].find_one({"_id": combination_id})
     except Exception as e:
         logger.info_error(f'Exception at {Database.__name__}: Could not find combination: {e}')
         logger.debug_error(f'{traceback.format_exc()}')
     # FIX: return moved out of `finally` — returning from a finally block
     # swallows any exception in flight (B012); behavior on the normal
     # paths is unchanged.
     return combination
예제 #10
0
 def __analyze_job_exit_code(self):
     """Query sacct for this job's exit code and raise if it is non-zero.

     Logs a warning (without raising) when sacct itself fails or returns
     no data rows for the job.
     """
     job_id = self.get_job().get_job_id()
     command = f"sacct -j {job_id} --format=exitcode"
     try:
         stdout, _, _ = run_subprocess(command)
     except subprocess.CalledProcessError as ex:
         logger.info_error(
             f'Warning: sacct command not responding (slurm is down?)\n{ex.output}\n{ex.stderr}'
         )
         return
     lines = stdout.replace("\r", "").split("\n")
     if len(lines) < 3:
         logger.info_error(
             f'Warning: sacct command - no results for job id: {job_id}.'
         )
         return
     # Row 2 carries the data (rows 0-1 are header and separator); the exit
     # code is formatted as "<code>:<signal>".
     code_str, signal_str = lines[2].replace(" ", "").split(":")
     exit_code, exit_signal = int(code_str), int(signal_str)
     if exit_code != 0 or exit_signal != 0:
         raise Exception(
             f"Job id: {job_id} ended with return code: {exit_code}:{exit_signal}."
         )
0
 def __run_user_script(self, script_name: str):
     """Run the user-provided hook script registered for this compiler.

     A JSON file in the assets directory maps compiler names to script
     paths. Nothing happens when the JSON file, this compiler's entry, or
     the script itself is missing.

     :param script_name: file name of the JSON mapping inside the assets dir.
     """
     json_script_file_path = os.path.join(GlobalsConfig.ASSETS_DIR_PATH,
                                          script_name)
     # Guard clauses: bail out quietly when any piece of the chain is absent.
     if not os.path.exists(json_script_file_path):
         return
     with open(json_script_file_path, 'r') as f:
         json_content = json.load(f)
     if self.NAME not in json_content:
         return
     user_script_path = json_content[self.NAME]
     if not os.path.exists(user_script_path):
         return
     script_command = f'{user_script_path} {self.get_input_file_directory()}'
     try:
         std_out, std_err, _ = run_subprocess(script_command)
         logger.debug(std_out)
         logger.debug_error(std_err)
     except subprocess.CalledProcessError as e:
         logger.info_error(
             f'{self.NAME}: user {script_name} script return with {e.returncode}: {e}'
         )
         logger.info(e.output)
         logger.info_error(e.stderr)
예제 #12
0
 def initialize_static_db(self):
     """Populate the static DB with every generated combination.

     Uses '$setOnInsert' upserts so combinations already present are left
     untouched.

     :return: the number of parallel combinations generated.
     :raises DatabaseError: when generation or the DB update fails.
     """
     # FIX: bind the name before the try — previously, if
     # generate_combinations() itself raised, the `del combinations` in
     # `finally` raised NameError and masked the real DatabaseError.
     combinations = None
     try:
         combinations = generate_combinations()
         num_of_parallel_combinations = len(combinations)
         for combination in combinations:
             curr_combination_id = Database.generate_combination_id(combination)
             self.static_db[self.collection_name].update_one(
                 filter={
                     '_id': curr_combination_id
                 },
                 update={
                     '$setOnInsert': combination
                 },
                 upsert=True
             )
         return num_of_parallel_combinations
     except Exception as e:
         logger.info_error(f'Exception at {Database.__name__}: cannot initialize static DB: {e}')
         logger.debug_error(f'{traceback.format_exc()}')
         raise DatabaseError()
     finally:
         # Drop the (potentially large) combinations list promptly.
         del combinations
예제 #13
0
 def trigger_test_output_test(test_file_path: str,
                              working_dir: str = "",
                              output_file_name: str = "",
                              check_for_existence: bool = False):
     """Run the combination output unit test through pytest.

     :param test_file_path: path of the pytest file holding the unit test.
     :param working_dir: optional --working_dir option forwarded to pytest.
     :param output_file_name: optional --output_file_name option forwarded to pytest.
     :param check_for_existence: when True, skip pass/fail logging.
     :return: the pytest exit code (ExitCode.INTERNAL_ERROR when pytest
         could not be run at all).
     """
     command_parts = ["pytest",
                      f"{test_file_path}::{CombinationValidator.UNIT_TEST_NAME}"]
     if working_dir:
         command_parts.extend(["--working_dir", working_dir])
     if output_file_name:
         command_parts.extend(["--output_file_name", output_file_name])
     try:
         stdout, stderr, exit_code = run_subprocess(" ".join(command_parts))
     except CalledProcessError as e:
         # A non-zero pytest exit still carries usable output — but only when
         # the return code is one pytest actually defines.
         if e.returncode is None or e.returncode not in list(ExitCode):
             logger.info_error(
                 f"{CombinationValidator.__name__}: "
                 f"pytest operation failed. could not run the test.\n{e}")
             return ExitCode.INTERNAL_ERROR
         stdout, stderr, exit_code = e.stdout, e.stderr, e.returncode
     except Exception as ex:
         logger.info_error(
             f"{CombinationValidator.__name__}: exception thrown during pytest operation."
             f" could not run the test.\n{ex}")
         return ExitCode.INTERNAL_ERROR
     if not check_for_existence:
         if exit_code == ExitCode.OK:
             logger.verbose(
                 f"{CombinationValidator.__name__}: test '{CombinationValidator.UNIT_TEST_NAME}' passed."
             )
         else:
             logger.info_error(
                 f"{CombinationValidator.__name__}: "
                 f"test '{CombinationValidator.UNIT_TEST_NAME}' failed.")
         logger.debug(
             f"{CombinationValidator.__name__}: {stdout}\n{stderr}.")
     return exit_code
예제 #14
0
    def __run_with_sbatch(self, user_slurm_parameters: list):
        """Submit this job's executable to Slurm via sbatch and block until it finishes.

        Retries the sbatch submission indefinitely while slurm is
        unresponsive, records the assigned job id, then polls squeue until
        the job leaves the queue.

        :param user_slurm_parameters: extra sbatch command-line flags,
            joined with spaces into the sbatch command.
        """
        logger.info(
            f'Start running {self.get_job().get_combination().get_combination_id()} combination'
        )
        slurm_parameters = user_slurm_parameters
        dir_path = self.get_job().get_directory_path()
        dir_name = os.path.basename(dir_path)
        # Executable and log file are both named after the combination folder.
        x_file = dir_name + MakefileConfig.EXE_FILE_EXTENSION
        sbatch_script_file = self.__make_sbatch_script_file(x_file)

        log_file = dir_name + GlobalsConfig.LOG_EXTENSION
        x_file_path = os.path.join(dir_path, x_file)
        log_file_path = os.path.join(dir_path, log_file)
        slurm_parameters = " ".join(slurm_parameters)
        cmd = f'sbatch {slurm_parameters} -o {log_file_path} {sbatch_script_file} {x_file_path}'
        if self.get_job().get_exec_file_args():
            # Forward the program's own arguments after the executable path.
            cmd += f' {" ".join([str(arg) for arg in self.get_job().get_exec_file_args()])} '
        stdout = ""
        batch_job_sent = False
        # Retry the submission until sbatch accepts it (slurm may be down).
        while not batch_job_sent:
            try:
                stdout, stderr, ret_code = run_subprocess(cmd)
                batch_job_sent = True
            except subprocess.CalledProcessError as ex:
                logger.info_error(
                    f'Exception at {ExecuteJob.__name__}: {ex}\n{ex.output}\n{ex.stderr}'
                )
                logger.debug_error(f'{traceback.format_exc()}')
                logger.info_error(
                    'sbatch command not responding (slurm is down?)')
                time.sleep(
                    ExecuteJobConfig.TRY_SLURM_RECOVERY_AGAIN_SECOND_TIME)
        result = stdout
        # set job id
        # NOTE(review): this concatenates every digit found in sbatch's output
        # (normally "Submitted batch job <id>"); it assumes the job id is the
        # only number printed — confirm against run_subprocess's output format.
        result = re.findall('[0-9]', str(result))
        result = ''.join(result)
        self.get_job().set_job_id(result)
        logger.info(
            LogPhrases.JOB_SENT_TO_SLURM.format(self.get_job().get_job_id()))
        # squeue's %t format prints the compact job state code (PD, R, CG, ...).
        cmd = f"squeue -j {self.get_job().get_job_id()} --format %t"
        last_status = ''
        is_first_time = True
        is_finish = False
        # Poll squeue until the job disappears from the queue (i.e. finished).
        while not is_finish:
            try:
                stdout, stderr = '', ''
                try:
                    stdout, stderr, ret_code = run_subprocess(cmd)
                except subprocess.CalledProcessError:  # check if squeue is not working or if the job finished
                    # squeue fails both when slurm is down and when the job id
                    # is gone; probe a bare 'squeue' to tell the two apart.
                    _, _, ret_code = run_subprocess('squeue')
                    if ret_code != 0:
                        raise
                    else:
                        is_finish = True
                current_status = ''
                try:
                    current_status = stdout.split('\n')[1]
                except IndexError:
                    # No status row: either the job just finished (handled
                    # above) or squeue returned unexpected output — warn,
                    # wait, and poll again.
                    if not is_finish:
                        logger.info_error(
                            f'Warning: check the squeue command output: {stdout} {stderr}'
                        )
                        time.sleep(ExecuteJobConfig.
                                   TRY_SLURM_RECOVERY_AGAIN_SECOND_TIME)
                        continue
                # Log only status transitions to keep the log compact.
                if current_status != last_status and current_status != '':
                    logger.info(
                        f'Job {self.get_job().get_job_id()} status is {current_status}'
                    )
                    last_status = current_status
                if not is_finish and not is_first_time:
                    # not is_first_time - some times the job go to COMPLETE immediately (fast running)
                    time.sleep(ExecuteJobConfig.CHECK_SQUEUE_SECOND_TIME)
                if is_first_time:
                    is_first_time = False
            except subprocess.CalledProcessError as ex:  # squeue command not responding (slurm is down?)
                logger.info_error(
                    f'Exception at {ExecuteJob.__name__}: {ex}\n{ex.stdout}\n{ex.stderr}'
                )
                logger.debug_error(f'{traceback.format_exc()}')
                logger.info_error(
                    'squeue command not responding (slurm is down?)')
                time.sleep(
                    ExecuteJobConfig.TRY_SLURM_RECOVERY_AGAIN_SECOND_TIME)
        logger.info(
            LogPhrases.JOB_IS_COMPLETE.format(self.get_job().get_job_id()))
예제 #15
0
        slurm_parameters=args.slurm_parameters,
        extra_files=args.extra_files,
        main_file_rel_path=args.main_file_rel_path,
        time_limit=args.time_limit,
        slurm_partition=args.slurm_partition,
        test_file_path=args.test_file_path,
        mode=args.mode,
        code_with_markers=args.code_with_markers,
        clear_db=args.clear_db,
        multiple_combinations=args.multiple_combinations,
        log_level=args.log_level)
    try:
        compar_obj.fragment_and_add_timers()
        compar_obj.run_serial()
        compar_obj.run_parallel_combinations()
        compar_obj.generate_optimal_code()
        logger.info('Finish Compar execution')
    except Exception:
        if args.clear_db:
            compar_obj.clear_related_collections()
        raise


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logger.info_error(f'Exception at Compar Program: {e}')
        logger.debug_error(traceback.format_exc())
        # FIX: use `raise SystemExit(1)` instead of `exit(1)` — exit() is an
        # interactive helper injected by the `site` module and is not
        # guaranteed to exist (python -S, frozen apps); SystemExit always
        # yields the non-zero exit status.
        raise SystemExit(1)