def _insert_buildpairs(repo: str, buildpairs: List):
    bugswarmapi = DatabaseAPI(token=DATABASE_PIPELINE_TOKEN)
    if not bugswarmapi.bulk_insert_mined_build_pairs(buildpairs):
        log.error('Could not bulk insert mined build pairs for {}. Exiting.'.format(repo))
        sys.exit(1)
def is_test(files_changed):
    """
    Checks whether the changed files classify the failure as a test error.

    :param files_changed: the modified filename list between two commits
    :return: confidence, files_test, files_not_test
    """
    count = 0
    files_test = list()
    files_not_test = list()
    if len(files_changed) < 1:
        log.error("No files changed")
        return None, list(), list()
    for filename in files_changed:
        if re.search(r'tests?\/', filename):
            count += 1
            files_test.append(filename)
        elif re.search(r'test', filename):
            count += 0.5
            files_test.append(filename)
        else:
            files_not_test.append(filename)
    files_actually_changed = len(files_changed)
    if files_actually_changed > 0:
        confidence = count / files_actually_changed
    else:
        confidence = 0.0
    return confidence, files_test, files_not_test
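# Hypothetical usage sketch (not part of the original source): how a caller might use is_test().
# The filenames below are illustrative; in the pipeline, files_changed would come from the diff
# between the failed and passed commits.
#
#   changed = ['src/test/java/FooTest.java', 'src/main/java/Foo.java']
#   confidence, test_files, other_files = is_test(changed)
#   # confidence == 0.5; only 'src/test/java/FooTest.java' lands in test_files.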
def _base_pre_run(self):
    if self.job_center.total_jobs < 1:
        log.info('No jobs to reproduce. Exiting.')
        return
    # Set up the required directories.
    os.makedirs(self.config.orig_logs_dir, exist_ok=True)
    os.makedirs(self.config.output_dir, exist_ok=True)
    self.utils.directories_setup()
    if os.path.isfile(self.utils.get_error_reason_file_path()):
        self.error_reasons = read_json(self.utils.get_error_reason_file_path())
        self.error_reasons = self.manager.dict(self.error_reasons)
    # Check that commands to Travis work.
    if not Utils.is_travis_installed():
        log.error(colored('Commands to Travis are failing unexpectedly. Try restarting your shell and ensure your '
                          'environment is provisioned correctly.', 'red'))
        raise Exception('Unexpected state: Commands to Travis are failing unexpectedly.')
    # Read travis_images.json.
    try:
        self.travis_images = read_json(self.config.travis_images_json)
    except FileNotFoundError:
        log.error(colored(self.config.travis_images_json + ' not found. Exiting.', 'red'))
        raise
def get_reproducer_version() -> str:
    stdout, stderr, returncode = ShellWrapper.run_commands('git rev-parse HEAD',
                                                           stdout=subprocess.PIPE, shell=True)
    if returncode:
        msg = 'Error getting reproducer version: {}'.format(stderr)
        log.error(msg)
        raise IOError(msg)
    return stdout
def docker_pull(image_tag):
    assert image_tag
    assert isinstance(image_tag, str)

    # Exit early if the image already exists locally.
    exists, image_location = _image_exists_locally(image_tag)
    if exists:
        return True, image_location

    image_location = _image_location(image_tag)
    command = 'sudo docker pull {}'.format(image_location)
    _, _, returncode = ShellWrapper.run_commands(command, shell=True)
    if returncode != 0:
        # Image is not cached. Attempt to pull from bugswarm/images.
        image_location = '{}:{}'.format(DOCKER_HUB_REPO, image_tag)
        command = 'sudo docker pull {}'.format(image_location)
        _, _, returncode = ShellWrapper.run_commands(command, shell=True)
        if returncode != 0:
            # Image is not in bugswarm/images either.
            log.error('Could not download the image', image_location)
        else:
            log.info('Downloaded the image', image_location + '.')
    else:
        log.info('Downloaded the image', image_location + '.')
    return returncode == 0, image_location
def _modify_script(utils: Utils, jobpair: JobPair):
    for j in jobpair.jobs:
        script_path = join(utils.get_jobpair_dir(jobpair.jobs[0]), j.job_id + '.sh')
        if not isfile(script_path):
            log.error('Script file not found at', script_path)
            return 1
        lines = []
        with open(script_path) as f:
            found_cd_line = False
            for l in f:
                if r'travis_cmd cd\ ' + j.repo in l:
                    found_cd_line = True
                    lines.append(_replace_repo_path(j, l))
                elif 'export TRAVIS_BUILD_DIR=$HOME/build/' in l:
                    lines.append(_replace_repo_path(j, l))
                else:
                    lines.append(l)
            if not found_cd_line:
                raise ReproduceError('found_cd_line is False for {}'.format(j.job_id))
        with open(join(utils.get_jobpair_dir(jobpair.jobs[0]), j.job_id + '-p.sh'), 'w') as f:
            for l in lines:
                f.write(l)
def _exceeded_api_quota(self) -> Tuple[bool, Optional[int]]:
    """
    :return: A 2-tuple. (True, number of seconds until the quota resets) if the API quota has been exceeded.
             (False, None) otherwise.
    :raises Exception: When an exception is raised by the request.
    """
    quota_url = 'https://api.github.com/rate_limit'
    log.info('Checking GitHub API quota.')
    response = self._session.get(quota_url)
    try:
        response.raise_for_status()
        result = response.json()
        if 'resources' in result:
            remaining = result['resources']['core']['remaining']
            if remaining <= 0:
                # Time when the quota resets, in UTC epoch seconds.
                reset_at = result['resources']['core']['reset']
                log.warning('GitHub API quota exceeded and will reset at UTC {}.'.format(reset_at))
                now = int(time.time())
                # Add a few seconds to be sure that we sleep long enough.
                sleep_duration = (reset_at - now) + 10
                return True, sleep_duration
    except Exception as e:
        log.error('Exception while checking API quota:', e)
        raise
    return False, None
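# Hypothetical usage sketch (not in the original source): how a mining loop might consume the
# (exceeded, sleep_duration) tuple before issuing further GitHub API requests. `self` stands for
# whatever object owns _exceeded_api_quota(); the names are illustrative only.
#
#   exceeded, sleep_duration = self._exceeded_api_quota()
#   if exceeded:
#       log.info('Sleeping for {} seconds until the GitHub API quota resets.'.format(sleep_duration))
#       time.sleep(sleep_duration)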
def load_buildpairs(dir_of_jsons: str, repo: str):
    """
    :param dir_of_jsons: A directory containing JSON files of build pairs.
    :param repo: The repo slug.
    :return: A list of build pairs, or None if the repo's JSON file is missing or contains invalid JSON.
    """
    all_buildpairs = []
    task_name = repo.replace('/', '-')
    filename = task_name + '.json'
    try:
        data = read_json(os.path.join(dir_of_jsons, filename))
    except json.decoder.JSONDecodeError:
        log.error('{} contains invalid JSON.'.format(filename))
        return None
    except FileNotFoundError:
        log.error('{} is not found.'.format(filename))
        return None
    all_buildpairs.extend(data)
    if not data:
        log.warning('{} does not contain any build pairs.'.format(filename))
    log.info('Read {} build pairs from {}.'.format(len(all_buildpairs), filename))
    return all_buildpairs
def _load_jobs_from_pairs_for_repo(self, input_file):
    """
    Read the input file, which should contain mined pairs from the database.
    """
    try:
        buildpairs = read_json(input_file)
    except json.JSONDecodeError:
        log.error('Error reading input file {} in PairCenter. Exiting.'.format(input_file))
        raise
    for bp in buildpairs:
        # For debug purposes: uncomment these lines to skip pairs with pr_num == -1 (non-PR pairs).
        # if bp['pr_num'] == -1:
        #     continue
        repo = bp['repo']
        if repo not in self.repos:
            self.repos[repo] = Repo(repo)
            self.uninitialized_repos.put(repo)
        self._append_buildpair_and_jobpair_to_repo(repo, bp)
    self._init_names()
    self.set_skip_of_job_pairs()
    self._init_queue_of_repos()
    # Calculate build pair and job pair totals after loading from the file.
    self._calc_num_total_buildpairs()
    self._calc_num_total_jobpairs()
    self._calc_num_total_jobs()
    log.debug('pair_center.total_buildpairs =', self.total_buildpairs,
              'pair_center.total_jobpairs =', self.total_jobpairs,
              'pair_center.total_jobs =', self.total_jobs)
def _get(self, address, **kwargs):
    sleep_seconds = _SLEEP_SECONDS
    attempts = 0
    while True:
        response = self._session.get(address, params=kwargs)
        code = response.status_code
        if code == 200:
            return response.json()
        elif code == 404:
            log.error('Get request for {} returned 404 Not Found.'.format(address))
            response.raise_for_status()
        elif code == 429:
            if attempts < 1 or not _TOKENS:
                log.warning('The Travis API returned status code 429 Too Many Requests. '
                            'Retrying after sleeping for {} seconds.'.format(sleep_seconds))
                time.sleep(sleep_seconds)
                attempts += 1
            else:
                # After the first retry, switch to the next token in the list.
                # deque.pop() removes an element from the right, so we appendleft() to rotate the deque.
                self._session.headers['Authorization'] = 'token {}'.format(_TOKENS[0])
                _TOKENS.appendleft(_TOKENS.pop())
        else:
            log.error('Get request for {} returned {}.'.format(address, code))
            raise requests.exceptions.ConnectionError('{} download failed. Error code is {}.'.format(address, code))
def process_logs(root, file_list):
    """
    Returns the contents of the failed log as a list.

    :param root: The directory containing the logs.
    :param file_list: [failed log, passed log]
    :return: A list of log lines, or None if classification should be skipped.
    """
    file_list.sort()
    try:
        with open(os.path.join(root, file_list[1])) as passed_file:
            passed = passed_file.readlines()
            passed = list(filter(None, [line.strip() for line in passed]))
        with open(os.path.join(root, file_list[0])) as failed_file:
            failed = failed_file.readlines()
            failed = list(filter(None, [line.strip() for line in failed]))
    except OSError as e:
        log.error(e)
        return None
    if "Done. Your build exited with 0." not in passed[-1]:
        if "Done. Your build exited with 0." not in failed[-1]:
            # Error condition: neither log finished successfully, so skip classification.
            return None
        else:
            # The passed and failed logs were interchanged.
            return passed
    return failed
def build_and_run(self, job_id, gen_files_dir, repo_path, repo_name, base_image_name, repo):
    log.info('Building and running job with ID {}.'.format(job_id))
    dockerfile_path = os.path.join(gen_files_dir, job_id + "-dockerfile")
    # Determine the image name.
    image_name = 'binswarm/cbuilds:{}'.format(job_id + "-" + repo_name)
    image_name = image_name.lower()
    # Actually build the image now.
    image = self.build_image(path=gen_files_dir, dockerfile=dockerfile_path, full_image_name=image_name)
    with open("image.txt", "w") as f:
        f.write(image_name)
    # Spawn the container.
    container_name = job_id
    retry_count = 0
    while True:
        try:
            reproduced_log_destination = os.path.join(gen_files_dir, "docker-log.txt")
            self.spawn_container(image_name, container_name, reproduced_log_destination, repo_path,
                                 base_image_name, repo)
        except requests.exceptions.ReadTimeout as e:
            log.error('Error while attempting to spawn a container:', e)
            log.info('Retrying to spawn container.')
            retry_count += 1
        else:
            break
def get_offending_tests(self):
    for line in self.tests_failed_lines:
        try:
            test_name = JavaAntAnalyzer.extract_test_name(line)
            self.tests_failed.append(test_name)
        except Exception:
            log.error('Encountered an error while extracting test name.')
def check_package_outdated(package: str):
    """
    Checks whether the installed version of a package is older than the latest non-prerelease version available on
    PyPI. If so, prints a message that asks the user to consider upgrading.

    The package must be available on PyPI and must have always used a version numbering scheme that can be parsed by
    distutils.version.StrictVersion.

    This function is meant to be used for packages in the 'bugswarm' namespace, which meet the above requirements,
    and therefore is not guaranteed to work for packages outside that namespace.

    :param package: The name of the package to check.
    """
    if not isinstance(package, str):
        raise TypeError
    try:
        installed = _get_installed_version(package)
        latest = _get_latest_version(package)
        if latest > installed:
            # A newer, non-prerelease version is available.
            log.info('You are using {} version {}, but version {} is available.'.format(package, installed, latest))
            log.info("You should consider upgrading via the 'pip3 install --upgrade {}' command.".format(package))
    except Exception as e:
        log.error('Encountered an error while checking if {} can be updated: {}'.format(package, e))
def is_code(files_changed, all_files_changed):
    """
    Checks whether the changed files classify the failure as a code error.

    :param files_changed: the modified filename list between two commits
    :param all_files_changed: all files that have been modified
    :return: confidence, files_code, files_not_code
    """
    count = 0
    files_code = list()
    files_not_code = list()
    if len(files_changed) < 1:
        log.error("No files changed")
        return None, list(), list()
    for filename in files_changed:
        # The path cannot contain 'test' or 'tests'.
        if not re.search(r'test', filename):
            # A '.java' file also needs 'src' in its path to count as code.
            if re.search(r'\.java$', filename) and re.search(r'src', filename):
                count += 1
                files_code.append(filename)
            # A Python file ('.py' or '.pyx') counts as code.
            elif re.search(r'\.pyx?$', filename):
                count += 1
                files_code.append(filename)
            else:
                files_not_code.append(filename)
    files_actually_changed = len(all_files_changed)
    if files_actually_changed > 0:
        confidence = count / files_actually_changed
    else:
        confidence = 0.0
    return confidence, files_code, files_not_code
def main(args=dict()):
    log.config_logging(getattr(logging, 'INFO', None))
    # Log the current version of this BugSwarm component.
    log.info(get_current_component_version_message('Classifier'))
    repo_list, pipeline = _validate_input(args)
    filter_output_dir = os.path.join(os.path.dirname(__file__), '../pair-filter/output-json/')
    if pipeline and not os.path.exists(filter_output_dir):
        log.error('pipeline == true, but output_file_path ({}) does not exist. '
                  'Exiting PairClassifier.'.format(filter_output_dir))
        return
    for repo in repo_list:
        if pipeline:
            task_name = repo.replace('/', '-')
            json_path = os.path.join(filter_output_dir, task_name + '.json')
            if not os.path.exists(json_path):
                log.error(json_path, 'does not exist. Repo', repo, 'will be skipped.')
                continue
            # Get the input JSON from the file generated by pair-filter.
            dir_of_jsons = generate_build_pair_json(repo, json_path)
        else:
            # Get the input JSON from the database.
            dir_of_jsons = generate_build_pair_json(repo)
        PairClassifier.run(repo, dir_of_jsons, args)
def _validate_input(argv):
    shortopts = 'i:c'
    longopts = 'csv'.split()
    input_file = None
    csv_mode = False
    try:
        optlist, args = getopt.getopt(argv[1:], shortopts, longopts)
    except getopt.GetoptError:
        log.error('Could not parse arguments. Exiting.')
        _print_usage()
        sys.exit(2)
    for opt, arg in optlist:
        if opt in ['-i']:
            input_file = arg
        if opt in ['-c', '--csv']:
            csv_mode = True
    if not input_file:
        _print_usage()
        sys.exit(1)
    if not os.path.isfile(input_file):
        log.error('The input_file argument ({}) is not a file or does not exist. Exiting.'.format(input_file))
        sys.exit(1)
    return input_file, csv_mode
def run(self):
    """
    Start processing image tags. Overriding is forbidden.
    """
    self.pre_run()
    with ThreadPoolExecutor(max_workers=self._num_workers) as executor:
        future_to_image_tag = {executor.submit(self._thread_main, image_tag): image_tag
                               for image_tag in self._image_tags}
        attempted = 0
        succeeded = 0
        errored = 0
        for future in as_completed(future_to_image_tag):
            attempted += 1
            try:
                data = future.result()
                if data:
                    succeeded += 1
                else:
                    errored += 1
            except Exception as e:
                log.error(e)
                errored += 1
    self.post_run()
def run(self):
    """
    The entry point for reproducing jobs. Calls post_run() after all items are processed.

    Subclasses must not override this method.
    """
    self._base_pre_run()
    self.pre_run()
    try:
        while self.job_center.get_num_remaining_items(self.package_mode):
            log.info('Ready to initialize threads.')
            if not self.utils.check_disk_space_available():
                self.utils.clean_disk_usage(self)
                if not self.utils.check_disk_space_available():
                    msg = 'Still inadequate disk space after removing temporary Reproducer files. Exiting.'
                    log.error(msg)
                    raise OSError(msg)
            if not self.utils.check_docker_disk_space_available(self.docker_storage_path):
                self.utils.clean_docker_disk_usage(self.docker)
                if not self.utils.check_docker_disk_space_available(self.docker_storage_path):
                    msg = 'Still inadequate disk space after removing inactive Docker Images. Exiting.'
                    log.error(msg)
                    raise OSError(msg)
            self._init_threads()
    except KeyboardInterrupt:
        log.info('Caught KeyboardInterrupt. Cleaning up before terminating.')
        self.terminate.value = 1
    else:
        self.post_run()
        log.info('Done!')
    finally:
        log.info(self.progress_str())
def is_dependency(files_changed, all_files_changed):
    """
    Checks whether the changed files classify the failure as a dependency (build) error.

    :param files_changed: the modified filename list between two commits
    :param all_files_changed: all filenames that have been modified
    :return: confidence, files_relevant, files_not_relevant
    """
    build_config_files = ['pom.xml', 'travis.yml', 'build.gradle', '.travis/', 'build.xml']
    count = 0
    files_relevant = list()
    files_not_relevant = list()
    if len(files_changed) < 1:
        log.error("No files changed")
        return None, list(), list()
    for filename in files_changed:
        if any([x in filename for x in build_config_files]):
            count += 1
            files_relevant.append(filename)
        else:
            files_not_relevant.append(filename)
    files_actually_changed = len(all_files_changed)
    if files_actually_changed > 0:
        confidence = count / files_actually_changed
    else:
        confidence = 0.0
    return confidence, files_relevant, files_not_relevant
def build_and_run(self, job):
    log.info('Building and running job with ID {}.'.format(job.job_id))
    # Determine the image name.
    image_name = 'job_id:{}'.format(job.job_id)
    # Get paths required for building the image.
    abs_reproduce_tmp_dir = os.path.abspath(self.utils.get_reproduce_tmp_dir(job))
    abs_dockerfile_path = os.path.abspath(self.utils.get_dockerfile_path(job))
    reproduced_log_destination = self.utils.get_log_path(job)
    # Actually build the image now.
    image = self.build_image(path=abs_reproduce_tmp_dir, dockerfile=abs_dockerfile_path, full_image_name=image_name)
    # Spawn the container.
    container_name = str(job.job_id)
    retry_count = 0
    while True:
        try:
            self.spawn_container(image, container_name, reproduced_log_destination)
        except requests.exceptions.ReadTimeout as e:
            log.error('Error while attempting to spawn a container:', e)
            log.info('Retrying to spawn container.')
            retry_count += 1
        else:
            break
def run(repo: str, dir_of_jsons: str):
    utils.create_dirs()
    try:
        buildpairs = utils.load_buildpairs(dir_of_jsons, repo)
    except json.decoder.JSONDecodeError:
        log.error('At least one JSON file in {} contains invalid JSON. Exiting.'.format(dir_of_jsons))
        sys.exit(1)
    log.info('Filtering. Starting with', utils.count_jobpairs(buildpairs), 'jobpairs.')
    PairFilter._set_attribute_defaults(buildpairs)

    # Apply the filters.
    filters.filter_no_sha(buildpairs)
    filters.filter_same_commit(buildpairs)
    filters.filter_unavailable(buildpairs)
    filters.filter_non_exact_images(buildpairs)
    log.info('Finished filtering.')

    PairFilter._set_is_filtered(buildpairs)
    log.info('Writing output to output_json')
    PairFilter._save_to_file(repo, OUTPUT_FILE_DIR, buildpairs)
    log.info('Writing build pairs to the database.')
    PairFilter._insert_buildpairs(repo, buildpairs)
    log.info('Updating mined project in the database.')
    PairFilter._update_mined_project(repo, buildpairs)
    log.info('Done! After filtering,', utils.count_unfiltered_jobpairs(buildpairs), 'jobpairs remain.')
def validate_input(argv, artifact_type):
    assert artifact_type in ['maven', 'python']
    parser = argparse.ArgumentParser()
    parser.add_argument('image_tags_file',
                        help='Path to a file containing a newline-separated list of image tags to process.')
    parser.add_argument('task_name', help='Name of current task. Results will be put in ./output/<task-name>.csv.')
    parser.add_argument('--workers', type=int, default=4, help='Number of parallel tasks to run.')
    parser.add_argument('--no-push', action='store_true', help='Do not push the artifact to Docker Hub.')
    parser.add_argument('--src-repo', default=DOCKER_HUB_REPO, help='Which repo to pull non-cached images from.')
    parser.add_argument('--dst-repo', default=DOCKER_HUB_CACHED_REPO, help='Which repo to push cached images to.')
    parser.add_argument('--keep-tmp-images', action='store_true',
                        help='Keep temporary container images in the temporary repository.')
    parser.add_argument('--keep-containers', action='store_true', help='Keep containers in order to debug.')
    parser.add_argument('--keep-tars', action='store_true', help='Keep tar files in order to debug.')

    if artifact_type == 'maven':
        parser.add_argument('--no-copy-home-m2', action='store_true',
                            help='Do not copy /home/travis/.m2/ directory.')
        parser.add_argument('--no-copy-home-gradle', action='store_true',
                            help='Do not copy /home/travis/.gradle/ directory.')
        parser.add_argument('--no-copy-home-ivy2', action='store_true',
                            help='Do not copy /home/travis/.ivy2/ directory.')
        parser.add_argument('--no-copy-proj-gradle', action='store_true',
                            help='Do not copy /home/travis/build/*/*/*/.gradle/ directory.')
        parser.add_argument('--no-remove-maven-repositories', action='store_true',
                            help='Do not remove `_remote.repositories` and `_maven.repositories`.')
        parser.add_argument('--ignore-cache-error', action='store_true',
                            help='Ignore error when running build script to download cached files.')
        parser.add_argument('--no-strict-offline-test', action='store_true',
                            help='Do not apply strict offline mode when testing.')
        parser.add_argument('--separate-passed-failed', action='store_true',
                            help='Separate passed and failed cached files (will increase artifact size).')
    if artifact_type == 'python':
        parser.add_argument('--parse-new-log', action='store_true',
                            help='Run build script on the artifact and parse this log for list of packages '
                                 'to download (otherwise will parse the original build log)')

    args = parser.parse_args(argv[1:])
    image_tags_file = args.image_tags_file
    task_name = args.task_name

    if not os.path.isfile(image_tags_file):
        log.error('{} is not a file or does not exist. Exiting.'.format(image_tags_file))
        parser.print_usage()
        exit(1)
    if not re.fullmatch(r'[a-zA-Z0-9\-\_]+', task_name):
        log.error('Invalid task_name: {}. Exiting.'.format(repr(task_name)))
        parser.print_usage()
        exit(1)

    output_file = 'output/{}.csv'.format(task_name)
    if not os.path.isdir('output'):
        os.mkdir('output')
    return image_tags_file, output_file, args
def is_github_archived(repo, sha):
    url = 'https://github.com/{}/commit/{}'.format(repo, sha)
    try:
        return requests.head(url).status_code != 404
    except requests.exceptions.RequestException:
        log.error('Encountered an error while checking GitHub commit archive.')
        raise StepException
def setup_docker_storage_path(self):
    try:
        docker_dict = self.client.info()
        docker_root_dir = docker_dict['DockerRootDir']
        storage_driver = docker_dict['Driver']
        path = os.path.join(docker_root_dir, storage_driver)
        return path
    except docker.errors.APIError:
        log.error('Encountered a Docker API error while gathering the Docker environment info.')
        raise
def modify_deprecated_links(search_dir):
    file_path_result = []
    for deprecated_url in _LIST_OF_DEPRECATED_URLS:
        grep_for_pom_command = 'grep -rl {} {}'.format(deprecated_url, search_dir)
        _, stdout, stderr, ok = _run_command(grep_for_pom_command)
        if ok:
            file_path_result += stdout.splitlines()
    for file_path in file_path_result:
        file_modified = False
        if os.path.isfile(file_path):
            extension_type = file_path.split('.')[-1]
            if extension_type == 'xml' or extension_type == 'pom':
                try:
                    soup = BeautifulSoup(open(file_path), 'lxml-xml')
                    list_of_repo_urls = soup.find_all('url')
                    for url in list_of_repo_urls:
                        stripped_url = url.getText().strip()
                        if stripped_url in _LIST_OF_DEPRECATED_URLS:
                            url.string.replace_with(_REPLACEMENT_URL)
                            file_modified = True
                    # Overwrite the existing POM with the updated POM.
                    if file_modified:
                        with open(file_path, 'w', encoding='utf-8') as f:
                            f.write(soup.prettify())
                        log.info('Modified {} file.'.format(file_path))
                except IOError:
                    log.error('Error reading file: ', file_path)
            else:
                # square-retrofit-104397133 is an edge case example that contains a .js file that contains the
                # deprecated link and is executed at some point during the build, causing the HTTPS 501 error.
                with fileinput.input(file_path, inplace=True) as f:
                    for line in f:
                        match_obj_found = False
                        for url in _LIST_OF_DEPRECATED_URLS:
                            match_obj = re.search(url, line)
                            if match_obj:
                                print(line.replace(url, _REPLACEMENT_URL).strip('\n'))
                                file_modified = True
                                match_obj_found = True
                                continue
                        if match_obj_found:
                            continue
                        else:
                            print(line.strip('\n'))
                if file_modified:
                    log.info('Modified {} file.'.format(file_path))
        else:
            log.error('Error opening file: ', file_path)
def build_image(self, path, dockerfile, full_image_name):
    image = None
    try:
        image = self.client.images.build(path=path, dockerfile=dockerfile, tag=full_image_name)
    except docker.errors.BuildError as e:
        log.debug(e)
        raise ReproduceError('Encountered a build error while building a Docker image: {}'.format(e))
    except docker.errors.APIError as e:
        raise ReproduceError('Encountered a Docker API error while building a Docker image: {}'.format(e))
    except KeyboardInterrupt:
        log.error('Caught a KeyboardInterrupt while building a Docker image.')
    return image
def _update_mined_project(repo: str, buildpairs: List):
    bugswarmapi = DatabaseAPI(token=DATABASE_PIPELINE_TOKEN)

    def _key(filter_name: str, pr: bool):
        return 'filtered{}_{}'.format('_pr' if pr else '', filter_name)

    def _unfiltered_key(pr: bool):
        return 'unfiltered{}'.format('_pr' if pr else '')

    d = {
        'filtered_no_sha': 0,
        'filtered_same_commit': 0,
        'filtered_unavailable': 0,
        'filtered_no_original_log': 0,
        'filtered_error_reading_original_log': 0,
        'filtered_no_image_provision_timestamp': 0,
        'filtered_inaccessible_image': 0,
        'unfiltered': 0,
        'filtered_pr_no_sha': 0,
        'filtered_pr_same_commit': 0,
        'filtered_pr_unavailable': 0,
        'filtered_pr_no_original_log': 0,
        'filtered_pr_error_reading_original_log': 0,
        'filtered_pr_no_image_provision_timestamp': 0,
        'filtered_pr_inaccessible_image': 0,
        'unfiltered_pr': 0,
    }
    for bp in buildpairs:
        is_pr = bp['pr_num'] > 0
        d[_unfiltered_key(is_pr)] += utils.count_unfiltered_jobpairs([bp])
        for jp in bp['jobpairs']:
            reason = jp[FILTERED_REASON_KEY]
            if reason == reasons.NO_HEAD_SHA:
                d[_key('no_sha', is_pr)] += 1
            elif reason == reasons.SAME_COMMIT_PAIR:
                d[_key('same_commit', is_pr)] += 1
            elif reason == reasons.NOT_AVAILABLE:
                d[_key('unavailable', is_pr)] += 1
            elif reason == reasons.NO_ORIGINAL_LOG:
                d[_key('no_original_log', is_pr)] += 1
            elif reason == reasons.ERROR_READING_ORIGINAL_LOG:
                d[_key('error_reading_original_log', is_pr)] += 1
            elif reason == reasons.NO_IMAGE_PROVISION_TIMESTAMP:
                d[_key('no_image_provision_timestamp', is_pr)] += 1
            elif reason == reasons.INACCESSIBLE_IMAGE:
                d[_key('inaccessible_image', is_pr)] += 1
    for metric_name, metric_value in d.items():
        if not bugswarmapi.set_mined_project_progression_metric(repo, metric_name, metric_value):
            log.error('Encountered an error while setting a progression metric. Exiting.')
            sys.exit(1)
def main():
    all_info = {}
    url_list = {}
    # A file with image tags and their URLs, separated by tabs.
    with open('url_list.tsv', 'r') as f_tags:
        for line in f_tags:
            line_info = line.split('\t')
            image_tag = line_info[0]
            repo = line_info[1]
            failed_sha = line_info[2]
            passed_sha = line_info[3]
            url = get_github_url(failed_sha, passed_sha, repo)
            url_list[image_tag] = url
    t_start = time.time()
    # Format: {'image_tag': {'url': url, 'num_files': num_files, 'changed_paths': changed_paths}, ...}
    with ThreadPoolExecutor(max_workers=4) as executor:
        future_to_tag = {executor.submit(gather_info, url_list[image_tag]): image_tag
                         for image_tag in url_list.keys()}
        for future in as_completed(future_to_tag):
            image_tag = future_to_tag[future]
            try:
                the_info = future.result()
                with lock:
                    if image_tag not in all_info:
                        all_info[image_tag] = the_info
            except Exception as e:
                if not SUPPRESS_THREAD_EXCEPTIONS:
                    log.error(e)
                    raise
    t_stop = time.time()
    total_time = t_stop - t_start
    print("total time:", total_time)
    # Write the information collected in all_info into a TSV file.
    with open('changed_paths_info.tsv', 'w') as f:
        for tag in all_info:
            info = all_info[tag]
            f.write('{}\t{}\t{}\t{}\t{}\t\n\n'.format(tag, str(info['num_changed_files']), str(info['error_found']),
                                                      str(info['url']), str(info['changed_paths'])))
    with open('artifact_info.json', 'w') as file:
        json.dump(all_info, file)
    print("total amount:", len(all_info))
def download_logs(job_ids: List[Union[str, int]], destinations: List[str], overwrite: bool = True,
                  num_workers: int = 5, retries: int = _DEFAULT_RETRIES) -> bool:
    """
    Downloads one or more Travis job logs in parallel and stores them at the given destinations.

    This function calls `download_log` and raises the first exception it catches from that function, if any.
    If you only need to download a single Travis job log, use the `download_log` function.

    :param job_ids: A list of Travis job IDs, as strings or integers, identifying jobs whose logs to download.
    :param destinations: A list of paths where the logs should be stored. The path at index `i` corresponds to the
                         log downloaded for the job ID at index `i` in `job_ids`. Thus, `job_ids` and `destinations`
                         must be the same length.
    :param overwrite: Same as the argument for `download_log`.
    :param num_workers: Number of workers to download logs. Defaults to the maximum of 5.
    :param retries: Same as the argument for `download_log`.
    :raises ValueError: When `job_ids` or `destinations` is empty, or when their lengths differ.
    :raises FileExistsError: When a file already exists at the given destination and `overwrite` is False.
    :return: True if all downloads succeeded.
    """
    if not job_ids:
        raise ValueError
    if not destinations:
        raise ValueError
    if not len(job_ids) == len(destinations):
        log.error('The job_ids and destinations arguments must be of equal length.')
        raise ValueError
    num_workers = min(num_workers, len(job_ids))
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        future_to_job_id = {executor.submit(download_log, job_id, dst, overwrite, retries): job_id
                            for job_id, dst in zip(job_ids, destinations)}
        succeeded = 0
        for future in as_completed(future_to_job_id):
            try:
                # The result is True if the download succeeded. Otherwise, future.result() either raises an
                # exception or returns False.
                ok = future.result()
            except Exception:
                raise
            else:
                if ok:
                    succeeded += 1
        return succeeded == len(job_ids)
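# Hypothetical usage sketch (not in the original source): downloading two job logs in parallel.
# The Travis job IDs and destination paths below are made up for illustration.
#
#   job_ids = [123456789, 123456790]
#   destinations = ['logs/123456789.log', 'logs/123456790.log']
#   if download_logs(job_ids, destinations, overwrite=True, num_workers=2):
#       log.info('All logs downloaded.')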