def _download_jira_issues_segment(
    thread_num, jira_connection, jira_issue_ids_segment, field_spec, batch_size, q
):
    '''
    Each thread's target function.  Downloads 1/nth of the issues necessary,
    where n is the number of threads, a page at a time.  Puts the result of
    each page's download onto the shared queue.
    '''
    start_at = 0
    try:
        while start_at < len(jira_issue_ids_segment):
            issues, num_apparently_deleted = _download_jira_issues_page(
                jira_connection, jira_issue_ids_segment, field_spec, start_at, batch_size
            )

            issues_retrieved = len(issues) + num_apparently_deleted
            start_at += issues_retrieved
            if issues_retrieved == 0:
                break

            rows_to_insert = [(int(issue['id']), issue) for issue in issues]

            # TODO: configurable way to scrub things out of raw_issues here before we write them out.
            q.put(rows_to_insert)

        # sentinel to mark that this thread finished
        q.put(None)

    except BaseException as e:
        agent_logging.log_and_print_error_or_warning(
            logger, logging.ERROR, msg_args=[thread_num], error_code=3042, exc_info=True,
        )
        q.put(e)
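# --- Hypothetical consumer sketch (not in the original source) ---
# The producer above signals completion with a `None` sentinel and ships
# failures as exception objects on the same queue.  A minimal consumer,
# assuming `num_threads` producer threads and an assumed `writer`
# callable, might look like this:
def _consume_issue_rows(q, num_threads, writer):
    finished = 0
    while finished < num_threads:
        item = q.get()
        if item is None:
            finished += 1  # one producer thread is done
        elif isinstance(item, BaseException):
            raise item  # surface a producer's failure to the caller
        else:
            writer(item)  # a list of (issue_id, raw_issue) tuples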
def get_raw_result(self, url):
    # retry if rate-limited
    max_retries = 5
    for i in range(1, max_retries + 1):
        try:
            result = self.session.get(url)

            # HACK: This appears to happen after we have been rate-limited
            # when hitting certain URLs.  There is likely a more elegant way
            # to solve this, but it takes about an hour to test each time,
            # and this works.
            if result.status_code == 403:
                result = self.session.get(url)

            result.raise_for_status()
            return result

        except requests.exceptions.HTTPError as e:
            remaining_ratelimit = e.response.headers.get('X-RateLimit-Remaining')
            ratelimit_reset = e.response.headers.get('X-RateLimit-Reset')

            if remaining_ratelimit != '0':
                # We hit a non-rate-limiting-related error.  Don't retry.
                raise

            if i >= max_retries:
                agent_logging.log_and_print_error_or_warning(
                    logger, logging.ERROR, msg_args=[url, i], error_code=3101,
                )
                raise

            # rate-limited!  Sleep until it's OK, then try again
            reset_time = datetime.fromtimestamp(int(ratelimit_reset), pytz.utc)
            now = datetime.utcnow().replace(tzinfo=pytz.utc)
            reset_wait = reset_time - now
            reset_wait_in_seconds = reset_wait.total_seconds()

            # Sometimes GitHub gives a reset time in the past.  In that case,
            # wait for 5 minutes just in case.
            if reset_wait_in_seconds <= 0:
                reset_wait_in_seconds = 300

            # Sometimes GitHub gives a reset time way in the future.  But rate
            # limits reset each hour, so don't wait longer than that.
            reset_wait_in_seconds = min(reset_wait_in_seconds, 3600)
            reset_wait_str = str(timedelta(seconds=reset_wait_in_seconds))

            agent_logging.log_and_print_error_or_warning(
                logger, logging.WARNING, msg_args=[reset_wait_str], error_code=3091,
            )
            time.sleep(reset_wait_in_seconds)
            continue  # retry
def download_customfieldoptions(jira_connection, project_ids):
    print('downloading jira custom field options... ', end='', flush=True)
    optionvalues = {}
    for project_id in project_ids:
        try:
            meta = jira_connection.createmeta(
                projectIds=[project_id], expand='projects.issuetypes.fields'
            )
        except JIRAError:
            agent_logging.log_and_print_error_or_warning(
                logger, logging.WARNING, error_code=3072, exc_info=False
            )
            return []

        # Custom values are buried deep in the createmeta response:
        #     projects -> issuetypes -> fields -> allowedValues
        for project in meta['projects']:
            for issue_type in project['issuetypes']:
                for field_key, field in issue_type['fields'].items():
                    if 'key' in field:
                        field_key = field['key']
                    # the same field may end up in multiple issue types (bug,
                    # task, etc.), so check if we've already added it
                    if field_key not in optionvalues and _is_option_field(field):
                        optionvalues[field_key] = field['allowedValues']

    result = [{'field_key': k, 'raw_json': v} for k, v in optionvalues.items()]
    print('✓')
    return result
def get_raw_result(self, url, rate_limit_realm=None):
    start = datetime.utcnow()
    while True:
        try:
            with self.rate_limiter.limit(rate_limit_realm):
                result = self.session.get(url)
                result.raise_for_status()
                return result
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 429:
                # rate-limited in spite of trying to throttle requests.  We
                # don't know how long we need to wait, so just try again in
                # 30 seconds, unless it's already been too long
                if (datetime.utcnow() - start) < timedelta(hours=1):
                    agent_logging.log_and_print(
                        logger, logging.INFO, 'Retrying in 30 seconds...',
                    )
                    time.sleep(30)
                    continue
                else:
                    agent_logging.log_and_print_error_or_warning(
                        logger, logging.ERROR, error_code=3151
                    )
            raise
def _get_repos_list_in_jira(issues_to_scan, jira_connection):
    print('Scanning Jira issues for Git repos...')
    missing_repositories = {}

    for issue_id, instance_types in issues_to_scan.items():
        for instance_type in instance_types:
            try:
                repositories = _scan_jira_issue_for_repo_data(
                    jira_connection, issue_id, instance_type
                )
            except JIRAError as e:
                if e.status_code == 403:
                    agent_logging.log_and_print_error_or_warning(
                        logger, logging.ERROR, error_code=2122,
                    )
                    return []
                # any other error should bubble up; otherwise `repositories`
                # would be unbound below
                raise

            for repo in repositories:
                repo_name = repo['name']
                repo_url = repo['url']
                if repo_name not in missing_repositories:
                    missing_repositories[repo_name] = {
                        'name': repo_name,
                        'url': repo_url,
                        'instance_type': instance_type,
                    }

    return missing_repositories
def upload_file_from_thread(filename, path_to_obj, signed_url):
    try:
        upload_file(filename, path_to_obj, signed_url)
    except Exception as e:
        thread_exceptions.append(e)
        agent_logging.log_and_print_error_or_warning(
            logger, logging.ERROR, msg_args=[filename], error_code=3000, exc_info=True,
        )
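# --- Hypothetical driver sketch (not in the original source) ---
# `upload_file_from_thread` records failures in the shared module-level
# `thread_exceptions` list instead of raising, so a driver can join all
# threads first and then decide whether the batch failed.  The names
# below (`_upload_files_concurrently`, `files_and_urls`) are assumed for
# illustration; `threading` is from the standard library.
import threading

def _upload_files_concurrently(files_and_urls):
    threads = [
        threading.Thread(target=upload_file_from_thread, args=(filename, path_to_obj, signed_url))
        for filename, path_to_obj, signed_url in files_and_urls
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    if thread_exceptions:
        raise RuntimeError(f'{len(thread_exceptions)} file upload(s) failed')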
def get_basic_jira_connection(config, creds):
    try:
        return _get_raw_jira_connection(config, creds)
    except Exception as e:
        agent_logging.log_and_print_error_or_warning(
            logger, logging.ERROR, msg_args=[e], error_code=2102, exc_info=True
        )
@contextmanager  # implied by the bare `yield` and the `with ... limit(...)` call sites
def limit(self, realm):
    # if realm is None, don't rate limit; just execute the thing
    if realm is None:
        yield
        return

    max_calls, period_secs = self.realm_config[realm]
    start = datetime.utcnow()

    while True:
        # decide whether to sleep or call, inside the lock
        with self.lock:
            sleep_until, calls_made = self._call_available(realm, max_calls)
            if not sleep_until:
                self._record_call(realm, period_secs)

        if not sleep_until:
            try:
                # stuff within the context manager happens here
                yield
                return
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 429:
                    # Got rate limited anyway!
                    agent_logging.log_and_print_error_or_warning(
                        logger,
                        logging.ERROR,
                        msg_args=[calls_made, max_calls, realm],
                        error_code=3010,
                    )
                raise

        agent_logging.log_and_print(
            logger,
            logging.INFO,
            f'Rate limiter: exceeded {max_calls} calls in {period_secs} seconds for {realm}!',
        )

        if (sleep_until - start) >= timedelta(seconds=self.timeout_secs):
            agent_logging.log_and_print_error_or_warning(
                logger, logging.ERROR, msg_args=[self.timeout_secs], error_code=3020
            )
            raise Exception('Rate limit timeout')

        sleep_period_secs = (sleep_until - datetime.utcnow()).total_seconds()
        if sleep_period_secs > 0:  # it's possible that sleep_until was a couple ms ago
            agent_logging.log_and_print(
                logger,
                logging.INFO,
                f'Sleeping for {sleep_period_secs:.1f} secs ({sleep_period_secs / 60.0:.1f} mins)',
            )
            time.sleep(sleep_period_secs)
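# --- Usage sketch (assumptions noted) ---
# `limit()` is used as a context manager around each outbound request,
# keyed by a realm that must appear in `realm_config`, a mapping of
# realm -> (max_calls, period_secs).  The `RateLimiter` constructor and
# the 'bbcloud' realm shown here are assumptions; only the mapping shape
# and the `with` usage (see `get_raw_result` above) are implied by the code.
def _rate_limited_get_example(session, url):
    rate_limiter = RateLimiter(realm_config={'bbcloud': (950, 3600)})  # hypothetical realm
    with rate_limiter.limit('bbcloud'):
        return session.get(url)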
def _strip_history_items(items):
    # Skip items that are a change of a field that's filtered out
    for i in items:
        field_id_field = _get_field_identifier(i)
        if not field_id_field:
            agent_logging.log_and_print_error_or_warning(
                logger=logger, level=logging.WARNING, error_code=3082, msg_args=[i.keys()],
            )
        if include_fields and i.get(field_id_field) not in include_fields:
            continue
        if i.get(field_id_field) in exclude_fields:
            continue
        yield i
def get_commit_by_ref(self, full_repo_name, ref):
    url = f'{self.base_url}/repos/{full_repo_name}/commits/{ref}'
    try:
        raw = self.get_raw_result(url)
        return raw.json()
    except HTTPError as e:
        if e.response.status_code in (422,):
            agent_logging.log_and_print_error_or_warning(
                logger,
                logging.WARNING,
                msg_args=[e.response.status_code, ref, full_repo_name],
                error_code=3121,
            )
        return None
def get_git_client(config: GitConfig, git_creds: dict, skip_ssl_verification: bool):
    try:
        if config.git_provider == BBS_PROVIDER:
            return Stash(
                base_url=config.git_url,
                username=git_creds['bb_server_username'],
                password=git_creds['bb_server_password'],
                verify=not skip_ssl_verification,
                session=retry_session(),
            )
        if config.git_provider == BBC_PROVIDER:
            return BitbucketCloudClient(
                server_base_uri=config.git_url,
                username=git_creds['bb_cloud_username'],
                app_password=git_creds['bb_cloud_app_password'],
                session=retry_session(),
            )
        if config.git_provider == GH_PROVIDER:
            return GithubClient(
                base_url=config.git_url,
                token=git_creds['github_token'],
                verify=not skip_ssl_verification,
                session=retry_session(),
            )
        if config.git_provider == GL_PROVIDER:
            return GitLabClient(
                server_url=config.git_url,
                private_token=git_creds['gitlab_token'],
                verify=not skip_ssl_verification,
                per_page_override=config.gitlab_per_page_override,
                session=retry_session(),
            )
    except Exception as e:
        agent_logging.log_and_print_error_or_warning(
            logger,
            logging.ERROR,
            msg_args=[config.git_provider, e],
            error_code=2101,
            exc_info=True,
        )
        return

    # if the git provider is none of the above, throw an error
    raise ValueError(f'unsupported git provider {config.git_provider}')
def _download_jira_issues_page(
    jira_connection, jira_issue_ids_segment, field_spec, start_at, batch_size
):
    '''
    Returns a tuple: (issues_downloaded, num_issues_apparently_deleted)
    '''
    get_changelog = True

    while batch_size > 0:
        search_params = {
            'jql': f"id in ({','.join(str(iid) for iid in jira_issue_ids_segment)}) order by id asc",
            'fields': field_spec,
            'expand': ['renderedFields'],
            'startAt': start_at,
            'maxResults': batch_size,
        }
        if get_changelog:
            search_params['expand'].append('changelog')

        try:
            resp_json = json_loads(
                jira_connection._session.post(
                    url=jira_connection._get_url('search'), data=json.dumps(search_params)
                )
            )
            return _expand_changelog(resp_json['issues'], jira_connection), 0
        except (json.decoder.JSONDecodeError, JIRAError) as e:
            if hasattr(e, 'status_code') and e.status_code == 429:
                # This is rate limiting ("Too many requests")
                raise

            batch_size = int(batch_size / 2)
            agent_logging.log_and_print_error_or_warning(
                logger,
                logging.WARNING,
                msg_args=[e, batch_size],
                error_code=3052,
                exc_info=True,
            )
            if batch_size == 0:
                if re.match(r"A value with ID .* does not exist for the field 'id'", e.text):
                    return [], 1
                elif not get_changelog:
                    agent_logging.log_and_print_error_or_warning(
                        logger, logging.WARNING, msg_args=[search_params], error_code=3062,
                    )
                    return [], 0
                else:
                    get_changelog = False
                    batch_size = 1
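# --- Behavior sketch (illustrative, not in the original source) ---
# The retry loop above halves `batch_size` with int division on every
# failure, so starting from the default of 100 the attempted page sizes
# are 100, 50, 25, 12, 6, 3, 1, 0:
#     >>> sizes = [100]
#     >>> while sizes[-1] > 0:
#     ...     sizes.append(int(sizes[-1] / 2))
#     >>> sizes
#     [100, 50, 25, 12, 6, 3, 1, 0]
# Only once it reaches 0 does the code retry without the changelog
# (resetting batch_size to 1) and, failing that, give up.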
def get_all_repos(self, org):
    url = f'{self.base_url}/orgs/{org}/repos'
    for m in self.get_all_pages(url):
        try:
            yield self.get_json(m['url'])
        except requests.exceptions.HTTPError as e:
            # non-403 should bubble up
            if e.response.status_code != 403:
                raise
            # we've seen some strange behavior with GHE, where we can get a
            # 403 for a repo that comes back in the list.  Skip them.
            agent_logging.log_and_print_error_or_warning(
                logger, logging.WARNING, msg_args=[m["url"]], error_code=3081,
            )
def project_is_accessible(project_id):
    try:
        jira_connection.search_issues(f'project = {project_id}', fields=['id'])
        return True
    except JIRAError as e:
        # Handle zombie projects that appear in the project list but are not
        # actually accessible.
        if (
            e.status_code == 400
            and e.text
            == f"A value with ID '{project_id}' does not exist for the field 'project'."
        ):
            agent_logging.log_and_print_error_or_warning(
                logger, logging.ERROR, msg_args=[project_id], error_code=2112,
            )
            return False
        else:
            raise
def log_and_print_request_error(e, action='making request', log_as_exception=False):
    try:
        response_code = e.response_code
    except AttributeError:
        # if the request error is a retry error, we won't have the code
        response_code = ''

    error_name = type(e).__name__

    if log_as_exception:
        agent_logging.log_and_print_error_or_warning(
            logger,
            logging.ERROR,
            msg_args=[error_name, response_code, action, e],
            error_code=3131,
            exc_info=True,
        )
    else:
        agent_logging.log_and_print_error_or_warning(
            logger, logging.WARNING, msg_args=[error_name, response_code, action], error_code=3141
        )
def _download_some(thread_num, start_at, end_at):
    batch_size = 1000
    try:
        while start_at < min(end_at, total_num_issues):
            try:
                api_resp = jira_connection.search_issues(
                    f'{issue_jql} order by id asc',
                    fields=['updated'],
                    startAt=start_at,
                    maxResults=batch_size,
                )
            except (JIRAError, KeyError) as e:
                if hasattr(e, 'status_code') and e.status_code < 500:
                    # something wrong with our request; re-raise
                    raise

                # We have seen sporadic server-side flakiness here.  Sometimes
                # Jira Server (but not Jira Cloud, as far as we've seen) will
                # return a 200 response with an empty JSON object instead of a
                # JSON object with an "issues" key, which results in the
                # `search_issues()` function in the Jira library throwing a
                # KeyError.
                #
                # Sometimes both cloud and server will return a 5xx.
                #
                # In either case, reduce the maxResults parameter and try
                # again, on the theory that a smaller ask will prevent the
                # server from choking.
                batch_size = int(batch_size / 2)
                if batch_size > 0:
                    agent_logging.log_and_print_error_or_warning(
                        logger, logging.WARNING, msg_args=[batch_size], error_code=3012,
                    )
                    continue
                else:
                    agent_logging.log_and_print_error_or_warning(
                        logger, logging.ERROR, error_code=3022,
                    )
                    raise

            issue_metadata = {
                int(iss.id): IssueMetadata(iss.key, parser.parse(iss.fields.updated))
                for iss in api_resp
            }
            all_issue_metadata.update(issue_metadata)
            start_at += len(issue_metadata)

    except Exception as e:
        thread_exceptions[thread_num] = e
        agent_logging.log_and_print_error_or_warning(
            logger,
            logging.ERROR,
            msg_args=[thread_num, traceback.format_exc()],
            error_code=3032,
        )
def load_and_dump_jira(config, endpoint_jira_info, jira_connection):
    try:
        write_file(
            config.outdir,
            'jira_fields',
            config.compress_output_files,
            download_fields(jira_connection, config.jira_include_fields, config.jira_exclude_fields),
        )

        projects_and_versions = download_projects_and_versions(
            jira_connection,
            config.jira_include_projects,
            config.jira_exclude_projects,
            config.jira_include_project_categories,
            config.jira_exclude_project_categories,
        )

        project_ids = {proj['id'] for proj in projects_and_versions}
        write_file(
            config.outdir,
            'jira_projects_and_versions',
            config.compress_output_files,
            projects_and_versions,
        )

        write_file(
            config.outdir,
            'jira_users',
            config.compress_output_files,
            download_users(jira_connection, config.jira_gdpr_active),
        )
        write_file(
            config.outdir,
            'jira_resolutions',
            config.compress_output_files,
            download_resolutions(jira_connection),
        )
        write_file(
            config.outdir,
            'jira_issuetypes',
            config.compress_output_files,
            download_issuetypes(jira_connection, project_ids),
        )
        write_file(
            config.outdir,
            'jira_linktypes',
            config.compress_output_files,
            download_issuelinktypes(jira_connection),
        )
        write_file(
            config.outdir,
            'jira_priorities',
            config.compress_output_files,
            download_priorities(jira_connection),
        )

        def download_and_write_boards_and_sprints():
            boards, sprints, links = download_boards_and_sprints(
                jira_connection, project_ids, config.jira_download_sprints
            )
            write_file(config.outdir, 'jira_boards', config.compress_output_files, boards)
            write_file(config.outdir, 'jira_sprints', config.compress_output_files, sprints)
            write_file(config.outdir, 'jira_board_sprint_links', config.compress_output_files, links)

        download_and_write_boards_and_sprints()

        issue_metadata_from_jira = download_all_issue_metadata(
            jira_connection,
            project_ids,
            config.jira_earliest_issue_dt,
            config.jira_issue_download_concurrent_threads,
            config.jira_issue_jql,
        )

        issue_metadata_from_jellyfish = {
            int(issue_id): IssueMetadata(
                issue_info['key'],
                datetime.fromisoformat(issue_info['updated']),  # already includes TZ info
            )
            for issue_id, issue_info in endpoint_jira_info['issue_metadata'].items()
        }

        issue_metadata_addl_from_jellyfish = {
            int(issue_id): (
                issue_info.get('epic_link_field_issue_key'),
                issue_info.get('parent_field_issue_key'),
            )
            for issue_id, issue_info in endpoint_jira_info['issue_metadata'].items()
        }

        (
            missing_issue_ids,
            _,
            out_of_date_issue_ids,
            deleted_issue_ids,
        ) = detect_issues_needing_sync(issue_metadata_from_jira, issue_metadata_from_jellyfish)

        issue_ids_to_download = list(missing_issue_ids.union(out_of_date_issue_ids))

        @diagnostics.capture_timing()
        @agent_logging.log_entry_exit(logger)
        def download_and_write_issues():
            return download_and_write_streaming(
                config.outdir,
                'jira_issues',
                config.compress_output_files,
                generator_func=download_necessary_issues,
                generator_func_args=(
                    jira_connection,
                    issue_ids_to_download,
                    config.jira_include_fields,
                    config.jira_exclude_fields,
                    config.jira_issue_batch_size,
                    config.jira_issue_download_concurrent_threads,
                ),
                item_id_dict_key='id',
                addl_info_dict_key='key',
            )

        downloaded_issue_info = download_and_write_issues()

        issue_ids_needing_re_download = detect_issues_needing_re_download(
            downloaded_issue_info,
            issue_metadata_from_jellyfish,
            issue_metadata_addl_from_jellyfish,
        )

        @diagnostics.capture_timing()
        @agent_logging.log_entry_exit(logger)
        def download_and_write_issues_needing_re_download():
            return download_and_write_streaming(
                config.outdir,
                'jira_issues_re_downloaded',
                config.compress_output_files,
                generator_func=download_necessary_issues,
                generator_func_args=(
                    jira_connection,
                    list(issue_ids_needing_re_download),
                    config.jira_include_fields,
                    config.jira_exclude_fields,
                    config.jira_issue_batch_size,
                    config.jira_issue_download_concurrent_threads,
                ),
                item_id_dict_key='id',
                addl_info_dict_key='key',
            )

        re_downloaded_issue_info = download_and_write_issues_needing_re_download()

        all_downloaded_issue_ids = [
            int(i[0]) for i in chain(downloaded_issue_info, re_downloaded_issue_info)
        ]
        write_file(
            config.outdir,
            'jira_issue_ids_downloaded',
            config.compress_output_files,
            all_downloaded_issue_ids,
        )
        write_file(
            config.outdir,
            'jira_issue_ids_deleted',
            config.compress_output_files,
            list(deleted_issue_ids),
        )

        if config.jira_download_worklogs:
            write_file(
                config.outdir,
                'jira_worklogs',
                config.compress_output_files,
                download_worklogs(jira_connection, all_downloaded_issue_ids),
            )

        write_file(
            config.outdir,
            'jira_customfieldoptions',
            config.compress_output_files,
            download_customfieldoptions(jira_connection, project_ids),
        )

        write_file(
            config.outdir,
            'jira_statuses',
            config.compress_output_files,
            download_statuses(jira_connection),
        )

        return {'type': 'Jira', 'status': 'success'}
    except Exception as e:
        agent_logging.log_and_print_error_or_warning(
            logger, logging.ERROR, msg_args=[e], error_code=3002, exc_info=True
        )
        return {'type': 'Jira', 'status': 'failed'}
def get_repos(
    self, normalized_projects: List[NormalizedProject],
) -> List[NormalizedRepository]:
    print('downloading gitlab repos... ', end='', flush=True)

    nrm_repos: List[NormalizedRepository] = []
    for nrm_project in normalized_projects:
        repos_that_failed_to_download = []

        for i, api_repo in enumerate(
            tqdm(
                self.client.list_group_projects(nrm_project.id),
                desc=f'downloading repos for {nrm_project.name}',
                unit='repos',
            ),
            start=1,
        ):
            # For GitLab, git_include_repos holds IDs instead of names
            # (probably unintentionally), so there is no need to be
            # case-insensitive
            if (
                self.config.git_include_repos
                and api_repo.id not in self.config.git_include_repos
            ):
                if self.config.git_verbose:
                    agent_logging.log_and_print(
                        logger,
                        logging.INFO,
                        f'skipping repo {api_repo.id} because not in include_repos...',
                    )
                continue  # skip this repo

            # Likewise, git_exclude_repos holds IDs instead of names
            if (
                self.config.git_exclude_repos
                and api_repo.id in self.config.git_exclude_repos
            ):
                if self.config.git_verbose:
                    agent_logging.log_and_print(
                        logger,
                        logging.INFO,
                        f'skipping repo {api_repo.id} because in exclude_repos...',
                    )
                continue  # skip this repo

            try:
                nrm_branches = self.get_branches(api_repo)
            except gitlab.exceptions.GitlabListError:
                # This is likely due to fine-tuned permissions defined on the
                # repo (gitlab project) that prevent us from accessing its repo
                # details.  If this happens, make a note of it and don't blow
                # up the rest of the pull.
                repos_that_failed_to_download.append(api_repo)
                continue  # skip this repo

            nrm_repos.append(
                _normalize_repo(
                    api_repo, nrm_branches, nrm_project, self.config.git_redact_names_and_urls
                )
            )

        # if there were any repositories we had issues with, print them out now
        if repos_that_failed_to_download:

            def __repo_log_string(api_repo):
                # build log string
                name = (
                    api_repo.name
                    if not self.config.git_redact_names_and_urls
                    else _repo_redactor.redact_name(api_repo.name)
                )
                return str({"id": api_repo.id, "name": name})

            repos_failed_string = ", ".join(
                [__repo_log_string(api_repo) for api_repo in repos_that_failed_to_download]
            )
            total_failed = len(repos_that_failed_to_download)

            agent_logging.log_and_print_error_or_warning(
                logger,
                logging.WARNING,
                msg_args=[total_failed, nrm_project.id, repos_failed_string],
                error_code=2201,
            )

    print('✓')
    if not nrm_repos:
        raise ValueError(
            'No repos found. Make sure your token has appropriate access to GitLab and check your configuration of repos to pull.'
        )
    return nrm_repos
def obtain_config(args) -> ValidatedConfig:
    if args.since:
        print(
            'WARNING: The -s / --since argument is deprecated and has no effect. '
            'You can remove this setting.'
        )
    if args.until:
        print(
            'WARNING: The -u / --until argument is deprecated and has no effect. '
            'You can remove this setting.'
        )

    jellyfish_api_base = args.jellyfish_api_base
    config_file_path = args.config_file

    run_mode = args.mode
    if run_mode not in VALID_RUN_MODES:
        print(f'''ERROR: Mode should be one of "{', '.join(VALID_RUN_MODES)}"''')
        raise BadConfigException()

    run_mode_includes_download = run_mode in ('download_and_send', 'download_only')
    run_mode_includes_send = run_mode in ('download_and_send', 'send_only')
    run_mode_is_print_all_jira_fields = run_mode == 'print_all_jira_fields'
    run_mode_is_print_apparently_missing_git_repos = (
        run_mode == 'print_apparently_missing_git_repos'
    )

    try:
        with open(config_file_path, 'r') as yaml_file:
            yaml_config = yaml.safe_load(yaml_file)
    except FileNotFoundError:
        print(f'ERROR: Config file not found at "{config_file_path}"')
        raise BadConfigException()

    yaml_conf_global = yaml_config.get('global', {})
    skip_ssl_verification = yaml_conf_global.get('no_verify_ssl', False)
    send_agent_config = yaml_conf_global.get('send_agent_config', False)

    # jira configuration
    jira_config = yaml_config.get('jira', {})
    jira_url = jira_config.get('url', None)

    jira_earliest_issue_dt = jira_config.get('earliest_issue_dt', None)
    if jira_earliest_issue_dt is not None and type(jira_earliest_issue_dt) != date:
        print('ERROR: Invalid format for earliest_issue_dt; should be YYYY-MM-DD')
        raise BadConfigException()

    jira_issue_download_concurrent_threads = jira_config.get(
        'issue_download_concurrent_threads', 10
    )
    jira_include_fields = set(jira_config.get('include_fields', []))
    jira_exclude_fields = set(jira_config.get('exclude_fields', []))
    jira_issue_batch_size = jira_config.get('issue_batch_size', 100)
    jira_gdpr_active = jira_config.get('gdpr_active', False)
    jira_include_projects = set(jira_config.get('include_projects', []))
    jira_exclude_projects = set(jira_config.get('exclude_projects', []))
    jira_include_project_categories = set(jira_config.get('include_project_categories', []))
    jira_exclude_project_categories = set(jira_config.get('exclude_project_categories', []))
    jira_issue_jql = jira_config.get('issue_jql', '')
    jira_download_worklogs = jira_config.get('download_worklogs', True)
    jira_download_sprints = jira_config.get('download_sprints', True)

    # warn if any of the recommended fields are missing or excluded
    if jira_include_fields:
        missing_required_fields = set(required_jira_fields) - set(jira_include_fields)
        if missing_required_fields:
            agent_logging.log_and_print_error_or_warning(
                logger,
                logging.WARNING,
                msg_args=[list(missing_required_fields)],
                error_code=2132,
            )
    if jira_exclude_fields:
        excluded_required_fields = set(required_jira_fields).intersection(set(jira_exclude_fields))
        if excluded_required_fields:
            agent_logging.log_and_print_error_or_warning(
                logger,
                logging.WARNING,
                msg_args=[list(excluded_required_fields)],
                error_code=2142,
            )

    git_configs: List[GitConfig] = _get_git_config_from_yaml(yaml_config)

    now = datetime.utcnow()

    if not jira_url and not len(git_configs):
        print('ERROR: Config file must provide either a Jira or Git URL.')
        raise BadConfigException()

    if skip_ssl_verification:
        print('WARNING: Disabling SSL certificate validation')
        # To silence "Unverified HTTPS request is being made."
        urllib3.disable_warnings()

    if run_mode_includes_download:
        if args.prev_output_dir:
            print('ERROR: Provide output_basedir if downloading, not prev_output_dir')
            raise BadConfigException()

    output_basedir = args.output_basedir
    output_dir_timestamp = now.strftime('%Y%m%d_%H%M%S')
    outdir = os.path.join(output_basedir, output_dir_timestamp)
    try:
        os.makedirs(outdir, exist_ok=False)
    except FileExistsError:
        print(f"ERROR: Output dir {outdir} already exists")
        raise BadConfigException()
    except Exception:
        print(
            f"ERROR: Couldn't create output dir {outdir}. Make sure the output directory "
            f"you mapped as a docker volume exists on your host."
        )
        raise BadConfigException()

    if run_mode_is_print_all_jira_fields and not jira_url:
        print(f'ERROR: Must provide jira_url for mode {run_mode}')
        raise BadConfigException()

    if run_mode_includes_send and not run_mode_includes_download:
        if not args.prev_output_dir:
            print('ERROR: prev_output_dir must be provided if not downloading')
            raise BadConfigException()
        if not os.path.isdir(args.prev_output_dir):
            print(f'ERROR: prev_output_dir ("{args.prev_output_dir}") is not a directory')
            raise BadConfigException()
        outdir = args.prev_output_dir

    # If we're only downloading, do not compress the output files (so they can
    # be more easily inspected)
    compress_output_files = (
        False if (run_mode_includes_download and not run_mode_includes_send) else True
    )

    if run_mode_is_print_apparently_missing_git_repos:
        if not len(git_configs):
            print(f'ERROR: {run_mode} requires git configuration.')
            raise BadConfigException()
        if not (jira_url and git_configs[0].git_url):
            print(f'ERROR: Must provide jira_url and git_url for mode {run_mode}')
            raise BadConfigException()
        for git_config in git_configs:
            if git_config.git_redact_names_and_urls:
                print(f'ERROR: git_redact_names_and_urls must be False for mode {run_mode}')
                raise BadConfigException()

    return ValidatedConfig(
        run_mode,
        run_mode_includes_download,
        run_mode_includes_send,
        run_mode_is_print_all_jira_fields,
        run_mode_is_print_apparently_missing_git_repos,
        jira_url,
        jira_earliest_issue_dt,
        jira_issue_download_concurrent_threads,
        jira_include_fields,
        jira_exclude_fields,
        jira_issue_batch_size,
        jira_gdpr_active,
        jira_include_projects,
        jira_exclude_projects,
        jira_include_project_categories,
        jira_exclude_project_categories,
        jira_issue_jql,
        jira_download_worklogs,
        jira_download_sprints,
        git_configs,  # array of GitConfig
        outdir,
        compress_output_files,
        jellyfish_api_base,
        skip_ssl_verification,
        send_agent_config,
    )
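# --- Configuration shape sketch (illustrative, not a shipped config) ---
# The keys read above imply a YAML structure like the following, shown
# here as the parsed `yaml_config` dict.  All values are example
# assumptions; only the key names come from the code.
_EXAMPLE_YAML_CONFIG = {
    'global': {'no_verify_ssl': False, 'send_agent_config': False},
    'jira': {
        'url': 'https://jira.example.com',   # hypothetical URL
        'earliest_issue_dt': date(2020, 1, 1),
        'issue_download_concurrent_threads': 10,
        'include_fields': [],
        'exclude_fields': [],
        'issue_batch_size': 100,
        'gdpr_active': False,
        'include_projects': ['PROJ'],        # hypothetical project key
        'issue_jql': '',
        'download_worklogs': True,
        'download_sprints': True,
    },
}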
def download_boards_and_sprints(jira_connection, project_ids, download_sprints):
    boards_by_id = {}  # De-dup by id, since the same board might come back from more than one query
    for project_id in tqdm(project_ids, desc='downloading jira boards...', file=sys.stdout):
        b_start_at = 0
        while True:
            try:
                # Can't use the jira_connection's .boards() method, since it
                # doesn't support all the query params
                project_boards = jira_connection._session.get(
                    url=f'{jira_connection._options["server"]}/rest/agile/1.0/board',
                    params={
                        'maxResults': 50,
                        'startAt': b_start_at,
                        'type': 'scrum',
                        'includePrivate': 'false',
                        'projectKeyOrId': project_id,
                    },
                ).json()['values']
            except JIRAError as e:
                if e.status_code == 400:
                    agent_logging.log_and_print_error_or_warning(
                        logger, logging.ERROR, msg_args=[project_id], error_code=2202,
                    )
                    break
                raise

            if not project_boards:
                break

            b_start_at += len(project_boards)
            boards_by_id.update({board['id']: board for board in project_boards})

    links = []
    sprints = {}
    if download_sprints:
        for b in tqdm(boards_by_id.values(), desc='downloading jira sprints', file=sys.stdout):
            s_start_at = 0
            sprints_for_board = []
            while True:
                batch = None
                try:
                    batch = jira_connection.sprints(
                        # ignore future sprints
                        board_id=b['id'],
                        startAt=s_start_at,
                        maxResults=50,
                        state='active,closed',
                    )
                except JIRAError as e:
                    # JIRA returns 500 errors for various reasons: board is
                    # misconfigured; "failed to execute search"; etc.  Just
                    # skip and move on
                    if e.status_code == 500 or e.status_code == 404:
                        print(f"Couldn't get sprints for board {b['id']}. Skipping...")
                    else:
                        raise

                if not batch:
                    break
                s_start_at += len(batch)
                sprints_for_board.extend(batch)

            links.append({'board_id': b['id'], 'sprint_ids': [s.id for s in sprints_for_board]})
            sprints.update({s.id: s for s in sprints_for_board})

    return list(boards_by_id.values()), [s.raw for s in sprints.values()], links
def _normalize_pr(
    client, repo, api_pr, strip_text_content: bool, redact_names_and_urls: bool,
):
    # Process the PR's diff to get additions, deletions, changed_files
    additions, deletions, changed_files = None, None, None
    try:
        diff_str = client.pr_diff(repo.project.id, repo.id, api_pr['id'])
        additions, deletions, changed_files = _calculate_diff_counts(diff_str)
        if additions is None:
            agent_logging.log_and_print_error_or_warning(
                logger, logging.WARN, msg_args=[api_pr["id"], repo.id], error_code=3031,
            )
    except requests.exceptions.RetryError:
        # Server threw a 500 on the request for the diff and we started
        # retrying; this happens consistently for certain PRs (if the PR has
        # no commits yet).  Just proceed with no diff.
        pass
    except requests.exceptions.HTTPError as e:
        if e.response.status_code >= 500:
            # Server threw a 500 on the request for the diff; this happens
            # consistently for certain PRs (if the PR has no commits yet).
            # Just proceed with no diff.
            pass
        elif e.response.status_code == 401:
            # Server threw a 401 on the request for the diff; not sure why
            # this would be, but it seems rare.
            agent_logging.log_and_print_error_or_warning(
                logger, logging.WARN, msg_args=[api_pr["id"], repo.id], error_code=3041,
            )
        else:
            # Some other HTTP error happened; re-raise
            raise
    except UnicodeDecodeError:
        # Occasional diffs seem to be invalid UTF-8
        agent_logging.log_and_print_error_or_warning(
            logger, logging.WARN, msg_args=[api_pr["id"], repo.id], error_code=3051,
        )

    # Comments
    comments = [
        NormalizedPullRequestComment(
            user=_normalize_user(c['user']),
            body=sanitize_text(c['content']['raw'], strip_text_content),
            created_at=parser.parse(c['created_on']),
        )
        for c in client.pr_comments(repo.project.id, repo.id, api_pr['id'])
    ]

    # Crawl activity for approvals, merge and closed dates
    approvals = []
    merge_date = None
    merged_by = None
    closed_date = None
    try:
        activity = list(client.pr_activity(repo.project.id, repo.id, api_pr['id']))
        approvals = [
            NormalizedPullRequestReview(
                user=_normalize_user(approval['user']),
                foreign_id=i,  # There's no true ID (unlike with GitHub); use a per-PR sequence
                review_state='APPROVED',
            )
            for i, approval in enumerate(
                (a['approval'] for a in activity if 'approval' in a), start=1,
            )
        ]

        # Obtain the merge_date and merged_by by crawling over the activity history
        pr_updates = [a for a in activity if 'update' in a]
        for a in sorted(pr_updates, key=lambda x: x['update']['date'], reverse=True):
            if a['update']['state'] == 'MERGED':
                merge_date = parser.parse(a['update']['date'])
                merged_by = _normalize_user(a['update']['author'])
                break

        # Obtain the closed_date by crawling over the activity history, looking
        # for the first transition to one of the closed states ('MERGED' or 'DECLINED')
        for a in sorted(pr_updates, key=lambda x: x['update']['date'], reverse=False):
            if a['update']['state'] in ('MERGED', 'DECLINED'):
                closed_date = parser.parse(a['update']['date'])
                break
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 401:
            # not authorized to see activity; skip it
            pass
        else:
            raise

    # Commits
    commits = [
        _normalize_commit(
            c,
            repo,
            api_pr['destination']['branch']['name'],
            strip_text_content,
            redact_names_and_urls,
        )
        for c in client.pr_commits(repo.project.id, repo.id, api_pr['id'])
    ]
    merge_commit = None
    if (
        api_pr['state'] == 'MERGED'
        and 'merge_commit' in api_pr
        and api_pr['merge_commit']
        and api_pr['merge_commit'].get('hash')
    ):
        api_merge_commit = client.get_commit(
            repo.project.id, api_pr['source']['repository']['uuid'], api_pr['merge_commit']['hash']
        )
        merge_commit = _normalize_commit(
            api_merge_commit,
            repo,
            api_pr['destination']['branch']['name'],
            strip_text_content,
            redact_names_and_urls,
        )

    # Repo links
    base_repo = _normalize_short_form_repo(
        api_pr['destination']['repository'], redact_names_and_urls
    )
    head_repo = _normalize_short_form_repo(api_pr['source']['repository'], redact_names_and_urls)

    return NormalizedPullRequest(
        id=api_pr['id'],
        title=api_pr['title'],
        body=api_pr['description'],
        url=api_pr['links']['html']['href'],
        base_branch=api_pr['destination']['branch']['name'],
        head_branch=api_pr['source']['branch']['name'],
        base_repo=base_repo,
        head_repo=head_repo,
        author=_normalize_user(api_pr['author']),
        is_closed=api_pr['state'] != 'OPEN',
        is_merged=api_pr['state'] == 'MERGED',
        created_at=parser.parse(api_pr['created_on']),
        updated_at=parser.parse(api_pr['updated_on']),
        additions=additions,
        deletions=deletions,
        changed_files=changed_files,
        merge_date=merge_date,
        closed_date=closed_date,
        merged_by=merged_by,
        approvals=approvals,
        commits=commits,
        merge_commit=merge_commit,
        comments=comments,
    )
def get_pull_requests(
    self, normalized_repos: List[NormalizedRepository], server_git_instance_info,
) -> List[NormalizedPullRequest]:
    print('downloading bitbucket prs... ', end='', flush=True)
    for i, repo in enumerate(
        tqdm(normalized_repos, desc='downloading prs for repos', unit='repos'), start=1
    ):
        with agent_logging.log_loop_iters(logger, 'repo for pull requests', i, 1):
            try:
                pull_since = pull_since_date_for_repo(
                    server_git_instance_info, repo.project.login, repo.id, 'prs'
                )
                api_prs = self.client.get_pullrequests(repo.project.id, repo.id)

                if not api_prs:
                    agent_logging.log_and_print(
                        logger, logging.INFO, f'no prs found for repo {repo.id}. Skipping... '
                    )
                    continue

                for api_pr in tqdm(api_prs, desc=f'processing prs for {repo.name}', unit='prs'):
                    try:
                        # Skip PRs with missing data
                        if (
                            'source' not in api_pr
                            or 'repository' not in api_pr['source']
                            or not api_pr['source']['repository']
                            or 'destination' not in api_pr
                            or 'repository' not in api_pr['destination']
                            or not api_pr['destination']['repository']
                        ):
                            agent_logging.log_and_print_error_or_warning(
                                logger, logging.WARN, msg_args=[api_pr['id']], error_code=3030
                            )
                            continue

                        yield _normalize_pr(
                            self.client,
                            repo,
                            api_pr,
                            self.config.git_strip_text_content,
                            self.config.git_redact_names_and_urls,
                        )

                        # PRs are ordered newest to oldest; if this one is too
                        # old, we're done with this repo.  We yield one old one
                        # on purpose so that we handle the case correctly when
                        # the most recent PR is really old.
                        if pull_since and parser.parse(api_pr['updated_on']) < pull_since:
                            break

                    except Exception:
                        # if something happens when normalizing a PR, just keep
                        # going with the rest
                        agent_logging.log_and_print_error_or_warning(
                            logger,
                            logging.ERROR,
                            msg_args=[api_pr["id"], repo.id],
                            error_code=3011,
                            exc_info=True,
                        )

            except Exception:
                # if something happens when pulling PRs for a repo, just keep going
                agent_logging.log_and_print_error_or_warning(
                    logger, logging.ERROR, msg_args=[repo.id], error_code=3021, exc_info=True,
                )

    print('✓')
def load_and_dump_git(
    config: GitConfig,
    endpoint_git_instance_info: dict,
    outdir: str,
    compress_output_files: bool,
    git_connection,
):
    # use the unique git instance agent key to collate files
    instance_slug = endpoint_git_instance_info['slug']
    instance_key = endpoint_git_instance_info['key']
    outdir = f'{outdir}/git_{instance_key}'
    os.mkdir(outdir)

    try:
        if config.git_provider == 'bitbucket_server':
            # using old func method; todo: refactor to use GitAdapter
            from jf_agent.git.bitbucket_server import load_and_dump as load_and_dump_bbs

            load_and_dump_bbs(
                config=config,
                outdir=outdir,
                compress_output_files=compress_output_files,
                endpoint_git_instance_info=endpoint_git_instance_info,
                bb_conn=git_connection,
            )
        elif config.git_provider == 'bitbucket_cloud':
            from jf_agent.git.bitbucket_cloud_adapter import BitbucketCloudAdapter

            BitbucketCloudAdapter(
                config, outdir, compress_output_files, git_connection
            ).load_and_dump_git(endpoint_git_instance_info)
        elif config.git_provider == 'github':
            # using old func method; todo: refactor to use GitAdapter
            from jf_agent.git.github import load_and_dump as load_and_dump_gh

            load_and_dump_gh(
                config=config,
                outdir=outdir,
                compress_output_files=compress_output_files,
                endpoint_git_instance_info=endpoint_git_instance_info,
                git_conn=git_connection,
            )
        elif config.git_provider == 'gitlab':
            from jf_agent.git.gitlab_adapter import GitLabAdapter

            GitLabAdapter(config, outdir, compress_output_files, git_connection).load_and_dump_git(
                endpoint_git_instance_info
            )
        else:
            raise ValueError(f'unsupported git provider {config.git_provider}')

    except Exception as e:
        agent_logging.log_and_print_error_or_warning(
            logger,
            logging.ERROR,
            msg_args=[config.git_provider, e],
            error_code=3061,
            exc_info=True,
        )
        return {
            'type': 'Git',
            'instance': instance_slug,
            'instance_key': instance_key,
            'status': 'failed',
        }

    return {
        'type': 'Git',
        'instance': instance_slug,
        'instance_key': instance_key,
        'status': 'success',
    }