Example #1
def _download_jira_issues_segment(
    thread_num, jira_connection, jira_issue_ids_segment, field_spec, batch_size, q
):
    '''
    Each thread's target function.  Downloads 1/nth of the issues necessary, where
    n is the number of threads, a page at a time.  Puts the result of each page's
    download onto the shared queue.
    '''
    start_at = 0
    try:
        while start_at < len(jira_issue_ids_segment):
            issues, num_apparently_deleted = _download_jira_issues_page(
                jira_connection, jira_issue_ids_segment, field_spec, start_at, batch_size
            )

            issues_retrieved = len(issues) + num_apparently_deleted
            start_at += issues_retrieved
            if issues_retrieved == 0:
                break

            rows_to_insert = [(int(issue['id']), issue) for issue in issues]

            # TODO: configurable way to scrub things out of raw_issues here before we write them out.
            q.put(rows_to_insert)

        # sentinel to mark that this thread finished
        q.put(None)

    except BaseException as e:
        agent_logging.log_and_print_error_or_warning(
            logger, logging.ERROR, msg_args=[thread_num], error_code=3042, exc_info=True,
        )
        q.put(e)
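
For context, a minimal sketch (a hypothetical driver, not from this codebase) of how a segment downloader like the one above is typically coordinated: a consumer drains the shared queue, counting one None sentinel per thread and re-raising any exception a thread enqueued.

import queue
import threading

def drain_downloads(download_segment, segments, handle_rows):
    # Hypothetical driver: download_segment(thread_num, segment, q) is assumed
    # to put row batches on q, then a None sentinel (or an exception) when done.
    q = queue.Queue()
    for i, segment in enumerate(segments):
        threading.Thread(target=download_segment, args=(i, segment, q)).start()

    finished = 0
    while finished < len(segments):
        item = q.get()
        if item is None:                      # one thread finished cleanly
            finished += 1
        elif isinstance(item, BaseException):
            raise item                        # a thread failed; surface its error
        else:
            handle_rows(item)                 # a batch of (issue_id, issue) rows
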
Example #2
    def get_raw_result(self, url):
        # retry if rate-limited
        max_retries = 5
        for i in range(1, max_retries + 1):
            try:
                result = self.session.get(url)

                # HACK: a 403 sometimes comes back after we have been
                # rate-limited on certain URLs; retrying the GET once
                # gets past it.  There is likely a more elegant fix,
                # but each experiment takes about an hour to test, and
                # this works.
                if result.status_code == 403:
                    result = self.session.get(url)

                result.raise_for_status()
                return result
            except requests.exceptions.HTTPError as e:
                remaining_ratelimit = e.response.headers.get(
                    'X-RateLimit-Remaining')
                ratelimit_reset = e.response.headers.get('X-RateLimit-Reset')

                if remaining_ratelimit != '0':
                    # We hit a non-rate-limiting-related error.  Don't retry
                    raise

                if i >= max_retries:
                    agent_logging.log_and_print_error_or_warning(
                        logger,
                        logging.ERROR,
                        msg_args=[url, i],
                        error_code=3101,
                    )
                    raise

                # rate-limited!  Sleep until it's ok, then try again
                reset_time = datetime.fromtimestamp(int(ratelimit_reset),
                                                    pytz.utc)
                now = datetime.utcnow().replace(tzinfo=pytz.utc)
                reset_wait = reset_time - now

                reset_wait_in_seconds = reset_wait.total_seconds()

                # Sometimes GitHub gives a reset time in the
                # past.  In that case, wait for 5 minutes just in case.
                if reset_wait_in_seconds <= 0:
                    reset_wait_in_seconds = 300

                # Sometimes GitHub gives a reset time far in the
                # future.  But rate limits reset each hour, so don't
                # wait longer than that.
                reset_wait_in_seconds = min(reset_wait_in_seconds, 3600)
                reset_wait_str = str(timedelta(seconds=reset_wait_in_seconds))
                agent_logging.log_and_print_error_or_warning(
                    logger,
                    logging.WARNING,
                    msg_args=[reset_wait_str],
                    error_code=3091,
                )
                time.sleep(reset_wait_in_seconds)
                continue  # retry
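
The wait computation above can be exercised in isolation; a small sketch (an assumed helper, not part of the client) that reproduces the fallback-and-cap behavior:

from datetime import datetime
import pytz

def seconds_until_reset(ratelimit_reset: str) -> float:
    # Mirrors the logic above: trust the X-RateLimit-Reset epoch timestamp,
    # fall back to 5 minutes if it is in the past, and cap the wait at 1 hour.
    reset_time = datetime.fromtimestamp(int(ratelimit_reset), pytz.utc)
    now = datetime.utcnow().replace(tzinfo=pytz.utc)
    wait = (reset_time - now).total_seconds()
    if wait <= 0:
        wait = 300                # reset time already passed; wait 5 minutes
    return min(wait, 3600)        # rate limits reset hourly; never wait longer
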
Example #3
def download_customfieldoptions(jira_connection, project_ids):
    print('downloading jira custom field options... ', end='', flush=True)
    optionvalues = {}
    for project_id in project_ids:
        try:
            meta = jira_connection.createmeta(
                projectIds=[project_id], expand='projects.issuetypes.fields'
            )
        except JIRAError:
            agent_logging.log_and_print_error_or_warning(
                logger, logging.WARNING, error_code=3072, exc_info=False
            )
            return []

        # Custom values are buried deep in the createmeta response:
        #     projects -> issuetypes -> fields -> allowedValues
        for project in meta['projects']:
            for issue_type in project['issuetypes']:
                for field_key, field in issue_type['fields'].items():
                    if 'key' in field:
                        field_key = field['key']
                    # same field may end up in multiple issue types (bug, task, etc),
                    # so check if we've already added it
                    if field_key not in optionvalues and _is_option_field(field):
                        optionvalues[field_key] = field['allowedValues']

    result = [{'field_key': k, 'raw_json': v} for k, v in optionvalues.items()]
    print('✓')
    return result
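
The _is_option_field helper is not shown in this excerpt; a plausible sketch (the exact check is an assumption, not the source's code) is that it tests whether the field's schema describes an option-type field carrying allowedValues:

def _is_option_field(field):
    # Hypothetical reconstruction: treat a field as an "option" field if its
    # schema is option-typed (or an array of options) and it carries choices.
    schema = field.get('schema', {})
    is_option_type = schema.get('type') == 'option' or schema.get('items') == 'option'
    return is_option_type and 'allowedValues' in field
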
Example #4
    def get_raw_result(self, url, rate_limit_realm=None):
        start = datetime.utcnow()
        while True:
            try:
                with self.rate_limiter.limit(rate_limit_realm):
                    result = self.session.get(url)
                    result.raise_for_status()
                    return result
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 429:
                    # rate-limited in spite of trying to throttle
                    # requests.  We don't know how long we need to wait,
                    # so just try again in 30 seconds, unless it's
                    # already been too long
                    if (datetime.utcnow() - start) < timedelta(hours=1):
                        agent_logging.log_and_print(
                            logger, logging.INFO, 'Retrying in 30 seconds...',
                        )
                        time.sleep(30)
                        continue
                    else:
                        agent_logging.log_and_print_error_or_warning(
                            logger, logging.ERROR, error_code=3151
                        )
                raise
Example #5
def _get_repos_list_in_jira(issues_to_scan, jira_connection):

    print('Scanning Jira issues for Git repos...')
    missing_repositories = {}

    for issue_id, instance_types in issues_to_scan.items():
        for instance_type in instance_types:
            try:
                repositories = _scan_jira_issue_for_repo_data(
                    jira_connection, issue_id, instance_type
                )
            except JIRAError as e:
                if e.status_code == 403:
                    agent_logging.log_and_print_error_or_warning(
                        logger, logging.ERROR, error_code=2122,
                    )
                    return []
                raise  # any other Jira error should bubble up

            for repo in repositories:
                repo_name = repo['name']
                repo_url = repo['url']

                if repo_name not in missing_repositories:
                    missing_repositories[repo_name] = {
                        'name': repo_name,
                        'url': repo_url,
                        'instance_type': instance_type,
                    }
    return missing_repositories
Example #6
    def upload_file_from_thread(filename, path_to_obj, signed_url):
        try:
            upload_file(filename, path_to_obj, signed_url)
        except Exception as e:
            thread_exceptions.append(e)
            agent_logging.log_and_print_error_or_warning(
                logger, logging.ERROR, msg_args=[filename], error_code=3000, exc_info=True,
            )
Example #7
def get_basic_jira_connection(config, creds):
    try:
        return _get_raw_jira_connection(config, creds)
    except Exception as e:
        agent_logging.log_and_print_error_or_warning(logger,
                                                     logging.ERROR,
                                                     msg_args=[e],
                                                     error_code=2102,
                                                     exc_info=True)
Example #8
    @contextmanager
    def limit(self, realm):
        # if realm is None, don't rate limit, just execute the thing
        if realm is None:
            yield
            return

        max_calls, period_secs = self.realm_config[realm]
        start = datetime.utcnow()
        while True:
            # decide whether to sleep or call, inside the lock
            with self.lock:
                sleep_until, calls_made = self._call_available(
                    realm, max_calls)
                if not sleep_until:
                    self._record_call(realm, period_secs)

            if not sleep_until:
                try:
                    # stuff within the context manager happens here
                    yield
                    return
                except requests.exceptions.HTTPError as e:
                    if e.response.status_code == 429:
                        # Got rate limited anyway!
                        agent_logging.log_and_print_error_or_warning(
                            logger,
                            logging.ERROR,
                            msg_args=[calls_made, max_calls, realm],
                            error_code=3010,
                        )
                    raise

            agent_logging.log_and_print(
                logger,
                logging.INFO,
                f'Rate limiter: exceeded {max_calls} calls in {period_secs} seconds for {realm}!',
            )
            if (sleep_until - start) >= timedelta(seconds=self.timeout_secs):
                agent_logging.log_and_print_error_or_warning(
                    logger,
                    logging.ERROR,
                    msg_args=[self.timeout_secs],
                    error_code=3020)
                raise Exception('Rate limit timeout')

            sleep_period_secs = (sleep_until -
                                 datetime.utcnow()).total_seconds()
            if sleep_period_secs > 0:  # it's possible that sleep_until was a couple ms ago
                agent_logging.log_and_print(
                    logger,
                    logging.INFO,
                    f'Sleeping for {sleep_period_secs:.1f} secs ({sleep_period_secs / 60.0:.1f} mins)',
                )
                time.sleep(sleep_period_secs)
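
Because limit() yields, it only works as a with-statement target when wrapped in contextlib's @contextmanager. A self-contained sketch of the same pattern, under the assumption that the real class tracks timestamped calls per realm:

import threading
import time
from collections import defaultdict, deque
from contextlib import contextmanager

class TinyRateLimiter:
    # Minimal illustration of the pattern above, not the agent's class: track
    # recent call timestamps per realm and sleep until one ages out of the window.
    def __init__(self, realm_config):
        self.realm_config = realm_config  # realm -> (max_calls, period_secs)
        self.calls = defaultdict(deque)   # realm -> timestamps of recent calls
        self.lock = threading.Lock()

    @contextmanager
    def limit(self, realm):
        if realm is None:  # no realm: execute without throttling
            yield
            return
        max_calls, period_secs = self.realm_config[realm]
        while True:
            with self.lock:
                now = time.monotonic()
                window = self.calls[realm]
                while window and window[0] <= now - period_secs:
                    window.popleft()       # drop calls outside the window
                if len(window) < max_calls:
                    window.append(now)     # record this call and proceed
                    break
                sleep_for = window[0] + period_secs - now
            time.sleep(sleep_for)          # budget exhausted; wait for headroom
        yield

Usage mirrors Example #4: with limiter.limit('some_realm'): make the request inside the block.
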
Example #9
    def _strip_history_items(items):
        # Skip items that are a change of a field that's filtered out
        for i in items:
            field_id_field = _get_field_identifier(i)
            if not field_id_field:
                agent_logging.log_and_print_error_or_warning(
                    logger=logger, level=logging.WARNING, error_code=3082, msg_args=[i.keys()],
                )
            if include_fields and i.get(field_id_field) not in include_fields:
                continue
            if i.get(field_id_field) in exclude_fields:
                continue
            yield i
Example #10
    def get_commit_by_ref(self, full_repo_name, ref):
        url = f'{self.base_url}/repos/{full_repo_name}/commits/{ref}'
        try:
            raw = self.get_raw_result(url)
            return raw.json()
        except HTTPError as e:
            if e.response.status_code in (422,):
                agent_logging.log_and_print_error_or_warning(
                    logger,
                    logging.WARNING,
                    msg_args=[e.response.status_code, ref, full_repo_name],
                    error_code=3121,
                )
                return None
            raise  # other HTTP errors should bubble up
Example #11
def get_git_client(config: GitConfig, git_creds: dict,
                   skip_ssl_verification: bool):
    try:
        if config.git_provider == BBS_PROVIDER:
            return Stash(
                base_url=config.git_url,
                username=git_creds['bb_server_username'],
                password=git_creds['bb_server_password'],
                verify=not skip_ssl_verification,
                session=retry_session(),
            )

        if config.git_provider == BBC_PROVIDER:
            return BitbucketCloudClient(
                server_base_uri=config.git_url,
                username=git_creds['bb_cloud_username'],
                app_password=git_creds['bb_cloud_app_password'],
                session=retry_session(),
            )

        if config.git_provider == GH_PROVIDER:
            return GithubClient(
                base_url=config.git_url,
                token=git_creds['github_token'],
                verify=not skip_ssl_verification,
                session=retry_session(),
            )
        if config.git_provider == GL_PROVIDER:
            return GitLabClient(
                server_url=config.git_url,
                private_token=git_creds['gitlab_token'],
                verify=not skip_ssl_verification,
                per_page_override=config.gitlab_per_page_override,
                session=retry_session(),
            )

    except Exception as e:
        agent_logging.log_and_print_error_or_warning(
            logger,
            logging.ERROR,
            msg_args=[config.git_provider, e],
            error_code=2101,
            exc_info=True,
        )
        return

    # if the git provider is none of the above, raise an error
    raise ValueError(f'unsupported git provider {config.git_provider}')
Example #12
def _download_jira_issues_page(
    jira_connection, jira_issue_ids_segment, field_spec, start_at, batch_size
):
    '''
    Returns a tuple: (issues_downloaded, num_issues_apparently_deleted)
    '''
    get_changelog = True

    while batch_size > 0:
        search_params = {
            'jql': f"id in ({','.join(str(iid) for iid in jira_issue_ids_segment)}) order by id asc",
            'fields': field_spec,
            'expand': ['renderedFields'],
            'startAt': start_at,
            'maxResults': batch_size,
        }
        if get_changelog:
            search_params['expand'].append('changelog')

        try:
            resp_json = json_loads(
                jira_connection._session.post(
                    url=jira_connection._get_url('search'), data=json.dumps(search_params)
                )
            )
            return _expand_changelog(resp_json['issues'], jira_connection), 0

        except (json.decoder.JSONDecodeError, JIRAError) as e:
            if hasattr(e, 'status_code') and e.status_code == 429:
                # This is rate limiting ("Too many requests")
                raise

            batch_size = int(batch_size / 2)
            agent_logging.log_and_print_error_or_warning(
                logger, logging.WARNING, msg_args=[e, batch_size], error_code=3052, exc_info=True,
            )
            if batch_size == 0:
                if re.match(r"A value with ID .* does not exist for the field 'id'", e.text):
                    return [], 1
                elif not get_changelog:
                    agent_logging.log_and_print_error_or_warning(
                        logger, logging.WARNING, msg_args=[search_params], error_code=3062,
                    )
                    return [], 0
                else:
                    get_changelog = False
                    batch_size = 1
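
The degradation strategy above (halve the batch on failure, drop optional work once the batch hits zero) can be distilled; a stand-alone sketch, with fetch_page as a hypothetical callable:

def fetch_with_shrinking_batch(fetch_page, start_at, batch_size):
    # Sketch of the fallback above: on failure, halve the batch size and retry,
    # on the theory that a smaller ask may keep the server from choking.
    while batch_size > 0:
        try:
            return fetch_page(start_at, batch_size)  # hypothetical: returns a list
        except Exception:
            batch_size //= 2
    raise RuntimeError('could not fetch even a single-item batch')
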
Example #13
    def get_all_repos(self, org):
        url = f'{self.base_url}/orgs/{org}/repos'
        for m in self.get_all_pages(url):
            try:
                yield self.get_json(m['url'])
            except requests.exceptions.HTTPError as e:
                # non-403 should bubble up
                if e.response.status_code != 403:
                    raise

                # We've seen some strange behavior with GHE, where we can get a 403 for
                # a repo that comes back in the list.  Skip them.
                agent_logging.log_and_print_error_or_warning(
                    logger,
                    logging.WARNING,
                    msg_args=[m["url"]],
                    error_code=3081,
                )
Example #14
    def project_is_accessible(project_id):
        try:
            jira_connection.search_issues(f'project = {project_id}', fields=['id'])
            return True
        except JIRAError as e:
            # Handle zombie projects that appear in the project list
            # but are not actually accessible.  (The odd formatting of
            # the condition below is Black's doing.)
            if (
                e.status_code == 400
                and e.text
                == f"A value with ID '{project_id}' does not exist for the field 'project'."
            ):
                agent_logging.log_and_print_error_or_warning(
                    logger, logging.ERROR, msg_args=[project_id], error_code=2112,
                )
                return False
            else:
                raise
Example #15
def log_and_print_request_error(e, action='making request', log_as_exception=False):
    try:
        response_code = e.response_code
    except AttributeError:
        # if the request error is a retry error, we won't have the code
        response_code = ''

    error_name = type(e).__name__

    if log_as_exception:
        agent_logging.log_and_print_error_or_warning(
            logger,
            logging.ERROR,
            msg_args=[error_name, response_code, action, e],
            error_code=3131,
            exc_info=True,
        )
    else:
        agent_logging.log_and_print_error_or_warning(
            logger, logging.WARNING, msg_args=[error_name, response_code, action], error_code=3141
        )
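
A hypothetical call site for this helper (the wrapper name and parameters are illustrative, not from the source): ordinary request failures get logged as warnings, with full exception info reserved for a final attempt.

import requests

def fetch_with_logged_errors(session, url, is_final_attempt=False):
    # Hypothetical caller: log ordinary request failures as warnings via the
    # helper above, but include exception info when this is the last attempt.
    try:
        resp = session.get(url, timeout=30)
        resp.raise_for_status()
        return resp
    except requests.exceptions.RequestException as e:
        log_and_print_request_error(
            e, action='fetching data', log_as_exception=is_final_attempt
        )
        raise
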
Example #16
    def _download_some(thread_num, start_at, end_at):
        batch_size = 1000
        try:
            while start_at < min(end_at, total_num_issues):
                try:
                    api_resp = jira_connection.search_issues(
                        f'{issue_jql} order by id asc',
                        fields=['updated'],
                        startAt=start_at,
                        maxResults=batch_size,
                    )
                except (JIRAError, KeyError) as e:
                    if hasattr(e, 'status_code') and e.status_code < 500:
                        # something wrong with our request; re-raise
                        raise

                    # We have seen sporadic server-side flakiness here. Sometimes Jira Server (but not
                    # Jira Cloud as far as we've seen) will return a 200 response with an empty JSON
                    # object instead of a JSON object with an "issues" key, which results in the
                    # `search_issues()` function in the Jira library throwing a KeyError.
                    #
                    # Sometimes both cloud and server will return a 5xx.
                    #
                    # In either case, reduce the maxResults parameter and try again, on the theory that
                    # a smaller ask will prevent the server from choking.
                    batch_size = int(batch_size / 2)
                    if batch_size > 0:
                        agent_logging.log_and_print_error_or_warning(
                            logger, logging.WARNING, msg_args=[batch_size], error_code=3012,
                        )
                        continue
                    else:
                        agent_logging.log_and_print_error_or_warning(
                            logger, logging.ERROR, error_code=3022,
                        )
                        raise

                issue_metadata = {
                    int(iss.id): IssueMetadata(iss.key, parser.parse(iss.fields.updated))
                    for iss in api_resp
                }
                all_issue_metadata.update(issue_metadata)
                start_at += len(issue_metadata)

        except Exception as e:
            thread_exceptions[thread_num] = e
            agent_logging.log_and_print_error_or_warning(
                logger,
                logging.ERROR,
                msg_args=[thread_num, traceback.format_exc()],
                error_code=3032,
            )
Example #17
def load_and_dump_jira(config, endpoint_jira_info, jira_connection):
    try:
        write_file(
            config.outdir,
            'jira_fields',
            config.compress_output_files,
            download_fields(jira_connection, config.jira_include_fields,
                            config.jira_exclude_fields),
        )

        projects_and_versions = download_projects_and_versions(
            jira_connection,
            config.jira_include_projects,
            config.jira_exclude_projects,
            config.jira_include_project_categories,
            config.jira_exclude_project_categories,
        )

        project_ids = {proj['id'] for proj in projects_and_versions}
        write_file(
            config.outdir,
            'jira_projects_and_versions',
            config.compress_output_files,
            projects_and_versions,
        )

        write_file(
            config.outdir,
            'jira_users',
            config.compress_output_files,
            download_users(jira_connection, config.jira_gdpr_active),
        )
        write_file(
            config.outdir,
            'jira_resolutions',
            config.compress_output_files,
            download_resolutions(jira_connection),
        )
        write_file(
            config.outdir,
            'jira_issuetypes',
            config.compress_output_files,
            download_issuetypes(jira_connection, project_ids),
        )
        write_file(
            config.outdir,
            'jira_linktypes',
            config.compress_output_files,
            download_issuelinktypes(jira_connection),
        )
        write_file(
            config.outdir,
            'jira_priorities',
            config.compress_output_files,
            download_priorities(jira_connection),
        )

        def download_and_write_boards_and_sprints():
            boards, sprints, links = download_boards_and_sprints(
                jira_connection, project_ids, config.jira_download_sprints)
            write_file(config.outdir, 'jira_boards',
                       config.compress_output_files, boards)
            write_file(config.outdir, 'jira_sprints',
                       config.compress_output_files, sprints)
            write_file(config.outdir, 'jira_board_sprint_links',
                       config.compress_output_files, links)

        download_and_write_boards_and_sprints()

        issue_metadata_from_jira = download_all_issue_metadata(
            jira_connection,
            project_ids,
            config.jira_earliest_issue_dt,
            config.jira_issue_download_concurrent_threads,
            config.jira_issue_jql,
        )

        issue_metadata_from_jellyfish = {
            int(issue_id): IssueMetadata(
                issue_info['key'],
                datetime.fromisoformat(
                    issue_info['updated']),  # already includes TZ info
            )
            for issue_id, issue_info in
            endpoint_jira_info['issue_metadata'].items()
        }

        issue_metadata_addl_from_jellyfish = {
            int(issue_id): (
                issue_info.get('epic_link_field_issue_key'),
                issue_info.get('parent_field_issue_key'),
            )
            for issue_id, issue_info in
            endpoint_jira_info['issue_metadata'].items()
        }

        (
            missing_issue_ids,
            _,
            out_of_date_issue_ids,
            deleted_issue_ids,
        ) = detect_issues_needing_sync(issue_metadata_from_jira,
                                       issue_metadata_from_jellyfish)

        issue_ids_to_download = list(
            missing_issue_ids.union(out_of_date_issue_ids))

        @diagnostics.capture_timing()
        @agent_logging.log_entry_exit(logger)
        def download_and_write_issues():
            return download_and_write_streaming(
                config.outdir,
                'jira_issues',
                config.compress_output_files,
                generator_func=download_necessary_issues,
                generator_func_args=(
                    jira_connection,
                    issue_ids_to_download,
                    config.jira_include_fields,
                    config.jira_exclude_fields,
                    config.jira_issue_batch_size,
                    config.jira_issue_download_concurrent_threads,
                ),
                item_id_dict_key='id',
                addl_info_dict_key='key',
            )

        downloaded_issue_info = download_and_write_issues()

        issue_ids_needing_re_download = detect_issues_needing_re_download(
            downloaded_issue_info,
            issue_metadata_from_jellyfish,
            issue_metadata_addl_from_jellyfish,
        )

        @diagnostics.capture_timing()
        @agent_logging.log_entry_exit(logger)
        def download_and_write_issues_needing_re_download():
            return download_and_write_streaming(
                config.outdir,
                'jira_issues_re_downloaded',
                config.compress_output_files,
                generator_func=download_necessary_issues,
                generator_func_args=(
                    jira_connection,
                    list(issue_ids_needing_re_download),
                    config.jira_include_fields,
                    config.jira_exclude_fields,
                    config.jira_issue_batch_size,
                    config.jira_issue_download_concurrent_threads,
                ),
                item_id_dict_key='id',
                addl_info_dict_key='key',
            )

        re_downloaded_issue_info = download_and_write_issues_needing_re_download()

        all_downloaded_issue_ids = [
            int(i[0])
            for i in chain(downloaded_issue_info, re_downloaded_issue_info)
        ]

        write_file(
            config.outdir,
            'jira_issue_ids_downloaded',
            config.compress_output_files,
            all_downloaded_issue_ids,
        )
        write_file(
            config.outdir,
            'jira_issue_ids_deleted',
            config.compress_output_files,
            list(deleted_issue_ids),
        )

        if config.jira_download_worklogs:
            write_file(
                config.outdir,
                'jira_worklogs',
                config.compress_output_files,
                download_worklogs(jira_connection, all_downloaded_issue_ids),
            )

        write_file(
            config.outdir,
            'jira_customfieldoptions',
            config.compress_output_files,
            download_customfieldoptions(jira_connection, project_ids),
        )

        write_file(
            config.outdir,
            'jira_statuses',
            config.compress_output_files,
            download_statuses(jira_connection),
        )

        return {'type': 'Jira', 'status': 'success'}

    except Exception as e:
        agent_logging.log_and_print_error_or_warning(logger,
                                                     logging.ERROR,
                                                     msg_args=[e],
                                                     error_code=3002,
                                                     exc_info=True)
        return {'type': 'Jira', 'status': 'failed'}
Example #18
    def get_repos(
        self,
        normalized_projects: List[NormalizedProject],
    ) -> List[NormalizedRepository]:
        print('downloading gitlab repos... ', end='', flush=True)

        nrm_repos: List[NormalizedRepository] = []
        for nrm_project in normalized_projects:

            repos_that_failed_to_download = []

            for i, api_repo in enumerate(
                    tqdm(
                        self.client.list_group_projects(nrm_project.id),
                        desc=f'downloading repos for {nrm_project.name}',
                        unit='repos',
                    ),
                    start=1,
            ):
                if (self.config.git_include_repos
                        # For GitLab, git_include_repos holds IDs instead of names (probably unintentionally), so
                        # no need to be case insensitive
                        and api_repo.id not in self.config.git_include_repos):
                    if self.config.git_verbose:
                        agent_logging.log_and_print(
                            logger,
                            logging.INFO,
                            f'skipping repo {api_repo.id} because not in include_repos...',
                        )
                    continue  # skip this repo

                if (self.config.git_exclude_repos
                        # For GitLab, git_exclude_repos holds IDs instead of names (probably unintentionally), so
                        # no need to be case insensitive
                        and api_repo.id in self.config.git_exclude_repos):
                    if self.config.git_verbose:
                        agent_logging.log_and_print(
                            logger,
                            logging.INFO,
                            f'skipping repo {api_repo.id} because in exclude_repos...',
                        )
                    continue  # skip this repo

                try:
                    nrm_branches = self.get_branches(api_repo)
                except gitlab.exceptions.GitlabListError:
                    # this is likely due to fine-tuned permissions defined on the repo (gitlab project)
                    # that is not allowing us to access to its repo details. if this happens, make a note of it and
                    # don't blow up the rest of the pull
                    repos_that_failed_to_download.append(api_repo)
                    continue  # skip this repo

                nrm_repos.append(
                    _normalize_repo(api_repo, nrm_branches, nrm_project,
                                    self.config.git_redact_names_and_urls))

            # if there were any repositories we had issues with... print them out now.
            if repos_that_failed_to_download:

                def __repo_log_string(api_repo):
                    # build log string
                    name = (api_repo.name
                            if not self.config.git_redact_names_and_urls else
                            _repo_redactor.redact_name(api_repo.name))
                    return {"id": api_repo.id, "name": name}.__str__()

                repos_failed_string = ", ".join([
                    __repo_log_string(api_repo)
                    for api_repo in repos_that_failed_to_download
                ])
                total_failed = len(repos_that_failed_to_download)

                agent_logging.log_and_print_error_or_warning(
                    logger,
                    logging.WARNING,
                    msg_args=[
                        total_failed, nrm_project.id, repos_failed_string
                    ],
                    error_code=2201,
                )

        print('✓')
        if not nrm_repos:
            raise ValueError(
                'No repos found. Make sure your token has appropriate access to GitLab and check your configuration of repos to pull.'
            )
        return nrm_repos
Example #19
def obtain_config(args) -> ValidatedConfig:
    if args.since:
        print(
            'WARNING: The -s / --since argument is deprecated and has no effect. You can remove its setting.'
        )
    if args.until:
        print(
            'WARNING: The -u / --until argument is deprecated and has no effect. You can remove its setting.'
        )

    jellyfish_api_base = args.jellyfish_api_base
    config_file_path = args.config_file

    run_mode = args.mode
    if run_mode not in VALID_RUN_MODES:
        print(
            f'''ERROR: Mode should be one of "{', '.join(VALID_RUN_MODES)}"''')
        raise BadConfigException()

    run_mode_includes_download = run_mode in ('download_and_send',
                                              'download_only')
    run_mode_includes_send = run_mode in ('download_and_send', 'send_only')
    run_mode_is_print_all_jira_fields = run_mode == 'print_all_jira_fields'
    run_mode_is_print_apparently_missing_git_repos = (
        run_mode == 'print_apparently_missing_git_repos')

    try:
        with open(config_file_path, 'r') as yaml_file:
            yaml_config = yaml.safe_load(yaml_file)
    except FileNotFoundError:
        print(f'ERROR: Config file not found at "{config_file_path}"')
        raise BadConfigException()

    yaml_conf_global = yaml_config.get('global', {})
    skip_ssl_verification = yaml_conf_global.get('no_verify_ssl', False)
    send_agent_config = yaml_conf_global.get('send_agent_config', False)

    # jira configuration
    jira_config = yaml_config.get('jira', {})
    jira_url = jira_config.get('url', None)

    jira_earliest_issue_dt = jira_config.get('earliest_issue_dt', None)
    if jira_earliest_issue_dt is not None and type(
            jira_earliest_issue_dt) != date:
        print(
            'ERROR: Invalid format for earliest_issue_dt; should be YYYY-MM-DD'
        )
        raise BadConfigException()

    jira_issue_download_concurrent_threads = jira_config.get(
        'issue_download_concurrent_threads', 10)
    jira_include_fields = set(jira_config.get('include_fields', []))
    jira_exclude_fields = set(jira_config.get('exclude_fields', []))
    jira_issue_batch_size = jira_config.get('issue_batch_size', 100)
    jira_gdpr_active = jira_config.get('gdpr_active', False)
    jira_include_projects = set(jira_config.get('include_projects', []))
    jira_exclude_projects = set(jira_config.get('exclude_projects', []))
    jira_include_project_categories = set(
        jira_config.get('include_project_categories', []))
    jira_exclude_project_categories = set(
        jira_config.get('exclude_project_categories', []))
    jira_issue_jql = jira_config.get('issue_jql', '')
    jira_download_worklogs = jira_config.get('download_worklogs', True)
    jira_download_sprints = jira_config.get('download_sprints', True)

    # warn if any of the recommended fields are missing or excluded
    if jira_include_fields:
        missing_required_fields = set(required_jira_fields) - set(
            jira_include_fields)
        if missing_required_fields:
            agent_logging.log_and_print_error_or_warning(
                logger,
                logging.WARNING,
                msg_args=[list(missing_required_fields)],
                error_code=2132,
            )
    if jira_exclude_fields:
        excluded_required_fields = set(required_jira_fields).intersection(
            set(jira_exclude_fields))
        if excluded_required_fields:
            agent_logging.log_and_print_error_or_warning(
                logger,
                logging.WARNING,
                msg_args=[list(excluded_required_fields)],
                error_code=2142,
            )

    git_configs: List[GitConfig] = _get_git_config_from_yaml(yaml_config)

    now = datetime.utcnow()

    if not jira_url and not len(git_configs):
        print('ERROR: Config file must provide either a Jira or Git URL.')
        raise BadConfigException()

    if skip_ssl_verification:
        print('WARNING: Disabling SSL certificate validation')
        # To silence "Unverified HTTPS request is being made."
        urllib3.disable_warnings()

    if run_mode_includes_download:
        if args.prev_output_dir:
            print(
                'ERROR: Provide output_basedir if downloading, not prev_output_dir'
            )
            raise BadConfigException()

    output_basedir = args.output_basedir
    output_dir_timestamp = now.strftime('%Y%m%d_%H%M%S')
    outdir = os.path.join(output_basedir, output_dir_timestamp)
    try:
        os.makedirs(outdir, exist_ok=False)
    except FileExistsError:
        print(f"ERROR: Output dir {outdir} already exists")
        raise BadConfigException()
    except Exception:
        print(
            f"ERROR: Couldn't create output dir {outdir}.  Make sure the output directory you mapped as a docker volume exists on your host."
        )
        raise BadConfigException()

    if run_mode_is_print_all_jira_fields and not jira_url:
        print(f'ERROR: Must provide jira_url for mode {run_mode}')
        raise BadConfigException()

    if run_mode_includes_send and not run_mode_includes_download:
        if not args.prev_output_dir:
            print('ERROR: prev_output_dir must be provided if not downloading')
            raise BadConfigException()

        if not os.path.isdir(args.prev_output_dir):
            print(
                f'ERROR: prev_output_dir ("{args.prev_output_dir}") is not a directory'
            )
            raise BadConfigException()

        outdir = args.prev_output_dir

    # If we're only downloading, do not compress the output files (so they can be more easily inspected)
    compress_output_files = (False if
                             (run_mode_includes_download
                              and not run_mode_includes_send) else True)

    if run_mode_is_print_apparently_missing_git_repos:
        if not len(git_configs):
            print(f'ERROR: {run_mode} requires git configuration.')
            raise BadConfigException()

        if not (jira_url and git_configs[0].git_url):
            print(
                f'ERROR: Must provide jira_url and git_url for mode {run_mode}'
            )
            raise BadConfigException()

        for git_config in git_configs:
            if git_config.git_redact_names_and_urls:
                print(
                    f'ERROR: git_redact_names_and_urls must be False for mode {run_mode}'
                )
                raise BadConfigException()

    return ValidatedConfig(
        run_mode,
        run_mode_includes_download,
        run_mode_includes_send,
        run_mode_is_print_all_jira_fields,
        run_mode_is_print_apparently_missing_git_repos,
        jira_url,
        jira_earliest_issue_dt,
        jira_issue_download_concurrent_threads,
        jira_include_fields,
        jira_exclude_fields,
        jira_issue_batch_size,
        jira_gdpr_active,
        jira_include_projects,
        jira_exclude_projects,
        jira_include_project_categories,
        jira_exclude_project_categories,
        jira_issue_jql,
        jira_download_worklogs,
        jira_download_sprints,
        git_configs,  # array of GitConfig
        outdir,
        compress_output_files,
        jellyfish_api_base,
        skip_ssl_verification,
        send_agent_config,
    )
Example #20
def download_boards_and_sprints(jira_connection, project_ids, download_sprints):
    boards_by_id = {}  # De-dup by id, since the same board might come back from more than one query
    for project_id in tqdm(project_ids, desc='downloading jira boards...', file=sys.stdout):
        b_start_at = 0
        while True:
            try:
                # Can't use the jira_connection's .boards() method, since it doesn't support all the query params
                project_boards = jira_connection._session.get(
                    url=f'{jira_connection._options["server"]}/rest/agile/1.0/board',
                    params={
                        'maxResults': 50,
                        'startAt': b_start_at,
                        'type': 'scrum',
                        'includePrivate': 'false',
                        'projectKeyOrId': project_id,
                    },
                ).json()['values']
            except JIRAError as e:
                if e.status_code == 400:
                    agent_logging.log_and_print_error_or_warning(
                        logger, logging.ERROR, msg_args=[project_id], error_code=2202,
                    )
                    break
                raise

            if not project_boards:
                break

            b_start_at += len(project_boards)
            boards_by_id.update({board['id']: board for board in project_boards})

    links = []
    sprints = {}
    if download_sprints:
        for b in tqdm(boards_by_id.values(), desc='downloading jira sprints', file=sys.stdout):
            s_start_at = 0
            sprints_for_board = []
            while True:
                batch = None
                try:
                    batch = jira_connection.sprints(
                        # ignore future sprints
                        board_id=b['id'],
                        startAt=s_start_at,
                        maxResults=50,
                        state='active,closed',
                    )
                except JIRAError as e:
                    # JIRA returns 500 errors for various reasons: board is
                    # misconfigured; "failed to execute search"; etc.  Just
                    # skip and move on
                    if e.status_code == 500 or e.status_code == 404:
                        print(f"Couldn't get sprints for board {b['id']}.  Skipping...")
                    else:
                        raise

                if not batch:
                    break
                s_start_at += len(batch)
                sprints_for_board.extend(batch)

            links.append({'board_id': b['id'], 'sprint_ids': [s.id for s in sprints_for_board]})
            sprints.update({s.id: s for s in sprints_for_board})

    return list(boards_by_id.values()), [s.raw for s in sprints.values()], links
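
Both loops above follow Jira's startAt pagination contract; a generic sketch of that loop (fetch_batch is a hypothetical callable standing in for the API calls above):

def iter_paginated(fetch_batch, page_size=50):
    # Generic form of the startAt loops above: ask for page_size items at a
    # time, advance by however many came back, and stop on an empty batch.
    start_at = 0
    while True:
        batch = fetch_batch(start_at, page_size)  # hypothetical: returns a list
        if not batch:
            return
        yield from batch
        start_at += len(batch)
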
Example #21
def _normalize_pr(
    client, repo, api_pr, strip_text_content: bool, redact_names_and_urls: bool,
):

    # Process the PR's diff to get additions, deletions, changed_files
    additions, deletions, changed_files = None, None, None
    try:
        diff_str = client.pr_diff(repo.project.id, repo.id, api_pr['id'])
        additions, deletions, changed_files = _calculate_diff_counts(diff_str)
        if additions is None:
            agent_logging.log_and_print_error_or_warning(
                logger, logging.WARN, msg_args=[api_pr["id"], repo.id], error_code=3031,
            )
    except requests.exceptions.RetryError:
        # Server threw a 500 on the request for the diff and we started retrying;
        # this happens consistently for certain PRs (if the PR has no commits yet). Just proceed with no diff
        pass
    except requests.exceptions.HTTPError as e:
        if e.response.status_code >= 500:
            # Server threw a 500 on the request for the diff; this happens consistently for certain PRs
            # (if the PR has no commits yet). Just proceed with no diff
            pass
        elif e.response.status_code == 401:
            # Server threw a 401 on the request for the diff; not sure why this would be, but it seems rare
            agent_logging.log_and_print_error_or_warning(
                logger, logging.WARN, msg_args=[api_pr["id"], repo.id], error_code=3041,
            )
        else:
            # Some other HTTP error happened; Re-raise
            raise
    except UnicodeDecodeError:
        # Occasional diffs seem to be invalid UTF-8
        agent_logging.log_and_print_error_or_warning(
            logger, logging.WARN, msg_args=[api_pr["id"], repo.id], error_code=3051,
        )

    # Comments
    comments = [
        NormalizedPullRequestComment(
            user=_normalize_user(c['user']),
            body=sanitize_text(c['content']['raw'], strip_text_content),
            created_at=parser.parse(c['created_on']),
        )
        for c in client.pr_comments(repo.project.id, repo.id, api_pr['id'])
    ]

    # Crawl activity for approvals, merge and closed dates
    approvals = []
    merge_date = None
    merged_by = None
    closed_date = None
    try:
        activity = list(client.pr_activity(repo.project.id, repo.id, api_pr['id']))
        approvals = [
            NormalizedPullRequestReview(
                user=_normalize_user(approval['user']),
                foreign_id=i,  # There's no true ID (unlike with GitHub); use a per-PR sequence
                review_state='APPROVED',
            )
            for i, approval in enumerate(
                (a['approval'] for a in activity if 'approval' in a), start=1,
            )
        ]

        # Obtain the merge_date and merged_by by crawling over the activity history
        pr_updates = [a for a in activity if 'update' in a]
        for a in sorted(pr_updates, key=lambda x: x['update']['date'], reverse=True):
            if a['update']['state'] == 'MERGED':
                merge_date = parser.parse(a['update']['date'])
                merged_by = _normalize_user(a['update']['author'])
                break

        # Obtain the closed_date by crawling over the activity history, looking for the
        # first transition to one of the closed states ('MERGED' or 'DECLINED')
        for a in sorted(pr_updates, key=lambda x: x['update']['date'], reverse=False):
            if a['update']['state'] in ('MERGED', 'DECLINED'):
                closed_date = parser.parse(a['update']['date'])
                break
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 401:
            # not authorized to see activity; skip it
            pass
        else:
            raise

    # Commits
    commits = [
        _normalize_commit(
            c,
            repo,
            api_pr['destination']['branch']['name'],
            strip_text_content,
            redact_names_and_urls,
        )
        for c in client.pr_commits(repo.project.id, repo.id, api_pr['id'])
    ]
    merge_commit = None
    if (
        api_pr['state'] == 'MERGED'
        and 'merge_commit' in api_pr
        and api_pr['merge_commit']
        and api_pr['merge_commit'].get('hash')
    ):
        api_merge_commit = client.get_commit(
            repo.project.id, api_pr['source']['repository']['uuid'], api_pr['merge_commit']['hash']
        )
        merge_commit = _normalize_commit(
            api_merge_commit,
            repo,
            api_pr['destination']['branch']['name'],
            strip_text_content,
            redact_names_and_urls,
        )

    # Repo links
    base_repo = _normalize_short_form_repo(
        api_pr['destination']['repository'], redact_names_and_urls
    )
    head_repo = _normalize_short_form_repo(api_pr['source']['repository'], redact_names_and_urls)

    return NormalizedPullRequest(
        id=api_pr['id'],
        title=api_pr['title'],
        body=api_pr['description'],
        url=api_pr['links']['html']['href'],
        base_branch=api_pr['destination']['branch']['name'],
        head_branch=api_pr['source']['branch']['name'],
        base_repo=base_repo,
        head_repo=head_repo,
        author=_normalize_user(api_pr['author']),
        is_closed=api_pr['state'] != 'OPEN',
        is_merged=api_pr['state'] == 'MERGED',
        created_at=parser.parse(api_pr['created_on']),
        updated_at=parser.parse(api_pr['updated_on']),
        additions=additions,
        deletions=deletions,
        changed_files=changed_files,
        merge_date=merge_date,
        closed_date=closed_date,
        merged_by=merged_by,
        approvals=approvals,
        commits=commits,
        merge_commit=merge_commit,
        comments=comments,
    )
Example #22
    def get_pull_requests(
        self, normalized_repos: List[NormalizedRepository], server_git_instance_info,
    ) -> List[NormalizedPullRequest]:
        print('downloading bitbucket prs... ', end='', flush=True)
        for i, repo in enumerate(
            tqdm(normalized_repos, desc='downloading prs for repos', unit='repos'), start=1
        ):
            with agent_logging.log_loop_iters(logger, 'repo for pull requests', i, 1):
                try:
                    pull_since = pull_since_date_for_repo(
                        server_git_instance_info, repo.project.login, repo.id, 'prs'
                    )

                    api_prs = self.client.get_pullrequests(repo.project.id, repo.id)

                    if not api_prs:
                        agent_logging.log_and_print(
                            logger, logging.INFO, f'no prs found for repo {repo.id}. Skipping... '
                        )
                        continue

                    for api_pr in tqdm(api_prs, desc=f'processing prs for {repo.name}', unit='prs'):
                        try:
                            # Skip PRs with missing data
                            if (
                                'source' not in api_pr
                                or 'repository' not in api_pr['source']
                                or not api_pr['source']['repository']
                                or 'destination' not in api_pr
                                or 'repository' not in api_pr['destination']
                                or not api_pr['destination']['repository']
                            ):
                                agent_logging.log_and_print_error_or_warning(
                                    logger, logging.WARN, msg_args=[api_pr['id']], error_code=3030
                                )
                                continue

                            yield _normalize_pr(
                                self.client,
                                repo,
                                api_pr,
                                self.config.git_strip_text_content,
                                self.config.git_redact_names_and_urls,
                            )

                            # PRs are ordered newest to oldest.  If this
                            # one is too old, we're done with this repo.  We
                            # yield one old one on purpose so that we
                            # handle the case correctly when the most
                            # recent PR is really old.
                            if pull_since and parser.parse(api_pr['updated_on']) < pull_since:
                                break

                        except Exception:
                            # if something happens when normalizing a PR, just keep going with the rest
                            agent_logging.log_and_print_error_or_warning(
                                logger,
                                logging.ERROR,
                                msg_args=[api_pr["id"], repo.id],
                                error_code=3011,
                                exc_info=True,
                            )

                except Exception:
                    # if something happens when pulling PRs for a repo, just keep going.
                    agent_logging.log_and_print_error_or_warning(
                        logger, logging.ERROR, msg_args=[repo.id], error_code=3021, exc_info=True,
                    )

        print('✓')
Example #23
def load_and_dump_git(
    config: GitConfig,
    endpoint_git_instance_info: dict,
    outdir: str,
    compress_output_files: bool,
    git_connection,
):
    # use the unique git instance agent key to collate files
    instance_slug = endpoint_git_instance_info['slug']
    instance_key = endpoint_git_instance_info['key']
    outdir = f'{outdir}/git_{instance_key}'
    os.mkdir(outdir)

    try:
        if config.git_provider == 'bitbucket_server':
            # using old func method, todo: refactor to use GitAdapter
            from jf_agent.git.bitbucket_server import load_and_dump as load_and_dump_bbs

            load_and_dump_bbs(
                config=config,
                outdir=outdir,
                compress_output_files=compress_output_files,
                endpoint_git_instance_info=endpoint_git_instance_info,
                bb_conn=git_connection,
            )

        elif config.git_provider == 'bitbucket_cloud':
            from jf_agent.git.bitbucket_cloud_adapter import BitbucketCloudAdapter

            BitbucketCloudAdapter(
                config, outdir, compress_output_files, git_connection
            ).load_and_dump_git(endpoint_git_instance_info)
        elif config.git_provider == 'github':
            # using old func method, todo: refactor to use GitAdapter
            from jf_agent.git.github import load_and_dump as load_and_dump_gh

            load_and_dump_gh(
                config=config,
                outdir=outdir,
                compress_output_files=compress_output_files,
                endpoint_git_instance_info=endpoint_git_instance_info,
                git_conn=git_connection,
            )
        elif config.git_provider == 'gitlab':
            from jf_agent.git.gitlab_adapter import GitLabAdapter

            GitLabAdapter(
                config, outdir, compress_output_files,
                git_connection).load_and_dump_git(endpoint_git_instance_info)
        else:
            raise ValueError(f'unsupported git provider {config.git_provider}')

    except Exception as e:
        agent_logging.log_and_print_error_or_warning(
            logger,
            logging.ERROR,
            msg_args=[config.git_provider, e],
            error_code=3061,
            exc_info=True,
        )

        return {
            'type': 'Git',
            'instance': instance_slug,
            'instance_key': instance_key,
            'status': 'failed',
        }

    return {
        'type': 'Git',
        'instance': instance_slug,
        'instance_key': instance_key,
        'status': 'success',
    }