Example #1
    def _base_pre_run(self):
        if self.job_center.total_jobs < 1:
            log.info('No jobs to reproduce. Exiting.')
            return

        # Set up the required directories.
        os.makedirs(self.config.orig_logs_dir, exist_ok=True)
        os.makedirs(self.config.output_dir, exist_ok=True)
        self.utils.directories_setup()
        if os.path.isfile(self.utils.get_error_reason_file_path()):
            self.error_reasons = read_json(
                self.utils.get_error_reason_file_path())
        self.error_reasons = self.manager.dict(self.error_reasons)
        # Check if commands to Travis work.
        if not Utils.is_travis_installed():
            log.error(
                colored(
                    'Commands to Travis are failing unexpectedly. Try restarting your shell and ensure your '
                    'environment is provisioned correctly.',
                    'red'))
            raise Exception(
                'Unexpected state: Commands to Travis are failing unexpectedly.'
            )
        # Read travis_images.json.
        try:
            self.travis_images = read_json(self.config.travis_images_json)
        except FileNotFoundError:
            log.error(
                colored(
                    self.config.travis_images_json + ' not found. Exiting.',
                    'red'))
            raise
Example #2
    def process(self, repo, builds_json_file, builds_info_json_file) -> Optional[Any]:
        # repo = context['repo']
        travis = TravisWrapper()
        if os.path.isfile(builds_json_file):
            build_list = read_json(builds_json_file)
        else:
            log.info('Getting the list of builds...')
            start_time = time.time()
            try:
                builds = travis.get_builds_for_repo(repo)
            except RequestException:
                error_message = 'Encountered an error while downloading builds for repository {}.'.format(repo)
                log.error(error_message)
                raise
            build_list = list(builds)
            write_json(builds_json_file, build_list)
            log.info('Got the list of builds in', time.time() - start_time, 'seconds.')

        if os.path.isfile(builds_info_json_file):
            build_list = read_json(builds_info_json_file)
        else:
            log.info('Downloading build info for',
                     len(build_list),
                     'builds... This step may take several minutes for large repositories.')
            start_time = time.time()
            for idx, build in enumerate(build_list):
                build_id = build['id']
                try:
                    build_info = travis.get_build_info(build_id)
                except RequestException:
                    error_message = 'Encountered an error while downloading build info for build {}.'.format(build_id)
                    log.error(error_message)
                    raise
                build['build_info'] = build_info
                if (idx + 1) % 500 == 0:
                    log.info('Downloaded build info for', idx + 1, 'builds so far...')
            write_json(builds_info_json_file, build_list)
            log.info('Downloaded build info in', time.time() - start_time, 'seconds.')
Example #3
def filter_non_exact_images(job_id, log_path, language='ruby'):
    """
    Try to identify the exact Travis Docker image that a job used, based on its original log.

    The image is located by, in order, the image provision timestamp, the image tag, and the GCE commit SHA found in
    the original log. Travis defaults to the Ruby image when no language is specified, so 'ruby' is the default here.

    Returns the matched image tag, or None if the original log is unavailable or no exact image could be identified.
    """
    log.debug(
        'To detect non-exact pairs, we first extract the used images from the original logs.'
    )
    travis_images = read_json(os.path.join(BASE_DIR, "travis_images.json"))
    dockerhub_images = read_json(
        os.path.join(BASE_DIR, "dockerhub_images.json"))

    orig_log_path = os.path.join(log_path, str(job_id) + ".txt")
    if not download_log(job_id, orig_log_path):
        # Without the original log, the image cannot be determined.
        return None

    image = None
    # Try to find the image by provision timestamp. If found, return it.
    try:
        chooser = ExactImageChooserByTime(orig_log_path, travis_images,
                                          language)
        chooser.find_image_datetime_from_log()
        image = chooser.get_image_tag()
        if image is not None:
            return image
    except OSError:
        # The original log file could not be read.
        return None

    # Try to find the image by tag. If found, return it.
    if not image:
        chooser = ExactImageChooserByTag(orig_log_path)
        image = chooser.get_image_tag()
        if image is not None:
            return image
    # Try to find the image by GCE commit SHA. If found, return it.
    if not image:
        chooser = ExactImageChooserByCommitSHA(orig_log_path, dockerhub_images)
        image = chooser.get_image_tag()
        if image is not None:
            return image
    return None
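A minimal usage sketch for the helper above; the job ID, log directory, and language value are hypothetical:

# Hypothetical call: download the original log to original-logs/123456789.txt and
# try to identify the exact image the job ran on ('jvm' is the Travis Java image group).
image_tag = filter_non_exact_images(123456789, 'original-logs', language='jvm')
if image_tag is None:
    log.info('No exact image could be determined for job', 123456789)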
Example #4
def clean_bad_json_files(task_name):
    log.info('Cleaning bad JSON files.')
    count = 0
    task_dir = os.path.join(OUTPUT_DIR, task_name)
    for file in os.listdir(task_dir):
        if '.json' in file:
            filepath = os.path.join(task_dir, file)
            try:
                read_json(filepath)
            except (json.decoder.JSONDecodeError, UnicodeDecodeError):
                os.remove(filepath)
                log.info('Removing', filepath)
                count += 1
    log.info('Removed', count, 'bad JSON files.')
Example #5
    def get_pr_commits_by_parsing_html(self):
        start_time = time.time()
        html_commits_json_file = self.utils.get_html_commits_json_file(
            self.repo)
        html_commits = {}
        if os.path.isfile(html_commits_json_file):
            html_commits = read_json(html_commits_json_file)
            for _, branch_obj in self.branches.items():
                if branch_obj.pr_num != -1:  # if it's a PR branch
                    branch_obj.html_commits = html_commits[str(
                        branch_obj.pr_num)]
        else:
            threads = [
                threading.Thread(
                    target=self.utils.github.get_pr_commits_by_html,
                    args=(self.repo, str(branch_obj.pr_num), branch_obj))
                for _, branch_obj in self.branches.items()
                if branch_obj.pr_num != -1  # Only PR branches have PR commits to fetch.
            ]
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join()

            for _, branch_obj in self.branches.items():
                if branch_obj.pr_num != -1:  # if it's a PR branch
                    html_commits[branch_obj.pr_num] = branch_obj.html_commits
            write_json(html_commits_json_file, html_commits)
            log.info('Got pull request commits (via HTML parsing) in',
                     time.time() - start_time, 'seconds.')
Example #6
def load_buildpairs(dir_of_jsons: str, repo: str):
    """
    :param dir_of_jsons: A directory containing JSON files of build pairs.
    :param repo: The repo slug; its '/' is replaced with '-' to derive the JSON file name.
    :return: A list of build pairs, or None if the repo's JSON file is missing or contains invalid JSON.
    """
    all_buildpairs = []
    count = 0
    task_name = repo.replace('/', '-')
    filename = task_name + '.json'
    try:
        data = read_json(os.path.join(dir_of_jsons, filename))
    except json.decoder.JSONDecodeError:
        log.error('{} contains invalid JSON.'.format(filename))
        return None
    except FileNotFoundError:
        log.error('{} is not found.'.format(filename))
        return None

    all_buildpairs.extend(data)
    if not data:
        log.warning('{} does not contain any build pairs.'.format(filename))
    count += 1
    log.info('Read {} build pairs from {}.'.format(len(all_buildpairs), filename))
    return all_buildpairs
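A minimal usage sketch, assuming build pairs for 'owner/project' are stored as 'owner-project.json' inside dir_of_jsons (the path and slug below are hypothetical):

# Hypothetical call: reads output/google-guava.json and returns its build pairs,
# or None if the file is missing or contains invalid JSON.
pairs = load_buildpairs('output', 'google/guava')
if pairs is not None:
    log.info('Loaded {} build pairs.'.format(len(pairs)))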
Example #7
    def get_commit_info_for_virtual_commit(self):
        start_time = time.time()
        virtual_commits_info = {}
        virtual_commits_info_json_file = self.utils.get_virtual_commits_info_json_file(
            self.repo)
        has_json_file = os.path.isfile(virtual_commits_info_json_file)
        if has_json_file:
            virtual_commits_info = read_json(virtual_commits_info_json_file)

        for _, branch_obj in self.branches.items():
            if not branch_obj.pairs:
                continue
            for pair in branch_obj.pairs:
                builds = [pair.failed_build, pair.passed_build]
                for b in builds:
                    if has_json_file:
                        if b.commit in virtual_commits_info:
                            b.virtual_commit_info = virtual_commits_info[
                                b.commit]
                    else:
                        c = self.utils.github.get_commit_info(
                            self.repo, b.commit)
                        if c:
                            virtual_commits_info[b.commit] = c
                            b.virtual_commit_info = c
        if not has_json_file:
            write_json(virtual_commits_info_json_file, virtual_commits_info)
        log.info('Got commit info for virtual commits in',
                 time.time() - start_time, 'seconds.')
Example #8
    def _load_jobs_from_pairs_for_repo(self, input_file):
        """
        Read the input file, which should contain mined pairs from the database.
        """
        try:
            buildpairs = read_json(input_file)
        except json.JSONDecodeError:
            log.error('Error reading input file {} in PairCenter. Exiting.'.format(input_file))
            raise
        for bp in buildpairs:
            # For debug purposes: When we only want to reproduce non-PR pairs, we can uncomment these lines.
            # if bp['pr_num'] == -1:
            #     continue
            repo = bp['repo']
            if repo not in self.repos:
                self.repos[repo] = Repo(repo)
                self.uninitialized_repos.put(repo)
            self._append_buildpair_and_jobpair_to_repo(repo, bp)

        self._init_names()
        self.set_skip_of_job_pairs()
        self._init_queue_of_repos()
        # Calculate buildpair and job numbers after done loading from file.
        self._calc_num_total_buildpairs()
        self._calc_num_total_jobpairs()
        self._calc_num_total_jobs()
        log.debug('pair_center.total_buildpairs =', self.total_buildpairs,
                  'pair_center.total_jobpairs =', self.total_jobpairs,
                  'pair_center.total_jobs =', self.total_jobs)
Example #9
    def get_commits_from_github_api(self):
        start_time = time.time()
        github_commits = {}
        get_github_commits = True
        github_commits_json_file = self.utils.get_github_commits_json_file(
            self.repo)
        if os.path.isfile(github_commits_json_file):
            github_commits = read_json(github_commits_json_file)
            get_github_commits = False

        for _, branch_obj in self.branches.items():
            if branch_obj.pr_num != -1:  # Whether it is a PR branch.
                # Get commits from the GitHub API.
                if get_github_commits:
                    github_commits[str(branch_obj.pr_num
                                       )] = self.utils.github.list_pr_commits(
                                           self.repo, str(branch_obj.pr_num))
                branch_obj.github_commits = github_commits[str(
                    branch_obj.pr_num)]
                # for commit in github_commits[str(branch.pr_num)]:
                #     commit['build_ids'] = self.utils.github.get_build_ids_for_commit(self.repo, commit['sha'])

        write_json(github_commits_json_file, github_commits)
        log.info('Got pull request commits (via GitHub API calls) in',
                 time.time() - start_time, 'seconds.')
Example #10
def get_stats(output_file_path: str) -> Optional[dict]:
    try:
        return read_json(output_file_path)['stats']
    except FileNotFoundError:
        print('Cannot find output file at ' + output_file_path + '. Skipping.')
    except KeyError:
        print(output_file_path, 'does not have the key "stats". Skipping.')
    return None
Example #11
def get_duration(output_file_path: str) -> Optional[int]:
    try:
        duration_string = read_json(output_file_path)['duration']
        duration = datetime.datetime.strptime(duration_string,
                                              '%Hh %Mm %S.%fs')
        delta = datetime.timedelta(hours=duration.hour,
                                   minutes=duration.minute,
                                   seconds=duration.second)
        return int(delta.total_seconds() // 60)
    except FileNotFoundError:
        print('Cannot find output file at ' + output_file_path + '. Skipping.')
    except KeyError:
        print(output_file_path, 'does not have the key "duration". Skipping.')
    return None
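For reference, both helpers above expect an output JSON of roughly the following shape; the field values shown are invented:

# Hypothetical output file contents:
# {
#     "stats": {"jobpairs": 12, "reproduced": 9},
#     "duration": "0h 42m 17.50s"
# }
stats = get_stats('output/task-1.json')       # -> the 'stats' dict, or None
minutes = get_duration('output/task-1.json')  # -> 42, or None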
Example #12
def get_repr_metadata_dict(task_json_path, repr_metadata_dict):
    buildpairs = read_json(task_json_path)
    for bp in buildpairs:
        for jp in bp['jobpairs']:
            image_tag = bugswarmutils.get_image_tag(bp['repo'],
                                                    jp['failed_job']['job_id'])
            failed_job = jp['failed_job']
            passed_job = jp['passed_job']
            jobs = [failed_job, passed_job]
            tag_metadata = dict()
            tag_metadata['repo'] = bp['repo']
            build_system = failed_job['orig_result'][
                'tr_build_system'] if failed_job['orig_result'] else ''
            tag_metadata['build_system'] = build_system
            tag_metadata['failed_job'] = {'job_id': jobs[0]['job_id']}
            tag_metadata['passed_job'] = {'job_id': jobs[1]['job_id']}
            repr_metadata_dict[image_tag] = tag_metadata
    return repr_metadata_dict
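Each entry added to repr_metadata_dict is keyed by the artifact image tag and has roughly this shape; the concrete values below are invented for illustration:

# repr_metadata_dict['google-guava-123456789'] = {
#     'repo': 'google/guava',
#     'build_system': 'Maven',      # '' when the failed job has no orig_result
#     'failed_job': {'job_id': 123456789},
#     'passed_job': {'job_id': 123456790},
# }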
Example #13
    def load_buildpairs(dir_of_jsons: str, filename: str):
        """
        :param dir_of_jsons: A directory containing JSON files of build pairs.
        :param filename: The name of the JSON file to read.
        :raises json.decoder.JSONDecodeError: When the file contains invalid JSON.
        """
        all_buildpairs = []
        # Read the file, which we expect to contain JSON.
        try:
            data = read_json(os.path.join(dir_of_jsons, filename))
        except json.decoder.JSONDecodeError:
            log.error('{} contains invalid JSON.'.format(filename))
            raise

        all_buildpairs.extend(data)
        if not data:
            log.warning(
                '{} does not contain any build pairs.'.format(filename))
        log.info('Read {} build pairs from {}.'.format(len(all_buildpairs),
                                                       filename))
        return all_buildpairs
Example #14
    def run(self):
        buildpairs = read_json(self.input_file)
        # Only check for skipping if CSV mode is disabled.
        to_insert = []
        for bp in buildpairs:
            for jp in bp['jobpairs']:
                image_tag = Utils.construct_jobpair_image_tag_from_dict(jp, bp['repo'])
                reproduce_successes, _, _ = Packager._calc_stability(jp)
                artifact_exists = Packager._is_artifact_in_db(image_tag)
                if not self.csv_mode and artifact_exists and not reproduce_successes:
                    log.info('Artifact', image_tag, 'already exists in the database and has no reproduce successes.')
                    continue
                to_insert.append((image_tag, artifact_exists, self._structure_artifact_data(image_tag, bp, jp)))

        if self.csv_mode:
            self._write_csv(to_insert)
        elif not to_insert:
            log.info('Done! No new metadata to insert.')
        else:
            inserts = 0
            errors = 0
            for artifact_data in to_insert:
                (image_tag, artifact_exists, artifact) = artifact_data
                if artifact_exists:
                    if Packager._update_artifact(image_tag, artifact):
                        inserts += 1
                    else:
                        errors += 1
                elif Packager._insert_artifact(artifact):
                    inserts += 1
                else:
                    errors += 1
            if errors == 0:
                log.info('Done! Inserted metadata for {} jobpairs with 0 errors.'.format(inserts))
            else:
                log.info('Done! Attempted to insert {} jobpairs into the database. {} insertions succeeded and {} '
                         'encountered an error.'.format(len(to_insert), inserts, errors))
Example #15
    def process(self, data: Dict[str, Branch], context: dict) -> Optional[Any]:
        repo = context['repo']
        utils = context['utils']

        branches = data

        # Get the merge state of each pull request.
        log.info('Getting merge state for all pull requests.')

        start_time = time.time()
        pr_list_json_file = utils.get_pr_list_json_file(repo)
        pr_dict = {}
        if os.path.isfile(pr_list_json_file):
            try:
                pr_dict = read_json(pr_list_json_file)
            except ValueError:
                os.remove(pr_list_json_file)
                raise StepException
        else:
            pr_entities = utils.github.list_pull_requests(repo)
            for pr_entity in pr_entities:
                pr_dict[str(pr_entity['number'])] = pr_entity
            write_json(pr_list_json_file, pr_dict)

        for branch_id, branch_obj in branches.items():
            if branch_obj.pr_num != -1:  # Whether the branch is a pull request branch.
                if str(branch_obj.pr_num) in pr_dict:
                    branch_obj.merged_at = pr_dict[str(
                        branch_obj.pr_num)]['merged_at']
                    branch_obj.base_branch = pr_dict[str(
                        branch_obj.pr_num)]['base']['ref']
                    branch_obj.pr_info = pr_dict[str(branch_obj.pr_num)]

        log.debug('Got merge state for all pull requests in',
                  time.time() - start_time, 'seconds.')
        return branches
Example #16
    def process(self, data: Any, context: dict) -> Optional[Any]:
        repo = context['repo']
        travis = TravisWrapper()

        builds_json_file = Utils.get_repo_builds_api_result_file(repo)
        builds_info_json_file = Utils.get_repo_builds_info_api_result_file(repo)
        if os.path.isfile(builds_json_file):
            build_list = read_json(builds_json_file)
        else:
            log.info('Getting the list of builds...')
            start_time = time.time()
            try:
                builds = travis.get_builds_for_repo(repo)
            except RequestException:
                error_message = 'Encountered an error while downloading builds for repository {}.'.format(repo)
                raise StepException(error_message)
            build_list = list(builds)
            write_json(builds_json_file, build_list)
            log.info('Got the list of builds in', time.time() - start_time, 'seconds.')

        if os.path.isfile(builds_info_json_file):
            build_list = read_json(builds_info_json_file)
        else:
            log.info('Downloading build info for',
                     len(build_list),
                     'builds... This step may take several minutes for large repositories.')
            start_time = time.time()
            for idx, build in enumerate(build_list):
                build_id = build['id']
                try:
                    build_info = travis.get_build_info(build_id)
                except RequestException:
                    error_message = 'Encountered an error while downloading build info for build {}.'.format(build_id)
                    raise StepException(error_message)
                build['build_info'] = build_info
                if (idx + 1) % 500 == 0:
                    log.info('Downloaded build info for', idx + 1, 'builds so far...')
            write_json(builds_info_json_file, build_list)
            log.info('Downloaded build info in', time.time() - start_time, 'seconds.')

        # Now that we have data from the Travis API, restructure it so it appears as if it came from the database using
        # the following query:
        #   SELECT j.job_id, j.job_number, j.config, j.result,
        #          b.build_id, b.number, b.finished_at, b.commit, b.branch, b.event_type, b.language,
        #          c.committed_at, c.compare_at, c.committer_name, c.message
        #   FROM jobs j
        #   LEFT JOIN builds b on b.build_id = j.build_id
        #   LEFT JOIN commits c on b.commit = c.sha
        #   WHERE j.repo_id = "<repo_id>"
        jobs = []
        for build in build_list:
            for job in build['build_info']['matrix']:
                j = {
                    'job_id': job['id'],
                    'job_number': job['number'],
                    'config': job['config'],
                    'result': job['result'],
                    'build_id': build['id'],
                    'number': build['number'],
                    'finished_at': job['finished_at'],
                    'commit': build['commit'],
                    'message': build['message'],
                    'branch': build['branch'],
                    'event_type': build['build_info']['event_type'],
                    'committed_at': build['build_info']['committed_at'],
                    'compare_at': build['build_info']['compare_url'],
                    'committer_name': build['build_info']['committer_name'],
                }
                if 'language' in job['config']:
                    language = job['config']['language']
                else:
                    log.debug('Language not found in config, defaulting to ruby for job ID {}.'.format(job['id']))
                    language = 'ruby'
                j['language'] = language
                jobs.append(j)

        # Expose mining progression metrics via the context. Other pipeline steps must not change these values.
        # Do not raise a StepException before the context is populated.
        failed_builds, failed_pr_builds = GetJobsFromTravisAPI._count_failed_builds(build_list)
        failed_jobs, failed_pr_jobs = GetJobsFromTravisAPI._count_failed_jobs(build_list)
        context['mined_project_builder'].builds = len(build_list)
        context['mined_project_builder'].jobs = len(jobs)
        context['mined_project_builder'].failed_builds = failed_builds
        context['mined_project_builder'].failed_jobs = failed_jobs
        context['mined_project_builder'].failed_pr_builds = failed_pr_builds
        context['mined_project_builder'].failed_pr_jobs = failed_pr_jobs

        if not jobs:
            msg = 'Did not get any jobs for {}.'.format(repo)
            log.warning(msg)
            raise StepException(msg)

        return jobs
Example #17
    def _write_output_json(self):
        log.info('Writing output JSON annotated with match history.')
        pairs = read_json(self.input_file)
        # Write default attributes.
        for p in pairs:
            for jp in p['jobpairs']:
                jp['match_history'] = {}
                jp['failed_job']['match_history'] = {}
                jp['passed_job']['match_history'] = {}
                jp['failed_job']['orig_result'] = ''
                jp['passed_job']['orig_result'] = ''
                jp['failed_job']['mismatch_attrs'] = []
                jp['passed_job']['mismatch_attrs'] = []
                jp['failed_job']['pip_patch'] = False
                jp['passed_job']['pip_patch'] = False

        for p in pairs:
            repo = p['repo']
            if repo not in self.pair_center.repos:
                continue

            # Try to find this build pair in pair center.
            for bp in self.pair_center.repos[repo].buildpairs:
                if p['failed_build']['build_id'] == bp.builds[0].build_id:
                    # Found build pair in pair center.

                    # Optional: Write buildpair match type.
                    # This is not used since we switched to jobpair packaging.
                    p['match'] = bp.match.value
                    trigger_sha = p['failed_build']['head_sha']
                    # Similarly, for each job pair in build pair, try to find it in the pair center.
                    for jp in p['jobpairs']:
                        # For a build that has some jobs filtered and some jobs not filtered,
                        # the job cannot be found in paircenter.
                        if jp['is_filtered']:
                            continue

                        found_in_paircenter = False
                        for jobpair in bp.jobpairs:
                            if str(jobpair.jobs[0].job_id) == str(jp['failed_job']['job_id']):
                                found_in_paircenter = True
                                # Write jobpair match history, analyzed results, and mismatched attributes.
                                jp['match_history'] = jobpair.match_history
                                jp['failed_job']['match_history'] = jobpair.failed_job_match_history
                                jp['passed_job']['match_history'] = jobpair.passed_job_match_history
                                jp['failed_job']['orig_result'] = jobpair.jobs[0].orig_result
                                jp['passed_job']['orig_result'] = jobpair.jobs[1].orig_result
                                jp['failed_job']['mismatch_attrs'] = jobpair.jobs[0].mismatch_attrs
                                jp['passed_job']['mismatch_attrs'] = jobpair.jobs[1].mismatch_attrs
                                jp['failed_job']['pip_patch'] = jobpair.jobs[0].pip_patch
                                jp['passed_job']['pip_patch'] = jobpair.jobs[1].pip_patch

                        if not found_in_paircenter:
                            # If not found in pair center, this jobpair was filtered out.
                            # In this case, we still analyze the original log to get as many attributes as possible.
                            for i in range(2):
                                job_name = 'failed_job' if i == 0 else 'passed_job'
                                job_id = jp[job_name]['job_id']
                                original_log_path = self.utils.get_orig_log_path(job_id)
                                if not download_log(job_id, original_log_path):
                                    continue
                                original_result = self.analyzer.analyze_single_log(original_log_path, job_id,
                                                                                   trigger_sha, repo)
                                if 'not_in_supported_language' in original_result:
                                    continue
                                jp[job_name]['orig_result'] = original_result

        os.makedirs(self.config.result_json_dir, exist_ok=True)
        filename = self.config.task + '.json'
        filepath = os.path.join(self.config.result_json_dir, filename)
        write_json(filepath, pairs)
Example #18
    def process(self, data: Any, context: dict) -> Optional[Any]:
        repo = context['repo']
        mined_build_exists = False
        lock = Lock()
        with lock:
            travis = TravisWrapper()

        last_mined_build_number = 0
        if context['original_mined_project_metrics']['last_build_mined']['build_number']:
            last_mined_build_number = context['original_mined_project_metrics']['last_build_mined']['build_number']
            mined_build_exists = True

        builds_json_file = Utils.get_repo_builds_api_result_file(repo)
        builds_info_json_file = Utils.get_repo_builds_info_api_result_file(repo)
        if os.path.isfile(builds_json_file):
            build_list = read_json(builds_json_file)
        else:
            log.info('Getting the list of builds...')
            start_time = time.time()
            try:
                if not mined_build_exists:
                    # gets all builds for project
                    builds = travis.get_builds_for_repo(repo)
                else:
                    # gets the latest builds and stops mining after reaching our last mined build number
                    builds = travis.get_builds_for_repo(repo, last_mined_build_number)
            except RequestException:
                error_message = 'Encountered an error while downloading builds for repository {}.'.format(repo)
                raise StepException(error_message)
            build_list = list(builds)
            write_json(builds_json_file, build_list)
            log.info('Got the list of builds in', time.time() - start_time, 'seconds.')

        if not build_list:
            msg = 'Did not get any new builds for {}.'.format(repo)
            raise StepException(msg)

        if os.path.isfile(builds_info_json_file):
            build_list = read_json(builds_info_json_file)
        else:
            log.info('Downloading build info for',
                     len(build_list),
                     'builds... This step may take several minutes for large repositories.')
            start_time = time.time()
            for idx, build in enumerate(build_list):
                build_id = build['id']
                try:
                    build_info = travis.get_build_info(build_id)
                except RequestException:
                    error_message = 'Encountered an error while downloading build info for build {}.'.format(build_id)
                    raise StepException(error_message)
                build['build_info'] = build_info
                if (idx + 1) % 500 == 0:
                    log.info('Downloaded build info for', idx + 1, 'builds so far...')
            write_json(builds_info_json_file, build_list)
            log.info('Downloaded build info in', time.time() - start_time, 'seconds.')

        # Now that we have data from the Travis API, restructure it so it appears as if it came from the database using
        # the following query:
        #   SELECT j.job_id, j.job_number, j.config, j.result,
        #          b.build_id, b.number, b.finished_at, b.commit, b.branch, b.event_type, b.language,
        #          c.committed_at, c.compare_at, c.committer_name, c.message
        #   FROM jobs j
        #   LEFT JOIN builds b on b.build_id = j.build_id
        #   LEFT JOIN commits c on b.commit = c.sha
        #   WHERE j.repo_id = "<repo_id>"
        jobs = []
        leftover_build_list = []
        highest_build_number = 0
        highest_build_number_id = 0

        # 'build_list' contains at least 25 builds because the Travis API returns results one page at a time.
        # Always track the highest build number (and its build ID), and skip builds that were mined previously,
        # i.e. those with build_number <= last_mined_build_number.
        for build in build_list:
            build_id = build['id']
            build_number = int(build['number'])

            if build_number > highest_build_number:
                highest_build_number_id = build_id
                highest_build_number = build_number
            if build_number <= last_mined_build_number:
                continue

            for job in build['build_info']['matrix']:
                j = {
                    'job_id': job['id'],
                    'job_number': job['number'],
                    'config': job['config'],
                    'result': job['result'],
                    'build_id': build['id'],
                    'number': build['number'],
                    'finished_at': job['finished_at'],
                    'commit': build['commit'],
                    'message': build['message'],
                    'branch': build['branch'],
                    'event_type': build['build_info']['event_type'],
                    'committed_at': build['build_info']['committed_at'],
                    'compare_at': build['build_info']['compare_url'],
                    'committer_name': build['build_info']['committer_name'],
                }
                if 'language' in job['config']:
                    language = job['config']['language']
                else:
                    log.debug('Language not found in config, defaulting to ruby for job ID {}.'.format(job['id']))
                    language = 'ruby'
                j['language'] = language
                jobs.append(j)

            leftover_build_list.append(build)

        if not jobs:
            msg = 'Did not get any jobs for {}.'.format(repo)
            # Set the build_number & build_id metric to the latest build info we've received if no jobs are found.
            bugswarmapi = DatabaseAPI(DATABASE_PIPELINE_TOKEN)
            bugswarmapi.set_latest_build_info_metric(repo, highest_build_number, highest_build_number_id)
            raise StepException(msg)

        # Expose mining progression metrics via the context. Other pipeline steps must not change these values.
        # Do not raise a StepException before the context is populated.
        failed_builds, failed_pr_builds = GetJobsFromTravisAPI._count_failed_builds(leftover_build_list)
        failed_jobs, failed_pr_jobs = GetJobsFromTravisAPI._count_failed_jobs(leftover_build_list)
        context['mined_project_builder'].builds = len(leftover_build_list) + \
            context['original_mined_project_metrics']['progression_metrics']['builds']
        context['mined_project_builder'].jobs = len(jobs) + \
            context['original_mined_project_metrics']['progression_metrics']['jobs']
        context['mined_project_builder'].failed_builds = failed_builds + \
            context['original_mined_project_metrics']['progression_metrics']['failed_builds']
        context['mined_project_builder'].failed_jobs = failed_jobs + \
            context['original_mined_project_metrics']['progression_metrics']['failed_jobs']
        context['mined_project_builder'].failed_pr_builds = failed_pr_builds + \
            context['original_mined_project_metrics']['progression_metrics']['failed_pr_builds']
        context['mined_project_builder'].failed_pr_jobs = failed_pr_jobs + \
            context['original_mined_project_metrics']['progression_metrics']['failed_pr_jobs']
        context['mined_project_builder'].last_build_mined['build_id'] = highest_build_number_id
        context['mined_project_builder'].last_build_mined['build_number'] = highest_build_number

        return jobs
Example #19
import unittest
import sys

from bugswarm.common.json import read_json

sys.path.append('../')
from pair_filter.image_chooser import (ExactImageChooserByCommitSHA, ExactImageChooserByTag,
                                        ExactImageChooserByTime)  # noqa: E402
from pair_filter.constants import DOCKERHUB_IMAGES_JSON, TRAVIS_IMAGES_JSON  # noqa: E402
_DOCKERHUB_IMAGES = read_json(DOCKERHUB_IMAGES_JSON)
_TRAVIS_IMAGES = read_json('../' + TRAVIS_IMAGES_JSON)


class Test(unittest.TestCase):

    def test_match_object_by_commit_sha_1(self):
        log = '566084454-orig.log'
        file_path = 'logs/' + log
        chooser = ExactImageChooserByCommitSHA(file_path, _DOCKERHUB_IMAGES)
        image = chooser.get_image_tag()
        assert image == 'travisci/ci-sardonyx:packer-1558623664-f909ac5'

    def test_match_object_by_commit_sha_2(self):
        log = '520562883-orig.log'
        file_path = 'logs/' + log
        chooser = ExactImageChooserByCommitSHA(file_path, _DOCKERHUB_IMAGES)
        image = chooser.get_image_tag()
        assert image == 'travisci/ci-sardonyx:packer-1558623664-f909ac5'

    def test_match_object_by_commit_sha_3(self):
        log = '100252761-orig.log'
Example #20
import unittest
import sys

from bugswarm.common.json import read_json

sys.path.append("../")
from pair_filter.image_chooser import ExactImageChooserByCommitSHA, ExactImageChooserByTag  # noqa: E402
from pair_filter.constants import DOCKERHUB_IMAGES_JSON  # noqa: E402

_DOCKERHUB_IMAGES = read_json(DOCKERHUB_IMAGES_JSON)


class Test(unittest.TestCase):
    def test_match_object_by_commit_sha_1(self):
        log = "566084454-orig.log"
        file_path = "logs/" + log
        chooser = ExactImageChooserByCommitSHA(file_path, _DOCKERHUB_IMAGES)
        image = chooser.get_image_tag()
        assert image == 'travisci/ci-sardonyx:packer-1558623664-f909ac5'

    def test_match_object_by_commit_sha_2(self):
        log = "520562883-orig.log"
        file_path = "logs/" + log
        chooser = ExactImageChooserByCommitSHA(file_path, _DOCKERHUB_IMAGES)
        image = chooser.get_image_tag()
        assert image == 'travisci/ci-sardonyx:packer-1558623664-f909ac5'

    def test_match_object_by_commit_sha_3(self):
        log = "100252761-orig.log"
        file_path = "logs/" + log
        chooser = ExactImageChooserByCommitSHA(file_path, _DOCKERHUB_IMAGES)
Example #21
    def _update_mined_project(repo: str, buildpairs: List):
        bugswarmapi = DatabaseAPI(token=DATABASE_PIPELINE_TOKEN)
        file_name = utils.canonical_repo(repo)
        file_path = os.path.join(
            os.path.dirname(os.path.realpath('.')),
            'pair-finder/output/original_metrics/{}.json'.format(file_name))
        original_d = read_json(file_path)

        def _key(filter_name: str, pr: bool):
            return 'filtered{}_{}'.format('_pr' if pr else '', filter_name)

        def _unfiltered_key(pr: bool):
            return 'unfiltered{}'.format('_pr' if pr else '')
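        # For illustration, these helpers produce metric names such as:
        #   _key('no_sha', pr=False)  -> 'filtered_no_sha'
        #   _key('no_sha', pr=True)   -> 'filtered_pr_no_sha'
        #   _unfiltered_key(pr=True)  -> 'unfiltered_pr'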

        d = {
            'filtered_no_sha': 0,
            'filtered_same_commit': 0,
            'filtered_unavailable': 0,
            'filtered_no_original_log': 0,
            'filtered_error_reading_original_log': 0,
            'filtered_no_image_provision_timestamp': 0,
            'filtered_inaccessible_image': 0,
            'unfiltered': 0,
            'filtered_pr_no_sha': 0,
            'filtered_pr_same_commit': 0,
            'filtered_pr_unavailable': 0,
            'filtered_pr_no_original_log': 0,
            'filtered_pr_error_reading_original_log': 0,
            'filtered_pr_no_image_provision_timestamp': 0,
            'filtered_pr_inaccessible_image': 0,
            'unfiltered_pr': 0,
        }
        for bp in buildpairs:
            is_pr = bp['pr_num'] > 0
            d[_unfiltered_key(is_pr)] += utils.count_unfiltered_jobpairs([bp])
            for jp in bp['jobpairs']:
                reason = jp[FILTERED_REASON_KEY]
                if reason == reasons.NO_HEAD_SHA:
                    d[_key('no_sha', is_pr)] += 1
                elif reason == reasons.SAME_COMMIT_PAIR:
                    d[_key('same_commit', is_pr)] += 1
                elif reason == reasons.NOT_AVAILABLE:
                    d[_key('unavailable', is_pr)] += 1
                elif reason == reasons.NO_ORIGINAL_LOG:
                    d[_key('no_original_log', is_pr)] += 1
                elif reason == reasons.ERROR_READING_ORIGINAL_LOG:
                    d[_key('error_reading_original_log', is_pr)] += 1
                elif reason == reasons.NO_IMAGE_PROVISION_TIMESTAMP:
                    d[_key('no_image_provision_timestamp', is_pr)] += 1
                elif reason == reasons.INACCESSIBLE_IMAGE:
                    d[_key('inaccessible_image', is_pr)] += 1
        for metric_name, metric_value in d.items():
            try:
                metric_value = metric_value + original_d[
                    'progression_metrics'][metric_name]
            except KeyError:
                pass
            if not bugswarmapi.set_mined_project_progression_metric(
                    repo, metric_name, metric_value):
                log.error(
                    'Encountered an error while setting a progression metric. Exiting.'
                )
                sys.exit(1)
Example #22
def filter_non_exact_images(pairs: List) -> Tuple[int, int, int, int]:
    """
    Check if all jobs in this pair (from both the failed and passed build) used images that are available.

    If an image is found to match the job pair then it gets added to the job pair.

    This function assumes the language specified in the Travis configuration does not change between the failed and
    passed builds.

    Returns a 4-tuple of filter counts. The tuple members represent the following:
    1. The number of pairs filtered due to original log not found
    2. The number of pairs filtered due to an error reading the original log
    3. The number of pairs filtered due to no image provision timestamp in the original log
    4. The number of pairs filtered due to usage of a non-exact Docker image.
    """
    log.debug('To detect non-exact pairs, we first extract the used images from the original logs.')

    travis_images = read_json(TRAVIS_IMAGES_JSON)
    provisioned_strs = []
    for language in travis_images:
        provisioned_strs += travis_images[language].values()

    dockerhub_images = read_json(DOCKERHUB_IMAGES_JSON)
    filtered = 0
    no_original_log = 0
    error_reading_original_log = 0
    no_image_provision_timestamp = 0
    inaccessible_image = 0
    exact_jobs = 0
    images_we_have = {}
    images_we_dont_have = {}

    log.debug('Analyzing original logs to extract used images.')
    processed = 0
    for p in pairs:
        config = p['failed_build']['jobs'][0]['config']

        # Travis defaults to the Ruby image if the language is not specified.
        # See https://github.com/travis-ci/travis-ci/issues/4895.
        language = config.get('language') or 'ruby'

        # Multiple languages can be specified by using a list. In this case, we take the first language in the list.
        # We should eventually consider supporting the behavior mentioned in
        # https://stackoverflow.com/a/44054333/5007059 if it becomes officially supported.
        if isinstance(language, list):
            language = language[0]

        if language == 'java':
            language = 'jvm'

        for jp in p['jobpairs']:
            # If the job pair has already been filtered, skip it.
            if utils.jobpair_is_filtered(jp):
                continue

            jobs = [jp['failed_job'], jp['passed_job']]
            for j in jobs:
                processed += 1

                job_id = j['job_id']
                orig_log_path = utils.get_orig_log_path(job_id)
                if not download_log(job_id, orig_log_path):
                    no_original_log += 1
                    jp[FILTERED_REASON_KEY] = reasons.NO_ORIGINAL_LOG
                    continue

                # Try to find the image by timestamp. If found, add it to the job pair.
                try:
                    chooser = ExactImageChooserByTime(orig_log_path, travis_images, language)
                    orig_log_image_provision_timestamp = chooser.find_image_datetime_from_log()
                    image = chooser.get_image_tag()
                    if image is not None:
                        j[PARSED_IMAGE_TAG_KEY] = image
                except OSError:
                    # The original log file was not found.
                    error_reading_original_log += 1
                    jp[FILTERED_REASON_KEY] = reasons.ERROR_READING_ORIGINAL_LOG
                    continue

                if not orig_log_image_provision_timestamp:
                    # Jobs older than 01/2015 did not use Docker, so the build log does not contain an image provision
                    # timestamp.
                    no_image_provision_timestamp += 1
                    jp[FILTERED_REASON_KEY] = reasons.NO_IMAGE_PROVISION_TIMESTAMP
                    continue

                # Try to find image by tag. If found, add it to the job pair.
                if not image:
                    chooser = ExactImageChooserByTag(orig_log_path)
                    image = chooser.get_image_tag()
                    if image is not None:
                        j[PARSED_IMAGE_TAG_KEY] = image
                # Try to find image by GCE commit SHA. If found, add it to the job pair.
                if not image:
                    chooser = ExactImageChooserByCommitSHA(orig_log_path, dockerhub_images)
                    image = chooser.get_image_tag()
                    if image is not None:
                        j[PARSED_IMAGE_TAG_KEY] = image

                # 'tr_build_image' is the attribute containing the provision timestamp extracted from a build log.
                if orig_log_image_provision_timestamp not in provisioned_strs and image is None:
                    # This image is inaccessible.
                    inaccessible_image += 1
                    if orig_log_image_provision_timestamp not in images_we_dont_have and image is None:
                        images_we_dont_have[orig_log_image_provision_timestamp] = 1
                    else:
                        images_we_dont_have[orig_log_image_provision_timestamp] += 1
                    jp[FILTERED_REASON_KEY] = reasons.INACCESSIBLE_IMAGE
                else:
                    exact_jobs += 1
                    if orig_log_image_provision_timestamp not in images_we_have:
                        images_we_have[orig_log_image_provision_timestamp] = 1
                    else:
                        images_we_have[orig_log_image_provision_timestamp] += 1

            if utils.jobpair_is_filtered(jp):
                filtered += 1

    # Print the images we have and do not have and how many times they are used by these jobs.
    log.debug('Stats about images that we have:')
    for k in images_we_have:
        log.debug('{} jobs use an image provisioned on {}.'.format(images_we_have[k], k))
    log.debug('Stats about images that we do not have:')
    for k in images_we_dont_have:
        log.debug('{} jobs use an unavailable image provisioned on {}.'.format(images_we_dont_have[k], k))
    log.debug('Total exact jobs:', exact_jobs)
    log.debug('Total non-exact jobs:', inaccessible_image)
    log.debug('Jobs with missing logs:', no_original_log)
    utils.log_filter_count(filtered, 'jobpairs that use non-exact images')
    return no_original_log, error_reading_original_log, no_image_provision_timestamp, inaccessible_image
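A minimal sketch of consuming the returned counts, assuming pairs was loaded elsewhere:

# Unpack the 4-tuple of filter counts in the order documented in the docstring.
no_log, log_error, no_timestamp, non_exact = filter_non_exact_images(pairs)
log.info('Missing log:', no_log, 'Read error:', log_error,
         'No provision timestamp:', no_timestamp, 'Non-exact image:', non_exact)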