def transform(self, data, project_filter=None, revision_filter=None, job_group_filter=None):
    """
    Transform the builds-4hr (completed jobs) structure into one
    TreeherderJobCollection per project, ready for submission via the
    REST API.

    data -- decoded builds-4hr JSON; only the 'builds' list is read.
    project_filter / revision_filter / job_group_filter -- optional
        filters; non-matching jobs are skipped.

    Returns (th_collections, job_ids_seen_now): a dict mapping project
    name -> TreeherderJobCollection, plus the set of build ids whose
    resultsets were available on this pass (returned so the caller can
    store it under CACHE_KEYS['complete'] -- see cache.get below;
    confirm against the calling task).
    """
    revisions = defaultdict(list)
    missing_resultsets = defaultdict(set)

    valid_projects = set(x.project for x in Datasource.objects.cached())

    # First pass: collect the (short) revisions of all ingestable jobs so
    # they can be resolved to resultsets in one bulk lookup.
    for build in data['builds']:
        try:
            prop = build['properties']
            project = prop['branch']
            buildername = prop['buildername']
            if common.should_skip_project(project, valid_projects, project_filter):
                continue
            if common.should_skip_revision(prop['revision'], revision_filter):
                continue
            if common.is_blacklisted_buildername(buildername):
                continue
            prop['short_revision'] = prop['revision'][0:12]
        except KeyError as e:
            # A build missing any required property cannot be matched to a
            # resultset, so it is skipped entirely.
            logger.warning("skipping builds-4hr job %s since missing property: %s", build['id'], str(e))
            continue

        revisions[project].append(prop['short_revision'])

    revisions_lookup = common.lookup_revisions(revisions)

    # Ids of jobs already processed by a previous successful run.
    job_ids_seen_last_time = cache.get(CACHE_KEYS['complete'], set())
    job_ids_seen_now = set()

    # Holds one collection per unique branch/project
    th_collections = {}

    # Second pass: build the job payloads for jobs whose resultset exists.
    for build in data['builds']:
        try:
            prop = build['properties']
            project = prop['branch']
            buildername = prop['buildername']
            if common.should_skip_project(project, valid_projects, project_filter):
                continue
            if common.should_skip_revision(prop['revision'], revision_filter):
                continue
            if common.is_blacklisted_buildername(buildername):
                continue
            # todo: Continue using short revisions until Bug 1199364
            resultset = common.get_resultset(project, revisions_lookup, prop['short_revision'], missing_resultsets, logger)
        except KeyError:
            # There was no matching resultset, skip the job.
            continue

        # We record the id here rather than at the start of the loop, since we
        # must not count jobs whose revisions were not yet imported as processed,
        # or we'll never process them once we've ingested their associated revision.
        job_ids_seen_now.add(build['id'])

        # Don't process jobs that were already present in builds-4hr
        # the last time this task completed successfully.
        if build['id'] in job_ids_seen_last_time:
            continue

        platform_info = buildbot.extract_platform_info(buildername)
        job_name_info = buildbot.extract_name_info(buildername)

        if (job_group_filter and job_name_info.get('group_symbol', '').lower() != job_group_filter.lower()):
            continue

        treeherder_data = {
            'revision_hash': resultset['revision_hash'],
            'resultset_id': resultset['id'],
            'project': project,
            'coalesced': []
        }

        log_reference = []
        if 'log_url' in prop:
            log_reference.append({
                'url': prop['log_url'],
                'name': 'buildbot_text'
            })

        # add structured logs to the list of log references
        if 'blobber_files' in prop:
            try:
                blobber_files = json.loads(prop['blobber_files'])
                for bf, url in blobber_files.items():
                    if bf and url and bf.endswith('_raw.log'):
                        log_reference.append({
                            'url': url,
                            'name': 'mozlog_json'
                        })
            except Exception as e:
                # Best-effort: a malformed blobber_files blob must not
                # prevent ingestion of the job itself.
                logger.warning("invalid blobber_files json for build id %s (%s): %s", build['id'], buildername, e)

        try:
            job_guid_data = self.find_job_guid(build)
            # request_ids is mandatory, but can be found in several places.
            request_ids = prop.get('request_ids', build['request_ids'])
            # The last element in request_ids corresponds to the request id of this job,
            # the others are for the requests that were coalesced into this one.
            request_id = request_ids[-1]
        except KeyError:
            # NOTE(review): this job was already added to job_ids_seen_now
            # above, so a job missing request_ids is never retried --
            # presumably intentional since the feed data won't improve;
            # confirm.
            continue

        treeherder_data['coalesced'] = job_guid_data['coalesced']

        job = {
            'job_guid': job_guid_data['job_guid'],
            'name': job_name_info.get('name', ''),
            'job_symbol': job_name_info.get('job_symbol', ''),
            'group_name': job_name_info.get('group_name', ''),
            'group_symbol': job_name_info.get('group_symbol', ''),
            'reference_data_name': buildername,
            'product_name': prop.get('product', ''),
            'state': 'completed',
            'result': buildbot.RESULT_DICT[build['result']],
            'reason': build['reason'],
            # scheduler, if 'who' property is not present
            'who': prop.get('who', prop.get('scheduler', '')),
            'submit_timestamp': build['requesttime'],
            'start_timestamp': build['starttime'],
            'end_timestamp': build['endtime'],
            'machine': prop.get('slavename', 'unknown'),
            # build_platform same as machine_platform
            'build_platform': {
                # platform attributes sometimes parse without results
                'os_name': platform_info.get('os', ''),
                'platform': platform_info.get('os_platform', ''),
                'architecture': platform_info.get('arch', '')
            },
            'machine_platform': {
                'os_name': platform_info.get('os', ''),
                'platform': platform_info.get('os_platform', ''),
                'architecture': platform_info.get('arch', '')
            },
            # pgo or non-pgo dependent on buildername parsing
            'option_collection': {
                buildbot.extract_build_type(buildername): True
            },
            'log_references': log_reference,
            'artifacts': [
                {
                    'type': 'json',
                    'name': 'buildapi',
                    'log_urls': [],
                    'blob': {
                        'buildername': buildername,
                        'request_id': request_id
                    }
                },
            ]
        }

        treeherder_data['job'] = job

        if project not in th_collections:
            th_collections[project] = TreeherderJobCollection()

        # get treeherder job instance and add the job instance
        # to the collection instance
        th_job = th_collections[project].get_job(treeherder_data)
        th_collections[project].add(th_job)

    if missing_resultsets and not revision_filter:
        common.fetch_missing_resultsets("builds4h", missing_resultsets, logger)

    num_new_jobs = len(job_ids_seen_now.difference(job_ids_seen_last_time))
    logger.info("Imported %d completed jobs, skipped %d previously seen",
                num_new_jobs, len(job_ids_seen_now) - num_new_jobs)

    return th_collections, job_ids_seen_now
def transform(self, data, project_filter=None, revision_filter=None, job_group_filter=None):
    """
    Transform the builds-4hr (completed jobs) structure into one
    TreeherderJobCollection per project, ready for submission via the
    REST API.

    data -- decoded builds-4hr JSON; only the 'builds' list is read.
    project_filter / revision_filter / job_group_filter -- optional
        filters; non-matching jobs are skipped.

    Returns (th_collections, job_ids_seen_now): a dict mapping project
    name -> TreeherderJobCollection, plus the set of build ids whose
    resultsets were available on this pass (returned so the caller can
    store it under CACHE_KEYS['complete'] -- see cache.get below;
    confirm against the calling task).
    """
    revisions = defaultdict(list)
    missing_resultsets = defaultdict(set)

    valid_projects = set(x.project for x in Datasource.objects.cached())

    # First pass: collect the (short) revisions of all ingestable jobs so
    # they can be resolved to resultsets in one bulk lookup.
    for build in data['builds']:
        try:
            prop = build['properties']
            project = prop['branch']
            buildername = prop['buildername']
            if common.should_skip_project(project, valid_projects, project_filter):
                continue
            if common.should_skip_revision(prop['revision'], revision_filter):
                continue
            if common.is_blacklisted_buildername(buildername):
                continue
            prop['short_revision'] = prop['revision'][0:12]
        except KeyError as e:
            # A build missing any required property cannot be matched to a
            # resultset, so it is skipped entirely.
            logger.warning(
                "skipping builds-4hr job %s since missing property: %s",
                build['id'], str(e))
            continue

        revisions[project].append(prop['short_revision'])

    revisions_lookup = common.lookup_revisions(revisions)

    # Ids of jobs already processed by a previous successful run.
    job_ids_seen_last_time = cache.get(CACHE_KEYS['complete'], set())
    job_ids_seen_now = set()

    # Holds one collection per unique branch/project
    th_collections = {}

    # Second pass: build the job payloads for jobs whose resultset exists.
    for build in data['builds']:
        try:
            prop = build['properties']
            project = prop['branch']
            buildername = prop['buildername']
            if common.should_skip_project(project, valid_projects, project_filter):
                continue
            if common.should_skip_revision(prop['revision'], revision_filter):
                continue
            if common.is_blacklisted_buildername(buildername):
                continue
            # todo: Continue using short revisions until Bug 1199364
            resultset = common.get_resultset(project, revisions_lookup, prop['short_revision'], missing_resultsets, logger)
        except KeyError:
            # There was no matching resultset, skip the job.
            continue

        # We record the id here rather than at the start of the loop, since we
        # must not count jobs whose revisions were not yet imported as processed,
        # or we'll never process them once we've ingested their associated revision.
        job_ids_seen_now.add(build['id'])

        # Don't process jobs that were already present in builds-4hr
        # the last time this task completed successfully.
        if build['id'] in job_ids_seen_last_time:
            continue

        platform_info = buildbot.extract_platform_info(buildername)
        job_name_info = buildbot.extract_name_info(buildername)

        if (job_group_filter and job_name_info.get(
                'group_symbol', '').lower() != job_group_filter.lower()):
            continue

        treeherder_data = {
            'revision_hash': resultset['revision_hash'],
            'resultset_id': resultset['id'],
            'project': project,
            'coalesced': []
        }

        log_reference = []
        if 'log_url' in prop:
            log_reference.append({
                'url': prop['log_url'],
                'name': 'buildbot_text'
            })

        # add structured logs to the list of log references
        if 'blobber_files' in prop:
            try:
                blobber_files = json.loads(prop['blobber_files'])
                for bf, url in blobber_files.items():
                    if bf and url and bf.endswith('_raw.log'):
                        log_reference.append({
                            'url': url,
                            'name': 'mozlog_json'
                        })
            except Exception as e:
                # Best-effort: a malformed blobber_files blob must not
                # prevent ingestion of the job itself.
                logger.warning(
                    "invalid blobber_files json for build id %s (%s): %s",
                    build['id'], buildername, e)

        try:
            job_guid_data = self.find_job_guid(build)
            # request_ids is mandatory, but can be found in several places.
            request_ids = prop.get('request_ids', build['request_ids'])
            # The last element in request_ids corresponds to the request id of this job,
            # the others are for the requests that were coalesced into this one.
            request_id = request_ids[-1]
        except KeyError:
            # NOTE(review): this job was already added to job_ids_seen_now
            # above, so a job missing request_ids is never retried --
            # presumably intentional since the feed data won't improve;
            # confirm.
            continue

        treeherder_data['coalesced'] = job_guid_data['coalesced']

        job = {
            'job_guid': job_guid_data['job_guid'],
            'name': job_name_info.get('name', ''),
            'job_symbol': job_name_info.get('job_symbol', ''),
            'group_name': job_name_info.get('group_name', ''),
            'group_symbol': job_name_info.get('group_symbol', ''),
            'reference_data_name': buildername,
            'product_name': prop.get('product', ''),
            'state': 'completed',
            'result': buildbot.RESULT_DICT[build['result']],
            'reason': build['reason'],
            # scheduler, if 'who' property is not present
            'who': prop.get('who', prop.get('scheduler', '')),
            'submit_timestamp': build['requesttime'],
            'start_timestamp': build['starttime'],
            'end_timestamp': build['endtime'],
            'machine': prop.get('slavename', 'unknown'),
            # build_platform same as machine_platform
            'build_platform': {
                # platform attributes sometimes parse without results
                'os_name': platform_info.get('os', ''),
                'platform': platform_info.get('os_platform', ''),
                'architecture': platform_info.get('arch', '')
            },
            'machine_platform': {
                'os_name': platform_info.get('os', ''),
                'platform': platform_info.get('os_platform', ''),
                'architecture': platform_info.get('arch', '')
            },
            # pgo or non-pgo dependent on buildername parsing
            'option_collection': {
                buildbot.extract_build_type(buildername): True
            },
            'log_references': log_reference,
            'artifacts': [
                {
                    'type': 'json',
                    'name': 'buildapi',
                    'log_urls': [],
                    'blob': {
                        'buildername': buildername,
                        'request_id': request_id
                    }
                },
            ]
        }

        treeherder_data['job'] = job

        if project not in th_collections:
            th_collections[project] = TreeherderJobCollection()

        # get treeherder job instance and add the job instance
        # to the collection instance
        th_job = th_collections[project].get_job(treeherder_data)
        th_collections[project].add(th_job)

    if missing_resultsets and not revision_filter:
        common.fetch_missing_resultsets("builds4h", missing_resultsets, logger)

    num_new_jobs = len(job_ids_seen_now.difference(job_ids_seen_last_time))
    logger.info("Imported %d completed jobs, skipped %d previously seen",
                num_new_jobs, len(job_ids_seen_now) - num_new_jobs)

    return th_collections, job_ids_seen_now
def transform(self, data, source, revision_filter=None, project_filter=None, job_group_filter=None):
    """
    Transform the buildapi pending/running structure into one
    TreeherderJobCollection per project, ready for submission via the
    REST API.

    data -- decoded buildapi JSON; data[source] maps
        project -> revision -> list of job dicts.
    source -- 'pending' or 'running'; also used as the job state and as
        the cache key for previously-seen job ids.
    revision_filter / project_filter / job_group_filter -- optional
        filters; non-matching jobs are skipped.

    Returns (th_collections, job_ids_seen_now): a dict mapping project
    name -> TreeherderJobCollection, plus the set of job ids whose
    resultsets were available on this pass (returned so the caller can
    store it under CACHE_KEYS[source] -- see cache.get below; confirm
    against the calling task).
    """
    valid_projects = set(x.project for x in Datasource.objects.cached())

    revision_dict = defaultdict(list)
    missing_resultsets = defaultdict(set)

    # First pass: collect the revisions that need resultset lookups.
    for project, revisions in data[source].iteritems():
        if common.should_skip_project(project, valid_projects, project_filter):
            continue

        for rev, jobs in revisions.items():
            if common.should_skip_revision(rev, revision_filter):
                continue
            for job in jobs:
                if not common.is_blacklisted_buildername(job['buildername']):
                    # Add the revision to the list to be fetched so long as we
                    # find at least one valid job associated with it.
                    revision_dict[project].append(rev)
                    break

    # retrieving the revision->resultset lookups
    revisions_lookup = common.lookup_revisions(revision_dict)

    # Ids of jobs already processed by a previous successful run.
    job_ids_seen_last_time = cache.get(CACHE_KEYS[source], set())
    job_ids_seen_now = set()

    th_collections = {}

    # Second pass: build the job payloads for jobs whose resultset exists.
    for project, revisions in data[source].iteritems():
        if common.should_skip_project(project, valid_projects, project_filter):
            continue

        for revision, jobs in revisions.items():
            if common.should_skip_revision(revision, revision_filter):
                continue

            try:
                resultset = common.get_resultset(project, revisions_lookup, revision, missing_resultsets, logger)
            except KeyError:
                # There was no matching resultset, skip the job.
                continue

            # using project and revision from the revision lookups
            # to filter those jobs with unmatched revision
            for job in jobs:
                buildername = job['buildername']
                if common.is_blacklisted_buildername(buildername):
                    continue

                job_ids_seen_now.add(job['id'])

                # Don't process jobs that were already present in this datasource
                # the last time this task completed successfully.
                if job['id'] in job_ids_seen_last_time:
                    continue

                treeherder_data = {
                    'revision_hash': resultset['revision_hash'],
                    'resultset_id': resultset['id'],
                    'project': project,
                }

                platform_info = buildbot.extract_platform_info(buildername)
                job_name_info = buildbot.extract_name_info(buildername)

                if (job_group_filter and job_name_info.get('group_symbol', '').lower() != job_group_filter.lower()):
                    continue

                # Assumes source is exactly 'pending' or 'running'
                # (anything else would leave request_id unbound).
                if source == 'pending':
                    request_id = job['id']
                elif source == 'running':
                    # The last element in request_ids corresponds to the request id of this job,
                    # the others are for the requests that were coalesced into this one.
                    request_id = job['request_ids'][-1]

                new_job = {
                    'job_guid': common.generate_job_guid(request_id, buildername),
                    'name': job_name_info.get('name', ''),
                    'job_symbol': job_name_info.get('job_symbol', ''),
                    'group_name': job_name_info.get('group_name', ''),
                    'group_symbol': job_name_info.get('group_symbol', ''),
                    'reference_data_name': buildername,
                    'state': source,
                    'submit_timestamp': job['submitted_at'],
                    # Platform attributes sometimes fail to parse from the
                    # buildername; default to '' instead of raising KeyError,
                    # matching the builds-4hr transform's defensive handling.
                    'build_platform': {
                        'os_name': platform_info.get('os', ''),
                        'platform': platform_info.get('os_platform', ''),
                        'architecture': platform_info.get('arch', ''),
                    },
                    # where are we going to get this data from?
                    'machine_platform': {
                        'os_name': platform_info.get('os', ''),
                        'platform': platform_info.get('os_platform', ''),
                        'architecture': platform_info.get('arch', ''),
                    },
                    'who': 'unknown',
                    'option_collection': {
                        # build_type contains an option name, eg. PGO
                        buildbot.extract_build_type(buildername): True
                    },
                    'log_references': [],
                    'artifacts': [
                        {
                            'type': 'json',
                            'name': 'buildapi',
                            'log_urls': [],
                            'blob': {
                                'buildername': buildername,
                                'request_id': request_id
                            }
                        },
                    ]
                }

                if source == 'running':
                    new_job['start_timestamp'] = job['start_time']
                    # We store the original values to help debugging.
                    new_job['artifacts'].append(
                        {
                            'type': 'json',
                            'name': 'buildapi_running',
                            'log_urls': [],
                            'blob': {
                                'revision': revision,
                                'request_ids': job['request_ids'],
                                'submitted_at': job['submitted_at'],
                                'start_time': job['start_time'],
                            }
                        }
                    )

                treeherder_data['job'] = new_job

                if project not in th_collections:
                    th_collections[project] = TreeherderJobCollection()

                # get treeherder job instance and add the job instance
                # to the collection instance
                th_job = th_collections[project].get_job(treeherder_data)
                th_collections[project].add(th_job)

    if missing_resultsets and not revision_filter:
        common.fetch_missing_resultsets(source, missing_resultsets, logger)

    num_new_jobs = len(job_ids_seen_now.difference(job_ids_seen_last_time))
    logger.info("Imported %d %s jobs, skipped %d previously seen",
                num_new_jobs, source, len(job_ids_seen_now) - num_new_jobs)

    return th_collections, job_ids_seen_now
def transform(self, data, source, revision_filter=None, project_filter=None, job_group_filter=None):
    """
    Transform the buildapi pending/running structure into one
    TreeherderJobCollection per project, ready for submission via the
    REST API.

    data -- decoded buildapi JSON; data[source] maps
        project -> revision -> list of job dicts.
    source -- 'pending' or 'running'; also used as the job state and as
        the cache key for previously-seen job ids.
    revision_filter / project_filter / job_group_filter -- optional
        filters; non-matching jobs are skipped.

    Returns (th_collections, job_ids_seen_now): a dict mapping project
    name -> TreeherderJobCollection, plus the set of job ids whose
    resultsets were available on this pass (returned so the caller can
    store it under CACHE_KEYS[source] -- see cache.get below; confirm
    against the calling task).
    """
    valid_projects = set(x.project for x in Datasource.objects.cached())

    revision_dict = defaultdict(list)
    missing_resultsets = defaultdict(set)

    # First pass: collect the revisions that need resultset lookups.
    for project, revisions in data[source].iteritems():
        if common.should_skip_project(project, valid_projects, project_filter):
            continue

        for rev, jobs in revisions.items():
            if common.should_skip_revision(rev, revision_filter):
                continue
            for job in jobs:
                if not common.is_blacklisted_buildername(job['buildername']):
                    # Add the revision to the list to be fetched so long as we
                    # find at least one valid job associated with it.
                    revision_dict[project].append(rev)
                    break

    # retrieving the revision->resultset lookups
    revisions_lookup = common.lookup_revisions(revision_dict)

    # Ids of jobs already processed by a previous successful run.
    job_ids_seen_last_time = cache.get(CACHE_KEYS[source], set())
    job_ids_seen_now = set()

    th_collections = {}

    # Second pass: build the job payloads for jobs whose resultset exists.
    for project, revisions in data[source].iteritems():
        if common.should_skip_project(project, valid_projects, project_filter):
            continue

        for revision, jobs in revisions.items():
            if common.should_skip_revision(revision, revision_filter):
                continue

            try:
                resultset = common.get_resultset(project, revisions_lookup, revision, missing_resultsets, logger)
            except KeyError:
                # There was no matching resultset, skip the job.
                continue

            # using project and revision from the revision lookups
            # to filter those jobs with unmatched revision
            for job in jobs:
                buildername = job['buildername']
                if common.is_blacklisted_buildername(buildername):
                    continue

                job_ids_seen_now.add(job['id'])

                # Don't process jobs that were already present in this datasource
                # the last time this task completed successfully.
                if job['id'] in job_ids_seen_last_time:
                    continue

                treeherder_data = {
                    'revision_hash': resultset['revision_hash'],
                    'resultset_id': resultset['id'],
                    'project': project,
                }

                platform_info = buildbot.extract_platform_info(buildername)
                job_name_info = buildbot.extract_name_info(buildername)

                if (job_group_filter and job_name_info.get('group_symbol', '').lower() != job_group_filter.lower()):
                    continue

                # Assumes source is exactly 'pending' or 'running'
                # (anything else would leave request_id unbound).
                if source == 'pending':
                    request_id = job['id']
                elif source == 'running':
                    # The last element in request_ids corresponds to the request id of this job,
                    # the others are for the requests that were coalesced into this one.
                    request_id = job['request_ids'][-1]

                new_job = {
                    'job_guid': common.generate_job_guid(request_id, buildername),
                    'name': job_name_info.get('name', ''),
                    'job_symbol': job_name_info.get('job_symbol', ''),
                    'group_name': job_name_info.get('group_name', ''),
                    'group_symbol': job_name_info.get('group_symbol', ''),
                    'reference_data_name': buildername,
                    'state': source,
                    'submit_timestamp': job['submitted_at'],
                    # Platform attributes sometimes fail to parse from the
                    # buildername; default to '' instead of raising KeyError,
                    # matching the builds-4hr transform's defensive handling.
                    'build_platform': {
                        'os_name': platform_info.get('os', ''),
                        'platform': platform_info.get('os_platform', ''),
                        'architecture': platform_info.get('arch', ''),
                    },
                    # where are we going to get this data from?
                    'machine_platform': {
                        'os_name': platform_info.get('os', ''),
                        'platform': platform_info.get('os_platform', ''),
                        'architecture': platform_info.get('arch', ''),
                    },
                    'who': 'unknown',
                    'option_collection': {
                        # build_type contains an option name, eg. PGO
                        buildbot.extract_build_type(buildername): True
                    },
                    'log_references': [],
                    'artifacts': [
                        {
                            'type': 'json',
                            'name': 'buildapi',
                            'log_urls': [],
                            'blob': {
                                'buildername': buildername,
                                'request_id': request_id
                            }
                        },
                    ]
                }

                if source == 'running':
                    new_job['start_timestamp'] = job['start_time']
                    # We store the original values to help debugging.
                    new_job['artifacts'].append({
                        'type': 'json',
                        'name': 'buildapi_running',
                        'log_urls': [],
                        'blob': {
                            'revision': revision,
                            'request_ids': job['request_ids'],
                            'submitted_at': job['submitted_at'],
                            'start_time': job['start_time'],
                        }
                    })

                treeherder_data['job'] = new_job

                if project not in th_collections:
                    th_collections[project] = TreeherderJobCollection()

                # get treeherder job instance and add the job instance
                # to the collection instance
                th_job = th_collections[project].get_job(treeherder_data)
                th_collections[project].add(th_job)

    if missing_resultsets and not revision_filter:
        common.fetch_missing_resultsets(source, missing_resultsets, logger)

    num_new_jobs = len(job_ids_seen_now.difference(job_ids_seen_last_time))
    logger.info("Imported %d %s jobs, skipped %d previously seen",
                num_new_jobs, source, len(job_ids_seen_now) - num_new_jobs)

    return th_collections, job_ids_seen_now