import xml.etree.ElementTree as ET
from os import getenv

import neomodel
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError

# Note: `log`, `retry_session`, `timestamp_to_datetime`, `DistGitScraper`, and the Neo4j
# models (FreshmakerEvent, FreshmakerBuild, Advisory, ContainerKojiBuild, KojiBuild) are
# assumed to be defined elsewhere in the project.


def _get_repo_info(repo_and_commit):
    """
    Query cgit for the namespace, username, and email of the author.

    :param tuple repo_and_commit: contains the repo and commit to query for
    :return: a dictionary with the keys namespace, author_username, author_email, and the
        commit
    :rtype: dict
    """
    repo, commit = repo_and_commit
    log.debug('Attempting to find the cgit URL for the commit "{0}" in repo "{1}"'
              .format(commit, repo))
    session = retry_session()
    rv = {'commit': commit}
    cgit_result = None
    # The tuple of namespaces to try when determining which namespace this git module
    # belongs to, since this information isn't stored in GitBZ yet
    namespaces = ('rpms', 'containers', 'modules', 'tests')
    cgit_url = getenv('ESTUARY_CGIT_URL', 'http://pkgs.devel.redhat.com/cgit/')
    for namespace in namespaces:
        url = '{0}{1}/{2}/commit/?id={3}&dt=2'.format(cgit_url, namespace, repo, commit)
        log.debug('Trying the URL "{0}"'.format(url))
        try:
            cgit_result = session.get(url, timeout=15)
        except ConnectionError:
            log.error('The connection to "{0}" failed'.format(url))
            continue

        if cgit_result.status_code == 200:
            # If the repo is empty, cgit oddly returns a 200 status code, so let's correct
            # the status code so that the remainder of the code knows it's a bad request
            if 'Repository seems to be empty' in cgit_result.text:
                cgit_result.status_code = 404
            else:
                # If the repo is populated and a 200 status code is returned, then we can
                # assume we found the correct repo
                break

    if not cgit_result or cgit_result.status_code != 200:
        log.error('Couldn\'t find the commit "{0}" for the repo "{1}" in the namespaces: {2}'
                  .format(commit, repo, ', '.join(namespaces)))
        return rv

    log.debug('Found the cgit URL "{0}" for the commit "{1}" in repo "{2}"'
              .format(url, commit, repo))
    rv['namespace'] = namespace

    # Start parsing the cgit content
    soup = BeautifulSoup(cgit_result.text, 'html.parser')
    # Workaround for BS4 in EL7, since `soup.find('th', string=person)` doesn't work in
    # that environment
    th_tags = soup.find_all('th')
    data_found = {'author': False}
    for th_tag in th_tags:
        if th_tag.string in ('author',):
            data_found[th_tag.string] = True
            username_key = '{0}_username'.format(th_tag.string)
            email_key = '{0}_email'.format(th_tag.string)
            rv[username_key], rv[email_key] = DistGitScraper._parse_username_email_from_cgit(
                th_tag, commit, namespace, repo)
        # If all the "th" elements we're interested in were parsed, then break from the
        # loop early
        if all(data_found.values()):
            break
    soup.decompose()
    return rv
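# `retry_session` is used throughout this module but isn't defined in this snippet.
# Below is a minimal sketch of what such a helper commonly looks like, assuming the
# standard requests/urllib3 retry pattern; the retry counts and status codes are
# illustrative assumptions, not the project's actual values.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def retry_session(retries=3, backoff_factor=1, status_forcelist=(500, 502, 503, 504)):
    """Return a requests session that retries transient HTTP failures."""
    session = requests.Session()
    retry = Retry(
        total=retries,
        connect=retries,
        read=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry)
    # Apply the retry policy to both HTTP and HTTPS requests
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session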
def query_api_and_update_neo4j(self):
    """
    Scrape the Freshmaker API and upload the data to Neo4j.
    """
    # Initialize the session and URL
    session = retry_session()
    fm_url = self.freshmaker_url
    while True:
        log.debug('Querying {0}'.format(fm_url))
        try:
            rv_json = session.get(fm_url, timeout=60).json()
        except ConnectionError:
            # TODO: Remove this once FACTORY-3955 is resolved
            log.error(
                'The connection to Freshmaker at %s failed. Skipping the rest of the scraper.',
                fm_url,
            )
            break

        for fm_event in rv_json['items']:
            try:
                int(fm_event['search_key'])
            except ValueError:
                # Skip Freshmaker Events that don't have the search_key as the Advisory ID
                continue
            log.debug('Creating FreshmakerEvent {0}'.format(fm_event['id']))
            event_params = dict(
                id_=fm_event['id'],
                event_type_id=fm_event['event_type_id'],
                message_id=fm_event['message_id'],
                state=fm_event['state'],
                state_name=fm_event['state_name'],
                state_reason=fm_event['state_reason'],
                url=fm_event['url'],
            )
            if fm_event.get('time_created'):
                event_params['time_created'] = timestamp_to_datetime(fm_event['time_created'])
            if fm_event.get('time_done'):
                event_params['time_done'] = timestamp_to_datetime(fm_event['time_done'])
            event = FreshmakerEvent.create_or_update(event_params)[0]

            log.debug('Creating Advisory {0}'.format(fm_event['search_key']))
            advisory = Advisory.get_or_create(dict(id_=fm_event['search_key']))[0]
            event.conditional_connect(event.triggered_by_advisory, advisory)

            for build_dict in fm_event['builds']:
                # To handle a faulty container build in Freshmaker
                if build_dict['build_id'] and int(build_dict['build_id']) < 0:
                    continue
                log.debug('Creating FreshmakerBuild {0}'.format(build_dict['build_id']))
                fb_params = dict(
                    id_=build_dict['id'],
                    dep_on=build_dict['dep_on'],
                    name=build_dict['name'],
                    original_nvr=build_dict['original_nvr'],
                    rebuilt_nvr=build_dict['rebuilt_nvr'],
                    state=build_dict['state'],
                    state_name=build_dict['state_name'],
                    state_reason=build_dict['state_reason'],
                    time_submitted=timestamp_to_datetime(build_dict['time_submitted']),
                    type_=build_dict['type'],
                    type_name=build_dict['type_name'],
                    url=build_dict['url'],
                )
                if build_dict['time_completed']:
                    fb_params['time_completed'] = timestamp_to_datetime(
                        build_dict['time_completed'])
                if build_dict['build_id']:
                    fb_params['build_id'] = build_dict['build_id']
                fb = FreshmakerBuild.create_or_update(fb_params)[0]
                event.requested_builds.connect(fb)

                # The build ID obtained from the Freshmaker API is actually a Koji task ID
                task_result = None
                if build_dict['build_id']:
                    task_result = self.get_koji_task_result(build_dict['build_id'])
                if not task_result:
                    continue

                # Extract the build ID from a task result
                xml_root = ET.fromstring(task_result)
                # TODO: Change this if a task can trigger multiple builds
                try:
                    build_id = xml_root.find(".//*[name='koji_builds'].//string").text
                except AttributeError:
                    build_id = None
                if not build_id:
                    continue

                log.debug('Creating ContainerKojiBuild {0}'.format(build_id))
                build_params = {
                    'id_': build_id,
                    'original_nvr': build_dict['original_nvr'],
                }
                try:
                    build = ContainerKojiBuild.create_or_update(build_params)[0]
                except neomodel.exceptions.ConstraintValidationFailed:
                    # This must have errantly been created as a KojiBuild instead of a
                    # ContainerKojiBuild, so let's fix that.
                    build = KojiBuild.nodes.get_or_none(id_=build_id)
                    if not build:
                        # If there was a constraint validation failure and the build isn't
                        # just the wrong label, then we can't recover.
                        raise
                    build.add_label(ContainerKojiBuild.__label__)
                    build = ContainerKojiBuild.create_or_update(build_params)[0]
                event.successful_koji_builds.connect(build)

        if rv_json['meta'].get('next'):
            fm_url = rv_json['meta']['next']
        else:
            break
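# `timestamp_to_datetime` is another project helper not shown in this snippet. A
# minimal sketch, assuming the Freshmaker API returns ISO 8601 timestamps such as
# "2019-08-21T13:42:20Z" (the exact wire format is an assumption):
from datetime import datetime


def timestamp_to_datetime(timestamp):
    """Convert an ISO 8601 timestamp string to a datetime object."""
    # Strip any sub-second precision and a trailing "Z" UTC designator before parsing
    return datetime.strptime(timestamp.split('.')[0].rstrip('Z'), '%Y-%m-%dT%H:%M:%S')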
def query_api_and_update_neo4j(self):
    """
    Scrape the Freshmaker API and upload the data to Neo4j.
    """
    # Initialize the session and URL
    session = retry_session()
    fm_url = self.freshmaker_url
    while True:
        log.debug('Querying {0}'.format(fm_url))
        rv_json = session.get(fm_url, timeout=15).json()
        for fm_event in rv_json['items']:
            try:
                int(fm_event['search_key'])
            except ValueError:
                # Skip Freshmaker Events that don't have the search_key as the Advisory ID
                continue
            event = FreshmakerEvent.create_or_update(dict(
                id_=fm_event['id'],
                event_type_id=fm_event['event_type_id'],
                message_id=fm_event['message_id'],
                state=fm_event['state'],
                state_name=fm_event['state_name'],
                state_reason=fm_event['state_reason'],
                url=fm_event['url'],
            ))[0]
            advisory = Advisory.get_or_create(dict(id_=fm_event['search_key']))[0]
            event.conditional_connect(event.triggered_by_advisory, advisory)

            for build_dict in fm_event['builds']:
                # To handle a faulty container build in Freshmaker
                if not build_dict['build_id'] or int(build_dict['build_id']) < 0:
                    continue
                # The build ID obtained from the Freshmaker API is actually a Koji task ID
                task_result = self.get_koji_task_result(build_dict['build_id'])
                if not task_result:
                    continue
                # Extract the build ID from a task result
                xml_root = ET.fromstring(task_result)
                # TODO: Change this if a task can trigger multiple builds
                try:
                    build_id = xml_root.find(".//*[name='koji_builds'].//string").text
                except AttributeError:
                    build_id = None
                if build_id:
                    build = ContainerKojiBuild.get_or_create(dict(
                        id_=build_id,
                        original_nvr=build_dict['original_nvr'],
                    ))[0]
                    event.triggered_container_builds.connect(build)

        if rv_json['meta'].get('next'):
            fm_url = rv_json['meta']['next']
        else:
            break
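# Both versions of the scraper above pull the real build ID out of a Koji task
# result, which is an XML-RPC response whose "koji_builds" struct member holds the
# build IDs. The self-contained example below demonstrates that extraction against
# a hand-written sample document; the sample XML is illustrative of the XML-RPC
# layout, not captured from a real Koji task.
import xml.etree.ElementTree as ET

SAMPLE_TASK_RESULT = """\
<methodResponse><params><param><value><struct>
  <member>
    <name>koji_builds</name>
    <value><array><data>
      <value><string>123456</string></value>
    </data></array></value>
  </member>
</struct></value></param></params></methodResponse>
"""

# Find the element with a child <name> of "koji_builds", then take its first
# <string> descendant, mirroring the `find` call used in the scrapers above
build_id = ET.fromstring(SAMPLE_TASK_RESULT).find(
    ".//*[name='koji_builds'].//string").text
assert build_id == '123456'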