def get_issue(url):
    """Get the data for a specific issue.

    Args:
      url: URL of the issue
    """
    gh_client = graphql.GraphQLClient()
    result = github_util.get_issue(url, gh_client)
    print(json.dumps(result, indent=4, sort_keys=True))

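# A minimal usage sketch for get_issue above. The URL is hypothetical, and we
# assume graphql.GraphQLClient picks up GitHub credentials from the
# environment, since get_issue constructs it with no arguments.
def _example_get_issue():
    # Prints the issue payload as pretty-printed JSON.
    get_issue("https://github.com/kubeflow/examples/issues/1")
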
def __init__(self):
    # A dictionary mapping keys to individual models.
    self._models = {}
    self._load_models()

    self._gh_client = graphql.GraphQLClient()

    if not self._gh_client._headers:
        logging.error(
            "client._headers not set on GraphQLClient. This likely "
            "means no GitHub credentials are loaded, and requests to "
            "the GitHub API will fail.")

def graphql_client(self, org, repo):
    """Return a GitHub GraphQL client for the specified org and repository.

    Args:
      org: The GitHub organization.
      repo: The repository name.
    """
    # TODO(jlewi): Should we cache these?
    ghapp = github_app.GitHubApp.create_from_env()
    token_generator = github_app.GitHubAppTokenGenerator(
        ghapp, f"{org}/{repo}")
    gh_client = graphql.GraphQLClient(headers=token_generator.auth_headers)
    return gh_client

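# Hedged usage sketch for graphql_client: `server` stands in for an instance
# of the class defining the method, and the org/repo values are illustrative.
# run_query is the same method the fetchers below rely on; the viewer query
# is a standard GitHub GraphQL smoke test.
def _example_graphql_client(server):
    client = server.graphql_client("kubeflow", "examples")
    print(client.run_query("query { viewer { login } }"))
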
def __init__(self):
    self._client = graphql.GraphQLClient()
    self._headers = None
    self._token_refresher = None

    # Try various methods to obtain credentials.
    try:
        self._token_refresher = github_app.FixedAccessTokenGenerator.from_env()
    except ValueError:
        logging.info(
            "Could not create a FixedAccessTokenGenerator; will try "
            "other methods for obtaining credentials")

    if not self._token_refresher:
        app = github_app.GitHubApp.create_from_env()
        self._token_refresher = github_app.GitHubAppTokenGenerator(
            app, "kubeflow/manifests")

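# Hedged sketch of how the token refresher above is typically consumed:
# auth_headers is the attribute used elsewhere in this file (see
# add_labels_to_issue), so a client can be rebuilt with fresh credentials at
# request time.
def _example_client_from_refresher(token_refresher):
    return graphql.GraphQLClient(headers=token_refresher.auth_headers)
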
def add_labels_to_issue(self, installation_id, repo_owner, repo_name,
                        issue_num, predictions):
    """Add predicted labels to an issue using the GitHub API.

    Args:
      installation_id: GitHub App installation id for the repo
      repo_owner: repo owner
      repo_name: repo name
      issue_num: issue number
      predictions: dict str -> float; dictionary of labels and their
        predicted probability
    """
    # TODO(jlewi): Should we cache the GitHub App? What about token
    # expiration?
    ghapp = github_app.GitHubApp.create_from_env()

    # Load IssueLabelBot config. Look for both organization configuration
    # and repo specific configuration.
    # TODO(jlewi): We should really cache these and use some form of
    # expiration to pick up changes.
    org_config = github_util.get_yaml(owner=repo_owner, repo=ORG_CONFIG_REPO,
                                      ghapp=ghapp)
    repo_config = github_util.get_yaml(owner=repo_owner, repo=repo_name,
                                       ghapp=ghapp)

    context = {
        "repo_owner": repo_owner,
        "repo_name": repo_name,
        "issue_num": issue_num,
    }

    config = {}
    if org_config:
        config.update(org_config)
    if repo_config:
        config.update(repo_config)

    predictions = self.apply_repo_config(config, repo_owner, repo_name,
                                         predictions, ghapp)

    url = util.build_issue_url(repo_owner, repo_name, issue_num)

    token_generator = github_app.GitHubAppTokenGenerator(
        ghapp, f"{repo_owner}/{repo_name}")
    gh_client = graphql.GraphQLClient(headers=token_generator.auth_headers)
    issue_data = github_util.get_issue(url, gh_client)

    predicted_labels = set(predictions.keys())

    # Remove from label_names any labels which have already been applied
    # or which were explicitly removed.
    label_names = set(predicted_labels) - set(issue_data["labels"])
    label_names = label_names - set(issue_data["removed_labels"])

    already_applied = predicted_labels.intersection(issue_data["labels"])
    removed = predicted_labels.intersection(issue_data["removed_labels"])

    filtered_info = {}
    filtered_info.update(context)
    filtered_info["predicted_labels"] = list(predicted_labels)
    filtered_info["already_applied"] = list(already_applied)
    filtered_info["removed"] = list(removed)
    logging.info("Filtered predictions", extra=filtered_info)

    label_names = list(label_names)

    # Check whether the bot has already commented on this issue.
    already_commented = False
    for a in LABEL_BOT_LOGINS:
        if a in issue_data["comment_authors"]:
            already_commented = True
            break

    if already_commented:
        logging.info("Label bot has already commented on issue.",
                     extra=context)
    else:
        logging.info("Label bot has not commented on issue.", extra=context)

    if not installation_id:
        logging.info("No GitHub App installation id provided; fetching it.")
        installation_id = ghapp.get_installation_id(repo_owner, repo_name)

    install = ghapp.get_installation(installation_id)

    # We are using the github3 library to add comments.
    # TODO(jlewi): We should use GraphQL so we can use a single library.
    issue = install.issue(repo_owner, repo_name, issue_num)

    message = None
    if label_names:
        # Create a markdown table with probabilities.
        rows = [
            "| Label | Probability |",
            "| ------------- | ------------- |",
        ]
        for l in label_names:
            rows.append("| {} | {:.2f} |".format(l, predictions[l]))

        lines = [
            "Issue-Label Bot is automatically applying the labels:",
            "",
        ]
        lines.extend(rows)
        lines.append("")
        lines.append(
            "Please mark this comment with :thumbsup: or :thumbsdown: "
            "to give our bot feedback! ")
        lines.append(
            "Links: [app homepage](https://github.com/marketplace/issue-label-bot), "
            "[dashboard]({app_url}data/{repo_owner}/{repo_name}) and "
            "[code](https://github.com/hamelsmu/MLapp) for this bot.".format(
                app_url=self.app_url, repo_owner=repo_owner,
                repo_name=repo_name))
        message = "\n".join(lines)

        # Label the issue using the GitHub API.
        issue.add_labels(*label_names)
        context["labels"] = label_names
        logging.info(
            f'Added `{"`, `".join(label_names)}` to issue #{issue_num}',
            extra=context)
    else:
        # We don't want to spam an issue with comments, so once label
        # bot has commented on an issue we will not chime in again to
        # report that we aren't confident.
        if not already_commented:
            # TODO(jlewi): Should we include top predictions for area and
            # platform? Maybe we should include top predictions for
            # all areas? The problem is the model only returns predictions
            # above the threshold.
            message = """Issue Label Bot is not confident enough to auto-label this issue.
See [dashboard]({app_url}data/{repo_owner}/{repo_name}) for more details.
""".format(app_url=self.app_url, repo_owner=repo_owner,
           repo_name=repo_name)
            logging.warning(
                f"Not confident enough to label issue #{issue_num}",
                extra=context)

    # Make a comment using the GitHub API.
    if message:
        comment = issue.create_comment(message)

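# Hedged usage sketch for add_labels_to_issue: `worker` stands in for an
# instance of the enclosing class, and all identifiers and probabilities are
# illustrative. Passing installation_id=None exercises the lookup branch
# above.
def _example_add_labels(worker):
    worker.add_labels_to_issue(
        installation_id=None,
        repo_owner="kubeflow",
        repo_name="examples",
        issue_num=1,
        predictions={"area/jupyter": 0.9, "kind/bug": 0.42})
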
def _iter_issues(self, org, repo, issue_filter=None, output=None):
    """Iterate over issues in batches for a repository.

    Args:
      org: The org that owns the repository
      repo: The name of the repository
      issue_filter: Used to filter issues to consider based on when they
        were last updated
      output: The directory to write the results; if not specified,
        results are not written to disk

    Writes the issues along with the first comments to a file in the
    output directory.
    """
    client = graphql.GraphQLClient()

    num_issues_per_page = 100

    if not issue_filter:
        today = datetime.datetime.now()
        today = datetime.datetime(year=today.year, month=today.month,
                                  day=today.day)
        start_time = today - datetime.timedelta(days=60)

    # Labels and projects are available via timeline events.
    # However, in timeline events project info (e.g. the actual project
    # name) is only in developer preview.
    # The advantage of using labels and projectCards (as opposed to timeline
    # events) is that it's much easier to bound the number of items we need
    # to fetch in order to return all labels and projects. For timeline
    # items it's much more likely the labels and projects we care about
    # will require pagination.
    #
    # TODO(jlewi): We should add a method to fetch all issue timeline items
    # via pagination in case the number of items exceeds the page size.
    #
    # TODO(jlewi): We need to consider closed issues if we want to compute
    # stats.
    #
    # TODO(jlewi): We should support fetching only OPEN issues; if we are
    # deciding which issues need triage or have been triaged we really only
    # need to look at open issues. Closed issues will automatically move to
    # the appropriate card in the Kanban board.
    query = """query getIssues($org: String!, $repo: String!, $pageSize: Int, $issueCursor: String, $filter: IssueFilters) {
repository(owner: $org, name: $repo) {
  issues(first: $pageSize, filterBy: $filter, after: $issueCursor) {
    totalCount
    pageInfo {
      endCursor
      hasNextPage
    }
    edges {
      node {
        author {
          __typename
          ... on User {
            login
          }
          ... on Bot {
            login
          }
        }
        id
        title
        body
        url
        state
        createdAt
        closedAt
        labels(first: 30) {
          totalCount
          edges {
            node {
              name
            }
          }
        }
        projectCards(first: 30) {
          totalCount
          pageInfo {
            endCursor
            hasNextPage
          }
          edges {
            node {
              id
              project {
                name
                number
              }
            }
          }
        }
        timelineItems(first: 30) {
          totalCount
          pageInfo {
            endCursor
            hasNextPage
          }
          edges {
            node {
              __typename
              ... on AddedToProjectEvent {
                createdAt
              }
              ... on LabeledEvent {
                createdAt
                label {
                  name
                }
              }
              ... on ClosedEvent {
                createdAt
              }
            }
          }
        }
      }
    }
  }
}
}
"""

    shard = 0
    num_pages = None

    if output and not os.path.exists(output):
        os.makedirs(output)

    total_issues = None
    has_next_issues_page = True
    # TODO(jlewi): We should persist the cursors to disk so we can resume
    # after errors.
    issues_cursor = None
    shard_writer = None

    if not issue_filter:
        start_time = datetime.datetime.now() - datetime.timedelta(weeks=24)
        issue_filter = {
            "since": start_time.isoformat(),
        }

    while has_next_issues_page:
        variables = {
            "org": org,
            "repo": repo,
            "pageSize": num_issues_per_page,
            "issueCursor": issues_cursor,
            "filter": issue_filter,
        }
        results = client.run_query(query, variables=variables)

        if results.get("errors"):
            message = json.dumps(results.get("errors"))
            logging.error(
                f"There was a problem issuing the query; errors:\n{message}\n")
            return

        if not total_issues:
            total_issues = results["data"]["repository"]["issues"]["totalCount"]
            num_pages = int(np.ceil(total_issues / float(num_issues_per_page)))
            logging.info("%s/%s has a total of %s issues", org, repo,
                         total_issues)

        if output and not shard_writer:
            logging.info("Initializing the shard writer")
            shard_writer = graphql.ShardWriter(
                num_pages, output,
                prefix="issues-{0}-{1}".format(org, repo))

        issues = graphql.unpack_and_split_nodes(
            results, ["data", "repository", "issues", "edges"])

        yield issues

        if shard_writer:
            shard_writer.write_shard(issues)

        page_info = results["data"]["repository"]["issues"]["pageInfo"]
        issues_cursor = page_info["endCursor"]
        has_next_issues_page = page_info["hasNextPage"]

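# Hedged usage sketch for _iter_issues: it is a generator that yields one
# batch of issue nodes per GraphQL page. `fetcher` stands in for an instance
# of the enclosing class; the filter shape matches GitHub's IssueFilters
# input used in the query above.
def _example_iter_issues(fetcher):
    import datetime  # assumed to already be imported at module level
    since = datetime.datetime.now() - datetime.timedelta(days=7)
    issue_filter = {"since": since.isoformat()}
    for batch in fetcher._iter_issues("kubeflow", "examples",
                                      issue_filter=issue_filter):
        logging.info("Fetched a batch of %s issues", len(batch))
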
def client(self):
    if not self._client:
        self._client = graphql.GraphQLClient()
    return self._client

def _iter_issues(self, org, repo, output=None):
    """Iterate over issues in batches for a repository.

    Args:
      org: The org that owns the repository
      repo: The name of the repository
      output: The directory to write the results; if not specified,
        results are not written to disk

    Writes the issues along with the first comments to a file in the
    output directory.
    """
    client = graphql.GraphQLClient()

    num_issues_per_page = 100

    # TODO(jlewi): Use query variables instead of string formatting.
    query_template = """{{
repository(owner: "{org}", name: "{repo}") {{
  issues(first: {num_issues_per_page}, states: OPEN, {issues_cursor}) {{
    totalCount
    pageInfo {{
      endCursor
      hasNextPage
    }}
    edges {{
      node {{
        author {{
          __typename
          ... on User {{
            login
          }}
          ... on Bot {{
            login
          }}
        }}
        id
        title
        body
        url
        state
        labels(first: 30) {{
          totalCount
          edges {{
            node {{
              name
            }}
          }}
        }}
        projectCards(first: 30) {{
          totalCount
          edges {{
            node {{
              id
              project {{
                name
                number
              }}
            }}
          }}
        }}
      }}
    }}
  }}
}}
}}
"""

    shard = 0
    num_pages = None

    if output and not os.path.exists(output):
        os.makedirs(output)

    total_issues = None
    has_next_issues_page = True
    # TODO(jlewi): We should persist the cursors to disk so we can resume
    # after errors.
    issues_cursor = None
    shard_writer = None

    while has_next_issues_page:
        issues_cursor_text = ""
        if issues_cursor:
            issues_cursor_text = "after:\"{0}\"".format(issues_cursor)

        query = query_template.format(
            org=org, repo=repo,
            num_issues_per_page=num_issues_per_page,
            issues_cursor=issues_cursor_text)

        results = client.run_query(query)

        if results.get("errors"):
            message = json.dumps(results.get("errors"))
            logging.error("There was a problem issuing the query; errors:\n%s",
                          message)
            return

        if not total_issues:
            total_issues = results["data"]["repository"]["issues"]["totalCount"]
            num_pages = int(np.ceil(total_issues / float(num_issues_per_page)))
            logging.info("%s/%s has a total of %s issues", org, repo,
                         total_issues)

        if output and not shard_writer:
            logging.info("Initializing the shard writer")
            shard_writer = graphql.ShardWriter(
                num_pages, output,
                prefix="issues-{0}-{1}".format(org, repo))

        issues = graphql.unpack_and_split_nodes(
            results, ["data", "repository", "issues", "edges"])

        yield issues

        if shard_writer:
            shard_writer.write_shard(issues)

        page_info = results["data"]["repository"]["issues"]["pageInfo"]
        issues_cursor = page_info["endCursor"]
        has_next_issues_page = page_info["hasNextPage"]

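# Hedged sketch of the TODO above ("Use query variables"): run_query also
# accepts a variables dict, as the variables-based _iter_issues earlier in
# this section shows, which avoids splicing cursors into the query string by
# hand. The query passed in would need to declare $org, $repo, $pageSize and
# $issueCursor.
def _example_run_query_with_variables(client, query, org, repo, cursor):
    variables = {
        "org": org,
        "repo": repo,
        "pageSize": 100,
        "issueCursor": cursor,
    }
    return client.run_query(query, variables=variables)
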
def fetch_issues(self, org, repo, output):
    """Fetch issues for a repository.

    Args:
      org: The org that owns the repository
      repo: The name of the repository
      output: The directory to write the results

    Writes the issues along with the first comments to a file in the
    output directory.
    """
    client = graphql.GraphQLClient()

    num_issues_per_page = 100
    query_template = """{{
repository(owner: "{org}", name: "{repo}") {{
  issues(first: {num_issues_per_page} {issues_cursor}) {{
    totalCount
    pageInfo {{
      endCursor
      hasNextPage
    }}
    edges {{
      node {{
        author {{
          __typename
          ... on User {{
            login
          }}
          ... on Bot {{
            login
          }}
        }}
        title
        body
        comments(first: 20) {{
          totalCount
          edges {{
            node {{
              author {{
                __typename
                ... on User {{
                  login
                }}
                ... on Bot {{
                  login
                }}
              }}
              body
              createdAt
            }}
          }}
        }}
      }}
    }}
  }}
}}
}}
"""

    shard = 0
    num_pages = None

    if not os.path.exists(output):
        os.makedirs(output)

    total_issues = None
    has_next_issues_page = True
    # TODO(jlewi): We should persist the cursors to disk so we can resume
    # after errors.
    issues_cursor = None

    while has_next_issues_page:
        issues_cursor_text = ""
        if issues_cursor:
            issues_cursor_text = "after:\"{0}\"".format(issues_cursor)

        query = query_template.format(
            org=org, repo=repo,
            num_issues_per_page=num_issues_per_page,
            issues_cursor=issues_cursor_text)

        results = client.run_query(query)

        if results.get("errors"):
            # The errors are structured objects, so serialize them as JSON
            # rather than joining them directly.
            logging.error(
                "There was a problem issuing the query; errors:\n%s",
                json.dumps(results.get("errors")))
            return

        if not total_issues:
            total_issues = results["data"]["repository"]["issues"]["totalCount"]
            num_pages = int(np.ceil(total_issues / float(num_issues_per_page)))
            logging.info("%s/%s has a total of %s issues", org, repo,
                         total_issues)

        shard_file = os.path.join(
            output,
            "issues-{0}-{1}-{2:03d}-of-{3:03d}.json".format(
                org, repo, shard, num_pages))

        issues = process_issue_results(results)
        with open(shard_file, "w") as hf:
            for i in issues:
                json.dump(i, hf)
                hf.write("\n")

        logging.info("Wrote shard %s to %s", shard, shard_file)
        shard += 1

        page_info = results["data"]["repository"]["issues"]["pageInfo"]
        issues_cursor = page_info["endCursor"]
        has_next_issues_page = page_info["hasNextPage"]

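# Hedged usage sketch for fetch_issues: it writes JSON-lines shards named
# issues-{org}-{repo}-NNN-of-MMM.json under `output`. `fetcher` stands in
# for an instance of the enclosing class; the org, repo, and path are
# illustrative.
def _example_fetch_issues(fetcher):
    fetcher.fetch_issues("kubeflow", "examples", output="/tmp/issues")
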