class GitHubDB(object):
    """Wrapper around the GitHub API that handles rate limiting and keeps a
    session-backed cache of username lookups (GitHubUserCache rows)."""

    def __init__(self, ghtoken):
        # Get handle to Github API. Authenticated access has far higher rate
        # limits, so fall back to anonymous access only when no token given.
        if ghtoken is not None and ghtoken != '':
            self.gh = login(token=ghtoken)
        else:
            log.warning('Using unauthenticated access to Github API. This will result in severe rate limiting.')
            self.gh = GitHub()

    def waitForRateLimit(self, resourceType):
        """Block until the given rate-limit pool has headroom.

        resourceType can be 'search' or 'core'.
        """
        # FIX: the original recursed into itself on ConnectionError, which
        # could exhaust the stack during a long network outage.  Retry
        # iteratively instead; behavior on success is unchanged.
        while True:
            try:
                rateLimitInfo = self.gh.rate_limit()['resources']
                # 'core' lookups are done in bursts (a user refresh costs
                # several calls), hence the larger safety margin of 12.
                while rateLimitInfo[resourceType]['remaining'] < (1 if resourceType == 'search' else 12):
                    waitTime = max(1, rateLimitInfo[resourceType]['reset'] - time.time())
                    log.warning('Waiting %s seconds for Github rate limit...', waitTime)
                    time.sleep(waitTime)
                    rateLimitInfo = self.gh.rate_limit()['resources']
                return
            except ConnectionError:
                log.error("Connection error while querying GitHub rate limit. Retrying...")

    def refreshGithubUser(self, ghUserObject):
        """Fetch the full profile for a (possibly partial) user object."""
        self.waitForRateLimit('core')
        return ghUserObject.refresh(True)

    def getGithubUserForLogin(self, login, session):
        """Uses the Github API to find the user for the given username.

        Returns NullObject if the user was not found for any reason.
        """
        # Try to use cached result to avoid hitting rate limit.
        cachedUser = session.query(GitHubUserCache).filter(GitHubUserCache.login == login).first()
        if cachedUser is not None:
            return cachedUser if not cachedUser.fake else NullObject()
        # FIX: corrected "GutHub" typo in the log message.
        log.debug('Querying GitHub API for login %s', login)
        try:
            self.waitForRateLimit('core')
            potentialUser = self.gh.user(login)
            if potentialUser is None:
                # Store login as fake so we never query it again.
                session.add(GitHubUserCache(login=login, fake=True))
                return NullObject()
            actualUser = self.refreshGithubUser(potentialUser)
            if isinstance(potentialUser, NullObject):
                # Store login as fake.
                session.add(GitHubUserCache(login=login, fake=True))
            else:
                # Cache the user's public profile fields for future lookups.
                session.add(GitHubUserCache(login=login, name=actualUser.name,
                                            email=actualUser.email,
                                            company=actualUser.company,
                                            location=actualUser.location))
            return actualUser
        except ConnectionError:
            log.error("github query failed when attempting to verify username %s", login)
            return NullObject()

    def searchGithubUsers(self, query):
        """Run a user search query, respecting the 'search' rate limit."""
        self.waitForRateLimit('search')
        return self.gh.search_users(query)
class GitHubAdaptor(object): """ thin wrapper over github3 with the purpose of importin [trac] tickets """ def __init__(self, config, dry_run=False, only_from_cache=False): self._dry_run = dry_run self.only_from_cache = only_from_cache self._mapping = config['mapping'] self._template = config['template'] self._gh = GitHub(token=config['token']) # Everything is done via _repo self._repo = self._gh.repository(config['owner'], config['repository']) self._upstream_repo = self._gh.repository( config['upstream_owner'], config['upstream_repository']) # get current set of available milestones self._milestones = dict({ milestone.title: milestone.number for milestone in self._repo.iter_milestones() }) self._users = dict() self._user_cache = config.get('user_cache', None) self._load_user_cache() def __del__(self): """ save currently known user mapping """ if self._user_cache is not None: with open(self._user_cache, 'w') as user_cache: dump(self._users, user_cache) def _load_user_cache(self): """ load users that are already handled in a previous attempt """ if self._user_cache is not None and os.path.isfile(self._user_cache): with open(self._user_cache) as user_cache: tempo = load(user_cache) assert isinstance(tempo, dict) self._users = tempo self._users.update(self._mapping) def ensure_milestone(self, name): """ check if the given milestone is known already and if it's not create it """ num = self._milestones.get(name, None) if num is None: milestone = self._repo.create_milestone(name) num = self._milestones[name] = milestone.number return num def find_user_in_commits(self, email): """ find a user using the commit api. 
This helps to find more users, as the email is not always public for search api also this helps with rate limits on search api """ if email in self._users: return self._users[email] gh_user = None for commit in self._upstream_repo.iter_commits(author=email, number=1): if commit.author is None: print email, commit.commit.author, "https://github.com/buildbot/buildbot/commit/" + commit.sha q = 'fullname:"{}"'.format(commit.commit.author['name']) result = list(self._gh.search_users(q)) if len(result) == 1: gh_user = result[0].user.login else: print " ".join([r.user.login for r in result]), "possibilities" self.wait_rate_limits() else: gh_user = commit.author.login if gh_user is not None: print "found mapping for", email, ":", gh_user self._users[email] = gh_user return gh_user print "email not found in repositorie's authors", email return None def find_users(self, emails): not_mapped_users = [] for email in emails: q = '{} in:email'.format(email) result = list(self._gh.search_users(q)) print q, result if len(result) == 1: gh_user = result[0].user.login self._users[email] = gh_user else: not_mapped_users.append(email) self.wait_rate_limits() return not_mapped_users def wait_rate_limits(self): for k, v in self._gh.rate_limit()['resources'].items(): if v['remaining'] < 2: print("waiting one minute for rate limiting reasons..", k) time.sleep(60) def get_user(self, user): """ transform the given id to a github username if it's an public e-mail cache results take into account provided mapping """ if user is None: return user gh_user = self._users.get(user, None) if gh_user is None and not self.only_from_cache: gh_user = self._mapping.get(user, user) if gh_user.find('@') > 0: result = list( self._gh.search_users('{} in:email'.format(gh_user))) if len(result) == 1: gh_user = '******'.format(result[0].user.login) self._users[user] = gh_user return gh_user def _user_display(self, user): gh_user = self.get_user(user) if not gh_user: gh_user = "******" if gh_user[0] == '@': 
display_user = gh_user # this will result in a mention else: parts = gh_user.split('@') assert len(parts) in (1, 2), 'Special case, needs handling' if len(parts) == 2: # only first part of the e-mail display_user = '******'.format(parts[0]) else: # use as is display_user = '******'.format(gh_user) return display_user def _convert_contributors(self, contributors): """ represent the list of contributors in Markdown """ result = list() for user, contributions in contributors.items(): display_user = self._user_display(self.get_user(user)) print display_user, contributions result.append(display_user) return ', '.join(result) def _format_comments(self, comments): comments_text = [] for comment in comments: if comment.get('message'): if "Ticket retargeted after milestone closed" not in comment[ 'message']: text = "" text += "Comment from: " + self._user_display( self.get_user(comment['author'])) + "\n" text += convert_text(self.get_user(comment['message'])) comments_text.append(text) return "\n---\n".join(comments_text) def create_issue(self, ticket): """ create an issue in the given project """ assert isinstance(ticket, dict) if self._dry_run: return None, None res = self._repo.create_issue( ticket['summary'], body=self._template.format( trac_id=ticket['id'], trac_url=ticket['url'], users=self._convert_contributors(ticket['contributors']), body=ticket['description'], creation_date=format_date(ticket['time']), modification_date=format_date(ticket['changetime']), comments=self._format_comments(ticket['comments'])), milestone=self.ensure_milestone(ticket['milestone'])) return res, res.html_url
def getGitHubProfiles(locations, languages, num):
    """Search GitHub for user profiles matching *locations* and *languages*,
    rank them by contribution count, and email the top *num* profiles that
    have not been sent before (tracked in a local redis cache).

    locations/languages: iterables of search terms.
    num: desired number of profiles; falsy -> default digest size.
    """
    logger.info("Locations: {0}".format(locations))
    logger.info("Languages: {0}".format(languages))
    num = int(num) if num else DEFAULT_DESIRED_CANDIDATES_PER_EMAIL_DIGEST
    logger.info("Number of Profiles requested: {0}".format(num))

    logger.info("Building query string")
    queryString = ''
    for location in locations:
        queryString = queryString + 'location:"' + location + '" '
    for language in languages:
        queryString = queryString + 'language:"' + language + '" '
    queryString = queryString + 'type:User'
    logger.info("Query String = {}".format(queryString))

    logger.info("Connecting to Github")
    gh = GitHub(token=os.environ['TOKEN'])

    logger.info("Getting a list of matching users using GitHub API")
    matchingUsers = []
    for userSearchResult in gh.search_users(queryString):
        matchingUsers.append(userSearchResult.user)
    logger.info("Number of matching profiles: {}".format(len(matchingUsers)))

    userActivityDict = {}
    logger.info(
        "Using githubcontributions api to get the number of contributions for each user"
    )
    # TODO: Remove the top 25 when ready
    for u in matchingUsers[:25]:
        # SECURITY FIX: use an argv list with shell=False instead of a
        # shell-interpolated string -- u.login comes from external data.
        output = subprocess.check_output(
            ['curl', '-s', 'https://githubcontributions.io/api/user/' + u.login])
        userActivityDict[u.login] = json.loads(output)['eventCount']

    logger.info("Sorting the profiles based on # of contributions")
    topUsers = sorted(userActivityDict.items(), key=lambda x: x[1], reverse=True)

    logger.info(
        "Emailing top {} profiles not already in the cache (not already sent before)"
        .format(num))
    r = redis.StrictRedis(host='localhost', port=6379, db=0)
    format.initialize(num)
    # TODO Run the following when done debugging, to clear the cache
    # redis-cli flushall
    count = 0
    for u in topUsers:
        if count < num:
            usr = gh.user(u[0])
            contributions = u[1]
            # BUG FIX: the original used an `or`-chain
            # ('HookLogic' not in c or 'Hooklogic' not in c), which is true
            # for virtually every company string, so HookLogic employees were
            # never actually excluded.  Check case-insensitively instead.
            if not r.exists(usr.login) and (
                    usr.company is None
                    or 'hooklogic' not in usr.company.lower()):
                # Query StackExchange for User id
                output = subprocess.check_output(
                    ['curl', '-s',
                     'http://data.stackexchange.com/stackoverflow/csv/670133?Name=' + usr.login])
                # decode explicitly so splitting works on both py2 and py3
                user_id = output.decode('utf-8').split('\n')[1].replace('"', '')
                stackoverflow_url = "http://stackoverflow.com/users/" + user_id + "/" + usr.login
                format.format_html(usr, contributions,
                                   stackoverflow_url if user_id else '')
                r.set(usr.login, True)
                count = count + 1
    format.save_file()
    send_email.send()