def __init__(self, commits=True, blames=True, botmeta=None, botmetafile=None, maintainers=None, gh_client=None, cachedir=u'~/.ansibullbot/cache', gitrepo=None):
    '''
    Maintainers: defaultdict(dict) where keys are filepath and values are dict
    gh_client: GraphQL GitHub client
    '''
    # feature switches and collaborators handed in by the caller
    self.get_commits = commits
    self.get_blames = blames
    self.botmetafile = botmetafile
    self.gqlc = gh_client
    self.maintainers = maintainers or {}

    # BOTMETA.yml file with minor updates (macro rendered, empty default
    # values fixed); falls back to an empty dict when none is supplied
    self.botmeta = botmeta if botmeta else {}

    # keys: paths of files belonging to the repository
    self.modules = {}
    self.importmap = {}
    self.files = []

    # web scraper with its own cache directory below cachedir
    self.scraper_cache = os.path.expanduser(
        os.path.join(cachedir, u'ansible.modules.scraper'))
    self.gws = GithubWebScraper(cachedir=self.scraper_cache)

    # use the caller's repo wrapper, or build one for ansible/ansible
    self.gitrepo = gitrepo if gitrepo else GitRepoWrapper(
        cachedir=cachedir,
        repo=u'https://github.com/ansible/ansible')

    # sqlalchemy: sqlite database stored below cachedir
    db_path = os.path.expanduser(
        os.path.join(cachedir, u'ansible_module_indexer.db'))
    self.engine = create_engine(u'sqlite:///' + db_path)
    self.Session = sessionmaker(bind=self.engine)
    self.session = self.Session()
    Email.metadata.create_all(self.engine)
    Blame.metadata.create_all(self.engine)

    self.committers = {}    # committers by module
    self.commits = {}       # commits by module
    self.emails_cache = {}  # map of email to github login

    # load the bot meta
    self.update(force=True)
def __init__(self, maintainers=None):
    '''Set up empty module/commit maps and the web scraper cache.'''
    self.modules = {}
    self.importmap = {}
    self.maintainers = maintainers or {}

    # local git checkout location (with ~ expanded)
    self.checkoutdir = os.path.expanduser(
        '~/.ansibullbot/cache/ansible.modules.checkout')

    # scraper cache location (with ~ expanded) and the scraper using it
    self.scraper_cache = os.path.expanduser(
        '~/.ansibullbot/cache/ansible.modules.scraper')
    self.gws = GithubWebScraper(cachedir=self.scraper_cache)

    self.committers = {}  # committers by module
    self.commits = {}     # commits by module
    self.emailmap = {}    # map of email to github login
def __init__(self, issuewrapper, meta):
    '''Collect the template sections, then process and build the body.

    issuewrapper: wrapper whose instance.body, github_type, template_data,
                  labels and title are read by this class
    meta: dict of facts about the issue

    Raises Exception(u'out of order section') when the fetched template
    does not start with an "issue type" or "summary" heading (including
    the case where it has no usable heading at all), unless
    C.DEFAULT_BREAKPOINTS is set, in which case a debugger is started.
    '''
    self.issuewrapper = issuewrapper
    self.original = self.issuewrapper.instance.body
    self.meta = meta
    self.missing = []
    self.sections = {}
    self.section_map = {}
    self.section_order = []
    self.new_description = u''
    self.retemplate = True

    self.cachedir = u'~/.ansibullbot/cache'
    self.cachedir = os.path.expanduser(self.cachedir)
    self.gws = GithubWebScraper(cachedir=self.cachedir)

    # pull requests and issues use different templates
    if self.issuewrapper.github_type == u'pullrequest':
        rfile = PTEMPLATE
    else:
        rfile = ITEMPLATE
    raw = self.gws.get_raw_content(u'ansible', u'ansible', u'devel',
                                   rfile, usecache=True)

    # every "##### TITLE" line of the template names a section
    for rline in raw.split(u'\n'):
        if not rline.startswith(u'#####'):
            continue
        parts = rline.strip().split(None, 1)
        if len(parts) < 2:
            # heading marker without a title: nothing to register
            # (indexing [1] here used to raise IndexError)
            continue
        section = parts[1].lower()
        self.section_order.append(section)
        self.sections[section] = u''

    # an empty section list is treated like a misordered template instead
    # of failing with IndexError on section_order[0]
    if not self.section_order or \
            self.section_order[0] not in [u'issue type', u'summary']:
        if C.DEFAULT_BREAKPOINTS:
            logging.error(u'breakpoint!')
            import epdb
            epdb.st()
        else:
            raise Exception(u'out of order section')

    self.process()
    self.create_body()
def __init__(self, maintainers=None, gh_client=None):
    '''Set up empty state, the web scraper, and load the bot metadata.

    maintainers: optional mapping of maintainers (empty dict if omitted)
    gh_client: client object stored as self.gqlc
    '''
    self.botmeta = {}
    self.modules = {}
    self.importmap = {}
    self.maintainers = maintainers or {}
    self.gqlc = gh_client

    # local git checkout location (with ~ expanded)
    self.checkoutdir = os.path.expanduser(
        '~/.ansibullbot/cache/ansible.modules.checkout')

    # scraper cache location (with ~ expanded) and the scraper using it
    self.scraper_cache = os.path.expanduser(
        '~/.ansibullbot/cache/ansible.modules.scraper')
    self.gws = GithubWebScraper(cachedir=self.scraper_cache)

    self.committers = {}  # committers by module
    self.commits = {}     # commits by module
    self.emailmap = {}    # map of email to github login

    # load the bot meta
    self.update(force=True)
def __init__(self, commits=True, blames=True, botmeta=None, maintainers=None, gh_client=None, cachedir='~/.ansibullbot/cache', gitrepo=None):
    '''
    Maintainers: defaultdict(dict) where keys are filepath and values are dict
    gh_client: GraphQL GitHub client
    '''
    # feature switches and collaborators handed in by the caller
    self.get_commits = commits
    self.get_blames = blames
    self.maintainers = maintainers or {}
    self.gqlc = gh_client

    # web scraper with its own cache directory below cachedir
    scraper_dir = os.path.join(cachedir, 'ansible.modules.scraper')
    self.scraper_cache = os.path.expanduser(scraper_dir)
    self.gws = GithubWebScraper(cachedir=self.scraper_cache)

    self.gitrepo = gitrepo
    self.modules = {}  # keys: paths of files belonging to the repository

    # sqlalchemy: sqlite database stored below cachedir
    db_path = os.path.expanduser(
        os.path.join(cachedir, 'ansible_module_indexer.db'))
    self.engine = create_engine('sqlite:///' + db_path)
    self.Session = sessionmaker(bind=self.engine)
    self.session = self.Session()
    Email.metadata.create_all(self.engine)
    Blame.metadata.create_all(self.engine)

    self.committers = {}    # committers by module
    self.commits = {}       # commits by module
    self.emails_cache = {}  # map of email to github login

    # a missing/empty botmeta is handed on as an empty dict
    self.update(botmeta or {})
def __init__(self, commits=True, blames=True, botmetafile=None, maintainers=None, gh_client=None, cachedir=u'~/.ansibullbot/cache', gitrepo=None):
    '''
    Maintainers: defaultdict(dict) where keys are filepath and values are dict
    gh_client: GraphQL GitHub client
    '''
    # feature switches and collaborators handed in by the caller
    self.get_commits = commits
    self.get_blames = blames
    self.botmetafile = botmetafile
    self.gqlc = gh_client
    self.maintainers = maintainers or {}

    # BOTMETA.yml file with minor updates (macro rendered, empty default
    # values fixed)
    self.botmeta = {}
    # keys: paths of files belonging to the repository
    self.modules = {}
    self.importmap = {}
    self.files = []

    # web scraper with its own cache directory below cachedir
    self.scraper_cache = os.path.expanduser(
        os.path.join(cachedir, u'ansible.modules.scraper'))
    self.gws = GithubWebScraper(cachedir=self.scraper_cache)

    # use the caller's repo wrapper, or build one for ansible/ansible
    self.gitrepo = gitrepo if gitrepo else GitRepoWrapper(
        cachedir=cachedir,
        repo=u'https://github.com/ansible/ansible')

    # sqlalchemy: sqlite database stored below cachedir
    db_path = os.path.expanduser(
        os.path.join(cachedir, u'ansible_module_indexer.db'))
    self.engine = create_engine(u'sqlite:///' + db_path)
    self.Session = sessionmaker(bind=self.engine)
    self.session = self.Session()
    Email.metadata.create_all(self.engine)
    Blame.metadata.create_all(self.engine)

    self.committers = {}    # committers by module
    self.commits = {}       # commits by module
    self.emails_cache = {}  # map of email to github login

    # load the bot meta
    self.update(force=True)
def __init__(self, issuewrapper, meta):
    '''Read the issue/PR template headings, then process and build the body.

    issuewrapper: wrapper object; its instance.body and github_type are
                  read here
    meta: dict of facts about the issue, stored as self.meta

    Raises Exception(u'out of order section') if the first heading found
    in the template is neither "issue type" nor "summary", or if no
    heading with a title was found. When C.DEFAULT_BREAKPOINTS is set a
    debugger is started instead of raising.
    '''
    self.issuewrapper = issuewrapper
    self.original = self.issuewrapper.instance.body
    self.meta = meta
    self.missing = []
    self.sections = {}
    self.section_map = {}
    self.section_order = []
    self.new_description = u''
    self.retemplate = True

    self.cachedir = os.path.expanduser(u'~/.ansibullbot/cache')
    self.gws = GithubWebScraper(cachedir=self.cachedir)

    # choose the template matching the kind of object being fixed
    if self.issuewrapper.github_type == u'pullrequest':
        rfile = PTEMPLATE
    else:
        rfile = ITEMPLATE
    raw = self.gws.get_raw_content(
        u'ansible', u'ansible', u'devel', rfile, usecache=True
    )

    # "##### TITLE" lines define the sections and their order
    for rline in raw.split(u'\n'):
        if not rline.startswith(u'#####'):
            continue
        pieces = rline.strip().split(None, 1)
        if len(pieces) != 2:
            # no title after the marker; previously an IndexError
            continue
        section = pieces[1].lower()
        self.section_order.append(section)
        self.sections[section] = u''

    # guard the [0] lookup so an empty template takes the same error path
    # as a misordered one rather than raising IndexError
    first_section = self.section_order[0] if self.section_order else None
    if first_section not in [u'issue type', u'summary']:
        if C.DEFAULT_BREAKPOINTS:
            logging.error(u'breakpoint!')
            import epdb
            epdb.st()
        else:
            raise Exception(u'out of order section')

    self.process()
    self.create_body()
class ModuleIndexer(object):
    """Index the modules found in a local git checkout of ansible/ansible.

    For every file below lib/ansible/modules the indexer keeps a dict
    (see EMPTY_MODULE) in self.modules, keyed by the file path relative to
    the checkout. Authors are grepped from the module documentation,
    maintainers come from the authors plus .github/BOTMETA.yml, and
    commit/blame data is collected per module.
    """

    # Template deep-copied for every module record by populate_modules().
    EMPTY_MODULE = {
        'authors': [],
        'name': None,
        'namespaced_module': None,
        'namespace_maintainers': [],
        'deprecated': False,
        'deprecated_filename': None,
        'dirpath': None,
        'filename': None,
        'filepath': None,
        'fulltopic': None,
        'maintainers': [],
        '_maintainers': [],
        'maintainers_keys': None,
        'metadata': {},
        'repo_filename': None,
        'repository': 'ansible',
        'subtopic': None,
        'topic': None,
        'imports': []
    }

    # Repository cloned by create_checkout().
    REPO = "http://github.com/ansible/ansible"

    def __init__(self, maintainers=None, gh_client=None,
                 cachedir='~/.ansibullbot/cache'):
        '''
        Maintainers: defaultdict(dict) where keys are filepath and values are dict
        gh_client: GraphQL GitHub client
        cachedir: directory holding the checkout, the scraper cache and
                  the sqlite database
        '''
        # BOTMETA.yml file with minor updates (macro rendered, empty
        # default values fixed)
        self.botmeta = {}
        # keys: paths of files belonging to the repository
        self.modules = {}
        self.maintainers = maintainers or {}
        self.checkoutdir = os.path.expanduser(
            os.path.join(cachedir, 'ansible.modules.checkout'))
        self.importmap = {}
        # The scraper cache now follows cachedir like the checkout and the
        # database do; with the default cachedir the path is unchanged.
        self.scraper_cache = os.path.expanduser(
            os.path.join(cachedir, 'ansible.modules.scraper'))
        self.gws = GithubWebScraper(cachedir=self.scraper_cache)
        self.gqlc = gh_client
        self.files = []

        # sqlalchemy
        unc = os.path.join(cachedir, 'ansible_module_indexer.db')
        unc = os.path.expanduser(unc)
        unc = 'sqlite:///' + unc
        self.engine = create_engine(unc)
        self.Session = sessionmaker(bind=self.engine)
        self.session = self.Session()
        Email.metadata.create_all(self.engine)
        Blame.metadata.create_all(self.engine)

        # committers by module
        self.committers = {}
        # commits by module
        self.commits = {}
        # map of email to github login
        self.emails_cache = {}

        # load the bot meta
        self.update(force=True)

    def update(self, force=False):
        '''Reload everything if there are new commits (or force is set)'''
        changed = self.manage_checkout()
        if changed or force:
            self.get_files()
            self.parse_metadata()

    def manage_checkout(self):
        '''Create or update the checkout; return True if it changed'''
        if not os.path.isdir(self.checkoutdir):
            self.create_checkout()
            return True
        return self.update_checkout()

    def get_files(self):
        '''Cache a list of filenames in the checkout'''
        cmd = 'cd {}; git ls-files'.format(self.checkoutdir)
        (rc, so, se) = run_command(cmd)
        self.files = [x.strip() for x in so.split('\n') if x.strip()]

    def parse_metadata(self):
        '''Parse .github/BOTMETA.yml, then (re)load all modules'''
        fp = '.github/BOTMETA.yml'
        rdata = self.get_file_content(fp)
        self.botmeta = BotMetadataParser.parse_yaml(rdata)

        # load the modules
        logging.info('loading modules')
        self.get_ansible_modules()

    def create_checkout(self):
        """checkout ansible"""
        print('# creating checkout for module indexer')

        # cleanup
        if os.path.isdir(self.checkoutdir):
            shutil.rmtree(self.checkoutdir)

        cmd = "git clone %s %s" % (self.REPO, self.checkoutdir)
        (rc, so, se) = run_command(cmd)
        # print() with one argument behaves the same on Python 2 and 3
        print(str(so) + str(se))

    def update_checkout(self):
        """rebase + pull + update the checkout; return True if it changed"""
        cmd = "cd %s ; git pull --rebase" % self.checkoutdir
        (rc, so, se) = run_command(cmd)
        print(str(so) + str(se))

        # If rebase failed, recreate the checkout
        if rc != 0:
            self.create_checkout()
            return True

        return 'current branch devel is up to date.' not in so.lower()

    def _find_match(self, pattern, exact=False):
        '''Look pattern up by module name, then by key (the filepath) and,
        unless exact is set, by any other property value.'''
        match = None

        for k, v in self.modules.items():
            if v['name'] == pattern:
                match = v
                break

        if not match:
            # search by key ... aka the filepath
            for k, v in self.modules.items():
                if k == pattern:
                    match = v
                    break

        if not match and not exact:
            # search by properties
            for k, v in self.modules.items():
                for subkey in v.keys():
                    if v[subkey] == pattern:
                        match = v
                        break
                if match:
                    break

        return match

    def find_match(self, pattern, exact=False):
        '''Exact module name matching'''
        if not pattern:
            return None

        # https://github.com/ansible/ansible/issues/19755
        if pattern == 'setup':
            pattern = 'system/setup.py'

        # https://github.com/ansible/ansible/issues/18527
        #   docker-container -> docker_container
        if '-' in pattern:
            pattern = pattern.replace('-', '_')

        if 'module_utils' in pattern:
            # https://github.com/ansible/ansible/issues/20368
            return None
        elif '/' in pattern and not self._find_match(pattern, exact=True):
            # https://github.com/ansible/ansible/issues/20520
            if not pattern.startswith('lib/'):
                ppy = pattern + '.py'
                for k in self.modules.keys():
                    if pattern in k:
                        if k.endswith(pattern) or k.endswith(ppy):
                            return self.modules[k]
        elif pattern.endswith('.py'):
            # https://github.com/ansible/ansible/issues/19889
            candidate = self._find_match(pattern, exact=False)
            if candidate and candidate['filename'] == pattern:
                return candidate

        match = self._find_match(pattern, exact=exact)
        if not match and not exact:
            # check for just the basename
            #   2617: ansible-s-extras/network/cloudflare_dns.py
            bname = os.path.basename(pattern)
            match = self._find_match(bname)

            if not match:
                # check for deprecated name
                #   _fireball -> fireball
                match = self._find_match('_' + bname)

        return match

    def is_valid(self, mname):
        '''Return True if mname matches a known module'''
        return bool(self.find_match(mname))

    def get_repository_for_module(self, mname):
        '''Return the 'repository' value of the matching module, or None'''
        match = self.find_match(mname)
        if match:
            return match['repository']
        return None

    def get_ansible_modules(self):
        """Make a list of known modules"""
        matches = []
        module_dir = os.path.join(self.checkoutdir, 'lib/ansible/modules')
        module_dir = os.path.expanduser(module_dir)
        for root, dirnames, filenames in os.walk(module_dir):
            for filename in filenames:
                if 'lib/ansible/modules' in root and \
                        not filename == '__init__.py' and \
                        filename.endswith(('.py', '.ps1')):
                    matches.append(os.path.join(root, filename))

        matches = sorted(set(matches))
        self.populate_modules(matches)

        # custom fixes
        newitems = []
        for k, v in self.modules.items():

            # include* is almost always an ansible/ansible issue
            # https://github.com/ansible/ansibullbot/issues/214
            if k.endswith(('/include.py', '/include_vars.py',
                           '/include_role.py')):
                self.modules[k]['repository'] = 'ansible'

            # ansible maintains these
            if 'include' in k:
                self.modules[k]['maintainers'] = ['ansible']

            # deprecated modules are annoying: also register them under
            # the name without the leading underscore
            if v['name'].startswith('_'):
                dkey = os.path.dirname(v['filepath'])
                dkey = os.path.join(dkey, v['filename'].replace('_', '', 1))
                if dkey not in self.modules:
                    nd = v.copy()
                    nd['name'] = nd['name'].replace('_', '', 1)
                    newitems.append((dkey, nd))

        # added after the loop so the dict is not resized while iterating
        for dkey, nd in newitems:
            self.modules[dkey] = nd

        # parse metadata
        logging.debug('set module metadata')
        self.set_module_metadata()

        # parse imports
        logging.debug('set module imports')
        self.set_module_imports()

        # last modified
        logging.debug('set module commits')
        self.get_module_commits()

        # parse blame
        logging.debug('set module blames')
        self.get_module_blames()

        # depends on metadata now ...
        logging.debug('set module maintainers')
        self.set_maintainers()

        return self.modules

    def populate_modules(self, matches):
        '''Create one record in self.modules per absolute path in matches'''
        # figure out the names
        prefix = self.checkoutdir + '/'
        for match in matches:
            mdict = copy.deepcopy(self.EMPTY_MODULE)

            mdict['filename'] = os.path.basename(match)
            mdict['dirpath'] = os.path.dirname(match).replace(prefix, '')

            filepath = match.replace(prefix, '')
            mdict['filepath'] = filepath

            mdict.update(self.split_topics_from_path(filepath))

            mdict['repo_filename'] = mdict['filepath']\
                .replace('lib/ansible/modules/%s/' % mdict['repository'], '')

            # clustering/consul
            mdict['namespaced_module'] = mdict['repo_filename']\
                .replace('.py', '').replace('.ps1', '')

            mname = os.path.basename(match)
            mname = mname.replace('.py', '').replace('.ps1', '')
            mdict['name'] = mname

            # deprecated modules
            if mname.startswith('_'):
                mdict['deprecated'] = True
                deprecated_filename = \
                    os.path.dirname(mdict['namespaced_module'])
                deprecated_filename = \
                    os.path.join(deprecated_filename, mname[1:] + '.py')
                mdict['deprecated_filename'] = deprecated_filename
            else:
                mdict['deprecated_filename'] = mdict['repo_filename']

            self.modules[filepath] = mdict

        # meta is a special module
        self.modules['meta'] = copy.deepcopy(self.EMPTY_MODULE)
        self.modules['meta']['name'] = 'meta'
        self.modules['meta']['repo_filename'] = 'meta'

    @staticmethod
    def _parse_git_log(output):
        '''Turn `git log` text into a list of commit dicts.

        Each dict has the keys name, email, login, hash and date. A commit
        is added to the result when its Date: line is reached.
        '''
        commits = []
        commit = None
        for line in output.split('\n'):
            if line.startswith('commit '):
                commit = {
                    'name': None,
                    'email': None,
                    'login': None,
                    'hash': line.split()[-1],
                    'date': None
                }
            elif commit is None:
                # header line before any "commit" line: nothing to attach
                # it to (previously a NameError)
                continue
            elif line.startswith('Author: '):
                # Author: Matt Clay <*****@*****.**>
                author = line.replace('Author: ', '')
                author = author.replace('<', '').replace('>', '')
                lparts = author.split()

                if lparts and '@' in lparts[-1]:
                    commit['email'] = lparts[-1]
                    commit['name'] = ' '.join(lparts[:-1])

                if commit['email'] and \
                        'noreply.github.com' in commit['email']:
                    commit['login'] = commit['email'].split('@')[0]
            elif line.startswith('Date:'):
                # Date:   Sat Jan 28 23:28:53 2017 -0800
                dstr = line.split(':', 1)[1].strip()
                # drop the trailing timezone offset
                dstr = ' '.join(dstr.split(' ')[:-1])
                commit['date'] = datetime.datetime.strptime(
                    dstr, '%a %b %d %H:%M:%S %Y')
                commits.append(commit)
        return commits

    def get_module_commits(self):
        '''Fill self.commits for every module, using a per-file pickle
        cache that is invalidated when the file's mtime changes.'''
        for k in sorted(self.modules.keys()):
            self.commits[k] = []
            cpath = os.path.join(self.checkoutdir, k)
            if not os.path.isfile(cpath):
                continue

            mtime = os.path.getmtime(cpath)
            refresh = False
            pfile = os.path.join(self.scraper_cache,
                                 k.replace('/', '_') + '.commits.pickle')

            if not os.path.isfile(pfile):
                refresh = True
            else:
                # NOTE(review): pickle.load executes arbitrary code from
                # the file; acceptable only while the cache directory is
                # trusted.
                with open(pfile, 'rb') as f:
                    pdata = pickle.load(f)
                if pdata[0] == mtime:
                    self.commits[k] = pdata[1]
                else:
                    refresh = True

            if refresh:
                logging.info('refresh commit cache for %s' % k)
                cmd = 'cd %s; git log --follow %s' % (self.checkoutdir, k)
                (rc, so, se) = run_command(cmd)
                self.commits[k] = self._parse_git_log(so)

                with open(pfile, 'wb') as f:
                    pickle.dump((mtime, self.commits[k]), f)

    def last_commit_for_file(self, filepath):
        '''Return the hash of the newest commit touching filepath'''
        # .get() + truthiness: a known file with an empty commit list
        # falls through to git instead of raising IndexError
        if self.commits.get(filepath):
            return self.commits[filepath][0]['hash']

        # git log --pretty=format:'%H' -1
        # lib/ansible/modules/cloud/amazon/ec2_metric_alarm.py
        cmd = 'cd %s; git log --pretty=format:\'%%H\' -1 %s' % \
            (self.checkoutdir, filepath)
        (rc, so, se) = run_command(cmd)
        return so.strip()

    def _load_emails_cache(self):
        '''Rebuild self.emails_cache (email -> login) from the database'''
        self.emails_cache = dict(
            (x.email, x.login) for x in self.session.query(Email))

    def _fill_commit_logins(self, keys):
        '''Set the login of commits that lack one, from self.emails_cache.

        Commits that already have a login, or that have no email, are left
        untouched. GitHub noreply addresses resolve to their local part,
        which is also stored in the cache.
        '''
        for k in keys:
            for commit in self.commits.get(k, []):
                if commit.get('login'):
                    continue
                email = commit.get('email')
                if not email:
                    continue
                login = self.emails_cache.get(email)
                if not login and '@users.noreply.github.com' in email:
                    login = email.split('@')[0]
                    self.emails_cache[email] = login
                if not login:
                    logging.debug('unknown: {}'.format(email))
                    continue
                commit['login'] = login

    def get_module_blames(self):
        '''Store blame data for modules whose latest commit is not in the
        database yet, then fill in the missing commit logins.'''
        logging.debug('build email cache')
        self._load_emails_cache()

        logging.debug('build blame cache')
        # sets give constant-time membership tests in the loop below
        blame_cache = set(
            x.file_commit for x in self.session.query(Blame).all())
        known_files = set(self.files)

        logging.debug('eval module hashes')
        changed = False
        keys = sorted(self.modules.keys())
        for k in keys:
            if k not in known_files:
                self.committers[k] = {}
                continue

            ghash = self.last_commit_for_file(k)
            if ghash in blame_cache:
                continue

            logging.debug('checking hash for {}'.format(k))
            res = self.session.query(Blame).filter_by(
                file_name=k, file_commit=ghash).all()
            hashes = [x.file_commit for x in res]

            if ghash not in hashes:
                logging.debug(
                    'hash {} not found for {}, updating blames'.format(
                        ghash, k))

                scraper_args = ['ansible', 'ansible', 'devel', k]
                uns, emailmap = self.gqlc.get_usernames_from_filename_blame(
                    *scraper_args)

                # check the emails
                for email, login in emailmap.items():
                    if email in self.emails_cache:
                        continue
                    exists = self.session.query(Email).filter_by(
                        email=email).first()
                    if not exists:
                        logging.debug('insert {}:{}'.format(login, email))
                        _email = Email(email=email, login=login)
                        self.session.add(_email)
                        changed = True

                # check the blames
                for login, commits in uns.items():
                    for commit in commits:
                        kwargs = {
                            'file_name': k,
                            'file_commit': ghash,
                            'author_commit': commit,
                            'author_login': login
                        }
                        exists = self.session.query(Blame).filter_by(
                            **kwargs).first()
                        if not exists:
                            logging.debug('insert {}:{}:{}'.format(
                                k, commit, login))
                            _blame = Blame(**kwargs)
                            self.session.add(_blame)
                            changed = True

        if changed:
            self.session.commit()
            logging.debug('re-build email cache')
            self._load_emails_cache()

        # fill in what we can ...
        logging.debug('fill in commit logins')
        self._fill_commit_logins(keys)

    def _get_module_blames(self):
        ''' Scrape the blame page for each module and store it '''

        keys = sorted(self.modules.keys())

        # scrape the data
        for k in keys:
            cpath = os.path.join(self.checkoutdir, k)
            if not os.path.isfile(cpath):
                self.committers[k] = {}
                continue

            ghash = self.last_commit_for_file(k)
            pfile = os.path.join(self.scraper_cache,
                                 k.replace('/', '_') + '.blame.pickle')
            sargs = ['ansible', 'ansible', 'devel', k]

            refresh = False
            if not os.path.isfile(pfile):
                refresh = True
            else:
                logging.debug('load {}'.format(pfile))
                # NOTE(review): pickle.load executes arbitrary code from
                # the file; acceptable only while the cache directory is
                # trusted.
                with open(pfile, 'rb') as f:
                    pdata = pickle.load(f)
                if pdata[0] == ghash:
                    self.committers[k] = pdata[1]
                    if len(pdata) == 3:
                        # use emailmap if available
                        emailmap = pdata[2]
                    else:
                        emailmap = {}
                else:
                    refresh = True

            if refresh:
                if self.gqlc:
                    logging.debug('graphql blame usernames {}'.format(pfile))
                    uns, emailmap = \
                        self.gqlc.get_usernames_from_filename_blame(*sargs)
                else:
                    emailmap = {}  # scrapping: emails not available
                    logging.debug('www blame usernames {}'.format(pfile))
                    uns = self.gws.get_usernames_from_filename_blame(*sargs)
                self.committers[k] = uns
                with open(pfile, 'wb') as f:
                    pickle.dump((ghash, uns, emailmap), f)

            for email, github_id in emailmap.items():
                if email not in self.emails_cache:
                    self.emails_cache[email] = github_id

        # add scraped logins to the map
        for k in keys:
            for x in self.commits[k]:
                if x['email'] in ['@']:
                    continue
                if x['email'] not in self.emails_cache:
                    self.emails_cache[x['email']] = None
                if x['login']:
                    self.emails_cache[x['email']] = x['login']
                    continue

                xhash = x['hash']
                for ck, cv in self.committers[k].items():
                    if xhash in cv:
                        self.emails_cache[x['email']] = ck
                        break

        # fill in what we can ...
        for k in keys:
            for idx, x in enumerate(self.commits[k]):
                if not x['login']:
                    if x['email'] in ['@']:
                        continue
                    if self.emails_cache[x['email']]:
                        login = self.emails_cache[x['email']]
                        xhash = x['hash']
                        self.commits[k][idx]['login'] = login
                        if login not in self.committers[k]:
                            self.committers[k][login] = []
                        if xhash not in self.committers[k][login]:
                            self.committers[k][login].append(xhash)

    def set_maintainers(self):
        '''Define the maintainers for each module'''

        # grep the authors:
        for k, v in self.modules.items():
            if v['filepath'] is None:
                continue
            mfile = os.path.join(self.checkoutdir, v['filepath'])
            authors = self.get_module_authors(mfile)
            self.modules[k]['authors'] = authors

            # authors are maintainers by -default-
            self.modules[k]['maintainers'] += authors
            self.modules[k]['maintainers'] = \
                sorted(set(self.modules[k]['maintainers']))

        metadata = self.botmeta['files'].keys()
        for k, v in self.modules.items():
            if k == 'meta':
                continue

            if k in self.botmeta['files']:
                # There are metadata in .github/BOTMETA.yml for this file
                # copy maintainers_keys
                self.modules[k]['maintainers_keys'] = \
                    self.botmeta['files'][k]['maintainers_keys'][:]

                if self.botmeta['files'][k]:
                    maintainers = self.botmeta['files'][k].get(
                        'maintainers', [])
                    for maintainer in maintainers:
                        if maintainer not in self.modules[k]['maintainers']:
                            self.modules[k]['maintainers'].append(maintainer)

                    # remove the people who want to be ignored
                    if 'ignored' in self.botmeta['files'][k]:
                        ignored = self.botmeta['files'][k]['ignored']
                        for x in ignored:
                            if x in self.modules[k]['maintainers']:
                                self.modules[k]['maintainers'].remove(x)

            else:
                # There isn't metadata in .github/BOTMETA.yml for this
                # file: fall back to the longest BOTMETA key that is a
                # prefix of the module's path
                best_match = None
                for mkey in metadata:
                    if v['filepath'].startswith(mkey):
                        if not best_match:
                            best_match = mkey
                            continue
                        if len(mkey) > len(best_match):
                            best_match = mkey
                if best_match:
                    self.modules[k]['maintainers_keys'] = [best_match]
                    for maintainer in self.botmeta['files'][best_match].get(
                            'maintainers', []):
                        if maintainer not in self.modules[k]['maintainers']:
                            self.modules[k]['maintainers'].append(maintainer)

                    # remove the people who want to be ignored
                    for ignored in self.botmeta['files'][best_match].get(
                            'ignored', []):
                        if ignored in self.modules[k]['maintainers']:
                            self.modules[k]['maintainers'].remove(ignored)

            # save a pristine copy so that higher level code can still use it
            self.modules[k]['maintainers'] = sorted(
                set(self.modules[k]['maintainers']))
            self.modules[k]['_maintainers'] = \
                list(self.modules[k]['maintainers'])

        # set the namespace maintainers ...
        for k, v in self.modules.items():
            if 'namespace_maintainers' not in self.modules[k]:
                self.modules[k]['namespace_maintainers'] = []
            if v.get('namespace'):
                ns = v.get('namespace')
                nms = self.get_maintainers_for_namespace(ns)
                self.modules[k]['namespace_maintainers'] = nms

    def split_topics_from_path(self, module_file):
        '''Derive topic, subtopic, fulltopic and namespace from a path
        below lib/ansible/modules/'''
        subpath = module_file.replace('lib/ansible/modules/', '')
        path_parts = subpath.split('/')
        topic = path_parts[0]

        if len(path_parts) > 2:
            subtopic = path_parts[1]
            fulltopic = '/'.join(path_parts[0:2])
        else:
            subtopic = None
            fulltopic = path_parts[0]

        tdata = {
            'fulltopic': fulltopic,
            'namespace': fulltopic,
            'topic': topic,
            'subtopic': subtopic
        }

        return tdata

    def get_module_authors(self, module_file):
        """Grep the authors out of the module docstrings"""

        if not os.path.exists(module_file):
            return []

        documentation = ''
        inphase = False

        with open(module_file, 'rb') as f:
            for line in f:
                if 'DOCUMENTATION' in line:
                    inphase = True
                    continue
                if line.strip().endswith("'''") or \
                        line.strip().endswith('"""'):
                    break
                if inphase:
                    documentation += line

        if not documentation:
            return []

        # clean out any other yaml besides author to save time
        inphase = False
        author_lines = ''
        doc_lines = documentation.split('\n')
        for x in doc_lines:
            if x.startswith('author'):
                inphase = True
            if inphase and not x.strip().startswith('-') and \
                    not x.strip().startswith('author'):
                inphase = False
                break
            if inphase:
                author_lines += x + '\n'

        if not author_lines:
            return []

        ydata = {}
        try:
            # NOTE(review): yaml.load can construct arbitrary objects from
            # the module text; yaml.safe_load would be the safer call if
            # only plain data is expected here.
            ydata = yaml.load(author_lines)
        except Exception as e:
            print(e)
            return []

        # quit early if the yaml was not valid (or is not a mapping)
        if not ydata or not isinstance(ydata, dict):
            return []

        # sometimes the field is 'author', sometimes it is 'authors'
        if 'authors' in ydata:
            ydata['author'] = ydata['authors']

        # quit if the key was not found
        if 'author' not in ydata:
            return []

        if not isinstance(ydata['author'], list):
            ydata['author'] = [ydata['author']]

        authors = []
        for author in ydata['author']:
            github_ids = self.extract_github_id(author)
            if github_ids:
                authors.extend(github_ids)
        return authors

    def extract_github_id(self, author):
        '''Return the list of github ids found in one author string'''
        authors = set()

        if 'ansible core team' in author.lower():
            authors.add('ansible')
        elif '@' in author:
            # match github ids but not emails
            authors.update(re.findall(r'(?<!\w)@([\w-]+)(?![\w.])', author))
        elif 'github.com/' in author:
            # {'author': 'Henrique Rodrigues (github.com/Sodki)'}
            idx = author.find('github.com/')
            author = author[idx + 11:]
            authors.add(author.replace(')', ''))
        elif '(' in author and len(author.split()) == 3:
            # Mathieu Bultel (matbu)
            idx = author.find('(')
            author = author[idx + 1:]
            authors.add(author.replace(')', ''))

        # search for emails
        for email in re.findall(r'[<(]([^@]+@[^)>]+)[)>]', author):
            github_id = self.emails_cache.get(email)
            if github_id:
                authors.add(github_id)

        return list(authors)

    def fuzzy_match(self, repo=None, title=None, component=None):
        '''Fuzzy matching for modules'''

        # both arguments default to None; normalise them so the substring
        # tests below cannot raise TypeError
        component = component or ''
        title = title or ''

        # https://github.com/ansible/ansible/issues/18179
        if 'validate-modules' in component:
            return None

        # https://github.com/ansible/ansible/issues/20368
        if 'module_utils' in component:
            return None

        # authorized_keys vs. authorized_key
        if component and component.endswith('s'):
            tm = self.find_match(component[:-1])
            if tm:
                return tm['name']

        match = None
        known_modules = [v['name'] for v in self.modules.values()]

        title = title.lower()
        title = title.replace(':', '')
        title_matches = [x for x in known_modules if x + ' module' in title]

        if not title_matches:
            title_matches = [
                x for x in known_modules if title.startswith(x + ' ')
            ]
            if not title_matches:
                title_matches = \
                    [x for x in known_modules if ' ' + x + ' ' in title]

        # don't do singular word matching in title for ansible/ansible
        cmatches = None
        if component:
            cmatches = [x for x in known_modules if x in component]
            cmatches = [x for x in cmatches if not '_' + x in component]

        # use title ... ?
        if title_matches:
            # cmatches is still None when no component was given
            cmatches = [x for x in (cmatches or []) if x in title_matches]

        if cmatches:
            match = cmatches[0]
            print("module - component matches: %s" % cmatches)

        if not match:
            if len(title_matches) == 1:
                match = title_matches[0]
            else:
                print("module - title matches: %s" % title_matches)

        return match

    def is_multi(self, rawtext):
        '''Is the string a list or a glob of modules?'''
        if rawtext:
            lines = rawtext.split('\n')

            # clean up lines
            lines = [x.strip() for x in lines if x.strip()]
            lines = [x for x in lines if len(x) > 2]

            if len(lines) > 1:
                return True

            if lines:
                if lines[0].strip().endswith('*'):
                    return True

        return False

    # https://github.com/ansible/ansible-modules-core/issues/3831
    def multi_match(self, rawtext):
        '''Return a list of matches for a given glob or list of names'''
        matches = []
        lines = rawtext.split('\n')
        lines = [x.strip() for x in lines if x.strip()]
        for line in lines:
            # is it an exact name, a path, a globbed name, a globbed path?
            if line.endswith('*'):
                thiskey = line.replace('*', '')
                keymatches = [k for k in self.modules.keys() if thiskey in k]
                for k in keymatches:
                    matches.append(self.modules[k].copy())
            else:
                match = self.find_match(line)
                if match:
                    matches.append(match)

        # unique the list (the records are dicts, so a set cannot be used)
        unique = []
        for x in matches:
            if x not in unique:
                unique.append(x)

        return unique

    def set_module_metadata(self):
        '''Merge the ANSIBLE_METADATA of each module file into its record'''
        for k, v in self.modules.items():
            if not v['filepath']:
                continue
            mfile = os.path.join(self.checkoutdir, v['filepath'])
            if not mfile.endswith('.py'):
                # metadata is only the .py files ...
                # splitext only touches the real extension, even if the
                # same text appears earlier in the path
                mfile = os.path.splitext(mfile)[0] + '.py'

            self.modules[k]['metadata'].update(self.get_module_metadata(mfile))

    def get_module_metadata(self, module_file):
        '''Return the ANSIBLE_METADATA dict of module_file, or {} when the
        file is missing or the metadata cannot be parsed'''
        meta = {}

        if not os.path.isfile(module_file):
            return meta

        rawmeta = ''
        inphase = False
        with open(module_file, 'rb') as f:
            for line in f:
                if line.startswith('ANSIBLE_METADATA'):
                    inphase = True
                if line.startswith('DOCUMENTATION'):
                    break
                if inphase:
                    rawmeta += line
        rawmeta = rawmeta.replace('ANSIBLE_METADATA =', '', 1)
        rawmeta = rawmeta.strip()
        try:
            parsed = ast.literal_eval(rawmeta)
        except (SyntaxError, ValueError):
            # literal_eval raises ValueError for valid syntax that is not
            # a plain literal
            return meta

        # callers .update() a dict with the result
        if isinstance(parsed, dict):
            meta = parsed
        return meta

    def set_module_imports(self):
        '''Store the list of imports of each module file in its record'''
        for k, v in self.modules.items():
            if not v['filepath']:
                continue
            mfile = os.path.join(self.checkoutdir, v['filepath'])
            self.modules[k]['imports'] = self.get_module_imports(mfile)

    def get_module_imports(self, module_file):
        '''Return the dotted names imported by module_file'''

        # import ansible.module_utils.nxos
        # from ansible.module_utils.netcfg import NetworkConfig, dumps
        # from ansible.module_utils.network import NetworkModule

        mimports = []

        if not os.path.isfile(module_file):
            return mimports

        with open(module_file, 'rb') as f:
            for line in f:
                line = line.strip()
                line = line.replace(',', '')
                if line.startswith('import') or \
                        ('import' in line and 'from' in line):
                    lparts = line.split()
                    if line.startswith('import '):
                        mimports.append(lparts[1])
                    elif line.startswith('from '):
                        mpath = lparts[1] + '.'
                        for spath in lparts[3:]:
                            mimports.append(mpath + spath)

        return mimports

    @property
    def all_maintainers(self):
        '''Set of every maintainer listed in the BOTMETA 'files' section'''
        maintainers = set()
        for path, metadata in self.botmeta['files'].items():
            maintainers.update(metadata.get('maintainers', []))
        return maintainers

    def get_maintainers_for_namespace(self, namespace):
        '''Return the maintainers of all modules in namespace, without
        duplicates and without blank entries'''
        maintainers = []
        for k, v in self.modules.items():
            if 'namespace' not in v or 'maintainers' not in v:
                continue
            if v['namespace'] == namespace:
                for m in v['maintainers']:
                    if m not in maintainers:
                        maintainers.append(m)
        maintainers = [x for x in maintainers if x.strip()]
        return maintainers

    @staticmethod
    def replace_ansible(maintainers, ansible_members, bots=None):
        '''Replace -ansible- with the -humans- in the org'''
        # None instead of a shared mutable [] default
        bots = bots or []
        newlist = []
        for m in maintainers:
            if m != 'ansible':
                newlist.append(m)
            else:
                newlist += ansible_members
        newlist = sorted(set(newlist))
        newlist = [x for x in newlist if x not in bots]
        return newlist

    def get_file_content(self, filepath):
        '''Return the raw content of a file in the checkout, or None if it
        does not exist'''
        fpath = os.path.join(self.checkoutdir, filepath)
        if not os.path.isfile(fpath):
            return None
        with open(fpath, 'rb') as f:
            data = f.read()
        return data
class DescriptionFixer(object):
    """Rebuild or patch an issue/PR description so it follows the template.

    The template (``PTEMPLATE`` for pull requests, ``ITEMPLATE`` otherwise) is
    fetched through ``GithubWebScraper``; its ``#####`` headings define the
    expected section order.  ``process()`` works out which sections are
    missing and ``create_body()`` renders the result into
    ``self.new_description``.
    """

    def __init__(self, issuewrapper, meta):
        """Load the template headings, then run ``process``/``create_body``.

        :param issuewrapper: wrapper object; this class reads its
            ``instance.body``, ``github_type``, ``template_data``, ``title``,
            ``labels`` and ``files`` attributes.
        :param meta: dict; ``is_module``, ``module_match`` and
            ``ansible_version`` are read from it.
        :raises Exception: when the template does not start with the
            ``issue type`` or ``summary`` section (and breakpoints are off).
        """
        self.issuewrapper = issuewrapper
        self.original = self.issuewrapper.instance.body
        self.meta = meta
        self.missing = []
        self.sections = {}
        self.section_map = {}
        self.section_order = []
        self.new_description = u''
        self.retemplate = True

        self.cachedir = u'~/.ansibullbot/cache'
        self.cachedir = os.path.expanduser(self.cachedir)
        self.gws = GithubWebScraper(cachedir=self.cachedir)

        if self.issuewrapper.github_type == u'pullrequest':
            rfile = PTEMPLATE
        else:
            rfile = ITEMPLATE
        raw = self.gws.get_raw_content(
            u'ansible', u'ansible', u'devel', rfile, usecache=True
        )
        for rline in raw.split(u'\n'):
            if not rline.startswith(u'#####'):
                continue
            parts = rline.strip().split(None, 1)
            if len(parts) < 2:
                # a bare '#####' line carries no section title
                continue
            section = parts[1].lower()
            self.section_order.append(section)
            self.sections[section] = u''

        # An empty template is treated like an out-of-order one instead of
        # failing with an IndexError.
        if not self.section_order or \
                self.section_order[0] not in [u'issue type', u'summary']:
            if C.DEFAULT_BREAKPOINTS:
                logging.error(u'breakpoint!')
                import epdb
                epdb.st()
            else:
                raise Exception(u'out of order section')

        self.process()
        self.create_body()

    def process(self):
        """Collect section contents and decide between patching and re-templating.

        Fills ``self.sections``/``self.section_order`` from the wrapper's
        ``template_data`` and records empty sections in ``self.missing``.
        When fewer than two sections are missing and the original body
        already contains template headings, ``self.retemplate`` is cleared
        and the method returns early; otherwise defaults are computed for
        summary, issue type, component name and ansible version.
        """
        for k, v in self.issuewrapper.template_data.items():
            if k in [u'component raw', u'component_raw']:
                continue

            # use consistent key
            if k == u'environment':
                k = u'os / environment'

            # use consistent key
            if k == u'ansible configuration':
                k = u'configuration'

            # cleanup double newlines
            if v:
                v = v.replace(u'\n\n', u'\n')

            # Only fence a version that actually has content: an empty value
            # must stay empty so it is reported as missing below, and a None
            # value cannot be concatenated.
            if k == u'ansible version' and v:
                self.sections[k] = u'```\n' + v + u'\n```'
            else:
                self.sections[k] = v

            if k not in self.section_order:
                self.section_order.append(k)

        # what is missing?
        missing = [
            x for x in self.section_order
            if not self.sections.get(x) and x != u'additional information'
        ]
        self.missing = missing

        # inject section(s) versus recreating the whole body
        if len(missing) < 2:
            self.section_map = {}
            # the body may be None
            dlines = (self.original or u'').split(u'\n')
            for section in self.section_order:
                heading = u'##### %s' % section.upper()
                for idx, x in enumerate(dlines):
                    if x.startswith(heading):
                        self.section_map[section] = idx
            if self.section_map:
                self.retemplate = False
                return None

        # set summary
        if not self.sections.get(u'summary'):
            if self.original and not self.issuewrapper.template_data.keys():
                self.sections[u'summary'] = self.original
            else:
                self.sections[u'summary'] = self.issuewrapper.title

        # set issue type
        if not self.sections.get(u'issue type'):
            labeled = False
            for k, v in ISSUE_TYPES.items():
                if k in self.issuewrapper.labels:
                    self.sections[u'issue type'] = v
                    labeled = True
            if not labeled:
                if self.issuewrapper.github_type == u'issue':
                    self.sections[u'issue type'] = u'bug report'
                else:
                    self.sections[u'issue type'] = u'feature pull request'

        # set component name
        if not self.sections.get(u'component name'):
            if not self.meta[u'is_module']:
                if self.issuewrapper.github_type == u'pullrequest':
                    self.sections[u'component name'] = \
                        u'\n'.join(self.issuewrapper.files)
                else:
                    self.sections[u'component name'] = u'core'
            else:
                self.sections[u'component name'] = \
                    self.meta[u'module_match'][u'name'] + u' module'

        # set ansible version
        if not self.sections.get(u'ansible version'):
            vlabels = sorted(set(
                x for x in self.issuewrapper.labels
                if x.startswith(u'affects_')
            ))
            if vlabels:
                self.sections[u'ansible version'] = vlabels[0].split(u'_')[1]
            elif self.meta[u'ansible_version']:
                self.sections[u'ansible version'] = \
                    self.meta[u'ansible_version']
            else:
                self.sections[u'ansible version'] = u'N/A'

    def create_body(self):
        """Render ``self.new_description``.

        With ``self.retemplate`` set, every section in ``section_order`` is
        written out under a ``##### TITLE`` heading.  Otherwise the original
        body is kept and a heading for each missing section is inserted in
        front of the section that follows it in the template (or appended
        when the missing section is the last one).

        :raises Exception: when the following section cannot be located in
            the original body (and breakpoints are off).
        """
        # cleanup remnant colons; sections may hold None, which has nothing
        # to clean up
        for k, v in list(self.sections.items()):
            if not v:
                continue
            if v.startswith(u':\n'):
                self.sections[k] = v[2:]
            elif v.startswith(u': \n'):
                self.sections[k] = v[3:]
            elif v.startswith(u':'):
                self.sections[k] = v[1:]

        if self.retemplate:
            # render to text
            for section in self.section_order:
                data = self.sections.get(section)
                if data is None:
                    data = u''
                self.new_description += u'##### ' + section.upper() + u'\n'
                if section == u'issue type':
                    self.new_description += data.title()
                    self.new_description += u'\n'
                else:
                    self.new_description += data + u'\n'
                self.new_description += u'\n'
        else:
            dlines = (self.original or u'').split(u'\n')
            for msection in self.missing:
                midx = self.section_order.index(msection)
                if midx + 1 >= len(self.section_order):
                    # the missing section is the last one in the template:
                    # there is no following heading, so append to the body
                    post_line = len(dlines)
                else:
                    post_section = self.section_order[midx + 1]
                    if post_section not in self.section_map:
                        if C.DEFAULT_BREAKPOINTS:
                            logging.error(u'breakpoint!')
                            import epdb
                            epdb.st()
                        else:
                            raise Exception(u'section not in map')
                    post_line = self.section_map[post_section]

                new_section = [u'##### %s' % msection.upper()]
                if msection == u'component name':
                    if not self.meta[u'is_module']:
                        if self.issuewrapper.github_type == u'pullrequest':
                            new_section += self.issuewrapper.files
                        else:
                            new_section.append(u'core')
                    else:
                        new_section.append(
                            self.meta[u'module_match'][u'name'] + u' module')
                new_section.append(u'')

                # splice the new lines in, keeping their order
                dlines[post_line:post_line] = new_section

            self.new_description = u'\n'.join(dlines)
class ModuleIndexer(object):
    """Index the module files of a git checkout and resolve their maintainers.

    ``self.modules`` maps a file path (relative to the checkout) to a dict
    shaped like ``EMPTY_MODULE``.  The index is built from the files under
    ``plugins/modules`` plus the parsed ``.github/BOTMETA.yml`` data.
    """

    # Template that is deep-copied for every indexed module.
    EMPTY_MODULE = {
        u'authors': [],
        u'name': None,
        u'namespaced_module': None,
        u'namespace_maintainers': [],
        u'deprecated': False,
        u'deprecated_filename': None,
        u'dirpath': None,
        u'filename': None,
        u'filepath': None,
        u'fulltopic': None,
        u'maintainers': [],
        u'_maintainers': [],
        u'maintainers_keys': None,
        u'metadata': {},
        u'repo_filename': None,
        u'repository': u'ansible',
        u'subtopic': None,
        u'topic': None,
        u'imports': []
    }

    def __init__(self, commits=True, blames=True, botmetafile=None,
                 maintainers=None, gh_client=None,
                 cachedir=u'~/.ansibullbot/cache', gitrepo=None):
        """Set up caches, the sqlite session and load the index.

        :param commits: when true, collect the git log of every module.
        :param blames: when true (and ``commits`` is true), collect blames.
        :param botmetafile: optional path of a BOTMETA file to read instead
            of ``.github/BOTMETA.yml`` from the checkout.
        :param maintainers: mapping where keys are file paths and values are
            dicts.
        :param gh_client: GraphQL GitHub client.
        :param cachedir: directory holding the scraper cache and the sqlite
            database.
        :param gitrepo: checkout wrapper; a ``GitRepoWrapper`` is created
            when omitted.
        """
        self.get_commits = commits
        self.get_blames = blames
        self.botmetafile = botmetafile
        # BOTMETA.yml file with minor updates (macro rendered, empty default
        # values fixed)
        self.botmeta = {}
        # keys: paths of files belonging to the repository
        self.modules = {}
        self.maintainers = maintainers or {}
        self.importmap = {}
        self.scraper_cache = os.path.join(cachedir, u'ansible.modules.scraper')
        self.scraper_cache = os.path.expanduser(self.scraper_cache)
        self.gws = GithubWebScraper(cachedir=self.scraper_cache)
        self.gqlc = gh_client
        self.files = []

        if gitrepo:
            self.gitrepo = gitrepo
        else:
            self.gitrepo = GitRepoWrapper(
                cachedir=cachedir,
                repo=u'https://github.com/ansible-collections/community.general'
            )

        # sqlalchemy
        unc = os.path.join(cachedir, u'ansible_module_indexer.db')
        unc = os.path.expanduser(unc)
        unc = u'sqlite:///' + unc

        self.engine = create_engine(unc)
        self.Session = sessionmaker(bind=self.engine)
        self.session = self.Session()

        Email.metadata.create_all(self.engine)
        Blame.metadata.create_all(self.engine)

        # committers by module
        self.committers = {}
        # commits by module
        self.commits = {}
        # map of email to github login
        self.emails_cache = {}

        # load the bot meta
        self.update(force=True)

    def update(self, force=False):
        """Reload everything if there are new commits (or ``force`` is set)."""
        changed = self.gitrepo.manage_checkout()
        if changed or force:
            self.get_files()
            self.parse_metadata()

    def get_files(self):
        """Cache the list of file names tracked in the checkout."""
        cmd = u'cd {}; git ls-files'.format(self.gitrepo.checkoutdir)
        (rc, so, se) = run_command(cmd)
        files = to_text(so).split(u'\n')
        self.files = [x.strip() for x in files if x.strip()]

    def parse_metadata(self):
        """Parse the BOTMETA data, then (re)build the module index."""
        if self.botmetafile is not None:
            with open(self.botmetafile, 'rb') as f:
                rdata = f.read()
        else:
            fp = u'.github/BOTMETA.yml'
            rdata = self.get_file_content(fp)
        self.botmeta = BotMetadataParser.parse_yaml(rdata)

        # load the modules
        logging.info(u'loading modules')
        self.get_ansible_modules()

    def _find_match(self, pattern, exact=False):
        """Return the list of module dicts matching ``pattern``.

        Lookups are tried in order: module name, index key (file path), any
        other property value, and finally the closest name by Levenshtein
        distance.  The last two are skipped when ``exact`` is true.
        """
        logging.debug(u'exact:{} matching on {}'.format(exact, pattern))

        matches = []

        if isinstance(pattern, six.text_type):
            pattern = to_text(to_bytes(pattern, 'ascii', 'ignore'), 'ascii')

        for k, v in six.iteritems(self.modules):
            if v[u'name'] == pattern:
                logging.debug(u'match {} on name: {}'.format(k, v[u'name']))
                matches = [v]
                break

        if not matches and pattern in self.modules:
            # search by key ... aka the filepath
            logging.debug(u'match {} on key: {}'.format(pattern, pattern))
            matches = [self.modules[pattern]]

        if not matches and not exact:
            # search by properties; a module is reported once even when
            # several of its properties equal the pattern
            for k, v in six.iteritems(self.modules):
                for subkey in v.keys():
                    if v[subkey] == pattern:
                        logging.debug(u'match {} on subkey: {}'.format(
                            k, subkey))
                        matches.append(v)
                        break

        if not matches and not exact:
            # Levenshtein distance should workaround most typos
            candidates = []
            for k, v in six.iteritems(self.modules):
                mname = v.get(u'name')
                if not mname:
                    continue
                if isinstance(mname, six.text_type):
                    mname = to_text(to_bytes(mname, 'ascii', 'ignore'), 'ascii')
                try:
                    distance = Levenshtein.distance(pattern, mname)
                except TypeError as e:
                    logging.error(e)
                    if C.DEFAULT_BREAKPOINTS:
                        logging.error(u'breakpoint!')
                        import epdb
                        epdb.st()
                    # no usable distance for this name
                    continue
                candidates.append((distance, k, mname))

            # Short patterns are too ambiguous for typo matching, and only
            # names fewer than 3 edits away are accepted.
            if candidates and len(pattern) > 3:
                distance, key, mname = min(
                    candidates, key=lambda c: (c[0], c[1]))
                if distance < 3:
                    logging.debug(
                        u'levenshtein ratio match: ({}) {} {}'.format(
                            key, mname, pattern))
                    matches = [self.modules[key]]

        return matches

    def find_match(self, pattern, exact=False):
        """Match ``pattern`` against the index, applying known special cases.

        :returns: ``None`` for empty or blacklisted patterns, a single
            module dict for the path special case, otherwise the
            (de-duplicated) list returned by ``_find_match``.
        """
        logging.debug(u'find_match for "{}"'.format(pattern))

        BLACKLIST = [
            u'module_utils',
            u'callback',
            u'network modules',
            u'networking modules',
            u'windows modules'
        ]

        if not pattern:
            return None

        if pattern.lower() == u'core':
            return None

        # https://github.com/ansible/ansible/issues/19755
        if pattern == u'setup':
            pattern = u'system/setup.py'

        if u'/facts.py' in pattern or u' facts.py' in pattern:
            pattern = u'system/setup.py'

        # https://github.com/ansible/ansible/issues/18527
        #   docker-container -> docker_container
        if u'-' in pattern:
            pattern = pattern.replace(u'-', u'_')

        if u'module_utils' in pattern:
            # https://github.com/ansible/ansible/issues/20368
            return None
        elif u'callback' in pattern:
            return None
        elif u'lookup' in pattern:
            return None
        elif u'contrib' in pattern and u'inventory' in pattern:
            return None
        elif pattern.lower() in BLACKLIST:
            return None
        elif u'/' in pattern and not self._find_match(pattern, exact=True):
            # https://github.com/ansible/ansible/issues/20520
            # FIXME what's this for?
            if not pattern.startswith(u'plugins/'):
                ppy = pattern + u'.py'
                for k in self.modules.keys():
                    if pattern in k and (k.endswith(pattern) or
                                         k.endswith(ppy)):
                        return self.modules[k]
        elif pattern.endswith(u'.py'):
            # https://github.com/ansible/ansible/issues/19889
            # only an unambiguous hit on the file name is returned directly
            candidates = self._find_match(pattern, exact=False)
            if len(candidates) == 1 and \
                    candidates[0][u'filename'] == pattern:
                return candidates[0]

        match = self._find_match(pattern, exact=exact)
        if not match and not exact:
            # check for just the basename
            #   2617: ansible-s-extras/network/cloudflare_dns.py
            bname = os.path.basename(pattern)
            match = self._find_match(bname)

            if not match:
                # check for deprecated name
                #   _fireball -> fireball
                match = self._find_match(u'_' + bname)

        # unique the results
        if isinstance(match, list) and len(match) > 1:
            _match = []
            for m in match:
                if m not in _match:
                    _match.append(m)
            match = _match

        return match

    def is_valid(self, mname):
        """Return True when ``mname`` exactly matches a known module."""
        return bool(self.find_match(mname, exact=True))

    def get_repository_for_module(self, mname):
        """Return the ``repository`` value of the module matching ``mname``.

        ``find_match`` may hand back either a module dict or a list of them;
        the first entry of a list is used.  Returns ``None`` without a match.
        """
        match = self.find_match(mname, exact=True)
        if isinstance(match, list):
            match = match[0] if match else None
        if match:
            return match[u'repository']
        return None

    def get_ansible_modules(self):
        """Make a list of known modules and fill in their details."""
        matches = []
        module_dir = os.path.join(self.gitrepo.checkoutdir, u'plugins/modules')
        module_dir = os.path.expanduser(module_dir)
        for root, _, filenames in os.walk(module_dir):
            for filename in filenames:
                if u'plugins/modules' in root and \
                        not filename == u'__init__.py':
                    matches.append(os.path.join(root, filename))

        matches = sorted(set(matches))

        self.populate_modules(matches)

        # custom fixes
        newitems = []
        for k, v in six.iteritems(self.modules):

            # include* is almost always an ansible/ansible issue
            # https://github.com/ansible/ansibullbot/issues/214
            if k.endswith((u'/include.py', u'/include_vars.py',
                           u'/include_role.py')):
                self.modules[k][u'repository'] = u'ansible'

            # ansible maintains these
            if u'include' in k:
                self.modules[k][u'maintainers'] = [u'ansible']

            # deprecated modules are annoying: also register them under the
            # name without the leading underscore
            if v[u'name'].startswith(u'_'):
                dkey = os.path.dirname(v[u'filepath'])
                dkey = os.path.join(dkey, v[u'filename'].replace(u'_', u'', 1))
                if dkey not in self.modules:
                    nd = v.copy()
                    nd[u'name'] = nd[u'name'].replace(u'_', u'', 1)
                    newitems.append((dkey, nd))

        # added after the loop so the dict is not resized while iterating
        for dkey, nd in newitems:
            self.modules[dkey] = nd

        # parse metadata
        logging.debug(u'set module metadata')
        self.set_module_metadata()

        # parse imports
        logging.debug(u'set module imports')
        self.set_module_imports()

        # last modified
        if self.get_commits:
            logging.debug(u'set module commits')
            self.get_module_commits()

        # parse blame
        if self.get_blames and self.get_commits:
            logging.debug(u'set module blames')
            self.get_module_blames()

        # depends on metadata now ...
        logging.debug(u'set module maintainers')
        self.set_maintainers()

        return self.modules

    def populate_modules(self, matches):
        """Create an index entry for every absolute module path in ``matches``."""
        prefix = self.gitrepo.checkoutdir + u'/'

        # figure out the names
        for match in matches:
            mdict = copy.deepcopy(self.EMPTY_MODULE)

            mdict[u'filename'] = os.path.basename(match)

            mdict[u'dirpath'] = os.path.dirname(match).replace(prefix, u'')

            filepath = match.replace(prefix, u'')
            mdict[u'filepath'] = filepath

            mdict.update(self.split_topics_from_path(filepath))

            mdict[u'repo_filename'] = mdict[u'filepath']\
                .replace(u'plugins/modules/%s/' % mdict[u'repository'], u'')

            # clustering/consul
            mdict[u'namespaced_module'] = mdict[u'repo_filename']
            mdict[u'namespaced_module'] = \
                mdict[u'namespaced_module'].replace(u'.py', u'')
            mdict[u'namespaced_module'] = \
                mdict[u'namespaced_module'].replace(u'.ps1', u'')

            mname = os.path.basename(match)
            mname = mname.replace(u'.py', u'')
            mname = mname.replace(u'.ps1', u'')
            mdict[u'name'] = mname

            # deprecated modules
            if mname.startswith(u'_'):
                mdict[u'deprecated'] = True
                deprecated_filename = \
                    os.path.dirname(mdict[u'namespaced_module'])
                deprecated_filename = \
                    os.path.join(deprecated_filename, mname[1:] + u'.py')
                mdict[u'deprecated_filename'] = deprecated_filename
            else:
                mdict[u'deprecated_filename'] = mdict[u'repo_filename']

            self.modules[filepath] = mdict

        # meta is a special module
        self.modules[u'meta'] = copy.deepcopy(self.EMPTY_MODULE)
        self.modules[u'meta'][u'name'] = u'meta'
        self.modules[u'meta'][u'repo_filename'] = u'meta'

    @staticmethod
    def _parse_git_log(log_text):
        """Parse the text output of ``git log`` into a list of commit dicts.

        Each dict has the keys ``name``, ``email``, ``login``, ``hash`` and
        ``date``.  A commit is emitted when its ``Date:`` line is reached;
        the timezone offset of the date is dropped.
        """
        commits = []
        commit = None
        for line in log_text.split(u'\n'):
            if line.startswith(u'commit '):
                commit = {
                    u'name': None,
                    u'email': None,
                    u'login': None,
                    u'hash': line.split()[-1],
                    u'date': None
                }
                continue

            if commit is None:
                # nothing to attach header lines to yet
                continue

            # Author: Matt Clay <*****@*****.**>
            if line.startswith(u'Author: '):
                line = line.replace(u'Author: ', u'')
                line = line.replace(u'<', u'')
                line = line.replace(u'>', u'')
                lparts = line.split()

                if lparts and u'@' in lparts[-1]:
                    commit[u'email'] = lparts[-1]
                    commit[u'name'] = u' '.join(lparts[:-1])

                if commit[u'email'] and \
                        u'noreply.github.com' in commit[u'email']:
                    commit[u'login'] = commit[u'email'].split(u'@')[0]

            # Date:   Sat Jan 28 23:28:53 2017 -0800
            if line.startswith(u'Date:'):
                dstr = line.split(u':', 1)[1].strip()
                dstr = u' '.join(dstr.split(u' ')[:-1])
                commit[u'date'] = datetime.datetime.strptime(
                    dstr, u'%a %b %d %H:%M:%S %Y')
                commits.append(commit)

        return commits

    def get_module_commits(self):
        """Fill ``self.commits`` with the git history of every indexed file.

        Results are cached per file in a pickle keyed on the file's mtime.
        """
        for k in sorted(self.modules.keys()):
            self.commits[k] = []
            cpath = os.path.join(self.gitrepo.checkoutdir, k)
            if not os.path.isfile(cpath):
                continue

            mtime = os.path.getmtime(cpath)
            refresh = False
            pfile = os.path.join(
                self.scraper_cache,
                k.replace(u'/', u'_') + u'.commits.pickle'
            )

            if not os.path.isfile(pfile):
                refresh = True
            else:
                pickle_kwargs = {'encoding': 'bytes'} if six.PY3 else {}
                logging.debug(u'load {}'.format(pfile))
                with open(pfile, 'rb') as f:
                    pdata = pickle_load(f, **pickle_kwargs)
                if pdata[0] == mtime:
                    self.commits[k] = pdata[1]
                else:
                    refresh = True

            if refresh:
                logging.info(u'refresh commit cache for %s' % k)
                cmd = u'cd %s; git log --follow %s' % (
                    self.gitrepo.checkoutdir, k)
                (rc, so, se) = run_command(cmd)
                self.commits[k] = self._parse_git_log(to_text(so))

                with open(pfile, 'wb') as f:
                    pickle_dump((mtime, self.commits[k]), f)

    def last_commit_for_file(self, filepath):
        """Return the hash of the most recent commit touching ``filepath``.

        The cached commit list is used when it has entries; otherwise git is
        queried directly.
        """
        cached = self.commits.get(filepath)
        if cached and u'hash' in cached[0]:
            return cached[0][u'hash']

        # git log --pretty=format:'%H' -1
        # lib/ansible/modules/cloud/amazon/ec2_metric_alarm.py
        cmd = u'cd %s; git log --pretty=format:\'%%H\' -1 %s' % \
            (self.gitrepo.checkoutdir, filepath)
        (rc, so, se) = run_command(cmd)
        return to_text(so).strip()

    def _rebuild_emails_cache(self):
        """Reload ``self.emails_cache`` (email -> login) from the database."""
        self.emails_cache = dict(
            (x.email, x.login) for x in self.session.query(Email))

    def _fill_commit_logins(self, keys):
        """Set the login of cached commits that have an email but no login.

        The login comes from ``self.emails_cache``; for GitHub noreply
        addresses the part before the ``@`` is used and remembered.
        """
        for k in keys:
            for commit in self.commits.get(k, []):
                if commit.get(u'login'):
                    # already known, keep it
                    continue
                email = commit.get(u'email')
                if not email:
                    continue
                login = self.emails_cache.get(email)
                if not login and u'@users.noreply.github.com' in email:
                    login = email.split(u'@')[0]
                    self.emails_cache[email] = login
                if not login:
                    logging.debug(u'unknown: {}'.format(email))
                    continue
                commit[u'login'] = login

    def get_module_blames(self):
        """Store blame data for changed files and fill in commit logins."""
        logging.debug(u'build email cache')
        self._rebuild_emails_cache()

        logging.debug(u'build blame cache')
        blame_cache = set(
            x.file_commit for x in self.session.query(Blame).all())

        logging.debug(u'eval module hashes')
        changed = False
        keys = sorted(self.modules.keys())
        known_files = set(self.files)
        for k in keys:
            if k not in known_files:
                self.committers[k] = {}
                continue

            ghash = self.last_commit_for_file(k)
            if ghash in blame_cache:
                continue

            logging.debug(u'checking hash for {}'.format(k))
            res = self.session.query(Blame).filter_by(
                file_name=k, file_commit=ghash).all()
            hashes = [x.file_commit for x in res]

            if ghash not in hashes:
                logging.debug(
                    u'hash {} not found for {}, updating blames'.format(
                        ghash, k))

                # NOTE(review): the blame is requested from ansible/ansible
                # (devel) while the default checkout is community.general;
                # confirm this is intended.
                scraper_args = [u'ansible', u'ansible', u'devel', k]
                uns, emailmap = self.gqlc.get_usernames_from_filename_blame(
                    *scraper_args)

                # check the emails
                for email, login in emailmap.items():
                    if email in self.emails_cache:
                        continue
                    exists = self.session.query(Email).filter_by(
                        email=email).first()
                    if not exists:
                        logging.debug(u'insert {}:{}'.format(login, email))
                        _email = Email(email=email, login=login)
                        self.session.add(_email)
                        changed = True

                # check the blames
                for login, commits in uns.items():
                    for commit in commits:
                        kwargs = {
                            u'file_name': k,
                            u'file_commit': ghash,
                            u'author_commit': commit,
                            u'author_login': login
                        }
                        exists = self.session.query(Blame).filter_by(
                            **kwargs).first()
                        if not exists:
                            logging.debug(u'insert {}:{}:{}'.format(
                                k, commit, login))
                            _blame = Blame(**kwargs)
                            self.session.add(_blame)
                            changed = True

        if changed:
            self.session.commit()
            logging.debug(u're-build email cache')
            self._rebuild_emails_cache()

        # fill in what we can ...
        logging.debug(u'fill in commit logins')
        self._fill_commit_logins(keys)

    def get_emails_by_login(self, login):
        """Return the email addresses stored for ``login``."""
        res = self.session.query(Email).filter_by(login=login)
        return [x.email for x in res]

    def _get_module_blames(self):
        """Scrape the blame page for each module and store it."""
        keys = sorted(self.modules.keys())

        # scrape the data
        for k in keys:

            cpath = os.path.join(self.gitrepo.checkoutdir, k)
            if not os.path.isfile(cpath):
                self.committers[k] = {}
                continue

            ghash = self.last_commit_for_file(k)
            pfile = os.path.join(
                self.scraper_cache,
                k.replace(u'/', u'_') + u'.blame.pickle'
            )
            sargs = [u'ansible', u'ansible', u'devel', k]

            refresh = False
            if not os.path.isfile(pfile):
                refresh = True
            else:
                logging.debug(u'load {}'.format(pfile))
                with open(pfile, 'rb') as f:
                    pdata = pickle_load(f)
                if C.DEFAULT_BREAKPOINTS:
                    logging.error(u'breakpoint!')
                    import epdb
                    epdb.st()
                if pdata[0] == ghash:
                    self.committers[k] = pdata[1]
                    if len(pdata) == 3:
                        # use emailmap if available
                        emailmap = pdata[2]
                    else:
                        emailmap = {}
                else:
                    refresh = True

            if refresh:
                if self.gqlc:
                    logging.debug(u'graphql blame usernames {}'.format(pfile))
                    uns, emailmap = \
                        self.gqlc.get_usernames_from_filename_blame(*sargs)
                else:
                    emailmap = {}  # scrapping: emails not available
                    logging.debug(u'www blame usernames {}'.format(pfile))
                    uns = self.gws.get_usernames_from_filename_blame(*sargs)
                self.committers[k] = uns
                with open(pfile, 'wb') as f:
                    pickle_dump((ghash, uns, emailmap), f)

            for email, github_id in emailmap.items():
                if email not in self.emails_cache:
                    self.emails_cache[email] = github_id

        # add scraped logins to the map
        for k in keys:
            for x in self.commits[k]:
                if x[u'email'] in [u'@']:
                    continue
                if x[u'email'] not in self.emails_cache:
                    self.emails_cache[x[u'email']] = None
                if x[u'login']:
                    self.emails_cache[x[u'email']] = x[u'login']
                    continue

                xhash = x[u'hash']
                for ck, cv in six.iteritems(self.committers[k]):
                    if xhash in cv:
                        self.emails_cache[x[u'email']] = ck
                        break

        # fill in what we can ...
        for k in keys:
            for idx, x in enumerate(self.commits[k]):
                if x[u'login']:
                    continue
                if x[u'email'] in [u'@']:
                    continue
                login = self.emails_cache[x[u'email']]
                if not login:
                    continue
                xhash = x[u'hash']
                self.commits[k][idx][u'login'] = login
                if login not in self.committers[k]:
                    self.committers[k][login] = []
                if xhash not in self.committers[k][login]:
                    self.committers[k][login].append(xhash)

    def set_maintainers(self):
        """Define the maintainers for each module.

        Authors found in the module documentation are maintainers by
        default; BOTMETA entries (exact path, else the longest matching path
        prefix) add maintainers and remove ignored people.
        """
        # grep the authors:
        for k, v in six.iteritems(self.modules):
            if v[u'filepath'] is None:
                continue
            mfile = os.path.join(self.gitrepo.checkoutdir, v[u'filepath'])
            authors = self.get_module_authors(mfile)
            self.modules[k][u'authors'] = authors

            # authors are maintainers by -default-
            self.modules[k][u'maintainers'] += authors
            self.modules[k][u'maintainers'] = \
                sorted(set(self.modules[k][u'maintainers']))

        botfiles = self.botmeta[u'files']
        metadata = botfiles.keys()
        for k, v in six.iteritems(self.modules):
            if k == u'meta':
                continue

            entry = None
            if k in botfiles:
                # There are metadata in .github/BOTMETA.yml for this file
                entry = botfiles[k] or {}
                # copy maintainers_keys
                self.modules[k][u'maintainers_keys'] = \
                    list(entry.get(u'maintainers_keys') or [])
            else:
                # There isn't metadata in .github/BOTMETA.yml for this file:
                # fall back on the longest path prefix that has some
                best_match = None
                for mkey in metadata:
                    if v[u'filepath'].startswith(mkey):
                        if not best_match or len(mkey) > len(best_match):
                            best_match = mkey
                if best_match:
                    self.modules[k][u'maintainers_keys'] = [best_match]
                    entry = botfiles[best_match]

            if entry:
                for maintainer in entry.get(u'maintainers', []):
                    if maintainer not in self.modules[k][u'maintainers']:
                        self.modules[k][u'maintainers'].append(maintainer)

                # remove the people who want to be ignored
                for ignored in entry.get(u'ignored', []):
                    if ignored in self.modules[k][u'maintainers']:
                        self.modules[k][u'maintainers'].remove(ignored)

            # save a pristine copy so that higher level code can still use it
            self.modules[k][u'maintainers'] = sorted(
                set(self.modules[k][u'maintainers']))
            self.modules[k][u'_maintainers'] = \
                list(self.modules[k][u'maintainers'])

        # set the namespace maintainers ... computed once per namespace
        by_namespace = {}
        for k, v in six.iteritems(self.modules):
            if u'namespace_maintainers' not in self.modules[k]:
                self.modules[k][u'namespace_maintainers'] = []
            ns = v.get(u'namespace')
            if ns:
                if ns not in by_namespace:
                    by_namespace[ns] = self.get_maintainers_for_namespace(ns)
                self.modules[k][u'namespace_maintainers'] = \
                    list(by_namespace[ns])

    def split_topics_from_path(self, module_file):
        """Derive topic/subtopic/namespace from a path under plugins/modules.

        :returns: dict with the keys ``fulltopic``, ``namespace``, ``topic``
            and ``subtopic``.
        """
        subpath = module_file.replace(u'plugins/modules/', u'')
        path_parts = subpath.split(u'/')
        topic = path_parts[0]

        if len(path_parts) > 2:
            subtopic = path_parts[1]
            fulltopic = u'/'.join(path_parts[0:2])
        else:
            subtopic = None
            fulltopic = path_parts[0]

        return {
            u'fulltopic': fulltopic,
            u'namespace': fulltopic,
            u'topic': topic,
            u'subtopic': subtopic
        }

    def get_module_authors(self, module_file):
        """Grep the authors out of the module docstrings."""
        if not os.path.exists(module_file):
            return []

        documentation = b''
        inphase = False

        with io.open(module_file, 'rb') as f:
            for line in f:
                if b'DOCUMENTATION' in line:
                    inphase = True
                    continue
                # Only a closing quote seen after DOCUMENTATION ends the
                # block; an earlier docstring must not stop the scan.
                if inphase and line.strip().endswith((b"'''", b'"""')):
                    break
                if inphase:
                    documentation += line

        if not documentation:
            return []

        # clean out any other yaml besides author to save time
        inphase = False
        author_lines = u''
        for x in to_text(documentation).split(u'\n'):
            if x.startswith(u'author'):
                inphase = True
            if inphase and not x.strip().startswith((u'-', u'author')):
                break
            if inphase:
                author_lines += x + u'\n'

        if not author_lines:
            return []

        try:
            ydata = yaml.load(author_lines, BotYAMLLoader)
        except Exception as e:
            logging.error(e)
            return []

        # quit early if the yaml was not valid
        if not ydata:
            return []

        # quit if the key was not found
        if u'author' not in ydata:
            return []

        if not isinstance(ydata[u'author'], list):
            ydata[u'author'] = [ydata[u'author']]

        authors = []
        for author in ydata[u'author']:
            github_ids = self.extract_github_id(author)
            if github_ids:
                authors.extend(github_ids)
        return authors

    def extract_github_id(self, author):
        """Return the GitHub logins found in one ``author`` entry."""
        authors = set()

        if author is None:
            return []

        if u'ansible core team' in author.lower():
            authors.add(u'ansible')
        elif u'@' in author:
            # match github ids but not emails
            authors.update(re.findall(r'(?<!\w)@([\w-]+)(?![\w.])', author))
        elif u'github.com/' in author:
            # {'author': 'Henrique Rodrigues (github.com/Sodki)'}
            idx = author.find(u'github.com/')
            author = author[idx + 11:]
            authors.add(author.replace(u')', u''))
        elif u'(' in author and len(author.split()) == 3:
            # Mathieu Bultel (matbu)
            idx = author.find(u'(')
            author = author[idx + 1:]
            authors.add(author.replace(u')', u''))

        # search for emails
        for email in re.findall(r'[<(]([^@]+@[^)>]+)[)>]', author):
            github_id = self.emails_cache.get(email)
            if github_id:
                authors.add(github_id)

        return list(authors)

    def fuzzy_match(self, repo=None, title=None, component=None):
        """Fuzzy matching for modules.

        :returns: a module name, a list of names (globs / "modules"
            components) or ``None``.
        """
        # tolerate missing values instead of failing on None
        component = component or u''
        title = title or u''

        logging.debug(u'fuzzy match {}'.format(
            to_text(to_bytes(component, 'ascii', 'ignore'), 'ascii')))

        if component.lower() == u'core':
            return None

        # https://github.com/ansible/ansible/issues/18179
        if u'validate-modules' in component:
            return None

        # https://github.com/ansible/ansible/issues/20368
        if u'module_utils' in component:
            return None

        if u'new module' in component:
            return None

        # authorized_keys vs. authorized_key
        if component and component.endswith(u's'):
            tm = self.find_match(component[:-1])
            if tm:
                if not isinstance(tm, list):
                    return tm[u'name']
                elif len(tm) == 1:
                    return tm[0][u'name']
                else:
                    if C.DEFAULT_BREAKPOINTS:
                        logging.error(u'breakpoint!')
                        import epdb
                        epdb.st()

        match = None
        known_modules = []

        for k, v in six.iteritems(self.modules):
            if v[u'name'] in [u'include']:
                continue
            known_modules.append(v[u'name'])

        title = title.lower()
        title = title.replace(u':', u'')
        title_matches = [x for x in known_modules if x + u' module' in title]

        if not title_matches:
            title_matches = [
                x for x in known_modules if title.startswith(x + u' ')
            ]
            if not title_matches:
                title_matches = \
                    [x for x in known_modules if u' ' + x + u' ' in title]

        if title_matches:
            title_matches = [x for x in title_matches if x != u'at']

        # don't do singular word matching in title for ansible/ansible
        cmatches = []
        if component:
            cmatches = [x for x in known_modules if x in component]
            cmatches = [x for x in cmatches if not u'_' + x in component]

        # globs
        if not cmatches and u'*' in component:
            fmatches = [
                x for x in known_modules if fnmatch.fnmatch(x, component)
            ]
            if fmatches:
                cmatches = fmatches[:]

        if title_matches:
            # use title ... ?
            cmatches = [
                x for x in cmatches
                if x in title_matches and x not in [u'at']
            ]

        if cmatches:
            if u'*' not in component and u'modules' not in component:
                match = cmatches[0]
            else:
                match = cmatches[:]
            logging.debug("module - component matches: %s" % cmatches)

        if not match:
            if len(title_matches) == 1:
                match = title_matches[0]
            else:
                logging.debug("module - title matches: %s" % title_matches)

        return match

    def is_multi(self, rawtext):
        """Is the string a list or a glob of modules?"""
        if rawtext:
            lines = rawtext.split(u'\n')

            # clean up lines
            lines = [x.strip() for x in lines if x.strip()]
            lines = [x for x in lines if len(x) > 2]

            if len(lines) > 1:
                return True

            if lines and lines[0].strip().endswith(u'*'):
                return True

        return False

    # https://github.com/ansible/ansible-modules-core/issues/3831
    def multi_match(self, rawtext):
        """Return a list of matches for a given glob or list of names."""
        matches = []
        lines = rawtext.split(u'\n')
        lines = [x.strip() for x in lines if x.strip()]
        for line in lines:
            # is it an exact name, a path, a globbed name, a globbed path?
            if line.endswith(u'*'):
                thiskey = line.replace(u'*', u'')
                for k in list(self.modules.keys()):
                    if thiskey in k:
                        matches.append(self.modules[k].copy())
            else:
                match = self.find_match(line)
                if match:
                    matches.append(match)

        # unique the list
        tmplist = []
        for x in matches:
            if x not in tmplist:
                tmplist.append(x)
        return tmplist

    def set_module_metadata(self):
        """Merge the ANSIBLE_METADATA of every module into its index entry."""
        for k, v in six.iteritems(self.modules):
            if not v[u'filepath']:
                continue
            mfile = os.path.join(self.gitrepo.checkoutdir, v[u'filepath'])
            if not mfile.endswith(u'.py'):
                # metadata is only the .py files ... swap the real extension
                # only, never an earlier '.ext' somewhere in the path
                mfile = os.path.splitext(mfile)[0] + u'.py'

            self.modules[k][u'metadata'].update(
                self.get_module_metadata(mfile))

    def get_module_metadata(self, module_file):
        """Return the ANSIBLE_METADATA dict declared in ``module_file``.

        An empty dict is returned when the file is missing, declares no
        metadata, or the declaration is not a plain dict literal.
        """
        meta = {}

        if not os.path.isfile(module_file):
            return meta

        rawmeta = u''
        inphase = False
        with io.open(module_file, 'r', encoding='utf-8') as f:
            for line in f:
                if line.startswith(u'ANSIBLE_METADATA'):
                    inphase = True
                if line.startswith(u'DOCUMENTATION'):
                    break
                if inphase:
                    rawmeta += line
        rawmeta = rawmeta.replace(u'ANSIBLE_METADATA =', u'', 1)
        rawmeta = rawmeta.strip()

        try:
            parsed = ast.literal_eval(rawmeta)
        except (SyntaxError, ValueError):
            # literal_eval raises ValueError for anything that is not a
            # plain literal (names, calls, ...)
            return meta

        if not isinstance(parsed, dict):
            return meta

        # normalize byte strings (keys, values and list items) to text
        for k, v in parsed.items():
            if isinstance(k, bytes):
                k = to_text(k)
            if isinstance(v, bytes):
                v = to_text(v)
            if isinstance(v, list):
                v = [to_text(i) if isinstance(i, bytes) else i for i in v]
            meta[k] = v

        return meta

    def set_module_imports(self):
        """Record the imports of every module in its index entry."""
        for k, v in six.iteritems(self.modules):
            if not v[u'filepath']:
                continue
            mfile = os.path.join(self.gitrepo.checkoutdir, v[u'filepath'])
            self.modules[k][u'imports'] = self.get_module_imports(mfile)

    def get_module_imports(self, module_file):
        """Return the dotted names imported by ``module_file`` as text.

        ``import a`` yields ``a``; ``from a import b, c`` yields ``a.b`` and
        ``a.c``.  A missing file yields an empty list.
        """
        mimports = []

        if not os.path.isfile(module_file):
            return mimports

        with open(module_file, 'rb') as f:
            for line in f:
                line = line.strip()
                line = line.replace(b',', b'')
                if line.startswith(b'import') or \
                        (b'import' in line and b'from' in line):
                    lparts = line.split()
                    if line.startswith(b'import '):
                        mimports.append(lparts[1])
                    elif line.startswith(b'from '):
                        mpath = lparts[1] + b'.'
                        for spath in lparts[3:]:
                            mimports.append(mpath + spath)

        return [to_text(m) for m in mimports]

    @property
    def all_maintainers(self):
        """Set of every maintainer listed in the BOTMETA file entries."""
        maintainers = set()
        for path, metadata in self.botmeta[u'files'].items():
            maintainers.update(metadata.get(u'maintainers', []))
        return maintainers

    @property
    def all_authors(self):
        """Set of every author recorded in the module index."""
        authors = set()
        for key, metadata in self.modules.items():
            authors.update(metadata.get(u'authors', []))
        return authors

    def get_maintainers_for_namespace(self, namespace):
        """Return the maintainers of all modules in ``namespace``.

        Order of first appearance is kept; blank names are dropped.
        """
        maintainers = []
        for k, v in self.modules.items():
            if u'namespace' not in v or u'maintainers' not in v:
                continue
            if v[u'namespace'] == namespace:
                for m in v[u'maintainers']:
                    if m not in maintainers:
                        maintainers.append(m)
        return [x for x in maintainers if x.strip()]

    @staticmethod
    def replace_ansible(maintainers, ansible_members, bots=None):
        """Replace -ansible- with the -humans- in the org.

        :returns: sorted, de-duplicated list without any name in ``bots``.
        """
        bots = bots or []
        newlist = []
        for m in maintainers:
            if m != u'ansible':
                newlist.append(m)
            else:
                newlist += ansible_members
        return [x for x in sorted(set(newlist)) if x not in bots]

    def get_file_content(self, filepath):
        """Return the text of ``filepath`` in the checkout, or None if absent."""
        fpath = os.path.join(self.gitrepo.checkoutdir, filepath)
        if not os.path.isfile(fpath):
            return None
        with io.open(fpath, 'r', encoding='utf-8') as f:
            data = f.read()
        return data
class DescriptionFixer(object):
    '''Rewrite an issue/PR description so that it follows the template.

    The section headers (lines starting with ``#####``) are read from the
    issue or pull request template fetched through ``GithubWebScraper``.
    The sections already present in ``issuewrapper.template_data`` are
    merged in, and then either

    * a whole new body is rendered from all sections (``retemplate`` is
      True), or
    * only the missing section is injected into the original body
      (``retemplate`` is False).

    The result is stored in ``self.new_description``.
    '''

    def __init__(self, issuewrapper, meta):
        '''Load the template sections, then build ``new_description``.

        issuewrapper: wrapper exposing ``instance.body``, ``github_type``,
            ``template_data``, ``title``, ``labels`` and ``files``.
        meta: dict read for ``is_module``, ``module_match`` and
            ``ansible_version``.

        Raises ``Exception`` when the template does not start with the
        ``issue type`` or ``summary`` section (unless breakpoints are
        enabled in the config).
        '''
        self.issuewrapper = issuewrapper
        self.original = self.issuewrapper.instance.body
        self.meta = meta
        self.missing = []
        self.sections = {}
        self.section_map = {}
        self.section_order = []
        self.new_description = u''
        self.retemplate = True

        self.cachedir = u'~/.ansibullbot/cache'
        self.cachedir = os.path.expanduser(self.cachedir)
        self.gws = GithubWebScraper(cachedir=self.cachedir)

        if self.issuewrapper.github_type == u'pullrequest':
            rfile = PTEMPLATE
        else:
            rfile = ITEMPLATE
        raw = self.gws.get_raw_content(
            u'ansible', u'ansible', u'devel', rfile, usecache=True
        )
        for rline in raw.split(u'\n'):
            if not rline.startswith(u'#####'):
                continue
            parts = rline.strip().split(None, 1)
            if len(parts) < 2:
                # a bare '#####' marker carries no section title
                continue
            section = parts[1].lower()
            self.section_order.append(section)
            self.sections[section] = u''

        # an empty template is reported the same way as a misordered one
        # instead of failing with an IndexError
        if not self.section_order or \
                self.section_order[0] not in [u'issue type', u'summary']:
            if C.DEFAULT_BREAKPOINTS:
                logging.error(u'breakpoint!')
                import epdb; epdb.st()
            else:
                raise Exception(u'out of order section')

        self.process()
        self.create_body()

    def process(self):
        '''Fill ``self.sections`` and decide between inject and retemplate.

        Sets ``self.missing`` to the empty sections (ignoring
        ``additional information``).  When fewer than two sections are
        missing and at least one header is found in the original body,
        ``self.retemplate`` is set to False and the method returns early;
        otherwise defaults are computed for summary, issue type, component
        name and ansible version.
        '''
        for k, v in self.issuewrapper.template_data.items():
            if k in [u'component raw', u'component_raw']:
                continue

            # use consistent keys
            if k == u'environment':
                k = u'os / environment'
            if k == u'ansible configuration':
                k = u'configuration'

            # cleanup double newlines
            if v:
                v = v.replace(u'\n\n', u'\n')

            if k == u'ansible version':
                # a None value is rendered as an empty code fence rather
                # than raising a TypeError on concatenation
                self.sections[k] = u'```\n' + (v or u'') + u'\n```'
            else:
                self.sections[k] = v

            if k not in self.section_order:
                self.section_order.append(k)

        # what is missing?
        missing = [
            x for x in self.section_order
            if not self.sections.get(x) and x != u'additional information'
        ]
        self.missing = missing

        # inject section(s) versus recreating the whole body
        if len(missing) < 2:
            self.section_map = {}
            # the body may be None (see the summary handling below)
            dlines = (self.original or u'').split(u'\n')
            for section in self.section_order:
                header = u'##### %s' % section.upper()
                for idx, x in enumerate(dlines):
                    if x.startswith(header):
                        # no break: the last matching line wins
                        self.section_map[section] = idx
            if self.section_map:
                self.retemplate = False
                return None

        # set summary
        if not self.sections.get(u'summary'):
            if self.original and not self.issuewrapper.template_data.keys():
                # untemplated body: use it verbatim as the summary
                self.sections[u'summary'] = self.original
            else:
                self.sections[u'summary'] = self.issuewrapper.title

        # set issue type
        if not self.sections.get(u'issue type'):
            labeled = False
            for k, v in ISSUE_TYPES.items():
                if k in self.issuewrapper.labels:
                    self.sections[u'issue type'] = v
                    labeled = True
            if not labeled:
                if self.issuewrapper.github_type == u'issue':
                    self.sections[u'issue type'] = u'bug report'
                else:
                    self.sections[u'issue type'] = u'feature pull request'

        # set component name
        if not self.sections.get(u'component name'):
            if not self.meta[u'is_module']:
                if self.issuewrapper.github_type == u'pullrequest':
                    self.sections[u'component name'] = \
                        u'\n'.join(self.issuewrapper.files)
                else:
                    self.sections[u'component name'] = u'core'
            else:
                self.sections[u'component name'] = \
                    self.meta[u'module_match'][u'name'] + u' module'

        # set ansible version
        if not self.sections.get(u'ansible version'):
            vlabels = sorted(set(
                x for x in self.issuewrapper.labels
                if x.startswith(u'affects_')
            ))
            if vlabels:
                self.sections[u'ansible version'] = \
                    vlabels[0].split(u'_')[1]
            elif self.meta[u'ansible_version']:
                self.sections[u'ansible version'] = \
                    self.meta[u'ansible_version']
            else:
                self.sections[u'ansible version'] = u'N/A'

    def create_body(self):
        '''Render ``self.new_description`` from the collected sections.'''
        # cleanup remnant colons; iterate over a snapshot because values are
        # rewritten in place, and skip None/empty values (template data may
        # contain None, which has no startswith())
        for k, v in list(self.sections.items()):
            if not v:
                continue
            if v.startswith(u':\n'):
                self.sections[k] = v[2:]
            elif v.startswith(u': \n'):
                self.sections[k] = v[3:]
            elif v.startswith(u':'):
                self.sections[k] = v[1:]

        if self.retemplate:
            # render every section to text
            rendered = []
            for section in self.section_order:
                data = self.sections.get(section)
                if data is None:
                    data = u''
                if section == u'issue type':
                    data = data.title()
                rendered.append(
                    u'##### ' + section.upper() + u'\n' + data + u'\n\n'
                )
            self.new_description += u''.join(rendered)
            return

        # inject the missing section(s) in front of the section following
        # them in the template order
        dlines = (self.original or u'').split(u'\n')
        for msection in self.missing:
            midx = self.section_order.index(msection)
            if midx + 1 < len(self.section_order):
                post_section = self.section_order[midx + 1]
                if post_section not in self.section_map:
                    if C.DEFAULT_BREAKPOINTS:
                        logging.error(u'breakpoint!')
                        import epdb; epdb.st()
                    else:
                        raise Exception(u'section not in map')
                post_line = self.section_map[post_section]
            else:
                # the missing section is the last one: there is no
                # following header, so append it to the end of the body
                post_line = len(dlines)

            new_section = [u'##### %s' % msection.upper()]
            if msection == u'component name':
                if not self.meta[u'is_module']:
                    if self.issuewrapper.github_type == u'pullrequest':
                        new_section += self.issuewrapper.files
                    else:
                        new_section.append(u'core')
                else:
                    new_section.append(
                        self.meta[u'module_match'][u'name'] + u' module'
                    )
            new_section.append(u'')

            dlines[post_line:post_line] = new_section

        self.new_description = u'\n'.join(dlines)
class ModuleIndexer(object):
    '''Index the modules found in a local checkout of ansible/ansible.

    ``self.modules`` maps a module's path inside the checkout to a dict
    shaped like ``EMPTY_MODULE``.  Commit and blame information is cached
    as pickle files below ``self.scraper_cache``.

    NOTE: the cache files are read with ``pickle.load``; they must only
    ever be files written by this class into the bot's own cache directory.
    '''

    EMPTY_MODULE = {
        'authors': [],
        'name': None,
        'namespaced_module': None,
        'namespace_maintainers': [],
        'deprecated': False,
        'deprecated_filename': None,
        'dirpath': None,
        'filename': None,
        'filepath': None,
        'fulltopic': None,
        'maintainers': [],
        '_maintainers': [],
        'maintainers_key': None,
        'metadata': {},
        'repo_filename': None,
        'repository': 'ansible',
        'subtopic': None,
        'topic': None,
        'imports': []
    }

    def __init__(self, maintainers=None):
        '''maintainers: dict mapping a path fragment to its maintainers.'''
        self.modules = {}
        self.maintainers = maintainers or {}
        self.checkoutdir = '~/.ansibullbot/cache/ansible.modules.checkout'
        self.checkoutdir = os.path.expanduser(self.checkoutdir)
        self.importmap = {}
        self.scraper_cache = '~/.ansibullbot/cache/ansible.modules.scraper'
        self.scraper_cache = os.path.expanduser(self.scraper_cache)
        self.gws = GithubWebScraper(cachedir=self.scraper_cache)

        # committers by module
        self.committers = {}
        # commits by module
        self.commits = {}
        # map of email to github login
        self.emailmap = {}

    @staticmethod
    def _as_text(value):
        '''Decode Python 3 bytes to text; return anything else unchanged.

        On Python 2 ``bytes`` is ``str``, so values pass through untouched.
        '''
        if isinstance(value, bytes) and not isinstance(value, str):
            return value.decode('utf-8', 'replace')
        return value

    def create_checkout(self):
        """checkout ansible"""
        print('# creating checkout for module indexer')

        # cleanup
        if os.path.isdir(self.checkoutdir):
            shutil.rmtree(self.checkoutdir)

        cmd = "git clone http://github.com/ansible/ansible --recursive %s" \
            % self.checkoutdir
        (rc, so, se) = run_command(cmd)
        print(str(self._as_text(so)) + str(self._as_text(se)))

    def update_checkout(self):
        """rebase + pull + update the checkout"""
        print('# updating checkout for module indexer')

        cmd = "cd %s ; git pull --rebase" % self.checkoutdir
        (rc, so, se) = run_command(cmd)
        print(str(self._as_text(so)) + str(self._as_text(se)))

        # If rebase failed, recreate the checkout
        if rc != 0:
            self.create_checkout()
            return

        cmd = "cd %s ; git submodule update --recursive" % self.checkoutdir
        (rc, so, se) = run_command(cmd)
        print(str(self._as_text(so)) + str(self._as_text(se)))

        # if update fails, recreate the checkout
        if rc != 0:
            self.create_checkout()

    def _find_match(self, pattern, exact=False):
        '''Return the module dict matching pattern, or None.

        Tries the module name first, then the key (the filepath) and,
        unless ``exact`` is set, any property value of the module.
        '''
        match = None

        for v in self.modules.values():
            if v['name'] == pattern:
                match = v
                break

        if not match:
            # search by key ... aka the filepath
            match = self.modules.get(pattern)

        if not match and not exact:
            # search by properties
            for v in self.modules.values():
                if any(value == pattern for value in v.values()):
                    match = v
                    break

        return match

    def find_match(self, pattern, exact=False):
        '''Exact module name matching'''
        if not pattern:
            return None

        # https://github.com/ansible/ansible/issues/19755
        if pattern == 'setup':
            pattern = 'system/setup.py'

        # https://github.com/ansible/ansible/issues/18527
        #   docker-container -> docker_container
        if '-' in pattern:
            pattern = pattern.replace('-', '_')

        if 'module_utils' in pattern:
            # https://github.com/ansible/ansible/issues/20368
            return None
        elif '/' in pattern and not self._find_match(pattern, exact=True):
            # https://github.com/ansible/ansible/issues/20520
            if not pattern.startswith('lib/'):
                ppy = pattern + '.py'
                for k in self.modules.keys():
                    if pattern in k and \
                            (k.endswith(pattern) or k.endswith(ppy)):
                        return self.modules[k]
        elif pattern.endswith('.py') and \
                self._find_match(pattern, exact=False):
            # https://github.com/ansible/ansible/issues/19889
            candidate = self._find_match(pattern, exact=False)
            if candidate['filename'] == pattern:
                return candidate

        match = self._find_match(pattern, exact=exact)
        if not match and not exact:
            # check for just the basename
            #   2617: ansible-s-extras/network/cloudflare_dns.py
            bname = os.path.basename(pattern)
            match = self._find_match(bname)

            if not match:
                # check for deprecated name
                #   _fireball -> fireball
                match = self._find_match('_' + bname)

        return match

    def is_valid(self, mname):
        '''Return True when mname resolves to a known module.'''
        return bool(self.find_match(mname))

    def get_repository_for_module(self, mname):
        '''Return the repository of the matching module, or None.'''
        match = self.find_match(mname)
        if match:
            return match['repository']
        return None

    def get_ansible_modules(self):
        """Make a list of known modules"""

        # manage the checkout
        if not os.path.isdir(self.checkoutdir):
            self.create_checkout()
        else:
            self.update_checkout()

        matches = []
        module_dir = os.path.join(self.checkoutdir, 'lib/ansible/modules')
        module_dir = os.path.expanduser(module_dir)
        for root, dirnames, filenames in os.walk(module_dir):
            if 'lib/ansible/modules' not in root:
                continue
            for filename in filenames:
                if filename != '__init__.py' and \
                        filename.endswith(('.py', '.ps1')):
                    matches.append(os.path.join(root, filename))

        matches = sorted(set(matches))

        # figure out the names
        prefix = self.checkoutdir + '/'
        for match in matches:
            mdict = copy.deepcopy(self.EMPTY_MODULE)

            mdict['filename'] = os.path.basename(match)
            mdict['dirpath'] = os.path.dirname(match).replace(prefix, '')

            filepath = match.replace(prefix, '')
            mdict['filepath'] = filepath

            mdict.update(self.split_topics_from_path(filepath))

            mdict['repo_filename'] = mdict['filepath'].replace(
                'lib/ansible/modules/%s/' % mdict['repository'], ''
            )

            # clustering/consul
            mdict['namespaced_module'] = mdict['repo_filename']\
                .replace('.py', '').replace('.ps1', '')

            mname = os.path.basename(match)
            mname = mname.replace('.py', '').replace('.ps1', '')
            mdict['name'] = mname

            # deprecated modules
            if mname.startswith('_'):
                mdict['deprecated'] = True
                mdict['deprecated_filename'] = os.path.join(
                    os.path.dirname(mdict['namespaced_module']),
                    mname[1:] + '.py'
                )
            else:
                mdict['deprecated_filename'] = mdict['repo_filename']

            self.modules[filepath] = mdict

        # grep the authors:
        for k, v in self.modules.items():
            mfile = os.path.join(self.checkoutdir, v['filepath'])
            self.modules[k]['authors'] = self.get_module_authors(mfile)

        # meta is a special module
        self.modules['meta'] = copy.deepcopy(self.EMPTY_MODULE)
        self.modules['meta']['name'] = 'meta'
        self.modules['meta']['repo_filename'] = 'meta'

        # custom fixes; new entries are collected first so that the dict
        # does not change size while it is being iterated
        newitems = []
        for k, v in self.modules.items():

            # include* is almost always an ansible/ansible issue
            # https://github.com/ansible/ansibullbot/issues/214
            if k.endswith(('/include.py', '/include_vars.py',
                           '/include_role.py')):
                self.modules[k]['repository'] = 'ansible'

            # ansible maintains these
            if 'include' in k:
                self.modules[k]['maintainers'] = ['ansible']

            # deprecated modules are annoying: also register them under
            # their name without the leading underscore
            if v['name'].startswith('_'):
                dkey = os.path.join(
                    os.path.dirname(v['filepath']),
                    v['filename'].replace('_', '', 1)
                )
                if dkey not in self.modules:
                    nd = v.copy()
                    nd['name'] = nd['name'].replace('_', '', 1)
                    newitems.append((dkey, nd))

        for dkey, nd in newitems:
            self.modules[dkey] = nd

        # parse metadata
        self.set_module_metadata()

        # parse imports
        self.set_module_imports()

        # last modified
        self.get_module_commits()

        # parse blame
        self.get_module_blames()

        # depends on metadata now ...
        self.set_maintainers()

        return self.modules

    @classmethod
    def _parse_git_log(cls, log_output):
        '''Parse `git log` output into a list of commit dicts.

        A commit is emitted when its ``Date:`` line is reached; the
        timezone offset of the date is dropped.
        '''
        commits = []
        commit = None
        for line in cls._as_text(log_output).split('\n'):
            if line.startswith('commit '):
                commit = {
                    'name': None,
                    'email': None,
                    'login': None,
                    'hash': line.split()[-1],
                    'date': None
                }
                continue

            if commit is None:
                # nothing to attach Author:/Date: lines to yet
                continue

            # Author: Matt Clay <*****@*****.**>
            if line.startswith('Author: '):
                line = line.replace('Author: ', '')
                line = line.replace('<', '').replace('>', '')
                lparts = line.split()

                if lparts and '@' in lparts[-1]:
                    commit['email'] = lparts[-1]
                    commit['name'] = ' '.join(lparts[:-1])

                if commit['email'] and \
                        'noreply.github.com' in commit['email']:
                    commit['login'] = commit['email'].split('@')[0]

            # Date:   Sat Jan 28 23:28:53 2017 -0800
            if line.startswith('Date:'):
                dstr = line.split(':', 1)[1].strip()
                dstr = ' '.join(dstr.split(' ')[:-1])
                commit['date'] = datetime.datetime.strptime(
                    dstr,
                    '%a %b %d %H:%M:%S %Y'
                )
                commits.append(commit)

        return commits

    def get_module_commits(self):
        '''Fill ``self.commits`` with the git history of every module.

        The parsed history is cached per file and refreshed when the
        file's mtime differs from the cached one.
        '''
        for k in sorted(self.modules.keys()):
            self.commits[k] = []
            cpath = os.path.join(self.checkoutdir, k)
            if not os.path.isfile(cpath):
                continue

            mtime = os.path.getmtime(cpath)
            refresh = False
            pfile = os.path.join(
                self.scraper_cache,
                k.replace('/', '_') + '.commits.pickle'
            )

            if not os.path.isfile(pfile):
                refresh = True
            else:
                with open(pfile, 'rb') as f:
                    pdata = pickle.load(f)
                if pdata[0] == mtime:
                    self.commits[k] = pdata[1]
                else:
                    refresh = True

            if refresh:
                logging.info('refresh commit cache for %s' % k)
                cmd = 'cd %s; git log --follow %s' % (self.checkoutdir, k)
                (rc, so, se) = run_command(cmd)
                self.commits[k].extend(self._parse_git_log(so))

                with open(pfile, 'wb') as f:
                    pickle.dump((mtime, self.commits[k]), f)

    def last_commit_for_file(self, filepath):
        '''Return the hash of the last commit touching filepath.'''
        # git log --pretty=format:'%H' -1
        #   lib/ansible/modules/cloud/amazon/ec2_metric_alarm.py
        cmd = 'cd %s; git log --pretty=format:\'%%H\' -1 %s' % \
            (self.checkoutdir, filepath)
        (rc, so, se) = run_command(cmd)
        return self._as_text(so).strip()

    def get_module_blames(self):
        ''' Scrape the blame page for each module and store it '''

        keys = sorted(self.modules.keys())

        # scrape the data
        for k in keys:
            cpath = os.path.join(self.checkoutdir, k)
            if not os.path.isfile(cpath):
                self.committers[k] = {}
                continue

            ghash = self.last_commit_for_file(k)
            pfile = os.path.join(
                self.scraper_cache,
                k.replace('/', '_') + '.blame.pickle'
            )
            sargs = ['ansible', 'ansible', 'devel', k]

            refresh = False
            if not os.path.isfile(pfile):
                refresh = True
            else:
                with open(pfile, 'rb') as f:
                    pdata = pickle.load(f)
                if pdata[0] == ghash:
                    self.committers[k] = pdata[1]
                else:
                    refresh = True

            if refresh:
                uns = self.gws.get_usernames_from_filename_blame(*sargs)
                self.committers[k] = uns
                with open(pfile, 'wb') as f:
                    pickle.dump((ghash, uns), f)

        # add scraped logins to the map
        for k in keys:
            for x in self.commits.get(k, []):
                if x['email'] in ['@']:
                    continue
                if x['email'] not in self.emailmap:
                    self.emailmap[x['email']] = None
                if x['login']:
                    self.emailmap[x['email']] = x['login']
                    continue

                xhash = x['hash']
                for ck, cv in self.committers[k].items():
                    if xhash in cv:
                        self.emailmap[x['email']] = ck
                        break

        # fill in what we can ...
        for k in keys:
            for x in self.commits.get(k, []):
                if x['login'] or x['email'] in ['@']:
                    continue
                login = self.emailmap.get(x['email'])
                if not login:
                    continue
                x['login'] = login
                hashes = self.committers[k].setdefault(login, [])
                if x['hash'] not in hashes:
                    hashes.append(x['hash'])

    def set_maintainers(self):
        '''Define the maintainers for each module'''
        mkeys = list(self.maintainers.keys())
        for k, v in self.modules.items():
            if not v['filepath']:
                continue

            # the longest maintainers key contained in the path wins
            # (the first one found in case of a tie)
            candidates = [mkey for mkey in mkeys if mkey in v['filepath']]
            best_match = max(candidates, key=len) if candidates else None

            if best_match:
                self.modules[k]['maintainers_key'] = best_match
                self.modules[k]['maintainers'] = self.maintainers[best_match]
            elif v['metadata'].get('supported_by') not in ['community']:
                self.modules[k]['maintainers_key'] = best_match
                if v['metadata'].get('supported_by') == 'core':
                    self.modules[k]['maintainers'] = ['ansible']
                # curated? ... what now?

            # save a pristine copy so that higher level code can still use it
            self.modules[k]['_maintainers'] = \
                list(self.modules[k]['maintainers'])

        # set the namespace maintainers ...
        for k, v in self.modules.items():
            if 'namespace_maintainers' not in self.modules[k]:
                self.modules[k]['namespace_maintainers'] = []
            ns = v.get('namespace')
            if ns:
                self.modules[k]['namespace_maintainers'] = \
                    self.get_maintainers_for_namespace(ns)

    def split_topics_from_path(self, module_file):
        '''Derive topic, subtopic and namespace from a module path.'''
        subpath = module_file.replace('lib/ansible/modules/', '')
        path_parts = subpath.split('/')
        topic = path_parts[0]

        if len(path_parts) > 2:
            subtopic = path_parts[1]
            fulltopic = '/'.join(path_parts[0:2])
        else:
            subtopic = None
            fulltopic = path_parts[0]

        return {
            'fulltopic': fulltopic,
            'namespace': fulltopic,
            'topic': topic,
            'subtopic': subtopic
        }

    @staticmethod
    def _logins_from_authors(entries):
        '''Extract github logins from the values of a YAML author field.'''
        authors = []
        for author in entries:
            if not isinstance(author, type(u'')) and \
                    not isinstance(author, str):
                # e.g. an empty list item parsed as None
                continue
            if 'ansible core team' in author.lower():
                authors.append('ansible')
            elif '@' in author:
                # Some Name (@login)
                for word in author.split():
                    if '@' in word and '(' in word and ')' in word:
                        word = word.split('(')[-1]
                        word = word.split(')')[0]
                        word = word.strip()
                        if word.startswith('@'):
                            authors.append(word.replace('@', '', 1))
            elif 'github.com/' in author:
                # {'author': 'Henrique Rodrigues (github.com/Sodki)'}
                idx = author.find('github.com/')
                authors.append(author[idx + 11:].replace(')', ''))
            elif '(' in author and len(author.split()) == 3:
                # Mathieu Bultel (matbu)
                # the login used to be computed and then dropped
                idx = author.find('(')
                authors.append(author[idx + 1:].replace(')', ''))
        return authors

    def get_module_authors(self, module_file):
        """Grep the authors out of the module docstrings"""

        authors = []
        if not os.path.isfile(module_file):
            return authors

        documentation = ''
        inphase = False

        with open(module_file, 'rb') as f:
            for line in f:
                line = self._as_text(line)
                if 'DOCUMENTATION' in line:
                    inphase = True
                    continue
                # only the quotes closing DOCUMENTATION end the scan; a
                # docstring placed before it must not stop the search
                if inphase and line.strip().endswith(("'''", '"""')):
                    break
                if inphase:
                    documentation += line

        if not documentation:
            return authors

        # clean out any other yaml besides author to save time
        inphase = False
        author_lines = ''
        for x in documentation.split('\n'):
            if x.startswith('author'):
                inphase = True
            if inphase and not x.strip().startswith(('-', 'author')):
                break
            if inphase:
                author_lines += x + '\n'

        if not author_lines:
            return authors

        try:
            # safe_load: module files are repository content and must not
            # be able to construct arbitrary python objects
            ydata = yaml.safe_load(author_lines)
        except Exception as e:
            print(e)
            return authors

        # quit early if the yaml was not valid
        if not ydata or not isinstance(ydata, dict):
            return authors

        # sometimes the field is 'author', sometimes it is 'authors'
        if 'authors' in ydata:
            ydata['author'] = ydata['authors']

        # quit if the key was not found
        if 'author' not in ydata:
            return authors

        if not isinstance(ydata['author'], list):
            ydata['author'] = [ydata['author']]

        return self._logins_from_authors(ydata['author'])

    def fuzzy_match(self, repo=None, title=None, component=None):
        '''Fuzzy matching for modules'''

        # title and component default to None; treat None like empty text
        title = title or ''

        if component:
            # https://github.com/ansible/ansible/issues/18179
            if 'validate-modules' in component:
                return None

            # https://github.com/ansible/ansible/issues/20368
            if 'module_utils' in component:
                return None

            # authorized_keys vs. authorized_key
            if component.endswith('s'):
                tm = self.find_match(component[:-1])
                if tm:
                    return tm['name']

        match = None
        known_modules = [v['name'] for v in self.modules.values()]

        title = title.lower()
        title = title.replace(':', '')
        title_matches = [x for x in known_modules if x + ' module' in title]
        if not title_matches:
            title_matches = [
                x for x in known_modules if title.startswith(x + ' ')
            ]
            if not title_matches:
                title_matches = [
                    x for x in known_modules if ' ' + x + ' ' in title
                ]

        # don't do singular word matching in title for ansible/ansible
        if component:
            cmatches = [
                x for x in known_modules
                if x in component and '_' + x not in component
            ]

            # use title ... ?
            if title_matches:
                cmatches = [x for x in cmatches if x in title_matches]

            if cmatches:
                match = cmatches[0]
                print("module - component matches: %s" % cmatches)

        if not match:
            if len(title_matches) == 1:
                match = title_matches[0]
            else:
                print("module - title matches: %s" % title_matches)

        return match

    def is_multi(self, rawtext):
        '''Is the string a list or a glob of modules?'''
        if rawtext:
            # clean up lines
            lines = [x.strip() for x in rawtext.split('\n') if x.strip()]
            lines = [x for x in lines if len(x) > 2]

            if len(lines) > 1:
                return True

            if lines and lines[0].endswith('*'):
                return True

        return False

    # https://github.com/ansible/ansible-modules-core/issues/3831
    def multi_match(self, rawtext):
        '''Return a list of matches for a given glob or list of names'''
        matches = []
        lines = [x.strip() for x in rawtext.split('\n') if x.strip()]
        for line in lines:
            # is it an exact name, a path, a globbed name, a globbed path?
            if line.endswith('*'):
                thiskey = line.replace('*', '')
                keymatches = [k for k in self.modules.keys() if thiskey in k]
                for k in keymatches:
                    matches.append(self.modules[k].copy())
            else:
                match = self.find_match(line)
                if match:
                    matches.append(match)

        # unique the list (dicts are unhashable, so no set())
        unique = []
        for x in matches:
            if x not in unique:
                unique.append(x)

        return unique

    def set_module_metadata(self):
        '''Merge ANSIBLE_METADATA into every module's ``metadata``.'''
        for k, v in self.modules.items():
            if not v['filepath']:
                continue
            mfile = os.path.join(self.checkoutdir, v['filepath'])
            if not mfile.endswith('.py'):
                # metadata is only the .py files ...
                mfile = os.path.splitext(mfile)[0] + '.py'
            self.modules[k]['metadata'].update(self.get_module_metadata(mfile))

    def get_module_metadata(self, module_file):
        '''Return the ANSIBLE_METADATA dict of module_file, or {}.'''
        meta = {}

        if not os.path.isfile(module_file):
            return meta

        rawmeta = ''
        inphase = False
        with open(module_file, 'rb') as f:
            for line in f:
                line = self._as_text(line)
                if line.startswith('ANSIBLE_METADATA'):
                    inphase = True
                if line.startswith('DOCUMENTATION'):
                    break
                if inphase:
                    rawmeta += line
        rawmeta = rawmeta.replace('ANSIBLE_METADATA =', '', 1)
        rawmeta = rawmeta.strip()
        try:
            meta = ast.literal_eval(rawmeta)
        except (SyntaxError, ValueError):
            # missing or non-literal metadata is treated as "no metadata"
            return {}
        if not isinstance(meta, dict):
            # callers merge the result with dict.update()
            return {}
        return meta

    def set_module_imports(self):
        '''Store the imports of every module under its ``imports`` key.'''
        for k, v in self.modules.items():
            if not v['filepath']:
                continue
            mfile = os.path.join(self.checkoutdir, v['filepath'])
            self.modules[k]['imports'] = self.get_module_imports(mfile)

    def get_module_imports(self, module_file):
        '''Return the dotted names imported by module_file.

        ``import a.b`` yields ``a.b``; ``from a.b import c, d`` yields
        ``a.b.c`` and ``a.b.d``.  A missing file yields an empty list.
        '''
        # import ansible.module_utils.nxos
        # from ansible.module_utils.netcfg import NetworkConfig, dumps
        # from ansible.module_utils.network import NetworkModule

        mimports = []
        if not os.path.isfile(module_file):
            return mimports

        with open(module_file, 'rb') as f:
            for line in f:
                line = self._as_text(line).strip()
                line = line.replace(',', '')
                if line.startswith('import') or \
                        ('import' in line and 'from' in line):
                    lparts = line.split()
                    if line.startswith('import '):
                        mimports.append(lparts[1])
                    elif line.startswith('from '):
                        mpath = lparts[1] + '.'
                        for spath in lparts[3:]:
                            mimports.append(mpath + spath)

        return mimports

    @property
    def all_maintainers(self):
        '''Unique maintainers across ``self.maintainers``, in first-seen order.'''
        maintainers = []
        for m in self.maintainers.values():
            if not isinstance(m, list):
                m = [m]
            for mi in m:
                if mi not in maintainers:
                    maintainers.append(mi)
        return maintainers

    def get_maintainers_for_namespace(self, namespace):
        '''Unique, non-blank maintainers of all modules in namespace.'''
        maintainers = []
        for v in self.modules.values():
            if 'namespace' not in v or 'maintainers' not in v:
                continue
            if v['namespace'] == namespace:
                for m in v['maintainers']:
                    if m not in maintainers:
                        maintainers.append(m)
        return [x for x in maintainers if x.strip()]

    @staticmethod
    def replace_ansible(maintainers, ansible_members, bots=None):
        '''Replace -ansible- with the -humans- in the org'''
        bots = bots or []
        newlist = []
        for m in maintainers:
            if m != 'ansible':
                newlist.append(m)
            else:
                newlist += ansible_members
        return [x for x in sorted(set(newlist)) if x not in bots]
class ModuleIndexer(object): EMPTY_MODULE = { u'authors': [], u'name': None, u'namespaced_module': None, u'namespace_maintainers': [], u'deprecated': False, u'deprecated_filename': None, u'dirpath': None, u'filename': None, u'filepath': None, u'fulltopic': None, u'maintainers': [], u'_maintainers': [], u'maintainers_keys': None, u'metadata': {}, u'repo_filename': None, u'repository': u'ansible', u'subtopic': None, u'topic': None, u'imports': [] } def __init__(self, commits=True, blames=True, botmetafile=None, maintainers=None, gh_client=None, cachedir=u'~/.ansibullbot/cache', gitrepo=None): ''' Maintainers: defaultdict(dict) where keys are filepath and values are dict gh_client: GraphQL GitHub client ''' self.get_commits = commits self.get_blames = blames self.botmetafile = botmetafile self.botmeta = {} # BOTMETA.yml file with minor updates (macro rendered, empty default values fixed) self.modules = {} # keys: paths of files belonging to the repository self.maintainers = maintainers or {} self.importmap = {} self.scraper_cache = os.path.join(cachedir, u'ansible.modules.scraper') self.scraper_cache = os.path.expanduser(self.scraper_cache) self.gws = GithubWebScraper(cachedir=self.scraper_cache) self.gqlc = gh_client self.files = [] if gitrepo: self.gitrepo = gitrepo else: self.gitrepo = GitRepoWrapper(cachedir=cachedir, repo=u'https://github.com/ansible/ansible') # sqlalchemy unc = os.path.join(cachedir, u'ansible_module_indexer.db') unc = os.path.expanduser(unc) unc = u'sqlite:///' + unc self.engine = create_engine(unc) self.Session = sessionmaker(bind=self.engine) self.session = self.Session() Email.metadata.create_all(self.engine) Blame.metadata.create_all(self.engine) # committers by module self.committers = {} # commits by module self.commits = {} # map of email to github login self.emails_cache = {} # load the bot meta self.update(force=True) def update(self, force=False): '''Reload everything if there are new commits''' changed = 
self.gitrepo.manage_checkout() if changed or force: self.get_files() self.parse_metadata() def get_files(self): '''Cache a list of filenames in the checkout''' cmd = u'cd {}; git ls-files'.format(self.gitrepo.checkoutdir) (rc, so, se) = run_command(cmd) files = to_text(so).split(u'\n') files = [x.strip() for x in files if x.strip()] self.files = files def parse_metadata(self): if self.botmetafile is not None: with open(self.botmetafile, 'rb') as f: rdata = f.read() else: fp = u'.github/BOTMETA.yml' rdata = self.get_file_content(fp) self.botmeta = BotMetadataParser.parse_yaml(rdata) # load the modules logging.info(u'loading modules') self.get_ansible_modules() def _find_match(self, pattern, exact=False): logging.debug(u'exact:{} matching on {}'.format(exact, pattern)) matches = [] if isinstance(pattern, six.text_type): pattern = to_text(to_bytes(pattern,'ascii', 'ignore'), 'ascii') for k, v in six.iteritems(self.modules): if v[u'name'] == pattern: logging.debug(u'match {} on name: {}'.format(k, v[u'name'])) matches = [v] break if not matches: # search by key ... 
aka the filepath for k, v in six.iteritems(self.modules): if k == pattern: logging.debug(u'match {} on key: {}'.format(k, k)) matches = [v] break if not matches and not exact: # search by properties for k, v in six.iteritems(self.modules): for subkey in v.keys(): if v[subkey] == pattern: logging.debug(u'match {} on subkey: {}'.format(k, subkey)) matches.append(v) if not matches and not exact: # Levenshtein distance should workaround most typos distance_map = {} for k, v in six.iteritems(self.modules): mname = v.get(u'name') if not mname: continue if isinstance(mname, six.text_type): mname = to_text(to_bytes(mname, 'ascii', 'ignore'), 'ascii') try: res = Levenshtein.distance(pattern, mname) except TypeError as e: logging.error(e) if C.DEFAULT_BREAKPOINTS: logging.error(u'breakpoint!') import epdb; epdb.st() distance_map[mname] = [res, k] res = sorted(distance_map.items(), key=lambda x: x[1], reverse=True) if len(pattern) > 3 > res[-1][1]: logging.debug(u'levenshtein ratio match: ({}) {} {}'.format(res[-1][-1], res[-1][0], pattern)) matches = [self.modules[res[-1][-1]]] return matches def find_match(self, pattern, exact=False): '''Exact module name matching''' logging.debug(u'find_match for "{}"'.format(pattern)) BLACKLIST = [ u'module_utils', u'callback', u'network modules', u'networking modules' u'windows modules' ] if not pattern or pattern is None: return None if pattern.lower() == u'core': return None ''' if 'docs.ansible.com' in pattern and '_module.html' in pattern: # http://docs.ansible.com/ansible/latest/copy_module.html # http://docs.ansible.com/ansible/latest/dev_guide/developing_modules.html # http://docs.ansible.com/ansible/latest/postgresql_db_module.html # [helm module](https//docs.ansible.com/ansible/2.4/helm_module.html) # Windows module: win_robocopy\nhttp://docs.ansible.com/ansible/latest/win_robocopy_module.html # Examples:\n* archive (https://docs.ansible.com/ansible/archive_module.html)\n* s3_sync 
(https://docs.ansible.com/ansible/s3_sync_module.html) urls = re.findall( 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', pattern ) #urls = [x for x in urls if '_module.html' in x] #if urls: # import epdb; epdb.st() import epdb; epdb.st() ''' # https://github.com/ansible/ansible/issues/19755 if pattern == u'setup': pattern = u'system/setup.py' if u'/facts.py' in pattern or u' facts.py' in pattern: pattern = u'system/setup.py' # https://github.com/ansible/ansible/issues/18527 # docker-container -> docker_container if u'-' in pattern: pattern = pattern.replace(u'-', u'_') if u'module_utils' in pattern: # https://github.com/ansible/ansible/issues/20368 return None elif u'callback' in pattern: return None elif u'lookup' in pattern: return None elif u'contrib' in pattern and u'inventory' in pattern: return None elif pattern.lower() in BLACKLIST: return None elif u'/' in pattern and not self._find_match(pattern, exact=True): # https://github.com/ansible/ansible/issues/20520 if not pattern.startswith(u'lib/'): keys = self.modules.keys() for k in keys: if pattern in k: ppy = pattern + u'.py' if k.endswith(pattern) or k.endswith(ppy): return self.modules[k] elif pattern.endswith(u'.py') and self._find_match(pattern, exact=False): # https://github.com/ansible/ansible/issues/19889 candidate = self._find_match(pattern, exact=False) if isinstance(candidate, list): if len(candidate) == 1: candidate = candidate[0] if candidate[u'filename'] == pattern: return candidate match = self._find_match(pattern, exact=exact) if not match and not exact: # check for just the basename # 2617: ansible-s-extras/network/cloudflare_dns.py bname = os.path.basename(pattern) match = self._find_match(bname) if not match: # check for deprecated name # _fireball -> fireball match = self._find_match(u'_' + bname) # unique the results if isinstance(match, list) and len(match) > 1: _match = [] for m in match: if m not in _match: _match.append(m) match = _match[:] return 
match
# NOTE: the bare name above closes a statement that begins before this
# section of the file; it is carried over unchanged.


def is_valid(self, mname):
    """Return True when ``mname`` exactly matches a known module."""
    return bool(self.find_match(mname, exact=True))


def get_repository_for_module(self, mname):
    """Return the ``repository`` entry of the module exactly matching
    ``mname``, or None when there is no match."""
    match = self.find_match(mname, exact=True)
    if match:
        return match[u'repository']
    return None


def get_ansible_modules(self):
    """Make a list of known modules.

    Walks ``lib/ansible/modules`` below the checkout directory, fills
    ``self.modules`` and then enriches every entry (metadata, imports,
    optionally commits and blames, maintainers).

    Returns:
        dict: ``self.modules``.
    """
    matches = []
    module_dir = os.path.join(self.gitrepo.checkoutdir, u'lib/ansible/modules')
    module_dir = os.path.expanduser(module_dir)
    for root, _, filenames in os.walk(module_dir):
        for filename in filenames:
            if u'lib/ansible/modules' in root and not filename == u'__init__.py':
                matches.append(os.path.join(root, filename))

    matches = sorted(set(matches))

    self.populate_modules(matches)

    # custom fixes
    newitems = []
    for k, v in six.iteritems(self.modules):

        # include* is almost always an ansible/ansible issue
        # https://github.com/ansible/ansibullbot/issues/214
        if k.endswith(u'/include.py'):
            self.modules[k][u'repository'] = u'ansible'
        # https://github.com/ansible/ansibullbot/issues/214
        if k.endswith(u'/include_vars.py'):
            self.modules[k][u'repository'] = u'ansible'
        if k.endswith(u'/include_role.py'):
            self.modules[k][u'repository'] = u'ansible'

        # ansible maintains these
        if u'include' in k:
            self.modules[k][u'maintainers'] = [u'ansible']

        # deprecated modules are annoying: also register them under the
        # name without the leading underscore (added after the loop so
        # the dict is not resized while it is being iterated)
        if v[u'name'].startswith(u'_'):
            dkey = os.path.dirname(v[u'filepath'])
            dkey = os.path.join(dkey, v[u'filename'].replace(u'_', u'', 1))
            if dkey not in self.modules:
                nd = v.copy()
                nd[u'name'] = nd[u'name'].replace(u'_', u'', 1)
                newitems.append((dkey, nd))

    for ni in newitems:
        self.modules[ni[0]] = ni[1]

    # parse metadata
    logging.debug(u'set module metadata')
    self.set_module_metadata()

    # parse imports
    logging.debug(u'set module imports')
    self.set_module_imports()

    # last modified
    if self.get_commits:
        logging.debug(u'set module commits')
        self.get_module_commits()

    # parse blame
    if self.get_blames and self.get_commits:
        logging.debug(u'set module blames')
        self.get_module_blames()

    # depends on metadata now ...
    logging.debug(u'set module maintainers')
    self.set_maintainers()

    return self.modules


def populate_modules(self, matches):
    """Create one ``self.modules`` entry per path in ``matches``.

    Each entry starts as a deep copy of ``self.EMPTY_MODULE`` and is keyed
    by the path relative to the checkout directory. A special ``meta``
    entry is always added at the end.

    Args:
        matches: iterable of module file paths below the checkout directory.
    """
    prefix = self.gitrepo.checkoutdir + u'/'

    # figure out the names
    for match in matches:
        mdict = copy.deepcopy(self.EMPTY_MODULE)

        mdict[u'filename'] = os.path.basename(match)
        mdict[u'dirpath'] = os.path.dirname(match).replace(prefix, u'')

        filepath = match.replace(prefix, u'')
        mdict[u'filepath'] = filepath

        mdict.update(self.split_topics_from_path(filepath))

        mdict[u'repo_filename'] = mdict[u'filepath'].replace(
            u'lib/ansible/modules/%s/' % mdict[u'repository'], u''
        )

        # clustering/consul
        namespaced = mdict[u'repo_filename']
        namespaced = namespaced.replace(u'.py', u'')
        namespaced = namespaced.replace(u'.ps1', u'')
        mdict[u'namespaced_module'] = namespaced

        mname = os.path.basename(match)
        mname = mname.replace(u'.py', u'')
        mname = mname.replace(u'.ps1', u'')
        mdict[u'name'] = mname

        # deprecated modules
        if mname.startswith(u'_'):
            mdict[u'deprecated'] = True
            deprecated_filename = os.path.dirname(mdict[u'namespaced_module'])
            deprecated_filename = \
                os.path.join(deprecated_filename, mname[1:] + u'.py')
            mdict[u'deprecated_filename'] = deprecated_filename
        else:
            mdict[u'deprecated_filename'] = mdict[u'repo_filename']

        self.modules[filepath] = mdict

    # meta is a special module
    self.modules[u'meta'] = copy.deepcopy(self.EMPTY_MODULE)
    self.modules[u'meta'][u'name'] = u'meta'
    self.modules[u'meta'][u'repo_filename'] = u'meta'


def get_module_commits(self):
    """Fill ``self.commits`` with the ``git log --follow`` history of
    every module file.

    The parsed history is cached per file in ``self.scraper_cache`` as a
    pickle of ``(mtime, commits)`` and re-read from git only when the
    file's mtime differs from the cached one. Keys that are not files in
    the checkout get an empty list.
    """
    for k in sorted(self.modules.keys()):
        self.commits[k] = []
        cpath = os.path.join(self.gitrepo.checkoutdir, k)
        if not os.path.isfile(cpath):
            continue

        mtime = os.path.getmtime(cpath)
        refresh = False
        pfile = os.path.join(
            self.scraper_cache,
            k.replace(u'/', u'_') + u'.commits.pickle'
        )

        if not os.path.isfile(pfile):
            refresh = True
        else:
            pickle_kwargs = {'encoding': 'bytes'} if six.PY3 else {}
            logging.debug(u'load {}'.format(pfile))
            # NOTE(review): the cache is unpickled as-is; it must only
            # ever contain files written by this process.
            with open(pfile, 'rb') as f:
                pdata = pickle_load(f, **pickle_kwargs)
            if pdata[0] == mtime:
                self.commits[k] = pdata[1]
            else:
                refresh = True

        if not refresh:
            continue

        logging.info(u'refresh commit cache for %s' % k)
        cmd = u'cd %s; git log --follow %s' % (self.gitrepo.checkoutdir, k)
        (rc, so, se) = run_command(cmd)

        commit = None
        for line in to_text(so).split(u'\n'):
            if line.startswith(u'commit '):
                commit = {
                    u'name': None,
                    u'email': None,
                    u'login': None,
                    u'hash': line.split()[-1],
                    u'date': None
                }

            # Author/Date lines are only meaningful after a commit line
            if commit is None:
                continue

            # Author: Matt Clay <*****@*****.**>
            if line.startswith(u'Author: '):
                line = line.replace(u'Author: ', u'')
                line = line.replace(u'<', u'')
                line = line.replace(u'>', u'')
                lparts = line.split()

                if lparts and u'@' in lparts[-1]:
                    commit[u'email'] = lparts[-1]
                    commit[u'name'] = u' '.join(lparts[:-1])

                if commit[u'email'] and \
                        u'noreply.github.com' in commit[u'email']:
                    commit[u'login'] = commit[u'email'].split(u'@')[0]

            # Date: Sat Jan 28 23:28:53 2017 -0800
            if line.startswith(u'Date:'):
                dstr = line.split(u':', 1)[1].strip()
                # drop the trailing UTC offset, strptime gets a naive stamp
                dstr = u' '.join(dstr.split(u' ')[:-1])
                ds = datetime.datetime.strptime(
                    to_text(dstr),
                    u'%a %b %d %H:%M:%S %Y'
                )
                commit[u'date'] = ds
                # the Date line is the last field read for a commit
                self.commits[k].append(commit)

        with open(pfile, 'wb') as f:
            pickle_dump((mtime, self.commits[k]), f)


def last_commit_for_file(self, filepath):
    """Return the hash of the most recent commit touching ``filepath``.

    Uses the first entry of ``self.commits[filepath]`` when one is cached,
    otherwise asks git. An empty cached list (stored for paths that are
    not files) no longer raises IndexError; it falls through to git.
    """
    cached = self.commits.get(filepath)
    if cached:
        return cached[0][u'hash']

    # git log --pretty=format:'%H' -1
    # lib/ansible/modules/cloud/amazon/ec2_metric_alarm.py
    cmd = u'cd %s; git log --pretty=format:\'%%H\' -1 %s' % \
        (self.gitrepo.checkoutdir, filepath)
    (rc, so, se) = run_command(cmd)
    return to_text(so).strip()


def fill_commit_logins(self, keys):
    """Fill in the missing ``login`` of cached commits from the email cache.

    For every commit under ``self.commits[k]`` (``k`` in ``keys``) that has
    an email but no login, the login is looked up in ``self.emails_cache``;
    ``...@users.noreply.github.com`` addresses fall back to their local
    part, which is also stored in the cache. Commits that already have a
    login are left untouched.
    """
    for k in keys:
        for commit in self.commits.get(k, []):
            if commit.get(u'login'):
                continue
            email = commit.get(u'email')
            if not email:
                continue
            login = self.emails_cache.get(email)
            if not login and u'@users.noreply.github.com' in email:
                login = email.split(u'@')[0]
                self.emails_cache[email] = login
            if not login:
                logging.debug(u'unknown: {}'.format(email))
                continue
            commit[u'login'] = login


def get_module_blames(self):
    """Synchronise the Email/Blame tables with the current module files
    and fill in commit logins.

    For every module listed in ``self.files`` whose latest commit hash has
    no Blame row yet, blame data is requested from ``self.gqlc`` and the
    missing Email and Blame rows are inserted. Modules not in
    ``self.files`` get an empty ``self.committers`` entry.
    """
    logging.debug(u'build email cache')
    self.emails_cache = dict(
        (x.email, x.login) for x in self.session.query(Email)
    )

    logging.debug(u'build blame cache')
    # a set: only membership is tested below
    blame_cache = set(x.file_commit for x in self.session.query(Blame).all())

    logging.debug(u'eval module hashes')
    changed = False
    keys = sorted(self.modules.keys())
    for k in keys:
        if k not in self.files:
            self.committers[k] = {}
            continue

        ghash = self.last_commit_for_file(k)
        if ghash in blame_cache:
            continue

        logging.debug(u'checking hash for {}'.format(k))
        res = self.session.query(Blame).filter_by(file_name=k, file_commit=ghash).all()
        hashes = [x.file_commit for x in res]
        if ghash in hashes:
            continue

        logging.debug(u'hash {} not found for {}, updating blames'.format(ghash, k))

        scraper_args = [u'ansible', u'ansible', u'devel', k]
        uns, emailmap = self.gqlc.get_usernames_from_filename_blame(*scraper_args)

        # check the emails
        for email, login in emailmap.items():
            if email in self.emails_cache:
                continue
            exists = self.session.query(Email).filter_by(email=email).first()
            if not exists:
                logging.debug(u'insert {}:{}'.format(login, email))
                _email = Email(email=email, login=login)
                self.session.add(_email)
                changed = True

        # check the blames
        for login, commits in uns.items():
            for commit in commits:
                kwargs = {
                    u'file_name': k,
                    u'file_commit': ghash,
                    u'author_commit': commit,
                    u'author_login': login
                }
                exists = self.session.query(Blame).filter_by(**kwargs).first()
                if not exists:
                    logging.debug(u'insert {}:{}:{}'.format(k, commit, login))
                    _blame = Blame(**kwargs)
                    self.session.add(_blame)
                    changed = True

    if changed:
        self.session.commit()
        logging.debug(u're-build email cache')
        self.emails_cache = dict(
            (x.email, x.login) for x in self.session.query(Email)
        )

    # fill in what we can ...
    logging.debug(u'fill in commit logins')
    self.fill_commit_logins(keys)


def get_emails_by_login(self, login):
    """Return the email addresses stored for the github ``login``."""
    res = self.session.query(Email).filter_by(login=login)
    # iterate the query itself to get the Email rows
    return [x.email for x in res]


def _get_module_blames(self):
    """Scrape the blame page for each module and store it.

    Results are cached per file in ``self.scraper_cache`` as a pickle of
    ``(hash, usernames, emailmap)`` and refreshed when the file's last
    commit hash differs from the cached one. Afterwards the email cache
    and the commit logins are completed from the blame data.
    """
    keys = sorted(self.modules.keys())

    # scrape the data
    for k in keys:

        cpath = os.path.join(self.gitrepo.checkoutdir, k)
        if not os.path.isfile(cpath):
            self.committers[k] = {}
            continue

        ghash = self.last_commit_for_file(k)
        pfile = os.path.join(
            self.scraper_cache,
            k.replace(u'/', u'_') + u'.blame.pickle'
        )
        sargs = [u'ansible', u'ansible', u'devel', k]

        refresh = False
        if not os.path.isfile(pfile):
            refresh = True
        else:
            logging.debug(u'load {}'.format(pfile))
            with open(pfile, 'rb') as f:
                pdata = pickle_load(f)
            if C.DEFAULT_BREAKPOINTS:
                logging.error(u'breakpoint!')
                import epdb
                epdb.st()
            if pdata[0] == ghash:
                self.committers[k] = pdata[1]
                if len(pdata) == 3:
                    # use emailmap if available
                    emailmap = pdata[2]
                else:
                    emailmap = {}
            else:
                refresh = True

        if refresh:
            if self.gqlc:
                logging.debug(u'graphql blame usernames {}'.format(pfile))
                uns, emailmap = self.gqlc.get_usernames_from_filename_blame(*sargs)
            else:
                emailmap = {}  # scraping: emails not available
                logging.debug(u'www blame usernames {}'.format(pfile))
                uns = self.gws.get_usernames_from_filename_blame(*sargs)
            self.committers[k] = uns
            with open(pfile, 'wb') as f:
                pickle_dump((ghash, uns, emailmap), f)

        for email, github_id in emailmap.items():
            if email not in self.emails_cache:
                self.emails_cache[email] = github_id

    # add scraped logins to the map
    for k in keys:
        for idx, x in enumerate(self.commits[k]):
            if x[u'email'] in [u'@']:
                continue
            if x[u'email'] not in self.emails_cache:
                self.emails_cache[x[u'email']] = None
            if x[u'login']:
                self.emails_cache[x[u'email']] = x[u'login']
                continue

            xhash = x[u'hash']
            for ck, cv in six.iteritems(self.committers[k]):
                if xhash in cv:
                    self.emails_cache[x[u'email']] = ck
                    break

    # fill in what we can ...
    for k in keys:
        for idx, x in enumerate(self.commits[k]):
            if not x[u'login']:
                if x[u'email'] in [u'@']:
                    continue
                if self.emails_cache[x[u'email']]:
                    login = self.emails_cache[x[u'email']]
                    xhash = x[u'hash']
                    self.commits[k][idx][u'login'] = login
                    if login not in self.committers[k]:
                        self.committers[k][login] = []
                    if xhash not in self.committers[k][login]:
                        self.committers[k][login].append(xhash)


def set_maintainers(self):
    """Define the maintainers for each module.

    Maintainers are the authors found in the module file plus the
    maintainers listed in ``self.botmeta['files']`` (exact path, else the
    longest matching path prefix), minus the people listed as ignored.
    Also stores ``_maintainers`` (a copy) and ``namespace_maintainers``.
    """
    # grep the authors:
    for k, v in six.iteritems(self.modules):
        if v[u'filepath'] is None:
            continue
        mfile = os.path.join(self.gitrepo.checkoutdir, v[u'filepath'])
        authors = self.get_module_authors(mfile)
        self.modules[k][u'authors'] = authors

        # authors are maintainers by -default-
        self.modules[k][u'maintainers'] += authors
        self.modules[k][u'maintainers'] = \
            sorted(set(self.modules[k][u'maintainers']))

    botfiles = self.botmeta[u'files']
    metadata = botfiles.keys()
    for k, v in six.iteritems(self.modules):
        if k == u'meta':
            continue

        current = self.modules[k]

        if k in botfiles:
            # There are metadata in .github/BOTMETA.yml for this file
            # copy maintainers_keys
            current[u'maintainers_keys'] = botfiles[k][u'maintainers_keys'][:]

            if botfiles[k]:
                for maintainer in botfiles[k].get(u'maintainers', []):
                    if maintainer not in current[u'maintainers']:
                        current[u'maintainers'].append(maintainer)

                # remove the people who want to be ignored
                if u'ignored' in botfiles[k]:
                    for x in botfiles[k][u'ignored']:
                        if x in current[u'maintainers']:
                            current[u'maintainers'].remove(x)

        else:
            # There isn't metadata in .github/BOTMETA.yml for this file:
            # fall back to the longest key that prefixes the file path
            best_match = None
            for mkey in metadata:
                if v[u'filepath'].startswith(mkey):
                    if not best_match:
                        best_match = mkey
                        continue
                    if len(mkey) > len(best_match):
                        best_match = mkey

            if best_match:
                current[u'maintainers_keys'] = [best_match]
                for maintainer in botfiles[best_match].get(u'maintainers', []):
                    if maintainer not in current[u'maintainers']:
                        current[u'maintainers'].append(maintainer)

                # remove the people who want to be ignored
                for ignored in botfiles[best_match].get(u'ignored', []):
                    if ignored in current[u'maintainers']:
                        current[u'maintainers'].remove(ignored)

        # save a pristine copy so that higher level code can still use it
        current[u'maintainers'] = sorted(set(current[u'maintainers']))
        current[u'_maintainers'] = [x for x in current[u'maintainers']]

    # set the namespace maintainers ...
    for k, v in six.iteritems(self.modules):
        if u'namespace_maintainers' not in self.modules[k]:
            self.modules[k][u'namespace_maintainers'] = []
        if v.get(u'namespace'):
            ns = v.get(u'namespace')
            nms = self.get_maintainers_for_namespace(ns)
            self.modules[k][u'namespace_maintainers'] = nms


def split_topics_from_path(self, module_file):
    """Derive topic information from a module path.

    ``lib/ansible/modules/`` is removed from ``module_file``; the first
    remaining path component is the topic. With more than two components
    the second one is the subtopic and ``topic/subtopic`` the full topic.

    Returns:
        dict with the keys ``fulltopic``, ``namespace`` (same value as
        ``fulltopic``), ``topic`` and ``subtopic`` (None when absent).
    """
    subpath = module_file.replace(u'lib/ansible/modules/', u'')
    path_parts = subpath.split(u'/')
    topic = path_parts[0]

    if len(path_parts) > 2:
        subtopic = path_parts[1]
        fulltopic = u'/'.join(path_parts[0:2])
    else:
        subtopic = None
        fulltopic = path_parts[0]

    return {
        u'fulltopic': fulltopic,
        u'namespace': fulltopic,
        u'topic': topic,
        u'subtopic': subtopic
    }


def get_module_authors(self, module_file):
    """Grep the authors out of the module docstrings.

    Collects the lines after the first one containing ``DOCUMENTATION``
    up to the first line ending in a triple quote, keeps only the
    ``author`` key with its list items, parses that as YAML and converts
    every author string to github ids with ``extract_github_id``.

    Returns:
        list: github ids; empty when the file is missing, has no
        documentation or author section, or the YAML cannot be parsed.
    """
    if not os.path.exists(module_file):
        return []

    documentation = b''
    inphase = False

    with io.open(module_file, 'rb') as f:
        for line in f:
            if b'DOCUMENTATION' in line:
                inphase = True
                continue
            if line.strip().endswith((b"'''", b'"""')):
                break
            if inphase:
                documentation += line

    if not documentation:
        return []

    # clean out any other yaml besides author to save time
    inphase = False
    author_lines = u''
    for x in to_text(documentation).split(u'\n'):
        if x.startswith(u'author'):
            inphase = True
        if inphase and not x.strip().startswith((u'-', u'author')):
            break
        if inphase:
            author_lines += x + u'\n'

    if not author_lines:
        return []

    try:
        ydata = yaml.load(author_lines, BotYAMLLoader)
    except Exception as e:
        # best effort: a module with unparsable author data has no authors
        logging.warning(u'cannot parse authors of %s: %s' % (module_file, e))
        return []

    # quit early if the yaml was not valid
    if not ydata:
        return []

    # quit if the key was not found
    if u'author' not in ydata:
        return []

    if not isinstance(ydata[u'author'], list):
        ydata[u'author'] = [ydata[u'author']]

    authors = []
    for author in ydata[u'author']:
        github_ids = self.extract_github_id(author)
        if github_ids:
            authors.extend(github_ids)

    return authors


def extract_github_id(self, author):
    """Return the github ids found in one ``author`` string.

    Recognised forms: "ansible core team" (any case) -> ``ansible``;
    ``@login`` handles (emails are not taken for handles);
    ``github.com/login``; ``First Last (login)``. Any email address in
    ``<...>`` or ``(...)`` is additionally resolved through
    ``self.emails_cache``.

    Returns:
        list: unique ids, in no particular order; ``[]`` for None.
    """
    authors = set()

    if author is None:
        return []

    if u'ansible core team' in author.lower():
        authors.add(u'ansible')
    elif u'@' in author:
        # match github ids but not emails
        authors.update(re.findall(r'(?<!\w)@([\w-]+)(?![\w.])', author))
    elif u'github.com/' in author:
        # {'author': 'Henrique Rodrigues (github.com/Sodki)'}
        idx = author.find(u'github.com/')
        author = author[idx+11:]
        authors.add(author.replace(u')', u''))
    elif u'(' in author and len(author.split()) == 3:
        # Mathieu Bultel (matbu)
        idx = author.find(u'(')
        author = author[idx+1:]
        authors.add(author.replace(u')', u''))

    # search for emails
    for email in re.findall(r'[<(]([^@]+@[^)>]+)[)>]', author):
        github_id = self.emails_cache.get(email)
        if github_id:
            authors.add(github_id)

    return list(authors)


def fuzzy_match(self, repo=None, title=None, component=None):
    """Fuzzy matching for modules.

    Tries to find the module(s) an issue is about from its ``component``
    text and, failing that, from its ``title``.

    Args:
        repo: unused.
        title: issue title; None is treated as an empty string.
        component: component text; None is treated as an empty string.

    Returns:
        A module name, a list of module names (glob components or
        components mentioning "modules"), or None.
    """
    # the declared defaults are None: normalise so that the string
    # operations below do not raise
    component = component or u''
    title = title or u''

    logging.debug(u'fuzzy match {}'.format(
        to_text(to_bytes(component, 'ascii', 'ignore'), 'ascii'))
    )

    if component.lower() == u'core':
        return None

    # https://github.com/ansible/ansible/issues/18179
    if u'validate-modules' in component:
        return None

    # https://github.com/ansible/ansible/issues/20368
    if u'module_utils' in component:
        return None

    if u'new module' in component:
        return None

    # authorized_keys vs. authorized_key
    if component.endswith(u's'):
        tm = self.find_match(component[:-1])
        if tm:
            if not isinstance(tm, list):
                return tm[u'name']
            elif len(tm) == 1:
                return tm[0][u'name']
            else:
                if C.DEFAULT_BREAKPOINTS:
                    logging.error(u'breakpoint!')
                    import epdb
                    epdb.st()

    match = None
    known_modules = [
        v[u'name'] for v in self.modules.values()
        if v[u'name'] not in [u'include']
    ]

    title = title.lower()
    title = title.replace(u':', u'')
    title_matches = [x for x in known_modules if x + u' module' in title]

    if not title_matches:
        title_matches = [x for x in known_modules if title.startswith(x + u' ')]
        if not title_matches:
            title_matches = \
                [x for x in known_modules if u' ' + x + u' ' in title]

    if title_matches:
        title_matches = [x for x in title_matches if x != u'at']

    # don't do singular word matching in title for ansible/ansible
    # (starts as a list so the title filter below can always iterate it)
    cmatches = []
    if component:
        cmatches = [x for x in known_modules if x in component]
        cmatches = [x for x in cmatches if not u'_' + x in component]

    # globs
    if not cmatches and u'*' in component:
        fmatches = [x for x in known_modules if fnmatch.fnmatch(x, component)]
        if fmatches:
            cmatches = fmatches[:]

    if title_matches:
        # use title ... ?
        cmatches = [x for x in cmatches if x in title_matches and x not in [u'at']]

    if cmatches:
        if len(cmatches) >= 1 and (u'*' not in component and u'modules' not in component):
            match = cmatches[0]
        else:
            match = cmatches[:]
        logging.debug("module - component matches: %s" % cmatches)

    if not match:
        if len(title_matches) == 1:
            match = title_matches[0]
        else:
            logging.debug("module - title matches: %s" % title_matches)

    return match


def is_multi(self, rawtext):
    """Is the string a list or a glob of modules?

    True when ``rawtext`` has more than one non-blank line longer than two
    characters, or when its only such line ends with ``*``.
    """
    if rawtext:
        # clean up lines
        lines = [x.strip() for x in rawtext.split(u'\n') if x.strip()]
        lines = [x for x in lines if len(x) > 2]

        if len(lines) > 1:
            return True

        if lines and lines[0].strip().endswith(u'*'):
            return True

    return False


# https://github.com/ansible/ansible-modules-core/issues/3831
def multi_match(self, rawtext):
    """Return a list of matches for a given glob or list of names.

    Every non-blank line of ``rawtext`` is either a glob (ends with
    ``*``), matched as a substring against the keys of ``self.modules``,
    or a name/path resolved with ``self.find_match``. Duplicates are
    dropped, first occurrence wins.
    """
    matches = []
    lines = [x.strip() for x in rawtext.split(u'\n') if x.strip()]

    for line in lines:
        # is it an exact name, a path, a globbed name, a globbed path?
        if line.endswith(u'*'):
            thiskey = line.replace(u'*', u'')
            keymatches = [k for k in self.modules.keys() if thiskey in k]
            for k in keymatches:
                matches.append(self.modules[k].copy())
        else:
            match = self.find_match(line)
            if match:
                matches.append(match)

    # unique the list (entries are dicts, hence no set)
    tmplist = []
    for x in matches:
        if x not in tmplist:
            tmplist.append(x)

    return tmplist


def set_module_metadata(self):
    """Update the ``metadata`` dict of every module that has a filepath
    with the metadata read from its ``.py`` file."""
    for k, v in six.iteritems(self.modules):
        if not v[u'filepath']:
            continue
        mfile = os.path.join(self.gitrepo.checkoutdir, v[u'filepath'])
        if not mfile.endswith(u'.py'):
            # metadata is only the .py files ...
            ext = mfile.split(u'.')[-1]
            mfile = mfile.replace(u'.' + ext, u'.py', 1)

        self.modules[k][u'metadata'].update(self.get_module_metadata(mfile))


def get_module_metadata(self, module_file):
    """Return the ``ANSIBLE_METADATA`` dict literal of ``module_file``.

    The text from the line starting with ``ANSIBLE_METADATA`` up to the
    line starting with ``DOCUMENTATION`` is evaluated as a Python literal.
    Byte-string keys, values and list items are converted to text.

    Returns:
        dict: the metadata; empty when the file is missing, has no
        metadata, or the metadata is not a plain dict literal.
    """
    meta = {}

    if not os.path.isfile(module_file):
        return meta

    rawmeta = u''
    inphase = False
    with io.open(module_file, 'r', encoding='utf-8') as f:
        for line in f:
            if line.startswith(u'ANSIBLE_METADATA'):
                inphase = True
            if line.startswith(u'DOCUMENTATION'):
                break
            if inphase:
                rawmeta += line

    rawmeta = rawmeta.replace(u'ANSIBLE_METADATA =', u'', 1)
    rawmeta = rawmeta.strip()

    try:
        parsed = ast.literal_eval(rawmeta)
    except (SyntaxError, ValueError, TypeError):
        # empty text, broken syntax or a non-literal such as dict(...)
        return meta

    if not isinstance(parsed, dict):
        return meta

    for k, v in parsed.items():
        # ``bytes`` is the binary type on python 3 and ``str`` on python 2
        if isinstance(k, bytes):
            k = to_text(k)
        if isinstance(v, bytes):
            v = to_text(v)
        if isinstance(v, list):
            v = [to_text(i) if isinstance(i, bytes) else i for i in v]
        meta[k] = v

    return meta


def set_module_imports(self):
    """Store the imports of every module that has a filepath under its
    ``imports`` key."""
    for k, v in six.iteritems(self.modules):
        if not v[u'filepath']:
            continue
        mfile = os.path.join(self.gitrepo.checkoutdir, v[u'filepath'])
        self.modules[k][u'imports'] = self.get_module_imports(mfile)


def get_module_imports(self, module_file):
    """Return the names imported by ``module_file``.

    ``import x`` lines yield ``x``; ``from x import a, b`` lines yield
    ``x.a`` and ``x.b``. The file is scanned line by line, it is not
    parsed as Python.

    Returns:
        list: imported names as text; empty when the file does not exist.
    """
    mimports = []

    if not os.path.isfile(module_file):
        return mimports

    with open(module_file, 'rb') as f:
        for line in f:
            line = line.strip()
            line = line.replace(b',', b'')
            if line.startswith(b'import') or \
                    (b'import' in line and b'from' in line):
                lparts = line.split()
                if line.startswith(b'import '):
                    mimports.append(lparts[1])
                elif line.startswith(b'from '):
                    mpath = lparts[1] + b'.'
                    for spath in lparts[3:]:
                        mimports.append(mpath + spath)

    return [to_text(m) for m in mimports]


@property
def all_maintainers(self):
    """Set of all maintainers listed under ``self.botmeta['files']``."""
    maintainers = set()
    for path, metadata in self.botmeta[u'files'].items():
        maintainers.update(metadata.get(u'maintainers', []))
    return maintainers


@property
def all_authors(self):
    """Set of all authors recorded in ``self.modules``."""
    authors = set()
    for key, metadata in self.modules.items():
        authors.update(metadata.get(u'authors', []))
    return authors


def get_maintainers_for_namespace(self, namespace):
    """Return the maintainers of all modules whose ``namespace`` equals
    ``namespace``, without duplicates or blank names, in first-seen
    order."""
    maintainers = []
    for k, v in self.modules.items():
        if u'namespace' not in v or u'maintainers' not in v:
            continue
        if v[u'namespace'] == namespace:
            for m in v[u'maintainers']:
                if m not in maintainers:
                    maintainers.append(m)
    return [x for x in maintainers if x.strip()]


@staticmethod
def replace_ansible(maintainers, ansible_members, bots=None):
    """Replace -ansible- with the -humans- in the org.

    Args:
        maintainers: list of logins, possibly containing ``ansible``.
        ansible_members: logins substituted for ``ansible``.
        bots: logins to drop from the result (default: none).

    Returns:
        list: sorted, de-duplicated logins.
    """
    bots = bots or []
    newlist = []
    for m in maintainers:
        if m != u'ansible':
            newlist.append(m)
        else:
            newlist += ansible_members
    newlist = sorted(set(newlist))
    return [x for x in newlist if x not in bots]


def get_file_content(self, filepath):
    """Return the UTF-8 decoded text of ``filepath`` (relative to the
    checkout directory), or None when it is not a file."""
    fpath = os.path.join(self.gitrepo.checkoutdir, filepath)
    if not os.path.isfile(fpath):
        return None
    with io.open(fpath, 'r', encoding='utf-8') as f:
        return f.read()