Example #1
class ModuleIndexer(object):

    EMPTY_MODULE = {
        'authors': [],
        'name': None,
        'namespaced_module': None,
        'namespace_maintainers': [],
        'deprecated': False,
        'deprecated_filename': None,
        'dirpath': None,
        'filename': None,
        'filepath': None,
        'fulltopic': None,
        'maintainers': [],
        '_maintainers': [],
        'maintainers_keys': None,
        'metadata': {},
        'repo_filename': None,
        'repository': 'ansible',
        'subtopic': None,
        'topic': None,
        'imports': []
    }

    REPO = "http://github.com/ansible/ansible"

    def __init__(self,
                 maintainers=None,
                 gh_client=None,
                 cachedir='~/.ansibullbot/cache'):
        '''
        maintainers: dict keyed by filepath, values are dicts of maintainer data
        gh_client: GraphQL GitHub client
        '''
        # BOTMETA.yml data with minor updates (macros rendered, empty
        # default values fixed)
        self.botmeta = {}
        self.modules = {}  # keys: paths of files belonging to the repository
        self.maintainers = maintainers or {}
        self.checkoutdir = os.path.join(cachedir, 'ansible.modules.checkout')
        self.checkoutdir = os.path.expanduser(self.checkoutdir)
        self.importmap = {}
        self.scraper_cache = '~/.ansibullbot/cache/ansible.modules.scraper'
        self.scraper_cache = os.path.expanduser(self.scraper_cache)
        self.gws = GithubWebScraper(cachedir=self.scraper_cache)
        self.gqlc = gh_client
        self.files = []

        # sqlalchemy
        unc = os.path.join(cachedir, 'ansible_module_indexer.db')
        unc = os.path.expanduser(unc)
        unc = 'sqlite:///' + unc

        self.engine = create_engine(unc)
        self.Session = sessionmaker(bind=self.engine)
        self.session = self.Session()
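        # Email and Blame are sqlalchemy models persisted in the sqlite file
        # above, which doubles as a cross-run cache for blame lookups.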

        Email.metadata.create_all(self.engine)
        Blame.metadata.create_all(self.engine)

        # committers by module
        self.committers = {}
        # commits by module
        self.commits = {}
        # map of email to github login
        self.emails_cache = {}

        # load the bot meta
        self.update(force=True)

    def update(self, force=False):
        '''Reload everything if there are new commits'''
        changed = self.manage_checkout()
        if changed or force:
            self.get_files()
            self.parse_metadata()

    def manage_checkout(self):
        '''Check if there are any changes to the repo'''
        changed = False
        if not os.path.isdir(self.checkoutdir):
            self.create_checkout()
            changed = True
        else:
            changed = self.update_checkout()
        return changed

    def get_files(self):
        '''Cache a list of filenames in the checkout'''
        cmd = 'cd {}; git ls-files'.format(self.checkoutdir)
        (rc, so, se) = run_command(cmd)
        files = so.split('\n')
        files = [x.strip() for x in files if x.strip()]
        self.files = files

    def parse_metadata(self):

        fp = '.github/BOTMETA.yml'
        rdata = self.get_file_content(fp)
        self.botmeta = BotMetadataParser.parse_yaml(rdata)

        # load the modules
        logging.info('loading modules')
        self.get_ansible_modules()

    def create_checkout(self):
        """checkout ansible"""

        print('# creating checkout for module indexer')

        # cleanup
        if os.path.isdir(self.checkoutdir):
            shutil.rmtree(self.checkoutdir)

        #cmd = "git clone http://github.com/ansible/ansible --recursive %s" \
        cmd = "git clone %s %s" \
            % (self.REPO, self.checkoutdir)
        (rc, so, se) = run_command(cmd)
        print str(so) + str(se)

    def update_checkout(self):
        """rebase + pull + update the checkout"""

        changed = False

        cmd = "cd %s ; git pull --rebase" % self.checkoutdir
        (rc, so, se) = run_command(cmd)
        print(str(so) + str(se))

        # If rebase failed, recreate the checkout
        if rc != 0:
            self.create_checkout()
            return True
        else:
            if 'current branch devel is up to date.' not in so.lower():
                changed = True

        return changed

    def _find_match(self, pattern, exact=False):
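        # Lookup precedence (illustrative, hypothetical inputs):
        #   _find_match('copy')                              -> by module name
        #   _find_match('lib/ansible/modules/files/copy.py') -> by dict key
        #   _find_match('copy.py', exact=False)              -> by any property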

        match = None
        for k, v in self.modules.iteritems():
            if v['name'] == pattern:
                match = v
                break
        if not match:
            # search by key ... aka the filepath
            for k, v in self.modules.iteritems():
                if k == pattern:
                    match = v
                    break
        if not match and not exact:
            # search by properties
            for k, v in self.modules.iteritems():
                for subkey in v.keys():
                    if v[subkey] == pattern:
                        match = v
                        break
                if match:
                    break
        return match

    def find_match(self, pattern, exact=False):
        '''Exact module name matching'''
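        # e.g. (hypothetical inputs) find_match('docker-container') normalizes
        # the dashes and resolves docker_container; find_match('fireball')
        # falls back to the deprecated '_fireball' entry.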
        if not pattern:
            return None

        # https://github.com/ansible/ansible/issues/19755
        if pattern == 'setup':
            pattern = 'system/setup.py'

        # https://github.com/ansible/ansible/issues/18527
        #   docker-container -> docker_container
        if '-' in pattern:
            pattern = pattern.replace('-', '_')

        if 'module_utils' in pattern:
            # https://github.com/ansible/ansible/issues/20368
            return None
        elif '/' in pattern and not self._find_match(pattern, exact=True):
            # https://github.com/ansible/ansible/issues/20520
            if not pattern.startswith('lib/'):
                keys = self.modules.keys()
                for k in keys:
                    if pattern in k:
                        ppy = pattern + '.py'
                        if k.endswith(pattern) or k.endswith(ppy):
                            return self.modules[k]
        elif pattern.endswith('.py'):
            # https://github.com/ansible/ansible/issues/19889
            candidate = self._find_match(pattern, exact=False)
            if candidate and candidate['filename'] == pattern:
                return candidate

        match = self._find_match(pattern, exact=exact)
        if not match and not exact:
            # check for just the basename
            #   2617: ansible-s-extras/network/cloudflare_dns.py
            bname = os.path.basename(pattern)
            match = self._find_match(bname)

            if not match:
                # check for deprecated name
                #   _fireball -> fireball
                match = self._find_match('_' + bname)

        return match

    def is_valid(self, mname):
        match = self.find_match(mname)
        if match:
            return True
        else:
            return False

    def get_repository_for_module(self, mname):
        match = self.find_match(mname)
        if match:
            return match['repository']
        else:
            return None

    def get_ansible_modules(self):
        """Make a list of known modules"""

        matches = []
        module_dir = os.path.join(self.checkoutdir, 'lib/ansible/modules')
        module_dir = os.path.expanduser(module_dir)
        for root, dirnames, filenames in os.walk(module_dir):
            for filename in filenames:
                if 'lib/ansible/modules' in root and \
                        not filename == '__init__.py' and \
                        (filename.endswith('.py') or filename.endswith('.ps1')):
                    matches.append(os.path.join(root, filename))

        matches = sorted(set(matches))

        self.populate_modules(matches)

        # custom fixes
        newitems = []
        for k, v in self.modules.iteritems():

            # include* is almost always an ansible/ansible issue
            # https://github.com/ansible/ansibullbot/issues/214
            if k.endswith('/include.py'):
                self.modules[k]['repository'] = 'ansible'
            # https://github.com/ansible/ansibullbot/issues/214
            if k.endswith('/include_vars.py'):
                self.modules[k]['repository'] = 'ansible'
            if k.endswith('/include_role.py'):
                self.modules[k]['repository'] = 'ansible'

            # ansible maintains these
            if 'include' in k:
                self.modules[k]['maintainers'] = ['ansible']

            # deprecated modules are annoying
            if v['name'].startswith('_'):

                dkey = os.path.dirname(v['filepath'])
                dkey = os.path.join(dkey, v['filename'].replace('_', '', 1))
                if dkey not in self.modules:
                    nd = v.copy()
                    nd['name'] = nd['name'].replace('_', '', 1)
                    newitems.append((dkey, nd))

        for ni in newitems:
            self.modules[ni[0]] = ni[1]

        # parse metadata
        logging.debug('set module metadata')
        self.set_module_metadata()

        # parse imports
        logging.debug('set module imports')
        self.set_module_imports()

        # last modified
        logging.debug('set module commits')
        self.get_module_commits()

        # parse blame
        logging.debug('set module blames')
        self.get_module_blames()

        # depends on metadata now ...
        logging.debug('set module maintainers')
        self.set_maintainers()

        return self.modules

    def populate_modules(self, matches):
        # figure out the names
        for match in matches:
            mdict = copy.deepcopy(self.EMPTY_MODULE)

            mdict['filename'] = os.path.basename(match)

            dirpath = os.path.dirname(match)
            dirpath = dirpath.replace(self.checkoutdir + '/', '')
            mdict['dirpath'] = dirpath

            filepath = match.replace(self.checkoutdir + '/', '')
            mdict['filepath'] = filepath

            mdict.update(self.split_topics_from_path(filepath))

            mdict['repo_filename'] = mdict['filepath']\
                .replace('lib/ansible/modules/%s/' % mdict['repository'], '')

            # clustering/consul
            mdict['namespaced_module'] = mdict['repo_filename']
            mdict['namespaced_module'] = \
                mdict['namespaced_module'].replace('.py', '')
            mdict['namespaced_module'] = \
                mdict['namespaced_module'].replace('.ps1', '')

            mname = os.path.basename(match)
            mname = mname.replace('.py', '')
            mname = mname.replace('.ps1', '')
            mdict['name'] = mname

            # deprecated modules
            if mname.startswith('_'):
                mdict['deprecated'] = True
                deprecated_filename = \
                    os.path.dirname(mdict['namespaced_module'])
                deprecated_filename = \
                    os.path.join(deprecated_filename, mname[1:] + '.py')
                mdict['deprecated_filename'] = deprecated_filename
            else:
                mdict['deprecated_filename'] = mdict['repo_filename']

            self.modules[filepath] = mdict

        # meta is a special module
        self.modules['meta'] = copy.deepcopy(self.EMPTY_MODULE)
        self.modules['meta']['name'] = 'meta'
        self.modules['meta']['repo_filename'] = 'meta'

    def get_module_commits(self):
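        # Commit history is cached per module as a (mtime, commits) pickle
        # under scraper_cache, so `git log --follow` only re-runs when the
        # checked-out file's mtime changes.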
        keys = self.modules.keys()
        keys = sorted(keys)
        for k in keys:
            #v = self.modules[k]
            self.commits[k] = []
            cpath = os.path.join(self.checkoutdir, k)
            if not os.path.isfile(cpath):
                continue

            mtime = os.path.getmtime(cpath)
            refresh = False
            pfile = os.path.join(self.scraper_cache,
                                 k.replace('/', '_') + '.commits.pickle')

            if not os.path.isfile(pfile):
                refresh = True
            else:
                with open(pfile, 'rb') as f:
                    pdata = pickle.load(f)
                if pdata[0] == mtime:
                    self.commits[k] = pdata[1]
                else:
                    refresh = True

            if refresh:
                logging.info('refresh commit cache for %s' % k)
                cmd = 'cd %s; git log --follow %s' % (self.checkoutdir, k)
                (rc, so, se) = run_command(cmd)
                for line in so.split('\n'):
                    if line.startswith('commit '):
                        commit = {
                            'name': None,
                            'email': None,
                            'login': None,
                            'hash': line.split()[-1],
                            'date': None
                        }

                    # Author: Matt Clay <*****@*****.**>
                    if line.startswith('Author: '):
                        line = line.replace('Author: ', '')
                        line = line.replace('<', '')
                        line = line.replace('>', '')
                        lparts = line.split()

                        if '@' in lparts[-1]:
                            commit['email'] = lparts[-1]
                            commit['name'] = ' '.join(lparts[:-1])

                        if commit['email'] and \
                                'noreply.github.com' in commit['email']:
                            commit['login'] = commit['email'].split('@')[0]

                    # Date:   Sat Jan 28 23:28:53 2017 -0800
                    if line.startswith('Date:'):
                        dstr = line.split(':', 1)[1].strip()
                        dstr = ' '.join(dstr.split(' ')[:-1])
                        ds = datetime.datetime.strptime(
                            dstr, '%a %b %d %H:%M:%S %Y')
                        commit['date'] = ds
                        self.commits[k].append(commit)

                with open(pfile, 'wb') as f:
                    pickle.dump((mtime, self.commits[k]), f)

    def last_commit_for_file(self, filepath):
        if filepath in self.commits:
            return self.commits[filepath][0]['hash']

        # git log --pretty=format:'%H' -1
        # lib/ansible/modules/cloud/amazon/ec2_metric_alarm.py
        cmd = 'cd %s; git log --pretty=format:\'%%H\' -1 %s' % \
            (self.checkoutdir, filepath)
        (rc, so, se) = run_command(cmd)
        #import epdb; epdb.st()
        return so.strip()

    def get_module_blames(self):
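        # Two tables back this method: Email maps commit emails to GitHub
        # logins, and Blame maps (file_name, file_commit) pairs to the logins
        # returned by the GraphQL blame query; both are only re-queried for
        # hashes not already cached.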

        logging.debug('build email cache')
        emails_cache = self.session.query(Email)
        emails_cache = [(x.email, x.login) for x in emails_cache]
        self.emails_cache = dict(emails_cache)

        logging.debug('build blame cache')
        blame_cache = self.session.query(Blame).all()
        blame_cache = [x.file_commit for x in blame_cache]
        blame_cache = sorted(set(blame_cache))

        logging.debug('eval module hashes')
        changed = False
        keys = sorted(self.modules.keys())
        for k in keys:
            #logging.debug('eval {}'.format(k))

            if k not in self.files:
                self.committers[k] = {}
                continue

            #logging.debug('last commit {}'.format(k))
            ghash = self.last_commit_for_file(k)

            if ghash in blame_cache:
                continue

            logging.debug('checking hash for {}'.format(k))
            res = self.session.query(Blame).filter_by(file_name=k,
                                                      file_commit=ghash).all()
            hashes = [x.file_commit for x in res]

            if ghash not in hashes:

                logging.debug(
                    'hash {} not found for {}, updating blames'.format(
                        ghash, k))

                scraper_args = ['ansible', 'ansible', 'devel', k]
                uns, emailmap = self.gqlc.get_usernames_from_filename_blame(
                    *scraper_args)

                # check the emails
                for email, login in emailmap.items():
                    if email in self.emails_cache:
                        continue
                    exists = self.session.query(Email).filter_by(
                        email=email).first()
                    if not exists:
                        logging.debug('insert {}:{}'.format(login, email))
                        _email = Email(email=email, login=login)
                        self.session.add(_email)
                        changed = True

                # check the blames
                for login, commits in uns.items():
                    for commit in commits:
                        kwargs = {
                            'file_name': k,
                            'file_commit': ghash,
                            'author_commit': commit,
                            'author_login': login
                        }
                        exists = self.session.query(Blame).filter_by(
                            **kwargs).first()
                        if not exists:
                            logging.debug('insert {}:{}:{}'.format(
                                k, commit, login))
                            _blame = Blame(**kwargs)
                            self.session.add(_blame)
                            changed = True

        if changed:
            self.session.commit()
            logging.debug('re-build email cache')
            emails_cache = self.session.query(Email)
            emails_cache = [(x.email, x.login) for x in emails_cache]
            self.emails_cache = dict(emails_cache)

        # fill in what we can ...
        logging.debug('fill in commit logins')
        for k in keys:
            for idc, commit in enumerate(self.commits[k][:]):
                if commit.get('login'):
                    continue
                login = self.emails_cache.get(commit['email'])
                if not login and '@users.noreply.github.com' in commit['email']:
                    login = commit['email'].split('@')[0]
                    self.emails_cache[commit['email']] = login
                if not login:
                    print('unknown: {}'.format(commit['email']))
                self.commits[k][idc]['login'] = login

    def _get_module_blames(self):
        ''' Scrape the blame page for each module and store it '''

        keys = sorted(self.modules.keys())

        # scrape the data
        #for k,v in self.modules.iteritems():
        for k in keys:

            #v = self.modules[k]
            cpath = os.path.join(self.checkoutdir, k)
            if not os.path.isfile(cpath):
                self.committers[k] = {}
                continue

            #mtime = os.path.getmtime(cpath)
            ghash = self.last_commit_for_file(k)
            pfile = os.path.join(self.scraper_cache,
                                 k.replace('/', '_') + '.blame.pickle')
            sargs = ['ansible', 'ansible', 'devel', k]

            refresh = False
            if not os.path.isfile(pfile):
                refresh = True
            else:
                logging.debug('load {}'.format(pfile))
                with open(pfile, 'rb') as f:
                    pdata = pickle.load(f)
                if pdata[0] == ghash:
                    self.committers[k] = pdata[1]
                    if len(pdata) == 3:
                        # use emailmap if available
                        emailmap = pdata[2]
                    else:
                        emailmap = {}
                else:
                    refresh = True

            if refresh:
                if self.gqlc:
                    logging.debug('graphql blame usernames {}'.format(pfile))
                    uns, emailmap = self.gqlc.get_usernames_from_filename_blame(
                        *sargs)
                else:
                    emailmap = {}  # scraping: emails not available
                    logging.debug('www blame usernames {}'.format(pfile))
                    uns = self.gws.get_usernames_from_filename_blame(*sargs)
                self.committers[k] = uns
                with open(pfile, 'wb') as f:
                    pickle.dump((ghash, uns, emailmap), f)

            for email, github_id in emailmap.items():
                if email not in self.emails_cache:
                    self.emails_cache[email] = github_id

        # add scraped logins to the map
        #for k,v in self.modules.iteritems():
        for k in keys:
            #v = self.modules[k]
            for idx, x in enumerate(self.commits[k]):
                if x['email'] in ['@']:
                    continue
                if x['email'] not in self.emails_cache:
                    self.emails_cache[x['email']] = None
                if x['login']:
                    self.emails_cache[x['email']] = x['login']
                    continue

                xhash = x['hash']
                for ck, cv in self.committers[k].iteritems():
                    if xhash in cv:
                        self.emails_cache[x['email']] = ck
                        break

        # fill in what we can ...
        #for k,v in self.modules.iteritems():
        for k in keys:
            #v = self.modules[k]
            for idx, x in enumerate(self.commits[k]):
                if not x['login']:
                    if x['email'] in ['@']:
                        continue
                    if self.emails_cache[x['email']]:
                        login = self.emails_cache[x['email']]
                        xhash = x['hash']
                        self.commits[k][idx]['login'] = login
                        if login not in self.committers[k]:
                            self.committers[k][login] = []
                        if xhash not in self.committers[k][login]:
                            self.committers[k][login].append(xhash)

    def set_maintainers(self):
        '''Define the maintainers for each module'''
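        # Resolution order: module docstring authors first, then the exact
        # BOTMETA entry for the path, then the longest BOTMETA prefix match,
        # minus anyone listed under 'ignored'.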

        # grep the authors:
        for k, v in self.modules.iteritems():
            if v['filepath'] is None:
                continue
            mfile = os.path.join(self.checkoutdir, v['filepath'])
            authors = self.get_module_authors(mfile)
            self.modules[k]['authors'] = authors

            # authors are maintainers by -default-
            self.modules[k]['maintainers'] += authors
            self.modules[k]['maintainers'] = \
                sorted(set(self.modules[k]['maintainers']))

        metadata = self.botmeta['files'].keys()
        for k, v in self.modules.iteritems():
            if k == 'meta':
                continue

            if k in self.botmeta['files']:
                # There is metadata in .github/BOTMETA.yml for this file
                # copy maintainers_keys
                self.modules[k]['maintainers_keys'] = self.botmeta['files'][k][
                    'maintainers_keys'][:]

                if self.botmeta['files'][k]:
                    maintainers = self.botmeta['files'][k].get(
                        'maintainers', [])
                    for maintainer in maintainers:
                        if maintainer not in self.modules[k]['maintainers']:
                            self.modules[k]['maintainers'].append(maintainer)

                    # remove the people who want to be ignored
                    if 'ignored' in self.botmeta['files'][k]:
                        ignored = self.botmeta['files'][k]['ignored']
                        for x in ignored:
                            if x in self.modules[k]['maintainers']:
                                self.modules[k]['maintainers'].remove(x)

            else:
                # There is no metadata in .github/BOTMETA.yml for this file
                best_match = None
                for mkey in metadata:
                    if v['filepath'].startswith(mkey):
                        if not best_match:
                            best_match = mkey
                            continue
                        if len(mkey) > len(best_match):
                            best_match = mkey
                if best_match:
                    self.modules[k]['maintainers_keys'] = [best_match]
                    for maintainer in self.botmeta['files'][best_match].get(
                            'maintainers', []):
                        if maintainer not in self.modules[k]['maintainers']:
                            self.modules[k]['maintainers'].append(maintainer)

                    # remove the people who want to be ignored
                    for ignored in self.botmeta['files'][best_match].get(
                            'ignored', []):
                        if ignored in self.modules[k]['maintainers']:
                            self.modules[k]['maintainers'].remove(ignored)

            # save a pristine copy so that higher level code can still use it
            self.modules[k]['maintainers'] = sorted(
                set(self.modules[k]['maintainers']))
            self.modules[k]['_maintainers'] = \
                [x for x in self.modules[k]['maintainers']]

        # set the namespace maintainers ...
        for k, v in self.modules.iteritems():
            if 'namespace_maintainers' not in self.modules[k]:
                self.modules[k]['namespace_maintainers'] = []
            if v.get('namespace'):
                ns = v.get('namespace')
                nms = self.get_maintainers_for_namespace(ns)
                self.modules[k]['namespace_maintainers'] = nms

    def split_topics_from_path(self, module_file):
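        # e.g. 'lib/ansible/modules/cloud/amazon/ec2.py' -> topic 'cloud',
        # subtopic 'amazon', fulltopic/namespace 'cloud/amazon'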
        subpath = module_file.replace('lib/ansible/modules/', '')
        path_parts = subpath.split('/')
        topic = path_parts[0]

        if len(path_parts) > 2:
            subtopic = path_parts[1]
            fulltopic = '/'.join(path_parts[0:2])
        else:
            subtopic = None
            fulltopic = path_parts[0]

        tdata = {
            'fulltopic': fulltopic,
            'namespace': fulltopic,
            'topic': topic,
            'subtopic': subtopic
        }

        return tdata

    def get_module_authors(self, module_file):
        """Grep the authors out of the module docstrings"""

        if not os.path.exists(module_file):
            return []

        documentation = ''
        inphase = False

        with open(module_file, 'rb') as f:
            for line in f:
                if 'DOCUMENTATION' in line:
                    inphase = True
                    continue
                if line.strip().endswith("'''") or line.strip().endswith(
                        '"""'):
                    #phase = None
                    break
                if inphase:
                    documentation += line

        if not documentation:
            return []

        # clean out any other yaml besides author to save time
        inphase = False
        author_lines = ''
        doc_lines = documentation.split('\n')
        for idx, x in enumerate(doc_lines):
            if x.startswith('author'):
                #print("START ON %s" % x)
                inphase = True
                #continue
            if inphase and not x.strip().startswith('-') and \
                    not x.strip().startswith('author'):
                #print("BREAK ON %s" % x)
                inphase = False
                break
            if inphase:
                author_lines += x + '\n'

        if not author_lines:
            return []

        ydata = {}
        try:
            ydata = yaml.safe_load(author_lines)
        except Exception as e:
            print(e)
            return []

        # quit early if the yaml was not valid
        if not ydata:
            return []

        # sometimes the field is 'author', sometimes it is 'authors'
        if 'authors' in ydata:
            ydata['author'] = ydata['authors']

        # quit if the key was not found
        if 'author' not in ydata:
            return []

        if not isinstance(ydata['author'], list):
            ydata['author'] = [ydata['author']]

        authors = []
        for author in ydata['author']:
            github_ids = self.extract_github_id(author)
            if github_ids:
                authors.extend(github_ids)
        return authors

    def extract_github_id(self, author):
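        # Author string formats handled below (the last two come from the
        # comments in this method; '@johndoe' is hypothetical):
        #   'Ansible Core Team'                     -> ['ansible']
        #   'John Doe (@johndoe)'                   -> ['johndoe']
        #   'Henrique Rodrigues (github.com/Sodki)' -> ['Sodki']
        #   'Mathieu Bultel (matbu)'                -> ['matbu']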
        authors = set()

        if 'ansible core team' in author.lower():
            authors.add('ansible')
        elif '@' in author:
            # match github ids but not emails
            authors.update(re.findall(r'(?<!\w)@([\w-]+)(?![\w.])', author))
        elif 'github.com/' in author:
            # {'author': 'Henrique Rodrigues (github.com/Sodki)'}
            idx = author.find('github.com/')
            author = author[idx + 11:]
            authors.add(author.replace(')', ''))
        elif '(' in author and len(author.split()) == 3:
            # Mathieu Bultel (matbu)
            idx = author.find('(')
            author = author[idx + 1:]
            authors.add(author.replace(')', ''))

        # search for emails
        for email in re.findall(r'[<(]([^@]+@[^)>]+)[)>]', author):
            github_id = self.emails_cache.get(email)
            if github_id:
                authors.add(github_id)

        return list(authors)

    def fuzzy_match(self, repo=None, title=None, component=None):
        '''Fuzzy matching for modules'''

        # https://github.com/ansible/ansible/issues/18179
        if component and 'validate-modules' in component:
            return None

        # https://github.com/ansible/ansible/issues/20368
        if component and 'module_utils' in component:
            return None

        # authorized_keys vs. authorized_key
        if component and component.endswith('s'):
            tm = self.find_match(component[:-1])
            if tm:
                return tm['name']

        match = None
        known_modules = []

        for k, v in self.modules.iteritems():
            known_modules.append(v['name'])

        title = title.lower()
        title = title.replace(':', '')
        title_matches = [x for x in known_modules if x + ' module' in title]

        if not title_matches:
            title_matches = [
                x for x in known_modules if title.startswith(x + ' ')
            ]
            if not title_matches:
                title_matches = \
                    [x for x in known_modules if ' ' + x + ' ' in title]

        # don't do singular word matching in title for ansible/ansible
        cmatches = None
        if component:
            cmatches = [x for x in known_modules if x in component]
            cmatches = [x for x in cmatches if '_' + x not in component]

            # use title ... ?
            if title_matches:
                cmatches = [x for x in cmatches if x in title_matches]

            if cmatches:
                match = cmatches[0]
                print("module - component matches: %s" % cmatches)

        if not match:
            if len(title_matches) == 1:
                match = title_matches[0]
            else:
                print("module - title matches: %s" % title_matches)

        return match

    def is_multi(self, rawtext):
        '''Is the string a list or a glob of modules?'''
        if rawtext:
            lines = rawtext.split('\n')

            # clean up lines
            lines = [x.strip() for x in lines if x.strip()]
            lines = [x for x in lines if len(x) > 2]

            if len(lines) > 1:
                return True

            if lines:
                if lines[0].strip().endswith('*'):
                    return True

        return False

    # https://github.com/ansible/ansible-modules-core/issues/3831
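    # Input can be a newline-separated list of names, or a glob such as
    # 'ec2_*' (hypothetical), which matches every module whose key contains
    # the prefix.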
    def multi_match(self, rawtext):
        '''Return a list of matches for a given glob or list of names'''
        matches = []
        lines = rawtext.split('\n')
        lines = [x.strip() for x in lines if x.strip()]
        for line in lines:
            # is it an exact name, a path, a globbed name, a globbed path?
            if line.endswith('*'):
                thiskey = line.replace('*', '')
                keymatches = []
                for k in self.modules.keys():
                    if thiskey in k:
                        keymatches.append(k)
                for k in keymatches:
                    matches.append(self.modules[k].copy())
            else:
                match = self.find_match(line)
                if match:
                    matches.append(match)

        # dedupe while preserving order
        tmplist = []
        for x in matches:
            if x not in tmplist:
                tmplist.append(x)
        matches = tmplist

        return matches

    def set_module_metadata(self):
        for k, v in self.modules.iteritems():
            if not v['filepath']:
                continue
            mfile = os.path.join(self.checkoutdir, v['filepath'])
            if not mfile.endswith('.py'):
                # metadata is only the .py files ...
                ext = mfile.split('.')[-1]
                mfile = mfile.replace('.' + ext, '.py', 1)

            self.modules[k]['metadata'].update(self.get_module_metadata(mfile))

    def get_module_metadata(self, module_file):
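        # ANSIBLE_METADATA is a literal dict near the top of each module,
        # e.g. (shape only):
        #   ANSIBLE_METADATA = {'status': ['preview'],
        #                       'supported_by': 'community'}
        # so ast.literal_eval can parse it once the assignment is stripped.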
        meta = {}

        if not os.path.isfile(module_file):
            return meta

        rawmeta = ''
        inphase = False
        with open(module_file, 'rb') as f:
            for line in f:
                if line.startswith('ANSIBLE_METADATA'):
                    inphase = True
                    #continue
                if line.startswith('DOCUMENTATION'):
                    break
                if inphase:
                    rawmeta += line
        rawmeta = rawmeta.replace('ANSIBLE_METADATA =', '', 1)
        rawmeta = rawmeta.strip()
        try:
            meta = ast.literal_eval(rawmeta)
        except SyntaxError:
            pass

        return meta

    def set_module_imports(self):
        for k, v in self.modules.iteritems():
            if not v['filepath']:
                continue
            mfile = os.path.join(self.checkoutdir, v['filepath'])
            self.modules[k]['imports'] = self.get_module_imports(mfile)

    def get_module_imports(self, module_file):

        #import ansible.module_utils.nxos
        #from ansible.module_utils.netcfg import NetworkConfig, dumps
        #from ansible.module_utils.network import NetworkModule

        mimports = []

        if not os.path.isfile(module_file):
            return mimports

        with open(module_file, 'rb') as f:
            for line in f:
                line = line.strip()
                line = line.replace(',', '')
                if line.startswith('import') or \
                        ('import' in line and 'from' in line):
                    lparts = line.split()
                    if line.startswith('import '):
                        mimports.append(lparts[1])
                    elif line.startswith('from '):
                        mpath = lparts[1] + '.'
                        for spath in lparts[3:]:
                            mimports.append(mpath + spath)

        return mimports

    @property
    def all_maintainers(self):
        maintainers = set()
        for path, metadata in self.botmeta['files'].items():
            maintainers.update(metadata.get('maintainers', []))
        return maintainers

    def get_maintainers_for_namespace(self, namespace):
        maintainers = []
        for k, v in self.modules.items():
            if 'namespace' not in v or 'maintainers' not in v:
                continue
            if v['namespace'] == namespace:
                for m in v['maintainers']:
                    if m not in maintainers:
                        maintainers.append(m)
        maintainers = [x for x in maintainers if x.strip()]
        return maintainers

    @staticmethod
    def replace_ansible(maintainers, ansible_members, bots=None):
        '''Replace -ansible- with the -humans- in the org'''
        bots = bots or []
        newlist = []
        for m in maintainers:
            if m != 'ansible':
                newlist.append(m)
            else:
                newlist += ansible_members
        newlist = sorted(set(newlist))
        newlist = [x for x in newlist if x not in bots]
        return newlist
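        # e.g. (hypothetical logins) replace_ansible(['ansible', 'jdoe'],
        # ['alice', 'bob'], bots=['ansibot']) -> ['alice', 'bob', 'jdoe']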

    def get_file_content(self, filepath):
        fpath = os.path.join(self.checkoutdir, filepath)
        if not os.path.isfile(fpath):
            return None
        with open(fpath, 'rb') as f:
            data = f.read()
        return data
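
# A minimal usage sketch for this variant (hypothetical wiring; assumes the
# gh_client object exposes get_usernames_from_filename_blame, as used above):
#
#   gqlc = make_graphql_client(token)        # hypothetical factory
#   indexer = ModuleIndexer(gh_client=gqlc)  # clones/updates the checkout
#   match = indexer.find_match('docker_container')
#   if match:
#       print(match['maintainers'])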
Example #2
class ModuleIndexer(object):

    EMPTY_MODULE = {
        'authors': [],
        'name': None,
        'namespaced_module': None,
        'namespace_maintainers': [],
        'deprecated': False,
        'deprecated_filename': None,
        'dirpath': None,
        'filename': None,
        'filepath': None,
        'fulltopic': None,
        'maintainers': [],
        '_maintainers': [],
        'maintainers_key': None,
        'metadata': {},
        'repo_filename': None,
        'repository': 'ansible',
        'subtopic': None,
        'topic': None,
        'imports': []
    }

    def __init__(self, maintainers=None):
        self.modules = {}
        self.maintainers = maintainers or {}
        self.checkoutdir = '~/.ansibullbot/cache/ansible.modules.checkout'
        self.checkoutdir = os.path.expanduser(self.checkoutdir)
        self.importmap = {}
        self.scraper_cache = '~/.ansibullbot/cache/ansible.modules.scraper'
        self.scraper_cache = os.path.expanduser(self.scraper_cache)
        self.gws = GithubWebScraper(cachedir=self.scraper_cache)

        # committers by module
        self.committers = {}
        # commits by module
        self.commits = {}
        # map of email to github login
        self.emailmap = {}

    def create_checkout(self):
        """checkout ansible"""

        print('# creating checkout for module indexer')

        # cleanup
        if os.path.isdir(self.checkoutdir):
            shutil.rmtree(self.checkoutdir)

        cmd = "git clone http://github.com/ansible/ansible --recursive %s" \
            % self.checkoutdir
        (rc, so, se) = run_command(cmd)
        print(str(so) + str(se))

    def update_checkout(self):
        """rebase + pull + update the checkout"""

        print('# updating checkout for module indexer')
        #success = True

        cmd = "cd %s ; git pull --rebase" % self.checkoutdir
        (rc, so, se) = run_command(cmd)
        print(str(so) + str(se))

        # If rebase failed, recreate the checkout
        if rc != 0:
            self.create_checkout()
            return

        cmd = "cd %s ; git submodule update --recursive" % self.checkoutdir
        (rc, so, se) = run_command(cmd)
        print(str(so) + str(se))

        # if update fails, recreate the checkout
        if rc != 0:
            self.create_checkout()

    def _find_match(self, pattern, exact=False):

        match = None
        for k, v in self.modules.iteritems():
            if v['name'] == pattern:
                match = v
                break
        if not match:
            # search by key ... aka the filepath
            for k, v in self.modules.iteritems():
                if k == pattern:
                    match = v
                    break
        if not match and not exact:
            # search by properties
            for k, v in self.modules.iteritems():
                for subkey in v.keys():
                    if v[subkey] == pattern:
                        match = v
                        break
                if match:
                    break
        return match

    def find_match(self, pattern, exact=False):
        '''Exact module name matching'''
        if not pattern:
            return None

        # https://github.com/ansible/ansible/issues/19755
        if pattern == 'setup':
            pattern = 'system/setup.py'

        # https://github.com/ansible/ansible/issues/18527
        #   docker-container -> docker_container
        if '-' in pattern:
            pattern = pattern.replace('-', '_')

        if 'module_utils' in pattern:
            # https://github.com/ansible/ansible/issues/20368
            return None
        elif '/' in pattern and not self._find_match(pattern, exact=True):
            # https://github.com/ansible/ansible/issues/20520
            if not pattern.startswith('lib/'):
                keys = self.modules.keys()
                for k in keys:
                    if pattern in k:
                        ppy = pattern + '.py'
                        if k.endswith(pattern) or k.endswith(ppy):
                            return self.modules[k]
        elif pattern.endswith('.py'):
            # https://github.com/ansible/ansible/issues/19889
            candidate = self._find_match(pattern, exact=False)
            if candidate and candidate['filename'] == pattern:
                return candidate

        match = self._find_match(pattern, exact=exact)
        if not match and not exact:
            # check for just the basename
            #   2617: ansible-s-extras/network/cloudflare_dns.py
            bname = os.path.basename(pattern)
            match = self._find_match(bname)

            if not match:
                # check for deprecated name
                #   _fireball -> fireball
                match = self._find_match('_' + bname)

        return match

    def is_valid(self, mname):
        match = self.find_match(mname)
        if match:
            return True
        else:
            return False

    def get_repository_for_module(self, mname):
        match = self.find_match(mname)
        if match:
            return match['repository']
        else:
            return None

    def get_ansible_modules(self):
        """Make a list of known modules"""

        # manage the checkout
        if not os.path.isdir(self.checkoutdir):
            self.create_checkout()
        else:
            self.update_checkout()

        #(Epdb) pp module
        #u'wait_for'
        #(Epdb) pp self.module_indexer.is_valid(module)
        #False

        matches = []
        module_dir = os.path.join(self.checkoutdir, 'lib/ansible/modules')
        module_dir = os.path.expanduser(module_dir)
        for root, dirnames, filenames in os.walk(module_dir):
            for filename in filenames:
                if 'lib/ansible/modules' in root and \
                        not filename == '__init__.py' and \
                        (filename.endswith('.py') or filename.endswith('.ps1')):
                    matches.append(os.path.join(root, filename))

        matches = sorted(set(matches))

        # figure out the names
        for match in matches:
            mdict = copy.deepcopy(self.EMPTY_MODULE)

            mdict['filename'] = os.path.basename(match)

            dirpath = os.path.dirname(match)
            dirpath = dirpath.replace(self.checkoutdir + '/', '')
            mdict['dirpath'] = dirpath

            filepath = match.replace(self.checkoutdir + '/', '')
            mdict['filepath'] = filepath

            mdict.update(
                self.split_topics_from_path(filepath)
            )

            mdict['repo_filename'] = mdict['filepath']\
                .replace('lib/ansible/modules/%s/' % mdict['repository'], '')

            # clustering/consul
            mdict['namespaced_module'] = mdict['repo_filename']
            mdict['namespaced_module'] = \
                mdict['namespaced_module'].replace('.py', '')
            mdict['namespaced_module'] = \
                mdict['namespaced_module'].replace('.ps1', '')

            mname = os.path.basename(match)
            mname = mname.replace('.py', '')
            mname = mname.replace('.ps1', '')
            mdict['name'] = mname

            # deprecated modules
            if mname.startswith('_'):
                mdict['deprecated'] = True
                deprecated_filename = \
                    os.path.dirname(mdict['namespaced_module'])
                deprecated_filename = \
                    os.path.join(deprecated_filename, mname[1:] + '.py')
                mdict['deprecated_filename'] = deprecated_filename
            else:
                mdict['deprecated_filename'] = mdict['repo_filename']

            mkey = mdict['filepath']
            self.modules[mkey] = mdict

        # grep the authors:
        for k, v in self.modules.iteritems():
            mfile = os.path.join(self.checkoutdir, v['filepath'])
            authors = self.get_module_authors(mfile)
            self.modules[k]['authors'] = authors

        # meta is a special module
        self.modules['meta'] = copy.deepcopy(self.EMPTY_MODULE)
        self.modules['meta']['name'] = 'meta'
        self.modules['meta']['repo_filename'] = 'meta'

        # custom fixes
        newitems = []
        for k, v in self.modules.iteritems():

            # include* is almost always an ansible/ansible issue
            # https://github.com/ansible/ansibullbot/issues/214
            if k.endswith('/include.py'):
                self.modules[k]['repository'] = 'ansible'
            # https://github.com/ansible/ansibullbot/issues/214
            if k.endswith('/include_vars.py'):
                self.modules[k]['repository'] = 'ansible'
            if k.endswith('/include_role.py'):
                self.modules[k]['repository'] = 'ansible'

            # ansible maintains these
            if 'include' in k:
                self.modules[k]['maintainers'] = ['ansible']

            # deprecated modules are annoying
            if v['name'].startswith('_'):

                dkey = os.path.dirname(v['filepath'])
                dkey = os.path.join(dkey, v['filename'].replace('_', '', 1))
                if dkey not in self.modules:
                    nd = v.copy()
                    nd['name'] = nd['name'].replace('_', '', 1)
                    newitems.append((dkey, nd))

        for ni in newitems:
            self.modules[ni[0]] = ni[1]

        # parse metadata
        self.set_module_metadata()

        # parse imports
        self.set_module_imports()

        # last modified
        self.get_module_commits()

        # parse blame
        self.get_module_blames()

        # depends on metadata now ...
        self.set_maintainers()

        return self.modules

    def get_module_commits(self):
        keys = self.modules.keys()
        keys = sorted(keys)
        for k in keys:
            #v = self.modules[k]
            self.commits[k] = []
            cpath = os.path.join(self.checkoutdir, k)
            if not os.path.isfile(cpath):
                continue

            mtime = os.path.getmtime(cpath)
            refresh = False
            pfile = os.path.join(
                self.scraper_cache,
                k.replace('/', '_') + '.commits.pickle'
            )

            if not os.path.isfile(pfile):
                refresh = True
            else:
                with open(pfile, 'rb') as f:
                    pdata = pickle.load(f)
                if pdata[0] == mtime:
                    self.commits[k] = pdata[1]
                else:
                    refresh = True

            if refresh:
                logging.info('refresh commit cache for %s' % k)
                cmd = 'cd %s; git log --follow %s' % (self.checkoutdir, k)
                (rc, so, se) = run_command(cmd)
                for line in so.split('\n'):
                    if line.startswith('commit '):
                        commit = {
                            'name': None,
                            'email': None,
                            'login': None,
                            'hash': line.split()[-1],
                            'date': None
                        }

                    # Author: Matt Clay <*****@*****.**>
                    if line.startswith('Author: '):
                        line = line.replace('Author: ', '')
                        line = line.replace('<', '')
                        line = line.replace('>', '')
                        lparts = line.split()

                        if '@' in lparts[-1]:
                            commit['email'] = lparts[-1]
                            commit['name'] = ' '.join(lparts[:-1])

                        if commit['email'] and \
                                'noreply.github.com' in commit['email']:
                            commit['login'] = commit['email'].split('@')[0]

                    # Date:   Sat Jan 28 23:28:53 2017 -0800
                    if line.startswith('Date:'):
                        dstr = line.split(':', 1)[1].strip()
                        dstr = ' '.join(dstr.split(' ')[:-1])
                        ds = datetime.datetime.strptime(
                            dstr,
                            '%a %b %d %H:%M:%S %Y'
                        )
                        commit['date'] = ds
                        self.commits[k].append(commit)

                with open(pfile, 'wb') as f:
                    pickle.dump((mtime, self.commits[k]), f)

    def last_commit_for_file(self, filepath):
        # git log --pretty=format:'%H' -1
        # lib/ansible/modules/cloud/amazon/ec2_metric_alarm.py
        cmd = 'cd %s; git log --pretty=format:\'%%H\' -1 %s' % \
            (self.checkoutdir, filepath)
        (rc, so, se) = run_command(cmd)
        #import epdb; epdb.st()
        return so.strip()

    def get_module_blames(self):
        ''' Scrape the blame page for each module and store it '''

        keys = sorted(self.modules.keys())

        # scrape the data
        #for k,v in self.modules.iteritems():
        for k in keys:
            #v = self.modules[k]
            cpath = os.path.join(self.checkoutdir, k)
            if not os.path.isfile(cpath):
                self.committers[k] = {}
                continue

            #mtime = os.path.getmtime(cpath)
            ghash = self.last_commit_for_file(k)
            pfile = os.path.join(
                self.scraper_cache,
                k.replace('/', '_') + '.blame.pickle'
            )
            sargs = ['ansible', 'ansible', 'devel', k]

            refresh = False
            if not os.path.isfile(pfile):
                refresh = True
            else:
                with open(pfile, 'rb') as f:
                    pdata = pickle.load(f)
                if pdata[0] == ghash:
                    self.committers[k] = pdata[1]
                else:
                    refresh = True

            if refresh:
                uns = self.gws.get_usernames_from_filename_blame(*sargs)
                self.committers[k] = uns
                with open(pfile, 'wb') as f:
                    pickle.dump((ghash, uns), f)

        # add scraped logins to the map
        #for k,v in self.modules.iteritems():
        for k in keys:
            #v = self.modules[k]
            for idx, x in enumerate(self.commits[k]):
                if x['email'] in ['@']:
                    continue
                if x['email'] not in self.emailmap:
                    self.emailmap[x['email']] = None
                if x['login']:
                    self.emailmap[x['email']] = x['login']
                    continue

                xhash = x['hash']
                for ck, cv in self.committers[k].iteritems():
                    if xhash in cv:
                        self.emailmap[x['email']] = ck
                        break

        # fill in what we can ...
        #for k,v in self.modules.iteritems():
        for k in keys:
            #v = self.modules[k]
            for idx, x in enumerate(self.commits[k]):
                if not x['login']:
                    if x['email'] in ['@']:
                        continue
                    if self.emailmap[x['email']]:
                        login = self.emailmap[x['email']]
                        xhash = x['hash']
                        self.commits[k][idx]['login'] = login
                        if login not in self.committers[k]:
                            self.committers[k][login] = []
                        if xhash not in self.committers[k][login]:
                            self.committers[k][login].append(xhash)

    def set_maintainers(self):
        '''Define the maintainers for each module'''
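        # In this older variant, `maintainers` is a prebuilt mapping of path
        # fragments to login lists passed into __init__, e.g. (illustrative):
        #   {'cloud/amazon/': ['someuser']}
        # The longest fragment found in a module's filepath wins.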
        mkeys = self.maintainers.keys()
        for k, v in self.modules.iteritems():
            if not v['filepath']:
                continue
            best_match = None
            for mkey in mkeys:
                if mkey in v['filepath']:
                    if not best_match:
                        best_match = mkey
                        continue
                    if len(mkey) > len(best_match):
                        best_match = mkey
            if best_match:
                self.modules[k]['maintainers_key'] = best_match
                self.modules[k]['maintainers'] = self.maintainers[best_match]
            else:
                if v['metadata'].get('supported_by') not in ['community']:
                    self.modules[k]['maintainers_key'] = best_match
                    if v['metadata'].get('supported_by') == 'core':
                        self.modules[k]['maintainers'] = ['ansible']
                    else:
                        # curated? ... what now?
                        pass
            # save a pristine copy so that higher level code can still use it
            self.modules[k]['_maintainers'] = \
                [x for x in self.modules[k]['maintainers']]

        # set the namespace maintainers ...
        for k, v in self.modules.iteritems():
            if 'namespace_maintainers' not in self.modules[k]:
                self.modules[k]['namespace_maintainers'] = []
            if v.get('namespace'):
                ns = v.get('namespace')
                nms = self.get_maintainers_for_namespace(ns)
                self.modules[k]['namespace_maintainers'] = nms

    def split_topics_from_path(self, module_file):
        subpath = module_file.replace('lib/ansible/modules/', '')
        path_parts = subpath.split('/')
        topic = path_parts[0]

        if len(path_parts) > 2:
            subtopic = path_parts[1]
            fulltopic = '/'.join(path_parts[0:2])
        else:
            subtopic = None
            fulltopic = path_parts[0]

        tdata = {
            'fulltopic': fulltopic,
            'namespace': fulltopic,
            'topic': topic,
            'subtopic': subtopic
        }

        return tdata

    def get_module_authors(self, module_file):
        """Grep the authors out of the module docstrings"""

        authors = []
        if not os.path.exists(module_file):
            return authors

        documentation = ''
        inphase = False

        with open(module_file, 'rb') as f:
            for line in f:
                if 'DOCUMENTATION' in line:
                    inphase = True
                    continue
                if line.strip().endswith("'''") or line.strip().endswith('"""'):
                    #phase = None
                    break
                if inphase:
                    documentation += line

        if not documentation:
            return authors

        # clean out any other yaml besides author to save time
        inphase = False
        author_lines = ''
        doc_lines = documentation.split('\n')
        for idx, x in enumerate(doc_lines):
            if x.startswith('author'):
                #print("START ON %s" % x)
                inphase = True
                #continue
            if inphase and not x.strip().startswith('-') and \
                    not x.strip().startswith('author'):
                #print("BREAK ON %s" % x)
                inphase = False
                break
            if inphase:
                author_lines += x + '\n'

        if not author_lines:
            return authors

        ydata = {}
        try:
            ydata = yaml.safe_load(author_lines)
        except Exception as e:
            print(e)
            return authors

        # quit early if the yaml was not valid
        if not ydata:
            return authors

        # sometimes the field is 'author', sometimes it is 'authors'
        if 'authors' in ydata:
            ydata['author'] = ydata['authors']

        # quit if the key was not found
        if 'author' not in ydata:
            return authors

        if type(ydata['author']) != list:
            ydata['author'] = [ydata['author']]

        for author in ydata['author']:
            if 'ansible core team' in author.lower():
                authors.append('ansible')
            elif '@' in author:
                words = author.split()
                for word in words:
                    if '@' in word and '(' in word and ')' in word:
                        if '(' in word:
                            word = word.split('(')[-1]
                        if ')' in word:
                            word = word.split(')')[0]
                        word = word.strip()
                        if word.startswith('@'):
                            word = word.replace('@', '', 1)
                            authors.append(word)
            elif 'github.com/' in author:
                # {'author': 'Henrique Rodrigues (github.com/Sodki)'}
                idx = author.find('github.com/')
                author = author[idx+11:]
                author = author.replace(')', '')
                authors.append(author)
            elif '(' in author and len(author.split()) == 3:
                # Mathieu Bultel (matbu)
                idx = author.find('(')
                author = author[idx+1:]
                author = author.replace(')', '')
                authors.append(author)

        return authors

    def fuzzy_match(self, repo=None, title=None, component=None):
        '''Fuzzy matching for modules'''

        # https://github.com/ansible/ansible/issues/18179
        if 'validate-modules' in component:
            return None

        # https://github.com/ansible/ansible/issues/20368
        if 'module_utils' in component:
            return None

        # authorized_keys vs. authorized_key
        if component and component.endswith('s'):
            tm = self.find_match(component[:-1])
            if tm:
                return tm['name']

        match = None
        known_modules = []

        for k,v in self.modules.iteritems():
            known_modules.append(v['name'])

        title = title.lower()
        title = title.replace(':', '')
        title_matches = [x for x in known_modules if x + ' module' in title]

        if not title_matches:
            title_matches = [x for x in known_modules
                             if title.startswith(x + ' ')]
            if not title_matches:
                title_matches = \
                    [x for x in known_modules if ' ' + x + ' ' in title]

        # don't do singular word matching in title for ansible/ansible
        cmatches = None
        if component:
            cmatches = [x for x in known_modules if x in component]
            cmatches = [x for x in cmatches if not '_' + x in component]

            # use title ... ?
            if title_matches:
                cmatches = [x for x in cmatches if x in title_matches]

            if cmatches:
                match = cmatches[0]
                print("module - component matches: %s" % cmatches)

        if not match:
            if len(title_matches) == 1:
                match = title_matches[0]
            else:
                print("module - title matches: %s" % title_matches)

        return match
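
    # Illustrative only (assumed inputs, not from the original snippet): with a
    # module named 'git' indexed, fuzzy_match(title='git module crashes on
    # fetch', component='git') returns 'git' via the "<name> module" title
    # heuristic, confirmed by the component match.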

    def is_multi(self, rawtext):
        '''Is the string a list or a glob of modules?'''
        if rawtext:
            lines = rawtext.split('\n')

            # clean up lines
            lines = [x.strip() for x in lines if x.strip()]
            lines = [x for x in lines if len(x) > 2]

            if len(lines) > 1:
                return True

            if lines:
                if lines[0].strip().endswith('*'):
                    return True

        return False
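
    # Illustrative only (not part of the original snippet):
    #   is_multi('ec2\nec2_vol')   -> True   (two names listed)
    #   is_multi('cloud/amazon/*') -> True   (glob)
    #   is_multi('ec2')            -> False  (single exact name)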

    # https://github.com/ansible/ansible-modules-core/issues/3831
    def multi_match(self, rawtext):
        '''Return a list of matches for a given glob or list of names'''
        matches = []
        lines = rawtext.split('\n')
        lines = [x.strip() for x in lines if x.strip()]
        for line in lines:
            # is it an exact name, a path, a globbed name, a globbed path?
            if line.endswith('*'):
                thiskey = line.replace('*', '')
                keymatches = []
                for k in self.modules.keys():
                    if thiskey in k:
                        keymatches.append(k)
                for k in keymatches:
                    matches.append(self.modules[k].copy())
            else:
                match = self.find_match(line)
                if match:
                    matches.append(match)

        # unique the list
        tmplist = []
        for x in matches:
            if x not in tmplist:
                tmplist.append(x)
        if matches != tmplist:
            matches = [x for x in tmplist]

        return matches

    def set_module_metadata(self):
        for k,v in self.modules.iteritems():
            if not v['filepath']:
                continue
            mfile = os.path.join(self.checkoutdir, v['filepath'])
            if not mfile.endswith('.py'):
                # metadata is only in the .py files ...
                ext = mfile.split('.')[-1]
                mfile = mfile.replace('.' + ext, '.py', 1)

            self.modules[k]['metadata'].update(self.get_module_metadata(mfile))

    def get_module_metadata(self, module_file):
        meta = {}

        if not os.path.isfile(module_file):
            return meta

        rawmeta = ''
        inphase = False
        with open(module_file, 'rb') as f:
            for line in f:
                if line.startswith('ANSIBLE_METADATA'):
                    inphase = True
                    #continue
                if line.startswith('DOCUMENTATION'):
                    break
                if inphase:
                    rawmeta += line
        rawmeta = rawmeta.replace('ANSIBLE_METADATA =', '', 1)
        rawmeta = rawmeta.strip()
        try:
            meta = ast.literal_eval(rawmeta)
        except SyntaxError:
            pass

        return meta

    def set_module_imports(self):
        for k,v in self.modules.iteritems():
            if not v['filepath']:
                continue
            mfile = os.path.join(self.checkoutdir, v['filepath'])
            self.modules[k]['imports'] = self.get_module_imports(mfile)

    def get_module_imports(self, module_file):

        #import ansible.module_utils.nxos
        #from ansible.module_utils.netcfg import NetworkConfig, dumps
        #from ansible.module_utils.network import NetworkModule

        mimports = []

        with open(module_file, 'rb') as f:
            for line in f:
                line = line.strip()
                line = line.replace(',', '')
                if line.startswith('import') or \
                        ('import' in line and 'from' in line):
                    lparts = line.split()
                    if line.startswith('import '):
                        mimports.append(lparts[1])
                    elif line.startswith('from '):
                        mpath = lparts[1] + '.'
                        for spath in lparts[3:]:
                            mimports.append(mpath + spath)

        return mimports

    @property
    def all_maintainers(self):
        maintainers = []
        for m in self.maintainers.values():
            if not isinstance(m, list):
                m = [m]
            for mi in m:
                if mi not in maintainers:
                    maintainers.append(mi)
        return maintainers

    def get_maintainers_for_namespace(self, namespace):
        maintainers = []
        for k,v in self.modules.items():
            if 'namespace' not in v or 'maintainers' not in v:
                continue
            if v['namespace'] == namespace:
                for m in v['maintainers']:
                    if m not in maintainers:
                        maintainers.append(m)
        maintainers = [x for x in maintainers if x.strip()]
        return maintainers

    @staticmethod
    def replace_ansible(maintainers, ansible_members, bots=[]):
        '''Replace -ansible- with the -humans- in the org'''
        newlist = []
        for m in maintainers:
            if m != 'ansible':
                newlist.append(m)
            else:
                newlist += ansible_members
        newlist = sorted(set(newlist))
        newlist = [x for x in newlist if x not in bots]
        return newlist
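
A minimal usage sketch for the class above (illustrative only, not part of the scraped example; it assumes the ansibullbot imports used by the class are available and that the checkout/cache directories can be created):

indexer = ModuleIndexer(cachedir='~/.ansibullbot/cache')
print(indexer.is_multi('cloud/amazon/*'))   # True: a glob of modules
for mod in indexer.multi_match('cloud/amazon/*'):
    print(mod['name'], mod['maintainers'])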
Example #3
class ModuleIndexer(object):

    EMPTY_MODULE = {
        u'authors': [],
        u'name': None,
        u'namespaced_module': None,
        u'namespace_maintainers': [],
        u'deprecated': False,
        u'deprecated_filename': None,
        u'dirpath': None,
        u'filename': None,
        u'filepath': None,
        u'fulltopic': None,
        u'maintainers': [],
        u'_maintainers': [],
        u'maintainers_keys': None,
        u'metadata': {},
        u'repo_filename': None,
        u'repository': u'ansible',
        u'subtopic': None,
        u'topic': None,
        u'imports': []
    }

    def __init__(self,
                 commits=True,
                 blames=True,
                 botmetafile=None,
                 maintainers=None,
                 gh_client=None,
                 cachedir=u'~/.ansibullbot/cache',
                 gitrepo=None):
        '''
        Maintainers: defaultdict(dict) where keys are filepath and values are dict
        gh_client: GraphQL GitHub client
        '''
        self.get_commits = commits
        self.get_blames = blames
        self.botmetafile = botmetafile
        self.botmeta = {
        }  # BOTMETA.yml file with minor updates (macro rendered, empty default values fixed)
        self.modules = {}  # keys: paths of files belonging to the repository
        self.maintainers = maintainers or {}
        self.importmap = {}
        self.scraper_cache = os.path.join(cachedir, u'ansible.modules.scraper')
        self.scraper_cache = os.path.expanduser(self.scraper_cache)
        self.gws = GithubWebScraper(cachedir=self.scraper_cache)
        self.gqlc = gh_client
        self.files = []

        if gitrepo:
            self.gitrepo = gitrepo
        else:
            self.gitrepo = GitRepoWrapper(
                cachedir=cachedir,
                repo=u'https://github.com/ansible-collections/community.general'
            )

        # sqlalchemy
        unc = os.path.join(cachedir, u'ansible_module_indexer.db')
        unc = os.path.expanduser(unc)
        unc = u'sqlite:///' + unc

        self.engine = create_engine(unc)
        self.Session = sessionmaker(bind=self.engine)
        self.session = self.Session()

        Email.metadata.create_all(self.engine)
        Blame.metadata.create_all(self.engine)

        # committers by module
        self.committers = {}
        # commits by module
        self.commits = {}
        # map of email to github login
        self.emails_cache = {}

        # load the bot meta
        self.update(force=True)

    def update(self, force=False):
        '''Reload everything if there are new commits'''
        changed = self.gitrepo.manage_checkout()
        if changed or force:
            self.get_files()
            self.parse_metadata()

    def get_files(self):
        '''Cache a list of filenames in the checkout'''
        cmd = u'cd {}; git ls-files'.format(self.gitrepo.checkoutdir)
        (rc, so, se) = run_command(cmd)
        files = to_text(so).split(u'\n')
        files = [x.strip() for x in files if x.strip()]
        self.files = files

    def parse_metadata(self):

        if self.botmetafile is not None:
            with open(self.botmetafile, 'rb') as f:
                rdata = f.read()
        else:
            fp = u'.github/BOTMETA.yml'
            rdata = self.get_file_content(fp)
        self.botmeta = BotMetadataParser.parse_yaml(rdata)

        # load the modules
        logging.info(u'loading modules')
        self.get_ansible_modules()

    def _find_match(self, pattern, exact=False):

        logging.debug(u'exact:{} matching on {}'.format(exact, pattern))

        matches = []

        if isinstance(pattern, six.text_type):
            pattern = to_text(to_bytes(pattern, 'ascii', 'ignore'), 'ascii')

        for k, v in six.iteritems(self.modules):
            if v[u'name'] == pattern:
                logging.debug(u'match {} on name: {}'.format(k, v[u'name']))
                matches = [v]
                break

        if not matches:
            # search by key ... aka the filepath
            for k, v in six.iteritems(self.modules):
                if k == pattern:
                    logging.debug(u'match {} on key: {}'.format(k, k))
                    matches = [v]
                    break

        if not matches and not exact:
            # search by properties
            for k, v in six.iteritems(self.modules):
                for subkey in v.keys():
                    if v[subkey] == pattern:
                        logging.debug(u'match {} on subkey: {}'.format(
                            k, subkey))
                        matches.append(v)

        if not matches and not exact:
            # Levenshtein distance should workaround most typos
            distance_map = {}
            for k, v in six.iteritems(self.modules):
                mname = v.get(u'name')
                if not mname:
                    continue
                if isinstance(mname, six.text_type):
                    mname = to_text(to_bytes(mname, 'ascii', 'ignore'),
                                    'ascii')
                try:
                    res = Levenshtein.distance(pattern, mname)
                except TypeError as e:
                    logging.error(e)
                    if C.DEFAULT_BREAKPOINTS:
                        logging.error(u'breakpoint!')
                        import epdb
                        epdb.st()
                    continue
                distance_map[mname] = [res, k]
            res = sorted(distance_map.items(),
                         key=lambda x: x[1],
                         reverse=True)
            if res and len(pattern) > 3 > res[-1][1][0]:
                logging.debug(u'levenshtein ratio match: ({}) {} {}'.format(
                    res[-1][1][1], res[-1][0], pattern))
                matches = [self.modules[res[-1][1][1]]]

        return matches
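
    # Illustrative only (assumed module name, not from the original snippet):
    # with u'docker_container' indexed, the Levenshtein fallback above lets
    # _find_match(u'dokcer_container') (a typo at edit distance 2) resolve to
    # the u'docker_container' entry, since the pattern is longer than 3
    # characters and the best distance is below 3.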

    def find_match(self, pattern, exact=False):
        '''Exact module name matching'''

        logging.debug(u'find_match for "{}"'.format(pattern))

        BLACKLIST = [
            u'module_utils', u'callback', u'network modules',
            u'networking modules',
            u'windows modules'
        ]

        if not pattern:
            return None

        if pattern.lower() == u'core':
            return None
        '''
        if 'docs.ansible.com' in pattern and '_module.html' in pattern:
            # http://docs.ansible.com/ansible/latest/copy_module.html
            # http://docs.ansible.com/ansible/latest/dev_guide/developing_modules.html
            # http://docs.ansible.com/ansible/latest/postgresql_db_module.html
            # [helm module](https//docs.ansible.com/ansible/2.4/helm_module.html)
            # Windows module: win_robocopy\nhttp://docs.ansible.com/ansible/latest/win_robocopy_module.html
            # Examples:\n* archive (https://docs.ansible.com/ansible/archive_module.html)\n* s3_sync (https://docs.ansible.com/ansible/s3_sync_module.html)
            urls = re.findall(
                'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                pattern
            )
            #urls = [x for x in urls if '_module.html' in x]
            #if urls:
            #    import epdb; epdb.st()
            import epdb; epdb.st()
        '''

        # https://github.com/ansible/ansible/issues/19755
        if pattern == u'setup':
            pattern = u'system/setup.py'

        if u'/facts.py' in pattern or u' facts.py' in pattern:
            pattern = u'system/setup.py'

        # https://github.com/ansible/ansible/issues/18527
        #   docker-container -> docker_container
        if u'-' in pattern:
            pattern = pattern.replace(u'-', u'_')

        if u'module_utils' in pattern:
            # https://github.com/ansible/ansible/issues/20368
            return None
        elif u'callback' in pattern:
            return None
        elif u'lookup' in pattern:
            return None
        elif u'contrib' in pattern and u'inventory' in pattern:
            return None
        elif pattern.lower() in BLACKLIST:
            return None
        elif u'/' in pattern and not self._find_match(pattern, exact=True):
            # https://github.com/ansible/ansible/issues/20520
            # FIXME what's this for?
            if not pattern.startswith(u'plugins/'):
                keys = self.modules.keys()
                for k in keys:
                    if pattern in k:
                        ppy = pattern + u'.py'
                        if k.endswith(pattern) or k.endswith(ppy):
                            return self.modules[k]
        elif pattern.endswith(u'.py') and self._find_match(pattern,
                                                           exact=False):
            # https://github.com/ansible/ansible/issues/19889
            candidate = self._find_match(pattern, exact=False)

            if isinstance(candidate, list):
                if len(candidate) == 1:
                    candidate = candidate[0]

            if isinstance(candidate, dict) and \
                    candidate[u'filename'] == pattern:
                return candidate

        match = self._find_match(pattern, exact=exact)
        if not match and not exact:
            # check for just the basename
            #   2617: ansible-s-extras/network/cloudflare_dns.py
            bname = os.path.basename(pattern)
            match = self._find_match(bname)

            if not match:
                # check for deprecated name
                #   _fireball -> fireball
                match = self._find_match(u'_' + bname)

        # unique the results
        if isinstance(match, list) and len(match) > 1:
            _match = []
            for m in match:
                if m not in _match:
                    _match.append(m)
            match = _match[:]

        return match

    def is_valid(self, mname):
        match = self.find_match(mname, exact=True)
        if match:
            return True
        else:
            return False

    def get_repository_for_module(self, mname):
        match = self.find_match(mname, exact=True)
        if match:
            return match[u'repository']
        else:
            return None

    def get_ansible_modules(self):
        """Make a list of known modules"""

        matches = []
        module_dir = os.path.join(self.gitrepo.checkoutdir, u'plugins/modules')
        module_dir = os.path.expanduser(module_dir)
        for root, _, filenames in os.walk(module_dir):
            for filename in filenames:
                if u'plugins/modules' in root and not filename == u'__init__.py':
                    matches.append(os.path.join(root, filename))

        matches = sorted(set(matches))

        self.populate_modules(matches)

        # custom fixes
        newitems = []
        for k, v in six.iteritems(self.modules):

            # include* is almost always an ansible/ansible issue
            # https://github.com/ansible/ansibullbot/issues/214
            if k.endswith(u'/include.py'):
                self.modules[k][u'repository'] = u'ansible'
            # https://github.com/ansible/ansibullbot/issues/214
            if k.endswith(u'/include_vars.py'):
                self.modules[k][u'repository'] = u'ansible'
            if k.endswith(u'/include_role.py'):
                self.modules[k][u'repository'] = u'ansible'

            # ansible maintains these
            if u'include' in k:
                self.modules[k][u'maintainers'] = [u'ansible']

            # deprecated modules are annoying
            if v[u'name'].startswith(u'_'):

                dkey = os.path.dirname(v[u'filepath'])
                dkey = os.path.join(dkey, v[u'filename'].replace(u'_', u'', 1))
                if dkey not in self.modules:
                    nd = v.copy()
                    nd[u'name'] = nd[u'name'].replace(u'_', u'', 1)
                    newitems.append((dkey, nd))

        for ni in newitems:
            self.modules[ni[0]] = ni[1]

        # parse metadata
        logging.debug(u'set module metadata')
        self.set_module_metadata()

        # parse imports
        logging.debug(u'set module imports')
        self.set_module_imports()

        # last modified
        if self.get_commits:
            logging.debug(u'set module commits')
            self.get_module_commits()

        # parse blame
        if self.get_blames and self.get_commits:
            logging.debug(u'set module blames')
            self.get_module_blames()

        # depends on metadata now ...
        logging.debug(u'set module maintainers')
        self.set_maintainers()

        return self.modules

    def populate_modules(self, matches):
        # figure out the names
        for match in matches:
            mdict = copy.deepcopy(self.EMPTY_MODULE)

            mdict[u'filename'] = os.path.basename(match)

            dirpath = os.path.dirname(match)
            dirpath = dirpath.replace(self.gitrepo.checkoutdir + u'/', u'')
            mdict[u'dirpath'] = dirpath

            filepath = match.replace(self.gitrepo.checkoutdir + u'/', u'')
            mdict[u'filepath'] = filepath

            mdict.update(self.split_topics_from_path(filepath))

            mdict[u'repo_filename'] = mdict[u'filepath']\
                .replace(u'plugins/modules/%s/' % mdict[u'repository'], u'')

            # clustering/consul
            mdict[u'namespaced_module'] = mdict[u'repo_filename']
            mdict[u'namespaced_module'] = \
                mdict[u'namespaced_module'].replace(u'.py', u'')
            mdict[u'namespaced_module'] = \
                mdict[u'namespaced_module'].replace(u'.ps1', u'')

            mname = os.path.basename(match)
            mname = mname.replace(u'.py', u'')
            mname = mname.replace(u'.ps1', u'')
            mdict[u'name'] = mname

            # deprecated modules
            if mname.startswith(u'_'):
                mdict[u'deprecated'] = True
                deprecated_filename = \
                    os.path.dirname(mdict[u'namespaced_module'])
                deprecated_filename = \
                    os.path.join(deprecated_filename, mname[1:] + u'.py')
                mdict[u'deprecated_filename'] = deprecated_filename
            else:
                mdict[u'deprecated_filename'] = mdict[u'repo_filename']

            self.modules[filepath] = mdict

        # meta is a special module
        self.modules[u'meta'] = copy.deepcopy(self.EMPTY_MODULE)
        self.modules[u'meta'][u'name'] = u'meta'
        self.modules[u'meta'][u'repo_filename'] = u'meta'

    def get_module_commits(self):
        keys = self.modules.keys()
        keys = sorted(keys)
        for k in keys:
            self.commits[k] = []
            cpath = os.path.join(self.gitrepo.checkoutdir, k)
            if not os.path.isfile(cpath):
                continue

            mtime = os.path.getmtime(cpath)
            refresh = False
            pfile = os.path.join(self.scraper_cache,
                                 k.replace(u'/', u'_') + u'.commits.pickle')

            if not os.path.isfile(pfile):
                refresh = True
            else:
                pickle_kwargs = {'encoding': 'bytes'} if six.PY3 else {}
                logging.debug(pfile)
                with open(pfile, 'rb') as f:
                    pdata = pickle_load(f, **pickle_kwargs)
                if pdata[0] == mtime:
                    self.commits[k] = pdata[1]
                else:
                    refresh = True

            if refresh:
                logging.info(u'refresh commit cache for %s' % k)
                cmd = u'cd %s; git log --follow %s' % (
                    self.gitrepo.checkoutdir, k)
                (rc, so, se) = run_command(cmd)
                for line in to_text(so).split(u'\n'):
                    if line.startswith(u'commit '):
                        commit = {
                            u'name': None,
                            u'email': None,
                            u'login': None,
                            u'hash': line.split()[-1],
                            u'date': None
                        }

                    # Author: Matt Clay <*****@*****.**>
                    if line.startswith(u'Author: '):
                        line = line.replace(u'Author: ', u'')
                        line = line.replace(u'<', u'')
                        line = line.replace(u'>', u'')
                        lparts = line.split()

                        if u'@' in lparts[-1]:
                            commit[u'email'] = lparts[-1]
                            commit[u'name'] = u' '.join(lparts[:-1])
                        else:
                            pass

                        if commit[u'email'] and \
                                u'noreply.github.com' in commit[u'email']:
                            commit[u'login'] = commit[u'email'].split(u'@')[0]

                    # Date:   Sat Jan 28 23:28:53 2017 -0800
                    if line.startswith(u'Date:'):
                        dstr = line.split(u':', 1)[1].strip()
                        dstr = u' '.join(dstr.split(u' ')[:-1])
                        ds = datetime.datetime.strptime(
                            to_text(dstr), u'%a %b %d %H:%M:%S %Y')
                        commit[u'date'] = ds
                        self.commits[k].append(commit)

                with open(pfile, 'wb') as f:
                    pickle_dump((mtime, self.commits[k]), f)

    def last_commit_for_file(self, filepath):
        if self.commits.get(filepath) and u'hash' in self.commits[filepath][0]:
            return self.commits[filepath][0][u'hash']

        # git log --pretty=format:'%H' -1
        # lib/ansible/modules/cloud/amazon/ec2_metric_alarm.py
        cmd = u'cd %s; git log --pretty=format:\'%%H\' -1 %s' % \
            (self.gitrepo.checkoutdir, filepath)
        (rc, so, se) = run_command(cmd)
        return to_text(so).strip()

    def get_module_blames(self):

        logging.debug(u'build email cache')
        emails_cache = self.session.query(Email)
        emails_cache = [(x.email, x.login) for x in emails_cache]
        self.emails_cache = dict(emails_cache)

        logging.debug(u'build blame cache')
        blame_cache = self.session.query(Blame).all()
        blame_cache = [x.file_commit for x in blame_cache]
        blame_cache = sorted(set(blame_cache))

        logging.debug(u'eval module hashes')
        changed = False
        keys = sorted(self.modules.keys())
        for k in keys:
            if k not in self.files:
                self.committers[k] = {}
                continue

            ghash = self.last_commit_for_file(k)

            if ghash in blame_cache:
                continue

            logging.debug(u'checking hash for {}'.format(k))
            res = self.session.query(Blame).filter_by(file_name=k,
                                                      file_commit=ghash).all()
            hashes = [x.file_commit for x in res]

            if ghash not in hashes:

                logging.debug(
                    u'hash {} not found for {}, updating blames'.format(
                        ghash, k))

                scraper_args = [u'ansible', u'ansible', u'devel', k]
                uns, emailmap = self.gqlc.get_usernames_from_filename_blame(
                    *scraper_args)

                # check the emails
                for email, login in emailmap.items():
                    if email in self.emails_cache:
                        continue
                    exists = self.session.query(Email).filter_by(
                        email=email).first()
                    if not exists:
                        logging.debug(u'insert {}:{}'.format(login, email))
                        _email = Email(email=email, login=login)
                        self.session.add(_email)
                        changed = True

                # check the blames
                for login, commits in uns.items():
                    for commit in commits:
                        kwargs = {
                            u'file_name': k,
                            u'file_commit': ghash,
                            u'author_commit': commit,
                            u'author_login': login
                        }
                        exists = self.session.query(Blame).filter_by(
                            **kwargs).first()
                        if not exists:
                            logging.debug(u'insert {}:{}:{}'.format(
                                k, commit, login))
                            _blame = Blame(**kwargs)
                            self.session.add(_blame)
                            changed = True

        if changed:
            self.session.commit()
            logging.debug(u're-build email cache')
            emails_cache = self.session.query(Email)
            emails_cache = [(x.email, x.login) for x in emails_cache]
            self.emails_cache = dict(emails_cache)

        # fill in what we can ...
        logging.debug(u'fill in commit logins')
        for k in keys:
            for idc, commit in enumerate(self.commits[k][:]):
                if commit.get(u'login'):
                    continue
                if not commit.get(u'email'):
                    continue
                login = self.emails_cache.get(commit[u'email'])
                if not login and u'@users.noreply.github.com' in commit[
                        u'email']:
                    login = commit[u'email'].split(u'@')[0]
                    self.emails_cache[commit[u'email']] = login
                if not login:
                    print(u'unknown: {}'.format(commit[u'email']))
                self.commits[k][idc][u'login'] = login

    def get_emails_by_login(self, login):
        res = self.session.query(Email).filter_by(login=login)
        emails = [x.email for x in res]
        return emails

    def _get_module_blames(self):
        ''' Scrape the blame page for each module and store it '''

        keys = sorted(self.modules.keys())

        # scrape the data
        for k in keys:

            cpath = os.path.join(self.gitrepo.checkoutdir, k)
            if not os.path.isfile(cpath):
                self.committers[k] = {}
                continue

            ghash = self.last_commit_for_file(k)
            pfile = os.path.join(self.scraper_cache,
                                 k.replace(u'/', u'_') + u'.blame.pickle')
            sargs = [u'ansible', u'ansible', u'devel', k]

            refresh = False
            if not os.path.isfile(pfile):
                refresh = True
            else:
                logging.debug(u'load {}'.format(pfile))
                with open(pfile, 'rb') as f:
                    pdata = pickle_load(f)
                if C.DEFAULT_BREAKPOINTS:
                    logging.error(u'breakpoint!')
                    import epdb
                    epdb.st()
                if pdata[0] == ghash:
                    self.committers[k] = pdata[1]
                    if len(pdata) == 3:
                        # use emailmap if available
                        emailmap = pdata[2]
                    else:
                        emailmap = {}
                else:
                    refresh = True

            if refresh:
                if self.gqlc:
                    logging.debug(u'graphql blame usernames {}'.format(pfile))
                    uns, emailmap = self.gqlc.get_usernames_from_filename_blame(
                        *sargs)
                else:
                    emailmap = {}  # scraping: emails not available
                    logging.debug(u'www blame usernames {}'.format(pfile))
                    uns = self.gws.get_usernames_from_filename_blame(*sargs)
                self.committers[k] = uns
                with open(pfile, 'wb') as f:
                    pickle_dump((ghash, uns, emailmap), f)

            for email, github_id in emailmap.items():
                if email not in self.emails_cache:
                    self.emails_cache[email] = github_id

        # add scraped logins to the map
        for k in keys:
            for idx, x in enumerate(self.commits[k]):
                if x[u'email'] in [u'@']:
                    continue
                if x[u'email'] not in self.emails_cache:
                    self.emails_cache[x[u'email']] = None
                if x[u'login']:
                    self.emails_cache[x[u'email']] = x[u'login']
                    continue

                xhash = x[u'hash']
                for ck, cv in six.iteritems(self.committers[k]):
                    if xhash in cv:
                        self.emails_cache[x[u'email']] = ck
                        break

        # fill in what we can ...
        for k in keys:
            for idx, x in enumerate(self.commits[k]):
                if not x[u'login']:
                    if x[u'email'] in [u'@']:
                        continue
                    if self.emails_cache[x[u'email']]:
                        login = self.emails_cache[x[u'email']]
                        xhash = x[u'hash']
                        self.commits[k][idx][u'login'] = login
                        if login not in self.committers[k]:
                            self.committers[k][login] = []
                        if xhash not in self.committers[k][login]:
                            self.committers[k][login].append(xhash)

    def set_maintainers(self):
        '''Define the maintainers for each module'''

        # grep the authors:
        for k, v in six.iteritems(self.modules):
            if v[u'filepath'] is None:
                continue
            mfile = os.path.join(self.gitrepo.checkoutdir, v[u'filepath'])
            authors = self.get_module_authors(mfile)
            self.modules[k][u'authors'] = authors

            # authors are maintainers by -default-
            self.modules[k][u'maintainers'] += authors
            self.modules[k][u'maintainers'] = \
                sorted(set(self.modules[k][u'maintainers']))

        metadata = self.botmeta[u'files'].keys()
        for k, v in six.iteritems(self.modules):
            if k == u'meta':
                continue

            if k in self.botmeta[u'files']:
                # There is metadata in .github/BOTMETA.yml for this file
                # copy maintainers_keys
                self.modules[k][u'maintainers_keys'] = self.botmeta[u'files'][
                    k][u'maintainers_keys'][:]

                if self.botmeta[u'files'][k]:
                    maintainers = self.botmeta[u'files'][k].get(
                        u'maintainers', [])

                    for maintainer in maintainers:
                        if maintainer not in self.modules[k][u'maintainers']:
                            self.modules[k][u'maintainers'].append(maintainer)

                    # remove the people who want to be ignored
                    if u'ignored' in self.botmeta[u'files'][k]:
                        ignored = self.botmeta[u'files'][k][u'ignored']
                        for x in ignored:
                            if x in self.modules[k][u'maintainers']:
                                self.modules[k][u'maintainers'].remove(x)

            else:
                # There isn't metadata in .github/BOTMETA.yml for this file
                best_match = None
                for mkey in metadata:
                    if v[u'filepath'].startswith(mkey):
                        if not best_match:
                            best_match = mkey
                            continue
                        if len(mkey) > len(best_match):
                            best_match = mkey
                if best_match:
                    self.modules[k][u'maintainers_keys'] = [best_match]
                    for maintainer in self.botmeta[u'files'][best_match].get(
                            u'maintainers', []):
                        if maintainer not in self.modules[k][u'maintainers']:
                            self.modules[k][u'maintainers'].append(maintainer)

                    # remove the people who want to be ignored
                    for ignored in self.botmeta[u'files'][best_match].get(
                            u'ignored', []):
                        if ignored in self.modules[k][u'maintainers']:
                            self.modules[k][u'maintainers'].remove(ignored)

            # save a pristine copy so that higher level code can still use it
            self.modules[k][u'maintainers'] = sorted(
                set(self.modules[k][u'maintainers']))
            self.modules[k][u'_maintainers'] = \
                [x for x in self.modules[k][u'maintainers']]

        # set the namespace maintainers ...
        for k, v in six.iteritems(self.modules):
            if u'namespace_maintainers' not in self.modules[k]:
                self.modules[k][u'namespace_maintainers'] = []
            if v.get(u'namespace'):
                ns = v.get(u'namespace')
                nms = self.get_maintainers_for_namespace(ns)
                self.modules[k][u'namespace_maintainers'] = nms

    def split_topics_from_path(self, module_file):
        subpath = module_file.replace(u'plugins/modules/', u'')
        path_parts = subpath.split(u'/')
        topic = path_parts[0]

        if len(path_parts) > 2:
            subtopic = path_parts[1]
            fulltopic = u'/'.join(path_parts[0:2])
        else:
            subtopic = None
            fulltopic = path_parts[0]

        tdata = {
            u'fulltopic': fulltopic,
            u'namespace': fulltopic,
            u'topic': topic,
            u'subtopic': subtopic
        }

        return tdata
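
    # Illustrative only (not part of the original snippet): for a path such as
    # u'plugins/modules/cloud/misc/proxmox.py' the method above returns
    # {u'fulltopic': u'cloud/misc', u'namespace': u'cloud/misc',
    #  u'topic': u'cloud', u'subtopic': u'misc'}.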

    def get_module_authors(self, module_file):
        """Grep the authors out of the module docstrings"""

        if not os.path.exists(module_file):
            return []

        documentation = b''
        inphase = False

        with io.open(module_file, 'rb') as f:
            for line in f:
                if b'DOCUMENTATION' in line:
                    inphase = True
                    continue
                if line.strip().endswith((b"'''", b'"""')):
                    break
                if inphase:
                    documentation += line

        if not documentation:
            return []

        # clean out any other yaml besides author to save time
        inphase = False
        author_lines = u''
        doc_lines = to_text(documentation).split(u'\n')
        for idx, x in enumerate(doc_lines):
            if x.startswith(u'author'):
                inphase = True
            if inphase and not x.strip().startswith((u'-', u'author')):
                inphase = False
                break
            if inphase:
                author_lines += x + u'\n'

        if not author_lines:
            return []

        ydata = {}
        try:
            ydata = yaml.load(author_lines, BotYAMLLoader)
        except Exception as e:
            print(e)
            return []

        # quit early if the yaml was not valid
        if not ydata:
            return []

        # quit if the key was not found
        if u'author' not in ydata:
            return []

        if not isinstance(ydata[u'author'], list):
            ydata[u'author'] = [ydata[u'author']]

        authors = []
        for author in ydata[u'author']:
            github_ids = self.extract_github_id(author)
            if github_ids:
                authors.extend(github_ids)
        return authors

    def extract_github_id(self, author):
        authors = set()

        if author is None:
            return []
        if u'ansible core team' in author.lower():
            authors.add(u'ansible')
        elif u'@' in author:
            # match github ids but not emails
            authors.update(re.findall(r'(?<!\w)@([\w-]+)(?![\w.])', author))
        elif u'github.com/' in author:
            # {'author': 'Henrique Rodrigues (github.com/Sodki)'}
            idx = author.find(u'github.com/')
            author = author[idx + 11:]
            authors.add(author.replace(u')', u''))
        elif u'(' in author and len(author.split()) == 3:
            # Mathieu Bultel (matbu)
            idx = author.find(u'(')
            author = author[idx + 1:]
            authors.add(author.replace(u')', u''))

        # search for emails
        for email in re.findall(r'[<(]([^@]+@[^)>]+)[)>]', author):
            github_id = self.emails_cache.get(email)
            if github_id:
                authors.add(github_id)

        return list(authors)
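
    # Illustrative only (not part of the original snippet):
    #   extract_github_id(u'John Doe (@jdoe)')                      -> [u'jdoe']
    #   extract_github_id(u'Henrique Rodrigues (github.com/Sodki)') -> [u'Sodki']
    #   extract_github_id(u'Ansible Core Team')                     -> [u'ansible']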

    def fuzzy_match(self, repo=None, title=None, component=None):
        '''Fuzzy matching for modules'''

        logging.debug(u'fuzzy match {}'.format(
            to_text(to_bytes(component, 'ascii', 'ignore'), 'ascii')))

        if component.lower() == u'core':
            return None

        # https://github.com/ansible/ansible/issues/18179
        if u'validate-modules' in component:
            return None

        # https://github.com/ansible/ansible/issues/20368
        if u'module_utils' in component:
            return None

        if u'new module' in component:
            return None

        # authorized_keys vs. authorized_key
        if component and component.endswith(u's'):
            tm = self.find_match(component[:-1])
            if tm:
                if not isinstance(tm, list):
                    return tm[u'name']
                elif len(tm) == 1:
                    return tm[0][u'name']
                else:
                    if C.DEFAULT_BREAKPOINTS:
                        logging.error(u'breakpoint!')
                        import epdb
                        epdb.st()

        match = None
        known_modules = []

        for k, v in six.iteritems(self.modules):
            if v[u'name'] in [u'include']:
                continue
            known_modules.append(v[u'name'])

        title = title.lower()
        title = title.replace(u':', u'')
        title_matches = [x for x in known_modules if x + u' module' in title]

        if not title_matches:
            title_matches = [
                x for x in known_modules if title.startswith(x + u' ')
            ]
            if not title_matches:
                title_matches = \
                    [x for x in known_modules if u' ' + x + u' ' in title]

            if title_matches:
                title_matches = [x for x in title_matches if x != u'at']

        # don't do singular word matching in title for ansible/ansible
        cmatches = None
        if component:
            cmatches = [x for x in known_modules if x in component]
            cmatches = [x for x in cmatches if not u'_' + x in component]

        # globs
        if not cmatches and u'*' in component:
            fmatches = [
                x for x in known_modules if fnmatch.fnmatch(x, component)
            ]
            if fmatches:
                cmatches = fmatches[:]

        if title_matches and cmatches:
            # use title ... ?
            cmatches = [
                x for x in cmatches if x in title_matches and x not in [u'at']
            ]

        if cmatches:
            if u'*' not in component and u'modules' not in component:
                match = cmatches[0]
            else:
                match = cmatches[:]
            logging.debug("module - component matches: %s" % cmatches)

        if not match:
            if len(title_matches) == 1:
                match = title_matches[0]
            else:
                logging.debug("module - title matches: %s" % title_matches)

        return match

    def is_multi(self, rawtext):
        '''Is the string a list or a glob of modules?'''
        if rawtext:
            lines = rawtext.split(u'\n')

            # clean up lines
            lines = [x.strip() for x in lines if x.strip()]
            lines = [x for x in lines if len(x) > 2]

            if len(lines) > 1:
                return True

            if lines:
                if lines[0].strip().endswith(u'*'):
                    return True

        return False

    # https://github.com/ansible/ansible-modules-core/issues/3831
    def multi_match(self, rawtext):
        '''Return a list of matches for a given glob or list of names'''
        matches = []
        lines = rawtext.split(u'\n')
        lines = [x.strip() for x in lines if x.strip()]
        for line in lines:
            # is it an exact name, a path, a globbed name, a globbed path?
            if line.endswith(u'*'):
                thiskey = line.replace(u'*', u'')
                keymatches = []
                for k in self.modules.keys():
                    if thiskey in k:
                        keymatches.append(k)
                for k in keymatches:
                    matches.append(self.modules[k].copy())
            else:
                match = self.find_match(line)
                if match:
                    matches.append(match)

        # unique the list
        tmplist = []
        for x in matches:
            if x not in tmplist:
                tmplist.append(x)
        if matches != tmplist:
            matches = [x for x in tmplist]

        return matches

    def set_module_metadata(self):
        for k, v in six.iteritems(self.modules):
            if not v[u'filepath']:
                continue
            mfile = os.path.join(self.gitrepo.checkoutdir, v[u'filepath'])
            if not mfile.endswith(u'.py'):
                # metadata is only in the .py files ...
                ext = mfile.split(u'.')[-1]
                mfile = mfile.replace(u'.' + ext, u'.py', 1)

            self.modules[k][u'metadata'].update(
                self.get_module_metadata(mfile))

    def get_module_metadata(self, module_file):
        meta = {}

        if not os.path.isfile(module_file):
            return meta

        rawmeta = u''
        inphase = False
        with io.open(module_file, 'r', encoding='utf-8') as f:
            for line in f:
                if line.startswith(u'ANSIBLE_METADATA'):
                    inphase = True
                if line.startswith(u'DOCUMENTATION'):
                    break
                if inphase:
                    rawmeta += line
        rawmeta = rawmeta.replace(u'ANSIBLE_METADATA =', u'', 1)
        rawmeta = rawmeta.strip()
        try:
            meta = ast.literal_eval(rawmeta)
            tmp_meta = {}
            for k, v in meta.items():
                if isinstance(k, six.binary_type):
                    k = to_text(k)
                if isinstance(v, six.binary_type):
                    v = to_text(v)
                if isinstance(v, list):
                    tmp_list = []
                    for i in v:
                        if isinstance(i, six.binary_type):
                            i = to_text(i)
                        tmp_list.append(i)
                    v = tmp_list
                    del tmp_list
                tmp_meta[k] = v
            meta = tmp_meta
            del tmp_meta
        except SyntaxError:
            pass

        return meta

    def set_module_imports(self):
        for k, v in six.iteritems(self.modules):
            if not v[u'filepath']:
                continue
            mfile = os.path.join(self.gitrepo.checkoutdir, v[u'filepath'])
            self.modules[k][u'imports'] = self.get_module_imports(mfile)

    def get_module_imports(self, module_file):
        mimports = []

        if not os.path.isfile(module_file):
            return mimports

        with open(module_file, 'rb') as f:
            for line in f:
                line = line.strip()
                line = line.replace(b',', b'')
                if line.startswith(b'import') or \
                        (b'import' in line and b'from' in line):
                    lparts = line.split()
                    if line.startswith(b'import '):
                        mimports.append(lparts[1])
                    elif line.startswith(b'from '):
                        mpath = lparts[1] + b'.'
                        for spath in lparts[3:]:
                            mimports.append(mpath + spath)

        return [to_text(m) for m in mimports]

    @property
    def all_maintainers(self):
        maintainers = set()
        for path, metadata in self.botmeta[u'files'].items():
            maintainers.update(metadata.get(u'maintainers', []))
        return maintainers

    @property
    def all_authors(self):
        authors = set()
        for key, metadata in self.modules.items():
            authors.update(metadata.get(u'authors', []))
        return authors

    def get_maintainers_for_namespace(self, namespace):
        maintainers = []
        for k, v in self.modules.items():
            if u'namespace' not in v or u'maintainers' not in v:
                continue
            if v[u'namespace'] == namespace:
                for m in v[u'maintainers']:
                    if m not in maintainers:
                        maintainers.append(m)
        maintainers = [x for x in maintainers if x.strip()]
        return maintainers

    @staticmethod
    def replace_ansible(maintainers, ansible_members, bots=[]):
        '''Replace -ansible- with the -humans- in the org'''
        newlist = []
        for m in maintainers:
            if m != u'ansible':
                newlist.append(m)
            else:
                newlist += ansible_members
        newlist = sorted(set(newlist))
        newlist = [x for x in newlist if x not in bots]
        return newlist

    def get_file_content(self, filepath):
        fpath = os.path.join(self.gitrepo.checkoutdir, filepath)
        if not os.path.isfile(fpath):
            return None
        with io.open(fpath, 'r', encoding='utf-8') as f:
            data = f.read()
        return data
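
A minimal usage sketch for the class above (illustrative only, not part of the scraped example; it assumes the ansibullbot imports are available and skips the expensive commit/blame scraping):

indexer = ModuleIndexer(commits=False, blames=False,
                        cachedir=u'~/.ansibullbot/cache')
print(indexer.is_valid(u'proxmox'))         # True if such a module is indexed
print(sorted(indexer.all_maintainers)[:5])  # a few maintainers from BOTMETA
print(sorted(indexer.all_authors)[:5])      # a few authors from module docs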
Example #4
class ModuleIndexer(object):

    EMPTY_MODULE = {
        u'authors': [],
        u'name': None,
        u'namespaced_module': None,
        u'namespace_maintainers': [],
        u'deprecated': False,
        u'deprecated_filename': None,
        u'dirpath': None,
        u'filename': None,
        u'filepath': None,
        u'fulltopic': None,
        u'maintainers': [],
        u'_maintainers': [],
        u'maintainers_keys': None,
        u'metadata': {},
        u'repo_filename': None,
        u'repository': u'ansible',
        u'subtopic': None,
        u'topic': None,
        u'imports': []
    }

    def __init__(self, commits=True, blames=True, botmetafile=None, maintainers=None, gh_client=None, cachedir=u'~/.ansibullbot/cache', gitrepo=None):
        '''
        Maintainers: defaultdict(dict) where keys are filepath and values are dict
        gh_client: GraphQL GitHub client
        '''
        self.get_commits = commits
        self.get_blames = blames
        self.botmetafile = botmetafile
        self.botmeta = {}  # BOTMETA.yml file with minor updates (macro rendered, empty default values fixed)
        self.modules = {}  # keys: paths of files belonging to the repository
        self.maintainers = maintainers or {}
        self.importmap = {}
        self.scraper_cache = os.path.join(cachedir, u'ansible.modules.scraper')
        self.scraper_cache = os.path.expanduser(self.scraper_cache)
        self.gws = GithubWebScraper(cachedir=self.scraper_cache)
        self.gqlc = gh_client
        self.files = []

        if gitrepo:
            self.gitrepo = gitrepo
        else:
            self.gitrepo = GitRepoWrapper(cachedir=cachedir, repo=u'https://github.com/ansible/ansible')

        # sqlalchemy
        unc = os.path.join(cachedir, u'ansible_module_indexer.db')
        unc = os.path.expanduser(unc)
        unc = u'sqlite:///' + unc

        self.engine = create_engine(unc)
        self.Session = sessionmaker(bind=self.engine)
        self.session = self.Session()

        Email.metadata.create_all(self.engine)
        Blame.metadata.create_all(self.engine)

        # committers by module
        self.committers = {}
        # commits by module
        self.commits = {}
        # map of email to github login
        self.emails_cache = {}

        # load the bot meta
        self.update(force=True)

    def update(self, force=False):
        '''Reload everything if there are new commits'''
        changed = self.gitrepo.manage_checkout()
        if changed or force:
            self.get_files()
            self.parse_metadata()

    def get_files(self):
        '''Cache a list of filenames in the checkout'''
        cmd = u'cd {}; git ls-files'.format(self.gitrepo.checkoutdir)
        (rc, so, se) = run_command(cmd)
        files = to_text(so).split(u'\n')
        files = [x.strip() for x in files if x.strip()]
        self.files = files

    def parse_metadata(self):

        if self.botmetafile is not None:
            with open(self.botmetafile, 'rb') as f:
                rdata = f.read()
        else:
            fp = u'.github/BOTMETA.yml'
            rdata = self.get_file_content(fp)
        self.botmeta = BotMetadataParser.parse_yaml(rdata)

        # load the modules
        logging.info(u'loading modules')
        self.get_ansible_modules()

    def _find_match(self, pattern, exact=False):

        logging.debug(u'exact:{} matching on {}'.format(exact, pattern))

        matches = []

        if isinstance(pattern, six.text_type):
            pattern = to_text(to_bytes(pattern, 'ascii', 'ignore'), 'ascii')

        for k, v in six.iteritems(self.modules):
            if v[u'name'] == pattern:
                logging.debug(u'match {} on name: {}'.format(k, v[u'name']))
                matches = [v]
                break

        if not matches:
            # search by key ... aka the filepath
            for k, v in six.iteritems(self.modules):
                if k == pattern:
                    logging.debug(u'match {} on key: {}'.format(k, k))
                    matches = [v]
                    break

        if not matches and not exact:
            # search by properties
            for k, v in six.iteritems(self.modules):
                for subkey in v.keys():
                    if v[subkey] == pattern:
                        logging.debug(u'match {} on subkey: {}'.format(k, subkey))
                        matches.append(v)

        if not matches and not exact:
            # Levenshtein distance should workaround most typos
            distance_map = {}
            for k, v in six.iteritems(self.modules):
                mname = v.get(u'name')
                if not mname:
                    continue
                if isinstance(mname, six.text_type):
                    mname = to_text(to_bytes(mname, 'ascii', 'ignore'), 'ascii')
                try:
                    res = Levenshtein.distance(pattern, mname)
                except TypeError as e:
                    logging.error(e)
                    if C.DEFAULT_BREAKPOINTS:
                        logging.error(u'breakpoint!')
                        import epdb; epdb.st()
                    continue
                distance_map[mname] = [res, k]
            res = sorted(distance_map.items(), key=lambda x: x[1], reverse=True)
            if res and len(pattern) > 3 > res[-1][1][0]:
                logging.debug(u'levenshtein ratio match: ({}) {} {}'.format(res[-1][1][1], res[-1][0], pattern))
                matches = [self.modules[res[-1][1][1]]]

        return matches

    def find_match(self, pattern, exact=False):
        '''Exact module name matching'''

        logging.debug(u'find_match for "{}"'.format(pattern))

        BLACKLIST = [
            u'module_utils',
            u'callback',
            u'network modules',
            u'networking modules',
            u'windows modules'
        ]

        if not pattern:
            return None

        if pattern.lower() == u'core':
            return None

        # NOTE: docs.ansible.com urls, e.g.
        #   http://docs.ansible.com/ansible/latest/copy_module.html
        # are not resolved to module names here.

        # https://github.com/ansible/ansible/issues/19755
        if pattern == u'setup':
            pattern = u'system/setup.py'

        if u'/facts.py' in pattern or u' facts.py' in pattern:
            pattern = u'system/setup.py'

        # https://github.com/ansible/ansible/issues/18527
        #   docker-container -> docker_container
        if u'-' in pattern:
            pattern = pattern.replace(u'-', u'_')

        if u'module_utils' in pattern:
            # https://github.com/ansible/ansible/issues/20368
            return None
        elif u'callback' in pattern:
            return None
        elif u'lookup' in pattern:
            return None
        elif u'contrib' in pattern and u'inventory' in pattern:
            return None
        elif pattern.lower() in BLACKLIST:
            return None
        elif u'/' in pattern and not self._find_match(pattern, exact=True):
            # https://github.com/ansible/ansible/issues/20520
            if not pattern.startswith(u'lib/'):
                keys = self.modules.keys()
                for k in keys:
                    if pattern in k:
                        ppy = pattern + u'.py'
                        if k.endswith(pattern) or k.endswith(ppy):
                            return self.modules[k]
        elif pattern.endswith(u'.py') and self._find_match(pattern, exact=False):
            # https://github.com/ansible/ansible/issues/19889
            candidate = self._find_match(pattern, exact=False)

            if isinstance(candidate, list) and len(candidate) == 1:
                candidate = candidate[0]

            if isinstance(candidate, dict) and candidate[u'filename'] == pattern:
                return candidate

        match = self._find_match(pattern, exact=exact)
        if not match and not exact:
            # check for just the basename
            #   2617: ansible-s-extras/network/cloudflare_dns.py
            bname = os.path.basename(pattern)
            match = self._find_match(bname)

            if not match:
                # check for deprecated name
                #   _fireball -> fireball
                match = self._find_match(u'_' + bname)

        # unique the results
        if isinstance(match, list) and len(match) > 1:
            _match = []
            for m in match:
                if m not in _match:
                    _match.append(m)
            match = _match[:]

        return match
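
    # Usage sketch (hypothetical names; assumes the module map is loaded):
    #
    #   indexer.find_match(u'copy')              # exact name match
    #   indexer.find_match(u'docker-container')  # dashes normalized to underscores
    #   indexer.find_match(u'callback')          # blacklisted -> None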

    def is_valid(self, mname):
        return bool(self.find_match(mname, exact=True))

    def get_repository_for_module(self, mname):
        match = self.find_match(mname, exact=True)
        if isinstance(match, list) and match:
            match = match[0]
        if match:
            return match[u'repository']
        return None

    def get_ansible_modules(self):
        """Make a list of known modules"""

        matches = []
        module_dir = os.path.join(self.gitrepo.checkoutdir, u'lib/ansible/modules')
        module_dir = os.path.expanduser(module_dir)
        for root, _, filenames in os.walk(module_dir):
            for filename in filenames:
                if u'lib/ansible/modules' in root and filename != u'__init__.py':
                    matches.append(os.path.join(root, filename))

        matches = sorted(set(matches))

        self.populate_modules(matches)

        # custom fixes
        newitems = []
        for k, v in six.iteritems(self.modules):

            # include* is almost always an ansible/ansible issue
            # https://github.com/ansible/ansibullbot/issues/214
            if k.endswith(u'/include.py'):
                self.modules[k][u'repository'] = u'ansible'
            # https://github.com/ansible/ansibullbot/issues/214
            if k.endswith(u'/include_vars.py'):
                self.modules[k][u'repository'] = u'ansible'
            if k.endswith(u'/include_role.py'):
                self.modules[k][u'repository'] = u'ansible'

            # ansible maintains these
            if u'include' in k:
                self.modules[k][u'maintainers'] = [u'ansible']

            # deprecated modules are annoying
            if v[u'name'].startswith(u'_'):

                dkey = os.path.dirname(v[u'filepath'])
                dkey = os.path.join(dkey, v[u'filename'].replace(u'_', u'', 1))
                if dkey not in self.modules:
                    nd = v.copy()
                    nd[u'name'] = nd[u'name'].replace(u'_', u'', 1)
                    newitems.append((dkey, nd))

        for ni in newitems:
            self.modules[ni[0]] = ni[1]

        # parse metadata
        logging.debug(u'set module metadata')
        self.set_module_metadata()

        # parse imports
        logging.debug(u'set module imports')
        self.set_module_imports()

        # last modified
        if self.get_commits:
            logging.debug(u'set module commits')
            self.get_module_commits()

        # parse blame
        if self.get_blames and self.get_commits:
            logging.debug(u'set module blames')
            self.get_module_blames()

        # depends on metadata now ...
        logging.debug(u'set module maintainers')
        self.set_maintainers()

        return self.modules

    def populate_modules(self, matches):
        # figure out the names
        for match in matches:
            mdict = copy.deepcopy(self.EMPTY_MODULE)

            mdict[u'filename'] = os.path.basename(match)

            dirpath = os.path.dirname(match)
            dirpath = dirpath.replace(self.gitrepo.checkoutdir + u'/', u'')
            mdict[u'dirpath'] = dirpath

            filepath = match.replace(self.gitrepo.checkoutdir + u'/', u'')
            mdict[u'filepath'] = filepath

            mdict.update(
                self.split_topics_from_path(filepath)
            )

            mdict[u'repo_filename'] = mdict[u'filepath']\
                .replace(u'lib/ansible/modules/%s/' % mdict[u'repository'], u'')

            # clustering/consul
            mdict[u'namespaced_module'] = mdict[u'repo_filename']
            mdict[u'namespaced_module'] = \
                mdict[u'namespaced_module'].replace(u'.py', u'')
            mdict[u'namespaced_module'] = \
                mdict[u'namespaced_module'].replace(u'.ps1', u'')

            mname = os.path.basename(match)
            mname = mname.replace(u'.py', u'')
            mname = mname.replace(u'.ps1', u'')
            mdict[u'name'] = mname

            # deprecated modules
            if mname.startswith(u'_'):
                mdict[u'deprecated'] = True
                deprecated_filename = \
                    os.path.dirname(mdict[u'namespaced_module'])
                deprecated_filename = \
                    os.path.join(deprecated_filename, mname[1:] + u'.py')
                mdict[u'deprecated_filename'] = deprecated_filename
            else:
                mdict[u'deprecated_filename'] = mdict[u'repo_filename']

            self.modules[filepath] = mdict

        # meta is a special module
        self.modules[u'meta'] = copy.deepcopy(self.EMPTY_MODULE)
        self.modules[u'meta'][u'name'] = u'meta'
        self.modules[u'meta'][u'repo_filename'] = u'meta'
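
    # For a checkout file such as lib/ansible/modules/clustering/consul.py
    # (hypothetical path), the record built above carries roughly:
    #   filename=u'consul.py', dirpath=u'lib/ansible/modules/clustering',
    #   name=u'consul', topic=u'clustering', deprecated=False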

    def get_module_commits(self):
        keys = self.modules.keys()
        keys = sorted(keys)
        for k in keys:
            self.commits[k] = []
            cpath = os.path.join(self.gitrepo.checkoutdir, k)
            if not os.path.isfile(cpath):
                continue

            mtime = os.path.getmtime(cpath)
            refresh = False
            pfile = os.path.join(
                self.scraper_cache,
                k.replace(u'/', u'_') + u'.commits.pickle'
            )

            if not os.path.isfile(pfile):
                refresh = True
            else:
                pickle_kwargs = {'encoding': 'bytes'} if six.PY3 else {}
                logging.debug(pfile)
                with open(pfile, 'rb') as f:
                    pdata = pickle_load(f, **pickle_kwargs)
                if pdata[0] == mtime:
                    self.commits[k] = pdata[1]
                else:
                    refresh = True

            if refresh:
                logging.info(u'refresh commit cache for %s' % k)
                cmd = u'cd %s; git log --follow %s' % (self.gitrepo.checkoutdir, k)
                (rc, so, se) = run_command(cmd)
                for line in to_text(so).split(u'\n'):
                    if line.startswith(u'commit '):
                        commit = {
                            u'name': None,
                            u'email': None,
                            u'login': None,
                            u'hash': line.split()[-1],
                            u'date': None
                        }

                    # Author: Matt Clay <*****@*****.**>
                    if line.startswith(u'Author: '):
                        line = line.replace(u'Author: ', u'')
                        line = line.replace(u'<', u'')
                        line = line.replace(u'>', u'')
                        lparts = line.split()

                        if lparts and u'@' in lparts[-1]:
                            commit[u'email'] = lparts[-1]
                            commit[u'name'] = u' '.join(lparts[:-1])

                        if commit[u'email'] and \
                                u'noreply.github.com' in commit[u'email']:
                            commit[u'login'] = commit[u'email'].split(u'@')[0]

                    # Date:   Sat Jan 28 23:28:53 2017 -0800
                    if line.startswith(u'Date:'):
                        dstr = line.split(u':', 1)[1].strip()
                        dstr = u' '.join(dstr.split(u' ')[:-1])
                        ds = datetime.datetime.strptime(
                            to_text(dstr),
                            u'%a %b %d %H:%M:%S %Y'
                        )
                        commit[u'date'] = ds
                        self.commits[k].append(commit)

                with open(pfile, 'wb') as f:
                    pickle_dump((mtime, self.commits[k]), f)

    def last_commit_for_file(self, filepath):
        if self.commits.get(filepath):
            return self.commits[filepath][0][u'hash']

        # git log --pretty=format:'%H' -1
        # lib/ansible/modules/cloud/amazon/ec2_metric_alarm.py
        cmd = u'cd %s; git log --pretty=format:\'%%H\' -1 %s' % \
            (self.gitrepo.checkoutdir, filepath)
        (rc, so, se) = run_command(cmd)
        return to_text(so).strip()
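
    # Usage sketch: when the commit cache has an entry the newest hash wins;
    # otherwise this shells out, e.g. (hypothetical path)
    #   last_commit_for_file(u'lib/ansible/modules/cloud/amazon/ec2_metric_alarm.py')
    # runs `git log --pretty=format:'%H' -1 <path>` inside the checkout.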

    def get_module_blames(self):

        logging.debug(u'build email cache')
        emails_cache = self.session.query(Email)
        emails_cache = [(x.email, x.login) for x in emails_cache]
        self.emails_cache = dict(emails_cache)

        logging.debug(u'build blame cache')
        blame_cache = self.session.query(Blame).all()
        blame_cache = [x.file_commit for x in blame_cache]
        blame_cache = sorted(set(blame_cache))

        logging.debug(u'eval module hashes')
        changed = False
        keys = sorted(self.modules.keys())
        for k in keys:
            if k not in self.files:
                self.committers[k] = {}
                continue

            ghash = self.last_commit_for_file(k)

            if ghash in blame_cache:
                continue

            logging.debug(u'checking hash for {}'.format(k))
            res = self.session.query(Blame).filter_by(file_name=k, file_commit=ghash).all()
            hashes = [x.file_commit for x in res]

            if ghash not in hashes:

                logging.debug(u'hash {} not found for {}, updating blames'.format(ghash, k))

                scraper_args = [u'ansible', u'ansible', u'devel', k]
                uns, emailmap = self.gqlc.get_usernames_from_filename_blame(*scraper_args)

                # check the emails
                for email, login in emailmap.items():
                    if email in self.emails_cache:
                        continue
                    exists = self.session.query(Email).filter_by(email=email).first()
                    if not exists:
                        logging.debug(u'insert {}:{}'.format(login, email))
                        _email = Email(email=email, login=login)
                        self.session.add(_email)
                        changed = True

                # check the blames
                for login, commits in uns.items():
                    for commit in commits:
                        kwargs = {
                            u'file_name': k,
                            u'file_commit': ghash,
                            u'author_commit': commit,
                            u'author_login': login
                        }
                        exists = self.session.query(Blame).filter_by(**kwargs).first()
                        if not exists:
                            logging.debug(u'insert {}:{}:{}'.format(k, commit, login))
                            _blame = Blame(**kwargs)
                            self.session.add(_blame)
                            changed = True

        if changed:
            self.session.commit()
            logging.debug(u're-build email cache')
            emails_cache = self.session.query(Email)
            emails_cache = [(x.email, x.login) for x in emails_cache]
            self.emails_cache = dict(emails_cache)

        # fill in what we can ...
        logging.debug(u'fill in commit logins')
        for k in keys:
            for idc, commit in enumerate(self.commits.get(k, [])[:]):
                if commit.get(u'login'):
                    continue
                if not commit.get(u'email'):
                    continue
                login = self.emails_cache.get(commit[u'email'])
                if not login and u'@users.noreply.github.com' in commit[u'email']:
                    login = commit[u'email'].split(u'@')[0]
                    self.emails_cache[commit[u'email']] = login
                if not login:
                    logging.debug(u'unknown email: {}'.format(commit[u'email']))
                    continue
                self.commits[k][idc][u'login'] = login

    def get_emails_by_login(self, login):
        res = self.session.query(Email).filter_by(login=login)
        emails = [x.email for x in res.all()]
        return emails

    def _get_module_blames(self):
        ''' Scrape the blame page for each module and store it '''

        keys = sorted(self.modules.keys())

        # scrape the data
        for k in keys:

            cpath = os.path.join(self.gitrepo.checkoutdir, k)
            if not os.path.isfile(cpath):
                self.committers[k] = {}
                continue

            ghash = self.last_commit_for_file(k)
            pfile = os.path.join(
                self.scraper_cache,
                k.replace(u'/', u'_') + u'.blame.pickle'
            )
            sargs = [u'ansible', u'ansible', u'devel', k]

            refresh = False
            if not os.path.isfile(pfile):
                refresh = True
            else:
                logging.debug(u'load {}'.format(pfile))
                with open(pfile, 'rb') as f:
                    pdata = pickle_load(f)
                if pdata[0] == ghash:
                    self.committers[k] = pdata[1]
                    if len(pdata) == 3:
                        # use emailmap if available
                        emailmap = pdata[2]
                    else:
                        emailmap = {}
                else:
                    refresh = True

            if refresh:
                if self.gqlc:
                    logging.debug(u'graphql blame usernames {}'.format(pfile))
                    uns, emailmap = self.gqlc.get_usernames_from_filename_blame(*sargs)
                else:
                    emailmap = {}  # scraping fallback: emails not available
                    logging.debug(u'www blame usernames {}'.format(pfile))
                    uns = self.gws.get_usernames_from_filename_blame(*sargs)
                self.committers[k] = uns
                with open(pfile, 'wb') as f:
                    pickle_dump((ghash, uns, emailmap), f)

            for email, github_id in emailmap.items():
                if email not in self.emails_cache:
                    self.emails_cache[email] = github_id

        # add scraped logins to the map
        for k in keys:
            for idx, x in enumerate(self.commits[k]):
                if x[u'email'] in [u'@']:
                    continue
                if x[u'email'] not in self.emails_cache:
                    self.emails_cache[x[u'email']] = None
                if x[u'login']:
                    self.emails_cache[x[u'email']] = x[u'login']
                    continue

                xhash = x[u'hash']
                for ck, cv in six.iteritems(self.committers[k]):
                    if xhash in cv:
                        self.emails_cache[x[u'email']] = ck
                        break

        # fill in what we can ...
        for k in keys:
            for idx, x in enumerate(self.commits[k]):
                if not x[u'login']:
                    if x[u'email'] in [u'@']:
                        continue
                    if self.emails_cache[x[u'email']]:
                        login = self.emails_cache[x[u'email']]
                        xhash = x[u'hash']
                        self.commits[k][idx][u'login'] = login
                        if login not in self.committers[k]:
                            self.committers[k][login] = []
                        if xhash not in self.committers[k][login]:
                            self.committers[k][login].append(xhash)

    def set_maintainers(self):
        '''Define the maintainers for each module'''

        # grep the authors:
        for k, v in six.iteritems(self.modules):
            if v[u'filepath'] is None:
                continue
            mfile = os.path.join(self.gitrepo.checkoutdir, v[u'filepath'])
            authors = self.get_module_authors(mfile)
            self.modules[k][u'authors'] = authors

            # authors are maintainers by -default-
            self.modules[k][u'maintainers'] += authors
            self.modules[k][u'maintainers'] = \
                sorted(set(self.modules[k][u'maintainers']))

        metadata = self.botmeta[u'files'].keys()
        for k, v in six.iteritems(self.modules):
            if k == u'meta':
                continue

            if k in self.botmeta[u'files']:
                # There are metadata in .github/BOTMETA.yml for this file
                # copy maintainers_keys
                self.modules[k][u'maintainers_keys'] = self.botmeta[u'files'][k][u'maintainers_keys'][:]

                if self.botmeta[u'files'][k]:
                    maintainers = self.botmeta[u'files'][k].get(u'maintainers', [])

                    for maintainer in maintainers:
                        if maintainer not in self.modules[k][u'maintainers']:
                            self.modules[k][u'maintainers'].append(maintainer)

                    # remove the people who want to be ignored
                    if u'ignored' in self.botmeta[u'files'][k]:
                        ignored = self.botmeta[u'files'][k][u'ignored']
                        for x in ignored:
                            if x in self.modules[k][u'maintainers']:
                                self.modules[k][u'maintainers'].remove(x)

            else:
                # There isn't metadata in .github/BOTMETA.yml for this file
                best_match = None
                for mkey in metadata:
                    if v[u'filepath'].startswith(mkey):
                        if not best_match:
                            best_match = mkey
                            continue
                        if len(mkey) > len(best_match):
                            best_match = mkey
                if best_match:
                    self.modules[k][u'maintainers_keys'] = [best_match]
                    for maintainer in self.botmeta[u'files'][best_match].get(u'maintainers', []):
                        if maintainer not in self.modules[k][u'maintainers']:
                            self.modules[k][u'maintainers'].append(maintainer)

                    # remove the people who want to be ignored
                    for ignored in self.botmeta[u'files'][best_match].get(u'ignored', []):
                        if ignored in self.modules[k][u'maintainers']:
                            self.modules[k][u'maintainers'].remove(ignored)

            # save a pristine copy so that higher level code can still use it
            self.modules[k][u'maintainers'] = sorted(set(self.modules[k][u'maintainers']))
            self.modules[k][u'_maintainers'] = \
                [x for x in self.modules[k][u'maintainers']]

        # set the namespace maintainers ...
        for k, v in six.iteritems(self.modules):
            if u'namespace_maintainers' not in self.modules[k]:
                self.modules[k][u'namespace_maintainers'] = []
            if v.get(u'namespace'):
                ns = v.get(u'namespace')
                nms = self.get_maintainers_for_namespace(ns)
                self.modules[k][u'namespace_maintainers'] = nms

    def split_topics_from_path(self, module_file):
        subpath = module_file.replace(u'lib/ansible/modules/', u'')
        path_parts = subpath.split(u'/')
        topic = path_parts[0]

        if len(path_parts) > 2:
            subtopic = path_parts[1]
            fulltopic = u'/'.join(path_parts[0:2])
        else:
            subtopic = None
            fulltopic = path_parts[0]

        tdata = {
            u'fulltopic': fulltopic,
            u'namespace': fulltopic,
            u'topic': topic,
            u'subtopic': subtopic
        }

        return tdata
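
    # Worked example (hypothetical path):
    #   split_topics_from_path(u'lib/ansible/modules/cloud/amazon/ec2.py')
    #   -> {u'fulltopic': u'cloud/amazon', u'namespace': u'cloud/amazon',
    #       u'topic': u'cloud', u'subtopic': u'amazon'}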

    def get_module_authors(self, module_file):
        """Grep the authors out of the module docstrings"""

        if not os.path.exists(module_file):
            return []

        documentation = b''
        inphase = False

        with io.open(module_file, 'rb') as f:
            for line in f:
                if b'DOCUMENTATION' in line:
                    inphase = True
                    continue
                if line.strip().endswith((b"'''", b'"""')):
                    break
                if inphase:
                    documentation += line

        if not documentation:
            return []

        # clean out any other yaml besides author to save time
        inphase = False
        author_lines = u''
        doc_lines = to_text(documentation).split(u'\n')
        for idx, x in enumerate(doc_lines):
            if x.startswith(u'author'):
                inphase = True
            if inphase and not x.strip().startswith((u'-', u'author')):
                inphase = False
                break
            if inphase:
                author_lines += x + u'\n'

        if not author_lines:
            return []

        ydata = {}
        try:
            ydata = yaml.load(author_lines, BotYAMLLoader)
        except Exception as e:
            print(e)
            return []

        # quit early if the yaml was not valid
        if not ydata:
            return []

        # quit if the key was not found
        if u'author' not in ydata:
            return []

        if not isinstance(ydata[u'author'], list):
            ydata[u'author'] = [ydata[u'author']]

        authors = []
        for author in ydata[u'author']:
            github_ids = self.extract_github_id(author)
            if github_ids:
                authors.extend(github_ids)
        return authors
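
    # Sketch of the docstring shape this parser expects (hypothetical module):
    #
    #   DOCUMENTATION = '''
    #   module: foo
    #   author:
    #       - "John Doe (@johndoe)"
    #   '''
    #
    # get_module_authors() hands the author entries to extract_github_id()
    # below and would return [u'johndoe'] here.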

    def extract_github_id(self, author):
        authors = set()

        if author is None:
            return []
        if u'ansible core team' in author.lower():
            authors.add(u'ansible')
        elif u'@' in author:
            # match github ids but not emails
            authors.update(re.findall(r'(?<!\w)@([\w-]+)(?![\w.])', author))
        elif u'github.com/' in author:
            # {'author': 'Henrique Rodrigues (github.com/Sodki)'}
            idx = author.find(u'github.com/')
            author = author[idx+11:]
            authors.add(author.replace(u')', u''))
        elif u'(' in author and len(author.split()) == 3:
            # Mathieu Bultel (matbu)
            idx = author.find(u'(')
            author = author[idx+1:]
            authors.add(author.replace(u')', u''))

        # search for emails
        for email in re.findall(r'[<(]([^@]+@[^)>]+)[)>]', author):
            github_id = self.emails_cache.get(email)
            if github_id:
                authors.add(github_id)

        return list(authors)
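
    # Author string formats handled above, by branch (hypothetical values):
    #   u'Ansible Core Team'                      -> [u'ansible']
    #   u'John Doe (@johndoe)'                    -> [u'johndoe']
    #   u'Henrique Rodrigues (github.com/Sodki)'  -> [u'Sodki']
    #   u'Mathieu Bultel (matbu)'                 -> [u'matbu']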

    def fuzzy_match(self, repo=None, title=None, component=None):
        '''Fuzzy matching for modules'''

        logging.debug(u'fuzzy match {}'.format(
            to_text(to_bytes(component or u'', 'ascii', 'ignore'), 'ascii'))
        )

        if not component:
            return None

        if component.lower() == u'core':
            return None

        # https://github.com/ansible/ansible/issues/18179
        if u'validate-modules' in component:
            return None

        # https://github.com/ansible/ansible/issues/20368
        if u'module_utils' in component:
            return None

        if u'new module' in component:
            return None

        # authorized_keys vs. authorized_key
        if component and component.endswith(u's'):
            tm = self.find_match(component[:-1])
            if tm:
                if not isinstance(tm, list):
                    return tm[u'name']
                elif len(tm) == 1:
                    return tm[0][u'name']
                else:
                    if C.DEFAULT_BREAKPOINTS:
                        logging.error(u'breakpoint!')
                        import epdb; epdb.st()

        match = None
        known_modules = []

        for k, v in six.iteritems(self.modules):
            if v[u'name'] in [u'include']:
                continue
            known_modules.append(v[u'name'])

        title = (title or u'').lower()
        title = title.replace(u':', u'')
        title_matches = [x for x in known_modules if x + u' module' in title]

        if not title_matches:
            title_matches = [x for x in known_modules
                             if title.startswith(x + u' ')]
            if not title_matches:
                title_matches = \
                    [x for x in known_modules if u' ' + x + u' ' in title]

            if title_matches:
                title_matches = [x for x in title_matches if x != u'at']

        # don't do singular word matching in title for ansible/ansible
        cmatches = []
        if component:
            cmatches = [x for x in known_modules if x in component]
            cmatches = [x for x in cmatches if u'_' + x not in component]

        # globs
        if not cmatches and u'*' in component:
            fmatches = [x for x in known_modules if fnmatch.fnmatch(x, component)]
            if fmatches:
                cmatches = fmatches[:]

        if title_matches:
            # use title ... ?
            cmatches = [x for x in cmatches if x in title_matches and x not in [u'at']]

        if cmatches:
            if u'*' not in component and u'modules' not in component:
                match = cmatches[0]
            else:
                match = cmatches[:]
            logging.debug(u'module - component matches: %s' % cmatches)

        if not match:
            if len(title_matches) == 1:
                match = title_matches[0]
            else:
                logging.debug(u'module - title matches: %s' % title_matches)

        return match

    def is_multi(self, rawtext):
        '''Is the string a list or a glob of modules?'''
        if rawtext:
            lines = rawtext.split(u'\n')

            # clean up lines
            lines = [x.strip() for x in lines if x.strip()]
            lines = [x for x in lines if len(x) > 2]

            if len(lines) > 1:
                return True

            if lines:
                if lines[0].strip().endswith(u'*'):
                    return True

        return False
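
    # Hypothetical examples: multiple lines or a trailing glob count as
    # "multi", a single plain name does not:
    #   is_multi(u'copy\nfile')     -> True
    #   is_multi(u'cloud/amazon/*') -> True
    #   is_multi(u'copy')           -> False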

    # https://github.com/ansible/ansible-modules-core/issues/3831
    def multi_match(self, rawtext):
        '''Return a list of matches for a given glob or list of names'''
        matches = []
        lines = rawtext.split(u'\n')
        lines = [x.strip() for x in lines if x.strip()]
        for line in lines:
            # is it an exact name, a path, a globbed name, a globbed path?
            if line.endswith(u'*'):
                thiskey = line.replace(u'*', u'')
                keymatches = []
                for k in self.modules.keys():
                    if thiskey in k:
                        keymatches.append(k)
                for k in keymatches:
                    matches.append(self.modules[k].copy())
            else:
                match = self.find_match(line)
                if match:
                    matches.append(match)

        # unique the list
        tmplist = []
        for x in matches:
            if x not in tmplist:
                tmplist.append(x)
        matches = tmplist

        return matches
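
    # Hypothetical: multi_match(u'web_infrastructure/*') returns a copy of
    # every module record whose key contains u'web_infrastructure/', while a
    # plain name on its own line goes through find_match() as usual.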

    def set_module_metadata(self):
        for k, v in six.iteritems(self.modules):
            if not v[u'filepath']:
                continue
            mfile = os.path.join(self.gitrepo.checkoutdir, v[u'filepath'])
            if not mfile.endswith(u'.py'):
                # metadata is only the .py files ...
                ext = mfile.split(u'.')[-1]
                mfile = mfile.replace(u'.' + ext, u'.py', 1)

            self.modules[k][u'metadata'].update(self.get_module_metadata(mfile))

    def get_module_metadata(self, module_file):
        meta = {}

        if not os.path.isfile(module_file):
            return meta

        rawmeta = u''
        inphase = False
        with io.open(module_file, 'r', encoding='utf-8') as f:
            for line in f:
                if line.startswith(u'ANSIBLE_METADATA'):
                    inphase = True
                if line.startswith(u'DOCUMENTATION'):
                    break
                if inphase:
                    rawmeta += line
        rawmeta = rawmeta.replace(u'ANSIBLE_METADATA =', u'', 1)
        rawmeta = rawmeta.strip()
        try:
            meta = ast.literal_eval(rawmeta)
            tmp_meta = {}
            for k, v in meta.items():
                if isinstance(k, six.binary_type):
                    k = to_text(k)
                if isinstance(v, six.binary_type):
                    v = to_text(v)
                if isinstance(v, list):
                    tmp_list = []
                    for i in v:
                        if isinstance(i, six.binary_type):
                            i = to_text(i)
                        tmp_list.append(i)
                    v = tmp_list
                    del tmp_list
                tmp_meta[k] = v
            meta = tmp_meta
            del tmp_meta
        except (SyntaxError, ValueError):
            # ast.literal_eval raises either when the block is not a literal dict
            pass

        return meta
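
    # Sketch of the header block this parser targets (hypothetical module):
    #
    #   ANSIBLE_METADATA = {'metadata_version': '1.1',
    #                       'status': ['stableinterface'],
    #                       'supported_by': 'core'}
    #
    # Everything from ANSIBLE_METADATA up to DOCUMENTATION is collected and
    # fed through ast.literal_eval(), so only a literal dict survives.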

    def set_module_imports(self):
        for k, v in six.iteritems(self.modules):
            if not v[u'filepath']:
                continue
            mfile = os.path.join(self.gitrepo.checkoutdir, v[u'filepath'])
            self.modules[k][u'imports'] = self.get_module_imports(mfile)

    def get_module_imports(self, module_file):
        mimports = []

        if not os.path.isfile(module_file):
            return mimports

        with open(module_file, 'rb') as f:
            for line in f:
                line = line.strip()
                line = line.replace(b',', b'')
                if line.startswith(b'import') or \
                        (b'import' in line and b'from' in line):
                    lparts = line.split()
                    if line.startswith(b'import '):
                        mimports.append(lparts[1])
                    elif line.startswith(b'from '):
                        mpath = lparts[1] + b'.'
                        for spath in lparts[3:]:
                            mimports.append(mpath + spath)

        return [to_text(m) for m in mimports]
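
    # Hypothetical examples of what the line scan above extracts:
    #   import json                                            -> u'json'
    #   from ansible.module_utils.basic import AnsibleModule
    #       -> u'ansible.module_utils.basic.AnsibleModule'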

    @property
    def all_maintainers(self):
        maintainers = set()
        for path, metadata in self.botmeta[u'files'].items():
            maintainers.update(metadata.get(u'maintainers', []))
        return maintainers

    @property
    def all_authors(self):
        authors = set()
        for key, metadata in self.modules.items():
            authors.update(metadata.get(u'authors', []))
        return authors

    def get_maintainers_for_namespace(self, namespace):
        maintainers = []
        for k, v in self.modules.items():
            if u'namespace' not in v or u'maintainers' not in v:
                continue
            if v[u'namespace'] == namespace:
                for m in v[u'maintainers']:
                    if m not in maintainers:
                        maintainers.append(m)
        maintainers = [x for x in maintainers if x.strip()]
        return maintainers

    @staticmethod
    def replace_ansible(maintainers, ansible_members, bots=None):
        '''Replace -ansible- with the -humans- in the org'''
        bots = bots or []
        newlist = []
        for m in maintainers:
            if m != u'ansible':
                newlist.append(m)
            else:
                newlist += ansible_members
        newlist = sorted(set(newlist))
        newlist = [x for x in newlist if x not in bots]
        return newlist
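
    # Usage sketch (hypothetical logins):
    #   replace_ansible([u'ansible', u'jdoe'], [u'alice', u'bob'], bots=[u'ansibot'])
    #   -> [u'alice', u'bob', u'jdoe']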

    def get_file_content(self, filepath):
        fpath = os.path.join(self.gitrepo.checkoutdir, filepath)
        if not os.path.isfile(fpath):
            return None
        with io.open(fpath, 'r', encoding='utf-8') as f:
            data = f.read()
        return data