예제 #1
0
def _retrieve_mails(uri):
    """Yield one dict per usable message found in the mail archive at ``uri``.

    The archive is fetched with ``utils.read_uri``, decompressed with
    ``utils.gzip_decompress`` and split into messages by
    ``MAIL_BOX_PATTERN``.  A message is skipped when its author address
    fails ``utils.check_email_validity`` or when its date header cannot be
    parsed.

    :param uri: location of the gzipped mail archive
    :return: generator of dicts holding the named groups of
        ``MAIL_BOX_PATTERN`` (``author_email`` with the first ``' at '``
        replaced by ``'@'``, ``date`` converted to an int) plus one list
        of ids per entry of ``MESSAGE_PATTERNS``; nothing is yielded when
        the archive cannot be read
    """
    LOG.debug('Retrieving mail archive from uri: %s', uri)
    content = utils.read_uri(uri)
    if not content:
        LOG.error('Error reading mail archive from uri: %s', uri)
        return

    content = utils.gzip_decompress(content)
    LOG.debug('Mail archive is loaded, start processing')

    # NOTE(review): presumably a sentinel so MAIL_BOX_PATTERN can terminate
    # the last real message -- confirm against the pattern definition.
    content += TRAILING_RECORD

    for rec in re.finditer(MAIL_BOX_PATTERN, content):
        email = rec.groupdict()
        email['author_email'] = email['author_email'].replace(' at ', '@', 1)
        if not utils.check_email_validity(email['author_email']):
            continue

        # parsedate_tz() returns None for a header it cannot parse (assuming
        # email_utils is the stdlib email.utils); feeding that to mktime_tz()
        # would raise TypeError and abort processing of the whole archive.
        parsed_date = email_utils.parsedate_tz(email['date'])
        if parsed_date is None:
            LOG.warning('Skipping mail with unparseable date: %r',
                        email['date'])
            continue
        email['date'] = int(email_utils.mktime_tz(parsed_date))

        for pattern_name, pattern in six.iteritems(MESSAGE_PATTERNS):
            collection = set()
            for item in re.finditer(pattern, email['body']):
                groups = item.groupdict()
                item_id = groups['id']
                # groupdict() lists every named group of the pattern, so this
                # tests whether the pattern defines a 'module' group at all.
                if 'module' in groups:
                    item_id = groups['module'] + ':' + item_id
                    email['module'] = groups['module']
                collection.add(item_id)
            email[pattern_name] = list(collection)

        yield email
예제 #2
0
파일: mls.py 프로젝트: kado109/stackalytics
def _retrieve_mails(uri):
    """Yield one dict per usable message found in the mail archive at ``uri``.

    The archive is fetched with ``utils.read_uri``, gunzipped in memory and
    split into messages by ``MAIL_BOX_PATTERN``.  A message is skipped when
    its author address fails ``utils.check_email_validity`` or when its
    date header cannot be parsed.

    :param uri: location of the gzipped mail archive
    :return: generator of dicts holding the named groups of
        ``MAIL_BOX_PATTERN`` (``author_email`` with the first ``" at "``
        replaced by ``"@"``, ``date`` converted to an int) plus one list
        of ids per entry of ``MESSAGE_PATTERNS``; nothing is yielded when
        the archive cannot be read
    """
    LOG.debug("Retrieving mail archive from uri: %s", uri)
    content = utils.read_uri(uri)
    if not content:
        LOG.error("Error reading mail archive from uri: %s", uri)
        return

    # The context manager closes the GzipFile even if read() fails.
    with gzip.GzipFile(fileobj=StringIO.StringIO(content)) as gzip_fd:
        content = gzip_fd.read()
    LOG.debug("Mail archive is loaded, start processing")

    # NOTE(review): presumably a sentinel so MAIL_BOX_PATTERN can terminate
    # the last real message -- confirm against the pattern definition.
    content += TRAILING_RECORD

    for rec in re.finditer(MAIL_BOX_PATTERN, content):
        email = rec.groupdict()
        email["author_email"] = email["author_email"].replace(" at ", "@", 1)
        if not utils.check_email_validity(email["author_email"]):
            continue

        # parsedate_tz() returns None for a header it cannot parse (assuming
        # email_utils is the stdlib email.utils); feeding that to mktime_tz()
        # would raise TypeError and abort processing of the whole archive.
        parsed_date = email_utils.parsedate_tz(email["date"])
        if parsed_date is None:
            LOG.warning("Skipping mail with unparseable date: %r",
                        email["date"])
            continue
        email["date"] = int(email_utils.mktime_tz(parsed_date))

        for pattern_name, pattern in six.iteritems(MESSAGE_PATTERNS):
            collection = set()
            for item in re.finditer(pattern, email["body"]):
                groups = item.groupdict()
                item_id = groups["id"]
                # groupdict() lists every named group of the pattern, so this
                # tests whether the pattern defines a "module" group at all.
                if "module" in groups:
                    item_id = groups["module"] + ":" + item_id
                    email["module"] = groups["module"]
                collection.add(item_id)
            email[pattern_name] = list(collection)

        yield email
예제 #3
0
def _retrieve_mails(uri):
    """Generate parsed messages from the gzipped mail archive at ``uri``.

    Reads the archive via ``utils.read_uri``, decompresses it with
    ``utils.gzip_decompress`` and walks every ``MAIL_BOX_PATTERN`` match.
    Records with an author address rejected by
    ``utils.check_email_validity`` or with an unparseable date header are
    dropped.

    :param uri: location of the gzipped mail archive
    :return: generator of dicts built from the named groups of
        ``MAIL_BOX_PATTERN``; ``author_email`` has its first ``' at '``
        replaced by ``'@'``, ``date`` is converted to an int, and every
        key of ``MESSAGE_PATTERNS`` maps to a list of ids found in the
        body.  Nothing is yielded when the archive cannot be read.
    """
    LOG.debug('Retrieving mail archive from uri: %s', uri)
    content = utils.read_uri(uri)
    if not content:
        LOG.error('Error reading mail archive from uri: %s', uri)
        return

    content = utils.gzip_decompress(content)
    LOG.debug('Mail archive is loaded, start processing')

    # NOTE(review): presumably a sentinel so MAIL_BOX_PATTERN can terminate
    # the last real message -- confirm against the pattern definition.
    content += TRAILING_RECORD

    for rec in re.finditer(MAIL_BOX_PATTERN, content):
        email = rec.groupdict()
        email['author_email'] = email['author_email'].replace(' at ', '@', 1)
        if not utils.check_email_validity(email['author_email']):
            continue

        # A Date header that parsedate_tz() cannot understand yields None
        # (assuming email_utils is the stdlib email.utils).  Skip the record
        # instead of letting mktime_tz(None) raise TypeError and end the
        # generator for every remaining message.
        date_tuple = email_utils.parsedate_tz(email['date'])
        if date_tuple is None:
            LOG.warning('Skipping mail with unparseable date: %r',
                        email['date'])
            continue
        email['date'] = int(email_utils.mktime_tz(date_tuple))

        for pattern_name, pattern in six.iteritems(MESSAGE_PATTERNS):
            collection = set()
            for item in re.finditer(pattern, email['body']):
                groups = item.groupdict()
                item_id = groups['id']
                # True whenever the pattern defines a 'module' named group.
                if 'module' in groups:
                    item_id = groups['module'] + ':' + item_id
                    email['module'] = groups['module']
                collection.add(item_id)
            email[pattern_name] = list(collection)

        yield email
예제 #4
0
    def _get_lp_info(self, email):
        """Look up the Launchpad name and display name for ``email``.

        :param email: user email
        :return: tuple ``(name, display_name)`` taken from the profile
            returned by ``launchpad_utils.lp_profile_by_email``, or
            ``(None, None)`` when the email is invalid or no profile is
            found
        """
        if utils.check_email_validity(email):
            profile = launchpad_utils.lp_profile_by_email(email)
        else:
            LOG.debug('User email is not valid %s', email)
            profile = None

        if profile:
            LOG.debug('Email is mapped to launchpad user: %s',
                      profile['name'])
            return profile['name'], profile['display_name']

        # Reached for invalid addresses as well as for failed lookups.
        LOG.debug('User with email %s not found', email)
        return None, None
예제 #5
0
    def _get_lp_info(self, email):
        """Look up the Launchpad name and display name for ``email``.

        :param email: user email
        :return: tuple ``(name, display_name)`` taken from the profile
            returned by ``launchpad_utils.lp_profile_by_email``, or
            ``(None, None)`` when the email is invalid or no profile is
            found
        """
        if utils.check_email_validity(email):
            profile = launchpad_utils.lp_profile_by_email(email)
        else:
            LOG.debug("User email is not valid %s", email)
            profile = None

        if profile:
            lp_name = profile["name"]
            LOG.debug("Email %(email)s is mapped to launchpad user %(lp)s",
                      {"email": email, "lp": lp_name})
            return lp_name, profile["display_name"]

        # Reached for invalid addresses as well as for failed lookups.
        LOG.debug("User with email %s not found", email)
        return None, None
예제 #6
0
def query_lp_info(email):
    """Resolve an email address to a Launchpad id and display name.

    The address is first checked with ``utils.check_email_validity``; only
    valid addresses are passed to ``_lp_profile_by_email``.

    :param email: user email
    :return: tuple (launchpad id, name); ``(None, None)`` when the email is
        invalid or no profile is found
    """
    if utils.check_email_validity(email):
        profile = _lp_profile_by_email(email)
    else:
        LOG.debug('User email is not valid %s', email)
        profile = None

    if profile:
        launchpad_id = profile['name']
        LOG.debug('Email %(email)s is mapped to launchpad user %(lp)s',
                  {'email': email, 'lp': launchpad_id})
        return launchpad_id, profile['display_name']

    # Reached for invalid addresses as well as for failed lookups.
    LOG.debug('User with email %s not found', email)
    return None, None
예제 #7
0
def _retrieve_mails(uri):
    """Yield one dict per usable message found in the mail archive at ``uri``.

    The archive is fetched with ``utils.read_uri`` and gunzipped only when
    the uri contains ``.txt.gz``; otherwise the content is used as read.
    Messages are located with ``MAIL_BOX_PATTERN``.  A message is skipped
    when its author address fails ``utils.check_email_validity`` or when
    its date header cannot be parsed.

    :param uri: location of the (optionally gzipped) mail archive
    :return: generator of dicts holding the named groups of
        ``MAIL_BOX_PATTERN`` (``author_email`` with the first ``' at '``
        replaced by ``'@'``, ``date`` converted to an int) plus one list
        of ids per entry of ``MESSAGE_PATTERNS``; nothing is yielded when
        the archive cannot be read
    """
    LOG.debug('Retrieving mail archive from uri: %s', uri)
    content = utils.read_uri(uri)
    if not content:
        LOG.error('Error reading mail archive from uri: %s', uri)
        return

    # Only gunzip when the uri names a ".txt.gz" archive.  The pattern is a
    # raw string so that "\." is a regex escape, not an invalid str escape.
    if re.search(r'\.txt\.gz', uri):
        LOG.debug('%s is a gzipped file', uri)
        # The context manager closes the GzipFile even if read() fails.
        with gzip.GzipFile(fileobj=StringIO.StringIO(content)) as gzip_fd:
            content = gzip_fd.read()
    else:
        LOG.debug('%s is not a gzipped file', uri)

    LOG.debug('Mail archive is loaded, start processing')

    # NOTE(review): presumably a sentinel so MAIL_BOX_PATTERN can terminate
    # the last real message -- confirm against the pattern definition.
    content += TRAILING_RECORD

    for rec in re.finditer(MAIL_BOX_PATTERN, content):
        email = rec.groupdict()
        email['author_email'] = email['author_email'].replace(' at ', '@', 1)
        if not utils.check_email_validity(email['author_email']):
            continue

        # parsedate_tz() returns None for a header it cannot parse (assuming
        # email_utils is the stdlib email.utils); feeding that to mktime_tz()
        # would raise TypeError and abort processing of the whole archive.
        parsed_date = email_utils.parsedate_tz(email['date'])
        if parsed_date is None:
            LOG.warning('Skipping mail with unparseable date: %r',
                        email['date'])
            continue
        email['date'] = int(email_utils.mktime_tz(parsed_date))

        for pattern_name, pattern in MESSAGE_PATTERNS.iteritems():
            collection = set()
            for item in re.finditer(pattern, email['body']):
                groups = item.groupdict()
                item_id = groups['id']
                # groupdict() lists every named group of the pattern, so this
                # tests whether the pattern defines a 'module' group at all.
                if 'module' in groups:
                    item_id = groups['module'] + ':' + item_id
                    email['module'] = groups['module']
                collection.add(item_id)
            email[pattern_name] = list(collection)

        yield email
예제 #8
0
 def test_email_invalid(self):
     # Both sample addresses are expected to be reported as invalid.
     for address in ('pupkin@localhost', '222@some.(trash)'):
         self.assertFalse(utils.check_email_validity(address))
예제 #9
0
 def test_email_valid(self):
     # Both sample addresses are expected to be reported as valid.
     # NOTE(review): the two literals are identical and look like masked
     # placeholders -- confirm the intended fixture addresses.
     for address in ('*****@*****.**', '*****@*****.**'):
         self.assertTrue(utils.check_email_validity(address))
예제 #10
0
    def log(self, branch, head_commit_id):
        """Yield one dict per non-merge commit found by ``git log``.

        Changes the process working directory to ``self.folder``, calls
        ``self._checkout(branch)`` and parses the ``git log --shortstat``
        output with ``GIT_LOG_PATTERN``.  Commits whose author email fails
        ``utils.check_email_validity`` are skipped.

        :param branch: branch passed to ``self._checkout`` and recorded in
            each commit's ``branches`` set
        :param head_commit_id: when truthy, only the range
            ``head_commit_id..HEAD`` is inspected; otherwise ``HEAD``
        :return: generator of commit dicts; nothing is yielded when the
            checkout or the ``git log`` call fails
        """
        LOG.debug('Parsing git log for repo uri %s', self.repo['uri'])

        os.chdir(self.folder)
        if not self._checkout(branch):
            return

        commit_range = 'HEAD'
        if head_commit_id:
            commit_range = head_commit_id + '..HEAD'

        try:
            output = sh.git('log', '--pretty=' + GIT_LOG_FORMAT, '--shortstat',
                            '-M', '--no-merges', commit_range, _tty_out=False,
                            _decode_errors='ignore')
        except sh.ErrorReturnCode:
            # One record that carries the traceback, instead of an error
            # message followed by a second, context-free exception record.
            LOG.error('Unable to get log of git repo %s. Ignore it',
                      self.repo['uri'], exc_info=True)
            return

        # The five diff-stat groups follow the GIT_LOG_PARAMS groups.
        stat_base = len(GIT_LOG_PARAMS)

        for rec in re.finditer(GIT_LOG_PATTERN, str(output)):
            commit = {}
            for i, param in enumerate(GIT_LOG_PARAMS, 1):
                commit[param[0]] = six.text_type(rec.group(i), 'utf8')

            if not utils.check_email_validity(commit['author_email']):
                continue

            commit['files_changed'] = int(rec.group(stat_base + 1))
            lines_changed_group = rec.group(stat_base + 2)
            lines_changed = rec.group(stat_base + 3)
            deleted_or_inserted = rec.group(stat_base + 4)
            lines_deleted = rec.group(stat_base + 5)

            # When only one counter is present and it is a deletion count,
            # the number captured as "changed" really belongs to deletions.
            if (lines_changed_group and not lines_deleted
                    and deleted_or_inserted[0] == 'd'):
                lines_deleted = lines_changed
                lines_changed = 0

            commit['lines_added'] = int(lines_changed or 0)
            commit['lines_deleted'] = int(lines_deleted or 0)

            # Collect the distinct ids each message pattern finds; a key is
            # only set when the pattern matched at least once.
            for pattern_name, pattern in six.iteritems(MESSAGE_PATTERNS):
                collection = set()
                for item in re.finditer(pattern, commit['message']):
                    collection.add(item.group('id'))
                if collection:
                    commit[pattern_name] = list(collection)

            commit['date'] = int(commit['date'])
            commit['module'] = self.repo['module']
            commit['branches'] = set([branch])
            if commit['commit_id'] in self.release_index:
                commit['release'] = self.release_index[commit['commit_id']]
            else:
                commit['release'] = None

            # Blueprint ids are qualified with the module name.
            if 'blueprint_id' in commit:
                commit['blueprint_id'] = [(commit['module'] + ':' + bp_name)
                                          for bp_name
                                          in commit['blueprint_id']]

            if 'coauthor' in commit:
                verified_coauthors = []
                for coauthor in commit['coauthor']:
                    m = re.match(CO_AUTHOR_PATTERN, coauthor)
                    if m and utils.check_email_validity(
                            m.group("author_email")):
                        verified_coauthors.append(m.groupdict())

                if verified_coauthors:
                    commit['coauthor'] = verified_coauthors
                else:
                    del commit['coauthor']  # no valid authors

            yield commit
예제 #11
0
    def log(self, branch, head_commit_id):
        """Yield one dict per non-merge commit found by ``git log``.

        Changes the process working directory to ``self.folder``, calls
        ``self._checkout(branch)`` and parses the ``git log --shortstat``
        output with ``GIT_LOG_PATTERN``.  Commits with an empty author
        email, and commits whose release index entry is ``'ignored'``, are
        skipped.

        :param branch: branch passed to ``self._checkout`` and recorded in
            each commit's ``branches`` set
        :param head_commit_id: when truthy, only the range
            ``head_commit_id..HEAD`` is inspected; otherwise ``HEAD``
        :return: generator of commit dicts; nothing is yielded when the
            checkout or the ``git log`` call fails
        """
        LOG.debug('Parsing git log for repo uri %s', self.repo['uri'])

        os.chdir(self.folder)
        if not self._checkout(branch):
            return

        commit_range = 'HEAD'
        if head_commit_id:
            commit_range = head_commit_id + '..HEAD'

        try:
            output = sh.git('log',
                            '--pretty=' + GIT_LOG_FORMAT,
                            '--shortstat',
                            '-M',
                            '--no-merges',
                            commit_range,
                            _tty_out=False,
                            _decode_errors='ignore',
                            _encoding='utf8')
        except sh.ErrorReturnCode:
            # One record that carries the traceback, instead of an error
            # message followed by a second, context-free exception record.
            LOG.error('Unable to get log of git repo %s. Ignore it',
                      self.repo['uri'], exc_info=True)
            return

        for rec in re.finditer(GIT_LOG_PATTERN, six.text_type(output)):
            commit = {}
            for i, param in enumerate(GIT_LOG_PARAMS, 1):
                commit[param[0]] = rec.group(i)

            if not commit['author_email']:
                # ignore commits with empty email (there are some < Essex)
                continue

            commit['author_email'] = utils.keep_safe_chars(
                commit['author_email'])

            diff_stat_str = rec.group('diff_stat')
            diff_rec = re.search(DIFF_STAT_PATTERN, diff_stat_str)

            if diff_rec:
                files_changed = int(diff_rec.group(1))
                lines_changed_group = diff_rec.group(2)
                lines_changed = diff_rec.group(3)
                deleted_or_inserted = diff_rec.group(4)
                lines_deleted = diff_rec.group(5)

                # When only one counter is present and it is a deletion
                # count, the number captured as "changed" really belongs
                # to deletions.
                if (lines_changed_group and not lines_deleted
                        and deleted_or_inserted[0] == 'd'):
                    lines_deleted = lines_changed
                    lines_changed = 0
            else:
                # No diff stat could be found for this commit.
                files_changed = 0
                lines_changed = 0
                lines_deleted = 0

            commit['files_changed'] = files_changed
            commit['lines_added'] = int(lines_changed or 0)
            commit['lines_deleted'] = int(lines_deleted or 0)

            # Collect the distinct ids each message pattern finds; a key is
            # only set when the pattern matched at least once.
            for pattern_name, pattern in six.iteritems(MESSAGE_PATTERNS):
                collection = set()
                for item in re.finditer(pattern, commit['message']):
                    collection.add(item.group('id'))
                if collection:
                    commit[pattern_name] = list(collection)

            commit['date'] = int(commit['date'])
            commit['module'] = self.repo['module']
            commit['branches'] = set([branch])
            if commit['commit_id'] in self.release_index:
                commit['release'] = self.release_index[commit['commit_id']]
            else:
                commit['release'] = None

            if commit['release'] == 'ignored':
                # drop commits that are marked by 'ignored' release
                continue

            # Blueprint ids are qualified with the module name.
            if 'blueprint_id' in commit:
                commit['blueprint_id'] = [(commit['module'] + ':' + bp_name)
                                          for bp_name in commit['blueprint_id']
                                          ]

            if 'coauthor' in commit:
                verified_coauthors = []
                for coauthor in commit['coauthor']:
                    m = re.match(CO_AUTHOR_PATTERN, coauthor)
                    if m and utils.check_email_validity(
                            m.group("author_email")):
                        verified_coauthors.append(m.groupdict())

                if verified_coauthors:
                    commit['coauthor'] = verified_coauthors
                else:
                    del commit['coauthor']  # no valid authors

            yield commit
예제 #12
0
파일: vcs.py 프로젝트: Mingkii/stackalytics
    def log(self, branch, head_commit_id):
        """Yield one dict per non-merge commit found by ``git log``.

        Changes the process working directory to ``self.folder``, calls
        ``self._checkout(branch)`` and parses the ``git log --shortstat``
        output with ``GIT_LOG_PATTERN``.  Commits whose author email fails
        ``utils.check_email_validity`` are skipped.

        :param branch: branch passed to ``self._checkout`` and recorded in
            each commit's ``branches`` set
        :param head_commit_id: when truthy, only the range
            ``head_commit_id..HEAD`` is inspected; otherwise ``HEAD``
        :return: generator of commit dicts; nothing is yielded when the
            checkout or the ``git log`` call fails
        """
        LOG.debug('Parsing git log for repo uri %s', self.repo['uri'])

        os.chdir(self.folder)
        if not self._checkout(branch):
            return

        commit_range = 'HEAD'
        if head_commit_id:
            commit_range = head_commit_id + '..HEAD'

        # A failing git call must not propagate out of the generator: log
        # it and treat the repo as having nothing to report.
        try:
            output = sh.git('log', '--pretty=%s' % GIT_LOG_FORMAT,
                            '--shortstat', '-M', '--no-merges', commit_range,
                            _tty_out=False, _decode_errors='ignore')
        except sh.ErrorReturnCode:
            LOG.error('Unable to get log of git repo %s. Ignore it',
                      self.repo['uri'], exc_info=True)
            return

        for rec in re.finditer(GIT_LOG_PATTERN, str(output)):
            # i walks the pattern groups: first one per GIT_LOG_PARAMS
            # entry, then the five diff-stat groups.
            i = 1
            commit = {}
            for param in GIT_LOG_PARAMS:
                commit[param[0]] = unicode(rec.group(i), 'utf8')
                i += 1

            if not utils.check_email_validity(commit['author_email']):
                continue

            commit['files_changed'] = int(rec.group(i))
            i += 1
            lines_changed_group = rec.group(i)
            i += 1
            lines_changed = rec.group(i)
            i += 1
            deleted_or_inserted = rec.group(i)
            i += 1
            lines_deleted = rec.group(i)
            i += 1

            # When only one counter is present and it is a deletion count,
            # the number captured as "changed" really belongs to deletions.
            if lines_changed_group:  # there inserted or deleted lines
                if not lines_deleted:
                    if deleted_or_inserted[0] == 'd':  # deleted
                        lines_deleted = lines_changed
                        lines_changed = 0

            commit['lines_added'] = int(lines_changed or 0)
            commit['lines_deleted'] = int(lines_deleted or 0)

            # Every pattern name becomes a key, holding the (possibly empty)
            # list of distinct ids found in the commit message.
            for pattern_name, pattern in MESSAGE_PATTERNS.iteritems():
                collection = set()
                for item in re.finditer(pattern, commit['message']):
                    collection.add(item.group('id'))
                commit[pattern_name] = list(collection)

            commit['date'] = int(commit['date'])
            commit['module'] = self.repo['module']
            commit['branches'] = set([branch])
            if commit['commit_id'] in self.release_index:
                commit['release'] = self.release_index[commit['commit_id']]
            else:
                commit['release'] = None
            # Blueprint ids are qualified with the module name.
            if 'blueprint_id' in commit:
                commit['blueprint_id'] = [(commit['module'] + ':' + bp_name)
                                          for bp_name
                                          in commit['blueprint_id']]

            yield commit
예제 #13
0
 def test_email_invalid(self):
     # Each of these addresses must be rejected by the validity check.
     rejected = ['pupkin@localhost', '222@some.(trash)']
     for candidate in rejected:
         self.assertFalse(utils.check_email_validity(candidate))
예제 #14
0
 def test_email_valid(self):
     # Each of these addresses must be accepted by the validity check.
     # NOTE(review): the two literals are identical and look like masked
     # placeholders -- confirm the intended fixture addresses.
     accepted = ['*****@*****.**', '*****@*****.**']
     for candidate in accepted:
         self.assertTrue(utils.check_email_validity(candidate))
예제 #15
0
    def log(self, branch, head_commit_id):
        """Yield one dict per non-merge commit found by ``git log``.

        Changes the process working directory to ``self.folder``, calls
        ``self._checkout(branch)`` and parses the ``git log --shortstat``
        output with ``GIT_LOG_PATTERN``.  Skipped are: commits whose
        subject is ``Update git submodules``, commits with an empty author
        email, and commits whose release index entry is ``'ignored'``.

        :param branch: branch passed to ``self._checkout`` and recorded in
            each commit's ``branches`` set
        :param head_commit_id: when truthy, only the range
            ``head_commit_id..HEAD`` is inspected; otherwise ``HEAD``
        :return: generator of commit dicts; nothing is yielded when the
            checkout or the ``git log`` call fails
        """
        LOG.debug('Parsing git log for repo uri %s', self.repo['uri'])

        os.chdir(self.folder)
        if not self._checkout(branch):
            return

        if head_commit_id:
            commit_range = head_commit_id + '..HEAD'
        else:
            commit_range = 'HEAD'

        try:
            output = sh.git(
                'log', '--pretty=' + GIT_LOG_FORMAT, '--shortstat', '-M',
                '--no-merges', commit_range,
                _tty_out=False, _decode_errors='ignore', _encoding='utf8')
        except sh.ErrorReturnCode:
            LOG.error('Unable to get log of git repo %s. Ignore it',
                      self.repo['uri'], exc_info=True)
            return

        for rec in re.finditer(GIT_LOG_PATTERN, six.text_type(output)):
            # Group N of the pattern holds the N-th GIT_LOG_PARAMS field.
            commit = dict((param[0], rec.group(position))
                          for position, param
                          in enumerate(GIT_LOG_PARAMS, 1))

            # Skip machine/script produced submodule auto updates, and
            # commits with an empty email (there are some < Essex).
            if (commit['subject'] == u'Update git submodules'
                    or not commit['author_email']):
                continue

            commit['author_email'] = utils.keep_safe_chars(
                commit['author_email'])

            diff_rec = re.search(DIFF_STAT_PATTERN, rec.group('diff_stat'))

            if not diff_rec:
                # No diff stat could be found for this commit.
                files_changed = lines_changed = lines_deleted = 0
            else:
                files_changed = int(diff_rec.group(1))
                (lines_changed_group, lines_changed,
                 deleted_or_inserted, lines_deleted) = diff_rec.group(
                    2, 3, 4, 5)

                # When only one counter is present and it is a deletion
                # count, the number captured as "changed" really belongs
                # to deletions.
                if (lines_changed_group and not lines_deleted
                        and deleted_or_inserted[0] == 'd'):
                    lines_deleted = lines_changed
                    lines_changed = 0

            commit['files_changed'] = files_changed
            commit['lines_added'] = int(lines_changed or 0)
            commit['lines_deleted'] = int(lines_deleted or 0)

            # A key is only set when its pattern matched at least once.
            for pattern_name, pattern in six.iteritems(MESSAGE_PATTERNS):
                found_ids = set()
                for found in re.finditer(pattern, commit['message']):
                    found_ids.add(found.group('id'))
                if found_ids:
                    commit[pattern_name] = list(found_ids)

            commit['date'] = int(commit['date'])
            commit['module'] = self.repo['module']
            commit['branches'] = set([branch])
            commit['release'] = (
                self.release_index[commit['commit_id']]
                if commit['commit_id'] in self.release_index else None)

            if commit['release'] == 'ignored':
                # drop commits that are marked by 'ignored' release
                continue

            # Blueprint ids are qualified with the module name.
            if 'blueprint_id' in commit:
                module_prefix = commit['module']
                commit['blueprint_id'] = [
                    (module_prefix + ':' + bp_name)
                    for bp_name in commit['blueprint_id']]

            if 'coauthor' in commit:
                candidates = (re.match(CO_AUTHOR_PATTERN, line)
                              for line in commit['coauthor'])
                verified_coauthors = [
                    m.groupdict() for m in candidates
                    if m and utils.check_email_validity(
                        m.group("author_email"))]

                if not verified_coauthors:
                    del commit['coauthor']  # no valid authors
                else:
                    commit['coauthor'] = verified_coauthors

            yield commit
예제 #16
0
    def log(self, branch, head_commit_id):
        """Yield one dict per non-merge commit found by ``git log``.

        Changes the process working directory to ``self.folder``, calls
        ``self._checkout(branch)`` and parses the ``git log --shortstat``
        output with ``GIT_LOG_PATTERN``.  Commits whose author email fails
        ``utils.check_email_validity`` are skipped.

        :param branch: branch passed to ``self._checkout`` and recorded in
            each commit's ``branches`` set
        :param head_commit_id: when truthy, only the range
            ``head_commit_id..HEAD`` is inspected; otherwise ``HEAD``
        :return: generator of commit dicts; nothing is yielded when the
            checkout or the ``git log`` call fails
        """
        LOG.debug('Parsing git log for repo uri %s', self.repo['uri'])

        os.chdir(self.folder)
        if not self._checkout(branch):
            return

        commit_range = 'HEAD'
        if head_commit_id:
            commit_range = head_commit_id + '..HEAD'

        # A failing git call must not propagate out of the generator: log
        # it and treat the repo as having nothing to report.
        try:
            output = sh.git('log',
                            '--pretty=%s' % GIT_LOG_FORMAT,
                            '--shortstat',
                            '-M',
                            '--no-merges',
                            commit_range,
                            _tty_out=False,
                            _decode_errors='ignore')
        except sh.ErrorReturnCode:
            LOG.error('Unable to get log of git repo %s. Ignore it',
                      self.repo['uri'], exc_info=True)
            return

        for rec in re.finditer(GIT_LOG_PATTERN, str(output)):
            # i walks the pattern groups: first one per GIT_LOG_PARAMS
            # entry, then the five diff-stat groups.
            i = 1
            commit = {}
            for param in GIT_LOG_PARAMS:
                commit[param[0]] = unicode(rec.group(i), 'utf8')
                i += 1

            if not utils.check_email_validity(commit['author_email']):
                continue

            commit['files_changed'] = int(rec.group(i))
            i += 1
            lines_changed_group = rec.group(i)
            i += 1
            lines_changed = rec.group(i)
            i += 1
            deleted_or_inserted = rec.group(i)
            i += 1
            lines_deleted = rec.group(i)
            i += 1

            # When only one counter is present and it is a deletion count,
            # the number captured as "changed" really belongs to deletions.
            if lines_changed_group:  # there inserted or deleted lines
                if not lines_deleted:
                    if deleted_or_inserted[0] == 'd':  # deleted
                        lines_deleted = lines_changed
                        lines_changed = 0

            commit['lines_added'] = int(lines_changed or 0)
            commit['lines_deleted'] = int(lines_deleted or 0)

            # Every pattern name becomes a key, holding the (possibly empty)
            # list of distinct ids found in the commit message.
            for pattern_name, pattern in MESSAGE_PATTERNS.iteritems():
                collection = set()
                for item in re.finditer(pattern, commit['message']):
                    collection.add(item.group('id'))
                commit[pattern_name] = list(collection)

            commit['date'] = int(commit['date'])
            commit['module'] = self.repo['module']
            commit['branches'] = set([branch])
            if commit['commit_id'] in self.release_index:
                commit['release'] = self.release_index[commit['commit_id']]
            else:
                commit['release'] = None
            # Blueprint ids are qualified with the module name.
            if 'blueprint_id' in commit:
                commit['blueprint_id'] = [(commit['module'] + ':' + bp_name)
                                          for bp_name in commit['blueprint_id']
                                          ]

            yield commit