def test_max_parsed_lines(self):
    # The max_parsed_lines config option limits the number of parsed
    # lines.
    config.push(
        'log_parser config',
        '[launchpad]\nlogparser_max_parsed_lines: 2')
    self.addCleanup(config.pop, 'log_parser config')
    fd = open(os.path.join(
        here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
    self.addCleanup(fd.close)

    downloads, parsed_bytes, parsed_lines = parse_file(
        fd, start_position=0, logger=self.logger,
        get_download_key=get_path_download_key)

    # We have initially parsed only the first two lines of data,
    # corresponding to one download (the first line is a 404 and
    # so ignored).
    self.assertEqual(parsed_lines, 2)
    date = datetime(2008, 6, 13)
    self.assertContentEqual(
        downloads.items(),
        [('/9096290/me-tv-icon-14x14.png', {date: {'AU': 1}})])
    fd.seek(0)
    lengths = [len(line) for line in fd.readlines()]
    self.assertEqual(parsed_bytes, sum(lengths[:2]))

    # And the subsequent parse will be for the 3rd and 4th lines,
    # corresponding to two downloads of the same file.
    downloads, parsed_bytes, parsed_lines = parse_file(
        fd, start_position=parsed_bytes, logger=self.logger,
        get_download_key=get_path_download_key)
    self.assertContentEqual(
        downloads.items(),
        [('/12060796/me-tv-icon-64x64.png', {date: {'AU': 1}}),
         ('/8196569/mediumubuntulogo.png', {date: {'AR': 1}})])
    self.assertEqual(parsed_bytes, sum(lengths[:4]))
    def test_parsing(self):
        # The parse_file() function returns a tuple containing a dict (mapping
        # days and library file IDs to number of downloads), the total number
        # of bytes that have been parsed from this file, and the running total
        # of lines parsed (for testing against the maximum).  In our sample
        # log, the file with ID 8196569 has been downloaded twice (once from
        # Argentina and once from Japan) and the files with ID 12060796 and
        # 9096290 have been downloaded once.  The file with ID 15018215 has
        # also been downloaded once (last line of the sample log), but
        # parse_file() always skips the last line as it may be truncated, so
        # it doesn't show up in the dict returned.
        fd = open(os.path.join(
            here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
        # Close the sample log when the test completes, as the other tests
        # here do; previously this handle was leaked.
        self.addCleanup(fd.close)
        downloads, parsed_bytes, parsed_lines = parse_file(
            fd, start_position=0, logger=self.logger,
            get_download_key=get_path_download_key)
        self.assertEqual(
            self.logger.getLogBuffer().strip(),
            'INFO Parsed 5 lines resulting in 3 download stats.')
        date = datetime(2008, 6, 13)
        self.assertContentEqual(
            downloads.items(),
            [('/12060796/me-tv-icon-64x64.png', {date: {'AU': 1}}),
             ('/8196569/mediumubuntulogo.png', {date: {'AR': 1, 'JP': 1}}),
             ('/9096290/me-tv-icon-14x14.png', {date: {'AU': 1}})])

        # The last line is skipped, so we'll record that the file has been
        # parsed until the beginning of the last line.
        self.assertNotEqual(parsed_bytes, fd.tell())
        self.assertEqual(parsed_bytes, self._getLastLineStart(fd))
Пример #3
0
    def main(self):
        """Parse pending Apache log files and store their download counts.

        Each file is fed through parse_file(); the per-day, per-country
        counts are applied via the per-file download count updater.  After
        each file a parsed-log entry is recorded and the transaction
        committed, so an interrupted run never re-counts a finished file.
        """
        self.setUpUtilities()

        # Materialize the list of files to parse. It is better to do the
        # checks now, rather than potentially hours later when the
        # generator gets around to it, because there is a reasonable
        # chance log rotation will have kicked in and removed our oldest
        # files. Note that we still error if a file we want to parse
        # disappears before we get around to parsing it, which is
        # desirable behaviour.
        files_to_parse = list(
            get_files_to_parse(
                glob.glob(os.path.join(self.root, self.log_file_glob))))

        country_set = getUtility(ICountrySet)
        parsed_lines = 0
        max_parsed_lines = getattr(config.launchpad,
                                   'logparser_max_parsed_lines', None)
        max_is_set = max_parsed_lines is not None
        for fd, position in files_to_parse:
            # If we've used up our budget of lines to process, stop.
            if (max_is_set and parsed_lines >= max_parsed_lines):
                break
            # Pass the running total in so that parse_file() enforces the
            # logparser_max_parsed_lines budget cumulatively across all
            # files; without it each file would restart the count at zero.
            downloads, parsed_bytes, parsed_lines = parse_file(
                fd, position, self.logger, self.getDownloadKey,
                parsed_lines=parsed_lines)
            # Use a while loop here because we want to pop items from the dict
            # in order to free some memory as we go along. This is a good
            # thing here because the downloads dict may get really huge.
            while downloads:
                file_id, daily_downloads = downloads.popitem()
                update_download_count = self.getDownloadCountUpdater(file_id)

                # The object couldn't be retrieved (maybe it was deleted).
                # Don't bother counting downloads for it.
                if update_download_count is None:
                    continue

                for day, country_downloads in daily_downloads.items():
                    for country_code, count in country_downloads.items():
                        try:
                            country = country_set[country_code]
                        except NotFoundError:
                            # We don't know the country for the IP address
                            # where this request originated.
                            country = None
                        update_download_count(day, country, count)
            # Record progress keyed on the file's first line (which still
            # identifies it after log rotation), then commit.
            fd.seek(0)
            first_line = fd.readline()
            fd.close()
            create_or_update_parsedlog_entry(first_line, parsed_bytes)
            self.txn.commit()
            if safe_hasattr(fd, 'name'):
                name = fd.name
            else:
                name = fd
            self.logger.info('Finished parsing %s' % name)

        self.logger.info('Done parsing apache log files')
Пример #4
0
    def main(self):
        """Parse pending Apache log files and store their download counts.

        Each file is fed through parse_file(); the per-day, per-country
        counts are applied via the per-file download count updater.  After
        each file a parsed-log entry is recorded and the transaction
        committed, so an interrupted run never re-counts a finished file.
        """
        self.setUpUtilities()

        # Materialize the list of files to parse. It is better to do the
        # checks now, rather than potentially hours later when the
        # generator gets around to it, because there is a reasonable
        # chance log rotation will have kicked in and removed our oldest
        # files. Note that we still error if a file we want to parse
        # disappears before we get around to parsing it, which is
        # desirable behavior.
        files_to_parse = list(get_files_to_parse(
            glob.glob(os.path.join(self.root, self.log_file_glob))))

        country_set = getUtility(ICountrySet)
        parsed_lines = 0
        max_parsed_lines = getattr(
            config.launchpad, 'logparser_max_parsed_lines', None)
        max_is_set = max_parsed_lines is not None
        for fd, position in files_to_parse:
            # If we've used up our budget of lines to process, stop.
            if (max_is_set and parsed_lines >= max_parsed_lines):
                break
            # Pass the running total in so that parse_file() enforces the
            # logparser_max_parsed_lines budget cumulatively across all
            # files; without it each file would restart the count at zero.
            downloads, parsed_bytes, parsed_lines = parse_file(
                fd, position, self.logger, self.getDownloadKey,
                parsed_lines=parsed_lines)
            # Use a while loop here because we want to pop items from the dict
            # in order to free some memory as we go along. This is a good
            # thing here because the downloads dict may get really huge.
            while downloads:
                file_id, daily_downloads = downloads.popitem()
                update_download_count = self.getDownloadCountUpdater(file_id)

                # The object couldn't be retrieved (maybe it was deleted).
                # Don't bother counting downloads for it.
                if update_download_count is None:
                    continue

                for day, country_downloads in daily_downloads.items():
                    for country_code, count in country_downloads.items():
                        try:
                            country = country_set[country_code]
                        except NotFoundError:
                            # We don't know the country for the IP address
                            # where this request originated.
                            country = None
                        update_download_count(day, country, count)
            # Record progress keyed on the file's first line (which still
            # identifies it after log rotation), then commit.
            fd.seek(0)
            first_line = fd.readline()
            fd.close()
            create_or_update_parsedlog_entry(first_line, parsed_bytes)
            self.txn.commit()
            if safe_hasattr(fd, 'name'):
                name = fd.name
            else:
                name = fd
            self.logger.info('Finished parsing %s' % name)

        self.logger.info('Done parsing apache log files')
 def test_unexpected_error_while_parsing(self):
     # When there's an unexpected error, we log it and return as if we had
     # parsed up to the line before the one where the failure occurred.
     # Here we force an unexpected error on the first line.
     fd = StringIO('Not a log')
     downloads, parsed_bytes, parsed_lines = parse_file(
         fd,
         start_position=0,
         logger=self.logger,
         get_download_key=get_path_download_key)
     self.assertEqual(downloads, {})
     self.assertEqual(parsed_bytes, 0)
     self.assertIn('Error', self.logger.getLogBuffer())
 def _assertRequestWithGivenMethodIsIgnored(self, method):
     """Assert that requests with the given method are ignored."""
     fd = StringIO(self.sample_line % dict(status='200', method=method))
     downloads, parsed_bytes, parsed_lines = parse_file(
         fd,
         start_position=0,
         logger=self.logger,
         get_download_key=get_path_download_key)
     self.assertEqual(self.logger.getLogBuffer().strip(),
                      'INFO Parsed 1 lines resulting in 0 download stats.')
     self.assertEqual(downloads, {})
     self.assertEqual(parsed_bytes, fd.tell())
 def _assertRequestWithGivenMethodIsIgnored(self, method):
     """Assert that requests with the given method are ignored."""
     fd = StringIO(
         self.sample_line % dict(status='200', method=method))
     downloads, parsed_bytes, parsed_lines = parse_file(
         fd, start_position=0, logger=self.logger,
         get_download_key=get_path_download_key)
     self.assertEqual(
         self.logger.getLogBuffer().strip(),
         'INFO Parsed 1 lines resulting in 0 download stats.')
     self.assertEqual(downloads, {})
     self.assertEqual(parsed_bytes, fd.tell())
 def test_unexpected_error_while_parsing(self):
     # When there's an unexpected error, we log it and return as if we had
     # parsed up to the line before the one where the failure occurred.
     # Here we force an unexpected error on the first line.
     fd = StringIO('Not a log')
     downloads, parsed_bytes, parsed_lines = parse_file(
         fd, start_position=0, logger=self.logger,
         get_download_key=get_path_download_key)
     self.assertIn('Error', self.logger.getLogBuffer())
     self.assertEqual(downloads, {})
     self.assertEqual(parsed_bytes, 0)
    def test_max_parsed_lines(self):
        # The max_parsed_lines config option limits the number of parsed
        # lines.
        config.push('log_parser config',
                    '[launchpad]\nlogparser_max_parsed_lines: 2')
        self.addCleanup(config.pop, 'log_parser config')
        fd = open(
            os.path.join(here, 'apache-log-files',
                         'launchpadlibrarian.net.access-log'))
        self.addCleanup(fd.close)

        downloads, parsed_bytes, parsed_lines = parse_file(
            fd,
            start_position=0,
            logger=self.logger,
            get_download_key=get_path_download_key)

        # We have initially parsed only the first two lines of data,
        # corresponding to one download (the first line is a 404 and
        # so ignored).
        self.assertEqual(parsed_lines, 2)
        date = datetime(2008, 6, 13)
        self.assertContentEqual(
            downloads.items(),
            [('/9096290/me-tv-icon-14x14.png', {date: {'AU': 1}})])
        fd.seek(0)
        lengths = [len(line) for line in fd.readlines()]
        self.assertEqual(parsed_bytes, sum(lengths[:2]))

        # And the subsequent parse will be for the 3rd and 4th lines,
        # corresponding to two downloads of the same file.
        downloads, parsed_bytes, parsed_lines = parse_file(
            fd,
            start_position=parsed_bytes,
            logger=self.logger,
            get_download_key=get_path_download_key)
        self.assertContentEqual(
            downloads.items(),
            [('/12060796/me-tv-icon-64x64.png', {date: {'AU': 1}}),
             ('/8196569/mediumubuntulogo.png', {date: {'AR': 1}})])
        self.assertEqual(parsed_bytes, sum(lengths[:4]))
 def test_request_to_non_lfa_is_ignored(self):
     # A request to a path which doesn't map to a LibraryFileAlias (e.g.
     # '/') is ignored.
     fd = StringIO(
         '69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET / HTTP/1.1" '
         '200 2261 "https://launchpad.net/~ubuntulite/+archive" "Mozilla"')
     downloads, parsed_bytes, ignored = parse_file(
         fd,
         start_position=0,
         logger=self.logger,
         get_download_key=get_library_file_id)
     self.assertEqual(self.logger.getLogBuffer().strip(),
                      'INFO Parsed 1 lines resulting in 0 download stats.')
     self.assertEqual(downloads, {})
     self.assertEqual(parsed_bytes, fd.tell())
Пример #11
0
 def test_request_to_non_lfa_is_ignored(self):
     # A request to a path which doesn't map to a LibraryFileAlias (e.g.
     # '/') is ignored.
     fd = StringIO(
         '69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET / HTTP/1.1" '
         '200 2261 "https://launchpad.net/~ubuntulite/+archive" "Mozilla"')
     downloads, parsed_bytes, ignored = parse_file(
         fd, start_position=0, logger=self.logger,
         get_download_key=get_library_file_id)
     self.assertEqual(
         self.logger.getLogBuffer().strip(),
         'INFO Parsed 1 lines resulting in 0 download stats.')
     self.assertEqual(downloads, {})
     self.assertEqual(parsed_bytes, fd.tell())
    def test_normal_request_is_not_ignored(self):
        fd = StringIO(self.sample_line % dict(status=200, method='GET'))
        downloads, parsed_bytes, parsed_lines = parse_file(
            fd,
            start_position=0,
            logger=self.logger,
            get_download_key=get_path_download_key)
        self.assertEqual(self.logger.getLogBuffer().strip(),
                         'INFO Parsed 1 lines resulting in 1 download stats.')

        self.assertEqual(downloads, {
            '/15018215/ul_logo_64x64.png': {
                datetime(2008, 6, 13): {'US': 1}}})

        self.assertEqual(parsed_bytes, fd.tell())
    def test_request_to_lfa_is_parsed(self):
        fd = StringIO('69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET '
                      '/15018215/ul_logo_64x64.png HTTP/1.1" 200 2261 '
                      '"https://launchpad.net/~ubuntulite/+archive" "Mozilla"')
        downloads, parsed_bytes, ignored = parse_file(
            fd,
            start_position=0,
            logger=self.logger,
            get_download_key=get_library_file_id)
        self.assertEqual(self.logger.getLogBuffer().strip(),
                         'INFO Parsed 1 lines resulting in 1 download stats.')

        self.assertEqual(
            downloads,
            {'15018215': {datetime(2008, 6, 13): {'US': 1}}})

        self.assertEqual(parsed_bytes, fd.tell())
    def test_parsing_last_line(self):
        # When there's only the last line of a given file for us to parse, we
        # assume the file has been rotated and it's safe to parse its last
        # line without worrying about whether or not it's been truncated.
        fd = open(os.path.join(
            here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
        # Close the sample log when the test completes, as the other tests
        # here do; previously this handle was leaked.
        self.addCleanup(fd.close)
        downloads, parsed_bytes, parsed_lines = parse_file(
            fd, start_position=self._getLastLineStart(fd), logger=self.logger,
            get_download_key=get_path_download_key)
        self.assertEqual(
            self.logger.getLogBuffer().strip(),
            'INFO Parsed 1 lines resulting in 1 download stats.')
        self.assertEqual(parsed_bytes, fd.tell())

        self.assertContentEqual(
            downloads.items(),
            [('/15018215/ul_logo_64x64.png',
              {datetime(2008, 6, 13): {'US': 1}})])
    def test_max_parsed_lines_exceeded(self):
        # Show that if a non-zero parsed_lines is passed in, the number of
        # lines parsed will be less than it would otherwise have been.

        # The max_parsed_lines config option limits the number of parsed
        # lines.
        config.push(
            'log_parser config',
            '[launchpad]\nlogparser_max_parsed_lines: 2')
        self.addCleanup(config.pop, 'log_parser config')
        fd = open(os.path.join(
            here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
        self.addCleanup(fd.close)

        # We want to start parsing on line 2 so we will have a value in
        # "downloads" to make a positive assertion about.  (The first line is
        # a 404 so wouldn't generate any output.)
        start_position = len(fd.readline())

        # If we have already parsed some lines, then the number of lines
        # parsed will be passed in (parsed_lines argument) and parse_file will
        # take that number into account when determining if the maximum number
        # of lines to parse has been reached.
        downloads, parsed_bytes, parsed_lines = parse_file(
            fd, start_position=start_position, logger=self.logger,
            get_download_key=get_path_download_key, parsed_lines=1)

        # The total number of lines parsed during the run (1 line) plus the
        # number of lines parsed previously (1 line, as passed in via
        # parsed_lines) is returned.
        self.assertEqual(parsed_lines, 2)
        # Since we told parse_file that we had already parsed 1 line and the
        # limit is 2 lines, it only parsed a single line.
        date = datetime(2008, 6, 13)
        self.assertContentEqual(
            downloads.items(),
            [('/9096290/me-tv-icon-14x14.png', {date: {'AU': 1}})])
    def test_parsing(self):
        # The parse_file() function returns a tuple containing a dict (mapping
        # days and library file IDs to number of downloads), the total number
        # of bytes that have been parsed from this file, and the running total
        # of lines parsed (for testing against the maximum).  In our sample
        # log, the file with ID 8196569 has been downloaded twice (once from
        # Argentina and once from Japan) and the files with ID 12060796 and
        # 9096290 have been downloaded once.  The file with ID 15018215 has
        # also been downloaded once (last line of the sample log), but
        # parse_file() always skips the last line as it may be truncated, so
        # it doesn't show up in the dict returned.
        fd = open(
            os.path.join(here, 'apache-log-files',
                         'launchpadlibrarian.net.access-log'))
        # Close the sample log when the test completes, as the other tests
        # here do; previously this handle was leaked.
        self.addCleanup(fd.close)
        downloads, parsed_bytes, parsed_lines = parse_file(
            fd,
            start_position=0,
            logger=self.logger,
            get_download_key=get_path_download_key)
        self.assertEqual(self.logger.getLogBuffer().strip(),
                         'INFO Parsed 5 lines resulting in 3 download stats.')
        date = datetime(2008, 6, 13)
        self.assertContentEqual(
            downloads.items(),
            [('/12060796/me-tv-icon-64x64.png', {date: {'AU': 1}}),
             ('/8196569/mediumubuntulogo.png', {date: {'AR': 1, 'JP': 1}}),
             ('/9096290/me-tv-icon-14x14.png', {date: {'AU': 1}})])

        # The last line is skipped, so we'll record that the file has been
        # parsed until the beginning of the last line.
        self.assertNotEqual(parsed_bytes, fd.tell())
        self.assertEqual(parsed_bytes, self._getLastLineStart(fd))
    def test_normal_request_is_not_ignored(self):
        fd = StringIO(
            self.sample_line % dict(status=200, method='GET'))
        downloads, parsed_bytes, parsed_lines = parse_file(
            fd, start_position=0, logger=self.logger,
            get_download_key=get_path_download_key)
        self.assertEqual(
            self.logger.getLogBuffer().strip(),
            'INFO Parsed 1 lines resulting in 1 download stats.')

        self.assertEqual(
            downloads,
            {'/15018215/ul_logo_64x64.png':
                {datetime(2008, 6, 13): {'US': 1}}})

        self.assertEqual(parsed_bytes, fd.tell())
Пример #18
0
    def test_request_to_lfa_is_parsed(self):
        fd = StringIO(
            '69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET '
            '/15018215/ul_logo_64x64.png HTTP/1.1" 200 2261 '
            '"https://launchpad.net/~ubuntulite/+archive" "Mozilla"')
        downloads, parsed_bytes, ignored = parse_file(
            fd, start_position=0, logger=self.logger,
            get_download_key=get_library_file_id)
        self.assertEqual(
            self.logger.getLogBuffer().strip(),
            'INFO Parsed 1 lines resulting in 1 download stats.')

        self.assertEqual(
            downloads,
            {'15018215': {datetime(2008, 6, 13): {'US': 1}}})

        self.assertEqual(parsed_bytes, fd.tell())
    def test_parsing_last_line(self):
        # When there's only the last line of a given file for us to parse, we
        # assume the file has been rotated and it's safe to parse its last
        # line without worrying about whether or not it's been truncated.
        fd = open(
            os.path.join(here, 'apache-log-files',
                         'launchpadlibrarian.net.access-log'))
        # Close the sample log when the test completes, as the other tests
        # here do; previously this handle was leaked.
        self.addCleanup(fd.close)
        downloads, parsed_bytes, parsed_lines = parse_file(
            fd,
            start_position=self._getLastLineStart(fd),
            logger=self.logger,
            get_download_key=get_path_download_key)
        self.assertEqual(self.logger.getLogBuffer().strip(),
                         'INFO Parsed 1 lines resulting in 1 download stats.')
        self.assertEqual(parsed_bytes, fd.tell())

        self.assertContentEqual(
            downloads.items(),
            [('/15018215/ul_logo_64x64.png',
              {datetime(2008, 6, 13): {'US': 1}})])
    def test_max_parsed_lines_exceeded(self):
        # Show that if a non-zero parsed_lines is passed in, the number of
        # lines parsed will be less than it would otherwise have been.

        # The max_parsed_lines config option limits the number of parsed
        # lines.
        config.push('log_parser config',
                    '[launchpad]\nlogparser_max_parsed_lines: 2')
        self.addCleanup(config.pop, 'log_parser config')
        fd = open(
            os.path.join(here, 'apache-log-files',
                         'launchpadlibrarian.net.access-log'))
        self.addCleanup(fd.close)

        # We want to start parsing on line 2 so we will have a value in
        # "downloads" to make a positive assertion about.  (The first line is
        # a 404 so wouldn't generate any output.)
        start_position = len(fd.readline())

        # If we have already parsed some lines, then the number of lines
        # parsed will be passed in (parsed_lines argument) and parse_file will
        # take that number into account when determining if the maximum number
        # of lines to parse has been reached.
        previously_parsed = 1
        downloads, parsed_bytes, parsed_lines = parse_file(
            fd,
            start_position=start_position,
            logger=self.logger,
            get_download_key=get_path_download_key,
            parsed_lines=previously_parsed)

        # The total number of lines parsed during the run (1 line) plus the
        # number of lines parsed previously (1 line, as passed in via
        # parsed_lines) is returned.
        self.assertEqual(parsed_lines, 2)
        # Since we told parse_file that we had already parsed 1 line and the
        # limit is 2 lines, it only parsed a single line.
        date = datetime(2008, 6, 13)
        self.assertContentEqual(
            downloads.items(),
            [('/9096290/me-tv-icon-14x14.png', {date: {'AU': 1}})])