def test_max_parsed_lines(self):
    # The logparser_max_parsed_lines config option caps how many log
    # lines a single parse_file() call will consume.
    config.push(
        'log_parser config',
        '[launchpad]\nlogparser_max_parsed_lines: 2')
    self.addCleanup(config.pop, 'log_parser config')
    fd = open(os.path.join(
        here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
    self.addCleanup(fd.close)
    downloads, parsed_bytes, parsed_lines = parse_file(
        fd, start_position=0, logger=self.logger,
        get_download_key=get_path_download_key)
    # Only the first two lines of data were parsed, amounting to a
    # single download (line 1 is a 404 and so is ignored).
    self.assertEqual(parsed_lines, 2)
    date = datetime(2008, 6, 13)
    self.assertContentEqual(
        downloads.items(),
        [('/9096290/me-tv-icon-14x14.png', {date: {'AU': 1}})])
    fd.seek(0)
    line_lengths = [len(line) for line in fd.readlines()]
    self.assertEqual(parsed_bytes, sum(line_lengths[:2]))
    # A subsequent call picks up where the first stopped: lines 3 and
    # 4, corresponding to two more downloads.
    downloads, parsed_bytes, parsed_lines = parse_file(
        fd, start_position=parsed_bytes, logger=self.logger,
        get_download_key=get_path_download_key)
    self.assertContentEqual(
        downloads.items(),
        [('/12060796/me-tv-icon-64x64.png', {date: {'AU': 1}}),
         ('/8196569/mediumubuntulogo.png', {date: {'AR': 1}})])
    self.assertEqual(parsed_bytes, sum(line_lengths[:4]))
def test_parsing(self):
    # parse_file() returns a 3-tuple: a dict mapping download keys to
    # per-day, per-country download counts; the number of bytes parsed
    # from the file; and the running total of lines parsed (used for
    # checking against the configured maximum).  In our sample log the
    # file with ID 8196569 was downloaded twice (once from Argentina
    # and once from Japan) while IDs 12060796 and 9096290 were each
    # downloaded once.  ID 15018215 was also downloaded once (on the
    # last line of the sample log), but parse_file() always skips the
    # last line as it may be truncated, so it is absent from the dict.
    fd = open(os.path.join(
        here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
    downloads, parsed_bytes, parsed_lines = parse_file(
        fd, start_position=0, logger=self.logger,
        get_download_key=get_path_download_key)
    self.assertEqual(
        self.logger.getLogBuffer().strip(),
        'INFO Parsed 5 lines resulting in 3 download stats.')
    date = datetime(2008, 6, 13)
    expected = [
        ('/12060796/me-tv-icon-64x64.png', {date: {'AU': 1}}),
        ('/8196569/mediumubuntulogo.png', {date: {'AR': 1, 'JP': 1}}),
        ('/9096290/me-tv-icon-14x14.png', {date: {'AU': 1}}),
        ]
    self.assertContentEqual(downloads.items(), expected)
    # Since the last line is skipped, the recorded parse position is
    # the start of that line rather than the end of the file.
    self.assertNotEqual(parsed_bytes, fd.tell())
    self.assertEqual(parsed_bytes, self._getLastLineStart(fd))
def main(self):
    """Parse the Apache log files and record their download counts.

    Iterates over every log file matched by self.log_file_glob,
    aggregates per-file/day/country download counts, commits them, and
    records how far into each file parsing has progressed.
    """
    self.setUpUtilities()
    # Materialize the list of files to parse. It is better to do the
    # checks now, rather than potentially hours later when the
    # generator gets around to it, because there is a reasonable
    # chance log rotation will have kicked in and removed our oldest
    # files. Note that we still error if a file we want to parse
    # disappears before we get around to parsing it, which is
    # desirable behaviour.
    files_to_parse = list(
        get_files_to_parse(
            glob.glob(os.path.join(self.root, self.log_file_glob))))
    country_set = getUtility(ICountrySet)
    parsed_lines = 0
    max_parsed_lines = getattr(
        config.launchpad, 'logparser_max_parsed_lines', None)
    max_is_set = max_parsed_lines is not None
    for fd, position in files_to_parse:
        # If we've used up our budget of lines to process, stop.
        if max_is_set and parsed_lines >= max_parsed_lines:
            break
        # Pass the running total of parsed lines so that the
        # max_parsed_lines budget is enforced across all files rather
        # than per-file; parse_file() returns the new running total.
        downloads, parsed_bytes, parsed_lines = parse_file(
            fd, position, self.logger, self.getDownloadKey,
            parsed_lines=parsed_lines)
        # Use a while loop here because we want to pop items from the
        # dict in order to free some memory as we go along. This is a
        # good thing here because the downloads dict may get really
        # huge.
        while downloads:
            file_id, daily_downloads = downloads.popitem()
            update_download_count = self.getDownloadCountUpdater(file_id)
            # The object couldn't be retrieved (maybe it was deleted).
            # Don't bother counting downloads for it.
            if update_download_count is None:
                continue
            for day, country_downloads in daily_downloads.items():
                for country_code, count in country_downloads.items():
                    try:
                        country = country_set[country_code]
                    except NotFoundError:
                        # We don't know the country for the IP address
                        # where this request originated.
                        country = None
                    update_download_count(day, country, count)
        fd.seek(0)
        first_line = fd.readline()
        fd.close()
        create_or_update_parsedlog_entry(first_line, parsed_bytes)
        self.txn.commit()
        if safe_hasattr(fd, 'name'):
            name = fd.name
        else:
            name = fd
        self.logger.info('Finished parsing %s' % name)
    self.logger.info('Done parsing apache log files')
def main(self):
    """Parse the Apache log files and record their download counts.

    Iterates over every log file matched by self.log_file_glob,
    aggregates per-file/day/country download counts, commits them, and
    records how far into each file parsing has progressed.
    """
    self.setUpUtilities()
    # Materialize the list of files to parse. It is better to do the
    # checks now, rather than potentially hours later when the
    # generator gets around to it, because there is a reasonable
    # chance log rotation will have kicked in and removed our oldest
    # files. Note that we still error if a file we want to parse
    # disappears before we get around to parsing it, which is
    # desirable behavior.
    files_to_parse = list(get_files_to_parse(
        glob.glob(os.path.join(self.root, self.log_file_glob))))
    country_set = getUtility(ICountrySet)
    parsed_lines = 0
    max_parsed_lines = getattr(
        config.launchpad, 'logparser_max_parsed_lines', None)
    max_is_set = max_parsed_lines is not None
    for fd, position in files_to_parse:
        # If we've used up our budget of lines to process, stop.
        if max_is_set and parsed_lines >= max_parsed_lines:
            break
        # Feed the running total back into parse_file() so the
        # max_parsed_lines budget applies to the combined total across
        # files, not to each file individually; parse_file() returns
        # the updated running total.
        downloads, parsed_bytes, parsed_lines = parse_file(
            fd, position, self.logger, self.getDownloadKey,
            parsed_lines=parsed_lines)
        # Use a while loop here because we want to pop items from the
        # dict in order to free some memory as we go along. This is a
        # good thing here because the downloads dict may get really
        # huge.
        while downloads:
            file_id, daily_downloads = downloads.popitem()
            update_download_count = self.getDownloadCountUpdater(file_id)
            # The object couldn't be retrieved (maybe it was deleted).
            # Don't bother counting downloads for it.
            if update_download_count is None:
                continue
            for day, country_downloads in daily_downloads.items():
                for country_code, count in country_downloads.items():
                    try:
                        country = country_set[country_code]
                    except NotFoundError:
                        # We don't know the country for the IP address
                        # where this request originated.
                        country = None
                    update_download_count(day, country, count)
        fd.seek(0)
        first_line = fd.readline()
        fd.close()
        create_or_update_parsedlog_entry(first_line, parsed_bytes)
        self.txn.commit()
        if safe_hasattr(fd, 'name'):
            name = fd.name
        else:
            name = fd
        self.logger.info('Finished parsing %s' % name)
    self.logger.info('Done parsing apache log files')
def test_unexpected_error_while_parsing(self):
    # When there's an unexpected error, we log it and return as if we
    # had parsed up to the line before the one where the failure
    # occurred.  Here the very first line triggers the error.
    fd = StringIO('Not a log')
    downloads, parsed_bytes, parsed_lines = parse_file(
        fd, start_position=0, logger=self.logger,
        get_download_key=get_path_download_key)
    self.assertIn('Error', self.logger.getLogBuffer())
    # Nothing was recorded and no bytes count as parsed.
    self.assertEqual(downloads, {})
    self.assertEqual(parsed_bytes, 0)
def _assertRequestWithGivenMethodIsIgnored(self, method):
    """Assert that requests with the given method are ignored."""
    fd = StringIO(self.sample_line % dict(status='200', method=method))
    downloads, parsed_bytes, parsed_lines = parse_file(
        fd, start_position=0, logger=self.logger,
        get_download_key=get_path_download_key)
    # The line is consumed in full but yields no download stats.
    self.assertEqual(
        self.logger.getLogBuffer().strip(),
        'INFO Parsed 1 lines resulting in 0 download stats.')
    self.assertEqual(downloads, {})
    self.assertEqual(parsed_bytes, fd.tell())
def _assertRequestWithGivenMethodIsIgnored(self, method):
    """Assert that requests with the given method are ignored."""
    line = self.sample_line % dict(status='200', method=method)
    fd = StringIO(line)
    result = parse_file(
        fd, start_position=0, logger=self.logger,
        get_download_key=get_path_download_key)
    downloads, parsed_bytes, parsed_lines = result
    # The whole line was read, yet no stats were produced.
    self.assertEqual(
        self.logger.getLogBuffer().strip(),
        'INFO Parsed 1 lines resulting in 0 download stats.')
    self.assertEqual(downloads, {})
    self.assertEqual(parsed_bytes, fd.tell())
def test_max_parsed_lines(self):
    # logparser_max_parsed_lines limits how many lines one call to
    # parse_file() will process.
    config.push(
        'log_parser config',
        '[launchpad]\nlogparser_max_parsed_lines: 2')
    self.addCleanup(config.pop, 'log_parser config')
    fd = open(os.path.join(
        here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
    self.addCleanup(fd.close)
    downloads, parsed_bytes, parsed_lines = parse_file(
        fd, start_position=0, logger=self.logger,
        get_download_key=get_path_download_key)
    # The first parse stops after two lines of data, which amount to
    # one download (the first line is a 404 and so is ignored).
    self.assertEqual(parsed_lines, 2)
    date = datetime(2008, 6, 13)
    self.assertContentEqual(
        downloads.items(),
        [('/9096290/me-tv-icon-14x14.png', {date: {'AU': 1}})])
    fd.seek(0)
    lines = fd.readlines()
    line_lengths = [len(line) for line in lines]
    self.assertEqual(parsed_bytes, sum(line_lengths[:2]))
    # The next parse covers the 3rd and 4th lines, which are two
    # downloads.
    downloads, parsed_bytes, parsed_lines = parse_file(
        fd, start_position=parsed_bytes, logger=self.logger,
        get_download_key=get_path_download_key)
    self.assertContentEqual(
        downloads.items(),
        [('/12060796/me-tv-icon-64x64.png', {date: {'AU': 1}}),
         ('/8196569/mediumubuntulogo.png', {date: {'AR': 1}})])
    self.assertEqual(parsed_bytes, sum(line_lengths[:4]))
def test_request_to_non_lfa_is_ignored(self):
    # A request for a path that doesn't map to a LibraryFileAlias
    # (e.g. '/') is ignored.
    fd = StringIO(
        '69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET / HTTP/1.1" '
        '200 2261 "https://launchpad.net/~ubuntulite/+archive" "Mozilla"')
    downloads, parsed_bytes, ignored = parse_file(
        fd, start_position=0, logger=self.logger,
        get_download_key=get_library_file_id)
    # The line was consumed but produced no stats.
    self.assertEqual(
        self.logger.getLogBuffer().strip(),
        'INFO Parsed 1 lines resulting in 0 download stats.')
    self.assertEqual(downloads, {})
    self.assertEqual(parsed_bytes, fd.tell())
def test_request_to_non_lfa_is_ignored(self):
    # Requests whose path doesn't correspond to a LibraryFileAlias
    # (for instance '/') must be skipped entirely.
    sample = (
        '69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET / HTTP/1.1" '
        '200 2261 "https://launchpad.net/~ubuntulite/+archive" "Mozilla"')
    fd = StringIO(sample)
    downloads, parsed_bytes, ignored = parse_file(
        fd, start_position=0, logger=self.logger,
        get_download_key=get_library_file_id)
    self.assertEqual(
        self.logger.getLogBuffer().strip(),
        'INFO Parsed 1 lines resulting in 0 download stats.')
    self.assertEqual(downloads, {})
    self.assertEqual(parsed_bytes, fd.tell())
def test_normal_request_is_not_ignored(self):
    # A plain successful GET is counted as a download.
    fd = StringIO(self.sample_line % dict(status=200, method='GET'))
    downloads, parsed_bytes, parsed_lines = parse_file(
        fd, start_position=0, logger=self.logger,
        get_download_key=get_path_download_key)
    self.assertEqual(
        self.logger.getLogBuffer().strip(),
        'INFO Parsed 1 lines resulting in 1 download stats.')
    self.assertEqual(
        downloads,
        {'/15018215/ul_logo_64x64.png': {datetime(2008, 6, 13): {'US': 1}}})
    self.assertEqual(parsed_bytes, fd.tell())
def test_request_to_lfa_is_parsed(self):
    # A request that maps to a LibraryFileAlias is recorded under the
    # library file's ID.
    fd = StringIO(
        '69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET '
        '/15018215/ul_logo_64x64.png HTTP/1.1" 200 2261 '
        '"https://launchpad.net/~ubuntulite/+archive" "Mozilla"')
    downloads, parsed_bytes, ignored = parse_file(
        fd, start_position=0, logger=self.logger,
        get_download_key=get_library_file_id)
    self.assertEqual(
        self.logger.getLogBuffer().strip(),
        'INFO Parsed 1 lines resulting in 1 download stats.')
    self.assertEqual(
        downloads, {'15018215': {datetime(2008, 6, 13): {'US': 1}}})
    self.assertEqual(parsed_bytes, fd.tell())
def test_parsing_last_line(self):
    # When only the last line of a file remains to be parsed, we
    # assume the file has been rotated, so it's safe to parse that
    # line without worrying about whether it's been truncated.
    fd = open(os.path.join(
        here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
    downloads, parsed_bytes, parsed_lines = parse_file(
        fd, start_position=self._getLastLineStart(fd), logger=self.logger,
        get_download_key=get_path_download_key)
    self.assertEqual(
        self.logger.getLogBuffer().strip(),
        'INFO Parsed 1 lines resulting in 1 download stats.')
    # This time the parse position reaches the end of the file.
    self.assertEqual(parsed_bytes, fd.tell())
    self.assertContentEqual(
        downloads.items(),
        [('/15018215/ul_logo_64x64.png',
          {datetime(2008, 6, 13): {'US': 1}})])
def test_max_parsed_lines_exceeded(self):
    # If a non-zero parsed_lines count is passed in, fewer lines are
    # parsed than would otherwise have been, because the
    # logparser_max_parsed_lines budget counts the lines already
    # parsed.
    config.push(
        'log_parser config',
        '[launchpad]\nlogparser_max_parsed_lines: 2')
    self.addCleanup(config.pop, 'log_parser config')
    fd = open(os.path.join(
        here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
    self.addCleanup(fd.close)
    # Start parsing on line 2 so "downloads" has something in it to
    # make a positive assertion about.  (The first line is a 404 and
    # so would generate no output.)
    start_position = len(fd.readline())
    # Claim one line has already been parsed; parse_file() must count
    # it towards the maximum.
    previously_parsed = 1
    downloads, parsed_bytes, parsed_lines = parse_file(
        fd, start_position=start_position, logger=self.logger,
        get_download_key=get_path_download_key,
        parsed_lines=previously_parsed)
    # The returned total is the lines parsed in this run (1) plus the
    # lines reported as parsed previously (1).
    self.assertEqual(parsed_lines, 2)
    # With one line already accounted for and a limit of two, only a
    # single additional line was parsed.
    date = datetime(2008, 6, 13)
    self.assertContentEqual(
        downloads.items(),
        [('/9096290/me-tv-icon-14x14.png', {date: {'AU': 1}})])
def test_parsing(self):
    # parse_file() returns a tuple of (downloads dict mapping download
    # keys to per-day/per-country counts, bytes parsed from the file,
    # running total of lines parsed).  In the sample log, 8196569 was
    # downloaded twice (Argentina and Japan) while 12060796 and
    # 9096290 were downloaded once each.  15018215 was downloaded once
    # too, on the last line of the log, but parse_file() always skips
    # the last line as it may be truncated, so it's not in the result.
    fd = open(os.path.join(
        here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
    downloads, parsed_bytes, parsed_lines = parse_file(
        fd, start_position=0, logger=self.logger,
        get_download_key=get_path_download_key)
    self.assertEqual(
        self.logger.getLogBuffer().strip(),
        'INFO Parsed 5 lines resulting in 3 download stats.')
    date = datetime(2008, 6, 13)
    self.assertContentEqual(
        downloads.items(),
        [('/12060796/me-tv-icon-64x64.png', {date: {'AU': 1}}),
         ('/8196569/mediumubuntulogo.png', {date: {'AR': 1, 'JP': 1}}),
         ('/9096290/me-tv-icon-14x14.png', {date: {'AU': 1}})])
    # Because the last line is skipped, the parse position recorded is
    # the beginning of the last line, not the end of the file.
    self.assertNotEqual(parsed_bytes, fd.tell())
    self.assertEqual(parsed_bytes, self._getLastLineStart(fd))
def test_normal_request_is_not_ignored(self):
    # An ordinary successful GET request produces one download stat.
    line = self.sample_line % dict(status=200, method='GET')
    fd = StringIO(line)
    downloads, parsed_bytes, parsed_lines = parse_file(
        fd, start_position=0, logger=self.logger,
        get_download_key=get_path_download_key)
    self.assertEqual(
        self.logger.getLogBuffer().strip(),
        'INFO Parsed 1 lines resulting in 1 download stats.')
    expected = {
        '/15018215/ul_logo_64x64.png': {datetime(2008, 6, 13): {'US': 1}}}
    self.assertEqual(downloads, expected)
    self.assertEqual(parsed_bytes, fd.tell())
def test_request_to_lfa_is_parsed(self):
    # A request mapping to a LibraryFileAlias is counted against the
    # library file's ID.
    sample = (
        '69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET '
        '/15018215/ul_logo_64x64.png HTTP/1.1" 200 2261 '
        '"https://launchpad.net/~ubuntulite/+archive" "Mozilla"')
    fd = StringIO(sample)
    downloads, parsed_bytes, ignored = parse_file(
        fd, start_position=0, logger=self.logger,
        get_download_key=get_library_file_id)
    self.assertEqual(
        self.logger.getLogBuffer().strip(),
        'INFO Parsed 1 lines resulting in 1 download stats.')
    expected = {'15018215': {datetime(2008, 6, 13): {'US': 1}}}
    self.assertEqual(downloads, expected)
    self.assertEqual(parsed_bytes, fd.tell())
def test_parsing_last_line(self):
    # If the last line of a file is all that's left to parse, we treat
    # the file as rotated: the final line can't be a partial write, so
    # parsing it is safe.
    fd = open(os.path.join(
        here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
    last_line_start = self._getLastLineStart(fd)
    downloads, parsed_bytes, parsed_lines = parse_file(
        fd, start_position=last_line_start, logger=self.logger,
        get_download_key=get_path_download_key)
    self.assertEqual(
        self.logger.getLogBuffer().strip(),
        'INFO Parsed 1 lines resulting in 1 download stats.')
    # The parse position now reaches the end of the file.
    self.assertEqual(parsed_bytes, fd.tell())
    self.assertContentEqual(
        downloads.items(),
        [('/15018215/ul_logo_64x64.png',
          {datetime(2008, 6, 13): {'US': 1}})])
def test_max_parsed_lines_exceeded(self):
    # Passing a non-zero parsed_lines makes parse_file() process fewer
    # lines than it otherwise would: the logparser_max_parsed_lines
    # cap applies to the combined total.
    config.push(
        'log_parser config',
        '[launchpad]\nlogparser_max_parsed_lines: 2')
    self.addCleanup(config.pop, 'log_parser config')
    fd = open(os.path.join(
        here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
    self.addCleanup(fd.close)
    # Begin at line 2 so that "downloads" ends up non-empty and we can
    # make a positive assertion.  (Line 1 is a 404, which yields no
    # output.)
    start_position = len(fd.readline())
    # Tell parse_file() that one line was already parsed earlier; it
    # must take that into account when enforcing the maximum.
    parsed_lines = 1
    downloads, parsed_bytes, parsed_lines = parse_file(
        fd, start_position=start_position, logger=self.logger,
        get_download_key=get_path_download_key,
        parsed_lines=parsed_lines)
    # Returned total = 1 line parsed now + 1 line parsed previously.
    self.assertEqual(parsed_lines, 2)
    # Since one line was pre-counted against the two-line limit, only
    # one more line got parsed.
    date = datetime(2008, 6, 13)
    self.assertContentEqual(
        downloads.items(),
        [('/9096290/me-tv-icon-14x14.png', {date: {'AU': 1}})])