def main(self):
    """Parse the pending apache log files and store download counts.

    Walks every log file matching ``self.log_file_glob`` under
    ``self.root``, parses the new portion of each, aggregates the
    per-file/per-day/per-country download counts, and checkpoints
    progress (first line + bytes parsed) so a later run can resume.
    One transaction is committed per fully-parsed file.
    """
    self.setUpUtilities()
    # Materialize the list of files to parse. It is better to do the
    # checks now, rather than potentially hours later when the
    # generator gets around to it, because there is a reasonable
    # chance log rotation will have kicked in and removed our oldest
    # files. Note that we still error if a file we want to parse
    # disappears before we get around to parsing it, which is
    # desirable behaviour.
    files_to_parse = list(
        get_files_to_parse(
            glob.glob(os.path.join(self.root, self.log_file_glob))))
    country_set = getUtility(ICountrySet)
    # Running total used to enforce the optional per-run line budget.
    parsed_lines = 0
    # Optional cap on how many log lines one run may process; None
    # (config key absent) means "no limit".
    max_parsed_lines = getattr(config.launchpad, 'logparser_max_parsed_lines', None)
    max_is_set = max_parsed_lines is not None
    for fd, position in files_to_parse:
        # If we've used up our budget of lines to process, stop.
        if (max_is_set and parsed_lines >= max_parsed_lines):
            break
        # NOTE(review): parsed_lines is overwritten (not summed) with
        # parse_file()'s return value — this assumes parse_file reports
        # a cumulative count across calls; confirm against parse_file's
        # contract, otherwise the budget is per-file, not per-run.
        downloads, parsed_bytes, parsed_lines = parse_file(
            fd, position, self.logger, self.getDownloadKey)
        # Use a while loop here because we want to pop items from the dict
        # in order to free some memory as we go along. This is a good
        # thing here because the downloads dict may get really huge.
        while downloads:
            file_id, daily_downloads = downloads.popitem()
            update_download_count = self.getDownloadCountUpdater(file_id)
            # The object couldn't be retrieved (maybe it was deleted).
            # Don't bother counting downloads for it.
            if update_download_count is None:
                continue
            for day, country_downloads in daily_downloads.items():
                for country_code, count in country_downloads.items():
                    try:
                        country = country_set[country_code]
                    except NotFoundError:
                        # We don't know the country for the IP address
                        # where this request originated.
                        country = None
                    update_download_count(day, country, count)
        # Re-read the file's first line: it identifies the log file in
        # the checkpoint table (log rotation renames files, so the name
        # alone is not a stable key).
        fd.seek(0)
        first_line = fd.readline()
        fd.close()
        create_or_update_parsedlog_entry(first_line, parsed_bytes)
        # Commit per file so a crash mid-run loses at most one file's
        # worth of work.
        self.txn.commit()
        # fd may be a plain file (has .name) or a wrapper without one;
        # fall back to the object itself for the log message.
        if safe_hasattr(fd, 'name'):
            name = fd.name
        else:
            name = fd
        self.logger.info('Finished parsing %s' % name)
    self.logger.info('Done parsing apache log files')
def main(self):
    """Entry point: parse apache logs and record download counts.

    For each matching log file, parse the unread portion, apply the
    aggregated (day, country) counts via per-file updater callables,
    then checkpoint the parse position and commit. Stops early once
    the configured line budget (if any) is exhausted.
    """
    self.setUpUtilities()
    # Materialize the list of files to parse. It is better to do the
    # checks now, rather than potentially hours later when the
    # generator gets around to it, because there is a reasonable
    # chance log rotation will have kicked in and removed our oldest
    # files. Note that we still error if a file we want to parse
    # disappears before we get around to parsing it, which is
    # desirable behavior.
    files_to_parse = list(get_files_to_parse(
        glob.glob(os.path.join(self.root, self.log_file_glob))))
    country_set = getUtility(ICountrySet)
    # Line budget bookkeeping; a missing config key disables the cap.
    parsed_lines = 0
    max_parsed_lines = getattr(
        config.launchpad, 'logparser_max_parsed_lines', None)
    max_is_set = max_parsed_lines is not None
    for fd, position in files_to_parse:
        # If we've used up our budget of lines to process, stop.
        if (max_is_set and parsed_lines >= max_parsed_lines):
            break
        # NOTE(review): this rebinds parsed_lines to parse_file()'s
        # return value rather than accumulating it — presumably
        # parse_file returns a running total; verify, or the budget
        # check above only sees the last file's count.
        downloads, parsed_bytes, parsed_lines = parse_file(
            fd, position, self.logger, self.getDownloadKey)
        # Use a while loop here because we want to pop items from the dict
        # in order to free some memory as we go along. This is a good
        # thing here because the downloads dict may get really huge.
        while downloads:
            file_id, daily_downloads = downloads.popitem()
            update_download_count = self.getDownloadCountUpdater(file_id)
            # The object couldn't be retrieved (maybe it was deleted).
            # Don't bother counting downloads for it.
            if update_download_count is None:
                continue
            for day, country_downloads in daily_downloads.items():
                for country_code, count in country_downloads.items():
                    try:
                        country = country_set[country_code]
                    except NotFoundError:
                        # We don't know the country for the IP address
                        # where this request originated.
                        country = None
                    update_download_count(day, country, count)
        # Checkpoint: the file's first line is the stable identifier
        # stored alongside the number of bytes parsed so far.
        fd.seek(0)
        first_line = fd.readline()
        fd.close()
        create_or_update_parsedlog_entry(first_line, parsed_bytes)
        # One commit per file keeps progress durable between files.
        self.txn.commit()
        # Prefer the file's name for logging when the object has one.
        if safe_hasattr(fd, 'name'):
            name = fd.name
        else:
            name = fd
        self.logger.info('Finished parsing %s' % name)
    self.logger.info('Done parsing apache log files')
def test_creation_of_new_entries(self):
    # A first_line that is absent from the ParsedApacheLog table makes
    # create_or_update_parsedlog_entry() insert a brand-new row holding
    # the supplied byte count.
    line = u'First line'
    create_or_update_parsedlog_entry(line, parsed_bytes=len(line))
    store = IStore(ParsedApacheLog)
    row = store.find(ParsedApacheLog, first_line=line).one()
    self.assertIsNot(None, row)
    self.assertEqual(row.bytes_read, len(line))
def test_creation_of_new_entries(self):
    # Calling create_or_update_parsedlog_entry() with an unseen
    # first_line should create a fresh ParsedApacheLog record whose
    # bytes_read matches the parsed_bytes argument.
    sample = u'First line'
    create_or_update_parsedlog_entry(sample, parsed_bytes=len(sample))
    found = IStore(ParsedApacheLog).find(
        ParsedApacheLog, first_line=sample).one()
    self.assertIsNot(None, found)
    self.assertEqual(found.bytes_read, len(sample))
def test_update_of_existing_entries(self):
    # A second call to create_or_update_parsedlog_entry() with a
    # first_line that already has a ParsedApacheLog row must update
    # that same row's bytes_read instead of adding another row.
    line = u'First line'
    create_or_update_parsedlog_entry(line, parsed_bytes=2)
    store = IStore(ParsedApacheLog)
    original = store.find(ParsedApacheLog, first_line=line).one()
    # The first call created the row with the initial byte count.
    self.assertIsNot(None, original)
    self.assertEqual(original.bytes_read, 2)
    create_or_update_parsedlog_entry(line, parsed_bytes=len(line))
    # The second call updated the very same row in place.
    updated = store.find(ParsedApacheLog, first_line=line).one()
    self.assertIs(original, updated)
    self.assertIsNot(None, updated)
    self.assertEqual(updated.bytes_read, len(line))
def test_update_of_existing_entries(self):
    # When the ParsedApacheLog table already holds a row for the given
    # first_line, create_or_update_parsedlog_entry() overwrites that
    # row's bytes_read rather than inserting a duplicate.
    sample = u'First line'
    create_or_update_parsedlog_entry(sample, parsed_bytes=2)
    store = IStore(ParsedApacheLog)
    first_pass = store.find(ParsedApacheLog, first_line=sample).one()
    # Initial call: row exists with the small byte count.
    self.assertIsNot(None, first_pass)
    self.assertEqual(first_pass.bytes_read, 2)
    create_or_update_parsedlog_entry(sample, parsed_bytes=len(sample))
    second_pass = store.find(ParsedApacheLog, first_line=sample).one()
    # Follow-up call: same object, new byte count.
    self.assertIs(first_pass, second_pass)
    self.assertIsNot(None, second_pass)
    self.assertEqual(second_pass.bytes_read, len(sample))