def test_fresh_gzipped_file(self):
    # get_files_to_parse() handles gzipped files just like uncompressed
    # ones. The first time we see one, we'll parse from the beginning.
    gz_name = 'launchpadlibrarian.net.access-log.1.gz'
    gz_path = os.path.join(self.root, gz_name)
    files_to_parse = get_files_to_parse([gz_path])
    # map() is lazy on Python 3, so materialize it before comparing
    # against a list literal.
    positions = list(map(itemgetter(1), files_to_parse))
    self.assertEqual(positions, [0])
def main(self):
    """Parse all matching Apache log files and record download counts.

    For each file returned by get_files_to_parse(), parse from the
    stored resume position, aggregate per-file/per-day/per-country
    download counts, persist them via the per-file updater callable,
    then record how far we got so the next run can resume.
    """
    self.setUpUtilities()
    # Materialize the list of files to parse. It is better to do the
    # checks now, rather than potentially hours later when the
    # generator gets around to it, because there is a reasonable
    # chance log rotation will have kicked in and removed our oldest
    # files. Note that we still error if a file we want to parse
    # disappears before we get around to parsing it, which is
    # desirable behaviour.
    files_to_parse = list(
        get_files_to_parse(
            glob.glob(os.path.join(self.root, self.log_file_glob))))
    country_set = getUtility(ICountrySet)
    parsed_lines = 0
    # Optional global budget of lines per run; None means unlimited.
    max_parsed_lines = getattr(
        config.launchpad, 'logparser_max_parsed_lines', None)
    max_is_set = max_parsed_lines is not None
    for fd, position in files_to_parse:
        # If we've used up our budget of lines to process, stop.
        # NOTE(review): parse_file() appears to return a cumulative
        # line count (parsed_lines is overwritten, not summed) —
        # confirm against parse_file's contract.
        if (max_is_set and parsed_lines >= max_parsed_lines):
            break
        downloads, parsed_bytes, parsed_lines = parse_file(
            fd, position, self.logger, self.getDownloadKey)
        # Use a while loop here because we want to pop items from the dict
        # in order to free some memory as we go along. This is a good
        # thing here because the downloads dict may get really huge.
        while downloads:
            file_id, daily_downloads = downloads.popitem()
            update_download_count = self.getDownloadCountUpdater(file_id)
            # The object couldn't be retrieved (maybe it was deleted).
            # Don't bother counting downloads for it.
            if update_download_count is None:
                continue
            for day, country_downloads in daily_downloads.items():
                for country_code, count in country_downloads.items():
                    try:
                        country = country_set[country_code]
                    except NotFoundError:
                        # We don't know the country for the IP address
                        # where this request originated.
                        country = None
                    update_download_count(day, country, count)
        # Re-read the first line so we can identify this file on the
        # next run, then record how many bytes of it were parsed.
        fd.seek(0)
        first_line = fd.readline()
        fd.close()
        create_or_update_parsedlog_entry(first_line, parsed_bytes)
        self.txn.commit()
        # fd may be a file object (with a .name) or a plain path-like
        # value; log whichever identifies it.
        if safe_hasattr(fd, 'name'):
            name = fd.name
        else:
            name = fd
        self.logger.info('Finished parsing %s' % name)
    self.logger.info('Done parsing apache log files')
def test_completely_parsed_file(self):
    # A file that has been completely parsed will be skipped.
    # Use a context manager so the handle is not leaked.
    with open(self.file_path) as fd:
        first_line = fd.readline()
        fd.seek(0)
        ParsedApacheLog(first_line, len(fd.read()))
    files_to_parse = get_files_to_parse([self.file_path])
    self.assertEqual(list(files_to_parse), [])
def test_fresh_gzipped_file(self):
    # get_files_to_parse() handles gzipped files just like uncompressed
    # ones. The first time we see one, we'll parse from the beginning.
    gz_name = 'launchpadlibrarian.net.access-log.1.gz'
    gz_path = os.path.join(self.root, gz_name)
    # The original read the first line into an unused variable, leaking
    # the gzip handle; the value is never needed, so drop it entirely.
    files_to_parse = get_files_to_parse([gz_path])
    # map() is lazy on Python 3; materialize before comparing.
    positions = list(map(itemgetter(1), files_to_parse))
    self.assertEqual(positions, [0])
def test_completely_parsed_file(self):
    # A file that has been completely parsed will be skipped.
    # Use a context manager so the handle is not leaked.
    with open(self.file_path) as fd:
        first_line = fd.readline()
        fd.seek(0)
        ParsedApacheLog(first_line, len(fd.read()))
    files_to_parse = get_files_to_parse([self.file_path])
    # failUnlessEqual is a deprecated alias; use assertEqual.
    self.assertEqual(list(files_to_parse), [])
def main(self):
    """Parse all matching Apache log files and record download counts.

    For each file returned by get_files_to_parse(), parse from the
    stored resume position, aggregate per-file/per-day/per-country
    download counts, persist them via the per-file updater callable,
    then record how far we got so the next run can resume.
    """
    self.setUpUtilities()
    # Materialize the list of files to parse. It is better to do the
    # checks now, rather than potentially hours later when the
    # generator gets around to it, because there is a reasonable
    # chance log rotation will have kicked in and removed our oldest
    # files. Note that we still error if a file we want to parse
    # disappears before we get around to parsing it, which is
    # desirable behavior.
    files_to_parse = list(get_files_to_parse(
        glob.glob(os.path.join(self.root, self.log_file_glob))))
    country_set = getUtility(ICountrySet)
    parsed_lines = 0
    # Optional global budget of lines per run; None means unlimited.
    max_parsed_lines = getattr(
        config.launchpad, 'logparser_max_parsed_lines', None)
    max_is_set = max_parsed_lines is not None
    for fd, position in files_to_parse:
        # If we've used up our budget of lines to process, stop.
        # NOTE(review): parse_file() appears to return a cumulative
        # line count (parsed_lines is overwritten, not summed) —
        # confirm against parse_file's contract.
        if (max_is_set and parsed_lines >= max_parsed_lines):
            break
        downloads, parsed_bytes, parsed_lines = parse_file(
            fd, position, self.logger, self.getDownloadKey)
        # Use a while loop here because we want to pop items from the dict
        # in order to free some memory as we go along. This is a good
        # thing here because the downloads dict may get really huge.
        while downloads:
            file_id, daily_downloads = downloads.popitem()
            update_download_count = self.getDownloadCountUpdater(file_id)
            # The object couldn't be retrieved (maybe it was deleted).
            # Don't bother counting downloads for it.
            if update_download_count is None:
                continue
            for day, country_downloads in daily_downloads.items():
                for country_code, count in country_downloads.items():
                    try:
                        country = country_set[country_code]
                    except NotFoundError:
                        # We don't know the country for the IP address
                        # where this request originated.
                        country = None
                    update_download_count(day, country, count)
        # Re-read the first line so we can identify this file on the
        # next run, then record how many bytes of it were parsed.
        fd.seek(0)
        first_line = fd.readline()
        fd.close()
        create_or_update_parsedlog_entry(first_line, parsed_bytes)
        self.txn.commit()
        # fd may be a file object (with a .name) or a plain path-like
        # value; log whichever identifies it.
        if safe_hasattr(fd, 'name'):
            name = fd.name
        else:
            name = fd
        self.logger.info('Finished parsing %s' % name)
    self.logger.info('Done parsing apache log files')
def test_resumed_gzipped_file(self):
    # In subsequent runs of the script we will resume from where we
    # stopped last time. (Here we pretend we parsed only the first line)
    gz_name = 'launchpadlibrarian.net.access-log.1.gz'
    gz_path = os.path.join(self.root, gz_name)
    # Close the gzip handle instead of leaking it.
    with gzip.open(gz_path) as gz_file:
        first_line = gz_file.readline()
    ParsedApacheLog(first_line, len(first_line))
    files_to_parse = get_files_to_parse([gz_path])
    # map() is lazy on Python 3; materialize before comparing.
    positions = list(map(itemgetter(1), files_to_parse))
    self.assertEqual(positions, [len(first_line)])
def test_resumed_gzipped_file(self):
    # In subsequent runs of the script we will resume from where we
    # stopped last time. (Here we pretend we parsed only the first line)
    gz_name = 'launchpadlibrarian.net.access-log.1.gz'
    gz_path = os.path.join(self.root, gz_name)
    # Close the gzip handle instead of leaking it.
    with gzip.open(gz_path) as gz_file:
        first_line = gz_file.readline()
    ParsedApacheLog(first_line, len(first_line))
    files_to_parse = get_files_to_parse([gz_path])
    # map() is lazy on Python 3; materialize before comparing.
    # failUnlessEqual is a deprecated alias; use assertEqual.
    positions = list(map(itemgetter(1), files_to_parse))
    self.assertEqual(positions, [len(first_line)])
def test_parsed_file_with_new_content(self):
    # A file that has been parsed already but in which new content was
    # added will be parsed again, starting from where parsing stopped last
    # time.
    # Read the first line without leaking the handle.
    with open(self.file_path) as log_file:
        first_line = log_file.readline()
    ParsedApacheLog(first_line, len(first_line))
    files_to_parse = list(get_files_to_parse([self.file_path]))
    self.assertEqual(len(files_to_parse), 1)
    fd, position = files_to_parse[0]
    # get_files_to_parse() hands us an open file; close it so the test
    # doesn't leak it.
    fd.close()
    # Since we parsed the first line above, we'll be told to start where
    # the first line ends.
    self.assertEqual(position, len(first_line))
def test_sorts_by_mtime(self):
    # Files are sorted by ascending mtime.
    tmp_root = self.useFixture(TempDir())
    paths = [tmp_root.join(str(index)) for index in range(3)]
    timestamp = time.time()
    for offset, log_path in enumerate(paths):
        write_file(log_path, '%s\n' % offset)
        # Each successive file is made to look one second older, so the
        # expected order is the reverse of creation order.
        os.utime(log_path, (timestamp - offset, timestamp - offset))
    seen = []
    for handle, _ in get_files_to_parse(paths):
        handle.seek(0)
        seen.append(handle.read())
    self.assertEqual(['2\n', '1\n', '0\n'], seen)
def test_different_files_with_same_name(self):
    # Thanks to log rotation, two runs of our script may see files with
    # the same name but completely different content. If we see a file
    # with a name matching that of an already parsed file but with content
    # differing from the last file with that name parsed, we know we need
    # to parse the file from the start.
    ParsedApacheLog('First line', bytes_read=1000)
    # This file has the same name of the previous one (which has been
    # parsed already), but its first line is different, so we'll have to
    # parse it from the start.
    os_fd, new_path = tempfile.mkstemp()
    # mkstemp() returns an open OS-level descriptor that the original
    # code clobbered and leaked; close it before reopening by name.
    os.close(os_fd)
    content2 = 'Different First Line\nSecond Line'
    with open(new_path, 'w') as fd:
        fd.write(content2)
    files_to_parse = get_files_to_parse([new_path])
    # map() is lazy on Python 3; materialize before comparing.
    positions = list(map(itemgetter(1), files_to_parse))
    self.assertEqual(positions, [0])
def test_different_files_with_same_name(self):
    # Thanks to log rotation, two runs of our script may see files with
    # the same name but completely different content. If we see a file
    # with a name matching that of an already parsed file but with content
    # differing from the last file with that name parsed, we know we need
    # to parse the file from the start.
    ParsedApacheLog('First line', bytes_read=1000)
    # This file has the same name of the previous one (which has been
    # parsed already), but its first line is different, so we'll have to
    # parse it from the start.
    os_fd, new_path = tempfile.mkstemp()
    # mkstemp() returns an open OS-level descriptor that the original
    # code clobbered and leaked; close it before reopening by name.
    os.close(os_fd)
    content2 = 'Different First Line\nSecond Line'
    with open(new_path, 'w') as fd:
        fd.write(content2)
    files_to_parse = get_files_to_parse([new_path])
    # map() is lazy on Python 3; materialize before comparing.
    # failUnlessEqual is a deprecated alias; use assertEqual.
    positions = list(map(itemgetter(1), files_to_parse))
    self.assertEqual(positions, [0])
def test_not_parsed_file(self):
    # A file that has never been parsed will have to be parsed from the
    # start.
    files_to_parse = get_files_to_parse([self.file_path])
    fd, position = list(files_to_parse)[0]
    # get_files_to_parse() yields open file objects; close the one we
    # received instead of leaking it.
    fd.close()
    self.assertEqual(position, 0)