    def test_fresh_gzipped_file(self):
        # get_files_to_parse() handles gzipped files just like uncompressed
        # ones.  The first time we see one, we'll parse from the beginning.
        gz_name = 'launchpadlibrarian.net.access-log.1.gz'
        gz_path = os.path.join(self.root, gz_name)
        files_to_parse = get_files_to_parse([gz_path])
        positions = list(map(itemgetter(1), files_to_parse))
        self.assertEqual(positions, [0])
    def main(self):
        self.setUpUtilities()

        # Materialize the list of files to parse. It is better to do the
        # checks now, rather than potentially hours later when the
        # generator gets around to it, because there is a reasonable
        # chance log rotation will have kicked in and removed our oldest
        # files. Note that we still error if a file we want to parse
        # disappears before we get around to parsing it, which is
        # desirable behaviour.
        files_to_parse = list(
            get_files_to_parse(
                glob.glob(os.path.join(self.root, self.log_file_glob))))

        country_set = getUtility(ICountrySet)
        parsed_lines = 0
        max_parsed_lines = getattr(config.launchpad,
                                   'logparser_max_parsed_lines', None)
        max_is_set = max_parsed_lines is not None
        for fd, position in files_to_parse:
            # If we've used up our budget of lines to process, stop.
            if (max_is_set and parsed_lines >= max_parsed_lines):
                break
            downloads, parsed_bytes, parsed_lines = parse_file(
                fd, position, self.logger, self.getDownloadKey)
            # Use a while loop here because we want to pop items from the dict
            # in order to free some memory as we go along. This is a good
            # thing here because the downloads dict may get really huge.
            while downloads:
                file_id, daily_downloads = downloads.popitem()
                update_download_count = self.getDownloadCountUpdater(file_id)

                # The object couldn't be retrieved (maybe it was deleted).
                # Don't bother counting downloads for it.
                if update_download_count is None:
                    continue

                for day, country_downloads in daily_downloads.items():
                    for country_code, count in country_downloads.items():
                        try:
                            country = country_set[country_code]
                        except NotFoundError:
                            # We don't know the country for the IP address
                            # where this request originated.
                            country = None
                        update_download_count(day, country, count)
            fd.seek(0)
            first_line = fd.readline()
            fd.close()
            create_or_update_parsedlog_entry(first_line, parsed_bytes)
            self.txn.commit()
            if safe_hasattr(fd, 'name'):
                name = fd.name
            else:
                name = fd
            self.logger.info('Finished parsing %s' % name)

        self.logger.info('Done parsing apache log files')
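
# The tests below pin down the contract of get_files_to_parse().  As a
# reading aid, here is a minimal sketch of that contract -- not the real
# implementation from lp.services.apachelogparser.  PARSED_LOGS and
# get_files_to_parse_sketch are hypothetical names, and the real code
# stores ParsedApacheLog records in the database rather than a dict.
import gzip
import os

# Hypothetical stand-in for the ParsedApacheLog table: maps a file's
# first line to the number of (uncompressed) bytes already parsed.
PARSED_LOGS = {}


def get_files_to_parse_sketch(file_paths):
    """Yield (open file, position to resume from) for unfinished files."""
    # Oldest files first, so logs about to be rotated away are handled
    # before they disappear.
    for path in sorted(file_paths, key=os.path.getmtime):
        opener = gzip.open if path.endswith('.gz') else open
        fd = opener(path, 'rb')
        first_line = fd.readline()
        # A rotated file reuses its name but not its first line, so an
        # unknown first line correctly restarts parsing at position 0.
        position = PARSED_LOGS.get(first_line, 0)
        fd.seek(position)
        if position and not fd.read(1):
            # Nothing beyond what we already parsed: skip this file.
            fd.close()
            continue
        fd.seek(position)
        yield fd, position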
    def test_completely_parsed_file(self):
        # A file that has been completely parsed will be skipped.
        fd = open(self.file_path)
        first_line = fd.readline()
        fd.seek(0)
        ParsedApacheLog(first_line, len(fd.read()))
        fd.close()

        files_to_parse = get_files_to_parse([self.file_path])
        self.assertEqual(list(files_to_parse), [])
    def test_resumed_gzipped_file(self):
        # In subsequent runs of the script we will resume from where we
        # stopped last time.  (Here we pretend we parsed only the first
        # line.)
        gz_name = 'launchpadlibrarian.net.access-log.1.gz'
        gz_path = os.path.join(self.root, gz_name)
        first_line = gzip.open(gz_path).readline()
        ParsedApacheLog(first_line, len(first_line))
        files_to_parse = get_files_to_parse([gz_path])
        positions = list(map(itemgetter(1), files_to_parse))
        self.assertEqual(positions, [len(first_line)])
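
# Note on the resume position: GzipFile.tell() reports offsets into the
# *uncompressed* stream, which is why len(first_line) above is a valid
# position to resume a .gz log from.  A quick illustration (the path is
# made up):
import gzip

fd = gzip.open('launchpadlibrarian.net.access-log.1.gz', 'rb')
first_line = fd.readline()
assert fd.tell() == len(first_line)
fd.close()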
    def test_parsed_file_with_new_content(self):
        # A file that has been parsed already but in which new content was
        # added will be parsed again, starting from where parsing stopped last
        # time.
        first_line = open(self.file_path).readline()
        ParsedApacheLog(first_line, len(first_line))

        files_to_parse = list(get_files_to_parse([self.file_path]))
        self.assertEqual(len(files_to_parse), 1)
        fd, position = files_to_parse[0]
        # Since we parsed the first line above, we'll be told to start where
        # the first line ends.
        self.assertEqual(position, len(first_line))
    def test_sorts_by_mtime(self):
        # Files are sorted by ascending mtime.
        root = self.useFixture(TempDir())
        file_paths = [root.join(str(name)) for name in range(3)]
        now = time.time()
        for i, path in enumerate(file_paths):
            write_file(path, '%s\n' % i)
            os.utime(path, (now - i, now - i))
        contents = []
        for fd, _ in get_files_to_parse(file_paths):
            fd.seek(0)
            contents.append(fd.read())
        self.assertEqual(['2\n', '1\n', '0\n'], contents)
    def test_different_files_with_same_name(self):
        # Thanks to log rotation, two runs of our script may see files with
        # the same name but completely different content.  If we see a file
        # with a name matching that of an already parsed file but with content
        # differing from the last file with that name parsed, we know we need
        # to parse the file from the start.
        ParsedApacheLog('First line', bytes_read=1000)

        # This file has the same name as the previous one (which has been
        # parsed already), but its first line is different, so we'll have to
        # parse it from the start.
        fd, new_path = tempfile.mkstemp()
        os.close(fd)
        content2 = 'Different First Line\nSecond Line'
        new_file = open(new_path, 'w')
        new_file.write(content2)
        new_file.close()
        files_to_parse = get_files_to_parse([new_path])
        positions = list(map(itemgetter(1), files_to_parse))
        self.assertEqual(positions, [0])
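
# End-to-end illustration of the rotation case using the sketch above
# (PARSED_LOGS and get_files_to_parse_sketch are the hypothetical names
# introduced earlier; the file content is made up):
import os
import tempfile

# Pretend an earlier run fully parsed a 1000-byte file starting 'First line'.
PARSED_LOGS[b'First line\n'] = 1000

fd, path = tempfile.mkstemp()
os.write(fd, b'Different First Line\nSecond Line')
os.close(fd)
# The new file's first line doesn't match, so parsing restarts at 0.
for log_file, position in get_files_to_parse_sketch([path]):
    assert position == 0
    log_file.close()
os.remove(path)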
    def test_not_parsed_file(self):
        # A file that has never been parsed will have to be parsed from the
        # start.
        files_to_parse = get_files_to_parse([self.file_path])
        fd, position = list(files_to_parse)[0]
        self.assertEqual(position, 0)