Example 1
    def setup(self):
        global logger

        self.app = self.get_test_candidate(self.args.base)
        self.profile = self.make_profile("base_profile")

        tmp_zip_name = os.path.join(self.tmp_dir, "top.zip")
        logger.info("Fetching unfiltered top sites data from the `Tranco Top 1M` online database")
        get_to_file(self.top_sites_location, tmp_zip_name)

        try:
            zipped = zipfile.ZipFile(tmp_zip_name)
            if len(zipped.filelist) != 1 or not zipped.filelist[0].orig_filename.lower().endswith(".csv"):
                logger.critical("Top sites zip file has unexpected content")
                sys.exit(5)
            tmp_csv_name = zipped.extract(zipped.filelist[0], self.tmp_dir)
        except zipfile.BadZipfile:
            logger.critical("Error opening top sites zip archive")
            sys.exit(5)

        self.db = sdb.SourcesDB(self.args)
        is_default = self.args.source == self.db.default
        self.sources = sdb.Sources(self.args.source, is_default)

        with open(tmp_csv_name) as f:
            cr = csv.DictReader(f, fieldnames=["rank", "hostname"])
            self.sources.rows = [row for row in cr]

        # A mild sanity check to see whether the downloaded data is valid.
        if len(self.sources) < 900000:
            logger.warning("Top sites is surprisingly small, just %d hosts" % len(self.sources))

        if "hostname" not in self.sources.rows[0] or "rank" not in self.sources.rows[0] \
                or self.sources.rows[0]["rank"] != "1":  # csv.DictReader yields strings
            logger.warning("Top sites data looks weird. First line: `%s`" % self.sources.rows[0])
Example 2
def test_sources_set_interface():
    """Sources object can be created from and yield sets"""

    # Sets are assumed to contain (rank, hostname) pairs
    src_set = {(1, "mozilla.org"), (2, "mozilla.com"),
               (3, "addons.mozilla.org")}
    src = sdb.Sources("foo")
    src.from_set(src_set)
    assert len(src) == 3, "database from set has correct length"
    assert src_set == src.as_set(), "yielded set is identical to the original"
    assert len(src.as_set(1, 2)) == 1, "yielded subset has expected length"
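For context, a toy stand-in for the set interface exercised here might look like the sketch below. This is an illustration only, not the actual sdb.Sources implementation; the slicing semantics of as_set() are inferred from the assertions above:

class MiniSources:
    """Toy stand-in illustrating the from_set()/as_set() round trip."""

    def __init__(self, handle):
        self.handle = handle
        self.rows = []

    def from_set(self, src_set):
        # Sets are assumed to contain (rank, hostname) pairs.
        self.rows = [{"rank": str(rank), "hostname": host} for rank, host in src_set]

    def as_set(self, start=0, end=None):
        return {(int(r["rank"]), r["hostname"]) for r in self.rows[start:end]}

    def __len__(self):
        return len(self.rows)


src = MiniSources("foo")
src.from_set({(1, "mozilla.org"), (2, "mozilla.com"), (3, "addons.mozilla.org")})
assert len(src) == 3 and len(src.as_set(1, 2)) == 1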
Example 3
def test_sources_sorting():
    """Sources object can sort its rows by rank"""

    src_set = {(1, "mozilla.org"), (2, "mozilla.com"),
               (3, "addons.mozilla.org")}
    src = sdb.Sources("foo")
    src.from_set(src_set)
    # Deliberately "unsort" the rows so the sort has something to do
    if int(src.rows[0]["rank"]) < int(src.rows[1]["rank"]):
        src.rows[0], src.rows[1] = src.rows[1], src.rows[0]
    assert not (int(src.rows[0]["rank"]) < int(src.rows[1]["rank"])
                < int(src.rows[2]["rank"])), "list is scrambled"
    src.sort()
    assert (int(src.rows[0]["rank"]) < int(src.rows[1]["rank"])
            < int(src.rows[2]["rank"])), "sorting works"
Example 4
def test_sources_chunking():
    """Sources object can be iterated in chunks"""

    src_set = {(1, "mozilla.org"), (2, "mozilla.com"),
               (3, "addons.mozilla.org"), (4, "irc.mozilla.org"),
               (5, "firefox.com")}
    assert len(src_set) == 5, "hardcoded test set has expected length"
    src = sdb.Sources("foo")
    src.from_set(src_set)
    next_chunk = src.iter_chunks(chunk_start=1,
                                 chunk_stop=20,
                                 chunk_size=2,
                                 min_chunk_size=100)
    assert src.chunk_size == 100, "chunking respects minimum size setting"
    assert src.chunk_start == 1, "chunking respects chunk start setting"
    chunk = next_chunk(20)
    assert len(chunk) == 4, "chunks are not larger than remaining data"

    read_set = set()
    next_chunk = src.iter_chunks(chunk_size=2)
    lengths = list()
    for _ in range(10):
        chunk = next_chunk(as_set=True)
        if chunk is None:
            break
        lengths.append(len(chunk))
        read_set.update(chunk)
    assert lengths == [2, 2, 1], "chunks have expected lengths"
    assert src_set == read_set, "chunks cover full set"

    next_chunk = src.iter_chunks(chunk_size=10)
    lengths = list()
    lengths.append(len(next_chunk(1)))
    lengths.append(len(next_chunk(2)))
    lengths.append(len(next_chunk(3)))
    assert next_chunk() is None, "after last chunk comes None"
    assert next_chunk() is None, "after last chunk comes None again"
    assert lengths == [1, 2, 2], "chunks size can be varied on-the-fly"
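iter_chunks() evidently returns a callable that hands out the next slice on each call and None once the data is exhausted. Below is a minimal closure-based sketch of that pattern, ignoring the chunk_start/min_chunk_size bookkeeping of the real implementation:

def make_chunk_iter(rows, chunk_size):
    # Hypothetical simplified version of the iter_chunks() pattern.
    state = {"offset": 0}

    def next_chunk(size=None, as_set=False):
        size = chunk_size if size is None else size
        if state["offset"] >= len(rows):
            return None  # after the last chunk comes None
        chunk = rows[state["offset"]:state["offset"] + size]
        state["offset"] += len(chunk)
        if as_set:
            return {(int(r["rank"]), r["hostname"]) for r in chunk}
        return chunk

    return next_chunk


rows = [{"rank": str(n), "hostname": "host%d.example" % n} for n in range(1, 6)]
next_chunk = make_chunk_iter(rows, chunk_size=2)
assert [len(next_chunk()) for _ in range(3)] == [2, 2, 1]
assert next_chunk() is None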
Example 5
def test_sources_db_write_and_override(tmpdir):
    """SourcesDB databases can be written and overridden"""

    db = sdb.SourcesDB(ArgsMock(workdir=tmpdir))
    old = db.read("debug")
    old_default = db.default
    override = sdb.Sources("debug", True)
    row_one = {"foo": "bar", "baz": "bang", "boom": "bang"}
    row_two = {"foo": "bar2", "baz": "bang2", "boom": "bang2"}
    override.append(row_one)
    override.append(row_two)
    db.write(override)

    # New SourcesDB instance required to detect overrides
    db = sdb.SourcesDB(ArgsMock(workdir=tmpdir))
    assert os.path.exists(tmpdir.join("sources",
                                      "debug.csv")), "override file is written"
    assert db.default == "debug", "overriding the default works"
    assert old_default != db.default, "overridden default actually changes"
    new = db.read("debug")
    assert len(new) == 2, "number of overridden rows is correct"
    assert new[0] == row_one and new[1] == row_two, \
        "new rows are written as expected"
    assert old[0] != new[0], "overridden rows actually change"
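The assertions pin down where an override ends up on disk: a per-handle CSV below the work directory. A hypothetical helper that mirrors that layout (not part of sdb, just an illustration of the path convention the test checks):

import os

def override_path(workdir, handle):
    # Overrides live at <workdir>/sources/<handle>.csv, per the assertion above.
    return os.path.join(str(workdir), "sources", "%s.csv" % handle)


assert override_path("/tmp/work", "debug").endswith(os.path.join("sources", "debug.csv"))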
Example 6
    def run(self):
        """
        Perform the filter run. The objective is to filter out permanent errors so
        we don't waste time on them during regular test runs.

        The concept is:
        Run top sites in chunks through Firefox and re-test all error URLs from that
        chunk a number of times to weed out spurious network errors. Stop the process
        once the required number of working hosts is collected.
        """
        global logger

        self.start_time = datetime.datetime.now()

        limit = 1000000
        if self.args.limit is not None:
            limit = self.args.limit

        logger.info("There are %d hosts in the unfiltered host set" % len(self.sources))
        logger.info("Compiling set of %d working hosts for `%s` database update" % (limit, self.sources.handle))
        working_set = set()

        # Chop unfiltered sources data into chunks and iterate over each
        # .iter_chunks() returns a callable that hands out the next chunk on each call
        next_chunk = self.sources.iter_chunks(chunk_size=1000)
        chunk_size = self.sources.chunk_size

        progress = pr.ProgressTracker(total=limit, unit="hosts", average=60 * 60.0)

        try:
            while True:
                hosts_to_go = max(0, limit - len(working_set))
                # Check if we're done
                if hosts_to_go == 0:
                    break
                logger.info("%d hosts to go to complete the working set" % hosts_to_go)

                # Shrink chunk if it contains way more hosts than required to complete the working set
                #
                # CAVE: This assumes that this is the last chunk we require. The downsized chunk
                # is still twice as large as required to complete the set, to compensate for broken
                # hosts. If the error rate in the chunk exceeds 50%, another chunk will be
                # consumed, resulting in a gap of untested hosts between the end of this downsized
                # chunk and the beginning of the next. Not too bad, but important to be aware of.
                if chunk_size > hosts_to_go * 2:
                    chunk_size = min(chunk_size, hosts_to_go * 2)
                pass_chunk = next_chunk(chunk_size, as_set=True)

                # Check if we ran out of data for completing the set
                if pass_chunk is None:
                    logger.warning("Ran out of hosts to complete the working set")
                    break

                # Run chunk through multiple passes of Firefox, leaving only persistent errors in the
                # error set.
                pass_chunk_size = len(pass_chunk)
                chunk_end = self.sources.chunk_offset
                chunk_start = chunk_end - pass_chunk_size
                logger.info("Processing chunk of %d hosts from the unfiltered set (#%d to #%d)"
                            % (chunk_end - chunk_start, chunk_start, chunk_end - 1))
                pass_errors = pass_chunk

                for i in range(self.args.scans):

                    logger.info("Pass %d with %d hosts" % (i + 1, len(pass_errors)))

                    # The first pass counts as regular progress; all later passes count as overhead
                    if i == 0:
                        report_callback = None
                    else:
                        report_callback = progress.log_overhead

                    pass_errors = self.run_test(self.app, pass_errors, profile=self.profile, get_info=False,
                                                get_certs=False, return_only_errors=True,
                                                report_callback=report_callback)
                    len_pass_errors = len(pass_errors)

                    # Log progress of first pass
                    if i == 0:
                        progress.log_completed(pass_chunk_size - len_pass_errors)
                        progress.log_overhead(len_pass_errors)

                    if len_pass_errors == 0:
                        break

                logger.info("Error rate in chunk was %.1f%%"
                            % (100.0 * float(len_pass_errors) / float(chunk_end - chunk_start)))

                # Add all non-errors to the working set
                working_set.update(pass_chunk.difference(pass_errors))

                # Log progress after every chunk
                logger.info(str(progress))

        except KeyboardInterrupt:
            logger.critical("Ctrl-C received")
            raise

        final_src = sdb.Sources(self.sources.handle, is_default=self.sources.is_default)
        final_src.from_set(working_set)
        final_src.sort()
        final_src.trim(limit)

        if len(final_src) < limit:
            logger.warning("Ran out of hosts to complete the working set")

        logger.info("Collected %d working hosts for the updated test set" % len(final_src))
        logger.info("Writing updated `%s` host database" % final_src.handle)
        self.db.write(final_src)
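The chunk-shrinking heuristic in run() caps the last chunk at twice the number of hosts still needed, so a moderate error rate can be absorbed without consuming another chunk. A small worked example of that arithmetic, mirroring the condition in the loop above:

def shrink_chunk(chunk_size, hosts_to_go):
    # Keep the last chunk twice as large as strictly required, to compensate
    # for broken hosts (same condition as in run() above).
    if chunk_size > hosts_to_go * 2:
        chunk_size = hosts_to_go * 2
    return chunk_size


assert shrink_chunk(1000, 120) == 240   # last chunk is downsized
assert shrink_chunk(1000, 800) == 1000  # plenty still needed, chunk size unchanged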