def setup(self):
    global logger

    self.app = self.get_test_candidate(self.args.base)
    self.profile = self.make_profile("base_profile")

    tmp_zip_name = os.path.join(self.tmp_dir, "top.zip")
    logger.info("Fetching unfiltered top sites data from the `Tranco Top 1M` online database")
    get_to_file(self.top_sites_location, tmp_zip_name)

    try:
        zipped = zipfile.ZipFile(tmp_zip_name)
        if len(zipped.filelist) != 1 or not zipped.filelist[0].orig_filename.lower().endswith(".csv"):
            logger.critical("Top sites zip file has unexpected content")
            sys.exit(5)
        tmp_csv_name = zipped.extract(zipped.filelist[0], self.tmp_dir)
    except zipfile.BadZipfile:
        logger.critical("Error opening top sites zip archive")
        sys.exit(5)

    self.db = sdb.SourcesDB(self.args)
    is_default = self.args.source == self.db.default
    self.sources = sdb.Sources(self.args.source, is_default)

    with open(tmp_csv_name) as f:
        cr = csv.DictReader(f, fieldnames=["rank", "hostname"])
        self.sources.rows = [row for row in cr]

    # A mild sanity check to see whether the downloaded data is valid.
    if len(self.sources) < 900000:
        logger.warning("Top sites list is surprisingly small, just %d hosts" % len(self.sources))
    # CSV values arrive as strings, hence the comparison against "1", not 1.
    if "hostname" not in self.sources.rows[0] or "rank" not in self.sources.rows[0] \
            or self.sources.rows[0]["rank"] != "1":
        logger.warning("Top sites data looks weird. First line: `%s`" % self.sources.rows[0])

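# For reference, a minimal sketch (not part of the original module) of the data
# shape setup() expects: the Tranco CSV carries one `rank,hostname` pair per
# line and no header row, which is why explicit fieldnames are passed to
# csv.DictReader above. The helper name and the sample hostnames below are
# illustrative placeholders, not taken from a real Tranco snapshot.
def _demo_tranco_csv_shape():
    import csv
    import io

    sample = io.StringIO("1,example.com\n2,example.org\n")
    reader = csv.DictReader(sample, fieldnames=["rank", "hostname"])
    rows = list(reader)
    # csv values are always read as strings, hence the `!= "1"` check above
    assert rows[0] == {"rank": "1", "hostname": "example.com"}
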
def test_sources_set_interface():
    """Sources object can be created from and yield sets"""

    # Sets are assumed to contain (rank, hostname) pairs
    src_set = {(1, "mozilla.org"), (2, "mozilla.com"), (3, "addons.mozilla.org")}

    src = sdb.Sources("foo")
    src.from_set(src_set)
    assert len(src) == 3, "database from set has correct length"
    assert src_set == src.as_set(), "yielded set is identical to the original"
    assert len(src.as_set(1, 2)) == 1, "yielded subset has expected length"

def test_sources_sorting():
    """Sources object can sort its rows by rank"""

    src_set = {(1, "mozilla.org"), (2, "mozilla.com"), (3, "addons.mozilla.org")}
    src = sdb.Sources("foo")
    src.from_set(src_set)

    # Definitely "unsort"
    if int(src.rows[0]["rank"]) < int(src.rows[1]["rank"]):
        src.rows[0], src.rows[1] = src.rows[1], src.rows[0]
    assert not int(src.rows[0]["rank"]) < int(src.rows[1]["rank"]) < int(src.rows[2]["rank"]), \
        "list is scrambled"

    src.sort()
    assert int(src.rows[0]["rank"]) < int(src.rows[1]["rank"]) < int(src.rows[2]["rank"]), \
        "sorting works"

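# The test above implies that Sources.sort() orders rows numerically by their
# "rank" value. A minimal sketch consistent with that behavior (a hypothetical
# stand-in, not necessarily the actual sdb implementation):
def _sort_rows_by_rank(rows):
    # "rank" arrives as a string in CSV-backed rows, so compare as int,
    # otherwise "10" would sort before "2"
    rows.sort(key=lambda row: int(row["rank"]))
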
def test_sources_chunking():
    """Sources object can be iterated in chunks"""

    src_set = {(1, "mozilla.org"), (2, "mozilla.com"), (3, "addons.mozilla.org"),
               (4, "irc.mozilla.org"), (5, "firefox.com")}
    assert len(src_set) == 5, "hardcoded test set has expected length"
    src = sdb.Sources("foo")
    src.from_set(src_set)

    next_chunk = src.iter_chunks(chunk_start=1, chunk_stop=20, chunk_size=2, min_chunk_size=100)
    assert src.chunk_size == 100, "chunking respects minimum size setting"
    assert src.chunk_start == 1, "chunking respects chunk start setting"
    chunk = next_chunk(20)
    assert len(chunk) == 4, "chunks are not larger than remaining data"

    read_set = set()
    next_chunk = src.iter_chunks(chunk_size=2)
    lengths = list()
    for _ in range(10):
        chunk = next_chunk(as_set=True)
        if chunk is None:
            break
        lengths.append(len(chunk))
        read_set.update(chunk)
    assert lengths == [2, 2, 1], "chunks have expected lengths"
    assert src_set == read_set, "chunks cover full set"

    next_chunk = src.iter_chunks(chunk_size=10)
    lengths = list()
    lengths.append(len(next_chunk(1)))
    lengths.append(len(next_chunk(2)))
    lengths.append(len(next_chunk(3)))
    assert next_chunk() is None, "after last chunk comes None"
    assert next_chunk() is None, "after last chunk comes None again"
    assert lengths == [1, 2, 2], "chunk size can be varied on the fly"

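# iter_chunks() hands back a callable rather than a generator, so the chunk
# size can be changed per call. A minimal standalone sketch consistent with the
# behavior this test exercises; the function name and internals are assumptions
# for illustration, not the actual sdb implementation:
def _iter_chunks_sketch(rows, chunk_start=0, chunk_stop=None, chunk_size=100,
                        min_chunk_size=None):
    """Return a callable yielding successive slices of `rows`, or None when
    the data (or the chunk_stop bound) is exhausted."""
    if min_chunk_size is not None:
        chunk_size = max(chunk_size, min_chunk_size)
    if chunk_stop is None:
        chunk_stop = len(rows)
    state = {"offset": chunk_start}

    def next_chunk(size=None, as_set=False):
        size = chunk_size if size is None else size
        start = state["offset"]
        if start >= min(chunk_stop, len(rows)):
            return None  # nothing left to read
        stop = min(start + size, chunk_stop, len(rows))
        state["offset"] = stop
        chunk = rows[start:stop]
        if as_set:
            # mirror the (rank, hostname) tuple convention used by the tests
            return {(int(r["rank"]), r["hostname"]) for r in chunk}
        return chunk

    return next_chunk
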
def test_sources_db_write_and_override(tmpdir):
    """SourcesDB databases can be written and overridden"""

    db = sdb.SourcesDB(ArgsMock(workdir=tmpdir))
    old = db.read("debug")
    old_default = db.default

    override = sdb.Sources("debug", True)
    row_one = {"foo": "bar", "baz": "bang", "boom": "bang"}
    row_two = {"foo": "bar2", "baz": "bang2", "boom": "bang2"}
    override.append(row_one)
    override.append(row_two)
    db.write(override)

    # New SourcesDB instance required to detect overrides
    db = sdb.SourcesDB(ArgsMock(workdir=tmpdir))
    assert os.path.exists(tmpdir.join("sources", "debug.csv")), "override file is written"
    assert db.default == "debug", "overriding the default works"
    assert old_default != db.default, "overridden default actually changes"

    new = db.read("debug")
    assert len(new) == 2, "number of overridden rows is correct"
    assert new[0] == row_one and new[1] == row_two, "new rows are written as expected"
    assert old[0] != new[0], "overridden rows actually change"

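# The ArgsMock helper used above comes from test fixtures not shown in this
# excerpt. A minimal stand-in that would satisfy this particular test only has
# to expose the `workdir` attribute; this is an assumption about its shape,
# and the real fixture may well carry more fields:
class _ArgsMockSketch:
    def __init__(self, workdir):
        self.workdir = workdir
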
def run(self):
    """
    Perform the filter run. The objective is to filter out permanent errors
    so we don't waste time on them during regular test runs.

    The concept is: Run top sites in chunks through Firefox and re-test all
    error URLs from that chunk a number of times to weed out spurious network
    errors. Stop the process once the required number of working hosts is
    collected.
    """
    global logger

    self.start_time = datetime.datetime.now()

    limit = 1000000
    if self.args.limit is not None:
        limit = self.args.limit

    logger.info("There are %d hosts in the unfiltered host set" % len(self.sources))
    logger.info("Compiling set of %d working hosts for `%s` database update"
                % (limit, self.sources.handle))
    working_set = set()

    # Chop unfiltered sources data into chunks and iterate over each.
    # .iter_chunks() returns a generator method to call for the next chunk.
    next_chunk = self.sources.iter_chunks(chunk_size=1000)
    chunk_size = self.sources.chunk_size

    progress = pr.ProgressTracker(total=limit, unit="hosts", average=60 * 60.0)

    try:
        while True:
            hosts_to_go = max(0, limit - len(working_set))
            # Check if we're done
            if hosts_to_go == 0:
                break
            logger.info("%d hosts to go to complete the working set" % hosts_to_go)

            # Shrink chunk if it contains way more hosts than required to complete the working set.
            #
            # CAVE: This assumes that this is the last chunk we require. The downsized chunk
            # is still 50% larger than required to complete the set to compensate for broken
            # hosts. If the error rate in the chunk is greater than 50%, another chunk will be
            # consumed, resulting in a gap of untested hosts between the end of this downsized
            # chunk and the beginning of the next. Not too bad, but important to be aware of.
            if chunk_size > hosts_to_go * 2:
                chunk_size = hosts_to_go * 2

            pass_chunk = next_chunk(chunk_size, as_set=True)

            # Check if we ran out of data for completing the set
            if pass_chunk is None:
                logger.warning("Ran out of hosts to complete the working set")
                break

            # Run chunk through multiple passes of Firefox, leaving only
            # persistent errors in the error set.
            pass_chunk_size = len(pass_chunk)
            chunk_end = self.sources.chunk_offset
            chunk_start = chunk_end - pass_chunk_size
            logger.info("Processing chunk of %d hosts from the unfiltered set (#%d to #%d)"
                        % (chunk_end - chunk_start, chunk_start, chunk_end - 1))
            pass_errors = pass_chunk

            for i in range(self.args.scans):
                logger.info("Pass %d with %d hosts" % (i + 1, len(pass_errors)))
                # The first run is regular work, every subsequent run is overhead
                if i == 0:
                    report_callback = None
                else:
                    report_callback = progress.log_overhead
                pass_errors = self.run_test(self.app, pass_errors, profile=self.profile,
                                            get_info=False, get_certs=False,
                                            return_only_errors=True,
                                            report_callback=report_callback)
                len_pass_errors = len(pass_errors)
                # Log progress of the first pass
                if i == 0:
                    progress.log_completed(pass_chunk_size - len_pass_errors)
                    progress.log_overhead(len_pass_errors)
                if len_pass_errors == 0:
                    break

            logger.info("Error rate in chunk was %.1f%%"
                        % (100.0 * float(len_pass_errors) / float(chunk_end - chunk_start)))

            # Add all non-errors to the working set
            working_set.update(pass_chunk.difference(pass_errors))

            # Log progress after every chunk
            logger.info(str(progress))

    except KeyboardInterrupt:
        logger.critical("Ctrl-C received")
        raise

    final_src = sdb.Sources(self.sources.handle, is_default=self.sources.is_default)
    final_src.from_set(working_set)
    final_src.sort()
    final_src.trim(limit)

    if len(final_src) < limit:
        logger.warning("Ran out of hosts to complete the working set")

    logger.info("Collected %d working hosts for the updated test set" % len(final_src))
    logger.info("Writing updated `%s` host database" % final_src.handle)
    self.db.write(final_src)

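# The multi-pass weeding logic inside run() distills to the pattern below.
# This standalone sketch uses a hypothetical `probe` callable in place of the
# Firefox-backed run_test(); it is an illustration of the technique, not the
# actual implementation. `probe` takes a set of hosts and returns the subset
# that errored on this attempt.
def _filter_persistent_errors(hosts, probe, scans=3):
    """Return the hosts that failed `probe` on all `scans` attempts,
    i.e. the persistent errors."""
    errors = set(hosts)
    for _ in range(scans):
        if not errors:
            break
        # Only re-test hosts that are still failing; anything that succeeds
        # once is treated as a spurious network error and dropped.
        errors = probe(errors)
    return errors
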