def __call__(self, mon, thr):
    """Thread entry point: finish initialization on the monitor's thread,
    then hand control to the dispatcher loop.

    `mon` is the monitor object this dispatcher reports to; `thr` is
    unused here (supplied by the thread-runner convention).
    """
    self.mon = mon

    # Priority -1 makes stop requests sort ahead of every ordinary
    # status message in the PriorityQueue.
    self.status_queue = queue.PriorityQueue()
    self.mon.register_event_queue(self.status_queue,
                                  (self._MON_SAYS_STOP, -1))

    self.mon.set_status_prefix("d")
    self.mon.report_status("loading...")

    self.proxies = ProxySet(self, self.mon, self.args, self.proxy_sort_key)
    self.mon.report_status("loading... (proxies OK)")

    self.db = url_database.ensure_database(self.args)
    self.prepare_database()

    # Spin up the full complement of worker threads; every one of them
    # starts out idle until the dispatcher assigns it a batch.
    for _ in range(self.args.total_workers):
        worker = CaptureWorker(self)
        self.mon.add_work_thread(worker)
        self.idle_workers.add(worker)

    self.dispatcher_loop()
def __call__(self, mon, thr):
    """Dispatcher main loop for traceroute jobs, run on the monitor thread.

    Sets up the status queue, proxy set, per-location state, and worker
    threads, then services status-queue messages until every location has
    exhausted its task list (``n_locations`` reaches zero).

    `mon` is the monitor this dispatcher reports to; `thr` is unused
    (thread-runner convention).
    """
    self.mon = mon
    self.status_queue = queue.Queue()
    self.mon.register_event_queue(self.status_queue,
                                  (self._MON_SAYS_STOP,))
    self.mon.set_status_prefix("d")
    self.mon.report_status("loading...")

    self.proxies = ProxySet(self, mon, self.args)
    self.mon.report_status("loading... (proxies OK)")

    # One LocationState per proxy location, each writing to output_dir.
    self.locations = {
        loc: LocationState(loc, self.args.destinations, self.output_dir)
        for loc in self.proxies.locations.keys()
    }
    self.mon.report_status("loading... (locations OK)")

    # We only need one worker thread per proxy, because scamper
    # parallelizes work internally.
    for _ in range(self.args.max_simultaneous_proxies):
        wt = TracerouteWorker(self)
        self.mon.add_work_thread(wt)
        self.idle_workers.add(wt)
    self.mon.report_status("loading... (work threads OK)")

    # kick things off by starting one proxy
    (proxy, until_next, n_locations) = self.proxies.start_a_proxy()
    self.mon.report_status(
        "{}/{}/{} locations active, {} started, "
        "{} till next".format(
            len(self.proxies.active_proxies),
            n_locations,
            len(self.locations),
            proxy.label() if proxy else None,
            until_next,
        )
    )
    while n_locations:
        time_now = time.monotonic()
        # Technically, until_next being None means "wait for a proxy
        # to exit", but use an hour as a backstop.  (When a proxy does
        # exit, this will get knocked down to zero below.)
        if until_next is None:
            until_next = 3600
        time_next = time_now + until_next
        pending_stop = False
        while time_now < time_next:
            for msg in queue_iter(self.status_queue, until_next):
                if msg[0] == self._PROXY_ONLINE:
                    self.proxies.note_proxy_online(msg[1])
                    self.mon.report_status("proxy {} online"
                                           .format(msg[1].label()))
                    self.mon.idle(1)
                elif msg[0] == self._PROXY_OFFLINE:
                    self.mon.report_status("proxy {} offline"
                                           .format(msg[1].label()))
                    self.proxies.note_proxy_offline(msg[1])
                    # Wait no more than 5 minutes before trying to
                    # start another proxy.  (XXX This hardwires a
                    # specific provider's policy.)
                    time_now = time.monotonic()
                    time_next = min(time_next, time_now + 300)
                    until_next = time_next - time_now
                elif msg[0] == self._BATCH_COMPLETE:
                    locstate = self.active_workers[msg[1]]
                    del self.active_workers[msg[1]]
                    self.idle_workers.add(msg[1])
                    locstate.complete_job()
                    self.mon.report_status("{} batch complete"
                                           .format(locstate.location))
                elif msg[0] == self._BATCH_FAILED:
                    locstate = self.active_workers[msg[1]]
                    del self.active_workers[msg[1]]
                    self.idle_workers.add(msg[1])
                    locstate.fail_job()
                    self.mon.report_status("{} batch failed"
                                           .format(locstate.location))
                elif msg[0] == self._DROP_WORKER:
                    # BUGFIX: 'worker' was read here without ever being
                    # assigned (NameError); the dropped worker is msg[1].
                    worker = msg[1]
                    self.idle_workers.discard(worker)
                    if worker in self.active_workers:
                        self.active_workers[worker].fail_job()
                        del self.active_workers[worker]
                elif msg[0] == self._MON_SAYS_STOP:
                    self.mon.report_status("interrupt pending")
                    pending_stop = True
                else:
                    # BUGFIX: formatted undefined 'message'; the loop
                    # variable is 'msg'.
                    self.mon.report_error("bogus message: {!r}".format(msg))
            for loc, state in self.locations.items():
                if state.next_task is None:
                    self.mon.report_status("{} finished".format(loc))
                    if loc in self.proxies.locations:
                        self.proxies.locations[loc].finished()
            if pending_stop:
                self.mon.report_status("interrupted")
                self.mon.maybe_pause_or_stop()
                # don't start new work yet, the set of proxies
                # available may be totally different now
            else:
                for proxy in self.proxies.active_proxies:
                    if not self.idle_workers:
                        break
                    if not proxy.online:
                        continue
                    state = self.locations[proxy.loc]
                    if not state.active_task and state.next_task is not None:
                        worker = self.idle_workers.pop()
                        self.active_workers[worker] = state
                        state.queue_job(worker, proxy)
                        self.mon.report_status("queuing job for {}"
                                               .format(proxy.label()))
            time_now = time.monotonic()
            until_next = time_next - time_now
        # when we get to this point, it's time to start another proxy
        (proxy, until_next, n_locations) = self.proxies.start_a_proxy()
        self.mon.report_status(
            "{}/{}/{} locations active, {} started, "
            "{} till next".format(
                len(self.proxies.active_proxies),
                n_locations,
                len(self.locations),
                proxy.label() if proxy else None,
                until_next,
            )
        )
    # done, kill off all the workers
    self.mon.report_status("finished")
    assert not self.active_workers
    for w in self.idle_workers:
        w.finished()
def __call__(self, mon, thr):
    """Dispatcher main loop for DNS jobs, run on the monitor thread.

    Sets up the status queue, proxy set (restricted to locations that
    have DNS servers), per-location state, and worker threads, then
    services status-queue messages until every location has finished.

    `mon` is the monitor this dispatcher reports to; `thr` is unused
    (thread-runner convention).
    """
    self.mon = mon
    self.status_queue = queue.Queue()
    self.mon.register_event_queue(self.status_queue,
                                  (self._MON_SAYS_STOP, ))
    self.mon.set_status_prefix("d")
    self.mon.report_status("loading...")

    self.proxies = ProxySet(self, mon, self.args,
                            include_locations=self.dns_servers)
    self.mon.report_status("loading... (proxies OK)")

    # Drop DNS-server entries for which no proxy location exists; after
    # pruning, the two key sets must agree exactly.
    for loc in list(self.dns_servers.keys()):
        if loc not in self.proxies.locations:
            del self.dns_servers[loc]
    assert list(self.dns_servers.keys()) == \
        list(self.proxies.locations.keys())

    self.locations = {
        loc: LocationState(loc, self.dns_servers[loc], self.hostnames,
                           self.output_dir)
        for loc in self.dns_servers.keys()
    }
    self.mon.report_status("loading... (locations OK)")

    # One work thread per active proxy.
    for _ in range(self.args.max_simultaneous_proxies):
        wt = DNSWorker(self)
        self.mon.add_work_thread(wt)
        self.idle_workers.add(wt)
    self.mon.report_status("loading... (work threads OK)")

    # kick things off by starting one proxy
    (proxy, until_next, n_locations) = self.proxies.start_a_proxy()
    self.mon.report_status("{}/{}/{} locations active, {} started, "
                           "{} till next".format(
                               len(self.proxies.active_proxies),
                               n_locations,
                               len(self.locations),
                               proxy.label() if proxy else None,
                               until_next))
    while n_locations:
        time_now = time.monotonic()
        # Technically, until_next being None means "wait for a proxy
        # to exit", but use an hour as a backstop.  (When a proxy does
        # exit, this will get knocked down to zero below.)
        if until_next is None:
            until_next = 3600
        time_next = time_now + until_next
        pending_stop = False
        while time_now < time_next:
            for msg in queue_iter(self.status_queue, until_next):
                if msg[0] == self._PROXY_ONLINE:
                    self.proxies.note_proxy_online(msg[1])
                    self.mon.report_status("proxy {} online".format(
                        msg[1].label()))
                    self.mon.idle(1)
                elif msg[0] == self._PROXY_OFFLINE:
                    self.mon.report_status("proxy {} offline".format(
                        msg[1].label()))
                    self.proxies.note_proxy_offline(msg[1])
                    # Wait no more than 5 minutes before trying to
                    # start another proxy.  (XXX This hardwires a
                    # specific provider's policy.)
                    time_now = time.monotonic()
                    time_next = min(time_next, time_now + 300)
                    until_next = time_next - time_now
                elif msg[0] == self._BATCH_COMPLETE:
                    locstate = self.active_workers[msg[1]]
                    del self.active_workers[msg[1]]
                    self.idle_workers.add(msg[1])
                    locstate.complete_job()
                    self.mon.report_status("{} batch complete".format(
                        locstate.location))
                elif msg[0] == self._BATCH_FAILED:
                    locstate = self.active_workers[msg[1]]
                    del self.active_workers[msg[1]]
                    self.idle_workers.add(msg[1])
                    locstate.fail_job()
                    self.mon.report_status("{} batch failed".format(
                        locstate.location))
                elif msg[0] == self._DROP_WORKER:
                    # BUGFIX: 'worker' was read here without ever being
                    # assigned (NameError); the dropped worker is msg[1].
                    worker = msg[1]
                    self.idle_workers.discard(worker)
                    if worker in self.active_workers:
                        self.active_workers[worker].fail_job()
                        del self.active_workers[worker]
                elif msg[0] == self._MON_SAYS_STOP:
                    self.mon.report_status("interrupt pending")
                    pending_stop = True
                else:
                    # BUGFIX: formatted undefined 'message'; the loop
                    # variable is 'msg'.
                    self.mon.report_error(
                        "bogus message: {!r}".format(msg))
            for loc, state in self.locations.items():
                if state.finished_p():
                    self.mon.report_status("{} finished".format(loc))
                    if loc in self.proxies.locations:
                        self.proxies.locations[loc].finished()
            if pending_stop:
                self.mon.report_status("interrupted")
                self.mon.maybe_pause_or_stop()
                # don't start new work yet, the set of proxies
                # available may be totally different now
            else:
                for proxy in self.proxies.active_proxies:
                    if not self.idle_workers:
                        break
                    if not proxy.online:
                        continue
                    state = self.locations[proxy.loc]
                    if state.idle_p():
                        worker = self.idle_workers.pop()
                        self.active_workers[worker] = state
                        state.queue_job(worker, proxy)
                        self.mon.report_status("queuing job for {}".format(
                            proxy.label()))
            time_now = time.monotonic()
            until_next = time_next - time_now
        # when we get to this point, it's time to start another proxy
        (proxy, until_next, n_locations) = self.proxies.start_a_proxy()
        self.mon.report_status("{}/{}/{} locations active, {} started, "
                               "{} till next".format(
                                   len(self.proxies.active_proxies),
                                   n_locations,
                                   len(self.locations),
                                   proxy.label() if proxy else None,
                                   until_next))
    # done, kill off all the workers
    self.mon.report_status("finished")
    assert not self.active_workers
    for w in self.idle_workers:
        w.finished()
class CaptureDispatcher:
    """Coordinates page-capture workers across a set of proxied locales.

    Runs as a callable on the monitor's thread: builds the proxy set and
    a capture-progress table in the database, starts worker threads, and
    then loops handing out batches of URLs to workers per locale until
    every locale's to-do count reaches zero.
    """

    def __init__(self, args):
        # complete initialization deferred till we're on the right thread
        self.args = args
        self.idle_workers = set()
        # worker -> (PerLocaleState, batch) for workers with work in flight
        self.active_workers = {}
        self.locations = {}
        self.overall_jobsize = 0
        self.proxies = None
        self.mon = None
        self.db = None
        self.status_queue = None
        self.status_queue_serializer = 0

    def __call__(self, mon, thr):
        """Thread entry point: finish initialization, then dispatch."""
        self.mon = mon
        self.status_queue = queue.PriorityQueue()
        self.mon.register_event_queue(self.status_queue,
                                      (self._MON_SAYS_STOP, -1))
        self.mon.set_status_prefix("d")
        self.mon.report_status("loading...")
        self.proxies = ProxySet(self, self.mon, self.args,
                                self.proxy_sort_key)
        self.mon.report_status("loading... (proxies OK)")
        self.db = url_database.ensure_database(self.args)
        self.prepare_database()
        for _ in range(self.args.total_workers):
            wt = CaptureWorker(self)
            self.mon.add_work_thread(wt)
            self.idle_workers.add(wt)
        self.dispatcher_loop()

    # Status queue helper constants and methods.
    _PROXY_OFFLINE  = 1
    _PROXY_ONLINE   = 2
    _BATCH_COMPLETE = 3
    _BATCH_FAILED   = 4
    _DROP_WORKER    = 5
    _MON_SAYS_STOP  = 6  # Stop after handling all incoming work

    # Entries in a PriorityQueue must be totally ordered.  We just
    # want to service all COMPLETE messages ahead of all others, and
    # STOP messages after all others, so we give them all a serial
    # number which goes in the tuple right after the command code,
    # before the data.  This also means we don't have to worry about
    # unsortable data.
    def oq(self):
        """Return the next message serial number (monotonically increasing)."""
        self.status_queue_serializer += 1
        return self.status_queue_serializer

    # worker-to-dispatcher API
    def complete_batch(self, worker, result):
        """Report that `worker` finished its batch with `result`."""
        self.status_queue.put((self._BATCH_COMPLETE, self.oq(), worker,
                               result))

    def fail_batch(self, worker, exc_info):
        """Report that `worker`'s batch failed.  (exc_info is ignored here;
        the dispatcher requeues the batch as not-done.)"""
        self.status_queue.put((self._BATCH_FAILED, self.oq(), worker))

    def drop_worker(self, worker):
        """Report that `worker` is going away and must not be reused."""
        self.status_queue.put((self._DROP_WORKER, self.oq(), worker))

    # proxy-to-dispatcher API
    def proxy_online(self, proxy):
        """Report that `proxy` has come online."""
        self.status_queue.put((self._PROXY_ONLINE, self.oq(), proxy))

    def proxy_offline(self, proxy):
        """Report that `proxy` has gone offline."""
        self.status_queue.put((self._PROXY_OFFLINE, self.oq(), proxy))

    def _invalid_message(self, *args):
        self.mon.report_error("invalid status queue message {!r}"
                              .format(args))

    def dispatcher_loop(self):
        """Main dispatch loop: service status messages and assign batches
        until no locations remain with work to do."""
        # Kick things off by starting one proxy.
        (proxy, until_next, n_locations) = self.proxies.start_a_proxy()
        while n_locations:
            time_now = time.monotonic()
            # Technically, until_next being None means "wait for a proxy
            # to exit", but use an hour as a backstop.  (When a proxy does
            # exit, this will get knocked down to zero below.)
            if until_next is None:
                until_next = 3600
            time_next = time_now + until_next
            pending_stop = False
            while time_now < time_next:
                self.update_progress_statistics(n_locations, until_next)
                for msg in queue_iter(self.status_queue, until_next):
                    if msg[0] == self._PROXY_ONLINE:
                        self.proxies.note_proxy_online(msg[2])
                    elif msg[0] == self._PROXY_OFFLINE:
                        self.proxies.note_proxy_offline(msg[2])
                        # Wait no more than 5 minutes before trying to
                        # start another proxy.  (XXX This hardwires a
                        # specific provider's policy.)
                        time_now = time.monotonic()
                        time_next = min(time_next, time_now + 300)
                        until_next = time_next - time_now
                    elif msg[0] == self._BATCH_COMPLETE:
                        worker, result = msg[2], msg[3]
                        locstate, _ = self.active_workers[worker]
                        del self.active_workers[worker]
                        self.idle_workers.add(worker)
                        self.record_batch(locstate, *result)
                    elif msg[0] == self._BATCH_FAILED:
                        worker = msg[2]
                        # We might've already gotten a COMPLETE message
                        # with more precision.
                        if worker in self.active_workers:
                            locstate, batch = self.active_workers[worker]
                            del self.active_workers[worker]
                            self.idle_workers.add(worker)
                            self.record_batch(locstate, [], batch)
                    elif msg[0] == self._DROP_WORKER:
                        worker = msg[2]
                        self.idle_workers.discard(worker)
                        if worker in self.active_workers:
                            # BUGFIX: active_workers values are
                            # (state, batch) tuples, which have no
                            # fail_job(); record the batch as failed so
                            # the locale's n_workers/in_progress
                            # bookkeeping is unwound, as _BATCH_FAILED
                            # does.  (The worker itself stays dropped.)
                            locstate, batch = self.active_workers[worker]
                            del self.active_workers[worker]
                            self.record_batch(locstate, [], batch)
                    elif msg[0] == self._MON_SAYS_STOP:
                        self.mon.report_status("interrupt pending")
                        pending_stop = True
                    else:
                        # BUGFIX: formatted undefined 'message'; the
                        # loop variable is 'msg'.
                        self.mon.report_error("bogus message: {!r}"
                                              .format(msg))
                for loc, state in self.locations.items():
                    if state.todo == 0 and loc in self.proxies.locations:
                        self.proxies.locations[loc].finished()
                if pending_stop:
                    self.mon.report_status("interrupted")
                    self.mon.maybe_pause_or_stop()
                    # don't start new work yet, the set of proxies
                    # available may be totally different now
                else:
                    # One-second delay before starting new work, because
                    # proxies aren't always 100% up when they say they are.
                    self.mon.idle(1)
                    while self.idle_workers:
                        assigned_work = False
                        for proxy in self.proxies.active_proxies:
                            if not proxy.online:
                                continue
                            state = self.locations[proxy.loc]
                            if state.n_workers >= self.args.workers_per_loc:
                                continue
                            batch = self.select_batch(state)
                            if not batch:
                                # All work for this location is
                                # assigned to other workers already.
                                continue
                            state.n_workers += 1
                            state.in_progress.update(row[0] for row in batch)
                            worker = self.idle_workers.pop()
                            self.active_workers[worker] = (state, batch)
                            worker.queue_batch(state, batch)
                            assigned_work = True
                            if not self.idle_workers:
                                break
                        if not assigned_work:
                            break
                time_now = time.monotonic()
                until_next = time_next - time_now
            # when we get to this point, it's time to start another proxy
            (proxy, until_next, n_locations) = self.proxies.start_a_proxy()
        # done, kill off all the workers
        self.mon.report_status("finished")
        assert not self.active_workers
        for w in self.idle_workers:
            w.finished()

    def proxy_sort_key(self, loc, method):
        """Ordering key for choosing which locale's proxy to start next.

        Consider locales that currently have no workers at all first.
        Consider locales with more work to do first.
        Consider locales whose proxy is 'direct' first.
        Consider locales named 'us' first.
        As a final tie breaker use alphabetical order of locale name.
        """
        state = self.locations[loc]
        return (state.n_workers != 0,
                -state.todo,
                method != 'direct',
                loc != 'us',
                loc)

    def select_batch(self, loc):
        """Return up to batch_size (uid, url) rows not yet captured for
        locale `loc` and not already in flight for it."""
        with self.db, self.db.cursor() as cr:
            query = ('SELECT c.url as uid, s.url as url'
                     '  FROM capture_progress c, url_strings s'
                     ' WHERE c.url = s.id')
            query += ' AND NOT c."l_{0}"'.format(loc.locale)
            if loc.in_progress:
                # NOTE(review): uids are integers we generated, so this
                # string-built NOT IN clause is not an injection vector,
                # but a parameterized query would still be cleaner.
                query += ' AND c.url NOT IN ('
                query += ','.join(str(u) for u in loc.in_progress)
                query += ')'
            query += ' LIMIT {0}'.format(self.args.batch_size)
            cr.execute(query)
            return cr.fetchall()

    def record_batch(self, loc, successes, failures):
        """Fold a finished batch back into locale state and the database.

        `successes` is a list of (url_id, result-dict) pairs; `failures`
        is a list of rows whose first element is the url id.  Failures
        are simply removed from the in-flight set (they will be selected
        again); successes are written to captured_pages and marked done
        in capture_progress.
        """
        locale = loc.locale
        loc.n_workers -= 1
        for r in failures:
            loc.in_progress.remove(r[0])
        if not successes:
            return
        with self.db, self.db.cursor() as cr:
            for s in successes:
                url_id = s[0]
                r = s[1]
                loc.in_progress.remove(url_id)
                redir_url = None
                redir_url_id = None
                if r['canon']:
                    redir_url = r['canon']
                if redir_url == r['ourl']:
                    # Redirected to itself: reuse the original id.
                    redir_url_id = url_id
                elif redir_url is not None:
                    try:
                        (redir_url_id, _) = \
                            url_database.add_url_string(cr, redir_url)
                    except (ValueError, UnicodeError):
                        # Unstorable redirect target: note it in the
                        # detail text instead.
                        addendum = "invalid redir url: " + redir_url
                        if ('detail' not in r or r['detail'] is None):
                            r['detail'] = addendum
                        else:
                            r['detail'] += " | " + addendum
                # Intern the detail string, inserting it on first sight.
                detail_id = self.capture_detail.get(r['detail'])
                if detail_id is None:
                    cr.execute("INSERT INTO capture_detail(id, detail) "
                               " VALUES(DEFAULT, %s)"
                               " RETURNING id",
                               (r['detail'],))
                    detail_id = cr.fetchone()[0]
                    self.capture_detail[r['detail']] = detail_id
                result = url_database.categorize_result(r['status'],
                                                        r['detail'],
                                                        url_id,
                                                        redir_url_id)
                to_insert = {"locale":       locale,
                             "url":          url_id,
                             "result":       result,
                             "detail":       detail_id,
                             "redir_url":    redir_url_id,
                             "log":          r['log'],
                             "html_content": r['content'],
                             "screenshot":   r['render']}
                cr.execute("INSERT INTO captured_pages"
                           "(locale, url, access_time, result, detail,"
                           " redir_url, capture_log, html_content,"
                           " screenshot)"
                           "VALUES ("
                           "  %(locale)s,"
                           "  %(url)s,"
                           "  TIMESTAMP 'now',"
                           "  %(result)s,"
                           "  %(detail)s,"
                           "  %(redir_url)s,"
                           "  %(log)s,"
                           "  %(html_content)s,"
                           "  %(screenshot)s)",
                           to_insert)
                cr.execute('UPDATE capture_progress SET "l_{0}" = TRUE '
                           ' WHERE url = {1}'.format(locale, url_id))
                loc.todo -= 1

    def update_progress_statistics(self, n_locations, until_next):
        """Emit a one-line status summary of per-locale progress."""
        jobsize = 0
        plreport = []
        for plstate in self.locations.values():
            jobsize = max(jobsize, plstate.todo)
            plreport.append((-plstate.todo, plstate.locale))
        plreport.sort()
        plreport = " ".join("{}:{}".format(pl[1], -pl[0]) for pl in plreport)
        self.mon.report_status("Processing {}/{} URLs | "
                               "{}/{}/{} active, {} till next | {}"
                               .format(jobsize, self.overall_jobsize,
                                       len(self.proxies.active_proxies),
                                       n_locations,
                                       len(self.locations),
                                       until_next,
                                       plreport))

    def prepare_database(self):
        """Build the temporary capture_progress table and per-locale state.

        Regenerated from scratch each run from the urls_* and
        captured_pages tables; also computes per-locale to-do counts and
        retires locales with nothing left to capture.
        """
        self.locations = {
            loc: PerLocaleState(loc, proxy)
            for loc, proxy in self.proxies.locations.items()
        }
        with self.db, self.db.cursor() as cr:
            # Cache the status table in memory; it's reasonably small.
            self.mon.report_status("Preparing database... "
                                   "(capture detail)")
            cr.execute("SELECT detail, id FROM capture_detail;")
            self.capture_detail = {row.detail: row.id for row in cr}

            # The capture_progress table tracks what we've done so far.
            # It is regenerated from scratch each time this program is run,
            # based on the contents of the urls_* and captured_pages tables.
            self.mon.maybe_pause_or_stop()
            self.mon.report_status("Preparing database... "
                                   "(capture progress)")
            l_columns = ",\n ".join(
                "\"l_{0}\" BOOLEAN NOT NULL DEFAULT FALSE"
                .format(loc) for loc in self.locations.keys())
            cr.execute("CREATE TEMPORARY TABLE capture_progress ("
                       " url INTEGER PRIMARY KEY," + l_columns + ");")

            # Determine the set of URLs yet to be captured from the selected
            # tables.
            self.mon.maybe_pause_or_stop()
            self.mon.report_status("Preparing database... "
                                   "(capture progress rows)")
            cr.execute("SELECT table_name FROM information_schema.tables"
                       " WHERE table_schema = %s"
                       " AND table_type = 'BASE TABLE'"
                       " AND table_name LIKE 'urls_%%'",
                       (self.args.schema,))
            all_url_tables = set(row[0] for row in cr)
            if self.args.tables is None:
                want_url_tables = all_url_tables
            else:
                want_url_tables = set("urls_"+t.strip()
                                      for t in self.args.tables.split(","))
                if not want_url_tables.issubset(all_url_tables):
                    raise RuntimeError("Requested URL tables do not exist: "
                                       + ", ".join(
                                           t[5:] for t in
                                           want_url_tables - all_url_tables))

            for tbl in want_url_tables:
                self.mon.maybe_pause_or_stop()
                self.mon.report_status("Preparing database... "
                                       "(capture progress rows: {})"
                                       .format(tbl))
                # Only one row per URL, even if it appears in more than one
                # source table.
                cr.execute("INSERT INTO capture_progress (url) "
                           " SELECT url FROM "+tbl+
                           " EXCEPT SELECT url FROM capture_progress")

            self.mon.maybe_pause_or_stop()
            self.mon.report_status("Preparing database... (analyzing)")
            cr.execute("ANALYZE captured_pages")

            for loc in self.locations.keys():
                self.mon.maybe_pause_or_stop()
                self.mon.report_status("Preparing database... "
                                       "(capture progress values: {})"
                                       .format(loc))
                cr.execute('UPDATE capture_progress c SET "l_{0}" = TRUE'
                           '  FROM captured_pages p'
                           ' WHERE c.url = p.url AND p.locale = \'{0}\''
                           .format(loc))
                self.mon.maybe_pause_or_stop()
                self.mon.report_status("Preparing database... (indexing: {})"
                                       .format(loc))
                cr.execute("CREATE INDEX \"capture_progress_l_{0}_idx\""
                           " ON capture_progress(\"l_{0}\");"
                           .format(loc))

            self.mon.maybe_pause_or_stop()
            self.mon.report_status("Preparing database... (analyzing)")
            cr.execute("ANALYZE capture_progress")

            self.mon.maybe_pause_or_stop()
            self.mon.report_status("Preparing database... (statistics)")
            query = "SELECT COUNT(*)"
            for loc in self.locations.keys():
                query += ', SUM("l_{0}"::INTEGER) AS "l_{0}"'.format(loc)
            query += " FROM capture_progress"
            cr.execute(query)

            # Compute the number of unvisited URLs for each locale,
            # and remove locales where that number is zero from the
            # working set.
            counts = cr.fetchone()
            self.overall_jobsize = counts[0]
            for loc, done in zip(self.locations.keys(), counts[1:]):
                todo = self.overall_jobsize - done
                assert todo >= 0
                if todo:
                    self.locations[loc].todo = todo
                else:
                    self.locations[loc].proxy.finished()

            self.mon.maybe_pause_or_stop()
            self.mon.report_status("Database prepared.")