import argparse

# Import path assumed from the lookyloo package layout; adjust if Indexing lives elsewhere.
from lookyloo.lookyloo import Indexing, Lookyloo


def main():
    parser = argparse.ArgumentParser(description='Rebuild the redis cache.')
    parser.add_argument('--rebuild_pickles', default=False, action='store_true',
                        help='Delete and rebuild the pickles. Count roughly 20s per pickle; it can take a very long time.')
    args = parser.parse_args()

    lookyloo = Lookyloo()
    if args.rebuild_pickles:
        lookyloo.rebuild_all()
    else:
        lookyloo.rebuild_cache()

    indexing = Indexing()
    indexing.clear_indexes()
    for capture_uuid in lookyloo.capture_uuids:
        index = True
        try:
            tree = lookyloo.get_crawled_tree(capture_uuid)
        except Exception as e:
            print(capture_uuid, e)
            continue

        if lookyloo.is_public_instance:
            cache = lookyloo.capture_cache(capture_uuid)
            if not cache:
                continue
            if cache.no_index is not None:
                index = False

        # NOTE: these methods do nothing if we just generated the pickle
        # when calling lookyloo.get_crawled_tree
        if index:
            indexing.index_cookies_capture(tree)
            indexing.index_body_hashes_capture(tree)
            indexing.index_url_capture(tree)
            categories = list(lookyloo.categories_capture(capture_uuid).keys())
            indexing.index_categories_capture(capture_uuid, categories)
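# Minimal, standalone sketch of how the flag above behaves (stdlib only, names prefixed
# with `_demo` are illustrative): a store_true argument defaults to False, so the
# cache-only rebuild is the default path and `--rebuild_pickles` opts into the slow
# full rebuild of every pickle.
_demo_parser = argparse.ArgumentParser(description='Rebuild the redis cache.')
_demo_parser.add_argument('--rebuild_pickles', default=False, action='store_true')
print(_demo_parser.parse_args([]).rebuild_pickles)                     # False -> rebuild_cache()
print(_demo_parser.parse_args(['--rebuild_pickles']).rebuild_pickles)  # True  -> rebuild_all()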
# The names `today`, `stats` and `lookyloo` are assumed to be defined earlier in the
# surrounding function; they are not part of this snippet.
from typing import Dict, Set, Union
from urllib.parse import urlparse

# Per-week counters for the previous and current ISO calendar week.
calendar_week = today.isocalendar()[1]
weeks_stats: Dict[int, Dict[str, Union[int, Set[str]]]] = \
    {calendar_week - 1: {'analysis': 0, 'analysis_with_redirects': 0, 'redirects': 0, 'uniq_urls': set()},
     calendar_week: {'analysis': 0, 'analysis_with_redirects': 0, 'redirects': 0, 'uniq_urls': set()}}


def uniq_domains(uniq_urls):
    # Collapse a set of URLs down to the set of hostnames they point to.
    domains = set()
    for url in uniq_urls:
        splitted = urlparse(url)
        domains.add(splitted.hostname)
    return domains


# Aggregate per-month statistics over every known capture.
for uuid in lookyloo.capture_uuids:
    cache = lookyloo.capture_cache(uuid)
    if not cache or not hasattr(cache, 'timestamp'):
        continue
    date = cache.timestamp
    if date.year not in stats:
        stats[date.year] = {}
    if date.month not in stats[date.year]:
        stats[date.year][date.month] = {'analysis': 0, 'analysis_with_redirects': 0,
                                        'redirects': 0, 'uniq_urls': set()}
    stats[date.year][date.month]['analysis'] += 1
    if len(cache.redirects) > 0:
        stats[date.year][date.month]['analysis_with_redirects'] += 1
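# A minimal usage sketch of the uniq_domains helper above, with hypothetical URLs for
# illustration only: urlparse(...).hostname collapses each URL to its hostname, so
# several paths on the same host count as a single domain.
example_urls = {'https://www.example.com/a', 'https://www.example.com/b',
                'http://other.example.org/'}
print(uniq_domains(example_urls))  # -> {'www.example.com', 'other.example.org'} (set order varies)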