def loop():
    while True:
        # Block until an ID arrives, atomically moving it into the working queue
        resId = conn.brpoplpush("extractor.pending", "extractor.working").decode("utf-8")
        if extract(resId):
            print(f"Success on {resId}")
            # Notify tokenizer
            conn.lpush("tokenizer.pending", resId)
            conn.rpush("ready", resId)
        else:
            print(f"Failed on {resId}")
            conn.lpush("extractor.failed", resId)
        # Done with this ID either way; drop it from the working queue
        conn.lrem("extractor.working", 0, resId)
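The extract() call above is not part of this excerpt. As a rough sketch of what the loop expects it to do, inferred from the keys the other scripts use (the crawler writes the "raw" and "uri" hashes, the tokenizer reset script checks the "title" hash): parse the stored HTML, persist the title and visible text, and report success. The "text" key name and the parsing details are assumptions.

from bs4 import BeautifulSoup

def extract(resId):
    # Hypothetical sketch -- not the actual implementation
    raw = conn.hget("raw", resId)
    if raw is None:
        return False
    soup = BeautifulSoup(raw, features="lxml")
    if soup.title is None or soup.title.string is None:
        return False
    conn.hset("title", resId, soup.title.string.strip())
    # Whitespace-normalized visible text; the "text" key name is a guess
    conn.hset("text", resId, " ".join(soup.get_text().split()))
    return True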
def crawl(uri):
    resp = requests.get(uri, headers={
        "Accept-Charset": "utf-8",
        "User-Agent": FAKE_UA,
    })
    if not resp.headers["content-type"].startswith("text/html"):
        print("Incorrect Content-Type")
        return
    content = requests.utils.get_unicode_from_response(resp)
    soup = BeautifulSoup(content, features="lxml")

    if any(p.match(uri) for p in targetRe):
        # This page is itself a target: store it and hand it to the extractor
        resId = conn.incr("crawler.id")
        print(f"Is a target URI, allocated ID {resId}")
        global THRESHOLD
        if resId >= THRESHOLD:
            global PENDING_EXIT
            PENDING_EXIT = True
        conn.hset("uri", resId, uri)
        conn.hset("raw", resId, content)
        # Notify extractor
        conn.lpush("extractor.pending", resId)

    # Queue outgoing links
    for link in soup.find_all("a"):
        href = link.get("href")
        if href is None:
            continue
        href = href.split("#")[0]  # Strip fragment
        href = urljoin(uri, href)  # Normalize
        if not any(p.match(href) for p in allowedRe):
            # Not an allowed link, don't follow
            continue
        if conn.sadd("crawler.stored", href) == 0:
            # Already in store
            continue
        if any(p.match(href) for p in targetRe):
            print(f"Pushing new target {href} to head")
            conn.lpush("crawler.pending.prioritized", href)
        else:
            print(f"Pushing new link {href} to tail")
            conn.lpush("crawler.pending", href)
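crawl() relies on FAKE_UA, allowedRe, targetRe, THRESHOLD and PENDING_EXIT, which are defined outside this excerpt (presumably from config). A hedged sketch of plausible definitions; the domain, patterns and numbers below are made up for illustration:

import re

# Browser-like User-Agent sent with every request (example string)
FAKE_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
           "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36")

# Links must match one of these patterns to be followed at all
allowedRe = [re.compile(r"^https?://example\.com/")]

# Pages matching one of these are stored and queued for extraction
targetRe = [re.compile(r"^https?://example\.com/article/\d+$")]

THRESHOLD = 100000   # Stop once this many target IDs have been allocated
PENDING_EXIT = False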
#!/usr/bin/env python from config import config from db import conn conn.delete("tokenizer.pending") conn.delete("tokenizer.working") maxId = conn.get("crawler.id") for i in range(1, int(maxId) + 1): if conn.hexists("title", i): conn.lpush("tokenizer.pending", i)
# Body of the crawler's loop(): `uri` has just been popped from the pending
# queue into "crawler.working".
        uri = uri.decode("utf-8")
        if conn.sadd("crawler.backlog", uri) == 0:
            print(f"{uri} already crawled, skipping")
            continue
        print(f"Working on: {uri}")
        crawl(uri)
        conn.lrem("crawler.working", 0, uri)
        global PENDING_EXIT
        if PENDING_EXIT:
            break


# Supervisor: requeue whatever a previous run left in "crawler.working",
# then run the worker loop, restarting it after any uncaught exception.
while True:
    failedReqs = conn.lrange("crawler.working", 0, -1)
    conn.delete("crawler.working")
    if len(failedReqs) > 0:
        conn.lpush("crawler.pending", *failedReqs)
        print(f"Recovered: {failedReqs}")
    # TODO: multithreading
    try:
        loop()
        if PENDING_EXIT:
            break
    except Exception as e:
        traceback.print_exception(None, e, e.__traceback__)
        time.sleep(10)
#!/usr/bin/env python
from config import config
from db import conn

for uri in config['seeds']:
    conn.lpush('crawler.pending', uri)
    conn.sadd('crawler.stored', uri)
    print(f"Seed added: {uri}")
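Every script does `from db import conn`; a minimal sketch of what db.py might contain, assuming a local Redis instance (host, port and database numbers are assumptions). The tokenizer also uses wordConn and lookupConn, which presumably point at separate databases of the same instance:

# db.py -- hypothetical sketch
import redis

# Shared queues, hashes and sets used by the crawler, extractor and tokenizer
conn = redis.Redis(host="localhost", port=6379, db=0)

# Tokenizer output: per-word sorted sets and the per-resource reverse lookup
wordConn = redis.Redis(host="localhost", port=6379, db=1)
lookupConn = redis.Redis(host="localhost", port=6379, db=2)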
#!/usr/bin/env python from config import config from db import conn conn.delete("extractor.pending") conn.delete("extractor.working") maxId = conn.get("crawler.id") conn.lpush("extractor.pending", *list(range(1, int(maxId) + 1)))
# Tail of tokenize(resId): words from the title are weighted 50, words from
# the body text are weighted 10.
        if word not in stash:
            stash[word] = 0
        stash[word] += 50

    for word in textSegs:
        if word not in stash:
            stash[word] = 0
        stash[word] += 10

    for k, v in stash.items():
        # Legacy redis-py 2.x positional zadd; on redis-py >= 3.0 this would be
        # wordConn.zadd(k, {resId: math.log(v)})
        wordConn.zadd(k, math.log(v), resId)
        lookupConn.sadd(resId, k)


def loop():
    while True:
        resId = conn.brpoplpush("tokenizer.pending", "tokenizer.working").decode("utf-8")
        print(f"Working on {resId}")
        tokenize(resId)
        conn.lrem("tokenizer.working", 0, resId)


# On startup, requeue whatever a previous run left in the working queue
failedReqs = conn.lrange("tokenizer.working", 0, -1)
conn.delete("tokenizer.working")
count = len(failedReqs)
if count > 0:
    conn.lpush("tokenizer.pending", *failedReqs)
    print(f"Recovered: {failedReqs}")

loop()
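The tokenizer leaves one sorted set per word in wordConn (resource IDs scored by the log of the weighted word count) and one set per resource in lookupConn listing its words. A hedged sketch of how a single-word query could be answered from that layout; the helper name and result shape are assumptions:

def lookup(word, limit=10):
    # Hypothetical query helper over the index built by tokenize()
    hits = wordConn.zrevrange(word, 0, limit - 1, withscores=True)
    results = []
    for resId, score in hits:
        resId = resId.decode("utf-8")
        results.append({
            "uri": conn.hget("uri", resId).decode("utf-8"),
            "title": conn.hget("title", resId).decode("utf-8"),
            "score": score,
        })
    return results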
#!/usr/bin/env python
from config import config
from db import conn

# Requeue IDs left behind in the working queue by a crashed extractor
failedReqs = conn.lrange("extractor.working", 0, -1)
conn.delete("extractor.working")
count = len(failedReqs)
if count > 0:
    conn.lpush("extractor.pending", *failedReqs)
    print(f"Recovered: {failedReqs}")

# Retry IDs that previously failed extraction
failedReqs = conn.lrange("extractor.failed", 0, -1)
conn.delete("extractor.failed")
count = len(failedReqs)
if count > 0:
    conn.lpush("extractor.pending", *failedReqs)
    print(f"Requeued count: {count}")