Пример #1
0
def loop():
    """Extractor worker: consume resource IDs from the pending queue forever.

    Each iteration atomically moves one ID from "extractor.pending" to
    "extractor.working" (blocking until one arrives), runs extract() on it,
    then routes it onward: successes feed the tokenizer and the "ready"
    list, failures land on "extractor.failed". In either case the ID is
    removed from the in-flight ("working") list afterwards.
    """
    while True:
        raw = conn.brpoplpush("extractor.pending", "extractor.working")
        res_id = raw.decode("utf-8")
        ok = extract(res_id)
        if ok:
            print(f"Success on {res_id}")
            # Notify tokenizer
            conn.lpush("tokenizer.pending", res_id)
            conn.rpush("ready", res_id)
        else:
            print(f"Failed on {res_id}")
            conn.lpush("extractor.failed", res_id)
        # Processed (or recorded as failed) — drop from the in-flight list.
        conn.lrem("extractor.working", 0, res_id)
Пример #2
0
def crawl(uri):
    """Fetch *uri*, store it if it matches a target pattern, and queue
    newly-discovered links for further crawling.

    Redis side effects: allocates IDs via the "crawler.id" counter, stores
    the page in the "uri"/"raw" hashes, notifies the extractor queue,
    dedups links through the "crawler.stored" set, and feeds the two
    pending queues. Sets the module-global PENDING_EXIT flag once the
    allocated ID reaches THRESHOLD.
    """
    resp = requests.get(uri,
                        headers={
                            "Accept-Charset": "utf-8",
                            "User-Agent": FAKE_UA,
                        })

    # Use .get() — a response with no Content-Type header should be
    # skipped like any non-HTML response, not raise KeyError.
    if not resp.headers.get("content-type", "").startswith("text/html"):
        print("Incorrect Content-Type")
        return

    # NOTE(review): get_unicode_from_response is deprecated in requests;
    # resp.text is the supported equivalent — confirm before switching.
    content = requests.utils.get_unicode_from_response(resp)
    soup = BeautifulSoup(content, features="lxml")

    if any(p.match(uri) for p in targetRe):
        resId = conn.incr("crawler.id")
        print(f"Is a target URI, allocated ID {resId}")

        # Reading THRESHOLD needs no `global`; only the write below does.
        if resId >= THRESHOLD:
            global PENDING_EXIT
            PENDING_EXIT = True
        conn.hset("uri", resId, uri)
        conn.hset("raw", resId, content)
        # Notify extractor
        conn.lpush("extractor.pending", resId)

    for link in soup.find_all("a"):
        href = link.get("href")
        if href is None:
            continue

        href = href.split("#")[0]  # strip the fragment part
        href = urljoin(uri, href)  # Normalize

        if not any(p.match(href) for p in allowedRe):
            # Not a target
            continue
        if conn.sadd("crawler.stored", href) == 0:
            # Already in store
            continue
        if any(p.match(href) for p in targetRe):
            print(f"Pushing new target {href} to head")
            conn.lpush("crawler.pending.prioritized", href)
        else:
            print(f"Pushing new link {href} to tail")
            conn.lpush("crawler.pending", href)
Пример #3
0
#!/usr/bin/env python

from config import config
from db import conn

# Rebuild the tokenizer queue from scratch: drop any stale queue state,
# then enqueue every crawled resource that already has an extracted title.
conn.delete("tokenizer.pending")
conn.delete("tokenizer.working")
maxId = conn.get("crawler.id")
# crawler.id is unset until the first target page has been crawled;
# int(None) would raise TypeError, so there is simply nothing to enqueue.
if maxId is not None:
    for i in range(1, int(maxId) + 1):
        if conn.hexists("title", i):
            conn.lpush("tokenizer.pending", i)
Пример #4
0
            uri = uri.decode("utf-8")

        if conn.sadd("crawler.backlog", uri) == 0:
            print(f"{uri} already crawled, skipping")
            continue

        print(f"Working on: {uri}")
        crawl(uri)
        conn.lrem("crawler.working", 0, uri)

        global PENDING_EXIT
        if PENDING_EXIT:
            break


while True:
    # Re-queue whatever a previous (crashed) run left on the in-flight
    # list, then clear it before starting a fresh pass.
    stranded = conn.lrange("crawler.working", 0, -1)
    conn.delete("crawler.working")
    if stranded:
        conn.lpush("crawler.pending", *stranded)
        print(f"Recovered: {stranded}")

    # TODO: multithreading
    try:
        loop()
        if PENDING_EXIT:
            break
    except Exception as err:
        # Log the full traceback and back off before retrying.
        traceback.print_exception(None, err, err.__traceback__)
        time.sleep(10)
Пример #5
0
#!/usr/bin/env python

from config import config
from db import conn

# Prime the crawler with the configured seed URIs: each one goes onto the
# pending queue and into the dedup set so it is never re-queued later.
for seed in config['seeds']:
    conn.lpush('crawler.pending', seed)
    conn.sadd('crawler.stored', seed)
    print(f"Seed added: {seed}")
Пример #6
0
#!/usr/bin/env python

from config import config
from db import conn

# Rebuild the extractor queue from scratch: clear stale queue state, then
# re-enqueue every resource ID allocated so far.
conn.delete("extractor.pending")
conn.delete("extractor.working")
maxId = conn.get("crawler.id")
# Guard against an unset counter (int(None) raises TypeError) and an
# empty range — LPUSH with zero values is a Redis protocol error.
ids = range(1, int(maxId) + 1) if maxId is not None else range(0)
if ids:
    conn.lpush("extractor.pending", *ids)
Пример #7
0
        if word not in stash:
            stash[word] = 0
        stash[word] += 50

    for word in textSegs:
        if word not in stash:
            stash[word] = 0
        stash[word] += 10

    for k, v in stash.items():
        wordConn.zadd(k, math.log(v), resId)
        lookupConn.sadd(resId, k)


def loop():
    """Tokenizer worker: block on the pending queue and process forever.

    Atomically moves each resource ID from "tokenizer.pending" to
    "tokenizer.working" (blocking until one arrives), tokenizes it, then
    removes it from the in-flight list once done.
    """
    while True:
        res_id = conn.brpoplpush(
            "tokenizer.pending", "tokenizer.working"
        ).decode("utf-8")
        print(f"Working on {res_id}")
        tokenize(res_id)
        conn.lrem("tokenizer.working", 0, res_id)


# Crash recovery: anything a previous run left on the in-flight list goes
# back onto the pending queue before the worker loop starts.
stranded = conn.lrange("tokenizer.working", 0, -1)
conn.delete("tokenizer.working")
if stranded:
    conn.lpush("tokenizer.pending", *stranded)
    print(f"Recovered: {stranded}")
loop()
Пример #8
0
#!/usr/bin/env python

from config import config
from db import conn

# Crash recovery: push back whatever a previous run left in flight on the
# working list.
stranded = conn.lrange("extractor.working", 0, -1)
conn.delete("extractor.working")
if stranded:
    conn.lpush("extractor.pending", *stranded)
    print(f"Recovered: {stranded}")

# Give previously failed extractions another chance on the pending queue.
retries = conn.lrange("extractor.failed", 0, -1)
conn.delete("extractor.failed")
if retries:
    conn.lpush("extractor.pending", *retries)
    print(f"Requeued count: {len(retries)}")