Example #1
def main(max_threads):
    scrape_callback = AlexaCallback()
    cache = Mongocache()
    thread_crawl(scrape_callback.seed_url,
                 scrape_callback=scrape_callback,
                 cache=cache,
                 max_threads=max_threads)
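Every example on this page passes an AlexaCallback instance as the scrape callback. The actual classes differ between projects, but a minimal sketch of such a callback, assuming it follows the common pattern of downloading and parsing the Alexa top-1m CSV archive (the seed_url below is the one spelled out in Example #7; everything else is illustrative):

import csv
import io
from zipfile import ZipFile


class AlexaCallback:
    """Hypothetical scrape callback: parses the Alexa top-1m CSV archive
    and returns the contained site URLs as new links to crawl."""

    def __init__(self, max_urls=500):
        self.max_urls = max_urls
        self.seed_url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'

    def __call__(self, url, html):
        # Only the seed URL (the zipped CSV) yields new links;
        # `html` is assumed to be the raw downloaded bytes.
        if url != self.seed_url:
            return []
        urls = []
        with ZipFile(io.BytesIO(html)) as zf:
            csv_name = zf.namelist()[0]
            reader = csv.reader(io.TextIOWrapper(zf.open(csv_name)))
            for _, website in reader:
                urls.append('http://' + website)
                if len(urls) == self.max_urls:
                    break
        return urls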
Example #2
def main(max_threads=5):
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    queue = MongoQueue()

    urls = []
    template = scrape_callback.seed_url[0:-2]
    for i in range(1, 1189):
        urls.append(template + str(i) + '/')

    while True:
        now = datetime.now()
        if now.hour < 3 or now.hour > 12:
            queue.repairFast()
            process_crawler(
                urls,
                scrape_callback=scrape_callback,
                cache=cache,
                max_threads=max_threads,
                timeout=30,
                host=urlparse.urlparse(scrape_callback.seed_url).netloc,
                user_agent=('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) '
                            'AppleWebKit/537.36 (KHTML, like Gecko) '
                            'Chrome/54.0.2840.98 Safari/537.36')
            )
        else:
            print('pass:' + str(now))
        time.sleep(3600)
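The main(max_threads=...) variants above are presumably launched from a small command-line entry point. A minimal sketch of one; the flag name and default are illustrative, not taken from any of these projects:

import argparse

if __name__ == '__main__':
    # Illustrative wrapper around one of the main(max_threads=...) functions above.
    parser = argparse.ArgumentParser(description='Run the crawler')
    parser.add_argument('--max-threads', type=int, default=5,
                        help='number of threads (or processes) to crawl with')
    args = parser.parse_args()
    main(max_threads=args.max_threads)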
Example #3
def main():
    starttime = datetime.datetime.now()
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    # cache.clear()
    threaded_crawler('http://example.webscraping.com', scrape_callback.seed_url,
                     scrape_callback=scrape_callback)
    endtime = datetime.datetime.now()
    print((endtime - starttime).seconds)
Example #4
def main(max_threads):
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    cache.clear()
    process_crawler(scrape_callback.seed_url,
                    scrape_callback=scrape_callback,
                    cache=cache,
                    max_threads=max_threads,
                    timeout=10)
Example #5
def main():
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    #cache.clear()
    link_crawler(scrape_callback.seed_url,
                 scrape_callback=scrape_callback,
                 cache=cache,
                 timeout=10,
                 ignore_robots=True)
Example #6
def main(max_threads):
    from mongo_cache import MongoCache
    from alexa_cb import AlexaCallback
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    # cache.clear()
    threaded_crawler(scrape_callback.seed_url,
                     scrape_callback=scrape_callback,
                     cache=cache,
                     max_threads=max_threads,
                     timeout=10)
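MongoCache appears in almost every example as a dict-like cache keyed by URL. A sketch of such a class under that assumption; the collection name, compression, and expiry index are illustrative, not the implementation actually behind these examples:

import pickle
import zlib
from datetime import datetime, timedelta

from bson.binary import Binary
from pymongo import MongoClient


class MongoCache:
    """Hypothetical dict-like cache: stores downloaded pages in MongoDB,
    keyed by URL, with an expiry window."""

    def __init__(self, client=None, expires=timedelta(days=30)):
        self.client = client or MongoClient('localhost', 27017)
        self.db = self.client.cache
        # Let MongoDB discard documents once they are older than `expires`.
        self.db.webpage.create_index('timestamp',
                                     expireAfterSeconds=int(expires.total_seconds()))

    def __getitem__(self, url):
        record = self.db.webpage.find_one({'_id': url})
        if record is None:
            raise KeyError(url + ' does not exist')
        return pickle.loads(zlib.decompress(record['result']))

    def __setitem__(self, url, result):
        record = {'result': Binary(zlib.compress(pickle.dumps(result))),
                  'timestamp': datetime.utcnow()}
        self.db.webpage.update_one({'_id': url}, {'$set': record}, upsert=True)

    def clear(self):
        self.db.webpage.drop()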
Example #7
def main(max_threads=4, max_websites=3, depth=3):
    ws_urls_cache = MongoWSURL()
    ws_urls_cache.clear()
    crawling_queue = MongoCrawlingQueue()
    crawling_queue.clear()
    crawled_urls = MongoCrawledURL()
    crawled_urls.clear()
    seed_url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'
    alexa = AlexaCallback(max_websites)
    try:
        links = alexa(seed_url) or []
    except Exception as e:
        print('Error in downloading {}: {}'.format(seed_url, e))
    else:
        for link in links:
            crawling_queue.push(link, depth)
    process_crawler(ws_urls_cache=ws_urls_cache,
                    max_threads=max_threads,
                    timeout=10,
                    depth=depth)
Example #8
def main(max_threads=5):
    catalog_callback = AlexaCallback()
    cache = MongoCache()
    queue = MongoQueue()

    client = MongoClient('localhost', 27017, connect=False)
    # The cache database stores the cached webpages; a collection is the
    # equivalent of a table in a relational database.
    db = client.cache
    cursor = db.books.find()

    urls = []
    while cursor.alive:
        temp = next(cursor)
        temp = temp['link']

        if urlparse.urlparse(catalog_callback.seed_url).netloc == 'www.junzige.la':
            temp = '/novel' + temp[5:-4] + '/'
            temp = normalize(catalog_callback.seed_url, temp)
        elif urlparse.urlparse(catalog_callback.seed_url).netloc == 'www.boluoxs.com':
            temp = 'http://www.boluoxs.com/biquge/0/' + temp[temp.rfind('/') + 1:temp.rfind('.')] + '/'

        print(temp)
        urls.append(temp)

    print(urls[0])

    while True:
        now = datetime.now()

        if now.hour < 3 or now.hour > 12:
            queue.repairFast()
            process_crawler(urls,
                            scrape_callback=catalog_callback,
                            cache=cache,
                            max_threads=max_threads,
                            timeout=30,
                            host=urlparse.urlparse(catalog_callback.seed_url).netloc,
                            user_agent=('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) '
                                        'AppleWebKit/537.36 (KHTML, like Gecko) '
                                        'Chrome/54.0.2840.98 Safari/537.36'))
            # each time a crawl pass finishes, clear the job queue
            queue.clear()
        else:
            print('pass:' + str(now))
        time.sleep(3600)
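Examples #2 and #8 both call queue.repairFast() before each crawl pass, which implies a MongoDB-backed job queue whose "repair" step releases URLs that a crashed or stalled worker left in a processing state. A sketch of such a queue under that assumption; the method and field names mirror the calls above but are otherwise illustrative:

from datetime import datetime, timedelta

from pymongo import MongoClient, errors


class MongoQueue:
    """Hypothetical crawl queue with three job states and a repair step."""
    OUTSTANDING, PROCESSING, COMPLETE = range(3)

    def __init__(self, client=None, timeout=300):
        self.client = client or MongoClient('localhost', 27017)
        self.db = self.client.cache
        self.timeout = timeout

    def push(self, url):
        # Insert the URL once; duplicates are silently ignored.
        try:
            self.db.crawl_queue.insert_one({'_id': url, 'status': self.OUTSTANDING})
        except errors.DuplicateKeyError:
            pass

    def pop(self):
        # Atomically claim an outstanding URL for this worker.
        record = self.db.crawl_queue.find_one_and_update(
            {'status': self.OUTSTANDING},
            {'$set': {'status': self.PROCESSING, 'timestamp': datetime.utcnow()}})
        if record is None:
            raise KeyError('queue is empty')
        return record['_id']

    def complete(self, url):
        self.db.crawl_queue.update_one({'_id': url},
                                       {'$set': {'status': self.COMPLETE}})

    def repairFast(self):
        # Release jobs whose workers have gone silent for too long.
        cutoff = datetime.utcnow() - timedelta(seconds=self.timeout)
        self.db.crawl_queue.update_many(
            {'timestamp': {'$lt': cutoff}, 'status': self.PROCESSING},
            {'$set': {'status': self.OUTSTANDING}})

    def clear(self):
        self.db.crawl_queue.drop()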
Example #9
def main():
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    cache.clear()
    link_crawler(scrape_callback.seed_url,
                 scrape_callback=scrape_callback,
                 cache=cache,
                 user_agent='GoodCrawler',
                 ignore_robots=True)
Example #10
def main():
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    # cache.clear()
    link_crawler(seed_url=scrape_callback.seed_url,
                 cache_callback=cache,
                 scrape_callback=scrape_callback)
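The link_crawler used in Examples #5, #9 and #10 clearly differs between projects: the cache is passed as cache in one call and cache_callback in another, and user_agent, timeout and ignore_robots only appear in some of them. Purely for illustration, a self-contained skeleton that would accept those calls and run a basic breadth-first crawl; it is not the implementation behind any of these examples:

from urllib.request import Request, urlopen


def link_crawler(seed_url, scrape_callback=None, cache=None, cache_callback=None,
                 user_agent='GoodCrawler', timeout=10, ignore_robots=False):
    """Illustrative skeleton only: a breadth-first crawl loop shaped like the
    calls in Examples #5, #9 and #10. A real implementation would also handle
    delays, retries, download errors, and robots.txt (ignore_robots is accepted
    here but not enforced)."""
    cache = cache if cache is not None else cache_callback  # the examples use both names
    crawl_queue = [seed_url]
    seen = {seed_url}
    while crawl_queue:
        url = crawl_queue.pop()
        html = None
        if cache is not None:
            try:
                html = cache[url]  # assumes the dict-like cache interface sketched above
            except KeyError:
                html = None
        if html is None:
            request = Request(url, headers={'User-Agent': user_agent})
            html = urlopen(request, timeout=timeout).read()
            if cache is not None:
                cache[url] = html
        if scrape_callback:
            for link in scrape_callback(url, html) or []:
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)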