def main(max_threads):
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    thread_crawl(scrape_callback.seed_url, scrape_callback=scrape_callback,
                 cache=cache, max_threads=max_threads)

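# These entry points all assume an AlexaCallback exposing a seed_url
# attribute and usable as a scrape callback. Below is a minimal sketch of
# that interface; the ZIP/CSV parsing details are assumptions based on the
# Alexa top-1m URL used later in this file, and the __call__ signature
# varies across the variants below (one of them calls it with a single
# argument), so this is not the exact implementation behind these scripts.
import csv
import io
import zipfile


class AlexaCallbackSketch:
    def __init__(self, max_urls=1000):
        self.max_urls = max_urls
        self.seed_url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'

    def __call__(self, url, html):
        """Extract up to max_urls website URLs from the downloaded CSV."""
        if url != self.seed_url:
            return []
        urls = []
        # html is assumed to be the raw bytes of the downloaded ZIP archive
        with zipfile.ZipFile(io.BytesIO(html)) as zf:
            data = zf.read(zf.namelist()[0]).decode('utf-8')
            for _, website in csv.reader(io.StringIO(data)):
                urls.append('http://' + website)
                if len(urls) == self.max_urls:
                    break
        return urls
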
def main(max_threads=5):
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    queue = MongoQueue()
    # expand the seed URL into one URL per numbered catalogue page
    urls = []
    template = scrape_callback.seed_url[:-2]
    for i in range(1, 1189):
        urls.append(template + str(i) + '/')
    while True:
        now = datetime.now()
        # only crawl outside the 03:00-12:00 window
        if now.hour < 3 or now.hour > 12:
            queue.repairFast()  # release jobs stuck in the processing state
            process_crawler(
                urls,
                scrape_callback=scrape_callback,
                cache=cache,
                max_threads=max_threads,
                timeout=30,
                host=urlparse.urlparse(scrape_callback.seed_url).netloc,
                user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/54.0.2840.98 Safari/537.36')
        else:
            print('pass:' + str(now))
        time.sleep(3600)

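# Several scripts in this file rely on the Python 2 urlparse module. This
# common compatibility shim (not part of the original sources) keeps the
# urlparse.urlparse(...) calls working under Python 3 as well.
try:
    import urlparse  # Python 2
except ImportError:
    import urllib.parse as urlparse  # Python 3
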
def main():
    start_time = datetime.datetime.now()
    scrape_callback = AlexaCallback()
    cache = MongoCache()  # created but not passed to the crawler in this variant
    # cache.clear()
    threaded_crawler('http://example.webscraping.com', scrape_callback.seed_url,
                     scrape_callback=scrape_callback)
    end_time = datetime.datetime.now()
    # report how long the crawl took, in whole seconds
    print((end_time - start_time).seconds)

def main(max_threads):
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    cache.clear()  # start from an empty cache
    process_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback,
                    cache=cache, max_threads=max_threads, timeout=10)

def main():
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    # cache.clear()
    link_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback,
                 cache=cache, timeout=10, ignore_robots=True)

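# A minimal sketch of the dict-like MongoCache these crawlers pass around:
# downloaded pages keyed by URL in a MongoDB collection, plus the clear()
# method the scripts above call. The collection name and stored fields are
# assumptions, not the exact schema used here.
from datetime import datetime

from pymongo import MongoClient


class MongoCacheSketch:
    def __init__(self, client=None):
        self.client = client or MongoClient('localhost', 27017)
        self.db = self.client.cache

    def __getitem__(self, url):
        # look up a cached download by URL
        record = self.db.webpage.find_one({'_id': url})
        if record is None:
            raise KeyError(url + ' is not in the cache')
        return record['result']

    def __setitem__(self, url, result):
        # insert or update the cached download for this URL
        record = {'result': result, 'timestamp': datetime.utcnow()}
        self.db.webpage.update_one({'_id': url}, {'$set': record}, upsert=True)

    def clear(self):
        self.db.webpage.drop()
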
def main(max_threads):
    from mongo_cache import MongoCache
    from alexa_cb import AlexaCallback
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    # cache.clear()
    threaded_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback,
                     cache=cache, max_threads=max_threads, timeout=10)

def main(max_threads=4, max_websites=3, depth=3):
    # start with fresh collections for website URLs, the crawl queue
    # and already-crawled pages
    ws_urls_cache = MongoWSURL()
    ws_urls_cache.clear()
    crawling_queue = MongoCrawlingQueue()
    crawling_queue.clear()
    crawled_urls = MongoCrawledURL()
    crawled_urls.clear()
    seed_url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'
    alexa = AlexaCallback(max_websites)
    try:
        links = alexa(seed_url) or []
    except Exception as e:
        print('Error in downloading {}: {}'.format(seed_url, e))
    else:
        # seed the queue with the top websites, then crawl them
        for link in links:
            crawling_queue.push(link, depth)
        process_crawler(ws_urls_cache=ws_urls_cache, max_threads=max_threads,
                        timeout=10, depth=depth)

def main(max_threads=5):
    catalog_callback = AlexaCallback()
    cache = MongoCache()
    queue = MongoQueue()
    client = MongoClient('localhost', 27017, connect=False)
    # the cache database stores cached webpages,
    # the equivalent of a table in a relational database
    db = client.cache
    cursor = db.books.find()
    # rewrite each stored book link into a crawlable chapter-index URL
    urls = []
    while cursor.alive:
        temp = cursor.next()
        temp = temp['link']
        if urlparse.urlparse(catalog_callback.seed_url).netloc == 'www.junzige.la':
            temp = '/novel' + temp[5:-4] + '/'
            temp = normalize(catalog_callback.seed_url, temp)
        elif urlparse.urlparse(catalog_callback.seed_url).netloc == 'www.boluoxs.com':
            temp = ('http://www.boluoxs.com/biquge/0/' +
                    temp[temp.rfind('/') + 1:temp.rfind('.')] + '/')
        print(temp)
        urls.append(temp)
    print(urls[0])
    while True:
        now = datetime.now()
        # only crawl outside the 03:00-12:00 window
        if now.hour < 3 or now.hour > 12:
            queue.repairFast()  # release jobs stuck in the processing state
            process_crawler(
                urls,
                scrape_callback=catalog_callback,
                cache=cache,
                max_threads=max_threads,
                timeout=30,
                host=urlparse.urlparse(catalog_callback.seed_url).netloc,
                user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/54.0.2840.98 Safari/537.36')
            # every time the crawl finishes, clear the job queue
            queue.clear()
        else:
            print('pass:' + str(now))
        time.sleep(3600)

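# A minimal sketch of the MongoQueue behaviour relied on above: atomic
# push/pop with job states, a repair pass that releases stalled jobs (the
# role repairFast() plays here), and clear(). The state values, collection
# name and timeout are assumptions, not the exact implementation used.
from datetime import datetime, timedelta

from pymongo import MongoClient, errors


class MongoQueueSketch:
    OUTSTANDING, PROCESSING, COMPLETE = range(3)

    def __init__(self, client=None, timeout=300):
        self.client = client or MongoClient('localhost', 27017)
        self.db = self.client.cache
        self.timeout = timeout

    def push(self, url):
        # enqueue a new job unless this URL is already known
        try:
            self.db.crawl_queue.insert_one({'_id': url, 'status': self.OUTSTANDING})
        except errors.DuplicateKeyError:
            pass

    def pop(self):
        # atomically claim an outstanding job for processing
        record = self.db.crawl_queue.find_one_and_update(
            {'status': self.OUTSTANDING},
            {'$set': {'status': self.PROCESSING, 'timestamp': datetime.now()}})
        if record is None:
            raise KeyError('queue is empty')
        return record['_id']

    def repair(self):
        # release jobs that have been PROCESSING longer than the timeout
        self.db.crawl_queue.update_many(
            {'status': self.PROCESSING,
             'timestamp': {'$lt': datetime.now() - timedelta(seconds=self.timeout)}},
            {'$set': {'status': self.OUTSTANDING}})

    def clear(self):
        self.db.crawl_queue.drop()
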
def main():
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    cache.clear()  # start from an empty cache
    link_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback,
                 cache=cache, user_agent='GoodCrawler', ignore_robots=True)

def main():
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    # cache.clear()
    link_crawler(seed_url=scrape_callback.seed_url, cache_callback=cache,
                 scrape_callback=scrape_callback)

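# None of the variants above shows how main() is invoked. A minimal entry
# point for the ones that take max_threads might look like this; the
# default of 5 matches the defaults used in some definitions above, and
# the command-line handling is an assumption, not part of the sources.
import sys

if __name__ == '__main__':
    # e.g.  python crawler.py 8  to crawl with eight threads
    max_threads = int(sys.argv[1]) if len(sys.argv) > 1 else 5
    main(max_threads)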