def main(max_threads=5):
    """Crawl the numbered listing pages once an hour, outside hours 3-12.

    The page URLs are built from the callback's seed URL with its last two
    characters removed, followed by a page number from 1 to 1188 and a
    trailing slash. The function loops forever: on each pass it either runs
    the crawler (local hour below 3 or above 12) or prints a skip message,
    then sleeps for 3600 seconds.

    Args:
        max_threads: forwarded to ``process_crawler`` as ``max_threads``.
    """
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    queue = MongoQueue()

    prefix = scrape_callback.seed_url[0:-2]
    urls = [prefix + str(page) + '/' for page in range(1, 1189)]
    browser_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'

    while True:
        now = datetime.now()
        if 3 <= now.hour <= 12:
            # Inside the 03:00-12:59 window nothing is crawled.
            print('pass:' + str(now))
        else:
            queue.repairFast()
            process_crawler(
                urls,
                scrape_callback=scrape_callback,
                cache=cache,
                max_threads=max_threads,
                timeout=30,
                host=urlparse.urlparse(scrape_callback.seed_url).netloc,
                user_agent=browser_agent,
            )
        # Wait an hour before looking at the clock again.
        time.sleep(3600)
def _load_catalog_urls(seed_url):
    """Turn the stored book links into catalog-page URLs for the seed site.

    Reads every document of the ``books`` collection in the ``cache``
    database on the local MongoDB server and rewrites its ``link`` field
    according to the host of *seed_url*. Links are kept unchanged when the
    host is neither ``www.junzige.la`` nor ``www.boluoxs.com``. Each
    resulting URL is printed as it is collected.

    Args:
        seed_url: seed URL of the crawl; its host selects the rewrite rule.

    Returns:
        A list of URL strings, empty when the collection has no documents.
    """
    # The host does not change between documents, so parse it only once.
    host = urlparse.urlparse(seed_url).netloc
    client = MongoClient('localhost', 27017, connect=False)
    urls = []
    try:
        # Iterate the cursor directly. ``cursor.alive`` may still be True when
        # no documents remain (e.g. an empty collection), in which case
        # ``cursor.next()`` raises StopIteration.
        for book in client.cache.books.find():
            link = book['link']
            if host == 'www.junzige.la':
                link = normalize(seed_url, '/novel' + link[5:-4] + '/')
            elif host == 'www.boluoxs.com':
                # Keep only the file name without its extension.
                book_id = link[link.rfind('/') + 1:link.rfind('.')]
                link = 'http://www.boluoxs.com/biquge/0/' + book_id + '/'
            print(link)
            urls.append(link)
    finally:
        # Only plain strings are kept, so the connection can be released.
        client.close()
    return urls


def main(max_threads=5):
    """Crawl the catalog page of every stored book once an hour.

    Loops forever: when the local hour is below 3 or above 12 the job queue
    is repaired, the crawler is run over all catalog URLs and the job queue
    is cleared afterwards; otherwise a skip message is printed. Every pass
    ends with a 3600-second sleep.

    Args:
        max_threads: forwarded to ``process_crawler`` as ``max_threads``.
    """
    catlog_callback = AlexaCallback()
    cache = MongoCache()
    queue = MongoQueue()

    urls = _load_catalog_urls(catlog_callback.seed_url)
    if urls:
        # Guarded: an empty book collection must not raise IndexError.
        print(urls[0])

    while True:
        now = datetime.now()
        if now.hour < 3 or now.hour > 12:
            queue.repairFast()
            process_crawler(
                urls,
                scrape_callback=catlog_callback,
                cache=cache,
                max_threads=max_threads,
                timeout=30,
                host=urlparse.urlparse(catlog_callback.seed_url).netloc,
                user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
            )
            # every time finished, clear the job queue
            queue.clear()
        else:
            print('pass:' + str(now))
        time.sleep(3600)