def run_crawler(duration=120, shutdown_timeout=60):
    """Run the crawler for a fixed time, then stop it and save its state.

    The function loads the link keywords, builds the page loader, results
    store, result filter, ranker, visit history and scheduler, marks the
    ranker and scheduler as active, and sleeps for ``duration`` seconds.
    It then puts the page loader into sleep mode, waits for the scheduler
    to report that it is ready to shut down, deactivates the scheduler and
    ranker, saves the state of each component and lists the collected
    sites.

    Args:
        duration: Number of seconds to sleep while the components are
            active before shutdown begins.
        shutdown_timeout: Maximum number of one-second polls of
            ``sched.ready_to_shutdown()`` before shutdown proceeds anyway.
            Defaults to 60, the value that used to be hard-coded.
    """
    KeyWords.load_link_kw()

    pl = PageLoader.PageLoader_v2()
    res = Results.Results()
    rf = ResultFilter.ResultFilter(res)
    rank = Ranker.Ranker(pl, rf)
    visitHistory = VisitHistory.VisitHistory()
    # NOTE(review): the meaning of the trailing 40 is not visible from this
    # function -- confirm against Scheduler's constructor.
    sched = Scheduler.Scheduler(pl, rf, rank, visitHistory, 40)

    rank.active = True
    sched.active = True

    # NOTE: no start page is added to the page loader here, although the
    # message below still announces it. Presumably the tasks come from
    # elsewhere (e.g. previously saved state) -- TODO confirm.
    print('all initialized. Addint start page to PageLoader in 3 seconds')
    print('now running for %i seconds' % duration)
    sleep(duration)

    # Begin shutdown: put the loader into sleep mode, then poll the
    # scheduler once per second until it is ready or the timeout expires.
    pl.enter_sleep_mode()
    waited_for = 0
    while not sched.ready_to_shutdown() and waited_for < shutdown_timeout:
        waited_for += 1
        sleep(1)

    sched.active = False
    rank.active = False

    # Persist every component only after both flags have been cleared.
    sched.save_state()
    pl.save_state()
    res.save_state()
    visitHistory.save_state()
    res.list_sites()
# -*- coding: UTF-8 -*-
# Small manual start-up script: builds a page loader, queues one start
# page, wires up the results store, filter and ranker, activates the
# ranker and then prints a final message.
__author__ = 'Павел'

from time import sleep
from os import environ

from crawler import PageLoader, Ranker, Scheduler, ResultFilter, Results
from crawler.PageLoader import PageLoaderTask
from KeyWords import KeyWords

# Set at import time, after all imports: HTTP proxy address placed in the
# process environment.
environ['http_proxy'] = 'http://192.168.0.2:3128'


if __name__ == '__main__':
    KeyWords.load_link_kw()

    # Page loader with a single queued task for the start URL.
    loader = PageLoader.PageLoader_v2()
    start_task = PageLoaderTask('http://bookz.ru')
    loader.add_task(start_task)

    # Results store -> filter -> ranker, built in dependency order.
    results = Results.Results()
    result_filter = ResultFilter.ResultFilter(results)
    ranker = Ranker.Ranker(loader, result_filter)
    ranker.active = True

    print('now will exit')