Example #1
    async def timed_crawler(self, url, freq, ws_id):
        redis_conn = redis.Redis(connection_pool=self.redis)

        while True:
            # The Redis key for this websocket id doubles as a run/stop flag:
            # a missing key or a negative value means the crawl should stop.
            ws_freq_check = redis_conn.get(ws_id)

            if not ws_freq_check or int(ws_freq_check) < 0:
                logging.info('got shutdown signal {}'.format(str(ws_id)))
                break

            # Run one crawl pass, then sleep for `freq` minutes before the next.
            crawler = Crawler([url], loop=self.loop, db_rpc=DbRpcClient())
            await crawler.crawl()
            crawler.close()

            await asyncio.sleep(int(freq) * 60)
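
The loop above only reads the per-websocket Redis key; a minimal sketch of the writer side, assuming the key simply holds the crawl frequency and is removed (or set negative) to stop the crawl. The key layout and pool settings are assumptions, not part of the original service.

import redis

pool = redis.ConnectionPool(host='localhost', port=6379)
conn = redis.Redis(connection_pool=pool)

def start_timed_crawl(ws_id, freq):
    # Any non-negative integer stored under the websocket id keeps timed_crawler() looping.
    conn.set(ws_id, int(freq))

def stop_timed_crawl(ws_id):
    # Deleting the key makes the next check in timed_crawler() log the
    # shutdown signal and break out of the loop.
    conn.delete(ws_id)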
Example #2
class CrawlerTest(unittest.TestCase):
    def test_output_count(self):
        self.crawler = Crawler("https://stackoverflow.com/questions",
                               5,
                               timeout=10)
        self.crawler.engage()
        self.assertEqual(len(self.crawler.directory.keys()), 5)

    def test_visited_count(self):
        self.crawler = Crawler("https://stackoverflow.com/questions", 2)
        self.crawler.engage()
        self.assertEqual(len(self.crawler.visits), 2)

    def test_zero_url_count(self):
        with self.assertRaises(RuntimeError):
            self.crawler = Crawler("http://www.google.com", 0)
Example #3
class Getter:
    '''Getter: fetches proxy IPs.'''
    def __init__(self):
        self.crawler = Crawler()
        self.redis = RedisOperate()

    def run_getter(self):
        '''Store new proxies in the pool with the highest score.'''
        print('Getter started, crawling proxies...')
        for callback in self.crawler.__CrawlFun__:
            # print('crawl functions:', callback)
            new_proxy_list = self.crawler.get_proxy(callback)
            for proxy in new_proxy_list:
                self.redis.add(proxy)
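
run_getter() stores each proxy "with the highest score"; a minimal sketch of what such an add() could look like on top of a redis-py sorted set. The key name, score constant, and class internals are assumptions, not the project's actual RedisOperate.

import redis

PROXY_KEY = 'proxies'   # assumed sorted-set key
INITIAL_SCORE = 100     # assumed top score for a freshly crawled proxy

class RedisOperate:
    def __init__(self, host='localhost', port=6379):
        self.db = redis.StrictRedis(host=host, port=port, decode_responses=True)

    def add(self, proxy, score=INITIAL_SCORE):
        # Only insert unseen proxies so a re-crawl does not reset an
        # existing proxy's score.
        if self.db.zscore(PROXY_KEY, proxy) is None:
            return self.db.zadd(PROXY_KEY, {proxy: score})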
Example #4
class Getter:
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()

    def limit(self, limit_num=500):
        """
        Check whether the number of stored proxies has reached the pool's cap.
        :param limit_num: maximum number of proxies to keep in the pool
        :return: True if the pool is full, False otherwise
        """
        return self.redis.count() >= limit_num

    def run(self):
        print("Getter is running...")
        if not self.limit():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.put(proxy)
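
Getter.run() iterates over __CrawlFunc__ and __CrawlFuncCount__, which implies that Crawler registers its crawl methods by name; a minimal sketch of one common way to do that with a metaclass. The names ProxyMetaclass and crawl_example_source are placeholders, not necessarily how the referenced project implements it.

class ProxyMetaclass(type):
    def __new__(mcs, name, bases, attrs):
        # Register every method whose name starts with "crawl_" so callers
        # can iterate over them by name, as Getter.run() does above.
        attrs['__CrawlFunc__'] = [k for k, v in attrs.items()
                                  if k.startswith('crawl_') and callable(v)]
        attrs['__CrawlFuncCount__'] = len(attrs['__CrawlFunc__'])
        return super().__new__(mcs, name, bases, attrs)

class Crawler(metaclass=ProxyMetaclass):
    def get_proxies(self, callback):
        # Resolve a registered method name back to the bound method and
        # collect whatever proxies it yields.
        return list(getattr(self, callback)())

    def crawl_example_source(self):
        # Placeholder source; a real crawl_* method would fetch and parse
        # a proxy listing page here.
        yield '127.0.0.1:8080'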
Example #5
    def get_pages(self, url, no_verbose=False):
        """Return list of all webpages"""
        crawler = Crawler(url, no_verbose, limit=self.pagelimit)
        return crawler.start()
Example #6
    async def one_time_crawler(self, url, freq, ws_id):

        crawler = Crawler([url], loop=self.loop, db_rpc=DbRpcClient())
        await crawler.crawl()
        crawler.close()
Example #7
# Reference links:
#   pig6:   https://github.com/pig6/login_taobao
#   Germey: https://github.com/Python3WebSpider/ProxyPool
#           https://github.com/Python3WebSpider/Weixin
# Thanks!

from spider.crawler import Crawler
from spider.login import Login

if __name__ == '__main__':

    # These three form parameters are copied from the browser and can be reused;
    # ua and password2 are the encrypted account name and password, respectively.
    # How to capture them:
    # 1. Clear taobao.com's cookies and visit https://www.taobao.com
    # 2. Search for any product in the search bar; you are redirected to the login page
    # 3. Click log in and capture the request
    #    https://login.taobao.com/newlogin/login.do?appName=taobao&fromSite=0
    #    with a packet-capture tool or the browser's developer tools
    # 4. Copy the form fields ua, loginId and password2 from that request
    ua = ''
    loginId = ''
    password2 = ''
    login = Login(ua, loginId, password2)
    crawler = Crawler()
    if login.logged():
        login.print_title()
        crawler.run()
    else:
        crawler.run()
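
The three form values are left blank above; a minimal sketch, assuming the same Login interface, of filling them from environment variables so the captured credentials never live in the source file. The variable names are hypothetical.

import os

from spider.login import Login

# Hypothetical environment variable names for the captured form fields.
ua = os.environ.get('TAOBAO_UA', '')
loginId = os.environ.get('TAOBAO_LOGIN_ID', '')
password2 = os.environ.get('TAOBAO_PASSWORD2', '')

login = Login(ua, loginId, password2)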
Example #8
"""
Developer : Kavitha Madhavaraj

To interrupt the program:
    * Use ctrl+c once
"""
from spider.crawler import Crawler

# Case 1: specify the number of URLs to crawl
cr1 = Crawler('http://www.windowscentral.com', strict=True, timeout=5, multi=10)
directory = cr1.engage()
print("\n\nPages visited (Case 1):", len(cr1.visits))
# Save the crawled directory in a JSON file
cr1.save_directory('Result1')

"""
# Case 2: crawl until interrupted (ctrl+c)
cr2 = Crawler('http://www.bitsathy.ac.in/sitemap.xml', timeout=5, strict=True)
directory = cr2.engage()
print("\n\nPages visited (Case 2):", len(cr2.visits))

#Save the crawled directory in a JSON file
cr2.save_directory('Result2')
"""