def test_redis_url_noexists_referer(self):
    """Inserting a page whose referer was never stored queues exactly one URL.

    Only the referred-to page is passed to insert_htmls(); the referring
    page exists solely to supply the referer URL.
    """
    referring_page = Html(url='http://rs.crosswarp.com/')
    target_page = Html(url='http://www.crosswarp.com/', priority=10)

    # Link the two pages in both directions before inserting the target.
    referring_page.destinations.append(target_page.url)
    target_page.referer = referring_page.url

    self.data_access.insert_htmls([target_page])

    queued_count = self.cli.zcard(self.data_access.url_rank)
    self.assertEqual(queued_count, 1)
def get_next_url(self):
    """Claim the highest-priority uncrawled document and return it as an Html.

    Picks a document that has no ``crawled_at`` field (highest ``priority``
    first), stamps it with the current time in the same find_and_modify
    call, and converts the returned document into an Html.

    Returns:
        An Html populated via from_dict(), or None when no document
        without ``crawled_at`` is found.
    """
    # self.htmls is presumably a pymongo collection -- confirm against the
    # class constructor.
    uncrawled_filter = {'crawled_at': {'$exists': False}}
    mark_crawled = {'$set': {'crawled_at': datetime.now()}}

    record = self.htmls.find_and_modify(
        query=uncrawled_filter,
        update=mark_crawled,
        upsert=False,
        sort={'priority': -1})

    if record is not None:
        html = Html()
        html.from_dict(record)
        return html
    return None
def get_next_url(self):
    """Pop the top-ranked URL from the Redis ranking set and return it.

    Reads the highest-scored member of ``self.url_rank``, loads the JSON
    stored under ``self.prefix + member``, removes the member from the
    ranking set inside a WATCH/MULTI transaction, and builds an Html from
    the JSON with ``crawled_at`` set to the current time.

    Returns:
        The Html for the popped URL, or None when the ranking set is
        empty (either up front or after a concurrent consumer drained it).
    """
    if self.cli.zcard(self.url_rank) == 0:
        return None
    with self.cli.pipeline() as p:
        while True:
            try:
                # Watch the ranking set itself: a concurrent pop or insert
                # must invalidate this transaction so that two consumers
                # never receive the same URL.
                p.watch(self.url_rank)
                md5hash_list = p.zrevrange(self.url_rank, 0, 0)
                if not md5hash_list:
                    # The set was emptied after the zcard check above.
                    # Retrying would spin forever, so report "nothing to
                    # crawl"; leaving the ``with`` block is expected to
                    # reset the pipeline and drop the watch.
                    return None
                md5hash = md5hash_list[0]
                url_json = p.get(self.prefix + md5hash)
                p.multi()
                p.zrem(self.url_rank, md5hash)
                p.execute()
                html = Html(json_str=url_json)
                html.crawled_at = datetime.now()
                return html
            except redis.WatchError:
                # The ranking set changed under us; retry with fresh data.
                self.logger.exception({'datasource': 'redis', 'message': 'redis.WatchError', 'method': 'get_next_url'})
                continue
            except Exception:
                # Best-effort retry is kept, but KeyboardInterrupt and
                # SystemExit are no longer swallowed.
                # NOTE(review): a persistent failure (e.g. connection
                # down) still retries indefinitely -- consider a bound.
                self.logger.exception({'datasource': 'redis', 'message': 'Exception', 'method': 'get_next_url'})
                continue
def test_html_to_json(self):
    """A default-constructed Html serializes to its three default fields."""
    # Imported locally so the test does not rely on a module-level import.
    import json

    m = Html()
    # Compare parsed JSON rather than the raw text so the assertion does
    # not depend on the order in which to_json() emits the keys.
    self.assertEqual(json.loads(m.to_json()), {
        'priority': 1,
        'response_code': 0,
        # This value is the MD5 digest of the empty string.
        'md5hash': 'd41d8cd98f00b204e9800998ecf8427e',
    })