import json

from redis import Redis


class RefreshQueue(object):
    """Encapsulates queue behaviour, such as fetching a new payload."""

    QUEUE = 'refresher_queue'

    def __init__(self, work_id=''):
        self.redis = Redis(host='redis')
        self.work_id = work_id

    def get_new_payload(self):
        # Block until a payload arrives, then acknowledge it as "processing"
        # on a reply list keyed by the payload's work_id.
        key, value = self.redis.brpop(self.QUEUE)
        data = json.loads(value.decode())
        self.work_id = data.get('work_id')
        self.redis.lpush(self.work_id, json.dumps({"status": "processing"}))
        return data

    def respond(self, data):
        if self.work_id:
            self.redis.lpush(self.work_id, json.dumps(data))
        self.clear()

    def clear(self):
        self.work_id = ''
import json
from datetime import datetime
from hashlib import sha256

from redis import Redis


class RefreshRequester(object):

    def __init__(self, queue, max_tries=5, timeout=15):
        self.redis = Redis(host='redis')
        self.queue = queue
        self.max_tries = max_tries
        self.timeout = timeout

    def block_request(self, data):
        work_id = self._generate_work_id()
        data.update({"work_id": work_id})
        for _ in range(self.max_tries):
            if self._wait_for_receive(work_id, data):
                response = self._wait_for_response(work_id)
                if response:
                    return response
        return {}

    def _wait_for_receive(self, work_id, data):
        # Push the request, then wait for the worker's "processing" ack.
        self.redis.lpush(self.queue, json.dumps(data))
        response = self.redis.brpop(work_id, timeout=self.timeout)
        if response:
            key, value = response
            data = json.loads(value.decode())
            if data.get('status') == 'processing':
                return True
        return False

    def _wait_for_response(self, work_id):
        response = self.redis.brpop(work_id, timeout=self.timeout)
        if response is not None:
            key, value = response
            if value is not None:
                return json.loads(value.decode())
        return {}

    def _generate_work_id(self):
        h = sha256()
        h.update(str(datetime.now()).encode('utf-8'))
        return h.hexdigest()
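A minimal sketch of how the two classes above are meant to interact, assuming a Redis server reachable at host 'redis'; the worker and the requester run in separate processes:

# Worker process: block on the shared queue, then reply.
queue = RefreshQueue()
payload = queue.get_new_payload()            # blocks on 'refresher_queue'
queue.respond({"status": "done", "echo": payload})

# Client process: push a request and block until the reply arrives.
requester = RefreshRequester(queue=RefreshQueue.QUEUE)
result = requester.block_request({"action": "refresh"})
print(result)                                # e.g. {'status': 'done', ...}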
from subprocess import run

from pgmagick import Image  # assumption: an Image API with .size().width()/.height()
from redis import Redis


class Worker:

    def __init__(self, config):
        self.config = config
        self.work_list = None
        if 'redis' in config:
            # TODO support sockets and stuffs
            if 'unix_socket_path' in config["redis"]:
                self.redis = Redis(
                    unix_socket_path=config["redis"]["unix_socket_path"])
            elif 'host' in config["redis"]:
                port = config["redis"].get("port", 6379)
                self.redis = Redis(host=config["redis"]["host"], port=port)

    def fetch_thumb_job(self):
        if self.config['fetch_mode'] == 'queue':
            return self.redis.brpop('queue:thumb')[1]
        if self.work_list is None:
            self.work_list = []
            for album in self.redis.zrange('zalbums', 0, -1):
                for key in self.redis.zrange(album, 0, -1):
                    self.work_list.append(key)
        return self.work_list.pop()

    def get_image_info(self, key):
        return self.redis.hgetall(key)

    def save_image_info(self, imagekey, data):
        for key, value in data.items():
            self.redis.hset(imagekey, key, value)

    def rotate(self, infile):
        """Lossless autorotation based on EXIF using jhead/jpegtran."""
        return run(['/usr/bin/jhead', '-autorot', infile])

    def thumbnail(self, infile, outfile, size, quality, no_upscale=False):
        quality = str(quality)
        if infile.endswith('.gif') or no_upscale:
            # The '>' suffix tells ImageMagick to shrink only, never upscale.
            size = size + '>'
        run(['/usr/bin/convert', '-interlace', 'Plane', '-quality', quality,
             '-strip', '-thumbnail', size, infile, outfile])
        image = Image(outfile)
        return {'width': image.size().width(),
                'height': image.size().height()}
import pickle

import shortuuid
from interface import implements  # python-interface package (assumption)
from redis import Redis

# RpcClient, RpcRequest and settings are project-local modules.


class RedisRpcClient(implements(RpcClient)):

    def __init__(self):
        self.__client = Redis(connection_pool=settings.REDIS_CONNECTION_POOL)

    def call(self, topic, data):
        # Push the pickled request, then block until a reply shows up
        # on a list keyed by the request id.
        request = RpcRequest(params=data, id=shortuuid.uuid())
        self.__client.lpush(topic, pickle.dumps(request))
        _, response = self.__client.brpop(keys=request.id)
        return pickle.loads(response)
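The matching server side is not shown. A hedged sketch of what it could look like, assuming the reply simply sets a result attribute on the request object and that the same connection pool is shared:

import pickle
from redis import Redis

def serve(topic, handler):
    # Hypothetical server loop: pop a request from the topic list,
    # compute a result, and push the pickled reply onto the list named
    # after the request id, which the client above is blocked on.
    client = Redis(connection_pool=settings.REDIS_CONNECTION_POOL)
    while True:
        _, raw = client.brpop(topic)
        request = pickle.loads(raw)
        request.result = handler(request.params)  # assumed reply shape
        client.lpush(request.id, pickle.dumps(request))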
# Defined inside a test method: `self`, TOPIC and PARAMS are closed over.
def call():
    # Wait 1s for the server to start.
    time.sleep(1)
    redis = Redis(connection_pool=settings.REDIS_CONNECTION_POOL)
    request = RpcRequest(params=PARAMS, id=shortuuid.uuid())
    redis.lpush(TOPIC, pickle.dumps(request))
    _, response = redis.brpop(keys=request.id)
    response = pickle.loads(response)
    # Assertions: the reply must echo the request id and params.
    self.assertEqual(response.id, request.id)
    self.assertEqual(response.result, PARAMS)
import redis


def redis_list(rc: redis.Redis):
    key = 'test_list'
    rc.delete(key)
    print(rc.llen(key))            # 0 after delete
    rc.lpush(key, 2)
    rc.lpush(key, 1)
    rc.rpush(key, 3)
    rc.rpush(key, 4)
    print(rc.llen(key))            # 4
    print(rc.lindex(key, 0))       # head of the list
    print(rc.lrange(key, 0, -1))   # whole list
    print(rc.lrange(key, 1, -1))
    print(rc.lpop(key))
    print(rc.brpop(key))           # returns a (key, value) tuple
from subprocess import run

from redis import Redis

# Database is a project-local module.


class Worker:

    def __init__(self, config):
        self.config = config
        self.work_list = None
        if 'redis' in config:
            if 'host' in config["redis"] and 'port' in config['redis']:
                self.redis = Redis(host=config["redis"]["host"],
                                   port=config["redis"]["port"])
        self.db = Database(config)

    def fetch_thumb_job(self):
        if self.config['fetch_mode'] == 'queue':
            return self.redis.brpop('pix:upload:queue')[1]
        if self.work_list is None:
            self.work_list = []
            for album in self.redis.zrange('zalbums', 0, -1):
                for key in self.redis.zrange(album, 0, -1):
                    self.work_list.append(key)
        return self.work_list.pop()

    def get_image_info(self, token):
        return self.db.get_image(token)

    def save_image_info(self, imagekey, data):
        self.db.save_image_info(imagekey, data)

    def rotate(self, infile):
        """Lossless autorotation based on EXIF using jhead/jpegtran."""
        try:
            return run(['/usr/bin/jhead', '-autorot', infile])
        except Exception as e:
            print(e)
from redis import Redis


class TasksMQ:

    TASKS_QUEUE = "tasks:enqueued"
    RUNNING_TASKS = "tasks:running"

    def __init__(self):
        self.redis_connection = Redis()

    def enqueue_task_command(self, command):
        self.redis_connection.lpush(self.TASKS_QUEUE, command.serialize())

    def dequeue_task_command(self):
        # Blocks until a command is available.
        _, data = self.redis_connection.brpop(self.TASKS_QUEUE)
        return _deserialize_command(data)

    def add_running_task(self, task_id, running_task):
        self.redis_connection.hset(self.RUNNING_TASKS, task_id,
                                   running_task.serialize())

    def remove_running_task(self, task_id):
        self.redis_connection.hdel(self.RUNNING_TASKS, task_id)

    def get_running_tasks(self):
        return self.redis_connection.hgetall(self.RUNNING_TASKS)
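A usage sketch, with a hypothetical EchoCommand standing in for whatever contract `command.serialize()` and the module-level `_deserialize_command` actually implement:

# Hypothetical command type; the real serialize/deserialize contract
# is not shown in the snippet above.
class EchoCommand:
    def __init__(self, text):
        self.text = text

    def serialize(self):
        return self.text

def _deserialize_command(data):
    return EchoCommand(data.decode())

mq = TasksMQ()
mq.enqueue_task_command(EchoCommand("hello"))
cmd = mq.dequeue_task_command()   # blocks until something is queued
print(cmd.text)                   # hello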
from redis import Redis

# tasks is the project's Celery task module.


def get_fake_name_from_buffer(r: Redis) -> str:
    # Kick off a task that refills the buffer, then block until a name
    # is available.
    tasks.add_fake_name_to_buffer.delay()
    _, name = r.brpop("fake_names")
    return name.decode("utf-8")
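One caveat: `brpop` with no timeout blocks forever if the task never runs. A defensive variant (sketch; the 5-second timeout is an arbitrary choice, not part of the original):

def get_fake_name_from_buffer_safe(r: Redis, timeout: int = 5) -> str:
    # Same pattern, but give up after `timeout` seconds instead of
    # blocking indefinitely.
    tasks.add_fake_name_to_buffer.delay()
    item = r.brpop("fake_names", timeout=timeout)
    if item is None:
        raise TimeoutError("no fake name arrived within %ds" % timeout)
    _, name = item
    return name.decode("utf-8")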
import pickle
import time
from functools import wraps
from uuid import uuid4

from redis import Redis

# HQMessage and key_for_name are defined elsewhere in this module.


class HotQueue(object):
    """Simple FIFO message queue stored in a Redis list.

    Example:

    >>> from hotqueue import HotQueue
    >>> queue = HotQueue("myqueue", host="localhost", port=6379, db=0)

    :param name: name of the queue
    :param serializer: the class or module to serialize msgs with, must have
        methods or functions named ``dumps`` and ``loads``,
        `pickle <http://docs.python.org/library/pickle.html>`_ is the default,
        use ``None`` to store messages in plain text (suitable for strings,
        integers, etc)
    :param kwargs: additional kwargs to pass to :class:`Redis`, most commonly
        :attr:`host`, :attr:`port`, :attr:`db`
    """

    def __init__(self, name='adminaccess', serializer=pickle,
                 redis_connection=None, originIPAddress=None,
                 senderName=None, **kwargs):
        self.group_name = kwargs.pop("group", "hotqueue")
        self.name = name
        self.serializer = serializer
        self.senderName = senderName
        self.originIPAddress = originIPAddress
        if redis_connection:
            self.__redis = redis_connection
        else:
            self.__redis = Redis(**kwargs)

    def __len__(self):
        return self.__redis.llen(self.key)

    def _get_hotqueues(self, wildcard="*"):
        return self.__redis.keys(key_for_name(wildcard))

    def _get_unacked_hotqueue(self, wildcard="*"):
        queues = self._get_hotqueues("unacked:" + wildcard)
        if queues:
            # redis-py returns key names as bytes.
            return queues[0].decode().replace('hotqueue:', '')
        return None

    @property
    def key(self):
        """Return the key name used to store this queue in Redis."""
        return "%s:%s" % (self.group_name, self.name)

    def clear(self):
        """Clear the queue of all messages, deleting the Redis key."""
        self.__redis.delete(self.key)

    def clear_value(self, value):
        """Remove any occurrence of an item from the queue."""
        self.__redis.lrem(self.key, 0, value)

    def consume(self, **kwargs):
        """Return a generator that yields whenever a message is waiting
        in the queue. Will block otherwise. Example:

        >>> for msg in queue.consume(timeout=1):
        ...     print(msg)
        my message
        another message

        :param kwargs: any arguments that :meth:`~hotqueue.HotQueue.get` can
            accept (:attr:`block` will default to ``True`` if not given)
        """
        kwargs.setdefault('block', True)
        try:
            while True:
                msg = self.get(**kwargs)
                if msg is None:
                    break
                yield msg
        except KeyboardInterrupt:
            print()
            return

    def get(self, block=False, timeout=None):
        """Return a message from the queue. Example:

        >>> queue.get()
        'my message'
        >>> queue.get()
        'another message'

        :param block: whether or not to wait until a msg is available in
            the queue before returning; ``False`` by default
        :param timeout: when using :attr:`block`, if no msg is available
            for :attr:`timeout` in seconds, give up and return ``None``
        """
        processId = str(uuid4())
        process_queue = key_for_name(
            "processing:" + processId + ":" + str(time.time() + 5))
        if block:
            if timeout is None:
                timeout = 0
            msg = self.__redis.brpop(self.key, timeout=timeout)
            if msg is not None:
                msg = msg[1]
        else:
            msg = self.__redis.rpop(self.key)
        if msg is None:
            # Nothing available: empty queue, or blocking timed out.
            return None
        self.__redis.lpush(process_queue, msg)  # try *rpoplpush instead of this
        hq_message = msg
        if self.serializer is not None:
            hq_message = self.serializer.loads(msg)
        hq_message.reserve_message()
        self.__redis.rpush(
            key_for_name("unacked:" + hq_message.get_reservationId() + ":" +
                         str(hq_message.get_expiration())),
            self.serializer.dumps(hq_message))
        self.__redis.delete(process_queue)
        return hq_message

    def put(self, *msgs):
        """Put one or more messages onto the queue. Example:

        >>> queue.put("my message")
        >>> queue.put("another message")

        To put messages onto the queue in bulk, which can be significantly
        faster if you have a large number of messages:

        >>> queue.put("my message", "another message", "third message")
        """
        hq_message_list = []
        hq_message_list_raw = []
        for msg in msgs:
            hq_message = HQMessage(msg, self.name)
            if self.originIPAddress:
                hq_message.set_originIPAddress(self.originIPAddress)
            if self.senderName:
                hq_message.set_senderName(self.senderName)
            if self.serializer is not None:
                hq_message_list.append(self.serializer.dumps(hq_message))
            else:
                hq_message_list.append(hq_message)
            hq_message_list_raw.append(hq_message)
        self.__redis.lpush(self.key, *hq_message_list)
        return hq_message_list_raw

    def _acknack(self, reservation_uuid, ack=False, nack=False):
        unackedqueue_name = self._get_unacked_hotqueue(
            str(reservation_uuid) + ':*')
        if unackedqueue_name:
            msg = self.__redis.rpop(key_for_name(unackedqueue_name))
            hq_message = msg
            if msg is not None and self.serializer is not None:
                hq_message = self.serializer.loads(msg)
            if nack:
                # Push the message back onto its original queue.
                original_queue = key_for_name(hq_message.get_queueName())
                self.__redis.lpush(original_queue, msg)
            self.__redis.delete(key_for_name(unackedqueue_name))
            return hq_message
        return None

    def ack(self, reservation_uuid):
        return self._acknack(reservation_uuid, ack=True, nack=False)

    def nack(self, reservation_uuid):
        return self._acknack(reservation_uuid, ack=False, nack=True)

    def worker(self, *args, **kwargs):
        """Decorator for using a function as a queue worker. Example:

        >>> @queue.worker(timeout=1)
        ... def printer(msg):
        ...     print(msg)
        >>> printer()
        my message
        another message

        You can also use it without passing any keyword arguments:

        >>> @queue.worker
        ... def printer(msg):
        ...     print(msg)
        >>> printer()
        my message
        another message

        :param kwargs: any arguments that :meth:`~hotqueue.HotQueue.get` can
            accept (:attr:`block` will default to ``True`` if not given)
        """
        def decorator(worker):
            @wraps(worker)
            def wrapper(*args):
                for msg in self.consume(**kwargs):
                    worker(*args + (msg,))
            return wrapper
        if args:
            return decorator(*args)
        return decorator
from redis import Redis

redis_client = Redis(host='localhost', port=6379, db=0,
                     charset="utf-8", decode_responses=True)

key_input = 'eth:shares2'
key_output = 'gobin:eth:shares'

while True:
    try:
        # Shares arrive as 'ms:login:diff' strings, where login is 'user.rig'.
        share = redis_client.brpop(key_input)[1]
        print(share)
        ms, login, diff = share.split(':')
        user, rig = login.split('.')
        share2 = {'user': user, 'rig': rig, 'diff': diff}
        redis_client.xadd(key_output, share2, id=ms)
    except Exception as e:
        print('pipe error:', e)
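Note that XADD requires stream entry IDs to be strictly increasing, and an explicit ID without a sequence part is treated as ms-0, so a second share in the same millisecond will be rejected. A hedged fallback sketch (the retry-with-auto-id policy is an assumption, not part of the original):

from redis.exceptions import ResponseError

def add_share(client, key, fields, ms):
    # Try the explicit millisecond id first; fall back to a
    # server-generated id ('*') if it collides or goes backwards.
    try:
        return client.xadd(key, fields, id=ms)
    except ResponseError:
        return client.xadd(key, fields, id='*')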
# hash
conn.hkeys('hkey')
conn.hvals('hkey')
conn.hgetall('hkey')
conn.hincrby('hkey', 'key', 1)
conn.hincrbyfloat('hkey', 'key', 2.3)

# list
conn.rpush('lkey', 1, 2, 3)
conn.lpush('lkey', 1, 2, 3)
conn.lpop('lkey')
conn.rpop('lkey')
conn.lrange('lkey', 0, -1)  # returns a list
conn.lindex('lkey', 2)
conn.ltrim('lkey', 1, -1)
conn.blpop(['list1', 'list2'], 1)
conn.brpop(['list1', 'list2'], 2)
conn.rpoplpush('list1', 'list2')
conn.brpoplpush('list1', 'list2', 3)

# set
conn.sadd('key', 'item1', 'item2')
conn.srem('key', 'item2')
conn.sismember('key', 'item')
conn.scard('key')
conn.smembers('key')
conn.smove('key1', 'key2', 'item')
conn.sdiff('key1', 'key2', 'key3')  # elements in the first set but in none of the others
conn.sinter('key1', 'key2')
conn.sunion('key1', 'key2')

# string
from redis import Redis

redis_connection = Redis(decode_responses=True)
list_key = "names_to_shout"

while True:
    print(redis_connection.brpop(list_key)[1].upper())
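A matching producer for the consumer loop above (sketch; any client pushing onto the same list works, the names are arbitrary):

from redis import Redis

# Push a few names for the consumer loop above to shout back.
producer = Redis(decode_responses=True)
for name in ("ada", "grace", "barbara"):
    producer.lpush("names_to_shout", name)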
from redis import Redis

redis_connection = Redis(decode_responses=True)
list_key = "przykladowa-lista"

while True:
    print(redis_connection.brpop(list_key))
import json
import time
import traceback

import autopep8
from redis import Redis, RedisError

while True:
    try:
        redis = Redis(host='redis', port=6379)
        break
    except RedisError:
        print('Cannot reach redis. Retry...')
        time.sleep(.5)
        continue

try:
    print('Waiting for pep8 jobs...')
    while True:
        try:
            job = json.loads(redis.brpop('pep8jobs')[1])
        except RedisError:
            print('Cannot reach redis. Retry...')
            time.sleep(.5)
            continue
        print('Working on job #{}'.format(job['id']))
        try:
            # List of code lines:
            lines = job.pop('lines', None).replace('\r', '').split('\n')
            options = autopep8.parse_args([''])
            # Set of selected errors/warnings:
            options.select = {k for k in job.pop('select', None)}
            job['result'] = autopep8.fix_lines(lines, options)
        except Exception:
            traceback.print_exc()
            print('Job #{} got canceled!'.format(job['id']))
import signal
import time
from collections import defaultdict
from multiprocessing import Process
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from pymongo import MongoClient
from redis import Redis

# cns (constants), Tree, Node and logger are project-local.


class Worker(Process):

    YET_NO_DATA = (None, None)

    def __init__(self, options, stop_flag, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.stop_flag = stop_flag
        self.use_lxml = options.use_lxml
        self.mongo_host = options.mongo_host
        self.mongo_port = options.mongo_port
        self.redis_host = options.redis_host
        self.redis_port = options.redis_port
        logger.info('Mongo connection {}:{}. Redis connection {}:{}'.format(
            options.mongo_host, options.mongo_port,
            options.redis_host, options.redis_port))

    def run(self):
        logger.info('Started')
        signal.signal(signal.SIGINT, signal.SIG_IGN)
        signal.signal(signal.SIGTERM, signal.SIG_DFL)
        # Must be created here so the connections live in the forked
        # process's memory.
        mc = MongoClient(self.mongo_host, self.mongo_port)
        db = mc[cns.MONGO_DB]
        self.collection = db[cns.MONGO_COLLECTION]
        self.rc = Redis(self.redis_host, self.redis_port)
        self.tree = Tree(self.rc)
        # Collected over the whole lifecycle so inserts can be batched.
        self.documents = []
        while not self.stop_flag.value:
            try:
                start_time = time.time()
                data, base_url = self.get_data_and_base_url()
                if (data, base_url) == self.YET_NO_DATA:
                    logger.info('Continue after waiting for data; '
                                'blocking timed out')
                    continue
                elif not data:
                    logger.error('No data')
                    continue
                soup = BeautifulSoup(
                    data.decode(), 'lxml' if self.use_lxml else 'html.parser')
                self.parse_posts_fill_documents(soup, base_url)
                self.parse_topics(soup, base_url)
                self.parse_cards(soup, base_url)
                self.save_documents(base_url)
                end_time = time.time()
                logger.debug('Job duration: {}'.format(end_time - start_time))
            except Exception:
                logger.exception('Work error')
        self.save_documents(finish=True)
        logger.info('Stopped')

    def get_data_and_base_url(self):
        queuekey_datakey = self.rc.brpop(cns.DATA_QUEUE_KEY,
                                         cns.BLOCKING_TIMEOUT)
        if not queuekey_datakey:
            return self.YET_NO_DATA
        data_key = queuekey_datakey[1].decode()
        with self.rc.pipeline() as pipeline:
            pipeline.get(data_key)
            pipeline.delete(data_key)
            data, del_count = pipeline.execute()
        base_url = self._extract_base_url(data_key)
        logger.info('Data received')
        return data, base_url

    def parse_cards(self, soup, base_url):
        child_nodes = []
        new_urls = []
        for card_source in soup.find_all(class_='ForumCard'):
            href = card_source['href']
            card = {'url': urljoin(base_url, href)}
            name = card_source.find(class_='ForumCard-heading')
            if name:
                card['name'] = name.get_text()
            description = card_source.find(class_='ForumCard-description')
            if description:
                card['description'] = description.get_text()
            n = Node(position=card['url'], data=card,
                     level=cns.NODE_SUBCATEGORY_LEVEL, parent=base_url)
            child_nodes.append(n)
            new_urls.append(card['url'])
        self.add_child_nodes(child_nodes)
        self.add_new_urls(new_urls)

    def parse_topics(self, soup, base_url):
        self._add_paginated_topics_link(soup, base_url)
        child_nodes = []
        new_urls = []
        for topic_source in soup.find_all(class_='ForumTopic'):
            topic = {}
            topic['url'] = urljoin(base_url, topic_source['href'])
            topic['name'] = topic_source.find(
                class_='ForumTopic-heading').get_text()
            topic['author'] = topic_source.find(
                class_='ForumTopic-author').get_text()
            topic['replies'] = topic_source.find(
                class_='ForumTopic-replies').get_text()
            n = Node(position=topic['url'], data=topic,
                     level=cns.NODE_TOPIC_LEVEL, parent=base_url)
            child_nodes.append(n)
            new_urls.append(topic['url'])
        self.add_child_nodes(child_nodes)
        self.add_new_urls(new_urls)

    def parse_posts_fill_documents(self, soup, base_url):
        self._add_paginated_posts_links(soup, base_url)
        for post_source in soup.find_all(class_='TopicPost'):
            document = defaultdict(dict)
            document['post']['id'] = post_source['id']
            document['post']['user'] = post_source.find(
                class_='Author-name').get_text()
            document['post']['created'] = post_source.find(
                class_='TopicPost-timestamp')['data-tooltip-content']
            document['post']['rank'] = post_source.find(
                class_='TopicPost-rank').get_text()
            document['post']['text'] = post_source.find(
                class_='TopicPost-bodyContent').get_text()
            for node in self.tree.get_parents(base_url):
                if node.level == cns.NODE_TOPIC_LEVEL:
                    document['topic'] = node.data
                elif node.level == cns.NODE_SUBCATEGORY_LEVEL:
                    document['subcategory'] = node.data
            self.documents.append(document)

    def add_child_nodes(self, nodes):
        self.tree.add_nodes(nodes)

    def add_new_urls(self, urls):
        """Insert links into the queue in one batch.

        Links are recorded in the set of already-requested URLs; only
        those not yet in that set are pushed onto the queue.
        """
        if not isinstance(urls, (tuple, list)):
            urls = [urls]
        pipeline = self.rc.pipeline()
        for url in urls:
            pipeline.sismember(cns.PARSED_URLS_KEY, url)
        members = pipeline.execute()
        urls = [url for url, ismember in zip(urls, members) if not ismember]
        if urls:
            pipeline.sadd(cns.PARSED_URLS_KEY, *urls)
            pipeline.lpush(cns.URL_QUEUE_KEY, *urls)
            pipeline.execute()
            logger.debug('Put new urls: {}'.format(urls))
        pipeline.reset()

    def save_documents(self, base_url=None, finish=False,
                       batch_size=cns.INSERT_BATCH_SIZE):
        if len(self.documents) >= batch_size:
            self.collection.insert_many(self.documents)
            self.documents.clear()
            logger.info('Data is written')
        elif finish and self.documents:
            self.collection.insert_many(self.documents)
            logger.info('Finished step. Data is written')
        elif not self.documents:
            logger.warning('No documents for url {}'.format(base_url))

    def _extract_base_url(self, key):
        return key.replace(cns.DATA_KEY_PREFIX, '')

    def _add_paginated_posts_links(self, soup, base_url):
        new_urls = []
        pagination = soup.select(
            '.Topic-pagination--header .Pagination-button--ordinal')
        if pagination:
            last_page = int(pagination[-1]['data-page-number'])
            urls = [urljoin(base_url, '?page=%d' % n)
                    for n in range(1, last_page + 1)]
            new_urls.extend(urls)
        self.add_new_urls(new_urls)

    def _add_paginated_topics_link(self, soup, base_url):
        href = soup.select('.Pagination-button--next')[0]['href']
        new_url = urljoin(base_url, href)
        self.add_new_urls(new_url)
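How this worker is presumably launched (sketch; `options` is assumed to be an argparse-style namespace and `stop_flag` a shared multiprocessing.Value, matching the `stop_flag.value` check in run()):

from argparse import Namespace
from multiprocessing import Value

# Hypothetical launcher for the Worker above.
options = Namespace(use_lxml=True,
                    mongo_host='localhost', mongo_port=27017,
                    redis_host='localhost', redis_port=6379)
stop_flag = Value('b', 0)

workers = [Worker(options, stop_flag) for _ in range(4)]
for w in workers:
    w.start()
# ... later, to shut down: stop_flag.value = 1
for w in workers:
    w.join()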