def start(self):
    """Run the consumer loop forever: dequeue processed resources, persist
    and index each one, record a story update, and acknowledge the message.

    Loop per iteration:
      1. Dequeue one message from the processed-resource queue; on an empty
         queue, sleep EMPTY_QUEUE_TIMEOUT seconds and retry.
      2. Derive the resource _id from its URI and attach it to a story.
      3. Insert + index the resource and enqueue a story update; a
         DuplicateKeyError means the resource was already handled, so it is
         logged and skipped.
      4. Delete the queue message (acknowledged even on duplicates, so the
         message is never re-delivered).
    """
    while True:
        try:
            # dequeue a processed resource
            message = self._mq_client.get_message(self._processed_resource_queue)
            resource = self._mq_codec.decode(message.body, Resource)
        except EmptyQueueException:
            # nothing to do yet — back off before polling again
            self._logger.info('Empty queue. Sleeping...')
            time.sleep(self.EMPTY_QUEUE_TIMEOUT)
            continue
        # deterministic id derived from the URI, so re-processing the same
        # URI collides on insert (caught as DuplicateKeyError below)
        resource._id = makeIdFromURI(resource.uri)
        story = self._story_provider.provide_for_resource(resource)
        resource.story_id = story._id
        try:
            # save and index the resource
            self._resource_collection.insert_model(resource)
            self._index_helper.index_resource(resource, self._index)
            self._logger.debug('Resource %s: Inserted and indexed.' % resource._id)
            # queue update so story consumers learn about the new resource
            update = StoryUpdate()
            update.story_id = story._id
            update.type = STORY_UPDATE_RESOURCE_INSERTED
            update.value = resource._id
            self._update_collection.insert_model(update)
            self._su_client.update(story._id)
        except DuplicateKeyError:
            # already inserted on a previous run — safe to drop the message
            self._logger.warning('Resource %s: Already inserted.' % resource._id)
        # delete the queue message (ack) in both the success and duplicate paths
        self._mq_client.delete_message(self._processed_resource_queue, message.id)
def _enqueue(self, resource):
    """Record *resource* as discovered and enqueue it for processing.

    Steps:
      1. Insert the URI into the capped ``discovered_uri`` collection; a
         DuplicateKeyError means the URI was already discovered, in which
         case nothing is enqueued.
      2. Unless the content exceeds the configured maximum length, encode
         the resource and put it on the ``discovered_resources`` queue.
      3. In every case, remove the resource from the local collection.

    Fixes vs. previous revision:
      - Oversized resources were logged as "Skipped" but still enqueued;
        the length check now actually guards the enqueue.
      - The message-queue client was connected but never disconnected;
        it is now closed in a ``finally`` block.
    """
    # Discovered URI - capped collection --db.discovered_uri
    mongoConnection = Connection()
    db = mongoConnection.feed_reading_discovery
    # settings
    max_content_length = config['discovery']['resource_enqueueing']['max_content_length']
    # message queue
    mq_client = message_queue_client_from_config(config['message_queue']['client'])
    mq_client.connect()
    mq_codec = JSONCodec()
    queue = 'discovered_resources'
    try:
        # insert into capped collection; raises DuplicateKeyError when the
        # URI has been discovered before (deterministic _id from the URI)
        db.discovered_uri.insert({'_id': makeIdFromURI(resource.uri), 'uri': resource.uri})
        if len(resource.content) > max_content_length:
            # too large to process — record the skip, do NOT enqueue
            self._logger.warning('Skipped %s: Content length is %s.' % (resource, len(resource.content)))
        else:
            # insert into queue
            msg_body = mq_codec.encode(resource)
            mq_client.put_message(queue, msg_body)
            self._logger.debug("Enqueued: %s" % resource._id)
    except DuplicateKeyError:
        # already discovered — best-effort, nothing to enqueue
        pass
    finally:
        # release the queue connection in every exit path
        mq_client.disconnect()
    # remove resource from collection regardless of outcome
    self._resources_collection.remove_model(resource)
def discovered_uri_process(self, entry):
    """Build a DiscoveredURI model from a feed entry's link."""
    fields = {
        '_id': makeIdFromURI(entry.link),
        'uri': entry.link,
    }
    return DiscoveredURI(**fields)
def resources_process(self, source, entry, config):
    """Build a DiscoveredResource model from a parsed feed *entry*.

    Stores *config* on the instance and (re)configures logging from it —
    presumably because this runs in a fresh worker process; verify against
    the caller.
    """
    self._config = config
    # configure logging
    logging.config.dictConfig(config['logging'])
    self._logger = logging.getLogger()

    # published: prefer updated_parsed, fall back to date_parsed, else None
    if 'updated_parsed' in entry:
        published = calendar.timegm(entry.updated_parsed)
    elif 'date_parsed' in entry:
        published = calendar.timegm(entry.date_parsed)
    else:
        published = None

    # feed_content: the longer of the summary and the first content item
    feed_content = ''
    if 'summary' in entry:
        feed_content = entry.summary
    if 'content' in entry:
        candidate = entry.content[0].value
        if len(candidate) > len(feed_content):
            feed_content = candidate

    fields = {
        '_id': makeIdFromURI(entry.link),
        'uri': entry.link,
        'type': 'article',
        'discovered': int(time.time()),
        'title': entry.title,
        'published': published,
        'feed_content': feed_content,
        'content': '',
        'source': source,
        'content_extracted': 1,
    }
    # author is optional in feed entries — only set when present
    if 'author' in entry:
        fields['author'] = entry.author
    return DiscoveredResource(**fields)
}
# NOTE(review): the '}' above closes a config dict (mq_config) whose opening
# lies before this chunk — confirm against the full file.
mq_client = message_queue_client_from_config(mq_config)
mq_codec = JSONCodec()
processed_resource_queue = 'processed_resources'

# ElasticSearch
es = ES('localhost:9200', timeout=60)
es_index = 'topic_tracking'

# dequeue one resource (message is deleted/acked before processing, so a
# failure below loses the message — NOTE(review): confirm this is intended)
mq_client.connect()
message = mq_client.get_message(processed_resource_queue)
resource = mq_codec.decode(message.body, Resource)
mq_client.delete_message(processed_resource_queue, message.id)
mq_client.disconnect()

# save the resource to mongo, keyed by a deterministic id from the URI
resource._id = makeIdFromURI(resource.uri)
resource_collection.insert_model(resource)

# index the resource twice, once per payload boost factor, under distinct
# ES ids ('<id>_1' and '<id>_1000')
for boost in [1, 1000]:
    es_doc = {}
    es_doc['content'] = resource.content
    es_doc['title'] = resource.title
    es_doc['entities'] = build_payload_string(resource.entities, boost)
    es_doc['terms'] = build_payload_string(resource.terms, boost)
    id = '%s_%d' % (resource._id, boost)
    r = es.index(es_doc, es_index, 'resource', id)
    pprint(r)