def scan_and_queue(self, p_queue, p_index, p_query={}, p_doctype=None, p_scroll_time='5m', p_timeout='1m'):
    """Reads docs from an es index according to a query and pushes them to the queue

        p_queue:        Queue where items are pushed to
        p_scroll_time:  Time for scroll method
        p_timeout:      Timeout - After this period, scan context is closed
        p_index:        Index where items are picked from
        p_doctype:      DocType of the items
        p_query:        ElasticSearch query for scanning the index
    """
    try:
        param = [{'host': self.host, 'port': self.port}]
        es = Elasticsearch(param)
        logger.info('Connected to ES Server for reading: %s', json.dumps(param))
    except Exception as e:
        logger.error('Connection failed to ES Server for reading: %s', json.dumps(param))
        logger.error(e)
        sys.exit(EXIT_IO_ERROR)

    try:
        if p_doctype is not None:
            documents = helpers.scan(client=es, query=p_query, size=1000, scroll=p_scroll_time, index=p_index, doc_type=p_doctype, timeout=p_timeout)
        else:
            documents = helpers.scan(client=es, query=p_query, size=1000, scroll=p_scroll_time, index=p_index, timeout=p_timeout)

        for doc in documents:
            logger.debug(doc)
            p_queue.put(doc)
    except Exception as e:
        logger.error("Error while scanning ES index %s with query %s", p_index, p_query)
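# A minimal sketch (not part of the library) of the producer/consumer wiring these
# scan_and_queue / dequeue_and_store methods assume: a reader fills a
# multiprocessing.JoinableQueue and one "poison pill" (None) per consumer tells each
# writer to stop. The names fake_reader / fake_writer are hypothetical stand-ins.
import multiprocessing

def fake_reader(queue):
    # Stand-in for scan_and_queue: push a few docs into the shared queue.
    for i in range(10):
        queue.put({'_id': i, 'value': 'doc %i' % i})

def fake_writer(queue):
    # Stand-in for dequeue_and_store: consume until the poison pill arrives.
    while True:
        doc = queue.get()
        if doc is None:          # poison pill
            queue.task_done()
            break
        print(doc)
        queue.task_done()

if __name__ == '__main__':
    q = multiprocessing.JoinableQueue()
    writers = [multiprocessing.Process(target=fake_writer, args=(q,)) for _ in range(2)]
    for w in writers:
        w.start()
    fake_reader(q)
    for _ in writers:
        q.put(None)              # one poison pill per writer
    q.join()                     # wait until every queued item has been task_done()'d
    for w in writers:
        w.join()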
def dequeue_and_store(self, p_queue, p_file, p_delimiter=',', p_quotechar='"', p_quoting=csv.QUOTE_NONNUMERIC):
    """Gets docs from p_queue and stores them in the csv file
        Stops dealing with the queue when receiving a "None" item

        p_queue:    queue from which items are picked. Elements have to be "list".
        p_file:     file to store in
    """
    # If the cursor does not exist yet, create it
    if p_file not in self.csvfilecursor:
        self.csvfilecursor[p_file] = open(p_file, "w")
        self.out_csvfile[p_file] = csv.writer(self.csvfilecursor[p_file], delimiter=p_delimiter, quotechar=p_quotechar, quoting=p_quoting)

    # Loop until receiving the "poison pill" item (meaning: no more element to read)
    poison_pill = False
    while not poison_pill:
        try:
            source_doc = p_queue.get()

            # Manage poison pill: stop trying to get elements
            if source_doc is None:
                logger.debug("CSVio has received 'poison pill' and is now ending ...")
                poison_pill = True
                self.csvfilecursor[p_file].close()
                p_queue.task_done()
                break

            self.out_csvfile[p_file].writerow(source_doc)
            p_queue.task_done()
        except KeyboardInterrupt:
            logger.info("CSVio.dequeue_and_store : User interruption of the process")
            self.csvfilecursor[p_file].close()
            sys.exit(EXIT_USER_INTERRUPT)
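# A small sketch (assumption) of what the CSV writer above expects from the queue:
# each element is a list of cells, written as one row. With csv.QUOTE_NONNUMERIC,
# strings are quoted and numbers are left bare.
row = ['user:42', 'Ada Lovelace', 10, 3.5]
# -> "user:42","Ada Lovelace",10,3.5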
def dequeue_and_store(self, p_queue, p_collection):
    """Gets docs from p_queue and stores them in a mongo collection
        Stops dealing with the queue when receiving a "None" item

        p_queue:        queue from which items are picked. Elements have to be "list".
        p_collection:   mongo collection where to store the docs
    """
    # uri for mongo connection
    uri = 'mongodb://%s:%s@%s:%s/%s?connectTimeoutMS=%s' % (self.user, self.password, self.host, self.port, self.base, self.connect_timeout)

    # Connect to mongo
    try:
        mongo_client = MongoClient(uri)
        mongo_connection = mongo_client[self.base]
        logger.info('Connection succeeded on %s', uri)
    except PyMongoError as e:
        logger.error('Failed to connect to %s', uri)
        logger.error(e)
        sys.exit(EXIT_IO_ERROR)

    # Loop until receiving the "poison pill" item (meaning: no more element to read)
    poison_pill = False
    while not poison_pill:
        try:
            source_doc = p_queue.get()

            # Manage poison pill
            if source_doc is None:
                logger.debug("Mongoio has received 'poison pill' and is now ending ...")
                poison_pill = True
                p_queue.task_done()
                break

            # Management of 'update/set' style requests
            try:
                find = source_doc['_mongo_find']
            except KeyError:
                find = {'_id': source_doc['_id']}

            try:
                update = source_doc['_mongo_update']
            except KeyError:
                update = source_doc

            # Upsert into the collection
            try:
                mongo_connection[p_collection].update(find, update, upsert=True)
            except Exception as e:
                logger.error("Document not inserted in Mongo Collection %s", source_doc['_id'])
                logger.error(e)

            p_queue.task_done()
        except KeyboardInterrupt:
            logger.info("Mongoio.dequeue_and_store : User interruption of the process")
            sys.exit(EXIT_USER_INTERRUPT)
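# A minimal sketch (assumption, not from the library) of the two document shapes the
# Mongo writer above accepts. A plain doc is upserted on its '_id'; a doc carrying the
# optional '_mongo_find' / '_mongo_update' keys drives an update/set style request.
plain_doc = {'_id': 'user:42', 'name': 'Ada', 'score': 10}
# -> find = {'_id': 'user:42'}, update = the whole document (full replace on upsert)

set_style_doc = {
    '_id': 'user:42',
    '_mongo_find': {'_id': 'user:42'},           # selector passed to update()
    '_mongo_update': {'$set': {'score': 11}},    # partial update instead of a full replace
}
# -> mongo_connection[p_collection].update({'_id': 'user:42'}, {'$set': {'score': 11}}, upsert=True)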
def dequeue_and_store(self, p_queue, p_index, p_nbmax_retry=3):
    """Gets docs from p_queue and stores them in algolia
        Stops dealing with the queue when receiving a "None" item

        p_queue:        queue from which items are picked. Elements have to be "list".
        p_index:        algolia index where to store the docs
        p_nbmax_retry:  number of tries when failing on a request (default is 3)
    """
    client = algoliasearch.Client(self.app_id, self.api_key)
    index = client.init_index(p_index)

    # Loop until receiving the "poison pill" item (meaning: no more element to read)
    poison_pill = False
    while not poison_pill:
        try:
            bulk = []
            while len(bulk) < self.bulk_size:
                source_doc = p_queue.get()

                # Manage poison pill
                if source_doc is None:
                    logger.debug("Algoliaio has received 'poison pill' and is now ending ...")
                    poison_pill = True
                    p_queue.task_done()
                    break

                bulk.append(source_doc)
                p_queue.task_done()

            try_counter = 1
            is_indexed = False
            while try_counter <= p_nbmax_retry and not is_indexed:
                try:
                    # Bulk indexation
                    if len(bulk) > 0:
                        logger.debug("Indexing %i documents", len(bulk))
                        index.add_objects(bulk)
                except Exception as e:
                    logger.error("Bulk not indexed in algolia - Retry number %i", try_counter)
                    logger.error(e)
                    try_counter += 1
                else:
                    is_indexed = True

            if not is_indexed:
                logger.error("Bulk not indexed in algolia : operation aborted after %i retries", try_counter - 1)
        except KeyboardInterrupt:
            logger.info("Algoliaio.dequeue_and_store : User interruption of the process")
            sys.exit(1)
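# A minimal sketch (assumption) of the queue elements the Algolia writer above sends:
# plain dicts batched into `bulk` and pushed with index.add_objects(). Including an
# 'objectID' makes the push idempotent: re-running updates the same record instead of
# creating a new one.
record = {'objectID': 'user:42', 'name': 'Ada', 'score': 10}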
def scan_and_queue(self, p_queue, p_index, p_query={}, p_connect_timeout=1, p_read_timeout=30):
    """Reads docs from an Algolia index according to a query and pushes them to the queue

        p_queue:    Queue where items are pushed to
        p_index:    Index where items are picked from
        p_query:    query for scanning the index
    """
    logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

    try:
        client = algoliasearch.Client(self.app_id, self.api_key)
        client.timeout = (p_connect_timeout, p_read_timeout)
        index = client.init_index(p_index)
    except Exception as e:
        logger.error(e)
        sys.exit(EXIT_IO_ERROR)

    try:
        documents = index.browse_all(p_query)
        start = time.time()
        for doc in documents:
            p_queue.put(doc)
            elapsed = time.time() - start
            with self.counters['nb_items_scanned'].get_lock():
                self.counters['nb_items_scanned'].value += 1
                nb_items = self.counters['nb_items_scanned'].value
                self.counters['scan_time'].value += elapsed

                if nb_items % self.counters['log_every'] == 0:
                    logger.info("Scan : {0} items".format(nb_items))
                    logger.debug(" -> Avg scan time : {0}ms".format(1000 * self.counters['scan_time'].value / nb_items))

                # Restart the scan timer
                start = time.time()
    except Exception as e:
        logger.error("Error while scanning Algolia index %s with query %s", p_index, p_query)
        with self.counters['nb_items_error'].get_lock():
            self.counters['nb_items_error'].value += 1
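# A sketch (assumption about how the shared counters are built elsewhere) of a
# self.counters dict compatible with the locking done above: multiprocessing.Value
# objects for the numeric counters, plus a plain int for the logging period.
from multiprocessing import Value

counters = {
    'nb_items_scanned': Value('i', 0),     # incremented under get_lock()
    'nb_items_stored': Value('i', 0),
    'nb_items_error': Value('i', 0),
    'scan_time': Value('f', 0.0),          # cumulated seconds, averaged in the logs
    'whole_storage_time': Value('f', 0.0),
    'bulk_storage_time': Value('f', 0.0),
    'log_every': 1000,                     # log progress every N items (plain int, used with %)
}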
def get_and_parse(p_inqueue, p_outqueue, p_process, **kwargs):
    """Gets a doc from an input queue, applies the transformation described by the p_process function,
        then pushes the resulting doc(s) into an output queue

        p_process must take a "doc" as its first parameter
    """
    current = current_process()
    while True:
        try:
            logger.debug("(%s) Size of queues. in : %i / out : %i", current.name, p_inqueue.qsize(), p_outqueue.qsize())

            try:
                in_doc = p_inqueue.get(False)
            except Exception:
                logger.info("Nothing to get in the Queue")
            else:
                # Manage poison pill
                if in_doc is None:
                    logger.info("(%s) => Parser has received 'poison pill' and is now ending ...", current.name)
                    p_inqueue.task_done()
                    break

                # Call the processing function, forwarding the remaining keyword arguments
                out_doc = p_process(in_doc, **kwargs)

                for doc in out_doc:
                    p_outqueue.put(doc)

                p_inqueue.task_done()
        except TimeoutError:
            logger.warning('Timeout exception while parsing with %s method', p_process)
        except KeyboardInterrupt:
            logger.info("User interruption of the process")
            sys.exit(0)
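# A hypothetical transformation function for get_and_parse (a sketch, not part of the
# library): it must accept the doc as its first argument, accept the forwarded keyword
# arguments, and return an iterable of output docs (zero, one or many).
def split_tags(doc, separator=','):
    # Emit one output doc per tag found in the source doc.
    for tag in filter(None, doc.get('tags', '').split(separator)):
        yield {'_id': '%s-%s' % (doc['_id'], tag.strip()), 'tag': tag.strip()}

# Typical call made by the worker:
# out_doc = split_tags(in_doc, separator=';')   # keyword args come from **kwargs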
def dequeue_and_store(self, p_queue, p_index, p_nbmax_retry=3):
    """Gets docs from p_queue and stores them in algolia
        Stops dealing with the queue when receiving a "None" item

        p_queue:        queue from which items are picked. Elements have to be "list".
        p_index:        algolia index where to store the docs
        p_nbmax_retry:  number of tries when failing on a request (default is 3)
    """
    logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

    client = algoliasearch.Client(self.app_id, self.api_key)
    index = client.init_index(p_index)

    # Loop until receiving the "poison pill" item (meaning: no more element to read)
    # Main loop max retry
    main_loop_max_retry = 5
    main_loop_retry = 0

    start = time.time()
    poison_pill = False
    while not poison_pill:
        try:
            bulk = []
            while len(bulk) < self.bulk_size:
                source_doc = p_queue.get()

                # Manage poison pill
                if source_doc is None:
                    poison_pill = True
                    p_queue.task_done()
                    break

                bulk.append(source_doc)
                p_queue.task_done()

            try_counter = 1
            is_indexed = False
            while try_counter <= p_nbmax_retry and not is_indexed:
                start_bulking = time.time()
                try:
                    # Bulk indexation
                    if len(bulk) > 0:
                        index.add_objects(bulk)
                except Exception as e:
                    logger.error("Bulk not indexed in algolia - Retry number %i", try_counter)
                    logger.error(e)
                    try_counter += 1
                else:
                    is_indexed = True
                    now = time.time()
                    elapsed_bulking = now - start_bulking
                    elapsed = now - start
                    with self.counters['nb_items_stored'].get_lock():
                        self.counters['nb_items_stored'].value += len(bulk)
                        self.counters['whole_storage_time'].value += elapsed
                        self.counters['bulk_storage_time'].value += elapsed_bulking
                        nb_items = self.counters['nb_items_stored'].value

                        if nb_items % self.counters['log_every'] == 0 and nb_items != 0:
                            logger.info("Store : {0} items".format(nb_items))
                            logger.debug(" -> Avg store time : {0}ms".format(1000 * self.counters['whole_storage_time'].value / nb_items))
                            logger.debug(" -> Avg bulk time : {0}ms".format(1000 * self.counters['bulk_storage_time'].value / nb_items))

                        start = time.time()

            if not is_indexed:
                start = time.time()
                logger.error("Bulk not indexed in algolia : operation aborted after %i retries", try_counter - 1)
                with self.counters['nb_items_error'].get_lock():
                    self.counters['nb_items_error'].value += len(bulk)
        except KeyboardInterrupt:
            logger.info("Algoliaio.dequeue_and_store : User interruption of the process")
            poison_pill = True
            p_queue.task_done()
        except Exception as e:
            logger.error("An error occurred while storing elements to Algolia : {0}".format(e))
            main_loop_retry += 1
            if main_loop_retry >= main_loop_max_retry:
                logger.error("Too many errors while storing. Process interrupted after {0} errors".format(main_loop_retry))
                poison_pill = True
                p_queue.task_done()
def dequeue_and_store(self, p_queue, p_index, p_nbmax_retry=3):
    """Gets docs from p_queue and stores them in algolia
        Stops dealing with the queue when receiving a "None" item

        p_queue:        queue from which items are picked. Elements have to be "list".
        p_index:        algolia index where to store the docs
        p_nbmax_retry:  number of tries when failing on a request (default is 3)
    """
    logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

    client = algoliasearch.Client(self.app_id, self.api_key)
    index = client.init_index(p_index)

    # Loop until receiving the "poison pill" item (meaning: no more element to read)
    start = time.time()
    poison_pill = False
    while not poison_pill:
        try:
            bulk = []
            while len(bulk) < self.bulk_size:
                source_doc = p_queue.get()

                # Manage poison pill
                if source_doc is None:
                    poison_pill = True
                    p_queue.task_done()
                    break

                bulk.append(source_doc)
                p_queue.task_done()

            try_counter = 1
            is_indexed = False
            while try_counter <= p_nbmax_retry and not is_indexed:
                start_bulking = time.time()
                try:
                    # Bulk indexation
                    if len(bulk) > 0:
                        index.add_objects(bulk)
                except Exception as e:
                    logger.error("Bulk not indexed in algolia - Retry number %i", try_counter)
                    logger.error(e)
                    try_counter += 1
                else:
                    is_indexed = True
                    now = time.time()
                    elapsed_bulking = now - start_bulking
                    elapsed = now - start
                    with self.counters['nb_items_stored'].get_lock():
                        self.counters['nb_items_stored'].value += len(bulk)
                        self.counters['whole_storage_time'].value += elapsed
                        self.counters['bulk_storage_time'].value += elapsed_bulking
                        nb_items = self.counters['nb_items_stored'].value

                        if nb_items % self.counters['log_every'] == 0 and nb_items != 0:
                            logger.info("Store : {0} items".format(nb_items))
                            logger.debug(" -> Avg store time : {0}ms".format(1000 * self.counters['whole_storage_time'].value / nb_items))
                            logger.debug(" -> Avg bulk time : {0}ms".format(1000 * self.counters['bulk_storage_time'].value / nb_items))

                        start = time.time()

            if not is_indexed:
                start = time.time()
                logger.error("Bulk not indexed in algolia : operation aborted after %i retries", try_counter - 1)
                with self.counters['nb_items_error'].get_lock():
                    self.counters['nb_items_error'].value += len(bulk)
        except KeyboardInterrupt:
            logger.info("Algoliaio.dequeue_and_store : User interruption of the process")
            sys.exit(1)
def dequeue_and_store(self, p_queue, p_index, p_timeout=10, p_nbmax_retry=3):
    """Gets docs from p_queue and stores them in the elasticsearch index
        Stops dealing with the queue when receiving a "None" item

        p_queue:        queue from which items are picked. Elements have to be "list".
        p_index:        elasticsearch index where to store the docs
        p_timeout:      timeout for bulk (default is 10s)
        p_nbmax_retry:  number of tries when failing on a request (default is 3)
    """
    try:
        param = [{'host': self.host, 'port': self.port, 'timeout': p_timeout, 'max_retries': p_nbmax_retry, 'retry_on_timeout': True}]
        es = Elasticsearch(param)
        logger.info('Connected to ES Server: %s', json.dumps(param))
    except Exception as e:
        logger.error('Connection failed to ES Server : %s', json.dumps(param))
        logger.error(e)
        sys.exit(EXIT_IO_ERROR)

    # Loop until receiving the "poison pill" item (meaning: no more element to read)
    poison_pill = False
    while not poison_pill:
        try:
            bulk = []
            while len(bulk) < self.bulk_size:
                source_doc = p_queue.get()

                # Manage poison pill
                if source_doc is None:
                    logger.debug("ESio has received 'poison pill' and is now ending ...")
                    poison_pill = True
                    p_queue.task_done()
                    break

                # Bulk element creation from the source_doc
                source_doc['_index'] = p_index
                bulk.append(source_doc)
                p_queue.task_done()

            try_counter = 1
            is_indexed = False
            while try_counter <= p_nbmax_retry and not is_indexed:
                try:
                    # Bulk indexation
                    if len(bulk) > 0:
                        logger.debug("Indexing %i documents", len(bulk))
                        helpers.bulk(es, bulk, raise_on_error=True)
                except Exception as e:
                    logger.error("Bulk not indexed in ES - Retry n°%i", try_counter)
                    logger.error(e)
                    try_counter += 1
                else:
                    is_indexed = True

            if not is_indexed:
                logger.error("Bulk not indexed in elasticsearch : operation aborted after %i retries", try_counter - 1)
        except KeyboardInterrupt:
            logger.info("ESio.dequeue_and_store : User interruption of the process")
            sys.exit(EXIT_USER_INTERRUPT)
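# A minimal sketch (assumption) of a queue element the ES writer above can index. The
# method only sets '_index' itself, so each element must already follow the
# elasticsearch-py helpers.bulk action format: '_source' holds the document body, and
# '_id' / '_type' are optional depending on the cluster version.
action = {
    '_id': 'doc-1',
    '_source': {'title': 'hello', 'views': 3},
}
# After dequeueing, the writer adds '_index': p_index and sends the whole batch with
# helpers.bulk(es, bulk, raise_on_error=True).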