Пример #1
0
    def scan_and_queue(self,p_queue,p_index,p_query={},p_doctype=None,p_scroll_time='5m',p_timeout='1m'):
        """Reads docs from an es index according to a query and pushes them to the queue

            Exits the process with EXIT_IO_ERROR when the ES client cannot be created.
            An error raised while scanning is logged and ends the scan; it is not re-raised.

            p_queue:          Queue where items are pushed to
            p_index:          Index where items are picked from
            p_query:          ElasticSearch query for scanning the index (not modified by this method)
            p_doctype:        DocType of the items; when None, no doc_type is passed to the scan
            p_scroll_time:    Time for scroll method
            p_timeout:        Timeout - After this period, scan context is closed
        """
        # Built outside the try block so the failure handler can always log it
        param = [{'host':self.host,'port':self.port}]
        try:
            es = Elasticsearch(param)
            logger.info('Connected to ES Server for reading: %s',json.dumps(param))
        except Exception as e:
            logger.error('Connection failed to ES Server for reading: %s',json.dumps(param))
            logger.error(e)
            sys.exit(EXIT_IO_ERROR)

        scan_args = dict(client=es, query=p_query, size=1000, scroll=p_scroll_time, index=p_index, timeout=p_timeout)
        # Compare the argument itself (the previous test was on the string literal 'p_doctype',
        # which is never None, so the doc_type-less branch could not be reached)
        if p_doctype is not None:
            scan_args['doc_type'] = p_doctype

        try:
            # helpers.scan is consumed lazily: request errors surface while iterating
            for doc in helpers.scan(**scan_args):
                logger.debug(doc)
                p_queue.put(doc)
        except Exception as e:
            # Keep the cause in the log, otherwise a failed scan cannot be diagnosed
            logger.error("Error while scanning ES index %s with query %s",p_index,p_query)
            logger.error(e)
Пример #2
0
    def dequeue_and_store(self,p_queue,p_file,p_delimiter=',',p_quotechar='"',p_quoting=csv.QUOTE_NONNUMERIC):
        """Gets docs from p_queue and stores them in the csv file
             Stops dealing with the queue when receiving a "None" item

            The file is always closed when the method ends (poison pill, user
            interruption or unexpected error) and its handle/writer are removed from
            self.csvfilecursor / self.out_csvfile, so a later call for the same file
            re-opens it (in "w" mode) instead of writing to a closed handle.

            p_queue:        queue which items are picked from. Each element is written as one csv row.
            p_file:         file to store in
            p_delimiter:    csv field delimiter
            p_quotechar:    csv quote character
            p_quoting:      csv quoting policy (a csv.QUOTE_* constant)
        """

        # If not exists, creates the cursor
        # NOTE(review): on Python 3 the csv module recommends open(..., newline='');
        # left unchanged because the targeted Python version is not visible from here.
        if p_file not in self.csvfilecursor:
            self.csvfilecursor[p_file] = open(p_file, "w")
            self.out_csvfile[p_file] = csv.writer(self.csvfilecursor[p_file],delimiter=p_delimiter,quotechar=p_quotechar,quoting=p_quoting)

        writer = self.out_csvfile[p_file]

        def _close_file():
            # Idempotent: drops the cached handle/writer and closes the file once
            handle = self.csvfilecursor.pop(p_file, None)
            self.out_csvfile.pop(p_file, None)
            if handle is not None:
                handle.close()

        # Loop until receiving the "poison pill" item (meaning : no more element to read)
        try:
            while True:
                source_doc = p_queue.get()

                # Manage poison pill : stop trying to get elements
                if source_doc is None:
                    logger.debug("CSVio has received 'poison pill' and is now ending ...")
                    # Close before acknowledging the pill, so the file is complete
                    # by the time a p_queue.join() on the other side returns
                    _close_file()
                    p_queue.task_done()
                    break

                writer.writerow(source_doc)

                p_queue.task_done()
        except KeyboardInterrupt:
            logger.info("CSVio.dequeue_and_store : User interruption of the process")
            sys.exit(EXIT_USER_INTERRUPT)
        finally:
            # Also reached through sys.exit() and unexpected errors
            _close_file()
Пример #3
0
    def dequeue_and_store(self,p_queue, p_collection):
        """Gets docs from p_queue and stores them in a mongo collection
             Stops dealing with the queue when receiving a "None" item

            Each doc is upserted. The selector is doc['_mongo_find'] when present,
            {'_id': doc['_id']} otherwise; the update is doc['_mongo_update'] when
            present, the doc itself otherwise. A doc that cannot be stored is logged
            and skipped. Exits with EXIT_IO_ERROR when the client cannot be created.

            p_queue:             queue which items are picked from. Elements are accessed by key (dict-like).
            p_collection:        mongo collection where to store the docs
        """
        # uri for mongo connection (one placeholder per argument, connect timeout included)
        # NOTE(review): user/password are inserted as-is; if they can contain reserved
        # characters (':', '@', '/', '%') they presumably need URL-escaping - confirm.
        uri = 'mongodb://%s:%s@%s:%s/%s?connectTimeoutMS=%s' % (self.user,self.password,self.host,self.port,self.base,self.connect_timeout)
        # Same uri with the password masked : this is the only form written to the logs
        log_uri = 'mongodb://%s:***@%s:%s/%s?connectTimeoutMS=%s' % (self.user,self.host,self.port,self.base,self.connect_timeout)

        # Connect to mongo
        try:
            mongo_client = MongoClient(uri)
            mongo_connection = mongo_client[self.base]
            logger.info('Connection succeeded on %s',log_uri)
        except PyMongoError as e:
            logger.error('Failed to connect to %s',log_uri)
            logger.error(e)
            sys.exit(EXIT_IO_ERROR)

        try:
            # Loop until receiving the "poison pill" item (meaning : no more element to read)
            while True:
                try:
                    source_doc = p_queue.get()

                    # Manage poison pill
                    if source_doc is None:
                        logger.debug("Mongoio has received 'poison pill' and is now ending ...")
                        p_queue.task_done()
                        break

                    try:
                        # management of 'update/set' style request
                        try:
                            find = source_doc['_mongo_find']
                        except KeyError:
                            find = {'_id':source_doc['_id']}

                        try:
                            update = source_doc['_mongo_update']
                        except KeyError:
                            update = source_doc

                        # insert into collection
                        mongo_connection[p_collection].update(find,update,upsert=True)
                    except Exception as e:
                        # Also covers docs without '_id' : they are reported instead of killing the worker
                        doc_id = source_doc.get('_id') if isinstance(source_doc, dict) else None
                        logger.error("Document not inserted in Mongo Collection %s", doc_id)
                        logger.error(e)

                    p_queue.task_done()

                except KeyboardInterrupt:
                    logger.info("Mongoio.dequeue_and_store : User interruption of the process")
                    sys.exit(EXIT_USER_INTERRUPT)
        finally:
            # Release the client on every exit path (poison pill, sys.exit, unexpected error)
            mongo_client.close()
Пример #4
0
    def dequeue_and_store(self,p_queue,p_index,p_nbmax_retry=3):
        """Drains p_queue and pushes its docs to an algolia index, bulk by bulk

            A bulk is sent once self.bulk_size docs have been collected, or as soon as
            a "None" item (the "poison pill") is read; the pill also ends the method.
            Every item read from the queue is acknowledged with task_done().

            p_queue:            queue the docs are read from
            p_index:            name of the algolia index receiving the docs
            p_nbmax_retry:      maximum number of attempts for one bulk (default is 3)
        """

        algolia_client = algoliasearch.Client(self.app_id,self.api_key)
        target_index = algolia_client.init_index(p_index)

        pill_received = False
        while not pill_received:
            try:
                # Collect the next bulk (may be shorter, or empty, when the pill shows up)
                batch = []
                while len(batch) < self.bulk_size:
                    item = p_queue.get()

                    if item is None:
                        logger.debug("ESio has received 'poison pill' and is now ending ...")
                        pill_received = True
                        p_queue.task_done()
                        break

                    batch.append(item)
                    p_queue.task_done()

                # Send the bulk, trying again on failure up to p_nbmax_retry attempts
                attempt = 1
                stored = False
                while not stored and attempt <= p_nbmax_retry:
                    try:
                        if batch:
                            logger.debug("Indexing %i documents",len(batch))
                            target_index.add_objects(batch)
                        stored = True
                    except Exception as err:
                        logger.error("Bulk not indexed in algolia - Retry number %i",attempt)
                        logger.error(err)
                        attempt += 1

                if not stored:
                    logger.error("Bulk not indexed in algolia : operation aborted after %i retries",attempt-1)

            except KeyboardInterrupt:
                logger.info("ESio.dequeue_and_store : User interruption of the process")
                sys.exit(1)
Пример #5
0
    def scan_and_queue(self,
                       p_queue,
                       p_index,
                       p_query={},
                       p_connect_timeout=1,
                       p_read_timeout=30):
        """Reads docs from an Algolia index according to a query and pushes them to the queue

            Updates self.counters ('nb_items_scanned', 'scan_time') for every doc
            pushed, and 'nb_items_error' when the scan fails. A scan failure is
            logged (with its cause) and ends the scan; it is not re-raised.
            Exits with EXIT_IO_ERROR when the client/index cannot be created.

            p_queue:            Queue where items are pushed to
            p_index:            Index where items are picked from
            p_query:            query for scanning the index (not modified by this method)
            p_connect_timeout:  first element of the client timeout tuple (presumably seconds - confirm)
            p_read_timeout:     second element of the client timeout tuple (presumably seconds - confirm)
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level,
                               self.formatter)
        try:
            client = algoliasearch.Client(self.app_id, self.api_key)
            client.timeout = (p_connect_timeout, p_read_timeout)
            index = client.init_index(p_index)
        except Exception as e:
            logger.error(e)
            sys.exit(EXIT_IO_ERROR)

        try:
            documents = index.browse_all(p_query)
            start = time.time()
            for doc in documents:
                p_queue.put(doc)
                # Time spent fetching this doc and pushing it to the queue
                elapsed = time.time() - start

                with self.counters['nb_items_scanned'].get_lock():
                    self.counters['nb_items_scanned'].value += 1
                    nb_items = self.counters['nb_items_scanned'].value
                    self.counters['scan_time'].value += elapsed

                    # 'log_every' is used directly as a number here (no .value)
                    if nb_items % self.counters['log_every'] == 0:
                        logger.info("Scan : {0} items".format(nb_items))
                        logger.debug("   -> Avg scan time : {0}ms".format(
                            1000 * self.counters['scan_time'].value /
                            nb_items))

                    # Start timers reinit
                    start = time.time()
        except Exception as e:
            # Report at error level and keep the cause, otherwise the failure
            # cannot be diagnosed from the logs
            logger.error("Error while scanning Algolia index %s with query %s",
                         p_index, p_query)
            logger.error(e)
            with self.counters['nb_items_error'].get_lock():
                self.counters['nb_items_error'].value += 1
Пример #6
0
def get_and_parse(p_inqueue,p_outqueue,p_process,**kwargs):
    """
        Gets doc from an input queue, applies transformation according to p_process function,
        then pushes the so produced new docs into an output queue.
        Stops when a "None" item (poison pill) is read from the input queue.

        p_inqueue:   queue the docs are read from; every item read is acknowledged with task_done()
        p_outqueue:  queue receiving each element of the iterable returned by p_process
        p_process:   callable taking a "doc" as a first parameter and returning an iterable of docs
        kwargs:      extra keyword arguments forwarded to p_process
    """

    current = current_process()

    # Seconds to wait for a doc before reporting an empty queue. Blocking with a
    # timeout (instead of a non-blocking get) avoids spinning at full speed and
    # flooding the log while the queue is empty.
    queue_poll_timeout = 1

    while True:
        try:
            logger.debug("(%s) Size of queues. in : %i / ou : %i",current.name,p_inqueue.qsize(),p_outqueue.qsize())

            try:
                in_doc = p_inqueue.get(True, queue_poll_timeout)
            except Exception:
                # NOTE(review): kept broad on purpose (best effort); this is
                # presumably meant for the queue "Empty" exception only.
                logger.info("Nothing to get in the Queue")
                continue

            # Manage poison pill
            if in_doc is None:
                logger.info("(%s) => Parser has received 'poison pill' and is now ending ...",current.name)
                p_inqueue.task_done()
                break

            try:
                # Call the proc with the doc and the keyword arguments
                # (the ** unwraps the dict when calling the function)
                out_doc = p_process(in_doc,**kwargs)

                for doc in out_doc:
                    p_outqueue.put(doc)
            finally:
                # Always acknowledge the doc, even when p_process fails : a missing
                # task_done() would make p_inqueue.join() wait forever
                p_inqueue.task_done()

        except TimeoutError:
            logger.warning('Timeout exception while parsing with %s method',p_process)
        except KeyboardInterrupt:
            logger.info("user interruption")
            sys.exit(0)
Пример #7
0
    def scan_and_queue(self, p_queue, p_index, p_query={}, p_connect_timeout=1, p_read_timeout=30):
        """Reads docs from an Algolia index according to a query and pushes them to the queue

            Updates self.counters ('nb_items_scanned', 'scan_time') for every doc
            pushed, and 'nb_items_error' when the scan fails. A scan failure is
            logged (with its cause) and ends the scan; it is not re-raised.
            Exits with EXIT_IO_ERROR when the client/index cannot be created.

            p_queue:            Queue where items are pushed to
            p_index:            Index where items are picked from
            p_query:            query for scanning the index (not modified by this method)
            p_connect_timeout:  first element of the client timeout tuple (presumably seconds - confirm)
            p_read_timeout:     second element of the client timeout tuple (presumably seconds - confirm)
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)
        try:
            client = algoliasearch.Client(self.app_id, self.api_key)
            client.timeout = (p_connect_timeout, p_read_timeout)
            index = client.init_index(p_index)
        except Exception as e:
            logger.error(e)
            sys.exit(EXIT_IO_ERROR)

        try:
            documents = index.browse_all(p_query)
            start = time.time()
            for doc in documents:
                p_queue.put(doc)
                # Time spent fetching this doc and pushing it to the queue
                elapsed = time.time() - start

                with self.counters['nb_items_scanned'].get_lock():
                    self.counters['nb_items_scanned'].value += 1
                    nb_items = self.counters['nb_items_scanned'].value
                    self.counters['scan_time'].value += elapsed

                    # 'log_every' is used directly as a number here (no .value)
                    if nb_items % self.counters['log_every'] == 0:
                        logger.info("Scan : {0} items".format(nb_items))
                        logger.debug("   -> Avg scan time : {0}ms".format(1000*self.counters['scan_time'].value / nb_items))

                    # Start timers reinit
                    start = time.time()
        except Exception as e:
            # Report at error level and keep the cause, otherwise the failure cannot be diagnosed from the logs
            logger.error("Error while scanning Algolia index %s with query %s", p_index, p_query)
            logger.error(e)
            with self.counters['nb_items_error'].get_lock():
                self.counters['nb_items_error'].value += 1
Пример #8
0
    def dequeue_and_store(self, p_queue, p_index, p_nbmax_retry=3):
        """Gets docs from p_queue and stores them in an algolia index, in bulks
             Stops dealing with the queue when receiving a "None" item, on user
             interruption, or after 5 unexpected errors in the main loop

            Docs are accumulated until self.bulk_size of them have been read (or the
            "None" item is received), then sent with index.add_objects().
            Updates self.counters : 'nb_items_stored', 'whole_storage_time' and
            'bulk_storage_time' after a successful bulk, 'nb_items_error' after a
            bulk that could not be stored.

            p_queue:            queue which items are picked from; every item read is acknowledged with task_done()
            p_index:            algolia index where to store the docs
            p_nbmax_retry:      number of tries when failing on a request (default is 3)
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

        client = algoliasearch.Client(self.app_id, self.api_key)
        index = client.init_index(p_index)

        # Loop until receiving the "poison pill" item (meaning : no more element to read)
        # Maximum number of unexpected errors tolerated in the main loop before giving up
        # (the error count below is never reset)
        main_loop_max_retry = 5
        main_loop_retry = 0
        # Reference time for 'whole_storage_time' : reset after each bulk, stored or not
        start = time.time()
        poison_pill = False
        while not(poison_pill):
            try:
                # Read docs until the bulk is full or the poison pill is received
                bulk = []
                while (len(bulk) < self.bulk_size):
                    source_doc = p_queue.get()

                    # Manage poison pill
                    if source_doc is None:
                        poison_pill = True
                        p_queue.task_done()
                        break

                    bulk.append(source_doc)
                    # The doc is acknowledged as soon as it is read, i.e. before it is indexed
                    p_queue.task_done()

                # Send the bulk, with at most p_nbmax_retry attempts
                try_counter = 1
                is_indexed = False
                while try_counter <= p_nbmax_retry and not is_indexed:
                    start_bulking = time.time()
                    try:
                        # Bulk indexation (an empty bulk is not sent, but still counts as indexed)
                        if len(bulk) > 0:
                            index.add_objects(bulk)
                    except Exception as e:
                        logger.error("Bulk not indexed in algolia - Retry number %i", try_counter)
                        logger.error(e)
                        try_counter += 1
                    else:
                        is_indexed = True
                        now = time.time()
                        # Duration of the successful add_objects() attempt only
                        elapsed_bulking = now - start_bulking
                        # Duration since the previous bulk ended (reading from the queue and failed attempts included)
                        elapsed = now - start
                        # The three counters are updated under the lock of 'nb_items_stored'
                        with self.counters['nb_items_stored'].get_lock():
                            self.counters['nb_items_stored'].value += len(bulk)
                            self.counters['whole_storage_time'].value += elapsed
                            self.counters['bulk_storage_time'].value += elapsed_bulking
                            nb_items = self.counters['nb_items_stored'].value
                            # 'log_every' is used directly as a number here (no .value).
                            # nb_items grows by len(bulk), so this only logs when the total
                            # lands exactly on a multiple of 'log_every'
                            if nb_items % self.counters['log_every'] == 0 and nb_items != 0:
                                logger.info("Store : {0} items".format(nb_items))
                                logger.debug("   -> Avg store time : {0}ms".format(1000 * self.counters['whole_storage_time'].value / nb_items))
                                logger.debug("   -> Avg bulk time  : {0}ms".format(1000 * self.counters['bulk_storage_time'].value / nb_items))

                            start = time.time()

                if not is_indexed:
                    # All attempts failed : the bulk is dropped and counted as errors
                    start = time.time()
                    logger.error("Bulk not indexed in algolia : operation aborted after %i retries", try_counter - 1)
                    with self.counters['nb_items_error'].get_lock():
                        self.counters['nb_items_error'].value += len(bulk)

            except KeyboardInterrupt:
                logger.info("ESio.dequeue_and_store : User interruption of the process")
                poison_pill = True
                # NOTE(review): this task_done() is not paired with a get() in this handler;
                # presumably it compensates for the poison pill that will never be read - confirm
                p_queue.task_done()
            except Exception as e:
                logger.error("An error occured while storing elements to Algolia : {0}".format(e))
                main_loop_retry += 1
                if main_loop_retry >= main_loop_max_retry:
                    logger.error("Too many errors while storing. Process interrupted after {0} errors".format(main_loop_retry))
                    poison_pill = True
                    # NOTE(review): same unpaired task_done() as in the KeyboardInterrupt handler - confirm intent
                    p_queue.task_done()
Пример #9
0
    def dequeue_and_store(self, p_queue, p_index, p_nbmax_retry=3):
        """Drains p_queue and pushes its docs to an algolia index, bulk by bulk

            A bulk is sent once self.bulk_size docs have been collected, or as soon
            as a "None" item (the "poison pill") is read; the pill also ends the
            method. Every item read from the queue is acknowledged with task_done().
            Updates self.counters : 'nb_items_stored', 'whole_storage_time' and
            'bulk_storage_time' after a stored bulk, 'nb_items_error' after a bulk
            that could not be stored.

            p_queue:            queue the docs are read from
            p_index:            name of the algolia index receiving the docs
            p_nbmax_retry:      maximum number of attempts for one bulk (default is 3)
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level,
                               self.formatter)

        algolia_client = algoliasearch.Client(self.app_id, self.api_key)
        target_index = algolia_client.init_index(p_index)

        # Reference time for the "whole storage" duration; reset after every bulk
        cycle_start = time.time()
        pill_received = False
        while not pill_received:
            try:
                # Collect the next bulk (may be shorter, or empty, when the pill shows up)
                batch = []
                while len(batch) < self.bulk_size:
                    item = p_queue.get()

                    if item is None:
                        pill_received = True
                        p_queue.task_done()
                        break

                    batch.append(item)
                    p_queue.task_done()

                # Send the bulk, trying again on failure up to p_nbmax_retry attempts
                attempt = 1
                stored = False
                while not stored and attempt <= p_nbmax_retry:
                    attempt_start = time.time()
                    try:
                        if batch:
                            target_index.add_objects(batch)
                        stored = True
                    except Exception as err:
                        logger.error(
                            "Bulk not indexed in algolia - Retry number %i",
                            attempt)
                        logger.error(err)
                        attempt += 1
                    else:
                        # Statistics of the stored bulk
                        finished_at = time.time()
                        attempt_duration = finished_at - attempt_start
                        cycle_duration = finished_at - cycle_start
                        stored_counter = self.counters['nb_items_stored']
                        with stored_counter.get_lock():
                            stored_counter.value += len(batch)
                            self.counters['whole_storage_time'].value += cycle_duration
                            self.counters['bulk_storage_time'].value += attempt_duration
                            total_stored = stored_counter.value
                            if total_stored % self.counters['log_every'] == 0 and total_stored != 0:
                                logger.info("Store : {0} items".format(total_stored))
                                avg_store_ms = 1000 * self.counters['whole_storage_time'].value / total_stored
                                logger.debug("   -> Avg store time : {0}ms".format(avg_store_ms))
                                avg_bulk_ms = 1000 * self.counters['bulk_storage_time'].value / total_stored
                                logger.debug("   -> Avg bulk time  : {0}ms".format(avg_bulk_ms))

                            cycle_start = time.time()

                if not stored:
                    # Every attempt failed : the bulk is dropped and counted as errors
                    cycle_start = time.time()
                    logger.error(
                        "Bulk not indexed in algolia : operation aborted after %i retries",
                        attempt - 1)
                    error_counter = self.counters['nb_items_error']
                    with error_counter.get_lock():
                        error_counter.value += len(batch)

            except KeyboardInterrupt:
                logger.info(
                    "ESio.dequeue_and_store : User interruption of the process"
                )
                sys.exit(1)
Пример #10
0
    def dequeue_and_store(self,p_queue,p_index,p_timeout=10,p_nbmax_retry=3):
        """Drains p_queue and indexes its docs in elasticsearch, bulk by bulk

            A bulk is sent once self.bulk_size docs have been collected, or as soon as
            a "None" item (the "poison pill") is read; the pill also ends the method.
            Every item read from the queue is acknowledged with task_done().
            Each doc gets its '_index' key set to p_index before being sent.
            Exits with EXIT_IO_ERROR when the ES client cannot be created.

            p_queue:            queue the docs are read from. Docs must support item assignment (doc['_index'] = ...).
            p_index:            elasticsearch index where to store the docs
            p_timeout:          'timeout' value given to the ES client (default is 10)
            p_nbmax_retry:      'max_retries' given to the ES client, and maximum number of attempts for one bulk (default is 3)
        """
        try:
            es_hosts = [{'host':self.host,'port':self.port,'timeout':p_timeout,'max_retries':p_nbmax_retry,'retry_on_timeout':True}]
            es_client = Elasticsearch(es_hosts)
            logger.info('Connected to ES Server: %s',json.dumps(es_hosts))
        except Exception as err:
            logger.error('Connection failed to ES Server : %s',json.dumps(es_hosts))
            logger.error(err)
            sys.exit(EXIT_IO_ERROR)

        pill_received = False
        while not pill_received:
            try:
                # Collect the next bulk (may be shorter, or empty, when the pill shows up)
                batch = []
                while len(batch) < self.bulk_size:
                    item = p_queue.get()

                    if item is None:
                        logger.debug("ESio has received 'poison pill' and is now ending ...")
                        pill_received = True
                        p_queue.task_done()
                        break

                    # Route the doc to the target index
                    item['_index'] = p_index

                    batch.append(item)
                    p_queue.task_done()

                # Send the bulk, trying again on failure up to p_nbmax_retry attempts
                attempt = 1
                stored = False
                while not stored and attempt <= p_nbmax_retry:
                    try:
                        if batch:
                            logger.debug("Indexing %i documents",len(batch))
                            helpers.bulk(es_client, batch, raise_on_error=True)
                        stored = True
                    except Exception as err:
                        logger.error("Bulk not indexed in ES - Retry n°%i",attempt)
                        logger.error(err)
                        attempt += 1

                if not stored:
                    logger.error("Bulk not indexed in elasticsearch : operation aborted after %i retries",attempt-1)

            except KeyboardInterrupt:
                logger.info("ESio.dequeue_and_store : User interruption of the process")
                sys.exit(EXIT_USER_INTERRUPT)