Example #1
0
    def __init__(self, batch_size=500, age=0):
        """Initialize the class.

        Args:
            batch_size: Number of files to read
            age: Minimum age of files to be read per batch

        Returns:
            None

        """
        # Unique ID for this batch, derived from the current time in ms
        self._batch_id = int(time.time() * 1000)

        # Locate the agent's cache directory
        cache_directory = Config().agent_cache_directory(PATTOO_API_AGENT_NAME)

        # Load a batch of sufficiently old cache files without dying on error
        self._data = files.read_json_files(
            cache_directory, die=False, age=age, count=batch_size)

        # Record how many files were read
        self.files = len(self._data)
Example #2
0
def process_cache(batch_size=500, max_duration=3600, fileage=10, script=False):
    """Ingest data.

    Args:
        batch_size: Number of files to process at a time
        max_duration: Maximum duration
        fileage: Minimum age of files to be processed in seconds
        script: True if running as a script; creates and deletes a lockfile
            (the daemon has its own locking mechanism)

    Returns:
        success: True if successful

    Method:
        1) Read the files in the cache directory older than a threshold
        2) Process the data in the files
        3) Repeat, if new files are found that are older than the threshold,
           or we have been running too long.

        Batches of files are read to reduce the risk of overloading available
        memory, and ensure we can exit if we are running too long.

    """
    # Initialize key variables
    records = 0
    start = time.time()
    looptime = 0
    files_read = 0
    success = True

    # Get cache directory
    config = Config()
    directory = config.agent_cache_directory(PATTOO_API_AGENT_NAME)

    # Log what we are doing
    log_message = 'Processing ingest cache.'
    log.log2info(20085, log_message)

    # Get the number of files in the directory
    files_found = len(
        [_ for _ in os.listdir(directory) if _.endswith('.json')])

    # Create lockfile only if running as a script.
    # The daemon has its own locking mechanism
    if script:
        success = _lock()
        if not success:
            return bool(success)

    # Process the files in batches to reduce the database connection count
    # This can cause errors
    while True:
        # Agents constantly update files. We don't want an infinite loop
        # situation where we always have files available that are newer than
        # the desired fileage. Raising the age floor by the longest loop
        # iteration seen so far guarantees forward progress.
        loopstart = time.time()
        fileage = fileage + looptime

        # Automatically stop if we are going on too long. (1 of 3)
        duration = loopstart - start
        if duration > max_duration:
            log_message = ('''\
Stopping ingester after exceeding the maximum runtime duration of {}s. \
This can be adjusted on the CLI.'''.format(max_duration))
            log.log2info(20022, log_message)
            break

        # Automatically stop once every file present at startup has been
        # processed. (2 of 3)
        if files_read >= files_found:
            # No need to log. This is an expected outcome.
            break

        # Read data from cache. Stop if there is no data found.
        cache = Cache(batch_size=batch_size, age=fileage)
        count = cache.ingest()

        # Automatically stop if the batch contained no files. (3 of 3)
        if not cache.files:
            # No need to log. This is an expected outcome.
            break

        # Get the records processed, looptime and files read
        records += count
        files_read += cache.files
        looptime = max(time.time() - loopstart, looptime)

    # Print result
    duration = time.time() - start
    if records and duration:
        log_message = ('''\
Agent cache ingest completed. {0} records processed in {1:.2f} seconds, \
{2:.2f} records / second. {3} files read. \
'''.format(records, duration, records / duration, files_read))
        log.log2info(20084, log_message)
    else:
        log_message = 'No files found to ingest'
        log.log2info(20021, log_message)

    # Delete lockfile only if running as a script.
    # The daemon has its own locking mechanism
    if script:
        success = _lock(delete=True)

    # Log what we are doing
    log_message = 'Finished processing ingest cache.'
    log.log2info(20020, log_message)

    return bool(success)