Example #1
def appLoadMP(pathToLoad, dbfilenameFullPath, maxCores, governorOffFlag):
    global _tasksPerJob

    files_to_process = []
    conn = None

    # Start timer
    t0 = datetime.now()

    logger.debug("Starting appLoadMP")
    # Calculate aggregate file_filter for all supported ingest types:
    file_filter = '|'.join([v.getFileNameFilter() for v in ingest_plugins.values()])
    # Add zip extension
    file_filter += r"|.*\.zip"

    # Check if we're loading Redline data
    if os.path.isdir(pathToLoad) and os.path.basename(pathToLoad).lower() == 'redlineaudits':
        files_to_process = searchRedLineAudits(pathToLoad)
    else:
        # Search for all files to be processed
        if os.path.isdir(pathToLoad):
            files_to_process = searchFolders(pathToLoad, file_filter)
        else:
            files_to_process = processArchives(pathToLoad, file_filter)

    if files_to_process:
        # Init DB if required
        DB = appDB.DBClass(dbfilenameFullPath, True, settings.__version__)
        conn = DB.appConnectDB()

        # Extract hostnames, grab existing host IDs from the DB and calculate instance IDs for the new entries to be ingested:
        instancesToProcess = GetIDForHosts(files_to_process, DB)
        countInstancesToProcess = len(instancesToProcess)
        logger.info("Found %d new instances" % (countInstancesToProcess))

        # Setup producers/consumers initial counts
        num_consumers = 1
        num_producers = 1

        # Setup MPEngine
        mpe = MPEngineProdCons(maxCores, appLoadProd, appLoadCons, governorOffFlag)

        # Reduce _tasksPerJob for small jobs
        if countInstancesToProcess < _tasksPerJob: _tasksPerJob = 1

        # Create task list
        task_list = []
        instancesPerJob = _tasksPerJob
        num_tasks = 0
        for chunk in chunks(instancesToProcess, instancesPerJob):
            # todo: We no longer need pathToLoad as tasks include the fullpath now
            task_list.append(Task(pathToLoad, chunk))
            num_tasks += 1

        if num_tasks > 0:
            # Check if we have to drop indexes to speedup insertions
            # todo: Research the ratio of existing hosts to new hosts where this makes sense
            if countInstancesToProcess > 1000 or DB.CountHosts() < 1000:
                DB.appDropIndexesDB()

            # Queue tasks for Producers
            mpe.addTaskList(task_list)

            # Start procs
            mpe.startProducers(num_producers)
            mpe.startConsumers(num_consumers, [dbfilenameFullPath])
            # mpe.addProducer()

            # Control loop
            while mpe.working():
                time.sleep(1.0)
                (num_producers,num_consumers,num_tasks,progress_producers,progress_consumers) = mpe.getProgress()
                elapsed_time = datetime.now() - t0
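                # Derive progress estimates: each completed consumer task covers
                # _tasksPerJob hosts, so consumer progress yields the mean load
                # time per host (ETH), the estimated time remaining (ETR), the
                # estimated completion time (ETA) and the estimated total time (ETT)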
                mean_loadtime_per_host = elapsed_time / max(1, _tasksPerJob * progress_consumers)
                pending_hosts = (num_tasks * _tasksPerJob) - (_tasksPerJob * progress_consumers)
                etr = mean_loadtime_per_host * pending_hosts
                eta = t0 + elapsed_time + etr
                ett = eta - t0
                if settings.logger_getDebugMode():
                    status_extra_data = " Prod: %s Cons: %s (%d -> %d -> %d: %d) [RAM: %d%% / Obj: %d / ETH: %s / ETA: %s / ETT: %s]" % \
                        (num_producers, num_consumers, num_tasks, progress_producers, progress_consumers,
                         progress_producers - progress_consumers,
                         psutil_phymem_usage(), len(gc.get_objects()),
                         mean_loadtime_per_host if progress_consumers * _tasksPerJob > 100 else "N/A",
                         str(eta.time()).split(".")[0] if progress_consumers * _tasksPerJob > 100 else "N/A",
                         str(ett).split(".")[0] if progress_consumers * _tasksPerJob > 100 else "N/A")
                else:
                    status_extra_data = ""
                # logger.info("Parsing files%s" % status_extra_data)

                logger.info(update_progress(min(1, float(progress_consumers) / float(num_tasks)), "Parsing files%s" % status_extra_data, True))
                mpe.rebalance()

            del mpe

        # Stop timer
        elapsed_time = datetime.now() - t0
        mean_loadtime_per_host = elapsed_time / max(1, countInstancesToProcess)
        logger.info("Load speed: %s per file" % mean_loadtime_per_host)
        logger.info("Load time: %s" % (str(elapsed_time).split(".")[0]))
    else:
        logger.info("Found no files to process!")
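
Note: Example #1 splits the instance list into fixed-size tasks with a chunks() helper defined elsewhere in the module. A minimal sketch of what that helper presumably looks like, inferred from how it is called above (an assumption, not code from the project):

def chunks(lst, n):
    # Yield successive n-sized slices of lst; the last slice may be shorter.
    # Hypothetical reconstruction of the module's chunks() helper.
    for i in range(0, len(lst), n):
        yield lst[i:i + n]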
Example #2
def GetIDForHosts(fileFullPathList, DB):
    # Returns a list of tuples: (filePath, instanceID, hostName, hostID, ingest_plugin)
    hostsTest = {}
    hostsProcess = []
    progress_total = 0
    progress_current = 0

    # Determine the ingest plugin type and hostname for each file
    for file_name_fullpath in fileFullPathList:
        hostName = None
        ingest_type = None
        loop_counter = 0
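        # Resolve the ingest plugin for this file: try the stack top first
        # (filename filter, then magic check); non-matching plugins are rotated
        # to the bottom so the most recently matched plugin is tried first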
        while True:
            if loop_counter >= len(ingest_plugins_types_stack):
                # All plugins have been tried; we silently ignore empty files from hosts with no appcompat data
                # todo: Don't suppress this warning in verbose mode
                tmp_file_size = file_size(file_name_fullpath)
                if tmp_file_size > 500:
                    logger.warning("No ingest plugin could process: %s (skipping file) [size: %d]" %
                                   (ntpath.basename(file_name_fullpath), tmp_file_size))
                break
            ingest_type = ingest_plugins_types_stack[0]
            if ingest_plugins[ingest_type].matchFileNameFilter(file_name_fullpath):
                # Check magic:
                try:
                    magic_check = ingest_plugins[ingest_type].checkMagic(file_name_fullpath)
                    if isinstance(magic_check, tuple):
                        # checkMagic is expected to return a bool here; a tuple indicates a plugin bug
                        logger.error("checkMagic returned a tuple for %s, report this bug" % file_name_fullpath)
                        magic_check_res = False
                    else:
                        magic_check_res = magic_check
                    if magic_check_res:
                        # Magic OK, go with this plugin
                        hostName = ingest_plugins[ingest_type].getHostName(file_name_fullpath)
                        break
                except Exception as e:
                    logger.exception("Error processing: %s (%s)" % (file_name_fullpath, str(e)))
            # Rotate the stack: move the plugin that failed to match to the bottom,
            # so the most recently matched plugin is tried first on the next file
            ingest_plugins_types_stack.remove(ingest_type)
            ingest_plugins_types_stack.append(ingest_type)
            loop_counter += 1
        if hostName is not None:
            hostsTest.setdefault(hostName, []).append((file_name_fullpath, ingest_plugins[ingest_type]))

    progress_total = len(hostsTest)
    # Iterate over hosts. If host exists in DB grab rowID else create and grab rowID.
    conn = DB.appGetConn()
    with closing(conn.cursor()) as c:
        for hostName in hostsTest.keys():
            assert(hostName)
            logger.debug("Processing host: %s" % hostName)
            # Check if Host exists
            c.execute("SELECT count(*) FROM Hosts WHERE HostName = '%s'" % hostName)
            data = c.fetchone()[0]
            if (data != 0):
                # Host already has at least one instance in the DB
                c.execute("SELECT HostID, Instances FROM Hosts WHERE HostName = '%s'" % hostName)
                data = c.fetchone()
                tmpHostID = data[0]
                tmpInstances = eval(data[1])  # Instances is stored as the repr() of a list (ast.literal_eval would be safer)
                for (file_fullpath, ingest_plugin) in hostsTest[hostName]:
                    logger.debug("Grabbing instanceID from file: %s" % file_fullpath)
                    try:
                        instance_ID = CalculateInstanceID(file_fullpath, ingest_plugin)
                    except Exception:
                        logger.error("Error parsing: %s (skipping)" % file_fullpath)
                        traceback.print_exc(file=sys.stdout)
                    else:
                        if str(instance_ID) not in tmpInstances:
                            tmpInstances.append(str(instance_ID))
                            hostsProcess.append((file_fullpath, instance_ID, hostName, tmpHostID, ingest_plugin))
                        else:
                            logger.debug("Duplicate host and instance found: %s" %hostName)
                            continue
                # Save updated Instances list
                c.execute("UPDATE Hosts SET Instances = %s, InstancesCounter = %d WHERE HostName = '%s'" % ('"' + str(repr(tmpInstances)) + '"', len(tmpInstances), hostName))
            else:
                # Host does not exist. Add instance and grab the host ID.
                tmpInstances = []
                newInstances = []
                for (file_fullpath, ingest_plugin) in hostsTest[hostName]:
                    try:
                        instance_ID = CalculateInstanceID(file_fullpath, ingest_plugin)
                    except Exception:
                        logger.error("Error parsing: %s (skipping)" % file_fullpath)
                        traceback.print_exc(file=sys.stdout)
                    else:
                        if str(instance_ID) not in tmpInstances:
                            tmpInstances.append(str(instance_ID))
                            newInstances.append((file_fullpath, instance_ID, ingest_plugin))

                c.execute("INSERT INTO Hosts VALUES (NULL,%s,%s,%d,%d,%d)" % ('"' + hostName + '"', '"' + str(repr(tmpInstances)) + '"', len(tmpInstances), 0, 0))
                tmpHostID = c.lastrowid
                for (file_fullpath, instance_ID, ingest_plugin) in newInstances:
                    # todo: Do we want/need each row to track from what instance it came?
                    hostsProcess.append((file_fullpath, instance_ID, hostName, tmpHostID, ingest_plugin))
            # Update progress
            progress_current += 1
            if settings.logger_getDebugMode():
                status_extra_data = " [RAM: %d%%]" % psutil_phymem_usage()
            else:
                status_extra_data = ""
            # logger.debug("Pre-process new hosts/instances%s" % status_extra_data)
            logger.info(update_progress(min(1, float(progress_current) / float(progress_total)), "Calculate IDs for new hosts/instances%s" % status_extra_data, True))
        conn.commit()

    # Return hosts to be processed
    return hostsProcess
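
The SQL in Example #2 implies a Hosts table with an autoincrementing HostID, a HostName, an Instances column storing the repr() of a list of instance IDs (read back with eval()), an InstancesCounter, and two integer columns that the INSERT only ever seeds with zero. A minimal sqlite3 sketch of a compatible schema, with the last two column names assumed since the snippet never names them:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("""
    CREATE TABLE Hosts (
        HostID INTEGER PRIMARY KEY AUTOINCREMENT,  -- NULL in the INSERT lets sqlite assign it
        HostName TEXT,
        Instances TEXT,             -- repr() of a list of instance ID strings
        InstancesCounter INTEGER,
        ExtraA INTEGER,             -- hypothetical column name; the snippet only inserts 0
        ExtraB INTEGER              -- hypothetical column name; the snippet only inserts 0
    )
""")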