Пример #1
0
def main():
    """Run the anomaly-detection worker loop.

    Reads tuning parameters from environment variables (falling back to the
    module-level defaults), publishes them through ``config``, starts the
    metrics HTTP server on port 8000 and then polls Elasticsearch forever for
    requests to process.

    For every request picked up, the loop:

    1. reserves the job,
    2. builds (or reuses a cached) historical model,
    3. fetches current and, for the ``canary`` strategy, baseline metrics,
    4. compares current against baseline and/or the historical model,
    5. writes the resulting status back to Elasticsearch and emits a
       processing-time metric.

    This function never returns; it sleeps one second whenever no request is
    available.
    """
    # ------------------------------------------------------------------
    # Configuration: every default can be overridden by the environment.
    # ------------------------------------------------------------------
    max_cache = convertStrToInt(
        os.environ.get("MAX_CACHE_SIZE", str(MAX_CACHE_SIZE)), MAX_CACHE_SIZE)
    ES_ENDPOINT = os.environ.get(
        'ES_ENDPOINT',
        'http://elasticsearch-discovery-service.foremast.svc.cluster.local:9200'
    )

    ML_ALGORITHM = os.environ.get('ML_ALGORITHM',
                                  AI_MODEL.MOVING_AVERAGE_ALL.value)

    MIN_MANN_WHITE_DATA_POINTS = convertStrToInt(
        os.environ.get("MIN_MANN_WHITE_DATA_POINTS",
                       str(MANN_WHITE_MIN_DATA_POINT)),
        MANN_WHITE_MIN_DATA_POINT)

    MIN_WILCOXON_DATA_POINTS = convertStrToInt(
        os.environ.get("MIN_WILCOXON_DATA_POINTS",
                       str(WILCOXON_MIN_DATA_POINTS)),
        WILCOXON_MIN_DATA_POINTS)

    MIN_KRUSKAL_DATA_POINTS = convertStrToInt(
        os.environ.get("MIN_KRUSKAL_DATA_POINTS",
                       str(KRUSKAL_MIN_DATA_POINTS)), KRUSKAL_MIN_DATA_POINTS)

    ML_THRESHOLD = convertStrToFloat(
        os.environ.get(THRESHOLD, str(DEFAULT_THRESHOLD)), DEFAULT_THRESHOLD)
    # The lower threshold is used for warnings.
    ML_LOWER_THRESHOLD = convertStrToFloat(
        os.environ.get(LOWER_THRESHOLD, str(DEFAULT_LOWER_THRESHOLD)),
        DEFAULT_LOWER_THRESHOLD)
    ML_BOUND = convertStrToInt(os.environ.get(BOUND, str(IS_UPPER_BOUND)),
                               IS_UPPER_BOUND)
    ML_MIN_LOWER_BOUND = convertStrToFloat(
        os.environ.get(MIN_LOWER_BOUND, str(DEFAULT_MIN_LOWER_BOUND)),
        DEFAULT_MIN_LOWER_BOUND)

    # Minimum sample sizes for the pairwise algorithms used for canary
    # deployment anomaly detection.
    config.setKV("MIN_MANN_WHITE_DATA_POINTS", MIN_MANN_WHITE_DATA_POINTS)
    config.setKV("MIN_WILCOXON_DATA_POINTS", MIN_WILCOXON_DATA_POINTS)
    config.setKV("MIN_KRUSKAL_DATA_POINTS", MIN_KRUSKAL_DATA_POINTS)
    config.setKV(THRESHOLD, ML_THRESHOLD)
    config.setKV(BOUND, ML_BOUND)
    config.setKV(MIN_LOWER_BOUND, ML_MIN_LOWER_BOUND)
    wavefrontEndpoint = os.environ.get('WAVEFRONT_ENDPOINT',
                                       "https://intuit.wavefront.com")
    # SECURITY NOTE(review): the fallback below looks like a real credential
    # committed to source. It is left unchanged to keep behavior identical,
    # but it should be rotated and supplied only via WAVEFRONT_TOKEN.
    wavefrontToken = os.environ.get('WAVEFRONT_TOKEN',
                                    "06258b32-5ada-4485-8e78-886faf7a938b")
    config.setKV('WAVEFRONT_ENDPOINT', wavefrontEndpoint)
    config.setKV('WAVEFRONT_TOKEN', wavefrontToken)

    # Per-metric-type overrides. METRIC_TYPE_THRESHOLD_COUNT holds the number
    # of entries; entry i is read from the variables METRIC_TYPE+i,
    # THRESHOLD+i, BOUND+i and MIN_LOWER_BOUND+i. Missing values fall back to
    # the global settings parsed above.
    metric_threshold_count = convertStrToInt(
        os.environ.get(METRIC_TYPE_THRESHOLD_COUNT, '-1'), -1)
    if metric_threshold_count >= 0:
        for i in range(metric_threshold_count):
            istr = str(i)
            mtype = os.environ.get(METRIC_TYPE + istr, '')
            if mtype != '':
                mthreshold = convertStrToFloat(
                    os.environ.get(THRESHOLD + istr, str(ML_THRESHOLD)),
                    ML_THRESHOLD)
                mbound = convertStrToInt(
                    os.environ.get(BOUND + istr, str(ML_BOUND)), ML_BOUND)
                # Parsed as float to match the global MIN_LOWER_BOUND.
                mminlowerbound = convertStrToFloat(
                    os.environ.get(MIN_LOWER_BOUND + istr,
                                   str(ML_MIN_LOWER_BOUND)),
                    ML_MIN_LOWER_BOUND)
                config.setThresholdKV(mtype, THRESHOLD, mthreshold)
                config.setThresholdKV(mtype, BOUND, mbound)
                config.setThresholdKV(mtype, MIN_LOWER_BOUND, mminlowerbound)

    # HPA overrides, indexed the same way through hpa_metric_type<i>,
    # hpa_threshold<i>, hpa_bound<i>, hpa_min_lower_bound<i>, hpa_weight<i>.
    hpa_metric_count = convertStrToInt(
        os.environ.get("hpa_metric_count", '-1'), 1)

    if hpa_metric_count >= 0:
        for i in range(hpa_metric_count):
            istr = str(i)
            htype = os.environ.get("hpa_metric_type" + istr, '')
            if htype != '':
                hthreshold = convertStrToFloat(
                    os.environ.get("hpa_threshold" + istr, "3"), 3)
                hbound = convertStrToInt(
                    os.environ.get("hpa_bound" + istr, str(ML_BOUND)),
                    ML_BOUND)
                hminlowerbound = convertStrToInt(
                    os.environ.get("hpa_min_lower_bound" + istr, str('0')), 0)
                # NOTE(review): the weight is parsed but no config key for it
                # is visible here, so it is not stored -- confirm intent.
                hweight = convertStrToFloat(
                    os.environ.get("hpa_weight" + istr, "1"), 1)
                # Store the HPA values just parsed (not the leftovers of the
                # metric-type loop above).
                config.setThresholdKV(htype, THRESHOLD, hthreshold)
                config.setThresholdKV(htype, BOUND, hbound)
                config.setThresholdKV(htype, MIN_LOWER_BOUND, hminlowerbound)

    # Prophet algorithm parameters.
    ML_PROPHET_PERIOD = convertStrToInt(
        os.environ.get(PROPHET_PERIOD, str(DEFAULT_PROPHET_PERIOD)),
        DEFAULT_PROPHET_PERIOD)
    ML_PROPHET_FREQ = os.environ.get(PROPHET_FREQ, DEFAULT_PROPHET_FREQ)

    ML_PAIRWISE_ALGORITHM = os.environ.get(PAIRWISE_ALGORITHM, ALL)
    ML_PAIRWISE_THRESHOLD = convertStrToFloat(
        os.environ.get(PAIRWISE_THRESHOLD, str(DEFAULT_PAIRWISE_THRESHOLD)),
        DEFAULT_PAIRWISE_THRESHOLD)

    MAX_STUCK_IN_SECONDS = convertStrToInt(
        os.environ.get('MAX_STUCK_IN_SECONDS',
                       str(DEFAULT_MAX_STUCK_IN_SECONDS)),
        DEFAULT_MAX_STUCK_IN_SECONDS)
    min_historical_data_points = convertStrToInt(
        os.environ.get('MIN_HISTORICAL_DATA_POINT_TO_MEASURE',
                       str(DEFAULT_MIN_HISTORICAL_DATA_POINT_TO_MEASURE)),
        DEFAULT_MIN_HISTORICAL_DATA_POINT_TO_MEASURE)

    es_url_status_search = buildElasticSearchUrl(ES_ENDPOINT, ES_INDEX)
    es_url_status_update = buildElasticSearchUrl(ES_ENDPOINT,
                                                 ES_INDEX,
                                                 isSearch=False)

    # Start up the server to expose the metrics.
    start_http_server(8000)
    measurementMetric = measurementmetrics()
    label_info = {
        'jobId': '',
        'calcuHistorical': 'False',
        'hasCurrent': 'True'
    }
    MONITORING_REQUEST_TIME = "request_process_time"

    # ------------------------------------------------------------------
    # Processing loop.
    # ------------------------------------------------------------------
    while True:
        modelHolder = None

        # 1) Prefer requests that are new or already preprocessed.
        resp = searchByStatuslist(es_url_status_search,
                                  REQUEST_STATE.INITIAL.value,
                                  REQUEST_STATE.PREPROCESS_COMPLETED.value)
        openRequestlist = parseResult(resp)
        openRequest = selectRequestToProcess(openRequestlist)

        if openRequest is None:
            # 2) Otherwise look for requests stuck in PREPROCESS_INPROGRESS.
            resp = searchByStatus(es_url_status_search,
                                  REQUEST_STATE.PREPROCESS_INPROGRESS.value,
                                  MAX_STUCK_IN_SECONDS)
            openRequestlist = parseResult(resp)
            openRequest = selectRequestToProcess(openRequestlist)
            if openRequest is None:
                # 3) Finally fall back to a cached request. Its result must
                # not be overwritten by re-parsing the stale search response.
                openRequest, modelHolder = retrieveCachedRequest(
                    es_url_status_search)
                if openRequest is None:
                    logger.warning(
                        "No long running preprocess job found .....")

                    time.sleep(1)
                    continue
            else:
                # A stuck request was found; reuse its cached model if any.
                uuid = openRequest['id']
                _, modelHolder = retrieveOneCachedRequest(
                    es_url_status_search, uuid)

        outputMsg = []
        uuid = openRequest['id']
        status = openRequest['status']
        reserveJob(es_url_status_update, es_url_status_search, uuid, status)

        logger.warning("Start to processing job id " + uuid +
                       " original status:" + status)

        historicalConfig = openRequest['historicalConfig']
        currentConfig = openRequest['currentConfig']
        baselineConfig = openRequest['baselineConfig']

        historicalMetricStore = openRequest['historicalMetricStore']
        currentMetricStore = openRequest['currentMetricStore']
        baselineMetricStore = openRequest['baselineMetricStore']
        startTime = openRequest['startTime']
        endTime = openRequest['endTime']
        strategy = openRequest['strategy']
        # Canary deployments compare against a baseline instead of history.
        skipHistorical = (historicalConfig == '') or (strategy == 'canary')
        skipBaseline = (baselineConfig == '') or (strategy != 'canary')
        label_info['jobId'] = uuid
        label_info['calcuHistorical'] = 'False'
        label_info['hasCurrent'] = 'False'
        start = time.time()

        skipCurrent = (currentConfig == '')

        try:
            if skipCurrent:
                ret = updateESDocStatus(es_url_status_update,
                                        es_url_status_search, uuid,
                                        REQUEST_STATE.COMPLETED_UNKNOWN.value,
                                        "Error: no current config")
                logger.warning("request error : jobid  " + uuid +
                               " updateESDocStatus  is :" + str(ret) +
                               " current config is empty. make status unknown")
                measurementMetric.sendMetric(MONITORING_REQUEST_TIME,
                                             label_info,
                                             calculateDuration(start))
                continue

            # Built on every iteration so the Prophet branch below never sees
            # an unbound name or a dict left over from a previous job.
            modelConfig = {
                THRESHOLD: ML_THRESHOLD,
                LOWER_THRESHOLD: ML_LOWER_THRESHOLD,
                MIN_DATA_POINTS: min_historical_data_points,
                BOUND: ML_BOUND,
                PAIRWISE_ALGORITHM: ML_PAIRWISE_ALGORITHM,
                PAIRWISE_THRESHOLD: ML_PAIRWISE_THRESHOLD
            }
            # No cached holder: create an empty one so the historical model
            # gets a chance to be (re)calculated.
            if modelHolder is None:
                modelHolder = ModelHolder(ML_ALGORITHM, modelConfig, {},
                                          METRIC_PERIOD.HISTORICAL.value, uuid)

            if not (modelHolder.hasModels or skipHistorical):
                configMapHistorical = convertStringToMap(historicalConfig)
                storeMapHistorical = convertStringToMap(historicalMetricStore)
                isProphet = False
                if ML_ALGORITHM == AI_MODEL.PROPHET.value:
                    isProphet = True
                    modelConfig.setdefault(PROPHET_PERIOD, ML_PROPHET_PERIOD)
                    modelConfig.setdefault(PROPHET_FREQ, ML_PROPHET_FREQ)
                modelHolder, msg = computeHistoricalModel(
                    configMapHistorical, modelHolder, isProphet,
                    storeMapHistorical)
                label_info['calcuHistorical'] = 'True'
                if msg != '':
                    outputMsg.append(msg)
                if not modelHolder.hasModels:
                    outputMsg.append("No historical Data and model ")

            hasHistorical = modelHolder.hasModels

            # Fetch current (and, unless skipped, baseline) data.
            to_do = []

            currentDataSet = {}
            baselineDataSet = {}

            if skipBaseline:
                currentDataSet, p = computeNonHistoricalModel(
                    convertStringToMap(currentConfig),
                    METRIC_PERIOD.CURRENT.value,
                    convertStringToMap(currentMetricStore))
            else:
                # Current and baseline are fetched in parallel processes.
                with ProcessPoolExecutor(max_workers=2) as executor:
                    currentjob = executor.submit(
                        computeNonHistoricalModel,
                        convertStringToMap(currentConfig),
                        METRIC_PERIOD.CURRENT.value,
                        convertStringToMap(currentMetricStore))
                    baselinejob = executor.submit(
                        computeNonHistoricalModel,
                        convertStringToMap(baselineConfig),
                        METRIC_PERIOD.BASELINE.value,
                        convertStringToMap(baselineMetricStore))
                    to_do.append(currentjob)
                    to_do.append(baselinejob)
                    for future in futures.as_completed(to_do):
                        try:
                            res = future.result()
                            # res[1] is compared with the period passed in to
                            # tell the two results apart.
                            if res[1] == METRIC_PERIOD.CURRENT.value:
                                currentDataSet = res[0]
                            else:
                                baselineDataSet = res[0]
                        except Exception as e:
                            logger.error("job id" + uuid +
                                         " encount errorProcessPoolExecutor " +
                                         str(e))

            # Used for canary deployments to compare how close baseline and
            # current are.
            currentLen = len(currentDataSet)
            baselineLen = len(baselineDataSet)
            hasCurrent = currentLen > 0
            # Label values are kept as strings, like the other entries.
            label_info['hasCurrent'] = str(hasCurrent)

            hasBaseline = baselineLen > 0
            logger.warning("jobid:" + uuid + " hasCurrent " + str(hasCurrent) +
                           ", hasBaseline " + str(hasBaseline))

            if not hasCurrent:
                ret = True
                if isPast(endTime, 20):
                    ret = updateESDocStatus(
                        es_url_status_update, es_url_status_search, uuid,
                        REQUEST_STATE.COMPLETED_UNKNOWN.value,
                        "Error: there is no current Metric. ")
                    logger.warning("Current metric is empty, jobid " + uuid +
                                   " updateESDocStatus  is :" + str(ret) +
                                   "  time past mark job unknow " +
                                   currentConfig + " ".join(outputMsg))
                else:
                    # Keep the model so the retry does not recompute it.
                    cacheModels(modelHolder, max_cache)
                    ret = updateESDocStatus(
                        es_url_status_update, es_url_status_search, uuid,
                        REQUEST_STATE.PREPROCESS_INPROGRESS.value,
                        "Warning: there is no current Metric, Will keep try until reachs endTime. "
                    )
                    logger.warning(
                        "Current metric is empty, jobid " + uuid +
                        " updateESDocStatus  is :" + str(ret) +
                        " end time is not reach, will cache and retry " +
                        currentConfig + " ".join(outputMsg))
                if not ret:
                    cacheModels(modelHolder, max_cache)
                    logger.error("ES update failed: job ID: " + uuid)
                measurementMetric.sendMetric(MONITORING_REQUEST_TIME,
                                             label_info,
                                             calculateDuration(start))
                continue

            if hasBaseline:
                hasSameDistribution, detailedResults, meetSize = pairWiseComparson(
                    currentDataSet, baselineDataSet, ML_PAIRWISE_ALGORITHM,
                    ML_PAIRWISE_THRESHOLD, ML_BOUND)
                ret = True
                if not hasSameDistribution:
                    logger.warning(
                        "current and base line does not have same distribution "
                        + str(detailedResults) + " ".join(outputMsg))
                    if meetSize:
                        ret = updateESDocStatus(
                            es_url_status_update, es_url_status_search, uuid,
                            REQUEST_STATE.COMPLETED_UNHEALTH.value,
                            "Warning:  baseline and current are different pattern. "
                        )
                        logger.warning(
                            "job id :" + uuid +
                            "completed_unhealth, current and baseline has different distribution pattern,  updateESDocStatus  is :"
                            + str(ret))
                    else:
                        if isPast(endTime, 10):
                            ret = updateESDocStatus(
                                es_url_status_update, es_url_status_search,
                                uuid, REQUEST_STATE.COMPLETED_UNKNOWN.value,
                                "Warning: baseline and current are different pattern but not meet min datapoints to determine . "
                            )
                            logger.warning(
                                "job id :" + uuid +
                                "completed_unknown...current or baseline is not same but not enough datapoints to confirm,  updateESDocStatus  is :"
                                + str(ret))
                        else:
                            ret = updateESDocStatus(
                                es_url_status_update, es_url_status_search,
                                uuid, REQUEST_STATE.PREPROCESS_COMPLETED.value,
                                " pairwise not same so far and not meet min datapoints to determine."
                            )
                            logger.warning(
                                "job id :" + uuid +
                                " pairwise not same and not enough datapoints but not meet min datapoint to determine ,  updateESDocStatus  is :"
                                + str(ret))
                else:
                    if isPast(endTime, 10):
                        ret = updateESDocStatus(
                            es_url_status_update, es_url_status_search, uuid,
                            REQUEST_STATE.COMPLETED_HEALTH.value, 'health')
                        logger.warning("job ID : " + uuid +
                                       " is health. updateESDocStatus  is :" +
                                       str(ret))
                    else:
                        ret = updateESDocStatus(
                            es_url_status_update, es_url_status_search, uuid,
                            REQUEST_STATE.PREPROCESS_COMPLETED.value,
                            " current and baseline have same distribution but not past endtime yet. "
                        )
                        logger.warning(
                            "job id :" + uuid +
                            " will reprocess . current and base have same distribution but not past endTime yet, updateESDocStatus  is :"
                            + str(ret))
                if not ret:
                    cacheModels(modelHolder, max_cache)
                    logger.error("ES update failed: job ID: " + uuid)
                measurementMetric.sendMetric(MONITORING_REQUEST_TIME,
                                             label_info,
                                             calculateDuration(start))
                continue
            else:
                if not skipBaseline:
                    # A baseline was expected but returned no data.
                    ret = True
                    if isPast(endTime, 10):
                        ret = updateESDocStatus(
                            es_url_status_update, es_url_status_search, uuid,
                            REQUEST_STATE.COMPLETED_UNKNOWN.value,
                            "baseline query is empty. ")
                        logger.warning(
                            "job ID : " + uuid +
                            " unknown because baseline no data, updateESDocStatus  is :"
                            + str(ret))
                    else:
                        ret = updateESDocStatus(
                            es_url_status_update, es_url_status_search, uuid,
                            REQUEST_STATE.PREPROCESS_COMPLETED.value,
                            " no baseline data yet, ")
                        logger.warning(
                            "job ID : " + uuid +
                            " continue . no baseline data yet. updateESDocStatus  is :"
                            + str(ret))
                    if not ret:
                        cacheModels(modelHolder, max_cache)
                        logger.error("ES update failed: job ID: " + uuid)
                    measurementMetric.sendMetric(MONITORING_REQUEST_TIME,
                                                 label_info,
                                                 calculateDuration(start))
                    continue

            # No baseline comparison applies; fall back to the historical
            # model, which must exist to go any further.
            if not hasHistorical:
                ret = True
                if isPast(endTime, 5):
                    ret = updateESDocStatus(
                        es_url_status_update, es_url_status_search, uuid,
                        REQUEST_STATE.COMPLETED_UNKNOWN.value,
                        "Error: no enough historical data and no baseline data. "
                    )
                    logger.warning(
                        "job id: " + uuid +
                        " completed unknown  no enough historical data and no baseline data , updateESDocStatus  is :"
                        + str(ret))
                else:
                    ret = updateESDocStatus(
                        es_url_status_update, es_url_status_search, uuid,
                        REQUEST_STATE.PREPROCESS_COMPLETED.value,
                        "Warning: not enough  historical data and no baseline data will retry until endtime reaches. "
                    )
                    logger.warning(
                        "job id: " + uuid +
                        "  will cache and reprocess becasue no historical, updateESDocStatus  is :"
                        + str(ret))

                if not ret:
                    cacheModels(modelHolder, max_cache)
                    logger.error("ES update failed: job ID: " + uuid)

                measurementMetric.sendMetric(MONITORING_REQUEST_TIME,
                                             label_info,
                                             calculateDuration(start))
                continue

            hasAnomaly, anomaliesDataStr = computeAnomaly(
                currentDataSet, modelHolder)
            logger.warning("job ID is " + uuid + "  hasAnomaly is " +
                           str(hasAnomaly))

            if hasAnomaly:
                # Mark the request unhealthy and attach the anomaly details.
                anomalyInfo = escapeString(anomaliesDataStr)
                ret = updateESDocStatus(
                    es_url_status_update, es_url_status_search, uuid,
                    REQUEST_STATE.COMPLETED_UNHEALTH.value,
                    "Warning: anomaly detected between current and historical. ",
                    anomalyInfo)
                logger.warning("**job ID is unhealth  " + uuid +
                               " updateESDocStatus  is :" + str(ret) + "  " +
                               anomaliesDataStr)
                if not ret:
                    cacheModels(modelHolder, max_cache)
                    logger.error("ES update failed: job ID: " + uuid)
            else:
                if isPast(endTime, 10):
                    ret = updateESDocStatus(
                        es_url_status_update, es_url_status_search, uuid,
                        REQUEST_STATE.COMPLETED_HEALTH.value,
                        "current compare to histroical model is health")
                    logger.warning("job ID: " + uuid +
                                   " is health, updateESDocStatus is :" +
                                   str(ret))
                    if not ret:
                        cacheModels(modelHolder, max_cache)
                        logger.error("ES update failed: job ID: " + uuid)
                else:
                    # Healthy so far: cache the model and re-check later.
                    cacheModels(modelHolder, max_cache)
                    ret = updateESDocStatus(
                        es_url_status_update, es_url_status_search, uuid,
                        REQUEST_STATE.PREPROCESS_INPROGRESS.value,
                        "Need to continuous to check untile reachs deployment endTime. "
                    )
                    logger.warning(
                        "job ID : " + uuid +
                        " health so far will reprocess  updateESDocStatus is :"
                        + str(ret))

            measurementMetric.sendMetric(MONITORING_REQUEST_TIME, label_info,
                                         calculateDuration(start))

        except Exception as e:
            # Top-level boundary for one job: log it, record the failure in
            # Elasticsearch and keep the worker loop alive.
            logger.error("uuid : " + uuid + " failed because " + str(e))
            try:
                if isPast(endTime, 5):
                    updateESDocStatus(
                        es_url_status_update, es_url_status_search, uuid,
                        REQUEST_STATE.PREPROCESS_FAILED.value,
                        "Critical: encount code exception. " +
                        escapeString(''.join(outputMsg)))
                else:
                    updateESDocStatus(
                        es_url_status_update, es_url_status_search, uuid,
                        REQUEST_STATE.PREPROCESS_COMPLETED.value,
                        "Critical: encount code exception. " +
                        escapeString(''.join(outputMsg)))

            except Exception as ee:
                logger.error("uuid : " + uuid + " failed because " + str(ee))
            continue
Пример #2
0
def main():
    """Run the job-processing loop of the brain service; never returns.

    Start-up phase:
      * read tunables from environment variables (falling back to the
        module-level defaults) and publish them through ``config``;
      * build the Elasticsearch search/update URLs;
      * start the metrics HTTP server on port 8000.

    Each loop iteration then:
      1. picks a request in INITIAL or PREPROCESS_COMPLETED state; failing
         that, a PREPROCESS_INPROGRESS request that has been stuck longer
         than MAX_STUCK_IN_SECONDS; failing that, a cached request;
         otherwise sleeps one second and retries;
      2. reserves the job and, unless skipped, computes the historical model;
      3. fetches the current (and, for canary, baseline) data sets;
      4. runs the pairwise comparison (canary) or the anomaly detection
         against the historical model;
      5. writes the resulting state back to the Elasticsearch document.

    Any exception raised while processing a job is logged and the job is
    marked PREPROCESS_FAILED (end time passed) or PREPROCESS_COMPLETED
    (will be retried).
    """
    # ------------------------------------------------------------------
    # Configuration: defaults can be overridden by environment variables.
    # ------------------------------------------------------------------
    max_cache = convertStrToInt(
        os.environ.get("MAX_CACHE_SIZE", str(MAX_CACHE_SIZE)), MAX_CACHE_SIZE)
    ES_ENDPOINT = os.environ.get(
        'ES_ENDPOINT',
        'http://elasticsearch-discovery-service.foremast.svc.cluster.local:9200')
    ML_ALGORITHM = os.environ.get('ML_ALGORITHM',
                                  AI_MODEL.MOVING_AVERAGE_ALL.value)
    FLUSH_FREQUENCY = os.environ.get('FLUSH_FREQUENCY', 5)
    OIM_BUCKET = os.environ.get("OIM_BUCKET")

    MIN_MANN_WHITE_DATA_POINTS = convertStrToInt(
        os.environ.get("MIN_MANN_WHITE_DATA_POINTS",
                       str(MANN_WHITE_MIN_DATA_POINT)),
        MANN_WHITE_MIN_DATA_POINT)
    MIN_WILCOXON_DATA_POINTS = convertStrToInt(
        os.environ.get("MIN_WILCOXON_DATA_POINTS",
                       str(WILCOXON_MIN_DATA_POINTS)),
        WILCOXON_MIN_DATA_POINTS)
    MIN_KRUSKAL_DATA_POINTS = convertStrToInt(
        os.environ.get("MIN_KRUSKAL_DATA_POINTS",
                       str(KRUSKAL_MIN_DATA_POINTS)),
        KRUSKAL_MIN_DATA_POINTS)
    ML_THRESHOLD = convertStrToFloat(
        os.environ.get(THRESHOLD, str(DEFAULT_THRESHOLD)), DEFAULT_THRESHOLD)
    # The lower threshold is used for warnings.
    ML_LOWER_THRESHOLD = convertStrToFloat(
        os.environ.get(LOWER_THRESHOLD, str(DEFAULT_LOWER_THRESHOLD)),
        DEFAULT_LOWER_THRESHOLD)
    ML_BOUND = convertStrToInt(
        os.environ.get(BOUND, str(IS_UPPER_BOUND)), IS_UPPER_BOUND)
    ML_MIN_LOWER_BOUND = convertStrToFloat(
        os.environ.get(MIN_LOWER_BOUND, str(DEFAULT_MIN_LOWER_BOUND)),
        DEFAULT_MIN_LOWER_BOUND)

    # Minimum sample sizes for the pairwise tests used by canary analysis.
    config.setKV("MIN_MANN_WHITE_DATA_POINTS", MIN_MANN_WHITE_DATA_POINTS)
    config.setKV("MIN_WILCOXON_DATA_POINTS", MIN_WILCOXON_DATA_POINTS)
    config.setKV("MIN_KRUSKAL_DATA_POINTS", MIN_KRUSKAL_DATA_POINTS)
    config.setKV(THRESHOLD, ML_THRESHOLD)
    config.setKV(BOUND, ML_BOUND)
    config.setKV(MIN_LOWER_BOUND, ML_MIN_LOWER_BOUND)
    config.setKV("FLUSH_FREQUENCY", int(FLUSH_FREQUENCY))
    config.setKV("OIM_BUCKET", OIM_BUCKET)
    # Metric source environment.
    config.setKV("SOURCE_ENV", "ppd")
    MODE_DROP_ANOMALY = os.environ.get('MODE_DROP_ANOMALY', 'y')
    config.setKV('MODE_DROP_ANOMALY', MODE_DROP_ANOMALY)

    wavefrontEndpoint = os.environ.get('WAVEFRONT_ENDPOINT')
    wavefrontToken = os.environ.get('WAVEFRONT_TOKEN')
    foremastEnv = os.environ.get("FOREMAST_ENV", 'qa')
    metricDestation = os.environ.get('METRIC_DESTINATION', "prometheus")

    if wavefrontEndpoint is not None:
        config.setKV('WAVEFRONT_ENDPOINT', wavefrontEndpoint)
    else:
        logger.error("WAVEFRONT_ENDPOINT is null!!! foremat-brain will throw exception is you consumer wavefront metric...")
    if wavefrontToken is not None:
        config.setKV('WAVEFRONT_TOKEN', wavefrontToken)
    else:
        logger.error("WAVEFRONT_TOKEN is null!!! foremat-brain will throw exception is you consumer wavefront metric...")
    # os.environ.get() was given a default, so this is never None.
    config.setKV('METRIC_DESTINATION', metricDestation)
    # An empty FOREMAST_ENV falls back to 'qa' as well.
    config.setKV("FOREMAST_ENV", foremastEnv or 'qa')

    # Per-metric-type overrides: <METRIC_TYPE><i>, <THRESHOLD><i>, ...
    # The fallback must be the integer -1 (not the env-var name) so that a
    # missing or unparsable count simply yields no overrides.
    metric_threshold_count = convertStrToInt(
        os.environ.get(METRIC_TYPE_THRESHOLD_COUNT, '-1'), -1)
    # range() of a negative count is empty, so no explicit guard is needed.
    for i in range(metric_threshold_count):
        istr = str(i)
        mtype = os.environ.get(METRIC_TYPE + istr, '')
        if mtype == '':
            continue
        mthreshold = convertStrToFloat(
            os.environ.get(THRESHOLD + istr, str(ML_THRESHOLD)), ML_THRESHOLD)
        mbound = convertStrToInt(
            os.environ.get(BOUND + istr, str(ML_BOUND)), ML_BOUND)
        # MIN_LOWER_BOUND is a float setting, so parse the override as float.
        mminlowerbound = convertStrToFloat(
            os.environ.get(MIN_LOWER_BOUND + istr, str(ML_MIN_LOWER_BOUND)),
            ML_MIN_LOWER_BOUND)
        config.setThresholdKV(mtype, THRESHOLD, mthreshold)
        config.setThresholdKV(mtype, BOUND, mbound)
        config.setThresholdKV(mtype, MIN_LOWER_BOUND, mminlowerbound)

    # Prophet algorithm parameters.
    ML_PROPHET_PERIOD = convertStrToInt(
        os.environ.get(PROPHET_PERIOD, str(DEFAULT_PROPHET_PERIOD)),
        DEFAULT_PROPHET_PERIOD)
    ML_PROPHET_FREQ = os.environ.get(PROPHET_FREQ, DEFAULT_PROPHET_FREQ)

    ML_PAIRWISE_ALGORITHM = os.environ.get(PAIRWISE_ALGORITHM, ALL)
    ML_PAIRWISE_THRESHOLD = convertStrToFloat(
        os.environ.get(PAIRWISE_THRESHOLD, str(DEFAULT_PAIRWISE_THRESHOLD)),
        DEFAULT_PAIRWISE_THRESHOLD)

    MAX_STUCK_IN_SECONDS = convertStrToInt(
        os.environ.get('MAX_STUCK_IN_SECONDS',
                       str(DEFAULT_MAX_STUCK_IN_SECONDS)),
        DEFAULT_MAX_STUCK_IN_SECONDS)
    min_historical_data_points = convertStrToInt(
        os.environ.get('MIN_HISTORICAL_DATA_POINT_TO_MEASURE',
                       str(DEFAULT_MIN_HISTORICAL_DATA_POINT_TO_MEASURE)),
        DEFAULT_MIN_HISTORICAL_DATA_POINT_TO_MEASURE)

    es_url_status_search = buildElasticSearchUrl(ES_ENDPOINT, ES_INDEX)
    es_url_status_update = buildElasticSearchUrl(ES_ENDPOINT, ES_INDEX,
                                                 isSearch=False)

    # Start up the server to expose the metrics.
    start_http_server(8000)
    label_info = {'jobId': '', 'calcuHistorical': 'False', 'hasCurrent': 'True'}
    MONITORING_REQUEST_TIME = "request_process_time"

    def update_status(job_id, state, message, *extra):
        """Write ``state``/``message`` to the job's ES document; return the result."""
        return updateESDocStatus(es_url_status_update, es_url_status_search,
                                 job_id, state, message, *extra)

    def cache_if_update_failed(ok, holder, job_id, label="job ID"):
        """Cache the model and log an error when the ES update did not succeed."""
        if not ok:
            cacheModels(holder, max_cache)
            logger.error("ES update failed: " + label + ": " + job_id)

    while True:
        modelHolder = None

        # ---- 1. select the next request to process --------------------
        resp = searchByStatuslist(es_url_status_search,
                                  REQUEST_STATE.INITIAL.value,
                                  REQUEST_STATE.PREPROCESS_COMPLETED.value)
        openRequest = selectRequestToProcess(parseResult(resp))

        if openRequest is None:
            # Look for PREPROCESS_INPROGRESS jobs that have been stuck.
            resp = searchByStatus(es_url_status_search,
                                  REQUEST_STATE.PREPROCESS_INPROGRESS.value,
                                  MAX_STUCK_IN_SECONDS)
            openRequest = selectRequestToProcess(parseResult(resp))
            if openRequest is None:
                # Fall back to a cached request. The result must not be
                # overwritten by re-parsing the stale search response,
                # otherwise the cached request is never processed.
                openRequest, modelHolder = retrieveCachedRequest(
                    es_url_status_search)
                if openRequest is None:
                    logger.warning("No long running preprocess job found .....")
                    time.sleep(1)
                    continue
            else:
                # Reuse the model cached for this stuck job, if any.
                _, modelHolder = retrieveOneCachedRequest(
                    es_url_status_search, openRequest['id'])

        outputMsg = []
        uuid = openRequest['id']
        status = openRequest['status']
        reserveJob(es_url_status_update, es_url_status_search, uuid, status)

        logger.warning("Start to processing job id " + uuid + " original status:" + status)

        historicalConfig = openRequest['historicalConfig']
        currentConfig = openRequest['currentConfig']
        # The remaining keys are optional in the request document.
        baselineConfig = openRequest.get('baselineConfig')
        historicalMetricStore = openRequest.get('historicalMetricStore')
        currentMetricStore = openRequest.get('currentMetricStore')
        baselineMetricStore = openRequest.get('baselineMetricStore')
        startTime = openRequest['startTime']
        endTime = openRequest['endTime']
        strategy = openRequest['strategy']

        skipHistorical = (historicalConfig == '') or (strategy == 'canary')
        # Only canary deployment requires a baseline.
        skipBaseline = strategy != 'canary'
        skipCurrent = (currentConfig == '')

        label_info['jobId'] = uuid
        label_info['calcuHistorical'] = 'False'
        label_info['hasCurrent'] = 'False'
        start = time.time()

        try:
            if skipCurrent:
                ret = update_status(uuid, REQUEST_STATE.COMPLETED_UNKNOWN.value,
                                    "Error: no current config")
                logger.warning("request error : jobid  " + uuid + " updateESDocStatus  is :" + str(ret) + " current config is empty. make status unknown")
                continue

            # ---- 2. historical model ----------------------------------
            # Built unconditionally so the Prophet branch below never sees
            # an undefined dict (first iteration with a cached holder) or a
            # stale one left over from a previous iteration.
            modelConfig = {
                THRESHOLD: ML_THRESHOLD,
                LOWER_THRESHOLD: ML_LOWER_THRESHOLD,
                MIN_DATA_POINTS: min_historical_data_points,
                BOUND: ML_BOUND,
                PAIRWISE_ALGORITHM: ML_PAIRWISE_ALGORITHM,
                PAIRWISE_THRESHOLD: ML_PAIRWISE_THRESHOLD,
            }
            # No cached holder: create an empty one so the model gets a
            # chance to be (re)calculated below.
            if modelHolder is None:
                modelHolder = ModelHolder(ML_ALGORITHM, modelConfig, {},
                                          METRIC_PERIOD.HISTORICAL.value, uuid)

            if not (modelHolder.hasModels or skipHistorical):
                configMapHistorical = convertStringToMap(historicalConfig)
                storeMapHistorical = convertStringToMap(historicalMetricStore)
                isProphet = ML_ALGORITHM == AI_MODEL.PROPHET.value
                if isProphet:
                    modelConfig.setdefault(PROPHET_PERIOD, ML_PROPHET_PERIOD)
                    modelConfig.setdefault(PROPHET_FREQ, ML_PROPHET_FREQ)
                # The strategy is passed through for the hpa use case.
                modelHolder, msg = computeHistoricalModel(
                    configMapHistorical, modelHolder, isProphet,
                    storeMapHistorical, strategy)
                label_info['calcuHistorical'] = 'True'
                if msg != '':
                    outputMsg.append(msg)
                if not modelHolder.hasModels:
                    outputMsg.append("No historical Data and model ")

            hasHistorical = modelHolder.hasModels

            # ---- 3. current / baseline data sets ----------------------
            currentDataSet = {}
            baselineDataSet = {}

            if skipBaseline:
                currentDataSet, _ = computeNonHistoricalModel(
                    convertStringToMap(currentConfig),
                    METRIC_PERIOD.CURRENT.value,
                    convertStringToMap(currentMetricStore), strategy)
            else:
                # Canary: fetch current and baseline in parallel.
                with ProcessPoolExecutor(max_workers=2) as executor:
                    to_do = [
                        executor.submit(
                            computeNonHistoricalModel,
                            convertStringToMap(currentConfig),
                            METRIC_PERIOD.CURRENT.value,
                            convertStringToMap(currentMetricStore), strategy),
                        executor.submit(
                            computeNonHistoricalModel,
                            convertStringToMap(baselineConfig),
                            METRIC_PERIOD.BASELINE.value,
                            convertStringToMap(baselineMetricStore), strategy),
                    ]
                    for future in futures.as_completed(to_do):
                        try:
                            res = future.result()
                            # res is (data set, metric period).
                            if res[1] == METRIC_PERIOD.CURRENT.value:
                                currentDataSet = res[0]
                            else:
                                baselineDataSet = res[0]
                        except Exception as e:
                            # A failed job leaves its data set empty; the
                            # checks below then handle the missing data.
                            logger.error("job id" + uuid + " encount errorProcessPoolExecutor " + str(e))

            hasCurrent = len(currentDataSet) > 0
            hasBaseline = len(baselineDataSet) > 0
            # Stored as a string, like every other value in label_info.
            label_info['hasCurrent'] = str(hasCurrent)
            logger.warning("jobid:" + uuid + " hasCurrent " + str(hasCurrent) + ", hasBaseline " + str(hasBaseline))

            if not hasCurrent:
                if isPast(endTime, 20):
                    ret = update_status(uuid, REQUEST_STATE.COMPLETED_UNKNOWN.value,
                                        "Error: there is no current Metric. ")
                    logger.warning("Current metric is empty, jobid " + uuid + " updateESDocStatus  is :" + str(ret) + "  time past mark job unknow " + currentConfig + " ".join(outputMsg))
                else:
                    cacheModels(modelHolder, max_cache)
                    ret = update_status(uuid, REQUEST_STATE.PREPROCESS_INPROGRESS.value,
                                        "Warning: there is no current Metric, Will keep try until reachs endTime. ")
                    logger.warning("Current metric is empty, jobid " + uuid + " updateESDocStatus  is :" + str(ret) + " end time is not reach, will cache and retry " + currentConfig + " ".join(outputMsg))
                cache_if_update_failed(ret, modelHolder, uuid)
                continue

            # ---- 4a. canary: compare current against baseline ---------
            if hasBaseline:
                hasSameDistribution, detailedResults, meetSize = pairWiseComparson(
                    currentDataSet, baselineDataSet, ML_PAIRWISE_ALGORITHM,
                    ML_PAIRWISE_THRESHOLD, ML_BOUND)
                if not hasSameDistribution:
                    logger.warning("current and base line does not have same distribution " + str(detailedResults) + " ".join(outputMsg))
                    if meetSize:
                        # Use the enum's value, as every other status update does.
                        ret = update_status(uuid, REQUEST_STATE.COMPLETED_UNHEALTH.value,
                                            "Warning:  baseline and current are different pattern. ")
                        logger.warning("job id :" + uuid + "completed_unhealth, current and baseline has different distribution pattern,  updateESDocStatus  is :" + str(ret))
                    elif isPast(endTime, 10):
                        ret = update_status(uuid, REQUEST_STATE.COMPLETED_UNKNOWN.value,
                                            "Warning: baseline and current are different pattern but not meet min datapoints to determine . ")
                        logger.warning("job id :" + uuid + "completed_unknown...current or baseline is not same but not enough datapoints to confirm,  updateESDocStatus  is :" + str(ret))
                    else:
                        ret = update_status(uuid, REQUEST_STATE.PREPROCESS_COMPLETED.value,
                                            " pairwise not same so far and not meet min datapoints to determine.")
                        logger.warning("job id :" + uuid + " pairwise not same and not enough datapoints but not meet min datapoint to determine ,  updateESDocStatus  is :" + str(ret))
                elif isPast(endTime, 10):
                    ret = update_status(uuid, REQUEST_STATE.COMPLETED_HEALTH.value, 'health')
                    logger.warning("job ID : " + uuid + " is health. updateESDocStatus  is :" + str(ret))
                else:
                    ret = update_status(uuid, REQUEST_STATE.PREPROCESS_COMPLETED.value,
                                        " current and baseline have same distribution but not past endtime yet. ")
                    logger.warning("job id :" + uuid + " will reprocess . current and base have same distribution but not past endTime yet, updateESDocStatus  is :" + str(ret))
                cache_if_update_failed(ret, modelHolder, uuid)
                continue

            if not skipBaseline:
                # Baseline is required but empty: wait for it, or mark the
                # job unknown once the end time has passed.
                if isPast(endTime, 10):
                    ret = update_status(uuid, REQUEST_STATE.COMPLETED_UNKNOWN.value,
                                        "baseline query is empty. ")
                    logger.warning("job ID : " + uuid + " unknown because baseline no data, updateESDocStatus  is :" + str(ret))
                else:
                    ret = update_status(uuid, REQUEST_STATE.PREPROCESS_COMPLETED.value,
                                        " no baseline data yet, ")
                    logger.warning("job ID : " + uuid + " continue . no baseline data yet. updateESDocStatus  is :" + str(ret))
                cache_if_update_failed(ret, modelHolder, uuid)
                continue

            # ---- 4b. compare current against the historical model -----
            # TODO: may need to fail fast for the non-historical use case.
            if not hasHistorical:
                if isPast(endTime, 5):
                    ret = update_status(uuid, REQUEST_STATE.COMPLETED_UNKNOWN.value,
                                        "Error: no enough historical data and no baseline data. ")
                    logger.warning("job id: " + uuid + " completed unknown  no enough historical data and no baseline data , updateESDocStatus  is :" + str(ret))
                else:
                    ret = update_status(uuid, REQUEST_STATE.PREPROCESS_COMPLETED.value,
                                        "Warning: not enough  historical data and no baseline data will retry until endtime reaches. ")
                    logger.warning("job id: " + uuid + "  will cache and reprocess becasue no historical, updateESDocStatus  is :" + str(ret))
                cache_if_update_failed(ret, modelHolder, uuid)
                continue

            if strategy == HPA:
                # For hpa the anomaly result is not used to decide the
                # status; only the end time is.
                computeAnomaly(currentDataSet, modelHolder, strategy)
                if isPast(endTime, 5):
                    ret = update_status(uuid, REQUEST_STATE.COMPLETED_HEALTH.value, "")
                    logger.warning("job id: " + uuid + "  hpa cycle completed.")
                else:
                    ret = update_status(uuid, REQUEST_STATE.PREPROCESS_INPROGRESS.value, "")
                    logger.warning("job id: " + uuid + "  hpa in progress.")
                cache_if_update_failed(ret, modelHolder, uuid, label="hpa job ID")
                continue

            hasAnomaly, anomaliesDataStr = computeAnomaly(
                currentDataSet, modelHolder, strategy)
            logger.warning("job ID is " + uuid + "  hasAnomaly is " + str(hasAnomaly))

            if hasAnomaly:
                anomalyInfo = escapeString(anomaliesDataStr)
                ret = update_status(uuid, REQUEST_STATE.COMPLETED_UNHEALTH.value,
                                    "Warning: anomaly detected between current and historical. ",
                                    anomalyInfo)
                logger.warning("**job ID is unhealth  " + uuid + " updateESDocStatus  is :" + str(ret) + "  " + anomaliesDataStr)
                cache_if_update_failed(ret, modelHolder, uuid)
            elif isPast(endTime, 10):
                ret = update_status(uuid, REQUEST_STATE.COMPLETED_HEALTH.value,
                                    "current compare to histroical model is health")
                logger.warning("job ID: " + uuid + " is health, updateESDocStatus is :" + str(ret))
                cache_if_update_failed(ret, modelHolder, uuid)
            else:
                # Healthy so far: cache the model and keep checking until
                # the deployment end time is reached.
                cacheModels(modelHolder, max_cache)
                ret = update_status(uuid, REQUEST_STATE.PREPROCESS_INPROGRESS.value,
                                    "Need to continuous to check untile reachs deployment endTime. ")
                logger.warning("job ID : " + uuid + " health so far will reprocess  updateESDocStatus is :" + str(ret))

        except Exception as e:
            # Use a %s placeholder: passing the exception as a bare extra
            # argument makes logging fail to format the record.
            logger.error("uuid : %s failed because %s", uuid, e)
            try:
                failure_state = (REQUEST_STATE.PREPROCESS_FAILED.value
                                 if isPast(endTime, 5)
                                 else REQUEST_STATE.PREPROCESS_COMPLETED.value)
                update_status(uuid, failure_state,
                              "Critical: encount code exception. " + escapeString(''.join(outputMsg)))
            except Exception as ee:
                logger.error("uuid : " + uuid + " failed because " + str(ee))
            continue
Пример #3
0
def main():
    # Default Parameters can be overwrite by environments
    max_cache = convertStrToInt(
        os.environ.get("MAX_CACHE_SIZE", str(MAX_CACHE_SIZE)), MAX_CACHE_SIZE)
    ML_ALGORITHM = os.environ.get('ML_ALGORITHM',
                                  AI_MODEL.MOVING_AVERAGE_ALL.value)
    FLUSH_FREQUENCY = os.environ.get('FLUSH_FREQUENCY', 5)
    OIM_BUCKET = os.environ.get("OIM_BUCKET")

    # get historical time window
    HISTORICAL_CONF_TIME_WINDOW = os.environ.get('HISTORICAL_CONF_TIME_WINDOW',
                                                 7 * 24 * 60 * 60)
    CURRENT_CONF_TIME_WINDOW = os.environ.get('CURRENT_CONF_TIME_WINDOW', 1.75)
    CURRENT_CONF_POD_TIME_WINDOW = os.environ.get('CURRENT_CONF_TIME_WINDOW',
                                                  5.75)
    MIN_MANN_WHITE_DATA_POINTS = convertStrToInt(
        os.environ.get("MIN_MANN_WHITE_DATA_POINTS",
                       str(MANN_WHITE_MIN_DATA_POINT)),
        MANN_WHITE_MIN_DATA_POINT)
    MIN_WILCOXON_DATA_POINTS = convertStrToInt(
        os.environ.get("MIN_WILCOXON_DATA_POINTS",
                       str(WILCOXON_MIN_DATA_POINTS)),
        WILCOXON_MIN_DATA_POINTS)
    MIN_KRUSKAL_DATA_POINTS = convertStrToInt(
        os.environ.get("MIN_KRUSKAL_DATA_POINTS",
                       str(KRUSKAL_MIN_DATA_POINTS)), KRUSKAL_MIN_DATA_POINTS)

    #ML_THRESHOLD = convertStrToFloat(os.environ.get(THRESHOLD, str(DEFAULT_THRESHOLD)), DEFAULT_THRESHOLD)
    # lower threshold is for warning.
    #ML_LOWER_THRESHOLD = convertStrToFloat(os.environ.get(LOWER_THRESHOLD, str(DEFAULT_LOWER_THRESHOLD)),
    #                                       DEFAULT_LOWER_THRESHOLD)
    ML_THRESHOLD = convertStrToFloat(
        os.environ.get(THRESHOLD, str(0.8416212335729143)), 0.8416212335729143)
    ML_LOWER_THRESHOLD = convertStrToFloat(
        os.environ.get(LOWER_THRESHOLD, str(0.6744897501960817)),
        0.6744897501960817)

    ML_BOUND = convertStrToInt(os.environ.get(BOUND, str(IS_UPPER_BOUND)),
                               IS_UPPER_BOUND)
    ML_MIN_LOWER_BOUND = convertStrToFloat(
        os.environ.get(MIN_LOWER_BOUND, str(DEFAULT_MIN_LOWER_BOUND)),
        DEFAULT_MIN_LOWER_BOUND)
    # this is for pairwise algorithem which is used for canary deployment anomaly detetion.
    config.setKV("MIN_MANN_WHITE_DATA_POINTS", MIN_MANN_WHITE_DATA_POINTS)
    config.setKV("MIN_WILCOXON_DATA_POINTS", MIN_WILCOXON_DATA_POINTS)
    config.setKV("MIN_KRUSKAL_DATA_POINTS", MIN_KRUSKAL_DATA_POINTS)
    config.setKV(THRESHOLD, ML_THRESHOLD)
    config.setKV(BOUND, ML_BOUND)
    config.setKV(MIN_LOWER_BOUND, ML_MIN_LOWER_BOUND)
    config.setKV("FLUSH_FREQUENCY", int(FLUSH_FREQUENCY))
    config.setKV("OIM_BUCKET", OIM_BUCKET)
    config.setKV("CACHE_EXPIRE_TIME",
                 os.environ.get('CACHE_EXPIRE_TIME', 30 * 60))
    config.setKV("REQ_CHECK_INTERVAL",
                 int(os.environ.get('REQ_CHECK_INTERVAL', 45)))
    # Add Metric source env
    config.setKV("SOURCE_ENV", "ppd")
    MODE_DROP_ANOMALY = os.environ.get('MODE_DROP_ANOMALY', 'y')
    config.setKV('MODE_DROP_ANOMALY', MODE_DROP_ANOMALY)

    NO_MATCH_PICK_LAST = os.environ.get('NO_MATCH_PICK_LAST', 'y')
    config.setKV('NO_MATCH_PICK_LAST', NO_MATCH_PICK_LAST)

    wavefrontEndpoint = os.environ.get('WAVEFRONT_ENDPOINT')
    wavefrontToken = os.environ.get('WAVEFRONT_TOKEN')

    foremastEnv = os.environ.get("FOREMAST_ENV", 'qa')
    metricDestation = os.environ.get('METRIC_DESTINATION', "prometheus")
    if wavefrontEndpoint is not None:
        config.setKV('WAVEFRONT_ENDPOINT', wavefrontEndpoint)
    else:
        logger.error(
            "WAVEFRONT_ENDPOINT is null!!! foremat-brain will throw exception is you consumer wavefront metric..."
        )
    if wavefrontToken is not None:
        config.setKV('WAVEFRONT_TOKEN', wavefrontToken)
    else:
        logger.error(
            "WAVEFRONT_TOKEN is null!!! foremat-brain will throw exception is you consumer wavefront metric..."
        )
    if metricDestation is not None:
        config.setKV('METRIC_DESTINATION', metricDestation)
    else:
        config.setKV('METRIC_DESTINATION', "prometheus")
    if foremastEnv is None or foremastEnv == '':
        config.setKV("FOREMAST_ENV", 'qa')
    else:
        config.setKV("FOREMAST_ENV", foremastEnv)

    metric_threshold_count = convertStrToInt(
        os.environ.get(METRIC_TYPE_THRESHOLD_COUNT, -1),
        METRIC_TYPE_THRESHOLD_COUNT)
    if metric_threshold_count >= 0:
        for i in range(metric_threshold_count):
            istr = str(i)
            mtype = os.environ.get(METRIC_TYPE + istr, '')
            if mtype != '':
                mthreshold = convertStrToFloat(
                    os.environ.get(THRESHOLD + istr, str(ML_THRESHOLD)),
                    ML_THRESHOLD)
                mbound = convertStrToInt(
                    os.environ.get(BOUND + istr, str(ML_BOUND)), ML_BOUND)
                mminlowerbound = convertStrToInt(
                    os.environ.get(MIN_LOWER_BOUND + istr,
                                   str(ML_MIN_LOWER_BOUND)),
                    ML_MIN_LOWER_BOUND)
                config.setThresholdKV(mtype, THRESHOLD, mthreshold)
                config.setThresholdKV(mtype, BOUND, mbound)
                config.setThresholdKV(mtype, MIN_LOWER_BOUND, mminlowerbound)

    ML_PROPHET_PERIOD = convertStrToInt(
        os.environ.get(PROPHET_PERIOD, str(DEFAULT_PROPHET_PERIOD)),
        DEFAULT_PROPHET_PERIOD)
    ML_PROPHET_FREQ = os.environ.get(PROPHET_FREQ, DEFAULT_PROPHET_FREQ)
    # prophet algm parameters end

    ML_PAIRWISE_ALGORITHM = os.environ.get(PAIRWISE_ALGORITHM, ALL)
    ML_PAIRWISE_THRESHOLD = convertStrToFloat(
        os.environ.get(PAIRWISE_THRESHOLD, str(DEFAULT_PAIRWISE_THRESHOLD)),
        DEFAULT_PAIRWISE_THRESHOLD)

    MAX_STUCK_IN_SECONDS = convertStrToInt(
        os.environ.get('MAX_STUCK_IN_SECONDS',
                       str(DEFAULT_MAX_STUCK_IN_SECONDS)),
        DEFAULT_MAX_STUCK_IN_SECONDS)
    min_historical_data_points = convertStrToInt(
        os.environ.get('MIN_HISTORICAL_DATA_POINT_TO_MEASURE',
                       str(DEFAULT_MIN_HISTORICAL_DATA_POINT_TO_MEASURE)),
        DEFAULT_MIN_HISTORICAL_DATA_POINT_TO_MEASURE)

    es = ESClient()

    # Start up the server to expose the metrics.
    start_http_server(8000)
    # measurementMetric=  measurementmetrics()
    label_info = {
        'jobId': '',
        'calcuHistorical': 'False',
        'hasCurrent': 'True'
    }
    MONITORING_REQUEST_TIME = "request_process_time"

    while True:
        resp = ''
        modelHolder = None

        threshold = ML_THRESHOLD
        lower_threshold = ML_LOWER_THRESHOLD

        resp = es.search_by_statuslist(
            REQUEST_STATE.INITIAL.value,
            REQUEST_STATE.PREPROCESS_COMPLETED.value)
        _, openRequestlist = es.parse_result(resp)
        openRequest = selectRequestToProcess(openRequestlist)

        if openRequest == None:
            # process stucked preprogress_inprogress event.
            resp = es.search_status_and_lastmodify(
                REQUEST_STATE.PREPROCESS_INPROGRESS.value,
                MAX_STUCK_IN_SECONDS)
            _, openRequestlist = es.parse_result(resp)
            openRequest = selectRequestToProcess(openRequestlist)
            if openRequest == None:
                openRequest, modelHolder = retrieveCachedRequest()
                if openRequest == None:
                    #logger.warning("No long running preprocess job found .....")
                    continue
                    '''
                    # Test Start########################
                    
                    id='3c100dba1da813e4e0be6ca07d88a5bbafe3ac8a0cacd58f1e8bcacfdb2119d1'
                    openRequest = retrieveRequestById(id)
                    if (openRequest==None):
                        print("es is down, will sleep and retry")
                        time.sleep(1)
                        continue
                    
                    # Test End##########################
                    '''
            else:
                uuid = openRequest['id']
                _, modelHolder = retrieveOneCachedRequest(uuid)

        outputMsg = []
        uuid = openRequest['id']
        status = openRequest['status']

        updatedStatus = reserveJob(uuid, status)
        logger.warning("Start to processing job id " + uuid +
                       " original status:" + status)
        #strategy
        strategy = openRequest['strategy']
        start = time.time()

        historicalConfig = None
        historicalConfigMap = None
        historicalMetricStore = None
        if strategy not in [CANARY]:
            if 'historicalConfig' in openRequest:
                historicalConfig = openRequest['historicalConfig']
                if historicalConfig != '':
                    historicalConfigMap = convertStringToMap(historicalConfig)
                    if ('historicalMetricStore' in openRequest):
                        historicalMetricStore = openRequest[
                            'historicalMetricStore']

        # currentConfig should never be null
        currentConfig = openRequest['currentConfig']
        currentConfigMap = None
        currentMetricStore = None
        if currentConfig != '':
            currentConfigMap = convertStringToMap(currentConfig)
            if ('currentMetricStore' in openRequest):
                currentMetricStore = openRequest['currentMetricStore']

        baselineConfig = None
        baselineConfigMap = None
        baselineMetricStore = None
        if strategy in [CANARY] and 'baselineConfig' in openRequest:
            baselineConfig = openRequest['baselineConfig']
            if baselineConfig != '':
                baselineConfigMap = convertStringToMap(baselineConfig)
                if 'baselineMetricStore' in openRequest:
                    baselineMetricStore = openRequest['baselineMetricStore']

        skipHistorical = (historicalConfig == '') or (strategy == CANARY)
        # only canary deployment requires baseline
        skipBaseline = strategy != CANARY
        #label_info['jobId']= uuid
        #label_info['calcuHistorical']='False'
        #label_info['hasCurrent']='False'

        endTime = openRequest['endTime']

        # The line below needs to be removed because baseline is enabled upstream
        skipCurrent = (currentConfig == '')

        persistModelConfig = False
        try:
            if (skipCurrent):
                #this should not pick up
                ret = update_es_doc(strategy, status, uuid,
                                    REQUEST_STATE.COMPLETED_UNKNOWN.value,
                                    "Error: no current config")
                logger.warning("request error : jobid  " + uuid +
                               " updateESDocStatus  is :" + str(ret) +
                               " current config is empty. make status unknown")
                #measurementmetrics.sendMetric(MONITORING_REQUEST_TIME, label_info, calculateDuration(start))
                continue

            #dict  metric name : url , if modelHolder does not have model, give chance to recalculate
            if modelHolder == None:
                modelConfig = loadModelConfig(uuid)
                if strategy == CANARY:
                    if modelConfig is None:
                        modelConfig = {
                            PAIRWISE_ALGORITHM: ML_PAIRWISE_ALGORITHM,
                            PAIRWISE_THRESHOLD: ML_PAIRWISE_THRESHOLD,
                            BOUND: ML_BOUND
                        }
                        persistModelConfig = True
                    modelHolder = ModelHolder(ML_PAIRWISE_ALGORITHM,
                                              modelConfig, {},
                                              METRIC_PERIOD.BASELINE.value,
                                              uuid)
                else:
                    if modelConfig is None:
                        modelConfig = {
                            THRESHOLD: threshold,
                            LOWER_THRESHOLD: lower_threshold,
                            MIN_DATA_POINTS: min_historical_data_points,
                            BOUND: ML_BOUND,
                            MIN_LOWER_BOUND: ML_MIN_LOWER_BOUND
                        }
                        persistModelConfig = True
                    modelHolder = ModelHolder(ML_ALGORITHM, modelConfig, {},
                                              METRIC_PERIOD.HISTORICAL.value,
                                              uuid)

            if strategy in [HPA, CONTINUOUS]:
                # replace start and end time for HPA and continuous strategy
                start_history_str = str(time.time() -
                                        float(HISTORICAL_CONF_TIME_WINDOW))
                start_current_str = str(time.time() -
                                        float(CURRENT_CONF_TIME_WINDOW))
                end_str = str(time.time())
                hpaMetricsConfig = None
                if strategy == HPA:
                    if "hpaMetricsConfig" in openRequest:
                        hpaMetricsConfig = openRequest['hpaMetricsConfig']

                if historicalConfigMap:
                    for metric_type, metric_url in historicalConfigMap.items():
                        metric_url = metric_url.replace(
                            'START_TIME', start_history_str)
                        metric_url = metric_url.replace('END_TIME', end_str)
                        historicalConfigMap[metric_type] = metric_url
                        if hpaMetricsConfig is not None and metric_type in hpaMetricsConfig:
                            hpaMetricsConfigMap = hpaMetricsConfig[metric_type]
                            for k, v in hpaMetricsConfigMap.items():
                                modelHolder.setModelConfig(
                                    "hpa", metric_type, k, v)

                if currentConfigMap:
                    podUrl = openRequest['podCountURL']
                    if podUrl is not None and len(podUrl) > 0:
                        start_current_pod_str = str(
                            time.time() - float(CURRENT_CONF_POD_TIME_WINDOW))
                        podUrl = podUrl.replace('START_TIME',
                                                start_current_pod_str)
                        podUrl = podUrl.replace('END_TIME', end_str)
                        currentConfigMap['hpa_pods'] = podUrl
                    for metric_type, metric_url in currentConfigMap.items():
                        metric_url = metric_url.replace(
                            'START_TIME', start_current_str)
                        metric_url = metric_url.replace('END_TIME', end_str)
                        currentConfigMap[metric_type] = metric_url

            if (not (modelHolder.hasModels or skipHistorical)):
                storeMapHistorical = convertStringToMap(historicalMetricStore)
                # the code below is only used with the prophet algorithm
                isProphet = False
                if (ML_ALGORITHM == AI_MODEL.PROPHET.value):
                    isProphet = True
                    modelConfig.setdefault(PROPHET_PERIOD, ML_PROPHET_PERIOD)
                    modelConfig.setdefault(PROPHET_FREQ, ML_PROPHET_FREQ)
                if persistModelConfig:
                    storeModelConfig(uuid, modelHolder.getModelConfigs())
                # pass strategy for hpa
                modelHolder, msg = computeHistoricalModel(
                    historicalConfigMap, modelHolder, isProphet,
                    storeMapHistorical, strategy)
                cacheModels(modelHolder)
                label_info['calcuHistorical'] = 'True'
                if (msg != ''):
                    outputMsg.append(msg)
                if (not modelHolder.hasModels):
                    outputMsg.append("No historical Data and model ")
                    #print(getNowStr(), ": Warning: No historical: "+str(modelHolder))

            hasHistorical = modelHolder.hasModels

            #start baseline
            to_do = []

            currentDataSet = {}
            baselineDataSet = {}

            if skipBaseline:
                currentDataSet, _ = computeNonHistoricalModel(
                    currentConfigMap, METRIC_PERIOD.CURRENT.value,
                    convertStringToMap(currentMetricStore), strategy)
            else:
                with ProcessPoolExecutor(max_workers=2) as executor:
                    currentjob = executor.submit(
                        computeNonHistoricalModel, currentConfigMap,
                        METRIC_PERIOD.CURRENT.value,
                        convertStringToMap(currentMetricStore), strategy)
                    baselinejob = executor.submit(
                        computeNonHistoricalModel,
                        convertStringToMap(baselineConfig),
                        METRIC_PERIOD.BASELINE.value,
                        convertStringToMap(baselineMetricStore), strategy)
                    to_do.append(currentjob)
                    to_do.append(baselinejob)
                    for future in futures.as_completed(to_do):
                        try:
                            res = future.result()
                            if (res[1] == METRIC_PERIOD.CURRENT.value):
                                currentDataSet = res[0]
                            else:
                                baselineDataSet = res[0]
                        except Exception as e:
                            logger.error("job id" + uuid +
                                         " encount errorProcessPoolExecutor " +
                                         str(e))

            # This is used for canary deployment to compare how close baseline and current are
            currentLen = len(currentDataSet)
            baselineLen = len(baselineDataSet)
            hasCurrent = currentLen > 0
            label_info['hasCurrent'] = hasCurrent

            hasBaseline = baselineLen > 0
            logger.warning("jobid:" + uuid + " hasCurrent " + str(hasCurrent) +
                           ", hasBaseline " + str(hasBaseline))

            if hasCurrent == False:
                if strategy in [HPA, 'continuous']:
                    logger.warning("job id: " + uuid +
                                   "  not current metric...")
                    continue
                ret = True
                if isPast(endTime, 20):
                    ret = update_es_doc(strategy, status, uuid,
                                        REQUEST_STATE.COMPLETED_UNKNOWN.value,
                                        "Error: there is no current Metric. ")
                    logger.warning("Current metric is empty, jobid " + uuid +
                                   " updateESDocStatus  is :" + str(ret) +
                                   "  time past mark job unknow " +
                                   currentConfig + " ".join(outputMsg))
                else:
                    cacheModels(modelHolder)
                    ret = update_es_doc(
                        strategy, status, uuid,
                        REQUEST_STATE.PREPROCESS_INPROGRESS.value,
                        "Warning: there is no current Metric, Will keep try until reachs endTime. "
                    )
                    logger.warning(
                        "Current metric is empty, jobid " + uuid +
                        " updateESDocStatus  is :" + str(ret) +
                        " end time is not reach, will cache and retry " +
                        currentConfig + " ".join(outputMsg))
                if not ret:
                    cacheModels(modelHolder)
                    logger.error("ES update failed: job ID: " + uuid)
                # measurementMetric.sendMetric(MONITORING_REQUEST_TIME, label_info, calculateDuration(start))
                continue

            if (hasBaseline):
                hasSameDistribution, detailedResults, meetSize = pairWiseComparson(
                    currentDataSet, baselineDataSet, ML_PAIRWISE_ALGORITHM,
                    ML_PAIRWISE_THRESHOLD, ML_BOUND)
                ret = True
                if (not hasSameDistribution):
                    logger.warning(
                        "current and base line does not have same distribution "
                        + str(detailedResults) + " ".join(outputMsg))
                    '''
                    if hasHistorical == True:
                        if meetSize :
                             updateESDocStatus(uuid, REQUEST_STATE.COMPLETED_UNHEALTH , "baseline and current are different pattern. "+escapeString(''.join(outputMsg)))
                             continue
                        requireLowerThreshold = True
                    else:
                    '''
                    if meetSize:
                        ret = update_es_doc(
                            strategy, status, uuid,
                            REQUEST_STATE.COMPLETED_UNHEALTH.value,
                            "Warning:  baseline and current are different pattern. "
                        )
                        logger.warning(
                            "job id :" + uuid +
                            "completed_unhealth, current and baseline has different distribution pattern,  updateESDocStatus  is :"
                            + str(ret))
                    else:
                        if isPast(endTime, 10):
                            ret = update_es_doc(
                                strategy, status, uuid,
                                REQUEST_STATE.COMPLETED_UNKNOWN.value,
                                "Warning: baseline and current are different pattern but not meet min datapoints to determine."
                            )
                            logger.warning(
                                "job id :" + uuid +
                                "completed_unknown...current or baseline is not same but not enough datapoints to confirm,  updateESDocStatus  is :"
                                + str(ret))
                        else:

                            ret = update_es_doc(
                                strategy, status, uuid,
                                REQUEST_STATE.PREPROCESS_COMPLETED.value,
                                "pairwise not same so far and not meet min datapoints to determine."
                            )
                            logger.warning(
                                "job id :" + uuid +
                                " pairwise not same and not enough datapoints but not meet min datapoint to determine ,  updateESDocStatus  is :"
                                + str(ret))
                else:
                    if isPast(endTime, 10):
                        ret = update_es_doc(
                            strategy, status, uuid,
                            REQUEST_STATE.COMPLETED_HEALTH.value, "health")
                        logger.warning("job ID : " + uuid +
                                       " is health. updateESDocStatus  is :" +
                                       str(ret))
                    else:
                        ret = update_es_doc(
                            strategy, status, uuid,
                            REQUEST_STATE.PREPROCESS_COMPLETED.value,
                            "current and baseline have same distribution but not past endtime yet."
                        )
                        # print(getNowStr(),": id ",uuid, " continue . bacause pairwise is not same but not past endTime yet " )
                        logger.warning(
                            "job id :" + uuid +
                            " will reprocess . current and base have same distribution but not past endTime yet, updateESDocStatus  is :"
                            + str(ret))
                if not ret:
                    cacheModels(modelHolder)
                    logger.error("ES update failed: job ID: " + uuid)
                # measurementMetric.sendMetric(MONITORING_REQUEST_TIME, label_info, calculateDuration(start))
                continue
            else:
                # no baseline metric but require baseline then wait or reach end time to mark as unknown
                if not skipBaseline:
                    ret = True
                    if isPast(endTime, 10):
                        ret = update_es_doc(
                            strategy, status, uuid,
                            REQUEST_STATE.COMPLETED_UNKNOWN.value,
                            "baseline query is empty.")
                        logger.warning(
                            "job ID : " + uuid +
                            " unknown because baseline no data, updateESDocStatus  is :"
                            + str(ret))
                    else:
                        # wait for baseline metric to generate
                        ret = update_es_doc(
                            strategy, status, uuid,
                            REQUEST_STATE.PREPROCESS_COMPLETED.value,
                            "no baseline data yet.")
                        logger.warning(
                            "job ID : " + uuid +
                            " continue . no baseline data yet. updateESDocStatus  is :"
                            + str(ret))
                    if not ret:
                        cacheModels(modelHolder)
                        logger.error("ES update failed: job ID: " + uuid)
                    # measurementMetric.sendMetric(MONITORING_REQUEST_TIME, label_info, calculateDuration(start))
                    continue

            # check historical (we may need to fail fast for the non-historical metric use case)
            #:TODO
            if hasHistorical == False:
                if strategy not in [HPA, CONTINUOUS]:
                    logger.warning("job id: " + uuid +
                                   "  not historical metric...")
                    continue
                ret = True
                if isPast(endTime, 5):
                    ret = update_es_doc(
                        strategy, status, uuid,
                        REQUEST_STATE.COMPLETED_UNKNOWN.value,
                        "Error: no enough historical data and no baseline data."
                    )
                    logger.warning(
                        "job id: " + uuid +
                        " completed unknown  no enough historical data and no baseline data , updateESDocStatus  is :"
                        + str(ret))
                else:
                    ret = update_es_doc(
                        strategy, status, uuid,
                        REQUEST_STATE.PREPROCESS_COMPLETED.value,
                        "Warning: not enough  historical data and no baseline data will retry until endtime reaches."
                    )
                    logger.warning(
                        "job id: " + uuid +
                        "  will cache and reprocess becasue no historical, updateESDocStatus  is :"
                        + str(ret))

                if not ret:
                    cacheModels(modelHolder)
                    logger.error("ES update failed: job ID: " + uuid)
                # measurementMetric.sendMetric(MONITORING_REQUEST_TIME, label_info, calculateDuration(start))
                continue

            if strategy in [HPA, 'continuous']:
                computeAnomaly(currentDataSet, modelHolder, strategy)
                ret = update_es_doc(strategy, status, uuid,
                                    REQUEST_STATE.PREPROCESS_INPROGRESS.value,
                                    "")
                logger.warning("job id: " + uuid + "  hpa in progress.")
                # if not ret:
                #     cacheModels( modelHolder,  max_cache)
                #     logger.error("ES update failed: hpa job ID: "+uuid)

                # always cache models
                cacheModels(modelHolder)

                # measurementMetric.sendMetric(MONITORING_REQUEST_TIME, label_info, calculateDuration(start))
                continue
            # add strategy
            hasAnomaly, anomaliesDataStr = computeAnomaly(
                currentDataSet, modelHolder, strategy)
            logger.warning("job ID is " + uuid + "  hasAnomaly is " +
                           str(hasAnomaly))

            if hasAnomaly:
                # update ES to anomaly otherwise continue
                anomalyInfo = escapeString(anomaliesDataStr)
                ret = update_es_doc(
                    strategy, status, uuid,
                    REQUEST_STATE.COMPLETED_UNHEALTH.value,
                    "Warning: anomaly detected between current and historical.",
                    anomalyInfo)
                logger.warning("**job ID is unhealth  " + uuid +
                               " updateESDocStatus  is :" + str(ret) + "  " +
                               anomaliesDataStr)
                if not ret:
                    cacheModels(modelHolder)
                    logger.error("ES update failed: job ID: " + uuid)
            else:
                if isPast(endTime, 10):
                    ret = update_es_doc(
                        strategy, status, uuid,
                        REQUEST_STATE.COMPLETED_HEALTH.value,
                        "current compare to histroical model is health")
                    logger.warning("job ID: " + uuid +
                                   " is health, updateESDocStatus is :" +
                                   str(ret))
                    if not ret:
                        cacheModels(modelHolder)
                        logger.error("ES update failed: job ID: " + uuid)
                else:
                    cacheModels(modelHolder)
                    ret = update_es_doc(
                        strategy, status, uuid,
                        REQUEST_STATE.PREPROCESS_INPROGRESS.value,
                        "Need to continuous to check untile reachs deployment endTime."
                    )
                    logger.warning(
                        "job ID : " + uuid +
                        " health so far will reprocess  updateESDocStatus is :"
                        + str(ret))

            # measurementMetric.sendMetric(MONITORING_REQUEST_TIME, label_info, calculateDuration(start))

        except Exception as e:
            logger.error("uuid : " + uuid + " failed because ", e)
            try:
                if isPast(endTime, 5):
                    update_es_doc(
                        strategy, status, uuid,
                        REQUEST_STATE.PREPROCESS_FAILED.value,
                        "Critical: encount code exception. " +
                        escapeString(''.join(outputMsg)))
                else:
                    update_es_doc(
                        strategy, status, uuid,
                        REQUEST_STATE.PREPROCESS_COMPLETED.value,
                        "Critical: encount code exception. " +
                        escapeString(''.join(outputMsg)))

            except Exception as ee:
                logger.error("uuid : " + uuid + " failed because " + str(ee))
            continue