예제 #1
0
def sim(main, query, sr=22050 // 4):
    """Return the MASS2 match indices for ``query`` in ``main``, closest first.

    ``main`` (described in the original code as the "good track") is searched
    with ``mts.mass2_batch`` using a batch size of one tenth of ``len(main)``
    and six jobs, asking for ``len(main) // batch_size - 1`` top matches.  The
    indices it returns are reordered by ascending distance via ``np.argsort``.

    ``sr`` is accepted for the caller's convenience but is not used here.
    """
    tenth = len(main) // 10
    match_starts, match_dists = mts.mass2_batch(
        main,
        query,
        batch_size=tenth,
        top_matches=len(main) // tenth - 1,
        n_jobs=6,
    )
    # argsort gives the permutation that orders the distances ascending
    ranking = np.argsort(match_dists)
    return match_starts[ranking]
예제 #2
0
def test_mass2_batch_robotdog_multi_threaded():
    """Check the best mass2_batch match on the robot dog data with two jobs.

    Loads the robot dog series and the carpet query from the ``tests``
    directory next to ``MODULE_PATH``, runs ``mts.mass2_batch`` with a batch
    size of 1000, three top matches and ``n_jobs=2``, and expects the match
    with the smallest distance to start at index 7479.
    """
    tests_dir = os.path.join(MODULE_PATH, '..', 'tests')
    series = np.loadtxt(os.path.join(tests_dir, 'robot_dog.txt'))
    pattern = np.loadtxt(os.path.join(tests_dir, 'carpet_query.txt'))

    found_at, found_dists = mts.mass2_batch(
        series, pattern, 1000, top_matches=3, n_jobs=2)

    # the closest match is the one with the minimum distance
    closest_start = found_at[np.argmin(found_dists)]

    assert closest_start == 7479
예제 #3
0
def _occurrence_key(event_id, scenario, timestamp):
    """Build the '<id>_<esc>_<timestamp>' key used to label an occurrence.

    The last six characters of ``str(timestamp)`` are dropped (presumably a
    '+00:00' style UTC offset -- TODO confirm) and spaces become underscores.
    """
    return event_id + '_' + scenario + '_' + \
        str(timestamp)[:-6].replace(' ', '_')


def f_ts_clustering(param_pe, param_row, param_ca_data, param_ce_data,
                    param_tipo, param_p_ventana, param_cores, param_batch,
                    param_matches):
    """
    Look for repetitions of the price pattern that follows each occurrence of
    a candidate calendar event and classify where those repetitions start.

    For every occurrence of the candidate event (the rows of `param_ce_data`
    whose 'name' and 'esc' equal those of row `param_row` of `param_ca_data`)
    the prices from the occurrence onwards become the search series and the
    window starting at the occurrence becomes the query.  `mass.mass2_batch`
    returns the best matching start indices; the trivial match at index 0
    (the query itself) is removed and each remaining match is classified by
    whether its first timestamp coincides with a calendar event.

    Parameters
    ----------
    param_pe : pd.DataFrame
        Historical prices.  Must have a 'timestamp' column and the column
        named by `param_tipo`.  Its index labels must support
        `label + param_p_ventana` (e.g. an integer index).
    param_row : int
        Positional row (iloc) of `param_ca_data` holding the candidate event,
        e.g. 4.
    param_ca_data : pd.DataFrame
        Candidate events; the columns 'name' and 'esc' are read.
    param_ce_data : pd.DataFrame
        Full economic calendar; the columns 'id', 'name', 'esc' and
        'timestamp' are read.
    param_tipo : str
        Price column used to build the series, e.g. 'mid'.
    param_p_ventana : int
        Window size added to the starting index label to delimit the query
        (label-based slice, both ends included), e.g. 30.
    param_cores : int
        Passed to `mass.mass2_batch` as `n_jobs`, e.g. 4.
    param_batch : int
        Passed to `mass.mass2_batch` as `batch_size`, e.g. 300.
    param_matches : int
        Passed to `mass.mass2_batch` as `top_matches`, e.g. 10.

    Returns
    -------
    dict_res : dict
        'ancla'    : 'id' of the first occurrence of the candidate event.
        'metricas' : {'tipo_1', 'tipo_2', 'tipo_3', 'tipo_4'} counters.
        'datos'    : one entry per anchor occurrence that produced at least
                     one calendar coincidence, holding 'ocurrencias' (keys of
                     the coinciding calendar rows), 'df_serie_q' (query
                     window) and 'df_serie_p' (matched window).  Later
                     coincidences of the same anchor overwrite earlier ones.

    Raises
    ------
    IndexError
        If the candidate event has no occurrence in `param_ce_data`.

    Notes
    -----
    An anchor occurrence is skipped when no price row exists at its timestamp
    or at any whole number of minutes before it, down to the earliest price
    timestamp (previously this situation looped forever).  ValueError and
    IndexError raised while matching an occurrence are swallowed and that
    occurrence contributes nothing further to the counters.
    """
    # row with the information of the candidate trigger event
    candidate_data = param_ca_data.iloc[param_row, :]

    # every occurrence of the candidate trigger event in the full calendar
    df_ancla = param_ce_data[(param_ce_data['esc'] == candidate_data['esc']) &
                             (param_ce_data['name'] == candidate_data['name'])]

    # all timestamps of the full economic calendar; a set gives O(1)
    # membership tests inside the match loop
    ts_serie_ce = set(param_ce_data['timestamp'])

    # occurrence counters per anchor scenario
    p1, p2, p3, p4 = 0, 0, 0, 0

    # partial results
    dict_res = {'ancla': df_ancla['id'].iloc[0], 'metricas': {}, 'datos': {}}

    # price timestamps and their lower bound, hoisted out of the loop
    ts_precios = param_pe['timestamp']
    fecha_min = ts_precios.min() if len(ts_precios) > 0 else None

    # -- --------------------------------------------- OCCURRENCE BY OCCURRENCE -- #
    for ancla in range(len(df_ancla)):
        # anchor data used to search towards the future
        ancla_ocurr = df_ancla.iloc[ancla, ]
        fecha_ini = ancla_ocurr['timestamp']

        # step back one minute at a time until a price row with that exact
        # timestamp exists; stop once we fall before the first price so the
        # search always terminates (`not >=` also stops on a NaT bound)
        ind_ini = ts_precios[ts_precios == fecha_ini].index
        while len(ind_ini) == 0:
            fecha_ini = fecha_ini - timedelta(minutes=1)
            if fecha_min is None or not fecha_ini >= fecha_min:
                break
            ind_ini = ts_precios[ts_precios == fecha_ini].index
        if len(ind_ini) == 0:
            # no usable price row for this occurrence
            continue

        # final label is the initial label plus the window size
        ind_fin = ind_ini + param_p_ventana

        # query series: the window that starts at the anchor
        df_serie_q = param_pe.loc[ind_ini[0]:ind_fin[0], :]
        df_serie_q = df_serie_q.reset_index(drop=True)
        serie_q = np.array(df_serie_q[param_tipo])

        # search series: everything from the anchor onwards (1-D numpy array)
        df_serie = param_pe.loc[ind_ini[0]:, :]
        df_serie = df_serie.reset_index(drop=True)
        serie = np.array(df_serie[param_tipo])

        try:
            # run the algorithm; it returns match indices and distances
            mass_indices, _ = mass.mass2_batch(
                ts=serie,
                query=serie_q,
                batch_size=param_batch,
                n_jobs=param_cores,
                top_matches=param_matches)

            # drop index 0 because it is the query itself; when 0 is not among
            # the matches this raises IndexError and the occurrence is skipped
            origen = np.where(mass_indices == 0)[0][0]
            mass_indices = np.delete(mass_indices, origen)

            for indice in mass_indices:
                # window of the n-th similar pattern found
                df_serie_p = df_serie.loc[indice:(indice +
                                                  param_p_ventana), :].copy()

                # first timestamp of the pattern, compared with the calendar
                ts_serie_p = df_serie_p['timestamp'].iloc[0]

                if ts_serie_p in ts_serie_ce:
                    # key of the anchor occurrence that produced the pattern
                    id_ocurrencia = _occurrence_key(ancla_ocurr['id'],
                                                    ancla_ocurr['esc'],
                                                    ancla_ocurr['timestamp'])

                    # np.where yields positions, so select with iloc
                    match = np.where(
                        param_ce_data['timestamp'] == ts_serie_p)[0]
                    encontrados = param_ce_data.iloc[match, :]

                    # store the query and pattern windows under the anchor key
                    dict_res['datos'][id_ocurrencia] = {
                        'ocurrencias': {},
                        'df_serie_q': df_serie_q,
                        'df_serie_p': df_serie_p
                    }

                    # keys of every calendar row that coincides with the
                    # pattern start
                    llaves = [
                        _occurrence_key(enc_id, enc_esc, enc_ts)
                        for enc_id, enc_esc, enc_ts in zip(
                            encontrados['id'], encontrados['esc'],
                            encontrados['timestamp'])
                    ]
                    dict_res['datos'][id_ocurrencia]['ocurrencias'] = llaves

                    mismo_nombre = encontrados['name'] == ancla_ocurr['name']
                    mismo_esc = encontrados['esc'] == ancla_ocurr['esc']

                    # TYPE 1: name == name & esc == esc
                    p1 += int((mismo_nombre & mismo_esc).sum())

                    # TYPE 2: name == name
                    p2 += int(mismo_nombre.sum())

                    # TYPE 3: any other indicator in the calendar
                    p3 += int((~mismo_nombre).sum())

                else:
                    # TYPE 4: outside the calendar
                    # NOTE(review): this adds the total number of matches for
                    # every non-calendar match, which looks like an overcount
                    # (`p4 += 1` may be intended) -- kept as-is pending
                    # confirmation of the metric's definition.
                    p4 += len(mass_indices)

        except (ValueError, IndexError):
            # index problems in MASS-TS: the occurrence adds nothing more
            pass

    # counters of the cases found
    dict_res['metricas'] = {
        # same indicator + same scenario as the anchor
        'tipo_1': p1,
        # same indicator + any scenario
        'tipo_2': p2,
        # another indicator in the list
        'tipo_3': p3,
        # none of the above
        'tipo_4': p4
    }

    return dict_res
예제 #4
0
def find_cloudburst_motifs(metric, snippet, timeseries, print_output=False):
    """
    Takes a snippet of timeseries, creates motifs at batch_sizes from the
    snippet and searches for those motifs in the given timeseries, returns a
    matched_motifs dict and a timeseries_matched dict

    """
    logger = logging.getLogger(skyline_app_logger)
    child_process_pid = os.getpid()
    logger.info(
        'functions.luminosity.find_cloudburst_motifs :: running for process_pid - %s for %s'
        % (str(child_process_pid), metric))

    start = timer()
    debug_logging = False
    timeseries_matched = {}
    timeseries_matched[metric] = {}
    matched_motifs = {}
    motifs_found = []
    dev_null = None

    for item in timeseries:
        timestamp = int(item[0])
        timeseries_matched[metric][timestamp] = {}
        timeseries_matched[metric][timestamp]['motif_matches'] = {}

    mass2_batch_times = []
    mass3_times = []
    exact_match_times = []

    nan = np.array([np.nan])
    nanj = complex(0.0, float('nan'))
    empty_dists = np.array(nan + nanj)

    motifs_found = []
    exact_matches_found = []

    motif_match_types = motif_match_types_dict()

    start_full_duration = timer()

    # metric_resolution = determine_data_frequency(skyline_app, timeseries, False)
    logger.info(
        'functions.luminosity.find_cloudburst_motifs :: looking for similar motifs in timeseries of length: %s'
        % str(len(timeseries)))

    exact_match_times = []

    #    relate_dataset = [float(item[1]) for item in fp_timeseries]
    relate_dataset = [float(item[1]) for item in timeseries]

    # namespace_key = 'default_inference_batch_sizes'
    # for batch_size in list(IONOSPHERE_INFERENCE_MOTIFS_SETTINGS[namespace_key].keys()):
    # for batch_size in [len(snippet)]:
    batch_size = len(snippet)
    if print_output:
        print(
            'functions.luminosity.find_cloudburst_motifs :: checking %s at batch_size: %s'
            % (metric, str(batch_size)))

    # @added 20210423 - Feature #4014: Ionosphere - inference
    # The convenience mass2_batch method will not work to find
    # top matches if the number of top_matches to be found are
    # greater than the number of indices in which a match can be
    # found.  In these cases such as trying to find the:
    # batch_size: 1440, top_matches: 50, max_distance: 30, snippet_length: 1451
    # even setting the top_matches to 1 will result in
    # mass2_batch throwing the error:
    # mts.mass2_batch error: kth(=1) out of bounds (1)
    # So use mass3 as appropriate.
    use_mass3 = False
    use_mass2_batch = True
    n = len(snippet)
    indices = list(range(0, n - batch_size + 1, batch_size))
    # mass2_batch default is 3 so if there are less than 3
    # indices in which the best macthes can be found, use mass3
    if len(indices) < 3:
        use_mass3 = True
        use_mass2_batch = False
        logger.info(
            'functions.luminosity.find_cloudburst_motifs :: batch_size: %s, snippet length: %s, len(indices) < 3, using mass3'
            % (str(batch_size), str(n)))
        if print_output:
            print(
                'functions.luminosity.find_cloudburst_motifs :: batch_size: %s, snippet length: %s, len(indices) < 3, using mass3'
                % (str(batch_size), str(n)))

    top_matches = 1
    max_distance = 1.8
    find_exact_matches = True

    # if use_mass2_batch:
    logger.info(
        'functions.luminosity.find_cloudburst_motifs :: analysis run - metric: %s, batch_size: %s, top_matches: %s, max_distance: %s, snippet_length: %s'
        % (str(metric), str(batch_size), str(top_matches), str(max_distance),
           str(len(snippet))))
    if print_output:
        print(
            'functions.luminosity.find_cloudburst_motifs :: analysis run - metric: %s, batch_size: %s, top_matches: %s, max_distance: %s, snippet_length: %s'
            % (str(metric), str(batch_size), str(top_matches),
               str(max_distance), str(len(snippet))))

    # Given that the snippet can be any length
    if len(snippet) < batch_size:
        if print_output:
            print(
                'functions.luminosity.find_cloudburst_motifs :: skipping snippet: %s, batch_size: %s'
                % (str(len(snippet)), str(batch_size)))
        return matched_motifs, timeseries_matched
    else:
        if print_output:
            print(
                'functions.luminosity.find_cloudburst_motifs :: checking %s, batch_size: %s'
                % (metric, str(batch_size)))

    # Create the subsequence that is being searched for
    n = batch_size
    # snippets = [snippet[i * n:(i + 1) * n] for i in range((len(snippet) + n - 1) // n)]
    #    snippets = [snippet]

    #        batch_size_anomalous_timeseries_subsequence = timeseries[-batch_size:]
    #        batch_size_dataset = [float(item[1]) for item in batch_size_anomalous_timeseries_subsequence]
    #    for i_snippet in snippets:

    # batch_size_timeseries_subsequence = i_snippet[-batch_size:]
    batch_size_timeseries_subsequence = snippet
    batch_size_dataset = [
        float(item[1]) for item in batch_size_timeseries_subsequence
    ]
    motif_timestamp = int(batch_size_timeseries_subsequence[-1][0])

    # Set defaults
    current_best_indices = []
    current_best_dists = []
    best_indices = None
    best_dists = None

    # POC running all through mass3 with maximum pieces (SUPER FAST)
    # and then filtering on max_distance, all_in_range and area
    # percent_different
    # use_mass3 = True
    # use_mass2_batch = False

    # POC running all through mass3 and then filtering FALIED in
    # terms of time taken... due to having to run 22421 motifs
    # through all_in_range and percent_different functions ...
    # just these motifs checked took 62.036366 seconds, the surfacing
    # and transforming of the data AND mass3 to only 2 seconds
    # 2021-04-27 13:45:59 :: 3586421 :: functions.luminosity.find_cloudburst_motifs :: analysed 2 fps of full_duration 86400 in 0.330732 seconds
    # 2021-04-27 13:45:59 :: 3586421 :: functions.luminosity.find_cloudburst_motifs :: 22421 distance_valid_motifs determined in 0.346807 seconds from 81432 motifs_found
    # 2021-04-27 13:45:59 :: 3586421 :: functions.luminosity.find_cloudburst_motifs :: sorted_motifs from distance_valid_motifs in 0.048316 seconds
    # 2021-04-27 13:46:01 :: 3586421 :: functions.luminosity.find_cloudburst_motifs :: percent_different in 0.000590 seconds
    # 2021-04-27 13:46:01 :: 3586421 :: functions.luminosity.find_cloudburst_motifs :: percent_different in 0.000271 seconds
    # ...
    # ...
    # 2021-04-27 13:46:57 :: 3586421 :: functions.luminosity.find_cloudburst_motifs :: percent_different in 0.000373 seconds
    # 2021-04-27 13:46:57 :: 3586421 :: functions.luminosity.find_cloudburst_motifs :: percent_different in 0.000381 seconds
    # 2021-04-27 13:46:58 :: 3586421 :: functions.luminosity.find_cloudburst_motifs :: percent_different in 0.000363 seconds
    # 2021-04-27 13:46:58 :: 3586421 :: functions.luminosity.find_cloudburst_motifs :: percent_different in 0.000348 seconds
    # 2021-04-27 13:47:01 :: 3586421 :: functions.luminosity.find_cloudburst_motifs :: motifs checked in 62.036366 seconds
    # 2021-04-27 13:47:01 :: 3586421 :: functions.luminosity.find_cloudburst_motifs :: 0 motif best match found from 81432 motifs_found, 4 fps where checked {604800: {'fp_count': 2}, 86400: {'fp_count': 2}} (motifs remove due to not in range 22325, percent_different 96) and it took a total of 64.761969 seconds (only mass3) to process telegraf.ssdnodes-26840.mariadb.localhost:3306.mysql.bytes_sent
    # 2021-04-27 13:47:01 :: 3586421 :: inference found 0 matching similar motifs, checked 0 fps in 64.790198 seconds

    if use_mass2_batch:
        try:
            # @added 20210419 - Feature #4014: Ionosphere - inference
            # Handle top_matches being greater than possible kth that can be found
            # mts.mass2_batch error: kth(=50) out of bounds (16)
            use_top_matches = int(top_matches)
            if (len(snippet) / int(batch_size)) <= int(top_matches):
                use_top_matches = round(len(snippet) / int(batch_size)) - 2
                if use_top_matches == 2:
                    use_top_matches = 1
                if use_top_matches < 1:
                    use_top_matches = 1
                logger.info(
                    'functions.luminosity.find_cloudburst_motifs :: adjusting top_matches for mass2_batch to %s (the maximum possible top - 1) as top_matches=%s will be out of bounds mts.mass2_batch'
                    % (str(use_top_matches), str(top_matches)))

            start_mass2_batch = timer()
            best_indices, best_dists = mts.mass2_batch(
                relate_dataset,
                batch_size_dataset,
                batch_size=batch_size,
                top_matches=use_top_matches)
            end_mass2_batch = timer()
            mass2_batch_times.append((end_mass2_batch - start_mass2_batch))
            current_best_indices = best_indices.tolist()
            current_best_dists = best_dists.tolist()

            logger.info(
                'functions.luminosity.find_cloudburst_motifs :: mass2_batch run on batch_size: %s, top_matches: %s, in %6f seconds'
                % (str(batch_size), str(use_top_matches),
                   (end_mass2_batch - start_mass2_batch)))
            if print_output:
                print(
                    'functions.luminosity.find_cloudburst_motifs :: mass2_batch run on batch_size: %s, top_matches: %s, in %6f seconds'
                    % (str(batch_size), str(use_top_matches),
                       (end_mass2_batch - start_mass2_batch)))

            if debug_logging:
                logger.debug(
                    'debug :: functions.luminosity.find_cloudburst_motifs :: best_indices: %s, best_dists: %s'
                    % (str(current_best_indices), str(current_best_dists)))
        except ValueError as e:
            # If mass2_batch reports out of bounds, use mass3
            if 'out of bounds' in str(e):
                use_mass3 = True
                best_dists = ['use_mass3']
                logger.info(
                    'functions.luminosity.find_cloudburst_motifs :: mts.mass2_batch will be out of bounds running mass3'
                )
        except Exception as e:
            logger.error(
                'error :: functions.luminosity.find_cloudburst_motifs :: %s mts.mass2_batch error: %s'
                % (str(metric), str(e)))
            if print_output:
                print(
                    'error :: functions.luminosity.find_cloudburst_motifs :: %s mts.mass2_batch error: %s'
                    % (str(metric), str(e)))
            return matched_motifs, timeseries_matched
        if not use_mass3:
            try:
                if str(list(best_dists)) == str(list(empty_dists)):
                    logger.info(
                        'functions.luminosity.find_cloudburst_motifs :: mts.mass2_batch no similar motif from %s - best_dists: %s'
                        % (str(metric), str(list(best_dists))))
                    if print_output:
                        print(
                            'functions.luminosity.find_cloudburst_motifs :: mts.mass2_batch no similar motif from %s - best_dists: %s'
                            % (str(metric), str(list(best_dists))))
                    return matched_motifs, timeseries_matched
            except Exception as e:
                dev_null = e

    # @added 20210423 -
    if use_mass3:
        # pieces should be larger than the query length and as many
        # as possible, a power of two would be best, but as many
        # pieces as possible is the best we can achieve above 265
        query_length = len(batch_size_dataset)
        # if query_length < 256:
        #     pieces = 256
        # else:
        #     pieces = query_length + 2
        pieces = len(snippet) - query_length
        if pieces < query_length:
            pieces = query_length + 2

        check_pieces_length = False
        if check_pieces_length:
            # @modified 20210504 - Feature #4014: Ionosphere - inference
            # Handle the fp_timeseries being the same length (meaning
            # too short) as the query length
            if len(snippet) <= pieces:
                logger.info(
                    'functions.luminosity.find_cloudburst_motifs :: skipping running mass3 with %s pieces on metric: %s, batch_size: %s because snippet length is not long enough for the query size'
                    % (str(pieces), str(metric), str(batch_size)))
                if print_output:
                    print(
                        'functions.luminosity.find_cloudburst_motifs :: skipping running mass3 with %s pieces on metric: %s, batch_size: %s because snippet length is not long enough for the query size'
                        % (str(pieces), str(metric), str(batch_size)))
                return matched_motifs, timeseries_matched

            # @modified 20210505 - Feature #4014: Ionosphere - inference
            # Skip the batch size if the fp_timeseries is a similar
            # length as the batch_size.  This was specifically added to
            # reduce errors were there may be missing data points in a
            # timeseries and the lengths are not the same.  This was
            # encountered on a batch_size of 1440 with FULL_DURATION
            # 86400 60 second data.   A match was never found at a
            # batch_size > 720 on that data, but errors were occassionally
            # encountered.
            ten_percent_of_batch_size = int(batch_size / 10)
            if (len(snippet) - ten_percent_of_batch_size) < batch_size:
                logger.info(
                    'functions.luminosity.find_cloudburst_motifs :: skipping running mass3 on metric: %s, batch_size: %s because the batch_size is too close to length'
                    % (str(metric), str(batch_size)))
                if print_output:
                    print(
                        'functions.luminosity.find_cloudburst_motifs :: skipping running mass3 on metric: %s, batch_size: %s because the batch_size is too close to length'
                        % (str(metric), str(batch_size)))
                return matched_motifs, timeseries_matched

        logger.info(
            'functions.luminosity.find_cloudburst_motifs :: running mass3 with %s pieces on on metric: %s, batch_size: %s'
            % (str(pieces), str(metric), str(batch_size)))

        if print_output:
            print(
                'functions.luminosity.find_cloudburst_motifs :: running mass3 with %s pieces on on metric: %s, batch_size: %s'
                % (str(pieces), str(metric), str(batch_size)))

        start_mass3 = timer()
        try:
            best_dists = mts.mass3(relate_dataset, batch_size_dataset, pieces)
            end_mass3 = timer()
        except Exception as e:
            logger.error(
                'error :: functions.luminosity.find_cloudburst_motifs :: metric %s mts.mass3 error: %s'
                % (str(metric), str(e)))
            if print_output:
                print(
                    'error :: functions.luminosity.find_cloudburst_motifs :: metric %s mts.mass3 error: %s'
                    % (str(metric), str(e)))
            return matched_motifs, timeseries_matched
        mass3_times.append((end_mass3 - start_mass3))

        current_best_dists = best_dists.tolist()

        # Create current_best_indices as mass2_batch returns
        current_best_indices = []
        if len(relate_dataset) > batch_size:
            for index in enumerate(relate_dataset):
                # if index[0] >= (batch_size - 1):
                # The array starts at batch_size + 1
                # if index[0] >= (batch_size + 1):
                # but that fails on the add_motifs comprehension
                # add_motifs = [[fp_id, current_best_indices[index], best_dist.real, batch_size_timeseries_subsequence, batch_size, max_distance, max_area_percent_diff, max_y, min_y, range_padding, min_y_padded, max_y_padded] for index, best_dist in enumerate(current_best_dists)]
                # IndexError: list index out of range
                if index[0] >= (batch_size - 1):
                    current_best_indices.append(index[0])

            # @modified 20210505 - Feature #4014: Ionosphere - inference
            # Handle the query_length being shorter than the batch_size
            if len(current_best_indices) != len(current_best_dists):
                current_best_indices = []
                if index[0] >= (query_length - 1):
                    current_best_indices.append(index[0])
            if len(current_best_indices) != len(current_best_dists):
                logger.info(
                    'functions.luminosity.find_cloudburst_motifs :: discarding mass3 results as current_best_dists length: %s, current_best_indices length: %s do not match, took %6f seconds'
                    % (str(len(current_best_dists)),
                       str(len(current_best_indices)),
                       (end_mass3 - start_mass3)))
                if print_output:
                    print(
                        'functions.luminosity.find_cloudburst_motifs :: discarding mass3 results as current_best_dists length: %s, current_best_indices length: %s do not match, took %6f seconds'
                        % (str(len(current_best_dists)),
                           str(len(current_best_indices)),
                           (end_mass3 - start_mass3)))
                return matched_motifs, timeseries_matched
        logger.info(
            'functions.luminosity.find_cloudburst_motifs :: mass3 run, current_best_dists length: %s, current_best_indices length: %s, took %6f seconds'
            % (str(len(current_best_dists)), str(len(current_best_indices)),
               (end_mass3 - start_mass3)))
        if print_output:
            print(
                'functions.luminosity.find_cloudburst_motifs :: mass3 run, current_best_dists length: %s, current_best_indices length: %s, took %6f seconds'
                %
                (str(len(current_best_dists)), str(len(current_best_indices)),
                 (end_mass3 - start_mass3)))

    if not use_mass3:
        if not current_best_indices[0]:
            return matched_motifs, timeseries_matched
    if use_mass3 and not current_best_indices:
        return matched_motifs, timeseries_matched

    # All in one quicker?  Yes
    start_add_motifs = timer()
    add_motifs = []
    try:
        add_motifs = [[
            metric, current_best_indices[index], best_dist.real,
            batch_size_timeseries_subsequence, batch_size, max_distance,
            motif_timestamp
        ] for index, best_dist in enumerate(current_best_dists)]
        if add_motifs:
            motifs_found = motifs_found + add_motifs
    except Exception as e:
        logger.error(traceback.format_exc())
        logger.error(
            'error :: functions.luminosity.find_cloudburst_motifs :: could not add_motifs to motifs_found - %s'
            % (e))
    end_add_motifs = timer()
    logger.info(
        'functions.luminosity.find_cloudburst_motifs :: added %s motifs to motifs_found in %.6f seconds'
        % (str(len(add_motifs)), (end_add_motifs - start_add_motifs)))
    # Break if an exact match is found
    # @modified 20210430 - Bug #4044: inference - motif distance override - exact match
    # @modified 20210504 - Bug #4044: inference - motif distance override - exact match
    # if len([item for item in add_motifs if item[2] == 0]) > 0:
    #     exact_matches_found = exact_matches_found + [item for item in add_motifs if item[2] == 0]
    #     break

    # @modified 20210427 - Feature #4014: Ionosphere - inference
    # Finding exact matches can result is more than doubling the
    # runtime when used after mass2_batch runs (which do not find)
    # exact matches, mass3 does.  However the amount of time an
    # exact match is found, is very rare
    # if not use_mass3:
    if not use_mass3 and find_exact_matches:
        # mass3 finds exact matches, mass2_batch does not, so
        # there is no need to find exacts matchs if mass3 was
        # run.
        # FIND EXACT MATCHES
        # Seeing as I cannot reproduce finding nan+nanj which represents an
        # exact match with mts.mass2_batch, do it DIY style - iterate the
        # timeseries and create a batch_size subsequence for every index and
        # compare the values to the anomalous_ts for an exact match.
        # This takes ~0.024850 seconds on a timeseries with 10079 datapoints
        try:
            start_exact_match = timer()
            indexed_relate_dataset = []
            for index, item in enumerate(relate_dataset):
                indexed_relate_dataset.append([index, item])
            last_index = indexed_relate_dataset[-1][0]
            current_index = 0
            while current_index < last_index:
                subsequence = [
                    value
                    for index, value in indexed_relate_dataset[current_index:(
                        current_index + batch_size)]
                ]
                if subsequence == batch_size_dataset:
                    # @modified 20210419 - Feature #4014: Ionosphere - inference
                    # Added batch_size
                    exact_matches_found.append([
                        metric, current_index, 0.0,
                        batch_size_timeseries_subsequence, batch_size,
                        max_distance, motif_timestamp
                    ])
                    motifs_found.append([
                        metric, current_index, 0.0,
                        batch_size_timeseries_subsequence, batch_size,
                        max_distance, motif_timestamp
                    ])
                current_index += 1
            end_exact_match = timer()
            exact_match_times.append((end_exact_match - start_exact_match))
        except Exception as e:
            logger.error(traceback.format_exc())
            logger.error(
                'error :: functions.luminosity.find_cloudburst_motifs :: could not determine it any exact matches could be found in %s timeseries - %s'
                % (str(metric), e))
        logger.info(
            'functions.luminosity.find_cloudburst_motifs :: exact matches checked in %.6f seconds'
            % ((end_exact_match - start_exact_match)))

        # TODO
        # mass3 ALL, then evaluate, would it be quicker?  No see POC
        # above
    logger.info(
        'functions.luminosity.find_cloudburst_motifs :: mts.mass2_batch runs on %s in %.6f seconds'
        % (str(metric), sum(mass2_batch_times)))
    logger.info(
        'functions.luminosity.find_cloudburst_motifs :: exact_match runs on %s in %.6f seconds'
        % (str(metric), sum(exact_match_times)))
    end_full_duration = timer()
    logger.info(
        'functions.luminosity.find_cloudburst_motifs :: analysed %s in %.6f seconds'
        % (str(metric), (end_full_duration - start_full_duration)))

    # Patterns are sorted by distance
    # The list produced with the mass3 method will include
    # nans
    start_distance_valid_motifs = timer()
    distance_valid_motifs = [
        item for item in motifs_found
        if not np.isnan(item[2]) and item[2] <= item[5]
    ]
    end_distance_valid_motifs = timer()
    logger.info(
        'functions.luminosity.find_cloudburst_motifs :: %s distance_valid_motifs determined in %.6f seconds from %s motifs_found'
        % (str(len(distance_valid_motifs)),
           (end_distance_valid_motifs - start_distance_valid_motifs),
           str(len(motifs_found))))
    if print_output:
        print(
            'functions.luminosity.find_cloudburst_motifs :: %s distance_valid_motifs determined in %.6f seconds from %s motifs_found'
            % (str(len(distance_valid_motifs)),
               (end_distance_valid_motifs - start_distance_valid_motifs),
               str(len(motifs_found))))

    start_sorted_motifs = timer()
    sorted_motifs = []
    if motifs_found:
        sorted_motifs = sorted(distance_valid_motifs, key=lambda x: x[2])
        # If the areas under the curve were calculated, the
        # list could be sorted by area_percent_diff then by
        # distance.
        # import operator
        # sorted_motifs = sorted(motifs_found_in_fp, key=operator.itemgetter(2, 2))
    end_sorted_motifs = timer()
    logger.info(
        'functions.luminosity.find_cloudburst_motifs :: sorted_motifs from distance_valid_motifs in %.6f seconds'
        % ((end_sorted_motifs - start_sorted_motifs)))

    start_motifs_check = timer()

    snippet_timestamps = [int(item[0]) for item in snippet]
    for motif in sorted_motifs:
        try:

            add_match = False

            metric = motif[0]
            best_index = motif[1]
            best_dist = motif[2]

            # motif_sequence = motif[3]

            # @modified 20210419 - Feature #4014: Ionosphere - inference
            # Added batch_size
            motif_size = motif[4]

            motif_timestamp = motif[6]

            add_match = True
            match_type = 'distance'

            if motif in exact_matches_found:
                match_type = 'exact'
                if debug_logging:
                    logger.debug(
                        'debug :: functions.luminosity.find_cloudburst_motifs :: exact match: %s'
                        % (str(motif)))

            full_relate_timeseries = timeseries
            relate_timeseries = [
                item for index, item in enumerate(full_relate_timeseries)
                if index >= best_index and index < (best_index + motif_size)
            ]
            relate_dataset = [item[1] for item in relate_timeseries]
            # relate_dataset_timestamps = [int(item[0]) for item in relate_timeseries]

            matched_period_timestamps = [
                int(item[0]) for item in relate_timeseries
            ]
            in_period = False
            for matched_period_timestamp in matched_period_timestamps:
                if matched_period_timestamp in snippet_timestamps:
                    in_period = True
            if not in_period:
                add_match = False

            if add_match:
                timestamp = int(relate_timeseries[-1][0])
                timeseries_matched[metric][timestamp]['motif_matches'][
                    motif_timestamp] = motif_size

                motif_id = '%s-%s-%s' % (str(metric), str(int(
                    snippet[-1][0])), str(best_index))
                matched_motifs[motif_id] = {}
                matched_motifs[motif_id]['index'] = best_index
                matched_motifs[motif_id]['distance'] = best_dist
                matched_motifs[motif_id]['size'] = motif_size
                matched_motifs[motif_id]['timestamp'] = timestamp
                matched_motifs[motif_id][
                    'matched_period_timestamps'] = matched_period_timestamps
                matched_motifs[motif_id]['motif_timestamp'] = motif_timestamp
                matched_motifs[motif_id]['type'] = match_type
                matched_motifs[motif_id]['type_id'] = motif_match_types[
                    match_type]
                runtime_end = timer()
                matched_motifs[motif_id]['runtime'] = (runtime_end - start)

                # if SINGLE_MATCH:
                #     break
        except Exception as e:
            logger.error(traceback.format_exc())
            logger.error(
                'error :: functions.luminosity.find_cloudburst_motifs :: metric %s and motif: %s - %s'
                % (str(metric), str(motif), str(e)))
            continue
    end_motifs_check = timer()
    logger.info(
        'functions.luminosity.find_cloudburst_motifs :: motifs checked in %.6f seconds'
        % ((end_motifs_check - start_motifs_check)))
    if print_output:
        print(
            'functions.luminosity.find_cloudburst_motifs :: motifs checked in %.6f seconds'
            % ((end_motifs_check - start_motifs_check)))

    # Sort by distance AND area_percent_diff
    sorted_ordered_matched_motifs_list = []
    if matched_motifs and len(matched_motifs) > 1:
        ordered_matched_motifs_list = []
        for motif_id in list(matched_motifs.keys()):
            distance = matched_motifs[motif_id]['distance']
            ordered_matched_motifs_list.append([motif_id, distance])
        # If the areas under the curve were calculated, the
        # list could be sorted by area_percent_diff then by
        # distance.
        sorted_matched_motifs = {}
        sorted_ordered_matched_motifs_list = sorted(
            ordered_matched_motifs_list, key=operator.itemgetter(1))
        logger.info(
            'functions.luminosity.find_cloudburst_motifs :: sorting %s matched_motifs by distance'
            % (str(len(sorted_ordered_matched_motifs_list))))

        for motif_id, distance, in sorted_ordered_matched_motifs_list:
            sorted_matched_motifs[motif_id] = matched_motifs[motif_id]
            # if SINGLE_MATCH:
            #     break
        matched_motifs = sorted_matched_motifs.copy()

    end = timer()
    if dev_null:
        del dev_null
    logger.info(
        'functions.luminosity.find_cloudburst_motifs :: %s motif best match found from %s motifs_found and it took a total of %.6f seconds (all mass2/mass3) to process %s'
        % (
            # str(len(matched_motifs)), str(len(motifs_found)), str(len(fps_checked_for_motifs)),
            str(len(matched_motifs)),
            str(len(motifs_found)),
            (end - start),
            metric))
    if len(matched_motifs) > 0:
        if print_output:
            print(
                'functions.luminosity.find_cloudburst_motifs :: %s motif best match found from %s distance valid motifs of %s motifs_found and it took a total of %.6f seconds (all mass2/mass3) to process %s'
                % (
                    # str(len(matched_motifs)), str(len(motifs_found)), str(len(fps_checked_for_motifs)),
                    str(len(matched_motifs)),
                    str(len(distance_valid_motifs)),
                    str(len(motifs_found)),
                    (end - start),
                    metric))
            distances = []
            for match in list(matched_motifs.keys()):
                distances.append(matched_motifs[match]['distance'])
            distances_dict = {}
            distances_dict['avg_distance'] = sum(distances) / len(distances)
            distances_dict['distances'] = distances
            print('%s' % str(distances_dict))

    # return matched_motifs, fps_checked_for_motifs
    return matched_motifs, timeseries_matched
예제 #5
0
# mass2: distances between `query` and `ts`.
distances = mts.mass2(ts, query)

# mass3 variant, left disabled:
# distances = mts.mass3(ts, query, 256)

# mass2_batch
# Runs the search as a multi-threaded batch job (n_jobs = -1 asks for all
# cpu cores) and keeps the top 5 matches.  batch_size partitions the time
# series for the subsequence similarity search; even single threaded on a
# large time series this is much more memory efficient than MASS2 alone.
# Note that `distances` is rebound here, replacing the mass2 result above.
batch_size, top_matches, n_jobs = 10000, 5, -1
indices, distances = mts.mass2_batch(
    ts,
    query,
    batch_size=batch_size,
    top_matches=top_matches,
    n_jobs=n_jobs,
)

# Position of the smallest distance.
# NOTE(review): this is a position within `distances` as returned by
# mass2_batch; presumably `indices[min_idx]` maps it back to a position in
# `ts` - confirm before using it as a time series index.
min_idx = np.argmin(distances)

# The motif and discord lookups share the same settings: top 4 starting
# indices with an exclusion zone of 25.
k, exclusion_zone = 4, 25

# top 4 motif starting indices
top_motifs = mts.top_k_motifs(distances, k, exclusion_zone)

# top 4 discord starting indices
top_discords = mts.top_k_discords(distances, k, exclusion_zone)
예제 #6
0
def on_demand_motif_analysis(metric, timestamp, similarity, batch_size,
                             top_matches, max_distance, range_padding,
                             max_area_percent_diff):
    """
    Process a motif similarity search on demand
    """
    import numpy as np
    import mass_ts as mts

    logger = logging.getLogger(skyline_app_logger)
    dev_null = None
    function_str = 'on_demand_motif_analysis'
    logger.info(
        '%s :: with parameters :: metric: %s, timestamp: %s, similarity: %s, batch_size:%s, top_matches: %s, max_distance: %s, range_padding: %s, max_area_percent_diff: %s'
        % (function_str, str(metric), str(timestamp), str(similarity),
           str(batch_size), str(top_matches), str(max_distance),
           str(range_padding), str(max_area_percent_diff)))
    trace = 'none'
    fail_msg = 'none'

    start = time.time()
    start_timer = timer()
    metric_vars_dict = {}
    metric_id = 0
    fp_ids = []
    timeseries = []
    not_similar_enough_sample = 0
    not_similar_motifs = 0
    similar_motifs = 0
    exact_motifs = 0
    distance_motifs = 0
    motifs_found = []
    find_exact_matches_run = False
    exact_matches_found = []
    fps_timeseries = {}
    # A motif_analysis dict to add to and return
    motif_analysis = {}
    motif_analysis[metric] = {}
    motif_analysis[metric]['timestamp'] = int(timestamp)
    motif_analysis[metric]['started'] = start
    motif_analysis[metric]['motifs'] = {}
    motif_analysis[metric]['exact_motifs'] = exact_motifs
    motif_analysis[metric]['similar_motifs'] = similar_motifs
    motif_analysis[metric]['not_similar_motifs'] = not_similar_motifs
    motif_analysis[metric][
        'not_similar_enough_sample'] = not_similar_enough_sample
    # @added 20210417 - Feature #4014: Ionosphere - inference
    # Allow the user to define the batch_size per similarity search
    motif_analysis[metric]['batch_size'] = int(batch_size)
    motif_analysis[metric]['top_matches'] = int(top_matches)
    motif_analysis[metric]['max_distance'] = float(max_distance)
    # @added 20210425 - Feature #4014: Ionosphere - inference
    # Added max_area_percent_diff for computing the area under the curve
    motif_analysis[metric]['max_area_percent_diff'] = float(
        max_area_percent_diff)

    fps_checked_for_motifs = []

    metric_dir = metric.replace('.', '/')
    metric_timeseries_dir = '%s/%s/%s' % (settings.IONOSPHERE_DATA_FOLDER,
                                          str(timestamp), metric_dir)

    # @added 20210418 - Feature #4014: Ionosphere - inference
    # Allow for the similarity search on saved_training_data
    if 'saved_training_data' in request.args:
        saved_training_data_str = request.args.get('saved_training_data',
                                                   'false')
        if saved_training_data_str == 'true':
            saved_metric_timeseries_dir = '%s_saved/%s/%s' % (
                settings.IONOSPHERE_DATA_FOLDER, str(timestamp), metric_dir)
            if path.exists(saved_metric_timeseries_dir):
                metric_timeseries_dir = saved_metric_timeseries_dir
                logger.info('%s :: using saved training_data dir - %s' %
                            (function_str, saved_metric_timeseries_dir))

    metric_vars_file = '%s/%s.txt' % (metric_timeseries_dir, metric)
    timeseries_json = '%s/%s.json' % (metric_timeseries_dir, metric)
    full_duration_in_hours = int(settings.FULL_DURATION / 60 / 60)
    full_duration_timeseries_json = '%s/%s.mirage.redis.%sh.json' % (
        metric_timeseries_dir, metric, str(full_duration_in_hours))
    try:
        metric_vars_dict = mirage_load_metric_vars(skyline_app,
                                                   metric_vars_file, True)
    except Exception as e:
        logger.error(
            'error :: inference :: failed to load metric variables from check file - %s - %s'
            % (metric_vars_file, e))
    if not metric_vars_dict:
        motif_analysis[metric]['status'] = 'error'
        motif_analysis[metric][
            'reason'] = 'could not load training data variables'
        return motif_analysis

    full_duration = metric_vars_dict['metric_vars']['full_duration']

    # Determine the metric details from the database
    metric_id = 0
    metric_db_object = {}
    try:
        metric_db_object = get_metrics_db_object(metric)
    except Exception as e:
        logger.error('error :: %s :: failed to get_metrics_db_object - %s' %
                     (function_str, e))
    try:
        metric_id = int(metric_db_object['id'])
    except Exception as e:
        logger.error(
            'error :: %s :: failed to determine metric_id from metric_db_object %s - %s'
            % (function_str, str(metric_db_object), e))
        metric_id = 0
    if not metric_id:
        logger.error(
            'error :: %s :: failed to get metric id for %s from the database' %
            (function_str, str(metric)))
        fail_msg = 'failed to get metric id'
        motif_analysis[metric]['status'] = 'error'
        motif_analysis[metric]['reason'] = 'could not determine metric id'
        return motif_analysis, fail_msg, trace

    # @modified 20210419 - Feature #4014: Ionosphere - inference
    # Create a unique dir for each batch_size max_distance
    # motif_images_dir = '%s/motifs' % metric_timeseries_dir
    motif_images_dir = '%s/motifs/batch_size.%s/top_matches.%s/max_distance.%s' % (
        metric_timeseries_dir, str(batch_size), str(top_matches),
        str(max_distance))

    if not path.exists(motif_images_dir):
        # provision motifs image resources
        mkdir_p(motif_images_dir)

    full_durations = [full_duration]
    if path.isfile(full_duration_timeseries_json):
        full_durations = [full_duration, settings.FULL_DURATION]
    logger.info('%s :: full_durations - %s' %
                (function_str, str(full_durations)))

    # Loop through analysis per full_duration
    for full_duration in full_durations:
        start_full_duration = timer()
        fp_ids = []
        try:
            query = 'SELECT id,last_matched from ionosphere WHERE metric_id=%s AND full_duration=%s AND enabled=1 ORDER BY last_matched DESC' % (
                str(metric_id), str(full_duration))
            results = mysql_select(skyline_app, query)
            for row in results:
                fp_ids.append(int(row[0]))
        except Exception as e:
            logger.error(
                'error :: %s :: failed to get fp ids via mysql_select from %s - %s'
                % (function_str, metric, e))

        logger.info('%s :: metric_id: %s, full_duration: %s, fp_ids: %s' %
                    (function_str,
                     (metric_id), str(full_duration), str(fp_ids)))

        if not fp_ids:
            continue

        # Now there are known fps, load the timeseries
        if full_duration == settings.FULL_DURATION:
            timeseries_json_file = full_duration_timeseries_json
        else:
            timeseries_json_file = timeseries_json
        try:
            with open((timeseries_json_file), 'r') as f:
                raw_timeseries = f.read()
            timeseries_array_str = str(raw_timeseries).replace('(',
                                                               '[').replace(
                                                                   ')', ']')
            del raw_timeseries
            timeseries = literal_eval(timeseries_array_str)
            del timeseries_array_str
        except Exception as e:
            logger.error(
                'error :: %s :: failed to load timeseries for %s from %s - %s'
                % (function_str, metric, timeseries_json_file, e))
            continue

        anomalous_timeseries_subsequence = []
        for timestamp_float, value in timeseries[-int(batch_size):]:
            anomalous_timeseries_subsequence.append(
                [int(timestamp_float), value])

        logger.info(
            '%s :: looking for motif in trained fps of full_duration: %s' %
            (function_str, (full_duration)))
        dataset = [float(item[1]) for item in anomalous_timeseries_subsequence]

        max_y = max(dataset)
        min_y = min(dataset)

        # full_y_range = max_y - min_y

        # range_padding_percent = range_padding
        # This was just a test that did not have the desired results
        # if full_y_range < 10:
        #     range_padding_percent = 35
        # if full_y_range < 5:
        #     range_padding_percent = 75
        # if full_y_range < 2:
        #    range_padding_percent = 100

        use_range_padding = ((max_y - min_y) / 100) * range_padding
        if min_y > 0 and (min_y - use_range_padding) > 0:
            min_y_padded = min_y - use_range_padding
        else:
            min_y_padded = min_y
        max_y_padded = max_y + use_range_padding
        if min_y_padded == max_y_padded:
            min_y_padded = min_y_padded - (
                (min_y_padded / 100) * range_padding)
            max_y_padded = max_y_padded + (
                (max_y_padded / 100) * range_padding)

        # anomalous_ts = np.array(dataset)
        anomalous_ts = dataset

        mass2_batch_times = []
        exact_match_times = []

        nan = np.array([np.nan])
        nanj = complex(0.0, float('nan'))
        empty_dists = np.array(nan + nanj)

        # plotted = False
        count = 0

        # fp_ids = [fp_id for index, fp_id in enumerate(fp_ids) if index == 0]

        # motifs_found = []
        # exact_matches_found = []
        # fps_timeseries = {}

        for fp_id in fp_ids:
            if (time.time() - start) >= 20:
                break
            # Attempt to surface the fp timeseries from memcache and/or db
            # @modified 20210424 - Feature #4014: Ionosphere - inference
            #                      Task #4030: refactoring
            fp_timeseries = None
            try:
                fp_timeseries = get_fp_timeseries(skyline_app, metric_id,
                                                  fp_id)
            except Exception as e:
                logger.error(
                    'inference :: did not get fp timeseries with get_fp_timeseries(%s, %s, %s) - %s'
                    % (skyline_app, str(metric_id), str(fp_id), e))
            if not fp_timeseries:
                continue

            relate_dataset = [float(item[1]) for item in fp_timeseries]

            fps_timeseries[fp_id] = fp_timeseries

            current_best_indices = []
            current_best_dists = []
            best_indices = None
            best_dists = None

            try:
                logger.info(
                    '%s :: running mts.mass2_batch fp_id: %s, full_duration: %s, batch_size: %s, top_matches: %s, max_distance: %s, motif_size: %s'
                    % (function_str, str(fp_id), str(full_duration),
                       str(batch_size), str(top_matches), str(max_distance),
                       str(len(anomalous_ts))))

                # @added 20210418 - Feature #4014: Ionosphere - inference
                # Handle top_matches being greater than possible kth that can be found
                # mts.mass2_batch error: kth(=50) out of bounds (16)
                use_top_matches = int(top_matches)
                if (len(fp_timeseries) / int(batch_size)) <= int(top_matches):
                    use_top_matches = round(
                        len(fp_timeseries) / int(batch_size)) - 1
                    if use_top_matches == 2:
                        use_top_matches = 1
                    logger.info(
                        '%s :: adjusting top_matches to %s (the maximum possible top - 1) as kth(=%s) will be out of bounds mts.mass2_batch'
                        %
                        (function_str, str(use_top_matches), str(top_matches)))

                start_mass2_batch = timer()
                # @modified 20210418 - Feature #4014: Ionosphere - inference
                # Handle top_matches being greater than possible kth that can be found
                # best_indices, best_dists = mts.mass2_batch(relate_dataset, anomalous_ts, batch_size=int(batch_size), top_matches=int(top_matches))
                best_indices, best_dists = mts.mass2_batch(
                    relate_dataset,
                    anomalous_ts,
                    batch_size=int(batch_size),
                    top_matches=int(use_top_matches))
                end_mass2_batch = timer()
                mass2_batch_times.append((end_mass2_batch - start_mass2_batch))
                current_best_indices = best_indices.tolist()
                current_best_dists = best_dists.tolist()

                # @added 20210412 - Feature #4014: Ionosphere - inference
                #                   Branch #3590: inference
                # Add fp_id to fps_checked_for_motifs to enable ionosphere to update the
                # motif related columns in the ionosphere database table
                fps_checked_for_motifs.append(fp_id)
            except Exception as e:
                logger.error('error :: %s :: %s mts.mass2_batch error: %s' %
                             (function_str, (fp_id), str(e)))
                continue

            try:
                if str(list(best_dists)) == str(list(empty_dists)):
                    logger.info(
                        '%s :: mts.mass2_batch no similar motif from fp id %s - best_dists: %s'
                        % (function_str, (fp_id), str(list(best_dists))))
                    continue
            except Exception as e:
                dev_null = e

            if not current_best_indices[0]:
                continue
            # if list(best_indices)[0] != anomalous_index:
            #     continue
            # If the best_dists is > 1 they are not very similar
            # if list(best_dists)[0].real > 1.0:
            #     continue
            # if list(best_indices)[0] and best_dists:
            for index, best_dist in enumerate(current_best_dists):
                try:
                    motif_added = False
                    """
                    Note: mass_ts finds similar motifs NOT the same motif, the same motif
                    will result in the best_dists being a nan+nanj
                    So it is DIYed
                    """
                    try:
                        # @modified 20210414 - Feature #4014: Ionosphere - inference
                        #                      Branch #3590: inference
                        # Store the not anomalous motifs
                        # motif = [fp_id, current_best_indices[index], best_dist.real]
                        motif = [
                            fp_id, current_best_indices[index], best_dist.real,
                            anomalous_timeseries_subsequence, full_duration
                        ]
                    except Exception as e:
                        dev_null = e
                        motif = []

                    # if list(best_indices)[0] and best_dists:
                    # If it is greater than 1.0 it is not similar
                    # if best_dist.real > 1.0:
                    # if best_dist.real > IONOSPHERE_INFERENCE_MASS_TS_MAX_DISTANCE:
                    if best_dist.real > float(max_distance):
                        continue
                    else:
                        if motif:
                            count += 1
                            motifs_found.append(motif)
                            motif_added = True
                    if not motif_added:
                        if best_dist == nanj:
                            count += 1
                            motifs_found.append(motif)
                            motif_added = True
                    if not motif_added:
                        if str(best_dist) == 'nan+nanj':
                            count += 1
                            motifs_found.append([
                                fp_id, current_best_indices[index], 0.0,
                                anomalous_timeseries_subsequence, full_duration
                            ])
                            motif_added = True
                    if not motif_added:
                        if best_dist == empty_dists:
                            count += 1
                            motifs_found.append(motif)
                            motif_added = True
                except Exception as e:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: %s :: could not determine is if fp id %s timeseries at index %s was a match - %s'
                        % (function_str, str(fp_id),
                           str(current_best_indices[index]), e))
                    continue

            # FIND EXACT MATCHES
            # Seeing as I cannot reproduce finding nan+nanj which represents an
            # exact match with mts.mass2_batch, do it DIY style - iterate the
            # timeseries and create a batch_size subsequence for every index and
            # compare the values to the anomalous_ts for an exact match.
            # This takes ~0.024850 seconds on a timeseries with 10079 datapoints
            # @modified 20210418 - Feature #4014: Ionosphere - inference
            # However finding exact matches can add ~2.5 seconds on a 90 minute
            # batch_size and with a proportionally scaled max_distance of say 15
            # finding an exact match in a longer sequence is less important,
            # the greater the batch_size the more likely the greater the variability
            # and the chance of an exact match decreases.  So save 2.5 seconds.
            # UNLESS
            # At a 5 (to 10) batch_size and max_distance of 1.0 an exact match
            # can be found. Exact matches are quite frequent and sometimes with
            # such little variability, similar matches may not be found.
            # Therefore find_exact_matches has its place.  MASS
            # A CAVEAT here is that boring metrics that change and have a
            # low variability even at a larger batch_size could also benefit and
            # possibly achieve better accuracy from the use of find_exact_matches
            # as their shapelets can resemble a batch_size 5 shapelet.
            # It would perhaps be possible to use one or more of the features
            # profile tsfresh values to identify these types of shapelets, if
            # you knew which feature/s were most descriptive of this type of
            # shapelet, e.g. 'value__skewness': 3.079477685394873, etc (maybe)
            # However I predict that this method will perform worst on these
            # types of shapelets.
            # find_exact_matches = False
            # exact matches can be found in batch sizes of 500 and similar not
            # So actually always run it.
            find_exact_matches = True
            find_exact_matches_run = True

            if int(batch_size) < 10:
                find_exact_matches = True
                find_exact_matches_run = True

            if find_exact_matches:
                try:
                    start_exact_match = timer()
                    indexed_relate_dataset = []
                    for index, item in enumerate(relate_dataset):
                        indexed_relate_dataset.append([index, item])
                    last_index = indexed_relate_dataset[-1][0]
                    current_index = 0
                    while current_index < last_index:
                        subsequence = [
                            value for index, value in
                            indexed_relate_dataset[current_index:(
                                current_index + int(batch_size))]
                        ]
                        if subsequence == anomalous_ts:
                            exact_matches_found.append([
                                fp_id, current_index, 0.0,
                                anomalous_timeseries_subsequence, full_duration
                            ])
                            motifs_found.append([
                                fp_id, current_index, 0.0,
                                anomalous_timeseries_subsequence, full_duration
                            ])
                        current_index += 1
                    end_exact_match = timer()
                    exact_match_times.append(
                        (end_exact_match - start_exact_match))
                except Exception as e:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: %s :: could not determine it any exact matches could be found in fp id %s timeseries - %s'
                        % (function_str, str(fp_id), e))

        logger.info(
            '%s :: mts.mass2_batch runs on %s fps of full_duration %s in %.6f seconds'
            % (function_str, str(len(mass2_batch_times)), str(full_duration),
               sum(mass2_batch_times)))
        if find_exact_matches_run:
            logger.info(
                '%s :: exact_match runs on %s fps of full_duration %s in %.6f seconds'
                % (function_str, str(len(exact_match_times)),
                   str(full_duration), sum(exact_match_times)))
        end_full_duration = timer()
        logger.info(
            '%s :: analysed %s fps of full_duration %s in %.6f seconds' %
            (function_str, str(len(fp_ids)), str(full_duration),
             (end_full_duration - start_full_duration)))

        # Patterns are sorted
        sorted_motifs = []
        motifs_found_in_fps = []
        if motifs_found:
            sorted_motifs = sorted(motifs_found, key=lambda x: x[2])
            for item in sorted_motifs:
                motifs_found_in_fps.append(item[0])
        logger.info('%s :: %s motifs found' %
                    (function_str, str(len(sorted_motifs))))

        for motif in sorted_motifs:
            if (time.time() - start) >= 25:
                break
            try:
                add_match = False
                all_in_range = False

                fp_id = motif[0]
                best_index = motif[1]
                best_dist = motif[2]

                # @added 20210414 - Feature #4014: Ionosphere - inference
                #                   Branch #3590: inference
                # Store the not anomalous motifs
                motif_sequence = motif[3]

                motif_full_duration = motif[4]

                match_type = 'not_similar_enough'

                if motif in exact_matches_found:
                    add_match = True
                    match_type = 'exact'
                    all_in_range = True
                    exact_motifs += 1
                full_relate_timeseries = fps_timeseries[fp_id]
                # full_relate_dataset = [float(item[1]) for item in full_relate_timeseries]
                relate_timeseries = [
                    item for index, item in enumerate(full_relate_timeseries)
                    if index >= best_index and index < (best_index +
                                                        int(batch_size))
                ]
                relate_dataset = [item[1] for item in relate_timeseries]

                if not add_match:
                    all_in_range = True
                    for value in relate_dataset:
                        if value < min_y_padded:
                            all_in_range = False
                            break
                        if value > max_y_padded:
                            all_in_range = False
                            break
                    if all_in_range:
                        related_max_y = max(relate_dataset)
                        if related_max_y < (max_y - range_padding):
                            all_in_range = False
                        if related_max_y > (max_y + range_padding):
                            all_in_range = False
                        related_min_y = min(relate_dataset)
                        if related_min_y < (min_y - range_padding):
                            all_in_range = False
                        if related_min_y > (min_y + range_padding):
                            all_in_range = False
                    if all_in_range:
                        logger.info(
                            '%s :: ALL IN RANGE - all_in_range: %s, motif: %s'
                            % (function_str, str(all_in_range),
                               str(relate_dataset[0:2])))
                        add_match = True
                        match_type = 'all_in_range'
                        similar_motifs += 1

                    # @added 20210425 - Feature #4014: Ionosphere - inference
                    # Compute the area using the composite trapezoidal rule.
                    motif_area = None
                    fp_motif_area = None
                    percent_different = None
                    try:
                        batch_size_dataset = [
                            float(item[1]) for item in motif_sequence
                        ]
                        y_motif = np.array(batch_size_dataset)
                        motif_area = np.trapz(y_motif, dx=1)
                    except Exception as e:
                        logger.error(
                            'error :: %s :: failed to get motif_area with np.trapz - %s'
                            % (function_str, e))
                    try:
                        y_fp_motif = np.array(relate_dataset)
                        fp_motif_area = np.trapz(y_fp_motif, dx=1)
                    except Exception as e:
                        logger.error(
                            'error :: %s :: failed to get fp_motif_area with np.trapz - %s'
                            % (function_str, e))
                    # Determine the percentage difference (as a
                    # positive value) of the areas under the
                    # curves.
                    if motif_area and fp_motif_area:
                        percent_different = get_percent_different(
                            fp_motif_area, motif_area, True)
                        if percent_different > max_area_percent_diff:
                            if add_match:
                                logger.info(
                                    '%s :: AREA TOO DIFFERENT - not adding all_in_range match'
                                    % (function_str))
                                add_match = False
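                            # BUT ... a very small distance (best_dist < 3)
                            # still qualifies as a 'distance' match even though
                            # the area difference exceeds max_area_percent_diff.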
                            if best_dist < 3 and not add_match:
                                logger.info(
                                    '%s :: DISTANCE VERY SIMILAR - adding match even though area_percent_diff is greater than max_area_percent_diff because best_dist: %s'
                                    % (function_str, str(best_dist)))
                                add_match = True
                                match_type = 'distance'
                                distance_motifs += 1

                if similarity == 'all':
                    if not add_match:
                        not_similar_motifs += 1
                        if not_similar_enough_sample >= 10:
                            continue
                        not_similar_enough_sample += 1
                        add_match = True
                        match_type = 'not_similar_enough'

                if add_match:
                    generation = 0
                    fp_id_row = None
                    try:
                        fp_id_row = get_ionosphere_fp_db_row(
                            skyline_app, int(fp_id))
                    except Exception as e:
                        logger.error(
                            'error :: %s :: failed to get_ionosphere_fp_db_row for fp_id %s - %s'
                            % (function_str, str(fp_id), e))
                    if fp_id_row:
                        try:
                            generation = fp_id_row['generation']
                        except Exception as e:
                            logger.error(
                                'error :: %s :: failed to generation from fp_id_row for fp_id %s - %s'
                                % (function_str, str(fp_id), e))
                    if generation == 0:
                        generation_str = 'trained'
                    else:
                        generation_str = 'LEARNT'
                    motif_match_types = motif_match_types_dict()
                    type_id = motif_match_types[match_type]

                    motif_id = '%s-%s' % (str(fp_id), str(best_index))
                    motif_analysis[metric]['motifs'][motif_id] = {}
                    motif_analysis[metric]['motifs'][motif_id][
                        'metric_id'] = metric_id
                    motif_analysis[metric]['motifs'][motif_id]['fp_id'] = fp_id
                    motif_analysis[metric]['motifs'][motif_id][
                        'generation'] = generation
                    motif_analysis[metric]['motifs'][motif_id][
                        'index'] = best_index
                    motif_analysis[metric]['motifs'][motif_id][
                        'distance'] = best_dist
                    motif_analysis[metric]['motifs'][motif_id]['size'] = int(
                        batch_size)
                    motif_analysis[metric]['motifs'][motif_id][
                        'max_distance'] = float(max_distance)
                    motif_analysis[metric]['motifs'][motif_id][
                        'timestamp'] = timestamp
                    motif_analysis[metric]['motifs'][motif_id][
                        'type_id'] = type_id
                    motif_analysis[metric]['motifs'][motif_id][
                        'type'] = match_type
                    motif_analysis[metric]['motifs'][motif_id][
                        'full_duration'] = motif_full_duration
                    # @added 20210414 - Feature #4014: Ionosphere - inference
                    #                   Branch #3590: inference
                    # Store the not anomalous motifs
                    motif_analysis[metric]['motifs'][motif_id][
                        'motif_timeseries'] = anomalous_timeseries_subsequence
                    motif_analysis[metric]['motifs'][motif_id][
                        'motif_sequence'] = motif_sequence
                    not_anomalous_timestamp = int(
                        anomalous_timeseries_subsequence[-1][0])
                    graph_period_seconds = not_anomalous_timestamp - int(
                        anomalous_timeseries_subsequence[0][0])
                    motif_analysis[metric]['motifs'][motif_id][
                        'motif_period_seconds'] = graph_period_seconds
                    motif_analysis[metric]['motifs'][motif_id][
                        'motif_period_minutes'] = round(graph_period_seconds /
                                                        60)

                    motif_analysis[metric]['motifs'][motif_id]['image'] = None

                    motif_analysis[metric]['motifs'][motif_id][
                        'motif_area'] = motif_area
                    motif_analysis[metric]['motifs'][motif_id][
                        'fp_motif_area'] = fp_motif_area
                    motif_analysis[metric]['motifs'][motif_id][
                        'area_percent_diff'] = percent_different
                    motif_analysis[metric]['motifs'][motif_id][
                        'max_area_percent_diff'] = max_area_percent_diff

                    if (time.time() - start) >= 25:
                        continue

                    graph_image_file = '%s/motif.%s.%s.%s.with_max_distance.%s.png' % (
                        motif_images_dir, motif_id, match_type,
                        str(batch_size), str(max_distance))
                    plotted_image = False
                    on_demand_motif_analysis = True
                    if not path.isfile(graph_image_file):
                        plotted_image, plotted_image_file = plot_motif_match(
                            skyline_app, metric, timestamp, fp_id,
                            full_duration, generation_str, motif_id,
                            best_index, int(batch_size), best_dist, type_id,
                            relate_dataset, anomalous_timeseries_subsequence,
                            graph_image_file, on_demand_motif_analysis)
                    else:
                        plotted_image = True
                        logger.info('%s :: plot already exists - %s' %
                                    (function_str, str(graph_image_file)))
                    if plotted_image:
                        motif_analysis[metric]['motifs'][motif_id][
                            'image'] = graph_image_file
                    else:
                        logger.error('failed to plot motif match plot')
                        graph_image_file = None
            except Exception as e:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: inference :: with fp id %s proceesing motif at index: %s - %s'
                    % (str(fp_id), str(motif[0]), str(e)))
                continue
    end_timer = timer()
    motif_analysis[metric]['fps_checked'] = fps_checked_for_motifs
    motif_analysis[metric]['exact_motifs'] = exact_motifs
    motif_analysis[metric]['similar_motifs'] = similar_motifs
    motif_analysis[metric]['distance_motifs'] = distance_motifs
    motif_analysis[metric]['not_similar_motifs'] = not_similar_motifs
    motif_analysis[metric][
        'not_similar_enough_sample'] = not_similar_enough_sample

    motif_analysis_file = '%s/motif.analysis.similarity_%s.batch_size_%s.top_matches_%s.max_distance_%s.dict' % (
        motif_images_dir, similarity, str(batch_size), str(top_matches),
        str(max_distance))
    try:
        write_data_to_file(skyline_app, motif_analysis_file, 'w',
                           str(motif_analysis))
    except Exception as e:
        trace = traceback.format_exc()
        logger.error('%s' % trace)
        fail_msg = '%s :: error :: failed to write motif_analysis_file - %s' % (
            function_str, motif_analysis_file)
        logger.error('%s' % fail_msg)
        dev_null = e

    motif_ids = list(motif_analysis[metric]['motifs'].keys())
    logger.info(
        '%s :: %s motif matches found, %s fps where checked and motifs plotted in %.6f seconds for %s'
        % (function_str, str(len(motif_ids)), str(len(fps_checked_for_motifs)),
           (end_timer - start_timer), metric))
    if dev_null:
        del dev_null
    return motif_analysis, fail_msg, trace