import numpy as np
import mass_ts as mts


def sim(main, query, sr=22050 // 4):  # main is the good track
    batch_sz = len(main) // 10
    idxs, dists = mts.mass2_batch(main, query, batch_size=batch_sz,
                                  top_matches=len(main) // batch_sz - 1,
                                  n_jobs=6)
    return idxs[np.argsort(dists)]
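# A minimal driver for sim() above, assuming mass_ts and numpy are installed
# and that `main` is a 1-D array of audio samples (the sr=22050 // 4 default
# suggests downsampled 22.05 kHz audio). The signals here are synthetic,
# illustrative stand-ins, not real tracks.
if __name__ == '__main__':
    rng = np.random.default_rng(0)
    main = np.sin(np.linspace(0, 200 * np.pi, 60000)) + rng.normal(0, 0.1, 60000)
    # a noisy copy of a subsequence, so it is similar but not an exact match
    query = main[12000:12500] + rng.normal(0, 0.05, 500)
    ranked = sim(main, query)  # match indices ordered best first
    print('best match starts near index: %s' % str(ranked[0]))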
def test_mass2_batch_robotdog_multi_threaded():
    """Sanity check that compares results from the UCR use case."""
    robot_dog = np.loadtxt(
        os.path.join(MODULE_PATH, '..', 'tests', 'robot_dog.txt'))
    carpet_walk = np.loadtxt(
        os.path.join(MODULE_PATH, '..', 'tests', 'carpet_query.txt'))

    indices, distances = mts.mass2_batch(
        robot_dog, carpet_walk, 1000, top_matches=3, n_jobs=2)

    min_dist_idx = np.argmin(distances)
    min_idx = indices[min_dist_idx]
    assert min_idx == 7479
from datetime import timedelta

import numpy as np
import mass_ts as mass


def f_ts_clustering(param_pe, param_row, param_ca_data, param_ce_data, param_tipo,
                    param_p_ventana, param_cores, param_batch, param_matches):
    """
    Parameters
    ----------
    param_pe : pd.DataFrame : DataFrame with historical prices to mine
    param_row : int : iteration row of the candidate events
    param_ca_data : pd.DataFrame : DataFrame with the candidates to iterate
    param_ce_data : pd.DataFrame : DataFrame with the full economic calendar
    param_tipo : str : price column used to build the time series ('mid')
    param_p_ventana : int : window size for the time series search
    param_cores : int : cores to use for the algorithm
    param_batch : int : batch size for mass2_batch
    param_matches : int : number of top matches to request

    Returns
    -------
    dict_res : dict : anchor id, metrics and the data of the matches found

    Debugging
    ---------
    param_pe = df_precios
    param_row = 4
    param_ca_data = df_ind_3
    param_ce_data = df_ce
    param_tipo = 'mid'
    param_p_ventana = 30
    param_cores = 4
    param_batch = 300
    param_matches = 10
    """

    # row with the information of the candidate trigger event
    candidate_data = param_ca_data.iloc[param_row, :]

    # full data of every occurrence of the candidate trigger event
    df_ancla = param_ce_data[(param_ce_data['esc'] == candidate_data['esc']) &
                             (param_ce_data['name'] == candidate_data['name'])]

    # all the timestamps of the full economic calendar
    ts_serie_ce = list(param_ce_data['timestamp'])

    # initialize the occurrence counters per anchor scenario
    p1, p2, p3, p4 = 0, 0, 0, 0

    # to store partial results
    dict_res = {'ancla': df_ancla['id'].iloc[0], 'metricas': {}, 'datos': {}}

    # -- ------------------------------------------------ OCCURRENCE BY OCCURRENCE -- #
    for ancla in range(0, len(df_ancla['timestamp'])):

        # anchor data used to search forward
        ancla_ocurr = df_ancla.iloc[ancla, :]

        # anchor date
        fecha_ini = ancla_ocurr['timestamp']

        # step back repeatedly to the closest date in order to build serie and serie_p
        while len(param_pe[param_pe['timestamp'] == fecha_ini].index) == 0:
            fecha_ini = fecha_ini - timedelta(minutes=1)

        # take the price timestamp equal to the timestamp of the indicator's first scenario
        ind_ini = param_pe[param_pe['timestamp'] == fecha_ini].index
        # the end date is the start date plus an arbitrary window size
        ind_fin = ind_ini + param_p_ventana

        # build the query series
        df_serie_q = param_pe.copy().loc[ind_ini[0]:ind_fin[0], :]
        df_serie_q = df_serie_q.reset_index(drop=True)
        # take the mid as the value to build the time series
        serie_q = np.array(df_serie_q[param_tipo])

        # build the full series for the search (a 1-dimensional numpy array)
        df_serie = param_pe.copy().loc[ind_ini[0]:, :]
        df_serie = df_serie.reset_index(drop=True)
        # take the mid as the value to build the time series
        serie = np.array(df_serie[param_tipo])

        try:
            # run the algorithm and return the match indices and the distances
            mass_indices, mass_dists = mass.mass2_batch(
                ts=serie, query=serie_q, batch_size=param_batch,
                n_jobs=param_cores, top_matches=param_matches)

            # delete index 0 from the results since it is the query series itself
            origen = np.where(mass_indices == 0)[0][0]
            mass_indices = np.delete(mass_indices, origen)
            # mass_dists = np.delete(mass_dists, origen)

            # reference index of the n-th similar series found
            for indice in mass_indices:

                # DataFrame of the n-th similar pattern series found
                df_serie_p = df_serie.copy().loc[indice:(indice + param_p_ventana), :]

                # extract the initial timestamp to verify whether it matches an indicator
                ts_serie_p = list(df_serie_p['timestamp'])[0]

                # check whether the initial timestamp of each pattern found is
                # equal to any release date in the whole list of indicators
                if ts_serie_p in ts_serie_ce:

                    # ID of the anchor event that generated the forward pattern
                    id_ocurrencia = ancla_ocurr['id'] + '_' + ancla_ocurr['esc'] + \
                        '_' + str(ancla_ocurr['timestamp'])[:-6].replace(' ', '_')

                    match = np.where(param_ce_data['timestamp'] == ts_serie_p)[0]
                    encontrados = param_ce_data.loc[match, :]

                    # -- count and extract the data according to type
                    # Step 1: have a dictionary keyed by the id_ocurrencia found
                    # Step 2: inside the id_ocurrencia key have the 'datos' key
                    dict_res['datos'].update({
                        id_ocurrencia: {
                            'ocurrencias': {},
                            'df_serie_q': df_serie_q,
                            'df_serie_p': df_serie_p
                        }
                    })

                    # Step 3: build the id_sub_ocurrencia keys for every sub-occurrence
                    llaves = [
                        encontrados['id'].iloc[j] + '_' + encontrados['esc'].iloc[j] +
                        '_' + str(encontrados['timestamp'].iloc[j])[:-6].replace(' ', '_')
                        for j in range(0, len(encontrados['id']))
                    ]
                    dict_res['datos'][id_ocurrencia]['ocurrencias'] = llaves

                    enc = (encontrados['name'] == ancla_ocurr['name']) & \
                          (encontrados['esc'] == ancla_ocurr['esc'])

                    # TYPE 1: name == name & esc == esc
                    p1 = p1 + len(encontrados.loc[enc, 'name'])

                    # TYPE 2: name == name
                    p2 = p2 + len(encontrados.loc[
                        encontrados['name'] == ancla_ocurr['name'], 'name'])

                    # TYPE 3: any other indicator in the calendar
                    p3 = p3 + len(encontrados.loc[
                        encontrados['name'] != ancla_ocurr['name'], 'name'])

                    # TYPE 4: outside the calendar
                    p4 = p4 + 0

                else:
                    # TYPE 4: outside the calendar, any other point in time
                    p4 += len(mass_indices)

        except ValueError:
            # ValueError: index problems in MASS-TS
            p4 += 0
        except IndexError:
            # IndexError: index problems in MASS-TS
            p4 += 0

    # add the found cases to the results dictionary
    dict_res.update({
        'metricas': {
            # Same indicator + same scenario as the anchor
            'tipo_1': p1,
            # Same indicator + any scenario
            'tipo_2': p2,
            # Another indicator in the list
            'tipo_3': p3,
            # None of the above
            'tipo_4': p4
        }
    })

    return dict_res
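# The core of f_ts_clustering is: take a window of prices that starts at an
# event timestamp, then search the rest of the series for similar windows with
# mass2_batch. A standalone sketch of just that step, with synthetic prices
# (pandas assumed installed; the column names mirror the function's):
from datetime import datetime

import pandas as pd

np.random.seed(1)
n, ventana = 3000, 30
fechas = [datetime(2020, 1, 1) + timedelta(minutes=i) for i in range(n)]
df_precios = pd.DataFrame({
    'timestamp': fechas,
    'mid': np.sin(np.arange(n) / 10) + np.random.normal(0, 0.05, n)})

ind_ini = 500  # index where a (synthetic) trigger event lands
serie_q = np.array(df_precios['mid'])[ind_ini:ind_ini + ventana + 1]
serie = np.array(df_precios['mid'])[ind_ini:]

mass_indices, mass_dists = mass.mass2_batch(
    ts=serie, query=serie_q, batch_size=300, n_jobs=1, top_matches=10)
print(mass_indices)  # index 0 is the query itself, which f_ts_clustering removes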
def find_cloudburst_motifs(metric, snippet, timeseries, print_output=False):
    """
    Takes a snippet of timeseries, creates motifs at batch_sizes from the
    snippet and searches for those motifs in the given timeseries, and returns
    a matched_motifs dict and a timeseries_matched dict
    """
    logger = logging.getLogger(skyline_app_logger)
    child_process_pid = os.getpid()
    logger.info(
        'functions.luminosity.find_cloudburst_motifs :: running for process_pid - %s for %s' % (
            str(child_process_pid), metric))

    start = timer()
    debug_logging = False
    timeseries_matched = {}
    timeseries_matched[metric] = {}
    matched_motifs = {}
    motifs_found = []
    dev_null = None

    for item in timeseries:
        timestamp = int(item[0])
        timeseries_matched[metric][timestamp] = {}
        timeseries_matched[metric][timestamp]['motif_matches'] = {}

    mass2_batch_times = []
    mass3_times = []
    exact_match_times = []

    nan = np.array([np.nan])
    nanj = complex(0.0, float('nan'))
    empty_dists = np.array(nan + nanj)

    motifs_found = []
    exact_matches_found = []
    motif_match_types = motif_match_types_dict()

    start_full_duration = timer()

    # metric_resolution = determine_data_frequency(skyline_app, timeseries, False)

    logger.info(
        'functions.luminosity.find_cloudburst_motifs :: looking for similar motifs in timeseries of length: %s' % str(len(timeseries)))

    exact_match_times = []

    # relate_dataset = [float(item[1]) for item in fp_timeseries]
    relate_dataset = [float(item[1]) for item in timeseries]

    # namespace_key = 'default_inference_batch_sizes'
    # for batch_size in list(IONOSPHERE_INFERENCE_MOTIFS_SETTINGS[namespace_key].keys()):
    # for batch_size in [len(snippet)]:
    batch_size = len(snippet)

    if print_output:
        print(
            'functions.luminosity.find_cloudburst_motifs :: checking %s at batch_size: %s' % (
                metric, str(batch_size)))

    # @added 20210423 - Feature #4014: Ionosphere - inference
    # The convenience mass2_batch method will not work to find top matches if
    # the number of top_matches to be found is greater than the number of
    # indices in which a match can be found.  In cases such as trying to find:
    # batch_size: 1440, top_matches: 50, max_distance: 30, snippet_length: 1451
    # even setting the top_matches to 1 will result in mass2_batch throwing
    # the error: mts.mass2_batch error: kth(=1) out of bounds (1)
    # So use mass3 as appropriate.
    use_mass3 = False
    use_mass2_batch = True
    n = len(snippet)
    indices = list(range(0, n - batch_size + 1, batch_size))
    # The mass2_batch default is 3, so if there are fewer than 3 indices in
    # which the best matches can be found, use mass3
    if len(indices) < 3:
        use_mass3 = True
        use_mass2_batch = False
        logger.info(
            'functions.luminosity.find_cloudburst_motifs :: batch_size: %s, snippet length: %s, len(indices) < 3, using mass3' % (
                str(batch_size), str(n)))
        if print_output:
            print(
                'functions.luminosity.find_cloudburst_motifs :: batch_size: %s, snippet length: %s, len(indices) < 3, using mass3' % (
                    str(batch_size), str(n)))

    top_matches = 1
    max_distance = 1.8
    find_exact_matches = True

    # if use_mass2_batch:
    logger.info(
        'functions.luminosity.find_cloudburst_motifs :: analysis run - metric: %s, batch_size: %s, top_matches: %s, max_distance: %s, snippet_length: %s' % (
            str(metric), str(batch_size), str(top_matches), str(max_distance),
            str(len(snippet))))
    if print_output:
        print(
            'functions.luminosity.find_cloudburst_motifs :: analysis run - metric: %s, batch_size: %s, top_matches: %s, max_distance: %s, snippet_length: %s' % (
                str(metric), str(batch_size), str(top_matches), str(max_distance),
                str(len(snippet))))

    # Given that the snippet can be any length
    if len(snippet) < batch_size:
        if print_output:
            print(
                'functions.luminosity.find_cloudburst_motifs :: skipping snippet: %s, batch_size: %s' % (
                    str(len(snippet)), str(batch_size)))
        return matched_motifs, timeseries_matched
    else:
        if print_output:
            print(
                'functions.luminosity.find_cloudburst_motifs :: checking %s, batch_size: %s' % (
                    metric, str(batch_size)))

    # Create the subsequence that is being searched for
    n = batch_size
    # snippets = [snippet[i * n:(i + 1) * n] for i in range((len(snippet) + n - 1) // n)]
    # snippets = [snippet]
    # batch_size_anomalous_timeseries_subsequence = timeseries[-batch_size:]
    # batch_size_dataset = [float(item[1]) for item in batch_size_anomalous_timeseries_subsequence]
    # for i_snippet in snippets:
    #     batch_size_timeseries_subsequence = i_snippet[-batch_size:]
    batch_size_timeseries_subsequence = snippet
    batch_size_dataset = [
        float(item[1]) for item in batch_size_timeseries_subsequence
    ]
    motif_timestamp = int(batch_size_timeseries_subsequence[-1][0])

    # Set defaults
    current_best_indices = []
    current_best_dists = []
    best_indices = None
    best_dists = None

    # POC running all through mass3 with maximum pieces (SUPER FAST) and then
    # filtering on max_distance, all_in_range and area percent_different
    # use_mass3 = True
    # use_mass2_batch = False
    # POC running all through mass3 and then filtering FAILED in terms of time
    # taken... due to having to run 22421 motifs through the all_in_range and
    # percent_different functions ... just checking these motifs took
    # 62.036366 seconds, while the surfacing and transforming of the data AND
    # mass3 took only 2 seconds
    # 2021-04-27 13:45:59 :: 3586421 :: functions.luminosity.find_cloudburst_motifs :: analysed 2 fps of full_duration 86400 in 0.330732 seconds
    # 2021-04-27 13:45:59 :: 3586421 :: functions.luminosity.find_cloudburst_motifs :: 22421 distance_valid_motifs determined in 0.346807 seconds from 81432 motifs_found
    # 2021-04-27 13:45:59 :: 3586421 :: functions.luminosity.find_cloudburst_motifs :: sorted_motifs from distance_valid_motifs in 0.048316 seconds
    # 2021-04-27 13:46:01 :: 3586421 :: functions.luminosity.find_cloudburst_motifs :: percent_different in 0.000590 seconds
    # 2021-04-27 13:46:01 :: 3586421 :: functions.luminosity.find_cloudburst_motifs :: percent_different in 0.000271 seconds
    # ...
    # ...
    # 2021-04-27 13:46:57 :: 3586421 :: functions.luminosity.find_cloudburst_motifs :: percent_different in 0.000373 seconds
    # 2021-04-27 13:46:57 :: 3586421 :: functions.luminosity.find_cloudburst_motifs :: percent_different in 0.000381 seconds
    # 2021-04-27 13:46:58 :: 3586421 :: functions.luminosity.find_cloudburst_motifs :: percent_different in 0.000363 seconds
    # 2021-04-27 13:46:58 :: 3586421 :: functions.luminosity.find_cloudburst_motifs :: percent_different in 0.000348 seconds
    # 2021-04-27 13:47:01 :: 3586421 :: functions.luminosity.find_cloudburst_motifs :: motifs checked in 62.036366 seconds
    # 2021-04-27 13:47:01 :: 3586421 :: functions.luminosity.find_cloudburst_motifs :: 0 motif best match found from 81432 motifs_found, 4 fps where checked {604800: {'fp_count': 2}, 86400: {'fp_count': 2}} (motifs remove due to not in range 22325, percent_different 96) and it took a total of 64.761969 seconds (only mass3) to process telegraf.ssdnodes-26840.mariadb.localhost:3306.mysql.bytes_sent
    # 2021-04-27 13:47:01 :: 3586421 :: inference found 0 matching similar motifs, checked 0 fps in 64.790198 seconds

    if use_mass2_batch:
        try:
            # @added 20210419 - Feature #4014: Ionosphere - inference
            # Handle top_matches being greater than the possible kth that can
            # be found, e.g. mts.mass2_batch error: kth(=50) out of bounds (16)
            use_top_matches = int(top_matches)
            if (len(snippet) / int(batch_size)) <= int(top_matches):
                use_top_matches = round(len(snippet) / int(batch_size)) - 2
                if use_top_matches == 2:
                    use_top_matches = 1
                if use_top_matches < 1:
                    use_top_matches = 1
                logger.info(
                    'functions.luminosity.find_cloudburst_motifs :: adjusting top_matches for mass2_batch to %s (the maximum possible top - 1) as top_matches=%s will be out of bounds mts.mass2_batch' % (
                        str(use_top_matches), str(top_matches)))
            start_mass2_batch = timer()
            best_indices, best_dists = mts.mass2_batch(
                relate_dataset, batch_size_dataset, batch_size=batch_size,
                top_matches=use_top_matches)
            end_mass2_batch = timer()
            mass2_batch_times.append((end_mass2_batch - start_mass2_batch))
            current_best_indices = best_indices.tolist()
            current_best_dists = best_dists.tolist()
            logger.info(
                'functions.luminosity.find_cloudburst_motifs :: mass2_batch run on batch_size: %s, top_matches: %s, in %6f seconds' % (
                    str(batch_size), str(use_top_matches),
                    (end_mass2_batch - start_mass2_batch)))
            if print_output:
                print(
                    'functions.luminosity.find_cloudburst_motifs :: mass2_batch run on batch_size: %s, top_matches: %s, in %6f seconds' % (
                        str(batch_size), str(use_top_matches),
                        (end_mass2_batch - start_mass2_batch)))
            if debug_logging:
                logger.debug(
                    'debug :: functions.luminosity.find_cloudburst_motifs :: best_indices: %s, best_dists: %s' % (
                        str(current_best_indices), str(current_best_dists)))
        except ValueError as e:
            # If mass2_batch reports out of bounds, use mass3
            if 'out of bounds' in str(e):
                use_mass3 = True
                best_dists = ['use_mass3']
                logger.info(
                    'functions.luminosity.find_cloudburst_motifs :: mts.mass2_batch will be out of bounds, running mass3')
        except Exception as e:
            logger.error(
                'error :: functions.luminosity.find_cloudburst_motifs :: %s mts.mass2_batch error: %s' % (
                    str(metric), str(e)))
            if print_output:
                print(
                    'error :: functions.luminosity.find_cloudburst_motifs :: %s mts.mass2_batch error: %s' % (
                        str(metric), str(e)))
            return matched_motifs, timeseries_matched

    if not use_mass3:
        try:
            if str(list(best_dists)) == str(list(empty_dists)):
                logger.info(
                    'functions.luminosity.find_cloudburst_motifs :: mts.mass2_batch no similar motif from %s - best_dists: %s' % (
                        str(metric), str(list(best_dists))))
                if print_output:
                    print(
                        'functions.luminosity.find_cloudburst_motifs :: mts.mass2_batch no similar motif from %s - best_dists: %s' % (
                            str(metric), str(list(best_dists))))
                return matched_motifs, timeseries_matched
        except Exception as e:
            dev_null = e

    # @added 20210423 -
    if use_mass3:
        # pieces should be larger than the query length and as many as
        # possible; a power of two would be best, but as many pieces as
        # possible is the best that can be achieved above 265
        query_length = len(batch_size_dataset)
        # if query_length < 256:
        #     pieces = 256
        # else:
        #     pieces = query_length + 2
        pieces = len(snippet) - query_length
        if pieces < query_length:
            pieces = query_length + 2

        check_pieces_length = False
        if check_pieces_length:
            # @modified 20210504 - Feature #4014: Ionosphere - inference
            # Handle the fp_timeseries being the same length (meaning too
            # short) as the query length
            if len(snippet) <= pieces:
                logger.info(
                    'functions.luminosity.find_cloudburst_motifs :: skipping running mass3 with %s pieces on metric: %s, batch_size: %s because snippet length is not long enough for the query size' % (
                        str(pieces), str(metric), str(batch_size)))
                if print_output:
                    print(
                        'functions.luminosity.find_cloudburst_motifs :: skipping running mass3 with %s pieces on metric: %s, batch_size: %s because snippet length is not long enough for the query size' % (
                            str(pieces), str(metric), str(batch_size)))
                return matched_motifs, timeseries_matched

        # @modified 20210505 - Feature #4014: Ionosphere - inference
        # Skip the batch size if the fp_timeseries is a similar length to the
        # batch_size. This was specifically added to reduce errors where there
        # may be missing data points in a timeseries and the lengths are not
        # the same. This was encountered on a batch_size of 1440 with
        # FULL_DURATION 86400 60 second data. A match was never found at a
        # batch_size > 720 on that data, but errors were occasionally
        # encountered.
        ten_percent_of_batch_size = int(batch_size / 10)
        if (len(snippet) - ten_percent_of_batch_size) < batch_size:
            logger.info(
                'functions.luminosity.find_cloudburst_motifs :: skipping running mass3 on metric: %s, batch_size: %s because the batch_size is too close to the length' % (
                    str(metric), str(batch_size)))
            if print_output:
                print(
                    'functions.luminosity.find_cloudburst_motifs :: skipping running mass3 on metric: %s, batch_size: %s because the batch_size is too close to the length' % (
                        str(metric), str(batch_size)))
            return matched_motifs, timeseries_matched

        logger.info(
            'functions.luminosity.find_cloudburst_motifs :: running mass3 with %s pieces on metric: %s, batch_size: %s' % (
                str(pieces), str(metric), str(batch_size)))
        if print_output:
            print(
                'functions.luminosity.find_cloudburst_motifs :: running mass3 with %s pieces on metric: %s, batch_size: %s' % (
                    str(pieces), str(metric), str(batch_size)))

        start_mass3 = timer()
        try:
            best_dists = mts.mass3(relate_dataset, batch_size_dataset, pieces)
            end_mass3 = timer()
        except Exception as e:
            logger.error(
                'error :: functions.luminosity.find_cloudburst_motifs :: metric %s mts.mass3 error: %s' % (
                    str(metric), str(e)))
            if print_output:
                print(
                    'error :: functions.luminosity.find_cloudburst_motifs :: metric %s mts.mass3 error: %s' % (
                        str(metric), str(e)))
            return matched_motifs, timeseries_matched
        mass3_times.append((end_mass3 - start_mass3))

        current_best_dists = best_dists.tolist()

        # Create current_best_indices as mass2_batch returns
        current_best_indices = []
        if len(relate_dataset) > batch_size:
            for index in enumerate(relate_dataset):
                # if index[0] >= (batch_size - 1):
                # The array starts at batch_size + 1
                # if index[0] >= (batch_size + 1):
                # but that fails on the add_motifs comprehension
                # add_motifs = [[fp_id, current_best_indices[index], best_dist.real, batch_size_timeseries_subsequence, batch_size, max_distance, max_area_percent_diff, max_y, min_y, range_padding, min_y_padded, max_y_padded] for index, best_dist in enumerate(current_best_dists)]
                # IndexError: list index out of range
                if index[0] >= (batch_size - 1):
                    current_best_indices.append(index[0])

            # @modified 20210505 - Feature #4014: Ionosphere - inference
            # Handle the query_length being shorter than the batch_size
            if len(current_best_indices) != len(current_best_dists):
                current_best_indices = []
                for index in enumerate(relate_dataset):
                    if index[0] >= (query_length - 1):
                        current_best_indices.append(index[0])

        if len(current_best_indices) != len(current_best_dists):
            logger.info(
                'functions.luminosity.find_cloudburst_motifs :: discarding mass3 results as current_best_dists length: %s, current_best_indices length: %s do not match, took %6f seconds' % (
                    str(len(current_best_dists)), str(len(current_best_indices)),
                    (end_mass3 - start_mass3)))
            if print_output:
                print(
                    'functions.luminosity.find_cloudburst_motifs :: discarding mass3 results as current_best_dists length: %s, current_best_indices length: %s do not match, took %6f seconds' % (
                        str(len(current_best_dists)), str(len(current_best_indices)),
                        (end_mass3 - start_mass3)))
            return matched_motifs, timeseries_matched

        logger.info(
            'functions.luminosity.find_cloudburst_motifs :: mass3 run, current_best_dists length: %s, current_best_indices length: %s, took %6f seconds' % (
                str(len(current_best_dists)), str(len(current_best_indices)),
                (end_mass3 - start_mass3)))
        if print_output:
            print(
                'functions.luminosity.find_cloudburst_motifs :: mass3 run, current_best_dists length: %s, current_best_indices length: %s, took %6f seconds' % (
                    str(len(current_best_dists)), str(len(current_best_indices)),
                    (end_mass3 - start_mass3)))

    if not use_mass3:
        if not current_best_indices[0]:
            return matched_motifs, timeseries_matched
    if use_mass3 and not current_best_indices:
        return matched_motifs, timeseries_matched

    # All in one, quicker? Yes
    start_add_motifs = timer()
    add_motifs = []
    try:
        add_motifs = [[
            metric, current_best_indices[index], best_dist.real,
            batch_size_timeseries_subsequence, batch_size, max_distance,
            motif_timestamp
        ] for index, best_dist in enumerate(current_best_dists)]
        if add_motifs:
            motifs_found = motifs_found + add_motifs
    except Exception as e:
        logger.error(traceback.format_exc())
        logger.error(
            'error :: functions.luminosity.find_cloudburst_motifs :: could not add_motifs to motifs_found - %s' % (e))
    end_add_motifs = timer()
    logger.info(
        'functions.luminosity.find_cloudburst_motifs :: added %s motifs to motifs_found in %.6f seconds' % (
            str(len(add_motifs)), (end_add_motifs - start_add_motifs)))

    # Break if an exact match is found
    # @modified 20210430 - Bug #4044: inference - motif distance override - exact match
    # @modified 20210504 - Bug #4044: inference - motif distance override - exact match
    # if len([item for item in add_motifs if item[2] == 0]) > 0:
    #     exact_matches_found = exact_matches_found + [item for item in add_motifs if item[2] == 0]
    #     break

    # @modified 20210427 - Feature #4014: Ionosphere - inference
    # Finding exact matches can result in more than doubling the runtime when
    # used after mass2_batch runs (which do not find exact matches, mass3
    # does). However, the number of times an exact match is found is very rare.
    # if not use_mass3:
    if not use_mass3 and find_exact_matches:
        # mass3 finds exact matches, mass2_batch does not, so there is no need
        # to find exact matches if mass3 was run.
        # FIND EXACT MATCHES
        # Seeing as I cannot reproduce finding nan+nanj which represents an
        # exact match with mts.mass2_batch, do it DIY style - iterate the
        # timeseries and create a batch_size subsequence for every index and
        # compare the values to the anomalous_ts for an exact match.
        # This takes ~0.024850 seconds on a timeseries with 10079 datapoints
        try:
            start_exact_match = timer()
            indexed_relate_dataset = []
            for index, item in enumerate(relate_dataset):
                indexed_relate_dataset.append([index, item])
            last_index = indexed_relate_dataset[-1][0]
            current_index = 0
            while current_index < last_index:
                subsequence = [
                    value for index, value in
                    indexed_relate_dataset[current_index:(current_index + batch_size)]
                ]
                if subsequence == batch_size_dataset:
                    # @modified 20210419 - Feature #4014: Ionosphere - inference
                    # Added batch_size
                    exact_matches_found.append([
                        metric, current_index, 0.0,
                        batch_size_timeseries_subsequence, batch_size,
                        max_distance, motif_timestamp
                    ])
                    motifs_found.append([
                        metric, current_index, 0.0,
                        batch_size_timeseries_subsequence, batch_size,
                        max_distance, motif_timestamp
                    ])
                current_index += 1
            end_exact_match = timer()
            exact_match_times.append((end_exact_match - start_exact_match))
        except Exception as e:
            logger.error(traceback.format_exc())
            logger.error(
                'error :: functions.luminosity.find_cloudburst_motifs :: could not determine if any exact matches could be found in %s timeseries - %s' % (
                    str(metric), e))
        logger.info(
            'functions.luminosity.find_cloudburst_motifs :: exact matches checked in %.6f seconds' % (
                (end_exact_match - start_exact_match)))

    # TODO
    # mass3 ALL, then evaluate, would it be quicker?  No, see the POC above

    logger.info(
        'functions.luminosity.find_cloudburst_motifs :: mts.mass2_batch runs on %s in %.6f seconds' % (
            str(metric), sum(mass2_batch_times)))
    logger.info(
        'functions.luminosity.find_cloudburst_motifs :: exact_match runs on %s in %.6f seconds' % (
            str(metric), sum(exact_match_times)))
    end_full_duration = timer()
    logger.info(
        'functions.luminosity.find_cloudburst_motifs :: analysed %s in %.6f seconds' % (
            str(metric), (end_full_duration - start_full_duration)))

    # Patterns are sorted by distance
    # The list produced with the mass3 method will include nans
    start_distance_valid_motifs = timer()
    distance_valid_motifs = [
        item for item in motifs_found
        if not np.isnan(item[2]) and item[2] <= item[5]
    ]
    end_distance_valid_motifs = timer()
    logger.info(
        'functions.luminosity.find_cloudburst_motifs :: %s distance_valid_motifs determined in %.6f seconds from %s motifs_found' % (
            str(len(distance_valid_motifs)),
            (end_distance_valid_motifs - start_distance_valid_motifs),
            str(len(motifs_found))))
    if print_output:
        print(
            'functions.luminosity.find_cloudburst_motifs :: %s distance_valid_motifs determined in %.6f seconds from %s motifs_found' % (
                str(len(distance_valid_motifs)),
                (end_distance_valid_motifs - start_distance_valid_motifs),
                str(len(motifs_found))))

    start_sorted_motifs = timer()
    sorted_motifs = []
    if motifs_found:
        sorted_motifs = sorted(distance_valid_motifs, key=lambda x: x[2])
        # If the areas under the curve were calculated, the list could be
        # sorted by area_percent_diff then by distance.
        # import operator
        # sorted_motifs = sorted(motifs_found_in_fp, key=operator.itemgetter(2, 2))
    end_sorted_motifs = timer()
    logger.info(
        'functions.luminosity.find_cloudburst_motifs :: sorted_motifs from distance_valid_motifs in %.6f seconds' % (
            (end_sorted_motifs - start_sorted_motifs)))

    start_motifs_check = timer()
    snippet_timestamps = [int(item[0]) for item in snippet]
    for motif in sorted_motifs:
        try:
            add_match = False
            metric = motif[0]
            best_index = motif[1]
            best_dist = motif[2]
            # motif_sequence = motif[3]
            # @modified 20210419 - Feature #4014: Ionosphere - inference
            # Added batch_size
            motif_size = motif[4]
            motif_timestamp = motif[6]
            add_match = True
            match_type = 'distance'

            if motif in exact_matches_found:
                match_type = 'exact'
                if debug_logging:
                    logger.debug(
                        'debug :: functions.luminosity.find_cloudburst_motifs :: exact match: %s' % (str(motif)))

            full_relate_timeseries = timeseries
            relate_timeseries = [
                item for index, item in enumerate(full_relate_timeseries)
                if index >= best_index and index < (best_index + motif_size)
            ]
            relate_dataset = [item[1] for item in relate_timeseries]
            # relate_dataset_timestamps = [int(item[0]) for item in relate_timeseries]
            matched_period_timestamps = [
                int(item[0]) for item in relate_timeseries
            ]
            in_period = False
            for matched_period_timestamp in matched_period_timestamps:
                if matched_period_timestamp in snippet_timestamps:
                    in_period = True
            if not in_period:
                add_match = False

            if add_match:
                timestamp = int(relate_timeseries[-1][0])
                timeseries_matched[metric][timestamp]['motif_matches'][motif_timestamp] = motif_size
                motif_id = '%s-%s-%s' % (
                    str(metric), str(int(snippet[-1][0])), str(best_index))
                matched_motifs[motif_id] = {}
                matched_motifs[motif_id]['index'] = best_index
                matched_motifs[motif_id]['distance'] = best_dist
                matched_motifs[motif_id]['size'] = motif_size
                matched_motifs[motif_id]['timestamp'] = timestamp
                matched_motifs[motif_id]['matched_period_timestamps'] = matched_period_timestamps
                matched_motifs[motif_id]['motif_timestamp'] = motif_timestamp
                matched_motifs[motif_id]['type'] = match_type
                matched_motifs[motif_id]['type_id'] = motif_match_types[match_type]
                runtime_end = timer()
                matched_motifs[motif_id]['runtime'] = (runtime_end - start)
                # if SINGLE_MATCH:
                #     break
        except Exception as e:
            logger.error(traceback.format_exc())
            logger.error(
                'error :: functions.luminosity.find_cloudburst_motifs :: metric %s and motif: %s - %s' % (
                    str(metric), str(motif), str(e)))
            continue
    end_motifs_check = timer()
    logger.info(
        'functions.luminosity.find_cloudburst_motifs :: motifs checked in %.6f seconds' % (
            (end_motifs_check - start_motifs_check)))
    if print_output:
        print(
            'functions.luminosity.find_cloudburst_motifs :: motifs checked in %.6f seconds' % (
                (end_motifs_check - start_motifs_check)))

    # Sort by distance AND area_percent_diff
    sorted_ordered_matched_motifs_list = []
    if matched_motifs and len(matched_motifs) > 1:
        ordered_matched_motifs_list = []
        for motif_id in list(matched_motifs.keys()):
            distance = matched_motifs[motif_id]['distance']
            ordered_matched_motifs_list.append([motif_id, distance])
        # If the areas under the curve were calculated, the list could be
        # sorted by area_percent_diff then by distance.
        sorted_matched_motifs = {}
        sorted_ordered_matched_motifs_list = sorted(
            ordered_matched_motifs_list, key=operator.itemgetter(1))
        logger.info(
            'functions.luminosity.find_cloudburst_motifs :: sorting %s matched_motifs by distance' % (
                str(len(sorted_ordered_matched_motifs_list))))
        for motif_id, distance in sorted_ordered_matched_motifs_list:
            sorted_matched_motifs[motif_id] = matched_motifs[motif_id]
            # if SINGLE_MATCH:
            #     break
        matched_motifs = sorted_matched_motifs.copy()

    end = timer()
    if dev_null:
        del dev_null
    logger.info(
        'functions.luminosity.find_cloudburst_motifs :: %s motif best match found from %s motifs_found and it took a total of %.6f seconds (all mass2/mass3) to process %s' % (
            # str(len(matched_motifs)), str(len(motifs_found)), str(len(fps_checked_for_motifs)),
            str(len(matched_motifs)), str(len(motifs_found)), (end - start), metric))
    if len(matched_motifs) > 0:
        if print_output:
            print(
                'functions.luminosity.find_cloudburst_motifs :: %s motif best match found from %s distance valid motifs of %s motifs_found and it took a total of %.6f seconds (all mass2/mass3) to process %s' % (
                    # str(len(matched_motifs)), str(len(motifs_found)), str(len(fps_checked_for_motifs)),
                    str(len(matched_motifs)), str(len(distance_valid_motifs)),
                    str(len(motifs_found)), (end - start), metric))
            distances = []
            for match in list(matched_motifs.keys()):
                distances.append(matched_motifs[match]['distance'])
            distances_dict = {}
            distances_dict['avg_distance'] = sum(distances) / len(distances)
            distances_dict['distances'] = distances
            print('%s' % str(distances_dict))

    # return matched_motifs, fps_checked_for_motifs
    return matched_motifs, timeseries_matched
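# The mass2_batch/mass3 switching above is the crux of find_cloudburst_motifs:
# mass2_batch raises "kth(=k) out of bounds" when top_matches exceeds the
# number of candidate indices, in which case mass3 (which takes a `pieces`
# argument instead of batch_size/top_matches) is used. A minimal, standalone
# sketch of that fallback; best_match_index is a hypothetical helper, not part
# of Skyline or mass_ts:
import numpy as np
import mass_ts as mts


def best_match_index(ts, query, batch_size, top_matches=3):
    """Return the start index of the best match, falling back to mass3."""
    try:
        indices, dists = mts.mass2_batch(
            ts, query, batch_size=batch_size, top_matches=top_matches)
        return int(indices[np.argmin(dists)])
    except ValueError:
        # e.g. "kth(=...) out of bounds" when top_matches > available indices;
        # pieces should be larger than the query length (see the comments above)
        pieces = max(len(query) + 2, 256)
        dists = mts.mass3(ts, query, pieces)
        return int(np.argmin(np.real(dists)))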
distances = mts.mass2(ts, query)

# mass3
# distances = mts.mass3(ts, query, 256)

# mass2_batch
# start a multi-threaded batch job with all cpu cores and give me the top 5 matches.
# note that batch_size partitions your time series into a subsequence similarity search.
# even for large time series in single threaded mode, this is much more memory
# efficient than MASS2 on its own.
batch_size = 10000
top_matches = 5
n_jobs = -1
indices, distances = mts.mass2_batch(ts, query, batch_size,
                                     top_matches=top_matches, n_jobs=n_jobs)

# find the minimum distance
min_idx = np.argmin(distances)

# find the top 4 motif starting indices
k = 4
exclusion_zone = 25
top_motifs = mts.top_k_motifs(distances, k, exclusion_zone)

# find the top 4 discord starting indices
k = 4
exclusion_zone = 25
top_discords = mts.top_k_discords(distances, k, exclusion_zone)
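# The walkthrough above assumes `ts` and `query` already exist; definitions
# like these (synthetic data, illustrative only) would need to precede it:
import numpy as np
import mass_ts as mts

np.random.seed(42)
ts = np.random.uniform(size=2**16)                      # the series to be searched
query = ts[5000:5100] + np.random.normal(0, 0.01, 100)  # a noisy known subsequence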
def on_demand_motif_analysis(metric, timestamp, similarity, batch_size,
                             top_matches, max_distance, range_padding,
                             max_area_percent_diff):
    """
    Process a motif similarity search on demand
    """
    import numpy as np
    import mass_ts as mts

    logger = logging.getLogger(skyline_app_logger)
    dev_null = None
    function_str = 'on_demand_motif_analysis'
    logger.info(
        '%s :: with parameters :: metric: %s, timestamp: %s, similarity: %s, batch_size: %s, top_matches: %s, max_distance: %s, range_padding: %s, max_area_percent_diff: %s' % (
            function_str, str(metric), str(timestamp), str(similarity),
            str(batch_size), str(top_matches), str(max_distance),
            str(range_padding), str(max_area_percent_diff)))
    trace = 'none'
    fail_msg = 'none'

    start = time.time()
    start_timer = timer()
    metric_vars_dict = {}
    metric_id = 0
    fp_ids = []
    timeseries = []
    not_similar_enough_sample = 0
    not_similar_motifs = 0
    similar_motifs = 0
    exact_motifs = 0
    distance_motifs = 0
    motifs_found = []
    find_exact_matches_run = False
    exact_matches_found = []
    fps_timeseries = {}

    # A motif_analysis dict to add to and return
    motif_analysis = {}
    motif_analysis[metric] = {}
    motif_analysis[metric]['timestamp'] = int(timestamp)
    motif_analysis[metric]['started'] = start
    motif_analysis[metric]['motifs'] = {}
    motif_analysis[metric]['exact_motifs'] = exact_motifs
    motif_analysis[metric]['similar_motifs'] = similar_motifs
    motif_analysis[metric]['not_similar_motifs'] = not_similar_motifs
    motif_analysis[metric]['not_similar_enough_sample'] = not_similar_enough_sample
    # @added 20210417 - Feature #4014: Ionosphere - inference
    # Allow the user to define the batch_size per similarity search
    motif_analysis[metric]['batch_size'] = int(batch_size)
    motif_analysis[metric]['top_matches'] = int(top_matches)
    motif_analysis[metric]['max_distance'] = float(max_distance)
    # @added 20210425 - Feature #4014: Ionosphere - inference
    # Added max_area_percent_diff for computing the area under the curve
    motif_analysis[metric]['max_area_percent_diff'] = float(max_area_percent_diff)

    fps_checked_for_motifs = []

    metric_dir = metric.replace('.', '/')
    metric_timeseries_dir = '%s/%s/%s' % (
        settings.IONOSPHERE_DATA_FOLDER, str(timestamp), metric_dir)

    # @added 20210418 - Feature #4014: Ionosphere - inference
    # Allow for the similarity search on saved_training_data
    if 'saved_training_data' in request.args:
        saved_training_data_str = request.args.get('saved_training_data', 'false')
        if saved_training_data_str == 'true':
            saved_metric_timeseries_dir = '%s_saved/%s/%s' % (
                settings.IONOSPHERE_DATA_FOLDER, str(timestamp), metric_dir)
            if path.exists(saved_metric_timeseries_dir):
                metric_timeseries_dir = saved_metric_timeseries_dir
                logger.info('%s :: using saved training_data dir - %s' % (
                    function_str, saved_metric_timeseries_dir))

    metric_vars_file = '%s/%s.txt' % (metric_timeseries_dir, metric)
    timeseries_json = '%s/%s.json' % (metric_timeseries_dir, metric)
    full_duration_in_hours = int(settings.FULL_DURATION / 60 / 60)
    full_duration_timeseries_json = '%s/%s.mirage.redis.%sh.json' % (
        metric_timeseries_dir, metric, str(full_duration_in_hours))

    try:
        metric_vars_dict = mirage_load_metric_vars(skyline_app, metric_vars_file, True)
    except Exception as e:
        logger.error(
            'error :: inference :: failed to load metric variables from check file - %s - %s' % (
                metric_vars_file, e))
    if not metric_vars_dict:
        motif_analysis[metric]['status'] = 'error'
        motif_analysis[metric]['reason'] = 'could not load training data variables'
        return motif_analysis

    full_duration = metric_vars_dict['metric_vars']['full_duration']

    # Determine the metric details from the database
    metric_id = 0
    metric_db_object = {}
    try:
        metric_db_object = get_metrics_db_object(metric)
    except Exception as e:
        logger.error('error :: %s :: failed to get_metrics_db_object - %s' % (
            function_str, e))
    try:
        metric_id = int(metric_db_object['id'])
    except Exception as e:
        logger.error(
            'error :: %s :: failed to determine metric_id from metric_db_object %s - %s' % (
                function_str, str(metric_db_object), e))
        metric_id = 0
    if not metric_id:
        logger.error(
            'error :: %s :: failed to get metric id for %s from the database' % (
                function_str, str(metric)))
        fail_msg = 'failed to get metric id'
        motif_analysis[metric]['status'] = 'error'
        motif_analysis[metric]['reason'] = 'could not determine metric id'
        return motif_analysis, fail_msg, trace

    # @modified 20210419 - Feature #4014: Ionosphere - inference
    # Create a unique dir for each batch_size and max_distance
    # motif_images_dir = '%s/motifs' % metric_timeseries_dir
    motif_images_dir = '%s/motifs/batch_size.%s/top_matches.%s/max_distance.%s' % (
        metric_timeseries_dir, str(batch_size), str(top_matches), str(max_distance))
    if not path.exists(motif_images_dir):
        # provision motifs image resources
        mkdir_p(motif_images_dir)

    full_durations = [full_duration]
    if path.isfile(full_duration_timeseries_json):
        full_durations = [full_duration, settings.FULL_DURATION]
    logger.info('%s :: full_durations - %s' % (function_str, str(full_durations)))

    # Loop through analysis per full_duration
    for full_duration in full_durations:
        start_full_duration = timer()
        fp_ids = []
        try:
            query = 'SELECT id,last_matched from ionosphere WHERE metric_id=%s AND full_duration=%s AND enabled=1 ORDER BY last_matched DESC' % (
                str(metric_id), str(full_duration))
            results = mysql_select(skyline_app, query)
            for row in results:
                fp_ids.append(int(row[0]))
        except Exception as e:
            logger.error(
                'error :: %s :: failed to get fp ids via mysql_select from %s - %s' % (
                    function_str, metric, e))
        logger.info('%s :: metric_id: %s, full_duration: %s, fp_ids: %s' % (
            function_str, str(metric_id), str(full_duration), str(fp_ids)))
        if not fp_ids:
            continue

        # Now that there are known fps, load the timeseries
        if full_duration == settings.FULL_DURATION:
            timeseries_json_file = full_duration_timeseries_json
        else:
            timeseries_json_file = timeseries_json
        try:
            with open((timeseries_json_file), 'r') as f:
                raw_timeseries = f.read()
            timeseries_array_str = str(raw_timeseries).replace('(', '[').replace(')', ']')
            del raw_timeseries
            timeseries = literal_eval(timeseries_array_str)
            del timeseries_array_str
        except Exception as e:
            logger.error(
                'error :: %s :: failed to load timeseries for %s from %s - %s' % (
                    function_str, metric, timeseries_json_file, e))
            continue

        anomalous_timeseries_subsequence = []
        for timestamp_float, value in timeseries[-int(batch_size):]:
            anomalous_timeseries_subsequence.append([int(timestamp_float), value])

        logger.info('%s :: looking for motif in trained fps of full_duration: %s' % (
            function_str, str(full_duration)))
        dataset = [float(item[1]) for item in anomalous_timeseries_subsequence]
        max_y = max(dataset)
        min_y = min(dataset)

        # full_y_range = max_y - min_y
        # range_padding_percent = range_padding
        # This was just a test that did not have the desired results
        # if full_y_range < 10:
        #     range_padding_percent = 35
        # if full_y_range < 5:
        #     range_padding_percent = 75
        # if full_y_range < 2:
        #     range_padding_percent = 100

        use_range_padding = ((max_y - min_y) / 100) * range_padding
        if min_y > 0 and (min_y - use_range_padding) > 0:
            min_y_padded = min_y - use_range_padding
        else:
            min_y_padded = min_y
        max_y_padded = max_y + use_range_padding
        if min_y_padded == max_y_padded:
            min_y_padded = min_y_padded - ((min_y_padded / 100) * range_padding)
            max_y_padded = max_y_padded + ((max_y_padded / 100) * range_padding)

        # anomalous_ts = np.array(dataset)
        anomalous_ts = dataset

        mass2_batch_times = []
        exact_match_times = []

        nan = np.array([np.nan])
        nanj = complex(0.0, float('nan'))
        empty_dists = np.array(nan + nanj)
        # plotted = False
        count = 0

        # fp_ids = [fp_id for index, fp_id in enumerate(fp_ids) if index == 0]

        # motifs_found = []
        # exact_matches_found = []
        # fps_timeseries = {}

        for fp_id in fp_ids:
            if (time.time() - start) >= 20:
                break

            # Attempt to surface the fp timeseries from memcache and/or the db
            # @modified 20210424 - Feature #4014: Ionosphere - inference
            # Task #4030: refactoring
            fp_timeseries = None
            try:
                fp_timeseries = get_fp_timeseries(skyline_app, metric_id, fp_id)
            except Exception as e:
                logger.error(
                    'inference :: did not get fp timeseries with get_fp_timeseries(%s, %s, %s) - %s' % (
                        skyline_app, str(metric_id), str(fp_id), e))
            if not fp_timeseries:
                continue

            relate_dataset = [float(item[1]) for item in fp_timeseries]
            fps_timeseries[fp_id] = fp_timeseries

            current_best_indices = []
            current_best_dists = []
            best_indices = None
            best_dists = None

            try:
                logger.info(
                    '%s :: running mts.mass2_batch fp_id: %s, full_duration: %s, batch_size: %s, top_matches: %s, max_distance: %s, motif_size: %s' % (
                        function_str, str(fp_id), str(full_duration),
                        str(batch_size), str(top_matches), str(max_distance),
                        str(len(anomalous_ts))))

                # @added 20210418 - Feature #4014: Ionosphere - inference
                # Handle top_matches being greater than the possible kth that
                # can be found, e.g.
                # mts.mass2_batch error: kth(=50) out of bounds (16)
                use_top_matches = int(top_matches)
                if (len(fp_timeseries) / int(batch_size)) <= int(top_matches):
                    use_top_matches = round(len(fp_timeseries) / int(batch_size)) - 1
                    if use_top_matches == 2:
                        use_top_matches = 1
                    logger.info(
                        '%s :: adjusting top_matches to %s (the maximum possible top - 1) as kth(=%s) will be out of bounds mts.mass2_batch' % (
                            function_str, str(use_top_matches), str(top_matches)))

                start_mass2_batch = timer()
                # @modified 20210418 - Feature #4014: Ionosphere - inference
                # Handle top_matches being greater than the possible kth that can be found
                # best_indices, best_dists = mts.mass2_batch(relate_dataset, anomalous_ts, batch_size=int(batch_size), top_matches=int(top_matches))
                best_indices, best_dists = mts.mass2_batch(
                    relate_dataset, anomalous_ts, batch_size=int(batch_size),
                    top_matches=int(use_top_matches))
                end_mass2_batch = timer()
                mass2_batch_times.append((end_mass2_batch - start_mass2_batch))
                current_best_indices = best_indices.tolist()
                current_best_dists = best_dists.tolist()

                # @added 20210412 - Feature #4014: Ionosphere - inference
                # Branch #3590: inference
                # Add fp_id to fps_checked_for_motifs to enable ionosphere to
                # update the motif related columns in the ionosphere database table
                fps_checked_for_motifs.append(fp_id)
            except Exception as e:
                logger.error('error :: %s :: %s mts.mass2_batch error: %s' % (
                    function_str, str(fp_id), str(e)))
                continue

            try:
                if str(list(best_dists)) == str(list(empty_dists)):
                    logger.info(
                        '%s :: mts.mass2_batch no similar motif from fp id %s - best_dists: %s' % (
                            function_str, str(fp_id), str(list(best_dists))))
                    continue
            except Exception as e:
                dev_null = e

            if not current_best_indices[0]:
                continue
            # if list(best_indices)[0] != anomalous_index:
            #     continue
            # If the best_dists is > 1 they are not very similar
            # if list(best_dists)[0].real > 1.0:
            #     continue

            # if list(best_indices)[0] and best_dists:
            for index, best_dist in enumerate(current_best_dists):
                try:
                    motif_added = False
                    """
                    Note: mass_ts finds similar motifs NOT the same motif; the
                    same motif will result in the best_dists being nan+nanj.
                    So it is DIYed.
                    """
                    try:
                        # @modified 20210414 - Feature #4014: Ionosphere - inference
                        # Branch #3590: inference
                        # Store the not anomalous motifs
                        # motif = [fp_id, current_best_indices[index], best_dist.real]
                        motif = [
                            fp_id, current_best_indices[index], best_dist.real,
                            anomalous_timeseries_subsequence, full_duration
                        ]
                    except Exception as e:
                        dev_null = e
                        motif = []

                    # if list(best_indices)[0] and best_dists:
                    # If it is greater than 1.0 it is not similar
                    # if best_dist.real > 1.0:
                    # if best_dist.real > IONOSPHERE_INFERENCE_MASS_TS_MAX_DISTANCE:
                    if best_dist.real > float(max_distance):
                        continue
                    else:
                        if motif:
                            count += 1
                            motifs_found.append(motif)
                            motif_added = True

                    if not motif_added:
                        if best_dist == nanj:
                            count += 1
                            motifs_found.append(motif)
                            motif_added = True

                    if not motif_added:
                        if str(best_dist) == 'nan+nanj':
                            count += 1
                            motifs_found.append([
                                fp_id, current_best_indices[index], 0.0,
                                anomalous_timeseries_subsequence, full_duration
                            ])
                            motif_added = True

                    if not motif_added:
                        if best_dist == empty_dists:
                            count += 1
                            motifs_found.append(motif)
                            motif_added = True
                except Exception as e:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: %s :: could not determine if fp id %s timeseries at index %s was a match - %s' % (
                            function_str, str(fp_id), str(current_best_indices[index]), e))
                    continue

            # FIND EXACT MATCHES
            # Seeing as I cannot reproduce finding nan+nanj which represents an
            # exact match with mts.mass2_batch, do it DIY style - iterate the
            # timeseries and create a batch_size subsequence for every index and
            # compare the values to the anomalous_ts for an exact match.
            # This takes ~0.024850 seconds on a timeseries with 10079 datapoints
            # @modified 20210418 - Feature #4014: Ionosphere - inference
            # However, finding exact matches can add ~2.5 seconds on a 90 minute
            # batch_size, and with a proportionally scaled max_distance of say 15,
            # finding an exact match in a longer sequence is less important; the
            # greater the batch_size, the greater the likely variability and the
            # chance of an exact match decreases. So save the 2.5 seconds.
            # UNLESS
            # At a 5 (to 10) batch_size and a max_distance of 1.0 an exact match
            # can be found. Exact matches are quite frequent and sometimes, with
            # such little variability, similar matches may not be found.
            # Therefore find_exact_matches has its place. MASS
            # A CAVEAT here is that boring metrics that change and have a low
            # variability even at a larger batch_size could also benefit and
            # possibly achieve better accuracy from the use of find_exact_matches,
            # as their shapelets can resemble a batch_size 5 shapelet.
            # It would perhaps be possible to use one or more of the features
            # profile tsfresh values to identify these types of shapelets, if
            # you knew which feature/s were most descriptive of this type of
            # shapelet, e.g. 'value__skewness': 3.079477685394873, etc (maybe).
            # However I predict that this method will perform worst on these
            # types of shapelets.
            # find_exact_matches = False
            # exact matches can be found in batch sizes of 500 and similar not
            # So actually always run it.
            find_exact_matches = True
            find_exact_matches_run = True

            if int(batch_size) < 10:
                find_exact_matches = True
                find_exact_matches_run = True

            if find_exact_matches:
                try:
                    start_exact_match = timer()
                    indexed_relate_dataset = []
                    for index, item in enumerate(relate_dataset):
                        indexed_relate_dataset.append([index, item])
                    last_index = indexed_relate_dataset[-1][0]
                    current_index = 0
                    while current_index < last_index:
                        subsequence = [
                            value for index, value in
                            indexed_relate_dataset[current_index:(current_index + int(batch_size))]
                        ]
                        if subsequence == anomalous_ts:
                            exact_matches_found.append([
                                fp_id, current_index, 0.0,
                                anomalous_timeseries_subsequence, full_duration
                            ])
                            motifs_found.append([
                                fp_id, current_index, 0.0,
                                anomalous_timeseries_subsequence, full_duration
                            ])
                        current_index += 1
                    end_exact_match = timer()
                    exact_match_times.append((end_exact_match - start_exact_match))
                except Exception as e:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: %s :: could not determine if any exact matches could be found in fp id %s timeseries - %s' % (
                            function_str, str(fp_id), e))

        logger.info(
            '%s :: mts.mass2_batch runs on %s fps of full_duration %s in %.6f seconds' % (
                function_str, str(len(mass2_batch_times)), str(full_duration),
                sum(mass2_batch_times)))
        if find_exact_matches_run:
            logger.info(
                '%s :: exact_match runs on %s fps of full_duration %s in %.6f seconds' % (
                    function_str, str(len(exact_match_times)), str(full_duration),
                    sum(exact_match_times)))
        end_full_duration = timer()
        logger.info(
            '%s :: analysed %s fps of full_duration %s in %.6f seconds' % (
                function_str, str(len(fp_ids)), str(full_duration),
                (end_full_duration - start_full_duration)))

    # Patterns are sorted
    sorted_motifs = []
    motifs_found_in_fps = []
    if motifs_found:
        sorted_motifs = sorted(motifs_found, key=lambda x: x[2])
        for item in sorted_motifs:
            motifs_found_in_fps.append(item[0])
    logger.info('%s :: %s motifs found' % (function_str, str(len(sorted_motifs))))

    for motif in sorted_motifs:
        if (time.time() - start) >= 25:
            break
        try:
            add_match = False
            all_in_range = False

            fp_id = motif[0]
            best_index = motif[1]
            best_dist = motif[2]
            # @added 20210414 - Feature #4014: Ionosphere - inference
            # Branch #3590: inference
            # Store the not anomalous motifs
            motif_sequence = motif[3]
            motif_full_duration = motif[4]

            match_type = 'not_similar_enough'

            if motif in exact_matches_found:
                add_match = True
                match_type = 'exact'
                all_in_range = True
                exact_motifs += 1

            full_relate_timeseries = fps_timeseries[fp_id]
            # full_relate_dataset = [float(item[1]) for item in full_relate_timeseries]
            relate_timeseries = [
                item for index, item in enumerate(full_relate_timeseries)
                if index >= best_index and index < (best_index + int(batch_size))
            ]
            relate_dataset = [item[1] for item in relate_timeseries]

            if not add_match:
                all_in_range = True
                for value in relate_dataset:
                    if value < min_y_padded:
                        all_in_range = False
                        break
                    if value > max_y_padded:
                        all_in_range = False
                        break
                if all_in_range:
                    related_max_y = max(relate_dataset)
                    if related_max_y < (max_y - range_padding):
                        all_in_range = False
                    if related_max_y > (max_y + range_padding):
                        all_in_range = False
                    related_min_y = min(relate_dataset)
                    if related_min_y < (min_y - range_padding):
                        all_in_range = False
                    if related_min_y > (min_y + range_padding):
                        all_in_range = False
                if all_in_range:
                    logger.info(
                        '%s :: ALL IN RANGE - all_in_range: %s, motif: %s' % (
                            function_str, str(all_in_range), str(relate_dataset[0:2])))
                    add_match = True
                    match_type = 'all_in_range'
                    similar_motifs += 1

            # @added 20210425 - Feature #4014: Ionosphere - inference
            # Compute the area using the composite trapezoidal rule.
            motif_area = None
            fp_motif_area = None
            percent_different = None
            try:
                batch_size_dataset = [float(item[1]) for item in motif_sequence]
                y_motif = np.array(batch_size_dataset)
                motif_area = np.trapz(y_motif, dx=1)
            except Exception as e:
                logger.error(
                    'error :: %s :: failed to get motif_area with np.trapz - %s' % (
                        function_str, e))
            try:
                y_fp_motif = np.array(relate_dataset)
                fp_motif_area = np.trapz(y_fp_motif, dx=1)
            except Exception as e:
                logger.error(
                    'error :: %s :: failed to get fp_motif_area with np.trapz - %s' % (
                        function_str, e))
            # Determine the percentage difference (as a positive value) of the
            # areas under the curves.
            if motif_area and fp_motif_area:
                percent_different = get_percent_different(fp_motif_area, motif_area, True)
                if percent_different > max_area_percent_diff:
                    if add_match:
                        logger.info(
                            '%s :: AREA TOO DIFFERENT - not adding all_in_range match' % (
                                function_str))
                        add_match = False
                    # BUT ...
                    if best_dist < 3 and not add_match:
                        logger.info(
                            '%s :: DISTANCE VERY SIMILAR - adding match even though area_percent_diff is greater than max_area_percent_diff because best_dist: %s' % (
                                function_str, str(best_dist)))
                        add_match = True
                        match_type = 'distance'
                        distance_motifs += 1

            if similarity == 'all':
                if not add_match:
                    not_similar_motifs += 1
                    if not_similar_enough_sample >= 10:
                        continue
                    not_similar_enough_sample += 1
                    add_match = True
                    match_type = 'not_similar_enough'

            if add_match:
                generation = 0
                fp_id_row = None
                try:
                    fp_id_row = get_ionosphere_fp_db_row(skyline_app, int(fp_id))
                except Exception as e:
                    logger.error(
                        'error :: %s :: failed to get_ionosphere_fp_db_row for fp_id %s - %s' % (
                            function_str, str(fp_id), e))
                if fp_id_row:
                    try:
                        generation = fp_id_row['generation']
                    except Exception as e:
                        logger.error(
                            'error :: %s :: failed to get generation from fp_id_row for fp_id %s - %s' % (
                                function_str, str(fp_id), e))
                if generation == 0:
                    generation_str = 'trained'
                else:
                    generation_str = 'LEARNT'

                motif_match_types = motif_match_types_dict()
                type_id = motif_match_types[match_type]
                motif_id = '%s-%s' % (str(fp_id), str(best_index))
                motif_analysis[metric]['motifs'][motif_id] = {}
                motif_analysis[metric]['motifs'][motif_id]['metric_id'] = metric_id
                motif_analysis[metric]['motifs'][motif_id]['fp_id'] = fp_id
                motif_analysis[metric]['motifs'][motif_id]['generation'] = generation
                motif_analysis[metric]['motifs'][motif_id]['index'] = best_index
                motif_analysis[metric]['motifs'][motif_id]['distance'] = best_dist
                motif_analysis[metric]['motifs'][motif_id]['size'] = int(batch_size)
                motif_analysis[metric]['motifs'][motif_id]['max_distance'] = float(max_distance)
                motif_analysis[metric]['motifs'][motif_id]['timestamp'] = timestamp
                motif_analysis[metric]['motifs'][motif_id]['type_id'] = type_id
                motif_analysis[metric]['motifs'][motif_id]['type'] = match_type
                motif_analysis[metric]['motifs'][motif_id]['full_duration'] = motif_full_duration
                # @added 20210414 - Feature #4014: Ionosphere - inference
                # Branch #3590: inference
                # Store the not anomalous motifs
                motif_analysis[metric]['motifs'][motif_id]['motif_timeseries'] = anomalous_timeseries_subsequence
                motif_analysis[metric]['motifs'][motif_id]['motif_sequence'] = motif_sequence
                not_anomalous_timestamp = int(anomalous_timeseries_subsequence[-1][0])
                graph_period_seconds = not_anomalous_timestamp - int(
                    anomalous_timeseries_subsequence[0][0])
                motif_analysis[metric]['motifs'][motif_id]['motif_period_seconds'] = graph_period_seconds
                motif_analysis[metric]['motifs'][motif_id]['motif_period_minutes'] = round(
                    graph_period_seconds / 60)

                motif_analysis[metric]['motifs'][motif_id]['image'] = None

                motif_analysis[metric]['motifs'][motif_id]['motif_area'] = motif_area
                motif_analysis[metric]['motifs'][motif_id]['fp_motif_area'] = fp_motif_area
                motif_analysis[metric]['motifs'][motif_id]['area_percent_diff'] = percent_different
                motif_analysis[metric]['motifs'][motif_id]['max_area_percent_diff'] = max_area_percent_diff

                if (time.time() - start) >= 25:
                    continue

                graph_image_file = '%s/motif.%s.%s.%s.with_max_distance.%s.png' % (
                    motif_images_dir, motif_id, match_type, str(batch_size),
                    str(max_distance))
                plotted_image = False
                on_demand_motif_analysis = True
                if not path.isfile(graph_image_file):
                    plotted_image, plotted_image_file = plot_motif_match(
                        skyline_app, metric, timestamp, fp_id, full_duration,
                        generation_str, motif_id, best_index, int(batch_size),
                        best_dist, type_id, relate_dataset,
                        anomalous_timeseries_subsequence, graph_image_file,
                        on_demand_motif_analysis)
                else:
                    plotted_image = True
                    logger.info('%s :: plot already exists - %s' % (
                        function_str, str(graph_image_file)))
                if plotted_image:
                    motif_analysis[metric]['motifs'][motif_id]['image'] = graph_image_file
                else:
                    logger.error('failed to plot motif match plot')
                    graph_image_file = None
        except Exception as e:
            logger.error(traceback.format_exc())
            logger.error(
                'error :: inference :: with fp id %s processing motif at index: %s - %s' % (
                    str(fp_id), str(motif[0]), str(e)))
            continue

    end_timer = timer()
    motif_analysis[metric]['fps_checked'] = fps_checked_for_motifs
    motif_analysis[metric]['exact_motifs'] = exact_motifs
    motif_analysis[metric]['similar_motifs'] = similar_motifs
    motif_analysis[metric]['distance_motifs'] = distance_motifs
    motif_analysis[metric]['not_similar_motifs'] = not_similar_motifs
    motif_analysis[metric]['not_similar_enough_sample'] = not_similar_enough_sample

    motif_analysis_file = '%s/motif.analysis.similarity_%s.batch_size_%s.top_matches_%s.max_distance_%s.dict' % (
        motif_images_dir, similarity, str(batch_size), str(top_matches),
        str(max_distance))
    try:
        write_data_to_file(skyline_app, motif_analysis_file, 'w', str(motif_analysis))
    except Exception as e:
        trace = traceback.format_exc()
        logger.error('%s' % trace)
        fail_msg = '%s :: error :: failed to write motif_analysis_file - %s' % (
            function_str, motif_analysis_file)
        logger.error('%s' % fail_msg)
        dev_null = e

    motif_ids = list(motif_analysis[metric]['motifs'].keys())
    logger.info(
        '%s :: %s motif matches found, %s fps were checked and motifs plotted in %.6f seconds for %s' % (
            function_str, str(len(motif_ids)), str(len(fps_checked_for_motifs)),
            (end_timer - start_timer), metric))
    if dev_null:
        del dev_null
    return motif_analysis, fail_msg, trace
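# The area check in on_demand_motif_analysis above compares the areas under the
# two curves with the composite trapezoidal rule and rejects matches whose
# areas differ too much. A condensed sketch of that test; area_too_different is
# a hypothetical helper and the inlined percentage formula is an assumption
# about what Skyline's get_percent_different computes:
import numpy as np


def area_too_different(motif_values, fp_motif_values, max_area_percent_diff):
    """Return True when the areas under the two curves differ too much."""
    motif_area = np.trapz(np.array(motif_values), dx=1)
    fp_motif_area = np.trapz(np.array(fp_motif_values), dx=1)
    if not motif_area or not fp_motif_area:
        # mirror the source, which only applies the filter when both areas exist
        return False
    percent_different = abs((motif_area - fp_motif_area) / fp_motif_area) * 100
    return percent_different > max_area_percent_diff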