def __getLatest(self, dt=None):
    print(f'{self.dateFrom} - {self.dateTo} Grab latest data', dt)
    if not dt:
        dt = hp.GetTimeRanges(self.dateFrom, self.dateTo)
    allNodesDf = qrs.allTestedNodes(dt)
    allNodesDf = allNodesDf[(allNodesDf['ip'] != '') & ~(allNodesDf['ip'].isnull())]
    # in some cases a single IP has 2 different hostnames
    allNodesDf = self.__removeDuplicates(allNodesDf)

    rows = []
    # run a query for each IP because it is not a trivial task (if possible at all)
    # to aggregate the geolocation fields
    for item in allNodesDf.sort_values('host', ascending=False).to_dict('records'):
        lastRec = qrs.mostRecentMetaRecord(item['ip'], item['ipv6'], dt)
        if len(lastRec) > 0:
            rows.append(lastRec)
        else:
            item['site_index'] = item['site']
            rows.append(item)

    columns = ['ip', 'timestamp', 'host', 'site', 'administrator', 'email',
               'lat', 'lon', 'site_meta', 'site_index']
    df = pd.DataFrame(rows, columns=columns)
    df = df.drop_duplicates()
    df['last_update'] = df['timestamp'].apply(lambda ts: self.convertTime(ts))
    # records with no timestamp fall back to the end of the queried range
    df['last_update'] = df['last_update'].fillna(self.convertTime(dt[1]))

    return df
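# The functions in this module all slice the queried period with
# hp.GetTimeRanges. A minimal sketch of its assumed behavior (not the
# project's actual implementation): given a 'YYYY-MM-DD HH:MM' period and an
# optional number of intervals, it returns n+1 epoch-millisecond boundaries
# delimiting n equal sub-ranges.
from datetime import datetime

def get_time_ranges_sketch(date_from, date_to, intervals=1):
    fmt = '%Y-%m-%d %H:%M'
    start = int(datetime.strptime(date_from, fmt).timestamp() * 1000)
    end = int(datetime.strptime(date_to, fmt).timestamp() * 1000)
    step = (end - start) / intervals
    # n intervals need n+1 boundary points
    return [int(start + i * step) for i in range(intervals + 1)]

# e.g. get_time_ranges_sketch('2022-05-17 20:15', '2022-05-18 08:15', 12)
# -> 13 boundaries delimiting 12 one-hour sub-ranges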
def runInParallel(dateFrom, dateTo):
    # query the past 12 hours and split the period into 12 time ranges
    # dateFrom, dateTo = hp.defaultTimeRange(12)
    # dateFrom, dateTo = ['2022-05-17 20:15', '2022-05-18 08:15']
    print(f' Run for period: {dateFrom} - {dateTo}')
    dtList = hp.GetTimeRanges(dateFrom, dateTo, 12)
    with ProcessPoolExecutor(max_workers=4) as pool:
        result = pool.map(getTraceData,
                          [[dtList[i], dtList[i + 1]] for i in range(len(dtList) - 1)])
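# Hypothetical driver for runInParallel; hp.defaultTimeRange is assumed to
# return a (dateFrom, dateTo) pair, as the commented-out line above suggests.
if __name__ == '__main__':
    dateFrom, dateTo = hp.defaultTimeRange(12)
    runInParallel(dateFrom, dateTo)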
def queryData(self, idx):
    data = []
    # query in one-hour portions since ES does not allow aggregations
    # with more than 10000 bins
    intv = int(hp.CalcMinutes4Period(self.dateFrom, self.dateTo) / 60)
    time_list = hp.GetTimeRanges(self.dateFrom, self.dateTo, intv)
    for i in range(len(time_list) - 1):
        data.extend(qrs.query4Avg(idx, time_list[i], time_list[i + 1]))

    return data
def queryData(dateFrom, dateTo):
    data = []
    # query in portions since ES does not allow aggregations with more than 10000 bins
    intv = int(hp.CalcMinutes4Period(dateFrom, dateTo) / 60)
    time_list = hp.GetTimeRanges(dateFrom, dateTo, intv)
    for i in range(len(time_list) - 1):
        data.extend(query4Avg(time_list[i], time_list[i + 1]))

    return data
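# Worked example of the chunking arithmetic above (numbers are illustrative):
# a 24-hour period gives hp.CalcMinutes4Period == 1440, so intv == 24 one-hour
# sub-ranges; even at one-second resolution a sub-query then aggregates only
# 3600 buckets, safely under Elasticsearch's 10000-bucket ceiling.
minutes_in_period = 1440            # assumed return value for a 24 h period
intv = int(minutes_in_period / 60)  # -> 24 sub-ranges
assert intv == 24 and 3600 < 10000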
def loadPacketLossData(dateFrom, dateTo):
    data = []
    intv = int(hp.CalcMinutes4Period(dateFrom, dateTo) / 60)
    time_list = hp.GetTimeRanges(dateFrom, dateTo, intv)
    for i in range(len(time_list) - 1):
        data.extend(
            qrs.query4Avg('ps_packetloss', time_list[i], time_list[i + 1]))

    return pd.DataFrame(data)
def InOutDf(self, idx, idx_df):
    print(idx)
    in_out_values = []
    time_list = hp.GetTimeRanges(self.dateFrom, self.dateTo)
    for t in ['dest_host', 'src_host']:
        meta_df = idx_df.copy()
        df = pd.DataFrame(
            qrs.queryDailyAvg(idx, t, time_list[0], time_list[1])).reset_index()
        df['index'] = pd.to_datetime(df['index'], unit='ms').dt.strftime('%d/%m')
        df = df.transpose()
        header = df.iloc[0]  # keep the actual dates for the returned "dates" field
        df = df[1:]
        df.columns = ['day-3', 'day-2', 'day-1', 'day']

        meta_df = pd.merge(meta_df, df, left_on="host", right_index=True)

        # average each day per site; skipna=False so a site with a missing
        # host value stays NaN instead of being silently averaged over
        daily = [
            meta_df.groupby('site').agg(
                {col: lambda x: x.mean(skipna=False)}).reset_index()
            for col in ['day-3', 'day-2', 'day-1', 'day']
        ]
        site_avg_df = reduce(
            lambda x, y: pd.merge(x, y, on='site', how='outer'), daily)
        site_avg_df.set_index('site', inplace=True)

        # day-over-day relative change, appended next to the raw values
        change = site_avg_df.pct_change(axis='columns')
        site_avg_df = pd.merge(site_avg_df, change,
                               left_index=True, right_index=True,
                               suffixes=('_val', ''))
        site_avg_df['direction'] = 'IN' if t == 'dest_host' else 'OUT'

        in_out_values.append(site_avg_df)

    site_df = pd.concat(in_out_values).reset_index()
    site_df = site_df.round(2)

    return {"data": site_df, "dates": header}
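# A small self-contained illustration of the pct_change(axis='columns') step
# above: for each site it computes the relative day-over-day change, so a rise
# from 10 to 12 between 'day-1' and 'day' shows up as 0.2 (values are made up).
import pandas as pd

demo = pd.DataFrame({'day-1': [10.0, 4.0], 'day': [12.0, 3.0]},
                    index=['SITE-A', 'SITE-B'])
print(demo.pct_change(axis='columns'))
#         day-1   day
# SITE-A    NaN  0.20
# SITE-B    NaN -0.25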
def getData(self, src, dest):
    time_list = hp.GetTimeRanges(self.root_parent.dateFrom,
                                 self.root_parent.dateTo)
    df = pd.DataFrame(qrs.queryAllValues(self._idx, src, dest, time_list))
    df.rename(columns={hp.getValueField(self._idx): 'value'}, inplace=True)
    if len(df) > 0:
        # transformed copies of the measurement for plotting on other scales;
        # zeros become NaN so the log does not blow up
        df['log_value'] = np.log(df['value'].replace(0, np.nan))
        df['sqrt'] = df['value']**(1 / 2)
    return df
def queryNodesGeoLocation():
    include = ["geolocation", "external_address.ipv4_address",
               "external_address.ipv6_address", "config.site_name", "host"]
    period = hp.GetTimeRanges(*hp.defaultTimeRange(days=30))
    query = {
        "query": {
            "bool": {
                "filter": [
                    {
                        "range": {
                            "timestamp": {
                                "gte": period[0],
                                "lte": period[1]
                            }
                        }
                    }
                ]
            }
        }
    }

    data = scan(client=hp.es, index='ps_meta', query=query, _source=include,
                filter_path=['_scroll_id', '_shards', 'hits.hits._source'])

    count = 0
    ddict = {}
    for res in data:
        # print progress every 100000 documents
        if not count % 100000:
            print(count)

        doc = res['_source']

        site = doc['config']['site_name'] if 'config' in doc else None

        if 'ipv4_address' in doc['external_address']:
            ip = doc['external_address']['ipv4_address']
        else:
            ip = doc['external_address']['ipv6_address']

        geoip = [None, None]
        if 'geolocation' in doc:
            # geolocation is stored as a "lat,lon" string
            geoip = doc['geolocation'].split(",")

        # if 'speed' in doc['external_address']:
        #     speed = doc['external_address']['speed']

        if (ip in ddict) and (site is not None):
            ddict[ip]['site'] = site
        else:
            ddict[ip] = {'lat': geoip[0], 'lon': geoip[1],
                         'site': site, 'host': doc['host']}
        count = count + 1

    return ddict
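# Hypothetical shape of a ps_meta document as the parser above assumes it
# (field names come from the code; the values here are invented):
sample_doc = {
    'geolocation': '46.24,6.05',               # "lat,lon" string
    'external_address': {'ipv4_address': '192.0.2.10'},
    'config': {'site_name': 'EXAMPLE-SITE'},   # may be absent
    'host': 'psnode.example.org',
}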
def __updateDataset(self):
    self.metaDf = self.getMetafromES()
    if len(self.metaDf) > 1:
        print('Update meta data')
        self.metaDf = self.__updateMetaData(self.metaDf, self.__getLatest())
    else:
        # Initially, grab one year of data split into 10 chunks in order to
        # fill in info that may not appear in the most recent data
        print('No data found. Query a year back.')
        dateTo = datetime.strftime(self.now, '%Y-%m-%d %H:%M')
        dateFrom = datetime.strftime(self.now - timedelta(days=365),
                                     '%Y-%m-%d %H:%M')
        timeRange = hp.GetTimeRanges(dateFrom, dateTo, 10)
        self.metaDf = self.__getLatest([timeRange[0], timeRange[1]])
        for i in range(1, len(timeRange) - 1):
            print(f'Period: {timeRange[i]}, {timeRange[i+1]}, '
                  f'data size before update: {len(self.metaDf)}')
            self.metaDf = self.__updateMetaData(
                self.metaDf, self.__getLatest([timeRange[i], timeRange[i + 1]]))
            print(f'Size after update: {len(self.metaDf)}')
            print()

    # self.metaDf.loc[self.metaDf.site == 'CERN-PROD', 'lat'] = 46.2416566
    # self.metaDf.loc[self.metaDf.site == 'CERN-PROD', 'lon'] = 6.0468415

    # Finally, try to fix empty fields by searching for similar host names
    # and assigning their values
    try:
        self.metaDf = self.__fixMissingSites(self.metaDf)
        self.metaDf = self.metaDf.fillna(
            self.metaDf[['site', 'lat', 'lon']].groupby('site').ffill())
        self.metaDf = self.metaDf.drop_duplicates(subset='ip', keep="last")

        # remove the >1500 nodes for which there is no meaningful info at all
        self.metaDf.fillna('', inplace=True)
        toRemoveIds = self.metaDf[
            ~(self.metaDf['lat'].astype(bool)) & ~(self.metaDf['lon'].astype(bool))
            & ~(self.metaDf['administrator'].astype(bool))
            & ~(self.metaDf['site_index'].astype(bool))
            & ~(self.metaDf['site_meta'].astype(bool))
            & ~(self.metaDf['email'].astype(bool))
        ].index.values
        self.metaDf = self.metaDf[~self.metaDf.index.isin(toRemoveIds)]
    except Exception:
        print(traceback.format_exc())
    finally:
        print('Meta data done')
def getValues(self, probdf):
    # probdf = markNodes()
    frames = []
    time_list = hp.GetTimeRanges(self.dateFrom, self.dateTo)
    for item in probdf[['src', 'dest', 'idx']].values:
        tempdf = pd.DataFrame(
            qrs.queryAllValues(item[2], item, time_list[0], time_list[1]))
        tempdf['idx'] = item[2]
        tempdf['hash'] = item[0] + "-" + item[1]
        tempdf['src'] = item[0]
        tempdf['dest'] = item[1]
        tempdf.rename(columns={hp.getValueField(item[2]): 'value'},
                      inplace=True)
        frames.append(tempdf)

    # DataFrame.append was removed in pandas 2.0; concatenate once instead
    if frames:
        return pd.concat(frames, ignore_index=True)
    return pd.DataFrame(columns=['timestamp', 'value', 'idx', 'hash'])
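# Hypothetical probdf for getValues: one row per problematic src/dest pair,
# plus the index the pair was measured in (markNodes, per the comment above,
# is the real producer of this frame).
probdf_example = pd.DataFrame([
    {'src': 'host-a.example.org', 'dest': 'host-b.example.org',
     'idx': 'ps_packetloss'},
])
# df = obj.getValues(probdf_example)  # obj: instance of the enclosing class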
def run(dateFrom, dateTo):
    # query the past 24 hours and split the period into 8 time ranges
    dtList = hp.GetTimeRanges(dateFrom, dateTo, 8)
    with ProcessPoolExecutor(max_workers=4) as pool:
        result = pool.map(getTraceData,
                          [[dtList[i], dtList[i + 1]] for i in range(len(dtList) - 1)])