def get_valid_df(self, table, **kwargs) -> pd.DataFrame:
    """Read and post-process the dataframe for the given table.

    Resolves display/key/augmented fields from the table schema, validates
    the user-supplied start/end times, reads the data through the DB engine
    and post-processes the result (active-row filtering, dropping of
    bookkeeping columns, humanized timestamps).

    :param table: name of the table to read
    :param kwargs: columns, addnl_fields, view, active_only plus any
        engine filters passed through to ``self._dbeng.read``
    :returns: the resulting dataframe; an empty dataframe on time-parse
        errors, or a namespace/hostname-only frame when no engine is set
    """
    if not self.ctxt.engine:
        print("Specify an analysis engine using set engine command")
        return pd.DataFrame(columns=["namespace", "hostname"])

    def _to_epoch_ms(timestr):
        """Parse a user time string into epoch milliseconds ('' if empty).

        Raises on any parse failure (including dateparser.parse returning
        None, which triggers an AttributeError), matching the previous
        inline behavior at both call sites.
        """
        if not timestr:
            return ''
        return dateparser.parse(
            timestr.replace('last night', 'yesterday')).timestamp()*1000

    sch = SchemaForTable(table, schema=self.schemas)
    phy_table = sch.get_phy_table_for_table()

    columns = kwargs.pop('columns', ['default'])
    addnl_fields = kwargs.pop('addnl_fields', [])
    view = kwargs.pop('view', self.iobj.view)
    active_only = kwargs.pop('active_only', True)

    fields = sch.get_display_fields(columns)
    key_fields = sch.key_fields()
    drop_cols = []          # columns fetched for bookkeeping, dropped later

    if columns == ['*']:
        # sqvers is internal schema-versioning; never show it to the user
        drop_cols.append('sqvers')

    aug_fields = sch.get_augmented_fields()

    if 'timestamp' not in fields:
        fields.append('timestamp')

    if 'active' not in fields + addnl_fields:
        # 'active' is needed for filtering even if the user didn't ask
        addnl_fields.append('active')
        drop_cols.append('active')

    # Order matters. Don't put this before the missing key fields insert
    for f in aug_fields:
        dep_fields = sch.get_parent_fields(f)
        addnl_fields += dep_fields

    for fld in key_fields:
        if fld not in fields + addnl_fields:
            addnl_fields.insert(0, fld)
            drop_cols.append(fld)

    for f in addnl_fields:
        if f not in fields:
            # timestamp is always the last field
            fields.insert(-1, f)

    try:
        start_time = _to_epoch_ms(self.iobj.start_time)
    except Exception as e:
        print(f"ERROR: invalid time {self.iobj.start_time}: {e}")
        return pd.DataFrame()

    if self.iobj.start_time and not start_time:
        # Something went wrong with our parsing
        print(f"ERROR: unable to parse {self.iobj.start_time}")
        return pd.DataFrame()

    try:
        end_time = _to_epoch_ms(self.iobj.end_time)
    except Exception as e:
        print(f"ERROR: invalid time {self.iobj.end_time}: {e}")
        return pd.DataFrame()

    if self.iobj.end_time and not end_time:
        # Something went wrong with our parsing
        print(f"ERROR: Unable to parse {self.iobj.end_time}")
        return pd.DataFrame()

    table_df = self._dbeng.read(
        phy_table, 'pandas', start_time=start_time, end_time=end_time,
        columns=fields, view=view, key_fields=key_fields, **kwargs)

    if not table_df.empty:
        if view == "all" or not active_only:
            table_df.drop(columns=drop_cols, inplace=True)
        else:
            # keep only rows still marked active, then shed helper columns
            table_df = table_df.query('active') \
                .drop(columns=drop_cols)
        # the active filter above may have emptied the frame, so re-check
        if 'timestamp' in table_df.columns and not table_df.empty:
            table_df['timestamp'] = humanize_timestamp(
                table_df.timestamp,
                self.cfg.get('analyzer', {}).get('timezone', None))

    return table_df
def get_valid_df(self, table, **kwargs) -> pd.DataFrame:
    """Read and post-process the dataframe for the given table.

    Resolves display/key fields from the table schema, validates the
    user-supplied start/end times, reads the data via the DB engine and
    post-processes the result (active-row filtering, dropping of
    bookkeeping columns, humanized timestamps, optional user query).

    :param table: name of the table to read
    :param kwargs: columns, addnl_fields, view, active_only, query_str
        plus any engine filters passed through to ``self._dbeng.read``
    :returns: the resulting dataframe; an empty dataframe on time-parse
        errors, or a namespace/hostname-only frame when no engine is set
    """
    if not self.ctxt.engine:
        print("Specify an analysis engine using set engine command")
        return pd.DataFrame(columns=["namespace", "hostname"])

    sch = SchemaForTable(table, schema=self.schemas)
    phy_table = sch.get_phy_table_for_table()

    columns = kwargs.pop('columns', ['default'])
    addnl_fields = kwargs.pop('addnl_fields', [])
    view = kwargs.pop('view', self.iobj.view)
    active_only = kwargs.pop('active_only', True)
    query_str = kwargs.pop('query_str', '')

    # The REST API provides the query_str enclosed in ". Strip that
    if query_str:
        if query_str.startswith('"') and query_str.endswith('"'):
            query_str = query_str[1:-1]

    fields = sch.get_display_fields(columns)
    key_fields = sch.key_fields()
    drop_cols = []          # columns fetched for bookkeeping, dropped later

    if columns == ['*']:
        # sqvers is internal schema-versioning; never show it to the user
        drop_cols.append('sqvers')

    if 'timestamp' not in fields:
        fields.append('timestamp')

    if 'active' not in fields+addnl_fields:
        # 'active' is needed for filtering even if the user didn't ask
        addnl_fields.append('active')
        drop_cols.append('active')

    # key fields are prepended so they come first in the fetched frame
    for fld in key_fields:
        if fld not in fields+addnl_fields:
            addnl_fields.insert(0, fld)
            drop_cols.append(fld)

    for f in addnl_fields:
        if f not in fields:
            # timestamp is always the last field
            fields.insert(-1, f)

    # Convert natural-language start time to epoch milliseconds.
    # NOTE(review): dateparser.parse returning None raises AttributeError
    # on .timestamp(), which the broad except also catches.
    if self.iobj.start_time:
        try:
            start_time = dateparser.parse(
                self.iobj.start_time.replace('last night', 'yesterday')) \
                .timestamp()*1000
        except Exception as e:
            print(f"ERROR: invalid time {self.iobj.start_time}: {e}")
            return pd.DataFrame()
    else:
        start_time = ''

    if self.iobj.start_time and not start_time:
        # Something went wrong with our parsing
        print(f"ERROR: unable to parse {self.iobj.start_time}")
        return pd.DataFrame()

    if self.iobj.end_time:
        try:
            end_time = dateparser.parse(
                self.iobj.end_time.replace('last night', 'yesterday')) \
                .timestamp()*1000
        except Exception as e:
            print(f"ERROR: invalid time {self.iobj.end_time}: {e}")
            return pd.DataFrame()
    else:
        end_time = ''

    if self.iobj.end_time and not end_time:
        # Something went wrong with our parsing
        print(f"ERROR: Unable to parse {self.iobj.end_time}")
        return pd.DataFrame()

    table_df = self._dbeng.read(
        phy_table, 'pandas', start_time=start_time, end_time=end_time,
        columns=fields, view=view, key_fields=key_fields, **kwargs)

    if not table_df.empty:
        if view == 'latest' and active_only:
            # keep only rows still marked active, then shed helper columns
            table_df = table_df.query('active') \
                .drop(columns=drop_cols)
        else:
            table_df.drop(columns=drop_cols, inplace=True)
        if 'timestamp' in table_df.columns:
            table_df['timestamp'] = humanize_timestamp(
                table_df.timestamp, self.cfg.get('analyzer', {})
                .get('timezone', None))

    # Apply the user-supplied pandas query last, on the final column set
    if query_str:
        return table_df.query(query_str)
    else:
        return table_df
def get_valid_df(self, table, **kwargs) -> pd.DataFrame:
    """Fetch and sanitize the dataframe for the given table.

    Builds the field list from the table schema, validates the
    user-supplied time bounds, pulls the data via the analysis engine
    and post-processes the result (active filtering, column cleanup,
    timestamp conversion).

    :param table: name of the table to read
    :param kwargs: columns, addnl_fields, view, active_only plus any
        engine filters forwarded to ``get_table_df``
    :returns: the resulting dataframe; empty on invalid time bounds,
        or a namespace/hostname-only frame when no engine is set
    """
    if not self.ctxt.engine:
        print("Specify an analysis engine using set engine command")
        return pd.DataFrame(columns=["namespace", "hostname"])

    schema = SchemaForTable(table, schema=self.schemas)
    phy_table = schema.get_phy_table_for_table()

    user_columns = kwargs.pop('columns', ['default'])
    extra_fields = kwargs.pop('addnl_fields', [])
    view = kwargs.pop('view', self.iobj.view)
    active_only = kwargs.pop('active_only', True)

    display_fields = schema.get_display_fields(user_columns)
    key_fields = schema.key_fields()
    cols_to_drop = []       # bookkeeping columns removed before returning

    if 'timestamp' not in display_fields:
        display_fields.append('timestamp')

    if 'active' not in display_fields and 'active' not in extra_fields:
        # fetch 'active' for filtering even when the user didn't ask
        extra_fields.append('active')
        cols_to_drop.append('active')

    for key in key_fields:
        if key not in display_fields and key not in extra_fields:
            extra_fields.insert(0, key)
            cols_to_drop.append(key)

    for fld in extra_fields:
        if fld not in display_fields:
            # keep timestamp as the final column
            display_fields.insert(-1, fld)

    # Validate user-supplied time bounds before hitting the engine
    for bound in (self.iobj.start_time, self.iobj.end_time):
        if not bound:
            continue
        try:
            parse(bound)
        except (ValueError, ParserError) as e:
            print(f"invalid time {bound}: {e}")
            return pd.DataFrame()

    table_df = self.ctxt.engine.get_table_df(
        self.cfg,
        table=phy_table,
        start_time=self.iobj.start_time,
        end_time=self.iobj.end_time,
        columns=display_fields,
        view=view,
        key_fields=key_fields,
        **kwargs)

    if not table_df.empty:
        if view == 'latest' and active_only:
            # keep only rows still marked active, then shed helper columns
            table_df = table_df.query('active').drop(columns=cols_to_drop)
        else:
            table_df.drop(columns=cols_to_drop, inplace=True)
        if 'timestamp' in table_df.columns:
            table_df['timestamp'] = pd.to_datetime(
                table_df.timestamp.astype(str), unit="ms")

    return table_df
def get_table_df(self, cfg, schemas, **kwargs) -> pd.DataFrame:
    """Use Pandas instead of Spark to retrieve the data.

    Reads the parquet folder for the requested table, applies the
    partition-level filters and a pandas query built from the remaining
    keyword filters, and returns the selected fields (optionally sorted).

    :param cfg: analyzer configuration dict (also cached on self)
    :param schemas: the schema collection used to resolve the table
    :param kwargs: table, start_time, end_time, view, sort_fields are
        required; ign_key, addnl_fields, namespace, columns, add_filter
        and per-field value filters are optional
    :returns: dataframe restricted to the display fields; empty on a
        missing folder or unreadable parquet data
    """
    MAX_FILECNT_TO_READ_FOLDER = 10000

    self.cfg = cfg

    table = kwargs.pop("table")
    start = kwargs.pop("start_time")
    end = kwargs.pop("end_time")
    view = kwargs.pop("view")
    sort_fields = kwargs.pop("sort_fields")
    ign_key_fields = kwargs.pop("ign_key", [])
    addnl_fields = kwargs.pop("addnl_fields", [])

    # active & timestamp are always fetched: needed for dedup/filtering
    for f in ['active', 'timestamp']:
        if f not in addnl_fields:
            addnl_fields.append(f)

    sch = SchemaForTable(table, schema=schemas)
    phy_table = sch.get_phy_table_for_table()

    folder = self._get_table_directory(phy_table)

    # Restrict to a single DC if thats whats asked
    if "namespace" in kwargs:
        v = kwargs["namespace"]
        if v:
            if not isinstance(v, list):
                folder += "/namespace={}/".format(v)

    fcnt = self.get_filecnt(folder)
    if fcnt == 0:
        return pd.DataFrame()

    # Hard-coded off until there is automated testing for the
    # get_latest_files fast path (large folders / time-bounded reads).
    use_get_files = False

    if use_get_files:
        # Switch to more efficient method when there are lotsa files
        # Reduce I/O since that is the worst drag
        key_fields = []
        if len(kwargs.get("namespace", [])) > 1:
            del kwargs["namespace"]
        files = get_latest_files(folder, start, end, view)
    else:
        # ign_key_fields contains key fields that are not partition cols
        key_fields = [i for i in sch.key_fields()
                      if i not in ign_key_fields]
        filters = self.build_pa_filters(start, end, key_fields, **kwargs)

    if "columns" in kwargs:
        columns = kwargs["columns"]
        del kwargs["columns"]
    else:
        columns = ["default"]

    fields = sch.get_display_fields(columns)
    for f in addnl_fields:
        if f not in fields:
            fields.append(f)

    # Build a pandas query string from the remaining keyword filters.
    # A leading '!' on a string value negates the comparison.
    query_str = ""
    prefix = ""
    addnl_filter = kwargs.pop('add_filter', None)
    for f, v in kwargs.items():
        if not v or f in key_fields or f in ["groupby"]:
            continue
        if isinstance(v, str):
            if v.startswith('!'):
                v = v[1:]
                op = '!='
            else:
                op = '=='
            query_str += "{} {}{}'{}' ".format(prefix, f, op, v)
            prefix = "and"
        else:
            query_str += "{} {}=={} ".format(prefix, f, v)
            prefix = "and"

    # Add the ignored fields back to key fields to ensure we
    # do the drop_duplicates correctly below incl reading reqd cols
    key_fields.extend(ign_key_fields)

    # Handle the case where key fields are missing from display fields
    fldset = set(fields)
    kfldset = set(key_fields)
    add_flds = kfldset.difference(fldset)
    if add_flds:
        fields.extend(list(add_flds))

    if addnl_filter:
        # This is for special cases that are specific to an object
        if not query_str:
            query_str = addnl_filter
        else:
            query_str += ' and {}'.format(addnl_filter)

    # Restore the folder to what it needs to be
    folder = self._get_table_directory(phy_table)

    if use_get_files:
        if not query_str:
            query_str = "active == True"
        pdf_list = []
        with Executor(max_workers=8) as exe:
            jobs = [
                exe.submit(self.read_pq_file, f, fields, query_str)
                for f in files
            ]
            pdf_list = [job.result() for job in jobs]

        if pdf_list:
            final_df = pd.concat(pdf_list)
        else:
            final_df = pd.DataFrame(columns=fields)

    elif view == "latest":
        if not query_str:
            # Make up a dummy query string to avoid if/then/else
            query_str = "timestamp != 0"

        try:
            final_df = (
                pa.ParquetDataset(
                    folder, filters=filters or None, validate_schema=False
                )
                .read(columns=fields)
                .to_pandas(split_blocks=True, self_destruct=True)
                .query(query_str)
                .drop_duplicates(subset=key_fields, keep="last")
                .query("active == True")
            )
        except pa.lib.ArrowInvalid:
            return pd.DataFrame(columns=fields)
    else:
        if not query_str:
            # Make up a dummy query string to avoid if/then/else
            query_str = 'timestamp != "0"'

        try:
            final_df = (
                pa.ParquetDataset(
                    folder, filters=filters or None, validate_schema=False
                )
                .read(columns=fields)
                .to_pandas()
                .query(query_str)
            )
        except pa.lib.ArrowInvalid:
            return pd.DataFrame(columns=fields)

    if 'active' not in columns:
        # 'active' was only fetched for filtering; hide it from the user
        final_df.drop(columns=['active'], inplace=True)
        fields.remove('active')

    final_df = df_timestamp_to_datetime(final_df)

    # BUGFIX: sort only when every requested sort field is actually an
    # available column (the old check tested fields ⊆ sort_fields, which
    # both almost never held and did not guarantee sort_values could run
    # on final_df[fields]).
    if sort_fields and all(f in fields for f in sort_fields):
        return final_df[fields].sort_values(by=sort_fields)
    else:
        return final_df[fields]