Example #1
    def get_valid_df(self, table, **kwargs) -> pd.DataFrame:
        if not self.ctxt.engine:
            print("Specify an analysis engine using set engine command")
            return pd.DataFrame(columns=["namespace", "hostname"])

        sch = SchemaForTable(table, schema=self.schemas)
        phy_table = sch.get_phy_table_for_table()

        columns = kwargs.pop('columns', ['default'])
        addnl_fields = kwargs.pop('addnl_fields', [])
        view = kwargs.pop('view', self.iobj.view)
        active_only = kwargs.pop('active_only', True)

        fields = sch.get_display_fields(columns)
        key_fields = sch.key_fields()
        drop_cols = []

        if columns == ['*']:
            drop_cols.append('sqvers')

        aug_fields = sch.get_augmented_fields()

        if 'timestamp' not in fields:
            fields.append('timestamp')

        if 'active' not in fields + addnl_fields:
            addnl_fields.append('active')
            drop_cols.append('active')

        # Order matters. Don't put this before the missing key fields insert
        for f in aug_fields:
            dep_fields = sch.get_parent_fields(f)
            addnl_fields += dep_fields

        for fld in key_fields:
            if fld not in fields + addnl_fields:
                addnl_fields.insert(0, fld)
                drop_cols.append(fld)

        for f in addnl_fields:
            if f not in fields:
                # timestamp is always the last field
                fields.insert(-1, f)

        if self.iobj.start_time:
            try:
                start_time = dateparser.parse(
                    self.iobj.start_time.replace('last night', 'yesterday')) \
                    .timestamp()*1000
            except Exception as e:
                print(f"ERROR: invalid time {self.iobj.start_time}: {e}")
                return pd.DataFrame()
        else:
            start_time = ''

        if self.iobj.start_time and not start_time:
            # Something went wrong with our parsing
            print(f"ERROR: unable to parse {self.iobj.start_time}")
            return pd.DataFrame()

        if self.iobj.end_time:
            try:
                end_time = dateparser.parse(
                    self.iobj.end_time.replace('last night', 'yesterday')) \
                    .timestamp()*1000
            except Exception as e:
                print(f"ERROR: invalid time {self.iobj.end_time}: {e}")
                return pd.DataFrame()
        else:
            end_time = ''

        if self.iobj.end_time and not end_time:
            # Something went wrong with our parsing
            print(f"ERROR: Unable to parse {self.iobj.end_time}")
            return pd.DataFrame()

        table_df = self._dbeng.read(phy_table,
                                    'pandas',
                                    start_time=start_time,
                                    end_time=end_time,
                                    columns=fields,
                                    view=view,
                                    key_fields=key_fields,
                                    **kwargs)

        if not table_df.empty:
            if view == "all" or not active_only:
                table_df.drop(columns=drop_cols, inplace=True)
            else:
                table_df = table_df.query('active') \
                                   .drop(columns=drop_cols)
            if 'timestamp' in table_df.columns and not table_df.empty:
                table_df['timestamp'] = humanize_timestamp(
                    table_df.timestamp,
                    self.cfg.get('analyzer', {}).get('timezone', None))

        return table_df
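
The time handling in this example turns a free-form string such as "10 min ago" into epoch milliseconds via dateparser. Below is a minimal standalone sketch of that conversion, assuming only the dateparser package; the helper name to_epoch_ms is ours, not part of the example.

import dateparser

def to_epoch_ms(timestr: str) -> float:
    """Parse a human-friendly time string into epoch milliseconds.

    dateparser.parse returns None when it cannot interpret the input,
    which is why the example wraps the call in try/except and also
    re-checks the result afterwards.
    """
    # dateparser does not understand "last night", so map it to
    # "yesterday" exactly as the example does before parsing
    parsed = dateparser.parse(timestr.replace('last night', 'yesterday'))
    if parsed is None:
        raise ValueError(f"unable to parse {timestr}")
    return parsed.timestamp() * 1000

# e.g. to_epoch_ms('2 hours ago') or to_epoch_ms('last night')
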
Example #2
    def get_valid_df(self, table, **kwargs) -> pd.DataFrame:
        if not self.ctxt.engine:
            print("Specify an analysis engine using set engine command")
            return pd.DataFrame(columns=["namespace", "hostname"])

        sch = SchemaForTable(table, schema=self.schemas)
        phy_table = sch.get_phy_table_for_table()

        columns = kwargs.pop('columns', ['default'])
        addnl_fields = kwargs.pop('addnl_fields', [])
        view = kwargs.pop('view', self.iobj.view)
        active_only = kwargs.pop('active_only', True)
        query_str = kwargs.pop('query_str', '')

        # The REST API provides the query_str enclosed in double quotes. Strip them.
        if query_str:
            if query_str.startswith('"') and query_str.endswith('"'):
                query_str = query_str[1:-1]

        fields = sch.get_display_fields(columns)
        key_fields = sch.key_fields()
        drop_cols = []

        if columns == ['*']:
            drop_cols.append('sqvers')

        if 'timestamp' not in fields:
            fields.append('timestamp')

        if 'active' not in fields+addnl_fields:
            addnl_fields.append('active')
            drop_cols.append('active')

        for fld in key_fields:
            if fld not in fields+addnl_fields:
                addnl_fields.insert(0, fld)
                drop_cols.append(fld)

        for f in addnl_fields:
            if f not in fields:
                # timestamp is always the last field
                fields.insert(-1, f)

        if self.iobj.start_time:
            try:
                start_time = dateparser.parse(
                    self.iobj.start_time.replace('last night', 'yesterday')) \
                    .timestamp()*1000
            except Exception as e:
                print(f"ERROR: invalid time {self.iobj.start_time}: {e}")
                return pd.DataFrame()
        else:
            start_time = ''

        if self.iobj.start_time and not start_time:
            # Something went wrong with our parsing
            print(f"ERROR: unable to parse {self.iobj.start_time}")
            return pd.DataFrame()

        if self.iobj.end_time:
            try:
                end_time = dateparser.parse(
                    self.iobj.end_time.replace('last night', 'yesterday')) \
                    .timestamp()*1000
            except Exception as e:
                print(f"ERROR: invalid time {self.iobj.end_time}: {e}")
                return pd.DataFrame()
        else:
            end_time = ''

        if self.iobj.end_time and not end_time:
            # Something went wrong with our parsing
            print(f"ERROR: Unable to parse {self.iobj.end_time}")
            return pd.DataFrame()

        table_df = self._dbeng.read(
            phy_table,
            'pandas',
            start_time=start_time,
            end_time=end_time,
            columns=fields,
            view=view,
            key_fields=key_fields,
            **kwargs
        )

        if not table_df.empty:
            if view == 'latest' and active_only:
                table_df = table_df.query('active') \
                                   .drop(columns=drop_cols)
            else:
                table_df.drop(columns=drop_cols, inplace=True)
            if 'timestamp' in table_df.columns:
                table_df['timestamp'] = humanize_timestamp(
                    table_df.timestamp, self.cfg.get('analyzer', {})
                    .get('timezone', None))

        if query_str:
            return table_df.query(query_str)
        else:
            return table_df
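
Example #2 differs from #1 mainly in the query_str handling: the double quotes the REST API wraps around the expression are stripped, and the final DataFrame is filtered with pandas' query(). A toy illustration of that filtering step (the data below is made up purely for demonstration):

import pandas as pd

df = pd.DataFrame({
    'hostname': ['leaf01', 'leaf02', 'spine01'],
    'mtu': [1500, 9216, 9216],
})

# The REST API delivers the expression wrapped in double quotes
query_str = '"mtu == 9216 and hostname != \'spine01\'"'
if query_str.startswith('"') and query_str.endswith('"'):
    query_str = query_str[1:-1]

# pandas evaluates the expression against the column names;
# only the leaf02 row matches here
print(df.query(query_str))
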
Example #3
    def get_valid_df(self, table, **kwargs) -> pd.DataFrame:
        if not self.ctxt.engine:
            print("Specify an analysis engine using set engine command")
            return pd.DataFrame(columns=["namespace", "hostname"])

        sch = SchemaForTable(table, schema=self.schemas)
        phy_table = sch.get_phy_table_for_table()

        columns = kwargs.pop('columns', ['default'])
        addnl_fields = kwargs.pop('addnl_fields', [])
        view = kwargs.pop('view', self.iobj.view)
        active_only = kwargs.pop('active_only', True)

        fields = sch.get_display_fields(columns)
        key_fields = sch.key_fields()
        drop_cols = []

        if 'timestamp' not in fields:
            fields.append('timestamp')

        if 'active' not in fields + addnl_fields:
            addnl_fields.append('active')
            drop_cols.append('active')

        for fld in key_fields:
            if fld not in fields + addnl_fields:
                addnl_fields.insert(0, fld)
                drop_cols.append(fld)

        for f in addnl_fields:
            if f not in fields:
                # timestamp is always the last field
                fields.insert(-1, f)

        for dt in [self.iobj.start_time, self.iobj.end_time]:
            if dt:
                try:
                    parse(dt)
                except (ValueError, ParserError) as e:
                    print(f"invalid time {dt}: {e}")
                    return pd.DataFrame()

        table_df = self.ctxt.engine.get_table_df(
            self.cfg,
            table=phy_table,
            start_time=self.iobj.start_time,
            end_time=self.iobj.end_time,
            columns=fields,
            view=view,
            key_fields=key_fields,
            **kwargs)

        if not table_df.empty:
            if view == 'latest' and active_only:
                table_df = table_df.query('active') \
                                   .drop(columns=drop_cols)
            else:
                table_df.drop(columns=drop_cols, inplace=True)
            if 'timestamp' in table_df.columns:
                table_df['timestamp'] = pd.to_datetime(
                    table_df.timestamp.astype(str), unit="ms")

        return table_df
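
Unlike the first two examples, Example #3 only validates the start/end time strings and hands the raw strings to the engine. A sketch of that validation step, assuming parse and ParserError come from dateutil.parser (the helper name is_valid_time is ours):

from dateutil.parser import ParserError, parse

def is_valid_time(timestr: str) -> bool:
    """Return True if dateutil can parse the string, False otherwise."""
    try:
        parse(timestr)
    except (ValueError, ParserError):
        # ParserError subclasses ValueError; catching both mirrors
        # the example above
        return False
    return True

# e.g. is_valid_time('2021-01-01 10:00')  -> True
#      is_valid_time('not a time at all') -> False
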
Example #4
    def get_table_df(self, cfg, schemas, **kwargs) -> pd.DataFrame:
        """Use Pandas instead of Spark to retrieve the data"""

        MAX_FILECNT_TO_READ_FOLDER = 10000

        self.cfg = cfg

        table = kwargs.pop("table")
        start = kwargs.pop("start_time")
        end = kwargs.pop("end_time")
        view = kwargs.pop("view")
        sort_fields = kwargs.pop("sort_fields")
        ign_key_fields = kwargs.pop("ign_key", [])
        addnl_fields = kwargs.pop("addnl_fields", [])

        for f in ['active', 'timestamp']:
            if f not in addnl_fields:
                addnl_fields.append(f)

        sch = SchemaForTable(table, schema=schemas)
        phy_table = sch.get_phy_table_for_table()

        folder = self._get_table_directory(phy_table)

        # Restrict to a single DC if that's what's asked
        if "namespace" in kwargs:
            v = kwargs["namespace"]
            if v:
                if not isinstance(v, list):
                    folder += "/namespace={}/".format(v)

        fcnt = self.get_filecnt(folder)

        if fcnt == 0:
            return pd.DataFrame()

        # We are going to hard-code use_get_files until we have some automated testing
        use_get_files = False

        # use_get_files = (
        #    (fcnt > MAX_FILECNT_TO_READ_FOLDER and view == "latest") or
        #    start or end
        # )

        if use_get_files:
            # Switch to the more efficient method when there are lots of
            # files, to reduce I/O since that is the worst drag
            key_fields = []
            if len(kwargs.get("namespace", [])) > 1:
                del kwargs["namespace"]
            files = get_latest_files(folder, start, end, view)
        else:
            # ign_key_fields contains key fields that are not partition cols
            key_fields = [i for i in sch.key_fields()
                          if i not in ign_key_fields]
            filters = self.build_pa_filters(start, end, key_fields, **kwargs)

        if "columns" in kwargs:
            columns = kwargs["columns"]
            del kwargs["columns"]
        else:
            columns = ["default"]

        fields = sch.get_display_fields(columns)
        for f in addnl_fields:
            if f not in fields:
                fields.append(f)

        # Create the filter to select only specified columns
        query_str = ""
        prefix = ""
        addnl_filter = kwargs.pop('add_filter', None)
        for f, v in kwargs.items():
            if not v or f in key_fields or f in ["groupby"]:
                continue
            if isinstance(v, str):
                if v.startswith('!'):
                    v = v[1:]
                    op = '!='
                else:
                    op = '=='
                query_str += "{} {}{}'{}' ".format(prefix, f, op, v)
                prefix = "and"
            else:
                query_str += "{} {}=={} ".format(prefix, f, v)
                prefix = "and"

        # Add the ignored fields back to the key fields to ensure we
        # do the drop_duplicates correctly below, including reading the
        # required columns
        key_fields.extend(ign_key_fields)

        # Handle the case where key fields are missing from display fields
        fldset = set(fields)
        kfldset = set(key_fields)
        add_flds = kfldset.difference(fldset)
        if add_flds:
            fields.extend(list(add_flds))

        if addnl_filter:
            # This is for special cases that are specific to an object
            if not query_str:
                query_str = addnl_filter
            else:
                query_str += ' and {}'.format(addnl_filter)

        # Restore the folder to what it needs to be
        folder = self._get_table_directory(phy_table)
        if use_get_files:
            if not query_str:
                query_str = "active == True"

            pdf_list = []
            with Executor(max_workers=8) as exe:
                jobs = [
                    exe.submit(self.read_pq_file, f, fields, query_str)
                    for f in files
                ]
                pdf_list = [job.result() for job in jobs]

            if pdf_list:
                final_df = pd.concat(pdf_list)
            else:
                final_df = pd.DataFrame(columns=fields)

        elif view == "latest":
            if not query_str:
                # Make up a dummy query string to avoid if/then/else
                query_str = "timestamp != 0"

            try:
                final_df = (
                    pa.ParquetDataset(
                        folder, filters=filters or None, validate_schema=False
                    )
                    .read(columns=fields)
                    .to_pandas(split_blocks=True, self_destruct=True)
                    .query(query_str)
                    .drop_duplicates(subset=key_fields, keep="last")
                    .query("active == True")
                )
            except pa.lib.ArrowInvalid:
                return pd.DataFrame(columns=fields)
        else:
            if not query_str:
                # Make up a dummy query string to avoid if/then/else
                query_str = 'timestamp != "0"'

            try:
                final_df = (
                    pa.ParquetDataset(
                        folder, filters=filters or None, validate_schema=False
                    )
                    .read(columns=fields)
                    .to_pandas()
                    .query(query_str)
                )
            except pa.lib.ArrowInvalid:
                return pd.DataFrame(columns=fields)

        if 'active' not in columns:
            final_df.drop(columns=['active'], inplace=True)
            fields.remove('active')

        final_df = df_timestamp_to_datetime(final_df)

        # Sort only if every requested sort field is actually present
        if sort_fields and all(x in fields for x in sort_fields):
            return final_df[fields].sort_values(by=sort_fields)
        else:
            return final_df[fields]
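
The loop in Example #4 that assembles query_str converts keyword filters into a pandas query expression, with a leading '!' on a string value meaning "not equal". A condensed, standalone version of that string building (simplified: it omits the key_fields/groupby exclusions of the original):

def build_query_str(**filters) -> str:
    """Build a pandas query expression from keyword filters.

    String values starting with '!' become != comparisons, everything
    else becomes ==; non-string values are compared unquoted.
    """
    clauses = []
    for field, value in filters.items():
        if not value:
            continue
        if isinstance(value, str):
            if value.startswith('!'):
                clauses.append(f"{field} != '{value[1:]}'")
            else:
                clauses.append(f"{field} == '{value}'")
        else:
            clauses.append(f"{field} == {value}")
    return ' and '.join(clauses)

# e.g. build_query_str(hostname='!leaf01', mtu=9216)
# -> "hostname != 'leaf01' and mtu == 9216"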