def preview_pipeline(self, pipeline: Pipeline, limit: int = 50, offset: int = 0) -> str:
    """
    Execute a pipeline but return only a slice of the results, determined by the
    `limit` and `offset` parameters, as JSON.

    The return format follows the 'table' JSON table schema used by pandas (see
    https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#orient-options),
    with a few additions related to pagination.

    Note: pandas' `to_json` method is required here, as it converts NaN and dates
    to an appropriate format.
    """
    df = self.execute_pipeline(pipeline)
    return json.dumps({
        'schema': build_table_schema(df, index=False),
        'offset': offset,
        'limit': limit,
        'total': df.shape[0],
        'data': json.loads(df[offset:offset + limit].to_json(orient='records')),
    })
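# A minimal pagination sketch mirroring preview_pipeline above, using a plain
# DataFrame in place of the Pipeline/execute_pipeline machinery (which is assumed
# and not defined in this excerpt). Only pandas and the standard library are needed.
import json

import pandas as pd
from pandas.io.json import build_table_schema


def paginate_df(df: pd.DataFrame, limit: int = 50, offset: int = 0) -> str:
    # Same envelope: 'table' schema plus offset/limit/total pagination fields
    return json.dumps({
        'schema': build_table_schema(df, index=False),
        'offset': offset,
        'limit': limit,
        'total': df.shape[0],
        'data': json.loads(df[offset:offset + limit].to_json(orient='records')),
    })


# A 3-row frame previewed with limit=2 reports total=3 and carries 2 data records.
print(paginate_df(pd.DataFrame({'a': [1, 2, 3]}), limit=2))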
def update_preview_fields_from_df(artifact, df, stats=None, preview_rows_length=None,
                                  ignore_preview_limits=False):
    preview_rows_length = preview_rows_length or default_preview_rows_length
    if hasattr(df, "dask"):
        # Dask dataframe: compute the row count and sample a fraction for the preview
        artifact.length = df.shape[0].compute()
        preview_df = df.sample(frac=ddf_sample_pct).compute()
    else:
        artifact.length = df.shape[0]
        preview_df = df

    # Trim the preview to the configured row and column limits unless told otherwise
    if artifact.length > preview_rows_length and not ignore_preview_limits:
        preview_df = df.head(preview_rows_length)
    preview_df = preview_df.reset_index()
    if len(preview_df.columns) > max_preview_columns and not ignore_preview_limits:
        preview_df = preview_df.iloc[:, :max_preview_columns]
    artifact.header = preview_df.columns.values.tolist()
    artifact.preview = preview_df.values.tolist()
    artifact.schema = build_table_schema(preview_df)
    if (stats or (artifact.length < max_csv and len(df.columns) < max_preview_columns)
            or ignore_preview_limits):
        artifact.stats = get_df_stats(df)
def __init__(self, key=None, df=None, preview=None, format='', stats=None,
             target_path=None, **kwargs):
    format = format.lower()
    super().__init__(key, None, format=format, target_path=target_path)
    if format and format not in supported_formats:
        raise ValueError('unsupported format {} use one of {}'.format(
            format, '|'.join(supported_formats)))

    if format == 'pq':
        format = 'parquet'
    self.format = format
    self.stats = None

    if df is not None:
        self.header = df.columns.values.tolist()
        self.length = df.shape[0]
        preview = preview or preview_lines
        shortdf = df
        if self.length > preview:
            shortdf = df.head(preview)
        self.preview = shortdf.values.tolist()
        self.schema = build_table_schema(df)
        if stats or self.length < max_csv:
            self.stats = get_stats(df)

    self._df = df
    self._kw = kwargs
def get_schema(cls, dataframe):
    schema = build_table_schema(dataframe)
    c = {}
    for x in schema['fields']:
        c[x['name']] = cls._translate_datatypes(x['type'])
    return c
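# For reference, build_table_schema reports pandas' logical type names in the
# 'fields' list that get_schema above translates; roughly (the exact pandas_version
# value and any extra keys vary by pandas release):
import pandas as pd
from pandas.io.json import build_table_schema

df = pd.DataFrame({
    'id': [1, 2],
    'name': ['a', 'b'],
    'ts': pd.to_datetime(['2021-01-01', '2021-01-02']),
})
schema = build_table_schema(df, index=False)
# schema['fields'] -> [{'name': 'id', 'type': 'integer'},
#                      {'name': 'name', 'type': 'string'},
#                      {'name': 'ts', 'type': 'datetime'}]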
def get_schema(csv):
    """
    Get schema for collected csv

    Args:
        csv: csv string file

    Returns:
        StructType with schemas
    """
    schemas = StructType()
    for field in build_table_schema(csv, index=False)["fields"]:
        type = StringType() if field["type"] == "string" \
            else FloatType() if field["type"] == "number" \
            else IntegerType()
        schemas.add(StructField(field["name"], type, True))
    return schemas
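# A hypothetical usage sketch for the Spark-facing get_schema above: since
# build_table_schema expects a pandas object, the "collected csv" is assumed here to
# be a pandas DataFrame sample; the file path and the pre-existing SparkSession
# `spark` are illustrative assumptions, not part of the original snippet.
import pandas as pd

sample_df = pd.read_csv("data.csv", nrows=1000)    # small pandas sample of the file
spark_schema = get_schema(sample_df)               # StructType inferred via table schema
sdf = spark.read.csv("data.csv", header=True, schema=spark_schema)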
def get_pandas_df_schema(df: pd.DataFrame) -> Dict[Text, Text]:
    """
    Get dataframe schema using pandas.io.json.build_table_schema.

    Args:
        df {pandas.DataFrame}: dataframe

    Returns:
        Dict[Text, Text]: dictionary with structure:
            {
                <column_name>: <column_type>
            }
    """
    return {
        f['name']: f['type']
        for f in build_table_schema(df, index=False)['fields']
    }
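# Quick usage check for get_pandas_df_schema; the type names come from the pandas
# table schema ('integer', 'number', 'string', 'boolean', 'datetime', ...):
import pandas as pd

df = pd.DataFrame({'id': [1, 2], 'price': [9.5, 3.2], 'name': ['a', 'b']})
assert get_pandas_df_schema(df) == {'id': 'integer', 'price': 'number', 'name': 'string'}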
def __init__(
    self,
    key=None,
    df=None,
    preview=None,
    format="",
    stats=None,
    target_path=None,
    extra_data=None,
    column_metadata=None,
    **kwargs,
):
    format = format.lower()
    super().__init__(key, None, format=format, target_path=target_path)
    if format and format not in supported_formats:
        raise ValueError("unsupported format {} use one of {}".format(
            format, "|".join(supported_formats)))

    if format == "pq":
        format = "parquet"
    self.format = format
    self.stats = None
    self.extra_data = extra_data or {}
    self.column_metadata = column_metadata or {}

    if df is not None:
        self.length = df.shape[0]
        preview = preview or preview_lines
        shortdf = df
        if self.length > preview:
            shortdf = df.head(preview)
        shortdf = shortdf.reset_index()
        self.header = shortdf.columns.values.tolist()
        self.preview = shortdf.values.tolist()
        self.schema = build_table_schema(df)
        if stats or self.length < max_csv:
            self.stats = get_df_stats(df)

    self._df = df
    self._kw = kwargs
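# A minimal usage sketch for the constructor above, assuming it belongs to the
# DatasetArtifact class referenced later in this section; the key and format values
# are illustrative.
import pandas as pd

df = pd.DataFrame({'id': [1, 2, 3], 'value': [0.1, 0.2, 0.3]})
artifact = DatasetArtifact(key='mydata', df=df, format='parquet')
# artifact.schema holds the pandas 'table' schema dict, artifact.preview the first
# rows as plain lists, and artifact.stats the describe()-style column statistics
# when the frame is under the size limits.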
def update_dataset_meta(
    artifact,
    from_df=None,
    schema: dict = None,
    header: list = None,
    preview: list = None,
    stats: dict = None,
    extra_data: dict = None,
    column_metadata: dict = None,
    labels: dict = None,
):
    """Update dataset object attributes/metadata

    this method will edit or add metadata to a dataset object

    example:
        update_dataset_meta(dataset, from_df=df,
                            extra_data={'histogram': 's3://mybucket/..'})

    :param from_df:         read metadata (schema, preview, ..) from provided df
    :param artifact:        dataset artifact object or path (store://..) or DataItem
    :param schema:          dataset schema, see pandas build_table_schema
    :param header:          column headers
    :param preview:         list of rows and row values (from df.values.tolist())
    :param stats:           dict of column names and their stats
                            (cleaned df.describe(include='all'))
    :param extra_data:      extra data items (key: path string | artifact)
    :param column_metadata: dict of metadata per column
    :param labels:          metadata labels
    """
    if hasattr(artifact, "artifact_url"):
        artifact = artifact.artifact_url

    stores = store_manager
    if isinstance(artifact, DatasetArtifact):
        artifact_spec = artifact
    elif artifact.startswith(DB_SCHEMA + "://"):
        artifact_spec, _ = stores.get_store_artifact(artifact)
    else:
        raise ValueError(
            "artifact path must be a dataset store object/URL/DataItem")

    if not artifact_spec or artifact_spec.kind != "dataset":
        raise ValueError(
            "store artifact ({}) is not dataset kind".format(artifact))

    if from_df is not None:
        shortdf = from_df
        length = from_df.shape[0]
        if length > preview_lines:
            shortdf = from_df.head(preview_lines)
        artifact_spec.header = shortdf.reset_index().columns.values.tolist()
        artifact_spec.preview = shortdf.reset_index().values.tolist()
        artifact_spec.schema = build_table_schema(from_df)
        if stats is None and length < max_csv:
            artifact_spec.stats = get_df_stats(from_df)
    if header:
        artifact_spec.header = header
    if stats:
        artifact_spec.stats = stats
    if schema:
        artifact_spec.schema = schema
    if preview:
        artifact_spec.preview = preview
    if column_metadata:
        artifact_spec.column_metadata = column_metadata
    if labels:
        for key, val in labels.items():
            artifact_spec.labels[key] = val

    if extra_data:
        artifact_spec.extra_data = artifact_spec.extra_data or {}
        for key, item in extra_data.items():
            if hasattr(item, "target_path"):
                item = item.target_path
            artifact_spec.extra_data[key] = item

    stores._get_db().store_artifact(
        artifact_spec.db_key,
        artifact_spec.to_dict(),
        artifact_spec.tree,
        iter=artifact_spec.iter,
        project=artifact_spec.project,
    )
def check_schema(df, schema):
    df_schema = build_table_schema(df, index=False, primary_key=None, version=False)
    df_schema = fields_list_to_frozenset(df_schema["fields"])
    return df_schema == schema
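# check_schema compares field lists order-insensitively through
# fields_list_to_frozenset, which is not shown in this excerpt; a plausible sketch
# (an assumption, not the original helper) turns each {'name': ..., 'type': ...}
# field dict into a hashable frozenset of items:
def fields_list_to_frozenset(fields):
    # each field dict becomes a frozenset of (key, value) pairs so the whole collection is hashable
    return frozenset(frozenset(field.items()) for field in fields)

# Typical flow: build the reference schema once, then validate later frames against
# it regardless of column order.
# reference = fields_list_to_frozenset(
#     build_table_schema(ref_df, index=False, primary_key=None, version=False)["fields"])
# assert check_schema(new_df, reference)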