def cast_to_field_type(self, name: str, storage: storage.Storage, field: str, field_type: FieldType):
    """Cast column `field` of the stored DataFrame `name` to `field_type`.

    No-op for the column when `field` is not present. The frame is written back
    under the same name after mutation.
    """
    # Bug fix: the original called `cast(DataFrame, df)` and discarded the result;
    # typing.cast returns its argument, so the assignment is the correct idiom.
    df = cast(DataFrame, storage.get_api().get(name))
    if field in df.columns:
        df[field] = cast_series_to_field_type(df[field], field_type)
    # NOTE(review): re-put may be redundant if get() returns a live reference,
    # but storages that copy on get need it — keep it. (Original: "Unnecessary?")
    storage.get_api().put(name, df)
def cast_to_field_type(self, name: str, storage: storage.Storage, field: str, field_type: FieldType):
    """Cast `field` of every stored record in `name` to `field_type`, writing the list back."""
    api = storage.get_api()
    records = api.get(name)
    for record in records:
        if field not in record:
            continue
        record[field] = cast_python_object_to_field_type(record[field], field_type)
    api.put(name, records)
def create_empty(self, name: str, storage: storage.Storage, schema: Schema):
    """Create an empty database table named `name` matching `schema`.

    Unparameterized Text columns are given an explicit length of 255 —
    presumably required by backends that reject bare TEXT (TODO confirm).
    """
    type_defaults = {Text: {"length": 255}}
    sa_table = schema_as_sqlalchemy_table(
        schema, name, field_type_parameter_defaults=type_defaults
    )
    storage.get_api().create_sqlalchemy_table(sa_table)
def infer_field_type(self, name: str, storage: storage.Storage, field: str) -> FieldType:
    """Translate the sqlalchemy column type of `field` on table `name` to a FieldType.

    Raises:
        ValueError: if the table has no column named `field`.
    """
    table: sa.Table = storage.get_api().get_as_sqlalchemy_table(name)
    column = next((col for col in table.columns if col.name == field), None)
    if column is None:
        raise ValueError(f"Field does not exist: {field}")
    return sqlalchemy_type_to_field_type(column.type)
def infer_field_names(self, name, storage) -> List[str]:
    """Return the header row of the stored csv `name`.

    Sniffs the csv dialect from an initial sample, rewinds, then parses just
    the first line as the header.
    """
    with storage.get_api().open(name) as f:
        sample = f.read(SAMPLE_SIZE_CHARACTERS)
        dialect = infer_csv_dialect(sample)
        f.seek(0)
        first_line = f.readline()
    return next(csv.reader([first_line], dialect=dialect))
def get_sample_records(self, name: str, storage: storage.Storage) -> Records:
    """Yield parsed csv records from the first SAMPLE_SIZE_LINES lines of `name`.

    This is a generator; nothing (including the isinstance assert) runs until
    it is first iterated.
    """
    lines = storage.get_api().get(name)
    assert isinstance(lines, SampleableIterator)
    head = lines.head(SAMPLE_SIZE_LINES)
    yield from read_csv(head)
def infer_field_type(self, name: str, storage: storage.Storage, field: str) -> FieldType:
    """Infer the FieldType of column `field` of the stored DataFrame `name`."""
    # Bug fix: the original called `cast(DataFrame, df)` and discarded the result;
    # typing.cast returns its argument, so assign it.
    df = cast(DataFrame, storage.get_api().get(name))
    return pandas_series_to_field_type(df[field])
def infer_data_format(self, name: str, storage: storage.Storage) -> Optional[DataFormat]:
    """Detect whether stored object `name` looks like an iterator of csv lines."""
    obj = storage.get_api().get(name)
    if not isinstance(obj, SampleableIterator):
        return None
    sample = self.get_sample_string(name, storage)
    return CsvLinesIteratorFormat if is_maybe_csv(sample) else None
def infer_data_format(self, name: str, storage: storage.Storage) -> Optional[DataFormat]:
    """Detect whether file `name` is csv: by extension first, then by sniffing content."""
    # TODO: how hacky is this? very
    if name.endswith(".csv"):
        return CsvFileFormat
    with storage.get_api().open(name) as f:
        sample = f.read(SAMPLE_SIZE_CHARACTERS)
    if is_maybe_csv(sample):
        return CsvFileFormat
    return None
def infer_data_format(self, name: str, storage: storage.Storage) -> Optional[DataFormat]:
    """Detect whether file `name` is html: by extension first, then by peeking at content."""
    # TODO: how hacky is this? very
    if name.endswith(".html"):
        return HtmlFileFormat
    with storage.get_api().open(name) as f:
        sample = f.read(SAMPLE_SIZE_CHARACTERS)
    if sample.strip().lower().startswith("<html"):
        return HtmlFileFormat
    return None
def infer_field_type(self, name: str, storage: storage.Storage, field: str) -> FieldType:
    """Infer the FieldType of `field` from up to `self.sample_size` record values.

    Records lacking `field` are skipped; iteration stops as soon as the sample
    is full (important if the stored object is a consumable iterator).
    """
    records = storage.get_api().get(name)
    sample = []
    for record in records:
        if field not in record:
            continue
        sample.append(record[field])
        if len(sample) >= self.sample_size:
            break
    return select_field_type(sample)
def infer_data_format(self, name, storage) -> Optional[DataFormat]:
    """Detect whether stored object `name` is a list of records (dicts)."""
    obj = storage.get_api().get(name)
    if not isinstance(obj, list):
        return None
    if not obj:
        # An empty list defaults to the records format.
        return RecordsFormat
    return RecordsFormat if isinstance(obj[0], dict) else None
def infer_field_names(self, name, storage) -> List[str]:
    """Union of keys across the first 100 records, preserving first-seen order.

    Relies on dict keys being insertion-ordered (guaranteed as of python 3.7).
    """
    records = storage.get_api().get(name)
    assert isinstance(records, list)
    ordered_keys = dict.fromkeys(key for record in records[:100] for key in record)
    return list(ordered_keys)
def infer_data_format(self, name: str, storage: storage.Storage) -> Optional[DataFormat]:
    """Detect whether file `name` is jsonl: by extension first, then by parsing line one."""
    # TODO: how hacky is this? very
    if name.endswith(".jsonl"):
        return JsonLinesFileFormat
    with storage.get_api().open(name) as f:
        first_line = f.readline()
    try:
        json.loads(first_line)
    except json.JSONDecodeError:
        return None
    return JsonLinesFileFormat
def create_empty(self, name, storage, schema: Schema):
    """Store an empty DataFrame under `name` with one typed, zero-length column per field."""
    typed_columns = {
        field.name: pd.Series(dtype=field_type_to_pandas_dtype(field.field_type))
        for field in schema.fields
    }
    df = DataFrame()
    for column_name, empty_series in typed_columns.items():
        df[column_name] = empty_series
    storage.get_api().put(name, df)
def infer_field_names(self, name, storage) -> List[str]:
    """Return the column names of the stored DataFrame `name` as a plain list.

    Bug fix: the original returned the raw pandas Index, violating the
    declared List[str] return type and leaking pandas semantics
    (e.g. elementwise ==) to callers.
    """
    return list(storage.get_api().get(name).columns)
def create_empty(self, name, storage, schema: Schema):
    """Store `name` as a one-line generator yielding only the csv header row."""
    header = ",".join(schema.field_names()) + "\n"
    storage.get_api().put(name, (line for line in [header]))
def get_sample_string(self, name: str, storage: storage.Storage) -> str:
    """Concatenate the first SAMPLE_SIZE_LINES lines of stored iterator `name` into one string."""
    lines = storage.get_api().get(name)
    assert isinstance(lines, SampleableIterator)
    return "".join(lines.head(SAMPLE_SIZE_LINES))
def create_empty(self, name, storage, schema: Schema):
    """Write a csv file containing only the header row for `schema`.

    Not sure you'd really ever want to do this?
    """
    header_line = ",".join(schema.field_names()) + "\n"
    with storage.get_api().open(name, "w") as f:
        f.write(header_line)
def create_empty(self, name, storage, schema: Schema):
    """Store an empty records list under `name`; the schema carries no data to materialize."""
    empty_records: list = []
    storage.get_api().put(name, empty_records)
def infer_field_names(self, name, storage) -> List[str]:
    """Return the column names of the reflected sqlalchemy table `name`, in table order."""
    columns = storage.get_api().get_as_sqlalchemy_table(name).columns
    return [column.name for column in columns]
def create_empty(self, name, storage, schema: Schema):
    """Create/truncate `name` as an empty file; the schema contributes nothing here.

    Not sure you'd really ever want to do this?
    """
    api = storage.get_api()
    # Opening for write and immediately closing leaves an empty file behind.
    with api.open(name, "w"):
        pass
def infer_field_type(self, name: str, storage: storage.Storage, field: str) -> FieldType:
    """Translate the arrow type of column `field` in stored table `name` to a FieldType."""
    table: ArrowTable = storage.get_api().get(name)
    arrow_type = table.field(field).type
    return arrow_type_to_field_type(str(arrow_type))
def create_empty(self, name, storage, schema: Schema):
    """Store a zero-batch arrow table whose arrow schema is derived from `schema`."""
    arrow_schema = schema_to_arrow_schema(schema)
    empty_table = pa.Table.from_batches([], schema=arrow_schema)
    storage.get_api().put(name, empty_table)
def infer_field_names(self, name, storage) -> List[str]:
    """Return the field names of the stored arrow table `name`, in schema order."""
    table = storage.get_api().get(name)
    assert isinstance(table, ArrowTable)
    # Schema.names is pyarrow's built-in list-of-name accessor, same order as
    # iterating the schema's fields.
    return list(table.schema.names)
def infer_data_format(self, name, storage) -> Optional[DataFormat]:
    """Detect whether stored object `name` is an arrow table."""
    obj = storage.get_api().get(name)
    return ArrowTableFormat if isinstance(obj, pa.Table) else None
def create_empty(self, name, storage, schema: Schema):
    """Just "touch": create `name` as an empty file, ignoring the schema."""
    api = storage.get_api()
    with api.open(name, "w"):
        pass
def infer_field_names(self, name, storage) -> List[str]:
    """Parse the first line of jsonl file `name` and return its object keys as field names.

    Assumes line one is representative of the whole file — TODO confirm.
    """
    with storage.get_api().open(name) as f:
        first_line = f.readline()
    record = json.loads(first_line)
    return list(record.keys())
def create_empty(self, name, storage, schema: Schema):
    """Create an empty database table named `name` from `schema`, with default type parameters."""
    sa_table = schema_as_sqlalchemy_table(schema, name)
    storage.get_api().create_sqlalchemy_table(sa_table)