Exemplo n.º 1
0
 def cast_to_field_type(self, name: str, storage: storage.Storage,
                        field: str, field_type: FieldType):
     """Cast column ``field`` of the stored object ``name`` to ``field_type``.

     The object is fetched through ``storage.get_api()`` and handled as a
     DataFrame. When ``field`` is not one of its columns no conversion
     happens, but the object is still written back under ``name``.
     """
     # Presumably typing.cast: binding the result keeps the DataFrame
     # narrowing without changing what happens at runtime.
     frame = cast(DataFrame, storage.get_api().get(name))
     if field in frame.columns:
         converted = cast_series_to_field_type(frame[field], field_type)
         frame[field] = converted
     # Written back unconditionally, exactly as before; whether the storage
     # API actually needs this cannot be told from this method alone.
     storage.get_api().put(name, frame)
Exemplo n.º 2
0
 def cast_to_field_type(self, name: str, storage: storage.Storage,
                        field: str, field_type: FieldType):
     """Cast ``field`` to ``field_type`` in every stored record of ``name``.

     Records that do not contain ``field`` are left untouched. Records are
     updated in place and the whole collection is then written back under
     the same name.
     """
     rows = storage.get_api().get(name)
     for row in rows:
         if field not in row:
             continue
         row[field] = cast_python_object_to_field_type(row[field], field_type)
     storage.get_api().put(name, rows)
Exemplo n.º 3
0
 def create_empty(self, name: str, storage: storage.Storage,
                  schema: Schema):
     """Create an empty table named ``name`` built from ``schema``.

     The schema is converted to a SQLAlchemy table with a default ``length``
     parameter of 255 supplied for the ``Text`` field type, and the table is
     then created through the storage API.
     """
     text_defaults = {"length": 255}
     sa_table = schema_as_sqlalchemy_table(
         schema,
         name,
         field_type_parameter_defaults={Text: text_defaults},
     )
     storage.get_api().create_sqlalchemy_table(sa_table)
Exemplo n.º 4
0
Arquivo: base.py Projeto: kvh/dcp
 def infer_field_type(self, name: str, storage: storage.Storage,
                      field: str) -> FieldType:
     """Return the field type of column ``field`` in table ``name``.

     The table is loaded as a SQLAlchemy table and the first column whose
     name equals ``field`` has its SQLAlchemy type converted.

     Raises:
         ValueError: if the table has no column named ``field``.
     """
     sa_table: sa.Table = storage.get_api().get_as_sqlalchemy_table(name)
     matches = (col for col in sa_table.columns if col.name == field)
     column = next(matches, None)
     # Compare against None explicitly rather than relying on truthiness.
     if column is None:
         raise ValueError(f"Field does not exist: {field}")
     return sqlalchemy_type_to_field_type(column.type)
Exemplo n.º 5
0
Arquivo: csv_file.py Projeto: kvh/dcp
 def infer_field_names(self, name, storage) -> List[str]:
     """Return the header row of the CSV file ``name`` as a list of names.

     The CSV dialect is inferred from up to ``SAMPLE_SIZE_CHARACTERS``
     characters read from the start of the file; the file is then rewound
     and only its first line is parsed with that dialect.
     """
     with storage.get_api().open(name) as csv_file:
         sample_text = csv_file.read(SAMPLE_SIZE_CHARACTERS)
         dialect = infer_csv_dialect(sample_text)
         csv_file.seek(0)  # rewind: reading the sample moved the position
         header_line = csv_file.readline()
         return next(csv.reader([header_line], dialect=dialect))
Exemplo n.º 6
0
 def get_sample_records(self, name: str,
                        storage: storage.Storage) -> Records:
     """Yield records parsed from a sample of the stored object ``name``.

     The stored object must be a ``SampleableIterator``; its
     ``head(SAMPLE_SIZE_LINES)`` result is fed to ``read_csv`` and every
     record produced is yielded. Being a generator, nothing runs until the
     result is iterated.
     """
     lines = storage.get_api().get(name)
     assert isinstance(lines, SampleableIterator)
     head = lines.head(SAMPLE_SIZE_LINES)
     yield from read_csv(head)
Exemplo n.º 7
0
 def infer_field_type(self, name: str, storage: storage.Storage,
                      field: str) -> FieldType:
     """Infer the field type of column ``field`` of the stored object.

     The object fetched for ``name`` is handled as a DataFrame; the column
     selected by ``field`` is passed to ``pandas_series_to_field_type``.
     """
     frame = cast(DataFrame, storage.get_api().get(name))
     return pandas_series_to_field_type(frame[field])
Exemplo n.º 8
0
 def infer_data_format(self, name: str,
                       storage: storage.Storage) -> Optional[DataFormat]:
     """Return ``CsvLinesIteratorFormat`` if ``name`` looks like CSV lines.

     Only ``SampleableIterator`` objects are considered; for those, the
     sample string from ``get_sample_string`` is checked with
     ``is_maybe_csv``. Returns ``None`` in every other case.
     """
     candidate = storage.get_api().get(name)
     if not isinstance(candidate, SampleableIterator):
         return None
     sample_text = self.get_sample_string(name, storage)
     if is_maybe_csv(sample_text):
         return CsvLinesIteratorFormat
     return None
Exemplo n.º 9
0
Arquivo: csv_file.py Projeto: kvh/dcp
 def infer_data_format(self, name: str,
                       storage: storage.Storage) -> Optional[DataFormat]:
     """Return ``CsvFileFormat`` if the file ``name`` appears to be CSV.

     A ``.csv`` suffix is accepted without opening the file. Otherwise up
     to ``SAMPLE_SIZE_CHARACTERS`` characters are read and checked with
     ``is_maybe_csv``. Returns ``None`` when neither test succeeds.
     """
     if name.endswith(".csv"):
         return CsvFileFormat
     # TODO: how hacky is this? very
     with storage.get_api().open(name) as csv_file:
         head = csv_file.read(SAMPLE_SIZE_CHARACTERS)
         looks_like_csv = is_maybe_csv(head)
     return CsvFileFormat if looks_like_csv else None
Exemplo n.º 10
0
 def infer_data_format(
     self, name: str, storage: storage.Storage
 ) -> Optional[DataFormat]:
     """Return ``HtmlFileFormat`` if the file ``name`` appears to be HTML.

     A ``.html`` suffix is accepted without opening the file. Otherwise up
     to ``SAMPLE_SIZE_CHARACTERS`` characters are read and, ignoring leading
     whitespace and letter case, the content must begin with either an
     ``<html`` tag or an ``<!DOCTYPE html`` declaration. Returns ``None``
     when neither test succeeds.
     """
     if name.endswith(".html"):
         return HtmlFileFormat
     # TODO: how hacky is this? very
     with storage.get_api().open(name) as f:
         head = f.read(SAMPLE_SIZE_CHARACTERS).lstrip().lower()
     # Standard HTML documents open with a doctype declaration before the
     # <html> tag, so accept both prefixes.
     if head.startswith(("<html", "<!doctype html")):
         return HtmlFileFormat
     return None
Exemplo n.º 11
0
 def infer_field_type(self, name: str, storage: storage.Storage,
                      field: str) -> FieldType:
     """Infer the type of ``field`` from a sample of the stored records.

     Values of ``field`` are collected from the records that contain it.
     Collection stops once at least ``self.sample_size`` values are held
     (checked after each record) or the records are exhausted, and the
     collected values are passed to ``select_field_type``.
     """
     values = []
     for record in storage.get_api().get(name):
         if field in record:
             values.append(record[field])
         if len(values) >= self.sample_size:
             break
     return select_field_type(values)
Exemplo n.º 12
0
 def infer_data_format(self, name, storage) -> Optional[DataFormat]:
     """Return ``RecordsFormat`` if the stored object is a list of dicts.

     Non-list objects yield ``None``. An empty list defaults to
     ``RecordsFormat``. For a non-empty list only the first element is
     inspected: a dict yields ``RecordsFormat``, anything else ``None``.
     """
     candidate = storage.get_api().get(name)
     if not isinstance(candidate, list):
         return None
     if len(candidate) == 0:
         # If empty list, default to records format
         return RecordsFormat
     return RecordsFormat if isinstance(candidate[0], dict) else None
Exemplo n.º 13
0
 def infer_field_names(self, name, storage) -> List[str]:
     """Return the distinct keys of the first 100 stored records.

     Keys appear in the order they are first seen. The stored object must
     be a list; an empty list gives an empty result.
     """
     records = storage.get_api().get(name)
     assert isinstance(records, list)
     if not records:
         return []
     # A dict keeps insertion order, and updating an existing key does not
     # move it, so this collects unique keys in first-seen order.
     seen = {}
     for record in records[:100]:
         seen.update(dict.fromkeys(record.keys()))
     return list(seen)
Exemplo n.º 14
0
 def infer_data_format(self, name: str,
                       storage: storage.Storage) -> Optional[DataFormat]:
     """Return ``JsonLinesFileFormat`` if ``name`` appears to be JSON lines.

     A ``.jsonl`` suffix is accepted without opening the file. Otherwise
     the first line is read and must parse as JSON. Returns ``None`` when
     parsing fails with ``json.JSONDecodeError``.
     """
     if name.endswith(".jsonl"):
         return JsonLinesFileFormat
     # TODO: how hacky is this? very
     with storage.get_api().open(name) as jsonl_file:
         first_line = jsonl_file.readline()
     try:
         json.loads(first_line)
     except json.JSONDecodeError:
         return None
     return JsonLinesFileFormat
Exemplo n.º 15
0
 def create_empty(self, name, storage, schema: Schema):
     """Store an empty DataFrame with one typed column per schema field.

     Each field in ``schema.fields`` becomes an empty Series whose dtype
     comes from ``field_type_to_pandas_dtype``; columns are added in field
     order and the frame is stored under ``name``.
     """
     empty_frame = DataFrame()
     for schema_field in schema.fields:
         dtype = field_type_to_pandas_dtype(schema_field.field_type)
         empty_frame[schema_field.name] = pd.Series(dtype=dtype)
     storage.get_api().put(name, empty_frame)
Exemplo n.º 16
0
 def infer_field_names(self, name, storage) -> List[str]:
     """Return the column names of the stored object ``name`` as a list.

     The object's ``columns`` attribute is materialised into a plain list
     so the result matches the declared ``List[str]`` return type rather
     than exposing whatever container the stored object uses.
     """
     return list(storage.get_api().get(name).columns)
Exemplo n.º 17
0
 def create_empty(self, name, storage, schema: Schema):
     """Store a one-line generator holding only the header under ``name``.

     The header is the schema's field names joined by commas and ended
     with a newline.
     """
     header = ",".join(schema.field_names()) + "\n"
     header_lines = (line for line in (header,))
     storage.get_api().put(name, header_lines)
Exemplo n.º 18
0
 def get_sample_string(self, name: str, storage: storage.Storage) -> str:
     """Return a sample of the stored object ``name`` as one string.

     The stored object must be a ``SampleableIterator``; the items from
     its ``head(SAMPLE_SIZE_LINES)`` are concatenated without a separator.
     """
     lines = storage.get_api().get(name)
     assert isinstance(lines, SampleableIterator)
     return "".join(lines.head(SAMPLE_SIZE_LINES))
Exemplo n.º 19
0
Arquivo: csv_file.py Projeto: kvh/dcp
 def create_empty(self, name, storage, schema: Schema):
     """Write a file ``name`` containing only the comma-joined header line.

     The file is opened in ``"w"`` mode through the storage API and the
     schema's field names are written followed by a newline.
     """
     # Not sure you'd really ever want to do this?
     with storage.get_api().open(name, "w") as out:
         header_row = ",".join(schema.field_names())
         out.write(header_row + "\n")
Exemplo n.º 20
0
 def create_empty(self, name, storage, schema: Schema):
     """Store an empty list under ``name``; ``schema`` is not used here."""
     no_records = []
     storage.get_api().put(name, no_records)
Exemplo n.º 21
0
Arquivo: base.py Projeto: kvh/dcp
 def infer_field_names(self, name, storage) -> List[str]:
     """Return the column names of table ``name`` in column order.

     The table is loaded through ``get_as_sqlalchemy_table`` and the
     ``name`` of each of its columns is collected.
     """
     sa_table = storage.get_api().get_as_sqlalchemy_table(name)
     column_names = []
     for column in sa_table.columns:
         column_names.append(column.name)
     return column_names
Exemplo n.º 22
0
 def create_empty(self, name, storage, schema: Schema):
     """Open ``name`` in ``"w"`` mode and close it without writing.

     ``schema`` is not used.
     """
     # Not sure you'd really ever want to do this?
     handle = storage.get_api().open(name, "w")
     with handle:
         pass
Exemplo n.º 23
0
 def infer_field_type(self, name: str, storage: storage.Storage,
                      field: str) -> FieldType:
     """Return the field type of ``field`` in the stored table ``name``.

     The type of the table's ``field`` entry is converted to its string
     form and passed to ``arrow_type_to_field_type``.
     """
     arrow_table: ArrowTable = storage.get_api().get(name)
     arrow_type = arrow_table.field(field).type
     return arrow_type_to_field_type(str(arrow_type))
Exemplo n.º 24
0
 def create_empty(self, name, storage, schema: Schema):
     """Store a table built from zero batches under ``name``.

     The table's schema is the result of ``schema_to_arrow_schema(schema)``.
     """
     arrow_schema = schema_to_arrow_schema(schema)
     empty_table = pa.Table.from_batches([], schema=arrow_schema)
     storage.get_api().put(name, empty_table)
Exemplo n.º 25
0
 def infer_field_names(self, name, storage) -> List[str]:
     """Return the names of the schema entries of stored table ``name``.

     The stored object must be an ``ArrowTable``; names are returned in
     the order the schema yields them.
     """
     arrow_table = storage.get_api().get(name)
     assert isinstance(arrow_table, ArrowTable)
     field_names = []
     for schema_field in arrow_table.schema:
         field_names.append(schema_field.name)
     return field_names
Exemplo n.º 26
0
 def infer_data_format(self, name, storage) -> Optional[DataFormat]:
     """Return ``ArrowTableFormat`` if the stored object is a ``pa.Table``.

     Any other object yields ``None``.
     """
     candidate = storage.get_api().get(name)
     return ArrowTableFormat if isinstance(candidate, pa.Table) else None
Exemplo n.º 27
0
 def create_empty(self, name, storage, schema: Schema):
     """Open ``name`` in ``"w"`` mode and close it without writing.

     ``schema`` is not used.
     """
     # Just "touch"
     handle = storage.get_api().open(name, "w")
     with handle:
         pass
Exemplo n.º 28
0
 def infer_field_names(self, name, storage) -> List[str]:
     """Return the keys of the first JSON line of file ``name``, in order.

     Only the first line is read. If the file is empty or that line is
     blank there is no record to inspect, so an empty list is returned
     instead of letting ``json.loads`` raise ``json.JSONDecodeError``.
     A non-blank first line that is not valid JSON still raises.
     """
     with storage.get_api().open(name) as f:
         first_line = f.readline()
     if not first_line.strip():
         return []
     return list(json.loads(first_line).keys())
Exemplo n.º 29
0
Arquivo: base.py Projeto: kvh/dcp
 def create_empty(self, name, storage, schema: Schema):
     """Create an empty table named ``name`` built from ``schema``.

     The schema is converted with ``schema_as_sqlalchemy_table`` and the
     resulting table is created through the storage API.
     """
     sa_table = schema_as_sqlalchemy_table(schema, name)
     storage.get_api().create_sqlalchemy_table(sa_table)