def sorted_group_by(
        self,
        *keys,
        values: Optional[Iterable] = None,
        as_pairs: bool = False,
) -> StreamInterface:
    """Group the stream by keys, building an output struct for the non-pairs case.

    When ``as_pairs`` is true the call is forwarded to the parent
    implementation unchanged. Otherwise a ``FlatStruct`` describing the output
    is assembled from ``keys`` followed by ``values`` and passed to the parent
    implementation as ``output_struct``.

    :param keys: field descriptions to group by; an item that is an instance
        of ``ARRAY_TYPES`` is named after its first element.
    :param values: field descriptions collected per group; ``None`` (the
        default) is treated as "no value fields".
    :param as_pairs: forward to the parent implementation without building
        an output struct.
    :returns: whatever the parent ``sorted_group_by`` returns.
    """
    if as_pairs:
        return super().sorted_group_by(*keys, values=values, as_pairs=True)

    # ``values`` is iterated here and then handed to the parent class, so a
    # one-shot iterator must be materialised first; ``None`` must not be
    # iterated at all (it previously raised TypeError on the default value).
    if values is None:
        value_fields = []
    else:
        if not isinstance(values, (list, tuple)):
            values = list(values)
        value_fields = values

    output_struct = FlatStruct([])
    for f in [*keys, *value_fields]:
        if isinstance(f, ARRAY_TYPES):
            field_name = get_name(f[0])
        else:
            field_name = get_name(f)
        if f in value_fields:
            # Value fields are typed as tuples in the output struct.
            field_type = FieldType.Tuple
        elif isinstance(f, FieldInterface) or hasattr(f, 'get_type'):
            field_type = f.get_type()
        else:
            field_type = AUTO
        output_struct.append_field(field_name, field_type)
    return super().sorted_group_by(
        *keys,
        values=values,
        as_pairs=False,
        output_struct=output_struct,
    )
def get_output_struct(self) -> StructInterface:
    """Build a FlatStruct of the output columns, typed from the input struct.

    Types are copied from the input struct for every field whose name is
    among the output columns; the result is then validated against the
    input struct with ``ignore_moved=True``.
    """
    source_struct = self.get_input_struct()
    columns = self.get_output_columns()

    # Keep only the types of fields that survive into the output.
    selected_types = {}
    for field_name, field_type in source_struct.get_types_dict().items():
        if field_name in columns:
            selected_types[field_name] = field_type

    result = FlatStruct(columns).set_types(selected_types)
    assert isinstance(result, FlatStruct)
    result.validate_about(source_struct, ignore_moved=True)
    return result
def insert_data(
        self,
        table: Union[Table, Name],
        data: Data,
        struct: Struct = None,
        encoding: Optional[str] = None,
        skip_errors: bool = False,
        skip_lines: Count = 0,
        skip_first_line: bool = False,
        step: AutoCount = DEFAULT_STEP,
        verbose: AutoBool = AUTO,
) -> tuple:
    """Convert ``data`` into a struct stream and insert it into ``table``.

    :param table: target table (object or name).
    :param data: source data handed to ``_get_struct_stream_from_data``.
    :param struct: struct description; anything that is neither a
        ``StructInterface`` nor has ``get_struct_str`` is wrapped in a
        ``FlatStruct`` after a deprecation warning is logged.
    :param encoding: forwarded to ``_get_struct_stream_from_data``.
    :param skip_errors: forwarded to ``insert_struct_stream``.
    :param skip_lines: number of leading items to skip; an undefined value
        (per ``Auto.is_defined``) is treated as 0.
    :param skip_first_line: forwarded to ``_get_struct_stream_from_data``.
    :param step: forwarded to ``insert_struct_stream``.
    :param verbose: forwarded to the stream builder and to the insert call.
    :returns: ``(initial_count, final_count)`` where ``initial_count`` is the
        estimated stream count plus ``skip_lines`` and ``final_count`` is the
        result of ``insert_struct_stream``.
    """
    skip_lines = skip_lines if Auto.is_defined(skip_lines) else 0

    if not isinstance(struct, StructInterface) and not hasattr(struct, 'get_struct_str'):
        warning_text = 'Struct as {} is deprecated, use FlatStruct instead'.format(type(struct))
        self.log(msg=warning_text, level=LoggingLevel.Warning)
        struct = FlatStruct(struct or [])

    stream = self._get_struct_stream_from_data(
        data,
        struct=struct,
        encoding=encoding,
        skip_first_line=skip_first_line,
        verbose=verbose,
    )
    if skip_lines:
        stream = stream.skip(skip_lines)

    if stream.get_stream_type() != StreamType.StructStream:
        # Structure first, then carry over the count of the unstructured stream.
        structured = stream.structure(struct, skip_bad_rows=True, verbose=True)
        stream = structured.update_meta(count=stream.get_count())

    initial_count = stream.get_estimated_count() + skip_lines
    final_count = self.insert_struct_stream(
        table,
        stream,
        skip_errors=skip_errors,
        step=step,
        verbose=verbose,
    )
    return initial_count, final_count
def get_struct_from_database(
        self,
        types: AutoLinks = AUTO,
        set_struct: bool = False,
        skip_missing: bool = False,
        verbose: AutoBool = AUTO,
) -> StructInterface:
    """Build a FlatStruct from the result of ``describe_table``.

    :param types: when defined (per ``Auto.is_defined``), applied to the
        struct in place via ``set_types``.
    :param set_struct: also store the struct on this object via
        ``set_struct(..., inplace=True)``.
    :param skip_missing: return the empty struct instead of raising when
        the description is empty.
    :param verbose: forwarded to ``describe_table``.
    :raises ValueError: if the struct is empty and ``skip_missing`` is false.
    :returns: the struct that was built.
    """
    description = self.describe_table(verbose=verbose)
    table_struct = FlatStruct(description)

    if table_struct.is_empty():
        if not skip_missing:
            error_text = 'Can not get struct for non-existing table {}'.format(self)
            raise ValueError(error_text)

    if Auto.is_defined(types):
        table_struct.set_types(types, inplace=True)
    if set_struct:
        self.set_struct(table_struct, inplace=True)
    return table_struct
def set_struct(self, struct: GeneralizedStruct, inplace: bool) -> Optional[Native]:
    """Normalise ``struct`` and pass it to the parent ``set_struct``.

    Accepted inputs:

    * a ``StructInterface`` instance or ``None`` - passed through unchanged;
    * a sequence (``ARRAY_TYPES``) containing at least one nested sequence -
      wrapped in ``FlatStruct``;
    * a sequence with no nested sequences (including an empty one) - handed
      to ``FlatStruct.get_struct_detected_by_title_row``;
    * ``AUTO`` - replaced by ``self.get_struct_from_database()``.

    :param struct: struct description in one of the forms above.
    :param inplace: forwarded to the parent ``set_struct``.
    :raises TypeError: if ``struct`` is none of the accepted forms.
    :returns: whatever the parent ``set_struct`` returns.
    """
    if isinstance(struct, StructInterface) or struct is None:
        pass
    elif isinstance(struct, ARRAY_TYPES):
        # any() rather than max([...]): max() raises ValueError on an empty
        # sequence, while any() simply yields False for it.
        if any(isinstance(f, ARRAY_TYPES) for f in struct):
            struct = FlatStruct(struct)
        else:
            struct = FlatStruct.get_struct_detected_by_title_row(struct)
    elif struct == AUTO:
        struct = self.get_struct_from_database()
    else:
        message = 'struct must be StructInterface or tuple with fields_description (got {})'.format(
            type(struct))
        raise TypeError(message)
    return super().set_struct(struct, inplace=inplace)
def __init__(
        self,
        data: Row,
        struct: Union[Row, StructInterface],
        check: bool = True,
):
    """Store the struct and initialise the parent with the row data.

    :param data: row values.
    :param struct: a ``StructInterface`` or a description that is wrapped
        in ``FlatStruct``.
    :param check: when true, ``data`` is first passed through
        ``_structure_row`` together with the struct.
    """
    struct_obj = struct if isinstance(struct, StructInterface) else FlatStruct(struct)
    self._struct = struct_obj
    row_data = self._structure_row(data, struct_obj) if check else data
    super().__init__(data=row_data, name='-')
def test_detect_struct_by_title_row():
    """Check the Postgres struct string detected from a title row of field names."""
    column_names = ('page_id', 'hits_count', 'conversion_rate')
    expected = 'page_id int, hits_count int, conversion_rate numeric'
    detected_struct = FlatStruct.get_struct_detected_by_title_row(column_names)
    received = detected_struct.get_struct_str(DialectType.Postgres)
    assert received == expected, '{} != {}'.format(received, expected)