def __init__(
        self,
        data: Iterable,
        name: AutoName = AUTO,
        check: bool = False,
        count: AutoCount = None,
        less_than: AutoCount = None,
        source: Connector = None,
        context: Context = None,
        max_items_in_memory: AutoCount = AUTO,
        tmp_files: TmpMask = AUTO,
):
    count = arg.get_optional_len(data, count)
    less_than = less_than or count
    self.max_items_in_memory = arg.acquire(max_items_in_memory, sm.MAX_ITEMS_IN_MEMORY)
    super().__init__(
        data=data, name=name, check=check,
        source=source, context=context,
        count=count, less_than=less_than,
    )
    self._tmp_files = arg.delayed_acquire(tmp_files, sm.get_tmp_mask, self.get_name())
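# A minimal sketch of the assumed semantics of arg.acquire / arg.delayed_acquire /
# arg.is_defined, inferred from how the snippets in this section use them (this is
# not the library source): acquire(value, default) substitutes an eagerly computed
# default for the AUTO sentinel, while delayed_acquire(value, factory, *args, **kwargs)
# calls the factory only when the value was actually left as AUTO.

class _Auto:
    def __repr__(self):
        return 'AUTO'


AUTO = _Auto()


def acquire(value, default):
    # Eager variant: the default is already computed by the caller.
    return default if value is AUTO else value


def delayed_acquire(value, factory, *args, **kwargs):
    # Lazy variant: the (possibly expensive) factory runs only when needed.
    # Note that positional args (e.g. self.get_name() above) are still
    # evaluated eagerly by the caller; only the factory call is delayed.
    if value is AUTO:
        return factory(*args, **kwargs)
    return value


def is_defined(value, check_name: bool = True):
    # Simplified assumption: the real check_name behavior is unknown;
    # here we only test for the AUTO sentinel and None.
    return value is not AUTO and value is not None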
def _get_filtered_items(
        self, *args,
        item_type: ItemType = ItemType.Auto,
        skip_errors: bool = False,
        logger: Union[LoggerInterface, Auto] = AUTO,
        **kwargs
) -> Iterable:
    logger = arg.delayed_acquire(logger, self.get_logger)
    item_type = arg.delayed_acquire(item_type, self.get_item_type)
    filter_function = sf.filter_items(
        *args, item_type=item_type, skip_errors=skip_errors, logger=logger, **kwargs,
    )
    return filter(filter_function, self.get_items())
def set_to_item_inplace(
        field: FieldID,
        value: Value,
        item: SelectableItem,
        item_type: ItemType = ItemType.Auto,
) -> None:
    item_type = arg.delayed_acquire(item_type, ItemType.detect, item, default=ItemType.Any)
    if not isinstance(item_type, ItemType):
        if hasattr(item_type, 'value'):
            item_type = ItemType(item_type.value)
        else:
            item_type = ItemType(item_type)
    if item_type == ItemType.Record:
        item[field] = value
    elif item_type == ItemType.Row:
        cols_count = len(item)
        if field >= cols_count:
            item += [None] * (field - cols_count + 1)
        item[field] = value
    elif item_type == ItemType.StructRow:
        if isinstance(item, StructRowInterface):
            item.set_value(field, value, update_struct=True)
        elif isinstance(item, ROW_SUBCLASSES):
            assert isinstance(field, int), 'Expected column number as int, got {}'.format(field)
            cur_item_len = len(item)
            need_extend = field >= cur_item_len
            if need_extend:
                if isinstance(item, tuple):
                    # Caveat: converting a tuple to a list rebinds the local name
                    # only, so the caller's tuple is not modified in place.
                    item = list(item)
                item += [None] * (field + 1 - cur_item_len)
            item[field] = value
        else:
            raise TypeError('Expected Row or StructRow, got {}'.format(item))
    else:  # item_type == 'any' or not item_type
        raise TypeError('type {} not supported'.format(item_type))
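# Illustrative usage of set_to_item_inplace, assuming ItemType from the
# surrounding module; the field names and values here are hypothetical.
record = {'id': 1}
set_to_item_inplace('name', 'Alice', record, item_type=ItemType.Record)
# record == {'id': 1, 'name': 'Alice'}

row = [10, 20]
set_to_item_inplace(4, 99, row, item_type=ItemType.Row)
# the list is padded with None up to the target column: [10, 20, None, None, 99]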
def get_parsed_line(
        self,
        line: str,
        item_type: Union[ItemType, Auto] = AUTO,
        struct: Union[Array, StructInterface, Auto] = AUTO,
) -> Item:
    item_type = arg.delayed_acquire(item_type, self.get_default_item_type)
    if item_type == ItemType.Line:
        return line
    line_parser = fs.csv_loads(delimiter=self.get_delimiter())
    row = line_parser(line)
    if isinstance(struct, StructInterface):
        field_converters = struct.get_converters()
        row_converter = self._get_row_converter(converters=field_converters)
        row = row_converter(row)
    if item_type in (ItemType.Row, ItemType.Any, ItemType.Auto):
        return row
    if not arg.is_defined(struct, check_name=False):
        column_count = len(row)
        struct = list(range(column_count))
    if item_type == ItemType.Record:
        return {arg.get_name(k): v for k, v in zip(struct, row)}
    elif item_type == ItemType.StructRow:
        return ItemType.StructRow.build(data=row, struct=struct)
    else:
        msg = 'item_type {} is not supported for {}.parse_lines()'
        raise ValueError(msg.format(item_type, self.__class__.__name__))
def to_stream(
        self,
        data: Data = AUTO,
        stream_type: AutoStreamType = AUTO,
        ex: OptionalFields = None,
        **kwargs
) -> Stream:
    stream_type = arg.delayed_acquire(stream_type, self.get_stream_type)
    if isinstance(stream_type, str):
        stream_class = StreamType(stream_type).get_class()
    elif isclass(stream_type):
        stream_class = stream_type
    elif isinstance(stream_type, StreamType) or hasattr(stream_type, 'get_class'):
        stream_class = stream_type.get_class()
    else:
        raise TypeError('AnyStream.to_stream(data, stream_type): expected StreamType, got {}'.format(stream_type))
    if not arg.is_defined(data):
        if hasattr(self, 'get_items_of_type'):
            item_type = stream_class.get_item_type()
            data = self.get_items_of_type(item_type)
        else:
            data = self.get_data()
    meta = self.get_compatible_meta(stream_class, ex=ex)
    meta.update(kwargs)
    if 'count' not in meta:
        meta['count'] = self.get_count()
    if 'source' not in meta:
        meta['source'] = self.get_source()
    stream = stream_class(data, **meta)
    return self._assume_stream(stream)
def to_column_file(
        self,
        filename: str,
        columns: Union[Iterable, Auto] = AUTO,
        add_title_row=True,
        gzip=False,
        delimiter='\t',
        encoding=AUTO,
        check=True,
        verbose=True,
        return_stream=True,
) -> Optional[Native]:
    encoding = arg.delayed_acquire(encoding, self.get_encoding)
    meta = self.get_meta()
    if not gzip:
        meta.pop('count')
    sm_csv_file = self.to_row_stream(
        columns=columns,
        add_title_row=add_title_row,
    ).to_line_stream(
        delimiter=delimiter,
    ).to_text_file(
        filename,
        encoding=encoding,
        gzip=gzip,
        check=check,
        verbose=verbose,
        return_stream=return_stream,
    )
    if return_stream:
        return sm_csv_file.skip(
            1 if add_title_row else 0,
        ).to_row_stream(
            delimiter=delimiter,
        ).to_record_stream(
            columns=columns,
        ).update_meta(**meta)
def to_stream(
        self,
        data: Union[Iterable, Auto] = AUTO,
        name: AutoName = AUTO,
        stream_type: Union[StreamType, Auto] = AUTO,
        ex: OptionalFields = None,
        step: AutoCount = AUTO,
        **kwargs
) -> Stream:
    name = arg.delayed_acquire(name, self._get_generated_stream_name)
    stream_type = self._get_stream_type(stream_type)
    stream_class = self._get_stream_class(stream_type)
    if hasattr(stream_class, 'get_item_type'):
        item_type = stream_class.get_item_type()
    else:
        item_type = AUTO
    if not arg.is_defined(data):
        data = self._get_items_of_type(item_type, verbose=kwargs.get('verbose', AUTO), step=step)
    meta = self.get_compatible_meta(stream_class, name=name, ex=ex, **kwargs)
    if 'count' not in meta:
        meta['count'] = self._get_fast_count()
    if 'source' not in meta:
        meta['source'] = self
    stream = stream_class(data, **meta)
    return self._assume_stream(stream)
def group_by(
        self,
        *keys,
        values: Columns = None,
        as_pairs: bool = False,
        take_hash: bool = True,
        step: AutoCount = AUTO,
        verbose: bool = True,
) -> Stream:
    keys = arg.update(keys)
    keys = arg.get_names(keys)
    values = arg.get_names(values)
    if hasattr(keys[0], 'get_field_names'):  # if isinstance(keys[0], FieldGroup)
        keys = keys[0].get_field_names()
    step = arg.delayed_acquire(step, self.get_limit_items_in_memory)
    if as_pairs:
        key_for_sort = keys
    else:
        key_for_sort = self._get_key_function(keys, take_hash=take_hash)
    sorted_stream = self.sort(
        key_for_sort,
        step=step,
        verbose=verbose,
    )
    grouped_stream = sorted_stream.sorted_group_by(
        keys,
        values=values,
        as_pairs=as_pairs,
    )
    return grouped_stream
def filter(self, *args, item_type: ItemType = ItemType.Auto, skip_errors: bool = False, **kwargs) -> Native:
    item_type = arg.delayed_acquire(item_type, self.get_item_type)
    stream_type = self.get_stream_type()
    assert isinstance(stream_type, StreamType), 'Expected StreamType, got {}'.format(stream_type)
    filtered_items = self._get_filtered_items(*args, item_type=item_type, skip_errors=skip_errors, **kwargs)
    stream = self.to_stream(data=filtered_items, stream_type=stream_type)
    return self._assume_native(stream)
def set_to_item_inplace(
        field: FieldID,
        value: Value,
        item: SelectableItem,
        item_type: ItemType = ItemType.Auto,
) -> None:  # NoReturn would claim the function never returns; it returns None
    item_type = arg.delayed_acquire(item_type, ItemType.detect, item, default=ItemType.Any)
    if not isinstance(item_type, ItemType):
        if hasattr(item_type, 'value'):
            item_type = ItemType(item_type.value)
        else:
            item_type = ItemType(item_type)
    if item_type == ItemType.Record:
        item[field] = value
    elif item_type == ItemType.Row:
        cols_count = len(item)
        if field >= cols_count:
            item += [None] * (field - cols_count + 1)
        item[field] = value
    elif item_type == ItemType.StructRow:
        item.set_value(field, value)
    else:  # item_type == 'any' or not item_type
        raise TypeError('type {} not supported'.format(item_type))
def _get_field_getter(self, field: UniKey, item_type: Union[ItemType, Auto] = AUTO, default=None):
    if isinstance(self, RegularStreamInterface) or hasattr(self, 'get_item_type'):
        item_type = arg.delayed_acquire(item_type, self.get_item_type)
    return lambda i: fs.it.get_field_value_from_item(
        field=field, item=i, item_type=item_type, default=default,
        logger=self.get_selection_logger(),
    )
def get_rows(self, columns: Union[Columns, Auto] = AUTO, add_title_row=False) -> Iterable:
    columns = arg.delayed_acquire(columns, self.get_columns)
    columns = arg.get_names(columns)
    if add_title_row:
        yield columns
    for r in self.get_items():
        yield [r.get(c) for c in columns]
def get_fields_names_from_item(item: SelectableItem, item_type: ItemType = ItemType.Auto) -> Row:
    item_type = arg.delayed_acquire(item_type, ItemType.detect, item, default=ItemType.Any)
    if item_type == ItemType.Row:
        return list(range(len(item)))
    elif item_type == ItemType.Record:
        return list(item.keys())  # materialize the dict view so the result matches the Row annotation
    elif item_type == ItemType.StructRow:
        return item.get_columns()
    else:
        raise TypeError('type {} not supported'.format(item_type))
def simple_select_fields(fields: Array, item: SelectableItem, item_type: ItemType = ItemType.Auto) -> SelectableItem:
    item_type = arg.delayed_acquire(item_type, ItemType.detect, item, default=ItemType.Any)
    if isinstance(item_type, str):
        item_type = ItemType(item_type)
    if item_type == ItemType.Record:
        return {f: item.get(f) for f in fields}
    elif item_type == ItemType.Row:
        return [item[f] for f in fields]
    elif item_type == ItemType.StructRow:
        return item.simple_select_fields(fields)
    else:  # without this branch unsupported types silently returned None
        raise TypeError('type {} not supported'.format(item_type))
def to_stream(self, data: Data = AUTO, stream_type: AutoStreamType = AUTO, ex: OptionalFields = None, **kwargs) -> Stream:
    stream_type = arg.delayed_acquire(stream_type, self.get_stream_type)
    if isinstance(stream_type, str):
        stream_class = StreamType(stream_type).get_class()
    elif isclass(stream_type):
        stream_class = stream_type
    else:
        stream_class = stream_type.get_class()
    data = arg.delayed_acquire(data, self.get_data)
    meta = self.get_compatible_meta(stream_class, ex=ex)
    meta.update(kwargs)
    if 'count' not in meta:
        meta['count'] = self.get_count()
    if 'source' not in meta:
        meta['source'] = self.get_source()
    return stream_class(data, **meta)
def get_date_from_day_abs(
        day_abs: int,
        min_date: Union[Date, arg.Auto] = arg.AUTO,
        as_iso_date: bool = True,
) -> Date:
    # Note: get_min_year() is evaluated eagerly as an argument for the delayed
    # factory; only the get_year_start_monday call itself is deferred.
    min_date = arg.delayed_acquire(min_date, get_year_start_monday, get_min_year(), as_iso_date=as_iso_date)
    cur_date = get_shifted_date(min_date, days=day_abs)
    return cur_date
def set_verbose(self, verbose: AutoBool = AUTO, parent: AutoConnector = AUTO) -> Native:
    if not arg.is_defined(verbose):
        parent = arg.delayed_acquire(parent, self.get_parent)
        if hasattr(parent, 'is_verbose'):
            verbose = parent.is_verbose()
        elif hasattr(parent, 'verbose'):
            verbose = parent.verbose
        else:
            verbose = DEFAULT_VERBOSE
    self._verbose = verbose
    return self
def filter(self, *args, item_type: ItemType = ItemType.Auto, skip_errors: bool = False, **kwargs) -> Native:
    item_type = arg.delayed_acquire(item_type, self.get_item_type)
    stream_type = StreamType.detect(item_type)
    filtered_items = self._get_filtered_items(*args, item_type=item_type, skip_errors=skip_errors, **kwargs)
    stream = self.to_stream(data=filtered_items, stream_type=stream_type)
    return self._assume_native(stream)
def bucket(self, name: Name, access_key=AUTO, secret_key=AUTO) -> ConnectorInterface:
    bucket = self.get_buckets().get(name)
    if bucket:
        if arg.is_defined(access_key) and hasattr(bucket, 'set_access_key'):
            bucket.set_access_key(access_key)
        if arg.is_defined(secret_key) and hasattr(bucket, 'set_secret_key'):
            bucket.set_secret_key(secret_key)
    else:
        bucket_class = self.get_default_child_obj_class()
        bucket = bucket_class(
            name=name,
            storage=self,
            access_key=arg.delayed_acquire(access_key, self.get_access_key),
            secret_key=arg.delayed_acquire(secret_key, self.get_secret_key),
        )
    return bucket
def sort(self, *keys, reverse: bool = False, step: AutoCount = AUTO, verbose: bool = True) -> Native:
    key_function = self._get_key_function(keys)
    step = arg.delayed_acquire(step, self.get_limit_items_in_memory)
    if self.can_be_in_memory(step=step):
        stream = self.memory_sort(key_function, reverse, verbose=verbose)
    else:
        stream = self.disk_sort(key_function, reverse, step=step, verbose=verbose)
    return self._assume_native(stream)
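# A minimal, self-contained sketch of the external-sort pattern that disk_sort
# above presumably follows: sort fixed-size runs, then k-way merge them with
# heapq.merge. All names here are illustrative, not the library's implementation
# (real disk_sort would spill each sorted run to a tmp file instead of a list).
import heapq
from itertools import islice

def chunked_sort(items, key, step):
    iterator = iter(items)
    runs = []
    while True:
        chunk = list(islice(iterator, step))  # take at most `step` items per run
        if not chunk:
            break
        runs.append(sorted(chunk, key=key))
    return heapq.merge(*runs, key=key)  # lazy k-way merge of the sorted runs

assert list(chunked_sort([5, 3, 8, 1, 9, 2], key=lambda x: x, step=2)) == [1, 2, 3, 5, 8, 9]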
def to_stream(
        self,
        data: Union[Iterable, Auto] = AUTO,
        name: AutoName = AUTO,
        stream_type: Union[StreamType, Auto] = AUTO,
        ex: OptionalFields = None,
        step: AutoCount = AUTO,
        **kwargs
) -> Stream:
    if arg.is_defined(data):
        kwargs['data'] = data
    stream_type = arg.delayed_acquire(stream_type, self.get_stream_type)
    assert not ex, 'ex-argument for LocalFile.to_stream() not supported (got {})'.format(ex)
    return self.to_stream_type(stream_type=stream_type, step=step, **kwargs)
def get_parsed_line(self, line: str, item_type: Union[ItemType, arg.Auto] = AUTO) -> Item:
    item_type = arg.delayed_acquire(item_type, self.get_default_item_type)
    if item_type in (ItemType.Line, ItemType.Any, ItemType.Auto):
        return line
    elif item_type == ItemType.Row:
        return [line]
    elif item_type == ItemType.Record:
        return dict(line=line)
    elif item_type == ItemType.StructRow:
        return ItemType.StructRow.build(data=[line], struct=['line'])
    else:
        msg = 'item_type {} is not supported for {}.parse_lines()'
        raise ValueError(msg.format(item_type, self.__class__.__name__))
def to_stream_type(
        self,
        stream_type: StreamType,
        step: AutoCount = AUTO,
        verbose: AutoBool = AUTO,
        **kwargs,
) -> Stream:
    stream_type = arg.delayed_acquire(stream_type, self._get_stream_type)
    item_type = self._get_item_type(stream_type)
    data = kwargs.pop('data', None)
    if not arg.is_defined(data):
        data = self._get_items_of_type(item_type, step=step, verbose=verbose)
    stream_kwargs = self.get_stream_kwargs(data=data, step=step, verbose=verbose, **kwargs)
    return stream_type.stream(**stream_kwargs)
def detect(cls, obj, default=arg.AUTO) -> ClassType:
    if isinstance(obj, str):
        name = obj
    elif inspect.isclass(obj):
        name = obj.__name__
    else:
        name = obj.__class__.__name__
    if name == 'ItemType':
        item_type_name = obj.get_name()
        if item_type_name == 'StructRow':
            stream_type_obj = StreamType.StructStream
        else:
            stream_type_name = '{}Stream'.format(item_type_name)
            stream_type_obj = cls.find_instance(stream_type_name)
        if stream_type_obj is None:
            stream_type_obj = arg.delayed_acquire(default, cls.get_default)
        return stream_type_obj
    return StreamType(name)
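# A self-contained sketch of the ItemType -> StreamType naming convention that
# detect() above relies on; the registry contents here are assumptions for
# illustration, not the library's actual set of stream types.
_registered_stream_types = {'RecordStream', 'RowStream', 'LineStream', 'StructStream'}

def stream_type_name_for_item_type(item_type_name: str) -> str:
    # StructRow is the one irregular mapping; all others just append 'Stream'.
    if item_type_name == 'StructRow':
        return 'StructStream'
    return '{}Stream'.format(item_type_name)

assert stream_type_name_for_item_type('Record') in _registered_stream_types
assert stream_type_name_for_item_type('StructRow') == 'StructStream'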
def log(
        self,
        msg: Union[str, list, tuple],
        level: Level = arg.AUTO,
        logger: Union[BaseLogger, arg.Auto] = arg.AUTO,
        end: Union[str, arg.Auto] = arg.AUTO,
        verbose: bool = True,
        truncate: bool = True,
        category: Optional[Type] = None,
        stacklevel: Optional[int] = None,
) -> None:
    level = arg.acquire(level, LoggingLevel.Info if verbose else LoggingLevel.Debug)
    logger = arg.delayed_acquire(logger, self.get_base_logger)
    if isinstance(msg, BaseException):
        msg = str(msg)
    if isinstance(msg, str):
        msg = [msg]
    elif isinstance(msg, Iterable):
        msg = list(msg)
    else:
        raise TypeError('Expected msg as str or list[str], got {}'.format(msg))
    if category:
        category_name = arg.get_name(category)
        msg = [category_name] + msg
    if stacklevel:
        caller = getframeinfo(stack()[stacklevel + 1][0])
        file_name_without_path = caller.filename.split('\\')[-1].split('/')[-1]
        msg = ['{}:{}:'.format(file_name_without_path, caller.lineno)] + msg
    if isinstance(msg, (list, tuple)):
        msg = self.format_message(*msg)
    if not isinstance(level, LoggingLevel):
        level = LoggingLevel(level)
    if logger:
        if self.is_suitable_level(level):
            logging_method = getattr(logger, level.get_method_name())
            logging_method(msg)
    if verbose and not self.is_suitable_level(level):
        self.show(msg, end=end, truncate=truncate)
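# The stacklevel handling above relies on inspect.getframeinfo / inspect.stack;
# a minimal standalone sketch of that mechanism (independent of the logger class,
# function names here are hypothetical):
from inspect import getframeinfo, stack

def caller_tag(stacklevel: int = 1) -> str:
    # Frame 0 is caller_tag itself, so stacklevel selects how many frames to go up.
    caller = getframeinfo(stack()[stacklevel][0])
    file_name = caller.filename.split('\\')[-1].split('/')[-1]
    return '{}:{}'.format(file_name, caller.lineno)

def example():
    return caller_tag(stacklevel=2)  # tags example()'s caller, not example() itself

print(example())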
def convert(
        cls,
        obj: Union[EnumItem, Name],
        default: Union[EnumItem, arg.Auto, None] = arg.AUTO,
        skip_missing: bool = False,
):
    assert cls.is_prepared(), 'DynamicEnum must be prepared before usage'
    if isinstance(obj, cls):
        return obj
    for string in cls._get_name_and_value(obj):
        instance = cls.find_instance(string)
        if instance:
            return instance
    default = arg.delayed_acquire(default, cls.get_default)
    if default:
        return cls.convert(default)
    elif not skip_missing:
        raise ValueError('item {} is not an instance of DynamicEnum {}'.format(obj, cls.get_enum_name()))
def log(
        self,
        msg: Union[str, list, tuple],
        level: Level = arg.AUTO,
        logger: Union[BaseLogger, arg.Auto] = arg.AUTO,
        end: Union[str, arg.Auto] = arg.AUTO,
        verbose: bool = True,
        truncate: bool = True,
) -> None:  # NoReturn would claim the function never returns; it returns None
    level = arg.acquire(level, LoggingLevel.Info if verbose else LoggingLevel.Debug)
    logger = arg.delayed_acquire(logger, self.get_base_logger)
    if isinstance(msg, (list, tuple)):
        msg = self.format_message(*msg)
    if not isinstance(level, LoggingLevel):
        level = LoggingLevel(level)
    if logger:
        if self.is_suitable_level(level):
            logging_method = getattr(logger, level.get_method_name())
            logging_method(msg)
    if verbose and not self.is_suitable_level(level):
        self.show(msg, end=end, truncate=truncate)
def get_stream_kwargs(
        self,
        data: Union[Iterable, Auto] = AUTO,
        name: AutoName = AUTO,
        verbose: AutoBool = AUTO,
        step: AutoCount = AUTO,
        **kwargs
) -> dict:
    name = arg.delayed_acquire(name, self._get_generated_stream_name)
    if not arg.is_defined(data):
        item_type = self._get_item_type()
        data = self._get_items_of_type(item_type, verbose=verbose, step=step)
    result = dict(
        data=data,
        name=name,
        source=self,
        count=self._get_fast_count(),
        context=self.get_context(),
    )
    result.update(kwargs)
    return result
def get_parsed_line(self, line: str, item_type: Union[ItemType, arg.Auto] = AUTO, default_value=None) -> Item:
    item_type = arg.delayed_acquire(item_type, self.get_default_item_type)
    if item_type in (ItemType.Record, ItemType.Row, ItemType.Any, ItemType.Auto):
        parsed = self._parse_json_line(line, default_value=default_value)
        if isinstance(parsed, ARRAY_TYPES) and item_type == ItemType.Record:
            return dict(item=parsed)
        elif isinstance(parsed, dict) and item_type == ItemType.Row:
            return [parsed]
        else:
            return parsed
    elif item_type == ItemType.Line:
        return line
    elif item_type == ItemType.StructRow:
        return ItemType.StructRow.build(data=[line], struct=['line'])
    else:
        msg = 'item_type {} is not supported for {}.parse_lines()'
        raise ValueError(msg.format(item_type, self.__class__.__name__))
def get_items_from_lines(
        self,
        lines: Iterable,
        item_type: Union[ItemType, Auto] = AUTO,
        struct: Union[Array, StructInterface, Auto] = AUTO,
) -> Generator:
    item_type = arg.delayed_acquire(item_type, self.get_default_item_type)
    if item_type in (ItemType.Record, ItemType.Row, ItemType.StructRow, ItemType.Any, ItemType.Auto):
        iter_parser = fs.csv_reader(delimiter=self.get_delimiter())
        rows = iter_parser(lines)
        if isinstance(struct, StructInterface):
            column_names = struct.get_columns()
            field_converters = struct.get_converters()
            rows = map(self._get_row_converter(converters=field_converters), rows)
        elif isinstance(struct, ARRAY_TYPES):
            column_names = struct
        else:
            column_names = None
        if item_type in (ItemType.Row, ItemType.Any, ItemType.Auto):
            yield from rows
        elif item_type == ItemType.Record:
            for r in rows:
                if column_names:
                    yield {k: v for k, v in zip(column_names, r)}
                else:
                    yield {k: v for k, v in enumerate(r)}
        elif item_type == ItemType.StructRow:
            assert arg.is_defined(struct, check_name=False)
            for r in rows:
                yield ItemType.StructRow.build(data=r, struct=struct)
    else:  # item_type == ItemType.Line
        for line in lines:
            yield self.get_parsed_line(line, item_type=item_type, struct=struct)
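# Self-contained sketch of the Record branch above: parsed CSV rows are zipped
# with column names when struct is a plain list, and values stay as strings
# unless struct is a StructInterface whose converters cast each column. Uses the
# stdlib csv module directly; the data here is hypothetical.
import csv

lines = ['1\tfoo', '2\tbar']
rows = csv.reader(lines, delimiter='\t')
column_names = ['id', 'name']
records = [{k: v for k, v in zip(column_names, r)} for r in rows]
assert records == [{'id': '1', 'name': 'foo'}, {'id': '2', 'name': 'bar'}]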