def __init__(
        self,
        name: str,
        content_format: Union[ContentFormatInterface, Auto] = AUTO,
        struct: Union[Struct, Auto, None] = AUTO,
        folder: Connector = None,
        context: Context = AUTO,
        first_line_is_title: AutoBool = AUTO,
        expected_count: AutoCount = AUTO,
        caption: Optional[str] = None,
        verbose: AutoBool = AUTO,
        **kwargs
):
    """Initialize the file connector after resolving the folder that owns it.

    The folder is taken from the first available source, in this order:
    the ``folder`` argument, a ``parent`` keyword argument, the job folder
    of ``context``, the default folder returned by ``get_default_folder()``.
    The resolved folder is passed to the base initializer as ``parent``.
    """
    explicit_parent = kwargs.pop('parent', None)
    if not folder:
        if Auto.is_defined(explicit_parent):
            folder = explicit_parent
        elif Auto.is_defined(context):
            folder = context.get_job_folder()
        else:
            folder = self.get_default_folder()
    else:
        wrong_folder_msg = 'only LocalFolder supported for *File instances (got {})'.format(type(folder))
        assert isinstance(folder, ConnectorInterface) or folder.is_folder(), wrong_folder_msg
        # A parent given together with a folder must be equal to that folder.
        assert folder == explicit_parent or not Auto.is_defined(explicit_parent)
    # The attribute is created before the base initializer runs.
    self._fileholder = None
    super().__init__(
        name=name,
        content_format=content_format,
        struct=struct,
        first_line_is_title=first_line_is_title,
        expected_count=expected_count,
        caption=caption,
        parent=folder,
        context=context,
        verbose=verbose,
        **kwargs,
    )
def __init__(
        self,
        data: Iterable,
        name: AutoName = AUTO,
        check: bool = False,
        count: AutoCount = None,
        less_than: AutoCount = None,
        source: Connector = None,
        context: Context = None,
        max_items_in_memory: AutoCount = AUTO,
        tmp_files: TmpMask = AUTO,
):
    """Initialize the stream and then resolve its temporary-files mask.

    When a non-zero count is obtained from ``get_optional_len()`` and
    ``less_than`` is not defined, the count is also used as ``less_than``.
    """
    items_count = get_optional_len(data, count)
    if items_count and Auto.is_defined(items_count):
        if not Auto.is_defined(less_than):
            less_than = items_count
    # Placeholder set before the base initializer; the real value needs get_name().
    self._tmp_files = None
    super().__init__(
        data=data,
        name=name,
        check=check,
        source=source,
        context=context,
        count=items_count,
        less_than=less_than,
        max_items_in_memory=max_items_in_memory,
    )
    stream_name = self.get_name()
    self._tmp_files = Auto.delayed_acquire(tmp_files, sm.get_tmp_mask, stream_name)
def get_new_progress(self, name: str, count: Optional[int] = None, context: AutoContext = AUTO):
    """Ask the available logger for a new progress object.

    Uses the own logger, or the logger of ``context`` when the own one is
    not defined. Returns None when the logger cannot create progress objects.
    """
    logger = self.get_logger()
    if Auto.is_defined(context) and not Auto.is_defined(logger):
        logger = context.get_logger()
    can_make_progress = isinstance(logger, ExtendedLoggerInterface) or hasattr(logger, 'get_new_progress')
    if not can_make_progress:
        return None
    return logger.get_new_progress(name, count=count, context=context)
def __init__(
        self,
        mask: str,
        parent: HierarchicConnector,
        context: AutoContext = None,
        verbose: AutoBool = AUTO,
):
    """Initialize a mask-based connector; the mask is passed to the base class as ``path``.

    When ``parent`` is not defined but ``context`` is, the local storage of
    the context becomes the parent. The parent must be a folder or a storage.
    """
    parent_is_missing = not Auto.is_defined(parent)
    if parent_is_missing and Auto.is_defined(context):
        parent = context.get_local_storage()
    assert parent.is_folder() or parent.is_storage()
    super().__init__(path=mask, parent=parent, context=context, verbose=verbose)
def set_context(self, context: AutoContext, reset: bool = False, inplace: bool = True) -> Optional[Native]:
    """Attach a context to this object through its parent chain.

    A truthy context is forwarded to the parent when one is defined;
    otherwise a defined context becomes the parent itself.
    Returns self only when ``inplace`` is False. ``reset`` is not used here.
    """
    if context:
        current_parent = self.get_parent()
        if not Auto.is_defined(current_parent):
            if Auto.is_defined(context):
                self.set_parent(context, reset=False, inplace=True)
        else:
            current_parent.set_context(context, reset=False, inplace=True)
    if inplace:
        return None
    return self
def get_count(self, allow_reopen: bool = True, allow_slow_mode: bool = True, force: bool = False) -> Count:
    """Return the lines count, recounting when it is forced, outdated or unknown.

    The recount happens only for an existing object and its result is stored
    via ``set_count()``. Returns None when the resulting count is not defined.
    """
    if force:
        needs_recount = True
    elif self.is_outdated():
        needs_recount = True
    else:
        needs_recount = not Auto.is_defined(self.get_prev_lines_count())
    if self.is_existing() and needs_recount:
        lines_count = self.get_actual_lines_count(
            allow_reopen=allow_reopen,
            allow_slow_mode=allow_slow_mode,
        )
        self.set_count(lines_count)
    else:
        lines_count = self.get_prev_lines_count()
    return lines_count if Auto.is_defined(lines_count) else None
def get_group_header(self, name: Comment = AUTO, caption: Comment = AUTO, comment: Comment = None) -> Iterable[str]:
    """Yield the header lines of a group: name, caption, fields count, comment.

    Name and caption fall back to the own name and caption; undefined values
    are skipped. The fields-count line is emitted only when ``name`` was
    left as AUTO.
    """
    with_fields_count = name == AUTO
    header_name = Auto.acquire(name, self.get_name())
    header_caption = Auto.acquire(caption, self.get_caption())
    for text in (header_name, header_caption):
        if Auto.is_defined(text):
            yield text
    if with_fields_count:
        yield self.get_str_fields_count()
    if Auto.is_defined(comment):
        yield comment
def to_stream(
        self,
        data: Union[Iterable, Auto] = AUTO,
        name: AutoName = AUTO,
        stream_type: Union[StreamType, Auto] = AUTO,
        ex: OptionalFields = None,
        step: AutoCount = AUTO,
        **kwargs
) -> Stream:
    """Build a stream of the requested type from given data or from own items.

    The item type is asked from the stream class, or from an empty instance
    of it when the class itself has no ``get_item_type``. Meta-information
    comes from ``get_compatible_meta()`` and is completed with a fast count
    and with self as the source when those keys are absent.
    """
    name = Auto.delayed_acquire(name, self._get_generated_stream_name)
    stream_type = self._get_stream_type(stream_type)
    stream_class = self._get_stream_class(stream_type)

    item_type = AUTO
    if hasattr(stream_class, 'get_item_type'):
        item_type = stream_class.get_item_type()
    else:
        empty_stream = stream_class([])
        if hasattr(empty_stream, 'get_item_type'):
            item_type = empty_stream.get_item_type()

    if not Auto.is_defined(data):
        verbose = kwargs.get('verbose', AUTO)
        data = self._get_items_of_type(item_type, verbose=verbose, step=step)

    meta = self.get_compatible_meta(stream_class, name=name, ex=ex, **kwargs)
    count_is_provided = 'count' in meta or 'count' in kwargs
    if not count_is_provided:
        meta['count'] = self._get_fast_count()
    if 'source' not in meta:
        meta['source'] = self
    return self._assume_stream(stream_class(data, **meta))
def get_struct_repr_lines(
        self,
        example: Optional[dict] = None,
        delimiter: str = COLUMN_DELIMITER,
        select_fields: Optional[Array] = None,
        count: Optional[int] = None,
) -> Generator:
    """Yield text lines describing the struct: a title line, then one line per field.

    :param example: optional record; when truthy, each field row also shows
        the string form of ``example.get(name)``.
    :param delimiter: when it is a tab, cells are joined by tabs; otherwise
        the template from ``_get_describe_template()`` is used.
    :param select_fields: names of fields whose validity mark is highlighted
        ('.' becomes '>', any other mark is upper-cased).
    :param count: when defined, iteration stops once the field number
        reaches ``count - 1``.
    :returns: generator of strings; group rows are rendered as an empty line
        followed by the lines of ``get_group_header()``.
    """
    columns, template = self._get_describe_template(example)
    separate_by_tabs = delimiter == '\t'

    def render(cells) -> str:
        if separate_by_tabs:
            # str.join() accepts only strings, while a row carries the field number
            # (compared numerically with count below), so cells are converted first.
            return '\t'.join(map(str, cells))
        return template.format(*cells)

    yield render(columns)
    for n, type_name, name, caption, is_valid in self.get_struct_description(include_header=False):
        if type_name == GROUP_TYPE_STR:
            yield ''
            yield from self.get_group_header(name, caption=caption)
        else:
            if name in (select_fields or []):
                is_valid = '>' if is_valid == '.' else str(is_valid).upper()
            if example:
                value = str(example.get(name))
                row = (is_valid, n, type_name, name, value, caption)
            else:
                row = (is_valid, n, type_name, name, caption)
            yield render(row)
        if Auto.is_defined(count) and n >= count - 1:
            break
def simple_select(
        self,
        fields: OptionalFields,
        filters: OptionalFields = None,
        sort: OptionalFields = None,
        count: Count = None,
        stream_type: Union[StreamType, Auto] = AUTO,
        verbose: AutoBool = AUTO,
) -> Stream:
    """Run a select and wrap its rows into a row stream or a record stream.

    For a record stream every row is zipped with ``get_columns()`` into a dict.
    When ``count`` is defined and below MAX_ITEMS_IN_MEMORY, the data is
    materialized into a list and the count is replaced by its real length.
    Raises NotImplementedError for any other stream type.
    """
    stream_type = Auto.acquire(stream_type, StreamType.RecordStream)
    stream_class = stream_type.get_class()
    selected_rows = self.execute_select(
        fields=fields,
        filters=filters,
        sort=sort,
        count=count,
        verbose=verbose,
    )
    if stream_type == StreamType.RowStream:
        items = selected_rows
    elif stream_type == StreamType.RecordStream:
        columns = self.get_columns()

        def row_to_record(row):
            return dict(zip(columns, row))

        items = map(row_to_record, selected_rows)
    else:
        raise NotImplementedError
    if Auto.is_defined(count) and count < MAX_ITEMS_IN_MEMORY:
        items = list(items)
        count = len(items)
    return stream_class(items, count=count, source=self, context=self.get_context())
def to_stream(
        self,
        data: Union[Iterable, Auto] = AUTO,
        name: AutoName = AUTO,
        stream_type: Union[StreamType, Auto] = AUTO,
        ex: OptionalFields = None,
        step: AutoCount = AUTO,
        **kwargs
) -> Stream:
    """Build a stream over this object; an SQL stream by default.

    An SQL stream is created here with self as the source and must not
    receive explicit data. Every other stream type is delegated to the
    parent implementation.
    """
    stream_type = Auto.acquire(stream_type, StreamType.SqlStream)
    if stream_type != StreamType.SqlStream:
        return super().to_stream(
            data=data,
            name=name,
            stream_type=stream_type,
            ex=ex,
            step=step,
            **kwargs,
        )
    assert not Auto.is_defined(data)
    name = Auto.delayed_acquire(name, self._get_generated_stream_name)
    sql_stream_class = stream_type.get_class()
    meta = self.get_compatible_meta(sql_stream_class, name=name, ex=ex, **kwargs)
    meta['source'] = self
    return sql_stream_class(data, **meta)
def insert_data(
        self,
        table: Union[Table, Name],
        data: Data,
        struct: Struct = None,
        encoding: Optional[str] = None,
        skip_errors: bool = False,
        skip_lines: Count = 0,
        skip_first_line: bool = False,
        step: AutoCount = DEFAULT_STEP,
        verbose: AutoBool = AUTO,
) -> tuple:
    """Insert data into a table and return the pair (initial_count, final_count).

    A struct that is not a struct object is reported as deprecated and
    wrapped into FlatStruct. The data is turned into a stream, the first
    ``skip_lines`` items are skipped, and the stream is structured when it
    is not a StructStream yet. The initial count is the estimated count of
    the stream plus the skipped lines.
    """
    if not Auto.is_defined(skip_lines):
        skip_lines = 0
    if not (isinstance(struct, StructInterface) or hasattr(struct, 'get_struct_str')):
        deprecation_msg = 'Struct as {} is deprecated, use FlatStruct instead'.format(type(struct))
        self.log(msg=deprecation_msg, level=LoggingLevel.Warning)
        struct = FlatStruct(struct or [])
    stream = self._get_struct_stream_from_data(
        data,
        struct=struct,
        encoding=encoding,
        skip_first_line=skip_first_line,
        verbose=verbose,
    )
    if skip_lines:
        stream = stream.skip(skip_lines)
    if stream.get_stream_type() != StreamType.StructStream:
        structured = stream.structure(struct, skip_bad_rows=True, verbose=True)
        # The count is taken from the stream as it was before structuring.
        stream = structured.update_meta(count=stream.get_count())
    initial_count = stream.get_estimated_count() + skip_lines
    final_count = self.insert_struct_stream(
        table,
        stream,
        skip_errors=skip_errors,
        step=step,
        verbose=verbose,
    )
    return initial_count, final_count
def get_types_list(self, dialect: Union[DialectType, Auto] = DialectType.String) -> list:
    """Return the types of all fields, expressed in ``dialect`` when it is defined.

    With an undefined dialect the plain ``get_type()`` of every field is used.
    """
    use_dialect = Auto.is_defined(dialect)
    return [
        field.get_type_in(dialect) if use_dialect else field.get_type()
        for field in self.get_fields()
    ]
def get_detected_format(
        self,
        detect: bool = True,
        force: bool = False,
        skip_missing: bool = True,
) -> ContentFormatInterface:
    """Return the detected content format, running detection first when needed.

    Detection runs when ``force`` is set, or when ``detect`` is set and no
    detected format is stored yet.
    """
    needs_detection = force
    if not needs_detection and detect:
        needs_detection = not Auto.is_defined(self._detected_format)
    if needs_detection:
        self.reset_detected_format(use_declared_types=True, skip_missing=skip_missing)
    return self._detected_format
def format(self, value, skip_errors: bool = False) -> str:
    """Format a value with the own representation, or with ``str()`` when there is none.

    If the call with ``skip_errors`` raises AttributeError, the representation
    is called again with the value only.
    """
    representation = self.get_representation()
    if not Auto.is_defined(representation):
        return str(value)
    try:
        formatted = representation.format(value, skip_errors=skip_errors)
    except AttributeError:
        formatted = representation.format(value)
    return formatted
def _get_native_struct(self, raw_struct: Struct, save_if_not_yet: bool = False, verbose: AutoBool = AUTO) -> Struct:
    """Convert any supported struct description into a native struct object.

    :param raw_struct: None, a struct object, an object with ``get_fields``,
        a list/tuple of columns (deprecated form), or AUTO.
    :param save_if_not_yet: passed as ``set_struct`` to the detection methods
        used for AUTO.
    :param verbose: enables the deprecation message for list structs; when
        not defined, the own ``is_verbose()`` is used if available.
    :returns: the native struct, or None when the input is None or when
        nothing could be detected for AUTO.
    :raises TypeError: for any unsupported kind of ``raw_struct``.
    """
    if hasattr(self, 'is_verbose') and not Auto.is_defined(verbose):
        verbose = self.is_verbose()
    if raw_struct is None:
        native_struct = None
    elif isinstance(raw_struct, StructInterface):
        native_struct = raw_struct
    elif hasattr(raw_struct, 'get_fields'):
        struct_class = self._get_struct_class()
        native_struct = struct_class(raw_struct)
    elif isinstance(raw_struct, ARRAY_TYPES):
        if verbose:
            msg = 'Struct as list is deprecated, use FlatStruct(StructInterface) class instead'
            # Report through whatever logging facility this object provides.
            if hasattr(self, 'get_logger'):
                logger = self.get_logger()
                logger.warning(msg, category=DeprecationWarning, stacklevel=2)
            elif hasattr(self, 'log'):
                # NOTE(review): 30 presumably means the warning level; confirm against log().
                self.log(msg=msg, level=30)
            else:
                print(msg)
        column_names = raw_struct
        # any() instead of max(): max() raises ValueError on an empty sequence,
        # while an empty list must simply count as "no type descriptions".
        has_types_descriptions = any(isinstance(f, ARRAY_TYPES) for f in raw_struct)
        if has_types_descriptions:
            struct_class = self._get_struct_class()
            native_struct = struct_class(raw_struct)
        else:
            native_struct = self._get_struct_detected_by_title_row(column_names)
    elif raw_struct == AUTO:
        native_struct = None
        if hasattr(self, 'get_struct_from_source'):
            native_struct = self.get_struct_from_source(set_struct=save_if_not_yet, verbose=verbose)
        elif hasattr(self, 'is_first_line_title'):
            if self.is_first_line_title():
                if hasattr(self, 'get_detected_struct_by_title_row'):
                    native_struct = self.get_detected_struct_by_title_row(
                        set_struct=save_if_not_yet,
                        verbose=verbose,
                    )
                elif hasattr(self, 'get_title_row'):
                    title_row = self.get_title_row(close=True)
                    native_struct = self._get_struct_detected_by_title_row(title_row)
    else:
        message = 'struct must be FlatStruct(StructInterface), got {}'.format(type(raw_struct))
        raise TypeError(message)
    return native_struct
def get_logger(self, create_if_not_yet=True) -> LoggerInterface:
    """Return the stored logger, binding it to self when it has no context yet.

    Without a stored logger a new one is returned if ``create_if_not_yet``
    is set; otherwise the result is None.
    """
    logger = self._logger
    if not Auto.is_defined(logger, check_name=False):
        return self.get_new_logger() if create_if_not_yet else None
    context_aware = isinstance(logger, ExtendedLoggerInterface) or hasattr(logger, 'get_context')
    if context_aware and not logger.get_context() and hasattr(logger, 'set_context'):
        logger.set_context(self)
    return self._logger
def map(self, function: Callable, to: OptStreamType = AUTO) -> Native:
    """Apply a function to every item and return the resulting stream.

    A defined ``to`` argument is deprecated: a warning is logged and the call
    goes to ``map_to()``. If this stream is in memory, the result is moved to
    memory as well when it supports ``to_memory``.
    """
    if not Auto.is_defined(to):
        mapped = super().map(function)
    else:
        self.get_logger().warning('to-argument for map() is deprecated, use map_to() instead')
        mapped = self.map_to(function, stream_type=to)
    if self.is_in_memory() and hasattr(mapped, 'to_memory'):
        mapped = mapped.to_memory()
    return self._assume_native(mapped)
def reset_struct_to_initial(self, verbose: bool = True, message: Optional[str] = None) -> Native:
    """Replace the current struct with the initial one and return the result of ``struct()``.

    In verbose mode the comparison of the current and the initial struct is
    logged line by line; the message defaults to the repr of self.
    """
    if not Auto.is_defined(message):
        message = self.__repr__()
    initial_struct = self.get_initial_struct()
    if verbose:
        current_struct = self.get_struct()
        comparison_lines = current_struct.get_struct_comparison_iter(initial_struct, message=message)
        for comparison_line in comparison_lines:
            self.log(comparison_line)
    return self.struct(initial_struct)
def get_logger(self, create_if_not_yet: bool = True) -> Optional[Logger]:
    """Return the logger of this context, creating and storing one on demand.

    An existing logger that supports contexts and has none is bound to self.
    Returns None when there is no logger and creation is not allowed.
    """
    logger = self.logger
    if not Auto.is_defined(logger):
        if not create_if_not_yet:
            return None
        logger = lg.get_logger(context=self)
        self.set_logger(logger, inplace=True)
        return logger
    supports_context = hasattr(logger, 'get_context') and hasattr(logger, 'set_context')
    if supports_context and not logger.get_context():
        logger.set_context(self)
    return logger
def _get_stream_type(self, stream_type: Union[StreamType, Auto] = AUTO) -> StreamType:
    """Return the given stream type, or the own one when it is not defined.

    The fallback is ``get_stream_type()``, then ``get_default_stream_type()``,
    then detection from the default item type.
    """
    if Auto.is_defined(stream_type):
        return stream_type
    if hasattr(self, 'get_stream_type'):
        return self.get_stream_type()
    if hasattr(self, 'get_default_stream_type'):
        return self.get_default_stream_type()
    default_item_type = self.get_default_item_type()
    return StreamType.detect(default_item_type)
def set_verbose(self, verbose: AutoBool = AUTO, parent: AutoConnector = AUTO) -> Native:
    """Store the verbose flag, inheriting it from the parent when not defined.

    The parent is asked through ``is_verbose()`` or its ``verbose`` attribute;
    DEFAULT_VERBOSE is used when the parent provides neither. Returns self.
    """
    if not Auto.is_defined(verbose):
        owner = Auto.delayed_acquire(parent, self.get_parent)
        if hasattr(owner, 'is_verbose'):
            verbose = owner.is_verbose()
        else:
            verbose = owner.verbose if hasattr(owner, 'verbose') else DEFAULT_VERBOSE
    self._verbose = verbose
    return self
def get_lines(
        self,
        count: Optional[int] = None,
        skip_first: bool = False,
        allow_reopen: bool = True,
        check: bool = True,
        verbose: AutoBool = AUTO,
        message: AutoName = AUTO,
        step: AutoCount = AUTO,
) -> Generator:
    """Open the file and return its lines, wrapped into a progress when requested.

    With ``check`` set, a non-gzip file must be non-empty. The progress wrapper
    is used in verbose mode or when a message is defined; a '{}' placeholder
    in the message is filled with the file name.
    """
    if check and not self.is_gzip():
        assert not self.is_empty(), 'for get_lines() file must be non-empty: {}'.format(self)
    self.open(allow_reopen=allow_reopen)
    lines = self.get_next_lines(count=count, skip_first=skip_first, close=True)
    verbose = Auto.acquire(verbose, self.is_verbose())
    if not (verbose or Auto.is_defined(message)):
        return lines
    if not Auto.is_defined(message):
        message = 'Reading {}'
    if '{}' in message:
        message = message.format(self.get_name())
    logger = self.get_logger()
    assert hasattr(logger, 'progress'), '{} has no progress in {}'.format(self, logger)
    if not count:
        # Only a count available without the slow mode is used for the progress.
        count = self.get_count(allow_slow_mode=False)
    return self.get_logger().progress(lines, name=message, count=count, step=step)
def get_fast_lines_count(self, ending: Union[str, Auto] = AUTO, verbose: AutoBool = AUTO) -> int:
    """Count lines by counting line endings in the chunks of the file.

    The result is the number of endings plus one; it is stored via
    ``set_count()`` and returned. Raises ValueError for gzip files.
    """
    if self.is_gzip():
        raise ValueError('get_fast_lines_count() method is not available for gzip-files')
    if not Auto.is_defined(ending):
        ending = self.get_content_format().get_ending() if hasattr(self, 'get_content_format') else '\n'
    verbose = Auto.acquire(verbose, self.is_verbose())
    self.log('Counting lines in {}...'.format(self.get_name()), end='\r', verbose=verbose)
    endings_found = 0
    for chunk in self.get_chunks():
        endings_found += chunk.count(ending)
    lines_count = endings_found + 1
    self.set_count(lines_count)
    return lines_count
def to_stream(
        self,
        data: Union[Iterable, Auto] = AUTO,
        name: AutoName = AUTO,
        stream_type: Union[StreamType, Auto] = AUTO,
        ex: OptionalFields = None,
        step: AutoCount = AUTO,
        **kwargs
) -> Stream:
    """Build a stream from this file by delegating to ``to_stream_type()``.

    Defined data is forwarded as the ``data`` keyword argument; the stream
    type defaults to the own one. The ``ex`` argument is not supported and
    must be empty. ``name`` is not used here.
    """
    if Auto.is_defined(data):
        kwargs['data'] = data
    resolved_type = Auto.delayed_acquire(stream_type, self.get_stream_type)
    assert not ex, 'ex-argument for LocalFile.to_stream() not supported (got {})'.format(ex)
    return self.to_stream_type(stream_type=resolved_type, step=step, **kwargs)
def _collect_inplace(self, log: AutoBool = AUTO) -> None:
    """Load all items into memory, replacing the data of this stream in place.

    When the estimated count is defined, an undefined ``log`` is resolved to
    whether that count exceeds the in-memory limit. Both log messages report
    the estimated count obtained before collecting.
    """
    expected = self.get_estimated_count()
    if Auto.is_defined(expected):
        exceeds_limit = expected > self.get_limit_items_in_memory()
        log = Auto.acquire(log, exceeds_limit)
    if log and expected:
        start_msg = 'Trying to collect {} items into memory from {}...'.format(expected, self.__repr__())
        self.log(start_msg)
    self.set_data(self.get_list(), inplace=True)
    self.update_count(force=False)
    if log:
        done_msg = 'Collected {} items into memory from {}...'.format(expected, self.__repr__())
        self.log(done_msg)
def assert_not_empty(self, message: Union[Auto, str, None] = AUTO, skip_error: bool = False) -> Native:
    """Check that the stream has items; warn and raise ValueError when it is empty.

    An iterator stream is collected into memory first. With ``skip_error``
    set, an empty stream produces only the warning. Returns self.
    """
    if self.is_iter():
        self._collect_inplace()
    if not Auto.is_defined(message):
        message = 'Empty stream: {}'
    if '{}' in message:
        message = message.format(self)
    if not self.is_empty():
        return self
    self.get_logger().warning(msg=message, stacklevel=2)
    if skip_error:
        return self
    raise ValueError(message)
def get_struct_from_database(
        self,
        types: AutoLinks = AUTO,
        set_struct: bool = False,
        skip_missing: bool = False,
        verbose: AutoBool = AUTO,
) -> StructInterface:
    """Build a FlatStruct from the table description and return it.

    Raises ValueError for an empty struct unless ``skip_missing`` is set.
    Defined ``types`` are applied to the struct; with ``set_struct`` the
    struct is also stored in this object.
    """
    table_description = self.describe_table(verbose=verbose)
    detected_struct = FlatStruct(table_description)
    if detected_struct.is_empty() and not skip_missing:
        raise ValueError('Can not get struct for non-existing table {}'.format(self))
    if Auto.is_defined(types):
        detected_struct.set_types(types, inplace=True)
    if set_struct:
        self.set_struct(detected_struct, inplace=True)
    return detected_struct
def _get_item_type(self, stream: Union[StreamType, RegularStream, Auto] = AUTO) -> ItemType:
    """Return the item type of a stream given as a stream type, a stream, or AUTO.

    A stream type (or anything with ``get_class``) is resolved to its class;
    an undefined argument means the own stream class. The item type is asked
    from the resolved object, or from an empty instance of it.
    """
    if isinstance(stream, StreamType) or hasattr(stream, 'get_class'):
        target = self._get_stream_class(stream)
    elif Auto.is_defined(stream):
        target = stream
    else:
        target = self._get_stream_class()
    assert isinstance(target, RegularStream) or hasattr(target, 'get_item_type')
    if not hasattr(target, 'get_item_type'):
        target = target([])
    return target.get_item_type()
def validate_fields(self, initial: bool = True) -> Native:
    """Validate the struct found in the source against the expected one and store the result.

    With ``initial`` set, the expected struct is a copy of the initial struct,
    or the struct read from the source (and saved) when no initial struct is
    defined. Otherwise the current struct is expected. Returns self.
    """
    if not initial:
        expected_struct = self.get_struct()
    else:
        expected_struct = self.get_initial_struct()
        if Auto.is_defined(expected_struct):
            expected_struct = expected_struct.copy()
        else:
            expected_struct = self.get_struct_from_source(set_struct=True, verbose=True)
    source_struct = self.get_struct_from_source(set_struct=False, verbose=False)
    source_struct = self._get_native_struct(source_struct)
    self.set_struct(source_struct.validate_about(expected_struct), inplace=True)
    return self