def sorted_group_by(
        self,
        *keys,
        values: Columns = None,
        as_pairs: bool = False,
        skip_missing: bool = False,
) -> Stream:
    """Group a stream that is already sorted by *keys*.

    A new group starts each time the key value changes, so the input
    must be pre-sorted by the same keys.

    :param keys: fields (or field names) to group by
    :param values: columns to fold into per-group lists, if given
    :param as_pairs: emit (key, rows) pairs instead of row groups
    :param skip_missing: forwarded to the fold when *values* is set
    :returns: grouped Stream (materialized when the source is in memory)
    """
    key_names = arg.get_names(arg.update(keys))
    value_names = arg.get_names(values)
    group_iter = self._get_groups(get_key_function(key_names), as_pairs=as_pairs)
    if as_pairs:
        grouped = sm.KeyValueStream(group_iter, value_stream_type=StreamType.RowStream)
    else:
        grouped = sm.RowStream(group_iter, check=False)
    if value_names:
        # fold the listed value columns into lists, one entry per group
        grouped = grouped.map_to_type(
            lambda r: ms.fold_lists(r, key_names, value_names, skip_missing=skip_missing),
            stream_type=StreamType.RecordStream,
        )
    if self.is_in_memory():
        return grouped.to_memory()
    grouped.set_estimated_count(self.get_count() or self.get_estimated_count())
    return grouped
def group_by(
        self,
        *keys,
        values: Columns = None,
        step: AutoCount = AUTO,
        as_pairs: bool = False,
        take_hash: bool = True,
        verbose: bool = True,
) -> Stream:
    """Group stream items by key fields: sort first, then group adjacent rows.

    :param keys: fields (or field names) to group by; must not be empty
    :param values: columns to fold into per-group lists, if given
    :param step: chunk size for the external sort (AUTO: max_items_in_memory)
    :param as_pairs: emit (key, rows) pairs instead of grouped rows
    :param take_hash: sort by hash of the key instead of the key itself
    :param verbose: forwarded to sort()
    :returns: grouped Stream
    :raises ValueError: if no keys are given (previously a bare IndexError)
    """
    keys = arg.update(keys)
    keys = arg.get_names(keys)
    values = arg.get_names(values)
    if not keys:
        # fail with a clear message instead of IndexError on keys[0] below
        raise ValueError('group_by: keys must be defined')
    if hasattr(keys[0], 'get_field_names'):  # if isinstance(keys[0], FieldGroup)
        keys = keys[0].get_field_names()
    step = arg.acquire(step, self.max_items_in_memory)
    if as_pairs:
        key_for_sort = keys
    else:
        key_for_sort = get_key_function(keys, take_hash=take_hash)
    sorted_stream = self.sort(
        key_for_sort,
        step=step,
        verbose=verbose,
    )
    grouped_stream = sorted_stream.sorted_group_by(
        keys,
        values=values,
        as_pairs=as_pairs,
    )
    return grouped_stream
def sorted_group_by(
        self,
        *keys,
        values: Columns = None,
        as_pairs: bool = False,
        skip_missing: bool = False,
) -> Stream:
    """Group a pre-sorted stream by *keys* (or by callable key descriptions).

    :param keys: fields, field names, or callables describing the group key
    :param values: columns to fold into per-group lists, if given
    :param as_pairs: emit (key, rows) pairs instead of row groups
    :param skip_missing: forwarded to the fold mapper when *values* is set
    :returns: grouped Stream (materialized when the source is in memory)
    """
    key_list = arg.get_names(arg.update(keys), or_callable=True)
    value_list = arg.get_names(values)
    grouped_iter = self._get_groups(self._get_key_function(key_list), as_pairs=as_pairs)
    if as_pairs:
        result = sm.KeyValueStream(grouped_iter, value_stream_type=StreamType.RowStream)
    else:
        result = sm.RowStream(grouped_iter, check=False)
    if value_list:
        item_type = self.get_item_type()  # ItemType.Record
        mapper = fs.fold_lists(
            keys=key_list,
            values=value_list,
            skip_missing=skip_missing,
            item_type=item_type,
        )
        result = result.map_to_type(mapper, stream_type=StreamType.RecordStream)
    if self.is_in_memory():
        return result.to_memory()
    result.set_estimated_count(self.get_count() or self.get_estimated_count())
    return result
def group_by(self, *keys, values: Optional[Iterable] = None, as_pairs: bool = False) -> Stream:
    """Sort by *keys*, then group adjacent items with equal key values.

    :param keys: fields (or field names) to group by
    :param values: columns to fold into per-group lists, if given
    :param as_pairs: emit (key, rows) pairs instead of grouped items
    :returns: grouped Stream

    Fix: flatten with arg.update() *before* resolving names — the original
    called them in the opposite order, unlike every sibling group_by /
    sorted_group_by implementation, so nested key lists were not flattened
    before name resolution.
    """
    keys = arg.update(keys)
    keys = arg.get_names(keys)
    values = arg.get_names(values)
    return self.sort(*keys).sorted_group_by(*keys, values=values, as_pairs=as_pairs)
def get_dataframe(self, columns: Columns = None) -> DataFrame:
    """Build a pandas DataFrame from the stream items.

    Returns None (despite the annotation) when pandas is unavailable or
    object output is disabled — preserved caller-visible behavior.

    :param columns: optional subset of columns to keep (fields or names)
    """
    if not (pd and get_use_objects_for_output()):
        return
    df = DataFrame(self.get_items())
    if arg.is_defined(columns):
        df = df[arg.get_names(columns)]
    return df
def map_side_join(
        self,
        right: Native,
        key: UniKey,
        how: How = JoinType.Left,
        right_is_uniq: bool = True,
        inplace: bool = False,
) -> Optional[Native]:
    """Join this stream with *right* using an in-memory map of the right side.

    :param right: stream providing the right-hand items
    :param key: join key field(s)
    :param how: join type (JoinType or its raw value)
    :param right_is_uniq: right side has at most one item per key
    :param inplace: replace this stream's items (returns None); otherwise
        return a new joined stream
    """
    key_names = arg.get_names(key)
    composite = arg.update([key_names])
    how = how if isinstance(how, JoinType) else JoinType(how)
    joined = algo.map_side_join(
        iter_left=self.get_items(),
        iter_right=right.get_items(),
        key_function=fs.composite_key(composite),
        merge_function=fs.merge_two_items(),
        dict_function=fs.items_to_dict(),
        how=how,
        uniq_right=right_is_uniq,
    )
    if self.is_in_memory():
        joined = list(joined)
    if inplace:
        self.set_items(joined, count=self.get_count(), inplace=True)
        return
    new_stream = self.stream(joined)
    new_stream = new_stream.set_meta(**self.get_compatible_static_meta())
    return self._assume_native(new_stream)
def get_rows(self, columns: Union[Columns, Auto] = AUTO, add_title_row=False) -> Iterable:
    """Yield each record as a list of values in column order.

    :param columns: columns to extract (AUTO: use self.get_columns())
    :param add_title_row: yield the column-name list first
    """
    columns = arg.get_names(arg.delayed_acquire(columns, self.get_columns))
    if add_title_row:
        yield columns
    for record in self.get_items():
        # dict.get returns None for columns missing from a record
        yield [record.get(column) for column in columns]
def get_struct_comparison_iter(self, other: StructInterface, message: Optional[str] = None) -> Iterable:
    """Yield human-readable lines describing how this struct differs from *other*.

    :param other: struct to compare against
    :param message: optional suffix appended to this struct's repr in the title
    """
    title = '{} {}'.format(self.__repr__(), message) if arg.is_defined(message) else self.__repr__()
    comparison = self.get_struct_comparison_dict(other)
    counts = {k: len(v) for k, v in comparison.items()}
    added_names = arg.get_names(comparison.get('added'))
    removed_names = arg.get_names(comparison.get('removed'))
    if not (added_names or removed_names):
        yield '{}: Struct is actual, will not be changed'.format(title)
        return
    # summary line expects 'saved'/'added'/'removed' keys in the comparison dict
    yield '{}: {saved} fields will be saved, {added} added, {removed} removed'.format(title, **counts)
    if added_names:
        yield 'Added {} fields: {}'.format(len(added_names), ', '.join(added_names))
    if removed_names:
        yield 'Removed {} fields: {}'.format(len(removed_names), ', '.join(removed_names))
def get_dataframe(self, columns: Optional[Iterable] = None) -> DataFrame:
    """Build a pandas DataFrame from the stream items.

    Returns None (despite the annotation) when pandas is unavailable or
    object output is disabled — preserved caller-visible behavior.

    :param columns: optional column list (fields or names) for the frame

    Fix: resolve field objects to names *before* constructing the
    DataFrame — the original passed raw columns to the constructor and
    then indexed with resolved names, which raises KeyError whenever the
    columns are field objects rather than plain strings (the sibling
    get_dataframe resolves names first).
    """
    if pd and get_use_objects_for_output():
        if columns:
            columns = arg.get_names(columns)
            dataframe = DataFrame(self.get_items(), columns=columns)
            dataframe = dataframe[columns]
        else:
            dataframe = DataFrame(self.get_items())
        return dataframe
def unfold_lists(fields, number_field='n', default_value=0) -> Callable:
    """Build a record mapper that expands list-valued fields into multiple records.

    :param fields: fields (or names) whose list values should be unfolded
    :param number_field: field receiving the 1-based position within the list
    :param default_value: value used when a list is shorter than the longest one
    :returns: generator function mapping one record to several
    """
    field_names = arg.get_names(fields)

    def mapper(record: dict) -> Iterable:
        yield from ms.unfold_lists(
            record,
            fields=field_names,
            number_field=number_field,
            default_value=default_value,
        )
    return mapper
def _get_uniq_records(self, *keys) -> Iterable:
    """Yield records whose key differs from the previous record's key.

    Assumes records are sorted by *keys*, so equal keys are adjacent and
    the result contains one record per distinct key value.
    """
    key_fields = arg.get_names(arg.update(keys))
    get_key = get_key_function(key_fields)
    previous = AUTO  # sentinel that never matches a real key value
    for record in self.get_records():
        current = get_key(record)
        if current != previous:
            yield record
            previous = current
def get_dict(
        self,
        key: Union[Field, Columns],
        value: Union[Field, Columns, None] = None,
        of_lists: bool = False,
        skip_errors: bool = False,
) -> dict:
    """Collect the stream into a dict keyed by *key*.

    :param key: field(s) providing the dict keys
    :param value: field(s) providing the dict values (None: whole item)
    :param of_lists: collect multiple values per key into lists
    :param skip_errors: forwarded to to_key_value_stream()
    """
    key_names = arg.get_names(key)
    kv_stream = self.to_key_value_stream(key_names, value, skip_errors=skip_errors)
    return kv_stream.get_dict(of_lists=of_lists)
def _get_key_function(self, descriptions: Array, take_hash: bool = False) -> Callable:
    """Build a row -> key function from one or more field descriptions.

    :param descriptions: key field(s); a single field yields its value,
        several fields yield a row of values
    :param take_hash: wrap the key in hash() (e.g. for hash-based sorting)
    :raises ValueError: if *descriptions* is empty
    """
    names = arg.get_names(descriptions)
    if not names:
        raise ValueError('key must be defined')
    if len(names) == 1:
        base_function = fs.partial(sf.value_from_row, names[0])
    else:
        base_function = fs.partial(sf.row_from_row, names)
    if take_hash:
        return lambda r: hash(base_function(r))
    return base_function
def remove_fields(self, *fields, multiple: bool = False, inplace: bool = True):
    """Remove the given fields from this struct.

    :param fields: fields (or names) to remove
    :param multiple: when True, remove every occurrence; otherwise stop
        after the first field removed
    :param inplace: mutate this struct's field list (returns None);
        otherwise return a new struct via make_new()

    Fix: the original removed elements from `existing_fields` while
    iterating over the same list, which makes the iterator skip the
    element following each removal (adjacent matches were missed when
    multiple=True). Iterating over a snapshot avoids that.
    """
    removing_fields = arg.update(fields)
    removing_field_names = arg.get_names(removing_fields)
    existing_fields = self.get_fields()
    if inplace:
        for e in list(existing_fields):  # snapshot: never mutate the list being iterated
            if arg.get_name(e) in removing_field_names:
                existing_fields.remove(e)
                if not multiple:
                    break
    else:
        # NOTE(review): this branch ignores `multiple` and drops every
        # occurrence — kept as-is to preserve caller-visible behavior
        new_fields = [f for f in existing_fields if arg.get_name(f) not in removing_field_names]
        return self.make_new(new_fields)
def map_side_join(self, right: Native, key: UniKey, how: How = JoinType.Left, right_is_uniq: bool = True) -> Native:
    """Join this stream with *right* using an in-memory map of the right side.

    :param right: stream providing the right-hand items
    :param key: join key field(s)
    :param how: join type (JoinType or its raw value)
    :param right_is_uniq: right side has at most one item per key
    :returns: new joined stream carrying this stream's static meta
    """
    key_names = arg.get_names(key)
    composite = arg.update([key_names])
    how = how if isinstance(how, JoinType) else JoinType(how)
    joined = algo.map_side_join(
        iter_left=self.get_items(),
        iter_right=right.get_items(),
        key_function=fs.composite_key(composite),
        merge_function=fs.merge_two_items(),
        dict_function=fs.items_to_dict(),
        how=how,
        uniq_right=right_is_uniq,
    )
    if self.is_in_memory():
        joined = list(joined)
    result = self.stream(joined).set_meta(**self.get_static_meta())
    return self._assume_native(result)
def get_records(self, columns: AutoColumns = AUTO) -> Generator:
    """Yield each row as a dict keyed by column name.

    :param columns: columns to use as keys (AUTO: use self.get_columns())
    """
    if columns == AUTO:
        columns = self.get_columns()
    column_names = arg.get_names(columns)
    for row in self.get_rows():
        yield dict(zip(column_names, row))
def get_field_names(self) -> list:
    """Return the name of every field in this struct, in order."""
    fields = self.get_fields()
    return arg.get_names(fields)
def get_records(self, columns: Union[Iterable, Auto] = AUTO) -> Iterable:
    """Return stream items, optionally narrowed to the given columns.

    :param columns: columns to select (AUTO: items pass through unchanged)
    """
    if columns == AUTO:
        return self.get_items()
    column_names = arg.get_names(columns)
    return self.select(*column_names).get_items()