Exemplo n.º 1
0
 def stack(cls,
           *iter_streams,
           how: How = 'vertical',
           name=AUTO,
           context=None,
           **kwargs):
     """Combine several streams of the same type into a single stream.

     The first stream is the starting point: it is copied when it exposes a
     ``copy`` method (otherwise used as is) and receives ``name`` and
     ``context`` when those are defined according to ``arg.is_defined``.
     Every following stream is appended with ``add_stream`` when ``how`` is
     ``'vertical'``, or merged with ``join(..., how=how, **kwargs)`` otherwise.

     :param iter_streams: streams to combine (normalised by ``arg.update``).
     :param how: ``'vertical'`` to append, any other value is passed to ``join``.
     :param name: name to set on the resulting stream, if defined.
     :param context: context to set on the resulting stream, if defined.
     :param kwargs: extra keyword arguments forwarded to ``join``.
     :return: the combined stream.
     :raises AssertionError: when ``cls.is_same_stream_type`` rejects the
         streams or an element is not a ``StreamInterface``.
     """
     streams = arg.update(iter_streams)
     same_type = cls.is_same_stream_type(streams)
     # NOTE(review): the message mentions concat() although this method is stack() - confirm.
     assert same_type, 'concat(): streams must have same type: {}'.format(
         streams)
     stacked = None
     for stream in streams:
         assert isinstance(stream, StreamInterface)
         if stacked is not None:
             if how == 'vertical':
                 stacked = stacked.add_stream(stream)
             else:
                 stacked = stacked.join(stream, how=how, **kwargs)
         else:
             stacked = stream.copy() if hasattr(stream, 'copy') else stream
             if arg.is_defined(name):
                 stacked.set_name(name)
             if arg.is_defined(context):
                 stacked.set_context(context)
         # A garbage-collection pass is run after each processed stream.
         gc.collect()
     return stacked
Exemplo n.º 2
0
 def group_by(
     self,
     *keys,
     values: Columns = None,
     step: AutoCount = AUTO,
     as_pairs: bool = False,
     take_hash: bool = True,
     verbose: bool = True,
 ) -> Stream:
     """Sort the stream by the given keys and group the sorted items.

     :param keys: grouping keys; if the first resolved key has a
         ``get_field_names`` method, its field names replace the key list.
     :param values: columns forwarded to ``sorted_group_by``.
     :param step: sort step; resolved with ``arg.acquire`` against
         ``self.max_items_in_memory``.
     :param as_pairs: when true the keys themselves are used as the sort key,
         otherwise a key function built by ``get_key_function`` is used.
     :param take_hash: forwarded to ``get_key_function``.
     :param verbose: forwarded to ``sort``.
     :return: result of ``sorted_group_by`` on the sorted stream.
     """
     key_names = arg.get_names(arg.update(keys))
     value_names = arg.get_names(values)
     first_key = key_names[0]
     # A field-group-like first key (one that has get_field_names) is expanded into its fields.
     if hasattr(first_key, 'get_field_names'):
         key_names = first_key.get_field_names()
     step = arg.acquire(step, self.max_items_in_memory)
     sorting_key = key_names if as_pairs else get_key_function(
         key_names, take_hash=take_hash)
     ordered = self.sort(sorting_key, step=step, verbose=verbose)
     return ordered.sorted_group_by(
         key_names,
         values=value_names,
         as_pairs=as_pairs,
     )
Exemplo n.º 3
0
 def map_side_join(
         self,
         right: Native,
         key: UniKey,
         how: How = JoinType.Left,
         right_is_uniq: bool = True,
         inplace: bool = False,
 ) -> Optional[Native]:
     """Join this stream with ``right`` using ``algo.map_side_join``.

     :param right: stream whose items form the right side of the join.
     :param key: key (or keys) used to build the composite key function.
     :param how: join type; converted to ``JoinType`` when it is not one already.
     :param right_is_uniq: forwarded to the algorithm as ``uniq_right``.
     :param inplace: when true, the joined items are stored into this stream
         and nothing is returned; otherwise a new stream is returned.
     :return: new stream with the compatible static meta applied, or ``None``
         in the in-place case.
     """
     key_names = arg.update([arg.get_names(key)])
     join_type = how if isinstance(how, JoinType) else JoinType(how)
     joined = algo.map_side_join(
         iter_left=self.get_items(),
         iter_right=right.get_items(),
         key_function=fs.composite_key(key_names),
         merge_function=fs.merge_two_items(),
         dict_function=fs.items_to_dict(),
         how=join_type,
         uniq_right=right_is_uniq,
     )
     # An in-memory stream gets a materialised list of joined items.
     if self.is_in_memory():
         joined = list(joined)
     if not inplace:
         result = self.stream(joined).set_meta(
             **self.get_compatible_static_meta())
         return self._assume_native(result)
     self.set_items(joined, count=self.get_count(), inplace=True)
Exemplo n.º 4
0
 def sorted_group_by(
     self,
     *keys,
     values: Columns = None,
     as_pairs: bool = False,
     skip_missing: bool = False,
 ) -> Stream:
     """Group the items of this stream by key.

     The groups come from ``self._get_groups``; presumably the stream is
     expected to be sorted by the same keys already - confirm against callers.

     :param keys: grouping keys (names or callables).
     :param values: when non-empty, each group is folded with
         ``fs.fold_lists`` and the result becomes a RecordStream.
     :param as_pairs: build a KeyValueStream instead of a RowStream.
     :param skip_missing: forwarded to ``fs.fold_lists``.
     :return: the grouped stream, moved to memory when this stream is in memory.
     """
     key_names = arg.get_names(arg.update(keys), or_callable=True)
     value_names = arg.get_names(values)
     groups = self._get_groups(
         self._get_key_function(key_names), as_pairs=as_pairs)
     if as_pairs:
         grouped = sm.KeyValueStream(
             groups, value_stream_type=StreamType.RowStream)
     else:
         grouped = sm.RowStream(groups, check=False)
     if value_names:
         folder = fs.fold_lists(
             keys=key_names,
             values=value_names,
             skip_missing=skip_missing,
             item_type=self.get_item_type(),
         )
         grouped = grouped.map_to_type(
             folder, stream_type=StreamType.RecordStream)
     if self.is_in_memory():
         return grouped.to_memory()
     # Otherwise keep the stream as built and carry over the (estimated) item count.
     grouped.set_estimated_count(
         self.get_count() or self.get_estimated_count())
     return grouped
Exemplo n.º 5
0
 def sorted_group_by(
     self,
     *keys,
     values: Columns = None,
     as_pairs: bool = False,
     skip_missing: bool = False,
 ) -> Stream:
     """Group the items of this stream by key.

     :param keys: grouping keys, resolved to names with ``arg.get_names``.
     :param values: when non-empty, every group is folded with
         ``ms.fold_lists`` and the result becomes a RecordStream.
     :param as_pairs: build a KeyValueStream instead of a RowStream.
     :param skip_missing: forwarded to ``ms.fold_lists``.
     :return: the grouped stream, moved to memory when this stream is in memory.
     """
     key_names = arg.get_names(arg.update(keys))
     value_names = arg.get_names(values)
     grouped_items = self._get_groups(
         get_key_function(key_names), as_pairs=as_pairs)
     if as_pairs:
         result = sm.KeyValueStream(
             grouped_items, value_stream_type=StreamType.RowStream)
     else:
         result = sm.RowStream(grouped_items, check=False)
     if value_names:
         result = result.map_to_type(
             lambda row: ms.fold_lists(
                 row, key_names, value_names, skip_missing=skip_missing),
             stream_type=StreamType.RecordStream,
         )
     if self.is_in_memory():
         return result.to_memory()
     # Otherwise keep the stream as built and carry over the (estimated) item count.
     result.set_estimated_count(
         self.get_count() or self.get_estimated_count())
     return result
Exemplo n.º 6
0
 def to_row_stream(self, *args, **kwargs) -> RowStream:
     """Build a RowStream from the items of this stream.

     The way items are converted is chosen by the first matching case:

     1. ``function`` keyword: used as the item mapper.
     2. positional arguments present: a callable first argument is the
        mapper; for LineStream/AnyStream a non-callable first argument is the
        delimiter; otherwise the positional arguments are left untouched.
     3. RecordStream (no positional arguments): ``columns`` and
        ``add_title_row`` are popped from kwargs and passed to ``get_rows``.
     4. ``delimiter`` keyword on a LineStream/AnyStream.

     With a mapper, every item is mapped as ``function(item, *args, **kwargs)``;
     with a delimiter, items are parsed by ``fs.csv_reader``; otherwise items
     are passed through unchanged.

     :return: stream of type ``StreamType.RowStream`` passed through
         ``self._assume_native``.
     """
     function, delimiter = None, None
     if 'function' in kwargs:
         function = kwargs.pop('function')
     elif args:
         if callable(args[0]):
             # The first positional argument is consumed as the mapper.
             function, args = args[0], args[1:]
         elif self.get_stream_type() in (StreamType.LineStream,
                                         StreamType.AnyStream):
             # The first positional argument is consumed as the delimiter.
             delimiter, args = args[0], args[1:]
     elif self.get_stream_type() == StreamType.RecordStream:
         # args is empty here, because a non-empty args is caught by the branch above.
         add_title_row = kwargs.pop('add_title_row', None)
         columns = arg.update(args, kwargs.pop('columns', None))
         assert isinstance(self, RecordStream)
         if not columns:
             columns = self.get_columns()
         # NOTE(review): the result of get_rows() is stored as `function` and later
         # called as function(item, ...); confirm that get_rows() returns a callable.
         function = self.get_rows(columns=columns,
                                  add_title_row=add_title_row)
     elif 'delimiter' in kwargs and self.get_stream_type() in (
             StreamType.LineStream, StreamType.AnyStream):
         delimiter = kwargs.pop('delimiter')
     # NOTE(review): this branch is unreachable - a truthy `args` is always
     # handled by the earlier `elif args:`; confirm the intended order.
     elif args:
         assert not kwargs
         return self.to_any_stream().select(*args)
     if function:
         # Remaining positional/keyword arguments are forwarded to the mapper.
         items = self._get_mapped_items(
             lambda i: function(i, *args, **kwargs))
     elif delimiter:
         # Remaining positional/keyword arguments are forwarded to fs.csv_reader.
         csv_reader = fs.csv_reader(delimiter=delimiter, *args, **kwargs)
         items = csv_reader(self.get_items())
     else:
         items = self.get_items()
     stream = self.stream(items, stream_type=StreamType.RowStream)
     return self._assume_native(stream)
Exemplo n.º 7
0
def not_in(*list_values) -> Callable:
    """Build a predicate that is true for values absent from the given values.

    :param list_values: excluded values (normalised by ``arg.update``).
    :return: function taking one value and returning ``value not in`` the
        excluded collection.
    """
    excluded_values = arg.update(list_values)

    def func(value: Any) -> bool:
        is_listed = value in excluded_values
        return not is_listed

    return func
Exemplo n.º 8
0
def composite_key(*functions) -> Callable:
    """Build a function that computes a composite key for an item.

    :param functions: key descriptions (normalised by ``arg.update``) that
        are handed to ``sf.get_composite_key`` as ``keys_descriptions``.
    :return: function taking one item and returning the value of
        ``sf.get_composite_key`` for it.
    """
    descriptions = arg.update(functions)

    def func(item) -> tuple:
        composite = sf.get_composite_key(
            item=item,
            keys_descriptions=descriptions,
        )
        return composite

    return func
Exemplo n.º 9
0
 def sorted_group_by(
         self,
         *keys,
         values: Optional[Iterable] = None,
         as_pairs: bool = False,
         output_struct: Optional[StructInterface] = None,
         skip_missing: bool = True,  # tmp
 ) -> Stream:
     """Group the items of this stream by key.

     :param keys: grouping keys; used both for the grouping key function and
         (through ``self._get_field_getter``) as output keys when folding.
     :param values: when non-empty, every group is folded with
         ``fs.fold_lists`` using getters built for these fields.
     :param as_pairs: build a KeyValueStream instead of a RowStream.
     :param output_struct: when given (and ``values`` is non-empty), groups
         are mapped to a RowStream and ``structure(output_struct)`` is applied.
     :param skip_missing: forwarded to ``fs.fold_lists``.
     :return: the grouped stream, moved to memory when this stream is in memory.
     """
     key_list = arg.update(keys)
     grouping_key = self._get_key_function(key_list, take_hash=False)
     key_getters = [self._get_field_getter(k) for k in key_list]
     groups = self._get_groups(grouping_key, as_pairs=as_pairs)
     if as_pairs:
         pair_stream_class = StreamType.KeyValueStream.get_class()
         grouped = pair_stream_class(groups, value_stream_type=self.get_stream_type())
     else:
         row_stream_class = StreamType.RowStream.get_class()
         grouped = row_stream_class(groups, check=False)
     if values:
         item_type = self.get_item_type()
         value_getters = [self._get_field_getter(v, item_type=item_type) for v in values]
         folder = fs.fold_lists(
             keys=key_getters, values=value_getters,
             skip_missing=skip_missing, item_type=item_type,
         )
         if output_struct:
             # With an output struct the folded groups are rows that get structured afterwards.
             grouped = grouped.map_to_type(folder, stream_type=StreamType.RowStream)
             grouped = grouped.structure(output_struct)
         else:
             grouped = grouped.map_to_type(folder, stream_type=self.get_stream_type())
     if not self.is_in_memory():
         grouped.set_estimated_count(self.get_count() or self.get_estimated_count(), inplace=True)
         return grouped
     return grouped.to_memory()
Exemplo n.º 10
0
 def group_by(self,
              *keys,
              values: Optional[Iterable] = None,
              as_pairs: bool = False) -> Stream:
     """Sort the stream by the given keys and group the sorted items.

     :param keys: grouping keys, resolved with ``arg.get_names`` and then
         normalised by ``arg.update``.
     :param values: columns forwarded (as names) to ``sorted_group_by``.
     :param as_pairs: forwarded to ``sorted_group_by``.
     :return: result of ``sorted_group_by`` on the sorted stream.
     """
     key_names = arg.update(arg.get_names(keys))
     value_names = arg.get_names(values)
     ordered = self.sort(*key_names)
     return ordered.sorted_group_by(
         *key_names,
         values=value_names,
         as_pairs=as_pairs,
     )
Exemplo n.º 11
0
 def _get_uniq_records(self, *keys) -> Iterable:
     """Yield records whose key differs from the key of the preceding record.

     Only consecutive repeats are dropped, so presumably the records are
     expected to be ordered by the same keys - confirm against callers.

     :param keys: fields that form the comparison key.
     :return: generator over the retained records.
     """
     key_function = get_key_function(arg.get_names(arg.update(keys)))
     # AUTO serves as the initial "no previous key yet" marker.
     previous_key = AUTO
     for record in self.get_records():
         current_key = key_function(record)
         is_new_key = current_key != previous_key
         previous_key = current_key
         if is_new_key:
             yield record
Exemplo n.º 12
0
 def format_message(
         self, *messages,
         max_len: Union[int, arg.Auto] = arg.AUTO,
         truncate: bool = True,
 ) -> str:
     """Join message parts into one line, truncating it when it is too long.

     Every part is converted with ``str`` and the parts are joined with
     ``SPACE``. When ``truncate`` is set and the line is longer than
     ``max_len``, it is cut and ``TRUNCATED_SUFFIX`` is appended so that the
     result is exactly ``max_len`` characters long (or just the suffix when
     ``max_len`` is shorter than the suffix).

     :param messages: message parts (normalised by ``arg.update``).
     :param max_len: maximal line length; resolved with ``arg.acquire``
         against ``self.max_line_len``.
     :param truncate: whether to cut lines longer than ``max_len``.
     :return: the formatted line.
     """
     messages = arg.update(messages)
     max_len = arg.acquire(max_len, self.max_line_len)
     message = SPACE.join(map(str, messages))
     if truncate and len(message) > max_len:
         # Reserve room for the actual suffix instead of assuming it is 2 characters long;
         # clamp at 0 so a tiny max_len cannot turn into a negative (from-the-end) slice.
         kept_len = max(max_len - len(TRUNCATED_SUFFIX), 0)
         message = message[:kept_len] + TRUNCATED_SUFFIX
     return message
Exemplo n.º 13
0
 def remove_fields(self, *fields, multiple: bool = False, inplace: bool = True):
     """Remove the given fields from this object.

     :param fields: fields (or their names) to remove; compared by the name
         obtained from ``arg.get_name`` / ``arg.get_names``.
     :param multiple: in the in-place mode, ``False`` stops after the first
         removed field, ``True`` removes every matching field. Not used in
         the non-in-place mode, where all matching fields are left out.
     :param inplace: when true, the list returned by ``self.get_fields()`` is
         modified and nothing is returned (assumes ``get_fields()`` returns
         the live list - TODO confirm); otherwise a new object built by
         ``self.make_new`` is returned.
     :return: ``None`` in the in-place mode, the new object otherwise.
     """
     removing_fields = arg.update(fields)
     removing_field_names = arg.get_names(removing_fields)
     existing_fields = self.get_fields()
     if inplace:
         # Iterate over a snapshot: removing from the list that is being iterated
         # shifts the remaining items, so the element after each removal was skipped.
         for e in list(existing_fields):
             if arg.get_name(e) in removing_field_names:
                 existing_fields.remove(e)
                 if not multiple:
                     break
     else:
         new_fields = [f for f in existing_fields if arg.get_name(f) not in removing_field_names]
         return self.make_new(new_fields)
Exemplo n.º 14
0
 def remove_fields(self, *fields, inplace: bool = True):
     """Remove the given field names from the field descriptions in place.

     A description's name is its first element when it is one of
     ``ARRAY_TYPES``, the result of ``get_name()`` when it has that method,
     and the description itself otherwise.

     :param fields: names to remove (normalised by ``arg.update``).
     :param inplace: must be true; the non-in-place mode is not implemented.
     :raises NotImplementedError: when ``inplace`` is false.
     """
     removing_fields = arg.update(fields)
     existing_fields = self.get_fields_descriptions()
     if not inplace:
         raise NotImplementedError

     def name_of(description):
         if isinstance(description, ARRAY_TYPES):
             return description[0]
         if hasattr(description, 'get_name'):
             return description.get_name()
         return description

     # A copy is iterated so that removals do not disturb the iteration.
     for description in existing_fields.copy():
         if name_of(description) in removing_fields:
             existing_fields.remove(description)
Exemplo n.º 15
0
def maybe(*conditions) -> Callable:
    """Build a predicate that is true when at least one check passes.

    :param conditions: optional callables (normalised by ``arg.update``).
    :return: when conditions are given, a function of one value that is true
        if any condition is truthy for it; otherwise a function of any number
        of values that is true if any of the values is truthy (and false when
        called without values).
    """
    conditions = arg.update(conditions)

    def func_conditioned(value) -> bool:
        return any(c(value) for c in conditions)

    def func_simple(*values) -> bool:
        # any() yields False for no values, whereas max() over an empty sequence raised ValueError.
        # NOTE(review): a single list/tuple argument counts as one value (its own truthiness),
        # it is not unpacked - confirm this is intended.
        return any(values)

    if conditions:
        return func_conditioned
    else:
        return func_simple
Exemplo n.º 16
0
def never(*conditions) -> Callable:
    """Build a predicate that is true when no check passes.

    :param conditions: optional callables (normalised by ``arg.update``).
    :return: when conditions are given, a function of one value that is true
        if no condition is truthy for it; otherwise a function of one value
        returning ``not value``.
    """
    conditions = arg.update(conditions)

    def func_conditioned(value) -> bool:
        return not any(c(value) for c in conditions)

    def func_simple(value) -> bool:
        return not value

    return func_conditioned if conditions else func_simple
Exemplo n.º 17
0
def always(*conditions) -> Callable:
    """Build a predicate that is true when every check passes.

    :param conditions: optional callables (normalised by ``arg.update``).
    :return: when conditions are given, a function of one value that is true
        if every condition is truthy for it; otherwise a function of any
        number of values (normalised by ``arg.update``) that is true if all
        of them are truthy (and true when there are no values).
    """
    conditions = arg.update(conditions)

    def func_conditioned(value) -> bool:
        return all(c(value) for c in conditions)

    def func_simple(*values) -> bool:
        values = arg.update(values)
        # all() yields True for no values, whereas min() over an empty sequence raised ValueError.
        return all(values)

    if conditions:
        return func_conditioned
    else:
        return func_simple
Exemplo n.º 18
0
 def add_fields(
         self,
         *fields,
         default_type: Optional[Type] = None,
         exclude_duplicates: bool = False,
         name: StructName = None,
         reassign_struct_name: bool = False,
         inplace: bool = False,
 ) -> Optional[Native]:
     """Add fields to this struct, in place or as a new object.

     :param fields: fields to add (normalised by ``arg.update``).
     :param default_type: forwarded to ``self.append`` (in-place mode only).
     :param exclude_duplicates: forwarded to ``self.append`` (in-place mode only).
     :param name: name passed to ``self.make_new`` (non-in-place mode only).
     :param reassign_struct_name: forwarded to ``self.append`` (in-place mode only).
     :param inplace: when true, every field is appended to this object and
         nothing is returned; otherwise a new object is built from the
         current field descriptions followed by the new fields.
     :return: the new object, or ``None`` in the in-place mode.
     """
     new_fields = arg.update(fields)
     if not inplace:
         combined = self.get_fields_descriptions() + list(new_fields)
         return self.make_new(fields=combined, name=name)
     for field in new_fields:
         self.append(
             field,
             default_type=default_type,
             exclude_duplicates=exclude_duplicates,
             reassign_struct_name=reassign_struct_name,
             inplace=True,
         )
Exemplo n.º 19
0
 def sort(self,
          *keys,
          reverse: bool = False,
          step: AutoCount = AUTO,
          verbose: AutoBool = True) -> Native:
     """Sort the stream by the given keys.

     Without keys the key function is ``fs.same()``; otherwise it is
     ``fs.composite_key(keys)``. ``memory_sort`` is used when
     ``self.can_be_in_memory(step=step)`` holds or ``step`` is ``None``;
     ``disk_sort`` is used in every other case.

     :param keys: sort keys (normalised by ``arg.update``).
     :param reverse: forwarded to the chosen sort method.
     :param step: resolved with ``arg.acquire`` against
         ``self.max_items_in_memory``; forwarded to ``disk_sort``.
     :param verbose: forwarded to the chosen sort method.
     :return: the sorted stream passed through ``self._assume_native``.
     """
     keys = arg.update(keys)
     step = arg.acquire(step, self.max_items_in_memory)
     key_function = fs.composite_key(keys) if len(keys) else fs.same()
     fits_in_memory = self.can_be_in_memory(step=step) or step is None
     if not fits_in_memory:
         ordered = self.disk_sort(
             key_function, reverse=reverse, step=step, verbose=verbose)
     else:
         ordered = self.memory_sort(
             key_function, reverse=reverse, verbose=verbose)
     return self._assume_native(ordered)
Exemplo n.º 20
0
 def map_side_join(self,
                   right: Native,
                   key: UniKey,
                   how: How = JoinType.Left,
                   right_is_uniq: bool = True) -> Native:
     """Join this stream with ``right`` using ``algo.map_side_join``.

     :param right: stream whose items form the right side of the join.
     :param key: key (or keys) used to build the composite key function.
     :param how: join type; converted to ``JoinType`` when it is not one already.
     :param right_is_uniq: forwarded to the algorithm as ``uniq_right``.
     :return: new stream of joined items with this stream's static meta.
     """
     key_names = arg.update([arg.get_names(key)])
     join_type = how if isinstance(how, JoinType) else JoinType(how)
     joined = algo.map_side_join(
         iter_left=self.get_items(),
         iter_right=right.get_items(),
         key_function=fs.composite_key(key_names),
         merge_function=fs.merge_two_items(),
         dict_function=fs.items_to_dict(),
         how=join_type,
         uniq_right=right_is_uniq,
     )
     # An in-memory stream gets a materialised list of joined items.
     if self.is_in_memory():
         joined = list(joined)
     result = self.stream(joined).set_meta(**self.get_static_meta())
     return self._assume_native(result)
Exemplo n.º 21
0
def get_composite_key(item,
                      keys_descriptions: list,
                      item_type=arg.AUTO,
                      logger=None,
                      skip_errors=True) -> tuple:
    """Compute a tuple of key values for an item.

    Each description that has a ``get_field_names`` method is first replaced
    by the result of that method. A callable description is then called with
    the item; any other description is resolved with ``value_from_item``.

    :param item: item to take the key values from.
    :param keys_descriptions: key descriptions (normalised by ``arg.update``).
    :param item_type: forwarded to ``value_from_item``.
    :param logger: forwarded to ``value_from_item``.
    :param skip_errors: forwarded to ``value_from_item``.
    :return: tuple with one value per description.
    """
    descriptions = []
    for description in arg.update(keys_descriptions):
        if hasattr(description, 'get_field_names'):
            description = description.get_field_names()
        descriptions.append(description)

    def extract(description):
        if isinstance(description, Callable):
            return description(item)
        return value_from_item(
            item,
            description,
            item_type=item_type,
            logger=logger,
            skip_errors=skip_errors,
        )

    return tuple([extract(description) for description in descriptions])
Exemplo n.º 22
0
 def sorted_join(
     self,
     right: Native,
     key: UniKey,
     how: How = JoinType.Left,
     sorting_is_reversed: bool = False,
 ) -> Native:
     """Join this stream with ``right`` using ``algo.sorted_join``.

     :param right: stream whose items form the right side of the join.
     :param key: key (or keys) used to build the composite key function.
     :param how: join type; converted to ``JoinType`` when it is not one already.
     :param sorting_is_reversed: passed as ``reverse`` to ``bf.is_ordered``,
         which builds the order function for the algorithm.
     :return: new stream of joined items created with this stream's static meta.
     """
     key_list = arg.update([key])
     join_type = how if isinstance(how, JoinType) else JoinType(how)
     order_check = bf.is_ordered(reverse=sorting_is_reversed, including=True)
     joined = algo.sorted_join(
         iter_left=self.get_iter(),
         iter_right=right.get_iter(),
         key_function=fs.composite_key(key_list),
         merge_function=fs.merge_two_items(),
         order_function=order_check,
         how=join_type,
     )
     # An in-memory stream gets a materialised list of joined items.
     if self.is_in_memory():
         joined = list(joined)
     return self.stream(joined, **self.get_static_meta())
Exemplo n.º 23
0
 def func_simple(*values) -> bool:
     """Return True when every given value is truthy.

     :param values: values to check (normalised by ``arg.update``).
     :return: ``True`` if all values are truthy, including the case of no values.
     """
     values = arg.update(values)
     # all() yields True for no values, whereas min() over an empty sequence raised ValueError.
     return all(values)
Exemplo n.º 24
0
 def is_same_stream_type(*iter_streams) -> bool:
     """Tell whether all given streams report one and the same stream type.

     :param iter_streams: streams to compare (normalised by ``arg.update``).
     :return: ``True`` when the ``get_stream_type()`` results contain exactly
         one distinct value; ``False`` otherwise, including for no streams.
     """
     streams = arg.update(iter_streams)
     distinct_types = {stream.get_stream_type() for stream in streams}
     return len(distinct_types) == 1