示例#1
0
 def collect(self, item, collector_set = None):
   """Collects `item` for this set and forwards it to its pending members.

   The item is first handed to ``ItemCollector.collect`` for the set itself,
   then to every value of the set whose ``has_collected`` attribute is falsy.

   :param item: the value to collect
   :param collector_set: must be this very set; anything else (including the
     default ``None``) fails the assertion
   """
   assert collector_set is self
   ItemCollector.collect(self, item, self)
   # Members that already report `has_collected` are skipped.
   pending = filterfalse(attrgetter('has_collected'), self.values())
   each(methodcaller('collect', item, self), pending)
示例#2
0
    def __get_dependency_chain(self,
                               collectorset_description,
                               predecessors=None):
        """
        Builds the phase-wise collector descriptions for a single column.

        Templates whose type is ``None`` or is already contained in
        `predecessors` are left out. Pre-dependencies of the collectors in one
        phase are gathered into the next entry of the internal phase list;
        that list is reversed before it is returned, so dependencies come out
        ahead of the collectors that need them.

        Side effect: the 'independent' tag collector is stored in
        `predecessors` when one is given, otherwise in the last collected
        phase (if there is any).

        :param collectorset_description: iterable of collector templates
        :param predecessors: ItemCollectorSet or ``None``; ``None`` is treated
          as "no predecessors"
        :return: result of ``uiterator.filter(None, reversed(phases))``,
          presumably the non-empty phase dicts (verify against uiterator)
        """
        # ``None`` is the documented default, but membership tests on ``None``
        # raise. Use an empty tuple for lookups so the default is usable.
        known = () if predecessors is None else predecessors

        # First phase: collector type -> template for every template that has
        # a type and is not already provided by the predecessors.
        phase = dict(
            filterfalse(
                lambda item: item[0] is None or item[0] in known,
                ((template.get_type(predecessors), template)
                 for template in collectorset_description)))
        independent = TagCollector('independent', frozenset(phase.keys()),
                                   True)

        phases = []
        collector_min_phases = dict()
        phase_pre_dependencies = None
        phase_result_dependencies = set()

        def must_add_dependency(dep):
            return (dep not in phase and dep not in phase_result_dependencies
                    and dep not in phase_pre_dependencies)

        def add_dependencies(template):
            if template.pre_dependencies:
                # Pre-dependencies not covered by the predecessors are queued
                # for the next phase; the current phase count is recorded.
                for dep in filterfalse(known.__contains__,
                                       template.pre_dependencies):
                    phase_pre_dependencies.setdefault(dep, dep)
                    collector_min_phases[dep] = len(phases) - 1
            else:
                for dep in filter(must_add_dependency,
                                  template.result_dependencies):
                    add_dependencies(dep)
                    phase_result_dependencies.add(dep)

        # resolve dependencies and push them to an earlier phase
        while phase:
            phase_pre_dependencies = dict()
            phase_result_dependencies.clear()
            each(add_dependencies, phase.keys())
            phases.append(phase)
            phase = phase_pre_dependencies

        # remove later duplicates
        # NOTE(review): `-min_phase_idx` becomes negative for dependencies
        # recorded from the third phase on; itertools.islice rejects a
        # negative stop with ValueError. Confirm which `islice` is in scope
        # and that this bound is the intended one.
        consume((each(memberfn(dict.pop, ctype, None),
                      islice(phases, 0, -min_phase_idx))
                 for ctype, min_phase_idx in collector_min_phases.items()))

        if predecessors is not None:
            predecessors[independent] = independent
        elif phases:
            phases[-1][independent] = independent

        return uiterator.filter(None, reversed(phases))
示例#3
0
  def __init__(self, collectors = (), predecessor = None):
    """Sets up the collector set, optionally seeded from a predecessor set.

    :param collectors: iterable of items that are passed to ``self.add`` one by one
    :param predecessor: optional mapping of collectors; when truthy, all of
      its values must report ``has_collected`` and its entries are copied in
    """
    ItemCollector.__init__(self)
    collections.OrderedDict.__init__(self)
    self.predecessor = predecessor

    if predecessor:
      # Only take over entries from a predecessor that has finished collecting.
      assert all(member.has_collected for member in predecessor.values())
      self.update(predecessor)

    each(self.add, collectors)
示例#4
0
    def collect(self, items):
        """Collects the data of all columns of a row.

        When a diagnostics stream is configured and the row's length differs
        from the number of columns, a message is printed to that stream.

        NOTE(review): the internal row counter is only advanced for rows that
        trigger the message, so the reported number counts mismatching rows
        rather than all rows seen -- confirm this is intended.

        :param items: the values of one row; must have at least as many
          entries as this collector has columns (asserted)
        """
        log = self.__stderr
        if log is not None and len(self) != len(items):
            self.__rowcount += 1
            message = 'Row {} has {} columns, expected {}: {}'.format(
                self.__rowcount, len(items), len(self), items)
            print(message, file=log)

        assert len(self) <= len(items)
        # Presumably pairs each column collector with the item at its position.
        each(self.__collect_column, self, items)
示例#5
0
  def collect(self, items):
    """Collects the data of all columns of a row.

    When a diagnostics stream is configured and the row's length differs
    from the number of columns, a message is printed to that stream.

    NOTE(review): the internal row counter is only advanced for rows that
    trigger the message, so the reported number counts mismatching rows
    rather than all rows seen -- confirm this is intended.

    :param items: the values of one row; must have at least as many
      entries as this collector has columns (asserted)
    """
    log = self.__stderr
    if log is not None and len(self) != len(items):
      self.__rowcount += 1
      message = 'Row {} has {} columns, expected {}: {}'.format(
        self.__rowcount, len(items), len(self), items)
      print(message, file=log)

    assert len(self) <= len(items)
    # Presumably pairs each column collector with the item at its position.
    each(self.__collect_column, self, items)
  def __get_dependency_chain(self, collectorset_description, predecessors=None):
    """
    Builds the phase-wise collector descriptions for a single column.

    Templates whose type is ``None`` or is already contained in
    `predecessors` are left out. Pre-dependencies of the collectors in one
    phase are gathered into the next entry of the internal phase list;
    that list is reversed before it is returned, so dependencies come out
    ahead of the collectors that need them.

    Side effect: the 'independent' tag collector is stored in `predecessors`
    when one is given, otherwise in the last collected phase (if there is any).

    :param collectorset_description: iterable of collector templates
    :param predecessors: ItemCollectorSet or ``None``; ``None`` is treated
      as "no predecessors"
    :return: result of ``uiterator.filter(None, reversed(phases))``,
      presumably the non-empty phase dicts (verify against uiterator)
    """
    # ``None`` is the documented default, but membership tests on ``None``
    # raise. Use an empty tuple for lookups so the default is usable.
    known = () if predecessors is None else predecessors

    # First phase: collector type -> template for every template that has
    # a type and is not already provided by the predecessors.
    phase = dict(filterfalse(
      lambda item: item[0] is None or item[0] in known,
      ((template.get_type(predecessors), template)
        for template in collectorset_description)))
    independent = TagCollector('independent', frozenset(phase.keys()), True)

    phases = []
    collector_min_phases = dict()
    phase_pre_dependencies = None
    phase_result_dependencies = set()

    def must_add_dependency(dep):
      return (dep not in phase and
        dep not in phase_result_dependencies and
        dep not in phase_pre_dependencies)

    def add_dependencies(template):
      if template.pre_dependencies:
        # Pre-dependencies not covered by the predecessors are queued for
        # the next phase; the current phase count is recorded.
        for dep in filterfalse(known.__contains__, template.pre_dependencies):
          phase_pre_dependencies.setdefault(dep, dep)
          collector_min_phases[dep] = len(phases) - 1
      else:
        for dep in filter(must_add_dependency, template.result_dependencies):
          add_dependencies(dep)
          phase_result_dependencies.add(dep)

    # resolve dependencies and push them to an earlier phase
    while phase:
      phase_pre_dependencies = dict()
      phase_result_dependencies.clear()
      each(add_dependencies, phase.keys())
      phases.append(phase)
      phase = phase_pre_dependencies

    # remove later duplicates
    # NOTE(review): `-min_phase_idx` becomes negative for dependencies
    # recorded from the third phase on; itertools.islice rejects a negative
    # stop with ValueError. Confirm which `islice` is in scope and that this
    # bound is the intended one.
    consume((
      each(memberfn(dict.pop, ctype, None), islice(phases, 0, -min_phase_idx))
      for ctype, min_phase_idx in collector_min_phases.items()))

    if predecessors is not None:
      predecessors[independent] = independent
    elif phases:
      phases[-1][independent] = independent

    return uiterator.filter(None, reversed(phases))
示例#7
0
def collect_analyse_match(collectors, collectorset_description, **kwargs):
    """
    Collects data for every input, computes the norms for each pair of
    collectors and determines the best schema mapping per pair.

    :param collectors: list[io.IOBase | MultiphaseCollector] with at least
      two entries; ready-made MultiphaseCollector instances must already be
      sorted by ``MultiphaseCollector.columncount`` (asserted)
    :param collectorset_description: object providing ``descriptions`` and
      ``weights``
    :param kwargs: forwarded to ``collect``; ``verbose`` (>= 1 prints the
      norms to stderr) and ``number_format`` are also read here
    :return: list[MultiphaseCollector], list[int], list[int, int, float, list[int]]
      -- the middle element is ``None`` when collectors were passed ready-made
    """
    # ``collections.Sequence`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc``. Imported locally so the block is self-contained.
    from collections.abc import Sequence

    assert isinstance(collectors, Sequence) and len(collectors) >= 2
    collect_functor = \
      memberfn(collect, collectorset_description.descriptions, **kwargs)

    if isinstance(collectors[0], MultiphaseCollector):
        assert all(map(memberfn(isinstance, MultiphaseCollector), collectors))
        assert utilities.iterator.issorted(collectors,
                                           MultiphaseCollector.columncount)
        sort_order = None
        # Results are discarded here; the given collectors are kept as is.
        each(collect_functor, collectors)
    else:
        # The first collector shall have the least columns.
        sort_order, collectors = \
          utilities.iterator.sorted_with_order(
            map(collect_functor, collectors), MultiphaseCollector.columncount)

    # analyse collected data
    norms_combinations = [[
        c1_idx, c2_idx,
        MultiphaseCollector.results_norms(collectors[c1_idx],
                                          collectors[c2_idx],
                                          collectorset_description.weights),
        None
    ] for c1_idx, c2_idx in itertools.combinations(range(len(collectors)), 2)]

    if kwargs.get('verbose', 0) >= 1:
        formatter = memberfn(format, kwargs.get('number_format', ''))
        for c1_idx, c2_idx, norms, _ in norms_combinations:
            print(collectors[c2_idx].name,
                  collectors[c1_idx].name,
                  sep=' / ',
                  end='\n| ',
                  file=sys.stderr)
            print(*('  '.join(map(formatter, row)) for row in norms),
                  sep=' |\n| ',
                  end=' |\n\n',
                  file=sys.stderr)

    # find minimal combinations
    # The norms entry and the trailing ``None`` placeholder are replaced by
    # whatever get_best_schema_mapping returns for the norms.
    for norms_combination in norms_combinations:  # TODO: rewrite as functional clause
        norms_combination[2:4] = get_best_schema_mapping(norms_combination[2])

    return collectors, sort_order, norms_combinations
示例#8
0
def collect_analyse_match(collectors, collectorset_description, **kwargs):
  """
  Collects data for every input, computes the norms for each pair of
  collectors and determines the best schema mapping per pair.

  :param collectors: list[io.IOBase | MultiphaseCollector] with at least
    two entries; ready-made MultiphaseCollector instances must already be
    sorted by ``MultiphaseCollector.columncount`` (asserted)
  :param collectorset_description: object providing ``descriptions`` and
    ``weights``
  :param kwargs: forwarded to ``collect``; ``verbose`` (>= 1 prints the
    norms to stderr) and ``number_format`` are also read here
  :return: list[MultiphaseCollector], list[int], list[int, int, float, list[int]]
    -- the middle element is ``None`` when collectors were passed ready-made
  """
  # ``collections.Sequence`` was removed in Python 3.10; the ABC lives in
  # ``collections.abc``. Imported locally so the block is self-contained.
  from collections.abc import Sequence

  assert isinstance(collectors, Sequence) and len(collectors) >= 2
  collect_functor = \
    memberfn(collect, collectorset_description.descriptions, **kwargs)

  if isinstance(collectors[0], MultiphaseCollector):
    assert all(map(memberfn(isinstance, MultiphaseCollector), collectors))
    assert utilities.iterator.issorted(collectors, MultiphaseCollector.columncount)
    sort_order = None
    # Results are discarded here; the given collectors are kept as is.
    each(collect_functor, collectors)
  else:
    # The first collector shall have the least columns.
    sort_order, collectors = \
      utilities.iterator.sorted_with_order(
        map(collect_functor, collectors), MultiphaseCollector.columncount)

  # analyse collected data
  norms_combinations = [
    [c1_idx, c2_idx,
      MultiphaseCollector.results_norms(collectors[c1_idx], collectors[c2_idx],
        collectorset_description.weights), None]
    for c1_idx, c2_idx in itertools.combinations(range(len(collectors)), 2)]

  if kwargs.get('verbose', 0) >= 1:
    formatter = memberfn(format, kwargs.get('number_format', ''))
    for c1_idx, c2_idx, norms, _ in norms_combinations:
      print(collectors[c2_idx].name, collectors[c1_idx].name,
        sep=' / ', end='\n| ', file=sys.stderr)
      print(*('  '.join(map(formatter, row)) for row in norms),
        sep=' |\n| ', end=' |\n\n', file=sys.stderr)

  # find minimal combinations
  # The norms entry and the trailing ``None`` placeholder are replaced by
  # whatever get_best_schema_mapping returns for the norms.
  for norms_combination in norms_combinations: # TODO: rewrite as functional clause
    norms_combination[2:4] = get_best_schema_mapping(norms_combination[2])

  return collectors, sort_order, norms_combinations
  def __emit_itemcollector_set(self, keep):
    """Generates the ItemCollectorSet objects for the next phase.

    Without a truthy `keep`, or when ``merged_predecessors`` is not a
    RowCollector, one set is yielded per entry of the first row of
    ``self.rowset``, each holding an ItemCountCollector for the row count.
    Otherwise one set is yielded per predecessor, filled with copies of the
    predecessor's collectors selected through `keep`, together with their
    result dependencies.

    :param keep: container of collector types to carry over (may be falsy)
    """
    if not (keep and isinstance(self.merged_predecessors, RowCollector)):
      for _ in range(len(self.rowset[0])):
        fresh = ItemCollectorSet()
        fresh.add(ItemCountCollector(len(self.rowset)), True)
        yield fresh
      return

    # Presumably tests whether a collector's type is contained in `keep`.
    is_kept = composefn(type, keep.__contains__)
    for predecessor in self.merged_predecessors:
      ics = ItemCollectorSet()

      def add_copy_and_dependencies(collector, isdependency):
        # Dependencies are copied first and always flagged as dependencies.
        for dep in collector.result_dependencies:
          add_copy_and_dependencies(predecessor[dep], True)
        if isdependency is None:
          isdependency = collector.isdependency
        stored = ics.setdefault(type(collector), copy.copy(collector))
        stored.isdependency &= isdependency

      selected = filter(is_kept, predecessor.values())
      each(memberfn(add_copy_and_dependencies, None), selected)
      yield ics
示例#10
0
    def __emit_itemcollector_set(self, keep):
        """Generates the ItemCollectorSet objects for the next phase.

        Without a truthy `keep`, or when ``merged_predecessors`` is not a
        RowCollector, one set is yielded per entry of the first row of
        ``self.rowset``, each holding an ItemCountCollector for the row count.
        Otherwise one set is yielded per predecessor, filled with copies of
        the predecessor's collectors selected through `keep`, together with
        their result dependencies.

        :param keep: container of collector types to carry over (may be falsy)
        """
        if not (keep and isinstance(self.merged_predecessors, RowCollector)):
            for _ in range(len(self.rowset[0])):
                fresh = ItemCollectorSet()
                fresh.add(ItemCountCollector(len(self.rowset)), True)
                yield fresh
            return

        # Presumably tests whether a collector's type is contained in `keep`.
        is_kept = composefn(type, keep.__contains__)
        for predecessor in self.merged_predecessors:
            ics = ItemCollectorSet()

            def add_copy_and_dependencies(collector, isdependency):
                # Dependencies are copied first and always flagged as such.
                for dep in collector.result_dependencies:
                    add_copy_and_dependencies(predecessor[dep], True)
                if isdependency is None:
                    isdependency = collector.isdependency
                stored = ics.setdefault(type(collector),
                                        copy.copy(collector))
                stored.isdependency &= isdependency

            selected = filter(is_kept, predecessor.values())
            each(memberfn(add_copy_and_dependencies, None), selected)
            yield ics
示例#11
0
  def add(self, template, isdependency=None):
    """Adds an item collector and all its result_dependencies to this set,
    keyed by its type, unless a collector of that type is already present.

    :param template: collector template; its type is resolved against
      ``self.predecessor``
    :param isdependency: dependency flag for the collector; when ``None`` it
      is looked up through the set's own dependency check
    :return: the collector of that type held by this set (possibly the one
      just added), or ``None`` if no collector instance could be obtained
    """
    collector_type = template.get_type(self.predecessor)
    existing = self.get(collector_type)

    if isdependency is None:
      isdependency = self.__isdependency(collector_type, False)

    if existing is None:
      created = ItemCollector.get_instance(template, self.predecessor)
      if not isinstance(created, ItemCollector):
        assert created is None
        return None
      created.isdependency = isdependency
      # Register the dependencies before the collector itself.
      each(self.__add_dependency, created.result_dependencies)
      existing = self.setdefault(collector_type, created)

    # The flag only stays set if every request for this type set it.
    existing.isdependency &= isdependency
    return existing
示例#12
0
 def transform_all(self, rows):
     """Applies the transformer, if there is one, to every row.

     Afterwards ``set_transformed`` is called on each element of this
     collector. Nothing happens when ``get_transformer`` returns ``None``.

     :param rows: iterable of rows to transform
     """
     transformer = self.get_transformer()
     if transformer is None:
         return
     each(transformer, rows)
     mark_transformed = methodcaller('set_transformed')
     each(mark_transformed, self)
示例#13
0
 def collect_all(self, rows):
     """Collects every row, then calls ``set_collected`` on each element of this collector.

     :param rows: iterable of rows, each passed to ``self.collect``
     """
     each(self.collect, rows)
     mark_collected = methodcaller('set_collected')
     each(mark_collected, self)
示例#14
0
 def __forward_call(self, fn_name=None, *args):
   """Calls the named method on every value of this set, then on the parent class.

   :param fn_name: name of the method to forward; when ``None`` the name of
     the calling function is taken from the call stack
   :param args: positional arguments passed along with every call
   """
   if fn_name is None:
     # Entry 1 of the stack is the caller; index 3 holds its function name.
     caller_frame = inspect.stack()[1]
     fn_name = caller_frame[3]
   invoke = methodcaller(fn_name, *args)
   each(invoke, self.values())
   parent_method = getattr(super(ItemCollectorSet, self), fn_name)
   parent_method(*args)
示例#15
0
 def transform_all(self, rows):
   """Applies the transformer, if there is one, to every row.

   Afterwards ``set_transformed`` is called on each element of this
   collector. Nothing happens when ``get_transformer`` returns ``None``.

   :param rows: iterable of rows to transform
   """
   transformer = self.get_transformer()
   if transformer is None:
     return
   each(transformer, rows)
   mark_transformed = methodcaller('set_transformed')
   each(mark_transformed, self)
示例#16
0
 def collect_all(self, rows):
   """Collects every row, then calls ``set_collected`` on each element of this collector.

   :param rows: iterable of rows, each passed to ``self.collect``
   """
   each(self.collect, rows)
   mark_collected = methodcaller('set_collected')
   each(mark_collected, self)