def collect(self, item, collector_set = None):
    """Pass ``item`` to this set and to its members that are still collecting.

    The base ``ItemCollector.collect`` is invoked for the set itself first;
    afterwards ``collect(item, self)`` is dispatched to every value of this
    mapping whose ``has_collected`` attribute is falsy.

    :param item: the item to collect
    :param collector_set: must be this very object (asserted); note that the
      default of None therefore does not pass the assertion
    """
    assert collector_set is self
    ItemCollector.collect(self, item, self)

    forward_item = methodcaller('collect', item, self)
    still_collecting = filterfalse(attrgetter('has_collected'), self.values())
    each(forward_item, still_collecting)
def __get_dependency_chain(self, collectorset_description, predecessors=None):
    """
    Returns a list of phase-wise collector descriptions for a single column.

    Templates whose type (``template.get_type(predecessors)``) is None or is
    already contained in ``predecessors`` are skipped. Starting from the
    remaining templates, every ``pre_dependencies`` entry that is not in
    ``predecessors`` is moved into a following phase, until a phase has no
    further pre-dependencies. The collected phases are handed back in reverse
    order of discovery.

    Side effect: a ``TagCollector('independent', ...)`` keyed by itself is
    stored in ``predecessors`` when that is given, otherwise in the last
    discovered phase.

    :param collectorset_description: iterable
    :param predecessors: ItemCollectorSet, or None if there are none
    :return: list[dict]
    """
    # ``predecessors`` defaults to None and is explicitly tested against None
    # further down, but ``x in None`` raises TypeError. Use an empty container
    # for membership tests only, so that the None path is actually usable.
    known = () if predecessors is None else predecessors

    phase = dict(filterfalse(
        lambda item: item[0] is None or item[0] in known,
        ((template.get_type(predecessors), template)
         for template in collectorset_description)))
    independent = TagCollector('independent', frozenset(phase.keys()), True)

    phases = []
    collector_min_phases = dict()
    phase_pre_dependencies = None
    phase_result_dependencies = set()

    def must_add_dependency(dep):
        # Skip result dependencies that were already seen in this phase.
        return (dep not in phase and
                dep not in phase_result_dependencies and
                dep not in phase_pre_dependencies)

    def add_dependencies(template):
        if template.pre_dependencies:
            for dep in filterfalse(known.__contains__, template.pre_dependencies):
                phase_pre_dependencies.setdefault(dep, dep)
                collector_min_phases[dep] = len(phases) - 1
        else:
            for dep in filter(must_add_dependency, template.result_dependencies):
                add_dependencies(dep)
                phase_result_dependencies.add(dep)

    # resolve dependencies and push them to an earlier phase
    while phase:
        phase_pre_dependencies = dict()
        phase_result_dependencies.clear()
        each(add_dependencies, phase.keys())
        phases.append(phase)
        phase = phase_pre_dependencies

    # remove later duplicates
    # NOTE(review): the stop value is -(len(phases) - 1) as recorded above,
    # which becomes negative for dependencies found in the third or a later
    # round; if ``islice`` is itertools.islice that raises ValueError -- confirm.
    consume((
        each(memberfn(dict.pop, ctype, None), islice(phases, 0, -min_phase_idx))
        for ctype, min_phase_idx in collector_min_phases.items()))

    if predecessors is not None:
        predecessors[independent] = independent
    elif phases:
        phases[-1][independent] = independent

    return uiterator.filter(None, reversed(phases))
def __init__(self, collectors = (), predecessor = None):
    """Set up an empty collector set, optionally seeded from a predecessor.

    :param collectors: iterable whose elements are each passed to ``self.add``
    :param predecessor: optional mapping of collectors; when truthy, all of
      its values must report ``has_collected`` (asserted) and its entries are
      copied into this set before ``collectors`` are added
    """
    ItemCollector.__init__(self)
    collections.OrderedDict.__init__(self)
    self.predecessor = predecessor

    if predecessor:
        assert all(
            member.has_collected for member in predecessor.values())
        self.update(predecessor)

    each(self.add, collectors)
def collect(self, items):
    """Collects the data of all columns of a row.

    When a diagnostics stream is configured and the row's length differs from
    the number of column collectors, a message is printed to that stream.
    The row must have at least as many items as there are collectors
    (asserted).

    :param items: the items of one row
    """
    diagnostics = self.__stderr
    if diagnostics is not None and len(self) != len(items):
        # NOTE(review): the counter only advances for mismatching rows, so the
        # reported "Row" number counts mismatches -- confirm this is intended.
        self.__rowcount += 1
        message = 'Row {} has {} columns, expected {}: {}'.format(
            self.__rowcount, len(items), len(self), items)
        print(message, file=diagnostics)

    assert len(self) <= len(items)
    each(self.__collect_column, self, items)
def collect(self, items):
    """Collects the data of all columns of a row.

    If an error stream is set and the number of items does not match the
    number of column collectors, the mismatch is reported on that stream.
    Rows shorter than the number of collectors fail the assertion.

    :param items: the items of one row
    """
    report_to = self.__stderr
    if report_to is not None and len(self) != len(items):
        # NOTE(review): incremented for mismatching rows only, so the number
        # in the message is a mismatch count -- confirm this is intended.
        self.__rowcount += 1
        text = 'Row {} has {} columns, expected {}: {}'.format(
            self.__rowcount, len(items), len(self), items)
        print(text, file=report_to)

    assert len(self) <= len(items)
    each(self.__collect_column, self, items)
def __get_dependency_chain(self, collectorset_description, predecessors=None):
    """
    Returns a list of phase-wise collector descriptions for a single column.

    Templates whose type (``template.get_type(predecessors)``) is None or is
    already contained in ``predecessors`` are skipped. Starting from the
    remaining templates, every ``pre_dependencies`` entry that is not in
    ``predecessors`` is moved into a following phase, until a phase has no
    further pre-dependencies. The collected phases are handed back in reverse
    order of discovery.

    Side effect: a ``TagCollector('independent', ...)`` keyed by itself is
    stored in ``predecessors`` when that is given, otherwise in the last
    discovered phase.

    :param collectorset_description: iterable
    :param predecessors: ItemCollectorSet, or None if there are none
    :return: list[dict]
    """
    # ``predecessors`` defaults to None and is explicitly tested against None
    # further down, but ``x in None`` raises TypeError. Use an empty container
    # for membership tests only, so that the None path is actually usable.
    known = () if predecessors is None else predecessors

    phase = dict(filterfalse(
        lambda item: item[0] is None or item[0] in known,
        ((template.get_type(predecessors), template)
         for template in collectorset_description)))
    independent = TagCollector('independent', frozenset(phase.keys()), True)

    phases = []
    collector_min_phases = dict()
    phase_pre_dependencies = None
    phase_result_dependencies = set()

    def must_add_dependency(dep):
        # Skip result dependencies that were already seen in this phase.
        return (dep not in phase and
                dep not in phase_result_dependencies and
                dep not in phase_pre_dependencies)

    def add_dependencies(template):
        if template.pre_dependencies:
            for dep in filterfalse(known.__contains__, template.pre_dependencies):
                phase_pre_dependencies.setdefault(dep, dep)
                collector_min_phases[dep] = len(phases) - 1
        else:
            for dep in filter(must_add_dependency, template.result_dependencies):
                add_dependencies(dep)
                phase_result_dependencies.add(dep)

    # resolve dependencies and push them to an earlier phase
    while phase:
        phase_pre_dependencies = dict()
        phase_result_dependencies.clear()
        each(add_dependencies, phase.keys())
        phases.append(phase)
        phase = phase_pre_dependencies

    # remove later duplicates
    # NOTE(review): the stop value is -(len(phases) - 1) as recorded above,
    # which becomes negative for dependencies found in the third or a later
    # round; if ``islice`` is itertools.islice that raises ValueError -- confirm.
    consume((
        each(memberfn(dict.pop, ctype, None), islice(phases, 0, -min_phase_idx))
        for ctype, min_phase_idx in collector_min_phases.items()))

    if predecessors is not None:
        predecessors[independent] = independent
    elif phases:
        phases[-1][independent] = independent

    return uiterator.filter(None, reversed(phases))
def collect_analyse_match(collectors, collectorset_description, **kwargs):
    """
    Collect data for every input, compute the result norms for each pair of
    inputs and determine the best schema mapping per pair.

    :param collectors: list[io.IOBase | MultiphaseCollector]; a sequence with
      at least two elements. If the first element is a MultiphaseCollector,
      all elements must be, and they must already be sorted by
      ``MultiphaseCollector.columncount`` (both asserted).
    :param collectorset_description: object providing ``descriptions`` and
      ``weights``
    :param kwargs: forwarded to ``collect``; ``verbose`` (norms are printed to
      stderr when >= 1) and ``number_format`` are additionally read here
    :return: list[MultiphaseCollector], list[int], list[int, int, float, list[int]]
      -- the collectors, the sort order (None if the inputs were already
      MultiphaseCollector instances) and one entry per pair of collectors
    """
    # ``collections.Sequence`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc`` (available since Python 3.3).
    from collections.abc import Sequence

    assert isinstance(collectors, Sequence) and len(collectors) >= 2
    collect_functor = \
        memberfn(collect, collectorset_description.descriptions, **kwargs)

    if isinstance(collectors[0], MultiphaseCollector):
        assert all(map(memberfn(isinstance, MultiphaseCollector), collectors))
        assert utilities.iterator.issorted(collectors, MultiphaseCollector.columncount)
        sort_order = None
        each(collect_functor, collectors)
    else:
        # The first collector shall have the least columns.
        sort_order, collectors = \
            utilities.iterator.sorted_with_order(
                map(collect_functor, collectors), MultiphaseCollector.columncount)

    # analyse collected data
    # Each entry is [c1_idx, c2_idx, norms, None]; slots 2 and 3 are replaced
    # by the result of get_best_schema_mapping() below.
    norms_combinations = [
        [c1_idx, c2_idx,
         MultiphaseCollector.results_norms(collectors[c1_idx], collectors[c2_idx],
                                           collectorset_description.weights),
         None]
        for c1_idx, c2_idx in itertools.combinations(range(len(collectors)), 2)]

    if kwargs.get('verbose', 0) >= 1:
        formatter = memberfn(format, kwargs.get('number_format', ''))
        for c1_idx, c2_idx, norms, _ in norms_combinations:
            print(collectors[c2_idx].name, collectors[c1_idx].name,
                  sep=' / ', end='\n| ', file=sys.stderr)
            print(*(' '.join(map(formatter, row)) for row in norms),
                  sep=' |\n| ', end=' |\n\n', file=sys.stderr)

    # find minimal combinations
    for norms_combination in norms_combinations:  # TODO: rewrite as functional clause
        norms_combination[2:4] = get_best_schema_mapping(norms_combination[2])

    return collectors, sort_order, norms_combinations
def collect_analyse_match(collectors, collectorset_description, **kwargs):
    """
    Collect data for every input, compute the result norms for each pair of
    inputs and determine the best schema mapping per pair.

    :param collectors: list[io.IOBase | MultiphaseCollector]; a sequence with
      at least two elements. If the first element is a MultiphaseCollector,
      all elements must be, and they must already be sorted by
      ``MultiphaseCollector.columncount`` (both asserted).
    :param collectorset_description: object providing ``descriptions`` and
      ``weights``
    :param kwargs: forwarded to ``collect``; ``verbose`` (norms are printed to
      stderr when >= 1) and ``number_format`` are additionally read here
    :return: list[MultiphaseCollector], list[int], list[int, int, float, list[int]]
      -- the collectors, the sort order (None if the inputs were already
      MultiphaseCollector instances) and one entry per pair of collectors
    """
    # ``collections.Sequence`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc`` (available since Python 3.3).
    from collections.abc import Sequence

    assert isinstance(collectors, Sequence) and len(collectors) >= 2
    collect_functor = \
        memberfn(collect, collectorset_description.descriptions, **kwargs)

    if isinstance(collectors[0], MultiphaseCollector):
        assert all(map(memberfn(isinstance, MultiphaseCollector), collectors))
        assert utilities.iterator.issorted(collectors, MultiphaseCollector.columncount)
        sort_order = None
        each(collect_functor, collectors)
    else:
        # The first collector shall have the least columns.
        sort_order, collectors = \
            utilities.iterator.sorted_with_order(
                map(collect_functor, collectors), MultiphaseCollector.columncount)

    # analyse collected data
    # Each entry is [c1_idx, c2_idx, norms, None]; slots 2 and 3 are replaced
    # by the result of get_best_schema_mapping() below.
    norms_combinations = [
        [c1_idx, c2_idx,
         MultiphaseCollector.results_norms(collectors[c1_idx], collectors[c2_idx],
                                           collectorset_description.weights),
         None]
        for c1_idx, c2_idx in itertools.combinations(range(len(collectors)), 2)]

    if kwargs.get('verbose', 0) >= 1:
        formatter = memberfn(format, kwargs.get('number_format', ''))
        for c1_idx, c2_idx, norms, _ in norms_combinations:
            print(collectors[c2_idx].name, collectors[c1_idx].name,
                  sep=' / ', end='\n| ', file=sys.stderr)
            print(*(' '.join(map(formatter, row)) for row in norms),
                  sep=' |\n| ', end=' |\n\n', file=sys.stderr)

    # find minimal combinations
    for norms_combination in norms_combinations:  # TODO: rewrite as functional clause
        norms_combination[2:4] = get_best_schema_mapping(norms_combination[2])

    return collectors, sort_order, norms_combinations
def __emit_itemcollector_set(self, keep):
    """Generate ItemCollectorSet instances, one at a time.

    Without a truthy ``keep`` (or when ``self.merged_predecessors`` is not a
    RowCollector) one fresh set is yielded per element of the first row in
    ``self.rowset``, each holding an ItemCountCollector for the number of
    rows, added as a dependency.

    Otherwise one set is yielded per predecessor, filled with shallow copies
    of the predecessor's collectors accepted by ``keep`` together with their
    ``result_dependencies``.

    :param keep: container of collector types to carry over, or a falsy value
    """
    if not (keep and isinstance(self.merged_predecessors, RowCollector)):
        for _ in range(len(self.rowset[0])):
            fresh_set = ItemCollectorSet()
            fresh_set.add(ItemCountCollector(len(self.rowset)), True)
            yield fresh_set
        return

    # From here on ``keep`` is a predicate over collectors; presumably it
    # tests whether the collector's type is in the original container.
    keep = composefn(type, keep.__contains__)

    for predecessor in self.merged_predecessors:
        ics = ItemCollectorSet()

        def copy_with_dependencies(collector, isdependency):
            # Dependencies first, always flagged as dependencies.
            for dep in collector.result_dependencies:
                copy_with_dependencies(predecessor[dep], True)
            if isdependency is None:
                isdependency = collector.isdependency
            collector = ics.setdefault(type(collector), copy.copy(collector))
            collector.isdependency &= isdependency

        kept_collectors = filter(keep, predecessor.values())
        each(memberfn(copy_with_dependencies, None), kept_collectors)
        yield ics
def add(self, template, isdependency=None):
    """Adds an item collector and all its result_dependencies to this set,
    with its type as the key, unless one of the same type is in the set
    already.

    Returns the collector of the same type from this set, possibly the one
    just added, or None if no collector instance could be obtained for the
    template.
    """
    ctype = template.get_type(self.predecessor)
    entry = self.get(ctype)

    if isdependency is None:
        isdependency = self.__isdependency(ctype, False)

    if entry is None:
        created = ItemCollector.get_instance(template, self.predecessor)
        if not isinstance(created, ItemCollector):
            assert created is None
            return None
        created.isdependency = isdependency
        each(self.__add_dependency, created.result_dependencies)
        # setdefault hands back whichever collector ends up stored for ctype.
        entry = self.setdefault(ctype, created)

    entry.isdependency &= isdependency
    return entry
def transform_all(self, rows):
    """Apply this object's transformer to every row, then flag the members.

    Nothing happens when ``get_transformer()`` returns None.

    :param rows: iterable of rows handed to the transformer one by one
    """
    transformer = self.get_transformer()
    if transformer is None:
        return

    each(transformer, rows)
    # NOTE(review): set_transformed is taken to run only when a transformer
    # exists -- confirm against the original layout.
    mark_transformed = methodcaller('set_transformed')
    each(mark_transformed, self)
def collect_all(self, rows):
    """Collect every row, then call ``set_collected()`` on each member.

    :param rows: iterable of rows, each passed to ``self.collect``
    """
    each(self.collect, rows)
    mark_collected = methodcaller('set_collected')
    each(mark_collected, self)
def __forward_call(self, fn_name=None, *args):
    """Invoke a method on every member of this set and then on the parent class.

    :param fn_name: name of the method to call; defaults to the name of the
      function that called this method
    :param args: positional arguments passed along with every call
    """
    if fn_name is None:
        # Index 1 is the frame of our caller, field 3 its function name.
        # This lookup must stay directly in this method's body.
        caller_record = inspect.stack()[1]
        fn_name = caller_record[3]

    forward = methodcaller(fn_name, *args)
    each(forward, self.values())

    parent_method = getattr(super(ItemCollectorSet, self), fn_name)
    parent_method(*args)