Exemplo n.º 1
0
    def process(self, key_and_bundles):
        """Merge pre-sorted per-bundle candidate lists into one descending list.

        Args:
            key_and_bundles: A ``(key, bundles)`` pair; the key is ignored.
                Each bundle is an iterable of candidates that is assumed to
                be sorted in ascending order (the early ``break`` below
                relies on that ordering). Bundles do not have to be lists.

        Yields:
            A single list holding the retained elements, largest first.
        """
        _, bundles = key_and_bundles
        # Elements are wrapped only when a custom comparator or key is set.
        wrap = bool(self._less_than or self._key)
        heap = []
        for bundle in bundles:
            if not heap:
                # Adopt the first non-empty bundle wholesale: an ascending
                # list already satisfies the min-heap invariant. Always build
                # a fresh list so the caller's bundle is never mutated and
                # non-list iterables (e.g. tuples) are accepted.
                if wrap:
                    heap = [
                        cy_combiners.ComparableValue(element, self._less_than,
                                                     self._key)
                        for element in bundle
                    ]
                else:
                    heap = list(bundle)
                continue

            # reversed() requires a sequence; materialize anything else.
            if not isinstance(bundle, list):
                bundle = list(bundle)
            for element in reversed(bundle):
                if wrap:
                    element = cy_combiners.ComparableValue(
                        element, self._less_than, self._key)
                if len(heap) < self._n:
                    heapq.heappush(heap, element)
                elif element < heap[0]:
                    # Because _TopPerBundle returns sorted lists, all other
                    # elements of this bundle will also be smaller.
                    break
                else:
                    heapq.heappushpop(heap, element)

        heap.sort()
        if wrap:
            # Unwrap the comparables before handing the values downstream.
            yield [wrapper.value for wrapper in reversed(heap)]
        else:
            yield heap[::-1]
Exemplo n.º 2
0
    def add_input(self, accumulator, element, *args, **kwargs):
        """Push ``element`` onto the accumulator's bounded heap.

        Args:
            accumulator: A ``(holds_comparables, heap)`` pair.
            element: The value to add.
            *args: Extra positional arguments forwarded to ``self._compare``.
            **kwargs: Extra keyword arguments forwarded to ``self._compare``.

        Returns:
            The updated ``(holds_comparables, heap)`` pair.
        """
        # The comparator is resolved once and cached on the instance so the
        # variadic expansion of args / kwargs is only paid when it is needed.
        if self._less_than is None:
            if not (args or kwargs):
                self._less_than = self._compare
            else:
                self._less_than = lambda a, b: self._compare(
                    a, b, *args, **kwargs)

        wrapped, items = accumulator
        needs_wrapping = self._less_than is not operator.lt or self._key
        if not needs_wrapping:
            # Plain ordering with no key: the heap must hold raw values.
            assert not wrapped
        elif not wrapped:
            items = [
                cy_combiners.ComparableValue(value, self._less_than,
                                             self._key) for value in items
            ]
            wrapped = True

        entry = element
        if wrapped:
            entry = cy_combiners.ComparableValue(element, self._less_than,
                                                 self._key)

        # Grow until the heap holds n entries, then evict the smallest.
        insert = (
            heapq.heappush if len(items) < self._n else heapq.heappushpop)
        insert(items, entry)
        return (wrapped, items)
Exemplo n.º 3
0
  def process(self, key_and_bundles):
    """Merge pre-sorted per-bundle candidate lists into one descending list.

    Args:
      key_and_bundles: A ``(key, bundles)`` pair; the key is ignored. Each
        bundle is an iterable of candidates assumed to be sorted in ascending
        order (the early exit in ``push`` relies on that ordering).

    Yields:
      A single list holding the retained elements, largest first.
    """
    _, bundles = key_and_bundles

    def push(hp, e):
      """Offer ``e`` to ``hp``; return True when the bundle can be skipped."""
      if len(hp) < self._n:
        heapq.heappush(hp, e)
        return False
      elif e < hp[0]:
        # Because _TopPerBundle returns sorted lists, all other elements
        # will also be smaller.
        return True
      else:
        heapq.heappushpop(hp, e)
        return False

    if self._compare or self._key:
      heapc = []  # type: List[cy_combiners.ComparableValue]
      for bundle in bundles:
        if not heapc:
          # The first non-empty bundle seeds the heap; an ascending list
          # already satisfies the min-heap invariant.
          heapc = [
              cy_combiners.ComparableValue(element, self._compare, self._key)
              for element in bundle
          ]
          continue
        # TODO(https://github.com/apache/beam/issues/21205): Remove this
        # workaround once legacy dataflow correctly handles coders with
        # combiner packing and/or is deprecated.
        if not isinstance(bundle, list):
          bundle = list(bundle)
        for element in reversed(bundle):
          if push(heapc,
                  cy_combiners.ComparableValue(element,
                                               self._compare,
                                               self._key)):
            break
      heapc.sort()
      # Unwrap the comparables before handing the values downstream.
      yield [wrapper.value for wrapper in reversed(heapc)]

    else:
      heap = []
      for bundle in bundles:
        if not heap:
          # Seed the heap with a copy of the first non-empty bundle so the
          # pushes and the final sort never rewrite the caller's input; the
          # copy also covers bundles that are not lists.
          heap = list(bundle)
          continue
        # TODO(https://github.com/apache/beam/issues/21205): Remove this
        # workaround once legacy dataflow correctly handles coders with
        # combiner packing and/or is deprecated.
        if not isinstance(bundle, list):
          bundle = list(bundle)
        for element in reversed(bundle):
          if push(heap, element):
            break
      heap.sort()
      yield heap[::-1]
Exemplo n.º 4
0
    def add_input(self, accumulator, element, *args, **kwargs):
        """Add ``element`` to the bounded heap, replacing same-identity entries.

        Before inserting, every heap entry whose first item equals
        ``element[0]`` is removed, so the incoming element supersedes any
        earlier entry with the same identity.

        Args:
            accumulator: A ``(holds_comparables, heap)`` pair.
            element: The value to add; it must be indexable, with the
                identity used for de-duplication at position 0.
            *args: Extra positional arguments forwarded to ``self._compare``.
            **kwargs: Extra keyword arguments forwarded to ``self._compare``.

        Returns:
            The updated ``(holds_comparables, heap)`` pair.
        """
        # Caching to avoid paying the price of variadic expansion of args /
        # kwargs when it's not needed (for the 'if' case below).
        if self._less_than is None:
            if args or kwargs:
                self._less_than = lambda a, b: self._compare(
                    a, b, *args, **kwargs)
            else:
                self._less_than = self._compare

        holds_comparables, heap = accumulator
        if self._less_than is not operator.lt or self._key:
            heap = self._hydrated_heap(heap)
            holds_comparables = True
        else:
            assert not holds_comparables

        # Drop entries that share the incoming element's identity. Heap
        # entries are wrappers only when holds_comparables is set; otherwise
        # they are the raw elements and have no ``.value`` attribute. The
        # survivors are collected first so the heap is never modified while
        # it is being iterated.
        incoming_id = element[0]
        if holds_comparables:
            survivors = [
                item for item in heap if not (incoming_id == item.value[0])
            ]
        else:
            survivors = [
                item for item in heap if not (incoming_id == item[0])
            ]
        if len(survivors) != len(heap):
            heap[:] = survivors
            heapq.heapify(heap)

        comparable = (cy_combiners.ComparableValue(element, self._less_than,
                                                   self._key)
                      if holds_comparables else element)

        # Grow until the heap holds n entries, then evict the smallest.
        if len(heap) < self._n:
            heapq.heappush(heap, comparable)
        else:
            heapq.heappushpop(heap, comparable)
        return (holds_comparables, heap)
Exemplo n.º 5
0
    def merge_accumulators(self, accumulators, *args, **kwargs):
        """Fold several ``(holds_comparables, heap)`` accumulators into one.

        The first accumulator's heap becomes the result; every entry of the
        remaining heaps is fed through ``self.add_input``.

        Args:
            accumulators: Iterable of ``(holds_comparables, heap)`` pairs.
            *args: Extra positional arguments forwarded to ``self._compare``
                and ``self.add_input``.
            **kwargs: Extra keyword arguments forwarded likewise.

        Returns:
            The merged ``(holds_comparables, heap)`` pair.
        """
        if not (args or kwargs):
            self._less_than = self._compare
            add_input = self.add_input
        else:
            self._less_than = lambda a, b: self._compare(a, b, *args, **kwargs)
            add_input = lambda accumulator, element: self.add_input(
                accumulator, element, *args, **kwargs)

        merged = None
        wrapped = None
        for wrapped, items in accumulators:
            if self._less_than is not operator.lt or self._key:
                if not wrapped:
                    items = [
                        cy_combiners.ComparableValue(value, self._less_than,
                                                     self._key)
                        for value in items
                    ]
                    wrapped = True
            else:
                # Plain ordering with no key: heaps must hold raw values.
                assert not wrapped

            if merged is None:
                merged = items
                continue

            for entry in items:
                # add_input expects the raw value, not the wrapper.
                payload = entry.value if wrapped else entry
                _, merged = add_input((wrapped, merged), payload)

        assert merged is not None and wrapped is not None
        return (wrapped, merged)
Exemplo n.º 6
0
 def process(self, element):
   """Record ``element`` in this instance's bounded heap ``self._heap``.

   The element is wrapped in a ``ComparableValue`` when a comparator or key
   is configured. Once the heap holds ``self._n`` entries, each insertion
   also evicts the smallest entry.
   """
   needs_wrapper = self._compare or self._key
   item = (
       cy_combiners.ComparableValue(element, self._compare, self._key)
       if needs_wrapper else element)
   insert = heapq.heappush if len(self._heap) < self._n else heapq.heappushpop
   insert(self._heap, item)
Exemplo n.º 7
0
    def process(self, key_and_bundles):
        """Merge pre-sorted per-bundle candidate lists into one descending list.

        Args:
            key_and_bundles: A ``(key, bundles)`` pair; the key is ignored.
                Each bundle is an iterable of candidates assumed to be sorted
                in ascending order (the early exit in ``push`` relies on that
                ordering). Bundles do not have to be lists.

        Yields:
            A single list holding the retained elements, largest first.
        """
        _, bundles = key_and_bundles

        def push(hp, e):
            """Offer ``e`` to ``hp``; return True if the bundle can be skipped."""
            if len(hp) < self._n:
                heapq.heappush(hp, e)
                return False
            elif e < hp[0]:
                # Because _TopPerBundle returns sorted lists, all other
                # elements will also be smaller.
                return True
            else:
                heapq.heappushpop(hp, e)
                return False

        if self._compare or self._key:
            heapc = []  # type: List[cy_combiners.ComparableValue]
            for bundle in bundles:
                if not heapc:
                    # The first non-empty bundle seeds the heap; an ascending
                    # list already satisfies the min-heap invariant.
                    heapc = [
                        cy_combiners.ComparableValue(element, self._compare,
                                                     self._key)
                        for element in bundle
                    ]
                    continue
                # reversed() requires a sequence; materialize anything else.
                if not isinstance(bundle, list):
                    bundle = list(bundle)
                for element in reversed(bundle):
                    if push(
                            heapc,
                            cy_combiners.ComparableValue(
                                element, self._compare, self._key)):
                        break
            heapc.sort()
            # Unwrap the comparables before handing the values downstream.
            yield [wrapper.value for wrapper in reversed(heapc)]

        else:
            heap = []
            for bundle in bundles:
                if not heap:
                    # Seed the heap with a copy of the first non-empty bundle
                    # so the caller's input is never mutated and non-list
                    # bundles (e.g. tuples) are accepted.
                    heap = list(bundle)
                    continue
                # reversed() requires a sequence; materialize anything else.
                if not isinstance(bundle, list):
                    bundle = list(bundle)
                for element in reversed(bundle):
                    if push(heap, element):
                        break
            heap.sort()
            yield heap[::-1]
Exemplo n.º 8
0
    def add_input(self, accumulator, element, *args, **kwargs):
        """Push ``element`` onto the accumulator's bounded heap.

        Args:
            accumulator: A ``(holds_comparables, heap)`` pair.
            element: The value to add.
            *args: Accepted but not used by this method.
            **kwargs: Accepted but not used by this method.

        Returns:
            The updated ``(holds_comparables, heap)`` pair.
        """
        wrapped, items = accumulator
        if self._compare is operator.lt and not self._key:
            # Plain ordering with no key: the heap must hold raw values.
            assert not wrapped
        else:
            items = self._hydrated_heap(items)
            wrapped = True

        entry = element
        if wrapped:
            entry = cy_combiners.ComparableValue(element, self._compare,
                                                 self._key)

        # Grow until the heap holds n entries, then evict the smallest.
        insert = (
            heapq.heappush if len(items) < self._n else heapq.heappushpop)
        insert(items, entry)
        return (wrapped, items)
Exemplo n.º 9
0
 def _hydrated_heap(self, heap):
     """Return ``heap`` with every entry as a ready-to-compare wrapper.

     Whether the heap is already wrapped is decided from its first entry:
     an empty heap is returned unchanged, raw values are wrapped into a new
     list of ``ComparableValue`` objects, and wrappers that report
     ``requires_hydration`` are hydrated in place with this instance's
     comparator and key.
     """
     if not heap:
         return heap

     head = heap[0]
     if not isinstance(head, cy_combiners.ComparableValue):
         # Raw values: build a new list of wrappers.
         return [
             cy_combiners.ComparableValue(element, self._compare,
                                          self._key) for element in heap
         ]

     if head.requires_hydration:
         for wrapper in heap:
             assert wrapper.requires_hydration
             wrapper.hydrate(self._compare, self._key)
             assert not wrapper.requires_hydration
     return heap
Exemplo n.º 10
0
    def extract_output(self, accumulator, *args, **kwargs):
        """Return the accumulated values as a list sorted largest first.

        Args:
            accumulator: A ``(holds_comparables, heap)`` pair.
            *args: Extra positional arguments forwarded to ``self._compare``.
            **kwargs: Extra keyword arguments forwarded to ``self._compare``.

        Returns:
            A new list of the raw (unwrapped) values in descending order.
        """
        self._less_than = (
            (lambda a, b: self._compare(a, b, *args, **kwargs))
            if args or kwargs else self._compare)

        wrapped, items = accumulator
        if self._less_than is operator.lt and not self._key:
            # Plain ordering with no key: the heap must hold raw values.
            assert not wrapped
        elif not wrapped:
            items = [
                cy_combiners.ComparableValue(value, self._less_than,
                                             self._key) for value in items
            ]
            wrapped = True

        assert len(items) <= self._n
        items.sort(reverse=True)
        if wrapped:
            return [entry.value for entry in items]
        return list(items)