示例#1
0
def test_hash_block_generator():
    """Exercise HashBlockGenerator by property, by function and with a black list.

    Blocking on the `category` attribute and blocking through an equivalent
    function are checked against the same expected membership for keys
    'a' and 'b'. With a black list of `max_size=2`, the resulting block may
    only contain key 'a' and the black list may only contain key 'b'.
    """
    generator = HashBlockGenerator()

    # Expected (dataset id, record id) members for the keys this test inspects;
    # any other key is ignored.
    expected = {
        'a': {(ds.id, '1'), (ds.id, '2')},
        'b': {(ds.id, '3'), (ds.id, '4'), (ds.id, '5'), (ds.id, '6')},
    }

    # The same grouping must come out whether the key is named or computed.
    for key_source in ({'property_': 'category'},
                       {'function_': lambda record: record.category}):
        result = generator.block(ds, **key_source)
        for key, members in result.key_set_adapter:
            if key in expected:
                assert members == expected[key]

    black_list = BlockBlackList(max_size=2)
    result = generator.block(ds,
                             property_='category',
                             block_black_list=black_list)
    for key, _ in result.key_set_adapter:
        assert key == 'a'
    for key, _ in black_list.key_set_adapter:
        assert key == 'b'
 def block(self, dataset, function_: Callable = None, property_: str = None,
           block: Block = None, block_black_list: BlockBlackList = None):
     """
     Add every record of `dataset` to `block` once per token it yields.

     The return of `property_` or `function_` should be list or set, and
     every element of it must be a string.

     Args:
         dataset: Iterable of records; must expose `id`, and each record
             must expose `id` as well.
         function_: Optional callable taking a record and returning its
             tokens. Used instead of `property_` when given.
         property_: Name of the record attribute holding the tokens; read
             with `getattr` when `function_` is not given.
         block: Forwarded to `_block_args_check`.
         block_black_list: Optional black list. Tokens it already holds are
             skipped, and every added token is reported to it.

     Returns:
         The block obtained from `_block_args_check`, after filling.

     Raises:
         ValueError: If the tokens are not a list or set, or if any token
             is not a string.
     """
     # NOTE(review): presumably validates the arguments and supplies the
     # Block to fill -- confirm against the base class.
     block = super()._block_args_check(function_, property_, block)
     for r in dataset:
         value = function_(r) if function_ else getattr(r, property_)
         if not isinstance(value, (list, set)):
             raise ValueError('Return of the function or property should be a list')
         for v in value:
             # Validate before touching the black list so a malformed token
             # is always reported as ValueError and never reaches `has`.
             if not isinstance(v, str):
                 raise ValueError('Elements in return list should be string')
             if block_black_list and block_black_list.has(v):
                 continue
             block.add(v, dataset.id, r.id)
             if block_black_list:
                 block_black_list.add(v, block)
     return block
    def block(self,
              dataset,
              function_: Callable = None,
              property_: str = None,
              block: Block = None,
              block_black_list: BlockBlackList = None,
              base_on: Block = None):
        """
        Add every record of `dataset` to `block` under its encoded vector key.

        The return of `property_` or `function_` should be a vector (list).

        Args:
            dataset: Iterable of records; must expose `id`, and each record
                must expose `id` as well.
            function_: Optional callable taking a record and returning its
                vector. Used instead of `property_` when given.
            property_: Name of the record attribute holding the vector; read
                with `getattr` when `function_` is not given.
            block: Forwarded to `_block_args_check`.
            block_black_list: Optional black list. Keys it already holds are
                skipped, and every added key is reported to it.
            base_on: Not supported; must be left empty.

        Returns:
            The block obtained from `_block_args_check`, after filling.

        Raises:
            NotImplementedError: If `base_on` is given.
            ValueError: If a record's value is not a list.
        """
        block = super()._block_args_check(function_, property_, block)

        if base_on:
            # NotImplementedError is still an Exception, so callers that
            # caught the previous generic Exception keep working.
            raise NotImplementedError(
                'Canopy currently doesn\'t support `base_on`')

        for r in dataset:
            value = function_(r) if function_ else getattr(r, property_)
            if not isinstance(value, list):
                raise ValueError(
                    'Return of the function or property should be a vector (list)'
                )
            k = self._encode_key(value)
            if block_black_list and block_black_list.has(k):
                continue
            block.add(k, dataset.id, r.id)
            if block_black_list:
                block_black_list.add(k, block)

        return block
示例#4
0
 def block(self,
           dataset,
           function_: Callable = None,
           property_: str = None,
           block: Block = None,
           block_black_list: BlockBlackList = None):
     """
     Add every record of `dataset` to `block` under its encoded vector key.

     The return of `property_` or `function_` should be a vector (list).

     Args:
         dataset: Iterable of records; must expose `id`, and each record
             must expose `id` as well.
         function_: Optional callable taking a record and returning its
             vector. Used instead of `property_` when given.
         property_: Name of the record attribute holding the vector; read
             with `getattr` when `function_` is not given.
         block: Forwarded to `_block_args_check`.
         block_black_list: Optional black list. Keys it already holds are
             skipped, and every added key is reported to it.

     Returns:
         The block obtained from `_block_args_check`, after filling.

     Raises:
         ValueError: If a record's value is not a list.
     """
     block = super()._block_args_check(function_, property_, block)
     for r in dataset:
         value = function_(r) if function_ else getattr(r, property_)
         # Validate before encoding: `_encode_key` must only ever see a list,
         # and a malformed value must not be hidden by a black-list skip.
         if not isinstance(value, list):
             raise ValueError(
                 'Return of the function or property should be a vector (list)'
             )
         k = self._encode_key(value)
         if block_black_list and block_black_list.has(k):
             continue
         block.add(k, dataset.id, r.id)
         if block_black_list:
             block_black_list.add(k, block)
     return block
示例#5
0
def test_token_block_generator():
    """Exercise TokenBlockGenerator with and without a black list.

    Records are tokenized by splitting `name` on single spaces. Without a
    black list, the 'apple' and 'banana' blocks are checked against their
    expected members. With a black list of `max_size=1`, no block may hold
    more than one member and only 'apple' or 'banana' may be black-listed.
    """
    def tokenize(record):
        return record.name.split(' ')

    generator = TokenBlockGenerator()

    # Expected (dataset id, record id) members for the keys this test inspects;
    # any other key is ignored.
    expected = {
        'apple': {(ds.id, '1'), (ds.id, '3')},
        'banana': {(ds.id, '2'), (ds.id, '3')},
    }
    result = generator.block(ds, function_=tokenize)
    for key, members in result.key_set_adapter:
        if key in expected:
            assert members == expected[key]

    black_list = BlockBlackList(max_size=1)
    result = generator.block(ds,
                             function_=tokenize,
                             block_black_list=black_list)
    for _, members in result.key_set_adapter:
        assert len(members) <= 1
    for key, _ in black_list.key_set_adapter:
        assert key in ('apple', 'banana')
    def block(self,
              dataset,
              function_: Callable = None,
              property_: str = None,
              block: Block = None,
              block_black_list: BlockBlackList = None,
              base_on: Block = None):
        """
        Add records of `dataset` to `block`, keyed by a string value.

        The return of `property_` or `function_` should be string.

        Without `base_on`, every record of `dataset` is keyed by its value.
        With `base_on`, only the entries of `base_on` whose dataset id equals
        `dataset.id` are used; each record is fetched with
        `dataset.get_record` and keyed by `<block_id>-<value>`.

        Keys already held by `block_black_list` are skipped, and every added
        key is reported to it.

        Raises:
            ValueError: If a record's value is not a string.
        """
        block = super()._block_args_check(function_, property_, block)

        refine = bool(base_on)
        if refine:
            # Lazily resolve only the entries that belong to this dataset.
            candidates = (
                (parent_id, dataset.get_record(record_id))
                for parent_id, dataset_id, record_id in base_on
                if dataset.id == dataset_id
            )
        else:
            candidates = ((None, record) for record in dataset)

        for parent_id, record in candidates:
            if function_:
                key = function_(record)
            else:
                key = getattr(record, property_)
            if not isinstance(key, str):
                raise ValueError(
                    'Return of the function or property should be a string'
                )
            if refine:
                # Namespace the key by the block it refines.
                key = parent_id + '-' + key
            if block_black_list and block_black_list.has(key):
                continue
            block.add(key, dataset.id, record.id)
            if block_black_list:
                block_black_list.add(key, block)

        return block
    def block(self, dataset, function_: Callable = None, property_: str = None,
              block: Block = None, block_black_list: BlockBlackList = None, base_on: Block = None):
        """
        Add records of `dataset` to `block` once per string token they yield.

        The return of `property_` or `function_` should be a vector (list).
        A set is accepted as is; any other value is converted with `set()`.
        NOTE(review): a bare string would therefore be split into single
        characters -- confirm whether that is intended.

        Without `base_on`, every record of `dataset` is keyed by each token.
        With `base_on`, only the entries of `base_on` whose dataset id equals
        `dataset.id` are used; each record is fetched with
        `dataset.get_record` and keyed by `<block_id>-<token>`.

        Args:
            dataset: Iterable of records exposing `id` and `get_record`;
                each record must expose `id`.
            function_: Optional callable taking a record and returning its
                tokens. Used instead of `property_` when given.
            property_: Name of the record attribute holding the tokens; read
                with `getattr` when `function_` is not given.
            block: Forwarded to `_block_args_check`.
            block_black_list: Optional black list. Keys it already holds are
                skipped, and every added key is reported to it.
            base_on: Optional iterable of `(block_id, dataset_id, record_id)`
                entries to refine.

        Returns:
            The block obtained from `_block_args_check`, after filling.

        Raises:
            ValueError: If any token is not a string.
        """
        block = super()._block_args_check(function_, property_, block)

        if base_on:
            # Lazily resolve only the entries that belong to this dataset and
            # remember the prefix that namespaces their keys.
            sources = (
                (block_id + '-', dataset.get_record(record_id))
                for block_id, dataset_id, record_id in base_on
                if dataset.id == dataset_id
            )
        else:
            sources = (('', r) for r in dataset)

        for prefix, r in sources:
            value = function_(r) if function_ else getattr(r, property_)
            if not isinstance(value, (list, set)):
                value = set(value)
            for v in value:
                if not isinstance(v, str):
                    raise ValueError('Elements in return list should be string')
                # Query the black list with the same key that is stored in
                # the block and reported back to the black list.
                key = prefix + v
                if block_black_list and block_black_list.has(key):
                    continue
                block.add(key, dataset.id, r.id)
                if block_black_list:
                    block_black_list.add(key, block)

        return block