def test_hash_block_generator():
    """Check HashBlockGenerator keyed by property, by function, and with a black list."""
    generator = HashBlockGenerator()

    def check_category_blocks(result):
        # Keys 'a' and 'b' must map to exactly these (dataset id, record id) pairs.
        for block_key, members in result.key_set_adapter:
            if block_key == 'a':
                assert members == {(ds.id, '1'), (ds.id, '2')}
            elif block_key == 'b':
                assert members == {(ds.id, '3'), (ds.id, '4'), (ds.id, '5'), (ds.id, '6')}

    # Key taken from a record attribute.
    check_category_blocks(generator.block(ds, property_='category'))

    # Key computed by a callable.
    check_category_blocks(generator.block(ds, function_=lambda r: r.category))

    # With max_size=2 the test expects only key 'a' to remain in the block
    # and only key 'b' to appear in the black list.
    black_list = BlockBlackList(max_size=2)
    limited = generator.block(ds, property_='category', block_black_list=black_list)
    for block_key, _members in limited.key_set_adapter:
        assert block_key == 'a'
    for listed_key, _ in black_list.key_set_adapter:
        assert listed_key == 'b'
def block(self, dataset, function_: Callable = None, property_: str = None,
          block: Block = None, block_black_list: BlockBlackList = None):
    """
    Add every record of `dataset` to `block` once per token produced for it.

    The return of `property_` or `function_` should be a list or set of strings.

    Args:
        dataset: Iterable of records. `dataset.id` and each record's `id` are
            stored with every token.
        function_ (Callable): Called with a record to obtain its tokens. Used
            instead of `property_` whenever it is truthy.
        property_ (str): Name of the record attribute holding the tokens.
        block (Block): Forwarded to `_block_args_check`; the object that call
            returns is the one populated and returned.
        block_black_list (BlockBlackList): Optional. Tokens for which `has()`
            is true are skipped; every token that is added is then reported
            through `add(token, block)`.

    Returns:
        The populated block.

    Raises:
        ValueError: If the tokens are not a list or set, or if any token is
            not a string.
    """
    block = super()._block_args_check(function_, property_, block)

    for r in dataset:
        value = function_(r) if function_ else getattr(r, property_)
        if not isinstance(value, (list, set)):
            raise ValueError('Return of the function or property should be a list')

        for v in value:
            # Validate before touching the black list: an invalid token must be
            # reported as such, not handed to `has()` or silently skipped.
            if not isinstance(v, str):
                raise ValueError('Elements in return list should be string')
            if block_black_list and block_black_list.has(v):
                continue
            block.add(v, dataset.id, r.id)
            if block_black_list:
                block_black_list.add(v, block)

    return block
def block(self, dataset, function_: Callable = None, property_: str = None,
          block: Block = None, block_black_list: BlockBlackList = None,
          base_on: Block = None):
    """
    Add every record of `dataset` to `block` under the key encoded from its vector.

    The return of `property_` or `function_` should be a vector (list).

    Args:
        dataset: Iterable of records. `dataset.id` and each record's `id` are
            stored with the key.
        function_ (Callable): Called with a record to obtain its vector. Used
            instead of `property_` whenever it is truthy.
        property_ (str): Name of the record attribute holding the vector.
        block (Block): Forwarded to `_block_args_check`; the object that call
            returns is the one populated and returned.
        block_black_list (BlockBlackList): Optional. Keys for which `has()` is
            true are skipped; every key that is added is then reported through
            `add(key, block)`.
        base_on (Block): Not supported; any truthy value raises.

    Returns:
        The populated block.

    Raises:
        NotImplementedError: If `base_on` is given (truthy).
        ValueError: If a record's value is not a list.
    """
    block = super()._block_args_check(function_, property_, block)

    if base_on:
        # NotImplementedError is a subclass of Exception, so callers that
        # catch Exception keep working.
        raise NotImplementedError('Canopy currently doesn\'t support `base_on`')

    for r in dataset:
        value = function_(r) if function_ else getattr(r, property_)
        if not isinstance(value, list):
            raise ValueError(
                'Return of the function or property should be a vector (list)'
            )
        k = self._encode_key(value)
        if block_black_list and block_black_list.has(k):
            continue
        block.add(k, dataset.id, r.id)
        if block_black_list:
            block_black_list.add(k, block)

    return block
def block(self, dataset, function_: Callable = None, property_: str = None,
          block: Block = None, block_black_list: BlockBlackList = None):
    """
    Add every record of `dataset` to `block` under the key encoded from its vector.

    The return of `property_` or `function_` should be a vector (list).

    Args:
        dataset: Iterable of records. `dataset.id` and each record's `id` are
            stored with the key.
        function_ (Callable): Called with a record to obtain its vector. Used
            instead of `property_` whenever it is truthy.
        property_ (str): Name of the record attribute holding the vector.
        block (Block): Forwarded to `_block_args_check`; the object that call
            returns is the one populated and returned.
        block_black_list (BlockBlackList): Optional. Keys for which `has()` is
            true are skipped; every key that is added is then reported through
            `add(key, block)`.

    Returns:
        The populated block.

    Raises:
        ValueError: If a record's value is not a list.
    """
    block = super()._block_args_check(function_, property_, block)

    for r in dataset:
        value = function_(r) if function_ else getattr(r, property_)
        # Reject bad input before it is encoded or looked up, so an invalid
        # value is never passed to `_encode_key` nor skipped via the black list.
        if not isinstance(value, list):
            raise ValueError(
                'Return of the function or property should be a vector (list)'
            )
        k = self._encode_key(value)
        if block_black_list and block_black_list.has(k):
            continue
        block.add(k, dataset.id, r.id)
        if block_black_list:
            block_black_list.add(k, block)

    return block
def test_token_block_generator():
    """Check TokenBlockGenerator on space-split names, with and without a black list."""
    generator = TokenBlockGenerator()

    def tokenize(record):
        return record.name.split(' ')

    # Without a black list: 'apple' and 'banana' must hold exactly these records.
    tokens_block = generator.block(ds, function_=tokenize)
    for token, members in tokens_block.key_set_adapter:
        if token == 'apple':
            assert members == {(ds.id, '1'), (ds.id, '3')}
        elif token == 'banana':
            assert members == {(ds.id, '2'), (ds.id, '3')}

    # With max_size=1: no remaining key may hold more than one record, and the
    # black list may only contain 'apple' or 'banana'.
    black_list = BlockBlackList(max_size=1)
    limited = generator.block(ds, function_=tokenize, block_black_list=black_list)
    for _token, members in limited.key_set_adapter:
        assert len(members) <= 1
    for listed_token, _ in black_list.key_set_adapter:
        assert listed_token in ('apple', 'banana')
def block(self, dataset, function_: Callable = None, property_: str = None,
          block: Block = None, block_black_list: BlockBlackList = None,
          base_on: Block = None):
    """
    Add records of `dataset` to `block`, keyed by a string taken from each record.

    The return of `property_` or `function_` should be string.

    When `base_on` is truthy, it is iterated as `(block_id, dataset_id,
    record_id)` triples; only triples whose `dataset_id` equals `dataset.id`
    are used, the record is fetched with `dataset.get_record(record_id)`, and
    the key becomes `block_id + '-' + value`. Otherwise every record of
    `dataset` is keyed by its own value.

    Keys for which `block_black_list.has()` is true are skipped; every key that
    is added is then reported through `block_black_list.add(key, block)`.

    Returns the populated block; raises ValueError when a value is not a string.
    """
    block = super()._block_args_check(function_, property_, block)

    def _string_value(record):
        # Read the raw key from the record and enforce the string contract.
        raw = function_(record) if function_ else getattr(record, property_)
        if not isinstance(raw, str):
            raise ValueError(
                'Return of the function or property should be a string'
            )
        return raw

    def _register(key, record):
        # Add the record under `key` unless the key is black-listed.
        if block_black_list and block_black_list.has(key):
            return
        block.add(key, dataset.id, record.id)
        if block_black_list:
            block_black_list.add(key, block)

    if not base_on:
        for record in dataset:
            _register(_string_value(record), record)
        return block

    for parent_key, source_id, record_id in base_on:
        if dataset.id != source_id:
            continue
        record = dataset.get_record(record_id)
        _register(parent_key + '-' + _string_value(record), record)
    return block
def block(self, dataset, function_: Callable = None, property_: str = None,
          block: Block = None, block_black_list: BlockBlackList = None,
          base_on: Block = None):
    """
    Add records of `dataset` to `block` once per token produced for each record.

    The return of `property_` or `function_` should be a list or set of
    strings. Any other return value is coerced with `set(...)`.
    NOTE: a bare string is therefore split into its characters.

    When `base_on` is truthy, it is iterated as `(block_id, dataset_id,
    record_id)` triples; only triples whose `dataset_id` equals `dataset.id`
    are used, the record is fetched with `dataset.get_record(record_id)`, and
    each key becomes `block_id + '-' + token`. Otherwise every record of
    `dataset` is keyed by its own tokens.

    Args:
        dataset: Iterable of records exposing `id` and `get_record`.
        function_ (Callable): Called with a record to obtain its tokens. Used
            instead of `property_` whenever it is truthy.
        property_ (str): Name of the record attribute holding the tokens.
        block (Block): Forwarded to `_block_args_check`; the object that call
            returns is the one populated and returned.
        block_black_list (BlockBlackList): Optional. Keys for which `has()` is
            true are skipped; every key that is added is then reported through
            `add(key, block)`.
        base_on (Block): Optional block whose keys prefix the new keys.

    Returns:
        The populated block.

    Raises:
        ValueError: If any token is not a string.
    """
    block = super()._block_args_check(function_, property_, block)

    def _tokens(record):
        # Fetch the record's tokens, coercing non list/set returns to a set.
        tokens = function_(record) if function_ else getattr(record, property_)
        if not isinstance(tokens, (list, set)):
            tokens = set(tokens)
        return tokens

    def _register(key, record):
        # Add the record under `key` unless the key is black-listed.
        if block_black_list and block_black_list.has(key):
            return
        block.add(key, dataset.id, record.id)
        if block_black_list:
            block_black_list.add(key, block)

    if base_on:
        for block_id, dataset_id, record_id in base_on:
            if dataset.id != dataset_id:
                continue
            r = dataset.get_record(record_id)
            for v in _tokens(r):
                if not isinstance(v, str):
                    raise ValueError('Elements in return list should be string')
                # Build the final key first so the black-list lookup uses the
                # same key that is stored in the block and the black list.
                _register(block_id + '-' + v, r)
    else:
        for r in dataset:
            for v in _tokens(r):
                if not isinstance(v, str):
                    raise ValueError('Elements in return list should be string')
                _register(v, r)

    return block