def generate(self, block1: Block, block2: Block, output_block: Block = None):
    """
    Run canopy clustering over the keys of two blocks and copy their records
    into `output_block`.

    Every key of `block1` and `block2` is decoded into a vector, the vectors
    are clustered with `self._t1`, `self._t2` and `self._distance_metric`,
    and for each vector of each cluster the records stored under its
    re-encoded key in either input block are added to the output.

    Note that records are added under their original (re-encoded) key, not
    under a per-cluster identifier.

    Args:
        block1: First input block.
        block2: Second input block.
        output_block: Block to fill; passed through
            `BlockGenerator._generate_args_check` before use.

    Returns:
        The block returned by `_generate_args_check`, after filling.
    """
    output_block = BlockGenerator._generate_args_check(output_block)

    # Collect the decoded key of every entry, block1 first, then block2.
    vectors = [self._decode_key(encoded) for encoded, _ in block1.key_set_adapter]
    vectors.extend(
        self._decode_key(encoded) for encoded, _ in block2.key_set_adapter)

    clusters = self._run_canopy_clustering(
        vectors, self._t1, self._t2, self._distance_metric)

    for cluster in clusters:
        for vector in cluster:
            encoded = self._encode_key(vector)
            # Look the key up in block1 first, then in block2.
            for source in (block1, block2):
                members = source.get(encoded)
                if members:
                    for dataset_id, rid in members:
                        output_block.add(encoded, dataset_id, rid)
    return output_block
def generate(self, block1: Block, block2: Block, output_block: Block = None):
    """
    Copy every entry of `block1`, then every entry of `block2`, into the
    output block without changing block ids.

    Args:
        block1: First input block, iterated as
            (block_id, dataset_id, record_id) triples.
        block2: Second input block, iterated the same way.
        output_block: Block to fill; passed through the parent class'
            `_generate_args_check` before use.

    Returns:
        The block returned by `_generate_args_check`, after filling.
    """
    merged = super()._generate_args_check(output_block)
    for source in (block1, block2):
        for key, dataset_id, rid in source:
            merged.add(key, dataset_id, rid)
    return merged
def block(self, dataset, function_: Callable = None, property_: str = None,
          block: Block = None, block_black_list: BlockBlackList = None,
          base_on: Block = None):
    """
    Index records under each string token returned by `function_` or
    `property_`.

    The return of `property_` or `function_` should be list or set, and
    every element should be a string.

    Args:
        dataset: Iterable of records; also provides `id` and `get_record`.
        function_: Callable applied to a record to obtain its tokens.
        property_: Record attribute to read when `function_` is not given.
        block: Block to fill; passed through the parent class'
            `_block_args_check` before use.
        block_black_list: Optional blacklist. Tokens for which `has` is true
            are skipped, and every key that is added is passed to its `add`.
        base_on: Optional existing block. When given, only the records it
            lists for this dataset are processed, and each key becomes
            `<base block id>-<token>`.

    Returns:
        The filled block.

    Raises:
        ValueError: If the tokens are not a list/set, or an element is not
            a string.
    """
    block = super()._block_args_check(function_, property_, block)

    def accepted_tokens(record):
        # Yield the record's tokens one by one, validating lazily and
        # dropping the ones the blacklist already knows about.
        tokens = function_(record) if function_ else getattr(record, property_)
        if not isinstance(tokens, (list, set)):
            raise ValueError(
                'Return of the function or property should be a list')
        for token in tokens:
            if not isinstance(token, str):
                raise ValueError('Elements in return list should be string')
            if block_black_list and block_black_list.has(token):
                continue
            yield token

    def register(key, record):
        # Store the record under `key` and report the key to the blacklist.
        block.add(key, dataset.id, record.id)
        if block_black_list:
            block_black_list.add(key, block)

    if not base_on:
        for record in dataset:
            for token in accepted_tokens(record):
                register(token, record)
        return block

    for base_id, dataset_id, record_id in base_on:
        if dataset.id != dataset_id:
            continue
        record = dataset.get_record(record_id)
        for token in accepted_tokens(record):
            # NOTE(review): the blacklist is checked with the raw token but
            # updated with the prefixed key - confirm this is intended.
            register(base_id + '-' + token, record)
    return block
def block(self, dataset, function_: Callable = None, property_: str = None,
          block: Block = None, block_black_list: BlockBlackList = None,
          base_on: Block = None):
    """
    Index records under the encoded form of the vector returned by
    `function_` or `property_`.

    The return of `property_` or `function_` should be a vector (list).

    Args:
        dataset: Iterable of records; also provides `id` and `get_record`.
        function_: Callable applied to a record to obtain its vector.
        property_: Record attribute to read when `function_` is not given.
        block: Block to fill; passed through the parent class'
            `_block_args_check` before use.
        block_black_list: Optional blacklist. Keys for which `has` is true
            are skipped, and every key that is added is passed to its `add`.
        base_on: Optional existing block. When given, only the records it
            lists for this dataset are processed, and each key becomes
            `<base block id>-<encoded vector>`.

    Returns:
        The filled block.

    Raises:
        ValueError: If the value obtained for a record is not a list.
    """
    block = super()._block_args_check(function_, property_, block)

    if base_on:
        for block_id, dataset_id, record_id in base_on:
            if dataset.id != dataset_id:
                continue
            r = dataset.get_record(record_id)
            value = function_(r) if function_ else getattr(r, property_)
            if not isinstance(value, list):
                raise ValueError(
                    'Return of the function or property should be a vector (list)'
                )
            # Encode first, then prefix: concatenating the base block id
            # with the raw list raises TypeError (str + list).
            # NOTE(review): prefixed keys are not bare `_encode_key` output;
            # confirm that code decoding these keys copes with the prefix.
            k = block_id + '-' + self._encode_key(value)
            if block_black_list and block_black_list.has(k):
                continue
            block.add(k, dataset.id, r.id)
            if block_black_list:
                block_black_list.add(k, block)
    else:
        for r in dataset:
            value = function_(r) if function_ else getattr(r, property_)
            if not isinstance(value, list):
                raise ValueError(
                    'Return of the function or property should be a vector (list)'
                )
            k = self._encode_key(value)
            if block_black_list and block_black_list.has(k):
                continue
            block.add(k, dataset.id, r.id)
            if block_black_list:
                block_black_list.add(k, block)

    return block
def generate(self, block1: Block, block2: Block, output_block: Block = None):
    """
    Build sliding-window blocks over the sorted records of two blocks.

    All (block_id, dataset_id, record_id) entries of `block1` and `block2`
    are concatenated and sorted with `self._comparator_wrapper`. A window of
    `self.window_size` consecutive entries is then slid over the sorted
    list; the entries of the i-th window are added to the output under the
    block id `self.block_id_prefix + str(i)`.

    When there are fewer entries than `self.window_size`, the window is
    shrunk to the number of entries so they all land in one block instead
    of being dropped.

    Args:
        block1: First input block.
        block2: Second input block.
        output_block: Block to fill; passed through
            `BlockGenerator._generate_args_check` before use.

    Returns:
        The block returned by `_generate_args_check`, after filling.
    """
    output_block = BlockGenerator._generate_args_check(output_block)

    # TODO: in-memory operations here, need to update
    # concatenation
    all_records = [(block_id, ds_id, record_id)
                   for block_id, ds_id, record_id in block1]
    all_records.extend((block_id, ds_id, record_id)
                       for block_id, ds_id, record_id in block2)
    sorted_all_records = sorted(
        all_records, key=cmp_to_key(self._comparator_wrapper))

    # Clamp the window so that a short input still yields one block.
    window = min(self.window_size, len(sorted_all_records))
    if window <= 0:
        # Nothing to emit: no records, or a non-positive window size.
        return output_block

    # apply slide window
    for start in range(len(sorted_all_records) - window + 1):
        block_id = self.block_id_prefix + str(start)
        for _, ds_id, record_id in sorted_all_records[start:start + window]:
            output_block.add(block_id, ds_id, record_id)

    return output_block
def block(self, dataset, function_: Callable = None, property_: str = None,
          block: Block = None, block_black_list: BlockBlackList = None):
    """
    Index records under the string returned by `function_` or `property_`.

    The return of `property_` or `function_` should be string.

    Args:
        dataset: Iterable of records; also provides `id`.
        function_: Callable applied to a record to obtain its key.
        property_: Record attribute to read when `function_` is not given.
        block: Block to fill; passed through the parent class'
            `_block_args_check` before use.
        block_black_list: Optional blacklist. Keys for which `has` is true
            are skipped, and every key that is added is passed to its `add`.

    Returns:
        The filled block.

    Raises:
        ValueError: If the value obtained for a record is not a string.
    """
    block = super()._block_args_check(function_, property_, block)

    for r in dataset:
        value = function_(r) if function_ else getattr(r, property_)
        # Validate before the blacklist lookup so a non-string value is
        # always reported, never silently skipped or passed to `has`.
        if not isinstance(value, str):
            raise ValueError(
                'Return of the function or property should be a string')
        if block_black_list and block_black_list.has(value):
            continue
        block.add(value, dataset.id, r.id)
        if block_black_list:
            block_black_list.add(value, block)

    return block