示例#1
0
    def generate(self,
                 block1: Block,
                 block2: Block,
                 output_block: Block = None):
        """Cluster the keys of two blocks and copy their records to the output.

        Every key exposed by `key_set_adapter` of `block1` and then `block2`
        is decoded with `_decode_key`; the decoded items are clustered by
        `_run_canopy_clustering` using `self._t1`, `self._t2` and
        `self._distance_metric`. Each clustered item is then re-encoded and
        the records stored under that key in either input block are added to
        the output block.

        Args:
            block1: First input block.
            block2: Second input block.
            output_block: Passed through `_generate_args_check`; the value it
                returns is the block that gets filled.

        Returns:
            The filled output block.
        """
        output_block = BlockGenerator._generate_args_check(output_block)
        sources = (block1, block2)

        # Decode block1's keys first, then block2's, keeping duplicates.
        vectors = [
            self._decode_key(encoded)
            for source in sources
            for encoded, _ in source.key_set_adapter
        ]

        canopies = self._run_canopy_clustering(vectors, self._t1, self._t2,
                                               self._distance_metric)

        # NOTE(review): records are filed under their own re-encoded key, not
        # under a per-canopy identifier -- confirm this is intended.
        for canopy in canopies:
            for vector in canopy:
                key = self._encode_key(vector)
                for source in sources:
                    members = source.get(key)
                    if not members:
                        continue
                    for ds_id, rid in members:
                        output_block.add(key, ds_id, rid)
        return output_block
 def generate(self, block1: Block, block2: Block, output_block: Block = None):
     """Copy every entry of `block1`, then of `block2`, into the output block.

     Args:
         block1: First input block; iterated as (block id, dataset id,
             record id) triples.
         block2: Second input block, iterated the same way after `block1`.
         output_block: Passed through `_generate_args_check`; the value it
             returns is the block that gets filled.

     Returns:
         The filled output block.
     """
     output_block = super()._generate_args_check(output_block)
     for source in (block1, block2):
         for entry in source:
             block_id, ds_id, record_id = entry
             output_block.add(block_id, ds_id, record_id)
     return output_block
    def block(self,
              dataset,
              function_: Callable = None,
              property_: str = None,
              block: Block = None,
              block_black_list: BlockBlackList = None,
              base_on: Block = None):
        """
        Index the records of `dataset` under each token they produce.

        The return of `property_` or `function_` should be list or set, and
        every element of it should be a string.

        Args:
            dataset: Iterable of records; also provides `id` and
                `get_record`.
            function_: Called with a record to obtain its tokens. Takes
                precedence over `property_` when truthy.
            property_: Name of the record attribute holding the tokens.
            block: Passed through `_block_args_check`; the value it returns
                is the block that gets filled.
            block_black_list: When given, tokens for which `has` is true are
                skipped and every key added to the block is reported to it
                through `add`.
            base_on: When truthy, only records listed in it for this dataset
                are indexed, and each key becomes
                `<base block id>-<token>`.

        Returns:
            The filled block.

        Raises:
            ValueError: If the tokens are not a list or set, or an element
                is not a string.
        """
        block = super()._block_args_check(function_, property_, block)

        def tokens_of(record):
            # Fetch the raw tokens and check the container type up front.
            tokens = (function_(record)
                      if function_ else getattr(record, property_))
            if not isinstance(tokens, (list, set)):
                raise ValueError(
                    'Return of the function or property should be a list')
            return tokens

        def file_tokens(record, parent_id, prefixed):
            # Elements are validated lazily, one by one, so tokens that
            # precede an invalid element are already filed when it raises.
            for token in tokens_of(record):
                if not isinstance(token, str):
                    raise ValueError(
                        'Elements in return list should be string')
                # NOTE(review): the black list is queried with the raw token
                # but receives the (possibly prefixed) key -- confirm.
                if block_black_list and block_black_list.has(token):
                    continue
                key = parent_id + '-' + token if prefixed else token
                block.add(key, dataset.id, record.id)
                if block_black_list:
                    block_black_list.add(key, block)

        if base_on:
            for block_id, dataset_id, record_id in base_on:
                if dataset.id == dataset_id:
                    file_tokens(dataset.get_record(record_id), block_id, True)
        else:
            for r in dataset:
                file_tokens(r, None, False)

        return block
示例#4
0
    def block(self,
              dataset,
              function_: Callable = None,
              property_: str = None,
              block: Block = None,
              block_black_list: BlockBlackList = None,
              base_on: Block = None):
        """
        Index the records of `dataset` under the encoded form of their vector.

        The return of `property_` or `function_` should be a vector (list).

        Args:
            dataset: Iterable of records; also provides `id` and
                `get_record`.
            function_: Called with a record to obtain its vector. Takes
                precedence over `property_` when truthy.
            property_: Name of the record attribute holding the vector.
            block: Passed through `_block_args_check`; the value it returns
                is the block that gets filled.
            block_black_list: When given, keys for which `has` is true are
                skipped and every key added to the block is reported to it
                through `add`.
            base_on: When truthy, only records listed in it for this dataset
                are indexed, and each key becomes
                `<base block id>-<encoded vector>`.

        Returns:
            The filled block.

        Raises:
            ValueError: If the value obtained for a record is not a list.
        """
        block = super()._block_args_check(function_, property_, block)

        def vector_of(record):
            # Fetch the record's vector and reject anything that is not a list.
            vector = (function_(record)
                      if function_ else getattr(record, property_))
            if not isinstance(vector, list):
                raise ValueError(
                    'Return of the function or property should be a vector (list)'
                )
            return vector

        if base_on:
            for block_id, dataset_id, record_id in base_on:
                if dataset.id == dataset_id:
                    r = dataset.get_record(record_id)
                    value = vector_of(r)
                    # The vector is a list, so the base block id cannot be
                    # concatenated onto it directly (that raised TypeError);
                    # prefix the encoded key instead.
                    # NOTE(review): assumes `_encode_key` returns a str, and
                    # that consumers decoding these keys tolerate the prefix
                    # -- confirm.
                    k = block_id + '-' + self._encode_key(value)
                    if block_black_list and block_black_list.has(k):
                        continue
                    block.add(k, dataset.id, r.id)
                    if block_black_list:
                        block_black_list.add(k, block)

        else:
            for r in dataset:
                k = self._encode_key(vector_of(r))
                if block_black_list and block_black_list.has(k):
                    continue
                block.add(k, dataset.id, r.id)
                if block_black_list:
                    block_black_list.add(k, block)

        return block
    def generate(self, block1: Block, block2: Block, output_block: Block = None):
        """Slide a fixed-size window over the sorted entries of two blocks.

        The entries of `block1` and `block2` are gathered into one list,
        sorted with `self._comparator_wrapper`, and every run of
        `self.window_size` consecutive entries becomes one output block named
        `self.block_id_prefix` followed by the window's start index.

        When there are fewer entries than `self.window_size`, a single block
        holding all of them is produced instead of dropping them.

        Args:
            block1: First input block; iterated as (block id, dataset id,
                record id) triples.
            block2: Second input block, iterated the same way.
            output_block: Passed through `_generate_args_check`; the value it
                returns is the block that gets filled.

        Returns:
            The filled output block.
        """
        output_block = BlockGenerator._generate_args_check(output_block)

        # TODO: in-memory operations here, need to update
        # concatenation
        all_records = [(block_id, ds_id, record_id)
                       for block_id, ds_id, record_id in block1]
        all_records.extend((block_id, ds_id, record_id)
                           for block_id, ds_id, record_id in block2)
        if not all_records:
            return output_block
        sorted_all_records = sorted(all_records,
                                    key=cmp_to_key(self._comparator_wrapper))

        # Clamp the window: with fewer records than `window_size` the range
        # below would be empty and every record would be lost.
        window = min(self.window_size, len(sorted_all_records))

        # apply slide window
        for i in range(len(sorted_all_records) - window + 1):
            block_id = self.block_id_prefix + str(i)
            for j in range(window):
                _, ds_id, record_id = sorted_all_records[i + j]
                output_block.add(block_id, ds_id, record_id)

        return output_block
示例#6
0
 def block(self,
           dataset,
           function_: Callable = None,
           property_: str = None,
           block: Block = None,
           block_black_list: BlockBlackList = None):
     """
     Index every record of `dataset` under the string it produces.

     The return of `property_` or `function_` should be string.

     Args:
         dataset: Iterable of records; also provides `id`.
         function_: Called with a record to obtain its key. Takes
             precedence over `property_` when truthy.
         property_: Name of the record attribute holding the key.
         block: Passed through `_block_args_check`; the value it returns
             is the block that gets filled.
         block_black_list: When given, keys for which `has` is true are
             skipped and every key added to the block is reported to it
             through `add`.

     Returns:
         The filled block.

     Raises:
         ValueError: If the value obtained for a record is not a string.
     """
     block = super()._block_args_check(function_, property_, block)
     for r in dataset:
         value = function_(r) if function_ else getattr(r, property_)
         # Validate before consulting the black list, so an invalid value is
         # always reported as ValueError rather than reaching `has()` or
         # being skipped unchecked.
         if not isinstance(value, str):
             raise ValueError(
                 'Return of the function or property should be a string')
         if block_black_list and block_black_list.has(value):
             continue
         block.add(value, dataset.id, r.id)
         if block_black_list:
             block_black_list.add(value, block)
     return block