Exemplo n.º 1
0
    def sample_product(self, data_1, data_2, blocked_proportion, sample_size):
        """
        Build a sample of record pairs drawn across two datasets, store
        it on ``self.candidates`` and store ``self.transform`` of that
        sample on ``self.distances``.

        Arguments:

        data_1             -- Records of the first dataset; indexed with
                              the first element of every sampled key pair
        data_2             -- Records of the second dataset; indexed with
                              the second element of every sampled key pair
        blocked_proportion -- Fraction of sample_size requested from
                              sampling.linkBlockedSample
        sample_size        -- Total number of pairs to aim for; whatever
                              the blocked sampler did not return is
                              requested from core.randomPairsMatch
        """
        key_shift = len(data_1)

        blocked_quota = int(blocked_proportion * sample_size)
        predicate_list = list(
            self.data_model.predicates(index_predicates=False))

        deque_a = sampling.randomDeque(data_1)
        deque_b = sampling.randomDeque(data_2)

        blocked_pairs = sampling.linkBlockedSample(
            blocked_quota, predicate_list, deque_a, deque_b)

        # Top up with random pairs until the requested size is reached.
        random_quota = sample_size - len(blocked_pairs)
        drawn_pairs = core.randomPairsMatch(
            len(deque_a), len(deque_b), random_quota)

        # Second-dataset keys are shifted by len(data_1).
        # NOTE(review): presumably data_2 is keyed starting at that
        # offset -- confirm against the caller.
        random_pairs = {(left, right + key_shift)
                        for left, right in drawn_pairs}

        chosen_pairs = blocked_pairs | random_pairs
        self.candidates = [(data_1[key_1], data_2[key_2])
                           for key_1, key_2 in chosen_pairs]

        self.distances = self.transform(self.candidates)
Exemplo n.º 2
0
    def sample(self, data_1, data_2, sample_size=150000,
               blocked_proportion=.5):
        '''
        Draw a sample of record pairs, one record from each dataset,
        and initialize active learning with it by passing the frozen
        sample to self._loadSample.

        Arguments:

        data_1             -- Dictionary of records from first dataset,
                              where the keys are record_ids and the
                              values are dictionaries with the keys
                              being field names
        data_2             -- Dictionary of records from second dataset,
                              same form as data_1
        sample_size        -- Size of the sample to draw
        blocked_proportion -- Fraction of sample_size requested from
                              sampling.linkBlockedSample; the remainder
                              is requested from core.randomPairsMatch

        Raises ValueError when either dataset is empty.
        '''
        if len(data_1) == 0:
            raise ValueError(
                'Dictionary of records from first dataset is empty.')
        if len(data_2) == 0:
            raise ValueError(
                'Dictionary of records from second dataset is empty.')

        # The shorter dataset is always handled as the first one.
        if len(data_2) < len(data_1):
            data_1, data_2 = data_2, data_1

        indexed_1 = core.index(data_1)
        self.sampled_records_1 = Sample(indexed_1, 500)

        # The second dataset is indexed with an offset equal to the size
        # of the indexed first dataset.
        # NOTE(review): presumably this keeps the keys of both datasets
        # disjoint -- confirm against core.index.
        key_shift = len(indexed_1)
        indexed_2 = core.index(data_2, key_shift)
        self.sampled_records_2 = Sample(indexed_2, 500)

        blocked_quota = int(blocked_proportion * sample_size)
        predicate_list = list(
            self.data_model.predicates(index_predicates=False))

        deque_a = sampling.randomDeque(indexed_1)
        deque_b = sampling.randomDeque(indexed_2)

        blocked_pairs = sampling.linkBlockedSample(
            blocked_quota, predicate_list, deque_a, deque_b)

        # Top up with random pairs until the requested size is reached.
        random_quota = sample_size - len(blocked_pairs)
        drawn_pairs = core.randomPairsMatch(
            len(deque_a), len(deque_b), random_quota)

        # Shift second-dataset keys so they address indexed_2.
        random_pairs = {(left, right + key_shift)
                        for left, right in drawn_pairs}

        chosen_pairs = blocked_pairs | random_pairs
        record_pairs = ((indexed_1[key_1], indexed_2[key_2])
                        for key_1, key_2 in chosen_pairs)

        self._loadSample(core.freezeData(record_pairs))
Exemplo n.º 3
0
    def _sample(self: HasDataModel, data_1, data_2, blocked_proportion, sample_size) -> List[TrainingExample]:
        """
        Return a list of record pairs drawn across two datasets.

        Arguments:

        data_1             -- Records of the first dataset; indexed with
                              the first element of every sampled key pair
        data_2             -- Records of the second dataset; indexed with
                              the second element of every sampled key pair
        blocked_proportion -- Fraction of sample_size requested from
                              sampling.linkBlockedSample
        sample_size        -- Total number of pairs to aim for; whatever
                              the blocked sampler did not return is
                              requested from core.randomPairsMatch

        Returns a list of (record from data_1, record from data_2)
        tuples built from the union of both sets of key pairs.
        """
        key_shift = len(data_1)

        blocked_quota = int(blocked_proportion * sample_size)
        predicate_list = list(
            self.data_model.predicates(index_predicates=False))

        deque_a = sampling.randomDeque(data_1)
        deque_b = sampling.randomDeque(data_2)

        blocked_pairs = sampling.linkBlockedSample(
            blocked_quota, predicate_list, deque_a, deque_b)

        # Top up with random pairs until the requested size is reached.
        random_quota = sample_size - len(blocked_pairs)
        drawn_pairs = core.randomPairsMatch(
            len(deque_a), len(deque_b), random_quota)

        # Second-dataset keys are shifted by len(data_1).
        # NOTE(review): presumably data_2 is keyed starting at that
        # offset -- confirm against the caller.
        shifted_pairs = {(left, right + key_shift)
                         for left, right in drawn_pairs}

        chosen_pairs = blocked_pairs | shifted_pairs
        return [(data_1[key_1], data_2[key_2])
                for key_1, key_2 in chosen_pairs]
Exemplo n.º 4
0
Arquivo: api.py Projeto: rkiddy/dedupe
    def sample(self,
               data_1,
               data_2,
               sample_size=150000,
               blocked_proportion=.5):
        '''
        Draw a sample of record pairs, one record from each dataset,
        and initialize active learning with it by passing the frozen
        sample to self._loadSample.

        Arguments:

        data_1             -- Dictionary of records from first dataset,
                              where the keys are record_ids and the
                              values are dictionaries with the keys
                              being field names
        data_2             -- Dictionary of records from second dataset,
                              same form as data_1
        sample_size        -- Size of the sample to draw
        blocked_proportion -- Fraction of sample_size requested from
                              sampling.linkBlockedSample; the remainder
                              is requested from randomPairsMatch
        '''
        # The shorter dataset is always handled as the first one.
        if len(data_2) < len(data_1):
            data_1, data_2 = data_2, data_1

        indexed_1 = core.index(data_1)

        # The second dataset is indexed with an offset equal to the size
        # of the indexed first dataset.
        # NOTE(review): presumably this keeps the keys of both datasets
        # disjoint -- confirm against core.index.
        key_shift = len(indexed_1)
        indexed_2 = core.index(data_2, key_shift)

        blocked_quota = int(blocked_proportion * sample_size)
        simple_predicates = [
            candidate for candidate in predicateGenerator(self.data_model)
            if candidate.type == 'SimplePredicate'
        ]

        deque_a = sampling.randomDeque(indexed_1)
        deque_b = sampling.randomDeque(indexed_2)

        blocked_pairs = sampling.linkBlockedSample(
            blocked_quota, simple_predicates, deque_a, deque_b)

        # Top up with random pairs until the requested size is reached.
        random_quota = sample_size - len(blocked_pairs)
        drawn_pairs = dedupe.core.randomPairsMatch(
            len(deque_a), len(deque_b), random_quota)

        # Shift second-dataset keys so they address the offset index.
        random_pairs = {(left, right + key_shift)
                        for left, right in drawn_pairs}

        # The deques are turned back into mappings for key lookup.
        records_1 = dict(deque_a)
        records_2 = dict(deque_b)

        chosen_pairs = blocked_pairs | random_pairs
        record_pairs = ((records_1[key_1], records_2[key_2])
                        for key_1, key_2 in chosen_pairs)

        self._loadSample(core.freezeData(record_pairs))
Exemplo n.º 5
0
    def sample(self, data_1, data_2, sample_size=150000,
               blocked_proportion=.5):
        '''
        Sample pairs of records across the two datasets and start
        active learning from them: the frozen sample is handed to
        self._loadSample.

        Arguments:

        data_1             -- Dictionary of records from first dataset,
                              where the keys are record_ids and the
                              values are dictionaries with the keys
                              being field names
        data_2             -- Dictionary of records from second dataset,
                              same form as data_1
        sample_size        -- Size of the sample to draw
        blocked_proportion -- Share of sample_size that is asked of
                              sampling.linkBlockedSample; the rest is
                              asked of randomPairsMatch
        '''
        # Make sure the first dataset is never the longer one.
        if len(data_1) > len(data_2):
            data_1, data_2 = data_2, data_1

        first_index = core.index(data_1)

        # Index the second dataset starting at len(first_index).
        # NOTE(review): presumably so both key ranges do not overlap --
        # confirm against core.index.
        offset = len(first_index)
        second_index = core.index(data_2, offset)

        n_blocked = int(blocked_proportion * sample_size)
        usable_predicates = [
            item for item in predicateGenerator(self.data_model)
            if item.type == 'SimplePredicate'
        ]

        first_deque = sampling.randomDeque(first_index)
        second_deque = sampling.randomDeque(second_index)

        blocked_keys = sampling.linkBlockedSample(
            n_blocked, usable_predicates, first_deque, second_deque)

        # Whatever the blocked sampler did not deliver is drawn randomly.
        n_random = sample_size - len(blocked_keys)
        unshifted_keys = dedupe.core.randomPairsMatch(
            len(first_deque), len(second_deque), n_random)

        # Apply the second dataset's key offset to the random draw.
        random_keys = {(i, j + offset) for i, j in unshifted_keys}

        # Rebuild key -> record mappings from the deques for lookup.
        first_records = dict(first_deque)
        second_records = dict(second_deque)

        all_keys = blocked_keys | random_keys
        paired_records = ((first_records[i], second_records[j])
                          for i, j in all_keys)

        frozen_sample = core.freezeData(paired_records)
        self._loadSample(frozen_sample)
Exemplo n.º 6
0
    def sample(self, data_1, data_2, blocked_proportion, sample_size):
        """
        Return a list of record pairs drawn across two datasets.

        Arguments:

        data_1             -- Records of the first dataset; indexed with
                              the first element of every sampled key pair
        data_2             -- Records of the second dataset; indexed with
                              the second element of every sampled key pair
        blocked_proportion -- Fraction of sample_size requested from
                              sampling.linkBlockedSample
        sample_size        -- Total number of pairs to aim for; whatever
                              the blocked sampler did not return is
                              requested from core.randomPairsMatch

        Returns a list of (record from data_1, record from data_2)
        tuples built from the union of both sets of key pairs.
        """
        key_shift = len(data_1)

        blocked_quota = int(blocked_proportion * sample_size)
        # NOTE(review): predicates are read from self.distances here --
        # confirm that attribute is the intended provider.
        predicate_list = list(
            self.distances.predicates(index_predicates=False))

        deque_a = sampling.randomDeque(data_1)
        deque_b = sampling.randomDeque(data_2)

        blocked_pairs = sampling.linkBlockedSample(
            blocked_quota, predicate_list, deque_a, deque_b)

        # Top up with random pairs until the requested size is reached.
        random_quota = sample_size - len(blocked_pairs)
        drawn_pairs = core.randomPairsMatch(
            len(deque_a), len(deque_b), random_quota)

        # Second-dataset keys are shifted by len(data_1).
        # NOTE(review): presumably data_2 is keyed starting at that
        # offset -- confirm against the caller.
        shifted_pairs = {(left, right + key_shift)
                         for left, right in drawn_pairs}

        chosen_pairs = blocked_pairs | shifted_pairs
        return [(data_1[key_1], data_2[key_2])
                for key_1, key_2 in chosen_pairs]