Python randomDeque 예제들, dedupe.sampling.randomDeque Python 예제들

예제 #1

0

파일 보기

파일: labeler.py 프로젝트: pombredanne/dedupe

    def sample_product(self, data_1, data_2, blocked_proportion, sample_size):
        '''
        Sample record pairs across two datasets, store them on
        ``self.candidates`` and store ``self.transform`` of them on
        ``self.distances``

        Arguments:

        data_1             -- First dataset; looked up with the first key
                              of every sampled pair
        data_2             -- Second dataset; looked up with the second
                              key of every sampled pair.  Random keys are
                              shifted by len(data_1), so data_2 is
                              presumably keyed from that offset --
                              TODO confirm against the caller
        blocked_proportion -- Fraction of sample_size requested from
                              sampling.linkBlockedSample
        sample_size        -- Target number of pairs; whatever the blocked
                              sample does not supply is requested from
                              core.randomPairsMatch
        '''
        shift = len(data_1)

        n_blocked = int(blocked_proportion * sample_size)
        predicate_list = list(
            self.data_model.predicates(index_predicates=False))

        queue_1 = sampling.randomDeque(data_1)
        queue_2 = sampling.randomDeque(data_2)

        blocked_keys = sampling.linkBlockedSample(
            n_blocked, predicate_list, queue_1, queue_2)

        n_random = sample_size - len(blocked_keys)
        random_keys = core.randomPairsMatch(
            len(queue_1), len(queue_2), n_random)

        # Move the second element of every random pair into the key
        # range used to look up data_2.
        shifted_keys = {(left, right + shift) for left, right in random_keys}

        pair_keys = blocked_keys | shifted_keys
        self.candidates = [(data_1[key_1], data_2[key_2])
                           for key_1, key_2 in pair_keys]

        self.distances = self.transform(self.candidates)

예제 #2

0

파일 보기

파일: api.py 프로젝트: ejokeeffe/dedupe

    def sample(self, data_1, data_2, sample_size=150000,
               blocked_proportion=.5):
        '''
        Sample pairs of records, one from each dataset, and hand the
        sample to ``self._loadSample``

        If data_1 has more records than data_2 the two are swapped
        before anything else is done with them.

        Arguments:

        data_1             -- Dictionary of records from first dataset,
                              where the keys are record_ids and the
                              values are dictionaries with the keys
                              being field names
        data_2             -- Dictionary of records from second dataset,
                              same form as data_1

        sample_size        -- Size of the sample to draw
        blocked_proportion -- Fraction of sample_size requested from
                              sampling.linkBlockedSample; the rest is
                              requested from core.randomPairsMatch

        Raises ValueError when either dataset is empty.
        '''
        if len(data_1) == 0:
            raise ValueError(
                'Dictionary of records from first dataset is empty.')
        if len(data_2) == 0:
            raise ValueError(
                'Dictionary of records from second dataset is empty.')

        if len(data_1) > len(data_2):
            data_1, data_2 = data_2, data_1

        data_1 = core.index(data_1)
        self.sampled_records_1 = Sample(data_1, 500)

        # data_2 is indexed starting at the size of the indexed data_1.
        shift = len(data_1)
        data_2 = core.index(data_2, shift)
        self.sampled_records_2 = Sample(data_2, 500)

        n_blocked = int(blocked_proportion * sample_size)
        predicate_list = list(
            self.data_model.predicates(index_predicates=False))

        queue_1 = sampling.randomDeque(data_1)
        queue_2 = sampling.randomDeque(data_2)

        blocked_keys = sampling.linkBlockedSample(
            n_blocked, predicate_list, queue_1, queue_2)

        n_random = sample_size - len(blocked_keys)
        random_keys = core.randomPairsMatch(
            len(queue_1), len(queue_2), n_random)

        # Shift the second element of every random pair by the same
        # offset that was used to index data_2.
        shifted_keys = {(left, right + shift) for left, right in random_keys}

        pair_keys = blocked_keys | shifted_keys
        # Kept as a generator: the pairs are only built while
        # core.freezeData consumes them.
        record_pairs = ((data_1[key_1], data_2[key_2])
                        for key_1, key_2 in pair_keys)

        self._loadSample(core.freezeData(record_pairs))

예제 #3

0

파일 보기

    def _sample(self: HasDataModel, data_1, data_2, blocked_proportion, sample_size) -> List[TrainingExample]:
        '''
        Return a list of record pairs, one record from each dataset

        Arguments:

        data_1             -- First dataset; looked up with the first key
                              of every sampled pair
        data_2             -- Second dataset; looked up with the second
                              key of every sampled pair.  Random keys are
                              shifted by len(data_1), so data_2 is
                              presumably keyed from that offset --
                              TODO confirm against the caller
        blocked_proportion -- Fraction of sample_size requested from
                              sampling.linkBlockedSample
        sample_size        -- Target number of pairs; the remainder after
                              the blocked sample is requested from
                              core.randomPairsMatch
        '''
        shift = len(data_1)

        n_blocked = int(blocked_proportion * sample_size)
        predicate_list = list(
            self.data_model.predicates(index_predicates=False))

        queue_1 = sampling.randomDeque(data_1)
        queue_2 = sampling.randomDeque(data_2)

        blocked_keys = sampling.linkBlockedSample(
            n_blocked, predicate_list, queue_1, queue_2)

        n_random = sample_size - len(blocked_keys)
        random_keys = core.randomPairsMatch(
            len(queue_1), len(queue_2), n_random)

        # The set also collapses any repeated random pairs.
        shifted_keys = {(left, right + shift) for left, right in random_keys}

        pair_keys = blocked_keys | shifted_keys
        return [(data_1[key_1], data_2[key_2]) for key_1, key_2 in pair_keys]

예제 #4

0

파일 보기

파일: api.py 프로젝트: rkiddy/dedupe

    def sample(self,
               data_1,
               data_2,
               sample_size=150000,
               blocked_proportion=.5):
        '''
        Sample pairs of records, one from each dataset, and hand the
        sample to ``self._loadSample``

        If data_1 has more records than data_2 the two are swapped
        before anything else is done with them.

        Arguments:

        data_1             -- Dictionary of records from first dataset,
                              where the keys are record_ids and the
                              values are dictionaries with the keys
                              being field names
        data_2             -- Dictionary of records from second dataset,
                              same form as data_1

        sample_size        -- Size of the sample to draw
        blocked_proportion -- Fraction of sample_size requested from
                              sampling.linkBlockedSample; the rest is
                              requested from dedupe.core.randomPairsMatch
        '''
        if len(data_1) > len(data_2):
            data_1, data_2 = data_2, data_1

        data_1 = core.index(data_1)

        # data_2 is indexed starting at the size of the indexed data_1.
        shift = len(data_1)
        data_2 = core.index(data_2, shift)

        n_blocked = int(blocked_proportion * sample_size)
        # Only predicates whose type is 'SimplePredicate' are used.
        predicate_list = [
            candidate for candidate in predicateGenerator(self.data_model)
            if candidate.type == 'SimplePredicate'
        ]

        data_1 = sampling.randomDeque(data_1)
        data_2 = sampling.randomDeque(data_2)

        blocked_keys = sampling.linkBlockedSample(
            n_blocked, predicate_list, data_1, data_2)

        n_random = sample_size - len(blocked_keys)
        random_keys = dedupe.core.randomPairsMatch(
            len(data_1), len(data_2), n_random)

        # Shift the second element of every random pair by the same
        # offset that was used to index data_2.
        shifted_keys = {(left, right + shift) for left, right in random_keys}

        # Back to dictionaries so records can be looked up by key.
        data_1 = dict(data_1)
        data_2 = dict(data_2)

        pair_keys = blocked_keys | shifted_keys
        # Kept as a generator: the pairs are only built while
        # core.freezeData consumes them.
        record_pairs = ((data_1[key_1], data_2[key_2])
                        for key_1, key_2 in pair_keys)

        self._loadSample(core.freezeData(record_pairs))

예제 #5

0

파일 보기

파일: api.py 프로젝트: lazymike/dedupe

    def sample(self, data_1, data_2, sample_size=150000, 
               blocked_proportion=.5) :
        '''
        Build a sample of record pairs spanning both datasets and pass
        it to ``self._loadSample``

        The smaller dataset is always treated as the first one: when
        data_1 is longer than data_2 the arguments are exchanged.

        Arguments:

        data_1             -- Dictionary of records from first dataset,
                              where the keys are record_ids and the
                              values are dictionaries with the keys
                              being field names
        data_2             -- Dictionary of records from second dataset,
                              same form as data_1

        sample_size        -- Size of the sample to draw
        blocked_proportion -- Share of sample_size asked of
                              sampling.linkBlockedSample; the remainder
                              is asked of dedupe.core.randomPairsMatch
        '''
        if len(data_1) > len(data_2):
            data_1, data_2 = data_2, data_1

        data_1 = core.index(data_1)

        # The second dataset is indexed from where the first one ends.
        key_offset = len(data_1)
        data_2 = core.index(data_2, key_offset)

        blocked_target = int(blocked_proportion * sample_size)
        # Restrict to predicates whose type is 'SimplePredicate'.
        usable_predicates = [
            p for p in predicateGenerator(self.data_model)
            if p.type == 'SimplePredicate'
        ]

        data_1 = sampling.randomDeque(data_1)
        data_2 = sampling.randomDeque(data_2)

        keys_from_blocks = sampling.linkBlockedSample(
            blocked_target, usable_predicates, data_1, data_2)

        random_target = sample_size - len(keys_from_blocks)
        raw_random_keys = dedupe.core.randomPairsMatch(
            len(data_1), len(data_2), random_target)

        # Apply the indexing offset to the second key of each pair.
        keys_from_random = {(k1, k2 + key_offset)
                            for k1, k2 in raw_random_keys}

        # Turn both back into dictionaries for lookup by key.
        data_1 = dict(data_1)
        data_2 = dict(data_2)

        all_keys = keys_from_blocks | keys_from_random
        # A generator, so pairs are materialised only when
        # core.freezeData iterates over them.
        pairs = ((data_1[k1], data_2[k2]) for k1, k2 in all_keys)

        frozen_pairs = core.freezeData(pairs)

        self._loadSample(frozen_pairs)

예제 #6

0

파일 보기

    def sample(self, data, sample_size=15000, blocked_proportion=0.5):
        '''Sample pairs of records from a single dataset and hand the
        sample to ``self._loadSample``

        The pairs come from two sources: sampling.dedupeBlockedSample
        and core.randomPairs.  A ``Sample(data, 900)`` of the indexed
        records is also kept on ``self.sampled_records``.

        Arguments: data -- Dictionary of records, where the keys are
        record_ids and the values are dictionaries with the keys being
        field names

        sample_size         -- Size of the sample to draw
        blocked_proportion  -- Proportion of the sample that will be blocked
        '''
        data = core.index(data)
        self.sampled_records = Sample(data, 900)

        n_blocked = int(blocked_proportion * sample_size)
        predicate_list = list(
            self.data_model.predicates(index_predicates=False))

        data = sampling.randomDeque(data)
        blocked_keys = sampling.dedupeBlockedSample(
            n_blocked, predicate_list, data)

        # Whatever the blocked sample did not supply is drawn at random.
        n_random = sample_size - len(blocked_keys)
        random_keys = set(core.randomPairs(len(data), n_random))

        # Back to a dictionary so records can be looked up by key.
        data = dict(data)

        pair_keys = blocked_keys | random_keys
        record_pairs = [(data[key_1], data[key_2])
                        for key_1, key_2 in pair_keys]

        self._loadSample(core.freezeData(record_pairs))

예제 #7

0

파일 보기

파일: api.py 프로젝트: MarvinFiveMaples/dedupe

    def sample(self, data, sample_size=15000,
               blocked_proportion=0.5):
        '''Build a sample of record pairs from one dataset and pass it,
        as a list, to ``self._loadSample``

        Part of the sample is requested from
        sampling.dedupeBlockedSample and the rest from core.randomPairs.
        A ``Sample(data, 900)`` of the indexed records is stored on
        ``self.sampled_records`` along the way.

        Arguments: data -- Dictionary of records, where the keys are
        record_ids and the values are dictionaries with the keys being
        field names

        sample_size         -- Size of the sample to draw
        blocked_proportion  -- Proportion of the sample that will be blocked
        '''
        data = core.index(data)
        self.sampled_records = Sample(data, 900)

        blocked_target = int(blocked_proportion * sample_size)
        usable_predicates = list(
            self.data_model.predicates(index_predicates=False))

        data = sampling.randomDeque(data)
        keys_from_blocks = sampling.dedupeBlockedSample(
            blocked_target, usable_predicates, data)

        # The random sample fills the gap left by the blocked sample.
        random_target = sample_size - len(keys_from_blocks)
        keys_from_random = set(core.randomPairs(len(data), random_target))

        # Convert back to a dictionary for lookup by key.
        data = dict(data)

        all_keys = keys_from_blocks | keys_from_random
        pairs = [(data[k1], data[k2]) for k1, k2 in all_keys]

        self._loadSample(pairs)

예제 #8

0

파일 보기

    def sample(self, data_1, data_2, blocked_proportion, sample_size):
        '''
        Return a list of record pairs, one record from each dataset

        Arguments:

        data_1             -- First dataset; looked up with the first key
                              of every sampled pair
        data_2             -- Second dataset; looked up with the second
                              key of every sampled pair.  Random keys are
                              shifted by len(data_1), so data_2 is
                              presumably keyed from that offset --
                              TODO confirm against the caller
        blocked_proportion -- Fraction of sample_size requested from
                              sampling.linkBlockedSample
        sample_size        -- Target number of pairs; the remainder after
                              the blocked sample is requested from
                              core.randomPairsMatch
        '''
        shift = len(data_1)

        n_blocked = int(blocked_proportion * sample_size)
        # Predicates are read from self.distances in this class.
        predicate_list = list(
            self.distances.predicates(index_predicates=False))

        queue_1 = sampling.randomDeque(data_1)
        queue_2 = sampling.randomDeque(data_2)

        blocked_keys = sampling.linkBlockedSample(n_blocked,
                                                  predicate_list,
                                                  queue_1,
                                                  queue_2)

        n_random = sample_size - len(blocked_keys)
        random_keys = core.randomPairsMatch(len(queue_1),
                                            len(queue_2),
                                            n_random)

        # Move the second element of every random pair into the key
        # range used to look up data_2.
        shifted_keys = {(left, right + shift)
                        for left, right in random_keys}

        pair_keys = blocked_keys | shifted_keys
        return [(data_1[key_1], data_2[key_2])
                for key_1, key_2 in pair_keys]

예제 #9

0

파일 보기

    def sample(self, data, blocked_proportion, sample_size):
        '''
        Return a list of record pairs drawn from a single dataset

        Arguments:

        data               -- Dataset to sample from; passed to
                              sampling.randomDeque and later rebuilt
                              with dict() for lookup by key
        blocked_proportion -- Fraction of sample_size requested from
                              sampling.dedupeBlockedSample
        sample_size        -- Target number of pairs; the remainder after
                              the blocked sample is requested from
                              core.randomPairs
        '''
        n_blocked = int(blocked_proportion * sample_size)
        predicate_list = list(
            self.data_model.predicates(index_predicates=False))

        data = sampling.randomDeque(data)
        blocked_keys = sampling.dedupeBlockedSample(n_blocked,
                                                    predicate_list,
                                                    data)

        # Whatever the blocked sample did not supply is drawn at random.
        n_random = sample_size - len(blocked_keys)
        random_keys = set(core.randomPairs(len(data), n_random))

        # Back to a dictionary so records can be looked up by key.
        data = dict(data)

        pair_keys = blocked_keys | random_keys
        return [(data[key_1], data[key_2]) for key_1, key_2 in pair_keys]

예제 #10

0

파일 보기

파일: labeler.py 프로젝트: datamade/dedupe

    def sample(self, data, blocked_proportion, sample_size):
        '''
        Build and return a list of record pairs from one dataset

        Arguments:

        data               -- Dataset to sample from; handed to
                              sampling.randomDeque, then converted with
                              dict() so records can be fetched by key
        blocked_proportion -- Share of sample_size asked of
                              sampling.dedupeBlockedSample
        sample_size        -- Number of pairs aimed for; the part not
                              covered by the blocked sample is asked of
                              core.randomPairs
        '''
        blocked_target = int(blocked_proportion * sample_size)
        usable_predicates = list(
            self.data_model.predicates(index_predicates=False))

        data = sampling.randomDeque(data)
        keys_from_blocks = sampling.dedupeBlockedSample(
            blocked_target, usable_predicates, data)

        # The random sample fills the gap left by the blocked sample.
        random_target = sample_size - len(keys_from_blocks)
        keys_from_random = set(core.randomPairs(len(data), random_target))

        # Convert back to a dictionary for lookup by key.
        data = dict(data)

        all_keys = keys_from_blocks | keys_from_random
        return [(data[k1], data[k2]) for k1, k2 in all_keys]

예제 #11

0

파일 보기

파일: api.py 프로젝트: lminer/dedupe

    def sample(self, data, sample_size=15000, 
               blocked_proportion=0.5) :
        '''Sample pairs of records from a single dataset and hand the
        frozen sample to ``self._loadSample``

        The pairs come from two sources: sampling.dedupeBlockedSample
        and dedupe.core.randomPairs.

        Arguments: data -- Dictionary of records, where the keys are
        record_ids and the values are dictionaries with the keys being
        field names

        sample_size         -- Size of the sample to draw
        blocked_proportion  -- Proportion of the sample that will be blocked
        '''
        data = core.index(data)

        n_blocked = int(blocked_proportion * sample_size)
        predicate_list = list(
            predicateGenerator(self.data_model,
                               index_predicates=False,
                               canopies=self.canopies))

        data = sampling.randomDeque(data)
        blocked_keys = sampling.dedupeBlockedSample(
            n_blocked, predicate_list, data)

        # Whatever the blocked sample did not supply is drawn at random.
        n_random = sample_size - len(blocked_keys)
        random_keys = set(dedupe.core.randomPairs(len(data), n_random))

        # Back to a dictionary so records can be looked up by key.
        data = dict(data)

        pair_keys = blocked_keys | random_keys
        record_pairs = [(data[key_1], data[key_2])
                        for key_1, key_2 in pair_keys]

        record_pairs = core.freezeData(record_pairs)

        # data may be very large; drop the local reference before the
        # sample is loaded.
        del data

        self._loadSample(record_pairs)

예제 #12

0

파일 보기

파일: api.py 프로젝트: lazymike/dedupe

    def sample(self, data, sample_size=15000, 
               blocked_proportion=0.5) :
        '''Build a sample of record pairs from one dataset and pass the
        frozen sample to ``self._loadSample``

        Part of the sample is requested from
        sampling.dedupeBlockedSample and the rest from
        dedupe.core.randomPairs.

        Arguments: data -- Dictionary of records, where the keys are
        record_ids and the values are dictionaries with the keys being
        field names

        sample_size         -- Size of the sample to draw
        blocked_proportion  -- Proportion of the sample that will be blocked
        '''
        data = core.index(data)

        blocked_target = int(blocked_proportion * sample_size)
        # Restrict to predicates whose type is 'SimplePredicate'.
        usable_predicates = [
            p for p in predicateGenerator(self.data_model)
            if p.type == 'SimplePredicate'
        ]

        data = sampling.randomDeque(data)
        keys_from_blocks = sampling.dedupeBlockedSample(
            blocked_target, usable_predicates, data)

        # The random sample fills the gap left by the blocked sample.
        random_target = sample_size - len(keys_from_blocks)
        keys_from_random = set(
            dedupe.core.randomPairs(len(data), random_target))

        # Convert back to a dictionary for lookup by key.
        data = dict(data)

        all_keys = keys_from_blocks | keys_from_random
        # A generator, so pairs are materialised only when
        # core.freezeData iterates over them.
        pairs = ((data[k1], data[k2]) for k1, k2 in all_keys)

        self._loadSample(core.freezeData(pairs))