def getSample(con, sample_size, id_column, table):
    '''
    Returns a random sample of a given size of records pairs from a given
    PostgresSQL table.

    con         -- open database connection; two cursors are created on it
                   and both are closed before returning, even on error.
    sample_size -- number of pairs requested, passed straight through to
                   dedupe.randomPairs.
    id_column   -- name of the column whose MAX() is used as the record
                   count.
    table       -- name of the table that id_column belongs to.

    Returns a list of 2-tuples of dedupe.frozendict records.

    id_column and table are interpolated directly into the SQL text
    (identifiers cannot be bound as query parameters), so they must come
    from trusted code and never from user input.
    '''
    cur = con.cursor()
    try:
        cur.execute("SELECT MAX(%s) FROM %s" % (id_column, table))
        # Rows are consumed through .values(); take the first (only) value
        # via an iterator because dict views cannot be indexed on Python 3.
        num_records = next(iter(cur.fetchall()[0].values()))
    finally:
        cur.close()

    random_pairs = dedupe.randomPairs(num_records, sample_size)

    # Named cursor runs server side with psycopg2
    cur = con.cursor('donor_select')
    try:
        cur.execute(DONOR_SELECT)
        # Records are keyed by their position in the DONOR_SELECT result.
        # NOTE(review): the pair indices are derived from MAX(id_column),
        # which presumably assumes ids are dense and line up with row
        # positions -- confirm, otherwise the lookup below can KeyError.
        temp_d = {i: dedupe.frozendict(row) for i, row in enumerate(cur)}
    finally:
        cur.close()

    return [(temp_d[k1], temp_d[k2]) for k1, k2 in random_pairs]
def test_randomSample(self):
    '''
    Seed both random number generators, draw a sample through
    self.deduper.sample, and check that five known record pairs are
    present in self.deduper.data_sample.
    '''
    def record(name, age):
        # Build one frozen record in the shape the sample holds.
        return dedupe.frozendict({'age': age, 'name': name})

    random.seed(6)
    numpy.random.seed(6)

    # NOTE(review): 30 and 1 are presumably the sample size and the blocked
    # proportion -- confirm against the signature of sample().
    self.deduper.sample(data_dict, 30, 1)

    expected_pairs = [
        (record('Linda', '50'), record('bob belcher', '51')),
        (record('Bob', '51'), record('Bob B.', '51')),
        (record('Bob', '51'), record('bob belcher', '51')),
        (record('Bob B.', '51'), record('bob belcher', '51')),
        (record('Linda', '50'), record('linda ', '50')),
    ]

    # Print whichever expected pairs are missing, to ease debugging.
    print(set(expected_pairs) - set(self.deduper.data_sample))

    drawn = set(self.deduper.data_sample)
    assert drawn.issuperset(expected_pairs)
def test_randomSample(self):
    '''
    With fixed seeds, sampling through self.deduper.sample must yield a
    data_sample that contains every one of five expected record pairs.
    '''
    random.seed(6)
    numpy.random.seed(6)
    self.deduper.sample(data_dict, 30, 1)

    # Each entry is ((name, age), (name, age)) for the two records of a pair.
    pair_specs = (
        (('Linda', '50'), ('bob belcher', '51')),
        (('Bob', '51'), ('Bob B.', '51')),
        (('Bob', '51'), ('bob belcher', '51')),
        (('Bob B.', '51'), ('bob belcher', '51')),
        (('Linda', '50'), ('linda ', '50')),
    )
    wanted = []
    for (left_name, left_age), (right_name, right_age) in pair_specs:
        left = dedupe.frozendict({'age': left_age, 'name': left_name})
        right = dedupe.frozendict({'age': right_age, 'name': right_name})
        wanted.append((left, right))

    # Show the expected pairs that were not sampled (empty set on success).
    missing = set(wanted) - set(self.deduper.data_sample)
    print(missing)

    assert set(self.deduper.data_sample).issuperset(wanted)
def getSample(cur, sample_size, id_column, table):
    '''
    Returns a random sample of a given size of records pairs from a given
    MySQL table.

    cur         -- open database cursor, used for both queries; it is owned
                   by the caller and is not closed here.
    sample_size -- number of pairs requested, passed straight through to
                   dedupe.randomPairs.
    id_column   -- name of the column whose MAX() is used as the record
                   count.
    table       -- name of the table that id_column belongs to.

    Returns a list of 2-tuples of dedupe.frozendict records.

    id_column and table are interpolated directly into the SQL text
    (identifiers cannot be bound as query parameters), so they must come
    from trusted code and never from user input.
    '''
    cur.execute("SELECT MAX(%s) FROM %s" % (id_column, table))
    # fetchall() drains the result set before the cursor is reused below.
    # Rows are consumed through .values(); take the first (only) value via
    # an iterator because dict views cannot be indexed on Python 3.
    num_records = next(iter(cur.fetchall()[0].values()))

    random_pairs = dedupe.randomPairs(num_records, sample_size)

    cur.execute(DONOR_SELECT)
    # Records are keyed by their position in the DONOR_SELECT result.
    # NOTE(review): the pair indices are derived from MAX(id_column), which
    # presumably assumes ids are dense and line up with row positions --
    # confirm, otherwise the lookup below can KeyError.
    temp_d = {i: dedupe.frozendict(row) for i, row in enumerate(cur)}

    return [(temp_d[k1], temp_d[k2]) for k1, k2 in random_pairs]
def test_randomSample(self):
    '''
    Seed the random module, sample record pairs across the two data
    dictionaries through self.linker.sample, and check that four known
    pairs are present in self.linker.data_sample.
    '''
    def record(name, age):
        # Build one frozen record in the shape the sample holds.
        return dedupe.frozendict({'age': age, 'name': name})

    random.seed(27)

    self.linker.sample(data_dict, data_dict_2, 50, 1)
    expected_first = [
        (record('Bob B.', '51'), record('BOB', '51')),
        (record('Bob B.', '51'), record('BOB B.', '51')),
        (record('Bob', '51'), record('BOB B.', '51')),
        (record('Tina', '15'), record('TINA', '15')),
    ]
    sampled = set(self.linker.data_sample)
    assert sampled.issuperset(expected_first)

    self.linker.sample(data_dict, data_dict_2, 5, 0)
    # NOTE(review): this second expectation is built but never compared
    # against self.linker.data_sample -- confirm whether an assertion is
    # missing here.
    expected_second = [
        (record('Bob B.', '51'), record('TINA', '15')),
        (record('Bob B.', '51'), record('LINDA', '50')),
        (record('Gene', '12'), record('TINA', '15')),
        (record('Linda', '50'), record('LINDA ', '50')),
        (record('linda ', '50'), record('BOB BELCHER', '51')),
    ]
def test_randomSample(self):
    '''
    With both random number generators seeded, sampling across the two
    data dictionaries through self.linker.sample must yield a data_sample
    that contains four expected record pairs.
    '''
    def pair(left, right):
        # left/right are (name, age) tuples; returns a pair of frozen records.
        (left_name, left_age), (right_name, right_age) = left, right
        return (dedupe.frozendict({'age': left_age, 'name': left_name}),
                dedupe.frozendict({'age': right_age, 'name': right_name}))

    random.seed(27)
    numpy.random.seed(43)

    self.linker.sample(data_dict, data_dict_2, 50, 1)
    must_contain = [
        pair(('Bob B.', '51'), ('BOB', '51')),
        pair(('Bob B.', '51'), ('BOB B.', '51')),
        pair(('Bob', '51'), ('BOB B.', '51')),
        pair(('Tina', '15'), ('TINA', '15')),
    ]
    assert set(self.linker.data_sample).issuperset(must_contain)

    self.linker.sample(data_dict, data_dict_2, 5, 0)
    # NOTE(review): the list below is constructed but not asserted against
    # the new sample -- confirm whether a check was intended.
    unchecked_expectation = [
        pair(('Bob B.', '51'), ('TINA', '15')),
        pair(('Bob B.', '51'), ('LINDA', '50')),
        pair(('Gene', '12'), ('TINA', '15')),
        pair(('Linda', '50'), ('LINDA ', '50')),
        pair(('linda ', '50'), ('BOB BELCHER', '51')),
    ]
def test_randomSample(self):
    '''
    Seed the random module, sample record pairs across the two data
    dictionaries through self.linker.sample, and require that
    self.linker.data_sample equals an exact, ordered list of five pairs.
    '''
    def record(name, age):
        # Build one frozen record in the shape the sample holds.
        return dedupe.frozendict({'age': age, 'name': name})

    random.seed(27)

    self.linker.sample(data_dict, data_dict_2, 5, 1)
    exact_sample = [
        (record('Bob B.', '51'), record('BOB', '51')),
        (record('Bob B.', '51'), record('BOB B.', '51')),
        (record('bob belcher', '51'), record('BOB', '51')),
        (record('linda ', '50'), record('GENE', '12')),
        (record('Tina', '15'), record('TINA', '15')),
    ]
    # Exact equality: both the contents and the order of the sample matter.
    assert self.linker.data_sample == exact_sample

    self.linker.sample(data_dict, data_dict_2, 5, 0)
    # NOTE(review): this second expectation is built but never compared
    # against self.linker.data_sample -- confirm whether an assertion is
    # missing here.
    exact_sample = [
        (record('Bob B.', '51'), record('TINA', '15')),
        (record('Bob B.', '51'), record('LINDA', '50')),
        (record('Gene', '12'), record('TINA', '15')),
        (record('Linda', '50'), record('LINDA ', '50')),
        (record('linda ', '50'), record('BOB BELCHER', '51')),
    ]