Пример #1
0
def getSample(cur, sample_size, id_column, table):
    """
    Return a random sample of record pairs drawn from a MySQL table.

    Args:
        cur: Database cursor whose rows behave like mappings keyed by
            column name (rows are read via ``row.values()`` and
            ``row[id_column]``).
        sample_size: Sample size handed to ``dedupe.randomPairs``.
        id_column: Name of the integer id column.
        table: Name of the table whose maximum id bounds the sample.

    Returns:
        A tuple of ``((id_1, record_1), (id_2, record_2))`` pairs, where
        each record is a ``dedupe.core.frozendict`` built from a row
        returned by the ``DONOR_SELECT`` query.

    Raises:
        KeyError: If a sampled id is not among the ids returned by
            ``DONOR_SELECT``.
    """
    # Identifiers cannot be bound as query parameters, so they are
    # interpolated; callers must pass trusted column/table names.
    cur.execute("SELECT MAX(%s) FROM %s" % (id_column, table))
    # next(iter(...)) rather than .values()[0]: on Python 3 dict views
    # are not subscriptable. The result row holds a single column.
    num_records = next(iter(cur.fetchone().values()))

    # Dedupe expects the id column to contain unique, sequential
    # integers starting at 0 or 1
    random_pairs = dedupe.randomPairs(num_records,
                                      sample_size,
                                      zero_indexed=False)

    # Index every selected row by its integer id so the sampled ids can
    # be resolved to records below.
    cur.execute(DONOR_SELECT)
    records_by_id = {}
    for row in cur.fetchall():
        records_by_id[int(row[id_column])] = dedupe.core.frozendict(row)

    return tuple(((record_id_1, records_by_id[record_id_1]),
                  (record_id_2, records_by_id[record_id_2]))
                 for record_id_1, record_id_2 in random_pairs)
Пример #2
0
def getSample(cur, sample_size, id_column, table):
    """
    Return a random sample of record pairs drawn from a MySQL table.

    Args:
        cur: Database cursor whose rows behave like mappings keyed by
            column name.
        sample_size: Sample size handed to ``dedupe.randomPairs``.
        id_column: Name of the integer id column.
        table: Name of the table queried for its maximum id.

    Returns:
        A tuple of ``(record_1, record_2)`` pairs; each record is a
        ``dedupe.core.frozendict`` of a row returned by ``DONOR_SELECT``.

    Raises:
        KeyError: If a sampled id is missing from the ``DONOR_SELECT``
            result.
    """
    # Column and table names are interpolated because SQL identifiers
    # cannot be passed as bound parameters; they must be trusted input.
    cur.execute("SELECT MAX(%s) FROM %s" % (id_column, table))
    max_row = cur.fetchone()
    # Take the only value of the row without indexing .values(), which
    # is a non-subscriptable view on Python 3.
    num_records = next(iter(max_row.values()))

    # Dedupe expects the id column to contain unique, sequential
    # integers starting at 0 or 1
    random_pairs = dedupe.randomPairs(num_records,
                                      sample_size,
                                      zero_indexed=False)

    cur.execute(DONOR_SELECT)
    lookup = {}
    for row in cur.fetchall():
        lookup[int(row[id_column])] = dedupe.core.frozendict(row)

    return tuple((lookup[k1], lookup[k2]) for k1, k2 in random_pairs)
Пример #3
0
def getSample(cur, sample_size, id_column, table):
    """
    Return a random sample of record pairs drawn from a MySQL table.

    Args:
        cur: Database cursor that is iterable after ``execute`` and whose
            rows behave like mappings keyed by column name.
        sample_size: Sample size handed to ``dedupe.randomPairs``.
        id_column: Name of the integer id column.
        table: Name of the table queried for its maximum id.

    Returns:
        A tuple of ``(record_1, record_2)`` pairs; each record is a
        ``dedupe.core.frozendict`` of a row returned by ``DONOR_SELECT``.

    Raises:
        KeyError: If a shifted sample id is missing from the
            ``DONOR_SELECT`` result.
    """
    # Identifiers are interpolated (they cannot be bound parameters), so
    # id_column and table must come from trusted input.
    cur.execute("SELECT MAX(%s) FROM %s" % (id_column, table))
    # .values()[0] breaks on Python 3 (dict views are not subscriptable);
    # next(iter(...)) reads the same single value on both versions.
    num_records = next(iter(cur.fetchone().values()))

    # Consume whatever is left of the MAX query before the cursor is
    # reused. NOTE(review): presumably needed for an unbuffered cursor.
    cur.fetchall()

    random_pairs = dedupe.randomPairs(num_records, sample_size)
    # Shift every sampled id up by one. This relies on random_pairs
    # supporting in-place addition (e.g. a NumPy array); presumably the
    # raw sample is zero-based while the id column starts at 1 - confirm.
    random_pairs += 1

    cur.execute(DONOR_SELECT)
    records_by_id = {}
    for row in cur:
        records_by_id[int(row[id_column])] = dedupe.core.frozendict(row)

    return tuple((records_by_id[k1], records_by_id[k2])
                 for k1, k2 in random_pairs)
Пример #4
0
def getSample(cur, sample_size, id_column, table):
    """
    Return a random sample of record pairs drawn from a MySQL table.

    Args:
        cur: Database cursor that can be iterated after ``execute``; its
            rows are read as mappings keyed by column name.
        sample_size: Sample size handed to ``dedupe.randomPairs``.
        id_column: Name of the integer id column.
        table: Name of the table queried for its maximum id.

    Returns:
        A tuple of ``(record_1, record_2)`` pairs, each record being a
        ``dedupe.core.frozendict`` of a ``DONOR_SELECT`` row.

    Raises:
        KeyError: If a sampled id (after the +1 shift) has no row in the
            ``DONOR_SELECT`` result.
    """
    # SQL identifiers cannot be bound as parameters; the names are
    # formatted in directly and must therefore be trusted.
    max_id_query = "SELECT MAX(%s) FROM %s" % (id_column, table)
    cur.execute(max_id_query)
    first_row = cur.fetchone()
    # Avoid .values()[0]: it raises TypeError on Python 3, where
    # dict.values() returns a view rather than a list.
    num_records = next(iter(first_row.values()))

    # Drain the rest of the result set before issuing the next query.
    # NOTE(review): presumably required by the cursor type in use.
    cur.fetchall()

    random_pairs = dedupe.randomPairs(num_records,
                                      sample_size)
    # In-place +1 on the whole sample; requires an array-like result
    # (e.g. NumPy). Presumably converts zero-based indices to ids that
    # start at 1 - verify against the dedupe version in use.
    random_pairs += 1

    cur.execute(DONOR_SELECT)
    frozen_rows = {}
    for row in cur:
        frozen_rows[int(row[id_column])] = dedupe.core.frozendict(row)

    return tuple((frozen_rows[first], frozen_rows[second])
                 for first, second in random_pairs)
def getSample(con, sample_size, id_column, table):
    """
    Return a random sample of record pairs drawn from a PostgreSQL table.

    Args:
        con: Database connection; ``con.cursor()`` and
            ``con.cursor('donor_select')`` are both used. Rows of the
            first cursor are read as mappings (``.values()``).
        sample_size: Sample size handed to ``dedupe.randomPairs``.
        id_column: Name of the id column whose maximum bounds the sample.
        table: Name of the table queried for that maximum.

    Returns:
        A list of ``(record_1, record_2)`` pairs; each record is a
        ``dedupe.frozendict`` of a row returned by ``DONOR_SELECT``.

    Raises:
        KeyError: If a sampled index is not a valid row position in the
            ``DONOR_SELECT`` result.
    """
    cur = con.cursor()

    # Identifiers cannot be bound as parameters, so they are formatted
    # into the statement; callers must pass trusted names.
    cur.execute("SELECT MAX(%s) FROM %s" % (id_column, table))
    # next(iter(...)) instead of .values()[0], which fails on Python 3
    # because dict views are not subscriptable.
    num_records = next(iter(cur.fetchall()[0].values()))

    cur.close()

    random_pairs = dedupe.randomPairs(num_records,
                                      sample_size)

    # Named cursor runs server side with psycopg2
    cur = con.cursor('donor_select')

    cur.execute(DONOR_SELECT)

    # Records are keyed by their position in the result set, not by the
    # value of id_column, so sampled values are used as row positions.
    records_by_position = {}
    for position, row in enumerate(cur):
        records_by_position[position] = dedupe.frozendict(row)

    cur.close()

    return [(records_by_position[k1], records_by_position[k2])
            for k1, k2 in random_pairs]
Пример #6
0
def getSample(con, size):
    """
    Return a random sample of donor pairs of the given size.

    Args:
        con: Connection-like object whose ``execute`` returns an iterator
            of rows. The first query's row is indexed by position; rows
            of the donor query are indexed by the ``'donor_id'`` key.
        size: Sample size handed to ``dedupe.randomPairs``.

    Returns:
        A tuple of ``((id_1, row_1), (id_2, row_2))`` pairs.

    Raises:
        KeyError: If a sampled id has no row in the donor query result.
    """
    # Built-in next() instead of the Python-2-only .next() method, so
    # the lookup works on Python 2.6+ and Python 3 alike.
    max_id = next(con.execute("SELECT MAX(donor_id) FROM donors"))[0]

    random_pairs = dedupe.randomPairs(max_id, size, zero_indexed=False)

    # Only the sampled ids are fetched. The ids come from randomPairs
    # and are rendered with str(), not taken from caller-supplied text.
    id_list = ', '.join(str(record_id)
                        for pair in random_pairs
                        for record_id in pair)
    query = donor_select + " WHERE donor_id IN (%s)" % id_list

    rows_by_id = {}
    for row in con.execute(query):
        rows_by_id[row['donor_id']] = row

    return tuple(((record_id_1, rows_by_id[record_id_1]),
                  (record_id_2, rows_by_id[record_id_2]))
                 for record_id_1, record_id_2 in random_pairs)
Пример #7
0
def getSample(cur, sample_size, id_column, table):
    """
    Return a random sample of record pairs drawn from a MySQL table.

    Args:
        cur: Database cursor that is iterable after ``execute``; rows of
            the first query are read as mappings (``.values()``).
        sample_size: Sample size handed to ``dedupe.randomPairs``.
        id_column: Name of the id column whose maximum bounds the sample.
        table: Name of the table queried for that maximum.

    Returns:
        A list of ``(record_1, record_2)`` pairs; each record is a
        ``dedupe.frozendict`` of a row returned by ``DONOR_SELECT``.

    Raises:
        KeyError: If a sampled index is not a valid row position in the
            ``DONOR_SELECT`` result.
    """
    # Identifiers cannot be passed as bound parameters; the names are
    # interpolated and must come from trusted input.
    cur.execute("SELECT MAX(%s) FROM %s" % (id_column, table))
    max_row = cur.fetchall()[0]
    # Read the single value without subscripting .values(), which is a
    # non-indexable view on Python 3.
    num_records = next(iter(max_row.values()))

    random_pairs = dedupe.randomPairs(num_records,
                                      sample_size)

    cur.execute(DONOR_SELECT)

    # Rows are keyed by their position in the result set rather than by
    # id_column, so sampled values are treated as row positions.
    indexed_rows = {}
    for index, row in enumerate(cur):
        indexed_rows[index] = dedupe.frozendict(row)

    return [(indexed_rows[k1], indexed_rows[k2])
            for k1, k2 in random_pairs]