Example #1
File: join.py Project: PGower/Unsync
import petl


def join(data, strategy, source_left, source_right, destination, key_left,
         key_right, prefix_left, prefix_right, presorted, buffersize, tempdir,
         cache, missing):
    """Perform a join on two data tables."""
    source_left = data.get(source_left)
    source_right = data.get(source_right)

    kwargs = {}
    if key_left == key_right:
        kwargs['key'] = key_left
    else:
        kwargs['lkey'] = key_left
        kwargs['rkey'] = key_right

    if presorted is True:
        kwargs['presorted'] = presorted

    if buffersize is not None:
        kwargs['buffersize'] = buffersize

    if tempdir:
        kwargs['tempdir'] = tempdir

    if 'anti' not in strategy:
        if prefix_left is not None:
            kwargs['lprefix'] = prefix_left
        if prefix_right is not None:
            kwargs['rprefix'] = prefix_right

    if strategy not in ['join', 'antijoin', 'hashjoin', 'hashantijoin']:
        kwargs['missing'] = missing

    if strategy == 'join':
        o = petl.join(source_left, source_right, **kwargs)
    elif strategy == 'leftjoin':
        o = petl.leftjoin(source_left, source_right, **kwargs)
    elif strategy == 'lookupjoin':
        o = petl.lookupjoin(source_left, source_right, **kwargs)
    elif strategy == 'rightjoin':
        o = petl.rightjoin(source_left, source_right, **kwargs)
    elif strategy == 'outerjoin':
        o = petl.outerjoin(source_left, source_right, **kwargs)
    elif strategy == 'antijoin':
        o = petl.antijoin(source_left, source_right, **kwargs)
    elif strategy == 'hashjoin':
        o = petl.hashjoin(source_left, source_right, **kwargs)
    elif strategy == 'hashantijoin':
        o = petl.hashantijoin(source_left, source_right, **kwargs)
    elif strategy == 'hashleftjoin':
        o = petl.hashleftjoin(source_left, source_right, **kwargs)
    elif strategy == 'hashlookupjoin':
        o = petl.hashlookupjoin(source_left, source_right, **kwargs)
    elif strategy == 'hashrightjoin':
        o = petl.hashrightjoin(source_left, source_right, **kwargs)
    else:
        raise ValueError('unknown join strategy: %s' % strategy)

    data.set(destination, o)
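
For context, a minimal sketch of how this dispatcher might be driven, using a hypothetical in-memory data container (the container class and table names below are illustrative assumptions, not part of the Unsync project):

class DataStore:
    # Hypothetical stand-in for Unsync's data container.
    def __init__(self):
        self._tables = {}

    def get(self, name):
        return self._tables[name]

    def set(self, name, table):
        self._tables[name] = table

data = DataStore()
data.set('colours', [['id', 'colour'], [1, 'blue'], [2, 'red']])
data.set('shapes', [['id', 'shape'], [1, 'circle'], [3, 'square']])

join(data, strategy='leftjoin', source_left='colours', source_right='shapes',
     destination='out', key_left='id', key_right='id',
     prefix_left=None, prefix_right=None, presorted=False,
     buffersize=None, tempdir=None, cache=True, missing=None)
print(petl.look(data.get('out')))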
Example #2
File: examples.py Project: datamade/petl
# antijoin

table1 = [['id', 'colour'],
          [0, 'black'],
          [1, 'blue'],
          [2, 'red'],
          [4, 'yellow'],
          [5, 'white']]
table2 = [['id', 'shape'],
          [1, 'circle'],
          [3, 'square']]

from petl import antijoin, look
look(table1)
look(table2)
table3 = antijoin(table1, table2, key='id')
look(table3)
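
antijoin keeps only the left-table rows whose key value has no match in the right table, so here:

# table3 contains the table1 rows with ids absent from table2:
# (0, 'black'), (2, 'red'), (4, 'yellow'), (5, 'white')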


# rangefacet

table1 = [['foo', 'bar'],
          ['a', 3],
          ['a', 7],
          ['b', 2],
          ['b', 1],
          ['b', 9],
          ['c', 4],
          ['d', 3]]

from petl import rangefacet, look
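
The example is cut off here; a plausible continuation, assuming petl's rangefacet(table, field, width) signature, might be:

facet = rangefacet(table1, 'bar', 2)
sorted(facet.keys())   # expected buckets: [(1, 3), (3, 5), (5, 7), (7, 9)]
look(facet[(1, 3)])    # rows where 1 <= bar < 3, i.e. ('b', 2) and ('b', 1)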
Example #3
# crossjoin()
#############

import petl as etl
table1 = [['id', 'colour'], [1, 'blue'], [2, 'red']]
table2 = [['id', 'shape'], [1, 'circle'], [3, 'square']]
table3 = etl.crossjoin(table1, table2)
table3
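
crossjoin takes no key; it forms the Cartesian product of the two tables:

# table3 has 2 x 2 = 4 rows: each (id, colour) row paired with each (id, shape) row.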

# antijoin()
############

import petl as etl
table1 = [['id', 'colour'], [0, 'black'], [1, 'blue'], [2, 'red'],
          [4, 'yellow'], [5, 'white']]
table2 = [['id', 'shape'], [1, 'circle'], [3, 'square']]
table3 = etl.antijoin(table1, table2, key='id')
table3

# lookupjoin()
##############

import petl as etl
table1 = [['id', 'color', 'cost'], [1, 'blue', 12], [2, 'red', 8],
          [3, 'purple', 4]]
table2 = [['id', 'shape', 'size'], [1, 'circle', 'big'],
          [1, 'circle', 'small'], [2, 'square', 'tiny'], [2, 'square', 'big'],
          [3, 'ellipse', 'small'], [3, 'ellipse', 'tiny']]
table3 = etl.lookupjoin(table1, table2, key='id')
table3
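
Unlike a plain join, lookupjoin never replicates left-hand rows; at most the first matching right-hand row is used:

# table3 keeps exactly one row per table1 row, taking the first table2 match:
# id 1 -> ('circle', 'big'), id 2 -> ('square', 'tiny'), id 3 -> ('ellipse', 'small')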

# unjoin()
Example #4
# Get duplicate parcel_ids:
non_unique_parcel_id_rows = engine_dor_parcel_rows.duplicates(key='parcel_id')
unique_parcel_id_rows = etl.complement(engine_dor_parcel_rows, non_unique_parcel_id_rows)

# Get address comps for condos by joining to dor_parcel with unique parcel_id on parcel_id:
print("Relating condos to parcels...")
joined = etl.join(source_dor_condo_rows, unique_parcel_id_rows, key='parcel_id') \
    .convert('street_address', lambda a, row: row.street_address + ' # ' + row.unit_num, pass_row=True)
print("joined rowcount: ", etl.nrows(joined))
if DEV:
    print(etl.look(joined))

# Calculate errors
print("Calculating errors...")
unjoined = etl.antijoin(source_dor_condo_rows, joined, key='source_object_id')
print("unjoined rowcount: ", etl.nrows(unjoined))
dor_condos_unjoined_unmatched = etl.antijoin(unjoined, non_unique_parcel_id_rows, key='parcel_id').addfield('reason', 'non-active/remainder mapreg')
print("non-active/remainder mapreg error rowcount: ", etl.nrows(dor_condos_unjoined_unmatched))
if DEV:
    print(etl.look(dor_condos_unjoined_unmatched))
dor_condos_unjoined_duplicates = etl.antijoin(unjoined, dor_condos_unjoined_unmatched, key='source_object_id').addfield('reason', 'non-unique active/remainder mapreg')
print("non-unique active/remainder mapreg error rowcount: ", etl.nrows(dor_condos_unjoined_duplicates))
if DEV:
    print(etl.look(dor_condos_unjoined_duplicates))
error_table = etl.cat(dor_condos_unjoined_unmatched, dor_condos_unjoined_duplicates)
if DEV:
    print(etl.look(error_table))

# Write to engine db
if not DEV:
Example #5
def extract_backend(offline=OFFLINE):
    # Done in 4 steps: (1) grab the driver table from the CloudSQL,
    # (2) use the user uuids to query for users one by one through
    # the API, (3) get the fleet table from CloudSQL and (4) join
    # everything together.

    def extract_drivers():
        query = SQLReader('sql.drivers_from_cloudsql')
        drivers_df = sql.execute(query.statements[0])
        drivers_tb = fromdataframe(drivers_df)

        mappings = {
            'driver_uuid': lambda rec: str(UUID(bytes=rec['uuid'], version=4)),
            'fleet_uuid': lambda rec: str(UUID(bytes=rec['fleet_uuid'], version=4)),
            'user_uuid': lambda rec: str(UUID(bytes=rec['user_ds_uuid'], version=4)),
            'fullname': lambda rec: rec['last_name'].strip() + ', ' + rec['first_name'].strip(),
        }

        drivers_tb = drivers_tb.fieldmap(mappings)
        drivers_tb = drivers_tb.suffixheader('_in_backend')

        return drivers_tb

    def extract_users():
        users_records = [api.get_record('users', driver.user_uuid_in_backend)
                         for driver in drivers.namedtuples()]
        users_df = DataFrame.from_records(users_records)
        users_tb = fromdataframe(users_df)

        mappings = {
            'driver_uuid': 'driver',
            'user_uuid': 'uuid',
            'backend_username': '******'
        }

        users_tb = users_tb.fieldmap(mappings)
        users_tb = users_tb.suffixheader('_in_backend')

        return users_tb

    def extract_fleets_from_dwh():
        query = SQLReader('sql.fleets_from_tableau')
        fleets_df = dwh.execute(query.statements[0])
        fleets_tb = fromdataframe(fleets_df)

        mappings = {
            'fleet_uuid': 'uuid',
            'fleetname': lambda rec: rec['backend_name'].replace('_', ' '),
            'country_code': 'country_code',
        }

        fleets_tb = fleets_tb.cutout('country_code')
        fleets_tb = fleets_tb.fieldmap(mappings)
        fleets_tb = fleets_tb.suffixheader('_in_backend')

        return fleets_tb

    if not offline:
        sql = CloudSQLConnector()
        api = ValkfleetConnector()
        dwh = WarehouseConnector()

        drivers = extract_drivers()
        fleets = extract_fleets_from_dwh()
        users = extract_users()

        drivers.topickle(DRIVERS_IN_BACKEND_FILEPATH)
        fleets.topickle(FLEETS_IN_BACKEND_FILEPATH)
        users.topickle(USERS_IN_BACKEND_FILEPATH)

    else:
        drivers = frompickle(DRIVERS_IN_BACKEND_FILEPATH)
        fleets = frompickle(FLEETS_IN_BACKEND_FILEPATH)
        users = frompickle(USERS_IN_BACKEND_FILEPATH)

    write_to_log(drivers, 'drivers', 'backend')
    write_to_log(fleets, 'fleets', 'backend')
    write_to_log(users, 'users', 'backend')

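    # Sanity checks: antijoin surfaces drivers whose fleet or user UUID has no
    # counterpart in the other extracts, so the gaps are logged before joining.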
    drivers_without_fleet = antijoin(drivers, fleets, key='fleet_uuid_in_backend')
    drivers_without_user = antijoin(drivers, users, key='user_uuid_in_backend')
    write_to_log(drivers_without_fleet, 'drivers without fleet', 'backend')
    write_to_log(drivers_without_user, 'drivers without user', 'backend')

    drivers_n_fleets = join(drivers, fleets, key='fleet_uuid_in_backend').cutout('fleet_uuid_in_backend')
    backend_drivers = join(drivers_n_fleets, users, key='user_uuid_in_backend')
    backend_drivers = backend_drivers.addfield('backend_username', lambda rec: rec['backend_username_in_backend'])
    backend_drivers = backend_drivers.cutout('driver_uuid_in_backend')

    backend_drivers = standardize_missing_values(backend_drivers)
    write_to_log(backend_drivers, 'drivers', 'backend')

    return backend_drivers
Example #6
File: joins.py Project: DeanWay/petl

# antijoin()
############

import petl as etl
table1 = [['id', 'colour'],
          [0, 'black'],
          [1, 'blue'],
          [2, 'red'],
          [4, 'yellow'],
          [5, 'white']]
table2 = [['id', 'shape'],
          [1, 'circle'],
          [3, 'square']]
table3 = etl.antijoin(table1, table2, key='id')
table3


# lookupjoin()
##############

import petl as etl
table1 = [['id', 'color', 'cost'], 
          [1, 'blue', 12], 
          [2, 'red', 8], 
          [3, 'purple', 4]]
table2 = [['id', 'shape', 'size'], 
          [1, 'circle', 'big'], 
          [1, 'circle', 'small'], 
          [2, 'square', 'tiny'],