import petl as pt


def DataIntegration(clinics_LOC, Services_LOC, Location_LOC):
    # Read the clinics.csv file
    fileData = pt.fromcsv(clinics_LOC)
    # Read the clinic_services.csv file
    servicesData = pt.fromcsv(Services_LOC)
    # Read the cliniclocations.xml file
    locationXML = pt.fromxml(Location_LOC, 'clinic', {
        "ClinicID": "ClinicID",
        "Lat": "Lat",
        "Lon": "Lon"
    })
    # Join the two CSV tables using ClinicID as the key
    fileJoin = pt.join(servicesData, fileData, key="ClinicID")
    # Join the XML locations onto the result, again on ClinicID
    MainJoin = pt.join(fileJoin, locationXML, key="ClinicID")
    # Keep only the required columns
    result = pt.cut(MainJoin, 'ClinicServiceID', 'Service', 'ClinicID', 'Suburb',
                    'Postcode', 'Lat', 'Lon')
    # Write the final CSV file, clinic_service_locations.csv
    pt.tocsv(result, "clinic_service_locations.csv")
    print('CSV file generated.')
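# A minimal call sketch for the function above. The file names are inferred
# from the comments in the function and are otherwise hypothetical.
DataIntegration('clinics.csv', 'clinic_services.csv', 'cliniclocations.xml')
# Produces clinic_service_locations.csv in the working directory.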
def _shape_data(self, raw_planets: etl.Table, raw_people: etl.Table) -> etl.Table:
    planets = etl.cut(raw_planets, (
        Planet.Columns.NAME,
        Planet.Columns.URL,
    ))
    people = etl.cut(
        raw_people,
        (
            Person.Columns.NAME,
            Person.Columns.HEIGHT,
            Person.Columns.MASS,
            Person.Columns.HAIR_COLOR,
            Person.Columns.SKIN_COLOR,
            Person.Columns.EYE_COLOR,
            Person.Columns.BIRTH_YEAR,
            Person.Columns.GENDER,
            Person.Columns.HOMEWORLD,
            Person.Columns.EDITED,
        ),
    )
    combined = etl.join(
        planets,
        people,
        lkey=Planet.Columns.URL,
        rkey=Person.Columns.HOMEWORLD,
        lprefix=Planet.PREFIX,
    )
    renamed = etl.rename(
        combined,
        {
            Person.Columns.EDITED: Person.RenamedColumns.DATE,
            Planet.prefix_value(Planet.Columns.NAME): Person.Columns.HOMEWORLD,
        },
    )
    converted = etl.convert(
        renamed,
        {
            Person.RenamedColumns.DATE: lambda v: parse(v).date(),
        })
    return etl.cut(
        converted,
        (
            Person.Columns.NAME,
            Person.Columns.HEIGHT,
            Person.Columns.MASS,
            Person.Columns.HAIR_COLOR,
            Person.Columns.SKIN_COLOR,
            Person.Columns.EYE_COLOR,
            Person.Columns.BIRTH_YEAR,
            Person.Columns.GENDER,
            Person.Columns.HOMEWORLD,
            Person.RenamedColumns.DATE,
        ),
    )
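# _shape_data relies on Planet and Person constant holders defined elsewhere
# (and `parse` is presumably dateutil.parser.parse). A purely illustrative
# sketch of what those holders might look like, inferred only from the
# attributes used above; the string values are assumptions.
class Planet:
    PREFIX = 'planet_'

    class Columns:
        NAME = 'name'
        URL = 'url'

    @classmethod
    def prefix_value(cls, column):
        # e.g. 'name' -> 'planet_name', matching lprefix in the join above
        return cls.PREFIX + column


class Person:
    class Columns:
        NAME = 'name'
        HEIGHT = 'height'
        MASS = 'mass'
        HAIR_COLOR = 'hair_color'
        SKIN_COLOR = 'skin_color'
        EYE_COLOR = 'eye_color'
        BIRTH_YEAR = 'birth_year'
        GENDER = 'gender'
        HOMEWORLD = 'homeworld'
        EDITED = 'edited'

    class RenamedColumns:
        DATE = 'date'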
def join(data, strategy, source_left, source_right, destination, key_left,
         key_right, prefix_left, prefix_right, presorted, buffersize, tempdir,
         cache, missing):
    """Perform a join on two data tables."""
    source_left = data.get(source_left)
    source_right = data.get(source_right)
    kwargs = {}
    if key_left == key_right:
        kwargs['key'] = key_left
    else:
        kwargs['lkey'] = key_left
        kwargs['rkey'] = key_right
    if presorted is True:
        kwargs['presorted'] = presorted
    if buffersize is not None:
        kwargs['buffersize'] = buffersize
    if tempdir:
        kwargs['tempdir'] = tempdir
    if 'anti' not in strategy:
        if prefix_left is not None:
            kwargs['lprefix'] = prefix_left
        if prefix_right is not None:
            kwargs['rprefix'] = prefix_right
    if strategy not in ['join', 'antijoin', 'hashjoin', 'hashantijoin']:
        kwargs['missing'] = missing

    if strategy == 'join':
        o = petl.join(source_left, source_right, **kwargs)
    elif strategy == 'leftjoin':
        o = petl.leftjoin(source_left, source_right, **kwargs)
    elif strategy == 'lookupjoin':
        o = petl.lookupjoin(source_left, source_right, **kwargs)
    elif strategy == 'rightjoin':
        o = petl.rightjoin(source_left, source_right, **kwargs)
    elif strategy == 'outerjoin':
        o = petl.outerjoin(source_left, source_right, **kwargs)
    elif strategy == 'antijoin':
        o = petl.antijoin(source_left, source_right, **kwargs)
    elif strategy == 'hashjoin':
        o = petl.hashjoin(source_left, source_right, **kwargs)
    elif strategy == 'hashantijoin':
        o = petl.hashantijoin(source_left, source_right, **kwargs)
    elif strategy == 'hashleftjoin':
        o = petl.hashleftjoin(source_left, source_right, **kwargs)
    elif strategy == 'hashlookupjoin':
        o = petl.hashlookupjoin(source_left, source_right, **kwargs)
    elif strategy == 'hashrightjoin':
        o = petl.hashrightjoin(source_left, source_right, **kwargs)
    data.set(destination, o)
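# The long if/elif chain above maps each strategy name one-to-one onto a petl
# function of the same name, so a lookup keeps the mapping in one place. This
# is a sketch of the idea, not the original code; it assumes the same strategy
# strings as above, all of which petl exposes as top-level functions.
import petl


def resolve_join(strategy):
    # e.g. 'hashleftjoin' -> petl.hashleftjoin
    return getattr(petl, strategy)

# o = resolve_join(strategy)(source_left, source_right, **kwargs)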
def createFacts(events, users):
    try:
        events_uid = etl.cutout(events, 'tracking_id', 'utm_medium', 'utm_campaign')
        events_tui = etl.cutout(events, 'user_id')

        stage_uid = etl.join(users, events_uid, key='user_id')
        stage_tui = etl.join(users, events_tui, key='tracking_id')
        stage_utm = etl.cut(stage_tui, 'user_id', 'utm_medium', 'utm_campaign')
        stage_uid_utm = etl.join(stage_uid, stage_utm, key='user_id')
        stage_m_s = etl.mergesort(stage_uid_utm, stage_tui, key=['created_at', 'email'])

        mappings = OrderedDict()
        mappings['tid'] = 'tracking_id'
        mappings['uid'] = 'user_id'
        mappings['utm_medium'] = 'utm_medium'
        mappings['utm_campaign'] = 'utm_campaign', {'audio': 'none', 'social': 'none'}
        mappings['utm_campaigntype'] = 'utm_campaign'
        mappings['email'] = 'email'
        mappings['subscription'] = 'type'
        mappings['sub_order'] = 'type', {'Signup Completed': '1',
                                         'Trial Started': '2',
                                         'Subscription Started': '3',
                                         'Subscription Ended': '4'}
        mappings['created_at'] = 'created_at'

        # Mapping
        stage_mapping = etl.fieldmap(stage_m_s, mappings)

        # Sort
        stage_mapping_ordered = etl.sort(stage_mapping,
                                         key=['created_at', 'email', 'sub_order'])

        # Datetime split
        t1 = etl.split(stage_mapping_ordered, 'created_at', 'T', ['date', 'time'],
                       include_original=True)
        t2 = etl.split(t1, 'date', '-', ['year', 'month', 'day'])
        stage_ready = etl.split(t2, 'time', ':', ['hour', 'minute', 'second'])

        # Export as csv to load folder
        etl.tocsv(stage_ready, 'load/facts.csv')
    except Exception as e:
        print("Something went wrong. Error {0}".format(e))
def xref_symbol_reports():
    symbol_reports = [
        f for f in os.listdir()
        if re.match(r'OCLC Datasync Unresolved.*\.csv', f)
    ]

    today = str(date.today())

    for report in symbol_reports:
        symbol_split = re.split(r'^.*processing.(M[A-Z]{2}).*$', report)
        symbol = symbol_split[1]
        xlsx_outfile = symbol + '_datasync_unresolved_' + today + '.xlsx'
        xls_outfile = symbol + '_datasync_unresolved_' + today + '.xls'
        txt_outfile = symbol + '_staging_OCNs_' + today + '.txt'

        symbol_table_raw = etl.fromcsv(report, encoding='utf-8')
        symbol_table = etl.rename(symbol_table_raw, '\ufeffMMS Id', 'MMS ID')
        symbol_table2 = etl.select(symbol_table, "{MMS ID} is not None")
        symbol_table_sorted = etl.sort(symbol_table2, 'MMS ID')

        xref_table = etl.fromcsv('unresxref.csv')
        xref_table2 = etl.select(xref_table, "{MMS ID} is not None")
        xref_table_sorted = etl.sort(xref_table2, 'MMS ID')

        symbol_xref_table = etl.join(symbol_table_sorted, xref_table_sorted,
                                     presorted=True, lkey="MMS ID", rkey="MMS ID")

        try:
            etl.toxlsx(symbol_xref_table, xlsx_outfile, encoding='utf-8')
        except TypeError:
            etl.toxls(symbol_xref_table, xls_outfile, 'Sheet1', encoding='utf-8')

        staging_ocns_table = etl.cut(symbol_xref_table, 'Staging OCN')
        template = '{Staging OCN}\n'
        etl.totext(staging_ocns_table, txt_outfile, template=template)
def kcmo_convert(filepath, xtrapath):
    """
    Takes the file path to a CSV in the format used by Kansas City proper,
    converts it to the universal format, and outputs a CSV.
    """
    kcmo = etl.fromcsv(filepath)
    kcx = etl.fromxlsx(xtrapath)
    kcjoin = etl.join(kcmo, kcx, lkey='POLEID', rkey='IDNumber')
    del kcmo
    del kcx

    kcjoin = etl.addfield(kcjoin, 'PoleID', lambda x: x['POLEID'])
    kcjoin = etl.addfield(kcjoin, 'Longitude', lambda x: geom_to_tuple(x['the_geom'])[0])
    kcjoin = etl.addfield(kcjoin, 'Latitude', lambda x: geom_to_tuple(x['the_geom'])[1])
    kcjoin = etl.addfield(kcjoin, 'LightbulbType', lambda x: x['LUMINAIRE TYPE'])
    kcjoin = etl.addfield(kcjoin, 'Wattage', lambda x: x['WATTS'])
    kcjoin = etl.addfield(kcjoin, 'Lumens', None)
    kcjoin = etl.addfield(
        kcjoin, 'LightAttributes',
        lambda x: make_a_list(
            x['ATTACHMENT 10'], x['ATTACHMENT 9'], x['ATTACHMENT 8'],
            x['ATTACHMENT 7'], x['ATTACHMENT 6'], x['ATTACHMENT 5'],
            x['ATTACHMENT 4'], x['ATTACHMENT 3'], x['ATTACHMENT 2'],
            x['ATTACHMENT 1'], x['SPECIAL_N2'], x['SPECIAL_NO']))
    kcjoin = etl.addfield(kcjoin, 'AttachedTech', lambda x: bool(x['LightAttributes']))
    kcjoin = etl.addfield(
        kcjoin, 'FiberWiFiEnable',
        lambda x: find_wifi(*x['LightAttributes'], x['SPECIAL_N2'], x['SPECIAL_NO']))
    kcjoin = etl.addfield(kcjoin, 'PoleType', lambda x: x['POLE TYPE'])
    kcjoin = etl.addfield(kcjoin, 'PoleOwner', lambda x: x['POLE OWNER'])
    kcjoin = etl.addfield(kcjoin, 'DataSource', 'Kansas City')
    kcjoin = etl.cut(kcjoin, 'PoleID', 'Longitude', 'Latitude', 'LightbulbType',
                     'Wattage', 'Lumens', 'AttachedTech', 'LightAttributes',
                     'FiberWiFiEnable', 'PoleType', 'PoleOwner', 'DataSource')
    etl.tocsv(kcjoin, 'data/kcmo_clean.csv')
def join_execute(cl, cr, join, **kwargs):
    cl, cr = cl(), cr()
    if 'addLfields' in kwargs:
        cl = etl.addfields(cl, kwargs['addLfields'])
    if 'addRfields' in kwargs:
        cr = etl.addfields(cr, kwargs['addRfields'])
    args = cl, cr
    if join == Join.UNION:
        c = etl.crossjoin(*args)
    else:
        kwargs = filter_keys(kwargs, ("key", "lkey", "rkey", "missing",
                                      "presorted", "buffersize", "tempdir",
                                      "cache"))
        if join == Join.INNER:
            c = etl.join(*args, **kwargs)
        elif join == Join.LEFT:
            c = etl.leftjoin(*args, **kwargs)
        elif join == Join.RIGHT:
            c = etl.rightjoin(*args, **kwargs)
        elif join == Join.FULL:
            c = etl.outerjoin(*args, **kwargs)
    return c
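# join_execute expects a Join enumeration that is defined elsewhere. A minimal
# sketch of one plausible definition; the member names come from the branches
# above, everything else is an assumption.
from enum import Enum, auto


class Join(Enum):
    UNION = auto()   # handled with etl.crossjoin above
    INNER = auto()
    LEFT = auto()
    RIGHT = auto()
    FULL = auto()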
def xls_tidy(xls, qvalue):
    d = etl.fromtsv(xls)
    sd = etl.select(d, lambda x: float(x.PepQValue) <= float(qvalue))
    psmsummary = sd

    ssd = etl.cut(sd, 'Peptide', 'Protein', 'PepQValue')
    # remove the mod info in peptide
    ssd = etl.transform.regex.sub(ssd, 'Peptide', r'^[\w-]\.(.+)\.[\w-]$', r'\1')
    ssd = etl.transform.regex.sub(ssd, 'Peptide', r'[\d\.\+]+', r'')

    aggregation = OrderedDict()
    aggregation['SpecCount'] = len
    cssd = etl.aggregate(ssd, 'Peptide', aggregation)

    fssd = etl.groupselectfirst(ssd, key=('Peptide', 'Protein', 'PepQValue'))

    aggregation = OrderedDict()
    aggregation['Protein'] = 'Protein', etl.strjoin(';')
    aggregation['PepQValue'] = 'PepQValue', etl.strjoin(';')
    assd = etl.aggregate(fssd, 'Peptide', aggregation)

    pepsummary = etl.join(assd, cssd, key='Peptide')

    return (psmsummary, pepsummary)
# Convenience function to convert values under the given field using a
# regular expression substitution.
def substitute(table, field, pattern, repl, count=0, flags=0):
    program = re.compile(pattern, flags)
    convert = lambda tempData: program.sub(repl, tempData, count=count)
    return pt.convert(table, field, convert)


# Read the CSV files using the petl framework.
# Read the services CSV file
fileData = pt.fromcsv('services.csv')
# Read the clinicservices CSV file
servicesData = pt.fromcsv('clinicservices.csv')
# Join the two tables using ServiceID as the key
fileJoin = pt.join(servicesData, fileData, key="ServiceID")
# Read the clinics CSV file
readCsv = pt.fromcsv('clinics.csv')
# Join onto the clinics table using ClinicID as the key
doubleJoin = pt.join(fileJoin, readCsv, key='ClinicID')
# Read the cliniclocations.xml file
locationXML = pt.fromxml('cliniclocations.xml', 'clinic', {
    "ClinicID": "ClinicID",
    "Lat": "Lat",
    "Lon": "Lon"
})
# Join the locations onto the result using ClinicID as the key
doubleJoin2 = pt.join(doubleJoin, locationXML, key="ClinicID")
# Remove whitespace from the Email field
cleanOne = substitute(doubleJoin2, 'Email', r'\s', '')
# adding @myclinic.com.au behind every email id
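# The snippet stops at the comment about appending the domain. One way that
# step might look, sketched with petl's convert; the field and domain come
# from the comment above, the variable name is an assumption.
withDomain = pt.convert(cleanOne, 'Email', lambda v: v + '@myclinic.com.au')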
def dimension_values():
    connection = psycopg2.connect(dbname='voyager',
                                  user='******',
                                  password='******',
                                  host='172.16.0.45')
    engine = create_engine('postgresql://*****:*****@172.16.0.45:5432/voyager')

    com = 'select id as id_component, name as component from dim_com'
    table_com = etl.fromdb(connection, com)
    loc = 'select id as id_location, name as name from dim_loc'
    table_loc = etl.fromdb(connection, loc)
    tim = 'select id as id_time, time as timestamp from dim_time'
    table_time = etl.fromdb(connection, tim)
    print(table_com)
    print(table_loc)
    print(table_time)

    for ran in range(0, 65424, 1000):
        sql = "select * from KNMI_station_data kk " \
              "RIGHT JOIN weatherstations w ON " \
              " CAST (kk.weather_station_id AS INTEGER) = CAST (w.station_number AS INTEGER) " \
              "WHERE w.station_number NOT LIKE 'NL%%' AND date > 20190901 LIMIT 1000 OFFSET %s" % ran
        print(sql)
        table = etl.fromdb(connection, sql)
        print('knmi')
        print(table)
        table.log_progress()

        table = etl.convert(table, 'date', str)
        table = etl.convert(table, 'hour', str)
        table = etl.convert(table, 'temperature', int)
        table = etl.convert(table, 'temperature_dew', int)
        table = etl.convert(table, 'temperature_min', int)
        table = etl.convert(table, 'wind_speed_avg', int)
        table = etl.convert(table, 'wind_speed', int)
        table = etl.convert(table, 'wind_speed_max', int)
        table = etl.convert(table, 'temperature', lambda v: v / 10)
        table = etl.convert(table, 'temperature_dew', lambda v: v / 10)
        table = etl.convert(table, 'temperature_min', lambda v: v / 10)
        table = etl.convert(table, 'wind_speed_avg', lambda v: v / 10)
        table = etl.convert(table, 'wind_speed', lambda v: v / 10)
        table = etl.convert(table, 'wind_speed_max', lambda v: v / 10)

        df = pd.DataFrame(table)
        df.columns = df.iloc[0]
        df = df.drop(0)
        df['timestamp'] = df['date'] + df['hour']
        df['weather_station_id'] = df['weather_station_id'].astype(str)
        df['timestamp'] = df['timestamp'].apply(custom_to_datetime)
        df['timestamp'] = df['timestamp'].astype(str)
        df = df.drop(columns=['date', 'hour'], axis=1)

        final_knmi_table = etl.fromdataframe(df)
        final_knmi_table = etl.melt(final_knmi_table, key=[
            'weather_station_id', 'timestamp', 'id', 'latitude', 'longitude',
            'name', 'station_number', 'data_source_id', 'altitude'
        ])
        final_knmi_table = etl.rename(final_knmi_table, 'variable', 'component')
        print(final_knmi_table)

        final_knmi_table2 = etl.join(final_knmi_table, table_com, key='component')
        final_knmi_table2 = etl.join(final_knmi_table2, table_loc, key='name')
        final_knmi_table2 = etl.join(final_knmi_table2, table_time, key='timestamp')
        print('dos')
        print(final_knmi_table2)

        df = pd.DataFrame(final_knmi_table2)
        df.columns = df.iloc[0]
        df = df.drop(0)
        fact_source = df[[
            'id_component', 'id_location', 'id_time', 'value',
            'data_source_id', 'weather_station_id'
        ]]
        print(fact_source)
        fact_source.to_sql('fact_source', engine, if_exists='append',
                           index=False, method='multi')

    for rn in range(0, 1148, 1000):
        print('lmn')
        final_lmn_table = etl.fromdb(
            connection,
            "select ld.id, ld.station_number, ld.value, ld.timestamp, ls.name as component, "
            "ws.id as lid, ws.latitude, ws.longitude, ws.data_source_id, ws.altitude, ws.name as name"
            " from luchtmeetnet_data ld "
            "right join luchtmeetnet_sensors ls on ld.formula = ls.formula "
            " join weatherstations ws on ld.station_number = ws.station_number "
            "where ws.station_number like 'NL%%' AND timestamp > '2019-09-01' "
            "LIMIT 1000 OFFSET %s" % rn)
        final_lmn_table = etl.rename(final_lmn_table,
                                     {'station_number': 'weather_station_id'})
        final_lmn_table = etl.movefield(final_lmn_table, 'timestamp', 1)
        # print(final_lmn_table)
        # print(table_com)
        final_lmn_table2 = etl.join(final_lmn_table, table_com, key='component')
        # print(final_lmn_table2)
        final_lmn_table2 = etl.join(final_lmn_table2, table_loc, key='name')
        # print(final_lmn_table2)

        df = pd.DataFrame(final_lmn_table2)
        df.columns = df.iloc[0]
        df = df.drop(0)
        df['timestamp'] = df['timestamp'].str[:-6]
        # print(df)

        final_lmn_table2 = etl.fromdataframe(df)
        final_lmn_table2 = etl.join(final_lmn_table2, table_time, key='timestamp')
        print(final_lmn_table2)

        final_lmn_df = pd.DataFrame(final_lmn_table2)
        final_lmn_df.columns = final_lmn_df.iloc[0]
        final_lmn_df = final_lmn_df.drop(0)
        fact_source = final_lmn_df[[
            'id_component', 'id_location', 'id_time', 'value',
            'data_source_id', 'weather_station_id'
        ]]
        print(fact_source)
        fact_source.to_sql('fact_source', engine, if_exists='append',
                           index=False, method='multi')
def sales_summary(start_dt=None, end_dt=None):
    """Tally up gross (sale over list) profits.

    TODO: tally up net profits (gross profit vs inventory purchase total)

    Keyword Arguments:
        start_dt {[type]} -- datetime for start of query (default: {None})
        end_dt {[type]} -- datetime for end of query (default: {None})

    Returns:
        [dict] -- various types of sales information, stored in a dictionary.
    """
    # products = db.session.query(Product).all()
    # sales = db.session.query(Sale).all()

    # retrieve existing tables
    products_records = etl.fromdb(db.engine, 'SELECT * FROM product')
    sales_records = etl.fromdb(db.engine, 'SELECT * FROM sale')

    # join product info to sales data
    sales_data = etl.join(sales_records, products_records,
                          lkey='product_id', rkey='id')

    # prep joined sales data for tabulation
    sales_data = etl.convert(sales_data, 'date', lambda dt: format_date(dt))
    sales_data = etl.sort(sales_data, 'date')
    sales_data = etl.convert(sales_data, 'quantity',
                             lambda q: handle_none(q, replace_with=1))
    sales_data = etl.addfield(sales_data, 'profit',
                              lambda rec: calculate_profit(rec))
    sales_data = etl.addfield(sales_data, 'gross_sales',
                              lambda rec: calculate_gross_sales(rec))

    # summarize data into charting-friendly data structures
    chart_count = etl.fold(sales_data, 'date', operator.add, 'quantity',
                           presorted=True)
    chart_count = etl.rename(chart_count, {'key': 'x', 'value': 'y'})
    chart_count, chart_count_missing_date = etl.biselect(
        chart_count, lambda rec: rec.x is not None)

    chart_gross = etl.fold(sales_data, 'date', operator.add, 'gross_sales',
                           presorted=True)
    chart_gross = etl.rename(chart_gross, {'key': 'x', 'value': 'y'})
    chart_gross, chart_gross_missing_date = etl.biselect(
        chart_gross, lambda rec: rec.x is not None)

    chart_profit = etl.fold(sales_data, 'date', operator.add, 'profit',
                            presorted=True)
    chart_profit = etl.rename(chart_profit, {'key': 'x', 'value': 'y'})
    chart_profit, chart_profit_missing_date = etl.biselect(
        chart_profit, lambda rec: rec.x is not None)

    # tabulate some figures
    gross_sales = 0
    profits = 0
    for sale in etl.dicts(sales_data):
        profits += calculate_profit(sale)
        gross_sales += calculate_gross_sales(sale)

    return {
        'gross_sales': gross_sales,
        'profits': profits,
        'chart_gross': list(etl.dicts(chart_gross)),
        'chart_gross_missing_date': list(etl.dicts(chart_gross_missing_date)),
        'chart_profit': list(etl.dicts(chart_profit)),
        'chart_profit_missing_date': list(etl.dicts(chart_profit_missing_date)),
        'chart_count': list(etl.dicts(chart_count)),
        'chart_count_missing_date': list(etl.dicts(chart_count_missing_date))
    }
actors = etl.pushheader(actors, ['id', 'first_name', 'last_name', 'gender'])

# actorfullname table
originalCursor.execute('SELECT * FROM actorfullname')
actorfullname = originalCursor.fetchall()
actorfullname = etl.pushheader(actorfullname, ['full_name', 'id'])

# roles table w/o role
originalCursor.execute('SELECT movie_id, actor_id FROM roles')
actorIdOnly = originalCursor.fetchall()
actorIdOnly = etl.pushheader(actorIdOnly, ['movie_id', 'actor_id'])

#### Denormalizing Original Tables ####

# Denormalize movies_directors into movies
moviesAndDirectors = etl.join(movies, movies_directors, key='movie_id')

# Denormalize roles into movies
moviesAndDirectorsAndRoles = etl.join(moviesAndDirectors, actorIdOnly, key='movie_id')

# Add fullname to actors
actors = etl.join(actors, actorfullname, key='id')

# Denormalize roles into actors
"""
actorsAndRoles = etl.join(
    actors, actorIdOnly, lkey='id', rkey='actor_id')
"""

# Add fullname to directors
directors = etl.join(directors, directorfullname, key='id')
def append_tailings_reports_to_code_required_reports(connection, commit=False):
    src_table = etl.fromdb(
        connection,
        'SELECT exp_doc.mine_guid, exp_doc.exp_document_guid, req_doc.req_document_name, exp_doc.due_date, exp_doc.exp_document_status_code, exp_doc.received_date, exp_doc.active_ind, exp_doc_x.mine_document_guid, exp_doc.create_user, exp_doc.create_timestamp, exp_doc.update_user, exp_doc.update_timestamp from mine_expected_document exp_doc \
        inner join mine_expected_document_xref exp_doc_x on exp_doc.exp_document_guid = exp_doc_x.exp_document_guid\
        inner join mds_required_document req_doc on req_doc.req_document_guid = exp_doc.req_document_guid'
    )

    req_document_crr_definition_map = [
        ['req_document_name', 'mine_report_definition_id'],
        ['Summary of TSF and Dam Safety Recommendations', 28],
        ['ITRB Activities Report', 27],
        ['Register of Tailings Storage Facilities and Dams', 47],
        ['Dam Safety Inspection (DSI) Report', 26],
        ['Dam Safety Review (DSR) Report', 31],
        ['“As-built” Reports', 32],
        ['Annual Reclamation', 25],
        ['MERP Record of Testing', 3],
        # ['Annual Manager\'s Report', __________________ ], no mapping or data, ignore.
        ['OMS Manual', 33],
        ['Annual reconciliation of water balance and water management plans', 44],
        ['TSF risk assessment', 46],
        ['Mine Emergency Preparedness and Response Plan (MERP)', 24],
        ['Performance of high risk dumps', 29]
    ]

    table1 = etl.join(src_table, req_document_crr_definition_map, 'req_document_name')
    mine_report = etl.cutout(table1, 'req_document_name')

    # to be inserted into db
    mine_report = etl.addfield(mine_report, 'submission_year', 2019)
    mine_report = etl.rename(mine_report, 'exp_document_status_code',
                             'mine_report_submission_status_code')
    mine_report = etl.addfield(mine_report, 'deleted_ind', lambda x: not x.active_ind)
    mine_report = etl.cutout(mine_report, 'active_ind')

    # to determine what FK's will be so we can insert into related tables
    max_report_id = etl.fromdb(
        connection,
        'select last_value from public.mine_report_mine_report_id_seq')[1][0]
    max_report_submission_id = etl.fromdb(
        connection,
        'select last_value from public.mine_report_submission_mine_report_submission_id_seq')[1][0]

    # if a sequence hasn't been used yet, fix the off-by-one
    if max_report_id == 1:
        max_report_id = 0
    if max_report_submission_id == 1:
        max_report_submission_id = 0

    # get one-to-many
    mine_report, mine_report_submission_documents = etl.unjoin(
        mine_report, 'mine_document_guid', key='exp_document_guid')

    # add PK's for mappings
    mine_report_with_ids = etl.addrownumbers(mine_report,
                                             start=max_report_id + 1,
                                             step=1,
                                             field='mine_report_id')
    mine_report_with_ids = etl.addrownumbers(mine_report_with_ids,
                                             start=max_report_submission_id + 1,
                                             step=1,
                                             field='mine_report_submission_id')
    print(f'max_report_id= {max_report_id}, max_report_submission_id={max_report_submission_id}')

    # copy out fields for submission tables
    mine_report_submissions = etl.cut(mine_report_with_ids, [
        'mine_report_id', 'exp_document_guid', 'mine_report_submission_status_code',
        'create_user', 'create_timestamp', 'update_user', 'update_timestamp'
    ])
    mine_report_submissions = etl.addfield(mine_report_submissions,
                                           'submission_date',
                                           lambda x: x.create_timestamp)

    # remove fields not in mine_report
    mine_report = etl.cutout(mine_report, 'mine_report_submission_status_code')

    # replace exp_document_guid FK with mine_report_submission FK
    submission_id_lookup = etl.cut(mine_report_with_ids,
                                   ['mine_report_submission_id', 'exp_document_guid'])
    mine_report_submission_documents = etl.join(submission_id_lookup,
                                                mine_report_submission_documents,
                                                key='exp_document_guid')
    mine_report_submission_documents = etl.cutout(mine_report_submission_documents,
                                                  'exp_document_guid')

    # remove original PK
    mine_report = etl.cutout(mine_report, 'exp_document_guid')
    mine_report_submissions = etl.cutout(mine_report_submissions, 'exp_document_guid')

    print(etl.valuecounter(etl.distinct(table1, key='exp_document_guid'), 'req_document_name'))
    print(etl.valuecounter(mine_report, 'mine_report_definition_id'))
    print(table1)
    print(mine_report)
    print(mine_report_submissions)
    print(mine_report_submission_documents)

    etl.appenddb(mine_report, connection, 'mine_report', commit=False)
    print('INSERT mine_report staged')
    etl.appenddb(mine_report_submissions, connection, 'mine_report_submission', commit=False)
    print('INSERT mine_report_submission staged')
    etl.appenddb(mine_report_submission_documents, connection, 'mine_report_document_xref', commit=False)
    print('INSERT mine_report_document_xref staged')

    if commit:
        connection.commit()
        print('DATA CREATION COMPLETE')
    else:
        connection.rollback()
        print('NO DATA CREATED: add --commit=true to insert report rows')
table2 = (etl.fromcsv('current_covid.csv')
          .convert('median_age', float)
          .convert('aged_65_older', float)
          .convert('aged_70_older', float))

# Same columns as the table above; the list is declared with a header row
table2_header = [['iso_code', 'median_age', 'aged_65_older', 'aged_70_older']]

table2_data = etl.cut(table2, 'iso_code', 'date', 'median_age',
                      'aged_65_older', 'aged_70_older')
table2_dated = etl.select(table2_data, 'date', lambda v: v == '2020-04-30')
table2_sort = etl.sort(table2_dated, key='iso_code')

count = 0
for j in etl.values(table2_sort, 'iso_code', 'median_age', 'aged_65_older',
                    'aged_70_older'):
    if count == 15:
        break
    table2_header.append(j)
    count = count + 1

table_new = etl.head(table2_header, 15)

# Add the three extra columns as required; join combines the tables on iso_code
final_table = etl.join(table_old, table_new, key='iso_code')
print(final_table)

# Open the CSV file in write mode and export the data
with open('covid.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerows(final_table)
def transfer_data(from_db_conn, to_db_conn):
    '''
    Transfer data between the two connected databases.

    Limitations:
    1. poc.address_id is currently marked as -1 since it was not provided in test data and is a FK non-null constraint
    2. institution2poc table is not available in old schema
    3. role table is already populated in bill.sql file so that table is skipped by this script
    4. poc_poc_id is currently set to be poc_id since no relevant information is available about the column
    5. project2moc_project.role_id column is not available in old schema and is a not null field in new schema so we default it to 1 for now.
    6. project2moc_project.username is not available from old schema so currently set to empty
    7. raw_item_ts.item_id has duplicates when imported from item_ts. So we currently filter out and insert only uniques.

    :param from_db_conn: source database connection
    :param to_db_conn: destination database connection
    '''

    # Emptying out tables with possible foreign key constraint issues
    fk_dep_tables = [
        'poc2project', 'poc2moc_project', 'poc', 'raw_item_ts', 'item',
        'project', 'institution2moc_project'
    ]
    for table_name in fk_dep_tables:
        table = etl.fromdb(to_db_conn, "select * from {} where 1=0".format(table_name))
        etl.todb(table, to_db_conn, table_name)

    # Tables with no change in schema
    insert_as_tables = [
        'institution', 'address', 'item_type', 'item2item', 'catalog_item'
    ]
    for table_name in insert_as_tables:
        table = etl.fromdb(from_db_conn, "select * from {}".format(table_name))
        etl.todb(table, to_db_conn, table_name)

    # Inserting dummy address for constraint matching
    dummy_address = [{'address_id': -1}]
    dummy_address_table = etl.fromdicts(dummy_address)
    etl.appenddb(dummy_address_table, to_db_conn, 'address')

    poc = etl.fromdb(from_db_conn, 'select * from poc')
    poc_transformed = etl.cutout(poc, 'domain_id', 'user_uid')
    poc_dummy_address = etl.replace(poc_transformed, 'address_id', None, -1)
    etl.todb(poc_dummy_address, to_db_conn, 'poc')

    project_names_table = etl.fromdb(from_db_conn,
                                     "select distinct project_name from project")
    moc_project_transformed = etl.addrownumbers(project_names_table)
    moc_project_transformed = etl.rename(moc_project_transformed,
                                         {'row': 'moc_project_id'})
    etl.todb(moc_project_transformed, to_db_conn, 'moc_project')

    domain = etl.fromdb(from_db_conn, "select * from domain")
    domain_table_transformed = etl.cutout(domain, 'domain_uid')
    domain_table_transformed = etl.rename(domain_table_transformed, {
        'domain_id': 'service_id',
        'domain_name': 'service_name'
    })
    etl.todb(domain_table_transformed, to_db_conn, 'service')

    project = etl.fromdb(from_db_conn, "select * from project")
    moc_project = etl.fromdb(to_db_conn, "select * from moc_project")
    project_moc_project_joined = etl.join(project, moc_project, key='project_name')
    project_table_transformed = etl.cutout(project_moc_project_joined, 'project_name')
    project_table_transformed = etl.rename(project_table_transformed, {
        'domain_id': 'service_id',
        'project_uid': 'project_uuid'
    })
    etl.todb(project_table_transformed, to_db_conn, 'project')

    institution2project = etl.fromdb(from_db_conn, "Select * from institution2project")
    project = etl.fromdb(to_db_conn, "select project_id, moc_project_id from project")
    inst2project_project_joined = etl.join(institution2project, project, key='project_id')
    inst2moc_project = etl.cutout(inst2project_project_joined, 'domain_id')
    etl.todb(inst2moc_project, to_db_conn, 'institution2moc_project')

    project2poc = etl.fromdb(from_db_conn, "select * from project2poc")
    project2poc_project_joined = etl.join(project2poc, project, key='project_id')
    poc2moc_project = etl.cutout(project2poc_project_joined, 'project_id', 'domain_id')
    poc2moc_project = etl.addfield(poc2moc_project, 'role_id', 1)
    poc2moc_project = etl.addfield(poc2moc_project, 'poc_poc_id',
                                   lambda rec: rec['poc_id'])
    etl.todb(poc2moc_project, to_db_conn, 'poc2moc_project')

    poc2project = etl.cutout(project2poc, 'domain_id')
    poc2project = etl.addfield(poc2project, 'role_id', 1)
    poc2project = etl.addfield(poc2project, 'username', '')
    etl.todb(poc2project, to_db_conn, 'poc2project')

    item = etl.fromdb(from_db_conn, "select * from item")
    item_transformed = etl.cutout(item, 'domain_id')
    etl.todb(item_transformed, to_db_conn, 'item')

    raw_item_ts_unique = etl.fromdb(
        from_db_conn,
        "WITH summary AS ( SELECT its.item_id, its.start_ts, its.end_ts, its.state, its.catalog_item_id, ROW_NUMBER() OVER(PARTITION BY its.item_id) AS rk FROM ITEM_TS its) SELECT s.* FROM summary s WHERE s.rk = 1"
    )
    raw_item_ts_unique = etl.cutout(raw_item_ts_unique, 'rk')
    etl.todb(raw_item_ts_unique, to_db_conn, 'raw_item_ts')
d_date = etl.addcolumn(d_date, 'date_id', generated)

# country
d_country = etl.distinct(etl.cut(m_table, 'country'))
rows = etl.nrows(d_country)
generated = []
for i in range(rows):
    uuid = out_cursor.execute('SELECT UUID();')
    uuid = out_cursor.fetchone()[0]
    generated.append(uuid)
d_country = etl.addcolumn(d_country, 'country_id', generated)

# movie (fact table)
f_movie = etl.cut(m_table, 'imdb_title_id', 'year', 'date_published', 'genre', 'country')

# foreign key for imdb names id (movie personnel)
f_movie = etl.join(f_movie, tp_table, key='imdb_title_id')
# keep only the necessary columns
f_movie = etl.cut(f_movie, 'imdb_title_id', 'imdb_name_id', 'year',
                  'date_published', 'genre', 'country')

# foreign key for genre id (genre)
f_movie = etl.join(f_movie, d_genre, key='genre')
# keep only the necessary columns
f_movie = etl.cut(f_movie, 'imdb_title_id', 'imdb_name_id', 'year',
                  'date_published', 'genre_id', 'country')

# foreign key for date id (date)
f_movie = etl.join(f_movie, d_date, key=['year', 'date_published'])
# keep only the necessary columns
f_movie = etl.cut(f_movie, 'imdb_title_id', 'imdb_name_id', 'date_id',
                  'genre_id', 'country')

# foreign key for country id (country)
f_movie = etl.join(f_movie, d_country, key='country')
# keep only the necessary columns
f_movie = etl.cut(f_movie, 'imdb_title_id', 'imdb_name_id', 'date_id',
                  'genre_id', 'country_id')

# get the four ratings
def retrieve_rna_data(self):
    """
    Parse 'rna_tissue' csv file, RNA levels in 56 cell lines and 37 tissues
    based on RNA-seq from HPA.

    :return: dict
    """
    self.logger.info('get rna tissue rows into dicts')
    self.logger.debug('melting rna level table into geneid tissue level')

    t_level = (petl.fromcsv(URLZSource(self.rna_level_url), delimiter='\t')
               .melt(key='ID', variablefield='tissue', valuefield='rna_level')
               .rename({'ID': 'gene'})
               .addfield('tissue_label',
                         lambda rec: name_from_tissue(rec['tissue'].strip(), self.t2m))
               .addfield('tissue_code',
                         lambda rec: code_from_tissue(rec['tissue_label'], self.t2m))
               .addfield('anatomical_systems',
                         lambda rec: asys_from_tissue(rec['tissue_label'], self.t2m))
               .addfield('organs',
                         lambda rec: organs_from_tissue(rec['tissue_label'], self.t2m))
               .cutout('tissue'))

    t_value = (petl.fromcsv(URLZSource(self.rna_value_url), delimiter='\t')
               .melt(key='ID', variablefield='tissue', valuefield='rna_value')
               .rename({'ID': 'gene'})
               .addfield('tissue_label',
                         lambda rec: name_from_tissue(rec['tissue'].strip(), self.t2m))
               .addfield('tissue_code',
                         lambda rec: code_from_tissue(rec['tissue_label'], self.t2m))
               .addfield('rna_unit', 'TPM')
               .cutout('tissue'))

    t_zscore = (petl.fromcsv(URLZSource(self.rna_zscore_url), delimiter='\t')
                .melt(key='ID', variablefield='tissue', valuefield='zscore_level')
                .rename({'ID': 'gene'})
                .addfield('tissue_label',
                          lambda rec: name_from_tissue(rec['tissue'].strip(), self.t2m))
                .addfield('tissue_code',
                          lambda rec: code_from_tissue(rec['tissue_label'], self.t2m))
                .cutout('tissue'))

    t_vl = petl.join(t_level, t_value,
                     key=('gene', 'tissue_code', 'tissue_label'),
                     presorted=True)

    t_join = (petl.join(t_vl, t_zscore,
                        key=('gene', 'tissue_code', 'tissue_label'),
                        presorted=True)
              .aggregate('gene',
                         aggregation={
                             'data': (('tissue_code', 'tissue_label', 'rna_level',
                                       'rna_value', 'rna_unit', 'anatomical_systems',
                                       'organs', 'zscore_level'), list)
                         },
                         presorted=True))

    return t_join
import petl as etl
import csv

table1 = etl.fromcsv('covid.csv')

# importing data from xml file and creating table
table2 = etl.fromxml('Country_location.xml', './/tr', ('th', 'td'))
# print(table2)

# removing column country from table
table3 = etl.cutout(table2, 'country')

# merging the covid table with xml data
table4 = etl.join(table1, table3, key='location')
print(table4)

# writing result to csv file
with open('covid_countries.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerows(table4)
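# As an aside, petl can also write the joined table straight to disk, so the
# csv module isn't strictly needed for the last step; a minimal alternative:
etl.tocsv(table4, 'covid_countries.csv')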
stores = etl.fromcsv('stores.csv')

# Open XML document
locations = etl.fromxml('locations.xml', 'store',
                        {'Name': 'Name', 'Lat': 'Lat', 'Lon': 'Lon'})
print(locations)

# Set output
output_table = [["ID", "Name", "Suburb", "State", "Postcode"]]
store_id = 1

# Read through stores.csv to generate output_table
store = etl.cut(stores, 'Name', 'Suburb', 'State', 'Postcode').distinct()
print(store)
for s in etl.values(store, 'Name', 'Suburb', 'State', 'Postcode'):
    # unpack the row tuple so each value lands in its own column
    output_table.append([store_id] + list(s))
    store_id += 1
print(output_table)

# Merge and join XML and CSV together
merge_output = etl.join(stores, locations, key="Name")
print(merge_output)
store_table = etl.cut(merge_output, 'ID', 'Name', 'Suburb', 'State',
                      'Postcode', 'Lat', 'Lon')
print(etl.head(store_table, 5))

# Export to CSV file
etl.tocsv(merge_output, 'store_locations.csv')
# Load a full year (2018) with the simplest datetime breakdown:
# year, month, day, hour, minute, second.
# For the full loading process, see the reference in references.txt.
# This should be a procedure with all the validation logic in place,
# so it can create the next X months when it is called.

# Facts
# This facts table will be the staging table with all the info needed to
# quickly update with the dimension keys and load into the facts table.
# The facts table will have columns matching each column on the dim Time
# table, to make it easier to get the reference key.

events_uid = etl.cutout(events, 'tracking_id', 'utm_medium', 'utm_campaign')
events_tui = etl.cutout(events, 'user_id')

stage_uid = etl.join(users, events_uid, key='user_id')
stage_tui = etl.join(users, events_tui, key='tracking_id')
stage_utm = etl.cut(stage_tui, 'user_id', 'utm_medium', 'utm_campaign')
stage_uid_utm = etl.join(stage_uid, stage_utm, key='user_id')
stage_m_s = etl.mergesort(stage_uid_utm, stage_tui, key=['created_at', 'email'])

# Mapping definitions
mappings = OrderedDict()
mappings['tid'] = 'tracking_id'
mappings['uid'] = 'user_id'
mappings['utm_medium'] = 'utm_medium'
mappings['utm_campaign'] = 'utm_campaign', {'audio': 'none', 'social': 'none'}
mappings['utm_campaign_type'] = 'utm_campaign'
mappings['email'] = 'email'
def joinTables(tableOne, tableTwo, key):
    return etl.join(tableOne, tableTwo, key=key)
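# A quick usage sketch for this wrapper with two made-up in-memory tables;
# the data below is illustrative only.
import petl as etl

left = [['id', 'colour'], [1, 'blue'], [2, 'red']]
right = [['id', 'shape'], [1, 'circle'], [2, 'square']]

# Inner join on the shared 'id' field.
print(etl.look(joinTables(left, right, 'id')))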
def extract_backend(offline=OFFLINE):
    # Done in 4 steps: (1) grab the driver table from the CloudSQL,
    # (2) use the user uuids to query for users one by one through
    # the API, (3) get the fleet table from CloudSQL and (4) join
    # everything together.

    def extract_drivers():
        query = SQLReader('sql.drivers_from_cloudsql')
        drivers_df = sql.execute(query.statements[0])
        drivers_tb = fromdataframe(drivers_df)
        mappings = {
            'driver_uuid': lambda rec: str(UUID(bytes=rec['uuid'], version=4)),
            'fleet_uuid': lambda rec: str(UUID(bytes=rec['fleet_uuid'], version=4)),
            'user_uuid': lambda rec: str(UUID(bytes=rec['user_ds_uuid'], version=4)),
            'fullname': lambda rec: rec['last_name'].strip() + ', ' + rec['first_name'].strip(),
        }
        drivers_tb = drivers_tb.fieldmap(mappings)
        drivers_tb = drivers_tb.suffixheader('_in_backend')
        return drivers_tb

    def extract_users():
        users_records = [api.get_record('users', driver.user_uuid_in_backend)
                         for driver in drivers.namedtuples()]
        users_df = DataFrame().from_records(users_records)
        users_tb = fromdataframe(users_df)
        mappings = {
            'driver_uuid': 'driver',
            'user_uuid': 'uuid',
            'backend_username': '******'
        }
        users_tb = users_tb.fieldmap(mappings)
        users_tb = users_tb.suffixheader('_in_backend')
        return users_tb

    def extract_fleets_from_dwh():
        query = SQLReader('sql.fleets_from_tableau')
        fleets_df = dwh.execute(query.statements[0])
        fleets_tb = fromdataframe(fleets_df)
        mappings = {
            'fleet_uuid': 'uuid',
            'fleetname': lambda rec: rec['backend_name'].replace('_', ' '),
            'country_code': 'country_code',
        }
        fleets_tb = fleets_tb.cutout('country_code')
        fleets_tb = fleets_tb.fieldmap(mappings)
        fleets_tb = fleets_tb.suffixheader('_in_backend')
        return fleets_tb

    if not offline:
        sql = CloudSQLConnector()
        api = ValkfleetConnector()
        dwh = WarehouseConnector()

        drivers = extract_drivers()
        fleets = extract_fleets_from_dwh()
        users = extract_users()

        drivers.topickle(DRIVERS_IN_BACKEND_FILEPATH)
        fleets.topickle(FLEETS_IN_BACKEND_FILEPATH)
        users.topickle(USERS_IN_BACKEND_FILEPATH)
    else:
        drivers = frompickle(DRIVERS_IN_BACKEND_FILEPATH)
        fleets = frompickle(FLEETS_IN_BACKEND_FILEPATH)
        users = frompickle(USERS_IN_BACKEND_FILEPATH)

    write_to_log(drivers, 'drivers', 'backend')
    write_to_log(fleets, 'fleets', 'backend')
    write_to_log(users, 'users', 'backend')

    drivers_without_fleet = antijoin(drivers, fleets, key='fleet_uuid_in_backend')
    drivers_without_user = antijoin(drivers, users, key='user_uuid_in_backend')
    write_to_log(drivers_without_fleet, 'drivers without fleet', 'backend')
    write_to_log(drivers_without_user, 'drivers without user', 'backend')

    drivers_n_fleets = join(drivers, fleets,
                            key='fleet_uuid_in_backend').cutout('fleet_uuid_in_backend')
    backend_drivers = join(drivers_n_fleets, users, key='user_uuid_in_backend')
    backend_drivers = backend_drivers.addfield('backend_username',
                                               lambda rec: rec['backend_username_in_backend'])
    backend_drivers = backend_drivers.cutout('driver_uuid_in_backend')
    backend_drivers = standardize_missing_values(backend_drivers)

    write_to_log(backend_drivers, 'drivers', 'backend')
    return backend_drivers
          [1, 'circle'],
          [1, 'square'],
          [2, 'ellipse']]
table8 = [['id', 'time', 'height'],
          [1, 1, 12.3],
          [1, 2, 34.5],
          [2, 1, 56.7]]
table9 = [['id', 'time', 'weight'],
          [1, 2, 4.5],
          [2, 1, 6.7],
          [2, 2, 8.9]]

from petl import join, look

look(table1)
look(table2)
table3 = join(table1, table2, key='id')
look(table3)

# if no key is given, a natural join is tried
table4 = join(table1, table2)
look(table4)

# note behaviour if the key is not unique in either or both tables
look(table5)
look(table6)
table7 = join(table5, table6, key='id')
look(table7)

# compound keys are supported
look(table8)
look(table9)
table10 = join(table8, table9, key=['id', 'time'])
look(table10)
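# The fragment above references table1, table2, table5, and table6, whose
# definitions were cut off. Minimal definitions in the spirit of petl's own
# join() documentation (the exact values are an assumption) would let the
# calls above run:
table1 = [['id', 'colour'], [1, 'blue'], [2, 'red'], [3, 'purple']]
table2 = [['id', 'shape'], [1, 'circle'], [3, 'square'], [4, 'ellipse']]
# non-unique keys, used to show join behaviour with duplicates
table5 = [['id', 'colour'], [1, 'blue'], [1, 'red'], [2, 'purple']]
table6 = [['id', 'shape'], [1, 'circle'], [1, 'square'], [2, 'ellipse']]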
print("Reading parcels...") dor_parcel_read_stmt = ''' select parcel_id, street_address, address_low, address_low_suffix, address_low_frac, address_high, street_predir, street_name, street_suffix, street_postdir, street_full from {dor_parcel_table} '''.format(dor_parcel_table='dor_parcel') engine_dor_parcel_rows = etl.fromdb(pg_db, dor_parcel_read_stmt) if DEV: print(etl.look(engine_dor_parcel_rows)) # Get duplicate parcel_ids: non_unique_parcel_id_rows = engine_dor_parcel_rows.duplicates(key='parcel_id') unique_parcel_id_rows = etl.complement(engine_dor_parcel_rows, non_unique_parcel_id_rows) # Get address comps for condos by joining to dor_parcel with unique parcel_id on parcel_id: print("Relating condos to parcels...") joined = etl.join(source_dor_condo_rows, unique_parcel_id_rows, key='parcel_id') \ .convert('street_address', lambda a, row: row.street_address + ' # ' + row.unit_num, pass_row=True) print("joined rowcount: ", etl.nrows(joined)) if DEV: print(etl.look(joined)) # Calculate errors print("Calculating errors...") unjoined = etl.antijoin(source_dor_condo_rows, joined, key='source_object_id') print("unjoined rowcount: ", etl.nrows(unjoined)) dor_condos_unjoined_unmatched = etl.antijoin(unjoined, non_unique_parcel_id_rows, key='parcel_id').addfield('reason', 'non-active/remainder mapreg') print("non-active/remainder mapreg error rowcount: ", etl.nrows(dor_condos_unjoined_unmatched)) if DEV: print(etl.look(dor_condos_unjoined_unmatched)) dor_condos_unjoined_duplicates = etl.antijoin(unjoined, dor_condos_unjoined_unmatched, key='source_object_id').addfield('reason', 'non-unique active/remainder mapreg') print("non-unique active/remainder mapreg error rowcount: ", etl.nrows(dor_condos_unjoined_duplicates)) if DEV:
international_code = "(+61)"

with open(IN_FILE, 'r') as infile, open(OUT_FILE, "w") as outfile:
    csv_reader = csv.reader(infile)
    writer = csv.writer(outfile)
    headers = next(csv_reader, None)  # skipping header row
    writer.writerow(headers)
    for row in csv_reader:
        number_column = row[5]
        state_column = row[3]
        clean_num = re.sub(r"\D", "", row[5])[-8:]
        formatted_num = (international_code + " " +
                         regional_code[state_column] + " " + clean_num)
        row[5] = formatted_num
        writer.writerow(row)

services = petl.fromcsv(SERVICES_FILE)

offices = petl.fromcsv(OUT_FILE)
offices = offices.rename({"Contact Name": "Office", "Phone Number": "Phone"})
offices = petl.cutout(offices, "State", "Postcode")

locations = petl.fromcsv(LOC_FILE)
locations = locations.rename({"officeID": "OfficeID"})

office_service = petl.join(services, offices, key='OfficeID')
office_service_locations = petl.join(office_service, locations, key='OfficeID')
office_service_locations = petl.convert(office_service_locations, 'OfficeServiceID', int)
office_service_locations = petl.sort(office_service_locations, 'OfficeServiceID')
petl.tocsv(office_service_locations, 'office_service_locations.csv')