Example #1
    def _petl_transform(self, record_set):
        if "transform" in self.task:
            transform = self.task["transform"]
            if "convert" in transform:
                conversions = {}
                for field, func in transform["convert"]:
                    conversions[field] = func
                record_set = etl.convert(record_set, conversions)

            if "filter" in transform:
                record_set = etl.select(record_set, transform["filter"])

            if "remove" in transform:
                cuts = []
                for field in transform["remove"]:
                    cuts.append(field)
                record_set = etl.cutout(record_set, cuts)

            if "rename" in transform:
                names = {}
                for old, new_one in transform["rename"]:
                    names[old] = new_one
                record_set = etl.rename(record_set, names)

        return record_set
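
For context, a minimal sketch of the "transform" layout this method consumes, with the equivalent petl calls applied step by step; the field names and sample rows are illustrative assumptions, not taken from the original project.

import petl as etl

record_set = [['sku', 'price', 'qty', 'internal_id'],
              ['a', '1.5', '3', 10],
              ['b', '2.0', '0', 11]]

transform = {
    'convert': [('price', float), ('qty', int)],   # (field, converter) pairs
    'filter': lambda row: row['qty'] > 0,          # predicate handed to etl.select
    'remove': ['internal_id'],                     # field names handed to etl.cutout
    'rename': [('qty', 'quantity')],               # (old, new) pairs for etl.rename
}

record_set = etl.convert(record_set, dict(transform['convert']))
record_set = etl.select(record_set, transform['filter'])
record_set = etl.cutout(record_set, *transform['remove'])
record_set = etl.rename(record_set, dict(transform['rename']))
print(etl.look(record_set))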
Example #2
    def synctable(self, sourceDb, targetDb, sourceTable, targetTable):
        sourceCursor = sourceDb.cursor()
        targetCursor = targetDb.cursor()
        affected_total = 0
        init_rowCount = targetTable.rowCount if targetTable.rowCount < sourceTable.rowCount else sourceTable.rowCount
        pbar = tqdm(total=sourceTable.rowCount, unit='records')
        pbar.update(init_rowCount)
        while sourceTable.lastUpdatetime > targetTable.lastUpdatetime:
            affected_rows = 0
            batchSize = 100000
            sql = "SELECT * FROM (SELECT * FROM {schema}.{tablename} WHERE {timestamp}>=to_timestamp('{last_updatetime}','yyyy-mm-dd hh24:mi:ss.ff6') ORDER BY {timestamp}) WHERE ROWNUM<={batch_size}".format(
                timestamp=sourceTable.timestampField,
                schema=sourceTable.schema,
                tablename=sourceTable.tablename,
                last_updatetime=targetTable.lastUpdatetime,
                batch_size=batchSize)
            sourceRecord = etl.fromdb(lambda: CursorProxy(sourceDb.cursor()),
                                      sql)
            targetRecord = etl.fromdb(
                lambda: CursorProxy(targetDb.cursor()),
                "SELECT * FROM {schema}.{tablename} WHERE 1=0".format(
                    schema=targetTable.schema,
                    tablename=targetTable.tablename))
            sourceTable.columns = etl.header(sourceRecord)
            targetTable.columns = etl.header(targetRecord)
            for column in list(
                    set(sourceTable.columns) - set(targetTable.columns)):
                sourceRecord = etl.cutout(sourceRecord, column)
            max_updatetime = sourceRecord.cut(
                sourceTable.timestampField).skip(1).max()[0]
            sourceRecord = sourceRecord.sort(sourceTable.timestampField)
            etl.appenddb(sourceRecord,
                         CursorProxy(targetCursor),
                         targetTable.tablename,
                         schema=targetTable.schema,
                         commit=True)
            affected_rows += targetCursor.rowcount
            targetTable.lastUpdatetime = max_updatetime.strftime(
                '%Y-%m-%d %H:%M:%S.%f')
            targetTable.rowCount += affected_rows
            pbar.update(affected_rows if init_rowCount + affected_total +
                        affected_rows < sourceTable.rowCount else
                        sourceTable.rowCount - init_rowCount - affected_total)
            affected_total += affected_rows
            pbar.set_description("%s |%d records updated." %
                                 (targetTable.tablename, affected_total))

        if targetTable.lastUpdatetime > sourceTable.lastUpdatetime:
            pbar.set_description("%s |timestamp >, skip." %
                                 (targetTable.tablename))
        elif targetTable.lastUpdatetime == sourceTable.lastUpdatetime and targetTable.rowCount == sourceTable.rowCount:
            pbar.set_description("%s |no data change." %
                                 (targetTable.tablename))
        elif targetTable.lastUpdatetime == sourceTable.lastUpdatetime and targetTable.rowCount > sourceTable.rowCount:
            pbar.set_description("%s |RowCount > but timestamp ==, skip." %
                                 (targetTable.tablename))
        elif targetTable.lastUpdatetime == sourceTable.lastUpdatetime and targetTable.rowCount < sourceTable.rowCount:
            pbar.set_description("%s |RowCount < but timestamp ==, skip." %
                                 (targetTable.tablename))
        pbar.close()
Example #3
def drained_entries(ctx: typer.Context, issues, entries, project):
    config = ctx.meta['config']
    empty_entries, unset_entries = petl.biselect(
        entries, lambda row: row['issue_id'] is None)

    drain_issues = list(
        petl.dicts(
            transform.select_drain_issues(
                issues,
                assignee_id=ctx.meta['rdm_user']['id'],
                drain_cf_id=get_proj_attr(config, project,
                                          'rdm_drain_cf_id'))))

    if not len(drain_issues):
        log.error('No drain issues found')
        return petl.head(unset_entries, 0), entries

    if len(drain_issues) > 1:
        log.warning(
            f'Found {len(drain_issues)} drain issues. Will use only first one')

    drain_issue = drain_issues[0]
    drained = petl.addfield(petl.cutout(empty_entries, 'issue_id'), 'issue_id',
                            drain_issue['id'])
    return drained, unset_entries
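
The petl.biselect call above splits the entries in a single pass: the first table gets the rows matching the predicate, the second gets the remainder. A minimal standalone sketch of that split with toy columns (not the real time-entry schema):

import petl

entries = [['id', 'issue_id', 'hours'],
           [1, None, 2.0],
           [2, 101, 1.5],
           [3, None, 0.5]]

empty_entries, unset_entries = petl.biselect(
    entries, lambda row: row['issue_id'] is None)
print(petl.look(empty_entries))   # rows 1 and 3
print(petl.look(unset_entries))   # row 2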
Example #4
def lookup_and_transform(ts_kv_table):
    """The table has the following structure:
    +---------------------------------+---------------+---------------+--------+
    | entity_id                       | key           | ts            | value  |
    +=================================+===============+===============+========+
    | 1ea47494dc14d40bd76a73c738b665f | Temperature   | 1583010011665 |  -1.8  |
    +---------------------------------+---------------+---------------+--------+
    | 1ea47494dc14d40bd76a73c738b665f | WindDirection | 1583010000692 |   227  |
    +---------------------------------+---------------+---------------+--------+
    
    The output is a dictionary {device_id:table} of tables like that:
    +--------------+--------------+---------------+
    | ts           | Temperature  | WindDirection |
    +--------------+--------------+---------------+
    |1583010011665 | -1.8         |  230          |
    +--------------+--------------+---------------+
    |1583010000692 |   -2.5       | 227           |
    +--------------+--------------+---------------+
    """

    lkp = petl.lookup(ts_kv_table, 'entity_id', value=('key', 'ts', 'value'))
    for entity_id in lkp:
        tbl = [('key', 'ts', 'value')] + lkp[entity_id]
        tbl = petl.recast(tbl, variablefield='key', valuefield='value')
        cut_keys = KEYS_TO_REMOVE & set(petl.fieldnames(tbl))
        tbl = petl.cutout(tbl, *cut_keys)
        tbl = petl.transform.headers.sortheader(tbl)
        tbl = petl.transform.basics.movefield(tbl, 'ts', 0)
        lkp[entity_id] = petl.sort(tbl, 'ts')
    return lkp
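
The core of the loop above is the recast pivot. A minimal sketch of just that step on a hand-built key/ts/value table (the lookup and the KEYS_TO_REMOVE filtering are omitted):

import petl

tbl = [('key', 'ts', 'value'),
       ('Temperature', 1583010011665, -1.8),
       ('WindDirection', 1583010011665, 230),
       ('Temperature', 1583010000692, -2.5),
       ('WindDirection', 1583010000692, 227)]

# One column per distinct key, keyed by the remaining field (ts).
wide = petl.recast(tbl, variablefield='key', valuefield='value')
wide = petl.movefield(wide, 'ts', 0)
print(petl.look(petl.sort(wide, 'ts')))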
Example #5
def transform_fields_254(tbl, ts_kv_dict):
    """The input is a dump of ts_kv table for TB version 2.5.4:
    +----------------------------------+---------------+---------------+--------+-------+--------+-------+
    |  entity_id                       | key           | ts            | bool_v | str_v | long_v | dbl_v |
    +==================================+===============+===============+========+=======+========+=======+
    |  1ea47494dc14d40bd76a73c738b665f | 25            | 1583010011665 |        |       |        | -1.8  |
    +----------------------------------+---------------+---------------+--------+-------+--------+-------+
    |  1ea47494dc14d40bd76a73c738b665f | 36            | 1583010000692 |        |       | 227    |       |
    +----------------------------------+---------------+---------------+--------+-------+--------+-------+
    
    The output:
    +---------------------------------+---------------+---------------+--------+
    | entity_id                       | key           | ts            | value  |
    +=================================+===============+===============+========+
    | 1ea47494dc14d40bd76a73c738b665f | Temperature   | 1583010011665 |  -1.8  |
    +---------------------------------+---------------+---------------+--------+
    | 1ea47494dc14d40bd76a73c738b665f | WindDirection | 1583010000692 |   227  |
    +---------------------------------+---------------+---------------+--------+

    ts_kv_dict is a dict like {25:'Temperature', 36:'WindDirection'}
    """
    ts_kv_table = petl.transform.conversions.convert(tbl,
                                                     {'ts': int,
                                                      'key': lambda k: ts_kv_dict[k]})
    ts_kv_table = petl.addfield(ts_kv_table, 'value', lambda row: get_value(row))
    ts_kv_table = petl.cutout(ts_kv_table, 'bool_v', 'str_v', 'long_v', 'dbl_v')
    return ts_kv_table
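
The get_value helper is referenced but not defined in this snippet. One plausible sketch, assuming it simply coalesces the four typed columns of the ts_kv dump into a single value:

def get_value(row):
    # Hypothetical helper: return the first non-empty typed column.
    for field in ('bool_v', 'str_v', 'long_v', 'dbl_v'):
        v = row[field]
        if v is not None and v != '':
            return v
    return None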
Example #6
def get_counter_table(fields, csv_name):
    if not fields:
        return 'No data to count'
    csv_file = f'{CSV_DIR}/{csv_name}'
    csv_data = petl.fromcsv(csv_file)
    cut_csv_data = petl.cutout(petl.valuecounts(csv_data, *fields),
                               'frequency')
    html_data = get_html_data(cut_csv_data)
    return html_data
Example #7
def dataPreProcessing(fileName):
    inputData = fromcsv(fileName)
    table1 = cutout(inputData, 'member_id', 'grade', 'sub_grade', 'emp_title',
                    'url', 'desc', 'title', 'accept_d', 'exp_d', 'list_d',
                    'issue_d', 'purpose', 'addr_city', 'addr_state',
                    'earliest_cr_line', 'last_pymnt_d', 'next_pymnt_d',
                    'last_credit_pull_d')
    table2 = select(
        table1,
        lambda i: i['term'] == ' 36 months' and i['loan_status'] != "")
    labelMapping = OrderedDict()
    labelMapping['loan_status'] = 'loan_status'
    labelMapping['id'] = 'id'
    table6 = fieldmap(table2, labelMapping)
    table8 = sort(table6, 'id')
    table10 = cutout(table8, 'id')
    mappings = OrderedDict()
    mappings['id'] = 'id'
    mappings['home_ownership'] = 'ownership', {
        'MORTGAGE': '-1',
        'RENT': '0',
        'OWN': '1'
    }
    mappings['emp_length'] = 'empLength', {'n/a': 0}
    mappings['is_inc_v'] = 'verificationStatus', {
        'Source Verified': 1,
        'Verified': 0,
        'Not Verified': -1
    }
    mappings['pymnt_plan'] = 'paymentPlan', {'n': 0, 'y': 1}
    mappings['initial_list_status'] = 'listStatus', {'f': 0, 'w': 1}
    table3 = fieldmap(table2, mappings)
    table4 = cutout(table2, 'home_ownership', 'is_inc_v', 'pymnt_plan',
                    'initial_list_status', 'term', 'loan_status')
    table5 = merge(table3, table4, key='id')
    table7 = sort(table5, 'id')
    table9 = cutout(table7, 'id')
    featureFileCsv = tocsv(table9, 'featureFileCsv.csv')
    labelsFileCsv = tocsv(table10, 'labelsFileCsv.csv')
    return featureFileCsv, labelsFileCsv
Example #8
def createFacts(events, users):
    try:
        events_uid = etl.cutout(events, 'tracking_id', 'utm_medium', 'utm_campaign')
        events_tui = etl.cutout(events, 'user_id')

        stage_uid = etl.join(users, events_uid, key='user_id')
        stage_tui = etl.join(users, events_tui, key='tracking_id')

        stage_utm = etl.cut(stage_tui, 'user_id', 'utm_medium', 'utm_campaign')
        stage_uid_utm = etl.join(stage_uid, stage_utm, key='user_id')
        stage_m_s = etl.mergesort(stage_uid_utm, stage_tui, key=['created_at', 'email'])

        mappings = OrderedDict()
        mappings['tid'] = 'tracking_id'
        mappings['uid'] = 'user_id'
        mappings['utm_medium'] = 'utm_medium'
        mappings['utm_campaign'] = 'utm_campaign', {'audio': 'none', 'social': 'none'}
        mappings['utm_campaigntype'] = 'utm_campaign'
        mappings['email'] = 'email'
        mappings['subscription'] = 'type'
        mappings['sub_order'] = 'type', {'Signup Completed': '1', 'Trial Started': '2', 'Subscription Started': '3', 'Subscription Ended': '4'}
        mappings['created_at'] = 'created_at'

        # Mapping
        stage_mapping = etl.fieldmap(stage_m_s, mappings)

        # Sort
        stage_mapping_ordered = etl.sort(stage_mapping, key=['created_at', 'email', 'sub_order'])

        # Datetime split
        t1 = etl.split(stage_mapping_ordered, 'created_at', 'T', ['date', 'time'], include_original=True)
        t2 = etl.split(t1, 'date', '-', ['year', 'month', 'day'])
        stage_ready = etl.split(t2, 'time', ':', ['hour', 'minute', 'second'])

        # Export as csv to load folder
        etl.tocsv(stage_ready, 'load/facts.csv')

    except Exception as e:
        print("Something went wrong. Error {0}".format(e))
Example #9
    def remove_column(self, *columns):
        """
        Remove a column from your table

        `Args:`
            \*columns: str
                Column names
        `Returns:`
            `Parsons Table` and also updates self
        """  # noqa: W605

        self.table = petl.cutout(self.table, *columns)

        return self
Example #10
def load_devices(file):
    """
    File header:
    id;additional_info;customer_id;type;name;label;search_text;tenant_id

    Output:
        dictionary like {'id_1': {name:'Device 1'}, ...}

    """
    tbl_devices = petl.io.csv.fromcsv(file, delimiter=';', encoding='utf-8')
    tbl_devices = petl.cutout(tbl_devices, 'customer_id', 'search_text', 'tenant_id')
    # PETL docs:
    # https://petl.readthedocs.io/en/stable/util.html#petl.util.lookups.dictlookupone
    devices = petl.dictlookupone(tbl_devices, 'id')
    return devices
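
A small sketch of the shape dictlookupone returns once the unused columns are cut; the rows here are made up:

import petl

tbl_devices = [['id', 'additional_info', 'type', 'name', 'label'],
               ['id_1', None, 'sensor', 'Device 1', 'dev-1'],
               ['id_2', None, 'sensor', 'Device 2', 'dev-2']]

devices = petl.dictlookupone(tbl_devices, 'id')
print(devices['id_1']['name'])   # -> 'Device 1'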
Example #11
def group_entries_by_day(inp):
    hdr = petl.header(inp)

    agg = OrderedDict()
    for field in hdr:
        # using first found value
        agg[field] = field, next

    agg['dur'] = 'dur', lambda durs: sum(durs, timedelta())
    agg['start'] = 'start', min

    with_day = petl.addfield(inp, 'start_date',
                             lambda row: row.get('start').date())
    index_keys = ('start_date', 'description')
    result = petl.aggregate(with_day, index_keys, agg)
    return petl.cutout(result, 'start_date')
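
A minimal sketch of the same aggregate-by-day pattern on a hand-built entries table; the original also keeps the first value of every other header field, which is omitted here:

import petl
from collections import OrderedDict
from datetime import datetime, timedelta

entries = [['description', 'start', 'dur'],
           ['standup', datetime(2021, 3, 1, 9, 0), timedelta(minutes=15)],
           ['standup', datetime(2021, 3, 1, 17, 0), timedelta(minutes=10)],
           ['review', datetime(2021, 3, 2, 11, 0), timedelta(hours=1)]]

agg = OrderedDict()
agg['start'] = 'start', min
agg['dur'] = 'dur', lambda durs: sum(durs, timedelta())

with_day = petl.addfield(entries, 'start_date', lambda row: row['start'].date())
result = petl.aggregate(with_day, ('start_date', 'description'), agg)
print(petl.look(petl.cutout(result, 'start_date')))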
Example #12
def order_by_constraint(base_path, table, schema, self_dep_set):
    file_name = base_path + "/content/data/" + table + ".tsv"
    tempfile = NamedTemporaryFile(mode='w',
                                  dir=base_path + "/content/data/",
                                  delete=False)
    table = etl.fromcsv(file_name,
                        delimiter='\t',
                        skipinitialspace=True,
                        quoting=csv.QUOTE_NONE,
                        quotechar='',
                        escapechar='')

    key_dep_dict = {}

    # print(file_name)
    for constraint in self_dep_set:
        child_dep, parent_dep = constraint.split(':')
        data = etl.values(table, child_dep, parent_dep)
        for d in data:
            key_dep_set = {d[1]}
            key_dep_dict.update({d[0]: key_dep_set})

    key_dep_list = toposort_flatten(key_dep_dict)
    table = etl.addfield(table, 'pwb_index',
                         lambda rec: int(key_dep_list.index(rec[child_dep])))
    table = etl.sort(table, 'pwb_index')
    table = etl.cutout(table, 'pwb_index')

    writer = csv.writer(tempfile,
                        delimiter='\t',
                        quoting=csv.QUOTE_NONE,
                        quotechar='',
                        lineterminator='\n',
                        escapechar='')

    writer.writerows(table)
    shutil.move(tempfile.name, file_name)
Example #13
def transform_fields_old(tbl):
    """The input is a dump of ts_kv table for TB version <= 2.5.4:
    +-------------+---------------------------------+---------------+---------------+--------+-------+--------+-------+
    | entity_type | entity_id                       | key           | ts            | bool_v | str_v | long_v | dbl_v |
    +=============+=================================+===============+===============+========+=======+========+=======+
    | DEVICE      | 1ea47494dc14d40bd76a73c738b665f | Temperature   | 1583010011665 |        |       |        | -1.8  |
    +-------------+---------------------------------+---------------+---------------+--------+-------+--------+-------+
    | DEVICE      | 1ea47494dc14d40bd76a73c738b665f | WindDirection | 1583010000692 |        |       | 227    |       |
    +-------------+---------------------------------+---------------+---------------+--------+-------+--------+-------+  
    
    The output:
    +-------------+---------------------------------+---------------+---------------+--------+
    | entity_type | entity_id                       | key           | ts            | value  |
    +=============+=================================+===============+===============+========+
    | DEVICE      | 1ea47494dc14d40bd76a73c738b665f | Temperature   | 1583010011665 |  -1.8  |
    +-------------+---------------------------------+---------------+---------------+--------+
    | DEVICE      | 1ea47494dc14d40bd76a73c738b665f | WindDirection | 1583010000692 |   227  |
    +-------------+---------------------------------+---------------+---------------+--------+
    
    """
    ts_kv_table = petl.transform.conversions.convert(tbl, 'ts', int)
    ts_kv_table = petl.addfield(ts_kv_table, 'value', lambda row: get_value(row))
    ts_kv_table = petl.cutout(ts_kv_table, 'bool_v', 'str_v', 'long_v', 'dbl_v')
    return ts_kv_table
Example #14
        file_name = 'datasets-%s.csv' % datetime.now().strftime('%Y%m%d%H%M%S')
        directory = 'csv'
        if not os.path.exists(directory):
            os.makedirs(directory)
        # etl.tocsv(aggregated_summary, './%s/%s' % (directory, file_name))
        # logging.info('This %s has been exported' % file_name)

        rooms, participations = storing_data_preparation(aggregated_summary)

        participations = etl.leftjoin(participations,
                                      external_ids,
                                      lkey='participant_id',
                                      rkey='id',
                                      rprefix='r_')
        participations = etl.cutout(participations, 'participant_id')
        participations = etl.rename(participations, 'r_external_id',
                                    'participant_id')

        rooms = etl.leftjoin(rooms,
                             external_ids,
                             lkey='creator',
                             rkey='id',
                             rprefix='r_')
        rooms = etl.cutout(rooms, 'creator')
        rooms = etl.rename(rooms, 'r_external_id', 'creator')

        logging.info('Storing data %s to database' % file_name)
        loader.truncate_table(dBConnection.connect(**config))
        loader.store_to_db(dBConnection.connect(**config),
                           tablename='rooms',
print("Writing tables to Databridge...")
# address standardization report:
# etl.todb(processed_rows, get_cursor, address_standardization_report_table_name)
# other tables go to ais_sources account:
dsn = get_dsn('ais_sources')
connection = cx_Oracle.Connection(dsn)
# zip4:
etl.fromcsv(temp_zip4_outfile_path).todb(get_cursor, zip4_write_table_name)
# cityzip:
etl.fromcsv(cityzip_outfile_path).todb(get_cursor, cityzip_write_table_name)
# alias:
etl.fromcsv(alias_outfile_path).todb(get_cursor, alias_write_table_name)

# Write processed_rows to uspszip4.csv:
print("Writing cleaned_usps output to {zip4_outfile_path}".format(zip4_outfile_path=zip4_outfile_path))
etl.cutout(processed_rows, 'base', 'pre', 'name', 'suffix', 'post', 'change_pre', 'change_name', 'change_suffix', 'change_post') \
        .rename({'std_base': 'base', 'std_pre': 'pre', 'std_name': 'name', 'std_suffix': 'suffix', 'std_post': 'post'}) \
        .cut('street_full', 'pre', 'name', 'suffix', 'post', 'low', 'high', 'oeb', 'unit', 'unitlow', 'unithigh', 'unitoeb', 'buildingorfirm', 'recordtype', 'zipcode', 'zip4') \
        .convert('low', int) \
        .select("{low} is not None") \
        .sort(key=['name', 'pre', 'suffix', 'post', 'low', 'high', 'unit', 'unitlow', 'unithigh']) \
        .tocsv(zip4_outfile_path, write_header=False)

# Write processed_rows to s3:
print("Writing {zip4_outfile_path} to s3".format(zip4_outfile_path=zip4_outfile_path))
# s3 = boto3.resource('s3', config=Config(proxies={'http': os.environ['HTTP_PROXY'], 'https': os.environ['HTTPS_PROXY']}))
s3 = boto3.resource('s3')
s3.meta.client.upload_file(zip4_outfile_path, s3_bucket, 'static files/' + zip4_outfile_path)

# Clean up:
os.remove(temp_zip4_outfile_path)
Example #16
table = etl.convert(table, 'rep_tm', timeparser('%H:%M'))
table = etl.addfield(
    table, 'reported_timestamp', lambda x: datetime.combine(
        x['rep_dt'], (x['rep_tm'] or time(0, 0))) + timedelta(hours=8))
debug(table, ['rep_dt', 'rep_tm', 'reported_timestamp'])
table = clean_up(table, 'rep_dt')
table = clean_up(table, 'rep_tm')

####### Number of fatalities
print('JOINING number_of_fatalities from mms.mssoccd checkbox')
fatalities_table = etl.fromcsv('do_fatalities.csv', encoding='utf-8')
fatalities_table = etl.convert(fatalities_table, 'min_acc_no', str)
table = etl.leftjoin(table, fatalities_table, key='min_acc_no')
table = etl.addfield(table, 'number_of_fatalities', lambda x: 1
                     if x['chk'] else 0)  #this is a boolean in src
table = etl.cutout(table, 'chk')
print('\t# OF INCIDENTS WITH FATALITIES ' +
      str(etl.valuecount(table, 'number_of_fatalities', 1)[0]))

####### Number of Injuries
print('JOINING number_of_injuries from mms.mssoccd occ_typ D02 textbox')
injuries_table = etl.fromcsv('do_injuries.csv', encoding='utf-8')
injuries_table = etl.cutout(injuries_table, 'occ_typ')
injuries_table = etl.convert(injuries_table, 'min_acc_no', str)
injuries_table = etl.convert(injuries_table, 'val', lambda x: int(x.strip())
                             if x.strip() else 0)
table = etl.leftjoin(table, injuries_table, key='min_acc_no')
table = etl.addfield(table, 'number_of_injuries', lambda x: x['val'] or 0)

num_val_zero_or_none = etl.valuecount(table, 'val', 0)[0] + etl.valuecount(
    table, 'val', None)[0]
Example #17
def concat_columns(table, new_column, column_left, column_right):
    new_table = etl.addfield(
        table, new_column,
        lambda rec: rec[column_left] + ', ' + rec[column_right], 1)
    return etl.cutout(new_table, column_left, column_right)
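
A quick usage sketch with a toy table (the column names are made up); the new field is inserted at index 1 and the two source columns are dropped:

import petl as etl

table = [['last', 'first', 'age'],
         ['Smith', 'Ada', 34],
         ['Jones', 'Bob', 29]]

result = concat_columns(table, 'name', 'last', 'first')
print(etl.look(result))   # fields: name, age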
Example #18
# Use US births only
table2 = etl.select(table, lambda rec: rec.birthCountry == 'USA')

# Only use these fields
table3 = etl.cut(table2, 'nameFirst', 'nameLast', 'debut', 'bbrefID', 'weight',
                 'height', 'finalGame', 'birthCity', 'birthState', 'birthYear')

# Remove null birth city and birth year
table4 = etl.select(table3,
                    lambda rec: rec.birthCity != "" and rec.birthYear != "")

# Add Baseball Reference URL
table5 = etl.addfield(table4, 'baseball_ref_url', add_bbreflink)
# Remove unnecessary bbrefid
table6 = etl.cutout(table5, "bbrefID")

# Load city,state lat long table.
city = etl.fromcsv(sys.argv[2])
# Only use these fields
city2 = etl.cut(city, "city", "state", "lat", "long")

# Join tables by two keys
lat_table = etl.leftjoin(table6,
                         city2,
                         lkey=["birthCity", "birthState"],
                         rkey=["city", "state"])

# Output merged file to csv
lat_table.tocsv(sys.argv[3])
Example #19
# Load Master.csv from the Lahman database.
table = etl.fromcsv(sys.argv[1])

# Use US births only
table2 = etl.select(table, lambda rec: rec.birthCountry == 'USA')

# Only use these fields
table3 = etl.cut(table2, 'nameFirst', 'nameLast', 'debut', 'bbrefID', 'weight', 'height', 'finalGame', 'birthCity', 'birthState', 'birthYear')

# Remove null birth city and birth year
table4 = etl.select(table3, lambda rec: rec.birthCity != "" and rec.birthYear != "")

# Add Baseball Reference URL
table5 = etl.addfield(table4, 'baseball_ref_url', add_bbreflink)
# Remove unnecessary bbrefid
table6 = etl.cutout(table5, "bbrefID")

# Load city,state lat long table.
city = etl.fromcsv(sys.argv[2])
# Only use these fields
city2 = etl.cut(city, "city", "state", "lat", "long")

# Join tables by two keys
lat_table = etl.leftjoin(table6, city2, lkey=["birthCity", "birthState"], rkey=["city", "state"])

# Output merged file to csv
lat_table.tocsv(sys.argv[3])


Example #20
def append_tailings_reports_to_code_required_reports(connection, commit=False):
    src_table = etl.fromdb(
        connection,
        'SELECT exp_doc.mine_guid, exp_doc.exp_document_guid, req_doc.req_document_name, exp_doc.due_date, exp_doc.exp_document_status_code, exp_doc.received_date, exp_doc.active_ind, exp_doc_x.mine_document_guid, exp_doc.create_user, exp_doc.create_timestamp, exp_doc.update_user, exp_doc.update_timestamp from mine_expected_document exp_doc \
        inner join mine_expected_document_xref exp_doc_x on exp_doc.exp_document_guid = exp_doc_x.exp_document_guid\
        inner join mds_required_document req_doc on req_doc.req_document_guid = exp_doc.req_document_guid'
    )

    req_document_crr_definition_map = [
        ['req_document_name', 'mine_report_definition_id'],
        ['Summary of TSF and Dam Safety Recommendations', 28],
        ['ITRB Activities Report', 27],
        ['Register of Tailings Storage Facilities and Dams', 47],
        ['Dam Safety Inspection (DSI) Report', 26],
        ['Dam Safety Review (DSR) Report', 31],
        ['“As-built” Reports', 32],
        ['Annual Reclamation', 25],
        ['MERP Record of Testing', 3],
        #['Annual Manager\'s Report', __________________ ], no mapping or data, ignore.
        ['OMS Manual', 33],
        ['Annual reconciliation of water balance and water management plans', 44],
        ['TSF risk assessment', 46],
        ['Mine Emergency Preparedness and Response Plan (MERP)', 24],
        ['Performance of high risk dumps', 29]
    ]

    table1 = etl.join(src_table, req_document_crr_definition_map, 'req_document_name')
    mine_report = etl.cutout(table1, 'req_document_name')

    #to be inserted into db
    mine_report = etl.addfield(mine_report, 'submission_year', 2019)
    mine_report = etl.rename(mine_report, 'exp_document_status_code',
                             'mine_report_submission_status_code')
    mine_report = etl.addfield(mine_report, 'deleted_ind', lambda x: not x.active_ind)
    mine_report = etl.cutout(mine_report, 'active_ind')
    #to determine what FK's will be so can insert into related tables
    max_report_id = etl.fromdb(connection,
                               'select last_value from public.mine_report_mine_report_id_seq')[1][0]
    max_report_submission_id = etl.fromdb(
        connection,
        'select last_value from public.mine_report_submission_mine_report_submission_id_seq')[1][0]

    #if sequence hasn't been used yet, fix off by one
    if max_report_id == 1:
        max_report_id = 0
    if max_report_submission_id == 1:
        max_report_submission_id = 0

    #get one-to-many
    mine_report, mine_report_submission_documents = etl.unjoin(mine_report,
                                                               'mine_document_guid',
                                                               key='exp_document_guid')

    #add PK's for mappings
    mine_report_with_ids = etl.addrownumbers(mine_report,
                                             start=max_report_id + 1,
                                             step=1,
                                             field='mine_report_id')
    mine_report_with_ids = etl.addrownumbers(mine_report_with_ids,
                                             start=max_report_submission_id + 1,
                                             step=1,
                                             field='mine_report_submission_id')
    print(f'max_report_id= {max_report_id}, max_report_submission_id={max_report_submission_id}')
    #copy out fields for submission tables
    mine_report_submissions = etl.cut(mine_report_with_ids, [
        'mine_report_id', 'exp_document_guid', 'mine_report_submission_status_code', 'create_user',
        'create_timestamp', 'update_user', 'update_timestamp'
    ])
    mine_report_submissions = etl.addfield(mine_report_submissions,
                                           'submission_date', lambda x: x.create_timestamp)
    #remove fields not in mine_report
    mine_report = etl.cutout(mine_report, 'mine_report_submission_status_code')

    #replace exp_document_guid FK with mine_report_submission FK
    submission_id_lookup = etl.cut(mine_report_with_ids,
                                   ['mine_report_submission_id', 'exp_document_guid'])
    mine_report_submission_documents = etl.join(submission_id_lookup,
                                                mine_report_submission_documents,
                                                key='exp_document_guid')
    mine_report_submission_documents = etl.cutout(mine_report_submission_documents,
                                                  'exp_document_guid')

    #removed original PK
    mine_report = etl.cutout(mine_report, 'exp_document_guid')
    mine_report_submissions = etl.cutout(mine_report_submissions, 'exp_document_guid')

    print(etl.valuecounter(etl.distinct(table1, key='exp_document_guid'), 'req_document_name'))
    print(etl.valuecounter(mine_report, 'mine_report_definition_id'))
    print(table1)
    print(mine_report)
    print(mine_report_submissions)
    print(mine_report_submission_documents)

 
    etl.appenddb(mine_report, connection, 'mine_report', commit=False)
    print('INSERT mine_report staged')
    etl.appenddb(mine_report_submissions, connection, 'mine_report_submission', commit=False)
    print('INSERT mine_report_submission staged')
    etl.appenddb(mine_report_submission_documents,
                    connection,
                    'mine_report_document_xref',
                    commit=False)
    print('INSERT mine_report_document_xref staged')
    if commit:  
        connection.commit()
        print('DATA CREATION COMPLETE')
    else:
        connection.rollback()
        print('NO DATA CREATED: add --commit=true to insert report rows')
Example #21
# From column 5 onwards, the data type is integer: the number of people/cases
# We also take the opportunity to change the header date format from 1/23/20 to 2020-01-23
headers = etl.fieldnames(t_confirmed)
i = 0
for header in headers:
    if i >= 4:
        t_confirmed = etl.convert(t_confirmed, header,
                                  int)  # fix the data type
        fecha = datetime.datetime.strptime(
            header, '%m/%d/%y')  # compute the date in the correct format
        t_confirmed = etl.rename(t_confirmed, header,
                                 fecha.strftime('%Y-%m-%d'))
    i = i + 1

# Drop the Province/State, Lat and Lon columns, which we will not use
t_confirmed = etl.cutout(t_confirmed, 0, 2, 3)

# Adjust some country names so we can later assign them a region/continent
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace',
                          'Congo (Brazzaville)', 'Congo')
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace',
                          'Congo (Kinshasa)',
                          'Democratic Republic of the Congo')
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace', 'Cote d\'Ivoire',
                          'Ivory Coast')
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace', 'Korea, South',
                          'South Korea')
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace',
                          'West Bank and Gaza', 'Palestine')
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace', 'Burma',
                          'Myanmar')
Example #22
import petl as etl
import csv

table1 = etl.fromcsv('covid.csv')

# importing data from xml file and creating table
table2 = etl.fromxml('Country_location.xml', './/tr', ('th', 'td'))
# print(table2)

# removing column country from table
table3 = etl.cutout(table2, 'country')

# merging the covid table with xml data
table4 = etl.join(table1, table3, key='location')
print(table4)

# writing result to csv file
with open('covid_countries.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerows(table4)
Example #23
                    key_dep_dict = {}

                    print(file_name)
                    for constraint in value:
                        child_dep, parent_dep = constraint.split(':')
                        data = etl.values(table, child_dep, parent_dep)
                        for d in data:
                            key_dep_set = {d[1]}
                            key_dep_dict.update({d[0]: key_dep_set})

                    key_dep_list = toposort_flatten(key_dep_dict)
                    table = etl.addfield(
                        table, 'pwb_index',
                        lambda rec: int(key_dep_list.index(rec[child_dep])))
                    table = etl.sort(table, 'pwb_index')
                    table = etl.cutout(table, 'pwb_index')

                    writer = csv.writer(tempfile,
                                        delimiter='\t',
                                        quoting=csv.QUOTE_NONE,
                                        quotechar='',
                                        lineterminator='\n',
                                        escapechar='')

                    writer.writerows(table)
                    shutil.move(tempfile.name, file_name)

            open(tsv_done_file, 'a').close()

            ddl = []
            for table in deps_list:
Example #24
international_code = "(+61)"

with open(IN_FILE, 'r') as infile, open(OUT_FILE, "w") as outfile:
    csv_reader = csv.reader(infile)
    writer = csv.writer(outfile)
    headers = next(csv_reader, None)  #skipping header row
    writer.writerow(headers)
    for row in csv_reader:
        number_column = row[5]
        state_column = row[3]
        clean_num = re.sub(r"\D", "", row[5])[-8:]
        formatted_num = international_code + " " + regional_code[
            state_column] + " " + clean_num
        row[5] = formatted_num
        writer.writerow(row)

services = petl.fromcsv(SERVICES_FILE)
offices = petl.fromcsv(OUT_FILE)
offices = offices.rename({"Contact Name": "Office", "Phone Number": "Phone"})
offices = petl.cutout(offices, "State", "Postcode")

locations = petl.fromcsv(LOC_FILE)
locations = locations.rename({"officeID": "OfficeID"})
office_service = petl.join(services, offices, key='OfficeID')

office_service_locations = petl.join(
    office_service, locations, key='OfficeID')

office_service_locations = petl.convert(office_service_locations, 'OfficeServiceID', int)
office_service_locations = petl.sort(office_service_locations, 'OfficeServiceID')
petl.tocsv(office_service_locations, 'office_service_locations.csv')
Example #25
def get_value_counts_from_csv_file(collection_file_name: str,
                                   fields_to_fetch: List[str]) -> List:
    table_data = etl.fromcsv(collection_file_name)
    value_counts = etl.valuecounts(table_data, *fields_to_fetch)
    return list(etl.cutout(value_counts, "frequency").dicts())
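
The same pipeline on an in-memory table instead of a CSV file, to show the shape that comes back (the rows are made up); dropping 'frequency' leaves the counted fields plus 'count':

import petl as etl

table_data = [['status', 'channel'],
              ['open', 'web'],
              ['open', 'web'],
              ['closed', 'phone']]

value_counts = etl.valuecounts(table_data, 'status', 'channel')
print(list(etl.cutout(value_counts, 'frequency').dicts()))
# [{'status': 'open', 'channel': 'web', 'count': 2},
#  {'status': 'closed', 'channel': 'phone', 'count': 1}]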
Example #26
def procesar_fuente(path, nombre):
    try: 
        # Process the confirmed cases first
        tabla = etl.fromcsv(path)

        # Rename the headers
        tabla = etl.rename(tabla, {'Country/Region': 'Country'})

        # Adjust the data types
        # From column 5 onwards, the data type is integer: the number of people/cases
        # We also take the opportunity to change the header date format from 1/23/20 to 2020-01-23
        headers = etl.fieldnames(tabla)
        i = 0
        for header in headers:
            if i >= 4:
                tabla = etl.convert(tabla, header, int)  # fix the data type
                fecha = datetime.datetime.strptime(header, '%m/%d/%y')  # compute the date in the correct format
                tabla = etl.rename(tabla, header, fecha.strftime('%Y-%m-%d'))
            i = i + 1

        # Drop the Province/State, Lat and Lon columns, which we will not use
        tabla = etl.cutout(tabla, 0, 2, 3)

        # Adjust some country names so we can later assign them a region/continent
        tabla = etl.convert(tabla, 'Country', 'replace', 'Congo (Brazzaville)', 'Congo')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Congo (Kinshasa)', 'Democratic Republic of the Congo')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Cote d\'Ivoire', 'Ivory Coast')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Korea, South', 'South Korea')
        tabla = etl.convert(tabla, 'Country', 'replace', 'West Bank and Gaza', 'Palestine')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Burma', 'Myanmar')
        tabla = etl.convert(tabla, 'Country', 'replace', 'US', 'USA')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Taiwan*', 'Taiwan')

        # Then group and accumulate the results by country
        df_confirmed = etl.todataframe(tabla)
        df = df_confirmed.groupby(['Country']).sum()
        tabla = etl.fromdataframe(df, include_index=True)

        # Rename the Country field again
        tabla = etl.rename(tabla, {'index': 'Country'})

        # Then add the date columns as data and rename the new columns
        tabla = etl.melt(tabla, 'Country')
        tabla = etl.rename(tabla, {'variable': 'Date'})
        tabla = etl.rename(tabla, {'value': 'Cases'})

        # Then add the continent so we can group by it
        tabla = etl.addfield(tabla, 'Continent', lambda rec: get_continent_code(rec['Country']))

        # And again make sure the fields have the data types they should have.
        tabla = etl.convert(tabla, 'Cases', int)
        tabla = etl.convert(tabla, 'Date', lambda v: datetime.datetime.strptime(v, '%Y-%m-%d') )

        # Finally, upload the file to the data repository
        conn = pymysql.connect(password='******', database='covid', user='******')
        conn.cursor().execute('SET SQL_MODE=ANSI_QUOTES')
        etl.todb(tabla, conn, nombre, create=True, drop=True)
        conn.close()
    except:
        print('An error has occurred! ', sys.exc_info()[0])
        raise
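
The reshaping step above (melt plus the renames) on a tiny hand-built table, to show how the date columns become rows:

import petl as etl

wide = [['Country', '2020-01-22', '2020-01-23'],
        ['Congo', 0, 1],
        ['Myanmar', 2, 5]]

tall = etl.melt(wide, 'Country')
tall = etl.rename(tall, {'variable': 'Date', 'value': 'Cases'})
print(etl.look(tall))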
Example #27
def remove_fields(fields, table):
    for field in fields:
        if field in etl.fieldnames(table):
            table = etl.cutout(table, field)
    return table
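
A quick usage sketch; fields that are missing from the table are skipped rather than raising:

import petl as etl

tbl = [['a', 'b', 'c'],
       [1, 2, 3]]

# 'b' is removed; 'z' is ignored because it is not in the header.
print(etl.look(remove_fields(['b', 'z'], tbl)))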
Example #28
def transfer_data(from_db_conn, to_db_conn):
    '''
    Transfer data from databases given cursor to execute queries to connected databases
    Limitations:
    1. poc.address_id is currently marked as  -1 since it was not provided in test data and is a FK non-null constraint
    2. institution2poc table is not available in old schema
    3. role table is already populated in bill.sql file so that table is skipped by this script
    4. poc_poc_id is currently set to be poc_id since no relevant information is available about the column
    5. project2moc_project.role_id column is not available in old schema and is a not null field in new schema
        so we default it to 1 for now.
    6. project2moc_project.username is not available from old schema so currently set to empty
    7. raw_item_ts.item_id has duplicates when imported from item_ts. So we currently filter out and insert only uniques.

    :param from_db_conn: source database connection
    :param to_db_conn: destination database connection
    '''

    # Emptying out tables with possible foreign key constraint issues
    fk_dep_tables = [
        'poc2project', 'poc2moc_project', 'poc', 'raw_item_ts', 'item',
        'project', 'institution2moc_project'
    ]
    for table_name in fk_dep_tables:
        table = etl.fromdb(to_db_conn,
                           "select * from {} where 1=0".format(table_name))
        etl.todb(table, to_db_conn, table_name)

    # Tables with no change in schema
    insert_as_tables = [
        'institution', 'address', 'item_type', 'item2item', 'catalog_item'
    ]
    for table_name in insert_as_tables:
        table = etl.fromdb(from_db_conn, "select * from {}".format(table_name))
        etl.todb(table, to_db_conn, table_name)

    # inserting dummy address for constraint matching
    dummy_address = [{'address_id': -1}]
    dummy_address_table = etl.fromdicts(dummy_address)
    etl.appenddb(dummy_address_table, to_db_conn, 'address')

    poc = etl.fromdb(from_db_conn, 'select * from poc')
    poc_transformed = etl.cutout(poc, 'domain_id', 'user_uid')
    poc_dummy_address = etl.replace(poc_transformed, 'address_id', None, -1)
    etl.todb(poc_dummy_address, to_db_conn, 'poc')

    project_names_table = etl.fromdb(
        from_db_conn, "select distinct project_name from project")
    moc_project_transformed = etl.addrownumbers(project_names_table)
    moc_project_transformed = etl.rename(moc_project_transformed,
                                         {'row': 'moc_project_id'})
    etl.todb(moc_project_transformed, to_db_conn, 'moc_project')

    domain = etl.fromdb(from_db_conn, "select * from domain")
    domain_table_transformed = etl.cutout(domain, 'domain_uid')
    domain_table_transformed = etl.rename(domain_table_transformed, {
        'domain_id': 'service_id',
        'domain_name': 'service_name'
    })
    etl.todb(domain_table_transformed, to_db_conn, 'service')

    project = etl.fromdb(from_db_conn, "select * from project")
    moc_project = etl.fromdb(to_db_conn, "select * from moc_project")
    project_moc_project_joined = etl.join(project,
                                          moc_project,
                                          key='project_name')
    project_table_transformed = etl.cutout(project_moc_project_joined,
                                           'project_name')
    project_table_transformed = etl.rename(project_table_transformed, {
        'domain_id': 'service_id',
        'project_uid': 'project_uuid'
    })
    etl.todb(project_table_transformed, to_db_conn, 'project')

    institution2project = etl.fromdb(from_db_conn,
                                     "Select * from institution2project")
    project = etl.fromdb(to_db_conn,
                         "select project_id, moc_project_id from project")
    inst2project_project_joined = etl.join(institution2project,
                                           project,
                                           key='project_id')
    inst2moc_project = etl.cutout(inst2project_project_joined, 'domain_id')
    etl.todb(inst2moc_project, to_db_conn, 'institution2moc_project')

    project2poc = etl.fromdb(from_db_conn, "select * from project2poc")
    project2poc_project_joined = etl.join(project2poc,
                                          project,
                                          key='project_id')
    poc2moc_project = etl.cutout(project2poc_project_joined, 'project_id',
                                 'domain_id')
    poc2moc_project = etl.addfield(poc2moc_project, 'role_id', 1)
    poc2moc_project = etl.addfield(poc2moc_project, 'poc_poc_id',
                                   lambda rec: rec['poc_id'])
    etl.todb(poc2moc_project, to_db_conn, 'poc2moc_project')

    poc2project = etl.cutout(project2poc, 'domain_id')
    poc2project = etl.addfield(poc2project, 'role_id', 1)
    poc2project = etl.addfield(poc2project, 'username', '')
    etl.todb(poc2project, to_db_conn, 'poc2project')

    item = etl.fromdb(from_db_conn, "select * from item")
    item_transformed = etl.cutout(item, 'domain_id')
    etl.todb(item_transformed, to_db_conn, 'item')

    raw_item_ts_unique = etl.fromdb(
        from_db_conn,
        "WITH summary AS ( SELECT its.item_id, its.start_ts, its.end_ts, its.state, its.catalog_item_id, ROW_NUMBER() OVER(PARTITION BY its.item_id) AS rk FROM ITEM_TS its) SELECT s.* FROM summary s WHERE s.rk = 1"
    )
    raw_item_ts_unique = etl.cutout(raw_item_ts_unique, 'rk')
    etl.todb(raw_item_ts_unique, to_db_conn, 'raw_item_ts')
Example #29
# other tables go to ais_sources account:
dsn = get_dsn('ais_sources')
connection = cx_Oracle.Connection(dsn)
# zip4:
etl.fromcsv(temp_zip4_outfile_path).todb(get_cursor, zip4_write_table_name)
# cityzip:
etl.fromcsv(cityzip_outfile_path).todb(get_cursor, cityzip_write_table_name)
# alias:
etl.fromcsv(alias_outfile_path).todb(get_cursor, alias_write_table_name)

# Write processed_rows to uspszip4.csv:
print("Writing cleaned_usps output to {zip4_outfile_path}".format(
    zip4_outfile_path=zip4_outfile_path))
etl.cutout(processed_rows, 'base', 'pre', 'name', 'suffix', 'post', 'change_pre', 'change_name', 'change_suffix', 'change_post') \
        .rename({'std_base': 'base', 'std_pre': 'pre', 'std_name': 'name', 'std_suffix': 'suffix', 'std_post': 'post'}) \
        .cut('street_full', 'pre', 'name', 'suffix', 'post', 'low', 'high', 'oeb', 'unit', 'unitlow', 'unithigh', 'unitoeb', 'buildingorfirm', 'recordtype', 'zipcode', 'zip4') \
        .convert('low', int) \
        .select("{low} is not None") \
        .sort(key=['name', 'pre', 'suffix', 'post', 'low', 'high', 'unit', 'unitlow', 'unithigh']) \
        .tocsv(zip4_outfile_path, write_header=False)

# Write processed_rows to s3:
print("Writing {zip4_outfile_path} to s3".format(
    zip4_outfile_path=zip4_outfile_path))
# s3 = boto3.resource('s3', config=Config(proxies={'http': os.environ['HTTP_PROXY'], 'https': os.environ['HTTPS_PROXY']}))
s3 = boto3.resource('s3')
s3.meta.client.upload_file(zip4_outfile_path, s3_bucket,
                           'static files/' + zip4_outfile_path)

# Clean up:
os.remove(temp_zip4_outfile_path)
Example #30
def clean_up(table, column):
    if CLEAN_UP:
        return etl.cutout(table, column)
    return table
Example #31
table5 = cut(table1, *range(0, 2))
look(table5)    


# cutout

table1 = [['foo', 'bar', 'baz'],
          ['A', 1, 2.7],
          ['B', 2, 3.4],
          ['B', 3, 7.8],
          ['D', 42, 9.0],
          ['E', 12]]

from petl import cutout, look
look(table1)
table2 = cutout(table1, 'bar')
look(table2)
    

# cat

table1 = [['foo', 'bar'],
          [1, 'A'],
          [2, 'B']]
table2 = [['bar', 'baz'],
          ['C', True],
          ['D', False]]
table4 = [['foo', 'bar', 'baz'],
          ['A', 1, 2],
          ['B', '2', '3.4'],
          [u'B', u'3', u'7.8', True],
Example #32
# Dim Time
# TO DO
#	Load a full year (2018) with the simplest datetime analysis
#	Year, month, day, hour, minute, second

#	For the full loading process, use the reference in references.txt
#	This should be a procedure with all the validation logic there, to create the next X months when it is called

#  Facts

# This facts table will be the staging table, holding all the info needed to quickly update the dimension keys and load into the facts table
# The facts table will have columns to match each column on the dim Time table, to make it easier to get the reference key
#

events_uid = etl.cutout(events, 'tracking_id', 'utm_medium', 'utm_campaign')
events_tui = etl.cutout(events, 'user_id')

stage_uid = etl.join(users, events_uid, key='user_id')
stage_tui = etl.join(users, events_tui, key='tracking_id')
stage_utm = etl.cut(stage_tui, 'user_id', 'utm_medium', 'utm_campaign')
stage_uid_utm = etl.join(stage_uid, stage_utm, key='user_id')
stage_m_s = etl.mergesort(stage_uid_utm,
                          stage_tui,
                          key=['created_at', 'email'])

# Mapping definitions
mappings = OrderedDict()
mappings['tid'] = 'tracking_id'
mappings['uid'] = 'user_id'
mappings['utm_medium'] = 'utm_medium'
Example #33
table5 = cut(table1, *range(0, 2))
look(table5)    


# cutout

table1 = [['foo', 'bar', 'baz'],
          ['A', 1, 2.7],
          ['B', 2, 3.4],
          ['B', 3, 7.8],
          ['D', 42, 9.0],
          ['E', 12]]

from petl import cutout, look
look(table1)
table2 = cutout(table1, 'bar')
look(table2)
    

# cat

table1 = [['foo', 'bar'],
          [1, 'A'],
          [2, 'B']]
table2 = [['bar', 'baz'],
          ['C', True],
          ['D', False]]
table4 = [['foo', 'bar', 'baz'],
          ['A', 1, 2],
          ['B', '2', '3.4'],
          [u'B', u'3', u'7.8', True],
Example #34
# select a range of fields
table5 = etl.cut(table1, *range(0, 2))
table5


# cutout()
##########

import petl as etl
table1 = [['foo', 'bar', 'baz'],
          ['A', 1, 2.7],
          ['B', 2, 3.4],
          ['B', 3, 7.8],
          ['D', 42, 9.0],
          ['E', 12]]
table2 = etl.cutout(table1, 'bar')
table2


# cat()
#######

import petl as etl
table1 = [['foo', 'bar'],
          [1, 'A'],
          [2, 'B']]
table2 = [['bar', 'baz'],
          ['C', True],
          ['D', False]]
table3 = etl.cat(table1, table2)
table3