Example #1
def main(argv):

    global g

    parser = argparse.ArgumentParser()
    parser.add_argument("--contributions-filename", required=True, help="Input UTF8 CSV with contributions data "
        "dumped from Servant Keeper")
    parser.add_argument("--split-detail-files", required=False, nargs='*', default=argparse.SUPPRESS,
        help="List of CSV files which have records that can be used to replace top-level 'Split Transaction' "
        "records in the main contributions file.")
    parser.add_argument("--chart-of-accounts-filename", required=True, help="Input UTF8 CSV with Chart of Accounts "
        "data from Servant Keeper")
    parser.add_argument("--output-filename", required=True, help="Output CSV filename which will be loaded with "
        "contributions data in CCB import format ")
    parser.add_argument('--trace', action='store_true', help="If specified, prints tracing/progress messages to "
        "stdout")
    args = parser.parse_args()

    assert os.path.isfile(args.contributions_filename), "Error: cannot open file '" + args.contributions_filename + "'"

    dict_split_transaction_details = load_split_transaction_details(args.split_detail_files)

    table = petl.fromcsv(args.contributions_filename)
    table = petl.rename(table, {
        'Individual ID': 'SK Individual ID',
        'Amount': 'SK Amount'
        })

    trace('REMOVING SPLIT TRANSACTIONS...', args.trace, banner=True)

    table = replace_split_transactions(table, dict_split_transaction_details)

    table_coa = petl.fromcsv(args.chart_of_accounts_filename)
    table = petl.leftjoin(table, table_coa, lkey='Account', rkey='SK Account')

    table = petl.addfield(table, 'Individual ID', lambda rec: rec['SK Individual ID'])
    table = petl.addfield(table, 'Date of Contribution', lambda rec: rec['Batch Date'])
    table = petl.addfield(table, 'Amount', lambda rec: rec['SK Amount'])
    table = petl.addfield(table, 'Type of Gift', lambda rec: rec['Type'])
    table = petl.addfield(table, 'Check Number', lambda rec: rec['Check #'])
    table = petl.addfield(table, 'Fund', convert_fund)
    table = petl.addfield(table, 'Sub Fund', convert_sub_fund)
    table = petl.addfield(table, 'Campus', '')
    table = petl.addfield(table, 'Transaction Grouping', '')
    table = petl.addfield(table, 'Batch Number/Name', '')
    table = petl.addfield(table, 'Tax Deductible', lambda rec: rec['Tax'])
    table = petl.addfield(table, 'Memo', convert_notes)

    trace('CONVERTING AND THEN EMITTING TO CSV FILE...', args.trace, banner=True)

    table.progress(200).tocsv(args.output_filename)

    trace('OUTPUT TO CSV COMPLETE.', args.trace, banner=True)

    if len(g.set_unfound_accounts) > 0:
        trace('UNMATCHED SK ACCOUNTS!', args.trace, banner=True)
        for acct in g.set_unfound_accounts:
            trace(acct, args.trace)

    trace('DONE!', args.trace, banner=True)
Example #2
    def _petl_transform(self, record_set):
        if "transform" in self.task:
            transform = self.task["transform"]
            if "convert" in transform:
                conversions = {}
                for field, func in transform["convert"]:
                    conversions[field] = func
                record_set = etl.convert(record_set, conversions)

            if "filter" in transform:
                record_set = etl.select(record_set, transform["filter"])

            if "remove" in transform:
                cuts = []
                for field in transform["remove"]:
                    cuts.append(field)
                record_set = etl.cutout(record_set, cuts)

            if "rename" in transform:
                names = {}
                for old, new_one in transform["rename"]:
                    names[old] = new_one
                record_set = etl.rename(record_set, names)

        return record_set
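
A minimal sketch (not from the original project) of the task layout this method appears to expect, together with the equivalent standalone petl calls; the exact schema is an assumption inferred from the branches above.

import petl as etl

# Hypothetical task config, inferred from the branches in _petl_transform above.
task = {
    "transform": {
        "convert": [("amount", float)],        # (field, conversion callable) pairs
        "filter": "{amount} > 0",              # petl select() expression
        "remove": ["note"],                    # fields passed to cutout()
        "rename": [("amount", "amount_usd")],  # (old name, new name) pairs
    }
}

record_set = [["amount", "note"], ["1.50", "keep"], ["-2", "drop"]]
t = task["transform"]
record_set = etl.convert(record_set, {field: func for field, func in t["convert"]})
record_set = etl.select(record_set, t["filter"])
record_set = etl.cutout(record_set, *t["remove"])
record_set = etl.rename(record_set, dict(t["rename"]))
print(list(etl.dicts(record_set)))  # [{'amount_usd': 1.5}]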
Example #3
def transform(data, data_set):
    data = data['observations']
    data = etl.fromdicts(data, header=['value', 'realtime_start', 'realtime_end', 'date'])
    data = etl.cut(data, 'date', 'value')
    data = etl.rename(data, {'date': 'date', 'value': data_set.lower()})
    data = etl.convert(data, data_set.lower(), lambda val: 0 if val == '.' else val)
    return data
Example #4
File: etl.py Project: pjsier/psm
def clean(table):
    """Do some cleanup of TABLE

    TABLE is a petl table."""

    # Rename column to expand name
    table = etl.rename(table, {'WVRSTATE': 'waiverstate'})

    # More conversions
    table = etl.convert(
        table,
        {
            'EXCLTYPE': lambda f: f.strip(),  # Trim extra spaces
            'EXCLDATE': munge_date,  # Arrange date for sqlite
            'REINDATE': munge_date,  # Arrange date for sqlite
            'WAIVERDATE': munge_date  # Arrange date for sqlite
        })

    # Do some cleanup conversions on individual data
    table = etl.convert(
        table,
        {
            'DOB': munge_date,
            'MIDNAME': lambda f: f
            if f != " " else ""  # no spaces as middle names
        })
    table = etl.convertall(table, lambda f: None if f.strip() == '' else f)
    return table
Example #5
def transform_to_petl(data):
    isodate = etl.dateparser("%Y-%m-%d")
    data = etl.fromdataframe(data)
    data = etl.rename(data, {"index": "Date", "VALUE": "Value"})
    data = etl.convert(data, {"Date": lambda d: d[:10]})
    data = etl.convert(data, {"Date": lambda d: isodate(d)})
    return data
Example #6
    def _shape_data(self, raw_planets: etl.Table,
                    raw_people: etl.Table) -> etl.Table:
        planets = etl.cut(raw_planets, (
            Planet.Columns.NAME,
            Planet.Columns.URL,
        ))
        people = etl.cut(
            raw_people,
            (
                Person.Columns.NAME,
                Person.Columns.HEIGHT,
                Person.Columns.MASS,
                Person.Columns.HAIR_COLOR,
                Person.Columns.SKIN_COLOR,
                Person.Columns.EYE_COLOR,
                Person.Columns.BIRTH_YEAR,
                Person.Columns.GENDER,
                Person.Columns.HOMEWORLD,
                Person.Columns.EDITED,
            ),
        )

        combined = etl.join(
            planets,
            people,
            lkey=Planet.Columns.URL,
            rkey=Person.Columns.HOMEWORLD,
            lprefix=Planet.PREFIX,
        )

        renamed = etl.rename(
            combined,
            {
                Person.Columns.EDITED: Person.RenamedColumns.DATE,
                Planet.prefix_value(Planet.Columns.NAME):
                Person.Columns.HOMEWORLD,
            },
        )

        converted = etl.convert(
            renamed, {
                Person.RenamedColumns.DATE: lambda v: parse(v).date(),
            })

        return etl.cut(
            converted,
            (
                Person.Columns.NAME,
                Person.Columns.HEIGHT,
                Person.Columns.MASS,
                Person.Columns.HAIR_COLOR,
                Person.Columns.SKIN_COLOR,
                Person.Columns.EYE_COLOR,
                Person.Columns.BIRTH_YEAR,
                Person.Columns.GENDER,
                Person.Columns.HOMEWORLD,
                Person.RenamedColumns.DATE,
            ),
        )
Example #7
File: vcf.py Project: pombredanne/petlx
def unpackcall(tbl, *keys, **kwargs):
    """
    Unpack the call column. E.g.::
    
        >>> from petlx.vcf import fromvcf, unpackinfo, meltsamples, unpackcall
        >>> from petl import look, cutout
        >>> t1 = fromvcf('../fixture/sample.vcf')
        >>> t2 = meltsamples(t1)
        >>> t3 = unpackcall(t2)
        >>> t4 = cutout(t3, 'INFO')
        >>> look(t4)
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | 'CHROM' | 'POS' | 'ID'        | 'REF' | 'ALT' | 'QUAL' | 'FILTER' | 'SAMPLE'  | 'GT'  | 'GQ' | 'DP' | 'HQ'         |
        +=========+=======+=============+=======+=======+========+==========+===========+=======+======+======+==============+
        | '19'    |   111 | None        | 'A'   | [C]   |    9.6 | []       | 'NA00001' | '0|0' | None | None | [10, 10]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    |   111 | None        | 'A'   | [C]   |    9.6 | []       | 'NA00002' | '0|0' | None | None | [10, 10]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    |   111 | None        | 'A'   | [C]   |    9.6 | []       | 'NA00003' | '0/1' | None | None | [3, 3]       |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    |   112 | None        | 'A'   | [G]   |     10 | []       | 'NA00001' | '0|0' | None | None | [10, 10]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    |   112 | None        | 'A'   | [G]   |     10 | []       | 'NA00002' | '0|0' | None | None | [10, 10]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    |   112 | None        | 'A'   | [G]   |     10 | []       | 'NA00003' | '0/1' | None | None | [3, 3]       |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '20'    | 14370 | 'rs6054257' | 'G'   | [A]   |     29 | []       | 'NA00001' | '0|0' |   48 |    1 | [51, 51]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '20'    | 14370 | 'rs6054257' | 'G'   | [A]   |     29 | []       | 'NA00002' | '1|0' |   48 |    8 | [51, 51]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '20'    | 14370 | 'rs6054257' | 'G'   | [A]   |     29 | []       | 'NA00003' | '1/1' |   43 |    5 | [None, None] |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '20'    | 17330 | None        | 'T'   | [A]   |      3 | ['q10']  | 'NA00001' | '0|0' |   49 |    3 | [58, 50]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        
    .. versionadded:: 0.5
    
    """
    if not keys:
        if hasattr(tbl, 'filename'):
            try:
                import vcf as pyvcf
            except ImportError as e:
                raise UnsatisfiedDependency(e, dep_message)
            reader = pyvcf.Reader(filename=tbl.filename)
            # all FORMAT
            keys = reader.formats.keys()
        else:
            tbl = convert(tbl, 'CALL', lambda v: v.data._asdict()
                          )  # enable sampling of keys from data
    result = unpackdict(tbl, 'CALL', keys=keys)
    if 'prefix' in kwargs:
        result = rename(result, {k: kwargs['prefix'] + k for k in keys})
    if hasattr(tbl, 'filename'):
        return VCFWrapper(result, tbl.filename)
    else:
        return result
def createDimSubscriptions(events):
    try:
        dim_subscriptions_cut = etl.cut(events, 'type')
        dim_subscriptions_rename = etl.rename(dim_subscriptions_cut, {'type': 'subscription_name'})
        dim_subscriptions = etl.distinct(dim_subscriptions_rename)
        # Export as csv to load folder
        etl.tocsv(dim_subscriptions, 'load/dim_subscriptions.csv')
    except Exception as e:
        print("Something went wrong. Error {0}".format(e))
def createDimMedium(events):
    try:
        dim_medium_cut = etl.cut(events, 'utm_medium')
        dim_medium_rename = etl.rename(dim_medium_cut, {'utm_medium': 'medium'})
        dim_medium = etl.distinct(dim_medium_rename)
        # Export as csv to load folder
        etl.tocsv(dim_medium, 'load/dim_medium.csv')
    except Exception as e:
        print("Something went wrong. Error {0}".format(e))
def createDimCampaignType(events):
    try:
        dim_campaigntype_cut = etl.cut(events, 'utm_campaign')
        dim_campaigntype_rename = etl.rename(dim_campaigntype_cut, {'utm_campaign': 'campaign_type'})
        dim_campaigntype = etl.distinct(dim_campaigntype_rename)
        # export as csv to load folder
        etl.tocsv(dim_campaigntype, 'load/dim_campaigntype.csv')
    except Exception as e:
        print("Something went wrong. Error {0}".format(e))
Example #11
File: vcf.py Project: hexatonics/petlx
def unpackcall(tbl, *keys, **kwargs):
    """
    Unpack the call column. E.g.::
    
        >>> from petlx.vcf import fromvcf, unpackinfo, meltsamples, unpackcall
        >>> from petl import look, cutout
        >>> t1 = fromvcf('../fixture/sample.vcf')
        >>> t2 = meltsamples(t1)
        >>> t3 = unpackcall(t2)
        >>> t4 = cutout(t3, 'INFO')
        >>> look(t4)
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | 'CHROM' | 'POS' | 'ID'        | 'REF' | 'ALT' | 'QUAL' | 'FILTER' | 'SAMPLE'  | 'GT'  | 'GQ' | 'DP' | 'HQ'         |
        +=========+=======+=============+=======+=======+========+==========+===========+=======+======+======+==============+
        | '19'    |   111 | None        | 'A'   | [C]   |    9.6 | []       | 'NA00001' | '0|0' | None | None | [10, 10]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    |   111 | None        | 'A'   | [C]   |    9.6 | []       | 'NA00002' | '0|0' | None | None | [10, 10]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    |   111 | None        | 'A'   | [C]   |    9.6 | []       | 'NA00003' | '0/1' | None | None | [3, 3]       |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    |   112 | None        | 'A'   | [G]   |     10 | []       | 'NA00001' | '0|0' | None | None | [10, 10]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    |   112 | None        | 'A'   | [G]   |     10 | []       | 'NA00002' | '0|0' | None | None | [10, 10]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    |   112 | None        | 'A'   | [G]   |     10 | []       | 'NA00003' | '0/1' | None | None | [3, 3]       |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '20'    | 14370 | 'rs6054257' | 'G'   | [A]   |     29 | []       | 'NA00001' | '0|0' |   48 |    1 | [51, 51]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '20'    | 14370 | 'rs6054257' | 'G'   | [A]   |     29 | []       | 'NA00002' | '1|0' |   48 |    8 | [51, 51]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '20'    | 14370 | 'rs6054257' | 'G'   | [A]   |     29 | []       | 'NA00003' | '1/1' |   43 |    5 | [None, None] |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '20'    | 17330 | None        | 'T'   | [A]   |      3 | ['q10']  | 'NA00001' | '0|0' |   49 |    3 | [58, 50]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        
    .. versionadded:: 0.5
    
    """
    if not keys:
        if hasattr(tbl, 'filename'):
            try:
                import vcf as pyvcf
            except ImportError as e:
                raise UnsatisfiedDependency(e, dep_message)
            reader = pyvcf.Reader(filename=tbl.filename)
            # all FORMAT
            keys = reader.formats.keys()
        else:
            tbl = convert(tbl, 'CALL', lambda v: v.data._asdict()) # enable sampling of keys from data
    result = unpackdict(tbl, 'CALL', keys=keys)
    if 'prefix' in kwargs:
        result = rename(result, {k: kwargs['prefix'] + k for k in keys})
    if hasattr(tbl, 'filename'):
        return VCFWrapper(result, tbl.filename)
    else:
        return result
Example #12
File: transforms.py Project: mnimik/SWAPI
def transform_data(data):
    tbl_data = petl.fromdicts(data)
    tbl_data = petl.convert(
        tbl_data, {k: v['value']
                   for k, v in fields_to_transform.items()})
    tbl_data = petl.rename(
        tbl_data, {k: v['key']
                   for k, v in fields_to_transform.items()})
    tbl_data_allowed = petl.cut(tbl_data, *allowed_fields)

    return tbl_data_allowed
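
A minimal sketch (hypothetical mapping and toy data, not the project's real constants) of the shape fields_to_transform and allowed_fields appear to have, inferred from how convert(), rename() and cut() use them above:

from datetime import datetime
import petl

# Hypothetical constants, inferred from their usage in transform_data above.
fields_to_transform = {
    'edited': {
        'key': 'date',  # new field name consumed by rename()
        'value': lambda v: datetime.strptime(v, '%Y-%m-%dT%H:%M:%S.%fZ').strftime('%Y-%m-%d'),  # converter consumed by convert()
    },
}
allowed_fields = ('name', 'date')

data = [{'name': 'Luke Skywalker', 'edited': '2014-12-20T21:17:56.891000Z'}]
tbl = petl.fromdicts(data)
tbl = petl.convert(tbl, {k: v['value'] for k, v in fields_to_transform.items()})
tbl = petl.rename(tbl, {k: v['key'] for k, v in fields_to_transform.items()})
print(list(petl.dicts(petl.cut(tbl, *allowed_fields))))
# [{'name': 'Luke Skywalker', 'date': '2014-12-20'}]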
Example #13
File: etl.py Project: BenGalewsky/psm
def clean_and_separate(table):
    """Do some cleanup of TABLE and split into individual and business tables.

    TABLE is a petl table."""

    # Rename column to expand name
    table = etl.rename(table, {'WVRSTATE': 'waiverstate'})

    # More conversions
    table = etl.convert(
        table,
        {
            'EXCLTYPE': lambda f: f.strip(),  # Trim extra spaces
            'EXCLDATE': munge_date,  # Arrange date for sqlite
            'REINDATE': munge_date,  # Arrange date for sqlite
            'WAIVERDATE': munge_date  # Arrange date for sqlite
        })

    # Separate into two tables, as this is actually two different data sets
    individual = etl.select(table, "{LASTNAME} != '' and {FIRSTNAME} != ''")
    business = etl.select(table, "{LASTNAME} == '' and {FIRSTNAME} == ''")

    # Sanity check: Make sure we split the rows without dupes or
    # missing any.  The +1 is to account for the second header row
    # that gets counted when we have two tables.
    if len(business) + len(individual) != len(table) + 1:
        fatal(
            "Separating business and individual exclusions came up with the wrong number of rows!"
        )

    # Remove unused columns
    individual = etl.transform.basics.cutout(individual, "BUSNAME")
    business = etl.transform.basics.cutout(business, "LASTNAME", "FIRSTNAME",
                                           "MIDNAME", "DOB")

    # Do some cleanup conversions on individual data
    individual = etl.convert(
        individual,
        {
            'DOB': munge_date,
            'MIDNAME': lambda f: f
            if f != " " else ""  # no spaces as middle names
        })
    return individual, business
Example #14
def main(argv):

    parser = argparse.ArgumentParser()
    parser.add_argument("--pledges-filename", required=True, help="Input UTF8 CSV with pledges data "
        "dumped from Servant Keeper")
    parser.add_argument("--output-filename", required=True, help="Output CSV filename which will be loaded with "
        "pledges data in CCB import format ")
    parser.add_argument('--trace', action='store_true', help="If specified, prints tracing/progress messages to "
        "stdout")
    args = parser.parse_args()

    assert os.path.isfile(args.pledges_filename), "Error: cannot open file '" + args.pledges_filename + "'"

    table = petl.fromcsv(args.pledges_filename)

    table = petl.rename(table, {
        'Frequency': 'SK Frequency',
        'Individual ID': 'SK Individual ID'
        })

    table = petl.addfield(table, 'Individual ID', lambda rec: rec['SK Individual ID'])
    table = petl.addfield(table, 'Campus', '')
    table = petl.addfield(table, 'Category Pledged To', lambda rec: rec['Account'])
    table = petl.addfield(table, 'Amount Pledged', convert_amount)
    table = petl.addfield(table, 'Total Amount Pledged', lambda rec: rec['Pledged'])
    table = petl.addfield(table, 'Frequency', lambda rec: {1:'Yearly', 4:'Yearly', 12:'Monthly',
        52:'Weekly', 24:'Monthly', 2:'Yearly'}[int(rec['Payments'])])
    table = petl.addfield(table, 'Number of Gifts', lambda rec: {'Yearly':1, 'Monthly':12,
        'Weekly':52}[rec['Frequency']])
    table = petl.addfield(table, 'Length Multiplier', lambda rec: {'Yearly':'Years', 'Monthly':'Months',
        'Weekly':'Weeks'}[rec['Frequency']])
    table = petl.addfield(table, 'Start Date', lambda rec: {'Operating Income':'2013-01-01',
        'Mortgage Principal':'2013-01-01', 'Operating Income 2015':'2015-01-01'}[rec['Account']])
    table = petl.addfield(table, 'End Date', lambda rec: {'Operating Income':'2013-12-31',
        'Mortgage Principal':'2013-12-31', 'Operating Income 2015':'2015-12-31'}[rec['Account']])

    trace('CONVERTING AND THEN EMITTING TO CSV FILE...', args.trace, banner=True)

    table.progress(200).tocsv(args.output_filename)

    trace('OUTPUT TO CSV COMPLETE.', args.trace, banner=True)

    trace('DONE!', args.trace, banner=True)
Example #15
File: etl.py Project: kcym-3c/parsons
    def rename_column(self, column_name, new_column_name):
        """
        Rename a column

        `Args:`
            column_name: str
                The current column name
            new_column_name: str
                The new column name
        `Returns:`
            `Parsons Table` and also updates self
        """

        if new_column_name in self.columns:
            raise ValueError(f"Column {new_column_name} already exists")

        self.table = petl.rename(self.table, column_name, new_column_name)

        return self
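
A minimal usage sketch (toy data, not from the original file; assumes the Parsons Table constructor accepts a list of dicts and exposes a columns property):

from parsons import Table

# rename_column renames the wrapped petl column in place and returns self.
tbl = Table([{"first": "Jane", "last": "Doe"}])
tbl.rename_column("first", "first_name")
print(tbl.columns)  # ['first_name', 'last']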
Example #16
def xref_symbol_reports():
    symbol_reports = [
        f for f in os.listdir()
        if re.match('OCLC Datasync Unresolved.*\.csv', f)
    ]

    today = str(date.today())

    for report in symbol_reports:

        symbol_split = re.split('^.*processing.(M[A-Z]{2}).*$', report)
        symbol = symbol_split[1]
        xlsx_outfile = symbol + '_datasync_unresolved_' + today + '.xlsx'
        xls_outfile = symbol + '_datasync_unresolved_' + today + '.xls'
        txt_outfile = symbol + '_staging_OCNs_' + today + '.txt'

        symbol_table_raw = etl.fromcsv(report, encoding='utf-8')
        symbol_table = etl.rename(symbol_table_raw, '\ufeffMMS Id', 'MMS ID')
        symbol_table2 = etl.select(symbol_table, "{MMS ID} is not None")
        symbol_table_sorted = etl.sort(symbol_table2, 'MMS ID')

        xref_table = etl.fromcsv('unresxref.csv')
        xref_table2 = etl.select(xref_table, "{MMS ID} is not None")
        xref_table_sorted = etl.sort(xref_table2, 'MMS ID')

        symbol_xref_table = etl.join(symbol_table_sorted,
                                     xref_table_sorted,
                                     presorted=True,
                                     lkey="MMS ID",
                                     rkey="MMS ID")

        try:
            etl.toxlsx(symbol_xref_table, xlsx_outfile, encoding='utf-8')
        except TypeError:
            etl.toxls(symbol_xref_table,
                      xls_outfile,
                      'Sheet1',
                      encoding='utf-8')

        staging_ocns_table = etl.cut(symbol_xref_table, 'Staging OCN')
        template = '{Staging OCN}\n'
        etl.totext(staging_ocns_table, txt_outfile, template=template)
Example #17
 def get_relationships(self):
     "Parses a list of `Relationship` objects."
     core_file = _find_loinc_table_core_file(self.uri.path)
     core = etl.fromcsv(core_file, delimiter=',')
     core = etl.cut(core, ['LOINC_NUM', 'LONG_COMMON_NAME'])
     hierarchy_file = _find_multi_axial_hierarchy_file(self.uri.path)
     hierarchy = etl.fromcsv(hierarchy_file, delimiter=',')
     hierarchy = etl.leftjoin(hierarchy, core, lkey='CODE', rkey='LOINC_NUM')
     hierarchy = etl.cut(hierarchy, ['IMMEDIATE_PARENT', 'CODE', 'CODE_TEXT', 'LONG_COMMON_NAME'])
     hierarchy = etl.fillright(hierarchy)
     hierarchy = etl.cut(hierarchy, ['IMMEDIATE_PARENT', 'CODE', 'LONG_COMMON_NAME'])
     hierarchy = etl.rename(hierarchy, 'LONG_COMMON_NAME', 'CODE_TEXT')
     parents = etl.cut(hierarchy, ['CODE', 'CODE_TEXT'])
     hierarchy = etl.selectne(hierarchy, 'IMMEDIATE_PARENT', '')
     hierarchy = etl.leftjoin(hierarchy, parents, lkey='IMMEDIATE_PARENT', rkey='CODE', lprefix='source.', rprefix='target.')
     hierarchy = etl.distinct(hierarchy)
     if self.versioned:
         version = _parse_version(hierarchy_file)
         hierarchy = etl.addfield(hierarchy, 'version', version)
     hierarchy = etl.rowmapmany(hierarchy, _to_json, ['relationship'])
     return hierarchy
Example #18
def get_table(characters_data):
    characters_headers = [
        'name',
        'height',
        'mass',
        'hair_color',
        'skin_color',
        'eye_color',
        'birth_year',
        'gender',
        'homeworld',
        'edited',
    ]
    characters_table = etl.fromdicts(characters_data, characters_headers)
    characters_table = etl.rename(characters_table, 'edited', 'date')
    characters_table = etl.convert(
        characters_table, 'date', lambda date_field: datetime.strptime(
            date_field, '%Y-%m-%dT%H:%M:%S.%fZ').strftime('%Y-%m-%d'))

    characters_table = replace_homeworld_planet_name(characters_table)

    return characters_table
def tsv_fix(base_path, new_file_name, pk_list, illegal_columns_lower_case,
            tsv_process):
    if tsv_process:
        pwb_replace_in_file(new_file_name, '\0', '')  # Remove null bytes

    table = etl.fromcsv(new_file_name,
                        delimiter='\t',
                        skipinitialspace=True,
                        quoting=csv.QUOTE_NONE,
                        quotechar='',
                        escapechar='')

    row_count = etl.nrows(table)

    if tsv_process:
        tempfile = NamedTemporaryFile(mode='w',
                                      dir=base_path + "/content/data/",
                                      delete=False)

        table = pwb_lower_case_header(table)
        table = etl.rename(table, illegal_columns_lower_case, strict=False)

        print(new_file_name)
        for pk in pk_list:
            table = etl.convert(table, pk.lower(), lambda a: a
                                if len(str(a)) > 0 else '-')

        writer = csv.writer(tempfile,
                            delimiter='\t',
                            quoting=csv.QUOTE_NONE,
                            quotechar='',
                            escapechar='',
                            lineterminator='\n')
        writer.writerows(table)

        shutil.move(tempfile.name, new_file_name)
    return row_count
Example #20
# Load the files
users = etl.fromcsv('data/users.csv')
events = etl.fromjson('data/events2.json')

# Transform
# Dim Customers
# Filter necessary data only
dim_customers = etl.cut(users, 'user_id', 'email')
# Export as csv to load folder
etl.tocsv(dim_customers, 'load/dim_customers.csv')

# Dim Subscriptions
# Use the distinct values present in the type column to load into the dim subscription table
dim_subscriptions_cut = etl.cut(events, 'type')
dim_subscriptions_rename = etl.rename(dim_subscriptions_cut,
                                      {'type': 'subscription_name'})
dim_subscriptions = etl.distinct(dim_subscriptions_rename)
# Export as csv to load folder
etl.tocsv(dim_subscriptions, 'load/dim_subscriptions.csv')

# Dim Medium
# Use the distinct values present in the utm_medium column to load into the dim medium table
dim_medium_cut = etl.cut(events, 'utm_medium')
dim_medium_rename = etl.rename(dim_medium_cut, {'utm_medium': 'medium'})
dim_medium = etl.distinct(dim_medium_rename)
# Export as csv to load folder
etl.tocsv(dim_medium, 'load/dim_medium.csv')

# Dim Campaign Type
# Use the distinct values present in the utm_campaign column to load into the dim campaign table
# Note:
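
The script breaks off above; a minimal sketch of the remaining dimension, assuming it follows the same cut/rename/distinct pattern as createDimCampaignType shown earlier:

# Sketch only; mirrors the functions shown in the earlier example.
dim_campaigntype_cut = etl.cut(events, 'utm_campaign')
dim_campaigntype_rename = etl.rename(dim_campaigntype_cut, {'utm_campaign': 'campaign_type'})
dim_campaigntype = etl.distinct(dim_campaigntype_rename)
# Export as csv to load folder
etl.tocsv(dim_campaigntype, 'load/dim_campaigntype.csv')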
Example #21
from __future__ import division, print_function, absolute_import


# rename()
##########

import petl as etl
table1 = [['sex', 'age'],
          ['m', 12],
          ['f', 34],
          ['-', 56]]
# rename a single field
table2 = etl.rename(table1, 'sex', 'gender')
table2
# rename multiple fields by passing a dictionary as the second argument
table3 = etl.rename(table1, {'sex': 'gender', 'age': 'age_years'})
table3


# setheader()
#############

import petl as etl
table1 = [['foo', 'bar'],
          ['a', 1],
          ['b', 2]]
table2 = etl.setheader(table1, ['foofoo', 'barbar'])
table2


# extendheader()
Example #22
def transform_and_aggregate_datetimes(query_results, rollup):
    """transform datetime to the correct TZ; aggregate the values in the query results 
    based on the datetime rollup args. Aggregation is performed for:

    * hourly or daily time intervals
    * total

    NOTE: in order to handle potential No-Data values in the DB during aggregation, we
    convert them to 0. The `src` field then indicates if any values in the rollup were N/D.
    Then, if the value field in the aggregated row still shows 0 after summation, *and*
    the src field shows N/D, we turn that zero into None. If there was a partial reading
    (e.g., the sensor has values for the first half hour but N/D for the second, and we are 
    doing an hourly rollup), then the values will stay there, but the source field will indicate
    both N/D and whatever the source was for the workable sensor values.

    TODO: move this work over to the database query

    """
    t1 = etl\
        .fromdicts(query_results)\
        .convert('xts', lambda v: v.astimezone(TZ).isoformat(), failonerror=True)
        #.rename('xts', 'ts')
    # print("t1")
    # print(t1)

    # print("rollup", rollup)
    if rollup in [INTERVAL_DAILY, INTERVAL_HOURLY]:

        petl_aggs = OrderedDict(
            val=('val', _sumround), # sum the rainfall values
            src=('src', _listset) # create a list of all rainfall sources included in the rollup
        )

        t2 = etl\
            .convert(
                t1,
                'xts', 
                lambda v: _rollup_date(v, rollup), # convert datetimes to their rolled-up value in iso-format
                failonerror=True
            )\
            .convert(
                'val', 
                lambda v: 0 if v is None else v, # convert rainfall values to 0 if no-data
                failonerror=True
            )\
            .aggregate(
                ('xts', 'sid'), 
                petl_aggs # aggregate rainfall values (sum) and sources (list) for each timestamp+ID combo,
            )\
            .convert(
                'val', 
                lambda v, r: None if ('N/D' in r.src and v == 0) else v, # replace 0 values with no data if aggregated source says it's N/D
                pass_row=True,
                failonerror=True
            )\
            .sort('sid')
            # .convert(
            #     'xts', 
            #     lambda v: TZ.localize(parse(v)).isoformat(), # convert that datetime to iso format w/ timezone
            #     failonerror=True
            # )
        # print("t2 time rollup")

    elif rollup in [INTERVAL_SUM]:

        petl_aggs = OrderedDict(
            val=('val', _sumround), # sum the rainfall values
            src=('src', _listset), # create a list of all rainfall sources included in the rollup
            xts=('xts', _minmax) # create an ISO datetime range string from the min and max datetimes found
        )

        t2 = etl\
            .aggregate(
                t1,
                'sid', 
                petl_aggs # aggregate rainfall values (sum) and sources (list), and datetimes (str) for each ID,
            )\
            .convert(
                'val', 
                lambda v, r: None if ('N/D' in r.src and v == 0) else v, # replace 0 values with no data if aggregated source says it's N/D
                pass_row=True
            )\
            .sort('sid')

        # print("t2 sum")

    else:
        t2 = t1
    # print("t2 = t1")

    # print(t2)
    # h = etl.header(t2)

    # rename the timestamp and sensor id fields, 
    # print("t2 header:", list(etl.header(t2)))
    # rename_kw = {}
    # for h1, h0 in [('xts', 'ts'), ('sid', 'id')]:
    #     if h1 in h:
    #         rename_kw[h1] = h0
    # if len(rename_kw.items()) > 0:
    #     {'xts':'ts', 'sid':'id'}
    t3 = etl.rename(t2, {'xts':'ts', 'sid':'id'}, strict=False)
    # else:
    #     t3 = t2
    # print("t3")
    # print(t3)

    # convert to list of dicts and return
    return list(etl.dicts(t3))
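
A minimal sketch (toy rows and stand-in aggregation helpers, not the module's real _sumround/_listset) of the No-Data handling described in the docstring: N/D readings are summed as 0, and a rolled-up value only becomes None when every reading in the bucket was N/D.

from collections import OrderedDict
import petl as etl

# Toy rows standing in for query_results after the datetime rollup;
# both readings for sensor 'B' are No-Data, sensor 'A' has a partial reading.
rows = [
    {'xts': '2021-05-01T01:00', 'sid': 'A', 'val': 0.1,  'src': 'gauge'},
    {'xts': '2021-05-01T01:00', 'sid': 'A', 'val': None, 'src': 'N/D'},
    {'xts': '2021-05-01T01:00', 'sid': 'B', 'val': None, 'src': 'N/D'},
    {'xts': '2021-05-01T01:00', 'sid': 'B', 'val': None, 'src': 'N/D'},
]

aggs = OrderedDict(
    val=('val', sum),                                   # stand-in for _sumround
    src=('src', lambda vs: '/'.join(sorted(set(vs)))),  # stand-in for _listset
)

t = (
    etl.fromdicts(rows)
    .convert('val', lambda v: 0 if v is None else v)    # count N/D readings as 0 for the sum
    .aggregate(('xts', 'sid'), aggs)
    .convert('val',
             lambda v, r: None if ('N/D' in r.src and v == 0) else v,  # all-N/D bucket -> None
             pass_row=True)
)
print(list(etl.dicts(t)))
# sensor A keeps its partial total (0.1); sensor B's value becomes None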
Example #23
        elif (country == 'Kosovo') or (country == 'Holy See'):
            return 'EU'
        else:
            return 'N/A'


# Source of the data we are going to read
uri_confirmed = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
uri_death = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
uri_recovered = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'

# Process confirmed cases first
t_confirmed = etl.fromcsv(uri_confirmed)

# Rename the headers
t_confirmed = etl.rename(t_confirmed, {'Country/Region': 'Country'})

# Adjust the data types
# From the 5th column onward the data type is integer: the number of people/cases
# We also take the opportunity to change the date format in the header from 1/23/20 to 2020-01-23
headers = etl.fieldnames(t_confirmed)
i = 0
for header in headers:
    if i >= 4:
        t_confirmed = etl.convert(t_confirmed, header,
                                  int)  # fix the data type
        fecha = datetime.datetime.strptime(
            header, '%m/%d/%y')  # compute the date in the correct format
        t_confirmed = etl.rename(t_confirmed, header,
                                 fecha.strftime('%Y-%m-%d'))
    i = i + 1
Example #24
File: vcf.py Project: pombredanne/petlx
def unpackinfo(tbl, *keys, **kwargs):
    """
    Unpack the INFO field into separate fields. E.g.::

        >>> from petlx.vcf import fromvcf, unpackinfo
        >>> from petl import look
        >>> t1 = fromvcf('../fixture/sample.vcf', samples=False)
        >>> look(t1)
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | 'CHROM' | 'POS'   | 'ID'        | 'REF' | 'ALT'     | 'QUAL' | 'FILTER' | 'INFO'                                                                                  |
        +=========+=========+=============+=======+===========+========+==========+=========================================================================================+
        | '19'    |     111 | None        | 'A'   | [C]       |    9.6 | []       | {}                                                                                      |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | '19'    |     112 | None        | 'A'   | [G]       |     10 | []       | {}                                                                                      |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | '20'    |   14370 | 'rs6054257' | 'G'   | [A]       |     29 | []       | OrderedDict([('NS', 3), ('DP', 14), ('AF', [0.5]), ('DB', True), ('H2', True)])         |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | '20'    |   17330 | None        | 'T'   | [A]       |      3 | ['q10']  | OrderedDict([('NS', 3), ('DP', 11), ('AF', [0.017])])                                   |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | '20'    | 1110696 | 'rs6040355' | 'A'   | [G, T]    |     67 | []       | OrderedDict([('NS', 2), ('DP', 10), ('AF', [0.333, 0.667]), ('AA', 'T'), ('DB', True)]) |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | '20'    | 1230237 | None        | 'T'   | [None]    |     47 | []       | OrderedDict([('NS', 3), ('DP', 13), ('AA', 'T')])                                       |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | '20'    | 1234567 | 'microsat1' | 'G'   | [GA, GAC] |     50 | []       | OrderedDict([('NS', 3), ('DP', 9), ('AA', 'G'), ('AN', 6), ('AC', [3, 1])])             |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | '20'    | 1235237 | None        | 'T'   | [None]    | None   | []       | {}                                                                                      |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | 'X'     |      10 | 'rsTest'    | 'AC'  | [A, ATG]  |     10 | []       | {}                                                                                      |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        
        >>> t2 = unpackinfo(t1)
        >>> look(t2)
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | 'CHROM' | 'POS'   | 'ID'        | 'REF' | 'ALT'     | 'QUAL' | 'FILTER' | 'NS' | 'AN' | 'AC'   | 'DP' | 'AF'           | 'AA' | 'DB' | 'H2' |
        +=========+=========+=============+=======+===========+========+==========+======+======+========+======+================+======+======+======+
        | '19'    |     111 | None        | 'A'   | [C]       |    9.6 | []       | None | None | None   | None | None           | None | None | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | '19'    |     112 | None        | 'A'   | [G]       |     10 | []       | None | None | None   | None | None           | None | None | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | '20'    |   14370 | 'rs6054257' | 'G'   | [A]       |     29 | []       |    3 | None | None   |   14 | [0.5]          | None | True | True |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | '20'    |   17330 | None        | 'T'   | [A]       |      3 | ['q10']  |    3 | None | None   |   11 | [0.017]        | None | None | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | '20'    | 1110696 | 'rs6040355' | 'A'   | [G, T]    |     67 | []       |    2 | None | None   |   10 | [0.333, 0.667] | 'T'  | True | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | '20'    | 1230237 | None        | 'T'   | [None]    |     47 | []       |    3 | None | None   |   13 | None           | 'T'  | None | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | '20'    | 1234567 | 'microsat1' | 'G'   | [GA, GAC] |     50 | []       |    3 |    6 | [3, 1] |    9 | None           | 'G'  | None | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | '20'    | 1235237 | None        | 'T'   | [None]    | None   | []       | None | None | None   | None | None           | None | None | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | 'X'     |      10 | 'rsTest'    | 'AC'  | [A, ATG]  |     10 | []       | None | None | None   | None | None           | None | None | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
    
    .. versionadded:: 0.5
    
    """
    if not keys:
        if hasattr(tbl, 'filename'):
            try:
                import vcf as pyvcf
            except ImportError as e:
                raise UnsatisfiedDependency(e, dep_message)
            reader = pyvcf.Reader(filename=tbl.filename)
            # all INFO
            keys = reader.infos.keys()
    result = unpackdict(tbl, 'INFO', keys=keys)
    if 'prefix' in kwargs:
        result = rename(result, {k: kwargs['prefix'] + k for k in keys})
    if hasattr(tbl, 'filename'):
        return VCFWrapper(result, tbl.filename)
    else:
        return result
Example #25
def sales_summary(start_dt=None, end_dt=None):
    """tally up gross (sale over list) profits
    TODO: tally up net profits (gross profit vs inventory purchase total)

    TODO: Keyword Arguments:
        start_dt {[type]} -- datetime for start of query (default: {None})
        end_dt {[type]} -- datetime for end of query (default: {None})

    Returns:
        [dict] -- various types of sales information, stored in a dictionary.
    """

    # products = db.session.query(Product).all()
    # sales = db.session.query(Sale).all()

    # retrieve existing tables
    products_records = etl.fromdb(db.engine, 'SELECT * FROM product')
    sales_records = etl.fromdb(db.engine, 'SELECT * FROM sale')

    # join product info to sales data
    sales_data = etl.join(sales_records,
                          products_records,
                          lkey='product_id',
                          rkey='id')

    # prep joined sales data for tabulation
    sales_data = etl.convert(sales_data, 'date', lambda dt: format_date(dt))
    sales_data = etl.sort(sales_data, 'date')
    sales_data = etl.convert(sales_data, 'quantity',
                             lambda q: handle_none(q, replace_with=1))
    sales_data = etl.addfield(sales_data, 'profit',
                              lambda rec: calculate_profit(rec))
    sales_data = etl.addfield(sales_data, 'gross_sales',
                              lambda rec: calculate_gross_sales(rec))

    # summarize data into charting-friendly data structures
    chart_count = etl.fold(sales_data,
                           'date',
                           operator.add,
                           'quantity',
                           presorted=True)
    chart_count = etl.rename(chart_count, {'key': 'x', 'value': 'y'})
    chart_count, chart_count_missing_date = etl.biselect(
        chart_count, lambda rec: rec.x is not None)
    # print(chart_count)
    # etl.lookall(chart_count)

    chart_gross = etl.fold(sales_data,
                           'date',
                           operator.add,
                           'gross_sales',
                           presorted=True)
    chart_gross = etl.rename(chart_gross, {'key': 'x', 'value': 'y'})
    chart_gross, chart_gross_missing_date = etl.biselect(
        chart_gross, lambda rec: rec.x is not None)
    # print(chart_gross)
    # etl.lookall(chart_gross)

    chart_profit = etl.fold(sales_data,
                            'date',
                            operator.add,
                            'profit',
                            presorted=True)
    chart_profit = etl.rename(chart_profit, {'key': 'x', 'value': 'y'})
    chart_profit, chart_profit_missing_date = etl.biselect(
        chart_profit, lambda rec: rec.x is not None)

    # tabulate some figures
    gross_sales = 0
    profits = 0
    for sale in etl.dicts(sales_data):
        profits += calculate_profit(sale)
        gross_sales += calculate_gross_sales(sale)

    # for i in etl.dicts(chart_count):
    #     print(i)
    # for i in etl.dicts(chart_gross):
    #     print(i)

    return {
        'gross_sales': gross_sales,
        'profits': profits,
        'chart_gross': list(etl.dicts(chart_gross)),
        'chart_gross_missing_date': list(etl.dicts(chart_gross_missing_date)),
        'chart_profit': list(etl.dicts(chart_profit)),
        'chart_profit_missing_date':
        list(etl.dicts(chart_profit_missing_date)),
        'chart_count': list(etl.dicts(chart_count)),
        'chart_count_missing_date': list(etl.dicts(chart_count_missing_date))
    }
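
A minimal sketch (toy rows, not the app's database) of why the fold/rename pattern above produces chart-friendly rows: petl.fold() emits a two-column table headed 'key' and 'value', which is then renamed to 'x'/'y'.

import operator
import petl as etl

# Toy daily quantities, already sorted by date (hence presorted=True).
toy = [['date', 'quantity'],
       ['2021-01-01', 2],
       ['2021-01-01', 3],
       ['2021-01-02', 1]]

folded = etl.fold(toy, 'date', operator.add, 'quantity', presorted=True)
# folded header is ('key', 'value'): [('2021-01-01', 5), ('2021-01-02', 1)]
chart = etl.rename(folded, {'key': 'x', 'value': 'y'})
print(list(etl.dicts(chart)))
# [{'x': '2021-01-01', 'y': 5}, {'x': '2021-01-02', 'y': 1}]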
Example #26
                              lkey='id',
                              rkey='director_id')

# print(directorsAndGenresAndMovies)
# print(actorsAndRoles)
# print(moviesAndGenresAndDirectorsAndRoles)

# Delete unnecessary columns from all tables
ranks = etl.cut(moviesAndDirectorsAndRoles, 'movie_id', 'rank', 'director_id',
                'actor_id')
movies = etl.cut(moviesAndDirectorsAndRoles, 'movie_id', 'name')
directors = etl.cut(directorsAndMovies, 'id', 'full_name')
actors = etl.cut(actors, 'id', 'full_name')

# Rename id to include table name
directors = etl.rename(directors, 'id', 'director_id')
actors = etl.rename(actors, 'id', 'actor_id')

# Remove rows with NULL ranks
ranks = etl.distinct(ranks)
ranks = etl.selectnotnone(ranks, 'rank')

# Remove duplicates after cutting columns
movies = etl.distinct(movies)
directors = etl.distinct(directors)
actors = etl.distinct(actors)

# Insert final tables into data warehouse
etl.todb(ranks, imdbWarehouse, 'ranks')
etl.todb(movies, imdbWarehouse, 'movies')
etl.todb(actors, imdbWarehouse, 'actors')
########## JSON extraction and mapping
tableJ = etl.fromjson('cust_data.json', header=['id','gender','first_name','last_name', 'email','ville'])
tableJ = etl.movefield(tableJ, 'gender', 4)

########## CSV extraction and conversion
tableCSV = etl.fromcsv('week_cust.csv')
tableCSV = etl.convert(tableCSV, 'id', int)

########### SQL Server connection and extraction
connectionSqlServer=pyodbc.connect("Driver={SQL Server Native Client 11.0};" "Server=81_64_msdn;" "Database=BD4client;" "Trusted_Connection=yes;" "convert_unicode =True;")
cursor = connectionSqlServer.cursor()
cursor.execute('SELECT id, first_name, last_name, email, gender, ville FROM client_DATA')
tableSqlServer = cursor.fetchall()
tableSqlServer =[('id','first_name','last_name', 'email','gender','ville')]+tableSqlServer
cursor.close()
connectionSqlServer.close()

######### Staging area transforming and concatenation
StagingArea = etl.cat(tableCSV, tableJ,tableSqlServer)
StagingArea = etl.convert(StagingArea, 'gender', {'Male': 'M', 'Female': 'F', 'male': 'M', 'female': 'F', None: 'N'})
StagingArea = etl.rename(StagingArea, 'ville', 'city')

######## mysql
connection = mysql.connect(host="localhost", user="******", passwd="", db="customerdatabase")
curseur = connection.cursor()
curseur.execute('SET SQL_MODE=ANSI_QUOTES')
#### load data, assuming table "customerdata" already exists in the database
etl.appenddb(StagingArea, connection, 'customerdata', schema='customerdatabase', commit='commit')
curseur.close()
connection.close()
Example #28
def procesar_fuente(path, nombre):
    try: 
        # Process confirmed cases first
        tabla = etl.fromcsv(path)

        # Rename the headers
        tabla = etl.rename(tabla, {'Country/Region': 'Country'})

        # Adjust the data types
        # From the 5th column onward the data type is integer: the number of people/cases
        # We also take the opportunity to change the date format in the header from 1/23/20 to 2020-01-23
        headers = etl.fieldnames(tabla)
        i=0
        for header in headers:
            if i>=4:
                tabla = etl.convert(tabla, header, int)        # fix the data type
                fecha = datetime.datetime.strptime(header, '%m/%d/%y')    # compute the date in the correct format
                tabla = etl.rename(tabla, header, fecha.strftime('%Y-%m-%d'))   
            i = i + 1

        # Drop the Province/State, Lat and Long columns, which we will not use
        tabla = etl.cutout(tabla, 0, 2, 3)

        # Normalize some country names so we can later assign them a region/continent
        tabla = etl.convert(tabla, 'Country', 'replace', 'Congo (Brazzaville)', 'Congo')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Congo (Kinshasa)', 'Democratic Republic of the Congo')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Cote d\'Ivoire', 'Ivory Coast')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Korea, South', 'South Korea')
        tabla = etl.convert(tabla, 'Country', 'replace', 'West Bank and Gaza', 'Palestine')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Burma', 'Myanmar')
        tabla = etl.convert(tabla, 'Country', 'replace', 'US', 'USA')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Taiwan*', 'Taiwan')

        # Then group and accumulate the results by country
        df_confirmed = etl.todataframe(tabla)
        df = df_confirmed.groupby(['Country']).sum()
        tabla = etl.fromdataframe(df, include_index=True)

        # Rename the Country field again
        tabla = etl.rename(tabla, {'index': 'Country'})

        # Then melt the date columns into data rows and rename the new columns
        tabla = etl.melt(tabla, 'Country')
        tabla = etl.rename(tabla, {'variable': 'Date'})
        tabla = etl.rename(tabla, {'value': 'Cases'})

        # Then add the continent for grouping
        tabla = etl.addfield(tabla, 'Continent', lambda rec: get_continent_code(rec['Country']))

        # And again make sure the fields have the data types they should.
        tabla = etl.convert(tabla, 'Cases', int)
        tabla = etl.convert(tabla, 'Date', lambda v: datetime.datetime.strptime(v, '%Y-%m-%d') )

        # Finally, upload the data to the data repository
        conn = pymysql.connect(password='******', database='covid', user='******')
        conn.cursor().execute('SET SQL_MODE=ANSI_QUOTES')
        etl.todb(tabla, conn, nombre, create=True, drop=True)
        conn.close()
    except:
        print('An error has occurred! ', sys.exc_info()[0])
        raise
Example #29
def main(argv):

    parser = argparse.ArgumentParser()
    parser.add_argument("--pledges-filename",
                        required=True,
                        help="Input UTF8 CSV with pledges data "
                        "dumped from Servant Keeper")
    parser.add_argument("--output-filename",
                        required=True,
                        help="Output CSV filename which will be loaded with "
                        "pledges data in CCB import format ")
    parser.add_argument(
        '--trace',
        action='store_true',
        help="If specified, prints tracing/progress messages to "
        "stdout")
    args = parser.parse_args()

    assert os.path.isfile(
        args.pledges_filename
    ), "Error: cannot open file '" + args.pledges_filename + "'"

    table = petl.fromcsv(args.pledges_filename)

    table = petl.rename(table, {
        'Frequency': 'SK Frequency',
        'Individual ID': 'SK Individual ID'
    })

    table = petl.addfield(table, 'Individual ID',
                          lambda rec: rec['SK Individual ID'])
    table = petl.addfield(table, 'Campus', '')
    table = petl.addfield(table, 'Category Pledged To',
                          lambda rec: rec['Account'])
    table = petl.addfield(table, 'Amount Pledged', convert_amount)
    table = petl.addfield(table, 'Total Amount Pledged',
                          lambda rec: rec['Pledged'])
    table = petl.addfield(
        table, 'Frequency', lambda rec: {
            1: 'Yearly',
            4: 'Yearly',
            12: 'Monthly',
            52: 'Weekly',
            24: 'Monthly',
            2: 'Yearly'
        }[int(rec['Payments'])])
    table = petl.addfield(
        table, 'Number of Gifts', lambda rec: {
            'Yearly': 1,
            'Monthly': 12,
            'Weekly': 52
        }[rec['Frequency']])
    table = petl.addfield(
        table, 'Length Multiplier', lambda rec: {
            'Yearly': 'Years',
            'Monthly': 'Months',
            'Weekly': 'Weeks'
        }[rec['Frequency']])
    table = petl.addfield(
        table, 'Start Date', lambda rec: {
            'Operating Income': '2013-01-01',
            'Mortgage Principal': '2013-01-01',
            'Operating Income 2015': '2015-01-01'
        }[rec['Account']])
    table = petl.addfield(
        table, 'End Date', lambda rec: {
            'Operating Income': '2013-12-31',
            'Mortgage Principal': '2013-12-31',
            'Operating Income 2015': '2015-12-31'
        }[rec['Account']])

    trace('CONVERTING AND THEN EMITTING TO CSV FILE...',
          args.trace,
          banner=True)

    table.progress(200).tocsv(args.output_filename)

    trace('OUTPUT TO CSV COMPLETE.', args.trace, banner=True)

    trace('DONE!', args.trace, banner=True)
Example #30
def main():
    mort = fromcsv('mortality.csv')
    mort = skip(mort,2)
    mort = cut(mort,"Country Name","2006","2007","2008","2009","2010","2011","2012")
    mort = petl.rename(mort, 'Country Name', 'country')
    mort_rates = facetcolumns(mort, 'country')
    def getmort(c,y):
        try:
          return float(mort_rates[c][y][0])
        except:
          return None
        
    ''' faceting allows us to do for example:
    pprint(mort_rates['Malaysia']['2009'])
    pprint(mort_rates['Denmark'])
   
    '''

    f = open('mnhc.ids')
    base_url = 'http://localhost:7000/cube/cida/'
    fact_url = base_url+'facts?cut=project_number:'#A033033007
    aggregate_url = base_url+ 'aggregate?cut=project_number:'#A033033007
    bads = ['Africa MC','Americas MC','Asia MC','Europe MC']
    renamethis={'Afghanistan TIS':'Afghanistan',
                'Tanzania,Un Rep':'Tanzania',
                'Lao, Dem. Rep.':'Lao',
                'Congo, Dem Rep.':'Congo'}

    '''
    use project browser instead, since there seems to be nothing that uniquely identifies historical activities,
    for example there are 6 entries for #A033033007
    '''

    mashup_file=open('mashup.json','w')
    mashup_data=[]
    for id in list(f)[0:-1]:

        project_data = json.load(urllib2.urlopen(fact_url + id))
         # now we can append aggregates
        aggr = json.load(urllib2.urlopen(aggregate_url + id))['summary']
        # just grab the first record
            
        p=project_data[0]
        c= p['country_region_name']
        country =  renamethis[c] if  c in renamethis.keys() else c
    
        if country not in bads:
            mashup_data.append({
                  'id':p['id'],
                  'project':p['project_number'],
                  'year':p['fiscal_year'],
                  'continent':p['continent_name'],
                  'country':country,
                  'max':int(round(p['maximum_cida_contribution'])),
                  'spent':int(round(aggr['amount_spent_sum'])),
                  'mortality_rate':getmort(c,p['fiscal_year'])})

            
    mashup=fromdicts(mashup_data)
    tocsv(mashup,'raw-mnhc.csv')
    

    key_fields = ['project', 'max', 'year', 'country', 'mortality_rate', 'id', 'continent']
    value_field = 'spent'
    '''
    tbl_out = (
        tbl_in
        .aggregate(key=key_fields, aggregation=sum, value=value_field)
        .unpack('key', key_fields)
        .rename('value', value_field)
    )   
    '''
    tbl_out = aggregate(mashup, ['country','year','mortality_rate'], sum, 'spent')
    tbl_out = unpack(tbl_out, 'key',['country','year','mortality_rate'])
    tbl_out = rename(tbl_out, 'value','spent')
    tbl_out = cut(tbl_out, 'country','year','spent','mortality_rate') #cut to reorder
    
    
    print look(tbl_out)
    tojson(tbl_out, "mashup.json")
    tocsv(tbl_out, "mnhc-report.csv")
Example #31
    '26WeekPriceReturnDaily', '3MonthAverageTradingVolume', '52WeekHigh',
    '52WeekHighDate', '52WeekLow', '52WeekLowDate', '52WeekPriceReturnDaily',
    '5DayPriceReturnDaily'
]
important_metrics = {}
for metric in list_of_metrics:
    important_metrics[metric] = metrics[metric]
important_metrics['_date'] = str(datetime.datetime.now())[:19]
table_metrics = etl.fromdicts([important_metrics])
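# prefix the digit-leading metric fields with '_' (presumably so the names are valid identifiers/columns downstream)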
table_financial_metrics = etl.rename(
    table_metrics, {
        '10DayAverageTradingVolume': '_10DayAverageTradingVolume',
        '13WeekPriceReturnDaily': '_13WeekPriceReturnDaily',
        '26WeekPriceReturnDaily': '_26WeekPriceReturnDaily',
        '3MonthAverageTradingVolume': '_3MonthAverageTradingVolume',
        '52WeekHigh': '_52WeekHigh',
        '52WeekHighDate': '_52WeekHighDate',
        '52WeekLow': '_52WeekLow',
        '52WeekLowDate': '_52WeekLowDate',
        '52WeekPriceReturnDaily': '_52WeekPriceReturnDaily',
        '5DayPriceReturnDaily': '_5DayPriceReturnDaily'
    })
table_financial_metrics = etl.addfield(table_financial_metrics, 'symbol',
                                       stock)

list_of_sentiments = ['bearishPercent', 'bullishPercent']
important_sentiments = {}
for sentiments in list_of_sentiments:
    important_sentiments[sentiments] = sentiment['sentiment'][sentiments]
important_sentiments['date_and_time'] = str(datetime.datetime.now())[:19]
table_sentiment = etl.fromdicts([important_sentiments])
Example #32
File: examples.py  Project: aklimchak/petl
fc['b']['bar']
fc['b']['baz']
fc['c']


# rename

table1 = [['sex', 'age'],
          ['m', 12],
          ['f', 34],
          ['-', 56]]

from petl import look, rename
look(table1)
# rename a single field
table2 = rename(table1, 'sex', 'gender')
look(table2)
# rename multiple fields by passing a dictionary as the second argument
table3 = rename(table1, {'sex': 'gender', 'age': 'age_years'})
look(table3)
# the returned table object can also be used to modify the field mapping via item assignment (square-bracket notation)
table4 = rename(table1)
table4['sex'] = 'gender'
table4['age'] = 'age_years'
look(table4)


# cut

table1 = [['foo', 'bar', 'baz'],
          ['A', 1, 2.7],
Example #33
def dimension_values():
    connection = psycopg2.connect(dbname='voyager',
                                  user='******',
                                  password='******',
                                  host='172.16.0.45')
    engine = create_engine('postgresql://*****:*****@172.16.0.45:5432/voyager')

    com = 'select id as id_component, name as component from dim_com'
    table_com = etl.fromdb(connection, com)

    loc = 'select id as id_location, name as name from dim_loc'
    table_loc = etl.fromdb(connection, loc)

    tim = 'select id as id_time, time as timestamp from dim_time'
    table_time = etl.fromdb(connection, tim)

    print(table_com)
    print(table_loc)
    print(table_time)

    for ran in range(0, 65424, 1000):
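        # fetch the next block of 1000 KNMI rows, joined to the weather-station metadata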
        sql = "select * from KNMI_station_data kk " \
              "RIGHT JOIN weatherstations w ON " \
              " CAST (kk.weather_station_id AS INTEGER)  = CAST (w.station_number AS INTEGER) " \
              "WHERE w.station_number NOT LIKE \'NL%%\' AND date > 20190901 LIMIT 1000 OFFSET %s" % ran
        print(sql)
        table = etl.fromdb(connection, sql)

        print('knmi')
        print(table)
        table.log_progress()
        table = etl.convert(table, 'date', str)
        table = etl.convert(table, 'hour', str)

        table = etl.convert(table, 'temperature', int)
        table = etl.convert(table, 'temperature_dew', int)
        table = etl.convert(table, 'temperature_min', int)
        table = etl.convert(table, 'wind_speed_avg', int)
        table = etl.convert(table, 'wind_speed', int)
        table = etl.convert(table, 'wind_speed_max', int)

        # the raw values are stored in tenths of a unit; scale them back to whole units
        table = etl.convert(table, 'temperature', lambda v: v / 10)
        table = etl.convert(table, 'temperature_dew', lambda v: v / 10)
        table = etl.convert(table, 'temperature_min', lambda v: v / 10)
        table = etl.convert(table, 'wind_speed_avg', lambda v: v / 10)
        table = etl.convert(table, 'wind_speed', lambda v: v / 10)
        table = etl.convert(table, 'wind_speed_max', lambda v: v / 10)

        df = pd.DataFrame(table)
        df.columns = df.iloc[0]
        df = df.drop(0)
        df['timestamp'] = df['date'] + df['hour']

        df['weather_station_id'] = df['weather_station_id'].astype(str)
        df['timestamp'] = df['timestamp'].apply(custom_to_datetime)
        df['timestamp'] = df['timestamp'].astype(str)

        df = df.drop(columns=['date', 'hour'], axis=1)

        final_knmi_table = etl.fromdataframe(df)

        # melt the wide KNMI measurement columns into long format ('variable'/'value' pairs)
        final_knmi_table = etl.melt(final_knmi_table,
                                    key=[
                                        'weather_station_id', 'timestamp',
                                        'id', 'latitude', 'longitude', 'name',
                                        'station_number', 'data_source_id',
                                        'altitude'
                                    ])
        final_knmi_table = etl.rename(final_knmi_table, 'variable',
                                      'component')
        print(final_knmi_table)

        final_knmi_table2 = etl.join(final_knmi_table,
                                     table_com,
                                     key='component')
        final_knmi_table2 = etl.join(final_knmi_table2, table_loc, key='name')
        final_knmi_table2 = etl.join(final_knmi_table2,
                                     table_time,
                                     key='timestamp')
        print('dos')

        print(final_knmi_table2)
        df = pd.DataFrame(final_knmi_table2)
        df.columns = df.iloc[0]
        df = df.drop(0)
        fact_source = df[[
            'id_component', 'id_location', 'id_time', 'value',
            'data_source_id', 'weather_station_id'
        ]]

        print(fact_source)
        fact_source.to_sql('fact_source',
                           engine,
                           if_exists='append',
                           index=False,
                           method='multi')

    for rn in range(0, 1148, 1000):
        print('lmn')
        final_lmn_table = etl.fromdb(
            connection,
            "select ld.id, ld.station_number, ld.value, ld.timestamp, ls.name as component, "
            "ws.id as lid, ws.latitude, ws.longitude, ws.data_source_id, ws.altitude, ws.name as name"
            " from luchtmeetnet_data ld "
            "right join luchtmeetnet_sensors ls on ld.formula = ls.formula "
            " join weatherstations ws on ld.station_number = ws.station_number "
            "where ws.station_number like \'NL%%\' AND timestamp > '2019-09-01' "
            "LIMIT 1000 OFFSET %s" % rn)
        final_lmn_table = etl.rename(final_lmn_table,
                                     {'station_number': 'weather_station_id'})
        final_lmn_table = etl.movefield(final_lmn_table, 'timestamp', 1)
        # print(final_lmn_table)
        # print(final_lmn_table)

        # print(table_com)
        final_lmn_table2 = etl.join(final_lmn_table,
                                    table_com,
                                    key='component')
        # print(final_lmn_table2)

        final_lmn_table2 = etl.join(final_lmn_table2, table_loc, key='name')
        # print(final_lmn_table2)
        df = pd.DataFrame(final_lmn_table2)
        df.columns = df.iloc[0]
        df = df.drop(0)
        df['timestamp'] = df['timestamp'].str[:-6]
        # print(df)

        final_lmn_table2 = etl.fromdataframe(df)

        final_lmn_table2 = etl.join(final_lmn_table2,
                                    table_time,
                                    key='timestamp')
        # print(final_lmn_table2)

        print(final_lmn_table2)
        final_lmn_df = pd.DataFrame(final_lmn_table2)
        final_lmn_df.columns = final_lmn_df.iloc[0]
        final_lmn_df = final_lmn_df.drop(0)
        fact_source = final_lmn_df[[
            'id_component', 'id_location', 'id_time', 'value',
            'data_source_id', 'weather_station_id'
        ]]
        print(fact_source)

        fact_source.to_sql('fact_source',
                           engine,
                           if_exists='append',
                           index=False,
                           method='multi')
Example #35
def rename_headers(table, mappings):
    return etl.rename(table, mappings)
Example #36
def append_tailings_reports_to_code_required_reports(connection, commit=False):
    src_table = etl.fromdb(
        connection,
        'SELECT exp_doc.mine_guid, exp_doc.exp_document_guid, req_doc.req_document_name, exp_doc.due_date, exp_doc.exp_document_status_code, exp_doc.received_date, exp_doc.active_ind, exp_doc_x.mine_document_guid, exp_doc.create_user, exp_doc.create_timestamp, exp_doc.update_user, exp_doc.update_timestamp from mine_expected_document exp_doc \
        inner join mine_expected_document_xref exp_doc_x on exp_doc.exp_document_guid = exp_doc_x.exp_document_guid\
        inner join mds_required_document req_doc on req_doc.req_document_guid = exp_doc.req_document_guid'
    )

    req_document_crr_defintion_map = [
        ['req_document_name', 'mine_report_definition_id'],
        ['Summary of TSF and Dam Safety Recommendations', 28],
        ['ITRB Activities Report', 27],
        ['Register of Tailings Storage Facilities and Dams', 47],
        ['Dam Safety Inspection (DSI) Report', 26],
        ['Dam Safety Review (DSR) Report', 31],
        ['“As-built” Reports', 32],
        ['Annual Reclamation', 25],
        ['MERP Record of Testing', 3],
        #['Annual Manager\'s Report', __________________ ], no mapping or data, ignore.
        ['OMS Manual', 33],
        ['Annual reconciliation of water balance and water management plans', 44],
        ['TSF risk assessment', 46],
        ['Mine Emergency Preparedness and Response Plan (MERP)', 24],
        ['Performance of high risk dumps', 29]
    ]
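    # this mapping is used below as an in-memory petl table, joined on req_document_name to attach mine_report_definition_id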

    table1 = etl.join(src_table, req_document_crr_defintion_map, 'req_document_name')
    mine_report = etl.cutout(table1, 'req_document_name')

    #to be inserted into db
    mine_report = etl.addfield(mine_report, 'submission_year', 2019)
    mine_report = etl.rename(mine_report, 'exp_document_status_code',
                             'mine_report_submission_status_code')
    mine_report = etl.addfield(mine_report, 'deleted_ind', lambda x: not x.active_ind)
    mine_report = etl.cutout(mine_report, 'active_ind')
    #to determine what FK's will be so can insert into related tables
    max_report_id = etl.fromdb(connection,
                               'select last_value from public.mine_report_mine_report_id_seq')[1][0]
    max_report_submission_id = etl.fromdb(
        connection,
        'select last_value from public.mine_report_submission_mine_report_submission_id_seq')[1][0]

    #if sequence hasn't been used yet, fix off by one
    if max_report_id == 1:
        max_report_id = 0
    if max_report_submission_id == 1:
        max_report_submission_id = 0

    #get one-to-many
    mine_report, mine_report_submission_documents = etl.unjoin(mine_report,
                                                               'mine_document_guid',
                                                               key='exp_document_guid')
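    # unjoin has split off the repeating mine_document_guid values into a separate table keyed by exp_document_guid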

    #add PK's for mappings
    mine_report_with_ids = etl.addrownumbers(mine_report,
                                             start=max_report_id + 1,
                                             step=1,
                                             field='mine_report_id')
    mine_report_with_ids = etl.addrownumbers(mine_report_with_ids,
                                             start=max_report_submission_id + 1,
                                             step=1,
                                             field='mine_report_submission_id')
    print(f'max_report_id= {max_report_id}, max_report_submission_id={max_report_submission_id}')
    #copy out fields for submission tables
    mine_report_submissions = etl.cut(mine_report_with_ids, [
        'mine_report_id', 'exp_document_guid', 'mine_report_submission_status_code', 'create_user',
        'create_timestamp', 'update_user', 'update_timestamp'
    ])
    mine_report_submissions = etl.addfield(mine_report_submissions,
                                           'submission_date', lambda x: x.create_timestamp)
    #remove fields not in mine_report
    mine_report = etl.cutout(mine_report, 'mine_report_submission_status_code')

    #replace exp_document_guid FK with mine_report_submission FK
    submission_id_lookup = etl.cut(mine_report_with_ids,
                                   ['mine_report_submission_id', 'exp_document_guid'])
    mine_report_submission_documents = etl.join(submission_id_lookup,
                                                mine_report_submission_documents,
                                                key='exp_document_guid')
    mine_report_submission_documents = etl.cutout(mine_report_submission_documents,
                                                  'exp_document_guid')

    #removed original PK
    mine_report = etl.cutout(mine_report, 'exp_document_guid')
    mine_report_submissions = etl.cutout(mine_report_submissions, 'exp_document_guid')

    print(etl.valuecounter(etl.distinct(table1, key='exp_document_guid'), 'req_document_name'))
    print(etl.valuecounter(mine_report, 'mine_report_definition_id'))
    print(table1)
    print(mine_report)
    print(mine_report_submissions)
    print(mine_report_submission_documents)

 
    etl.appenddb(mine_report, connection, 'mine_report', commit=False)
    print('INSERT mine_report staged')
    etl.appenddb(mine_report_submissions, connection, 'mine_report_submission', commit=False)
    print('INSERT mine_report_submission staged')
    etl.appenddb(mine_report_submission_documents,
                    connection,
                    'mine_report_document_xref',
                    commit=False)
    print('INSERT mine_report_document_xref staged')
    if commit:  
        connection.commit()
        print('DATA CREATION COMPLETE')
    else:
        connection.rollback()
        print('NO DATA CREATED: add --commit=true to insert report rows')
Example #37
File: vcf.py  Project: hexatonics/petlx
def unpackinfo(tbl, *keys, **kwargs):
    """
    Unpack the INFO field into separate fields. E.g.::

        >>> from petlx.vcf import fromvcf, unpackinfo
        >>> from petl import look
        >>> t1 = fromvcf('../fixture/sample.vcf', samples=False)
        >>> look(t1)
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | 'CHROM' | 'POS'   | 'ID'        | 'REF' | 'ALT'     | 'QUAL' | 'FILTER' | 'INFO'                                                                                  |
        +=========+=========+=============+=======+===========+========+==========+=========================================================================================+
        | '19'    |     111 | None        | 'A'   | [C]       |    9.6 | []       | {}                                                                                      |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | '19'    |     112 | None        | 'A'   | [G]       |     10 | []       | {}                                                                                      |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | '20'    |   14370 | 'rs6054257' | 'G'   | [A]       |     29 | []       | OrderedDict([('NS', 3), ('DP', 14), ('AF', [0.5]), ('DB', True), ('H2', True)])         |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | '20'    |   17330 | None        | 'T'   | [A]       |      3 | ['q10']  | OrderedDict([('NS', 3), ('DP', 11), ('AF', [0.017])])                                   |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | '20'    | 1110696 | 'rs6040355' | 'A'   | [G, T]    |     67 | []       | OrderedDict([('NS', 2), ('DP', 10), ('AF', [0.333, 0.667]), ('AA', 'T'), ('DB', True)]) |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | '20'    | 1230237 | None        | 'T'   | [None]    |     47 | []       | OrderedDict([('NS', 3), ('DP', 13), ('AA', 'T')])                                       |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | '20'    | 1234567 | 'microsat1' | 'G'   | [GA, GAC] |     50 | []       | OrderedDict([('NS', 3), ('DP', 9), ('AA', 'G'), ('AN', 6), ('AC', [3, 1])])             |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | '20'    | 1235237 | None        | 'T'   | [None]    | None   | []       | {}                                                                                      |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | 'X'     |      10 | 'rsTest'    | 'AC'  | [A, ATG]  |     10 | []       | {}                                                                                      |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        
        >>> t2 = unpackinfo(t1)
        >>> look(t2)
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | 'CHROM' | 'POS'   | 'ID'        | 'REF' | 'ALT'     | 'QUAL' | 'FILTER' | 'NS' | 'AN' | 'AC'   | 'DP' | 'AF'           | 'AA' | 'DB' | 'H2' |
        +=========+=========+=============+=======+===========+========+==========+======+======+========+======+================+======+======+======+
        | '19'    |     111 | None        | 'A'   | [C]       |    9.6 | []       | None | None | None   | None | None           | None | None | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | '19'    |     112 | None        | 'A'   | [G]       |     10 | []       | None | None | None   | None | None           | None | None | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | '20'    |   14370 | 'rs6054257' | 'G'   | [A]       |     29 | []       |    3 | None | None   |   14 | [0.5]          | None | True | True |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | '20'    |   17330 | None        | 'T'   | [A]       |      3 | ['q10']  |    3 | None | None   |   11 | [0.017]        | None | None | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | '20'    | 1110696 | 'rs6040355' | 'A'   | [G, T]    |     67 | []       |    2 | None | None   |   10 | [0.333, 0.667] | 'T'  | True | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | '20'    | 1230237 | None        | 'T'   | [None]    |     47 | []       |    3 | None | None   |   13 | None           | 'T'  | None | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | '20'    | 1234567 | 'microsat1' | 'G'   | [GA, GAC] |     50 | []       |    3 |    6 | [3, 1] |    9 | None           | 'G'  | None | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | '20'    | 1235237 | None        | 'T'   | [None]    | None   | []       | None | None | None   | None | None           | None | None | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | 'X'     |      10 | 'rsTest'    | 'AC'  | [A, ATG]  |     10 | []       | None | None | None   | None | None           | None | None | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
    
    .. versionadded:: 0.5
    
    """
    if not keys:
        if hasattr(tbl, 'filename'):
            try:
                import vcf as pyvcf
            except ImportError as e:
                raise UnsatisfiedDependency(e, dep_message)
            reader = pyvcf.Reader(filename=tbl.filename)
            # all INFO
            keys = reader.infos.keys()
    result = unpackdict(tbl, 'INFO', keys=keys)
    if 'prefix' in kwargs:
        result = rename(result, {k: kwargs['prefix'] + k for k in keys})
    if hasattr(tbl, 'filename'):
        return VCFWrapper(result, tbl.filename)
    else:
        return result