def vcfunpackinfo(table, *keys): """ Unpack the INFO field into separate fields. E.g.:: >>> import petl as etl >>> # activate bio extensions ... import petlx.bio >>> table1 = ( ... etl ... .fromvcf('fixture/sample.vcf', samples=None) ... .vcfunpackinfo() ... ) >>> table1 +-------+---------+-------------+-----+--------+------+---------+------+------+----------------+------+------+------+------+------+ | CHROM | POS | ID | REF | ALT | QUAL | FILTER | AA | AC | AF | AN | DB | DP | H2 | NS | +=======+=========+=============+=====+========+======+=========+======+======+================+======+======+======+======+======+ | '19' | 111 | None | 'A' | [C] | 9.6 | None | None | None | None | None | None | None | None | None | +-------+---------+-------------+-----+--------+------+---------+------+------+----------------+------+------+------+------+------+ | '19' | 112 | None | 'A' | [G] | 10 | None | None | None | None | None | None | None | None | None | +-------+---------+-------------+-----+--------+------+---------+------+------+----------------+------+------+------+------+------+ | '20' | 14370 | 'rs6054257' | 'G' | [A] | 29 | [] | None | None | [0.5] | None | True | 14 | True | 3 | +-------+---------+-------------+-----+--------+------+---------+------+------+----------------+------+------+------+------+------+ | '20' | 17330 | None | 'T' | [A] | 3 | ['q10'] | None | None | [0.017] | None | None | 11 | None | 3 | +-------+---------+-------------+-----+--------+------+---------+------+------+----------------+------+------+------+------+------+ | '20' | 1110696 | 'rs6040355' | 'A' | [G, T] | 67 | [] | 'T' | None | [0.333, 0.667] | None | True | 10 | None | 2 | +-------+---------+-------------+-----+--------+------+---------+------+------+----------------+------+------+------+------+------+ ... """ result = etl.unpackdict(table, 'INFO', keys=keys) return result
def unpackcall(tbl, *keys, **kwargs): """ Unpack the call column. E.g.:: >>> from petlx.vcf import fromvcf, unpackinfo, meltsamples, unpackcall >>> from petl import look, cutout >>> t1 = fromvcf('../fixture/sample.vcf') >>> t2 = meltsamples(t1) >>> t3 = unpackcall(t2) >>> t4 = cutout(t3, 'INFO') >>> look(t4) +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | 'CHROM' | 'POS' | 'ID' | 'REF' | 'ALT' | 'QUAL' | 'FILTER' | 'SAMPLE' | 'GT' | 'GQ' | 'DP' | 'HQ' | +=========+=======+=============+=======+=======+========+==========+===========+=======+======+======+==============+ | '19' | 111 | None | 'A' | [C] | 9.6 | [] | 'NA00001' | '0|0' | None | None | [10, 10] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | '19' | 111 | None | 'A' | [C] | 9.6 | [] | 'NA00002' | '0|0' | None | None | [10, 10] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | '19' | 111 | None | 'A' | [C] | 9.6 | [] | 'NA00003' | '0/1' | None | None | [3, 3] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | '19' | 112 | None | 'A' | [G] | 10 | [] | 'NA00001' | '0|0' | None | None | [10, 10] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | '19' | 112 | None | 'A' | [G] | 10 | [] | 'NA00002' | '0|0' | None | None | [10, 10] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | '19' | 112 | None | 'A' | [G] | 10 | [] | 'NA00003' | '0/1' | None | None | [3, 3] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | '20' | 14370 | 'rs6054257' | 'G' | [A] | 29 | [] | 'NA00001' | '0|0' | 48 | 1 | [51, 51] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | '20' | 14370 | 'rs6054257' | 'G' | [A] | 29 | [] | 'NA00002' | '1|0' | 48 | 8 | [51, 51] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | '20' | 14370 | 'rs6054257' | 'G' | [A] | 29 | [] | 'NA00003' | '1/1' | 43 | 5 | [None, None] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | '20' | 17330 | None | 'T' | [A] | 3 | ['q10'] | 'NA00001' | '0|0' | 49 | 3 | [58, 50] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ .. versionadded:: 0.5 """ if not keys: if hasattr(tbl, 'filename'): try: import vcf as pyvcf except ImportError as e: raise UnsatisfiedDependency(e, dep_message) reader = pyvcf.Reader(filename=tbl.filename) # all FORMAT keys = reader.formats.keys() else: tbl = convert(tbl, 'CALL', lambda v: v.data._asdict() ) # enable sampling of keys from data result = unpackdict(tbl, 'CALL', keys=keys) if 'prefix' in kwargs: result = rename(result, {k: kwargs['prefix'] + k for k in keys}) if hasattr(tbl, 'filename'): return VCFWrapper(result, tbl.filename) else: return result
def unpackcall(tbl, *keys, **kwargs): """ Unpack the call column. E.g.:: >>> from petlx.vcf import fromvcf, unpackinfo, meltsamples, unpackcall >>> from petl import look, cutout >>> t1 = fromvcf('../fixture/sample.vcf') >>> t2 = meltsamples(t1) >>> t3 = unpackcall(t2) >>> t4 = cutout(t3, 'INFO') >>> look(t4) +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | 'CHROM' | 'POS' | 'ID' | 'REF' | 'ALT' | 'QUAL' | 'FILTER' | 'SAMPLE' | 'GT' | 'GQ' | 'DP' | 'HQ' | +=========+=======+=============+=======+=======+========+==========+===========+=======+======+======+==============+ | '19' | 111 | None | 'A' | [C] | 9.6 | [] | 'NA00001' | '0|0' | None | None | [10, 10] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | '19' | 111 | None | 'A' | [C] | 9.6 | [] | 'NA00002' | '0|0' | None | None | [10, 10] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | '19' | 111 | None | 'A' | [C] | 9.6 | [] | 'NA00003' | '0/1' | None | None | [3, 3] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | '19' | 112 | None | 'A' | [G] | 10 | [] | 'NA00001' | '0|0' | None | None | [10, 10] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | '19' | 112 | None | 'A' | [G] | 10 | [] | 'NA00002' | '0|0' | None | None | [10, 10] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | '19' | 112 | None | 'A' | [G] | 10 | [] | 'NA00003' | '0/1' | None | None | [3, 3] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | '20' | 14370 | 'rs6054257' | 'G' | [A] | 29 | [] | 'NA00001' | '0|0' | 48 | 1 | [51, 51] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | '20' | 14370 | 'rs6054257' | 'G' | [A] | 29 | [] | 'NA00002' | '1|0' | 48 | 8 | [51, 51] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | '20' | 14370 | 'rs6054257' | 'G' | [A] | 29 | [] | 'NA00003' | '1/1' | 43 | 5 | [None, None] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ | '20' | 17330 | None | 'T' | [A] | 3 | ['q10'] | 'NA00001' | '0|0' | 49 | 3 | [58, 50] | +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+ .. versionadded:: 0.5 """ if not keys: if hasattr(tbl, 'filename'): try: import vcf as pyvcf except ImportError as e: raise UnsatisfiedDependency(e, dep_message) reader = pyvcf.Reader(filename=tbl.filename) # all FORMAT keys = reader.formats.keys() else: tbl = convert(tbl, 'CALL', lambda v: v.data._asdict()) # enable sampling of keys from data result = unpackdict(tbl, 'CALL', keys=keys) if 'prefix' in kwargs: result = rename(result, {k: kwargs['prefix'] + k for k in keys}) if hasattr(tbl, 'filename'): return VCFWrapper(result, tbl.filename) else: return result
def unpack_dict(self, column, keys=None, include_original=False, sample_size=1000, missing=None, prepend=True, prepend_value=None): """ Unpack dictionary values from one column into separate columns `Args:` column: str The column name to unpack keys: list The dict keys in the column to unpack. If ``None`` will unpack all. include_original: boolean Retain original column after unpacking sample_size: int Number of rows to sample before determining columns missing: str If a value is missing, the value to fill it with prepend: Prepend the column name of the unpacked values. Useful for avoiding duplicate column names prepend_value: Value to prepend new columns if ``prepend=True``. If None, will set to column name. """ if prepend: if prepend_value is None: prepend_value = column self.table = petl.convert( self.table, column, lambda v: self._prepend_dict(v, prepend_value)) self.table = petl.unpackdict(self.table, column, keys=keys, includeoriginal=include_original, samplesize=sample_size, missing=missing) return self
# time consumed retrieving rows from t2 c2.time # actual time consumed by the convert step c2.time - c1.time # unpackdict table1 = (('foo', 'bar'), (1, {'baz': 'a', 'quux': 'b'}), (2, {'baz': 'c', 'quux': 'd'}), (3, {'baz': 'e', 'quux': 'f'})) from petl import unpackdict, look look(table1) table2 = unpackdict(table1, 'bar') look(table2) # unique table1 = (('foo', 'bar', 'baz'), ('A', 1, 2), ('B', '2', '3.4'), ('D', 'xyz', 9.0), ('B', u'3', u'7.8'), ('B', '2', 42), ('E', None, None), ('D', 4, 12.3), ('F', 7, 2.3)) from petl import unique, look
def unpackinfo(tbl, *keys, **kwargs): """ Unpack the INFO field into separate fields. E.g.:: >>> from petlx.vcf import fromvcf, unpackinfo >>> from petl import look >>> t1 = fromvcf('../fixture/sample.vcf', samples=False) >>> look(t1) +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+ | 'CHROM' | 'POS' | 'ID' | 'REF' | 'ALT' | 'QUAL' | 'FILTER' | 'INFO' | +=========+=========+=============+=======+===========+========+==========+=========================================================================================+ | '19' | 111 | None | 'A' | [C] | 9.6 | [] | {} | +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+ | '19' | 112 | None | 'A' | [G] | 10 | [] | {} | +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+ | '20' | 14370 | 'rs6054257' | 'G' | [A] | 29 | [] | OrderedDict([('NS', 3), ('DP', 14), ('AF', [0.5]), ('DB', True), ('H2', True)]) | +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+ | '20' | 17330 | None | 'T' | [A] | 3 | ['q10'] | OrderedDict([('NS', 3), ('DP', 11), ('AF', [0.017])]) | +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+ | '20' | 1110696 | 'rs6040355' | 'A' | [G, T] | 67 | [] | OrderedDict([('NS', 2), ('DP', 10), ('AF', [0.333, 0.667]), ('AA', 'T'), ('DB', True)]) | +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+ | '20' | 1230237 | None | 'T' | [None] | 47 | [] | OrderedDict([('NS', 3), ('DP', 13), ('AA', 'T')]) | +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+ | '20' | 1234567 | 'microsat1' | 'G' | [GA, GAC] | 50 | [] | OrderedDict([('NS', 3), ('DP', 9), ('AA', 'G'), ('AN', 6), ('AC', [3, 1])]) | +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+ | '20' | 1235237 | None | 'T' | [None] | None | [] | {} | +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+ | 'X' | 10 | 'rsTest' | 'AC' | [A, ATG] | 10 | [] | {} | +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+ >>> t2 = unpackinfo(t1) >>> look(t2) +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+ | 'CHROM' | 'POS' | 'ID' | 'REF' | 'ALT' | 'QUAL' | 'FILTER' | 'NS' | 'AN' | 'AC' | 'DP' | 'AF' | 'AA' | 'DB' | 'H2' | +=========+=========+=============+=======+===========+========+==========+======+======+========+======+================+======+======+======+ | '19' | 111 | None | 'A' | [C] | 9.6 | [] | None | None | None | None | None | None | None | None | +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+ | '19' | 112 | None | 'A' | [G] | 10 | [] | None | None | None | None | None | None | None | None | +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+ | '20' | 14370 | 'rs6054257' | 'G' | [A] | 29 | [] | 3 | None | None | 14 | [0.5] | None | True | True | +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+ | '20' | 17330 | None | 'T' | [A] | 3 | ['q10'] | 3 | None | None | 11 | [0.017] | None | None | None | +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+ | '20' | 1110696 | 'rs6040355' | 'A' | [G, T] | 67 | [] | 2 | None | None | 10 | [0.333, 0.667] | 'T' | True | None | +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+ | '20' | 1230237 | None | 'T' | [None] | 47 | [] | 3 | None | None | 13 | None | 'T' | None | None | +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+ | '20' | 1234567 | 'microsat1' | 'G' | [GA, GAC] | 50 | [] | 3 | 6 | [3, 1] | 9 | None | 'G' | None | None | +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+ | '20' | 1235237 | None | 'T' | [None] | None | [] | None | None | None | None | None | None | None | None | +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+ | 'X' | 10 | 'rsTest' | 'AC' | [A, ATG] | 10 | [] | None | None | None | None | None | None | None | None | +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+ .. versionadded:: 0.5 """ if not keys: if hasattr(tbl, 'filename'): try: import vcf as pyvcf except ImportError as e: raise UnsatisfiedDependency(e, dep_message) reader = pyvcf.Reader(filename=tbl.filename) # all INFO keys = reader.infos.keys() result = unpackdict(tbl, 'INFO', keys=keys) if 'prefix' in kwargs: result = rename(result, {k: kwargs['prefix'] + k for k in keys}) if hasattr(tbl, 'filename'): return VCFWrapper(result, tbl.filename) else: return result
key=('room_id'), aggregation=aggregations) external_ids = etl.fromdicts( [{ 'id': '3979', 'external_id': '95109151-af77-11e9-94fa-a860b6030e49' }, { 'id': '3980', 'external_id': '95d8c92e-af77-11e9-99b7-a860b6030e49' }, { 'id': '3982', 'external_id': '97163c4a-af77-11e9-bdf9-a860b6030e49' }], header=['id', 'external_id']) aggregated_summary = etl.unpackdict(aggregated_summary, 'creation_data') file_name = 'datasets-%s.csv' % datetime.now().strftime('%Y%m%d%H%M%S') directory = 'csv' if not os.path.exists(directory): os.makedirs(directory) # etl.tocsv(aggregated_summary, './%s/%s' % (directory, file_name)) # logging.info('This %s has been exported' % file_name) rooms, participations = storing_data_preparation(aggregated_summary) participations = etl.leftjoin(participations, external_ids, lkey='participant_id', rkey='id', rprefix='r_')
from __future__ import absolute_import, print_function, division # unpack() ########## import petl as etl table1 = [['foo', 'bar'], [1, ['a', 'b']], [2, ['c', 'd']], [3, ['e', 'f']]] table2 = etl.unpack(table1, 'bar', ['baz', 'quux']) table2 # unpackdict() ############## import petl as etl table1 = [['foo', 'bar'], [1, {'baz': 'a', 'quux': 'b'}], [2, {'baz': 'c', 'quux': 'd'}], [3, {'baz': 'e', 'quux': 'f'}]] table2 = etl.unpackdict(table1, 'bar') table2