def vcfmeltsamples(table, *samples):
    """
    Melt the samples columns. E.g.::

        >>> import petl as etl
        >>> # activate bio extensions
        ... import petlx.bio
        >>> table1 = (
        ...     etl
        ...     .fromvcf('fixture/sample.vcf')
        ...     .vcfmeltsamples()
        ... )
        >>> table1
        +-------+-----+------+-----+-----+------+--------+------+-----------+-----------------------------------------------------+
        | CHROM | POS | ID   | REF | ALT | QUAL | FILTER | INFO | SAMPLE    | CALL                                                |
        +=======+=====+======+=====+=====+======+========+======+===========+=====================================================+
        | '19'  | 111 | None | 'A' | [C] | 9.6  | None   | {}   | 'NA00001' | Call(sample=NA00001, CallData(GT=0|0, HQ=[10, 10])) |
        +-------+-----+------+-----+-----+------+--------+------+-----------+-----------------------------------------------------+
        | '19'  | 111 | None | 'A' | [C] | 9.6  | None   | {}   | 'NA00002' | Call(sample=NA00002, CallData(GT=0|0, HQ=[10, 10])) |
        +-------+-----+------+-----+-----+------+--------+------+-----------+-----------------------------------------------------+
        | '19'  | 111 | None | 'A' | [C] | 9.6  | None   | {}   | 'NA00003' | Call(sample=NA00003, CallData(GT=0/1, HQ=[3, 3]))   |
        +-------+-----+------+-----+-----+------+--------+------+-----------+-----------------------------------------------------+
        | '19'  | 112 | None | 'A' | [G] | 10   | None   | {}   | 'NA00001' | Call(sample=NA00001, CallData(GT=0|0, HQ=[10, 10])) |
        +-------+-----+------+-----+-----+------+--------+------+-----------+-----------------------------------------------------+
        | '19'  | 112 | None | 'A' | [G] | 10   | None   | {}   | 'NA00002' | Call(sample=NA00002, CallData(GT=0|0, HQ=[10, 10])) |
        +-------+-----+------+-----+-----+------+--------+------+-----------+-----------------------------------------------------+
        ...

    """
    result = etl.melt(table, key=VCF_HEADER, variables=samples,
                      variablefield='SAMPLE', valuefield='CALL')
    return result
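# A minimal fluent-style sketch of vcfmeltsamples (an assumption based on the
# signature above: since *samples is forwarded to petl.melt as ``variables``,
# passing sample names should melt only those sample columns; the fixture path
# and sample names come from the docstring example):
import petl as etl
import petlx.bio  # activate bio extensions

table = (
    etl
    .fromvcf('fixture/sample.vcf')
    .vcfmeltsamples('NA00001', 'NA00002')  # melt just these two sample columns
)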
def meltsamples(tbl, *samples):
    """
    Melt the samples columns. E.g.::

        >>> from petlx.vcf import fromvcf, unpackinfo, meltsamples
        >>> from petl import look, cutout
        >>> t1 = fromvcf('../fixture/sample.vcf')
        >>> t2 = meltsamples(t1)
        >>> t3 = cutout(t2, 'INFO')
        >>> look(t3)
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | 'CHROM' | 'POS' | 'ID'        | 'REF' | 'ALT' | 'QUAL' | 'FILTER' | 'SAMPLE'  | 'CALL'                                                               |
        +=========+=======+=============+=======+=======+========+==========+===========+======================================================================+
        | '19'    | 111   | None        | 'A'   | [C]   | 9.6    | []       | 'NA00001' | Call(sample=NA00001, CallData(GT=0|0, HQ=[10, 10]))                  |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | '19'    | 111   | None        | 'A'   | [C]   | 9.6    | []       | 'NA00002' | Call(sample=NA00002, CallData(GT=0|0, HQ=[10, 10]))                  |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | '19'    | 111   | None        | 'A'   | [C]   | 9.6    | []       | 'NA00003' | Call(sample=NA00003, CallData(GT=0/1, HQ=[3, 3]))                    |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | '19'    | 112   | None        | 'A'   | [G]   | 10     | []       | 'NA00001' | Call(sample=NA00001, CallData(GT=0|0, HQ=[10, 10]))                  |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | '19'    | 112   | None        | 'A'   | [G]   | 10     | []       | 'NA00002' | Call(sample=NA00002, CallData(GT=0|0, HQ=[10, 10]))                  |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | '19'    | 112   | None        | 'A'   | [G]   | 10     | []       | 'NA00003' | Call(sample=NA00003, CallData(GT=0/1, HQ=[3, 3]))                    |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | '20'    | 14370 | 'rs6054257' | 'G'   | [A]   | 29     | []       | 'NA00001' | Call(sample=NA00001, CallData(GT=0|0, GQ=48, DP=1, HQ=[51, 51]))     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | '20'    | 14370 | 'rs6054257' | 'G'   | [A]   | 29     | []       | 'NA00002' | Call(sample=NA00002, CallData(GT=1|0, GQ=48, DP=8, HQ=[51, 51]))     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | '20'    | 14370 | 'rs6054257' | 'G'   | [A]   | 29     | []       | 'NA00003' | Call(sample=NA00003, CallData(GT=1/1, GQ=43, DP=5, HQ=[None, None])) |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+
        | '20'    | 17330 | None        | 'T'   | [A]   | 3      | ['q10']  | 'NA00001' | Call(sample=NA00001, CallData(GT=0|0, GQ=49, DP=3, HQ=[58, 50]))     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+----------------------------------------------------------------------+

    .. versionadded:: 0.5

    """
    result = melt(tbl, key=fixed_fields, variables=samples,
                  variablefield='SAMPLE', valuefield='CALL')
    if hasattr(tbl, 'filename'):
        return VCFWrapper(result, tbl.filename)
    else:
        return result
# melt

table1 = [['id', 'gender', 'age'],
          [1, 'F', 12],
          [2, 'M', 17],
          [3, 'M', 16]]
table3 = [['id', 'time', 'height', 'weight'],
          [1, 11, 66.4, 12.2],
          [2, 16, 53.2, 17.3],
          [3, 12, 34.5, 9.4]]

from petl import melt, look
look(table1)
table2 = melt(table1, 'id')
look(table2)

# compound keys are supported
look(table3)
table4 = melt(table3, key=['id', 'time'])
look(table4)

# a subset of variable fields can be selected
table5 = melt(table3, key=['id', 'time'], variables=['height'])
look(table5)

# recast
table1 = [['id', 'variable', 'value'],
          [3, 'age', 16],
          [1, 'gender', 'F'],
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace', 'Burma', 'Myanmar')
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace', 'US', 'USA')
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace', 'Taiwan*', 'Taiwan')

# Group and accumulate the results by country
df_confirmed = etl.todataframe(t_confirmed)
df = df_confirmed.groupby(['Country']).sum()
t_confirmed = etl.fromdataframe(df, include_index=True)

# Rename the Country field again
t_confirmed = etl.rename(t_confirmed, {'index': 'Country'})

# Melt the date columns into rows and rename the new columns
t_confirmed = etl.melt(t_confirmed, 'Country')
t_confirmed = etl.rename(t_confirmed, {'variable': 'Date'})
t_confirmed = etl.rename(t_confirmed, {'value': 'Cases'})

# Add the continent so we can group by it later
t_confirmed = etl.addfield(t_confirmed, 'Continent',
                           lambda rec: get_continent_code(rec['Country']))

# And once again make sure the fields have the data types they should have
t_confirmed = etl.convert(t_confirmed, 'Cases', int)
t_confirmed = etl.convert(t_confirmed, 'Date',
                          lambda v: datetime.datetime.strptime(v, '%Y-%m-%d'))

# Finally, load the table into the data repository
conn = pymysql.connect(password='******', database='covid', user='******')
conn.cursor().execute('SET SQL_MODE=ANSI_QUOTES')
def long_table(self, key, column, key_rename=None, retain_original=False,
               prepend=True, prepend_value=None):
    """
    Create a new long parsons table from a column, including the foreign key.

    .. code-block:: python

        # Begin with nested dicts in a column
        json = [{'id': '5421',
                 'name': 'Jane Green',
                 'emails': [{'home': '*****@*****.**'},
                            {'work': '*****@*****.**'}]
                 }]
        tbl = Table(json)
        print (tbl)
        >>> {'id': '5421', 'name': 'Jane Green', 'emails': [{'home': '*****@*****.**'}, {'work': '*****@*****.**'}]}  # noqa: E501
        >>> {'id': '5421', 'name': 'Jane Green', 'emails': [{'home': '*****@*****.**'}, {'work': '*****@*****.**'}]}  # noqa: E501

        # Create skinny table of just the nested dicts
        email_skinny = tbl.long_table(['id'], 'emails')

        print (email_skinny)
        >>> {'id': '5421', 'emails_home': '*****@*****.**', 'emails_work': None}
        >>> {'id': '5421', 'emails_home': None, 'emails_work': '*****@*****.**'}

    `Args:`
        key: lst
            The columns to retain in the long table (e.g. foreign keys)
        column: str
            The column name to make long
        key_rename: dict
            The new name for the foreign key to better identify it. For
            example, you might want to rename ``id`` to ``person_id``.
            Ex. {'KEY_NAME': 'NEW_KEY_NAME'}
        retain_original: boolean
            Retain the original column from the source table.
        prepend:
            Prepend the column name of the unpacked values. Useful for
            avoiding duplicate column names
        prepend_value:
            Value to prepend new columns if ``prepend=True``. If None, will
            set to column name.
    `Returns:`
        Parsons Table
            The new long table
    """
    if isinstance(key, str):
        key = [key]

    lt = self.cut(*key, column)            # Create a table of key and column
    lt.unpack_list(column, replace=True)   # Unpack the list
    lt.table = petl.melt(lt.table, key)    # Melt into a long table
    lt = lt.cut(*key, 'value')             # Get rid of column names created in unpack
    lt.rename_column('value', column)      # Rename 'value' to old column name
    lt.remove_null_rows(column)            # Remove null values

    # If a new key name is specified, rename
    if key_rename:
        for k, v in key_rename.items():
            lt.rename_column(k, v)

    # If there is a nested dict in the column, unpack it
    if lt.num_rows > 0 and isinstance(lt.table[column][0], dict):
        lt.unpack_dict(column, prepend=prepend, prepend_value=prepend_value)

    if not retain_original:
        self.remove_column(column)

    return lt
def unpack_nested_columns_as_rows(self, column, key='id', expand_original=False):
    """
    Unpack list or dict values from one column into separate rows.
    Not recommended for JSON columns (i.e. lists of dicts), but can handle
    columns with any mix of types. Makes use of PETL's `melt()` method.

    `Args:`
        column: str
            The column name to unpack
        key: str
            The column to use as a key when unpacking. Defaults to `id`
        expand_original: boolean or int
            If `True`: Add resulting unpacked rows (with all other columns) to original
            If `int`: Add to original unless the max added per key is above the given number
            If `False` (default): Return unpacked rows (with `key` column only) as standalone

            Removes packed list and dict rows from original either way.
    `Returns:`
        If `expand_original`, original table with packed rows replaced by unpacked rows
        Otherwise, standalone table with key column and unpacked values only
    """
    if isinstance(expand_original, int) and expand_original is not True:
        lengths = {
            len(row[column]) for row in self
            if isinstance(row[column], (dict, list))
        }
        max_len = sorted(lengths, reverse=True)[0]
        if max_len > expand_original:
            expand_original = False

    if expand_original:
        # Include all columns and filter out other non-dict types in table_list
        table = self
        table_list = table.select_rows(lambda row: isinstance(row[column], list))
    else:
        # Otherwise, include only key and column, but keep all non-dict types in table_list
        table = self.cut(key, column)
        table_list = table.select_rows(lambda row: not isinstance(row[column], dict))

    # All the columns other than column to ignore while melting
    ignore_cols = table.columns
    ignore_cols.remove(column)

    # Unpack lists as separate columns
    table_list.unpack_list(column, replace=True)

    # Rename the columns to retain only the number
    for col in table_list.columns:
        if f'{column}_' in col:
            table_list.rename_column(col, col.replace(f'{column}_', ""))

    # Filter dicts and unpack as separate columns
    table_dict = table.select_rows(lambda row: isinstance(row[column], dict))
    table_dict.unpack_dict(column, prepend=False)

    from parsons.etl.table import Table

    # Use melt to pivot both sets of columns into their own Tables and clean out None values
    melted_list = Table(petl.melt(table_list.table, ignore_cols))
    melted_dict = Table(petl.melt(table_dict.table, ignore_cols))
    melted_list.remove_null_rows('value')
    melted_dict.remove_null_rows('value')
    melted_list.rename_column('variable', column)
    melted_dict.rename_column('variable', column)

    # Combine the list and dict Tables
    melted_list.concat(melted_dict)

    import hashlib

    if expand_original:
        # Add unpacked rows to the original table (minus packed rows)
        orig = self.select_rows(lambda row: not isinstance(row[column], (dict, list)))
        orig.concat(melted_list)
        # Add unique id column by hashing all the other fields
        if 'uid' not in self.columns:
            orig.add_column('uid', lambda row: hashlib.md5(
                str.encode(''.join([str(x) for x in row]))).hexdigest())
            orig.move_column('uid', 0)
        # Rename value column in case this is done again to this Table
        orig.rename_column('value', f'{column}_value')
        # Keep column next to column_value
        orig.move_column(column, -1)
        output = orig
    else:
        orig = self.remove_column(column)
        # Add unique id column by hashing all the other fields
        melted_list.add_column('uid', lambda row: hashlib.md5(
            str.encode(''.join([str(x) for x in row]))).hexdigest())
        melted_list.move_column('uid', 0)
        output = melted_list

    self = orig
    return output
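# A minimal usage sketch for unpack_nested_columns_as_rows. The rows below are
# hypothetical; it assumes the parsons.Table constructor accepting a list of
# dicts, as in the long_table example above.
from parsons import Table

tbl = Table([
    {'id': 1, 'extra': [10, 20]},       # list value -> two unpacked rows
    {'id': 2, 'extra': {'score': 5}},   # dict value -> one unpacked row
])

# Standalone result: one row per unpacked value, keyed on 'id',
# with a 'uid' hash column prepended.
unpacked = tbl.unpack_nested_columns_as_rows('extra', key='id')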
def dimension_values():
    connection = psycopg2.connect(dbname='voyager',
                                  user='******',
                                  password='******',
                                  host='172.16.0.45')
    engine = create_engine('postgresql://*****:*****@172.16.0.45:5432/voyager')

    com = 'select id as id_component, name as component from dim_com'
    table_com = etl.fromdb(connection, com)
    loc = 'select id as id_location, name as name from dim_loc'
    table_loc = etl.fromdb(connection, loc)
    tim = 'select id as id_time, time as timestamp from dim_time'
    table_time = etl.fromdb(connection, tim)
    print(table_com)
    print(table_loc)
    print(table_time)

    for ran in range(0, 65424, 1000):
        sql = "select * from KNMI_station_data kk " \
              "RIGHT JOIN weatherstations w ON " \
              " CAST (kk.weather_station_id AS INTEGER) = CAST (w.station_number AS INTEGER) " \
              "WHERE w.station_number NOT LIKE \'NL%%\' AND date > 20190901 LIMIT 1000 OFFSET %s" % ran
        print(sql)
        table = etl.fromdb(connection, sql)
        print('knmi')
        print(table)
        table.log_progress()

        table = etl.convert(table, 'date', str)
        table = etl.convert(table, 'hour', str)
        table = etl.convert(table, 'temperature', int)
        table = etl.convert(table, 'temperature_dew', int)
        table = etl.convert(table, 'temperature_min', int)
        table = etl.convert(table, 'wind_speed_avg', int)
        table = etl.convert(table, 'wind_speed', int)
        table = etl.convert(table, 'wind_speed_max', int)
        table = etl.convert(table, 'temperature', lambda v: v / 10)
        table = etl.convert(table, 'temperature_dew', lambda v: v / 10)
        table = etl.convert(table, 'temperature_min', lambda v: v / 10)
        table = etl.convert(table, 'wind_speed_avg', lambda v: v / 10)
        table = etl.convert(table, 'wind_speed', lambda v: v / 10)
        table = etl.convert(table, 'wind_speed_max', lambda v: v / 10)

        df = pd.DataFrame(table)
        df.columns = df.iloc[0]
        df = df.drop(0)
        df['timestamp'] = df['date'] + df['hour']
        df['weather_station_id'] = df['weather_station_id'].astype(str)
        df['timestamp'] = df['timestamp'].apply(custom_to_datetime)
        df['timestamp'] = df['timestamp'].astype(str)
        df = df.drop(columns=['date', 'hour'], axis=1)

        final_knmi_table = etl.fromdataframe(df)
        final_knmi_table = etl.melt(final_knmi_table,
                                    key=[
                                        'weather_station_id', 'timestamp', 'id', 'latitude',
                                        'longitude', 'name', 'station_number',
                                        'data_source_id', 'altitude'
                                    ])
        final_knmi_table = etl.rename(final_knmi_table, 'variable', 'component')
        print(final_knmi_table)

        final_knmi_table2 = etl.join(final_knmi_table, table_com, key='component')
        final_knmi_table2 = etl.join(final_knmi_table2, table_loc, key='name')
        final_knmi_table2 = etl.join(final_knmi_table2, table_time, key='timestamp')
        print('dos')
        print(final_knmi_table2)

        df = pd.DataFrame(final_knmi_table2)
        df.columns = df.iloc[0]
        df = df.drop(0)
        fact_source = df[[
            'id_component', 'id_location', 'id_time', 'value',
            'data_source_id', 'weather_station_id'
        ]]
        print(fact_source)
        fact_source.to_sql('fact_source',
                           engine,
                           if_exists='append',
                           index=False,
                           method='multi')

    for rn in range(0, 1148, 1000):
        print('lmn')
        final_lmn_table = etl.fromdb(
            connection,
            "select ld.id, ld.station_number, ld.value, ld.timestamp, ls.name as component, "
            "ws.id as lid, ws.latitude, ws.longitude, ws.data_source_id, ws.altitude, ws.name as name"
            " from luchtmeetnet_data ld "
            "right join luchtmeetnet_sensors ls on ld.formula = ls.formula "
            " join weatherstations ws on ld.station_number = ws.station_number "
            "where ws.station_number like \'NL%%\' AND timestamp > '2019-09-01' "
            "LIMIT 1000 OFFSET %s" % rn)
        final_lmn_table = etl.rename(final_lmn_table,
                                     {'station_number': 'weather_station_id'})
        final_lmn_table = etl.movefield(final_lmn_table, 'timestamp', 1)
        # print(final_lmn_table)
        # print(table_com)

        final_lmn_table2 = etl.join(final_lmn_table, table_com, key='component')
        # print(final_lmn_table2)
        final_lmn_table2 = etl.join(final_lmn_table2, table_loc, key='name')
        # print(final_lmn_table2)

        df = pd.DataFrame(final_lmn_table2)
        df.columns = df.iloc[0]
        df = df.drop(0)
        df['timestamp'] = df['timestamp'].str[:-6]
        # print(df)

        final_lmn_table2 = etl.fromdataframe(df)
        final_lmn_table2 = etl.join(final_lmn_table2, table_time, key='timestamp')
        print(final_lmn_table2)

        final_lmn_df = pd.DataFrame(final_lmn_table2)
        final_lmn_df.columns = final_lmn_df.iloc[0]
        final_lmn_df = final_lmn_df.drop(0)
        fact_source = final_lmn_df[[
            'id_component', 'id_location', 'id_time', 'value',
            'data_source_id', 'weather_station_id'
        ]]
        print(fact_source)
        fact_source.to_sql('fact_source',
                           engine,
                           if_exists='append',
                           index=False,
                           method='multi')
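# Sketch of the reshaping applied to the KNMI rows above (station id, timestamp
# and measurement values below are hypothetical): melt keeps the key columns
# and turns every remaining measurement column into a ('variable', 'value')
# pair, which is then renamed to ('component', 'value') so it can be joined
# against dim_com.
import petl as etl

knmi_wide = [['weather_station_id', 'timestamp', 'temperature', 'wind_speed'],
             ['260', '2019-09-01 01:00:00', 14.2, 3.1]]
knmi_long = etl.rename(
    etl.melt(knmi_wide, key=['weather_station_id', 'timestamp']),
    'variable', 'component')
# knmi_long rows:
# ('260', '2019-09-01 01:00:00', 'temperature', 14.2)
# ('260', '2019-09-01 01:00:00', 'wind_speed', 3.1)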
from __future__ import absolute_import, print_function, division


# melt()
########

import petl as etl

table1 = [["id", "gender", "age"],
          [1, "F", 12],
          [2, "M", 17],
          [3, "M", 16]]
table2 = etl.melt(table1, "id")
table2.lookall()
# compound keys are supported
table3 = [["id", "time", "height", "weight"],
          [1, 11, 66.4, 12.2],
          [2, 16, 53.2, 17.3],
          [3, 12, 34.5, 9.4]]
table4 = etl.melt(table3, key=["id", "time"])
table4.lookall()
# a subset of variable fields can be selected
table5 = etl.melt(table3, key=["id", "time"], variables=["height"])
table5.lookall()


# recast()
##########

import petl as etl

table1 = [
    ["id", "variable", "value"],
    [3, "age", 16],
    [1, "gender", "F"],
    [2, "gender", "M"],
    [2, "age", 17],
def procesar_fuente(path, nombre):
    try:
        # We process the confirmed cases first
        tabla = etl.fromcsv(path)

        # Rename the headers
        tabla = etl.rename(tabla, {'Country/Region': 'Country'})

        # Adjust the data types.
        # From column 5 onwards the type is integer (the number of people/cases).
        # We also take the chance to change the header date format from 1/23/20 to 2020-01-23.
        headers = etl.fieldnames(tabla)
        i = 0
        for header in headers:
            if i >= 4:
                tabla = etl.convert(tabla, header, int)  # fix the data type
                fecha = datetime.datetime.strptime(header, '%m/%d/%y')  # compute the date in the correct format
                tabla = etl.rename(tabla, header, fecha.strftime('%Y-%m-%d'))
            i = i + 1

        # Drop the Province/State, Lat and Lon columns, which we will not use
        tabla = etl.cutout(tabla, 0, 2, 3)

        # Fix some country names so we can later assign them a region/continent
        tabla = etl.convert(tabla, 'Country', 'replace', 'Congo (Brazzaville)', 'Congo')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Congo (Kinshasa)',
                            'Democratic Republic of the Congo')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Cote d\'Ivoire', 'Ivory Coast')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Korea, South', 'South Korea')
        tabla = etl.convert(tabla, 'Country', 'replace', 'West Bank and Gaza', 'Palestine')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Burma', 'Myanmar')
        tabla = etl.convert(tabla, 'Country', 'replace', 'US', 'USA')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Taiwan*', 'Taiwan')

        # Group and accumulate the results by country
        df_confirmed = etl.todataframe(tabla)
        df = df_confirmed.groupby(['Country']).sum()
        tabla = etl.fromdataframe(df, include_index=True)

        # Rename the Country field again
        tabla = etl.rename(tabla, {'index': 'Country'})

        # Melt the date columns into rows and rename the new columns
        tabla = etl.melt(tabla, 'Country')
        tabla = etl.rename(tabla, {'variable': 'Date'})
        tabla = etl.rename(tabla, {'value': 'Cases'})

        # Add the continent so we can group by it later
        tabla = etl.addfield(tabla, 'Continent',
                             lambda rec: get_continent_code(rec['Country']))

        # And once again make sure the fields have the data types they should have
        tabla = etl.convert(tabla, 'Cases', int)
        tabla = etl.convert(tabla, 'Date',
                            lambda v: datetime.datetime.strptime(v, '%Y-%m-%d'))

        # Finally, load the table into the data repository
        conn = pymysql.connect(password='******', database='covid', user='******')
        conn.cursor().execute('SET SQL_MODE=ANSI_QUOTES')
        etl.todb(tabla, conn, nombre, create=True, drop=True)
        conn.close()
    except:
        print('An error has occurred! ', sys.exc_info()[0])
        raise
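# Worked example of the header-date conversion used inside procesar_fuente:
# a '%m/%d/%y' header such as '1/23/20' becomes an ISO date like '2020-01-23'.
import datetime

fecha = datetime.datetime.strptime('1/23/20', '%m/%d/%y')
print(fecha.strftime('%Y-%m-%d'))  # -> 2020-01-23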