def to_dataframe(self, index=None, exclude=None, columns=None, coerce_float=False):
    """
    Output this table as a Pandas DataFrame.

    `Args:`
        index: str, list
            Field of array to use as the index, alternately a specific set
            of input labels to use.
        exclude: list
            Columns or fields to exclude.
        columns: list
            Column names to use. If the passed data do not have names
            associated with them, this argument provides names for the
            columns. Otherwise this argument indicates the order of the
            columns in the result (any names not found in the data will
            become all-NA columns).
        coerce_float: bool
            Attempt to convert values of non-string, non-numeric objects
            to floating point.
    `Returns:`
        dataframe
            Pandas DataFrame object.
    """
    # Collect the pass-through options once, then delegate to petl.
    frame_options = {
        'index': index,
        'exclude': exclude,
        'columns': columns,
        'coerce_float': coerce_float,
    }
    return petl.todataframe(self.table, **frame_options)
def get_delta(source_table, target_table, key='id'):
    """
    Build a petl table of rows that changed between two tables.

    `Args:`
        source_table: petl table
            The authoritative (new) table.
        target_table: petl table
            The existing table to diff against.
        key: str
            Name of the key column shared by both tables (default ``'id'``).
    `Returns:`
        petl table
            Rows newly added to the source plus rows whose values conflict,
            with the conflict resolved in favor of the source value.
    `Raises:`
        ValueError
            If the two tables do not have identical headers.
    """
    source_table_headers = etl.header(source_table)
    target_table_headers = etl.header(target_table)
    if source_table_headers != target_table_headers:
        # ValueError is a subclass of Exception, so existing callers that
        # catch Exception still work.
        raise ValueError('Source table columns do not match target table columns')
    source_ids = etl.cut(source_table, key)
    target_ids = etl.cut(target_table, key)
    # Keys present in source but not in target => newly added rows.
    added_ids_table, _ = etl.diff(source_ids, target_ids)
    # merge() marks cells that differ between the tables as Conflict objects.
    merged_table = etl.merge(source_table, target_table, key=key)
    load_frame = etl.todataframe(
        etl.selectin(target_table, key, etl.values(added_ids_table, key)))
    print(load_frame)
    for row in etl.data(merged_table):
        for i, col in enumerate(row):
            if isinstance(col, etl.transform.reductions.Conflict):
                changes = tuple(col)
                print('For car {}, {} changed from {} to {}'.format(
                    row[0], source_table_headers[i], changes[1], changes[0]))
                row_dict = dict(zip(source_table_headers, list(row)))
                # Resolve the conflict in favor of the first (source) value.
                row_dict[source_table_headers[i]] = changes[0]
                # Wrap each value in a list so pd.DataFrame builds a one-row
                # frame. Use fresh names (field/val) — the original
                # comprehension shadowed the `key` parameter.
                row_dict = {field: [val] for field, val in row_dict.items()}
                print(row_dict)
                df = pd.DataFrame(row_dict)
                # DataFrame.append() was removed in pandas 2.0; pd.concat is
                # the supported equivalent.
                load_frame = pd.concat([load_frame, df], ignore_index=True)
                # Only the first conflict per row is reported (original
                # behavior preserved).
                break
    return etl.fromdataframe(load_frame)
# ('apples', 1, 2.5), # ('oranges', 3, 4.4), # ('pears', 7, .1)] # a = etl.toarray(t, dtype='U4, i2, f4') # print(a) # # array1 = np.array([('apples', 1, 2.5), # ('oranges', 3, 4.4), # ('pears', 7, .1)], dtype='U4, i2, f4') # tb7 = etl.fromarray(array1) # print(tb7) # dataframe(pandas) import pandas as pd records = [('apple', 1, 2.5), ('orange', 3, 4.5), ('pears', 5, 6.5)] df = pd.DataFrame.from_records(records, columns=('foo', 'bar', 'baz')) tb8 = etl.fromdataframe(df) print(tb8) # load data from given table into dataframe table = [('foo', 'bar', 'baz'), ('apple', 1, 2.5), ('orange', 3, 4.5), ('pears', 5, 6.5)] df = etl.todataframe(table) print(df) # excel xls/xlsx # HDFS # oracle
    # NOTE(review): this chunk begins inside a `try:` opened above (not visible here).
    conn_target = create_engine('postgresql://username:hostname:5432/password')
    # NOTE(review): the URL above looks malformed — SQLAlchemy expects
    # postgresql://user:password@host:port/dbname, but here 'hostname' sits in
    # the password slot and 'password' in the database slot. Confirm against
    # the real deployment configuration.
except:
    # NOTE(review): bare except catches everything, including SystemExit and
    # KeyboardInterrupt; consider narrowing to the driver's exception type.
    logger.error(
        "ERROR: Unexpected error: Could not connect to PostgreSQL instance.")
    sys.exit()
logger.info("SUCCESS: Connection to RDS PostgreSQL instance succeeded")

# Source: today's per-company POS totals (qty sold, gross total), ordered by
# gross total descending. Dates are shifted to 'GMT +7' before comparison.
table = etl.fromdb(
    conn,
    """select res_company.name, sum(product_qty) as qty, sum(price_total) as total from report_pos_order inner join res_company on res_company.id = report_pos_order.company_id where date(report_pos_order.date AT TIME ZONE 'GMT +7') = current_date group by res_company.name order by sum(price_total) desc""")

# Transformation: group by company name, summing qty and total.
# NOTE(review): the query already aggregates per company, so this second
# aggregation is presumably a safeguard — verify it is still needed.
aggregation = OrderedDict()
aggregation['qty'] = 'qty', sum
aggregation['total'] = 'total', sum
table1 = etl.aggregate(table, 'name', aggregation)
dfsum = etl.todataframe(table1)

# Target: replace the 'GMV Warung' table in the target PostgreSQL database.
dfsum.to_sql('GMV Warung', conn_target, if_exists='replace', index=None)
    'Congo (Kinshasa)', 'Democratic Republic of the Congo')
# NOTE(review): the line above is the argument tail of an etl.convert(...) call
# whose opening lies above this chunk.
# Normalize country names so they can be mapped to a continent later.
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace',
                          'Cote d\'Ivoire', 'Ivory Coast')
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace',
                          'Korea, South', 'South Korea')
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace',
                          'West Bank and Gaza', 'Palestine')
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace', 'Burma', 'Myanmar')
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace', 'US', 'USA')
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace', 'Taiwan*', 'Taiwan')

# Group and accumulate the results by country (round-trip through pandas).
df_confirmed = etl.todataframe(t_confirmed)
df = df_confirmed.groupby(['Country']).sum()
t_confirmed = etl.fromdataframe(df, include_index=True)

# Rename the index column back to Country.
t_confirmed = etl.rename(t_confirmed, {'index': 'Country'})

# Melt the date columns into data rows and rename the new columns.
t_confirmed = etl.melt(t_confirmed, 'Country')
t_confirmed = etl.rename(t_confirmed, {'variable': 'Date'})
t_confirmed = etl.rename(t_confirmed, {'value': 'Cases'})

# Add the continent so results can be grouped by region.
t_confirmed = etl.addfield(t_confirmed, 'Continent',
                           lambda rec: get_continent_code(rec['Country']))
def procesar_fuente(path, nombre):
    """
    Process one Johns Hopkins-style COVID CSV and load it into MySQL.

    `Args:`
        path: str
            Path/URL of the source CSV (columns: Province/State,
            Country/Region, Lat, Long, then one column per date).
        nombre: str
            Name of the destination table in the `covid` database.
    `Raises:`
        Re-raises any exception after logging it.
    """
    try:
        tabla = etl.fromcsv(path)
        tabla = etl.rename(tabla, {'Country/Region': 'Country'})
        # From column 5 onward the values are case counts (int) and the
        # headers are dates in m/d/yy format; convert the counts and
        # normalize the headers to YYYY-MM-DD. enumerate replaces the
        # original hand-maintained `i` counter.
        for i, header in enumerate(etl.fieldnames(tabla)):
            if i >= 4:
                tabla = etl.convert(tabla, header, int)
                fecha = datetime.datetime.strptime(header, '%m/%d/%y')
                tabla = etl.rename(tabla, header, fecha.strftime('%Y-%m-%d'))
        # Drop Province/State, Lat and Long — not used downstream.
        tabla = etl.cutout(tabla, 0, 2, 3)
        # Normalize country names so they can be mapped to a continent
        # later; a data-driven loop replaces eight copy-pasted calls.
        replacements = (
            ('Congo (Brazzaville)', 'Congo'),
            ('Congo (Kinshasa)', 'Democratic Republic of the Congo'),
            ("Cote d'Ivoire", 'Ivory Coast'),
            ('Korea, South', 'South Korea'),
            ('West Bank and Gaza', 'Palestine'),
            ('Burma', 'Myanmar'),
            ('US', 'USA'),
            ('Taiwan*', 'Taiwan'),
        )
        for original, normalizado in replacements:
            tabla = etl.convert(tabla, 'Country', 'replace',
                                original, normalizado)
        # Group and accumulate the results by country (pandas round-trip).
        df_confirmed = etl.todataframe(tabla)
        df = df_confirmed.groupby(['Country']).sum()
        tabla = etl.fromdataframe(df, include_index=True)
        # Rename the index column back to Country.
        tabla = etl.rename(tabla, {'index': 'Country'})
        # Melt the date columns into data rows: Country | Date | Cases.
        tabla = etl.melt(tabla, 'Country')
        tabla = etl.rename(tabla, {'variable': 'Date'})
        tabla = etl.rename(tabla, {'value': 'Cases'})
        # Add the continent so results can be grouped by region.
        tabla = etl.addfield(tabla, 'Continent',
                             lambda rec: get_continent_code(rec['Country']))
        # Re-assert the final data types.
        tabla = etl.convert(tabla, 'Cases', int)
        tabla = etl.convert(tabla, 'Date',
                            lambda v: datetime.datetime.strptime(v, '%Y-%m-%d'))
        # Finally, load the table into the data repository. try/finally
        # guarantees the connection is closed even if todb() raises (the
        # original leaked it on error).
        conn = pymysql.connect(password='******', database='covid',
                               user='******')
        try:
            conn.cursor().execute('SET SQL_MODE=ANSI_QUOTES')
            etl.todb(tabla, conn, nombre, create=True, drop=True)
        finally:
            conn.close()
    except Exception:
        # except Exception (not bare except) so SystemExit/KeyboardInterrupt
        # propagate untouched; the error is still logged and re-raised.
        print('Se ha presentado un error! ', sys.exc_info()[0])
        raise
from __future__ import division, print_function, absolute_import # todataframe() ############### import petl as etl table = [("foo", "bar", "baz"), ("apples", 1, 2.5), ("oranges", 3, 4.4), ("pears", 7, 0.1)] df = etl.todataframe(table) df # fromdataframe() ################# import petl as etl import pandas as pd records = [("apples", 1, 2.5), ("oranges", 3, 4.4), ("pears", 7, 0.1)] df = pd.DataFrame.from_records(records, columns=("foo", "bar", "baz")) table = etl.fromdataframe(df) table