def display_store():
    response.headers['Access-Control-Allow-Origin'] = '*'
    response.headers['Content-type'] = 'application/json'
    table = (
        etl
        .fromcsv('store_locations.csv')
        .convert('Lat', float)
        .convert('Lon', float)
    )
    store_id = request.query.postcode

    # Select rows matching the requested postcode
    table1 = etl.select(table, "{Postcode} == '" + store_id + "'")

    # Fall back to the default postcode of 2000 if nothing matched
    if etl.nrows(table1) == 0:
        defaultPostCode = "2000"
        table1 = etl.select(table, "{Postcode} == '" + defaultPostCode + "'")

    # Keep only the fields we need and return the first record
    table2 = etl.cut(table1, 'Name', 'Lat', 'Lon').dicts()[0]
    return table2
def select_execute(c, selector, **kwargs):
    r = c()
    if 'addfields' in kwargs:
        r = etl.addfields(r, kwargs['addfields'])
    if selector:
        r = etl.select(r, selector)
    return r
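# A minimal usage sketch for select_execute, assuming petl is imported as
# etl; the table factory, field names and selector below are illustrative,
# not part of the original code:
import petl as etl

def load_people():
    # hypothetical callable producing a petl table
    return etl.wrap([['name', 'age'], ['ann', 42], ['bob', 17]])

# adds a derived field, then filters with a petl expression string
adults = select_execute(
    load_people,
    "{age} >= 18",
    addfields=[('is_adult', lambda rec: rec['age'] >= 18)],
)
print(etl.look(adults))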
def download_file(win, value):
    from tkinter import filedialog
    import requests

    # instantiate the "save as" dialog
    win.filename = filedialog.asksaveasfilename(
        initialdir="/",
        title="Select file",
        filetypes=(("csv files", "*.csv"),
                   ("Excel files", "*.xls"),
                   ("all files", "*.*")))

    # read the entered date and store it
    selected_date = value
    dir = win.filename + ".csv"
    # data = requests.get('https://www.alphavantage.co/query?function=TIME_SERIES_DAILY_ADJUSTED&symbol=MSFT&apikey=demo&datatype=csv')
    # with open(dir, 'w') as f:
    #     writer = csv.writer(f)
    #     reader = csv.reader(data.text.splitlines())
    #     for row in reader:
    #         writer.writerow(row)

    # Load the table from the results file; the timestamp column
    # corresponds to the given date
    table1 = etl.fromcsv(r'C:\avenguard\files\results.csv')
    table2 = etl.rowlenselect(table1, 12)

    # Keep only the columns we need
    table2 = etl.cut(table2, 'plate', 'timestamp')
    # table2 = etl.tail(table2, 15)
    table2 = etl.select(
        table2, lambda rec: rec.timestamp.split("_")[0] == selected_date)

    # Save to a new file in xlsx (Excel) format
    # etl.tocsv(table2, dir)
    etl.toxlsx(table2, win.filename + '.xlsx')
def _petl_transform(self, record_set):
    if "transform" in self.task:
        transform = self.task["transform"]
        if "convert" in transform:
            conversions = {}
            for field, func in transform["convert"]:
                conversions[field] = func
            record_set = etl.convert(record_set, conversions)
        if "filter" in transform:
            record_set = etl.select(record_set, transform["filter"])
        if "remove" in transform:
            cuts = []
            for field in transform["remove"]:
                cuts.append(field)
            record_set = etl.cutout(record_set, *cuts)
        if "rename" in transform:
            names = {}
            for old, new_one in transform["rename"]:
                names[old] = new_one
            record_set = etl.rename(record_set, names)
    return record_set
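# For reference, a task dictionary of the shape this helper appears to
# expect. The key names come from the code above; the concrete fields and
# callables are illustrative assumptions:
task = {
    "transform": {
        # (field, conversion) pairs passed to etl.convert
        "convert": [("age", int), ("name", str.lower)],
        # a petl expression string or predicate for etl.select
        "filter": "{age} >= 18",
        # fields dropped via etl.cutout
        "remove": ["internal_id"],
        # (old, new) pairs passed to etl.rename
        "rename": [("name", "full_name")],
    }
}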
def select_rows(self, *filters):
    """
    Select specific rows from a Parsons table based on the passed
    filters.

    Example filters:

    .. code-block:: python

        tbl = Table([['foo', 'bar', 'baz'],
                     ['c', 4, 9.3],
                     ['a', 2, 88.2],
                     ['b', 1, 23.3],])

        # You can structure the filter in multiple ways

        # Lambda Function
        tbl2 = tbl.select_rows(lambda row: row.foo == 'a' and row.baz > 88.1)
        tbl2
        >>> {'foo': 'a', 'bar': 2, 'baz': 88.2}

        # Expression String
        tbl3 = tbl.select_rows("{foo} == 'a' and {baz} > 88.1")
        tbl3
        >>> {'foo': 'a', 'bar': 2, 'baz': 88.2}

    `Args:`
        \*filters: function or str

    `Returns:`
        A new Parsons table containing the selected rows
    """  # noqa: W605
    from parsons.etl.table import Table

    return Table(petl.select(self.table, *filters))
def anyServices():
    # requested query
    Postcode = request.query.Postcode

    # Converting the Postcode value to a string
    Postcode = str(Postcode)

    # reading the csv file
    csv = petl.fromcsv(file)

    # json content type declaration
    response.headers['Content-type'] = 'application/json'
    response.headers['Access-Control-Allow-Origin'] = '*'

    # select the data matching the requested postcode
    dataSelect = petl.select(csv, "{Postcode} == '" + Postcode + "'")
    if petl.nrows(dataSelect) > 0:
        # cut out the required columns
        jsonData = petl.cut(dataSelect, 'Service', 'Suburb')
        # convert the dictionary data into json and return it
        return json.JSONEncoder().encode(list(petl.dicts(jsonData)))
    return json.JSONEncoder().encode('Unable to find this Service.')
def main_loop():
    # requested query
    Service = request.query.Service

    # Converting the Service value to a string
    Service = str(Service)
    csv = petl.fromcsv(file)
    response.headers['Content-type'] = 'application/json'
    response.headers['Access-Control-Allow-Origin'] = '*'

    # requesting "0" returns the distinct list of clinics offering
    # any service
    if Service == "0":
        anyServices = petl.unique(csv, key='Name')
        jsonData = petl.cut(anyServices, 'ClinicID', 'Suburb', 'Lat', 'Lon')
        return json.JSONEncoder().encode(list(petl.dicts(jsonData)))

    # select the data matching the requested service
    dataSelect = petl.select(csv, "{Service} == '" + Service + "'")
    if petl.nrows(dataSelect) > 0:
        # cut out the required columns
        jsonData = petl.cut(dataSelect, 'ClinicID', 'Suburb', 'Lat', 'Lon')
        # convert the dictionary data into json and return it
        return json.JSONEncoder().encode(list(petl.dicts(jsonData)))
    return json.JSONEncoder().encode('Please Enter a Service.')
def main_loop():
    # requested query
    inputServiceID = request.query.serviceid
    csv = pt.fromcsv('clinicservicelocations.csv')
    response.headers['Content-type'] = 'application/json'
    response.headers['Access-Control-Allow-Origin'] = '*'

    # requesting "0" returns the distinct list of clinics offering
    # any service
    if inputServiceID == "0":
        anyServices = pt.unique(csv, key='Name')
        jsonData = pt.cut(anyServices, 'Name', 'Service', 'Suburb', 'State',
                          'Email', 'Lat', 'Lon')
        return json.JSONEncoder().encode(list(pt.dicts(jsonData)))

    # select the data matching the requested service id
    dataSelect = pt.select(
        csv, "{ServiceID} == '" + str(inputServiceID) + "'")
    if pt.nrows(dataSelect) > 0:
        # cut out the required columns
        jsonData = pt.cut(dataSelect, 'Name', 'Service', 'Suburb', 'State',
                          'Email', 'Lat', 'Lon')
        # convert the dictionary data into json and return it
        return json.JSONEncoder().encode(list(pt.dicts(jsonData)))
    return json.JSONEncoder().encode('Unable to find this id.')
def get_location():
    setHeaders()
    postCode = request.query.postcode
    selectedRow = etl.select(storeLocationsTable,
                             "{Postcode} == '" + postCode + "'")

    # if exactly one record is not returned, it didn't find a result;
    # fall back to the default postcode
    if etl.nrows(selectedRow) != 1:
        defaultPostCode = "2000"
        selectedRow = etl.select(storeLocationsTable,
                                 "{Postcode} == '" + defaultPostCode + "'")

    storeData = {
        "name": selectedRow[1][0],
        "lat": selectedRow[1][2],
        "lon": selectedRow[1][3]
    }
    return storeData
def clean_and_separate(table):
    """Do some cleanup of TABLE and split into individual and business
    tables.  TABLE is a petl table."""

    # Rename column to expand name
    table = etl.rename(table, {'WVRSTATE': 'waiverstate'})

    # More conversions
    table = etl.convert(
        table,
        {
            'EXCLTYPE': lambda f: f.strip(),  # Trim extra spaces
            'EXCLDATE': munge_date,           # Arrange date for sqlite
            'REINDATE': munge_date,           # Arrange date for sqlite
            'WAIVERDATE': munge_date          # Arrange date for sqlite
        })

    # Separate into two tables, as this is actually two different data sets
    individual = etl.select(table, "{LASTNAME} != '' and {FIRSTNAME} != ''")
    business = etl.select(table, "{LASTNAME} == '' and {FIRSTNAME} == ''")

    # Sanity check: make sure we split the rows without dupes or
    # missing any.
    if etl.nrows(business) + etl.nrows(individual) != etl.nrows(table):
        fatal(
            "Separating business and individual exclusions came up with the wrong number of rows!"
        )

    # Remove unused columns
    individual = etl.cutout(individual, "BUSNAME")
    business = etl.cutout(business, "LASTNAME", "FIRSTNAME", "MIDNAME", "DOB")

    # Do some cleanup conversions on individual data
    individual = etl.convert(
        individual,
        {
            'DOB': munge_date,
            'MIDNAME': lambda f: f if f != " " else ""  # no spaces as middle names
        })

    return individual, business
def select_drain_issues(inp, assignee_id: int, drain_cf_id: int):
    def is_drain(fields: list) -> bool:
        return any(
            map(
                lambda field: field['id'] == drain_cf_id and
                field['value'] == '1',
                fields))

    # filter on the custom field first, as it is more selective
    inp = etl.select(inp, 'custom_fields', is_drain)
    return etl.selecteq(inp, 'assigned_to_id', assignee_id)
def xref_symbol_reports():
    symbol_reports = [
        f for f in os.listdir()
        if re.match(r'OCLC Datasync Unresolved.*\.csv', f)
    ]

    today = str(date.today())

    for report in symbol_reports:
        symbol_split = re.split('^.*processing.(M[A-Z]{2}).*$', report)
        symbol = symbol_split[1]
        xlsx_outfile = symbol + '_datasync_unresolved_' + today + '.xlsx'
        xls_outfile = symbol + '_datasync_unresolved_' + today + '.xls'
        txt_outfile = symbol + '_staging_OCNs_' + today + '.txt'

        symbol_table_raw = etl.fromcsv(report, encoding='utf-8')
        # strip the BOM from the first header
        symbol_table = etl.rename(symbol_table_raw, '\ufeffMMS Id', 'MMS ID')
        symbol_table2 = etl.select(symbol_table, "{MMS ID} is not None")
        symbol_table_sorted = etl.sort(symbol_table2, 'MMS ID')

        xref_table = etl.fromcsv('unresxref.csv')
        xref_table2 = etl.select(xref_table, "{MMS ID} is not None")
        xref_table_sorted = etl.sort(xref_table2, 'MMS ID')

        symbol_xref_table = etl.join(symbol_table_sorted,
                                     xref_table_sorted,
                                     presorted=True,
                                     lkey="MMS ID",
                                     rkey="MMS ID")
        try:
            etl.toxlsx(symbol_xref_table, xlsx_outfile, encoding='utf-8')
        except TypeError:
            etl.toxls(symbol_xref_table, xls_outfile, 'Sheet1',
                      encoding='utf-8')

        staging_ocns_table = etl.cut(symbol_xref_table, 'Staging OCN')
        template = '{Staging OCN}\n'
        etl.totext(staging_ocns_table, txt_outfile, template=template)
def _medical_limits(id, source_db):
    """
    get the member limits
    """
    sql = ("SELECT dispensary_id, daily_purchase_limit, visit_purchase_limit, "
           "daily_visit_limit, two_week_purchase_limit "
           "FROM red_flags "
           "WHERE dispensary_id={0}").format(id)

    data = etl.fromdb(source_db, sql)
    limits = etl.select(data, lambda rec: rec.dispensary_id == id)
    return etl.dicts(limits)
def _get_taxes(id, source_db):
    """
    get the dispensary taxes settings for each dispensary_id
    """
    sql = ("SELECT DISTINCT dispensary_id, amount, name "
           "FROM taxes "
           "WHERE dispensary_id={0}").format(id)

    data = etl.fromdb(source_db, sql)
    try:
        lookup_taxes = etl.select(data, lambda rec: rec.dispensary_id == id)
        return etl.dicts(lookup_taxes)
    except KeyError:
        return 0
def valuecounts(table, col_name):
    return_dict = {}
    unreported_count = 0

    nrows = petl.nrows(table)
    non_blanks = petl.select(
        table, '{' + quote_single_quote(col_name) + "} != ''")
    num_blanks = nrows - petl.nrows(non_blanks)
    counts_table = petl.valuecounts(non_blanks, col_name)
    for row in petl.records(counts_table):
        # report values that account for more than 1% of the rows;
        # lump the rest together under '<other>'
        if row['frequency'] > 0.01:
            return_dict[row[col_name]] = row['count']
        else:
            unreported_count += row['count']
    return_dict['<other>'] = unreported_count
    return_dict['<blank>'] = num_blanks
    return return_dict
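# A sketch of how this helper might be called. The quote_single_quote
# helper is assumed from the surrounding module; the definition and the
# table below are illustrative:
import petl

def quote_single_quote(name):
    # hypothetical helper matching the usage above: escapes single
    # quotes so the column name is safe inside a petl expression string
    return name.replace("'", "\\'")

table = petl.wrap([['colour'], ['red'], ['red'], ['blue'], ['']])
print(valuecounts(table, 'colour'))
# e.g. {'red': 2, 'blue': 1, '<other>': 0, '<blank>': 1}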
from collections import OrderedDict

from petl import cutout, fieldmap, fromcsv, merge, select, sort, tocsv


def dataPreProcessing(fileName):
    inputData = fromcsv(fileName)
    table1 = cutout(inputData, 'member_id', 'grade', 'sub_grade', 'emp_title',
                    'url', 'desc', 'title', 'accept_d', 'exp_d', 'list_d',
                    'issue_d', 'purpose', 'addr_city', 'addr_state',
                    'earliest_cr_line', 'last_pymnt_d', 'next_pymnt_d',
                    'last_credit_pull_d')
    table2 = select(
        table1,
        lambda i: i['term'] == ' 36 months' and i['loan_status'] != "")

    labelMapping = OrderedDict()
    labelMapping['loan_status'] = 'loan_status'
    labelMapping['id'] = 'id'
    table6 = fieldmap(table2, labelMapping)
    table8 = sort(table6, 'id')
    table10 = cutout(table8, 'id')

    mappings = OrderedDict()
    mappings['id'] = 'id'
    mappings['home_ownership'] = 'ownership', {
        'MORTGAGE': '-1',
        'RENT': '0',
        'OWN': '1'
    }
    mappings['emp_length'] = 'empLength', {'n/a': 0}
    mappings['is_inc_v'] = 'verificationStatus', {
        'Source Verified': 1,
        'Verified': 0,
        'Not Verified': -1
    }
    mappings['pymnt_plan'] = 'paymentPlan', {'n': 0, 'y': 1}
    mappings['initial_list_status'] = 'listStatus', {'f': 0, 'w': 1}
    table3 = fieldmap(table2, mappings)
    table4 = cutout(table2, 'home_ownership', 'is_inc_v', 'pymnt_plan',
                    'initial_list_status', 'term', 'loan_status')
    table5 = merge(table3, table4, key='id')
    table7 = sort(table5, 'id')
    table9 = cutout(table7, 'id')

    featureFileCsv = tocsv(table9, 'featureFileCsv.csv')
    labelsFileCsv = tocsv(table10, 'labelsFileCsv.csv')
    return featureFileCsv, labelsFileCsv
def xls_tidy(xls, qvalue):
    d = etl.fromtsv(xls)
    sd = etl.select(d, lambda x: float(x.PepQValue) <= float(qvalue))
    psmsummary = sd

    ssd = etl.cut(sd, 'Peptide', 'Protein', 'PepQValue')
    # remove the modification info from the peptide
    ssd = etl.transform.regex.sub(ssd, 'Peptide', r'^[\w-]\.(.+)\.[\w-]$',
                                  r'\1')
    ssd = etl.transform.regex.sub(ssd, 'Peptide', r'[\d\.\+]+', r'')

    aggregation = OrderedDict()
    aggregation['SpecCount'] = len
    cssd = etl.aggregate(ssd, 'Peptide', aggregation)

    fssd = etl.groupselectfirst(ssd, key=('Peptide', 'Protein', 'PepQValue'))

    aggregation = OrderedDict()
    aggregation['Protein'] = 'Protein', etl.strjoin(';')
    aggregation['PepQValue'] = 'PepQValue', etl.strjoin(';')
    assd = etl.aggregate(fssd, 'Peptide', aggregation)

    pepsummary = etl.join(assd, cssd, key='Peptide')
    return (psmsummary, pepsummary)
from __future__ import absolute_import, print_function, division

import petl as etl

table = [['foo', 'bar'],
         ['a', 1],
         ['b', None]]

# raises an exception under Python 3 (None cannot be compared with int)
etl.select(table, 'bar', lambda v: v > 0)

# no error under Python 3
etl.selectgt(table, 'bar', 0)

# or ...
etl.select(table, 'bar', lambda v: v > etl.Comparable(0))
table = src_table
print('TOTAL SOURCE ROWS = ' + str(etl.nrows(table)))
print('SOURCE HEADERS = ' + str(etl.header(table)))

# UNUSED COLUMNS
if CLEAN_UP:
    table = clean_up(table, 'rcv_nm')
    table = clean_up(table, 'recp_cd')
    table = clean_up(table, 'ins_ind')
    table = clean_up(table, 'geo_ind')
    table = clean_up(table, 'cid')
    table = clean_up(table, 'occ_typ')
    print('TRIMMED HEADERS = ' + str(etl.header(table)))

table = etl.select(table, 'occ_dt', lambda x: x > datetime(2000, 1, 1))
print('ROWS POST YR 2000 = ' + str(etl.nrows(table)))

mine_table = etl.fromcsv('mines.csv', encoding='utf-8')

# handle leading 0's
mine_table = etl.convert(mine_table, 'mine_no', lambda x: str(int(x)))
table = etl.convert(table, 'mine_no', lambda x: str(int(x)))

# MAP mine_no to mine_guid
table = etl.leftjoin(table, mine_table, key='mine_no')
table = clean_up(table, 'mine_no')

# make sure this is 0
if etl.valuecount(table, 'mine_guid', None)[0] > 0:
    print('mine_guid, mine_no pair missing from mines.csv')
    exit(1)
# select
table1 = [['foo', 'bar', 'baz'],
          ['a', 4, 9.3],
          ['a', 2, 88.2],
          ['b', 1, 23.3],
          ['c', 8, 42.0],
          ['d', 7, 100.9],
          ['c', 2]]

from petl import select, look
look(table1)

# the second positional argument can be a function accepting a record (i.e., a
# dictionary representation of a row)
table2 = select(table1, lambda rec: rec['foo'] == 'a' and rec['baz'] > 88.1)
look(table2)

# the second positional argument can also be an expression string, which
# will be converted to a function using expr()
table3 = select(table1, "{foo} == 'a' and {baz} > 88.1")
look(table3)

# the condition can also be applied to a single field
table4 = select(table1, 'foo', lambda v: v == 'a')
look(table4)

# fieldmap
table1 = [['id', 'sex', 'age', 'height', 'weight'],
          [1, 'male', 16, 1.45, 62.0],
          [2, 'female', 19, 1.34, 55.4],
def sales_summary(start_dt=None, end_dt=None, staff_id=None,
                  for_export=False):
    """tally up gross (sale over list) profits

    TODO: tally up net profits (gross profit vs inventory purchase total)

    Keyword Arguments:
        start_dt {[type]} -- datetime for start of query (default: {None})
        end_dt {[type]} -- datetime for end of query (default: {None})

    Returns:
        [dict] -- various types of sales information, stored in a dictionary.
    """

    # retrieve existing tables
    products_records = etl.fromdb(db.engine, 'SELECT * FROM product')
    sales_records = etl.fromdb(db.engine, 'SELECT * FROM sale')
    staff_records = etl.fromdb(db.engine, 'SELECT * FROM staff')

    # filter by start/end date if provided
    if start_dt and end_dt:
        sales_records = etl\
            .selectnotnone(sales_records, 'date')\
            .select(lambda r: r.date > start_dt and r.date <= end_dt)
    elif start_dt and not end_dt:
        sales_records = etl\
            .selectnotnone(sales_records, 'date')\
            .select(lambda r: r.date > start_dt)
    elif end_dt and not start_dt:
        sales_records = etl\
            .selectnotnone(sales_records, 'date')\
            .select(lambda r: r.date <= end_dt)

    # filter by staff id if provided
    if staff_id:
        sales_records = etl.select(sales_records, 'staff_id',
                                   lambda v: v == staff_id)

    # join product info to sales data
    sales_data = etl\
        .join(sales_records, products_records, lkey='product_id', rkey='id')\
        .leftjoin(staff_records, lkey='staff_id', rkey='id')

    # prep joined sales data for tabulation
    sales_data = etl\
        .convert(sales_data, 'date', lambda dt: format_date(dt))\
        .sort('date')\
        .convert('quantity', lambda q: handle_none(q, replace_with=1))\
        .addfield('profit', lambda rec: calculate_profit(rec))\
        .addfield('gross_sales', lambda rec: calculate_gross_sales(rec))

    # tabulate some figures
    gross_sales = 0
    profits = 0
    for sale in etl.dicts(sales_data):
        profits += calculate_profit(sale)
        gross_sales += calculate_gross_sales(sale)

    if for_export:
        return {
            'gross_sales': gross_sales,
            'profits': profits,
            'table': sales_data
        }

    # summarize data into charting-friendly data structures
    chart_count, chart_count_missing_date = etl\
        .fold(sales_data, 'date', operator.add, 'quantity', presorted=True)\
        .rename({'key': 'x', 'value': 'y'})\
        .biselect(lambda rec: rec.x is not None)

    chart_gross, chart_gross_missing_date = etl\
        .fold(sales_data, 'date', operator.add, 'gross_sales', presorted=True)\
        .rename({'key': 'x', 'value': 'y'})\
        .biselect(lambda rec: rec.x is not None)

    chart_profit, chart_profit_missing_date = etl\
        .fold(sales_data, 'date', operator.add, 'profit', presorted=True)\
        .rename({'key': 'x', 'value': 'y'})\
        .biselect(lambda rec: rec.x is not None)

    return {
        'gross_sales': gross_sales,
        'profits': profits,
        'chart_gross': list(etl.dicts(chart_gross)),
        'chart_gross_missing_date': list(etl.dicts(chart_gross_missing_date)),
        'chart_profit': list(etl.dicts(chart_profit)),
        'chart_profit_missing_date': list(etl.dicts(chart_profit_missing_date)),
        'chart_count': list(etl.dicts(chart_count)),
        'chart_count_missing_date': list(etl.dicts(chart_count_missing_date))
    }
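# The summary above leans on four helpers defined elsewhere in the app.
# A minimal sketch of what they might look like -- the names come from
# the code above, but the bodies and field names are assumptions:
def format_date(dt):
    # render datetimes as a date string for grouping (assumed format)
    return dt.strftime('%Y-%m-%d') if dt else None

def handle_none(value, replace_with=1):
    # substitute a default for missing values
    return value if value is not None else replace_with

def calculate_gross_sales(rec):
    # sale price times quantity; 'sale_price' is an assumed field name
    return (rec['sale_price'] or 0) * handle_none(rec['quantity'])

def calculate_profit(rec):
    # gross sales minus list price times quantity; 'list_price' assumed
    return (calculate_gross_sales(rec) -
            (rec['list_price'] or 0) * handle_none(rec['quantity']))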
import sys

import petl as etl


def add_bbreflink(rec):
    bid = rec['bbrefID']
    initial = bid[0]
    return ("http://www.baseball-reference.com/players/" + initial + "/" +
            bid + ".shtml")


# Load Master.csv from the Lahman database.
table = etl.fromcsv(sys.argv[1])

# Use US births only
table2 = etl.select(table, lambda rec: rec.birthCountry == 'USA')

# Only use these fields
table3 = etl.cut(table2, 'nameFirst', 'nameLast', 'debut', 'bbrefID',
                 'weight', 'height', 'finalGame', 'birthCity', 'birthState',
                 'birthYear')

# Remove null birth city and birth year
table4 = etl.select(
    table3, lambda rec: rec.birthCity != "" and rec.birthYear != "")

# Add Baseball Reference URL
table5 = etl.addfield(table4, 'baseball_ref_url', add_bbreflink)

# Remove unnecessary bbrefID
table6 = etl.cutout(table5, "bbrefID")

# Load city,state lat/long table.
city = etl.fromcsv(sys.argv[2])

# Only use these fields
# load expense document
try:
    expenses = petl.io.xlsx.fromxlsx('Expenses.xlsx', sheet='Github')
except Exception as e:
    print('could not open expenses.xlsx: ' + str(e))
    sys.exit()

# join tables
expenses = petl.outerjoin(exchangeRates, expenses, key='date')

# fill down missing values
expenses = petl.filldown(expenses, 'rate')

# remove dates with no expenses
expenses = petl.select(expenses, lambda rec: rec.USD is not None)

# add CAD column
expenses = petl.addfield(expenses, 'CAD',
                         lambda rec: decimal.Decimal(rec.USD) * rec.rate)

# initialize database connection
try:
    dbConnection = pymssql.connect(server=destServer, database=destDatabase)
except Exception as e:
    print('could not connect to database: ' + str(e))
    sys.exit()

# populate Expenses database table
try:
def remove_rows(table, list_rows_to_remove):
    global g
    g.list_rows_to_remove = list_rows_to_remove
    # complement=True keeps the rows for which row_remover returns False,
    # i.e. the flagged rows are dropped
    return petl.select(table, row_remover, complement=True)
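# row_remover and g are defined elsewhere in this module; a minimal
# sketch of what they might look like -- the module-level holder and the
# predicate body are assumptions:
import types

import petl

# hypothetical module-level holder used to pass state into the predicate
g = types.SimpleNamespace(list_rows_to_remove=[])

def row_remover(row):
    # flag rows whose first field appears in the removal list;
    # with complement=True these flagged rows are dropped
    return row[0] in g.list_rows_to_remove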
def transform(mmj_menu_items, mmj_categories, prices, organization_id,
              source_db, debug):
    """
    Transform data
    """
    # source data table
    source_dt = utils.view_to_list(mmj_menu_items)

    cut_menu_data = [
        'id', 'vendor_id', 'menu_id', 'dispensary_id', 'strain_id',
        'created_at', 'updated_at', 'category_id', 'name', 'sativa', 'indica',
        'on_hold', 'product_type', 'image_file_name', 'medicine_amount',
        'product_type'
    ]

    cut_prices = [
        'menu_item_id', 'dispensary_id', 'price_half_gram', 'price_gram',
        'price_two_gram', 'price_eigth', 'price_quarter', 'price_half',
        'price_ounce'
    ]

    # Cut out all the fields we don't need to load
    menu_items = etl.cut(source_dt, *cut_menu_data)
    prices_data = etl.cut(prices, *cut_prices)

    menu_items = (etl.addfield(menu_items, 'createdAtEpoch')
                  .addfield('unitOfMeasure')
                  .addfield('locationProductDetails')
                  .addfield('keys')
                  .addfield('restockLevel'))

    # Two-step transform and cut. First we need to cut the name
    # and id from the source data to map to.
    cut_source_cats = etl.cut(mmj_categories, 'name', 'id', 'measurement')
    source_values = etl.values(cut_source_cats, 'name', 'id')

    # Then we need a dict of categories to compare against.
    # id is stored to match against when transforming and mapping categories
    mmj_categories = dict([(value, id) for (value, id) in source_values])

    mappings = OrderedDict()
    mappings['id'] = 'id'
    mappings['createdAt'] = 'created_at'
    mappings['updatedAt'] = 'updated_at'
    mappings['createdAtEpoch'] = lambda x: utils.create_epoch(x.created_at)
    mappings['name'] = 'name'
    mappings['shareOnWM'] = lambda x: _wm_integration(x.id, source_db)
    # 1 = Units, 2 = Grams (weight)
    mappings['unitOfMeasure'] = lambda x: _map_uom(x.category_id, source_db)

    fields = etl.fieldmap(menu_items, mappings)
    data = etl.merge(menu_items, fields, key='id')

    items = []
    for item in etl.dicts(data):
        breakpoint_pricing = (etl
                              .select(prices_data,
                                      lambda x: x.dispensary_id ==
                                      item['dispensary_id'])
                              .rename({'price_eigth': 'price_eighth'})
                              .cutout('menu_item_id'))

        # Set image url for load to download
        url = None
        if debug and item['image_file_name'] is not None:
            url = ("https://wm-mmjmenu-images-development.s3."
                   "amazonaws.com/menu_items/images/{0}/large/"
                   "{1}").format(item['id'], item['image_file_name'])
        elif item['image_file_name'] is not None:
            url = ("https://wm-mmjmenu-images-production.s3."
                   "amazonaws.com/menu_items/images/{0}/large/"
                   "{1}").format(item['id'], item['image_file_name'])

        item['image_file_name'] = url

        item['categoryId'] = _map_categories(item['category_id'],
                                             item['sativa'], item['indica'],
                                             mmj_categories, menu_items)
        item['keys'] = {
            'dispensary_id': item['dispensary_id'],
            'id': item['id'],
            'menu_id': item['menu_id'],
            'vendor_id': item['vendor_id'],
            'strain_id': item['strain_id'],
            'category_id': item['category_id']
        }

        # set a default netMJ value if the menu item is a unit product
        if item['unitOfMeasure'] == 2:
            item['netMarijuana'] = int(item['medicine_amount'])

        # copy the keys so we can delete from the dict while iterating
        for key in list(item['keys'].keys()):
            if not item['keys'][key]:
                del item['keys'][key]

        item['locationProductDetails'] = {
            'id': item['id'],
            'active': _active(item['on_hold'])
        }

        item['restockLevel'] = _restock_level(item['dispensary_id'],
                                              item['product_type'], source_db)

        if item['shareOnWM'] is None:
            item['shareOnWM'] = False

        for price in etl.dicts(breakpoint_pricing):
            try:
                price_two_gram = price['price_two_gram']
            except KeyError:
                price_two_gram = 0.0

            item['locationProductDetails']['weightPricing'] = {
                'price_half_gram':
                    utils.dollars_to_cents(price['price_half_gram']),
                'price_two_gram':
                    utils.dollars_to_cents(price_two_gram),
                'price_gram':
                    utils.dollars_to_cents(price['price_gram']),
                'price_eighth':
                    utils.dollars_to_cents(price['price_eighth']),
                'price_quarter':
                    utils.dollars_to_cents(price['price_quarter']),
                'price_half':
                    utils.dollars_to_cents(price['price_half']),
                'price_ounce':
                    utils.dollars_to_cents(price['price_ounce'])
            }

        del item['vendor_id']
        del item['indica']
        del item['dispensary_id']
        del item['id']
        del item['strain_id']
        del item['on_hold']
        del item['menu_id']
        del item['sativa']
        del item['category_id']
        del item['updated_at']
        del item['created_at']
        del item['product_type']

        if item['image_file_name'] is None:
            del item['image_file_name']

        # set up final structure for API
        items.append(item)

    # Remove inactive items (filter instead of mutating the list
    # while iterating over it)
    items = [
        item for item in items
        if item['locationProductDetails']['active'] is not False
    ]

    if debug:
        result = json.dumps(items,
                            sort_keys=True,
                            indent=4,
                            default=utils.json_serial)
        print(result)

    return items
from __future__ import absolute_import, print_function, division


# select()
##########

import petl as etl

table1 = [['foo', 'bar', 'baz'],
          ['a', 4, 9.3],
          ['a', 2, 88.2],
          ['b', 1, 23.3],
          ['c', 8, 42.0],
          ['d', 7, 100.9],
          ['c', 2]]

# the second positional argument can be a function accepting
# a row
table2 = etl.select(table1,
                    lambda rec: rec.foo == 'a' and rec.baz > 88.1)
table2

# the second positional argument can also be an expression
# string, which will be converted to a function using petl.expr()
table3 = etl.select(table1, "{foo} == 'a' and {baz} > 88.1")
table3

# the condition can also be applied to a single field
table4 = etl.select(table1, 'foo', lambda v: v == 'a')
table4


# selectre()
############

import petl as etl

table1 = [['foo', 'bar', 'baz'],
          ['aa', 4, 9.3],
          ['aaa', 2, 88.2],
          ['b', 1, 23.3],
          ['ccc', 8, 42.0],
          ['bb', 7, 100.9],
          ['c', 2]]

table2 = etl.selectre(table1, 'foo', '[ab]{2}')
table2


# selectusingcontext()
######################
for x in range(length):
    attr = data['attibutes'][x]['attrName']
    matchingField = data['attibutes'][x]['matchingField']
    mappings[attr] = matchingField

mappedTable = etl.fieldmap(dataTable, mappings)
cleansedTable = mappedTable

# add rules to clean the table - reversed to give priority
# to the top attributes
for x in reversed(range(length)):
    attr = data['attibutes'][x]['attrName']
    rules = data['attibutes'][x]['rules']
    rulesListSize = len(rules)
    for y in range(rulesListSize):
        if rules[y] == "Remove Null Value Rows":
            cleansedTable = etl.select(cleansedTable, attr,
                                       lambda v: v != '')
        if rules[y] == "Remove Duplicates":
            cleansedTable = etl.distinct(cleansedTable, key=attr)
        if rules[y] == "Sort":
            cleansedTable = etl.mergesort(cleansedTable, key=attr)
        if rules[y] == "Number Validation":
            # keep only rows whose value parses as a number
            cleansedTable = etl.select(cleansedTable, attr,
                                       lambda v: str(v).isnumeric())
        if rules[y] == "Fill Missing Values":
            cleansedTable = etl.filldown(cleansedTable, attr)

etl.tocsv(cleansedTable, 'src/etl/outputs/cleansed.csv')

# Create rawData Table
dataTable = cleansedTable
rawDataTable = cleansedTable
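# A sketch of the data structure this loop appears to consume. The key
# names, including the 'attibutes' spelling, come from the code above;
# the values are illustrative:
data = {
    'attibutes': [
        {
            'attrName': 'price',
            'matchingField': 'unit_price',
            'rules': ["Remove Null Value Rows", "Number Validation"],
        },
        {
            'attrName': 'product',
            'matchingField': 'product_name',
            'rules': ["Remove Duplicates", "Sort"],
        },
    ]
}
length = len(data['attibutes'])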
    sys.exit(1)

# print("Some aggregates from the results, for date: ", DATE)
t1 = (etl.fromcsv(f"var/s1_invisible_prefixes-{DATE}.csv",
                  delimiter="|")
      .convert('visible', int)
      .convert('dark', int)
      .convert('total', int))
# print(t1.look())

print("Some aggregates from the results, for date: ", DATE)
print("Total space in the whole pool", sum([x[4] for x in list(t1)][1:]))
print(" - ")

# number of completely invisible assignments
n1 = etl.select(t1, lambda r: r['dark'] == r['total'])
print("number of completely invisible assignments", n1.nrows())
print("total IPs in completely invisible assignments",
      sum([x[4] for x in list(n1)][1:]))
print("average size of completely invisible assignments",
      Average([x[4] for x in list(n1)][1:]))
print(" - ")

# number of partially invisible assignments
n2 = etl.select(t1, lambda r: r['dark'] > 0)
print("number of partially invisible assignments", n2.nrows())
print("total IPs in partially invisible assignments",
      sum([x[3] for x in list(n2)][1:]))
print("average size of partially invisible assignments (total)",
      Average([x[4] for x in list(n2)][1:]))
print("average size of partially invisible assignments (dark)",
           'new_tests', 'total_tests_per_thousand', 'new_tests_per_thousand',
           'tests_units']]

# cut keeps only the listed columns from the table; it is not strictly
# necessary for table1, because these are all of the fields it contains
data = etl.cut(table1, 'iso_code', 'location', 'date', 'total_cases',
               'new_cases', 'total_deaths', 'new_deaths',
               'total_cases_per_million', 'new_cases_per_million',
               'total_deaths_per_million', 'new_deaths_per_million',
               'total_tests', 'new_tests', 'total_tests_per_thousand',
               'new_tests_per_thousand', 'tests_units')

# select rows by the current date: num holds only the 2020-04-30 data
# for each country, so the latest data is filtered out
num = etl.select(data, 'date', lambda r: r == '2020-04-30')

# sort the data by iso_code, which makes the joins in the later steps easier
table1_sort = etl.sort(num, key='iso_code')

# counter variable to count the number of countries
count = 0

# values is used to read the data from the table
for i in etl.values(table1_sort, 'iso_code', 'location', 'date',
                    'total_cases', 'new_cases', 'total_deaths', 'new_deaths',
                    'total_cases_per_million', 'new_cases_per_million',
                    'total_deaths_per_million', 'new_deaths_per_million',
                    'total_tests', 'new_tests', 'total_tests_per_thousand',
                    'new_tests_per_thousand', 'tests_units'):
        for name, region in q_geo.items():
            if point.within(region):
                outrow.append(name)
                break
        writer.writerow(outrow)
    f.close()

print("Creating new csv...")
addRows(geo, 'tract', 'saved_homes.csv', 'saved_homes_extended.csv')
print("New csv created.")

table = etl.fromcsv('saved_homes_extended.csv')
zip_list = []
tract_list = []
year_list = []

print("Aggregating by zip...")
for feature in zip_data['features']:
    zip_table = etl.select(table, 'zip',
                           lambda x: x == feature['properties']['code'])
    # materialize the rows for this zip (avoid shadowing the zip_data
    # dict we are iterating over)
    zip_rows = list(etl.data(zip_table))
    zip_dict = {'zip': feature['properties']['code'],
                'saved': 0,
                'lost': 0,
                'saved_fta': 0,
                'lost_fta': 0,
                'pending': 0,
                'pending_fta': 0,
                'vacant': 0,
                'nonowner': 0,
                'litig/bankr': 0,
                'litig/bankr_fta': 0,
                'shape': feature['geometry']}
    for b in zip_rows:
        if b[1] == 'Saved':
            zip_dict['saved'] = zip_dict['saved'] + 1
        elif b[1] == 'Lost':
            zip_dict['lost'] = zip_dict['lost'] + 1
        elif b[1] == 'Saved - FTA':
            zip_dict['saved_fta'] = zip_dict['saved_fta'] + 1
        elif b[1] == 'Lost - FTA':
            zip_dict['lost_fta'] = zip_dict['lost_fta'] + 1
        elif b[1] == 'Pending':
            zip_dict['pending'] = zip_dict['pending'] + 1
        elif b[1] == 'Pending - FTA':
"Free SO2", "Total SO2", "Density", "pH", "Sulfates", "Alcohol", "Quality" ] table1 = etl.addfield( etl.convertnumbers( etl.setheader(etl.fromcsv('winequality-red.csv'), table_header)), "Type", "Red") table2 = etl.addfield( etl.convertnumbers( etl.setheader(etl.fromcsv('winequality-white.csv'), table_header)), "Type", "White") #print(etl.head(table1)) #print(etl.head(table2)) table1_filtered = etl.select(table1, "Quality", lambda v: v > 6) table2_filtered = etl.select(table2, "Quality", lambda v: v > 4) good_wines = etl.cat(table1_filtered, table2_filtered) good_wines_enhanced = etl.addfields( good_wines, [("Max Acidity", lambda rec: rec["Fixed Acidity"] + rec["Volatile Acidity"]), ("Locked SO2", lambda rec: rec["Total SO2"] - rec["Free SO2"])]) #print(etl.head(good_wines_enhanced)) #print(etl.tail(good_wines_enhanced)) gwe_sorted = etl.sort(good_wines_enhanced, key=["Quality", "Sugar"]) #print(etl.head(gwe_sorted))