def __init__(self, lst=[]):
    self.table = None
    lst_type = type(lst)
    if lst_type in [list, tuple]:
        # Check for empty list
        if not len(lst):
            self.table = petl.fromdicts([])
        else:
            row_type = type(lst[0])
            # Check for list of dicts
            if row_type == dict:
                self.table = petl.fromdicts(lst)
            # Check for list of lists
            elif row_type in [list, tuple]:
                self.table = petl.wrap(lst)
    else:
        # Create from a petl table
        self.table = lst

    if not self.is_valid_table():
        raise ValueError("Could not create Table")

    # Count how many times someone is indexing directly into this table, so we can
    # warn against inefficient usage.
    self._index_count = 0
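# A minimal usage sketch for the constructor above, assuming the surrounding class is
# named Table (the class name is not shown in the snippet) and that petl is imported.
# It only illustrates the three accepted inputs: a list of dicts, a list of rows, or an
# existing petl table.
tbl_from_dicts = Table([{"foo": "a", "bar": 1}, {"foo": "b", "bar": 2}])
tbl_from_rows = Table([["foo", "bar"], ["a", 1], ["b", 2]])
tbl_from_petl = Table(petl.fromdicts([{"foo": "a", "bar": 1}]))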
def create_csv_based_on_peoples(peoples):
    pathlib.Path(settings.CSV_FILES_DIRECTORY).mkdir(parents=True, exist_ok=True)
    file_path = os.path.join(settings.CSV_FILES_DIRECTORY, get_file_name())
    petl.fromdicts(peoples).convert(
        'edited', convert_iso_datetime_string_to_string_date
    ).rename('edited', 'date').cut(HEADERS).tocsv(file_path)
    return Dataset.objects.create(file=file_path)
def transform(characters: Iterator[dict], planets: Iterator[dict]) -> Table:
    planets_table = (
        etl.fromdicts(planets)
        .addfield("id", lambda row: row.url.split("/")[-2])
        .rename("name", "planet_name")
        .cut("id", "planet_name")
    )
    characters_table = (
        etl.fromdicts(characters)
        .addfield("date", lambda row: row.edited.split("T")[0])
        .convert("homeworld", lambda hw: hw.split("/")[-2])
        .join(planets_table, lkey="homeworld", rkey="id")
        .cutout("homeworld")
        .rename("planet_name", "homeworld")
        .cutout("films", "species", "vehicles", "starships", "created", "edited", "url")
    )
    return characters_table
def test_fromdicts_1():
    data = [{"foo": "a", "bar": 1}, {"foo": "b", "bar": 2}, {"foo": "c", "bar": 2}]
    actual = fromdicts(data)
    expect = (("foo", "bar"), ("a", 1), ("b", 2), ("c", 2))
    ieq(expect, actual)
    ieq(expect, actual)  # verify can iterate twice
def _write_to_log(parameter_view, sample_rows, resource_index):
    parameter_view = json.dumps(parameter_view, ensure_ascii=False, indent=4)
    table_view = petl.look(petl.fromdicts(sample_rows))
    logging.info('Processed resource %s', resource_index)
    logging.info('Parameters = %s', parameter_view)
    logging.info('Sample output: \n%s', table_view)
def extract(profile, workdir):
    dicts = []
    outer = parse(profile)
    station_ids = outer.findall('./stationID')
    updatetime_str = outer.getroot().attrib.get('updatetime')
    modify_time = parser.parse(updatetime_str, ignoretz=True) if updatetime_str else datetime.datetime.now()
    for station_id_tree in station_ids:
        status = station_id_tree.find('./status')
        if status is not None and status.find('./station') is not None:
            if status.find('./station').text.strip().startswith(u'無觀測'):
                continue
        profile = station_id_tree.find('./profile')
        station_id = station_id_tree.attrib['id'].strip()
        # filename = station_id + '.xml'
        outer_data = {
            'station_id': station_id,
            'seas_chName': profile.find('./seas_chName').text.strip(),
            'latitude': float(profile.find('./latitude').text.strip()),
            'longitude': float(profile.find('./longitude').text.strip()),
            'chName': getattr(profile.find('./chName'), 'text', None) and profile.find('./chName').text.strip(),
            'chCity': getattr(profile.find('./chCity'), 'text', None) and profile.find('./chCity').text.strip(),
            'kind_chName': getattr(profile.find('./kind_chName'), 'text', None) and profile.find('./kind_chName').text.strip(),
            'chTown': getattr(profile.find('./chTown'), 'text', None) and profile.find('./chTown').text.strip(),
            'chLocation': getattr(profile.find('./chLocation'), 'text', None) and profile.find('./chLocation').text.strip(),
            'dataItem': getattr(profile.find('./dataItem'), 'text', None) and profile.find('./dataItem').text.strip(),
            # 'file_path': (workdir + filename) if workdir.endswith('/') else ('%s/%s' % (workdir, filename)),
            'modifytime': modify_time,
            'updatetime': datetime.datetime.now(),
        }
        dicts.append(outer_data)
    return petl.wrap([row for row in petl.fromdicts(dicts)])
def test_append_to_file_for_csv(user_export_file, tmpdir, media_root):
    # given
    export_data = [
        {"id": "123", "name": "test1", "collections": "coll1"},
        {"id": "345", "name": "test2"},
    ]
    headers = ["id", "name", "collections"]
    delimiter = ";"
    file_name = "test.csv"

    table = etl.fromdicts([{"id": "1", "name": "A"}], header=headers, missing=" ")
    with NamedTemporaryFile() as temp_file:
        etl.tocsv(table, temp_file.name, delimiter=delimiter)
        user_export_file.content_file.save(file_name, temp_file)

    # when
    append_to_file(export_data, headers, user_export_file, FileTypes.CSV, delimiter)

    # then
    user_export_file.refresh_from_db()
    csv_file = user_export_file.content_file
    file_content = csv_file.read().decode().split("\r\n")
    assert ";".join(headers) in file_content
    assert ";".join(export_data[0].values()) in file_content
    assert (";".join(export_data[1].values()) + "; ") in file_content

    shutil.rmtree(tmpdir)
def shcopy_files(data, source, source_field, destination_field, results, quiet):
    """Copy files from the source to the destination."""
    source = data.get(source)
    results_data = []
    for i in source.dicts():
        r = {
            source_field: i[source_field],
            destination_field: i[destination_field]
        }
        try:
            shutil.copy(i[source_field], i[destination_field])
            r['success'] = True
            r['message'] = "File copied successfully."
            if not quiet:
                unsync.secho('Successfully copied {} to {}'.format(
                    i[source_field], i[destination_field]), fg='green')
        except (shutil.Error, IOError) as e:
            r['success'] = False
            r['message'] = str(e)
            if not quiet:
                unsync.secho('Failed copying {} to {}. Reason was: {}'.format(
                    i[source_field], i[destination_field], str(e)), fg='red')
        results_data.append(r)
    if results:
        results_data = petl.fromdicts(results_data)
        data.set(results, results_data)
def transform(data, data_set):
    data = data['observations']
    data = etl.fromdicts(data, header=['value', 'realtime_start', 'realtime_end', 'date'])
    data = etl.cut(data, 'date', 'value')
    data = etl.rename(data, {'date': 'date', 'value': data_set.lower()})
    data = etl.convert(data, data_set.lower(), lambda val: 0 if val == '.' else val)
    return data
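# A minimal sketch of the input this transform() expects, assuming the caller passes the
# decoded JSON of a FRED-style observations payload (the actual API call is not shown
# above); 'GDP' is only an illustrative series name.
sample = {
    'observations': [
        {'realtime_start': '2020-01-01', 'realtime_end': '2020-01-01',
         'date': '2019-10-01', 'value': '21747.394'},
        {'realtime_start': '2020-01-01', 'realtime_end': '2020-01-01',
         'date': '2020-01-01', 'value': '.'},  # missing values arrive as '.'
    ]
}
tbl = transform(sample, 'GDP')  # -> petl table with columns ('date', 'gdp')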
def fetch_all_characters_data():
    for characters_page in fetch_all_data_by_pages(
        f'{settings.STAR_WARS_API_URL}/people/?format=json',
    ):
        yield etl.fromdicts(
            characters_page,
            header=settings.STAR_WARS_CHARACTERS_INPUT_API_FIELDS,
        )
def list_account_users(data, url, api_key, account_id, search_term, destination):
    if not url.startswith('http://') and not url.startswith('https://'):
        url = 'https://' + url
    client = UsersAPI(url, api_key)
    r = client.list_users_in_account(account_id, search_term)
    data.cat(destination, petl.fromdicts(r))
def people_view(self):
    """Object pretending to be ``PeopleCollectionView`` instance."""
    return petl.fromdicts(
        [
            {
                'name': 'Person 1',
                'homeworld': 'planets/1',
                'edited': '2020-08-30T20:58:18.420000Z',
            },
            {
                'name': 'Person 2',
                'homeworld': 'planets/4',
                'edited': '2020-08-31T13:47:18.480000Z',
            },
            {
                'name': 'Person 3',
                'homeworld': 'planets/1',
                'edited': '2020-08-30T20:58:18.420000Z',
            },
            {
                'name': 'Person 4',
                'homeworld': 'planets/2',
                'edited': '2020-08-31T06:31:18.370000Z',
            },
            {
                'name': 'Person 5',
                'homeworld': 'planets/1',
                'edited': '2020-09-01T06:31:18.370000Z',
            },
        ],
        header=('name', 'homeworld', 'edited'),
    )
def test_fromdicts_2():
    data = [{"foo": "a", "bar": 1},
            {"foo": "b"},
            {"foo": "c", "bar": 2, "baz": True}]
    actual = fromdicts(data)
    expect = (("foo", "bar", "baz"),
              ("a", 1, None),
              ("b", None, None),
              ("c", 2, True))
    ieq(expect, actual)
    ieq(expect, actual)  # verify can iterate twice
def list_imports(data, url, api_key, account_id, created_since, destination,
                 include_errors, include_warnings, flatten_data):
    if not url.startswith('http://') and not url.startswith('https://'):
        url = 'https://' + url
    client = SisImportsAPI(url, api_key)
    r = client.get_sis_import_list(account_id, created_since=created_since)
    t = []
    for i in r:
        if not include_errors:
            i['processing_errors'] = "{} Errors Occurred".format(
                len(i.get('processing_errors', [])))
        if not include_warnings:
            i['processing_warnings'] = "{} Warnings Occurred".format(
                len(i.get('processing_warnings', [])))
        if flatten_data:
            d = i['data']
            i['diffed_against_sis_batch_id'] = d.get(
                'diffed_against_sis_batch_id', None)
            i['import_type'] = d.get('import_type', None)
            c = d.get('counts', {})
            i['counts'] = ';'.join(
                ["{}: {}".format(k, c[k]) for k in c.keys()])
            i['supplied_batches'] = ';'.join(d.get('supplied_batches', []))
            del i['data']
        t.append(i)
    import_data = petl.fromdicts(t)
    data.set(destination, import_data)
def test_fromdicts_3():
    data = [{"foo": "a", "bar": 1},
            {"foo": "b"},
            {"foo": "c", "bar": 2, "baz": True}]
    actual = fromdicts(data, header=["foo", "bar"])
    expect = (("foo", "bar"), ("a", 1), ("b", None), ("c", 2))
    ieq(expect, actual)
    ieq(expect, actual)  # verify can iterate twice
def download_new_collection(cls) -> None:
    # store a small dictionary for the later transformation
    planets_arr = {}
    for planets in SWAPI.fetch_data(settings.SW_PLANETS_URL):
        planets_arr.update({i['url']: i['name'] for i in planets})

    create = True
    file_name = '{}.csv'.format(time())
    csv_path = Path(CSV_PATH, file_name)
    for people in SWAPI.fetch_data(settings.SW_PEOPLE_URL):
        table = etl.fromdicts(
            people,
            header=[
                'name', 'height', 'mass', 'hair_color', 'skin_color',
                'eye_color', 'birth_year', 'gender', 'homeworld', 'edited'
            ]
        ).convert(
            'edited', lambda v: v[0:10]
        ).convert(
            'homeworld', lambda v: planets_arr.get(v, '')
        ).rename('edited', 'date')
        if create:
            etl.tocsv(table, source=csv_path, write_header=True)
            create = False
        else:
            etl.appendcsv(table, source=csv_path)

    c = SWPeopleCollection()
    c.file.name = file_name
    c.save()
def get_user_logins(data, url, api_key, user_data, user_id_field, destination):
    if not url.startswith('http://') and not url.startswith('https://'):
        url = 'https://' + url
    user_data = data.get(user_data)
    user_data = user_data.dicts()
    login_data = []
    debug = data.config.debug
    client = LoginsAPI(url, api_key)
    for user in user_data:
        try:
            r = client.list_user_logins_users(user[user_id_field])
            if debug:
                unsync.secho(
                    'Retrieved {} Canvas Logins for Canvas User ID: {}'.format(
                        len(r), user[user_id_field]), fg='green')
        except CanvasAPIError:
            unsync.secho(
                'Unable to retrieve Canvas Login information for Canvas User ID: {}'
                .format(user[user_id_field]), fg='red')
            # skip this user; otherwise `r` would be stale or undefined below
            continue
        for login in r:
            login_data.append(login)
    login_data = petl.fromdicts(login_data)
    data.set(destination, login_data)
def update_user_logins(data, url, api_key, source, account_id_field, login_id_field,
                       unique_id_field, password_field, sis_user_id_field,
                       integration_id_field, results_table):
    if not url.startswith('http://') and not url.startswith('https://'):
        url = 'https://' + url
    client = LoginsAPI(url, api_key)
    source = data.get(source)
    debug = data.config.debug
    results = []
    for row in petl.dicts(source):
        account_id = row[account_id_field]
        login_id = row[login_id_field]
        kwargs = {}
        if unique_id_field is not None and row[unique_id_field] is not None:
            kwargs['login_unique_id'] = row[unique_id_field]
        if password_field is not None and row[password_field] is not None:
            kwargs['login_password'] = row[password_field]
        if sis_user_id_field is not None and row[sis_user_id_field] is not None:
            kwargs['login_sis_user_id'] = row[sis_user_id_field]
        if integration_id_field is not None and row[integration_id_field] is not None:
            kwargs['login_integration_id'] = row[integration_id_field]
        try:
            r = client.edit_user_login(login_id, account_id, **kwargs)
            unsync.secho('Successfully updated login: {} with data: {}'.format(
                login_id, str(kwargs)), fg='green')
            if results_table:
                row['_data'] = str(kwargs)
                row['_response_status'] = r
                row['_response_content'] = r
                results.append(row)
            if debug:
                unsync.secho(str(r), fg='yellow')
        except CanvasAPIError as e:
            unsync.secho('Failed updating login: {} with data: {}'.format(
                login_id, str(kwargs)), fg='red')
            unsync.secho('Response Status: {} Response Reason: {}'.format(
                e.response.status_code, e.response.content), fg='red')
            if results_table:
                row['_data'] = str(kwargs)
                row['_response_status'] = e.response.status_code
                row['_response_content'] = e.response.content
                results.append(row)
    results = petl.fromdicts(results)
    data.cat(results_table, results)
def get_account_logins(data, url, api_key, account_id, destination):
    if not url.startswith('http://') and not url.startswith('https://'):
        url = 'https://' + url
    client = LoginsAPI(url, api_key)
    r = client.list_user_logins_accounts(account_id)
    d = petl.fromdicts(r)
    data.set(destination, d)
def test_append_to_file_for_xlsx(user_export_file, tmpdir, media_root):
    # given
    export_data = [
        {"id": "123", "name": "test1", "collections": "coll1"},
        {"id": "345", "name": "test2"},
    ]
    expected_headers = ["id", "name", "collections"]
    delimiter = ";"

    table = etl.fromdicts([{"id": "1", "name": "A"}], header=expected_headers, missing=" ")
    temp_file = NamedTemporaryFile(suffix=".xlsx")
    etl.io.xlsx.toxlsx(table, temp_file.name)

    # when
    append_to_file(export_data, expected_headers, temp_file, FileTypes.XLSX, delimiter)

    # then
    user_export_file.refresh_from_db()
    wb_obj = openpyxl.load_workbook(temp_file)
    sheet_obj = wb_obj.active
    max_col = sheet_obj.max_column
    max_row = sheet_obj.max_row
    headers = [sheet_obj.cell(row=1, column=i).value for i in range(1, max_col + 1)]
    data = []
    for i in range(2, max_row + 1):
        row = []
        for j in range(1, max_col + 1):
            row.append(sheet_obj.cell(row=i, column=j).value)
        data.append(row)

    assert headers == expected_headers
    assert list(export_data[0].values()) in data
    row2 = list(export_data[1].values())
    # add string with space for collections column
    row2.append(" ")
    assert row2 in data

    temp_file.close()
    shutil.rmtree(tmpdir)
def format_data_sample(stream):
    """Return a table representation of a sample of the streamed data."""
    keyed_rows = []
    for row in stream.sample:
        keyed_rows.append(dict(zip(stream.headers, row)))
    petl_table = fromdicts(keyed_rows)
    return repr(look(petl_table, limit=None))
def planets_view(self):
    """Object pretending to be ``PlanetsCollectionView`` instance."""
    return petl.fromdicts(
        [{
            'name': 'Planet {0}'.format(index),
            'url': 'planets/{0}'.format(index),
        } for index in range(1, 5)],
        header=('url', 'name'),
    )
def _show(stream):
    """Return a table of sample data."""
    keyed_rows = []
    for row in stream.sample:
        keyed_rows.append(dict(zip(stream.headers, row)))
    petl_table = fromdicts(keyed_rows)
    return repr(look(petl_table, limit=None))
def test_fromdicts_1():
    data = [{"foo": "a", "bar": 1}, {"foo": "b", "bar": 2}, {"foo": "c", "bar": 2}]
    actual = fromdicts(data)
    expect = (('foo', 'bar'), ('a', 1), ('b', 2), ('c', 2))
    ieq(expect, actual)
    ieq(expect, actual)  # verify can iterate twice
def test_fromdicts_3():
    data = [{"foo": "a", "bar": 1}, {"foo": "b"}, {"foo": "c", "bar": 2, "baz": True}]
    actual = fromdicts(data, header=['foo', 'bar'])
    expect = (('foo', 'bar'), ('a', 1), ('b', None), ('c', 2))
    ieq(expect, actual)
    ieq(expect, actual)  # verify can iterate twice
def geojson_fc_to_csv(fc):
    rows = []
    for f in fc['features']:
        feature = {}
        feature.update(f['properties'])
        # GeoJSON geometries store positions under the 'coordinates' key
        geom = f['geometry']['coordinates']
        feature.update({'lon': geom[0], 'lat': geom[1]})
        rows.append(feature)
    table = etl.fromdicts(rows)
    return etl.lookall(table)
def test_fromdicts_2():
    data = [{"foo": "a", "bar": 1}, {"foo": "b"}, {"foo": "c", "bar": 2, "baz": True}]
    actual = fromdicts(data)
    expect = (('foo', 'bar', 'baz'), ('a', 1, None), ('b', None, None), ('c', 2, True))
    ieq(expect, actual)
    ieq(expect, actual)  # verify can iterate twice
def transform_data(data):
    tbl_data = petl.fromdicts(data)
    tbl_data = petl.convert(
        tbl_data, {k: v['value'] for k, v in fields_to_transform.items()})
    tbl_data = petl.rename(
        tbl_data, {k: v['key'] for k, v in fields_to_transform.items()})
    tbl_data_allowed = petl.cut(tbl_data, *allowed_fields)
    return tbl_data_allowed
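# A hedged sketch of the module-level structures transform_data() relies on; their real
# definitions are not shown above, so the field names here are purely illustrative.
# Each entry maps a source field to a converter ('value') and a new column name ('key').
fields_to_transform = {
    'created_at': {'value': lambda v: v[:10], 'key': 'created_date'},
    'amount': {'value': float, 'key': 'amount_usd'},
}
# Only these columns survive the final petl.cut().
allowed_fields = ['id', 'created_date', 'amount_usd']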
def format_results(results, f, geodata_model):
    """handle parsing the format argument to convert the results 'table'
    into one of the desired formats

    :param results: [description]
    :type results: [type]
    :param f: [description]
    :type f: [type]
    :param geodata_model: [description]
    :type geodata_model: [type]
    :return: [description]
    :rtype: [type]
    """
    # make submitted value lowercase, to simplify comparison
    f = f.lower()

    # fall back to JSON if no format provided
    if f not in F_ALL:
        f = F_JSON[0]

    # JSON format
    if f in F_JSON:
        if f == 'time':
            # grouped by timestamp
            return _groupby(results, key='ts', sortby='id')
        elif f == 'sensor':
            # grouped by id
            return _groupby(results, key='id', sortby='ts')
        else:
            # (list of dicts)
            return results
    # GEOJSON format (GeoJSON Feature collection; results under 'data' key within properties)
    # elif f in F_GEOJSON:
    #     results = _groupby(results, key='id', sortby='ts')
    #     return _format_as_geojson(results, geodata_model)
    # ARRAYS format (2D table)
    elif f in F_ARRAYS:
        # nested arrays
        t = etl.fromdicts(results)
        # h = list(etl.header(t))
        # return list(etl.data(t)).insert(0, h)
        return list(etl.data(t))
    elif f in F_CSV:
        return _format_teragon(results)
    # elif f in F_MD:
    #     return results
    else:
        return results
def test_fromdicts_ordered():
    from collections import OrderedDict
    data = [
        OrderedDict([('foo', 'a'), ('bar', 1)]),
        OrderedDict([('foo', 'b')]),
        OrderedDict([('foo', 'c'), ('bar', 2), ('baz', True)])
    ]
    actual = fromdicts(data)
    # N.B., fields come out in original order
    expect = (('foo', 'bar', 'baz'),
              ('a', 1, None),
              ('b', None, None),
              ('c', 2, True))
    ieq(expect, actual)
def test_fromdicts_3():
    data = [{'foo': 'a', 'bar': 1},
            {'foo': 'b'},
            {'foo': 'c', 'bar': 2, 'baz': True}]
    actual = fromdicts(data, header=['foo', 'bar'])
    expect = (('foo', 'bar'), ('a', 1), ('b', None), ('c', 2))
    ieq(expect, actual)
    ieq(expect, actual)  # verify can iterate twice
def test_fromdicts_ordered():
    from collections import OrderedDict
    data = [OrderedDict([('foo', 'a'), ('bar', 1)]),
            OrderedDict([('foo', 'b')]),
            OrderedDict([('foo', 'c'), ('bar', 2), ('baz', True)])]
    actual = fromdicts(data)
    # N.B., fields come out in original order
    expect = (('foo', 'bar', 'baz'),
              ('a', 1, None),
              ('b', None, None),
              ('c', 2, True))
    ieq(expect, actual)
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument("--filename", type=str, help='Input filename', required=True)
    # parser.add_argument("--max_iterations", type=int, help='Max number of requests', default=1000)
    known_args, _ = parser.parse_known_args()

    file_content = [line for line in read_file(known_args.filename)]
    table = petl.fromdicts(file_content)
    tokenized_table = tokenize(table)
    petl.tocsv(tokenized_table, 'words.csv')
def test_fromdicts_onepass():
    # check that fromdicts() only makes a single pass through the data
    data = iter([{'foo': 'a', 'bar': 1},
                 {'foo': 'b', 'bar': 2},
                 {'foo': 'c', 'bar': 2}])
    actual = fromdicts(data, header=['foo', 'bar'])
    expect = (('foo', 'bar'), ('a', 1), ('b', 2), ('c', 2))
    ieq(expect, actual)
def test_fromdicts_2():
    data = [{'foo': 'a', 'bar': 1},
            {'foo': 'b'},
            {'foo': 'c', 'bar': 2, 'baz': True}]
    actual = fromdicts(data, header=['bar', 'baz', 'foo'])
    expect = (('bar', 'baz', 'foo'),
              (1, None, 'a'),
              (None, None, 'b'),
              (2, True, 'c'))
    ieq(expect, actual)
    ieq(expect, actual)  # verify can iterate twice
def get_enrollment_terms(data, url, api_key, account_id, state, destination):
    if not url.startswith('http://') and not url.startswith('https://'):
        url = 'https://' + url
    term_data = []
    client = EnrollmentTermsAPI(url, api_key)
    r = client.list_enrollment_terms(account_id, state)
    for term in r:
        term_data.append(term)
    term_data = petl.fromdicts(term_data)
    data.cat(destination, term_data)
def write_data(filepath: PosixPath, people_data: dict, planets: Dict[str, str]):
    settings.DATA_PATH.mkdir(parents=True, exist_ok=True)
    for data in people_data:
        table: DictsView = etl.fromdicts(data['results']).convert(
            'homeworld', planets
        ).addfield(
            'date', lambda row: parse(row['edited']).strftime('%Y-%m-%d')
        ).cutout(*CUTOUT_FIELDS)
        if filepath.is_file():
            table.appendcsv(filepath)
        else:
            table.tocsv(filepath)
def append_to_file(
    export_data: List[Dict[str, Union[str, bool]]],
    headers: List[str],
    temporary_file: Any,
    file_type: str,
    delimiter: str,
):
    table = etl.fromdicts(export_data, header=headers, missing=" ")

    if file_type == FileTypes.CSV:
        etl.io.csv.appendcsv(table, temporary_file.name, delimiter=delimiter)
    else:
        etl.io.xlsx.appendxlsx(table, temporary_file.name)
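# A minimal call sketch for append_to_file(), mirroring the CSV test earlier in this
# collection; the NamedTemporaryFile already holds a header row written with etl.tocsv,
# and FileTypes.CSV is assumed to come from the same module as append_to_file.
rows = [{"id": "1", "name": "A", "collections": "c1"}, {"id": "2", "name": "B"}]
cols = ["id", "name", "collections"]
with NamedTemporaryFile(suffix=".csv") as tmp:
    etl.tocsv(etl.fromdicts(rows[:1], header=cols, missing=" "), tmp.name, delimiter=";")
    append_to_file(rows[1:], cols, tmp, FileTypes.CSV, ";")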
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('lat', type=float)
    parser.add_argument('lon', type=float)
    parser.add_argument(
        'name',
        nargs='+',
        action='append',
        help='Peak name to search for - multiple words will form one search term'
    )
    args = parser.parse_args()
    peak_name = ' '.join(*args.name) or 'Mount Stuart'
    peak = Peak(peak_name, args.lat, args.lon)
    print('Finding trip reports for', peak)
    all_trip_reports = get_all_trip_reports(peak)
    petl.fromdicts(
        [report._asdict() for report in all_trip_reports],
        header=TripReportSummary._fields
    ).tocsv()
def test_fromdicts_header_list():
    from collections import OrderedDict
    data = [
        OrderedDict([('foo', 'a'), ('bar', 1)]),
        OrderedDict([('foo', 'b'), ('bar', 2)]),
        OrderedDict([('foo', 'c'), ('bar', 2)])
    ]
    actual = fromdicts(data)
    header = actual.header()
    assert header == ('foo', 'bar')
    expect = (('foo', 'bar'), ('a', 1), ('b', 2), ('c', 2))
    ieq(expect, actual)
    ieq(expect, actual)
def print_table(ctx):
    """Output a list of pipelines as table."""
    rows = [dict(source.state) for source in ctx.obj['sources']]
    message = '\nNumber of pipelines = {}\n'
    secho(message.format(len(rows)), **SUCCESS)
    subset = [
        'id', 'pipeline_status', 'validation_status',
        'nb_validation_errors', 'scraper_required',
        'resource_type', 'extension'
    ]
    sorted_rows = sort(cut(fromdicts(rows), *subset), key='id')
    echo(look(sorted_rows, limit=None))
def process_rows(resource_):
    for row_index, row in enumerate(resource_):
        if pass_row_index:
            parameters.update(row_index=row_index)
        new_row = row_processor(row, **parameters)
        yield new_row
        if verbose and row_index < LOG_SAMPLE_SIZE:
            sample_rows.append(new_row)
    if verbose:
        table = look(fromdicts(sample_rows), limit=LOG_SAMPLE_SIZE)
        message = 'Output of processor %s for resource %s is...\n%s'
        args = row_processor.__name__, resource_index, table
        logging.info(message, *args)
def scrape():
    res = requests.get(BASE_URL)
    doc = html.fromstring(res.content)
    query = {}
    for inp in doc.findall('.//form[@name="toetuse_saajad"]//input'):
        if inp.get('type') == 'submit':
            continue
        query[inp.get('name')] = inp.get('value')
    rows = []
    for option in doc.findall('.//select[@id="meede"]/option'):
        measure = option.get('value')
        if len(measure):
            rows.extend(list(scrape_measure(query, measure)))
    tojson(fromdicts(rows), 'Estonia_scraper_dump.json', sort_keys=True, **JSON_FORMAT)
def season_game_bucket_combo_probs(sbs, gbs):
    # join bucketed season performance to bucketed game performance
    sg = sbs.join(gbs, key=('year', 'name'), lprefix='s_', rprefix='g_')

    # get all possible season/game bucket combos as our master list
    all_bucket_combos = petl.fromdicts([
        {'s_bucket': s, 'g_bucket': g}
        for s in set(sg['s_bucket'])
        for g in set(sg['g_bucket'])
    ])

    # get counts of all existing season/game bucket combos
    game_bucket_counts = sg.aggregate(('s_bucket', 'g_bucket'), len)

    # generate counts for all season/game bucket combos and fill in
    # any missing values with 0
    all_game_bucket_counts = (
        all_bucket_combos
        .leftjoin(
            game_bucket_counts,
            key=('s_bucket', 'g_bucket'),
            missing=0,
        )
        .rename({'value': 'g_count'})
    )

    # calculate season bucket counts
    s_bucket_game_counts = (
        all_game_bucket_counts
        .aggregate('s_bucket', sum, 'g_count')
        .rename({'value': 's_count'})
    )

    # get probs for each season-game turnout
    g_bucket_probs = (
        all_game_bucket_counts
        .join(s_bucket_game_counts, key='s_bucket')
        .addfield('prob', lambda r: r['g_count'] / r['s_count'])
        .cut(('s_bucket', 'g_bucket', 'g_count', 's_count', 'prob'))
    )
    return g_bucket_probs
def attendance_file2table(filename, output_csv_filebase, add_extra_fields):
    global full_name2sk_indiv_id

    print "*** Parsing file: " + filename
    print

    attendance_dicts = []

    # CCB's Worship Service event IDs...
    event_ids = {}
    event_ids["8"] = 6
    event_ids["9"] = 7
    event_ids["10"] = 8
    event_ids["11:15"] = 9
    event_ids["Christmas"] = 13

    # The following are used to create CSV output filenames and to emit human-readable event name if add_extra_fields
    # flag is on
    event_names = {}
    event_names[6] = "08am"
    event_names[7] = "09am"
    event_names[8] = "10am"
    event_names[9] = "11_15am"
    event_names[13] = "Christmas Eve"

    # Time of event in Excel-parseable format
    event_times = {}
    event_times[6] = "08:00 AM"
    event_times[7] = "09:00 AM"
    event_times[8] = "10:00 AM"
    event_times[9] = "11:15 AM"
    event_times[13] = "04:00 PM"

    # Starting state...
    prior_line = None
    matched_month_year = None
    matched_service_time = None
    month = None
    year = None
    service_time = None
    line_number = 1
    total_row_dict = None
    event_id = None
    accumulated_row_totals_dict = {"week1": 0, "week2": 0, "week3": 0, "week4": 0, "week5": 0, "week6": 0,
                                   "total": 0}
    full_name = None
    phone = None
    num_processed_lines = 0

    for line in open(filename):

        # First pick off line at front of file indicating month and year that this attendance file is for...
        if not matched_month_year:
            matched_month_year = re.search("For the month of ([A-Z][a-z]+), ([0-9]{4})", line)
            if matched_month_year:
                month = string2monthnum(matched_month_year.group(1))
                year = string2yearnum(matched_month_year.group(2))
                if not (month and year):
                    print >> sys.stderr, "*** Filename: " + filename + ", line number: " + str(line_number)
                    print >> sys.stderr, "*** ERROR! Invalid month or year found"
                    print >> sys.stderr, line
                    print >> sys.stderr
                    sys.exit(1)
                first_day_in_month, num_days_in_month = calendar.monthrange(year, month)

                # Create list of 6 date objects, month_sundays, representing week1, week2, ... week6 Sunday dates
                # If a week has no Sunday, it is None
                day_countup = 1
                day_countup += 6 - first_day_in_month
                month_sundays = []
                if first_day_in_month != 6:
                    month_sundays.append(None)
                while day_countup <= num_days_in_month:
                    month_sundays.append(datetime.date(year, month, day_countup))
                    day_countup += 7
                while len(month_sundays) < 6:
                    month_sundays.append(None)
                christmas_eve_date = datetime.date(year, month, 24)

        # Second pick off line at front of file indicating worship service time that this attendance file is for...
        elif not matched_service_time:
            matched_service_time = re.search("Worship Service - (Sunday |Summer )?([^ ]*)", line)
            if matched_service_time:
                service_time = matched_service_time.group(2)
                if service_time in event_ids:
                    event_id = event_ids[service_time]
                    event_name = event_names[event_id]
                else:
                    print >> sys.stderr, "*** Filename: " + filename + ", line number: " + str(line_number)
                    print >> sys.stderr, '*** ERROR! Unrecognized service_time: "' + service_time + '"'
                    print >> sys.stderr
                    sys.exit(1)

        # ...then match attendance (row per person with weeks they attended) and total (summary at bottom) rows
        else:
            # Once we found row with totals...we're done, that's last line in attendance file we need to parse
            matched_total_line = re.search("^ {18}Total: {13}(?P<attendance>( +[0-9]+)+)\r?$", line)
            if matched_total_line:
                totals_attendance_dict = attendance_str2dict(
                    matched_total_line.group("attendance"), [-3, -9, -15, -20, -24, -29, -35], 3
                )
                break

            matched_attendance_line = re.search(
                "^ {6}"
                + "(?P<full_name>(?P<last_name>[A-Za-z]+([ \-'][A-Za-z]+)*), "
                + "(?P<first_name>([A-Za-z]+\.?)+([\-' ][A-Za-z]+)*)( \((?P<nick_name>[A-Za-z]+)\))?\.?)?\r?"
                + "(?P<phone>( +)?([0-9]{3}-[0-9]{3}-[0-9]{4}|Unlisted))?"
                + "(?P<attendance> +(1 +)+[1-6])?\r?$",
                line,
            )
            if matched_attendance_line:
                if matched_attendance_line.group("full_name"):
                    full_name = matched_attendance_line.group("full_name").strip()
                if matched_attendance_line.group("phone"):
                    phone = matched_attendance_line.group("phone").strip()
                if matched_attendance_line.group("attendance"):
                    if full_name:
                        attendance = matched_attendance_line.group("attendance").strip()
                        row_dict = attendance_str2dict(attendance, [-1, -7, -13, -18, -22, -27, -33], 1)
                        row_dict["full_name"] = full_name
                        if phone:
                            row_dict["phone"] = phone
                        else:
                            row_dict["phone"] = ""
                        num_processed_lines += 1
                        full_name = None
                        phone = None
                        if row_dict["total"] != (
                            row_dict["week1"] + row_dict["week2"] + row_dict["week3"]
                            + row_dict["week4"] + row_dict["week5"] + row_dict["week6"]
                        ):
                            print >> sys.stderr, "*** Filename: " + filename + ", line number: " + str(line_number)
                            print >> sys.stderr, "*** ERROR! Bad row total, doesn't match sum of weeks 1-6"
                            print >> sys.stderr, row_dict
                            print >> sys.stderr
                            break
                        for key in accumulated_row_totals_dict:
                            accumulated_row_totals_dict[key] += row_dict[key]
                        attendance_dicts.append(row_dict)

        # Buffer the current line for line folding if needed (see 'line folding' above)
        prior_line = line
        line_number += 1

    print "*** Number of attendance lines processed: " + str(num_processed_lines)
    print "*** Number of attendees: " + str(accumulated_row_totals_dict["total"])
    print

    if output_csv_filebase and event_id:
        output_csv_filename = (
            output_csv_filebase + "/" + str(year) + format(month, "02d") + "_" + str(event_names[event_id]) + ".csv"
        )
        all_columns_table = petl.fromdicts(attendance_dicts)
        petl.tocsv(all_columns_table, output_csv_filename)

    # Build 2nd list of dicts, where each list item is dict of individual date/event attendance. I.e. a row per
    # worship service date vs original attendance dicts format of a row per attendee across all weeks in month.
    # This is the actual one returned and eventually emitted into output file
    attendance_dicts2 = []
    for attendance_dict in attendance_dicts:
        for key in attendance_dict:
            if key[:4] == "week" and attendance_dict[key] != 0:
                week_index = int(key[4:5]) - 1
                if month_sundays[week_index] is not None:
                    attendance_dict2 = {}
                    full_name = attendance_dict["full_name"]
                    if full_name in full_name2sk_indiv_id:
                        attendance_dict2["Individual ID"] = full_name2sk_indiv_id[full_name]
                        if event_name == "Christmas Eve":
                            attendance_dict2["Date"] = christmas_eve_date
                        else:
                            attendance_dict2["Date"] = month_sundays[week_index]
                        attendance_dict2["Event ID"] = event_id
                        if add_extra_fields:
                            attendance_dict2["Time"] = event_times[event_id]
                            attendance_dict2["Full Name"] = full_name
                            attendance_dict2["Event Name"] = event_name
                            attendance_dict2["Week Num"] = week_index + 1
                        attendance_dicts2.append(attendance_dict2)
                    else:
                        print >> sys.stderr, '*** WARNING! Cannot find "' + full_name + '" in map'
                        print >> sys.stderr
                else:
                    print >> sys.stderr, '*** WARNING! Cannot find Sunday date for week index "' + str(week_index) + '"'
                    print >> sys.stderr

    # Check if numbers on Servant Keeper's reported Total: line match the totals we've been accumulating
    # per attendance row entry. If they don't match, show WARNING (not ERROR, since via manual checks, it appears
    # that Servant Keeper totals are buggy)
    if totals_attendance_dict:
        for key in accumulated_row_totals_dict:
            if accumulated_row_totals_dict[key] != totals_attendance_dict[key]:
                pp = pprint.PrettyPrinter(stream=sys.stderr)
                print >> sys.stderr, "*** WARNING! Servant Keeper reported totals do not match data totals"
                print >> sys.stderr, "Servant Keeper Totals:"
                pp.pprint(totals_attendance_dict)
                print >> sys.stderr, "Data Totals:"
                pp.pprint(accumulated_row_totals_dict)
                print >> sys.stderr
                break

    return_table = petl.fromdicts(attendance_dicts2)
    header = petl.header(return_table)
    if "Event Name" in header:
        return_table = petl.cut(
            return_table, "Full Name", "Event Name", "Time", "Week Num", "Date", "Event ID", "Individual ID"
        )
    else:
        return_table = petl.cut(return_table, "Date", "Event ID", "Individual ID")

    return return_table
def log_sample_table(stream):
    """Record a tabular representation of the stream sample to the log."""
    samples = list(map(lambda x: dict(zip(stream.headers, x)), stream.sample))
    table = look(fromdicts(samples), limit=len(stream.sample))
    info('Data sample =\n%s', table)
def _get_field_sample(resource_sample, field):
    """Return a subset of the relevant data column."""
    sample_table = petl.fromdicts(resource_sample)
    sample_column = list(petl.values(sample_table, field['name']))
    return sample_column
def getTable(path):
    return etl.fromdicts(loadData(path))
'''
with open('example.json', 'w') as f:
    f.write(data)

table1 = etl.fromjson('example.json')
table1


# fromdicts()
#############

import petl as etl
dicts = [{"foo": "a", "bar": 1},
         {"foo": "b", "bar": 2},
         {"foo": "c", "bar": 2}]
table1 = etl.fromdicts(dicts)
table1


# tojson()
##########

import petl as etl
table1 = [['foo', 'bar'],
          ['a', 1],
          ['b', 2],
          ['c', 2]]
etl.tojson(table1, 'example.json', sort_keys=True)
# check what it did
print(open('example.json').read())