def replace_split_transactions(table, dict_split_transaction_details):
    global g
    list_rows_to_add = []
    list_rows_to_remove = []
    for row in petl.records(table):
        if row['Account'] == 'Split Transaction':
            sk_indiv_id = row['SK Individual ID']
            batch_date = row['Batch Date']
            amount_str = row['SK Amount']
            string_key = sk_indiv_id + ',' + batch_date
            contrib_amount = Decimal(re.sub(r'[^\d.]', '', amount_str))
            if string_key in dict_split_transaction_details:
                splits_total = Decimal(0)
                for split_entry in dict_split_transaction_details[string_key]:
                    splits_total += Decimal(re.sub(r'[^\d.]', '', split_entry['Amount']))
                if contrib_amount != splits_total:
                    print "*** ERROR! For Individual ID, Batch Date " + string_key + ", the main 'Split " \
                        "Transaction' entry amount was " + str(contrib_amount) + " but sum of split detail " \
                        "transactions was " + str(splits_total)
                else:
                    list_rows_to_remove.append({
                        'SK Individual ID': sk_indiv_id,
                        'Batch Date': batch_date,
                        'SK Amount': amount_str,
                        'Account': 'Split Transaction'
                    })
                    for split_entry in dict_split_transaction_details[string_key]:
                        list_rows_to_add.append({
                            'Env #': row['Env #'],
                            'Batch Date': batch_date,
                            'SK Amount': split_entry['Amount'],
                            'Type': row['Type'],
                            'Account': split_entry['Account'],
                            'Tax': split_entry['Tax'],
                            'Check #': row['Check #'],
                            'Notes': "Inserted from 'Split Transaction'. " + row['Notes'],
                            # 'Family ID': row['Family ID'],
                            'SK Individual ID': row['SK Individual ID']
                            # 'To Date': row['To Date'],
                            # 'Contribution Link': row['Contribution Link']
                        })
            else:
                print "*** ERROR! Cannot find any 'Split Transaction' details for record with 'Batch Date' " + \
                    row['Batch Date'] + ", contributed by 'Individual ID' " + row['SK Individual ID'] + " for the " \
                    "amount " + row['SK Amount']
    print '*** Count before remove_rows(): ' + str(petl.nrows(table))
    table = remove_rows(table, list_rows_to_remove)
    print '*** Count after remove_rows(): ' + str(petl.nrows(table))
    print '*** Count before add_rows(): ' + str(petl.nrows(table))
    table = add_rows(table, list_rows_to_add)
    print '*** Count after add_rows(): ' + str(petl.nrows(table))
    return table

def sync(ctx: typer.Context,
         project: str = typer.Argument(
             ..., help='The name for the project, specified in config file'),
         since: datetime = typer.Option(..., formats=['%Y-%m-%d']),
         until: datetime = typer.Option(..., formats=['%Y-%m-%d']),
         dry: bool = typer.Option(
             False, help='Use log entries instead of uploading them to redmine'),
         drain: bool = typer.Option(
             False, help='Use drain issues for entries without specified dest')):
    config = setup_config(ctx, ctx.meta['config_path'])
    setup_http(ctx)
    ctx.meta['rdm_user'] = extract.get_redmine_user(config["redmine"]["url"])

    time_entries = get_toggl_enteries(config, project, since, until)
    issues = get_redmine_issues(config, project, since)
    issue_ids = petl.columns(issues)['id']

    entries_to_load, unset_entries = petl.biselect(
        time_entries, lambda row: row['issue_id'] in issue_ids)

    if drain and petl.nrows(unset_entries):
        log.info('Using drain')
        drained, unset_entries = drained_entries(ctx, issues, unset_entries, project)
        log.info(f'Drained {petl.nrows(drained)} issues')
        entries_to_load = petl.cat(entries_to_load, drained)

    if petl.nrows(unset_entries):
        log.warning(f'There\'re {petl.nrows(unset_entries)} unset entries')

    if get_proj_attr(config, project, 'group_entries'):
        log.info('Using group by day and description')
        entries_to_load = transform.group_entries_by_day(entries_to_load)

    load.to_redmine_time(config["redmine"]["url"],
                         entries_to_load,
                         activity_id=get_proj_attr(config, project, 'rdm_activity_id'),
                         user_id=ctx.meta['rdm_user'].get('id'),
                         dry=dry)

def num_rows(self):
    """
    `Returns:`
        int
            Number of rows in the table
    """
    return petl.nrows(self.table)

def __bool__(self):
    # Try to get a single row from our table
    head_one = petl.head(self.table)
    # See if our single row is empty
    return petl.nrows(head_one) > 0

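# A hedged usage sketch (not from the original source): petl tables are lazy,
# so petl.nrows() on a big table walks every row, while the __bool__ above only
# materialises the few rows that petl.head() keeps (5 by default).
import petl

big = petl.dummytable(1000000)          # lazily generates a million rows
print(petl.nrows(petl.head(big)) > 0)   # cheap truthiness check -> True
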
def display_store():
    response.headers['Access-Control-Allow-Origin'] = '*'
    response.headers['Content-type'] = 'application/json'
    table = (
        etl
        .fromcsv('store_locations.csv')
        .convert('Lat', float)
        .convert('Lon', float)
    )
    store_id = request.query.postcode

    # Select rows matching the requested postcode
    table1 = etl.select(table, "{Postcode}=='" + store_id + "'")

    # Fall back to the default postcode of 2000 if nothing matched
    if etl.nrows(table1) == 0:
        defaultPostCode = "2000"
        table1 = etl.select(table, "{Postcode}=='" + defaultPostCode + "'")

    # Reorder fields and return the first matching store
    print(table1)
    table2 = etl.cut(table1, 'Name', 'Lat', 'Lon').dicts()[0]
    print(table2)
    return table2

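# A hedged alternative for the string-built select above: passing a callable to
# etl.select() avoids interpolating the query-string value into petl's
# expression language. Same names as in display_store() are assumed.
table1 = etl.select(table, lambda rec: rec['Postcode'] == store_id)
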
def test_different_days():
    data = [
        ['dur', 'description', 'start'],
        [timedelta(), 'test', datetime(2000, 1, 1, 15)],
        [timedelta(), 'test', datetime(2000, 1, 2, 15)],
    ]
    result = group_entries_by_day(data)
    assert petl.nrows(result) == 2

def valuecounts(table, col_name):
    return_dict = {}
    reported_count = 0
    unreported_count = 0
    nrows = petl.nrows(table)
    non_blanks = petl.select(table, '{' + quote_single_quote(col_name) + "} != ''")
    num_blanks = nrows - petl.nrows(non_blanks)
    counts_table = petl.valuecounts(non_blanks, col_name)
    for row in petl.records(counts_table):
        if row['frequency'] > 0.01:
            return_dict[row[col_name]] = row['count']
            reported_count += row['count']
        else:
            unreported_count += row['count']
    return_dict['<other>'] = unreported_count
    return_dict['<blank>'] = num_blanks
    return return_dict

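# A hedged usage sketch for valuecounts() above; quote_single_quote() is assumed
# here to simply escape single quotes in the column name for petl's select
# expression, and petl is assumed imported at module level.
def quote_single_quote(name):
    return name.replace("'", "\\'")

demo = [['Color'], ['red'], ['red'], ['red'], ['blue'], ['']]
print(valuecounts(demo, 'Color'))
# -> {'red': 3, 'blue': 1, '<other>': 0, '<blank>': 1}
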
def test_same_day():
    data = [
        ['dur', 'description', 'start'],
        [timedelta(), 'test', datetime(2000, 1, 1, 15, 10)],
        [timedelta(), 'test', datetime(2000, 1, 1, 15, 0)],
    ]
    result = group_entries_by_day(data)
    assert petl.nrows(result) == 1

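# A minimal sketch (an assumption, not the project's actual code) of the
# group_entries_by_day() exercised by the two tests above: bucket entries by
# description and calendar day, summing the 'dur' timedeltas per bucket.
from datetime import timedelta

import petl


def group_entries_by_day(table):
    # Key each entry by its description plus the date part of 'start'
    keyed = petl.addfield(table, 'day', lambda row: row['start'].date())
    # One output row per (description, day) pair, with the summed duration
    return petl.aggregate(keyed, key=('description', 'day'),
                          aggregation=lambda durs: sum(durs, timedelta()),
                          value='dur')
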
def store_to_db(self, conn, tablename, data):
    try:
        if etl.nrows(data) == 0:
            return None
    except TypeError:
        return None

    cursor = conn.cursor()
    sql = "INSERT INTO %s (%s) " % (tablename, ','.join(etl.header(data))) + "VALUES %s"
    execute_values(cursor, sql, etl.data(data))
    conn.commit()
    conn.close()

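# A hedged usage note: execute_values() is psycopg2.extras' batched-INSERT
# helper, and etl.data() yields only the data rows (header excluded), which is
# what the single VALUES %s placeholder expects.
tbl = [['name', 'qty'], ['apple', 3], ['pear', 5]]
# store_to_db(conn, 'fruit', tbl) would run roughly:
#   INSERT INTO fruit (name,qty) VALUES %s
# with the row tuples ('apple', 3) and ('pear', 5) filled in by execute_values.
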
def vehicleP():
    while True:
        feed.ParseFromString(
            urlopen('http://gtfs.openov.nl/gtfs-rt/vehiclePositions.pb').read())
        data = []
        timer1 = datetime.now()
        timer2 = datetime.now() - timedelta(minutes=1)
        for entity in feed.entity:
            vp = entity.vehicle
            timex = datetime.fromtimestamp(vp.timestamp)
            # Only keep positions reported within the last minute
            if timex < timer1 and timex > timer2:
                x = vp.position.longitude
                y = vp.position.latitude
                time = datetime.fromtimestamp(vp.timestamp)
                geo = shape.from_shape(Point(x, y), srid=4326)
                schedule_relationship = vp.trip.schedule_relationship
                direction_id = vp.trip.direction_id
                current_stop_sequence = vp.current_stop_sequence
                current_status = vp.current_status
                trip_id = vp.trip.trip_id
                route_id = vp.trip.route_id
                stop_id = vp.stop_id
                # trip_start_time = datetime.strptime(vp.trip.start_time, '%H:%M:%S').time()
                # trip_start_date = datetime.strptime(vp.trip.start_date, "%d%m%Y").date()
                trip_start_time = vp.trip.start_time
                trip_start_date = vp.trip.start_date
                vehicle_label = vp.vehicle.label
                data.append({
                    'time': time,
                    'geo_loc': str(geo),
                    'schedule_relationship': schedule_relationship,
                    'direction_id': direction_id,
                    'current_stop_sequence': current_stop_sequence,
                    'current_status': current_status,
                    'trip_id': trip_id,
                    'route_id': route_id,
                    'stop_id': stop_id,
                    'trip_start_time': trip_start_time,
                    'trip_start_date': trip_start_date,
                    'vehicle_label': vehicle_label,
                })
        table1 = petl.fromdicts(data)
        print(petl.nrows(table1))
        petl.appenddb(table1, con, 'vehicle_positions')
        t.sleep(60)

def empty_column(self, column):
    """
    Checks if a given column is empty. Returns ``True`` if empty
    and ``False`` if not empty.

    `Args:`
        column: str
            The column name
    `Returns:`
        bool
    """
    return petl.nrows(petl.selectnotnone(self.table, column)) == 0

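# A small usage sketch (an assumption, mirroring the wrapper class above):
# selectnotnone() keeps rows where the field is not None, so a column that is
# entirely None is reported as empty.
import petl

tbl = [['id', 'email'], [1, None], [2, None]]
print(petl.nrows(petl.selectnotnone(tbl, 'email')) == 0)  # True -> 'email' is empty
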
def etl(self, record_id):
    for model in self.__models__:
        location_table = self.extract(model, record_id)
        nrows = petl.nrows(location_table)
        if nrows == 1:
            record = petl.dicts(location_table)[0]
            if self._has_observations(record):
                self._added = True
                location_id = self._post_location(record, model)
                thing_id = self._post_thing(record, model, location_id)
                self.add_package(record)
                self.observation.etl(tids=self._make_tids(thing_id, record),
                                     models=(model,))
        else:
            print(f'multiple records found for given record_id. Skipping {record_id}')

def get_toggl_enteries(config, project, since, until):
    if project not in config['project']:
        log.error('No such project in config')
        raise typer.Exit(code=1)
    else:
        project_cfg = config['project'][project]

    time_entries = extract.from_toggl_timeenteries(
        workspace=config['toggl']['workspace_id'],
        projects=project_cfg['tgl_project_id'],
        since=since.date(),
        until=until.date())

    nrows = petl.nrows(time_entries)
    if nrows == 0:
        log.info('No entries found')
        raise typer.Exit()

    time_entries = transform.parse_datetime(time_entries, ['start', 'end', 'updated'])
    time_entries = transform.parse_duration(time_entries)
    time_entries = transform.add_issue_id_from_description(time_entries)
    return time_entries

def get_context_data(self, **kwargs):
    context = super().get_context_data(**kwargs)
    context["filename"] = self.object.downloaded_file.name.split(os.path.sep)[-1]
    context["count_query_kwarg"] = self.count_query_kwarg
    table = petl.fromcsv(self.object.downloaded_file)
    context["header"] = petl.header(table)
    try:
        record_count_to_show = int(self.request.GET.get(self.count_query_kwarg))
    except (TypeError, ValueError):
        record_count_to_show = self.count_increment
    # Potentially expensive, cache / save in database for dataset
    if petl.nrows(table) > record_count_to_show:
        context["load_more_url"] = (
            f"{self.request.path}?{self.count_query_kwarg}"
            f"={record_count_to_show + self.count_increment}"
        )
    context["rows"] = petl.records(petl.head(table, record_count_to_show))
    return context

def tsv_fix(base_path, new_file_name, pk_list, illegal_columns_lower_case, tsv_process):
    if tsv_process:
        pwb_replace_in_file(new_file_name, '\0', '')  # Remove null bytes

    table = etl.fromcsv(new_file_name, delimiter='\t', skipinitialspace=True,
                        quoting=csv.QUOTE_NONE, quotechar='', escapechar='')
    row_count = etl.nrows(table)

    if tsv_process:
        tempfile = NamedTemporaryFile(mode='w', dir=base_path + "/content/data/", delete=False)
        table = pwb_lower_case_header(table)
        table = etl.rename(table, illegal_columns_lower_case, strict=False)
        print(new_file_name)
        for pk in pk_list:
            table = etl.convert(table, pk.lower(), lambda a: a if len(str(a)) > 0 else '-')
        writer = csv.writer(tempfile, delimiter='\t', quoting=csv.QUOTE_NONE,
                            quotechar='', escapechar='', lineterminator='\n')
        writer.writerows(table)
        shutil.move(tempfile.name, new_file_name)

    return row_count

def export_process(self):
    # Check self.data before counting rows so petl.nrows() is never called on None
    if self.data and petl.nrows(self.data) > 0:
        mappings = OrderedDict()
        mappings['Location'] = 'location'
        mappings['Position'] = 'position', lambda rec: construct_postion(rec)
        mappings['Local Time'] = 'time', lambda rec: datetime.fromtimestamp(rec).strftime("%Y-%m-%d %H:%M:%S")
        mappings['Conditions Time'] = 'summary', lambda rec: populate_summary(rec)
        mappings['Temperature'] = 'temperature'
        mappings['Pressure'] = 'pressure'
        mappings['Humidity'] = 'humidity', lambda rec: int(rec * 100)
        self.data = petl.fieldmap(self.data, mappings)
        return True
    else:
        print("Data store doesn't have historic data. Please run the import and export job.")
        return False

""" DB-related tests, separated from main unit tests because they need local database setup prior to running. """ import sys sys.path.insert(0, './src') from petl import dummytable, sort, nrows import logging logging.basicConfig(level=logging.DEBUG) t = (('foo', 'bar'), ('C', 2), ('A', 9), ('B', 6), ('E', 1), ('D', 10)) u = sort(t, buffersize=3) print 'buffer up the data' print nrows(u) print 'create iterators' it1 = iter(u) it2 = iter(u) print 'iterate' print 1, it1.next() print 1, it1.next() print 1, it1.next() print 2, it2.next() print 2, it2.next() print 1, it1.next() print 1, it1.next()
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-csv-filename", required=True,
                        help="Input UTF8 CSV to summarize")
    parser.add_argument("--sep-columns", required=False, nargs='*', default=argparse.SUPPRESS,
                        help="Column names of columns containing comma- or semicolon-separated values")
    parser.add_argument("--sep-character", required=False,
                        help="Character used to separate values in multi-value fields. "
                             "Defaults to ';' if not specified.")
    parser.add_argument("--skip-columns", required=False, nargs='*', default=argparse.SUPPRESS,
                        help="Column names to NOT generate stats for")
    parser.add_argument("--skip-num-rows", required=False, type=int,
                        help="Skip specified number of header rows")
    parser.add_argument("--first-ccb-column", required=False,
                        help="String name of first CCB column. If specified, all preceding "
                             "columns will be labeled 'Servant Keeper' and this column and "
                             "all subsequent will be labeled 'CCB'")
    args = parser.parse_args()

    if args.first_ccb_column is not None:
        column_prefix = 'Servant Keeper '
    else:
        column_prefix = ''

    assert os.path.isfile(args.input_csv_filename), \
        "Error: cannot open file '" + args.input_csv_filename + "'"

    table = petl.fromcsv(args.input_csv_filename)

    # Skip header rows
    if args.skip_num_rows:
        skip_num = args.skip_num_rows
        assert skip_num > 0, "--skip-num-rows value '" + str(skip_num) + \
            "' is invalid. Must be positive."
        it = iter(table)
        while skip_num >= 0:
            row = next(it)
            skip_num -= 1
        table = petl.setheader(table, row)
        table = petl.tail(table, petl.nrows(table) - args.skip_num_rows)

    # Print nicely formatted stats for each column
    sep = ''
    args_dict = vars(args)
    skip_columns_specified = 'skip_columns' in args_dict
    sep_columns_specified = 'sep_columns' in args_dict
    sep_char_specified = 'sep_character' in args_dict
    for column in petl.header(table):
        if args.first_ccb_column is not None and column == args.first_ccb_column:
            column_prefix = 'CCB '
        if not skip_columns_specified or column not in args.skip_columns:
            output_str = column_prefix + "Column '" + column + "'"
            print sep + output_str
            print >> sys.stderr, output_str
            # args.sep_columns only exists when the flag was given
            # (default=argparse.SUPPRESS), so test for presence first
            if sep_columns_specified and column in args.sep_columns:
                if sep_char_specified:
                    sep_character = args.sep_character
                else:
                    sep_character = ';'
                output_str = num_dict2str(dict_dump(sep_valuecounter(table, column, sep_character)))
                print output_str
            else:
                output_str = num_dict2str(dict_dump(valuecounts(table, column)))
                print output_str
            sep = '\n'

    # Flush to ensure all output is written
    sys.stdout.flush()
    sys.stderr.flush()

def convert_folder(base_source_dir, base_target_dir, tmp_dir, tika=False, ocr=False,
                   merge=False, tsv_source_path=None, tsv_target_path=None,
                   make_unique=True, sample=False, zip=False):
    # WAIT: Add a GUI option for choosing whether to OCR-process
    txt_target_path = base_target_dir + '_result.txt'
    json_tmp_dir = base_target_dir + '_tmp'
    converted_now = False
    errors = False
    originals = False

    if merge is False:  # TODO: Are both arguments needed?
        make_unique = False

    if tsv_source_path is None:
        tsv_source_path = base_target_dir + '.tsv'
    else:
        txt_target_path = os.path.splitext(tsv_source_path)[1][1:] + '_result.txt'

    if tsv_target_path is None:
        tsv_target_path = base_target_dir + '_processed.tsv'

    if os.path.exists(tsv_target_path):
        os.remove(tsv_target_path)

    Path(base_target_dir).mkdir(parents=True, exist_ok=True)

    # TODO: Does mime show directly whether it is PDF/A, or must an extra field be
    #       checked in the two calls below? Pre-check for Tika and Siegfried?
    # TODO: Is this TSV check needed here? It is done before this function is
    #       called, so possibly unnecessary
    if not os.path.isfile(tsv_source_path):
        if tika:
            run_tika(tsv_source_path, base_source_dir, json_tmp_dir, zip)
        else:
            run_siegfried(base_source_dir, tmp_dir, tsv_source_path, zip)

    # TODO: Add a test that the TSV file is not empty
    replace_text_in_file(tsv_source_path, '\0', '')

    table = etl.fromtsv(tsv_source_path)
    table = etl.rename(table,
                       {
                           'filename': 'source_file_path',
                           'tika_batch_fs_relative_path': 'source_file_path',
                           'filesize': 'file_size',
                           'mime': 'mime_type',
                           'Content_Type': 'mime_type',
                           'Version': 'version'
                       },
                       strict=False)

    thumbs_table = etl.select(table, lambda rec: Path(rec.source_file_path).name == 'Thumbs.db')
    if etl.nrows(thumbs_table) > 0:
        thumbs_paths = etl.values(thumbs_table, 'source_file_path')
        for path in thumbs_paths:
            if '/' not in path:
                path = os.path.join(base_source_dir, path)
            if os.path.isfile(path):
                os.remove(path)

        table = etl.select(table, lambda rec: Path(rec.source_file_path).name != 'Thumbs.db')

    table = etl.select(table, lambda rec: rec.source_file_path != '')
    table = etl.select(table, lambda rec: '#' not in rec.source_file_path)
    # WAIT: The line above is not a fully reliable check for embedded documents,
    #       since '#' can actually occur in file names

    row_count = etl.nrows(table)
    file_count = sum([len(files) for r, d, files in os.walk(base_source_dir)])

    if row_count == 0:
        print('No files to convert. Exiting.')
        return 'Error', file_count
    elif file_count != row_count:
        print('Row count: ' + str(row_count))
        print('File count: ' + str(file_count))
        print("Files listed in '" + tsv_source_path + "' don't match files on disk. Exiting.")
        return 'Error', file_count
    elif not zip:
        print('Converting files..')

    # WAIT: Add a check on file size before and after conversion
    append_fields = ('version', 'norm_file_path', 'result', 'original_file_copy', 'id')
    table = add_fields(append_fields, table)

    cut_fields = ('0', '1', 'X_TIKA_EXCEPTION_runtime', 'X_TIKA_EXCEPTION_warn')
    table = remove_fields(cut_fields, table)

    header = etl.header(table)
    append_tsv_row(tsv_target_path, header)

    # Treat csv (detected from extension only) as plain text:
    table = etl.convert(table, 'mime_type',
                        lambda v, row: 'text/plain' if row.id == 'x-fmt/18' else v,
                        pass_row=True)
    # Update for missing mime types where id is known:
    table = etl.convert(table, 'mime_type',
                        lambda v, row: 'application/xml' if row.id == 'fmt/979' else v,
                        pass_row=True)

    if os.path.isfile(txt_target_path):
        os.remove(txt_target_path)

    data = etl.dicts(table)
    count = 0
    for row in data:
        count += 1
        count_str = '(' + str(count) + '/' + str(file_count) + '): '
        source_file_path = row['source_file_path']
        if '/' not in source_file_path:
            source_file_path = os.path.join(base_source_dir, source_file_path)

        mime_type = row['mime_type']
        # TODO: Does not work when Tika is used -> find out why
        if ';' in mime_type:
            mime_type = mime_type.split(';')[0]

        version = row['version']
        result = None
        old_result = row['result']

        if not mime_type:
            if os.path.islink(source_file_path):
                mime_type = 'n/a'

            # kind = filetype.guess(source_file_path)
            extension = os.path.splitext(source_file_path)[1][1:].lower()
            if extension == 'xml':
                mime_type = 'application/xml'

        if not zip:
            print_path = os.path.relpath(source_file_path, Path(base_source_dir).parents[1])
            print(count_str + '.../' + print_path + ' (' + mime_type + ')')

        if mime_type not in mime_to_norm.keys():
            # print("|" + mime_type + "|")
            errors = True
            converted_now = True
            result = 'Conversion not supported'
            append_txt_file(txt_target_path, result + ': ' + source_file_path + ' (' + mime_type + ')')
            row['norm_file_path'] = ''
            row['original_file_copy'] = ''
        else:
            keep_original = mime_to_norm[mime_type][0]
            if keep_original:
                originals = True

            if zip:
                keep_original = False

            function = mime_to_norm[mime_type][1]

            # Ensure unique file names in dir hierarchy:
            norm_ext = mime_to_norm[mime_type][2]
            if not norm_ext:
                norm_ext = 'none'

            if make_unique:
                norm_ext = (base64.b32encode(bytes(str(count), encoding='ascii'))
                            ).decode('utf8').replace('=', '').lower() + '.' + norm_ext

            target_dir = os.path.dirname(source_file_path.replace(base_source_dir, base_target_dir))
            normalized = file_convert(source_file_path, mime_type, function, target_dir,
                                      tmp_dir, None, norm_ext, version, ocr, keep_original, zip=zip)

            if normalized['result'] == 0:
                errors = True
                result = 'Conversion failed'
                append_txt_file(txt_target_path, result + ': ' + source_file_path + ' (' + mime_type + ')')
            elif normalized['result'] == 1:
                result = 'Converted successfully'
                converted_now = True
            elif normalized['result'] == 2:
                errors = True
                result = 'Conversion not supported'
                append_txt_file(txt_target_path, result + ': ' + source_file_path + ' (' + mime_type + ')')
            elif normalized['result'] == 3:
                if old_result not in ('Converted successfully', 'Manually converted'):
                    result = 'Manually converted'
                    converted_now = True
                else:
                    result = old_result
            elif normalized['result'] == 4:
                converted_now = True
                errors = True
                result = normalized['error']
                append_txt_file(txt_target_path, result + ': ' + source_file_path + ' (' + mime_type + ')')
            elif normalized['result'] == 5:
                result = 'Not a document'

            if normalized['norm_file_path']:
                row['norm_file_path'] = relpath(normalized['norm_file_path'], base_target_dir)

            file_copy_path = normalized['original_file_copy']
            if file_copy_path:
                file_copy_path = relpath(file_copy_path, base_target_dir)
            row['original_file_copy'] = file_copy_path

        row['result'] = result
        row_values = list(row.values())

        # TODO: Fixed by adding escapechar='\\' in append_tsv_row -> will that cause problems later?
        # row_values = [r.replace('\n', ' ') for r in row_values if r is not None]
        append_tsv_row(tsv_target_path, row_values)

        if sample and count > 9:
            break

    if not sample:
        shutil.move(tsv_target_path, tsv_source_path)

    # TODO: Add an option so that if merge = True all files are copied to a
    #       top-level folder and empty subfolders are then deleted

    msg = None
    if sample:
        msg = 'Sample files converted.'
        if errors:
            msg = "Not all sample files were converted. See '" + txt_target_path + "' for details."
    else:
        if converted_now:
            msg = 'All files converted successfully.'
            if errors:
                msg = "Not all files were converted. See '" + txt_target_path + "' for details."
        else:
            msg = 'All files converted previously.'

    return msg, file_count, errors, originals
    # TODO: Fix so this is instead used for a final summary when multiple folders have been converted

n_table = etl.fromdb(data_in, 'SELECT * FROM Names')
m_table = etl.fromdb(data_in, 'SELECT * FROM Movies')
r_table = etl.fromdb(data_in, 'SELECT * FROM Ratings')
tp_table = etl.fromdb(data_in, 'SELECT * FROM Title_principals')
print("Extract DONE")

# TRANSFORM
# movie personnel
d_movie_personnel = etl.cut(n_table, 'imdb_name_id', 'name', 'birth_name')

# title
d_title = etl.cut(m_table, 'imdb_title_id', 'title', 'original_title')

# genre: one UUID surrogate key per distinct genre
d_genre = etl.distinct(etl.cut(m_table, 'genre'))
rows = etl.nrows(d_genre)
generated = []
# print(rows)
for i in range(rows):
    out_cursor.execute('SELECT UUID();')
    uuid = out_cursor.fetchone()[0]
    generated.append(uuid)
d_genre = etl.addcolumn(d_genre, 'genre_id', generated)

# date
d_date = etl.distinct(etl.cut(m_table, 'year', 'date_published'))
rows = etl.nrows(d_date)
generated = []
for i in range(rows):
    out_cursor.execute('SELECT UUID();')
    uuid = out_cursor.fetchone()[0]

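# A hedged alternative to the per-row database round-trips above: Python's uuid
# module can mint the surrogate keys locally. uuid4() produces random UUIDs,
# comparable in role (though not in version) to MySQL's UUID().
import uuid
import petl as etl

d_genre = etl.distinct(etl.cut(m_table, 'genre'))
generated = [str(uuid.uuid4()) for _ in range(etl.nrows(d_genre))]
d_genre = etl.addcolumn(d_genre, 'genre_id', generated)
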
CLEAN_UP = 1

# print(etl.head(src_table, 1))


def debug(table, columns):
    if DEBUG:
        print(etl.cut(table, columns))


def clean_up(table, column):
    if CLEAN_UP:
        return etl.cutout(table, column)


table = src_table
print('TOTAL SOURCE ROWS = ' + str(etl.nrows(table)))
print('SOURCE HEADERS = ' + str(etl.header(table)))

# UNUSED COLUMNS
if CLEAN_UP:
    table = clean_up(table, 'rcv_nm')
    table = clean_up(table, 'recp_cd')
    table = clean_up(table, 'ins_ind')
    table = clean_up(table, 'geo_ind')
    table = clean_up(table, 'cid')
    table = clean_up(table, 'occ_typ')

print('TRIMMED HEADERS = ' + str(etl.header(table)))

table = etl.select(table, 'occ_dt', lambda x: x > datetime(2000, 1, 1))
print('ROWS POST YR 2000 = ' + str(etl.nrows(table)))

def test_fromxml_url():
    tbl = fromxml('http://feeds.bbci.co.uk/news/rss.xml', './/item', 'title')
    print tbl
    assert nrows(tbl) > 0

def etl(self, tids=None, models=None):
    if models is None:
        models = self.__models__

    self._sensor_id = self._add_sensor()
    self._observed_properties = {}

    # add the observed properties
    for m in models:
        payload = m.observed_property_payload
        payload['name'] = m.name
        self._observed_properties[m.name] = self._post_unique_item(
            'ObservedProperties', m.observed_property_payload)

    added = False
    for thing in tids:
        thing_id = thing['@iot.id']
        # point_id = thing['@nmbgmr.point_id']
        for m in models:
            print(f'Add {m.name}')
            ds_id = self._get_datastream(thing_id, m.datastream_payload['name'])
            skip_nobs = 0
            if ds_id:
                print(f'Got datastream thing={thing_id}, ds={ds_id}')
                # check the number of obs for this datastream matches nrows
                skip_nobs = self._get_nobservations(ds_id)
                if skip_nobs:
                    print(f'Skipping nobs={skip_nobs}')

            wt = self._extract(thing, m, skip_nobs)
            if isinstance(wt, list):
                nrows = len(wt)
            else:
                nrows = petl.nrows(wt)

            if nrows:
                added = True
                print(f'Adding nobs={nrows}')
                if not ds_id:
                    print('Adding datastream')
                    sensor_id = self._sensor_id
                    if self._sensor_id is None:
                        # extract the sensor from the first record
                        sensor_id = self._add_sensor(wt)

                    ds_id = self._add_datastream(thing_id,
                                                 self._observed_properties[m.name],
                                                 sensor_id,
                                                 m.datastream_payload)

                # r = self._make_resource(m)
                # self.ckan_importer.add_resource(r)

                # add observations to datastream
                self._add_observations(ds_id, wt, m)
            else:
                print('no obs to add')

    return added

def typeInference(table):
    for h in etl.header(table):
        col = etl.cut(table, h)
        print etl.nrows(col)

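# The nrows() call above only reports each column's row count; for actual type
# inference, petl ships typecounter(), which tallies the Python types of the
# values in a field. A hedged alternative sketch:
import petl as etl

def type_inference(table):
    for h in etl.header(table):
        # e.g. Counter({'str': 3, 'int': 1, 'NoneType': 1})
        print(etl.typecounter(table, h))
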
dor_parcel_read_stmt = '''
    select parcel_id, street_address, address_low, address_low_suffix, address_low_frac,
    address_high, street_predir, street_name, street_suffix, street_postdir, street_full
    from {dor_parcel_table}
'''.format(dor_parcel_table='dor_parcel')
engine_dor_parcel_rows = etl.fromdb(pg_db, dor_parcel_read_stmt)
if DEV:
    print(etl.look(engine_dor_parcel_rows))

# Get duplicate parcel_ids:
non_unique_parcel_id_rows = engine_dor_parcel_rows.duplicates(key='parcel_id')
unique_parcel_id_rows = etl.complement(engine_dor_parcel_rows, non_unique_parcel_id_rows)

# Get address comps for condos by joining to dor_parcel with unique parcel_id on parcel_id:
print("Relating condos to parcels...")
joined = etl.join(source_dor_condo_rows, unique_parcel_id_rows, key='parcel_id') \
    .convert('street_address', lambda a, row: row.street_address + ' # ' + row.unit_num, pass_row=True)
print("joined rowcount: ", etl.nrows(joined))
if DEV:
    print(etl.look(joined))

# Calculate errors
print("Calculating errors...")
unjoined = etl.antijoin(source_dor_condo_rows, joined, key='source_object_id')
print("unjoined rowcount: ", etl.nrows(unjoined))
dor_condos_unjoined_unmatched = etl.antijoin(unjoined, non_unique_parcel_id_rows, key='parcel_id') \
    .addfield('reason', 'non-active/remainder mapreg')
print("non-active/remainder mapreg error rowcount: ", etl.nrows(dor_condos_unjoined_unmatched))
if DEV:
    print(etl.look(dor_condos_unjoined_unmatched))
dor_condos_unjoined_duplicates = etl.antijoin(unjoined, dor_condos_unjoined_unmatched, key='source_object_id') \
    .addfield('reason', 'non-unique active/remainder mapreg')
print("non-unique active/remainder mapreg error rowcount: ", etl.nrows(dor_condos_unjoined_duplicates))
if DEV:
    print(etl.look(dor_condos_unjoined_duplicates))

from __future__ import division, print_function, absolute_import


# nrows()
#########

import petl as etl
table = [['foo', 'bar'], ['a', 1], ['b', 2]]
etl.nrows(table)


# valuecount()
##############

import petl as etl
table = [['foo', 'bar'], ['a', 1], ['b', 2], ['b', 7]]
etl.valuecount(table, 'foo', 'b')


# valuecounter()
################

import petl as etl
table = [['foo', 'bar'], ['a', True], ['b'], ['b', True], ['c', False]]
etl.valuecounter(table, 'foo')

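# Expected results for the three calls above (from petl's documented behaviour;
# exact reprs may vary by version): nrows() counts data rows only, valuecount()
# returns a (count, frequency) pair, and valuecounter() returns a Counter.
# etl.nrows(table)                  -> 2
# etl.valuecount(table, 'foo', 'b') -> (2, 0.666...)
# etl.valuecounter(table, 'foo')    -> Counter({'b': 2, 'a': 1, 'c': 1})
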
""" DB-related tests, separated from main unit tests because they need local database setup prior to running. """ import sys sys.path.insert(0, './src') from petl import cache, nrows import logging logging.basicConfig(level=logging.DEBUG) t = (('foo', 'bar'), ('C', 2), ('A', 9), ('B', 6), ('E', 1), ('D', 10)) u = cache(t) nrows(u) nrows(u)