def transform_to_petl(data):
    isodate = etl.dateparser("%Y-%m-%d")
    data = etl.fromdataframe(data)
    data = etl.rename(data, {"index": "Date", "VALUE": "Value"})
    data = etl.convert(data, {"Date": lambda d: d[:10]})
    data = etl.convert(data, {"Date": lambda d: isodate(d)})
    return data
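
# Illustrative usage sketch (not part of the original snippet): assumes pandas
# and petl are installed and that the DataFrame's date index has been reset
# into an 'index' column, which is what the rename above expects.
import pandas as pd
import petl as etl

df = pd.DataFrame({"VALUE": [1.5, 2.25]},
                  index=["2021-01-04T00:00:00", "2021-01-05T00:00:00"])
df = df.reset_index()  # expose the index as an 'index' column for fromdataframe()
tbl = transform_to_petl(df)
print(etl.look(tbl))  # 'index' becomes 'Date' (parsed to dates), 'VALUE' becomes 'Value'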
def transform(data, data_set):
    data = data['observations']
    data = etl.fromdicts(data, header=['value', 'realtime_start', 'realtime_end', 'date'])
    data = etl.cut(data, 'date', 'value')
    data = etl.rename(data, {'date': 'date', 'value': data_set.lower()})
    data = etl.convert(data, data_set.lower(), lambda val: 0 if val == '.' else val)
    return data
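
# Illustrative usage sketch (not part of the original snippet): the payload
# mimics a FRED-style 'observations' response; the series name "GDP" and the
# values are invented for demonstration only.
import petl as etl

payload = {
    "observations": [
        {"realtime_start": "2023-01-01", "realtime_end": "2023-01-01",
         "date": "2022-10-01", "value": "26137.992"},
        {"realtime_start": "2023-01-01", "realtime_end": "2023-01-01",
         "date": "2023-01-01", "value": "."},  # '.' marks a missing value and becomes 0
    ]
}
tbl = transform(payload, "GDP")
print(etl.look(tbl))  # two columns remain: 'date' and 'gdp'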
def unpackcall(tbl, *keys, **kwargs):
    """
    Unpack the call column. E.g.::

        >>> from petlx.vcf import fromvcf, unpackinfo, meltsamples, unpackcall
        >>> from petl import look, cutout
        >>> t1 = fromvcf('../fixture/sample.vcf')
        >>> t2 = meltsamples(t1)
        >>> t3 = unpackcall(t2)
        >>> t4 = cutout(t3, 'INFO')
        >>> look(t4)
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | 'CHROM' | 'POS' | 'ID'        | 'REF' | 'ALT' | 'QUAL' | 'FILTER' | 'SAMPLE'  | 'GT'  | 'GQ' | 'DP' | 'HQ'         |
        +=========+=======+=============+=======+=======+========+==========+===========+=======+======+======+==============+
        | '19'    | 111   | None        | 'A'   | [C]   | 9.6    | []       | 'NA00001' | '0|0' | None | None | [10, 10]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    | 111   | None        | 'A'   | [C]   | 9.6    | []       | 'NA00002' | '0|0' | None | None | [10, 10]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    | 111   | None        | 'A'   | [C]   | 9.6    | []       | 'NA00003' | '0/1' | None | None | [3, 3]       |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    | 112   | None        | 'A'   | [G]   | 10     | []       | 'NA00001' | '0|0' | None | None | [10, 10]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    | 112   | None        | 'A'   | [G]   | 10     | []       | 'NA00002' | '0|0' | None | None | [10, 10]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    | 112   | None        | 'A'   | [G]   | 10     | []       | 'NA00003' | '0/1' | None | None | [3, 3]       |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '20'    | 14370 | 'rs6054257' | 'G'   | [A]   | 29     | []       | 'NA00001' | '0|0' | 48   | 1    | [51, 51]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '20'    | 14370 | 'rs6054257' | 'G'   | [A]   | 29     | []       | 'NA00002' | '1|0' | 48   | 8    | [51, 51]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '20'    | 14370 | 'rs6054257' | 'G'   | [A]   | 29     | []       | 'NA00003' | '1/1' | 43   | 5    | [None, None] |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '20'    | 17330 | None        | 'T'   | [A]   | 3      | ['q10']  | 'NA00001' | '0|0' | 49   | 3    | [58, 50]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+

    .. versionadded:: 0.5

    """
    if not keys:
        if hasattr(tbl, 'filename'):
            try:
                import vcf as pyvcf
            except ImportError as e:
                raise UnsatisfiedDependency(e, dep_message)
            reader = pyvcf.Reader(filename=tbl.filename)
            # all FORMAT
            keys = reader.formats.keys()
        else:
            tbl = convert(tbl, 'CALL', lambda v: v.data._asdict())
            # enable sampling of keys from data
    result = unpackdict(tbl, 'CALL', keys=keys)
    if 'prefix' in kwargs:
        result = rename(result, {k: kwargs['prefix'] + k for k in keys})
    if hasattr(tbl, 'filename'):
        return VCFWrapper(result, tbl.filename)
    else:
        return result
def convert_folder(base_source_dir, base_target_dir, tmp_dir, tika=False,
                   ocr=False, merge=False, tsv_source_path=None,
                   tsv_target_path=None, make_unique=True, sample=False,
                   zip=False):
    # WAIT: Add a GUI option for choosing whether files should be OCR-processed
    txt_target_path = base_target_dir + '_result.txt'
    json_tmp_dir = base_target_dir + '_tmp'
    converted_now = False
    errors = False
    originals = False
    if merge is False:  # TODO: Are both arguments needed?
        make_unique = False

    if tsv_source_path is None:
        tsv_source_path = base_target_dir + '.tsv'
    else:
        txt_target_path = os.path.splitext(
            tsv_source_path)[1][1:] + '_result.txt'

    if tsv_target_path is None:
        tsv_target_path = base_target_dir + '_processed.tsv'

    if os.path.exists(tsv_target_path):
        os.remove(tsv_target_path)

    Path(base_target_dir).mkdir(parents=True, exist_ok=True)

    # TODO: Does mime show directly whether a file is PDF/A, or must the extra
    #       fields in the two calls below be checked? Pre-check for Tika and Siegfried?
    # TODO: Is this TSV check needed here? A check is done before this function
    #       is called, so it may be redundant.
    if not os.path.isfile(tsv_source_path):
        if tika:
            run_tika(tsv_source_path, base_source_dir, json_tmp_dir, zip)
        else:
            run_siegfried(base_source_dir, tmp_dir, tsv_source_path, zip)

    # TODO: Add a test that the TSV file is not empty
    replace_text_in_file(tsv_source_path, '\0', '')

    table = etl.fromtsv(tsv_source_path)
    table = etl.rename(table,
                       {
                           'filename': 'source_file_path',
                           'tika_batch_fs_relative_path': 'source_file_path',
                           'filesize': 'file_size',
                           'mime': 'mime_type',
                           'Content_Type': 'mime_type',
                           'Version': 'version'
                       },
                       strict=False)

    thumbs_table = etl.select(
        table, lambda rec: Path(rec.source_file_path).name == 'Thumbs.db')
    if etl.nrows(thumbs_table) > 0:
        thumbs_paths = etl.values(thumbs_table, 'source_file_path')
        for path in thumbs_paths:
            if '/' not in path:
                path = os.path.join(base_source_dir, path)
            if os.path.isfile(path):
                os.remove(path)

        table = etl.select(
            table, lambda rec: Path(rec.source_file_path).name != 'Thumbs.db')

    table = etl.select(table, lambda rec: rec.source_file_path != '')
    table = etl.select(table, lambda rec: '#' not in rec.source_file_path)
    # WAIT: The line above is not a complete check for embedded documents,
    #       since '#' can actually occur in file names
    row_count = etl.nrows(table)
    file_count = sum([len(files) for r, d, files in os.walk(base_source_dir)])

    if row_count == 0:
        print('No files to convert. Exiting.')
        return 'Error', file_count
    elif file_count != row_count:
        print('Row count: ' + str(row_count))
        print('File count: ' + str(file_count))
        print("Files listed in '" + tsv_source_path +
              "' doesn't match files on disk. Exiting.")
        return 'Error', file_count
    elif not zip:
        print('Converting files..')

    # WAIT: Add a check on file size before and after conversion
    append_fields = ('version', 'norm_file_path', 'result',
                     'original_file_copy', 'id')
    table = add_fields(append_fields, table)

    cut_fields = ('0', '1', 'X_TIKA_EXCEPTION_runtime', 'X_TIKA_EXCEPTION_warn')
    table = remove_fields(cut_fields, table)

    header = etl.header(table)
    append_tsv_row(tsv_target_path, header)

    # Treat csv (detected from extension only) as plain text:
    table = etl.convert(table, 'mime_type',
                        lambda v, row: 'text/plain' if row.id == 'x-fmt/18' else v,
                        pass_row=True)

    # Update for missing mime types where id is known:
    table = etl.convert(table, 'mime_type',
                        lambda v, row: 'application/xml' if row.id == 'fmt/979' else v,
                        pass_row=True)

    if os.path.isfile(txt_target_path):
        os.remove(txt_target_path)

    data = etl.dicts(table)
    count = 0
    for row in data:
        count += 1
        count_str = ('(' + str(count) + '/' + str(file_count) + '): ')
        source_file_path = row['source_file_path']
        if '/' not in source_file_path:
            source_file_path = os.path.join(base_source_dir, source_file_path)

        mime_type = row['mime_type']
        # TODO: Does not work when Tika has been used -> find out why
        if ';' in mime_type:
            mime_type = mime_type.split(';')[0]

        version = row['version']
        result = None
        old_result = row['result']

        if not mime_type:
            if os.path.islink(source_file_path):
                mime_type = 'n/a'

            # kind = filetype.guess(source_file_path)
            extension = os.path.splitext(source_file_path)[1][1:].lower()
            if extension == 'xml':
                mime_type = 'application/xml'

        if not zip:
            print_path = os.path.relpath(source_file_path,
                                         Path(base_source_dir).parents[1])
            print(count_str + '.../' + print_path + ' (' + mime_type + ')')

        if mime_type not in mime_to_norm.keys():
            # print("|" + mime_type + "|")
            errors = True
            converted_now = True
            result = 'Conversion not supported'
            append_txt_file(
                txt_target_path,
                result + ': ' + source_file_path + ' (' + mime_type + ')')
            row['norm_file_path'] = ''
            row['original_file_copy'] = ''
        else:
            keep_original = mime_to_norm[mime_type][0]
            if keep_original:
                originals = True

            if zip:
                keep_original = False

            function = mime_to_norm[mime_type][1]

            # Ensure unique file names in dir hierarchy:
            norm_ext = mime_to_norm[mime_type][2]
            if not norm_ext:
                norm_ext = 'none'

            if make_unique:
                norm_ext = (base64.b32encode(
                    bytes(str(count), encoding='ascii'))).decode('utf8').replace(
                        '=', '').lower() + '.' + norm_ext

            target_dir = os.path.dirname(
                source_file_path.replace(base_source_dir, base_target_dir))
            normalized = file_convert(source_file_path, mime_type, function,
                                      target_dir, tmp_dir, None, norm_ext,
                                      version, ocr, keep_original, zip=zip)

            if normalized['result'] == 0:
                errors = True
                result = 'Conversion failed'
                append_txt_file(
                    txt_target_path,
                    result + ': ' + source_file_path + ' (' + mime_type + ')')
            elif normalized['result'] == 1:
                result = 'Converted successfully'
                converted_now = True
            elif normalized['result'] == 2:
                errors = True
                result = 'Conversion not supported'
                append_txt_file(
                    txt_target_path,
                    result + ': ' + source_file_path + ' (' + mime_type + ')')
            elif normalized['result'] == 3:
                if old_result not in ('Converted successfully',
                                      'Manually converted'):
                    result = 'Manually converted'
                    converted_now = True
                else:
                    result = old_result
            elif normalized['result'] == 4:
                converted_now = True
                errors = True
                result = normalized['error']
                append_txt_file(
                    txt_target_path,
                    result + ': ' + source_file_path + ' (' + mime_type + ')')
            elif normalized['result'] == 5:
                result = 'Not a document'

            if normalized['norm_file_path']:
                row['norm_file_path'] = relpath(normalized['norm_file_path'],
                                                base_target_dir)

            file_copy_path = normalized['original_file_copy']
            if file_copy_path:
                file_copy_path = relpath(file_copy_path, base_target_dir)
            row['original_file_copy'] = file_copy_path

        row['result'] = result
        row_values = list(row.values())

        # TODO: Fixed by adding escapechar='\\' in append_tsv_row -> will that
        #       cause problems later?
        # row_values = [r.replace('\n', ' ') for r in row_values if r is not None]
        append_tsv_row(tsv_target_path, row_values)

        if sample and count > 9:
            break

    if not sample:
        shutil.move(tsv_target_path, tsv_source_path)
        # TODO: Add an option so that when merge is true all files are copied
        #       to a top-level folder and empty subfolders are then deleted

    msg = None
    if sample:
        msg = 'Sample files converted.'
        if errors:
            msg = "Not all sample files were converted. See '" + txt_target_path + "' for details."
    else:
        if converted_now:
            msg = 'All files converted successfully.'
            if errors:
                msg = "Not all files were converted. See '" + txt_target_path + "' for details."
        else:
            msg = 'All files converted previously.'

    return msg, file_count, errors, originals
    # TODO: Fix so this is used instead for the final summary when multiple
    #       folders have been converted
def substitute(table, field, pattern, repl, count=0, flags=0):
    program = re.compile(pattern, flags)
    convert = lambda tempData: program.sub(repl, tempData, count=count)
    return pt.convert(table, field, convert)
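
# Illustrative usage sketch (table contents invented): strips every non-digit
# character from a 'phone' column using the substitute() helper above.
import petl as pt

contacts = [["name", "phone"],
            ["Alice", "555-1234"],
            ["Bob", "555.5678"]]
cleaned = substitute(contacts, "phone", r"\D", "")
print(pt.look(cleaned))  # phone values become '5551234' and '5555678'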
from __future__ import division, print_function, absolute_import


# progress()
############

import petl as etl
table = etl.dummytable(100000)
table.progress(10000).tocsv('example.csv')


# clock()
#########

import petl as etl
t1 = etl.dummytable(100000)
c1 = etl.clock(t1)
t2 = etl.convert(c1, 'foo', lambda v: v**2)
c2 = etl.clock(t2)
p = etl.progress(c2, 10000)
etl.tocsv(p, 'example.csv')
# time consumed retrieving rows from t1
c1.time
# time consumed retrieving rows from t2
c2.time
# actual time consumed by the convert step
c2.time - c1.time
    'day': date
}
headers = {'content-type': 'application/x-www-form-urlencoded'}

# gets the information from ppms and creates a csv file
r = requests.post(url, data=payload, headers=headers)
f = open('todays_bookings.csv', 'w')
f.write(r.text)
f.close()

# Load the table
table1 = petl.fromcsv('todays_bookings.csv')
# Alter the columns
table2 = petl.cut(table1, ' Object', ' User', ' Start time', ' End time',
                  ' Training', ' Assisted')
# Reorder the user names
table3 = petl.convert(table2, ' User',
                      lambda row: " ".join(re.findall(r"\S+", row)[::-1]))
# Reorder the rows
table4 = petl.sort(table3, key=[' Object', ' Start time'])
# Save to new file
petl.tocsv(table4, 'new.csv')

# Reopens the CSV file (stupid, I know) and removes unnecessary characters
csvfile = ""
ppmscal = csv.reader(open('new.csv'), delimiter=',')
for row in ppmscal:
    csvfile += str(row) + '\n'
csvtxt = csvfile.replace("(", "").replace(")", "").replace("'", "").replace(
    "[", "").replace("]", "")
csvtxt = csvtxt[:-1]
def unpack_list(self, column, include_original=False, missing=None,
                replace=False, max_columns=None):
    """
    Unpack list values from one column into separate columns. Numbers the
    columns.

    .. code-block:: python

        # Begin with a list in column
        json = [{'id': '5421',
                 'name': 'Jane Green',
                 'phones': ['512-699-3334', '512-222-5478']
                 }
                ]

        tbl = Table(json)
        print (tbl)
        >>> {'id': '5421', 'name': 'Jane Green', 'phones': ['512-699-3334', '512-222-5478']}

        tbl.unpack_list('phones', replace=True)
        print (tbl)
        >>> {'id': '5421', 'name': 'Jane Green', 'phones_0': '512-699-3334', 'phones_1': '512-222-5478'}  # noqa: E501

    `Args:`
        column: str
            The column name to unpack
        include_original: boolean
            Retain original column after unpacking
        missing: str
            If a value is missing, the value to fill it with
        replace: boolean
            Return new table or replace existing
        max_columns: int
            The maximum number of columns to unpack
    `Returns:`
        None
    """

    # Convert all column values to list to avoid unpack errors
    self.table = petl.convert(
        self.table, column,
        lambda v: [v] if not isinstance(v, list) else v)

    # Find the max number of values in list for all rows
    col_count = 0
    for row in self.cut(column):
        if len(row[column]) > col_count:
            col_count = len(row[column])

    # If max columns provided, set max columns
    if col_count > 0 and max_columns:
        col_count = max_columns

    # Create new column names "COL_01, COL_02"
    new_cols = []
    for i in range(col_count):
        new_cols.append(column + '_' + str(i))

    tbl = petl.unpack(self.table, column, new_cols,
                      include_original=include_original,
                      missing=missing)

    if replace:
        self.table = tbl
    else:
        return tbl
import petl as etl

# Extracting data from example csv file
table1 = etl.fromcsv('example.csv')
print(table1)  # etl.look(table1)

# Transformation function to be applied on extracted data
table2 = etl.convert(table1, 'foo', 'upper')
table3 = etl.convert(table2, 'bar', int)
table4 = etl.convert(table3, 'baz', float)
table5 = etl.addfield(table4, 'finally', lambda row: row.bar * row.baz)
print(table5)  # etl.look(table5)

# Writing above ETL pipeline in a functional style
table = (etl.fromcsv('example.csv').convert('foo', 'upper').convert(
    'bar', int).convert('baz', float).addfield('finally',
                                               lambda row: row.bar * row.baz))
table.look()  # look function only displays five rows.
print(table)

# OOP style programming
l = [['foo', 'bar'], ['a', 1], ['b', 2], ['c', 2]]
table6 = etl.wrap(l)
print(table6)
def get_events_organization(self, organization_id=None, updated_since=None,
                            timeslot_start=None, timeslot_end=None,
                            timeslots_table=False, max_timeslots=None):
    """
    Fetch all public events for an organization. This includes both events
    owned by the organization (as indicated by the organization field on
    the event object) and events of other organizations promoted by this
    specified organization.

    .. note::
        API Key Required

    `Args:`
        organization_id: list or int
            Filter events by a single or multiple organization ids
        updated_since: str
            Filter to events updated since given date (ISO Date)
        timeslot_start: str
            Filter by a timeslot start of events using ``>``,``>=``,``<``,``<=``
            operators and ISO date (ex. ``<=2018-12-13 05:00:00PM``)
        timeslot_end: str
            Filter by a timeslot end of events using ``>``,``>=``,``<``,``<=``
            operators and ISO date (ex. ``<=2018-12-13 05:00:00PM``)
        timeslot_table: boolean
            Return timeslots as a separate long table. Useful for extracting
            to databases.
        zipcode: str
            Filter by a Events' Locations' postal code. If present, returns
            Events sorted by distance from zipcode. If present, virtual events
            will not be returned.
        max_dist: str
            Filter Events' Locations' distance from provided zipcode.
        visibility: str
            Either `PUBLIC` or `PRIVATE`. Private events only return if user
            is authenticated; if `visibility=PRIVATE` and user doesn't have
            permission, no events returned.
        exclude_full: bool
            If `exclude_full=true`, filter out full Timeslots (and Events if
            all of an Event's Timeslots are full)
        is_virtual: bool
            `is_virtual=false` will return only in-person events, while
            `is_virtual=true` will return only virtual events. If excluded,
            return virtual and in-person events. Note that providing a
            zipcode also implies `is_virtual=false`.
        event_types: enum
            The type of the event, one of: `CANVASS`, `PHONE_BANK`,
            `TEXT_BANK`, `MEETING`, `COMMUNITY`, `FUNDRAISER`, `MEET_GREET`,
            `HOUSE_PARTY`, `VOTER_REG`, `TRAINING`,
            `FRIEND_TO_FRIEND_OUTREACH`, `DEBATE_WATCH_PARTY`,
            `ADVOCACY_CALL`, `OTHER`. This list may expand in the future.
        max_timeslots: int
            If not returning a timeslot table, will unpack time slots. If do
            not set this arg, it will add a column for each time slot. The
            argument limits the number of columns and discards any additional
            timeslots after that.

            For example: If there are 20 timeslots associated with your
            event, and you set the max time slots to 5, it will only return
            the first 5 time slots as ``time_slot_0``, ``time_slot_1`` etc.

            This is helpful in situations where you have a regular sync
            running and want to ensure that the column headers remain static.

    `Returns:`
        Parsons Table or dict or Parsons Tables
        See :ref:`parsons-table` for output options.
    """

    if isinstance(organization_id, (str, int)):
        organization_id = [organization_id]

    args = {'organization_id': organization_id,
            'updated_since': date_to_timestamp(updated_since),
            'timeslot_start': self._time_parse(timeslot_start),
            'timeslot_end': self._time_parse(timeslot_end),
            }

    tbl = Table(
        self.request_paginate(self.uri + 'events', args=args, auth=True))

    if tbl.num_rows > 0:
        tbl.unpack_dict('sponsor')
        tbl.unpack_dict('location', prepend=False)
        tbl.unpack_dict('location', prepend=False)  # Intentional duplicate
        tbl.table = petl.convert(tbl.table, 'address_lines',
                                 lambda v: ' '.join(v))

        if timeslots_table:
            timeslots_tbl = tbl.long_table(['id'], 'timeslots', 'event_id')
            return {'events': tbl, 'timeslots': timeslots_tbl}

        else:
            tbl.unpack_list('timeslots', replace=True,
                            max_columns=max_timeslots)
            cols = tbl.columns
            for c in cols:
                if re.search('timeslots', c, re.IGNORECASE) is not None:
                    tbl.unpack_dict(c)

    return tbl
from __future__ import division, print_function, absolute_import


# convert()
###########

import petl as etl
table1 = [['foo', 'bar', 'baz'],
          ['A', '2.4', 12],
          ['B', '5.7', 34],
          ['C', '1.2', 56]]
# using a built-in function:
table2 = etl.convert(table1, 'bar', float)
table2
# using a lambda function::
table3 = etl.convert(table1, 'baz', lambda v: v*2)
table3
# a method of the data value can also be invoked by passing
# the method name
table4 = etl.convert(table1, 'foo', 'lower')
table4
# arguments to the method invocation can also be given
table5 = etl.convert(table1, 'foo', 'replace', 'A', 'AA')
table5
# values can also be translated via a dictionary
table7 = etl.convert(table1, 'foo', {'A': 'Z', 'B': 'Y'})
table7
# the same conversion can be applied to multiple fields
table8 = etl.convert(table1, ('foo', 'bar', 'baz'), str)
table8
# multiple conversions can be specified at the same time
def join_annotations_with_gff_features(annot_file,
                                       annot_file_out,
                                       feature_type="CDS",
                                       annot_join="interval",
                                       annot_sep="\t",
                                       gff_files=None,
                                       gff_files_list=None,
                                       max_overlap_only=True):
    """Join features in GFF3 file with annotations.

    Add annotations as attributes of GFF features. The way to join with the
    annotation file must be provided in annot_join, which can be either keyed
    by nucleotide intervals in the first three fields like GFF itself, or by
    feature IDs (if annotation was done for protein sequences - not yet
    implemented). Coordinates and strand of each annotation will be replaced
    with those of overlapping feature, if any (can result in more than one
    record per annotation).
    """
    import petl
    import petlx

    ann_all = petl.io.csv.fromcsv(annot_file, delimiter=annot_sep)
    ann_all = petl.convert(ann_all,
                           ('query_start', 'query_end', 'query_strand'), int)
    ann_all = petl.addcolumn(ann_all, 'ann_rec_ind', range(ann_all.nrows()))
    with petl_opened_file_source(annot_file_out, "w") as annot_out:
        for i_inp, gff_file in enumerate(
                util.glob_files(files_globs=gff_files,
                                files_globs_list=gff_files_list)):
            log.info("Working on feature file {}".format(gff_file))
            feat = petlx.bio.gff3.fromgff3(gff_file)
            feat = petl_fix_gff_coord(feat)
            feat_seqid_set = set(feat["seqid"])
            if feature_type:
                feat = petl.selecteq(feat, 'type', feature_type)
            ann = petl.selectin(ann_all, 'query_id', feat_seqid_set)
            ## somehow we get many ORFs in GFFs (and Genbank files) from both RASTtk and ClovR where one
            ## ORFs ends at the start position of another ORF (and the BLAST match starts at the start of the
            ## second ORF).
            jn = petl.transform.intervals.intervalleftjoin(
                ann, feat,
                rstart="start", rstop="end",
                lstart="query_start", lstop="query_end",
                rkey="seqid", lkey="query_id",
                rprefix="feat_")
            jn = petl.addfield(jn, "overlap_len",
                               lambda rec: (min(rec['end'], rec['query_end']) -
                                            max(rec['start'], rec['query_start']) + 1)
                               if rec['start'] is not None else 0)
            if max_overlap_only:
                jn = petl.groupselectmax(jn, key="ann_rec_ind",
                                         value="overlap_len")
            _strand_conv = {'+': 1, '-': -1, '.': 0}
            jn = petl.convert(jn, {
                'query_start': lambda v, row: row.start
                if row.start is not None else v,
                'query_end': lambda v, row: row.end
                if row.end is not None else v,
                'query_strand': lambda v, row, _strand_conv=_strand_conv:
                _strand_conv[row.strand]
                if row.strand is not None else row.query_strand
            },
                pass_row=True)
            if i_inp == 0:
                out_func = petl.io.csv.tocsv
            else:
                out_func = petl.io.csv.appendcsv
            out_func(jn, annot_out, delimiter=annot_sep)
def parse_duration(inp):
    return etl.convert(inp, 'dur', lambda v: timedelta(milliseconds=v))
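
# Illustrative usage sketch (rows invented), assuming 'dur' holds integer
# millisecond counts as the lambda above implies.
from datetime import timedelta
import petl as etl

rows = [["id", "dur"], [1, 1500], [2, 65000]]
print(etl.look(parse_duration(rows)))  # 0:00:01.500000 and 0:01:05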
# convert

table1 = [['foo', 'bar'],
          ['A', '2.4'],
          ['B', '5.7'],
          ['C', '1.2'],
          ['D', '8.3']]
table6 = [['gender', 'age'],
          ['M', 12],
          ['F', 34],
          ['-', 56]]

from petl import convert, look
look(table1)
# using the built-in float function:
table2 = convert(table1, 'bar', float)
look(table2)
# using a lambda function::
table3 = convert(table2, 'bar', lambda v: v**2)
look(table3)
# a method of the data value can also be invoked by passing the method name
table4 = convert(table1, 'foo', 'lower')
look(table4)
# arguments to the method invocation can also be given
table5 = convert(table4, 'foo', 'replace', 'a', 'aa')
look(table5)
# values can also be translated via a dictionary
look(table6)
table7 = convert(table6, 'gender', {'M': 'male', 'F': 'female'})
look(table7)
def parse_datetime(inp, date_fields: List[str]):
    return etl.convert(inp, date_fields, parsedatetime)
international_code = "(+61)"

with open(IN_FILE, 'r') as infile, open(OUT_FILE, "w") as outfile:
    csv_reader = csv.reader(infile)
    writer = csv.writer(outfile)
    headers = next(csv_reader, None)  # skipping header row
    writer.writerow(headers)
    for row in csv_reader:
        number_column = row[5]
        state_column = row[3]
        clean_num = re.sub(r"\D", "", row[5])[-8:]
        formatted_num = international_code + " " + regional_code[
            state_column] + " " + clean_num
        row[5] = formatted_num
        writer.writerow(row)

services = petl.fromcsv(SERVICES_FILE)

offices = petl.fromcsv(OUT_FILE)
offices = offices.rename({"Contact Name": "Office", "Phone Number": "Phone"})
offices = petl.cutout(offices, "State", "Postcode")

locations = petl.fromcsv(LOC_FILE)
locations = locations.rename({"officeID": "OfficeID"})

office_service = petl.join(services, offices, key='OfficeID')
office_service_locations = petl.join(
    office_service, locations, key='OfficeID')
office_service_locations = petl.convert(office_service_locations,
                                        'OfficeServiceID', int)
office_service_locations = petl.sort(office_service_locations,
                                     'OfficeServiceID')
petl.tocsv(office_service_locations, 'office_service_locations.csv')
def transform(table, *fields, **args):
    if len(fields) > 0:
        return etl.convert(table, fields, empty_as_none)
    else:
        return etl.convertall(table, empty_as_none)
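
# Illustrative sketch (not part of the original snippet): empty_as_none is not
# shown above, so a plausible stand-in is defined here purely for demonstration.
import petl as etl

def empty_as_none(value):
    # Treat empty or whitespace-only strings as missing values.
    return None if isinstance(value, str) and value.strip() == "" else value

rows = [["a", "b"], ["x", ""], ["", "y"]]
print(etl.look(transform(rows)))       # every empty cell becomes None
print(etl.look(transform(rows, "a")))  # only column 'a' is converted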
look(table6)
# using the header keyword argument with two input tables
look(table7)
look(table8)
table9 = cat(table7, table8, header=['A', 'foo', 'B', 'bar', 'C'])
look(table9)


# convert

table1 = [['foo', 'bar', 'baz'],
          ['A', '2.4', 12],
          ['B', '5.7', 34],
          ['C', '1.2', 56]]

from petl import convert, look
look(table1)
# using the built-in float function:
table2 = convert(table1, 'bar', float)
look(table2)
# using a lambda function::
table3 = convert(table1, 'baz', lambda v: v * 2)
look(table3)
# a method of the data value can also be invoked by passing the method name
table4 = convert(table1, 'foo', 'lower')
look(table4)
# arguments to the method invocation can also be given
table5 = convert(table1, 'foo', 'replace', 'A', 'AA')
look(table5)
# values can also be translated via a dictionary
table7 = convert(table1, 'foo', {'A': 'Z', 'B': 'Y'})
look(table7)
# the same conversion can be applied to multiple fields
table8 = convert(table1, ('foo', 'bar', 'baz'), str)
look(table9)


#########
# convert
#########

table1 = [['foo', 'bar', 'baz'],
          ['A', '2.4', 12],
          ['B', '5.7', 34],
          ['C', '1.2', 56]]

from petl import convert, look
look(table1)
# using the built-in float function:
table2 = convert(table1, 'bar', float)
look(table2)
# using a lambda function::
table3 = convert(table1, 'baz', lambda v: v*2)
look(table3)
# a method of the data value can also be invoked by passing the method name
table4 = convert(table1, 'foo', 'lower')
look(table4)
# arguments to the method invocation can also be given
table5 = convert(table1, 'foo', 'replace', 'A', 'AA')
look(table5)
# values can also be translated via a dictionary
table7 = convert(table1, 'foo', {'A': 'Z', 'B': 'Y'})
look(table7)
# the same conversion can be applied to multiple fields
table8 = convert(table1, ('foo', 'bar', 'baz'), str)
def cleanFormatTable(table):
    newtable = table
    for h in etl.header(table):
        # Convert from the running result, not the original table, so the
        # sanitize conversion applied to earlier columns is preserved.
        newtable = etl.convert(newtable, h, sanitize)
    return newtable
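
# Illustrative usage sketch: 'sanitize' is defined elsewhere in the original
# module; the stand-in below (strip surrounding whitespace) is for demonstration only.
import petl as etl

def sanitize(value):
    return value.strip() if isinstance(value, str) else value

dirty = [["foo", "bar"], ["  a ", 1], [" b", 2]]
print(etl.look(cleanFormatTable(dirty)))  # string cells come back trimmed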
    'dbname': 'data_warehouse',
    'db_schema': 'warehouse'
}

Model().connection(config)

number_of_record = 0
loader = Loader()

while True:
    data_loaded = load_file('./group_call.json')

    if len(data_loaded) <= number_of_record:
        logging.info("Continue to the next iteration ..")
        sleep(10)
        continue

    number_of_record = len(data_loaded)

    data = etl.fromdicts(data_loaded)
    converted_data = etl.convert(data, 'xml',
                                 lambda r: XMLHelper.xml_to_json(r))
    converted_data = etl.convert(
        converted_data, 'txt',
        lambda r: json.loads(r.strip('group_call_summary#')))
    converted_data = etl.select(
        converted_data, lambda r: len(re.split(r'\/', r['peer'])) == 1)
    converted_data = etl.addfield(converted_data, 'room_id',
                                  lambda r: r['txt'].get('roomId'))
    converted_data = etl.addfield(
        converted_data, 'from_id',
        lambda r: id_str_to_int(r['xml']['from']))
    converted_data = etl.addfield(converted_data, 'to_id',
                                  lambda r: id_str_to_int(r['xml']['to']))
    converted_data = etl.addfield(converted_data, 'message_id',
                                  lambda r: r['xml']['id'])
    converted_data = etl.addfield(
from __future__ import division, print_function, absolute_import


example_data = """foo,bar,baz
a,1,3.4
b,2,7.4
c,6,2.2
d,9,8.1
"""
with open('example.csv', 'w') as f:
    f.write(example_data)

import petl as etl
table1 = etl.fromcsv('example.csv')
table2 = etl.convert(table1, 'foo', 'upper')
table3 = etl.convert(table2, 'bar', int)
table4 = etl.convert(table3, 'baz', float)
table5 = etl.addfield(table4, 'quux', lambda row: row.bar * row.baz)
table5

table = (
    etl
    .fromcsv('example.csv')
    .convert('foo', 'upper')
    .convert('bar', int)
    .convert('baz', float)
    .addfield('quux', lambda row: row.bar * row.baz)
)
table

l = [['foo', 'bar'], ['a', 1], ['b', 2], ['c', 2]]
table = etl.wrap(l)
if CLEAN_UP:
    table = clean_up(table, 'rcv_nm')
    table = clean_up(table, 'recp_cd')
    table = clean_up(table, 'ins_ind')
    table = clean_up(table, 'geo_ind')
    table = clean_up(table, 'cid')
    table = clean_up(table, 'occ_typ')

print('TRIMMED HEADERS = ' + str(etl.header(table)))

table = etl.select(table, 'occ_dt', lambda x: x > datetime(2000, 1, 1))
print('ROWS POST YR 2000 = ' + str(etl.nrows(table)))

mine_table = etl.fromcsv('mines.csv', encoding='utf-8')

# handle leading 0's
mine_table = etl.convert(mine_table, 'mine_no', lambda x: str(int(x)))
table = etl.convert(table, 'mine_no', lambda x: str(int(x)))

# MAP mine_no to mine_guid
table = etl.leftjoin(table, mine_table, key='mine_no')
table = clean_up(table, 'mine_no')

# make sure this is 0
if etl.valuecount(table, 'mine_guid', None)[0] > 0:
    print('mine_guid, mine_no pair missing from mines.csv')
    exit(1)

######
print('CONVERT AND RENAME descript1 to recommendation')
table = etl.addfield(table, 'recommendation', lambda x: x['descript1'])
table = clean_up(table, 'descript1')
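
# Illustrative toy example (values invented) of the leftjoin above: the
# leading-zero normalisation is what makes the 'mine_no' keys line up.
import petl as etl

events = [["mine_no", "occ_dt"], ["0042", "2001-05-01"], ["7", "2003-09-12"]]
mines = [["mine_no", "mine_guid"], ["42", "guid-42"], ["7", "guid-7"]]

events = etl.convert(events, "mine_no", lambda x: str(int(x)))  # '0042' -> '42'
mines = etl.convert(mines, "mine_no", lambda x: str(int(x)))
print(etl.look(etl.leftjoin(events, mines, key="mine_no")))  # every event keeps its mine_guid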
look(table8)
table9 = cat(table7, table8, header=['A', 'foo', 'B', 'bar', 'C'])
look(table9)


# convert

table1 = [['foo', 'bar', 'baz'],
          ['A', '2.4', 12],
          ['B', '5.7', 34],
          ['C', '1.2', 56]]

from petl import convert, look
look(table1)
# using the built-in float function:
table2 = convert(table1, 'bar', float)
look(table2)
# using a lambda function::
table3 = convert(table1, 'baz', lambda v: v*2)
look(table3)
# a method of the data value can also be invoked by passing the method name
table4 = convert(table1, 'foo', 'lower')
look(table4)
# arguments to the method invocation can also be given
table5 = convert(table1, 'foo', 'replace', 'A', 'AA')
look(table5)
# values can also be translated via a dictionary
table7 = convert(table1, 'foo', {'A': 'Z', 'B': 'Y'})
look(table7)
# the same conversion can be applied to multiple fields
table8 = convert(table1, ('foo', 'bar', 'baz'), str)
def get_events(self, organization_id=None, updated_since=None,
               timeslot_start=None, timeslot_end=None, timeslots_table=False,
               max_timeslots=None):
    """
    Fetch all public events on the platform.

    `Args:`
        organization_id: list or int
            Filter events by a single or multiple organization ids
        updated_since: str
            Filter to events updated since given date (ISO Date)
        timeslot_start: str
            Filter by a timeslot start of events using ``>``,``>=``,``<``,``<=``
            operators and ISO date (ex. ``<=2018-12-13 05:00:00PM``)
        timeslot_end: str
            Filter by a timeslot end of events using ``>``,``>=``,``<``,``<=``
            operators and ISO date (ex. ``<=2018-12-13 05:00:00PM``)
        timeslot_table: boolean
            Return timeslots as a separate long table. Useful for extracting
            to databases.
        max_timeslots: int
            If not returning a timeslot table, will unpack time slots. If do
            not set this kwarg, it will add a column for each time slot. The
            argument limits the number of columns and discards any additional
            timeslots after that.

            For example: If there are 20 timeslots associated with your
            event, and you set the max time slots to 5, it will only return
            the first 5 time slots as ``time_slot_0``, ``time_slot_1`` etc.

            This is helpful in situations where you have a regular sync
            running and want to ensure that the column headers remain static.

    `Returns:`
        Parsons Table or dict or Parsons Tables
        See :ref:`parsons-table` for output options.
    """

    if isinstance(organization_id, (str, int)):
        organization_id = [organization_id]

    args = {'organization_id': organization_id,
            'updated_since': date_to_timestamp(updated_since),
            'timeslot_start': self._time_parse(timeslot_start),
            'timeslot_end': self._time_parse(timeslot_end)}

    tbl = Table(self.request_paginate(self.uri + 'events', args=args))

    if tbl.num_rows > 0:
        tbl.unpack_dict('sponsor')
        tbl.unpack_dict('location', prepend=False)
        tbl.unpack_dict('location', prepend=False)  # Intentional duplicate
        tbl.table = petl.convert(tbl.table, 'address_lines',
                                 lambda v: ' '.join(v))

        if timeslots_table:
            timeslots_tbl = tbl.long_table(['id'], 'timeslots', 'event_id')
            return {'events': tbl, 'timeslots': timeslots_tbl}

        else:
            tbl.unpack_list('timeslots', replace=True,
                            max_columns=max_timeslots)
            cols = tbl.columns
            for c in cols:
                if re.search('timeslots', c, re.IGNORECASE) is not None:
                    tbl.unpack_dict(c)

    return tbl