def _parse_roster_table(self): base_tz = timezone(self.roster_values.get('timezone')) or self.default_timezone base_iata = self.roster_values.get('base').iata or self.default_base roster_table_lines = re.findall(r'│(.*)│', self.roster_text_string) roster_table = etl.fromcsv( source=sources.MemorySource('\n'.join(roster_table_lines).encode('UTF-8')), delimiter='│' ) self.staging_table = roster_table table1 = ( # add header roster_table.pushheader( ['base_date', 'local_date', 'duty', 'activity', 'role', 'sector', 'pickup_time', 'rep_fin_time', 'etd_time', 'eta_time', 'flight_time', 'duty_time', 'on_city', 'utc_dev', 'dea', 'notes'] ) # select rows that match date pattern or whitespace in first column .search(0, '(?:' + self.wkdaydatepattern + ')|(?:\A\s+)') # split report and finish times (rep/fin) into separate columns .capture('rep_fin_time', '(.{4}).(.{4})', ['report_time', 'finish_time']) # split airport-from and airport-to (sector) into separate columns .capture('sector', '(.{3}).+(.{3})', ['sector_from', 'sector_to']) # remove whitespace from all values and convert empty values to None .convertall(lambda v: None if v.strip() == '' else v.strip()) # select only rows that contain data # .select(lambda row: row.count(None) != len(row)) .select(lambda row: list(OrderedDict.fromkeys(list(row)[2:])) == [None], complement=True) ) table2 = ( table1 # copy base_date to local_date if local_date is None .convert('local_date', lambda v, row: row['base_date'], where=lambda row: row['local_date'] is None, pass_row=True) .filldown('local_date') # convert local_date to datetime .convert('local_date', self._convert_weekday_datestring_to_date) # convert time fields to time type .convert(('report_time', 'finish_time', 'etd_time', 'eta_time', 'pickup_time'), timeparser(self.timeformat)) # combine date and times to create report/finish/etd/eta/pickup .addfield('report', lambda rec: datetime.combine(rec['local_date'], rec['report_time']) if rec['report_time'] is not None else None) .addfield('finish', lambda rec: datetime.combine(rec['local_date'], rec['finish_time']) if rec['finish_time'] is not None else None) .addfield('etd', lambda rec: datetime.combine(rec['local_date'], rec['etd_time']) if rec['etd_time'] is not None else None) .addfield('eta', lambda rec: datetime.combine(rec['local_date'], rec['eta_time']) if rec['eta_time'] is not None else None) .addfield('pickup', lambda rec: datetime.combine(rec['local_date'], rec['pickup_time']) if rec['pickup_time'] is not None else None) # remove time fields now that we've combined them into a single datetime instance .cutout('report_time', 'finish_time', 'etd_time', 'eta_time', 'pickup_time') # fill down report_time until it finds it matches the first flight of the duty or the duty ends .filldown('report', until=lambda r: r['etd'] is not None or r['finish'] is not None) # copy sector_to .addfield('finish_airport', lambda rec: rec['sector_to']) .filldown('finish_airport', until=lambda r: r['finish'] is not None) ) table3 = ( table2 # get timezone of report/finish/etd/eta/pickup location .addfield('report_tz', lambda rec: str(self.get_tz(iata_code=rec['sector_from'])) or str(base_tz) if rec['report'] is not None else None) .addfield('finish_tz', lambda rec: str(self.get_tz(iata_code=rec['finish_airport'])) or str(base_tz) if rec['finish'] is not None else None) .addfield('etd_tz', lambda rec: str(self.get_tz(iata_code=rec['sector_from'])) or str(base_tz) if rec['etd'] is not None else None) .addfield('eta_tz', lambda rec: str(self.get_tz(iata_code=rec['sector_to'])) or str(base_tz) if rec['eta'] is not None else None) .addfield('pickup_tz', lambda rec: str(self.get_tz(iata_code=rec['sector_from'])) or str(base_tz)if rec['pickup'] is not None else None) # make datetimes aware .convert('report', lambda v, row: timezone(row['report_tz']).localize(v).astimezone(utc), pass_row=True, where=lambda r: r.report is not None) .convert('finish', lambda v, row: timezone(row['finish_tz']).localize(v).astimezone(utc), pass_row=True, where=lambda r: r.finish is not None) .convert('etd', lambda v, row: timezone(row['etd_tz']).localize(v).astimezone(utc), pass_row=True, where=lambda r: r.etd is not None) .convert('eta', lambda v, row: timezone(row['eta_tz']).localize(v).astimezone(utc), pass_row=True, where=lambda r: r.eta is not None) .convert('pickup', lambda v, row: timezone(row['pickup_tz']).localize(v).astimezone(utc), pass_row=True, where=lambda r: r.pickup is not None) ) table4 = ( table3 # ignore rows that only contain on_city information .select(lambda row: self._all_fields_empty_except('on_city', row), complement=True) ) table5 = ( table4 # find and fill down where duty repeats .filldown('duty', where=self._next_duty_rep_found, until=lambda r: r['finish_airport'] == base_iata and r['finish'] is not None) # # fill down report until it finds its corresponding finish entry .filldown('duty', 'report', 'report_tz', 'pickup', 'pickup_tz', until=lambda r: r['finish'] is not None) # fill down Duty and Activity if notes span multiple lines .filldown(where=lambda row: self._all_fields_empty_except('notes', row), until=lambda r: r['notes'] is None) # fill down etd until it finds its corresponding eta entry .filldown('activity', 'role', 'sector_from', 'etd', 'etd_tz', where=lambda r: r['activity'] is None, until=lambda r: r['eta'] is not None) # fill duty with activity value if duty is None .filldown('activity') .convert('duty', lambda v, row: row['activity'], where=lambda row: row['duty'] is None and row['activity'] is not None, pass_row=True) ).cache() self.table1 = table1 self.table2 = table2 self.table3 = table3 self.table4 = table4 self.table5 = table5 return table5
from petl import dateparser isodate = dateparser('%Y-%m-%d') isodate('2002-12-25') try: isodate('2002-02-30') except ValueError as e: print(e) # timeparser() ############## from petl import timeparser isotime = timeparser('%H:%M:%S') isotime('00:00:00') isotime('13:00:00') try: isotime('12:00:99') except ValueError as e: print(e) try: isotime('25:00:00') except ValueError as e: print(e) # boolparser() ##############
print(etl.valuecounter(table, 'status_code')) table = clean_up(table, 'sta_cd') ###### print('CONVERT AND RENAME rep_nm to reported_by_name') table = etl.addfield(table, 'reported_by_name', lambda x: x['rep_nm']) table = clean_up(table, 'rep_nm') ###### print('CONVERT AND RENAME descript to incident_description') table = etl.addfield(table, 'incident_description', lambda x: x['descript']) table = clean_up(table, 'descript') ###### print('COMBINING occ_dt and occ_tm into incident_timestamp') table = etl.convert(table, 'occ_tm', timeparser('%H:%M')) table = etl.addfield( table, 'incident_timestamp', lambda x: datetime.combine( x['occ_dt'], (x['occ_tm'] or time(0, 0))) + timedelta(hours=8)) debug(table, ['occ_dt', 'occ_tm', 'incident_timestamp']) table = clean_up(table, 'occ_dt') table = clean_up(table, 'occ_tm') ##### print("CREATING mine_incident_id_year from incident_timestamp") table = etl.addfield(table, 'mine_incident_id_year', lambda x: x['incident_timestamp'].year) print(etl.valuecounter(table, 'mine_incident_id_year')) ######
############## from petl import dateparser isodate = dateparser('%Y-%m-%d') isodate('2002-12-25') try: isodate('2002-02-30') except ValueError as e: print(e) # timeparser() ############## from petl import timeparser isotime = timeparser('%H:%M:%S') isotime('00:00:00') isotime('13:00:00') try: isotime('12:00:99') except ValueError as e: print(e) try: isotime('25:00:00') except ValueError as e: print(e) # boolparser() ##############