import re
from itertools import zip_longest
from typing import Optional, Tuple

# NOTE: project-specific helpers and constants used below (may_strip, is_number,
# parse_date, to_isoformat, is_isoformat, map_keys, field_mapping, split_product_vol,
# split_name_loa_beam, normalize_vessel, get_date_range, UNIT_MAPPING, Unit,
# MOVEMENT_MAPPING, BLACKLIST, MISSING_ROWS) are assumed to be imported from the
# project's shared modules; their import paths are not shown in this excerpt.


def normalize_product_volume(raw_product, raw_volume):
    """Normalize raw product and volume strings into (product, volume, unit) tuples.

    Products and volumes may be combined with '/' or '+' separators
    (e.g. 'Amna/sirtica' with '952972/904732'), or several products may share a
    single volume, in which case the volume is split evenly across them.

    Args:
        raw_product (str):
        raw_volume (str):

    Returns:
        List[Tuple[str, str, str]]: tuples of (product, volume, volume unit)

    Examples:
        >>> normalize_product_volume('SOKOL CRUDE OIL', '600kb')
        [('SOKOL CRUDE OIL', '600', 'kilobarrel')]
        >>> normalize_product_volume('SOKOL CRUDE OIL', '600000')
        [('SOKOL CRUDE OIL', '600000', 'barrel')]
        >>> normalize_product_volume('Amna/sirtica', '952972/904732')
        [('Amna', '952972', 'barrel'), ('sirtica', '904732', 'barrel')]
        >>> normalize_product_volume('WTI/E45', '2036690')
        [('WTI', '1018345.0', 'barrel'), ('E45', '1018345.0', 'barrel')]
        >>> normalize_product_volume('WTI/E45', '2036kb')
        [('WTI', '1018.0', 'kilobarrel'), ('E45', '1018.0', 'kilobarrel')]
    """
    product_list = re.split(r'[\/\+]', raw_product)
    volume_list = re.split(r'[\/\+]', raw_volume)
    f_list = []

    # one volume per product: pair them up directly
    if len(product_list) == len(volume_list):
        _list = list(zip(product_list, volume_list))
        for _val in _list:
            vol_unit = re.match(r'(\d+)(kb|bbl|bbls|bblss)', _val[1])
            if not vol_unit:
                vol_unit = (_val[1], Unit.barrel)
            else:
                vol_unit = vol_unit.groups()
            f_list.append((_val[0], vol_unit[0], UNIT_MAPPING.get(vol_unit[1], vol_unit[1])))

    # multiple products sharing a single volume: split the volume evenly
    if len(product_list) > 1 and len(volume_list) == 1:
        _list = list(zip_longest(product_list, volume_list))
        vol_unit_match = re.match(r'(\d+)(kb|bbl|bbls|bblss)', _list[0][1])
        if vol_unit_match:
            _vol = vol_unit_match.group(1)
            _unit = UNIT_MAPPING.get(vol_unit_match.group(2), vol_unit_match.group(2))
        else:
            _vol = _list[0][1]
            _unit = Unit.barrel

        if is_number(_vol):
            vol = int(_vol) / len(product_list)
        else:
            vol = None

        for _prod in product_list:
            f_list.append((_prod, str(vol), _unit))

    return f_list

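
# The doctests above imply a unit mapping roughly like the hypothetical sketch
# below ('kb' -> kilobarrel, 'bbl'/'bbls' -> barrel). This is illustrative only;
# the real `UNIT_MAPPING` and `Unit` constants live in the project's shared code
# and may differ.
_EXAMPLE_UNIT_MAPPING = {
    'kb': 'kilobarrel',
    'bbl': 'barrel',
    'bbls': 'barrel',
}
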
def normalize_prod_vol_move(raw_product, raw_volume_movement, raw_opt_movement):
    """Split products, volumes and movements on common separators.

    Products are expected to have already been normalized in `split_product_vol`.

    Args:
        raw_product (str):
        raw_volume_movement (str):
        raw_opt_movement (str | None): optional movement column from the source

    Examples:
        >>> normalize_prod_vol_move('butane', '3000', 'd')
        [['butane', '3000', 'd']]
        >>> normalize_prod_vol_move('butane/propane', '3000', 'd')
        [['butane', 1500.0, 'd'], ['propane', 1500.0, 'd']]
        >>> normalize_prod_vol_move('butane/propane', '3000/3000', 'd')
        [['butane', '3000', 'd'], ['propane', '3000', 'd']]
        >>> normalize_prod_vol_move('propane', '6500 mt (d)', None)
        [['propane', '6500', 'd']]
        >>> normalize_prod_vol_move('toluene + mx', '3133+ 950 mt (d)', None)
        [['toluene ', '3133', ''], [' mx', '950', 'd']]
        >>> normalize_prod_vol_move('toluene + mx', '5694 mt (d) + 3640 mt (d)', None)
        [['toluene ', '5694', 'd'], [' mx', '3640', 'd']]

    Returns:
        List[List[str]] | None: [product, volume, movement] triplets
    """
    raw_volume_movement = raw_volume_movement.replace(',', '')
    product_list = re.split(r'[\\/,\&\+\|]', raw_product)
    volume_movement_list = re.split(r'[\\/,\&\+\|]', raw_volume_movement)
    volume_list, movement_list = extract_volume_movement(volume_movement_list)

    # overwrite movement list if the source has a movement col
    if raw_opt_movement:
        movement_list = [may_strip(raw_opt_movement) for _ in product_list]

    # one volume per product: pair them up directly
    if len(product_list) == len(volume_list):
        zipped_list = [
            list(a) for a in zip_longest(product_list, volume_list, movement_list)
        ]
        return zipped_list

    # multiple products sharing a single volume: split it evenly across products
    if len(product_list) != len(volume_movement_list) and len(volume_list) == 1:
        if is_number(volume_list[0]):
            volume = [float(volume_list[0]) / len(product_list) for _ in product_list]
        else:
            volume = []

        zipped_list = [
            list(a) for a in zip_longest(product_list, volume, movement_list)
        ]
        return zipped_list

    return None

def validate_weight(raw_weight: str) -> Optional[str]:
    """Validate negative or 0 weights.

    Args:
        raw_weight (str):

    Examples:
        >>> validate_weight('')
        >>> validate_weight('0')
        >>> validate_weight('10000')
        '10000'
    """
    return str(raw_weight) if is_number(raw_weight) and float(raw_weight) > 0 else None

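
# `is_number` is a project utility used throughout this module; it is assumed to
# behave roughly like the hypothetical stand-in below (truthy for any value that
# parses as a float, falsy otherwise, including '' and None).
def _example_is_number(value) -> bool:
    try:
        float(value)
        return True
    except (TypeError, ValueError):
        return False
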
def normalize_date(
    raw_eta: str,
    raw_berthed_day: str,
    raw_berthed_month: str,
    raw_berthed_time: str,
    reported_date: str,
) -> Tuple[str, str]:
    """Normalize eta and berthing dates.

    Some examples of possible input combinations:
        - 'raw_eta': '01-Feb'
        - 'raw_berthed_day': '01-Mar-20'
        - 'raw_berthed_day': '01'
        - 'raw_berthed_time': '16:00'

    Examples:
        >>> normalize_date('01-Feb', '01', None, '17:00', '2020-02-01T00:00:00')
        ('2020-02-01T17:00:00', '2020-02-01T00:00:00')
        >>> normalize_date('01-Feb', '01-Mar-20', '01-Mar-20', '17:00', '2020-02-01T00:00:00')
        ('2020-03-01T17:00:00', '2020-02-01T00:00:00')
        >>> normalize_date('01-Feb', '01', '01-Mar-20', '17:00', '2020-02-01T00:00:00')
        ('2020-03-01T17:00:00', '2020-02-01T00:00:00')

    Returns:
        Tuple[str, str]: (berthed date, eta date) as ISO-8601 strings
    """
    # check if berthed month contains a full dd-mmm-yy date, else fall back to the
    # reported date for the day/month/year (the day is overridden below if the
    # source provides a numeric berthed day)
    if len(str(raw_berthed_month).split('-')) == 3:
        _day, _month, _year = raw_berthed_month.split('-')
        _year = f'20{_year}'
    else:
        _day = parse_date(reported_date).day
        _month = parse_date(reported_date).month
        _year = parse_date(reported_date).year

    if is_number(raw_berthed_day):
        _day = raw_berthed_day

    return (
        parse_date(f'{_year}-{_month}-{_day} {raw_berthed_time}:00').isoformat(),
        parse_date(f'{raw_eta}-{_year}', dayfirst=True).isoformat(),
    )

def extract_volume_movement(vol_movement):
    """Split raw volume/movement strings into separate volume and movement lists.

    Args:
        vol_movement (List[str]):

    Examples:
        >>> extract_volume_movement(['3000'])
        (['3000'], [''])
        >>> extract_volume_movement(['3640 mt (d)', '3640 mt (d)'])
        (['3640', '3640'], ['d', 'd'])
        >>> extract_volume_movement(['3640 mt', '3640 mt (d)'])
        (['3640', '3640'], ['', 'd'])
        >>> extract_volume_movement(['3640 mt'])
        (['3640'], [''])

    Returns:
        Tuple[List[str], List[str]]: (volumes, movements)
    """
    movement_list = []
    vol_list = []
    for vm in vol_movement:
        if is_number(may_strip(vm)):
            vol_list.append(may_strip(vm))
            movement_list.append('')
        else:
            try:
                # split '3640 mt (d)' into a volume half and a movement half
                separated_vol_movement = vm.split('(')
                movement_list.append(re.sub(r'(\s|mt|\(|\))', '', separated_vol_movement[1]))
                vol_list.append(re.sub(r'(\s|mt|\(|\))', '', separated_vol_movement[0]))
            except Exception:
                # no '(movement)' suffix; treat the whole cell as a volume
                vol_list.append(re.sub(r'(\s|mt|\(|\))', '', vm))
                movement_list.append('')

    return vol_list, movement_list

def process_item(raw_item):
    """Transform raw item into a usable event.

    Args:
        raw_item (Dict[str, str]):

    Yields:
        Dict[str, str]: normalized cargo movement item
    """
    item = map_keys(raw_item, field_mapping())

    # normalize berth strings
    if item.get('berth'):
        item['berth'] = item['berth'].replace('.', '')

    if item.get('vessel_name_length'):
        v_name, v_loa, v_beam = split_name_loa_beam(item.pop('vessel_name_length', None))
        item['vessel'] = {
            'name': v_name,
            'length': int(float(v_loa)) if is_number(v_loa) else None,
            'beam': int(float(v_beam)) if is_number(v_beam) else None,
        }
    else:
        _loa = item.pop('vessel_length', None)
        item['vessel'] = {
            'name': normalize_vessel(item.pop('vessel_name', None)),
            'length': int(float(_loa)) if is_number(_loa) else None,
        }

    # for kandla attachment, the remarks column does not reflect player information but
    # the berth information
    if 'kandla' in item['port_name']:
        item.pop('cargo_player', None)

    if not item['vessel']['name']:
        return

    # handle kandla attachment where berth dates could potentially be in 5 columns
    for berthed_col in ('berthed_1', 'berthed_2', 'berthed_3', 'berthed_4', 'berthed_5'):
        if item.get(berthed_col):
            item['berthed'] = item.get(berthed_col)
        item.pop(berthed_col, None)

    for date_col in ('eta', 'arrival', 'berthed', 'departure', 'eta_holder'):
        if item.get(date_col):
            item[date_col] = normalize_dates(item[date_col], item['reported_date'])
        else:
            item[date_col] = None

    if item.get('eta_holder'):
        item['eta'] = item.get('eta_holder', None)

    if (
        not item.get('eta')
        and not item.get('berthed')
        and not item.get('arrival')
        and not item.get('departure')
    ):
        return

    cargo_products = None
    cargo_volume = None
    cargo_movement = None

    # handle normal cases where cargo and quantity are nicely separated
    if item.get('cargo_product') and item.get('cargo_volume'):
        cargo_products = item.pop('cargo_product', None)
        cargo_volume = item.pop('cargo_volume', None)
        cargo_movement = item.pop('cargo_movement', None)

    # if cargo and volume are joined together
    if item.get('cargo_product_volume'):
        cargo_products, cargo_volume = split_product_vol(item.pop('cargo_product_volume', None))
        cargo_movement = item.pop('cargo_movement', None)

    if item.get('cargo_product') and not item.get('cargo_volume'):
        cargo_products, cargo_volume = split_product_vol(item.pop('cargo_product', None))
        cargo_movement = item.pop('cargo_movement', None)

    if not cargo_products:
        return

    zipped_cargo_list = normalize_prod_vol_move(cargo_products, cargo_volume, cargo_movement)

    seller = None
    buyer = None
    player = item.get('cargo_player') or item.get('potential_cargo_player') or None

    for col in ('cargo_player', 'potential_cargo_player', 'eta_holder'):
        item.pop(col, None)

    if zipped_cargo_list:
        for zipped_item in zipped_cargo_list:
            movement = MOVEMENT_MAPPING.get(zipped_item[2], None)
            if movement == 'load':
                seller = player
            elif movement == 'discharge':
                buyer = player
            else:
                item.pop('cargo_player', None)

            item['cargo'] = {
                'product': may_strip(zipped_item[0]),
                'movement': movement if movement else None,
                'volume': str(zipped_item[1]) if zipped_item[1] else None,
                'volume_unit': Unit.tons,
                'buyer': {'name': buyer} if buyer and buyer not in BLACKLIST else None,
                'seller': {'name': seller} if seller and seller not in BLACKLIST else None,
            }

            if item['cargo']['product'] in BLACKLIST:
                continue

            yield item
    else:
        MISSING_ROWS.append(str(raw_item))

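
# `MOVEMENT_MAPPING` is assumed to translate the single-letter movement codes
# seen in the report into the canonical values checked in `process_item`
# ('load'/'discharge'). A hypothetical stand-in, not the project's mapping:
_EXAMPLE_MOVEMENT_MAPPING = {
    'd': 'discharge',
    'l': 'load',
}
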
def normalize_dates(raw_date, rpt_date):
    """Normalize dates.

    Args:
        raw_date (str):
        rpt_date (str):

    Examples:
        >>> normalize_dates('1400 hrs 01.01.2020', '2020-01-01T00:00:00')
        '2020-01-01T14:00:00'
        >>> normalize_dates('2106 hrs/02.01.2020', '2020-01-01T00:00:00')
        '2020-01-02T21:06:00'
        >>> normalize_dates('2106 hrs /02.01.2020', '2020-01-01T00:00:00')
        '2020-01-02T21:06:00'
        >>> normalize_dates('02.01.2020(NOR)', '2020-01-01T00:00:00')
        '2020-01-02T00:00:00'
        >>> normalize_dates('02.01.2020', '2020-01-01T00:00:00')
        '2020-01-02T00:00:00'
        >>> normalize_dates('02.01.20', '2020-01-01T00:00:00')
        '2020-01-02T00:00:00'
        >>> normalize_dates('22.03.2020-1800', '2020-01-01T00:00:00')
        '2020-03-22T18:00:00'
        >>> normalize_dates('22.03.2020-AM', '2020-01-01T00:00:00')
        '2020-03-22T06:00:00'
        >>> normalize_dates(' am 02.01.2020', '2020-01-01T00:00:00')
        '2020-01-02T06:00:00'
        >>> normalize_dates('02.01.2020/0700 hrs', '2020-01-01T00:00:00')
        '2020-01-02T07:00:00'
        >>> normalize_dates('pm hrs 06.01.2020', '2020-01-01T00:00:00')
        '2020-01-06T15:00:00'
        >>> normalize_dates('26/01', '2020-01-01T00:00:00')
        '2020-01-26T00:00:00'
        >>> normalize_dates('26/01', '2019-12-30T00:00:00')
        '2020-01-26T00:00:00'

    Returns:
        str:
    """
    # normalize date strings, converting am/pm markers to fixed times
    raw_date = raw_date.lower().replace('am', '0600/').replace('pm', '1500/')

    if is_isoformat(raw_date.upper()):
        return raw_date.upper()

    # if dd/mm data is provided, guess the year and return date
    if '/' in raw_date and len(raw_date.split('/')) == 2:
        if all(is_number(rd) for rd in raw_date.split('/')):
            potential_date, _ = get_date_range(raw_date, '/', '-', rpt_date)
            return potential_date

    # detect date and time fields
    date_hour = [
        may_strip(_d)
        for _d in re.split(r'(hrs /|hrs/|/ hrs|hrs|/|\()|\-', raw_date)
        if _d
    ]

    _date, _time = None, ''
    for dh in date_hour:
        if is_number(dh):
            _time = dh
            continue
        if len(dh.split('.')) == 3:
            _date = dh
            continue

    if not _date:
        return None

    try:
        return to_isoformat(may_strip(f'{_date} {_time}'), dayfirst=True)
    except Exception:
        return None

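
# `get_date_range` is a project helper; the 'dd/mm' doctests above rely on it
# guessing the year, rolling into the next year when the report is dated near
# year end (e.g. '26/01' reported on 2019-12-30 -> 2020-01-26). A minimal,
# hypothetical sketch of that behaviour, not the project's implementation:
def _example_guess_year(day_month: str, rpt_date: str) -> str:
    from dateutil.parser import parse

    day, month = (int(part) for part in day_month.split('/'))
    reported = parse(rpt_date)
    # if the target month is far behind the reported month, assume year rollover
    year = reported.year + 1 if reported.month - month > 6 else reported.year
    return parse(f'{year}-{month:02d}-{day:02d}').isoformat()
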
def parse_pdf(self, response):
    """Extract data from report.

    606-ETA is one section consisting of 3 columns, therefore the increment is 3.
    There are 13 sections in total to process, hence the final area of interest is 41.

    Date | Time  | Day | 606                      | LOA | ETA    | 607 | LOA | ETA | ...
    ---------------------------------------------------------------------------------------
    5    | 06:00 | Wed | Ultra Wollongong = 10000 | 200 | 01-Feb |     |     |     |
         | 14:00 |     | Ferro Chrome             |     |        |     |     |     |
         | 22:00 |     | Ferro Chrome             |     |        |     |     |     |

    Args:
        response (scrapy.Response):

    Yields:
        Dict[str, str]:
    """
    table = self.extract_pdf_io(response.body, **self.tabula_options)

    start_processing = False
    start_index_of_interest = 3
    final_index_of_interest = 41
    prev_vessel_item = None
    vessel_item = None
    cargo_list = []

    while start_index_of_interest < final_index_of_interest:
        prev_month_item = None
        prev_day_item = None
        for idx, row in enumerate(table):
            # detect relevant row to start processing
            if 'TimeDay' in ''.join(row):
                start_processing = True
                continue

            if not start_processing:
                continue

            # memoise day as subsequent cells are empty until the next day cell is
            # filled
            if row[0]:
                prev_day_item = row[0]
                prev_month_item = (
                    row[0] if len(str(row[0]).split('-')) == 3 else prev_month_item
                )

            batch = row[start_index_of_interest : start_index_of_interest + 3]

            # detect vessel row when the loa column is filled with an appropriate number
            if is_number(may_strip(batch[1])):
                # memoise vessel details
                vessel_item = {
                    'vessel_name': batch[0],
                    'vessel_loa': batch[1],
                    'eta': batch[2],
                    'berthed_day': prev_day_item if not row[0] else row[0],
                    'berthed_month': prev_month_item,
                    'berthed_time': row[1],
                    'port_name': 'Richards Bay',
                    'provider_name': self.provider,
                    'reported_date': parse_date(
                        may_strip(self.reported_date), dayfirst=True
                    ).isoformat(),
                }
                cargo_list = []
                continue

            # cargo appears after the vessel row is identified;
            # append until the next vessel row is detected
            if vessel_item:
                cargo_list.append(batch[0])
                vessel_item.update(cargo_list=cargo_list)

            # once the last index is reached, restart the processing
            if idx == len(table) - 1:
                start_processing = False

            # yield item once cargo list is complete; process_item is a generator,
            # so delegate with `yield from`
            if prev_vessel_item and (
                vessel_item.get('vessel_name') != prev_vessel_item.get('vessel_name')
            ):
                yield from normalize.process_item(prev_vessel_item)

            # since this is based on previous and current item comparison to yield,
            # we need to force yield the last item in the last section since there
            # is nothing to compare
            if start_index_of_interest >= 39 and idx == len(table) - 1:
                yield from normalize.process_item(vessel_item)

            prev_vessel_item = vessel_item

        # process next batch in the table
        start_index_of_interest += 3
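

# Sanity-check sketch (illustrative only, not part of the spider): the 13
# three-column sections described in `parse_pdf` start at column indices
# 3, 6, ..., 39, which is why the loop stops before index 41.
_example_section_starts = [3 + 3 * n for n in range(13)]
assert _example_section_starts == list(range(3, 41, 3))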