def __init__(self, table_name='Scheduled Calls', *args, **kwargs):
    """Init BilbaoWeb spider.

    There are five tables of vessel movements given by the source:
        - Vessel Arrivals
        - Vessel Departures
        - Vessel Operating
        - Scheduled Calls
        - Inactive Stay

    Each spider job needs to choose one table to scrape.

    """
    super().__init__(*args, **kwargs)
    self.table_name = may_strip(table_name)
def parse_mail(self, mail):
    """Parse each email that was matched with the spider filter arguments.

    Args:
        mail (Mail):

    Yields:
        Dict[str, str]:

    """
    for attachment in mail.attachments():
        for sheet in xlrd.open_workbook(file_contents=attachment.body, on_demand=True).sheets():
            start_processing = False
            for raw_row in sheet.get_rows():
                row = format_cells_in_row(raw_row, sheet.book.datemode)

                # detect the header row and memoise it
                if any(may_strip(x.lower()) in HEADER_STR for x in (row[0], row[1], row[3])):
                    header = row
                    start_processing = True
                    continue

                if start_processing:
                    raw_item = {
                        may_strip(head.lower()): may_strip(row[idx].replace('\n', '|').lower())
                        for idx, head in enumerate(header)
                    }
                    # occasionally the header row contains an empty column name;
                    # rename it so downstream mapping can still pick it up
                    if '' in raw_item:
                        raw_item['eta_holder'] = raw_item.pop('', None)

                    raw_item.update(
                        provider_name=self.provider,
                        reported_date=mail.envelope['subject'],
                        port_name=may_strip(sheet.name.lower()),
                    )
                    yield from normalize.process_item(raw_item)
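# Illustrative sketch only (toy values; assumes `may_strip` behaves like
# `str.strip` on plain strings): how a memoised header row is zipped with a
# data row to build `raw_item` in `parse_mail` above.
# >>> header = ['Vessel', 'ETA', 'Agent', '']
# >>> row = ['MSC OSCAR ', '01/02/2020', 'ACME\nSHIPPING', '02/02/2020']
# >>> {h.strip().lower(): row[i].replace('\n', '|').strip().lower()
# ...  for i, h in enumerate(header)}
# {'vessel': 'msc oscar', 'eta': '01/02/2020', 'agent': 'acme|shipping', '': '02/02/2020'}
# The empty-string key would then be renamed to 'eta_holder'.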
def field_mapping():
    return {
        '0': ('vessel_status', None),
        '1': ignore_key('size'),
        '2': ('lay_can', None),
        '3': ('departure_zone', lambda x: ZONE_MAPPING.get(x.lower(), x)),
        '4': ('arrival_zone', lambda x: [ZONE_MAPPING.get(z.lower(), z) for z in x.split('-')]),
        '5': ('rate_value', lambda x: re.sub(TO_REMOVE, '', x).strip()),
        '6': ('charterer', lambda x: None if may_strip(x) == 'CNR' else x),
        'provider_name': ('provider_name', None),
        'reported_date': ('reported_date', None),
    }
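# Sketch of how a `field_mapping` like the above is typically consumed
# (an assumption about the shared mapping helper, not its actual code):
#
#     item = {}
#     for key, value in raw_item.items():
#         target, transform = field_mapping().get(key, (None, None))
#         if target:
#             item[target] = transform(value) if transform else value
#
# e.g. a raw cell {'4': 'usgulf-rotterdam'} would become
# {'arrival_zone': ['US Gulf', 'Rotterdam']} given suitable ZONE_MAPPING entries.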
def normalize_item_dates(item):
    """Cleanup item dates.

    Args:
        item (dict):

    Returns:
        item (dict):

    Examples:
        >>> normalize_item_dates({'lay_can_start': '12-14', \
            'reported_date': '05.03.2020'})
        {'lay_can_start': '2020-03-14T00:00:00', 'reported_date': '05 Mar 2020'}
        >>> normalize_item_dates({'lay_can_start': 'TBA', 'reported_date': '05.03.2020'})
        {'lay_can_start': None, 'reported_date': '05 Mar 2020'}

    """
    try:
        reported_date = datetime.datetime.strptime(item['reported_date'], '%d.%m.%Y')
        item['reported_date'] = reported_date.strftime('%d %b %Y')
    except ValueError:
        item['reported_date'] = None
        item['lay_can_start'] = None
        return item

    if not item['lay_can_start'] or item['lay_can_start'] in STRING_BLACKLIST:
        item['lay_can_start'] = None
    else:
        # a laycan such as '12-14' denotes a day range; keep the end of it
        if '-' in item['lay_can_start']:
            lay_can = may_strip(item['lay_can_start'].split('-')[-1])
        else:
            lay_can = may_strip(item['lay_can_start'])

        year_month = f'{reported_date.year}-{reported_date.month}-'
        item['lay_can_start'] = to_isoformat(year_month + lay_can, dayfirst=False)

    return item
def split_product(raw_product, raw_volume_unit, raw_attachment_name):
    """Split and yield multiple products.

    Args:
        raw_product (str):
        raw_volume_unit (str):
        raw_attachment_name (str):

    Examples:
        >>> split_product('Jet A1 + Gasoil', '30000 + 20000', 'yanbu')
        ([('Jet A1', '30000'), ('Gasoil', '20000')], 'tons')
        >>> split_product('Jet A1 + Gasoil', '30000', 'yanbu')
        ([('Jet A1', '15000.0'), ('Gasoil', '15000.0')], 'tons')
        >>> split_product('Jet A1', '30000', 'yanbu')
        ([('Jet A1', '30000')], 'tons')
        >>> split_product('Butane / Propane', 'TBA', 'yanbu')
        ([('Butane', None), ('Propane', None)], None)

    Returns:
        Tuple[List[Tuple[str, Optional[str]]], Optional[str]]: (product, volume) pairs and unit

    """
    product_list = [may_strip(prod) for prod in re.split(r'[\\\+\&\/\;]', raw_product)]
    volume_unit_list, units = get_vol_unit(raw_volume_unit, raw_attachment_name)

    if len(product_list) == len(volume_unit_list):
        return list(zip(product_list, volume_unit_list)), units

    # one volume for several products: split it evenly across them
    if len(product_list) > 1 and len(volume_unit_list) == 1:
        final_list = []
        for item_product in product_list:
            # source may contain typo errors
            try:
                vol_append = str(float(volume_unit_list[0]) / len(product_list))
            except Exception:
                vol_append = None
            final_list.append((item_product, vol_append))
        return final_list, units

    if product_list and not volume_unit_list:
        return list(zip_longest(product_list, volume_unit_list)), None

    return None, None
def parse(self, response):
    """Parse and extract raw items from html tables.

    Each entry in the port activity table has a link on the vessel name,
    with the vessel IMO in the link itself. We append vessel imo to each row
    we extract, since it is not technically part of the table cells.

    Vessel imo appears as part of the html query string, e.g.:
        ".../phpcodes/navire_a.php?ship=9297905"
                                        ^^^^^^^ imo

    Args:
        response (scrapy.HtmlResponse):

    Yields:
        Dict[str, str]:

    """
    # memoise reported_date so it won't need to be computed repeatedly
    reported_date = (
        dt.datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0).isoformat()
    )

    # each index corresponds to the vessel movement type in the table
    # 0: attendus (expected)
    # 1: a quai (berthed)
    # 2: en rade (at anchorage)
    for table_idx in range(3):
        table = response.xpath(f'//div[contains(@class, "et_pb_tab_{table_idx}")]//table')
        header = [may_strip(head) for head in table.xpath('.//th/text()').extract()]
        for row in table.xpath('./tbody//tr'):
            raw_item = row_to_dict(row, header)
            # contextualise raw item with meta info
            raw_item.update(
                port_name=self.name,
                provider_name=self.provider,
                reported_date=reported_date,
                vessel_imo=row.xpath('./td//@href').extract_first().split('ship=')[1],
            )
            yield normalize.process_item(raw_item)
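# Quick check of the IMO extraction above (toy href, verifiable in a REPL):
# >>> 'phpcodes/navire_a.php?ship=9297905'.split('ship=')[1]
# '9297905'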
def parse_response(self, response):
    """Parse response from Alaska website.

    Args:
        response (scrapy.Response):

    Yields:
        Dict[str, str]:

    """
    # denotes if a record should be processed when iterating over each table row
    process_flag = False
    tables = response.xpath('(//table[@id="ContentPlaceHolder1_Table1"])')
    for raw_row in tables.xpath('//tr'):
        row = [may_strip(td.xpath('.//text()').extract_first()) for td in raw_row.xpath('.//td')]

        # according to the source, the number of columns in the table is 8
        if len(row) < 8:
            continue

        # indicates the start of the first data row in the table
        if row[0] == 'Date':
            process_flag = True
            continue
        # indicates the end of the table
        elif row[0] == 'Average':
            process_flag = False
            continue

        if not process_flag:
            continue

        yield {
            # 7 denotes the position of the volume in the table's row
            'volume': int(row[7].replace(',', '')),
            'country': 'Alaska',
            'country_type': 'region',
            'start_date': parser.parse_input_date(row[0]),  # is inclusive
            'end_date': parser.parse_input_date(row[0], days=1),  # is exclusive
            'unit': Unit.barrel,
            'provider_name': self.provider,
            'reported_date': self.reported_date,
            'balance': BalanceType.ending_stocks,
            'product': 'Crude Oil',
        }
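# Sanity sketch of the volume parsing above (toy value):
# >>> int('1,234,567'.replace(',', ''))
# 1234567
# Only rows between the 'Date' header row and the trailing 'Average' row are
# yielded, courtesy of process_flag.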
def field_mapping():
    return {
        '0': ('vessel', lambda x: {'name': x} if 'TBN' not in x.split() else None),
        '1': ('cargo_volume', None),
        '2': ('cargo_product', None),
        '3': ('departure_zone', None),
        '4': ('arrival_zone', lambda x: x.split('-') if x else None),
        '5': ('laycan', None),
        '6': ('rate_value', None),
        '7': ('charterer', lambda x: may_strip(x.replace('-', ''))),
        '8': ('status', lambda x: STATUS_MAPPING.get(x)),
        'provider': ('provider_name', None),
        'reported_date': ('reported_date', None),
    }
def split_row(row):
    """Try to split the row.

    Raw rows look like (typos are verbatim from the source):
        CHANGELOS GABRIEL FOR UNKNWON PARTY. 60,000MT FUEL OIL. LAYCAN EX U.S GULF 03-05 DECEMBER. FREIGHT UNKNOWN
        NORDIC GENEVA FOR CLEARLAKE. 60,000MT FUEL OIL. LAYCAN EX TANJUNG PELEPAS 29-31 DECEMBER. FREIGHT OWN PROGRAM TC

    Args:
        row (str):

    Returns:
        Tuple[List[str], List[str]]: cells and headers

    """
    match = re.match(INFORMATION_REGEX, may_strip(row))
    if match:
        return list(match.groups()), HEADERS

    return None, None
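# For reference, a regex that would satisfy the two sample rows in the
# docstring above (hypothetical; the real INFORMATION_REGEX is defined
# elsewhere in this module and may differ):
# INFORMATION_REGEX_SKETCH = (
#     r'(.+?) FOR (.+?)\. ([\d,]+)MT (.+?)\. LAYCAN EX (.+?) ([\d\-]+ \w+)\. FREIGHT (.+)'
# )
# which would capture vessel, charterer, volume, product, zone, laycan and rate.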
def grades_mapping():
    return {
        'SHIPS NAME': ('vessel', lambda x: {'name': may_strip(x.replace('*', ''))}),
        'PRODUCTS': ('cargo_product', None),
        'QUANTITY': ('cargo_volume_movement', None),
        'TERMINAL': ('installation', None),
        'PORT': ('port_name', None),
        'ARRIVAL': ('arrival', normalize_date),
        'BERTHING': ('berthed', normalize_date),
        'ETD': ('departure', normalize_date),
        'COMMENTS': ignore_key('irrelevant'),
        'provider_name': ('provider_name', None),
        'reported_date': ('reported_date', None),
    }
def extract_headers(table, rm_headers=None):
    """Extract headers of specified table.

    NOTE could be made generic

    Args:
        table (scrapy.Selector):
        rm_headers (List[str]): list of header names to remove before returning

    Returns:
        List[str]:

    """
    # avoid a mutable default argument
    rm_headers = rm_headers or []
    headers = table.xpath('.//tr[@class="omg"]//font/text()').extract()
    # remove unnecessary headers
    return [may_strip(head) for head in headers if head not in rm_headers]
def _clean_string(raw):
    """Clean strings and transform empty strings into NoneType.

    Examples:
        >>> _clean_string(' ')
        >>> _clean_string('')
        >>> _clean_string(' DOW CHEMICAL TEXAS OPERATIONS')
        'DOW CHEMICAL TEXAS OPERATIONS'
        >>> _clean_string(None)

    """
    if not raw:
        return None

    cleaned = may_strip(raw)
    return cleaned if cleaned else None
def field_mapping():
    return {
        'vessel': ('vessel', lambda x: {'name': may_strip(x)}),
        'size': ('cargo_volume', None),
        'cargo': ('cargo_product', None),
        'layday': ('lay_can', None),
        'load': ('departure_zone', None),
        'discharge': ('arrival_zone', normalize_voyage),
        'freight': ('rate_value', None),
        'charterer': ('charterer', None),
        'status': ('status', lambda x: STATUS_MAPPING.get(x)),
        'provider_name': ('provider_name', None),
        'reported_date': ('reported_date', None),
    }
def field_mapping():
    return {
        'eta': ('eta', None),
        'etb': ('etb', None),
        'discharge': ('discharge', normalize_quantity),
        'installation': ('installation', None),
        'load': ('load', normalize_quantity),
        'port_name': ('port_name', normalize_port_name),
        'product': (
            'product',
            # filter on each split product, not on the whole raw string
            lambda x: [
                may_strip(product) for product in x.split('/') if product not in INVALID_CARGOES
            ],
        ),
        'provider_name': ('provider_name', None),
        'reported_date': ('reported_date', lambda x: to_isoformat(x, dayfirst=True)),
        'vessel': ('vessel', lambda x: {'name': normalize_vessel_name(x)}),
    }
def normalize_string(raw_value):
    """Remove unnecessary strings.

    Args:
        raw_value (str):

    Examples:
        >>> normalize_string(None)
        >>> normalize_string('TBC')
        >>> normalize_string('DHT LEOPARD')
        'DHT LEOPARD'

    """
    if (raw_value and str(raw_value) in BLACKLIST) or raw_value == '':
        return None

    return may_strip(raw_value)
def field_mapping():
    return {
        'SR': ignore_key('irrelevant'),
        'Date': ('berthed', None),
        'Jetty': ('berth', lambda x: try_apply(x, str).replace('.0', '')),
        'Operation': ('cargo_movement', lambda x: re.sub(r'[\W]', '', x.lower())),
        'Product': ('cargo_product', None),
        'Vessel Name': ('vessel', lambda x: {'name': may_strip(x)}),
        # TODO include buyer and seller
        'Customer / Terminal': ignore_key('buyer and seller, cannot be used currently'),
        'Latest Value b/w CT inform & Berth Available': ignore_key('irrelevant'),
        'Parcel size M3': ('cargo_volume', lambda x: try_apply(x, str).replace(',', '')),
        'port_name': ('port_name', None),
        'installation': ('installation', None),
        'provider_name': ('provider_name', None),
        'reported_date': ('reported_date', None),
    }
def _remove_elements(lst, rm_elements=None):
    """Remove useless elements of a list.

    NOTE could be made generic

    Args:
        lst (List[str]):
        rm_elements (List[str]): list of elements to remove before returning

    Returns:
        List[str]:

    """
    rm_elements = rm_elements or []
    # remove all unwanted elements in one pass; the previous loop re-filtered
    # the original list on each iteration, so only the last element was removed
    stripped_lst = [element for element in lst if element not in rm_elements]
    return [may_strip(element) for element in stripped_lst if element != '\n']
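# Usage sketch for the fixed helper (assuming `may_strip` ~ `str.strip`):
# >>> _remove_elements(['BUQUE ', '\n', 'MUELLE', 'N/A'], rm_elements=['N/A'])
# ['BUQUE', 'MUELLE']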
def parse_pdf(self, p_name, body, tab_opt, mail):
    """Parse PDF reports.

    Args:
        p_name (str): attachment name
        body (bytes): raw attachment body
        tab_opt (Dict[str, str]): tabula extraction options
        mail (Mail): mail object, used for the reported date

    Yields:
        Dict[str, str]:

    """
    prev_row = None
    reported_date = self.extract_reported_date(mail.envelope['subject'])
    for row in self.extract_pdf_io(body, **tab_opt):
        # forward-fill empty cells from the previous complete row, so that
        # rows with a missing vessel cell inherit it (the previous version
        # skipped such rows before the ffill could ever apply)
        if not row[1] and prev_row:
            row = [prev_cell if not cell else cell for cell, prev_cell in zip(row, prev_row)]

        # remove rows with no vessels
        if not row[1]:
            continue

        # memoise row for ffill
        prev_row = row

        # remove filler rows
        if any(sub in row[1] for sub in ('EMPTY', 'TGL', 'TERMINAL', 'TPG')):
            continue

        if 'VESSEL' in row[1]:
            header = row
            continue

        # extract data row
        raw_item = {head: may_strip(row[head_idx]) for head_idx, head in enumerate(header)}
        # contextualise raw item with meta info
        raw_item.update(
            reported_date=reported_date,
            port_name='Aratu',  # report only contains data for Aratu port
            provider_name=self.provider,
        )
        yield process_item(raw_item)
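# Toy illustration of the forward-fill comprehension used above
# (cell values invented):
# >>> prev_row = ['1', 'STAR LADY', 'BERTH 2']
# >>> row = ['2', '', 'BERTH 3']
# >>> [p if not c else c for c, p in zip(row, prev_row)]
# ['2', 'STAR LADY', 'BERTH 3']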
def map_berth_to_installation(raw_berth: str) -> Optional[str]:
    """Clean, normalize, and map a raw berth name to a known installation.

    Examples:
        >>> map_berth_to_installation('Valero 8')
        'Valero Pembroke'
        >>> map_berth_to_installation('Dragon No1')
        'Dragon'
        >>> map_berth_to_installation('Milford Dock')

    """
    for known_berth, installation in BERTH_TO_INSTALLATION_MAPPING.items():
        if known_berth in may_strip(raw_berth):
            return installation

    logger.debug('Unknown berth: %s', raw_berth)
    return None
def portcall_mapping() -> Dict[str, Tuple[str, Optional[Callable]]]:
    return {
        'Date': ('eta', lambda x: to_isoformat(x, dayfirst=False, yearfirst=True)),
        'Ship': ('vessel_name', None),
        'GT': ('vessel_gt', validate_weight),
        'DWT': ('vessel_dwt', validate_weight),
        'Move Type': ('event', lambda x: may_strip(x.lower())),
        'Remarks': ignore_key('handwritten notes from port authority'),
        'From': ignore_key('irrelevant'),
        'To': ('installation', map_berth_to_installation),
        'ACTION_STATUS_ID': ignore_key('internal ID used by port'),
        'Tug': ignore_key('irrelevant'),
        'port_name': ('port_name', None),
        'provider_name': ('provider_name', None),
        'reported_date': ('reported_date', None),
    }
def sanitize_date(raw_date):
    """Sanitize raw date as an ISO8601 timestamp.

    Raw date is usually on 2 lines, the first one being a nicely formatted
    %y-%m-%d and the second a messy combination of hours, AM, PM, etc.
    The date could be invalid; in this case, we want to keep calm and
    carry on processing.

    Examples:
        >>> sanitize_date('2019-05-11')
        '2019-05-11T00:00:00'
        >>> sanitize_date('2019-05-0907:20')
        '2019-05-09T07:20:00'
        >>> sanitize_date('2019-05-19 19:00')
        '2019-05-19T19:00:00'
        >>> sanitize_date('2019-05-11 AM:-')
        '2019-05-11T00:00:00'
        >>> sanitize_date('2019-05-05 09 :00')
        '2019-05-05T09:00:00'
        >>> sanitize_date('2019-05-10 :')
        '2019-05-10T00:00:00'
        >>> sanitize_date('2019-06-01 sh:')
        '2019-06-01T00:00:00'
        >>> sanitize_date('2019-06-01 -06:-00')
        '2019-06-01T06:00:00'
        >>> sanitize_date('')

    """
    # sanity check
    if not raw_date:
        return None

    # remove excessive whitespace first
    raw_date = may_strip(raw_date)
    try:
        date, hour, minute = re.match(DATE_PARSING_PATTERN, raw_date).groups()
        hour = try_apply(hour, int) or '00'
        minute = try_apply(minute, int) or '00'
        return to_isoformat(f'{date} {hour}:{minute}', dayfirst=False)
    # keep calm so that we can proceed with processing
    except AttributeError:
        logger.error('Date might be invalid, please double check: %s', raw_date)
        return None
def field_mapping():
    return {
        'VESSEL': ('vessel', lambda x: {'name': may_strip(x)}),
        'ETA': ('eta', normalize_date),
        'ETB': ('berthed', normalize_date),
        'ETS': ('departure', normalize_date),
        'GRADE': ('cargo_product', lambda x: None if 'TBI' in x else x),
        'QTTY': ('cargo_quantity', lambda x: try_apply(x.replace('.', '', 1), int)),
        'TTL QTTY': ignore_key('redundant'),
        'LD PORT': ('load_cargo_movement', None),
        'DISC PORT': ('dis_cargo_movement', None),
        'SH / REC': ignore_key('redundant'),
        'RMK': ignore_key('redundant'),
        'port_name': ('port_name', None),
        'provider_name': ('provider_name', None),
        'reported_date': ('reported_date', None),
    }
def _process_tarragona_rows(self, table):
    """Process rows for tarragona port.

    Decode all rows as unicode strings since tabula outputs byte strings by default.
    Extract matching date for each table section based on the section's description.
    Yield only rows that contain table data, skipping the table section description.

    Matching date in table header can have the following formats:
        - "Buques atracados el día 18 de marzo del 2018 (Información actualizada a las 8:30 horas)"  # noqa
        - "Buques atracados el día18 de marzo del 2018 (Información actualizada a las 8:30 horas)"  # noqa

    raw table row order:
        - table name (discard): ['PUE', 'RTO DE', 'TARRAGO', 'NA']
        - matching_date (extract): ['Buques atracados el d\xc3\xada', '29 de marzo del 2018 (', 'Informaci\xc3\xb3n actualizada a la', 's 8:00 horas)']  # noqa
        - header (keep): ['BUQUE', 'MUELLE', 'CONSIGNATARIO/', 'TONS']
        - 2nd header (discard): ['', '', 'ESTIBADOR', '']
        - data row (keep): ['CANNETO M', 'FONDEADO ZONA II', 'IB\xc3\x89RICA MAR\xc3\x8dTIMA', '-']  # noqa
        - data row (keep): ['OTTOMANA', 'PANTAL\xc3\x81 REPSOL', 'MARITIME/REPSOL', 'P. PETROL\xc3\x8dFEROS/23.300-D']  # noqa
        - subsequent data rows (keep): ...

    Args:
        table (List[List[str]]): list of table rows from pdf

    Yields:
        List[str]:

    """
    for idx, row in enumerate(table):
        # tabula stores string data as bytes by default; decode them
        row = [cell.decode('utf-8') if isinstance(cell, bytes) else cell for cell in row]

        # try deciphering matching_date in the second row
        if idx == 1:
            matching_date = parse_raw_date(
                may_strip(''.join(row).split('atracados el día')[1].split('(Info')[0])
            )
            logger.debug('Found matching date: {}'.format(matching_date))

        if idx >= 2:
            if not ('ESTIBADOR' in ' '.join(row) or 'PUERTO DE' in ''.join(row)):
                # third row contains headers; append the column name for it,
                # and the actual matching date for every data row after it
                row.append('matching_date' if idx == 2 else matching_date)
                yield row
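# Quick check of the byte-decoding comprehension above (toy cells):
# >>> [c.decode('utf-8') if isinstance(c, bytes) else c
# ...  for c in [b'CANNETO M', 'FONDEADO ZONA II']]
# ['CANNETO M', 'FONDEADO ZONA II']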
def normalize_currency_rate(raw_currency, raw_rate):
    """Combine currency and rate columns together.

    Args:
        raw_currency (str):
        raw_rate (str):

    Returns:
        str:

    Examples:
        >>> normalize_currency_rate('WS', '167.5')
        'WS 167.5'
        >>> normalize_currency_rate('RNR', None)
        'RNR'

    """
    return may_strip(f'{raw_currency} {raw_rate}') if raw_currency and raw_rate else raw_currency
def split_row(row):
    """Try to split the row.

    Args:
        row (str):

    Returns:
        Tuple[List[str], List[str]]: cells and headers

    """
    for pattern, headers in PATTERN_HEADER_MAPPING.values():
        match = re.match(pattern, may_strip(row))
        if match:
            return list(match.groups()), headers

    return None, None
def parse_mail(self, mail):
    self.reported_date = parse_date(mail.envelope['date']).strftime('%d %b %Y')
    body = self.select_body_html(mail)
    for row_html in body.xpath('//p'):
        # rows are divided by <p> tags; however, some rows share the same <p> tag,
        # in which case they are usually separated by two <br> tags, hence the
        # split on `\r\n\r\n` (a single <br> can occur within a row).
        # this is not an ideal solution, but the format is so inconsistent that
        # there is no more elegant way to detect the rows
        rows = ''.join(row_html.xpath('.//text()').extract()).split('\r\n\r\n')
        for raw_row in rows:
            row = self.split_row(may_strip(raw_row))
            if row:
                raw_item = {str(idx): cell for idx, cell in enumerate(row)}
                raw_item.update(self.meta_field)
                yield normalize.process_item(raw_item)
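# Sketch of the double-<br> row separation above (toy payload):
# >>> 'VESSEL A FOR X ...\r\n\r\nVESSEL B FOR Y ...'.split('\r\n\r\n')
# ['VESSEL A FOR X ...', 'VESSEL B FOR Y ...']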
def normalize_charterer(raw_charterer):
    """Remove unnecessary strings.

    Examples:
        >>> normalize_charterer('SHELL')
        'SHELL'
        >>> normalize_charterer('SHELL-REPLACED')
        'SHELL'

    Args:
        raw_charterer (str):

    Returns:
        str:

    """
    charterer = raw_charterer.partition('-')[0]
    return may_strip(charterer)
def normalize_cargoes(item):
    """Normalize cargoes.

    Args:
        item (Dict[str, str]):

    Yields:
        Dict[str, str]:

    """
    cargoes = item['product_volume'].split(';')
    for cargo in cargoes:
        # TODO confirm volume unit with analysts; until then we cannot use the
        # volume figure, so the tail of the partition (the volume) is discarded
        product, _, _ = cargo.partition(',')
        yield {
            'product': may_strip(product),
            'movement': 'load' if item['is_load'] else 'discharge',
        }
def normalize_cargoes(item):
    # filter out irrelevant cargoes
    if not is_relevant_cargo(item['cargo_product'], item['cargo_movement']):
        return

    # multiple cargoes; ignore volume to avoid confusion
    if '+' in item['cargo_product']:
        for product in [may_strip(prod) for prod in item['cargo_product'].split('+')]:
            yield {'product': product}

    # single cargo; assign volume and movement
    else:
        yield {
            'product': item['cargo_product'],
            'movement': MOVEMENT_MAPPING.get(item['cargo_movement']),
            'volume': item['cargo_volume'],
            'volume_unit': Unit.tons,
        }
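# Illustration of the two branches above (toy items; assumes
# `is_relevant_cargo` accepts them and MOVEMENT_MAPPING maps 'D' to 'discharge'):
# {'cargo_product': 'ULSD + JET', ...}
#     -> {'product': 'ULSD'}, {'product': 'JET'}   (volumes dropped)
# {'cargo_product': 'ULSD', 'cargo_movement': 'D', 'cargo_volume': '10000', ...}
#     -> {'product': 'ULSD', 'movement': 'discharge', 'volume': '10000', 'volume_unit': Unit.tons}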
def field_mapping(**kwargs):
    return {
        'port_name': ('port_name', None),
        'provider_name': ('provider_name', None),
        'reported_date': ('reported_date', None),
        'berth': ('berth', None),
        'vessel': ('vessel', lambda x: {'name': normalize_vessel_name(x)}),
        'arrived': ('arrival', lambda x: normalize_date(may_strip(x), **kwargs, event='arrived')),
        'arrived eta': (
            'arrived eta',
            lambda x: normalize_date(may_strip(x), **kwargs, event='arrived'),
        ),
        'eta': ('eta', lambda x: normalize_date(may_strip(x), **kwargs, event='eta')),
        'etb': ('etb', lambda x: normalize_date(may_strip(x), **kwargs, event='etb')),
        'etc': ('etc', lambda x: normalize_date(may_strip(x), **kwargs, event='etc')),
        'ets': ('departure', lambda x: normalize_date(may_strip(x), **kwargs, event='ets')),
        'berthed': ('berthed', lambda x: normalize_date(may_strip(x), **kwargs, event='berthed')),
        'new etb': ('new etb', lambda x: normalize_date(may_strip(x), **kwargs, event='new etb')),
        'ops': ('cargoes', lambda x: list(normalize_cargo(x))),
    }