def parse(self, response): """Extract vessel movements listed in `start_urls`. Args: response (scrapy.Response): Yields: Dict[str, str]: """ # memoize reported date so it won't need to be repeatedly computed later reported_date = to_isoformat( response.xpath('//p/text()').extract_first(), dayfirst=False, yearfirst=True) table = response.xpath('//table/tr') header, data_rows = table[0].xpath('./th//text()').extract(), table[1:] for row in data_rows: raw_item = row_to_dict( row, header, port_name=self.name, reported_date=reported_date, provider_name=self.provider, ) yield normalize.process_item(raw_item)
def parse(self, response): """Entrypoint for parsing website response. Args: response (scrapy.Response): Yields: event (Dict[str, str]): """ # memoise reported date since source does not provide any reported_date = (dt.datetime.utcnow().replace( hour=0, minute=0, second=0).isoformat(timespec='seconds')) # extract tabular data on vessel movements for idx, row in enumerate(response.xpath('//table//tr')): # header will always exist in the first row if idx == 0: header = row.xpath('.//th/text()').extract() continue raw_item = row_to_dict(row, header) # contextualise raw item with meta info raw_item.update( port_name=self.name, provider_name=self.provider, reported_date=reported_date, # event type depends on the page url, so remember it # (in addition to debugging benefits) url=response.url, ) yield normalize.process_item(raw_item)
def parse(self, response): """Entry point of Milford spider. Args: response: Returns: Dict[str, str]: """ # memoise reported_date so it won't need to be called repeatedly reported_date = (dt.datetime.utcnow().replace( hour=0, minute=0, second=0).isoformat(timespec='seconds')) for idx, row in enumerate( response.xpath('//table[@id="DataTable"]//tr')): # first row always contains header if idx == 0: header = row.xpath('.//th/text()').extract() continue raw_item = row_to_dict(row, header) # contextualise raw item with meta info raw_item.update(port_name=self.name, provider_name=self.provider, reported_date=reported_date) yield normalize.process_item(raw_item)
def parse(self, response):
    table = response.xpath('//table[contains(@class, "table-results")]//tr')

    # extract tabular vessel movements
    for idx, row in enumerate(table):
        # headers will always be in the first row
        if idx == 0:
            header = row.xpath('./td/text()').extract()
            continue

        # don't scrape table sub-headers as row values
        if len(row.xpath('./td')) < len(header):
            continue

        raw_item = row_to_dict(row, header)
        # contextualise raw item with meta info
        raw_item.update(
            port_name=self.name,
            provider_name=self.provider,
            reported_date=dt.datetime.utcnow()
            .replace(hour=0, minute=0, second=0, microsecond=0)
            .isoformat(),
        )
        yield normalize.process_item(raw_item)

def parse(self, response): """Parse port activity pages. Args: response (scrapy.Response): Yields: Dict[str, str]: """ # memoise reported date so that we don't need to call it repeatedly below reported_date = response.xpath( '//strong/parent::*/text()').extract()[1] header = response.xpath('//table//th/text()').extract() # first cell of table row will always be empty as displayed on the website # hence, we append an "irrelevant" column as the first element of the header header.insert(0, 'irrelevant') for row in response.xpath('//table/tbody/tr'): raw_item = row_to_dict(row, header) # contextualise raw item with metadata raw_item.update(port_name='Puerto Moin', provider_name=self.provider, reported_date=reported_date) yield normalize.process_item(raw_item)
def parse(self, response): """Parse forecast/in-port activity of vessels at specified dock. Args: scrapy.Response: Yields: Dict[str, str]: """ reported_date = response.xpath('//h2/text()').extract_first() is_data_available = response.xpath('//th') # there is no recorded activity currently at the specified dock if not is_data_available: return for idx, row in enumerate(response.xpath('//tr')): # headers will always be at the first row of the table if idx == 0: headers = row.xpath('.//th/text()').extract() continue raw_item = row_to_dict(row, headers) # contextualise raw item with dock name and meta info raw_item.update( installation=response.meta['dock'], port_name=self.port_name, provider_name=self.provider, reported_date=reported_date, ) yield normalize.process_item(raw_item)
def parse(self, response): """Parse and extract raw items from html tables. Args: response (scrapy.HtmlResponse): Yields: Dict[str, Any]: """ reported_date = response.xpath( '//div[@class="panel radius text-center"]/text()').extract_first() for table in response.xpath('//table[@class="listado-boletin"]'): # verify if table is the one we want to scrape if may_strip(table.xpath( './caption/text()').extract_first()) != self.table_name: continue header = table.xpath('.//th/text()').extract() data_rows = table.xpath('.//tr') for row in data_rows: raw_item = row_to_dict(row, header) # contextualise raw item with meta info raw_item.update(port_name='Bilbao', provider_name=self.provider, reported_date=reported_date) yield normalize.process_item(raw_item)
def parse_table(self, response):
    """Parse table containing portcalls and request vessel details for each portcall.

    Args:
        response (scrapy.XmlResponse):

    Yields:
        FormRequest:

    """
    # scrapy autoselects XmlResponse, but the resource is actually HTML;
    # force response type to HTML
    html = parser.html_response(response)

    # iterate through each table row (i.e. portcall) in the response
    for row in html.xpath('//tbody/tr'):
        # get `source` of current row, i.e. key that describes current search category
        _source = row.xpath('td[2]/a/@id').extract_first()
        # abstract away underlying form functionality for a cleaner API
        form = parser.PortCallJsfForm(source=_source, viewstate=response.meta['viewstate'])

        yield FormRequest(
            url=self.start_urls[0],
            formdata=form.asdict(),
            # cache `raw_item` to allow more fields to be appended to it in the callback
            meta={'raw_item': row_to_dict(row, header=html.xpath('//th/text()').extract())},
            callback=self.parse_vessel_details,
        )

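# Plausible sketch of `parser.html_response`: scrapy chooses XmlResponse from
# the Content-Type header, so the payload is rebuilt as an HtmlResponse to
# make HTML xpath queries behave. Not necessarily the shared module's code.
from scrapy.http import HtmlResponse


def html_response(response):
    """Coerce a mis-typed XmlResponse into an HtmlResponse."""
    return HtmlResponse(url=response.url, body=response.body, encoding='utf-8')
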
def parse(self, response): """Parse responses from source containing port activity. Args: response (scrapy.Response): Yields: Dict[str, str]: """ for idx, row in enumerate( response.xpath('//table[@id="vesselinfo"]//tr')): # first row will always contain headers if idx == 0: headers = [ may_strip(cell) for cell in row.xpath('th//text()').extract() if may_strip(cell) ] continue raw_item = row_to_dict(row, headers) # append extra shipping agent info _agent_res = yield DubaiTradeSession.get_shipping_agent( rotation_id=may_strip( row.xpath('.//a/text()').extract_first())) raw_item.update( **DubaiTradeSession.parse_shipping_agent(_agent_res)) # contextualise raw item with meta info raw_item.update(port_name='Jebel Ali', provider_name=self.provider, reported_date=self.reported_date) yield normalize.process_item(raw_item)
def parse(self, response: Response) -> Iterator[Optional[Dict[str, Any]]]:
    """Parse response from IseWan Vessel Traffic Service Centre website."""
    reported_date = response.xpath('//div[@class="_inner"]/p/text()').extract_first()

    # hold a sequential list of vessel lineup events for advanced parsing
    events = []

    table = response.xpath('//table[@class="generalTB"]')
    for row_idx, row in enumerate(table.xpath('.//tr')):
        # first row of source table is always the header
        if row_idx == 0:
            headers = row.xpath('.//th/text()').extract()
            continue

        # subsequent rows are exclusively vessel movements
        raw_item = row_to_dict(row, headers)
        # contextualise item with meta info
        raw_item.update(provider_name=self.provider, reported_date=reported_date)
        # standardize character width
        for key, value in raw_item.items():
            raw_item[key] = may_strip(_standardize_char_width(value))

        event = normalize.process_item(raw_item)
        if event:
            events.append(event)

    # combine arrival and departure events into a single 'PortCall' datatype
    for event in events:
        yield from normalize.combine_event(event, events)

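# Assumed sketch of `_standardize_char_width`: the IseWan source mixes
# full-width and half-width characters, and NFKC normalisation folds
# full-width forms (e.g. "ＡＢＣ１２３") into their ASCII equivalents.
import unicodedata


def _standardize_char_width(value):
    """Normalise full-width characters to their half-width equivalents."""
    return unicodedata.normalize('NFKC', value) if value else value
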
def parse(self, response):
    # get nature of portcall data scraped from each url
    for event in EVENT_MAPPING:
        if event in response.url:
            event_type = EVENT_MAPPING[event]
            break

    for idx, row in enumerate(response.xpath('//table[@class="ships"]//tr')):
        # first row is always the headers
        if idx == 0:
            headers = [
                # NOTE some headers don't have a key assigned
                th.xpath('.//text()').extract_first() or 'unknown'
                for th in row.xpath('./th')
            ]
            continue

        raw_item = row_to_dict(row, headers)
        # contextualise raw item with meta info
        raw_item.update(
            event_type=event_type,
            port_name=self.name,
            provider_name=self.provider,
            reported_date=self.reported_date,
        )
        yield normalize.process_item(raw_item)

def parse_manifest_page(self, response):
    """Parse manifest detail page response and extract desired information."""
    # no cargo info present, discard
    if 'no tiene conocimientos' in response.text.lower():
        return

    pc_header = [
        may_strip(head.extract())
        for head in response.xpath('//body/table[@width="80%"]//tr//td//b/text()')
    ]
    portcall = response.xpath('//body/table[@width="80%"]//tr//td/text()').extract()
    portcall = dict(zip(pc_header, portcall))
    portcall.update(
        cargoes=[],
        port_names=set(),
        provider_name=self.provider,
        # NOTE we need to introspect url per `raw_cargo` to append data to `cargoes`
        raw_cargoes=[],
    )

    # NOTE first element of `cargoes` is the header
    cargoes = response.xpath('//body/table[@width="100%"]//tr')
    header = [may_strip(head) for head in cargoes[0].xpath('.//td//text()').extract()]
    cargoes = cargoes[1:]

    for raw_row in cargoes:
        raw_cargo = row_to_dict(
            raw_row,
            header,
            cargo_url=response.urljoin(raw_row.xpath('.//a/@href').extract()[-1]),
        )
        # reported_date is based on manifest publish date; take earliest one for consistency
        if not portcall.get('reported_date'):
            # the source sometimes serves a mis-encoded header, hence the fallback key
            _reported_date = raw_cargo.get('Fecha de Transmisión') or raw_cargo.get(
                'Fecha de Transmisi�n'
            )
            portcall.update(reported_date=_reported_date)

        # easy optimization; don't care about cargoes in-transit and not bound for Peru
        # destinations are given as UN/LOCODEs
        if not raw_cargo['Puerto Destino'].startswith('PE'):
            continue

        portcall['raw_cargoes'].append(raw_cargo)

    return self.extract_cargo_data(portcall)

def parse(self, response):
    table = response.css('table.shipsinport')[SCHEDULE_TABLE_ORDER].css('tr')
    rows = table[STARTING_ROW_ID:]
    headers = [may_strip(head) for head in table[HEADER_ROW_ID].css('::text').extract()]

    title = table[DATE_ROW_ID].extract()
    last_updated = re.search(r'[0-9]{2}\.[0-9]{2}\.20[0-9]{2}', title).group()

    for row in rows:
        raw_item = row_to_dict(row, headers)
        raw_item.update(
            port_name=self.name, provider_name=self.provider, reported_date=last_updated
        )
        yield normalize.process_item(raw_item)

def parse(self, response): """Parse and extract raw items from html tables. Each entry in the port activity table has a link on the vessel name, with the vessel IMO in the link itself. We append vessel imo to each row we extract, since it is not technically part of the table cells. Vessel imo appears as part of the html query string, e.g.: ".../phpcodes/navire_a.php?ship=9297905" ^^^^^^^ imo Args: response (scrapy.HtmlResponse): Yields: Dict[str, str]: """ # memoise reported_date so it won't need to be called repeatedly reported_date = (dt.datetime.utcnow().replace( hour=0, minute=0, second=0, microsecond=0).isoformat()) logger.info(f"JJJJJJJJJJJJJJ {self.provider}") # each index corresponds to the vessel movement type in the table # 0: attendus # 1: a quai # 2: en rade for table_idx in range(3): table = response.xpath( f'//div[contains(@class, "et_pb_tab_{table_idx}")]//table') header = [ may_strip(head) for head in table.xpath('.//th/text()').extract() ] for row in table.xpath('./tbody//tr'): raw_item = row_to_dict(row, header) # conextextualise raw item with meta info raw_item.update( port_name=self.name, provider_name=self.provider, reported_date=reported_date, vessel_imo=row.xpath('./td//@href').extract_first().split( 'ship=')[1], ) yield normalize.process_item(raw_item)
def parse_expected_schedule_page(self, response):
    header = response.xpath('//th/text()').extract()

    # obtain table rows
    for row in response.xpath('//div[@id="tab_mouvement_bottom_div"]//tr'):
        raw_item = row_to_dict(
            row,
            header,
            # contextualise raw item with meta data
            port_name=self.name,
            provider_name=self.provider,
            reported_date=dt.datetime.utcnow(),
            # data on this page is exclusively vessels yet to arrive
            event='eta',
        )
        yield normalize.process_item(raw_item)

def parse_vessel_details(self, response):
    """Parse page containing detailed portcall info.

    Each page contains six tables:
        - "Identification escale" (internal port identification numbers)
        - "Entree" (eta, arrival, piloting dates and timestamps)
        - "Sortie" (etd, departure dates and timestamps)
        - "Sejour a Quai" (berthed date and timestamp)
        - "Provenance / Destination" (previous port visited, next port to be visited)
        - "Caracteristiques du navire" (vessel attributes)

    All tables except "Caracteristiques du navire" are structured identically
    and can be parsed as such. Table "Caracteristiques du navire" will be
    parsed separately.

    Yields:
        Dict[str, str]:

    """
    raw_item = {}

    # parse and extract from the identically structured tables
    for table in response.xpath('//table[@class="ormo table table-striped "]'):
        headers = table.xpath('.//th//text()').extract()
        for idx, row in enumerate(table.xpath('.//tr')):
            # headers will always be in the first row; skip it
            if idx == 0:
                continue
            raw_item.update(row_to_dict(row, headers))

    # parse and extract from "Caracteristiques du navire" table
    for field in response.xpath('//table[@class="ormo ref"]//td'):
        key = field.xpath('./div/div[1]//text()').extract_first()
        value = field.xpath('./div/div[2]//text()').extract_first()
        raw_item[key] = value

    # contextualise raw item with meta info
    raw_item.update(
        port_name=self.port_name, provider_name=self.provider, reported_date=self.reported_date
    )
    yield normalize.process_item(raw_item)

def parse_in_quay_page(self, response):
    header = response.xpath('//th/text()').extract()
    # `Unloading` column can sometimes appear in the headers list, so try to remove it
    self._remove_elements(header, 'Unloading')

    # obtain table rows
    for row in response.xpath('//table[@id="tab_navire_a_quai_bottom"]//tr'):
        raw_item = row_to_dict(
            row,
            header,
            # contextualise raw item with meta data
            port_name=self.name,
            provider_name=self.provider,
            reported_date=dt.datetime.utcnow(),
            # data on this page is exclusively arrived vessels
            event='arrival',
        )
        yield normalize.process_item(raw_item)

def parse(self, response):
    # last table provides ETA schedules; the other tables are not required
    table = response.xpath('//table')[-1]
    header = table.xpath('.//th//text()').extract()

    # memoise reported date
    reported_date = (
        dt.datetime.utcnow().replace(hour=0, minute=0, second=0).isoformat(timespec='seconds')
    )

    # iterate through table row entries
    for row in table.xpath('./tbody/tr'):
        raw_item = row_to_dict(row, header)
        # contextualise raw item with meta info
        raw_item.update(
            port_name=self.name, provider_name=self.provider, reported_date=reported_date
        )
        yield normalize.process_item(raw_item)

def _extract_table(table):
    """Extract tabular data from a standard format as per the website structure.

    Args:
        table (scrapy.Selector):

    Yields:
        Tuple[Dict[str, Optional[str]], Optional[str]]: raw row, and first hyperlink in it

    """
    for idx, row in enumerate(table.xpath('.//tr')):
        # headers will only occur in the first row
        if idx == 0:
            headers = row.xpath('./th/text()').extract()
            continue

        # extract each data row of the table
        yield row_to_dict(row, headers), row.xpath('.//a/@href').extract_first()

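# Illustrative (hypothetical) caller of `_extract_table`: each yielded pair
# is the raw row dict plus the first hyperlink in that row, typically used to
# schedule a detail-page request. `parse_details` is assumed, not real.
def parse(self, response):
    for raw_item, detail_url in _extract_table(response.xpath('//table')[0]):
        if detail_url:
            yield response.follow(
                detail_url, callback=self.parse_details, meta={'raw_item': raw_item}
            )
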
def extract_current_page(self, response: Response) -> Iterator[Dict[str, Any]]:
    """Extract data from current HTML page.

    Each page contains a table of the flow/capacity levels of the specified terminal:

        | Jour       | Stock GNL à 6h | Quantités nominées | Quantités allouées |
        |------------|----------------|--------------------|--------------------|
        | 10/03/2020 | 22             | 115 091 562        | 115 091 562        |
        | 11/03/2020 | 77             | 115 091 562        | 115 091 562        |
        | 12/03/2020 | 59             | 115 091 562        | 115 091 562        |
        | 13/03/2020 | 42             | 115 091 562        | 115 091 562        |
        | 14/03/2020 | 25             | 115 091 562        | 115 091 562        |
        | 15/03/2020 | 80             | 73 987 433         | 73 987 433         |
        | 16/03/2020 | 69             | 24 662 478         | -                  |
        | 17/03/2020 | 65             | 24 662 478         | -                  |
        | 18/03/2020 | 61             | 24 662 478         | -                  |
        | 19/03/2020 | 57             | 24 662 478         | -                  |
        | 20/03/2020 | 53             | 24 662 478         | -                  |
        | ...        | ...            | ...                | ...                |

    """
    headers = response.xpath('//table[@summary=""]/thead//th/text()').extract()
    if len(headers) != 4:
        self.logger.error("Unable to extract data; resource may have changed")
        return

    rows = response.xpath('//table[@summary=""]/tbody//tr')
    for row in rows:
        raw_item = row_to_dict(row, headers)
        # append meta info
        raw_item.update(
            # TODO should be changed to a more descriptive provider name, like 'Elengy'
            provider_name=self.provider,
            reported_date=self.reported_date,
        )
        yield normalize.process_item(raw_item)

def parse(self, response): """Extract table rows and header from vessel berthed page. Args: response (scrapy.HtmlResponse): Yields: dict[str, str]: """ reported_date = (dt.datetime.utcnow().replace( hour=0, minute=0, second=0).isoformat(timespec='seconds')) headers = response.xpath('//table/thead//th/text()').extract() for row in response.xpath('//table/tbody/tr'): raw_item = row_to_dict(row, headers) # contextualise raw item with meta info raw_item.update(port_name=self.name, provider_name=self.provider, reported_date=reported_date) yield normalize.process_item(raw_item)
def parse(self, response): """Parse overview website and obtain URLs for the individual PDF reports. Args: response (scrapy.Response): Yields: Dict[str, str]: """ # there are two port reports provided: # - expected vessels # - current in-port activity reported_date = dt.datetime.utcnow().isoformat() status_ids = ['berthed', 'expected', 'awaiting'] for status in status_ids: for table in response.xpath( "//div[@id='" + status + "']//div[@role='tabpanel']//table" ): port_id = table.attrib['id'] header = response.xpath( "//div[@id='" + status + "']//div[@role='tabpanel']//table[@id='" + port_id + "']//th/text()" ).extract() for row in table.xpath( "//div[@id='" + status + "']//div[@role='tabpanel']//table[@id='" + port_id + "']//tr" ): raw_item = row_to_dict(row, header) raw_item.update( port_name=port_id, provider_name=self.provider, reported_date=reported_date ) yield normalize.process_item(raw_item)
def parse_overview(self, response):
    raw_header, raw_table = response.xpath(
        '//div[@id="ContenidoForma_WebDataGrid21"]/table/tr/td/table/tbody'
    )[:2]
    header = raw_header.xpath('.//th/text()').extract()
    table = raw_table.xpath('./tr/td//tr')

    for row in table:
        raw_item = row_to_dict(
            row,
            header,
            # contextualise raw item with meta info
            port_name=self.name,
            provider_name=self.provider,
            reported_date=dt.datetime.utcnow()
            .replace(hour=0, minute=0, second=0)
            .isoformat(timespec='seconds'),
        )
        yield Request(
            url=self.start_urls[1].format(vid=raw_item['VID']),
            callback=self.parse_vessel_attributes,
            meta={'raw_item': raw_item},
        )

def parse(self, response): """Dispatch response to corresponding callback given URL. Args: response (scrapy.HtmlResponse): Yields: dict[str, str]: """ table, headers = parser.extract_table_and_headers(response) # memoise reported_date so it won't have to be called repeatedly for each row reported_date = parser.extract_reported_date(response) for row in parser.extract_rows_from_table(table): if len(row.xpath('.//td')) == len(headers): raw_item = row_to_dict(row, headers) # contextualise raw item with meta info raw_item.update(port_name=self.name, provider_name=self.provider, reported_date=reported_date) yield normalize.process_item(raw_item)
def parse(self, response): """Entrypoint for parsing Pampa Melchorita port website. Args: response (scrapy.Response): Yields: Dict[str, str]: """ # memoise reported_date so it won't need to be called repeatedly reported_date = (dt.datetime.utcnow().replace( hour=0, minute=0, second=0, microsecond=0).isoformat()) for idx, row in enumerate(response.xpath('//table/tr')): # first two rows contains nested headers; ignore if idx in [0, 1]: continue # use cell indexes as header header = [str(cell) for cell in range(len(row.xpath('./td')))] # don't re-scrape data that was seen before # uniqueness defined by row ID, shipping date, vessel name, and destination row_hash = parser.naive_list_hash( row.xpath('./td//text()').extract(), 1, 2, 3, 4) if not self._check_persistence(row_hash): continue # extract data rows from table raw_item = row_to_dict(row, header) # contextualise raw_item with meta info raw_item.update(port_name='Melchorita', provider_name=self.provider, reported_date=reported_date) yield normalize.process_item(raw_item)
def parse(self, response):
    tables = response.xpath('//center/table')
    table_names = response.xpath('//center/font//text()').extract()

    # memoise reported date
    reported_date = response.xpath('//body/table/tr[2]//b/text()').extract_first()

    for table_idx, table in enumerate(tables):
        # NOTE first row is actually empty and should be skipped;
        # header is in the second row, but we don't need to use it;
        # data rows follow in the subsequent rows
        rows = table.xpath('tr')[2:]
        for row in rows:
            # use cell indexes as the header
            header = [str(idx) for idx in range(len(row.xpath('td')))]
            raw_item = row_to_dict(
                row,
                header,
                port_name=self.name,
                vessel_type=table_names[table_idx],
                provider_name=self.provider,
                reported_date=reported_date,
            )
            yield normalize.process_item(raw_item)

def parse_cargo_pages(self, response):
    portcall = response.meta['raw_item']

    # easy optimization; we don't care about container vessels
    if 'contenedore' in response.xpath('body').extract_first().lower():
        # return early if a container cargo is present; no need to introspect the other cargoes
        self.logger.info(
            f"Vessel {portcall.get('Matrícula de la Nave')} is a container ship, discarding ..."
        )
        return

    header = [
        may_strip(head.extract())
        for head in response.xpath('//body/table[@width="100%"]//tr[1]//b/text()')
        if may_strip(head.extract())
    ]
    products = response.xpath('//body/table[@width="100%"]//tr[position()>1]')
    for product in products:
        product = row_to_dict(product, header)
        portcall['cargoes'].append(product)

    return self.extract_cargo_data(portcall)

def parse(self, response): """Entrypoint of Skikda spider. Args: response (scrapy.Response): Yields: Dict[str, str]: """ # memoise reported date so it won't need to be called repeatedly later reported_date = response.xpath( '//div[@id="full"]/h3/text()').extract_first() header, table = self.extract_table_and_header(response) for row in table: raw_item = row_to_dict(row, header) # let's add a bit of metadata raw_item.update( port_name=self.name, provider_name=self.provider, reported_date=reported_date, url=response.url, ) yield normalize.process_item(raw_item)
def parse(self, response): """Parse and extract raw items from html tables. Args: response (scrapy.HtmlResponse): Yields: Dict[str, str]: """ # memoise reported date so it won't need to be called repeatedly later reported_date = response.xpath( '//div[@id="tabelasNavios"]/h3/text()').extract_first() all_tables = response.xpath('//div[@id="tabelasNavios"]/table') table_types = [ may_strip(x) for x in response.xpath( '//div[@id="tabelasNavios"]/h2[@class="tableName"]//text()'). extract() if may_strip(x) ] for idx, table in enumerate(all_tables): table_type = table_types[idx] # parse each row of the table body for row in table.xpath('./tbody/tr'): # use raw indexes instead of table headers since they are inconsistent raw_item = row_to_dict( row, [str(idx) for idx in range(TABLE_WIDTH)], # contextualise raw item with some meta info port_name=self.name, provider_name=self.provider, reported_date=reported_date, table_type=table_type, ) yield normalize.process_item(raw_item)
def process_report_page(self, response):
    """Process the main report page."""
    processing_started = False
    balance = None
    country, country_type = parser.get_country(response)

    # each row in the table holds information on a different balance type
    # (refinery intake, ending stock) for a specific period of time
    for row in response.xpath('//tr'):
        for col in row.xpath('./td'):
            if col.xpath('./u/text()').extract_first() in ALLOWED_BALANCE:
                processing_started = True
                balance = col.xpath('./u/text()').extract_first()

            if processing_started:
                table = col.xpath('./table')
                if not table:
                    continue

                trows = table.xpath('./tr')
                # we are interested only in the first 2 rows
                mapped_titles = row_to_dict(trows[0], TABLE_HEADER)
                mapped_values = row_to_dict(trows[1], TABLE_INFO)

                # current week items
                raw_item = map_keys(mapped_titles, self.current_week_mapping())
                raw_item.update(map_keys(mapped_values, self.current_week_mapping()))
                raw_item.update({
                    'reported_date': parser.get_reported_date(response),
                    'provider_name': self.provider,
                    'unit': Unit.kiloliter,
                    'country': COUNTRY_MAPPING.get(country, country),
                    'country_type': country_type,
                    'balance': self._infer_balance(balance),
                })

                # dispatch for the current week
                start_of_current_week, end_of_current_week = parser.parse_date_range(
                    raw_item.pop('current_week', None)
                )
                raw_item.update({
                    'start_date': start_of_current_week,
                    'end_date': end_of_current_week,
                    'volume': raw_item.pop('volume_current', None),
                })
                yield raw_item

                # previous week items
                raw_item.update(map_keys(mapped_titles, self.previous_week_mapping()))
                raw_item.update(map_keys(mapped_values, self.previous_week_mapping()))

                # dispatch for the previous week
                start_of_prev_week, end_of_prev_week = parser.parse_date_range(
                    raw_item.pop('prev_week', None)
                )
                raw_item.update({
                    'start_date': start_of_prev_week,
                    'end_date': end_of_prev_week,
                    'volume': raw_item.pop('volume_prev', None),
                })
                yield raw_item

                processing_started = False

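# Assumed sketch of `parser.parse_date_range` as used above: split a raw
# period string such as "06/04/2020 - 12/04/2020" into its start and end
# dates. The actual source format may differ; this only documents the
# expected (start, end) contract.
def parse_date_range(raw_period):
    """Split a 'start - end' period string into a (start, end) tuple."""
    if not raw_period:
        return None, None
    start, _, end = raw_period.partition('-')
    return start.strip(), end.strip()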