def parse(self, response):
    """Parse the landing page and request details for each matching vessel.

    The source lists a huge number of vessel IMOs (~6k) in a <select>
    element. In order to reduce the number of requests we make, the IMOs
    known to the platform are obtained and requests are made only for the
    IMOs present on both sides.

    Args:
        response (scrapy.Response): landing page containing the vessel list

    Yields:
        FormRequest: one detail-page request per matched IMO

    """
    imos_from_website = set(response.xpath('//select/option/@value').getall())
    imos_from_platform = {
        vessel['imo'] for vessel in static_data.vessels() if vessel['imo']
    }

    for imo in imos_from_website & imos_from_platform:
        formdata = {'regLloyds': imo, 'codbuq': imo}
        yield FormRequest.from_response(
            response, formdata=formdata, callback=self._parse_listing
        )
def start_requests(self):
    """Request positions for the platform fleet, in MMSI batches.

    The provider returns a 404 when the request URL grows beyond
    ``URL_MAX_SIZE``, so the MMSIs are split into batches sized to fit.

    Yields:
        Request: one batched position request per slice of the fleet

    """
    # TODO use cli argument instead
    # fall back to the hard-coded key unless one is explicitly configured
    api_key = getattr(settings, 'SF_API_KEY', '0FFD52AC4B28052D83156C053AFE4CD0')

    fleet = static_data.vessels()

    # Maximum number of MMSIs we can send at once, given the fixed parts
    # of the URL and the ',' separator between MMSIs (hence the +1).
    batch_size = (self.URL_MAX_SIZE - len(self.URL) - len(api_key)) // (
        self.MMSI_DIGIT_COUNT + 1)

    for start in range(0, len(fleet), batch_size):
        # A fresh mapping is built per request on purpose: the parsing
        # callback mutates it, so sharing one dict across requests would
        # be unsafe.
        batch_vessels = {
            vessel['mmsi']: vessel
            for vessel in fleet[start:start + batch_size]
            if vessel.get('mmsi')
            and self.PROVIDER_SHORTNAME in vessel.get('providers', [])
        }
        mmsi_batch = ','.join(batch_vessels)
        yield Request(
            self.URL.format(api_key=api_key, mmsi_id=mmsi_batch),
            callback=self.parse,
            meta={'vessels': batch_vessels},
        )
def __init__(self, fleet='', showfleet='', username=None, password=None, removal='', *args, **kwargs):
    """Configure the spider from string CLI flags and mandatory credentials.

    Raises:
        CloseSpider: when username or password is missing.

    """
    super(VesselTrackerSpider, self).__init__(*args, **kwargs)

    # TODO just remove default values
    # validate input params
    if username is None or password is None:
        raise CloseSpider('No credentials were provided !')

    self._user = username
    self._pass = password

    # CLI flags arrive as strings; anything other than 'true' means False
    self._update_fleet = fleet.lower() == 'true'
    self._show_fleet = showfleet.lower() == 'true'
    self._allow_removal = removal.lower() == 'true'

    self.vessel_list = static_data.vessels()

    # showing and updating the fleet are mutually exclusive; show wins
    if self._show_fleet and self._update_fleet:
        self.logger.warning('Contradictory arguments: cannot update and '
                            'show fleet at once. Will assume command is '
                            'show fleet.')
        self._update_fleet = False
def search(cache, imo):
    """Look up one vessel by IMO in the static fleet and pretty-print it.

    Args:
        cache: truthy to allow the local cache, falsy to force a reload
        imo: IMO number used as the lookup key

    """
    ui.info(f"loading static fleet (caching: {cache})")
    fleet = vessels(disable_cache=not cache)

    ui.info(f"searching for vessel with imo: {imo}")
    found = fleet.get(imo)
    # only report a hit; a miss prints nothing beyond the info lines above
    if found:
        ui.success("on Kpler radar")
        pp(found, indent=4)
def test_vessels_collection_from_local_cache(self):
    """Vessels loaded from the fixture cache form a Collection of IMO records."""
    # Point the loader at the test fixtures; restore the original path in
    # a finally block so a failing assertion cannot leak the override into
    # other tests.
    static_data._BASE_LOCAL_CACHE = fixtures_path()
    try:
        fleet = static_data.vessels()
        self.assertIsInstance(fleet, static_data.Collection)
        for vessel in fleet:
            self.assertTrue(vessel.get('imo'))
    finally:
        static_data._BASE_LOCAL_CACHE = ORIGINAL_CACHE_PATH
def start_requests(self):
    """Yield one search request per platform vessel tracked by this provider.

    Yields:
        Request: search-page request carrying the vessel IMO in ``meta``

    """
    imos = [
        str(vessel['imo'])
        for vessel in static_data.vessels()
        # `.get('providers', [])` guards against records without the key,
        # which would otherwise raise KeyError (consistent with the other
        # spiders in this project)
        if vessel.get('imo') and self.provider in vessel.get('providers', [])
    ]
    for imo in imos:
        yield Request(
            url=SEARCH_URL.format(imo),
            headers={'User-Agent': utils.USER_AGENT},
            meta={'imo': imo},
            callback=self.parse_search,
        )
def start_requests(self):
    """Request the expected-vessels page and yesterday's in-port report.

    Both requests carry the platform fleet in ``meta`` so the parsing
    callbacks can match scraped names against known vessels.

    Yields:
        Request | FormRequest:

    """
    # NOTE(review): '_markets' is read unguarded while only 'name' is
    # checked — confirm every platform vessel record carries '_markets'.
    fleet = [
        {'name': vessel['name'].lower(), 'commos': vessel['_markets']}
        for vessel in static_data.vessels()
        if 'name' in vessel
    ]

    yield Request(self.start_urls[0], meta={'vessels': fleet})

    yield FormRequest(
        url=self.start_urls[1],
        formdata={'data': yesterday(), 'submit': 'Enviar'},
        callback=self.parse_in_port,
        meta={'vessels': fleet},
    )
def start_requests(self):
    """Yield a detail request for each 'VF' vessel that is due to run.

    A vessel with no entry in ``self.next_run`` is requested immediately;
    a scheduled vessel is requested only once its scheduled time has
    passed.

    Yields:
        Request: vesselfinder detail-page request

    """
    imos = [
        vessel['imo']
        for vessel in static_data.vessels()
        # guard both keys: 'providers' may be absent (KeyError) and a
        # falsy imo would build a broken URL below
        if vessel.get('imo') and 'VF' in vessel.get('providers', [])
    ]
    for imo in imos:
        scheduled = self.next_run.get(imo)
        if scheduled is None or dt.datetime.utcnow() > parse(scheduled):
            # lazy %-style args: formatting only happens if the record is emitted
            self.logger.info('Vessel %s is processing', imo)
            url = 'https://www.vesselfinder.com/fr/vessels/x-imo-' + imo
            yield Request(url=url, headers=HEADERS, callback=self.parse)
        else:
            self.logger.info('Vessel %s is scheduled to run at %s', imo, scheduled)
def __init__(self, imo: str = None):
    """Initialize MarineTrafficRegistry spider with IMOs to search.

    Populates the `self.imos` instance attribute with the IMO numbers to
    scrape: the comma-separated ones supplied by the caller, or — when
    none are given — the complete list of IMOs on the Kpler platforms.
    """
    if imo:
        candidates = tuple(may_strip(token) for token in imo.split(','))
    else:
        # no IMOs supplied: fall back to the full platform fleet
        fleet = vessels(disable_cache=True)
        candidates = tuple(v['imo'] for v in fleet if v.get('imo'))

    # sanity check; discard and report invalid IMO numbers
    self.imos = []
    for candidate in candidates:
        if safe_imo(candidate):
            self.imos.append(candidate)
        else:
            self.logger.warning('Invalid IMO number: %s', candidate)
def _kpler_vessels():
    """Download (once) and memoize the list of all vessels on our platforms.

    This is required because of the following analyst rules specific to
    this source. The source does not provide the DWT of the vessel, so we
    must try matching vessels by their names in order to obtain their DWT.

    Given a dry-bulk vessel arriving at Dampier:
        - if its DWT is below 50k, it must carry salt
        - if its DWT is between 50k and 75k, it must carry ore
        - if its DWT is above 75k, it must carry iron ore

    Returns:
        List[Dict[str, Any]]:

    """
    global __KPLER_VESSELS
    # Explicit `is None` instead of truthiness: an empty fleet is a valid,
    # cached result and must not trigger a fresh download on every call.
    # (Assumes the module initializes __KPLER_VESSELS = None — confirm.)
    if __KPLER_VESSELS is None:
        __KPLER_VESSELS = vessels(disable_cache=True)
    return __KPLER_VESSELS
def parse(self, response):
    """Parse a Hazira port vessel table into VesselPortCall items, then paginate.

    Picks the column mapping (HEADER) from the page type embedded in the
    URL, converts each table row into an item, strips the cargo-type
    prefix from the vessel name, and yields only LNG calls. Finally it
    re-posts the ASP.NET form once per extra page link to fetch the
    remaining pages.

    Args:
        response (scrapy.Response):

    Yields:
        VesselPortCall | FormRequest:

    """
    sel = Selector(response)
    table = sel.css('table.infrastructureTable.simpleTable.vesselsTable tr')
    if table:  # Depends if it finds the table or not
        # first row carries the column titles
        title = table[0].css('th::text').extract()
        # Choose the title -> item-field mapping from the page type in the
        # URL. NOTE(review): 'Bearth' is presumably the site's own spelling
        # — confirm against live URLs; also, if none of these branches
        # match, HEADER stays unbound and the loop below raises NameError.
        if 'Bearth' in response.url:
            HEADER = AT_BERTH_HEADER
        elif 'Anchorage' in response.url:
            # TODO: Create expected header list (for now, no table is available).
            return
        elif 'Expected' in response.url:
            HEADER = EXPECTED_HEADER
        elif 'Sailed' in response.url:
            HEADER = SAILED_HEADER
        elif 'BerthIn48hrs' in response.url:
            HEADER = BERTH_HIST_HEADER
        for tr in table[1:]:
            row = tr.xpath('td/text()').extract()
            if not row:
                continue
            if not len(title) == len(row):
                self.logger.error('Header and row have different length')
            item = VesselPortCall()
            # map each cell onto the item field named by the header table
            for key, val in zip(title, row):
                if HEADER.get(key):
                    item[HEADER[key]] = val
            if 'vessel_name' not in item:
                self.logger.error('Vessel name is missing')
            # at least one date-like field is expected on every row
            date_fields = ['eta', 'etd', 'arrival_date', 'departure_date']
            if not (any(l in item for l in date_fields)):
                self.logger.error('Date is missing')
            # Hazira port prefixes the cargo type to the vessel name,
            # e.g. 'LNG Al Huwaila', so we split the name in two.
            name = item['vessel_name'].split()
            item['cargo_type'] = name[0]
            name_without_cargo_type = " ".join(name[1:])
            # But vessels whose real name starts with 'LNG' are not
            # duplicated (no 'LNG LNG Oyo'), so only strip the prefix when
            # the remainder is a valid platform vessel name.
            if name_without_cargo_type in [
                v['name'] for v in static_data.vessels()
            ]:
                item['vessel_name'] = name_without_cargo_type
            item['url'] = response.url
            if item['cargo_type'] == 'LNG':
                yield item
    else:
        self.logger.warning('Table is missing or empty')
    # scrape the next pages — guarded by the 'next_page' meta flag so the
    # paginated responses do not re-trigger pagination themselves
    if not response.meta.get('next_page'):
        other_pages = sel.xpath('//tr[@class="borderBorrom"]//a/@href').extract()
        for page in other_pages:
            page_number = re.search('(\d+).*', page).group(1)
            # static ASP.NET postback state captured from the site
            formdata = {
                '__EVENTTARGET': 'gvBreakBulkVessels',
                '__EVENTARGUMENT': 'Page$' + page_number,
                '__VIEWSTATE': '/wEPDwUKMTUxNzA1MDQzMg9kFgICAw9kFgICAQ9kFhYCAw8PFgIeBFRleHQFGkFzIE9uIDA5IERlYyAyMDE0IDAzOjA3OjI2ZGQCBQ88KwARAwAPFgQeC18hRGF0YUJvdW5kZx4LXyFJdGVtQ291bnQCC2QBEBYAFgAWAAwUKwAAFgJmD2QWFgIBD2QWEmYPDxYCHwAFBiZuYnNwO2RkAgEPDxYCHwAFDE1WIE9FTCBUUlVTVGRkAgIPDxYCHwAFBjE0MDQ1MmRkAgMPDxYCHwAFCkNPTlRBSU5FUlNkZAIEDw8WAh8ABQQ1NTAwZGQCBQ8PFgIfAAUDNTUwZGQCBg8PFgIfAAUDSS9FZGQCBw8PFgIfAAUdUkVMQVkgU0hJUFBJTkcgQUdFTkNZIExJTUlURURkZAIIDw8WAh8ABRAwOS4xMi4yMDE0IDIwOjAwZGQCAg9kFhJmDw8WAh8ABQYmbmJzcDtkZAIBDw8WAh8ABRRNViBWSUxMRSBEJiMzOTtPUklPTmRkAgIPDxYCHwAFBjE0MDQ1MWRkAgMPDxYCHwAFCkNPTlRBSU5FUlNkZAIEDw8WAh8ABQUxNjcwMGRkAgUPDxYCHwAFAzgwMGRkAgYPDxYCHwAFA0kvRWRkAgcPDxYCHwAFJE1FUkNIQU5UIFNISVBQSU5HIFNFUlZJQ0VTIFBWVC4gTFRELmRkAggPDxYCHwAFEDEwLjEyLjIwMTQgMTE6MDBkZAIDD2QWEmYPDxYCHwAFBiZuYnNwO2RkAgEPDxYCHwAFDE1WIE9FTCBLT0NISWRkAgIPDxYCHwAFBjE0MDQ0MmRkAgMPDxYCHwAFCkNPTlRBSU5FUlNkZAIEDw8WAh8ABQQ2NTAwZGQCBQ8PFgIfAAUDNjUwZGQCBg8PFgIfAAUDSS9FZGQCBw8PFgIfAAUdUkVMQVkgU0hJUFBJTkcgQUdFTkNZIExJTUlURURkZAIIDw8WAh8ABRAxMS4xMi4yMDE0IDAzOjAwZGQCBA9kFhJmDw8WAh8ABQYmbmJzcDtkZAIBDw8WAh8ABQ1NViBLT1RBIE5BWklNZGQCAg8PFgIfAAUGMTQwNDM0ZGQCAw8PFgIfAAUKQ09OVEFJTkVSU2RkAgQPDxYCHwAFBDM1MDBkZAIFDw8WAh8ABQM0MDBkZAIGDw8WAh8ABQNJL0VkZAIHDw8WAh8ABRJQSUwgTVVNQkFJIFBWVCBMVERkZAIIDw8WAh8ABRAxMS4xMi4yMDE0IDA0OjAwZGQCBQ9kFhJmDw8WAh8ABQYmbmJzcDtkZAIBDw8WAh8ABRBNVCBHT0xERU4gREVOSVNFZGQCAg8PFgIfAAUGMTQwNDU1ZGQCAw8PFgIfAAUQSEVBVlkgQUVST01BVElDU2RkAgQPDxYCHwAFCDE1MDQuMjA0ZGQCBQ8PFgIfAAUBMGRkAgYPDxYCHwAFAUlkZAIHDw8WAh8ABSFTQU1VRFJBIE1BUklORSBTRVJWSUNFUyBQVlQuIExURC5kZAIIDw8WAh8ABRAxMS4xMi4yMDE0IDEzOjAwZGQCBg9kFhJmDw8WAh8ABQYmbmJzcDtkZAIBDw8WAh8ABQ9MTkcgUyBTIFNBTEFMQUhkZAICDw8WAh8ABQYxNDA0NTRkZAIDDw8WAh8ABQYmbmJzcDtkZAIEDw8WAh8ABQYmbmJzcDtkZAIFDw8WAh8ABQEwZGQCBg8PFgIfAAUBSWRkAgcPDxYCHwAFIk9WRVJTRUFTIE1BUklUSU1FIEFHRU5DSUVTIFBWVCBMVERkZAIIDw8WAh8ABRAxMi4xMi4yMDE0IDA0OjMwZGQCBw9kFhJmDw8WAh8ABQYmbmJzcDtkZAIBDw8WAh8ABRFNVCBPUklFTlRBTCBMT1RVU2RkAgIPDxYCHwAFBjE0MDQ1NmRkAgMPDxYCHwAFBlBIRU5PTGRkAgQPDxYCHwAFBDEwNTBkZAIFDw8WAh8ABQEwZGQCBg8PFgIfAAUBSWRkAgcPDxYCHwAFIk9WRVJTRUFTIE1BUklUSU1FIEFHRU5DSUVTIFBWVCBMVERkZAIIDw8WAh8ABRAxMi4xMi4yMDE0IDEzOjAwZGQCCA9kFhJmDw8WAh8ABQYmbmJzcDtkZAIBDw8WAh8ABRRNViBORURMTE9ZRCBNRVJDQVRPUmRkAgIPDxYCHwAFBjE0MDQyNWRkAgMPDxYCHwAFCkNPTlRBSU5FUlNkZAIEDw8WAh8ABQUyMTM3NGRkAgUPDxYCHwAFAzkwOGRkAgYPDxYCHwAFA0kvRWRkAgcPDxYCHwAFMUEgUCBNT0xMRVIgTUFFUlNLIEEvUyBDL08gTUFFUlNLIExJTkUgSU5ESUEgUC5MVERkZAIIDw8WAh8ABRAxNS4xMi4yMDE0IDE4OjAwZGQCCQ9kFhJmDw8WAh8ABQYmbmJzcDtkZAIBDw8WAh8ABRdNViBWSUxMRSBEJiMzOTtBUVVBUklVU2RkAgIPDxYCHwAFBjE0MDQ1M2RkAgMPDxYCHwAFBiZuYnNwO2RkAgQPDxYCHwAFBiZuYnNwO2RkAgUPDxYCHwAFATBkZAIGDw8WAh8ABQNJL0VkZAIHDw8WAh8ABRRNQksgTE9HSVNUSVggUFZUIExURGRkAggPDxYCHwAFEDE3LjEyLjIwMTQgMDA6MDFkZAIKD2QWEmYPDxYCHwAFBiZuYnNwO2RkAgEPDxYCHwAFDE1WIE9FTCBLVVRDSGRkAgIPDxYCHwAFBjE0MDQ1N2RkAgMPDxYCHwAFCkNPTlRBSU5FUlNkZAIEDw8WAh8ABQQ3MDAwZGQCBQ8PFgIfAAUDNzAwZGQCBg8PFgIfAAUDSS9FZGQCBw8PFgIfAAUdUkVMQVkgU0hJUFBJTkcgQUdFTkNZIExJTUlURURkZAIIDw8WAh8ABRAxOC4xMi4yMDE0IDAzOjAwZGQCCw8PFgIeB1Zpc2libGVoZGQCCQ88KwARAgEQFgAWABYADBQrAABkAg0PPCsAEQIBEBYAFgAWAAwUKwAAZAIRDzwrABECARAWABYAFgAMFCsAAGQCFQ88KwARAgEQFgAWABYADBQrAABkAhkPPCsAEQIBEBYAFgAWAAwUKwAAZAIdDzwrABECARAWABYAFgAMFCsAAGQCIQ88KwARAgEQFgAWABYADBQrAABkAiUPPCsAEQIBEBYAFgAWAAwUKwAAZAIpDzwrABECARAWABYAFgAMFCsAAGQYCwUPZ3ZMaXF1aWRWZXNzZWxzD2dkBR5fX0NvbnRyb2xzUmVxdWlyZVBvc3RCYWNrS2V5X18WAQUWSGVhZGVyMSRidG5TZWFyY2hfU2l0ZQUNZ3ZCdWxrVmVzc2Vscw9nZAUPZ3ZCdW5rZXJWZXNzZWxzD2dkBQ1ndk1JQ1RWZXNzZWxzD2dkBQ1ndklPQ0xWZXNzZWxzD2dkBQ1ndkFNQ1RWZXNzZWxzD2dkBQ1ndkhNRUxWZXNzZWxzD2dkBQxndlNUU1Zlc3NlbHMPZ2QFEmd2QnJlYWtCdWxrVmVzc2Vscw88KwAMAQgCAmQFEmd2V2VzdEJhc2luVmVzc2Vscw9nZHKW6gPX3nS82tKjLyVtGdGjoKMX',  # noqa
                '__VIEWSTATEGENERATOR': '0F687C81',
                '__EVENTVALIDATION': '/wEdAAVK/r2TLt/ma37K+1nQJUNVeEB0g4USW5kXY53HuZE3i/w/jkBNwg/yVhGc0oQypPkbzfkr32iJv18Vg2yuArOB3bnMusYDgvMlgczsCbAKn1NBXBEF8UtYi4dKVVw8HslDPc9Z',  # noqa
            }
            yield FormRequest(url=response.url, formdata=formdata, meta={'next_page': True})
def test_vessels_collection_shortcut_init_with_local_cache(self):
    """The vessels() shortcut returns a list-like Collection indexed by IMO."""
    fleet = static_data.vessels()

    # assertIsInstance gives an informative message on failure, unlike
    # assertTrue(isinstance(...)) which only reports "False is not true"
    self.assertIsInstance(fleet, static_data.Collection)
    # Collection subclasses list, so plain list operations keep working
    self.assertIsInstance(fleet, list)
    self.assertEqual(fleet.index, 'imo')