def parse(self, response): """Extract vessel movements listed in `start_urls`. Args: response (scrapy.Response): Yields: Dict[str, str]: """ # memoize reported date so it won't need to be repeatedly computed later reported_date = to_isoformat( response.xpath('//p/text()').extract_first(), dayfirst=False, yearfirst=True) table = response.xpath('//table/tr') header, data_rows = table[0].xpath('./th//text()').extract(), table[1:] for row in data_rows: raw_item = row_to_dict( row, header, port_name=self.name, reported_date=reported_date, provider_name=self.provider, ) yield normalize.process_item(raw_item)
def parse(self, response): """Entrypoint for parsing website response. Args: response (scrapy.Response): Yields: event (Dict[str, str]): """ # memoise reported date since source does not provide any reported_date = (dt.datetime.utcnow().replace( hour=0, minute=0, second=0).isoformat(timespec='seconds')) # extract tabular data on vessel movements for idx, row in enumerate(response.xpath('//table//tr')): # header will always exist in the first row if idx == 0: header = row.xpath('.//th/text()').extract() continue raw_item = row_to_dict(row, header) # contextualise raw item with meta info raw_item.update( port_name=self.name, provider_name=self.provider, reported_date=reported_date, # event type depends on the page url, so remember it # (in addition to debugging benefits) url=response.url, ) yield normalize.process_item(raw_item)
def parse(self, response): """Entry point of Milford spider. Args: response: Returns: Dict[str, str]: """ # memoise reported_date so it won't need to be called repeatedly reported_date = (dt.datetime.utcnow().replace( hour=0, minute=0, second=0).isoformat(timespec='seconds')) for idx, row in enumerate( response.xpath('//table[@id="DataTable"]//tr')): # first row always contains header if idx == 0: header = row.xpath('.//th/text()').extract() continue raw_item = row_to_dict(row, header) # contextualise raw item with meta info raw_item.update(port_name=self.name, provider_name=self.provider, reported_date=reported_date) yield normalize.process_item(raw_item)
def parse(self, response):
    table = response.xpath('//table[contains(@class, "table-results")]//tr')

    # extract tabular vessel movements
    for idx, row in enumerate(table):
        # headers will always be in the first row
        if idx == 0:
            header = row.xpath('./td/text()').extract()
            continue

        # don't scrape table sub-headers as row values
        if len(row.xpath('./td')) < len(header):
            continue

        raw_item = row_to_dict(row, header)
        # contextualise raw item with meta info
        raw_item.update(
            port_name=self.name,
            provider_name=self.provider,
            reported_date=dt.datetime.utcnow()
            .replace(hour=0, minute=0, second=0, microsecond=0)
            .isoformat(),
        )
        yield normalize.process_item(raw_item)

def parse(self, response): """Parse port activity pages. Args: response (scrapy.Response): Yields: Dict[str, str]: """ # memoise reported date so that we don't need to call it repeatedly below reported_date = response.xpath( '//strong/parent::*/text()').extract()[1] header = response.xpath('//table//th/text()').extract() # first cell of table row will always be empty as displayed on the website # hence, we append an "irrelevant" column as the first element of the header header.insert(0, 'irrelevant') for row in response.xpath('//table/tbody/tr'): raw_item = row_to_dict(row, header) # contextualise raw item with metadata raw_item.update(port_name='Puerto Moin', provider_name=self.provider, reported_date=reported_date) yield normalize.process_item(raw_item)
def parse(self, response): """Parse forecast/in-port activity of vessels at specified dock. Args: scrapy.Response: Yields: Dict[str, str]: """ reported_date = response.xpath('//h2/text()').extract_first() is_data_available = response.xpath('//th') # there is no recorded activity currently at the specified dock if not is_data_available: return for idx, row in enumerate(response.xpath('//tr')): # headers will always be at the first row of the table if idx == 0: headers = row.xpath('.//th/text()').extract() continue raw_item = row_to_dict(row, headers) # contextualise raw item with dock name and meta info raw_item.update( installation=response.meta['dock'], port_name=self.port_name, provider_name=self.provider, reported_date=reported_date, ) yield normalize.process_item(raw_item)
def parse(self, response): """Parse and extract raw items from html tables. Args: response (scrapy.HtmlResponse): Yields: Dict[str, Any]: """ reported_date = response.xpath( '//div[@class="panel radius text-center"]/text()').extract_first() for table in response.xpath('//table[@class="listado-boletin"]'): # verify if table is the one we want to scrape if may_strip(table.xpath( './caption/text()').extract_first()) != self.table_name: continue header = table.xpath('.//th/text()').extract() data_rows = table.xpath('.//tr') for row in data_rows: raw_item = row_to_dict(row, header) # contextualise raw item with meta info raw_item.update(port_name='Bilbao', provider_name=self.provider, reported_date=reported_date) yield normalize.process_item(raw_item)
def parse_table(self, response):
    """Parse table containing portcalls and request vessel details for each portcall.

    Args:
        response (scrapy.XmlResponse):

    Yields:
        FormRequest:

    """
    # scrapy autoselects XmlResponse, but the resource is actually HTML;
    # force response type to HTML
    html = parser.html_response(response)

    # iterate through each table row (i.e. portcall) in the response
    for row in html.xpath('//tbody/tr'):
        # get `source` of current row, i.e. key that describes current search category
        _source = row.xpath('td[2]/a/@id').extract_first()
        # abstract away underlying form functionality for a cleaner API
        form = parser.PortCallJsfForm(source=_source, viewstate=response.meta['viewstate'])

        yield FormRequest(
            url=self.start_urls[0],
            formdata=form.asdict(),
            # cache `raw_item` to allow more fields to be appended to it in the callback
            meta={'raw_item': row_to_dict(row, header=html.xpath('//th/text()').extract())},
            callback=self.parse_vessel_details,
        )

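# Plausible sketch of `parser.html_response`: scrapy chooses XmlResponse from
# the Content-Type header, so the payload is rebuilt as an HtmlResponse to
# make HTML xpath queries behave. Not necessarily the shared module's code.
from scrapy.http import HtmlResponse


def html_response(response):
    """Coerce a mis-typed XmlResponse into an HtmlResponse."""
    return HtmlResponse(url=response.url, body=response.body, encoding='utf-8')
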
def parse(self, response): """Parse responses from source containing port activity. Args: response (scrapy.Response): Yields: Dict[str, str]: """ for idx, row in enumerate( response.xpath('//table[@id="vesselinfo"]//tr')): # first row will always contain headers if idx == 0: headers = [ may_strip(cell) for cell in row.xpath('th//text()').extract() if may_strip(cell) ] continue raw_item = row_to_dict(row, headers) # append extra shipping agent info _agent_res = yield DubaiTradeSession.get_shipping_agent( rotation_id=may_strip( row.xpath('.//a/text()').extract_first())) raw_item.update( **DubaiTradeSession.parse_shipping_agent(_agent_res)) # contextualise raw item with meta info raw_item.update(port_name='Jebel Ali', provider_name=self.provider, reported_date=self.reported_date) yield normalize.process_item(raw_item)
def parse(self, response: Response) -> Iterator[Optional[Dict[str, Any]]]:
    """Parse response from IseWan Vessel Traffic Service Centre website."""
    reported_date = response.xpath('//div[@class="_inner"]/p/text()').extract_first()

    # hold a sequential list of vessel lineup events for advanced parsing
    events = []

    table = response.xpath('//table[@class="generalTB"]')
    for row_idx, row in enumerate(table.xpath('.//tr')):
        # first row of source table is always the header
        if row_idx == 0:
            headers = row.xpath('.//th/text()').extract()
            continue

        # subsequent rows are exclusively vessel movements
        raw_item = row_to_dict(row, headers)
        # contextualise item with meta info
        raw_item.update(provider_name=self.provider, reported_date=reported_date)
        # standardize character width
        for key, value in raw_item.items():
            raw_item[key] = may_strip(_standardize_char_width(value))

        event = normalize.process_item(raw_item)
        if event:
            events.append(event)

    # combine arrival and departure events into a single 'PortCall' datatype
    for event in events:
        yield from normalize.combine_event(event, events)

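# Assumed sketch of `_standardize_char_width`: the IseWan source mixes
# full-width and half-width characters, and NFKC normalisation folds
# full-width forms (e.g. "ＡＢＣ１２３") into their ASCII equivalents.
import unicodedata


def _standardize_char_width(value):
    """Normalise full-width characters to their half-width equivalents."""
    return unicodedata.normalize('NFKC', value) if value else value
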
def parse(self, response):
    # get nature of portcall data scraped from each url
    for event in EVENT_MAPPING:
        if event in response.url:
            event_type = EVENT_MAPPING[event]
            break

    for idx, row in enumerate(response.xpath('//table[@class="ships"]//tr')):
        # first row is always the headers
        if idx == 0:
            headers = [
                # NOTE some headers don't have a key assigned
                th.xpath('.//text()').extract_first() or 'unknown'
                for th in row.xpath('./th')
            ]
            continue

        raw_item = row_to_dict(row, headers)
        # contextualise raw item with meta info
        raw_item.update(
            event_type=event_type,
            port_name=self.name,
            provider_name=self.provider,
            reported_date=self.reported_date,
        )
        yield normalize.process_item(raw_item)

def parse_manifest_page(self, response):
    """Parse manifest detail page response and extract desired information."""
    # no cargo info present, discard
    if 'no tiene conocimientos' in response.text.lower():
        return

    pc_header = [
        may_strip(head.extract())
        for head in response.xpath('//body/table[@width="80%"]//tr//td//b/text()')
    ]
    portcall = response.xpath('//body/table[@width="80%"]//tr//td/text()').extract()
    portcall = dict(zip(pc_header, portcall))
    portcall.update(
        cargoes=[],
        port_names=set(),
        provider_name=self.provider,
        # NOTE we need to introspect url per `raw_cargo` to append data to `cargoes`
        raw_cargoes=[],
    )

    # NOTE first element of `cargoes` is the header
    cargoes = response.xpath('//body/table[@width="100%"]//tr')
    header = [may_strip(head) for head in cargoes[0].xpath('.//td//text()').extract()]
    cargoes = cargoes[1:]

    for raw_row in cargoes:
        raw_cargo = row_to_dict(
            raw_row,
            header,
            cargo_url=response.urljoin(raw_row.xpath('.//a/@href').extract()[-1]),
        )
        # reported_date is based on manifest publish date; take earliest one for consistency
        if not portcall.get('reported_date'):
            # the source sometimes serves a mis-encoded header, hence the fallback key
            _reported_date = raw_cargo.get('Fecha de Transmisión') or raw_cargo.get(
                'Fecha de Transmisi�n'
            )
            portcall.update(reported_date=_reported_date)

        # easy optimization; don't care about cargoes in-transit and not bound for Peru
        # destinations are given as UN/LOCODEs
        if not raw_cargo['Puerto Destino'].startswith('PE'):
            continue

        portcall['raw_cargoes'].append(raw_cargo)

    return self.extract_cargo_data(portcall)

def parse(self, response):
    table = response.css('table.shipsinport')[SCHEDULE_TABLE_ORDER].css('tr')
    rows = table[STARTING_ROW_ID:]
    headers = [may_strip(head) for head in table[HEADER_ROW_ID].css('::text').extract()]

    title = table[DATE_ROW_ID].extract()
    last_updated = re.search(r'[0-9]{2}\.[0-9]{2}\.20[0-9]{2}', title).group()

    for row in rows:
        raw_item = row_to_dict(row, headers)
        raw_item.update(
            port_name=self.name, provider_name=self.provider, reported_date=last_updated
        )
        yield normalize.process_item(raw_item)

def parse(self, response): """Parse and extract raw items from html tables. Each entry in the port activity table has a link on the vessel name, with the vessel IMO in the link itself. We append vessel imo to each row we extract, since it is not technically part of the table cells. Vessel imo appears as part of the html query string, e.g.: ".../phpcodes/navire_a.php?ship=9297905" ^^^^^^^ imo Args: response (scrapy.HtmlResponse): Yields: Dict[str, str]: """ # memoise reported_date so it won't need to be called repeatedly reported_date = (dt.datetime.utcnow().replace( hour=0, minute=0, second=0, microsecond=0).isoformat()) logger.info(f"JJJJJJJJJJJJJJ {self.provider}") # each index corresponds to the vessel movement type in the table # 0: attendus # 1: a quai # 2: en rade for table_idx in range(3): table = response.xpath( f'//div[contains(@class, "et_pb_tab_{table_idx}")]//table') header = [ may_strip(head) for head in table.xpath('.//th/text()').extract() ] for row in table.xpath('./tbody//tr'): raw_item = row_to_dict(row, header) # conextextualise raw item with meta info raw_item.update( port_name=self.name, provider_name=self.provider, reported_date=reported_date, vessel_imo=row.xpath('./td//@href').extract_first().split( 'ship=')[1], ) yield normalize.process_item(raw_item)
def parse_expected_schedule_page(self, response):
    header = response.xpath('//th/text()').extract()

    # obtain table rows
    for row in response.xpath('//div[@id="tab_mouvement_bottom_div"]//tr'):
        raw_item = row_to_dict(
            row,
            header,
            # contextualise raw item with meta data
            port_name=self.name,
            provider_name=self.provider,
            reported_date=dt.datetime.utcnow(),
            # data on this page is exclusively vessels yet to arrive
            event='eta',
        )
        yield normalize.process_item(raw_item)

def parse_vessel_details(self, response):
    """Parse page containing detailed portcall info.

    Each page contains six tables:
        - "Identification escale" (internal port identification numbers)
        - "Entree" (eta, arrival, piloting dates and timestamps)
        - "Sortie" (etd, departure dates and timestamps)
        - "Sejour a Quai" (berthed date and timestamp)
        - "Provenance / Destination" (previous port visited, next port to be visited)
        - "Caracteristiques du navire" (vessel attributes)

    All tables except "Caracteristiques du navire" are structured identically
    and can be parsed as such. Table "Caracteristiques du navire" will be
    parsed separately.

    Yields:
        Dict[str, str]:

    """
    raw_item = {}

    # parse and extract from the identically structured tables
    for table in response.xpath('//table[@class="ormo table table-striped "]'):
        headers = table.xpath('.//th//text()').extract()
        for idx, row in enumerate(table.xpath('.//tr')):
            # headers will always be in the first row; skip it
            if idx == 0:
                continue
            raw_item.update(row_to_dict(row, headers))

    # parse and extract from "Caracteristiques du navire" table
    for field in response.xpath('//table[@class="ormo ref"]//td'):
        key = field.xpath('./div/div[1]//text()').extract_first()
        value = field.xpath('./div/div[2]//text()').extract_first()
        raw_item[key] = value

    # contextualise raw item with meta info
    raw_item.update(
        port_name=self.port_name, provider_name=self.provider, reported_date=self.reported_date
    )
    yield normalize.process_item(raw_item)

def parse_in_quay_page(self, response):
    header = response.xpath('//th/text()').extract()
    # `Unloading` column can sometimes appear in the headers list, so try to remove it
    self._remove_elements(header, 'Unloading')

    # obtain table rows
    for row in response.xpath('//table[@id="tab_navire_a_quai_bottom"]//tr'):
        raw_item = row_to_dict(
            row,
            header,
            # contextualise raw item with meta data
            port_name=self.name,
            provider_name=self.provider,
            reported_date=dt.datetime.utcnow(),
            # data on this page is exclusively arrived vessels
            event='arrival',
        )
        yield normalize.process_item(raw_item)

def parse(self, response):
    # last table provides ETA schedules; the other tables are not required
    table = response.xpath('//table')[-1]
    header = table.xpath('.//th//text()').extract()

    # memoise reported date
    reported_date = (
        dt.datetime.utcnow().replace(hour=0, minute=0, second=0).isoformat(timespec='seconds')
    )

    # iterate through table row entries
    for row in table.xpath('./tbody/tr'):
        raw_item = row_to_dict(row, header)
        # contextualise raw item with meta info
        raw_item.update(
            port_name=self.name, provider_name=self.provider, reported_date=reported_date
        )
        yield normalize.process_item(raw_item)

def _extract_table(table):
    """Extract tabular data from a standard format as per the website structure.

    Args:
        table (scrapy.Selector):

    Yields:
        Tuple[Dict[str, Optional[str]], Optional[str]]: raw row, and first hyperlink in it

    """
    for idx, row in enumerate(table.xpath('.//tr')):
        # headers will only occur in the first row
        if idx == 0:
            headers = row.xpath('./th/text()').extract()
            continue

        # extract each data row of the table
        yield row_to_dict(row, headers), row.xpath('.//a/@href').extract_first()

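# Illustrative (hypothetical) caller of `_extract_table`: each yielded pair
# is the raw row dict plus the first hyperlink in that row, typically used to
# schedule a detail-page request. `parse_details` is assumed, not real.
def parse(self, response):
    for raw_item, detail_url in _extract_table(response.xpath('//table')[0]):
        if detail_url:
            yield response.follow(
                detail_url, callback=self.parse_details, meta={'raw_item': raw_item}
            )
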
def extract_current_page(self, response: Response) -> Iterator[Dict[str, Any]]:
    """Extract data from current HTML page.

    Each page contains a table of the flow/capacity levels of the specified terminal:

        | Jour       | Stock GNL à 6h | Quantités nominées | Quantités allouées |
        |------------|----------------|--------------------|--------------------|
        | 10/03/2020 | 22             | 115 091 562        | 115 091 562        |
        | 11/03/2020 | 77             | 115 091 562        | 115 091 562        |
        | 12/03/2020 | 59             | 115 091 562        | 115 091 562        |
        | 13/03/2020 | 42             | 115 091 562        | 115 091 562        |
        | 14/03/2020 | 25             | 115 091 562        | 115 091 562        |
        | 15/03/2020 | 80             | 73 987 433         | 73 987 433         |
        | 16/03/2020 | 69             | 24 662 478         | -                  |
        | 17/03/2020 | 65             | 24 662 478         | -                  |
        | 18/03/2020 | 61             | 24 662 478         | -                  |
        | 19/03/2020 | 57             | 24 662 478         | -                  |
        | 20/03/2020 | 53             | 24 662 478         | -                  |
        | ...        | ...            | ...                | ...                |

    """
    headers = response.xpath('//table[@summary=""]/thead//th/text()').extract()
    if len(headers) != 4:
        self.logger.error("Unable to extract data; resource may have changed")
        return

    rows = response.xpath('//table[@summary=""]/tbody//tr')
    for row in rows:
        raw_item = row_to_dict(row, headers)
        # append meta info
        raw_item.update(
            # TODO should be changed to a more descriptive provider name, like 'Elengy'
            provider_name=self.provider,
            reported_date=self.reported_date,
        )
        yield normalize.process_item(raw_item)

def parse(self, response): """Extract table rows and header from vessel berthed page. Args: response (scrapy.HtmlResponse): Yields: dict[str, str]: """ reported_date = (dt.datetime.utcnow().replace( hour=0, minute=0, second=0).isoformat(timespec='seconds')) headers = response.xpath('//table/thead//th/text()').extract() for row in response.xpath('//table/tbody/tr'): raw_item = row_to_dict(row, headers) # contextualise raw item with meta info raw_item.update(port_name=self.name, provider_name=self.provider, reported_date=reported_date) yield normalize.process_item(raw_item)
def parse(self, response): """Parse overview website and obtain URLs for the individual PDF reports. Args: response (scrapy.Response): Yields: Dict[str, str]: """ # there are two port reports provided: # - expected vessels # - current in-port activity reported_date = dt.datetime.utcnow().isoformat() status_ids = ['berthed', 'expected', 'awaiting'] for status in status_ids: for table in response.xpath( "//div[@id='" + status + "']//div[@role='tabpanel']//table" ): port_id = table.attrib['id'] header = response.xpath( "//div[@id='" + status + "']//div[@role='tabpanel']//table[@id='" + port_id + "']//th/text()" ).extract() for row in table.xpath( "//div[@id='" + status + "']//div[@role='tabpanel']//table[@id='" + port_id + "']//tr" ): raw_item = row_to_dict(row, header) raw_item.update( port_name=port_id, provider_name=self.provider, reported_date=reported_date ) yield normalize.process_item(raw_item)
def parse_overview(self, response):
    raw_header, raw_table = response.xpath(
        '//div[@id="ContenidoForma_WebDataGrid21"]/table/tr/td/table/tbody'
    )[:2]
    header = raw_header.xpath('.//th/text()').extract()
    table = raw_table.xpath('./tr/td//tr')

    for row in table:
        raw_item = row_to_dict(
            row,
            header,
            # contextualise raw item with meta info
            port_name=self.name,
            provider_name=self.provider,
            reported_date=dt.datetime.utcnow()
            .replace(hour=0, minute=0, second=0)
            .isoformat(timespec='seconds'),
        )
        yield Request(
            url=self.start_urls[1].format(vid=raw_item['VID']),
            callback=self.parse_vessel_attributes,
            meta={'raw_item': raw_item},
        )

def parse(self, response): """Dispatch response to corresponding callback given URL. Args: response (scrapy.HtmlResponse): Yields: dict[str, str]: """ table, headers = parser.extract_table_and_headers(response) # memoise reported_date so it won't have to be called repeatedly for each row reported_date = parser.extract_reported_date(response) for row in parser.extract_rows_from_table(table): if len(row.xpath('.//td')) == len(headers): raw_item = row_to_dict(row, headers) # contextualise raw item with meta info raw_item.update(port_name=self.name, provider_name=self.provider, reported_date=reported_date) yield normalize.process_item(raw_item)
def parse(self, response): """Entrypoint for parsing Pampa Melchorita port website. Args: response (scrapy.Response): Yields: Dict[str, str]: """ # memoise reported_date so it won't need to be called repeatedly reported_date = (dt.datetime.utcnow().replace( hour=0, minute=0, second=0, microsecond=0).isoformat()) for idx, row in enumerate(response.xpath('//table/tr')): # first two rows contains nested headers; ignore if idx in [0, 1]: continue # use cell indexes as header header = [str(cell) for cell in range(len(row.xpath('./td')))] # don't re-scrape data that was seen before # uniqueness defined by row ID, shipping date, vessel name, and destination row_hash = parser.naive_list_hash( row.xpath('./td//text()').extract(), 1, 2, 3, 4) if not self._check_persistence(row_hash): continue # extract data rows from table raw_item = row_to_dict(row, header) # contextualise raw_item with meta info raw_item.update(port_name='Melchorita', provider_name=self.provider, reported_date=reported_date) yield normalize.process_item(raw_item)
def parse(self, response):
    tables = response.xpath('//center/table')
    table_names = response.xpath('//center/font//text()').extract()

    # memoise reported date
    reported_date = response.xpath('//body/table/tr[2]//b/text()').extract_first()

    for table_idx, table in enumerate(tables):
        # NOTE first row is actually empty and should be skipped;
        # header is in the second row, but we don't need to use it;
        # data rows follow in the subsequent rows
        rows = table.xpath('tr')[2:]
        for row in rows:
            # use cell indexes as the header
            header = [str(idx) for idx in range(len(row.xpath('td')))]
            raw_item = row_to_dict(
                row,
                header,
                port_name=self.name,
                vessel_type=table_names[table_idx],
                provider_name=self.provider,
                reported_date=reported_date,
            )
            yield normalize.process_item(raw_item)

def parse_cargo_pages(self, response):
    portcall = response.meta['raw_item']

    # easy optimization; we don't care about container vessels
    if 'contenedore' in response.xpath('body').extract_first().lower():
        # return early if a container cargo is present; no need to introspect the other cargoes
        self.logger.info(
            f"Vessel {portcall.get('Matrícula de la Nave')} is a container ship, discarding ..."
        )
        return

    header = [
        may_strip(head.extract())
        for head in response.xpath('//body/table[@width="100%"]//tr[1]//b/text()')
        if may_strip(head.extract())
    ]
    products = response.xpath('//body/table[@width="100%"]//tr[position()>1]')
    for product in products:
        product = row_to_dict(product, header)
        portcall['cargoes'].append(product)

    return self.extract_cargo_data(portcall)

def parse(self, response): """Entrypoint of Skikda spider. Args: response (scrapy.Response): Yields: Dict[str, str]: """ # memoise reported date so it won't need to be called repeatedly later reported_date = response.xpath( '//div[@id="full"]/h3/text()').extract_first() header, table = self.extract_table_and_header(response) for row in table: raw_item = row_to_dict(row, header) # let's add a bit of metadata raw_item.update( port_name=self.name, provider_name=self.provider, reported_date=reported_date, url=response.url, ) yield normalize.process_item(raw_item)
def parse(self, response): """Parse and extract raw items from html tables. Args: response (scrapy.HtmlResponse): Yields: Dict[str, str]: """ # memoise reported date so it won't need to be called repeatedly later reported_date = response.xpath( '//div[@id="tabelasNavios"]/h3/text()').extract_first() all_tables = response.xpath('//div[@id="tabelasNavios"]/table') table_types = [ may_strip(x) for x in response.xpath( '//div[@id="tabelasNavios"]/h2[@class="tableName"]//text()'). extract() if may_strip(x) ] for idx, table in enumerate(all_tables): table_type = table_types[idx] # parse each row of the table body for row in table.xpath('./tbody/tr'): # use raw indexes instead of table headers since they are inconsistent raw_item = row_to_dict( row, [str(idx) for idx in range(TABLE_WIDTH)], # contextualise raw item with some meta info port_name=self.name, provider_name=self.provider, reported_date=reported_date, table_type=table_type, ) yield normalize.process_item(raw_item)
def process_report_page(self, response):
    """Process the main report page."""
    processing_started = False
    balance = None
    country, country_type = parser.get_country(response)

    # each row in the table holds information on a different balance type
    # (refinery intake, ending stock) for a specific period of time
    for row in response.xpath('//tr'):
        for col in row.xpath('./td'):
            if col.xpath('./u/text()').extract_first() in ALLOWED_BALANCE:
                processing_started = True
                balance = col.xpath('./u/text()').extract_first()

            if processing_started:
                table = col.xpath('./table')
                if not table:
                    continue

                trows = table.xpath('./tr')
                # we are interested only in the first 2 rows
                mapped_titles = row_to_dict(trows[0], TABLE_HEADER)
                mapped_values = row_to_dict(trows[1], TABLE_INFO)

                # current week items
                raw_item = map_keys(mapped_titles, self.current_week_mapping())
                raw_item.update(map_keys(mapped_values, self.current_week_mapping()))
                raw_item.update({
                    'reported_date': parser.get_reported_date(response),
                    'provider_name': self.provider,
                    'unit': Unit.kiloliter,
                    'country': COUNTRY_MAPPING.get(country, country),
                    'country_type': country_type,
                    'balance': self._infer_balance(balance),
                })

                # dispatch for the current week
                start_of_current_week, end_of_current_week = parser.parse_date_range(
                    raw_item.pop('current_week', None)
                )
                raw_item.update({
                    'start_date': start_of_current_week,
                    'end_date': end_of_current_week,
                    'volume': raw_item.pop('volume_current', None),
                })
                yield raw_item

                # previous week items
                raw_item.update(map_keys(mapped_titles, self.previous_week_mapping()))
                raw_item.update(map_keys(mapped_values, self.previous_week_mapping()))

                # dispatch for the previous week
                start_of_prev_week, end_of_prev_week = parser.parse_date_range(
                    raw_item.pop('prev_week', None)
                )
                raw_item.update({
                    'start_date': start_of_prev_week,
                    'end_date': end_of_prev_week,
                    'volume': raw_item.pop('volume_prev', None),
                })
                yield raw_item

                processing_started = False

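# Assumed sketch of `parser.parse_date_range` as used above: split a raw
# period string such as "06/04/2020 - 12/04/2020" into its start and end
# dates. The actual source format may differ; this only documents the
# expected (start, end) contract.
def parse_date_range(raw_period):
    """Split a 'start - end' period string into a (start, end) tuple."""
    if not raw_period:
        return None, None
    start, _, end = raw_period.partition('-')
    return start.strip(), end.strip()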