def parse(self, response):
    """Extract registry rows from a SharePoint list view and follow the
    ASP.NET ``__doPostBack`` pagination to the next page.

    Yields one MbuItem per table row, then (if a next-page link exists)
    a FormRequest that simulates the postback click.
    """
    for row in response.css('table.ms-listviewtable tr[class^=building-registry-row]'):
        l = StripJoinItemLoader(item=MbuItem(), selector=row)
        l.add_css("order_no", "td:nth-child(1)::text")
        l.add_css("number_in_order", "td:nth-child(2)::text")
        l.add_css("order_date", "td:nth-child(3)::text")
        l.add_css("decree_no", "td:nth-child(4)::text")
        l.add_css("customer", "td:nth-child(5)::text, td:nth-child(5) div::text")
        l.add_css("obj", "td:nth-child(6)::text, td:nth-child(6) div::text")
        l.add_css("address", "td:nth-child(7)::text, td:nth-child(7) div::text")
        l.add_css("changes", "td:nth-child(8) div::text")
        l.add_css("cancellation", "td:nth-child(9) div::text")
        url = row.css("td:nth-child(10) a::attr(href)").extract_first()
        if url:
            l.add_value("scan_url", response.urljoin(url))
        yield l.load_item()
    # get next page number href next to current inactive with span tag
    nextPageJs = response.xpath(
        '//table[@class="ms-listviewtable"]//tr[@class="building-registry-pager"]//table/tr/td[.//span]/following-sibling::td[1]/a/@href'
    )
    if len(nextPageJs):
        yield FormRequest.from_response(
            response,
            formname="aspnetForm",
            formxpath="//form[@id='aspnetForm']",
            dont_click=True,
            formdata={
                # FIX: raw strings — '\(' and '\)' are invalid escape
                # sequences in plain string literals (DeprecationWarning
                # since Python 3.6, a SyntaxError in future versions).
                # The patterns themselves are unchanged.
                '__EVENTARGUMENT': nextPageJs.re(r"','(.*)'\)"),
                '__EVENTTARGET': nextPageJs.re(r"javascript:__doPostBack\('(.*)',")
            },
            dont_filter=True,
            callback=self.parse)
def parse(self, response):
    """Parse registry rows; rows whose first cell contains a bold
    paragraph (<p><strong>) are table headers and are skipped.

    The first column carries a "date № number" link that doubles as the
    scan download; the same link's href may also appear in column 6 as
    cancellation documents.
    """
    for tr in response.css("table tbody tr"):
        # skip styled as header
        if tr.css("td:nth-child(1) p strong").get():
            self.logger.debug("skipped row : {}".format(tr.get()))
            continue
        self.logger.debug("parsed row : {}".format(tr.get()))
        loader = StripJoinItemLoader(item=MbuItem(), selector=tr)
        loader.add_css("order_no", "td:nth-child(1) a.add-google-doc::text",
                       re=r"№\s?(.+)$")
        loader.add_css("order_date", "td:nth-child(1) a.add-google-doc::text",
                       re=r"([\d\.]*) №")
        loader.add_css("customer",
                       "td:nth-child(2)::text, td:nth-child(2) p::text")
        loader.add_css("obj", "td:nth-child(3)::text, td:nth-child(3) p::text")
        loader.add_css("address", "td:nth-child(4)::text")
        loader.add_css("changes", "td:nth-child(5)::text")
        loader.add_css("cancellation", "td:nth-child(6)::text")
        cancellation_links = tr.css(
            "td:nth-child(6) a.add-google-doc::attr(href)").getall()
        if cancellation_links:
            loader.add_value(
                "cancellation_url",
                [response.urljoin(href) for href in cancellation_links])
        scan_href = tr.css("td:nth-child(1) a.add-google-doc::attr(href)").get()
        if scan_href:
            loader.add_value("scan_url", response.urljoin(scan_href))
        yield loader.load_item()
def parse(self, response):
    """Parse the first data table of the post body.

    The first two rows are headers and are sliced off.  Because the page
    HTML is broken, cell lookups for number_in_order are anchored to the
    current row via XPath rather than CSS descendants.
    """
    # only first table with data
    data_rows = response.css("div.post-body>table:first-of-type>tbody>tr")
    # first two are headers, skip
    for row in data_rows[2:]:
        loader = StripJoinItemLoader(item=MbuItem(), selector=row)
        # because of errors in html, get td from current root only
        loader.add_xpath(
            "number_in_order",
            "./td[position()=1]/span/text()|./td[position()=1]/p/span/text()",
            re=r"(\d+)\s?")
        loader.add_css(
            "order_no",
            "td:nth-child(2) p span::text, td:nth-child(2) span::text",
            re=r"^\s*№ ?(.*)\s?від")
        loader.add_css(
            "order_date",
            "td:nth-child(2) p span::text, td:nth-child(2) span::text",
            re=r"(\d{1,2}[\. /]?\d{1,2}[\. /]?\d{2,4})[\sр\.]*$")
        # remaining text columns share the same selector shape
        for field, column in (("customer", 3), ("obj", 4), ("address", 5),
                              ("changes", 6), ("cancellation", 7)):
            loader.add_css(
                field,
                "td:nth-child({0}) p span::text, td:nth-child({0}) span::text"
                .format(column))
        scan_href = row.css(
            "td:nth-child(8) p span a::attr(href), td:nth-child(8) span a::attr(href), td:nth-child(8) a::attr(href)"
        ).extract_first()
        if scan_href:
            loader.add_value("scan_url", response.urljoin(scan_href))
        yield loader.load_item()
def parse(self, response):
    """Parse the registry table and follow WordPress-style pagination
    (`a.next.page-numbers`) until no next link is present."""
    for row in response.css("table.table-registry tbody tr"):
        self.logger.debug("parse row : {}".format(row.get()))
        loader = StripJoinItemLoader(item=MbuItem(), selector=row)
        loader.add_css("order_no", "td:nth-child(1) strong::text")
        loader.add_css("order_date", "td:nth-child(1)::text")
        loader.add_css("prescript_no", "td:nth-child(2) strong::text")
        loader.add_css("prescript_date", "td:nth-child(2)::text")
        loader.add_css("customer", "td:nth-child(5)::text")
        loader.add_css("obj", "td:nth-child(3)::text")
        loader.add_css("address", "td:nth-child(4)::text")
        loader.add_css("changes", "td:nth-child(6)::text")
        loader.add_css("cancellation", "td:nth-child(7)::text")
        scan_href = row.css("td:nth-child(8) a::attr(href)").get()
        if scan_href:
            loader.add_value("scan_url", response.urljoin(scan_href))
        yield loader.load_item()
    next_page = response.css('a.next.page-numbers::attr(href)').get()
    if next_page:
        self.logger.debug("follow next page : {}".format(next_page))
        yield response.follow(next_page)
def parse(self, response):
    """Parse a DataTables-style JSON payload.

    Rows arrive as positional lists under "aaData"; column 1 holds a
    combined "№ <number> від <date>" string that is split with regexes.
    """
    jsonresponse = json.loads(response.body_as_unicode())
    for row in jsonresponse["aaData"]:
        self.logger.debug("parsed row : {}".format(row))
        l = StripJoinItemLoader(item=MbuItem())
        l.add_value("number_in_order", row[0])
        # FIX: raw strings for the regex literals — "\." in a plain
        # string is an invalid escape sequence (DeprecationWarning since
        # Python 3.6).  Patterns are unchanged.
        l.add_value(
            "order_no",
            re.search(r"№ ?(.*) ?(ві|от)", row[1]).group(1)
            if row[1] else None)
        l.add_value(
            "order_date",
            re.search(r"([0-9]{1,2}\.[0-9]{1,2}\. ?[0-9]{1,4})",
                      row[1]).group(1) if row[1] else None)
        l.add_value("customer", row[2])
        l.add_value("obj", row[3])
        l.add_value("address", row[4])
        l.add_value("changes", row[5])
        l.add_value("cancellation", row[6])
        # column 7 is an HTML fragment containing the scan link
        l.add_value(
            "scan_url",
            Selector(text=row[7]).css("a::attr(href)").extract_first()
            if row[7] else None)
        yield l.load_item()
class ZhytomyrSpider(scrapy.spiders.CSVFeedSpider):
    """Spider for the Zhytomyr city council building registry.

    The registry is published as downloadable .xls files; rows whose
    order numbers match (modulo separator characters) are merged into a
    single item loader and flushed after each workbook is parsed.
    """
    location_name = "Житомир"
    name = "zhytomyr"
    allowed_domains = ["zt-rada.gov.ua"]
    start_urls = ["http://zt-rada.gov.ua/?3398[0]=6281"]
    custom_settings = {
        # specifies exported fields and order
        'FEED_EXPORT_FIELDS': [
            "location_name", "order_no", "order_date", "customer", "obj",
            "address", "changes", "cancellation", "scan_url"
        ],
    }
    # order_no -> loader; a fresh loader is created on first access
    item_loaders = defaultdict(lambda: StripJoinItemLoader(item=MbuItem()))

    def parse_xls_and_flush(self, response):
        """Read the downloaded .xls sheet into per-order loaders, then
        emit every accumulated item and reset the loader cache."""
        sheet = xlrd.open_workbook(
            file_contents=response.body).sheet_by_index(0)
        for index in range(1, sheet.nrows):
            row = sheet.row(index)
            # if row is not empty
            if row[1].value:
                order_no = row[0].value.replace('№', '').strip()
                l = self.get_item(order_no)
                # populate only once per order number so duplicate rows
                # (same number, different separators) don't join twice
                # NOTE(review): nesting reconstructed from mangled source
                # — confirm all fields belong under this guard
                if not l.get_output_value('order_no'):
                    l.add_value("order_no", order_no)
                    l.add_value("order_date", row[1].value)
                    l.add_value("customer", row[6].value)
                    l.add_value("obj", row[2].value)
                    l.add_value("address", row[3].value)
                    l.add_value("changes", row[7].value)
                    l.add_value("cancellation", row[8].value)
            else:
                self.logger.debug("skipped index : {}, row : {}".format(
                    index, row))
        for item in self.item_loaders.values():
            yield item.load_item()
        # clear in case of several xls files found
        self.item_loaders.clear()

    def parse(self, response):
        """Find .xls download links on the listing page and paginate."""
        for row in response.css(".docrowcontainer"):
            document_url = row.css(
                "div:nth-child(2) a.docdownload::attr(href)").extract_first()
            # FIX: extract_first() returns None when a row has no download
            # link — guard before calling .endswith()
            if document_url and document_url.endswith('.xls'):
                self.logger.info(
                    "xls document found : {}".format(document_url))
                yield response.follow(
                    document_url,
                    callback=self.parse_xls_and_flush,
                    priority=10)  # big priority to run last
            # self.logger.debug("parse site row : {}".format(row.get()))
            # order_no = "".join(row.css("div:nth-child(1)::text").re(r"№ ?(.*)")).strip()
            # l = self.get_item(order_no)
            # l.selector = row
            # l.add_value("order_no", order_no)
            # l.add_xpath("order_date", "./@data-year")
            # l.add_value("scan_url", response.urljoin(document_url))
        next_page_link = response.xpath(
            '//*[@id="tp6"]//ul/li[@class="active"]/following-sibling::li[1]/a/@href'
        ).get()
        if next_page_link:
            self.logger.debug("next page link : {}".format(next_page_link))
            yield response.follow(next_page_link, callback=self.parse)

    def get_item(self, order_no):
        """Return the loader for *order_no*, reusing an existing loader
        whose key differs only in separator characters (dots, spaces,
        slashes, dashes, pipes)."""
        # tries to find similar order number with different separating signs
        # FIX: raw string — '\.', '\-', '\|' are invalid escape sequences
        # in a plain string literal (DeprecationWarning since Python 3.6)
        splitter = r'[\.\s/\-\|]+'
        order_id_list = re.split(splitter, order_no.strip())
        filtered_order = list(
            filter(
                lambda ord: order_id_list == re.split(splitter, ord.strip()),
                self.item_loaders.keys()))
        if len(filtered_order):
            self.logger.debug(
                'similar to order_no : {} found in loaders : {}'.format(
                    order_no, filtered_order))
        return self.item_loaders[filtered_order[0] if len(filtered_order
                                                          ) else order_no]
def parse_filtered(self, response):
    """Parse the HTML table carried inside the JSON response ('table' key).

    Column 5 carries up to three action links distinguished by their
    glyphicon class: edit = changes, ban-circle = cancellation,
    info-sign = scan.
    """
    jsonresponse = json.loads(response.body_as_unicode())
    table = Selector(text=jsonresponse['table'])
    for row in table.css("tr"):
        self.logger.debug("parse row : {}".format(row.get()))
        l = StripJoinItemLoader(item=MbuItem(), selector=row)
        l.add_css("order_no", "td:nth-child(1) strong::text")
        # FIX: the capture group must wrap the whole repetition —
        # r"([\d\.])+\s*№" captured only the LAST repeated character,
        # truncating the date to a single digit/dot
        l.add_css("order_date", "td:nth-child(1)::text", re=r"([\d\.]+)\s*№")
        l.add_css("customer", "td:nth-child(2)::text")
        l.add_css("obj", "td:nth-child(3) small::text")
        l.add_css("address", "td:nth-child(4)::text")
        changes_url = row.xpath(
            "./td[position()=5]//a[.//span[contains(@class, 'glyphicon glyphicon-edit')]]/@href"
        ).extract_first()
        if changes_url:
            l.add_value("changes", response.urljoin(changes_url))
        cancellation_url = row.xpath(
            "./td[position()=5]//a[.//span[contains(@class, 'glyphicon glyphicon-ban-circle')]]/@href"
        ).extract_first()
        if cancellation_url:
            l.add_value("cancellation", response.urljoin(cancellation_url))
        scan_url = row.xpath(
            "./td[position()=5]//a[.//span[contains(@class, 'glyphicon glyphicon-info-sign')]]/@href"
        ).extract_first()
        if scan_url:
            l.add_value("scan_url", response.urljoin(scan_url))
        yield l.load_item()
def parse(self, response):
    """Parse building-permit features from an ArcGIS-style JSON response.

    Each feature's flat 'attributes' dict uses fully-qualified
    'Kadastr2016.DBO.*' keys; the prefix is stripped when dumping the raw
    attributes into 'additional_fields' for later use.
    """
    jsonresponse = json.loads(response.body_as_unicode())
    for feature in jsonresponse['features']:
        self.logger.debug("parse row : {}".format(feature))
        attributes = feature['attributes']
        l = StripJoinItemLoader(item=MbuItem())
        l.add_value("number_in_order",
                    str(attributes['Kadastr2016.DBO.MBU.OBJECTID']))
        l.add_value(
            "order_no", attributes['Kadastr2016.DBO.MBU.NomKancel']
            if attributes['Kadastr2016.DBO.MBU.NomKancel'] else '-')
        # timestamp is in milliseconds; NOTE(review): fromtimestamp()
        # uses the local timezone — confirm the source publishes local
        # dates before relying on day-precision here
        l.add_value(
            "order_date",
            str(
                datetime.fromtimestamp(
                    attributes['Kadastr2016.DBO.MBU.Data'] / 1000).date())
            if attributes['Kadastr2016.DBO.MBU.Data'] else None)
        l.add_value("customer", attributes['Kadastr2016.DBO.MBU.Zamovnuk_MBO'])
        l.add_value("obj", attributes['Kadastr2016.DBO.MBU.Nazobekty'])
        l.add_value("address", self.get_address(attributes))
        l.add_value("changes", attributes['Kadastr2016.DBO.MBU.Zmini'])
        l.add_value("cancellation",
                    attributes['Kadastr2016.DBO.MBU.Skasuvannia'])
        # all original fields to be used later
        # FIX: dict comprehension instead of dict([[k, v] for ...])
        prefix_len = len('Kadastr2016.DBO.')
        additional_fields = {
            key[prefix_len:]: value
            for key, value in attributes.items()
        }
        l.add_value('additional_fields',
                    json.dumps(additional_fields, ensure_ascii=False))
        # up to two scan links, joined into a single comma-separated value
        scan_urls = [
            response.urljoin(attributes[link_key])
            for link_key in ('Kadastr2016.DBO.MBU.Link_1',
                             'Kadastr2016.DBO.MBU.Link_2')
            if attributes[link_key]
        ]
        if scan_urls:
            l.add_value("scan_url", ",".join(scan_urls))
        yield l.load_item()
def parse(self, response):
    """Parse 8-column table rows, excluding header rows (identified by a
    bold span anywhere in the row).

    Column 2's first paragraph carries "№ <number> <date>"; its second
    paragraph holds free-form remarks.
    """
    data_rows = response.xpath(
        '//table/tbody/tr[count(td)=8 and not(./td//span/strong)]')
    for row in data_rows:
        self.logger.debug("parse row : {}".format(row.get()))
        loader = StripJoinItemLoader(item=MbuItem(), selector=row)
        loader.add_css("number_in_order",
                       "td:nth-child(1) p::text, td:nth-child(1)::text")
        order_cell = ("td:nth-child(2) p:nth-child(1)::text, "
                      "td:nth-child(2) p:nth-child(1) span::text")
        loader.add_css("order_no", order_cell, re=r"№\s?(\d+)")
        loader.add_css("order_date", order_cell,
                       re=r"(\d{1,2}[\. /]?\d{1,2}[\. /]?\d{2,4})")
        loader.add_css("remarks", "td:nth-child(2) p:nth-child(2)::text")
        # plain fields, in the same order as before
        for field, css_sel in (
                ("customer", "td:nth-child(3) p::text"),
                ("obj", "td:nth-child(4) p::text"),
                ("address", "td:nth-child(5) p::text"),
                ("changes", "td:nth-child(6) p::text, td:nth-child(6) a::text"),
                ("cancellation",
                 "td:nth-child(7) p::text, td:nth-child(7) a::text"),
                ("scan_text",
                 "td:nth-child(8) a::text, td:nth-child(8) p::text"),
                ("changes_url", "td:nth-child(6) a::attr(href)"),
                ("cancellation_url", "td:nth-child(7) a::attr(href)"),
                ("scan_url", "td:nth-child(8) a::attr(href)")):
            loader.add_css(field, css_sel)
        yield loader.load_item()
def parse(self, response):
    """Parse rows of table #droptablesTbl4; the second column holds the
    order date (leading digits/dots) followed by the number (trailing
    digits)."""
    for tr in response.css("table#droptablesTbl4 tbody tr"):
        loader = StripJoinItemLoader(item=MbuItem(), selector=tr)
        loader.add_css("order_no", "td:nth-child(2)::text", re=r"([\d]+)$")
        loader.add_css("order_date", "td:nth-child(2)::text", re=r"^[\d.]*")
        for field, col in (("customer", 3), ("obj", 4), ("address", 5),
                           ("changes", 6), ("cancellation", 7)):
            loader.add_css(field, "td:nth-child({})::text".format(col))
        scan_href = tr.css("td:nth-child(8) a::attr(href)").get()
        if scan_href:
            loader.add_value("scan_url", response.urljoin(scan_href))
        yield loader.load_item()
def parse(self, response):
    """Parse a table where one physical row may contain several orders
    (the main order plus its changes), each in its own <p>/<span>.

    For every sub-order the field value is taken from the matching
    nth-child, falling back to the first child (see get_first_existed).
    After parsing, verifies the order numbers form a gap-free sequence.
    """
    loaded_order_numbers = []
    for index, row in enumerate(response.css("table tbody tr")):
        # first and second are headers, skip
        if index == 0 or index == 1:
            continue
        orders_in_row = len(row.css("td:nth-child(3) p").getall())
        if orders_in_row == 0:
            orders_in_row = len(row.css("td:nth-child(3) span"))
        # each row is sub divided for main order and it's changes
        for order_in_row in range(orders_in_row):
            l = StripJoinItemLoader(item=MbuItem(), selector=row)
            l.add_value(
                "number_in_order",
                self.get_first_existed(row, "td:nth-child(1) p span::text",
                                       "td:nth-child(1) span::text"))
            order_no = row.css(
                "td:nth-child(3) p:nth-child({}) span::text, td:nth-child(3) span:nth-child({})::text"
                .format(order_in_row + 1, order_in_row + 1)).get()
            if not order_no:
                continue
            l.add_value("order_no", order_no)
            l.add_value(
                "order_date",
                self.get_first_existed(
                    row, "td:nth-child(2) p:nth-child(" +
                    str(order_in_row + 1) + ") span::text",
                    "td:nth-child(2) p:nth-child(1) span::text",
                    "td:nth-child(2) span:nth-child(" +
                    str(order_in_row + 1) + ")::text",
                    "td:nth-child(2) span:nth-child(1) span::text"))
            l.add_value(
                "customer",
                self.get_first_existed(
                    row, "td:nth-child(4) p:nth-child(" +
                    str(order_in_row + 1) + ") span::text",
                    "td:nth-child(4) p:nth-child(1) span::text",
                    "td:nth-child(4) span:nth-child(" +
                    str(order_in_row + 1) + ")::text",
                    "td:nth-child(4) span:nth-child(1)::text"))
            l.add_value(
                "obj",
                self.get_first_existed(
                    row, "td:nth-child(5) p:nth-child(" +
                    str(order_in_row + 1) + ") span::text",
                    "td:nth-child(5) p:nth-child(1) span::text",
                    "td:nth-child(5) span:nth-child(" +
                    str(order_in_row + 1) + ")::text",
                    "td:nth-child(5) span:nth-child(1)::text"))
            l.add_value(
                "address",
                self.get_first_existed(
                    row, "td:nth-child(6) p:nth-child(" +
                    str(order_in_row + 1) + ") span::text",
                    "td:nth-child(6) p:nth-child(1) span::text",
                    "td:nth-child(6) span:nth-child(" +
                    str(order_in_row + 1) + ")::text",
                    "td:nth-child(6) span:nth-child(1)::text"))
            l.add_value(
                "changes",
                self.get_first_existed(
                    row, "td:nth-child(7) p:nth-child(" +
                    str(order_in_row + 1) + ") span::text",
                    "td:nth-child(7) p:nth-child(1) span::text",
                    "td:nth-child(7) span:nth-child(" +
                    str(order_in_row + 1) + ")::text",
                    "td:nth-child(7) span:nth-child(1)::text"))
            l.add_value(
                "cancellation",
                self.get_first_existed(
                    row, "td:nth-child(8) p:nth-child(" +
                    str(order_in_row + 1) + ") span::text",
                    "td:nth-child(8) p:nth-child(1) span::text",
                    "td:nth-child(8) span:nth-child(" +
                    str(order_in_row + 1) + ")::text",
                    "td:nth-child(8) span:nth-child(1)::text"))
            url = self.get_first_existed(
                row, "td:nth-child(9) p:nth-child(" + str(order_in_row + 1) +
                ") span a::attr(href)",
                "td:nth-child(9) p:nth-child(1) span a::attr(href)",
                "td:nth-child(9) a:nth-child(" + str(order_in_row + 1) +
                ")::attr(href)", "td:nth-child(9) a:nth-child(1)::attr(href)")
            # TODO add decree found in 357th row
            # FIX: urljoin(None) silently returns the page URL itself,
            # storing a bogus scan_url — only add when a link exists
            if url:
                l.add_value("scan_url", response.urljoin(url))
            address_assign_url = self.get_first_existed(
                row, "td:nth-child(10) a::attr(href)")
            if address_assign_url:
                l.add_value("address_assign_url",
                            response.urljoin(address_assign_url))
            l.add_css("address_assign_no",
                      "td:nth-child(10) a span::text",
                      re=r"№(.*) ?від")
            l.add_css("address_assign_date",
                      "td:nth-child(10) a span::text",
                      re=r"від ?(.*)$")
            loaded_order_numbers.append(
                int(l.get_collected_values("number_in_order")[0]))
            yield l.load_item()
    # check if all consecutive orders where loaded
    if loaded_order_numbers:
        missed_order_numbers = set(range(
            1, max(loaded_order_numbers))).difference(loaded_order_numbers)
        if missed_order_numbers:
            self.logger.warning("Missed order numbers: %s",
                                missed_order_numbers)
        else:
            self.logger.info("All order numbers processed")
    else:
        # FIX: max() raises ValueError on an empty sequence — nothing parsed
        self.logger.warning("No order numbers loaded")
def parse_row(self, response, row):
    """Build one MbuItem from a row dict of the feed.

    NB: the trailing space in 'restrictionNumber ' matches the source
    data key exactly — do not "fix" it.
    """
    self.logger.debug("parse row : {}".format(row))
    loader = StripJoinItemLoader(item=MbuItem())
    field_to_key = (
        ("order_no", "restrictionNumber "),  # space in the end is needed
        ("order_date", "restrictionDate"),
        ("customer", "objectOwner"),
        ("obj", "objectDescription"),
        ("address", "objectAddress"),
        ("changes", "objectChanges"),
        ("cancellation", "objectCancel"),
        ("status", "objectStatus"),
    )
    for field, key in field_to_key:
        loader.add_value(field, row[key])
    yield loader.load_item()
def parse(self, response):
    """Parse registry rows; blank rows among the first five are treated
    as headers/padding and skipped."""
    for idx, tr in enumerate(response.css("table>tbody>tr")):
        # skip headers and rows with empty lines
        # NOTE(review): a row is skipped only while idx < 5 AND its text
        # is blank — confirm 'and' (rather than 'or') is intended
        if idx < 5 and not "".join(
                tr.css("td::text, td span::text").getall()).strip():
            self.logger.debug("skipped index : {}, row : {}".format(
                idx, tr.get()))
            continue
        self.logger.debug("parse index : {}, row : {}".format(idx, tr.get()))
        loader = StripJoinItemLoader(item=MbuItem(), selector=tr)
        loader.add_css("decree", "td:nth-child(1)::text")
        order_cell = "td:nth-child(2)::text, td:nth-child(2) span::text"
        loader.add_css("order_no", order_cell, re=r"^\s?(.*)від")
        loader.add_css("order_date", order_cell, re=r"від\s?([\d\.]*)\s*$")
        for field, col in (("customer", 3), ("obj", 4), ("address", 5),
                           ("changes", 6)):
            loader.add_css(
                field,
                "td:nth-child({0})::text, td:nth-child({0}) span::text"
                .format(col))
        loader.add_css(
            "cancellation",
            "td:nth-child(7)::text, td:nth-child(7) a::text, td:nth-child(7) span::text"
        )
        cancellation_href = tr.css("td:nth-child(7) a::attr(href)").get()
        if cancellation_href:
            loader.add_value("cancellation_url",
                             response.urljoin(cancellation_href))
        scan_href = tr.css("td:nth-child(8) a::attr(href)").get()
        if scan_href:
            loader.add_value("scan_url", response.urljoin(scan_href))
        yield loader.load_item()
def parse(self, response):
    """Parse rows of the #tabledataMto registry table.

    Street (col 2) and house number (col 3) are stored separately and
    also both added to "address" so the loader joins them into the full
    address string.
    """
    for tr in response.css("table#tabledataMto tbody tr"):
        self.logger.debug("parse row : {}".format(tr.get()))
        loader = StripJoinItemLoader(item=MbuItem(), selector=tr)
        loader.add_css("order_no", "td:nth-child(1)::text")
        loader.add_css("order_date", "td:nth-child(4)::text")
        loader.add_css("customer", "td:nth-child(5)::text")
        loader.add_css("obj", "td:nth-child(6)::text")
        loader.add_css("obj_purpose", "td:nth-child(7)::text")
        loader.add_css("address_street", "td:nth-child(2)::text")
        loader.add_css("address_street_number", "td:nth-child(3)::text")
        loader.add_css("address", "td:nth-child(2)::text")
        loader.add_css("address", "td:nth-child(3)::text")
        loader.add_css("cancellation", "td:nth-child(8)::text")
        scan_href = tr.css("td:nth-child(9) a::attr(href)").get()
        if scan_href:
            loader.add_value("scan_url", response.urljoin(scan_href))
        yield loader.load_item()
def parse(self, response):
    """Parse registry rows; the header row is recognised by its first
    cell text '№ з/п'.  Items lacking number_in_order are dropped."""
    for tr in response.css("table tbody tr"):
        # first is header, skip
        if tr.css("td:nth-child(1)::text").get() == "№ з/п":
            self.logger.debug("skiped row : {}".format(tr.get()))
            continue
        self.logger.debug("parse row : {}".format(tr.get()))
        loader = StripJoinItemLoader(item=MbuItem(), selector=tr)
        loader.add_css("number_in_order", "td:nth-child(1)::text")
        loader.add_css("order_no",
                       "td:nth-child(2) p::text, td:nth-child(2)::text",
                       re=r"№ ?(.*)\s?$")
        loader.add_css(
            "order_date",
            "td:nth-child(2) p:nth-child(1)::text, td:nth-child(2)::text",
            re=r"(\d{1,2}[\. /]?\d{1,2}[\. /]?\d{2,4})[\sр\.]*")
        loader.add_css("customer", "td:nth-child(3)::text")
        loader.add_css("obj", "td:nth-child(4)::text")
        loader.add_css("address",
                       "td:nth-child(5)::text, td:nth-child(5) a::text")
        loader.add_css("changes", "td:nth-child(6)::text")
        loader.add_css("cancellation", "td:nth-child(7)::text")
        scan_href = tr.css("td:nth-child(8) a::attr(href)").get()
        if scan_href:
            loader.add_value("scan_url", response.urljoin(scan_href))
        loader.add_css("scan_no", "td:nth-child(8) a::text", re=r"№(.*) ?від")
        loader.add_css("scan_date", "td:nth-child(8) a::text", re=r"від ?(.*)")
        item = loader.load_item()
        # rows without an order number are noise — drop them
        if item.get("number_in_order"):
            yield item
def parse(self, response):
    """Parse a DataTables JSON payload whose "aaData" rows are
    positional lists; column 8 holds the scan link."""
    payload = json.loads(response.body_as_unicode())
    for row in payload["aaData"]:
        self.logger.debug("parse row : {}".format(row))
        loader = StripJoinItemLoader(item=MbuItem())
        for field, col in (("number_in_order", 0), ("order_no", 2),
                           ("order_date", 1), ("customer", 7), ("obj", 3),
                           ("address", 4), ("changes", 5),
                           ("cancellation", 6)):
            loader.add_value(field, row[col])
        loader.add_value("scan_url",
                         response.urljoin(row[8]) if row[8] else None)
        yield loader.load_item()
def parse(self, response):
    """Parse one_order blocks from the orders container; each block is a
    list of <li> elements keyed by CSS class."""
    field_selectors = (
        ("order_date", "li.order_date::text"),
        ("order_no", "li.order_number::text"),
        ("customer", "li.cust::text"),
        ("obj", "li.order_name::text"),
        ("address", "li.addr::text"),
        ("changes", "li.changes_info::text"),
        ("cancellation", "li.reason_canc::text"),
    )
    for block in response.css(
            "div.table-content-container div#orders div.one_order"):
        self.logger.debug("parsed row : {}".format(block.get()))
        loader = StripJoinItemLoader(item=MbuItem(), selector=block)
        for field, css_sel in field_selectors:
            loader.add_css(field, css_sel)
        download_href = block.css("li.download a::attr(href)").get()
        if download_href:
            loader.add_value("scan_url", response.urljoin(download_href))
        yield loader.load_item()
def parse(self, response):
    """Parse table rows; row 0 is the header.  Column 1 carries
    'YYYY-MM-DD number' split with regexes."""
    for row_no, tr in enumerate(response.css("table tr")):
        # skip first as header
        if not row_no:
            self.logger.debug("skipped row : {}".format(tr.get()))
            continue
        self.logger.debug("parse row : {}".format(tr.get()))
        loader = StripJoinItemLoader(item=MbuItem(), selector=tr)
        loader.add_css("order_no", "td:nth-child(1)::text", re=r" (.*)$")
        loader.add_css("order_date", "td:nth-child(1)::text", re=r"^[\d-]*")
        for field, col in (("customer", 2), ("obj", 3), ("address", 4),
                           ("changes", 5), ("cancellation", 6)):
            loader.add_css(field, "td:nth-child({})::text".format(col))
        loader.add_css("scan_url", "td:nth-child(7) a::attr(href)")
        yield loader.load_item()
def parse(self, response):
    """Parse registry table rows and follow 'Next' pagination through a
    randomly chosen proxy."""
    for row in response.css("table.table.table-striped.small tbody tr"):
        self.logger.debug("parse row : {}".format(row.get()))
        l = StripJoinItemLoader(item=MbuItem(), selector=row)
        l.add_css("order_no", "td:nth-child(1)::text")
        l.add_css("order_date", "td:nth-child(2)::text")
        l.add_css("customer", "td:nth-child(3)::text")
        l.add_css("obj", "td:nth-child(4)::text")
        l.add_css("address", "td:nth-child(5)::text")
        l.add_css("cadastre_number", "td:nth-child(6)::text")
        l.add_css("document_status", "td:nth-child(7) span::text")
        l.add_css("changes", "td:nth-child(8)::text")
        # FIX: extract_first() may return None and urljoin(None) silently
        # yields the page URL itself — guard both links before adding
        document_url = row.css(
            "td:nth-child(9) a::attr(href)").extract_first()
        if document_url:
            l.add_value("scan_url", response.urljoin(document_url))
        map_url = row.css("td:nth-child(1) a::attr(href)").extract_first()
        if map_url:
            l.add_value("map_url", response.urljoin(map_url))
        yield l.load_item()
    # if 'Next' page label present continue crawling
    if response.css("ul.pagination li a[aria-label=Next]").get():
        yield scrapy.Request(
            self.get_next_page(response),
            callback=self.parse,
            meta={"proxy": self.get_random_proxy()},
        )
def parse(self, response):
    """Parse article table rows, skipping the centered (header) rows and
    rows whose combined date/number cell is blank.

    Column 2 carries '№ <number> від <date>'; number and date are split
    with regexes.
    """
    for row in response.css(
            "article table tbody tr:not([align=\"center\"])"):
        l = StripJoinItemLoader(item=MbuItem(), selector=row)
        # skip empty lines
        date_order = ''.join(
            row.css(
                "td:nth-child(2) p span::text, td:nth-child(2) span::text, td:nth-child(2)::text"
            ).getall())
        if not date_order or not date_order.strip():
            self.logger.debug("skipped row : {}".format(row.get()))
            continue
        self.logger.debug("parse row : {}".format(row.get()))
        # number_in_order is unique only per year
        l.add_css(
            "number_in_order",
            "td:nth-child(1) p span::text, td:nth-child(1) span::text, td:nth-child(1)::text"
        )
        # FIX: re.search() returns None for a non-matching cell and
        # .group(1) crashed the whole crawl — skip the field instead.
        # The 'dsl' alternative is kept as-is; presumably a source-data
        # artefact of 'від' — TODO confirm.
        order_no_match = re.search(r'^\s?№? ?(.*)\s?(в?ід|dsl)', date_order)
        if order_no_match:
            l.add_value("order_no", order_no_match.group(1))
        l.add_css(
            "order_date",
            "td:nth-child(2) p span::text, td:nth-child(2) span::text, td:nth-child(2)::text",
            re=r"(\d{1,2}[\. /]?\d{1,2}[\. /]?\d{2,4})\s*$")
        l.add_css(
            "customer",
            "td:nth-child(3) span::text, td:nth-child(3)::text, td:nth-child(3) p::text"
        )
        l.add_css("obj",
                  "td:nth-child(4) span::text, td:nth-child(4)::text")
        l.add_css(
            "address",
            "td:nth-child(5) span::text, td:nth-child(5) p::text, td:nth-child(5)::text"
        )
        l.add_css("changes",
                  "td:nth-child(6) span::text, td:nth-child(6)::text")
        l.add_css("cancellation",
                  "td:nth-child(7) span::text, td:nth-child(7)::text")
        url = row.css("td:nth-child(8) a::attr(href)").extract_first()
        if url:
            l.add_value("scan_url", response.urljoin(url))
        yield l.load_item()