예제 #1
0
    def parse_filtered(self, response):
        """Parse a JSON response whose ``table`` key holds an HTML fragment.

        Yields the loaded item for every ``tr`` found in that fragment.
        """
        # ``response.text`` is the decoded body and replaces the deprecated
        # ``body_as_unicode()`` accessor.
        payload = json.loads(response.text)
        table = Selector(text=payload['table'])

        # Item field -> class of the icon span inside the link in the 5th cell.
        action_icons = (
            ("changes", "glyphicon glyphicon-edit"),
            ("cancellation", "glyphicon glyphicon-ban-circle"),
            ("scan_url", "glyphicon glyphicon-info-sign"),
        )
        link_xpath = "./td[position()=5]//a[.//span[contains(@class, '{}')]]/@href"

        for row in table.css("tr"):
            self.logger.debug("parse row : {}".format(row.get()))
            loader = StripJoinItemLoader(item=MbuItem(), selector=row)
            loader.add_css("order_no", "td:nth-child(1) strong::text")
            # The quantifier has to be inside the capture group; with
            # ``([\d\.])+`` only the last matched character was captured.
            loader.add_css("order_date", "td:nth-child(1)::text",
                           re=r"([\d\.]+)\s*№")
            loader.add_css("customer", "td:nth-child(2)::text")
            loader.add_css("obj", "td:nth-child(3) small::text")
            loader.add_css("address", "td:nth-child(4)::text")

            for field, icon_class in action_icons:
                href = row.xpath(link_xpath.format(icon_class)).extract_first()
                # Only store a URL when the row really has that action link.
                if href:
                    loader.add_value(field, response.urljoin(href))

            yield loader.load_item()
예제 #2
0
    def parse(self, response):
        """Yield the loaded item for each table row, then follow pagination.

        When a pagination link labelled ``Next`` is present, a request for
        ``self.get_next_page(response)`` is yielded with this method as
        callback and a proxy taken from ``self.get_random_proxy()``.
        """
        for row in response.css("table.table.table-striped.small tbody tr"):
            self.logger.debug("parse row : {}".format(row.get()))
            loader = StripJoinItemLoader(item=MbuItem(), selector=row)
            loader.add_css("order_no", "td:nth-child(1)::text")
            loader.add_css("order_date", "td:nth-child(2)::text")
            loader.add_css("customer", "td:nth-child(3)::text")
            loader.add_css("obj", "td:nth-child(4)::text")
            loader.add_css("address", "td:nth-child(5)::text")
            loader.add_css("cadastre_number", "td:nth-child(6)::text")
            loader.add_css("document_status", "td:nth-child(7) span::text")
            loader.add_css("changes", "td:nth-child(8)::text")

            # ``urljoin(None)`` returns the page URL itself, so a row without
            # a link must be skipped instead of storing a bogus URL.
            document_url = row.css(
                "td:nth-child(9) a::attr(href)").extract_first()
            if document_url:
                loader.add_value("scan_url", response.urljoin(document_url))

            map_url = row.css("td:nth-child(1) a::attr(href)").extract_first()
            if map_url:
                loader.add_value("map_url", response.urljoin(map_url))

            yield loader.load_item()

        # if 'Next' page label present continue crawling
        if response.css("ul.pagination li a[aria-label=Next]").get():
            yield scrapy.Request(
                self.get_next_page(response),
                callback=self.parse,
                meta={"proxy": self.get_random_proxy()},
            )
예제 #3
0
    def parse(self, response):
        """Yield the loaded item for every ``div.one_order`` block on the page."""
        # (item field, CSS selector relative to the order block), in load order.
        text_fields = (
            ("order_date", "li.order_date::text"),
            ("order_no", "li.order_number::text"),
            ("customer", "li.cust::text"),
            ("obj", "li.order_name::text"),
            ("address", "li.addr::text"),
            ("changes", "li.changes_info::text"),
            ("cancellation", "li.reason_canc::text"),
        )
        orders = response.css(
            "div.table-content-container div#orders div.one_order")

        for order in orders:
            self.logger.debug("parsed row : {}".format(order.get()))
            loader = StripJoinItemLoader(item=MbuItem(), selector=order)
            for field, css in text_fields:
                loader.add_css(field, css)

            # The download link is optional.
            download_href = order.css(
                "li.download a::attr(href)").extract_first()
            if download_href:
                loader.add_value("scan_url", response.urljoin(download_href))

            yield loader.load_item()
예제 #4
0
    def parse(self, response):
        """Yield the loaded item for each data row of the table.

        The header row (first cell equal to "№ з/п") and rows whose loaded
        item has no ``number_in_order`` are not yielded.
        """
        # (item field, CSS selector, extra add_css keyword arguments).
        cell_fields = (
            ("number_in_order", "td:nth-child(1)::text", {}),
            ("order_no",
             "td:nth-child(2) p::text, td:nth-child(2)::text",
             {"re": r"№ ?(.*)\s?$"}),
            ("order_date",
             "td:nth-child(2) p:nth-child(1)::text, td:nth-child(2)::text",
             {"re": r"(\d{1,2}[\. /]?\d{1,2}[\. /]?\d{2,4})[\sр\.]*"}),
            ("customer", "td:nth-child(3)::text", {}),
            ("obj", "td:nth-child(4)::text", {}),
            ("address", "td:nth-child(5)::text, td:nth-child(5) a::text", {}),
            ("changes", "td:nth-child(6)::text", {}),
            ("cancellation", "td:nth-child(7)::text", {}),
        )
        # Number and date of the scan are both taken from the link text.
        scan_link_fields = (
            ("scan_no", r"№(.*) ?від"),
            ("scan_date", r"від ?(.*)"),
        )

        for row in response.css("table tbody tr"):
            first_cell = row.css("td:nth-child(1)::text").get()
            if first_cell == "№ з/п":
                # header row
                self.logger.debug("skiped row : {}".format(row.get()))
                continue

            self.logger.debug("parse row : {}".format(row.get()))
            loader = StripJoinItemLoader(item=MbuItem(), selector=row)
            for field, css, options in cell_fields:
                loader.add_css(field, css, **options)

            scan_href = row.css("td:nth-child(8) a::attr(href)").extract_first()
            if scan_href:
                loader.add_value("scan_url", response.urljoin(scan_href))

            for field, pattern in scan_link_fields:
                loader.add_css(field, "td:nth-child(8) a::text", re=pattern)

            item = loader.load_item()
            if item.get("number_in_order"):
                yield item
예제 #5
0
    def parse(self, response):
        """Yield the loaded item for each registry row, then request the next page.

        Paging is done by re-submitting the ``aspnetForm`` form with the
        ``__doPostBack`` target/argument taken from the pager link that follows
        the current page cell (the cell containing a ``span``).
        """
        rows = response.css(
            'table.ms-listviewtable tr[class^=building-registry-row]')
        for row in rows:
            loader = StripJoinItemLoader(item=MbuItem(), selector=row)
            loader.add_css("order_no", "td:nth-child(1)::text")
            loader.add_css("number_in_order", "td:nth-child(2)::text")
            loader.add_css("order_date", "td:nth-child(3)::text")
            loader.add_css("decree_no", "td:nth-child(4)::text")
            loader.add_css("customer", "td:nth-child(5)::text, td:nth-child(5) div::text")
            loader.add_css("obj", "td:nth-child(6)::text, td:nth-child(6) div::text")
            loader.add_css("address", "td:nth-child(7)::text, td:nth-child(7) div::text")
            loader.add_css("changes", "td:nth-child(8) div::text")
            loader.add_css("cancellation", "td:nth-child(9) div::text")

            url = row.css("td:nth-child(10) a::attr(href)").extract_first()
            if url:
                loader.add_value("scan_url", response.urljoin(url))

            yield loader.load_item()

        # get next page number href next to current inactive with span tag
        next_page_js = response.xpath(
            '//table[@class="ms-listviewtable"]'
            '//tr[@class="building-registry-pager"]'
            '//table/tr/td[.//span]/following-sibling::td[1]/a/@href')
        if next_page_js:
            yield FormRequest.from_response(
                response,
                formname="aspnetForm",
                formxpath="//form[@id='aspnetForm']",
                dont_click=True,
                formdata={
                    # Raw strings: the previous non-raw literals relied on the
                    # invalid escape sequences ``\)`` and ``\(``.
                    '__EVENTARGUMENT': next_page_js.re(r"','(.*)'\)"),
                    '__EVENTTARGET': next_page_js.re(
                        r"javascript:__doPostBack\('(.*)',"),
                },
                dont_filter=True,
                callback=self.parse
            )
예제 #6
0
    def parse(self, response):
        """Yield the loaded item for each data row of the first post-body table.

        The first two rows of that table are headers and are skipped.
        """
        # Text of a cell, whether or not its span is wrapped in a paragraph.
        span_text = "td:nth-child({0}) p span::text, td:nth-child({0}) span::text"
        plain_columns = (
            ("customer", 3),
            ("obj", 4),
            ("address", 5),
            ("changes", 6),
            ("cancellation", 7),
        )
        link_css = (
            "td:nth-child(8) p span a::attr(href), "
            "td:nth-child(8) span a::attr(href), "
            "td:nth-child(8) a::attr(href)"
        )

        # only first table with data
        table_rows = response.css("div.post-body>table:first-of-type>tbody>tr")

        # first two are headers, skip
        for row in table_rows[2:]:
            loader = StripJoinItemLoader(item=MbuItem(), selector=row)
            # because of errors in html, get td from current root only
            loader.add_xpath(
                "number_in_order",
                "./td[position()=1]/span/text()|./td[position()=1]/p/span/text()",
                re=r"(\d+)\s?")
            loader.add_css("order_no", span_text.format(2),
                           re=r"^\s*№ ?(.*)\s?від")
            loader.add_css(
                "order_date", span_text.format(2),
                re=r"(\d{1,2}[\. /]?\d{1,2}[\. /]?\d{2,4})[\sр\.]*$")
            for field, column in plain_columns:
                loader.add_css(field, span_text.format(column))

            scan_href = row.css(link_css).extract_first()
            if scan_href:
                loader.add_value("scan_url", response.urljoin(scan_href))

            yield loader.load_item()
예제 #7
0
    def parse(self, response):
        """Yield the loaded item for each table row that is not styled as a header."""
        doc_link = "td:nth-child(1) a.add-google-doc"
        # (item field, CSS selector, extra add_css keyword arguments).
        cell_fields = (
            ("order_no", doc_link + "::text", {"re": r"№\s?(.+)$"}),
            ("order_date", doc_link + "::text", {"re": r"([\d\.]*) №"}),
            ("customer", "td:nth-child(2)::text, td:nth-child(2) p::text", {}),
            ("obj", "td:nth-child(3)::text, td:nth-child(3) p::text", {}),
            ("address", "td:nth-child(4)::text", {}),
            ("changes", "td:nth-child(5)::text", {}),
            ("cancellation", "td:nth-child(6)::text", {}),
        )

        for row in response.css("table tbody tr"):
            # skip styled as header
            styled_as_header = row.css("td:nth-child(1) p strong").get()
            if styled_as_header:
                self.logger.debug("skipped row : {}".format(row.get()))
                continue

            self.logger.debug("parsed row : {}".format(row.get()))
            loader = StripJoinItemLoader(item=MbuItem(), selector=row)
            for field, css, options in cell_fields:
                loader.add_css(field, css, **options)

            # A cancellation cell may link to several documents.
            cancellation_hrefs = row.css(
                "td:nth-child(6) a.add-google-doc::attr(href)").getall()
            if cancellation_hrefs:
                absolute_urls = []
                for href in cancellation_hrefs:
                    absolute_urls.append(response.urljoin(href))
                loader.add_value("cancellation_url", absolute_urls)

            scan_href = row.css(doc_link + "::attr(href)").extract_first()
            if scan_href:
                loader.add_value("scan_url", response.urljoin(scan_href))

            yield loader.load_item()
예제 #8
0
    def parse(self, response):
        """Yield the loaded item for each registry row and follow the next-page link."""
        # (item field, CSS selector relative to the row), in load order.
        cell_fields = (
            ("order_no", "td:nth-child(1) strong::text"),
            ("order_date", "td:nth-child(1)::text"),
            ("prescript_no", "td:nth-child(2) strong::text"),
            ("prescript_date", "td:nth-child(2)::text"),
            ("customer", "td:nth-child(5)::text"),
            ("obj", "td:nth-child(3)::text"),
            ("address", "td:nth-child(4)::text"),
            ("changes", "td:nth-child(6)::text"),
            ("cancellation", "td:nth-child(7)::text"),
        )

        for row in response.css("table.table-registry tbody tr"):
            self.logger.debug("parse row : {}".format(row.get()))
            loader = StripJoinItemLoader(item=MbuItem(), selector=row)
            for field, css in cell_fields:
                loader.add_css(field, css)

            scan_href = row.css("td:nth-child(8) a::attr(href)").extract_first()
            if scan_href:
                loader.add_value("scan_url", response.urljoin(scan_href))

            yield loader.load_item()

        next_href = response.css(
            'a.next.page-numbers::attr(href)').extract_first()
        if not next_href:
            return

        self.logger.debug("follow next page : {}".format(next_href))
        yield response.follow(next_href)
예제 #9
0
    def parse(self, response):
        """Yield the loaded item for every table row after the first (header) one."""
        # (item field, CSS selector, extra add_css keyword arguments).
        cell_fields = (
            ("order_no", "td:nth-child(1)::text", {"re": r" (.*)$"}),
            ("order_date", "td:nth-child(1)::text", {"re": r"^[\d-]*"}),
            ("customer", "td:nth-child(2)::text", {}),
            ("obj", "td:nth-child(3)::text", {}),
            ("address", "td:nth-child(4)::text", {}),
            ("changes", "td:nth-child(5)::text", {}),
            ("cancellation", "td:nth-child(6)::text", {}),
            ("scan_url", "td:nth-child(7) a::attr(href)", {}),
        )

        rows = iter(response.css("table tr"))

        # skip first as header
        header = next(rows, None)
        if header is not None:
            self.logger.debug("skipped row : {}".format(header.get()))

        for row in rows:
            self.logger.debug("parse row : {}".format(row.get()))
            loader = StripJoinItemLoader(item=MbuItem(), selector=row)
            for field, css, options in cell_fields:
                loader.add_css(field, css, **options)
            yield loader.load_item()
예제 #10
0
    def parse(self, response):
        """Yield the loaded item for each eight-cell row without a bold span."""
        first_paragraph = (
            "td:nth-child(2) p:nth-child(1)::text, "
            "td:nth-child(2) p:nth-child(1) span::text"
        )
        # (item field, CSS selector, extra add_css keyword arguments).
        cell_fields = (
            ("number_in_order",
             "td:nth-child(1) p::text, td:nth-child(1)::text", {}),
            ("order_no", first_paragraph, {"re": r"№\s?(\d+)"}),
            ("order_date", first_paragraph,
             {"re": r"(\d{1,2}[\. /]?\d{1,2}[\. /]?\d{2,4})"}),
            ("remarks", "td:nth-child(2) p:nth-child(2)::text", {}),
            ("customer", "td:nth-child(3) p::text", {}),
            ("obj", "td:nth-child(4) p::text", {}),
            ("address", "td:nth-child(5) p::text", {}),
            ("changes", "td:nth-child(6) p::text, td:nth-child(6) a::text", {}),
            ("cancellation",
             "td:nth-child(7) p::text, td:nth-child(7) a::text", {}),
            ("scan_text", "td:nth-child(8) a::text, td:nth-child(8) p::text", {}),
            # link targets of the last three cells
            ("changes_url", "td:nth-child(6) a::attr(href)", {}),
            ("cancellation_url", "td:nth-child(7) a::attr(href)", {}),
            ("scan_url", "td:nth-child(8) a::attr(href)", {}),
        )

        data_rows = response.xpath(
            '//table/tbody/tr[count(td)=8 and not(./td//span/strong)]')
        for row in data_rows:
            self.logger.debug("parse row : {}".format(row.get()))
            loader = StripJoinItemLoader(item=MbuItem(), selector=row)
            for field, css, options in cell_fields:
                loader.add_css(field, css, **options)
            yield loader.load_item()
예제 #11
0
    def parse(self, response):
        """Yield one loaded item per order found in each table row.

        A row can hold several orders (a main order and its changes), one per
        ``p`` (or ``span``) of the third cell. For the n-th order the n-th
        ``p``/``span`` of every other cell is tried first, then the first one.
        The lookups go through ``self.get_first_existed``, which is defined
        elsewhere; it is assumed to return the first match among the given
        selectors, or a falsy value when none matches.

        After all rows are processed, a log message reports which
        ``number_in_order`` values below the highest loaded one are missing.
        """

        def nth_or_first(column, position):
            # Selectors for the position-th p/span of a cell, then the first.
            cell = "td:nth-child({})".format(column)
            return (
                "{} p:nth-child({}) span::text".format(cell, position),
                "{} p:nth-child(1) span::text".format(cell),
                "{} span:nth-child({})::text".format(cell, position),
                "{} span:nth-child(1)::text".format(cell),
            )

        # Cells that all follow the same "n-th, else first" lookup.
        uniform_columns = (
            ("customer", 4),
            ("obj", 5),
            ("address", 6),
            ("changes", 7),
            ("cancellation", 8),
        )

        loaded_order_numbers = []
        for index, row in enumerate(response.css("table tbody tr")):
            # first and second are headers, skip
            if index < 2:
                continue

            orders_in_row = len(row.css("td:nth-child(3) p").getall())
            if orders_in_row == 0:
                orders_in_row = len(row.css("td:nth-child(3) span"))

            # each row is sub divided for main order and it's changes
            for position in range(1, orders_in_row + 1):
                loader = StripJoinItemLoader(item=MbuItem(), selector=row)
                loader.add_value(
                    "number_in_order",
                    self.get_first_existed(row, "td:nth-child(1) p span::text",
                                           "td:nth-child(1) span::text"))

                order_no = row.css(
                    "td:nth-child(3) p:nth-child({0}) span::text, "
                    "td:nth-child(3) span:nth-child({0})::text"
                    .format(position)).get()
                if not order_no:
                    continue

                loader.add_value("order_no", order_no)

                # The last fallback of this cell differs from the other cells
                # (nested span), so it is spelled out instead of generated.
                loader.add_value(
                    "order_date",
                    self.get_first_existed(
                        row,
                        "td:nth-child(2) p:nth-child({}) span::text".format(
                            position),
                        "td:nth-child(2) p:nth-child(1) span::text",
                        "td:nth-child(2) span:nth-child({})::text".format(
                            position),
                        "td:nth-child(2) span:nth-child(1) span::text"))

                for field, column in uniform_columns:
                    loader.add_value(
                        field,
                        self.get_first_existed(
                            row, *nth_or_first(column, position)))

                url = self.get_first_existed(
                    row,
                    "td:nth-child(9) p:nth-child({}) span a::attr(href)".format(
                        position),
                    "td:nth-child(9) p:nth-child(1) span a::attr(href)",
                    "td:nth-child(9) a:nth-child({})::attr(href)".format(
                        position),
                    "td:nth-child(9) a:nth-child(1)::attr(href)")

                # TODO add decree found in 357th row

                # ``urljoin(None)`` returns the page URL itself; only store a
                # scan URL when a link was actually found.
                if url:
                    loader.add_value("scan_url", response.urljoin(url))

                address_assign_url = self.get_first_existed(
                    row, "td:nth-child(10) a::attr(href)")

                if address_assign_url:
                    loader.add_value("address_assign_url",
                                     response.urljoin(address_assign_url))
                    loader.add_css("address_assign_no",
                                   "td:nth-child(10) a span::text",
                                   re=r"№(.*) ?від")
                    loader.add_css("address_assign_date",
                                   "td:nth-child(10) a span::text",
                                   re=r"від ?(.*)$")

                # Bookkeeping for the gap check below; a missing or non-numeric
                # number must not abort parsing of the remaining rows.
                collected = loader.get_collected_values("number_in_order")
                try:
                    loaded_order_numbers.append(int(collected[0]))
                except (IndexError, TypeError, ValueError):
                    self.logger.warning(
                        "Row %s has no numeric number_in_order: %s",
                        index, collected)

                yield loader.load_item()

        # ``max()`` raises on an empty sequence, so handle "nothing loaded".
        if not loaded_order_numbers:
            self.logger.warning("No order numbers were loaded")
            return

        # check if all consecutive orders where loaded
        missed_order_numbers = set(
            range(1, max(loaded_order_numbers))
        ).difference(loaded_order_numbers)

        if missed_order_numbers:
            self.logger.warning("Missed order numbers: %s",
                                missed_order_numbers)
        else:
            self.logger.info("All order numbers processed")
예제 #12
0
    def parse(self, response):
        """Yield the loaded item for every body row of ``table#droptablesTbl4``."""
        # (item field, CSS selector, extra add_css keyword arguments).
        cell_fields = (
            ("order_no", "td:nth-child(2)::text", {"re": r"([\d]+)$"}),
            ("order_date", "td:nth-child(2)::text", {"re": r"^[\d.]*"}),
            ("customer", "td:nth-child(3)::text", {}),
            ("obj", "td:nth-child(4)::text", {}),
            ("address", "td:nth-child(5)::text", {}),
            ("changes", "td:nth-child(6)::text", {}),
            ("cancellation", "td:nth-child(7)::text", {}),
        )

        for row in response.css("table#droptablesTbl4 tbody tr"):
            loader = StripJoinItemLoader(item=MbuItem(), selector=row)
            for field, css, options in cell_fields:
                loader.add_css(field, css, **options)

            # The scan link in the eighth cell is optional.
            scan_href = row.css("td:nth-child(8) a::attr(href)").extract_first()
            if scan_href:
                loader.add_value("scan_url", response.urljoin(scan_href))

            yield loader.load_item()
예제 #13
0
    def parse(self, response):
        """Yield the loaded item for each table row.

        Rows among the first five that contain no cell text are skipped.
        """
        # Cell text, either directly in the cell or inside a span.
        cell_text = "td:nth-child({0})::text, td:nth-child({0}) span::text"
        plain_columns = (
            ("customer", 3),
            ("obj", 4),
            ("address", 5),
            ("changes", 6),
        )

        for index, row in enumerate(response.css("table>tbody>tr")):
            # skip headers and rows with empty lines
            if index < 5:
                row_text = "".join(
                    row.css("td::text, td span::text").getall())
                if not row_text.strip():
                    self.logger.debug("skipped index : {}, row : {}".format(
                        index, row.get()))
                    continue

            self.logger.debug("parse index : {}, row : {}".format(
                index, row.get()))

            loader = StripJoinItemLoader(item=MbuItem(), selector=row)
            loader.add_css("decree", "td:nth-child(1)::text")
            loader.add_css("order_no", cell_text.format(2),
                           re=r"^\s?(.*)від")
            loader.add_css("order_date", cell_text.format(2),
                           re=r"від\s?([\d\.]*)\s*$")
            for field, column in plain_columns:
                loader.add_css(field, cell_text.format(column))
            loader.add_css(
                "cancellation",
                "td:nth-child(7)::text, td:nth-child(7) a::text, td:nth-child(7) span::text"
            )

            # Both link cells are optional.
            link_cells = (
                ("cancellation_url", "td:nth-child(7) a::attr(href)"),
                ("scan_url", "td:nth-child(8) a::attr(href)"),
            )
            for field, css in link_cells:
                href = row.css(css).extract_first()
                if href:
                    loader.add_value(field, response.urljoin(href))

            yield loader.load_item()
예제 #14
0
    def parse(self, response):
        """Yield the loaded item for every body row of ``table#tabledataMto``."""
        # (item field, CSS selector), in load order. ``address`` is fed from
        # both the street cell and the street-number cell.
        cell_fields = (
            ("order_no", "td:nth-child(1)::text"),
            ("order_date", "td:nth-child(4)::text"),
            ("customer", "td:nth-child(5)::text"),
            ("obj", "td:nth-child(6)::text"),
            ("obj_purpose", "td:nth-child(7)::text"),
            ("address_street", "td:nth-child(2)::text"),
            ("address_street_number", "td:nth-child(3)::text"),
            ("address", "td:nth-child(2)::text"),
            ("address", "td:nth-child(3)::text"),
            ("cancellation", "td:nth-child(8)::text"),
        )

        for row in response.css("table#tabledataMto tbody tr"):
            self.logger.debug("parse row : {}".format(row.get()))
            loader = StripJoinItemLoader(item=MbuItem(), selector=row)
            for field, css in cell_fields:
                loader.add_css(field, css)

            scan_href = row.css("td:nth-child(9) a::attr(href)").extract_first()
            if scan_href:
                loader.add_value("scan_url", response.urljoin(scan_href))

            yield loader.load_item()
예제 #15
0
    def parse(self, response):
        """Yield the loaded item for each non-centered article table row.

        Rows whose second cell (order number and date) has no text are
        skipped. ``order_no`` is cut out of that cell's text with a regular
        expression; when the expression does not match, the row is still
        yielded, just without ``order_no``.
        """
        second_cell_css = (
            "td:nth-child(2) p span::text, td:nth-child(2) span::text, "
            "td:nth-child(2)::text"
        )
        # Compiled once; same pattern as before, written as a raw string.
        order_no_pattern = re.compile(r'^\s?№? ?(.*)\s?(в?ід|dsl)')

        for row in response.css(
                "article table tbody tr:not([align=\"center\"])"):
            # skip empty lines
            date_order = ''.join(row.css(second_cell_css).getall())
            if not date_order.strip():
                self.logger.debug("skipped row : {}".format(row.get()))
                continue

            self.logger.debug("parse row : {}".format(row.get()))
            # Build the loader only for rows that are actually parsed.
            loader = StripJoinItemLoader(item=MbuItem(), selector=row)

            # number_in_order is unique only per year
            loader.add_css(
                "number_in_order",
                "td:nth-child(1) p span::text, td:nth-child(1) span::text, td:nth-child(1)::text"
            )

            # ``search`` returns None on a miss; calling ``.group`` on it would
            # raise and abort parsing of all remaining rows.
            order_no_match = order_no_pattern.search(date_order)
            if order_no_match:
                loader.add_value("order_no", order_no_match.group(1))
            else:
                self.logger.warning(
                    "order_no not found in : {}".format(date_order))

            loader.add_css(
                "order_date",
                second_cell_css,
                re=r"(\d{1,2}[\. /]?\d{1,2}[\. /]?\d{2,4})\s*$")
            loader.add_css(
                "customer",
                "td:nth-child(3) span::text, td:nth-child(3)::text, td:nth-child(3) p::text"
            )
            loader.add_css("obj",
                           "td:nth-child(4) span::text, td:nth-child(4)::text")
            loader.add_css(
                "address",
                "td:nth-child(5) span::text, td:nth-child(5) p::text, td:nth-child(5)::text"
            )
            loader.add_css("changes",
                           "td:nth-child(6) span::text, td:nth-child(6)::text")
            loader.add_css("cancellation",
                           "td:nth-child(7) span::text, td:nth-child(7)::text")

            url = row.css("td:nth-child(8) a::attr(href)").extract_first()
            if url:
                loader.add_value("scan_url", response.urljoin(url))

            yield loader.load_item()