def parse(self, response):
    """Extract registry rows from a SharePoint list view and follow the
    ASP.NET ``__doPostBack`` pagination to the next page.

    Yields one MbuItem per table row, then (if a next-page link exists)
    a FormRequest that simulates the postback click.
    """
    for row in response.css('table.ms-listviewtable tr[class^=building-registry-row]'):
        l = StripJoinItemLoader(item=MbuItem(), selector=row)
        l.add_css("order_no", "td:nth-child(1)::text")
        l.add_css("number_in_order", "td:nth-child(2)::text")
        l.add_css("order_date", "td:nth-child(3)::text")
        l.add_css("decree_no", "td:nth-child(4)::text")
        l.add_css("customer", "td:nth-child(5)::text, td:nth-child(5) div::text")
        l.add_css("obj", "td:nth-child(6)::text, td:nth-child(6) div::text")
        l.add_css("address", "td:nth-child(7)::text, td:nth-child(7) div::text")
        l.add_css("changes", "td:nth-child(8) div::text")
        l.add_css("cancellation", "td:nth-child(9) div::text")
        url = row.css("td:nth-child(10) a::attr(href)").extract_first()
        if url:
            l.add_value("scan_url", response.urljoin(url))
        yield l.load_item()
    # get next page number href next to current inactive with span tag
    nextPageJs = response.xpath(
        '//table[@class="ms-listviewtable"]//tr[@class="building-registry-pager"]//table/tr/td[.//span]/following-sibling::td[1]/a/@href'
    )
    if len(nextPageJs):
        yield FormRequest.from_response(
            response,
            formname="aspnetForm",
            formxpath="//form[@id='aspnetForm']",
            dont_click=True,
            formdata={
                # FIX: raw strings — '\(' and '\)' are invalid escape
                # sequences in plain string literals (DeprecationWarning
                # since Python 3.6, a SyntaxError in future versions).
                # The patterns themselves are unchanged.
                '__EVENTARGUMENT': nextPageJs.re(r"','(.*)'\)"),
                '__EVENTTARGET': nextPageJs.re(r"javascript:__doPostBack\('(.*)',")
            },
            dont_filter=True,
            callback=self.parse)
def parse(self, response):
    """Parse registry rows; rows whose first cell contains a bold
    paragraph (<p><strong>) are table headers and are skipped.

    The first column carries a "date № number" link that doubles as the
    scan download; the same link's href may also appear in column 6 as
    cancellation documents.
    """
    for tr in response.css("table tbody tr"):
        # skip styled as header
        if tr.css("td:nth-child(1) p strong").get():
            self.logger.debug("skipped row : {}".format(tr.get()))
            continue
        self.logger.debug("parsed row : {}".format(tr.get()))
        loader = StripJoinItemLoader(item=MbuItem(), selector=tr)
        loader.add_css("order_no", "td:nth-child(1) a.add-google-doc::text",
                       re=r"№\s?(.+)$")
        loader.add_css("order_date", "td:nth-child(1) a.add-google-doc::text",
                       re=r"([\d\.]*) №")
        loader.add_css("customer",
                       "td:nth-child(2)::text, td:nth-child(2) p::text")
        loader.add_css("obj", "td:nth-child(3)::text, td:nth-child(3) p::text")
        loader.add_css("address", "td:nth-child(4)::text")
        loader.add_css("changes", "td:nth-child(5)::text")
        loader.add_css("cancellation", "td:nth-child(6)::text")
        cancellation_links = tr.css(
            "td:nth-child(6) a.add-google-doc::attr(href)").getall()
        if cancellation_links:
            loader.add_value(
                "cancellation_url",
                [response.urljoin(href) for href in cancellation_links])
        scan_href = tr.css("td:nth-child(1) a.add-google-doc::attr(href)").get()
        if scan_href:
            loader.add_value("scan_url", response.urljoin(scan_href))
        yield loader.load_item()
def parse(self, response):
    """Parse the first data table of the post body.

    The first two rows are headers and are sliced off.  Because the page
    HTML is broken, cell lookups for number_in_order are anchored to the
    current row via XPath rather than CSS descendants.
    """
    # only first table with data
    data_rows = response.css("div.post-body>table:first-of-type>tbody>tr")
    # first two are headers, skip
    for row in data_rows[2:]:
        loader = StripJoinItemLoader(item=MbuItem(), selector=row)
        # because of errors in html, get td from current root only
        loader.add_xpath(
            "number_in_order",
            "./td[position()=1]/span/text()|./td[position()=1]/p/span/text()",
            re=r"(\d+)\s?")
        loader.add_css(
            "order_no",
            "td:nth-child(2) p span::text, td:nth-child(2) span::text",
            re=r"^\s*№ ?(.*)\s?від")
        loader.add_css(
            "order_date",
            "td:nth-child(2) p span::text, td:nth-child(2) span::text",
            re=r"(\d{1,2}[\. /]?\d{1,2}[\. /]?\d{2,4})[\sр\.]*$")
        # remaining text columns share the same selector shape
        for field, column in (("customer", 3), ("obj", 4), ("address", 5),
                              ("changes", 6), ("cancellation", 7)):
            loader.add_css(
                field,
                "td:nth-child({0}) p span::text, td:nth-child({0}) span::text"
                .format(column))
        scan_href = row.css(
            "td:nth-child(8) p span a::attr(href), td:nth-child(8) span a::attr(href), td:nth-child(8) a::attr(href)"
        ).extract_first()
        if scan_href:
            loader.add_value("scan_url", response.urljoin(scan_href))
        yield loader.load_item()
def parse(self, response):
    """Parse the registry table and follow WordPress-style pagination
    (`a.next.page-numbers`) until no next link is present."""
    for row in response.css("table.table-registry tbody tr"):
        self.logger.debug("parse row : {}".format(row.get()))
        loader = StripJoinItemLoader(item=MbuItem(), selector=row)
        loader.add_css("order_no", "td:nth-child(1) strong::text")
        loader.add_css("order_date", "td:nth-child(1)::text")
        loader.add_css("prescript_no", "td:nth-child(2) strong::text")
        loader.add_css("prescript_date", "td:nth-child(2)::text")
        loader.add_css("customer", "td:nth-child(5)::text")
        loader.add_css("obj", "td:nth-child(3)::text")
        loader.add_css("address", "td:nth-child(4)::text")
        loader.add_css("changes", "td:nth-child(6)::text")
        loader.add_css("cancellation", "td:nth-child(7)::text")
        scan_href = row.css("td:nth-child(8) a::attr(href)").get()
        if scan_href:
            loader.add_value("scan_url", response.urljoin(scan_href))
        yield loader.load_item()
    next_page = response.css('a.next.page-numbers::attr(href)').get()
    if next_page:
        self.logger.debug("follow next page : {}".format(next_page))
        yield response.follow(next_page)
def parse(self, response):
    """Parse a DataTables-style JSON payload.

    Rows arrive as positional lists under "aaData"; column 1 holds a
    combined "№ <number> від <date>" string that is split with regexes.
    """
    jsonresponse = json.loads(response.body_as_unicode())
    for row in jsonresponse["aaData"]:
        self.logger.debug("parsed row : {}".format(row))
        l = StripJoinItemLoader(item=MbuItem())
        l.add_value("number_in_order", row[0])
        # FIX: raw strings for the regex literals — "\." in a plain
        # string is an invalid escape sequence (DeprecationWarning since
        # Python 3.6).  Patterns are unchanged.
        l.add_value(
            "order_no",
            re.search(r"№ ?(.*) ?(ві|от)", row[1]).group(1)
            if row[1] else None)
        l.add_value(
            "order_date",
            re.search(r"([0-9]{1,2}\.[0-9]{1,2}\. ?[0-9]{1,4})",
                      row[1]).group(1) if row[1] else None)
        l.add_value("customer", row[2])
        l.add_value("obj", row[3])
        l.add_value("address", row[4])
        l.add_value("changes", row[5])
        l.add_value("cancellation", row[6])
        # column 7 is an HTML fragment containing the scan link
        l.add_value(
            "scan_url",
            Selector(text=row[7]).css("a::attr(href)").extract_first()
            if row[7] else None)
        yield l.load_item()
class ZhytomyrSpider(scrapy.spiders.CSVFeedSpider):
    """Spider for the Zhytomyr city council building registry.

    The registry is published as downloadable .xls files; rows whose
    order numbers match (modulo separator characters) are merged into a
    single item loader and flushed after each workbook is parsed.
    """
    location_name = "Житомир"
    name = "zhytomyr"
    allowed_domains = ["zt-rada.gov.ua"]
    start_urls = ["http://zt-rada.gov.ua/?3398[0]=6281"]
    custom_settings = {
        # specifies exported fields and order
        'FEED_EXPORT_FIELDS': [
            "location_name", "order_no", "order_date", "customer", "obj",
            "address", "changes", "cancellation", "scan_url"
        ],
    }
    # order_no -> loader; a fresh loader is created on first access
    item_loaders = defaultdict(lambda: StripJoinItemLoader(item=MbuItem()))

    def parse_xls_and_flush(self, response):
        """Read the downloaded .xls sheet into per-order loaders, then
        emit every accumulated item and reset the loader cache."""
        sheet = xlrd.open_workbook(
            file_contents=response.body).sheet_by_index(0)
        for index in range(1, sheet.nrows):
            row = sheet.row(index)
            # if row is not empty
            if row[1].value:
                order_no = row[0].value.replace('№', '').strip()
                l = self.get_item(order_no)
                # populate only once per order number so duplicate rows
                # (same number, different separators) don't join twice
                # NOTE(review): nesting reconstructed from mangled source
                # — confirm all fields belong under this guard
                if not l.get_output_value('order_no'):
                    l.add_value("order_no", order_no)
                    l.add_value("order_date", row[1].value)
                    l.add_value("customer", row[6].value)
                    l.add_value("obj", row[2].value)
                    l.add_value("address", row[3].value)
                    l.add_value("changes", row[7].value)
                    l.add_value("cancellation", row[8].value)
            else:
                self.logger.debug("skipped index : {}, row : {}".format(
                    index, row))
        for item in self.item_loaders.values():
            yield item.load_item()
        # clear in case of several xls files found
        self.item_loaders.clear()

    def parse(self, response):
        """Find .xls download links on the listing page and paginate."""
        for row in response.css(".docrowcontainer"):
            document_url = row.css(
                "div:nth-child(2) a.docdownload::attr(href)").extract_first()
            # FIX: extract_first() returns None when a row has no download
            # link — guard before calling .endswith()
            if document_url and document_url.endswith('.xls'):
                self.logger.info(
                    "xls document found : {}".format(document_url))
                yield response.follow(
                    document_url,
                    callback=self.parse_xls_and_flush,
                    priority=10)  # big priority to run last
            # self.logger.debug("parse site row : {}".format(row.get()))
            # order_no = "".join(row.css("div:nth-child(1)::text").re(r"№ ?(.*)")).strip()
            # l = self.get_item(order_no)
            # l.selector = row
            # l.add_value("order_no", order_no)
            # l.add_xpath("order_date", "./@data-year")
            # l.add_value("scan_url", response.urljoin(document_url))
        next_page_link = response.xpath(
            '//*[@id="tp6"]//ul/li[@class="active"]/following-sibling::li[1]/a/@href'
        ).get()
        if next_page_link:
            self.logger.debug("next page link : {}".format(next_page_link))
            yield response.follow(next_page_link, callback=self.parse)

    def get_item(self, order_no):
        """Return the loader for *order_no*, reusing an existing loader
        whose key differs only in separator characters (dots, spaces,
        slashes, dashes, pipes)."""
        # tries to find similar order number with different separating signs
        # FIX: raw string — '\.', '\-', '\|' are invalid escape sequences
        # in a plain string literal (DeprecationWarning since Python 3.6)
        splitter = r'[\.\s/\-\|]+'
        order_id_list = re.split(splitter, order_no.strip())
        filtered_order = list(
            filter(
                lambda ord: order_id_list == re.split(splitter, ord.strip()),
                self.item_loaders.keys()))
        if len(filtered_order):
            self.logger.debug(
                'similar to order_no : {} found in loaders : {}'.format(
                    order_no, filtered_order))
        return self.item_loaders[filtered_order[0] if len(filtered_order
                                                          ) else order_no]
def parse_filtered(self, response):
    """Parse the HTML table carried inside the JSON response ('table' key).

    Column 5 carries up to three action links distinguished by their
    glyphicon class: edit = changes, ban-circle = cancellation,
    info-sign = scan.
    """
    jsonresponse = json.loads(response.body_as_unicode())
    table = Selector(text=jsonresponse['table'])
    for row in table.css("tr"):
        self.logger.debug("parse row : {}".format(row.get()))
        l = StripJoinItemLoader(item=MbuItem(), selector=row)
        l.add_css("order_no", "td:nth-child(1) strong::text")
        # FIX: the capture group must wrap the whole repetition —
        # r"([\d\.])+\s*№" captured only the LAST repeated character,
        # truncating the date to a single digit/dot
        l.add_css("order_date", "td:nth-child(1)::text", re=r"([\d\.]+)\s*№")
        l.add_css("customer", "td:nth-child(2)::text")
        l.add_css("obj", "td:nth-child(3) small::text")
        l.add_css("address", "td:nth-child(4)::text")
        changes_url = row.xpath(
            "./td[position()=5]//a[.//span[contains(@class, 'glyphicon glyphicon-edit')]]/@href"
        ).extract_first()
        if changes_url:
            l.add_value("changes", response.urljoin(changes_url))
        cancellation_url = row.xpath(
            "./td[position()=5]//a[.//span[contains(@class, 'glyphicon glyphicon-ban-circle')]]/@href"
        ).extract_first()
        if cancellation_url:
            l.add_value("cancellation", response.urljoin(cancellation_url))
        scan_url = row.xpath(
            "./td[position()=5]//a[.//span[contains(@class, 'glyphicon glyphicon-info-sign')]]/@href"
        ).extract_first()
        if scan_url:
            l.add_value("scan_url", response.urljoin(scan_url))
        yield l.load_item()
def parse(self, response):
    """Parse building-permit features from an ArcGIS-style JSON response.

    Each feature's flat 'attributes' dict uses fully-qualified
    'Kadastr2016.DBO.*' keys; the prefix is stripped when dumping the raw
    attributes into 'additional_fields' for later use.
    """
    jsonresponse = json.loads(response.body_as_unicode())
    for feature in jsonresponse['features']:
        self.logger.debug("parse row : {}".format(feature))
        attributes = feature['attributes']
        l = StripJoinItemLoader(item=MbuItem())
        l.add_value("number_in_order",
                    str(attributes['Kadastr2016.DBO.MBU.OBJECTID']))
        l.add_value(
            "order_no", attributes['Kadastr2016.DBO.MBU.NomKancel']
            if attributes['Kadastr2016.DBO.MBU.NomKancel'] else '-')
        # timestamp is in milliseconds; NOTE(review): fromtimestamp()
        # uses the local timezone — confirm the source publishes local
        # dates before relying on day-precision here
        l.add_value(
            "order_date",
            str(
                datetime.fromtimestamp(
                    attributes['Kadastr2016.DBO.MBU.Data'] / 1000).date())
            if attributes['Kadastr2016.DBO.MBU.Data'] else None)
        l.add_value("customer", attributes['Kadastr2016.DBO.MBU.Zamovnuk_MBO'])
        l.add_value("obj", attributes['Kadastr2016.DBO.MBU.Nazobekty'])
        l.add_value("address", self.get_address(attributes))
        l.add_value("changes", attributes['Kadastr2016.DBO.MBU.Zmini'])
        l.add_value("cancellation",
                    attributes['Kadastr2016.DBO.MBU.Skasuvannia'])
        # all original fields to be used later
        # FIX: dict comprehension instead of dict([[k, v] for ...])
        prefix_len = len('Kadastr2016.DBO.')
        additional_fields = {
            key[prefix_len:]: value
            for key, value in attributes.items()
        }
        l.add_value('additional_fields',
                    json.dumps(additional_fields, ensure_ascii=False))
        # up to two scan links, joined into a single comma-separated value
        scan_urls = [
            response.urljoin(attributes[link_key])
            for link_key in ('Kadastr2016.DBO.MBU.Link_1',
                             'Kadastr2016.DBO.MBU.Link_2')
            if attributes[link_key]
        ]
        if scan_urls:
            l.add_value("scan_url", ",".join(scan_urls))
        yield l.load_item()
def parse(self, response):
    """Parse 8-column table rows, excluding header rows (identified by a
    bold span anywhere in the row).

    Column 2's first paragraph carries "№ <number> <date>"; its second
    paragraph holds free-form remarks.
    """
    data_rows = response.xpath(
        '//table/tbody/tr[count(td)=8 and not(./td//span/strong)]')
    for row in data_rows:
        self.logger.debug("parse row : {}".format(row.get()))
        loader = StripJoinItemLoader(item=MbuItem(), selector=row)
        loader.add_css("number_in_order",
                       "td:nth-child(1) p::text, td:nth-child(1)::text")
        order_cell = ("td:nth-child(2) p:nth-child(1)::text, "
                      "td:nth-child(2) p:nth-child(1) span::text")
        loader.add_css("order_no", order_cell, re=r"№\s?(\d+)")
        loader.add_css("order_date", order_cell,
                       re=r"(\d{1,2}[\. /]?\d{1,2}[\. /]?\d{2,4})")
        loader.add_css("remarks", "td:nth-child(2) p:nth-child(2)::text")
        # plain fields, in the same order as before
        for field, css_sel in (
                ("customer", "td:nth-child(3) p::text"),
                ("obj", "td:nth-child(4) p::text"),
                ("address", "td:nth-child(5) p::text"),
                ("changes", "td:nth-child(6) p::text, td:nth-child(6) a::text"),
                ("cancellation",
                 "td:nth-child(7) p::text, td:nth-child(7) a::text"),
                ("scan_text",
                 "td:nth-child(8) a::text, td:nth-child(8) p::text"),
                ("changes_url", "td:nth-child(6) a::attr(href)"),
                ("cancellation_url", "td:nth-child(7) a::attr(href)"),
                ("scan_url", "td:nth-child(8) a::attr(href)")):
            loader.add_css(field, css_sel)
        yield loader.load_item()
def parse(self, response):
    """Parse rows of table #droptablesTbl4; the second column holds the
    order date (leading digits/dots) followed by the number (trailing
    digits)."""
    for tr in response.css("table#droptablesTbl4 tbody tr"):
        loader = StripJoinItemLoader(item=MbuItem(), selector=tr)
        loader.add_css("order_no", "td:nth-child(2)::text", re=r"([\d]+)$")
        loader.add_css("order_date", "td:nth-child(2)::text", re=r"^[\d.]*")
        for field, col in (("customer", 3), ("obj", 4), ("address", 5),
                           ("changes", 6), ("cancellation", 7)):
            loader.add_css(field, "td:nth-child({})::text".format(col))
        scan_href = tr.css("td:nth-child(8) a::attr(href)").get()
        if scan_href:
            loader.add_value("scan_url", response.urljoin(scan_href))
        yield loader.load_item()
def parse(self, response):
    """Parse a table where one physical row may contain several orders
    (the main order plus its changes), each in its own <p>/<span>.

    For every sub-order the field value is taken from the matching
    nth-child, falling back to the first child (see get_first_existed).
    After parsing, verifies the order numbers form a gap-free sequence.
    """
    loaded_order_numbers = []
    for index, row in enumerate(response.css("table tbody tr")):
        # first and second are headers, skip
        if index == 0 or index == 1:
            continue
        orders_in_row = len(row.css("td:nth-child(3) p").getall())
        if orders_in_row == 0:
            orders_in_row = len(row.css("td:nth-child(3) span"))
        # each row is sub divided for main order and it's changes
        for order_in_row in range(orders_in_row):
            l = StripJoinItemLoader(item=MbuItem(), selector=row)
            l.add_value(
                "number_in_order",
                self.get_first_existed(row, "td:nth-child(1) p span::text",
                                       "td:nth-child(1) span::text"))
            order_no = row.css(
                "td:nth-child(3) p:nth-child({}) span::text, td:nth-child(3) span:nth-child({})::text"
                .format(order_in_row + 1, order_in_row + 1)).get()
            if not order_no:
                continue
            l.add_value("order_no", order_no)
            l.add_value(
                "order_date",
                self.get_first_existed(
                    row, "td:nth-child(2) p:nth-child(" +
                    str(order_in_row + 1) + ") span::text",
                    "td:nth-child(2) p:nth-child(1) span::text",
                    "td:nth-child(2) span:nth-child(" +
                    str(order_in_row + 1) + ")::text",
                    "td:nth-child(2) span:nth-child(1) span::text"))
            l.add_value(
                "customer",
                self.get_first_existed(
                    row, "td:nth-child(4) p:nth-child(" +
                    str(order_in_row + 1) + ") span::text",
                    "td:nth-child(4) p:nth-child(1) span::text",
                    "td:nth-child(4) span:nth-child(" +
                    str(order_in_row + 1) + ")::text",
                    "td:nth-child(4) span:nth-child(1)::text"))
            l.add_value(
                "obj",
                self.get_first_existed(
                    row, "td:nth-child(5) p:nth-child(" +
                    str(order_in_row + 1) + ") span::text",
                    "td:nth-child(5) p:nth-child(1) span::text",
                    "td:nth-child(5) span:nth-child(" +
                    str(order_in_row + 1) + ")::text",
                    "td:nth-child(5) span:nth-child(1)::text"))
            l.add_value(
                "address",
                self.get_first_existed(
                    row, "td:nth-child(6) p:nth-child(" +
                    str(order_in_row + 1) + ") span::text",
                    "td:nth-child(6) p:nth-child(1) span::text",
                    "td:nth-child(6) span:nth-child(" +
                    str(order_in_row + 1) + ")::text",
                    "td:nth-child(6) span:nth-child(1)::text"))
            l.add_value(
                "changes",
                self.get_first_existed(
                    row, "td:nth-child(7) p:nth-child(" +
                    str(order_in_row + 1) + ") span::text",
                    "td:nth-child(7) p:nth-child(1) span::text",
                    "td:nth-child(7) span:nth-child(" +
                    str(order_in_row + 1) + ")::text",
                    "td:nth-child(7) span:nth-child(1)::text"))
            l.add_value(
                "cancellation",
                self.get_first_existed(
                    row, "td:nth-child(8) p:nth-child(" +
                    str(order_in_row + 1) + ") span::text",
                    "td:nth-child(8) p:nth-child(1) span::text",
                    "td:nth-child(8) span:nth-child(" +
                    str(order_in_row + 1) + ")::text",
                    "td:nth-child(8) span:nth-child(1)::text"))
            url = self.get_first_existed(
                row, "td:nth-child(9) p:nth-child(" + str(order_in_row + 1) +
                ") span a::attr(href)",
                "td:nth-child(9) p:nth-child(1) span a::attr(href)",
                "td:nth-child(9) a:nth-child(" + str(order_in_row + 1) +
                ")::attr(href)", "td:nth-child(9) a:nth-child(1)::attr(href)")
            # TODO add decree found in 357th row
            # FIX: urljoin(None) silently returns the page URL itself,
            # storing a bogus scan_url — only add when a link exists
            if url:
                l.add_value("scan_url", response.urljoin(url))
            address_assign_url = self.get_first_existed(
                row, "td:nth-child(10) a::attr(href)")
            if address_assign_url:
                l.add_value("address_assign_url",
                            response.urljoin(address_assign_url))
            l.add_css("address_assign_no",
                      "td:nth-child(10) a span::text",
                      re=r"№(.*) ?від")
            l.add_css("address_assign_date",
                      "td:nth-child(10) a span::text",
                      re=r"від ?(.*)$")
            loaded_order_numbers.append(
                int(l.get_collected_values("number_in_order")[0]))
            yield l.load_item()
    # check if all consecutive orders where loaded
    if loaded_order_numbers:
        missed_order_numbers = set(range(
            1, max(loaded_order_numbers))).difference(loaded_order_numbers)
        if missed_order_numbers:
            self.logger.warning("Missed order numbers: %s",
                                missed_order_numbers)
        else:
            self.logger.info("All order numbers processed")
    else:
        # FIX: max() raises ValueError on an empty sequence — nothing parsed
        self.logger.warning("No order numbers loaded")
def parse_row(self, response, row):
    """Build one MbuItem from a row dict of the feed.

    NB: the trailing space in 'restrictionNumber ' matches the source
    data key exactly — do not "fix" it.
    """
    self.logger.debug("parse row : {}".format(row))
    loader = StripJoinItemLoader(item=MbuItem())
    field_to_key = (
        ("order_no", "restrictionNumber "),  # space in the end is needed
        ("order_date", "restrictionDate"),
        ("customer", "objectOwner"),
        ("obj", "objectDescription"),
        ("address", "objectAddress"),
        ("changes", "objectChanges"),
        ("cancellation", "objectCancel"),
        ("status", "objectStatus"),
    )
    for field, key in field_to_key:
        loader.add_value(field, row[key])
    yield loader.load_item()
def parse(self, response):
    """Parse registry rows; blank rows among the first five are treated
    as headers/padding and skipped."""
    for idx, tr in enumerate(response.css("table>tbody>tr")):
        # skip headers and rows with empty lines
        # NOTE(review): a row is skipped only while idx < 5 AND its text
        # is blank — confirm 'and' (rather than 'or') is intended
        if idx < 5 and not "".join(
                tr.css("td::text, td span::text").getall()).strip():
            self.logger.debug("skipped index : {}, row : {}".format(
                idx, tr.get()))
            continue
        self.logger.debug("parse index : {}, row : {}".format(idx, tr.get()))
        loader = StripJoinItemLoader(item=MbuItem(), selector=tr)
        loader.add_css("decree", "td:nth-child(1)::text")
        order_cell = "td:nth-child(2)::text, td:nth-child(2) span::text"
        loader.add_css("order_no", order_cell, re=r"^\s?(.*)від")
        loader.add_css("order_date", order_cell, re=r"від\s?([\d\.]*)\s*$")
        for field, col in (("customer", 3), ("obj", 4), ("address", 5),
                           ("changes", 6)):
            loader.add_css(
                field,
                "td:nth-child({0})::text, td:nth-child({0}) span::text"
                .format(col))
        loader.add_css(
            "cancellation",
            "td:nth-child(7)::text, td:nth-child(7) a::text, td:nth-child(7) span::text"
        )
        cancellation_href = tr.css("td:nth-child(7) a::attr(href)").get()
        if cancellation_href:
            loader.add_value("cancellation_url",
                             response.urljoin(cancellation_href))
        scan_href = tr.css("td:nth-child(8) a::attr(href)").get()
        if scan_href:
            loader.add_value("scan_url", response.urljoin(scan_href))
        yield loader.load_item()
def parse(self, response):
    """Parse rows of the #tabledataMto registry table.

    Street (col 2) and house number (col 3) are stored separately and
    also both added to "address" so the loader joins them into the full
    address string.
    """
    for tr in response.css("table#tabledataMto tbody tr"):
        self.logger.debug("parse row : {}".format(tr.get()))
        loader = StripJoinItemLoader(item=MbuItem(), selector=tr)
        loader.add_css("order_no", "td:nth-child(1)::text")
        loader.add_css("order_date", "td:nth-child(4)::text")
        loader.add_css("customer", "td:nth-child(5)::text")
        loader.add_css("obj", "td:nth-child(6)::text")
        loader.add_css("obj_purpose", "td:nth-child(7)::text")
        loader.add_css("address_street", "td:nth-child(2)::text")
        loader.add_css("address_street_number", "td:nth-child(3)::text")
        loader.add_css("address", "td:nth-child(2)::text")
        loader.add_css("address", "td:nth-child(3)::text")
        loader.add_css("cancellation", "td:nth-child(8)::text")
        scan_href = tr.css("td:nth-child(9) a::attr(href)").get()
        if scan_href:
            loader.add_value("scan_url", response.urljoin(scan_href))
        yield loader.load_item()
def parse(self, response):
    """Parse registry rows; the header row is recognised by its first
    cell text '№ з/п'.  Items lacking number_in_order are dropped."""
    for tr in response.css("table tbody tr"):
        # first is header, skip
        if tr.css("td:nth-child(1)::text").get() == "№ з/п":
            self.logger.debug("skiped row : {}".format(tr.get()))
            continue
        self.logger.debug("parse row : {}".format(tr.get()))
        loader = StripJoinItemLoader(item=MbuItem(), selector=tr)
        loader.add_css("number_in_order", "td:nth-child(1)::text")
        loader.add_css("order_no",
                       "td:nth-child(2) p::text, td:nth-child(2)::text",
                       re=r"№ ?(.*)\s?$")
        loader.add_css(
            "order_date",
            "td:nth-child(2) p:nth-child(1)::text, td:nth-child(2)::text",
            re=r"(\d{1,2}[\. /]?\d{1,2}[\. /]?\d{2,4})[\sр\.]*")
        loader.add_css("customer", "td:nth-child(3)::text")
        loader.add_css("obj", "td:nth-child(4)::text")
        loader.add_css("address",
                       "td:nth-child(5)::text, td:nth-child(5) a::text")
        loader.add_css("changes", "td:nth-child(6)::text")
        loader.add_css("cancellation", "td:nth-child(7)::text")
        scan_href = tr.css("td:nth-child(8) a::attr(href)").get()
        if scan_href:
            loader.add_value("scan_url", response.urljoin(scan_href))
        loader.add_css("scan_no", "td:nth-child(8) a::text", re=r"№(.*) ?від")
        loader.add_css("scan_date", "td:nth-child(8) a::text", re=r"від ?(.*)")
        item = loader.load_item()
        # rows without an order number are noise — drop them
        if item.get("number_in_order"):
            yield item
def parse(self, response):
    """Parse a DataTables JSON payload whose "aaData" rows are
    positional lists; column 8 holds the scan link."""
    payload = json.loads(response.body_as_unicode())
    for row in payload["aaData"]:
        self.logger.debug("parse row : {}".format(row))
        loader = StripJoinItemLoader(item=MbuItem())
        for field, col in (("number_in_order", 0), ("order_no", 2),
                           ("order_date", 1), ("customer", 7), ("obj", 3),
                           ("address", 4), ("changes", 5),
                           ("cancellation", 6)):
            loader.add_value(field, row[col])
        loader.add_value("scan_url",
                         response.urljoin(row[8]) if row[8] else None)
        yield loader.load_item()
def parse(self, response):
    """Parse one_order blocks from the orders container; each block is a
    list of <li> elements keyed by CSS class."""
    field_selectors = (
        ("order_date", "li.order_date::text"),
        ("order_no", "li.order_number::text"),
        ("customer", "li.cust::text"),
        ("obj", "li.order_name::text"),
        ("address", "li.addr::text"),
        ("changes", "li.changes_info::text"),
        ("cancellation", "li.reason_canc::text"),
    )
    for block in response.css(
            "div.table-content-container div#orders div.one_order"):
        self.logger.debug("parsed row : {}".format(block.get()))
        loader = StripJoinItemLoader(item=MbuItem(), selector=block)
        for field, css_sel in field_selectors:
            loader.add_css(field, css_sel)
        download_href = block.css("li.download a::attr(href)").get()
        if download_href:
            loader.add_value("scan_url", response.urljoin(download_href))
        yield loader.load_item()
def parse(self, response):
    """Parse table rows; row 0 is the header.  Column 1 carries
    'YYYY-MM-DD number' split with regexes."""
    for row_no, tr in enumerate(response.css("table tr")):
        # skip first as header
        if not row_no:
            self.logger.debug("skipped row : {}".format(tr.get()))
            continue
        self.logger.debug("parse row : {}".format(tr.get()))
        loader = StripJoinItemLoader(item=MbuItem(), selector=tr)
        loader.add_css("order_no", "td:nth-child(1)::text", re=r" (.*)$")
        loader.add_css("order_date", "td:nth-child(1)::text", re=r"^[\d-]*")
        for field, col in (("customer", 2), ("obj", 3), ("address", 4),
                           ("changes", 5), ("cancellation", 6)):
            loader.add_css(field, "td:nth-child({})::text".format(col))
        loader.add_css("scan_url", "td:nth-child(7) a::attr(href)")
        yield loader.load_item()
def parse(self, response):
    """Parse registry table rows and follow 'Next' pagination through a
    randomly chosen proxy."""
    for row in response.css("table.table.table-striped.small tbody tr"):
        self.logger.debug("parse row : {}".format(row.get()))
        l = StripJoinItemLoader(item=MbuItem(), selector=row)
        l.add_css("order_no", "td:nth-child(1)::text")
        l.add_css("order_date", "td:nth-child(2)::text")
        l.add_css("customer", "td:nth-child(3)::text")
        l.add_css("obj", "td:nth-child(4)::text")
        l.add_css("address", "td:nth-child(5)::text")
        l.add_css("cadastre_number", "td:nth-child(6)::text")
        l.add_css("document_status", "td:nth-child(7) span::text")
        l.add_css("changes", "td:nth-child(8)::text")
        # FIX: extract_first() may return None and urljoin(None) silently
        # yields the page URL itself — guard both links before adding
        document_url = row.css(
            "td:nth-child(9) a::attr(href)").extract_first()
        if document_url:
            l.add_value("scan_url", response.urljoin(document_url))
        map_url = row.css("td:nth-child(1) a::attr(href)").extract_first()
        if map_url:
            l.add_value("map_url", response.urljoin(map_url))
        yield l.load_item()
    # if 'Next' page label present continue crawling
    if response.css("ul.pagination li a[aria-label=Next]").get():
        yield scrapy.Request(
            self.get_next_page(response),
            callback=self.parse,
            meta={"proxy": self.get_random_proxy()},
        )
def parse(self, response):
    """Parse article table rows, skipping the centered (header) rows and
    rows whose combined date/number cell is blank.

    Column 2 carries '№ <number> від <date>'; number and date are split
    with regexes.
    """
    for row in response.css(
            "article table tbody tr:not([align=\"center\"])"):
        l = StripJoinItemLoader(item=MbuItem(), selector=row)
        # skip empty lines
        date_order = ''.join(
            row.css(
                "td:nth-child(2) p span::text, td:nth-child(2) span::text, td:nth-child(2)::text"
            ).getall())
        if not date_order or not date_order.strip():
            self.logger.debug("skipped row : {}".format(row.get()))
            continue
        self.logger.debug("parse row : {}".format(row.get()))
        # number_in_order is unique only per year
        l.add_css(
            "number_in_order",
            "td:nth-child(1) p span::text, td:nth-child(1) span::text, td:nth-child(1)::text"
        )
        # FIX: re.search() returns None for a non-matching cell and
        # .group(1) crashed the whole crawl — skip the field instead.
        # The 'dsl' alternative is kept as-is; presumably a source-data
        # artefact of 'від' — TODO confirm.
        order_no_match = re.search(r'^\s?№? ?(.*)\s?(в?ід|dsl)', date_order)
        if order_no_match:
            l.add_value("order_no", order_no_match.group(1))
        l.add_css(
            "order_date",
            "td:nth-child(2) p span::text, td:nth-child(2) span::text, td:nth-child(2)::text",
            re=r"(\d{1,2}[\. /]?\d{1,2}[\. /]?\d{2,4})\s*$")
        l.add_css(
            "customer",
            "td:nth-child(3) span::text, td:nth-child(3)::text, td:nth-child(3) p::text"
        )
        l.add_css("obj",
                  "td:nth-child(4) span::text, td:nth-child(4)::text")
        l.add_css(
            "address",
            "td:nth-child(5) span::text, td:nth-child(5) p::text, td:nth-child(5)::text"
        )
        l.add_css("changes",
                  "td:nth-child(6) span::text, td:nth-child(6)::text")
        l.add_css("cancellation",
                  "td:nth-child(7) span::text, td:nth-child(7)::text")
        url = row.css("td:nth-child(8) a::attr(href)").extract_first()
        if url:
            l.add_value("scan_url", response.urljoin(url))
        yield l.load_item()