def parse(self, response): gazettes = response.css("a") for gazette in gazettes: gazette_info = gazette.css("::text").re( r"Edi..o\s*n.\s*(\d+) de (\d{2}) de (.*?) de (\d{4})") if not gazette_info: self.logger.warning( f"Unable to identify gazette info for {response.url}.") continue edition_number, day, month, year = gazette_info gazette_date = parse(f"{day} de {month} de {year}", languages=["pt"]).date() if gazette_date < self.start_date: continue gazette_url = response.urljoin(gazette.css("::attr(href)").get()) link_text = gazette.css("::text").get().lower() is_extra_edition = bool( re.search(r"suplemento|complemento|especial", link_text)) yield Gazette( date=gazette_date, edition_number=edition_number, file_urls=[gazette_url], is_extra_edition=is_extra_edition, power="executive", )
def parse(self, response): follow_next_page = True gazettes = response.css("h4.card-title") for gazette in gazettes: gazette_url = gazette.xpath(".//following-sibling::a/@href").get() edition_number = gazette.css("a::text").re_first( r"Edição (\d+\/\d+)") raw_gazette_date = gazette.css("a::text").re_first( r"(\d{2}\/\d{2}\/\d{4})") if not raw_gazette_date: continue gazette_date = datetime.datetime.strptime(raw_gazette_date, "%d/%m/%Y").date() yield Gazette( date=gazette_date, edition_number=edition_number, file_urls=[gazette_url], power="executive_legislative", ) if gazette_date < self.start_date: follow_next_page = False break next_page_url = response.css("a.next::attr(href)").get() if follow_next_page and next_page_url: yield scrapy.Request(next_page_url)
def parse(self, response, page=1):
    gazettes = response.css(".list-group-item")
    last_page_number_css = ".pagination > li:nth-last-child(-n+2) > a > span::text"
    last_page_number = int(response.css(last_page_number_css).get())
    follow_next_page = True
    for gazette in gazettes:
        gazette_date_raw = gazette.css("div > div > span::text").get().strip()
        gazette_date = parse(gazette_date_raw, languages=["pt"]).date()
        gazette_title_raw = gazette.css("h4 > div > div > strong::text").get()
        edition_number = gazette_title_raw.strip()
        if gazette_date < self.start_date or page == last_page_number:
            follow_next_page = False

        partial_url = gazette.css("a::attr(href)").get()
        url = f"https://paudosferros.rn.gov.br/{partial_url}"

        yield Gazette(
            date=gazette_date,
            file_urls=[url],
            edition_number=edition_number,
            power="executive_legislative",
        )

    if follow_next_page:
        next_page = page + 1
        yield Request(
            f"{self.start_urls[0]}&pagina={next_page}",
            cb_kwargs={"page": next_page},
        )
def parse(self, response): follow_next_page = True gazettes = response.css(".listing tbody tr") for gazette in gazettes: gazette_date_raw = gazette.xpath("./td[1]//text()").re_first( r"\d{2}\/\d{2}\/\d{4}") gazette_date = datetime.datetime.strptime(gazette_date_raw, "%d/%m/%Y").date() if gazette_date < self.start_date: follow_next_page = False break title = "".join(gazette.xpath("./td[2]//text()").getall()).strip() edition_number = self._extract_edition_number(title, gazette_date) is_extra_edition = re.search(r"eex|ext", title.lower()) is not None gazette_url = gazette.css("a::attr(href)").get() yield Gazette( date=gazette_date, edition_number=edition_number, is_extra_edition=is_extra_edition, file_urls=[gazette_url], power="executive", ) if follow_next_page: next_page_url = response.css(".next a::attr(href)").get() yield scrapy.Request(next_page_url)
def parse(self, response): gazettes = response.css(".diario-resultado-pesquisa tbody tr") for gazette in gazettes: gazette_date = gazette.xpath("./td[2]/text()").get() gazette_url = response.urljoin(gazette.css("a::attr(href)").get()) is_extra_edition = bool( gazette.xpath(".//*[contains(., 'Suplemento')]")) if is_extra_edition: # Extra Editions doesn't have a date in its line. We need to get it from # the main edition of that day gazette_date = self._get_date_from_parent_edition( response, gazette) item = Gazette( date=dateparser.parse(gazette_date, languages=["pt"]).date(), is_extra_edition=is_extra_edition, power="executive_legislative", ) yield scrapy.Request( gazette_url, method="HEAD", callback=self.parse_pdf_url, cb_kwargs={"item": item}, ) next_pages_urls = response.css(".pagination a::attr(href)").getall() for next_page_url in next_pages_urls: yield scrapy.Request(response.urljoin(next_page_url))
def parse(self, response): """Parses gazettes page and requests next page. Normal gazettes are displayed in a weekly basis, so, the date which is taken into account for this type of gazette is the last in the publication period (i.e. "29/08/2020" from "23/08/2020 to 29/08/2020"). Special gazzetes are daily, but that same logic applies here and it works correctly. """ gazettes = response.css(".table-semanarios table tbody tr") for gazette in gazettes: url = gazette.css("td:last-child a::attr(href)").get() gazette_date = (gazette.css("td:nth-last-child(2)::text").re( r"[0-9]{2}/[0-9]{2}/[0-9]{4}").pop()) gazette_date = datetime.datetime.strptime(gazette_date, "%d/%m/%Y").date() is_extra = "Especial" in gazette.css("td:first-child").get() yield Gazette( date=gazette_date, file_urls=[url], is_extra_edition=is_extra, power="executive_legislative", ) for url in response.css(".pagination a.next::attr(href)").getall(): yield response.follow(url)
def parse_year(self, response):
    # The page with the list of gazettes is simply a table with links
    links = response.css("a")
    items = []
    for link in links:
        url = link.css("::attr(href)").extract_first()
        if url[-4:] != ".pdf":
            continue
        url = response.urljoin(url)
        # Apparently, Goiânia doesn't have separate gazettes for the
        # executive and legislative powers
        power = "executive_legislature"
        link_text = link.css("::text").extract_first()
        if link_text is None:
            continue
        date = re.match(r".*(\d{2} .* de \d{4})", link_text)[1]
        # Extra editions are marked either with 'suplemento' or 'comunicado'
        is_extra_edition = (
            "suplemento" in link_text.lower() or "comunicado" in link_text.lower()
        )
        date = parse(date.split("-")[0], languages=["pt"]).date()
        items.append(
            Gazette(
                date=date,
                file_urls=[url],
                is_extra_edition=is_extra_edition,
                power=power,
            )
        )
    return items
def parse(self, response): gazette_table = response.css(".style166") gazettes_links = gazette_table.xpath("a//@href").extract() dates = gazette_table.css("a::text").extract() for url, date in zip(gazettes_links, dates): edition = self._extract_edition(url) power = self._extract_power(url) power_id = self.powers[power] gazette = Gazette( date=parse(date, languages=["pt"]).date(), is_extra_edition=False, power=power, ) gazette_details_page = f"abrir.asp?edi={edition}&p={power_id}" gazette_url = response.urljoin(gazette_details_page) yield Request(gazette_url, callback=self.parse_document_url, meta={"item": gazette}) current_page_selector = "#pages ul li.current::text" current_page = response.css(current_page_selector).extract_first() next_page = int(current_page) + 1 next_page_url = response.urljoin(f"/?p={next_page}") if next_page > self.last_page: self.last_page = next_page yield Request(next_page_url)
def parse(self, response): """ @url http://apps.fortaleza.ce.gov.br/diariooficial/ @returns requests 1 @scrapes date file_urls is_extra_edition territory_id power scraped_at """ for element in response.css(self.GAZETTE_ELEMENT_CSS): url = response.urljoin( element.css("a::attr(href)").extract_first()) date = dateparser.parse(element.css( self.DATE_CSS).extract_first(""), languages=["pt"]).date() # Extra edition is maked with a "s" on description. Example: Diário Oficial Nº 15923s extra_edition = element.css( self.EXTRA_CSS).extract_first("").endswith("s") yield Gazette( date=date, file_urls=[url], is_extra_edition=extra_edition, territory_id=self.TERRITORY_ID, power="executive", scraped_at=datetime.utcnow(), ) for page_number in response.css(self.NEXT_PAGE_CSS).re("#(\d)+"): next_url = w3lib.url.add_or_replace_parameter( response.url, "current", page_number) yield Request(next_url)
def parse_page(self, response):
    for idx, row in enumerate(response.css(".grid_Row")):
        pdf_date = row.css("td:nth-child(2) span ::text").extract_first()
        gazette_id = row.css("td:nth-child(3) a ::attr(data-teste)").extract_first()
        parsed_date = parse(f"{pdf_date}", languages=["pt"]).date()
        if gazette_id == "0":
            # The ASP.NET grid apparently numbers its row controls from
            # ctl03, hence the offset added to the row index
            starting_offset = 3
            formdata = {
                "__LASTFOCUS": "",
                "__EVENTTARGET": f"ctl00$cphMasterPrincipal$gdvGrid2$ctl{idx + starting_offset:02d}$lnkVisualizar",
                "__EVENTARGUMENT": "",
                "__ASYNCPOST": "true",
            }
            yield scrapy.FormRequest.from_response(
                response,
                formdata=formdata,
                callback=self.parse_regular_edition,
                meta={"parsed_date": parsed_date},
            )
        else:
            yield Gazette(
                date=parsed_date,
                file_urls=[
                    f"http://legisladocexterno.curitiba.pr.gov.br/DiarioSuplementoConsultaExterna_Download.aspx?id={gazette_id}"
                ],
                is_extra_edition=True,
                territory_id=self.TERRITORY_ID,
                power="executive_legislature",
                scraped_at=datetime.utcnow(),
            )
def parse(self, response):
    for element in response.css(self.GAZETTE_ELEMENT_CSS):
        url = element.css("a::attr(href)").extract_first()
        date = dateparser.parse(
            element.xpath(self.DATE_XPATH).extract_first(), languages=["pt"]
        ).date()
        yield Gazette(
            date=date,
            file_urls=[url],
            is_extra_edition=False,
            territory_id=self.TERRITORY_ID,
            power="executive",
            scraped_at=datetime.utcnow(),
        )

    current_page = w3lib.url.url_query_parameter(response.url, "pg")
    if not response.css(self.LAST_PAGE_CSS).extract_first().endswith(
        "pg=" + current_page
    ):
        next_url = w3lib.url.add_or_replace_parameter(
            response.url, "pg", str(int(current_page) + 1)
        )
        yield Request(next_url)
def parse(self, response): """ @url http://www.cascavel.pr.gov.br/servicos/orgao_oficial.php @returns items 1 @scrapes date file_urls is_extra_edition territory_id power scraped_at """ for row in response.xpath("//table//tr[position()>1]"): date = row.xpath(".//td[2]//font//text()").extract_first() date = parse(date, languages=["pt"]).date() for link in row.xpath(".//td[3]//a"): link_text = link.xpath(".//text()").extract_first() power = "executive" if "Executivo" in link_text else "legislature" url = response.urljoin(link.xpath("./@href").extract_first("")) yield Gazette( date=date, file_urls=[url], is_extra_edition=False, territory_id=self.TERRITORY_ID, power=power, scraped_at=dt.datetime.utcnow(), ) next_page_xpath = '//a[@title="Próxima página"]/@href' next_page_url = response.xpath(next_page_xpath).extract_first() if next_page_url: yield response.follow(next_page_url)
def get_gazette(self, document, is_after_transition):
    """Extracts the information from the document and returns a Gazette item."""
    title = document.css("::text").get()
    edition_number = re.search(r"\d+", title).group(0)
    is_extra_edition = bool(re.search(r"EXTRA", title, re.IGNORECASE))
    date_text = re.search(r"(\d{1,2}\w+\d{4})|(\d{1,2}.\d{1,2}.\d{4})", title).group(0)
    date = dateparser.parse(date_text, languages=["pt"]).date()
    if is_after_transition:
        file_url = self.get_file_url(title, date)
    else:
        file_url = document.css("::attr(href)").get()

    return Gazette(
        date=date,
        edition_number=edition_number,
        file_urls=[file_url],
        power="executive_legislative",
        is_extra_edition=is_extra_edition,
    )
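# `get_file_url` is referenced above but not shown. A minimal sketch, assuming
# that after the site's transition the PDF URL is derived from the gazette
# date (the URL template below is hypothetical):
def get_file_url(self, title, date):
    return f"https://example.gov.br/diarios/{date:%Y-%m-%d}.pdf"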
def build_gazzete(self, date, url, power, is_extra_edition=False):
    return Gazette(
        date=date,
        file_urls=[url],
        is_extra_edition=is_extra_edition,
        power=power,
    )
def parse(self, response): """Parses gazettes page and requests next page. Normal gazettes are displayed in a weekly basis, so, the date which is taken into account for this type of gazette is the last in the publication period (i.e. "29/08/2020" from "23/08/2020 to 29/08/2020"). Special gazzetes are daily, but that same logic applies here and it works correctly. """ for element in response.css(self.GAZETTE_ROW_CSS): url = element.css(self.GAZETTE_URL_CSS).extract_first() date = element.css(self.DATE_CSS).re(self.DATE_REGEX).pop() date = dateparser.parse(date, languages=["pt"]).date() is_extra = "Especial" in element.css( self.EXTRA_EDITION_CSS).extract_first() yield Gazette( date=date, file_urls=[url], is_extra_edition=is_extra, power="executive_legislative", ) for url in response.css(self.NEXT_PAGE_CSS).extract(): yield response.follow(url)
def parse(self, response): gazettes = response.css("#ContentPlaceHolder1_gvResultado tbody tr") for gazette in gazettes: gazette_raw_date = gazette.xpath(".//td[2]/text()").get() gazette_date = datetime.datetime.strptime(gazette_raw_date, "%d/%m/%Y").date() edition = gazette.xpath(".//td[1]/text()") edition_number = edition.re_first(r"\d+") is_extra_edition = "suplemento" in edition.get().lower() gazette_item = Gazette( date=gazette_date, edition_number=edition_number, is_extra_edition=is_extra_edition, power="executive", ) download_url = response.urljoin( gazette.xpath(".//td[6]/a/@href").get()) yield scrapy.Request( download_url, method="HEAD", callback=self.parse_gazette_download_url, cb_kwargs={"item": gazette_item}, )
def parse(self, response):
    texts = response.xpath(
        "//div[1]/div/div/div[1]/div/article/div[1]/ul/li"
    ).getall()
    texts = [self._clean_edition_text(edition_title) for edition_title in texts]
    gazette_urls = response.xpath(
        "//div[1]/div/div/div[1]/div/article/div[1]/ul/li/a[1]/@href"
    ).getall()
    for gazette_url, text in zip(gazette_urls, texts):
        # The year needs to match 3 or 4 digits because of typos on the site
        date = re.match(r"[0-9]{2}/[0-9]{2}/\s?[0-9]{3,4}", text).group()
        if len(date) < 10:
            date = self._handle_date_typos(date, response.meta.get("current_year"))
        gazette_date = dateparser.parse(date, languages=["pt"]).date()
        file_urls = [gazette_url]
        is_extra_edition = any(
            word in text
            for word in ["EXTRAORDINÁRIA", "ESPECIAL", "GABARITO", "EXTRAORDINÁRIO"]
        )

        yield Gazette(
            date=gazette_date,
            file_urls=file_urls,
            is_extra_edition=is_extra_edition,
            power="executive",
        )
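# `_handle_date_typos` is referenced above but not shown. A minimal sketch,
# assuming a truncated 3-digit year is replaced with the year of the page
# being scraped (hypothetical; the real helper may cover other typos):
def _handle_date_typos(self, date, current_year):
    day, month, _ = date.split("/")
    return f"{day}/{month}/{current_year}"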
def parse(self, response): """ @url http://www.pontagrossa.pr.gov.br/diario-oficial/ @returns requests 1 """ links = response.css(".view-content .field a") smallest_year = min( (p["date"].year for p in self.pdf_infos(links, self.starting_year)), default=0, ) if smallest_year >= self.starting_year: next_page_url = response.urljoin( response.css(".pager-next a::attr(href)").extract_first() ) yield scrapy.Request(next_page_url) for pdf_info in self.pdf_infos(links, self.starting_year): gazette_date = pdf_info["date"].strftime("%Y-%m-%d") yield Gazette( date=gazette_date, file_urls=[pdf_info["url"]], is_extra_edition=pdf_info["is_extra_edition"], territory_id=self.TERRITORY_ID, power="executive_legislature", scraped_at=datetime.utcnow(), )
def parse(self, response): iframe = response.css("iframe") if not iframe: # If iframe is not present on page, we don't have a valid gazette # in the response return parts_script = response.xpath("//script[contains(., 'pdfjs-frame')]") if parts_script: file_urls = parts_script.re(r"\(\'src\', \'(.*)\'\);") else: query_src = urllib.parse.urlparse( iframe.css("::attr(src)").get()).query file_urls = urllib.parse.parse_qs(query_src).get("file", []) gazette_year = response.css( "#diario-select-year option[selected]::attr(value)").get() gazette_month = response.css( "#diario-select-month option[selected]::attr(value)").get() gazette_day = response.css( "#diario-select-day option[selected]::attr(value)").get() gazette_date = datetime.date(int(gazette_year), int(gazette_month), int(gazette_day)) yield Gazette( date=gazette_date, file_urls=file_urls, is_extra_edition=False, power="executive_legislative", )
def parse(self, response): """ @url http://www.guarulhos.sp.gov.br/diario-oficial/index.php?mes=1&ano=2018 @returns items 17 17 @scrapes date file_urls is_extra_edition municipality_id power scraped_at """ diarios = response.xpath('//div[contains(@id, "diario")]') items = [] for diario in diarios: date = diario.xpath('.//h3/text()').extract_first() date = parse(date[-10:], languages=['pt']).date() is_extra_edition = False links = diario.xpath('.//a[contains(@href, ".pdf")]').xpath( '@href') url = [response.urljoin(link) for link in links.extract()] power = 'executive' items.append( Gazette( date=date, file_urls=url, is_extra_edition=is_extra_edition, municipality_id=self.MUNICIPALITY_ID, power=power, scraped_at=dt.datetime.utcnow(), )) return items
def parse_gazette(self, response): """Parses list of documents to request each one for the date.""" json_response = response.json() if not json_response: self.logger.warning(f"Document not found in {response.url}") return json_dir = json_response["dir"] date = re.search(self.DATE_REGEX, json_dir).group() date = dateparser.parse(date, settings={"DATE_ORDER": "DMY"}) is_extra_edition = self.EXTRA_EDITION_TEXT in json_dir path = json_dir.replace("/", "|") json_data = json_response["data"] file_urls = [ self.PDF_URL.format(path, url.split("/")[-1]) for url in json_data ] yield Gazette( date=date, file_urls=file_urls, is_extra_edition=is_extra_edition, power="executive_legislative", )
def create_gazette(self, date, url, is_extra_edition):
    return Gazette(
        date=date,
        file_urls=[url],
        is_extra_edition=is_extra_edition,
        power="executive",
    )
def parse_gazette(self, response): """ @url https://gravatai.atende.net/?pg=diariooficial&pagina=1 @returns items 1 @scrapes date file_urls is_extra_edition territory_id power scraped_at """ for element in response.css(".nova_listagem > .linha"): info = element.css(".info") is_extra_edition = (info.css(".tipo::text").extract_first() in self.extra_editions_options) date = parse(info.css(".data::text").extract_first(), languages=["pt"]).date() code = element.css( ".opcoes > button::attr(data-codigo)").extract_first() url = ("https://gravatai.atende.net/atende.php?rot=54002&aca=737" f"&processo=download&codigo={code}") yield Gazette( date=date, file_urls=[url], is_extra_edition=is_extra_edition, territory_id=self.TERRITORY_ID, power="executive", scraped_at=datetime.utcnow(), )
def parse_month_page(self, response): """ @url http://www2.portoalegre.rs.gov.br/dopa/default.php?p_secao=1431 @returns items 58 58 @scrapes date file_urls is_extra_edition territory_id power scraped_at """ links = response.css('#conteudo a') items = [] for link in links: url = link.css('::attr(href)').extract_first() if url[-4:] != '.pdf': continue url = response.urljoin(url) power = 'executive' if 'executivo' in url.lower( ) else 'legislature' date = link.css('::text').extract_first() is_extra_edition = 'extra' in date.lower() date = parse(date.split('-')[0], languages=['pt']).date() items.append( Gazette( date=date, file_urls=[url], is_extra_edition=is_extra_edition, territory_id=self.TERRITORY_ID, power=power, scraped_at=dt.datetime.utcnow(), )) return items
def parse_items(self, response):
    body = response.body
    if self.is_body_empty(body):
        return

    definition, rows = self.parse_definitions_and_rows(body)
    for row in rows:
        item = dict(zip(definition, row))
        date_values = item["DTPUBLICACAO"]
        # The month in the serialized date appears to be zero-indexed,
        # hence the + 1
        item_date = date(date_values[0], date_values[1] + 1, date_values[2])
        url = (
            "https://www.valadares.mg.gov.br/abrir_arquivo.aspx?cdLocal=12&arquivo={}{}"
        ).format(item["NMARQUIVO"], item["NMEXTENSAOARQUIVO"])
        yield Gazette(
            date=item_date,
            file_urls=[url],
            is_extra_edition=False,
            territory_id=self.TERRITORY_ID,
            power="executive",
            scraped_at=datetime.utcnow(),
        )

    self.current_page += 1
    yield self.make_request(self.current_page)
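# `make_request` is referenced above but not shown. A minimal sketch, assuming
# the listing endpoint takes the page number as a form field (the URL and
# field name below are hypothetical):
def make_request(self, page):
    return scrapy.FormRequest(
        "https://www.valadares.mg.gov.br/diario_oficial.aspx",
        formdata={"pagina": str(page)},
        callback=self.parse_items,
    )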
def parse_month_page(self, response): """ @url http://www.campinas.sp.gov.br/diario-oficial/index.php?mes=1&ano=2018 @returns items 23 23 @scrapes date file_urls is_extra_edition territory_id power scraped_at """ items = [] month_year = response.css( ".tabelaDiario:first-child tr th:nth-child(2)::text" ).extract_first() # "janeiro 2018" links = response.css(".tabelaDiario:first-child tr td a") for link in links: url = link.css("::attr(href)").extract_first().replace("../", "") day = link.css("::text").extract_first() date = parse(f"{day} {month_year}", languages=["pt"]).date() url = f"{self.sp_campinas_url}{url}" is_extra_edition = False power = "executive_legislature" items.append( Gazette( date=date, file_urls=[url], is_extra_edition=is_extra_edition, territory_id=self.TERRITORY_ID, power=power, scraped_at=dt.datetime.utcnow(), )) return items
def parse_page(self, response): """Parses list of gazettes. Extra editions can already have its item built. Regular editions need an extra request. """ for idx, row in enumerate(response.css(".grid_Row")): pdf_date = row.css("td:nth-child(2) span ::text").extract_first() gazette_id = row.css( "td:nth-child(3) a ::attr(data-teste)").extract_first() parsed_date = parse(f"{pdf_date}", languages=["pt"]).date() eventtarget = row.css("td:nth-child(3) a ::attr(href)").re_first( "'(.*lnkVisualizar)'") if gazette_id == "0": yield scrapy.FormRequest.from_response( response, formdata={"__EVENTTARGET": eventtarget}, callback=self.parse_regular_edition, meta={"parsed_date": parsed_date}, ) else: yield Gazette( date=parsed_date, file_urls=[ f"https://legisladocexterno.curitiba.pr.gov.br/DiarioSuplementoConsultaExterna_Download.aspx?Id={gazette_id}" ], is_extra_edition=True, power="executive_legislative", )
def parse(self, response):
    lines = response.xpath('//table[contains(@class, "adminlist")]/tr')
    urls = [
        response.urljoin(relative_url)
        for relative_url in lines.xpath("td[1]/a/@href").extract()
    ]
    is_extra_edition = [
        "Extra" in text for text in lines.xpath("td[1]/a/text()").extract()
    ]
    dates = [
        parse(date, languages=["pt"]).date()
        for date in lines.xpath("td[2]/text()").extract()
    ]
    for url, is_extra, date in zip(urls, is_extra_edition, dates):
        yield Gazette(
            date=date,
            file_urls=[url],
            is_extra_edition=is_extra,
            power="executive_legislative",
        )

    for page in range(2, len(response.css(".button.othersOptPage")) + 1):
        yield FormRequest(
            response.url, formdata={"hpage": str(page)}, callback=self.parse
        )
def parse(self, response): """ @url https://sistemas.canoas.rs.gov.br/domc/api/public/diary-by-day?day=08/06/2018 # noqa @returns items 3 3 @scrapes date file_urls is_extra_edition territory_id power scraped_at """ data = json.loads(response.body_as_unicode()) items = [] # "editions" is empty when there were no gazettes in the date for edition in data.get("editions", []): file_url = f"{self.BASE_URL}/edition-file/{edition['id']}" is_extra_edition = edition["type"] == "C" items.append( Gazette( date=data["day"], file_urls=[file_url], is_extra_edition=is_extra_edition, territory_id=self.TERRITORY_ID, power="executive", scraped_at=datetime.utcnow(), ) ) return items