def save_document(item):
    # Create the TCM-BA document (if it does not exist yet) and register its local file.
    public_view_url = "https://e.tcm.ba.gov.br/epp/ConsultaPublica/listView.seam"
    document, created = TCMBADocument.objects.get_or_create(
        year=item["year"],
        month=item["month"],
        period=item["period"].lower(),
        category=item["category"],
        unit=item["unit"],
        inserted_at=from_str_to_date(item["inserted_at"]),
        inserted_by=item["inserted_by"],
        original_filename=item["original_filename"],
        crawled_from=public_view_url,
        defaults={"crawled_at": item["crawled_at"]},
    )
    content_type = get_content_type_for_model(document)
    if created:
        _, file_created = File.objects.get_or_create(
            url=public_view_url,
            content_type=content_type,
            object_id=document.pk,
            local_path=f"{item['filepath']}{item['filename']}",
            original_filename=item["original_filename"],
        )
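# Illustrative only: the shape of the `item` dict that save_document expects,
# inferred from the keys read above. All values are hypothetical.
example_item = {
    "year": 2021,
    "month": 3,
    "period": "Mensal",
    "category": "Despesas",
    "unit": "Prefeitura Municipal de Feira de Santana",
    "inserted_at": "01/03/2021",
    "inserted_by": "Sistema",
    "original_filename": "despesas-marco-2021.pdf",
    "crawled_at": "2021-03-05T12:00:00+00:00",  # aware datetime in practice
    "filepath": "/tmp/tcmba/",
    "filename": "despesas-marco-2021.pdf",
}
# save_document(example_item)  # requires a configured Django environment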
def parse(self, response):
    dates = response.css("section div.row div div h3 ::text").getall()
    dates = [date_txt.replace("Data: ", "") for date_txt in dates]
    event_titles = response.css(
        "section div.row div div ul li ::text").getall()
    file_urls = response.css(
        "section div.row div div a::attr(href)").extract()
    file_urls = [
        f"https://www.feiradesantana.ba.leg.br/{url}" for url in file_urls
    ]

    for event_date, title, file_url in zip(dates, event_titles, file_urls):
        event_date = from_str_to_date(event_date)
        yield CityCouncilMinuteItem(
            crawled_at=datetime_utcnow_aware(),
            crawled_from=response.url,
            date=event_date,
            title=title.strip(),
            event_type=response.meta["event_type"],
            files=[response.urljoin(file_url)],
        )

    pagination = response.css("section div ul.pagination li a")
    if pagination:
        current_page = response.css(
            "section div ul.pagination li.active ::text").get()
        if not current_page:
            current_page = response.css(
                "section div ul.pagination li.current ::text").get()
        if current_page:
            next_page = int(current_page) + 1
            url = f"{response.meta['url_without_page']}&p={next_page}"
            yield scrapy.Request(url, callback=self.parse, meta=response.meta)
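# The spiders in these snippets stamp every item with datetime_utcnow_aware(),
# a helper that is not reproduced here. A minimal sketch of what such a helper
# could look like (an assumption, not necessarily the project's implementation):
from datetime import datetime, timezone


def datetime_utcnow_aware():
    # Return the current UTC time as a timezone-aware datetime.
    return datetime.now(timezone.utc)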
def parse_details(self, response):
    gazette = response.meta["gazette"]
    gazette["year_and_edition"] = (
        response.css("span.style4 ::text").extract()[1].strip()
    )
    titles = response.xpath("//tr/td/table/tr/td[@colspan='2']/text()").extract()
    descriptions = response.css("td.destaqt ::text").extract()
    events = []
    while titles:
        events.append(
            {
                "title": titles.pop(0).strip(),
                "secretariat": descriptions.pop(0).strip(),
                "summary": descriptions.pop(0).strip(),
            }
        )
        titles.pop(0)  # discard the extra title cell between events

    if gazette.get("events") is None:
        gazette["events"] = events
    else:
        gazette["events"].extend(events.copy())

    current_page = response.css("ul li.current ::text").extract_first()
    last_page = response.css("ul li:last-child ::text").extract_first()
    if current_page:
        current_page = current_page.strip()
        last_page = last_page.strip()
        if current_page != last_page:
            # Not on the last page yet: follow the next page of events.
            next_page = int(current_page) + 1
            url = response.css("ul li a::attr(href)").extract_first()
            url = replace_query_param(url, "p", next_page)
            yield Request(
                response.urljoin(url),
                callback=self.parse_details,
                meta={"gazette": gazette},
            )
        else:
            # Last page reached: build the item and fetch the gazette file.
            gazette_item = GazetteItem(
                date=from_str_to_date(gazette["date"]),
                power=gazette["power"],
                year_and_edition=gazette["year_and_edition"],
                events=gazette["events"],
                crawled_at=datetime_utcnow_aware(),
                crawled_from=response.url,
            )
            yield Request(
                gazette["file_url"],
                callback=self.parse_document_url,
                meta={"gazette": gazette_item},
            )
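# parse_details (and the legacy gazette spider below) rely on a
# replace_query_param(url, field, value) helper that is not shown here. A minimal
# sketch built on the standard library, assuming it only swaps one query parameter:
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse


def replace_query_param(url, field, value):
    # Parse the URL, overwrite a single query-string parameter and rebuild the URL.
    parsed = urlparse(url)
    query = parse_qs(parsed.query)
    query[field] = [str(value)]
    return urlunparse(parsed._replace(query=urlencode(query, doseq=True)))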
def handle(self, *args, **options):
    self.echo(f"Caminho no S3: {options.get('s3_path')}")
    file_items = client.download_file(options.get("s3_path"))
    json_items = json.loads(open(file_items).read())
    public_view_url = "https://e.tcm.ba.gov.br/epp/ConsultaPublica/listView.seam"

    if options.get("drop_all"):
        # Ask for confirmation before wiping all TCM-BA documents.
        confirmation = input("Apagar todos os arquivos do TCM-BA? s/n ")
        if confirmation.lower() in ["s", "y"]:
            TCMBADocument.objects.all().delete()

    failed = 0
    for item in json_items:
        path = build_path(
            options.get("s3_path"), item["unit"], item["category"], item["filename"]
        )
        s3_url = f"https://dadosabertosdefeira.s3.eu-central-1.amazonaws.com/{path}"
        s3_file_path = f"s3://dadosabertosdefeira/{path}"
        document, created = TCMBADocument.objects.get_or_create(
            year=item["year"],
            month=item["month"],
            period=item["period"].lower(),
            category=item["category"],
            unit=item["unit"],
            inserted_at=from_str_to_date(item["inserted_at"]),
            inserted_by=item["inserted_by"],
            original_filename=item["original_filename"],
            crawled_from=public_view_url,
            defaults={
                "crawled_at": datetime.fromisoformat(item["crawled_at"]).replace(
                    tzinfo=timezone.utc
                ),
            },
        )
        content_type = get_content_type_for_model(document)
        if created:
            _, file_created = File.objects.get_or_create(
                url=public_view_url,
                content_type=content_type,
                object_id=document.pk,
                s3_url=s3_url,
                s3_file_path=s3_file_path,
                original_filename=item["original_filename"],
            )
            if not file_created:
                self.warn(f"Arquivo já existe: {document.pk} - {item}")  # file already exists
        else:
            self.warn(f"Documento já existe: {document.pk} - {item}")  # document already exists
            failed += 1

    self.warn(f"Warnings: {failed}")
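# handle() calls a build_path helper that is not reproduced here. A rough sketch,
# assuming it simply joins the S3 prefix with the unit, category and file name
# (the real project may normalize these values differently):
def build_path(s3_prefix, unit, category, filename):
    prefix = s3_prefix.rstrip("/")
    return f"{prefix}/{unit}/{category}/{filename}"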
def parse_list_page(self, response):
    council_members = response.css(
        "div.row div div ul li a::text").extract()
    statuses = response.css("div.row div div div a::text").extract()
    for council_member, status in zip(council_members, statuses):
        yield CityCouncilAttendanceListItem(
            crawled_at=datetime_utcnow_aware(),
            crawled_from=response.url,
            date=from_str_to_date(response.meta["date"]),
            council_member=council_member.strip(),
            status=self.get_status(status),
        )
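# parse_list_page normalizes the attendance label via self.get_status(status).
# A hypothetical sketch; the page labels and the normalized values below are
# illustrative assumptions, not the project's actual mapping:
def get_status(self, status):
    status_map = {
        "Presente": "presente",
        "Ausente": "ausente",
        "Falta justificada": "falta_justificada",
        "Licença justificada": "licenca_justificada",
    }
    return status_map.get(status.strip())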
def parse(self, response):
    dates = response.xpath("//table/tbody/tr/td[1]/strong/text()").getall()
    event_titles = response.xpath(
        "//table/tbody/tr/td[2]/p/strong/text()").getall()
    file_urls = response.xpath("//p/a/@href").extract()

    for event_date, title, file_url in zip(dates, event_titles, file_urls):
        event_date = from_str_to_date(event_date)
        yield CityCouncilMinuteItem(
            crawled_at=datetime_utcnow_aware(),
            crawled_from=response.url,
            date=event_date,
            title=title.strip(),
            event_type=self.get_type(title),
            files=[response.urljoin(file_url)],
        )
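# This spider (and parse_page below) classifies an event from its title via
# self.get_type(title). A hypothetical sketch based on keyword matching; the
# keywords and return values are illustrative assumptions:
def get_type(self, title):
    title = title.lower()
    if "extraordinária" in title:
        return "sessao_extraordinaria"
    if "audiência" in title:
        return "audiencia_publica"
    if "solene" in title:
        return "sessao_solene"
    return "sessao_ordinaria"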
def parse_page(self, response):
    event_details = response.css("div.feature-box")
    dates = response.xpath("//table/tbody/tr/td[1]/strong/text()").getall()
    event_titles = response.xpath(
        "//table/tbody/tr/td[2]/p/strong/text()").getall()

    for details, event_date, title in zip(event_details, dates, event_titles):
        events = [
            line.strip()
            for line in details.css("p ::text").getall()
            if line.strip() != ""
        ]
        event_date = from_str_to_date(event_date)
        yield CityCouncilAgendaItem(
            crawled_at=datetime_utcnow_aware(),
            crawled_from=response.url,
            date=event_date,
            details=" ".join(events),
            title=title.strip(),
            event_type=self.get_type(title),
        )
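# Illustrative only: the fields of one agenda item yielded by parse_page. The field
# names come from the yield above; every value here is hypothetical.
example_agenda_item = dict(
    crawled_at="2021-03-01T12:00:00+00:00",  # aware datetime in practice
    crawled_from="https://www.feiradesantana.ba.leg.br/",  # placeholder URL
    date="2021-03-01",  # date object in practice
    details="Leitura de correspondências. Ordem do dia.",
    title="Sessão Ordinária",
    event_type="sessao_ordinaria",  # hypothetical label
)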
def parse(self, response):
    if "SEM INFORMA" not in response.text:  # it means page found
        events, urls = self.extract_events(response)
        for event, url in zip(events, urls):
            yield LegacyGazetteItem(
                title=event["event"],
                published_on=event["published_on"],
                date=from_str_to_date(event["date"]),
                details=url["details"],
                files=[url["url"]],
                crawled_at=datetime_utcnow_aware(),
                crawled_from=response.url,
            )

        current_page = self.get_current_page(response)
        last_page = response.xpath("//table/tr[10]/td/ul/li/a/text()")
        if current_page and last_page:
            last_page = int(last_page[-1].get().strip())
            next_page = int(current_page.strip()) + 1
            if next_page <= last_page:
                url = replace_query_param(response.url, "p", next_page)
                yield response.follow(url, callback=self.parse)
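# Illustrative only: the structure that self.extract_events(response) is expected to
# return, inferred from how parse consumes it; the example values are hypothetical.
events = [
    {
        "event": "Lei nº 1.234/2001",
        "published_on": "Folha do Norte",
        "date": "02/01/2001",
    },
]
urls = [
    {"url": "https://example.org/arquivo.pdf", "details": "Publicação na íntegra"},
]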
def test_possible_date(date_str, expected_obj):
    assert from_str_to_date(date_str) == expected_obj
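# test_possible_date takes its arguments from a pytest parametrization that is not
# reproduced here. A sketch of what such a decorator could look like, assuming
# from_str_to_date accepts the dd/mm/yyyy strings seen in the crawled pages and
# returns None for unparseable input (both assumptions):
import pytest
from datetime import date


@pytest.mark.parametrize(
    "date_str,expected_obj",
    [
        ("02/01/2001", date(2001, 1, 2)),  # hypothetical supported format
        ("not a date", None),  # hypothetical fallback behaviour
    ],
)
def test_possible_date(date_str, expected_obj):
    assert from_str_to_date(date_str) == expected_obj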
def extract_date(str_with_date):
    # Look for the first dd/mm/yyyy-style date in the string and parse it.
    DATE_PATTERN = re.compile(r"\d+/\d+/\d+")
    result = DATE_PATTERN.search(str_with_date)
    if result:
        return from_str_to_date(result.group(0))
    return None
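# Nearly every snippet above goes through from_str_to_date. A minimal sketch of such
# a helper, assuming the Brazilian dd/mm/yyyy format seen in the scraped pages and a
# None fallback for unparseable strings (the real helper may accept more formats):
from datetime import datetime


def from_str_to_date(date_str):
    if not date_str:
        return None
    try:
        return datetime.strptime(date_str.strip(), "%d/%m/%Y").date()
    except ValueError:
        return None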