Example #1
def save_document(item):
    public_view_url = "https://e.tcm.ba.gov.br/epp/ConsultaPublica/listView.seam"
    # Create the document only once; crawled_at goes into defaults so it is not
    # part of the uniqueness lookup.
    document, created = TCMBADocument.objects.get_or_create(
        year=item["year"],
        month=item["month"],
        period=item["period"].lower(),
        category=item["category"],
        unit=item["unit"],
        inserted_at=from_str_to_date(item["inserted_at"]),
        inserted_by=item["inserted_by"],
        original_filename=item["original_filename"],
        crawled_from=public_view_url,
        defaults={
            "crawled_at": item["crawled_at"],
        },
    )
    content_type = get_content_type_for_model(document)
    if created:
        # Link the crawled file to the new document through its content type
        # and primary key (a generic relation).
        _, file_created = File.objects.get_or_create(
            url=public_view_url,
            content_type=content_type,
            object_id=document.pk,
            local_path=f"{item['filepath']}{item['filename']}",
            original_filename=item["original_filename"],
        )
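
# A minimal usage sketch (not from the project): the dict keys mirror the fields
# read by save_document above, and every value below is an illustrative
# placeholder.
save_document(
    {
        "year": 2021,
        "month": 1,
        "period": "Mensal",
        "category": "Despesas",
        "unit": "Prefeitura Municipal de Feira de Santana",
        "inserted_at": "01/01/2021",
        "inserted_by": "TCM-BA",
        "original_filename": "despesas.pdf",
        "crawled_at": datetime_utcnow_aware(),
        "filepath": "/tmp/tcmba/",
        "filename": "despesas.pdf",
    }
)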
Example #2
    def parse(self, response):
        dates = response.css("section div.row div div h3 ::text").getall()
        dates = [date_txt.replace("Data: ", "") for date_txt in dates]
        event_titles = response.css(
            "section div.row div div ul li ::text").getall()
        file_urls = response.css(
            "section div.row div div a::attr(href)").getall()
        file_urls = [
            f"https://www.feiradesantana.ba.leg.br/{url}" for url in file_urls
        ]

        for event_date, title, file_url in zip(dates, event_titles, file_urls):
            event_date = from_str_to_date(event_date)
            yield CityCouncilMinuteItem(
                crawled_at=datetime_utcnow_aware(),
                crawled_from=response.url,
                date=event_date,
                title=title.strip(),
                event_type=response.meta["event_type"],
                files=[file_url],  # already absolute (built above)
            )

        # Follow pagination: read the active page number and request the next page.
        pagination = response.css("section div ul.pagination li a")
        if pagination:
            current_page = response.css(
                "section div ul.pagination li.active ::text").get()
            if not current_page:
                current_page = response.css(
                    "section div ul.pagination li.current ::text").get()
            if current_page:
                next_page = int(current_page) + 1
                url = f"{response.meta['url_without_page']}&p={next_page}"
                yield scrapy.Request(url,
                                     callback=self.parse,
                                     meta=response.meta)
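
    # A hedged sketch (not the spider's actual code) of how the meta keys used in
    # parse() above could be seeded; the listing URL and the "event_type" value
    # are assumptions, not taken from the source.
    def start_requests(self):
        url_without_page = "https://www.feiradesantana.ba.leg.br/atas?ano=2021"  # hypothetical URL
        yield scrapy.Request(
            f"{url_without_page}&p=1",
            callback=self.parse,
            meta={"url_without_page": url_without_page, "event_type": "sessao_ordinaria"},
        )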
Example #3
    def parse_details(self, response):
        gazette = response.meta["gazette"]

        gazette["year_and_edition"] = (
            response.css("span.style4 ::text").extract()[1].strip()
        )
        titles = response.xpath("//tr/td/table/tr/td[@colspan='2']/text()").extract()
        descriptions = response.css("td.destaqt ::text").extract()

        events = []
        # titles and descriptions are interleaved: each event has one title,
        # followed by a secretariat and a summary from descriptions; the next
        # title entry is discarded before moving on.
        while titles:
            events.append(
                {
                    "title": titles.pop(0).strip(),
                    "secretariat": descriptions.pop(0).strip(),
                    "summary": descriptions.pop(0).strip(),
                }
            )
            titles.pop(0)  # drop the unused entry between events

        if gazette.get("events") is None:
            gazette["events"] = events
        else:
            gazette["events"].extend(events.copy())

        current_page = response.css("ul li.current ::text").extract_first()
        last_page = response.css("ul li:last-child ::text").extract_first()
        if current_page:
            current_page = current_page.strip()
            last_page = last_page.strip()
            if current_page != last_page:
                next_page = int(current_page) + 1
                url = response.css("ul li a::attr(href)").extract_first()
                url = replace_query_param(url, "p", next_page)

                yield Request(
                    response.urljoin(url),
                    callback=self.parse_details,
                    meta={"gazette": gazette},
                )
            else:
                gazette_item = GazetteItem(
                    date=from_str_to_date(gazette["date"]),
                    power=gazette["power"],
                    year_and_edition=gazette["year_and_edition"],
                    events=gazette["events"],
                    crawled_at=datetime_utcnow_aware(),
                    crawled_from=response.url,
                )
                yield Request(
                    gazette["file_url"],
                    callback=self.parse_document_url,
                    meta={"gazette": gazette_item},
                )
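
# A possible implementation of the replace_query_param helper used above,
# assuming it overwrites a single query-string parameter; the project's real
# helper may differ.
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse


def replace_query_param(url, key, value):
    parsed = urlparse(url)
    query = parse_qs(parsed.query)
    query[key] = [str(value)]  # overwrite (or add) the parameter
    return urlunparse(parsed._replace(query=urlencode(query, doseq=True)))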
Example #4
    def handle(self, *args, **options):
        self.echo(f"Caminho no S3: {options.get('s3_path')}")

        file_items = client.download_file(options.get("s3_path"))
        with open(file_items) as json_file:
            json_items = json.load(json_file)

        public_view_url = "https://e.tcm.ba.gov.br/epp/ConsultaPublica/listView.seam"

        if options.get("drop_all"):
            confirmation = input("Apagar todos os arquivos do TCM-BA? s/n ")
            if confirmation.lower() in ["s", "y"]:
                TCMBADocument.objects.all().delete()

        warnings = 0
        for item in json_items:
            path = build_path(options.get("s3_path"), item["unit"],
                              item["category"], item["filename"])
            s3_url = f"https://dadosabertosdefeira.s3.eu-central-1.amazonaws.com/{path}"
            s3_file_path = f"s3://dadosabertosdefeira/{path}"

            document, created = TCMBADocument.objects.get_or_create(
                year=item["year"],
                month=item["month"],
                period=item["period"].lower(),
                category=item["category"],
                unit=item["unit"],
                inserted_at=from_str_to_date(item["inserted_at"]),
                inserted_by=item["inserted_by"],
                original_filename=item["original_filename"],
                crawled_from=public_view_url,
                defaults={
                    "crawled_at":
                    datetime.fromisoformat(
                        item["crawled_at"]).replace(tzinfo=timezone.utc),
                },
            )
            content_type = get_content_type_for_model(document)
            if created:
                _, file_created = File.objects.get_or_create(
                    url=public_view_url,
                    content_type=content_type,
                    object_id=document.pk,
                    s3_url=s3_url,
                    s3_file_path=s3_file_path,
                    original_filename=item["original_filename"],
                )
                if not file_created:
                    self.warn(f"Arquivo já existe: {document.pk} - {item}")
            else:
                self.warn(f"Documento já existe: {document.pk} - {item}")
                warnings += 1
        self.warn(f"Warnings: {warnings}")
Example #5
    def parse_list_page(self, response):
        council_members = response.css(
            "div.row div div ul li a::text").extract()
        statuses = response.css("div.row div div div a::text").extract()

        for council_member, status in zip(council_members, statuses):
            yield CityCouncilAttendanceListItem(
                crawled_at=datetime_utcnow_aware(),
                crawled_from=response.url,
                date=from_str_to_date(response.meta["date"]),
                council_member=council_member.strip(),
                status=self.get_status(status),
            )

    def parse(self, response):
        dates = response.xpath("//table/tbody/tr/td[1]/strong/text()").getall()
        event_titles = response.xpath(
            "//table/tbody/tr/td[2]/p/strong/text()").getall()
        file_urls = response.xpath("//p/a/@href").extract()

        for event_date, title, file_url in zip(dates, event_titles, file_urls):
            event_date = from_str_to_date(event_date)
            yield CityCouncilMinuteItem(
                crawled_at=datetime_utcnow_aware(),
                crawled_from=response.url,
                date=event_date,
                title=title.strip(),
                event_type=self.get_type(title),
                files=[response.urljoin(file_url)],
            )

    def parse_page(self, response):
        event_details = response.css("div.feature-box")
        dates = response.xpath("//table/tbody/tr/td[1]/strong/text()").getall()
        event_titles = response.xpath(
            "//table/tbody/tr/td[2]/p/strong/text()").getall()

        for details, event_date, title in zip(event_details, dates,
                                              event_titles):
            events = [
                line.strip() for line in details.css("p ::text").getall()
                if line.strip() != ""
            ]
            event_date = from_str_to_date(event_date)
            yield CityCouncilAgendaItem(
                crawled_at=datetime_utcnow_aware(),
                crawled_from=response.url,
                date=event_date,
                details=" ".join(events),
                title=title.strip(),
                event_type=self.get_type(title),
            )
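
    # The get_type helper used above is not shown in this excerpt; a purely
    # hypothetical sketch of what it could look like, mapping keywords in the
    # title to an event type (keywords and labels are assumptions).
    def get_type(self, title):
        title = title.lower()
        if "extraordinária" in title:
            return "sessao_extraordinaria"
        if "ordinária" in title:
            return "sessao_ordinaria"
        return "not_found"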
Example #8
    def parse(self, response):
        if "SEM INFORMA" not in response.text:  # it means page found
            events, urls = self.extract_events(response)
            for event, url in zip(events, urls):
                yield LegacyGazetteItem(
                    title=event["event"],
                    published_on=event["published_on"],
                    date=from_str_to_date(event["date"]),
                    details=url["details"],
                    files=[url["url"]],
                    crawled_at=datetime_utcnow_aware(),
                    crawled_from=response.url,
                )

            current_page = self.get_current_page(response)
            last_page = response.xpath("//table/tr[10]/td/ul/li/a/text()")

            if current_page and last_page:
                last_page = int(last_page[-1].get().strip())
                next_page = int(current_page.strip()) + 1

                if next_page <= last_page:
                    url = replace_query_param(response.url, "p", next_page)
                    yield response.follow(url, callback=self.parse)
Example #9
def test_possible_date(date_str, expected_obj):
    assert from_str_to_date(date_str) == expected_obj

def extract_date(str_with_date):
    DATE_PATTERN = re.compile(r"\d+/\d+/\d+")  # e.g. matches 01/03/2020
    result = DATE_PATTERN.search(str_with_date)
    if result:
        return from_str_to_date(result.group(0))
    return None
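
# The test above presumably receives its cases from @pytest.mark.parametrize
# (the decorator is not shown in this excerpt); a hedged sketch of such a
# parametrization. The dd/mm/yyyy reading and the sample values are assumptions,
# and from_str_to_date is the project's own helper.
import pytest
from datetime import date


@pytest.mark.parametrize(
    "date_str,expected_obj",
    [
        ("01/03/2020", date(2020, 3, 1)),  # assuming Brazilian dd/mm/yyyy dates
        ("1/3/2020", date(2020, 3, 1)),
    ],
)
def test_possible_date_sketch(date_str, expected_obj):
    assert from_str_to_date(date_str) == expected_obj


# extract_date would pull the same pattern out of free text, e.g.:
# extract_date("Sessão Ordinária de 01/03/2020") -> date(2020, 3, 1)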