Example #1
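    # Scrapy callback. This excerpt assumes the enclosing module imports
    # `datetime` and scrapy's `Request`, plus the project's `GazetteItem`,
    # `from_str_to_date` and `replace_query_param` helpers.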
    def parse_details(self, response):
        gazette = response.meta["gazette"]

        gazette["year_and_edition"] = (
            response.css("span.style4 ::text").extract()[1].strip())
        titles = response.xpath(
            "//tr/td/table/tr/td[@colspan='2']/text()").extract()
        descriptions = response.css("td.destaqt ::text").extract()

        events = []
        while titles:
            # Each event consumes one title and two description entries;
            # the two lists are assumed to stay in step.
            events.append({
                "title": titles.pop(0).strip(),
                "secretariat": descriptions.pop(0).strip(),
                "summary": descriptions.pop(0).strip(),
            })
            titles.pop(0)  # discard the extra title entry between events

        if gazette.get("events") is None:
            gazette["events"] = events
        else:
            gazette["events"].extend(events)

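        # Paginate: keep following the next page until the current page is the
        # last one, then build the final GazetteItem and fetch its document.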
        current_page = response.css("ul li.current ::text").extract_first()
        last_page = response.css("ul li:last-child ::text").extract_first()
        if current_page:
            current_page = current_page.strip()
            last_page = last_page.strip()
            if current_page != last_page:
                next_page = int(current_page) + 1
                url = response.css("ul li a::attr(href)").extract_first()
                url = replace_query_param(url, "p", next_page)

                yield Request(
                    response.urljoin(url),
                    callback=self.parse_details,
                    meta={"gazette": gazette},
                )
            else:
                gazette_item = GazetteItem(
                    date=from_str_to_date(gazette["date"]),
                    power=gazette["power"],
                    year_and_edition=gazette["year_and_edition"],
                    events=gazette["events"],
                    crawled_at=datetime.now(),
                    crawled_from=response.url,
                )
                yield Request(
                    gazette["file_url"],
                    callback=self.parse_document_url,
                    meta={"gazette": gazette_item},
                )
Example #2
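    # Emits one CityCouncilMinuteItem per table row; the three selector lists
    # are assumed to be parallel, and zip() pairs them positionally.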
    def parse(self, response):
        dates = response.xpath("//table/tbody/tr/td[1]/strong/text()").getall()
        event_titles = response.xpath("//table/tbody/tr/td[2]/p/strong/text()").getall()
        file_urls = response.xpath("//p/a/@href").getall()

        for event_date, title, file_url in zip(dates, event_titles, file_urls):
            event_date = from_str_to_date(event_date)
            yield CityCouncilMinuteItem(
                crawled_at=datetime.now(),
                crawled_from=response.url,
                date=event_date,
                title=title.strip(),
                event_type=self.get_type(title),
                files=[response.urljoin(file_url)],
            )
Example #3
    def parse_page(self, response):
        event_details = response.css("div.feature-box")
        dates = response.xpath("//table/tbody/tr/td[1]/strong/text()").getall()
        event_titles = response.xpath("//table/tbody/tr/td[2]/p/strong/text()").getall()

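        # zip() truncates to the shortest list, so rows missing a date or
        # title are silently dropped.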
        for details, event_date, title in zip(event_details, dates, event_titles):
            events = [
                line.strip()
                for line in details.css("p ::text").getall()
                if line.strip() != ""
            ]
            event_date = from_str_to_date(event_date)
            yield CityCouncilAgendaItem(
                crawled_at=datetime.now(),
                crawled_from=response.url,
                date=event_date,
                details=" ".join(events),
                title=title.strip(),
                event_type=self.get_type(title),
            )
Example #4
    def parse(self, response):
        if "SEM INFORMA" not in response.text:  # it means page found
            events, urls = self.extract_events(response)
            for event, url in zip(events, urls):
                yield LegacyGazetteItem(
                    title=event["event"],
                    published_on=event["published_on"],
                    date=from_str_to_date(event["date"]),
                    details=url["details"],
                    file_urls=[url["url"]],
                    crawled_at=datetime.now(),
                    crawled_from=response.url,
                )

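            # The pager sits in the table's tenth row; the text of its last
            # link holds the final page number.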
            current_page = self.get_current_page(response)
            last_page = response.xpath("//table/tr[10]/td/ul/li/a/text()")

            if current_page and last_page:
                last_page = int(last_page[-1].get().strip())
                next_page = int(current_page.strip()) + 1

                if next_page <= last_page:
                    url = replace_query_param(response.url, "p", next_page)
                    yield response.follow(url, callback=self.parse)
Example #5
import re

# Assumes the project's `from_str_to_date` helper is in scope.
DATE_PATTERN = re.compile(r"\d+/\d+/\d+")


def extract_date(str_with_date):
    # Return the first slash-separated date in the string, or None.
    result = DATE_PATTERN.search(str_with_date)
    if result:
        return from_str_to_date(result.group(0))
    return None
Example #6
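# pytest test: the (date_str, expected_obj) pairs presumably come from a
# @pytest.mark.parametrize decorator that is not shown in this excerpt.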
def test_possible_date(date_str, expected_obj):
    assert from_str_to_date(date_str) == expected_obj