def parse_page(self, response):
    """Yield one ``CityHallBidItem`` for each bid found on the page.

    The page is read with four XPath queries (modalities, descriptions,
    bid histories and session dates); each result is handed to the matching
    ``_parse_*`` helper and the four parsed sequences are combined
    position by position.

    The public agency, month and year are not read from the page body:
    they come from the ``cat`` and ``dt`` query parameters of
    ``response.url``.

    Args:
        response: the response being parsed; must provide ``xpath``,
            ``url`` and ``urljoin``.

    Yields:
        CityHallBidItem: one item per bid. ``files`` is only set when the
        description carries a document URL.

    Raises:
        ValueError: if the page contains bids but ``response.url`` does not
            match the expected ``licitacoes_pm.asp`` pattern.
    """
    raw_modalities = response.xpath("//tr/td[1]/table/tr/td/text()").extract()
    raw_descriptions = response.xpath(
        "//table/tr[2]/td/table/tr[6]/td/table/tr/td[2]/table[1]"
    )
    raw_bids_history = response.xpath(
        "//table/tr[2]/td/table/tr[6]/td/table/tr/td[2]/table[2]"
    )
    raw_date = response.xpath("//tr/td[3]/table/tr/td/text()").extract()

    descriptions = self._parse_descriptions(raw_descriptions)
    bids_history = self._parse_bids_history(raw_bids_history)
    modalities = self._parse_modalities(raw_modalities)
    session_dates = self._parse_date(raw_date)

    # NOTE: zip() stops at the shortest sequence, so a mismatch between the
    # four parsed lists silently drops the trailing bids.
    bid_data = list(zip(modalities, descriptions, bids_history, session_dates))
    if not bid_data:
        # Nothing to emit; the URL is irrelevant in that case.
        return

    # NOTE(review): inside the character class ``|`` is a literal, so this
    # also accepts a pipe before ``cat=``; kept as-is to preserve matching.
    url_pattern = re.compile(r"licitacoes_pm\.asp[\?|&]cat=(\w+)\&dt=(\d+-\d+)")

    # The URL is the same for every bid on the page, so it is parsed once
    # here instead of once per item.
    match = url_pattern.search(response.url)
    if match is None:
        raise ValueError(
            f"Could not extract agency and date from URL: {response.url}"
        )
    public_agency = match.group(1).upper()
    month, year = (int(part) for part in match.group(2).split("-"))

    for modality_and_code, (description, document_url), history, session_date in bid_data:
        item = CityHallBidItem(
            crawled_at=datetime_utcnow_aware(),
            crawled_from=response.url,
            public_agency=public_agency,
            month=month,
            year=year,
            description=description,
            history=history,
            codes=modality_and_code["codes"],
            modality=modality_and_code["modality"],
            session_at=from_str_to_datetime(session_date),
        )
        if document_url:
            item["files"] = [response.urljoin(document_url)]
        yield item
def _parse_bids_history(self, raw_bids_history):
    """Extract the list of history events from each bid's history table.

    Args:
        raw_bids_history: iterable of selector-like objects, one per bid;
            each is queried with ``.//tr`` to obtain its rows.

    Returns:
        list[list[dict]]: one inner list per input table, in order. Each
        dict has the keys ``published_at`` (the parsed date), ``event``
        (capitalized text) and ``url`` (empty string when the row has no
        link). Rows lacking date text, a parsed date or event text are
        skipped.
    """
    all_bids_history = []
    for raw_bid_history in raw_bids_history:
        bids_history = []
        for row in raw_bid_history.xpath(".//tr"):
            raw_date = row.xpath(".//td[2]/text()").get()
            if raw_date is None:
                # ``.get()`` returns None when the cell has no text; a row
                # without a date cannot produce an entry, so skip it
                # instead of failing on ``None.strip()``.
                continue
            published_at = from_str_to_datetime(raw_date.strip())
            event = row.xpath(".//td[3]/div/text()").get()
            url = row.xpath(".//td[4]/div/a//@href").get()
            if event and published_at:
                bids_history.append(
                    {
                        "published_at": published_at,
                        "event": event.capitalize(),
                        "url": url or "",
                    }
                )
        all_bids_history.append(bids_history)
    return all_bids_history
def test_possible_datetime(datetime_str, expected_obj):
    """Check ``from_str_to_datetime`` when no explicit format list is given.

    ``datetime_str`` is the input text and ``expected_obj`` the value the
    parser is expected to return for it.
    """
    parsed = from_str_to_datetime(datetime_str)
    assert parsed == expected_obj
def test_possible_date_formats(datetime_str, expected_obj):
    """Check ``from_str_to_datetime`` with explicit day/month/year formats.

    The parser is given both the four-digit-year and the two-digit-year
    variants and must return ``expected_obj`` for ``datetime_str``.
    """
    day_first_formats = ["%d/%m/%Y", "%d/%m/%y"]
    parsed = from_str_to_datetime(datetime_str, day_first_formats)
    assert parsed == expected_obj
def test_dates_older_than_city_creation(datetime_str, expected_obj):
    """Check ``from_str_to_datetime`` with explicit day/month/year formats.

    Presumably driven with dates that predate the city's creation (going by
    the test name; the inputs are supplied from outside this function).
    """
    day_first_formats = ["%d/%m/%Y", "%d/%m/%y"]
    parsed = from_str_to_datetime(datetime_str, day_first_formats)
    assert parsed == expected_obj