def parse_well_data(self, response):
    """Fold details scraped from a well-data page into the current item.

    Increments the '2_welldata_response_count' stat, takes the in-progress
    item from ``response.meta['current_item']``, updates its location,
    status and spud-date fields from the page's ``<td>`` cells, and then
    yields everything ``self.process_permit_item`` produces for that item.
    """
    self.crawler.stats.inc_value('2_welldata_response_count', spider=self)
    item = response.meta['current_item']
    cells = HtmlXPathSelector(response).select('//td')

    # A record may list lat/lng twice: first 'as planned', then 'as built'.
    # Taking the last match prefers the 'as built' pair when it is present.
    try:
        latitude, longitude = find_well_data(
            cells, parse_well_latlng, "Lat/Long:", all=True)[-1]
    except IndexError:
        # No lat/lng entry was found at all.
        latitude = longitude = None

    status = find_well_data(cells, parse_text, "Status:", embedded=True)
    spud_date = find_well_data(cells, parse_date, "Spud Date:", nexttd=True)

    if latitude:
        item['well_lat'] = latitude
        item['well_lng'] = longitude

    status_changed = bool(status) and status != item.get('well_status')
    if status_changed:
        # Stamp a newly observed status with today's date.
        item['well_status'] = status
        item['well_status_date'] = convert_fuzzy_date(str(date.today()))
    elif 'well_status' not in item:
        # The feed generator expects these keys to exist; default them so
        # it does not hit a KeyError.
        item['well_status'] = None
        item['well_status_date'] = None

    # Overwrite the spud date only with a real value, but always make sure
    # the key is present.
    if spud_date or 'well_spud_date' not in item:
        item['well_spud_date'] = spud_date

    for result in self.process_permit_item(item):
        yield result
def extract_date(td):
    """Return the date found in *td*, or "" when the cell has no usable date.

    The cell text comes from ``extract_text``. A value of ``None``, the
    empty string, or 'N/A' yields "". Otherwise the text is passed through
    ``convert_fuzzy_date`` and the first whitespace-separated token of the
    result is returned (presumably the date part of a date/time string --
    confirm against ``convert_fuzzy_date``).
    """
    text = extract_text(td)
    if text is None or text in ('', 'N/A'):
        return ""
    tokens = convert_fuzzy_date(text).split()
    return tokens[0]
def parse_date(text):
    """Return the leading token of the converted date, or None if *text* is falsy.

    Non-empty input is passed through ``convert_fuzzy_date`` and the first
    whitespace-separated token of the result is returned.
    """
    if not text:
        return None
    converted = convert_fuzzy_date(text)
    return converted.split()[0]