class BerwickParser:
    """Scraper for Berwick-upon-Tweed Borough Council planning applications.

    The council publishes one weekly list page per week, named after the
    Thursday of that week, so a single-day query is mapped onto the list
    covering that day.
    """

    # Comments are taken by email only; used in place of a comment URL.
    comments_email_address = "*****@*****.**"

    def __init__(self, *args):
        self.authority_name = "Berwick-upon-Tweed Borough Council"
        self.authority_short_name = "Berwick"
        self.base_url = "http://www.berwick-upon-tweed.gov.uk/planning/register/wl/%s.htm"
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch and parse the weekly list that covers the given date."""
        search_day = datetime.date(year, month, day)

        # The list URL is keyed on the Thursday of the week being searched.
        week_monday = search_day - datetime.timedelta(search_day.weekday())
        list_thursday = week_monday + datetime.timedelta(3)
        if search_day.weekday() > 3:
            # Friday/Saturday/Sunday belong to the following week's list.
            list_thursday = list_thursday + datetime.timedelta(7)

        this_url = self.base_url % (list_thursday.strftime(search_date_format))

        page = urllib2.urlopen(this_url)
        soup = BeautifulSoup(page.read())

        # Each application lives in its own attribute-less table, so we
        # anchor on the " Application Number:" NavigableStrings and walk
        # outwards from each one to its enclosing table.
        for anchor in soup.findAll(text=" Application Number:"):
            application = PlanningApplication()
            application.council_reference = anchor.findNext("p").string.strip()

            result_table = anchor.findPrevious("table")
            application.date_received = datetime.datetime.strptime(
                result_table.find(text=" Registration Date: ").findNext("p").contents[0].strip(),
                reg_date_format)
            application.osgb_x = result_table.find(text=" Easting:").findNext("p").string.strip()
            application.osgb_y = result_table.find(text=" Northing:").findNext("p").string.strip()
            application.description = result_table.find(text=" Proposed Development:").findNext("p").string.strip()
            application.address = result_table.find(text=" Location:").findNext("p").string.strip()
            application.postcode = getPostcodeFromText(application.address)
            application.info_url = this_url
            # No web comment facility exists - hand out the email address.
            application.comment_url = self.comments_email_address

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the day's applications rendered as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class CarmarthenshireParser:
    """Scraper for Carmarthenshire County Council planning applications."""

    def __init__(self, *args):
        # Comments are taken by email only.
        self.comments_email_address = "*****@*****.**"
        self.authority_name = "Carmarthenshire County Council"
        self.authority_short_name = "Carmarthenshire"
        self.base_url = "http://www.carmarthenshire.gov.uk/CCC_APPS/eng/plannaps/CCC_PlanningApplicationsResults.asp?datemode=range&in_lo_date=%(day)s%%2F%(month)s%%2F%(year)s&in_hi_date=%(day)s%%2F%(month)s%%2F%(year)s&SUBMIT=Search"
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Search the council site for the given day and parse the results."""
        search_day = datetime.date(year, month, day)

        page = urllib2.urlopen(self.base_url % {"day": day, "month": month, "year": year, })
        soup = BeautifulSoup(page.read())

        all_trs = soup.findAll("tr", valign="middle")

        # Every other <tr> is just a spacer row - step over them.
        for tr in all_trs[::2]:
            application = PlanningApplication()
            cells = tr.findAll("td")

            application.date_received = search_day
            application.council_reference = cells[1].a.string
            application.address = cells[3].a.string
            application.postcode = getPostcodeFromText(application.address)

            # All the links in this <tr> point at the same info page.
            application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])

            # The description and grid reference only appear on the info page.
            info_soup = BeautifulSoup(urllib2.urlopen(application.info_url).read())
            application.description = info_soup.find(text="Description").findNext("td").findNext("td").font.string
            application.osgb_x, application.osgb_y = info_soup.find(text="Grid Reference").findNext("td").font.string.split("-")

            # No web comment form - hand out the email address instead.
            application.comment_url = self.comments_email_address

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the day's applications rendered as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class CrawleyParser:
    """Scraper for Crawley Borough Council planning applications."""

    comment_url_template = "http://www.crawley.gov.uk/stellent/idcplg?IdcService=SS_GET_PAGE&nodeId=561&pageCSS=&pAppNo=%(pAppNo)s&pAppDocName=%(pAppDocName)s"

    def __init__(self, *args):
        self.authority_name = "Crawley Borough Council"
        self.authority_short_name = "Crawley"
        self.base_url = "http://www.crawley.gov.uk/stellent/idcplg?IdcService=SS_GET_PAGE&nodeId=560&is_NextRow=1&accept=yes&strCSS=null&pApplicationNo=&pProposal=&pLocation=&pPostcode=&pWard=&pDateType=received&pDayFrom=%(dayFrom)s&pMonthFrom=%(monthFrom)s&pYearFrom=%(yearFrom)s&pDayTo=%(dayTo)s&pMonthTo=%(monthTo)s&pYearTo=%(yearTo)s&submit=Search"
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Search the council site for one day's applications."""
        search_day = datetime.date(year, month, day)

        # Crawley only supports from-to range searches, so use a 1-day range.
        search_url = self.base_url % {
            "dayFrom": day,
            "monthFrom": month,
            "yearFrom": year,
            "dayTo": day,
            "monthTo": month,
            "yearTo": year,
        }

        page = urllib2.urlopen(search_url)
        soup = BeautifulSoup.BeautifulSoup(page.read())

        # An empty result set renders with no table at all.
        if soup.table:
            # The first row is only headers.
            for tr in soup.table.findAll("tr")[1:]:
                cells = tr.findAll("td")
                application = PlanningApplication()

                # NOTE(review): this replace looks like a no-op; presumably it
                # originally decoded an HTML entity - confirm before changing.
                application.council_reference = cells[0].a.contents[0].strip().replace("/", "/")
                application.info_url = urlparse.urljoin(self.base_url, cells[0].a['href'])

                # The comment page needs the document name from the info URL's
                # query string as well as the reference.
                info_qs = cgi.parse_qs(urlparse.urlsplit(application.info_url)[3])
                comment_qs = {
                    "pAppNo": application.council_reference,
                    "pAppDocName": info_qs["ssDocName"][0],
                }
                application.comment_url = self.comment_url_template % comment_qs

                application.address = cells[1].string.strip()
                if cells[2].string:
                    # If a postcode is present, append it to the address too.
                    application.postcode = cells[2].string.replace(" ", " ").strip()
                    application.address += ", " + application.postcode
                application.description = cells[3].string.strip()
                application.date_received = datetime.datetime(*(time.strptime(cells[4].string.strip(), date_format)[0:6]))
                self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the day's applications rendered as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class ForestOfDeanParser:
    """Scraper for Forest of Dean District Council planning applications."""

    def __init__(self, *args):
        self.authority_name = "Forest of Dean District Council"
        self.authority_short_name = "Forest of Dean"
        self.base_url = "http://www.fdean.gov.uk/content.asp"
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Search the council site for one day's applications."""
        search_date = datetime.date(year, month, day)

        query_string = urllib.urlencode(
            [
                ("parent_directory_id", "200"),
                ("nav", "679"),
                ("id", "13266"),
                ("RecStart", "1"),
                ("RecCount", "100"),
                ("SDate", search_date.strftime(date_format)),
                ("EDate", search_date.strftime(date_format)),
            ]
        )
        search_url = self.base_url + "?" + query_string

        page = urllib2.urlopen(search_url)
        soup = BeautifulSoup(page.read())

        results_table = soup.find("table", summary="List of planning applications that match your query")

        # Skip the header row.
        for tr in results_table.findAll("tr")[1:]:
            application = PlanningApplication()
            application.date_received = search_date

            cells = tr.findAll("td")
            application.council_reference = cells[0].a.string.strip()
            application.info_url = urlparse.urljoin(self.base_url, cells[0].a['href'])
            # Comments go through the same page as the details.
            application.comment_url = application.info_url
            # Collapse runs of whitespace in the address.
            application.address = ' '.join(cells[1].string.strip().split())
            application.postcode = getPostcodeFromText(application.address)
            application.description = cells[2].string.strip()

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the day's applications rendered as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class CalderdaleParser:
    """Scraper for Calderdale Council planning applications.

    Results are paginated; the scraper follows the "Next" link until it
    runs out of pages.
    """

    def __init__(self, *args):
        self.authority_name = "Calderdale Council"
        self.authority_short_name = "Calderdale"
        self.base_url = "http://www.calderdale.gov.uk/environment/planning/search-applications/planapps.jsp?status=0&date1=%(date)s&date2=%(date)s&Search=Search"
        self.info_url = "http://www.calderdale.gov.uk/environment/planning/search-applications/planapps.jsp?app=%s&Search=Search"
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Search the council site for one day's applications, following pagination."""
        search_date = datetime.date(year, month, day)

        page_url = self.base_url % {"date": search_date.strftime(date_format)}

        while page_url:
            try:
                page = urllib2.urlopen(page_url)
            except urllib2.HTTPError:
                # An HTTP error seems to be what happens when there are no
                # applications for the day - treat it as an empty result.
                break

            soup = BeautifulSoup(page.read())

            # Queue up the next page (if any) before parsing this one.
            next_link = soup.find(text="Next")
            if next_link:
                page_url = urlparse.urljoin(self.base_url, next_link.parent['href'])
            else:
                page_url = None

            # Each application is introduced by an <h3> heading.
            for h3 in soup.findAll("h3", {"class": "resultsnavbar"}):
                application = PlanningApplication()

                application.date_received = search_date
                application.council_reference = h3.string.split(": ")[1]
                details_div = h3.findNext("div")
                application.description = details_div.find(text="Proposal:").parent.nextSibling.strip()
                application.address = ', '.join(details_div.find(text="Address of proposal:").parent.nextSibling.strip().split("\r"))
                application.postcode = getPostcodeFromText(application.address)

                application.comment_url = urlparse.urljoin(self.base_url, details_div.find(text=re.compile("Comment on Application")).parent['href'])
                application.info_url = self.info_url % (urllib.quote(application.council_reference))

                application.osgb_x, application.osgb_y = details_div.find(text="Grid Reference:").parent.nextSibling.strip().split()

                self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the day's applications rendered as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class SolihullParser:
    """Scraper for Solihull Metropolitan Borough Council planning applications.

    The council publishes weekly lists keyed on the Monday of the week, so
    we fetch the enclosing week and filter down to the requested day.
    """

    def __init__(self, *args):
        self.authority_name = "Solihull Metropolitan Borough Council"
        self.authority_short_name = "Solihull"
        self.base_url = "http://www.solihull.gov.uk/planning/dc/weeklist.asp?SD=%s&ward=ALL"
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch the weekly list covering the given date and keep that day's apps."""
        search_day = datetime.date(year, month, day)

        # The weekly list URL is keyed on the Monday before the search date.
        monday_before = search_day - datetime.timedelta(search_day.weekday())

        response = urllib2.urlopen(self.base_url % (monday_before.strftime(date_format)))
        soup = BeautifulSoup(response.read())

        result_tables = soup.findAll("table", width="98%", cellpadding="2")

        for table in result_tables:
            application = PlanningApplication()
            trs = table.findAll("tr")

            application.council_reference = trs[0].strong.string.strip()
            relative_info_url = trs[0].a['href']
            application.info_url = urlparse.urljoin(self.base_url, relative_info_url)

            application.address = trs[1].findAll("td")[1].string.strip()
            application.postcode = getPostcodeFromText(application.address)
            application.description = trs[2].findAll("td")[1].string.strip()

            # The received date is shown as dd/mm/yyyy in the fourth row.
            raw_date_recv = trs[3].findAll("td")[3].string.strip().split("/")

            # The list covers a whole week - discard apps not on the target day.
            if int(raw_date_recv[0]) != day:
                continue

            application.date_received = datetime.date(int(raw_date_recv[2]), int(raw_date_recv[1]), int(raw_date_recv[0]))

            # Not every application has a comment link; when the cell, link
            # or href is missing the lookup raises one of these.
            # (Was a bare except:, which also hid real bugs.)
            try:
                relative_comment_url = trs[5].findAll("td")[1].a['href']
                application.comment_url = urlparse.urljoin(self.base_url, relative_comment_url)
            except (AttributeError, TypeError, KeyError, IndexError):
                application.comment_url = "No Comment URL."

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the day's applications rendered as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class LeicestershireParser:
    """Scraper for Leicestershire County Council planning applications."""

    def __init__(self, *args):
        self.authority_name = "Leicestershire County Council"
        self.authority_short_name = "Leicestershire"
        self.base_url = "http://www.leics.gov.uk/index/environment/community_services_planning/planning_applications/index/environment/community_services_planning/planning_applications/eplanning_searchform/eplanning_resultpage.htm?sd=%(date)s&ed=%(date)s&kw=&map=f"
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Search the council site for one day's applications."""
        search_date = datetime.date(year, month, day)

        page = urllib2.urlopen(self.base_url % {"date": search_date.strftime(search_date_format)})
        soup = BeautifulSoup.BeautifulSoup(page.read())

        # A "No Results Found" message means there is nothing to parse.
        if not soup.find(text=re.compile("No Results Found")):
            # The second dataTable holds the results; its first row is headers.
            rows = soup.findAll("table", {"class": "dataTable"})[1].findAll("tr")[1:]

            for row in rows:
                cells = row.findAll("td")
                application = PlanningApplication()

                # The search was for a single day, so the date is known already.
                application.date_received = search_date

                application.council_reference = cells[0].a.string.strip()
                application.info_url = urlparse.urljoin(self.base_url, cells[0].a['href'])
                # The address cell mixes tags and text; keep just the text bits.
                application.address = ', '.join([x for x in cells[1].contents
                                                if isinstance(x, BeautifulSoup.NavigableString)])
                application.postcode = getPostcodeFromText(application.address)
                application.description = cells[2].string.strip()

                # The comment link only appears on the info page.
                info_page = urllib2.urlopen(application.info_url)
                info_soup = BeautifulSoup.BeautifulSoup(info_page.read())
                base = info_soup.base['href']
                application.comment_url = urlparse.urljoin(base, info_soup.find("a", target="Planning Application Consultation Form")['href'])

                self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the day's applications rendered as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class HounslowParser:
    """Scraper for London Borough of Hounslow planning applications."""

    def __init__(self, *args):
        self.authority_name = "London Borough of Hounslow"
        self.authority_short_name = "Hounslow"
        # Limited to 500 cases - putting 1000 causes a default value of 50
        # to be used. 500 should be plenty.
        self.base_url = "http://planning.hounslow.gov.uk/planningv2/planning_summary.aspx?strWeekListType=SRCH&strRecTo=%(date)s&strRecFrom=%(date)s&strWard=ALL&strAppTyp=ALL&strWardTxt=All%%20Wards&strAppTypTxt=All%%20Application%%20Types&strArea=ALL&strAreaTxt=All%%20Areas&strStreet=ALL&strStreetTxt=All%%20Streets&strPC=&strLimit=500"
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Search the council site for one day's applications."""
        search_day = datetime.date(year, month, day)

        page = urllib2.urlopen(self.base_url % {"date": search_day.strftime(date_format)})
        soup = BeautifulSoup(page.read())

        # Each result sits in its own table with no distinguishing
        # attributes; the NavigableString "Application" appears in every
        # result table and nowhere else, so we anchor on it.
        for anchor in soup.findAll(text="Application"):
            result_table = anchor.findPrevious("table")

            application = PlanningApplication()
            application.date_received = search_day

            links = result_table.findAll("a")

            # The first link goes to streetmap; its query string carries the
            # OSGB coordinates.
            map_qs_dict = cgi.parse_qs(urlparse.urlsplit(links[0]["href"])[3])
            application.osgb_x = map_qs_dict.get("x")[0]
            application.osgb_y = map_qs_dict.get("y")[0]

            application.council_reference = links[1].string.strip()
            application.info_url = urlparse.urljoin(self.base_url, links[1]["href"])
            application.comment_url = urlparse.urljoin(self.base_url, links[2]["href"])

            # The address is the text immediately before the map link.
            application.address = " ".join(links[0].previous.strip().split())
            application.postcode = getPostcodeFromText(application.address)

            application.description = links[2].previous.strip()

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the day's applications rendered as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class KensingtonParser:
    """Scraper for the Royal Borough of Kensington and Chelsea.

    The council's search takes a week-ending date via POST, so the day
    search is mapped onto the Friday of the enclosing week.
    """

    def __init__(self, *args):
        self.authority_name = "The Royal Borough of Kensington and Chelsea"
        self.authority_short_name = "Kensington and Chelsea"
        self.base_url = "http://www.rbkc.gov.uk/Planning/scripts/weeklyresults.asp"
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """POST a week-ending search and parse the returned table."""
        search_day = datetime.date(year, month, day)

        # The form wants the end of the week containing the search day.
        week_friday = search_day - datetime.timedelta(search_day.weekday()) + datetime.timedelta(4)

        # Not using urllib.urlencode as it insists on turning the "+" into "%2B".
        post_data = "WeekEndDate=%d%%2F%d%%2F%d&order=Received+Date&submit=search" % (week_friday.day, week_friday.month, week_friday.year)

        page = urllib2.urlopen(self.base_url, post_data)
        soup = BeautifulSoup(page.read())

        # The first row is headers.
        rows = soup.find("table", summary="Planning Application search results table").findAll("tr")[1:]

        for row in rows:
            application = PlanningApplication()
            cells = row.findAll("td")

            # NOTE(review): this replace looks like a no-op; presumably it
            # originally decoded an HTML entity - confirm before changing.
            application.council_reference = cells[0].a.contents[1].strip().replace("/", "/")
            application.info_url = urlparse.urljoin(self.base_url, cells[0].a['href'])
            # Comments are made through the info page.
            application.comment_url = application.info_url

            application.date_received = datetime.datetime(*(time.strptime(cells[1].string.strip(), date_format)[0:6]))

            application.address = cells[2].string.strip()
            application.postcode = getPostcodeFromText(application.address)
            application.description = cells[3].string.strip()

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the day's applications rendered as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class ExmoorParser:
    """Scraper for Exmoor National Park planning applications."""

    def __init__(self, *args):
        self.authority_name = "Exmoor National Park"
        self.authority_short_name = "Exmoor"
        self.base_url = "http://www.exmoor-nationalpark.gov.uk/planning_weekly_list.htm?weeklylist=%s"
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch the weekly list for the given date and parse each application."""
        search_day = datetime.date(year, month, day)

        response = urllib2.urlopen(self.base_url % (search_day.strftime(search_date_format)))
        soup = BeautifulSoup(response.read())

        # The first <tr> contains headers.
        trs = soup.table.findAll("tr")[1:]

        for tr in trs:
            application = PlanningApplication()
            tds = tr.findAll("td")

            application.date_received = datetime.datetime.strptime(tds[0].string, received_date_format).date()
            # NOTE(review): quote_plus followed by unquote is not a round
            # trip (unquote leaves "+" alone) - presumably intended to
            # normalise the href; confirm before changing.
            application.info_url = urllib.unquote(urllib.quote_plus(urlparse.urljoin(self.base_url, tds[1].a['href'])))
            application.council_reference = tds[1].a.string.strip()

            application.address = tds[2].a.string.strip()
            application.postcode = getPostcodeFromText(application.address)

            # The description only appears on the info page.
            info_response = urllib.urlopen(application.info_url)
            info_soup = BeautifulSoup(info_response.read())

            application.description = info_soup.find(text="Proposal:").findNext("td").string.strip()

            # Some applications have no "Comment" link; in that case the
            # find() returns None and the attribute/key access raises.
            # (Was a bare except:, which also hid real bugs.)
            try:
                application.comment_url = urlparse.urljoin(self.base_url, info_soup.find(text="Comment").parent['href'])
            except (AttributeError, KeyError, TypeError):
                application.comment_url = "No Comments"

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the day's applications rendered as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class KingstonParser:
    """Scraper for the Royal Borough of Kingston upon Thames."""

    # A few applications have comment URLs, but most don't, and the others
    # can't be derived - so the email address is used for all of them.
    comments_email_address = "*****@*****.**"

    def __init__(self, *args):
        self.authority_name = "Royal Borough of Kingston upon Thames"
        self.authority_short_name = "Kingston upon Thames"
        self.base_url = "http://maps.kingston.gov.uk/isis_main/planning/planning_summary.aspx?strWeekListType=SRCH&strRecTo=%(date)s&strRecFrom=%(date)s&strWard=ALL&strAppTyp=ALL&strWardTxt=All%%20Wards&strAppTypTxt=All%%20Application%%20Types&strStreets=ALL&strStreetsTxt=All%%20Streets&strLimit=500"
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Search the council site for one day's applications."""
        search_day = datetime.date(year, month, day)

        page = urllib2.urlopen(self.base_url % {"date": search_day.strftime(date_format)})
        soup = BeautifulSoup(page.read())

        # Each application is stored in a table of its own with no nice
        # distinguishing features, but every one contains the
        # NavigableString "Application" and nothing else on the page does.
        for anchor in soup.findAll(text="Application"):
            results_table = anchor.findPrevious("table")

            application = PlanningApplication()
            application.date_received = search_day

            application.council_reference = results_table.a.string.strip()
            application.info_url = urlparse.urljoin(self.base_url, results_table.a['href'])

            application.address = results_table.findAll("td")[7].a.string.strip()
            application.postcode = getPostcodeFromText(application.address)

            application.description = results_table.findAll("td")[-1].contents[0].strip()

            application.comment_url = self.comments_email_address

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the day's applications rendered as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class BarnsleyParser:
    """Scraper for Barnsley Metropolitan Borough Council.

    The council publishes weekly lists keyed on the Monday of the week;
    individual received dates are not shown, so the Monday is recorded.
    """

    # Comments are taken by email only.
    comments_email_address = "*****@*****.**"

    def __init__(self, *args):
        self.authority_name = "Barnsley Metropolitan Borough Council"
        self.authority_short_name = "Barnsley"
        self.base_url = "http://applications.barnsley.gov.uk/service/development/week_compact.asp?AppDate=%s"
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch and parse the weekly list covering the given date."""
        search_day = datetime.date(year, month, day)

        # The list URL is keyed on the Monday before the search date.
        monday_before = search_day - datetime.timedelta(search_day.weekday())

        page = urllib2.urlopen(self.base_url % (monday_before.strftime(date_format)))
        soup = BeautifulSoup(page.read())

        for table in soup.findAll("table", align="Center", cellpadding="3"):
            application = PlanningApplication()

            # The comment URL and received date are known up front.
            application.comment_url = self.comments_email_address
            application.date_received = monday_before

            rows = table.findAll("tr")

            application.council_reference = rows[0].a.string.strip()
            application.info_url = urlparse.urljoin(self.base_url, rows[0].a['href'])

            application.address = rows[1].findAll("td")[1].string.strip()
            application.postcode = getPostcodeFromText(application.address)
            application.description = rows[2].findAll("td")[1].string.strip()

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the day's applications rendered as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class HarrowParser:
    """Scraper for the London Borough of Harrow.

    The council only publishes a rolling last-seven-days page, so the
    day/month/year arguments are ignored and the whole page is parsed.
    """

    def __init__(self, *args):
        self.authority_name = "London Borough of Harrow"
        self.authority_short_name = "Harrow"
        # This is a link to the last seven days' applications.
        self.base_url = "http://www.harrow.gov.uk/www4/planning/dcweek1.asp"
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Parse every application on the last-seven-days page."""
        page = urllib2.urlopen(self.base_url)
        soup = BeautifulSoup(page.read())

        # Each application contains the nav string "Application: ".
        for anchor in soup.findAll(text="Application: "):
            application = PlanningApplication()

            application.council_reference = anchor.findPrevious("tr").findAll("td", limit=2)[1].string.strip()
            application.address = anchor.findNext(text=location_re).split(":")[1].strip()
            application.postcode = getPostcodeFromText(application.address)

            application.description = anchor.findNext(text="Proposal: ").findNext("td").string.strip()
            application.comment_url = urlparse.urljoin(self.base_url, anchor.findNext(text="Proposal: ").findNext("a")['href'])

            application.date_received = datetime.datetime.strptime(anchor.findNext(text=date_received_re).split(": ")[1], date_format).date()

            # FIXME: There is no appropriate info_url for the Harrow apps.
            # The base url is used for the moment, but as that is a list of
            # apps from the last 7 days it will quickly be out of date.
            application.info_url = self.base_url

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the applications rendered as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class HampshireParser:
    """Scraper for Hampshire County Council (minerals and waste applications).

    The council publishes a single open-applications page, so the
    day/month/year arguments are not used to build the request.
    """

    def __init__(self, *args):
        self.authority_name = "Hampshire County Council"
        self.authority_short_name = "Hampshire"
        self.base_url = "http://www3.hants.gov.uk/planning/mineralsandwaste/planning-applications/applications/applications-open.htm"
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Parse every application on the open-applications page."""
        response = urllib2.urlopen(self.base_url)
        soup = BeautifulSoup(response.read())

        # Result rows are striped with class "odd"/"even".
        # (Raw strings used for the regexes: "\s" in a plain string is an
        # invalid escape sequence.)
        trs = soup.table.table.findAll("tr", {"class": re.compile(r"(?:odd)|(?:even)")})

        for tr in trs:
            application = PlanningApplication()

            tds = tr.findAll("td")

            application.council_reference = tds[0].a.string.strip()
            application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])

            application.address = tds[2].string.strip()
            application.postcode = getPostcodeFromText(application.address)

            application.description = tds[3].string.strip()

            # The received date and comment form only appear on the info page.
            info_response = urllib2.urlopen(application.info_url)
            info_soup = BeautifulSoup(info_response.read())

            application.date_received = datetime.datetime.strptime(info_soup.find(text=re.compile(r"\s*Received:\s*")).findNext("td").string.strip(), date_format).date()

            application.comment_url = urlparse.urljoin(self.base_url, info_soup.find("input", value="Comment on this application").parent['action'])

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the applications rendered as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class LichfieldParser:
    """Scraper for Lichfield District Council planning applications.

    The council publishes a single current list page, so the
    day/month/year arguments are not used to build the request.
    """

    def __init__(self, *args):
        self.authority_name = "Lichfield District Council"
        self.authority_short_name = "Lichfield"
        self.base_url = "http://www.lichfielddc.gov.uk/site/scripts/planning_list.php"
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Parse every application on the current list page."""
        page = urllib2.urlopen(self.base_url)
        soup = BeautifulSoup.BeautifulSoup(page.read())

        for row in soup.find("table", {"class": "planningtable"}).tbody.findAll("tr"):
            application = PlanningApplication()
            cells = row.findAll("td")

            application.council_reference = cells[0].a.string.strip()
            application.info_url = urlparse.urljoin(self.base_url, cells[0].a["href"])
            # Drop the leading word of the address cell and normalise spaces.
            application.address = " ".join(cells[1].contents[1].strip().split()[1:])
            application.postcode = getPostcodeFromText(application.address)

            # The comment link, received date and description only appear on
            # the info page.
            info_page = urllib2.urlopen(application.info_url)
            info_soup = BeautifulSoup.BeautifulSoup(info_page.read())

            application.description = info_soup.find(text="Proposal").findNext(text=True).strip()

            date_received_str = info_soup.find(text="Date Application Valid").findNext(text=True).split(",")[1].strip()
            # This is a nasty botch, but the easiest way to get a date out of
            # this is to rebuild a string and strptime it.
            better_date_str = "%s %s %s" % date_received_re.match(date_received_str).groups()
            application.date_received = datetime.datetime.strptime(better_date_str, "%d %B %Y").date()
            application.comment_url = info_soup.find("a", title="Comment on this planning application.")["href"]

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the applications rendered as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class WeymouthParser:
    """Scraper for Weymouth and Portland Borough Council.

    The council publishes a last-seven-days page; both the info and
    comment pages are only reachable by POST, so the search page URL is
    recorded for both.
    """

    def __init__(self, *args):
        self.authority_name = "Weymouth and Portland Borough Council"
        self.authority_short_name = "Weymouth and Portland"
        self.base_url = "http://www.weymouth.gov.uk/Planning/applications/newapps.asp"
        self.search_url = "http://www.weymouth.gov.uk/planning/applications/planregister.asp"
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Parse every application on the last-seven-days page."""
        page = urllib2.urlopen(self.base_url)
        soup = BeautifulSoup(page.read())

        results_table = soup.find("table", summary="Planning Applications Received in the last 7 days")

        # Each application is anchored by its "Planning Details" button.
        for details_input in results_table.findAll("input", alt="Planning Details"):
            application = PlanningApplication()

            first_tr = details_input.findPrevious("tr")
            other_trs = first_tr.findNextSiblings("tr", limit=8)

            application.council_reference = first_tr.find("input", {"name": "refval"})['value']
            application.address = other_trs[0].findAll("td")[1].string.strip()
            application.description = other_trs[1].findAll("td")[1].string.strip()
            application.date_received = datetime.datetime.strptime(other_trs[3].findAll("td")[1].string.strip(), date_format).date()

            # Both the info page and the comment page can only be got to by a
            # POST. The best we can do is give the url of the search page.
            application.info_url = application.comment_url = self.search_url

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the applications rendered as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class WychavonParser:
    """Scraper for Wychavon District Council planning applications.

    Searches by month (the site offers no day-level search) and derives a
    comment URL whose suffix depends on the application type.
    """

    def __init__(self, *args):
        self.authority_name = "Wychavon"
        self.authority_short_name = "Wychavon"
        # Currently hard coded--if this address updates, we'll need to scrape
        # the search form to get it each time.
        self.base_url = ("http://www.e-wychavon.org.uk/scripts/plan2005/"
                         "acolnetcgi.exe?ACTION=UNWRAP&WhereDescription=General%20Search&"
                         "Whereclause3=%27%30%31%2F%7BEdtMonthEnd%7D%2F%7BEdtYearEnd%7D%27&"
                         "RIPNAME=Root%2EPages%2EPgeDC%2EPgeListCases")
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Search the council site for the given month's applications."""
        form_data = "EdtYearNo=&EdtCaseNo=&EdtApplicant=&EdtAgent=&EdtLocation"\
            + "=&EdtWard=&EdtMonthStart1=" + str(month) + "&EdtYearStart=" \
            + str(year) + "&EdtMonthEnd=" + str(month) + "&EdtYearEnd="\
            + str(year) + "&submit=Search"

        response = urllib.urlopen(self.base_url, form_data)
        soup = BeautifulSoup(response.read())

        # Comment URL suffix by application type.  These are all the types
        # found, except "Advice - Pre-app/Householder", whose suffix is
        # inconsistent (it could be obtained by scraping each description
        # page).  Unknown types silently degrade to no suffix, which is how
        # the form appears - it's unknown whether the council's backend
        # accepts that.
        suffix_by_apptype = {
            "Householder planning application": "PP",
            "Non-householder planning application": "PN",
            "Outline applications": "OU",
            "Change of use": "CU",
            "Listed Building consent": "LB",
            "Advertisement application": "AA",
            "Certificate of Lawfulness Existing": "LUE",
            "Approval of reserved matters": "VOC",
        }

        # Each set of results has its own table.
        for table in soup.findAll("table", cellpadding="2", cols="4"):
            application = PlanningApplication()
            trs = table.findAll("tr")

            application.council_reference = trs[0].findAll("td")[1].font.font.font.string.strip()
            relative_info_url = trs[0].findAll("td")[1].a['href']
            application.info_url = urlparse.urljoin(self.base_url, relative_info_url)
            application.address = trs[1].findAll("td")[1].font.string.strip()
            application.postcode = getPostcodeFromText(application.address)

            # The description may be absent entirely.
            # (was "== None" - identity comparison is the correct idiom)
            descrip = trs[2].findAll("td")[1].font.string
            application.description = "" if descrip is None else descrip.strip()

            # Renamed from "date_format" to avoid shadowing the module-level
            # date_format used by the other parsers.
            received_date_fmt = "%d/%m/%y"
            date_string = trs[1].findAll("td")[3].font.string.strip()
            application.date_received = datetime.datetime.strptime(date_string, received_date_fmt)

            apptype = trs[0].findAll("td")[3].font.string
            # No apptype can be given (this can happen); leave comment_url
            # unset in that case, as before.
            if apptype is not None:
                apptype = apptype.strip()
                if apptype == "Telecommunications":
                    # Don't know why it's a naked IP rather than sitting on
                    # the same site, but there it is.
                    application.comment_url = "http://81.171.139.151/WAM/createCom"\
                        + "ment.do?action=CreateApplicationComment&applicationType=PLANNI"\
                        + "NG&appNumber=T3/" + application.council_reference + "/TC"
                else:
                    comment_url = "http://81.171.139.151/WAM/createComment.do?acti"\
                        + "on=CreateApplicationComment&applicationType=PLANNING&appNumber"\
                        + "=W/" + application.council_reference + "/"
                    application.comment_url = comment_url + suffix_by_apptype.get(apptype, "")

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the applications rendered as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class EastbourneParser:
    """Scraper for planning applications from Eastbourne Borough Council.

    The site requires a terms-and-conditions acceptance cookie before the
    search form can be posted, so each search performs a small handshake
    first (see getResultsByDayMonthYear).
    """

    def __init__(self, *args):
        self.authority_name = "Eastbourne Borough Council"
        self.authority_short_name = "Eastbourne"
#        self.base_url = "http://www.eastbourne.gov.uk/planningapplications/search.asp"
        # Page that triggers the T&C redirect and hands out the cookie.
        self.first_url = "http://www.eastbourne.gov.uk/planningapplications/index.asp"
        # The actual search endpoint we POST the date range to.
        self.base_url = "http://www.eastbourne.gov.uk/planningapplications/results.asp"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Return a PlanningAuthorityResults holding the applications
        received on the given day.

        NOTE(review): relies on the module-level cookie_jar and date_format
        defined elsewhere in this file.
        """
        search_day = datetime.date(year, month, day)

        # There's going to be some faffing around here. We need a cookie to
        # say we have agreed to some T&Cs.

        # First get the search page - we'll be redirected somewhere else for
        # not having the cookie.
        first_request = urllib2.Request(self.first_url)
        first_response = urllib2.urlopen(first_request)
        cookie_jar.extract_cookies(first_response, first_request)
        first_page_soup = BeautifulSoup.BeautifulSoup(first_response.read())
        first_page_action = urlparse.urljoin(self.first_url, first_page_soup.form['action'])
        the_input = first_page_soup.form.input

        # Post the T&C form's single input back, carrying our cookie, to
        # register acceptance.
        second_page_post_data = urllib.urlencode(
            (
                (the_input['name'], the_input['value']),
            )
        )

        second_request = urllib2.Request(first_page_action, second_page_post_data)
        cookie_jar.add_cookie_header(second_request)
        second_response = urllib2.urlopen(second_request)
        cookie_jar.extract_cookies(second_response, second_request)

        # Now (finally) get the search page.
        # Example of the query this form produces:
        #ApplicationNumber=&AddressPrefix=&Postcode=&CaseOfficer=&WardMember=&DateReceivedStart=31%2F08%2F2008&DateReceivedEnd=31%2F08%2F2008&DateDecidedStart=&DateDecidedEnd=&Locality=&AgentName=&ApplicantName=&ShowDecided=&DecisionLevel=&Sort1=FullAddressPrefix&Sort2=DateReceived+DESC&Submit=Search
        post_data = urllib.urlencode(
            (
                ("ApplicationNumber", ""),
                ("AddressPrefix", ""),
                ("Postcode", ""),
                ("CaseOfficer", ""),
                ("WardMember", ""),
                # Same day for both ends of the range: a single-day search.
                ("DateReceivedStart", search_day.strftime(date_format)),
                ("DateReceivedEnd", search_day.strftime(date_format)),
                ("DateDecidedStart", ""),
                ("DateDecidedEnd", ""),
                ("Locality", ""),
                ("AgentName", ""),
                ("ApplicantName", ""),
                ("ShowDecided", ""),
                ("DecisionLevel", ""),
                ("Sort1", "FullAddressPrefix"),
                ("Sort2", "DateReceived DESC"),
                ("Submit", "Search"),
            )
        )

        search_request = urllib2.Request(self.base_url)
        cookie_jar.add_cookie_header(search_request)
        search_response = urllib2.urlopen(search_request, post_data)

        soup = BeautifulSoup.BeautifulSoup(search_response.read())

        # One "App. No.:" label per application; walk forward from each.
        app_no_strings = soup.findAll(text="App. No.:")

        for app_no_string in app_no_strings:
            application = PlanningApplication()

            application.date_received = search_day

            application.council_reference = app_no_string.findNext("a").string.strip()
            application.info_url = urlparse.urljoin(self.base_url, app_no_string.findNext("a")['href'])

            # The address cell mixes tags and text nodes; keep only the
            # NavigableStrings and join them with spaces.
            application.address = ' '.join([x.strip() for x in app_no_string.findNext(text="Site Address:").findNext("td").contents if type(x) == BeautifulSoup.NavigableString])
            application.postcode = getPostcodeFromText(application.address)

            application.comment_url = urlparse.urljoin(self.base_url, app_no_string.findNext(text="Comment on application").parent['href'])

            application.description = app_no_string.findNext(text="Description:").findNext("td").string.strip()

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Entry point used by the framework: returns the day's results as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class MedwayParser: comment_email_address = "*****@*****.**" def __init__(self, *args): self.authority_name = "Medway Council" self.authority_short_name = "Medway" self.base_url = "http://www.medway.gov.uk/index/environment/planning/planapp/planonline.htm" self._split_base_url = urlparse.urlsplit(self.base_url) self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name) def getResultsByDayMonthYear(self, day, month, year): search_date = datetime.date(year, month, day) search_date_string = search_date.strftime(date_format) "appstat=&decision=&appdec=&ward=&parish=&dadfrom=&dadto=&davfrom=01%2F06%2F2008&davto=02%2F06%2F2008&searchbut=Search" search_data = urllib.urlencode( [("searchtype", "1"), ("appstat", ""), ("decision", ""), ("appdec", ""), ("ward", ""), ("parish", ""), ("dadfrom", ""), ("dadto", ""), ("davfrom", search_date_string), ("davto", search_date_string), ("searchbut", "Search"), ] ) split_search_url = self._split_base_url[:3] + (search_data, '') search_url = urlparse.urlunsplit(split_search_url) response = urllib2.urlopen(search_url) soup = BeautifulSoup(response.read()) results_table = soup.find(text="Application No").parent.parent.parent trs = results_table.findAll("tr")[1:] tr_counter = 0 while tr_counter < len(trs): tr = trs[tr_counter] if tr_counter % 2 == 0: application = PlanningApplication() application.date_received = search_date application.comment_url = self.comment_email_address tds = tr.findAll("td") application.info_url = urlparse.urljoin(self.base_url, tr.a['href']) application.council_reference = tr.a.string.strip() application.address = tds[1].string.strip() application.postcode = getPostcodeFromText(application.address) application.description = tds[2].string.strip() self._results.addApplication(application) tr_counter += 1 return self._results def getResults(self, day, month, year): return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class OcellaParser:
    """Generic scraper for councils running the Ocella/Oracle Portal
    planning system.

    The results table's column layout varies between councils, so the
    column indices are discovered from the header row at scrape time.
    """

    received_date_format = search_date_format

    def __init__(self, authority_name, authority_short_name, base_url, debug=False):
        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url
        self.debug = debug

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

        # These will be used to store the column numbers of the appropriate
        # items in the results table; filled in from the header row in
        # getResultsByDayMonthYear.
        self.reference_col = None
        self.address_col = None
        self.applicant_col = None
        self.description_col = None
        self.received_date_col = None
        self.accepted_date_col = None

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch and parse one day's applications.

        Gets the search form (to discover the post action, session id and
        form name), posts a single-day date range, then parses the results
        table.
        """
        search_date = datetime.date(year, month, day)

        # First get the search page.
        get_request = urllib2.Request(self.base_url)
        get_request.add_header('Accept', 'text/html')
        get_response = urllib2.urlopen(get_request)
        cookie_jar.extract_cookies(get_response, get_request)
        get_soup = BeautifulSoup(get_response.read())

        # We need to find where the post action goes.
        action = get_soup.form['action']

        try:
            session_id = get_soup.find('input', {'name': 'p_session_id'})['value']
        except TypeError:
            # In the case of Middlesbrough, there is no session cookie,
            # but it seems we don't need it...
            session_id = None

        # Unless we retrieve the correct form name, we will simply get the
        # last week's applications.
        submit_tag = (get_soup.find('input', {'value': 'Search'})
                      or get_soup.find('input', {'value': 'Search for Applications'})
                      or get_soup.find('input', {'value': 'Submit'}))
        try:
            submit_name = submit_tag['name']
            form_name = submit_name.split('.')[0]
        except TypeError:
            # No recognisable submit button - fall back to the common name.
            form_name = 'FRM_PLANNING_LIST'

        # Example post data, from Breckland:
        # p_object_name=FRM_WEEKLY_LIST.DEFAULT.SUBMIT_TOP.01
        # p_instance=1
        # p_event_type=ON_CLICK
        # p_user_args=
        # p_session_id=53573
        # p_page_url=http%3A%2F%2Fwplan01.intranet.breckland.gov.uk%3A7778%2Fportal%2Fpage%3F_pageid%3D33%2C30988%26_dad%3Dportal%26_schema%3DPORTAL
        # FRM_WEEKLY_LIST.DEFAULT.START_DATE.01=02-06-2008
        # FRM_WEEKLY_LIST.DEFAULT.END_DATE.01=09-06-2008
        # FRM_WEEKLY_LIST.DEFAULT.PARISH.01=
        post_data = urllib.urlencode(
            [('p_object_name', form_name + '.DEFAULT.SUBMIT_TOP.01'),
             ('p_instance', '1'),
             ('p_event_type', 'ON_CLICK'),
             ('p_user_args', ''),
             ('p_session_id', session_id),
             ('p_page_url', self.base_url),
             (form_name + '.DEFAULT.AGENT.01', ''),
             # Same date at both ends: a single-day search.
             (form_name + '.DEFAULT.START_DATE.01', search_date.strftime(search_date_format)),
             (form_name + '.DEFAULT.END_DATE.01', search_date.strftime(search_date_format)),
             (form_name + '.DEFAULT.PARISH.01', ''),
             ]
        )

        post_request = urllib2.Request(action, post_data)
        cookie_jar.add_cookie_header(post_request)
        post_request.add_header('Accept', 'text/html')
        post_request.add_header('Referer', self.base_url)

        post_response = cookie_handling_opener.open(post_request)
        post_soup = BeautifulSoup(post_response.read())

        results_table = post_soup.find("table", summary="Printing Table Headers")
        trs = results_table.findAll("tr")

        # Use the headings in the first tr to find out which columns the
        # address, description, etc. are in - the layout varies by council.
        for th_index, th in enumerate(trs[0].findAll("th")):
            th_content = th.font.string.strip()
            if th_content in ('Reference', 'Application Ref', 'Application Number'):
                self.reference_col = th_index
            elif th_content == 'Location':
                self.address_col = th_index
            elif th_content == 'Applicant Details':
                self.applicant_col = th_index
            elif th_content in ('Proposal', 'Development Description'):
                self.description_col = th_index
            elif th_content in ('Received Date', 'Date Received'):
                self.received_date_col = th_index
            elif th_content == 'Accepted Date':
                self.accepted_date_col = th_index

        # If there is a received date, we'll use that; otherwise we'll have
        # to settle for the accepted date.
        self.received_date_col = self.received_date_col or self.accepted_date_col

        # We want all the trs except the first one, which is just headers,
        # and the last, which is empty.
        for tr in trs[1:-1]:
            self._current_application = PlanningApplication()
            tds = tr.findAll("td")

            self._current_application.council_reference = (tds[self.reference_col].font.a or tds[self.reference_col].a.font).string.strip()

            # Date formats vary by council: try each known format and stop
            # at the first that parses. (Previously every format was tried
            # with no break, so a later format could overwrite a good parse.)
            date_string = tds[self.received_date_col].font.string.strip()
            for possible_format in possible_date_formats:
                try:
                    self._current_application.date_received = datetime.datetime(*(time.strptime(date_string, possible_format)[0:6]))
                    break
                except ValueError:
                    pass

            self._current_application.address = tds[self.address_col].font.string.strip()
            self._current_application.postcode = getPostcodeFromText(self._current_application.address)
            if self._current_application.postcode is None and self.applicant_col is not None:
                # Won't always be accurate to do this, but better than
                # nothing (needed for Havering).
                self._current_application.postcode = getPostcodeFromText(tds[self.applicant_col].font.string.strip())

            self._current_application.description = tds[self.description_col].font.string.strip()

            # Whether the href is entity-encoded seems to depend on the
            # implementation (e.g. Great Yarmouth encodes it), so decode
            # "&amp;" back to "&" before joining. (The previous
            # replace('&', '&') was a no-op.)
            self._current_application.info_url = urlparse.urljoin(post_response.geturl(), tds[self.reference_col].a['href'].replace('&amp;', '&'))

            # This is what a comment url looks like:
            #http://wplan01.intranet.breckland.gov.uk:7778/pls/portal/PORTAL.wwa_app_module.link?p_arg_names=_moduleid&p_arg_values=8941787057&p_arg_names=_sessionid&p_arg_values=&p_arg_names=APPLICATION_REFERENCE&p_arg_values=3PL%2F2008%2F0877%2FF
            # It seems to be no problem to remove the sessionid (which is in
            # any case blank...), but there's no good way to get the
            # moduleid without fetching the info page, so for the moment we
            # just use the info url, as that seems to work.
            self._current_application.comment_url = self._current_application.info_url

            self._results.addApplication(self._current_application)

        return self._results

    def getResults(self, day, month, year):
        """Entry point used by the framework: returns the day's results as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class FlintshireParser:
    """Scraper for planning applications from Flintshire County Council."""

    def __init__(self, *args):
        self.authority_name = "Flintshire County Council"
        self.authority_short_name = "Flintshire"

        # Some extra query variables have been dropped from this URL; the
        # site is happy without them and no longer needs pagination.
        self.base_url = "http://www.flintshire.gov.uk/webcont/fssplaps.nsf/vwa_Search?searchview&Query=(%%5BfrmDteAppldate%%5D%%20%%3E=%%20%(start_date)s%%20AND%%20%%5BfrmDteAppldate%%5D%%20%%3C=%%20%(end_date)s)"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Return the applications registered on the given day."""
        target_date = datetime.date(year, month, day)

        # The start date is set one day earlier in order to catch the first
        # result of every day at some point - see TODO list.
        search_url = self.base_url % {
            "start_date": (target_date - datetime.timedelta(1)).strftime(date_format),
            "end_date": target_date.strftime(date_format),
        }
        soup = BeautifulSoup(urllib2.urlopen(search_url).read())

        # Each application is stored in a table of its own. For the moment
        # we have to ignore the first result (see TODO list).
        for app_table in soup.findAll("table", border="1")[1:]:
            application = PlanningApplication()

            # It's not clear why this next one isn't the string of the next
            # sibling. This works though!
            application.council_reference = app_table.find(text=re.compile("Reference")).parent.findNextSibling().contents[0]
            application.address = app_table.find(text="Location").parent.findNextSibling().string.strip()
            application.postcode = getPostcodeFromText(application.address)
            application.description = app_table.find(text=re.compile("Description of Proposal")).parent.nextSibling.string.strip()
            application.info_url = urlparse.urljoin(self.base_url, app_table.a['href'])

            # Visit the info page for the OSGB reference and the received
            # date. The Accept-Language header is needed to get UK-style
            # dates back.
            detail_request = urllib2.Request(application.info_url)
            detail_request.add_header("Accept-Language", "en-gb,en")
            detail_soup = BeautifulSoup(urllib2.urlopen(detail_request).read())

            easting_element = detail_soup.find(text="Grid Reference").findNext("td").font
            application.osgb_x = easting_element.string.strip()
            application.osgb_y = easting_element.nextSibling.nextSibling.string.strip()

            valid_date_text = detail_soup.find(text="Date Valid").findNext("td").string.strip()
            application.date_received = datetime.datetime(*(time.strptime(valid_date_text, date_format)[0:6]))

            # There is a link to comment from the info page, though it
            # can't be clicked from here, so point at the info page itself.
            application.comment_url = application.info_url

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Entry point used by the framework: returns the day's results as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class CairngormsParser:
    """Scraper for planning applications from the Cairngorms National Park.

    Uses pycurl rather than urllib2; a single Curl handle is reused for
    every request in a search, switching between POST and GET as needed.
    """

    def __init__(self, *args):
        self.authority_name = "Cairngorms National Park"
        self.authority_short_name = "Cairngorms"
        self.referer = "http://www.cairngorms.co.uk/planning/e-planning/index.php"
        self.base_url = "http://www.cairngorms.co.uk/planning/e-planning/holding.php"
        # The timestamp here looks like the number of milliseconds since 1970
        self.first_post_url = "http://www.cairngorms.co.uk/planning/e-planning/search.php?timeStamp=%d"
        # No online comment facility; commenters are given this address.
        self.comments_email_address = "*****@*****.**"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Return the applications for the given day.

        Two posts are needed: a plain post (normally an AJAX call) that
        returns only the number of matches, then - if non-zero - a
        multipart post that returns the results table.
        """
        search_date = datetime.date(year, month, day)

        # Multipart form fields for the second (results) post.
        post_data = [
            ("CNPA_ref", ""),
            ("application_number", ""),
            ("LA_id", "%"),
            ("applicant_type", "%"),
            ("applicant_name", ""),
            ("development_address", ""),
            ("agent_name", ""),
            ("status", "%"),
            # Start and end are both the search day: a single-day range.
            ("startDay", "%02d" %day),
            ("startMonth", "%02d" %month),
            ("startYear", "%d" %year),
            ("endDay", "%02d" %day),
            ("endMonth", "%02d" %month),
            ("endYear", "%d" %year),
        ]

        # URL-encoded body for the first (count) post.
        first_post_data = "CNPA_ref=&application_number=&applicant_name=&development_address=&agent_name=&applicant_type=%%&LA_id=%%&status=%%&startYear=%(year)d&startMonth=%(month)02d&startDay=%(day)02d&endYear=%(year)d&endMonth=%(month)02d&endDay=%(day)02d" %{"day": day, "month": month, "year": year}

        curlobj = pycurl.Curl()
        curlobj.setopt(pycurl.FOLLOWLOCATION, True)
        curlobj.setopt(pycurl.MAXREDIRS, 10)

        # First we do a normal post; this would happen as an AJAX query
        # from the browser and just returns the number of applications
        # found.

        # This gives us something to use as the write callback.
        fakefile = StringIO.StringIO()

        curlobj.setopt(pycurl.URL, self.first_post_url %(int(time.time()*1000)))
        curlobj.setopt(pycurl.POST, True)
        curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
        curlobj.setopt(pycurl.POSTFIELDS, first_post_data)

        curlobj.perform()

        # The response body is just the match count as text.
        app_count = int(fakefile.getvalue())
        fakefile.close()

        if app_count:
            # Now we do another, multipart, form post to get the actual
            # results.

            # This gives us something to use as the write callback.
            fakefile = StringIO.StringIO()

            curlobj.setopt(pycurl.URL, self.base_url)
            curlobj.setopt(pycurl.HTTPPOST, post_data)
            curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
            curlobj.setopt(pycurl.REFERER, self.referer)

            curlobj.perform()

            soup = BeautifulSoup(fakefile.getvalue())
            # We may as well free up the memory used by fakefile.
            fakefile.close()

            # Skip the header row of the results table.
            for tr in soup.table.findAll("tr")[1:]:
                application = PlanningApplication()
                application.date_received = search_date
                application.comment_url = self.comments_email_address

                tds = tr.findAll("td")

                application.council_reference = tds[1].string.strip()
                application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])
                application.address = tds[2].string.strip()
                application.postcode = getPostcodeFromText(application.address)

                # We're going to need to get the info page in order to get
                # the description. We can't pass a unicode string to pycurl,
                # so we'll have to encode it.
                curlobj.setopt(pycurl.URL, application.info_url.encode())
                # Switch the reused handle back from POST to GET.
                curlobj.setopt(pycurl.HTTPGET, True)

                # This gives us something to use as the write callback.
                fakefile = StringIO.StringIO()
                curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)

                curlobj.perform()

                info_soup = BeautifulSoup(fakefile.getvalue())
                fakefile.close()

                application.description = info_soup.find(text="Development Details").findNext("td").string.strip()
                application.osgb_x = info_soup.find(text="Grid Ref East").findNext("td").string.strip()
                application.osgb_y = info_soup.find(text="Grid Ref North").findNext("td").string.strip()

                self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Entry point used by the framework: returns the day's results as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class AcolnetParser:
    """Base scraper for councils running the Acolnet planning system.

    Subclasses override the class attributes and the _get* hook methods
    where a particular council's markup differs from the common case.
    """

    received_date_label = "Registration Date:"
    received_date_format = "%d/%m/%Y"

    # Query string for the comment page; %s is filled with the app's
    # system key (see _getCommentUrl).
    comment_qs_template = "ACTION=UNWRAP&RIPNAME=Root.PgeCommentForm&TheSystemkey=%s"

    # There is no online comment facility in these, so we provide an
    # appropriate email address instead
    comments_email_address = None

    # The optional amp; is to cope with Oldham, which seems to have started
    # quoting this url.
    action_regex = re.compile("<form[^>]*action=\"([^\"]*ACTION=UNWRAP&(?:amp;)?RIPSESSION=[^\"]*)\"[^>]*>", re.IGNORECASE)

    def _getResultsSections(self, soup):
        """In most cases, there is a table per app."""
        return soup.findAll("table", {"class": "results-table"})

    def _getCouncilReference(self, app_table):
        """Return the council's reference string for this application."""
#        return app_table.findAll("a")[1].string.strip()
        return app_table.a.string.strip()

    def _getDateReceived(self, app_table):
        """Return the registration date as a datetime.date."""
        # Join with no separator to drop any internal whitespace before
        # splitting on '/'.
        date_str = ''.join(app_table.find(text=self.received_date_label).findNext("td").string.strip().split())
        day, month, year = date_str.split('/')
        return date(int(year), int(month), int(day))

        # This will be better from python 2.5
        #return datetime.datetime.strptime(date_str, self.received_date_format)

    def _getAddress(self, app_table):
        """Return the site address for this application."""
        return app_table.find(text="Location:").findNext("td").string.strip()

    def _getDescription(self, app_table):
        """Return the proposal description for this application."""
        return app_table.find(text="Proposal:").findNext("td").string.strip()

    def _getInfoUrl(self, app_table):
        """Returns the info url for this app.

        We also set the system key on self._current_application,
        as we'll need that for the comment url.
        """
        url = app_table.a['href']
        self._current_application.system_key = system_key_regex.search(url).groups()[0]
        # This is the right way to do this, but it doesn't work in Python 2.5 as
        # it doesn't quite implement RFC 3986. This will work fine when we are on
        # Python 2.6
        # info_url = urlparse.urljoin(self.base_url, url)
        # In the meantime, we'll have to work around it. Let's assume url
        # is a query string: keep scheme/netloc/path from base_url and
        # substitute url in as the query component.
        split_base_url = urlparse.urlsplit(self.base_url)
        split_info_url = urlparse.urlsplit(url)
        info_url = urlparse.urlunsplit(split_base_url[:3] + (split_info_url.query,) + split_base_url[4:])
        return info_url

    def _getCommentUrl(self, app_table):
        """This must be run after _getInfoUrl"""
        if self.comments_email_address:
            return self.comments_email_address
        # Reuse the info url but with the comment-form query string.
        split_info_url = urlparse.urlsplit(self._current_application.info_url)
        comment_qs = self.comment_qs_template %self._current_application.system_key
        return urlparse.urlunsplit(split_info_url[:3] + (comment_qs,) + split_info_url[4:])

    def _getWard(self, app_table):
        """Return the ward name for this application."""
        return app_table.findAll("td")[8].string.strip()

    def __init__(self, authority_name, authority_short_name, base_url, debug=False):
        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url
        self.debug = debug

        # This is where we store the results.
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
        # This will store the planning application we are currently working on.
        self._current_application = None

    def _cleanupHTML(self, html):
        """This method should be overridden in subclasses to perform site
        specific HTML cleanup."""
        return html

    def _getSearchResponse(self):
        """Fetch the search form page; overridable for sites that need a
        javascript-redirect/cookie dance first."""
        # It looks like we sometimes need to do some stuff to get around a
        # javascript redirect and cookies.
        search_form_request = urllib2.Request(self.base_url)
        search_form_response = urllib2.urlopen(search_form_request)
        return search_form_response

    def getResultsByDateRange(self, date_from, date_to):
        """Return the applications registered between the two dates
        (inclusive) as a PlanningAuthorityResults."""
        # First we fetch the search page to get ourselves some session info...
        search_form_response = self._getSearchResponse()
        search_form_contents = search_form_response.read()

        # This sometimes causes a problem in HTMLParser, so let's just get
        # the link out with a regex...
        groups = self.action_regex.search(search_form_contents).groups()
        action = groups[0]
        #print action

        # This is to handle the amp; which seems to have appeared in this
        # url on the Oldham site
        action = ''.join(action.split('amp;'))
        action_url = urlparse.urljoin(self.base_url, action)
        #print action_url

        search_data = {"regdate1": date_from.strftime(date_format),
                       "regdate2": date_to.strftime(date_format),
                       }

        # The search form wants a multipart post.
        opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
        response = opener.open(action_url, search_data)
        results_html = response.read()

        # This is for doing site specific html cleanup
        results_html = self._cleanupHTML(results_html)

        #some javascript garbage in the header upsets HTMLParser,
        #so we'll just have the body
        just_body = "<html>" + end_head_regex.split(results_html)[-1]

        #self.feed(just_body)
        soup = BeautifulSoup(just_body, convertEntities=BeautifulSoup.ALL_ENTITIES)

        # Each app is in a table of it's own.
        results_tables = self._getResultsSections(soup)

        for app_table in results_tables:
            self._current_application = PlanningApplication()
            self._current_application.council_reference = self._getCouncilReference(app_table)
            self._current_application.address = self._getAddress(app_table)

            # Get the postcode from the address
            self._current_application.postcode = getPostcodeFromText(self._current_application.address)

            self._current_application.description = self._getDescription(app_table)
            # _getInfoUrl must run before _getCommentUrl (it stores the
            # system key on the current application).
            self._current_application.info_url = self._getInfoUrl(app_table)
            self._current_application.comment_url = self._getCommentUrl(app_table)
            self._current_application.date_received = self._getDateReceived(app_table)
            self._current_application.ward_name = self._getWard(app_table)

            self._results.addApplication(self._current_application)

        return self._results

    def getResultsByDayMonthYear(self, day, month, year):
        """Convenience wrapper: search a single day."""
        our_date = date(year, month, day)
        return self.getResultsByDateRange(our_date, our_date)

    def getResults(self, day, month, year):
        """Entry point used by the framework: returns the day's results as XML."""
        results = self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
        # import pdb;pdb.set_trace()
        return results
class WandsworthParser:
    """Scraper for planning applications from the London Borough of
    Wandsworth.

    The search POST redirects to a results page; with the XSL-template
    parameter stripped from the redirect URL, the page embeds detailed XML
    which is parsed with BeautifulStoneSoup.
    """

    def __init__(self, *args):
        self.authority_name = "London Borough of Wandsworth"
        self.authority_short_name = "Wandsworth"
        self.base_url = "http://www.wandsworth.gov.uk/gis/search/Search.aspx"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Return the applications received on the given day."""
        search_day = datetime.date(year, month, day)
        formatted_search_day = search_day.strftime("%d-%m-%Y")

        post_data = urllib.urlencode([
            ("__EVENTTARGET", ""),
            ("__EVENTARGUMENT", ""),
            # Ask for "all" records so we never have to paginate.
            ("cboNumRecs", "99999"),
            ("cmdSearch", "Search"),
            ("drReceived:txtStart", formatted_search_day),
            ("drReceived:txtEnd", formatted_search_day)
        ])
        response = urllib2.urlopen(self.base_url, post_data)

        # Modify the redirect response URL to remove the XSL template param
        # so we get more detailed XML embedded in HTML instead.
        redirect_url = response.geturl()
        redirect_url = re.sub("&XSLTemplate=xslt/Results.xslt", "", redirect_url)
        results_response = urllib2.urlopen(redirect_url)

        try:
            soup = BeautifulSoup(results_response.read())
            # Get the XML content contained in the HTML doc.
            td = soup.find("td", colspan="3")
            xml = str(td.contents[2])
            xml_soup = BeautifulStoneSoup(xml)
        except Exception:
            # Best effort: if the page has no results markup (e.g. a day
            # with no applications), return what we have so far. The
            # previous bare "except:" also swallowed SystemExit and
            # KeyboardInterrupt; this one does not.
            return self._results

        for entry in xml_soup.findAll('internet_web_search'):
            application = PlanningApplication()

            primary_key = entry.find('primary_key').renderContents()
            # Distinguishes planning applications from building control ones.
            kind = entry.find('object_id').renderContents()

            application.council_reference = entry.find('application_number').renderContents()
            application.comment_url = "http://www.wandsworth.gov.uk/apply/createComment.do?action=CreateApplicationComment&appNumber=%s" \
                % urllib.quote(application.council_reference)

            # The received date arrives as an ISO timestamp; keep the date part.
            str_date_received = entry.find('received_date').renderContents()[0:10]
            date_received = datetime.datetime.strptime(str_date_received, "%Y-%m-%d")
            application.date_received = date_received

            application.address = entry.find('site_address').renderContents()
            application.description = entry.find('development_description').renderContents()

            # We need to make another request to get postcode details.
            details_url = "http://www.wandsworth.gov.uk/gis/search/StdDetails.aspx?"
            if kind == 'PLANNINGAPPLICATION':
                application.info_url = "http://www.wandsworth.gov.uk/apply/showCaseFile.do?appNumber=%s" \
                    % urllib.quote(application.council_reference)
                details_url = details_url + urllib.urlencode([
                    ("PT", "Planning Application Details"),
                    ("TYPE", "WBCPLANNINGREF"),
                    ("PARAM0", primary_key),
                    ("XSLT", "xslt/planningdetails.xslt"),
                    ("DAURI", "PLANNING")
                ])
            else:
                # Building control application: the details page is also
                # the best info page we have.
                details_url = details_url + urllib.urlencode([
                    ("PT", "Building Control Application Details"),
                    ("TYPE", "WBCBuildingControlREF"),
                    ("PARAM0", primary_key),
                    ("XSLT", "xslt/bcdetails.xslt"),
                    ("DAURI", "PLANNING")
                ])
                application.info_url = details_url

            details_response = urllib2.urlopen(details_url)
            details_soup = BeautifulSoup(details_response.read())
            # The postcode sits in a fixed row of the details table.
            postcode_row = details_soup.find('table', "bodytextsmall").findAll('tr')[5]
            postcode_cell = postcode_row.find('td', "searchinput")
            if postcode_cell.string:
                application.postcode = getPostcodeFromText(postcode_cell.string.strip())

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Entry point used by the framework: returns the day's results as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class PlanningExplorerParser:
    """Base scraper for authorities running the "Planning Explorer" software.

    Subclasses customise behaviour by overriding the class attributes and
    hook methods below (paths, table layout, headers, post data, sanitisers)
    rather than reimplementing the fetch/parse loop.
    """

    # If this authority doesn't have a comments page,
    # then set this email_address to an address for the
    # planning department, and it will be used in lieu of
    # a comments url.
    comments_email_address = None

    # These are the directories where the info urls, and search urls,
    # usually live underneath the base_url.
    # If these are different for a particular
    # authority, then they can be overridden in a subclass.
    info_url_path = "MVM/Online/Generic/"
    search_url_path = "MVM/Online/PL/GeneralSearch.aspx"

    # This is the most common place for comments urls to live
    # The %s will be filled in with an application code
    comments_path = "MVM/Online/PL/PLComments.aspx?pk=%s"

    # Most authorities don't need the referer header on the post
    # request. If one does, override this in the subclass
    use_referer = False

    # Some authorities won't give us anything back if we use the
    # python urllib2 useragent string. In that case, override this
    # in a subclass to pretend to be firefox.
    use_firefox_user_agent = False

    # This is the most common css class of the table containing the
    # the search results. If it is different for a particular authority
    # it can be overridden in a subclass
    results_table_attrs = {"class": "ResultsTable"}

    # These are the most common column positions for the
    # council reference, the address, and the description
    # in the results table.
    # They should be overridden in subclasses if they are different
    # for a particular authority.
    reference_td_no = 0
    address_td_no = 1
    description_td_no = 2

    # In some cases we won't be able to get the full address/description/postcode without getting the info page for each app.
    # If fetch_info_page is set to true, then we need to get a copy of the info page and store it as an attribute on current_application (naughty!)
    fetch_info_page = False

    # Matches hidden ASP.NET state inputs (names like __VIEWSTATE) so their
    # name/value pairs can be replayed in the search POST.
    asp_args_regex = re.compile('<input[^>]*name=\"(__[A-Z]*)\"[^>]*value=\"([^\"]*)\"[^>]*>')

    def _modify_response(self, response):
        """For most sites, we have managed to get all the apps on a
        single page by choosing the right parameters.
        If that hasn't been possible, override this method to get a
        new response object which has all the apps in one page.
        (See, for example, Hackney).
        """
        return response

    def _find_trs(self, results_table):
        """Normally, we just want a list of all the trs except the first one
        (which is usually a header).
        If the authority requires a different list of trs, override this method.
        """
        return results_table.findAll("tr")[1:]

    def _sanitisePostHtml(self, html):
        """This method can be overriden in subclasses if the
        html that comes back from the post request is bad, and
        needs tidying up before giving it to BeautifulSoup."""
        return html

    def _sanitiseInfoUrl(self, url):
        """If an authority has info urls which are for some reason full
        of crap (like Broadland does), then this method should be overridden
        in order to tidy them up."""
        # Default tidy-up: strip all whitespace from the url.
        return ''.join(url.split())

    def _getHeaders(self):
        """If the authority requires any headers for the post request,
        override this method returning a dictionary of header key to
        header value."""
        headers = {}

        if self.use_firefox_user_agent:
            headers["User-Agent"] = "Mozilla/5.0 (X11; U; Linux i686; en-GB; rv:1.8.1.10) Gecko/20071126 Ubuntu/7.10 (gutsy) Firefox/2.0.0.10"

        if self.use_referer:
            headers["Referer"] = self.search_url

        return headers

    def _getPostData(self, asp_args, search_date):
        """Accepts asp_args (a tuple of key value pairs of the pesky ASP
        parameters, and search_date, a datetime.date object for the day
        we are searching for.

        This seems to be the most common set of post data which is needed
        for PlanningExplorer sites. It won't work for all of them, so
        will sometimes need to be overridden in a subclass.

        The parameter edrDateSelection is often not needed.
        It is needed by Charnwood though, so I've left it in
        to keep things simple.
        """
        year_month_day = search_date.timetuple()[:3]

        post_data = urllib.urlencode(asp_args + (
                ("_ctl0", "DATE_REGISTERED"),
                ("rbGroup", "_ctl5"),
                ("_ctl7_hidden", urllib.quote('<DateChooser Value="%d%%2C%d%%2C%d"><ExpandEffects></ExpandEffects></DateChooser>' %year_month_day)),
                ("_ctl8_hidden", urllib.quote('<DateChooser Value="%d%%2C%d%%2C%d"><ExpandEffects></ExpandEffects></DateChooser>' %year_month_day)),
                ("edrDateSelection", "1"),
                ("csbtnSearch", "Search"),
                ("cboNumRecs", "99999"),
                ))

        return post_data

    def _getAddress(self, tds, info_soup):
        """Return the address string from the results-table cells.

        info_soup is unused here but kept in the signature so subclasses
        that need the info page can use it.
        """
        # If this td contains a div, then the address is the
        # string in there - otherwise, use the string in the td.
        address_td = tds[self.address_td_no]
        if address_td.div is not None:
            address = address_td.div.string
        else:
            address = address_td.string

        return address

    def _getPostCode(self, info_soup):
        """In most cases, the postcode can be got from the address in
        the results table. Some councils put the address there without the
        postcode. In this case we will have to go to the info page to get
        the postcode. This should be done by overriding this method with
        one that parses the info page."""
        return getPostcodeFromText(self._current_application.address)

    def _getDescription(self, tds, info_soup):
        """Return the proposal description from the results-table cells."""
        description_td = tds[self.description_td_no]

        if description_td.div is not None:
            # Mostly this is in a div
            # Use the empty string if the description is missing
            description = description_td.div.string or ""
        else:
            # But sometimes (eg Crewe) it is directly in the td.
            # Use the empty string if the description is missing
            description = description_td.string or ""

        return description

    def __init__(self, authority_name, authority_short_name, base_url, debug=False):
        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url

        # Derived urls for the search form and the per-application info pages.
        self.search_url = urlparse.urljoin(base_url, self.search_url_path)
        self.info_url_base = urlparse.urljoin(self.base_url, self.info_url_path)

        self.debug = debug

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """GET the search page for session state, POST the search, then parse
        each row of the results table into a PlanningApplication.

        Relies on module-level `cookie_jar` and `app_code_regex` (defined
        elsewhere in this file).
        """
        search_date = datetime.date(year, month, day)

        # First do a get, to get some state
        get_request = urllib2.Request(self.search_url)
        get_response = urllib2.urlopen(get_request)

        cookie_jar.extract_cookies(get_response, get_request)

        html = get_response.read()

        # We need to find those ASP parameters such as __VIEWSTATE
        # so we can use them in the next POST

        # re.findall gets us a list of key value pairs.
        # We want to concatenate it with a tuple, so we must
        # make it a tuple
        asp_args = tuple(re.findall(self.asp_args_regex, html))

        # The post data needs to be different for different councils
        # so we have a method on each council's scraper to make it.
        post_data = self._getPostData(asp_args, search_date)

        headers = self._getHeaders()

        request = urllib2.Request(self.search_url, post_data, headers)
        cookie_jar.add_cookie_header(request)
        post_response = urllib2.urlopen(request)

        # We have actually been returned here by an http302 object
        # moved, and the response called post_response is really a get.

        # In some cases, we can't get the page size set high
        # until now. In that case, override _modify_response
        # so that we get back a response with all the apps on one page.
        # We pass in headers so that any
        post_response = self._modify_response(post_response)

        html = self._sanitisePostHtml(post_response.read())

        soup = BeautifulSoup(html)

        results_table = soup.find("table", attrs=self.results_table_attrs)

        # If there is no results table, then there were no apps on that day.
        if results_table:
            trs = self._find_trs(results_table)

            self._current_application = None

            # The first tr is just titles, cycle through the trs after that
            for tr in trs:
                self._current_application = PlanningApplication()

                # There is no need to search for the date_received, it's what
                # we searched for
                self._current_application.date_received = search_date

                tds = tr.findAll("td")

                self._current_application.council_reference = tds[self.reference_td_no].a.string
                relative_info_url = self._sanitiseInfoUrl(tds[self.reference_td_no].a['href'])
                self._current_application.info_url = urlparse.urljoin(self.info_url_base, relative_info_url)

                # Fetch the info page if we need it, otherwise set it to None
                if self.fetch_info_page:
                    # We need to quote the spaces in the info url
                    info_request = urllib2.Request(urllib.quote(self._current_application.info_url, ":/&?="))

                    info_soup = BeautifulSoup(urllib2.urlopen(info_request))
                else:
                    info_soup = None

                # What about a comment url?
                # There doesn't seem to be one, so we'll use the email address
                if self.comments_email_address is not None:
                    # We're using the email address, as there doesn't seem
                    # to be a web form for comments
                    self._current_application.comment_url = self.comments_email_address
                else:
                    # This link contains a code which we need for the comments url
                    # (on those sites that use it)
                    application_code = app_code_regex.search(relative_info_url).groups()[0]

                    relative_comments_url = self.comments_path %(application_code)
                    self._current_application.comment_url = urlparse.urljoin(self.base_url, relative_comments_url)

                self._current_application.address = self._getAddress(tds, info_soup)
                self._current_application.postcode = self._getPostCode(info_soup)
                self._current_application.description = self._getDescription(tds, info_soup)

                self._results.addApplication(self._current_application)

        return self._results

    def getResults(self, day, month, year):
        # String-argument wrapper returning the XML rendering of results.
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class RutlandLikeParser:
    """Scraper for authorities whose planning search works like Rutland's:
    a single form POST returns one results table for the requested date."""

    def __init__(self, authority_name, authority_short_name, base_url, debug=False):
        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url
        self.debug = debug

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """POST a same-day date-range search and collect every application
        row from the first results table."""
        search_date = datetime.date(year, month, day)
        day_string = search_date.strftime(date_format)

        form_fields = urllib.urlencode({"reference": "",
                                        "undecided": "yes",
                                        "dateFrom": day_string,
                                        "dateTo": day_string,
                                        "Address": "",
                                        "validate": "true",
                                        })

        page = urllib2.urlopen(urllib2.Request(self.base_url, form_fields)).read()
        soup = BeautifulSoup(page)

        # The results live in a table styled "width:auto;". No such table
        # means no applications for this day.
        result_tables = soup.findAll("table", {"style": "width:auto;"})
        if not result_tables:
            return self._results

        # We don't want the first or last tr (header/footer rows).
        for row in result_tables[0].findAll("tr")[1:-1]:
            cells = row.findAll("td")

            # Only genuine application rows have exactly four cells.
            if len(cells) != 4:
                continue

            application = PlanningApplication()

            application.info_url = urlparse.urljoin(self.base_url, cells[0].a['href'])
            application.council_reference = cells[0].a.string
            application.address = cells[1].string
            application.postcode = getPostcodeFromText(application.address)
            application.description = cells[2].string
            application.comment_url = urlparse.urljoin(self.base_url, comment_url_end %application.council_reference)
            application.date_received = search_date

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """String-argument wrapper returning the XML rendering of results."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class BroxtoweParser:
    """Scraper for Broxtowe Borough Council's weekly-list planning search.

    The site is ASP.NET: the initial GET supplies default form values which
    must be echoed back, and pagination is driven by replaying __VIEWSTATE /
    __EVENTVALIDATION with an __EVENTTARGET of the "next" link.
    """

    def __init__(self, *args):
        # *args is accepted (and ignored) so all parser classes share a
        # uniform construction interface.
        self.authority_name = "Broxtowe Borough Council"
        self.authority_short_name = "Broxtowe"
        self.base_url = "http://planning.broxtowe.gov.uk"
        # %s will be filled with the url-quoted council reference.
        self.info_url = "http://planning.broxtowe.gov.uk/ApplicationDetail.aspx?RefVal=%s"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch the weekly list containing the given date and parse every
        application table on each results page.

        Uses module-level `date_format` for both the form value and the
        received-date parsing.
        """
        search_day = datetime.date(year, month, day)

        # Now get the search page
        get_response = urllib2.urlopen(self.base_url)
        get_soup = BeautifulSoup(get_response.read())

        # These are the inputs with a default value
        # NOTE(review): keyed on 'id' rather than 'name' — works here
        # presumably because the two match on this site; verify if the
        # search stops returning results.
        inputs_needed = [(x['id'], x['value']) for x in get_soup.form.findAll("input", value=True, type=lambda x: x != "submit")]

        # Add the submit button
        inputs_needed.append(('cmdWeeklyList', 'Search Database'))

        # We also need to add the date we want to search for.
        # This is the friday after the date searched for.
        # At weekends this will get you the friday before, but that isn't
        # a problem as there are no apps then.
        friday = search_day + datetime.timedelta(4 - search_day.weekday())
        inputs_needed.append(("ddlWeeklyList", friday.strftime(date_format)))

        # We'd like as many results as we can get away with on one page.
        # 50 is the largest option offerend
        inputs_needed.append(("ddlResultsPerPageWeeklyList", "50"))

        post_data = dict(inputs_needed)

        post_url = get_response.url

        # In case something goes wrong here, let's break out of the loop after at most 10 passes
        # NOTE(review): the check below is `passes > 10`, so up to 11 passes
        # can actually run — harmless, but off by one against this comment.
        passes = 0
        while True:
            passes += 1

            post_response = urllib2.urlopen(post_url, urllib.urlencode(post_data))
            post_soup = BeautifulSoup(post_response.read())

            # Each application sits in its own nested table inside the
            # outermost results table.
            result_tables = post_soup.table.findAll("table")

            for result_table in result_tables:
                application = PlanningApplication()

                # The address is the bold text immediately preceding the
                # table, with lines separated by carriage returns.
                application.address = ', '.join(result_table.findPrevious("b").string.strip().split("\r"))
                application.postcode = getPostcodeFromText(application.address)

                trs = result_table.findAll("tr")

                # Fixed row layout: row 0 reference, row 1 received date,
                # row 3 description.
                application.council_reference = trs[0].findAll("td")[1].string.strip()
                application.date_received = datetime.datetime.strptime(trs[1].findAll("td")[1].string.strip(), date_format).date()
                application.description = trs[3].findAll("td")[1].string.strip()

                application.info_url = self.info_url %(urllib.quote(application.council_reference))

                # In order to avoid having to do a download for every app,
                # I'm setting the comment url to be the same as the info_url.
                # There is a comment page which can be got to by pressing the button
                application.comment_url = application.info_url

                self._results.addApplication(application)

            # Which page are we on?
            page_no = int(post_soup.find("span", id="lblPageNo").b.string)
            total_pages = int(post_soup.find("span", id="lblTotalPages").b.string)

            if passes > 10 or not page_no < total_pages:
                break

            # Replay the ASP.NET state to trigger the "next page" postback.
            post_data = [
                ("__EVENTTARGET", "hlbNext"),
                ("__EVENTARGUMENT", ""),
                ("__VIEWSTATE", post_soup.find("input", id="__VIEWSTATE")['value']),
                ("__EVENTVALIDATION", post_soup.find("input", id="__EVENTVALIDATION")['value']),
                ]

            post_url = urlparse.urljoin(post_response.url, post_soup.find("form")['action'])

        return self._results

    def getResults(self, day, month, year):
        # String-argument wrapper returning the XML rendering of results.
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class ShropshireParser:
    """Scraper for Shropshire-style planning searches.

    The search is a plain GET with the form fields in the query string.
    Subclasses can override the class attributes and the two hook methods
    to cope with sites that differ slightly (e.g. validated vs received
    date searches, different postcode location).
    """

    # Name of the hidden input on the info page holding the council reference.
    reference_input_name = "ApplNum"
    # Name of the input holding the planning department's contact email,
    # used as the comment "url" when there is no comments page.
    contact_email_name = "offemail"
    # If set in a subclass, used verbatim as the comment url for every app.
    comment_url = None
    # If True, search on the validated date instead of the received date.
    use_validated_date = False

    def _get_info_link_list(self, soup):
        # One link per application; the last row of the table is not a result.
        return [tr.a for tr in soup.find("table", id="tbllist").findAll("tr", recursive=False)[:-1]]

    def _get_postcode(self, info_soup):
        # The postcode is a form input value on the info page.
        return info_soup.find("input", {"name": "Postcode"})['value']

    def __init__(self, authority_name, authority_short_name, base_url, debug=False):
        self.debug = debug

        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url
        # Pre-split so the query string can be swapped in per search.
        self._split_base_url = urlparse.urlsplit(base_url)

        self._current_application = None

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Search for applications on the given date and scrape each one's
        info page for reference, address, postcode, description, contact
        and OSGB easting/northing.

        Uses module-level `date_format` for the search date string.
        """
        search_date = datetime.date(year, month, day)
        search_date_string = search_date.strftime(date_format)

        # Fill either the received-date range or the validated-date range,
        # depending on what this authority's search supports.
        if self.use_validated_date:
            received_search_string = ""
            validated_search_string = search_date_string
        else:
            received_search_string = search_date_string
            validated_search_string = ""

        search_data = urllib.urlencode([
                ("txtAppNum", ""),
                ("txtAppName", ""),
                ("txtAppLocn", ""),
                ("txtAppPCode", ""),
                ("txtAppRecFrom", received_search_string),
                ("txtAppRecTo", received_search_string),
                ("txtAppDecFrom", ""),
                ("txtAppDecTo", ""),
                ("txtAppValFrom", validated_search_string),
                ("txtAppValTo", validated_search_string),
                ("district_drop", ""),
                ("parish_drop", ""),
                ("ward_drop", ""),
                ("ft", "yes"),
                ("submit1", "Submit"),
                ])

        # Splice the encoded form data in as the query string (GET search).
        split_search_url = self._split_base_url[:3] + (search_data, '')
        search_url = urlparse.urlunsplit(split_search_url)

        response = urllib2.urlopen(search_url)
        soup = BeautifulSoup(response.read())

        # Handle the case where there are no apps
        if soup.find(text=re.compile("No applications matched your query")):
            return self._results

        info_link_list = self._get_info_link_list(soup)

        for app_link in info_link_list:
            self._current_application = PlanningApplication()

            # We could get this from the info soup, but as we already know it, why bother.
            self._current_application.date_received = search_date

            self._current_application.info_url = urlparse.urljoin(self.base_url, app_link['href'])

            # To get the postcode we will need to download each info page
            info_response = urllib2.urlopen(self._current_application.info_url)
            info_soup = BeautifulSoup(info_response.read())

            self._current_application.council_reference = info_soup.find("input", {"name": self.reference_input_name})['value']
            self._current_application.address = info_soup.find("textarea", {"name": "Location"}).string.strip()
            self._current_application.postcode = self._get_postcode(info_soup)
            self._current_application.description = info_soup.find("textarea", {"name": "Proposal"}).string.strip()

            if self.comment_url:
                self._current_application.comment_url = self.comment_url
            else:
                # Fall back to the department's contact email address.
                self._current_application.comment_url = info_soup.find("input", {"name": self.contact_email_name})['value']

            # There is an OSGB position here :-)
            self._current_application.osgb_x = info_soup.find("input", {"name": "Easting"})['value']
            self._current_application.osgb_y = info_soup.find("input", {"name": "Northing"})['value']

            self._results.addApplication(self._current_application)

        return self._results

    def getResults(self, day, month, year):
        # String-argument wrapper returning the XML rendering of results.
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class WestminsterParser:
    """Scraper for the City of Westminster planning search.

    Results are paginated; each page contains a form named
    "currentsearchresultsNext" whose hidden inputs are the POST data for the
    next page. When that form has no inputs, the urlencoded string is empty
    (falsy) and the loop ends.
    """

    def __init__(self, *args):
        # *args is accepted (and ignored) so all parser classes share a
        # uniform construction interface.
        self.authority_name = "City of Westminster"
        self.authority_short_name = "Westminster"
        self.base_url = "http://www3.westminster.gov.uk/planningapplications/currentsearch-results.cfm"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch every results page for the given received date, following
        the per-app info page to find the comment url.

        Uses module-level `date_format` for the search date string.
        (Removed: a long block of commented-out debug writes and an unused
        post_data literal that duplicated the string below.)
        """
        search_day = datetime.date(year, month, day)

        # The search POST body, with the same date as both ends of the
        # received-date range. Kept as a pre-encoded string to match the
        # site's expected field order exactly.
        post_data = "REFNO=&STName=&STNUMB=&ADRSNO=&WARD=AllWards&AGT=&ATCDE=AllApps&DECDE=AllDecs&DTErec=%(date)s&DTErecTo=%(date)s&DTEvalid=&DTEvalidTo=&APDECDE=AllAppDecs&submit=Start+Search" %{"date": search_day.strftime(date_format)}

        while post_data:

            # Now get the search page
            response = urllib2.urlopen(self.base_url, post_data)
            soup = BeautifulSoup(response.read())

            results_form = soup.find("form", {"name": "currentsearchresultsNext"})

            # Sort out the post_data for the next page, if there is one
            # If there is no next page then there will be no inputs in the form.
            # In this case, post_data will be '', which is false.
            post_data = urllib.urlencode([(x['name'], x['value']) for x in results_form.findAll("input")])

            # Each result has one link, and they are the only links in the form
            links = results_form.findAll("a")

            for link in links:
                application = PlanningApplication()

                application.date_received = search_day
                application.info_url = urlparse.urljoin(self.base_url, link['href'])
                application.council_reference = link.string.strip()
                application.address = link.findNext("td").string.strip()
                application.postcode = getPostcodeFromText(application.address)
                application.description = link.findNext("tr").findAll("td")[-1].string.strip()

                # To get the comment url, we're going to have to go to each info url :-(
                info_response = urllib2.urlopen(application.info_url)
                info_soup = BeautifulSoup(info_response)

                comment_nav_string = info_soup.find(text="Comment on this case")
                if comment_nav_string:
                    application.comment_url = comment_nav_string.parent['href']
                else:
                    # e.g. http://publicaccess.westminster.gov.uk/publicaccess/tdc/dcapplication/application_comments_entryform.aspx?caseno=K586GHRP03500
                    application.comment_url = "No Comments"

                self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """String-argument wrapper returning the XML rendering of results."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class ShetlandParser:
    """Scraper for Shetland Islands Council.

    The site lists the most recent applications, newest first, ten per page,
    addressed by an `offset` query parameter. We page through until we reach
    applications older than the search date.
    """

    def __init__(self, *args):
        # *args is accepted (and ignored) so all parser classes share a
        # uniform construction interface.
        self.authority_name = "Shetland Islands Council"
        self.authority_short_name = "Shetland Islands"
        # %d is the zero-based record offset (pages are ten records long).
        self.base_url = "http://www.shetland.gov.uk/planningcontrol/apps/apps.asp?time=14&Orderby=DESC&parish=All&Pref=&Address=&Applicant=&ApplicantBut=View&sortby=PlanRef&offset=%d"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Page through the descending-date listing collecting applications
        received on or after the given date.

        Uses module-level `page_count_regex` (total record count) and
        `date_format` (listing date format).
        """
        search_date = datetime.datetime(year, month, day)

        offset = 0

        # First get the search page
        response = urllib2.urlopen(self.base_url %(offset))

        contents = response.read()

        # First let's find out how many records there are (they are displayed ten per page).
        match = page_count_regex.search(contents)
        app_count = int(match.groups()[0])

        while offset < app_count:
            if offset != 0:
                contents = urllib2.urlopen(self.base_url %(offset)).read()

            soup = BeautifulSoup(contents)

            # The apps are in the 5th table on the page (not a very good way to get it...)
            results_table = soup.findAll("table")[5]

            # Now we need to find the trs which contain the apps.
            # The first TR is just headers.
            # After that they alternate between containing an app and just some display graphics
            # until the third from last. After that, they contain more rubbish.
            trs = results_table.findAll("tr")[1:-2]

            for i in range(len(trs)):
                # We are only interested in the trs in even positions in the list.
                if i % 2 == 0:
                    tr = trs[i]

                    application = PlanningApplication()

                    # The "comment" link's neighbouring cell carries the
                    # received date for the row.
                    comment_url_element = tr.find(text="comment on this planning application").parent

                    application.date_received = datetime.datetime(*(time.strptime(comment_url_element.findNext("td").string.strip(), date_format)[0:6]))

                    # If the date of this application is earlier than the date
                    # we are searching for then don't download it.
                    # We could optimize this a bit more by not doing the later pages.
                    # NOTE(review): this break only exits the inner row loop;
                    # the outer while still fetches the remaining pages.
                    if application.date_received < search_date:
                        break

                    application.council_reference = tr.a.string
                    application.comment_url = urlparse.urljoin(self.base_url, comment_url_element['href'])
                    application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])

                    # The description, postcode and address come from each
                    # application's info page.
                    info_response = urllib2.urlopen(application.info_url)
                    info_soup = BeautifulSoup(info_response.read())

                    info_table = info_soup.findAll("table")[2]

                    application.description = info_table.find(text="Proposal:").findNext("td").contents[0].strip()
                    application.postcode = info_table.find(text="Postcode:").findNext("td").contents[0].strip()

                    # Now to get the address. This will be split across several tds.
                    address_start_td = info_table.find("td", rowspan="4")

                    # We need the first bit of the address from this tr
                    address_bits = [address_start_td.findNext("td").string.strip()]

                    # We will need the first td from the next three trs after this
                    for address_tr in address_start_td.findAllNext("tr")[:3]:
                        address_line = address_tr.td.string.strip()

                        if address_line:
                            address_bits.append(address_line)

                    address_bits.append(application.postcode)

                    application.address = ', '.join(address_bits)

                    self._results.addApplication(application)

            offset += 10

        return self._results

    def getResults(self, day, month, year):
        # String-argument wrapper returning the XML rendering of results.
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()