Example #1
class BerwickParser:
    comments_email_address = "*****@*****.**"

    def __init__(self, *args):

        self.authority_name = "Berwick-upon-Tweed Borough Council"
        self.authority_short_name = "Berwick"
        self.base_url = "http://www.berwick-upon-tweed.gov.uk/planning/register/wl/%s.htm"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        search_day = datetime.date(year, month, day)

        monday_before = search_day - datetime.timedelta(search_day.weekday())

        thursday = monday_before + datetime.timedelta(3)
        if search_day.weekday() > 3: # i.e. it is Friday, Saturday, or Sunday
            # We need to add a week
            thursday = thursday + datetime.timedelta(7)

        this_url = self.base_url %(thursday.strftime(search_date_format))
        # Now get the search page
        response = urllib2.urlopen(this_url)
        soup = BeautifulSoup(response.read())

        # Each app is stored in a table of its own. The tables don't have
        # any useful attributes, so we'll find all the NavigableString objects
        # which look like " Application Number:" and then look at the
        # tables they are in.

        nav_strings = soup.findAll(text=" Application Number:")

        for nav_string in nav_strings:
            application = PlanningApplication()

            application.council_reference = nav_string.findNext("p").string.strip()

            result_table = nav_string.findPrevious("table")

            application.date_received = datetime.datetime.strptime(result_table.find(text=" Registration Date: ").findNext("p").contents[0].strip(), reg_date_format)

            application.osgb_x = result_table.find(text=" Easting:").findNext("p").string.strip()
            application.osgb_y = result_table.find(text=" Northing:").findNext("p").string.strip()

            application.description = result_table.find(text=" Proposed Development:").findNext("p").string.strip()
            application.address = result_table.find(text=" Location:").findNext("p").string.strip()
            application.postcode = getPostcodeFromText(application.address)

            application.info_url = this_url

            application.comment_url = self.comments_email_address

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
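
# A worked illustration of the week arithmetic above (dates chosen just for
# this sketch): the weekly list is keyed on the Thursday of the week containing
# the search day, and Friday-to-Sunday searches roll forward a week.
import datetime

example_day = datetime.date(2008, 9, 5)           # a Friday, weekday() == 4
monday_before = example_day - datetime.timedelta(example_day.weekday())
thursday = monday_before + datetime.timedelta(3)
if example_day.weekday() > 3:
    thursday += datetime.timedelta(7)
print thursday                                    # -> 2008-09-11
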
class CarmarthenshireParser:
    def __init__(self, *args):
        self.comments_email_address = "*****@*****.**"

        self.authority_name = "Carmarthenshire County Council"
        self.authority_short_name = "Carmarthenshire"
        self.base_url = "http://www.carmarthenshire.gov.uk/CCC_APPS/eng/plannaps/CCC_PlanningApplicationsResults.asp?datemode=range&in_lo_date=%(day)s%%2F%(month)s%%2F%(year)s&in_hi_date=%(day)s%%2F%(month)s%%2F%(year)s&SUBMIT=Search"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        search_day = datetime.date(year, month, day)

        # Now get the search page
        response = urllib2.urlopen(self.base_url %{"day": day,
                                                   "month": month,
                                                   "year": year,
                                                   })
        soup = BeautifulSoup(response.read())

        trs = soup.findAll("tr", valign="middle")

        count = 0
        for tr in trs:
            # The odd trs are just spacers
            if count % 2 == 0:
                application = PlanningApplication()

                tds = tr.findAll("td")
                
                application.date_received = search_day
                application.council_reference = tds[1].a.string
                application.address = tds[3].a.string
                application.postcode = getPostcodeFromText(application.address)
                
                # All the links in this <tr> go to the same place...
                application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])

                # Still looking for description and comment url
                
                # For the description, we'll need the info page
                info_soup = BeautifulSoup(urllib2.urlopen(application.info_url).read())

                application.description = info_soup.find(text="Description").findNext("td").findNext("td").font.string

                # While we're here, let's get the OSGB grid ref
                application.osgb_x, application.osgb_y = info_soup.find(text="Grid Reference").findNext("td").font.string.split("-")

                # We'll have to use an email address for comments
                application.comment_url = self.comments_email_address

                self._results.addApplication(application)

            count += 1

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
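
# The counter in the loop above only serves to skip the spacer rows; a minimal
# sketch of the same selection done with slicing (toy data, not the real markup):
# the even-indexed rows carry the applications.
rows = ["app row 1", "spacer", "app row 2", "spacer", "app row 3"]
for row in rows[::2]:
    print row        # -> "app row 1", "app row 2", "app row 3"
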
Example #3
class CrawleyParser:
    comment_url_template = "http://www.crawley.gov.uk/stellent/idcplg?IdcService=SS_GET_PAGE&nodeId=561&pageCSS=&pAppNo=%(pAppNo)s&pAppDocName=%(pAppDocName)s"
    
    def __init__(self, *args):

        self.authority_name = "Crawley Borough Council"
        self.authority_short_name = "Crawley"
        self.base_url =   "http://www.crawley.gov.uk/stellent/idcplg?IdcService=SS_GET_PAGE&nodeId=560&is_NextRow=1&accept=yes&strCSS=null&pApplicationNo=&pProposal=&pLocation=&pPostcode=&pWard=&pDateType=received&pDayFrom=%(dayFrom)s&pMonthFrom=%(monthFrom)s&pYearFrom=%(yearFrom)s&pDayTo=%(dayTo)s&pMonthTo=%(monthTo)s&pYearTo=%(yearTo)s&submit=Search"


        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        search_day = datetime.date(year, month, day)
        #- Crawley only allows searches from-to, so:

        next = self.base_url %{
            "dayFrom": day,
            "monthFrom": month,
            "yearFrom": year,
            "dayTo": day,
            "monthTo": month,
            "yearTo": year,
            }
        # Now get the search page
        response = urllib2.urlopen(next)
        soup = BeautifulSoup.BeautifulSoup(response.read())
        
        if soup.table: #- Empty result set has no table
            trs = soup.table.findAll("tr")[1:] # First one is just headers    
            for tr in trs:    
                tds = tr.findAll("td")
                application = PlanningApplication()         
                application.council_reference = tds[0].a.contents[0].strip().replace("&#47;", "/")
                application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])

                info_qs = cgi.parse_qs(urlparse.urlsplit(application.info_url)[3])

                comment_qs = {
                  "pAppNo": application.council_reference,
                  "pAppDocName": info_qs["ssDocName"][0],
                  }
                application.comment_url = self.comment_url_template %comment_qs

                application.address = tds[1].string.strip()
                if tds[2].string: #- if postcode present, append it to the address too
                    application.postcode = tds[2].string.replace("&nbsp;", " ").strip()
                    application.address += ", " + application.postcode
                application.description = tds[3].string.strip()
                application.date_received = datetime.datetime(*(time.strptime(tds[4].string.strip(), date_format)[0:6]))
                self._results.addApplication(application)
        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
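
# A self-contained sketch (invented URL) of the query-string handling used
# above: urlparse.urlsplit()[3] is the query component, and cgi.parse_qs turns
# it into a dict mapping each parameter to a list of values.
import cgi
import urlparse

example_info_url = "http://www.example.org/idcplg?IdcService=SS_GET_PAGE&ssDocName=CR_2008_0123"
info_qs = cgi.parse_qs(urlparse.urlsplit(example_info_url)[3])
print info_qs["ssDocName"][0]    # -> CR_2008_0123
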
class ForestOfDeanParser:
    def __init__(self, *args):

        self.authority_name = "Forest of Dean District Council"
        self.authority_short_name = "Forest of Dean"
        self.base_url = "http://www.fdean.gov.uk/content.asp"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        search_date = datetime.date(year, month, day)

        search_data = urllib.urlencode(
            [
                ("parent_directory_id", "200"),
                ("nav", "679"),
                ("id", "13266"),
                ("RecStart", "1"),
                ("RecCount", "100"),
                ("SDate", search_date.strftime(date_format)),
                ("EDate", search_date.strftime(date_format)),
                ]
            )

        search_url = self.base_url + "?" + search_data

        response = urllib2.urlopen(search_url)
        soup = BeautifulSoup(response.read())

        results_table = soup.find("table", summary="List of planning applications that match your query")

        for tr in results_table.findAll("tr")[1:]:
            application = PlanningApplication()
            
            application.date_received = search_date
            
            tds = tr.findAll("td")

            application.council_reference = tds[0].a.string.strip()
            application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])
            application.comment_url = application.info_url

            application.address = ' '.join(tds[1].string.strip().split())
            application.postcode = getPostcodeFromText(application.address)

            application.description = tds[2].string.strip()

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
Example #5
class CalderdaleParser:
    def __init__(self, *args):
        self.authority_name = "Calderdale Council"
        self.authority_short_name = "Calderdale"
        self.base_url = "http://www.calderdale.gov.uk/environment/planning/search-applications/planapps.jsp?status=0&date1=%(date)s&date2=%(date)s&Search=Search"
        self.info_url = "http://www.calderdale.gov.uk/environment/planning/search-applications/planapps.jsp?app=%s&Search=Search"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        search_date = datetime.date(year, month, day)

        next_page_url = self.base_url %{"date": search_date.strftime(date_format)}

        while next_page_url:
            try:
                response = urllib2.urlopen(next_page_url)
            except urllib2.HTTPError:
                # This is what seems to happen if there are no apps
                break

            soup = BeautifulSoup(response.read())

            next = soup.find(text="Next")
            if next:
                next_page_url = urlparse.urljoin(self.base_url, next.parent['href'])
            else:
                next_page_url = None

            # There is an <h3> for each app that we can use 
            for h3 in soup.findAll("h3", {"class": "resultsnavbar"}):
                application = PlanningApplication()

                application.date_received = search_date
                application.council_reference = h3.string.split(": ")[1]
                application.description = h3.findNext("div").find(text="Proposal:").parent.nextSibling.strip()

                application.address = ', '.join(h3.findNext("div").find(text="Address of proposal:").parent.nextSibling.strip().split("\r"))
                application.postcode = getPostcodeFromText(application.address)

                application.comment_url = urlparse.urljoin(self.base_url, h3.findNext("div").find(text=re.compile("Comment on Application")).parent['href'])

                application.info_url = self.info_url %(urllib.quote(application.council_reference))

                application.osgb_x, application.osgb_y = h3.findNext("div").find(text="Grid Reference:").parent.nextSibling.strip().split()

                self._results.addApplication(application)

        return self._results


    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
Example #6
class SolihullParser:

    def __init__(self, *args):

        self.authority_name = "Solihull Metropolitan Borough Council"
        self.authority_short_name = "Solihull"
        self.base_url = "http://www.solihull.gov.uk/planning/dc/weeklist.asp?SD=%s&ward=ALL"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        search_day = datetime.date(year, month, day)

        # What we actually need is the monday before the date searched for:
        monday_before = search_day - datetime.timedelta(search_day.weekday())

        # Now get the search page
        response = urllib2.urlopen(self.base_url %(monday_before.strftime(date_format)))
        soup = BeautifulSoup(response.read())

        result_tables = soup.findAll("table", width="98%", cellpadding="2")

        for table in result_tables:
            application = PlanningApplication()

            trs = table.findAll("tr")
            application.council_reference = trs[0].strong.string.strip()
            relative_info_url = trs[0].a['href']
            application.info_url = urlparse.urljoin(self.base_url, relative_info_url)

            application.address = trs[1].findAll("td")[1].string.strip()
            application.postcode = getPostcodeFromText(application.address)
            application.description = trs[2].findAll("td")[1].string.strip()

            # There's probably a prettier way to get the date, but with Python, it's easier for me to reinvent the wheel than to find an existing wheel!
            raw_date_recv = trs[3].findAll("td")[3].string.strip().split("/")
            # Check whether the application is on the target day. If not, discard it and move on.
            if int(raw_date_recv[0]) != day:
                continue
            application.date_received = datetime.date(int(raw_date_recv[2]), int(raw_date_recv[1]), int(raw_date_recv[0]))

            try:
                relative_comment_url = trs[5].findAll("td")[1].a['href']
                application.comment_url = urlparse.urljoin(self.base_url, relative_comment_url)
            except:
                application.comment_url = "No Comment URL."

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class LeicestershireParser:
    def __init__(self, *args):

        self.authority_name = "Leicestershire County Council"
        self.authority_short_name = "Leicestershire"
        self.base_url = "http://www.leics.gov.uk/index/environment/community_services_planning/planning_applications/index/environment/community_services_planning/planning_applications/eplanning_searchform/eplanning_resultpage.htm?sd=%(date)s&ed=%(date)s&kw=&map=f"
 
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        search_date = datetime.date(year, month, day)

        response = urllib2.urlopen(self.base_url %{"date": search_date.strftime(search_date_format)})
        soup = BeautifulSoup.BeautifulSoup(response.read())

        if not soup.find(text=re.compile("No Results Found")):
            
            trs = soup.findAll("table", {"class": "dataTable"})[1].findAll("tr")[1:]

            for tr in trs:
                tds = tr.findAll("td")

                application = PlanningApplication()

                # We can fill in the date received without actually looking at the data
                application.date_received = search_date

                application.council_reference = tds[0].a.string.strip()
                application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])
                application.address = ', '.join([x for x in tds[1].contents 
                                                 if isinstance(x, BeautifulSoup.NavigableString)])
                application.postcode = getPostcodeFromText(application.address)
                application.description = tds[2].string.strip()

                # To get the comment link we need to fetch the info page

                info_response = urllib2.urlopen(application.info_url)
                info_soup = BeautifulSoup.BeautifulSoup(info_response.read())

                base = info_soup.base['href']

                application.comment_url = urlparse.urljoin(base,
                                                           info_soup.find("a", target="Planning Application Consultation Form")['href'])

                self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
Example #8
class HounslowParser:
    def __init__(self, *args):

        self.authority_name = "London Borough of Hounslow"
        self.authority_short_name = "Hounslow"
        self.base_url = "http://planning.hounslow.gov.uk/planningv2/planning_summary.aspx?strWeekListType=SRCH&strRecTo=%(date)s&strRecFrom=%(date)s&strWard=ALL&strAppTyp=ALL&strWardTxt=All%%20Wards&strAppTypTxt=All%%20Application%%20Types&strArea=ALL&strAreaTxt=All%%20Areas&strStreet=ALL&strStreetTxt=All%%20Streets&strPC=&strLimit=500"
        # Limited to 500 cases - putting 1000 causes a default value of 50 to be used. 500 should be plenty.

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        search_day = datetime.date(year, month, day)

        # Now get the search page
        response = urllib2.urlopen(self.base_url % {"date": search_day.strftime(date_format)})
        soup = BeautifulSoup(response.read())

        # Each result is shown in a table of its own. The tables don't have
        # any nice attributes, but they do all contain a NavigableString
        # "Application", and nothing else does...
        nav_strings = soup.findAll(text="Application")

        for nav_string in nav_strings:
            result_table = nav_string.findPrevious("table")

            application = PlanningApplication()
            application.date_received = search_day

            links = result_table.findAll("a")

            # We can get OSGB coordinates from the link to streetmap
            map_qs_dict = cgi.parse_qs(urlparse.urlsplit(links[0]["href"])[3])

            application.osgb_x = map_qs_dict.get("x")[0]
            application.osgb_y = map_qs_dict.get("y")[0]

            application.council_reference = links[1].string.strip()
            application.info_url = urlparse.urljoin(self.base_url, links[1]["href"])
            application.comment_url = urlparse.urljoin(self.base_url, links[2]["href"])

            application.address = " ".join(links[0].previous.strip().split())
            application.postcode = getPostcodeFromText(application.address)

            application.description = links[2].previous.strip()

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
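
# A standalone illustration (invented markup) of the anchoring trick described
# above: find the NavigableString "Application", then walk back to the table
# that contains it with findPrevious.
from BeautifulSoup import BeautifulSoup

example_html = """
<table><tr><td>Application</td><td><a href="detail.aspx?id=1">08/1234/FUL</a></td></tr></table>
<table><tr><td>Application</td><td><a href="detail.aspx?id=2">08/5678/ADV</a></td></tr></table>
"""
example_soup = BeautifulSoup(example_html)
for nav_string in example_soup.findAll(text="Application"):
    result_table = nav_string.findPrevious("table")
    print result_table.a.string     # -> 08/1234/FUL, then 08/5678/ADV
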
Example #9
class KensingtonParser:

    def __init__(self, *args):

        self.authority_name = "The Royal Borough of Kensington and Chelsea"
        self.authority_short_name = "Kensington and Chelsea"
        self.base_url = "http://www.rbkc.gov.uk/Planning/scripts/weeklyresults.asp"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        search_day = datetime.date(year, month, day)

        # We want the Friday of the week being searched for
        # (the form's WeekEndDate field takes the week-ending date).
        friday = search_day - datetime.timedelta(search_day.weekday()) + datetime.timedelta(4)

        # Not using urllib.urlencode as it insists on turning the "+" into "%2B"
        post_data = "WeekEndDate=%d%%2F%d%%2F%d&order=Received+Date&submit=search" %(friday.day, friday.month, friday.year)


        # Now get the search page
        response = urllib2.urlopen(self.base_url, post_data)
        soup = BeautifulSoup(response.read())

        trs = soup.find("table", summary="Planning Application search results table").findAll("tr")[1:]

        for tr in trs:
            application = PlanningApplication()

            tds = tr.findAll("td")

            # Not sure why these are entities. We'll convert them back.
            application.council_reference = tds[0].a.contents[1].strip().replace("&#47;", "/")
            application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])
            application.comment_url = application.info_url

            application.date_received = datetime.datetime(*(time.strptime(tds[1].string.strip(), date_format)[0:6]))

            application.address = tds[2].string.strip()
            application.postcode = getPostcodeFromText(application.address)

            application.description = tds[3].string.strip()

            self._results.addApplication(application)
        
        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
Example #10
class ExmoorParser:
    def __init__(self, *args):

        self.authority_name = "Exmoor National Park"
        self.authority_short_name = "Exmoor"
        self.base_url = "http://www.exmoor-nationalpark.gov.uk/planning_weekly_list.htm?weeklylist=%s"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        search_day = datetime.date(year, month, day)

        response = urllib2.urlopen(self.base_url %(search_day.strftime(search_date_format)))
        soup = BeautifulSoup(response.read())

        # The first <tr> contains headers
        trs = soup.table.findAll("tr")[1:]

        for tr in trs:
            application = PlanningApplication()

            tds = tr.findAll("td")

            application.date_received = datetime.datetime.strptime(tds[0].string, received_date_format).date()

            application.info_url = urllib.unquote(urllib.quote_plus(urlparse.urljoin(self.base_url, tds[1].a['href'])))
            application.council_reference = tds[1].a.string.strip()
            application.address = tds[2].a.string.strip()
            application.postcode = getPostcodeFromText(application.address)

            # Now fetch the info url

            info_response = urllib.urlopen(application.info_url)
            info_soup = BeautifulSoup(info_response.read())

            application.description = info_soup.find(text="Proposal:").findNext("td").string.strip()

            try:
                application.comment_url = urlparse.urljoin(self.base_url, info_soup.find(text="Comment").parent['href'])
            except:
                application.comment_url = "No Comments"

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
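
# What the quote_plus/unquote round trip on info_url above actually does:
# the %XX escapes are applied and then undone, so the only net change is that
# any spaces in the URL become "+" characters (invented URL for this sketch).
import urllib

print urllib.unquote(urllib.quote_plus("http://www.example.org/weekly list.htm"))
# -> http://www.example.org/weekly+list.htm
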
class KingstonParser:
    comments_email_address = "*****@*****.**"

    def __init__(self, *args):
        self.authority_name = "Royal Borough of Kingston upon Thames"
        self.authority_short_name = "Kingston upon Thames"
        self.base_url = "http://maps.kingston.gov.uk/isis_main/planning/planning_summary.aspx?strWeekListType=SRCH&strRecTo=%(date)s&strRecFrom=%(date)s&strWard=ALL&strAppTyp=ALL&strWardTxt=All%%20Wards&strAppTypTxt=All%%20Application%%20Types&strStreets=ALL&strStreetsTxt=All%%20Streets&strLimit=500"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        search_day = datetime.date(year, month, day)

        # Now get the search page
        response = urllib2.urlopen(self.base_url %{"date": search_day.strftime(date_format)})
        soup = BeautifulSoup(response.read())

        # Each app is stored in a table of its own.
        # These tables don't have any nice distinguishing features,
        # but they do all contain a NavigableString "Application",
        # and nothing else in the page does.
        nav_strings = soup.findAll(text="Application")
        
        for nav_string in nav_strings:
            results_table = nav_string.findPrevious("table")

            application = PlanningApplication()
            application.date_received = search_day

            application.council_reference = results_table.a.string.strip()
            application.info_url = urlparse.urljoin(self.base_url, results_table.a['href'])
            application.address = results_table.findAll("td")[7].a.string.strip()

            application.postcode = getPostcodeFromText(application.address)
            application.description = results_table.findAll("td")[-1].contents[0].strip()

            # A few applications have comment urls, but most don't.
            # When they do, they have a case officer - I don't think we can
            # work out the other urls - even if they exist.
            # Best to use the email address.
            application.comment_url = self.comments_email_address

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
Example #12
class BarnsleyParser:
    comments_email_address = "*****@*****.**"

    def __init__(self, *args):

        self.authority_name = "Barnsley Metropolitan Borough Council"
        self.authority_short_name = "Barnsley"
        self.base_url = "http://applications.barnsley.gov.uk/service/development/week_compact.asp?AppDate=%s"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        search_day = datetime.date(year, month, day)

        # What we actually need is the monday before the date searched for:
        monday_before = search_day - datetime.timedelta(search_day.weekday())

        # Now get the search page
        response = urllib2.urlopen(self.base_url %(monday_before.strftime(date_format)))
        soup = BeautifulSoup(response.read())

        result_tables = soup.findAll("table", align="Center", cellpadding="3")

        for table in result_tables:
            application = PlanningApplication()

            # We can set the date received and the comment url straight away.
            application.comment_url = self.comments_email_address

            trs = table.findAll("tr")

            application.council_reference = trs[0].a.string.strip()
            relative_info_url = trs[0].a['href']

            application.info_url = urlparse.urljoin(self.base_url, relative_info_url)

            application.date_received = monday_before

            application.address = trs[1].findAll("td")[1].string.strip()
            application.postcode = getPostcodeFromText(application.address)
            application.description = trs[2].findAll("td")[1].string.strip()

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
Example #13
class HarrowParser:
    def __init__(self, *args):

        self.authority_name = "London Borough of Harrow"
        self.authority_short_name = "Harrow"

        # This is a link to the last seven days' applications.
        # The day, month, and year arguments will be ignored.
        self.base_url = "http://www.harrow.gov.uk/www4/planning/dcweek1.asp"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        # Now get the search page
        response = urllib2.urlopen(self.base_url)

        soup = BeautifulSoup(response.read())

        # Each application contains the nav string "Application: "
        nav_strings = soup.findAll(text="Application: ")

        for nav_string in nav_strings:
            application = PlanningApplication()

            application.council_reference = nav_string.findPrevious("tr").findAll("td", limit=2)[1].string.strip()

            application.address = nav_string.findNext(text=location_re).split(":")[1].strip()
            application.postcode = getPostcodeFromText(application.address)

            application.description = nav_string.findNext(text="Proposal: ").findNext("td").string.strip()

            application.comment_url = urlparse.urljoin(self.base_url, nav_string.findNext(text="Proposal: ").findNext("a")['href'])

            application.date_received = datetime.datetime.strptime(nav_string.findNext(text=date_received_re).split(": ")[1], date_format).date()

            # FIXME: There is no appropriate info_url for the Harrow apps. 
            # I'll put the base url for the moment, but as that is
            # a list of apps from the last 7 days that will quickly be out of date.

            application.info_url = self.base_url
            
            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
Example #14
class HampshireParser:
    def __init__(self, *args):

        self.authority_name = "Hampshire County Council"
        self.authority_short_name = "Hampshire"
        self.base_url = "http://www3.hants.gov.uk/planning/mineralsandwaste/planning-applications/applications/applications-open.htm"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        # Now get the search page
        response = urllib2.urlopen(self.base_url)
        soup = BeautifulSoup(response.read())

        trs = soup.table.table.findAll("tr", {"class": re.compile("(?:odd)|(?:even)")})


        for tr in trs:
            application = PlanningApplication()

            tds = tr.findAll("td")

            application.council_reference = tds[0].a.string.strip()
            application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])
            application.address = tds[2].string.strip()
            application.postcode = getPostcodeFromText(application.address)
            application.description = tds[3].string.strip()

            # Fetch the info url in order to get the date received and the comment url

            info_response = urllib2.urlopen(application.info_url)

            info_soup = BeautifulSoup(info_response.read())

            application.date_received = datetime.datetime.strptime(info_soup.find(text=re.compile("\s*Received:\s*")).findNext("td").string.strip(), date_format).date()

            application.comment_url = urlparse.urljoin(self.base_url, info_soup.find("input", value="Comment on this application").parent['action'])


            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
Example #15
class LichfieldParser:
    def __init__(self, *args):

        self.authority_name = "Lichfield District Council"
        self.authority_short_name = "Lichfield"
        self.base_url = "http://www.lichfielddc.gov.uk/site/scripts/planning_list.php"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        response = urllib2.urlopen(self.base_url)
        soup = BeautifulSoup.BeautifulSoup(response.read())

        trs = soup.find("table", {"class": "planningtable"}).tbody.findAll("tr")

        for tr in trs:
            application = PlanningApplication()

            tds = tr.findAll("td")

            application.council_reference = tds[0].a.string.strip()
            application.info_url = urlparse.urljoin(self.base_url, tds[0].a["href"])
            application.address = " ".join(tds[1].contents[1].strip().split()[1:])
            application.postcode = getPostcodeFromText(application.address)

            # We're going to need to download the info page in order to get
            # the comment link, the date received, and the description.

            info_response = urllib2.urlopen(application.info_url)
            info_soup = BeautifulSoup.BeautifulSoup(info_response.read())

            application.description = info_soup.find(text="Proposal").findNext(text=True).strip()
            date_received_str = info_soup.find(text="Date Application Valid").findNext(text=True).split(",")[1].strip()

            # This is a nasty botch, but the easiest way I can see to get a date out of this is to make another string and use strptime
            better_date_str = "%s %s %s" % date_received_re.match(date_received_str).groups()
            application.date_received = datetime.datetime.strptime(better_date_str, "%d %B %Y").date()
            application.comment_url = info_soup.find("a", title="Comment on this planning application.")["href"]

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
Example #16
class WeymouthParser:
    def __init__(self, *args):

        self.authority_name = "Weymouth and Portland Borough Council"
        self.authority_short_name = "Weymouth and Portland"
        self.base_url = "http://www.weymouth.gov.uk/Planning/applications/newapps.asp"
        self.search_url = "http://www.weymouth.gov.uk/planning/applications/planregister.asp"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        response = urllib2.urlopen(self.base_url)
        soup = BeautifulSoup(response.read())
        
        for details_input in soup.find("table", summary="Planning Applications Received in the last 7 days").findAll("input", alt="Planning Details"):
            application = PlanningApplication()

            first_tr = details_input.findPrevious("tr")

            other_trs = first_tr.findNextSiblings("tr", limit=8)

            application.council_reference = first_tr.find("input", {"name": "refval"})['value']
            application.address = other_trs[0].findAll("td")[1].string.strip()
            application.description = other_trs[1].findAll("td")[1].string.strip()
            application.date_received = datetime.datetime.strptime(other_trs[3].findAll("td")[1].string.strip(), date_format).date()

            # Both the info page and the comment page can only be reached
            # by a POST. The best we can do is give the url of the search page.
            application.info_url = application.comment_url = self.search_url

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
Example #17
class WychavonParser:
    
    def __init__(self, *args):
        self.authority_name = "Wychavon"
        self.authority_short_name = "Wychavon"
        # Currently hard coded--if this address updates, we'll need to scrape
        # the search form to get it each time.
        self.base_url = "http://www.e-wychavon.org.uk/scripts/plan2005/\
acolnetcgi.exe?ACTION=UNWRAP&WhereDescription=General%20Search&\
Whereclause3=%27%30%31%2F%7BEdtMonthEnd%7D%2F%7BEdtYearEnd%7D%27&\
RIPNAME=Root%2EPages%2EPgeDC%2EPgeListCases"
        
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        
        form_data = "EdtYearNo=&EdtCaseNo=&EdtApplicant=&EdtAgent=&EdtLocation"\
        + "=&EdtWard=&EdtMonthStart1=" + str(month) + "&EdtYearStart=" \
        + str(year) + "&EdtMonthEnd=" + str(month) + "&EdtYearEnd="\
        + str(year) + "&submit=Search"

        # Fetch the results
        response = urllib.urlopen(self.base_url, form_data)
        soup = BeautifulSoup(response.read())
        
        
        #Each set of results has its own table
        results_tables = soup.findAll("table", cellpadding="2", cols="4")

        for table in results_tables:
            application = PlanningApplication()

            trs = table.findAll("tr")
            
            application.council_reference = trs[0].findAll("td")[1].font.font.\
                                                        font.string.strip()
            
            relative_info_url = trs[0].findAll("td")[1].a['href']
            application.info_url = urlparse.urljoin(self.base_url, relative_info_url)
            
            application.address = trs[1].findAll("td")[1].font.string.strip()
            application.postcode = getPostcodeFromText(application.address)
            
            #This code avoids an error if there's no description given.
            descrip = trs[2].findAll("td")[1].font.string
            if descrip is None:
                application.description = ""
            else:
                application.description = descrip.strip()

            date_format = "%d/%m/%y"
            date_string = trs[1].findAll("td")[3].font.string.strip()
                                                                    
            application.date_received = datetime.datetime.strptime(date_string, date_format) 

            apptype = trs[0].findAll("td")[3].font.string
            # Avoids throwing an error if no apptype is given (this can happen)
            if apptype is not None:
                apptype = apptype.strip()
            
            # Is all this really necessary? I don't know, but I've assumed that
            # it is. The form will appear without the suffix; I don't know whether
            # the council's backend would accept it or not. Current behaviour
            # is to degrade silently to no suffix if it can't match an
            # application type.
            if apptype == "Telecommunications":
                # Don't know why it's a naked IP rather than sitting on the
                # same site, but there it is.
                application.comment_url = "http://81.171.139.151/WAM/createCom"\
                +"ment.do?action=CreateApplicationComment&applicationType=PLANNI"\
                +"NG&appNumber=T3/" + application.council_reference + "/TC"
            else:
                comment_url = "http://81.171.139.151/WAM/createComment.do?acti"\
                +"on=CreateApplicationComment&applicationType=PLANNING&appNumber"\
                +"=W/" + application.council_reference + "/"
                suffix = ""
                if apptype == "Householder planning application":
                    suffix = "PP"
                elif apptype == "Non-householder planning application":
                    suffix = "PN"
                elif apptype == "Outline applications":
                    suffix = "OU"
                elif apptype == "Change of use":
                    suffix = "CU"
                elif apptype == "Listed Building consent":
                    suffix = "LB"
                elif apptype == "Advertisement application":
                    suffix = "AA"
                elif apptype == "Certificate of Lawfulness Existing":
                    suffix = "LUE"
                elif apptype == "Approval of reserved matters":
                    suffix = "VOC"
                #These are all the ones that I found, except "Advice - Pre-app/
                #Householder", the suffix for which is inconsistent. The suffix
                #for this could be obtained by scraping the description page for
                #each application.
                application.comment_url = comment_url + suffix

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
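
# The suffix lookup above could equally be written as a dict; .get() with a
# default of "" degrades to no suffix exactly as the elif chain does.
SUFFIX_BY_APPTYPE = {
    "Householder planning application": "PP",
    "Non-householder planning application": "PN",
    "Outline applications": "OU",
    "Change of use": "CU",
    "Listed Building consent": "LB",
    "Advertisement application": "AA",
    "Certificate of Lawfulness Existing": "LUE",
    "Approval of reserved matters": "VOC",
}

print SUFFIX_BY_APPTYPE.get("Change of use", "")                  # -> CU
print SUFFIX_BY_APPTYPE.get("Advice - Pre-app/Householder", "")   # -> "" (no suffix)
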
Example #18
class EastbourneParser:
    def __init__(self, *args):

        self.authority_name = "Eastbourne Borough Council"
        self.authority_short_name = "Eastbourne"
#        self.base_url = "http://www.eastbourne.gov.uk/planningapplications/search.asp"
        self.first_url = "http://www.eastbourne.gov.uk/planningapplications/index.asp"
        self.base_url = "http://www.eastbourne.gov.uk/planningapplications/results.asp"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        search_day = datetime.date(year, month, day)

        # There's going to be some faffing around here. We need a cookie to say we have agreed to some T&Cs.

        # First get the search page - we'll be redirected somewhere else for not having the cookie

        first_request = urllib2.Request(self.first_url)
        first_response = urllib2.urlopen(first_request)
        cookie_jar.extract_cookies(first_response, first_request)

        first_page_soup = BeautifulSoup.BeautifulSoup(first_response.read())

        first_page_action = urlparse.urljoin(self.first_url, first_page_soup.form['action'])
        
        the_input = first_page_soup.form.input

        second_page_post_data = urllib.urlencode(
            (
                (the_input['name'], the_input['value']),
                )
            )
        
        second_request = urllib2.Request(first_page_action, second_page_post_data)
        cookie_jar.add_cookie_header(second_request)
        second_response = urllib2.urlopen(second_request)
        cookie_jar.extract_cookies(second_response, second_request)

        # Now (finally) get the search page

#ApplicationNumber=&AddressPrefix=&Postcode=&CaseOfficer=&WardMember=&DateReceivedStart=31%2F08%2F2008&DateReceivedEnd=31%2F08%2F2008&DateDecidedStart=&DateDecidedEnd=&Locality=&AgentName=&ApplicantName=&ShowDecided=&DecisionLevel=&Sort1=FullAddressPrefix&Sort2=DateReceived+DESC&Submit=Search

        post_data = urllib.urlencode(
            (
                ("ApplicationNumber", ""),
                ("AddressPrefix", ""),
                ("Postcode", ""),
                ("CaseOfficer", ""),
                ("WardMember", ""),
                ("DateReceivedStart", search_day.strftime(date_format)),
                ("DateReceivedEnd", search_day.strftime(date_format)),
                ("DateDecidedStart", ""),
                ("DateDecidedEnd", ""),
                ("Locality", ""),
                ("AgentName", ""),
                ("ApplicantName", ""),
                ("ShowDecided", ""),
                ("DecisionLevel", ""),
                ("Sort1", "FullAddressPrefix"),
                ("Sort2", "DateReceived DESC"),
                ("Submit", "Search"),
                )
            )

        search_request = urllib2.Request(self.base_url)
        cookie_jar.add_cookie_header(search_request)
        search_response = urllib2.urlopen(search_request, post_data)

        soup = BeautifulSoup.BeautifulSoup(search_response.read())

        app_no_strings = soup.findAll(text="App. No.:")

        for app_no_string in app_no_strings:
            application = PlanningApplication()
            application.date_received = search_day

            application.council_reference = app_no_string.findNext("a").string.strip()
            application.info_url = urlparse.urljoin(self.base_url, app_no_string.findNext("a")['href'])

            application.address = ' '.join([x.strip() for x in app_no_string.findNext(text="Site Address:").findNext("td").contents if type(x) == BeautifulSoup.NavigableString])
            application.postcode = getPostcodeFromText(application.address)

            application.comment_url = urlparse.urljoin(self.base_url, app_no_string.findNext(text="Comment on application").parent['href'])

            application.description = app_no_string.findNext(text="Description:").findNext("td").string.strip()

            self._results.addApplication(application)
        
        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
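
# The cookie_jar used above is created elsewhere in the real module; a minimal
# setup that supports the extract_cookies/add_cookie_header calls is sketched
# here. An opener built with HTTPCookieProcessor does the same round trip
# automatically, which is presumably what the Ocella parser's
# cookie_handling_opener is.
import cookielib
import urllib2

cookie_jar = cookielib.CookieJar()
cookie_handling_opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
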
Example #19
class MedwayParser:
    comment_email_address = "*****@*****.**"

    def __init__(self, *args):
        self.authority_name = "Medway Council"
        self.authority_short_name = "Medway"

        self.base_url = "http://www.medway.gov.uk/index/environment/planning/planapp/planonline.htm"
        self._split_base_url = urlparse.urlsplit(self.base_url)

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        search_date = datetime.date(year, month, day)
        search_date_string = search_date.strftime(date_format)

        "appstat=&decision=&appdec=&ward=&parish=&dadfrom=&dadto=&davfrom=01%2F06%2F2008&davto=02%2F06%2F2008&searchbut=Search"
        search_data = urllib.urlencode(
            [("searchtype", "1"),
             ("appstat", ""),
             ("decision", ""),
             ("appdec", ""),
             ("ward", ""),
             ("parish", ""),
             ("dadfrom", ""),
             ("dadto", ""),
             ("davfrom", search_date_string),
             ("davto", search_date_string),
             ("searchbut", "Search"),
                ]
            )

        split_search_url = self._split_base_url[:3] + (search_data, '')
        search_url = urlparse.urlunsplit(split_search_url)

        response = urllib2.urlopen(search_url)
        soup = BeautifulSoup(response.read())

        results_table = soup.find(text="Application No").parent.parent.parent
        trs = results_table.findAll("tr")[1:]

        tr_counter = 0
        
        while tr_counter < len(trs):
            tr = trs[tr_counter]

            if tr_counter % 2 == 0:
                application = PlanningApplication()
                application.date_received = search_date
                application.comment_url = self.comment_email_address

                tds = tr.findAll("td")

                application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
                application.council_reference = tr.a.string.strip()

                application.address = tds[1].string.strip()
                application.postcode = getPostcodeFromText(application.address)

                application.description = tds[2].string.strip()

                self._results.addApplication(application)

            tr_counter += 1

        return self._results


    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
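
# A self-contained sketch (example.org URL) of the urlsplit/urlunsplit step
# above: keep the scheme, host and path of the base URL and swap in a freshly
# urlencoded query component.
import urllib
import urlparse

example_base_url = "http://www.example.org/planning/planonline.htm"
example_query = urllib.urlencode([("davfrom", "01/06/2008"), ("davto", "02/06/2008")])

split_url = urlparse.urlsplit(example_base_url)[:3] + (example_query, '')
print urlparse.urlunsplit(split_url)
# -> http://www.example.org/planning/planonline.htm?davfrom=01%2F06%2F2008&davto=02%2F06%2F2008
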
Example #20
class OcellaParser:
    received_date_format = search_date_format

    def __init__(self,
                 authority_name,
                 authority_short_name,
                 base_url,
                 debug=False):

        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url

        self.debug = debug

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

        # These will be used to store the column numbers of the appropriate items in the results table
        self.reference_col = None
        self.address_col = None
        self.applicant_col = None
        self.description_col = None
        self.received_date_col = None
        self.accepted_date_col = None

    def getResultsByDayMonthYear(self, day, month, year):
        search_date = datetime.date(year, month, day)

        # First get the search page
        get_request = urllib2.Request(self.base_url)
        get_request.add_header('Accept', 'text/html')
        get_response = urllib2.urlopen(get_request)

        cookie_jar.extract_cookies(get_response, get_request)

        get_soup = BeautifulSoup(get_response.read())

        # We need to find where the post action goes
        action = get_soup.form['action']

        try:
            session_id = get_soup.find('input', {'name': 'p_session_id'})['value']
        except TypeError:
            # In the case of Middlesbrough, there is no session cookie, 
            # but it seems we don't need it...
            session_id = None

        # Unless we retrieve the correct form name, we will simply get the last week's applications
        submit_tag = get_soup.find('input', {'value': 'Search'}) or get_soup.find('input', {'value': 'Search for Applications'}) or get_soup.find('input', {'value': 'Submit'})
        try:
            submit_name = submit_tag['name']
            form_name = submit_name.split('.')[0]
        except TypeError:
            form_name = 'FRM_PLANNING_LIST'

# # From Breckland

# p_object_name=FRM_WEEKLY_LIST.DEFAULT.SUBMIT_TOP.01
# p_instance=1
# p_event_type=ON_CLICK
# p_user_args=
# p_session_id=53573
# p_page_url=http%3A%2F%2Fwplan01.intranet.breckland.gov.uk%3A7778%2Fportal%2Fpage%3F_pageid%3D33%2C30988%26_dad%3Dportal%26_schema%3DPORTAL
# FRM_WEEKLY_LIST.DEFAULT.START_DATE.01=02-06-2008
# FRM_WEEKLY_LIST.DEFAULT.END_DATE.01=09-06-2008
# FRM_WEEKLY_LIST.DEFAULT.PARISH.01=

        post_data = urllib.urlencode(
            [('p_object_name', form_name + '.DEFAULT.SUBMIT_TOP.01'),
             ('p_instance', '1'),
             ('p_event_type', 'ON_CLICK'),
             ('p_user_args', ''),
             ('p_session_id', session_id),
             ('p_page_url', self.base_url),
             (form_name + '.DEFAULT.AGENT.01', ''),
             (form_name + '.DEFAULT.START_DATE.01', search_date.strftime(search_date_format)),
             (form_name + '.DEFAULT.END_DATE.01', search_date.strftime(search_date_format)),
             (form_name + '.DEFAULT.PARISH.01', ''),
                ]
            )
        
        post_request = urllib2.Request(action, post_data)
        cookie_jar.add_cookie_header(post_request)

        post_request.add_header('Accept', 'text/html')
        post_request.add_header('Referer', self.base_url)

        post_response = cookie_handling_opener.open(post_request)

        post_soup = BeautifulSoup(post_response.read())

        results_table = post_soup.find("table", summary="Printing Table Headers")

        trs = results_table.findAll("tr")

        # We'll use the headings in the first tr to find out what columns the address, description, etc are in.
        ths = trs[0].findAll("th")

        th_index = 0
        for th in ths:
            th_content = th.font.string.strip()
            if th_content == 'Reference' or th_content == 'Application Ref' or th_content == 'Application Number':
                self.reference_col = th_index
            elif th_content == 'Location':
                self.address_col = th_index
            elif th_content == 'Applicant Details':
                self.applicant_col = th_index
            elif th_content == 'Proposal':
                self.description_col = th_index
            elif th_content == 'Development Description':
                self.description_col = th_index
            elif th_content == 'Received Date' or th_content == 'Date Received':
                self.received_date_col = th_index
            elif th_content == 'Accepted Date':
                self.accepted_date_col = th_index

            th_index += 1
            
        # If there is a received date, we'll use that, otherwise, we'll have to settle for the accepted date.
        self.received_date_col = self.received_date_col or self.accepted_date_col

        # We want all the trs except the first one, which is just headers, 
        # and the last, which is empty
        trs = trs[1:-1]

        for tr in trs:
            self._current_application = PlanningApplication()

            tds = tr.findAll("td")

            self._current_application.council_reference = (tds[self.reference_col].font.a or tds[self.reference_col].a.font).string.strip()

            date_string = tds[self.received_date_col].font.string.strip()

            for possible_format in possible_date_formats:
                try:
                    self._current_application.date_received = datetime.datetime(*(time.strptime(date_string, possible_format)[0:6]))
                except ValueError:
                    pass

            self._current_application.address = tds[self.address_col].font.string.strip()
            self._current_application.postcode = getPostcodeFromText(self._current_application.address)
            if self._current_application.postcode is None and self.applicant_col is not None:
                # won't always be accurate to do this but better than nothing (needed for Havering)
                self._current_application.postcode = getPostcodeFromText(tds[self.applicant_col].font.string.strip())
            self._current_application.description = tds[self.description_col].font.string.strip()
            # seems to be dependent on the implementation whether the URL is encoded (e.g. Great Yarmouth does this), so we cannot do anything more "standard"
            self._current_application.info_url = urlparse.urljoin(post_response.geturl(), tds[self.reference_col].a['href'].replace('&amp;','&'))

# This is what a comment url looks like
# It seems to be no problem to remove the sessionid (which is in any case blank...)
# I can't see a good way to avoid having to go to the info page to find the moduleid though.

#http://wplan01.intranet.breckland.gov.uk:7778/pls/portal/PORTAL.wwa_app_module.link?p_arg_names=_moduleid&p_arg_values=8941787057&p_arg_names=_sessionid&p_arg_values=&p_arg_names=APPLICATION_REFERENCE&p_arg_values=3PL%2F2008%2F0877%2FF

            # For the moment, we'll just use the info url, as that seems to work.
            self._current_application.comment_url = self._current_application.info_url
            
            self._results.addApplication(self._current_application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
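
# A standalone version of the "try each format" date parsing above.
# possible_date_formats is defined elsewhere in the real module, so a couple of
# plausible formats are assumed here purely for illustration.
import datetime
import time

possible_date_formats = ["%d/%m/%Y", "%d-%m-%Y"]   # assumed for this sketch

def parse_date(date_string):
    for possible_format in possible_date_formats:
        try:
            return datetime.datetime(*(time.strptime(date_string, possible_format)[0:6]))
        except ValueError:
            pass
    return None

print parse_date("02-06-2008")    # -> 2008-06-02 00:00:00
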
Example #21
class FlintshireParser:
    def __init__(self, *args):

        self.authority_name = "Flintshire County Council"
        self.authority_short_name = "Flintshire"

        # I've removed some extra variables from this URL; it seems to be happy without them, and now doesn't need to paginate...
        self.base_url = "http://www.flintshire.gov.uk/webcont/fssplaps.nsf/vwa_Search?searchview&Query=(%%5BfrmDteAppldate%%5D%%20%%3E=%%20%(start_date)s%%20AND%%20%%5BfrmDteAppldate%%5D%%20%%3C=%%20%(end_date)s)"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        search_date = datetime.date(year, month, day)

        # We'll set the start date to be one day earlier in order to catch the first result on every day at some point - see TODO list
        response = urllib2.urlopen(self.base_url %{"end_date": search_date.strftime(date_format),
                                                   "start_date": (search_date - datetime.timedelta(1)).strftime(date_format)})
        soup = BeautifulSoup(response.read())

        # Each app is stored in its own table
        result_tables = soup.findAll("table", border="1")

        # For the moment, we'll have to ignore the first result (see TODO list).
        for table in result_tables[1:]:
            application = PlanningApplication()

            # It's not clear to me why this next one isn't the string of the next sibling. This works though!
            application.council_reference = table.find(text=re.compile("Reference")).parent.findNextSibling().contents[0]

            application.address = table.find(text="Location").parent.findNextSibling().string.strip()
            application.postcode = getPostcodeFromText(application.address)

            application.info_url = urlparse.urljoin(self.base_url, table.a['href'])

            # Let's go to the info_page and get the OSGB and the date_received
            info_request = urllib2.Request(application.info_url)

            # We need to add the language header in order to get UK style dates
            info_request.add_header("Accept-Language", "en-gb,en")
            info_response = urllib2.urlopen(info_request)
            info_soup = BeautifulSoup(info_response.read())
            
            grid_reference_td = info_soup.find(text="Grid Reference").findNext("td")
            x_element = grid_reference_td.font
            
            application.osgb_x = x_element.string.strip()
            application.osgb_y = x_element.nextSibling.nextSibling.string.strip()
            
            date_string = info_soup.find(text="Date Valid").findNext("td").string.strip()

            application.date_received = datetime.datetime(*(time.strptime(date_string, date_format)[0:6]))

            application.description = table.find(text=re.compile("Description of Proposal")).parent.nextSibling.string.strip()


            # There is a link to comment from the info page, though I can't click it.
            application.comment_url = application.info_url

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
Example #22
class CairngormsParser:
    def __init__(self, *args):
        self.authority_name = "Cairngorms National Park"
        self.authority_short_name = "Cairngorms"
        self.referer = "http://www.cairngorms.co.uk/planning/e-planning/index.php"

        self.base_url = "http://www.cairngorms.co.uk/planning/e-planning/holding.php"

        # The timestamp here looks like the number of milliseconds since 1970
        self.first_post_url = "http://www.cairngorms.co.uk/planning/e-planning/search.php?timeStamp=%d"

        self.comments_email_address = "*****@*****.**"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        search_date = datetime.date(year, month, day)

        post_data = [
            ("CNPA_ref", ""),
            ("application_number", ""),
            ("LA_id", "%"),
            ("applicant_type", "%"),
            ("applicant_name", ""),
            ("development_address", ""),
            ("agent_name", ""),
            ("status", "%"),
            ("startDay", "%02d" %day),
            ("startMonth", "%02d" %month),
            ("startYear", "%d" %year),
            ("endDay", "%02d" %day),
            ("endMonth", "%02d" %month),
            ("endYear", "%d" %year),
            ]

        first_post_data = "CNPA_ref=&application_number=&applicant_name=&development_address=&agent_name=&applicant_type=%%&LA_id=%%&status=%%&startYear=%(year)d&startMonth=%(month)02d&startDay=%(day)02d&endYear=%(year)d&endMonth=%(month)02d&endDay=%(day)02d" %{"day": day, "month": month, "year": year}

        curlobj = pycurl.Curl()
        curlobj.setopt(pycurl.FOLLOWLOCATION, True)
        curlobj.setopt(pycurl.MAXREDIRS, 10)


        # First we do a normal post. In the browser this would happen as an
        # AJAX query, and it just returns the number of applications found.
        fakefile = StringIO.StringIO() 

        curlobj.setopt(pycurl.URL, self.first_post_url %(int(time.time()*1000)))
        curlobj.setopt(pycurl.POST, True)
        curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
        curlobj.setopt(pycurl.POSTFIELDS, first_post_data)

        curlobj.perform()

        app_count = int(fakefile.getvalue())
        fakefile.close()

        if app_count:
            # Now we do another multipart form post
            # This gives us something to use as the callback
            fakefile = StringIO.StringIO() 

            curlobj.setopt(pycurl.URL, self.base_url)
            curlobj.setopt(pycurl.HTTPPOST, post_data)
            curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
            curlobj.setopt(pycurl.REFERER, self.referer)
            curlobj.perform()

            soup = BeautifulSoup(fakefile.getvalue())
            # We may as well free up the memory used by fakefile
            fakefile.close()

            for tr in soup.table.findAll("tr")[1:]:
                application = PlanningApplication()
                application.date_received = search_date
                application.comment_url = self.comments_email_address

                tds = tr.findAll("td")

                application.council_reference = tds[1].string.strip()
                application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])

                application.address = tds[2].string.strip()
                application.postcode = getPostcodeFromText(application.address)

                # We're going to need to get the info page in order to get the description
                # We can't pass a unicode string to pycurl, so we'll have to encode it.
                curlobj.setopt(pycurl.URL, application.info_url.encode())
                curlobj.setopt(pycurl.HTTPGET, True)

                # This gives us something to use as the callback
                fakefile = StringIO.StringIO() 
                curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)

                curlobj.perform()
                info_soup = BeautifulSoup(fakefile.getvalue())
                fakefile.close()

                application.description = info_soup.find(text="Development Details").findNext("td").string.strip()
                application.osgb_x = info_soup.find(text="Grid Ref East").findNext("td").string.strip()
                application.osgb_y = info_soup.find(text="Grid Ref North").findNext("td").string.strip()

                self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
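# A standalone sketch (not part of CairngormsParser) of the first, AJAX-style
# request above: search.php takes a timestamp in milliseconds since 1970 in its
# query string, the POST body is the urlencoded search criteria, and the
# response is just the number of matching applications as plain text.
# The helper name is illustrative; the real scraper does this with pycurl.
import time
import urllib2

def get_app_count(first_post_url, first_post_data):
    """POST the search criteria and return the application count as an int."""
    url = first_post_url % int(time.time() * 1000)
    response = urllib2.urlopen(url, first_post_data)
    return int(response.read())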
Example #23
class AcolnetParser:
    received_date_label = "Registration Date:"
    received_date_format = "%d/%m/%Y"

    comment_qs_template = "ACTION=UNWRAP&RIPNAME=Root.PgeCommentForm&TheSystemkey=%s"

    # Where there is no online comment facility, a subclass can set this to an
    # appropriate email address, which will then be used in place of a comment url.
    comments_email_address = None

    # The optional amp; is to cope with Oldham, which seems to have started
    # quoting this url.
    action_regex = re.compile("<form[^>]*action=\"([^\"]*ACTION=UNWRAP&(?:amp;)?RIPSESSION=[^\"]*)\"[^>]*>", re.IGNORECASE)    
  
    def _getResultsSections(self, soup):
        """In most cases, there is a table per app."""
        return soup.findAll("table", {"class": "results-table"})
  
    def _getCouncilReference(self, app_table):
#        return app_table.findAll("a")[1].string.strip()
        return app_table.a.string.strip()

    def _getDateReceived(self, app_table):
        date_str = ''.join(app_table.find(text=self.received_date_label).findNext("td").string.strip().split())
        day, month, year = date_str.split('/')
        return date(int(year), int(month), int(day))

        # This will be better from python 2.5
        #return datetime.datetime.strptime(date_str, self.received_date_format)

    def _getAddress(self, app_table):
        return app_table.find(text="Location:").findNext("td").string.strip()
    
    def _getDescription(self, app_table):
        return app_table.find(text="Proposal:").findNext("td").string.strip()

    def _getInfoUrl(self, app_table):
        """Returns the info url for this app.
        
        We also set the system key on self._current_application, 
        as we'll need that for the comment url.

        """
        url = app_table.a['href']
        self._current_application.system_key = system_key_regex.search(url).groups()[0]

        # This is the right way to do this, but it doesn't work in Python 2.5 as
        # it doesn't quite implement RFC 3986. This will work fine when we are on
        # Python 2.6
        # info_url = urlparse.urljoin(self.base_url, url)

        # In the meantime, we'll have to work around it. Let's assume url
        # is a query string

        split_base_url = urlparse.urlsplit(self.base_url)
        split_info_url = urlparse.urlsplit(url)
        info_url = urlparse.urlunsplit(split_base_url[:3] + (split_info_url.query,) + split_base_url[4:])

        return info_url

    def _getCommentUrl(self, app_table):
        """This must be run after _getInfoUrl"""

        if self.comments_email_address:
            return self.comments_email_address

        split_info_url = urlparse.urlsplit(self._current_application.info_url)

        comment_qs = self.comment_qs_template %self._current_application.system_key

        return urlparse.urlunsplit(split_info_url[:3] + (comment_qs,) + split_info_url[4:])

    def _getWard(self, app_table):
        return app_table.findAll("td")[8].string.strip()

    def __init__(self,
                 authority_name,
                 authority_short_name,
                 base_url,
                 debug=False):
        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url

        self.debug = debug

        # This is where we store the results
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

        # This will store the planning application we are currently working on.
        self._current_application = None


    def _cleanupHTML(self, html):
        """This method should be overridden in subclasses to perform site specific
        HTML cleanup."""
        return html

    def _getSearchResponse(self):
        # It looks like we sometimes need to do some stuff to get around a
        # javascript redirect and cookies.
        search_form_request = urllib2.Request(self.base_url)
        search_form_response = urllib2.urlopen(search_form_request)

        return search_form_response
        

    def getResultsByDateRange(self, date_from, date_to):
        # first we fetch the search page to get ourselves some session info...
        search_form_response = self._getSearchResponse()
        
        search_form_contents = search_form_response.read()

        # This sometimes causes a problem in HTMLParser, so let's just get the link
        # out with a regex...
        groups = self.action_regex.search(search_form_contents).groups()

        action = groups[0] 
        #print action

        # This is to handle the amp; which seems to have appeared in this
        # url on the Oldham site
        action = ''.join(action.split('amp;'))

        action_url = urlparse.urljoin(self.base_url, action)
        #print action_url

        search_data = {"regdate1": date_from.strftime(date_format),
                       "regdate2": date_to.strftime(date_format),
                       }
        
        opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
        response = opener.open(action_url, search_data)
        results_html = response.read()

        # This is for doing site specific html cleanup
        results_html = self._cleanupHTML(results_html)

        #some javascript garbage in the header upsets HTMLParser,
        #so we'll just have the body
        just_body = "<html>" + end_head_regex.split(results_html)[-1]

        #self.feed(just_body)
        
        soup = BeautifulSoup(just_body, convertEntities=BeautifulSoup.ALL_ENTITIES)

        # Each app is in a table of its own.
        results_tables = self._getResultsSections(soup)


        for app_table in results_tables:
            self._current_application = PlanningApplication()
            self._current_application.council_reference = self._getCouncilReference(app_table)
            self._current_application.address = self._getAddress(app_table)
            
            # Get the postcode from the address
            self._current_application.postcode = getPostcodeFromText(self._current_application.address)
            
            self._current_application.description = self._getDescription(app_table)
            self._current_application.info_url = self._getInfoUrl(app_table)
            self._current_application.comment_url = self._getCommentUrl(app_table)
            self._current_application.date_received = self._getDateReceived(app_table)
            self._current_application.ward_name = self._getWard(app_table)

            self._results.addApplication(self._current_application)

        return self._results

    def getResultsByDayMonthYear(self, day, month, year):
        our_date = date(year, month, day)
        return self.getResultsByDateRange(our_date, our_date)

    def getResults(self, day, month, year):
        results = self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
        #        import pdb;pdb.set_trace()
        return results
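# A minimal standalone sketch of the urljoin workaround used in _getInfoUrl
# above: when the "relative" url is really just a query string, the query
# component of the base url is replaced by hand with urlsplit/urlunsplit.
# The urls in the example are made up; the helper name is illustrative.
import urlparse

def join_query_string(base_url, query_only_url):
    """Replace the query component of base_url with that of query_only_url."""
    base = urlparse.urlsplit(base_url)
    other = urlparse.urlsplit(query_only_url)
    # Keep scheme, netloc and path from the base url, take the query from the
    # other url, and keep the (usually empty) fragment from the base url.
    return urlparse.urlunsplit(base[:3] + (other[3],) + base[4:])

# e.g. join_query_string("http://example.gov.uk/acolnet/search.pl?foo=1",
#                        "?ACTION=UNWRAP&RIPSESSION=abc")
# returns "http://example.gov.uk/acolnet/search.pl?ACTION=UNWRAP&RIPSESSION=abc"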
Example #24
class WandsworthParser:

    def __init__(self, *args):

        self.authority_name = "London Borough of Wandsworth"
        self.authority_short_name = "Wandsworth"
        self.base_url = "http://www.wandsworth.gov.uk/gis/search/Search.aspx"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        search_day = datetime.date(year, month, day)
        formatted_search_day = search_day.strftime("%d-%m-%Y")

        post_data = urllib.urlencode([
            ("__EVENTTARGET", ""),
            ("__EVENTARGUMENT", ""),
            ("cboNumRecs", "99999"),
            ("cmdSearch", "Search"),
            ("drReceived:txtStart", formatted_search_day),
            ("drReceived:txtEnd", formatted_search_day)
        ])

        response = urllib2.urlopen(self.base_url, post_data)

        # Modify the redirect response URL to remove the XSL template param
        # so we get more detailed XML embedded in HTML instead
        redirect_url = response.geturl()
        redirect_url = re.sub("&XSLTemplate=xslt/Results.xslt", "", redirect_url)

        results_response = urllib2.urlopen(redirect_url)

        try:
            soup = BeautifulSoup(results_response.read())

            # Get the XML content contained in the HTML doc
            td = soup.find("td", colspan="3")
            xml = str(td.contents[2])
            xml_soup = BeautifulStoneSoup(xml)
        except:
            return self._results

        for entry in xml_soup.findAll('internet_web_search'):
            application = PlanningApplication()

            primary_key = entry.find('primary_key').renderContents()
            kind = entry.find('object_id').renderContents()

            application.council_reference = entry.find('application_number').renderContents()

            application.comment_url = "http://www.wandsworth.gov.uk/apply/createComment.do?action=CreateApplicationComment&appNumber=%s" \
                % urllib.quote(application.council_reference)

            str_date_received = entry.find('received_date').renderContents()[0:10]
            date_received = datetime.datetime.strptime(str_date_received, "%Y-%m-%d")

            application.date_received = date_received

            application.address = entry.find('site_address').renderContents()

            application.description = entry.find('development_description').renderContents()

            # We need to make another request to get postcode details
            details_url = "http://www.wandsworth.gov.uk/gis/search/StdDetails.aspx?"
            
            if kind == 'PLANNINGAPPLICATION':
                application.info_url = "http://www.wandsworth.gov.uk/apply/showCaseFile.do?appNumber=%s" \
                    % urllib.quote(application.council_reference)

                details_url = details_url + urllib.urlencode([
                    ("PT", "Planning Application Details"),
                    ("TYPE", "WBCPLANNINGREF"),
                    ("PARAM0", primary_key),
                    ("XSLT", "xslt/planningdetails.xslt"),
                    ("DAURI", "PLANNING")
                ])
            else:
                details_url = details_url + urllib.urlencode([
                    ("PT", "Building Control Application Details"),
                    ("TYPE", "WBCBuildingControlREF"),
                    ("PARAM0", primary_key),
                    ("XSLT", "xslt/bcdetails.xslt"),
                    ("DAURI", "PLANNING")
                ])
                
                application.info_url = details_url

            details_response = urllib2.urlopen(details_url)
            details_soup = BeautifulSoup(details_response.read())
            postcode_row = details_soup.find('table', "bodytextsmall").findAll('tr')[5]
            postcode_cell = postcode_row.find('td', "searchinput")

            if postcode_cell.string:
                application.postcode = getPostcodeFromText(postcode_cell.string.strip())

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
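# A self-contained sketch of the "XML island" technique used above, with
# made-up sample data rather than Wandsworth's real markup: the results page
# embeds an XML document inside an HTML table cell, and once that cell's
# contents are pulled out they can be handed to BeautifulStoneSoup and treated
# as ordinary XML.
from BeautifulSoup import BeautifulStoneSoup

sample_xml = """
<results>
  <internet_web_search>
    <application_number>2008/1234/FUL</application_number>
    <site_address>1 Example Street</site_address>
  </internet_web_search>
</results>
"""

xml_soup = BeautifulStoneSoup(sample_xml)
for entry in xml_soup.findAll('internet_web_search'):
    reference = entry.find('application_number').renderContents()
    address = entry.find('site_address').renderContents()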
Example #25
class PlanningExplorerParser:
    # If this authority doesn't have a comments page,
    # then set this email_address to an address for the
    # planning department, and it will be used in lieu of
    # a comments url.
    comments_email_address = None

    # These are the directories where the info urls, and search urls,
    # usually live underneath the base_url.
    # If these are different for a particular
    # authority, then they can be overridden in a subclass.
    info_url_path = "MVM/Online/Generic/"
    search_url_path = "MVM/Online/PL/GeneralSearch.aspx"
   
    # This is the most common place for comments urls to live
    # The %s will be filled in with an application code
    comments_path = "MVM/Online/PL/PLComments.aspx?pk=%s"

    # Most authorities don't need the referer header on the post
    # request. If one does, override this in the subclass
    use_referer = False

    # Some authorities won't give us anything back if we use the
    # python urllib2 useragent string. In that case, override this
    # in a subclass to pretend to be firefox.
    use_firefox_user_agent = False

    # This is the most common css class of the table containing
    # the search results. If it is different for a particular authority
    # it can be overridden in a subclass
    results_table_attrs = {"class": "ResultsTable"}

    # These are the most common column positions for the
    # council reference, the address, and the description
    # in the results table.
    # They should be overridden in subclasses if they are different
    # for a particular authority.
    reference_td_no = 0
    address_td_no = 1
    description_td_no = 2

    # In some cases we won't be able to get the full address/description/postcode
    # without getting the info page for each app.
    # If fetch_info_page is set to True, then we need to get a copy of the info
    # page and store it as an attribute on current_application (naughty!)
    fetch_info_page = False

    asp_args_regex = re.compile('<input[^>]*name=\"(__[A-Z]*)\"[^>]*value=\"([^\"]*)\"[^>]*>')

    def _modify_response(self, response):
        """For most sites, we have managed to get all the apps on a
        single page by choosing the right parameters.
        If that hasn't been possible, override this method to get a
        new response object which has all the apps in one page.
        (See, for example, Hackney).
        """
        return response

    def _find_trs(self, results_table):
        """Normally, we just want a list of all the trs except the first one
        (which is usually a header).
        If the authority requires a different list of trs, override this method.
        """
        return results_table.findAll("tr")[1:]

    def _sanitisePostHtml(self, html):
        """This method can be overriden in subclasses if the
        html that comes back from the post request is bad, and
        needs tidying up before giving it to BeautifulSoup."""
        return html

    def _sanitiseInfoUrl(self, url):
        """If an authority has info urls which are for some reason full
        of crap (like Broadland does), then this method should be overridden
        in order to tidy them up."""
        return ''.join(url.split())

    def _getHeaders(self):
        """If the authority requires any headers for the post request,
        override this method returning a dictionary of header key to
        header value."""
        headers = {}
       
        if self.use_firefox_user_agent:
            headers["User-Agent"] = "Mozilla/5.0 (X11; U; Linux i686; en-GB; rv:1.8.1.10) Gecko/20071126 Ubuntu/7.10 (gutsy) Firefox/2.0.0.10"

        if self.use_referer:
            headers["Referer"] = self.search_url

        return headers

    def _getPostData(self, asp_args, search_date):
        """Accepts asp_args (a tuple of key value pairs of the pesky ASP
        parameters, and search_date, a datetime.date object for the day
        we are searching for.

        This seems to be the most common set of post data which is needed
        for PlanningExplorer sites. It won't work for all of them, so
        will sometimes need to be overridden in a subclass.

        The parameter edrDateSelection is often not needed.
        It is needed by Charnwood though, so I've left it in
        to keep things simple.
        """
        year_month_day = search_date.timetuple()[:3]

        post_data = urllib.urlencode(asp_args + (
                ("_ctl0", "DATE_REGISTERED"),
                ("rbGroup", "_ctl5"),
                ("_ctl7_hidden", urllib.quote('<DateChooser Value="%d%%2C%d%%2C%d"><ExpandEffects></ExpandEffects></DateChooser>' %year_month_day)),
                ("_ctl8_hidden", urllib.quote('<DateChooser Value="%d%%2C%d%%2C%d"><ExpandEffects></ExpandEffects></DateChooser>' %year_month_day)),
                ("edrDateSelection", "1"),
                ("csbtnSearch", "Search"),
                ("cboNumRecs", "99999"),
                ))
       
        return post_data


    def _getAddress(self, tds, info_soup):
        # If this td contains a div, then the address is the
        # string in there - otherwise, use the string in the td.
        address_td = tds[self.address_td_no]
        if address_td.div is not None:
            address = address_td.div.string
        else:
            address = address_td.string
           
        return address


    def _getPostCode(self, info_soup):
        """In most cases, the postcode can be got from the address in
        the results table. Some councils put the address there without the
        postcode. In this case we will have to go to the info page to get
        the postcode. This should be done by overriding this method with
        one that parses the info page."""

        return getPostcodeFromText(self._current_application.address)
       
    def _getDescription(self, tds, info_soup):
        description_td = tds[self.description_td_no]
       
        if description_td.div is not None:
            # Mostly this is in a div
            # Use the empty string if the description is missing
            description = description_td.div.string or ""
        else:
            # But sometimes (eg Crewe) it is directly in the td.
            # Use the empty string if the description is missing
            description = description_td.string or ""

        return description


    def __init__(self,
                 authority_name,
                 authority_short_name,
                 base_url,
                 debug=False):

        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url

        self.search_url = urlparse.urljoin(base_url, self.search_url_path)
        self.info_url_base = urlparse.urljoin(self.base_url, self.info_url_path)
   
        self.debug = debug

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        search_date = datetime.date(year, month, day)

        # First do a get, to get some state
        get_request = urllib2.Request(self.search_url)
        get_response = urllib2.urlopen(get_request)


        cookie_jar.extract_cookies(get_response, get_request)

        html = get_response.read()

        # We need to find those ASP parameters such as __VIEWSTATE
        # so we can use them in the next POST

        # re.findall gets us a list of key value pairs.
        # We want to concatenate it with a tuple, so we must
        # make it a tuple
        asp_args = tuple(re.findall(self.asp_args_regex, html))

        # The post data needs to be different for different councils
        # so we have a method on each council's scraper to make it.
        post_data = self._getPostData(asp_args, search_date)
       
        headers = self._getHeaders()

        request = urllib2.Request(self.search_url, post_data, headers)
        cookie_jar.add_cookie_header(request)
        post_response = urllib2.urlopen(request)

        # We have actually been redirected here by an HTTP 302 (object moved),
        # so the response called post_response is really the result of a GET.

        # In some cases, we can't get the page size set high
        # until now. In that case, override _modify_response
        # so that we get back a response with all the apps on one page.
        post_response = self._modify_response(post_response)

        html = self._sanitisePostHtml(post_response.read())

        soup = BeautifulSoup(html)

        results_table = soup.find("table", attrs=self.results_table_attrs)

        # If there is no results table, then there were no apps on that day.
        if results_table:
            trs = self._find_trs(results_table)

            self._current_application = None

            # The first tr is just titles, cycle through the trs after that
            for tr in trs:
                self._current_application = PlanningApplication()

                # There is no need to search for the date_received; it's what
                # we searched for
                self._current_application.date_received = search_date

                tds = tr.findAll("td")

                self._current_application.council_reference = tds[self.reference_td_no].a.string

                relative_info_url = self._sanitiseInfoUrl(tds[self.reference_td_no].a['href'])
                self._current_application.info_url = urlparse.urljoin(self.info_url_base, relative_info_url)

                # Fetch the info page if we need it, otherwise set it to None

                if self.fetch_info_page:
                    # We need to quote the spaces in the info url
                    info_request = urllib2.Request(urllib.quote(self._current_application.info_url, ":/&?="))
                   
                    info_soup = BeautifulSoup(urllib2.urlopen(info_request))
                else:
                    info_soup = None

                # What about a comment url?
                # There doesn't seem to be one, so we'll use the email address
                if self.comments_email_address is not None:
                    # We're using the email address, as there doesn't seem
                    # to be a web form for comments
                    self._current_application.comment_url = self.comments_email_address
                else:
                    # This link contains a code which we need for the comments url
                    # (on those sites that use it)
                    application_code = app_code_regex.search(relative_info_url).groups()[0]

                    relative_comments_url = self.comments_path %(application_code)
                    self._current_application.comment_url = urlparse.urljoin(self.base_url, relative_comments_url)


                self._current_application.address = self._getAddress(tds, info_soup)
                self._current_application.postcode = self._getPostCode(info_soup)
                self._current_application.description = self._getDescription(tds, info_soup)

                self._results.addApplication(self._current_application)

        return self._results


    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
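# A standalone sketch of the __VIEWSTATE trick used in getResultsByDayMonthYear
# above (the sample html is invented): ASP.NET pages only accept the search
# POST if the hidden state fields from the initial GET are echoed back, so they
# are scraped with asp_args_regex and prepended to the post data.
import re
import urllib

asp_args_regex = re.compile('<input[^>]*name=\"(__[A-Z]*)\"[^>]*value=\"([^\"]*)\"[^>]*>')

sample_html = '''
<input type="hidden" name="__VIEWSTATE" value="dDwtMTIzNDU2Nzg5Ozs+" />
<input type="hidden" name="__EVENTVALIDATION" value="AQIDBAU=" />
'''

# re.findall gives a list of (name, value) pairs; make it a tuple so it can be
# concatenated with the rest of the form fields.
asp_args = tuple(re.findall(asp_args_regex, sample_html))

post_data = urllib.urlencode(asp_args + (("csbtnSearch", "Search"),))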
Example #26
class RutlandLikeParser:
    def __init__(self,
                 authority_name,
                 authority_short_name,
                 base_url,
                 debug=False):

        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url

        self.debug = debug

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        search_date = datetime.date(year, month, day)
        date_string = search_date.strftime(date_format)
        
        search_data = urllib.urlencode({"reference": "",
                                        "undecided": "yes",
                                        "dateFrom": date_string,
                                        "dateTo": date_string,
                                        "Address": "",
                                        "validate": "true",
                                        })


        request = urllib2.Request(self.base_url, search_data)
        response = urllib2.urlopen(request)

        html =  response.read()

        soup = BeautifulSoup(html)

        tables = soup.findAll("table", {"style": "width:auto;"})

        if not tables:
            return self._results

        # We don't want the first or last tr
        trs = tables[0].findAll("tr")[1:-1]

        for tr in trs:
            app = PlanningApplication()

            tds = tr.findAll("td")

            if len(tds) == 4:
                local_info_url = tds[0].a['href']
                app.info_url = urlparse.urljoin(self.base_url, local_info_url)
                app.council_reference = tds[0].a.string

                app.address = tds[1].string
                app.postcode = getPostcodeFromText(app.address)

                app.description = tds[2].string

                app.comment_url = urlparse.urljoin(self.base_url, comment_url_end %app.council_reference)
                app.date_received = search_date

                self._results.addApplication(app)

        return self._results


    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
Example #27
class BroxtoweParser:
    def __init__(self, *args):

        self.authority_name = "Broxtowe Borough Council"
        self.authority_short_name = "Broxtowe"
        self.base_url = "http://planning.broxtowe.gov.uk"

        self.info_url = "http://planning.broxtowe.gov.uk/ApplicationDetail.aspx?RefVal=%s"


        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        search_day = datetime.date(year, month, day)

        # Now get the search page
        get_response = urllib2.urlopen(self.base_url)
        get_soup = BeautifulSoup(get_response.read())

        # These are the inputs with a default value
        inputs_needed = [(x['id'], x['value']) for x in get_soup.form.findAll("input", value=True, type=lambda x: x != "submit")]

        # Add the submit button
        inputs_needed.append(('cmdWeeklyList', 'Search Database'))

        # We also need to add the date we want to search for.
        # This is the Friday after the date searched for.
        # At weekends this will get you the Friday before, but that isn't
        # a problem as there are no apps then.
        friday = search_day + datetime.timedelta(4 - search_day.weekday())

        inputs_needed.append(("ddlWeeklyList", friday.strftime(date_format)))

        # We'd like as many results as we can get away with on one page.
        # 50 is the largest option offered.
        inputs_needed.append(("ddlResultsPerPageWeeklyList", "50"))

        post_data = dict(inputs_needed)
        post_url = get_response.url

        # In case something goes wrong here, let's break out of the loop after at most 10 passes
        passes = 0

        while True:
            passes += 1

            post_response = urllib2.urlopen(post_url, urllib.urlencode(post_data))
            post_soup = BeautifulSoup(post_response.read())

            result_tables = post_soup.table.findAll("table")

            for result_table in result_tables:
                application = PlanningApplication()

                application.address = ', '.join(result_table.findPrevious("b").string.strip().split("\r"))
                application.postcode = getPostcodeFromText(application.address)

                trs = result_table.findAll("tr")

                application.council_reference = trs[0].findAll("td")[1].string.strip()
                application.date_received = datetime.datetime.strptime(trs[1].findAll("td")[1].string.strip(), date_format).date()
                application.description = trs[3].findAll("td")[1].string.strip()

                application.info_url = self.info_url %(urllib.quote(application.council_reference))

                # In order to avoid having to do a download for every app,
                # I'm setting the comment url to be the same as the info_url.
                # There is a comment page, which can be reached via a button on the info page.
                application.comment_url = application.info_url

                self._results.addApplication(application)

            # Which page are we on?
            page_no = int(post_soup.find("span", id="lblPageNo").b.string)
            total_pages = int(post_soup.find("span", id="lblTotalPages").b.string)

            if passes > 10 or not page_no < total_pages:
                break

            post_data = [
                ("__EVENTTARGET", "hlbNext"),
                ("__EVENTARGUMENT", ""),
                ("__VIEWSTATE", post_soup.find("input", id="__VIEWSTATE")['value']),
                ("__EVENTVALIDATION", post_soup.find("input", id="__EVENTVALIDATION")['value']),
                 ]

            post_url = urlparse.urljoin(post_response.url, post_soup.find("form")['action'])

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
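# A quick standalone check of the weekly-list date arithmetic above:
# weekday() is 0 for Monday and 4 for Friday, so adding (4 - weekday()) days
# lands on the Friday of the same working week, or the previous Friday when
# the search day falls on a Saturday or Sunday. The helper name is illustrative.
import datetime

def friday_for(search_day):
    return search_day + datetime.timedelta(4 - search_day.weekday())

# friday_for(datetime.date(2008, 7, 2))   # a Wednesday -> datetime.date(2008, 7, 4)
# friday_for(datetime.date(2008, 7, 5))   # a Saturday  -> datetime.date(2008, 7, 4)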
Example #28
class ShropshireParser:
    reference_input_name = "ApplNum"
    contact_email_name = "offemail"

    comment_url = None

    use_validated_date = False

    def _get_info_link_list(self, soup):
        return [tr.a for tr in soup.find("table", id="tbllist").findAll("tr", recursive=False)[:-1]]

    def _get_postcode(self, info_soup):
        return info_soup.find("input", {"name": "Postcode"})['value']

    def __init__(self, authority_name, authority_short_name, base_url, debug=False):
        self.debug = debug

        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url
        self._split_base_url = urlparse.urlsplit(base_url)
        
        self._current_application = None
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        search_date = datetime.date(year, month, day)
        search_date_string = search_date.strftime(date_format)

        if self.use_validated_date:
            received_search_string = ""
            validated_search_string = search_date_string
        else:
            received_search_string = search_date_string
            validated_search_string = ""

        search_data = urllib.urlencode([
                ("txtAppNum", ""),
                ("txtAppName", ""),
                ("txtAppLocn", ""),
                ("txtAppPCode", ""),
                ("txtAppRecFrom", received_search_string),
                ("txtAppRecTo", received_search_string),
                ("txtAppDecFrom", ""),
                ("txtAppDecTo", ""),
                ("txtAppValFrom", validated_search_string),
                ("txtAppValTo", validated_search_string),
                ("district_drop", ""),
                ("parish_drop", ""),
                ("ward_drop", ""),
                ("ft", "yes"),
                ("submit1", "Submit"),
                ])

        split_search_url = self._split_base_url[:3] + (search_data, '')
        search_url = urlparse.urlunsplit(split_search_url)

        response = urllib2.urlopen(search_url)
        soup = BeautifulSoup(response.read())

        # Handle the case where there are no apps
        if soup.find(text=re.compile("No applications matched your query")):
            return self._results


        info_link_list = self._get_info_link_list(soup)

        for app_link in info_link_list:
            self._current_application = PlanningApplication()

            # We could get this from the info soup, but as we already know it, why bother.
            self._current_application.date_received = search_date

            self._current_application.info_url = urlparse.urljoin(self.base_url, app_link['href'])
    
            # To get the postcode we will need to download each info page
            info_response = urllib2.urlopen(self._current_application.info_url)
            info_soup = BeautifulSoup(info_response.read())

            self._current_application.council_reference = info_soup.find("input", {"name": self.reference_input_name})['value']
            self._current_application.address = info_soup.find("textarea", {"name": "Location"}).string.strip()
            self._current_application.postcode = self._get_postcode(info_soup)
                        
            self._current_application.description = info_soup.find("textarea", {"name": "Proposal"}).string.strip()

            if self.comment_url:
                self._current_application.comment_url = self.comment_url
            else:
                self._current_application.comment_url = info_soup.find("input", {"name": self.contact_email_name})['value']

            # There is an OSGB position here :-)
            self._current_application.osgb_x = info_soup.find("input", {"name": "Easting"})['value']
            self._current_application.osgb_y = info_soup.find("input", {"name": "Northing"})['value']

            self._results.addApplication(self._current_application)

        return self._results


    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
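# A small standalone sketch of how the search url is assembled above (the base
# url here is made up; the field names are taken from the search data above):
# the urlencoded form fields become the query component of the base url via
# urlsplit/urlunsplit, so the search is a plain GET rather than a POST.
import urllib
import urlparse

def make_search_url(base_url, fields):
    query = urllib.urlencode(fields)
    split_url = urlparse.urlsplit(base_url)
    return urlparse.urlunsplit(split_url[:3] + (query, ''))

# e.g. make_search_url("http://example.gov.uk/planning/search.asp",
#                      [("txtAppRecFrom", "01/07/2008"), ("ft", "yes")])
# returns "http://example.gov.uk/planning/search.asp?txtAppRecFrom=01%2F07%2F2008&ft=yes"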
Example #29
class WestminsterParser:
    def __init__(self, *args):

        self.authority_name = "City of Westminster"
        self.authority_short_name = "Westminster"
        self.base_url = "http://www3.westminster.gov.uk/planningapplications/currentsearch-results.cfm"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        search_day = datetime.date(year, month, day)

#         post_data = [
#             ("EFNO", ""),
#             ("STName", ""),
#             ("STNUMB", ""),
#             ("ADRSNO", ""),
#             ("WARD", "AllWards"),
#             ("AGT", ""),
#             ("ATCDE", "AllApps"),
#             ("DECDE", "AllDecs"),
#             ("DTErec", search_day.strftime(date_format)),
#             ("DTErecTo", search_day.strftime(date_format)),
#             ("DTEvalid", ""),
#             ("DTEvalidTo", ""),
#             ("APDECDE", "AllAppDecs"),
#             ("submit", "Start+Search"),
#             ]
        post_data = "REFNO=&STName=&STNUMB=&ADRSNO=&WARD=AllWards&AGT=&ATCDE=AllApps&DECDE=AllDecs&DTErec=%(date)s&DTErecTo=%(date)s&DTEvalid=&DTEvalidTo=&APDECDE=AllAppDecs&submit=Start+Search" %{"date": search_day.strftime(date_format)}

        while post_data:
            

            # Now get the search page

#            sys.stderr.write("Fetching: %s\n" %self.base_url)
#            sys.stderr.write("post data: %s\n" %post_data) 
            
            response = urllib2.urlopen(self.base_url, post_data)

#            sys.stderr.write("Got it\n")
            soup = BeautifulSoup(response.read())

#            sys.stderr.write("Created soup\n")

            results_form = soup.find("form", {"name": "currentsearchresultsNext"})

            # Sort out the post_data for the next page, if there is one
            # If there is no next page then there will be no inputs in the form.
            # In this case, post_data will be '', which is false.

#            sys.stderr.write("Found form containing results\n")

            post_data = urllib.urlencode([(x['name'], x['value']) for x in results_form.findAll("input")])

#            sys.stderr.write("Got post data\n")

            # Each result has one link, and they are the only links in the form

            links = results_form.findAll("a")

#            sys.stderr.write("Got list of links\n")

            for link in links:

#                sys.stderr.write("Working on link: %s\n" %link['href'])

                application = PlanningApplication()

                application.date_received = search_day
                application.info_url = urlparse.urljoin(self.base_url, link['href'])
                application.council_reference = link.string.strip()

                application.address = link.findNext("td").string.strip()
                application.postcode = getPostcodeFromText(application.address)

                application.description = link.findNext("tr").findAll("td")[-1].string.strip()

                # To get the comment url, we're going to have to go to each info url :-(

#                sys.stderr.write("Fetching: %s\n" %application.info_url)
                info_response = urllib2.urlopen(application.info_url)
#                sys.stderr.write("Got it\n")

                info_soup = BeautifulSoup(info_response)

                comment_nav_string = info_soup.find(text="Comment on this case")
                if comment_nav_string:
                    application.comment_url = comment_nav_string.parent['href']
                else:
                    application.comment_url = "No Comments"

    #http://publicaccess.westminster.gov.uk/publicaccess/tdc/dcapplication/application_comments_entryform.aspx?caseno=K586GHRP03500

                self._results.addApplication(application)

#                sys.stderr.write("Finished that link\n")


#        sys.stderr.write("Finished while loop, returning stuff.\n")

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
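# A stripped-down, hypothetical sketch of the paging pattern above: keep
# re-posting whatever inputs the "currentsearchresultsNext" form contains, and
# stop when the form has no inputs left (urlencode then returns '', which is
# false). The generator below is illustrative and not part of WestminsterParser.
import urllib
import urllib2
from BeautifulSoup import BeautifulSoup

def fetch_result_pages(url, first_post_data, form_name="currentsearchresultsNext"):
    """Yield a BeautifulSoup object for each page of results in turn."""
    post_data = first_post_data
    while post_data:
        soup = BeautifulSoup(urllib2.urlopen(url, post_data).read())
        yield soup
        next_form = soup.find("form", {"name": form_name})
        if next_form is None:
            break
        post_data = urllib.urlencode(
            [(x['name'], x['value']) for x in next_form.findAll("input")])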
Example #30
class ShetlandParser:
    def __init__(self, *args):

        self.authority_name = "Shetland Islands Council"
        self.authority_short_name = "Shetland Islands"
        self.base_url = "http://www.shetland.gov.uk/planningcontrol/apps/apps.asp?time=14&Orderby=DESC&parish=All&Pref=&Address=&Applicant=&ApplicantBut=View&sortby=PlanRef&offset=%d"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        search_date = datetime.datetime(year, month, day)

        offset = 0

        # First get the search page
        response = urllib2.urlopen(self.base_url %(offset))
        
        contents = response.read()

        # First let's find out how many records there are (they are displayed ten per page).
        match = page_count_regex.search(contents)        
        app_count = int(match.groups()[0])

        while offset < app_count:
            if offset != 0:
                contents = urllib2.urlopen(self.base_url %(offset)).read()

            soup = BeautifulSoup(contents)
            
            # The apps are in the 5th table on the page (not a very good way to get it...)
            results_table = soup.findAll("table")[5]

            # Now we need to find the trs which contain the apps.
            # The first TR is just headers.
            # After that they alternate between containing an app and just some display graphics
            # until the third from last. After that, they contain more rubbish.

            trs = results_table.findAll("tr")[1:-2]

            for i in range(len(trs)):
                # We are only interested in the trs in even positions in the list.
                if i % 2 == 0:
                    tr = trs[i]

                    application = PlanningApplication()

                    comment_url_element = tr.find(text="comment on this planning application").parent
                    application.date_received = datetime.datetime(*(time.strptime(comment_url_element.findNext("td").string.strip(), date_format)[0:6]))

                    # If the date of this application is earlier than the date 
                    # we are searching for then don't download it.
                    # We could optimize this a bit more by not doing the later pages.

                    if application.date_received < search_date:
                        break

                    application.council_reference = tr.a.string

                    application.comment_url = urlparse.urljoin(self.base_url, comment_url_element['href'])

                    application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])

                    info_response = urllib2.urlopen(application.info_url)

                    info_soup = BeautifulSoup(info_response.read())

                    info_table = info_soup.findAll("table")[2]

                    application.description = info_table.find(text="Proposal:").findNext("td").contents[0].strip()
                    application.postcode = info_table.find(text="Postcode:").findNext("td").contents[0].strip()

                    # Now to get the address. This will be split across several tds.

                    address_start_td = info_table.find("td", rowspan="4")

                    # We need the first bit of the address from this tr
                    address_bits = [address_start_td.findNext("td").string.strip()]

                    # We will need the first td from the next three trs after this
                    for address_tr in address_start_td.findAllNext("tr")[:3]:
                        address_line = address_tr.td.string.strip()

                        if address_line:
                            address_bits.append(address_line)

                    address_bits.append(application.postcode)

                    application.address = ', '.join(address_bits)

                    self._results.addApplication(application)
                    
            offset += 10

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
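# A hedged standalone sketch of the offset paging used above. page_count_regex
# is defined elsewhere in the real module; the pattern below is only an
# assumption for illustration. The first page reports how many records matched,
# and further pages are then fetched ten at a time via the offset parameter
# (stopping early once applications older than the search date appear).
import re

page_count_regex = re.compile(r"(\d+) records? found", re.IGNORECASE)  # assumed pattern

def offsets_for(contents, page_size=10):
    """Return the offsets needed to page through all the matching records."""
    app_count = int(page_count_regex.search(contents).groups()[0])
    return range(0, app_count, page_size)

# e.g. offsets_for("23 records found") returns [0, 10, 20]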