def main():
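  # Resume from a saved JSON list of address columns if a previous run was
  # interrupted; otherwise start from the full COLUMNS list.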
  if get_var('columns_to_do') == None:
    columns = COLUMNS
  else:
    columns = loads(get_var('columns_to_do'))

  while len(columns) > 0:
    column = columns[0]
    d = load_data(column)
    for row in d:
      p = Place(row[column], (row['latitude'], row['longitude']))
      row_geocode = p.geocode()
      row_geocode.update({
        "address-column": column,
        "branchId": row['branchId']
      })
      sleep(3)
      save([], row_geocode, 'geocoded')
    columns.remove(column)

    if len(columns) == 0:
      save_var('columns_to_do',None)
    else:
      save_var('columns_to_do',dumps(columns))
Example #2
def main():
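    # Work out the final listing page and where the previous run stopped:
    # step1 crawls the remaining listing pages, step2 then browses the saved location URLs.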
    #finalpage=get_var('finalpage')
    prevpage = get_var('prevpage')

    #if None==finalpage:
    if True:
        finalpage = int(get_lastpage(getpage(1)))
        save_var('finalpage', finalpage)
    if None == prevpage:
        prevpage = 1

    if prevpage < finalpage:
        step1(prevpage, finalpage)
    elif prevpage == finalpage:
        if not "step2completion" in show_tables():
            execute(
                'create table `step2completion` (`url` text, `browsed` boolean)'
            )
            execute("""
        INSERT INTO `step2completion`
        ( url , browsed )
        SELECT url, 0 as "browsed"
        FROM locations
        """)
            commit()
        step2()
def main():
  if get_var('skip')==None:
    save_var('skip',0)
  routesTable=getroutes()
  for row in routesTable:
    if row['key'][0:2]!=row['key'][2:4]:
      get_route_schedules(row['id'],row['key'])
def main():
    if get_var('skip') == None:
        save_var('skip', 0)
    routesTable = getroutes()
    for row in routesTable:
        if row['key'][0:2] != row['key'][2:4]:
            get_route_schedules(row['id'], row['key'])
Example #5
def shallow_scrape():
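    # Resume from the page after the one saved in 'last_page' and walk at most
    # six result pages per run, yielding the six-digit URN from each result link.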
    br = mechanize.Browser()

    c = sqlite.get_var("last_page", 0) + 1
    max_c = c + 6

    resultspage = br.open("http://www.education.gov.uk/edubase/quickSearchResult.xhtml?page=%d" % c)

    while c < max_c:
        print ""
        print "Handling page %d..." % c
        print "  [" + br.geturl() + "]"

        ### extract data from page
        page = html.parse(resultspage)

        for u in page.getroot().findall("body/div/div/div/div/table/tr/td/table/tbody/tr/td/a"):
            urn = re.search("urn=([0-9]{6})", u.get("href")).group(1)
            yield urn

        ### get new page
        try:
            resultspage = br.follow_link(text="Next")
            sqlite.save_var("last_page", c)

            c += 1
            if c % 2 == 0:
                time.sleep(10)

        except mechanize.LinkNotFoundError:
            c += 1
            sqlite.save_var("last_page", 0)
            break
def main():
  if None==get_var('downloaded'):
    download()
    save_var('downloaded',1)
  execute('DROP TABLE IF EXISTS `final`')
  clean()
  save_var('downloaded',None)
def wayback(url):
  """Download from the wayback machine."""
  xml=pull(url)
  try:
    parse(url,xml,suffix='_wayback')
    url=xml.xpath('//a[img[@src="http://staticweb.archive.org/images/toolbar/wm_tb_prv_on.png"]]')[0].attrib['href']
    print url
    wayback(url)
  except:
    save_var('wayback_url',url)
def main():
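    # Count how many times this scraper has run; give nonsense() one retry
    # before passing the run number to exceeded().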
    foo=get_var('runId')
    runId=1 if foo==None else foo+1
    save_var('runId',runId)
    try:
        nonsense()
    except:
        try:
            nonsense()
        except:
            exceeded(runId)
def main():
  b=PostbankBrowser()
  branches=b.get_branch_list()
  if FIRST_RUN:
    save_branches(branches)

  for branchId in select_branchIds(branches):
    b.load_branch(branchId)
    d=b.get_branch_info()
    d['branchId']=branchId
    save([],d,'branch_info')
    save_var('previous_branchId',branchId)

  save_var('previous_branchId',None)
Example #10
def cp1():
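  # Ensure the child table exists, then either resume from the last page of the
  # newest run (if the previous run crashed) or start a fresh run.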
  execute('''
CREATE TABLE IF NOT EXISTS `businessPremises` (
  `date_scraped` REAL,
  `businessPremisesURL` TEXT,
  FOREIGN KEY (date_scraped, businessPremisesUrl)
  REFERENCES cp1(date_scraped, businessPremisesUrl)
)
''')

  if get_var('crashed') == 1:
    pagenum = select('max(pagenum) from cp1 where date_scraped = (select max(date_scraped) from cp1)')[0]['max(pagenum)']
    print "Resuming from page %d" % pagenum
    p = Page('CP1')
    p = Page('CP1', s=p.s, pagenum=pagenum)
  else:
    print "Starting a new run"
    p = Page('CP1')

  while p.lastpage()==False:
    print "Beginning page %d" % p.pagenum
    tables=p.table().subtables()
    d = []
    for table in tables:
        row = table.parse()
        row['businessPremisesURL'] = table.business_premises_url()

        try:
            business_premises_data, more_registrant_data = table.business_premises(p.s)
        except Exception, msg:
            print "Error on %s: %s" % (table.business_premises_url(), msg)
            sleep(60)
            print "Trying again"
            business_premises_data, more_registrant_data = table.business_premises(p.s)

        row['date_scraped']=DATE
        row['pagenum']=p.pagenum
        row['url']=URL+"?page=%d"%p.pagenum

        row.update(more_registrant_data)

        save([], business_premises_data, 'businessPremises')
        save(['date_scraped', 'businessPremisesURL'],row,'cp1')

        sleep(1)
    save_var('crashed', 1)
    p=p.next25()
def step1(prevpage,finalpage):
  for page in range(prevpage,finalpage+1):
    try:
      theaters_in=get_theaters(page)
    except BadStatusLine:
      url=URLS["ct-base"]+str(page)
      save(['url'],{"url":url,"scrape_error":'BadStatusLine'},'errors')
      continue

    theatres_out=[]
    for theater in theaters_in:
      info=theater_info(theater)
      info=clean_info(info)
      theatres_out.append(info2dictRow(info,page))

    save(['url'],theatres_out,'locations')
    sleep(INTERVAL)
    save_var('prevpage',page)
def get_route_schedules(routeId,route):
  #Check that it's not a route within one city
  assert route[0:2]!=route[2:4]

  xml,theurl=grab(route)
  save(['routeId','url'],{
    "routeId":routeId
  , "url":theurl
  },'urls')

  try:
    table=get_table(xml)
  except:
    save([],{"url":theurl},'errors')
  else:
    d_raw=parse_table(table)
    d=[]

    for row_raw in d_raw:
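      # Header keys look like "From:City:Stop" or "To:City:Stop" (or ":Route/Trip"),
      # so split each key and sort its value into the matching from/to fields.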
      row_clean={}
      for key in row_raw:
        if key==":Route/Trip":
          row_clean['routeNum']=row_raw[key]
        else:
          foo,bar,baz=key.split(':')
          if foo=="From":
            row_clean['fromCity']=bar
            row_clean['fromStop']=baz
            row_clean['fromTime']=row_raw[key]
          elif foo=="To":
            row_clean['toCity']=bar
            row_clean['toStop']=baz
            row_clean['toTime']=row_raw[key]
      row_clean['routeId']=routeId

      if row_clean['toStop']=='megabus.com stop' and row_clean['fromStop']=='megabus.com stop':
        table_name='megabus'
      else:
        table_name='schedules'

      save([],row_clean,table_name)
    save_var('skip',get_var('skip')+1)
Example #13
def step1(prevpage, finalpage):
    for page in range(prevpage, finalpage + 1):
        try:
            theaters_in = get_theaters(page)
        except BadStatusLine:
            url = URLS["ct-base"] + str(page)
            save(['url'], {
                "url": url,
                "scrape_error": 'BadStatusLine'
            }, 'errors')
            continue

        theatres_out = []
        for theater in theaters_in:
            info = theater_info(theater)
            info = clean_info(info)
            theatres_out.append(info2dictRow(info, page))

        save(['url'], theatres_out, 'locations')
        sleep(INTERVAL)
        save_var('prevpage', page)
def get_route_schedules(routeId, route):
    #Check that it's not a route within one city
    assert route[0:2] != route[2:4]

    xml, theurl = grab(route)
    save(['routeId', 'url'], {"routeId": routeId, "url": theurl}, 'urls')

    try:
        table = get_table(xml)
    except:
        save([], {"url": theurl}, 'errors')
    else:
        d_raw = parse_table(table)
        d = []

        for row_raw in d_raw:
            row_clean = {}
            for key in row_raw:
                if key == ":Route/Trip":
                    row_clean['routeNum'] = row_raw[key]
                else:
                    foo, bar, baz = key.split(':')
                    if foo == "From":
                        row_clean['fromCity'] = bar
                        row_clean['fromStop'] = baz
                        row_clean['fromTime'] = row_raw[key]
                    elif foo == "To":
                        row_clean['toCity'] = bar
                        row_clean['toStop'] = baz
                        row_clean['toTime'] = row_raw[key]
            row_clean['routeId'] = routeId

            if row_clean['toStop'] == 'megabus.com stop' and row_clean[
                    'fromStop'] == 'megabus.com stop':
                table_name = 'megabus'
            else:
                table_name = 'schedules'

            save([], row_clean, table_name)
        save_var('skip', get_var('skip') + 1)
Example #15
def main():
  if None==get_var('DATE'):
    save_var('DATE',time())

  searchTerms=get_searchTerms()
  for searchTerm in searchTerms:
    d=paginate(searchTerm)
    for row in d:
      row['date_scraped']=get_var('DATE')
      row['searchTerm']=searchTerm

    save_var('previous_searchTerm',searchTerm)
    save(['date_scraped', 'Name'],d,'initial')

  save_var('previous_searchTerm',None)
  save_var('DATE',None)
def download(abridge=False):
  d=[]

  #Resume the saved provinces
  provinces=getprovinces()
  province=get_var('province')
  province=provinces[0] if province==None else loads(province)

  #Put the date in. This will get passed along, so this is the only time I add it.
  province['date_scraped']=get_var('DATE', int(time()))

  #Get the cities
  cities=getcities(province['provinceId'])

  for city in cities:
    #Pass along the province
    city.update(province)

    branches=getbranches_with_info(city['cityId'])
    for branch in branches:
      #print branch
      branch.update(city)
      d.append(branch)

    if abridge:
      break

  i=provinces.index(province)+1
  print provinces
  if i<len(provinces):
    save_var('province',dumps(provinces[i]))
    print('Finished with branches in %s' % province['provinceName'])
  else:
    save_var('province',None)
    print('Finished with all the downloading!')

  save([],d,'initial')
Example #17
            'date_scraped': DATE,
            'businessPremisesURL': url
        })
    save(['date_scraped', 'businessPremisesURL'], data, 'business_premises')
    randomsleep()

#execute('DELETE from cp1 where date_scraped != (select min(date_scraped) from cp1)')
#execute('DELETE from businessPremises where date_scraped != (select min(date_scraped) from cp1)')
#commit()

#execute('UPDATE swvariables SET value_blob = (select min(date_scraped) from cp1) where name = "DATE"')
#commit()

Test(download=False)
cp1()
save_var('crashed', 0)
execute('DELETE FROM swvariables WHERE name = "DATE"')
commit()

from scraperwiki.sqlite import save, select, execute, save_var, get_var, commit
from scraperwiki import swimport
from requests import session
from lxml.html import fromstring, tostring
import re
from time import time, sleep
keyify=swimport('keyify').keyify

URL="http://www.ncr.org.za/register_of_registrants/index.php"

#DEV=True
DEV=False

DATE = get_var('DATE', time())
Example #18
def grab(from_city,from_state,to_city,to_state):
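  # Fetch the schedule page for this city pair and flatten its timetable into
  # key/value rows tied to an origin-destination id (odId) and a schedule id (sId).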
  theurl=url(from_city,from_state,to_city,to_state)
  opener = build_opener(HTTPCookieProcessor())

  try:
    o=opener.open(theurl)
  except BadStatusLine:
    return None

  xml=fromstring(o.read())
  if not route_exists(xml):
    return None

  try:
    table=xml.xpath('//table[tr[@class="tableHilightHeader"]]')[0]
  except:
    save([],{
      "from_city":from_city
    , "from_stat":from_state
    , "to_city":to_city
    , "to_state":to_state
    },'errors')
    return None

  #cities=table.xpath('tr[position()=1]/td')
  schedules=table.xpath('tr[position()>2]')
  columns=get_columns(table)

  #Get the id
  odId=get_var('origin_destination_id')
  sId=get_var('schedule_id')
  if None==odId:
    odId=1
  if None==sId:
    sId=1

  #Initialize for the loop
  d=[]
  on_fromstops=True

  for schedule in schedules:
    times=schedule.xpath('td/child::node()[position()=1]')
    #times.pop()
    #times.append(schedule.xpath('td/text()')[-1])
    print zip(times,columns)
    #assert False
    for value,column in zip(times,columns):
      if "days"==column:
        row={"key":"days"}
      elif "arrow"==column:
        on_fromstops=False
        continue
      elif "Route/Trip"==column:
        row={"key":"route_code"}

      elif on_fromstops:
        row={
          "key":"fromstop"
        , "stop":column
        }
      elif not on_fromstops:
        row={
          "key":"tostop"
        , "stop":column
        }
      #End if statement
      row.update({
        "value":value
      , "sId":sId
      , "odId":odId
      })
      d.append(row)
    #End for loop
    sId+=1
  #End for loop

  #Save origin-destination information
  save(['id'],{
    "id":odId
  , "from_city":from_city
  , "from_stat":from_state
  , "to_city":to_city
  , "to_state":to_state
  },'origin_destinations')

  #Save schedule information
  save([],d,'schedules')

  odId+=1
  save_var('origin_destination_id',odId)
  save_var('schedule_id',sId)
      return __VARS[a]
  def save_var(a,b):
    __VARS[a]=b

  def options(*args,**kwargs):
    return [{"branchId":"174","branchName":"DUNNO"}]
else:
  options=swimport('options').options

URL="http://www.postbank.co.za/contact.aspx?ID=3"

def log(foo):
  print(foo)

if get_var('previous_branchId')==None:
  save_var('DATE',time())
  FIRST_RUN=True
else:
  FIRST_RUN=False

DATE=get_var('DATE')

def main():
  b=PostbankBrowser()
  branches=b.get_branch_list()
  if FIRST_RUN:
    save_branches(branches)

  for branchId in select_branchIds(branches):
    b.load_branch(branchId)
    d=b.get_branch_info()
Example #20
import json
import requests
import scraperwiki.sqlite as db
import time

begin = 1

counciltype = json.loads(requests.get('http://mapit.mysociety.org/areas/LBO').content)
for council, data1 in counciltype.items():
    if(db.get_var('id') == council and begin == 0):
        begin = 1
    if(begin == 1):
        print data1['name']
        db.save_var('id', council)
        children = json.loads(requests.get('http://mapit.mysociety.org/area/%s/children' % council).content)
        for id, data in children.items(): 
                #time.sleep(1)
                json.loads(requests.get('http://mapit.mysociety.org/area/%s' % id).content)
                if (data['type'] == 'LBW'):            
                    #time.sleep(0.1)
                    kml = requests.get('http://mapit.mysociety.org/area/%s.kml' % id).content
                    councildata = {'type': data['type'],
                                   'parent_name': data1['name'],
                                   'id': int(id),
                                   'name': data['name'],
                                   'kml': kml[85:-7]}
                    db.save(['id'], councildata, verbose=0)

import json
import requests
import scraperwiki.sqlite as db
import time
from scraperwiki.sqlite import save_var,execute,commit,show_tables
import os

if "swvariables" in show_tables():
  execute("DROP TABLE swvariables;")

before=set(os.listdir('.'))
save_var('foo','bar')
#os.system('rm *.pyc')
after=set(os.listdir('.'))

#print before-after
#print after
s=[]
for f in after:
  if f[0:4]!='data' and f[-3:]!='pyc':
    s.append(f)

print s

baz=[]
baz.append('script.rb')
baz.append('.cache')
for f in baz:
  print open(f).read()
from scraperwiki.sqlite import save_var,execute,commit,show_tables
import os

if "swvariables" in show_tables():
  execute("DROP TABLE swvariables;")
from scraperwiki.sqlite import save_var, execute, commit, show_tables
import os

if "swvariables" in show_tables():
    execute("DROP TABLE swvariables;")

before = set(os.listdir('.'))
save_var('foo', 'bar')
#os.system('rm *.pyc')
after = set(os.listdir('.'))

#print before-after
#print after
s = []
for f in after:
    if f[0:4] != 'data' and f[-3:] != 'pyc':
        s.append(f)

print s

baz = []
baz.append('script.rb')
baz.append('.cache')
for f in baz:
    print open(f).read()
from scraperwiki.sqlite import save_var, execute, commit, show_tables
import os

if "swvariables" in show_tables():
    execute("DROP TABLE swvariables;")
      link = row.find('td/a')
      if link is None or not 'ShipDetails' in link.get('href'):
          continue
      number, name, type, dwt, built, flag, _ = map(lambda c: c.text, row)
      
      d.append({
        'number': number,
        'name': name, 
        'type': type,
        'dwt': dwt,
        'built': built,
        'flag': flag
      })
    save(['number'], d)
    i+=1
    save_var('page',i)

from scraperwiki.sqlite import save, get_var, save_var
from lxml import html

URL = "http://www.e-ships.net/new/?View=ShipSearchResult"
URL += "&ship_name=&fdwt=&tdwt=&last_ex_name=&fgt=&tgt=&imo=&fnrt=&tnrt=&ship_type=-1&fteu=&tteu=&"
URL += "flag=-1&floa=&tloa=&ship_class=-1&fbeam=&tbeam=&call_sign=&fdraft=&tdraft=&owner_id="
URL += "&fbuilt=&tbuilt=&manager_id=&fengine_kw_total=&tengine_kw_total=&builder_id=&fengine_hp_total="
URL += "&tengine_hp_total=&sortby=ship_name&p=%s"

i=get_var('page')
if i==None:
  i=0
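# Page through the ship search results until a page with no data rows,
# saving progress in the 'page' variable after each page.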
while i<=1174:
    doc = html.parse(URL % i).getroot()
    rows = doc.xpath('//tr')
    if len(rows) == 1:
def save_state(js,level):
  """Save the second-to-next directory to be searched."""
  save_var(str(level),js)
    return branch

def parse_maphref(maphref):
    html=maphref.split("'")[1].replace('<br>','')
    x=fromstring(html)
    keys=["map_%s" % keyify(key) for key in x.xpath('strong/text()')]
    values=x.xpath('text()')
    return dict(zip(keys,values))

execute('CREATE TABLE IF NOT EXISTS provinces (provinceUrl TEXT )')
execute('CREATE TABLE IF NOT EXISTS cities (provinceUrl TEXT, cityUrl TEXT, FOREIGN KEY(provinceUrl) REFERENCES provinces(provinceUrl) )')
execute('CREATE TABLE IF NOT EXISTS branches (cityUrl TEXT, branchUrl TEXT, FOREIGN KEY(cityUrl) REFERENCES cities(cityUrl) )')
commit()

scraperrun = get_var('scraperrun', int(time()))
save_var('scraperrun', scraperrun)
seed([Menu(URLS['main'])])
execute('delete from swvariables where name = "scraperrun"')
commit()

from lxml.html import fromstring
#from lxml.etree import fromstring
from time import time
import requests
from scraperwiki.sqlite import save,save_var, get_var, select, commit, execute
from scraperwiki import swimport
options=swimport('options').options
keyify=swimport('keyify').keyify
randomsleep=swimport('randomsleep').randomsleep
from json import loads,dumps
strip_address = swimport('strip_address').strip_address

# --------------------------------------------------
Example #26
            else:
                row['enter_manually'] = 1

            row.update({'date_scraped': time(), 'ScraperRun': scraper_run, 'url': self.url, 'Record': int(self.url.split('=')[-1])})
            data.append(row)

        save([], data, 'BusinessPremises')

execute('CREATE TABLE IF NOT EXISTS Registrant (ScraperRun INTEGER, Record INTEGER)')
execute('CREATE INDEX IF NOT EXISTS RegistrantRecord ON Registrant(record)')
execute('CREATE TABLE IF NOT EXISTS BusinessPremises (ScraperRun INTEGER, Record INTEGER, FOREIGN KEY(Record) REFERENCES Registrant(Record))')
execute('CREATE INDEX IF NOT EXISTS BusinessPremisesRecord ON BusinessPremises(ScraperRun, Record)')
commit()

if "stack" not in show_tables() or select('count(*) as "c" from stack')[0]['c'] == 0:
    save_var('scraper_run', int(time()))

scraper_run = get_var('scraper_run', None)
if scraper_run == None:
    raise NameError('scraper_run is not defined.') 

seed([SearchResults(None)])
#seed([BusinessPremises('http://www.ncr.org.za/register_of_registrants/viewpremises.php?record=11296')])

from scraperwiki.sqlite import save, select, execute, save_var, get_var, commit, show_tables
from scraperwiki import swimport
#from requests import session
import requests
from lxml.html import fromstring, tostring
import re
from time import time, sleep
keyify=swimport('keyify').keyify
randomsleep=swimport('randomsleep').randomsleep
Example #27
URL += "&fbuilt=&tbuilt=&manager_id=&fengine_kw_total=&tengine_kw_total=&builder_id=&fengine_hp_total="
URL += "&tengine_hp_total=&sortby=ship_name&p=%s"

i=get_var('page')
if i==None:
  i=0
while i<=1174:
    doc = html.parse(URL % i).getroot()
    rows = doc.xpath('//tr')
    if len(rows) == 1:
        break

    d=[]
    for row in rows:
      link = row.find('td/a')
      if link is None or not 'ShipDetails' in link.get('href'):
          continue
      number, name, type, dwt, built, flag, _ = map(lambda c: c.text, row)
      
      d.append({
        'number': number,
        'name': name, 
        'type': type,
        'dwt': dwt,
        'built': built,
        'flag': flag
      })
    save(['number'], d)
    i+=1
    save_var('page',i)
Example #28
def save_state(js, level):
    """Save the second-to-next directory to be searched."""
    save_var(str(level), js)
Example #29
'''
I figure out how different types are saved to the database.
'''
import datetime
from scraperwiki.sqlite import save, save_var, execute, commit

execute('drop table if exists swdata')
execute('drop table if exists swvariables')
execute('drop table if exists complex')
commit()

for t in {dict, list, set, str, unicode, bool, int, float, long}:
    save_var(str(t), t())
    save([], {str(t).replace("'","").replace(' ', '').replace('<type', '').replace('>', ''): t()})

save([], {
    u'list': [u'thing'],
    u'dict': {u'key': u'value'},
    u'set': {u'thing'},
}, 'complex')
def process_response(response):
    print response.url
    doc = prepare_doc(response.text)
    gather_items_from(doc)
    db.save_var(response.url, 1, verbose=0)
Example #31
import json
import requests
import scraperwiki.sqlite as db
import time

begin = 0

counciltype = json.loads(requests.get("http://mapit.mysociety.org/areas/DIS").content)
time.sleep(1)
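# begin stays 0 until the council id saved by the previous run is reached,
# so the scrape resumes from there; district wards (type DIW) are saved with their KML boundaries.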
for council, data1 in counciltype.items():
    print data1["name"]
    if db.get_var("id") == council and begin == 0:
        begin = 1
    if begin == 1:
        db.save_var("id", council)
        children = json.loads(requests.get("http://mapit.mysociety.org/area/%s/children" % council).content)
        time.sleep(1)
        for id, data in children.items():
            json.loads(requests.get("http://mapit.mysociety.org/area/%s" % id).content)
            time.sleep(1)
            if data["type"] == "DIW":
                kml = requests.get("http://mapit.mysociety.org/area/%s.kml" % id).content
                time.sleep(1)
                councildata = {
                    "type": data["type"],
                    "parent_name": data1["name"],
                    "id": int(id),
                    "name": data["name"],
                    "kml": kml[85:-7],
                }
                db.save(["id"], councildata, verbose=0)
Example #32
URL += "&fbuilt=&tbuilt=&manager_id=&fengine_kw_total=&tengine_kw_total=&builder_id=&fengine_hp_total="
URL += "&tengine_hp_total=&sortby=ship_name&p=%s"

i = get_var('page')
if i == None:
    i = 0
while i <= 1174:
    doc = html.parse(URL % i).getroot()
    rows = doc.xpath('//tr')
    if len(rows) == 1:
        break

    d = []
    for row in rows:
        link = row.find('td/a')
        if link is None or not 'ShipDetails' in link.get('href'):
            continue
        number, name, type, dwt, built, flag, _ = map(lambda c: c.text, row)

        d.append({
            'number': number,
            'name': name,
            'type': type,
            'dwt': dwt,
            'built': built,
            'flag': flag
        })
    save(['number'], d)
    i += 1
    save_var('page', i)