def main():
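  # Resume from a saved JSON list of address columns if a previous run was
  # interrupted; otherwise start from the full COLUMNS list.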
  if get_var('columns_to_do') == None:
    columns = COLUMNS
  else:
    columns = loads(get_var('columns_to_do'))

  while len(columns) > 0:
    column = columns[0]
    d = load_data(column)
    for row in d:
      p = Place(row[column], (row['latitude'], row['longitude']))
      row_geocode = p.geocode()
      row_geocode.update({
        "address-column": column,
        "branchId": row['branchId']
      })
      sleep(3)
      save([], row_geocode, 'geocoded')
    columns.remove(column)

    if len(columns) == 0:
      save_var('columns_to_do',None)
    else:
      save_var('columns_to_do',dumps(columns))
Example #2
def main():
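    # Work out the final listing page and where the previous run stopped:
    # step1 crawls the remaining listing pages, step2 then browses the saved location URLs.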
    #finalpage=get_var('finalpage')
    prevpage = get_var('prevpage')

    #if None==finalpage:
    if True:
        finalpage = int(get_lastpage(getpage(1)))
        save_var('finalpage', finalpage)
    if None == prevpage:
        prevpage = 1

    if prevpage < finalpage:
        step1(prevpage, finalpage)
    elif prevpage == finalpage:
        if not "step2completion" in show_tables():
            execute(
                'create table `step2completion` (`url` text, `browsed` boolean)'
            )
            execute("""
        INSERT INTO `step2completion`
        ( url , browsed )
        SELECT url, 0 as "browsed"
        FROM locations
        """)
            commit()
        step2()
def main():
  if get_var('skip')==None:
    save_var('skip',0)
  routesTable=getroutes()
  for row in routesTable:
    if row['key'][0:2]!=row['key'][2:4]:
      get_route_schedules(row['id'],row['key'])
def main():
    if get_var('skip') == None:
        save_var('skip', 0)
    routesTable = getroutes()
    for row in routesTable:
        if row['key'][0:2] != row['key'][2:4]:
            get_route_schedules(row['id'], row['key'])
Example #5
def shallow_scrape():
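    # Resume from the page after the one saved in 'last_page' and walk at most
    # six result pages per run, yielding the six-digit URN from each result link.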
    br = mechanize.Browser()

    c = sqlite.get_var("last_page", 0) + 1
    max_c = c + 6

    resultspage = br.open("http://www.education.gov.uk/edubase/quickSearchResult.xhtml?page=%d" % c)

    while c < max_c:
        print ""
        print "Handling page %d..." % c
        print "  [" + br.geturl() + "]"

        ### extract data from page
        page = html.parse(resultspage)

        for u in page.getroot().findall("body/div/div/div/div/table/tr/td/table/tbody/tr/td/a"):
            urn = re.search("urn=([0-9]{6})", u.get("href")).group(1)
            yield urn

        ### get new page
        try:
            resultspage = br.follow_link(text="Next")
            sqlite.save_var("last_page", c)

            c += 1
            if c % 2 == 0:
                time.sleep(10)

        except mechanize.LinkNotFoundError:
            c += 1
            sqlite.save_var("last_page", 0)
            break
def main():
  if None==get_var('downloaded'):
    download()
    save_var('downloaded',1)
  execute('DROP TABLE IF EXISTS `final`')
  clean()
  save_var('downloaded',None)
def wayback(url):
  """Download from the wayback machine."""
  xml=pull(url)
  try:
    parse(url,xml,suffix='_wayback')
    url=xml.xpath('//a[img[@src="http://staticweb.archive.org/images/toolbar/wm_tb_prv_on.png"]]')[0].attrib['href']
    print url
    wayback(url)
  except:
    save_var('wayback_url',url)
def main():
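    # Count how many times this scraper has run; give nonsense() one retry
    # before passing the run number to exceeded().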
    foo=get_var('runId')
    runId=1 if foo==None else foo+1
    save_var('runId',runId)
    try:
        nonsense()
    except:
        try:
            nonsense()
        except:
            exceeded(runId)
def main():
  b=PostbankBrowser()
  branches=b.get_branch_list()
  if FIRST_RUN:
    save_branches(branches)

  for branchId in select_branchIds(branches):
    b.load_branch(branchId)
    d=b.get_branch_info()
    d['branchId']=branchId
    save([],d,'branch_info')
    save_var('previous_branchId',branchId)

  save_var('previous_branchId',None)
Example #10
def cp1():
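  # Ensure the child table exists, then either resume from the last page of the
  # newest run (if the previous run crashed) or start a fresh run.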
  execute('''
CREATE TABLE IF NOT EXISTS `businessPremises` (
  `date_scraped` REAL,
  `businessPremisesURL` TEXT,
  FOREIGN KEY (date_scraped, businessPremisesUrl)
  REFERENCES cp1(date_scraped, businessPremisesUrl)
)
''')

  if get_var('crashed') == 1:
    pagenum = select('max(pagenum) from cp1 where date_scraped = (select max(date_scraped) from cp1)')[0]['max(pagenum)']
    print "Resuming from page %d" % pagenum
    p = Page('CP1')
    p = Page('CP1', s=p.s, pagenum=pagenum)
  else:
    print "Starting a new run"
    p = Page('CP1')

  while p.lastpage()==False:
    print "Beginning page %d" % p.pagenum
    tables=p.table().subtables()
    d = []
    for table in tables:
        row = table.parse()
        row['businessPremisesURL'] = table.business_premises_url()

        try:
            business_premises_data, more_registrant_data = table.business_premises(p.s)
        except Exception, msg:
            print "Error on %s: %s" % (table.business_premises_url(), msg)
            sleep(60)
            print "Trying again"
            business_premises_data, more_registrant_data = table.business_premises(p.s)

        row['date_scraped']=DATE
        row['pagenum']=p.pagenum
        row['url']=URL+"?page=%d"%p.pagenum

        row.update(more_registrant_data)

        save([], business_premises_data, 'businessPremises')
        save(['date_scraped', 'businessPremisesURL'],row,'cp1')

        sleep(1)
    save_var('crashed', 1)
    p=p.next25()
def step1(prevpage,finalpage):
  for page in range(prevpage,finalpage+1):
    try:
      theaters_in=get_theaters(page)
    except BadStatusLine:
      url=URLS["ct-base"]+str(page)
      save(['url'],{"url":url,"scrape_error":'BadStatusLine'},'errors')
      continue

    theatres_out=[]
    for theater in theaters_in:
      info=theater_info(theater)
      info=clean_info(info)
      theatres_out.append(info2dictRow(info,page))

    save(['url'],theatres_out,'locations')
    sleep(INTERVAL)
    save_var('prevpage',page)
def get_route_schedules(routeId,route):
  #Check that it's not a route within one city
  assert route[0:2]!=route[2:4]

  xml,theurl=grab(route)
  save(['routeId','url'],{
    "routeId":routeId
  , "url":theurl
  },'urls')

  try:
    table=get_table(xml)
  except:
    save([],{"url":theurl},'errors')
  else:
    d_raw=parse_table(table)
    d=[]

    for row_raw in d_raw:
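      # Header keys look like "From:City:Stop" or "To:City:Stop" (or ":Route/Trip"),
      # so split each key and sort its value into the matching from/to fields.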
      row_clean={}
      for key in row_raw:
        if key==":Route/Trip":
          row_clean['routeNum']=row_raw[key]
        else:
          foo,bar,baz=key.split(':')
          if foo=="From":
            row_clean['fromCity']=bar
            row_clean['fromStop']=baz
            row_clean['fromTime']=row_raw[key]
          elif foo=="To":
            row_clean['toCity']=bar
            row_clean['toStop']=baz
            row_clean['toTime']=row_raw[key]
      row_clean['routeId']=routeId

      if row_clean['toStop']=='megabus.com stop' and row_clean['fromStop']=='megabus.com stop':
        table_name='megabus'
      else:
        table_name='schedules'

      save([],row_clean,table_name)
    save_var('skip',get_var('skip')+1)
Example #13
def step1(prevpage, finalpage):
    for page in range(prevpage, finalpage + 1):
        try:
            theaters_in = get_theaters(page)
        except BadStatusLine:
            url = URLS["ct-base"] + str(page)
            save(['url'], {
                "url": url,
                "scrape_error": 'BadStatusLine'
            }, 'errors')
            continue

        theatres_out = []
        for theater in theaters_in:
            info = theater_info(theater)
            info = clean_info(info)
            theatres_out.append(info2dictRow(info, page))

        save(['url'], theatres_out, 'locations')
        sleep(INTERVAL)
        save_var('prevpage', page)
def get_route_schedules(routeId, route):
    #Check that it's not a route within one city
    assert route[0:2] != route[2:4]

    xml, theurl = grab(route)
    save(['routeId', 'url'], {"routeId": routeId, "url": theurl}, 'urls')

    try:
        table = get_table(xml)
    except:
        save([], {"url": theurl}, 'errors')
    else:
        d_raw = parse_table(table)
        d = []

        for row_raw in d_raw:
            row_clean = {}
            for key in row_raw:
                if key == ":Route/Trip":
                    row_clean['routeNum'] = row_raw[key]
                else:
                    foo, bar, baz = key.split(':')
                    if foo == "From":
                        row_clean['fromCity'] = bar
                        row_clean['fromStop'] = baz
                        row_clean['fromTime'] = row_raw[key]
                    elif foo == "To":
                        row_clean['toCity'] = bar
                        row_clean['toStop'] = baz
                        row_clean['toTime'] = row_raw[key]
            row_clean['routeId'] = routeId

            if row_clean['toStop'] == 'megabus.com stop' and row_clean[
                    'fromStop'] == 'megabus.com stop':
                table_name = 'megabus'
            else:
                table_name = 'schedules'

            save([], row_clean, table_name)
        save_var('skip', get_var('skip') + 1)
Example #15
def main():
  if None==get_var('DATE'):
    save_var('DATE',time())

  searchTerms=get_searchTerms()
  for searchTerm in searchTerms:
    d=paginate(searchTerm)
    for row in d:
      row['date_scraped']=get_var('DATE')
      row['searchTerm']=searchTerm

    save_var('previous_searchTerm',searchTerm)
    save(['date_scraped', 'Name'],d,'initial')

  save_var('previous_searchTerm',None)
  save_var('DATE',None)
def download(abridge=False):
  d=[]

  #Resume the saved provinces
  provinces=getprovinces()
  province=get_var('province')
  province=provinces[0] if province==None else loads(province)

  #Put the date in. This will get passed along, so this is the only time I add it.
  province['date_scraped']=get_var('DATE', int(time()))

  #Get the cities
  cities=getcities(province['provinceId'])

  for city in cities:
    #Pass along the province
    city.update(province)

    branches=getbranches_with_info(city['cityId'])
    for branch in branches:
      #print branch
      branch.update(city)
      d.append(branch)

    if abridge:
      break

  i=provinces.index(province)+1
  print provinces
  if i<len(provinces):
    save_var('province',dumps(provinces[i]))
    print('Finished with branches in %s' % province['provinceName'])
  else:
    save_var('province',None)
    print('Finished with all the downloading!')

  save([],d,'initial')
Example #17
            'date_scraped': DATE,
            'businessPremisesURL': url
        })
    save(['date_scraped', 'businessPremisesURL'], data, 'business_premises')
    randomsleep()

#execute('DELETE from cp1 where date_scraped != (select min(date_scraped) from cp1)')
#execute('DELETE from businessPremises where date_scraped != (select min(date_scraped) from cp1)')
#commit()

#execute('UPDATE swvariables SET value_blob = (select min(date_scraped) from cp1) where name = "DATE"')
#commit()

Test(download=False)
cp1()
save_var('crashed', 0)
execute('DELETE FROM swvariables WHERE name = "DATE"')
commit()

from scraperwiki.sqlite import save, select, execute, save_var, get_var, commit
from scraperwiki import swimport
from requests import session
from lxml.html import fromstring, tostring
import re
from time import time, sleep
keyify=swimport('keyify').keyify

URL="http://www.ncr.org.za/register_of_registrants/index.php"

#DEV=True
DEV=False

DATE = get_var('DATE', time())
Example #18
def grab(from_city,from_state,to_city,to_state):
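  # Fetch the schedule page for this city pair and flatten its timetable into
  # key/value rows tied to an origin-destination id (odId) and a schedule id (sId).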
  theurl=url(from_city,from_state,to_city,to_state)
  opener = build_opener(HTTPCookieProcessor())

  try:
    o=opener.open(theurl)
  except BadStatusLine:
    return None

  xml=fromstring(o.read())
  if not route_exists(xml):
    return None

  try:
    table=xml.xpath('//table[tr[@class="tableHilightHeader"]]')[0]
  except:
    save([],{
      "from_city":from_city
    , "from_stat":from_state
    , "to_city":to_city
    , "to_state":to_state
    },'errors')
    return None

  #cities=table.xpath('tr[position()=1]/td')
  schedules=table.xpath('tr[position()>2]')
  columns=get_columns(table)

  #Get the id
  odId=get_var('origin_destination_id')
  sId=get_var('schedule_id')
  if None==odId:
    odId=1
  if None==sId:
    sId=1

  #Initialize for the loop
  d=[]
  on_fromstops=True

  for schedule in schedules:
    times=schedule.xpath('td/child::node()[position()=1]')
    #times.pop()
    #times.append(schedule.xpath('td/text()')[-1])
    print zip(times,columns)
    #assert False
    for value,column in zip(times,columns):
      if "days"==column:
        row={"key":"days"}
      elif "arrow"==column:
        on_fromstops=False
        continue
      elif "Route/Trip"==column:
        row={"key":"route_code"}

      elif on_fromstops:
        row={
          "key":"fromstop"
        , "stop":column
        }
      elif not on_fromstops:
        row={
          "key":"tostop"
        , "stop":column
        }
      #End if statement
      row.update({
        "value":value
      , "sId":sId
      , "odId":odId
      })
      d.append(row)
    #End for loop
    sId+=1
  #End for loop

  #Save origin-destination information
  save(['id'],{
    "id":odId
  , "from_city":from_city
  , "from_stat":from_state
  , "to_city":to_city
  , "to_state":to_state
  },'origin_destinations')

  #Save schedule information
  save([],d,'schedules')

  odId+=1
  save_var('origin_destination_id',odId)
  save_var('schedule_id',sId)
      return __VARS[a]
  def save_var(a,b):
    __VARS[a]=b

  def options(*args,**kwargs):
    return [{"branchId":"174","branchName":"DUNNO"}]
else:
  options=swimport('options').options

URL="http://www.postbank.co.za/contact.aspx?ID=3"

def log(foo):
  print(foo)

if get_var('previous_branchId')==None:
  save_var('DATE',time())
  FIRST_RUN=True
else:
  FIRST_RUN=False

DATE=get_var('DATE')

def main():
  b=PostbankBrowser()
  branches=b.get_branch_list()
  if FIRST_RUN:
    save_branches(branches)

  for branchId in select_branchIds(branches):
    b.load_branch(branchId)
    d=b.get_branch_info()
Example #20
import json
import requests
import scraperwiki.sqlite as db
import time

begin = 1

counciltype = json.loads(requests.get('http://mapit.mysociety.org/areas/LBO').content)
for council, data1 in counciltype.items():
    if(db.get_var('id') == council and begin == 0):
        begin = 1
    if(begin == 1):
        print data1['name']
        db.save_var('id', council)
        children = json.loads(requests.get('http://mapit.mysociety.org/area/%s/children' % council).content)
        for id, data in children.items(): 
                #time.sleep(1)
                json.loads(requests.get('http://mapit.mysociety.org/area/%s' % id).content)
                if (data['type'] == 'LBW'):            
                    #time.sleep(0.1)
                    kml = requests.get('http://mapit.mysociety.org/area/%s.kml' % id).content
                    councildata = {'type': data['type'],
                                   'parent_name': data1['name'],
                                   'id': int(id),
                                   'name': data['name'],
                                   'kml': kml[85:-7]}
                    db.save(['id'], councildata, verbose=0)

import json
import requests
import scraperwiki.sqlite as db
import time
from scraperwiki.sqlite import save_var,execute,commit,show_tables
import os

if "swvariables" in show_tables():
  execute("DROP TABLE swvariables;")

before=set(os.listdir('.'))
save_var('foo','bar')
#os.system('rm *.pyc')
after=set(os.listdir('.'))

#print before-after
#print after
s=[]
for f in after:
  if f[0:4]!='data' and f[-3:]!='pyc':
    s.append(f)

print s

baz=[]
baz.append('script.rb')
baz.append('.cache')
for f in baz:
  print open(f).read()
from scraperwiki.sqlite import save_var,execute,commit,show_tables
import os

if "swvariables" in show_tables():
  execute("DROP TABLE swvariables;")
from scraperwiki.sqlite import save_var, execute, commit, show_tables
import os

if "swvariables" in show_tables():
    execute("DROP TABLE swvariables;")

before = set(os.listdir('.'))
save_var('foo', 'bar')
#os.system('rm *.pyc')
after = set(os.listdir('.'))

#print before-after
#print after
s = []
for f in after:
    if f[0:4] != 'data' and f[-3:] != 'pyc':
        s.append(f)

print s

baz = []
baz.append('script.rb')
baz.append('.cache')
for f in baz:
    print open(f).read()
from scraperwiki.sqlite import save_var, execute, commit, show_tables
import os

if "swvariables" in show_tables():
    execute("DROP TABLE swvariables;")
      link = row.find('td/a')
      if link is None or not 'ShipDetails' in link.get('href'):
          continue
      number, name, type, dwt, built, flag, _ = map(lambda c: c.text, row)
      
      d.append({
        'number': number,
        'name': name, 
        'type': type,
        'dwt': dwt,
        'built': built,
        'flag': flag
      })
    save(['number'], d)
    i+=1
    save_var('page',i)

from scraperwiki.sqlite import save, get_var, save_var
from lxml import html

URL = "http://www.e-ships.net/new/?View=ShipSearchResult"
URL += "&ship_name=&fdwt=&tdwt=&last_ex_name=&fgt=&tgt=&imo=&fnrt=&tnrt=&ship_type=-1&fteu=&tteu=&"
URL += "flag=-1&floa=&tloa=&ship_class=-1&fbeam=&tbeam=&call_sign=&fdraft=&tdraft=&owner_id="
URL += "&fbuilt=&tbuilt=&manager_id=&fengine_kw_total=&tengine_kw_total=&builder_id=&fengine_hp_total="
URL += "&tengine_hp_total=&sortby=ship_name&p=%s"

i=get_var('page')
if i==None:
  i=0
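# Page through the ship search results until a page with no data rows,
# saving progress in the 'page' variable after each page.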
while i<=1174:
    doc = html.parse(URL % i).getroot()
    rows = doc.xpath('//tr')
    if len(rows) == 1:
def save_state(js,level):
  """Save the second-to-next directory to be searched."""
  save_var(str(level),js)
    return branch

def parse_maphref(maphref):
    html=maphref.split("'")[1].replace('<br>','')
    x=fromstring(html)
    keys=["map_%s" % keyify(key) for key in x.xpath('strong/text()')]
    values=x.xpath('text()')
    return dict(zip(keys,values))

execute('CREATE TABLE IF NOT EXISTS provinces (provinceUrl TEXT )')
execute('CREATE TABLE IF NOT EXISTS cities (provinceUrl TEXT, cityUrl TEXT, FOREIGN KEY(provinceUrl) REFERENCES provinces(provinceUrl) )')
execute('CREATE TABLE IF NOT EXISTS branches (cityUrl TEXT, branchUrl TEXT, FOREIGN KEY(cityUrl) REFERENCES cities(cityUrl) )')
commit()

scraperrun = get_var('scraperrun', int(time()))
save_var('scraperrun', scraperrun)
seed([Menu(URLS['main'])])
execute('delete from swvariables where name = "scraperrun"')
commit()

from lxml.html import fromstring
#from lxml.etree import fromstring
from time import time
import requests
from scraperwiki.sqlite import save,save_var, get_var, select, commit, execute
from scraperwiki import swimport
options=swimport('options').options
keyify=swimport('keyify').keyify
randomsleep=swimport('randomsleep').randomsleep
from json import loads,dumps
strip_address = swimport('strip_address').strip_address

# --------------------------------------------------
Example #26
            else:
                row['enter_manually'] = 1

            row.update({'date_scraped': time(), 'ScraperRun': scraper_run, 'url': self.url, 'Record': int(self.url.split('=')[-1])})
            data.append(row)

        save([], data, 'BusinessPremises')

execute('CREATE TABLE IF NOT EXISTS Registrant (ScraperRun INTEGER, Record INTEGER)')
execute('CREATE INDEX IF NOT EXISTS RegistrantRecord ON Registrant(record)')
execute('CREATE TABLE IF NOT EXISTS BusinessPremises (ScraperRun INTEGER, Record INTEGER, FOREIGN KEY(Record) REFERENCES Registrant(Record))')
execute('CREATE INDEX IF NOT EXISTS BusinessPremisesRecord ON BusinessPremises(ScraperRun, Record)')
commit()

if "stack" not in show_tables() or select('count(*) as "c" from stack')[0]['c'] == 0:
    save_var('scraper_run', int(time()))

scraper_run = get_var('scraper_run', None)
if scraper_run == None:
    raise NameError('scraper_run is not defined.') 

seed([SearchResults(None)])
#seed([BusinessPremises('http://www.ncr.org.za/register_of_registrants/viewpremises.php?record=11296')])

from scraperwiki.sqlite import save, select, execute, save_var, get_var, commit, show_tables
from scraperwiki import swimport
#from requests import session
import requests
from lxml.html import fromstring, tostring
import re
from time import time, sleep
keyify=swimport('keyify').keyify
randomsleep=swimport('randomsleep').randomsleep
Example #27
URL += "&fbuilt=&tbuilt=&manager_id=&fengine_kw_total=&tengine_kw_total=&builder_id=&fengine_hp_total="
URL += "&tengine_hp_total=&sortby=ship_name&p=%s"

i=get_var('page')
if i==None:
  i=0
while i<=1174:
    doc = html.parse(URL % i).getroot()
    rows = doc.xpath('//tr')
    if len(rows) == 1:
        break

    d=[]
    for row in rows:
      link = row.find('td/a')
      if link is None or not 'ShipDetails' in link.get('href'):
          continue
      number, name, type, dwt, built, flag, _ = map(lambda c: c.text, row)
      
      d.append({
        'number': number,
        'name': name, 
        'type': type,
        'dwt': dwt,
        'built': built,
        'flag': flag
      })
    save(['number'], d)
    i+=1
    save_var('page',i)
Example #28
def save_state(js, level):
    """Save the second-to-next directory to be searched."""
    save_var(str(level), js)
Example #29
'''
I figure out how different types are saved to the database.
'''
import datetime
from scraperwiki.sqlite import save, save_var, execute, commit

execute('drop table if exists swdata')
execute('drop table if exists swvariables')
execute('drop table if exists complex')
commit()

for t in {dict, list, set, str, unicode, bool, int, float, long}:
    save_var(str(t), t())
    save([], {str(t).replace("'","").replace(' ', '').replace('<type', '').replace('>', ''): t()})

save([], {
    u'list': [u'thing'],
    u'dict': {u'key': u'value'},
    u'set': {u'thing'},
}, 'complex')
def process_response(response):
    print response.url
    doc = prepare_doc(response.text)
    gather_items_from(doc)
    db.save_var(response.url, 1, verbose=0)
Example #31
import json
import requests
import scraperwiki.sqlite as db
import time

begin = 0

counciltype = json.loads(requests.get("http://mapit.mysociety.org/areas/DIS").content)
time.sleep(1)
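# begin stays 0 until the council id saved by the previous run is reached,
# so the scrape resumes from there; district wards (type DIW) are saved with their KML boundaries.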
for council, data1 in counciltype.items():
    print data1["name"]
    if db.get_var("id") == council and begin == 0:
        begin = 1
    if begin == 1:
        db.save_var("id", council)
        children = json.loads(requests.get("http://mapit.mysociety.org/area/%s/children" % council).content)
        time.sleep(1)
        for id, data in children.items():
            json.loads(requests.get("http://mapit.mysociety.org/area/%s" % id).content)
            time.sleep(1)
            if data["type"] == "DIW":
                kml = requests.get("http://mapit.mysociety.org/area/%s.kml" % id).content
                time.sleep(1)
                councildata = {
                    "type": data["type"],
                    "parent_name": data1["name"],
                    "id": int(id),
                    "name": data["name"],
                    "kml": kml[85:-7],
                }
                db.save(["id"], councildata, verbose=0)
Example #32
URL += "&fbuilt=&tbuilt=&manager_id=&fengine_kw_total=&tengine_kw_total=&builder_id=&fengine_hp_total="
URL += "&tengine_hp_total=&sortby=ship_name&p=%s"

i = get_var('page')
if i == None:
    i = 0
while i <= 1174:
    doc = html.parse(URL % i).getroot()
    rows = doc.xpath('//tr')
    if len(rows) == 1:
        break

    d = []
    for row in rows:
        link = row.find('td/a')
        if link is None or not 'ShipDetails' in link.get('href'):
            continue
        number, name, type, dwt, built, flag, _ = map(lambda c: c.text, row)

        d.append({
            'number': number,
            'name': name,
            'type': type,
            'dwt': dwt,
            'built': built,
            'flag': flag
        })
    save(['number'], d)
    i += 1
    save_var('page', i)