Example #1
def main():
  """Check what has been scraped so far, then resume.
  It might be good to check for gaps in the scraping.
  Or maybe a recursive approach isn't the best for
  search pages like this."""

  #What's already been scraped recently?
  if not 'directory' in show_tables():
    last_searched=0
  else:
    #Only skip things from the current scraper completion attempt.
    if 'scrape_completions' in show_tables():
      raw_ids=select('scrape_ids from scrape_completions order by completion_id desc limit 1')[0]['scrape_ids']
      max_to_ignore=max(map(int,raw_ids.split(',')))
      min_to_scrape=max_to_ignore+1
    else:
      min_to_scrape=1
    incomplete_scrape=select('max("search_id") as m from directory where scrape_id>='+str(min_to_scrape))[0]['m']
    if incomplete_scrape!=None:
      last_searched=incomplete_scrape
    else:
      last_searched=0

  if 'scrape_times' in show_tables():
    last_id=select('max("scrape_id") as m from scrape_times')[0]['m']
  else:
    last_id=0

  #Time of scrape start
  scrape_id=last_id+1
  save(['scrape_id'],{"scrape_id":scrape_id,"scrape_time":time()},'scrape_times')
  grab(last_searched+1,{"scrape_id":scrape_id},oncompletion=oncompletion)
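
The pattern above (guard every query with a show_tables() check and treat a missing table as nothing having been scraped yet) recurs throughout these examples. Below is a minimal sketch of that guard, assuming the classic scraperwiki.sqlite API used here; last_scraped_id and its arguments are illustrative names, not part of the original scraper.

from scraperwiki.sqlite import show_tables, select

def last_scraped_id(table_name='directory', id_column='search_id'):
  #A missing table means nothing has been scraped yet.
  if table_name not in show_tables():
    return 0
  #max() returns NULL (None) on an empty table, so fall back to 0.
  highest = select('max(`%s`) as m from `%s`' % (id_column, table_name))[0]['m']
  return highest if highest is not None else 0
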
def main():
  if 'splitnames' in show_tables():
    print "Already finished"
  elif 'lobbyists' in show_tables():
    parsenames()
  else:
    download()
    parsenames()
def main():
  #What has already been scraped
  if 'contributions' in show_tables():
    scraped=[row['querystring'] for row in select('querystring from contributions')]
  else:
    scraped=[]

  pagenumber=0
  while True:
    pagenumber=pagenumber+1
    xml=load(pagenumber)

    #Get the header row
    rows=xml.xpath('//table[@class="table_text"][tr[@class="tan_row"]]')[0].getchildren()[1:]
    keys=['name','contestant_party_district','date_received','class_and_partnum','association','monetary','non-monetary']

    #Get the data rows
    ds=[]
    for row in rows:
      #Use a fresh dict per row; a single shared dict would make every saved row identical
      d={}
      cells=row.getchildren()
      contributor=cells.pop(0).getchildren()[0]

      d['querystring']=contributor.attrib['href'].replace("javascript:PopUp('contributor.aspx?",'').replace("', '300', '300');",'')
      d[keys[0]]=contributor.text
      for i in range(1,len(cells)):
        d[keys[i]]=cells[i].text
      ds.append(d)

    #Don't run again if already run
    if ds[0]['querystring'] in scraped:
      break
    else:
      save(['querystring'],ds,'contributions')
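
The loop stops as soon as it meets a querystring that was already saved. Here is a sketch of that stop condition factored into a helper, under the same scraperwiki.sqlite assumptions; already_scraped is an illustrative name.

from scraperwiki.sqlite import show_tables, select

def already_scraped(querystring, table_name='contributions'):
  #A missing table means this is the first run; nothing is scraped yet.
  if table_name not in show_tables():
    return False
  rows = select('count(*) as c from `%s` where querystring=?' % table_name, [querystring])
  return rows[0]['c'] > 0
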
Example #4
def main():
    #finalpage=get_var('finalpage')
    prevpage = get_var('prevpage')

    #if None==finalpage:
    if True:
        finalpage = int(get_lastpage(getpage(1)))
        save_var('finalpage', finalpage)
    if None == prevpage:
        prevpage = 1

    if prevpage < finalpage:
        step1(prevpage, finalpage)
    elif prevpage == finalpage:
        if not "step2completion" in show_tables():
            execute(
                'create table `step2completion` (`url` text, `browsed` boolean)'
            )
            execute("""
        INSERT INTO `step2completion`
        ( url , browsed )
        SELECT url, 0 as "browsed"
        FROM locations
        """)
            commit()
        step2()
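
Once step2completion exists, step2() presumably works through the rows still flagged browsed = 0. The following is a hedged sketch of such a consumption loop; drain_step2 and browse_url are hypothetical stand-ins, not functions from the original scraper.

from scraperwiki.sqlite import select, execute, commit

def drain_step2(browse_url):
    #Work through unbrowsed URLs one at a time so an interrupted run can resume.
    while True:
        rows = select('url from `step2completion` where browsed=0 limit 1')
        if not rows:
            break
        url = rows[0]['url']
        browse_url(url)  #hypothetical per-URL work
        execute('update `step2completion` set browsed=1 where url=?', [url])
        commit()
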
Example #5
def get_page(url,table_name="pages"):
  if not table_name in show_tables():
    raise PageNotSavedError(url)
  else:
    rows=select("`text` from %s where url=?" % table_name,[url])
    l=len(rows)
    if l==0:
      raise PageNotSavedError(url)
    elif l>1:
      raise DatastoreError(url,"Multiple rows match this url.")
    elif l==1:
      if not 'text' in rows[0].keys():
        raise DatastoreError(url,"The database does not have a `text` column.")
      else:
        return rows[0]['text']


#Tests

#import unittest
#class TestGetPage(unittest.TestCase):
#  def test_good_page(self):
#    url="https://scraperwiki.com/scrapers/dbgetpy/"
#    get_page(url)
#    row=select('* from `pages` where url=?',[url])[0]
#    self.assertEqual(set(row.keys()),set(["url","text"]))
#    self.assertIn("dbget=swimport('dbgetpy')",row['text'])

#if __name__ == '__main__':
#  print "Running tests"
#  unittest.main()
#else:
#  import os
#  print "Running from bash"
#  print os.execvp("python",["script.py"])
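
A hedged usage sketch for get_page: fall back to fetching and caching when the page has not been saved yet. PageNotSavedError and get_page come from the example above; the urllib2 fetch and the get_page_or_fetch name are assumptions.

from scraperwiki.sqlite import save
import urllib2

def get_page_or_fetch(url, table_name='pages'):
  #Use the cached copy when present; otherwise fetch the page and cache it.
  try:
    return get_page(url, table_name)
  except PageNotSavedError:
    text = urllib2.urlopen(url).read()
    save(['url'], {'url': url, 'text': text}, table_name)
    return text
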
Example #6
def swversion(table_name='swdata'):
    if table_name in show_tables():
        timestamp = select("max(date_extracted) as m from %s;" %
                           table_name)[0]['m']
        execute("ALTER TABLE `%s` RENAME TO `%s_%d`;" %
                (table_name, table_name, timestamp))
        commit()
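
A hedged usage sketch: archive whatever an earlier run left in swdata, then save fresh rows that carry the date_extracted column the rename relies on. The call order and the sample row are illustrative only.

from scraperwiki.sqlite import save
from time import time

#Snapshot the previous run's table (if any), then start a fresh swdata.
swversion('swdata')
save([], {'date_extracted': int(time()), 'value': 42}, 'swdata')
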
Example #7
def nextid():
    defaultquery = [{"id": 0}]
    if not OBS in show_tables():
        idquery = defaultquery
    else:
        idquery = select('max(id) as id from %s' % OBS)
        if len(idquery) == 0:
            idquery = defaultquery
    id = idquery[0]['id']
    return id
def scrape(url,table_name="swdata", how_many = 10000):
  listurl=attendeelisturl(url)
  d=getattendeelist(listurl)
  d = getattendeelist(listurl + '&show_more=%d&sortid=0' % how_many)

  if table_name in show_tables():
    scraped_so_far=select('count(*) as "c" from `%s`'%table_name)[0]['c']
    saveattendeelist(d[0:-scraped_so_far],table_name)
  else:
    saveattendeelist(d,table_name)
def nextid():
  defaultquery=[{"id":0}]
  if not OBS in show_tables():
    idquery=defaultquery
  else:
    idquery=select('max(id) as id from %s' % OBS)
    if len(idquery)==0:
      idquery=defaultquery
  id=idquery[0]['id']
  return id
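
Note that max(id) on a table that exists but holds no rows returns one row whose id is None, which the len() check above never catches. Here is a None-safe sketch under the same assumptions (a single id column and the scraperwiki.sqlite API). Like nextid above, it returns the highest existing id and leaves the increment to the caller.

from scraperwiki.sqlite import show_tables, select

def nextid_safe(table_name):
  #Return the highest saved id, or 0 when the table is missing or empty.
  if table_name not in show_tables():
    return 0
  highest = select('max(id) as id from `%s`' % table_name)[0]['id']
  return highest if highest is not None else 0
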
Example #10
def geocode():
    if "scraped" not in show_tables():
        d = swimport('csv2sw').read.csv('https://views.scraperwiki.com/run/combine_mix_scraper_spreadsheets/')
        save([], d, 'scraped')

    if "address" not in show_tables():
        initialize()

    while select('count(*) AS "c" FROM `address` WHERE `finished` = 0')[0]['c'] > 0:
        address = select("`address-column`, `address-input` FROM `address` WHERE `finished` = 0 LIMIT 1")[0]

        #print address
        if select('count(*) AS "c" FROM `geocode` WHERE `address-input` = ?', [address['address-input']])[0]['c'] == 0:
            d = all_services(address['address-input'])
            for row in d:
                row['address-input'] = address['address-input']
            save([], d, 'geocode')
        params = (address['address-column'], address['address-input'])
        execute("UPDATE `address` SET `finished` = 1 WHERE (`address-column` = ? AND `address-input` = ?)", params )
        commit()
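
The while loop above is a simple resumable work queue: rows with finished = 0 are pending, and each is flagged only after its geocode results are saved. Below is a generic sketch of that claim-and-finish pattern; drain, work_table and process are illustrative names, not part of the original scraper.

from scraperwiki.sqlite import select, execute, commit

def drain(work_table, process):
    #Claim one unfinished row at a time, do the work, then mark it finished.
    while True:
        rows = select('rowid, * from `%s` where finished=0 limit 1' % work_table)
        if not rows:
            break
        row = rows[0]
        process(row)  #hypothetical per-row work
        execute('update `%s` set finished=1 where rowid=?' % work_table, [row['rowid']])
        commit()
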
Example #11
def scrape(url, table_name="swdata", how_many=10000):
    listurl = attendeelisturl(url)
    d = getattendeelist(listurl)
    d = getattendeelist(listurl + '&show_more=%d&sortid=0' % how_many)

    if table_name in show_tables():
        scraped_so_far = select('count(*) as "c" from `%s`' %
                                table_name)[0]['c']
        saveattendeelist(d[0:-scraped_so_far], table_name)
    else:
        saveattendeelist(d, table_name)
def geocode():
    if "scraped" not in show_tables():
        d = swimport('csv2sw').read.csv('https://views.scraperwiki.com/run/combine_mix_scraper_spreadsheets/')
        save([], d, 'scraped')

    if "address" not in show_tables():
        initialize()

    while select('count(*) AS "c" FROM `address` WHERE `finished` = 0')[0]['c'] > 0:
        address = select("`address-column`, `address-input` FROM `address` WHERE `finished` = 0 LIMIT 1")[0]

        #print address
        if select('count(*) AS "c" FROM `geocode` WHERE `address-input` = ?', [address['address-input']])[0]['c'] == 0:
            d = all_services(address['address-input'])
            for row in d:
                row['address-input'] = address['address-input']
            save([], d, 'geocode')
        params = (address['address-column'], address['address-input'])
        execute("UPDATE `address` SET `finished` = 1 WHERE (`address-column` = ? AND `address-input` = ?)", params )
        commit()
def check_identical_screenshot(image_base64):
  """Check whether there's an identical screenshot already saved"""

  #If,else to handle new tables
  if 'images' in show_tables():
    identical_screenshot=select('screenshot_id from images where image="'+image_base64+'" limit 1')
  else:
    identical_screenshot=[]

  if len(identical_screenshot)==0:
    #No identical screenshot
    if 'images' in show_tables():
      screenshot_id=select('max(screenshot_id) as id from images')[0]['id']+1
    else:
      screenshot_id=1
    return (False,{
      "screenshot_id":screenshot_id
    , "image":image_base64
    })
  elif len(identical_screenshot)==1:
    return (True,identical_screenshot[0])
def check_identical_screenshot(image_base64):
    """Check whether there's an identical screenshot already saved"""

    #If,else to handle new tables
    if 'images' in show_tables():
        identical_screenshot = select(
            'screenshot_id from images where image="' + image_base64 +
            '" limit 1')
    else:
        identical_screenshot = []

    if len(identical_screenshot) == 0:
        #No identical screenshot
        if 'images' in show_tables():
            screenshot_id = select(
                'max(screenshot_id) as id from images')[0]['id'] + 1
        else:
            screenshot_id = 1
        return (False, {"screenshot_id": screenshot_id, "image": image_base64})
    elif len(identical_screenshot) == 1:
        return (True, identical_screenshot[0])
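
Matching on the full base64 string means each lookup above ships the whole image into the query. A hedged variant that compares a SHA-1 digest instead follows; the image_sha1 column is an assumption and would have to be saved alongside each image for this to work.

import hashlib
from scraperwiki.sqlite import show_tables, select

def find_screenshot_by_hash(image_base64):
    #Compare a short digest rather than the full base64 payload.
    if 'images' not in show_tables():
        return None
    digest = hashlib.sha1(image_base64).hexdigest()
    rows = select('screenshot_id from images where image_sha1=? limit 1', [digest])
    return rows[0]['screenshot_id'] if rows else None
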
def geocode():
    if "scraped" not in show_tables():
        d = swimport('csv2sw').read.csv('http://hacks.thomaslevine.com/all.csv')
        save([], d, 'scraped')
        execute('DELETE FROM `scraped` WHERE `Country` != "South Africa"')
        commit()

    if "address" not in show_tables():
        initialize()

    while select('count(*) AS "c" FROM `address` WHERE `finished` = 0')[0]['c'] > 0:
        address = select("`address-column`, `address-input` FROM `address` WHERE `finished` = 0 LIMIT 1")[0]

        #print address
        if select('count(*) AS "c" FROM `geocode` WHERE `address-input` = ?', [address['address-input']])[0]['c'] == 0:
            d = all_services(address['address-input'])
            for row in d:
                row['address-input'] = address['address-input']
            save([], d, 'geocode')
        params = (address['address-column'], address['address-input'])
        execute("UPDATE `address` SET `finished` = 1 WHERE (`address-column` = ? AND `address-input` = ?)", params )
        commit()
def go(number=1,pagetype="SCRAPERS"):
  foo=scrapepage(number,pagetype)
  is_end=('scraper_urls' in show_tables()) and (foo['lasturl'] in [row['url'] for row in select('url from scraper_urls')])
  #Save after checking whether it's the end because that's how I check.
  save(['url'],foo['scraper_urls'],'scraper_urls')

  if foo['lastpage']:
    #End when we reach the last page
    print "I scraped all the scrapers!"
  elif is_end:
    #End when we reach page where a scraper has already been scraped
    print "I scraped all of the new scrapers!"
  else:
    go(number+1,pagetype)
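
go() recurses once per listing page, which can hit Python's recursion limit on a long backlog. Below is a hedged iterative sketch with the same stop conditions; scrapepage comes from the original scraper, and go_iterative is an illustrative name.

from scraperwiki.sqlite import show_tables, select, save

def go_iterative(number=1, pagetype="SCRAPERS"):
  #Same stop conditions as go(), expressed as a loop instead of recursion.
  while True:
    foo = scrapepage(number, pagetype)
    already_seen = ('scraper_urls' in show_tables()) and \
      (foo['lasturl'] in [row['url'] for row in select('url from scraper_urls')])
    save(['url'], foo['scraper_urls'], 'scraper_urls')
    if foo['lastpage']:
      #End when we reach the last page
      print "I scraped all the scrapers!"
      break
    elif already_seen:
      #End when we reach a page whose scrapers were already saved
      print "I scraped all of the new scrapers!"
      break
    number += 1
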
def atomic():
  if "client"==pagetype(get_var('previous_href')):
    table_names=CLIENT_TABLES
  elif "lobbyist"==pagetype(get_var('previous_href')):
    table_names=LOBBYIST_TABLES
  else:
    raise ResumeError('The type of the previous href, "%s", could not be determined.' % get_var('previous_href'))

  if "clients_urls" in show_tables():
    sourceUrl=select('distinct sourceUrl as "s" from `clients_urls` where jobId=(select max(jobId) from `clients_urls`)')[0]['s']
    for table_name in table_names:
      execute('DELETE FROM `%s` where jobId in (select jobId from clients_urls where sourceUrl="%s")' % (table_name,sourceUrl))
    commit()
    return sourceUrl
Example #18
def main():
  if not 'cities_done' in show_tables():
    cities_done=[]
  else:
    cities_done=select('* from cities_done')

  for fromcity in CITIES_NY:
    for tocity in CITIES_NY:
      if fromcity==tocity:
        print 'Skipping within-%s route' % fromcity
      elif {"from":fromcity,"to":tocity} in cities_done:
        print 'Already scraped %s to %s' % (fromcity,tocity)
      else:
        grab(fromcity,"NY",tocity,"NY")
        save([],{"from":fromcity,"to":tocity},'cities_done')
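
Here is a hedged sketch of the same skip check built on (from, to) tuples, which avoids comparing whole row dicts if cities_done ever grows extra columns; pairs_done is an illustrative name. The loop above could then test (fromcity, tocity) in pairs_done() instead of building a list of row dicts.

from scraperwiki.sqlite import show_tables, select

def pairs_done(table_name='cities_done'):
  #Collect (from, to) tuples for quick membership tests.
  if table_name not in show_tables():
    return set()
  return set((row['from'], row['to']) for row in select('`from`, `to` from `%s`' % table_name))
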
Example #19
def get_page(url,table_name="pages"):
  if not table_name in show_tables():
    raise PageNotSavedError(url)
  else:
    rows=select("`text` from %s where url=?" % table_name,[url])
    l=len(rows)
    if l==0:
      raise PageNotSavedError(url)
    elif l>1:
      raise DatastoreError(url,"Multiple rows match this url.")
    elif l==1:
      if not 'text' in rows[0].keys():
        raise DatastoreError(url,"The database does not have a `text` column.")
      else:
        return rows[0]['text']
def get_scraper_state():
  all_views=[row['value'] for row in select('value FROM views ORDER BY value', verbose=False)]
  if 'links' not in show_tables():
    years_to_do=[row['value'] for row in select('value FROM years ORDER BY value', verbose=False)]
    remaining_views_this_year=all_views
  else:
    finished=select('max(view) as "view",year from links where year=(select max(year) from links)', verbose=False)
    years_to_do=[row['value'] for row in select('value FROM years WHERE value>"%s" ORDER BY value' % finished[0]['year'], verbose=False)]
    remaining_views_this_year=[row['value'] for row in select('value from views where value>"%s"' % finished[0]['view'], verbose=False)]
    del(finished)
  return {
    "all-views":all_views
  , "years-to-do":years_to_do
  , "remaining-views-this-year":remaining_views_this_year
  }
def moreparsing_map():
  "Map along the most recent results in the table (like a Couch map) and return a new one"
  d=select("* FROM `swdata` WHERE date_scraped=(SELECT max(date_scraped) from `swdata`);")
  for row in d:
    row['street-address'],row['postal-code']=splitAddress(row['Address_'])
    row['town']=extractTown(row['branchName'])
  if 'final' in show_tables():
    execute('DROP TABLE `final`;')

  d_final = []
  for row in d:
    if row['regionName'] not in ["Botswana", "Malawi", "Nambia"]:
      d_final.append(row)

  save([],d_final,'final')
Example #22
def geocode():
    if "address" not in show_tables():
        initialize()

    while select('count(*) AS "c" FROM `address` WHERE `finished` = 0')[0]['c'] > 0:
        address = select("`address-column`, `address-input` FROM `address` WHERE `finished` = 0 LIMIT 1")[0]
        #print address
        if select('count(*) AS "c" FROM `geocode` WHERE `address-input` = ?', [address['address-input']])[0]['c'] == 0:
            d = all_services(address['address-input'])
            for row in d:
                row['address-input'] = address['address-input']
            save([], d, 'geocode')
        params = (address['address-column'], address['address-input'])
        execute("UPDATE `address` SET `finished` = 1 WHERE (`address-column` = ? AND `address-input` = ?)", params )
        commit()
def parse(url, xml=None, suffix=''):
    if xml == None:
        xml = pull(url)
    print "Loading the page"
    scrapers = xml.xpath(PATH)
    for scraper in scrapers:
        if 'observations' in show_tables():
            observation_id = select(
                'max(observation_id) as id from observations')[0]['id'] + 1
        else:
            observation_id = 1
        identifiers = {"observation_id": observation_id}
        info = copy(identifiers)
        screenshot_identity = copy(identifiers)

        identifiers['time_scraped'] = time()
        identifiers['url'] = scraper.xpath('a')[0].attrib['href']

        print "Extracting metadata"
        info['owner'], info['title'] = scraper.xpath('a/h4')[0].text.split(
            '/', 1)
        info['language'], info['type'] = re.split(
            r'[^a-zA-Z]+',
            scraper.xpath('a/span[@class="about"]')[0].text)
        info['created'] = scraper.xpath('a/span[@class="when"]')[0].text

        screenshot_identity['url'] = scraper.xpath('a/img')[0].attrib['src']
        print "Checking whether I've already saved the screenshot"
        exists, image = check_identical_screenshot(
            getimage(screenshot_identity['url']))
        if exists:
            #If I have, don't do anything with the image
            print "Screenshot already saved"
        else:
            #If I haven't, save a new image
            print "Saving the new screenshot"
            image['observation_scraped_on'] = observation_id
            save(['observation_scraped_on', 'screenshot_id'], image, 'images')

        #Either way, link the observation to the saved image
        screenshot_identity['screenshot_id'] = image['screenshot_id']
        save(['observation_id'], screenshot_identity, 'screenshot_identidies')

        #Save these at the end to avoid partial rows
        print "Saving"
        save(['observation_id'], info, 'homepage_metadata')
        save(['observation_id'], identifiers, 'observations')
Example #24
def oncompletion():
  scrape_ids=[str(row['scrape_id']) for row in select('scrape_id from scrape_times')]
  if 'scrape_completions' in show_tables():
    #Increment id
    completion_id=1+select('max("completion_id") as m from scrape_completions')[0]['m']
    #Remove old scrape_ids
    completion_rows=[row['scrape_ids'] for row in select('scrape_ids from scrape_completions')]
    old_scrapes=(','.join(completion_rows)).split(',')
    for old_scrape in old_scrapes:
      scrape_ids.remove(old_scrape)
  else:
    completion_id=1
  d={
    "completion_id":completion_id
  , "scrape_ids":','.join(scrape_ids)
  }
  save(['completion_id'],d,'scrape_completions')
  def _parse_and_save(self,SpecificDataRow,maintable):
    "Clean up stuff"

    #Skip the raw parse
    #job_raw=self.rawparse()
    #for row in job_raw:
    #  row['url']=self.url
    #save([],job_raw,maintable+'raw',verbose=False)

    for tr in self.getTableRows():
      #Get the next jobId
      if maintable in show_tables():
        jobId=select('max(jobId) as "jobId" from `%s`' % maintable, verbose=False)[0]['jobId']+1
      else:
        jobId=1

      r=SpecificDataRow(tr,jobId,self.url)
      r.parse_and_save()
def main():
    #What has already been scraped
    if 'contributions' in show_tables():
        scraped = [
            row['querystring']
            for row in select('querystring from contributions')
        ]
    else:
        scraped = []

    pagenumber = 0
    while True:
        pagenumber = pagenumber + 1
        xml = load(pagenumber)

        #Get the header row
        rows = xml.xpath('//table[@class="table_text"][tr[@class="tan_row"]]'
                         )[0].getchildren()[1:]
        keys = [
            'name', 'contestant_party_district', 'date_received',
            'class_and_partnum', 'association', 'monetary', 'non-monetary'
        ]

        #Get the data rows
        ds = []
        for row in rows:
            #Use a fresh dict per row; a single shared dict would make every saved row identical
            d = {}
            cells = row.getchildren()
            contributor = cells.pop(0).getchildren()[0]

            d['querystring'] = contributor.attrib['href'].replace(
                "javascript:PopUp('contributor.aspx?",
                '').replace("', '300', '300');", '')
            d[keys[0]] = contributor.text
            for i in range(1, len(cells)):
                d[keys[i]] = cells[i].text
            ds.append(d)

        #Don't run again if already run
        if ds[0]['querystring'] in scraped:
            break
        else:
            save(['querystring'], ds, 'contributions')
def main():
    if "urls" not in show_tables():
        copyUrlsDb()
    for url in getUrls():
        slug = getScraperSlug(url)
        code, user = getCode(slug)
        if code != None:
            c = code.lower()
            save(['url'], {
                "code":code, "user": user, "url": url,
                "has_join": " join " in c,
                "has_attach": "attach" in c,
                "has_twitter": "twitter" in c,
            })
        execute('UPDATE `urls` SET `scraped`=1 WHERE `url` = ?', [url])
        commit()

    d = select('`user`, count(*) AS "attach-and-join-count" from `swdata` WHERE (`has_join` = 1 and `has_attach` = 1) GROUP BY `user`')
    save(['user'], d, 'results')
def parse(url,xml=None,suffix=''):
  if xml==None:
    xml=pull(url)
  print "Loading the page"
  scrapers=xml.xpath(PATH)
  for scraper in scrapers:
    if 'observations' in show_tables():
      observation_id=select('max(observation_id) as id from observations')[0]['id']+1
    else:
      observation_id=1
    identifiers={"observation_id":observation_id}
    info=copy(identifiers)
    screenshot_identity=copy(identifiers)

    identifiers['time_scraped']=time()
    identifiers['url']=scraper.xpath('a')[0].attrib['href']

    print "Extracting metadata"
    info['owner'],info['title']=scraper.xpath('a/h4')[0].text.split('/',1)
    info['language'],info['type']=re.split(r'[^a-zA-Z]+',scraper.xpath('a/span[@class="about"]')[0].text)
    info['created']=scraper.xpath('a/span[@class="when"]')[0].text

    screenshot_identity['url']=scraper.xpath('a/img')[0].attrib['src']
    print "Checking whether I've already saved the screenshot"
    exists,image=check_identical_screenshot(getimage(screenshot_identity['url']))
    if exists:
      #If I have, don't do anything with the image
      print "Screenshot already saved"
    else:
      #If I haven't, save a new image
      print "Saving the new screenshot"
      image['observation_scraped_on']=observation_id
      save(['observation_scraped_on','screenshot_id'],image,'images')

    #Either way, link the observation to the saved image
    screenshot_identity['screenshot_id']=image['screenshot_id']
    save(['observation_id'],screenshot_identity,'screenshot_identidies')

    #Save these at the end to avoid partial rows
    print "Saving"
    save(['observation_id'],info,'homepage_metadata')
    save(['observation_id'],identifiers,'observations')
def resume_siblings(js,level):
  if level==1:
    print "Finished resuming"
  elif not OBS in show_tables():
    pass
  else:
    parent=select('parentjs from %s order by date_scraped desc limit 1' % OBS)[0]['parentjs']
    foo,bar,baz=(eval(parent.replace('getlaw','')))
    xml=fromstring(getlaw(foo,bar,baz))
    links=get_law_links(xml,parent)
    linkslist=[link['observation']['js'] for link in links]
    if not js in linkslist:
      #It looks like the last sibling scraped was the last child of its parent;
      #None of its siblings need to be scraped
      pass
    else:
      first=linkslist.index(js)+1
      last=len(linkslist)
      print level,first,last
      if first<last:
        for link in linkslist[first:last]:
          search_directory_tree(link,level)
def main():
  if 'productlines' not in show_tables():
    save(['href'],getproductlinelinks(MENU),'productlines')
  hrefs=[row['href'] for row in select('href from productlines')]
  for href in hrefs:
    p=ProductLine(href)
    t=p.current_models_table()

    #Overview
    save(['href'],p.overview(),'overview')

    #Specifications
    save([],t.specifications(units="english"),'specifications')
    save([],t.specifications(units="metric"),'specifications')

    #Links to models
    model_links=t.model_links()
    for model_link in model_links:
      model_link['product-line-href']=p.href
    save(['href'],model_links,'models')

    #Links to non-current models
    save([],p.noncurrent_models_link(),'current_noncurrent')
Example #31
def resume_siblings(js, level):
    if level == 1:
        print "Finished resuming"
    elif not OBS in show_tables():
        pass
    else:
        parent = select('parentjs from %s order by date_scraped desc limit 1' %
                        OBS)[0]['parentjs']
        foo, bar, baz = (eval(parent.replace('getlaw', '')))
        xml = fromstring(getlaw(foo, bar, baz))
        links = get_law_links(xml, parent)
        linkslist = [link['observation']['js'] for link in links]
        if not js in linkslist:
            #It looks like the last sibling scraped was the last child of its parent;
            #None of its siblings need to be scraped
            pass
        else:
            first = linkslist.index(js) + 1
            last = len(linkslist)
            print level, first, last
            if first < last:
                for link in linkslist[first:last]:
                    search_directory_tree(link, level)
from scraperwiki.sqlite import save_var,execute,commit,show_tables
import os

if "swvariables" in show_tables():
  execute("DROP TABLE swvariables;")

before=set(os.listdir('.'))
save_var('foo','bar')
#os.system('rm *.pyc')
after=set(os.listdir('.'))

#print before-after
#print after
s=[]
for f in after:
  if f[0:4]!='data' and not f.endswith('.pyc'):
    s.append(f)

print s

baz=[]
baz.append('script.rb')
baz.append('.cache')
for f in baz:
  print open(f).read()
from scraperwiki.sqlite import save_var, execute, commit, show_tables
import os

if "swvariables" in show_tables():
    execute("DROP TABLE swvariables;")

before = set(os.listdir('.'))
save_var('foo', 'bar')
#os.system('rm *.pyc')
after = set(os.listdir('.'))

#print before-after
#print after
s = []
for f in after:
    if f[0:4] != 'data' and not f.endswith('.pyc'):
        s.append(f)

print s

baz = []
baz.append('script.rb')
baz.append('.cache')
for f in baz:
    print open(f).read()
Example #34
                row.update({'premises_name': premises_name, 'town': town,})
            else:
                row['enter_manually'] = 1

            row.update({'date_scraped': time(), 'ScraperRun': scraper_run, 'url': self.url, 'Record': int(self.url.split('=')[-1])})
            data.append(row)

        save([], data, 'BusinessPremises')

execute('CREATE TABLE IF NOT EXISTS Registrant (ScraperRun INTEGER, Record INTEGER)')
execute('CREATE INDEX IF NOT EXISTS RegistrantRecord ON Registrant(record)')
execute('CREATE TABLE IF NOT EXISTS BusinessPremises (ScraperRun INTEGER, Record INTEGER, FOREIGN KEY(Record) REFERENCES Registrant(Record))')
execute('CREATE INDEX IF NOT EXISTS BusinessPremisesRecord ON BusinessPremises(ScraperRun, Record)')
commit()

if "stack" not in show_tables() or select('count(*) as "c" from stack')[0]['c'] == 0:
    save_var('scraper_run', int(time()))

scraper_run = get_var('scraper_run', None)
if scraper_run == None:
    raise NameError('scraper_run is not defined.') 

seed([SearchResults(None)])
#seed([BusinessPremises('http://www.ncr.org.za/register_of_registrants/viewpremises.php?record=11296')])
from scraperwiki.sqlite import save, select, execute, save_var, get_var, commit, show_tables
from scraperwiki import swimport
#from requests import session
import requests
from lxml.html import fromstring, tostring
import re
from time import time, sleep
keyify=swimport('keyify').keyify
Example #35
        save([], data, 'BusinessPremises')


execute(
    'CREATE TABLE IF NOT EXISTS Registrant (ScraperRun INTEGER, Record INTEGER)'
)
execute('CREATE INDEX IF NOT EXISTS RegistrantRecord ON Registrant(record)')
execute(
    'CREATE TABLE IF NOT EXISTS BusinessPremises (ScraperRun INTEGER, Record INTEGER, FOREIGN KEY(Record) REFERENCES Registrant(Record))'
)
execute(
    'CREATE INDEX IF NOT EXISTS BusinessPremisesRecord ON BusinessPremises(ScraperRun, Record)'
)
commit()

if "stack" not in show_tables() or select(
        'count(*) as "c" from stack')[0]['c'] == 0:
    save_var('scraper_run', int(time()))

scraper_run = get_var('scraper_run', None)
if scraper_run == None:
    raise NameError('scraper_run is not defined.')

seed([SearchResults(None)])
#seed([BusinessPremises('http://www.ncr.org.za/register_of_registrants/viewpremises.php?record=11296')])
from scraperwiki.sqlite import save, select, execute, save_var, get_var, commit, show_tables
from scraperwiki import swimport
#from requests import session
import requests
from lxml.html import fromstring, tostring
import re
from time import time, sleep
  def is_new(this):
    """Check whether I've already saved it"""
    if 'tweets' not in show_tables():
      return True
    else:
      return 0==select('count(*) as c from tweets where id="%s"' % this._tweet['id'])[0]['c']
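
A hedged variant of the check above that binds the tweet id as a query parameter instead of interpolating it into the SQL string; is_new_tweet is an illustrative name.

from scraperwiki.sqlite import show_tables, select

def is_new_tweet(tweet_id):
  #True when no saved tweet has this id; binding avoids quoting problems in ids.
  if 'tweets' not in show_tables():
    return True
  return select('count(*) as c from tweets where id=?', [tweet_id])[0]['c'] == 0
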
Example #37
def swversion(table_name='swdata'):
  if table_name in show_tables():
    timestamp=select("max(date_extracted) as m from %s;" % table_name)[0]['m']
    execute("ALTER TABLE `%s` RENAME TO `%s_%d`;"%(table_name,table_name,timestamp))
    commit()