示例#1
0
def main():
    """Drive the two-step scrape: walk listing pages, then post-process.

    Resumes from the saved `prevpage` variable.  `finalpage` is
    recomputed from page 1 on every run (the cached lookup was
    deliberately disabled upstream) and persisted for inspection.
    """
    prevpage = get_var('prevpage')

    finalpage = int(get_lastpage(getpage(1)))
    save_var('finalpage', finalpage)

    if prevpage is None:
        prevpage = 1

    if prevpage < finalpage:
        # Still walking listing pages.
        step1(prevpage, finalpage)
    elif prevpage == finalpage:
        # One-time setup: mirror `locations` urls into a completion tracker.
        if "step2completion" not in show_tables():
            execute(
                'create table `step2completion` (`url` text, `browsed` boolean)'
            )
            execute("""
        INSERT INTO `step2completion`
        ( url , browsed )
        SELECT url, 0 as "browsed"
        FROM locations
        """)
            commit()
        step2()
示例#2
0
def extract_postcodes():
  """Extract a 4-digit postcode from each branch address and store it.

  The last 4-digit run in the address is used -- presumably the
  postcode trails the address text; verify against the data.
  """
  sql = ' `rowid`, `address` from `branches`;'
  for row in select(sql):
    postcodes = findall(r'[0-9]{4}', row['address'])
    if postcodes:
      execute("UPDATE `branches` SET `postcode` = ? WHERE `rowid` = ? ", (postcodes[-1], row['rowid']) )
  commit()
示例#3
0
def save_file(link, name, position, length, hash):
    """Resume or start downloading *link*, streaming lines into table *name*.

    position/length/hash are saved progress variables:
    * all set   -> verify the already-saved prefix, then continue from
                   `position`;
    * all empty -> start a fresh download with a new sha512;
    * any other combination is inconsistent state and aborts via DataError.

    NOTE(review): the parameter `hash` shadows the builtin of the same name.
    """
    f = urlopen(urlunsplit((scheme, host, link, '', '')))
    content_length = long(f.headers['Content-Length'])
    try:
        # Resuming: the saved length must agree with the server's.
        if position and length and hash and length == content_length and position <= length:
            print 'Checking hash'
            # replay_hash re-reads the first `position` bytes -- presumably
            # to verify them against the stored digest; confirm its contract.
            hash_object = replay_hash(readlines(f, newline, position), b64decode(hash))
            print 'Hash OK'
            if position == length:
                # Nothing left to download.
                print 'File OK'
                return
        elif position == 0 and length == hash == None:
            print 'Starting fresh'
            position, length, hash_object = (0, content_length, sha512())
            sqlite.execute('UPDATE swvariables SET length=? WHERE file=?', (length, name))
        else:
            # Inconsistent progress variables: discard and re-download.
            print "Save error"
            raise DataError
        # Stream the remaining bytes, saving each line and updating progress.
        for line in readlines(f, newline, content_length - position):
            save_line(name, line, hash_object)
        sqlite.commit()
        print 'All OK'
    except DataError:
        delete_table(name)
        print 'DataError'
示例#4
0
def swversion(table_name='swdata'):
    """Archive an existing table by renaming it with its newest date_extracted."""
    if table_name not in show_tables():
        return
    # Suffix the table with its most recent extraction timestamp.
    timestamp = select("max(date_extracted) as m from %s;" %
                       table_name)[0]['m']
    execute("ALTER TABLE `%s` RENAME TO `%s_%d`;" %
            (table_name, table_name, timestamp))
    commit()
示例#5
0
def clean_description_keys():
    """Map raw description keys onto a small canonical set via GLOB patterns.

    Adds a `key_cleaned` column (once) and fills it for rows whose
    lowercased key matches a glob below.  Earlier pairs take priority
    because later UPDATEs only touch rows still NULL.
    """
    KEY_GLOB_PAIRS = [
        ('affiliations', '*affiliations*'),
        ('courses', '*courses*'),
        ('research', '*research*'),
        ('research', '*interests*'),
        ('honors', '*honors*'),
        ('publications', '*publications*'),
        ('education', '*education*'),
        ('introduction', 'introduction*'),
        ('introduction', 'biography'),
    ]

    # ALTER fails if the column already exists; treat that as "already set up".
    try:
        execute('ALTER TABLE descriptions ADD COLUMN key_cleaned TEXT')
    except Exception:
        pass
    else:
        commit()

    for pair in KEY_GLOB_PAIRS:
        # Bound parameters instead of interpolated "..." literals, which
        # SQLite may silently treat as identifiers.
        execute(
            'UPDATE descriptions SET key_cleaned = ? WHERE lower(key) GLOB ? AND key_cleaned IS NULL',
            pair)
        commit()
    def pop(self):
        """Return the top stack entry and remove its row from main.stack."""
        top = self.last()

        # Remove the newest row -- the one last() just loaded.
        execute('delete from main.stack where rowid = (select max(rowid) from main.stack)')
        commit()

        return top
示例#7
0
def aggregate():
    """Rebuild the twitter handle-count table from `tcamp` rows."""
    statements = (
        'create table if not exists twitter (handle text, times integer)',
        'create unique index if not exists twitter_handle on twitter(handle)',
        # Full refresh: clear, then re-aggregate.
        'delete from twitter where 1 = 1',
        'insert into twitter '
        'select replace(`twitter handle`, "@", ""), count(*) from `tcamp` '
        'where `twitter handle` is not null group by [twitter handle]',
    )
    for statement in statements:
        execute(statement)
    commit()
示例#8
0
def main():
    """Finish step two if the scrape reached it; otherwise keep downloading."""
    if get_var('province') != 'step2':
        download()
        return
    separate_addresses()
    # Clear the resume marker now that the final step has run.
    execute('DELETE FROM swvariables WHERE name = "province"')
    commit()
    print("""
    ================================
    This run is finished!
    ================================
    """)
def main():
  """Build the `users` table in phases, each idempotent and resumable.

  Phases: collect owners per scraper url, then add bio html, plain-text
  bio, code-role counts, and title-variation stats.  Each phase ALTERs
  in its columns only if missing, then fills the rows that
  getUsernames() reports as still lacking the column.
  """
  for url in getUrls():
    slug=getScraperSlug(url)
    try:
      owners=getScraperOwners(slug)
    except:
      # Any failure is recorded against the url; note the bare except
      # swallows every exception type here.
      save(['url'],{"url":url},'errors')
    else:
      for owner in owners:
        save(['username'],{"username":owner},'users')
    save(['url'],{"url":url,"scraped":True},'urls')

  print 'Add bio html'
  # Column-exists check: inspect the CREATE TABLE sql in sqlite_master.
  if "`bio` TEXT" not in execute("SELECT sql FROM sqlite_master WHERE tbl_name = 'users' AND type = 'table'")['data'][0][0]:
    execute("ALTER TABLE `users` ADD COLUMN `bio` TEXT;")

  for username in getUsernames("bio"):
    bio=getUserProfile(username)
    save(['username'],{"username":username,"bio":bio},'users')

  print 'Add biotext'
  if "`biotext` TEXT" not in execute("SELECT sql FROM sqlite_master WHERE tbl_name = 'users' AND type = 'table'")['data'][0][0]:
    execute("ALTER TABLE `users` ADD COLUMN `biotext` TEXT;")

  for username in getUsernames("biotext"):
    bio=select('`bio` FROM `users` WHERE `username`=?',[username])[0]["bio"]
    biotext=getBioText(bio)
    save(['username'],{"username":username,"bio":bio,"biotext":biotext},'users')

  print 'Add code roles'
  if "`owns` INT" not in execute("SELECT sql FROM sqlite_master WHERE tbl_name = 'users' AND type = 'table'")['data'][0][0]:
    execute("ALTER TABLE `users` ADD COLUMN `owns` INT;")
    execute("ALTER TABLE `users` ADD COLUMN `edits` INT;")

  for username in getUsernames("owns"):
    d=getCodeRoles(username)
    execute("UPDATE `users` SET owns=?,edits=? WHERE username=?",[d["owns"],d["edits"],username])
    commit()

  print 'Add title variation'
  if "`distinct_title_tokens_count` INT" not in execute("SELECT sql FROM sqlite_master WHERE tbl_name = 'users' AND type = 'table'")['data'][0][0]:
    execute("ALTER TABLE `users` ADD COLUMN `distinct_title_tokens_count` INT;")
    execute("ALTER TABLE `users` ADD COLUMN `title_tokens` TEXT;")

  for username in getUsernames("distinct_title_tokens_count"):
    # NOTE(review): the UPDATE below also sets `title_tokens_count`,
    # which is never ALTERed in above -- presumably it already exists;
    # confirm, or this statement will fail.
    json=getUserJSON(username)
    d=titleVariation(json)
    execute("""
      UPDATE `users` SET distinct_title_tokens_count=?,title_tokens_count=?,title_tokens=?
      WHERE username=?;
      """,[d["distinct_count"],d["total_count"],d["text"],username]
    )
    commit()
示例#10
0
def initialize():
    """Create the address/geocode/branch_address schema and seed `address`.

    One `address` row is created per distinct value of each configured
    COLUMNS expression; `branch_address` links each scraped row to its
    address so geocode results can be joined back later.
    """
    execute("""
CREATE TABLE `address` (
  `address-column` text,
  `address-input` text,
  `finished` integer
)""")
    execute("CREATE UNIQUE INDEX column ON `address` (`address-column`,`address-input`);")
    execute("""
CREATE TABLE `geocode` (
  `address-geocode` TEXT,
  `latitude-geocode` REAL,
  `longitude-geocode` REAL,

  `number-of-matches` text,

  `service` TEXT,
  `address-input` TEXT
)""")
    execute("CREATE UNIQUE INDEX geocode_key ON `geocode` (`address-input`, `service`);")

    execute('''
CREATE TABLE `branch_address` (
  `address-column` TEXT,
  `address-input` TEXT,
  `entityRecord` INTEGER,
  FOREIGN KEY(entityRecord) REFERENCES scraped(rowid)
)''')
    execute("CREATE UNIQUE INDEX branch_key ON `branch_address` (`entityRecord`, `address-column`);")
    commit()

    # COLUMNS is defined elsewhere -- presumably a mapping of column
    # label -> SQL expression; the expression is interpolated (it is
    # SQL, not data) and the label is bound as a parameter.  Confirm.
    for column in COLUMNS:
        execute('''
INSERT INTO `address` (
  `address-column`,
  `address-input`,
  `finished`
) SELECT DISTINCT
    ? as "address-column",
    %s as "address-input",
    0 as "finished"
  FROM
    `scraped`
  ''' % COLUMNS[column], column)
        commit()
        execute('''
INSERT INTO `branch_address` (
  `entityRecord`,
  `address-column`,
  `address-input`
) SELECT
    `rowid`, 
    ? as "address-column",
    %s as "address-input"
  FROM
    `scraped`
  ''' % COLUMNS[column], column)
        commit()
    # Rows with no address text cannot be geocoded; drop them up front.
    execute('DELETE FROM `address` WHERE `address-input` IS NULL')
    commit()
def atomic():
  """Roll back rows written by the most recent (possibly partial) job.

  Determines whether the previously visited href was a client or
  lobbyist page, deletes every row written under that job's source url,
  and returns the url so the caller can resume from it.

  Raises ResumeError when the page type cannot be determined.
  """
  if "client"==pagetype(get_var('previous_href')):
    table_names=CLIENT_TABLES
  elif "lobbyist"==pagetype(get_var('previous_href')):
    table_names=LOBBYIST_TABLES
  else:
    raise ResumeError('The type of the previous href, "%s", could not be determined.' % get_var('previous_href'))

  if "clients_urls" in show_tables():
    sourceUrl=select('distinct sourceUrl as "s" from `clients_urls` where jobId=(select max(jobId) from `clients_urls`)')[0]['s']
    for table_name in table_names:
      # The table name cannot be a bound parameter; the url is bound so
      # quotes in the data cannot break (or inject into) the statement.
      execute('DELETE FROM `%s` where jobId in (select jobId from clients_urls where sourceUrl=?)' % table_name, [sourceUrl])
    commit()
    return sourceUrl
示例#12
0
def geocode():
    """Geocode every unfinished address, one row at a time, resumably."""
    if "address" not in show_tables():
        initialize()

    # One address per iteration, so an interrupted run loses at most one.
    while select('count(*) AS "c" FROM `address` WHERE `finished` = 0')[0]['c'] > 0:
        address = select("`address-column`, `address-input` FROM `address` WHERE `finished` = 0 LIMIT 1")[0]
        #print address
        # Only call the geocoders if no earlier row already covered this input.
        if select('count(*) AS "c" FROM `geocode` WHERE `address-input` = ?', [address['address-input']])[0]['c'] == 0:
            d = all_services(address['address-input'])
            for row in d:
                row['address-input'] = address['address-input']
            save([], d, 'geocode')
        # Mark the row finished whether or not new geocode rows were saved.
        params = (address['address-column'], address['address-input'])
        execute("UPDATE `address` SET `finished` = 1 WHERE (`address-column` = ? AND `address-input` = ?)", params )
        commit()
示例#13
0
def separate_addresses():
    """Split each initial.address into components and write the `final` table."""
    execute('DROP TABLE IF EXISTS final')
    commit()
    records = select('* from `initial`')
    for record in records:
        parts = record['address'].split('\n')
        # Addresses come in two or three lines; anything else is malformed.
        if len(parts) == 3:
            record['street-address'], record['subtown'], record['town2'] = parts
        elif len(parts) == 2:
            record['street-address'], record['subtown'] = parts
        else:
            raise AddressError
        record['street-address'] = record['street-address'].strip()
        record['address'] = strip_address(record['address'])
    save([], records, 'final')
示例#14
0
    def pop(self):
        """Pop the newest stack row, reconstructing it as an object.

        The row's classname and url columns are turned back into a
        constructor call which is then eval'd.
        """
        # Query
        query = select('* from stack where rowid = (select max(rowid) from stack)')

        # Load
        # SECURITY(review): eval of database contents -- anything able to
        # write to `stack` can execute arbitrary code here.
        instantiate = "%s(%s)" % (query[0]['classname'], dumps(query[0]['url']))
        print instantiate
        obj = eval(instantiate)

        # Delete
        execute('delete from stack where rowid = (select max(rowid) from stack)')
        commit()

        # Remember in case of error
        # NOTE(review): `justpopped` is a local and is discarded on return --
        # presumably it was meant to be stored on self or a global; confirm.
        justpopped = obj

        return obj
def main():
    """Scan scraper code for join/attach/twitter usage, then summarize.

    Each url's code is flagged for the three substrings and saved
    (default `swdata` table); the final query counts, per user, the
    scrapers using both join and attach, saved into `results`.
    """
    if "urls" not in show_tables():
        copyUrlsDb()
    for url in getUrls():
        slug = getScraperSlug(url)
        code, user = getCode(slug)
        if code is not None:
            c = code.lower()
            save(['url'], {
                "code":code, "user": user, "url": url,
                "has_join": " join " in c,
                "has_attach": "attach" in c,
                "has_twitter": "twitter" in c,
            })
        # Mark the url processed even when no code came back.
        execute('UPDATE `urls` SET `scraped`=1 WHERE `url` = ?', url)
        commit()

    d = select('`user`, count(*) AS "attach-and-join-count" from `swdata` WHERE (`has_join` = 1 and `has_attach` = 1) GROUP BY `user`')
    save(['user'], d, 'results')
示例#16
0
def join():
    """Join geocoder output to scraped branches and record distance error.

    Rebuilds the `accuracy` table: each row gets `kilometers_off`, the
    great-circle distance between scraped and geocoded coordinates
    (None when the geocoder returned no address).
    """
    execute('DROP TABLE IF EXISTS `accuracy`')
    commit()
    d = select('''
* FROM `geocode`
LEFT JOIN `branch_address` ON (
  `branch_address`.`address-input` = `geocode`.`address-input`
)
''')
    for row in d:
        if row['address-geocode'] is None:
            row['kilometers_off'] = None
        else:
            row['kilometers_off'] = distance.distance(
                (row['latitude-scrape'], row['longitude-scrape']),
                (row['latitude-geocode'], row['longitude-geocode'])
            ).km
        # Drop the raw coordinate columns; only the error metric is kept.
        for key in ['latitude-scrape', 'longitude-scrape', 'latitude-geocode', 'longitude-geocode', 'address-input']:
            del row[key]
    save([], d, 'accuracy')
示例#17
0
def geocode():
    """Resumably geocode every address, importing the source CSV on first run."""
    if "scraped" not in show_tables():
        # First run: pull the combined spreadsheet export into `scraped`.
        d = swimport('csv2sw').read.csv('https://views.scraperwiki.com/run/combine_mix_scraper_spreadsheets/')
        save([], d, 'scraped')

    if "address" not in show_tables():
        initialize()

    # One address per iteration, so an interrupted run resumes cleanly.
    while select('count(*) AS "c" FROM `address` WHERE `finished` = 0')[0]['c'] > 0:
        address = select("`address-column`, `address-input` FROM `address` WHERE `finished` = 0 LIMIT 1")[0]

        #print address
        # Only call the geocoders if no earlier row already covered this input.
        if select('count(*) AS "c" FROM `geocode` WHERE `address-input` = ?', [address['address-input']])[0]['c'] == 0:
            d = all_services(address['address-input'])
            for row in d:
                row['address-input'] = address['address-input']
            save([], d, 'geocode')
        params = (address['address-column'], address['address-input'])
        execute("UPDATE `address` SET `finished` = 1 WHERE (`address-column` = ? AND `address-input` = ?)", params )
        commit()
def geocode():
    """Resumably geocode South African addresses from the combined CSV."""
    if "scraped" not in show_tables():
        # First run: import the source data and keep only South Africa.
        d = swimport('csv2sw').read.csv('http://hacks.thomaslevine.com/all.csv')
        save([], d, 'scraped')
        execute('DELETE FROM `scraped` WHERE `Country` != "South Africa"')
        commit()

    if "address" not in show_tables():
        initialize()

    # One address per iteration, so an interrupted run resumes cleanly.
    while select('count(*) AS "c" FROM `address` WHERE `finished` = 0')[0]['c'] > 0:
        address = select("`address-column`, `address-input` FROM `address` WHERE `finished` = 0 LIMIT 1")[0]

        #print address
        # Only call the geocoders if no earlier row already covered this input.
        if select('count(*) AS "c" FROM `geocode` WHERE `address-input` = ?', [address['address-input']])[0]['c'] == 0:
            d = all_services(address['address-input'])
            for row in d:
                row['address-input'] = address['address-input']
            save([], d, 'geocode')
        params = (address['address-column'], address['address-input'])
        execute("UPDATE `address` SET `finished` = 1 WHERE (`address-column` = ? AND `address-input` = ?)", params )
        commit()
def initialize():
    """Create the address/geocode/branch_address schema and seed it.

    If PREVIOUS_SCRAPER is defined, previously geocoded rows are copied
    from that attached scraper instead of starting with an empty
    `geocode` table, and matching nominatim results are pre-marked
    finished so they are not re-geocoded.
    """
    execute("""
CREATE TABLE `address` (
  `address-column` text,
  `address-input` text,
  `finished` integer
)""")
    execute("CREATE UNIQUE INDEX column ON `address` (`address-column`,`address-input`);")

    # PREVIOUS_SCRAPER is an optional module-level setting; if it is not
    # defined at all we start with a fresh, empty geocode table.
    try:
        PREVIOUS_SCRAPER
    except NameError:
        execute("""
    CREATE TABLE `geocode` (
      `address-geocode` TEXT,
      `latitude-geocode` REAL,
      `longitude-geocode` REAL,

      `number-of-matches` text,

      `service` TEXT,
      `address-input` TEXT
    )""")
    else:
        attach(PREVIOUS_SCRAPER)
        save([], select('* FROM `geocode`'), 'geocode')

    execute("CREATE UNIQUE INDEX geocode_key ON `geocode` (`address-input`, `service`);")

    execute('''
CREATE TABLE `branch_address` (
  `address-column` TEXT,
  `address-input` TEXT,
  `entityRecord` INTEGER,
  FOREIGN KEY(entityRecord) REFERENCES scraped(rowid)
)''')
    execute("CREATE UNIQUE INDEX branch_key ON `branch_address` (`entityRecord`, `address-column`);")
    commit()

    # COLUMNS presumably maps a column label to a SQL expression; the
    # expression is interpolated (it is SQL, not data) and the label is
    # bound as a parameter.
    for column in COLUMNS:
        execute('''
INSERT INTO `address` (
  `address-column`,
  `address-input`,
  `finished`
) SELECT DISTINCT
    ? as "address-column",
    %s as "address-input",
    0 as "finished"
  FROM
    `scraped`
  ''' % COLUMNS[column], column)
        commit()
        execute('''
INSERT INTO `branch_address` (
  `entityRecord`,
  `address-column`,
  `address-input`
) SELECT
    `rowid`,
    ? as "address-column",
    %s as "address-input"
  FROM
    `scraped`
  ''' % COLUMNS[column], column)
        commit()
    # Rows with no address text cannot be geocoded; drop them up front.
    execute('DELETE FROM `address` WHERE `address-input` IS NULL')
    commit()
    # Anything nominatim already geocoded does not need re-processing.
    execute('''
UPDATE address SET finished = 1
WHERE rowid in (
  select address.rowid from `address`
  join `geocode` on
  address.`address-input` = geocode.`address-input` where service = "nominatim"
)
''')
    commit()
def delete_table(name):
    """Reset download bookkeeping for *name* and drop its data table.

    Called when a hash mismatch (DataError) invalidates a partial download.
    """
    # NOTE(review): a bare string is passed where sibling calls pass a
    # tuple -- presumably the sqlite wrapper accepts both; confirm.
    sqlite.execute('UPDATE swvariables SET position=0,length=NULL,hash=NULL WHERE file=?', name)
    #sqlite.execute('DELETE FROM ' + name)
    sqlite.execute('DROP TABLE ' + name)
    sqlite.commit()
def save_line(name, line, hash):
    """Save one delimited line into table *name* and advance the progress hash.

    NOTE(review): the parameter `hash` shadows the builtin of the same name.
    """
    hash.update(line)
    # line[:-2] drops the last two characters -- presumably a trailing
    # delimiter plus newline (or CRLF); confirm against the data format.
    save_row(name, imap(process_cell, line[:-2].split(cell_delimiter)))
    # Persist byte offset and running digest so the download can resume.
    sqlite.execute('UPDATE swvariables SET position=position+?,hash=? WHERE file=?', (len(line), b64encode(hash.digest()), name), verbose=0)
    sqlite.commit()
            raise DataError
        for line in readlines(f, newline, content_length - position):
            save_line(name, line, hash_object)
        sqlite.commit()
        print 'All OK'
    except DataError:
        delete_table(name)
        print 'DataError'

try:
    save_files()  
except URLError:
    pass
except Exception, e:
    print e
    sqlite.commit()import lxml.html, re
from base64 import b64decode, b64encode
from collections import deque
from functools import reduce
from hashlib import sha512
from itertools import chain, ifilter, imap, izip, product, repeat
from operator import add, and_, eq
from scraperwiki import datastore, sqlite
from urllib2 import URLError, urlopen
from urlparse import urlunsplit


scheme='http'
host = 'www.ars.usda.gov'
path = '/Services/docs.htm'
query = 'docid=8964'
示例#23
0
 def __init__(self, startingstack):
     """Create the backing stack table (if needed) and seed it with *startingstack*."""
     execute('create table if not exists stack (obj, blob);')
     commit()
     self.extend(startingstack)
    maphref=td.xpath('a/attribute::href')[0]
    branch.update(parse_maphref(maphref))

    return branch

def parse_maphref(maphref):
    """Parse the popup HTML embedded in a map href into a dict of map_* keys."""
    # The href embeds an HTML snippet between single quotes.
    html = maphref.split("'")[1].replace('<br>', '')
    tree = fromstring(html)
    labels = ["map_%s" % keyify(label) for label in tree.xpath('strong/text()')]
    return dict(zip(labels, tree.xpath('text()')))

# Schema: provinces -> cities -> branches, linked by url foreign keys.
execute('CREATE TABLE IF NOT EXISTS provinces (provinceUrl TEXT )')
execute('CREATE TABLE IF NOT EXISTS cities (provinceUrl TEXT, cityUrl TEXT, FOREIGN KEY(provinceUrl) REFERENCES provinces(provinceUrl) )')
execute('CREATE TABLE IF NOT EXISTS branches (cityUrl TEXT, branchUrl TEXT, FOREIGN KEY(cityUrl) REFERENCES cities(cityUrl) )')
commit()

# Tag this run with a timestamp so a crashed run resumes under the same
# id; the variable is cleared once the seed walk completes.
scraperrun = get_var('scraperrun', int(time()))
save_var('scraperrun', scraperrun)
seed([Menu(URLS['main'])])
execute('delete from swvariables where name = "scraperrun"')
commit()from lxml.html import fromstring
#from lxml.etree import fromstring
from time import time
import requests
from scraperwiki.sqlite import save,save_var, get_var, select, commit, execute
from scraperwiki import swimport
options=swimport('options').options
keyify=swimport('keyify').keyify
randomsleep=swimport('randomsleep').randomsleep
from json import loads,dumps
示例#25
0
        })
    save(['date_scraped', 'businessPremisesURL'], data, 'business_premises')
    randomsleep()

#execute('DELETE from cp1 where date_scraped != (select min(date_scraped) from cp1)')
#execute('DELETE from businessPremises where date_scraped != (select min(date_scraped) from cp1)')
#commit()

#execute('UPDATE swvariables SET value_blob = (select min(date_scraped) from cp1) where name = "DATE"')
#commit()

Test(download=False)
cp1()
save_var('crashed', 0)
execute('DELETE FROM swvariables WHERE name = "DATE"')
commit()from scraperwiki.sqlite import save, select, execute, save_var, get_var, commit
from scraperwiki import swimport
from requests import session
from lxml.html import fromstring, tostring
import re
from time import time, sleep
keyify=swimport('keyify').keyify

URL="http://www.ncr.org.za/register_of_registrants/index.php"

#DEV=True
DEV=False

DATE = get_var('DATE', time())

RE={
        except:
            sleep(60)
# Testing
#print get_branches('7','711')

if __name__ == '__main__':
    # If this is run locally, actually run it.
    getlocations('1')
    getlocations('2')
    getlocations('3')
elif __name__ == 'scraper':
    # If this is run on scraperwiki, just create the schema.
    from scraperwiki.sqlite import execute, commit
    execute('CREATE TABLE IF NOT EXISTS branches ( City TEXT, Latitude TEXT, Longitute TEXT, Status TEXT, SuburdId TEXT, TownName TEXT, Address1 TEXT, Address2 TEXT, EntityType TEXT, EntityCentreNumber TEXT, ProvinceId TEXT, RegionName TEXT, Province TEXT, EntityTypeOptionId TEXT, OperatingHoursWeekend TEXT, EntityName TEXT, EntitySwiftCode TEXT, IBTNumber TEXT, EntityTypeId TEXT, OperatingHoursWeekDay TEXT, DateAdded TEXT, EntityTypeOption TEXT, AddedBy TEXT, CityId TEXT, SuburdSuburb TEXT, EntityDescription TEXT, Guid TEXT, Id TEXT, StreetName TEXT )')
    execute('CREATE TABLE IF NOT EXISTS branches_backup ( City TEXT, Latitude TEXT, Longitute TEXT, Status TEXT, SuburdId TEXT, TownName TEXT, Address1 TEXT, Address2 TEXT, EntityType TEXT, EntityCentreNumber TEXT, ProvinceId TEXT, RegionName TEXT, Province TEXT, EntityTypeOptionId TEXT, OperatingHoursWeekend TEXT, EntityName TEXT, EntitySwiftCode TEXT, IBTNumber TEXT, EntityTypeId TEXT, OperatingHoursWeekDay TEXT, DateAdded TEXT, EntityTypeOption TEXT, AddedBy TEXT, CityId TEXT, SuburdSuburb TEXT, EntityDescription TEXT, Guid TEXT, Id TEXT, StreetName TEXT )')
    commit()"""This doesn't work on ScraperWiki because of the SSL; run it locally instead.
Use these commands to return the appropriately formatted CSV file.

SELECT
  `date_scraped`,
  "Standard Bank" as "source",
  "Bank" as "source-type",
  "Standard Bank" as entity,
  "Bank" as "entity-type",
  `location-type` as "location-type",
  "http://locator.standardbank.co.za/Default.aspx" as "url",
  `StreetName` as "street-address",
  `TownName` as "town",
  `provinceName` as "province",
  `StreetName` || "\n" || `TownName` || "\n" || `provinceName` as "full-address",
  "South Africa" as "country"
'''
I figure out how different types are saved to the database.
'''
import datetime
from scraperwiki.sqlite import save, save_var, execute, commit

# Start from a clean slate: remove any tables from a previous run.
execute('drop table if exists swdata')
execute('drop table if exists swvariables')
execute('drop table if exists complex')
commit()

# Save an empty instance of each builtin type, both as a variable and as
# a row, to see how each is stored (Python 2: includes unicode and long).
for t in {dict, list, set, str, unicode, bool, int, float, long}:
    save_var(str(t), t())
    save([], {str(t).replace("'","").replace(' ', '').replace('<type', '').replace('>', ''): t()})

save([], {
    u'list': [u'thing'],
    u'dict': {u'key': u'value'},
    u'set': {u'thing'},
}, 'complex')'''
I figure out how different types are saved to the database.
'''
import datetime
from scraperwiki.sqlite import save, save_var, execute, commit

execute('drop table if exists swdata')
execute('drop table if exists swvariables')
execute('drop table if exists complex')
commit()

for t in {dict, list, set, str, unicode, bool, int, float, long}:
示例#28
0
 def commit(self):
     """Flush pending writes through the module-level sqlite handle."""
     lite.commit()
def drop_above_jobId(jobId,table_names):
  """Delete rows with jobId greater than *jobId* from each named table.

  Used to roll back a partially-completed job before resuming it.
  """
  for table_name in table_names:
    # The table name cannot be a bound parameter; the jobId is bound
    # rather than interpolated into the SQL text.
    execute('DELETE FROM `%s` where jobId>?' % table_name, [jobId])
  commit()
示例#30
0
 def __close__(self):
     """Commit pending writes.

     NOTE(review): ``__close__`` is not a special method Python invokes
     automatically -- confirm something calls it explicitly.
     """
     lite.commit()