def analyze():
    d = select("""
        `link-href`,
        GROUP_CONCAT(`author`) AS `authors`,
        count(*) AS "count"
        FROM `links`
        JOIN `topics` ON `links`.`topic-href` = `topics`.`topic-href`
        GROUP BY `link-href`
    """)
    execute('DROP TABLE IF EXISTS `wrote-about-same-things`')
    save([], d, 'wrote-about-same-things')
    print '''
These look most exciting because three different people wrote about each.

3 Kiana Fitzgerald,Sara Peralta,Susan Raybuck http://schedule.sxsw.com/2012/events/event_IAP100409
3 Shawn Dullye,Joe Vasquez,Sara Peralta http://schedule.sxsw.com/2012/events/event_IAP10593
3 Shawn Dullye,Kiana Fitzgerald,Sara Peralta http://schedule.sxsw.com/2012/events/event_IAP13848

Of course, that isn't adjusted for how many each person wrote.
'''
    d = select("""
        author,
        count(*) AS `how-many`
        FROM `links`
        JOIN topics ON links.`topic-href` = topics.`topic-href`
        GROUP BY author
        ORDER BY 2 DESC
    """)
    save(['author'], d, 'how-many-did-you-link')
    print """
def save_file(link, name, position, length, hash):
    f = urlopen(urlunsplit((scheme, host, link, '', '')))
    content_length = long(f.headers['Content-Length'])
    try:
        if position and length and hash and length == content_length and position <= length:
            # Resuming: replay the already-saved prefix through the hash to verify it.
            print 'Checking hash'
            hash_object = replay_hash(readlines(f, newline, position), b64decode(hash))
            print 'Hash OK'
            if position == length:
                print 'File OK'
                return
        elif position == 0 and length == hash == None:
            # First attempt at this file.
            print 'Starting fresh'
            position, length, hash_object = (0, content_length, sha512())
            sqlite.execute('UPDATE swvariables SET length=? WHERE file=?', (length, name))
        else:
            print "Save error"
            raise DataError
        # Download and save whatever remains.
        for line in readlines(f, newline, content_length - position):
            save_line(name, line, hash_object)
        sqlite.commit()
        print 'All OK'
    except DataError:
        delete_table(name)
        print 'DataError'
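# `readlines` and `replay_hash` are used above but not defined in this
# section. A minimal sketch of what their contracts appear to be, judging
# only from the call sites -- both bodies are assumptions (the custom
# `newline` handling is simplified away here):
from hashlib import sha512

def readlines(f, newline, nbytes):
    # Hypothetical: yield lines with their terminators, up to a byte budget.
    taken = 0
    while taken < nbytes:
        line = f.readline()
        if not line:
            break
        taken += len(line)
        yield line

def replay_hash(lines, expected_digest):
    # Hypothetical: re-feed the already-saved bytes into a fresh hash and
    # verify the digest matches what was stored, so the download can resume.
    hash_object = sha512()
    for line in lines:
        hash_object.update(line)
    if hash_object.digest() != expected_digest:
        raise DataError
    return hash_object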
def extract_postcodes():
    sql = '`rowid`, `address` FROM `branches`;'
    for row in select(sql):
        # Use the last four-digit run in the address as the postcode.
        postcodes = findall(r'[0-9]{4}', row['address'])
        if len(postcodes) != 0:
            execute("UPDATE `branches` SET `postcode` = ? WHERE `rowid` = ?",
                    (postcodes[-1], row['rowid']))
    commit()
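# Why take the last match: addresses can contain other four-digit runs
# (unit or street numbers) before the trailing postcode. A quick
# demonstration with a made-up address:
from re import findall

postcodes = findall(r'[0-9]{4}', '12 Main Road, Unit 2010, Cape Town, 8001')
print postcodes       # ['2010', '8001']
print postcodes[-1]   # '8001' -- the trailing postcode wins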
def clean_description_keys():
    KEY_GLOB_PAIRS = [
        ('affiliations', '*affiliations*'),
        ('courses', '*courses*'),
        ('research', '*research*'),
        ('research', '*interests*'),
        ('honors', '*honors*'),
        ('publications', '*publications*'),
        ('education', '*education*'),
        ('introduction', 'introduction*'),
        ('introduction', 'biography'),
    ]
    try:
        execute('ALTER TABLE descriptions ADD COLUMN key_cleaned TEXT')
    except:
        # The column already exists.
        pass
    else:
        commit()
    for pair in KEY_GLOB_PAIRS:
        execute('UPDATE descriptions SET key_cleaned = "%s" '
                'WHERE lower(key) GLOB "%s" AND key_cleaned IS NULL' % pair)
    commit()
def swversion(table_name='swdata'):
    if table_name in show_tables():
        timestamp = select("max(date_extracted) as m from %s;" % table_name)[0]['m']
        execute("ALTER TABLE `%s` RENAME TO `%s_%d`;" % (table_name, table_name, timestamp))
        commit()
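# A usage sketch (the timestamp is made up): if `swdata` exists and its
# newest `date_extracted` is 1357000000, the call archives the table under
# a timestamped name so the next run can write a fresh `swdata`.
swversion()             # renames swdata -> swdata_1357000000
swversion('locations')  # same idea for another table, if it exists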
def main():
    #finalpage = get_var('finalpage')
    prevpage = get_var('prevpage')
    #if None == finalpage:
    if True:
        finalpage = int(get_lastpage(getpage(1)))
        save_var('finalpage', finalpage)
    if None == prevpage:
        prevpage = 1
    if prevpage < finalpage:
        step1(prevpage, finalpage)
    elif prevpage == finalpage:
        if "step2completion" not in show_tables():
            execute('create table `step2completion` (`url` text, `browsed` boolean)')
            execute("""
                INSERT INTO `step2completion` (url, browsed)
                SELECT url, 0 as "browsed" FROM locations
            """)
            commit()
        step2()
def download():
    execute('CREATE TABLE IF NOT EXISTS `tcamp` '
            '(`year` integer, `first_scraped` real, `Twitter handle` text, '
            '`Intro for your fellow campers` text)')
    execute('DELETE FROM tcamp')
    union(2013, 'tcamp13', '2013')
    union(2012, 'tcamp12', '2012')
    union(2011, 'tcamp11', '2011')
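# `union` is not defined in this section. Judging from the calls above, it
# appends one year's source table into `tcamp`; a sketch of that contract
# (the column handling is an assumption, and the third argument -- which
# looks like a sheet label -- is unused here):
def union(year, table, label):
    execute('INSERT INTO `tcamp` '
            'SELECT %d AS `year`, `first_scraped`, `Twitter handle`, '
            '`Intro for your fellow campers` FROM `%s`' % (year, table))
    commit()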
def main():
    if None == get_var('downloaded'):
        download()
        save_var('downloaded', 1)
    execute('DROP TABLE IF EXISTS `final`')
    clean()
    save_var('downloaded', None)
def pop(self):
    obj = self.last()
    # Delete
    execute('delete from main.stack where rowid = (select max(rowid) from main.stack)')
    commit()
    return obj
def save_row(name, row):
    try:
        row = tuple(row)
        sqlite.execute('INSERT INTO %s VALUES (%s)' %
                       (name, join(repeat('?', len(columns(name))))), row, verbose=0)
    except sqlite.SqliteError, e:
        print e
        print row
        raise DataError
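# To make the string-building concrete, here is the statement save_row
# generates for a hypothetical `files` entry (see the module setup below
# for the real `files`, `join` and `columns`):
from itertools import repeat

files = {'prices': (('date',), ('open', 'close'))}  # hypothetical entry
print 'INSERT INTO %s VALUES (%s)' % (
    'prices', ','.join(repeat('?', len(files['prices'][0] + files['prices'][1]))))
# INSERT INTO prices VALUES (?,?,?)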
def step2():
    urls = [row['url'] for row in
            select('url from step2completion where browsed=0 limit 1456')]
    #That seems to be near the CPU-time limit
    for url in urls:
        save_sidebar(url)
        #Then update step2completion
        execute('UPDATE step2completion SET browsed=1 WHERE url=?', url)
def main():
    if get_var('province') == 'step2':
        separate_addresses()
        execute('DELETE FROM swvariables WHERE name = "province"')
        commit()
        print("""
        ================================
          This run is finished!
        ================================
        """)
    else:
        download()
def atomic():
    if "client" == pagetype(get_var('previous_href')):
        table_names = CLIENT_TABLES
    elif "lobbyist" == pagetype(get_var('previous_href')):
        table_names = LOBBYIST_TABLES
    else:
        raise ResumeError('The type of the previous href, "%s", could not be determined.'
                          % get_var('previous_href'))
    if "clients_urls" in show_tables():
        sourceUrl = select('distinct sourceUrl as "s" from `clients_urls` '
                           'where jobId=(select max(jobId) from `clients_urls`)')[0]['s']
        for table_name in table_names:
            execute('DELETE FROM `%s` where jobId in '
                    '(select jobId from clients_urls where sourceUrl="%s")'
                    % (table_name, sourceUrl))
        commit()
        return sourceUrl
def moreparsing_map():
    "Map along the most recent results in the table (like a Couch map) and return a new one"
    d = select("* FROM `swdata` WHERE date_scraped=(SELECT max(date_scraped) from `swdata`);")
    for row in d:
        row['street-address'], row['postal-code'] = splitAddress(row['Address_'])
        row['town'] = extractTown(row['branchName'])
    if 'final' in show_tables():
        execute('DROP TABLE `final`;')
    d_final = []
    for row in d:
        if row['regionName'] not in ["Botswana", "Malawi", "Nambia"]:
            d_final.append(row)
    save([], d_final, 'final')
def geocode(): if "address" not in show_tables(): initialize() while select('count(*) AS "c" FROM `address` WHERE `finished` = 0')[0]['c'] > 0: address = select("`address-column`, `address-input` FROM `address` WHERE `finished` = 0 LIMIT 1")[0] #print address if select('count(*) AS "c" FROM `geocode` WHERE `address-input` = ?', [address['address-input']])[0]['c'] == 0: d = all_services(address['address-input']) for row in d: row['address-input'] = address['address-input'] save([], d, 'geocode') params = (address['address-column'], address['address-input']) execute("UPDATE `address` SET `finished` = 1 WHERE (`address-column` = ? AND `address-input` = ?)", params ) commit()
def separate_addresses():
    execute('DROP TABLE IF EXISTS final')
    commit()
    d = select('* from `initial`')
    for row in d:
        splitaddress = row['address'].split('\n')
        l = len(splitaddress)
        if l == 3:
            row['street-address'], row['subtown'], row['town2'] = splitaddress
        elif l == 2:
            row['street-address'], row['subtown'] = splitaddress
        else:
            raise AddressError
        row['street-address'] = row['street-address'].strip()
        row['address'] = strip_address(row['address'])
    save([], d, 'final')
def aggregate():
    execute('create table if not exists twitter (handle text, times integer)')
    execute('create unique index if not exists twitter_handle on twitter(handle)')
    execute('delete from twitter where 1 = 1')
    execute('insert into twitter '
            'select replace(`twitter handle`, "@", ""), count(*) from `tcamp` '
            'where `twitter handle` is not null group by `twitter handle`')
    commit()
def cp1():
    execute('''
        CREATE TABLE IF NOT EXISTS `businessPremises` (
            `date_scraped` REAL,
            `businessPremisesURL` TEXT,
            FOREIGN KEY (date_scraped, businessPremisesURL)
                REFERENCES cp1(date_scraped, businessPremisesURL)
        )
    ''')
    if get_var('crashed') == 1:
        pagenum = select('max(pagenum) from cp1 where date_scraped = '
                         '(select max(date_scraped) from cp1)')[0]['max(pagenum)']
        print "Resuming from page %d" % pagenum
        p = Page('CP1')
        p = Page('CP1', s=p.s, pagenum=pagenum)
    else:
        print "Starting a new run"
        p = Page('CP1')
    while not p.lastpage():
        print "Beginning page %d" % p.pagenum
        tables = p.table().subtables()
        d = []
        for table in tables:
            row = table.parse()
            row['businessPremisesURL'] = table.business_premises_url()
            try:
                business_premises_data, more_registrant_data = table.business_premises(p.s)
            except Exception, msg:
                print "Error on %s: %s" % (table.business_premises_url(), msg)
                sleep(60)
                print "Trying again"
                business_premises_data, more_registrant_data = table.business_premises(p.s)
            row['date_scraped'] = DATE
            row['pagenum'] = p.pagenum
            row['url'] = URL + "?page=%d" % p.pagenum
            row.update(more_registrant_data)
            save([], business_premises_data, 'businessPremises')
            save(['date_scraped', 'businessPremisesURL'], row, 'cp1')
            sleep(1)
        save_var('crashed', 1)
        p = p.next25()
def pop(self):
    # Query
    query = select('* from stack where rowid = (select max(rowid) from stack)')
    # Load
    instantiate = "%s(%s)" % (query[0]['classname'], dumps(query[0]['url']))
    print instantiate
    obj = eval(instantiate)
    # Delete
    execute('delete from stack where rowid = (select max(rowid) from stack)')
    commit()
    # Remember in case of error
    justpopped = obj
    return obj
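# The matching push is not shown in this section. Since pop() rebuilds the
# object with eval("Classname('url')"), a push would have to store the
# class name and the constructor argument; a sketch under that assumption:
def push(self, obj):
    save([], {'classname': obj.__class__.__name__, 'url': obj.url}, 'stack')
    commit()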
def main(): if "urls" not in show_tables(): copyUrlsDb() for url in getUrls(): slug = getScraperSlug(url) code, user = getCode(slug) if code != None: c = code.lower() save(['url'], { "code":code, "user": user, "url": url, "has_join": " join " in c, "has_attach": "attach" in c, "has_twitter": "twitter" in c, }) execute('UPDATE `urls` SET `scraped`=1 WHERE `url` = ?', url) commit() d = select('`user`, count(*) AS "attach-and-join-count" from `swdata` WHERE (`has_join` = 1 and `has_attach` = 1) GROUP BY `user`') save(['user'], d, 'results')
def save_files():
    file_info = sqlite.execute('SELECT file,position,length,hash FROM swvariables '
                               'ORDER BY position==length,position,RANDOM()')['data']
    file_links = links(imap(lambda x: x[0], file_info), file_suffix, links_url)
    for link, info in izip(file_links, file_info):
        name, position, length, hash = info
        print link, name, position, length, hash
        try:
            save_file(link, name, position, length, hash)
        except URLError:
            pass
def joinbest():
    d = select('''
        `Date_data_was_extracted`, `Name_of_data_source`, `Type_of_source`,
        `URL_of_data_source`, `Location_type`, `Address`, `Fax`, `Email`,
        `Telephone`, `License_number` --, `License_date`
        , `Name_of_entity`, `Type_of_entity`, `Country`, `Postal_code`,
        `Province`, `Sub-district`, `Town`, `Street_address`,
        `latitude-geocode`, `longitude-geocode`
        FROM `branch_best-address`
        JOIN `branch_address` ON (
            `branch_best-address`.`entityRecord` = `branch_address`.`entityRecord`
            AND `branch_best-address`.`address-column` = `branch_address`.`address-column`
        )
        JOIN `geocode` ON (
            `geocode`.`service` = `branch_best-address`.`service`
            AND `geocode`.`address-input` = `branch_address`.`address-input`
        )
        JOIN `scraped` ON `scraped`.`rowid` = `branch_best-address`.`entityRecord`
    ''')
    execute('DROP TABLE IF EXISTS `final`')
    save([], d, 'final')
def geocode(): if "scraped" not in show_tables(): d = swimport('csv2sw').read.csv('https://views.scraperwiki.com/run/combine_mix_scraper_spreadsheets/') save([], d, 'scraped') if "address" not in show_tables(): initialize() while select('count(*) AS "c" FROM `address` WHERE `finished` = 0')[0]['c'] > 0: address = select("`address-column`, `address-input` FROM `address` WHERE `finished` = 0 LIMIT 1")[0] #print address if select('count(*) AS "c" FROM `geocode` WHERE `address-input` = ?', [address['address-input']])[0]['c'] == 0: d = all_services(address['address-input']) for row in d: row['address-input'] = address['address-input'] save([], d, 'geocode') params = (address['address-column'], address['address-input']) execute("UPDATE `address` SET `finished` = 1 WHERE (`address-column` = ? AND `address-input` = ?)", params ) commit()
def join():
    execute('DROP TABLE IF EXISTS `accuracy`')
    commit()
    d = select('''
        * FROM `geocode`
        LEFT JOIN `branch_address` ON (
            `branch_address`.`address-input` = `geocode`.`address-input`
        )
    ''')
    for row in d:
        if row['address-geocode'] == None:
            row['kilometers_off'] = None
        else:
            row['kilometers_off'] = distance.distance(
                (row['latitude-scrape'], row['longitude-scrape']),
                (row['latitude-geocode'], row['longitude-geocode'])
            ).km
        for key in ['latitude-scrape', 'longitude-scrape',
                    'latitude-geocode', 'longitude-geocode', 'address-input']:
            del(row[key])
    save([], d, 'accuracy')
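# The `.km` attribute matches geopy's distance API; the same pattern can be
# checked standalone (coordinates here are just London and Paris):
from geopy import distance

print distance.distance((51.5074, -0.1278), (48.8566, 2.3522)).km  # ~343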
def geocode(): if "scraped" not in show_tables(): d = swimport('csv2sw').read.csv('http://hacks.thomaslevine.com/all.csv') save([], d, 'scraped') execute('DELETE FROM `scraped` WHERE `Country` != "South Africa"') commit() if "address" not in show_tables(): initialize() while select('count(*) AS "c" FROM `address` WHERE `finished` = 0')[0]['c'] > 0: address = select("`address-column`, `address-input` FROM `address` WHERE `finished` = 0 LIMIT 1")[0] #print address if select('count(*) AS "c" FROM `geocode` WHERE `address-input` = ?', [address['address-input']])[0]['c'] == 0: d = all_services(address['address-input']) for row in d: row['address-input'] = address['address-input'] save([], d, 'geocode') params = (address['address-column'], address['address-input']) execute("UPDATE `address` SET `finished` = 1 WHERE (`address-column` = ? AND `address-input` = ?)", params ) commit()
def setup(self):
    if len(lite.table_info(self._table)) > 0:
        return 0
    query = "CREATE TABLE IF NOT EXISTS " + self._table + " ("
    for key in self._keys:
        query = query + " " + key[0] + " " + key[1]
    unique = ""
    for key in self._keys:
        if "uk" in key[0]:
            unique = unique + " " + key[0] + ","
    print "Unique Keys in", self._table + ":", unique
    query = query + " UNIQUE(" + unique + ") ON CONFLICT IGNORE)"
    # The column definitions were joined without commas; patch them in
    # after each type name, then drop the trailing comma inside UNIQUE().
    query = query.replace("INT ", "INT, ")
    query = query.replace("REAL ", "REAL, ")
    query = query.replace("TEXT ", "TEXT, ")
    query = query.replace(",)", ")")
    print query
    lite.execute(query)
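# To see why the replace() passes produce valid SQL, trace a hypothetical
# key list through the builder:
#   self._table = 'items'
#   self._keys  = [('uk_id', 'INT'), ('name', 'TEXT'), ('price', 'REAL')]
# The concatenation yields
#   CREATE TABLE IF NOT EXISTS items ( uk_id INT name TEXT price REAL UNIQUE( uk_id,) ON CONFLICT IGNORE)
# and the replaces patch in the missing commas and drop the trailing one:
#   CREATE TABLE IF NOT EXISTS items ( uk_id INT, name TEXT, price REAL, UNIQUE( uk_id) ON CONFLICT IGNORE)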
import BeautifulSoup
from scraperwiki import sqlite
from scraperwiki import scrape

html = scrape('http://www.premierleague.com/en-gb/matchday/league-table.html')
page = BeautifulSoup.BeautifulSoup(html)

premierLeagueData = []
for row in page.find('table').find('tbody').findAll('tr', 'club-row'):
    # (the assignments of pos, team, goalsFor and goalsAgainst are cut off
    # in this excerpt)
    goalDifference = int(row.find('td', 'col-gd').string)
    points = int(row.find('td', 'col-pts').string)
    #print pos, team, "gf", goalsFor, "ga", goalsAgainst, "gd", goalDifference, "pts", points
    teamItem = {
        'pos': pos,
        'team': team,
        'gf': goalsFor,
        'ga': goalsAgainst,
        'gd': goalDifference,
        'pts': points,
    }
    premierLeagueData.append(teamItem)

if len(premierLeagueData) > 0:
    # Truncate the data store, then add each table line.
    sqlite.execute("DELETE FROM `swdata`")
    for teamItem in premierLeagueData:
        sqlite.save(unique_keys=['team'], data=teamItem)
'''
I figure out how different types are saved to the database.
'''
import datetime
from scraperwiki.sqlite import save, save_var, execute, commit

execute('drop table if exists swdata')
execute('drop table if exists swvariables')
execute('drop table if exists complex')
commit()

for t in {dict, list, set, str, unicode, bool, int, float, long}:
    save_var(str(t), t())
    save([], {str(t).replace("'", "").replace(' ', '').replace('<type', '').replace('>', ''): t()})

save([], {
    u'list': [u'thing'],
    u'dict': {u'key': u'value'},
    u'set': {u'thing'},
}, 'complex')
for content_div in html.cssselect('#maincol > div.content'):
    try:
        key = content_div.xpath('h3')[0].text_content()
        value = content_div.xpath('div[@class="text"]')[0].text_content()
    except Exception, msg:
        save([], {"url": self.url, "column": "content_div", "msg": msg}, 'errors')
    else:
        data2.append({"key": key, "value": value, "url": self.url})

save(['url'], {"url": url, "plaintext": html.get_element_by_id('maincol').text_content()}, 'maincol')
save(['url'], data1, 'faculty2')
execute('CREATE TABLE IF NOT EXISTS descriptions '
        '(url, key TEXT, value TEXT, FOREIGN KEY(url) REFERENCES faculty2(url))')
save(['url'], data2, 'descriptions')
from scraperwiki.sqlite import save, select, execute, save_var, get_var, commit
from scraperwiki import swimport
from requests import session
from lxml.html import fromstring, tostring
import re
from time import time, sleep

keyify = swimport('keyify').keyify

URL = "http://www.ncr.org.za/register_of_registrants/index.php"
#DEV = True
DEV = False
DATE = get_var('DATE', time())

# (tail of the business-premises parser, cut off in this excerpt)
        'businessPremisesURL': url,
    })
    save(['date_scraped', 'businessPremisesURL'], data, 'business_premises')
    randomsleep()

#execute('DELETE from cp1 where date_scraped != (select min(date_scraped) from cp1)')
#execute('DELETE from businessPremises where date_scraped != (select min(date_scraped) from cp1)')
#commit()
#execute('UPDATE swvariables SET value_blob = (select min(date_scraped) from cp1) where name = "DATE"')
#commit()

Test(download=False)
cp1()
save_var('crashed', 0)
execute('DELETE FROM swvariables WHERE name = "DATE"')
commit()
def initialize():
    execute("""
        CREATE TABLE `address` (
            `address-column` text,
            `address-input` text,
            `finished` integer
        )""")
    execute("CREATE UNIQUE INDEX column ON `address` (`address-column`, `address-input`);")
    execute("""
        CREATE TABLE `geocode` (
            `address-geocode` TEXT,
            `latitude-geocode` REAL,
            `longitude-geocode` REAL,
            `number-of-matches` text,
            `service` TEXT,
            `address-input` TEXT
        )""")
    execute("CREATE UNIQUE INDEX geocode_key ON `geocode` (`address-input`, `service`);")
    execute('''
        CREATE TABLE `branch_address` (
            `branchId` TEXT,
            `address-column` TEXT,
            `address-input` TEXT,
            `latitude-scrape` REAL,
            `longitude-scrape` REAL
        )''')
    execute("CREATE UNIQUE INDEX branch_key ON `branch_address` (`branchId`, `address-column`);")
    commit()
    for column in COLUMNS:
        execute('''
            INSERT INTO `address` (`address-column`, `address-input`, `finished`)
            SELECT DISTINCT ? as "address-column", %s as "address-input", 0 as "finished"
            FROM `branches`
        ''' % COLUMNS[column], column)
        commit()
        execute('''
            INSERT INTO `branch_address`
            (`branchId`, `address-column`, `address-input`, `latitude-scrape`, `longitude-scrape`)
            SELECT `branchId`, ? as "address-column", %s as "address-input", `latitude`, `longitude`
            FROM `branches`
        ''' % COLUMNS[column], column)
        commit()
    execute('DELETE FROM `address` WHERE `address-input` IS NULL')
    commit()
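# COLUMNS is not defined in this excerpt; from how it is used above it is
# presumably a dict mapping an address-column label to the SQL expression
# that assembles that address variant from `branches`. A hypothetical
# example:
COLUMNS = {
    'address': '`address`',
    'address-with-town': "`address` || ', ' || `town`",
}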
def drop_above_jobId(jobId, table_names):
    for table_name in table_names:
        execute('DELETE FROM `%s` where jobId>%d' % (table_name, jobId))
    commit()
def delete_table(name):
    sqlite.execute('UPDATE swvariables SET position=0,length=NULL,hash=NULL WHERE file=?', name)
    #sqlite.execute('DELETE FROM ' + name)
    sqlite.execute('DROP TABLE ' + name)
    sqlite.commit()
class DataError(Exception):
    pass

class LinkError(Exception):
    pass

def join(items):
    return ','.join(items)

def columns(name):
    return files[name][0] + files[name][1]

def primary_key(name):
    return files[name][0]

sqlite.execute('CREATE TABLE IF NOT EXISTS swvariables '
               '(file TEXT PRIMARY KEY, position INTEGER, length INTEGER, hash TEXT)')
for name in files:
    sqlite.execute('INSERT OR IGNORE INTO swvariables (file,position) VALUES (?,0)', name)
    cols = join(columns(name))
    if primary_key(name):
        cols = join((cols, ' PRIMARY KEY (%s)' % join(primary_key(name))))
    sqlite.execute('CREATE TABLE IF NOT EXISTS %s (%s)' % (name, cols))

def isquoted(s, q):
    return len(s) >= 2 * len(q) and s.startswith(q) and s.endswith(q)

#def rmap(value, functions):
#    return (function(value) for function in functions)

def process_cell(cell):
def save_line(name, line, hash):
    hash.update(line)
    # line[:-2] drops the two-character line terminator before splitting.
    save_row(name, imap(process_cell, line[:-2].split(cell_delimiter)))
    sqlite.execute('UPDATE swvariables SET position=position+?,hash=? WHERE file=?',
                   (len(line), b64encode(hash.digest()), name), verbose=0)
    sqlite.commit()
def clear():
    execute('DROP TABLE IF EXISTS clients')
    execute('DROP TABLE IF EXISTS clientsraw')
    execute('DROP TABLE IF EXISTS clients_urls')
    execute('DROP TABLE IF EXISTS clients_lobbyists')
    execute('DROP TABLE IF EXISTS clients_details')
    execute('DROP TABLE IF EXISTS lobbyists')
    execute('DROP TABLE IF EXISTS lobbyistsraw')
    execute('DROP TABLE IF EXISTS lobbyists_urls')
    execute('DROP TABLE IF EXISTS lobbyists_lobbyists')
    execute('DROP TABLE IF EXISTS lobbyists_details')
    save_var('previous_href', None)
def __init__(self, startingstack):
    execute('create table if not exists stack (obj, blob);')
    commit()
    self.extend(startingstack)