def test_create_if_exists(self):
    dt = DumpTruck(dbname='/tmp/test.db')
    dt.execute('create table pineapple (bar integer, baz integer);')
    dt.create_index(['bar', 'baz'], 'pineapple')
    with self.assertRaises(sqlite3.OperationalError):
        dt.create_index(['bar', 'baz'], 'pineapple', if_not_exists=False)
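# A minimal sketch of the behaviour this test exercises, using only the
# standard-library sqlite3 module (the in-memory database is invented here):
import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE pineapple (bar INTEGER, baz INTEGER)')
conn.execute('CREATE INDEX pineapple_bar_baz ON pineapple (bar, baz)')
try:
    # Without IF NOT EXISTS, re-creating the same index is an error.
    conn.execute('CREATE INDEX pineapple_bar_baz ON pineapple (bar, baz)')
except sqlite3.OperationalError as e:
    print(e)  # index pineapple_bar_baz already exists
# With IF NOT EXISTS, the same statement is a silent no-op instead.
conn.execute('CREATE INDEX IF NOT EXISTS pineapple_bar_baz ON pineapple (bar, baz)')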
def test_non_unique(self):
    dt = DumpTruck(dbname='/tmp/test.db')
    dt.execute('create table tomato (bar integer, baz integer);')
    dt.create_index(['bar', 'baz'], 'tomato')
    observed = dt.execute('PRAGMA index_info(tomato_bar_baz)')

    # Indexness
    self.assertIsNotNone(observed)

    # Indexed columns
    expected = [
        {u'seqno': 0, u'cid': 0, u'name': u'bar'},
        {u'seqno': 1, u'cid': 1, u'name': u'baz'},
    ]
    self.assertListEqual(observed, expected)

    # Uniqueness
    indices = dt.execute('PRAGMA index_list(tomato)')
    for index in indices:
        if index[u'name'] == u'tomato_bar_baz':
            break
    else:
        index = {}
    self.assertEqual(index[u'unique'], 0)
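# A sketch of what those PRAGMA calls return as plain sqlite3 tuples.
# index_info yields one row per indexed column (seqno, cid, name); index_list
# yields one row per index and includes a 'unique' column, which is 0 for a
# non-unique index like this one (exact columns vary by SQLite version):
import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE tomato (bar INTEGER, baz INTEGER)')
conn.execute('CREATE INDEX tomato_bar_baz ON tomato (bar, baz)')
print(conn.execute('PRAGMA index_info(tomato_bar_baz)').fetchall())
# -> [(0, 0, 'bar'), (1, 1, 'baz')]
print(conn.execute('PRAGMA index_list(tomato)').fetchall())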
def test_create_if_not_exists(self):
    dt = DumpTruck(dbname="/tmp/test.db")
    dt.execute("create table mango (bar integer, baz integer);")
    dt.create_index(["bar", "baz"], "mango")
    # This should not raise an error.
    dt.create_index(["bar", "baz"], "mango", if_not_exists=True)
def apis():
    dt = DumpTruck('/tmp/open-data.sqlite', auto_commit=False)
    dt.create_table({'catalog': 'abc.def'}, 'socrata_apis')
    dt.create_index(['catalog'], 'socrata_apis', unique=True, if_not_exists=True)

    socrata_catalogs = filter(lambda x: x[0] == 'socrata', catalogs())
    for _, catalog in socrata_catalogs:
        dt.upsert({
            'catalog': catalog.split('://')[-1],
            'apis': count_apis(catalog),
        }, 'socrata_apis')
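# Note: create_table only infers column names and SQL types from the example
# row; the dummy values (like 'abc.def' above) are not inserted. A small
# sketch with an invented table name:
dt.create_table({'name': 'example', 'count': 0}, 'demo')  # TEXT and INTEGER columns
dt.create_index(['name'], 'demo', unique=True, if_not_exists=True)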
def main():
    edges = build_network()['edges']
    dt = DumpTruck(dbname='/tmp/open-data.sqlite', adapt_and_convert=True)
    datasets_in = dt.execute('SELECT * FROM socrata')
    dt.create_table({'id': 'blah-blah'}, 'socrata_deduplicated')
    dt.create_index(['id'], 'socrata_deduplicated', if_not_exists=True, unique=True)
    for dataset in dedupe(datasets_in, edges):
        dt.upsert(dataset, 'socrata_deduplicated')
def to_sqlite3():
    dt = DumpTruck('/tmp/open-data.sqlite', auto_commit=False)

    dummyrow = dict(zip(['software', 'catalog', 'identifier'], ['blah'] * 3))
    dt.create_table(dummyrow, 'datasets', if_not_exists=True)
    dt.create_index(['software', 'catalog', 'identifier'], 'datasets', if_not_exists=True, unique=True)

    for table in ['ckan', 'socrata']:
        dt.create_table({'catalog': 'blah', 'identifier': 'blah'}, table, if_not_exists=True)
        dt.create_index(['catalog', 'identifier'], table, if_not_exists=True, unique=True)

    dt.create_table({'view_id': 'abc', 'table_id': 123}, 'socrata_tables')
    dt.create_index(['view_id'], 'socrata_tables', if_not_exists=True, unique=True)
    dt.create_index(['table_id'], 'socrata_tables', if_not_exists=True)

    for dataset in datasets():
        row = {
            'software': dataset['software'],
            'catalog': dataset['catalog'],
            'identifier': dataset[SOFTWARE_MAP['identifier'][dataset['software']]],
        }
        sql = 'SELECT * FROM datasets WHERE software = ? AND catalog = ? AND identifier = ?'
        if dt.execute(sql, [row['software'], row['catalog'], row['identifier']]) != []:
            continue
        dt.upsert(row, 'datasets')

        if dataset['software'] == 'socrata':
            socrata_table = {
                'view_id': row['identifier'],
                'table_id': dataset['tableId'],
            }
            dt.upsert(socrata_table, 'socrata_tables')

        dt.upsert(dataset, dataset['software'])
        dt.commit()
def extract_dataset_table_info():
    dt = DumpTruck(dbname='/tmp/table_info.db')
    dt.create_table({'portal': 'abc', 'id': 'abcd-efgh'}, 'table_info')
    dt.create_index(['portal', 'id'], 'table_info', unique=True)
    dt.create_index(['tableId'], 'table_info', unique=False)

    # Skip (portal, id) pairs that are already in the database.
    done = set((row['portal'], row['id'])
               for row in dt.execute('SELECT portal, id FROM table_info'))

    for portal in os.listdir('data'):
        for viewid in os.listdir(os.path.join('data', portal, 'views')):
            if (portal, viewid) in done:
                continue
            d = _dataset_table_info(portal, viewid)
            if d is None:
                continue
            dt.upsert(d, 'table_info')
def users():
    dt = DumpTruck(dbname='/tmp/socrata.db')
    dt.create_table({'id': 'abcd-efgh'}, 'user')
    dt.create_index(['id'], 'user', unique=True)

    _users = {}
    for portal in os.listdir('data'):
        for viewid in os.listdir(os.path.join('data', portal, 'views')):
            handle = open(os.path.join('data', portal, 'views', viewid), 'r')
            try:
                view = json.load(handle)
            except ValueError:
                # Skip files that are not valid JSON.
                continue
            finally:
                handle.close()

            owner_id = view['owner']['id']
            if owner_id in _users:
                _users[owner_id]['views'].add(view['id'])
                try:
                    _users[owner_id]['publicationDates'].add((view['id'], view['publicationDate']))
                except KeyError:
                    # Bail out and hand back the offending view for inspection.
                    return view
            else:
                _users[owner_id] = view['owner']
                _users[owner_id]['views'] = {view['id']}
                _users[owner_id]['tables'] = set()
                _users[owner_id]['publicationDates'] = set()

            author_id = view['tableAuthor']['id']
            if author_id in _users:
                _users[author_id]['tables'].add(view['tableId'])
            else:
                _users[author_id] = view['tableAuthor']
                _users[author_id]['views'] = set()
                _users[author_id]['tables'] = {view['tableId']}
                _users[author_id]['publicationDates'] = set()

    # Replace the set-valued fields with their counts before inserting.
    for uid in _users.keys():
        for key in ['views', 'rights', 'tables']:
            if key in _users[uid]:
                _users[uid]['n_' + key] = len(_users[uid][key])
                del _users[uid][key]
    dt.insert(_users.values(), 'user')

    for uid, user in _users.items():
        for viewid, publicationDate in user['publicationDates']:
            dt.insert({'userid': user['id'], 'viewid': viewid, 'publicationDate': publicationDate},
                      'publications', commit=False)
    dt.commit()

    return _users
def get_links(softwares=['ckan', 'socrata']):
    dt = DumpTruck('/tmp/open-data.sqlite')
    dummyrow = dict(zip(
        ['software', 'catalog', 'identifier', 'status_code', 'headers', 'error'],
        (['blah'] * 3) + [234] + ([{'a': 'b'}] * 2)))
    dt.create_table(dummyrow, 'links', if_not_exists=True)
    dt.create_index(['software', 'catalog', 'identifier'], 'links', if_not_exists=True, unique=True)

    for software in softwares:
        for catalog in read.catalogs(software):
            if SOCRATA_FIX.get(catalog, 'this is a string, not None') is None:
                continue
            try:
                for row in _check_catalog(software, catalog):
                    dt.upsert(row, 'links')
            except:
                print(os.path.join('downloads', software, catalog))
                raise
def check_links():
    dt = DumpTruck('/tmp/open-data.sqlite', auto_commit=False)
    dt.create_index(['url'], 'links', if_not_exists=True, unique=False)
    dt.create_index(['status_code'], 'links', if_not_exists=True, unique=False)

    # Source: order by a substring of the url so that we bounce around
    # catalogs instead of hammering one host at a time.
    urls = Queue()
    sql = '''
    SELECT DISTINCT url FROM links
    WHERE (status_code = -42 OR status_code IS NULL)
    AND is_link AND url NOT NULL
    ORDER BY status_code, substr(url, 30, 100);
    '''
    url_list = [row['url'] for row in dt.execute(sql)]
    for url in url_list:
        urls.put(url)

    # Sink: funnel all writes through a single thread, since a sqlite3
    # connection should only be used from the thread that created it.
    def _db(queue):
        dt = DumpTruck('/tmp/open-data.sqlite')
        while True:
            dt.execute(*queue.get())

    db_updates = Queue()
    db_thread = Thread(None, target=_db, args=(db_updates,))
    db_thread.start()

    # Check links
    def _check_link(url_queue):
        while not url_queue.empty():
            url = url_queue.get()
            if url is None:
                raise ValueError('url is None')
            status_code, headers, error = links.is_alive(url)
            sql = 'UPDATE links SET status_code = ?, headers = ?, error = ? WHERE is_link = 1 AND url = ?'
            db_updates.put((sql, (status_code, headers, error, url)))
            print(url)

    threads = {}
    for i in range(100):
        threads[i] = Thread(None, target=_check_link, args=(urls,))
    for thread in threads.values():
        thread.start()
def read_finalip(path):
    html = parse(path)
    trs = html.xpath('//table[@style="border-collapse: collapse; width: 100%;"]/descendant::tr')

    def do_row(tr):
        try:
            return l.parse_row(tr)
        except:
            print(tostring(tr))
            raise

    return map(do_row, trs[2:])

# Schema
dt = DumpTruck(dbname='/tmp/finalip.db')
dt.create_table({u'DA Number': u'NAE-2009-01067'}, 'finalip', if_not_exists=True)
dt.create_index(['Da Number'], 'finalip', unique=True, if_not_exists=True)

# Skip finished stuff
pages = set((row['Year'], row['Month'], row['Page'])
            for row in dt.execute('SELECT Year, Month, Page FROM finalip'))

# Populate
for dirname, subdirnames, filenames in os.walk(os.path.join(os.environ['READER_ROOT'], '..', 'finalips')):
    if subdirnames != []:
        continue
    for filename in filenames:
        year, month = map(int, dirname.split('/')[-2:])
        page = (year, month, filename)
        if page in pages:
            continue
        path = os.path.join(dirname, filename)
# We initialize DumpTruck with dummy data that won't be inserted.
dump_truck.create_table(
    {
        "stored_filename": "filename.20130227205616.tgz",
        "size": 1,
        "metadata": {"is_enc": False},
        "backup_date": 1361994976,
        "filename": "filename",
        "backend": "s3",
        "is_deleted": False,
        "last_updated": 1361994976,
        "tags": [],
        "backend_hash": "backendhash",
    },
    "backups",
)
dump_truck.create_index(["stored_filename"], "backups", unique=True)

if "inventory" not in dump_truck.tables():
    dump_truck.create_table({"filename": "filename", "archive_id": "glacier-archive-id"}, "inventory")
    dump_truck.create_index(["filename"], "inventory", unique=True)

if "jobs" not in dump_truck.tables():
    dump_truck.create_table({"filename": "filename", "job_id": "job_id"}, "jobs")
    dump_truck.create_index(["filename"], "jobs", unique=True)

if "config" not in dump_truck.tables():
    dump_truck.save_var("client_id", "")
    dump_truck.save_var("sync_ts", 0)
    dump_truck.save_var("tags", set())
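# With the unique index on stored_filename in place, upserting a row whose
# stored_filename already exists replaces that row rather than duplicating
# it. A hypothetical usage sketch (field values are invented):
backup = {"stored_filename": "filename.20130227205616.tgz", "filename": "filename",
          "backend": "s3", "is_deleted": False}
dump_truck.upsert(backup, "backups")
backup["is_deleted"] = True
dump_truck.upsert(backup, "backups")  # updates the existing row in place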
["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16677&mon=jul", "Latest releases"], ["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16676&mon=aug", "Latest releases"], ["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16678&mon=sep", "Latest releases"], ["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16679&mon=oct", "Latest releases"], ["http://www.scotlandoffice.gov.uk/scotlandoffice/16668.141.html?tID=16680&mon=nov", "Latest releases"], ["http://www.scotlandoffice.gov.uk/scotlandoffice/10804.146.html", "Archive releases"], # 2005 ["http://www.scotlandoffice.gov.uk/scotlandoffice/10805.145.html", "Archive releases"], # 2006 ["http://www.scotlandoffice.gov.uk/scotlandoffice/10806.144.html", "Archive releases"], # 2007 ["http://www.scotlandoffice.gov.uk/scotlandoffice/10807.143.html", "Archive releases"], # 2008 ["http://www.scotlandoffice.gov.uk/scotlandoffice/13342.html", "Archive releases"], # 2009 ["http://www.scotlandoffice.gov.uk/scotlandoffice/13661.html", "Archive releases"], # 2010 ["http://www.scotlandoffice.gov.uk/scotlandoffice/15263.html", "Archive releases"], # 2011 ] dt = DumpTruck(dbname="scotland.db") dt.create_table({"Title": "", "Publication date": "", "Old URL": "", "Summary": "", "Attachments": "", "Type": "", "Associated organisations": ""}, "publications") dt.create_index(["Title", "Old URL"], "publications", unique=True) for url, page_type in URLS: for publication in scrape_list_page(url): publication['Type'] = page_type dt.upsert(publication, "publications") dumptruck_to_csv(dt, "publications", "/home/http/scotland/publications.csv")
def main():
    dt = DumpTruck(dbname='metrics.db')
    dt.create_table({'portal': 'abc', 'date': datetime.date.today()}, 'series')
    dt.create_index(['portal', 'date'], 'series')
    dt.upsert(list(table()), 'series')
from common import dumptruck_to_csv

URL = "http://www.scotlandoffice.gov.uk/scotlandoffice/14463.363.html"

req = requests.get(URL)
doc = lxml.html.fromstring(req.text)

dt = DumpTruck(dbname="scotland.db")
dt.create_table({"Title": "", "Publication date": "", "Old URL": "", "Body": "",
                 "Attachments": "", "Associated organisations": "",
                 "Associated Document Series": ""}, "statistics")
dt.create_index(["Title", "Old URL"], "statistics", unique=True)

for link in doc.xpath("//div[@class='wrapper']/ul/li/a"):
    series_title, series_url = link.text, urlparse.urljoin(URL, link.attrib["href"])
    print(series_title)
    series_req = requests.get(series_url)
    series_doc = lxml.html.fromstring(series_req.text)
    for table_line in series_doc.xpath("//tr[not(@bgcolor) or @bgcolor!='#004093']"):
        file_pub_date = table_line.xpath("./td[3]")[0].text
        for file_node in table_line.xpath("./td[2]//a"):
            file_title = etree.tostring(file_node, method="text", encoding="utf8")
            file_link = file_node.attrib["href"]
            if not file_link.startswith("http"):
def scrape_main_article(url):
    req = requests.get(url)
    doc = lxml.html.fromstring(req.text)
    div = doc.xpath("//*[@class='wrapper']")[0]
    div.remove(div.find("h1"))
    for para in div.findall("p"):
        if para.find("strong") is not None:
            div.remove(para)
    return htmlize(etree.tostring(div))

dt = DumpTruck(dbname="scotland.db")
dt.create_table({"Title": "", "Publication date": "", "Old URL": "", "Summary": "",
                 "Body": "", "Associated organisations": ""}, "news")
dt.create_index(["Title", "Old URL"], "news", unique=True)

for url in URLS:
    for news_item in scrape_list_page(url):
        attachments = json.loads(news_item.pop("Attachments"))
        link = attachments[0]["link"]
        news_item["Old URL"] = link
        news_item["Body"] = scrape_main_article(link)
        dt.upsert(news_item, "news")

dumptruck_to_csv(dt, "news", "/home/http/scotland/news.csv")
from dumptruck import DumpTruck
import random
import os, sys
import logging
from sqlite3 import OperationalError
import datetime
import copy, types, time

logging.basicConfig(level=logging.DEBUG, stream=sys.stdout,
                    format='%(levelname)s %(asctime)-15s %(filename)s %(lineno)s %(message)s')
logger = logging.getLogger(os.path.split(__file__)[1])
_here = os.path.split(__file__)[0]

dt = DumpTruck(dbname=os.path.join(_here, "data/events.sqlite"))
try:
    dt.create_index(["id"], "events", unique=True)
    dt.create_index(["url"], "events", unique=True)
except OperationalError:
    # The events table does not exist yet.
    pass

def scrape_now():
    nn = 0
    for l in listevents():
        try:
            # Parameterized query, so urls containing quotes are handled safely.
            d = dt.execute("select count(*) as c from events where url = ?", [l["url"]])
        except OperationalError:
            d = [{"c": 0}]
        if d[0]["c"] == 0:
            l.update(eventdata(l["url"]))
            logger.info("found new %s %s" % (l.get("id", l.get("threadid", "")), l.get("title", "")))
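# A possible alternative to the try/except above: create the table first from
# a dummy row, then the indexes, so neither call can fail on a missing table.
# The dummy fields here are assumptions about the real events schema.
dt.create_table({"id": 1, "url": "http://example.com"}, "events", if_not_exists=True)
dt.create_index(["id"], "events", unique=True, if_not_exists=True)
dt.create_index(["url"], "events", unique=True, if_not_exists=True)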
    'http://www.nwo.usace.army.mil',
    'http://www.nws.usace.army.mil',
    'http://www.nww.usace.army.mil',
    'http://www.pof.usace.army.mil',
    'http://www.poj.usace.army.mil',
    'http://www.saw.usace.army.mil',
    'http://www.spa.usace.army.mil',
    'http://www.spk.usace.army.mil',
    'http://www.spl.usace.army.mil',
    'http://www.swf.usace.army.mil',
    'http://www.swg.usace.army.mil',
    'http://www.tam.usace.army.mil',
}

if __name__ == '__main__':
    dt = DumpTruck(dbname='usace.db')
    dt.create_table({'permit_application_number': 'abcd'}, 'notice')
    dt.create_index(['permit_application_number'], 'notice')
    for division in parse.locations(get('http://www.usace.army.mil/Locations.aspx')):
        for district in division['districts']:
            domain = re.sub(r'.usace.army.mil.*$', '.usace.army.mil', district['href'])
            path = '/Missions/Regulatory/PublicNotices.aspx'
            if domain in SKIPPED_DISTRICTS:
                continue
            pn_list = None
            while pn_list is None or pn_list['last_page'] > pn_list['current_page']:
                pn_list = parse.public_notice_list(get(domain + path))
                dt.upsert(list(pn_list['notices']), 'notice')
#!/usr/bin/env python
from dumptruck import DumpTruck

dt = DumpTruck(dbname='scotland.db')
dt.create_table({'title': '', 'date': ''}, 'publications')
dt.create_index(['title'], 'publications', unique=True)
dt.upsert({'title': 'one', 'date': 'today'}, 'publications')
dt.upsert({'title': 'one', 'date': 'yesterday'}, 'publications')
data = dt.execute('SELECT * FROM `publications`')
print(data)
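# Because of the unique index on title, the second upsert above replaces the
# first row instead of adding a second one, so the select returns one row
# with date = 'yesterday'. A raw-sqlite3 sketch of the same behaviour,
# assuming upsert maps onto INSERT OR REPLACE:
import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE publications (title TEXT, date TEXT)')
conn.execute('CREATE UNIQUE INDEX publications_title ON publications (title)')
conn.execute("INSERT OR REPLACE INTO publications VALUES ('one', 'today')")
conn.execute("INSERT OR REPLACE INTO publications VALUES ('one', 'yesterday')")
print(conn.execute('SELECT * FROM publications').fetchall())  # [('one', 'yesterday')]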