def setUp(self):
    self.cleanUp()
    h = DumpTruck(dbname=u'/tmp/test.db')
    h.save_var(u'birthday', u'November 30, 1888')
    h.close()
    connection = sqlite3.connect(u'/tmp/test.db')
    self.cursor = connection.cursor()
def build_table_from_db(dbname='/tmp/table_info.db'):
    dt = DumpTruck(dbname=dbname)
    tableIds = [row['tableId'] for row in dt.execute('''
        SELECT tableId, count(*) FROM table_info
        GROUP BY tableId
        ORDER BY count(*) DESC LIMIT 10;
    ''')]

    try:
        os.mkdir('geneology')
    except OSError:
        pass
    json.dump(tableIds, open(os.path.join('geneology', 'index.json'), 'w'))

    for tableId in tableIds:
        result = {
            'source': dt.execute('SELECT * FROM table_info WHERE tableId = ? ORDER BY createdAt ASC LIMIT 1', [tableId])[0],
            'datasets': {},
        }
        for dataset in dt.execute('SELECT * FROM table_info WHERE tableId = ?', [tableId]):
            if dataset['id'] not in result['datasets']:
                result['datasets'][dataset['id']] = dataset
                result['datasets'][dataset['id']]['portals'] = []
            result['datasets'][dataset['id']]['portals'].append(dataset['portal'])
        result['datasets'] = result['datasets'].values()
        for dataset in result['datasets']:
            del dataset['portal']
        json.dump(result, open(os.path.join('geneology', '%d.json' % tableId), 'w'))
def test_select(self):
    shutil.copy(u'fixtures/landbank_branches.sqlite', u'.')
    h = DumpTruck(dbname=u'landbank_branches.sqlite')
    data_observed = h.execute(
        u'SELECT * FROM `branches` WHERE Fax is not null ORDER BY Fax LIMIT 3;')
    data_expected = [{
        'town': u'\r\nCenturion',
        'date_scraped': 1327791915.618461,
        'Fax': u' (012) 312 3647',
        'Tel': u' (012) 686 0500',
        'address_raw': u'\r\n420 Witch Hazel Ave\n\r\nEcopark\n\r\nCenturion\n\r\n0001\n (012) 686 0500\n (012) 312 3647',
        'blockId': 14,
        'street-address': None,
        'postcode': u'\r\n0001',
        'address': u'\r\n420 Witch Hazel Ave\n\r\nEcopark\n\r\nCenturion\n\r\n0001',
        'branchName': u'Head Office',
    }, {
        'town': u'\r\nCenturion',
        'date_scraped': 1327792245.787187,
        'Fax': u' (012) 312 3647',
        'Tel': u' (012) 686 0500',
        'address_raw': u'\r\n420 Witch Hazel Ave\n\r\nEcopark\n\r\nCenturion\n\r\n0001\n (012) 686 0500\n (012) 312 3647',
        'blockId': 14,
        'street-address': u'\r\n420 Witch Hazel Ave\n\r\nEcopark',
        'postcode': u'\r\n0001',
        'address': u'\r\n420 Witch Hazel Ave\n\r\nEcopark\n\r\nCenturion\n\r\n0001',
        'branchName': u'Head Office',
    }, {
        'town': u'\r\nMiddelburg',
        'date_scraped': 1327791915.618461,
        'Fax': u' (013) 282 6558',
        'Tel': u' (013) 283 3500',
        'address_raw': u'\r\n184 Jan van Riebeeck Street\n\r\nMiddelburg\n\r\n1050\n (013) 283 3500\n (013) 282 6558',
        'blockId': 17,
        'street-address': None,
        'postcode': u'\r\n1050',
        'address': u'\r\n184 Jan van Riebeeck Street\n\r\nMiddelburg\n\r\n1050',
        'branchName': u'Middelburg',
    }]
    self.assertListEqual(data_observed, data_expected)
    os.remove('landbank_branches.sqlite')
def check_timeouts():
    import requests
    from unidecode import unidecode

    dt = DumpTruck('/tmp/open-data.sqlite', auto_commit=False)
    dt.execute('''
        CREATE TABLE IF NOT EXISTS link_speeds (
            url TEXT NOT NULL,
            elapsed FLOAT,
            error_type TEXT NOT NULL,
            error TEXT NOT NULL,
            UNIQUE(url)
        );''')

    urls = Queue()
    url_list = [row['url'] for row in dt.execute(
        'SELECT DISTINCT url FROM links WHERE status_code = -42 AND url NOT IN (SELECT url FROM link_speeds)')]
    for url in url_list:
        urls.put(url)

    # Sink to the database: a single writer thread serializes all updates,
    # so the checker threads below only enqueue.
    def _db(queue):
        dt = DumpTruck('/tmp/open-data.sqlite')
        while True:
            dt.execute(*queue.get())

    db_updates = Queue()
    db_thread = Thread(None, target=_db, args=(db_updates,))
    db_thread.start()

    # def signal_handler(signal, frame):
    #     db_thread.terminate()
    #     sys.exit(0)
    # signal.signal(signal.SIGINT, signal_handler)

    # Check links
    def _check_link(url_queue):
        while not urls.empty():
            url = url_queue.get()
            if url is None:
                raise ValueError('url is None')
            try:
                r = requests.head(url, allow_redirects=True, timeout=30)
            except Exception as e:
                try:
                    msg = unicode(e)
                except:
                    msg = ''
                sql = 'INSERT INTO link_speeds (url, error_type, error) VALUES (?,?,?)'
                db_updates.put((sql, (url, unicode(type(e)), msg)))  # ew python 2
            else:
                sql = "INSERT INTO link_speeds (url, elapsed, error_type, error) VALUES (?,?,'','')"
                db_updates.put((sql, (url, r.elapsed.total_seconds())))

    threads = {}
    for i in range(500):
        threads[i] = Thread(None, target=_check_link, args=(urls,))
    for thread in threads.values():
        thread.start()
def save_and_check(self, dataIn, tableIn, dataOut, tableOut=None, twice=True):
    if tableOut is None:
        tableOut = quote(tableIn)

    # Insert
    h = DumpTruck(dbname='/tmp/test.db')
    h.insert(dataIn, tableIn)
    h.close()

    # Observe with pysqlite
    connection = sqlite3.connect('/tmp/test.db')
    cursor = connection.cursor()
    cursor.execute(u'SELECT * FROM %s' % tableOut)
    observed1 = cursor.fetchall()
    connection.close()

    if twice:
        # Observe with DumpTruck
        h = DumpTruck(dbname='/tmp/test.db')
        observed2 = h.execute(u'SELECT * FROM %s' % tableOut)
        h.close()

    # Check
    expected1 = dataOut
    expected2 = [dataIn] if type(dataIn) in (dict, OrderedDict) else dataIn
    self.assertListEqual(observed1, expected1)
    if twice:
        self.assertListEqual(observed2, expected2)
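# A minimal sketch of how a test might call the save_and_check helper above;
# the table name and rows are hypothetical. An OrderedDict pins the column
# order so the pysqlite tuples come back in a predictable order.
def test_save_dict(self):
    self.save_and_check(
        OrderedDict([(u'firstname', u'Thomas'), (u'lastname', u'Levine')]),
        u'people',
        [(u'Thomas', u'Levine')],
    )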
def savegetvar(self, var):
    h = DumpTruck(dbname='/tmp/test.db')
    h.save_var(u'weird', var)
    h.close()
    h = DumpTruck(dbname='/tmp/test.db')
    t = os.stat('/tmp/test.db').st_mtime
    self.assertEqual(h.get_var(u'weird'), var)
    h.close()
    assert os.stat('/tmp/test.db').st_mtime == t
def test_create_table(self):
    h = DumpTruck(dbname='/tmp/test.db')
    h.create_table({'foo': 0, 'bar': 1, 'baz': 2}, 'zombies')
    h.close()

    connection = sqlite3.connect('/tmp/test.db')
    cursor = connection.cursor()
    cursor.execute('SELECT foo, bar, baz FROM zombies')
    observed = cursor.fetchall()
    connection.close()

    expected = []
    self.assertListEqual(observed, expected)
def extract_dataset_table_info():
    dt = DumpTruck(dbname='/tmp/table_info.db')
    dt.create_table({'portal': 'abc', 'id': 'abcd-efgh'}, 'table_info')
    dt.create_index(['portal', 'id'], 'table_info', unique=True)
    dt.create_index(['tableId'], 'table_info', unique=False)

    # (portal, id) pairs that are already in the database, so they can be
    # skipped below.
    done = set((row['portal'], row['id'])
               for row in dt.execute('SELECT portal, id FROM table_info'))

    for portal in os.listdir('data'):
        for viewid in os.listdir(os.path.join('data', portal, 'views')):
            if (portal, viewid) in done:
                continue
            d = _dataset_table_info(portal, viewid)
            if d is None:
                continue
            dt.upsert(d, 'table_info')
def test_no_rows_second_insert(self):
    "Nothing happens if no rows are inserted into a table that already exists."
    dt = DumpTruck(dbname='/tmp/test.db')
    dt.create_table({'foo': 'uhtnh', 'bar': 'aoue'}, 'ninety')
    dt.insert([], 'ninety')
    c = dt.execute('select count(*) as c from ninety')[0]['c']
    dt.close()
    self.assertEqual(c, 0)
def test_empty_row_second_insert(self):
    "An empty row acts like any other row."
    dt = DumpTruck(dbname='/tmp/test.db')
    dt.create_table({'foo': 'uhtnh', 'bar': 'aoue'}, 'nine')
    dt.insert({}, 'nine')
    c = dt.execute('select count(*) as c from nine')[0]['c']
    dt.close()
    self.assertEqual(c, 1)
def test_second_insert(self):
    "Inserting a second row that is all null adds an empty row."
    dt = DumpTruck(dbname='/tmp/test.db')
    dt.create_table({'foo': 'uhtnh', 'bar': 'aoue'}, 'three')
    dt.insert({'foo': None, 'bar': None}, 'three')
    c = dt.execute('select count(*) as c from three')[0]['c']
    dt.close()
    self.assertEqual(c, 1)
class AuctionsPipeline(object):
    def open_spider(self, spider):
        self.dt = DumpTruck(dbname=settings['DB_PATH'], auto_commit=True)
        id_data = self.dt.execute('SELECT id FROM auctions')
        self.ids = [x['id'] for x in id_data]

    def process_item(self, item, spider):
        if 'auctions' not in getattr(spider, 'pipelines', []):
            return item

        item['id'] = int(item['id'][0])
        item['auctioneer'] = ' '.join(item['auctioneer'])
        item['contact_number'] = ' '.join(item['contact_number'])
        item['date'] = '%s %s' % (' '.join(item['date']), ' '.join(item['time']))
        item['location'] = ' '.join(item['location'])
        item['link'] = ' '.join(item['link'])
        item['listing'] = ' '.join(item['listing'])

        # Format the phone number. NATIONAL formatting is an assumption;
        # format_number takes a PhoneNumberFormat constant.
        parsed_number = phonenumbers.parse(item['contact_number'], 'US')
        item['contact_number'] = phonenumbers.format_number(
            parsed_number, phonenumbers.PhoneNumberFormat.NATIONAL)

        # Format the listing: strip any HTML cruft.
        soup_listing = BeautifulSoup(item['listing'])
        item['listing'] = soup_listing.get_text()

        # Normalize the date and time, assuming a dateutil-style parse
        # that returns a datetime.
        item['date'] = parse(item['date']).strftime('%Y-%m-%d %H:%M:%S')

        if item['id'] in self.ids:
            raise DropItem('Dupe auction stored, ignoring listing: %s' % item)
        else:
            self.dt.insert({
                'id': item['id'],
                'auctioneer': item['auctioneer'],
                'contact_number': item['contact_number'],
                'date': item['date'],
                'location': item['location'],
                'link': item['link'],
                'listing': item['listing'],
            }, 'auctions')
        return item
def main():
    dt = DumpTruck(dbname='/tmp/yoga.db')
    dt.execute('''
        CREATE TABLE IF NOT EXISTS page_source (
            page_number INTEGER NOT NULL,
            page_source TEXT NOT NULL,
            UNIQUE(page_number)
        )''')

    print('Running the search')

    # Get the search page
    driver = _driver_setup()
    driver.get(url)

    # 100 per page
    option = driver.find_elements_by_xpath('id("ctl00_TemplateBody_ucTeacherDirectory_ddhowmany")/option[@value="100"]')[0]
    option.click()

    # Search
    button = driver.find_elements_by_id('ctl00_TemplateBody_ucTeacherDirectory_imgSearch')[0]
    button.click()

    while True:
        print('Pausing for a few seconds to let the page load and be polite')
        sleep(8)
        randomsleep()

        # Current page number
        page_numbers = driver.find_elements_by_css_selector('#ctl00_TemplateBody_ucTeacherDirectory_gvTeacherDirectory tr td tr span')

        # Fast forward to new pages: ellipsis
        ellipses = driver.find_elements_by_xpath('//a[text()="..."]')

        # Fast forward to new pages: maximum page
        max_page_numbers = driver.find_elements_by_xpath('//td[a[text()="..."]]/preceding-sibling::td[position()=1]')
        if max_page_numbers == []:
            max_page_numbers = [page_number.find_elements_by_xpath('../td[position()=last()]') for page_number in page_numbers]

        for nodes in [page_numbers, ellipses, max_page_numbers]:
            print(nodes)
            print([n.text for n in nodes])
            if len(nodes) == 1:
                import pdb; pdb.set_trace()
                raise ValueError('Only one navigation row')
            elif nodes[0].text != nodes[1].text:
                import pdb; pdb.set_trace()
                raise ValueError('Page navigation rows don\'t match.')
def test_create_if_not_exists(self):
    dt = DumpTruck(dbname='/tmp/test.db')
    dt.execute('create table mango (bar integer, baz integer);')
    dt.create_index(['bar', 'baz'], 'mango')
    # This should not raise an error.
    dt.create_index(['bar', 'baz'], 'mango', if_not_exists=True)
def test_create_if_exists(self):
    dt = DumpTruck(dbname='/tmp/test.db')
    dt.execute('create table pineapple (bar integer, baz integer);')
    dt.create_index(['bar', 'baz'], 'pineapple')
    with self.assertRaises(sqlite3.OperationalError):
        dt.create_index(['bar', 'baz'], 'pineapple', if_not_exists=False)
def test_non_unique(self):
    dt = DumpTruck(dbname='/tmp/test.db')
    dt.execute('create table tomato (bar integer, baz integer);')
    dt.create_index(['bar', 'baz'], 'tomato')
    observed = dt.execute('PRAGMA index_info(tomato_bar_baz)')

    # Indexness
    self.assertIsNotNone(observed)

    # Indexed columns
    expected = [
        {u'seqno': 0, u'cid': 0, u'name': u'bar'},
        {u'seqno': 1, u'cid': 1, u'name': u'baz'},
    ]
    self.assertListEqual(observed, expected)

    # Uniqueness
    indices = dt.execute('PRAGMA index_list(tomato)')
    for index in indices:
        if index[u'name'] == u'tomato_bar_baz':
            break
    else:
        index = {}
    self.assertEqual(index[u'unique'], 0)
def main():
    edges = build_network()['edges']
    dt = DumpTruck(dbname='/tmp/open-data.sqlite', adapt_and_convert=True)
    datasets_in = dt.execute('SELECT * FROM socrata')
    dt.create_table({'id': 'blah-blah'}, 'socrata_deduplicated')
    dt.create_index(['id'], 'socrata_deduplicated', if_not_exists=True, unique=True)
    for dataset in dedupe(datasets_in, edges):
        dt.upsert(dataset, 'socrata_deduplicated')
def avail_within(db, start=0, end=2**32):
    '''
    Given a database filename and a time range (POSIX times),
    return how long each user was online, in seconds.
    '''
    # Database connection
    dt = DumpTruck(db)
    updates = dt.execute('''
        SELECT uid, ts, status FROM log_status
        WHERE ts > ? AND ts < ?
        ORDER BY uid, ts
    ;''', [start, end])

    # This loop depends on the ORDER BY clause above.
    total_time = {}
    for u in updates:
        if u['status'] == 'notavail' and (u['uid'] not in total_time):
            # First time we see this person, but it's 'notavail', so skip it.
            continue
        elif u['status'] == 'avail' and (u['uid'] not in total_time):
            # First time we see this person becoming available
            total_time[u['uid']] = 0
        elif u['status'] == 'avail':
            # The person went avail. We don't need to do anything here;
            # prev_time is recorded below the if statements.
            pass
        elif u['status'] == 'notavail':
            # The person went notavail, so record how long the user was available.
            total_time[u['uid']] += u['ts'] - prev_time
        else:
            raise ValueError('The update\'s status "%s" is neither "avail" nor "notavail".' % u['status'])

        # Record the current timestamp as prev_time
        prev_time = u['ts']
    return total_time
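# A minimal sketch of calling avail_within above; the database path and the
# POSIX timestamps are hypothetical.
if __name__ == '__main__':
    totals = avail_within('/tmp/status.db', start=1325376000, end=1356998400)
    for uid, seconds in totals.items():
        print '%s was available for %d seconds' % (uid, seconds)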
def apis():
    dt = DumpTruck('/tmp/open-data.sqlite', auto_commit=False)
    dt.create_table({'catalog': 'abc.def'}, 'socrata_apis')
    dt.create_index(['catalog'], 'socrata_apis', unique=True, if_not_exists=True)

    socrata_catalogs = filter(lambda x: x[0] == 'socrata', catalogs())
    for _, catalog in socrata_catalogs:
        dt.upsert({
            'catalog': catalog.split('://')[-1],
            'apis': count_apis(catalog),
        }, 'socrata_apis')
def test_special_type_date(self):
    """Adapters and converters should not be enabled."""
    dt = DumpTruck(dbname='/tmp/test.db', adapt_and_convert=False)
    dt.execute('CREATE TABLE pork_sales (week date);')
    dt.execute("INSERT INTO pork_sales VALUES ('2012-10-08')")
    observedData = dt.execute('SELECT week FROM pork_sales')
    self.assertListEqual(observedData, [{u'week': u'2012-10-08'}])
def test_special_type_list(self):
    """Adapters and converters should not be enabled."""
    dt = DumpTruck(dbname='/tmp/test.db', adapt_and_convert=False)
    dt.execute('CREATE TABLE pork_sales (week json);')
    dt.execute("INSERT INTO pork_sales VALUES ('[12,3,4]')")
    observedData = dt.execute('SELECT week FROM pork_sales')
    self.assertListEqual(observedData, [{u'week': u'[12,3,4]'}])
#!/usr/bin/env python
import json

from dumptruck import DumpTruck

dt = DumpTruck(dbname='applications.db')

def scott_data():
    sql = '''
        SELECT "parish", sum("acreage") AS 'acreage'
        FROM application
        WHERE "type" = 'impact' AND "parish" != ''
        GROUP BY "parish";
    '''
    return {
        row['parish'].upper().replace('SAINT', 'ST'): (row['parish'], row['acreage'])
        for row in dt.execute(sql)
    }

scott = scott_data()
parishes = json.load(open('parishes.json'))
max_impacted_acres = max([v[1] for v in scott.values()])
for feature in parishes['features']:
    feature['properties']['impacted_acres'] = scott.get(
        feature['properties']['COUNTY'], (None, 0))[1]
    feature['properties']['impacted_acres_prop_max'] = scott.get(
        feature['properties']['COUNTY'], (None, 0))[1] / max_impacted_acres
def _connect(dbname=DATABASE_NAME, timeout=DATABASE_TIMEOUT):
    'Initialize the database (again). This is mainly for testing.'
    global dt
    dt = DumpTruck(dbname=dbname, adapt_and_convert=False, timeout=timeout)
def main():
    dt = DumpTruck(dbname='metrics.db')
    dt.create_table({'portal': 'abc', 'date': datetime.date.today()}, 'series')
    dt.create_index(['portal', 'date'], 'series')
    dt.upsert(list(table()), 'series')
# coding: utf-8
from dumptruck import DumpTruck
import csv, os, glob

_here = os.path.split(__file__)[0]
store = DumpTruck(dbname="db/documents.db")

already = dict([(a[26:36], a) for a in [
    os.path.split(a)[1]
    for a in glob.glob("/home/martin/Dropbox/blackrock-scraper/data/*")
]])
already_downloaded = dict([(a[26:36], a) for a in [
    os.path.split(a)[1]
    for a in glob.glob(os.path.join(_here, "data/html/*"))
]])
print len(already_downloaded.items()), len(already.items())

# List of all companies, with the most recent report date and the number of reports
dt = csv.DictWriter(open(os.path.join(_here, "data/tables", "ciks.csv"), "w"), [
    'act', 'num', 'cik', 'name', 'filename', 'already',
    'already_downloaded', 'exists', 'link'
], delimiter=";")
dt.writerow(dict(
    act="Aktuellster Bericht",  # German: "most recent report"
    num="Anzahl der Berichte",  # German: "number of reports"
    cik="Central Index Key",
#!/usr/bin/env python2
import os, json

from dumptruck import DumpTruck

dt = DumpTruck(dbname='/tmp/catalog.db')

# Create the table with a composite primary key on ("portal", "identifier").
dt.execute('''
    CREATE TABLE IF NOT EXISTS "catalog" (
        "portal" TEXT NOT NULL,
        "identifier" TEXT NOT NULL,
        PRIMARY KEY ("portal", "identifier")
    );''')

for data_json in os.listdir('catalogs'):
    # Load into memory.
    data = json.load(open(os.path.join('catalogs', data_json)))[1:]

    # Add the portal.
    portal = data_json.replace('.json', '')
    for row in data:
        row['portal'] = portal

    # Put in the database.
    dt.insert(data, 'catalog')
import logging, sys, os

from lxml import etree
import requests
from dumptruck import DumpTruck, Pickle

_here = os.path.split(__file__)[0]
store = DumpTruck(dbname=os.path.join(_here, "db/documents.db"))

parser = etree.HTMLParser()

def getTree(url):
    return etree.parse(url, etree.HTMLParser())

logger = logging.getLogger(os.path.split(__file__)[1])
logging.basicConfig(level=logging.DEBUG, file=sys.stderr)

def get_nq_for_cik(cik):
    try:
        tree = getTree(
            "http://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=%s&type=N-Q%%25&dateb=&owner=include&start=0&count=40&output=atom" % cik)
    except Exception, e:
        logger.error("Error searching for CIK %s: %s" % (cik, e))
        return
    for entry in tree.xpath("//entry"):
        link = entry.xpath("link/@href")[0]
        date = entry.xpath("updated/text()")[0]
def _connect(dbname='scraperwiki.sqlite'):
    'Initialize the database (again). This is mainly for testing.'
    global dt
    dt = DumpTruck(dbname=dbname, adapt_and_convert=False)
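# A minimal sketch of exercising the _connect helper above: repoint the
# module-level `dt` at a throwaway database, then query through it.
# The path is hypothetical.
_connect(dbname='/tmp/scraperwiki-test.sqlite')
dt.execute('CREATE TABLE IF NOT EXISTS swdata ("foo" INTEGER)')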
# The lines below down to the `return` appear to come from inside a parsing
# function; `parse_rows` is a hypothetical name for it.
def parse_rows(html):
    trs = html.xpath(
        '//table[@style="border-collapse: collapse; width: 100%;"]/descendant::tr'
    )

    def do_row(tr):
        try:
            return l.parse_row(tr)
        except:
            print tostring(tr)
            raise

    return map(do_row, trs[2:])

# Schema
dt = DumpTruck(dbname='/tmp/finalip.db')
dt.create_table({u'DA Number': u'NAE-2009-01067'}, 'finalip', if_not_exists=True)
dt.create_index(['DA Number'], 'finalip', unique=True, if_not_exists=True)

# Skip finished stuff
pages = set([(row['Year'], row['Month'], row['Page'])
             for row in dt.execute('SELECT Year, Month, Page FROM finalip')])

# Populate
for dirname, subdirnames, filenames in os.walk(
        os.path.join(os.environ['READER_ROOT'], '..', 'finalips')):
    if subdirnames != []:
        continue
    for filename in filenames: