def save_and_check(self, dataIn, tableIn, dataOut, tableOut=None, twice=True):
    if tableOut is None:
        tableOut = quote(tableIn)

    # Insert
    h = DumpTruck(dbname='/tmp/test.db')
    h.insert(dataIn, tableIn)
    h.close()

    # Observe with pysqlite
    connection = sqlite3.connect('/tmp/test.db')
    cursor = connection.cursor()
    cursor.execute(u'SELECT * FROM %s' % tableOut)
    observed1 = cursor.fetchall()
    connection.close()

    if twice:
        # Observe with DumpTruck
        h = DumpTruck(dbname='/tmp/test.db')
        observed2 = h.execute(u'SELECT * FROM %s' % tableOut)
        h.close()

    # Check
    expected1 = dataOut
    expected2 = [dataIn] if type(dataIn) in (dict, OrderedDict) else dataIn

    self.assertListEqual(observed1, expected1)
    if twice:
        self.assertListEqual(observed2, expected2)

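# A minimal sketch of how the helper above might be called, assuming it lives on a
# unittest.TestCase subclass alongside the other tests. The table name and row
# values below are hypothetical, not taken from the original suite.
def test_save_and_check_roundtrip(self):
    # pysqlite returns plain tuples, so dataOut lists column values in insertion
    # order, while DumpTruck's execute() returns the row back as a dictionary.
    self.save_and_check({u'spam': 42}, 'eggs', [(42,)])
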
def test_empty_row_second_insert(self):
    "An empty row acts like any other row."
    dt = DumpTruck(dbname="/tmp/test.db")
    dt.create_table({"foo": "uhtnh", "bar": "aoue"}, "nine")
    dt.insert({}, "nine")
    c = dt.execute("select count(*) as c from nine")[0]["c"]
    dt.close()
    self.assertEqual(c, 1)

def test_second_insert(self):
    "Inserting a second row that is all null adds an empty row."
    dt = DumpTruck(dbname="/tmp/test.db")
    dt.create_table({"foo": "uhtnh", "bar": "aoue"}, "three")
    dt.insert({"foo": None, "bar": None}, "three")
    c = dt.execute("select count(*) as c from three")[0]["c"]
    dt.close()
    self.assertEqual(c, 1)

def test_no_rows_second_insert(self):
    "Nothing happens if no rows are inserted to a table that is there."
    dt = DumpTruck(dbname='/tmp/test.db')
    dt.create_table({'foo': 'uhtnh', 'bar': 'aoue'}, 'ninety')
    dt.insert([], 'ninety')
    c = dt.execute('select count(*) as c from ninety')[0]['c']
    dt.close()
    self.assertEqual(c, 0)

def users():
    dt = DumpTruck(dbname='/tmp/socrata.db')
    dt.create_table({'id': 'abcd-efgh'}, 'user')
    dt.create_index(['id'], 'user', unique=True)

    _users = {}
    for portal in os.listdir('data'):
        for viewid in os.listdir(os.path.join('data', portal, 'views')):
            handle = open(os.path.join('data', portal, 'views', viewid), 'r')
            try:
                view = json.load(handle)
            except:
                # *cringe*
                continue
            handle.close()

            if view['owner']['id'] in _users:
                _users[view['owner']['id']]['views'].add(view['id'])
                try:
                    _users[view['owner']['id']]['publicationDates'].add((view['id'], view['publicationDate']))
                except:
                    return view
            else:
                _users[view['owner']['id']] = view['owner']
                _users[view['owner']['id']]['views'] = {view['id']}
                _users[view['owner']['id']]['tables'] = set()
                _users[view['owner']['id']]['publicationDates'] = set()

            if view['tableAuthor']['id'] in _users:
                _users[view['tableAuthor']['id']]['tables'].add(view['tableId'])
            else:
                _users[view['tableAuthor']['id']] = view['tableAuthor']
                _users[view['tableAuthor']['id']]['views'] = set()
                _users[view['tableAuthor']['id']]['tables'] = {view['tableId']}
                _users[view['tableAuthor']['id']]['publicationDates'] = set()

    # Replace the set-valued keys with counts before saving the users.
    for uid in _users.keys():
        for key in ['views', 'rights', 'tables']:
            if key in _users[uid]:
                _users[uid]['n_' + key] = len(_users[uid][key])
                del _users[uid][key]

    dt.insert(_users.values(), 'user')

    for uid, user in _users.items():
        for viewid, publicationDate in user['publicationDates']:
            dt.insert({
                'userid': user['id'],
                'viewid': viewid,
                'publicationDate': publicationDate,
            }, 'publications', commit=False)
    dt.commit()

    return _users

class AuctionsPipeline(object):

    def open_spider(self, spider):
        self.dt = DumpTruck(dbname=settings['DB_PATH'], auto_commit=True)
        id_data = self.dt.execute('SELECT id FROM auctions')
        self.ids = [x['id'] for x in id_data]

    def process_item(self, item, spider):
        if 'auctions' not in getattr(spider, 'pipelines', []):
            return item

        item['id'] = int(item['id'][0])
        item['auctioneer'] = ' '.join(item['auctioneer'])
        item['contact_number'] = ' '.join(item['contact_number'])
        item['date'] = '%s %s' % (' '.join(item['date']), ' '.join(item['time']))
        item['location'] = ' '.join(item['location'])
        item['link'] = ' '.join(item['link'])
        item['listing'] = ' '.join(item['listing'])

        # Format the phone number.
        parsed_number = phonenumbers.parse(item['contact_number'], 'US')
        item['contact_number'] = phonenumbers.format_number(
            parsed_number, phonenumbers.PhoneNumberFormat.NATIONAL)

        # Format the listing / remove any HTML cludge.
        soup_listing = BeautifulSoup(item['listing'])
        item['listing'] = soup_listing.get_text()

        # Format date and time to a standard format.
        dt = parse(item['date'])
        item['date'] = dt.datetime.strftime('%Y-%m-%d %H:%M:%S')

        if item['id'] in self.ids:
            raise DropItem('Dupe auction stored, ignoring listing: %s' % item)
        else:
            self.dt.insert({
                'id': item['id'],
                'auctioneer': item['auctioneer'],
                'contact_number': item['contact_number'],
                'date': item['date'],
                'location': item['location'],
                'link': item['link'],
                'listing': item['listing'],
            }, 'auctions')
            return item

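# A sketch of the configuration the pipeline above appears to rely on, assuming a
# standard Scrapy project. The module path and database location are placeholders,
# not taken from the original project.

# settings.py
DB_PATH = '/tmp/auctions.db'  # read by the pipeline as settings['DB_PATH']
ITEM_PIPELINES = {
    'myproject.pipelines.AuctionsPipeline': 300,  # hypothetical module path
}

# The spider must also opt in, since process_item checks spider.pipelines:
#
#     class AuctionsSpider(scrapy.Spider):
#         pipelines = ['auctions']
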
def test_empty_row_first_insert(self):
    "The first row must have a non-null value so the schema can be defined."
    dt = DumpTruck(dbname='/tmp/test.db')
    with self.assertRaises(ValueError):
        dt.insert({}, 'two')
    dt.close()

def save_and_select(self, d):
    dt = DumpTruck()
    dt.insert({'foo': d})
    observed = dt.dump()[0]['foo']
    self.assertEqual(d, observed)

def test_save(self):
    h = DumpTruck(dbname="/tmp/test.db")
    h.insert({"firstname": "Robert", "lastname": "LeTourneau"}, "foo")
    h.drop("foo")
    self.assertEqual(h.tables(), set([]))
    h.close()

def test_if_not_exists(self):
    dt = DumpTruck(dbname='/tmp/test.db')
    dt.insert({'foo': 'bar'}, 'baz')
    dt.insert({'foo': 'bar'}, 'baz')

def test_many(self):
    dt = DumpTruck(dbname="/tmp/test.db")
    rowid = dt.insert([{u"foo": 8}, {u"bar": 5}])
    self.assertEqual(rowid, [1, 2])

def test_save2(self):
    dt = DumpTruck(dbname=DB_FILE)
    dt.insert([[('foo', 'bar')]], 'baz2', structure=zip)
    self.assertDictEqual(dt.execute('select * from baz2')[0], {'foo': 'bar'})

def test_retrieve(self):
    dt = DumpTruck(dbname=DB_FILE)
    dt.insert([{'a': 'b'}], 'c')
    self.assertEqual(dt.execute('select * from c', structure=zip)[0], ('a', 'b'))

def test_convert_lxml(self):
    dt = DumpTruck(dbname="/tmp/test.db", adapt_and_convert=True)
    elementstringresult = lxml.html.fromstring("<html>Hi</html>").xpath("//*/text()")[0]
    dt.insert({"bar": elementstringresult}, "foo")
    self.assertEqual(type(dt.dump("foo")[0]["bar"]), unicode)

def test_first_insert(self):
    "The first row must have a non-null value so the schema can be defined."
    dt = DumpTruck(dbname="/tmp/test.db")
    with self.assertRaises(ValueError):
        dt.insert({"foo": None, "bar": None}, "two")
    dt.close()

#!/usr/bin/env python2
import os, json
from dumptruck import DumpTruck

dt = DumpTruck(dbname='/tmp/catalog.db')

# Create the catalog table with a composite primary key on (portal, identifier),
# so each dataset identifier is stored at most once per portal.
dt.execute('''
CREATE TABLE IF NOT EXISTS "catalog" (
  "portal" TEXT NOT NULL,
  "identifier" TEXT NOT NULL,
  PRIMARY KEY ("portal", "identifier")
);''')

for data_json in os.listdir('catalogs'):
    # Load into memory.
    data = json.load(open(os.path.join('catalogs', data_json)))[1:]

    # Add the portal.
    portal = data_json.replace('.json', '')
    for row in data:
        row['portal'] = portal

    # Put in the database.
    dt.insert(data, 'catalog')

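# A small follow-up sketch, not part of the original script: reading the catalog
# back out with DumpTruck's execute(), which returns each row as a dictionary.
# The aggregate query is illustrative only.
counts = dt.execute('''
    SELECT portal, COUNT(*) AS n
    FROM catalog
    GROUP BY portal
    ORDER BY n DESC;''')
for row in counts:
    print row['portal'], row['n']
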
                    title += authors
                    authors = []
                elif ICMPCParser.author_regex.search(line):
                    appending = "authors"

                if appending == "title":
                    title.append(line)
                else:
                    authors.append(line)

            abstracts.append({
                "Title": clean_title(title),
                "Authors": clean_authors(authors),
                "Abstract": clean_abstract(text[abstract_indices[i]:document_indices[i + 1]]),
                "Place": "ICMPC",
                "Year": 2010,
            })

        return abstracts


dt = DumpTruck(dbname="icmpc.db")

abstracts = ICMPC11("data/ICMPC11_ABSTRACTS.txt").parse()
dt.insert(abstracts)

abstracts = ICMPC10("data/ICMPC10_absbook.txt").parse()
dt.insert(abstracts)

# SMPC2011-program.pdf
# need to correct whitespace
# icmpc-escom2012_book_of_abstracts.pdf
# Work with docs version

def test_save(self):
    h = DumpTruck(dbname="/tmp/test.db")
    data = [{"firstname": "Robert", "lastname": "LeTourneau"}]
    h.insert(data, "foo")
    self.assertEqual(data, h.dump("foo"))
    h.close()

def test_save_integers(self):
    dt = DumpTruck()
    with self.assertRaises(ValueError):
        dt.insert({"foo": {1: "A", 2: "B", 3: "C"}})

def test_no_rows_first_insert(self):
    "Nothing happens if no rows are inserted to a table that isn't there."
    dt = DumpTruck(dbname='/tmp/test.db')
    dt.insert([], 'ninety')
    self.assertSetEqual(dt.tables(), set())
    dt.close()

def test_one(self):
    dt = DumpTruck(dbname="/tmp/test.db")
    rowid = dt.insert({u"foo": 8})
    self.assertEqual(rowid, 1)

def main():
    dt = DumpTruck(dbname='/tmp/smart.db')
    for data in table():
        dt.insert(data, 'smart', commit=False)
    dt.commit()

from dumptruck import DumpTruck

dt = DumpTruck(dbname='cites.db')

top10 = dt.execute('''
select
    shipment_year, export_country_code, taxon_family,
    sum(quantity_1) as "export"
from cites
where taxon_family in (
    select taxon_family from cites
    group by taxon_family
    order by count(taxon_family) desc
    limit 10
)
group by shipment_year || export_country_code || taxon_family
order by export_country_code, taxon_family, shipment_year
;''')

for i in range(len(top10) - 1):
    try:
        if top10[i]['taxon_family'] == top10[i + 1]['taxon_family']:
            top10[i + 1]['change_since_last_year'] = top10[i + 1]['export'] - top10[i]['export']
            dt.insert(top10[i + 1], 'changes_tmp', commit=False)
    except:
        dt.commit()
        raise

dt.insert(top10, 'changes')

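# A hedged follow-up sketch, not part of the original script: once the rows are
# stored, the year-over-year deltas can be read back out of the 'changes' table
# populated above. The query is illustrative and assumes the column names used there.
biggest_swings = dt.execute('''
    SELECT taxon_family, shipment_year, change_since_last_year
    FROM changes
    WHERE change_since_last_year IS NOT NULL
    ORDER BY abs(change_since_last_year) DESC
    LIMIT 10;''')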