示例#1
0
  def save_and_check(self, dataIn, tableIn, dataOut, tableOut = None, twice = True):
    if tableOut == None:
      tableOut = quote(tableIn)

    # Insert
    h = DumpTruck(dbname = '/tmp/test.db')
    h.insert(dataIn, tableIn)
    h.close()

    # Observe with pysqlite
    connection=sqlite3.connect('/tmp/test.db')
    cursor=connection.cursor()
    cursor.execute(u'SELECT * FROM %s' % tableOut)
    observed1 = cursor.fetchall()
    connection.close()

    if twice:
      # Observe with DumpTruck
      h = DumpTruck(dbname = '/tmp/test.db')
      observed2 = h.execute(u'SELECT * FROM %s' % tableOut)
      h.close()
 
      #Check
      expected1 = dataOut
      expected2 = [dataIn] if type(dataIn) in (dict, OrderedDict) else dataIn
 
      self.assertListEqual(observed1, expected1)
      self.assertListEqual(observed2, expected2)
示例#2
0
文件: tests.py 项目: 4bic/entity_log
 def test_empty_row_second_insert(self):
     "An empty row acts like any other row."
     dt = DumpTruck(dbname="/tmp/test.db")
     dt.create_table({"foo": "uhtnh", "bar": "aoue"}, "nine")
     dt.insert({}, "nine")
     c = dt.execute("select count(*) as c from nine")[0]["c"]
     dt.close()
     self.assertEqual(c, 1)
示例#3
0
文件: tests.py 项目: 4bic/entity_log
 def test_second_insert(self):
     "Inserting a second row that is all null adds an empty row."
     dt = DumpTruck(dbname="/tmp/test.db")
     dt.create_table({"foo": "uhtnh", "bar": "aoue"}, "three")
     dt.insert({"foo": None, "bar": None}, "three")
     c = dt.execute("select count(*) as c from three")[0]["c"]
     dt.close()
     self.assertEqual(c, 1)
示例#4
0
 def test_no_rows_second_insert(self):
   "Nothing happens if no rows are inserted to a table that is there."
   dt = DumpTruck(dbname = '/tmp/test.db')
   dt.create_table({'foo': 'uhtnh', 'bar': 'aoue'}, 'ninety')
   dt.insert([], 'ninety')
   c = dt.execute('select count(*) as c from ninety')[0]['c']
   dt.close()
   self.assertEqual(c, 0)
示例#5
0
 def test_second_insert(self):
   "Inserting a second row that is all null adds an empty row."
   dt = DumpTruck(dbname = '/tmp/test.db')
   dt.create_table({'foo': 'uhtnh', 'bar': 'aoue'}, 'three')
   dt.insert({'foo': None, 'bar': None}, 'three')
   c = dt.execute('select count(*) as c from three')[0]['c']
   dt.close()
   self.assertEqual(c, 1)
示例#6
0
 def test_empty_row_second_insert(self):
   "An empty row acts like any other row."
   dt = DumpTruck(dbname = '/tmp/test.db')
   dt.create_table({'foo': 'uhtnh', 'bar': 'aoue'}, 'nine')
   dt.insert({}, 'nine')
   c = dt.execute('select count(*) as c from nine')[0]['c']
   dt.close()
   self.assertEqual(c, 1)
示例#7
0
文件: tests.py 项目: 4bic/entity_log
 def test_no_rows_second_insert(self):
     "Nothing happens if no rows are inserted to a table that is there."
     dt = DumpTruck(dbname="/tmp/test.db")
     dt.create_table({"foo": "uhtnh", "bar": "aoue"}, "ninety")
     dt.insert([], "ninety")
     c = dt.execute("select count(*) as c from ninety")[0]["c"]
     dt.close()
     self.assertEqual(c, 0)
示例#8
0
def users():
    dt = DumpTruck(dbname = '/tmp/socrata.db')
    dt.create_table({'id': 'abcd-efgh'}, 'user')
    dt.create_index(['id'], 'user', unique = True)

    _users = {}
    for portal in os.listdir('data'):
        for viewid in os.listdir(os.path.join('data', portal, 'views')):
            handle = open(os.path.join('data', portal, 'views', viewid), 'r')
            try:
                view = json.load(handle)
            except:
                # *cringe*
                continue
            handle.close()

            if view['owner']['id'] in _users:
                _users[view['owner']['id']]['views'].add(view['id'])
                try:
                    _users[view['owner']['id']]['publicationDates'].add((view['id'], view['publicationDate']))
                except:
                    return view
            else:
                _users[view['owner']['id']] = view['owner']
                _users[view['owner']['id']]['views'] = {view['id']}
                _users[view['owner']['id']]['tables'] = set()
                _users[view['owner']['id']]['publicationDates'] = set()

            if view['tableAuthor']['id'] in _users:
                _users[view['tableAuthor']['id']]['tables'].add(view['tableId'])
            else:
                _users[view['tableAuthor']['id']] = view['tableAuthor']
                _users[view['tableAuthor']['id']]['views'] = set()
                _users[view['tableAuthor']['id']]['tables'] = {view['tableId']}
                _users[view['tableAuthor']['id']]['publicationDates'] = set()

    for uid in _users.keys():
        for key in ['views', 'rights', 'tables']:
            if key in _users[uid]:
                _users[uid]['n_' + key] = len(_users[uid][key])
                del _users[uid][key]

    dt.insert(_users.values(), 'user')
    for uid, user in _users.items():
        for viewid, publicationDate in user['publicationDates']:
            dt.insert({'userid': user['id'], 'viewid': viewid, 'publicationDate': publicationDate}, 'publications', commit = False)
    dt.commit()
    return _users
示例#9
0
class AuctionsPipeline(object):
    def open_spider(self, spider):
        self.dt = DumpTruck(dbname=settings['DB_PATH'], auto_commit=True)

        id_data = self.dt.execute('SELECT id FROM auctions')
        self.ids = [x['id'] for x in id_data]

    def process_item(self, item, spider):
        if 'auctions' not in getattr(spider, 'pipelines', []):
            return item

        item['id'] = int(item['id'][0])
        item['auctioneer'] = ' '.join(item['auctioneer'])
        item['contact_number'] = ' '.join(item['contact_number'])
        item['date'] = '%s %s' % (' '.join(item['date']), ' '.join(
            item['time']))
        item['location'] = ' '.join(item['location'])
        item['link'] = ' '.join(item['link'])
        item['listing'] = ' '.join(item['listing'])

        #format phonenumber
        parsed_number = phonenumbers.parse(item['contact_number'], 'US')
        item['contact_number'] = phonenumbers.format_number(
            parsed_number, phonenumbers.PhoneNumber())

        # format listing / remove any html cludge
        soup_listing = BeautifulSoup(item['listing'])
        item['listing'] = soup_listing.get_text()

        # format date and time to standard format
        dt = parse(item['date'])
        item['date'] = dt.datetime.strftime('%Y-%m-%d %H:%M:%S')

        if item['id'] in self.ids:
            raise DropItem('Dupe auction stored, ignoring listing: %s' % item)
        else:
            self.dt.insert(
                {
                    'id': item['id'],
                    'auctioneer': item['auctioneer'],
                    'contact_number': item['contact_number'],
                    'date': item['date'],
                    'location': item['location'],
                    'link': item['link'],
                    'listing': item['listing'],
                }, 'auctions')

            return item
示例#10
0
class AuctionsPipeline(object):
    def open_spider(self, spider):
        self.dt = DumpTruck(dbname=settings['DB_PATH'],auto_commit=True)

        id_data = self.dt.execute('SELECT id FROM auctions')
        self.ids = [x['id'] for x in id_data]

    def process_item(self, item, spider):
        if 'auctions' not in getattr(spider,'pipelines',[]):
            return item

        item['id'] = int(item['id'][0])
        item['auctioneer'] = ' '.join(item['auctioneer'])
        item['contact_number'] = ' '.join(item['contact_number'])
        item['date'] = '%s %s' % (' '.join(item['date']), ' '.join(item['time']))
        item['location'] = ' '.join(item['location'])
        item['link'] = ' '.join(item['link'])
        item['listing'] = ' '.join(item['listing'])

        #format phonenumber
        parsed_number = phonenumbers.parse(item['contact_number'],'US')
        item['contact_number'] = phonenumbers.format_number(parsed_number, phonenumbers.PhoneNumber())

        # format listing / remove any html cludge
        soup_listing = BeautifulSoup(item['listing'])
        item['listing'] = soup_listing.get_text()

        # format date and time to standard format
        dt = parse(item['date'])
        item['date'] = dt.datetime.strftime('%Y-%m-%d %H:%M:%S')

        if item['id'] in self.ids:
            raise DropItem('Dupe auction stored, ignoring listing: %s' % item)
        else:
            self.dt.insert({
                'id': item['id'],
                'auctioneer': item['auctioneer'],
                'contact_number': item['contact_number'],
                'date': item['date'],
                'location': item['location'],
                'link': item['link'],
                'listing': item['listing'],
            }, 'auctions')

            return item
示例#11
0
 def test_empty_row_first_insert(self):
   "The first row must have a non-null value so the schema can be defined."
   dt = DumpTruck(dbname = '/tmp/test.db')
   with self.assertRaises(ValueError):
     dt.insert({}, 'two')
   dt.close()
示例#12
0
  def save_and_select(self, d):
    dt = DumpTruck()
    dt.insert({'foo': d})

    observed = dt.dump()[0]['foo']
    self.assertEqual(d, observed)
示例#13
0
文件: tests.py 项目: 4bic/entity_log
 def test_save(self):
     h = DumpTruck(dbname="/tmp/test.db")
     h.insert({"firstname": "Robert", "lastname": "LeTourneau"}, "foo")
     h.drop("foo")
     self.assertEqual(h.tables(), set([]))
     h.close()
示例#14
0
 def test_if_not_exists(self):
   dt = DumpTruck(dbname = '/tmp/test.db')
   dt.insert({'foo': 'bar'}, 'baz')
   dt.insert({'foo': 'bar'}, 'baz')
示例#15
0
文件: tests.py 项目: 4bic/entity_log
 def test_if_not_exists(self):
     dt = DumpTruck(dbname="/tmp/test.db")
     dt.insert({"foo": "bar"}, "baz")
     dt.insert({"foo": "bar"}, "baz")
示例#16
0
文件: tests.py 项目: 4bic/entity_log
 def test_many(self):
     dt = DumpTruck(dbname="/tmp/test.db")
     rowid = dt.insert([{u"foo": 8}, {u"bar": 5}])
     self.assertEqual(rowid, [1, 2])
示例#17
0
 def test_save(self):
   h = DumpTruck(dbname = '/tmp/test.db')
   h.insert({'firstname': 'Robert', 'lastname': 'LeTourneau'}, 'foo')
   h.drop('foo')
   self.assertEqual(h.tables(), set([]))
   h.close()
示例#18
0
文件: tests.py 项目: drj11/dumptruck
 def test_save2(self):
   dt = DumpTruck(dbname = DB_FILE)
   dt.insert([[('foo', 'bar')]], 'baz2', structure = zip)
   self.assertDictEqual(dt.execute('select * from baz2')[0], {'foo': 'bar'})
示例#19
0
文件: tests.py 项目: drj11/dumptruck
 def test_retrieve(self):
   dt = DumpTruck(dbname = DB_FILE)
   dt.insert([{'a': 'b'}], 'c')
   self.assertEqual(dt.execute('select * from c', structure = zip)[0], ('a', 'b'))
示例#20
0
 def test_many(self):
   dt = DumpTruck(dbname = '/tmp/test.db')
   rowid = dt.insert([{u'foo': 8}, {u'bar': 5}])
   self.assertEqual(rowid, [1, 2])
示例#21
0
文件: tests.py 项目: 4bic/entity_log
 def test_convert_lxml(self):
     dt = DumpTruck(dbname="/tmp/test.db", adapt_and_convert=True)
     elementstringresult = lxml.html.fromstring("<html>Hi</html>").xpath("//*/text()")[0]
     dt.insert({"bar": elementstringresult}, "foo")
     self.assertEqual(type(dt.dump("foo")[0]["bar"]), unicode)
示例#22
0
文件: tests.py 项目: 4bic/entity_log
 def test_first_insert(self):
     "The first row must have a non-null value so the schema can be defined."
     dt = DumpTruck(dbname="/tmp/test.db")
     with self.assertRaises(ValueError):
         dt.insert({"foo": None, "bar": None}, "two")
     dt.close()
示例#23
0
#!/usr/bin/env python2
import os, json
from dumptruck import DumpTruck

dt = DumpTruck(dbname='/tmp/catalog.db')

# Create a unique index on `identifier`.
dt.execute('''
CREATE TABLE IF NOT EXISTS "catalog" (
  "portal" TEXT NOT NULL,
  "identifier" TEXT NOT NULL,
  PRIMARY KEY ("portal", "identifier")
);''')

for data_json in os.listdir('catalogs'):
    # Load into memory.
    data = json.load(open(os.path.join('catalogs', data_json)))[1:]

    # Add the portal.
    portal = data_json.replace('.json', '')
    for row in data:
        row['portal'] = portal

    # Put in the database.
    dt.insert(data, 'catalog')
示例#24
0
          title += authors
          authors = []
        elif ICMPCParser.author_regex.search(line):
          appending = "authors"
        if appending == "title":
          title.append(line)
        else:
          authors.append(line)

      abstracts.append({
        "Title": clean_title(title),
        "Authors": clean_authors(authors),
        "Abstract": clean_abstract(text[abstract_indices[i]:document_indices[i + 1]]),
        "Place": "ICMPC",
        "Year": 2010,
      })

    return abstracts

dt = DumpTruck(dbname="icmpc.db")
abstracts = ICMPC11("data/ICMPC11_ABSTRACTS.txt").parse()
dt.insert(abstracts)
abstracts = ICMPC10("data/ICMPC10_absbook.txt").parse()
dt.insert(abstracts)

# SMPC2011-program.pdf
# need to correct whitespace

# icmpc-escom2012_book_of_abstracts.pdf
# Work with docs version
示例#25
0
文件: tests.py 项目: 4bic/entity_log
 def test_save(self):
     h = DumpTruck(dbname="/tmp/test.db")
     data = [{"firstname": "Robert", "lastname": "LeTourneau"}]
     h.insert(data, "foo")
     self.assertEqual(data, h.dump("foo"))
     h.close()
示例#26
0
文件: tests.py 项目: 4bic/entity_log
 def test_save_integers(self):
     dt = DumpTruck()
     with self.assertRaises(ValueError):
         dt.insert({"foo": {1: "A", 2: "B", 3: "C"}})
示例#27
0
 def test_no_rows_first_insert(self):
   "Nothing happens if no rows are inserted to a table that isn't there."
   dt = DumpTruck(dbname = '/tmp/test.db')
   dt.insert([], 'ninety')
   self.assertSetEqual(dt.tables(), set())
   dt.close()
示例#28
0
文件: tests.py 项目: 4bic/entity_log
 def test_one(self):
     dt = DumpTruck(dbname="/tmp/test.db")
     rowid = dt.insert({u"foo": 8})
     self.assertEqual(rowid, 1)
示例#29
0
 def test_one(self):
   dt = DumpTruck(dbname = '/tmp/test.db')
   rowid = dt.insert({u'foo': 8})
   self.assertEqual(rowid, 1)
示例#30
0
def main():
    dt = DumpTruck(dbname = '/tmp/smart.db')
    for data in table():
        dt.insert(data, 'smart', commit = False)
    dt.commit()
示例#31
0
 def test_save(self):
   h = DumpTruck(dbname = '/tmp/test.db')
   data = [{'firstname': 'Robert', 'lastname': 'LeTourneau'}]
   h.insert(data, 'foo')
   self.assertEqual(data, h.dump('foo'))
   h.close()
示例#32
0
from dumptruck import DumpTruck

dt = DumpTruck(dbname = 'cites.db')
top10 = dt.execute('''
select
  shipment_year, export_country_code, taxon_family,
  sum(quantity_1) as "export"
from cites where taxon_family in (
  select taxon_family from cites
  group by taxon_family
  order by count(taxon_family)
  desc limit 10
)

group by shipment_year || export_country_code || taxon_family
order by export_country_code, taxon_family, shipment_year

;''')

for i in range(len(top10) - 1):
  try:
    if top10[i]['taxon_family'] == top10[i+1]['taxon_family']:
      top10[i+1]['change_since_last_year'] = top10[i+1]['export'] - top10[i]['export']
    dt.insert(top10[i+1], 'changes_tmp', commit = False)
  except:
    dt.commit()
    raise

dt.insert(top10, 'changes')