class WorkLoader:
    """Bulk-create work documents and link edition documents to them.

    All persistence goes through a ``DocumentLoader`` built from the keyword
    arguments given to the constructor.
    """

    def __init__(self, **dbparams):
        """Create the underlying ``DocumentLoader`` from ``dbparams``.

        Raises:
            AssertionError: if ``web.__version__`` is not exactly "0.33".
        """
        self.loader = DocumentLoader(**dbparams)
        self.tmpdir = "/tmp"

        # a bug in web.group has been fixed in 0.33
        assert web.__version__ == "0.33"

    def load_works(self, filename, author="/user/ImportBot"):
        """Load every work listed in ``filename`` in chunks of 1000 lines.

        For each chunk, lines of ``edition key<TAB>work key<TAB>work id`` are
        appended to ``editions.txt`` in the same directory as ``filename``,
        and the chunk index plus elapsed time is passed to ``log``.

        Args:
            filename: path of the input file; see ``load_works_chunk`` for
                the expected line format.
            author: value forwarded to the loader as the ``author`` of the
                bulk operation.
        """
        self.author = author

        root = os.path.dirname(filename)
        editions_file = open(os.path.join(root, 'editions.txt'), 'a')
        try:
            # Keep a handle on the input file so it is closed deterministically
            # instead of being left to the garbage collector.
            works_file = open(filename)
            try:
                for i, lines in enumerate(web.group(works_file, 1000)):
                    t0 = time.time()
                    self.load_works_chunk(lines, editions_file)
                    t1 = time.time()
                    log(i, "%.3f sec" % (t1 - t0))
            finally:
                works_file.close()
        finally:
            editions_file.close()

    def load_works_chunk(self, lines, editions_file):
        """Create the works described by ``lines`` and record their editions.

        Each line must evaluate to an ``(author_key, works)`` pair where
        ``works`` is a list of dicts, each carrying an ``'editions'`` list of
        edition key strings. Every work is given a fresh key from the loader,
        the ``/type/work`` type and an ``authors`` entry for ``author_key``;
        its ``'editions'`` entry is removed before the work is saved.

        Args:
            lines: iterable of text lines in the format described above.
            editions_file: object with a ``writelines`` method that receives
                one ``edition key<TAB>work key<TAB>work id`` line per edition.
        """
        # NOTE(review): eval() executes arbitrary code found in the input
        # file. Only feed this trusted, locally generated data; consider
        # ast.literal_eval if the lines are plain literals.
        authors = [eval(line) for line in lines]

        # work key -> edition keys that belong to that work
        editions = {}

        def process(result):
            for r in result:
                for e in editions[r['key']]:
                    yield "\t".join([e, r['key'], str(r['id'])]) + "\n"

        for akey, works in authors:
            if not works:
                # Nothing to create for this author.
                continue

            # Keys are requested per author: `works` only exists inside this
            # loop, and every author needs its own set of keys.
            keys = self.loader.new_work_keys(len(works))
            for work, key in zip(works, keys):
                work['key'] = key
                work['type'] = {'key': "/type/work"}
                work['authors'] = [{'author': {'key': akey}, 'type': '/type/author_role'}]
                editions[key] = work.pop('editions')

            result = self.loader.bulk_new(works, comment="add works page", author=self.author)
            editions_file.writelines(process(result))

    def update_editions(self, filename, author="/user/ImportBot"):
        """Link editions to works for every line of ``filename``.

        The file is processed in chunks of 1000 lines. For each updated
        edition a line is appended to ``edition_ref.txt`` in the same
        directory as ``filename``; see ``update_editions_chunk`` for formats.
        "begin" is logged before the first chunk and "end" after the last.

        Args:
            filename: path of a tab-separated file of
                ``edition key, work key, work id`` rows.
            author: value forwarded to the loader as the ``author`` of the
                bulk operation.
        """
        self.author = author

        root = os.path.dirname(filename)
        index_file = open(os.path.join(root, 'edition_ref.txt'), 'a')
        try:
            # The lookups live inside the try block so that index_file is
            # closed even when one of them fails.
            type_edition_id = self.loader.get_thing_id("/type/edition")
            keyid = Reindexer(self.loader.db).get_property_id(type_edition_id, "works")

            log("begin")
            editions_file = open(filename)
            try:
                for i, lines in enumerate(web.group(editions_file, 1000)):
                    t0 = time.time()
                    self.update_editions_chunk(lines, index_file, keyid)
                    t1 = time.time()
                    log(i, "%.3f sec" % (t1 - t0))
            finally:
                editions_file.close()
        finally:
            index_file.close()

        log("end")

    def update_editions_chunk(self, lines, index_file, keyid):
        """Set the ``works`` field of each edition and emit index rows.

        Args:
            lines: iterable of ``edition key<TAB>work key<TAB>work id`` lines.
            index_file: object with a ``writelines`` method; it receives one
                ``edition id<TAB>keyid<TAB>work id<TAB>0`` line per row
                returned by the loader's ``bulk_update``.
            keyid: property id written in the second column of every line.

        Raises:
            ValueError: if a line does not have exactly three columns.
        """
        data = [line.strip().split("\t") for line in lines]
        editions = [{"key": e, "works": [{"key": w}]} for e, w, wid in data]
        result = self.loader.bulk_update(editions, comment="link works", author=self.author)

        # edition key -> work id. The third column is the work id; writing
        # the whole row here would put a list repr into the index file.
        work_ids = dict((e, wid) for e, w, wid in data)

        def process():
            ordering = 0
            for row in result:
                eid = row['id']
                wid = work_ids[row['key']]
                yield "\t".join(map(str, [eid, keyid, wid, ordering])) + "\n"

        index_file.writelines(process())

    def add_index(self, editions, keys2id):
        """Insert one ``edition_ref`` row per edition into the database.

        Args:
            editions: iterable of dicts with a ``'key'`` and a non-empty
                ``'works'`` list of ``{'key': ...}`` dicts; only the first
                work is indexed.
            keys2id: mapping from document key to numeric id.
        """
        # NOTE(review): self.key_id_works is not assigned anywhere in this
        # class; presumably the caller sets it beforehand -- confirm.
        rows = [
            dict(thing_id=keys2id[e['key']],
                 key_id=self.key_id_works,
                 value=keys2id[e['works'][0]['key']],
                 ordering=0)
            for e in editions
        ]
        self.loader.db.multiple_insert("edition_ref", rows, seqname=False)
def __init__(self, **dbparams):
    """Build the document loader and pin the supported web.py release.

    Every keyword argument is passed straight through to ``DocumentLoader``.

    Raises:
        AssertionError: if ``web.__version__`` is not exactly "0.33".
    """
    backend = DocumentLoader(**dbparams)
    self.loader = backend
    self.tmpdir = "/tmp"

    # a bug in web.group has been fixed in 0.33
    installed_version = web.__version__
    assert installed_version == "0.33"
class WorkLoader:
    """Bulk-create work documents and link edition documents to them.

    All persistence goes through a ``DocumentLoader`` built from the keyword
    arguments given to the constructor.
    """

    def __init__(self, **dbparams):
        """Create the underlying ``DocumentLoader`` from ``dbparams``.

        Raises:
            AssertionError: if ``web.__version__`` is not exactly "0.33".
        """
        self.loader = DocumentLoader(**dbparams)
        self.tmpdir = "/tmp"

        # a bug in web.group has been fixed in 0.33
        assert web.__version__ == "0.33"

    def load_works(self, filename, author="/user/ImportBot"):
        """Load every work listed in ``filename`` in chunks of 1000 lines.

        For each chunk, lines of ``edition key<TAB>work key<TAB>work id`` are
        appended to ``editions.txt`` in the same directory as ``filename``,
        and the chunk index plus elapsed time is passed to ``log``.

        Args:
            filename: path of the input file; see ``load_works_chunk`` for
                the expected line format.
            author: value forwarded to the loader as the ``author`` of the
                bulk operation.
        """
        self.author = author

        root = os.path.dirname(filename)
        editions_file = open(os.path.join(root, 'editions.txt'), 'a')
        try:
            # Keep a handle on the input file so it is closed deterministically
            # instead of being left to the garbage collector.
            works_file = open(filename)
            try:
                for i, lines in enumerate(web.group(works_file, 1000)):
                    t0 = time.time()
                    self.load_works_chunk(lines, editions_file)
                    t1 = time.time()
                    log(i, "%.3f sec" % (t1 - t0))
            finally:
                works_file.close()
        finally:
            editions_file.close()

    def load_works_chunk(self, lines, editions_file):
        """Create the works described by ``lines`` and record their editions.

        Each line must evaluate to an ``(author_key, works)`` pair where
        ``works`` is a list of dicts, each carrying an ``'editions'`` list of
        edition key strings. Every work is given a fresh key from the loader,
        the ``/type/work`` type and an ``authors`` entry for ``author_key``.
        Its ``'subjects'`` and ``'toc'`` entries (when present) and its
        ``'editions'`` entry are removed before the work is saved.

        Args:
            lines: iterable of text lines in the format described above.
            editions_file: object with a ``writelines`` method that receives
                one ``edition key<TAB>work key<TAB>work id`` line per edition.
        """
        # NOTE(review): eval() executes arbitrary code found in the input
        # file. Only feed this trusted, locally generated data; consider
        # ast.literal_eval if the lines are plain literals.
        authors = [eval(line) for line in lines]

        # work key -> edition keys that belong to that work
        editions = {}

        def process(result):
            for r in result:
                for e in editions[r['key']]:
                    yield "\t".join([e, r['key'], str(r['id'])]) + "\n"

        for akey, works in authors:
            if not works:
                # Nothing to create for this author.
                continue

            keys = self.loader.new_work_keys(len(works))
            for work, key in zip(works, keys):
                work['key'] = key
                work['type'] = {'key': "/type/work"}
                work['authors'] = [{
                    'author': {'key': akey},
                    'type': '/type/author_role',
                }]
                # These fields are not stored on the work document.
                work.pop('subjects', None)
                work.pop('toc', None)
                editions[key] = work.pop('editions')

            # Save each author's works while `works` still refers to them, so
            # no author's batch is dropped when the loop moves on.
            result = self.loader.bulk_new(works, comment="add works page", author=self.author)
            editions_file.writelines(process(result))

    def update_editions(self, filename, author="/user/ImportBot"):
        """Link editions to works for every line of ``filename``.

        The file is processed in chunks of 1000 lines. For each updated
        edition a line is appended to ``edition_ref.txt`` in the same
        directory as ``filename``; see ``update_editions_chunk`` for formats.
        "begin" is logged before the first chunk and "end" after the last.

        Args:
            filename: path of a tab-separated file of
                ``edition key, work key, work id`` rows.
            author: value forwarded to the loader as the ``author`` of the
                bulk operation.
        """
        self.author = author

        root = os.path.dirname(filename)
        index_file = open(os.path.join(root, 'edition_ref.txt'), 'a')
        try:
            # The lookups live inside the try block so that index_file is
            # closed even when one of them fails.
            type_edition_id = self.loader.get_thing_id("/type/edition")
            keyid = Reindexer(self.loader.db).get_property_id(
                type_edition_id, "works")

            log("begin")
            editions_file = open(filename)
            try:
                for i, lines in enumerate(web.group(editions_file, 1000)):
                    t0 = time.time()
                    self.update_editions_chunk(lines, index_file, keyid)
                    t1 = time.time()
                    log(i, "%.3f sec" % (t1 - t0))
            finally:
                editions_file.close()
        finally:
            index_file.close()

        log("end")

    def update_editions_chunk(self, lines, index_file, keyid):
        """Set the ``works`` field of each edition and emit index rows.

        Args:
            lines: iterable of ``edition key<TAB>work key<TAB>work id`` lines.
            index_file: object with a ``writelines`` method; it receives one
                ``edition id<TAB>keyid<TAB>work id<TAB>0`` line per row
                returned by the loader's ``bulk_update``.
            keyid: property id written in the second column of every line.

        Raises:
            ValueError: if a line does not have exactly three columns.
        """
        data = [line.strip().split("\t") for line in lines]
        editions = [{"key": e, "works": [{"key": w}]} for e, w, wid in data]
        result = self.loader.bulk_update(editions,
                                         comment="link works",
                                         author=self.author)

        # edition key -> work id. The third column is the work id; writing
        # the whole row here would put a list repr into the index file.
        work_ids = dict((e, wid) for e, w, wid in data)

        def process():
            ordering = 0
            for row in result:
                eid = row['id']
                wid = work_ids[row['key']]
                yield "\t".join(map(str, [eid, keyid, wid, ordering])) + "\n"

        index_file.writelines(process())

    def add_index(self, editions, keys2id):
        """Insert one ``edition_ref`` row per edition into the database.

        Args:
            editions: iterable of dicts with a ``'key'`` and a non-empty
                ``'works'`` list of ``{'key': ...}`` dicts; only the first
                work is indexed.
            keys2id: mapping from document key to numeric id.
        """
        # NOTE(review): self.key_id_works is not assigned anywhere in this
        # class; presumably the caller sets it beforehand -- confirm.
        rows = [
            dict(thing_id=keys2id[e['key']],
                 key_id=self.key_id_works,
                 value=keys2id[e['works'][0]['key']],
                 ordering=0)
            for e in editions
        ]
        self.loader.db.multiple_insert("edition_ref", rows, seqname=False)