class ZoteroImporter(object):
    """Imports publications from a Papers2 database into a Zotero library.

    Items are converted to Zotero item templates, queued in a batch, and
    uploaded once the batch is full (or when :meth:`close` is called). An
    optional checkpoint records which Papers2 publications have already been
    imported so an interrupted import can be resumed. When ``dryrun`` is
    given, items are written to a JSON file instead of being uploaded.
    """

    def __init__(
        self,
        library_id,
        library_type,
        api_key,
        papers2,
        keyword_types=("user", "label"),
        label_map=None,
        add_to_collections=(),
        upload_attachments="all",
        batch_size=50,
        checkpoint=None,
        dryrun=None,
    ):
        """
        Args:
            library_id: Zotero library ID.
            library_type: Zotero library type ('user' or 'group').
            api_key: Zotero API key.
            papers2: Papers2 database wrapper.
            keyword_types: Papers2 keyword types to import as tags.
            label_map: optional mapping of Papers2 label names to tag names.
                Defaults to an empty mapping.
            add_to_collections: names of collections to add items to. An
                empty sequence adds none; ``None`` mirrors all Papers2
                collections.
            upload_attachments: one of 'all', 'unread', or 'none'.
            batch_size: number of items to upload per API request.
            checkpoint: optional checkpoint object for resumable imports.
            dryrun: optional path; when set, items are written as JSON
                instead of being uploaded.
        """
        self.client = Zotero(library_id, library_type, api_key)
        self.papers2 = papers2
        self.keyword_types = keyword_types
        # None sentinel avoids the shared-mutable-default-argument pitfall
        # (the original used ``label_map={}``).
        self.label_map = {} if label_map is None else label_map
        self.upload_attachments = upload_attachments
        self.checkpoint = checkpoint
        self.dryrun = JSONWriter(dryrun) if dryrun is not None else None
        self._batch = Batch(batch_size)
        self._load_collections(add_to_collections)

    def _load_collections(self, add_to_collections):
        """Load Zotero collections and create any Papers2 collections that
        don't exist yet.

        ``None`` means "mirror every Papers2 collection". Populates
        ``self.collections`` mapping collection name -> Zotero key.

        TODO: need to handle collection hierarchies
        """
        self.collections = {}
        if add_to_collections is None:
            add_to_collections = [c.name for c in self.papers2.get_collections()]
        if len(add_to_collections) == 0:
            return
        if self.dryrun is not None:
            # Don't talk to the server on a dry run; use placeholder keys.
            for name in add_to_collections:
                self.collections[name] = "<{0}>".format(name)
            return
        # fetch existing zotero collections
        existing_collections = {}
        for zc in self.client.collections():
            data = zc["data"]
            existing_collections[data["name"]] = data["key"]
        # add any papers2 collections that do not already exist
        payload = [
            dict(name=pc)
            for pc in add_to_collections
            if pc not in existing_collections
        ]
        if len(payload) > 0:
            self.client.create_collection(payload)
        # re-fetch zotero collections in order to get the generated keys
        for zc in self.client.collections():
            data = zc["data"]
            if data["name"] in add_to_collections:
                self.collections[data["name"]] = data["key"]

    def add_pub(self, pub):
        """Convert a Papers2 publication and queue it for upload.

        Returns:
            True if the publication was queued; False if it was skipped
            because the checkpoint says it was already imported.
        """
        # ignore publications we've already imported
        if self.checkpoint is not None and self.checkpoint.contains(pub.ROWID):
            log.debug("Skipping already imported publication {0}".format(pub.ROWID))
            return False
        # convert the Papers2 publication type to a Zotero item type
        item_type = ITEM_TYPES[self.papers2.get_pub_type(pub)]
        # get the template to fill in for an item of this type
        item = self.client.item_template(item_type)
        # fill in template fields (items() instead of Py2-only iteritems())
        for key, value in item.items():
            if key in EXTRACTORS:
                value = EXTRACTORS[key].extract(pub, self, value)
                if value is not None:
                    item[key] = value
        # add notes, if any
        notes = []
        if pub.notes is not None and len(pub.notes) > 0:
            notes.append(pub.notes)
        for review in self.papers2.get_reviews(pub):
            notes.append("{0} Rating: {1}".format(review.content, review.rating))
        # get paths to attachments
        attachments = []
        if self.upload_attachments == "all" or (
            self.upload_attachments == "unread" and pub.times_read == 0
        ):
            attachments = list(self.papers2.get_attachments(pub))
        # add to batch and checkpoint
        self._batch.add(item, notes, attachments)
        if self.checkpoint is not None:
            self.checkpoint.add(pub.ROWID)
        # commit the batch if it's full
        self._commit_batch()
        return True

    def close(self):
        """Flush any pending items and release held resources."""
        if self._batch is not None:
            self._commit_batch(force=True)
            self._batch = None
        if self.dryrun is not None:
            self.dryrun.close()

    def _commit_batch(self, force=False):
        """Upload the current batch when full (or when ``force`` is True).

        On any error the checkpoint is rolled back and the exception is
        re-raised; the batch is cleared in every case.
        """
        if not (self._batch.is_full or (force and not self._batch.is_empty)):
            return
        try:
            if self.dryrun is not None:
                for item, attachments in self._batch.iter():
                    self.dryrun.write(item, attachments)
            else:
                # upload metadata
                status = self.client.create_items(self._batch.items)
                self._log_failed_items(status)
                successes = {}
                successes.update(status["success"])
                successes.update(status["unchanged"])
                for key, obj_key in successes.items():
                    item_idx = int(key)
                    self._upload_notes(item_idx, obj_key)
                    if self.upload_attachments != "none":
                        self._upload_item_attachments(item_idx, obj_key)
                # update checkpoint
                if self.checkpoint is not None:
                    self.checkpoint.commit()
                log.info(
                    "Batch committed: {0} items created and {1} items unchanged out of {2} attempted".format(
                        len(status["success"]), len(status["unchanged"]), self._batch.size
                    )
                )
        # BaseException (not a bare except) so that even KeyboardInterrupt
        # rolls back the checkpoint before propagating.
        except BaseException:
            log.error("Error importing {0} items to Zotero".format(self._batch.size))
            if self.checkpoint is not None:
                self.checkpoint.rollback()
            raise
        finally:
            self._batch.clear()

    def _log_failed_items(self, status):
        """Log metadata-upload failures and drop them from the checkpoint."""
        for status_idx, status_msg in status["failed"].items():
            item_idx = int(status_idx)
            # remove failures from the checkpoint
            # NOTE(review): this passes the batch index, while add_pub
            # checkpoints pub.ROWID — verify checkpoint.remove expects an
            # index rather than a ROWID.
            if self.checkpoint is not None:
                self.checkpoint.remove(item_idx)
            item = self._batch.items[item_idx]
            log.error(
                "Upload failed for item {0}; code {1}; {2}".format(
                    item["title"], status_msg["code"], status_msg["message"]
                )
            )

    def _upload_notes(self, item_idx, obj_key):
        """Create note child items for the batch item at ``item_idx``."""
        notes = self._batch.notes[item_idx]
        if len(notes) == 0:
            return
        note_batch = []
        for note_text in notes:
            note = self.client.item_template("note")
            note["parentItem"] = obj_key
            note["note"] = note_text
            note_batch.append(note)
        note_status = self.client.create_items(note_batch)
        for status_idx, status_msg in note_status["failed"].items():
            note_idx = int(status_idx)
            # just warn about these failures
            note = note_batch[note_idx]
            # BUG FIX: original read self.batch.items[idx] — ``self.batch``
            # does not exist and ``idx`` was undefined.
            log.error(
                "Failed to create note {0} for item {1}; code {2}; {3}".format(
                    note["note"],
                    self._batch.items[item_idx]["title"],
                    status_msg["code"],
                    status_msg["message"],
                )
            )

    def _upload_item_attachments(self, item_idx, obj_key):
        """Upload file attachments recorded for the batch item at ``item_idx``."""
        # TODO: modify pyzotero to pass MIME type for contentType key
        paths = [path for path, mime in self._batch.attachments[item_idx]]
        if len(paths) == 0:
            return
        try:
            self.client.attachment_simple(paths, obj_key)
        # This is to work around a bug in pyzotero where an exception is
        # thrown if an attachment already exists
        except KeyError:
            log.info("One or more attachment already exists: {0}".format(",".join(paths)))
class ZoteroImporter(object):
    """Imports publications from a Papers2 database into a Zotero library.

    Publications are converted to Zotero item templates, queued, and
    uploaded in batches; calling :meth:`close` flushes any remainder. An
    optional checkpoint makes the import resumable, and ``dryrun`` diverts
    output to a JSON file instead of the Zotero API.
    """

    def __init__(
        self,
        library_id,
        library_type,
        api_key,
        papers2,
        keyword_types=("user", "label"),
        label_map=None,
        add_to_collections=(),
        upload_attachments="all",
        batch_size=50,
        checkpoint=None,
        dryrun=None,
    ):
        """
        Args:
            library_id: Zotero library ID.
            library_type: Zotero library type ('user' or 'group').
            api_key: Zotero API key.
            papers2: Papers2 database wrapper.
            keyword_types: Papers2 keyword types to import as tags.
            label_map: optional mapping of Papers2 label names to tag names;
                defaults to an empty mapping.
            add_to_collections: collection names to file items under. An
                empty sequence adds none; ``None`` mirrors every Papers2
                collection.
            upload_attachments: one of 'all', 'unread', or 'none'.
            batch_size: number of items per upload request.
            checkpoint: optional checkpoint object for resumable imports.
            dryrun: optional path; when given, items are written as JSON
                rather than uploaded.
        """
        self.client = Zotero(library_id, library_type, api_key)
        self.papers2 = papers2
        self.keyword_types = keyword_types
        # None sentinel replaces the original mutable default ``{}``.
        self.label_map = {} if label_map is None else label_map
        self.upload_attachments = upload_attachments
        self.checkpoint = checkpoint
        self.dryrun = JSONWriter(dryrun) if dryrun is not None else None
        self._batch = Batch(batch_size)
        self._load_collections(add_to_collections)

    def _load_collections(self, add_to_collections):
        """Load Zotero collections, creating any missing Papers2 ones.

        ``None`` means "mirror every Papers2 collection". Fills in
        ``self.collections`` (name -> Zotero key).

        TODO: need to handle collection hierarchies
        """
        self.collections = {}
        if add_to_collections is None:
            add_to_collections = [c.name for c in self.papers2.get_collections()]
        if len(add_to_collections) == 0:
            return
        if self.dryrun is not None:
            # Dry runs never hit the server; fabricate placeholder keys.
            for name in add_to_collections:
                self.collections[name] = "<{0}>".format(name)
            return
        # fetch existing zotero collections
        existing_collections = {}
        for zc in self.client.collections():
            data = zc['data']
            existing_collections[data['name']] = data['key']
        # add any papers2 collections that do not already exist
        payload = [
            dict(name=pc)
            for pc in add_to_collections
            if pc not in existing_collections
        ]
        if len(payload) > 0:
            self.client.create_collection(payload)
        # re-fetch zotero collections in order to get their keys
        for zc in self.client.collections():
            data = zc['data']
            if data['name'] in add_to_collections:
                self.collections[data['name']] = data['key']

    def add_pub(self, pub):
        """Convert a Papers2 publication and queue it for upload.

        Returns:
            True if queued; False if skipped because the checkpoint marks
            it as already imported.
        """
        # ignore publications we've already imported
        if self.checkpoint is not None and self.checkpoint.contains(pub.ROWID):
            log.debug("Skipping already imported publication {0}".format(
                pub.ROWID))
            return False
        # convert the Papers2 publication type to a Zotero item type
        item_type = ITEM_TYPES[self.papers2.get_pub_type(pub)]
        # get the template to fill in for an item of this type
        item = self.client.item_template(item_type)
        # fill in template fields (items() instead of Py2-only iteritems())
        for key, value in item.items():
            if key in EXTRACTORS:
                value = EXTRACTORS[key].extract(pub, self, value)
                if value is not None:
                    item[key] = value
        # add notes, if any
        notes = []
        if pub.notes is not None and len(pub.notes) > 0:
            notes.append(pub.notes)
        for review in self.papers2.get_reviews(pub):
            notes.append("{0} Rating: {1}".format(review.content, review.rating))
        # get paths to attachments
        attachments = []
        if self.upload_attachments == "all" or (
                self.upload_attachments == "unread" and pub.times_read == 0):
            attachments = list(self.papers2.get_attachments(pub))
        # add to batch and checkpoint
        self._batch.add(item, notes, attachments)
        if self.checkpoint is not None:
            self.checkpoint.add(pub.ROWID)
        # commit the batch if it's full
        self._commit_batch()
        return True

    def close(self):
        """Flush any pending items and release held resources."""
        if self._batch is not None:
            self._commit_batch(force=True)
            self._batch = None
        if self.dryrun is not None:
            self.dryrun.close()

    def _commit_batch(self, force=False):
        """Upload the current batch when full (or when ``force`` is True).

        Rolls the checkpoint back and re-raises on any error; the batch is
        cleared in every case.
        """
        if not (self._batch.is_full or (force and not self._batch.is_empty)):
            return
        try:
            if self.dryrun is not None:
                for item, attachments in self._batch.iter():
                    self.dryrun.write(item, attachments)
            else:
                # upload metadata
                status = self.client.create_items(self._batch.items)
                self._log_failed_items(status)
                successes = {}
                successes.update(status['success'])
                successes.update(status['unchanged'])
                for key, obj_key in successes.items():
                    item_idx = int(key)
                    self._upload_notes(item_idx, obj_key)
                    if self.upload_attachments != "none":
                        self._upload_item_attachments(item_idx, obj_key)
                # update checkpoint
                if self.checkpoint is not None:
                    self.checkpoint.commit()
                log.info(
                    "Batch committed: {0} items created and {1} items unchanged out of {2} attempted"
                    .format(len(status['success']), len(status['unchanged']),
                            self._batch.size))
        # BaseException rather than a bare except so KeyboardInterrupt still
        # triggers the checkpoint rollback before propagating.
        except BaseException:
            log.error("Error importing {0} items to Zotero".format(
                self._batch.size))
            if self.checkpoint is not None:
                self.checkpoint.rollback()
            raise
        finally:
            self._batch.clear()

    def _log_failed_items(self, status):
        """Log metadata-upload failures and drop them from the checkpoint."""
        for status_idx, status_msg in status['failed'].items():
            item_idx = int(status_idx)
            # remove failures from the checkpoint
            # NOTE(review): passes the batch index, but add_pub checkpoints
            # pub.ROWID — confirm checkpoint.remove expects an index.
            if self.checkpoint is not None:
                self.checkpoint.remove(item_idx)
            item = self._batch.items[item_idx]
            log.error("Upload failed for item {0}; code {1}; {2}".format(
                item['title'], status_msg['code'], status_msg['message']))

    def _upload_notes(self, item_idx, obj_key):
        """Create note child items for the batch item at ``item_idx``."""
        notes = self._batch.notes[item_idx]
        if len(notes) == 0:
            return
        note_batch = []
        for note_text in notes:
            note = self.client.item_template('note')
            note['parentItem'] = obj_key
            note['note'] = note_text
            note_batch.append(note)
        note_status = self.client.create_items(note_batch)
        for status_idx, status_msg in note_status['failed'].items():
            note_idx = int(status_idx)
            # just warn about these failures
            note = note_batch[note_idx]
            # BUG FIX: original read self.batch.items[idx] — ``self.batch``
            # does not exist and ``idx`` was undefined.
            log.error(
                "Failed to create note {0} for item {1}; code {2}; {3}"
                .format(note['note'],
                        self._batch.items[item_idx]['title'],
                        status_msg['code'], status_msg['message']))

    def _upload_item_attachments(self, item_idx, obj_key):
        """Upload file attachments for the batch item at ``item_idx``."""
        # TODO: modify pyzotero to pass MIME type for contentType key
        paths = [path for path, mime in self._batch.attachments[item_idx]]
        if len(paths) == 0:
            return
        try:
            self.client.attachment_simple(paths, obj_key)
        # This is to work around a bug in pyzotero where an exception is
        # thrown if an attachment already exists
        except KeyError:
            log.info("One or more attachment already exists: {0}".format(
                ",".join(paths)))