Exemplo n.º 1
0
class ZoteroImporter(object):
    """Upload Papers2 publications to a Zotero library in batches.

    Publications are queued with :meth:`add_pub`; once the internal batch is
    full (or :meth:`close` is called) the queued items, their notes and their
    attachments are sent through the Zotero client.  When ``dryrun`` is given,
    nothing is uploaded and the items are written with ``JSONWriter`` instead.
    """

    def __init__(
        self,
        library_id,
        library_type,
        api_key,
        papers2,
        keyword_types=("user", "label"),
        label_map=None,
        add_to_collections=(),
        upload_attachments="all",
        batch_size=50,
        checkpoint=None,
        dryrun=None,
    ):
        """Create the Zotero client and resolve the target collections.

        Args:
            library_id, library_type, api_key: passed straight to ``Zotero``.
            papers2: Papers2 accessor; this class calls its
                ``get_collections``, ``get_pub_type``, ``get_reviews`` and
                ``get_attachments`` methods.
            keyword_types: stored on the instance; not read by this class
                (presumably consumed by the extractors - confirm).
            label_map: stored on the instance; not read by this class.
                ``None`` (the default) means a fresh empty dict.
            add_to_collections: collection names to create/look up in Zotero.
                ``None`` means "every collection reported by ``papers2``";
                an empty sequence (the default) means no collections.
            upload_attachments: ``"all"``, ``"unread"`` (only publications
                with ``times_read == 0``) or ``"none"``.
            batch_size: passed to ``Batch``.
            checkpoint: optional object providing ``contains``, ``add``,
                ``remove``, ``commit`` and ``rollback``.
            dryrun: if not ``None``, passed to ``JSONWriter`` and used instead
                of uploading.
        """
        self.client = Zotero(library_id, library_type, api_key)
        self.papers2 = papers2
        self.keyword_types = keyword_types
        # A fresh dict per instance: a ``{}`` default would be shared by every
        # importer created without an explicit label_map.
        self.label_map = {} if label_map is None else label_map
        self.upload_attachments = upload_attachments
        self.checkpoint = checkpoint
        self.dryrun = JSONWriter(dryrun) if dryrun is not None else None
        self._batch = Batch(batch_size)
        self._load_collections(add_to_collections)

    # Load Zotero collections and create any
    # Papers2 collections that don't exist.
    # TODO: need to handle collection hierarchies
    def _load_collections(self, add_to_collections):
        """Populate ``self.collections`` with a name -> Zotero key mapping.

        In dry-run mode no request is made and each key is the placeholder
        ``"<name>"``.  Otherwise missing collections are created first and
        the collection list is fetched again to learn their keys.
        """
        self.collections = {}
        if add_to_collections is None:
            add_to_collections = [c.name for c in self.papers2.get_collections()]

        if not add_to_collections:
            return

        if self.dryrun is not None:
            for name in add_to_collections:
                self.collections[name] = "<{0}>".format(name)
            return

        # fetch existing zotero collections (only the names are needed here)
        existing_names = set(zc["data"]["name"] for zc in self.client.collections())

        # add any papers2 collections that do not already exist
        payload = [dict(name=name) for name in add_to_collections if name not in existing_names]
        if payload:
            self.client.create_collection(payload)

        # re-fetch zotero collections in order to get keys
        wanted = set(add_to_collections)
        for zc in self.client.collections():
            data = zc["data"]
            if data["name"] in wanted:
                self.collections[data["name"]] = data["key"]

    def add_pub(self, pub):
        """Queue one publication for upload.

        Returns:
            ``False`` if the checkpoint already contains ``pub.ROWID`` (the
            publication is skipped), ``True`` once it has been queued.

        Raises:
            KeyError: if the publication type has no entry in ``ITEM_TYPES``.
            Anything raised while committing a full batch also propagates.
        """
        # ignore publications we've already imported
        if self.checkpoint is not None and self.checkpoint.contains(pub.ROWID):
            log.debug("Skipping already imported publication {0}".format(pub.ROWID))
            return False

        # convert the Papers2 publication type to a Zotero item type
        item_type = ITEM_TYPES[self.papers2.get_pub_type(pub)]

        # get the template to fill in for an item of this type
        item = self.client.item_template(item_type)

        # fill in template fields; iterate over a snapshot so assigning into
        # the template is safe on both Python 2 and Python 3
        for key, value in list(item.items()):
            if key in EXTRACTORS:
                value = EXTRACTORS[key].extract(pub, self, value)
                if value is not None:
                    item[key] = value

        # add notes, if any
        notes = []
        if pub.notes:
            notes.append(pub.notes)

        for review in self.papers2.get_reviews(pub):
            notes.append("{0} Rating: {1}".format(review.content, review.rating))

        # get paths to attachments
        attachments = []
        if self.upload_attachments == "all" or (self.upload_attachments == "unread" and pub.times_read == 0):
            attachments = list(self.papers2.get_attachments(pub))

        # add to batch and checkpoint
        self._batch.add(item, notes, attachments)
        if self.checkpoint is not None:
            self.checkpoint.add(pub.ROWID)

        # commit the batch if it's full
        self._commit_batch()

        return True

    def close(self):
        """Flush any queued items and close the dry-run writer.

        The writer is closed even if the final commit raises, so the output
        file is not left open on failure.
        """
        try:
            if self._batch is not None:
                self._commit_batch(force=True)
        finally:
            self._batch = None
            if self.dryrun is not None:
                self.dryrun.close()

    def _commit_batch(self, force=False):
        """Send the current batch if it is full, or non-empty when ``force``.

        On any error the checkpoint (if present) is rolled back and the
        exception is re-raised.  The batch is cleared in every case.
        """
        if not (self._batch.is_full or (force and not self._batch.is_empty)):
            return

        try:
            if self.dryrun is not None:
                for item, attachments in self._batch.iter():
                    self.dryrun.write(item, attachments)
            else:
                self._upload_batch()

        # Deliberately as broad as a bare ``except``: the checkpoint must be
        # rolled back even on KeyboardInterrupt; the exception is re-raised.
        except BaseException:
            log.error("Error importing {0} items to Zotero".format(self._batch.size))
            if self.checkpoint is not None:
                self.checkpoint.rollback()
            raise

        finally:
            self._batch.clear()

    def _upload_batch(self):
        """Upload batch metadata, then notes/attachments of accepted items."""
        # upload metadata
        status = self.client.create_items(self._batch.items)

        for status_idx, status_msg in status["failed"].items():
            item_idx = int(status_idx)
            # remove failures from the checkpoint
            # NOTE(review): remove() receives the batch index while add()
            # receives pub.ROWID - confirm the checkpoint expects an index.
            if self.checkpoint is not None:
                self.checkpoint.remove(item_idx)
            item = self._batch.items[item_idx]
            log.error(
                "Upload failed for item {0}; code {1}; {2}".format(
                    item["title"], status_msg["code"], status_msg["message"]
                )
            )

        # both newly created and unchanged items get notes and attachments
        successes = {}
        successes.update(status["success"])
        successes.update(status["unchanged"])

        for status_idx, obj_key in successes.items():
            item_idx = int(status_idx)
            self._upload_notes(item_idx, obj_key)
            if self.upload_attachments != "none":
                self._upload_attachments(item_idx, obj_key)

        # update checkpoint
        if self.checkpoint is not None:
            self.checkpoint.commit()

        log.info(
            "Batch committed: {0} items created and {1} items unchanged out of {2} attempted".format(
                len(status["success"]), len(status["unchanged"]), self._batch.size
            )
        )

    def _upload_notes(self, item_idx, obj_key):
        """Create the notes of batch item ``item_idx`` as children of ``obj_key``."""
        notes = self._batch.notes[item_idx]
        if not notes:
            return

        note_batch = []
        for note_text in notes:
            note = self.client.item_template("note")
            note["parentItem"] = obj_key
            note["note"] = note_text
            note_batch.append(note)

        note_status = self.client.create_items(note_batch)

        for status_idx, status_msg in note_status["failed"].items():
            # just warn about these failures
            note = note_batch[int(status_idx)]
            log.error(
                "Failed to create note {0} for item item {1}; code {2}; {3}".format(
                    note["note"],
                    self._batch.items[item_idx]["title"],
                    status_msg["code"],
                    status_msg["message"],
                )
            )

    def _upload_attachments(self, item_idx, obj_key):
        """Upload the attachment files of batch item ``item_idx`` under ``obj_key``."""
        # TODO: modify pyzotero to pass MIME type for contentType key
        attachments = [path for path, mime in self._batch.attachments[item_idx]]
        if not attachments:
            return

        try:
            self.client.attachment_simple(attachments, obj_key)

        # This is to work around a bug in pyzotero where an exception is
        # thrown if an attachment already exists
        except KeyError:
            log.info("One or more attachment already exists: {0}".format(",".join(attachments)))
Exemplo n.º 2
0
class ZoteroImporter(object):
    """Upload Papers2 publications to a Zotero library in batches.

    Publications are queued with :meth:`add_pub`; once the internal batch is
    full (or :meth:`close` is called) the queued items, their notes and their
    attachments are sent through the Zotero client.  When ``dryrun`` is given,
    nothing is uploaded and the items are written with ``JSONWriter`` instead.
    """

    def __init__(self,
                 library_id,
                 library_type,
                 api_key,
                 papers2,
                 keyword_types=('user', 'label'),
                 label_map=None,
                 add_to_collections=(),
                 upload_attachments="all",
                 batch_size=50,
                 checkpoint=None,
                 dryrun=None):
        """Create the Zotero client and resolve the target collections.

        Args:
            library_id, library_type, api_key: passed straight to ``Zotero``.
            papers2: Papers2 accessor; this class calls its
                ``get_collections``, ``get_pub_type``, ``get_reviews`` and
                ``get_attachments`` methods.
            keyword_types: stored on the instance; not read by this class
                (presumably consumed by the extractors - confirm).
            label_map: stored on the instance; not read by this class.
                ``None`` (the default) means a fresh empty dict.
            add_to_collections: collection names to create/look up in Zotero.
                ``None`` means "every collection reported by ``papers2``";
                an empty sequence (the default) means no collections.
            upload_attachments: ``"all"``, ``"unread"`` (only publications
                with ``times_read == 0``) or ``"none"``.
            batch_size: passed to ``Batch``.
            checkpoint: optional object providing ``contains``, ``add``,
                ``remove``, ``commit`` and ``rollback``.
            dryrun: if not ``None``, passed to ``JSONWriter`` and used instead
                of uploading.
        """
        self.client = Zotero(library_id, library_type, api_key)
        self.papers2 = papers2
        self.keyword_types = keyword_types
        # A fresh dict per instance: a ``{}`` default would be shared by every
        # importer created without an explicit label_map.
        self.label_map = {} if label_map is None else label_map
        self.upload_attachments = upload_attachments
        self.checkpoint = checkpoint
        self.dryrun = JSONWriter(dryrun) if dryrun is not None else None
        self._batch = Batch(batch_size)
        self._load_collections(add_to_collections)

    # Load Zotero collections and create any
    # Papers2 collections that don't exist.
    # TODO: need to handle collection hierarchies
    def _load_collections(self, add_to_collections):
        """Populate ``self.collections`` with a name -> Zotero key mapping.

        In dry-run mode no request is made and each key is the placeholder
        ``"<name>"``.  Otherwise missing collections are created first and
        the collection list is fetched again to learn their keys.
        """
        self.collections = {}
        if add_to_collections is None:
            add_to_collections = [
                c.name for c in self.papers2.get_collections()
            ]

        if not add_to_collections:
            return

        if self.dryrun is not None:
            for name in add_to_collections:
                self.collections[name] = "<{0}>".format(name)
            return

        # fetch existing zotero collections (only the names are needed here)
        existing_names = set(
            zc['data']['name'] for zc in self.client.collections())

        # add any papers2 collections that do not already exist
        payload = [
            dict(name=name) for name in add_to_collections
            if name not in existing_names
        ]
        if payload:
            self.client.create_collection(payload)

        # re-fetch zotero collections in order to get keys
        wanted = set(add_to_collections)
        for zc in self.client.collections():
            data = zc['data']
            if data['name'] in wanted:
                self.collections[data['name']] = data['key']

    def add_pub(self, pub):
        """Queue one publication for upload.

        Returns:
            ``False`` if the checkpoint already contains ``pub.ROWID`` (the
            publication is skipped), ``True`` once it has been queued.

        Raises:
            KeyError: if the publication type has no entry in ``ITEM_TYPES``.
            Anything raised while committing a full batch also propagates.
        """
        # ignore publications we've already imported
        if self.checkpoint is not None and self.checkpoint.contains(pub.ROWID):
            log.debug("Skipping already imported publication {0}".format(
                pub.ROWID))
            return False

        # convert the Papers2 publication type to a Zotero item type
        item_type = ITEM_TYPES[self.papers2.get_pub_type(pub)]

        # get the template to fill in for an item of this type
        item = self.client.item_template(item_type)

        # fill in template fields; iterate over a snapshot so assigning into
        # the template is safe on both Python 2 and Python 3
        for key, value in list(item.items()):
            if key in EXTRACTORS:
                value = EXTRACTORS[key].extract(pub, self, value)
                if value is not None:
                    item[key] = value

        # add notes, if any
        notes = []
        if pub.notes:
            notes.append(pub.notes)

        for review in self.papers2.get_reviews(pub):
            notes.append("{0} Rating: {1}".format(review.content,
                                                  review.rating))

        # get paths to attachments
        attachments = []
        if self.upload_attachments == "all" or (
                self.upload_attachments == "unread" and pub.times_read == 0):
            attachments = list(self.papers2.get_attachments(pub))

        # add to batch and checkpoint
        self._batch.add(item, notes, attachments)
        if self.checkpoint is not None:
            self.checkpoint.add(pub.ROWID)

        # commit the batch if it's full
        self._commit_batch()

        return True

    def close(self):
        """Flush any queued items and close the dry-run writer.

        The writer is closed even if the final commit raises, so the output
        file is not left open on failure.
        """
        try:
            if self._batch is not None:
                self._commit_batch(force=True)
        finally:
            self._batch = None
            if self.dryrun is not None:
                self.dryrun.close()

    def _commit_batch(self, force=False):
        """Send the current batch if it is full, or non-empty when ``force``.

        On any error the checkpoint (if present) is rolled back and the
        exception is re-raised.  The batch is cleared in every case.
        """
        if not (self._batch.is_full or
                (force and not self._batch.is_empty)):
            return

        try:
            if self.dryrun is not None:
                for item, attachments in self._batch.iter():
                    self.dryrun.write(item, attachments)
            else:
                self._upload_batch()

        # Deliberately as broad as a bare ``except``: the checkpoint must be
        # rolled back even on KeyboardInterrupt; the exception is re-raised.
        except BaseException:
            log.error("Error importing {0} items to Zotero".format(
                self._batch.size))
            if self.checkpoint is not None:
                self.checkpoint.rollback()
            raise

        finally:
            self._batch.clear()

    def _upload_batch(self):
        """Upload batch metadata, then notes/attachments of accepted items."""
        # upload metadata
        status = self.client.create_items(self._batch.items)

        for status_idx, status_msg in status['failed'].items():
            item_idx = int(status_idx)
            # remove failures from the checkpoint
            # NOTE(review): remove() receives the batch index while add()
            # receives pub.ROWID - confirm the checkpoint expects an index.
            if self.checkpoint is not None:
                self.checkpoint.remove(item_idx)
            item = self._batch.items[item_idx]
            log.error(
                "Upload failed for item {0}; code {1}; {2}".format(
                    item['title'], status_msg['code'],
                    status_msg['message']))

        # both newly created and unchanged items get notes and attachments
        successes = {}
        successes.update(status['success'])
        successes.update(status['unchanged'])

        for status_idx, obj_key in successes.items():
            item_idx = int(status_idx)
            self._upload_notes(item_idx, obj_key)
            if self.upload_attachments != "none":
                self._upload_attachments(item_idx, obj_key)

        # update checkpoint
        if self.checkpoint is not None:
            self.checkpoint.commit()

        log.info(
            "Batch committed: {0} items created and {1} items unchanged out of {2} attempted"
            .format(len(status['success']),
                    len(status['unchanged']), self._batch.size))

    def _upload_notes(self, item_idx, obj_key):
        """Create the notes of batch item ``item_idx`` as children of ``obj_key``."""
        notes = self._batch.notes[item_idx]
        if not notes:
            return

        note_batch = []
        for note_text in notes:
            note = self.client.item_template('note')
            note['parentItem'] = obj_key
            note['note'] = note_text
            note_batch.append(note)

        note_status = self.client.create_items(note_batch)

        for status_idx, status_msg in note_status['failed'].items():
            # just warn about these failures
            note = note_batch[int(status_idx)]
            log.error(
                "Failed to create note {0} for item item {1}; code {2}; {3}"
                .format(note['note'],
                        self._batch.items[item_idx]['title'],
                        status_msg['code'],
                        status_msg['message']))

    def _upload_attachments(self, item_idx, obj_key):
        """Upload the attachment files of batch item ``item_idx`` under ``obj_key``."""
        # TODO: modify pyzotero to pass MIME type for contentType key
        attachments = [
            path for path, mime in self._batch.attachments[item_idx]
        ]
        if not attachments:
            return

        try:
            self.client.attachment_simple(attachments, obj_key)

        # This is to work around a bug in pyzotero where an exception is
        # thrown if an attachment already exists
        except KeyError:
            log.info(
                "One or more attachment already exists: {0}"
                .format(",".join(attachments)))