Пример #1
0
class ZoteroWrap:
    """Locally cached view of a Zotero library plus thin Zotero API wrappers.

    The reference list, the reference types and the reference templates are
    kept in memory and persisted together in a single pickle file, so that
    the (slow) download of the whole library only happens when no usable
    cache exists. References are the item dictionaries returned by the
    Zotero client (each one has at least a 'key' and a 'data' entry).
    """

    # Keys of the dictionary stored in the cache file.
    CACHE_REFERENCE_LIST = "references"
    CACHE_REFERENCE_TYPES = "reference_types"
    CACHE_REFERENCE_TEMPLATES = "reference_templates"

    def __init__(self, library_id, library_type, api_key, directory):
        """Prepare the cache path and the Zotero client. No I/O is done here.

        Args:
            library_id: Zotero library identifier, passed to ``Zotero``.
            library_type: Zotero library type, passed to ``Zotero``.
            api_key: Zotero API key, passed to ``Zotero``.
            directory: Directory in which the cache file is stored.
        """
        # NOTE(review): the API key ends up verbatim in the cache file name,
        # so it is visible in directory listings. Left unchanged so that
        # already existing cache files are still found.
        cache_filename = "{}-{}-{}.pkl".format(library_id, library_type,
                                               api_key)
        self.cache_path = os.path.join(directory, cache_filename)
        # reference_types and reference_templates must have the same ordering.
        self.reference_types = []
        self.reference_templates = {}
        self._zotero_lib = Zotero(library_id, library_type, api_key)
        self._references = []

    # Data I/O methods section.

    def initialize(self):
        """Load the cached Zotero data, or retrieve them if there is no usable cache.

        A missing cache file triggers a distant load, as does a cache file
        that is truncated, is not a valid pickle, or lacks an expected key.
        """
        try:
            self.load_cache()
        except FileNotFoundError:
            self.load_distant()
        except (EOFError, KeyError, pickle.UnpicklingError):
            # A damaged cache would otherwise make every start-up fail until
            # the file is removed by hand; rebuilding it is always possible.
            print("Cached Zotero data unreadable, reloading them.")
            self.load_distant()

    def load_cache(self):
        """Load the cached Zotero data.

        Raises:
            FileNotFoundError: If the cache file does not exist.
            KeyError: If the cache lacks one of the expected entries.
        """
        with open(self.cache_path, "rb") as f:
            print("Loading cached Zotero data...")
            # NOTE(security): pickle.load() can execute arbitrary code; the
            # cache file must only ever be written by this class.
            cache = pickle.load(f)
        # Read every entry before assigning anything, so that an incomplete
        # cache leaves the instance untouched.
        references = cache[self.CACHE_REFERENCE_LIST]
        reference_types = cache[self.CACHE_REFERENCE_TYPES]
        reference_templates = cache[self.CACHE_REFERENCE_TEMPLATES]
        self._references = references
        self.reference_types = reference_types
        self.reference_templates = reference_templates
        print("Cached Zotero data loaded.")

    def load_distant(self):
        """Load the distant Zotero data, then cache them."""
        print("Loading distant Zotero data...")
        self._references = self.get_references()
        self.reference_types = self.get_reference_types()
        self.reference_templates = self.get_reference_templates(
            self.reference_types)
        print("Distant Zotero data loaded.")
        self.cache()

    def cache(self):
        """Cache the Zotero data.

        The data are written to a temporary sibling file which then replaces
        the cache file, so an interrupted write cannot leave a truncated
        cache behind.
        """
        cache = {
            self.CACHE_REFERENCE_LIST: self._references,
            self.CACHE_REFERENCE_TYPES: self.reference_types,
            self.CACHE_REFERENCE_TEMPLATES: self.reference_templates
        }
        tmp_path = self.cache_path + ".tmp"
        with open(tmp_path, "wb") as f:
            pickle.dump(cache, f)
        os.replace(tmp_path, self.cache_path)

    def create_local_reference(self, ref):
        """Append the reference at the end of the reference list and cache it."""
        self._references.append(ref)
        self.cache()

    def create_distant_reference(self, ref_data):
        """Validate and create the reference in Zotero and return the created item.

        Raises:
            InvalidZoteroItemError: If the reference data are not valid.
            CreateZoteroItemError: If Zotero does not report the item as
                successfully created.
        """
        self.validate_reference_data(ref_data)
        creation_status = self._zotero_lib.create_items([ref_data])
        try:
            # A single item was submitted, hence the "0" entry.
            return creation_status["successful"]["0"]
        except KeyError as e:
            print(creation_status)
            raise CreateZoteroItemError from e

    def update_local_reference(self, index, ref):
        """Replace the reference in the reference list and cache it."""
        self._references[index] = ref
        self.cache()

    def update_distant_reference(self, ref):
        """Validate and update the reference in Zotero.

        Existing fields not present will be left unmodified.

        Raises:
            InvalidZoteroItemError: If the reference data are not valid.
        """
        self.validate_reference_data(ref["data"])
        self._zotero_lib.update_item(ref)

    def validate_reference_data(self, ref_data):
        """Validate the reference data.

        Zotero.check_items() caches data after the first API call.

        Raises:
            InvalidZoteroItemError: If Zotero.check_items() rejects the data.
        """
        try:
            self._zotero_lib.check_items([ref_data])
        except InvalidItemFields as e:
            raise InvalidZoteroItemError from e

    def get_references(self):
        """Return all references in the Zotero database. Takes time..."""
        return self._zotero_lib.everything(self._zotero_lib.top())

    def get_reference_types(self):
        """Return the sorted reference types.

        Zotero.item_types() caches data after the first API call.
        """
        item_types = self._zotero_lib.item_types()
        return sorted(x["itemType"] for x in item_types)

    def get_reference_templates(self, ref_types):
        """Return the reference templates for the types as an ordered dictionary."""
        return OrderedDict((x, self.get_reference_template(x))
                           for x in ref_types)

    def get_reference_template(self, ref_type):
        """Return the reference template for the type as an ordered dictionary.

        The entries are sorted by field name.
        Zotero.item_template() caches data after the first API call.
        """
        template = self._zotero_lib.item_template(ref_type)
        return OrderedDict(sorted(template.items(), key=lambda x: x[0]))

    def get_reference(self, ref_key):
        """Return the reference for the key (fetched from Zotero)."""
        return self._zotero_lib.item(ref_key)

    # Public @properties surrogates section.

    def reference_count(self):
        """Return the number of references."""
        return len(self._references)

    def reference_data(self, index):
        """Return the 'data' field of the reference."""
        return self._references[index]["data"]

    def reference_extra_field(self, field, index):
        """Return the value of the field in 'extra', otherwise ''.

        'extra' is read as one "<field>: <value>" entry per line; the first
        line starting with "<field>:" wins. A reference without an 'extra'
        entry simply has no extra field.
        """
        # Not every item carries 'extra'; a missing entry must not abort the
        # scans done through reference_id().
        extra = self.reference_data(index).get("extra") or ""
        field_id = field + ":"
        for line in extra.split("\n"):
            if line.startswith(field_id):
                return line[len(field_id):].strip()
        return ""

    def reference_type(self, index):
        """Return the reference type."""
        return self.reference_data(index)["itemType"]

    def reference_key(self, index):
        """Return the reference key."""
        return self._references[index]["key"]

    def reference_id(self, index):
        """Return the reference ID (locally defined), or '' if there is none.

        The ID is, by order of preference, the DOI, "PMID_<pmid>" or
        "UNPUBLISHED_<id>".
        """
        # TODO Include ISBN and ISSN?
        doi = self.reference_doi(index)
        if doi:
            return doi
        pmid = self.reference_pmid(index)
        if pmid:
            return "PMID_" + pmid
        unpublished_id = self.reference_unpublished_id(index)
        if unpublished_id:
            return "UNPUBLISHED_" + unpublished_id
        return ""

    def reference_doi(self, index):
        """Return the reference DOI.

        The 'DOI' field is used when it is filled in; otherwise the DOI is
        looked up in 'extra' ('' if there is none).
        """
        doi = self.reference_data(index).get("DOI")
        if doi:
            return doi
        # Only consult 'extra' when needed, including when the 'DOI' field
        # exists but is empty.
        return self.reference_extra_field("DOI", index)

    def reference_pmid(self, index):
        """Return the reference PMID."""
        return self.reference_extra_field("PMID", index)

    def reference_unpublished_id(self, index):
        """Return the reference UNPUBLISHED ID."""
        return self.reference_extra_field("UNPUBLISHED", index)

    def reference_title(self, index):
        """Return the reference title."""
        return self.reference_data(index)["title"]

    def reference_creator_surnames(self, index):
        """Return as a list the surnames of the reference creators (locally defined).

        Only the authors are returned when there is at least one author,
        otherwise all the creators. An empty list is returned when one of
        the selected creators has no 'lastName'.
        """
        # TODO Not true, ex: ISBN 978-1-4398-3778-8. Return all creator types?
        # Academic books published as a collection of chapters contributed by
        # different authors have editors but not authors at the level of the
        # book (as opposed to the level of a chapter).
        creators = self.reference_data(index)["creators"]
        has_author = any(x["creatorType"] == "author" for x in creators)
        if has_author:
            selected = [x for x in creators if x["creatorType"] == "author"]
        else:
            selected = creators
        # 'name' (not split) might be used instead of 'firstName' and 'lastName'.
        try:
            return [x["lastName"] for x in selected]
        except KeyError:
            return []

    def reference_creator_surnames_str(self, index):
        """Return as a string the surnames of the reference creators (locally defined)."""
        # NB: str.join() returns an empty string for an empty list.
        return ", ".join(self.reference_creator_surnames(index))

    def reference_date(self, index):
        """Return the reference publication date."""
        return self.reference_data(index)["date"]

    def reference_year(self, index):
        """Return the reference publication year.

        Returns the year as an int, or '' when neither the date parser nor a
        search for four consecutive digits finds one.
        """
        # TODO Use meta:parsedDate field instead?
        ref_date = self.reference_date(index)
        try:
            # NB: datetime.year returns an int.
            return parse(ref_date).year
        except (ValueError, OverflowError):
            # NOTE(review): OverflowError assumes parse() is dateutil's
            # parser, which documents it for out-of-range values - confirm.
            matched = re.search(r"\d{4}", ref_date)
            if matched:
                return int(matched.group())
            return ""

    def reference_journal(self, index):
        """Return the reference journal name, or "(<type>)" for non-articles."""
        # TODO Change the column name 'Journal' to an other?
        ref_type = self.reference_type(index)
        if ref_type == "journalArticle":
            return self.reference_data(index)["publicationTitle"]
        return "({})".format(ref_type)

    # Public methods section.

    def reference_index(self, ref_id):
        """Return the index of the first reference with this ID.

        Raises:
            ReferenceNotFoundError: If no reference has this ID.
        """
        try:
            indexes = range(self.reference_count())
            return next(i for i in indexes if self.reference_id(i) == ref_id)
        except StopIteration as e:
            # format() keeps the error report working for non-string IDs.
            raise ReferenceNotFoundError("ID: {}".format(ref_id)) from e

    def reference_creators_citation(self, ref_id):
        """Return for citation the creator surnames (locally defined) and the publication year.

        Returns '' when the reference has no usable creator surname.

        Raises:
            ReferenceNotFoundError: If no reference has this ID.
        """
        # FIXME Delayed refactoring. Use an index instead of an ID.
        index = self.reference_index(ref_id)
        creators = self.reference_creator_surnames(index)
        if not creators:
            return ""
        year = self.reference_year(index)
        if len(creators) == 1:
            return "{} ({})".format(creators[0], year)
        if len(creators) == 2:
            return "{} and {} ({})".format(creators[0], creators[1], year)
        return "{} et al. ({})".format(creators[0], year)
Пример #2
0
def _zotero_item_sql_parts(item):
    """Build the bind parameters and SQL fragments used to store one item.

    Every field of item['data'] and item['meta'] is converted with
    schema._typeset_for_db() and gets a column of the same name. A 'note'
    field additionally yields a 'customJSON' column parsed from the note.

    Returns:
        A tuple (data, update_string, insert_field_string,
        insert_value_string): the bind parameters, the SET clause of an
        UPDATE, and the column list / value list of an INSERT.
    """
    item_type = item['data']['itemType']
    data = {}
    # 'key' and 'version' are seeded up front, so the result does not depend
    # on the order in which the item fields are iterated.
    updates = ['"version"=:version']
    insert_fields = ['"key"', '"version"']
    insert_values = [':key', ':version']

    def _add_column(field):
        updates.append('"%s"=:%s' % (field, field))
        insert_fields.append('"%s"' % field)
        insert_values.append(':%s' % field)

    for field, value in item['data'].items():
        data[field] = schema._typeset_for_db(field, value, item_type)
        if field not in ('key', 'version'):
            _add_column(field)
        if field == 'note':
            # NOTE(review): a note without a single-line {...} object makes
            # the [0] lookup raise IndexError - confirm that every synced
            # note is expected to embed JSON.
            data['customJSON'] = schema._typeset_for_db(
                "customJSON",
                json.loads(re.findall(r'{.*}', value)[0]),
                "note")
            _add_column("customJSON")
    for field, value in item['meta'].items():
        data[field] = schema._typeset_for_db(field, value, item_type)
        _add_column(field)
    return (data, ', '.join(updates), ', '.join(insert_fields),
            ', '.join(insert_values))


def _from_zotero_library(engine,
                         library_id,
                         library_type,
                         api_key=None,
                         verbose=False):
    """Mirror a Zotero library into its own schema of the local database.

    The sync attempt is logged in logs.zot_fetch. When the remote library
    version is newer than the last successfully synced version, the items
    changed since then are fetched in batches of 100 and inserted or
    updated; unless this is the initial sync, remotely deleted items are
    deleted locally as well. The log row is finally completed with the
    duration and the remote library version.

    Args:
        engine: Database engine; used through engine.connect().
        library_id: Zotero library identifier (int, or a string of digits).
        library_type: Zotero library type; its first letter is part of the
            schema name.
        api_key: Zotero API key, passed to the Zotero client.
        verbose: Passed to schema.for_library().
    """
    # int() lets a numeric string be used as library_id as well.
    library_type_id = "zot_%s_%i" % (library_type[:1], int(library_id))

    # Every library gets a separate schema within the database
    # (the returned dictionary of item table fields is not needed here).
    schema.for_library(engine, library_type_id, verbose)

    # Setup the Zotero connection through pyzotero
    z = Zotero(library_id, library_type, api_key)
    check_access = z.items(limit=1, format="json", includeTrashed=1)
    # NOTE(review): an empty result (presumably a library without any item)
    # makes this lookup raise IndexError - confirm the wanted behaviour.
    library_name = check_access[0]['library']['name']

    print("\n%s %s ¶" % (library_type_id, library_name))

    # Start the engine and fetch items from the cloud!
    with engine.connect() as db:
        # Start sync timer and log attempt to sync.
        # Duration and latest version will be updated when finished.
        query = """
        INSERT INTO logs.zot_fetch (timestamp, library, name)
        VALUES ( DEFAULT, :lib, :name) RETURNING id,timestamp;
        """
        sync = db.execute(text(query), lib=library_type_id,
                          name=library_name).fetchone()  # ( Int, datetime )
        print("Sync #%i was started at %s" % (sync[0], sync[1].strftime('%c')))

        # Get current local library version (bound parameter, as above)
        query = """
        SELECT version FROM logs.zot_fetch WHERE library=:lib AND duration IS NOT NULL ORDER BY timestamp DESC LIMIT 1;
        """
        res_last_sync_version = db.execute(
            text(query), lib=library_type_id).fetchone()  # ( Int, ) or None
        if res_last_sync_version:
            last_sync_version = res_last_sync_version[0]
            # A schema name cannot be a bound parameter, hence the
            # interpolation.
            query = """
            SELECT COUNT(*) FROM %s.items WHERE NOT deleted ;
            """ % library_type_id
            local_count = db.execute(
                text(query)).fetchone()  # ( Int, ) or None
            print("local mirror is at version %i and contains %i items" %
                  (last_sync_version, local_count[0]))
        else:
            last_sync_version = 0
            print("Starting initial sync of library %s" % library_type_id)

        # Get current remote library count and version
        z.top(limit=1, format='keys')
        remote_count = int(z.request.headers.get('total-results', 0))
        library_version = int(z.request.headers.get('last-modified-version',
                                                    0))
        print("remote cloud is at version %i and contains %i items" %
              (library_version, remote_count))

        if last_sync_version < library_version:
            # Get list of local item keys and their versions
            query = """
            SELECT key,version FROM %s.items ;
            """ % library_type_id
            local_versions = dict(db.execute(
                text(query)).fetchall())  # { String: Int, }

            def _fetch_updates_and_inserts(start=0):
                """Store all changed items, batch by batch; return the number of inserts."""
                inserts = 0
                # A loop (rather than one recursive call per batch) keeps
                # the call depth constant whatever the library size.
                while True:
                    start_round = _start_duration()
                    update_list = z.top(limit=100,
                                        start=start,
                                        format='json',
                                        since=last_sync_version,
                                        includeTrashed=1)
                    total_results = int(
                        z.request.headers.get('Total-Results'))
                    # Maybe there are only deletions to handle, so checking number of updates to handle
                    if not update_list:
                        round_duration = _duration(start_round)
                        print(
                            "Zero updates to process (it took %s seconds to figure that out)"
                            % str(round_duration))
                        break
                    for item in update_list:
                        (data, update_string, insert_field_string,
                         insert_value_string) = _zotero_item_sql_parts(item)
                        item_type = item['data']['itemType']
                        if item['key'] in local_versions:
                            query = """
                            UPDATE %s."%s"
                            SET %s
                            WHERE key=:key ;
                            """ % (library_type_id, item_type, update_string)
                            db.execute(text(query), **data)
                        else:
                            query = """
                            INSERT INTO %s."%s" (%s)
                            VALUES ( %s ) ;
                            """ % (library_type_id, item_type,
                                   insert_field_string, insert_value_string)
                            db.execute(text(query), **data)
                            inserts += 1
                    round_duration = _duration(start_round)
                    print("Finished processing %i updates in %s seconds." %
                          (len(update_list), str(round_duration)))
                    if len(update_list) == 100 and start + 100 < total_results:
                        print(
                            "%i of %i updates done: fetching more updates now."
                            % (start + 100, total_results))
                        start += 100
                    else:
                        print("%i of %i updates have been processed." %
                              (total_results, total_results))
                        break
                return inserts

            # fetch all updates in batches of 100 (includes updates to existing items and new items)
            _fetch_updates_and_inserts()

            def _fetch_deletions(since_version):
                """Delete locally the items deleted remotely; return the number of deletions."""
                deletions = 0
                start_round = _start_duration()
                print("Fetching list of deletions since last successful sync.")
                # Get list of deleted items from cloud
                delete_list = z.deleted(since=since_version)
                for item in delete_list['items']:
                    if item in local_versions:
                        query = """
                        DELETE FROM %s.items WHERE key=:key ;
                        """ % library_type_id
                        db.execute(text(query), key=item)
                        deletions += 1
                    else:
                        print(
                            "Tried to DELETE item with key %s, but this item is not in local library..."
                            % item)
                round_duration = _duration(start_round)
                print("Finished processing %i deletions in %s seconds" %
                      (len(delete_list['items']), str(round_duration)))
                return deletions

            # if this is not the initial sync, there's nothing to delete...
            if last_sync_version > 0:
                _fetch_deletions(last_sync_version)
            else:
                print(
                    "Initial sync has been successful. Next time atomic updates will be performed!"
                )
        else:
            print("Nothing to sync, everything is up to date.")

        # Complete the log row: a filled-in duration marks the sync as
        # finished for the "last sync version" lookup above.
        duration = _duration(sync[1])
        query = """
        UPDATE logs.zot_fetch
        SET duration=:duration, version=:version
        WHERE id=:id ;
        """
        db.execute(text(query),
                   duration=math.ceil(duration),
                   version=library_version,
                   id=sync[0])
        # Closing connection to database ༺ with engine.connect() as db : ༻
    print("Syncing library %s took %s seconds\n" %
          (library_type_id, str(duration)))
Пример #3
0
class ZoteroWrap:
    """Locally cached view of a Zotero library plus thin Zotero API wrappers.

    The reference list, the reference types and the reference templates are
    kept in memory and persisted together in a single pickle file, so that
    the (slow) download of the whole library only happens when no usable
    cache exists. References are the item dictionaries returned by the
    Zotero client (each one has at least a 'key' and a 'data' entry).
    """

    # Keys of the dictionary stored in the cache file.
    CACHE_REFERENCE_LIST = "references"
    CACHE_REFERENCE_TYPES = "reference_types"
    CACHE_REFERENCE_TEMPLATES = "reference_templates"

    def __init__(self, library_id, library_type, api_key, directory):
        """Prepare the cache path and the Zotero client. No I/O is done here.

        Args:
            library_id: Zotero library identifier, passed to ``Zotero``.
            library_type: Zotero library type, passed to ``Zotero``.
            api_key: Zotero API key, passed to ``Zotero``.
            directory: Directory in which the cache file is stored.
        """
        # NOTE(review): the API key ends up verbatim in the cache file name,
        # so it is visible in directory listings. Left unchanged so that
        # already existing cache files are still found.
        cache_filename = "{}-{}-{}.pkl".format(library_id, library_type, api_key)
        self.cache_path = os.path.join(directory, cache_filename)
        # reference_types and reference_templates must have the same ordering.
        self.reference_types = []
        self.reference_templates = {}
        self._zotero_lib = Zotero(library_id, library_type, api_key)
        self._references = []

    # Data I/O methods section.

    def initialize(self):
        """Load the cached Zotero data, or retrieve them if there is no usable cache.

        A missing cache file triggers a distant load, as does a cache file
        that is truncated, is not a valid pickle, or lacks an expected key.
        """
        try:
            self.load_cache()
        except FileNotFoundError:
            self.load_distant()
        except (EOFError, KeyError, pickle.UnpicklingError):
            # A damaged cache would otherwise make every start-up fail until
            # the file is removed by hand; rebuilding it is always possible.
            print("Cached Zotero data unreadable, reloading them.")
            self.load_distant()

    def load_cache(self):
        """Load the cached Zotero data.

        Raises:
            FileNotFoundError: If the cache file does not exist.
            KeyError: If the cache lacks one of the expected entries.
        """
        with open(self.cache_path, "rb") as f:
            print("Loading cached Zotero data...")
            # NOTE(security): pickle.load() can execute arbitrary code; the
            # cache file must only ever be written by this class.
            cache = pickle.load(f)
        # Read every entry before assigning anything, so that an incomplete
        # cache leaves the instance untouched.
        references = cache[self.CACHE_REFERENCE_LIST]
        reference_types = cache[self.CACHE_REFERENCE_TYPES]
        reference_templates = cache[self.CACHE_REFERENCE_TEMPLATES]
        self._references = references
        self.reference_types = reference_types
        self.reference_templates = reference_templates
        print("Cached Zotero data loaded.")

    def load_distant(self):
        """Load the distant Zotero data, then cache them."""
        print("Loading distant Zotero data...")
        self._references = self.get_references()
        self.reference_types = self.get_reference_types()
        self.reference_templates = self.get_reference_templates(self.reference_types)
        print("Distant Zotero data loaded.")
        self.cache()

    def cache(self):
        """Cache the Zotero data.

        The data are written to a temporary sibling file which then replaces
        the cache file, so an interrupted write cannot leave a truncated
        cache behind.
        """
        cache = {self.CACHE_REFERENCE_LIST: self._references,
                 self.CACHE_REFERENCE_TYPES: self.reference_types,
                 self.CACHE_REFERENCE_TEMPLATES: self.reference_templates}
        tmp_path = self.cache_path + ".tmp"
        with open(tmp_path, "wb") as f:
            pickle.dump(cache, f)
        os.replace(tmp_path, self.cache_path)

    def create_local_reference(self, ref):
        """Append the reference at the end of the reference list and cache it."""
        self._references.append(ref)
        self.cache()

    def create_distant_reference(self, ref_data):
        """Validate and create the reference in Zotero and return the created item.

        Raises:
            InvalidZoteroItemError: If the reference data are not valid.
            CreateZoteroItemError: If Zotero does not report the item as
                successfully created.
        """
        self.validate_reference_data(ref_data)
        creation_status = self._zotero_lib.create_items([ref_data])
        try:
            # A single item was submitted, hence the "0" entry.
            return creation_status["successful"]["0"]
        except KeyError as e:
            print(creation_status)
            raise CreateZoteroItemError from e

    def update_local_reference(self, index, ref):
        """Replace the reference in the reference list and cache it."""
        self._references[index] = ref
        self.cache()

    def update_distant_reference(self, ref):
        """Validate and update the reference in Zotero.

        Existing fields not present will be left unmodified.

        Raises:
            InvalidZoteroItemError: If the reference data are not valid.
        """
        self.validate_reference_data(ref["data"])
        self._zotero_lib.update_item(ref)

    def validate_reference_data(self, ref_data):
        """Validate the reference data.

        Zotero.check_items() caches data after the first API call.

        Raises:
            InvalidZoteroItemError: If Zotero.check_items() rejects the data.
        """
        try:
            self._zotero_lib.check_items([ref_data])
        except InvalidItemFields as e:
            raise InvalidZoteroItemError from e

    def get_references(self):
        """Return all references in the Zotero database. Takes time..."""
        return self._zotero_lib.everything(self._zotero_lib.top())

    def get_reference_types(self):
        """Return the sorted reference types.

        Zotero.item_types() caches data after the first API call.
        """
        item_types = self._zotero_lib.item_types()
        return sorted(x["itemType"] for x in item_types)

    def get_reference_templates(self, ref_types):
        """Return the reference templates for the types as an ordered dictionary."""
        return OrderedDict((x, self.get_reference_template(x)) for x in ref_types)

    def get_reference_template(self, ref_type):
        """Return the reference template for the type as an ordered dictionary.

        The entries are sorted by field name.
        Zotero.item_template() caches data after the first API call.
        """
        template = self._zotero_lib.item_template(ref_type)
        return OrderedDict(sorted(template.items(), key=lambda x: x[0]))

    def get_reference(self, ref_key):
        """Return the reference for the key (fetched from Zotero)."""
        return self._zotero_lib.item(ref_key)

    # Public @properties surrogates section.

    def reference_count(self):
        """Return the number of references."""
        return len(self._references)

    def reference_data(self, index):
        """Return the 'data' field of the reference."""
        return self._references[index]["data"]

    def reference_extra_field(self, field, index):
        """Return the value of the field in 'extra', otherwise ''.

        'extra' is read as one "<field>: <value>" entry per line; the first
        line starting with "<field>:" wins. A reference without an 'extra'
        entry simply has no extra field.
        """
        # Not every item carries 'extra'; a missing entry must not abort the
        # scans done through reference_id().
        extra = self.reference_data(index).get("extra") or ""
        field_id = field + ":"
        for line in extra.split("\n"):
            if line.startswith(field_id):
                return line[len(field_id):].strip()
        return ""

    def reference_type(self, index):
        """Return the reference type."""
        return self.reference_data(index)["itemType"]

    def reference_key(self, index):
        """Return the reference key."""
        return self._references[index]["key"]

    def reference_id(self, index):
        """Return the reference ID (locally defined), or '' if there is none.

        The ID is, by order of preference, the DOI, "PMID_<pmid>" or
        "UNPUBLISHED_<id>".
        """
        # TODO Include ISBN and ISSN?
        doi = self.reference_doi(index)
        if doi:
            return doi
        pmid = self.reference_pmid(index)
        if pmid:
            return "PMID_" + pmid
        unpublished_id = self.reference_unpublished_id(index)
        if unpublished_id:
            return "UNPUBLISHED_" + unpublished_id
        return ""

    def reference_doi(self, index):
        """Return the reference DOI.

        The 'DOI' field is used when it is filled in; otherwise the DOI is
        looked up in 'extra' ('' if there is none).
        """
        doi = self.reference_data(index).get("DOI")
        if doi:
            return doi
        # Only consult 'extra' when needed, including when the 'DOI' field
        # exists but is empty.
        return self.reference_extra_field("DOI", index)

    def reference_pmid(self, index):
        """Return the reference PMID."""
        return self.reference_extra_field("PMID", index)

    def reference_unpublished_id(self, index):
        """Return the reference UNPUBLISHED ID."""
        return self.reference_extra_field("UNPUBLISHED", index)

    def reference_title(self, index):
        """Return the reference title."""
        return self.reference_data(index)["title"]

    def reference_creator_surnames(self, index):
        """Return as a list the surnames of the reference creators (locally defined).

        Only the authors are returned when there is at least one author,
        otherwise all the creators. An empty list is returned when one of
        the selected creators has no 'lastName'.
        """
        # TODO Not true, ex: ISBN 978-1-4398-3778-8. Return all creator types?
        # Academic books published as a collection of chapters contributed by
        # different authors have editors but not authors at the level of the
        # book (as opposed to the level of a chapter).
        creators = self.reference_data(index)["creators"]
        has_author = any(x["creatorType"] == "author" for x in creators)
        if has_author:
            selected = [x for x in creators if x["creatorType"] == "author"]
        else:
            selected = creators
        # 'name' (not split) might be used instead of 'firstName' and 'lastName'.
        try:
            return [x["lastName"] for x in selected]
        except KeyError:
            return []

    def reference_creator_surnames_str(self, index):
        """Return as a string the surnames of the reference creators (locally defined)."""
        # NB: str.join() returns an empty string for an empty list.
        return ", ".join(self.reference_creator_surnames(index))

    def reference_date(self, index):
        """Return the reference publication date."""
        return self.reference_data(index)["date"]

    def reference_year(self, index):
        """Return the reference publication year.

        Returns the year as an int, or '' when neither the date parser nor a
        search for four consecutive digits finds one.
        """
        # TODO Use meta:parsedDate field instead?
        ref_date = self.reference_date(index)
        try:
            # NB: datetime.year returns an int.
            return parse(ref_date).year
        except (ValueError, OverflowError):
            # NOTE(review): OverflowError assumes parse() is dateutil's
            # parser, which documents it for out-of-range values - confirm.
            matched = re.search(r"\d{4}", ref_date)
            if matched:
                return int(matched.group())
            return ""

    def reference_journal(self, index):
        """Return the reference journal name, or "(<type>)" for non-articles."""
        # TODO Change the column name 'Journal' to an other?
        ref_type = self.reference_type(index)
        if ref_type == "journalArticle":
            return self.reference_data(index)["publicationTitle"]
        return "({})".format(ref_type)

    # Public methods section.

    def reference_index(self, ref_id):
        """Return the index of the first reference with this ID.

        Raises:
            ReferenceNotFoundError: If no reference has this ID.
        """
        try:
            indexes = range(self.reference_count())
            return next(i for i in indexes if self.reference_id(i) == ref_id)
        except StopIteration as e:
            # format() keeps the error report working for non-string IDs.
            raise ReferenceNotFoundError("ID: {}".format(ref_id)) from e

    def reference_creators_citation(self, ref_id):
        """Return for citation the creator surnames (locally defined) and the publication year.

        Returns '' when the reference has no usable creator surname.

        Raises:
            ReferenceNotFoundError: If no reference has this ID.
        """
        # FIXME Delayed refactoring. Use an index instead of an ID.
        index = self.reference_index(ref_id)
        creators = self.reference_creator_surnames(index)
        if not creators:
            return ""
        year = self.reference_year(index)
        if len(creators) == 1:
            return "{} ({})".format(creators[0], year)
        if len(creators) == 2:
            return "{} and {} ({})".format(creators[0], creators[1], year)
        return "{} et al. ({})".format(creators[0], year)