Example #1
File: common.py  Project: cclauss/fatcat
    def match_existing_release_fuzzy(
            self, release: ReleaseEntity
    ) -> Optional[Tuple[str, str, ReleaseEntity]]:
        """
        This helper function uses fuzzycat (and elasticsearch) to look for
        existing release entities with similar metadata.

        Returns None if there was no match of any kind, or a single tuple
        (status: str, reason: str, existing: ReleaseEntity) if there was a match.

        The status string is one of the fuzzycat.common.Status values, with
        "strongest match" first in this sorted order:

        - EXACT
        - STRONG
        - WEAK
        - AMBIGUOUS

        Eg, if there is any EXACT match, that is always returned; an AMBIGUOUS
        result is only returned if all the candidate matches were ambiguous.
        """

        # this map is used to establish the priority order of verified matches
        STATUS_SORT = {
            fuzzycat.common.Status.TODO: 0,
            fuzzycat.common.Status.EXACT: 10,
            fuzzycat.common.Status.STRONG: 20,
            fuzzycat.common.Status.WEAK: 30,
            fuzzycat.common.Status.AMBIGUOUS: 40,
            fuzzycat.common.Status.DIFFERENT: 60,
        }

        # TODO: the size here is a first guess; what should it really be?
        candidates = match_release_fuzzy(release, size=10, es=self.es_client)
        if not candidates:
            return None

        release_dict = entity_to_dict(release, api_client=self.api.api_client)
        verified = [(
            fuzzycat.verify.verify(
                release_dict, entity_to_dict(c,
                                             api_client=self.api.api_client)),
            c,
        ) for c in candidates]

        # choose the "closest" match
        closest = sorted(verified, key=lambda v: STATUS_SORT[v[0].status])[0]
        if closest[0].status == fuzzycat.common.Status.DIFFERENT:
            return None
        elif closest[0].status == fuzzycat.common.Status.TODO:
            raise NotImplementedError("fuzzycat verify hit a Status.TODO")
        else:
            return (closest[0].status.name, closest[0].reason.value,
                    closest[1])
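
A hedged usage sketch of the helper above (the configured importer instance and the candidate release are assumptions, not part of the original code):

# hypothetical caller; assumes an importer wired up with es_client and api
release = ReleaseEntity(title="Some Paper Title", ext_ids=ReleaseExtIds())
match = importer.match_existing_release_fuzzy(release)
if match is None:
    print("no match of any kind")
else:
    status, reason, existing = match
    print(status, reason, existing.ident)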
Example #2
    def try_update(self, re):
        """
        When debug is true, write the RE to stdout, not to the database. Might
        hide schema mismatch bugs.
        """
        if self.debug is True:
            print(json.dumps(entity_to_dict(re, api_client=None)))
            return False

        # lookup existing DOI (don't need to try other ext idents for crossref)
        existing = None
        try:
            existing = self.api.lookup_release(doi=re.ext_ids.doi)
        except fatcat_openapi_client.rest.ApiException as err:
            if err.status != 404:
                raise err
            # doesn't exist, so proceed with insert
            return True

        # eventually we'll want to support "updates", but for now just skip if
        # entity already exists
        if existing:
            self.counts['exists'] += 1
            return False

        return True
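
A sketch of the calling convention (the surrounding loop is an assumption): a True return means "proceed with insert", False means "skip this record".

# hypothetical importer loop; only try_update() comes from the code above
if importer.try_update(release_entity):
    batch.append(release_entity)  # will be flushed via insert_batch() later
# else: skipped (already exists, or debug mode printed it to stdout)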
Example #3
    def insert_batch(self, batch):
        print('inserting batch ({})'.format(len(batch)), file=sys.stderr)
        if self.insert_log_file:
            with open(self.insert_log_file, 'a') as f:
                for doc in batch:
                    json.dump(entity_to_dict(doc, api_client=None), f)
                    f.write('\n')
        self.api.create_release_auto_batch(
            fatcat_openapi_client.ReleaseAutoBatch(
                editgroup=fatcat_openapi_client.Editgroup(
                    description=self.editgroup_description,
                    extra=self.editgroup_extra),
                entity_list=batch))
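
A minimal sketch of how batches might be accumulated and flushed (batch_size and the parsed_releases iterable are assumptions):

batch = []
for release_entity in parsed_releases:
    batch.append(release_entity)
    if len(batch) >= batch_size:
        importer.insert_batch(batch)
        batch = []
if batch:
    importer.insert_batch(batch)  # flush the remainder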
Example #4
File: common.py  Project: cclauss/fatcat
    def push_record(self, record: Dict[str, Any]) -> None:
        """
        Intended to be called by a "pusher" class (which could be pulling from
        a JSON file, Kafka, or whatever).

        Input is expected to be an entity in JSON-like dict form.

        Returns nothing.
        """
        self.counts["lines"] += 1
        if not record:
            self.counts["skip-null"] += 1
            return

        entity = entity_from_dict(record, self.entity_type, api_client=self.ac)

        if entity.state != "active":
            self.counts["skip-inactive"] += 1
            return

        cleaned = self.clean_entity(copy.deepcopy(entity))
        if entity == cleaned:
            self.counts["skip-clean"] += 1
            return
        else:
            self.counts["cleaned"] += 1

        if self.dry_run_mode:
            entity_dict = entity_to_dict(entity, api_client=self.ac)
            print(json.dumps(entity_dict))
            return

        if entity.ident in self._idents_inflight:
            raise ValueError(
                "Entity already part of in-process update: {}".format(
                    entity.ident))

        updated = self.try_update(cleaned)
        if updated:
            self.counts["updated"] += updated
            self._edit_count += updated
            self._idents_inflight.append(entity.ident)

        if self._edit_count >= self.edit_batch_size:
            self.api.accept_editgroup(self._editgroup_id)
            self._editgroup_id = None
            self._edit_count = 0
            self._idents_inflight = []
        return
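
A minimal "pusher" sketch in the spirit of the docstring above (the JsonLinePusher name and constructor are illustrative; only the push_record() call matches the code shown here):

import json
import sys

class JsonLinePusher:
    def __init__(self, worker, json_file):
        self.worker = worker        # object exposing push_record()
        self.json_file = json_file  # open file of JSON lines

    def run(self):
        for line in self.json_file:
            if not line.strip():
                continue
            record = json.loads(line)
            self.worker.push_record(record)
        print(self.worker.counts, file=sys.stderr)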
Example #5
def test_datacite_conversions(datacite_importer):
    """
    Datacite JSON to release entity JSON representation. The count is hardcoded
    for now.
    """
    datacite_importer.debug = True
    for i in range(35):
        src = "tests/files/datacite/datacite_doc_{0:02d}.json".format(i)
        dst = "tests/files/datacite/datacite_result_{0:02d}.json".format(i)
        with open(src, "r") as f:
            re = datacite_importer.parse_record(json.load(f))
            result = entity_to_dict(re)
        with open(dst, "r") as f:
            expected = json.loads(f.read())

        assert result == expected, "output mismatch in {}".format(dst)
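
The fixtures above rely on entity_to_dict() being a stable serialization; a quick round-trip sketch using entity_from_dict(), its counterpart seen in Example #4 (assumes api_client=None is accepted, as it is for entity_to_dict above):

re = entity_from_dict(expected, ReleaseEntity, api_client=None)
assert entity_to_dict(re) == expected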
Example #6
    def parse_record(self, xml_elem: Any) -> Optional[ReleaseEntity]:
        """
        - title
            => may contain <i>, <sub>, <sup>, <tt>
        - journal (abbrev?)
        - volume, pages, number (number -> issue)
        - publisher
        - year
            => for conferences, year of conference not of publication
        - month
        - crossref (from inproceedings to specific proceedings volume)
        - booktitle
            => for inproceedings, this is the name of the conference or workshop (often an acronym)
        - isbn
        """

        dblp_key = xml_elem.get("key")
        if not dblp_key:
            self.counts["skip-empty-key"] += 1
            return None
        dblp_key_type = dblp_key.split("/")[0]

        # dblp_prefix may be used for container lookup
        dblp_prefix = None
        if dblp_key_type in ("journals", "conf"):
            dblp_prefix = "/".join(dblp_key.split("/")[:2])
        elif dblp_key_type in ("series", "reference", "tr", "books"):
            dblp_prefix = "/".join(dblp_key.split("/")[:-1])

        publtype = xml_elem.get("publtype") or None

        dblp_type = xml_elem.name
        if dblp_type not in self.ELEMENT_TYPES:
            self.counts[f"skip-dblp-type:{dblp_type}"] += 1
            return None

        if dblp_key_type in ("homepages", "persons", "dblpnote"):
            self.counts["skip-key-type"] += 1
            return None

        if dblp_key.startswith("journals/corr/"):
            self.counts["skip-arxiv-corr"] += 1
            return None

        title = clean_str(" ".join(xml_elem.title.stripped_strings),
                          force_xml=True)
        if not title:
            self.counts["skip-title"] += 1
            return None
        if title.endswith("."):
            title = title[:-1]

        release_type = None
        release_stage = "published"
        withdrawn_status = None

        # primary release_type detection: type of XML element, then prefix of key for granularity
        if dblp_type == "article":
            release_type = "article"
            if dblp_key_type == "journals" and publtype != "informal":
                release_type = "article-journal"
            elif dblp_key_type == "tr":
                release_type = "report"
            elif title.startswith("Review:"):
                release_type = "review"
        elif dblp_type == "inproceedings":
            release_type = "paper-conference"
        elif dblp_type == "book":
            release_type = "book"
        elif dblp_type == "incollection":
            # XXX: part vs. chapter?
            release_type = "chapter"
        elif dblp_type == "data":
            release_type = "dataset"
        elif dblp_type in ("mastersthesis", "phdthesis"):
            release_type = "thesis"

        # overrides/extensions of the above
        if publtype == "informal":
            # for conferences, seems to indicate peer-review status
            # for journals, seems to indicate things like book reviews; split out above
            pass
        elif publtype == "encyclopedia":
            release_type = "entry-encyclopedia"
        elif publtype == "edited":
            # XXX: article?
            release_type = "editorial"
        elif publtype == "data":
            release_type = "dataset"
        elif publtype == "data":
            release_type = "dataset"
        elif publtype == "software":
            release_type = "software"
        elif publtype == "widthdrawn":
            withdrawn_status = "widthdrawn"
        elif publtype == "survey":
            # XXX: flag as a review/survey article?
            pass

        # print((release_type, dblp_type, dblp_key_type, publtype), file=sys.stderr)

        container_name = None
        booktitle = clean_str(xml_elem.booktitle and xml_elem.booktitle.text)
        series = clean_str(xml_elem.series and xml_elem.series.text)

        if xml_elem.journal:
            container_name = clean_str(xml_elem.journal.text)

        container_id = None
        if dblp_prefix:
            container_id = self.lookup_dblp_prefix(dblp_prefix)
            # note: we will skip later if we couldn't find the prefix

        publisher = clean_str(xml_elem.publisher and xml_elem.publisher.text)
        volume = clean_str(xml_elem.volume and xml_elem.volume.text)
        issue = clean_str(xml_elem.number and xml_elem.number.text)
        pages = clean_str(xml_elem.pages and xml_elem.pages.text)
        release_year_str = clean_str(xml_elem.year and xml_elem.year.text)
        if release_year_str and release_year_str.isdigit():
            release_year: Optional[int] = int(release_year_str)
        else:
            release_year = None
        release_month = parse_month(
            clean_str(xml_elem.month and xml_elem.month.text))
        isbn = clean_isbn13(xml_elem.isbn and xml_elem.isbn.text)
        part_of_key = clean_str(xml_elem.crossref and xml_elem.crossref.text)

        # block bogus far-future years/dates
        if release_year is not None and (release_year > (self.this_year + 5)
                                         or release_year < 1000):
            release_month = None
            release_year = None

        contribs = self.dblp_contribs(xml_elem)
        ext_ids = self.dblp_ext_ids(xml_elem, dblp_key)
        if isbn:
            ext_ids.isbn13 = isbn
        if ext_ids.doi:
            self.counts["has-doi"] += 1

        # dblp-specific extra
        dblp_extra = dict(type=dblp_type)
        note = clean_str(xml_elem.note and xml_elem.note.text)
        if note and "base-search.net" not in note:
            dblp_extra["note"] = note
        if part_of_key:
            dblp_extra["part_of_key"] = part_of_key

        # generic extra
        extra: Dict[str, Any] = dict()
        if not container_id and container_name:
            extra["container_name"] = container_name

        if series and (dblp_key_type == "series" or dblp_type == "book"):
            extra["series-title"] = series
        elif series:
            dblp_extra["series"] = series

        if booktitle and dblp_key_type == "series":
            extra["container-title"] = booktitle
        elif booktitle and dblp_key_type == "conf":
            extra["event"] = booktitle
        elif booktitle:
            dblp_extra["booktitle"] = booktitle

        if release_year and release_month:
            # TODO: release_month schema migration
            extra["release_month"] = release_month

        if dblp_extra:
            extra["dblp"] = dblp_extra

        re = fatcat_openapi_client.ReleaseEntity(
            work_id=None,
            container_id=container_id,
            release_type=release_type,
            release_stage=release_stage,
            withdrawn_status=withdrawn_status,
            title=title,
            release_year=release_year,
            # release_date,
            publisher=publisher,
            ext_ids=ext_ids,
            contribs=contribs or None,
            volume=volume,
            issue=issue,
            pages=pages,
            extra=extra or None,
        )
        re = self.biblio_hacks(re)

        if self.dump_json_mode:
            re_dict = entity_to_dict(re, api_client=self.api.api_client)
            re_dict["_dblp_ee_urls"] = self.dblp_ext_urls(xml_elem)
            re_dict["_dblp_prefix"] = dblp_prefix
            print(json.dumps(re_dict, sort_keys=True))
            return None

        if not re.container_id:
            self.counts["skip-dblp-container-missing"] += 1
            return None
        return re
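
A hedged sketch of feeding parse_record() a dblp element via BeautifulSoup (the XML snippet and configured importer are assumptions; an unresolved container prefix would still cause a skip):

from bs4 import BeautifulSoup

xml = """
<article key="journals/example/Doe20">
  <title>An Example Article.</title>
  <journal>Example Journal</journal>
  <year>2020</year>
</article>
"""
soup = BeautifulSoup(xml, "xml")
release = importer.parse_record(soup.article)
if release:
    print(release.title, release.release_year)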
Example #7
    def parse_record(self, xml_elem):
        """
        - title
            => may contain <i>, <sub>, <sup>, <tt>
        - journal (abbrev?)
        - volume, pages, number (number -> issue)
        - publisher
        - year
            => for conferences, year of conference not of publication
        - month
        - crossref (from inproceedings to specific proceedings volume)
        - booktitle
            => for inproceedings, this is the name of the conference or workshop (often an acronym)
        - isbn
        """

        dblp_key = xml_elem.get('key')
        if not dblp_key:
            self.counts['skip-empty-key'] += 1
            return False
        dblp_key_type = dblp_key.split('/')[0]

        # dblp_prefix may be used for container lookup
        dblp_prefix = None
        if dblp_key_type in ('journals', 'conf'):
            dblp_prefix = '/'.join(dblp_key.split('/')[:2])
        elif dblp_key_type in ('series', 'reference', 'tr', 'books'):
            dblp_prefix = '/'.join(dblp_key.split('/')[:-1])

        publtype = xml_elem.get('publtype') or None

        dblp_type = xml_elem.name
        if dblp_type not in self.ELEMENT_TYPES:
            self.counts[f'skip-dblp-type:{dblp_type}'] += 1
            return False

        if dblp_key_type in ('homepages', 'persons', 'dblpnote'):
            self.counts['skip-key-type'] += 1
            return False

        if dblp_key.startswith('journals/corr/'):
            self.counts['skip-arxiv-corr'] += 1
            return False

        title = clean_str(" ".join(xml_elem.title.stripped_strings), force_xml=True)
        if not title:
            self.counts['skip-title'] += 1
            return False
        if title.endswith('.'):
            title = title[:-1]

        release_type = None
        release_stage = 'published'
        withdrawn_status = None

        # primary release_type detection: type of XML element, then prefix of key for granularity
        if dblp_type == 'article':
            release_type = 'article'
            if dblp_key_type == 'journals' and publtype != 'informal':
                release_type = 'article-journal'
            elif dblp_key_type == 'tr':
                release_type = 'report'
            elif title.startswith("Review:"):
                release_type = 'review'
        elif dblp_type == 'inproceedings':
            release_type = 'paper-conference'
        elif dblp_type == 'book':
            release_type = 'book'
        elif dblp_type == 'incollection':
            # XXX: part vs. chapter?
            release_type = 'chapter'
        elif dblp_type == 'data':
            release_type = 'dataset'
        elif dblp_type in ('mastersthesis', 'phdthesis'):
            release_type = 'thesis'

        # overrides/extensions of the above
        if publtype == 'informal':
            # for conferences, seems to indicate peer-review status
            # for journals, seems to indicate things like book reviews; split out above
            pass
        elif publtype == 'encyclopedia':
            release_type = 'entry-encyclopedia'
        elif publtype == 'edited':
            # XXX: article?
            release_type = 'editorial'
        elif publtype == 'data':
            release_type = 'dataset'
        elif publtype == 'software':
            release_type = 'software'
        elif publtype == 'withdrawn':
            withdrawn_status = 'withdrawn'
        elif publtype == 'survey':
            # XXX: flag as a review/survey article?
            pass

        #print((release_type, dblp_type, dblp_key_type, publtype), file=sys.stderr)

        container_name = None
        booktitle = clean_str(xml_elem.booktitle and xml_elem.booktitle.text)
        series = clean_str(xml_elem.series and xml_elem.series.text)

        if xml_elem.journal:
            container_name = clean_str(xml_elem.journal.text)

        container_id = None
        if dblp_prefix:
            container_id = self.lookup_dblp_prefix(dblp_prefix)
            # note: we will skip later if we couldn't find the prefix

        publisher = clean_str(xml_elem.publisher and xml_elem.publisher.text)
        volume = clean_str(xml_elem.volume and xml_elem.volume.text)
        issue = clean_str(xml_elem.number and xml_elem.number.text)
        pages = clean_str(xml_elem.pages and xml_elem.pages.text)
        release_year = clean_str(xml_elem.year and xml_elem.year.text)
        if release_year and release_year.isdigit():
            release_year = int(release_year)
        else:
            release_year = None
        release_month = parse_month(clean_str(xml_elem.month and xml_elem.month.text))
        isbn = clean_isbn13(xml_elem.isbn and xml_elem.isbn.text)
        part_of_key = clean_str(xml_elem.crossref and xml_elem.crossref.text)

        # block bogus far-future years/dates
        if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000):
            release_month = None
            release_year = None

        contribs = self.dblp_contribs(xml_elem or [])
        ext_ids = self.dblp_ext_ids(xml_elem, dblp_key)
        if isbn:
            ext_ids.isbn13 = isbn
        if ext_ids.doi:
            self.counts['has-doi'] += 1

        # dblp-specific extra
        dblp_extra = dict(type=dblp_type)
        note = clean_str(xml_elem.note and xml_elem.note.text)
        if note and 'base-search.net' not in note:
            dblp_extra['note'] = note
        if part_of_key:
            dblp_extra['part_of_key'] = part_of_key

        # generic extra
        extra = dict()
        if not container_id and container_name:
            extra['container_name'] = container_name

        if series and (dblp_key_type == 'series' or dblp_type == 'book'):
            extra['series-title'] = series
        elif series:
            dblp_extra['series'] = series

        if booktitle and dblp_key_type == 'series':
            extra['container-title'] = booktitle
        elif booktitle and dblp_key_type == 'conf':
            extra['event'] = booktitle
        elif booktitle:
            dblp_extra['booktitle'] = booktitle

        if release_year and release_month:
            # TODO: release_month schema migration
            extra['release_month'] = release_month

        if dblp_extra:
            extra['dblp'] = dblp_extra
        if not extra:
            extra = None

        re = fatcat_openapi_client.ReleaseEntity(
            work_id=None,
            container_id=container_id,
            release_type=release_type,
            release_stage=release_stage,
            withdrawn_status=withdrawn_status,
            title=title,
            release_year=release_year,
            #release_date,
            publisher=publisher,
            ext_ids=ext_ids,
            contribs=contribs,
            volume=volume,
            issue=issue,
            pages=pages,
            extra=extra,
        )
        re = self.biblio_hacks(re)

        if self.dump_json_mode:
            re_dict = entity_to_dict(re, api_client=self.api.api_client)
            re_dict['_dblp_ee_urls'] = self.dblp_ext_urls(xml_elem)
            re_dict['_dblp_prefix'] = dblp_prefix
            print(json.dumps(re_dict, sort_keys=True))
            return False

        if not re.container_id:
            self.counts["skip-dblp-container-missing"] += 1
            return False
        return re