Example #1
def get_author_year_score_for_input_fields(result_record, hypothesis):
    """
    returns evidences based on just author and year.

    For most sources, you should use get_basic_score_for_input_fields instead
    -- see there for more information.
    
    :param result_record: 
    :param hypothesis: 
    :return: 
    """
    input_fields = hypothesis.get_detail('input_fields')

    evidences = Evidences()

    normalized_authors = hypothesis.get_detail('normalized_authors')
    if normalized_authors is None:
        normalized_authors = normalize_author_list(
            input_fields.get('author', ''))

    add_author_evidence(evidences,
                        normalized_authors,
                        result_record['author_norm'],
                        result_record['first_author_norm'],
                        has_etal=hypothesis.get_detail('has_etal'))

    add_year_evidence(evidences, input_fields.get('year'),
                      result_record.get('year'))

    return evidences
Example #2
def get_author_year_score_for_input_fields(result_record, hypothesis):
    """
    returns evidences based on just author and year.

    :param result_record:
    :param hypothesis: 
    :return: 
    """
    input_fields = hypothesis.get_detail('input_fields')

    normalized_authors = hypothesis.get_detail('normalized_authors')
    if normalized_authors is None:
        normalized_authors = normalize_author_list(
            input_fields.get('author', ''))

    evidences = Evidences()

    add_author_evidence(evidences,
                        normalized_authors,
                        result_record['author_norm'],
                        result_record['first_author_norm'],
                        has_etal=hypothesis.get_detail('has_etal'))

    add_year_evidence(evidences, input_fields.get('year'),
                      result_record.get('year'))

    return evidences
Example #3
def make_solr_condition_author(value):
    """

    :param value:
    :return:
    """
    # only if not already normalized
    if ";" not in value:
        value = re.sub(
            r"\.( ?[A-Z]\.)*",
            "",
            # ... and silly "double initials"
            re.sub(r"-[A-Z]\.", "",
                   normalize_author_list(value, initials='.' in value)))
    # something went wrong with normalization,
    # so grab all last names and insert semicolons between them
    if ";" not in value:
        lastname = '; '.join(AUTHOR_LAST_NAME.findall(value))
        # most probably the last name is not capitalized,
        # so grab the words case-insensitively
        if len(lastname) == 0:
            lastname = '; '.join(
                AUTHOR_LAST_NAME_CASE_INSENSITIVE.findall(value))
        value = lastname
    # authors fields have special serialization rules
    return " AND ".join('"%s"' % s.strip() for s in value.split(";"))
Example #4
def make_solr_condition_author(value):
    """

    :param value:
    :return:
    """
    value = re.sub(
        r"\.( ?[A-Z]\.)*",
        "",
        # ... and silly "double initials"
        re.sub("-[A-Z]\.", "",
               normalize_author_list(value, initials='.' in value)))
    # authors fields have special serialization rules
    return " AND ".join('"%s"' % s.strip() for s in value.split(";"))
Example #5
    def make_digested_record(self):
        """
        adds a digested_record attribute from field_mappings and self.ref.

        This is exclusively called by the constructor.
        :return:
        """
        self.digested_record = {}
        for dest_key, src_key in self.field_mappings:
            value = self.ref.get(src_key)
            if value:
                self.digested_record[dest_key] = value

        self.normalized_authors = None
        if "author" in self.digested_record:
            self.digested_record["author"] = self.ETAL_PAT.sub(
                '', self.digested_record["author"])
            self.normalized_authors = normalize_author_list(
                self.digested_record["author"], initials=True)
            self.normalized_first_author = re.sub(
                r"\.( ?[A-Z]\.)*", "",
                re.sub("-[A-Z]\.", "",
                       self.normalized_authors)).split(";")[0].strip()

        if "year" in self.digested_record and len(
                self.digested_record["year"]) > 4:
            # the extra character(s) are at the end, so use a regex to pull out just the four-digit year
            self.digested_record["year"] = re.findall(
                r'^([12][089]\d\d)', self.digested_record["year"])[0]

        if "-" in self.digested_record.get("page", ""):
            # we query on the page start, so for now throw out the page end
            self.digested_record["page"] = self.digested_record["page"].split(
                "-")[0]

        if "volume" in self.digested_record and "pub" in self.digested_record:
            # if volume has an alpha character at the beginning, remove it and attach it to the journal
            # e.g. A. Arvanitaki, S. Dimopoulos, S. Dubovsky, N. Kaloper, and J. March-Russell, "String Axiverse," "Phys. Rev.", vol. D81, p. 123530, 2010.
            # which is in fact journal `Phys. Rev. D`, volume `81`
            match = self.JOURNAL_LETTER_ATTACHED_VOLUME.match(
                self.digested_record["volume"])
            if match:
                self.digested_record["pub"] = '%s %s' % (
                    self.digested_record["pub"],
                    self.digested_record["volume"][0])
                self.digested_record["volume"] = self.digested_record[
                    "volume"][1:]
Example #6
def get_score_for_baas_match(result_record, hypothesis):
    """
    scores a BAAS->DDA match.

    For these, volume and page are hidden deep inside pub_raw.

    We also expect an expected_bibstem detail in the hypothesis, mainly
    for robustness in case this gets used to score something else.

    :param result_record:
    :param hypothesis:
    :return:
    """
    evidences = Evidences()
    if not re.match(r'....%s' % hypothesis.get_detail('expected_bibstem'),
                    result_record['bibcode']):
        evidences.add_evidence(current_app.config['EVIDENCE_SCORE_RANGE'][0],
                               'no DDA bibcode')
        return evidences

    input_fields = hypothesis.get_detail('input_fields')

    normalized_authors = hypothesis.get_detail('normalized_authors')
    if normalized_authors is None:
        normalized_authors = normalize_author_list(
            input_fields.get('author', ''))

    add_author_evidence(evidences, normalized_authors,
                        result_record['author_norm'],
                        result_record['first_author_norm'])

    add_boolean_evidence(
        evidences, 'Vol. %s' % input_fields['volume']
        in result_record['pub_raw'], 'vol in pub_raw?')

    add_boolean_evidence(
        evidences,
        re.search(r'p\.\s*%s\b' % input_fields['page'],
                  result_record['pub_raw']), 'page in pub_raw?')

    return evidences
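A rough sketch of the two pub_raw checks used above, run against a made-up BAAS-style pub_raw string (the field layout is an assumption, not real ADS data):

import re

pub_raw = "Bulletin of the American Astronomical Society, Vol. 25, p. 1234"
volume, page = "25", "1234"

print('Vol. %s' % volume in pub_raw)                   # -> True  (vol in pub_raw?)
print(bool(re.search(r'p\.\s*%s\b' % page, pub_raw)))  # -> True  (page in pub_raw?)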
Example #7
    def make_digested_record(self):
        """
        adds a digested_record attribute from field_mappings and self.ref.

        This is exclusively called by the constructor.
        :return:
        """
        self.digested_record = {}
        for dest_key, src_key in self.field_mappings:
            value = self.ref.get(src_key)
            if value:
                self.digested_record[dest_key] = value

        self.normalized_authors = None
        if "author" in self.digested_record:
            self.digested_record["author"] = self.ETAL_PAT.sub('', self.digested_record["author"])
            self.normalized_authors = normalize_author_list(self.digested_record["author"], initials='.' in self.digested_record["author"])
            self.normalized_first_author = re.sub(r"\.( ?[A-Z]\.)*", "", re.sub(r"-[A-Z]\.", "", self.normalized_authors)).split(";")[0].strip()
            if len(self.normalized_first_author) <= 3:
                self.digested_record.pop("author")
        if "year" in self.digested_record and len(self.digested_record["year"]) > 4:
            # the extra character(s) are at the end, so use a regex to pull out just the four-digit year
            self.digested_record["year"] = self.YEAR_PATTERN.findall(self.digested_record["year"])[0]

        # sometimes the parser identifies a title where it is actually the journal;
        # if we have volume and page, no pub, but a title, the title is actually the pub, so switch
        if not self.digested_record.get("pub", None) and self.digested_record.get("title", None) and \
                self.digested_record.get("volume", None) and self.digested_record.get("page", None):
            self.digested_record["pub"] = self.digested_record.pop("title")

        if self.digested_record.get("page", None):
            if "-" in self.digested_record.get("page"):
                # we query on the page start, so for now throw out the page end
                self.digested_record["page"] = self.digested_record["page"].split("-")[0]
            qualifier, self.digested_record["page"] = self.tokenize_page(self.digested_record["page"])
            if qualifier is not None:
                self.digested_record["qualifier"] = qualifier

        if "volume" in self.digested_record and "pub" in self.digested_record:
            # if volume has an alpha character at the beginning, remove it and attach it to the journal
            # e.g. A. Arvanitaki, S. Dimopoulos, S. Dubovsky, N. Kaloper, and J. March-Russell, "String Axiverse," "Phys. Rev.", vol. D81, p. 123530, 2010.
            # which is in fact journal `Phys. Rev. D`, volume `81`
            match = self.JOURNAL_LETTER_ATTACHED_VOLUME.match(self.digested_record["volume"])
            if match:
                self.digested_record["pub"] = '%s %s'%(self.digested_record["pub"], self.digested_record["volume"][0])
                self.digested_record["volume"] = self.digested_record["volume"][1:]

        if "title" in self.digested_record:
            # remove too much information
            self.digested_record["title"] = self.TITLE_MAIN.split(self.digested_record["title"])[0]

        pub = self.digested_record.get("pub", None) or self.digested_record.get("title", None)
        if pub:
            try:
                if len(pub) <= 2:
                    if self.digested_record.get("pub", None):
                        self.digested_record.remove("pub")
                    elif self.digested_record.get("title", None):
                        self.digested_record.remove("title")
                else:
                    bibstem = get_best_bibstem_for(pub)
                    # if bibstem is one of the multi-section journals,
                    # users sometimes do not include the section character,
                    # so include a wildcard in the bibstem
                    if any([bib==bibstem for bib in self.BIBSTEM_WITH_SECTIONS]):
                        self.digested_record["bibstem"] = '%s*'%bibstem
                    else:
                        self.digested_record["bibstem"] = bibstem
            except KeyError:
                # when the bibstem cannot be inferred from pub, get_best_bibstem_for raises this exception
                self.digested_record["bibstem"] = ''

        if "arxiv" in self.digested_record:
            # authors specify arxiv ids in different ways:
            # sometimes for the new format they include the class name, which is wrong,
            # and sometimes for the old format they include the class name, but out of order;
            # extract the correct arxiv_id
            self.digested_record["arxiv"] = self.digested_record["arxiv"].split(":")[-1]
            for aie in self.ARXIV_ID_EXTRACTOR:
                match = aie.match(self.digested_record["arxiv"])
                if match:
                    group_names = list(match.groupdict().keys())
                    if 'new_pattern' in group_names:
                        self.digested_record["arxiv"] = match.group('new_pattern')
                    elif 'old_pattern' in group_names:
                        self.digested_record["arxiv"] = "%s/%s"%(match.group('class_name'), match.group('old_pattern'))
                    break

        if "ascl" in self.digested_record:
            # remove the ascl prefix if included
            self.digested_record["ascl"] = self.digested_record["ascl"].split(":")[-1]