예제 #1
0
def get_score_for_reference_identifier(result_record, hypothesis):
    """
    Score a candidate record purely on identifier agreement.

    Identifiers are tried in the order DOI, arXiv id, bibcode, and checking
    stops at the first one that agrees.  Exactly one evidence, labelled
    "bibcode", is recorded: the upper bound of EVIDENCE_SCORE_RANGE when any
    identifier agreed, the lower bound otherwise.

    :param result_record: candidate record; its "doi" and "bibcode" entries
        are read here and the record itself is handed to get_arxiv_id
    :param hypothesis: object whose "input_fields" detail holds the fields
        of the reference being resolved
    :return: Evidences holding the single identifier evidence
    """
    evidences = Evidences()

    ref_fields = hypothesis.get_detail("input_fields")

    # The fallbacks differ between the two sides, so for the plain ==
    # comparisons a field missing on both sides does not count as a match.
    identifier_matched = bool(
        compare_doi(ref_fields.get("doi", "not in ref"),
                    result_record.get("doi", ["not in ads"]))
        or ref_fields.get("arxiv", "not in ref") == get_arxiv_id(result_record)
        or ref_fields.get("bibcode", "not in ref") == result_record.get(
            "bibcode", ["not in ads"]))

    score_range = current_app.config["EVIDENCE_SCORE_RANGE"]
    evidences.add_evidence(score_range[1 if identifier_matched else 0],
                           "bibcode")

    return evidences
예제 #2
0
def get_thesis_score_for_input_fields(result_record, hypothesis):
    """
    returns Evidences for result_record being some sort of thesis matching
    hypothesis.

    The following evidences are collected:

    * "doctype": upper bound of EVIDENCE_SCORE_RANGE when the record's
      doctype is phdthesis or mastersthesis, lower bound otherwise.
    * "author": only compared for records with exactly one author; a last
      name match is worth 0.7 and a first initial match 0.3 of the upper
      bound.  Records with any other number of authors get the lower bound.
    * year: delegated to add_year_evidence.
    * "affiliation": fraction of the whitespace separated words of the
      record's affiliation that occur (as substrings) in the reference
      string.  Not added at all when the record has no affiliation words,
      since there is nothing to compare.

    :param result_record: candidate record; "doctype" and "author_norm" are
        required, "year" and "aff_raw" are optional
    :param hypothesis: object providing the "input_fields" and
        "normalized_authors" details
    :return: Evidences with the evidences listed above
    :raises ValueError: if one of the compared author strings does not split
        into exactly two parts on ","
    """
    evidences = Evidences()
    score_range = current_app.config["EVIDENCE_SCORE_RANGE"]

    # consider only thesis records
    is_thesis = result_record["doctype"] in ["phdthesis", "mastersthesis"]
    evidences.add_evidence(score_range[1] if is_thesis else score_range[0],
                           "doctype")

    input_fields = hypothesis.get_detail("input_fields")

    # consider number of authors
    if len(result_record["author_norm"]) == 1:
        # compare authors manually to have initials included.
        ref_lastname, ref_first_init = re.sub(
            r"[\s.]", "",
            hypothesis.get_detail("normalized_authors")).lower().split(",")
        ads_lastname, ads_first_init = re.sub(
            r"[\s.]", "", result_record["author_norm"][0].lower()).split(",")
        # lastname match is worth 0.7, first initial 0.3
        author_score = (
            int(ref_lastname == ads_lastname) * score_range[1] * 0.7 +
            int(ref_first_init == ads_first_init) * score_range[1] * 0.3)
    else:
        author_score = score_range[0]
    evidences.add_evidence(author_score, "author")

    add_year_evidence(evidences, input_fields.get('year'),
                      result_record.get('year'))

    # count how many words of affiliation are in the reference string;
    # a missing refstr is treated as an empty string so nothing matches
    ref_str = input_fields.get('refstr') or ''
    aff_words = ' '.join(result_record.get("aff_raw") or []).split()
    # without affiliation words the ratio is undefined (division by zero),
    # so no affiliation evidence is recorded in that case
    if aff_words:
        matched = sum(1.0 for word in aff_words if word in ref_str)
        evidences.add_evidence(matched / len(aff_words), "affiliation")

    return evidences
예제 #3
0
def get_author_year_score_for_input_fields(result_record, hypothesis):
    """
    Score a candidate record using nothing but its authors and its year.

    :param result_record: candidate record; "author_norm" and
        "first_author_norm" are required, "year" is optional
    :param hypothesis: object providing the "input_fields",
        "normalized_authors" and "has_etal" details
    :return: Evidences filled in by add_author_evidence and add_year_evidence
    """
    fields = hypothesis.get_detail('input_fields')

    authors = hypothesis.get_detail('normalized_authors')
    if authors is None:
        # no normalized authors on the hypothesis: derive them from the
        # reference's author field
        authors = normalize_author_list(fields.get('author', ''))

    scored = Evidences()
    add_author_evidence(
        scored,
        authors,
        result_record['author_norm'],
        result_record['first_author_norm'],
        has_etal=hypothesis.get_detail('has_etal'))
    add_year_evidence(scored, fields.get('year'), result_record.get('year'))

    return scored
예제 #4
0
def get_author_year_score_for_input_fields(result_record, hypothesis):
    """
    Build evidences from the author list and the publication year alone.

    For most sources, get_basic_score_for_input_fields is the better
    choice -- see there for more information.

    :param result_record: candidate record; "author_norm" and
        "first_author_norm" must be present, "year" may be absent
    :param hypothesis: source of the "input_fields", "normalized_authors"
        and "has_etal" details
    :return: Evidences with the author and the year evidence
    """
    ref_fields = hypothesis.get_detail('input_fields')
    ref_authors = hypothesis.get_detail('normalized_authors')
    if ref_authors is None:
        # fall back to normalizing the author field of the reference
        ref_authors = normalize_author_list(ref_fields.get('author', ''))

    collected = Evidences()

    add_author_evidence(collected,
                        ref_authors,
                        result_record['author_norm'],
                        result_record['first_author_norm'],
                        has_etal=hypothesis.get_detail('has_etal'))

    ref_year = ref_fields.get('year')
    add_year_evidence(collected, ref_year, result_record.get('year'))

    return collected
예제 #5
0
def get_score_for_baas_match(result_record, hypothesis):
    """
    scores a BAAS->DDA match.

    For these, volume and page are hidden deep inside pub_raw.

    We also expect an expected_bibstem detail in the hypothesis, mainly
    for robustness in case this gets used to score something else.  When
    the record's bibcode does not carry that bibstem after its first four
    characters, a single lower-bound evidence is returned and nothing else
    is compared.

    Otherwise the evidences are: authors (via add_author_evidence), whether
    "Vol. <volume>" occurs in pub_raw, and whether "p. <page>" occurs in
    pub_raw.

    :param result_record: candidate record; "bibcode", "author_norm",
        "first_author_norm" and "pub_raw" are required
    :param hypothesis: object providing the "expected_bibstem",
        "input_fields" and "normalized_authors" details; the input fields
        must contain "volume" and "page"
    :return: Evidences
    """
    evidences = Evidences()
    # NOTE(review): expected_bibstem is interpolated as a regular expression,
    # so any "." in it matches an arbitrary character -- confirm this is
    # intended before escaping it.
    if not re.match(r'....%s' % hypothesis.get_detail('expected_bibstem'),
                    result_record['bibcode']):
        evidences.add_evidence(current_app.config['EVIDENCE_SCORE_RANGE'][0],
                               'no DDA bibcode')
        return evidences

    input_fields = hypothesis.get_detail('input_fields')

    normalized_authors = hypothesis.get_detail('normalized_authors')
    if normalized_authors is None:
        normalized_authors = normalize_author_list(
            input_fields.get('author', ''))

    add_author_evidence(evidences, normalized_authors,
                        result_record['author_norm'],
                        result_record['first_author_norm'])

    add_boolean_evidence(
        evidences, 'Vol. %s' % input_fields['volume']
        in result_record['pub_raw'], 'vol in pub_raw?')

    # The page comes from a parsed reference, so it is matched literally:
    # escaping keeps regex metacharacters in it from raising re.error or
    # matching unrelated text.
    page_pattern = r'p\.\s*%s\b' % re.escape('%s' % input_fields['page'])
    add_boolean_evidence(
        evidences,
        bool(re.search(page_pattern, result_record['pub_raw'])),
        'page in pub_raw?')

    return evidences
예제 #6
0
def get_volume_page_score_for_input_fields(result_record, hypothesis):
    """
    returns evidences based on volume and page.

    The reference may have both, one or none of volume and page tagged:

    * both tagged: the tagged values are compared to the record as they are.
    * one tagged: match_ads_numerics_with_ref_numeric decides what the
      tagged value and the reference string yield for volume and page.
    * none tagged: the record's volume and page are looked up in the
      reference string with match_ads_numeric_in_ref_str.

    Volume and page evidences are only added for values that turned out
    truthy.

    :param result_record: candidate record; "volume", "page", "issue",
        "pub_raw", "page_range" and "eid" are all optional
    :param hypothesis: object providing the "input_fields" and
        "page_qualifier" details
    :return: Evidences with zero, one or two evidences
    """
    input_fields = hypothesis.get_detail('input_fields')

    evidences = Evidences()

    has_volume = 'volume' in input_fields
    has_page = 'page' in input_fields

    ads_volume = result_record.get('volume', '')
    ads_page = result_record.get('page', '')

    ref_str = input_fields.get('refstr', '')

    if has_volume and has_page:
        # both values were tagged
        ref_page = input_fields['page']
        ref_volume = input_fields['volume']
    elif has_volume:
        # only volume was tagged, see what matches what
        ref_volume, ref_page = match_ads_numerics_with_ref_numeric(
            ads_volume, input_fields['volume'], ads_page, ref_str)
    elif has_page:
        # only page was tagged: page takes the first argument slot here, so
        # the first returned value belongs to page and the second to volume.
        # NOTE(review): assumes the helper returns (value for the tagged
        # quantity, value for the other one), as the volume call above
        # implies -- confirm against its definition.
        ref_page, ref_volume = match_ads_numerics_with_ref_numeric(
            ads_page, input_fields['page'], ads_volume, ref_str)
    else:
        # neither page nor volume were tagged,
        # see if ads_volume/ads_page is in the ref_str
        ref_volume = match_ads_numeric_in_ref_str(ads_volume, ref_str)
        ref_page = match_ads_numeric_in_ref_str(ads_page, ref_str)

    if ref_volume:
        add_volume_evidence(evidences, ref_volume, ads_volume,
                            result_record.get('issue'),
                            result_record.get('pub_raw'))
    if ref_page:
        add_page_evidence(evidences, ref_page, ads_page,
                          result_record.get('page_range', ''),
                          result_record.get('eid', None),
                          hypothesis.get_detail('page_qualifier'),
                          ref_str)

    return evidences
예제 #7
0
def get_chapter_score_for_input_fields(result_record, hypothesis):
    """
    Score a candidate record that is a chapter in a proceeding or a book.

    The result combines the author/year evidences, the volume/page
    evidences and one publication evidence.  Inproceedings reference
    strings interchange the order of title and journal depending on the
    publication, so both the reference's pub and its title are compared
    against both the record's title and its pub_raw; only the comparison
    with the highest score is kept.

    :param result_record: candidate record; "title", "pub_raw", "bibcode"
        and "bibstem" are read here
    :param hypothesis: object providing the "input_fields" detail
    :return: Evidences combining the three groups described above
    """
    combined = (
        get_author_year_score_for_input_fields(result_record, hypothesis) +
        get_volume_page_score_for_input_fields(result_record, hypothesis))

    fields = hypothesis.get_detail("input_fields")

    ref_candidates = (fields.get("pub", ""), fields.get("title", ""))
    ads_candidates = (result_record.get("title", ""),
                      result_record.get("pub_raw", ""))

    # start from an empty Evidences; a candidate only replaces the current
    # best when it scores strictly higher
    best = Evidences()
    for ref_text in ref_candidates:
        for ads_text in ads_candidates:
            trial = Evidences()
            add_publication_evidence(trial, ref_text,
                                     fields.get("bibstem", ""),
                                     fields.get("refstr", ""), ads_text,
                                     result_record.get("bibcode", ""),
                                     result_record.get("bibstem", ""))
            if trial.get_score() > best.get_score():
                best = trial

    return combined + best
예제 #8
0
def _ads_value_if_in_ref_str(ads_value, ref_str):
    """
    returns ads_value if it occurs in ref_str delimited by word boundaries,
    None otherwise (also when ads_value is empty).
    """
    if not ads_value:
        return None
    # the value is data, not a pattern: escape it so that characters such as
    # "." or "+" are matched literally and cannot raise re.error
    pattern = r'\b(%s)\b' % re.escape('%s' % ads_value)
    return ads_value if re.search(pattern, ref_str) else None


def _resolve_single_tagged_value(ref_value, ads_same, ads_other, ref_str):
    """
    returns (value for the tagged quantity, value for the other quantity)
    when only one of volume/page was tagged in the reference.

    :param ref_value: the value tagged in the reference
    :param ads_same: the record's value for the tagged quantity
    :param ads_other: the record's value for the untagged quantity
    :param ref_str: the reference string
    """
    if ref_value == ads_other:
        # the tagged value matches the record's other quantity; look for
        # the record's value of the tagged quantity in the reference string
        return ads_other, _ads_value_if_in_ref_str(ads_same, ref_str)
    if ref_value == ads_same:
        # the tagged value matches, look for the other one in the ref_str
        return ads_same, _ads_value_if_in_ref_str(ads_other, ref_str)
    return ref_value, None


def get_volume_page_score_for_input_fields(result_record, hypothesis):
    """
    returns evidences based on volume and page.

    references can eliminate volume and include only page, or only volume
    can be included.  The parser can not differentiate between the two, so
    if there is only one value, do a reverse engineering: see if the page
    and/or volume in ads matches this one value, or if it can be matched in
    the reference string.  When nothing was tagged, the record's volume and
    page are looked up in the reference string; when both were tagged they
    are used as they are.

    Volume and page evidences are only added for values that turned out
    truthy.

    :param result_record: candidate record; "volume", "page", "issue",
        "pub_raw", "page_range" and "eid" are all optional
    :param hypothesis: object providing the "input_fields" and
        "page_qualifier" details
    :return: Evidences with zero, one or two evidences
    """
    input_fields = hypothesis.get_detail('input_fields')

    evidences = Evidences()

    has_volume = 'volume' in input_fields
    has_page = 'page' in input_fields

    ads_volume = result_record.get('volume', '')
    ads_page = result_record.get('page', '')

    ref_str = input_fields.get('refstr', '')

    if has_volume and has_page:
        ref_page = input_fields['page']
        ref_volume = input_fields['volume']
    elif has_volume:
        ref_volume, ref_page = _resolve_single_tagged_value(
            input_fields['volume'], ads_volume, ads_page, ref_str)
    elif has_page:
        ref_page, ref_volume = _resolve_single_tagged_value(
            input_fields['page'], ads_page, ads_volume, ref_str)
    else:
        # nothing tagged: see if ads_volume/ads_page is in the ref_str
        ref_volume = _ads_value_if_in_ref_str(ads_volume, ref_str)
        ref_page = _ads_value_if_in_ref_str(ads_page, ref_str)

    if ref_volume:
        add_volume_evidence(evidences, ref_volume, ads_volume,
                            result_record.get('issue'),
                            result_record.get('pub_raw'))
    if ref_page:
        add_page_evidence(evidences, ref_page, ads_page,
                          result_record.get('page_range', ''),
                          result_record.get('eid', None),
                          hypothesis.get_detail('page_qualifier'),
                          ref_str)

    return evidences
예제 #9
0
def get_thesis_score_for_input_fields(result_record, hypothesis):
    """
    returns Evidences for result_record being some sort of thesis matching
    hypothesis.

    This involves matching of author (including, to some extent, an initial),
    and year, plus a check of the record's pub_raw with
    has_thesis_indicators.  Records with more than one author additionally
    get a small penalty.

    We could try to match institutions, but for now we don't.

    :param result_record: candidate record; "author_norm" (non-empty) and
        "pub_raw" are required, "year" is optional
    :param hypothesis: object providing the "input_fields" and
        "normalized_authors" details
    :return: Evidences
    :raises ValueError: if one of the compared author strings does not split
        into exactly two parts on ","
    """
    evidences = Evidences()
    score_range = current_app.config["EVIDENCE_SCORE_RANGE"]

    # Theses should only have one author
    if len(result_record["author_norm"]) > 1:
        evidences.add_evidence(-0.1, "thesis with multiple authors?")

    input_fields = hypothesis.get_detail("input_fields")

    # compare authors manually to have initials included.
    ref_last, ref_first = re.sub(
        r"[\s.]", "",
        hypothesis.get_detail("normalized_authors")).lower().split(",")
    # slice rather than index: a reference author without a first name
    # ("Last,") leaves an empty string here, which must count as a
    # non-matching initial instead of raising IndexError
    ref_first_init = ref_first[:1]
    # only the reference side is cut down to one character; the record's
    # first name part is compared as it is
    ads_last, ads_first_init = re.sub(
        r"[\s.]", "", result_record["author_norm"][0].lower()).split(",")

    author_matches = (ref_last == ads_last and
                      ref_first_init == ads_first_init)
    evidences.add_evidence(score_range[1] if author_matches else score_range[0],
                           "author")

    add_year_evidence(evidences, input_fields.get('year'),
                      result_record.get('year'))

    looks_like_thesis = has_thesis_indicators(result_record["pub_raw"])
    evidences.add_evidence(
        score_range[1] if looks_like_thesis else score_range[0],
        "thesisString")
    # XXX TODO: When we have pub_raw, we could also check for places;
    # ideally, there would be -1 for a place in refstring not in ADS
    # and +1 for a place in ADS that's also in the refstring.  We'd
    # need a list of places then, though.  I'd have that as a separate
    # evidence.

    return evidences