示例#1
0
def _compile_authors_query(query, record):
    """Build the Elasticsearch query for the author named in ``record``.

    Args:
        query: mapping that may contain an ``inner_hits`` entry; when it
            does, that entry is copied into the ``nested`` clause of the
            generated query.
        record: mapping providing the author's ``full_name``.

    Returns:
        dict: ``{"query": <query generated by ParsedName>}``.
    """
    es_query = ParsedName(record["full_name"]).generate_es_query()

    if "inner_hits" in query:
        es_query['nested']['inner_hits'] = query['inner_hits']

    return {"query": es_query}
示例#2
0
def match_literature_author(author, record):
    """Look for an unambiguous literature-author match for ``author``.

    Two matcher configurations are tried in order. For the first one, when
    the raw matches are not unambiguous, the matches are filtered by the
    collaboration validator and then by the affiliations validator, and
    the first filter that yields an unambiguous result wins. The second
    configuration is tried without any validator.

    Args:
        author: mapping with the author's ``full_name`` and, optionally,
            ``affiliations``.
        record: mapping from which ``collaborations.value`` is read.

    Returns:
        The first truthy value produced by
        ``get_reference_and_bai_if_unambiguous_literature_author_match``,
        or ``None`` when no attempt produces one.
    """
    # Each matcher config is paired with the validators used to narrow down
    # its matches; ``None`` means "no narrowing for this config".
    attempts = list(zip(
        [
            current_app.config["AUTHOR_MATCHER_NAME_CONFIG"],
            current_app.config["AUTHOR_MATCHER_NAME_INITIALS_CONFIG"],
        ],
        [(collaboration_validator, affiliations_validator), None],
    ))

    full_name = author.get("full_name")
    name = ParsedName.loads(full_name)
    matcher_input = {
        "first_name": name.first,
        "last_name": name.last,
        "full_name": full_name,
        "collaborations": get_value(record, "collaborations.value", []),
        "affiliations": get_value(author, "affiliations.value", []),
    }

    for matcher_config, narrowing_validators in attempts:
        candidates = match_literature_author_with_config(
            matcher_input, matcher_config)
        resolved = get_reference_and_bai_if_unambiguous_literature_author_match(
            candidates)
        if resolved:
            return resolved

        for is_valid in narrowing_validators or ():
            accepted = (
                candidate for candidate in candidates
                if is_valid(matcher_input, candidate))
            resolved = (
                get_reference_and_bai_if_unambiguous_literature_author_match(
                    accepted))
            if resolved:
                return resolved

    return None
示例#3
0
def check_author_compability_with_lit_authors(literature_control_number):
    """Find the literature author compatible with the current user's profile.

    The name of the current user's author profile is compared against the
    literature record using progressively different strategies: last names
    only, then the default name comparison, then the lookup done by
    ``_find_matching_author_in_lit_record``.

    Args:
        literature_control_number: control number of the literature record.

    Returns:
        ``False`` when there is no current author profile or no literature
        record; otherwise the first truthy result of the strategies above,
        or ``None`` when none of them matches.
    """
    profile = _get_current_user_author_profile()
    if not profile:
        return False

    literature = _get_lit_record_from_db(literature_control_number)
    if not literature:
        return False

    parsed = ParsedName.loads(profile.get_value("name.value"))

    # Evaluated lazily and in order: a later strategy only runs when every
    # earlier one came back empty.
    strategies = (
        lambda: _check_names_compability(
            literature, parsed, last_names_only=True),
        lambda: _check_names_compability(literature, parsed),
        lambda: _find_matching_author_in_lit_record(
            parsed, literature_control_number),
    )
    for run_strategy in strategies:
        matched = run_strategy()
        if matched:
            return matched

    return None
示例#4
0
def test_parsed_name_from_parts():
    """A name assembled with ``from_parts`` dumps to the pinned string."""
    name = ParsedName.from_parts("John", "Smith", "Peter", "Jr", "Sir")

    assert name.dumps() == "Smith, John Peter, Jr."
示例#5
0
def test_parsed_wrong_names_and_not_fail():
    """Malformed names are parsed without raising and dump as pinned here."""
    malformed_and_dumped = (
        (u'Proffesor.M.', u'Proffesor.M.'),
        (u'ˇ Sirˇ', u'Sirˇ, ˇ.'),
    )

    for raw_name, dumped in malformed_and_dumped:
        assert ParsedName(raw_name).dumps() == dumped
示例#6
0
def _name_variation_has_only_initials(name):
    """Tell whether every part of the name variation is an initial.

    A part counts as an initial when it is a single character or when it
    contains a dot.
    """
    name_parts = list(ParsedName.loads(name))

    return all(len(part) == 1 or u'.' in part for part in name_parts)
示例#7
0
def test_parsed_name_initials():
    """First-name initials are exposed both as a string and as a list."""
    name = ParsedName("Holland, Tom Stanley")

    assert "T. S." == name.first_initials
    assert ["T.", "S."] == name.first_initials_list
示例#8
0
def get_authors(record):
    """Return the authors of a record.

    Queries the Institution records linked from the authors' affiliations
    and keeps, for each author, only the affiliations for which a HAL
    identifier is known.

    Args:
        record(InspireRecord): a record.

    Returns:
        list(dict): one dict per author with the keys ``affiliations``
        (a list of ``{'hal_id': ...}`` dicts), ``first_name`` and
        ``last_name``.

    Examples:
        Illustrative only; the HAL identifier depends on the Institution
        record being available.

        >>> record = {
        ...     'authors': [
        ...         {
        ...             'full_name': 'Doe, John',
        ...             'affiliations': [
        ...                 {
        ...                     'record': {
        ...                         '$ref': 'http://localhost:5000/api/institutions/902725',
        ...                     },
        ...                 },
        ...             ],
        ...         },
        ...     ],
        ... }
        >>> authors = get_authors(record)
        >>> authors[0]['affiliations'][0]['hal_id']
        '300037'

    """
    hal_ids_by_recid = _get_hal_id_map(record)

    authors = []

    for author in record.get('authors', []):
        name = ParsedName.loads(author['full_name'])
        first_name, last_name = name.first, name.last

        linked_recids = (
            get_recid_from_ref(affiliation.get('record'))
            for affiliation in author.get('affiliations', [])
        )
        # Affiliations without a known, non-empty HAL identifier are dropped.
        hal_affiliations = [
            {'hal_id': hal_ids_by_recid[recid]}
            for recid in linked_recids
            if recid in hal_ids_by_recid and hal_ids_by_recid[recid]
        ]

        authors.append({
            'affiliations': hal_affiliations,
            'first_name': first_name,
            'last_name': last_name,
        })

    return authors
示例#9
0
def get_authors(record):
    """Return the authors of a record.

    Queries the Institution records linked from the authors' affiliations
    and keeps, for each author, only the affiliations for which a HAL
    identifier is known.

    Args:
        record(InspireRecord): a record.

    Returns:
        list(dict): one dict per author with the keys ``affiliations``
        (a list of ``{'hal_id': ...}`` dicts), ``first_name`` and
        ``last_name``.

    Examples:
        Illustrative only; the HAL identifier depends on the Institution
        record being available.

        >>> record = {
        ...     'authors': [
        ...         {
        ...             'full_name': 'Doe, John',
        ...             'affiliations': [
        ...                 {
        ...                     'record': {
        ...                         '$ref': 'http://localhost:5000/api/institutions/902725',
        ...                     },
        ...                 },
        ...             ],
        ...         },
        ...     ],
        ... }
        >>> authors = get_authors(record)
        >>> authors[0]['affiliations'][0]['hal_id']
        '300037'

    """
    hal_ids_by_recid = _get_hal_id_map(record)

    authors = []

    for author in record.get('authors', []):
        name = ParsedName.loads(author['full_name'])
        first_name, last_name = name.first, name.last

        linked_recids = (
            get_recid_from_ref(affiliation.get('record'))
            for affiliation in author.get('affiliations', [])
        )
        # Affiliations without a known, non-empty HAL identifier are dropped.
        hal_affiliations = [
            {'hal_id': hal_ids_by_recid[recid]}
            for recid in linked_recids
            if recid in hal_ids_by_recid and hal_ids_by_recid[recid]
        ]

        authors.append({
            'affiliations': hal_affiliations,
            'first_name': first_name,
            'last_name': last_name,
        })

    return authors
示例#10
0
def author_name_contains_fullnames(author_name):
    """Tell whether the name is made of full name parts only.

    Returns:
          bool: True if the name has more than one part and none of them is
            an initial, e.g. 'Ellis John'. False otherwise, e.g. for
            'Ellis, J.' or 'Ellis'.
    """
    name = ParsedName(author_name)

    # A single part is treated as a bare last name, never as a full name.
    if len(name) == 1:
        return False

    initial_flags = [is_initial_of_a_name(part) for part in name]
    return not any(initial_flags)
示例#11
0
    def build_texkey_first_part(cls, data):
        """Choose and sanitize the first part of the texkey for ``data``.

        Candidates, in priority order:

        1. the first author's name part, when the record has fewer than
           10 authors;
        2. the value of the first collaboration;
        3. the first corporate author;
        4. the literal ``Proceedings``, when ``proceedings`` is among the
           document types;
        5. the first author's name part, regardless of the author count.

        The author name part is the parsed last name when the parsed name
        has more than one part, otherwise everything before the first
        comma of the full name.

        Returns:
            The sanitized candidate, or ``None`` when none applies.
        """
        first_author = get_value(data, "authors[0].full_name")

        author_part = None
        if first_author:
            parsed = ParsedName.loads(first_author)
            if len(parsed) > 1:
                author_part = parsed.last
            else:
                author_part = first_author.split(",")[0]

        if author_part and len(data["authors"]) < 10:
            return cls.sanitize(author_part)
        if "collaborations" in data:
            return cls.sanitize(data["collaborations"][0]["value"])
        if "corporate_author" in data:
            return cls.sanitize(data["corporate_author"][0])
        if "proceedings" in data["document_type"]:
            return cls.sanitize("Proceedings")
        if author_part:
            return cls.sanitize(author_part)
        return None
示例#12
0
def generate_minimal_name_variations(author_name):
    """Generate a small number of name variations.

    Notes:
        The name is unidecoded first, so that its transliterated version is
        used, since this is how the field is being indexed.

        For names with more than one part, the variations are
        {lastname} x {non lastnames, non lastnames initial}. The swapped
        versions of those are generated too, for supporting queries like
        ``Mele Salvatore`` which ``ParsedName`` parses as lastname:
        Salvatore and firstname: Mele. So in those cases, both
        ``Mele, Salvatore`` and ``Mele, S`` need to be generated.

        Wherever '-' is replaced by ' ', it is done because that is the way
        the name variations are being indexed, thus the minimal name
        variations have to be generated identically. This has to happen
        after the creation of ParsedName, otherwise the name is parsed
        differently. E.g. 'Caro-Estevez' as is, is a lastname; with the '-'
        replaced by ' ', it is a firstname and a lastname.

    Returns:
        list: the lowercased variations, without duplicates and without
        those consisting only of initials.
    """
    parsed = ParsedName.loads(unidecode(author_name))

    if len(parsed) <= 1:
        return [parsed.dumps().replace('-', ' ').lower()]

    last = parsed.last.replace('-', ' ')

    other_parts = (parsed.first_list +
                   parsed.middle_list +
                   parsed.suffix_list)
    # Strip extra whitespace added if any of middle_list and suffix_list are empty.
    rest = ' '.join(other_parts).strip().replace('-', ' ')

    candidates = [
        last + ' ' + rest,
        last + ' ' + rest[0],
        rest + ' ' + last,
        rest + ' ' + last[0],
    ]

    # Collected in a set, so as to drop identical name variations.
    variations = set()
    for candidate in candidates:
        if not _name_variation_has_only_initials(candidate):
            variations.add(candidate.lower())

    return list(variations)
示例#13
0
def get_display_name_for_author_name(author_name):
    """Return the first-name parts followed by the last-name parts, space separated."""
    name = ParsedName.loads(author_name)
    display_parts = name.first_list + name.last_list
    return " ".join(display_parts)
示例#14
0
    def _generate_author_query(self, author_name):
        """Generates a nested query handling specifically authors.

        Notes:
            There are three main cases:

            1) ``a Smith``
            A single name part without a dot is treated as a last name and
            only produces a ``match`` query on the last name field.

            2) ``a John Smith``
            Produces a ``match`` query on the last name and, for the full
            first name, either a ``match_phrase_prefix`` query or a
            ``match`` query analyzed with ``names_initials_analyzer`` on
            the first name field.

            3) ``a J Smith``
            Produces a ``match`` query on the last name and a ``match``
            query on the first name initials field.

            Every first name contributes its own clause and all of them
            must hold together with the last name clause.
        """
        name = ParsedName(author_name)

        def _match_all_terms(keyword, text, analyzer=None):
            # ``match`` query requiring every term; the analyzer entry is
            # only added when one is requested.
            clause = {'query': text, 'operator': 'AND'}
            if analyzer is not None:
                clause['analyzer'] = analyzer
            return {'match': {self.KEYWORD_TO_ES_FIELDNAME[keyword]: clause}}

        def _phrase_prefix(keyword, text):
            return {
                'match_phrase_prefix': {
                    self.KEYWORD_TO_ES_FIELDNAME[keyword]: {
                        'query': text,
                        'analyzer': 'names_analyzer',
                    }
                }
            }

        if len(name) == 1 and '.' not in name.first:
            # ParsedName stores a lone name such as `Smith` as the first
            # name; here it is considered to be the last name.
            lone_name_query = _match_all_terms("author_last_name", name.first)
            return generate_nested_query(self.AUTHORS_NESTED_QUERY_PATH,
                                         lone_name_query)

        last_name_clause = _match_all_terms("author_last_name", name.last)

        first_name_clauses = []
        for first_name in retokenize_first_names(name.first_list):
            if is_initial_of_a_name(first_name):
                alternatives = [
                    _match_all_terms("author_first_name_initials",
                                     first_name,
                                     analyzer="names_initials_analyzer"),
                ]
            else:
                alternatives = [
                    _phrase_prefix("author_first_name", first_name),
                    _match_all_terms("author_first_name",
                                     first_name,
                                     analyzer="names_initials_analyzer"),
                ]
            first_name_clauses.append(
                wrap_queries_in_bool_clauses_if_more_than_one(
                    alternatives, use_must_clause=False))

        all_first_names_clause = wrap_queries_in_bool_clauses_if_more_than_one(
            first_name_clauses, use_must_clause=True)

        query = wrap_queries_in_bool_clauses_if_more_than_one(
            [last_name_clause, all_first_names_clause], use_must_clause=True)
        return generate_nested_query(self.AUTHORS_NESTED_QUERY_PATH, query)
示例#15
0
def get_author_display_name(name):
    """Return the display name in the format ``Firstnames Lastnames``."""
    parsed = ParsedName.loads(name)
    ordered_parts = parsed.first_list + parsed.last_list
    return " ".join(ordered_parts)
示例#16
0
    def _generate_author_query(self, author_name):
        """Generates a nested query handling specifically authors.

        Notes:
            A name with a single part is matched as a last name only.

            Otherwise the name is split into a last name and first names:
            on the comma when there is one (last name first), else on
            spaces (last name last). The last name is matched with
            ``"operator": "AND"`` and every first name has to match as
            well, so that searching for `Smith, John` does not return
            papers of 'Smith, Bob'. Initials use a ``match`` query, full
            first names a lowercased ``prefix`` query.
        """
        # A lone name part is treated just like a last name.
        if len(ParsedName(author_name)) == 1:
            last_name_field = ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME[
                'author_last_name_raw']
            lone_name_query = {
                'bool': {
                    'must': {
                        'match': {
                            last_name_field: author_name
                        }
                    }
                }
            }
            return generate_nested_query(
                ElasticSearchVisitor.AUTHORS_NESTED_QUERY_PATH,
                lone_name_query)

        if ',' in author_name:
            pieces = author_name.split(',')
            lastname = pieces[0]
            firstnames = pieces[1].replace('.', ' ').split(' ')
        else:
            pieces = author_name.split(' ')
            lastname = pieces[-1]
            firstnames = pieces[:-1]

        # NOTE(review): splitting on a single space keeps empty tokens
        # (e.g. the one produced by the space after the comma); those end
        # up in the ``prefix`` branch with an empty string. Confirm this is
        # intended.
        first_name_clauses = [
            {"match": {"authors.first_name": token.strip()}}
            if len(token) == 1 or u'.' in token
            else {"prefix": {"authors.first_name.raw": token.strip().lower()}}
            for token in firstnames
        ]

        last_name_clause = {
            "match": {
                "authors.last_name.raw": {
                    "query": lastname.strip(),
                    "operator": "AND"
                }
            }
        }
        query = {
            'bool': {
                'must': [
                    last_name_clause,
                    {"bool": {"must": first_name_clauses}},
                ]
            }
        }
        return generate_nested_query(
            ElasticSearchVisitor.AUTHORS_NESTED_QUERY_PATH, query)
示例#17
0
def get_author_display_name(name):
    """Return the display name in the format ``Firstnames Lastnames``."""
    parsed = ParsedName.loads(name)
    ordered_parts = parsed.first_list + parsed.last_list
    return " ".join(ordered_parts)