Python iunaccent示例，papers.utils.iunaccent Python示例

示例#1

0

显示文件

文件： name.py 项目： Lysxia/dissemin

def name_signature(first, last):
    """
    Build a compact signature string from a first and a last name.

    The stripped last name goes through iunaccent and then through three
    regex substitutions (patterns defined elsewhere in the module):
    matches of nn_escaping_chars_re and nn_final_nontext_re are removed,
    matches of nn_nontext_re are replaced by '-'. When the first name is
    not empty, iunaccent of its first character and a '-' are prepended.
    """
    signature = nn_nontext_re.sub(
        '-',
        nn_final_nontext_re.sub(
            '',
            nn_escaping_chars_re.sub('', iunaccent(last.strip()))))
    # Without a first name the signature is the normalised last name only.
    if len(first) == 0:
        return signature
    initial = iunaccent(first[0])
    return initial + '-' + signature

示例#2

0

显示文件

def name_signature(first, last):
    """
    Return a signature of the form "<initial>-<last name>".

    The last name is stripped, passed to iunaccent and cleaned by three
    successive regex substitutions. The initial is iunaccent of the first
    character of `first`; it is left out (together with the dash) when
    `first` is empty.
    """
    ident = iunaccent(last.strip())
    # Applied in this exact order: two removals, then the '-' replacement.
    substitutions = (
        (nn_escaping_chars_re, ''),
        (nn_final_nontext_re, ''),
        (nn_nontext_re, '-'),
    )
    for pattern, replacement in substitutions:
        ident = pattern.sub(replacement, ident)
    if len(first) > 0:
        return iunaccent(first[0]) + '-' + ident
    return ident

示例#3

0

显示文件

def name_similarity(a, b):
    """
    Score how similar two names are.

    Both arguments are expected to be (first, last) pairs. The result is
    False when either argument is empty or does not have exactly two
    items, 0. when the names cannot designate the same person according
    to the checks below, and otherwise a score clamped between 0 and 1.
    """
    if not (a and b and len(a) == 2 and len(b) == 2):
        return False
    first_a, last_a = a
    first_b, last_b = b
    first_a = iunaccent(first_a)
    first_b = iunaccent(first_b)
    last_a = iunaccent(last_a)
    last_b = iunaccent(last_b)
    # Last names have to agree exactly once normalised by iunaccent.
    if last_a != last_b:
        return 0.

    words_a, _ = split_name_words(first_a)
    words_b, _ = split_name_words(first_b)
    pairs = list(zip(words_a, words_b))
    if not all(match_first_names(pair) for pair in pairs):
        # Second chance: align the first-name words from the end instead.
        words_a.reverse()
        words_b.reverse()
        pairs = list(zip(words_a, words_b))
        if not all(match_first_names(pair) for pair in pairs):
            return 0.

    score = 0
    # One (is word of A longer than one char, same for B) entry per position.
    expanded = []
    for pair, word_a, word_b in zip(pairs, words_a, words_b):
        score += weight_first_names(pair)
        expanded.append((len(word_a) > 1, len(word_b) > 1))
    # Words without a counterpart in the other name lower the score.
    # Only one of the two slices below can be non-empty.
    paired = len(pairs)
    for word in words_a[paired:]:
        score -= 0.25*weight_first_name(word)
        expanded.append((len(word) > 1, False))
    for word in words_b[paired:]:
        score -= 0.25*weight_first_name(word)
        expanded.append((False, len(word) > 1))

    # The expanded first names of A must be included in those of B, or
    # those of B included in those of A.
    # This prevents ('Amanda P.','Brown') and ('A. Patrick','Brown')
    # from matching.
    if not all([in_a or not in_b for in_a, in_b in expanded]) and \
            not all([in_b or not in_a for in_a, in_b in expanded]):
        return 0.

    return max(min(score, 1), 0)

示例#4

0

显示文件

def name_similarity(a, b):
    """
    Returns a float: how similar are these two names?

    `a` and `b` are (first, last) pairs. False is returned when either of
    them is empty. 0. is returned when the last names differ after
    iunaccent, or when the first-name words cannot be matched pairwise
    (neither from the start nor from the end). Otherwise the weights of
    the matched word pairs are summed, a quarter of the weight of every
    unpaired word is subtracted, and the result is clamped to [0, 1].

    Examples:
    name_similarity(('Robin', 'Ryder'),('Robin', 'Ryder')) == 0.8
    name_similarity(('Robin', 'Ryder'),('R.', 'Ryder')) == 0.4
    name_similarity(('R.', 'Ryder'),('R.', 'Ryder')) == 0.4
    name_similarity(('Robin J.', 'Ryder'),('R.', 'Ryder')) == 0.3
    name_similarity(('Robin J.', 'Ryder'),('R. J.', 'Ryder')) == 0.8
    name_similarity(('R. J.', 'Ryder'),('J.', 'Ryder')) == 0.3
    name_similarity(('Robin', 'Ryder'),('Robin J.', 'Ryder')) == 0.7
    name_similarity(('W. Timothy','Gowers'), ('Timothy','Gowers')) == 0.7
    name_similarity(('Robin K.','Ryder'), ('Robin J.', 'Ryder')) == 0
    name_similarity(('Claire', 'Mathieu'),('Claire', 'Kenyon-Mathieu')) == 0
    """

    if not a or not b:
        return False
    (firstA, lastA) = a
    (firstB, lastB) = b
    firstA = iunaccent(firstA)
    firstB = iunaccent(firstB)
    lastA = iunaccent(lastA)
    lastB = iunaccent(lastB)
    if lastA != lastB:
        return 0.
    partsA, _ = split_name_words(firstA)
    partsB, _ = split_name_words(firstB)
    # Materialise the pairs: they are measured with len() and indexed
    # below, which a bare zip object does not support on Python 3.
    parts = list(zip(partsA, partsB))
    if not all(map(match_first_names, parts)):
        # Try to match in reverse
        partsA.reverse()
        partsB.reverse()
        parts = list(zip(partsA, partsB))
        if not all(map(match_first_names, parts)):
            return 0.
    maxlen = max(len(partsA), len(partsB))
    sumscores = 0
    for i in range(maxlen):
        if i < len(parts):
            # Position present in both names: reward the matched pair.
            sumscores += weight_first_names(parts[i])
        elif i < len(partsA):
            # Extra word only in A: small penalty.
            sumscores -= 0.25 * weight_first_name(partsA[i])
        else:
            # Extra word only in B: small penalty.
            sumscores -= 0.25 * weight_first_name(partsB[i])
    sumscores = max(min(sumscores, 1), 0)
    return sumscores

示例#5

0

显示文件

文件： name.py 项目： jilljenn/dissemin

def name_similarity(a,b):
    """
    Returns a float: how similar are these two names?

    Each argument is a (first, last) pair. An empty argument yields
    False. Last names that differ once passed through iunaccent yield 0.,
    and so do first names whose words cannot be matched position by
    position, forwards or backwards. In the remaining cases the score is
    the sum of the weights of the matched pairs, minus a quarter of the
    weight of each word left without a partner, clamped to [0, 1].

    Examples:
    name_similarity(('Robin', 'Ryder'),('Robin', 'Ryder')) == 0.8
    name_similarity(('Robin', 'Ryder'),('R.', 'Ryder')) == 0.4
    name_similarity(('R.', 'Ryder'),('R.', 'Ryder')) == 0.4
    name_similarity(('Robin J.', 'Ryder'),('R.', 'Ryder')) == 0.3
    name_similarity(('Robin J.', 'Ryder'),('R. J.', 'Ryder')) == 0.8
    name_similarity(('R. J.', 'Ryder'),('J.', 'Ryder')) == 0.3
    name_similarity(('Robin', 'Ryder'),('Robin J.', 'Ryder')) == 0.7
    name_similarity(('W. Timothy','Gowers'), ('Timothy','Gowers')) == 0.7
    name_similarity(('Robin K.','Ryder'), ('Robin J.', 'Ryder')) == 0
    name_similarity(('Claire', 'Mathieu'),('Claire', 'Kenyon-Mathieu')) == 0
    """

    if not a or not b:
        return False
    (firstA,lastA) = a
    (firstB,lastB) = b
    firstA = iunaccent(firstA)
    firstB = iunaccent(firstB)
    lastA = iunaccent(lastA)
    lastB = iunaccent(lastB)
    if lastA != lastB:
        return 0.
    partsA, _ = split_name_words(firstA)
    partsB, _ = split_name_words(firstB)
    # zip() is an iterator on Python 3; a real list is needed because the
    # pairs are traversed by all(), then counted and indexed further down.
    parts = list(zip(partsA, partsB))
    if not all(map(match_first_names, parts)):
        # Try to match in reverse
        partsA.reverse()
        partsB.reverse()
        parts = list(zip(partsA, partsB))
        if not all(map(match_first_names, parts)):
            return 0.
    maxlen = max(len(partsA), len(partsB))
    sumscores = 0
    for i in range(maxlen):
        if i < len(parts):
            # Word present on both sides: add the weight of the pair.
            sumscores += weight_first_names(parts[i])
        elif i < len(partsA):
            # Trailing word that only A has.
            sumscores -= 0.25*weight_first_name(partsA[i])
        else:
            # Trailing word that only B has.
            sumscores -= 0.25*weight_first_name(partsB[i])
    sumscores = max(min(sumscores, 1), 0)
    return sumscores

示例#6

0

显示文件

def populate_identifiers(apps, se):
    """
    Fill the `identifiers` list of every Institution and save it.

    The list always starts with the existing `identifier`. When both
    `country` and `name` are set, "<country>:<iunaccent(name)>" is added
    as a second entry. Only the `identifiers` field is written back.

    `apps` must provide get_model(); `se` is not used here.
    NOTE(review): the signature looks like a Django RunPython callback
    (apps, schema_editor) -- confirm against the caller.
    """
    institution_model = apps.get_model('papers', 'Institution')
    for institution in institution_model.objects.all():
        identifiers = [institution.identifier]
        if institution.country and institution.name:
            identifiers.append(
                institution.country + ':' + iunaccent(institution.name))
        institution.identifiers = identifiers
        institution.save(update_fields=['identifiers'])

示例#7

0

显示文件

文件： 0043_institutions_multiple_identifiers.py 项目： Phyks/dissemin

def populate_identifiers(apps, se):
    """
    Set `identifiers` on each Institution from its current fields.

    Every institution gets [identifier], extended with
    country + ':' + iunaccent(name) when both country and name are
    non-empty. Each object is saved with update_fields=['identifiers'].

    The model is looked up through `apps.get_model`; the `se` argument
    is accepted but ignored.
    """
    Institution = apps.get_model('papers', 'Institution')
    for inst in Institution.objects.all():
        has_country_and_name = inst.country and inst.name
        if not has_country_and_name:
            inst.identifiers = [inst.identifier]
        else:
            country_scoped = inst.country+':'+iunaccent(inst.name)
            inst.identifiers = [inst.identifier, country_scoped]
        inst.save(update_fields=['identifiers'])

示例#8

0

显示文件

 def create(cls, first, last):
     """
     Build a Name object from a first and a last name, without saving it.

     Handy for name lookups, when it is not yet known whether the name
     should be kept in the model. Each part is cut to MAX_NAME_LENGTH,
     stripped and passed through sanitize_html; `full` receives
     iunaccent of "<first> <last>".
     """
     name = cls()
     trimmed = first[:MAX_NAME_LENGTH].strip()
     name.first = sanitize_html(trimmed)
     trimmed = last[:MAX_NAME_LENGTH].strip()
     name.last = sanitize_html(trimmed)
     joined = name.first+' '+name.last
     name.full = iunaccent(joined)
     return name

示例#9

0

显示文件

文件： baremodels.py 项目： Phyks/dissemin

 def create(cls, first, last):
     """
     Return a new, unsaved instance filled from `first` and `last`.

     Intended for lookups where the name may not end up being stored.
     Both parts are truncated to MAX_NAME_LENGTH, stripped and sanitized
     with sanitize_html; the `full` attribute is iunaccent applied to the
     two cleaned parts joined by a single space.
     """
     instance = cls()
     # Same clean-up for both parts, first name before last name.
     for attribute, raw in (('first', first), ('last', last)):
         cleaned = sanitize_html(raw[:MAX_NAME_LENGTH].strip())
         setattr(instance, attribute, cleaned)
     instance.full = iunaccent(instance.first+' '+instance.last)
     return instance

示例#10

0

显示文件

def shallower_name_similarity(a, b):
    """
    Same as name_similarity, but accepts differences in the last names.
    This heuristics is more costly but is only used to attribute an ORCID
    affiliation to the right author in papers fetched from ORCID.
    (in the next function)

    `a` and `b` are (first, last) pairs. Returns False when an argument
    is empty or not of length two, or when a last name contains no word;
    returns 0. when the first-name initials cannot be matched pairwise in
    either direction. Otherwise returns the share of last-name words the
    two names have in common, scaled by (matched initials + 1) /
    (longest initials list + 1).
    """
    if not a or not b or len(a) != 2 or len(b) != 2:
        return False
    firstA, lastA = a
    firstB, lastB = b

    # Matching last names
    lastA = iunaccent(lastA)
    lastB = iunaccent(lastB)
    wordsA, _ = split_name_words(lastA)
    wordsB, _ = split_name_words(lastB)
    wordsA = set(wordsA)
    wordsB = set(wordsB)
    if not wordsA or not wordsB:
        return False
    # Shared words divided by all distinct words of both last names.
    ratio = float(len(wordsA & wordsB)) / len(wordsA | wordsB)

    partsA, _ = split_name_words(firstA)
    partsB, _ = split_name_words(firstB)
    # Only the initial of each first-name word is compared.
    partsA = [p[0] for p in partsA]
    partsB = [p[0] for p in partsB]

    # A list is required: len(parts) is used below and a zip object has
    # no length on Python 3.
    parts = list(zip(partsA, partsB))
    if not all(map(match_first_names, parts)):
        # Try to match in reverse
        partsA.reverse()
        partsB.reverse()
        parts = list(zip(partsA, partsB))
        if not all(map(match_first_names, parts)):
            return 0.

    maxlen = max(len(partsA), len(partsB))
    return ratio * (len(parts) + 1) / (maxlen + 1)

示例#11

0

显示文件

文件： name.py 项目： Phyks/dissemin

def shallower_name_similarity(a, b):
    """
    Variant of name_similarity that tolerates differing last names.

    This heuristic costs more but is only used to attribute an ORCID
    affiliation to the right author in papers fetched from ORCID
    (in the next function).

    Returns False for arguments that are empty or not pairs, and when
    one of the last names has no word; returns 0. when the first-name
    initials do not match in either order; otherwise a float score.
    """
    def initials_match(pairs):
        # True when every (initial of A, initial of B) pair is accepted.
        return all(match_first_names(pair) for pair in pairs)

    if not (a and b and len(a) == 2 and len(b) == 2):
        return False
    first_a, last_a = a
    first_b, last_b = b

    # Last names: compare the sets of words they are made of.
    last_words_a, _ = split_name_words(iunaccent(last_a))
    last_words_b, _ = split_name_words(iunaccent(last_b))
    last_words_a = set(last_words_a)
    last_words_b = set(last_words_b)
    if not (last_words_a and last_words_b):
        return False
    shared = last_words_a & last_words_b
    combined = last_words_a | last_words_b
    ratio = float(len(shared)) / len(combined)

    # First names: keep only the first character of every word.
    words_a, _ = split_name_words(first_a)
    words_b, _ = split_name_words(first_b)
    initials_a = [word[0] for word in words_a]
    initials_b = [word[0] for word in words_b]

    pairs = list(zip(initials_a, initials_b))
    if not initials_match(pairs):
        # Retry with both lists read from the end.
        initials_a.reverse()
        initials_b.reverse()
        pairs = list(zip(initials_a, initials_b))
        if not initials_match(pairs):
            return 0.

    longest = max(len(initials_a), len(initials_b))
    return ratio*(len(pairs)+1)/(longest+1)

示例#12

0

显示文件

def shallower_name_similarity(a, b):
    """
    Same as name_similarity, but accepts differences in the last names.
    This heuristics is more costly but is only used to attribute an ORCID
    affiliation to the right author in papers fetched from ORCID.

    `a` and `b` are (first, last) pairs. Returns False when either is
    empty, and 0. when neither last name contains a word, when the
    first-name initials cannot be matched pairwise in either direction,
    or when there are no first-name words at all. Otherwise returns the
    share of last-name words in common, scaled by the fraction of
    initials that could be paired.
    """
    if not a or not b:
        return False
    firstA, lastA = a
    firstB, lastB = b

    # Matching last names
    lastA = iunaccent(lastA)
    lastB = iunaccent(lastB)
    wordsA, _ = split_name_words(lastA)
    wordsB, _ = split_name_words(lastB)
    wordsA = set(wordsA)
    wordsB = set(wordsB)
    all_words = wordsA | wordsB
    if not all_words:
        # Neither last name has any word: nothing to compare, and the
        # ratio below would divide by zero.
        return 0.
    ratio = float(len(wordsA & wordsB)) / len(all_words)

    partsA, _ = split_name_words(firstA)
    partsB, _ = split_name_words(firstB)
    # Keep only the initials. These must be real lists: they are
    # reversed in place and measured with len(), which map objects do
    # not support on Python 3.
    partsA = [p[0] for p in partsA]
    partsB = [p[0] for p in partsB]

    parts = list(zip(partsA, partsB))
    if not all(map(match_first_names, parts)):
        # Try to match in reverse
        partsA.reverse()
        partsB.reverse()
        parts = list(zip(partsA, partsB))
        if not all(map(match_first_names, parts)):
            return 0.

    maxlen = max(len(partsA), len(partsB))
    if maxlen > 0:
        return ratio * len(parts) / maxlen
    return 0.

示例#13

0

显示文件

文件： name.py 项目： jilljenn/dissemin

def shallower_name_similarity(a, b):
    """
    Same as name_similarity, but accepts differences in the last names.
    This heuristics is more costly but is only used to attribute an ORCID
    affiliation to the right author in papers fetched from ORCID.

    Both arguments are (first, last) pairs. False is returned for an
    empty argument. 0. is returned when the two last names contain no
    word at all, when the initials of the first names cannot be matched
    pairwise (forwards or backwards), or when no first-name word exists.
    Otherwise the result is the proportion of last-name words shared by
    both names multiplied by the proportion of initials that were paired.
    """
    if not a or not b:
        return False
    firstA, lastA = a
    firstB, lastB = b

    # Matching last names
    lastA = iunaccent(lastA)
    lastB = iunaccent(lastB)
    wordsA, _ = split_name_words(lastA)
    wordsB, _ = split_name_words(lastB)
    wordsA = set(wordsA)
    wordsB = set(wordsB)
    union = wordsA | wordsB
    if not union:
        # Guard against a division by zero when both word sets are empty.
        return 0.
    ratio = float(len(wordsA & wordsB)) / len(union)

    partsA, _ = split_name_words(firstA)
    partsB, _ = split_name_words(firstB)
    # Lists of initials (not lazy map objects), because they are
    # reversed in place and their length is taken below.
    partsA = [x[0] for x in partsA]
    partsB = [x[0] for x in partsB]

    parts = list(zip(partsA, partsB))
    if not all(map(match_first_names, parts)):
        # Try to match in reverse
        partsA.reverse()
        partsB.reverse()
        parts = list(zip(partsA, partsB))
        if not all(map(match_first_names, parts)):
            return 0.

    maxlen = max(len(partsA), len(partsB))
    if maxlen > 0:
        return ratio*len(parts)/maxlen
    return 0.

示例#14

0

显示文件

 def test_iunaccent(self):
     # An upper-case, accented input must come back lower-cased and
     # without its accent.
     expected = 'bepo forever'
     actual = iunaccent('BÉPO forever')
     self.assertEqual(actual, expected)

示例#15

0

显示文件

def name_similarity(a, b):
    """
    Returns a float: how similar are these two names?

    `a` and `b` are (first, last) pairs. False is returned when either of
    them is empty. 0. is returned when the last names differ after
    iunaccent, when the first-name words cannot be matched pairwise in
    either direction, or when each name spells out a first-name word
    that the other one does not. Otherwise the score is clamped to
    [0, 1].

    Examples:

    >>> int(10*name_similarity(('Robin', 'Ryder'),('Robin', 'Ryder')))
    8
    >>> int(10*name_similarity(('Robin', 'Ryder'),('R.', 'Ryder')))
    4
    >>> int(10*name_similarity(('R.', 'Ryder'),('R.', 'Ryder')))
    4
    >>> int(10*name_similarity(('Robin J.', 'Ryder'),('R.', 'Ryder')))
    3
    >>> int(10*name_similarity(('Robin J.', 'Ryder'),('R. J.', 'Ryder')))
    8
    >>> int(10*name_similarity(('R. J.', 'Ryder'),('J.', 'Ryder')))
    3
    >>> int(10*name_similarity(('Robin', 'Ryder'),('Robin J.', 'Ryder')))
    7
    >>> int(10*name_similarity(('W. Timothy','Gowers'), ('Timothy','Gowers')))
    7
    >>> int(10*name_similarity(('Robin K.','Ryder'), ('Robin J.', 'Ryder')))
    0
    >>> int(10*name_similarity(('Claire', 'Mathieu'),('Claire', 'Kenyon-Mathieu')))
    0
    >>> int(10*name_similarity(('Amanda P.','Brown'),('Patrick','Brown')))
    0
    """

    if not a or not b:
        return False
    (firstA, lastA) = a
    (firstB, lastB) = b
    firstA = iunaccent(firstA)
    firstB = iunaccent(firstB)
    lastA = iunaccent(lastA)
    lastB = iunaccent(lastB)
    if lastA != lastB:
        return 0.
    partsA, _ = split_name_words(firstA)
    partsB, _ = split_name_words(firstB)
    # The pairs are counted and indexed below, so they must be a list:
    # on Python 3 zip() only returns a one-shot iterator.
    parts = list(zip(partsA, partsB))
    if not all(map(match_first_names, parts)):
        # Try to match in reverse
        partsA.reverse()
        partsB.reverse()
        parts = list(zip(partsA, partsB))
        if not all(map(match_first_names, parts)):
            return 0.

    maxlen = max(len(partsA), len(partsB))
    sumscores = 0
    # For each position: (word of A longer than one char, same for B).
    expanded = []
    for i in range(maxlen):
        if i < len(parts):
            sumscores += weight_first_names(parts[i])
            expanded.append((len(partsA[i]) > 1, len(partsB[i]) > 1))
        elif i < len(partsA):
            sumscores -= 0.25 * weight_first_name(partsA[i])
            expanded.append((len(partsA[i]) > 1, False))
        else:
            sumscores -= 0.25 * weight_first_name(partsB[i])
            expanded.append((False, len(partsB[i]) > 1))

    # Make sure expanded first names of A are included in that of B
    # or that of B are included in that of A
    # This prevents ('Amanda P.','Brown') and ('A. Patrick','Brown')
    # from matching
    # (loop variables deliberately do not reuse the names `a` and `b`,
    # which would shadow the parameters)
    if not (all([wa or not wb for wa, wb in expanded])
            or all([wb or not wa for wa, wb in expanded])):
        return 0.

    sumscores = max(min(sumscores, 1), 0)
    return sumscores

示例#16

0

显示文件

文件： name.py 项目： Lysxia/dissemin

def name_similarity(a,b):
    """
    Returns a float: how similar are these two names?

    Each argument is a (first, last) pair. An empty argument yields
    False. The result is 0. when the last names differ once passed
    through iunaccent, when the words of the first names cannot be
    matched position by position (forwards or backwards), or when both
    names spell out a first-name word the other one lacks. In all other
    cases a score clamped to [0, 1] is returned.

    Examples:

    >>> int(10*name_similarity(('Robin', 'Ryder'),('Robin', 'Ryder')))
    8
    >>> int(10*name_similarity(('Robin', 'Ryder'),('R.', 'Ryder')))
    4
    >>> int(10*name_similarity(('R.', 'Ryder'),('R.', 'Ryder')))
    4
    >>> int(10*name_similarity(('Robin J.', 'Ryder'),('R.', 'Ryder')))
    3
    >>> int(10*name_similarity(('Robin J.', 'Ryder'),('R. J.', 'Ryder')))
    8
    >>> int(10*name_similarity(('R. J.', 'Ryder'),('J.', 'Ryder')))
    3
    >>> int(10*name_similarity(('Robin', 'Ryder'),('Robin J.', 'Ryder')))
    7
    >>> int(10*name_similarity(('W. Timothy','Gowers'), ('Timothy','Gowers')))
    7
    >>> int(10*name_similarity(('Robin K.','Ryder'), ('Robin J.', 'Ryder')))
    0
    >>> int(10*name_similarity(('Claire', 'Mathieu'),('Claire', 'Kenyon-Mathieu')))
    0
    >>> int(10*name_similarity(('Amanda P.','Brown'),('Patrick','Brown')))
    0
    """

    if not a or not b:
        return False
    (firstA,lastA) = a
    (firstB,lastB) = b
    firstA = iunaccent(firstA)
    firstB = iunaccent(firstB)
    lastA = iunaccent(lastA)
    lastB = iunaccent(lastB)
    if lastA != lastB:
        return 0.
    partsA, _ = split_name_words(firstA)
    partsB, _ = split_name_words(firstB)
    # Build a list rather than a bare zip object: len(parts) and
    # parts[i] are used below and fail on a Python 3 iterator.
    parts = list(zip(partsA, partsB))
    if not all(map(match_first_names, parts)):
        # Try to match in reverse
        partsA.reverse()
        partsB.reverse()
        parts = list(zip(partsA, partsB))
        if not all(map(match_first_names, parts)):
            return 0.

    maxlen = max(len(partsA), len(partsB))
    sumscores = 0
    # One entry per position: is the word of A / of B longer than one
    # character?
    expanded = []
    for i in range(maxlen):
        if i < len(parts):
            sumscores += weight_first_names(parts[i])
            expanded.append((len(partsA[i])>1, len(partsB[i])>1))
        elif i < len(partsA):
            sumscores -= 0.25*weight_first_name(partsA[i])
            expanded.append((len(partsA[i])>1, False))
        else:
            sumscores -= 0.25*weight_first_name(partsB[i])
            expanded.append((False, len(partsB[i])>1))

    # Make sure expanded first names of A are included in that of B
    # or that of B are included in that of A
    # This prevents ('Amanda P.','Brown') and ('A. Patrick','Brown')
    # from matching
    # (the comprehension variables are named wa/wb so that they do not
    # shadow the parameters a and b)
    if not (all([wa or not wb for wa,wb in expanded]) or
            all([wb or not wa for wa,wb in expanded])):
        return 0.

    sumscores = max(min(sumscores, 1), 0)
    return sumscores