예제 #1
0
파일: _qgram.py 프로젝트: chrislit/abydos
    def fingerprint(self, phrase):
        """Compute the q-gram fingerprint of a phrase.

        The phrase is stripped, lowercased, NFKD-normalized and reduced to
        its alphanumeric characters. That string is handed to the instance's
        tokenizer, and the set of tokens it yields is sorted and joined with
        the instance's joiner string.

        Parameters
        ----------
        phrase : str
            The string from which to calculate the q-gram fingerprint

        Returns
        -------
        str
            The q-gram fingerprint of the phrase

        Examples
        --------
        >>> qf = QGram()
        >>> qf.fingerprint('The quick brown fox jumped over the lazy dog.')
        'azbrckdoedeleqerfoheicjukblampnfogovowoxpequrortthuiumvewnxjydzy'
        >>> qf.fingerprint('Christopher')
        'cherhehrisopphristto'
        >>> qf.fingerprint('Niall')
        'aliallni'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # Canonicalize whitespace and case before Unicode decomposition.
        cleaned = text_type(phrase.strip().lower())
        decomposed = unicode_normalize('NFKD', cleaned)

        # Drop everything that is neither a letter nor a digit (this also
        # removes the combining marks produced by the decomposition).
        alnum_only = ''.join([ch for ch in decomposed if ch.isalnum()])

        # Sorting the token set makes the result independent of token order.
        qgrams = self._tokenizer.tokenize(alnum_only).get_set()
        return self._joiner.join(sorted(qgrams))
예제 #2
0
파일: _qgram.py 프로젝트: LEFTazs/abydos
    def fingerprint(self, phrase):
        """Build the q-gram fingerprint for the given phrase.

        After trimming, lowercasing and NFKD normalization, only alphanumeric
        characters are retained. The instance's tokenizer splits the result,
        and its distinct tokens are emitted in sorted order, separated by the
        instance's joiner.

        Parameters
        ----------
        phrase : str
            The string from which to calculate the q-gram fingerprint

        Returns
        -------
        str
            The q-gram fingerprint of the phrase

        Examples
        --------
        >>> qf = QGram()
        >>> qf.fingerprint('The quick brown fox jumped over the lazy dog.')
        'azbrckdoedeleqerfoheicjukblampnfogovowoxpequrortthuiumvewnxjydzy'
        >>> qf.fingerprint('Christopher')
        'cherhehrisopphristto'
        >>> qf.fingerprint('Niall')
        'aliallni'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        normalized = unicode_normalize('NFKD', phrase.strip().lower())

        # Punctuation, whitespace and combining marks are all discarded here.
        letters_and_digits = [ch for ch in normalized if ch.isalnum()]

        token_set = self._tokenizer.tokenize(
            ''.join(letters_and_digits)
        ).get_set()

        ordered_tokens = sorted(token_set)
        return self._joiner.join(ordered_tokens)
예제 #3
0
def qgram_fingerprint(phrase, qval=2, start_stop='', joiner=''):
    """Return Q-Gram fingerprint.

    A q-gram fingerprint is a string consisting of all of the unique q-grams
    in a string, alphabetized & concatenated. This fingerprint is described at
    :cite:`OpenRefine:2012`.

    Before q-grams are extracted, the phrase is stripped, lowercased,
    NFKD-normalized and reduced to its alphanumeric characters.

    :param str phrase: the string from which to calculate the q-gram
        fingerprint
    :param int qval: the length of each q-gram (by default 2)
    :param str start_stop: the start & stop symbol(s) to concatenate on either
        end of the phrase, as defined in abydos.util.qgram()
    :param str joiner: the string that will be placed between each q-gram
    :returns: the q-gram fingerprint of the phrase
    :rtype: str

    >>> qgram_fingerprint('The quick brown fox jumped over the lazy dog.')
    'azbrckdoedeleqerfoheicjukblampnfogovowoxpequrortthuiumvewnxjydzy'
    >>> qgram_fingerprint('Christopher')
    'cherhehrisopphristto'
    >>> qgram_fingerprint('Niall')
    'aliallni'
    """
    normalized = unicode_normalize('NFKD', text_type(phrase.strip().lower()))
    alnum_only = ''.join([ch for ch in normalized if ch.isalnum()])

    # Iterating the QGrams object yields each distinct q-gram once.
    grams = QGrams(alnum_only, qval, start_stop)
    return joiner.join(sorted(grams))
예제 #4
0
def soundex_br(word, max_length=4, zero_pad=True):
    """Return the SoundexBR encoding of a word.

    This is based on :cite:`Marcelino:2015`.

    The word is uppercased, NFKD-decomposed and stripped of everything but
    A-Z. The leading letter is then adjusted by a handful of special cases
    (WA, K/C/G before certain vowels, Y, H) and the remaining letters are
    translated to digits, de-duplicated and stripped of zeros.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 4)
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        max_length string
    :returns: the SoundexBR code
    :rtype: str

    >>> soundex_br('Oliveira')
    'O416'
    >>> soundex_br('Almeida')
    'A453'
    >>> soundex_br('Barbosa')
    'B612'
    >>> soundex_br('Araújo')
    'A620'
    >>> soundex_br('Gonçalves')
    'G524'
    >>> soundex_br('Goncalves')
    'G524'
    """
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    digit_for = {
        ord(letter): digit
        for letter, digit in zip(alphabet, '01230120022455012623010202')
    }

    # uppercase, decompose, and keep only plain A-Z
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = ''.join([ch for ch in word if ch in alphabet])

    # Work out the leading character of the code from the first two letters.
    head, follower = word[:1], word[1:2]
    if head == 'W' and follower == 'A':
        first = 'V'
    elif head == 'K' and follower in {'A', 'O', 'U'}:
        first = 'C'
    elif head == 'C' and follower in {'I', 'E'}:
        first = 'S'
    elif head == 'G' and follower in {'E', 'I'}:
        first = 'J'
    elif head == 'Y':
        first = 'I'
    elif head == 'H':
        # An initial H is dropped and the next letter takes its place.
        first = follower
        word = word[1:]
    else:
        first = head

    sdx = first + word[1:].translate(digit_for)
    sdx = _delete_consecutive_repeats(sdx).replace('0', '')

    if zero_pad:
        # Over-pad; the slice below trims to the requested length.
        sdx += '0' * max_length

    return sdx[:max_length]
예제 #5
0
    def fingerprint(self, phrase):
        """Compute the string fingerprint of a phrase.

        The phrase is trimmed, lowercased and NFKD-normalized; characters
        that are neither alphanumeric nor whitespace are removed. The
        remaining text is split on whitespace, and the distinct words are
        sorted and joined with the instance's joiner string.

        Parameters
        ----------
        phrase : str
            The string from which to calculate the fingerprint

        Returns
        -------
        str
            The fingerprint of the phrase

        Example
        -------
        >>> sf = String()
        >>> sf.fingerprint('The quick brown fox jumped over the lazy dog.')
        'brown dog fox jumped lazy over quick the'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        normalized = unicode_normalize('NFKD', phrase.strip().lower())

        # Keep word characters and the whitespace that separates them.
        kept = ''.join(
            ch for ch in normalized if ch.isalnum() or ch.isspace()
        )

        # A set removes duplicate words; sorting fixes their order.
        distinct_words = set(kept.split())
        return self._joiner.join(sorted(distinct_words))
예제 #6
0
    def encode(self, word):
        """Return the Roger Root code for a word.

        The word is uppercased, NFKD-decomposed and filtered to the
        characters in ``self._uc_set``. The longest matching initial pattern
        (up to four letters) supplies the start of the code; the rest of the
        word is then consumed by the longest matching medial pattern at each
        position.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The Roger Root code

        Examples
        --------
        >>> pe = RogerRoot()
        >>> pe.encode('Christopher')
        '06401'
        >>> pe.encode('Niall')
        '02500'
        >>> pe.encode('Smith')
        '00310'
        >>> pe.encode('Schmidt')
        '06310'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # uppercase, decompose, and keep only the permitted characters
        decomposed = unicode_normalize('NFKD', word.upper())
        word = ''.join([ch for ch in decomposed if ch in self._uc_set])

        pieces = []
        pos = 0

        # Initial pattern: try the longest prefix first.
        for size in (4, 3, 2, 1):
            table = self._init_patterns[size]
            prefix = word[:size]
            if prefix in table:
                pieces.append(table[prefix])
                pos = size
                break

        # Medial patterns: consume the remainder, again longest match first.
        end = len(word)
        while pos < end:
            for size in (4, 3, 2, 1):  # pragma: no branch
                table = self._med_patterns[size]
                chunk = word[pos:pos + size]
                if chunk in table:
                    pieces.append(table[chunk])
                    pos += size
                    break

        code = self._delete_consecutive_repeats(''.join(pieces))
        code = code.replace('*', '')

        if self._zero_pad:
            # Over-pad; the slice below trims to the configured length.
            code += '0' * self._max_length

        return code[:self._max_length]
예제 #7
0
파일: _string.py 프로젝트: chrislit/abydos
    def fingerprint(self, phrase):
        """Return the string fingerprint of a phrase.

        Normalization consists of stripping, lowercasing and NFKD
        decomposition, followed by removal of every character that is neither
        alphanumeric nor whitespace. The distinct whitespace-separated words
        that remain are sorted and joined with the instance's joiner.

        Parameters
        ----------
        phrase : str
            The string from which to calculate the fingerprint

        Returns
        -------
        str
            The fingerprint of the phrase

        Example
        -------
        >>> sf = String()
        >>> sf.fingerprint('The quick brown fox jumped over the lazy dog.')
        'brown dog fox jumped lazy over quick the'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        lowered = text_type(phrase.strip().lower())
        decomposed = unicode_normalize('NFKD', lowered)

        # Discard punctuation and combining marks, keep word separators.
        filtered = ''.join(
            ch for ch in decomposed if ch.isalnum() or ch.isspace()
        )

        # De-duplicate the words, then put them in a canonical order.
        words = sorted(set(filtered.split()))
        return self._joiner.join(words)
예제 #8
0
파일: _string.py 프로젝트: kgpsrini/abydos
    def fingerprint(self, phrase, joiner=' '):
        """Return the string fingerprint of a phrase.

        The phrase is stripped, lowercased and NFKD-normalized, and any
        character that is neither alphanumeric nor whitespace is dropped.
        The distinct words of the result are sorted and joined with
        ``joiner``.

        Parameters
        ----------
        phrase : str
            The string from which to calculate the fingerprint
        joiner : str
            The string that will be placed between each word

        Returns
        -------
        str
            The fingerprint of the phrase

        Example
        -------
        >>> sf = String()
        >>> sf.fingerprint('The quick brown fox jumped over the lazy dog.')
        'brown dog fox jumped lazy over quick the'

        """
        normalized = unicode_normalize(
            'NFKD', text_type(phrase.strip().lower())
        )

        # Only letters, digits and whitespace survive this filter.
        filtered = ''.join(
            ch for ch in normalized if ch.isalnum() or ch.isspace()
        )

        distinct_words = set(filtered.split())
        return joiner.join(sorted(distinct_words))
예제 #9
0
    def encode(self, word, max_length=5, zero_pad=True):
        """Return the Roger Root code for a word.

        The word is uppercased, NFKD-decomposed, has any 'ß' expanded to
        'SS', and is filtered to the characters in ``self._uc_set``. The
        longest matching initial pattern (up to four letters) starts the
        code; the remainder is consumed by the longest matching medial
        pattern at each position.

        Parameters
        ----------
        word : str
            The word to transform
        max_length : int
            The maximum length (default 5) of the code to return
        zero_pad : bool
            Pad the end of the return value with 0s to achieve a max_length
            string

        Returns
        -------
        str
            The Roger Root code

        Examples
        --------
        >>> roger_root('Christopher')
        '06401'
        >>> roger_root('Niall')
        '02500'
        >>> roger_root('Smith')
        '00310'
        >>> roger_root('Schmidt')
        '06310'

        """
        # uppercase, decompose, expand eszett, keep only permitted characters
        prepared = unicode_normalize('NFKD', text_type(word.upper()))
        prepared = prepared.replace('ß', 'SS')
        word = ''.join([ch for ch in prepared if ch in self._uc_set])

        pieces = []
        pos = 0

        # Initial pattern: longest prefix wins.
        for size in (4, 3, 2, 1):
            table = self._init_patterns[size]
            prefix = word[:size]
            if prefix in table:
                pieces.append(table[prefix])
                pos = size
                break

        # Medial patterns: longest match at each position.
        end = len(word)
        while pos < end:
            for size in (4, 3, 2, 1):  # pragma: no branch
                table = self._med_patterns[size]
                chunk = word[pos:pos + size]
                if chunk in table:
                    pieces.append(table[chunk])
                    pos += size
                    break

        code = self._delete_consecutive_repeats(''.join(pieces))
        code = code.replace('*', '')

        if zero_pad:
            # Over-pad; the slice below trims to max_length.
            code += '0' * max_length

        return code[:max_length]
예제 #10
0
    def encode(self, word):
        """Return the SoundD code.

        The word is uppercased, NFKD-decomposed, has 'ß' expanded to 'SS' and
        is filtered to the characters in ``self._uc_set``. Certain initial
        letter pairs are simplified, the sequences DGE, DGI and GH are
        rewritten, and the result is translated through ``self._trans``,
        de-duplicated and stripped of zeros. Unless ``self._max_length`` is
        -1, the code is zero-padded or truncated to exactly that length.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The SoundD code

        Examples
        --------
        >>> pe = SoundD()
        >>> pe.encode('Gough')
        '2000'
        >>> pe.encode('pneuma')
        '5500'
        >>> pe.encode('knight')
        '5300'
        >>> pe.encode('trice')
        '3620'
        >>> pe.encode('judge')
        '2200'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        prepared = unicode_normalize('NFKD', text_type(word.upper()))
        prepared = prepared.replace('ß', 'SS')
        word = ''.join([ch for ch in prepared if ch in self._uc_set])

        # Simplify the word-initial cluster.
        if word.startswith(('KN', 'GN', 'PN', 'AC', 'WR')):
            word = word[1:]
        elif word.startswith('X'):
            word = 'S' + word[1:]
        elif word.startswith('WH'):
            word = 'W' + word[2:]

        # Multi-letter sequences are rewritten to digits before the
        # per-letter translation; the order of these replacements matters.
        for sequence, digits in (('DGE', '20'), ('DGI', '20'), ('GH', '0')):
            word = word.replace(sequence, digits)

        code = self._delete_consecutive_repeats(word.translate(self._trans))
        code = code.replace('0', '')

        limit = self._max_length
        if limit != -1:
            # Pad short codes with zeros, cut long ones.
            code = code.ljust(limit, '0')[:limit]

        return code
예제 #11
0
def normalize_column(name):
    """Turn ``name`` into a column identifier of at most 63 UTF-8 bytes.

    Underscores are turned into spaces, the text is passed through
    ``text_normalize``, spaces are turned back into underscores, and the
    result is NFKC-normalized. Trailing characters are then removed one at a
    time until the UTF-8 encoding fits the length limit.
    """
    spaced = name.replace('_', ' ')
    cleaned = text_normalize(spaced).replace(' ', '_')
    column = unicode_normalize('NFKC', cleaned)

    # column names can be 63 *bytes* max in postgresql; trim whole characters
    # so that a multi-byte character is never split
    while len(column.encode('utf-8')) > 63:
        column = column[:-1]
    return column
예제 #12
0
    def encode(self, word, max_length=4):
        """Return the SoundD code.

        The word is uppercased, NFKD-decomposed, has 'ß' expanded to 'SS' and
        is filtered to the characters in ``self._uc_set``. Certain initial
        letter pairs are simplified, the sequences DGE, DGI and GH are
        rewritten, and the result is translated through ``self._trans``,
        de-duplicated and stripped of zeros. Unless ``max_length`` is -1, the
        code is zero-padded or truncated to exactly that length.

        Parameters
        ----------
        word : str
            The word to transform
        max_length : int
            The length of the code returned (defaults to 4)

        Returns
        -------
        str
            The SoundD code

        Examples
        --------
        >>> sound_d('Gough')
        '2000'
        >>> sound_d('pneuma')
        '5500'
        >>> sound_d('knight')
        '5300'
        >>> sound_d('trice')
        '3620'
        >>> sound_d('judge')
        '2200'

        """
        prepared = unicode_normalize('NFKD', text_type(word.upper()))
        prepared = prepared.replace('ß', 'SS')
        word = ''.join([ch for ch in prepared if ch in self._uc_set])

        # Simplify the word-initial cluster.
        if word.startswith(('KN', 'GN', 'PN', 'AC', 'WR')):
            word = word[1:]
        elif word.startswith('X'):
            word = 'S' + word[1:]
        elif word.startswith('WH'):
            word = 'W' + word[2:]

        # Rewrite multi-letter sequences before the per-letter translation;
        # the replacements are applied in this order.
        for sequence, digits in (('DGE', '20'), ('DGI', '20'), ('GH', '0')):
            word = word.replace(sequence, digits)

        code = self._delete_consecutive_repeats(word.translate(self._trans))
        code = code.replace('0', '')

        if max_length != -1:
            # Pad short codes with zeros, cut long ones.
            code = code.ljust(max_length, '0')[:max_length]

        return code
예제 #13
0
def sound_d(word, max_length=4):
    """Return the SoundD code.

    SoundD is defined in :cite:`Varol:2012`.

    The word is uppercased, NFKD-decomposed, has 'ß' expanded to 'SS' and is
    filtered to A-Z. Certain initial letter pairs are simplified, the
    sequences DGE, DGI and GH are rewritten, and the result is translated to
    digits, de-duplicated and stripped of zeros. Unless ``max_length`` is -1,
    the code is zero-padded or truncated to exactly that length.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 4)
    :returns: the SoundD code
    :rtype: str

    >>> sound_d('Gough')
    '2000'
    >>> sound_d('pneuma')
    '5500'
    >>> sound_d('knight')
    '5300'
    >>> sound_d('trice')
    '3620'
    >>> sound_d('judge')
    '2200'
    """
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    digit_for = {
        ord(letter): digit
        for letter, digit in zip(alphabet, '01230120022455012623010202')
    }

    prepared = unicode_normalize('NFKD', text_type(word.upper()))
    prepared = prepared.replace('ß', 'SS')
    word = ''.join([ch for ch in prepared if ch in alphabet])

    # Simplify the word-initial cluster.
    if word.startswith(('KN', 'GN', 'PN', 'AC', 'WR')):
        word = word[1:]
    elif word.startswith('X'):
        word = 'S' + word[1:]
    elif word.startswith('WH'):
        word = 'W' + word[2:]

    # Rewrite multi-letter sequences before the per-letter translation; the
    # replacements are applied in this order.
    for sequence, digits in (('DGE', '20'), ('DGI', '20'), ('GH', '0')):
        word = word.replace(sequence, digits)

    code = _delete_consecutive_repeats(word.translate(digit_for))
    code = code.replace('0', '')

    if max_length != -1:
        # Pad short codes with zeros, cut long ones.
        code = code.ljust(max_length, '0')[:max_length]

    return code
예제 #14
0
파일: _waahlin.py 프로젝트: chrislit/abydos
    def encode(self, word, alphabetic=False):
        """Return the Wåhlin code for a word.

        The word is uppercased, NFC-normalized and has 'ß' expanded to 'SS'.
        Without a secondary encoder, the whole word is consumed piece by
        piece through ``self._encode_next``. With one, only the first piece
        is produced that way and the remainder is delegated to the secondary
        encoder.

        Parameters
        ----------
        word : str
            The word to transform
        alphabetic : bool
            If True, the encoder will apply its alphabetic form (.encode_alpha
            rather than .encode)

        Returns
        -------
        str
            The Wåhlin code value

        Examples
        --------
        >>> pe = Waahlin()
        >>> pe.encode('Christopher')
        'KRISTOFER'
        >>> pe.encode('Niall')
        'NJALL'
        >>> pe.encode('Smith')
        'SMITH'
        >>> pe.encode('Schmidt')
        '*MIDT'


        .. versionadded:: 0.4.0

        """
        # uppercase, compose (NFC), and expand eszett
        word = unicode_normalize('NFC', text_type(word.upper()))
        word = word.replace('ß', 'SS')
        if not word:
            return ''

        if self._encoder is not None:
            # Encode the leading piece here and hand the rest to the
            # secondary encoder.
            code, remainder = self._encode_next(word)
            if alphabetic:
                return code + self._encoder.encode_alpha(remainder)
            return code + self._encoder.encode(remainder)

        # No secondary encoder: consume the whole word piece by piece.
        parts = []
        while word:
            part, word = self._encode_next(word)
            parts.append(part)
        return ''.join(parts)
예제 #15
0
    def encode(self, word: str) -> str:
        """Return the Russell Index (as a string of digits) of a word.

        The word is uppercased and NFKD-decomposed, every 'GH' is removed and
        trailing S/Z characters are stripped. The characters in
        ``self._uc_set`` are then translated through ``self._trans``; every
        '1' after the first is dropped and runs of repeated characters are
        collapsed.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The Russell Index value

        Examples
        --------
        >>> pe = RussellIndex()
        >>> pe.encode('Christopher')
        '3813428'
        >>> pe.encode('Niall')
        '715'
        >>> pe.encode('Smith')
        '3614'
        >>> pe.encode('Schmidt')
        '3614'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class
        .. versionchanged:: 0.6.0
            Made return a str

        """
        prepared = unicode_normalize('NFKD', word.upper())
        # rule 3: discard gh, and discard /[sz]$/
        prepared = prepared.replace('GH', '').rstrip('SZ')

        # translate according to Russell's mapping
        kept = ''.join([ch for ch in prepared if ch in self._uc_set])
        digits = kept.translate(self._trans)

        # keep the first 1 (if any) and drop all later ones
        head, first_one, tail = digits.partition('1')
        digits = head + first_one + tail.replace('1', '')

        # remove repeating characters
        return self._delete_consecutive_repeats(digits)
예제 #16
0
    def encode(self, word, max_length=-1, zero_pad=False, retain_vowels=False):
        """Return the Refined Soundex code for a word.

        The word is uppercased, NFKD-decomposed, has 'ß' expanded to 'SS' and
        is filtered to the characters in ``self._uc_set``. The code is the
        first letter followed by the translation of the whole word
        (including that first letter) through ``self._trans``, with runs of
        repeated characters collapsed.

        Parameters
        ----------
        word : str
            The word to transform
        max_length : int
            The length of the code returned (defaults to unlimited)
        zero_pad : bool
            Pad the end of the return value with 0s to achieve a max_length
            string
        retain_vowels : bool
            Retain vowels (as 0) in the resulting code

        Returns
        -------
        str
            The Refined Soundex value

        Examples
        --------
        >>> pe = RefinedSoundex()
        >>> pe.encode('Christopher')
        'C393619'
        >>> pe.encode('Niall')
        'N87'
        >>> pe.encode('Smith')
        'S386'
        >>> pe.encode('Schmidt')
        'S386'

        """
        # uppercase, decompose, expand eszett, keep only permitted characters
        prepared = unicode_normalize('NFKD', text_type(word.upper()))
        prepared = prepared.replace('ß', 'SS')
        word = ''.join([ch for ch in prepared if ch in self._uc_set])

        # apply the Soundex algorithm
        raw_code = word[:1] + word.translate(self._trans)
        sdx = self._delete_consecutive_repeats(raw_code)
        if not retain_vowels:
            sdx = sdx.replace('0', '')  # Delete vowels, H, W, Y

        # A non-positive max_length means "unlimited".
        if max_length > 0:
            padding = '0' * max_length if zero_pad else ''
            sdx = (sdx + padding)[:max_length]

        return sdx
예제 #17
0
def normalize_text(text: str) -> str:
    """Normalize free text with a fixed sequence of regex substitutions.

    The steps are applied in this order:

    1. NFC-normalize and lowercase the text.
    2. Remove fenced code blocks (text between triple backticks, which may
       span several lines).
    3. Remove URLs, i.e. ``http`` followed by non-whitespace characters.
       This happens *before* separators are replaced: once ``:`` and ``/``
       have become spaces, the URL pattern can no longer match a whole URL.
    4. Collapse every run of whitespace (newlines included) to one space.
    5. Replace ``-``, ``_``, ``:`` and ``/`` with a space.
    6. Collapse runs of dots to a single dot.
    7. Replace ``?``, ``!``, ``;`` and ``…`` with a dot.

    Parameters
    ----------
    text : str
        The text to normalize.

    Returns
    -------
    str
        The normalized text.
    """
    text = unicode_normalize("NFC", text).lower()

    # DOTALL lets '.' cross line breaks inside a fenced block.
    text = re.sub(r"```.*?```", "", text, flags=re.DOTALL)

    # Strip URLs while their ':' and '/' characters are still intact.
    text = re.sub(r"http\S+", "", text)

    # Newlines are whitespace too, so none survive this step.
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[-_:/]", " ", text)

    text = re.sub(r"\.+", ".", text)
    text = re.sub(r"[?!;…]", ".", text)
    return text
예제 #18
0
    def encode(self, word):
        """Return the Russell Index (integer output) of a word.

        The word is uppercased and NFKD-decomposed, 'ß' is expanded to 'SS',
        every 'GH' is removed and trailing S/Z characters are stripped. The
        characters in ``self._uc_set`` are then translated through
        ``self._trans``; every '1' after the first is dropped and runs of
        repeated characters are collapsed.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        int
            The Russell Index value; ``float('NaN')`` is returned instead
            when no digits are produced

        Examples
        --------
        >>> pe = RussellIndex()
        >>> pe.encode('Christopher')
        3813428
        >>> pe.encode('Niall')
        715
        >>> pe.encode('Smith')
        3614
        >>> pe.encode('Schmidt')
        3614


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        prepared = unicode_normalize('NFKD', text_type(word.upper()))
        prepared = prepared.replace('ß', 'SS')
        # rule 3: discard gh, and discard /[sz]$/
        prepared = prepared.replace('GH', '').rstrip('SZ')

        # translate according to Russell's mapping
        kept = ''.join([ch for ch in prepared if ch in self._uc_set])
        digits = kept.translate(self._trans)

        # keep the first 1 (if any) and drop all later ones
        head, first_one, tail = digits.partition('1')
        digits = head + first_one + tail.replace('1', '')

        # remove repeating characters
        digits = self._delete_consecutive_repeats(digits)

        # return as an int; an empty code has no integer value
        if not digits:
            return float('NaN')
        return int(digits)
예제 #19
0
파일: _waahlin.py 프로젝트: wish2018/abydos
    def encode(self, word, alphabetic=False):
        """Return the Wåhlin code for a word.

        The word is uppercased and NFC-normalized. Without a secondary
        encoder, the whole word is consumed piece by piece through
        ``self._encode_next``. With one, only the first piece is produced
        that way and the remainder is delegated to the secondary encoder.

        Parameters
        ----------
        word : str
            The word to transform
        alphabetic : bool
            If True, the encoder will apply its alphabetic form (.encode_alpha
            rather than .encode)

        Returns
        -------
        str
            The Wåhlin code value

        Examples
        --------
        >>> pe = Waahlin()
        >>> pe.encode('Christopher')
        'KRISTOFER'
        >>> pe.encode('Niall')
        'NJALL'
        >>> pe.encode('Smith')
        'SMITH'
        >>> pe.encode('Schmidt')
        '*MIDT'


        .. versionadded:: 0.4.0

        """
        # uppercase and compose (NFC)
        word = unicode_normalize('NFC', word.upper())
        if not word:
            return ''

        secondary = self._encoder
        if secondary is not None:
            # Encode the leading piece here and hand the rest to the
            # secondary encoder.
            code, remainder = self._encode_next(word)
            if alphabetic:
                return code + secondary.encode_alpha(remainder)
            return code + secondary.encode(remainder)

        # No secondary encoder: consume the whole word piece by piece.
        parts = []
        while word:
            part, word = self._encode_next(word)
            parts.append(part)
        return ''.join(parts)
예제 #20
0
        def _cost(s, t):
            """Return the entry of ``self._weights`` for aligning s with t.

            Only the final characters of ``s`` and ``t`` are compared. A
            trailing '-' on ``s`` (checked first) or on ``t`` selects weight
            6 when it is doubled ('--') and weight 7 otherwise. For all other
            inputs, weights 0-5 are chosen from the identity and
            vowel/consonant/glide class of the two characters.
            """
            # Gap handling takes precedence; s is examined before t.
            for seq in (s, t):
                if seq[-1:] == '-':
                    if seq[-2:] == '--':
                        return self._weights[6]
                    return self._weights[7]

            # Compare the last composed (NFC) character of each side.
            s_last = unicode_normalize('NFC', s)[-1:]
            t_last = unicode_normalize('NFC', t)[-1:]

            if s_last == t_last:
                if s_last in self._consonants or s_last in self._glides:
                    return self._weights[0]
                return self._weights[1]

            # The i/y and u/w pairs share a weight with the same-base-vowel
            # case below.
            if ''.join(sorted((s_last, t_last))) in {'iy', 'uw'}:
                return self._weights[2]

            # First code point of the decomposition, i.e. the character
            # without any combining marks that follow it.
            s_base = unicode_normalize('NFKD', s_last)[0]
            t_base = unicode_normalize('NFKD', t_last)[0]

            if s_base == t_base and s_last in self._vowels:
                return self._weights[2]

            if s_base in self._vowels and t_base in self._vowels:
                return self._weights[3]
            if s_base in self._consonants and t_base in self._consonants:
                return self._weights[4]

            return self._weights[5]
예제 #21
0
    def encode(self, word):
        """Return the Refined Soundex code for a word.

        The word is uppercased, NFKD-decomposed, has 'ß' expanded to 'SS' and
        is filtered to the characters in ``self._uc_set``. The code is the
        first letter followed by the translation of the remaining letters
        through ``self._trans``, with runs of repeated characters collapsed.
        Vowel removal, padding and truncation follow the instance's
        ``_retain_vowels``, ``_zero_pad`` and ``_max_length`` settings.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The Refined Soundex value

        Examples
        --------
        >>> pe = RefinedSoundex()
        >>> pe.encode('Christopher')
        'C93619'
        >>> pe.encode('Niall')
        'N7'
        >>> pe.encode('Smith')
        'S86'
        >>> pe.encode('Schmidt')
        'S386'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # uppercase, decompose, expand eszett, keep only permitted characters
        prepared = unicode_normalize('NFKD', text_type(word.upper()))
        prepared = prepared.replace('ß', 'SS')
        word = ''.join([ch for ch in prepared if ch in self._uc_set])

        # apply the Soundex algorithm: the first letter is kept verbatim
        initial, rest = word[:1], word[1:]
        sdx = self._delete_consecutive_repeats(
            initial + rest.translate(self._trans)
        )
        if not self._retain_vowels:
            sdx = sdx.replace('0', '')  # Delete vowels, H, W, Y

        # A non-positive maximum length means "unlimited".
        limit = self._max_length
        if limit > 0:
            padding = '0' * limit if self._zero_pad else ''
            sdx = (sdx + padding)[:limit]

        return sdx
예제 #22
0
    def encode(self, word):
        """Return the PhoneticSpanish coding of word.

        The word is uppercased, NFKD-decomposed and filtered to the
        characters in ``self._uc_set``. Doubled L and doubled R are then
        merged into a single letter, and the result is translated through
        ``self._trans``. When ``self._max_length`` is positive, the code is
        zero-padded and truncated to exactly that length.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The PhoneticSpanish code

        Examples
        --------
        >>> pe = PhoneticSpanish()
        >>> pe.encode('Perez')
        '094'
        >>> pe.encode('Martinez')
        '69364'
        >>> pe.encode('Gutierrez')
        '8394'
        >>> pe.encode('Santiago')
        '4638'
        >>> pe.encode('Nicolás')
        '6454'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # uppercase, normalize, decompose, and keep only permitted characters
        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = ''.join(c for c in word if c in self._uc_set)

        # merge repeated Ls & Rs (the R case previously replaced 'R' with
        # 'R', which merged nothing)
        word = word.replace('LL', 'L')
        word = word.replace('RR', 'R')

        # apply the Soundex algorithm
        sdx = word.translate(self._trans)

        if self._max_length > 0:
            # Over-pad with zeros, then cut to the configured length.
            sdx = (sdx + ('0' * self._max_length))[: self._max_length]

        return sdx
예제 #23
0
    def encode(self, word):
        """Return the Refined Soundex code for a word.

        After uppercasing, NFKD decomposition, expansion of 'ß' to 'SS' and
        filtering to ``self._uc_set``, the first letter is kept as-is and the
        remaining letters are translated through ``self._trans``. Repeated
        characters are collapsed; zeros are removed unless
        ``self._retain_vowels`` is set. A positive ``self._max_length``
        truncates the code, optionally zero-padding it first when
        ``self._zero_pad`` is set.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The Refined Soundex value

        Examples
        --------
        >>> pe = RefinedSoundex()
        >>> pe.encode('Christopher')
        'C93619'
        >>> pe.encode('Niall')
        'N7'
        >>> pe.encode('Smith')
        'S86'
        >>> pe.encode('Schmidt')
        'S386'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # uppercase, decompose, expand eszett, keep only permitted characters
        text = unicode_normalize('NFKD', text_type(word.upper()))
        text = text.replace('ß', 'SS')
        letters = ''.join([ch for ch in text if ch in self._uc_set])

        # apply the Soundex algorithm; only the tail is turned into digits
        translated_tail = letters[1:].translate(self._trans)
        sdx = self._delete_consecutive_repeats(letters[:1] + translated_tail)
        if not self._retain_vowels:
            sdx = sdx.replace('0', '')  # Delete vowels, H, W, Y

        # Non-positive lengths leave the code untouched.
        if self._max_length <= 0:
            return sdx

        if self._zero_pad:
            # Over-pad; the slice below trims to the configured length.
            sdx += '0' * self._max_length
        return sdx[:self._max_length]
예제 #24
0
def refined_soundex(word, max_length=-1, zero_pad=False,
                    retain_vowels=False):
    """Return the Refined Soundex code for a word.

    This is Soundex, but with more character classes. It was defined at
    :cite:`Boyce:1998`.

    In this implementation the initial letter is emitted verbatim *and* is
    also encoded as part of the digit sequence that follows it.

    :param word: the word to transform
    :param max_length: the length of the code returned (defaults to unlimited)
    :param zero_pad: pad the end of the return value with 0s to achieve a
        max_length string
    :param retain_vowels: retain vowels (as 0) in the resulting code
    :returns: the Refined Soundex value
    :rtype: str

    >>> refined_soundex('Christopher')
    'C393619'
    >>> refined_soundex('Niall')
    'N87'
    >>> refined_soundex('Smith')
    'S386'
    >>> refined_soundex('Schmidt')
    'S386'
    """
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    digit_table = dict(zip((ord(letter) for letter in alphabet),
                           '01360240043788015936020505'))
    allowed = set(alphabet)

    # upper-case, decompose, expand eszett, then drop everything but A-Z
    cleaned = unicode_normalize('NFKD', text_type(word.upper()))
    cleaned = cleaned.replace('ß', 'SS')
    cleaned = ''.join(ch for ch in cleaned if ch in allowed)

    # literal initial + encoding of the whole word, then collapse runs
    code = _delete_consecutive_repeats(
        cleaned[:1] + cleaned.translate(digit_table))
    if not retain_vowels:
        code = code.replace('0', '')  # '0' stands for vowels, H, W, Y

    if max_length <= 0:
        return code
    if zero_pad:
        code += '0' * max_length
    return code[:max_length]
예제 #25
0
파일: _fonem.py 프로젝트: chrislit/abydos
    def encode(self, word):
        """Compute the FONEM code of a word.

        The word is upper-cased, NFKD-normalized, has its AE/OE ligatures
        expanded and is filtered to the instance's permitted letters; the
        rewrite rules are then applied one after another in the order given
        by ``self._rule_order``.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The FONEM code

        Examples
        --------
        >>> pe = FONEM()
        >>> pe.encode('Marchand')
        'MARCHEN'
        >>> pe.encode('Beaulieu')
        'BOLIEU'
        >>> pe.encode('Beaumont')
        'BOMON'
        >>> pe.encode('Legrand')
        'LEGREN'
        >>> pe.encode('Pelletier')
        'PELETIER'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        text = unicode_normalize('NFKD', text_type(word.upper()))
        # code points 198 and 338 are the AE and OE ligatures
        text = text.translate({198: 'AE', 338: 'OE'})
        text = ''.join(ch for ch in text if ch in self._uc_set)

        for rule_name in self._rule_order:
            pattern, replacement = self._rule_table[rule_name]
            if not isinstance(pattern, text_type):
                # anything that is not a plain string is used via .sub()
                text = pattern.sub(replacement, text)
                continue
            text = text.replace(pattern, replacement)

        return text
예제 #26
0
    def encode(self, word):
        """Compute the Statistics Canada name code for a word.

        The initial letter is kept; every member of ``self._uc_vy_set`` is
        removed from the rest, consecutive repeats are collapsed and the
        result is cut to ``self._max_length`` characters.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The Statistics Canada name code value

        Examples
        --------
        >>> pe = StatisticsCanada()
        >>> pe.encode('Christopher')
        'CHRS'
        >>> pe.encode('Niall')
        'NL'
        >>> pe.encode('Smith')
        'SMTH'
        >>> pe.encode('Schmidt')
        'SCHM'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # upper-case, decompose, expand eszett, keep only permitted letters
        letters = unicode_normalize('NFKD', text_type(word.upper()))
        letters = letters.replace('ß', 'SS')
        letters = ''.join(ch for ch in letters if ch in self._uc_set)
        if letters == '':
            return ''

        initial, remainder = letters[0], letters[1:]
        # the initial is exempt from vowel removal
        for vowel in self._uc_vy_set:
            remainder = remainder.replace(vowel, '')

        code = self._delete_consecutive_repeats(initial + remainder)
        return code.replace(' ', '')[:self._max_length]
예제 #27
0
    def encode(self, word):
        """Compute the Statistics Canada name code for a word.

        After cleaning, the first letter is preserved, all members of
        ``self._uc_vy_set`` are stripped from the remaining letters, runs of
        identical letters are merged, and at most ``self._max_length``
        characters are returned.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The Statistics Canada name code value

        Examples
        --------
        >>> pe = StatisticsCanada()
        >>> pe.encode('Christopher')
        'CHRS'
        >>> pe.encode('Niall')
        'NL'
        >>> pe.encode('Smith')
        'SMTH'
        >>> pe.encode('Schmidt')
        'SCHM'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # upper-case, decompose, expand eszett, keep only permitted letters
        cleaned = unicode_normalize('NFKD', text_type(word.upper()))
        cleaned = cleaned.replace('ß', 'SS')
        cleaned = ''.join(ch for ch in cleaned if ch in self._uc_set)
        if len(cleaned) == 0:
            return ''

        # vowel removal applies to everything except the first letter
        tail = cleaned[1:]
        for vowel in self._uc_vy_set:
            tail = tail.replace(vowel, '')

        merged = self._delete_consecutive_repeats(cleaned[0] + tail)
        merged = merged.replace(' ', '')
        return merged[: self._max_length]
예제 #28
0
    def fingerprint(self, word):
        """Build the skeleton key of a word.

        The key is the first letter, followed by the remaining distinct
        consonants in order of first appearance, followed by the remaining
        distinct vowels in order of first appearance. Later occurrences of
        the first letter are ignored.

        Parameters
        ----------
        word : str
            The word to transform into its skeleton key

        Returns
        -------
        str
            The skeleton key

        Examples
        --------
        >>> sk = SkeletonKey()
        >>> sk.fingerprint('The quick brown fox jumped over the lazy dog.')
        'THQCKBRWNFXJMPDVLZYGEUIOA'
        >>> sk.fingerprint('Christopher')
        'CHRSTPIOE'
        >>> sk.fingerprint('Niall')
        'NLIA'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        letters = ''.join(
            ch
            for ch in unicode_normalize('NFKD', word.upper())
            if ch in self._letters
        )
        first = letters[0:1]
        consonants = []
        vowels = []

        # sort the remaining letters into two ordered, duplicate-free groups
        for ch in letters[1:]:
            if ch == first:
                continue
            bucket = vowels if ch in self._vowels else consonants
            if ch not in bucket:
                bucket.append(ch)

        return first + ''.join(consonants) + ''.join(vowels)
예제 #29
0
    def encode(self, word):
        """Compute the FONEM code of a word.

        Cleaning (upper-casing, NFKD normalization, AE/OE ligature expansion
        and filtering through ``self._uc_set``) is followed by sequential
        application of every rule named in ``self._rule_order``.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The FONEM code

        Examples
        --------
        >>> pe = FONEM()
        >>> pe.encode('Marchand')
        'MARCHEN'
        >>> pe.encode('Beaulieu')
        'BOLIEU'
        >>> pe.encode('Beaumont')
        'BOMON'
        >>> pe.encode('Legrand')
        'LEGREN'
        >>> pe.encode('Pelletier')
        'PELETIER'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        ligatures = {198: 'AE', 338: 'OE'}  # AE and OE ligature code points
        cleaned = unicode_normalize('NFKD', text_type(word.upper()))
        cleaned = ''.join(
            ch for ch in cleaned.translate(ligatures) if ch in self._uc_set
        )

        for name in self._rule_order:
            matcher, substitute = self._rule_table[name]
            # plain strings are literal replacements; other matchers are
            # applied through their .sub() method
            literal = isinstance(matcher, text_type)
            cleaned = (
                cleaned.replace(matcher, substitute)
                if literal
                else matcher.sub(substitute, cleaned)
            )

        return cleaned
예제 #30
0
def russell_index(word):
    """Return the Russell Index (integer output) of a word.

    This follows Robert C. Russell's Index algorithm, as described in
    :cite:`Russell:1917`.

    :param str word: the word to transform
    :returns: the Russell Index value; a float NaN is returned when no
        codable letters remain
    :rtype: int

    >>> russell_index('Christopher')
    3813428
    >>> russell_index('Niall')
    715
    >>> russell_index('Smith')
    3614
    >>> russell_index('Schmidt')
    3614
    """
    codable = 'ABCDEFGIKLMNOPQRSTUVXYZ'
    digit_table = dict(zip(map(ord, codable), '12341231356712383412313'))
    allowed = set(codable)

    cleaned = unicode_normalize('NFKD', text_type(word.upper()))
    cleaned = cleaned.replace('ß', 'SS')
    cleaned = cleaned.replace('GH', '')  # rule 3: discard gh
    cleaned = cleaned.rstrip('SZ')  # rule 3: discard trailing s/z

    # keep only letters Russell's mapping knows, then translate them
    digits = ''.join(ch for ch in cleaned if ch in allowed)
    digits = digits.translate(digit_table)

    # only the first '1' survives; every later '1' is dropped
    cut = digits.find('1') + 1
    if cut:
        digits = digits[:cut] + digits[cut:].replace('1', '')

    # collapse runs of identical digits
    digits = _delete_consecutive_repeats(digits)

    if not digits:
        return float('NaN')
    return int(digits)
예제 #31
0
    def encode(self, word, max_length=-1):
        """Return the PhoneticSpanish coding of word.

        The word is upper-cased, NFKD-normalized and filtered through
        ``self._uc_set``; doubled L and doubled R are then each merged into a
        single letter before the remaining letters are mapped through
        ``self._trans``.

        Parameters
        ----------
        word : str
            The word to transform
        max_length : int
            The length of the code returned (defaults to unlimited). When
            positive, the code is right-padded with ``0`` and truncated to
            exactly this length.

        Returns
        -------
        str
            The PhoneticSpanish code

        Examples
        --------
        >>> pe = PhoneticSpanish()
        >>> pe.encode('Perez')
        '094'
        >>> pe.encode('Martinez')
        '69364'
        >>> pe.encode('Gutierrez')
        '8394'
        >>> pe.encode('Santiago')
        '4638'
        >>> pe.encode('Nicolás')
        '6454'

        """
        # uppercase, normalize, and decompose, filter to A-Z minus vowels & W
        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = ''.join(c for c in word if c in self._uc_set)

        # merge repeated Ls & Rs (the R merge was previously written as
        # replace('R', 'R'), a no-op that left doubled Rs in place)
        word = word.replace('LL', 'L')
        word = word.replace('RR', 'R')

        # apply the Soundex algorithm
        sdx = word.translate(self._trans)

        if max_length > 0:
            # pad first so that short codes reach max_length, then cut
            sdx = (sdx + ('0' * max_length))[:max_length]

        return sdx
예제 #32
0
    def encode(self, word, max_length=4, zero_pad=True):
        """Compute the Lein code for a word.

        Parameters
        ----------
        word : str
            The word to transform
        max_length : int
            The length of the code returned (defaults to 4)
        zero_pad : bool
            Pad the end of the return value with 0s to achieve a max_length
            string

        Returns
        -------
        str
            The Lein code

        Examples
        --------
        >>> pe = Lein()
        >>> pe.encode('Christopher')
        'C351'
        >>> pe.encode('Niall')
        'N300'
        >>> pe.encode('Smith')
        'S210'
        >>> pe.encode('Schmidt')
        'S521'

        """
        # upper-case, decompose, expand eszett, keep only permitted letters
        cleaned = unicode_normalize('NFKD', text_type(word.upper()))
        cleaned = cleaned.replace('ß', 'SS')
        cleaned = ''.join(ch for ch in cleaned if ch in self._uc_set)

        initial = cleaned[:1]  # Rule 1: keep the first letter
        remainder = cleaned[1:].translate(self._del_trans)  # Rule 2
        remainder = self._delete_consecutive_repeats(remainder)  # Rule 3
        code = initial + remainder.translate(self._trans)  # Rule 4

        # Rule 4 (continued): optional zero padding before the final cut
        padding = '0' * max_length if zero_pad else ''
        return (code + padding)[:max_length]
예제 #33
0
def statistics_canada(word, max_length=4):
    """Return the Statistics Canada code for a word.

    The original description of this algorithm could not be located, and
    may only have been specified in an unpublished TR. The coding does not
    appear to be in use by Statistics Canada any longer. In its place, this is
    an implementation of the "Census modified Statistics Canada name coding
    procedure".

    The modified version of this algorithm is described in Appendix B of
     :cite:`Moore:1977`.

    :param str word: the word to transform
    :param int max_length: the maximum length (default 4) of the code to return
    :returns: the Statistics Canada name code value
    :rtype: str

    >>> statistics_canada('Christopher')
    'CHRS'
    >>> statistics_canada('Niall')
    'NL'
    >>> statistics_canada('Smith')
    'SMTH'
    >>> statistics_canada('Schmidt')
    'SCHM'
    """
    allowed = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    # upper-case, decompose, expand eszett, then drop everything but A-Z
    cleaned = unicode_normalize('NFKD', text_type(word.upper()))
    cleaned = cleaned.replace('ß', 'SS')
    cleaned = ''.join(ch for ch in cleaned if ch in allowed)
    if cleaned == '':
        return ''

    # vowels (and Y) are removed everywhere except the first position
    initial, remainder = cleaned[0], cleaned[1:]
    for vowel in {'A', 'E', 'I', 'O', 'U', 'Y'}:
        remainder = remainder.replace(vowel, '')

    code = _delete_consecutive_repeats(initial + remainder)
    return code.replace(' ', '')[:max_length]
예제 #34
0
    def encode(self, word):
        """Compute the Russell Index (integer output) of a word.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        int
            The Russell Index value. When no codable letters remain, a float
            NaN is returned instead.

        Examples
        --------
        >>> pe = RussellIndex()
        >>> pe.encode('Christopher')
        3813428
        >>> pe.encode('Niall')
        715
        >>> pe.encode('Smith')
        3614
        >>> pe.encode('Schmidt')
        3614

        """
        cleaned = unicode_normalize('NFKD', text_type(word.upper()))
        cleaned = cleaned.replace('ß', 'SS')
        cleaned = cleaned.replace('GH', '')  # rule 3: discard gh
        cleaned = cleaned.rstrip('SZ')  # rule 3: discard trailing s/z

        # keep only letters covered by the mapping, then translate them
        digits = ''.join(ch for ch in cleaned if ch in self._uc_set)
        digits = digits.translate(self._trans)

        # only the first '1' survives; every later '1' is dropped
        cut = digits.find('1') + 1
        if cut:
            digits = digits[:cut] + digits[cut:].replace('1', '')

        # collapse runs of identical digits
        digits = self._delete_consecutive_repeats(digits)

        if not digits:
            return float('NaN')
        return int(digits)
예제 #35
0
파일: _lein.py 프로젝트: armonge/abydos
    def encode(self, word):
        """Compute the LEIN code for a word.

        Padding and length are controlled by the instance attributes
        ``_zero_pad`` and ``_max_length``.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The LEIN code

        Examples
        --------
        >>> pe = LEIN()
        >>> pe.encode('Christopher')
        'C351'
        >>> pe.encode('Niall')
        'N300'
        >>> pe.encode('Smith')
        'S210'
        >>> pe.encode('Schmidt')
        'S521'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # upper-case, decompose, expand eszett, keep only permitted letters
        cleaned = unicode_normalize('NFKD', text_type(word.upper()))
        cleaned = cleaned.replace('ß', 'SS')
        cleaned = ''.join(ch for ch in cleaned if ch in self._uc_set)

        initial = cleaned[:1]  # Rule 1: keep the first letter
        remainder = cleaned[1:].translate(self._del_trans)  # Rule 2
        remainder = self._delete_consecutive_repeats(remainder)  # Rule 3
        code = initial + remainder.translate(self._trans)  # Rule 4

        # Rule 4 (continued): optional zero padding before the final cut
        if self._zero_pad:
            code = code + '0' * self._max_length
        return code[:self._max_length]
예제 #36
0
    def fingerprint(self, word):
        """Build the omission key of a word.

        The key lists every consonant of the word once, in the fixed order
        defined by ``self._consonants``, followed by the word's remaining
        letters once each, in order of first appearance.

        Parameters
        ----------
        word : str
            The word to transform into its omission key

        Returns
        -------
        str
            The omission key

        Examples
        --------
        >>> ok = OmissionKey()
        >>> ok.fingerprint('The quick brown fox jumped over the lazy dog.')
        'JKQXZVWYBFMGPDHCLNTREUIOA'
        >>> ok.fingerprint('Christopher')
        'PHCTSRIOE'
        >>> ok.fingerprint('Niall')
        'LNIA'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        letters = unicode_normalize('NFKD', text_type(word.upper()))
        letters = ''.join(ch for ch in letters if ch in self._letters)

        # consonants first, in the order dictated by _consonants
        pieces = [cons for cons in self._consonants if cons in letters]

        # then the non-consonants, in word order, skipping duplicates
        for ch in letters:
            if ch in self._consonants or ch in pieces:
                continue
            pieces.append(ch)

        return ''.join(pieces)
예제 #37
0
def phonem(word):
    """Return the Phonem code for a word.

    Phonem is defined in :cite:`Wilde:1988`.

    This version is based on the Perl implementation documented at
    :cite:`Wilz:2005`.
    It includes some enhancements presented in the Java port at
    :cite:`dcm4che:2011`.

    Phonem is intended chiefly for German names/words.

    :param str word: the word to transform
    :returns: the Phonem value
    :rtype: str

    >>> phonem('Christopher')
    'CRYSDOVR'
    >>> phonem('Niall')
    'NYAL'
    >>> phonem('Smith')
    'SMYD'
    >>> phonem('Schmidt')
    'CMYD'
    """
    # multi-letter rewrites, applied in this exact order
    digraph_rules = (('SC', 'C'), ('SZ', 'C'), ('CZ', 'C'),
                     ('TZ', 'C'), ('TS', 'C'), ('KS', 'X'),
                     ('PF', 'V'), ('QU', 'KW'), ('PH', 'V'),
                     ('UE', 'Y'), ('AE', 'E'), ('OE', 'Ö'),
                     ('EI', 'AY'), ('EY', 'AY'), ('EU', 'OY'),
                     ('AU', 'A§'), ('OU', '§'))
    # single-character rewrites
    char_map = dict(
        zip(map(ord, 'ZKGQÇÑßFWPTÁÀÂÃÅÄÆÉÈÊËIJÌÍÎÏÜݧÚÙÛÔÒÓÕØ'),
            'CCCCCNSVVBDAAAAAEEEEEEYYYYYYYYUUUUOOOOÖ'))
    # only these symbols may appear in the final code
    kept = {
        'A', 'B', 'C', 'D', 'L', 'M', 'N', 'O', 'R', 'S', 'U', 'V', 'W',
        'X', 'Y', 'Ö'
    }

    text = unicode_normalize('NFC', text_type(word.upper()))
    for old, new in digraph_rules:
        text = text.replace(old, new)
    text = _delete_consecutive_repeats(text.translate(char_map))

    return ''.join(ch for ch in text if ch in kept)
예제 #38
0
    def fingerprint(self, word: str) -> str:
        """Build the omission key of a word.

        Consonants present in the word come first, ordered as in
        ``self._consonants``; the word's other letters follow in order of
        first appearance. No letter is repeated.

        Parameters
        ----------
        word : str
            The word to transform into its omission key

        Returns
        -------
        str
            The omission key

        Examples
        --------
        >>> ok = OmissionKey()
        >>> ok.fingerprint('The quick brown fox jumped over the lazy dog.')
        'JKQXZVWYBFMGPDHCLNTREUIOA'
        >>> ok.fingerprint('Christopher')
        'PHCTSRIOE'
        >>> ok.fingerprint('Niall')
        'LNIA'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        letters = ''.join(
            ch
            for ch in unicode_normalize('NFKD', word.upper())
            if ch in self._letters
        )

        # consonants first, in the order dictated by _consonants
        pieces = [cons for cons in self._consonants if cons in letters]

        # then the non-consonants, in word order, skipping duplicates
        for ch in letters:
            is_new_vowel = ch not in self._consonants and ch not in pieces
            if is_new_vowel:
                pieces.append(ch)

        return ''.join(pieces)
예제 #39
0
파일: _editex.py 프로젝트: chrislit/abydos
    def dist_abs(self, src, tar):
        """Return the Editex distance between two strings.

        Both strings are upper-cased and NFKD-normalized before comparison.
        The distance is computed by dynamic programming over a
        ``(len(src) + 1) x (len(tar) + 1)`` matrix, with every step cost
        scaled by ``self._taper``.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        int or float
            Editex distance. Integral results (including the distance 0
            between identical strings) are returned as ``int``; any other
            value is returned as computed.

        Examples
        --------
        >>> cmp = Editex()
        >>> cmp.dist_abs('cat', 'hat')
        2
        >>> cmp.dist_abs('Niall', 'Neil')
        2
        >>> cmp.dist_abs('aluminum', 'Catalan')
        12
        >>> cmp.dist_abs('ATCG', 'TAGC')
        6


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        match_cost, group_cost, mismatch_cost = self._cost

        def r_cost(ch1, ch2):
            """Return r(a,b) according to Zobel & Dart's definition.

            Parameters
            ----------
            ch1 : str
                The first character to compare
            ch2 : str
                The second character to compare

            Returns
            -------
            int
                The match cost for equal characters, the group cost for two
                letters sharing a letter group, the mismatch cost otherwise

            .. versionadded:: 0.1.0

            """
            if ch1 == ch2:
                return match_cost
            if ch1 in self._all_letters and ch2 in self._all_letters:
                for group in self._letter_groups:
                    if ch1 in group and ch2 in group:
                        return group_cost
            return mismatch_cost

        def d_cost(ch1, ch2):
            """Return d(a,b) according to Zobel & Dart's definition.

            Parameters
            ----------
            ch1 : str
                The first character to compare
            ch2 : str
                The second character to compare

            Returns
            -------
            int
                The group cost when ch1 is H or W and differs from ch2;
                otherwise the same value as r(a,b)

            .. versionadded:: 0.1.0

            """
            if ch1 != ch2 and (ch1 == 'H' or ch1 == 'W'):
                return group_cost
            return r_cost(ch1, ch2)

        # convert both src & tar to NFKD normalized unicode
        src = unicode_normalize('NFKD', text_type(src.upper()))
        tar = unicode_normalize('NFKD', text_type(tar.upper()))
        # convert ß to SS (for Python2)
        src = src.replace('ß', 'SS')
        tar = tar.replace('ß', 'SS')

        src_len = len(src)
        tar_len = len(tar)
        max_len = max(src_len, tar_len)

        if src == tar:
            # int, consistent with the int conversion applied to integral
            # results at the end of this method (was the float 0.0)
            return 0
        if not src:
            # everything in tar is charged at mismatch cost
            return sum(
                mismatch_cost * self._taper(pos, max_len)
                for pos in range(tar_len)
            )
        if not tar:
            # everything in src is charged at mismatch cost
            return sum(
                mismatch_cost * self._taper(pos, max_len)
                for pos in range(src_len)
            )

        d_mat = np_zeros((src_len + 1, tar_len + 1), dtype=np_float)
        # prefix a blank so that index i refers to the i-th character and
        # index i - 1 to its predecessor (a blank for the first character)
        src = ' ' + src
        tar = ' ' + tar

        # NOTE(review): in local mode only the first column is left at zero;
        # the first row is always initialized - confirm this is intended.
        if not self._local:
            for i in range(1, src_len + 1):
                d_mat[i, 0] = d_mat[i - 1, 0] + d_cost(
                    src[i - 1], src[i]
                ) * self._taper(i, max_len)
        for j in range(1, tar_len + 1):
            d_mat[0, j] = d_mat[0, j - 1] + d_cost(
                tar[j - 1], tar[j]
            ) * self._taper(j, max_len)

        for i in range(1, src_len + 1):
            for j in range(1, tar_len + 1):
                taper = self._taper(max(i, j), max_len)
                d_mat[i, j] = min(
                    # step in src only
                    d_mat[i - 1, j] + d_cost(src[i - 1], src[i]) * taper,
                    # step in tar only
                    d_mat[i, j - 1] + d_cost(tar[j - 1], tar[j]) * taper,
                    # step in both (match or replacement)
                    d_mat[i - 1, j - 1] + r_cost(src[i], tar[j]) * taper,
                )

        distance = d_mat[src_len, tar_len]
        # report integral distances as plain ints
        if int(distance) == distance:
            return int(distance)
        return distance
예제 #40
0
    def encode(self, word):
        """Compute the Roger Root code for a word.

        The longest matching prefix (up to four letters) is coded from the
        initial-pattern tables; the rest of the word is then consumed left to
        right, always taking the longest chunk (up to four letters) found in
        the medial-pattern tables.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The Roger Root code

        Examples
        --------
        >>> pe = RogerRoot()
        >>> pe.encode('Christopher')
        '06401'
        >>> pe.encode('Niall')
        '02500'
        >>> pe.encode('Smith')
        '00310'
        >>> pe.encode('Schmidt')
        '06310'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # upper-case, decompose, expand eszett, keep only permitted letters
        letters = unicode_normalize('NFKD', text_type(word.upper()))
        letters = letters.replace('ß', 'SS')
        letters = ''.join(ch for ch in letters if ch in self._uc_set)

        code = ''
        cursor = 0

        # leading digits: longest initial pattern wins
        for width in (4, 3, 2, 1):
            prefix = letters[:width]
            initial_table = self._init_patterns[width]
            if prefix in initial_table:
                code = initial_table[prefix]
                cursor += width
                break

        # remaining digits: longest medial pattern at each position wins
        total = len(letters)
        while cursor < total:
            for width in (4, 3, 2, 1):  # pragma: no branch
                chunk = letters[cursor:cursor + width]
                medial_table = self._med_patterns[width]
                if chunk in medial_table:
                    code += medial_table[chunk]
                    cursor += width
                    break

        # collapse runs first, then drop the '*' placeholders
        code = self._delete_consecutive_repeats(code).replace('*', '')

        if self._zero_pad:
            code += '0' * self._max_length

        return code[:self._max_length]
예제 #41
0
    def encode(self, word):
        """Return the IBM Alpha Search Inquiry System code for a word.

        A collection is necessary as the return type since there can be
        multiple values for a single word. But the collection must be ordered
        since the first value is the primary coding.

        The word is upper-cased, NFKD-normalized and filtered through
        ``self._uc_set``. An optional initial pattern is coded first; the
        remainder is coded pattern by pattern. Whenever a pattern maps to a
        tuple of codes, every partial result is extended once per tuple
        member, multiplying the number of candidate codes.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        tuple
            The Alpha-SIS value: one or more strings, each exactly
            ``self._max_length`` characters long

        Examples
        --------
        >>> pe = AlphaSIS()
        >>> pe.encode('Christopher')
        ('06401840000000', '07040184000000', '04018400000000')
        >>> pe.encode('Niall')
        ('02500000000000',)
        >>> pe.encode('Smith')
        ('03100000000000',)
        >>> pe.encode('Schmidt')
        ('06310000000000',)


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # alpha holds every candidate code built so far; it starts with a
        # single empty candidate
        alpha = ['']
        pos = 0
        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = word.replace('ß', 'SS')
        word = ''.join(c for c in word if c in self._uc_set)

        # Do special processing for initial substrings
        # (only the first matching key in _alpha_sis_initials_order is used)
        for k in self._alpha_sis_initials_order:
            if word.startswith(k):
                alpha[0] += self._alpha_sis_initials[k]
                pos += len(k)
                break

        # Add a '0' if alpha is still empty
        if not alpha[0]:
            alpha[0] += '0'

        # Whether or not any special initial codes were encoded, iterate
        # through the length of the word in the main encoding loop
        while pos < len(word):
            orig_pos = pos
            for k in self._alpha_sis_basic_order:
                if word[pos:].startswith(k):
                    if isinstance(self._alpha_sis_basic[k], tuple):
                        # several alternative codes: fan out, grouping the
                        # new candidates by alternative (all candidates with
                        # the first alternative, then all with the second...)
                        newalpha = []
                        for i in range(len(self._alpha_sis_basic[k])):
                            newalpha += [
                                _ + self._alpha_sis_basic[k][i] for _ in alpha
                            ]
                        alpha = newalpha
                    else:
                        alpha = [_ + self._alpha_sis_basic[k] for _ in alpha]
                    pos += len(k)
                    break
            if pos == orig_pos:
                # no pattern matched here: append a '_' placeholder and skip
                # one character so the loop always advances
                alpha = [_ + '_' for _ in alpha]
                pos += 1

        # Trim doublets and placeholders
        for i in range(len(alpha)):
            pos = 1
            while pos < len(alpha[i]):
                if alpha[i][pos] == alpha[i][pos - 1]:
                    alpha[i] = alpha[i][:pos] + alpha[i][pos + 1 :]
                # pos advances even after a deletion, so the character that
                # slid into position pos is not compared with its predecessor
                pos += 1
        # placeholders are removed only after doublet trimming, so they keep
        # equal codes on either side of them from being merged
        alpha = (_.replace('_', '') for _ in alpha)

        # Trim codes and return tuple
        # (right-pad with '0' and cut to exactly _max_length characters)
        alpha = (
            (_ + ('0' * self._max_length))[: self._max_length] for _ in alpha
        )
        return tuple(alpha)
예제 #42
0
    def encode(self, fname):
        """Calculate the PSHP Soundex/Viewex Coding of a first name.

        The name is upper-cased, NFKD-normalized and filtered through
        ``self._uc_set``. Two names have fixed codes; all others go through
        prefix rewriting, translation via ``self._trans``, removal of
        consecutive repeats and removal of ``0`` symbols. Unless
        ``self._max_length`` is -1, the code is zero-padded or truncated to
        exactly that length.

        Parameters
        ----------
        fname : str
            The first name to encode

        Returns
        -------
        str
            The PSHP Soundex/Viewex Coding

        Examples
        --------
        >>> pe = PSHPSoundexFirst()
        >>> pe.encode('Smith')
        'S530'
        >>> pe.encode('Waters')
        'W352'
        >>> pe.encode('James')
        'J700'
        >>> pe.encode('Schmidt')
        'S500'
        >>> pe.encode('Ashcroft')
        'A220'
        >>> pe.encode('John')
        'J500'
        >>> pe.encode('Colin')
        'K400'
        >>> pe.encode('Niall')
        'N400'
        >>> pe.encode('Sally')
        'S400'
        >>> pe.encode('Jane')
        'J500'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        fname = unicode_normalize('NFKD', text_type(fname.upper()))
        fname = fname.replace('ß', 'SS')
        fname = ''.join(c for c in fname if c in self._uc_set)

        # special rules
        # (these two names bypass the algorithm; padding below still applies)
        if fname == 'JAMES':
            code = 'J7'
        elif fname == 'PAT':
            code = 'P7'

        else:
            # A. Prefix treatment
            # Each rule swaps only the first letter; the rest is kept.
            if fname[:2] in {'GE', 'GI', 'GY'}:
                fname = 'J' + fname[1:]
            elif fname[:2] in {'CE', 'CI', 'CY'}:
                fname = 'S' + fname[1:]
            elif fname[:3] == 'CHR':
                fname = 'K' + fname[1:]
            elif fname[:1] == 'C' and fname[:2] != 'CH':
                fname = 'K' + fname[1:]

            # second, independent group: may apply on top of the first one
            if fname[:2] == 'KN':
                fname = 'N' + fname[1:]
            elif fname[:2] == 'PH':
                fname = 'F' + fname[1:]
            elif fname[:3] in {'WIE', 'WEI'}:
                fname = 'V' + fname[1:]

            # German mode: remap the initial letters W, M, Y, Z
            if self._german and fname[:1] in {'W', 'M', 'Y', 'Z'}:
                fname = {'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}[
                    fname[0]
                ] + fname[1:]

            # the (possibly rewritten) initial letter starts the code
            code = fname[:1]

            # B. Soundex coding
            # code for Y unspecified, but presumably is 0
            fname = fname.translate(self._trans)
            fname = self._delete_consecutive_repeats(fname)

            # drop the translated initial; the literal one is already in code
            code += fname[1:]
            # syl_ptr: index of the first '0' in code (-1 if none).
            # syl2_ptr: offset of the next '0', measured from the position
            # right after the first one (-1 if none).
            syl_ptr = code.find('0')
            syl2_ptr = code[syl_ptr + 1 :].find('0')
            # NOTE(review): this compares an absolute index with a relative
            # offset; when it holds, code is cut to one character past the
            # first '0' - confirm against the algorithm's description.
            if syl_ptr != -1 and syl2_ptr != -1 and syl2_ptr - syl_ptr > -1:
                code = code[: syl_ptr + 2]

            code = code.replace('0', '')  # rule 1

        # -1 means "no length limit": skip both padding and truncation
        if self._max_length != -1:
            if len(code) < self._max_length:
                code += '0' * (self._max_length - len(code))
            else:
                code = code[: self._max_length]

        return code
예제 #43
0
파일: _phonix.py 프로젝트: chrislit/abydos
    def encode(self, word):
        """Return the Phonix code for a word.

        The word is upper-cased, NFKD-normalized and reduced to the
        characters in ``self._uc_set``. Every rule in
        ``self._substitutions`` is then applied in order; each rule is a
        tuple whose first item (0-3) selects where the replacement may
        happen (start, end, middle, anywhere) and whose remaining items are
        passed to the matching helper. Finally the first letter is kept (or
        replaced by ``'v'`` if it is in ``self._uc_vy_set``), the rest is
        translated through ``self._trans``, consecutive repeats are removed
        and zeros are dropped.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The Phonix value

        Examples
        --------
        >>> pe = Phonix()
        >>> pe.encode('Christopher')
        'K683'
        >>> pe.encode('Niall')
        'N400'
        >>> pe.encode('Smith')
        'S530'
        >>> pe.encode('Schmidt')
        'S530'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """

        def _start_repl(word, src, tar, post=None):
            """Replace src with tar at the start of word.

            Parameters
            ----------
            word : str
                The word to modify
            src : str
                Substring to match
            tar : str
                Substring to substitute
            post : set
                Following characters

            Returns
            -------
            str
                Modified string

            .. versionadded:: 0.1.0

            """
            if post:
                for i in post:
                    if word.startswith(src + i):
                        return tar + word[len(src) :]
            elif word.startswith(src):
                return tar + word[len(src) :]
            return word

        def _end_repl(word, src, tar, pre=None):
            """Replace src with tar at the end of word.

            Parameters
            ----------
            word : str
                The word to modify
            src : str
                Substring to match
            tar : str
                Substring to substitute
            pre : set
                Preceding characters

            Returns
            -------
            str
                Modified string

            .. versionadded:: 0.1.0

            """
            if pre:
                for i in pre:
                    if word.endswith(i + src):
                        return word[: -len(src)] + tar
            elif word.endswith(src):
                return word[: -len(src)] + tar
            return word

        def _mid_repl(word, src, tar, pre=None, post=None):
            """Replace src with tar in the middle of word.

            Parameters
            ----------
            word : str
                The word to modify
            src : str
                Substring to match
            tar : str
                Substring to substitute
            pre : set
                Preceding characters
            post : set
                Following characters

            Returns
            -------
            str
                Modified string

            .. versionadded:: 0.1.0

            """
            # A word shorter than two characters has no interior. Without
            # this guard the unconstrained branch below would rebuild a
            # one-letter word as word[0] + '' + word[-1], i.e. double it.
            if len(word) < 2:
                return word
            if pre or post:
                if not pre:
                    # Only a following context: protect the first letter.
                    return word[0] + _all_repl(word[1:], src, tar, pre, post)
                elif not post:
                    # Only a preceding context: protect the last letter.
                    return _all_repl(word[:-1], src, tar, pre, post) + word[-1]
                return _all_repl(word, src, tar, pre, post)
            return (
                word[0] + _all_repl(word[1:-1], src, tar, pre, post) + word[-1]
            )

        def _all_repl(word, src, tar, pre=None, post=None):
            """Replace src with tar anywhere in word.

            Parameters
            ----------
            word : str
                The word to modify
            src : str
                Substring to match
            tar : str
                Substring to substitute
            pre : set
                Preceding characters
            post : set
                Following characters

            Returns
            -------
            str
                Modified string

            .. versionadded:: 0.1.0

            """
            if pre or post:
                # A missing context is treated as "any", expressed as the
                # empty string so the product below still yields pairs.
                post = post or frozenset(('',))
                pre = pre or frozenset(('',))

                for i, j in ((i, j) for i in pre for j in post):
                    word = word.replace(i + src + j, i + tar + j)
                return word
            return word.replace(src, tar)

        # Indexed by the first item of each substitution rule.
        repl_at = (_start_repl, _end_repl, _mid_repl, _all_repl)

        sdx = ''

        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = word.replace('ß', 'SS')
        word = ''.join(c for c in word if c in self._uc_set)
        if word:
            for trans in self._substitutions:
                word = repl_at[trans[0]](word, *trans[1:])
            if word[0] in self._uc_vy_set:
                sdx = 'v' + word[1:].translate(self._trans)
            else:
                sdx = word[0] + word[1:].translate(self._trans)
            sdx = self._delete_consecutive_repeats(sdx)
            sdx = sdx.replace('0', '')

        if self._zero_pad:
            sdx += '0' * self._max_length
        if not sdx:
            sdx = '0'
        return sdx[: self._max_length]
예제 #44
0
파일: _phonex.py 프로젝트: chrislit/abydos
    def encode(self, word):
        """Return the Phonex code for a word.

        The word is upper-cased and NFKD-normalized, trailing ``S``
        characters are removed, a few onsets are simplified, the first
        letter is mapped to a class representative, and the remaining
        letters are coded with a Soundex-like scheme in which the code of
        ``D``/``T``, ``L``, ``M``/``N`` and ``R`` depends on the following
        letter.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The Phonex value

        Examples
        --------
        >>> pe = Phonex()
        >>> pe.encode('Christopher')
        'C623'
        >>> pe.encode('Niall')
        'N400'
        >>> pe.encode('Schmidt')
        'S253'
        >>> pe.encode('Smith')
        'S530'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        name = unicode_normalize('NFKD', text_type(word.upper()))
        name = name.replace('ß', 'SS')

        # Trailing S characters are dropped before any other rule applies.
        name = name.rstrip('S')

        # A two-letter onset collapses to the letter that is sounded.
        onset = {'KN': 'N', 'PH': 'F', 'WR': 'R'}.get(name[:2])
        if onset is not None:
            name = onset + name[2:]

        # A leading H is ignored; later Hs code to 0 anyway.
        if name[:1] == 'H':
            name = name[1:]

        letters = list(name)

        # Map the first letter onto the representative of its sound class.
        if letters:
            initial = letters[0]
            if initial in self._uc_vy_set:
                letters[0] = 'A'
            else:
                letters[0] = {
                    'B': 'B',
                    'P': 'B',
                    'V': 'F',
                    'F': 'F',
                    'C': 'C',
                    'K': 'C',
                    'Q': 'C',
                    'G': 'G',
                    'J': 'G',
                    'S': 'S',
                    'Z': 'S',
                }.get(initial, initial)

        # Letters whose code never depends on context.
        fixed_codes = {
            'B': '1',
            'F': '1',
            'P': '1',
            'V': '1',
            'C': '2',
            'G': '2',
            'J': '2',
            'K': '2',
            'Q': '2',
            'S': '2',
            'X': '2',
            'Z': '2',
        }

        total = len(letters)
        coded = letters[:1]
        for idx in range(1, total):
            current = letters[idx]
            follower = letters[idx + 1] if idx + 1 < total else ''
            is_last = idx + 1 == total

            digit = '0'
            if current in fixed_codes:
                digit = fixed_codes[current]
            elif current in {'D', 'T'}:
                if follower != 'C':
                    digit = '3'
            elif current == 'L':
                if follower in self._uc_vy_set or is_last:
                    digit = '4'
            elif current in {'M', 'N'}:
                if follower in {'D', 'G'}:
                    # "Delete" the following D/G by overwriting it with the
                    # current letter; the duplicate is then skipped below.
                    letters[idx + 1] = current
                digit = '5'
            elif current == 'R':
                if follower in self._uc_vy_set or is_last:
                    digit = '6'

            # Skip silent letters and immediate repeats of the last output.
            if digit != '0' and digit != coded[-1]:
                coded.append(digit)

        name_code = ''.join(coded)
        if self._zero_pad:
            name_code += '0' * self._max_length
        return (name_code or '0')[: self._max_length]
예제 #45
0
    def encode(self, word):
        """Return the SoundexBR encoding of a word.

        The word is upper-cased, NFKD-normalized and reduced to the
        characters in ``self._uc_set``. The leading letter is adjusted
        according to the letter that follows it, the remainder is translated
        through ``self._trans``, consecutive repeats are collapsed and zeros
        are removed.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The SoundexBR code

        Examples
        --------
        >>> pe = SoundexBR()
        >>> pe.encode('Oliveira')
        'O416'
        >>> pe.encode('Almeida')
        'A453'
        >>> pe.encode('Barbosa')
        'B612'
        >>> pe.encode('Araújo')
        'A620'
        >>> pe.encode('Gonçalves')
        'G524'
        >>> pe.encode('Goncalves')
        'G524'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = ''.join([ch for ch in word if ch in self._uc_set])

        lead, follower = word[:1], word[1:2]

        if lead == 'W' and follower == 'A':
            first = 'V'
        elif lead == 'K' and follower in {'A', 'O', 'U'}:
            first = 'C'
        elif lead == 'C' and follower in {'I', 'E'}:
            first = 'S'
        elif lead == 'G' and follower in {'E', 'I'}:
            first = 'J'
        elif lead == 'Y':
            first = 'I'
        elif lead == 'H':
            # Drop the H: the next letter becomes the leading letter and
            # the word is shifted so that the tail starts after it.
            first = follower
            word = word[1:]
        else:
            first = lead

        tail = word[1:].translate(self._trans)
        sdx = self._delete_consecutive_repeats(first + tail).replace('0', '')

        if self._zero_pad:
            sdx += '0' * self._max_length

        return sdx[: self._max_length]
예제 #46
0
    def encode(self, lname):
        """Calculate the PSHP Soundex/Viewex Coding of a last name.

        The name is upper-cased, NFKD-normalized and reduced to the
        characters in ``self._uc_set``; prefix, postfix and infix rewrites
        are then applied in that order (some only when ``self._german`` is
        set, some only when it is not) before the result is Soundex-coded
        through ``self._trans``.

        Parameters
        ----------
        lname : str
            The last name to encode

        Returns
        -------
        str
            The PSHP Soundex/Viewex Coding

        Examples
        --------
        >>> pe = PSHPSoundexLast()
        >>> pe.encode('Smith')
        'S530'
        >>> pe.encode('Waters')
        'W350'
        >>> pe.encode('James')
        'J500'
        >>> pe.encode('Schmidt')
        'S530'
        >>> pe.encode('Ashcroft')
        'A225'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        german = self._german

        name = unicode_normalize('NFKD', text_type(lname.upper()))
        name = name.replace('ß', 'SS')
        name = ''.join([ch for ch in name if ch in self._uc_set])

        # --- A. Prefix treatment -------------------------------------
        if name[:3] in ('VON', 'VAN'):
            name = name[3:].strip()

        # The published rule reads "MC, MAC become 1". It is implemented
        # here as "become M, except in German data", on the belief that the
        # superscripted 1 marked a footnote; both source articles share the
        # apparent error.
        if not german:
            for prefix in ('MAC', 'MC'):
                if name.startswith(prefix):
                    name = 'M' + name[len(prefix) :]
                    break

        # (Stripping apostrophes for non-German data is already covered by
        # the character filter above.)

        # First group of leading-letter rewrites; the first match wins.
        head1, head2, head3 = name[:1], name[:2], name[:3]
        if head1 in {'E', 'I', 'O', 'U'}:
            replacement = 'A'
        elif head2 in {'GE', 'GI', 'GY'}:
            replacement = 'J'
        elif head2 in {'CE', 'CI', 'CY'}:
            replacement = 'S'
        elif head3 == 'CHR':
            replacement = 'K'
        elif head1 == 'C' and head2 != 'CH':
            replacement = 'K'
        else:
            replacement = None
        if replacement is not None:
            name = replacement + name[1:]

        # Second group, evaluated on the possibly rewritten name.
        if name.startswith('KN'):
            name = 'N' + name[1:]
        elif name.startswith('PH'):
            name = 'F' + name[1:]
        elif name[:3] in {'WIE', 'WEI'}:
            name = 'V' + name[1:]

        if german:
            swapped = {'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}.get(name[:1])
            if swapped is not None:
                name = swapped + name[1:]

        # The leading letter of the code is fixed at this point.
        code = name[:1]

        # --- B. Postfix treatment ------------------------------------
        if german:  # moved from end of postfix treatment due to blocking
            # Three independent steps; within each, the first suffix wins.
            for primary, fallback in (
                ('TES', 'TS'),
                ('TZE', 'ZE'),
                ('Z', 'TE'),
            ):
                if name.endswith(primary):
                    name = name[: -len(primary)]
                elif name.endswith(fallback):
                    name = name[: -len(fallback)]

        if name.endswith('R'):
            name = name[:-1] + 'N'
        elif name.endswith(('SE', 'CE')):
            name = name[:-2]

        if name.endswith('SS'):
            name = name[:-2]
        elif name.endswith('S'):
            name = name[:-1]

        if not german:
            # Five-letter endings take precedence over four-letter ones.
            for ending, rewritten in (
                ('STOWN', 'SAWON'),
                ('MPSON', 'MASON'),
                ('NSEN', 'ASEN'),
                ('MSON', 'ASON'),
                ('STEN', 'SAEN'),
                ('STON', 'SAON'),
            ):
                if name.endswith(ending):
                    name = name[: -len(ending)] + rewritten
                    break

        if name.endswith(('NG', 'ND')):
            name = name[:-1]
        if not german and name[-3:] in {'GAN', 'GEN'}:
            name = name[:-3] + 'A' + name[-2:]

        # --- C. Infix treatment --------------------------------------
        for cluster, simplified in (
            ('CK', 'C'),
            ('SCH', 'S'),
            ('DT', 'T'),
            ('ND', 'N'),
            ('NG', 'N'),
            ('LM', 'M'),
            ('MN', 'M'),
            ('WIE', 'VIE'),
            ('WEI', 'VEI'),
        ):
            name = name.replace(cluster, simplified)

        # --- D. Soundexing -------------------------------------------
        # code for X & Y are unspecified, but presumably are 2 & 0
        digits = self._delete_consecutive_repeats(name.translate(self._trans))
        code = (code + digits[1:]).replace('0', '')  # rule 1

        limit = self._max_length
        if limit != -1:
            # Truncate to the limit, then right-pad with zeros up to it.
            code = code[:limit].ljust(limit, '0')

        return code
예제 #47
0
    def encode(self, word):
        """Return the Daitch-Mokotoff Soundex code for a word.

        The word is upper-cased, NFKD-normalized and reduced to the
        characters in ``self._uc_set``. It is then consumed left to right:
        at each position the first substring listed in ``self._dms_order``
        that matches is looked up in ``self._dms_table`` and the variant for
        its position (word-initial, before a vowel, elsewhere) is appended.
        A variant that is a tuple forks every code built so far, which is
        why a set of codes is returned.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The Daitch-Mokotoff Soundex value

        Examples
        --------
        >>> pe = DaitchMokotoff()
        >>> sorted(pe.encode('Christopher'))
        ['494379', '594379']
        >>> pe.encode('Niall')
        {'680000'}
        >>> pe.encode('Smith')
        {'463000'}
        >>> pe.encode('Schmidt')
        {'463000'}

        >>> sorted(DaitchMokotoff(max_length=20,
        ... zero_pad=False).encode('The quick brown fox'))
        ['35457976754', '3557976754']


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        limit = self._max_length

        # uppercase, normalize, decompose, and filter non-A-Z
        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = word.replace('ß', 'SS')
        word = ''.join([ch for ch in word if ch in self._uc_set])

        # Nothing left to encode: return the base case.
        if not word:
            return {'0' * limit} if self._zero_pad else {'0'}

        branches = ['']  # every code variant built so far
        length = len(word)
        cursor = 0
        while cursor < length:
            # _dms_order lists, per starting letter, the candidate
            # substrings that have an entry in the coding table.
            for chunk in self._dms_order[word[cursor]]:  # pragma: no branch
                if not word.startswith(chunk, cursor):
                    continue

                after = cursor + len(chunk)

                # Pick the positional variant of the table triple:
                # 0 = word-initial, 1 = before a vowel, 2 = elsewhere.
                if cursor == 0:
                    slot = 0
                elif after < length and word[after] in self._uc_v_set:
                    slot = 1
                else:
                    slot = 2
                value = self._dms_table[chunk][slot]

                if isinstance(value, tuple):
                    # Two alternatives: fork every existing branch.
                    branches = [
                        stem + text_type(alternative)
                        for alternative in (value[0], value[1])
                        for stem in branches
                    ]
                else:
                    branches = [stem + text_type(value) for stem in branches]

                cursor = after
                break

        # Collapse doubled codes first, then drop the _ placeholders.
        cleaned = [
            self._delete_consecutive_repeats(code).replace('_', '')
            for code in branches
        ]

        # Trim (and optionally zero-pad) the codes and return them as a set.
        if self._zero_pad:
            return {(code + '0' * limit)[:limit] for code in cleaned}
        return {code[:limit] for code in cleaned}
예제 #48
0
    def encode(self, word):
        """Return the SfinxBis code for a word.

        The input is upper-cased and NFC-normalized, nobility particles
        listed in ``self._adelstitler`` are removed, and the remainder is
        split on whitespace. Each token is encoded independently, so the
        result holds one code per token.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        tuple
            The SfinxBis value

        Examples
        --------
        >>> pe = SfinxBis()
        >>> pe.encode('Christopher')
        ('K68376',)
        >>> pe.encode('Niall')
        ('N4',)
        >>> pe.encode('Smith')
        ('S53',)
        >>> pe.encode('Schmidt')
        ('S53',)

        >>> pe.encode('Johansson')
        ('J585',)
        >>> pe.encode('Sjöberg')
        ('#162',)


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        soft_vowels = self._mjuka_vokaler
        hard_vowels = self._harde_vokaler

        def _swedishize(token):
            """Return the Swedish-ized form of a token.

            Parameters
            ----------
            token : str
                Token to transform

            Returns
            -------
            str
                Transformed token

            .. versionadded:: 0.1.0

            """
            for foreign, swedish in (
                ('STIERN', 'STJÄRN'),
                ('HIE', 'HJ'),
                ('SIÖ', 'SJÖ'),
                ('SCH', 'SH'),
                ('QU', 'KV'),
                ('IO', 'JO'),
                ('PH', 'F'),
            ):
                token = token.replace(foreign, swedish)

            # After any vowel (hard ones first, then soft ones) a following
            # Ü, Y or I becomes J.
            for vowel_group in (hard_vowels, soft_vowels):
                for vowel in vowel_group:
                    for glide in ('Ü', 'Y', 'I'):
                        token = token.replace(vowel + glide, vowel + 'J')

            # H is dropped in front of a consonant.
            if 'H' in token:
                for consonant in self._uc_c_set:
                    token = token.replace('H' + consonant, consonant)

            token = token.translate(self._substitutions)

            for letter, expansion in (('Ð', 'ETH'), ('Þ', 'TH'), ('ß', 'SS')):
                token = token.replace(letter, expansion)

            return token

        def _code_onset(token):
            """Return the token with its first sound coded.

            Parameters
            ----------
            token : str
                Token to transform

            Returns
            -------
            str
                Transformed token

            .. versionadded:: 0.1.0

            """
            one, two, three = token[0:1], token[0:2], token[0:3]
            second, third = token[1:2], token[2:3]

            # The rules are tried strictly in this order; first match wins.
            if one in soft_vowels or one in hard_vowels:
                return '$' + token[1:]
            if two in ('DJ', 'GJ', 'HJ', 'LJ'):
                return 'J' + token[2:]
            if one == 'G' and second in soft_vowels:
                return 'J' + token[1:]
            if one == 'Q':
                return 'K' + token[1:]
            if two == 'CH' and third in frozenset(soft_vowels | hard_vowels):
                return '#' + token[2:]
            if one == 'C' and (
                second in hard_vowels or second in self._uc_c_set
            ):
                return 'K' + token[1:]
            if one == 'X':
                return 'S' + token[1:]
            if one == 'C' and second in soft_vowels:
                return 'S' + token[1:]
            if three in ('SKJ', 'STJ', 'SCH'):
                return '#' + token[3:]
            if two in ('SH', 'KJ', 'TJ', 'SJ'):
                return '#' + token[2:]
            if two == 'SK' and third in soft_vowels:
                return '#' + token[2:]
            if one == 'K' and second in soft_vowels:
                return '#' + token[1:]
            return token

        # Step 1: upper case
        word = unicode_normalize('NFC', text_type(word.upper()))
        word = word.replace('ß', 'SS').replace('-', ' ')

        # Step 2: remove nobility particles. Each particle is listed with
        # surrounding spaces; at the very start of the word it is matched
        # without its leading space.
        for particle in self._adelstitler:
            while particle in word:
                word = word.replace(particle, ' ')
            bare = particle[1:]
            if word.startswith(bare):
                word = word[len(particle) - 1 :]

        # Step 3: split into tokens and collapse doubled characters
        tokens = [
            self._delete_consecutive_repeats(token) for token in word.split()
        ]
        if not tokens:
            # noinspection PyRedundantParentheses
            return ('',)

        limit = self._max_length
        codes = []
        for token in tokens:
            # Step 4: Swedish-ize
            token = _swedishize(token)

            # Step 5: drop every character that is not A-Ö
            token = ''.join([ch for ch in token if ch in self._uc_set])

            # Step 6: code the first sound
            token = _code_onset(token)

            # Step 7: split into the onset and the rest
            onset, rest = token[0:1], token[1:]

            # Step 8: phonetic transformation of the rest
            rest = rest.replace('DT', 'T').replace('X', 'KS')

            # Step 9: code the rest as digits (C before a soft vowel is 8)
            for vowel in soft_vowels:
                rest = rest.replace('C' + vowel, '8' + vowel)
            rest = rest.translate(self._trans)

            # Steps 10 and 11: collapse adjacent duplicates, then drop 9s
            rest = self._delete_consecutive_repeats(rest).replace('9', '')

            # Step 12: put the parts back together
            code = onset + rest

            # truncate, if max_length is set
            if limit > 0:
                code = code[:limit]
            codes.append(code)

        return tuple(codes)
예제 #49
0
파일: _spfc.py 프로젝트: chrislit/abydos
    def encode(self, word):
        """Return the Standardized Phonetic Frequency Code (SPFC) of a word.

        The input must supply a first name and a last name, either as one
        string separated by a period or a space, or as a two-item sequence.
        The code is built from the first letter of the last name (table
        PF1), its ending (table PF3), the first letter of the first name
        (table PF2) and up to two further letters of the last name (table
        PF2, with ``'0'`` standing in for missing letters).

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The SPFC value

        Raises
        ------
        AttributeError
            Word attribute must be a string with a space or period dividing the
            first and last names or a tuple/list consisting of the first and
            last names

        Examples
        --------
        >>> pe = SPFC()
        >>> pe.encode('Christopher Smith')
        '01160'
        >>> pe.encode('Christopher Schmidt')
        '01160'
        >>> pe.encode('Niall Smith')
        '01660'
        >>> pe.encode('Niall Schmidt')
        '01660'

        >>> pe.encode('L.Smith')
        '01960'
        >>> pe.encode('R.Miller')
        '65490'

        >>> pe.encode(('L', 'Smith'))
        '01960'
        >>> pe.encode(('R', 'Miller'))
        '65490'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        if not word:
            return ''

        bad_word_message = (
            'Word attribute must be a string with a space or period '
            'dividing the first and last names or a tuple/list '
            'consisting of the first and last names'
        )

        # Obtain the (first name, last name) pair.
        if isinstance(word, (str, text_type)):
            # A period takes precedence over a space as the divider.
            names = word.split('.', 1)
            if len(names) != 2:
                names = word.split(' ', 1)
            if len(names) != 2:
                raise AttributeError(bad_word_message)
        elif hasattr(word, '__iter__') and len(word) == 2:
            names = word
        else:
            raise AttributeError(bad_word_message)

        dropped_letters = {'A', 'E', 'H', 'I', 'O', 'U', 'W', 'Y'}

        def _condense(name):
            """Perform the first three steps of SPFC.

            Parameters
            ----------
            name : str
                Name to transform

            Returns
            -------
            str
                Transformed name

            .. versionadded:: 0.1.0

            """
            # filter out non A-Z
            name = ''.join([ch for ch in name if ch in self._uc_set])

            # 1. Apply the letter-pair substitutions (DK to K, DT to T,
            # SC to S, KN to N, and MN to N).
            for rule in self._substitutions:
                name = name.replace(rule[0], rule[1])

            # 2. Replace runs of the same letter with a single letter.
            name = self._delete_consecutive_repeats(name)

            # 3. Remove vowels, W, H, and Y everywhere except in the first
            # position.
            return name[:1] + ''.join(
                [ch for ch in name[1:] if ch not in dropped_letters]
            )

        normalized = [
            unicode_normalize(
                'NFKD', text_type(part.strip().replace('ß', 'SS').upper())
            )
            for part in names
        ]
        first, last = [_condense(part) for part in normalized]

        digits = []

        # 4. First digit: PF1 on the first letter of the last name, which
        # is then consumed.
        if last:
            digits.append(last[0].translate(self._pf1))
            last = last[1:]

        # 5. Second digit: PF3 on the ending of the last name, using the
        # longest listed ending that matches; the ending is consumed.
        if last:
            if last.endswith(('DRS', 'STN', 'PRS', 'STR')):
                digits.append('7')
                last = last[:-3]
            elif last.endswith(('MN', 'TR', 'SN', 'SR', 'TN', 'TD')):
                digits.append('7')
                last = last[:-2]
            else:
                digits.append(last[-1].translate(self._pf3))
                last = last[:-1]

        # 6. Third digit: PF2 on the first letter of the first name.
        if first:
            digits.append(first[0].translate(self._pf2))

        # 7. and 8. Fourth and fifth digits: PF2 on the next two remaining
        # letters of the last name, or zero where none remain.
        for position in (0, 1):
            if position < len(last):
                digits.append(last[position].translate(self._pf2))
            else:
                digits.append('0')

        return ''.join(digits)
예제 #50
0
파일: _haase.py 프로젝트: chrislit/abydos
    def encode(self, word):
        """Return the Haase Phonetik (numeric output) code for a word.

        While the output code is numeric, it is nevertheless a str.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        tuple
            The Haase Phonetik value as a numeric string

        Examples
        --------
        >>> pe = Haase()
        >>> pe.encode('Joachim')
        ('9496',)
        >>> pe.encode('Christoph')
        ('4798293', '8798293')
        >>> pe.encode('Jörg')
        ('974',)
        >>> pe.encode('Smith')
        ('8692',)
        >>> pe.encode('Schmidt')
        ('8692', '4692')


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """

        def _after(word, pos, letters):
            """Return True if word[pos] follows one of the supplied letters.

            Parameters
            ----------
            word : str
                Word to modify
            pos : int
                Position to examine
            letters : set
                Letters to check for

            Returns
            -------
            bool
                True if word[pos] follows one of letters

            .. versionadded:: 0.3.0

            """
            if pos > 0 and word[pos - 1] in letters:
                return True
            return False

        def _before(word, pos, letters):
            """Return True if word[pos] precedes one of the supplied letters.

            Parameters
            ----------
            word : str
                Word to modify
            pos : int
                Position to examine
            letters : set
                Letters to check for

            Returns
            -------
            bool
                True if word[pos] precedes one of letters

            .. versionadded:: 0.3.0

            """
            if pos + 1 < len(word) and word[pos + 1] in letters:
                return True
            return False

        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = word.replace('ß', 'SS')

        word = word.replace('Ä', 'AE')
        word = word.replace('Ö', 'OE')
        word = word.replace('Ü', 'UE')
        word = ''.join(c for c in word if c in self._uc_set)

        variants = []
        if self._primary_only:
            variants = [word]
        else:
            pos = 0
            if word[:2] == 'CH':
                variants.append(('CH', 'SCH'))
                pos += 2
            len_3_vars = {
                'OWN': 'AUN',
                'WSK': 'RSK',
                'SCH': 'CH',
                'GLI': 'LI',
                'AUX': 'O',
                'EUX': 'O',
            }
            while pos < len(word):
                if word[pos : pos + 4] == 'ILLE':
                    variants.append(('ILLE', 'I'))
                    pos += 4
                elif word[pos : pos + 3] in len_3_vars:
                    variants.append(
                        (word[pos : pos + 3], len_3_vars[word[pos : pos + 3]])
                    )
                    pos += 3
                elif word[pos : pos + 2] == 'RB':
                    variants.append(('RB', 'RW'))
                    pos += 2
                elif len(word[pos:]) == 3 and word[pos:] == 'EAU':
                    variants.append(('EAU', 'O'))
                    pos += 3
                elif len(word[pos:]) == 1 and word[pos:] in {'A', 'O'}:
                    if word[pos:] == 'O':
                        variants.append(('O', 'OW'))
                    else:
                        variants.append(('A', 'AR'))
                    pos += 1
                else:
                    variants.append((word[pos],))
                    pos += 1

            variants = [''.join(letters) for letters in product(*variants)]

        def _haase_code(word):
            sdx = ''
            for i in range(len(word)):
                if word[i] in self._uc_v_set:
                    sdx += '9'
                elif word[i] == 'B':
                    sdx += '1'
                elif word[i] == 'P':
                    if _before(word, i, {'H'}):
                        sdx += '3'
                    else:
                        sdx += '1'
                elif word[i] in {'D', 'T'}:
                    if _before(word, i, {'C', 'S', 'Z'}):
                        sdx += '8'
                    else:
                        sdx += '2'
                elif word[i] in {'F', 'V', 'W'}:
                    sdx += '3'
                elif word[i] in {'G', 'K', 'Q'}:
                    sdx += '4'
                elif word[i] == 'C':
                    if _after(word, i, {'S', 'Z'}):
                        sdx += '8'
                    elif i == 0:
                        if _before(
                            word,
                            i,
                            {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'},
                        ):
                            sdx += '4'
                        else:
                            sdx += '8'
                    elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
                        sdx += '4'
                    else:
                        sdx += '8'
                elif word[i] == 'X':
                    if _after(word, i, {'C', 'K', 'Q'}):
                        sdx += '8'
                    else:
                        sdx += '48'
                elif word[i] == 'L':
                    sdx += '5'
                elif word[i] in {'M', 'N'}:
                    sdx += '6'
                elif word[i] == 'R':
                    sdx += '7'
                elif word[i] in {'S', 'Z'}:
                    sdx += '8'

            sdx = self._delete_consecutive_repeats(sdx)

            return sdx

        encoded = tuple(_haase_code(word) for word in variants)
        if len(encoded) > 1:
            encoded_set = set()
            encoded_single = []
            for code in encoded:
                if code not in encoded_set:
                    encoded_set.add(code)
                    encoded_single.append(code)
            return tuple(encoded_single)

        return encoded
예제 #51
0
파일: _soundex.py 프로젝트: chrislit/abydos
    def encode(self, word):
        """Compute the Soundex code of a word.

        The word is uppercased and NFKD-normalized, reduced to the characters
        found in the instance's uppercase letter set, translated through the
        instance's translation table, collapsed, and finally zero-padded
        and/or truncated according to the instance settings.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str or tuple
            The Soundex value. When the variant is ``'Census'`` and the word
            is long enough and starts with VAN, CON, DE, DI, LA, or LE, a
            2-tuple is returned instead: the code of the whole word followed
            by the code of the word without that prefix.

        Examples
        --------
        >>> pe = Soundex()
        >>> pe.encode("Christopher")
        'C623'
        >>> pe.encode("Niall")
        'N400'
        >>> pe.encode('Smith')
        'S530'
        >>> pe.encode('Schmidt')
        'S530'

        >>> Soundex(max_length=-1).encode('Christopher')
        'C623160000000000000000000000000000000000000000000000000000000000'
        >>> Soundex(max_length=-1, zero_pad=False).encode('Christopher')
        'C62316'

        >>> Soundex(reverse=True).encode('Christopher')
        'R132'

        >>> pe.encode('Ashcroft')
        'A261'
        >>> pe.encode('Asicroft')
        'A226'

        >>> pe_special = Soundex(var='special')
        >>> pe_special.encode('Ashcroft')
        'A226'
        >>> pe_special.encode('Asicroft')
        'A226'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # Uppercase and decompose first; non-letters are dropped further down
        word = unicode_normalize('NFKD', text_type(word.upper())).replace(
            'ß', 'SS'
        )

        if self._var == 'Census':
            # A prefixed name is coded twice: once whole, once without the
            # prefix. Both codes are computed in 'American' mode.
            for prefixes, size in (
                (('VAN', 'CON'), 3),
                (('DE', 'DI', 'LA', 'LE'), 2),
            ):
                if len(word) > size + 1 and word[:size] in prefixes:
                    return tuple(
                        soundex(
                            form,
                            self._max_length,
                            'American',
                            self._reverse,
                            self._zero_pad,
                        )
                        for form in (word, word[size:])
                    )
            # No prefix matched: continue with the regular algorithm

        word = ''.join([char for char in word if char in self._uc_set])

        # Base case: nothing left to encode
        if not word:
            return '0' * self._max_length if self._zero_pad else '0'

        # Reverse Soundex works on the mirrored word
        if self._reverse:
            word = word[::-1]
        initial = word[0]

        digits = word.translate(self._trans)
        # '9' placeholders: kept as '0' under the special rule for the
        # 1880-1910 census, otherwise removed outright (rule 1)
        digits = digits.replace('9', '0' if self._var == 'special' else '')
        digits = self._delete_consecutive_repeats(digits)  # rule 3

        # The code starts with the literal first letter. Unless that letter
        # is H or W, the first digit stands for it and is replaced.
        if initial not in 'HW':
            digits = digits[1:]
        code = (initial + digits).replace('0', '')  # rule 1

        if self._zero_pad:
            code += '0' * self._max_length  # rule 4

        return code[: self._max_length]
예제 #52
0
    def encode(self, word):
        """Compute the Spanish Metaphone key of a word.

        The word is uppercased and NFC-normalized, a fixed list of textual
        substitutions is applied, and the result is scanned left to right,
        appending key characters until the word is exhausted or the key has
        reached the instance's maximum length.

        Parameters
        ----------
        word : str
            The word to transform


        Returns
        -------
        str
            The Spanish Metaphone code

        Examples
        --------
        >>> pe = SpanishMetaphone()
        >>> pe.encode('Perez')
        'PRZ'
        >>> pe.encode('Martinez')
        'MRTNZ'
        >>> pe.encode('Gutierrez')
        'GTRRZ'
        >>> pe.encode('Santiago')
        'SNTG'
        >>> pe.encode('Nicolás')
        'NKLS'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class


        """
        vowels = {'A', 'E', 'I', 'O', 'U'}
        # Consonants copied to the key unchanged
        unmutated = {'D', 'F', 'J', 'K', 'M', 'N', 'P', 'T', 'V', 'L', 'Y'}
        soft_triggers = {'E', 'I'}

        word = unicode_normalize('NFC', text_type(word.upper()))

        # Extra substitutions used only by the modified version
        if self._modified:
            for old, new in (('MB', 'NB'), ('MP', 'NP'), ('BS', 'S')):
                word = word.replace(old, new)
            if word[:2] == 'PS':
                word = word[1:]

        # Simple substitutions; they are applied strictly in this order
        for old, new in (
            ('Á', 'A'),
            ('CH', 'X'),
            ('Ç', 'S'),
            ('É', 'E'),
            ('Í', 'I'),
            ('Ó', 'O'),
            ('Ú', 'U'),
            ('Ñ', 'NY'),
            ('GÜ', 'W'),
            ('Ü', 'U'),
            ('B', 'V'),
            ('LL', 'Y'),
        ):
            word = word.replace(old, new)

        meta_key = ''
        pos = 0
        word_len = len(word)

        while pos < word_len and len(meta_key) < self._max_length:
            current_char = word[pos]
            # The following character, or '' at the end of the word
            following = word[pos + 1 : pos + 2]
            vowel_follows = following in vowels
            # How many characters this iteration consumes
            step = 1

            if pos == 0 and current_char in vowels:
                # Only a word-initial vowel is kept in the key
                meta_key += current_char
            elif current_char in unmutated:
                meta_key += current_char
                # A doubled consonant counts once
                if following == current_char:
                    step = 2
            elif current_char == 'C':
                if following == 'C':
                    # 'acción', 'reacción', etc.
                    meta_key += 'X'
                    step = 2
                elif following in soft_triggers:
                    # 'cesar', 'cien', 'cid', 'conciencia'
                    meta_key += 'Z'
                    step = 2
                else:
                    meta_key += 'K'
            elif current_char == 'G':
                if following in soft_triggers:
                    # 'gente', 'ecologia', etc.
                    meta_key += 'J'
                    step = 2
                else:
                    meta_key += 'G'
            elif current_char == 'H':
                # 'H' is silent in Spanish: before a vowel, the vowel itself
                # goes into the key in its place
                if vowel_follows:
                    meta_key += following
                    step = 2
                else:
                    meta_key += 'H'
            elif current_char == 'Q':
                meta_key += 'K'
                if following == 'U':
                    step = 2
            elif current_char == 'W':
                meta_key += 'U'
            elif current_char in {'R', 'Z'}:
                meta_key += current_char
            elif current_char == 'S':
                # Word-initial S not followed by a vowel gets a leading E
                if pos == 0 and not vowel_follows:
                    meta_key += 'ES'
                else:
                    meta_key += 'S'
            elif current_char == 'X':
                if pos == 0 and word_len > 1 and not vowel_follows:
                    meta_key += 'EX'
                else:
                    meta_key += 'X'
            # Anything else (including non-initial vowels) adds nothing

            pos += step

        # The modified version ends by turning every S into Z
        if self._modified:
            meta_key = meta_key.replace('S', 'Z')

        return meta_key
예제 #53
0
    def run(self, text, options, path):
        """
        Query the dictionary search form for the input, visit each result
        article whose URL segment resembles the input, and download the first
        MP3 in those articles whose pronunciation guide normalizes to the
        input. Raises IOError if the input is rejected or no recording is
        found.
        """

        assert options['voice'] == 'de', "Only German is supported."

        if len(text) > INPUT_MAXIMUM:
            raise IOError("Your input text is too long for Duden.")

        # Only plain ASCII input is accepted; the encode call is made purely
        # for its exception
        try:
            text.encode('us-ascii')
        except UnicodeEncodeError:
            raise IOError("Your input text uses characters that cannot be "
                          "accurately searched for in the Duden.")

        text_search = text.replace('sz', '\u00df')
        self._logger.debug('Duden: Searching on "%s"', text_search)
        try:
            search_response = self.net_stream(
                (SEARCH_FORM, dict(s=text_search)),
                require=dict(mime='text/html'),
            )
            search_html = search_response.decode()
        except IOError as io_error:
            # Anything other than a 404 propagates untouched
            if getattr(io_error, 'code', None) != 404:
                raise
            raise IOError("Duden does not recognize this input.")

        # Variants of the input used to match article URLs and MP3 guides
        text_compressed = text.replace(' ', '').replace('-', '')
        text_lower_compressed = text_compressed.lower()
        text_lower_underscored_trailing = (
            text.lower().replace(' ', '_').replace('-', '_') + '_'
        )
        text_deumlauted_compressed = (
            text_compressed
            .replace('ae', 'a')
            .replace('oe', 'o')
            .replace('ue', 'u')
        )
        accepted_guides = (text_compressed, text_deumlauted_compressed)

        self._logger.debug('Got a search response; will follow links whose '
                           'lowercased+compressed article segment equals "%s" '
                           'or whose lowercased-but-still-underscored article '
                           'segment begins with "%s"; looking for MP3s whose '
                           'compressed guide says "%s" or "%s"',
                           text_lower_compressed,
                           text_lower_underscored_trailing,
                           text_compressed,
                           text_deumlauted_compressed)

        visited_urls = set()

        for article_match in RE_DETAIL.finditer(search_html):
            article_url = article_match.group(1)

            # Each article is examined at most once
            if article_url in visited_urls:
                continue
            visited_urls.add(article_url)

            segment_lower = article_match.group(2).lower()
            segment_lower_compressed = segment_lower.replace('_', '')

            if segment_lower_compressed == text_lower_compressed:
                self._logger.debug('Duden: lowered+compressed article segment '
                                   'for %s are same ("%s")',
                                   article_url, segment_lower_compressed)

            elif segment_lower.startswith(text_lower_underscored_trailing):
                self._logger.debug('Duden: lowered segment "%s" for %s begins '
                                   'with "%s"',
                                   segment_lower, article_url,
                                   text_lower_underscored_trailing)

            else:
                self._logger.debug('Duden: article segment for %s does not '
                                   'match; skipping', article_url)
                continue

            article_html = self.net_stream(article_url).decode()

            for mp3_match in RE_MP3.finditer(article_html):
                # Flatten the guide markup to its unescaped text content
                text_nodes = BeautifulSoup(
                    mp3_match.group(3), 'html.parser',
                ).findAll(text=True)
                guide = ''.join(HTML_PARSER.unescape(node)
                                for node in text_nodes)

                # Strip separators, then reduce to bare ASCII
                guide_stripped = (
                    self.modify(guide).replace('-', '').replace(' ', '')
                )
                guide_normalized = (
                    unicode_normalize('NFKD', guide_stripped)
                    .encode('ASCII', 'ignore')
                    .decode()
                )

                mp3_url = mp3_match.group(5)

                if guide_normalized not in accepted_guides:
                    self._logger.debug('Duden: found non-matching MP3 at %s '
                                       'for "%s", which normalized to "%s" '
                                       'and does not match our input',
                                       mp3_url, guide, guide_normalized)
                    continue

                self._logger.debug('Duden: found MATCHING MP3 at %s for '
                                   '"%s", which normalized to "%s" and '
                                   'matches our input',
                                   mp3_url, guide, guide_normalized)

                self.net_download(path, mp3_url,
                                  require=dict(mime='audio/mpeg'))
                return

        raise IOError("Duden does not have recorded audio for this word.")
예제 #54
0
    def encode(self, word):
        """Compute the Fuzzy Soundex code of a word.

        The word is uppercased and NFKD-normalized, rewritten by a
        word-initial rule, a word-final rule, and an ordered list of
        substring substitutions, then translated through the instance's
        translation table, collapsed, and padded and/or truncated according
        to the instance settings.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The Fuzzy Soundex value

        Examples
        --------
        >>> pe = FuzzySoundex()
        >>> pe.encode('Christopher')
        'K6931'
        >>> pe.encode('Niall')
        'N4000'
        >>> pe.encode('Smith')
        'S5300'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        word = unicode_normalize('NFKD', text_type(word.upper())).replace(
            'ß', 'SS'
        )

        # Base case: nothing to encode
        if not word:
            return '0' * self._max_length if self._zero_pad else '0'

        # Word-initial digraphs and their replacements
        initial_rules = {
            'CS': 'SS',
            'CZ': 'SS',
            'TS': 'SS',
            'TZ': 'SS',
            'GN': 'NN',
            'HR': 'RR',
            'WR': 'RR',
            'HW': 'WW',
            'KN': 'NN',
            'NG': 'NN',
        }
        head = word[:2]
        if head in initial_rules:
            word = initial_rules[head] + word[2:]

        # Word-final digraphs; the RDT trigraph is tried only when no
        # digraph rule applied
        final_rules = {'CH': 'KK', 'NT': 'TT', 'RT': 'RR'}
        tail = word[-2:]
        if tail in final_rules:
            word = word[:-2] + final_rules[tail]
        elif word[-3:] == 'RDT':
            word = word[:-3] + 'RR'

        # Substring substitutions; each runs on the output of the previous
        # one, so the order is significant
        for old, new in (
            ('CA', 'KA'),
            ('CC', 'KK'),
            ('CK', 'KK'),
            ('CE', 'SE'),
            ('CHL', 'KL'),
            ('CL', 'KL'),
            ('CHR', 'KR'),
            ('CR', 'KR'),
            ('CI', 'SI'),
            ('CO', 'KO'),
            ('CU', 'KU'),
            ('CY', 'SY'),
            ('DG', 'GG'),
            ('GH', 'HH'),
            ('MAC', 'MK'),
            ('MC', 'MK'),
            ('NST', 'NSS'),
            ('PF', 'FF'),
            ('PH', 'FF'),
            ('SCH', 'SSS'),
            ('TIO', 'SIO'),
            ('TIA', 'SIO'),
            ('TCH', 'CHH'),
        ):
            word = word.replace(old, new)

        digits = word.translate(self._trans).replace('-', '')

        # remove repeating characters
        digits = self._delete_consecutive_repeats(digits)

        # The code starts with the literal first letter. Unless that letter
        # is H, W, or Y, the first digit stands for it and is replaced.
        first = word[0]
        if first not in {'H', 'W', 'Y'}:
            digits = digits[1:]
        code = (first + digits).replace('0', '')

        if self._zero_pad:
            code += '0' * self._max_length

        return code[: self._max_length]
예제 #55
0
    def encode(self, word):
        """Calculate the early version of the Henry code for a word.

        The word is uppercased, NFKD-normalized, and reduced to the
        characters in ``self._uc_set``. Rule Ib then rewrites the start of
        words whose first character is in ``self._uc_vy_set``, Rule II builds
        an intermediate code one character at a time, Rule IIe trims certain
        endings from that code, and finally every vowel/Y after the first
        character of the code is removed.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The early Henry code; an empty string if no encodable characters
            remain after filtering. The code is truncated to
            ``self._max_length`` characters unless that value is -1.

        Examples
        --------
        >>> pe = HenryEarly()
        >>> pe.encode('Marchand')
        'MRC'
        >>> pe.encode('Beaulieu')
        'BL'
        >>> pe.encode('Beaumont')
        'BM'
        >>> pe.encode('Legrand')
        'LGR'
        >>> pe.encode('Pelletier')
        'PLT'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # Uppercase, decompose, and keep only characters in self._uc_set
        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = ''.join(c for c in word if c in self._uc_set)

        # Nothing left to encode
        if not word:
            return ''

        # Rule Ia seems to be covered entirely in II

        # Rule Ib
        # Applies only to words whose first character is in _uc_vy_set
        # (presumably the vowels plus Y; the set is defined outside this
        # method). Slices such as word[1:2] yield '' past the end of the
        # word, so short words never raise IndexError here.
        if word[0] in self._uc_vy_set:
            # Ib1
            if (
                word[1:2] in self._uc_c_set - {'M', 'N'}
                and word[2:3] in self._uc_c_set
            ) or (
                word[1:2] in self._uc_c_set and word[2:3] not in self._uc_c_set
            ):
                if word[0] == 'Y':
                    word = 'I' + word[1:]
            # Ib2
            elif word[1:2] in {'M', 'N'} and word[2:3] in self._uc_c_set:
                if word[0] == 'E':
                    word = 'A' + word[1:]
                elif word[0] in {'I', 'U', 'Y'}:
                    word = 'E' + word[1:]
            # Ib3
            # The leading two-character sequence is replaced by its mapping
            # in self._diph
            elif word[:2] in self._diph:
                word = self._diph[word[:2]] + word[2:]
            # Ib4
            elif word[1:2] in self._uc_vy_set and word[0] == 'Y':
                word = 'I' + word[1:]

        code = ''
        # Number of upcoming characters to pass over without encoding
        skip = 0

        # Rule II
        # The branches below are tested in order; the first match wins.
        for pos, char in enumerate(word):
            # Next and previous characters; '' at the word boundaries
            nxch = word[pos + 1 : pos + 2]
            prev = word[pos - 1 : pos]

            if skip:
                skip -= 1
            elif char in self._uc_vy_set:
                # Kept for now; non-initial ones are removed after the loop
                code += char
            # IIc
            # Doubled character: emit it once and skip its twin
            elif char == nxch:
                skip = 1
                code += char
            # The first character of these pairs contributes nothing
            elif word[pos : pos + 2] in {'CQ', 'DT', 'SC'}:
                continue
            # IIb
            elif char in self._simple:
                code += self._simple[char]
            elif char in {'C', 'G', 'P', 'Q', 'S'}:
                if char == 'C':
                    if nxch in {'A', 'O', 'U', 'L', 'R'}:
                        code += 'K'
                    elif nxch in {'E', 'I', 'Y'}:
                        code += 'S'
                    elif nxch == 'H':
                        if word[pos + 2 : pos + 3] in self._uc_vy_set:
                            code += 'C'
                        else:  # CHR, CHL, etc.
                            code += 'K'
                    else:
                        code += 'C'
                elif char == 'G':
                    # There is no fallback branch: G before any other
                    # character (or at the end of the word) adds nothing
                    if nxch in {'A', 'O', 'U', 'L', 'R'}:
                        code += 'G'
                    elif nxch in {'E', 'I', 'Y'}:
                        code += 'J'
                    elif nxch == 'N':
                        code += 'N'
                elif char == 'P':
                    if nxch != 'H':
                        code += 'P'
                    else:
                        code += 'F'
                elif char == 'Q':
                    if word[pos + 1 : pos + 3] in {'UE', 'UI', 'UY'}:
                        code += 'G'
                    else:  # QUA, QUO, etc.
                        code += 'K'
                else:  # S...
                    # SAINTE/SAINT/STE/ST collapse to a single 'X'; skip is
                    # set so the rest of the matched sequence is ignored.
                    # Longer sequences are tested first.
                    if word[pos : pos + 6] == 'SAINTE':
                        code += 'X'
                        skip = 5
                    elif word[pos : pos + 5] == 'SAINT':
                        code += 'X'
                        skip = 4
                    elif word[pos : pos + 3] == 'STE':
                        code += 'X'
                        skip = 2
                    elif word[pos : pos + 2] == 'ST':
                        code += 'X'
                        skip = 1
                    elif nxch in self._uc_c_set:
                        continue
                    else:
                        code += 'S'
            # IId
            elif char == 'H' and prev in self._uc_c_set:
                continue
            # A character from _uc_c_set other than L/R is dropped when the
            # next character is also one of those
            elif char in self._uc_c_set - {
                'L',
                'R',
            } and nxch in self._uc_c_set - {'L', 'R'}:
                continue
            elif char == 'L' and nxch in {'M', 'N'}:
                continue
            elif (
                char in {'M', 'N'}
                and prev in self._uc_vy_set
                and nxch in self._uc_c_set
            ):
                continue
            # IIa
            else:
                code += char

        # IIe1
        # Negative slices yield '' on short codes, so these tests are safe
        # even when the code is empty or very short
        if code[-4:] in {'AULT', 'EULT', 'OULT'}:
            code = code[:-2]
        # The following are blocked by rules above
        # elif code[-4:-3] in _vows and code[-3:] == 'MPS':
        #    code = code[:-3]
        # elif code[-3:-2] in _vows and code[-2:] in {'MB', 'MP', 'ND',
        #                                             'NS', 'NT'}:
        #    code = code[:-2]
        elif code[-2:-1] == 'R' and code[-1:] in self._uc_c_set:
            code = code[:-1]
        # IIe2
        elif code[-2:-1] in self._uc_vy_set and code[-1:] in {
            'D',
            'M',
            'N',
            'S',
            'T',
        }:
            code = code[:-1]
        elif code[-2:] == 'ER':
            code = code[:-1]

        # Drop non-initial vowels
        # The table keys are the code points of A, E, I, O, U, and Y
        code = code[:1] + code[1:].translate(
            {65: '', 69: '', 73: '', 79: '', 85: '', 89: ''}
        )

        # A maximum length of -1 disables truncation
        if self._max_length != -1:
            code = code[: self._max_length]

        return code
예제 #56
0
파일: _koelner.py 프로젝트: chrislit/abydos
    def encode(self, word):
        """Compute the Kölner Phonetik (numeric output) code of a word.

        The result is made of digits but is returned as a str, because a
        code can begin with 0.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The Kölner Phonetik value as a numeric string; an empty string
            when the word contains nothing encodable

        Example
        -------
        >>> pe = Koelner()
        >>> pe.encode('Christopher')
        '478237'
        >>> pe.encode('Niall')
        '65'
        >>> pe.encode('Smith')
        '862'
        >>> pe.encode('Schmidt')
        '862'
        >>> pe.encode('Müller')
        '657'
        >>> pe.encode('Zimmermann')
        '86766'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # Uppercase, decompose, expand German special characters, and keep
        # only the characters in the instance's uppercase letter set
        word = unicode_normalize('NFKD', text_type(word.upper()))
        for old, new in (('ß', 'SS'), ('Ä', 'AE'), ('Ö', 'OE'), ('Ü', 'UE')):
            word = word.replace(old, new)
        word = ''.join([char for char in word if char in self._uc_set])

        # Nothing to convert, return base case
        if not word:
            return ''

        # Letters after which C is coded 4: a wider set applies to a
        # word-initial C than to a C elsewhere in the word
        hard_c_initial = {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}
        hard_c_inner = {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}

        digits = []
        for idx, letter in enumerate(word):
            # Neighbouring letters; '' at the word boundaries
            prev_letter = word[idx - 1 : idx]
            next_letter = word[idx + 1 : idx + 2]

            if letter in self._uc_v_set:
                digit = '0'
            elif letter == 'B':
                digit = '1'
            elif letter == 'P':
                digit = '3' if next_letter == 'H' else '1'
            elif letter in {'D', 'T'}:
                digit = '8' if next_letter in {'C', 'S', 'Z'} else '2'
            elif letter in {'F', 'V', 'W'}:
                digit = '3'
            elif letter in {'G', 'K', 'Q'}:
                digit = '4'
            elif letter == 'C':
                if prev_letter in {'S', 'Z'}:
                    digit = '8'
                else:
                    hard_c = hard_c_initial if idx == 0 else hard_c_inner
                    digit = '4' if next_letter in hard_c else '8'
            elif letter == 'X':
                digit = '8' if prev_letter in {'C', 'K', 'Q'} else '48'
            elif letter == 'L':
                digit = '5'
            elif letter in {'M', 'N'}:
                digit = '6'
            elif letter == 'R':
                digit = '7'
            elif letter in {'S', 'Z'}:
                digit = '8'
            else:
                # Any other letter contributes nothing to the code
                continue
            digits.append(digit)

        sdx = self._delete_consecutive_repeats(''.join(digits))

        # Zeros are dropped everywhere except in the leading position
        if sdx:
            sdx = sdx[:1] + sdx[1:].replace('0', '')

        return sdx