Example #1
import numpy as np
import pandas as pd
import py_stringmatching as sm

# Helper module inside py_entitymatching (module path assumed).
import py_entitymatching.utils.generic_helper as gh


def tok_alphabetic(input_string):
    """
    This function returns a list of tokens that are maximal sequences of
    consecutive alphabetical characters.

    Args:
        input_string (string): Input string that should be tokenized.

    Returns:
        A list of tokens if the input string is not NaN,
        else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_alphabetic('data99science, data#integration.')
        ['data', 'science', 'data', 'integration']
        >>> em.tok_alphabetic('99')
        []
        >>> em.tok_alphabetic(None)
        nan
    """
    if pd.isnull(input_string):
        # pd.np was removed in pandas 2.0; use numpy's nan directly.
        return np.nan

    measure = sm.AlphabeticTokenizer()

    # Normalize bytes / non-string input to unicode text before tokenizing.
    input_string = gh.convert_to_str_unicode(input_string)

    return measure.tokenize(input_string)
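
The wrapper above delegates to py_stringmatching's AlphabeticTokenizer; calling the tokenizer directly reproduces the documented behavior (a minimal sketch, assuming py_stringmatching is installed):

import py_stringmatching as sm

tok = sm.AlphabeticTokenizer()
print(tok.tokenize('data99science, data#integration.'))
# ['data', 'science', 'data', 'integration']
print(tok.tokenize('99'))
# [] -- no alphabetic characters, so no tokens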
Example #2
import numpy as np
import pandas as pd
import py_stringmatching as sm
import six


def tok_alphabetic(input_string):
    """
    This function returns a list of tokens that are maximal sequences of
    consecutive alphabetical characters.

    Args:
        input_string (string): Input string that should be tokenized.

    Returns:
        A list of tokens if the input string is not NaN,
        else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_alphabetic('data99science, data#integration.')
        ['data', 'science', 'data', 'integration']
        >>> em.tok_alphabetic('99')
        []
        >>> em.tok_alphabetic(None)
        nan
    """
    if pd.isnull(input_string):
        # pd.np was removed in pandas 2.0; use numpy's nan directly.
        return np.nan

    measure = sm.AlphabeticTokenizer()

    # Inline the unicode conversion: coerce non-strings to str and decode
    # raw bytes as UTF-8 before tokenizing.
    if not isinstance(input_string, (six.string_types, bytes)):
        input_string = str(input_string)
    elif isinstance(input_string, bytes):
        input_string = input_string.decode('utf-8')

    return measure.tokenize(input_string)
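
Example #2 differs from Example #1 only in inlining the conversion that gh.convert_to_str_unicode performs. A Python 3-only equivalent of that step, without the six dependency (to_text is a hypothetical name), would be:

def to_text(value):
    # Decode raw bytes as UTF-8; coerce everything else to str.
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return str(value)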
Example #3
    '9grams': sm.QgramTokenizer(qval=9),
    '1gram_set': sm.QgramTokenizer(qval=1, return_set=True),
    '1grams_set': sm.QgramTokenizer(qval=1, return_set=True),
    '2grams_set': sm.QgramTokenizer(qval=2, return_set=True),
    '3grams_set': sm.QgramTokenizer(qval=3, return_set=True),
    '4grams_set': sm.QgramTokenizer(qval=4, return_set=True),
    '5grams_set': sm.QgramTokenizer(qval=5, return_set=True),
    '6grams_set': sm.QgramTokenizer(qval=6, return_set=True),
    '7grams_set': sm.QgramTokenizer(qval=7, return_set=True),
    '8grams_set': sm.QgramTokenizer(qval=8, return_set=True),
    '9grams_set': sm.QgramTokenizer(qval=9, return_set=True),

    # Word tokenizers
    'alphanumeric': sm.AlphanumericTokenizer(),
    'alphanum': sm.AlphanumericTokenizer(),
    'alphabetic': sm.AlphabeticTokenizer(),
    'whitespace': sm.WhitespaceTokenizer(),
    'alphanumeric_set': sm.AlphanumericTokenizer(return_set=True),
    'alphanum_set': sm.AlphanumericTokenizer(return_set=True),
    'alphabetic_set': sm.AlphabeticTokenizer(return_set=True),
    'whitespace_set': sm.WhitespaceTokenizer(return_set=True),
}
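
The snippet opens mid-dict, so the table's own name is cut off above; assuming it is bound to something like tokenizer_lookup (a hypothetical name that mirrors cleaner_lookup below), a lookup-and-tokenize call would be:

tok = tokenizer_lookup['whitespace']  # hypothetical dict name, see note above
print(tok.tokenize('data integration'))
# ['data', 'integration']
print(tokenizer_lookup['3grams_set'].tokenize('data'))
# six padded 3-grams: '##d', '#da', 'dat', 'ata', 'ta$', 'a$$' (order may vary)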

cleaner_lookup = {
    'lower_and_strip': lower_and_strip,
    'alphanumeric': clean_to_alphanum,
    'alphanum': clean_to_alphanum,
}
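
lower_and_strip and clean_to_alphanum are defined elsewhere in the module; judging from their names alone, plausible implementations would look like this sketch (assumed behavior, not the actual definitions):

import re

def lower_and_strip(s):
    # Assumed behavior: case-fold and trim surrounding whitespace.
    return s.lower().strip()

def clean_to_alphanum(s):
    # Assumed behavior: collapse non-alphanumeric runs to single spaces.
    return re.sub(r'[^0-9A-Za-z]+', ' ', s).strip()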


def get_similarity_measure(measure, **kwargs):