import numpy as np
import pandas as pd
import py_stringmatching as sm
# `gh` is py_entitymatching's generic_helper utility module (import path assumed).
from py_entitymatching.utils import generic_helper as gh


def tok_alphanumeric(input_string):
    """
    This function returns a list of tokens that are maximal sequences of
    consecutive alphanumeric characters.

    Args:
        input_string (string): Input string that should be tokenized.

    Returns:
        A list of tokens if the input string is not NaN;
        otherwise returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_alphanumeric('data9,(science), data9#.(integration).88')
        ['data9', 'science', 'data9', 'integration', '88']
        >>> em.tok_alphanumeric('#.$')
        []
        >>> em.tok_alphanumeric(None)
        nan

    """
    if pd.isnull(input_string):
        return np.nan  # pd.np was removed in modern pandas; use numpy's NaN directly

    input_string = gh.convert_to_str_unicode(input_string)

    measure = sm.AlphanumericTokenizer()
    return measure.tokenize(input_string)
Example #2
import numpy as np
import pandas as pd
import six
import py_stringmatching as sm


def tok_alphanumeric(input_string):
    """
    This function returns a list of tokens that are maximal sequences of
    consecutive alphanumeric characters.

    Args:
        input_string (string): Input string that should be tokenized.

    Returns:
        A list of tokens if the input string is not NaN;
        otherwise returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_alphanumeric('data9,(science), data9#.(integration).88')
        ['data9', 'science', 'data9', 'integration', '88']
        >>> em.tok_alphanumeric('#.$')
        []
        >>> em.tok_alphanumeric(None)
        nan

    """
    if pd.isnull(input_string):
        return np.nan  # pd.np was removed in modern pandas; use numpy's NaN directly

    # Normalize the input to a (unicode) string before tokenizing.
    if isinstance(input_string, bytes):
        input_string = input_string.decode('utf-8')
    elif not isinstance(input_string, six.string_types):
        input_string = str(input_string)
    measure = sm.AlphanumericTokenizer()
    return measure.tokenize(input_string)
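Example #3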
def get_tokenized_data(data):
    # Tokenize every configured field of every record into alphanumeric tokens.
    an_tokenizer = string_matching.AlphanumericTokenizer()
    tokenized_data = []
    for entry in data:
        new_entry = {}
        for field in util.field_names:
            new_entry[field] = an_tokenizer.tokenize(str(entry[field]))

        tokenized_data.append(new_entry)

    return tokenized_data
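# Hedged usage sketch (not part of the source): `string_matching` above is
# assumed to be py_stringmatching and `util.field_names` a project-specific
# list of field names; the record below is made up for illustration.
import py_stringmatching as string_matching

tok = string_matching.AlphanumericTokenizer()
sample = {'title': 'Data Integration 101', 'year': 2019}
print(tok.tokenize(str(sample['title'])))  # ['Data', 'Integration', '101']
print(tok.tokenize(str(sample['year'])))   # ['2019']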
Example #4
    def __init__(self):
        # String similarity measures from py_stringmatching.
        self.similarity_function = [
            sm.BagDistance(),
            sm.Cosine(),
            sm.Dice(),
            sm.Editex(),
            sm.GeneralizedJaccard(),
            sm.Jaccard(),
            sm.Jaro(),
            sm.JaroWinkler(),
            sm.Levenshtein(),
            sm.OverlapCoefficient(),
            sm.TverskyIndex()
        ]

        self.alphanumeric_tokenizer = sm.AlphanumericTokenizer(return_set=True)
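# Hedged usage sketch (not part of the source): set-based measures such as
# Jaccard are scored on the alphanumeric token sets produced above, while
# sequence-based measures such as Levenshtein are scored on the raw strings;
# the example strings are made up.
import py_stringmatching as sm

tok = sm.AlphanumericTokenizer(return_set=True)
s1, s2 = 'data science 101', 'data integration 101'
print(sm.Jaccard().get_sim_score(tok.tokenize(s1), tok.tokenize(s2)))
print(sm.Levenshtein().get_sim_score(s1, s2))
Example #5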
    def __init__(self):
        self.genJac = py_stringmatching.GeneralizedJaccard()
        self.tokenizer = py_stringmatching.AlphanumericTokenizer()
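# Hedged usage sketch (not part of the source): assuming an instance of the
# class above, GeneralizedJaccard scores the two alphanumeric token bags;
# the strings are made up.
import py_stringmatching

tok = py_stringmatching.AlphanumericTokenizer()
sim = py_stringmatching.GeneralizedJaccard()
print(sim.get_sim_score(tok.tokenize('Nick Danger, Third Eye'),
                        tok.tokenize('Nick Danger 3rd Eye')))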
Example #6
import py_stringmatching as sm

alnum_tok = sm.AlphanumericTokenizer()
qg3_tok = sm.QgramTokenizer(qval=3)

jac = sm.Jaccard()
lev = sm.Levenshtein()


def calcola_similarita(string1, string2):
    # "calcola_similarita" = "compute similarity": Jaccard on alphanumeric
    # tokens, Levenshtein on the raw strings, and Jaccard on 3-gram tokens.
    a = jac.get_sim_score(alnum_tok.tokenize(string1),
                          alnum_tok.tokenize(string2))
    b = lev.get_sim_score(string1, string2)
    c = jac.get_sim_score(qg3_tok.tokenize(string1), qg3_tok.tokenize(string2))
    return [{"alnum_jac": a}, {"alnum_lev": b}, {"qg3_jac": c}]


def add_features(elem):
    # Append title, director and date similarity scores to a candidate pair.
    line, count = elem
    title1 = line[4]
    director1 = line[3]
    date1 = line[5]
    title2 = line[7]
    director2 = line[6]
    date2 = line[8]
    return (line + (calcola_similarita(title1, title2),
                    calcola_similarita(director1, director2),
                    calcola_similarita(date1, date2)), count)
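# Hedged usage sketch (not part of the source): the record layout is assumed,
# with indices 3-8 holding the director, title and date of the two candidates.
sample = (('A', 'B', '1', 'George Lucas', 'Star Wars', '1977',
           'G. Lucas', 'Star Wars: A New Hope', '1977'), 1)
enriched, count = add_features(sample)
print(enriched[-3:])  # the three similarity score lists appended above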


def precision(true):
    tp = true.map(lambda row:("true",row.response))\
Example #7
    '7grams': sm.QgramTokenizer(qval=7),
    '8grams': sm.QgramTokenizer(qval=8),
    '9grams': sm.QgramTokenizer(qval=9),
    '1gram_set': sm.QgramTokenizer(qval=1, return_set=True),
    '1grams_set': sm.QgramTokenizer(qval=1, return_set=True),
    '2grams_set': sm.QgramTokenizer(qval=2, return_set=True),
    '3grams_set': sm.QgramTokenizer(qval=3, return_set=True),
    '4grams_set': sm.QgramTokenizer(qval=4, return_set=True),
    '5grams_set': sm.QgramTokenizer(qval=5, return_set=True),
    '6grams_set': sm.QgramTokenizer(qval=6, return_set=True),
    '7grams_set': sm.QgramTokenizer(qval=7, return_set=True),
    '8grams_set': sm.QgramTokenizer(qval=8, return_set=True),
    '9grams_set': sm.QgramTokenizer(qval=9, return_set=True),

    # Word tokenizers
    'alphanumeric': sm.AlphanumericTokenizer(),
    'alphanum': sm.AlphanumericTokenizer(),
    'alphabetic': sm.AlphabeticTokenizer(),
    'whitespace': sm.WhitespaceTokenizer(),
    'alphanumeric_set': sm.AlphanumericTokenizer(return_set=True),
    'alphanum_set': sm.AlphanumericTokenizer(return_set=True),
    'alphabetic_set': sm.AlphabeticTokenizer(return_set=True),
    'whitespace_set': sm.WhitespaceTokenizer(return_set=True),
}

cleaner_lookup = {
    'lower_and_strip': lower_and_strip,
    'alphanumeric': clean_to_alphanum,
    'alphanum': clean_to_alphanum,
}
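# Hedged usage sketch (not part of the source): the name of the truncated
# tokenizer dictionary above is not shown, so `tokenizer_lookup` is assumed
# purely for illustration; the cleaner functions are project-specific.
# tok = tokenizer_lookup['alphanumeric_set']
# clean = cleaner_lookup['alphanum']
# tok.tokenize(clean('Data-Integration 101!'))  # e.g. ['Data', 'Integration', '101']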