def tok_alphanumeric(input_string):
    """
    This function returns a list of tokens that are maximal sequences of
    consecutive alphanumeric characters.

    Args:
        input_string (string): Input string that should be tokenized.

    Returns:
        A list of tokens, if the input string is not NaN,
        else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_alphanumeric('data9,(science), data9#.(integration).88')
        ['data9', 'science', 'data9', 'integration', '88']
        >>> em.tok_alphanumeric('#.$')
        []
        >>> em.tok_alphanumeric(None)
        nan
    """
    if pd.isnull(input_string):
        return pd.np.NaN

    input_string = gh.convert_to_str_unicode(input_string)

    measure = sm.AlphanumericTokenizer()
    return measure.tokenize(input_string)
def tok_alphanumeric(input_string):
    """
    This function returns a list of tokens that are maximal sequences of
    consecutive alphanumeric characters.

    Args:
        input_string (string): Input string that should be tokenized.

    Returns:
        A list of tokens, if the input string is not NaN,
        else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_alphanumeric('data9,(science), data9#.(integration).88')
        ['data9', 'science', 'data9', 'integration', '88']
        >>> em.tok_alphanumeric('#.$')
        []
        >>> em.tok_alphanumeric(None)
        nan
    """
    if pd.isnull(input_string):
        return pd.np.NaN

    if not (isinstance(input_string, six.string_types) or
            isinstance(input_string, bytes)):
        input_string = str(input_string)
    else:
        if isinstance(input_string, bytes):
            input_string = input_string.decode('utf-8')

    measure = sm.AlphanumericTokenizer()
    return measure.tokenize(input_string)
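# Minimal usage sketch of the py_stringmatching tokenizer that tok_alphanumeric
# wraps above; the sample strings are taken from the docstring examples, and the
# variable names here are illustrative only.
import py_stringmatching as sm

an_tok = sm.AlphanumericTokenizer()
print(an_tok.tokenize('data9,(science), data9#.(integration).88'))
# ['data9', 'science', 'data9', 'integration', '88']
print(an_tok.tokenize('#.$'))
# []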
def get_tokenized_data(data):
    an_tokenizer = string_matching.AlphanumericTokenizer()
    tokenized_data = []
    for entry in data:
        new_entry = {}
        for field in util.field_names:
            new_entry[field] = an_tokenizer.tokenize(str(entry[field]))
        tokenized_data.append(new_entry)
    return tokenized_data
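# Self-contained sketch of the same idea as get_tokenized_data, with the field
# names passed in explicitly instead of read from util.field_names; the helper
# name tokenize_records and the sample record are hypothetical.
import py_stringmatching as sm

def tokenize_records(data, field_names):
    an_tokenizer = sm.AlphanumericTokenizer()
    return [{field: an_tokenizer.tokenize(str(entry[field])) for field in field_names}
            for entry in data]

records = [{'title': 'data9 science', 'year': 1988}]
print(tokenize_records(records, ['title', 'year']))
# [{'title': ['data9', 'science'], 'year': ['1988']}]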
def __init__(self):
    self.similarity_function = [
        sm.BagDistance(),
        sm.Cosine(),
        sm.Dice(),
        sm.Editex(),
        sm.GeneralizedJaccard(),
        sm.Jaccard(),
        sm.Jaro(),
        sm.JaroWinkler(),
        sm.Levenshtein(),
        sm.OverlapCoefficient(),
        sm.TverskyIndex()
    ]
    self.alphanumeric_tokenizer = sm.AlphanumericTokenizer(return_set=True)
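# Sketch of how the set-based measures above can be applied with the
# return_set=True tokenizer; only measures that take token sets are used here
# (sequence-based ones such as Editex, Jaro, or Levenshtein score raw strings
# instead). The sample strings are illustrative.
import py_stringmatching as sm

tok = sm.AlphanumericTokenizer(return_set=True)
t1 = tok.tokenize('data integration')
t2 = tok.tokenize('data-integration 2019')
for measure in (sm.Cosine(), sm.Dice(), sm.Jaccard(), sm.OverlapCoefficient()):
    print(type(measure).__name__, measure.get_sim_score(t1, t2))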
def __init__(self):
    self.genJac = py_stringmatching.GeneralizedJaccard()
    self.tokenizer = py_stringmatching.AlphanumericTokenizer()
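# Sketch of how the two members above are typically combined: tokenize both
# strings, then score the token lists with GeneralizedJaccard. The usage is
# assumed from the py_stringmatching API; the sample strings are made up.
import py_stringmatching

genJac = py_stringmatching.GeneralizedJaccard()
tokenizer = py_stringmatching.AlphanumericTokenizer()

s1, s2 = 'data integration', 'data-integration 2019'
print(genJac.get_sim_score(tokenizer.tokenize(s1), tokenizer.tokenize(s2)))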
import py_stringmatching as sm

alnum_tok = sm.AlphanumericTokenizer()
qg3_tok = sm.QgramTokenizer(qval=3)
jac = sm.Jaccard()
lev = sm.Levenshtein()

def calcola_similarita(string1, string2):
    a = jac.get_sim_score(alnum_tok.tokenize(string1), alnum_tok.tokenize(string2))
    b = lev.get_sim_score(string1, string2)
    c = jac.get_sim_score(qg3_tok.tokenize(string1), qg3_tok.tokenize(string2))
    return [{"alnum_jac": a}, {"alnum_lev": b}, {"qg3_jac": c}]

def add_features(elem):
    line, count = elem
    title1 = line[4]
    director1 = line[3]
    date1 = line[5]
    title2 = line[7]
    director2 = line[6]
    date2 = line[8]
    return (line + (calcola_similarita(title1, title2),
                    calcola_similarita(director1, director2),
                    calcola_similarita(date1, date2)), count)

def precision(true):
    tp = true.map(lambda row: ("true", row.response))\
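# Example call for calcola_similarita ("compute similarity"); the titles below
# are made up for illustration. Jaccard over alphanumeric tokens ignores
# punctuation and word order, so the first score is 1.0 here.
print(calcola_similarita('The Godfather', 'Godfather, The'))
# [{'alnum_jac': 1.0}, {'alnum_lev': ...}, {'qg3_jac': ...}]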
    '7grams': sm.QgramTokenizer(qval=7),
    '8grams': sm.QgramTokenizer(qval=8),
    '9grams': sm.QgramTokenizer(qval=9),
    '1gram_set': sm.QgramTokenizer(qval=1, return_set=True),
    '1grams_set': sm.QgramTokenizer(qval=1, return_set=True),
    '2grams_set': sm.QgramTokenizer(qval=2, return_set=True),
    '3grams_set': sm.QgramTokenizer(qval=3, return_set=True),
    '4grams_set': sm.QgramTokenizer(qval=4, return_set=True),
    '5grams_set': sm.QgramTokenizer(qval=5, return_set=True),
    '6grams_set': sm.QgramTokenizer(qval=6, return_set=True),
    '7grams_set': sm.QgramTokenizer(qval=7, return_set=True),
    '8grams_set': sm.QgramTokenizer(qval=8, return_set=True),
    '9grams_set': sm.QgramTokenizer(qval=9, return_set=True),
    # Word tokenizers
    'alphanumeric': sm.AlphanumericTokenizer(),
    'alphanum': sm.AlphanumericTokenizer(),
    'alphabetic': sm.AlphabeticTokenizer(),
    'whitespace': sm.WhitespaceTokenizer(),
    'alphanumeric_set': sm.AlphanumericTokenizer(return_set=True),
    'alphanum_set': sm.AlphanumericTokenizer(return_set=True),
    'alphabetic_set': sm.AlphabeticTokenizer(return_set=True),
    'whitespace_set': sm.WhitespaceTokenizer(return_set=True),
}

cleaner_lookup = {
    'lower_and_strip': lower_and_strip,
    'alphanumeric': clean_to_alphanum,
    'alphanum': clean_to_alphanum,
}
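# Sketch of how a lookup table like this is typically used: pick a tokenizer by
# its configuration key, then tokenize. The dict name tokenizer_lookup is an
# assumption, since the fragment above starts mid-dictionary and its real name
# is not shown.
tok = tokenizer_lookup['alphanumeric_set']
print(tok.tokenize('data9,(science), data9#.88'))
# with return_set=True the duplicate 'data9' token appears only once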