import pandas as pd
import numpy as np
import six
import py_stringmatching as sm


def tok_delim(input_string, d):
    """
    This function splits the input string into a list of tokens
    (based on the delimiter).

    Args:
        input_string (string): Input string that should be tokenized.
        d (string): Delimiter string.

    Returns:
        A list of tokens if the input string is not NaN, else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_delim('data science', ' ')
        ['data', 'science']
        >>> em.tok_delim('data$#$science', '$#$')
        ['data', 'science']
        >>> em.tok_delim(None, ' ')
        nan
    """
    if pd.isnull(input_string):
        # pd.np was removed in pandas 2.0; use numpy's nan directly.
        return np.nan
    # Coerce non-string input to str; decode bytes to unicode.
    if not (isinstance(input_string, six.string_types) or
            isinstance(input_string, bytes)):
        input_string = str(input_string)
    elif isinstance(input_string, bytes):
        input_string = input_string.decode('utf-8')
    measure = sm.DelimiterTokenizer(delim_set=[d])
    return measure.tokenize(input_string)
import pandas as pd
import numpy as np
import py_stringmatching as sm
# Generic helper module from py_entitymatching providing
# convert_to_str_unicode.
import py_entitymatching.utils.generic_helper as gh


def tok_delim(input_string, d):
    """
    This function splits the input string into a list of tokens
    (based on the delimiter).

    Args:
        input_string (string): Input string that should be tokenized.
        d (string): Delimiter string.

    Returns:
        A list of tokens if the input string is not NaN, else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_delim('data science', ' ')
        ['data', 'science']
        >>> em.tok_delim('data$#$science', '$#$')
        ['data', 'science']
        >>> em.tok_delim(None, ' ')
        nan
    """
    if pd.isnull(input_string):
        # pd.np was removed in pandas 2.0; use numpy's nan directly.
        return np.nan
    input_string = gh.convert_to_str_unicode(input_string)
    measure = sm.DelimiterTokenizer(delim_set=[d])
    return measure.tokenize(input_string)
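For reference, a minimal sketch of what gh.convert_to_str_unicode appears to do, inferred from the inline six/bytes handling in the first variant above; the actual helper lives in py_entitymatching and may differ:

import six


def convert_to_str_unicode(value):
    # Inferred behavior (assumption): coerce non-string input to str and
    # decode bytes to unicode, mirroring the inline handling above.
    if not (isinstance(value, six.string_types) or isinstance(value, bytes)):
        return str(value)
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value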
import py_stringmatching as sm


def matchHeaders(headers):
    """Print pairs of column headers from consecutive tables whose
    similarity scores suggest a match (overlap coefficient of 1, or a
    Levenshtein or Jaccard similarity of at least 0.5)."""
    jac = sm.Jaccard()
    lev = sm.Levenshtein()
    oc = sm.OverlapCoefficient()
    # Tokenize headers on underscores for the set-based measures.
    delim_tok = sm.DelimiterTokenizer(delim_set=['_'])
    header_len = len(headers)
    for i in range(header_len - 1):
        # Compare each table's headers against the next table's headers.
        j = i + 1
        for first in headers[i]:
            for second in headers[j]:
                first_tokens = delim_tok.tokenize(first)
                second_tokens = delim_tok.tokenize(second)
                jac_score = jac.get_sim_score(first_tokens, second_tokens)
                lev_score = lev.get_sim_score(first, second)
                oc_score = oc.get_sim_score(first_tokens, second_tokens)
                if oc_score == 1 or lev_score >= 0.5 or jac_score >= 0.5:
                    print(first + ' of Table' + str(i + 1) + ' and ' +
                          second + ' of Table' + str(j + 1) + ' matched')
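A quick usage sketch, assuming headers is a list with one list of column names per table (the column names below are hypothetical):

# Hypothetical example input: one list of column names per table.
table1_cols = ['person_id', 'first_name', 'age']
table2_cols = ['person_id', 'name_first', 'salary']
matchHeaders([table1_cols, table2_cols])
# Should print something like:
#   person_id of Table1 and person_id of Table2 matched
#   first_name of Table1 and name_first of Table2 matched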
import pandas as pd
import py_stringmatching as sm
import py_entitymatching.utils.generic_helper as gh


# Note: in the original snippet the delimiter `d` was a free variable,
# presumably captured from an enclosing scope; it is taken as a parameter
# here so the function is self-contained.
def tok_delim(s, d):
    # Check if the input is null (None or NaN).
    if pd.isnull(s):
        return s
    # Remove non-ascii characters. Note: This should be fixed in the
    # next version.
    # s = remove_non_ascii(s)
    s = gh.convert_to_str_unicode(s)
    # Initialize the tokenizer measure object.
    measure = sm.DelimiterTokenizer(delim_set=[d])
    # Call the function that will tokenize the input string.
    return measure.tokenize(s)
import pandas as pd
import six
import py_stringmatching as sm


# Note: as in the previous variant, the delimiter `d` was a free variable
# in the original snippet; it is taken as a parameter here.
def tok_delim(s, d):
    # Check if the input is null (None or NaN).
    if pd.isnull(s):
        return s
    # Remove non-ascii characters. Note: This should be fixed in the
    # next version.
    # s = remove_non_ascii(s)
    # Coerce non-string input to str; decode bytes to unicode.
    if not (isinstance(s, six.string_types) or isinstance(s, bytes)):
        s = str(s)
    elif isinstance(s, bytes):
        s = s.decode('utf-8')
    # Initialize the tokenizer measure object.
    measure = sm.DelimiterTokenizer(delim_set=[d])
    # Call the function that will tokenize the input string.
    return measure.tokenize(s)
import pandas as pd
import numpy as np
import py_stringmatching as sm


def tok_delim(input_string, d):
    # Return NaN for null input; otherwise split on the delimiter.
    if pd.isnull(input_string):
        # pd.np was removed in pandas 2.0; use numpy's nan directly.
        return np.nan
    measure = sm.DelimiterTokenizer(delim_set=[d])
    return measure.tokenize(input_string)
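A quick usage sketch of this minimal variant, matching the doctest examples in the first version above:

print(tok_delim('data science', ' '))      # ['data', 'science']
print(tok_delim('data$#$science', '$#$'))  # ['data', 'science']
print(tok_delim(None, ' '))                # nan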