def jaro(s1, s2):
    if s1 is None or s2 is None:
        return np.NaN
    if pd.isnull(s1) or pd.isnull(s2):
        return np.NaN
    s1 = remove_non_ascii(s1)
    s2 = remove_non_ascii(s2)
    sim = jpype.JClass('build.SimilarityFunction')()
    return sim.jaro(s1, s2)


def smith_waterman_gotoh(s1, s2):
    if s1 is None or s2 is None:
        return np.NaN
    if pd.isnull(s1) or pd.isnull(s2):
        return np.NaN
    s1 = remove_non_ascii(s1)
    s2 = remove_non_ascii(s2)
    sim = jpype.JClass('build.SimilarityFunction')()
    return sim.smithWatermanGotoh(s1, s2)


def needleman_wunsch(s1, s2):
    if s1 is None or s2 is None:
        return np.NaN
    if pd.isnull(s1) or pd.isnull(s2):
        return np.NaN
    if isinstance(s1, basestring):
        s1 = remove_non_ascii(s1)
    if isinstance(s2, basestring):
        s2 = remove_non_ascii(s2)
    sim = jpype.JClass('build.SimilarityFunction')()
    return sim.needlemanWunch(s1, s2)


def soundex(s1, s2):
    if s1 is None or s2 is None:
        return np.NaN
    if pd.isnull(s1) or pd.isnull(s2):
        return np.NaN
    if isinstance(s1, basestring):
        s1 = remove_non_ascii(s1)
    if isinstance(s2, basestring):
        s2 = remove_non_ascii(s2)
    sim = jpype.JClass('build.SimilarityFunction')()
    return sim.soundex(s1, s2)


def smith_waterman(s1, s2):
    if s1 is None or s2 is None:
        return np.NaN
    if pd.isnull(s1) or pd.isnull(s2):
        return np.NaN
    if isinstance(s1, basestring):
        s1 = remove_non_ascii(s1)
    if isinstance(s2, basestring):
        s2 = remove_non_ascii(s2)
    sim = jpype.JClass('build.SimilarityFunction')()
    return sim.smithWaterman(s1, s2)


def lev(s1, s2):
    if s1 is None or s2 is None:
        return np.NaN
    if pd.isnull(s1) or pd.isnull(s2):
        return np.NaN
    if isinstance(s1, basestring):
        s1 = remove_non_ascii(s1)
    if isinstance(s2, basestring):
        s2 = remove_non_ascii(s2)
    sim = jpype.JClass('build.SimilarityFunction')()
    return sim.levenshtein(str(s1), str(s2))
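# Usage sketch (assumptions): the wrappers above rely on numpy (np), pandas
# (pd), jpype and a remove_non_ascii helper being importable, and on a JVM
# that already has the build.SimilarityFunction class on its classpath.
# The jar path below is a placeholder, not a path from this project.
import jpype
import numpy as np
import pandas as pd

SIM_JAR = 'path/to/similarity_functions.jar'  # hypothetical location

if not jpype.isJVMStarted():
    # start the JVM once, before any of the similarity wrappers are called
    jpype.startJVM(jpype.getDefaultJVMPath(), '-Djava.class.path=' + SIM_JAR)

print(jaro('jellyfish', 'smellyfish'))  # Jaro similarity in [0, 1]
print(lev('kitten', 'sitting'))         # Levenshtein-based score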
def process_table(self, df, overlap_attr, qgram, rem_stop_words):
    # get ltable attr column
    attr_col_values = df[overlap_attr].values
    # remove non-ascii chars
    attr_col_values = [remove_non_ascii(v) for v in attr_col_values]
    # remove special characters
    attr_col_values = [self.rem_punctuations(v).lower()
                       for v in attr_col_values]
    # chop the attribute values
    col_values_chopped = [v.split() for v in attr_col_values]
    # convert it into set
    col_values_chopped = [list(set(v)) for v in col_values_chopped]
    # remove stop words
    if rem_stop_words == True:
        col_values_chopped = [self.rem_stopwords(v)
                              for v in col_values_chopped]
    if qgram is not None:
        values = [' '.join(v) for v in col_values_chopped]
        col_values_chopped = [ngrams(v, qgram) for v in values]
    return col_values_chopped


def process_table(self, table, overlap_attr, q_val, rem_stop_words):
    # get overlap_attr column
    attr_col_values = table[overlap_attr]
    # remove non-ascii chars
    attr_col_values = [helper.remove_non_ascii(val) for val in attr_col_values]
    # remove special characters
    attr_col_values = [self.rem_punctuations(val).lower()
                       for val in attr_col_values]
    # chop the attribute values
    col_values_chopped = [val.split() for val in attr_col_values]
    # convert the chopped values into a set
    col_values_chopped = [list(set(val)) for val in col_values_chopped]
    # remove stop words
    if rem_stop_words == True:
        col_values_chopped = [self.rem_stopwords(val)
                              for val in col_values_chopped]
    if q_val is not None:
        values = [' '.join(val) for val in col_values_chopped]
        col_values_chopped = [qgram(val, q_val) for val in values]
    return col_values_chopped
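# Illustration only: the same overlap-tokenization pipeline written without
# the class context. The punctuation strip and stop-word list are stand-ins
# for the self.rem_punctuations / self.rem_stopwords helpers assumed above.
import re
import pandas as pd

stop_words = {'the', 'a', 'an'}                     # stand-in stop-word list
strip_punct = lambda s: re.sub(r'[^\w\s]', ' ', s)  # stand-in for rem_punctuations

def overlap_tokens(val, rem_stop_words=True):
    if pd.isnull(val):
        return []
    tokens = set(strip_punct(val).lower().split())
    if rem_stop_words:
        tokens -= stop_words
    return list(tokens)

df = pd.DataFrame({'title': ['The Big Sleep', 'Big  Fish!', None]})
print(df['title'].apply(overlap_tokens).tolist())
# e.g. [['big', 'sleep'], ['big', 'fish'], []]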
def _tok_qgram(s, q):
    """
    q-gram tokenizer; splits the input string into a list of q-grams

    Parameters
    ----------
    s : string
        source string to be converted into qgrams
    q : integer
        q-value

    Returns
    -------
    qgram_list : list, q-gram list of source string
    """
    # check if the input is of type base string
    if pd.isnull(s):
        return s
    if not isinstance(s, basestring):
        raise ValueError('Input should be of type string')
    if q <= 0:
        raise ValueError('q value must be greater than 0')
    s = remove_non_ascii(s)
    # assume that JVM is already started !!!
    tok_cls = jpype.JClass('build.Tokenizers')
    tokenizer = tok_cls()
    return list(tokenizer.qgramTokenizer(s, float(q)))  # fix in java, it should be int
def tok_delim(s):
    # check if the input is of type base string
    if pd.isnull(s):
        return s
    if not isinstance(s, basestring):
        raise ValueError('Input should be of type string')
    s = remove_non_ascii(s)
    # NOTE: the delimiter d is a free variable; it is expected to be bound
    # in the enclosing scope (e.g. by a closure or functools.partial).
    return s.split(d)
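# tok_delim reads the delimiter d as a free variable, so it is presumably
# produced by a factory that binds d. A minimal sketch of such a factory
# (the name make_tok_delim is an assumption, not part of the original API),
# reusing pd and remove_non_ascii from the surrounding module:
def make_tok_delim(d):
    def tok_delim(s):
        if pd.isnull(s):
            return s
        s = remove_non_ascii(s)
        return s.split(d)
    return tok_delim

tok_comma = make_tok_delim(',')
print(tok_comma('red,green,blue'))  # ['red', 'green', 'blue']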
def process_val(self, val, overlap_attr, q_val, rem_stop_words):
    val = helper.remove_non_ascii(val)
    val = self.rem_punctuations(val).lower()
    chopped_vals = val.split()
    if rem_stop_words == True:
        chopped_vals = self.rem_stopwords(chopped_vals)
    if q_val != None:
        values = ' '.join(chopped_vals)
        chopped_vals = qgram(values, q_val)
    return list(set(chopped_vals))


def process_val(self, val, overlap_attr, qgram, rem_stop_words):
    val = remove_non_ascii(val)
    val = self.rem_punctuations(val).lower()
    chopped_vals = val.split()
    if rem_stop_words == True:
        chopped_vals = self.rem_stopwords(chopped_vals)
    if qgram != None:
        values = ' '.join(chopped_vals)
        chopped_vals = ngrams(values, qgram)
    return list(set(chopped_vals))
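# Both process_val variants delegate to a q-gram tokenizer (qgram / ngrams)
# that is not shown here. Below is one common definition of character
# q-grams as a minimal stand-in; the real helpers may pad the string or
# handle the boundaries differently.
def char_qgrams(s, q):
    # all length-q substrings of s, in order
    if len(s) < q:
        return [s]
    return [s[i:i + q] for i in range(len(s) - q + 1)]

print(char_qgrams('data', 2))  # ['da', 'at', 'ta']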
def tok_whitespace(s):
    # check if the input is of type base string
    if pd.isnull(s):
        return s
    if not isinstance(s, basestring):
        raise ValueError('Input should be of type string')
    s = remove_non_ascii(s)
    # assume that JVM is already started !!!
    tok_cls = jpype.JClass('build.Tokenizers')
    tokenizer = tok_cls()
    return tokenizer.whitespaceTokenizer(s)
def tok_qgram(s):
    # check if the input is of type base string
    if pd.isnull(s):
        return s
    if not isinstance(s, basestring):
        raise ValueError('Input should be of type string')
    # NOTE: q is a free variable here; it is expected to be bound in the
    # enclosing scope (compare _tok_qgram above, which takes q explicitly).
    if q <= 0:
        raise ValueError('q value must be greater than 0')
    s = remove_non_ascii(s)
    # assume that JVM is already started !!!
    tok_cls = jpype.JClass('build.Tokenizers')
    tokenizer = tok_cls()
    return list(tokenizer.qgramTokenizer(s, float(q)))  # fix in java, it should be int
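# Since _tok_qgram takes q explicitly, a single-argument tokenizer with the
# same effect as tok_qgram can be built by partial application instead of
# relying on a free variable q (illustrative sketch):
from functools import partial

tok_qgram3 = partial(_tok_qgram, q=3)
tokens = tok_qgram3('database')  # 3-grams of 'database', via the Java tokenizer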
def cast_val(v, i):
    if v == "None":
        return None
    elif isinstance(i, bool):
        return bool(v)
    elif isinstance(i, float):
        return float(v)
    elif isinstance(i, int):
        return int(v)
    elif isinstance(i, basestring):
        v = remove_non_ascii(unicode(v))
        return str(v)
    elif isinstance(i, object):
        return v
    else:
        logger.warning("Input value did not match any of the known types")
        return v
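# cast_val restores a value v to the type of a reference instance i, e.g.
# after a round-trip through text. A few illustrative calls:
print(cast_val("3", 7))      # 3 (int, because i is an int)
print(cast_val("2.5", 1.0))  # 2.5 (float)
print(cast_val("None", 7))   # None
# Caveat: the bool branch applies bool() to a non-empty string, so
# cast_val("False", True) evaluates to True rather than False.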