Example #1
def jaro(s1, s2):
    if s1 is None or s2 is None:
        return np.NaN
    if pd.isnull(s1) or pd.isnull(s2):
        return np.NaN
    s1 = remove_non_ascii(s1)
    s2 = remove_non_ascii(s2)
    sim = jpype.JClass('build.SimilarityFunction')()
    return sim.jaro(s1, s2)
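Examples #1 through #6 wrap methods of the Java class build.SimilarityFunction and are excerpts: they assume numpy (np), pandas (pd), and jpype are already imported, that the JVM hosting that class has been started, and that a remove_non_ascii helper is in scope. A minimal sketch of that setup (the 'build' classpath and the helper's body are assumptions for illustration, not the library's actual code):

import jpype
import numpy as np
import pandas as pd

def remove_non_ascii(s):
    # hypothetical helper: keep only characters in the ASCII range
    return ''.join(c for c in s if ord(c) < 128)

# start the JVM once, pointing it at the directory holding the compiled
# Java classes (the 'build' path is an assumption for this sketch)
if not jpype.isJVMStarted():
    jpype.startJVM(classpath=['build'])

print(jaro('apple', 'appel'))  # a similarity score between 0.0 and 1.0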
Example #2
def smith_waterman_gotoh(s1, s2):
    if s1 is None or s2 is None:
        return np.NaN
    if pd.isnull(s1) or pd.isnull(s2):
        return np.NaN
    s1 = remove_non_ascii(s1)
    s2 = remove_non_ascii(s2)

    sim = jpype.JClass('build.SimilarityFunction')()
    return sim.smithWatermanGotoh(s1, s2)
Example #3
def needleman_wunsch(s1, s2):
    if s1 is None or s2 is None:
        return np.NaN
    if pd.isnull(s1) or pd.isnull(s2):
        return np.NaN
    if isinstance(s1, basestring):
        s1 = remove_non_ascii(s1)
    if isinstance(s2, basestring):
        s2 = remove_non_ascii(s2)
    sim = jpype.JClass('build.SimilarityFunction')()
    return sim.needlemanWunch(s1, s2)
Example #4
def soundex(s1, s2):
    if s1 is None or s2 is None:
        return np.NaN
    if pd.isnull(s1) or pd.isnull(s2):
        return np.NaN
    if isinstance(s1, basestring):
        s1 = remove_non_ascii(s1)
    if isinstance(s2, basestring):
        s2 = remove_non_ascii(s2)
    sim = jpype.JClass('build.SimilarityFunction')()
    return sim.soundex(s1, s2)
Example #5
def smith_waterman(s1, s2):
    if s1 is None or s2 is None:
        return np.NaN
    if pd.isnull(s1) or pd.isnull(s2):
        return np.NaN
    if isinstance(s1, basestring):
        s1 = remove_non_ascii(s1)
    if isinstance(s2, basestring):
        s2 = remove_non_ascii(s2)
    sim = jpype.JClass('build.SimilarityFunction')()
    return sim.smithWaterman(s1, s2)
Example #6
def lev(s1, s2):
    if s1 is None or s2 is None:
        return np.NaN
    if pd.isnull(s1) or pd.isnull(s2):
        return np.NaN
    if isinstance(s1, basestring):
        s1 = remove_non_ascii(s1)
    if isinstance(s2, basestring):
        s2 = remove_non_ascii(s2)
    sim = jpype.JClass('build.SimilarityFunction')()
    return sim.levenshtein(str(s1), str(s2))
Example #7
    def process_table(self, df, overlap_attr, qgram, rem_stop_words):
        # get ltable attr column
        attr_col_values = df[overlap_attr].values
        # remove non-ascii chars
        attr_col_values = [remove_non_ascii(v) for v in attr_col_values]

        # remove special characters
        attr_col_values = [
            self.rem_punctuations(v).lower() for v in attr_col_values
        ]
        # chop the attribute values
        col_values_chopped = [v.split() for v in attr_col_values]
        # convert it into set

        col_values_chopped = [list(set(v)) for v in col_values_chopped]

        # remove stop words
        if rem_stop_words == True:
            col_values_chopped = [
                self.rem_stopwords(v) for v in col_values_chopped
            ]
        if qgram is not None:
            values = [' '.join(v) for v in col_values_chopped]
            col_values_chopped = [ngrams(v, qgram) for v in values]

        return col_values_chopped
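rem_punctuations, rem_stopwords, and ngrams are helpers defined on the surrounding class or module and are not shown here. As a rough, self-contained illustration of the same per-value pipeline (lowercase, strip punctuation, split, de-duplicate, optional stop-word removal and q-gramming), with stand-in helpers and a made-up stop-word list:

import string

def clean_and_tokenize(value, q=None, stop_words=('the', 'a', 'an')):
    # stand-ins for rem_punctuations / rem_stopwords / ngrams
    value = value.translate(str.maketrans('', '', string.punctuation)).lower()
    tokens = sorted(set(value.split()))
    tokens = [t for t in tokens if t not in stop_words]
    if q is not None:
        joined = ' '.join(tokens)
        tokens = [joined[i:i + q] for i in range(len(joined) - q + 1)]
    return tokens

print(clean_and_tokenize('The Apple iPhone 6s, 64 GB'))        # word tokens
print(clean_and_tokenize('The Apple iPhone 6s, 64 GB', q=3))   # 3-grams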
Example #8
    def process_table(self, table, overlap_attr, q_val, rem_stop_words):

        # get overlap_attr column
        attr_col_values = table[overlap_attr]

        # remove non-ascii chars
        attr_col_values = [helper.remove_non_ascii(val) for val in attr_col_values]

        # remove special characters
        attr_col_values = [self.rem_punctuations(val).lower() for val in attr_col_values]

        # chop the attribute values
        col_values_chopped = [val.split() for val in attr_col_values]

        # convert the chopped values into a set
        col_values_chopped = [list(set(val)) for val in col_values_chopped]

        # remove stop words
        if rem_stop_words == True:
            col_values_chopped = [self.rem_stopwords(val) for val in col_values_chopped]

        if q_val is not None:
            values = [' '.join(val) for val in col_values_chopped]
            col_values_chopped = [qgram(val, q_val) for val in values]

        return col_values_chopped
Example #9
def _tok_qgram(s, q):
    """
    q-gram tokenizer; splits the input string into a list of q-grams

    Parameters
    ----------
    s : string
        source string to be converted into qgrams
    q : integer
        q-value

    Returns
    -------
    qgram_list : list,
         q-gram list of source string
    """
    # check if the input is of type base string
    if pd.isnull(s):
        return s
    if not isinstance(s, basestring):
        raise ValueError('Input should be of type string')
    if q <= 0:
        raise ValueError('q value must be greater than 0')
    s = remove_non_ascii(s)
    # assume that JVM is already started !!!
    tok_cls = jpype.JClass('build.Tokenizers')
    tokenizer = tok_cls()
    return list(tokenizer.qgramTokenizer(
        s, float(q)))  # fix in java, it should be int
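build.Tokenizers is a Java class loaded through jpype, so the exact q-gram output depends on that implementation. A rough pure-Python stand-in (no prefix/suffix padding assumed) conveys the idea:

def qgram_tokenize(s, q):
    # naive q-gram tokenizer: every contiguous substring of length q
    if len(s) < q:
        return [s]
    return [s[i:i + q] for i in range(len(s) - q + 1)]

print(qgram_tokenize('database', 3))
# ['dat', 'ata', 'tab', 'aba', 'bas', 'ase']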
Example #10
def _tok_qgram(s, q):
    """
    q-gram tokenizer; splits the input string into a list of q-grams

    Parameters
    ----------
    s : string
        source string to be converted into qgrams
    q : integer
        q-value

    Returns
    -------
    qgram_list : list,
         q-gram list of source string
    """
    # check if the input is of type base string
    if pd.isnull(s):
        return s
    if not isinstance(s, basestring):
        raise ValueError('Input should be of type string')
    if q <= 0:
        raise ValueError('q value must be greater than 0')
    s = remove_non_ascii(s)
    # assume that JVM is already started !!!
    tok_cls = jpype.JClass('build.Tokenizers')
    tokenizer = tok_cls()
    return list(tokenizer.qgramTokenizer(s, float(q))) # fix in java, it should be int
Example #11
 def tok_delim(s):
     # check if the input is of type base string
     if pd.isnull(s):
         return s
     if not isinstance(s, basestring):
         raise ValueError('Input should be of type string')
     s = remove_non_ascii(s)
     return s.split(d)
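tok_delim reads the delimiter d from an enclosing scope; it is not an argument of the function (the tok_qgram variants further below read q the same way). A hedged sketch of how such a tokenizer could be built as a closure over the delimiter:

def make_tok_delim(d):
    # return a tokenizer bound to the delimiter d (illustrative only)
    def tok_delim(s):
        if s is None:
            return s
        return str(s).split(d)
    return tok_delim

tok_comma = make_tok_delim(',')
print(tok_comma('a,b,c'))  # ['a', 'b', 'c']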
Example #13
 def process_val(self, val, overlap_attr, q_val, rem_stop_words):
     val = helper.remove_non_ascii(val)
     val = self.rem_punctuations(val).lower()
     chopped_vals = val.split()
     if rem_stop_words == True:
         chopped_vals = self.rem_stopwords(chopped_vals)
     if q_val != None:
         values = ' '.join(chopped_vals)
         chopped_vals = qgram(values, q_val)
     return list(set(chopped_vals))
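process_val applies the same cleaning steps as process_table, but to a single attribute value, and de-duplicates after the optional q-gramming. Assuming blocker is an instance of the (hypothetical) class that defines these methods, a call might look like:

# blocker is a hypothetical instance providing process_val and its helpers
tokens = blocker.process_val('The Apple iPhone 6s, 64GB!', 'title', 3, True)
print(sorted(tokens))  # de-duplicated 3-grams of the cleaned value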
Example #14
 def process_val(self, val, overlap_attr, qgram, rem_stop_words):
     val = remove_non_ascii(val)
     val = self.rem_punctuations(val).lower()
     chopped_vals = val.split()
     if rem_stop_words == True:
         chopped_vals = self.rem_stopwords(chopped_vals)
     if qgram != None:
         values = ' '.join(chopped_vals)
         chopped_vals = ngrams(values, qgram)
     return list(set(chopped_vals))
Example #15
 def tok_whitespace(s):
     # check if the input is of type base string
     if pd.isnull(s):
         return s
     if not isinstance(s, basestring):
         raise ValueError('Input should be of type string')
     s = remove_non_ascii(s)
     # assume that JVM is already started !!!
     tok_cls = jpype.JClass('build.Tokenizers')
     tokenizer = tok_cls()
     return tokenizer.whitespaceTokenizer(s)
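As with the q-gram tokenizer, the whitespace tokenizer delegates to the Java build.Tokenizers class. A pure-Python stand-in, which may differ from the Java behaviour on edge cases, is essentially:

def whitespace_tokenize(s):
    # split on any run of whitespace, dropping empty tokens
    return s.split()

print(whitespace_tokenize('Apple  iPhone\t6s'))  # ['Apple', 'iPhone', '6s']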
Example #16
 def tok_qgram(s):
     # check if the input is of type base string
     if pd.isnull(s):
         return s
     if not isinstance(s, basestring):
         raise ValueError('Input should be of type string')
     if q <= 0:
         raise ValueError('q value must be greater than 0')
     s = remove_non_ascii(s)
     # assume that JVM is already started !!!
     tok_cls = jpype.JClass('build.Tokenizers')
     tokenizer = tok_cls()
     return list(tokenizer.qgramTokenizer(s, float(q))) # fix in java, it should be int
Example #17
 def tok_qgram(s):
     # check if the input is of type base string
     if pd.isnull(s):
         return s
     if not isinstance(s, basestring):
         raise ValueError('Input should be of type string')
     if q <= 0:
         raise ValueError('q value must be greater than 0')
     s = remove_non_ascii(s)
     # assume that JVM is already started !!!
     tok_cls = jpype.JClass('build.Tokenizers')
     tokenizer = tok_cls()
     return list(tokenizer.qgramTokenizer(
         s, float(q)))  # fix in java, it should be int
Example #18
def cast_val(v, i):
    if v == "None":
        return None
    elif isinstance(i, bool):
        return bool(v)
    elif isinstance(i, float):
        return float(v)
    elif isinstance(i, int):
        return int(v)
    elif isinstance(i, basestring):
        v = remove_non_ascii(unicode(v))
        return str(v)
    elif isinstance(i, object):
        return v
    else:
        logger.warning("Input value did not match any of the known types")
        return v
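cast_val coerces a stringified value v back to the Python type of a reference value i; since bool is a subclass of int, the bool check has to precede the int check, as it does here. The snippet uses Python 2 names (basestring, unicode), so this sketch assumes a Python 2 runtime or equivalent aliases:

print(cast_val('3.14', 1.0))     # 3.14 (i is a float)
print(cast_val('42', 0))         # 42 (i is an int)
print(cast_val('None', 'abc'))   # None (the literal string "None" maps to None)
print(cast_val('hello', 'abc'))  # hello (i is a string)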
Example #19
def cast_val(v, i):
    if v == "None":
        return None
    elif isinstance(i, bool):
        return bool(v)
    elif isinstance(i, float):
        return float(v)
    elif isinstance(i, int):
        return int(v)
    elif isinstance(i, basestring):
        v = remove_non_ascii(unicode(v))
        return str(v)
    elif isinstance(i, object):
        return v
    else:
        logger.warning('Input value did not match any of the known types')
        return v
Example #20
    def process_table(self, df, overlap_attr, qgram, rem_stop_words):
        # get ltable attr column
        attr_col_values = df[overlap_attr].values
        # remove non-ascii chars
        attr_col_values = [remove_non_ascii(v) for v in attr_col_values]

        # remove special characters
        attr_col_values = [self.rem_punctuations(v).lower() for v in attr_col_values]
        # chop the attribute values
        col_values_chopped = [v.split() for v in attr_col_values]
        # convert it into set

        col_values_chopped = [list(set(v)) for v in col_values_chopped]

        # remove stop words
        if rem_stop_words == True:
            col_values_chopped = [self.rem_stopwords(v) for v in col_values_chopped]
        if qgram is not None:
            values = [' '.join(v) for v in col_values_chopped]
            col_values_chopped = [ngrams(v, qgram) for v in values]

        return col_values_chopped