def dice(arr1, arr2):
    """
    This function computes the Dice score between the two input lists/sets.

    Args:
        arr1,arr2 (list or set): The input list or sets for which the Dice
            score should be computed. Any other non-None value is treated as
            a single token and wrapped in a one-element list.

    Returns:
        The Dice score if both the lists/set are not None and do not have any
        missing tokens (i.e NaN), else returns NaN.
    """
    # A plain float NaN is used instead of ``pd.np.NaN``: the ``pandas.np``
    # alias and the ``numpy.NaN`` spelling no longer exist in current
    # releases, so the old expression raised AttributeError.
    missing = float('nan')

    if arr1 is None or arr2 is None:
        return missing

    # Sets are passed through untouched; wrapping a set in a list would turn
    # it into a single unhashable token.
    if not isinstance(arr1, (list, set)):
        arr1 = [arr1]
    # pd.isnull only checks element-wise on list-likes, hence the list() copy.
    if any(pd.isnull(list(arr1))):
        return missing

    if not isinstance(arr2, (list, set)):
        arr2 = [arr2]
    if any(pd.isnull(list(arr2))):
        return missing

    # Create Dice object
    measure = sm.Dice()
    # Call the function to return the dice score
    return measure.get_raw_score(arr1, arr2)
def dice(arr1, arr2):
    """
    Compute the Dice score between two token collections.

    Args:
        arr1,arr2 (list or set): The token collections to compare. Any other
            non-None value is treated as a single token and wrapped in a
            one-element list.

    Returns:
        The raw Dice score reported by ``sm.Dice``; NaN if either input is
        None or contains a missing token (None/NaN).
    """
    # ``pd.np.NaN`` is not available on current pandas/NumPy releases, so a
    # plain float NaN is returned instead (same value, no library alias).
    nan_value = float('nan')

    if arr1 is None or arr2 is None:
        return nan_value

    # Keep sets as they are: wrapping one in a list would make the whole set
    # a single (unhashable) token.
    if not isinstance(arr1, (list, set)):
        arr1 = [arr1]
    # list() gives pd.isnull a list-like so the check is element-wise.
    if any(pd.isnull(list(arr1))):
        return nan_value

    if not isinstance(arr2, (list, set)):
        arr2 = [arr2]
    if any(pd.isnull(list(arr2))):
        return nan_value

    # Create Dice object
    measure = sm.Dice()
    # Call the function to return the dice score
    return measure.get_raw_score(arr1, arr2)
def __init__(self):
    """Create the similarity measures and the tokenizer kept on this object.

    ``similarity_function`` receives one freshly constructed instance of each
    ``sm`` measure class listed below, in the listed order.
    ``alphanumeric_tokenizer`` is an ``sm.AlphanumericTokenizer`` built with
    ``return_set=True``.
    """
    measure_classes = (
        sm.BagDistance,
        sm.Cosine,
        sm.Dice,
        sm.Editex,
        sm.GeneralizedJaccard,
        sm.Jaccard,
        sm.Jaro,
        sm.JaroWinkler,
        sm.Levenshtein,
        sm.OverlapCoefficient,
        sm.TverskyIndex,
    )
    # Instantiate every measure with its default constructor arguments.
    self.similarity_function = [measure_cls() for measure_cls in measure_classes]
    self.alphanumeric_tokenizer = sm.AlphanumericTokenizer(return_set=True)
def dice(arr1, arr2):
    """
    This function computes the Dice score between the two input lists/sets.

    Args:
        arr1,arr2 (list or set): The input list or sets for which the Dice
            score should be computed. Any other non-None value is treated as
            a single token and wrapped in a one-element list.

    Returns:
        The Dice score if both the lists/set are not None and do not have any
        missing tokens (i.e NaN), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.dice(['data', 'science'], ['data'])
        0.6666666666666666
        >>> em.dice(['data', 'science'], None)
        nan
    """
    # ``pd.np.NaN`` breaks on current pandas/NumPy (the ``pandas.np`` alias
    # and ``numpy.NaN`` were both removed); a plain float NaN is equivalent.
    not_a_number = float('nan')

    if arr1 is None or arr2 is None:
        return not_a_number

    # Leave sets unwrapped so their members stay individual tokens.
    if not isinstance(arr1, (list, set)):
        arr1 = [arr1]
    # pd.isnull needs a list-like to test each token separately.
    if any(pd.isnull(list(arr1))):
        return not_a_number

    if not isinstance(arr2, (list, set)):
        arr2 = [arr2]
    if any(pd.isnull(list(arr2))):
        return not_a_number

    # Create Dice object
    measure = sm.Dice()
    # Call the function to return the dice score
    return measure.get_raw_score(arr1, arr2)
axis=1) df['Q4'] = df.apply( lambda x: jac.get_sim_score(x['Q-gram_4_Tokens1'], x['Q-gram_4_Tokens2']), axis=1) df.head() # In[31]: cos = sm.Cosine() df['Cosine'] = df.apply( lambda x: cos.get_sim_score(x['aTokens'], x['bTokens']), axis=1) df.head() # In[32]: dice = sm.Dice() df['Dice'] = df.apply(lambda x: dice.get_sim_score(x['aTokens'], x['bTokens']), axis=1) df.head() # In[33]: oc = sm.OverlapCoefficient() df['Overlap'] = df.apply( lambda x: oc.get_sim_score(x['aTokens'], x['bTokens']), axis=1) df.head() # In[34]: # Set alpha beta https://en.wikipedia.org/wiki/Tversky_index # Setting alpha beta as 0.5 is same as Dice Similarity
def __init__(self):
    """Attach a q-gram tokenizer (``qval=3``) and a Dice measure to this object."""
    qgram_size = 3
    self.tokenizer = py_stringmatching.QgramTokenizer(qval=qgram_size)
    self.dice = py_stringmatching.Dice()
# Locations of the pickled in-sample / out-of-sample data sets.
SOInsampleFile = 'stackoverflowdata/' + insample_data
SOOutsampleFile = 'stackoverflowdata/' + outsample_data

# Load both data sets. The ``with`` blocks close each file as soon as it has
# been read; the previous ``pickle.load(open(...))`` form never closed them.
with open(SOInsampleFile, 'rb') as insample_fh:
    SOInsampleData = pickle.load(insample_fh)
with open(SOOutsampleFile, 'rb') as outsample_fh:
    SOOutsampleData = pickle.load(outsample_fh)

# TF-IDF cosine-similarity features over 1- to 3-grams with sublinear tf.
# The last positional flag is False for 'CSAbs' and True for 'CSSent'.
csAbstract = FVC.CosSim('CSAbs', TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), False)
csSentence = FVC.CosSim('CSSent', TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True), True)

# Set-based measures over excerpts: each is built once with a whitespace
# tokenizer and once with a q-gram tokenizer.
jac = FVC.stringMatchExcerpts('Jacc', sm.Jaccard(), sm.WhitespaceTokenizer(return_set=True))
jacq3 = FVC.stringMatchExcerpts('FuzzJacc', sm.Jaccard(), sm.QgramTokenizer(qval=3, return_set=True))
dice = FVC.stringMatchExcerpts('Dice', sm.Dice(), sm.WhitespaceTokenizer(return_set=True))
# NOTE(review): this q-gram variant reuses the label 'Dice', whereas the other
# q-gram variants carry a 'Fuzz' prefix - confirm the duplicate is intended.
diceq3 = FVC.stringMatchExcerpts('Dice', sm.Dice(), sm.QgramTokenizer(qval=3, return_set=True))
cosM = FVC.stringMatchExcerpts('CosMeasure', sm.Cosine(), sm.WhitespaceTokenizer(return_set=True))
# NOTE(review): unlike jacq3/diceq3, no qval is passed here, so the tokenizer's
# default q-gram size applies - confirm this is intended.
cosMq3 = FVC.stringMatchExcerpts('FuzzCosMeasure', sm.Cosine(), sm.QgramTokenizer(return_set=True))

# Sequence-based measures over titles.
LVdist = FVC.stringMatchTitles('LVDist', sm.Levenshtein())
sw = FVC.stringMatchTitles('SW', sm.SmithWaterman())
nw = FVC.stringMatchTitles('NW', sm.NeedlemanWunsch())
jw = FVC.stringMatchTitles('JW', sm.JaroWinkler())


def writeToCSV(fileName, header, tableList):
    wr = csv.writer(open(fileName, 'wb'), quoting=csv.QUOTE_ALL)