def tfidf(arr1, arr2):
    """Return the TF-IDF similarity score between two token collections.

    Each argument may be a list of tokens or a single value; a non-list
    value is wrapped in a one-element list before scoring.

    Args:
        arr1: First token list (or single token).
        arr2: Second token list (or single token).

    Returns:
        NaN (a float) when either argument is ``None`` or contains an
        element that ``pd.isnull`` reports as missing; otherwise the value
        returned by ``sm.TfIdf().get_sim_score(arr1, arr2)``.
    """
    # Plain float NaN instead of ``pd.np.NaN``: the ``pd.np`` alias was
    # removed in pandas 2.0 and ``np.NaN`` was removed in NumPy 2.0.
    nan = float("nan")

    if arr1 is None or arr2 is None:
        return nan

    if not isinstance(arr1, list):
        arr1 = [arr1]
    if any(pd.isnull(arr1)):
        return nan

    if not isinstance(arr2, list):
        arr2 = [arr2]
    if any(pd.isnull(arr2)):
        return nan

    # Measure object is built without a corpus argument.
    measure = sm.TfIdf()
    return measure.get_sim_score(arr1, arr2)
# In[36]:

# Shared corpus: one entry per row of each token column of df.
corpus = []


def generate_corpus(tokens):
    """Append one token collection to the module-level ``corpus`` list."""
    corpus.append(tokens)


# Order matters: every 'aTokens' entry is added before any 'bTokens' entry.
for token_column in ('aTokens', 'bTokens'):
    df[token_column].apply(generate_corpus)
print(len(corpus))

# In[37]:

# NOTE: this rebinds the name `tfidf`; from here on it refers to this
# corpus-backed sm.TfIdf instance rather than the `tfidf` function defined
# earlier in the file.
tfidf = sm.TfIdf(corpus)
df['TfIdf'] = df.apply(
    lambda row: tfidf.get_sim_score(row['aTokens'], row['bTokens']),
    axis=1)
df.head()

# # Sequence Based Similarities

# In[38]:

# Raw Affine score computed row by row on the two sequence columns.
aff = sm.Affine()
df['Affine'] = df.apply(
    lambda row: aff.get_raw_score(row['Sequence1'], row['Sequence2']),
    axis=1)
df.head()

# In[39]: