예제 #1
0
def tfidf(arr1, arr2):
    """Return the TF-IDF similarity score between two token collections.

    Each argument may be a list of tokens or a single value; a non-list
    value is wrapped in a one-element list before scoring.

    Args:
        arr1: Token list (or single token) for the first record.
        arr2: Token list (or single token) for the second record.

    Returns:
        The result of ``sm.TfIdf().get_sim_score(arr1, arr2)``, or NaN when
        either argument is None or contains a null element according to
        ``pd.isnull``.
    """
    # float('nan') replaces the former ``pd.np.NaN``: the ``pandas.np`` alias
    # was removed in pandas 2.0 and ``np.NaN`` in NumPy 2.0, so the
    # missing-value branches raised AttributeError on current releases.
    missing = float('nan')

    if arr1 is None or arr2 is None:
        return missing

    if not isinstance(arr1, list):
        arr1 = [arr1]
    if any(pd.isnull(arr1)):
        return missing

    if not isinstance(arr2, list):
        arr2 = [arr2]
    if any(pd.isnull(arr2)):
        return missing

    # The measure is built without a corpus argument.
    # NOTE(review): presumably the two inputs then act as the whole corpus;
    # confirm against the py_stringmatching documentation.
    measure = sm.TfIdf()
    return measure.get_sim_score(arr1, arr2)
예제 #2
0
# In[36]:

# Module-level accumulator; generate_corpus() adds one entry per call.
corpus = list()


def generate_corpus(tokens):
    """Append ``tokens`` as a single entry to the module-level ``corpus``.

    The argument is stored as-is (not copied or flattened). Returns None.
    """
    corpus.append(tokens)
    return None


# Add every value of 'aTokens' to the corpus, then every value of 'bTokens',
# one corpus entry per row value, in row order.
corpus.extend(df['aTokens'])
corpus.extend(df['bTokens'])
print(len(corpus))

# In[37]:

# TF-IDF measure constructed with the corpus collected above.
tfidf = sm.TfIdf(corpus)


def _tfidf_row_score(row):
    """Return the TF-IDF similarity of one row's 'aTokens' and 'bTokens'."""
    return tfidf.get_sim_score(row['aTokens'], row['bTokens'])


# axis=1 hands each row to the scorer; results land in a new 'TfIdf' column.
df['TfIdf'] = df.apply(_tfidf_row_score, axis=1)
df.head()

# # Sequence Based Similarities

# In[38]:

# Affine measure with the library's default constructor arguments.
aff = sm.Affine()


def _affine_row_score(row):
    """Return the raw Affine score of one row's 'Sequence1' and 'Sequence2'."""
    return aff.get_raw_score(row['Sequence1'], row['Sequence2'])


# axis=1 hands each row to the scorer; results land in a new 'Affine' column.
df['Affine'] = df.apply(_affine_row_score, axis=1)
df.head()

# In[39]: