def cosine(arr1, arr2):
    """
    This function computes the cosine measure between two token lists.

    Args:
        arr1,arr2 (list): The input token lists for which the cosine
         measure should be computed. Any value that is not a list is
         wrapped in a single-element list before the measure is computed.

    Returns:
        The raw score returned by ``sm.Cosine().get_raw_score`` if neither
        input is None and neither contains a missing token (as reported by
        ``pd.isnull``), else returns NaN.
    """
    # Build NaN directly: the ``pd.np`` alias was deprecated in pandas 1.0
    # and later removed, so ``pd.np.NaN`` fails on current pandas releases.
    nan = float('nan')

    if arr1 is None or arr2 is None:
        return nan
    # Wrap a non-list value so the element-wise null check below can run.
    if not isinstance(arr1, list):
        arr1 = [arr1]
    if any(pd.isnull(arr1)):
        return nan
    if not isinstance(arr2, list):
        arr2 = [arr2]
    if any(pd.isnull(arr2)):
        return nan
    # Create cosine measure object
    measure = sm.Cosine()
    # Call the function to compute the cosine measure.
    return measure.get_raw_score(arr1, arr2)
Exemplo n.º 2
0
 def cos_score(self, str_pair, sim_score=True):
     """
     Compute the cosine measure for a pair of token collections.

     The pair is first passed through ``self._check_input`` (with
     ``type_=list``), which yields the two operands to compare.

     :param str_pair: the pair of inputs handed to ``self._check_input``
     :param sim_score: when true, use ``get_sim_score``; otherwise use
         ``get_raw_score``
     :return: the score reported by ``sm.Cosine`` for the two operands
     """
     tokens_a, tokens_b = self._check_input(str_pair, type_=list)
     measure = sm.Cosine()
     if sim_score:
         return measure.get_sim_score(tokens_a, tokens_b)
     return measure.get_raw_score(tokens_a, tokens_b)
Exemplo n.º 3
0
def cosine(arr1, arr2):
    """
    Compute the cosine measure between two token lists.

    Args:
        arr1,arr2 (list): The token lists to compare. Any value that is not
         a list is wrapped in a single-element list first.

    Returns:
        The raw score from ``sm.Cosine().get_raw_score``, or NaN when either
        input is None or contains a missing token (as reported by
        ``pd.isnull``).
    """
    # ``pd.np`` was deprecated in pandas 1.0 and later removed, so NaN is
    # created without going through that alias.
    nan = float('nan')

    if arr1 is None or arr2 is None:
        return nan
    # Wrap a non-list value so the element-wise null check below can run.
    if not isinstance(arr1, list):
        arr1 = [arr1]
    if any(pd.isnull(arr1)):
        return nan
    if not isinstance(arr2, list):
        arr2 = [arr2]
    if any(pd.isnull(arr2)):
        return nan
    # Create cosine measure object
    measure = sm.Cosine()
    # Call the function to compute the cosine measure.
    return measure.get_raw_score(arr1, arr2)
Exemplo n.º 4
0
    def __init__(self):
        """Create the similarity measure objects and the tokenizer.

        ``similarity_function`` holds one default-constructed instance of
        each listed ``sm`` measure, in the order given below.
        ``alphanumeric_tokenizer`` is an ``sm.AlphanumericTokenizer`` built
        with ``return_set=True``.
        """
        measure_classes = (
            sm.BagDistance,
            sm.Cosine,
            sm.Dice,
            sm.Editex,
            sm.GeneralizedJaccard,
            sm.Jaccard,
            sm.Jaro,
            sm.JaroWinkler,
            sm.Levenshtein,
            sm.OverlapCoefficient,
            sm.TverskyIndex,
        )
        # Instantiate every measure with its default arguments.
        self.similarity_function = [make() for make in measure_classes]

        self.alphanumeric_tokenizer = sm.AlphanumericTokenizer(return_set=True)
Exemplo n.º 5
0
def cosine(arr1, arr2):
    """
    This function computes the cosine measure between two token lists.

    Args:
        arr1,arr2 (list): The input token lists for which the cosine
         measure should be computed. Any value that is not a list is
         wrapped in a single-element list before the measure is computed.

    Returns:
        The raw score returned by ``sm.Cosine().get_raw_score`` if neither
        input is None and neither contains a missing token (as reported by
        ``pd.isnull``), else returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.cosine(['data', 'science'], ['data'])
        0.7071067811865475
        >>> em.cosine(['data', 'science'], None)
        nan

    """
    # Build NaN directly: the ``pd.np`` alias was deprecated in pandas 1.0
    # and later removed, so ``pd.np.NaN`` fails on current pandas releases.
    nan = float('nan')

    if arr1 is None or arr2 is None:
        return nan
    # Wrap a non-list value so the element-wise null check below can run.
    if not isinstance(arr1, list):
        arr1 = [arr1]
    if any(pd.isnull(arr1)):
        return nan
    if not isinstance(arr2, list):
        arr2 = [arr2]
    if any(pd.isnull(arr2)):
        return nan
    # Create cosine measure object
    measure = sm.Cosine()
    # Call the function to compute the cosine measure.
    return measure.get_raw_score(arr1, arr2)
Exemplo n.º 6
0
def _loadPickle(path):
    """Return the object pickled at ``path``, closing the file afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load trusted,
    # locally generated cache files here.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


SOInsampleData = _loadPickle(SOInsampleFile)
SOOutsampleData = _loadPickle(SOOutsampleFile)

nlmInsampleFile = 'NLMdata/dataCached/insample_abstracts_outfile'
nlmOutsampleFile = 'NLMdata/dataCached/outSample_abstracts_outfile'
nlmInsampleData = _loadPickle(nlmInsampleFile)
nlmOutsampleData = _loadPickle(nlmOutsampleFile)

# Instantiate FVComponent instances
# The two CosSim components share the same vectorizer settings (each gets its
# own TfidfVectorizer instance) and differ in their name and final flag.
csAbstract = FVC.CosSim('CSAbs',
                        TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True),
                        False)
csSentence = FVC.CosSim('CSSent',
                        TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True),
                        True)
# Cosine measure over whitespace-separated tokens returned as a set.
cosM = FVC.stringMatchExcerpts('CosMeasure', sm.Cosine(),
                               sm.WhitespaceTokenizer(return_set=True))
LVDist = FVC.stringMatchTitles('LVDist', sm.Levenshtein())

FVCList = [csAbstract, csSentence, cosM, LVDist]


def classifyAndPredict(insampleData, outsampleData, folderName, componentList):
    """Build in-sample and reduced out-of-sample feature vectors.

    Creates a ``myJoin.join`` object from the two data sets and the folder
    name, registers ``componentList`` on it, and then calls its
    ``buildInsampleFV`` and ``buildOutsampleFVReduced`` methods.

    Args:
        insampleData: indexable in-sample data; the length of element 0 is
            printed.
        outsampleData: indexable out-of-sample data; the length of element 1
            is printed.
        folderName: passed through to ``myJoin.join``.
        componentList: feature-vector components handed to
            ``setComponentList``.
    """
    # Parenthesised single-argument form prints identically under Python 2
    # and is also valid Python 3, unlike the bare ``print x`` statement.
    # NOTE(review): index 0 is used for the in-sample data but index 1 for
    # the out-of-sample data -- confirm this asymmetry is intended.
    print(len(insampleData[0]))
    print(len(outsampleData[1]))
    # Declare instance of a join object with input arguments
    easyJoin = myJoin.join(insampleData, outsampleData, folderName)
    easyJoin.setComponentList(componentList)
    # Build feature vector
    easyJoin.buildInsampleFV()
    # NOTE(review): the meaning of 0.01 is defined by
    # buildOutsampleFVReduced, which is not visible here.
    easyJoin.buildOutsampleFVReduced(0.01)
Exemplo n.º 7
0
import os

def ensure_dir(file_path):
    """Create the parent directory of ``file_path`` if it does not exist.

    Args:
        file_path: path to a file; only its directory component is used.

    Returns:
        None.

    Raises:
        OSError: if the directory cannot be created and still does not exist
            as a directory afterwards.
    """
    directory = os.path.dirname(file_path)
    # A bare file name has no directory component; os.makedirs('') would
    # raise, and there is nothing to create in that case.
    if not directory or os.path.exists(directory):
        return
    try:
        os.makedirs(directory)
    except OSError:
        # Another process may have created the directory between the
        # existence check and the makedirs call; only re-raise real failures.
        if not os.path.isdir(directory):
            raise


# Relative paths of the cached feature-vector files.
INSAMPLE_FV_OUTFILE = 'dataCached/insampleFV_outfile'
OUTSAMPLE_FV_OUTFILE = 'dataCached/outsampleFV_outfile'
OUTSAMPLE_FV_REDUCED_OUTFILE = 'dataCached/outsampleFVreduced_outfile'

# Feature-vector components. The two CosSim components use identical
# vectorizer settings and differ in their name and final flag.
csAbstract = FVC.CosSim(
    'CSAbs',
    TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True),
    False,
)
csSentence = FVC.CosSim(
    'CSSent',
    TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True),
    True,
)
jacq3 = FVC.stringMatchExcerpts(
    'FuzzJacc',
    sm.Jaccard(),
    sm.QgramTokenizer(qval=3, return_set=True),
)
cosM = FVC.stringMatchExcerpts(
    'CosMeasure',
    sm.Cosine(),
    sm.WhitespaceTokenizer(return_set=True),
)
# NOTE(review): unlike jacq3, no qval is passed here, so the tokenizer's
# default q-gram size applies -- confirm that matches the "q3" in the name.
cosMq3 = FVC.stringMatchExcerpts(
    'FuzzCosMeasure',
    sm.Cosine(),
    sm.QgramTokenizer(return_set=True),
)
LVdist = FVC.stringMatchTitles('LVDist', sm.Levenshtein())

# Defaults. csAbstract and csSentence are not part of DEFAULTFV.
DEFAULTFV = [jacq3, cosM, cosMq3, LVdist]
DEFAULTMODEL = LR()
# NOTE(review): the spelling 'LogisiticRegression' is kept as-is because the
# string may be used at runtime elsewhere.
DEFAULTMODELNAME = 'LogisiticRegression'
DEFAULTITERATIONS = 25


class join:
    """Keeps the in-sample and out-of-sample data sets with their folder."""

    def __init__(self, insampleData, outsampleData, dataFolder):
        """Store the inputs and pull the labels out of the in-sample data.

        Both data sets are laid out as
        (pairs, labels, pairedAbstracts, pairedTitles).
        """
        self.insampleData, self.outsampleData = insampleData, outsampleData
        self.dataFolder = dataFolder
        # Index 1 of the in-sample data is stored as the labels.
        self.labels = insampleData[1]
Exemplo n.º 8
0
# In[30]:

# Row-wise similarity from `jac` over the 2-, 3- and 4-gram token columns.
# `jac` and `df` are defined earlier in the script (presumably a Jaccard
# measure and a pandas DataFrame -- confirm).
df['Q2'] = df.apply(
    lambda row: jac.get_sim_score(row['Q-gram_2_Tokens1'],
                                  row['Q-gram_2_Tokens2']),
    axis=1,
)
df['Q3'] = df.apply(
    lambda row: jac.get_sim_score(row['Q-gram_3_Tokens1'],
                                  row['Q-gram_3_Tokens2']),
    axis=1,
)
df['Q4'] = df.apply(
    lambda row: jac.get_sim_score(row['Q-gram_4_Tokens1'],
                                  row['Q-gram_4_Tokens2']),
    axis=1,
)
df.head()

# In[31]:

# Cosine similarity score over the 'aTokens' / 'bTokens' columns.
cos = sm.Cosine()
df['Cosine'] = df.apply(
    lambda row: cos.get_sim_score(row['aTokens'], row['bTokens']),
    axis=1,
)
df.head()

# In[32]:

# Dice similarity score over the same two token columns.
dice = sm.Dice()
df['Dice'] = df.apply(
    lambda row: dice.get_sim_score(row['aTokens'], row['bTokens']),
    axis=1,
)
df.head()

# In[33]:

oc = sm.OverlapCoefficient()
df['Overlap'] = df.apply(