예제 #1
0
    def __init__(self, definition):
        """Set up the field and attach a cosine-similarity comparator.

        Args:
            definition: Field definition mapping, forwarded unchanged to
                the parent initializer. An optional ``'corpus'`` entry
                supplies the documents handed to ``CosineTextSimilarity``.
                When the key is absent an empty list is stored under it,
                so the caller's mapping is modified in place.
        """
        super(TextType, self).__init__(definition)

        # setdefault both fills in the missing key on ``definition`` and
        # returns the stored list, so the comparator receives the very
        # same object that ends up in the definition.
        corpus = definition.setdefault('corpus', [])
        self.comparator = CosineTextSimilarity(corpus)
예제 #2
0
def dedupe_cosine(s1, s2):
    """Score each aligned pair of texts with a corpus-aware cosine comparator.

    The two inputs are paired positionally with ``zip`` (so the result is
    as long as the shorter input). Every text from both sides is collected
    into a single corpus, a ``CosineTextSimilarity`` comparator is built
    from that corpus, and the comparator is then applied to each pair.

    Args:
        s1: Iterable of texts forming the left side of each pair.
        s2: Iterable of texts forming the right side of each pair.

    Returns:
        A ``pandas.Series`` with a default positional index holding the
        comparator's result for each ``(s1[i], s2[i])`` pair.
    """
    pairs = list(zip(s1, s2))

    # Build the corpus straight from the pairs, interleaved as
    # left, right, left, right, ...  This avoids ``Series.iteritems()``,
    # which no longer exists in pandas >= 2.0.
    corpus_set = [text for pair in pairs for text in pair]

    # The comparator is constructed once so that every pair is scored
    # against the same corpus.
    cosine = CosineTextSimilarity(corpus_set)

    return pd.Series(pairs).apply(lambda pair: cosine(pair[0], pair[1]))
예제 #3
0
 def test_cosine_na(self):
     """Comparing a document with the empty string must yield NaN."""
     cosine = CosineTextSimilarity(self.ilist)
     cosine_sim = cosine(self.ilist[0], '')
     # Use the TestCase assertion rather than a bare ``assert`` so the
     # check still runs under ``python -O`` and matches the sibling tests.
     self.assertTrue(numpy.isnan(cosine_sim))
예제 #4
0
 def test_cosine_identical(self):
     """A document compared with itself scores 1 (to 5 decimal places)."""
     comparator = CosineTextSimilarity(self.ilist)
     document = self.ilist[0]
     self.assertAlmostEqual(comparator(document, document), 1, places=5)
예제 #5
0
 def test_cosine(self):
     """The first two fixture documents score 0.378 (to 3 decimal places)."""
     comparator = CosineTextSimilarity(self.ilist)
     first, second = self.ilist[0], self.ilist[1]
     similarity = comparator(first, second)
     self.assertAlmostEqual(similarity, 0.378, places=3)