def compare_algs_tfidf_simhashtfidf(self):
    """Measure how often SimHashTfIdf agrees with DocSim on similarity order.

    For every triple ``(t1, t2, t3)`` returned by
    ``self.generate_random_triples()``, both scorers are asked whether
    ``similarity(t1, t2) < similarity(t1, t3)``. A triple counts as correct
    when the two scorers give the same answer. Triples for which DocSim
    reports both similarities as (numerically) zero are excluded from the
    statistic.

    Returns:
        float: ``correct / total`` over the triples that were kept, or
        ``0.0`` when no triple was kept (empty triple set, or every triple
        filtered out), instead of raising ``ZeroDivisionError``.
    """
    # Similarities with magnitude below this are treated as zero.
    epsilon = 0.000001

    test_set = self.generate_random_triples()
    ds = DocSim(self.document_set)
    sh = SimHashTfIdf(self.document_set)

    # Kept as floats so the final division is a true division even under
    # Python 2 integer semantics.
    total = float(len(test_set))
    correct = 0.
    for t1, t2, t3 in test_set:
        dsim1 = ds.similarity(t1, t2)
        dsim2 = ds.similarity(t1, t3)
        if abs(dsim1) < epsilon and abs(dsim2) < epsilon:
            # The reference scorer gives no ordering signal for this
            # triple: drop it from the denominator and skip the SimHash
            # computation altogether.
            total -= 1.
            continue
        ssim1 = sh.similarity(t1, t2)
        ssim2 = sh.similarity(t1, t3)
        if (dsim1 < dsim2) == (ssim1 < ssim2):
            correct += 1.

    if total <= 0:
        # Nothing left to compare; avoid dividing by zero.
        return 0.0
    return correct / total
def benchmark_memory_tfidf(self, iterations):
    """Run random DocSim similarity queries and report peak memory use.

    Builds a ``DocSim`` over ``self.document_set``, scores ``iterations``
    pairs of documents chosen with ``self.select_random_document()``, and
    then reads the process's resource usage.

    Args:
        iterations: Number of random document pairs to score.

    Returns:
        ``ru_maxrss`` of the current process divided by 1000.
        NOTE(review): the unit of ``ru_maxrss`` is platform dependent
        (kilobytes on Linux), so the scale of the result should be
        confirmed for the target platform.
    """
    scorer = DocSim(self.document_set)
    for _ in range(iterations):
        first = self.select_random_document()
        second = self.select_random_document()
        # Only the memory footprint matters; the score itself is discarded.
        scorer.similarity(first, second)
    usage = resource.getrusage(resource.RUSAGE_SELF)
    return usage.ru_maxrss / 1000
def benchmark_tfidf(self, iterations):
    """Time DocSim construction plus random pairwise similarity queries.

    The measured span starts just before ``DocSim(self.document_set)`` is
    built and ends after ``iterations`` similarity queries on documents
    chosen with ``self.select_random_document()``.

    Args:
        iterations: Number of random document pairs to score.

    Returns:
        float: Elapsed time in seconds as reported by the selected timer.
    """
    # time.clock() was removed in Python 3.8. time.process_time() reports
    # CPU time, which is what time.clock() measured on Unix; fall back to
    # time.clock only on interpreters that predate process_time.
    timer = getattr(time, "process_time", None)
    if timer is None:
        timer = time.clock

    t0 = timer()
    ds = DocSim(self.document_set)
    for _ in range(iterations):
        title1 = self.select_random_document()
        title2 = self.select_random_document()
        # The score is irrelevant here; only the time spent matters.
        ds.similarity(title1, title2)
    return timer() - t0