def test_jaccard(self):
    """Check LeanMinHash.jaccard for untouched, one-sided and two-sided updates.

    Both sketches are built from MinHash objects that use the test's
    ``fake_hash_func`` and are converted to LeanMinHash before comparing.
    """
    # NOTE(review): the positional arguments (4, 1) are presumably
    # num_perm and seed -- confirm against the MinHash signature.
    m1 = MinHash(4, 1, hashfunc=fake_hash_func)
    m2 = MinHash(4, 1, hashfunc=fake_hash_func)
    lm1 = LeanMinHash(m1)
    lm2 = LeanMinHash(m2)
    # Two sketches that received no updates are expected to match exactly.
    self.assertEqual(lm1.jaccard(lm2), 1.0)

    # Only m2 is updated: the test expects no overlap at all.
    m2.update(12)
    lm2 = LeanMinHash(m2)
    self.assertEqual(lm1.jaccard(lm2), 0.0)

    # Both updated with different values: similarity must stay below 1.0.
    m1.update(13)
    lm1 = LeanMinHash(m1)
    self.assertLess(lm1.jaccard(lm2), 1.0)
def test_jaccard(self):
    """Check LeanMinHash.jaccard for untouched, one-sided and two-sided updates.

    Both sketches are built from MinHash objects that use the test's
    ``FakeHash`` hash object and are converted to LeanMinHash before
    comparing.
    """
    # NOTE(review): the positional arguments (4, 1) are presumably
    # num_perm and seed -- confirm against the MinHash signature.
    m1 = MinHash(4, 1, hashobj=FakeHash)
    m2 = MinHash(4, 1, hashobj=FakeHash)
    lm1 = LeanMinHash(m1)
    lm2 = LeanMinHash(m2)
    # Two sketches that received no updates are expected to match exactly.
    self.assertEqual(lm1.jaccard(lm2), 1.0)

    # Only m2 is updated: the test expects no overlap at all.
    m2.update(12)
    lm2 = LeanMinHash(m2)
    self.assertEqual(lm1.jaccard(lm2), 0.0)

    # Both updated with different values: similarity must stay below 1.0.
    m1.update(13)
    lm1 = LeanMinHash(m1)
    self.assertLess(lm1.jaccard(lm2), 1.0)
def joinable_column_search():
    """Search for columns that are joinable with a given query column.

    Request arguments:
        id: UUID of the query column. When it is absent (or, presumably,
            cannot be parsed as a UUID) an empty JSON list is returned.
        limit: maximum number of results to return (int, default 50).
        original_host: may be given several times; the values are passed
            to ``_execute_get_column_sketches`` as ``original_hosts`` when
            the candidate sketches are fetched.

    The sketch of the query column is loaded from the database, candidate
    column ids are requested from the LSH server, and each candidate is
    scored with the MinHash Jaccard estimate and the value returned by
    ``_containment``. Candidates whose ``package_file_id`` equals the
    query's are skipped.

    Returns:
        A JSON list of at most ``limit`` column records ordered by
        decreasing containment. Each record has its ``seed`` and
        ``minhash`` fields removed and ``jaccard`` and ``containment``
        fields added.

    Aborts with 404 when no sketch exists for the query column and with
    500 when the request to the LSH server fails.
    """
    query_id = request.args.get('id', None, type=uuid.UUID)
    if query_id is None:
        return jsonify([])
    limit = request.args.get('limit', default=50, type=int)
    original_host_filter = tuple(request.args.getlist('original_host'))

    cnx = cnxpool.getconn()
    # A single try/finally guarantees the connection goes back to the pool
    # on every exit path, including abort() and unexpected exceptions.
    try:
        # Obtain the MinHash of the query.
        with cnx.cursor(cursor_factory=RealDictCursor) as cursor:
            _execute_get_column_sketches(cursor, (query_id,))
            query = cursor.fetchone()
        if query is None:
            # The query does not exist.
            abort(404)

        # Query the LSH Server.
        # NOTE(review): no timeout is passed, so a stalled LSH server blocks
        # this request while it holds a pooled connection.
        try:
            resp = requests.post(
                lshserver_endpoint+"/query",
                json={"seed": query["seed"], "minhash": query["minhash"]})
            resp.raise_for_status()
        except requests.exceptions.RequestException as err:
            # RequestException covers HTTPError as well as connection
            # failures and timeouts, so all of them are logged.
            app.logger.error(
                "Error in querying the LSH server: {}".format(err))
            abort(500)

        column_ids = [column_id for column_id in resp.json()
                      if column_id != str(query_id)]
        if not column_ids:
            # Return empty result.
            return jsonify([])

        # Create the final query results.
        results = []
        query_minhash = LeanMinHash(
            seed=query["seed"], hashvalues=query["minhash"])

        # Obtain the column sketches of the results.
        with cnx.cursor(cursor_factory=RealDictCursor) as cursor:
            _execute_get_column_sketches(
                cursor, tuple(column_ids),
                original_hosts=original_host_filter)
            for row_index, column in enumerate(cursor):
                # Skip columns from query table.
                if column["package_file_id"] == query["package_file_id"]:
                    continue
                # Compute the similarities for each column in the result.
                jaccard = query_minhash.jaccard(LeanMinHash(
                    seed=column["seed"], hashvalues=column["minhash"]))
                containment = _containment(
                    jaccard, column["distinct_count"],
                    query["distinct_count"])
                column.pop("seed")
                column.pop("minhash")
                column["jaccard"] = jaccard
                column["containment"] = containment
                # row_index breaks ties so that two rows with the same
                # containment and id never fall through to comparing the
                # dicts, which would raise TypeError.
                entry = (containment, column["id"], row_index, dict(column))
                # Keep only the `limit` best entries in a min-heap.
                if len(results) < limit:
                    heapq.heappush(results, entry)
                else:
                    heapq.heappushpop(results, entry)
    finally:
        # Done with SQL.
        cnxpool.putconn(cnx)

    results = [column for _, _, _, column in heapq.nlargest(limit, results)]
    return jsonify(results)
def compare_minhashes(m1: datasketch.LeanMinHash, m2: datasketch.LeanMinHash) -> float:
    """Return the result of ``m1.jaccard(m2)`` for the two sketches.

    Args:
        m1: the sketch whose ``jaccard`` method is invoked.
        m2: the sketch passed to it as the comparison target.

    Returns:
        Whatever ``m1.jaccard(m2)`` yields, annotated as a float.
    """
    similarity = m1.jaccard(m2)
    return similarity