예제 #1
0
 def test_jaccard(self):
     """Check LeanMinHash.jaccard on sketches built from MinHash objects.

     Two sketches with no updates must compare as 1.0; after only one
     side is updated the estimate must be 0.0; after both sides receive
     different values the estimate must stay below 1.0.
     """
     m1 = MinHash(4, 1, hashfunc=fake_hash_func)
     m2 = MinHash(4, 1, hashfunc=fake_hash_func)
     lm1 = LeanMinHash(m1)
     lm2 = LeanMinHash(m2)
     # assertEqual/assertLess report both operands when the check fails,
     # unlike assertTrue on a pre-evaluated comparison.
     self.assertEqual(lm1.jaccard(lm2), 1.0)
     m2.update(12)
     # Rebuild the lean sketch so that it reflects the update to m2.
     lm2 = LeanMinHash(m2)
     self.assertEqual(lm1.jaccard(lm2), 0.0)
     m1.update(13)
     lm1 = LeanMinHash(m1)
     self.assertLess(lm1.jaccard(lm2), 1.0)
예제 #2
0
 def test_jaccard(self):
     """Check LeanMinHash.jaccard on sketches built from MinHash objects.

     Two sketches with no updates must compare as 1.0; after only one
     side is updated the estimate must be 0.0; after both sides receive
     different values the estimate must stay below 1.0.
     """
     m1 = MinHash(4, 1, hashobj=FakeHash)
     m2 = MinHash(4, 1, hashobj=FakeHash)
     lm1 = LeanMinHash(m1)
     lm2 = LeanMinHash(m2)
     # assertEqual/assertLess report both operands when the check fails,
     # unlike assertTrue on a pre-evaluated comparison.
     self.assertEqual(lm1.jaccard(lm2), 1.0)
     m2.update(12)
     # Rebuild the lean sketch so that it reflects the update to m2.
     lm2 = LeanMinHash(m2)
     self.assertEqual(lm1.jaccard(lm2), 0.0)
     m1.update(13)
     lm1 = LeanMinHash(m1)
     self.assertLess(lm1.jaccard(lm2), 1.0)
예제 #3
0
def joinable_column_search():
    """Return the columns most joinable with a query column, as JSON.

    Request arguments (read from ``request.args``):
        id: identifier of the query column, parsed as a ``uuid.UUID``.
            When no usable id is supplied an empty JSON list is returned.
        limit: maximum number of results to return (default 50).
        original_host: may be repeated; all values are forwarded to
            ``_execute_get_column_sketches`` as ``original_hosts``.

    The query column's sketch is loaded from the database, candidate
    column ids are requested from the LSH server, and every candidate
    that does not share the query's ``package_file_id`` is scored with
    the MinHash Jaccard estimate and ``_containment``.

    Returns:
        A JSON list of column records ordered by descending
        ``(containment, id)``. Each record has ``seed`` and ``minhash``
        removed and ``jaccard`` and ``containment`` added.

    Aborts with 404 when the query column is not found, and with 500
    when the request to the LSH server fails.
    """
    query_id = request.args.get('id', None, type=uuid.UUID)
    if query_id is None:
        return jsonify([])
    limit = request.args.get('limit', default=50, type=int)
    original_host_filter = tuple(request.args.getlist('original_host'))
    cnx = cnxpool.getconn()
    # A single try/finally hands the connection back to the pool on every
    # exit path, including abort() and unexpected exceptions.
    try:
        # Obtain the MinHash of the query.
        with cnx.cursor(cursor_factory=RealDictCursor) as cursor:
            _execute_get_column_sketches(cursor, (query_id,))
            query = cursor.fetchone()
        if query is None:
            # The query does not exist.
            abort(404)
        # Query the LSH Server.
        try:
            resp = requests.post(lshserver_endpoint+"/query",
                    json={"seed": query["seed"], "minhash": query["minhash"]})
            resp.raise_for_status()
        except requests.exceptions.RequestException as err:
            # RequestException also covers transport failures (connection
            # errors, timeouts), not only HTTP error statuses.
            app.logger.error("Error in querying the LSH server: {}".format(err))
            abort(500)
        # Drop the query column itself from the candidates.
        column_ids = [column_id for column_id in resp.json()
                if column_id != str(query_id)]
        if not column_ids:
            # Return empty result.
            return jsonify([])
        # Create the final query results.
        results = []
        query_minhash = LeanMinHash(seed=query["seed"],
                hashvalues=query["minhash"])
        # Obtain the column sketches of the results.
        with cnx.cursor(cursor_factory=RealDictCursor) as cursor:
            _execute_get_column_sketches(cursor, tuple(column_ids),
                    original_hosts=original_host_filter)
            for column in cursor:
                # Skip columns from query table.
                if column["package_file_id"] == query["package_file_id"]:
                    continue
                # Compute the similarities for each column in the result.
                jaccard = query_minhash.jaccard(LeanMinHash(
                        seed=column["seed"], hashvalues=column["minhash"]))
                containment = _containment(jaccard, column["distinct_count"],
                        query["distinct_count"])
                column.pop("seed")
                column.pop("minhash")
                column["jaccard"] = jaccard
                column["containment"] = containment
                # Min-heap bounded to `limit` entries: once full, each push
                # evicts the current smallest (containment, id) entry.
                entry = (containment, column["id"], dict(column))
                if len(results) < limit:
                    heapq.heappush(results, entry)
                else:
                    heapq.heappushpop(results, entry)
    finally:
        # Done with SQL.
        cnxpool.putconn(cnx)
    results = [column for _, _, column in heapq.nlargest(limit, results)]
    return jsonify(results)
예제 #4
0
def compare_minhashes(m1: datasketch.LeanMinHash,
                      m2: datasketch.LeanMinHash) -> float:
    """Return the result of ``m1.jaccard(m2)`` for the two given sketches."""
    similarity = m1.jaccard(m2)
    return similarity