def fill_lower_triangle(results):
    """Mirror every entry of `results` across the diagonal

    The symmetric threshold search only produces the upper triangle.
    Call this afterwards to copy each entry to its transpose position
    so that `results` holds the full matrix.

    The work is delegated to `_chemfp.fill_lower_triangle`, which is
    given `results` and its length; nothing is returned.
    """
    num_rows = len(results)
    _chemfp.fill_lower_triangle(results, num_rows)
def threshold_tanimoto_search_symmetric(arena, threshold=0.7, include_lower_triangle=True, batch_size=100):
    """Find the hits in `arena` that are at least `threshold` similar to the arena's own fingerprints

    Only the upper-triangle similarities are computed directly. When
    `include_lower_triangle` is True those results are then copied to
    the lower triangle to give the full set of results. When it is
    False, only the upper triangle is returned.

    The search can run for a long time, and Python does not check for
    a ^C until the underlying call finishes, which can be irritating.
    To work around that, the rows are processed `batch_size` at a
    time, giving Python a chance to notice a ^C between batches.

    The hits in the returned `SearchResults` are in arbitrary order.

    Example::

      arena = chemfp.load_fingerprints("queries.fps")
      full_result = chemfp.search.threshold_tanimoto_search_symmetric(arena, threshold=0.2)
      upper_triangle = chemfp.search.threshold_tanimoto_search_symmetric(
                arena, threshold=0.2, include_lower_triangle=False)
      assert sum(map(len, full_result)) == sum(map(len, upper_triangle))*2

    :param arena: the set of fingerprints
    :type arena: a FingerprintArena
    :param threshold: The minimum score threshold.
    :type threshold: float between 0.0 and 1.0, inclusive
    :param include_lower_triangle:
        if False, compute only the upper triangle, otherwise use symmetry to compute the full matrix
    :type include_lower_triangle: boolean
    :param batch_size: the number of rows to process before checking for a ^C
    :type batch_size: integer
    :returns: a SearchResults instance
    :raises ValueError: if `batch_size` is zero or negative
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be positive")

    num_rows = len(arena)
    results = SearchResults(num_rows, arena.arena_ids)
    if not num_rows:
        # Empty arena: there is nothing to search and nothing to mirror.
        return results

    # One extension call per batch of rows. Returning to Python between
    # calls lets the interrupt handler see a ^C, which is otherwise
    # held back until the extension function finishes.
    for row_start in xrange(0, num_rows, batch_size):
        row_end = row_start + batch_size
        if row_end > num_rows:
            # The final batch may be short; clamp it to the arena size.
            row_end = num_rows
        _chemfp.threshold_tanimoto_arena_symmetric(
            threshold,
            arena.num_bits,
            arena.start_padding,
            arena.end_padding,
            arena.storage_size,
            arena.arena,
            row_start, row_end,
            0, num_rows,
            arena.popcount_indices,
            results)

    if include_lower_triangle:
        # Copy the upper-triangle hits to their transposed positions.
        _chemfp.fill_lower_triangle(results, num_rows)

    return results
def threshold_tanimoto_search_symmetric(arena, threshold, include_lower_triangle=True):
    """Search `arena` against itself for hits at or above `threshold`

    The whole arena is handed to the extension search in a single
    call. When `include_lower_triangle` is True the results are then
    passed to `_chemfp.fill_lower_triangle` so that they also cover
    the lower triangle.

    :param arena: the set of fingerprints; its `popcount_indices` must be truthy
    :param threshold: the minimum score threshold
    :param include_lower_triangle:
        if True, mirror the results into the lower triangle after the search
    :type include_lower_triangle: boolean
    :returns: a SearchResults instance
    """
    # The search below is given arena.popcount_indices, so insist on
    # having them before doing anything else.
    # NOTE(review): this is an `assert`, so it disappears under `python -O`.
    assert arena.popcount_indices

    num_rows = len(arena)
    results = SearchResults(num_rows, arena.ids)
    if not num_rows:
        # Empty arena: nothing to search, nothing to mirror.
        return results

    # Query rows [0, num_rows) against target rows [0, num_rows).
    _chemfp.threshold_tanimoto_arena_symmetric(
        threshold,
        arena.num_bits,
        arena.start_padding,
        arena.end_padding,
        arena.storage_size,
        arena.arena,
        0, num_rows,
        0, num_rows,
        arena.popcount_indices,
        results)

    if include_lower_triangle:
        _chemfp.fill_lower_triangle(results, num_rows)

    return results
def fill_lower_triangle(results):
    """Copy the upper-triangle entries of `results` to the lower triangle

    Thin wrapper around `_chemfp.fill_lower_triangle`, which is given
    `results` together with its length. Nothing is returned.
    """
    num_rows = len(results)
    _chemfp.fill_lower_triangle(results, num_rows)