def similarity_score(kmers1, kmers2, scoring="jaccard"):
    """Compute a similarity score between two k-mer sets.

    The number of k-mers the two sets have in common (as reported by
    ``kmerizer.count_common``) is divided by a denominator selected through
    `scoring`:

    * ``"jaccard"``   -- ``size1 + size2 - intersection`` (Jaccard index).
    * ``"minsize"``   -- ``min(size1, size2)``; a proper subset scores 1.0.
    * ``"meansize"``  -- mean of both sizes (the variant used in Mash).
    * ``"maxsize"``   -- ``max(size1, size2)``; a proper subset scores
      min/max.
    * ``"reference"`` -- ``size2``; useful for comparing reads to assembled
      references.

    :param kmers1: first k-mer set; must expose a ``size`` attribute and be
        accepted by ``kmerizer.count_common``.
    :param kmers2: second k-mer set, same requirements. Treated as the
        reference when ``scoring="reference"``.
    :param scoring: name of the scoring method, one of the values above.
    :return: the similarity score as a float.
    :raises ValueError: if `scoring` is not a known method.

    Note that there is no guard against a zero denominator (for example
    when both k-mer sets are empty).
    """
    valid_methods = ("jaccard", "minsize", "maxsize", "meansize", "reference")

    # Validate up front with a real exception: an ``assert`` is stripped
    # under ``python -O``, and checking before counting avoids doing the
    # k-mer comparison for a request that can never succeed.
    if scoring not in valid_methods:
        raise ValueError("unknown scoring method: {!r}".format(scoring))

    # Count of k-mers in common; converted to float so that every division
    # below is a true division.
    intersection = float(kmerizer.count_common(kmers1, kmers2))

    if scoring == "jaccard":
        denominator = kmers1.size + kmers2.size - intersection
    elif scoring == "minsize":
        denominator = min(kmers1.size, kmers2.size)
    elif scoring == "meansize":
        # Divide by 2.0 so an odd total size is never floored.
        denominator = (kmers1.size + kmers2.size) / 2.0
    elif scoring == "maxsize":
        denominator = max(kmers1.size, kmers2.size)
    else:
        # Only "reference" remains after the validation above.
        denominator = kmers2.size

    return intersection / denominator
def reference(kmers1, kmers2):
    """Score the overlap relative to the size of k-mer set 2.

    K-mer set 2 is assumed to be the k-mer set of a reference genome.

    :param kmers1: first k-mer set, passed to ``kmerizer.count_common``.
    :param kmers2: reference k-mer set; its ``size`` is the denominator.
    :return: number of common k-mers divided by ``kmers2.size``.
    """
    shared = kmerizer.count_common(kmers1, kmers2)
    reference_size = kmers2.size
    return shared / reference_size
def subset(kmers1, kmers2):
    """Return the fraction of k-mers in set 1 that also occur in set 2.

    Useful to check whether k-mer set 1 is a subset of another k-mer set.

    :param kmers1: k-mer set whose ``size`` is the denominator.
    :param kmers2: k-mer set that set 1 is compared against.
    :return: number of common k-mers divided by ``kmers1.size``.
    """
    shared = kmerizer.count_common(kmers1, kmers2)
    query_size = kmers1.size
    return shared / query_size
def maxsize(kmers1, kmers2):
    """Score the overlap relative to the larger of the two k-mer sets.

    :param kmers1: first k-mer set (must expose ``size``).
    :param kmers2: second k-mer set (must expose ``size``).
    :return: number of common k-mers divided by
        ``max(kmers1.size, kmers2.size)``.
    """
    shared = kmerizer.count_common(kmers1, kmers2)
    larger_size = max(kmers1.size, kmers2.size)
    return shared / larger_size
def meansize(kmers1, kmers2):
    """Score the overlap relative to the mean size of the two k-mer sets.

    :param kmers1: first k-mer set (must expose ``size``).
    :param kmers2: second k-mer set (must expose ``size``).
    :return: number of common k-mers divided by the mean of both sizes.
    """
    shared = kmerizer.count_common(kmers1, kmers2)
    total_size = kmers1.size + kmers2.size
    mean_size = total_size / 2
    return shared / mean_size
def jaccard(kmers1, kmers2):
    """Compute the Jaccard similarity of two k-mer sets.

    The result is a single ratio: the number of common k-mers divided by
    ``kmers1.size + kmers2.size - intersection``. The numerator and
    denominator are NOT returned separately.

    :param kmers1: first k-mer set (must expose ``size``).
    :param kmers2: second k-mer set (must expose ``size``).
    :return: the Jaccard similarity as one number.
    """
    intersection = kmerizer.count_common(kmers1, kmers2)
    # Sum of both sizes counts the shared k-mers twice; subtract them once.
    # Presumably this equals the union size, given duplicate-free sets.
    denominator = kmers1.size + kmers2.size - intersection
    return intersection / denominator