def test_intersection(self):
    """Checks for intersection()."""
    # Overlapping keys: each shared key keeps the smaller multiplicity;
    # zero-count entries in the expected Multiset collapse away.
    left = Multiset({"a": 10, "b": 20})
    right = Multiset({"a": 1, "c": 2})
    expected = Multiset({"a": 1, "b": 0, "c": 0})
    self.assertEqual(left.intersection(right), expected)

    # Disjoint keys: the intersection is empty.
    left = Multiset({"a": 10, "b": 20})
    right = Multiset({"x": 1, "y": 2})
    expected = Multiset({"a": 0, "b": 0, "x": 0, "y": 0})
    self.assertEqual(left.intersection(right), expected)
def GCF(x, y):
    """Return the greatest common factor (GCD) of the integers *x* and *y*.

    Bug fix: the original multiplied the shared *distinct* prime factors
    (``primefactors()`` drops multiplicities), so e.g. ``GCF(4, 8)``
    returned 2 instead of 4.  ``math.gcd`` handles repeated prime factors
    correctly and is exact for all non-negative integers.
    """
    from math import gcd  # stdlib; correct for repeated factors and zero
    return gcd(x, y)
class NGramms(object):
    """A bag of n-gramms over a word sequence, backed by a Multiset.

    Construct either from ``words`` + ``n`` (the n-gramms are extracted
    here) or from an already-built ``ngramms`` Multiset.
    """

    def __init__(self, words=None, n=None, ngramms=None):
        # Wrap a pre-built Multiset of NGramm objects directly.
        if ngramms is not None:
            self.ngramms = ngramms
            # Bug fix: keep ``words`` defined on every instance so later
            # attribute access cannot raise AttributeError.
            self.words = None
            return
        # Bug fix: the original used range(len(words) - n), which skipped
        # the final n-gramm; a sequence of m words has m - n + 1 n-gramms.
        self.ngramms = Multiset(
            [NGramm(words[i:i + n]) for i in range(len(words) - n + 1)])
        self.words = words

    def __str__(self):
        # Python 2 convention: encode the unicode form as UTF-8 bytes.
        return unicode(self).encode('utf-8')

    def __unicode__(self):
        # Quote each n-gramm and join with commas.
        m = map(lambda s: u'"{}"'.format(unicode(s)), self.ngramms)
        return u', '.join(m)

    def __len__(self):
        # Delegates to the underlying Multiset's length semantics.
        return len(self.ngramms)

    def intersection(self, other):
        """Return a new NGramms holding the multiset intersection."""
        both = self.ngramms.intersection(other.ngramms)
        return NGramms(ngramms=both)

    def count(self):
        """Sum of the stored multiplicities over the iterated n-gramms."""
        return sum([self.ngramms[x] for x in self.ngramms])
def extract(self, question: str, doc: str) -> float:
    """Return the bigram overlap between *question* and *doc*.

    Both texts are tokenized with ``self.tokenizer``; the overlap is the
    size of the multiset intersection of their bigram bags.  When
    ``self.normalized`` is set, the count is divided by the question's
    token count.
    """
    tokenized_question = self.tokenizer.tokenize(question)
    tokenized_doc = self.tokenizer.tokenize(doc)
    question_bigrams = Multiset(nltk.bigrams(tokenized_question))
    doc_bigrams = Multiset(nltk.bigrams(tokenized_doc))
    overlap = sum(question_bigrams.intersection(doc_bigrams).values())
    if self.normalized:
        # Bug fix: guard against ZeroDivisionError when the question
        # tokenizes to nothing (empty / whitespace-only input).
        overlap = overlap / len(tokenized_question) if tokenized_question else 0.0
    # Always return a float, matching the declared return annotation.
    return float(overlap)
def jaccard_index(a: multiset.Multiset, b: multiset.Multiset) -> float:
    """Jaccard similarity |a ∩ b| / |a ∪ b|; NaN when both bags are empty."""
    union_size = len(a.union(b))
    if not union_size:
        # Both multisets empty: the index is undefined.
        return np.nan
    return len(a.intersection(b)) / union_size