def test_content_id_text():
    """Check text Content-ID codes for empty input and for two sample texts."""
    # Empty text, default (non-partial) code: 13 characters with a fixed value.
    full_code = iscc.content_id_text("")
    assert len(full_code) == 13
    assert full_code == "CT7A4zpmccuEv"

    # Same input with partial=True: the expected code differs only in its
    # second character, and its distance to the non-partial code is zero.
    partial_code = iscc.content_id_text("", partial=True)
    assert partial_code == "Ct7A4zpmccuEv"
    assert iscc.distance(partial_code, full_code) == 0

    # TEXT_A and TEXT_B are defined outside this function; their codes are
    # expected to be at distance 2 from each other.
    code_a, code_b = [iscc.content_id_text(sample) for sample in (TEXT_A, TEXT_B)]
    assert iscc.distance(code_a, code_b) == 2
def test_hamming_distance():
    """Check iscc.distance on plain integers and on Meta-IDs of altered titles."""
    # Two integers whose binary representations differ in two bit positions.
    assert iscc.distance(0b0001111, 0b1000111) == 2

    creator = "von Michael Ende"
    reference = iscc.meta_id("Die Unendliche Geschichte", creator)[0]

    # (altered title, largest distance to the reference that is tolerated)
    near_variants = (
        ("Die UnXndliche Geschichte", 10),  # change one character
        ("Die nendliche Geschichte", 14),  # delete one character
        ("Die UnendlicheX Geschichte", 13),  # add one character
        ("Diex Unandlische Geschiche", 22),  # add, change, delete
        ("Unendliche Geschichte, Die", 13),  # change word order
    )
    for altered_title, max_distance in near_variants:
        variant = iscc.meta_id(altered_title, creator)[0]
        assert iscc.distance(reference, variant) <= max_distance

    # Totally different title (and no second argument): must be far away.
    unrelated = iscc.meta_id("Now for something different")[0]
    assert iscc.distance(reference, unrelated) >= 24
def sim(a, b):
    """Estimate Similarity of ISCC Codes A & B.

    Example:

        $ iscc sim CCUcKwdQc1jUM CCjMmrCsKWu1D

    You may also compare fully qualified ISCC Codes with each other.
    """
    # NOTE(review): the docstring is kept verbatim on purpose; if this function
    # is registered as a click command it presumably doubles as the help text.

    def component_type(component):
        # The first two characters of a component select its entry in
        # ISCC_COMPONENT_CODES; the entry's "name" is the component type.
        return ISCC_COMPONENT_CODES.get(component[:2])["name"]

    def comparable_digest(code):
        # Concatenate the decoded bytes (first byte dropped) of every component
        # except Instance-ID, which the per-component comparison below also
        # excludes from similarity estimation.
        return b"".join(
            iscc.decode(component)[1:]
            for component in iscc_split(code)
            if component_type(component) != "Instance-ID"
        )

    # Invalid input: print the validator's message and exit with status 1.
    try:
        iscc_verify(a)
        iscc_verify(b)
    except ValueError as e:
        click.echo(str(e))
        sys.exit(1)

    # Fully Qualified ISCC Code Similarity
    # A cleaned length of 52 is presumably four 13-character components. The
    # ratio below is taken over 192 bits (3 x 64), so the Instance-ID must not
    # contribute to the digest; previously every component was included, which
    # let the bit count exceed the 192-bit denominator.
    avg_msg = None
    if len(iscc_clean(a)) == 52 and len(iscc_clean(b)) == 52:
        int_a = int.from_bytes(comparable_digest(a), "big", signed=False)
        int_b = int.from_bytes(comparable_digest(b), "big", signed=False)
        dist = bin(int_a ^ int_b).count("1")  # number of differing bits
        similarity = ((192 - dist) / 192) * 100
        avg_msg = "Average Estimated Similarity: {:.2f} % ({} of 192 bits different)".format(
            similarity, dist
        )

    # Per Component Similarity
    components_a = iscc_split(a)
    components_b = iscc_split(b)

    # Two single components of different types cannot be compared; say so.
    # The loop below then prints nothing because the types never match.
    if len(components_a) == 1 and len(components_b) == 1:
        type_a = component_type(components_a[0])
        type_b = component_type(components_b[0])
        if type_a != type_b:
            click.echo(
                "Incompatible component types ({} & {}).".format(type_a, type_b)
            )

    for ca in components_a:
        type_a = component_type(ca)  # invariant for the whole inner loop
        for cb in components_b:
            if component_type(cb) != type_a:
                continue
            if type_a == "Instance-ID":
                # Instance-IDs are only checked for equality.
                if ca == cb:
                    click.echo("Identical Instance-ID")
                continue
            hamming_dist = iscc.distance(ca, cb)
            hamming_sim = 64 - hamming_dist
            # round() runs before formatting, so the value is always printed
            # as a whole number followed by ".00".
            similarity = round(hamming_sim / (2 * 64 - hamming_sim) * 100)
            click.echo(
                "Estimated Similarity of {}: {:.2f} % ({} of 64 bits match)".format(
                    type_a, similarity, hamming_sim
                )
            )

    # The aggregate line, when available, is printed last.
    if avg_msg:
        click.echo(avg_msg)
def test_meta_id():
    """Check Meta-ID generation: fixed codes, returned title/extra, distances."""
    # The same title given as str and as bytes is expected to produce the
    # same code; the 3-way unpacking also checks the shape of the result.
    for raw_title in ("ISCC Content Identifiers", b"ISCC Content Identifiers"):
        code, _, _ = iscc.meta_id(raw_title)
        assert code == "CCDFPFc87MhdT"

    reference, title, extra = iscc.meta_id("Die Unendliche Geschichte")
    assert reference == "CCAKevDpE1eEL"
    assert title == "die unendliche geschichte"
    assert extra == ""

    # A variant with padding, accented characters and punctuation is expected
    # to yield exactly the same code as the reference title.
    noisy = iscc.meta_id(" Die unéndlÃche, Geschichte ")[0]
    assert reference == noisy

    # One changed character.
    misspelled = iscc.meta_id("Die Unentliche Geschichte")[0]
    assert iscc.distance(reference, misspelled) == 8

    # Changed word order.
    reordered = iscc.meta_id("Geschichte, Die Unendliche")[0]
    assert iscc.distance(reference, reordered) == 9

    # Bytes input that is not valid UTF-8 must raise.
    with pytest.raises(UnicodeDecodeError):
        iscc.meta_id(b"\xc3\x28")
def test_meta_id():
    """Check Meta-ID codes, the returned title/extra values, and distances."""
    # str and bytes forms of the same title are expected to give one code.
    expected_code = 'CCDGhLx6tREif'
    text_code, _, _ = iscc.meta_id('ISCC Content Identifiers')
    assert text_code == expected_code
    bytes_code, _, _ = iscc.meta_id(b'ISCC Content Identifiers')
    assert bytes_code == expected_code

    # meta_id returns the code together with a title and an extra string.
    base, title, extra = iscc.meta_id('Die Unendliche Geschichte')
    assert (base, title, extra) == ("CCAZF4K1bBv8i", 'die unendliche geschichte', '')

    # Padding, accented characters and punctuation must not change the code.
    assert base == iscc.meta_id(' Die unéndlÃche, Geschichte ')[0]

    # (altered title, exact expected distance to the base code)
    expectations = (
        ('Die Unentliche Geschichte', 12),
        ('Geschichte, Die Unendliche', 7),
    )
    for altered_title, expected_distance in expectations:
        altered = iscc.meta_id(altered_title)[0]
        assert iscc.distance(base, altered) == expected_distance

    # Bytes input that is not valid UTF-8 must raise.
    with pytest.raises(UnicodeDecodeError):
        iscc.meta_id(b"\xc3\x28")
def test_content_id_image():
    """Check image Content-ID codes for path, stream and PIL image inputs.

    Reads ``lenna.jpg`` via a relative path, so the file must be reachable
    from the working directory the test runs in.
    """
    # Input given as a file path.
    cid_i = iscc.content_id_image("lenna.jpg")
    assert len(cid_i) == 13
    assert cid_i == "CYmLoqBRgV32u"

    # Input given as an in-memory stream. The context manager closes the file
    # handle deterministically; the original left it open until garbage
    # collection.
    with open("lenna.jpg", "rb") as infile:
        data = BytesIO(infile.read())
    cid_i = iscc.content_id_image(data, partial=True)
    assert len(cid_i) == 13
    # With partial=True the expected code differs only in its second character.
    assert cid_i == "CimLoqBRgV32u"

    # Input given as PIL images: the original plus blurred, brightened and
    # contrast-enhanced derivatives.
    img1 = Image.open("lenna.jpg")
    img2 = img1.filter(ImageFilter.GaussianBlur(10))
    img3 = ImageEnhance.Brightness(img1).enhance(1.4)
    img4 = ImageEnhance.Contrast(img1).enhance(1.2)

    cid1 = iscc.content_id_image(img1)
    cid2 = iscc.content_id_image(img2)
    cid3 = iscc.content_id_image(img3)
    cid4 = iscc.content_id_image(img4)

    # Blur and contrast leave the code unchanged; brightness moves it by 2.
    assert iscc.distance(cid1, cid2) == 0
    assert iscc.distance(cid1, cid3) == 2
    assert iscc.distance(cid1, cid4) == 0
def test_data_id():
    """Check the Data-ID of seeded pseudo-random data before and after edits."""
    # Fixed seed: the expected code and distance below depend on this exact
    # sequence of random values, so the order of random calls must not change.
    random.seed(1)
    data = bytearray([random.getrandbits(8) for _ in range(1000000)])  # 1 mb
    did_a = iscc.data_id(data)
    assert did_a == "CDK2KdVAz5XTs"

    # Small local edit: three bytes inserted in the middle of the data.
    data.insert(500000, 1)
    data.insert(500001, 2)
    data.insert(500002, 3)
    did_b = iscc.data_id(data)

    for _ in range(100):  # insert 100 bytes random noise
        data.insert(random.randint(0, 1000000), random.randint(0, 255))
    did_c = iscc.data_id(data)
    noise_distance = iscc.distance(did_a, did_c)
    assert noise_distance == 17

    # The original asserted `did_b == did_b`, which can never fail. Compare
    # against the first code instead: the 3-byte edit must not move the code
    # further than the 100 bytes of noise added on top of it.
    # NOTE(review): if exact equality with did_a was intended, tighten this to
    # `assert did_b == did_a` after confirming it holds.
    assert iscc.distance(did_a, did_b) <= noise_distance