예제 #1
0
def test_content_id_text():
    """Check text Content-IDs for empty input and for the two sample texts."""
    # Empty text with default arguments: a 13 character code.
    full_code = iscc.content_id_text("")
    assert len(full_code) == 13
    assert full_code == "CT7A4zpmccuEv"

    # Empty text with partial=True: the expected code differs from the one
    # above in its second character only, and the distance between them is 0.
    partial_code = iscc.content_id_text("", partial=True)
    assert partial_code == "Ct7A4zpmccuEv"
    assert iscc.distance(partial_code, full_code) == 0

    # The two sample texts are expected to be exactly 2 apart.
    code_a = iscc.content_id_text(TEXT_A)
    code_b = iscc.content_id_text(TEXT_B)
    assert iscc.distance(code_a, code_b) == 2
예제 #2
0
def test_hamming_distance():
    """Check iscc.distance on raw integers and on Meta-IDs of edited titles."""
    # Two integers whose binary forms differ in exactly two bit positions.
    assert iscc.distance(0b0001111, 0b1000111) == 2

    extra = "von Michael Ende"
    reference = iscc.meta_id("Die Unendliche Geschichte", extra)[0]

    # (edited title, largest distance to the reference that is tolerated)
    near_variants = (
        ("Die UnXndliche Geschichte", 10),  # change one character
        ("Die nendliche Geschichte", 14),  # delete one character
        ("Die UnendlicheX Geschichte", 13),  # add one character
        ("Diex Unandlische Geschiche", 22),  # add, change, delete
        ("Unendliche Geschichte, Die", 13),  # change word order
    )
    for title, max_distance in near_variants:
        variant = iscc.meta_id(title, extra)[0]
        assert iscc.distance(reference, variant) <= max_distance

    # Totally different title (and no second argument): must be far away.
    unrelated = iscc.meta_id("Now for something different")[0]
    assert iscc.distance(reference, unrelated) >= 24
예제 #3
0
def sim(a, b):
    """Estimate Similarity of ISCC Codes A & B.

    Example:

        $ iscc sim CCUcKwdQc1jUM CCjMmrCsKWu1D

    You may also compare fully qualified ISCC Codes with each other.
    """
    # Validate both codes first; on failure print the validation message and
    # terminate with exit status 1.
    try:
        iscc_verify(a)
        iscc_verify(b)
    except ValueError as e:
        click.echo(str(e))
        sys.exit(1)

    # Fully Qualified ISCC Code Similarity: only computed when both cleaned
    # codes are 52 characters long. The message is emitted last, after the
    # per component results.
    avg_msg = None
    if len(iscc_clean(a)) == 52 and len(iscc_clean(b)) == 52:
        # Decode each component, drop its first byte and concatenate the rest.
        digest_a = b"".join(iscc.decode(code)[1:] for code in iscc_split(a))
        digest_b = b"".join(iscc.decode(code)[1:] for code in iscc_split(b))
        int_a = int.from_bytes(digest_a, "big", signed=False)
        int_b = int.from_bytes(digest_b, "big", signed=False)
        # Number of differing bits = set bits of the XOR of both digests.
        dist = bin(int_a ^ int_b).count("1")
        similarity = ((192 - dist) / 192) * 100
        avg_template = (
            "Average Estimated Similarity: {:.2f} % ({} of 192 bits different)"
        )
        avg_msg = avg_template.format(similarity, dist)

    # Per Component Similarity
    a = iscc_split(a)
    b = iscc_split(b)

    # Resolve the component type name of every code once (keyed by the first
    # two characters of the code) instead of on every loop iteration.
    types_a = [ISCC_COMPONENT_CODES.get(code[:2])["name"] for code in a]
    types_b = [ISCC_COMPONENT_CODES.get(code[:2])["name"] for code in b]

    # Two single components of different types cannot be compared; tell the
    # user (the loop below then finds no matching pair and prints nothing).
    if len(a) == 1 and len(b) == 1 and types_a[0] != types_b[0]:
        click.echo("Incompatible component types ({} & {}).".format(
            types_a[0], types_b[0]))

    for type_a, ca in zip(types_a, a):
        for type_b, cb in zip(types_b, b):
            # Only components of the same type are compared.
            if type_a != type_b:
                continue
            if type_a == "Instance-ID":
                # Instance-IDs are only reported when they are equal.
                if ca == cb:
                    click.echo("Identical Instance-ID")
                continue
            hamming_dist = iscc.distance(ca, cb)
            hamming_sim = 64 - hamming_dist
            # NOTE(review): round() yields a whole number here, so the two
            # decimals printed by "{:.2f}" are always ".00" - confirm whether
            # the fractional part was meant to be shown.
            similarity = round(hamming_sim / (2 * 64 - hamming_sim) * 100)
            click.echo(
                "Estimated Similarity of {}: {:.2f} % ({} of 64 bits match)"
                .format(type_a, similarity, hamming_sim))
    if avg_msg:
        click.echo(avg_msg)
예제 #4
0
def test_meta_id():
    """Check Meta-ID codes for str/bytes input, normalization and near titles."""
    # The same title given as str and as bytes must yield the same code.
    for raw_title in ("ISCC Content Identifiers", b"ISCC Content Identifiers"):
        code, _, _ = iscc.meta_id(raw_title)
        assert code == "CCDFPFc87MhdT"

    # meta_id returns the code plus the title and extra text it derived.
    reference, norm_title, norm_extra = iscc.meta_id("Die Unendliche Geschichte")
    assert reference == "CCAKevDpE1eEL"
    assert norm_title == "die unendliche geschichte"
    assert norm_extra == ""

    # Surrounding whitespace, accents and punctuation must not alter the code.
    noisy = iscc.meta_id(" Die unéndlíche,  Geschichte ")[0]
    assert reference == noisy

    # (variant title, exact expected distance to the reference code)
    expectations = (
        ("Die Unentliche Geschichte", 8),  # one character changed
        ("Geschichte, Die Unendliche", 9),  # words reordered
    )
    for variant_title, expected_distance in expectations:
        variant = iscc.meta_id(variant_title)[0]
        assert iscc.distance(reference, variant) == expected_distance

    # This bytes input is expected to raise UnicodeDecodeError.
    with pytest.raises(UnicodeDecodeError):
        iscc.meta_id(b"\xc3\x28")
예제 #5
0
def test_meta_id():
    """Check Meta-ID codes for str/bytes input, normalization and near titles."""
    # Text and bytes forms of one title are expected to give one code.
    expected_code = "CCDGhLx6tREif"
    text_code, _, _ = iscc.meta_id("ISCC Content Identifiers")
    assert text_code == expected_code
    bytes_code, _, _ = iscc.meta_id(b"ISCC Content Identifiers")
    assert bytes_code == expected_code

    # Besides the code, meta_id hands back the title and extra text it derived.
    base_code, out_title, out_extra = iscc.meta_id("Die Unendliche Geschichte")
    assert base_code == "CCAZF4K1bBv8i"
    assert out_title == "die unendliche geschichte"
    assert out_extra == ""

    # Padding, accents and a comma must leave the code unchanged.
    accented_code = iscc.meta_id(" Die unéndlíche,  Geschichte ")[0]
    assert base_code == accented_code

    # One changed character: distance of exactly 12.
    misspelled_code = iscc.meta_id("Die Unentliche Geschichte")[0]
    assert iscc.distance(base_code, misspelled_code) == 12

    # Reordered words: distance of exactly 7.
    reordered_code = iscc.meta_id("Geschichte, Die Unendliche")[0]
    assert iscc.distance(base_code, reordered_code) == 7

    # This bytes input is expected to raise UnicodeDecodeError.
    with pytest.raises(UnicodeDecodeError):
        iscc.meta_id(b"\xc3\x28")
예제 #6
0
def test_content_id_image():
    """Check image Content-IDs for path, stream and PIL image inputs.

    Covers the default and the partial code of "lenna.jpg" and the distance
    between the original image and blurred / brightened / contrast-enhanced
    copies of it.
    """
    # Input given as a file path.
    cid_i = iscc.content_id_image("lenna.jpg")
    assert len(cid_i) == 13
    assert cid_i == "CYmLoqBRgV32u"

    # Input given as an in-memory stream. The file is read inside a context
    # manager so its handle is closed right away instead of being leaked.
    with open("lenna.jpg", "rb") as image_file:
        data = BytesIO(image_file.read())
    cid_i = iscc.content_id_image(data, partial=True)
    assert len(cid_i) == 13
    assert cid_i == "CimLoqBRgV32u"

    # Input given as PIL images: the original plus three modified copies.
    img1 = Image.open("lenna.jpg")
    img2 = img1.filter(ImageFilter.GaussianBlur(10))
    img3 = ImageEnhance.Brightness(img1).enhance(1.4)
    img4 = ImageEnhance.Contrast(img1).enhance(1.2)

    cid1 = iscc.content_id_image(img1)
    cid2 = iscc.content_id_image(img2)
    cid3 = iscc.content_id_image(img3)
    cid4 = iscc.content_id_image(img4)

    # Blur and contrast changes leave the code untouched; the brightness
    # change moves it by a distance of 2.
    assert iscc.distance(cid1, cid2) == 0
    assert iscc.distance(cid1, cid3) == 2
    assert iscc.distance(cid1, cid4) == 0
예제 #7
0
def test_data_id():
    """Check the Data-ID of seeded pseudo-random data before and after edits."""
    # Fixed seed so the generated bytes and the inserted noise are repeatable.
    random.seed(1)
    payload = bytearray(random.getrandbits(8) for _ in range(1000000))  # 1 mb
    original_id = iscc.data_id(payload)
    assert original_id == "CDK2KdVAz5XTs"

    # Insert the bytes 1, 2, 3 at consecutive positions starting at 500000.
    for offset, value in enumerate((1, 2, 3)):
        payload.insert(500000 + offset, value)
    patched_id = iscc.data_id(payload)
    # NOTE(review): this compares the value with itself and can never fail;
    # presumably a comparison against the first Data-ID was intended - confirm.
    assert patched_id == patched_id

    # Insert 100 bytes of random noise; position is drawn before the value.
    for _ in range(100):
        position = random.randint(0, 1000000)
        noise = random.randint(0, 255)
        payload.insert(position, noise)
    noisy_id = iscc.data_id(payload)
    assert iscc.distance(original_id, noisy_id) == 17