예제 #1
0
def test_iscc_from_url_no_meta():
    """ISCC generated from a URL without explicit metadata.

    The result must be a dict whose "iscc" entry contains the expected
    three-component code fragment as well as the MetaID computed for "demo".
    """
    demo_url = "https://github.com/iscc/iscc-cli/raw/master/tests/image/demo.png"
    result = lib.iscc_from_url(demo_url)
    assert isinstance(result, dict)

    expected_fragment = "CYDfTq7Qc7Fre-CDij3vGU1BkCZ-CRNssh4Qc1x5B"
    assert expected_fragment in result["iscc"]

    # Only the first element (the MetaID code) of the returned triple is needed.
    expected_mid, _title, _extra = iscc.meta_id("demo")
    assert expected_mid in result["iscc"]
예제 #2
0
    def save(self, *args, **kwargs):
        """Refresh the ISCC stored in ``self.ident`` and save the instance.

        The MetaID is always recomputed from ``self.title`` / ``self.extra``
        and, if ``self.ident`` is already set, swapped in as its first
        component. When ``self.file`` wraps a fresh ``UploadedFile``, the
        complete code (MetaID-ContentID-DataID-InstanceID) is regenerated
        from the file content and ``self.tophash`` is updated as well.

        All positional and keyword arguments are forwarded to the parent
        ``save``.

        Raises:
            ValueError: if a fresh upload has an extension that is in neither
                ``TEXT_EXTENSIONS`` nor ``IMAGE_EXTENSIONS`` (no ContentID
                can be computed for it).
        """
        mid, _title, _extra = iscc.meta_id(self.title, self.extra)
        if self.ident:
            # Keep every component after the first, replace only the MetaID.
            self.ident = '-'.join([mid] + self.ident.split('-')[1:])

        if self.file and isinstance(self.file.file, UploadedFile):
            # Generate ISCC from the newly uploaded file.
            _, file_extension = splitext(self.file.name)
            ext = file_extension.lower().lstrip('.')
            data = self.file.open('rb').read()

            if ext in self.TEXT_EXTENSIONS:
                if ext == 'docx':
                    text = docx2txt.process(BytesIO(data))
                else:
                    text = self.file.open().read()
                cid = iscc.content_id_text(text)
            elif ext in self.IMAGE_EXTENSIONS:
                cid = iscc.content_id_image(BytesIO(data))
            else:
                # Previously this path fell through with ``cid`` unbound and
                # crashed with an UnboundLocalError while joining the code.
                raise ValueError(
                    "Cannot generate ISCC: unsupported file extension "
                    "'{}'".format(ext)
                )

            did = iscc.data_id(data)
            iid, self.tophash = iscc.instance_id(data)
            self.ident = '-'.join((mid, cid, did, iid))

        super(MediaContent, self).save(*args, **kwargs)
예제 #3
0
def test_hamming_distance():
    """Check ``iscc.distance`` on raw integers and on MetaID codes.

    Small edits to the title must keep the MetaID within a given distance
    of the reference code, while an unrelated title must be at least 24 away.
    """
    assert iscc.distance(0b0001111, 0b1000111) == 2

    extra = "von Michael Ende"
    reference = iscc.meta_id("Die Unendliche Geschichte", extra)[0]

    # (edited title, maximum allowed distance to the reference MetaID)
    near_variants = (
        ("Die UnXndliche Geschichte", 10),   # change one character
        ("Die nendliche Geschichte", 14),    # delete one character
        ("Die UnendlicheX Geschichte", 13),  # add one character
        ("Diex Unandlische Geschiche", 22),  # add, change, delete
        ("Unendliche Geschichte, Die", 13),  # change word order
    )
    for edited_title, max_distance in near_variants:
        variant = iscc.meta_id(edited_title, extra)[0]
        assert iscc.distance(reference, variant) <= max_distance

    # Totally different title (and no extra text)
    unrelated = iscc.meta_id("Now for something different")[0]
    assert iscc.distance(reference, unrelated) >= 24
예제 #4
0
파일: iscc.py 프로젝트: haoleism/gui-demo
 def meta_changed(self):
     """Recompute the MetaID from the current title/extra edit fields.

     Stores the three values returned by ``iscc.meta_id`` on the instance
     and, when a content id is already present, calls ``show_conflicts``.
     """
     meta_result = iscc.meta_id(
         title=self.edit_title.text(), extra=self.edit_extra.text()
     )
     self.meta_id, self.title_formatted, self.extra_formatted = meta_result
     if not self.content_id:
         return
     self.show_conflicts()
예제 #5
0
def site_iscc():
    """Compute the ISCC for the site content and print it.

    Prints the title/extra values returned by ``iscc.meta_id``, the
    hyphen-joined four-component code and the second value returned by
    ``iscc.instance_id``.
    """
    text_content = get_content('text')
    raw_content = get_content('data')

    meta_code, norm_title, norm_extra = iscc.meta_id("ISCC - Content Identifiers")
    text_code = iscc.content_id_text(text_content)
    data_code = iscc.data_id(raw_content)
    instance_code, tophash = iscc.instance_id(raw_content)
    full_code = '-'.join([meta_code, text_code, data_code, instance_code])

    print('SITE:')
    print('TITLE:', norm_title, norm_extra)
    print('ISCC:', full_code)
    print('IIDF:', tophash)
예제 #6
0
def spec_iscc():
    """Compute the ISCC for ``docs/specification.md`` and print it.

    The file is read twice: once as UTF-8 text (input for the text
    ContentID) and once as raw bytes (input for the DataID and InstanceID).
    Both handles are closed deterministically via ``with``.
    """
    title = "ISCC - Specification"
    with open('docs/specification.md', encoding='utf-8') as text_file:
        text = text_file.read()
    with open('docs/specification.md', 'rb') as data_file:
        data = data_file.read()

    # meta_id returns the code plus the title/extra values it worked with.
    mid, title, extra = iscc.meta_id(title)
    cidt = iscc.content_id_text(text)
    did = iscc.data_id(data)
    iid, hash_ = iscc.instance_id(data)
    code = '-'.join((mid, cidt, did, iid))
    print('SPEC:')
    print('TITLE:', title, extra)
    print('ISCC:', code)
    print('IIDF:', hash_)
예제 #7
0
def site_iscc():
    """Build the ISCC of the site content and print a short report.

    The report consists of the title/extra values handed back by
    ``iscc.meta_id``, the full hyphen-joined code, and the second value
    returned by ``iscc.instance_id``.
    """
    site_text = get_content("text")
    site_bytes = get_content("data")

    meta_code, shown_title, shown_extra = iscc.meta_id("ISCC - Content Identifiers")
    components = [
        meta_code,
        iscc.content_id_text(site_text),
        iscc.data_id(site_bytes),
    ]
    instance_code, tophash = iscc.instance_id(site_bytes)
    components.append(instance_code)
    full_code = "-".join(components)

    print("SITE:")
    print("TITLE:", shown_title, shown_extra)
    print("ISCC:", full_code)
    print("IIDF:", tophash)
예제 #8
0
def spec_iscc():
    """Compute the ISCC for the project's ``docs/specification.md`` and print it.

    The file is read twice: once as UTF-8 text (input for the text
    ContentID) and once as raw bytes (input for the DataID and InstanceID).
    Both handles are closed deterministically via ``with``.
    """
    title = "ISCC - Specification"
    spec_path = join(PROJECT_DIR, "docs/specification.md")
    with open(spec_path, encoding="utf-8") as text_file:
        text = text_file.read()
    with open(spec_path, "rb") as data_file:
        data = data_file.read()

    # meta_id returns the code plus the title/extra values it worked with.
    mid, title, extra = iscc.meta_id(title)
    cidt = iscc.content_id_text(text)
    did = iscc.data_id(data)
    iid, hash_ = iscc.instance_id(data)
    code = "-".join((mid, cidt, did, iid))
    print("SPEC:")
    print("TITLE:", title, extra)
    print("ISCC:", code)
    print("IIDF:", hash_)
예제 #9
0
파일: main.py 프로젝트: iscc/iscc-service
def meta_id(meta: Metadata):
    """Generate MetaID from 'title' and optional 'extra' metadata"""
    # A missing/empty extra is normalised to "" before it is handed to iscc.
    extra_text = meta.extra
    if not extra_text:
        extra_text = ""

    code, trimmed_title, trimmed_extra = iscc.meta_id(meta.title, extra_text)

    payload = dict(
        code=code,
        bits=code_to_bits(code),
        ident=code_to_int(code),
        title=meta.title,
        title_trimmed=trimmed_title,
    )

    # The extra fields are only reported when extra text was supplied.
    if extra_text:
        payload.update(extra=extra_text, extra_trimmed=trimmed_extra)

    return payload
예제 #10
0
def test_meta_id():
    """Pin MetaID codes, returned title/extra values and code distances."""
    # str and bytes input must produce the same code.
    for title_input in ("ISCC Content Identifiers", b"ISCC Content Identifiers"):
        code, _, _ = iscc.meta_id(title_input)
        assert code == "CCDFPFc87MhdT"

    reference, norm_title, norm_extra = iscc.meta_id("Die Unendliche Geschichte")
    assert reference == "CCAKevDpE1eEL"
    assert norm_title == "die unendliche geschichte"
    assert norm_extra == ""

    # Surrounding whitespace, accents, punctuation and case must not matter.
    noisy = iscc.meta_id(" Die unéndlíche,  Geschichte ")[0]
    assert reference == noisy

    misspelled = iscc.meta_id("Die Unentliche Geschichte")[0]
    assert iscc.distance(reference, misspelled) == 8

    reordered = iscc.meta_id("Geschichte, Die Unendliche")[0]
    assert iscc.distance(reference, reordered) == 9

    # Invalid UTF-8 bytes are rejected.
    with pytest.raises(UnicodeDecodeError):
        iscc.meta_id(b"\xc3\x28")
예제 #11
0
def test_meta_id():
    """Pin MetaID codes, returned title/extra values and code distances."""
    # str and bytes input must produce the same code.
    for title_input in ('ISCC Content Identifiers', b'ISCC Content Identifiers'):
        code, _, _ = iscc.meta_id(title_input)
        assert code == 'CCDGhLx6tREif'

    reference, norm_title, norm_extra = iscc.meta_id('Die Unendliche Geschichte')
    assert reference == "CCAZF4K1bBv8i"
    assert norm_title == 'die unendliche geschichte'
    assert norm_extra == ''

    # Surrounding whitespace, accents, punctuation and case must not matter.
    noisy = iscc.meta_id(' Die unéndlíche,  Geschichte ')[0]
    assert reference == noisy

    misspelled = iscc.meta_id('Die Unentliche Geschichte')[0]
    assert iscc.distance(reference, misspelled) == 12

    reordered = iscc.meta_id('Geschichte, Die Unendliche')[0]
    assert iscc.distance(reference, reordered) == 7

    # Invalid UTF-8 bytes are rejected.
    with pytest.raises(UnicodeDecodeError):
        iscc.meta_id(b"\xc3\x28")
예제 #12
0
파일: batch.py 프로젝트: iscc/iscc-cli
def batch(path, recursive, guess, debug):
    """Create ISCC Codes for all files in PATH.

    Example:

      $ iscc batch ~/Documents

    """
    # Parameters:
    #   path      -- passed to get_files() to enumerate candidate files
    #   recursive -- forwarded to get_files(recursive=...)
    #   guess     -- forwarded to get_title(guess=...)
    #   debug     -- when truthy, log output is additionally sent to stdout
    # Returns a list with one dict per processed file
    # (keys: iscc, norm_title, tophash, gmt, file_name). Files that are empty,
    # unsupported or fail to process are logged and skipped.
    if debug:
        log.add(sys.stdout)

    results = []
    for f in get_files(path, recursive=recursive):
        filesize = os.path.getsize(f)
        if not filesize:
            msg = "Cannot proccess empty file: {}".format(f)
            log.warning(msg)
            continue

        media_type = mime_clean(mime_guess(f))
        if media_type not in SUPPORTED_MIME_TYPES:
            fname = basename(f)
            msg = "Unsupported file {} with mime type: {},,,,".format(
                fname, media_type)
            log.warning(msg)
            continue

        if media_type == "application/x-mobipocket-ebook":
            try:
                tempdir, epub_filepath = mobi.extract(f)
                try:
                    tika_result = parser.from_file(epub_filepath)
                finally:
                    # Remove the extraction directory even if parsing fails.
                    shutil.rmtree(tempdir)
            except Exception as e:
                # Include the actual error; the message used to be logged
                # with an unfilled "%s" placeholder.
                msg = "Error with mobi extraction {}".format(e)
                log.error(msg)
                continue
        else:
            tika_result = parser.from_file(f)

        title = get_title(tika_result, guess=guess, uri=f)

        mid, norm_title, _ = iscc.meta_id(title)
        gmt = mime_to_gmt(media_type, file_path=f)
        if gmt == GMT.IMAGE:
            try:
                cid = iscc.content_id_image(f)
            except Exception as e:
                msg = "Clould not proccess image: {} ({})".format(f, e)
                log.error(msg)
                continue

        elif gmt == GMT.TEXT:
            text = tika_result["content"]
            if not text:
                msg = "Could not extract text from {}".format(basename(f))
                log.warning(msg)
                continue
            cid = iscc.content_id_text(text)
        elif gmt == GMT.AUDIO:
            if not fpcalc.is_installed():
                fpcalc.install()
            features = audio_id.get_chroma_vector(f)
            cid = audio_id.content_id_audio(features)
        elif gmt == GMT.VIDEO:
            features = video_id.get_frame_vectors(abspath(f))
            cid = video_id.content_id_video(features)
        else:
            log.error("Could not generate ISCC")
            continue

        did = iscc.data_id(f)
        iid, tophash = iscc.instance_id(f)

        # Console output uses a comma-separated code, the returned result a
        # hyphen-separated one.
        iscc_code_cs = ",".join((mid, cid, did, iid))

        click.echo("{iscc_code},{tophash},{fname},{gmt},{title}".format(
            iscc_code=iscc_code_cs,
            tophash=tophash,
            fname=basename(f),
            gmt=gmt,
            title=norm_title,
        ))
        iscc_code = "-".join((mid, cid, did, iid))
        results.append(
            dict(
                iscc=iscc_code,
                norm_title=norm_title,
                tophash=tophash,
                gmt=gmt,
                file_name=basename(f),
            ))

    return results
예제 #13
0
def gen(file, guess, title, extra, verbose):
    """Generate ISCC Code for FILE."""
    # Parameters:
    #   file    -- object with a ``name`` attribute holding the file path
    #   guess   -- forwarded to get_title(guess=...) when no title is given
    #   title   -- explicit title; derived via get_title() when falsy
    #   extra   -- optional extra metadata; a falsy value is treated as ""
    #   verbose -- when truthy, title/tophash/path/GMT are echoed as well
    # Returns a dict (keys: iscc, norm_title, tophash, gmt), or None when no
    # ISCC could be generated.
    # Raises click.BadParameter for an empty file.
    filesize = os.path.getsize(file.name)
    if not filesize:
        raise click.BadParameter("Cannot proccess empty file: {}".format(
            file.name))

    media_type = mime_clean(mime_guess(file.name))
    if media_type not in SUPPORTED_MIME_TYPES:
        # NOTE(review): processing continues after this message; confirm
        # whether an early return is intended here.
        click.echo("Unsupported media type {}.".format(media_type))
        click.echo(
            "Please request support at https://github.com/iscc/iscc-cli/issues"
        )

    if media_type == "application/x-mobipocket-ebook":
        tempdir, epub_filepath = mobi.extract(file.name)
        try:
            tika_result = parser.from_file(epub_filepath)
        finally:
            # Remove the extraction directory even if parsing fails.
            shutil.rmtree(tempdir)
    else:
        tika_result = parser.from_file(file.name)

    if not title:
        title = get_title(tika_result, guess=guess, uri=file.name)

    if not extra:
        extra = ""

    mid, norm_title, _ = iscc.meta_id(title, extra)
    gmt = mime_to_gmt(media_type, file_path=file.name)
    if gmt == GMT.IMAGE:
        cid = iscc.content_id_image(file.name)
    elif gmt == GMT.TEXT:
        text = tika_result["content"]
        if not text:
            click.echo("Could not extract text from {}".format(file.name))
            return
        cid = iscc.content_id_text(text)
    elif gmt == GMT.AUDIO:
        if not fpcalc.is_installed():
            fpcalc.install()
        features = audio_id.get_chroma_vector(file.name)
        cid = audio_id.content_id_audio(features)
    elif gmt == GMT.VIDEO:
        features = video_id.get_frame_vectors(abspath(file.name))
        cid = video_id.content_id_video(features)
    else:
        click.echo("Could not generate ISCC")
        return

    did = iscc.data_id(file.name)
    iid, tophash = iscc.instance_id(file.name)

    # The MetaID is left out of the code when no normalised title is available.
    if not norm_title:
        iscc_code = "-".join((cid, did, iid))
    else:
        iscc_code = "-".join((mid, cid, did, iid))

    click.echo("ISCC:{}".format(iscc_code))

    if verbose:
        if norm_title:
            click.echo("Norm Title: %s" % norm_title)
        click.echo("Tophash:    %s" % tophash)
        click.echo("Filepath:   %s" % file.name)
        click.echo("GMT:        %s" % gmt)

    return dict(iscc=iscc_code,
                norm_title=norm_title,
                tophash=tophash,
                gmt=gmt)
예제 #14
0
파일: web.py 프로젝트: pombredanne/iscc-cli
def web(url, guess, title, extra, verbose):
    """Generate ISCC Code from URL."""
    # Parameters:
    #   url     -- resource to download and identify
    #   guess   -- forwarded to get_title(guess=...) when no title is given
    #   title   -- explicit title; derived via get_title() when falsy
    #   extra   -- optional extra metadata; a falsy value is treated as ""
    #   verbose -- when truthy, title/tophash/url/GMT are echoed as well
    # Returns a dict (keys: iscc, norm_title, tophash, gmt), or None when the
    # media type is unsupported or no ISCC could be generated.
    # Raises click.BadArgumentUsage when the download request fails.

    extra = extra or ""

    try:
        resp = requests.get(url, headers=HEADERS, stream=True)
    except Exception as e:
        # click expects a message string, so pass the error text explicitly.
        raise click.BadArgumentUsage(str(e)) from e

    data = BytesIO(resp.content)
    media_type = clean_mime(detector.from_buffer(data))
    if media_type not in SUPPORTED_MIME_TYPES:
        click.echo("Unsupported media type {}".format(media_type))
        click.echo(
            "Please request support at https://github.com/iscc/iscc-cli/issues"
        )
        return

    data.seek(0)
    if media_type == "application/x-mobipocket-ebook":
        tempdir, filepath = mobi.extract(data)
        try:
            tika_result = parser.from_file(filepath)
        finally:
            # Remove the extraction directory even if parsing fails.
            shutil.rmtree(tempdir)
    else:
        tika_result = parser.from_buffer(data)

    if not title:
        title = get_title(tika_result, guess=guess, uri=url)

    mid, norm_title, _ = iscc.meta_id(title, extra)
    gmt = mime_to_gmt(media_type)
    if gmt == GMT.IMAGE:
        data.seek(0)
        cid = iscc.content_id_image(data)
    elif gmt == GMT.TEXT:
        text = tika_result["content"]
        if not text:
            click.echo("Could not extract text")
            return
        cid = iscc.content_id_text(text)
    elif gmt == GMT.AUDIO:
        if not fpcalc.is_installed():
            fpcalc.install()
        data.seek(0)
        features = audio_id.get_chroma_vector(data)
        cid = audio_id.content_id_audio(features)
    elif gmt == GMT.VIDEO:
        local_path = download_file(url, sanitize=True)
        try:
            features = video_id.get_frame_vectors(local_path)
            cid = video_id.content_id_video(features)
        finally:
            # Do not leave the downloaded file behind if extraction fails.
            os.remove(local_path)
    else:
        # Without this branch ``cid`` stayed unbound and the join below
        # crashed with an UnboundLocalError.
        click.echo("Could not generate ISCC")
        return

    data.seek(0)
    did = iscc.data_id(data)
    data.seek(0)
    iid, tophash = iscc.instance_id(data)

    # The MetaID is left out of the code when no normalised title is available.
    if not norm_title:
        iscc_code = "-".join((cid, did, iid))
    else:
        iscc_code = "-".join((mid, cid, did, iid))

    click.echo("ISCC:{}".format(iscc_code))

    if verbose:
        if norm_title:
            click.echo("Norm Title: %s" % norm_title)
        click.echo("Tophash:    %s" % tophash)
        click.echo("Filepath:   %s" % url)
        click.echo("GMT:        %s" % gmt)

    return dict(iscc=iscc_code,
                norm_title=norm_title,
                tophash=tophash,
                gmt=gmt)
예제 #15
0
파일: main.py 프로젝트: iscc/iscc-service
def from_file(file: UploadFile = File(...),
              title: str = Form(""),
              extra: str = Form("")):
    """Generate Full ISCC Code from Media File with optional explicit metadata."""
    # Returns a dict with iscc, tophash, gmt and bits; title/title_trimmed and
    # extra/extra_trimmed are added when the respective normalised value is
    # non-empty.
    # Raises HTTPException 415 for unsupported media types and 422 when no
    # text can be extracted or no ContentID can be generated.

    media_type = detector.from_buffer(file.file)
    if media_type not in SUPPORTED_MIME_TYPES:
        raise HTTPException(
            HTTP_415_UNSUPPORTED_MEDIA_TYPE,
            "Unsupported media type '{}'. Please request support at "
            "https://github.com/iscc/iscc-service/issues.".format(media_type),
        )

    file.file.seek(0)
    if media_type == "application/x-mobipocket-ebook":
        tempdir, filepath = mobi.extract(file.file)
        try:
            tika_result = parser.from_file(filepath)
        finally:
            # Remove the extraction directory even if parsing fails.
            shutil.rmtree(tempdir)
    else:
        tika_result = parser.from_buffer(file.file)

    if not title:
        title = get_title(tika_result, guess=True)

    mid, norm_title, norm_extra = iscc.meta_id(title, extra)
    gmt = mime_to_gmt(media_type)
    if gmt == GMT.IMAGE:
        file.file.seek(0)
        cid = iscc.content_id_image(file.file)
    elif gmt == GMT.TEXT:
        text = tika_result["content"]
        if not text:
            raise HTTPException(HTTP_422_UNPROCESSABLE_ENTITY,
                                "Could not extract text")
        cid = iscc.content_id_text(text)
    elif gmt == GMT.AUDIO:
        file.file.seek(0)
        features = audio_id.get_chroma_vector(file.file)
        cid = audio_id.content_id_audio(features)
    elif gmt == GMT.VIDEO:
        file.file.seek(0)
        # Frame extraction works on a path, so the upload is copied to a
        # uniquely named temporary file first.
        _, ext = splitext(file.filename)
        fn = "{}{}".format(uuid.uuid4(), ext)
        tmp_path = join(APP_DIR, fn)
        try:
            with open(tmp_path, "wb") as buffer:
                shutil.copyfileobj(file.file, buffer)
            features = video_id.get_frame_vectors(tmp_path)
            cid = video_id.content_id_video(features)
        finally:
            # Do not leave the temporary copy behind if processing fails.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
    else:
        # Without this branch ``cid`` stayed unbound and the request ended in
        # an unhandled UnboundLocalError.
        raise HTTPException(HTTP_422_UNPROCESSABLE_ENTITY,
                            "Could not generate ISCC")

    file.file.seek(0)
    did = iscc.data_id(file.file)
    file.file.seek(0)
    iid, tophash = iscc.instance_id(file.file)

    # The MetaID is left out of the code when no normalised title is available.
    if not norm_title:
        iscc_code = "-".join((cid, did, iid))
    else:
        iscc_code = "-".join((mid, cid, did, iid))

    components = iscc_split(iscc_code)

    result = dict(
        iscc=iscc_code,
        tophash=tophash,
        gmt=gmt,
        bits=[code_to_bits(c) for c in components],
    )
    if norm_title:
        result["title"] = title
        result["title_trimmed"] = norm_title
    if norm_extra:
        result["extra"] = extra
        result["extra_trimmed"] = norm_extra

    file.file.close()
    return result