Exemplos de MagicScraper em Python, exemplos de file_scraper.magic_scraper.magic_scraper.MagicScraper em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: magic_test.py Projeto: Matoking/file-scraper

def run_filetype_test(filename, result_dict, filetype, evaluate_scraper):
    """
    Scrape a file with a forced MIME type/version and evaluate the outcome.

    :filename: Name of the file, not containing the tests/data/mime_type/ part
    :result_dict: Result dict to be given to Correct
    :filetype: A dict containing the forced, expected and real file types under
               the following keys:
                * given_mimetype: the forced MIME type
                * given_version: the forced version
                * expected_mimetype: the expected resulting MIME type
                * expected_version: the expected resulting version
                * correct_mimetype: the real MIME type of the file
    :evaluate_scraper: Callable invoked as evaluate_scraper(scraper, correct)
                       after the file has been scraped
    """
    expected = force_correct_filetype(filename, result_dict, filetype)
    if expected.mimetype == "application/xhtml+xml":
        expected.streams[0]["stream_type"] = "text"

    forced_mimetype = filetype["given_mimetype"]
    # With no forced MIME type, the guess falls back to the real MIME type.
    guessed_mimetype = forced_mimetype or filetype["correct_mimetype"]

    scraper = MagicScraper(
        expected.filename,
        True,
        {
            "mimetype": forced_mimetype,
            "version": filetype["given_version"],
            "mimetype_guess": guessed_mimetype,
        },
    )
    scraper.scrape_file()

    evaluate_scraper(scraper, expected)

Exemplo n.º 2

0

Exibir arquivo

Arquivo: magic_test.py Projeto: Matoking/file-scraper

def test_mixed_filetype(filename, result_dict, filetype, evaluate_scraper):
    """
    Test scraping files as a wrong but supported file type.

    Some metadata models cover many file types. OfficeMagicMeta, for
    instance, supports text, spreadsheet and presentation files in both MS
    and open formats, among others. As a side effect, an ods file can be
    scraped as a doc (or xls) file simply by forcing the file type the
    scraper uses: no errors are produced and the file is reported as
    well-formed.

    This is currently harmless as long as the user is aware of it, because no
    metadata scraping results are affected. The test is meant to catch
    problematic metadata functions if any are introduced later.

    :filename: Name of the file to scrape, passed on to parse_results
    :result_dict: Result dict passed on to parse_results
    :filetype: A dict with the keys given_mimetype, given_version,
               expected_mimetype, expected_version and correct_mimetype
    :evaluate_scraper: Callable invoked as evaluate_scraper(scraper, correct)
                       after the file has been scraped
    """
    expected = parse_results(
        filename, filetype["correct_mimetype"], result_dict, True)
    # The expectations are built from the real MIME type and then switched
    # to the type/version the forced scrape should report.
    expected.update_mimetype(filetype["expected_mimetype"])
    expected.update_version(filetype["expected_version"])

    forced_mimetype = filetype["given_mimetype"]
    # With no forced MIME type, the guess falls back to the real MIME type.
    guessed_mimetype = forced_mimetype or filetype["correct_mimetype"]

    scraper = MagicScraper(
        expected.filename,
        True,
        {
            "mimetype": forced_mimetype,
            "version": filetype["given_version"],
            "mimetype_guess": guessed_mimetype,
        },
    )
    scraper.scrape_file()

    evaluate_scraper(scraper, expected)

Exemplo n.º 3

0

Exibir arquivo

Arquivo: magic_test.py Projeto: Matoking/file-scraper

def test_no_wellformed():
    """
    Test scraper without well-formed check.

    The scraper must not emit a "Skipping scraper" message, and its
    well_formed result must stay None.
    """
    jpeg_path = "tests/data/image_jpeg/valid_1.01.jpg"
    scraper_params = {"mimetype_guess": "image/jpeg"}

    scraper = MagicScraper(jpeg_path, False, scraper_params)
    scraper.scrape_file()

    skipped = partial_message_included("Skipping scraper", scraper.messages())
    assert not skipped
    assert scraper.well_formed is None

Exemplo n.º 4

0

Exibir arquivo

Arquivo: magic_test.py Projeto: Matoking/file-scraper

def test_is_supported_deny(mime, ver):
    """
    Test is_supported method.

    The given MIME type must be accepted with the given version, with no
    version (None), and with the third argument set to False. An unknown
    version ("foo") and an unknown MIME type ("foo") must both be rejected.

    :mime: MIME type expected to be supported
    :ver: Version expected to be supported for that MIME type
    """
    supported = MagicScraper.is_supported

    # Accepted combinations.
    assert supported(mime, ver, True)
    assert supported(mime, None, True)
    assert supported(mime, ver, False)

    # Rejected combinations.
    assert not supported(mime, "foo", True)
    assert not supported("foo", ver, True)

Exemplo n.º 5

0

Exibir arquivo

Arquivo: magic_test.py Projeto: Matoking/file-scraper

def test_scraper_valid(filename, mimetype, evaluate_scraper):
    """
    Test scraper with valid files.

    :filename: Name of the file to scrape, passed on to parse_results
    :mimetype: MIME type of the file, passed on to parse_results
    :evaluate_scraper: Callable invoked as evaluate_scraper(scraper, correct)
                       after the expectations have been adjusted
    """
    expected = parse_results(
        filename,
        mimetype,
        {
            "purpose": "Test valid file.",
            "stdout_part": "successfully",
            "stderr_part": "",
        },
        True,
    )
    scraper_params = expected.params
    scraper_params["mimetype_guess"] = expected.mimetype
    scraper = MagicScraper(expected.filename, True, scraper_params)
    scraper.scrape_file()

    # The adjustments below are applied in order; later ones may override
    # values set by earlier ones (notably the stream version).
    if expected.mimetype == "application/xhtml+xml":
        expected.streams[0]["stream_type"] = "text"

    if (OfficeFileMagicMeta.is_supported(expected.mimetype)
            or HtmlFileMagicMeta.is_supported(expected.mimetype)):
        expected.version = None
        expected.streams[0]["version"] = None

    if expected.mimetype in ("text/plain", "text/csv"):
        expected.streams[0]["charset"] = "UTF-8"
        expected.streams[0]["version"] = "(:unap)"

    if filename == "valid__iso8859.txt":
        expected.streams[0]["charset"] = "ISO-8859-15"

    version_unavailable = (
        mimetype == "text/html"
        or "vnd." in mimetype
        or "msword" in mimetype
    )
    if version_unavailable:
        expected.streams[0]["version"] = "(:unav)"

    if mimetype == "image/jp2":
        expected.streams[0]["version"] = ""

    evaluate_scraper(scraper, expected)

Exemplo n.º 6

0

Exibir arquivo

Arquivo: magic_test.py Projeto: Matoking/file-scraper

def test_no_mime_given():
    """
    Test that scraping without 'mimetype_guess' raises AttributeError.

    The scraper is given an empty parameter dict. scrape_file() must raise
    an AttributeError naming the missing key, and afterwards the scraper must
    report neither well-formedness nor any streams.
    """
    expected_message = ("not given a parameter dict containing key "
                        "'mimetype_guess'")

    scraper = MagicScraper("tests/data/text_plain/valid__utf8.txt", True, {})
    with pytest.raises(AttributeError) as error:
        scraper.scrape_file()

    raised = error.value
    # The message is checked through both text conversions.
    assert expected_message in six.text_type(raised)
    assert expected_message in str(raised)

    assert not scraper.well_formed
    assert not scraper.streams

Exemplo n.º 7

0

Exibir arquivo

Arquivo: magic_test.py Projeto: Matoking/file-scraper

def test_wrong_mime_with_xml(filepath):
    """
    Test giving wrong MIME type for text/xml or application/xhtml+xml file.

    Those scrapers need the MIME type information from outside, so a wrong
    guess should make the scraper record an error containing "does not
    match" and leave it without well-formedness or streams.

    :filepath: Path of the file to scrape
    """
    wrong_guess = {"mimetype_guess": "wrong/mime"}
    scraper = MagicScraper(filepath, True, wrong_guess)
    scraper.scrape_file()

    assert not scraper.well_formed
    assert not scraper.streams

    mismatch_reported = partial_message_included(
        "does not match", scraper.errors())
    assert mismatch_reported

Exemplo n.º 8

0

Exibir arquivo

Arquivo: magic_test.py Projeto: Matoking/file-scraper

def test_invalid_text(filename, mimetype):
    """
    Test TextFileMagic with invalid files.

    The scraper must report neither well-formedness nor streams, and its
    messages and errors must contain the expected stdout/stderr parts.

    :filename: Name of the file to scrape, passed on to parse_results
    :mimetype: MIME type of the file, passed on to parse_results
    """
    expected = parse_results(
        filename,
        mimetype,
        {
            "purpose": "Test invalid file.",
            "stdout_part": "",
            "stderr_part": "Unsupported MIME type",
        },
        True,
    )
    scraper_params = expected.params
    scraper_params["mimetype_guess"] = expected.mimetype
    scraper = MagicScraper(expected.filename, True, scraper_params)
    scraper.scrape_file()

    # NOTE(review): the expectation adjustments below are not read by the
    # assertions that follow, which only use stdout_part and stderr_part.
    adjusted_mimetype = (
        "inode/x-empty" if "empty" in filename else "application/octet-stream"
    )
    expected.streams[0]["mimetype"] = adjusted_mimetype
    expected.mimetype = adjusted_mimetype

    expected.version = None
    expected.streams[0]["version"] = None
    expected.streams[0]["charset"] = None

    assert not scraper.well_formed
    assert not scraper.streams
    assert partial_message_included(expected.stdout_part, scraper.messages())
    assert partial_message_included(expected.stderr_part, scraper.errors())

Exemplo n.º 9

0

Exibir arquivo

Arquivo: magic_test.py Projeto: Matoking/file-scraper

def test_invalid_markdown_pdf_arc(filename, mimetype, evaluate_scraper):
    """
    Test scrapers for invalid XML, XHTML, HTML, pdf and arc files.

    The expected result is marked as well-formed before evaluation, even
    though the input files are invalid.

    :filename: Name of the file to scrape, passed on to parse_results
    :mimetype: MIME type of the file, passed on to parse_results
    :evaluate_scraper: Callable invoked as evaluate_scraper(scraper, correct)
                       after the expectations have been adjusted
    """
    expected = parse_results(
        filename,
        mimetype,
        {
            "purpose": "Test invalid file.",
            "stdout_part": "successfully",
            "stderr_part": "",
        },
        True,
    )
    scraper_params = expected.params
    scraper_params["mimetype_guess"] = expected.mimetype
    scraper = MagicScraper(expected.filename, True, scraper_params)
    scraper.scrape_file()

    # Override the parsed expectation: these files are expected to pass.
    expected.well_formed = True

    if "empty" in filename:
        expected.streams[0]["mimetype"] = "inode/x-empty"

    expected_mimetype = expected.mimetype
    if expected_mimetype == "text/html":
        expected.streams[0]["version"] = "(:unav)"
    if expected_mimetype == "application/xhtml+xml":
        expected.streams[0]["stream_type"] = "text"

    evaluate_scraper(scraper, expected)