Exemplo n.º 1
0
    def test_cli_extract_from_url(self):
        """bibclassify -k HEP.rdf http://arxiv.org/pdf/0808.1825

        Parses the command line above, runs the keyword engine with the
        resulting options and validates whatever was written to
        ``self.stdout`` with ``check_pdf0``.
        """
        cli_args = "-k HEP.rdf http://arxiv.org/pdf/0808.1825".split()
        options = bibclassify_cli._read_options(cli_args)

        sources = options["text_files"]
        taxonomy = options["taxonomy"]
        # Every remaining engine argument is forwarded under its option name.
        forwarded = ("rebuild_cache", "no_cache", "output_mode",
                     "output_limit", "spires", "match_mode",
                     "with_author_keywords", "extract_acronyms",
                     "only_core_tags")
        engine_kwargs = dict((key, options[key]) for key in forwarded)
        bibclassify_engine.output_keywords_for_sources(
            sources, taxonomy, **engine_kwargs)

        # Rewind both streams before reading them back.
        # NOTE(review): self.stdout / self.stderr are presumably capture
        # buffers installed by the fixture -- confirm against setUp.
        self.stdout.seek(0)
        captured_out = self.stdout.read()
        self.stderr.seek(0)
        captured_err = self.stderr.read()  # read back but not inspected

        passed, reason = check_pdf0(captured_out)
        if not passed:
            self.fail(reason)
    def test_cli_extract_from_url(self):
        """bibclassify -k ${taxonomy}.rdf {url/record/94}

        Feeds the URL returned by ``self.get_test_file(94)`` to the keyword
        engine and validates the captured output with ``check_pdf2``.
        """
        _path, record_url = self.get_test_file(94)

        command = "-k %s.rdf %s" % (self.taxonomy_name, record_url)
        options = bibclassify_cli._read_options(command.split())

        self.redirect()

        sources = options["text_files"]
        taxonomy = options["taxonomy"]
        # Every remaining engine argument is forwarded under its option name.
        forwarded = ("rebuild_cache", "no_cache", "output_mode",
                     "output_limit", "spires", "match_mode",
                     "with_author_keywords", "extract_acronyms",
                     "only_core_tags")
        engine_kwargs = dict((key, options[key]) for key in forwarded)
        bibclassify_engine.output_keywords_for_sources(
            sources, taxonomy, **engine_kwargs)

        # unredirect() hands back a pair; only the first item is checked.
        output, _errors = self.unredirect()

        passed, reason = check_pdf2(output)
        if not passed:
            self.fail(reason)
    def test_cli_extract_from_directory(self):
        """bibclassify -k ${taxonomy}.rdf directory/

        Runs the keyword engine on the directory that contains the file
        returned by ``self.get_test_file(94)`` and validates the captured
        output with ``check_pdf2``.  Returns early (without failing) when
        that directory does not exist.
        """
        file_path, _url = self.get_test_file(94)
        directory = os.path.dirname(file_path)

        if not os.path.exists(directory):
            sys.stderr.write("No PDF folder for testing found, returning\n")
            return

        command = "-k %s.rdf %s" % (self.taxonomy_name, directory)
        options = bibclassify_cli._read_options(command.split())

        self.redirect()

        sources = options["text_files"]
        taxonomy = options["taxonomy"]
        # Every remaining engine argument is forwarded under its option name.
        forwarded = ("rebuild_cache", "no_cache", "output_mode",
                     "output_limit", "spires", "match_mode",
                     "with_author_keywords", "extract_acronyms",
                     "only_core_tags")
        engine_kwargs = dict((key, options[key]) for key in forwarded)
        bibclassify_engine.output_keywords_for_sources(
            sources, taxonomy, **engine_kwargs)

        # unredirect() hands back a pair; only the first item is checked.
        output, _errors = self.unredirect()

        passed, reason = check_pdf2(output)
        if not passed:
            self.fail(reason)
    def test_cli_extract_from_url(self):
        """bibclassify -k ${taxonomy}.rdf {url/record/94}

        Builds the command line from ``self.taxonomy_name`` and the URL
        returned by ``self.get_test_file(94)``, runs the keyword engine and
        validates the captured output with ``check_pdf2``.
        """
        _unused_path, url = self.get_test_file(94)

        argv = ("-k %s.rdf %s" % (self.taxonomy_name, url)).split()
        options = bibclassify_cli._read_options(argv)

        self.redirect()

        text_files = options["text_files"]
        taxonomy = options["taxonomy"]
        # Keyword arguments share their names with the parsed option keys.
        option_names = ("rebuild_cache", "no_cache", "output_mode",
                        "output_limit", "spires", "match_mode",
                        "with_author_keywords", "extract_acronyms",
                        "only_core_tags")
        keyword_args = dict((name, options[name]) for name in option_names)
        bibclassify_engine.output_keywords_for_sources(
            text_files, taxonomy, **keyword_args)

        # Second item of the returned pair is not inspected by this test.
        stdout_text, _stderr_text = self.unredirect()

        ok, message = check_pdf2(stdout_text)
        if not ok:
            self.fail(message)
    def test_cli_extract_from_filepath(self):
        """bibclassify -k ${taxonomy}.rdf {cache}/article.pdf

        Runs the keyword engine on the local path returned by
        ``self.get_test_file(94)`` and validates the captured output with
        ``check_pdf2``.  Returns early (without failing) when the file is
        not present on disk.
        """
        pdf_path, _url = self.get_test_file(94)

        if not os.path.exists(pdf_path):
            sys.stderr.write("No PDF for testing found, please load demo records\n")
            return

        command = "-k %s.rdf %s" % (self.taxonomy_name, pdf_path)
        options = bibclassify_cli._read_options(command.split())

        self.redirect()

        sources = options["text_files"]
        taxonomy = options["taxonomy"]
        # Every remaining engine argument is forwarded under its option name.
        forwarded = ("rebuild_cache", "no_cache", "output_mode",
                     "output_limit", "spires", "match_mode",
                     "with_author_keywords", "extract_acronyms",
                     "only_core_tags")
        engine_kwargs = dict((key, options[key]) for key in forwarded)
        bibclassify_engine.output_keywords_for_sources(
            sources, taxonomy, **engine_kwargs)

        # unredirect() hands back a pair; only the first item is checked.
        output, _errors = self.unredirect()

        passed, reason = check_pdf2(output)
        if not passed:
            self.fail(reason)
Exemplo n.º 6
0
    def test_full_and_partial_matching_mode(self):
        """bibclassify - difference of extraction on part or full contents of pdf

        Runs the keyword engine twice on the same PDF -- once with the
        default options and once with ``-m partial`` -- and validates the
        partial-mode output with ``check_pdf1`` and the default-mode output
        with ``check_pdf2``.  Returns early (without failing) when the PDF
        fixture is not present on disk.
        """

        path = os.path.join(os.path.dirname(__file__), '../../../var/data/files/g0/90/9611103.pdf;1')
        if not os.path.exists(path):
            # Terminate the diagnostic with a newline so the test runner's
            # next line of output is not glued onto this message.
            sys.stderr.write("No PDF for testing found, returning\n")
            return

        # results[0] holds the default-mode output, results[1] the
        # partial-mode output; the checks at the end rely on this order.
        results = []
        for case in ["-k HEP.rdf %s" % path, "-k HEP.rdf %s -m partial" % path]:
            args = (case).split()
            options = bibclassify_cli._read_options(args)

            # Discard whatever a previous iteration left in the streams.
            self.stdout.truncate(0)
            self.stderr.truncate(0)

            bibclassify_engine.output_keywords_for_sources(options["text_files"],
                options["taxonomy"],
                rebuild_cache=options["rebuild_cache"],
                no_cache=options["no_cache"],
                output_mode=options["output_mode"],
                output_limit=options["output_limit"],
                spires=options["spires"],
                match_mode=options["match_mode"],
                with_author_keywords=options["with_author_keywords"],
                extract_acronyms=options["extract_acronyms"],
                only_core_tags=options["only_core_tags"])


            # Flush and rewind before reading the streams back.
            self.stdout.flush()
            self.stdout.seek(0)
            results.append(self.stdout.read())
            self.stderr.flush()
            self.stderr.seek(0)
            errors = self.stderr.read()  # read back but not inspected

        res, msg = check_pdf1(results[1])
        if not res:
            self.fail(msg)
        res, msg = check_pdf2(results[0])
        if not res:
            self.fail(msg)
    def test_full_and_partial_matching_mode(self):
        """bibclassify - difference of extraction on part or full contents of pdf

        Runs the keyword engine twice on the file returned by
        ``self.get_test_file(94)`` -- once with the default options and once
        with ``-m partial`` -- then validates the partial-mode output with
        ``check_pdf1`` and the default-mode output with ``check_pdf2``.
        Returns early (without failing) when the file is not on disk.
        """
        pdf_path, _url = self.get_test_file(94)

        if not os.path.exists(pdf_path):
            sys.stderr.write("No PDF for testing found, returning\n")
            return

        # Default-mode run first, partial-mode run second; the checks at the
        # end rely on this order.
        default_case = "-k %s.rdf %s" % (self.taxonomy_name, pdf_path)
        partial_case = "-k %s.rdf %s -m partial" % (self.taxonomy_name, pdf_path)

        # Every engine keyword argument is forwarded under its option name.
        forwarded = ("rebuild_cache", "no_cache", "output_mode",
                     "output_limit", "spires", "match_mode",
                     "with_author_keywords", "extract_acronyms",
                     "only_core_tags")

        outputs = []
        for command in (default_case, partial_case):
            options = bibclassify_cli._read_options(command.split())

            self.redirect()

            sources = options["text_files"]
            taxonomy = options["taxonomy"]
            engine_kwargs = dict((key, options[key]) for key in forwarded)
            bibclassify_engine.output_keywords_for_sources(
                sources, taxonomy, **engine_kwargs)

            # Only the first item of the returned pair is kept.
            captured, _errors = self.unredirect()
            outputs.append(captured)

        passed, reason = check_pdf1(outputs[1])
        if not passed:
            self.fail(reason)
        passed, reason = check_pdf2(outputs[0])
        if not passed:
            self.fail(reason)
    def test_full_and_partial_matching_mode(self):
        """bibclassify - difference of extraction on part or full contents of pdf

        Executes two engine runs against the file from
        ``self.get_test_file(94)``: the first with default options, the
        second with ``-m partial``.  ``check_pdf1`` judges the second run's
        output and ``check_pdf2`` the first run's.  When the file is missing
        the test writes a note to stderr and returns without failing.
        """
        local_file, _remote = self.get_test_file(94)

        if not os.path.exists(local_file):
            sys.stderr.write("No PDF for testing found, returning\n")
            return

        name = self.taxonomy_name
        command_lines = [
            "-k %s.rdf %s" % (name, local_file),
            "-k %s.rdf %s -m partial" % (self.taxonomy_name, local_file),
        ]

        # Keyword arguments share their names with the parsed option keys.
        option_names = ("rebuild_cache", "no_cache", "output_mode",
                        "output_limit", "spires", "match_mode",
                        "with_author_keywords", "extract_acronyms",
                        "only_core_tags")

        # Index 0: default-mode output, index 1: partial-mode output.
        collected = []
        for line in command_lines:
            options = bibclassify_cli._read_options(line.split())

            self.redirect()

            text_files = options["text_files"]
            taxonomy = options["taxonomy"]
            keyword_args = dict((key, options[key]) for key in option_names)
            bibclassify_engine.output_keywords_for_sources(
                text_files, taxonomy, **keyword_args)

            stdout_text, _stderr_text = self.unredirect()
            collected.append(stdout_text)

        ok, message = check_pdf1(collected[1])
        if not ok:
            self.fail(message)
        ok, message = check_pdf2(collected[0])
        if not ok:
            self.fail(message)
Exemplo n.º 9
0
    def test_cli_extract_from_directory(self):
        """bibclassify -k HEP.rdf directory/

        Runs the keyword engine on a fixture directory located relative to
        this file and validates what was written to ``self.stdout`` with
        ``check_pdf2``.  Returns early (without failing) when the directory
        does not exist.
        """


        path = os.path.abspath(os.path.dirname(__file__) + '/../../../var/data/files/g0/90')

        if not os.path.exists(path):
            # Parenthesised single-argument form: same output as the old
            # print statement on Python 2, and valid syntax on Python 3.
            print("No PDF folder for testing found, returning")
            return


        args = ("-k HEP.rdf %s" % path).split()
        options = bibclassify_cli._read_options(args)

        bibclassify_engine.output_keywords_for_sources(options["text_files"],
            options["taxonomy"],
            rebuild_cache=options["rebuild_cache"],
            no_cache=options["no_cache"],
            output_mode=options["output_mode"],
            output_limit=options["output_limit"],
            spires=options["spires"],
            match_mode=options["match_mode"],
            with_author_keywords=options["with_author_keywords"],
            extract_acronyms=options["extract_acronyms"],
            only_core_tags=options["only_core_tags"])


        # Flush and rewind before reading the streams back.
        self.stdout.flush()
        self.stdout.seek(0)
        results = self.stdout.read()
        self.stderr.flush()
        self.stderr.seek(0)
        errors = self.stderr.read()  # read back but not inspected

        res, msg = check_pdf2(results)
        if not res:
            self.fail(msg)