def test_cli_extract_from_url(self):
    """bibclassify -k HEP.rdf http://arxiv.org/pdf/0808.1825"""
    # Parse the same command line a user would type.
    cli_args = "-k HEP.rdf http://arxiv.org/pdf/0808.1825".split()
    options = bibclassify_cli._read_options(cli_args)

    # Each engine keyword argument is named after the option key it reads.
    forwarded_options = (
        "rebuild_cache", "no_cache", "output_mode", "output_limit",
        "spires", "match_mode", "with_author_keywords",
        "extract_acronyms", "only_core_tags",
    )
    bibclassify_engine.output_keywords_for_sources(
        options["text_files"],
        options["taxonomy"],
        **dict((name, options[name]) for name in forwarded_options))

    # Rewind both stream objects kept on the test case and read them back.
    self.stdout.seek(0)
    results = self.stdout.read()
    self.stderr.seek(0)
    errors = self.stderr.read()  # read back, but not inspected below

    # check_pdf0 yields a (passed, message) pair for the produced output.
    passed, message = check_pdf0(results)
    if not passed:
        self.fail(message)
def test_cli_extract_from_url(self):
    """bibclassify -k ${taxonomy}.rdf {url/record/94}"""
    # get_test_file returns a (path, url) pair; only the URL is used here.
    _path, record_url = self.get_test_file(94)
    command_line = "-k %s.rdf %s" % (self.taxonomy_name, record_url)
    options = bibclassify_cli._read_options(command_line.split())

    # Each engine keyword argument is named after the option key it reads.
    forwarded_options = (
        "rebuild_cache", "no_cache", "output_mode", "output_limit",
        "spires", "match_mode", "with_author_keywords",
        "extract_acronyms", "only_core_tags",
    )

    self.redirect()
    bibclassify_engine.output_keywords_for_sources(
        options["text_files"],
        options["taxonomy"],
        **dict((name, options[name]) for name in forwarded_options))
    # unredirect() hands back a pair; only the first element is checked.
    results, _errors = self.unredirect()

    passed, message = check_pdf2(results)
    if not passed:
        self.fail(message)
def test_cli_extract_from_directory(self):
    """bibclassify -k ${taxonomy}.rdf directory/"""
    # The directory under test is the one containing the file of record 94.
    file_path, _url = self.get_test_file(94)
    directory = os.path.dirname(file_path)
    if not os.path.exists(directory):
        # Nothing to classify: report it and leave without failing.
        sys.stderr.write("No PDF folder for testing found, returning\n")
        return

    command_line = "-k %s.rdf %s" % (self.taxonomy_name, directory)
    options = bibclassify_cli._read_options(command_line.split())

    # Each engine keyword argument is named after the option key it reads.
    forwarded_options = (
        "rebuild_cache", "no_cache", "output_mode", "output_limit",
        "spires", "match_mode", "with_author_keywords",
        "extract_acronyms", "only_core_tags",
    )

    self.redirect()
    bibclassify_engine.output_keywords_for_sources(
        options["text_files"],
        options["taxonomy"],
        **dict((name, options[name]) for name in forwarded_options))
    # unredirect() hands back a pair; only the first element is checked.
    results, _errors = self.unredirect()

    passed, message = check_pdf2(results)
    if not passed:
        self.fail(message)
def test_cli_extract_from_url(self):
    """bibclassify -k ${taxonomy}.rdf {url/record/94}"""
    # Only the URL half of the (path, url) pair matters for this test.
    source_url = self.get_test_file(94)[1]
    cli_args = ("-k %s.rdf %s" % (self.taxonomy_name, source_url)).split()
    options = bibclassify_cli._read_options(cli_args)

    self.redirect()
    # Collect the engine keyword arguments; each one mirrors an option key.
    engine_kwargs = {}
    for key in ("rebuild_cache", "no_cache", "output_mode", "output_limit",
                "spires", "match_mode", "with_author_keywords",
                "extract_acronyms", "only_core_tags"):
        engine_kwargs[key] = options[key]
    bibclassify_engine.output_keywords_for_sources(
        options["text_files"], options["taxonomy"], **engine_kwargs)
    captured = self.unredirect()
    results = captured[0]  # second element of the pair is not inspected

    # check_pdf2 yields a (passed, message) pair for the produced output.
    verdict, explanation = check_pdf2(results)
    if not verdict:
        self.fail(explanation)
def test_cli_extract_from_filepath(self):
    """bibclassify -k ${taxonomy}.rdf {cache}/article.pdf"""
    # Only the path half of the (path, url) pair matters for this test.
    pdf_path, _url = self.get_test_file(94)
    if not os.path.exists(pdf_path):
        # Without the file there is nothing to classify; leave quietly.
        sys.stderr.write("No PDF for testing found, please load demo records\n")
        return

    command_line = "-k %s.rdf %s" % (self.taxonomy_name, pdf_path)
    options = bibclassify_cli._read_options(command_line.split())

    # Each engine keyword argument is named after the option key it reads.
    forwarded_options = (
        "rebuild_cache", "no_cache", "output_mode", "output_limit",
        "spires", "match_mode", "with_author_keywords",
        "extract_acronyms", "only_core_tags",
    )

    self.redirect()
    bibclassify_engine.output_keywords_for_sources(
        options["text_files"],
        options["taxonomy"],
        **dict((name, options[name]) for name in forwarded_options))
    # unredirect() hands back a pair; only the first element is checked.
    results, _errors = self.unredirect()

    passed, message = check_pdf2(results)
    if not passed:
        self.fail(message)
def main():
    """Run bibclassify either as a daemon task or as a standalone extractor.

    The arguments come from ``sys.argv``: everything after the first element
    that contains ``'bibclassify'`` is kept, or everything after
    ``sys.argv[0]`` when no element matches.

    Daemon mode is selected when any of the following holds:

    * no arguments remain and ``bconfig.STANDALONE`` is false;
    * the only argument is made of digits (bibsched style invocation);
    * some argument starts with one of the daemon-only options
      ``-i``, ``--recid``, ``-c``, ``--collection`` or ``-f``.

    Otherwise the arguments are parsed with ``_read_options``, the taxonomy
    is optionally checked, and the keywords are output by the engine.
    """
    arguments = sys.argv
    for index, argument in enumerate(arguments):
        if 'bibclassify' in argument:
            arguments = arguments[index + 1:]
            break
    else:
        # No element named the program explicitly: drop only argv[0].
        arguments = arguments[1:]

    run_as_daemon = False

    # Check if running in standalone or daemon mode.
    if not arguments and not bconfig.STANDALONE:
        run_as_daemon = True
    elif len(arguments) == 1 and arguments[0].isdigit():
        # Running the task with its PID number (bibsched style).
        run_as_daemon = True

    specific_daemon_options = ('-i', '--recid', '-c', '--collection', '-f')
    for option in specific_daemon_options:
        for arg in arguments:
            if arg.startswith(option):
                run_as_daemon = True

    if run_as_daemon:
        # An imported module object is always truthy, so testing it after
        # the import can never reach the error branch. A missing daemon
        # module shows up as ImportError, which is what must be handled.
        try:
            import bibclassify_daemon as daemon
        except ImportError:
            log.error(
                "We are running in a standalone mode, can't start daemon")
        else:
            daemon.bibclassify_daemon()
    else:
        options = _read_options(arguments)

        if options['check_taxonomy']:
            reader.check_taxonomy(options['taxonomy'])

        engine.output_keywords_for_sources(
            options["text_files"],
            options["taxonomy"],
            rebuild_cache=options["rebuild_cache"],
            no_cache=options["no_cache"],
            output_mode=options["output_mode"],
            output_limit=options["output_limit"],
            spires=options["spires"],
            match_mode=options["match_mode"],
            with_author_keywords=options["with_author_keywords"],
            extract_acronyms=options["extract_acronyms"],
            only_core_tags=options["only_core_tags"])
def main():
    """Entry point: dispatch to the bibclassify daemon or run standalone.

    Arguments are read from ``sys.argv``. Everything up to and including the
    first element containing ``'bibclassify'`` is discarded; when no element
    matches, only ``sys.argv[0]`` is discarded.

    The daemon is started when no arguments remain and
    ``bconfig.STANDALONE`` is false, when the single argument is numeric
    (bibsched style), or when any argument starts with ``-i``, ``--recid``,
    ``-c``, ``--collection`` or ``-f``. In every other case the options are
    parsed by ``_read_options`` and handed to
    ``engine.output_keywords_for_sources``, after an optional taxonomy check.
    """
    arguments = sys.argv
    for index, argument in enumerate(arguments):
        if 'bibclassify' in argument:
            arguments = arguments[index+1:]
            break
    else:
        # The loop finished without a match: strip just the program name.
        arguments = arguments[1:]

    run_as_daemon = False

    # Check if running in standalone or daemon mode.
    if not arguments and not bconfig.STANDALONE:
        run_as_daemon = True
    elif len(arguments) == 1 and arguments[0].isdigit():
        # Running the task with its PID number (bibsched style).
        run_as_daemon = True

    specific_daemon_options = ('-i', '--recid', '-c', '--collection', '-f')
    for option in specific_daemon_options:
        for arg in arguments:
            if arg.startswith(option):
                run_as_daemon = True

    if not run_as_daemon:
        options = _read_options(arguments)

        if options['check_taxonomy']:
            reader.check_taxonomy(options['taxonomy'])

        engine.output_keywords_for_sources(
            options["text_files"],
            options["taxonomy"],
            rebuild_cache=options["rebuild_cache"],
            no_cache=options["no_cache"],
            output_mode=options["output_mode"],
            output_limit=options["output_limit"],
            spires=options["spires"],
            match_mode=options["match_mode"],
            with_author_keywords=options["with_author_keywords"],
            extract_acronyms=options["extract_acronyms"],
            only_core_tags=options["only_core_tags"])
        return

    # A module that imported successfully is always truthy, so a truth test
    # on it cannot detect an unavailable daemon. The unavailable case
    # surfaces as ImportError and is reported through the logger instead.
    try:
        import bibclassify_daemon as daemon
    except ImportError:
        log.error("We are running in a standalone mode, can't start daemon")
        return
    daemon.bibclassify_daemon()
def test_full_and_partial_matching_mode(self):
    """bibclassify - difference of extraction on part or full contents of pdf"""
    here = os.path.dirname(__file__)
    pdf_path = os.path.join(here, '../../../var/data/files/g0/90/9611103.pdf;1')
    if not os.path.exists(pdf_path):
        # Without the file there is nothing to compare; leave quietly.
        sys.stderr.write("No PDF for testing found, returning")
        return

    # First run uses the default matching mode, second one "-m partial".
    command_lines = [
        "-k HEP.rdf %s" % pdf_path,
        "-k HEP.rdf %s -m partial" % pdf_path,
    ]
    # Each engine keyword argument is named after the option key it reads.
    forwarded_options = (
        "rebuild_cache", "no_cache", "output_mode", "output_limit",
        "spires", "match_mode", "with_author_keywords",
        "extract_acronyms", "only_core_tags",
    )

    outputs = []
    for command_line in command_lines:
        options = bibclassify_cli._read_options(command_line.split())

        # Empty both stream objects so each run is read back on its own.
        self.stdout.truncate(0)
        self.stderr.truncate(0)

        bibclassify_engine.output_keywords_for_sources(
            options["text_files"],
            options["taxonomy"],
            **dict((name, options[name]) for name in forwarded_options))

        self.stdout.flush()
        self.stdout.seek(0)
        outputs.append(self.stdout.read())
        self.stderr.flush()
        self.stderr.seek(0)
        errors = self.stderr.read()  # read back, but not inspected below

    # The partial-mode output is validated first, then the default one.
    passed, message = check_pdf1(outputs[1])
    if not passed:
        self.fail(message)

    passed, message = check_pdf2(outputs[0])
    if not passed:
        self.fail(message)
def test_full_and_partial_matching_mode(self):
    """bibclassify - difference of extraction on part or full contents of pdf"""
    # Only the path half of the (path, url) pair matters for this test.
    pdf_path, _url = self.get_test_file(94)
    if not os.path.exists(pdf_path):
        # Without the file there is nothing to compare; leave quietly.
        sys.stderr.write("No PDF for testing found, returning\n")
        return

    # First run uses the default matching mode, second one "-m partial".
    full_case = "-k %s.rdf %s" % (self.taxonomy_name, pdf_path)
    partial_case = "-k %s.rdf %s -m partial" % (self.taxonomy_name, pdf_path)

    # Each engine keyword argument is named after the option key it reads.
    forwarded_options = (
        "rebuild_cache", "no_cache", "output_mode", "output_limit",
        "spires", "match_mode", "with_author_keywords",
        "extract_acronyms", "only_core_tags",
    )

    outputs = []
    for command_line in (full_case, partial_case):
        options = bibclassify_cli._read_options(command_line.split())
        self.redirect()
        bibclassify_engine.output_keywords_for_sources(
            options["text_files"],
            options["taxonomy"],
            **dict((name, options[name]) for name in forwarded_options))
        # unredirect() hands back a pair; only the first element is kept.
        produced, _errors = self.unredirect()
        outputs.append(produced)

    # The partial-mode output is validated first, then the default one.
    passed, message = check_pdf1(outputs[1])
    if not passed:
        self.fail(message)

    passed, message = check_pdf2(outputs[0])
    if not passed:
        self.fail(message)
def test_full_and_partial_matching_mode(self):
    """bibclassify - difference of extraction on part or full contents of pdf"""
    target = self.get_test_file(94)[0]  # path half of the (path, url) pair
    if not os.path.exists(target):
        sys.stderr.write("No PDF for testing found, returning\n")
        return

    # One command line per matching mode: default first, partial second.
    templates = ("-k %s.rdf %s", "-k %s.rdf %s -m partial")
    cases = [template % (self.taxonomy_name, target) for template in templates]

    collected = []
    for case in cases:
        options = bibclassify_cli._read_options(case.split())
        self.redirect()
        # Gather the engine keyword arguments; each mirrors an option key.
        engine_kwargs = {}
        for key in ("rebuild_cache", "no_cache", "output_mode",
                    "output_limit", "spires", "match_mode",
                    "with_author_keywords", "extract_acronyms",
                    "only_core_tags"):
            engine_kwargs[key] = options[key]
        bibclassify_engine.output_keywords_for_sources(
            options["text_files"], options["taxonomy"], **engine_kwargs)
        # Only the first element of the returned pair is kept.
        collected.append(self.unredirect()[0])

    default_output, partial_output = collected

    # The partial-mode output is validated first, then the default one.
    verdict, explanation = check_pdf1(partial_output)
    if not verdict:
        self.fail(explanation)

    verdict, explanation = check_pdf2(default_output)
    if not verdict:
        self.fail(explanation)
def test_cli_extract_from_directory(self):
    """bibclassify -k HEP.rdf directory/"""
    path = os.path.abspath(
        os.path.dirname(__file__) + '/../../../var/data/files/g0/90')
    if not os.path.exists(path):
        # Parenthesised single-argument form: prints the same text on
        # Python 2 and stays valid syntax on Python 3.
        print("No PDF folder for testing found, returning")
        return

    args = ("-k HEP.rdf %s" % path).split()
    options = bibclassify_cli._read_options(args)

    bibclassify_engine.output_keywords_for_sources(
        options["text_files"],
        options["taxonomy"],
        rebuild_cache=options["rebuild_cache"],
        no_cache=options["no_cache"],
        output_mode=options["output_mode"],
        output_limit=options["output_limit"],
        spires=options["spires"],
        match_mode=options["match_mode"],
        with_author_keywords=options["with_author_keywords"],
        extract_acronyms=options["extract_acronyms"],
        only_core_tags=options["only_core_tags"])

    # Flush and rewind both stream objects before reading them back.
    self.stdout.flush()
    self.stdout.seek(0)
    results = self.stdout.read()

    self.stderr.flush()
    self.stderr.seek(0)
    errors = self.stderr.read()  # read back, but not inspected below

    # check_pdf2 yields a (passed, message) pair for the produced output.
    res, msg = check_pdf2(results)
    if not res:
        self.fail(msg)