def load_bibliography(path=None, text=None, input_format=None):
    """
    Convert a bibliography to CSL JSON by running `pandoc-citeproc --bib2json`.

    Exactly one of `path` or `text` must be supplied. When `text` is given it
    is passed to the subprocess on stdin, and pandoc-citeproc will likely
    require `input_format` to be specified. The CSL JSON is returned as
    Python objects.

    Parameters
    ----------
    path : str, pathlike, or None
        Path to a bibliography file. Extension is used by pandoc-citeproc
        to infer the format of the input.
    text : str or None
        Text representation of the bibliography, such as a JSON-formatted
        string. `input_format` should be specified if providing text input.
    input_format : str or None
        Manually specified input format that is supported by pandoc-citeproc:
        https://github.com/jgm/pandoc-citeproc/blob/master/man/pandoc-citeproc.1.md#options

    Returns
    -------
    csl_json : JSON-like object
        CSL JSON Data for the references encoded by the input bibliography.
        An empty list is returned when pandoc-citeproc is not available or
        when its output cannot be parsed as JSON.

    Raises
    ------
    ValueError
        If both or neither of `path` and `text` are provided.
    subprocess.CalledProcessError
        If the pandoc-citeproc subprocess exits with a non-zero status.
    """
    path_given = path is not None
    text_given = text is not None
    # Equal flags mean that either both inputs or no input was supplied.
    if path_given == text_given:
        raise ValueError(
            "load_bibliography: specify either path or text but not both."
        )
    if not get_pandoc_info()["pandoc-citeproc"]:
        logging.error(
            "pandoc-citeproc not found on system: "
            "manubot.pandoc.bibliography.load_bibliography returning empty CSL JSON"
        )
        return []
    command = ["pandoc-citeproc", "--bib2json"]
    if input_format:
        command += ["--format", input_format]
    extra_run_kwargs = {}
    if text_given:
        # Text input is piped to the subprocess via stdin.
        extra_run_kwargs["input"] = text
    else:
        command.append(str(path))
    logging.info("call_pandoc subprocess args:\n>>> " + shlex_join(command))
    process = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        encoding="utf-8",
        **extra_run_kwargs,
    )
    logging.info(f"captured stderr:\n{process.stderr}")
    process.check_returncode()
    try:
        return json.loads(process.stdout)
    except Exception:
        logging.exception(
            f"Error parsing bib2json output as JSON:\n{process.stdout}"
        )
        return []
def _exit_without_pandoc():
    """
    Exit Python if a required Pandoc command is not available.

    Looks up 'pandoc' and then 'pandoc-citeproc' in the mapping returned by
    get_pandoc_info. For the first command reported as missing, a critical
    message is logged and SystemExit(1) is raised.
    """
    availability = get_pandoc_info()
    # The generator is lazy, so lookups stop at the first missing command.
    missing = next(
        (name for name in ('pandoc', 'pandoc-citeproc')
         if not availability[name]),
        None,
    )
    if missing is None:
        return
    logging.critical(
        f'"{missing}" not found on system. Check that Pandoc is installed.')
    raise SystemExit(1)
def _exit_without_pandoc() -> None:
    """
    Exit Python if the pandoc command is not available.

    Consults get_pandoc_info; when its "pandoc" entry is falsy, a critical
    message is logged and SystemExit(1) is raised. Otherwise the function
    returns without doing anything.
    """
    if not get_pandoc_info()["pandoc"]:
        logging.critical(
            "pandoc command not found on system. Ensure that Pandoc is installed."
        )
        raise SystemExit(1)
def _exit_without_pandoc():
    """
    Exit Python if a required Pandoc command is not available.

    Checks "pandoc" and then "pandoc-citeproc" against the mapping returned
    by get_pandoc_info. The first command reported as missing triggers a
    critical log message followed by SystemExit(1).
    """
    required_commands = ("pandoc", "pandoc-citeproc")
    pandoc_info = get_pandoc_info()
    for name in required_commands:
        if pandoc_info[name]:
            continue
        logging.critical(
            f"{name!r} not found on system. Check that Pandoc is installed."
        )
        raise SystemExit(1)
def test_cite_pandoc_filter():
    r"""
    Test the plain-text stdout of pandoc when run with the
    `pandoc-manubot-cite` filter followed by the `pandoc-citeproc` filter.

    The input markdown and the expected output are read from the
    `test_cite_filter` directory. Output is compared case-insensitively.
    The output is sensitive to the version of Pandoc used, so rather than
    fail when the system's pandoc is too old to support `--filter`,
    the test is skipped.

    ```shell
    # Command to regenerate the expected output
    pandoc \
      --to=plain \
      --wrap=preserve \
      --csl=https://github.com/manubot/rootstock/raw/8b9b5ced2c7c963bf3ea5afb8f31f9a4a54ab697/build/assets/style.csl \
      --output=manubot/pandoc/tests/test_cite_filter/output.txt \
      --bibliography=manubot/pandoc/tests/test_cite_filter/bibliography.json \
      --bibliography=manubot/pandoc/tests/test_cite_filter/bibliography.bib \
      --filter=pandoc-manubot-cite \
      --filter=pandoc-citeproc \
      manubot/pandoc/tests/test_cite_filter/input.md
    ```
    """
    data_dir = directory.joinpath("test_cite_filter")
    pandoc_version = get_pandoc_info()["pandoc version"]
    if pandoc_version < (1, 12):
        pytest.skip("Test requires pandoc >= 1.12 to support --filter")
    input_md = data_dir.joinpath("input.md").read_text(encoding="utf-8-sig")
    expected = data_dir.joinpath("output.txt").read_text(encoding="utf-8-sig")
    command = [
        "pandoc",
        "--wrap=preserve",
        "--csl=https://github.com/manubot/rootstock/raw/8b9b5ced2c7c963bf3ea5afb8f31f9a4a54ab697/build/assets/style.csl",
    ]
    # Both bibliography files live alongside the input in the data directory.
    for bibliography_name in ("bibliography.json", "bibliography.bib"):
        command += ["--bibliography", str(data_dir.joinpath(bibliography_name))]
    command += [
        "--filter=pandoc-manubot-cite",
        "--filter=pandoc-citeproc",
        "--to=plain",
    ]
    process = subprocess.run(
        command,
        input=input_md,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        encoding="utf-8",
    )
    # Printed so the command and its streams are visible when the test fails.
    print(shlex_join(process.args))
    print(process.stdout)
    print(process.stderr)
    assert process.stdout.lower() == expected.lower()
def test_cite_pandoc_filter():
    r"""
    Test the plain-text stdout of pandoc when run with the
    `pandoc-manubot-cite` filter followed by citation processing.

    Citation processing uses `--filter=pandoc-citeproc` for pandoc versions
    below 2.11 and `--citeproc` otherwise. The input markdown and the
    expected output are read from the `test_cite_filter` directory, and
    output is compared case-insensitively. The output is sensitive to the
    version of Pandoc used, so rather than fail when the system's pandoc is
    too old to support `--filter`, the test is skipped.

    ```shell
    # Command to regenerate the expected output
    pandoc \
      --to=plain \
      --wrap=preserve \
      --output=manubot/pandoc/tests/test_cite_filter/output.txt \
      --filter=pandoc-manubot-cite \
      --filter=pandoc-citeproc \
      manubot/pandoc/tests/test_cite_filter/input.md

    # Command to generate Pandoc JSON input for pandoc-manubot-cite
    pandoc \
      --to=json \
      --wrap=preserve \
      --output=manubot/pandoc/tests/test_cite_filter/filter-input.json \
      manubot/pandoc/tests/test_cite_filter/input.md
    ```
    """
    data_dir = directory.joinpath("test_cite_filter")
    pandoc_version = get_pandoc_info()["pandoc version"]
    if pandoc_version < (1, 12):
        pytest.skip("Test requires pandoc >= 1.12 to support --filter")
    input_md = data_dir.joinpath("input.md").read_text(encoding="utf-8-sig")
    expected = data_dir.joinpath("output.txt").read_text(encoding="utf-8-sig")
    # Select the citation-processing option based on the pandoc version.
    if pandoc_version < (2, 11):
        citeproc_option = "--filter=pandoc-citeproc"
    else:
        citeproc_option = "--citeproc"
    command = [
        "pandoc",
        "--wrap=preserve",
        "--filter=pandoc-manubot-cite",
        citeproc_option,
        "--to=plain",
    ]
    process = subprocess.run(
        command,
        input=input_md,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        encoding="utf-8",
    )
    # Printed so the command and its streams are visible when the test fails.
    print(shlex_join(process.args))
    print(process.stdout)
    print(process.stderr)
    assert process.stdout.lower() == expected.lower()
def load_bibliography(
    path: Optional[str] = None,
    text: Optional[str] = None,
    input_format: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """
    Convert a bibliography to CSL JSON using either `pandoc-citeproc --bib2json`
    or `pandoc --to=csljson`, depending on availability of pandoc commands on
    the system.

    Accepts either a bibliography path or text (string). If supplying text,
    pandoc-citeproc will likely require input_format be specified.
    The CSL JSON is returned as Python objects.
    If loading fails, log an error and return an empty list.

    Parameters
    ----------
    path : str, pathlike, or None
        Path to a bibliography file. Extension is used by pandoc-citeproc
        to infer the format of the input.
    text : str or None
        Text representation of the bibliography, such as a JSON-formatted
        string. `input_format` should be specified if providing text input.
    input_format : str or None
        Manually specified input format that is supported by pandoc-citeproc:
        https://github.com/jgm/pandoc-citeproc/blob/master/man/pandoc-citeproc.1.md#options
        Use 'bib' for BibLaTeX. Use 'json' for CSL JSON.

    Returns
    -------
    csl_json : JSON-like object
        CSL JSON Data for the references encoded by the input bibliography.

    Raises
    ------
    ValueError
        If both or neither of `path` and `text` are provided.
    """
    use_path = path is not None
    use_text = text is not None
    # Validate before touching `path`: os.fspath(None) raises TypeError, which
    # would otherwise mask the intended ValueError when no input is supplied.
    if use_path == use_text:
        raise ValueError(
            "load_bibliography: specify either path or text but not both."
        )
    if use_path:
        # Normalize path-like objects so the string check below works.
        path = os.fspath(path)
    pdoc_info = get_pandoc_info()
    if pdoc_info["pandoc-citeproc"]:
        return _load_bibliography_pandoc_citeproc(path, text, input_format)
    # Without pandoc-citeproc, only BibLaTeX input is routed to pandoc itself.
    if input_format == "bib" or (use_path and path.endswith(".bib")):
        return _load_bibliography_pandoc(path, text)
    logging.error(
        "pandoc-citeproc not found on system, but is required to convert any format besides 'bib': "
        "manubot.pandoc.bibliography.load_bibliography returning empty CSL JSON"
    )
    return []
def call_pandoc(metadata, path, format="plain"):
    """
    Run pandoc with the pandoc-citeproc filter on a metadata block.

    `metadata` is serialized with json.dumps, wrapped between `---` and
    `...` delimiter lines, and sent to the pandoc subprocess on stdin.

    Parameters
    ----------
    metadata : JSON-serializable object
        Metadata to embed in the block passed to pandoc.
    path : str, pathlike, or None
        path is the path to write to. When falsy, "-" is passed to
        pandoc's --output option instead.
    format : str
        One of "markdown", "jats", "docx", "html", or "plain". Any other
        value adds no output-format options to the pandoc command.

    Raises
    ------
    subprocess.CalledProcessError
        If pandoc exits with a non-zero status.
    """
    _exit_without_pandoc()
    info = get_pandoc_info()
    _check_pandoc_version(info, metadata, format)
    serialized = json.dumps(metadata, ensure_ascii=False, indent=2)
    metadata_block = f"---\n{serialized}\n...\n"
    output_target = str(path) if path else "-"
    args = ["pandoc", "--filter", "pandoc-citeproc", "--output", output_target]
    options_by_format = {
        "markdown": ["--to", "markdown_strict", "--wrap", "none"],
        "jats": ["--to", "jats", "--standalone"],
        "docx": ["--to", "docx"],
        "html": ["--to", "html"],
        "plain": ["--to", "plain", "--wrap", "none"],
    }
    args.extend(options_by_format.get(format, []))
    if format == "plain" and info["pandoc version"] >= (2,):
        # Do not use ALL_CAPS for bold & underscores for italics
        # https://github.com/jgm/pandoc/issues/4834#issuecomment-412972008
        lua_filter = pathlib.Path(__file__).joinpath(
            "..", "plain-pandoc-filter.lua"
        )
        filter_path = lua_filter.resolve()
        assert filter_path.exists()
        args.extend(["--lua-filter", str(filter_path)])
    logging.info("call_pandoc subprocess args:\n" + shlex_join(args))
    process = subprocess.run(
        args=args,
        input=metadata_block.encode(),
    )
    process.check_returncode()
def call_pandoc(metadata, path, format='plain'):
    """
    Run pandoc with the pandoc-citeproc filter on a metadata block.

    `metadata` is serialized with json.dumps, wrapped between `---` and
    `...` delimiter lines, and sent to the pandoc subprocess on stdin.
    The subprocess stderr is connected to sys.stderr. Its stdout is
    captured through a pipe when `path` is truthy and connected to
    sys.stdout otherwise.

    Parameters
    ----------
    metadata : JSON-serializable object
        Metadata to embed in the block passed to pandoc.
    path : str, pathlike, or None
        path is the path to write to. When falsy, '-' is passed to
        pandoc's --output option instead.
    format : str
        One of 'markdown', 'jats', 'docx', 'html', or 'plain'. Any other
        value adds no output-format options to the pandoc command.

    Raises
    ------
    subprocess.CalledProcessError
        If pandoc exits with a non-zero status.
    """
    _exit_without_pandoc()
    info = get_pandoc_info()
    _check_pandoc_version(info, metadata, format)
    serialized = json.dumps(metadata, ensure_ascii=False, indent=2)
    metadata_block = f'---\n{serialized}\n...\n'
    if path:
        output_target = str(path)
        stdout_target = subprocess.PIPE
    else:
        output_target = '-'
        stdout_target = sys.stdout
    args = ['pandoc', '--filter', 'pandoc-citeproc', '--output', output_target]
    options_by_format = {
        'markdown': ['--to', 'markdown_strict', '--wrap', 'none'],
        'jats': ['--to', 'jats', '--standalone'],
        'docx': ['--to', 'docx'],
        'html': ['--to', 'html'],
        'plain': ['--to', 'plain', '--wrap', 'none'],
    }
    args.extend(options_by_format.get(format, []))
    if format == 'plain' and info['pandoc version'] >= (2, ):
        # Do not use ALL_CAPS for bold & underscores for italics
        # https://github.com/jgm/pandoc/issues/4834#issuecomment-412972008
        lua_filter = pathlib.Path(__file__).joinpath(
            '..', 'plain-pandoc-filter.lua')
        filter_path = lua_filter.resolve()
        assert filter_path.exists()
        args.extend(['--lua-filter', str(filter_path)])
    logging.info('call_pandoc subprocess args:\n' + shlex_join(args))
    process = subprocess.run(
        args=args,
        input=metadata_block.encode(),
        stdout=stdout_target,
        stderr=sys.stderr,
    )
    process.check_returncode()
def _load_bibliography_pandoc(
    path: Optional[str] = None,
    text: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """
    Convert a biblatex (.bib) bibliography to CSL JSON data using pandoc directly.

    Pandoc support for csljson output requires pandoc >= 2.11. When pandoc is
    missing or older than that, an error is logged and an empty list is
    returned. Otherwise the conversion is delegated to _pandoc_system_call
    with `path` and `text` passed through unchanged.
    """
    info = get_pandoc_info()
    problem = None
    if not info["pandoc"]:
        problem = "pandoc not found on system: "
    elif info["pandoc version"] < (2, 11):
        problem = (
            "pandoc >= version 2.11 required for biblatex to csljson conversion. "
        )
    if problem is not None:
        logging.error(
            problem
            + "manubot.pandoc.bibliography.load_bibliography returning empty CSL JSON"
        )
        return []
    command_args = ["pandoc", "--from=biblatex", "--to=csljson"]
    return _pandoc_system_call(command_args, path, text)
def test_cite_command_render_stdout(args, expected):
    """
    Test the stdout output of `manubot cite --render` with various formats.

    `args` holds extra command-line arguments appended to the base command,
    and `expected` names the file in the `cite-command-rendered` directory
    whose text the stdout must equal exactly.
    The output is sensitive to the version of Pandoc used, so rather than fail
    when the system's pandoc is outdated, the test is skipped.
    """
    pandoc_version = get_pandoc_info()['pandoc version']
    if pandoc_version < (2, 5):
        for output in ('markdown', 'html', 'jats'):
            if output in args:
                pytest.skip(f"Test {output} output assumes pandoc >= 2.5")
    if pandoc_version < (2, 0):
        pytest.skip(
            "Test requires pandoc >= 2.0 to support --lua-filter and --csl=URL"
        )
    rendered_dir = pathlib.Path(__file__).parent
    expected_path = rendered_dir.joinpath('cite-command-rendered', expected)
    expected = expected_path.read_text()
    base_command = [
        'manubot',
        'cite',
        '--render',
        '--csl',
        'https://github.com/greenelab/manubot-rootstock/raw/e83e51dcd89256403bb787c3d9a46e4ee8d04a9e/build/assets/style.csl',
        'arxiv:1806.05726v1',
        'doi:10.7717/peerj.338',
        'pmid:29618526',
    ]
    command = base_command + args
    process = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    # Printed so the command and its streams are visible when the test fails.
    print(' '.join(process.args))
    print(process.stdout)
    print(process.stderr)
    assert process.stdout == expected