Example #1
def call_pdflatex(src_tex: str,
                  src_dir: str,
                  dest_dir: str,
                  timeout: int = 1200) -> str:
    """
    Call pdflatex on the tex source file src_tex, save its output to dest_dir, and return the path of the
    resulting pdf.
    """
    # pdflatex is run with cwd=src_dir below so that relative paths in the
    # source resolve correctly
    file_util.safe_makedirs(dest_dir)
    # Shell-escape required due to https://www.scivision.co/pdflatex-error-epstopdf-output-filename-not-allowed-in-restricted-mode/
    cmd = [
        'pdflatex', '-interaction=nonstopmode', '-shell-escape',
        '-output-directory=' + dest_dir, src_tex
    ]
    # Run twice so that citations are built correctly
    # Had some issues getting latexmk to work
    try:
        subprocess.run(cmd,
                       stdout=subprocess.PIPE,
                       cwd=src_dir,
                       timeout=timeout)
        res = subprocess.run(cmd,
                             stdout=subprocess.PIPE,
                             cwd=src_dir,
                             timeout=timeout)
    except subprocess.TimeoutExpired:
        raise LatexException(' '.join(cmd), -1,
                             'Timeout after %d seconds' % timeout)
    if res.returncode != 0:
        raise LatexException(' '.join(cmd), res.returncode, res.stdout)
    paperid = os.path.splitext(os.path.basename(src_tex))[0]
    return os.path.join(dest_dir, paperid + '.pdf')
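
A minimal usage sketch (the paths below are made up for illustration; file_util and LatexException are assumed to be defined elsewhere in this module):

pdf_path = call_pdflatex(src_tex='/tmp/arxiv/1234.5678/main.tex',
                         src_dir='/tmp/arxiv/1234.5678/',
                         dest_dir='/tmp/arxiv/build/1234.5678/',
                         timeout=600)
# pdf_path == '/tmp/arxiv/build/1234.5678/main.pdf'
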
def process_paper_tar(paper_tarname: str) -> None:
    """
    Extract the given arXiv source tarball, generate page diffs, and write the
    figures detected on each page to a JSON file under ARXIV_FIGURE_JSON_DIR.
    """
    parts = paper_tarname.split('/')
    partition_name = parts[-2]
    paper_name = os.path.splitext(parts[-1])[0]
    result_path = os.path.join(ARXIV_FIGURE_JSON_DIR, partition_name,
                               paper_name + '.json')
    paper_dir = os.path.join(ARXIV_SRC_DIR, partition_name, paper_name)
    # Skip papers that have already been processed
    if os.path.isfile(result_path):
        return
    print('.', end='', flush=True)  # progress indicator
    try:
        file_util.extract_tarfile(paper_tarname, paper_dir)
    except tarfile.ReadError:
        logging.debug('File %s is not a tar' % paper_tarname)
        return
    diffs = generate_diffs(paper_dir)
    if diffs is None:
        return
    figures_by_page = dict()
    for diff in diffs:
        figures = consume_diff_generate_figures(diff)
        if figures is None:
            continue
        # Key each page by the path of the corresponding 'black.pdf' page image
        page_name = os.path.dirname(
            diff) + '/' + diff[diff.find('black.pdf-'):]
        figures_by_page[page_name] = figures
    file_util.safe_makedirs(os.path.dirname(result_path))
    file_util.write_json_atomic(
        result_path,
        config.JsonSerializable.serialize(figures_by_page),
        sort_keys=True)
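
A sketch of how process_paper_tar could be driven over a batch of arXiv source tarballs. The tarball directory, glob pattern, and pool size are illustrative assumptions, not part of the original code; note that the function takes the partition name from the tarball's parent directory (parts[-2]):

import glob
import multiprocessing
import os

if __name__ == '__main__':
    arxiv_tar_dir = '/data/arxiv/src_tars'  # hypothetical download location
    tarnames = sorted(glob.glob(os.path.join(arxiv_tar_dir, '*', '*.gz')))
    with multiprocessing.Pool(processes=8) as pool:
        pool.map(process_paper_tar, tarnames)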
Example #3
 def process_paper_tar(self):
     """
     Extract the paper tarball, generate page diffs, detect figures on each
     page, augment the corresponding images, and write the per-page results to
     a JSON file. Returns the result path plus figure and caption boundaries,
     or None if the paper is skipped.
     """
     print("------Processing paper_tarname : {}--------".format(
         self.paper_tarname))
     parts = self.paper_tarname.split('/')
     partition_name = parts[-2]
     paper_name = os.path.splitext(parts[-1])[0]
     result_path = os.path.join(self.ARXIV_FIGURE_JSON_DIR, partition_name,
                                paper_name + '.json')
     paper_dir = os.path.join(self.ARXIV_SRC_DIR, partition_name,
                              paper_name)
     if os.path.isfile(result_path):
         return
     print('.', end='', flush=True)
     try:
         file_util.extract_tarfile(self.paper_tarname, paper_dir)
     except tarfile.ReadError:
         logging.debug('File %s is not a tar' % self.paper_tarname)
         return
     # generate_diffs may return None, in which case unpacking its result
     # raises TypeError
     try:
         diffs, black_ims_paths = self.generate_diffs(paper_dir)
     except TypeError:
         return
     if diffs is None:
         return
     figures_by_page = dict()
     for idx, diff in enumerate(diffs):
         figures = self.consume_diff_generate_figures(diff)
         if figures is None:
             continue
         try:
             figures = self.augment_images(black_ims_paths[idx], figures)
         except Exception as e:
             print(
                 "Error augmenting images for image path: {}. Exception message: {}"
                 .format(black_ims_paths[idx], e))
         page_name = os.path.dirname(
             diff) + '/' + diff[diff.find('black.pdf-'):]
         figures_by_page[page_name] = figures
     file_util.safe_makedirs(os.path.dirname(result_path))
     file_util.write_json_atomic(
         result_path,
         config.JsonSerializable.serialize(figures_by_page),
         sort_keys=True)
     figure_boundaries, caption_boundaries = transform_figure_json(
         result_path, self.ignore_pages_with_no_figures)
     return result_path, figure_boundaries, caption_boundaries
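
The method above returns None on each early exit (paper already processed, not a tar file, or no diffs) and a three-tuple otherwise, so callers should guard the unpacking. A hedged sketch, where extractor stands for an instance of the enclosing class, which is not shown in this excerpt:

result = extractor.process_paper_tar()
if result is not None:
    result_path, figure_boundaries, caption_boundaries = result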
Example #4
def run_full_pipeline(
    tarpath: str, skip_done: bool = True, save_intermediate: bool = False
) -> None:
    """
    Extract a PubMed tarball, match figures for every pdf it contains, and
    write the combined results to a JSON file, optionally keeping the
    intermediate files.
    """
    foldername = os.path.basename(tarpath).split('.')[0]
    result_path = LOCAL_FIGURE_JSON_DIR + get_bin(
        tarpath
    ) + foldername + '.json'
    if skip_done and file_util.exists(result_path):
        return
    d = LOCAL_INTERMEDIATE_DIR + get_bin(tarpath)
    while True:
        try:
            file_util.extract_tarfile(tarpath, d, streaming=False)
            # The streaming code path can raise
            # botocore.vendored.requests.packages.urllib3.exceptions.ReadTimeoutError,
            # which the ReadTimeout handler below does not catch, so don't use
            # streaming
            break
        except FileNotFoundError:
            logging.exception('Failure reading %s, retrying' % tarpath)
        except ReadTimeout:
            logging.exception('Timeout reading %s, retrying' % tarpath)
    pdfs = glob.glob(d + foldername + '/' + '*.pdf')
    res = dict()
    for pdf in pdfs:
        # Record the pdf's sha1 in a sidecar file next to it
        sha1sum = file_util.compute_sha1(pdf)
        with open(pdf + '.sha1', 'w') as f:
            print(sha1sum, file=f)
        paper_figures = match_figures(pdf)
        if paper_figures is not None:
            res.update(paper_figures)
    if save_intermediate:
        intermediate_path = PUBMED_INTERMEDIATE_DIR + get_bin(
            tarpath
        ) + foldername + '/'
        for file in glob.glob(d + '/' + foldername + '/' + '*'):
            file_util.copy(file, intermediate_path + os.path.basename(file))
    file_util.safe_makedirs(os.path.dirname(result_path))
    file_util.write_json_atomic(
        result_path,
        config.JsonSerializable.serialize(res),
        indent=2,
        sort_keys=True
    )
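
A sketch of invoking the pipeline on a single tarball; the path is made up for illustration, and the *_DIR constants and helpers (get_bin, match_figures, file_util) are assumed to be provided by the surrounding module:

run_full_pipeline('/data/pubmed/tarballs/example_package.tar.gz',
                  skip_done=True,
                  save_intermediate=True)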