def process_paper_tar(paper_tarname: str) -> None:
    parts = paper_tarname.split('/')
    partition_name = parts[-2]
    paper_name = os.path.splitext(parts[-1])[0]
    result_path = os.path.join(ARXIV_FIGURE_JSON_DIR, partition_name,
                               paper_name + '.json')
    paper_dir = os.path.join(ARXIV_SRC_DIR, partition_name, paper_name)
    if os.path.isfile(result_path):
        return
    print('.', end='', flush=True)
    try:
        file_util.extract_tarfile(paper_tarname, paper_dir)
    except tarfile.ReadError:
        logging.debug('File %s is not a tar' % paper_tarname)
        return
    diffs = generate_diffs(paper_dir)
    if diffs is None:
        return
    figures_by_page = dict()
    for diff in diffs:
        figures = consume_diff_generate_figures(diff)
        if figures is None:
            continue
        page_name = os.path.dirname(
            diff) + '/' + diff[diff.find('black.pdf-'):]
        figures_by_page[page_name] = figures
    file_util.safe_makedirs(os.path.dirname(result_path))
    file_util.write_json_atomic(
        result_path,
        config.JsonSerializable.serialize(figures_by_page),
        sort_keys=True)
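
file_util.write_json_atomic is project-internal, but the name suggests the standard temp-file-then-rename idiom. A minimal sketch under that assumption (write_json_atomic_sketch is hypothetical, not the library's actual code):

import json
import os
import tempfile

def write_json_atomic_sketch(path: str, obj, **json_kwargs) -> None:
    # Write to a temp file in the target directory, then atomically
    # replace the destination so readers never see a partial file.
    fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(path) or '.',
                                    suffix='.tmp')
    try:
        with os.fdopen(fd, 'w') as f:
            json.dump(obj, f, **json_kwargs)
        os.replace(tmp_path, path)  # atomic rename on POSIX
    except BaseException:
        os.remove(tmp_path)
        raise
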
def download_and_extract_tar(tarname: str,
                             extract_dir: str,
                             n_attempts: int = 100) -> None:
    print('.', end='', flush=True)
    logging.info('Downloading %s' % tarname)
    for attempt in range(n_attempts):
        try:
            cached_file = file_util.cache_file(tarname)
            break
        except FileNotFoundError:
            if attempt == n_attempts - 1:
                raise
            logging.exception('Download failed, retrying')
            time.sleep(10)
    file_util.extract_tarfile(cached_file, extract_dir)
    os.remove(cached_file)
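
The download loop above (and the near-identical one in Example #5 below) is a retry pattern that can be factored out. A hedged sketch; with_retries is a hypothetical helper, not part of file_util:

import logging
import time

def with_retries(fn, n_attempts: int = 100, delay_seconds: float = 10.0):
    # Call fn() until it succeeds; after the last failed attempt,
    # re-raise the exception instead of swallowing it.
    for attempt in range(n_attempts):
        try:
            return fn()
        except FileNotFoundError:
            if attempt == n_attempts - 1:
                raise
            logging.exception('Attempt %d failed, retrying', attempt + 1)
            time.sleep(delay_seconds)

With it, the retry portion of download_and_extract_tar reduces to
cached_file = with_retries(lambda: file_util.cache_file(tarname)).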
Example #3
def process_paper_tar(self):
    print("------Processing paper_tarname : {}--------".format(
        self.paper_tarname))
    parts = self.paper_tarname.split('/')
    partition_name = parts[-2]
    paper_name = os.path.splitext(parts[-1])[0]
    result_path = os.path.join(self.ARXIV_FIGURE_JSON_DIR, partition_name,
                               paper_name + '.json')
    paper_dir = os.path.join(self.ARXIV_SRC_DIR, partition_name, paper_name)
    if os.path.isfile(result_path):
        return
    print('.', end='', flush=True)
    try:
        file_util.extract_tarfile(self.paper_tarname, paper_dir)
    except tarfile.ReadError:
        logging.debug('File %s is not a tar' % self.paper_tarname)
        return
    try:
        diffs, black_ims_paths = self.generate_diffs(paper_dir)
    except TypeError:
        return
    if diffs is None:
        return
    figures_by_page = dict()
    for idx, diff in enumerate(diffs):
        figures = self.consume_diff_generate_figures(diff)
        if figures is None:
            continue
        try:
            figures = self.augment_images(black_ims_paths[idx], figures)
        except Exception as e:
            print("Error augmenting images for image path: {}. "
                  "Exception message: {}".format(black_ims_paths[idx], e))
        page_name = os.path.dirname(
            diff) + '/' + diff[diff.find('black.pdf-'):]
        figures_by_page[page_name] = figures
    file_util.safe_makedirs(os.path.dirname(result_path))
    file_util.write_json_atomic(
        result_path,
        config.JsonSerializable.serialize(figures_by_page),
        sort_keys=True)
    figure_boundaries, caption_boundaries = transform_figure_json(
        result_path, self.ignore_pages_with_no_figures)
    return result_path, figure_boundaries, caption_boundaries
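
Note that this variant returns a (result_path, figure_boundaries, caption_boundaries) tuple only when it runs to completion; every early exit implicitly returns None. A hypothetical caller (processor stands in for whatever object defines the method) should guard for that:

result = processor.process_paper_tar()
if result is not None:
    result_path, figure_boundaries, caption_boundaries = result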
Example #4
def run_full_pipeline(
    tarpath: str, skip_done: bool = True, save_intermediate: bool = False
) -> None:
    foldername = str(os.path.basename(tarpath).split('.')[0])
    result_path = LOCAL_FIGURE_JSON_DIR + get_bin(
        tarpath
    ) + foldername + '.json'
    if skip_done and file_util.exists(result_path):
        return
    d = LOCAL_INTERMEDIATE_DIR + get_bin(tarpath)
    while True:
        try:
            # botocore.vendored.requests.packages.urllib3.exceptions.ReadTimeoutError
            # can't be caught here (the vendored class is not the ReadTimeout
            # imported in this module), so don't use streaming.
            file_util.extract_tarfile(tarpath, d, streaming=False)
            break
        except FileNotFoundError:
            logging.exception('Failure reading %s, retrying' % tarpath)
        except ReadTimeout:
            logging.exception('Timeout reading %s, retrying' % tarpath)
    pdfs = glob.glob(d + foldername + '/' + '*.pdf')
    res = dict()
    for pdf in pdfs:
        sha1sum = file_util.compute_sha1(pdf)
        with open(pdf + '.sha1', 'w') as f:
            print(sha1sum, file=f)
        paper_figures = match_figures(pdf)
        if paper_figures is not None:
            res.update(paper_figures)
    if save_intermediate:
        intermediate_path = PUBMED_INTERMEDIATE_DIR + get_bin(
            tarpath
        ) + foldername + '/'
        for file in glob.glob(d + '/' + foldername + '/' + '*'):
            file_util.copy(file, intermediate_path + os.path.basename(file))
    file_util.safe_makedirs(os.path.dirname(result_path))
    file_util.write_json_atomic(
        result_path,
        config.JsonSerializable.serialize(res),
        indent=2,
        sort_keys=True
    )
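
file_util.compute_sha1 is likewise project-internal; one plausible chunked implementation (an assumption, not the library's actual code), which avoids loading large PDFs fully into memory:

import hashlib

def compute_sha1_sketch(path: str, chunk_size: int = 1 << 20) -> str:
    # Hash the file in 1 MiB chunks.
    sha1 = hashlib.sha1()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            sha1.update(chunk)
    return sha1.hexdigest()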
Example #5
def download_and_extract_tar(
        tarname: str,
        extract_dir: str,
        n_attempts: int = 100,
        cache_dir: str = settings.ARXIV_DATA_CACHE_DIR,
        delete_tar_after_extracting: bool = False) -> None:
    print('.', end='', flush=True)
    logging.info('Downloading %s' % tarname)
    for attempt in range(n_attempts):
        try:
            cached_file = file_util.cache_file_2(tarname, cache_dir=cache_dir)
            break
        except FileNotFoundError:
            if attempt == n_attempts - 1:
                raise
            logging.exception('Download failed, retrying')
            time.sleep(10)
    logging.info("Proceeding to extract tar file: {}".format(cached_file))
    file_util.extract_tarfile(cached_file, extract_dir)
    if delete_tar_after_extracting:
        os.remove(cached_file)
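
A hypothetical invocation; the tar name and directories below are illustrative only:

download_and_extract_tar(
    'arxiv/src/arXiv_src_1701_001.tar',
    extract_dir='/tmp/arxiv_src',
    n_attempts=5,
    delete_tar_after_extracting=True)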