def process_paper_tar(paper_tarname: str) -> None:
    """Extract one arXiv paper source tar, detect figures on each rendered
    page diff, and write the per-page figure JSON for that paper."""
    parts = paper_tarname.split('/')
    partition_name = parts[-2]
    paper_name = os.path.splitext(parts[-1])[0]
    result_path = os.path.join(ARXIV_FIGURE_JSON_DIR, partition_name,
                               paper_name + '.json')
    paper_dir = os.path.join(ARXIV_SRC_DIR, partition_name, paper_name)
    # Skip papers whose figure JSON has already been written.
    if os.path.isfile(result_path):
        return
    print('.', end='', flush=True)
    try:
        file_util.extract_tarfile(paper_tarname, paper_dir)
    except tarfile.ReadError:
        logging.debug('File %s is not a tar' % paper_tarname)
        return
    diffs = generate_diffs(paper_dir)
    if diffs is None:
        return
    figures_by_page = dict()
    for diff in diffs:
        figures = consume_diff_generate_figures(diff)
        if figures is None:
            continue
        page_name = os.path.dirname(
            diff) + '/' + diff[diff.find('black.pdf-'):]
        figures_by_page[page_name] = figures
    file_util.safe_makedirs(os.path.dirname(result_path))
    file_util.write_json_atomic(
        result_path,
        config.JsonSerializable.serialize(figures_by_page),
        sort_keys=True)
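# Hedged usage sketch (not part of the original module): one plausible way to
# drive process_paper_tar over every per-paper tarball in a source partition.
# The '*.gz' glob pattern and the multiprocessing pool are assumptions for
# illustration; the real driver may iterate differently.
def _process_partition_sketch(partition_dir: str) -> None:
    import glob
    import multiprocessing
    paper_tarnames = sorted(glob.glob(os.path.join(partition_dir, '*.gz')))
    with multiprocessing.Pool() as pool:
        # process_paper_tar skips papers whose figure JSON already exists,
        # so re-running this driver is cheap and resumable.
        pool.map(process_paper_tar, paper_tarnames)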
def download_and_extract_tar(tarname: str,
                             extract_dir: str,
                             n_attempts: int = 100) -> None:
    """Download tarname into the local cache, retrying on transient
    failures, then extract it into extract_dir and delete the cached copy."""
    print('.', end='', flush=True)
    logging.info('Downloading %s' % tarname)
    for attempt in range(n_attempts):
        try:
            cached_file = file_util.cache_file(tarname)
            break
        except FileNotFoundError:
            if attempt == n_attempts - 1:
                raise
            logging.exception('Download failed, retrying')
            time.sleep(10)
    file_util.extract_tarfile(cached_file, extract_dir)
    os.remove(cached_file)
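# Hedged usage sketch (illustrative only): fetching and unpacking a single
# arXiv source tar. Both the tar name and the extraction directory below are
# made-up examples, not paths used by the original pipeline.
def _download_one_tar_sketch() -> None:
    example_tarname = 's3://arxiv/src/arXiv_src_1601_001.tar'  # hypothetical
    example_extract_dir = '/data/arxiv/src/1601_001'  # hypothetical
    download_and_extract_tar(example_tarname, example_extract_dir)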
def process_paper_tar(self):
    """Class-based variant of process_paper_tar: extract the paper source,
    render page diffs, detect and augment figures, write the per-page figure
    JSON, and return the result path with figure and caption boundaries."""
    print("------Processing paper_tarname : {}--------".format(
        self.paper_tarname))
    parts = self.paper_tarname.split('/')
    partition_name = parts[-2]
    paper_name = os.path.splitext(parts[-1])[0]
    result_path = os.path.join(self.ARXIV_FIGURE_JSON_DIR, partition_name,
                               paper_name + '.json')
    paper_dir = os.path.join(self.ARXIV_SRC_DIR, partition_name, paper_name)
    if os.path.isfile(result_path):
        return
    print('.', end='', flush=True)
    try:
        file_util.extract_tarfile(self.paper_tarname, paper_dir)
    except tarfile.ReadError:
        logging.debug('File %s is not a tar' % self.paper_tarname)
        return
    try:
        diffs, black_ims_paths = self.generate_diffs(paper_dir)
    except TypeError:
        return
    if diffs is None:
        return
    figures_by_page = dict()
    for idx, diff in enumerate(diffs):
        figures = self.consume_diff_generate_figures(diff)
        if figures is None:
            continue
        try:
            figures = self.augment_images(black_ims_paths[idx], figures)
        except Exception as e:
            print(
                "Error augmenting images for image path: {}. Exception message: {}"
                .format(black_ims_paths[idx], e))
        page_name = os.path.dirname(
            diff) + '/' + diff[diff.find('black.pdf-'):]
        figures_by_page[page_name] = figures
    file_util.safe_makedirs(os.path.dirname(result_path))
    file_util.write_json_atomic(
        result_path,
        config.JsonSerializable.serialize(figures_by_page),
        sort_keys=True)
    figure_boundaries, caption_boundaries = transform_figure_json(
        result_path, self.ignore_pages_with_no_figures)
    return result_path, figure_boundaries, caption_boundaries
def run_full_pipeline(tarpath: str,
                      skip_done: bool = True,
                      save_intermediate: bool = False) -> None:
    """Run the full PubMed figure-matching pipeline on one tarball and write
    the matched figures for all of its PDFs as a single JSON result."""
    foldername = str(os.path.basename(tarpath).split('.')[0])
    result_path = LOCAL_FIGURE_JSON_DIR + get_bin(
        tarpath) + foldername + '.json'
    if skip_done and file_util.exists(result_path):
        return
    d = LOCAL_INTERMEDIATE_DIR + get_bin(tarpath)
    while True:
        try:
            # Don't use streaming extraction:
            # botocore.vendored.requests.packages.urllib3.exceptions.ReadTimeoutError
            # can't be caught here because it doesn't inherit from BaseException.
            file_util.extract_tarfile(tarpath, d, streaming=False)
            break
        except FileNotFoundError:
            logging.exception('Failure reading %s, retrying' % tarpath)
        except ReadTimeout:
            logging.exception('Timeout reading %s, retrying' % tarpath)
    pdfs = glob.glob(d + foldername + '/' + '*.pdf')
    res = dict()
    for pdf in pdfs:
        sha1sum = file_util.compute_sha1(pdf)
        with open(pdf + '.sha1', 'w') as f:
            print(sha1sum, file=f)
        paper_figures = match_figures(pdf)
        if paper_figures is not None:
            res.update(paper_figures)
    if save_intermediate:
        intermediate_path = PUBMED_INTERMEDIATE_DIR + get_bin(
            tarpath) + foldername + '/'
        for file in glob.glob(d + '/' + foldername + '/' + '*'):
            file_util.copy(file, intermediate_path + os.path.basename(file))
    file_util.safe_makedirs(os.path.dirname(result_path))
    file_util.write_json_atomic(
        result_path,
        config.JsonSerializable.serialize(res),
        indent=2,
        sort_keys=True)
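# Hedged usage sketch: running the PubMed pipeline over a directory of
# downloaded bundles. The directory layout and '*.tar.gz' glob are assumptions
# for illustration; skip_done=True makes the loop resumable because finished
# bundles are detected via their existing JSON result path.
def _run_pubmed_batch_sketch(tar_dir: str) -> None:
    import glob
    for tarpath in sorted(glob.glob(os.path.join(tar_dir, '*.tar.gz'))):
        try:
            run_full_pipeline(tarpath, skip_done=True)
        except Exception:
            # One malformed bundle should not halt the whole batch.
            logging.exception('Pipeline failed for %s' % tarpath)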
def download_and_extract_tar(
        tarname: str,
        extract_dir: str,
        n_attempts: int = 100,
        cache_dir: str = settings.ARXIV_DATA_CACHE_DIR,
        delete_tar_after_extracting: bool = False) -> None:
    """Variant of download_and_extract_tar with a configurable cache
    directory and optional deletion of the cached tar after extraction."""
    print('.', end='', flush=True)
    logging.info('Downloading %s' % tarname)
    for attempt in range(n_attempts):
        try:
            cached_file = file_util.cache_file_2(tarname, cache_dir=cache_dir)
            break
        except FileNotFoundError:
            if attempt == n_attempts - 1:
                raise
            logging.exception('Download failed, retrying')
            time.sleep(10)
    logging.info("Proceeding to extract tar file: {}".format(cached_file))
    file_util.extract_tarfile(cached_file, extract_dir)
    if delete_tar_after_extracting:
        os.remove(cached_file)