def test_extract_captions(self): '''Test extracting captions''' if not self.app.config.get('GRAPHICS_ENABLE_UPDATES', False): return True from file_ops import untar from file_ops import convert_images from invenio_tools import extract_captions from invenio_tools import prepare_image_data from invenio_tools import extract_context # Assuming the previous tests succeeded, we know we can extract # the images fro the archive archive = "%s/tests/stubdata/arXiv/YY/NN.tar.gz" % PROJECT_HOME tex, imgs, sdir = untar(archive) # Now use the update machinery to convert the images to PNGs converted_images = convert_images(imgs) # Pick the right TeX file tex_file = [f for f in tex if f.split('.')[-1] == 'tex'][0] # Extract captions TMP = self.app.config.get('GRAPHICS_TMP_DIR') im_data = extract_captions(tex_file, TMP, converted_images) # Did we get what we expected expected = ('', 'noimgDistance to M~51', '') self.assertEqual(im_data[0], expected) # Check cleaned data cleaned = prepare_image_data(im_data, tex_file, converted_images) self.assertEqual(os.path.basename(cleaned[-1][0]), 'figure09.png') self.assertEqual(cleaned[-1][1], 'figure05.ps') self.assertEqual(cleaned[-1][2], '') # Check extracted context context = extract_context(tex_file, cleaned) expected = ('', 'noimgDistance to M~51', '', []) self.assertEqual(context[0], expected) # Cleanup the extracted data extract_dir = "%s/NN" % TMP shutil.rmtree(extract_dir)
pass return # We now have a list with successfully converted (PNG) images extracted_image_data = [] for tex_file in tex_files: # Extract images, captions and labels partly_extracted_image_data = extract_captions(tex_file, xdir, img_files) if not partly_extracted_image_data == []: # Add proper filepaths and do various cleaning cleaned_image_data = prepare_image_data( partly_extracted_image_data, tex_file, converted_images) # Using prev. extracted info, get contexts for each image found extracted_image_data.extend( (extract_context(tex_file, cleaned_image_data))) extracted_image_data = remove_dups(extracted_image_data) fid = 1 source2target = {} for item in extracted_image_data: if not os.path.exists(item[0]) or not item[0].strip(): continue fig_data = {} if arx_id.find('arXiv') > -1: figure_id = 'arxiv%s_f%s' % (arx_id.replace('arXiv:', ''), fid) subdir = arx_id.replace('arXiv:', '').split('.')[0] eprdir = arx_id.replace('arXiv:', '').split('.')[1] else: figure_id = '%s_f%s' % (arx_id.replace('/', '_'), fid) subdir = arx_id.split('/')[1][:4] eprdir = arx_id.split('/')[1][4:]
try: shutil.rmtree(xdir) except: pass return # We now have a list with successfully converted (PNG) images extracted_image_data = [] for tex_file in tex_files: # Extract images, captions and labels partly_extracted_image_data = extract_captions(tex_file, xdir, img_files) if not partly_extracted_image_data == []: # Add proper filepaths and do various cleaning cleaned_image_data = prepare_image_data(partly_extracted_image_data, tex_file, converted_images) # Using prev. extracted info, get contexts for each image found extracted_image_data.extend((extract_context(tex_file, cleaned_image_data))) extracted_image_data = remove_dups(extracted_image_data) fid = 1 source2target = {} for item in extracted_image_data: if not os.path.exists(item[0]) or not item[0].strip(): continue fig_data = {} if arx_id.find("arXiv") > -1: figure_id = "arxiv%s_f%s" % (arx_id.replace("arXiv:", ""), fid) subdir = arx_id.replace("arXiv:", "").split(".")[0] eprdir = arx_id.replace("arXiv:", "").split(".")[1] else: figure_id = "%s_f%s" % (arx_id.replace("/", "_"), fid) subdir = arx_id.split("/")[1][:4] eprdir = arx_id.split("/")[1][4:]