def pdf_to_text(path_in, path_out):
    """Extract the text of a PDF into a text file using pdfminer.six.

    The conversion only runs while the ``use_pdf2txt`` property reads ``'0'``.
    When pdfminer.six cannot be imported, a notice is printed, the property is
    set to ``'1'`` and this function returns without writing anything.

    Args:
        path_in: Path of the PDF file to read.
        path_out: Path of the text file to create (opened in ``'w'`` mode).
    """
    if conf.get_prop("use_pdf2txt") != '0':
        return

    try:
        # Only the names that are actually used are imported, so an unrelated
        # missing submodule no longer forces the fallback.
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
        from pdfminer.pdfpage import PDFPage
        from pdfminer.converter import TextConverter
        from pdfminer.layout import LAParams
    except Exception:
        # Any import-time failure (missing package, incompatible install) means
        # the library is unusable; KeyboardInterrupt/SystemExit still propagate.
        print ("system doesn't have PDFminer.six library installed. Try to use pdf2txt.")
        conf.set_prop("use_pdf2txt", "1")
        return

    codec = 'utf-8'
    rsrcmgr = PDFResourceManager(caching=True)

    # NOTE(review): the output file is opened with the platform default
    # encoding, as before, although the converter is given codec='utf-8' --
    # confirm whether an explicit encoding is wanted here.
    with open(path_in, 'rb') as fp, open(path_out, 'w') as outfp:
        device = TextConverter(rsrcmgr, outfp, codec=codec,
                               laparams=LAParams(), imagewriter=None)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, set(), caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        finally:
            # Release the converter even when a page fails to process; the
            # enclosing ``with`` then closes both files.
            device.close()
def get_image_ids(swf_path):
    """Return the ids swfextract lists for the JPEG objects of an SWF file.

    Runs ``swfextract <swf_path>`` (the executable name is prefixed with the
    ``swftools_path`` property) and parses the comma-separated ids that follow
    the first ``)`` after the first occurrence of ``JPEG`` in its output.

    Args:
        swf_path: Path of the SWF file to inspect.

    Returns:
        A list of ints; empty when the output does not mention ``JPEG`` or
        the id list cannot be located.

    Raises:
        subprocess.CalledProcessError: if swfextract exits with a non-zero
            status.
        ValueError: if an entry of the id list is not a plain integer.
    """
    # gets the output of swftools
    raw_listing = subprocess.check_output(
        [conf.get_prop("swftools_path") + "swfextract", swf_path])
    # Decode the bytes instead of parsing their repr(), so line ends are real
    # newlines rather than the two characters backslash + 'n'.
    listing = raw_listing.decode("utf-8", "replace")

    # finds the string with the JPEG ids
    beginpoint = listing.find("JPEG")
    if beginpoint == -1:
        return []

    # finds start/end of the juicy bits
    paren = listing.find(")", beginpoint)
    if paren == -1:
        return []
    startpoint = paren + 2  # skip the ')' and the character after it
    endpoint = listing.find("\n", beginpoint)
    if endpoint == -1:
        # JPEG line is the last one and has no terminator
        endpoint = len(listing)
    ids_str = listing[startpoint:endpoint]

    # prints debug info
    print("starting point: " + str(startpoint))
    print("ending point: " + str(endpoint))
    print(ids_str)

    # int() tolerates surrounding whitespace (including a trailing '\r');
    # blank entries are skipped instead of raising.
    return [int(token) for token in ids_str.split(",") if token.strip()]
def extract_images(swf_path, out_dir):
    """Extract every JPEG object of an SWF file into ``out_dir``.

    Each id returned by ``get_image_ids(swf_path)`` is extracted with
    ``swfextract -j <id>`` and written to ``<out_dir>/<id>.jpg``.

    Args:
        swf_path: Path of the SWF file to read.
        out_dir: Directory the ``.jpg`` files are written to.

    Raises:
        OSError: if the swfextract executable cannot be launched.
    """
    swfextract = conf.get_prop("swftools_path") + "swfextract"

    # gets ids for images in SWF
    image_ids = get_image_ids(swf_path)

    # use swfextract to extract images based off id
    for image_id in image_ids:
        target = out_dir + "/" + str(image_id) + ".jpg"
        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact. The exit status is ignored, as before.
        subprocess.call(
            [swfextract, "-j", str(image_id), "-o", target, swf_path])
def extract_audio(swf_path, out_path):
    """Extract the audio of an SWF file with ``swfextract -m``.

    Args:
        swf_path: Path of the SWF file to read.
        out_path: Path of the file swfextract is told to write (``-o``).

    Raises:
        OSError: if the swfextract executable cannot be launched.
    """
    swfextract = conf.get_prop("swftools_path") + "swfextract"
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact. The exit status is ignored, as before.
    subprocess.call([swfextract, "-m", "-o", out_path, swf_path])
# Debug command line: with no arguments, list the numeric command codes;
# otherwise run the command whose code is given as the first argument, taking
# its parameters from the arguments that follow.
if len(sys.argv) == 1:
    for usage_line in (
        "0 = gets_swf.fetch_all",
        "1 = gets_swf.extract_audio",
        "2 = gets_swf.extract_audio_all",
        "3 = gets_swf.convert_pdf",
        "4 = conf.set_prop",
        "5 = conf.get_prop",
        "6 = conf.check_prop_exists",
        "7 = gets_swf.gets_image_IDs",
        "8 = extract_images",
        "9 = extract_images_all",
        "10 = gets_swf.get_linked_text",
        "11 = gets_swf.get_linked_text_all",
    ):
        print(usage_line)
else:
    # Handlers are lambdas so that attribute lookups and argument indexing
    # happen only for the command that is actually selected.
    handlers = {
        "0": lambda: gets_swf.fetch_all(sys.argv[2], sys.argv[3]),
        "1": lambda: gets_swf.extract_audio(sys.argv[2], sys.argv[3]),
        "2": lambda: gets_swf.extract_audio_all(sys.argv[2]),
        "3": lambda: gets_swf.convert_pdf(sys.argv[2], sys.argv[3]),
        "4": lambda: conf.set_prop(sys.argv[2], sys.argv[3]),
        "5": lambda: print(conf.get_prop(sys.argv[2])),
        "6": lambda: print(conf.check_prop_exist(sys.argv[2])),
        "7": lambda: print(gets_swf.get_image_ids(sys.argv[2])),
        "8": lambda: gets_swf.extract_images(sys.argv[2], sys.argv[3]),
        "9": lambda: gets_swf.extract_images_all(sys.argv[2]),
        "10": lambda: print(gets_swf.get_linked_text(sys.argv[2], sys.argv[3])),
        "11": lambda: gets_swf.get_linked_text_all(sys.argv[2]),
    }
    selected = handlers.get(sys.argv[1])
    # An unrecognised code is ignored silently.
    if selected is not None:
        selected()
# Make sure the output directory (arguments[1]) exists before anything is
# written into it.
if not os.path.exists(arguments[1]):
    # os.makedirs handles paths containing spaces and missing parent
    # directories, and does not pass the argument through a shell.
    os.makedirs(arguments[1])

# the code expects paths to end in '/'s so append them if needed
if not arguments[0].endswith("/"):
    arguments[0] = arguments[0] + "/"
if not arguments[1].endswith("/"):
    arguments[1] = arguments[1] + "/"

# set output_dir
conf.set_prop("output_dir", arguments[1])
kewUrl = arguments[0]

print("STEP 1: Acquiring KEW slides from server")
gets_swf.fetch_all(kewUrl, conf.get_prop("output_dir"))

print("STEP 2: Fetching linked text")
gets_swf.get_linked_text_all(conf.get_prop("output_dir"))

print("STEP 3: Extracting audio")
gets_swf.extract_audio_all(conf.get_prop("output_dir"))

print("STEP 4: Extracting images")
gets_swf.extract_images_all(conf.get_prop("output_dir"))

gets_swf.clean_up_all(conf.get_prop("output_dir"))

# Optional third argument "-z" selects the zip step.
if len(arguments) > 2:
    if arguments[2] == "-z":
        # remove '/' at end of string
        zip_dir = conf.get_prop("output_dir")
        if zip_dir.endswith("/"):
            zip_dir = zip_dir[:-1]
        # run zip on output_dir and delete the original directory