def extract_to_text(i):
    """Return the filtered text of paper ``i``, converting its PDF if needed.

    The text file at ``get_path(i, ".txt")`` is used as a cache: when it does
    not exist yet, the paper is obtained from ``get_past_paper(i)``, written
    to ``get_path(i, ".pdf")`` and converted with ``pdf2txt``. The text is
    then run through ``filter_mock_paper_text`` and the filtered result is
    written back over the text file.

    Args:
        i: Identifier of the paper; forwarded unchanged to ``get_path`` and
            ``get_past_paper`` (presumably an index -- confirm with callers).

    Returns:
        The filtered text, as returned by ``filter_mock_paper_text``.
    """
    path = get_path()
    pdf_file_path = get_path(i, ".pdf")
    txt_file_path = get_path(i, ".txt")

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(path, exist_ok=True)

    if not os.path.exists(txt_file_path):
        mock_test_file = get_past_paper(i)
        with open(pdf_file_path, "wb") as f:
            f.write(mock_test_file)
        try:
            # Pass argv as a list: formatting a command string and splitting
            # it on spaces breaks as soon as either path contains a space.
            pdf2txt.main(["-o", txt_file_path, pdf_file_path])
        finally:
            # The PDF is only an intermediate; drop it even if conversion
            # fails so a broken download is not left behind.
            os.remove(pdf_file_path)

    with open(txt_file_path, "r", encoding="utf-8") as f:
        clean_mock_paper_text = filter_mock_paper_text(f.read())

    # Written as bytes so the stored newlines are exactly those in the text.
    with open(txt_file_path, "wb") as f:
        f.write(clean_mock_paper_text.encode("utf-8"))

    return clean_mock_paper_text
def run(datapath, filename, options=None):
    """Run ``pdf2txt`` on one PDF and write the result next to ``path``.

    The input is ``path + datapath + filename + '.pdf'`` and the output is
    ``path + filename + '.txt'``, where ``path`` is the module-level base
    path (plain string concatenation, so separators must already be present).

    Args:
        datapath: Sub-path, relative to ``path``, that holds the PDF.
        filename: File name without extension.
        options: Optional string of extra ``pdf2txt`` command-line options,
            separated by whitespace (e.g. ``'-t html'``).
    """
    input_path = path + datapath + filename + '.pdf'
    output_path = path + filename + '.txt'

    # Build argv as a list instead of formatting and splitting a command
    # string, so paths that contain spaces stay a single argument.
    args = ['-o', output_path]
    if options:
        # split() without a separator ignores repeated/leading whitespace,
        # which would otherwise produce empty arguments.
        args.extend(options.split())
    args.append(input_path)

    pdf2txt.main(args)
def run(datapath, filename, options=None):
    """Convert a PDF under the module-level ``path`` to text via ``pdf2txt``.

    Reads ``path + datapath + filename + '.pdf'`` and writes
    ``path + filename + '.txt'``. The pieces are joined by plain string
    concatenation, exactly as given.

    Args:
        datapath: Location of the PDF relative to ``path``.
        filename: Base name of the PDF, without the ``.pdf`` suffix.
        options: Optional whitespace-separated string of additional
            ``pdf2txt`` options, inserted before the input file.
    """
    pdf_path = path + datapath + filename + '.pdf'
    txt_path = path + filename + '.txt'

    # Whitespace-split only the options string; the two paths are passed as
    # whole arguments so a space inside either of them cannot break argv.
    extra_args = options.split() if options else []
    pdf2txt.main(['-o', txt_path, *extra_args, pdf_path])
def run(sample_path, options=None):
    """Run ``pdf2txt`` on a sample file, discarding the output.

    The sample path is resolved with ``absolute_sample_path`` and the output
    is directed to a ``NamedTemporaryFile`` that is deleted when the call
    finishes, so only the fact that conversion completes is observable.

    Args:
        sample_path: Path of the sample, as accepted by
            ``absolute_sample_path``.
        options: Optional whitespace-separated string of extra ``pdf2txt``
            command-line options.
    """
    absolute_path = absolute_sample_path(sample_path)
    with NamedTemporaryFile() as output_file:
        # Assemble argv directly: formatting a command line and splitting it
        # on spaces would tear apart temp or sample paths containing spaces.
        args = ['-o', output_file.name]
        if options:
            args.extend(options.split())
        args.append(absolute_path)
        pdf2txt.main(args)
def extract_images(self, input_file):
    """Extract the images of ``input_file`` and return their file names.

    ``pdf2txt`` is run with ``--output-dir`` pointing at a fresh temporary
    directory; the text output goes to a throwaway temporary file. The
    directory is listed and then removed, so the returned names no longer
    refer to existing files.

    Args:
        input_file: Path of the PDF to process.

    Returns:
        The entries of the image output directory, as given by
        ``os.listdir`` (names only, in arbitrary order).
    """
    output_dir = mkdtemp()
    try:
        with NamedTemporaryFile() as output_file:
            commands = [
                '-o', output_file.name,
                '--output-dir', output_dir,
                input_file,
            ]
            pdf2txt.main(commands)
        return os.listdir(output_dir)
    finally:
        # Always remove the temporary directory, including when pdf2txt
        # raises; previously a failure leaked the directory.
        rmtree(output_dir)
) mock_test_response = session.get(pdf_mock_test_url) # stream=True mock_test_response.raise_for_status() mock_test_file = io.BytesIO(mock_test_response.content) # The pdfReader variable is a readable object that will be parsed path = os.path.dirname(os.path.abspath(__file__)) + '/tmp/' file_name = "tmp_{}".format(i) pdf_file_path = path + file_name+".pdf" txt_file_path = path + file_name+".txt" if not os.path.exists(path): os.makedirs(path) with open(pdf_file_path, "wb") as f: f.write(mock_test_response.content) s = 'pdf2txt -o%s %s' % (txt_file_path, pdf_file_path) pdf2txt.main(s.split(' ')[1:]) os.remove(pdf_file_path) with open(txt_file_path, "r+", encoding="utf-8") as f: tmp_text = f.read() clean_mock_paper_text = filter_mock_paper_text(tmp_text) f.write(clean_mock_paper_text) mock_test_pattern = r"(.*)(Answers\s?:)(.*)" mock_test_regex = re.compile(mock_test_pattern) g = mock_test_regex.findall(clean_mock_paper_text) questions_str = next(g.__iter__())[0] answers_str = next(g.__iter__())[2] # question_pattern = r"(\d+\.)(^(\d+\.))*" # questions_regex = re.compile(question_pattern)
i + 1) mock_test_response = session.get(pdf_mock_test_url) # stream=True mock_test_response.raise_for_status() mock_test_file = io.BytesIO(mock_test_response.content) # The pdfReader variable is a readable object that will be parsed path = os.path.dirname(os.path.abspath(__file__)) + '/tmp/' file_name = "tmp_{}".format(i) pdf_file_path = path + file_name + ".pdf" txt_file_path = path + file_name + ".txt" if not os.path.exists(path): os.makedirs(path) with open(pdf_file_path, "wb") as f: f.write(mock_test_response.content) s = 'pdf2txt -o%s %s' % (txt_file_path, pdf_file_path) pdf2txt.main(s.split(' ')[1:]) os.remove(pdf_file_path) with open(txt_file_path, "r+", encoding="utf-8") as f: tmp_text = f.read() clean_mock_paper_text = filter_mock_paper_text(tmp_text) f.write(clean_mock_paper_text) mock_test_pattern = r"(.*)(Answers\s?:)(.*)" mock_test_regex = re.compile(mock_test_pattern) g = mock_test_regex.findall(clean_mock_paper_text) questions_str = next(g.__iter__())[0] answers_str = next(g.__iter__())[2] # question_pattern = r"(\d+\.)(^(\d+\.))*" # questions_regex = re.compile(question_pattern)