def process_info(app_context, base_dir):
    """Run tesseract OCR over every input file and collect the results.

    For each file listed in ``app_context.application_context`` the stored
    JSON is loaded, OCR configuration (level + language) is derived, and the
    file is preprocessed. Per-file results (tagged with a 200 status) are
    written back to ``application_context["outputs"]``.

    Args:
        app_context: wrapper object exposing ``application_context`` (dict).
        base_dir: directory used to resolve each file's JSON on disk.

    Returns:
        The updated ``application_context`` dict on success, ``None`` on any
        failure (the exception is logged, not re-raised).
    """
    try:
        files = get_files(app_context.application_context)
        output = []
        for file_new in files:
            start_time = time.time()
            # NOTE(review): argument order (name, base_dir) is the reverse of
            # process_input's get_json(base_dir, name) — confirm which the
            # helper actually expects.
            file = get_json(file_new['file']['name'], base_dir)[0]
            file_properties = File(file)
            ocr_level, lang = get_ocr_config(file_new,
                                             file_properties.get_pages())
            file = preprocess_file(file_properties, lang, ocr_level)
            # Carry the original file/config metadata through to the output.
            file['file'] = file_new['file']
            file['config'] = file_new['config']
            file['status'] = {
                'code': 200,
                'message': "tesseract ocr successful"
            }
            output.append(file)
            end_time = time.time()
            # Guard against zero-page documents so the timing math never
            # raises (a ZeroDivisionError here used to abort the whole batch
            # via the broad except below).
            page_count = max(len(file_properties.get_pages()), 1)
            extraction_time = (end_time - start_time) / page_count
            log_info(
                'tesseract ocr per page completed in {}'.format(
                    extraction_time),
                app_context.application_context)
        app_context.application_context["outputs"] = output
        log_info("successfully completed tesseract ocr", None)
    except Exception as e:
        log_exception("Error occured during tesseract ocr ",
                      app_context.application_context, e)
        return None
    return app_context.application_context
def process_input(app_context, base_dir):
    """Run google-vision OCR text extraction over every input file.

    Each file's JSON is loaded from *base_dir*; page images are taken from
    the file itself when ``page_info`` is present, otherwise they are
    generated via ``doc_pre_processing``. Extraction results are written to
    ``application_context["outputs"]`` and the detected language of each
    file is collected.

    Args:
        app_context: wrapper object exposing ``application_context`` (dict).
        base_dir: directory used to resolve each file's JSON on disk.

    Returns:
        ``(application_context, langs)`` on success, ``(None, None)`` on any
        failure (the exception is logged, not re-raised).
    """
    try:
        files = get_files(app_context.application_context)
        output_files = []
        langs = []
        for file in files:
            file = get_json(base_dir, file['file']['name'])[0]
            file_properties = File(file)
            if "page_info" in file:
                # Pages were already rendered upstream; reuse them.
                page_paths = file_properties.get_pages()
            else:
                # No page info yet: render page images from the raw document.
                page_paths = doc_pre_processing(file['file']['name'],
                                                config.BASE_DIR)
            page_res = text_extraction(file_properties, page_paths, file)
            output_files.append(page_res)
            langs.append(file_properties.get_language())
        app_context.application_context["outputs"] = output_files
        log_info("successfully completed google vision ocr", None)
    except Exception as e:
        log_exception("Error occured during google vision ocr",
                      app_context.application_context, e)
        return None, None
    return app_context.application_context, langs
def get_segmented_regions(app_context, base_dir):
    """Attach (currently empty) font metadata to every page of every file.

    The original region-segmentation / region-unification steps are disabled
    in this build; the loop now only sets empty font properties per page and
    tags each file's output with a success status in
    ``application_context["outputs"]``.

    Args:
        app_context: wrapper object exposing ``application_context`` (dict).
        base_dir: directory used to resolve each file's JSON on disk.

    Returns:
        The updated ``application_context`` dict on success, ``None`` on any
        failure (the exception is logged, not re-raised).
    """
    try:
        files = get_files(app_context.application_context)
        output = []
        for index, file in enumerate(files):
            # NOTE(review): sibling functions subscript get_json(...)[0];
            # this one does not — confirm which form File() expects.
            file = get_json(base_dir, file['file']['name'])
            file_properties = File(file)
            pages = file_properties.get_pages()
            page_count = len(pages)
            start_time = time.time()
            for page_index in range(page_count):
                print('processing for page : ', page_index)
                # Line/word/region extraction and region unification are
                # disabled; only empty font metadata is attached per page.
                file_properties.set_font_properties(page_index, [])
            output.append(file_properties.get_file())
            output[index]['status'] = {'message': "block-segmenter successful"}
            end_time = time.time()
            # Guard against zero-page documents so the timing math never
            # raises (a ZeroDivisionError here used to abort the whole batch
            # via the broad except below).
            extraction_time = (end_time - start_time) / max(page_count, 1)
            log_info('block segmentation per page completed in {}'.format(
                extraction_time), app_context.application_context)
        app_context.application_context["outputs"] = output
        log_info("successfully completed block segmentation", None)
    except Exception as e:
        log_exception("Error occured during block segmentation ",
                      app_context.application_context, e)
        return None
    return app_context.application_context
def get_layout(app_context):
    """Run prima-layout detection over every page of every input file.

    For each page, line coordinates are extracted and fed to
    ``primalaynet.predict_primanet``; the detected regions are stored on the
    page, and the per-file results (with status) are written back to
    ``application_context["outputs"]``.

    Args:
        app_context: wrapper object exposing ``application_context`` (dict).

    Returns:
        The updated ``application_context`` dict on success, ``None`` on any
        failure (the exception is logged, not re-raised).
    """
    try:
        files = get_files(app_context.application_context)
        output = []
        for index, file_new in enumerate(files):
            file = get_json(file_new['file']['name'])[0]
            file_properties = File(file)
            page_paths = file_properties.get_pages()
            start_time = time.time()
            for idx, page_path in enumerate(page_paths):
                page_lines = file_properties.get_lines(idx)
                line_coords = get_coord(page_lines)
                # Normalize to a path rooted at the shared 'upload/' dir so
                # the model sees a consistent relative location.
                page_path = 'upload/' + page_path.split('upload/')[-1]
                if torch.cuda.is_available():
                    torch.cuda.device(0)
                    print("*******cuda available")
                    # Free cached GPU memory before each prediction; the
                    # sleep gives the allocator a moment to settle.
                    torch.cuda.empty_cache()
                    time.sleep(1)
                regions = primalaynet.predict_primanet(page_path, line_coords)
                file['pages'][idx]["regions"] = regions
            # Carry the original file/config metadata through to the output.
            file['file'] = file_new['file']
            file['config'] = file_new['config']
            file['status'] = {'message': "layout-detector successful"}
            output.append(file)
            end_time = time.time()
            # Guard against zero-page documents so the timing math never
            # raises (a ZeroDivisionError here used to abort the whole batch
            # via the broad except below).
            extraction_time = (end_time - start_time) / max(len(page_paths), 1)
            log_info(
                'Layout detection per page completed in {}'.format(
                    extraction_time),
                app_context.application_context)
        app_context.application_context["outputs"] = output
        log_info("successfully completed layout detection", None)
    except Exception as e:
        log_exception("Error occured during prima layout detection ",
                      app_context.application_context, e)
        return None
    return app_context.application_context