def sanitise_files():
    rtitle = 'QGMJ'
    rtype = 'WELCOM'
    ref = pd.read_excel(
        'C:/Users/andraszeka/OneDrive - ITP (Queensland Government)/gsq-boreholes/investigations/QDEX_metada_export.xlsx',
        dtype={'REPNO': int})
    # flag reports whose title or type matches an excluded pattern
    bad = ref.loc[ref.RTITLE.str.contains(rtitle) | ref.RTYPE.str.contains(rtype)]
    bad_docids = bad.REPNO.values
    removed = []
    ids = paths.get_files_from_path('restructpageinfo')
    lines_docs = paths.get_files_from_path('restructpageinfo', get_file_paths=True)
    for id, lines_doc in zip(ids, lines_docs):
        docid, filenum = id[0], id[1]
        if docid in bad_docids:
            # move the excluded file out of the training set
            if not os.path.exists('nottraining/restructpageinfo/'):
                os.makedirs('nottraining/restructpageinfo/')
            os.rename(lines_doc,
                      paths.get_restructpageinfo_file(docid, local_path=True,
                                                      training=False, file_num=filenum))
            removed.append([docid, filenum])
    print("Removed: ", len(removed), ", ", removed)
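# A minimal, self-contained sketch (not part of the repo) of the metadata
# filter above, assuming only that the spreadsheet has REPNO/RTITLE/RTYPE
# columns; the tiny in-memory frame and the helper name are illustrative.
def _example_bad_docids():
    import pandas as pd
    ref = pd.DataFrame({
        'REPNO': [1, 2, 3],
        'RTITLE': ['QGMJ annual review', 'Drilling report', 'Seismic survey'],
        'RTYPE': ['JOURNAL', 'WELCOM', 'SURVEY'],
    })
    # a row matches if either its title or its type contains an excluded pattern
    bad = ref.loc[ref.RTITLE.str.contains('QGMJ') | ref.RTYPE.str.contains('WELCOM')]
    return bad.REPNO.values  # -> array([1, 2])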
def create_individual_dataset(docid):
    pageinfo = paths.get_restructpageinfo_file(docid)
    pi = json.load(open(pageinfo))
    df = pd.DataFrame(columns=columns)
    write_to_dataset(df, pi, docid)
    # min-max normalise the Centrality feature to the [0, 1] range
    unnormed = np.array(df['Centrality'])
    normalized = (unnormed - min(unnormed)) / (max(unnormed) - min(unnormed))
    df['Centrality'] = normalized
    return df
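# A hedged variant (an assumption, not the repo's code): the min-max
# normalisation above divides by (max - min), which is zero when every
# Centrality value is equal. This sketch guards that edge case; the helper
# name is hypothetical.
def _safe_minmax(values):
    import numpy as np
    values = np.asarray(values, dtype=float)
    span = values.max() - values.min()
    if span == 0:
        return np.zeros_like(values)  # constant column: map everything to 0
    return (values - values.min()) / span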
def clean_and_restruct(docid, save=True, training=True, report_num=1):
    json_file = paths.get_full_json_file(docid, training=training, file_num=report_num)
    with open(json_file, 'r') as file:
        json_doc = json.load(file)
    json_res = json2res(json_doc)
    pagelineinfo = get_pagelineinfo_map(json_res)  # takes the json response
    clean_page = get_clean_page(pagelineinfo, docid)
    restructpageinfo = get_restructpagelines(clean_page)
    if save:
        fp = paths.get_restructpageinfo_file(docid, training=training, file_num=report_num)
        p = fp.rsplit('/', 1)[0]
        if not os.path.exists(p):
            os.makedirs(p)
        with open(fp, "w") as o:  # close the file handle after writing
            json.dump(restructpageinfo, o)
    else:
        return restructpageinfo
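# A self-contained sketch of the save pattern used above (function name and
# paths are hypothetical): ensure the parent directory exists, then dump the
# restructured page info as JSON.
def _save_json(obj, fp):
    import json
    import os
    os.makedirs(os.path.dirname(fp) or '.', exist_ok=True)
    with open(fp, 'w') as out:
        json.dump(obj, out)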
def display_doc(docid):  # doc has to be pageinfo type - made for restructpageinfo
    report_path = paths.get_report_name(docid, local_path=True, file_extension=True)
    images = convert_from_path(report_path)
    docfile = open(paths.get_restructpageinfo_file(docid), "r")
    doc = json.load(docfile)
    drawn_images = []
    # Create images showing a bounding box around each detected line of text
    for page in doc.items():
        i = int(page[0]) - 1  # page keys are 1-based strings
        image = images[i]
        width, height = image.size
        draw = ImageDraw.Draw(image)
        for line in page[1]:
            # scale the normalised bounding box to pixel coordinates
            box = line['BoundingBox']
            left = width * box['Left']
            top = height * box['Top']
            draw.rectangle([left, top,
                            left + (width * box['Width']),
                            top + (height * box['Height'])],
                           outline='green')
        drawn_images.append(image)
    save_path = paths.result_path + docid + '_boxed.pdf'
    # create the destination directory, not a directory at the file path itself
    save_dir = save_path.rsplit('/', 1)[0]
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    drawn_images[0].save(save_path, save_all=True, append_images=drawn_images[1:])
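# A minimal sketch of the coordinate conversion display_doc relies on:
# Textract-style bounding boxes are normalised to [0, 1], so each edge is
# scaled by the page's pixel size. The helper name is hypothetical.
def _box_to_pixels(box, width, height):
    left = width * box['Left']
    top = height * box['Top']
    right = left + width * box['Width']
    bottom = top + height * box['Height']
    return left, top, right, bottom  # the rectangle drawn in green above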
def display_page(docid, page, line=None, mode=paths.dataset_version):
    # Display a report page; with the line option, draw a box around that line
    # (looked up by number in the restructured page info) and crop the page to
    # about a third of its height so the view focuses on the line.
    pg_path = paths.get_report_page_path(int(docid), int(page))
    image = Image.open(pg_path)
    width, height = image.size
    if line:
        draw = ImageDraw.Draw(image, 'RGBA')
        # draw parallel lines down the page edges
        draw.line([(1, 1), (1, height - 3)], fill="blue", width=3)
        draw.line([(width - 3, 1), (width - 3, height - 3)], fill="blue", width=3)
        docinfofile = paths.get_restructpageinfo_file(docid)
        docinfo = json.load(open(docinfofile, "r"))
        pageinfo = docinfo[str(page)]
        lineinfo = pageinfo[int(line) - 1]  # -1 because linenum starts from 1
        box = lineinfo['BoundingBox']
        ln_left = width * box['Left']
        ln_top = height * box['Top']
        crop_height = height / 3
        left = 0
        right = width
        # box['Height'] is normalised, so scale it to pixels; bottom > top
        # because of the image coordinate system
        top = ln_top - (height * box['Height']) - (crop_height / 2)
        bottom = ln_top + (crop_height / 2)
        if top < 0:
            # top is out of bounds: clamp it to 0 and extend the bottom by the
            # (negative) overshoot
            change = top
            top = 0
            bottom -= change
            draw.line([(1, 1), (width - 3, 1)], fill="blue", width=3)
        elif bottom > height:
            change = bottom - height
            bottom = height
            top -= change
            draw.line([(1, height - 3), (width - 3, height - 3)], fill="blue", width=3)
        draw.rectangle([ln_left, ln_top,
                        ln_left + (width * box['Width']),
                        ln_top + (height * box['Height'])],
                       outline='green', width=2)
        image = image.crop((left, top, right, bottom))
    display.display(image)
    print(pg_path)
    if line:
        print("line: ", line)
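# A standalone sketch of the crop-window clamping in display_page: keep a
# fixed-height window around the line, shifting it back inside the page when
# it overruns either edge. Hypothetical helper, same arithmetic as above.
def _clamp_window(top, bottom, page_height):
    if top < 0:
        bottom -= top  # top is negative here, so this extends the bottom
        top = 0
    elif bottom > page_height:
        top -= bottom - page_height
        bottom = page_height
    return top, bottom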
def get_doc_info(self):
    pageinfo = paths.get_restructpageinfo_file(self.docid, file_num=self.filenum)
    pi = json.load(open(pageinfo, "r"))
    return pi
            except DecompressionBombError as e:
                print(e)
                continue
            textract_end = time.time()
            textract_time = textract_end - textract_start
            print("Time to textract: " + str(docid) + "_" + str(num) + " " +
                  "{0:.2f}".format(textract_time) + " seconds")
        else:
            print("Report ", docid, "_", str(num), " already textracted")
            textract_time = 0
        # check if clean and restruct needs to be run or if restructpageinfo already exists
        if (not os.path.exists(paths.get_restructpageinfo_file(docid, training=training, file_num=num))
                and (not args.force)):
            texttransforming.clean_and_restruct(docid, save=True, training=training, report_num=num)
        else:
            print("Report ", docid, "_", str(num), " already cleaned and reconstructed")
        if special_mode == 'welcom':
            # copy json, tables, kvpairs, to the extra folder
            jsonsrc = paths.get_full_json_file(docid, training=training, file_num=num)
            jsondest = paths.get_full_json_file(
print('Nums: ', nums)
for num in nums:
    if not (os.path.exists(paths.get_full_json_file(docid, file_num=num))) and (not args.force):
        textract_start = time.time()
        try:
            textract(docid, features=['TABLES'], report_num=num)
        except FileNotFoundError as e:
            # print("Report file", docid, "_", str(num), "doesn't exist in S3")
            print(e)
            continue
        except TextBasedFileException as e:
            print(e)
            continue
        textract_end = time.time()
        textract_time = textract_end - textract_start
        print("Time to textract: " + str(docid) + "_" + str(num) + " " +
              "{0:.2f}".format(textract_time) + " seconds")
    else:
        print("Report ", docid, "_", str(num), " already textracted")
    # check if clean and restruct needs to be run or if restructpageinfo already exists
    if (not os.path.exists(paths.get_restructpageinfo_file(docid, file_num=num))
            and (not args.force)):
        texttransforming.clean_and_restruct(docid, save=True, report_num=num)
    else:
        print("Report ", docid, "_", str(num), " already cleaned and reconstructed")
cont = input("Run again?")
if 'n' in cont:
    not_exit = False
else:
    new_args = input("Enter new args: ")
    args = parser.parse_args(new_args.split())
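# A self-contained sketch of the skip-unless-forced pattern in the loop above,
# with assumed flag names. Note the loop gates on (not args.force), so --force
# there skips the work; this sketch shows the more common convention where
# --force re-runs regardless of existing output.
import argparse
import os

def _needs_run(output_path, force):
    # re-run when the output is missing, or whenever --force is given
    return force or not os.path.exists(output_path)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--force', action='store_true')
    args = parser.parse_args()
    print(_needs_run('out/12345_1.json', args.force))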