def _page_files(suffix):
    """Return the ``page*<suffix>`` files of the current directory in natural order.

    ``os.listdir`` yields entries in arbitrary order, so the names are sorted
    explicitly. Digit runs are compared numerically so that ``page2`` sorts
    before ``page10``.
    """
    import re

    def natural_key(name):
        # re.split with a capturing group alternates text / digit runs, so
        # odd indices are always the digit runs and can be compared as ints.
        parts = re.split(r"(\d+)", name)
        return [int(part) if index % 2 else part for index, part in enumerate(parts)]

    matching = [
        entry
        for entry in os.listdir('.')
        if os.path.isfile(entry)
        and entry.startswith('page')
        and entry.endswith(suffix)
    ]
    return sorted(matching, key=natural_key)


def ocr_pdf_merge():
    """OCR all ``page*.jpg`` files in the current directory and merge the PDFs.

    Every matching JPEG is passed to ``tesseract_ocr`` and deleted afterwards.
    All ``page*.pdf`` files then present in the current directory (presumably
    the ones produced by ``tesseract_ocr`` -- confirm against that helper) are
    merged into ``ocred_document.pdf`` via ``pdf_merge`` with ``delete=True``.

    Files are processed in natural name order so the page order of the merged
    document is deterministic instead of depending on directory listing order.
    """
    for jpg_file in _page_files('.jpg'):
        tesseract_ocr(jpg_file)
        os.remove(jpg_file)

    pdf_merge(_page_files('.pdf'), 'ocred_document.pdf', delete=True)
) # input parser.add_argument( "inputs", type=str, default=None, nargs="+", help="list of input files" ) # output parser.add_argument( "-o", "--output", type=str, default=None, help="filename of the output file", required=True, ) # delete parser.add_argument( "-d", "--delete", action="store_true", help="delete input files after merge", ) return parser.parse_args(args) if __name__ == "__main__": args = process_arguments(sys.argv[1:]) pdf_merge(args.inputs, args.output, args.delete)
def _save_and_merge_figures(figures, figure_path):
    """Save each figure as PDF and SVG, then merge the PDFs into ``merged.pdf``.

    Args:
        figures: Sequence of ``(figure, name)`` pairs; each figure must offer
            a ``savefig(path)`` method.
        figure_path: Existing directory that receives ``<name>.pdf``,
            ``<name>.svg`` and ``merged.pdf``.
    """
    pdf_paths = []
    for fig, name in figures:
        pdf_path = path.join(figure_path, name + ".pdf")
        fig.savefig(pdf_path)
        fig.savefig(path.join(figure_path, name + ".svg"))
        pdf_paths.append(pdf_path)
    plt.close("all")

    merged_path = path.join(figure_path, "merged.pdf")
    # Remove any existing merged.pdf before the merge writes a new one.
    if path.exists(merged_path):
        remove(merged_path)
    pdftools.pdf_merge(pdf_paths, merged_path)


def main():
    """Create the report figures for incremental results and memory benchmarks.

    For every language directory below ``DATA_DIR/results/incremental`` and
    every result file in it, the timing plots are written to
    ``FIGURES_DIR/incremental/<language>/<csv basename>``. For every language
    directory below ``DATA_DIR/memoryBenchmarks``, the memory plots are built
    from ``batch.csv`` and ``incremental.csv`` and written to
    ``FIGURES_DIR/memoryBenchmarks/<language>``. Each figure is saved as PDF
    and SVG, and the PDFs of one directory are merged into ``merged.pdf``.

    A section is silently skipped when its input directory does not exist.
    """
    incremental_results_dir = path.join(DATA_DIR, "results", "incremental")
    if path.isdir(incremental_results_dir):
        print("Creating plots for incremental...")
        for language in scandir(incremental_results_dir):
            print(f" {language.name}")
            for result_file in scandir(language.path):
                # File name without its last extension.
                csv_basename = ".".join(result_file.name.split(".")[:-1])
                print(f" {csv_basename}")

                result_data = read_csv(result_file.path)

                # Check for data before touching the first row, so an empty
                # result file is skipped instead of raising IndexError.
                has_data = any(
                    any(row[parser_type] for parser_type in parser_types())
                    for row in result_data
                )
                if not has_data:
                    print(" No data found, skipping")
                    continue

                # NOTE(review): the first row's "Added" value is blanked,
                # presumably because the first version has no predecessor to
                # compare against -- confirm.
                result_data[0]["Added"] = None
                result_data_except_first = result_data[1:]
                is_implode = "implode" in csv_basename

                figure_path = path.join(
                    FIGURES_DIR, "incremental", language.name, csv_basename
                )
                makedirs(figure_path, exist_ok=True)

                figures = [
                    (
                        plot_times(result_data, parser_types(True, is_implode)),
                        "report",
                    ),
                    (
                        plot_times(
                            result_data_except_first,
                            parser_types(False, is_implode),
                        ),
                        "report-except-first",
                    ),
                    (
                        plot_times_vs_changes(
                            result_data_except_first, "bytes", "Added", "Removed"
                        ),
                        "report-time-vs-bytes",
                    ),
                    (
                        plot_times_vs_changes(
                            result_data_except_first, "chunks", "Changes"
                        ),
                        "report-time-vs-changes",
                    ),
                    (
                        plot_times_vs_changes_3d(result_data_except_first),
                        "report-time-vs-changes-3D",
                    ),
                ]
                _save_and_merge_figures(figures, figure_path)

    memory_benchmarks_dir = path.join(DATA_DIR, "memoryBenchmarks")
    if path.isdir(memory_benchmarks_dir):
        print("Creating plots for memory benchmarks...")
        for language in scandir(memory_benchmarks_dir):
            print(f" {language.name}")

            result_data_batch = read_csv(path.join(language.path, "batch.csv"))
            result_data_incremental = read_csv(
                path.join(language.path, "incremental.csv")
            )

            figure_path = path.join(FIGURES_DIR, "memoryBenchmarks", language.name)
            makedirs(figure_path, exist_ok=True)

            figures = [
                (
                    plot_memory_batch(result_data_batch, "incl"),
                    "report-allocations-batch",
                ),
                (
                    plot_memory_batch(result_data_batch, "excl"),
                    "report-cache-size-batch",
                ),
                (
                    plot_memory_incremental(result_data_incremental, "incl"),
                    "report-allocations-incremental",
                ),
                (
                    plot_memory_incremental(result_data_incremental, "excl"),
                    "report-cache-size-incremental",
                ),
            ]
            _save_and_merge_figures(figures, figure_path)
parser = argparse.ArgumentParser( parents=[parentparser], description="Merge the pages of multiple input files in one output file.") # input parser.add_argument('inputs', type=str, default=None, nargs='+', help='list of input files') # output parser.add_argument('-o', '--output', type=str, default=None, help='filename of the output file', required=True) # delete parser.add_argument('-d', '--delete', action='store_true', help='delete input files after merge') return parser.parse_args(args) if __name__ == "__main__": args = process_arguments(sys.argv[1:]) pdf_merge(args.inputs, args.output, args.delete)