def main():
    """Command-line entry point: extract article metadata from a PDF and split it.

    After parsing the command line, one of two paths is taken:

    * With ``-i/--input-file``, the start and end PDF page values are read
      from that CSV through ``journaltools.importcsv``.
    * Otherwise the PDF text is fetched with ``journaltools.getpdf``, run
      through ``processpdfnew``, and the resulting fields are handed to
      ``journaltools.exportcsvnew``.

    The source PDF is then split with ``journaltools.splitpdf`` unless
    ``--test`` or ``--write-csv-only`` was supplied.
    """
    cli = argparse.ArgumentParser(
        description=(
            'Function to extract metadata and split a PDF containing multiple journal articles. Set up for '
            'Buffalo Law Review book reviews.'
        ),
    )

    # Each entry is (flags, keyword settings); registration order is kept so
    # the generated --help output lists the options in the same sequence.
    option_table = (
        (('filename',), {
            'help': 'PDF file to analyze and split.',
            'type': str,
        }),
        (('-v', '--verbose'), {
            'action': 'store_true',
            'dest': 'verbose',
            'help': 'Print status messages.',
        }),
        (('-t', '--test'), {
            'action': 'store_true',
            'help': "Test only. Don't output any files. Use with debug options to see test output.",
        }),
        (('--write-csv-only',), {
            'action': 'store_true',
            'dest': 'csvOnly',
            'help': "Write CSV file, but don't split PDFs. Test flag takes precedence.",
        }),
        (('-d', '--debug'), {
            'dest': 'debug',
            'type': int,
            'help': "Set debug level (1-6). Levels 1-4 offer increasing levels of output. Level 5 displays "
                    "the PDF text. Level 6 prints all of the records.",
            'default': 0,
        }),
        (('-o', '--output-file'), {
            'dest': 'destination',
            'type': str,
            'help': 'Use supplied filename as filename template for output files.',
        }),
        (('-i', '--input-file'), {
            'dest': 'input_file',
            'type': str,
            'help': "Import CSV file to be used for PDF splitting. Must be in same format as export.",
        }),
    )
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)

    args = cli.parse_args()

    # The output filename template is the -o value when given, otherwise the
    # input PDF name; either way the extension is stripped before it is
    # passed on to the journaltools functions.
    template_source = args.destination or args.filename
    output_file = os.path.splitext(template_source)[0]

    if not args.input_file:
        # No CSV supplied: fetch the page text from the PDF and process it.
        ocr_text = journaltools.getpdf(args.filename, 0, args.verbose, args.debug)
        titles, first_pages, pdf_starts, pdf_ends, authors = processpdfnew(
            args.verbose, args.debug, ocr_text)

        # The export lives on this path only: title, start page and author
        # are not available when the page values come from an imported CSV.
        # The test flag is forwarded so exportcsvnew can act on it.
        journaltools.exportcsvnew(
            output_file, args.verbose, args.debug, args.test,
            titles, first_pages, pdf_starts, pdf_ends, authors)
    else:
        # A CSV was supplied: take the split points from it.
        pdf_starts, pdf_ends = journaltools.importcsv(args.input_file, args.debug)

    # Split the original PDF into one document per piece, unless this is a
    # test run or only the CSV was requested.
    if not (args.test or args.csvOnly):
        journaltools.splitpdf(
            args.filename, args.verbose, args.debug,
            pdf_starts, pdf_ends, output_file)
) parser.add_argument('-i', '--input-file', dest='input_file', type=str, help="Import CSV file to be used for PDF splitting. Must be in same format as export. " "Default is filename with .csv extension.") args = parser.parse_args() # Split filename from extension before passing to the various functions. Use input filename for template # if no output filename specified. if args.destination: output_file, output_extension = os.path.splitext(args.destination) else: output_file, output_extension = os.path.splitext(args.filename) # Set input CSV filename. If no filename provided, use the input filename with CSV extension. if args.input_file: input_file = args.input_file else: input_file, input_extension = os.path.splitext(args.filename) input_file = input_file + '.csv' # Read CSVfile and get starting and ending PDF pages to pass to splitpdf if os.path.exists(input_file): start_pdf_page, end_pdf_page = journaltools.importcsv(input_file, args.debug) # Split Original PDF into separate documents for each piece, unless test flag is set if not args.test: journaltools.splitpdf(args.filename, args.verbose, args.debug, start_pdf_page, end_pdf_page, output_file) else: print(f'{input_file} not present. Please specify a valid CSV file to use for the split points.')