def processpdfnew(verbose, debug, pagetext):
    """Extract article metadata from the OCR text of an original-format issue.

    ``pagetext[0]`` is read as a table of contents: each entry is a short Roman-numeral label, a title and
    an optional parenthesised author string. Every later page is searched for a heading line that starts
    with a short Roman-numeral-like label and a period; the page index of each hit is recorded as an
    article's starting PDF page and as the previous article's ending PDF page.

    Args:
        verbose: If truthy, print a progress message before the page scan.
        debug: Integer debug level. Levels 1-6 switch on the diagnostic prints guarded inline below.
        pagetext: Sequence of page strings; index 0 is the contents page.

    Returns:
        A tuple ``(title, start_page, start_pdf_page, end_pdf_page, author)`` of lists indexed by article.
        ``start_page`` holds an empty string per title because original page numbers are not extracted
        here. Each ``author`` item is a list of four ``(first, middle, last, suffix)`` tuples, blank
        tuples filling unused slots. Page lists shorter than ``title`` are padded with ``0``.
    """
    title = []
    author = []
    start_page = []
    start_pdf_page = []
    end_pdf_page = []
    page_number = 0

    # Get titles and authors from first page--for original format publications.
    # An empty document has no contents page; treat it as blank text instead of raising IndexError.
    toc_text = pagetext[0] if pagetext else ''
    toc = re.findall(
        r'([IVXL]{1,4}\.?)\s([A-Za-z0-9., ]+)\s(\(([A-Za-z ]{5,})\))?', toc_text, re.DOTALL)

    # Clean up each title, then append title, authors and a blank start page for the record.
    for r, entry in enumerate(toc):
        if 0 < debug < 3:
            print(f'Record {r}, {entry}')
        temp_title = re.sub(r' {2,}', " ", entry[1].strip())
        temp_title = journaltools.capitalize_title(temp_title.title())
        title.append(temp_title)

        # entry[3] is the text inside the parentheses ('' when the entry has none). re.split always
        # returns at least one element, so an author list is produced for every title.
        find_author = re.sub(r' {2,}', ' ', entry[3])
        find_author = re.split(r' and ', find_author, maxsplit=2)
        author_list = []
        for count in range(4):
            # Slots beyond the names actually found raise IndexError and are stored as blank tuples.
            try:
                f_name, m_name, l_name, suffix = journaltools.splitname(find_author[count])
                author_temp = f_name, m_name, l_name, suffix
            except IndexError:
                author_temp = '', '', '', ''
            author_list.append(author_temp)
        author.append(author_list)
        start_page.append('')
        if 1 < debug < 5:
            print(f'{title[r]}, {author[r]}')

    # Step through the remaining pages looking for article headings in the OCR text.
    if verbose:
        print('Processing PDF pages')
    for page_number in range(1, len(pagetext)):
        if 0 < debug < 6:
            print('Processing PDF page number %d' % page_number)

        # look for an article title on this page
        title_parts = re.search(
            r'(?<=\n)([XIVHLlixv]{1,4}\.)\s([A-Za-z0-9.,*\- ]*)\s(?=\n)', pagetext[page_number], flags=0)
        if 1 < debug < 5 and title_parts:
            print('title parts: %s' % title_parts)

        # Original page numbers are not looked for because they did not come through in the OCR text.
        # A heading closes the previous article on this same page. That adds a garbage page to the
        # previous article when the new one starts at the top of the page, but there is no great way
        # to detect that case.
        if title_parts:
            # Only close an article if one has actually been opened; keying this on the page index
            # instead would shift every end page whenever the first heading is not on PDF page 1.
            if start_pdf_page:
                end_pdf_page.append(page_number)
            start_pdf_page.append(page_number)
        if 1 < debug < 5:
            print(f'PDF start pages: {start_pdf_page}')
            print(f'PDF end pages: {end_pdf_page}')

    # The last article found runs to the final page scanned.
    if start_pdf_page:
        end_pdf_page.append(page_number)

    # Compare all lists to make sure they contain the same number of items. Add empty items to short lists.
    if len(start_pdf_page) < len(title):
        start_pdf_page.extend([0] * (len(title) - len(start_pdf_page)))
        print('WARNING! Missing Start PDF Page(s)')
    if len(end_pdf_page) < len(title):
        end_pdf_page.extend([0] * (len(title) - len(end_pdf_page)))
        print('WARNING! Missing End PDF Page(s)')

    # Lots of debugging output
    # Print all of the lists; debug levels 2 & 4
    if debug in (2, 4):
        print('\n\nAll list values:')
        print(title)
        print(author)
        print(start_page)
        print(start_pdf_page)
        print(end_pdf_page)

    # step through each record and print all contents; debug level 6
    if debug == 6:
        print('\n\nAll records:')
        records = zip(title, author, start_page, start_pdf_page, end_pdf_page)
        for r, (rec_title, rec_author, rec_page, rec_start, rec_end) in enumerate(records):
            print(
                f'Record {r}: {rec_title}; {rec_author}; {rec_page}; {rec_start};'
                f' {rec_end}')

    # Return all collected metadata lists.
    return title, start_page, start_pdf_page, end_pdf_page, author
def processpdfnew(verbose, debug, pagetext):
    """Extract article metadata from per-page OCR text where titles are followed by a "By" line.

    Every page is searched for title text, a standalone 1-4 digit line (taken as the original page
    number) and lines shaped like personal names. A page with a usable title opens a record; a page
    with name lines supplies that page's authors and an ending PDF page.

    Args:
        verbose: If truthy, print a progress message before the page scan.
        debug: Integer debug level. Levels 1-6 switch on the diagnostic prints guarded inline below.
        pagetext: Sequence of page strings, one per PDF page.

    Returns:
        A tuple ``(title, start_page, start_pdf_page, end_pdf_page, author)`` of lists indexed by article.
        Each ``author`` item is a list of four ``(first, middle, last, suffix)`` tuples, blank tuples
        filling unused slots. Lists that come up short are padded (``''``, ``0`` or blank author slots)
        so that all five have the same length.
    """
    title = []
    author = []
    start_page = []
    start_pdf_page = []
    end_pdf_page = []
    page_number = 0

    # Process each page. Step through pages and attempt to find titles, authors, and page numbers in OCR text.
    # Store this metadata and the start and end pages of each article into lists.
    if verbose:
        print('Processing PDF pages')
    for page_number in range(len(pagetext)):
        page = pagetext[page_number]
        if 1 < debug < 6:
            print('Processing PDF page number %d' % page_number)

        # Collect title fragments: text that starts a line with a capital letter and ends in a period
        # directly before "By", plus the "By ..." phrase itself up to its closing period.
        title_parts = re.findall(r'(?<=\n)[A-Z][A-Za-z0-9 .,():"\'\-]{3,}\.(?=\s+By)|'
                                 r'By\s{1,2}[A-Za-z \-,&.]+\.', page)
        if 1 < debug < 5 and title_parts:
            print('title parts: %s' % title_parts)

        # Join all fragments, flatten newlines, collapse runs of spaces and apply title capitalization.
        # Any word with an apostrophe comes out with a space before the apostrophe and the next letter
        # capitalized. Fix in a future version.
        temp_title = " ".join(title_parts)
        temp_title = re.sub(r'\n', ' ', temp_title).strip()
        temp_title = re.sub(r' {2,}', " ", temp_title)
        temp_title = journaltools.capitalize_title(temp_title.title())

        # Print processed title at debug levels 1-4.
        if 0 < debug < 5 and temp_title:
            print('TITLE: %s' % temp_title)

        # Keep the title only if it is longer than five characters, which drops garbage lines while
        # keeping short real titles. Record the original page number if a standalone 1-4 digit line
        # exists (a single space otherwise) and the PDF page index the article starts on.
        if len(temp_title) > 5:
            title.append(temp_title)
            original_page_number = re.search(r'^[\d]{1,4}$', page, re.MULTILINE)
            if original_page_number:
                start_page.append(original_page_number[0])
            else:
                start_page.append(" ")
            start_pdf_page.append(page_number)
            if 0 < debug < 5:
                if original_page_number:
                    print('Start page in PDF text: %s' % original_page_number[0])
                else:
                    print('No start page found in PDF text')

        # Find authors: whole lines shaped like two to four name words with an optional short suffix
        # and asterisk. When any are found, store up to four of them and record this PDF page as an
        # ending page.
        find_author = re.findall(
            r'(?<=\n)[A-Z][A-Za-z]*\.? +[A-Z][A-Za-z]*\.? +[A-Za-z]+\.?[,. A-Za-z]{0,6}\*?(?=\n)|'
            r'(?<=\n)[A-Z][A-Za-z]+ +[A-Z][a-z]+[,. A-Za-z]{0,6}\*?(?=\n)|'
            r'(?<=\n)[A-Z][A-Za-z]*\.? *[A-Z][a-z]*\.? +[A-Za-z]+\.? +[A-Za-z]+\.?[,. A-Za-z]{0,6}\*?(?=\n)',
            page)
        if find_author:
            author_list = []
            for count in range(4):
                # Slots beyond the names actually found raise IndexError and are stored as blank tuples.
                try:
                    f_name, m_name, l_name, suffix = journaltools.splitname(find_author[count])
                    author_temp = f_name, m_name, l_name, suffix
                except IndexError:
                    author_temp = '', '', '', ''
                author_list.append(author_temp)
            author.append(author_list)
            end_pdf_page.append(page_number)
            if 0 < debug < 5:
                print('Author: %s' % find_author)
        if 1 < debug < 5:
            print(f'PDF start pages: {start_pdf_page}')
            print(f'PDF end pages: {end_pdf_page}')

    # An article that was opened but never closed ends on the last page scanned.
    if len(start_pdf_page) > len(end_pdf_page):
        end_pdf_page.append(page_number)

    # Compare lists to see if they contain the same number of values. If not, pad out the short lists with
    # empty values and throw a warning. Evaluation is in two groups: the values updated when a title is
    # found, and the values updated when an author is found.
    if len(title) > len(author):
        print('WARNING! Missing authors and ending PDF pages')
        while len(author) < len(title):
            author.append([('', '', '', '')] * 4)
        # end_pdf_page may already be one longer than author because of the closing append above, so pad
        # it against its own length rather than the author shortfall; otherwise it overshoots title by one.
        while len(end_pdf_page) < len(title):
            end_pdf_page.append(0)
    elif len(author) > len(title):
        print('WARNING! Missing titles, start pages, and starting PDF pages')
        for _ in range(len(title), len(author)):
            title.append('')
            start_page.append('')
            start_pdf_page.append(0)

    # Lots of debugging output
    # Print all of the lists; debug levels 2 & 4
    if debug in (2, 4):
        print('\n\nAll list values:')
        print(title)
        print(author)
        print(start_page)
        print(start_pdf_page)
        print(end_pdf_page)

    # step through each record and print all contents; debug level 6
    if debug == 6:
        print('\n\nAll records:')
        for r in range(len(title)):
            print(f'Record {r}: {title[r]}; {author[r]}; {start_page[r]}; {start_pdf_page[r]};'
                  f' {end_pdf_page[r]}')

    # Return all collected metadata lists.
    return title, start_page, start_pdf_page, end_pdf_page, author
def importxl(import_file):
    """Read one issue's article metadata from the active sheet of a spreadsheet.

    Row 1 is read as headers. Recognised header names override the default column positions; columns
    for a second author are used only when their headers are present. Every following row yields one
    entry in each returned list.

    Args:
        import_file: Path (or file object) handed to ``load_workbook``.

    Returns:
        A tuple ``(title, page, pdf_start_page, pdf_end_page, author, section)`` of lists indexed by row.
        Titles and sections are passed through ``str.title`` and ``capitalize_title``; an empty title
        cell becomes ``''``, an empty page cell becomes ``''`` and an empty section cell is stored as
        read. Each ``author`` item is a list holding one ``(first, middle, last, suffix)`` tuple, plus
        a second tuple when the row has a second author's first name.
    """
    title = []
    page = []
    pdf_start_page = []
    pdf_end_page = []
    author = []
    section = []

    # Default 1-based column for each recognised header; overwritten when the header is found in row 1.
    # 0 marks a column that is absent unless its header appears.
    columns = {
        'section': 1,
        'title': 2,
        'page': 3,
        'start_pdf_page': 4,
        'end_pdf_page': 5,
        'author_first': 6,
        'author_middle': 7,
        'author_last': 8,
        'author_suffix': 9,
        'author2_first': 0,
        'author2_middle': 0,
        'author2_last': 0,
        'author2_suffix': 0,
    }
    first_author_keys = ('author_first', 'author_middle', 'author_last', 'author_suffix')
    second_author_keys = ('author2_first', 'author2_middle', 'author2_last', 'author2_suffix')

    wb = load_workbook(filename=import_file, data_only=True)
    ws = wb.active

    # Read first row and map recognised headers to their column; a repeated header keeps the last one.
    for c in range(1, ws.max_column + 1):
        header = ws.cell(row=1, column=c).internal_value
        if header in columns:
            columns[header] = c

    def cell_value(row, key):
        # Column 0 means the sheet has no such column; asking the worksheet for it would raise.
        col = columns[key]
        return ws.cell(row=row, column=col).internal_value if col else None

    # Iterate through all data rows, reading values into the lists to pass back to the caller.
    for i in range(2, ws.max_row + 1):
        section_temp = cell_value(i, 'section')
        if section_temp:
            section_temp = capitalize_title(section_temp.title())
        section.append(section_temp)

        # An empty title cell is None; store '' instead of failing on None.title(). str() also covers
        # titles the spreadsheet stored as numbers.
        temp_title = cell_value(i, 'title')
        if temp_title is None:
            temp_title = ''
        else:
            temp_title = capitalize_title(str(temp_title).title())
        title.append(temp_title)

        page.append(cell_value(i, 'page') or '')
        pdf_start_page.append(cell_value(i, 'start_pdf_page'))
        pdf_end_page.append(cell_value(i, 'end_pdf_page'))

        author_list = [tuple(cell_value(i, key) for key in first_author_keys)]
        # Only add a second author when that column exists and this row has a first name in it.
        if cell_value(i, 'author2_first'):
            author_list.append(tuple(cell_value(i, key) for key in second_author_keys))
        author.append(author_list)

    return title, page, pdf_start_page, pdf_end_page, author, section