def processpdfnew(verbose, debug, pagetext):
    """Extract article metadata from the OCR text of a scanned journal issue.

    Parses the table of contents on the first page for article titles and
    authors, then scans the remaining pages for article headings to record
    the PDF page range of each article.

    :param verbose: when truthy, print progress messages
    :param debug: integer debug level (0 = silent; higher levels print
        increasingly detailed intermediate state)
    :param pagetext: list of strings, one per PDF page, of OCR text
    :return: parallel lists (title, start_page, start_pdf_page,
        end_pdf_page, author); each index describes one article. The
        start_page entries are placeholder empty strings because the
        original printed page numbers did not survive OCR for this format.
    """
    # Parallel lists exported to a CSV file later; index r across all lists
    # holds the metadata for one article. Up to four authors are stored per
    # article, padded with empty name tuples.
    title = []
    author = []
    start_page = []
    start_pdf_page = []
    end_pdf_page = []
    page_number = 0  # keeps the last page index after the page loop below

    # Get titles and authors from the first page's table of contents
    # (original-format publications): a roman numeral, the title text, and
    # an optional parenthesized author segment.
    toc = re.findall(
        r'([IVXL]{1,4}\.?)\s([A-Za-z0-9., ]+)\s(\(([A-Za-z ]{5,})\))?',
        pagetext[0], re.DOTALL)

    for r, entry in enumerate(toc):
        if 0 < debug < 3:
            print(f'Record {r}, {entry}')
        # Clean up and normalize the title, then store it.
        temp_title = entry[1].strip()
        temp_title = re.sub(r' {2,}', " ", temp_title)
        temp_title = temp_title.title()
        temp_title = journaltools.capitalize_title(temp_title)
        title.append(temp_title)
        # Split the author segment on " and " (keyword maxsplit: positional
        # maxsplit for re.split is deprecated). re.split always returns at
        # least one element, so an author entry is appended for every record.
        find_author = re.sub(r' {2,}', ' ', entry[3])
        find_author = re.split(r' and ', find_author, maxsplit=2)
        author_list = []
        for count in range(4):
            try:
                f_name, m_name, l_name, suffix = journaltools.splitname(
                    find_author[count])
                author_temp = f_name, m_name, l_name, suffix
            except IndexError:
                # Pad missing author slots with empty name tuples.
                author_temp = '', '', '', ''
            author_list.append(author_temp)
        author.append(author_list)
        # Original page numbers are not recoverable here; append placeholder.
        start_page.append('')
        if 1 < debug < 5:
            print(f'{title[r]}, {author[r]}')

    # Process each remaining page: look for article headings in the OCR text
    # and record the PDF start and end pages of each article.
    if verbose:
        print('Processing PDF pages')
    for page_number in range(1, len(pagetext)):
        if 0 < debug < 6:
            print('Processing PDF page number %d' % page_number)
        # An article heading is a roman numeral (OCR may misread characters,
        # hence H/l/i/x/v in the class) followed by a title line.
        title_parts = re.search(
            r'(?<=\n)([XIVHLlixv]{1,4}\.)\s([A-Za-z0-9.,*\- ]*)\s(?=\n)',
            pagetext[page_number], flags=0)
        if 1 < debug < 5 and title_parts:
            print('title parts: %s' % title_parts)
        if title_parts:
            # Original page numbers did not come through the OCR for these,
            # so only PDF page positions are recorded. Every heading after
            # the first page also closes the preceding article; this may
            # attach a garbage page to articles that start at the top of a
            # page, but there is no reliable way to detect that case.
            start_pdf_page.append(page_number)
            if page_number > 1:
                end_pdf_page.append(page_number)
            if 1 < debug < 5:
                print(f'PDF start pages: {start_pdf_page}')
                print(f'PDF end pages: {end_pdf_page}')
    # The last article ends on the final processed page.
    end_pdf_page.append(page_number)

    # Make sure all lists contain the same number of items; pad short lists
    # with zeroes and warn, so the lists stay parallel for export.
    if len(start_pdf_page) < len(title):
        for _ in range(len(start_pdf_page), len(title)):
            start_pdf_page.append(0)
        print('WARNING! Missing Start PDF Page(s)')
    if len(end_pdf_page) < len(title):
        for _ in range(len(end_pdf_page), len(title)):
            end_pdf_page.append(0)
        print('WARNING! Missing End PDF Page(s)')

    # Debugging output: dump the whole lists at debug levels 2 & 4.
    if debug == 2 or debug == 4:
        print('\n\nAll list values:')
        print(title)
        print(author)
        print(start_page)
        print(start_pdf_page)
        print(end_pdf_page)
    # Step through each record and print all contents at debug level 6.
    if debug == 6:
        print('\n\nAll records:')
        for r in range(0, len(title)):
            print(
                f'Record {r}: {title[r]}; {author[r]}; {start_page[r]}; {start_pdf_page[r]};'
                f' {end_pdf_page[r]}')

    # Return all collected metadata lists.
    return title, start_page, start_pdf_page, end_pdf_page, author
def processpdfnew(verbose, debug, pagetext):
    """Extract article metadata from the OCR text of a scanned journal issue.

    Scans every page for title lines (a capitalized line ending in a period
    that is followed by "By", or the "By ..." byline itself) and author
    signature lines, and records the PDF pages where each article starts
    (title found) and ends (author signature found).

    :param verbose: when truthy, print progress messages
    :param debug: integer debug level (0 = silent; higher levels print
        increasingly detailed intermediate state)
    :param pagetext: list of strings, one per PDF page, of OCR text
    :return: parallel lists (title, start_page, start_pdf_page,
        end_pdf_page, author); each index describes one article
    """
    # Parallel lists exported to a CSV file later; index r across all lists
    # holds the metadata for one article. Up to four authors are stored per
    # article, padded with empty name tuples.
    title = []
    author = []
    start_page = []
    start_pdf_page = []
    end_pdf_page = []
    page_number = 0  # keeps the last page index after the loop, for padding
    # Process each page: attempt to find titles, authors, and page numbers
    # in the OCR text, and store them with start/end PDF pages.
    if verbose:
        print('Processing PDF pages')
    for page_number in range(0, len(pagetext)):
        if 1 < debug < 6:
            print('Processing PDF page number %d' % page_number)
        # Look for an article title on this page: collect matching lines in
        # title_parts, then join them and apply title capitalization.
        temp_title = ""
        # Matches either a sentence-style title line ending in a period that
        # is followed by "By", or the "By <authors>." byline itself.
        title_parts = re.findall(r'(?<=\n)[A-Z][A-Za-z0-9 .,():"\'\-]{3,}\.(?=\s+By)|'
                                 r'By\s{1,2}[A-Za-z \-,&.]+\.',
                                 pagetext[page_number])
        if 1 < debug < 5 and title_parts:
            print('title parts: %s' % title_parts)
        # Join all returned lines into temp_title; strip extra whitespace
        # and use title capitalization. NOTE(review): any word with an
        # apostrophe comes out with a space before the apostrophe and the
        # next letter capitalized — known limitation, fix in a future version.
        for t in title_parts:
            temp_title = temp_title + " " + t
        temp_title = re.sub(r'\n', ' ', temp_title)
        temp_title = temp_title.strip()
        temp_title = re.sub(r' {2,}', " ", temp_title)
        temp_title = temp_title.title()
        temp_title = journaltools.capitalize_title(temp_title)
        # Print processed title at debug levels 1-4.
        if 0 < debug < 5 and temp_title:
            print('TITLE: %s' % temp_title)
        # Keep the title only if longer than five characters — enough to
        # reject garbage lines while keeping short real titles. Also look
        # for the original printed page number (a line of 1-4 digits); if
        # absent, append a placeholder string instead.
        if len(temp_title) > 5:
            title.append(temp_title)
            original_page_number = re.search(r'^[\d]{1,4}$', pagetext[page_number], re.MULTILINE)
            if original_page_number:
                start_page.append(original_page_number[0])
            else:
                start_page.append(" ")
            start_pdf_page.append(page_number)
            if 0 < debug < 5:
                if original_page_number:
                    print('Start page in PDF text: %s' % original_page_number[0])
                else:
                    print('No start page found in PDF text')
        # Find authors: lines that look like one or more personal names
        # (initials or first/middle names plus surname, optional short
        # suffix, optional trailing footnote asterisk). An author signature
        # marks the end of an article, so the current PDF page is appended
        # to end_pdf_page when one is found.
        find_author = re.findall(
            r'(?<=\n)[A-Z][A-Za-z]*\.? +[A-Z][A-Za-z]*\.? +[A-Za-z]+\.?[,. A-Za-z]{0,6}\*?(?=\n)|'
            r'(?<=\n)[A-Z][A-Za-z]+ +[A-Z][a-z]+[,. A-Za-z]{0,6}\*?(?=\n)|'
            r'(?<=\n)[A-Z][A-Za-z]*\.? *[A-Z][a-z]*\.? +[A-Za-z]+\.? +[A-Za-z]+\.?[,. A-Za-z]{0,6}\*?(?=\n)',
            pagetext[page_number])
        if find_author:
            author_list = []
            # Always build four author slots, padding with empty name tuples.
            for count in range(0, 4):
                try:
                    f_name, m_name, l_name, suffix = journaltools.splitname(find_author[count])
                    author_temp = f_name, m_name, l_name, suffix
                except IndexError:
                    author_temp = '', '', '', ''
                author_list.append(author_temp)
            author.append(author_list)
            end_pdf_page.append(page_number)
            if 0 < debug < 5:
                print('Author: %s' % find_author)
        if 1 < debug < 5:
            print(f'PDF start pages: {start_pdf_page}')
            print(f'PDF end pages: {end_pdf_page}')
    # If the last article never got an end page, close it on the final page.
    if len(start_pdf_page) > len(end_pdf_page):
        end_pdf_page.append(page_number)
    # Compare lists to see if they contain the same number of values. If
    # not, pad the short lists with empty values and warn. Evaluation is in
    # two groups: values updated when a title is found (title, start_page,
    # start_pdf_page) and values updated when an author is found (author,
    # end_pdf_page).
    if len(title) > len(author):
        print('WARNING! Missing authors and ending PDF pages')
        for r in range(len(author), len(title)):
            author.append([('', '', '', ''), ('', '', '', ''), ('', '', '', ''), ('', '', '', '')])
            end_pdf_page.append(0)
    elif len(author) > len(title):
        print('WARNING! Missing titles, start pages, and starting PDF pages')
        for r in range(len(title), len(author)):
            title.append('')
            start_page.append('')
            start_pdf_page.append(0)
    # Debugging output: print whole lists at debug levels 2 & 4.
    if debug == 2 or debug == 4:
        print('\n\nAll list values:')
        print(title)
        print(author)
        print(start_page)
        print(start_pdf_page)
        print(end_pdf_page)
    # Step through each record and print all contents at debug level 6.
    if debug == 6:
        print('\n\nAll records:')
        for r in range(0, len(title)):
            print(f'Record {r}: {title[r]}; {author[r]}; {start_page[r]}; {start_pdf_page[r]};'
                  f' {end_pdf_page[r]}')
    # Return all collected metadata lists.
    return title, start_page, start_pdf_page, end_pdf_page, author
def processpdfnew(verbose, debug, page_text):
    """Pull issue and article metadata from the OCR text of a Buffalo Law
    Review PDF.

    First-page format: a header with the journal name, "VOLUME n",
    "NUMBER n", and "MONTH YYYY", then the article title in mixed case
    (possibly broken across multiple lines and interleaved with blank or
    header lines), and finally the author(s) in all caps. Because the
    authors always come last, every line not recognized as a header piece
    or an author is treated as part of the title. Later pages carry a short
    header ("Vol." and year) used as a fallback when volume/year were not
    found on page one.

    :param verbose: when truthy, print progress messages
    :param debug: integer debug level (0 = silent; higher levels print
        increasingly detailed intermediate state)
    :param page_text: list of strings, one per PDF page, of OCR text
    :return: (title, volume, start_page, issue_number, month, year,
        doc_type, author) where author is a list of
        (first, middle, last, suffix) tuples
    """
    if verbose:
        print(f'Processing PDF pages')
    title = ''
    volume = 0
    start_page = 0
    issue_number = 0
    year = ''
    month = ''
    author = []
    doc_type = ''
    # Only the first three pages can carry the metadata of interest.
    for page_number in range(0, 3):
        if 0 < debug < 6:
            print('Processing PDF page number %d' % page_number)
        if page_number == 0:
            # Process the first page: split into lines, then walk the lines
            # until all authors have been found (author_flag == 2).
            page_lines = page_text[page_number].splitlines()
            if 2 < debug < 5:
                print(f'{page_lines}')
            line = 0          # current line index (note: line 0 is skipped)
            author_flag = 0   # 0 = none found, 1 = at least one, 2 = done
            header_flag = 0   # 1 once the journal-name header line was seen
            while author_flag < 2:
                line += 1
                if 1 < debug < 5:
                    # NOTE(review): this debug print runs before the bounds
                    # check below, so at debug levels 2-4 it can raise
                    # IndexError when line reaches len(page_lines) — confirm
                    # and fix.
                    print(f'Line: {line}')
                    print(page_lines[line])
                # Past the last line: no author was found (or not detected).
                # Record an empty author and end the loop with a warning
                # instead of raising an error.
                if line == len(page_lines):
                    print(
                        'WARNING! No author found. Title probably incorrect (and very long).'
                    )
                    author_temp = '', '', '', ''
                    author.append(author_temp)
                    author_flag = 2
                    continue
                # Skip blank lines while no author has been found. Once at
                # least one author was found, a blank line means the author
                # block is over; stop before pulling in an all-caps heading
                # as an author.
                if page_lines[line] == '' or page_lines[line] == ' ':
                    if author_flag == 0:
                        continue
                    elif author_flag == 1:
                        author_flag = 2
                # Once the issue header was seen, pick up volume number,
                # issue number, and month/year from the following lines.
                if header_flag == 1:
                    if volume == 0:
                        volume_test = re.search(r'(?<=VOLUME )\d{1,3}', page_lines[line])
                        if volume_test is not None:
                            volume = volume_test[0]
                            continue
                    if issue_number == 0:
                        issue_test = re.search(r'(?<=NUMBER )\d', page_lines[line])
                        if issue_test is not None:
                            issue_number = issue_test[0]
                            continue
                    if year == '':
                        date_parts = re.search(r'([A-Z]+) (\d{4})', page_lines[line])
                        if date_parts is not None:
                            month = date_parts[1]
                            year = date_parts[2]
                            continue
                # The journal-name line means the issue header is present.
                if re.match(r'Buffalo Law Review|BUFFALO LAW REVIEW', page_lines[line]):
                    header_flag = 1
                    continue
                # Document-type headers: record the type and consume the
                # line so it is not appended to the title.
                if re.match(r'ESSAY ?', page_lines[line]):
                    doc_type = 'essay'
                    continue
                if re.match(r'COMMENT ?', page_lines[line]):
                    doc_type = 'comment'
                    continue
                # An author line is in all caps (accented capitals allowed)
                # and may end with one or more symbols marking the author's
                # biographical footnote.
                author_search = re.search(
                    r'([A-ZÁÄÀÉËÈÍÏÌÑÓÖÙ]+ [A-ZÁÄÀÉËÈÍÏÌÑÓÖÙ]+\.? ?[A-ZÁÄÀÉËÈÍÏÌÑÓÖÙ]*,? ?'
                    r'[A-ZÁÄÀÉËÈÍÏÌÑÓÖÙ]*\.?)\W* ?$',
                    page_lines[line])
                if author_search is None:
                    # Not an author. While no author has been found, treat
                    # the line as part of the title (capped at 254 chars);
                    # after an author, an unrecognized line means the paper
                    # body has started, so end the loop.
                    if author_flag == 0:
                        if len(title) < 255:
                            title = title + page_lines[line]
                    elif author_flag == 1:
                        author_flag = 2
                else:
                    author_temp = author_search[1].title()
                    f_name, m_name, l_name, suffix = splitname(author_temp)
                    author_temp = f_name, m_name, l_name, suffix
                    author.append(author_temp)
                    author_flag = 1
            # Post-processing: trim the title, default the document type to
            # "article", and look for the printed page number (a line of
            # 1-4 digits, optionally followed by a space).
            title = title.strip()
            if doc_type == '':
                doc_type = 'article'
            original_page_number = re.search(r'^([\d]{1,4}) ?$', page_text[page_number], re.MULTILINE)
            if original_page_number:
                start_page = original_page_number[1]
        else:
            # Volume and year were possibly not on the front page; look for
            # them in the headers of the next two pages. The try guards
            # against issues shorter than three pages.
            try:
                page_lines = page_text[page_number].splitlines()
            except IndexError:
                continue
            if 2 < debug < 5:
                print(f'{page_lines}')
            # NOTE(review): assumes each of these pages has at least 10
            # lines; a shorter page would raise IndexError on the lookup
            # below — confirm against real input.
            for line in range(0, 10):
                if 1 < debug < 3:
                    print(f'Line: {line}')
                    print(f'{page_lines[line]}')
                if volume == 0:
                    # NOTE(review): [0] is the whole match, which includes
                    # the spaces after "Vol." — the captured group [1] looks
                    # intended; confirm against downstream use of volume.
                    volume_test = re.search(r'(?<=Vol.) +([\dXVILC]{1,3})', page_lines[line])
                    if volume_test is not None:
                        volume = volume_test[0]
                if year == '':
                    year_test = re.search(r'^[\d]{4}[\-—–][\d]{4}|^[\d]{4}', page_lines[line])
                    if year_test is not None:
                        year = year_test[0]
                        if 1 < debug < 3:
                            print(f'Year: {year}, Line: {line}')
    if 0 < debug < 5:
        print(
            f'{volume}, {month}, {year}, {issue_number}, {author}, {title}, {start_page}, {doc_type}'
        )
    return title, volume, start_page, issue_number, month, year, doc_type, author