def find_numeration_in_body(docbody):
    """Detect the reference-line numeration style used in a block of lines.

    The lines are scanned in order, skipping whitespace-only lines, for one
    that starts like a numerated reference line. A marker is accepted only
    when it is the first reference (its number is '1') or when it carries no
    number at all; markers with any other number are skipped so that the
    scan cannot lock onto the middle of a list.

    The "no numeration" result is set up before the scan, so an empty body,
    an all-blank body, or unnumbered lines following a marker can neither
    return None nor discard a marker that has already been accepted.

    @param docbody: (list) of strings - the lines to examine.
    @return: (tuple) of two items:
       + (dictionary) with the keys 'marker', 'marker_pattern' and
         'title_marker_same_line'. 'marker' and 'marker_pattern' are None
         when no numeration was found; 'title_marker_same_line' is always
         False here.
       + (boolean) found_title - always False in this function.
    """
    marker_patterns = get_reference_line_numeration_marker_patterns()
    found_title = False

    # No numeration unless we find one
    ref_details = {
        'title_marker_same_line': False,
        'marker': None,
        'marker_pattern': None,
    }

    for line in docbody:
        # Move past blank lines
        if line.isspace():
            continue

        # Is this line numerated like a reference line?
        mark_match = regex_match_list(line, marker_patterns)
        if not mark_match:
            continue

        mark = mark_match.group('mark')

        # Check if it's the first reference: something like [1] or (1).
        # A marker with a number other than 1 is not the start of the list.
        # NOTE(review): assumes re_num matches the numeric part of a marker
        # - confirm against its definition.
        m_num = re_num.search(mark)
        if m_num and m_num.group(0) != '1':
            continue

        # 1st ref truly found (or the marker is not numeric at all)
        ref_details = {
            'marker': mark,
            'marker_pattern': mark_match.re.pattern,
            'title_marker_same_line': False,
        }
        break

    return ref_details, found_title
def find_numeration_in_body(docbody):
    """Find the numeration marker of the first reference line in docbody.

    Whitespace-only lines are ignored. The first line that matches one of
    the reference-line marker patterns is accepted when its 'marknum' group
    equals '1', or when the matching pattern has no 'marknum' group at all
    (non-numeric markers). Lines whose marker number is not '1' are skipped.

    @param docbody: (list) of strings - the lines to examine.
    @return: (tuple) of two items:
       + (dictionary) with the keys 'marker', 'marker_pattern' and
         'title_marker_same_line'; the first two are None when no
         numeration was found.
       + (boolean) found_title - always False in this function.
    """
    patterns = get_reference_line_numeration_marker_patterns()
    title_found = False

    # Result used when no line qualifies.
    details = {
        'title_marker_same_line': False,
        'marker': None,
        'marker_pattern': None,
    }

    for text in docbody:
        if text.isspace():
            continue

        hit = regex_match_list(text, patterns)
        if not hit:
            continue

        # Only the very first reference ([1], (1), ...) is of interest.
        try:
            is_first = hit.group('marknum') == '1'
        except IndexError:
            # The pattern has no 'marknum' group: non-numeric marker,
            # so there is no number to check.
            is_first = True
        if not is_first:
            continue

        marker_text = hit.group('mark')
        marker_regex = hit.re.pattern
        details = {
            'marker': marker_text,
            'marker_pattern': marker_regex,
            'title_marker_same_line': False,
        }
        break

    return details, title_found
def remove_reference_line_marker(line):
    """Split a reference line into its leading marker and the remaining text.

    Leading whitespace is dropped before the marker patterns are tried.

    @param line: (string) - the reference line.
    @return: (tuple) containing two strings:
       + The reference line's marker (or, if there was not one, a single
         space character).
       + The reference line with its marker removed from the beginning
         and leading whitespace stripped.
    """
    patterns = get_reference_line_numeration_marker_patterns()
    stripped = line.lstrip()
    found = regex_match_list(stripped, patterns)

    if found is None:
        # No marker on this line.
        return (u" ", stripped)

    marker_text = found.group(u'mark')
    # Everything after the matched marker, minus leading whitespace.
    remainder = stripped[found.end():].lstrip()
    return (marker_text, remainder)
def find_numeration_in_title(docbody, title):
    """Check whether the first line holds the title followed by a marker.

    Only docbody[0] is examined. The title is regex-escaped and handed to
    get_reference_line_numeration_marker_patterns() to build patterns that
    expect the title and a reference marker on the same line.

    @param docbody: (list) of strings - the lines to examine.
    @param title: (string) - the reference section title.
    @return: (tuple) of two items:
       + (dictionary) with the keys 'marker', 'marker_pattern' and
         'title_marker_same_line' (True) when the first line matched,
         otherwise None.
       + (boolean) found_title - True only when the first number found in
         the marker is '1'.
    """
    try:
        opening_line = docbody[0]
    except IndexError:
        # Nothing to look at.
        return None, False

    # Need to escape to avoid problems like 'References['
    escaped_title = re.escape(title)
    title_patterns = get_reference_line_numeration_marker_patterns(
        escaped_title)
    hit = regex_match_list(opening_line, title_patterns)
    if not hit:
        return None, False

    marker_text = hit.group('mark')
    marker_regex = hit.re.pattern

    # The marker details are reported whatever the number is; the number
    # only decides whether the title counts as found.
    number = re_num.search(marker_text)
    is_first = bool(number and number.group(0) == '1')

    details = {
        'marker': marker_text,
        'marker_pattern': marker_regex,
        'title_marker_same_line': True
    }
    return details, is_first
def find_end_of_reference_section(docbody,
                                  ref_start_line,
                                  ref_line_marker,
                                  ref_line_marker_ptn):
    """Given that the start of a document's reference section has already
    been recognised, scan forward to find where the section ends.

    The section is considered ended when either:
       + a line looks like the title of a section that follows references
         and no reference line continuing the current numbering is found
         in the lines after it; or
       + a run of long digit-only lines is found (probably the axis scale
         of a graph in a figure).

    @param docbody: (list) of strings - the entire plain-text document
       body.
    @param ref_start_line: (integer) - the index in docbody of the first
       line of the reference section.
    @param ref_line_marker: (string) - the line marker of the first
       reference line (or None).
    @param ref_line_marker_ptn: (string) - the pattern used to search for
       a reference line marker (or None). When either marker argument is
       None, the generic marker patterns are used instead.
    @return: (integer) - index in docbody of the line at which the scan
       stopped: the line that signalled the end of the section, or the
       last line of the document if no end was detected.
       -- OR -- (None) - if ref_start_line was invalid (not an integer,
       negative, or not an index inside docbody).
    """
    # How far past a candidate title to look for the next reference marker.
    lookahead_lines = 200
    # Number of long digit-only lines (including the current one) that
    # must be seen to treat them as a graph axis scale.
    required_digit_lines = 4

    def _digits_only(text):
        # Drop the characters that may decorate numbers on an axis scale,
        # so that a purely numeric line is left as a plain run of digits.
        for char in (" ", ".", "-", "+", u"\u00D7", u"\u2212"):
            text = text.replace(char, "")
        return text.strip()

    def _is_long_digit_run(text):
        return len(text) > 10 and text.isdigit()

    x = ref_start_line
    if type(x) is not int or x < 0 or x >= len(docbody):
        # The provided 'first line' of the reference section was invalid.
        # Either it was out of bounds in the document body (this includes
        # an empty document and an index equal to len(docbody)), or it was
        # not a valid integer.
        # Can't safely find end of refs with this info - quit.
        return None

    # Get patterns for testing line:
    t_patterns = get_post_reference_section_title_patterns()
    kw_patterns = get_post_reference_section_keyword_patterns()

    if None not in (ref_line_marker, ref_line_marker_ptn):
        mk_patterns = [re.compile(ref_line_marker_ptn, re.I | re.UNICODE)]
    else:
        mk_patterns = get_reference_line_numeration_marker_patterns()

    current_reference_count = 0
    section_ended = False
    while x < len(docbody) and not section_ended:
        stripped_line = docbody[x].strip()

        # save the reference count
        num_match = regex_match_list(stripped_line, mk_patterns)
        if num_match:
            try:
                current_reference_count = int(num_match.group('marknum'))
            except (ValueError, IndexError, TypeError):
                # non numerical references marking (no 'marknum' group,
                # the group did not take part in the match, or its text
                # is not a number)
                pass

        # look for a likely section title that would follow a reference
        # section:
        end_match = regex_match_list(stripped_line, t_patterns)
        if not end_match:
            # didn't match a section title - try looking for keywords that
            # suggest the end of a reference section.
            # NOTE(review): the result of this keyword match is never used
            # to end the section - confirm whether that is intended.
            end_match = regex_match_list(stripped_line, kw_patterns)
        else:
            # Is it really the end of the reference section? Check the
            # following lines for a reference marker that continues the
            # numbering:
            line_found = False
            last_lookahead = min(x + lookahead_lines, len(docbody))
            for y in range(x + 1, last_lookahead):
                next_match = regex_match_list(docbody[y].strip(),
                                              mk_patterns)
                if not next_match or next_match.group(0).isdigit():
                    continue
                try:
                    num = int(next_match.group('marknum'))
                except (ValueError, TypeError):
                    # We have the marknum index so it is a numeric pattern
                    # for references like [1], [2] but this match is not
                    # a number
                    continue
                except IndexError:
                    # We have a non numerical references marking
                    # we don't check for a number continuity
                    line_found = True
                    break
                if current_reference_count + 1 == num:
                    line_found = True
                    break
            if not line_found:
                # No ref line found - end section
                section_ended = True

        if not section_ended:
            # Do this line and the following ones simply contain numbers?
            # If yes, it's probably the axis scale of a graph in a fig.
            # End refs section
            if _is_long_digit_run(_digits_only(docbody[x])):
                # The line contains only digits and is longer than 10
                # chars:
                window = required_digit_lines
                num_digit_lines = 1
                y = x + 1
                while y < x + window and y < len(docbody):
                    candidate = _digits_only(docbody[y])
                    if _is_long_digit_run(candidate):
                        num_digit_lines += 1
                    elif len(candidate) == 0:
                        # This is a blank line. Don't count it, to
                        # accommodate documents that are double-line
                        # spaced: widen the window instead, leaving the
                        # required number of digit lines unchanged.
                        window += 1
                    y += 1
                if num_digit_lines == required_digit_lines:
                    section_ended = True
        x += 1
    return x - 1
def find_end_of_reference_section(docbody,
                                  ref_start_line,
                                  ref_line_marker,
                                  ref_line_marker_ptn):
    """Given that the start of a document's reference section has already
    been recognised, scan forward to find where the section ends.

    The section is considered ended when either:
       + a line looks like the title of a section that follows references
         and no reference line continuing the current numbering is found
         in the lines after it; or
       + a run of long digit-only lines is found (probably the axis scale
         of a graph in a figure).

    @param docbody: (list) of strings - the entire plain-text document
       body.
    @param ref_start_line: (integer) - the index in docbody of the first
       line of the reference section.
    @param ref_line_marker: (string) - the line marker of the first
       reference line (or None).
    @param ref_line_marker_ptn: (string) - the pattern used to search for
       a reference line marker (or None). When either marker argument is
       None, the generic marker patterns are used instead.
    @return: (integer) - index in docbody of the line at which the scan
       stopped: the line that signalled the end of the section, or the
       last line of the document if no end was detected.
       -- OR -- (None) - if ref_start_line was invalid (not an integer,
       negative, or not an index inside docbody).
    """
    # How far past a candidate title to look for the next reference marker.
    lookahead_lines = 200
    # Number of long digit-only lines (including the current one) that
    # must be seen to treat them as a graph axis scale.
    required_digit_lines = 4

    def _digits_only(text):
        # Drop the characters that may decorate numbers on an axis scale,
        # so that a purely numeric line is left as a plain run of digits.
        for char in (" ", ".", "-", "+", u"\u00D7", u"\u2212"):
            text = text.replace(char, "")
        return text.strip()

    def _is_long_digit_run(text):
        return len(text) > 10 and text.isdigit()

    x = ref_start_line
    if type(x) is not int or x < 0 or x >= len(docbody):
        # The provided 'first line' of the reference section was invalid.
        # Either it was out of bounds in the document body (this includes
        # an empty document and an index equal to len(docbody)), or it was
        # not a valid integer.
        # Can't safely find end of refs with this info - quit.
        return None

    # Get patterns for testing line:
    t_patterns = get_post_reference_section_title_patterns()
    kw_patterns = get_post_reference_section_keyword_patterns()

    if None not in (ref_line_marker, ref_line_marker_ptn):
        mk_patterns = [re.compile(ref_line_marker_ptn, re.I | re.UNICODE)]
    else:
        mk_patterns = get_reference_line_numeration_marker_patterns()

    current_reference_count = 0
    section_ended = False
    while x < len(docbody) and not section_ended:
        stripped_line = docbody[x].strip()

        # save the reference count
        num_match = regex_match_list(stripped_line, mk_patterns)
        if num_match:
            try:
                current_reference_count = int(num_match.group('marknum'))
            except (ValueError, IndexError, TypeError):
                # non numerical references marking (no 'marknum' group,
                # the group did not take part in the match, or its text
                # is not a number)
                pass

        # look for a likely section title that would follow a reference
        # section:
        end_match = regex_match_list(stripped_line, t_patterns)
        if not end_match:
            # didn't match a section title - try looking for keywords that
            # suggest the end of a reference section.
            # NOTE(review): the result of this keyword match is never used
            # to end the section - confirm whether that is intended.
            end_match = regex_match_list(stripped_line, kw_patterns)
        else:
            # Is it really the end of the reference section? Check the
            # following lines for a reference marker that continues the
            # numbering:
            line_found = False
            last_lookahead = min(x + lookahead_lines, len(docbody))
            for y in range(x + 1, last_lookahead):
                next_match = regex_match_list(docbody[y].strip(),
                                              mk_patterns)
                if not next_match or next_match.group(0).isdigit():
                    continue
                try:
                    num = int(next_match.group('marknum'))
                except (ValueError, TypeError):
                    # We have the marknum index so it is a numeric pattern
                    # for references like [1], [2] but this match is not
                    # a number
                    continue
                except IndexError:
                    # We have a non numerical references marking
                    # we don't check for a number continuity
                    line_found = True
                    break
                if current_reference_count + 1 == num:
                    line_found = True
                    break
            if not line_found:
                # No ref line found - end section
                section_ended = True

        if not section_ended:
            # Do this line and the following ones simply contain numbers?
            # If yes, it's probably the axis scale of a graph in a fig.
            # End refs section
            if _is_long_digit_run(_digits_only(docbody[x])):
                # The line contains only digits and is longer than 10
                # chars:
                window = required_digit_lines
                num_digit_lines = 1
                y = x + 1
                while y < x + window and y < len(docbody):
                    candidate = _digits_only(docbody[y])
                    if _is_long_digit_run(candidate):
                        num_digit_lines += 1
                    elif len(candidate) == 0:
                        # This is a blank line. Don't count it, to
                        # accommodate documents that are double-line
                        # spaced: widen the window instead, leaving the
                        # required number of digit lines unchanged.
                        window += 1
                    y += 1
                if num_digit_lines == required_digit_lines:
                    section_ended = True
        x += 1
    return x - 1