def test_get_post_reference_section_title_patterns(self):
    """Check that more than two post-reference-section title patterns
    are returned by refextract_re.
    """
    patterns = refextract_re.get_post_reference_section_title_patterns()
    # assertGreater replaces the deprecated `assert_` alias (removed from
    # unittest in Python 3.12) and reports both operands on failure.
    self.assertGreater(len(patterns), 2)
def _strip_axis_scale_chars(line):
    """Return `line` with spaces, '.', '-', '+', the multiplication sign
    (U+00D7) and the minus sign (U+2212) removed, then stripped of any
    remaining surrounding whitespace.
    """
    for char in (" ", ".", "-", "+", u"\u00D7", u"\u2212"):
        line = line.replace(char, "")
    return line.strip()


def _is_long_digit_string(text):
    """Tell whether `text` consists only of digits and is longer than 10
    characters.
    """
    return len(text) > 10 and text.isdigit()


def _starts_axis_scale_run(docbody, start, required_lines=4):
    """Tell whether docbody[start] begins a run of `required_lines` lines
       that contain only numbers (probably the axis scale of a graph).
       Blank lines inside the run are skipped and do not count, so that
       double-line spaced documents are handled too.
       @param docbody: (list) of strings - the document body.
       @param start: (integer) - index in docbody of the line to test.
       @param required_lines: (integer) - how many digit-only lines
        (including the one at `start`) make up a run.
       @return: (boolean) - True if such a run starts at `start`.
    """
    if not _is_long_digit_string(_strip_axis_scale_chars(docbody[start])):
        return False
    digit_lines_found = 1
    # The look-ahead window is tracked separately from the required count,
    # so that widening it for a blank line does not raise the target.
    window_end = start + required_lines
    y = start + 1
    while y < window_end and y < len(docbody):
        candidate = _strip_axis_scale_chars(docbody[y])
        if _is_long_digit_string(candidate):
            digit_lines_found += 1
        elif not candidate:
            # Blank line: look one line further instead of counting it.
            window_end += 1
        y += 1
    return digit_lines_found == required_lines


def find_end_of_reference_section(docbody,
                                  ref_start_line,
                                  ref_line_marker,
                                  ref_line_marker_ptn):
    """Given that the start of a document's reference section has already
       been recognised, this function is tasked with finding the line-number
       in the document of the last line of the reference section.
       @param docbody: (list) of strings - the entire plain-text document body.
       @param ref_start_line: (integer) - the index in docbody of the first
        line of the reference section.
       @param ref_line_marker: (string) - the line marker of the first
        reference line.
       @param ref_line_marker_ptn: (string) - the pattern used to search for
        a reference line marker.
       @return: (integer) - index in docbody of the line on which the end of
        the section was detected, or of the last line of docbody if no end
        was detected;
                -- OR --
                (None) - if ref_start_line was invalid (not an integer, or
        not a valid index into docbody).
    """
    x = ref_start_line
    if not isinstance(x, int) or x < 0 or x >= len(docbody):
        # The provided 'first line' of the reference section was invalid.
        # Either it was out of bounds in the document body (this includes
        # an empty body and an index equal to len(docbody)), or it was not
        # a valid integer.
        # Can't safely find end of refs with this info - quit.
        return None

    # Get patterns for testing line:
    t_patterns = get_post_reference_section_title_patterns()
    kw_patterns = get_post_reference_section_keyword_patterns()
    if None not in (ref_line_marker, ref_line_marker_ptn):
        mk_patterns = [re.compile(ref_line_marker_ptn, re.I | re.UNICODE)]
    else:
        mk_patterns = get_reference_line_numeration_marker_patterns()

    current_reference_count = 0
    section_ended = False
    while x < len(docbody) and not section_ended:
        line = docbody[x].strip()

        # save the reference count
        num_match = regex_match_list(line, mk_patterns)
        if num_match:
            try:
                current_reference_count = int(num_match.group("marknum"))
            except (ValueError, IndexError):
                # non numerical references marking
                pass

        # look for a likely section title that would follow a reference
        # section:
        end_match = regex_match_list(line, t_patterns)
        if not end_match:
            # didn't match a section title - try looking for keywords that
            # suggest the end of a reference section.
            # NOTE(review): the result of this keyword match is not used
            # afterwards, so a keyword match alone never ends the section -
            # confirm whether that is intended.
            end_match = regex_match_list(line, kw_patterns)
        else:
            # Is it really the end of the reference section? Check within
            # the next 200 lines for a reference numeration marker that
            # continues the numbering:
            y = x + 1
            line_found = False
            while y < x + 200 and y < len(docbody) and not line_found:
                num_match = regex_match_list(docbody[y].strip(), mk_patterns)
                if num_match and not num_match.group(0).isdigit():
                    try:
                        num = int(num_match.group("marknum"))
                        if current_reference_count + 1 == num:
                            line_found = True
                    except ValueError:
                        # We have the marknum index so it is
                        # numeric pattern for references like
                        # [1], [2] but this match is not a number
                        pass
                    except IndexError:
                        # We have a non numerical references marking
                        # we don't check for a number continuity
                        line_found = True
                y += 1
            if not line_found:
                # No ref line found-end section
                section_ended = True

        # Do this line and the following ones simply contain numbers?
        # If yes, it's probably the axis scale of a graph in a fig.
        # End refs section
        if not section_ended and _starts_axis_scale_run(docbody, x):
            section_ended = True
        x += 1
    return x - 1
def test_get_post_reference_section_title_patterns(self):
    """Check that more than two post-reference-section title patterns
    are returned by refextract_re.
    """
    patterns = refextract_re.get_post_reference_section_title_patterns()
    # assertGreater replaces the deprecated `assert_` alias (removed from
    # unittest in Python 3.12) and reports both operands on failure.
    self.assertGreater(len(patterns), 2)
def _strip_axis_scale_chars(line):
    """Return `line` with spaces, '.', '-', '+', the multiplication sign
    (U+00D7) and the minus sign (U+2212) removed, then stripped of any
    remaining surrounding whitespace.
    """
    for char in (" ", ".", "-", "+", u"\u00D7", u"\u2212"):
        line = line.replace(char, "")
    return line.strip()


def _is_long_digit_string(text):
    """Tell whether `text` consists only of digits and is longer than 10
    characters.
    """
    return len(text) > 10 and text.isdigit()


def _starts_axis_scale_run(docbody, start, required_lines=4):
    """Tell whether docbody[start] begins a run of `required_lines` lines
       that contain only numbers (probably the axis scale of a graph).
       Blank lines inside the run are skipped and do not count, so that
       double-line spaced documents are handled too.
       @param docbody: (list) of strings - the document body.
       @param start: (integer) - index in docbody of the line to test.
       @param required_lines: (integer) - how many digit-only lines
        (including the one at `start`) make up a run.
       @return: (boolean) - True if such a run starts at `start`.
    """
    if not _is_long_digit_string(_strip_axis_scale_chars(docbody[start])):
        return False
    digit_lines_found = 1
    # The look-ahead window is tracked separately from the required count,
    # so that widening it for a blank line does not raise the target.
    window_end = start + required_lines
    y = start + 1
    while y < window_end and y < len(docbody):
        candidate = _strip_axis_scale_chars(docbody[y])
        if _is_long_digit_string(candidate):
            digit_lines_found += 1
        elif not candidate:
            # Blank line: look one line further instead of counting it.
            window_end += 1
        y += 1
    return digit_lines_found == required_lines


def find_end_of_reference_section(
    docbody, ref_start_line, ref_line_marker, ref_line_marker_ptn
):
    """Given that the start of a document's reference section has already
       been recognised, this function is tasked with finding the line-number
       in the document of the last line of the reference section.
       @param docbody: (list) of strings - the entire plain-text document body.
       @param ref_start_line: (integer) - the index in docbody of the first
        line of the reference section.
       @param ref_line_marker: (string) - the line marker of the first
        reference line.
       @param ref_line_marker_ptn: (string) - the pattern used to search for
        a reference line marker.
       @return: (integer) - index in docbody of the line on which the end of
        the section was detected, or of the last line of docbody if no end
        was detected;
                -- OR --
                (None) - if ref_start_line was invalid (not an integer, or
        not a valid index into docbody).
    """
    x = ref_start_line
    if not isinstance(x, int) or x < 0 or x >= len(docbody):
        # The provided 'first line' of the reference section was invalid.
        # Either it was out of bounds in the document body (this includes
        # an empty body and an index equal to len(docbody)), or it was not
        # a valid integer.
        # Can't safely find end of refs with this info - quit.
        return None

    # Get patterns for testing line:
    t_patterns = get_post_reference_section_title_patterns()
    kw_patterns = get_post_reference_section_keyword_patterns()
    if None not in (ref_line_marker, ref_line_marker_ptn):
        mk_patterns = [re.compile(ref_line_marker_ptn, re.I | re.UNICODE)]
    else:
        mk_patterns = get_reference_line_numeration_marker_patterns()

    current_reference_count = 0
    section_ended = False
    while x < len(docbody) and not section_ended:
        line = docbody[x].strip()

        # save the reference count
        num_match = regex_match_list(line, mk_patterns)
        if num_match:
            try:
                current_reference_count = int(num_match.group("marknum"))
            except (ValueError, IndexError):
                # non numerical references marking
                pass

        # look for a likely section title that would follow a reference
        # section:
        end_match = regex_match_list(line, t_patterns)
        if not end_match:
            # didn't match a section title - try looking for keywords that
            # suggest the end of a reference section.
            # NOTE(review): the result of this keyword match is not used
            # afterwards, so a keyword match alone never ends the section -
            # confirm whether that is intended.
            end_match = regex_match_list(line, kw_patterns)
        else:
            # Is it really the end of the reference section? Check within
            # the next 200 lines for a reference numeration marker that
            # continues the numbering:
            y = x + 1
            line_found = False
            while y < x + 200 and y < len(docbody) and not line_found:
                num_match = regex_match_list(docbody[y].strip(), mk_patterns)
                if num_match and not num_match.group(0).isdigit():
                    try:
                        num = int(num_match.group("marknum"))
                        if current_reference_count + 1 == num:
                            line_found = True
                    except ValueError:
                        # We have the marknum index so it is
                        # numeric pattern for references like
                        # [1], [2] but this match is not a number
                        pass
                    except IndexError:
                        # We have a non numerical references marking
                        # we don't check for a number continuity
                        line_found = True
                y += 1
            if not line_found:
                # No ref line found-end section
                section_ended = True

        # Do this line and the following ones simply contain numbers?
        # If yes, it's probably the axis scale of a graph in a fig.
        # End refs section
        if not section_ended and _starts_axis_scale_run(docbody, x):
            section_ended = True
        x += 1
    return x - 1