Exemplo n.º 1
0
 def test_get_post_reference_section_title_patterns(self):
     """Check that get_post_reference_section_title_patterns() returns a
        collection holding more than two items.
     """
     patterns = refextract_re.get_post_reference_section_title_patterns()
     # assertGreater replaces the deprecated 'assert_' alias (removed from
     # unittest in Python 3.12) and reports both operands on failure.
     self.assertGreater(len(patterns), 2)
Exemplo n.º 2
0
def find_end_of_reference_section(docbody,
                                  ref_start_line,
                                  ref_line_marker,
                                  ref_line_marker_ptn):
    """Given that the start of a document's reference section has already been
       recognised, this function is tasked with finding the line-number in the
       document of the last line of the reference section.

       The section is considered ended when either:
        - a line matches a post-reference-section title pattern and none of
          the following (up to 199) lines carries a marker that continues the
          reference numbering; or
        - a run of 4 long digit-only lines is found (blank lines between them
          are ignored), which is probably the axis scale of a graph.

       @param docbody: (list) of strings - the entire plain-text document body.
       @param ref_start_line: (integer) - the index in docbody of the first line
        of the reference section.
       @param ref_line_marker: (string) - the line marker of the first reference
        line.
       @param ref_line_marker_ptn: (string) - the pattern used to search for a
        reference line marker.
       @return: (integer) - index in docbody of the last reference line
         -- OR --
                (None) - if ref_start_line was invalid (not an int, negative,
                         or not a valid index into docbody).
    """
    x = ref_start_line
    if type(x) is not int or x < 0 or x >= len(docbody):
        # The provided 'first line' of the reference section was invalid.
        # Either it was out of bounds in the document body (this includes
        # x == len(docbody) and an empty docbody), or it was not a valid
        # integer.
        # Can't safely find end of refs with this info - quit.
        return None

    # Get patterns for testing line:
    t_patterns = get_post_reference_section_title_patterns()
    kw_patterns = get_post_reference_section_keyword_patterns()

    if None not in (ref_line_marker, ref_line_marker_ptn):
        mk_patterns = [re.compile(ref_line_marker_ptn, re.I|re.UNICODE)]
    else:
        mk_patterns = get_reference_line_numeration_marker_patterns()

    section_ended = False
    current_reference_count = 0
    while x < len(docbody) and not section_ended:
        line = docbody[x].strip()

        # save the reference count
        num_match = regex_match_list(line, mk_patterns)
        if num_match:
            try:
                current_reference_count = int(num_match.group('marknum'))
            except (ValueError, IndexError, TypeError):
                # Non numerical references marking: the pattern has no
                # 'marknum' group (IndexError), the group is not a number
                # (ValueError) or it took no part in the match, in which
                # case group() returns None (TypeError).
                pass

        # look for a likely section title that would follow a reference section:
        end_match = regex_match_list(line, t_patterns)
        if not end_match:
            # didn't match a section title - try looking for keywords that
            # suggest the end of a reference section.
            # NOTE(review): the result of this keyword match is not acted
            # upon anywhere below - confirm whether that is intended.
            end_match = regex_match_list(line, kw_patterns)
        else:
            # Is it really the end of the reference section? Check the
            # following lines (up to line x + 199) for a reference numeration
            # marker that continues the numbering:
            y = x + 1
            line_found = False
            while y < x + 200 and y < len(docbody) and not line_found:
                num_match = regex_match_list(docbody[y].strip(), mk_patterns)
                if num_match and not num_match.group(0).isdigit():
                    try:
                        num = int(num_match.group('marknum'))
                        if current_reference_count + 1 == num:
                            line_found = True
                    except (ValueError, TypeError):
                        # We have the marknum index so it is
                        # numeric pattern for references like
                        # [1], [2] but this match is not a number
                        # (or the group took no part in the match).
                        pass
                    except IndexError:
                        # We have a non numerical references marking
                        # we don't check for a number continuity
                        line_found = True
                y += 1
            if not line_found:
                # No ref line found-end section
                section_ended = True

        if not section_ended:
            # Do this line and the following ones simply contain numbers?
            # If yes, it's probably the axis scale of a graph in a fig.
            # End refs section.
            digit_test_str = _strip_numeric_noise(docbody[x])
            if len(digit_test_str) > 10 and digit_test_str.isdigit():
                # The line contains only digits and is longer than 10 chars.
                # The number of digit lines required is kept separate from
                # the end of the scanned window, so that skipping a blank
                # line widens the window without raising the target.
                required_digit_lines = 4
                window_end = x + required_digit_lines
                num_digit_lines = 1
                y = x + 1
                while y < window_end and y < len(docbody):
                    digit_test_str = _strip_numeric_noise(docbody[y])
                    if len(digit_test_str) > 10 and digit_test_str.isdigit():
                        num_digit_lines += 1
                    elif len(digit_test_str) == 0:
                        # This is a blank line. Don't count it, to accommodate
                        # documents that are double-line spaced:
                        window_end += 1
                    y += 1
                if num_digit_lines == required_digit_lines:
                    section_ended = True
            # NOTE: x is advanced even when the digit run has just ended the
            # section, so in that case the first digit line is the index
            # returned below.
            x += 1
    return x - 1


def _strip_numeric_noise(line):
    """Remove spaces, '.', '-', '+', the multiplication sign (U+00D7) and the
       minus sign (U+2212) from a line, then strip surrounding whitespace.
       @param line: (string) - a line of the document body.
       @return: (string) - the line without those characters.
    """
    for char in (" ", ".", "-", "+", u"\u00D7", u"\u2212"):
        line = line.replace(char, "")
    return line.strip()
Exemplo n.º 3
0
 def test_get_post_reference_section_title_patterns(self):
     """Check that get_post_reference_section_title_patterns() returns a
        collection holding more than two items.
     """
     patterns = refextract_re.get_post_reference_section_title_patterns()
     # assertGreater replaces the deprecated 'assert_' alias (removed from
     # unittest in Python 3.12) and reports both operands on failure.
     self.assertGreater(len(patterns), 2)
Exemplo n.º 4
0
def find_end_of_reference_section(docbody, ref_start_line, ref_line_marker, ref_line_marker_ptn):
    """Given that the start of a document's reference section has already been
       recognised, this function is tasked with finding the line-number in the
       document of the last line of the reference section.

       The section is considered ended when either:
        - a line matches a post-reference-section title pattern and none of
          the following (up to 199) lines carries a marker that continues the
          reference numbering; or
        - a run of 4 long digit-only lines is found (blank lines between them
          are ignored), which is probably the axis scale of a graph.

       @param docbody: (list) of strings - the entire plain-text document body.
       @param ref_start_line: (integer) - the index in docbody of the first line
        of the reference section.
       @param ref_line_marker: (string) - the line marker of the first reference
        line.
       @param ref_line_marker_ptn: (string) - the pattern used to search for a
        reference line marker.
       @return: (integer) - index in docbody of the last reference line
         -- OR --
                (None) - if ref_start_line was invalid (not an int, negative,
                         or not a valid index into docbody).
    """
    x = ref_start_line
    if type(x) is not int or x < 0 or x >= len(docbody):
        # The provided 'first line' of the reference section was invalid.
        # Either it was out of bounds in the document body (this includes
        # x == len(docbody) and an empty docbody), or it was not a valid
        # integer.
        # Can't safely find end of refs with this info - quit.
        return None

    # Get patterns for testing line:
    t_patterns = get_post_reference_section_title_patterns()
    kw_patterns = get_post_reference_section_keyword_patterns()

    if None not in (ref_line_marker, ref_line_marker_ptn):
        mk_patterns = [re.compile(ref_line_marker_ptn, re.I | re.UNICODE)]
    else:
        mk_patterns = get_reference_line_numeration_marker_patterns()

    section_ended = False
    current_reference_count = 0
    while x < len(docbody) and not section_ended:
        line = docbody[x].strip()

        # save the reference count
        num_match = regex_match_list(line, mk_patterns)
        if num_match:
            try:
                current_reference_count = int(num_match.group("marknum"))
            except (ValueError, IndexError, TypeError):
                # Non numerical references marking: the pattern has no
                # 'marknum' group (IndexError), the group is not a number
                # (ValueError) or it took no part in the match, in which
                # case group() returns None (TypeError).
                pass

        # look for a likely section title that would follow a reference section:
        end_match = regex_match_list(line, t_patterns)
        if not end_match:
            # didn't match a section title - try looking for keywords that
            # suggest the end of a reference section.
            # NOTE(review): the result of this keyword match is not acted
            # upon anywhere below - confirm whether that is intended.
            end_match = regex_match_list(line, kw_patterns)
        else:
            # Is it really the end of the reference section? Check the
            # following lines (up to line x + 199) for a reference numeration
            # marker that continues the numbering:
            y = x + 1
            line_found = False
            while y < x + 200 and y < len(docbody) and not line_found:
                num_match = regex_match_list(docbody[y].strip(), mk_patterns)
                if num_match and not num_match.group(0).isdigit():
                    try:
                        num = int(num_match.group("marknum"))
                        if current_reference_count + 1 == num:
                            line_found = True
                    except (ValueError, TypeError):
                        # We have the marknum index so it is
                        # numeric pattern for references like
                        # [1], [2] but this match is not a number
                        # (or the group took no part in the match).
                        pass
                    except IndexError:
                        # We have a non numerical references marking
                        # we don't check for a number continuity
                        line_found = True
                y += 1
            if not line_found:
                # No ref line found-end section
                section_ended = True

        if not section_ended:
            # Do this line and the following ones simply contain numbers?
            # If yes, it's probably the axis scale of a graph in a fig.
            # End refs section.
            digit_test_str = _strip_numeric_noise(docbody[x])
            if len(digit_test_str) > 10 and digit_test_str.isdigit():
                # The line contains only digits and is longer than 10 chars.
                # The number of digit lines required is kept separate from
                # the end of the scanned window, so that skipping a blank
                # line widens the window without raising the target.
                required_digit_lines = 4
                window_end = x + required_digit_lines
                num_digit_lines = 1
                y = x + 1
                while y < window_end and y < len(docbody):
                    digit_test_str = _strip_numeric_noise(docbody[y])
                    if len(digit_test_str) > 10 and digit_test_str.isdigit():
                        num_digit_lines += 1
                    elif len(digit_test_str) == 0:
                        # This is a blank line. Don't count it, to accommodate
                        # documents that are double-line spaced:
                        window_end += 1
                    y += 1
                if num_digit_lines == required_digit_lines:
                    section_ended = True
            # NOTE: x is advanced even when the digit run has just ended the
            # section, so in that case the first digit line is the index
            # returned below.
            x += 1
    return x - 1


def _strip_numeric_noise(line):
    """Remove spaces, '.', '-', '+', the multiplication sign (U+00D7) and the
       minus sign (U+2212) from a line, then strip surrounding whitespace.
       @param line: (string) - a line of the document body.
       @return: (string) - the line without those characters.
    """
    for char in (" ", ".", "-", "+", u"\u00D7", u"\u2212"):
        line = line.replace(char, "")
    return line.strip()