def test_no_title_via_numbers(self):
    """Check the result for a document with numbered lines and no heading.

    A short document without any section title is handed to
    get_reference_section_beginning and the returned description is
    compared against the expected dictionary (marker '1', no title string).
    """
    from invenio.legacy.refextract.find import get_reference_section_beginning

    # NOTE(review): there is no comma between the last two literals, so
    # Python concatenates them into the single line "1 Ref12 Ref2".
    # Confirm whether two separate document lines were intended.
    document_lines = ["Hello", "1 Ref1" "2 Ref2"]
    found = get_reference_section_beginning(document_lines)

    expected = {
        'marker': '1',
        'marker_pattern': u'(?P<mark>(?P<left>)\\s*(?P<marknum>\\d+)\\s*(?P<right>))',
        'start_line': 1,
        'title_string': None,
        'title_marker_same_line': False,
        'how_found_start': 4,
    }
    self.assertEqual(found, expected)
def extract_references_from_fulltext(fulltext):
    """Locate the reference section of a fulltext document and extract it.

    Page-boundary lines are removed first. The start and then the end of
    the reference section are looked up, and the reference lines are only
    extracted when both were found.

    @param fulltext: (list) of strings, whereby each string is a line of the
        document.
    @return: (tuple) of three items (refs, status, how_found_start):
        refs is the result of get_reference_lines (each string being one
        extracted reference line, e.g. '[19] Wilson, A. Unpublished
        (1986).'), or an empty list when the start or the end of the
        section could not be found;
        status is 0 when extraction ran, 4 when no section start was
        found and 5 when no section end was found;
        how_found_start is always 0 in this function.
    """
    # Try to remove pagebreaks, headers, footers
    fulltext = remove_page_boundary_lines(fulltext)

    # NOTE(review): this flag is initialised here and returned unchanged on
    # every path. Confirm whether it was meant to carry a value taken from
    # the section-start lookup below.
    how_found_start = 0

    section_start = get_reference_section_beginning(fulltext)
    if section_start is None:
        # No reference section at all: nothing to extract.
        write_message(
            "* extract_references_from_fulltext: "
            "ref_sect_start is None",
            verbose=2)
        return [], 4, how_found_start

    # A start was found (however weak); now look for where the section ends.
    section_end = find_end_of_reference_section(
        fulltext,
        section_start["start_line"],
        section_start["marker"],
        section_start["marker_pattern"])
    if section_end is None:
        # Without an end it is not safe to extract anything.
        write_message(
            "* extract_references_from_fulltext: "
            "no end to refs!",
            verbose=2)
        return [], 5, how_found_start

    # Both boundaries are known: pull out the reference lines.
    reference_lines = get_reference_lines(
        fulltext,
        section_start["start_line"],
        section_end,
        section_start["title_string"],
        section_start["marker_pattern"],
        section_start["title_marker_same_line"])
    return reference_lines, 0, how_found_start
def test_simple(self):
    """Check the result for a document with a 'References' heading.

    The document has the heading on its second line followed by one
    bracketed reference, and the returned description is compared against
    the expected dictionary.
    """
    from invenio.legacy.refextract.find import get_reference_section_beginning

    document_lines = ["Hello", "References", "[1] Ref1"]
    found = get_reference_section_beginning(document_lines)

    expected = {
        'marker': '[1]',
        'marker_pattern': u'\\s*(?P<mark>\\[\\s*(?P<marknum>\\d+)\\s*\\])',
        'start_line': 1,
        'title_string': 'References',
        'title_marker_same_line': False,
        'how_found_start': 1,
    }
    self.assertEqual(found, expected)
def extract_references_from_fulltext(fulltext):
    """Locate the reference section of a fulltext document and extract it.

    Page-boundary lines are removed first. The start and then the end of
    the reference section are looked up, and the reference lines are only
    extracted when both were found.

    @param fulltext: (list) of strings, whereby each string is a line of the
        document.
    @return: (tuple) of three items (refs, status, how_found_start):
        refs is the result of get_reference_lines (each string being one
        extracted reference line, e.g. '[19] Wilson, A. Unpublished
        (1986).'), or an empty list when the start or the end of the
        section could not be found;
        status is 0 when extraction ran, 4 when no section start was
        found and 5 when no section end was found;
        how_found_start is always 0 in this function.
    """
    # Try to remove pagebreaks, headers, footers
    fulltext = remove_page_boundary_lines(fulltext)

    # NOTE(review): this flag is initialised here and returned unchanged on
    # every path. Confirm whether it was meant to carry a value taken from
    # the section-start lookup below.
    how_found_start = 0

    section_start = get_reference_section_beginning(fulltext)
    if section_start is None:
        # No reference section at all: nothing to extract.
        write_message(
            "* extract_references_from_fulltext: "
            "ref_sect_start is None",
            verbose=2)
        return [], 4, how_found_start

    # A start was found (however weak); now look for where the section ends.
    section_end = find_end_of_reference_section(
        fulltext,
        section_start["start_line"],
        section_start["marker"],
        section_start["marker_pattern"])
    if section_end is None:
        # Without an end it is not safe to extract anything.
        write_message(
            "* extract_references_from_fulltext: "
            "no end to refs!",
            verbose=2)
        return [], 5, how_found_start

    # Both boundaries are known: pull out the reference lines. This variant
    # also hands the detected marker to get_reference_lines.
    reference_lines = get_reference_lines(
        fulltext,
        section_start["start_line"],
        section_end,
        section_start["title_string"],
        section_start["marker_pattern"],
        section_start["title_marker_same_line"],
        section_start["marker"])
    return reference_lines, 0, how_found_start
def test_no_title_via_numbers(self):
    """Check the result for a document with numbered lines and no heading.

    The document carries no section title; the description returned by
    get_reference_section_beginning is expected to name marker '1' with
    a title string of None.
    """
    from invenio.legacy.refextract.find import get_reference_section_beginning

    # NOTE(review): the second and third literals are not separated by a
    # comma, so they are joined into one line, "1 Ref12 Ref2". Confirm
    # whether two separate document lines were intended.
    text = ["Hello", "1 Ref1" "2 Ref2"]
    outcome = get_reference_section_beginning(text)

    wanted = {
        'marker': '1',
        'marker_pattern': u'(?P<mark>(?P<left>)\\s*(?P<marknum>\\d+)\\s*(?P<right>))',
        'start_line': 1,
        'title_string': None,
        'title_marker_same_line': False,
        'how_found_start': 4,
    }
    self.assertEqual(outcome, wanted)
def test_simple(self):
    """Check the result for a document with a 'References' heading.

    The heading sits on the second line and is followed by one bracketed
    reference. In this variant the expected marker pattern is anchored at
    the start of the line.
    """
    from invenio.legacy.refextract.find import get_reference_section_beginning

    text = ["Hello", "References", "[1] Ref1"]
    outcome = get_reference_section_beginning(text)

    wanted = {
        'marker': '[1]',
        'marker_pattern': u'^\\s*(?P<mark>\\[\\s*(?P<marknum>\\d+)\\s*\\])',
        'start_line': 1,
        'title_string': 'References',
        'title_marker_same_line': False,
        'how_found_start': 1,
    }
    self.assertEqual(outcome, wanted)
def test_no_title_via_numbers2(self):
    """Check the result when bare numbers sit on their own lines.

    The document has no heading; the numbers "1" and "2" each occupy a
    line of their own, with a "(3)" line in between. The returned
    description is expected to name marker "1" starting at line 1.
    """
    from invenio.legacy.refextract.find import get_reference_section_beginning

    document_lines = ["Hello", "1", "Ref1", "(3)", "2", "Ref2"]
    found = get_reference_section_beginning(document_lines)

    expected = {
        "marker": "1",
        "marker_pattern": u"(?P<mark>(?P<left>)\\s*(?P<marknum>\\d+)\\s*(?P<right>))",
        "start_line": 1,
        "title_string": None,
        "title_marker_same_line": False,
        "how_found_start": 4,
    }
    self.assertEqual(found, expected)
def test_simple(self):
    """Check the result for a document with a "References" heading.

    One bracketed reference follows the heading; the description returned
    by get_reference_section_beginning is compared with the expected
    dictionary.
    """
    from invenio.legacy.refextract.find import get_reference_section_beginning

    lines_in = ["Hello", "References", "[1] Ref1"]
    section = get_reference_section_beginning(lines_in)

    section_expected = {
        "marker": "[1]",
        "marker_pattern": u"\\s*(?P<mark>\\[\\s*(?P<marknum>\\d+)\\s*\\])",
        "start_line": 1,
        "title_string": "References",
        "title_marker_same_line": False,
        "how_found_start": 1,
    }
    self.assertEqual(section, section_expected)
def test_no_section(self):
    """Check that no reference section is reported for empty input.

    An empty document is handed to get_reference_section_beginning and the
    result is expected to be None.
    """
    from invenio.legacy.refextract.find import get_reference_section_beginning

    # NOTE(review): the empty document is given as "" rather than as an
    # empty list of lines. Kept as-is; confirm this is the intended input.
    sect = get_reference_section_beginning("")

    # assertIsNone checks identity with None and produces a clearer failure
    # message than an equality comparison against None.
    self.assertIsNone(sect)