def test_no_title_via_numbers(self):
    """Locate an untitled reference section from bare numeric markers.

    The fixture has no section heading, only lines starting with "1" and
    "2", so the section start has to be recognised from the markers alone.
    """
    # Keep every reference line as its own list item: without the separating
    # comma Python concatenates adjacent string literals, which would turn
    # the two reference lines into the single line "1 Ref12 Ref2".
    document = ["Hello", "1 Ref1", "2 Ref2"]
    sect = get_reference_section_beginning(document)
    self.assertEqual(
        sect,
        {
            'marker': '1',
            'marker_pattern': u'(?P<mark>(?P<left>)\\s*(?P<marknum>\\d+)\\s*(?P<right>))',
            'start_line': 1,
            'title_string': None,
            'title_marker_same_line': False,
            'how_found_start': 4,
        })
def test_simple(self):
    """A "References" heading followed by a "[1]" line is located."""
    document = ["Hello", "References", "[1] Ref1"]
    expected = {
        'marker': '[1]',
        'marker_pattern': u'\\s*(?P<mark>\\[\\s*(?P<marknum>\\d+)\\s*\\])',
        'start_line': 1,
        'title_string': 'References',
        'title_marker_same_line': False,
        'how_found_start': 1,
    }
    self.assertEqual(get_reference_section_beginning(document), expected)
def test_no_title_via_dots(self):
    """Locate an untitled reference section from "1."-style markers.

    The fixture has no section heading, only lines starting with "1." and
    "2.", so the section start has to be recognised from the markers alone.
    """
    from invenio.refextract_find import get_reference_section_beginning

    # Keep every reference line as its own list item: without the separating
    # comma Python concatenates adjacent string literals, which would turn
    # the two reference lines into the single line "1. Ref12. Ref2".
    document = ["Hello", "1. Ref1", "2. Ref2"]
    sect = get_reference_section_beginning(document)
    self.assertEqual(
        sect,
        {
            'marker': '1.',
            'marker_pattern': u'(?P<mark>(?P<left>)\\s*(?P<marknum>\\d+)\\s*(?P<right>\\.))',
            'start_line': 1,
            'title_string': None,
            'title_marker_same_line': False,
            'how_found_start': 3,
        })
def extract_references_from_fulltext(fulltext):
    """Locate and extract the reference section from a fulltext document.

    Each extracted string is considered to be a single reference line.
    E.g. a string could be something like:
    '[19] Wilson, A. Unpublished (1986).'

    @param fulltext: (list) of strings, whereby each string is a line of the
        document.
    @return: (tuple) of three items:
        refs - (list) of strings, where each string is an extracted
            reference line; empty when either the start or the end of the
            reference section could not be found.
        status - (int) 0 when lines were extracted, 4 when no start of a
            reference section was found, 5 when a start was found but no
            end.
        how_found_start - (int) the 'how_found_start' value of the
            section-start record returned by
            get_reference_section_beginning; 0 when no section was found.
    """
    # Try to remove pagebreaks, headers, footers
    fulltext = remove_page_boundary_lines(fulltext)

    # Find start of refs section
    ref_sect_start = get_reference_section_beginning(fulltext)
    if ref_sect_start is None:
        # No References
        write_message(
            "* extract_references_from_fulltext: "
            "ref_sect_start is None", verbose=2)
        return [], 4, 0

    # Report how the section start was detected instead of always
    # returning the initial 0 placeholder.
    how_found_start = ref_sect_start.get("how_found_start", 0)

    # A reference section was found, however weak
    ref_sect_end = find_end_of_reference_section(
        fulltext,
        ref_sect_start["start_line"],
        ref_sect_start["marker"],
        ref_sect_start["marker_pattern"])
    if ref_sect_end is None:
        # No End to refs? Not safe to extract
        write_message(
            "* extract_references_from_fulltext: "
            "no end to refs!", verbose=2)
        return [], 5, how_found_start

    # The end of the reference section was found.. start extraction
    refs = get_reference_lines(
        fulltext,
        ref_sect_start["start_line"],
        ref_sect_end,
        ref_sect_start["title_string"],
        ref_sect_start["marker_pattern"],
        ref_sect_start["title_marker_same_line"])
    return refs, 0, how_found_start
def test_no_title_via_brackets(self):
    """Locate an untitled reference section from "[1]"-style markers.

    The fixture has no section heading, only lines starting with "[1]" and
    "[2]", so the section start has to be recognised from the markers alone.
    """
    # Keep every reference line as its own list item: without the separating
    # comma Python concatenates adjacent string literals, which would turn
    # the two reference lines into the single line "[1] Ref1[2] Ref2".
    document = ["Hello", "[1] Ref1", "[2] Ref2"]
    sect = get_reference_section_beginning(document)
    self.assertEqual(
        sect,
        {
            'marker': '[1]',
            'marker_pattern': u'(?P<mark>(?P<left>\\[)\\s*(?P<marknum>\\d+)\\s*(?P<right>\\]))',
            'start_line': 1,
            'title_string': None,
            'title_marker_same_line': False,
            'how_found_start': 2,
        })
def test_no_title_via_numbers(self):
    """Check detection of a heading-less section marked by plain numbers.

    The input contains one unrelated line followed by reference lines
    numbered "1" and "2"; there is no section title to anchor on.
    """
    sect = get_reference_section_beginning([
        "Hello",
        # The comma between the two reference lines is required: adjacent
        # string literals are otherwise joined into "1 Ref12 Ref2".
        "1 Ref1",
        "2 Ref2",
    ])
    self.assertEqual(sect, {
        'marker': '1',
        'marker_pattern': u'(?P<mark>(?P<left>)\\s*(?P<marknum>\\d+)\\s*(?P<right>))',
        'start_line': 1,
        'title_string': None,
        'title_marker_same_line': False,
        'how_found_start': 4,
    })
def test_no_title_via_brackets(self):
    """Check detection of a heading-less section marked by "[n]" markers.

    The input contains one unrelated line followed by reference lines
    marked "[1]" and "[2]"; there is no section title to anchor on.
    """
    sect = get_reference_section_beginning([
        "Hello",
        # The comma between the two reference lines is required: adjacent
        # string literals are otherwise joined into "[1] Ref1[2] Ref2".
        "[1] Ref1",
        "[2] Ref2",
    ])
    self.assertEqual(sect, {
        'marker': '[1]',
        'marker_pattern': u'(?P<mark>(?P<left>\\[)\\s*(?P<marknum>\\d+)\\s*(?P<right>\\]))',
        'start_line': 1,
        'title_string': None,
        'title_marker_same_line': False,
        'how_found_start': 2,
    })
def test_simple(self):
    """Check the titled case: heading on line 1, first "[1]" marker after."""
    lines = ["Hello", "References", "[1] Ref1"]
    found = get_reference_section_beginning(lines)
    wanted = dict([
        ('marker', '[1]'),
        ('marker_pattern', u'\\s*(?P<mark>\\[\\s*(?P<marknum>\\d+)\\s*\\])'),
        ('start_line', 1),
        ('title_string', 'References'),
        ('title_marker_same_line', False),
        ('how_found_start', 1),
    ])
    self.assertEqual(found, wanted)
def extract_references_from_fulltext(fulltext):
    """Locate and extract the reference section from a fulltext document.

    Each extracted string is considered to be a single reference line.
    E.g. a string could be something like:
    '[19] Wilson, A. Unpublished (1986).'

    @param fulltext: (list) of strings, whereby each string is a line of the
        document.
    @return: (tuple) (refs, status, how_found_start) where
        refs is a (list) of strings, each an extracted reference line (empty
        when the start or the end of the section could not be found);
        status is 0 when lines were extracted, 4 when no section start was
        found and 5 when a start was found but no end;
        how_found_start is the 'how_found_start' value of the section-start
        record returned by get_reference_section_beginning, or 0 when no
        section was found.
    """
    # Try to remove pagebreaks, headers, footers
    fulltext = remove_page_boundary_lines(fulltext)

    refs = []
    status = 0
    # How ref section found flag; stays 0 unless a section start is found
    how_found_start = 0

    # Find start of refs section
    ref_sect_start = get_reference_section_beginning(fulltext)
    if ref_sect_start is None:
        # No References
        status = 4
        write_message("* extract_references_from_fulltext: "
                      "ref_sect_start is None", verbose=2)
    else:
        # A reference section was found, however weak. Propagate how it was
        # detected rather than leaving the flag at its initial value.
        how_found_start = ref_sect_start.get("how_found_start", 0)
        ref_sect_end = find_end_of_reference_section(
            fulltext,
            ref_sect_start["start_line"],
            ref_sect_start["marker"],
            ref_sect_start["marker_pattern"])
        if ref_sect_end is None:
            # No End to refs? Not safe to extract
            status = 5
            write_message("* extract_references_from_fulltext: "
                          "no end to refs!", verbose=2)
        else:
            # The end of the reference section was found.. start extraction
            refs = get_reference_lines(
                fulltext,
                ref_sect_start["start_line"],
                ref_sect_end,
                ref_sect_start["title_string"],
                ref_sect_start["marker_pattern"],
                ref_sect_start["title_marker_same_line"])

    return refs, status, how_found_start
def extract_references_from_string_xml(source, is_only_references=True):
    """Extract references from a string.

    @param source: (string) the document; it is split into lines on newline
        characters before processing.
    @param is_only_references: (bool) when False, the reference section is
        first located inside the document with
        extract_references_from_fulltext. When True (the default), the
        reference lines are rebuilt from the whole of source using the
        detected marker pattern.
    @return: the result of parse_references for the extracted reference
        lines (described by the original documentation as MARCXML).
    """
    docbody = source.split("\n")
    if not is_only_references:
        reflines, _status, _how_found = extract_references_from_fulltext(docbody)
    else:
        refs_info = get_reference_section_beginning(docbody)
        if not refs_info:
            # No section start detected: fall back to the numeration found
            # in the body and treat the whole input as the section.
            refs_info, _unused = find_numeration_in_body(docbody)
            refs_info["start_line"] = 0
            # Store a plain line index, like start_line, not a 1-tuple.
            refs_info["end_line"] = len(docbody) - 1
        reflines = rebuild_reference_lines(docbody, refs_info["marker_pattern"])
    return parse_references(reflines)
def extract_references_from_string_xml(source, is_only_references=True):
    """Extract references from a string.

    @param source: (string) the document, with lines separated by newline
        characters.
    @param is_only_references: (bool) True (the default) to rebuild the
        reference lines from the whole of source using the detected marker
        pattern; False to locate the reference section first via
        extract_references_from_fulltext.
    @return: whatever parse_references produces for the extracted reference
        lines (described by the original documentation as MARCXML).
    """
    docbody = source.split('\n')

    if is_only_references:
        refs_info = get_reference_section_beginning(docbody)
        if not refs_info:
            # No section start detected: use the numeration found in the
            # body and let the section span every line of the input.
            refs_info, dummy = find_numeration_in_body(docbody)
            refs_info['start_line'] = 0
            # No trailing comma here: it would make end_line a 1-tuple
            # instead of a line index.
            refs_info['end_line'] = len(docbody) - 1
        reflines = rebuild_reference_lines(docbody,
                                           refs_info['marker_pattern'])
    else:
        reflines, dummy, dummy = extract_references_from_fulltext(docbody)

    return parse_references(reflines)
def test_no_section(self):
    """Empty input yields no reference section: the result is None."""
    self.assertEqual(get_reference_section_beginning(""), None)
def test_no_section(self):
    """Empty input yields no reference section: the result is None."""
    from invenio.refextract_find import get_reference_section_beginning

    empty_document = ""
    outcome = get_reference_section_beginning(empty_document)
    self.assertEqual(outcome, None)