def test_2_lines_together(self):
    """Check rebuilding when a marker sits in the middle of an input line.

    The second input line carries text, then the ``[2]`` marker, then more
    text.  The expected result attaches the text before the marker to
    reference 1 and starts reference 2 at the marker.
    """
    # Written as u"..." with escaped backslashes instead of ur"...": the
    # ``ur`` prefix is a SyntaxError on Python 3, while this literal has
    # the same value (and the same unicode type on Python 2).
    marker_pattern = u"\\s*(?P<mark>\\[\\s*(?P<marknum>\\d+)\\s*\\])"
    refs = [
        u"[1] hello",
        u"hello2 [2] foo",
    ]
    rebuilt_refs = rebuild_reference_lines(refs, marker_pattern)
    self.assertEqual(rebuilt_refs, [
        u"[1] hello hello2",
        u"[2] foo",
    ])
def test_pagination_non_removal(self):
    """Check rebuilding when markers and their text are on separate lines.

    The marker pattern is anchored to the start of a line.  The input has
    a continuation line after reference 1 and a ``[2]`` marker alone on
    its own line followed by its text; the expected result joins each
    continuation onto the marker line that precedes it.
    """
    # Written as u"..." with escaped backslashes instead of ur"...": the
    # ``ur`` prefix is a SyntaxError on Python 3, while this literal has
    # the same value (and the same unicode type on Python 2).
    marker_pattern = u"^\\s*(?P<mark>\\[\\s*(?P<marknum>\\d+)\\s*\\])"
    refs = [
        u"[1] hello",
        u"hello2",
        u"[2]",
        u"foo",
    ]
    rebuilt_refs = rebuild_reference_lines(refs, marker_pattern)
    self.assertEqual(rebuilt_refs, [
        u"[1] hello hello2",
        u"[2] foo",
    ])
def extract_references_from_string_xml(source, is_only_references=True):
    """Extract references from a string.

    :param source: the document text; it is split into lines on ``"\\n"``.
    :param is_only_references: when false, the reference lines are taken
        from ``extract_references_from_fulltext``.  When true (default),
        the reference-section information is looked up directly in the
        text and the lines are rebuilt with its marker pattern.
    :return: whatever ``parse_references`` returns for the reference
        lines (MARCXML, per the original docstring).
    """
    # NOTE(review): a function with this same name is defined more than
    # once in the visible source; if both definitions share a module the
    # later one shadows the earlier -- confirm and remove the duplicate.
    docbody = source.split("\n")
    if not is_only_references:
        reflines, dummy, dummy = extract_references_from_fulltext(docbody)
    else:
        refs_info = get_reference_section_beginning(docbody)
        if not refs_info:
            # No section start detected: fall back to the numeration
            # found in the body and span the whole document.
            refs_info, dummy = find_numeration_in_body(docbody)
            refs_info["start_line"] = 0
            # Store a plain integer index.  The previous code stored the
            # 1-tuple ``(len(docbody) - 1,)``, inconsistent with the
            # integer ``start_line``.
            refs_info["end_line"] = len(docbody) - 1
        reflines = rebuild_reference_lines(docbody, refs_info["marker_pattern"])
    return parse_references(reflines)
def extract_references_from_string_xml(source, is_only_references=True):
    """Extract references from a string.

    :param source: the document text; it is split into lines on ``'\\n'``.
    :param is_only_references: when false, the reference lines are taken
        from ``extract_references_from_fulltext``.  When true (default),
        the reference-section information is looked up directly in the
        text and the lines are rebuilt with its marker pattern.
    :return: whatever ``parse_references`` returns for the reference
        lines (MARCXML, per the original docstring).
    """
    # NOTE(review): a function with this same name is defined more than
    # once in the visible source; if both definitions share a module the
    # later one shadows the earlier -- confirm and remove the duplicate.
    docbody = source.split('\n')
    if not is_only_references:
        reflines, dummy, dummy = extract_references_from_fulltext(docbody)
    else:
        refs_info = get_reference_section_beginning(docbody)
        if not refs_info:
            # No section start detected: fall back to the numeration
            # found in the body and span the whole document.
            refs_info, dummy = find_numeration_in_body(docbody)
            refs_info['start_line'] = 0
            # Store a plain integer index.  A stray trailing comma here
            # previously turned the value into the 1-tuple
            # ``(len(docbody) - 1,)``, inconsistent with ``start_line``.
            refs_info['end_line'] = len(docbody) - 1
        reflines = rebuild_reference_lines(docbody, refs_info['marker_pattern'])
    return parse_references(reflines)