def extract_references_from_file_xml(path, recid=1):
    """Extract the references found in a local file.

    The document text is first extracted with the default settings.  If
    no reference lines are found in it, the text is extracted a second
    time with ``keep_layout=True`` and searched again.

    :param path: path of the file to process.
    :param recid: forwarded unchanged to ``parse_references``.
    :raises FullTextNotAvailable: if ``path`` is not an existing file.
    :return: the value returned by ``parse_references`` for the reference
        lines found (described as MARCXML by the original docstring).
    """
    file_exists = os.path.isfile(path)
    if not file_exists:
        raise FullTextNotAvailable()

    # First pass: default plain-text extraction.
    text_lines, _skip = get_plaintext_document_body(path)
    ref_lines, _skip, _skip = extract_references_from_fulltext(text_lines)

    if len(ref_lines) == 0:
        # Second pass: nothing was found, so retry on layout-preserving text.
        text_lines, _skip = get_plaintext_document_body(path, keep_layout=True)
        ref_lines, _skip, _skip = extract_references_from_fulltext(text_lines)

    return parse_references(ref_lines, recid=recid)
def extract_references_from_file_xml(path, recid=1):
    """Return the parsed references of the document stored at ``path``.

    Two extraction attempts are made, in this order: one with the default
    plain-text conversion and, only if that yields no reference lines, one
    with ``keep_layout=True``.  Whatever the last attempt produced is
    handed to ``parse_references``.

    :param path: path of the file to process.
    :param recid: forwarded unchanged to ``parse_references``.
    :raises FullTextNotAvailable: if ``path`` is not an existing file.
    :return: the value returned by ``parse_references`` (described as
        MARCXML by the original docstring).
    """
    if not os.path.isfile(path):
        raise FullTextNotAvailable()

    # Keyword arguments for each successive text-extraction attempt.
    attempts = ({}, {"keep_layout": True})
    for body_options in attempts:
        body, _unused = get_plaintext_document_body(path, **body_options)
        found, _unused, _unused = extract_references_from_fulltext(body)
        if len(found) != 0:
            # References located; no need for the fallback attempt.
            break

    return parse_references(found, recid=recid)
def extract_references_from_string_xml(source, is_only_references=True):
    """Extract references from a string.

    :param source: the document text; it is split into lines on ``"\\n"``.
    :param is_only_references: when false, ``source`` is treated as a full
        text and handed to ``extract_references_from_fulltext``.  When true
        (the default), the reference lines are rebuilt directly from
        ``source`` using the detected marker pattern.
    :return: the value returned by ``parse_references`` for the reference
        lines found (described as MARCXML by the original docstring).
    """
    docbody = source.split("\n")
    if not is_only_references:
        reflines, dummy, dummy = extract_references_from_fulltext(docbody)
    else:
        refs_info = get_reference_section_beginning(docbody)
        if not refs_info:
            # No section start detected: fall back to the numeration found
            # in the body and let the section span the whole document.
            refs_info, dummy = find_numeration_in_body(docbody)
            refs_info["start_line"] = 0
            # Store a plain integer index.  The previous code had a stray
            # trailing comma here, which stored a one-element tuple.
            refs_info["end_line"] = len(docbody) - 1
        reflines = rebuild_reference_lines(docbody, refs_info["marker_pattern"])
    return parse_references(reflines)
def extract_references_from_string_xml(source, is_only_references=True):
    """Extract references from a string.

    :param source: the document text; it is split into lines on ``'\\n'``.
    :param is_only_references: when false, ``source`` is treated as a full
        text and handed to ``extract_references_from_fulltext``.  When true
        (the default), the reference lines are rebuilt directly from
        ``source`` using the detected marker pattern.
    :return: the value returned by ``parse_references`` for the reference
        lines found (described as MARCXML by the original docstring).
    """
    docbody = source.split('\n')
    if not is_only_references:
        reflines, dummy, dummy = extract_references_from_fulltext(docbody)
    else:
        refs_info = get_reference_section_beginning(docbody)
        if not refs_info:
            # No section start detected: fall back to the numeration found
            # in the body and let the section span the whole document.
            refs_info, dummy = find_numeration_in_body(docbody)
            refs_info['start_line'] = 0
            # The trailing comma that used to follow this expression turned
            # the value into a one-element tuple; store the integer index.
            refs_info['end_line'] = len(docbody) - 1
        reflines = rebuild_reference_lines(docbody, refs_info['marker_pattern'])
    return parse_references(reflines)
def extract_references_from_file(path, recid=None):
    """Extract the references found in a local file as a record object.

    The document text is extracted with the default settings first and,
    only if no reference lines are found, once more with
    ``keep_layout=True``.  The base name of ``path`` is then recorded in a
    ``'v'`` subfield of the first ``'999C6'`` field of the parsed result.

    :param path: path of the file to process.
    :param recid: forwarded unchanged to ``parse_references``.
    :raises FullTextNotAvailable: if ``path`` is not an existing file.
    :return: the object returned by ``parse_references`` (a bibrecord,
        per the original docstring), with the file name subfield added.
    """
    if not os.path.isfile(path):
        raise FullTextNotAvailable()

    # Keyword arguments for each successive text-extraction attempt.
    for body_options in ({}, {"keep_layout": True}):
        body, _unused = get_plaintext_document_body(path, **body_options)
        found, _unused, _unused = extract_references_from_fulltext(body)
        if len(found) != 0:
            break

    record = parse_references(found, recid=recid)

    # Tag the first '999C6' field with the name of the source file.
    source_name = os.path.basename(path)
    first_field = record['999C6'][0]
    first_field.add_subfield('v', source_name)
    return record