Exemplo n.º 1
0
def build_authors_kb(fpath):
    """Build the authors knowledge base as a list of (seek, repl) pairs.

    @param fpath: either a (string) path to the KB file, or an iterable
     yielding the raw KB lines directly.
    @return: (list) of (seek, repl) tuples, in the order they appear in
     the KB. Lines starting with '#' and lines not matching re_kb_line
     are ignored.
    @raise IOError: when fpath is a path and the file cannot be opened.
    """
    opened_here = isinstance(fpath, six.string_types)

    if opened_here:
        try:
            kb_lines = open(fpath, "r")
        except IOError:
            # The KB could not be opened: report it, then give up.
            write_message("Error: Could not build list of authors - failed "
                          "to read from KB %(kb)s." % {'kb': fpath},
                          sys.stderr, verbose=0)
            raise IOError("Error: Unable to open authors kb '%s'" % fpath)
    else:
        # The caller supplied the lines itself and keeps ownership of them.
        kb_lines = fpath

    pairs = []
    try:
        for kb_line in kb_lines:
            # Comment line in the KB
            if kb_line.startswith('#'):
                continue

            # Extract the seek->replace terms from this KB line:
            found = re_kb_line.search(kb_line.decode('utf-8'))
            if not found:
                continue
            pairs.append((found.group('seek'), found.group('repl')))
    finally:
        # Only close what this function opened.
        if opened_here:
            kb_lines.close()

    return pairs
Exemplo n.º 2
0
def build_authors_kb(fpath):
    """Build the authors knowledge base as a list of (seek, repl) pairs.

    @param fpath: either a (string) path to the KB file, or an iterable
     yielding the raw KB lines directly.
    @return: (list) of (seek, repl) tuples, in the order they appear in
     the KB. Lines starting with '#' and lines not matching re_kb_line
     are ignored.
    @raise IOError: when fpath is a path and the file cannot be opened.
    """
    opened_here = isinstance(fpath, six.string_types)

    if opened_here:
        try:
            kb_lines = open(fpath, "r")
        except IOError:
            # The KB could not be opened: report it, then give up.
            write_message("Error: Could not build list of authors - failed "
                          "to read from KB %(kb)s." % {'kb': fpath},
                          sys.stderr, verbose=0)
            raise IOError("Error: Unable to open authors kb '%s'" % fpath)
    else:
        # The caller supplied the lines itself and keeps ownership of them.
        kb_lines = fpath

    pairs = []
    try:
        for kb_line in kb_lines:
            # Comment line in the KB
            if kb_line.startswith('#'):
                continue

            # Extract the seek->replace terms from this KB line:
            found = re_kb_line.search(kb_line.decode('utf-8'))
            if not found:
                continue
            pairs.append((found.group('seek'), found.group('repl')))
    finally:
        # Only close what this function opened.
        if opened_here:
            kb_lines.close()

    return pairs
Exemplo n.º 3
0
def extract_one(config, pdf_path):
    """Extract references from one file

    @param config: options object; only treat_as_reference_section is
     read here.
    @param pdf_path: (string) path of the document to process.
    @return: the record returned by extract_references_from_string() or
     extract_references_from_file(), depending on the config.
    """
    if config.treat_as_reference_section:
        # The whole file is taken to be the reference section: read it as
        # UTF-8 text. The 'with' block guarantees the handle is closed even
        # if reading or decoding fails.
        with open(pdf_path) as source:
            docbody = source.read().decode('utf-8')
        record = extract_references_from_string(docbody)
    else:
        write_message("* processing pdffile: %s" % pdf_path, verbose=2)
        record = extract_references_from_file(pdf_path)

    return record
Exemplo n.º 4
0
def extract_one(config, pdf_path):
    """Extract references from one file

    @param config: options object; only treat_as_reference_section is
     read here.
    @param pdf_path: (string) path of the document to process.
    @return: the record returned by extract_references_from_string() or
     extract_references_from_file(), depending on the config.
    """
    if config.treat_as_reference_section:
        # The whole file is taken to be the reference section: read it as
        # UTF-8 text. The 'with' block guarantees the handle is closed even
        # if reading or decoding fails.
        with open(pdf_path) as source:
            docbody = source.read().decode('utf-8')
        record = extract_references_from_string(docbody)
    else:
        write_message("* processing pdffile: %s" % pdf_path, verbose=2)
        record = extract_references_from_file(pdf_path)

    return record
Exemplo n.º 5
0
def parse_reference_line(ref_line, kbs, bad_titles_count=None):
    """Parse one reference line

    @param ref_line: a string representing a single reference bullet
    @param kbs: knowledge bases, handed on to the tagging and
     transformation helpers
    @param bad_titles_count: (dict) counter passed to, and returned by,
     tag_reference_line(). A fresh dict is created when it is omitted, so
     that separate calls never share state through the default value.
    @return: (tuple) the split citations, the line marker, the counts
     returned by parse_tagged_reference_line() and the bad titles counter
    """
    # A mutable default ({}) would be created once and shared by every call
    # that relies on it; build a new one per call instead.
    if bad_titles_count is None:
        bad_titles_count = {}

    # Strip the 'marker' (e.g. [1]) from this reference line:
    (line_marker, ref_line) = remove_reference_line_marker(ref_line)
    # Find DOI sections in citation
    (ref_line, identified_dois) = identify_and_tag_DOI(ref_line)
    # Identify and replace URLs in the line:
    (ref_line, identified_urls) = identify_and_tag_URLs(ref_line)
    # Tag <cds.JOURNAL>, etc.
    tagged_line, bad_titles_count = tag_reference_line(ref_line,
                                                       kbs,
                                                       bad_titles_count)

    # Debug print tagging (authors, titles, volumes, etc.)
    write_message('* tags %r' % tagged_line, verbose=9)

    # Using the recorded information, create a MARC XML representation
    # of the rebuilt line:
    # At the same time, get stats of citations found in the reference line
    # (titles, urls, etc):
    citation_elements, line_marker, counts = \
        parse_tagged_reference_line(line_marker,
                                    tagged_line,
                                    identified_dois,
                                    identified_urls)

    # Transformations on elements (applied in this fixed order)
    citation_elements = split_volume_from_journal(citation_elements)
    citation_elements = format_volume(citation_elements)
    citation_elements = handle_special_journals(citation_elements, kbs)
    citation_elements = format_report_number(citation_elements)
    citation_elements = format_author_ed(citation_elements)
    citation_elements = look_for_books(citation_elements, kbs)
    citation_elements = format_hep(citation_elements)
    citation_elements = remove_b_for_nucl_phys(citation_elements)
    citation_elements = mangle_volume(citation_elements)

    # Split the reference in multiple ones if needed
    splitted_citations = split_citations(citation_elements)

    # Remove references with only misc text
    splitted_citations = remove_invalid_references(splitted_citations)
    # Find year
    splitted_citations = add_year_elements(splitted_citations)
    # For debugging purposes
    print_citations(splitted_citations, line_marker)

    return splitted_citations, line_marker, counts, bad_titles_count
Exemplo n.º 6
0
def convert_PDF_to_plaintext(fpath, keep_layout=False):
    """ Convert PDF to txt using pdftotext

    Take the path to a PDF file and run pdftotext for this file, capturing
    the output.
    @param fpath: (string) path to the PDF file
    @return: (list) of unicode strings (contents of the PDF file translated
    into plaintext; each string is a line in the document.)
    """
    if keep_layout:
        layout_option = "-layout"
    else:
        layout_option = "-raw"
    status = 0
    doclines = []
    # Pattern to check for lines with a leading page-break character.
    # If this pattern is matched, we want to split the page-break into
    # its own line because we rely upon this for trying to strip headers
    # and footers, and for some other pattern matching.
    p_break_in_line = re.compile(ur'^\s*\f(.+)$', re.UNICODE)
    # build pdftotext command:
    cmd_pdftotext = [
        CFG_PATH_PDFTOTEXT, layout_option, "-q", "-enc", "UTF-8", fpath, "-"
    ]
    write_message("* %s" % ' '.join(cmd_pdftotext), verbose=2)
    # open pipe to pdftotext:
    pipe_pdftotext = subprocess.Popen(cmd_pdftotext, stdout=subprocess.PIPE)

    # read back results:
    for docline in pipe_pdftotext.stdout:
        unicodeline = docline.decode("utf-8")
        # Check for a page-break in this line:
        m_break_in_line = p_break_in_line.match(unicodeline)
        if m_break_in_line is None:
            # There was no page-break in this line. Just add the line:
            doclines.append(unicodeline)
        else:
            # If there was a page-break character in the same line as some
            # text, split it out into its own line so that we can later
            # try to find headers and footers:
            doclines.append(u"\f")
            doclines.append(m_break_in_line.group(1))

    write_message("* convert_PDF_to_plaintext found: " \
                     "%s lines of text" % len(doclines), verbose=2)

    # finally, check conversion result not bad:
    if pdftotext_conversion_is_bad(doclines):
        status = 2
        doclines = []

    return (doclines, status)
Exemplo n.º 7
0
def load_kb(path, builder):
    """Load a knowledge base from an iterable, the database or a file.

    @param path: a string-like location ('kb:<name>' selects the database
     loader, anything else the file loader), or any object without a
     startswith attribute, which is handed to the iterable loader.
    @param builder: passed through unchanged to the selected loader.
    @return: whatever the selected loader returns.
    """
    # Duck-typing probe: only string-like objects expose startswith.
    try:
        path.startswith
    except AttributeError:
        is_location = False
    else:
        is_location = True

    if not is_location:
        write_message("Loading kb from array", verbose=3)
        return load_kb_from_iterable(path, builder)

    write_message("Loading kb from %s" % path, verbose=3)
    db_prefix = 'kb:'
    if path.startswith(db_prefix):
        # Strip the prefix; the remainder goes to the database loader.
        return load_kb_from_db(path[len(db_prefix):], builder)
    return load_kb_from_file(path, builder)
Exemplo n.º 8
0
def convert_PDF_to_plaintext(fpath, keep_layout=False):
    """ Convert PDF to txt using pdftotext

    Take the path to a PDF file and run pdftotext for this file, capturing
    the output.
    @param fpath: (string) path to the PDF file
    @return: (list) of unicode strings (contents of the PDF file translated
    into plaintext; each string is a line in the document.)
    """
    if keep_layout:
        layout_option = "-layout"
    else:
        layout_option = "-raw"
    status = 0
    doclines = []
    # Pattern to check for lines with a leading page-break character.
    # If this pattern is matched, we want to split the page-break into
    # its own line because we rely upon this for trying to strip headers
    # and footers, and for some other pattern matching.
    p_break_in_line = re.compile(ur'^\s*\f(.+)$', re.UNICODE)
    # build pdftotext command:
    cmd_pdftotext = [CFG_PATH_PDFTOTEXT, layout_option, "-q",
                      "-enc", "UTF-8", fpath, "-"]
    write_message("* %s" % ' '.join(cmd_pdftotext), verbose=2)
    # open pipe to pdftotext:
    pipe_pdftotext = subprocess.Popen(cmd_pdftotext, stdout=subprocess.PIPE)

    # read back results:
    for docline in pipe_pdftotext.stdout:
        unicodeline = docline.decode("utf-8")
        # Check for a page-break in this line:
        m_break_in_line = p_break_in_line.match(unicodeline)
        if m_break_in_line is None:
            # There was no page-break in this line. Just add the line:
            doclines.append(unicodeline)
        else:
            # If there was a page-break character in the same line as some
            # text, split it out into its own line so that we can later
            # try to find headers and footers:
            doclines.append(u"\f")
            doclines.append(m_break_in_line.group(1))

    write_message("* convert_PDF_to_plaintext found: " \
                     "%s lines of text" % len(doclines), verbose=2)

    # finally, check conversion result not bad:
    if pdftotext_conversion_is_bad(doclines):
        status = 2
        doclines = []

    return (doclines, status)
Exemplo n.º 9
0
def extract_references_from_fulltext(fulltext):
    """Locate and extract the reference section from a fulltext document.

       @param fulltext: (list) of strings, whereby each string is a line of
        the document.
       @return: (tuple) of (refs, status, how_found_start). refs is the
        (list) of extracted reference lines, e.g. a line could be
        '[19] Wilson, A. Unpublished (1986).'; it is empty when status is
        not 0. status is 0 on success, 4 when no start of a reference
        section was found and 5 when no end was found. how_found_start is
        always 0 here.
    """
    # How ref section found flag (never modified in this function)
    how_found_start = 0

    # Try to remove pagebreaks, headers, footers
    fulltext = remove_page_boundary_lines(fulltext)

    # Find start of refs section
    section = get_reference_section_beginning(fulltext)
    if section is None:
        # No References
        write_message(
            "* extract_references_from_fulltext: "
            "ref_sect_start is None",
            verbose=2)
        return [], 4, how_found_start

    # A reference section was found, however weak: look for its end
    section_end = find_end_of_reference_section(fulltext,
                                                section["start_line"],
                                                section["marker"],
                                                section["marker_pattern"])
    if section_end is None:
        # No End to refs? Not safe to extract
        write_message(
            "* extract_references_from_fulltext: "
            "no end to refs!",
            verbose=2)
        return [], 5, how_found_start

    # Both boundaries are known.. start extraction
    extracted = get_reference_lines(fulltext,
                                    section["start_line"],
                                    section_end,
                                    section["title_string"],
                                    section["marker_pattern"],
                                    section["title_marker_same_line"])
    return extracted, 0, how_found_start
Exemplo n.º 10
0
def main(config, args, run):
    """Main wrapper function for begin_extraction, and is
    always accessed in a standalone/independent way. (i.e. calling main
    will cause refextract to run in an independent mode)

    @param config: options object; verbosity, version and help are read
     here and the object is passed on to run()
    @param args: the input files; an empty value is reported via usage()
    @param run: callable invoked as run(config, args)
    """
    # Flag as running out of bibtask
    global RUNNING_INDEPENDENTLY
    RUNNING_INDEPENDENTLY = True

    # range(0, 10) accepts 0..9 only, so the message names that interval.
    if config.verbosity not in range(0, 10):
        usage("Error: Verbosity must be an integer between 0 and 9")

    setup_loggers(config.verbosity)

    if config.version:
        # version message and exit
        write_message(__revision__, verbose=0)
        halt(exit_code=0)

    if config.help:
        usage()

    if not args:
        # no files provided for reference extraction - error message
        usage("Error: No valid input file specified (file1 [file2 ...])")

    try:
        run(config, args)
        write_message("Extraction complete", verbose=2)
    except StandardError as e:
        # Full traceback only at the highest verbosity; [:-1] removes the
        # extra trailing '\n' of format_exc()
        write_message(traceback.format_exc()[:-1], verbose=9)
        write_message("Error: %s" % e, verbose=0)
        halt(exit_code=1)
Exemplo n.º 11
0
def main(config, args, run):
    """Main wrapper function for begin_extraction, and is
    always accessed in a standalone/independent way. (i.e. calling main
    will cause refextract to run in an independent mode)

    @param config: options object; verbosity, version and help are read
     here and the object is passed on to run()
    @param args: the input files; an empty value is reported via usage()
    @param run: callable invoked as run(config, args)
    """
    # Flag as running out of bibtask
    global RUNNING_INDEPENDENTLY
    RUNNING_INDEPENDENTLY = True

    # range(0, 10) accepts 0..9 only, so the message names that interval.
    if config.verbosity not in range(0, 10):
        usage("Error: Verbosity must be an integer between 0 and 9")

    setup_loggers(config.verbosity)

    if config.version:
        # version message and exit
        write_message(__revision__, verbose=0)
        halt(exit_code=0)

    if config.help:
        usage()

    if not args:
        # no files provided for reference extraction - error message
        usage("Error: No valid input file specified (file1 [file2 ...])")

    try:
        run(config, args)
        write_message("Extraction complete", verbose=2)
    except StandardError as e:
        # Full traceback only at the highest verbosity; [:-1] removes the
        # extra trailing '\n' of format_exc()
        write_message(traceback.format_exc()[:-1], verbose=9)
        write_message("Error: %s" % e, verbose=0)
        halt(exit_code=1)
Exemplo n.º 12
0
def print_citations(splitted_citations, line_marker):
    """Log the split citations and their elements at verbosity 9.

    @param splitted_citations: iterable of citations, each an iterable of
     elements that can be indexed with 'type'
    @param line_marker: the marker of the reference line, logged as is
    """
    debug_level = 9
    write_message('* splitted_citations', verbose=debug_level)
    write_message('  * line marker %s' % line_marker, verbose=debug_level)
    for elements in splitted_citations:
        write_message("  * elements", verbose=debug_level)
        for element in elements:
            # One line per element: its type followed by its repr
            description = '    * %s %s' % (element['type'], repr(element))
            write_message(description, verbose=debug_level)
Exemplo n.º 13
0
def make_collaborations_regex_str():
    """ From the authors knowledge-base, construct a single regex holding the or'd possibilities of patterns
    which should be included in $h subfields. The word 'Collaboration' is also converted to 'Coll', and
    used in finding matches. Letter case is not considered during the search.
    @return: (string) The single pattern built from each line in the author knowledge base.
    """
    def add_to_auth_list(s):
        """Strip the line, replace spaces with 'backslash s' and append 'the'
        to the start and 's' to the end. Add the prepared line to the list of
        extra kb authors."""
        s = ur"(?:the\s)?" + s.strip().replace(u' ', ur'\s') + u"s?"
        auths.append(s)

    ## Build the 'or'd regular expression of the author lines in the author knowledge base
    auths = []
    fpath = CFG_REFEXTRACT_KBS['collaborations']

    try:
        fh = open(fpath, "r")
    except IOError:
        ## problem opening KB for reading, or problem while reading from it:
        emsg = """Error: Could not build knowledge base containing """ \
               """author patterns - failed """ \
               """to read from KB %(kb)s.\n""" \
               % {'kb' : fpath}
        write_message(emsg, sys.stderr, verbose=0)
        raise IOError("Error: Unable to open collaborations kb '%s'" % fpath)

    for line_num, rawline in enumerate(fh):
        try:
            rawline = rawline.decode("utf-8")
        except UnicodeError:
            write_message("*** Unicode problems in %s for line %d"
                             % (fpath, line_num), sys.stderr, verbose=0)
            raise UnicodeError("Error: Unable to parse collaboration kb (line: %s)" % str(line_num))
        if rawline.strip() and rawline[0].strip() != '#':
            add_to_auth_list(rawline)
            ## Shorten collaboration to 'coll'
            if rawline.lower().endswith('collaboration\n'):
                coll_version = rawline[:rawline.lower().find(u'collaboration\n')] + ur"coll[\.\,]"
                add_to_auth_list(coll_version.strip().replace(' ', r'\s') + u"s?")

    author_match_re = ""
    if len(auths) > 0:
        author_match_re = u'|'.join([u"(?:" + a + u")" for a in auths])
        author_match_re = ur"(?:(?:[\(\"]?(?P<extra_auth>" + \
            author_match_re + ur")[\)\"]?[\,\.]?\s?(?:and\s)?)+)"

    return author_match_re
Exemplo n.º 14
0
def extract_references_from_fulltext(fulltext):
    """Locate and extract the reference section from a fulltext document.

       @param fulltext: (list) of strings, whereby each string is a line of
        the document.
       @return: (tuple) of (refs, status, how_found_start). refs is the
        (list) of extracted reference lines, e.g. a line could be
        '[19] Wilson, A. Unpublished (1986).'; it is empty when status is
        not 0. status is 0 on success, 4 when no start of a reference
        section was found and 5 when no end was found. how_found_start is
        always 0 here.
    """
    # How ref section found flag (never modified in this function)
    how_found_start = 0

    # Try to remove pagebreaks, headers, footers
    fulltext = remove_page_boundary_lines(fulltext)

    # Find start of refs section
    section = get_reference_section_beginning(fulltext)
    if section is None:
        # No References
        write_message("* extract_references_from_fulltext: "
                      "ref_sect_start is None", verbose=2)
        return [], 4, how_found_start

    # A reference section was found, however weak: look for its end
    section_end = find_end_of_reference_section(fulltext,
                                                section["start_line"],
                                                section["marker"],
                                                section["marker_pattern"])
    if section_end is None:
        # No End to refs? Not safe to extract
        write_message("* extract_references_from_fulltext: "
                      "no end to refs!", verbose=2)
        return [], 5, how_found_start

    # Both boundaries are known.. start extraction
    extracted = get_reference_lines(fulltext,
                                    section["start_line"],
                                    section_end,
                                    section["title_string"],
                                    section["marker_pattern"],
                                    section["title_marker_same_line"],
                                    section["marker"])
    return extracted, 0, how_found_start
Exemplo n.º 15
0
def make_collaborations_regex_str():
    """ From the authors knowledge-base, construct a single regex holding the or'd possibilities of patterns
    which should be included in $h subfields. The word 'Collaboration' is also converted to 'Coll', and
    used in finding matches. Letter case is not considered during the search.
    @return: (string) The single pattern built from each line in the author knowledge base.
    """

    def add_to_auth_list(s):
        """Strip the line, replace spaces with 'backslash s' and append 'the'
        to the start and 's' to the end. Add the prepared line to the list of
        extra kb authors."""
        s = ur"(?:the\s)?" + s.strip().replace(u" ", ur"\s") + u"s?"
        auths.append(s)

    ## Build the 'or'd regular expression of the author lines in the author knowledge base
    auths = []
    fpath = CFG_REFEXTRACT_KBS["collaborations"]

    try:
        fh = open(fpath, "r")
    except IOError:
        ## problem opening KB for reading, or problem while reading from it:
        emsg = (
            """Error: Could not build knowledge base containing """
            """author patterns - failed """
            """to read from KB %(kb)s.\n""" % {"kb": fpath}
        )
        write_message(emsg, sys.stderr, verbose=0)
        raise IOError("Error: Unable to open collaborations kb '%s'" % fpath)

    for line_num, rawline in enumerate(fh):
        try:
            rawline = rawline.decode("utf-8")
        except UnicodeError:
            write_message("*** Unicode problems in %s for line %d" % (fpath, line_num), sys.stderr, verbose=0)
            raise UnicodeError("Error: Unable to parse collaboration kb (line: %s)" % str(line_num))
        if rawline.strip() and rawline[0].strip() != "#":
            add_to_auth_list(rawline)
            ## Shorten collaboration to 'coll'
            if rawline.lower().endswith("collaboration\n"):
                coll_version = rawline[: rawline.lower().find(u"collaboration\n")] + ur"coll[\.\,]"
                add_to_auth_list(coll_version.strip().replace(" ", r"\s") + u"s?")

    author_match_re = ""
    if len(auths) > 0:
        author_match_re = u"|".join([u"(?:" + a + u")" for a in auths])
        author_match_re = ur"(?:(?:[\(\"]?(?P<extra_auth>" + author_match_re + ur")[\)\"]?[\,\.]?\s?(?:and\s)?)+)"

    return author_match_re
Exemplo n.º 16
0
def load_kb(path, builder):
    """Load a knowledge base from an iterable, the database, records or
    a file.

    @param path: a string-like location, or any object without a
     startswith attribute, which is handed to the iterable loader. For
     strings, 'kb:<name>' selects the database loader, 'records:<name>'
     the records loader and anything else the file loader; the prefix is
     removed before the name is passed on.
    @param builder: passed through unchanged to the selected loader.
    @return: whatever the selected loader returns.
    """
    try:
        path.startswith
    except AttributeError:
        write_message("Loading kb from array", verbose=3)
        return load_kb_from_iterable(path, builder)
    else:
        write_message("Loading kb from %s" % path, verbose=3)
        kb_start = 'kb:'
        records_start = 'records:'
        if path.startswith(kb_start):
            return load_kb_from_db(path[len(kb_start):], builder)
        elif path.startswith(records_start):
            # Strip the full 'records:' prefix (not just len('kb:')
            # characters), otherwise the loader receives 'ords:<name>'.
            return load_kb_from_records(path[len(records_start):], builder)
        else:
            return load_kb_from_file(path, builder)
Exemplo n.º 17
0
def begin_extraction(config, files):
    """Starts the core extraction procedure. [Entry point from main]

       Every file is processed with extract_one() and the resulting
       records are then written out together by write_references().
       @param config: options object, passed on unchanged to both helpers
       @param files: (list) of paths of the documents to process, in order
    """
    # Store records here
    records = []

    for position, path in enumerate(files, 1):
        # Announce the document extraction number
        progress = "Extracting %d of %d" % (position, len(files))
        write_message(progress, verbose=1)
        # Parse references
        records.append(extract_one(config, path))

    # Write our references
    write_references(config, records)
Exemplo n.º 18
0
def halt(err=StandardError, msg=None, exit_code=1):
    """ Stop extraction, either by exiting the process or by raising,
    depending on the global RUNNING_INDEPENDENTLY flag.
    @param err: (exception class) instantiated with msg and raised when
    not running independently
    @param msg: (string) The brief error message; written to stderr when
    running independently, otherwise passed (stripped) to
    task_update_progress()
    @param exit_code: (integer) exit status handed to sys.exit(). This is
    only used when running independently."""
    if not RUNNING_INDEPENDENTLY:
        # Not standalone: report the message (if any) and raise
        if msg:
            task_update_progress(msg.strip())
        raise err(msg)

    # Standalone: print the message (if any) and leave the process
    if msg:
        write_message(msg, stream=sys.stderr, verbose=0)
    sys.exit(exit_code)
Exemplo n.º 19
0
def begin_extraction(config, files):
    """Starts the core extraction procedure. [Entry point from main]

       Every file is processed with extract_one() and the resulting
       records are then written out together by write_references().
       @param config: options object, passed on unchanged to both helpers
       @param files: (list) of paths of the documents to process, in order
    """
    # Store records here
    records = []

    for position, path in enumerate(files, 1):
        # Announce the document extraction number
        progress = "Extracting %d of %d" % (position, len(files))
        write_message(progress, verbose=1)
        # Parse references
        records.append(extract_one(config, path))

    # Write our references
    write_references(config, records)
Exemplo n.º 20
0
def halt(err=StandardError, msg=None, exit_code=1):
    """ Stop extraction, either by exiting the process or by raising,
    depending on the global RUNNING_INDEPENDENTLY flag.
    @param err: (exception class) instantiated with msg and raised when
    not running independently
    @param msg: (string) The brief error message; written to stderr when
    running independently, otherwise passed (stripped) to
    task_update_progress()
    @param exit_code: (integer) exit status handed to sys.exit(). This is
    only used when running independently."""
    if not RUNNING_INDEPENDENTLY:
        # Not standalone: report the message (if any) and raise
        if msg:
            task_update_progress(msg.strip())
        raise err(msg)

    # Standalone: print the message (if any) and leave the process
    if msg:
        write_message(msg, stream=sys.stderr, verbose=0)
    sys.exit(exit_code)
Exemplo n.º 21
0
def build_books_kb(fpath):
    """Build the books knowledge base as a dictionary.

    @param fpath: either a (string) path to a '|' delimited KB file, or an
     iterable yielding already split lines (sequences of fields).
    @return: (dict) mapping the upper-cased second field of each line to
     the whole line. Lines without a second field are reported and
     skipped.
    @raise IOError: when fpath is a path and the KB cannot be opened.
    """
    from_path = isinstance(fpath, six.string_types)

    if from_path:
        try:
            write_message('Loading books kb from %s' % fpath, verbose=3)
            fh = open(fpath, "r")
            rows = csv.reader(fh, delimiter='|', lineterminator=';')
        except IOError:
            # problem opening KB for reading
            raise IOError("Error: Could not build list of books - failed "
                          "to read from KB %(kb)s." % {'kb': fpath})
    else:
        # The caller supplied the rows itself
        rows = fpath

    books = {}
    try:
        for row in rows:
            try:
                key = row[1].upper()
            except IndexError:
                write_message('Invalid line in books kb %s' % row, verbose=1)
            else:
                books[key] = row
    finally:
        # Only close what this function opened
        if from_path:
            fh.close()
            write_message('Loaded books kb', verbose=3)

    return books
Exemplo n.º 22
0
def build_books_kb(fpath):
    """Build the books knowledge base as a dictionary.

    @param fpath: either a (string) path to a '|' delimited KB file, or an
     iterable yielding already split lines (sequences of fields).
    @return: (dict) mapping the upper-cased second field of each line to
     the whole line. Lines without a second field are reported and
     skipped.
    @raise IOError: when fpath is a path and the KB cannot be opened.
    """
    from_path = isinstance(fpath, six.string_types)

    if from_path:
        try:
            write_message('Loading books kb from %s' % fpath, verbose=3)
            fh = open(fpath, "r")
            rows = csv.reader(fh, delimiter='|', lineterminator=';')
        except IOError:
            # problem opening KB for reading
            raise IOError("Error: Could not build list of books - failed "
                          "to read from KB %(kb)s." % {'kb': fpath})
    else:
        # The caller supplied the rows itself
        rows = fpath

    books = {}
    try:
        for row in rows:
            try:
                key = row[1].upper()
            except IndexError:
                write_message('Invalid line in books kb %s' % row, verbose=1)
            else:
                books[key] = row
    finally:
        # Only close what this function opened
        if from_path:
            fh.close()
            write_message('Loaded books kb', verbose=3)

    return books
Exemplo n.º 23
0
def build_publishers_kb(fpath):
    if isinstance(fpath, six.string_types):
        fpath_needs_closing = True
        try:
            write_message('Loading publishers kb from %s' % fpath, verbose=3)
            fh = open(fpath, "r")
            source = csv.reader(fh, delimiter='|', lineterminator='\n')
        except IOError:
            # problem opening KB for reading, or problem while reading from it:
            emsg = "Error: Could not build list of publishers - failed " \
                   "to read from KB %(kb)s." % {'kb' : fpath}
            raise IOError(emsg)
    else:
        fpath_needs_closing = False
        source = fpath

    try:
        publishers = {}
        for line in source:
            try:
                pattern = re.compile(ur'(\b|^)%s(\b|$)' % line[0], re.I|re.U)
                publishers[line[0]] = {'pattern': pattern, 'repl': line[1]}
            except IndexError:
                write_message('Invalid line in books kb %s' % line, verbose=1)
    finally:
        if fpath_needs_closing:
            fh.close()
            write_message('Loaded publishers kb', verbose=3)

    return publishers
Exemplo n.º 24
0
def build_publishers_kb(fpath):
    if isinstance(fpath, six.string_types):
        fpath_needs_closing = True
        try:
            write_message('Loading publishers kb from %s' % fpath, verbose=3)
            fh = open(fpath, "r")
            source = csv.reader(fh, delimiter='|', lineterminator='\n')
        except IOError:
            # problem opening KB for reading, or problem while reading from it:
            emsg = "Error: Could not build list of publishers - failed " \
                   "to read from KB %(kb)s." % {'kb' : fpath}
            raise IOError(emsg)
    else:
        fpath_needs_closing = False
        source = fpath

    try:
        publishers = {}
        for line in source:
            try:
                pattern = re.compile(ur'(\b|^)%s(\b|$)' % line[0], re.I | re.U)
                publishers[line[0]] = {'pattern': pattern, 'repl': line[1]}
            except IndexError:
                write_message('Invalid line in books kb %s' % line, verbose=1)
    finally:
        if fpath_needs_closing:
            fh.close()
            write_message('Loaded publishers kb', verbose=3)

    return publishers
Exemplo n.º 25
0
def filter_processed_references(out):
    """Apply filters to the reference lines found - to remove junk

    @param out: (string) the processed references, one line per '\\n'
    @return: (string) the lines kept by restrict_m_subfields(), each
     right-stripped, with empty lines dropped, joined by newlines
    """
    reference_lines = out.split('\n')

    # restrict_m_subfields() hands back a count (subtracted from the "m"
    # total below) and the remaining lines; presumably it drops too long
    # and too short m tags - see that helper.
    m_restricted, ref_lines = restrict_m_subfields(reference_lines)

    if m_restricted:
        a_tag = re.compile('\<subfield code=\"a\"\>(.*?)\<\/subfield\>')
        for i in range(len(ref_lines)):
            # Checks to see that the datafield has the attribute ind2="6",
            # Before looking to see if the subfield code attribute is 'a'
            if ref_lines[i].find('<datafield tag="999" ind1="C" ind2="6">') != -1 \
                and (len(ref_lines) - 1) > i:
                # For each line in this datafield element, try to find the subfield whose code attribute is 'a'
                # NOTE(review): the loop only runs while the current line
                # *contains* '</datafield>'. Unless the opening and closing
                # tags share a line, the body is never entered. It was
                # presumably meant to scan until the closing tag (== -1);
                # confirm before changing, since that would enable the
                # rewrite of the "a" subfield below.
                while ref_lines[i].find('</datafield>') != -1 and (len(ref_lines) - 1) > i:
                    i += 1
                    # <subfield code="a">Invenio/X.XX.X
                    # refextract/X.XX.X-timestamp-err-repnum-title-URL-misc
                    # remake the "a" tag for new number of "m" tags
                    if a_tag.search(ref_lines[i]):
                        data = a_tag.search(ref_lines[i]).group(1)
                        words1 = data.split()
                        words2 = words1[-1].split('-')
                        old_m = int(words2[-1])
                        words2[-1] = str(old_m - m_restricted)
                        data1 = '-'.join(words2)
                        words1[-1] = data1
                        new_data = ' '.join(words1)
                        ref_lines[i] = '      <subfield code="a">' + new_data + '</subfield>'
                        break

    # Right-strip every line and drop the ones left empty
    kept_lines = [l for l in [rec.rstrip() for rec in ref_lines] if l]
    new_out = '\n'.join(kept_lines)

    # Compare line counts with line counts: len(new_out) would be the
    # number of characters of the joined string, not a number of lines.
    if len(reference_lines) != len(kept_lines):
        write_message("  * filter results: unfilter references line length is %d and filtered length is %d" \
              % (len(reference_lines), len(kept_lines)), verbose=2)

    return new_out
Exemplo n.º 26
0
def write_references(config, records):
    """Write the records in MARCXML.

    The output goes to the file named by ``config.xmlfile`` when it is set,
    otherwise to ``sys.stdout``. When writing to a file, the value of every
    '999C5m' subfield is first truncated to 2048 characters.

    On an IOError while writing, a message is sent to stderr through
    ``write_message`` and ``halt`` is called with exit code 1.

    @param config: object with an ``xmlfile`` attribute (output path or a
     false value for stdout).
    @param records: records passed to ``print_records``; each must offer
     ``find_subfields``.
    """
    if config.xmlfile:
        ofilehdl = open(config.xmlfile, 'w')
        for rec in records:
            for subfield in rec.find_subfields('999C5m'):
                if len(subfield.value) > 2048:
                    subfield.value = subfield.value[:2048]
    else:
        ofilehdl = sys.stdout

    try:
        xml = print_records(records)
        print >>ofilehdl, xml
        ofilehdl.flush()
    except IOError as err:
        write_message("%s\n%s\n" % (config.xmlfile, err),
                          sys.stderr, verbose=0)
        halt(err=IOError, msg="Error: Unable to write to '%s'"
                 % config.xmlfile, exit_code=1)
    finally:
        # Release the handle we opened ourselves; never close stdout.
        if ofilehdl is not sys.stdout:
            ofilehdl.close()
Exemplo n.º 27
0
def write_references(config, records):
    """Write the records in MARCXML.

    When ``config.xmlfile`` is set, the XML is written to that file and each
    '999C5m' subfield value longer than 2048 characters is cut to 2048
    beforehand; otherwise the XML is written to ``sys.stdout``.

    An IOError raised while writing is reported on stderr via
    ``write_message`` and then ``halt`` is called with exit code 1.

    @param config: object exposing ``xmlfile`` (path of the output file, or
     a false value to use stdout).
    @param records: the records handed to ``print_records``.
    """
    to_file = bool(config.xmlfile)
    if to_file:
        ofilehdl = open(config.xmlfile, 'w')
    else:
        ofilehdl = sys.stdout

    if to_file:
        for rec in records:
            for subfield in rec.find_subfields('999C5m'):
                if len(subfield.value) > 2048:
                    subfield.value = subfield.value[:2048]

    try:
        xml = print_records(records)
        print >> ofilehdl, xml
        ofilehdl.flush()
    except IOError as err:
        write_message("%s\n%s\n" % (config.xmlfile, err),
                      sys.stderr,
                      verbose=0)
        halt(err=IOError,
             msg="Error: Unable to write to '%s'" % config.xmlfile,
             exit_code=1)
    finally:
        # The file was opened here, so it is closed here as well
        # (previously the handle was leaked). stdout is left open.
        if to_file:
            ofilehdl.close()
Exemplo n.º 28
0
def build_special_journals_kb(fpath):
    """Load the special journals database from a file.

    Special journals are journals that have a volume which is not unique
    among different years. To keep the volume unique we are adding the year
    before the volume.

    Lines starting with '#' and lines that are empty once stripped are
    ignored; every other line is stripped and stored.

    @param fpath: (string) path of the knowledge base file.
    @return: (set) of the journal names read from the file.
    """
    write_message('Loading special journals kb from %s' % fpath, verbose=3)
    journals = set()
    kb_file = open(fpath, "r")
    try:
        for raw in kb_file:
            name = raw.strip()
            # The comment test applies to the raw line: an indented '#'
            # is not treated as a comment.
            if raw.startswith('#') or not name:
                continue
            journals.add(name)
    finally:
        kb_file.close()
        write_message('Loaded special journals kb', verbose=3)

    return journals
Exemplo n.º 29
0
def build_special_journals_kb(fpath):
    """Read the special journals knowledge base into a set.

    Special journals are journals whose volume is not unique among
    different years; to keep the volume unique the year is added before
    the volume.

    @param fpath: (string) path of the file to read. Lines beginning with
     '#' and blank lines are skipped.
    @return: (set) the stripped, non-empty, non-comment lines of the file.
    """
    entries = set()
    write_message('Loading special journals kb from %s' % fpath, verbose=3)
    handle = open(fpath, "r")
    try:
        # Drop comment lines first (tested on the unstripped line), then
        # strip and keep whatever is not empty.
        uncommented = (row for row in handle if not row.startswith('#'))
        stripped = (row.strip() for row in uncommented)
        entries.update(item for item in stripped if item)
    finally:
        handle.close()
        write_message('Loaded special journals kb', verbose=3)

    return entries
Exemplo n.º 30
0
def get_reference_section_beginning(fulltext):
    """Find where the references section starts in the document.

    Four finders are tried in order, each one only if the previous returned
    None: ``find_reference_section``, then the "no title" variants via
    brackets, via dots and via numbers. The 1-based position of the finder
    that succeeded is stored under the 'how_found_start' key of its result.

    @param fulltext: the document text, passed unchanged to the finders.
    @return: the result of the first successful finder (with
     'how_found_start' set), or None when all of them returned None.
    """
    # Ordered from the strongest to the weakest set of patterns.
    finders = (
        find_reference_section,
        find_reference_section_no_title_via_brackets,
        find_reference_section_no_title_via_dots,
        find_reference_section_no_title_via_numbers,
    )

    sect_start = None
    for how_found, finder in enumerate(finders, 1):
        sect_start = finder(fulltext)
        if sect_start is not None:
            sect_start['how_found_start'] = how_found
            break

    if not sect_start:
        write_message('* could not find references section', verbose=3)
        return sect_start

    write_message('* title %r' % sect_start['title_string'], verbose=3)
    write_message('* marker %r' % sect_start['marker'], verbose=3)
    write_message('* title_marker_same_line %s' \
        % sect_start['title_marker_same_line'], verbose=3)
    return sect_start
Exemplo n.º 31
0
def get_reference_section_beginning(fulltext):
    """Locate the beginning of the references section.

    The search falls back through progressively weaker strategies; the
    strategy that produced the result is recorded in 'how_found_start':
      1 - ``find_reference_section``
      2 - ``find_reference_section_no_title_via_brackets``
      3 - ``find_reference_section_no_title_via_dots``
      4 - ``find_reference_section_no_title_via_numbers``

    @param fulltext: the document text given to each finder.
    @return: the first non-None finder result, with 'how_found_start'
     filled in, or None if every finder returned None.
    """
    ## Find start of refs section:
    how_found = 1
    sect_start = find_reference_section(fulltext)

    if sect_start is None:
        ## No references found - try with no title option
        how_found = 2
        sect_start = find_reference_section_no_title_via_brackets(fulltext)

    if sect_start is None:
        ## Still nothing - try with no title option (with weaker patterns..)
        how_found = 3
        sect_start = find_reference_section_no_title_via_dots(fulltext)

    if sect_start is None:
        ## Still nothing - try with even weaker patterns
        how_found = 4
        sect_start = find_reference_section_no_title_via_numbers(fulltext)

    if sect_start is not None:
        sect_start['how_found_start'] = how_found

    if sect_start:
        for label, key in (('* title %r', 'title_string'),
                           ('* marker %r', 'marker'),
                           ('* title_marker_same_line %s',
                            'title_marker_same_line')):
            write_message(label % sect_start[key], verbose=3)
    else:
        write_message('* could not find references section', verbose=3)
    return sect_start
Exemplo n.º 32
0
def build_journals_kb(knowledgebase):
    """Given the path to a knowledge base file, read in the contents
       of that file into a dictionary of search->replace word phrases.
       The search phrases are compiled into a regex pattern object.
       The knowledge base file should consist only of lines that take
       the following format:
         seek-term       ---   replace-term
       (i.e. a seek phrase on the left hand side, a replace phrase on
       the right hand side, with the two phrases being separated by 3
       hyphens.) E.g.:
         ASTRONOMY AND ASTROPHYSICS              ---Astron. Astrophys.

       The left-hand side term is a non-standard version of the title,
       whereas the right-hand side term is the standard version.
       If the KB file cannot be read from, or an unexpected line is
       encountered in the KB, an error
       message is output to standard error and execution is halted with
       an error-code 0.

       @param fpath: (string) the path to the knowledge base file.
       @return: (tuple) containing a list and a dictionary. The list
        contains compiled regex patterns used as search terms and will
        be used to force searching order to match that of the knowledge
        base.
        The dictionary contains the search->replace terms.  The keys of
        the dictionary are the compiled regex word phrases used for
        searching in the reference lines; The values in the dictionary are
        the replace terms for matches.
    """
    # Initialise vars:
    # dictionary of search and replace phrases from KB:
    kb = {}
    standardised_titles = {}
    seek_phrases = []
    # A dictionary of "replacement terms" (RHS) to be inserted into KB as
    # "seek terms" later, if they were not already explicitly added
    # by the KB:
    repl_terms = {}

    write_message('Processing journals kb', verbose=3)
    for seek_phrase, repl in knowledgebase:
        # We match on a simplified line, thus dots are replaced
        # with spaces
        seek_phrase = seek_phrase.replace('.', ' ').upper()

        # good KB line
        # Add the 'replacement term' into the dictionary of
        # replacement terms:
        repl_terms[repl] = None

        # add the phrase from the KB if the 'seek' phrase is longer
        # compile the seek phrase into a pattern:
        seek_ptn = re.compile(ur'(?<!\w)(%s)\W' % re.escape(seek_phrase),
                              re.UNICODE)

        kb[seek_phrase] = seek_ptn
        standardised_titles[seek_phrase] = repl
        seek_phrases.append(seek_phrase)

    # Now, for every 'replacement term' found in the KB, if it is
    # not already in the KB as a "search term", add it:
    for repl_term in repl_terms.keys():
        raw_repl_phrase = repl_term.upper()
        raw_repl_phrase = re_punctuation.sub(u' ', raw_repl_phrase)
        raw_repl_phrase = \
             re_group_captured_multiple_space.sub(u' ', raw_repl_phrase)
        raw_repl_phrase = raw_repl_phrase.strip()
        if raw_repl_phrase not in kb:
            # The replace-phrase was not in the KB as a seek phrase
            # It should be added.
            pattern = ur'(?<!\/)\b(%s)[^A-Z0-9]' % re.escape(raw_repl_phrase)
            seek_ptn = re.compile(pattern, re.U)
            kb[raw_repl_phrase] = seek_ptn
            standardised_titles[raw_repl_phrase] = repl_term
            seek_phrases.append(raw_repl_phrase)

    # Sort the titles by string length (long - short)
    seek_phrases.sort(_cmp_bystrlen_reverse)

    write_message('Processed journals kb', verbose=3)

    # return the raw knowledge base:
    return kb, standardised_titles, seek_phrases
Exemplo n.º 33
0
def rebuild_reference_lines(ref_sectn, ref_line_marker_ptn):
    """Given a reference section, rebuild the reference lines. After translation
       from PDF to text, reference lines are often broken. This is because
       pdftotext doesn't know what is a wrapped-line and what is a genuine new
       line. As a result, the following 2 reference lines:
        [1] See http://invenio-software.org/ for more details.
        [2] Example, AN: private communication (1996).
       ...could be broken into the following 4 lines during translation from PDF
       to plaintext:
        [1] See http://invenio-software.org/ fo
        r more details.
        [2] Example, AN: private communica
        tion (1996).
       Such a situation could lead to a citation being separated across 'lines',
       meaning that it wouldn't be correctly recognised.
       This function tries to rebuild the reference lines. It uses the pattern
       used to recognise a reference line's numeration marker to indicate the
       start of a line. If no reference line numeration was recognised, it will
       simply join all lines together into one large reference line.
       @param ref_sectn: (list) of strings. The (potentially broken) reference
        lines.
       @param ref_line_marker_ptn: (string) - the pattern used to recognise a
        reference line's numeration marker.
       @return: (list) of strings - the rebuilt reference section. Each string
        in the list represents a complete reference line.
    """
    indentation_splitting = False

    # This should be moved the function detecting the pattern!
    if not ref_line_marker_ptn:
        if test_for_blank_lines_separating_reference_lines(ref_sectn):
            # Use blank lines to separate ref lines
            ref_line_marker_ptn = ur'^\s*$'
        else:
            # No ref line dividers
            # We are guessing this the format:
            # Reference1
            #      etc
            # Reference2
            #      etc
            # We split when there's no identation
            indentation_splitting = True
            ref_line_marker_ptn = ur'^[^\s]'

    write_message('* references separator %s' % ref_line_marker_ptn, verbose=2)
    p_ref_line_marker = re.compile(ref_line_marker_ptn, re.I|re.UNICODE)

    # Start from ref 1
    # Append each fixed reference line to rebuilt_references
    # and rebuild references as we go
    current_ref = 0
    rebuilt_references = []
    working_ref = []

    def prepare_ref(working_ref):
        working_ref = working_ref[:CFG_REFEXTRACT_MAX_LINES]
        working_line = ""
        for l in working_ref:
            working_line = join_lines(working_line, l.strip())
        working_line = working_line.rstrip()
        return wash_and_repair_reference_line(working_line)

    lower_case_start = re.compile(ur'[a-z]')
    continuing_line_markers = re.compile(ur'[,&-]$')

    for line in ref_sectn:
        # Can't find a good way to distinguish between
        # pagination and the page number of a journal numeration that
        # happens to be alone in a new line
        # m = match_pagination(line)
        # if m and current_ref and current_ref != m + 1:
        #     continue

        # Try to find the marker for the reference line
        m_ref_line_marker = p_ref_line_marker.search(line)

        if m_ref_line_marker:
            try:
                marknum = int(m_ref_line_marker.group('marknum'))
            except IndexError:
                marknum = None
            except ValueError:
                # If the mark is a unicode character category [Nd],
                # it is not always convertible to int by int()
                # We can't use its numerical value, but we still accept it
                # as numeration
                pass

            new_line_detected = False
            if marknum is None or current_ref + 1 == marknum:
                new_line_detected = True
            if indentation_splitting:
                if lower_case_start.match(line.strip()):
                    new_line_detected = False
                if working_ref and \
                       continuing_line_markers.search(working_ref[-1].strip()):
                    new_line_detected = False

            if new_line_detected:
                # Reference line marker found! : Append this reference to the
                # list of fixed references and reset the working_line to 'blank'
                start = m_ref_line_marker.start()
                if line[:start]:
                    # If it's not a blank line to separate refs
                    # Only append from the start of the marker
                    # For this case:
                    # [1] hello
                    # hello2 [2] foo
                    working_ref.append(line[:start])

                # Append current working line to the refs list
                if working_ref:
                    rebuilt_references.append(prepare_ref(working_ref))

                current_ref = marknum
                working_ref = []
                if line[start:]:
                    working_ref.append(line[start:])

            else:
                # Our marker does not match the counting
                # Either we missed one, the author missed one or
                # it is not a line marker
                # For now we assume it is not line marker
                working_ref.append(line)

        elif line:
            # Continuation of line
            working_ref.append(line)

    if working_ref:
        # Append last line
        rebuilt_references.append(prepare_ref(working_ref))

    return rebuilt_references
Exemplo n.º 34
0
def build_journals_kb(knowledgebase):
    """Given the path to a knowledge base file, read in the contents
       of that file into a dictionary of search->replace word phrases.
       The search phrases are compiled into a regex pattern object.
       The knowledge base file should consist only of lines that take
       the following format:
         seek-term       ---   replace-term
       (i.e. a seek phrase on the left hand side, a replace phrase on
       the right hand side, with the two phrases being separated by 3
       hyphens.) E.g.:
         ASTRONOMY AND ASTROPHYSICS              ---Astron. Astrophys.

       The left-hand side term is a non-standard version of the title,
       whereas the right-hand side term is the standard version.
       If the KB file cannot be read from, or an unexpected line is
       encountered in the KB, an error
       message is output to standard error and execution is halted with
       an error-code 0.

       @param fpath: (string) the path to the knowledge base file.
       @return: (tuple) containing a list and a dictionary. The list
        contains compiled regex patterns used as search terms and will
        be used to force searching order to match that of the knowledge
        base.
        The dictionary contains the search->replace terms.  The keys of
        the dictionary are the compiled regex word phrases used for
        searching in the reference lines; The values in the dictionary are
        the replace terms for matches.
    """
    # Initialise vars:
    # dictionary of search and replace phrases from KB:
    kb = {}
    standardised_titles = {}
    seek_phrases = []
    # A dictionary of "replacement terms" (RHS) to be inserted into KB as
    # "seek terms" later, if they were not already explicitly added
    # by the KB:
    repl_terms = {}

    write_message('Processing journals kb', verbose=3)
    for seek_phrase, repl in knowledgebase:
        # We match on a simplified line, thus dots are replaced
        # with spaces
        seek_phrase = seek_phrase.replace('.', ' ').upper()

        # good KB line
        # Add the 'replacement term' into the dictionary of
        # replacement terms:
        repl_terms[repl] = None

        # add the phrase from the KB if the 'seek' phrase is longer
        # compile the seek phrase into a pattern:
        seek_ptn = re.compile(ur'(?<!\w)(%s)\W' % re.escape(seek_phrase),
                              re.UNICODE)

        kb[seek_phrase] = seek_ptn
        standardised_titles[seek_phrase] = repl
        seek_phrases.append(seek_phrase)

    # Now, for every 'replacement term' found in the KB, if it is
    # not already in the KB as a "search term", add it:
    for repl_term in repl_terms.keys():
        raw_repl_phrase = repl_term.upper()
        raw_repl_phrase = re_punctuation.sub(u' ', raw_repl_phrase)
        raw_repl_phrase = \
             re_group_captured_multiple_space.sub(u' ', raw_repl_phrase)
        raw_repl_phrase = raw_repl_phrase.strip()
        if raw_repl_phrase not in kb:
            # The replace-phrase was not in the KB as a seek phrase
            # It should be added.
            pattern = ur'(?<!\/)\b(%s)[^A-Z0-9]' % re.escape(raw_repl_phrase)
            seek_ptn = re.compile(pattern, re.U)
            kb[raw_repl_phrase] = seek_ptn
            standardised_titles[raw_repl_phrase] = repl_term
            seek_phrases.append(raw_repl_phrase)

    # Sort the titles by string length (long - short)
    seek_phrases.sort(_cmp_bystrlen_reverse)

    write_message('Processed journals kb', verbose=3)

    # return the raw knowledge base:
    return kb, standardised_titles, seek_phrases
Exemplo n.º 35
0
def rebuild_reference_lines(ref_sectn, ref_line_marker_ptn):
    """Given a reference section, rebuild the reference lines. After translation
       from PDF to text, reference lines are often broken. This is because
       pdftotext doesn't know what is a wrapped-line and what is a genuine new
       line. As a result, the following 2 reference lines:
        [1] See http://invenio-software.org/ for more details.
        [2] Example, AN: private communication (1996).
       ...could be broken into the following 4 lines during translation from PDF
       to plaintext:
        [1] See http://invenio-software.org/ fo
        r more details.
        [2] Example, AN: private communica
        tion (1996).
       Such a situation could lead to a citation being separated across 'lines',
       meaning that it wouldn't be correctly recognised.
       This function tries to rebuild the reference lines. It uses the pattern
       used to recognise a reference line's numeration marker to indicate the
       start of a line. If no reference line numeration was recognised, it will
       simply join all lines together into one large reference line.
       @param ref_sectn: (list) of strings. The (potentially broken) reference
        lines.
       @param ref_line_marker_ptn: (string) - the pattern used to recognise a
        reference line's numeration marker.
       @return: (list) of strings - the rebuilt reference section. Each string
        in the list represents a complete reference line.
    """
    indentation_splitting = False

    # This should be moved the function detecting the pattern!
    if not ref_line_marker_ptn:
        if test_for_blank_lines_separating_reference_lines(ref_sectn):
            # Use blank lines to separate ref lines
            ref_line_marker_ptn = ur'^\s*$'
        else:
            # No ref line dividers
            # We are guessing this the format:
            # Reference1
            #      etc
            # Reference2
            #      etc
            # We split when there's no identation
            indentation_splitting = True
            ref_line_marker_ptn = ur'^[^\s]'

    write_message('* references separator %s' % ref_line_marker_ptn, verbose=2)
    p_ref_line_marker = re.compile(ref_line_marker_ptn, re.I | re.UNICODE)

    # Start from ref 1
    # Append each fixed reference line to rebuilt_references
    # and rebuild references as we go
    current_ref = 0
    rebuilt_references = []
    working_ref = []

    def prepare_ref(working_ref):
        working_ref = working_ref[:CFG_REFEXTRACT_MAX_LINES]
        working_line = ""
        for l in working_ref:
            working_line = join_lines(working_line, l.strip())
        working_line = working_line.rstrip()
        return wash_and_repair_reference_line(working_line)

    lower_case_start = re.compile(ur'[a-z]')
    continuing_line_markers = re.compile(ur'[,&-]$')

    for line in ref_sectn:
        # Can't find a good way to distinguish between
        # pagination and the page number of a journal numeration that
        # happens to be alone in a new line
        # m = match_pagination(line)
        # if m and current_ref and current_ref != m + 1:
        #     continue

        # Try to find the marker for the reference line
        m_ref_line_marker = p_ref_line_marker.search(line)

        if m_ref_line_marker:
            try:
                marknum = int(m_ref_line_marker.group('marknum'))
            except IndexError:
                marknum = None
            except ValueError:
                # If the mark is a unicode character category [Nd],
                # it is not always convertible to int by int()
                # We can't use its numerical value, but we still accept it
                # as numeration
                pass

            new_line_detected = False
            if marknum is None or current_ref + 1 == marknum:
                new_line_detected = True
            if indentation_splitting:
                if lower_case_start.match(line.strip()):
                    new_line_detected = False
                if working_ref and \
                       continuing_line_markers.search(working_ref[-1].strip()):
                    new_line_detected = False

            if new_line_detected:
                # Reference line marker found! : Append this reference to the
                # list of fixed references and reset the working_line to 'blank'
                start = m_ref_line_marker.start()
                if line[:start]:
                    # If it's not a blank line to separate refs
                    # Only append from the start of the marker
                    # For this case:
                    # [1] hello
                    # hello2 [2] foo
                    working_ref.append(line[:start])

                # Append current working line to the refs list
                if working_ref:
                    rebuilt_references.append(prepare_ref(working_ref))

                current_ref = marknum
                working_ref = []
                if line[start:]:
                    working_ref.append(line[start:])

            else:
                # Our marker does not match the counting
                # Either we missed one, the author missed one or
                # it is not a line marker
                # For now we assume it is not line marker
                working_ref.append(line)

        elif line:
            # Continuation of line
            working_ref.append(line)

    if working_ref:
        # Append last line
        rebuilt_references.append(prepare_ref(working_ref))

    return rebuilt_references
Exemplo n.º 36
0
def build_reportnum_kb(fpath):
    """Given the path to a knowledge base file containing the details
       of institutes and the patterns that their preprint report
       numbering schemes take, create a dictionary of regexp search
       patterns to recognise these preprint references in reference
       lines, and a dictionary of replacements for non-standard preprint
       categories in these references.

       The knowledge base file should consist only of lines that take one
       of the following 3 formats:

         #####Institute Name####

       (the name of the institute to which the preprint reference patterns
        belong, e.g. '#####LANL#####', surrounded by 5 # on either side.)

         <pattern>

       (numeration patterns for an institute's preprints, surrounded by
        < and >.)

         seek-term       ---   replace-term
       (i.e. a seek phrase on the left hand side, a replace phrase on the
       right hand side, with the two phrases being separated by 3 hyphens.)
       E.g.:
         ASTRO PH        ---astro-ph

       The left-hand side term is a non-standard version of the preprint
       reference category; the right-hand side term is the standard version.

       If the KB file cannot be read from, or an unexpected line is
       encountered in the KB, an error message is output to standard error
       and execution is halted with an error-code 0.

       @param fpath: (string) the path to the knowledge base file.
       @return: (tuple) containing 2 dictionaries. The first contains regexp
        search patterns used to identify preprint references in a line. This
        dictionary is keyed by a tuple containing the line number of the
        pattern in the KB and the non-standard category string.
        E.g.: (3, 'ASTRO PH').
        The second dictionary contains the standardised category string,
        and is keyed by the non-standard category string. E.g.: 'astro-ph'.
    """
    def _add_institute_preprint_patterns(preprint_classifications,
                                         preprint_numeration_ptns,
                                         preprint_reference_search_regexp_patterns,
                                         standardised_preprint_reference_categories,
                                         kb_line_num):
        """For a list of preprint category strings and preprint numeration
           patterns for a given institute, create the regexp patterns for
           each of the preprint types.  Add the regexp patterns to the
           dictionary of search patterns
           (preprint_reference_search_regexp_patterns), keyed by the line
           number of the institute in the KB, and the preprint category
           search string.  Also add the standardised preprint category string
           to another dictionary, keyed by the line number of its position
           in the KB and its non-standardised version.
           @param preprint_classifications: (list) of tuples whereby each tuple
            contains a preprint category search string and the line number of
            the name of institute to which it belongs in the KB.
            E.g.: (45, 'ASTRO PH').
           @param preprint_numeration_ptns: (list) of preprint reference
            numeration search patterns (strings)
           @param preprint_reference_search_regexp_patterns: (dictionary) of
            regexp patterns used to search in document lines.
           @param standardised_preprint_reference_categories: (dictionary)
            containing the standardised strings for preprint reference
            categories. (E.g. 'astro-ph'.)
           @param kb_line_num: (integer) - the line number int the KB at
            which a given institute name was found.
           @return: None
        """
        if preprint_classifications and preprint_numeration_ptns:
            # the previous institute had both numeration styles and categories
            # for preprint references.
            # build regexps and add them for this institute:
            # First, order the numeration styles by line-length, and build a
            # grouped regexp for recognising numeration:
            ordered_patterns = \
              order_reportnum_patterns_bylen(preprint_numeration_ptns)
            # create a grouped regexp for numeration part of
            # preprint reference:
            numeration_regexp = \
              create_institute_numeration_group_regexp_pattern(ordered_patterns)

            # for each "classification" part of preprint references, create a
            # complete regex:
            # will be in the style "(categ)-(numatn1|numatn2|numatn3|...)"
            for classification in preprint_classifications:
                search_pattern_str = ur'(?:^|[^a-zA-Z0-9\/\.\-])([\[\(]?(?P<categ>' \
                                     + classification[0].strip() + u')' \
                                     + numeration_regexp + u'[\]\)]?)'

                re_search_pattern = re.compile(search_pattern_str,
                                                 re.UNICODE)
                preprint_reference_search_regexp_patterns[(kb_line_num,
                                                          classification[0])] =\
                                                          re_search_pattern
                standardised_preprint_reference_categories[(kb_line_num,
                                                          classification[0])] =\
                                                          classification[1]

    preprint_reference_search_regexp_patterns = {}  # a dictionary of patterns
                                                     # used to recognise
                                                     # categories of preprints
                                                     # as used by various
                                                     # institutes
    standardised_preprint_reference_categories = {}  # dictionary of
                                                     # standardised category
                                                     # strings for preprint cats
    current_institute_preprint_classifications = []  # list of tuples containing
                                                     # preprint categories in
                                                     # their raw & standardised
                                                     # forms, as read from KB
    current_institute_numerations = []               # list of preprint
                                                     # numeration patterns, as
                                                     # read from the KB

    # pattern to recognise an institute name line in the KB
    re_institute_name = re.compile(ur'^\*{5}\s*(.+)\s*\*{5}$', re.UNICODE)

    # pattern to recognise an institute preprint categ line in the KB
    re_preprint_classification = \
                re.compile(ur'^\s*(\w.*)\s*---\s*(\w.*)\s*$', re.UNICODE)

    # pattern to recognise a preprint numeration-style line in KB
    re_numeration_pattern = re.compile(ur'^\<(.+)\>$', re.UNICODE)

    kb_line_num = 0    # when making the dictionary of patterns, which is
                       # keyed by the category search string, this counter
                       # will ensure that patterns in the dictionary are not
                       # overwritten if 2 institutes have the same category
                       # styles.

    try:
        if isinstance(fpath, six.string_types):
            write_message('Loading reports kb from %s' % fpath, verbose=3)
            fh = open(fpath, "r")
            fpath_needs_closing = True
        else:
            fpath_needs_closing = False
            fh = fpath

        for rawline in fh:
            if rawline.startswith('#'):
                continue

            kb_line_num += 1
            try:
                rawline = rawline.decode("utf-8")
            except UnicodeError:
                write_message("*** Unicode problems in %s for line %e"
                                 % (fpath, kb_line_num), sys.stderr, verbose=0)
                raise UnicodeError("Error: Unable to parse report number kb (line: %s)" % str(kb_line_num))

            m_institute_name = re_institute_name.search(rawline)
            if m_institute_name:
                # This KB line is the name of an institute
                # append the last institute's pattern list to the list of
                # institutes:
                _add_institute_preprint_patterns(current_institute_preprint_classifications,
                                                 current_institute_numerations,
                                                 preprint_reference_search_regexp_patterns,
                                                 standardised_preprint_reference_categories,
                                                 kb_line_num)

                # Now start a new dictionary to contain the search patterns
                # for this institute:
                current_institute_preprint_classifications = []
                current_institute_numerations = []
                # move on to the next line
                continue

            m_preprint_classification = \
                                     re_preprint_classification.search(rawline)
            if m_preprint_classification:
                # This KB line contains a preprint classification for
                # the current institute
                try:
                    current_institute_preprint_classifications.append((m_preprint_classification.group(1),
                                                                      m_preprint_classification.group(2)))
                except (AttributeError, NameError):
                    # didn't match this line correctly - skip it
                    pass
                # move on to the next line
                continue

            m_numeration_pattern = re_numeration_pattern.search(rawline)
            if m_numeration_pattern:
                # This KB line contains a preprint item numeration pattern
                # for the current institute
                try:
                    current_institute_numerations.append(m_numeration_pattern.group(1))
                except (AttributeError, NameError):
                    # didn't match the numeration pattern correctly - skip it
                    pass
                continue

        _add_institute_preprint_patterns(current_institute_preprint_classifications,
                                         current_institute_numerations,
                                         preprint_reference_search_regexp_patterns,
                                         standardised_preprint_reference_categories,
                                         kb_line_num)
        if fpath_needs_closing:
            write_message('Loaded reports kb', verbose=3)
            fh.close()
    except IOError:
        # problem opening KB for reading, or problem while reading from it:
        emsg = """Error: Could not build knowledge base containing """ \
               """institute preprint referencing patterns - failed """ \
               """to read from KB %(kb)s.""" \
               % {'kb' : fpath}
        write_message(emsg, sys.stderr, verbose=0)
        raise IOError("Error: Unable to open report number kb '%s'" % fpath)

    # return the preprint reference patterns and the replacement strings
    # for non-standard categ-strings:
    return (preprint_reference_search_regexp_patterns,
            standardised_preprint_reference_categories)
Exemplo n.º 37
0
def rebuild_reference_lines(ref_sectn, ref_line_marker_ptn):
    """Given a reference section, rebuild the reference lines. After translation
       from PDF to text, reference lines are often broken. This is because
       pdftotext doesn't know what is a wrapped-line and what is a genuine new
       line. As a result, the following 2 reference lines:
        [1] See http://invenio-software.org/ for more details.
        [2] Example, AN: private communication (1996).
       ...could be broken into the following 4 lines during translation from PDF
       to plaintext:
        [1] See http://invenio-software.org/ fo
        r more details.
        [2] Example, AN: private communica
        tion (1996).
       Such a situation could lead to a citation being separated across 'lines',
       meaning that it wouldn't be correctly recognised.
       This function tries to rebuild the reference lines. It uses the pattern
       used to recognise a reference line's numeration marker to indicate the
       start of a line. If no reference line numeration was recognised, it will
       simply join all lines together into one large reference line.
       @param ref_sectn: (list) of strings. The (potentially broken) reference
        lines.
       @param ref_line_marker_ptn: (string) - the pattern used to recognise a
        reference line's numeration marker.
       @return: (list) of strings - the rebuilt reference section. Each string
        in the list represents a complete reference line.
    """
    ## initialise some vars:
    rebuilt_references = []
    working_ref = []

    strip_before = True
    if ref_line_marker_ptn is None or \
           type(ref_line_marker_ptn) not in (str, unicode):
        if test_for_blank_lines_separating_reference_lines(ref_sectn):
            ## Use blank lines to separate ref lines
            ref_line_marker_ptn = ur'^\s*$'
        else:
            ## No ref line dividers: unmatchable pattern
            #ref_line_marker_ptn = ur'^A$^A$$'
            # I am adding a new format, hopefully
            # this case wasn't useful
            # Reference1
            #      etc
            # Reference2
            #      etc
            # We split when there's no identation
            ref_line_marker_ptn = ur'^[^\s]'
            strip_before = False

    write_message('* references separator %s' % ref_line_marker_ptn, verbose=2)
    p_ref_line_marker = re.compile(ref_line_marker_ptn, re.I|re.UNICODE)
    # Work backwards, starting from the last 'broken' reference line
    # Append each fixed reference line to rebuilt_references
    current_ref = None
    line_counter = 0

    def prepare_ref(working_ref):
        working_line = ""
        for l in reversed(working_ref):
            working_line = join_lines(working_line, l)
        working_line = working_line.rstrip()
        return wash_and_repair_reference_line(working_line)

    for line in reversed(ref_sectn):
        # Try to find the marker for the reference line
        if strip_before:
            current_string = line.strip()
            m_ref_line_marker = p_ref_line_marker.search(current_string)
        else:
            m_ref_line_marker = p_ref_line_marker.search(line)
            current_string = line.strip()

        if m_ref_line_marker and (not current_ref \
                or current_ref == int(m_ref_line_marker.group('marknum')) + 1):
            # Reference line marker found! : Append this reference to the
            # list of fixed references and reset the working_line to 'blank'
            if current_string != '':
                ## If it's not a blank line to separate refs
                working_ref.append(current_string)
            # Append current working line to the refs list
            if line_counter < CFG_REFEXTRACT_MAX_LINES:
                rebuilt_references.append(prepare_ref(working_ref))
            try:
                current_ref = int(m_ref_line_marker.group('marknum'))
            except IndexError:
                pass  # this line doesn't have numbering
            working_ref = []
            line_counter = 0
        elif current_string != u'':
            # Continuation of line
            working_ref.append(current_string)
            line_counter += 1

    if working_ref:
        # Append last line
        rebuilt_references.append(prepare_ref(working_ref))

    # A list of reference lines has been built backwards - reverse it:
    rebuilt_references.reverse()

    # Make sure mulitple markers within references are correctly
    # in place (compare current marker num with current marker num +1)
    # rebuilt_references = correct_rebuilt_lines(rebuilt_references, \
    #                                            p_ref_line_marker)

    # For each properly formated reference line, try to identify cases
    # where there is more than one citation in a single line. This is
    # done by looking for semi-colons, which could be used to
    # separate references
    return rebuilt_references
Exemplo n.º 38
0
def build_reportnum_kb(fpath):
    """Given the path to a knowledge base file containing the details
       of institutes and the patterns that their preprint report
       numbering schemes take, create a dictionary of regexp search
       patterns to recognise these preprint references in reference
       lines, and a dictionary of replacements for non-standard preprint
       categories in these references.

       The knowledge base file should consist only of lines that take one
       of the following 3 formats:

         #####Institute Name####

       (the name of the institute to which the preprint reference patterns
        belong, e.g. '#####LANL#####', surrounded by 5 # on either side.)

         <pattern>

       (numeration patterns for an institute's preprints, surrounded by
        < and >.)

         seek-term       ---   replace-term
       (i.e. a seek phrase on the left hand side, a replace phrase on the
       right hand side, with the two phrases being separated by 3 hyphens.)
       E.g.:
         ASTRO PH        ---astro-ph

       The left-hand side term is a non-standard version of the preprint
       reference category; the right-hand side term is the standard version.

       If the KB file cannot be read from, or an unexpected line is
       encountered in the KB, an error message is output to standard error
       and execution is halted with an error-code 0.

       @param fpath: (string) the path to the knowledge base file.
       @return: (tuple) containing 2 dictionaries. The first contains regexp
        search patterns used to identify preprint references in a line. This
        dictionary is keyed by a tuple containing the line number of the
        pattern in the KB and the non-standard category string.
        E.g.: (3, 'ASTRO PH').
        The second dictionary contains the standardised category string,
        and is keyed by the non-standard category string. E.g.: 'astro-ph'.
    """
    def _add_institute_preprint_patterns(
            preprint_classifications, preprint_numeration_ptns,
            preprint_reference_search_regexp_patterns,
            standardised_preprint_reference_categories, kb_line_num):
        """For a list of preprint category strings and preprint numeration
           patterns for a given institute, create the regexp patterns for
           each of the preprint types.  Add the regexp patterns to the
           dictionary of search patterns
           (preprint_reference_search_regexp_patterns), keyed by the line
           number of the institute in the KB, and the preprint category
           search string.  Also add the standardised preprint category string
           to another dictionary, keyed by the line number of its position
           in the KB and its non-standardised version.
           @param preprint_classifications: (list) of tuples whereby each tuple
            contains a preprint category search string and the line number of
            the name of institute to which it belongs in the KB.
            E.g.: (45, 'ASTRO PH').
           @param preprint_numeration_ptns: (list) of preprint reference
            numeration search patterns (strings)
           @param preprint_reference_search_regexp_patterns: (dictionary) of
            regexp patterns used to search in document lines.
           @param standardised_preprint_reference_categories: (dictionary)
            containing the standardised strings for preprint reference
            categories. (E.g. 'astro-ph'.)
           @param kb_line_num: (integer) - the line number int the KB at
            which a given institute name was found.
           @return: None
        """
        if preprint_classifications and preprint_numeration_ptns:
            # the previous institute had both numeration styles and categories
            # for preprint references.
            # build regexps and add them for this institute:
            # First, order the numeration styles by line-length, and build a
            # grouped regexp for recognising numeration:
            ordered_patterns = \
              order_reportnum_patterns_bylen(preprint_numeration_ptns)
            # create a grouped regexp for numeration part of
            # preprint reference:
            numeration_regexp = \
              create_institute_numeration_group_regexp_pattern(ordered_patterns)

            # for each "classification" part of preprint references, create a
            # complete regex:
            # will be in the style "(categ)-(numatn1|numatn2|numatn3|...)"
            for classification in preprint_classifications:
                search_pattern_str = ur'(?:^|[^a-zA-Z0-9\/\.\-])([\[\(]?(?P<categ>' \
                                     + classification[0].strip() + u')' \
                                     + numeration_regexp + u'[\]\)]?)'

                re_search_pattern = re.compile(search_pattern_str, re.UNICODE)
                preprint_reference_search_regexp_patterns[(kb_line_num,
                                                          classification[0])] =\
                                                          re_search_pattern
                standardised_preprint_reference_categories[(kb_line_num,
                                                          classification[0])] =\
                                                          classification[1]

    preprint_reference_search_regexp_patterns = {}  # a dictionary of patterns
    # used to recognise
    # categories of preprints
    # as used by various
    # institutes
    standardised_preprint_reference_categories = {}  # dictionary of
    # standardised category
    # strings for preprint cats
    current_institute_preprint_classifications = [
    ]  # list of tuples containing
    # preprint categories in
    # their raw & standardised
    # forms, as read from KB
    current_institute_numerations = []  # list of preprint
    # numeration patterns, as
    # read from the KB

    # pattern to recognise an institute name line in the KB
    re_institute_name = re.compile(ur'^\*{5}\s*(.+)\s*\*{5}$', re.UNICODE)

    # pattern to recognise an institute preprint categ line in the KB
    re_preprint_classification = \
                re.compile(ur'^\s*(\w.*)\s*---\s*(\w.*)\s*$', re.UNICODE)

    # pattern to recognise a preprint numeration-style line in KB
    re_numeration_pattern = re.compile(ur'^\<(.+)\>$', re.UNICODE)

    kb_line_num = 0  # when making the dictionary of patterns, which is
    # keyed by the category search string, this counter
    # will ensure that patterns in the dictionary are not
    # overwritten if 2 institutes have the same category
    # styles.

    try:
        if isinstance(fpath, six.string_types):
            write_message('Loading reports kb from %s' % fpath, verbose=3)
            fh = open(fpath, "r")
            fpath_needs_closing = True
        else:
            fpath_needs_closing = False
            fh = fpath

        for rawline in fh:
            if rawline.startswith('#'):
                continue

            kb_line_num += 1
            try:
                rawline = rawline.decode("utf-8")
            except UnicodeError:
                write_message("*** Unicode problems in %s for line %e" %
                              (fpath, kb_line_num),
                              sys.stderr,
                              verbose=0)
                raise UnicodeError(
                    "Error: Unable to parse report number kb (line: %s)" %
                    str(kb_line_num))

            m_institute_name = re_institute_name.search(rawline)
            if m_institute_name:
                # This KB line is the name of an institute
                # append the last institute's pattern list to the list of
                # institutes:
                _add_institute_preprint_patterns(
                    current_institute_preprint_classifications,
                    current_institute_numerations,
                    preprint_reference_search_regexp_patterns,
                    standardised_preprint_reference_categories, kb_line_num)

                # Now start a new dictionary to contain the search patterns
                # for this institute:
                current_institute_preprint_classifications = []
                current_institute_numerations = []
                # move on to the next line
                continue

            m_preprint_classification = \
                                     re_preprint_classification.search(rawline)
            if m_preprint_classification:
                # This KB line contains a preprint classification for
                # the current institute
                try:
                    current_institute_preprint_classifications.append(
                        (m_preprint_classification.group(1),
                         m_preprint_classification.group(2)))
                except (AttributeError, NameError):
                    # didn't match this line correctly - skip it
                    pass
                # move on to the next line
                continue

            m_numeration_pattern = re_numeration_pattern.search(rawline)
            if m_numeration_pattern:
                # This KB line contains a preprint item numeration pattern
                # for the current institute
                try:
                    current_institute_numerations.append(
                        m_numeration_pattern.group(1))
                except (AttributeError, NameError):
                    # didn't match the numeration pattern correctly - skip it
                    pass
                continue

        _add_institute_preprint_patterns(
            current_institute_preprint_classifications,
            current_institute_numerations,
            preprint_reference_search_regexp_patterns,
            standardised_preprint_reference_categories, kb_line_num)
        if fpath_needs_closing:
            write_message('Loaded reports kb', verbose=3)
            fh.close()
    except IOError:
        # problem opening KB for reading, or problem while reading from it:
        emsg = """Error: Could not build knowledge base containing """ \
               """institute preprint referencing patterns - failed """ \
               """to read from KB %(kb)s.""" \
               % {'kb' : fpath}
        write_message(emsg, sys.stderr, verbose=0)
        raise IOError("Error: Unable to open report number kb '%s'" % fpath)

    # return the preprint reference patterns and the replacement strings
    # for non-standard categ-strings:
    return (preprint_reference_search_regexp_patterns,
            standardised_preprint_reference_categories)