def write_references(config, xml_references): """Write marcxml to file * Output xml header * Output collection opening tag * Output xml for each record * Output collection closing tag """ if config.xmlfile: ofilehdl = open(config.xmlfile, 'w') else: ofilehdl = sys.stdout try: print >> ofilehdl, CFG_REFEXTRACT_XML_VERSION.encode("utf-8") print >> ofilehdl, CFG_REFEXTRACT_XML_COLLECTION_OPEN.encode("utf-8") for out in xml_references: print >> ofilehdl, out.encode("utf-8") print >> ofilehdl, CFG_REFEXTRACT_XML_COLLECTION_CLOSE.encode("utf-8") ofilehdl.flush() except IOError, err: write_message("%s\n%s\n" % (config.xmlfile, err), \ sys.stderr, verbose=0) halt(err=IOError, msg="Error: Unable to write to '%s'" \ % config.xmlfile, exit_code=1)
def write_references(config, xml_references): """Write marcxml to file * Output xml header * Output collection opening tag * Output xml for each record * Output collection closing tag """ if config.xmlfile: ofilehdl = open(config.xmlfile, 'w') else: ofilehdl = sys.stdout try: print >>ofilehdl, CFG_REFEXTRACT_XML_VERSION.encode("utf-8") print >>ofilehdl, CFG_REFEXTRACT_XML_COLLECTION_OPEN.encode("utf-8") for out in xml_references: print >>ofilehdl, out.encode("utf-8") print >>ofilehdl, CFG_REFEXTRACT_XML_COLLECTION_CLOSE.encode("utf-8") ofilehdl.flush() except IOError, err: write_message("%s\n%s\n" % (config.xmlfile, err), \ sys.stderr, verbose=0) halt(err=IOError, msg="Error: Unable to write to '%s'" \ % config.xmlfile, exit_code=1)
def build_authors_kb(fpath):
    """Load the authors knowledge base as a list of replacement pairs.

    @param fpath: either the path (string) of the KB file, which is then
        opened and closed here, or an already available iterable of lines,
        which is left untouched for the caller to close.
    @return: (list) of ``(seek, repl)`` tuples, one for every line that
        re_kb_line matches once decoded from UTF-8. Lines starting with
        '#' are skipped.
    @raise IOError: when fpath is a path that cannot be opened.
    """
    opened_here = isinstance(fpath, basestring)
    if opened_here:
        try:
            kb_lines = open(fpath, "r")
        except IOError:
            # problem opening KB for reading, or problem while reading from it:
            emsg = "Error: Could not build list of authors - failed " \
                   "to read from KB %(kb)s." % {'kb' : fpath}
            write_message(emsg, sys.stderr, verbose=0)
            raise IOError("Error: Unable to open authors kb '%s'" % fpath)
    else:
        kb_lines = fpath

    pairs = []
    try:
        for raw in kb_lines:
            if not raw.startswith('#'):
                # Extract the seek->replace terms from this KB line:
                found = re_kb_line.search(raw.decode('utf-8'))
                if found:
                    pairs.append((found.group('seek'), found.group('repl')))
    finally:
        if opened_here:
            kb_lines.close()

    return pairs
def convert_PDF_to_plaintext(fpath, keep_layout=False): """ Convert PDF to txt using pdftotext Take the path to a PDF file and run pdftotext for this file, capturing the output. @param fpath: (string) path to the PDF file @return: (list) of unicode strings (contents of the PDF file translated into plaintext; each string is a line in the document.) """ if not os.path.isfile(CFG_PATH_PDFTOTEXT): raise Exception('Missing pdftotext executable') if keep_layout: layout_option = "-layout" else: layout_option = "-raw" status = 0 doclines = [] # Pattern to check for lines with a leading page-break character. # If this pattern is matched, we want to split the page-break into # its own line because we rely upon this for trying to strip headers # and footers, and for some other pattern matching. p_break_in_line = re.compile(ur'^\s*\f(.+)$', re.UNICODE) # build pdftotext command: cmd_pdftotext = [ CFG_PATH_PDFTOTEXT, layout_option, "-q", "-enc", "UTF-8", fpath, "-" ] write_message("* %s" % ' '.join(cmd_pdftotext), verbose=2) # open pipe to pdftotext: pipe_pdftotext = subprocess.Popen(cmd_pdftotext, stdout=subprocess.PIPE) # read back results: for docline in pipe_pdftotext.stdout: unicodeline = docline.decode("utf-8") # Check for a page-break in this line: m_break_in_line = p_break_in_line.match(unicodeline) if m_break_in_line is None: # There was no page-break in this line. Just add the line: doclines.append(unicodeline) else: # If there was a page-break character in the same line as some # text, split it out into its own line so that we can later # try to find headers and footers: doclines.append(u"\f") doclines.append(m_break_in_line.group(1)) write_message("* convert_PDF_to_plaintext found: " \ "%s lines of text" % len(doclines), verbose=2) # finally, check conversion result not bad: if pdftotext_conversion_is_bad(doclines): status = 2 doclines = [] return (doclines, status)
def extract_one(config, pdf_path):
    """Extract references from one file

    @param config: object whose ``treat_as_reference_section`` attribute
        tells whether the file already is a plain-text reference section.
    @param pdf_path: (string) path of the file to process.
    @return: whatever extract_references_from_string() or
        extract_references_from_file() returns for this input.
    """
    if config.treat_as_reference_section:
        # The file is UTF-8 text holding only references: read it whole.
        # Close the handle explicitly instead of relying on garbage
        # collection to release it.
        fhandle = open(pdf_path)
        try:
            docbody = fhandle.read().decode('utf-8')
        finally:
            fhandle.close()
        record = extract_references_from_string(docbody)
    else:
        # The reference section has to be located inside the document.
        write_message("* processing pdffile: %s" % pdf_path, verbose=2)
        record = extract_references_from_file(pdf_path)

    return record
def parse_reference_line(ref_line, kbs, bad_titles_count=None):
    """Parse one reference line

    @param ref_line: a string representing a single reference bullet
    @param kbs: the knowledge bases, passed on to tag_reference_line(),
        handle_special_journals() and look_for_books().
    @param bad_titles_count: (dictionary) passed to, and returned updated
        by, tag_reference_line(). When omitted a fresh dictionary is used
        for this call.
    @return: (tuple) ``(splitted_citations, line_marker, counts,
        bad_titles_count)`` where splitted_citations is the list of parsed
        references (each a list of element objects).
    """
    # A ``{}`` default would be created once and shared by every call that
    # omits the argument, so counts from one call would carry into the next.
    if bad_titles_count is None:
        bad_titles_count = {}

    # Strip the 'marker' (e.g. [1]) from this reference line:
    (line_marker, ref_line) = remove_reference_line_marker(ref_line)
    # Find DOI sections in citation
    (ref_line, identified_dois) = identify_and_tag_DOI(ref_line)
    # Identify and replace URLs in the line:
    (ref_line, identified_urls) = identify_and_tag_URLs(ref_line)
    # Tag <cds.JOURNAL>, etc.
    tagged_line, bad_titles_count = tag_reference_line(ref_line,
                                                       kbs,
                                                       bad_titles_count)

    # Debug print tagging (authors, titles, volumes, etc.)
    write_message('* tags %r' % tagged_line, verbose=9)

    # Using the recorded information, create a MARC XML representation
    # of the rebuilt line:
    # At the same time, get stats of citations found in the reference line
    # (titles, urls, etc):
    citation_elements, line_marker, counts = \
        parse_tagged_reference_line(line_marker,
                                    tagged_line,
                                    identified_dois,
                                    identified_urls)

    # Transformations on elements
    citation_elements = split_volume_from_journal(citation_elements)
    citation_elements = format_volume(citation_elements)
    citation_elements = handle_special_journals(citation_elements, kbs)
    citation_elements = format_report_number(citation_elements)
    citation_elements = format_author_ed(citation_elements)
    citation_elements = look_for_books(citation_elements, kbs)
    citation_elements = format_hep(citation_elements)
    citation_elements = remove_b_for_nucl_phys(citation_elements)
    citation_elements = mangle_volume(citation_elements)

    # Split the reference in multiple ones if needed
    splitted_citations = split_citations(citation_elements)

    # Remove references with only misc text
    splitted_citations = remove_invalid_references(splitted_citations)

    # Find year
    splitted_citations = add_year_elements(splitted_citations)

    # For debugging purposes
    print_citations(splitted_citations, line_marker)

    return splitted_citations, line_marker, counts, bad_titles_count
def extract_one(config, pdf_path):
    """Extract references from one file

    @param config: object whose ``treat_as_reference_section`` attribute
        tells whether the file already is a plain-text reference section.
    @param pdf_path: (string) path of the file to process.
    @return: whatever extract_references_from_string_xml() or
        extract_references_from_file_xml() returns for this input.
    """
    if config.treat_as_reference_section:
        # The whole file is the reference section, stored as UTF-8 text.
        # Close the handle explicitly; it used to be left for the garbage
        # collector to release.
        fhandle = open(pdf_path)
        try:
            docbody = fhandle.read().decode('utf-8')
        finally:
            fhandle.close()
        out = extract_references_from_string_xml(docbody)
    else:
        # The reference section has to be located inside the document.
        write_message("* processing pdffile: %s" % pdf_path, verbose=2)
        out = extract_references_from_file_xml(pdf_path)

    return out
def task_run_core(recid, records, bibcatalog_system=None):
    """Extract the references of one record as part of a bibtask run.

    The 'overwrite', 'new' and 'create-ticket' task options drive the
    call to extract_one(). A missing full text or a record that is not
    safe for re-extraction is only logged, never raised.

    @param recid: identifier of the record to process.
    @param records: list handed to extract_one() to collect the result.
    @param bibcatalog_system: ticketing backend forwarded to extract_one().
    @return: None
    """
    setup_loggers(None, use_bibtask=True)
    try:
        overwrite = task_get_option('overwrite')
        # 'create-ticket' is only consulted when 'new' is not set.
        wants_ticket = task_get_option('new') \
            or task_get_option('create-ticket')
        extract_one(recid=recid,
                    records=records,
                    overwrite=overwrite,
                    create_a_ticket=wants_ticket,
                    bibcatalog_system=bibcatalog_system)
    except FullTextNotAvailable:
        write_message("No full text available for %s" % recid)
    except NotSafeForExtraction:
        write_message('Record not safe for re-extraction, skipping')
def load_kb(path, builder):
    """Load a knowledge base from an iterable, the database or a file.

    @param path: the source of the KB. An object without a ``startswith``
        attribute is handed to load_kb_from_iterable(); a string starting
        with 'kb:' names a database KB (the prefix is stripped); any other
        string is handed to load_kb_from_file().
    @param builder: forwarded unchanged to the selected loader.
    @return: the value returned by the selected loader.
    """
    try:
        has_prefix = path.startswith
    except AttributeError:
        # Not string-like: treat it as ready-made KB content.
        write_message("Loading kb from array", verbose=3)
        return load_kb_from_iterable(path, builder)

    write_message("Loading kb from %s" % path, verbose=3)
    db_prefix = 'kb:'
    if not has_prefix(db_prefix):
        return load_kb_from_file(path, builder)
    return load_kb_from_db(path[len(db_prefix):], builder)
def convert_PDF_to_plaintext(fpath, keep_layout=False): """ Convert PDF to txt using pdftotext Take the path to a PDF file and run pdftotext for this file, capturing the output. @param fpath: (string) path to the PDF file @return: (list) of unicode strings (contents of the PDF file translated into plaintext; each string is a line in the document.) """ if keep_layout: layout_option = "-layout" else: layout_option = "-raw" status = 0 doclines = [] # Pattern to check for lines with a leading page-break character. # If this pattern is matched, we want to split the page-break into # its own line because we rely upon this for trying to strip headers # and footers, and for some other pattern matching. p_break_in_line = re.compile(ur'^\s*\f(.+)$', re.UNICODE) # build pdftotext command: cmd_pdftotext = [CFG_PATH_PDFTOTEXT, layout_option, "-q", "-enc", "UTF-8", fpath, "-"] write_message("* %s" % ' '.join(cmd_pdftotext), verbose=2) # open pipe to pdftotext: pipe_pdftotext = subprocess.Popen(cmd_pdftotext, stdout=subprocess.PIPE) # read back results: for docline in pipe_pdftotext.stdout: unicodeline = docline.decode("utf-8") # Check for a page-break in this line: m_break_in_line = p_break_in_line.match(unicodeline) if m_break_in_line is None: # There was no page-break in this line. Just add the line: doclines.append(unicodeline) else: # If there was a page-break character in the same line as some # text, split it out into its own line so that we can later # try to find headers and footers: doclines.append(u"\f") doclines.append(m_break_in_line.group(1)) write_message("* convert_PDF_to_plaintext found: " \ "%s lines of text" % len(doclines), verbose=2) # finally, check conversion result not bad: if pdftotext_conversion_is_bad(doclines): status = 2 doclines = [] return (doclines, status)
def extract_references_from_fulltext(fulltext):
    """Pull the reference lines out of the full text of a document.

    Page-boundary lines are removed first, then the start and the end of
    the reference section are looked up and the span in between is handed
    to get_reference_lines(). A reference line is a string such as
    '[19] Wilson, A. Unpublished (1986).'

    @param fulltext: (list) of strings, whereby each string is a line of
        the document.
    @return: (tuple) ``(refs, status, how_found_start)``. refs is the list
        of extracted reference lines (empty on failure). status is 0 on
        success, 4 when no start of a reference section was found and 5
        when its end was not found. how_found_start is always 0: this
        function never changes it.
    """
    # Try to remove pagebreaks, headers, footers
    fulltext = remove_page_boundary_lines(fulltext)
    # How ref section found flag
    how_found_start = 0

    section = get_reference_section_beginning(fulltext)
    if section is None:
        # No references
        write_message(
            "* extract_references_from_fulltext: "
            "ref_sect_start is None", verbose=2)
        return [], 4, how_found_start

    # A reference section was found, however weak
    section_end = find_end_of_reference_section(fulltext,
                                                section["start_line"],
                                                section["marker"],
                                                section["marker_pattern"])
    if section_end is None:
        # No end to refs? Not safe to extract
        write_message(
            "* extract_references_from_fulltext: "
            "no end to refs!", verbose=2)
        return [], 5, how_found_start

    # Both boundaries are known: start extraction
    refs = get_reference_lines(fulltext,
                               section["start_line"],
                               section_end,
                               section["title_string"],
                               section["marker_pattern"],
                               section["title_marker_same_line"])
    return refs, 0, how_found_start
def main(config, args, run): """Main wrapper function for begin_extraction, and is always accessed in a standalone/independent way. (i.e. calling main will cause refextract to run in an independent mode)""" # Flag as running out of bibtask global RUNNING_INDEPENDENTLY RUNNING_INDEPENDENTLY = True if config.verbosity not in range(0, 10): usage("Error: Verbosity must be an integer between 0 and 10") setup_loggers(config.verbosity) if config.version: # version message and exit write_message(__revision__, verbose=0) halt(exit_code=0) if config.help: usage() if not args: # no files provided for reference extraction - error message usage("Error: No valid input file specified (file1 [file2 ...])") try: run(config, args) write_message("Extraction complete", verbose=2) except StandardError, e: # Remove extra '\n' write_message(traceback.format_exc()[:-1], verbose=9) write_message("Error: %s" % e, verbose=0) halt(exit_code=1)
def print_citations(splitted_citations, line_marker):
    """Log the parsed citations of one reference line at verbosity 9.

    @param splitted_citations: (list) of citations, each one an iterable
        of elements (mappings with at least a 'type' key).
    @param line_marker: the marker of the reference line, logged as is.
    @return: None
    """
    def debug(text):
        """Emit one line at the most verbose level."""
        write_message(text, verbose=9)

    debug('* splitted_citations')
    debug(' * line marker %s' % line_marker)
    for elements in splitted_citations:
        debug(" * elements")
        for element in elements:
            debug(' * %s %s' % (element['type'], repr(element)))
def make_collaborations_regex_str(): """ From the authors knowledge-base, construct a single regex holding the or'd possibilities of patterns which should be included in $h subfields. The word 'Collaboration' is also converted to 'Coll', and used in finding matches. Letter case is not considered during the search. @return: (string) The single pattern built from each line in the author knowledge base. """ def add_to_auth_list(s): """Strip the line, replace spaces with 'backslash s' and append 'the' to the start and 's' to the end. Add the prepared line to the list of extra kb authors.""" s = ur"(?:the\s)?" + s.strip().replace(u' ', ur'\s') + u"s?" auths.append(s) ## Build the 'or'd regular expression of the author lines in the author knowledge base auths = [] fpath = CFG_REFEXTRACT_KBS['collaborations'] try: fh = open(fpath, "r") except IOError: ## problem opening KB for reading, or problem while reading from it: emsg = """Error: Could not build knowledge base containing """ \ """author patterns - failed """ \ """to read from KB %(kb)s.\n""" \ % {'kb' : fpath} write_message(emsg, sys.stderr, verbose=0) raise IOError("Error: Unable to open collaborations kb '%s'" % fpath) for line_num, rawline in enumerate(fh): try: rawline = rawline.decode("utf-8") except UnicodeError: write_message("*** Unicode problems in %s for line %d" % (fpath, line_num), sys.stderr, verbose=0) raise UnicodeError("Error: Unable to parse collaboration kb (line: %s)" % str(line_num)) if rawline.strip() and rawline[0].strip() != '#': add_to_auth_list(rawline) ## Shorten collaboration to 'coll' if rawline.lower().endswith('collaboration\n'): coll_version = rawline[:rawline.lower().find(u'collaboration\n')] + ur"coll[\.\,]" add_to_auth_list(coll_version.strip().replace(' ', r'\s') + u"s?") author_match_re = "" if len(auths) > 0: author_match_re = u'|'.join([u"(?:" + a + u")" for a in auths]) author_match_re = ur"(?:(?:[\(\"]?(?P<extra_auth>" + \ author_match_re + ur")[\)\"]?[\,\.]?\s?(?:and\s)?)+)" 
return author_match_re
def extract_references_from_fulltext(fulltext):
    """Locate the reference section of a document and return its lines.

    After page-boundary lines are dropped, the beginning and the end of
    the reference section are searched for; when both are found the lines
    are collected through get_reference_lines(). Each returned string is
    one reference, e.g. '[19] Wilson, A. Unpublished (1986).'

    @param fulltext: (list) of strings, whereby each string is a line of
        the document.
    @return: (tuple) ``(refs, status, how_found_start)``: the list of
        reference lines (empty when extraction failed); a status of 0
        (success), 4 (no beginning found) or 5 (no end found); and
        how_found_start, which this function leaves at 0.
    """
    # Try to remove pagebreaks, headers, footers
    fulltext = remove_page_boundary_lines(fulltext)
    # How ref section found flag
    how_found_start = 0

    beginning = get_reference_section_beginning(fulltext)
    if beginning is None:
        # No references
        write_message("* extract_references_from_fulltext: "
                      "ref_sect_start is None", verbose=2)
        return [], 4, how_found_start

    # A reference section was found, however weak
    end_line = find_end_of_reference_section(fulltext,
                                             beginning["start_line"],
                                             beginning["marker"],
                                             beginning["marker_pattern"])
    if end_line is None:
        # No end to refs? Not safe to extract
        write_message("* extract_references_from_fulltext: "
                      "no end to refs!", verbose=2)
        return [], 5, how_found_start

    # The end of the reference section was found: start extraction
    lines = get_reference_lines(fulltext,
                                beginning["start_line"],
                                end_line,
                                beginning["title_string"],
                                beginning["marker_pattern"],
                                beginning["title_marker_same_line"])
    return lines, 0, how_found_start
def begin_extraction(config, files):
    """Run the extraction on every input file and write the results.

    Each file is passed to extract_one(); the collected results are then
    written in one go with write_references().

    @param config: options object forwarded to extract_one() and
        write_references().
    @param files: (list) of paths of the documents to process.
    @return: None
    """
    collected = []
    position = 0
    for path in files:
        position += 1
        # Announce the document extraction number
        write_message("Extracting %d of %d" % (position, len(files)),
                      verbose=1)
        collected.append(extract_one(config, path))

    # Write our references
    write_references(config, collected)
def filter_processed_references(out):
    """Apply filters to the reference lines found, to remove junk.

    The text is split into lines and passed to restrict_m_subfields().
    When that reports a non-zero count, the code below attempts to lower
    the last '-'-separated number of the 'a' subfield found after a
    ``999 C 6`` datafield by that count. Finally the lines are
    right-stripped, empty ones are dropped and the rest is re-joined.

    @param out: (string) newline-separated MARCXML lines.
    @return: (string) the filtered lines joined with newlines.
    """
    reference_lines = out.split('\n')

    # Removes too long and too short m tags
    m_restricted, ref_lines = restrict_m_subfields(reference_lines)

    if m_restricted:
        a_tag = re.compile('\<subfield code=\"a\"\>(.*?)\<\/subfield\>')
        for i in range(len(ref_lines)):
            # Checks to see that the datafield has the attribute ind2="6",
            # Before looking to see if the subfield code attribute is 'a'
            if ref_lines[i].find('<datafield tag="999" ind1="C" ind2="6">') != -1 \
                    and (len(ref_lines) - 1) > i:
                # For each line in this datafield element, try to find the
                # subfield whose code attribute is 'a'
                # NOTE(review): the loop runs only while the current line
                # CONTAINS '</datafield>'. The line matched just above is
                # the opening tag, so unless both tags share a line the
                # body is never entered; '== -1' may have been intended.
                # Confirm before relying on the rewrite below.
                while ref_lines[i].find('</datafield>') != -1 and (
                        len(ref_lines) - 1) > i:
                    # Advancing i here does not affect the outer for loop,
                    # which rebinds i on its next iteration.
                    i += 1
                    # <subfield code="a">Invenio/X.XX.X
                    # refextract/X.XX.X-timestamp-err-repnum-title-URL-misc
                    # remake the "a" tag for new number of "m" tags
                    if a_tag.search(ref_lines[i]):
                        data = a_tag.search(ref_lines[i]).group(1)
                        words1 = data.split()
                        # The last whitespace-separated word carries the
                        # '-'-separated counters; its final field is
                        # decremented by the number of restricted m tags.
                        words2 = words1[-1].split('-')
                        old_m = int(words2[-1])
                        words2[-1] = str(old_m - m_restricted)
                        data1 = '-'.join(words2)
                        words1[-1] = data1
                        new_data = ' '.join(words1)
                        ref_lines[
                            i] = ' <subfield code="a">' + new_data + '</subfield>'
                        break

    new_out = '\n'.join([l for l in [rec.rstrip() for rec in ref_lines] if l])

    # NOTE(review): reference_lines is a list of lines while new_out is a
    # single string, so this compares a line count with a character count.
    if len(reference_lines) != len(new_out):
        write_message(" * filter results: unfilter references line length is %d and filtered length is %d" \
                      % (len(reference_lines), len(new_out)), verbose=2)

    return new_out
def halt(err=StandardError, msg=None, exit_code=1):
    """Stop the extraction in the way that suits the running mode.

    In standalone mode (global RUNNING_INDEPENDENTLY is true) the message,
    if any, is written to stderr and the process exits. Otherwise the
    message, if any, is pushed to the Bibsched UI and an exception is
    raised so that Bibsched flags the task.

    @param err: (exception class) raised as ``err(msg)`` when not running
        standalone.
    @param msg: (string) brief error message, or None.
    @param exit_code: (integer) process exit code, used only when running
        standalone.
    """
    if not RUNNING_INDEPENDENTLY:
        if msg:
            # Update the status of refextract inside the Bibsched UI
            task_update_progress(msg.strip())
        raise err(msg)

    # Running independently: report and exit.
    if msg:
        write_message(msg, stream=sys.stderr, verbose=0)
    sys.exit(exit_code)
def extract_one(recid, records, overwrite=False, bibcatalog_system=None, create_a_ticket=False):
    """Extract the references of one record and collect the result.

    @param recid: identifier of the record to process.
    @param records: (list) the extracted record is appended to it.
    @param overwrite: when true the safety check is
        record_can_overwrite_refs(), otherwise record_can_extract_refs().
    @param bibcatalog_system: ticketing backend given to create_ticket().
    @param create_a_ticket: when true create_ticket() is called after a
        successful extraction.
    @raise NotSafeForExtraction: when the safety check fails.
    """
    announcement = "Extracting references for %s" % recid
    if overwrite:
        write_message("%s (overwrite)" % announcement)
        is_safe = record_can_overwrite_refs
    else:
        write_message(announcement)
        is_safe = record_can_extract_refs

    if not is_safe(recid):
        raise NotSafeForExtraction()

    records.append(extract_references_from_record(recid))
    # Create a RT ticket if necessary
    if create_a_ticket:
        create_ticket(recid, bibcatalog_system)
def build_books_kb(fpath):
    """Load the books knowledge base into a dictionary.

    @param fpath: either the path (string) of a '|'-delimited KB file,
        which is opened and closed here, or an iterable of already split
        rows.
    @return: (dictionary) mapping the upper-cased second column of each
        row to the whole row. Rows with fewer than two columns are logged
        and skipped.
    @raise IOError: when fpath is a path that cannot be opened.
    """
    if isinstance(fpath, basestring):
        fpath_needs_closing = True
        try:
            write_message('Loading books kb from %s' % fpath, verbose=3)
            fh = open(fpath, "r")
            source = csv.reader(fh, delimiter='|', lineterminator=';')
        except IOError:
            # problem opening KB for reading, or problem while reading from it:
            emsg = "Error: Could not build list of books - failed " \
                   "to read from KB %(kb)s." % {'kb' : fpath}
            raise IOError(emsg)
    else:
        fpath_needs_closing = False
        source = fpath

    try:
        books = {}
        for line in source:
            try:
                books[line[1].upper()] = line
            except IndexError:
                # Wrap the row in a 1-tuple: a row that is itself a tuple
                # would otherwise be unpacked as the format arguments and
                # raise TypeError instead of being logged.
                write_message('Invalid line in books kb %s' % (line,),
                              verbose=1)
    finally:
        if fpath_needs_closing:
            fh.close()
            write_message('Loaded books kb', verbose=3)

    return books
def build_publishers_kb(fpath): if isinstance(fpath, basestring): fpath_needs_closing = True try: write_message('Loading publishers kb from %s' % fpath, verbose=3) fh = open(fpath, "r") source = csv.reader(fh, delimiter='|', lineterminator='\n') except IOError: # problem opening KB for reading, or problem while reading from it: emsg = "Error: Could not build list of publishers - failed " \ "to read from KB %(kb)s." % {'kb' : fpath} raise IOError(emsg) else: fpath_needs_closing = False source = fpath try: publishers = {} for line in source: try: pattern = re.compile(ur'(\b|^)%s(\b|$)' % line[0], re.I | re.U) publishers[line[0]] = {'pattern': pattern, 'repl': line[1]} except IndexError: write_message('Invalid line in books kb %s' % line, verbose=1) finally: if fpath_needs_closing: fh.close() write_message('Loaded publishers kb', verbose=3) return publishers
def build_publishers_kb(fpath): if isinstance(fpath, basestring): fpath_needs_closing = True try: write_message('Loading publishers kb from %s' % fpath, verbose=3) fh = open(fpath, "r") source = csv.reader(fh, delimiter='|', lineterminator='\n') except IOError: # problem opening KB for reading, or problem while reading from it: emsg = "Error: Could not build list of publishers - failed " \ "to read from KB %(kb)s." % {'kb' : fpath} raise IOError(emsg) else: fpath_needs_closing = False source = fpath try: publishers = {} for line in source: try: pattern = re.compile(ur'(\b|^)%s(\b|$)' % line[0], re.I|re.U) publishers[line[0]] = {'pattern': pattern, 'repl': line[1]} except IndexError: write_message('Invalid line in books kb %s' % line, verbose=1) finally: if fpath_needs_closing: fh.close() write_message('Loaded publishers kb', verbose=3) return publishers
def limit_m_tags(xml_file, length_limit):
    """Limit size of miscellaneous tags

    Every line of xml_file holding a complete ``<subfield code="m">``
    element whose content is longer than length_limit is cut (on a blank
    where one is found before the closing tag) and marked with
    '!Data truncated!'. The result is written to ``xml_file + '.temp'``,
    which then replaces xml_file.

    @param xml_file: (string) path of the UTF-8 encoded XML file, rewritten
        in place.
    @param length_limit: (integer) maximum length kept for an "m" subfield.
    @raise IOError: when xml_file cannot be read or the temporary file
        cannot be opened for writing.
    """
    temp_xml_file = xml_file + '.temp'
    try:
        ofilehdl = open(xml_file, 'r')
    except IOError:
        write_message("***%s\n" % xml_file, verbose=0)
        raise IOError("Error: Unable to read from '%s'" % xml_file)

    # Both handles are now released on every path; previously the input
    # handle was never closed and the output one leaked on errors.
    try:
        try:
            nfilehdl = open(temp_xml_file, 'w')
        except IOError:
            write_message("***%s\n" % temp_xml_file, verbose=0)
            raise IOError("Error: Unable to write to '%s'" % temp_xml_file)

        try:
            for line in ofilehdl:
                line_dec = line.decode("utf-8")
                start_ind = line_dec.find('<subfield code="m">')
                if start_ind != -1:
                    # This line is an "m" line:
                    last_ind = line_dec.find('</subfield>')
                    if last_ind != -1:
                        # This line contains the end-tag for the "m" section
                        # (19 is the length of the opening tag)
                        leng = last_ind - start_ind - 19
                        if leng > length_limit:
                            # want to truncate on a blank to avoid problems..
                            end = start_ind + 19 + length_limit
                            for lett in range(end - 1, last_ind):
                                xx = line_dec[lett:lett + 1]
                                if xx == ' ':
                                    break
                                else:
                                    end += 1
                            middle = line_dec[start_ind + 19:end - 1]
                            line_dec = start_ind * ' ' + \
                                '<subfield code="m">' + \
                                middle + ' !Data truncated! ' + \
                                '</subfield>\n'
                nfilehdl.write("%s" % line_dec.encode("utf-8"))
        finally:
            nfilehdl.close()
    finally:
        ofilehdl.close()

    # copy back to original file name (only reached when no error occurred)
    os.rename(temp_xml_file, xml_file)
def write_references(config, records): """Write in marcxml""" if config.xmlfile: ofilehdl = open(config.xmlfile, 'w') else: ofilehdl = sys.stdout if config.xmlfile: for rec in records: for subfield in rec.find_subfields('999C5m'): if len(subfield.value) > 2048: subfield.value = subfield.value[:2048] try: xml = print_records(records) print >>ofilehdl, xml ofilehdl.flush() except IOError, err: write_message("%s\n%s\n" % (config.xmlfile, err), sys.stderr, verbose=0) halt(err=IOError, msg="Error: Unable to write to '%s'" % config.xmlfile, exit_code=1)
def filter_processed_references(out):
    """Apply filters to the reference lines found, to remove junk.

    The input is split on newlines and given to restrict_m_subfields().
    If that returns a non-zero count, the 'a' subfield following a
    ``999 C 6`` datafield is meant to have the last '-'-separated number
    reduced by that count. The lines are then right-stripped, blank ones
    removed, and everything is joined back with newlines.

    @param out: (string) newline-separated MARCXML lines.
    @return: (string) the filtered lines joined with newlines.
    """
    reference_lines = out.split('\n')

    # Removes too long and too short m tags
    m_restricted, ref_lines = restrict_m_subfields(reference_lines)

    if m_restricted:
        a_tag = re.compile('\<subfield code=\"a\"\>(.*?)\<\/subfield\>')
        for i in range(len(ref_lines)):
            # Checks to see that the datafield has the attribute ind2="6",
            # Before looking to see if the subfield code attribute is 'a'
            if ref_lines[i].find('<datafield tag="999" ind1="C" ind2="6">') != -1 \
                    and (len(ref_lines) - 1) > i:
                # For each line in this datafield element, try to find the
                # subfield whose code attribute is 'a'
                # NOTE(review): the condition requires the current line to
                # contain '</datafield>', yet the line just matched is the
                # opening tag. Unless both tags are on one line the body
                # never runs; check whether '== -1' was meant.
                while ref_lines[i].find('</datafield>') != -1 and (len(ref_lines) - 1) > i:
                    # i is rebound by the enclosing for loop on its next
                    # iteration, so this increment is local to the scan.
                    i += 1
                    # <subfield code="a">Invenio/X.XX.X
                    # refextract/X.XX.X-timestamp-err-repnum-title-URL-misc
                    # remake the "a" tag for new number of "m" tags
                    if a_tag.search(ref_lines[i]):
                        data = a_tag.search(ref_lines[i]).group(1)
                        words1 = data.split()
                        # Decrement the final '-'-separated field of the
                        # last word by the number of restricted m tags.
                        words2 = words1[-1].split('-')
                        old_m = int(words2[-1])
                        words2[-1] = str(old_m - m_restricted)
                        data1 = '-'.join(words2)
                        words1[-1] = data1
                        new_data = ' '.join(words1)
                        ref_lines[i] = ' <subfield code="a">' + new_data + '</subfield>'
                        break

    new_out = '\n'.join([l for l in [rec.rstrip() for rec in ref_lines] if l])

    # NOTE(review): len(reference_lines) counts lines whereas len(new_out)
    # counts characters of the joined string; the two are not comparable.
    if len(reference_lines) != len(new_out):
        write_message(" * filter results: unfilter references line length is %d and filtered length is %d" \
                      % (len(reference_lines), len(new_out)), verbose=2)

    return new_out
def limit_m_tags(xml_file, length_limit):
    """Limit size of miscellaneous tags

    Lines of xml_file that hold a complete ``<subfield code="m">`` element
    with more than length_limit characters of content are shortened
    (preferably on a blank) and flagged with '!Data truncated!'. Output
    goes to ``xml_file + '.temp'``, which is finally renamed over xml_file.

    @param xml_file: (string) path of the UTF-8 encoded XML file, rewritten
        in place.
    @param length_limit: (integer) maximum length kept for an "m" subfield.
    @raise IOError: when xml_file cannot be read or the temporary file
        cannot be opened for writing.
    """
    temp_xml_file = xml_file + '.temp'
    try:
        ofilehdl = open(xml_file, 'r')
    except IOError:
        write_message("***%s\n" % xml_file, verbose=0)
        raise IOError("Error: Unable to read from '%s'" % xml_file)

    # try/finally guarantees that neither handle outlives this call; the
    # input handle was never closed before.
    try:
        try:
            nfilehdl = open(temp_xml_file, 'w')
        except IOError:
            write_message("***%s\n" % temp_xml_file, verbose=0)
            raise IOError("Error: Unable to write to '%s'" % temp_xml_file)

        try:
            for line in ofilehdl:
                line_dec = line.decode("utf-8")
                start_ind = line_dec.find('<subfield code="m">')
                if start_ind != -1:
                    # This line is an "m" line:
                    last_ind = line_dec.find('</subfield>')
                    if last_ind != -1:
                        # This line contains the end-tag for the "m" section
                        # (the opening tag is 19 characters long)
                        leng = last_ind - start_ind - 19
                        if leng > length_limit:
                            # want to truncate on a blank to avoid problems..
                            end = start_ind + 19 + length_limit
                            for lett in range(end - 1, last_ind):
                                xx = line_dec[lett:lett+1]
                                if xx == ' ':
                                    break
                                else:
                                    end += 1
                            middle = line_dec[start_ind+19:end-1]
                            line_dec = start_ind * ' ' + \
                                '<subfield code="m">' + \
                                middle + ' !Data truncated! ' + \
                                '</subfield>\n'
                nfilehdl.write("%s" % line_dec.encode("utf-8"))
        finally:
            nfilehdl.close()
    finally:
        ofilehdl.close()

    # copy back to original file name; skipped when an error was raised
    os.rename(temp_xml_file, xml_file)
def build_special_journals_kb(fpath):
    """Read the special journals knowledge base from a file.

    Special journals are journals that have a volume which is not unique
    among different years. To keep the volume unique we are adding the year
    before the volume.

    @param fpath: (string) path of the KB file, one journal per line.
        Lines starting with '#' and blank lines are ignored.
    @return: (set) of journal names with surrounding whitespace removed.
    """
    write_message('Loading special journals kb from %s' % fpath, verbose=3)
    kb_file = open(fpath, "r")
    names = set()
    try:
        for raw in kb_file:
            entry = raw.strip()
            # Keep the line unless it is a comment or empty
            if entry and not raw.startswith('#'):
                names.add(entry)
    finally:
        kb_file.close()
        write_message('Loaded special journals kb', verbose=3)

    return names
def create_ticket(recid, bibcatalog_system, queue=CFG_REFEXTRACT_TICKET_QUEUE):
    """Open a ticket for a record unless one already exists in the queue.

    Nothing beyond logging happens when either the ticket system or the
    queue is missing.

    @param recid: identifier of the record the ticket is about.
    @param bibcatalog_system: ticketing backend, searched with
        ticket_search(); may be None.
    @param queue: name of the ticket queue.
    @return: None
    """
    write_message('ticket system: %s' % bibcatalog_system.__class__.__name__)
    write_message('queue: %s' % queue)

    if not (bibcatalog_system and queue):
        return

    existing = bibcatalog_system.ticket_search(None,
                                               recordid=recid,
                                               queue=queue)
    if not existing:
        _create_ticket(recid, bibcatalog_system, queue)
        return
    write_message("Ticket #%s found" % existing[0])
def get_reference_section_beginning(fulltext):
    """Find where the reference section starts, trying weaker and weaker
    strategies until one succeeds.

    The strategies are, in order: a titled section, then untitled
    sections recognised via brackets, via dots and via numbers. The
    result of the first strategy that succeeds gets its
    'how_found_start' entry set to 1, 2, 3 or 4 respectively.

    @param fulltext: the document lines, passed to each finder.
    @return: the (dictionary) returned by the successful finder, or None
        when no strategy found a reference section.
    """
    strategies = (
        ## Find start of refs section:
        (find_reference_section, 1),
        ## No references found - try with no title option
        (find_reference_section_no_title_via_brackets, 2),
        ## Try weaker set of patterns if needed
        (find_reference_section_no_title_via_dots, 3),
        ## ...with even weaker patterns
        (find_reference_section_no_title_via_numbers, 4),
    )

    sect_start = None
    for finder, how_found in strategies:
        sect_start = finder(fulltext)
        if sect_start is not None:
            sect_start['how_found_start'] = how_found
            break

    if sect_start:
        write_message('* title %r' % sect_start['title_string'], verbose=3)
        write_message('* marker %r' % sect_start['marker'], verbose=3)
        write_message('* title_marker_same_line %s' \
                      % sect_start['title_marker_same_line'], verbose=3)
    else:
        write_message('* could not find references section', verbose=3)
    return sect_start
def get_reference_section_beginning(fulltext):
    """Locate the beginning of the reference section of a document.

    Four finders are tried in turn, from the strictest (a titled section)
    to the weakest (untitled, numbered lines). The first non-None result
    is tagged with 'how_found_start' = 1..4 according to the finder that
    produced it, logged and returned.

    @param fulltext: the document lines, passed to each finder.
    @return: (dictionary) describing the section start, or None when
        every finder failed.
    """
    finders = [find_reference_section,
               ## No references found - try with no title option
               find_reference_section_no_title_via_brackets,
               ## ... with weaker patterns
               find_reference_section_no_title_via_dots,
               ## ... with even weaker patterns
               find_reference_section_no_title_via_numbers]

    sect_start = None
    rank = 0
    for find_section in finders:
        rank += 1
        sect_start = find_section(fulltext)
        if sect_start is not None:
            sect_start['how_found_start'] = rank
            break

    if not sect_start:
        write_message('* could not find references section', verbose=3)
        return sect_start

    write_message('* title %r' % sect_start['title_string'], verbose=3)
    write_message('* marker %r' % sect_start['marker'], verbose=3)
    write_message('* title_marker_same_line %s' \
                  % sect_start['title_marker_same_line'], verbose=3)
    return sect_start
def get_reference_section_beginning(fulltext):
    """Search for the start of the reference section.

    The search falls back through four finders: titled section, then
    untitled via brackets, via dots and via numbers. The dictionary
    returned by the first finder that succeeds receives a
    "how_found_start" value of 1, 2, 3 or 4 matching that order.

    @param fulltext: the document lines, passed to each finder.
    @return: (dictionary) from the successful finder, or None when none
        of them found a reference section.
    """
    fallbacks = (
        (1, find_reference_section),
        (2, find_reference_section_no_title_via_brackets),
        (3, find_reference_section_no_title_via_dots),
        (4, find_reference_section_no_title_via_numbers),
    )

    sect_start = None
    for code, locate in fallbacks:
        ## Each later finder uses a weaker set of patterns
        sect_start = locate(fulltext)
        if sect_start is None:
            continue
        sect_start["how_found_start"] = code
        break

    if sect_start:
        write_message("* title %r" % sect_start["title_string"], verbose=3)
        write_message("* marker %r" % sect_start["marker"], verbose=3)
        write_message("* title_marker_same_line %s"
                      % sect_start["title_marker_same_line"], verbose=3)
    else:
        write_message("* could not find references section", verbose=3)
    return sect_start
def _create_ticket(recid, bibcatalog_system, queue):
    """Submit a "Refs for #<recid>" ticket for a record.

    When CFG_INSPIRE_SITE is set the ticket is only created for records
    that are in the 'CORE' collection, are not in the 'arXiv' collection,
    do not come from a user submission and were created within the last
    ``30 * 4`` days. In that case the report numbers found in the 037
    fields are appended to the subject.

    @param recid: identifier of the record.
    @param bibcatalog_system: ticketing backend; its ticket_submit() is
        called.
    @param queue: name of the queue the ticket is submitted to.
    @return: None
    """
    subject = "Refs for #%s" % recid

    if CFG_INSPIRE_SITE:
        # Add report number in the subject
        report_number = ""
        record = get_bibrecord(recid)

        # Scan the collections (980 $a) of the record
        in_core = False
        for collection_tag in record_get_field_instances(record, "980"):
            for collection in field_get_subfield_values(collection_tag, 'a'):
                if collection == 'CORE':
                    in_core = True
                if collection == 'arXiv':
                    # Do not create tickets for arxiv papers
                    # Tickets for arxiv papers are created in bibcatalog
                    write_message("arXiv paper", verbose=1)
                    return

        # Do not create tickets for user submissions
        for source_field in record_get_field_instances(record, "541"):
            for source in field_get_subfield_values(source_field, "c"):
                if source == "submission":
                    write_message("User submitted paper", verbose=1)
                    return

        # Only create tickets for CORE papers
        if not in_core:
            write_message("not in core papers", verbose=1)
            return

        # Do not create tickets for old records
        creation_date = run_sql(
            """SELECT creation_date FROM bibrec WHERE id = %s""",
            [recid])[0][0]
        if creation_date < datetime.now() - timedelta(days=30 * 4):
            return

        for report_tag in record_get_field_instances(record, "037"):
            for report_number in field_get_subfield_values(report_tag, 'a'):
                subject += " " + report_number
                # Only leaves the inner loop: the first $a of every 037
                # field is appended, not just the first one overall.
                break

    text = '%s/record/edit/#state=edit&recid=%s' % (CFG_SITE_SECURE_URL,
                                                    recid)
    bibcatalog_system.ticket_submit(subject=subject,
                                    queue=queue,
                                    text=text,
                                    recordid=recid)
def build_reportnum_kb(fpath): """Given the path to a knowledge base file containing the details of institutes and the patterns that their preprint report numbering schemes take, create a dictionary of regexp search patterns to recognise these preprint references in reference lines, and a dictionary of replacements for non-standard preprint categories in these references. The knowledge base file should consist only of lines that take one of the following 3 formats: #####Institute Name#### (the name of the institute to which the preprint reference patterns belong, e.g. '#####LANL#####', surrounded by 5 # on either side.) <pattern> (numeration patterns for an institute's preprints, surrounded by < and >.) seek-term --- replace-term (i.e. a seek phrase on the left hand side, a replace phrase on the right hand side, with the two phrases being separated by 3 hyphens.) E.g.: ASTRO PH ---astro-ph The left-hand side term is a non-standard version of the preprint reference category; the right-hand side term is the standard version. If the KB file cannot be read from, or an unexpected line is encountered in the KB, an error message is output to standard error and execution is halted with an error-code 0. @param fpath: (string) the path to the knowledge base file. @return: (tuple) containing 2 dictionaries. The first contains regexp search patterns used to identify preprint references in a line. This dictionary is keyed by a tuple containing the line number of the pattern in the KB and the non-standard category string. E.g.: (3, 'ASTRO PH'). The second dictionary contains the standardised category string, and is keyed by the non-standard category string. E.g.: 'astro-ph'. 
""" def _add_institute_preprint_patterns( preprint_classifications, preprint_numeration_ptns, preprint_reference_search_regexp_patterns, standardised_preprint_reference_categories, kb_line_num): """For a list of preprint category strings and preprint numeration patterns for a given institute, create the regexp patterns for each of the preprint types. Add the regexp patterns to the dictionary of search patterns (preprint_reference_search_regexp_patterns), keyed by the line number of the institute in the KB, and the preprint category search string. Also add the standardised preprint category string to another dictionary, keyed by the line number of its position in the KB and its non-standardised version. @param preprint_classifications: (list) of tuples whereby each tuple contains a preprint category search string and the line number of the name of institute to which it belongs in the KB. E.g.: (45, 'ASTRO PH'). @param preprint_numeration_ptns: (list) of preprint reference numeration search patterns (strings) @param preprint_reference_search_regexp_patterns: (dictionary) of regexp patterns used to search in document lines. @param standardised_preprint_reference_categories: (dictionary) containing the standardised strings for preprint reference categories. (E.g. 'astro-ph'.) @param kb_line_num: (integer) - the line number int the KB at which a given institute name was found. @return: None """ if preprint_classifications and preprint_numeration_ptns: # the previous institute had both numeration styles and categories # for preprint references. 
# build regexps and add them for this institute: # First, order the numeration styles by line-length, and build a # grouped regexp for recognising numeration: ordered_patterns = \ order_reportnum_patterns_bylen(preprint_numeration_ptns) # create a grouped regexp for numeration part of # preprint reference: numeration_regexp = \ create_institute_numeration_group_regexp_pattern(ordered_patterns) # for each "classification" part of preprint references, create a # complete regex: # will be in the style "(categ)-(numatn1|numatn2|numatn3|...)" for classification in preprint_classifications: search_pattern_str = ur'(?:^|[^a-zA-Z0-9\/\.\-])([\[\(]?(?P<categ>' \ + classification[0].strip() + u')' \ + numeration_regexp + ur'[\]\)]?)' re_search_pattern = re.compile(search_pattern_str, re.UNICODE) preprint_reference_search_regexp_patterns[(kb_line_num, classification[0])] =\ re_search_pattern standardised_preprint_reference_categories[(kb_line_num, classification[0])] =\ classification[1] preprint_reference_search_regexp_patterns = {} # a dictionary of patterns # used to recognise # categories of preprints # as used by various # institutes standardised_preprint_reference_categories = {} # dictionary of # standardised category # strings for preprint cats current_institute_preprint_classifications = [ ] # list of tuples containing # preprint categories in # their raw & standardised # forms, as read from KB current_institute_numerations = [] # list of preprint # numeration patterns, as # read from the KB # pattern to recognise an institute name line in the KB re_institute_name = re.compile(ur'^\*{5}\s*(.+)\s*\*{5}$', re.UNICODE) # pattern to recognise an institute preprint categ line in the KB re_preprint_classification = \ re.compile(ur'^\s*(\w.*)\s*---\s*(\w.*)\s*$', re.UNICODE) # pattern to recognise a preprint numeration-style line in KB re_numeration_pattern = re.compile(ur'^\<(.+)\>$', re.UNICODE) kb_line_num = 0 # when making the dictionary of patterns, which is # keyed by 
the category search string, this counter # will ensure that patterns in the dictionary are not # overwritten if 2 institutes have the same category # styles. try: if isinstance(fpath, basestring): write_message('Loading reports kb from %s' % fpath, verbose=3) fh = open(fpath, "r") fpath_needs_closing = True else: fpath_needs_closing = False fh = fpath for rawline in fh: if rawline.startswith('#'): continue kb_line_num += 1 try: rawline = rawline.decode("utf-8") except UnicodeError: write_message("*** Unicode problems in %s for line %e" % (fpath, kb_line_num), sys.stderr, verbose=0) raise UnicodeError( "Error: Unable to parse report number kb (line: %s)" % str(kb_line_num)) m_institute_name = re_institute_name.search(rawline) if m_institute_name: # This KB line is the name of an institute # append the last institute's pattern list to the list of # institutes: _add_institute_preprint_patterns( current_institute_preprint_classifications, current_institute_numerations, preprint_reference_search_regexp_patterns, standardised_preprint_reference_categories, kb_line_num) # Now start a new dictionary to contain the search patterns # for this institute: current_institute_preprint_classifications = [] current_institute_numerations = [] # move on to the next line continue m_preprint_classification = \ re_preprint_classification.search(rawline) if m_preprint_classification: # This KB line contains a preprint classification for # the current institute try: current_institute_preprint_classifications.append( (m_preprint_classification.group(1), m_preprint_classification.group(2))) except (AttributeError, NameError): # didn't match this line correctly - skip it pass # move on to the next line continue m_numeration_pattern = re_numeration_pattern.search(rawline) if m_numeration_pattern: # This KB line contains a preprint item numeration pattern # for the current institute try: current_institute_numerations.append( m_numeration_pattern.group(1)) except (AttributeError, NameError): # 
didn't match the numeration pattern correctly - skip it pass continue _add_institute_preprint_patterns( current_institute_preprint_classifications, current_institute_numerations, preprint_reference_search_regexp_patterns, standardised_preprint_reference_categories, kb_line_num) if fpath_needs_closing: write_message('Loaded reports kb', verbose=3) fh.close() except IOError: # problem opening KB for reading, or problem while reading from it: emsg = """Error: Could not build knowledge base containing """ \ """institute preprint referencing patterns - failed """ \ """to read from KB %(kb)s.""" \ % {'kb' : fpath} write_message(emsg, sys.stderr, verbose=0) raise IOError("Error: Unable to open report number kb '%s'" % fpath) # return the preprint reference patterns and the replacement strings # for non-standard categ-strings: return (preprint_reference_search_regexp_patterns, standardised_preprint_reference_categories)
def rebuild_reference_lines(ref_sectn, ref_line_marker_ptn): """Given a reference section, rebuild the reference lines. After translation from PDF to text, reference lines are often broken. This is because pdftotext doesn't know what is a wrapped-line and what is a genuine new line. As a result, the following 2 reference lines: [1] See http://invenio-software.org/ for more details. [2] Example, AN: private communication (1996). ...could be broken into the following 4 lines during translation from PDF to plaintext: [1] See http://invenio-software.org/ fo r more details. [2] Example, AN: private communica tion (1996). Such a situation could lead to a citation being separated across 'lines', meaning that it wouldn't be correctly recognised. This function tries to rebuild the reference lines. It uses the pattern used to recognise a reference line's numeration marker to indicate the start of a line. If no reference line numeration was recognised, it will simply join all lines together into one large reference line. @param ref_sectn: (list) of strings. The (potentially broken) reference lines. @param ref_line_marker_ptn: (string) - the pattern used to recognise a reference line's numeration marker. @return: (list) of strings - the rebuilt reference section. Each string in the list represents a complete reference line. 
""" ## initialise some vars: rebuilt_references = [] working_ref = [] strip_before = True if ref_line_marker_ptn is None or type(ref_line_marker_ptn) not in (str, unicode): if test_for_blank_lines_separating_reference_lines(ref_sectn): ## Use blank lines to separate ref lines ref_line_marker_ptn = ur"^\s*$" else: ## No ref line dividers: unmatchable pattern # ref_line_marker_ptn = ur'^A$^A$$' # I am adding a new format, hopefully # this case wasn't useful # Reference1 # etc # Reference2 # etc # We split when there's no identation ref_line_marker_ptn = ur"^[^\s]" strip_before = False write_message("* references separator %s" % ref_line_marker_ptn, verbose=2) p_ref_line_marker = re.compile(ref_line_marker_ptn, re.I | re.UNICODE) # Work backwards, starting from the last 'broken' reference line # Append each fixed reference line to rebuilt_references current_ref = None line_counter = 0 def prepare_ref(working_ref): working_line = "" for l in reversed(working_ref): working_line = join_lines(working_line, l) working_line = working_line.rstrip() return wash_and_repair_reference_line(working_line) for line in reversed(ref_sectn): # Try to find the marker for the reference line if strip_before: current_string = line.strip() m_ref_line_marker = p_ref_line_marker.search(current_string) else: m_ref_line_marker = p_ref_line_marker.search(line) current_string = line.strip() if m_ref_line_marker and (not current_ref or current_ref == int(m_ref_line_marker.group("marknum")) + 1): # Reference line marker found! 
: Append this reference to the # list of fixed references and reset the working_line to 'blank' if current_string != "": ## If it's not a blank line to separate refs working_ref.append(current_string) # Append current working line to the refs list if line_counter < CFG_REFEXTRACT_MAX_LINES: rebuilt_references.append(prepare_ref(working_ref)) try: current_ref = int(m_ref_line_marker.group("marknum")) except IndexError: pass # this line doesn't have numbering working_ref = [] line_counter = 0 elif current_string != u"": # Continuation of line working_ref.append(current_string) line_counter += 1 if working_ref: # Append last line rebuilt_references.append(prepare_ref(working_ref)) # A list of reference lines has been built backwards - reverse it: rebuilt_references.reverse() # Make sure mulitple markers within references are correctly # in place (compare current marker num with current marker num +1) # rebuilt_references = correct_rebuilt_lines(rebuilt_references, \ # p_ref_line_marker) # For each properly formated reference line, try to identify cases # where there is more than one citation in a single line. This is # done by looking for semi-colons, which could be used to # separate references return rebuilt_references
def rebuild_reference_lines(ref_sectn, ref_line_marker_ptn): """Given a reference section, rebuild the reference lines. After translation from PDF to text, reference lines are often broken. This is because pdftotext doesn't know what is a wrapped-line and what is a genuine new line. As a result, the following 2 reference lines: [1] See http://invenio-software.org/ for more details. [2] Example, AN: private communication (1996). ...could be broken into the following 4 lines during translation from PDF to plaintext: [1] See http://invenio-software.org/ fo r more details. [2] Example, AN: private communica tion (1996). Such a situation could lead to a citation being separated across 'lines', meaning that it wouldn't be correctly recognised. This function tries to rebuild the reference lines. It uses the pattern used to recognise a reference line's numeration marker to indicate the start of a line. If no reference line numeration was recognised, it will simply join all lines together into one large reference line. @param ref_sectn: (list) of strings. The (potentially broken) reference lines. @param ref_line_marker_ptn: (string) - the pattern used to recognise a reference line's numeration marker. @return: (list) of strings - the rebuilt reference section. Each string in the list represents a complete reference line. """ indentation_splitting = False # This should be moved the function detecting the pattern! 
if not ref_line_marker_ptn: if test_for_blank_lines_separating_reference_lines(ref_sectn): # Use blank lines to separate ref lines ref_line_marker_ptn = ur'^\s*$' else: # No ref line dividers # We are guessing this the format: # Reference1 # etc # Reference2 # etc # We split when there's no identation indentation_splitting = True ref_line_marker_ptn = ur'^[^\s]' write_message('* references separator %s' % ref_line_marker_ptn, verbose=2) p_ref_line_marker = re.compile(ref_line_marker_ptn, re.I|re.UNICODE) # Start from ref 1 # Append each fixed reference line to rebuilt_references # and rebuild references as we go current_ref = 0 rebuilt_references = [] working_ref = [] def prepare_ref(working_ref): working_ref = working_ref[:CFG_REFEXTRACT_MAX_LINES] working_line = "" for l in working_ref: working_line = join_lines(working_line, l.strip()) working_line = working_line.rstrip() return wash_and_repair_reference_line(working_line) lower_case_start = re.compile(ur'[a-z]') continuing_line_markers = re.compile(ur'[,&-]$') for line in ref_sectn: # Can't find a good way to distinguish between # pagination and the page number of a journal numeration that # happens to be alone in a new line # m = match_pagination(line) # if m and current_ref and current_ref != m + 1: # continue # Try to find the marker for the reference line m_ref_line_marker = p_ref_line_marker.search(line) if m_ref_line_marker: try: marknum = int(m_ref_line_marker.group('marknum')) except IndexError: marknum = None except ValueError: # If the mark is a unicode character category [Nd], # it is not always convertible to int by int() # We can't use its numerical value, but we still accept it # as numeration pass new_line_detected = False if marknum is None or current_ref + 1 == marknum: new_line_detected = True if indentation_splitting: if lower_case_start.match(line.strip()): new_line_detected = False if working_ref and \ continuing_line_markers.search(working_ref[-1].strip()): new_line_detected = False if 
new_line_detected: # Reference line marker found! : Append this reference to the # list of fixed references and reset the working_line to 'blank' start = m_ref_line_marker.start() if line[:start]: # If it's not a blank line to separate refs # Only append from the start of the marker # For this case: # [1] hello # hello2 [2] foo working_ref.append(line[:start]) # Append current working line to the refs list if working_ref: rebuilt_references.append(prepare_ref(working_ref)) current_ref = marknum working_ref = [] if line[start:]: working_ref.append(line[start:]) else: # Our marker does not match the counting # Either we missed one, the author missed one or # it is not a line marker # For now we assume it is not line marker working_ref.append(line) elif line: # Continuation of line working_ref.append(line) if working_ref: # Append last line rebuilt_references.append(prepare_ref(working_ref)) return rebuilt_references
def build_journals_kb(knowledgebase): """Given the path to a knowledge base file, read in the contents of that file into a dictionary of search->replace word phrases. The search phrases are compiled into a regex pattern object. The knowledge base file should consist only of lines that take the following format: seek-term --- replace-term (i.e. a seek phrase on the left hand side, a replace phrase on the right hand side, with the two phrases being separated by 3 hyphens.) E.g.: ASTRONOMY AND ASTROPHYSICS ---Astron. Astrophys. The left-hand side term is a non-standard version of the title, whereas the right-hand side term is the standard version. If the KB file cannot be read from, or an unexpected line is encountered in the KB, an error message is output to standard error and execution is halted with an error-code 0. @param fpath: (string) the path to the knowledge base file. @return: (tuple) containing a list and a dictionary. The list contains compiled regex patterns used as search terms and will be used to force searching order to match that of the knowledge base. The dictionary contains the search->replace terms. The keys of the dictionary are the compiled regex word phrases used for searching in the reference lines; The values in the dictionary are the replace terms for matches. 
""" # Initialise vars: # dictionary of search and replace phrases from KB: kb = {} standardised_titles = {} seek_phrases = [] # A dictionary of "replacement terms" (RHS) to be inserted into KB as # "seek terms" later, if they were not already explicitly added # by the KB: repl_terms = {} write_message('Processing journals kb', verbose=3) for seek_phrase, repl in knowledgebase: # We match on a simplified line, thus dots are replaced # with spaces seek_phrase = seek_phrase.replace('.', ' ').decode('utf-8').upper() # good KB line # Add the 'replacement term' into the dictionary of # replacement terms: repl_terms[repl] = None # add the phrase from the KB if the 'seek' phrase is longer # compile the seek phrase into a pattern: seek_ptn = re.compile(ur'(?<!\w)(%s)\W' % re.escape(seek_phrase), re.UNICODE) kb[seek_phrase] = seek_ptn standardised_titles[seek_phrase] = repl seek_phrases.append(seek_phrase) # Now, for every 'replacement term' found in the KB, if it is # not already in the KB as a "search term", add it: for repl_term in repl_terms.keys(): raw_repl_phrase = repl_term.upper() raw_repl_phrase = re_punctuation.sub(u' ', raw_repl_phrase) raw_repl_phrase = \ re_group_captured_multiple_space.sub(u' ', raw_repl_phrase) raw_repl_phrase = raw_repl_phrase.strip() if raw_repl_phrase not in kb: # The replace-phrase was not in the KB as a seek phrase # It should be added. pattern = ur'(?<!\/)\b(%s)[^A-Z0-9]' % re.escape(raw_repl_phrase) seek_ptn = re.compile(pattern, re.U) kb[raw_repl_phrase] = seek_ptn standardised_titles[raw_repl_phrase] = repl_term seek_phrases.append(raw_repl_phrase) # Sort the titles by string length (long - short) seek_phrases.sort(_cmp_bystrlen_reverse) write_message('Processed journals kb', verbose=3) # return the raw knowledge base: return kb, standardised_titles, seek_phrases
def rebuild_reference_lines(ref_sectn, ref_line_marker_ptn): """Given a reference section, rebuild the reference lines. After translation from PDF to text, reference lines are often broken. This is because pdftotext doesn't know what is a wrapped-line and what is a genuine new line. As a result, the following 2 reference lines: [1] See http://invenio-software.org/ for more details. [2] Example, AN: private communication (1996). ...could be broken into the following 4 lines during translation from PDF to plaintext: [1] See http://invenio-software.org/ fo r more details. [2] Example, AN: private communica tion (1996). Such a situation could lead to a citation being separated across 'lines', meaning that it wouldn't be correctly recognised. This function tries to rebuild the reference lines. It uses the pattern used to recognise a reference line's numeration marker to indicate the start of a line. If no reference line numeration was recognised, it will simply join all lines together into one large reference line. @param ref_sectn: (list) of strings. The (potentially broken) reference lines. @param ref_line_marker_ptn: (string) - the pattern used to recognise a reference line's numeration marker. @return: (list) of strings - the rebuilt reference section. Each string in the list represents a complete reference line. """ indentation_splitting = False # This should be moved the function detecting the pattern! 
if not ref_line_marker_ptn: if test_for_blank_lines_separating_reference_lines(ref_sectn): # Use blank lines to separate ref lines ref_line_marker_ptn = ur'^\s*$' else: # No ref line dividers # We are guessing this the format: # Reference1 # etc # Reference2 # etc # We split when there's no identation indentation_splitting = True ref_line_marker_ptn = ur'^[^\s]' write_message('* references separator %s' % ref_line_marker_ptn, verbose=2) p_ref_line_marker = re.compile(ref_line_marker_ptn, re.I | re.UNICODE) # Start from ref 1 # Append each fixed reference line to rebuilt_references # and rebuild references as we go current_ref = 0 rebuilt_references = [] working_ref = [] def prepare_ref(working_ref): working_ref = working_ref[:CFG_REFEXTRACT_MAX_LINES] working_line = "" for l in working_ref: working_line = join_lines(working_line, l.strip()) working_line = working_line.rstrip() return wash_and_repair_reference_line(working_line) lower_case_start = re.compile(ur'[a-z]') continuing_line_markers = re.compile(ur'[,&-]$') for line in ref_sectn: # Can't find a good way to distinguish between # pagination and the page number of a journal numeration that # happens to be alone in a new line # m = match_pagination(line) # if m and current_ref and current_ref != m + 1: # continue # Try to find the marker for the reference line m_ref_line_marker = p_ref_line_marker.search(line) if m_ref_line_marker: try: marknum = int(m_ref_line_marker.group('marknum')) except IndexError: marknum = None except ValueError: # If the mark is a unicode character category [Nd], # it is not always convertible to int by int() # We can't use its numerical value, but we still accept it # as numeration pass new_line_detected = False if marknum is None or current_ref + 1 == marknum: new_line_detected = True if indentation_splitting: if lower_case_start.match(line.strip()): new_line_detected = False if working_ref and \ continuing_line_markers.search(working_ref[-1].strip()): new_line_detected = False if 
new_line_detected: # Reference line marker found! : Append this reference to the # list of fixed references and reset the working_line to 'blank' start = m_ref_line_marker.start() if line[:start]: # If it's not a blank line to separate refs # Only append from the start of the marker # For this case: # [1] hello # hello2 [2] foo working_ref.append(line[:start]) # Append current working line to the refs list if working_ref: rebuilt_references.append(prepare_ref(working_ref)) current_ref = marknum working_ref = [] if line[start:]: working_ref.append(line[start:]) else: # Our marker does not match the counting # Either we missed one, the author missed one or # it is not a line marker # For now we assume it is not line marker working_ref.append(line) elif line: # Continuation of line working_ref.append(line) if working_ref: # Append last line rebuilt_references.append(prepare_ref(working_ref)) return rebuilt_references
def build_journals_kb(knowledgebase): """Given the path to a knowledge base file, read in the contents of that file into a dictionary of search->replace word phrases. The search phrases are compiled into a regex pattern object. The knowledge base file should consist only of lines that take the following format: seek-term --- replace-term (i.e. a seek phrase on the left hand side, a replace phrase on the right hand side, with the two phrases being separated by 3 hyphens.) E.g.: ASTRONOMY AND ASTROPHYSICS ---Astron. Astrophys. The left-hand side term is a non-standard version of the title, whereas the right-hand side term is the standard version. If the KB file cannot be read from, or an unexpected line is encountered in the KB, an error message is output to standard error and execution is halted with an error-code 0. @param fpath: (string) the path to the knowledge base file. @return: (tuple) containing a list and a dictionary. The list contains compiled regex patterns used as search terms and will be used to force searching order to match that of the knowledge base. The dictionary contains the search->replace terms. The keys of the dictionary are the compiled regex word phrases used for searching in the reference lines; The values in the dictionary are the replace terms for matches. 
""" # Initialise vars: # dictionary of search and replace phrases from KB: kb = {} standardised_titles = {} seek_phrases = [] # A dictionary of "replacement terms" (RHS) to be inserted into KB as # "seek terms" later, if they were not already explicitly added # by the KB: repl_terms = {} write_message('Processing journals kb', verbose=3) for seek_phrase, repl in knowledgebase: # We match on a simplified line, thus dots are replaced # with spaces seek_phrase = seek_phrase.replace('.', ' ').upper() # good KB line # Add the 'replacement term' into the dictionary of # replacement terms: repl_terms[repl] = None # add the phrase from the KB if the 'seek' phrase is longer # compile the seek phrase into a pattern: seek_ptn = re.compile(ur'(?<!\w)(%s)\W' % re.escape(seek_phrase), re.UNICODE) kb[seek_phrase] = seek_ptn standardised_titles[seek_phrase] = repl seek_phrases.append(seek_phrase) # Now, for every 'replacement term' found in the KB, if it is # not already in the KB as a "search term", add it: for repl_term in repl_terms.keys(): raw_repl_phrase = repl_term.upper() raw_repl_phrase = re_punctuation.sub(u' ', raw_repl_phrase) raw_repl_phrase = \ re_group_captured_multiple_space.sub(u' ', raw_repl_phrase) raw_repl_phrase = raw_repl_phrase.strip() if raw_repl_phrase not in kb: # The replace-phrase was not in the KB as a seek phrase # It should be added. pattern = ur'(?<!\/)\b(%s)[^A-Z0-9]' % re.escape(raw_repl_phrase) seek_ptn = re.compile(pattern, re.U) kb[raw_repl_phrase] = seek_ptn standardised_titles[raw_repl_phrase] = repl_term seek_phrases.append(raw_repl_phrase) # Sort the titles by string length (long - short) seek_phrases.sort(_cmp_bystrlen_reverse) write_message('Processed journals kb', verbose=3) # return the raw knowledge base: return kb, standardised_titles, seek_phrases
def rebuild_reference_lines(ref_sectn, ref_line_marker_ptn): """Given a reference section, rebuild the reference lines. After translation from PDF to text, reference lines are often broken. This is because pdftotext doesn't know what is a wrapped-line and what is a genuine new line. As a result, the following 2 reference lines: [1] See http://invenio-software.org/ for more details. [2] Example, AN: private communication (1996). ...could be broken into the following 4 lines during translation from PDF to plaintext: [1] See http://invenio-software.org/ fo r more details. [2] Example, AN: private communica tion (1996). Such a situation could lead to a citation being separated across 'lines', meaning that it wouldn't be correctly recognised. This function tries to rebuild the reference lines. It uses the pattern used to recognise a reference line's numeration marker to indicate the start of a line. If no reference line numeration was recognised, it will simply join all lines together into one large reference line. @param ref_sectn: (list) of strings. The (potentially broken) reference lines. @param ref_line_marker_ptn: (string) - the pattern used to recognise a reference line's numeration marker. @return: (list) of strings - the rebuilt reference section. Each string in the list represents a complete reference line. 
""" ## initialise some vars: rebuilt_references = [] working_ref = [] strip_before = True if ref_line_marker_ptn is None or \ type(ref_line_marker_ptn) not in (str, unicode): if test_for_blank_lines_separating_reference_lines(ref_sectn): ## Use blank lines to separate ref lines ref_line_marker_ptn = ur'^\s*$' else: ## No ref line dividers: unmatchable pattern #ref_line_marker_ptn = ur'^A$^A$$' # I am adding a new format, hopefully # this case wasn't useful # Reference1 # etc # Reference2 # etc # We split when there's no identation ref_line_marker_ptn = ur'^[^\s]' strip_before = False write_message('* references separator %s' % ref_line_marker_ptn, verbose=2) p_ref_line_marker = re.compile(ref_line_marker_ptn, re.I | re.UNICODE) # Work backwards, starting from the last 'broken' reference line # Append each fixed reference line to rebuilt_references current_ref = None line_counter = 0 def prepare_ref(working_ref): working_line = "" for l in reversed(working_ref): working_line = join_lines(working_line, l) working_line = working_line.rstrip() return wash_and_repair_reference_line(working_line) for line in reversed(ref_sectn): # Try to find the marker for the reference line if strip_before: current_string = line.strip() m_ref_line_marker = p_ref_line_marker.search(current_string) else: m_ref_line_marker = p_ref_line_marker.search(line) current_string = line.strip() if m_ref_line_marker and (not current_ref \ or current_ref == int(m_ref_line_marker.group('marknum')) + 1): # Reference line marker found! 
: Append this reference to the # list of fixed references and reset the working_line to 'blank' if current_string != '': ## If it's not a blank line to separate refs working_ref.append(current_string) # Append current working line to the refs list if line_counter < CFG_REFEXTRACT_MAX_LINES: rebuilt_references.append(prepare_ref(working_ref)) try: current_ref = int(m_ref_line_marker.group('marknum')) except IndexError: pass # this line doesn't have numbering working_ref = [] line_counter = 0 elif current_string != u'': # Continuation of line working_ref.append(current_string) line_counter += 1 if working_ref: # Append last line rebuilt_references.append(prepare_ref(working_ref)) # A list of reference lines has been built backwards - reverse it: rebuilt_references.reverse() # Make sure mulitple markers within references are correctly # in place (compare current marker num with current marker num +1) # rebuilt_references = correct_rebuilt_lines(rebuilt_references, \ # p_ref_line_marker) # For each properly formated reference line, try to identify cases # where there is more than one citation in a single line. This is # done by looking for semi-colons, which could be used to # separate references return rebuilt_references
def build_reportnum_kb(fpath): """Given the path to a knowledge base file containing the details of institutes and the patterns that their preprint report numbering schemes take, create a dictionary of regexp search patterns to recognise these preprint references in reference lines, and a dictionary of replacements for non-standard preprint categories in these references. The knowledge base file should consist only of lines that take one of the following 3 formats: #####Institute Name#### (the name of the institute to which the preprint reference patterns belong, e.g. '#####LANL#####', surrounded by 5 # on either side.) <pattern> (numeration patterns for an institute's preprints, surrounded by < and >.) seek-term --- replace-term (i.e. a seek phrase on the left hand side, a replace phrase on the right hand side, with the two phrases being separated by 3 hyphens.) E.g.: ASTRO PH ---astro-ph The left-hand side term is a non-standard version of the preprint reference category; the right-hand side term is the standard version. If the KB file cannot be read from, or an unexpected line is encountered in the KB, an error message is output to standard error and execution is halted with an error-code 0. @param fpath: (string) the path to the knowledge base file. @return: (tuple) containing 2 dictionaries. The first contains regexp search patterns used to identify preprint references in a line. This dictionary is keyed by a tuple containing the line number of the pattern in the KB and the non-standard category string. E.g.: (3, 'ASTRO PH'). The second dictionary contains the standardised category string, and is keyed by the non-standard category string. E.g.: 'astro-ph'. 
""" def _add_institute_preprint_patterns(preprint_classifications, preprint_numeration_ptns, preprint_reference_search_regexp_patterns, standardised_preprint_reference_categories, kb_line_num): """For a list of preprint category strings and preprint numeration patterns for a given institute, create the regexp patterns for each of the preprint types. Add the regexp patterns to the dictionary of search patterns (preprint_reference_search_regexp_patterns), keyed by the line number of the institute in the KB, and the preprint category search string. Also add the standardised preprint category string to another dictionary, keyed by the line number of its position in the KB and its non-standardised version. @param preprint_classifications: (list) of tuples whereby each tuple contains a preprint category search string and the line number of the name of institute to which it belongs in the KB. E.g.: (45, 'ASTRO PH'). @param preprint_numeration_ptns: (list) of preprint reference numeration search patterns (strings) @param preprint_reference_search_regexp_patterns: (dictionary) of regexp patterns used to search in document lines. @param standardised_preprint_reference_categories: (dictionary) containing the standardised strings for preprint reference categories. (E.g. 'astro-ph'.) @param kb_line_num: (integer) - the line number int the KB at which a given institute name was found. @return: None """ if preprint_classifications and preprint_numeration_ptns: # the previous institute had both numeration styles and categories # for preprint references. 
# build regexps and add them for this institute: # First, order the numeration styles by line-length, and build a # grouped regexp for recognising numeration: ordered_patterns = \ order_reportnum_patterns_bylen(preprint_numeration_ptns) # create a grouped regexp for numeration part of # preprint reference: numeration_regexp = \ create_institute_numeration_group_regexp_pattern(ordered_patterns) # for each "classification" part of preprint references, create a # complete regex: # will be in the style "(categ)-(numatn1|numatn2|numatn3|...)" for classification in preprint_classifications: search_pattern_str = ur'(?:^|[^a-zA-Z0-9\/\.\-])([\[\(]?(?P<categ>' \ + classification[0].strip() + u')' \ + numeration_regexp + u'[\]\)]?)' re_search_pattern = re.compile(search_pattern_str, re.UNICODE) preprint_reference_search_regexp_patterns[(kb_line_num, classification[0])] =\ re_search_pattern standardised_preprint_reference_categories[(kb_line_num, classification[0])] =\ classification[1] preprint_reference_search_regexp_patterns = {} # a dictionary of patterns # used to recognise # categories of preprints # as used by various # institutes standardised_preprint_reference_categories = {} # dictionary of # standardised category # strings for preprint cats current_institute_preprint_classifications = [] # list of tuples containing # preprint categories in # their raw & standardised # forms, as read from KB current_institute_numerations = [] # list of preprint # numeration patterns, as # read from the KB # pattern to recognise an institute name line in the KB re_institute_name = re.compile(ur'^\*{5}\s*(.+)\s*\*{5}$', re.UNICODE) # pattern to recognise an institute preprint categ line in the KB re_preprint_classification = \ re.compile(ur'^\s*(\w.*)\s*---\s*(\w.*)\s*$', re.UNICODE) # pattern to recognise a preprint numeration-style line in KB re_numeration_pattern = re.compile(ur'^\<(.+)\>$', re.UNICODE) kb_line_num = 0 # when making the dictionary of patterns, which is # keyed by 
the category search string, this counter # will ensure that patterns in the dictionary are not # overwritten if 2 institutes have the same category # styles. try: if isinstance(fpath, basestring): write_message('Loading reports kb from %s' % fpath, verbose=3) fh = open(fpath, "r") fpath_needs_closing = True else: fpath_needs_closing = False fh = fpath for rawline in fh: if rawline.startswith('#'): continue kb_line_num += 1 try: rawline = rawline.decode("utf-8") except UnicodeError: write_message("*** Unicode problems in %s for line %e" % (fpath, kb_line_num), sys.stderr, verbose=0) raise UnicodeError("Error: Unable to parse report number kb (line: %s)" % str(kb_line_num)) m_institute_name = re_institute_name.search(rawline) if m_institute_name: # This KB line is the name of an institute # append the last institute's pattern list to the list of # institutes: _add_institute_preprint_patterns(current_institute_preprint_classifications, current_institute_numerations, preprint_reference_search_regexp_patterns, standardised_preprint_reference_categories, kb_line_num) # Now start a new dictionary to contain the search patterns # for this institute: current_institute_preprint_classifications = [] current_institute_numerations = [] # move on to the next line continue m_preprint_classification = \ re_preprint_classification.search(rawline) if m_preprint_classification: # This KB line contains a preprint classification for # the current institute try: current_institute_preprint_classifications.append((m_preprint_classification.group(1), m_preprint_classification.group(2))) except (AttributeError, NameError): # didn't match this line correctly - skip it pass # move on to the next line continue m_numeration_pattern = re_numeration_pattern.search(rawline) if m_numeration_pattern: # This KB line contains a preprint item numeration pattern # for the current institute try: current_institute_numerations.append(m_numeration_pattern.group(1)) except (AttributeError, NameError): # 
didn't match the numeration pattern correctly - skip it pass continue _add_institute_preprint_patterns(current_institute_preprint_classifications, current_institute_numerations, preprint_reference_search_regexp_patterns, standardised_preprint_reference_categories, kb_line_num) if fpath_needs_closing: write_message('Loaded reports kb', verbose=3) fh.close() except IOError: # problem opening KB for reading, or problem while reading from it: emsg = """Error: Could not build knowledge base containing """ \ """institute preprint referencing patterns - failed """ \ """to read from KB %(kb)s.""" \ % {'kb' : fpath} write_message(emsg, sys.stderr, verbose=0) raise IOError("Error: Unable to open report number kb '%s'" % fpath) # return the preprint reference patterns and the replacement strings # for non-standard categ-strings: return (preprint_reference_search_regexp_patterns, standardised_preprint_reference_categories)