def solr_add_range(lower_recid, upper_recid, tags_to_index, next_commit_counter):
    """Add the relevant field values of all records in the given recid
    range (inclusive) to Solr, preserving fulltext information.

    @param lower_recid: first record id of the range
    @param upper_recid: last record id of the range (inclusive)
    @param tags_to_index: mapping passed through to
        get_field_content_in_utf8 to resolve each logical field
    @param next_commit_counter: running counter used by
        solr_commit_if_necessary to decide when to issue a commit
    @return: the updated commit counter
    """
    for recid in range(lower_recid, upper_recid + 1):
        if record_exists(recid):
            abstract = get_field_content_in_utf8(recid, 'abstract', tags_to_index)
            author = get_field_content_in_utf8(recid, 'author', tags_to_index)
            keyword = get_field_content_in_utf8(recid, 'keyword', tags_to_index)
            title = get_field_content_in_utf8(recid, 'title', tags_to_index)
            try:
                # Fulltext extraction is best-effort: missing or broken
                # attached files must not prevent indexing the metadata.
                bibrecdocs = BibRecDocs(recid)
                fulltext = unicode(bibrecdocs.get_text(), 'utf-8')
            except Exception:
                # Was a bare "except:"; narrowed so SystemExit and
                # KeyboardInterrupt are no longer swallowed.
                fulltext = ''
            solr_add(recid, abstract, author, fulltext, keyword, title)
            next_commit_counter = solr_commit_if_necessary(next_commit_counter,
                                                          recid=recid)
    return next_commit_counter
def xapian_add_all(lower_recid, upper_recid):
    """Add the relevant field values of all records in the given recid
    range (inclusive) to Xapian, preserving fulltext information.

    Each field (abstract, author, fulltext, keyword, title) is fetched
    best-effort: any failure for a field results in an empty value being
    indexed for that field rather than aborting the record.

    @param lower_recid: first record id of the range
    @param upper_recid: last record id of the range (inclusive)
    """
    xapian_init_databases()
    for recid in range(lower_recid, upper_recid + 1):
        # All "except Exception" clauses below were bare "except:" — they
        # are narrowed so SystemExit/KeyboardInterrupt propagate.
        try:
            abstract = unicode(
                get_fieldvalues(recid, CFG_MARC_ABSTRACT)[0], 'utf-8')
        except Exception:
            abstract = ""
        xapian_add(recid, "abstract", abstract)
        try:
            first_author = get_fieldvalues(recid, CFG_MARC_AUTHOR_NAME)[0]
            # NOTE: reduce with an '' initializer yields a leading space
            # before the first additional author; kept as-is so the
            # indexed text is byte-identical to before.
            additional_authors = reduce(
                lambda x, y: x + " " + y,
                get_fieldvalues(recid, CFG_MARC_ADDITIONAL_AUTHOR_NAME), '')
            author = unicode(first_author + " " + additional_authors, 'utf-8')
        except Exception:
            author = ""
        xapian_add(recid, "author", author)
        try:
            bibrecdocs = BibRecDocs(recid)
            fulltext = unicode(bibrecdocs.get_text(), 'utf-8')
        except Exception:
            fulltext = ""
        xapian_add(recid, "fulltext", fulltext)
        try:
            keyword = unicode(
                reduce(lambda x, y: x + " " + y,
                       get_fieldvalues(recid, CFG_MARC_KEYWORD), ''),
                'utf-8')
        except Exception:
            keyword = ""
        xapian_add(recid, "keyword", keyword)
        try:
            title = unicode(get_fieldvalues(recid, CFG_MARC_TITLE)[0], 'utf-8')
        except Exception:
            title = ""
        xapian_add(recid, "title", title)
def solr_add_range(lower_recid, upper_recid, tags_to_index, next_commit_counter):
    """Add the relevant field values of all records in the given recid
    range (inclusive) to Solr, preserving fulltext information.

    @param lower_recid: first record id of the range
    @param upper_recid: last record id of the range (inclusive)
    @param tags_to_index: mapping passed through to
        get_field_content_in_utf8 to resolve each logical field
    @param next_commit_counter: running counter used by
        solr_commit_if_necessary to decide when to issue a commit
    @return: the updated commit counter
    """
    for recid in range(lower_recid, upper_recid + 1):
        if not record_exists(recid):
            continue
        abstract = get_field_content_in_utf8(recid, 'abstract', tags_to_index)
        author = get_field_content_in_utf8(recid, 'author', tags_to_index)
        keyword = get_field_content_in_utf8(recid, 'keyword', tags_to_index)
        title = get_field_content_in_utf8(recid, 'title', tags_to_index)
        try:
            # Best-effort fulltext extraction; broken or absent attached
            # files fall back to an empty fulltext.
            fulltext = unicode(BibRecDocs(recid).get_text(), 'utf-8')
        except Exception:
            # Previously a bare "except:" — narrowed so SystemExit and
            # KeyboardInterrupt are not swallowed.
            fulltext = ''
        solr_add(recid, abstract, author, fulltext, keyword, title)
        next_commit_counter = solr_commit_if_necessary(next_commit_counter,
                                                       recid=recid)
    return next_commit_counter
def xapian_add_all(lower_recid, upper_recid):
    """Add the relevant field values of all records in the given recid
    range (inclusive) to Xapian, preserving fulltext information.

    Each field (abstract, author, fulltext, keyword, title) is fetched
    best-effort: a failure for one field indexes an empty value for it
    instead of aborting the record.

    @param lower_recid: first record id of the range
    @param upper_recid: last record id of the range (inclusive)
    """
    xapian_init_databases()
    for recid in range(lower_recid, upper_recid + 1):
        # Every "except Exception" below replaces a bare "except:" so
        # that SystemExit/KeyboardInterrupt propagate.
        try:
            abstract = unicode(
                get_fieldvalues(recid, CFG_MARC_ABSTRACT)[0], 'utf-8')
        except Exception:
            abstract = ""
        xapian_add(recid, "abstract", abstract)
        try:
            first_author = get_fieldvalues(recid, CFG_MARC_AUTHOR_NAME)[0]
            # reduce with an '' initializer produces a leading space ahead
            # of the first additional author; preserved for compatibility.
            additional_authors = reduce(
                lambda x, y: x + " " + y,
                get_fieldvalues(recid, CFG_MARC_ADDITIONAL_AUTHOR_NAME), '')
            author = unicode(first_author + " " + additional_authors, 'utf-8')
        except Exception:
            author = ""
        xapian_add(recid, "author", author)
        try:
            bibrecdocs = BibRecDocs(recid)
            fulltext = unicode(bibrecdocs.get_text(), 'utf-8')
        except Exception:
            fulltext = ""
        xapian_add(recid, "fulltext", fulltext)
        try:
            keyword = unicode(
                reduce(lambda x, y: x + " " + y,
                       get_fieldvalues(recid, CFG_MARC_KEYWORD), ''),
                'utf-8')
        except Exception:
            keyword = ""
        xapian_add(recid, "keyword", keyword)
        try:
            title = unicode(get_fieldvalues(recid, CFG_MARC_TITLE)[0], 'utf-8')
        except Exception:
            title = ""
        xapian_add(recid, "title", title)
def get_words_from_fulltext(self, url_direct_or_indirect):
    """Return all the words contained in the document specified by
    URL_DIRECT_OR_INDIRECT, with the words being split by the various
    SRE_SEPARATORS regexps set earlier.

    If FORCE_FILE_EXTENSION is set (e.g. to "pdf"), then treat
    URL_DIRECT_OR_INDIRECT as a PDF file.  (This is interesting to index
    Indico for example.)  Note also that URL_DIRECT_OR_INDIRECT may be
    either a direct URL to the fulltext file or an URL to a setlink-like
    page body that presents the links to be indexed.  In the latter case
    the URL_DIRECT_OR_INDIRECT is parsed to extract actual direct URLs
    to fulltext documents, for all known file extensions as specified by
    the global CONV_PROGRAMS config variable.

    Returns [] when the fulltext is dispatched to an external indexer
    (Solr/Xapian) or when extraction fails.
    """
    write_message("... reading fulltext files from %s started" % url_direct_or_indirect, verbose=2)
    try:
        if bibdocfile_url_p(url_direct_or_indirect):
            write_message("... %s is an internal document" % url_direct_or_indirect, verbose=2)
            bibdoc = bibdocfile_url_to_bibdoc(url_direct_or_indirect)
            indexer = get_idx_indexer('fulltext')
            if indexer != 'native':
                # A document might belong to multiple records
                for rec_link in bibdoc.bibrec_links:
                    recid = rec_link["recid"]
                    # Adds fulltexts of all files once per record.
                    # NOTE(review): fulltext_added is not defined in this
                    # chunk — presumably a module-level set; confirm.
                    if not recid in fulltext_added:
                        bibrecdocs = BibRecDocs(recid)
                        text = bibrecdocs.get_text()
                        if indexer == 'SOLR' and CFG_SOLR_URL:
                            solr_add_fulltext(recid, text)
                        elif indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED:
                            xapian_add(recid, 'fulltext', text)
                        fulltext_added.add(recid)
                # we are relying on an external information retrieval system
                # to provide full-text indexing, so dispatch text to it and
                # return nothing here:
                return []
            else:
                text = ""
                if hasattr(bibdoc, "get_text"):
                    text = bibdoc.get_text()
                return self.tokenize_for_words_default(text)
        else:
            if CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY:
                write_message("... %s is external URL but indexing only local files" % url_direct_or_indirect, verbose=2)
                return []
            write_message("... %s is an external URL" % url_direct_or_indirect, verbose=2)
            urls_to_index = set()
            # Expand splash pages into the direct URLs they link to.
            for splash_re, url_re in iteritems(CFG_BIBINDEX_SPLASH_PAGES):
                if re.match(splash_re, url_direct_or_indirect):
                    write_message("... %s is a splash page (%s)" % (url_direct_or_indirect, splash_re), verbose=2)
                    html = urllib2.urlopen(url_direct_or_indirect).read()
                    urls = get_links_in_html_page(html)
                    write_message("... found these URLs in %s splash page: %s" % (url_direct_or_indirect, ", ".join(urls)), verbose=3)
                    for url in urls:
                        if re.match(url_re, url):
                            write_message("... will index %s (matched by %s)" % (url, url_re), verbose=2)
                            urls_to_index.add(url)
            if not urls_to_index:
                # Not a splash page: index the URL itself.
                urls_to_index.add(url_direct_or_indirect)
            write_message("... will extract words from %s" % ', '.join(urls_to_index), verbose=2)
            words = {}
            for url in urls_to_index:
                tmpdoc = download_url(url)
                file_converter_logger = get_file_converter_logger()
                old_logging_level = file_converter_logger.getEffectiveLevel()
                if self.verbose > 3:
                    file_converter_logger.setLevel(logging.DEBUG)
                try:
                    try:
                        tmptext = convert_file(tmpdoc, output_format='.txt')
                        # FIX: previously open(tmptext).read() leaked the
                        # file handle; "with" guarantees it is closed.
                        with open(tmptext) as tmptext_file:
                            text = tmptext_file.read()
                        os.remove(tmptext)
                        indexer = get_idx_indexer('fulltext')
                        if indexer != 'native':
                            if indexer == 'SOLR' and CFG_SOLR_URL:
                                solr_add_fulltext(None, text) # FIXME: use real record ID
                            if indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED:
                                #xapian_add(None, 'fulltext', text) # FIXME: use real record ID
                                pass
                            # we are relying on an external information retrieval system
                            # to provide full-text indexing, so dispatch text to it and
                            # return nothing here:
                            tmpwords = []
                        else:
                            tmpwords = self.tokenize_for_words_default(text)
                        words.update(dict(map(lambda x: (x, 1), tmpwords)))
                    except Exception as e:
                        message = 'ERROR: it\'s impossible to correctly extract words from %s referenced by %s: %s' % (url, url_direct_or_indirect, e)
                        register_exception(prefix=message, alert_admin=True)
                        write_message(message, stream=sys.stderr)
                finally:
                    # Always clean up the downloaded temporary document.
                    os.remove(tmpdoc)
                    if self.verbose > 3:
                        file_converter_logger.setLevel(old_logging_level)
            return words.keys()
    except Exception as e:
        message = 'ERROR: it\'s impossible to correctly extract words from %s: %s' % (url_direct_or_indirect, e)
        register_exception(prefix=message, alert_admin=True)
        write_message(message, stream=sys.stderr)
        return []
def get_words_from_fulltext(self, url_direct_or_indirect):
    """Return all the words contained in the document specified by
    URL_DIRECT_OR_INDIRECT, with the words being split by the various
    SRE_SEPARATORS regexps set earlier.

    If FORCE_FILE_EXTENSION is set (e.g. to "pdf"), then treat
    URL_DIRECT_OR_INDIRECT as a PDF file.  (This is interesting to index
    Indico for example.)  Note also that URL_DIRECT_OR_INDIRECT may be
    either a direct URL to the fulltext file or an URL to a setlink-like
    page body that presents the links to be indexed.  In the latter case
    the URL_DIRECT_OR_INDIRECT is parsed to extract actual direct URLs
    to fulltext documents, for all known file extensions as specified by
    the global CONV_PROGRAMS config variable.

    Returns [] when the fulltext is dispatched to an external indexer
    (Solr/Xapian), when the 8564 tag is outdated, or when extraction
    fails.
    """
    write_message("... reading fulltext files from %s started" % url_direct_or_indirect, verbose=2)
    try:
        if bibdocfile_url_p(url_direct_or_indirect):
            write_message("... %s is an internal document" % url_direct_or_indirect, verbose=2)
            try:
                bibdoc = bibdocfile_url_to_bibdoc(url_direct_or_indirect)
            except InvenioBibDocFileError:
                # Outdated 8564 tag
                return []
            indexer = get_idx_indexer('fulltext')
            if indexer != 'native':
                # A document might belong to multiple records
                for rec_link in bibdoc.bibrec_links:
                    recid = rec_link["recid"]
                    # Adds fulltexts of all files once per record.
                    # NOTE(review): fulltext_added is not defined in this
                    # chunk — presumably a module-level set; confirm.
                    if not recid in fulltext_added:
                        bibrecdocs = BibRecDocs(recid)
                        try:
                            text = bibrecdocs.get_text()
                        except InvenioBibDocFileError:
                            # Invalid PDF
                            continue
                        if indexer == 'SOLR' and CFG_SOLR_URL:
                            solr_add_fulltext(recid, text)
                        elif indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED:
                            xapian_add(recid, 'fulltext', text)
                        fulltext_added.add(recid)
                # we are relying on an external information retrieval system
                # to provide full-text indexing, so dispatch text to it and
                # return nothing here:
                return []
            else:
                text = ""
                if hasattr(bibdoc, "get_text"):
                    text = bibdoc.get_text()
                return self.tokenize_for_words_default(text)
        else:
            if CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY:
                write_message("... %s is external URL but indexing only local files" % url_direct_or_indirect, verbose=2)
                return []
            write_message("... %s is an external URL" % url_direct_or_indirect, verbose=2)
            urls_to_index = set()
            # Expand splash pages into the direct URLs they link to.
            for splash_re, url_re in iteritems(CFG_BIBINDEX_SPLASH_PAGES):
                if re.match(splash_re, url_direct_or_indirect):
                    write_message("... %s is a splash page (%s)" % (url_direct_or_indirect, splash_re), verbose=2)
                    html = urllib2.urlopen(url_direct_or_indirect).read()
                    urls = get_links_in_html_page(html)
                    write_message("... found these URLs in %s splash page: %s" % (url_direct_or_indirect, ", ".join(urls)), verbose=3)
                    for url in urls:
                        if re.match(url_re, url):
                            write_message("... will index %s (matched by %s)" % (url, url_re), verbose=2)
                            urls_to_index.add(url)
            if not urls_to_index:
                # Not a splash page: index the URL itself.
                urls_to_index.add(url_direct_or_indirect)
            write_message("... will extract words from %s" % ', '.join(urls_to_index), verbose=2)
            words = {}
            for url in urls_to_index:
                tmpdoc = download_url(url)
                file_converter_logger = get_file_converter_logger()
                old_logging_level = file_converter_logger.getEffectiveLevel()
                if self.verbose > 3:
                    file_converter_logger.setLevel(logging.DEBUG)
                try:
                    try:
                        tmptext = convert_file(tmpdoc, output_format='.txt')
                        # FIX: previously open(tmptext).read() leaked the
                        # file handle; "with" guarantees it is closed.
                        with open(tmptext) as tmptext_file:
                            text = tmptext_file.read()
                        os.remove(tmptext)
                        indexer = get_idx_indexer('fulltext')
                        if indexer != 'native':
                            if indexer == 'SOLR' and CFG_SOLR_URL:
                                solr_add_fulltext(None, text) # FIXME: use real record ID
                            if indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED:
                                #xapian_add(None, 'fulltext', text) # FIXME: use real record ID
                                pass
                            # we are relying on an external information retrieval system
                            # to provide full-text indexing, so dispatch text to it and
                            # return nothing here:
                            tmpwords = []
                        else:
                            tmpwords = self.tokenize_for_words_default(text)
                        words.update(dict(map(lambda x: (x, 1), tmpwords)))
                    except Exception as e:
                        message = 'ERROR: it\'s impossible to correctly extract words from %s referenced by %s: %s' % (url, url_direct_or_indirect, e)
                        register_exception(prefix=message, alert_admin=True)
                        write_message(message, stream=sys.stderr)
                finally:
                    # Always clean up the downloaded temporary document.
                    os.remove(tmpdoc)
                    if self.verbose > 3:
                        file_converter_logger.setLevel(old_logging_level)
            return words.keys()
    except Exception as e:
        message = 'ERROR: it\'s impossible to correctly extract words from %s: %s' % (url_direct_or_indirect, e)
        register_exception(prefix=message, alert_admin=True)
        write_message(message, stream=sys.stderr)
        return []