def solr_add_range(lower_recid, upper_recid):
    """
    Adds the relevant field values of all records from the lower recid to
    the upper one to Solr.  It preserves the fulltext information.

    For every record ID in the range for which record_exists() is true, the
    abstract, author, fulltext, keyword and title values are gathered,
    passed through remove_control_characters(), decoded as UTF-8 and handed
    to solr_add().  Each field is best-effort: if it cannot be produced, an
    empty string is sent instead.  Once the whole range has been handled,
    the Solr connection is committed and the task may sleep or stop.

    @param lower_recid: first record ID of the range (inclusive)
    @param upper_recid: last record ID of the range (inclusive)
    """
    for recid in range(lower_recid, upper_recid + 1):
        if not record_exists(recid):
            continue

        # NOTE: the handlers below catch Exception rather than using a bare
        # "except:", so that KeyboardInterrupt and SystemExit still
        # propagate instead of being turned into an empty field.

        try:
            # Only the first abstract value is indexed; a record without
            # one fails on the [0] lookup and falls back to "".
            abstract = unicode(remove_control_characters(get_fieldvalues(recid, CFG_MARC_ABSTRACT)[0]), 'utf-8')
        except Exception:
            abstract = ""

        try:
            first_author = remove_control_characters(get_fieldvalues(recid, CFG_MARC_AUTHOR_NAME)[0])
            # Every additional author is prefixed with a single space, so
            # the joined string starts with a separator when non-empty.
            additional_authors = remove_control_characters(
                "".join(" " + name for name in get_fieldvalues(recid, CFG_MARC_ADDITIONAL_AUTHOR_NAME)))
            author = unicode(first_author + " " + additional_authors, 'utf-8')
        except Exception:
            # NOTE(review): a record lacking a first author ends up with an
            # empty author field even if additional authors exist.
            author = ""

        try:
            # Re-read the text attached to the record so that it is sent
            # along with the metadata.
            bibrecdocs = BibRecDocs(recid)
            fulltext = unicode(remove_control_characters(bibrecdocs.get_text()), 'utf-8')
        except Exception:
            fulltext = ""

        try:
            # All keyword values are indexed, each prefixed with a space.
            keyword = unicode(remove_control_characters(
                "".join(" " + value for value in get_fieldvalues(recid, CFG_MARC_KEYWORD))), 'utf-8')
        except Exception:
            keyword = ""

        try:
            # Only the first title value is indexed.
            title = unicode(remove_control_characters(get_fieldvalues(recid, CFG_MARC_TITLE)[0]), 'utf-8')
        except Exception:
            title = ""

        solr_add(recid, abstract, author, fulltext, keyword, title)

    SOLR_CONNECTION.commit()
    task_sleep_now_if_required(can_stop_too=True)
def solr_add_fulltext(recid, text):
    """
    Send TEXT to Solr as the fulltext of the record RECID.

    The text is passed through remove_control_characters(), decoded as
    UTF-8 and passed through remove_invalid_solr_characters() before being
    added; all other Solr fields of the document are sent empty.

    @param recid: record ID; a false value means nothing is sent
    @param text: fulltext to index (decoded as UTF-8)
    @return: True when the document was added, False when RECID is false
        or a Unicode decoding/encoding error occurred
    """
    if not recid:
        return False

    try:
        cleaned = remove_control_characters(text)
        decoded = remove_invalid_solr_characters(unicode(cleaned, 'utf-8'))
        SOLR_CONNECTION.add(id=recid,
                            abstract="",
                            author="",
                            fulltext=decoded,
                            keyword="",
                            title="")
    except (UnicodeDecodeError, UnicodeEncodeError):
        # forget about bad UTF-8 files
        return False
    return True
def test_remove_control_characters(self):
    """textutils - stripping of control characters"""
    # Newline, tab and carriage return are expected to come back unchanged.
    self.assertEqual("foo\nbar\tfab\n\r", remove_control_characters('foo\nbar\tfab\n\r'))
    # The \02 octal escape (character code 2) is expected to come back as
    # a single space.
    self.assertEqual("abc de", remove_control_characters('abc\02de'))