def run(self, todo): if todo: counter = 0 if todo == 'things' or todo == 'all': print 'dropping things index' solr.delete(queries=solr.Q(content_type="thing")) solr.commit() print 'reindexing things' for t in Thing.objects().all(): t.add_to_solr(commit=False) if counter == 100: solr.commit() print " 100 done - at ", t.title counter = 0 counter += 1 if todo == 'collections' or todo == 'all': print 'dropping collections index' solr.delete(queries=solr.Q(content_type="collection")) solr.commit() print 'reindexing collections' if todo == 'makers' or todo == 'all': print 'dropping makers index' solr.delete(queries=solr.Q(content_type="maker")) solr.commit() print 'reindexing makers' for m in Maker.objects().all(): m.add_to_solr(commit=False) if counter == 100: solr.commit() print " 100 done - at ", m.display_name counter = 0 counter += 1 if todo == 'discussions' or todo == 'all': print 'dropping discussions index' solr.delete(queries=solr.Q(content_type="thread")) solr.commit() print 'reindexing discussions' if todo == 'pages' or todo == 'all': print 'dropping pages index' solr.delete(queries=solr.Q(content_type="page")) solr.commit() if todo == 'uploads' or todo == 'all': print 'dropping uploads index' solr.delete(queries=solr.Q(content_type="upload")) solr.commit()
def index_upload(self, u, force=False): """ Indexes a file upload, if possible; forces the issue, if necessary; update """ # try to get the first page def upload_already_indexed(upload): ''' Has the upload already been indexed? Look for page 1 ''' try: p = es.get(index="aaaarg", doc_type="page", id="%s_%s" % (str(upload.id), 1), fields='md5') return True except: return False try_path = u.full_path() n, e = os.path.splitext(try_path) # only handle pdfs if not e == '.pdf': return False # Determine the job is_indexed = upload_already_indexed(u) needs_extraction = force or not is_indexed _illegal_xml_chars_RE = re.compile( u'[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]') # Try to extract if needs_extraction: print "Opening", u.structured_file_name, "for extraction" try: pages = Pdf(try_path).dump_pages() num_pages = len(pages) except: return False else: try: num_pages = Pdf(try_path).npages except: return False # This is the base document t = Thing.objects(files=u)[0] body = { 'md5': u.md5, 'thing': str(t.id), 'title': t.title, 'makers': [str(m.maker.id) for m in t.makers], 'makers_string': t.format_makers_string(), 'collections': [str(c.id) for c in Collection.objects.filter(things__thing=t)], 'page_count': len(pages), 'page': 1, } if needs_extraction and pages: for page_num, content in pages.iteritems(): if content: print "Page:", page_num id = "%s_%s" % (str(u.id), page_num) try: content = unicode(content, 'utf-8') content = unidecode(content) except: pass # re.sub(_illegal_xml_chars_RE, '?', content) body['searchable_text'] = content body['page'] = page_num es.index( index="aaaarg", doc_type="page", id=id, body=body) elif not needs_extraction: print "Updating ", num_pages, "pages - extraction not needed." for page_num in range(num_pages): # 0 index, needs to be corrected id = "%s_%s" % (str(u.id), page_num + 1) body['page'] = page_num + 1 es.update( index="aaaarg", doc_type="page", id=id, body={'doc': body})
def index_upload(self, u, force=False): """ Indexes a file upload, if possible; forces the issue, if necessary; update """ # try to get the first page def upload_already_indexed(upload): ''' Has the upload already been indexed? Look for page 1 ''' try: p = es.get(index="aaaarg", doc_type="page", id="%s_%s" % (str(upload.id), 1), fields='md5') return True except: return False try_path = u.full_path() n, e = os.path.splitext(try_path) # only handle pdfs if not e == '.pdf': return False # Determine the job is_indexed = upload_already_indexed(u) needs_extraction = force or not is_indexed _illegal_xml_chars_RE = re.compile( u'[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]') # Try to extract if needs_extraction: print "Opening", u.structured_file_name, "for extraction" try: pages = Pdf(try_path).dump_pages() num_pages = len(pages) except: return False else: try: num_pages = Pdf(try_path).npages except: return False # This is the base document t = Thing.objects(files=u)[0] body = { 'md5': u.md5, 'thing': str(t.id), 'title': t.title, 'makers': [str(m.maker.id) for m in t.makers], 'makers_string': t.format_makers_string(), 'collections': [str(c.id) for c in Collection.objects.filter(things__thing=t)], 'page_count': len(pages), 'page': 1, } if needs_extraction and pages: for page_num, content in pages.iteritems(): if content: print "Page:", page_num id = "%s_%s" % (str(u.id), page_num) try: content = unicode(content, 'utf-8') content = unidecode(content) except: pass # re.sub(_illegal_xml_chars_RE, '?', content) body['searchable_text'] = content body['page'] = page_num es.index(index="aaaarg", doc_type="page", id=id, body=body) elif not needs_extraction: print "Updating ", num_pages, "pages - extraction not needed." for page_num in range(num_pages): # 0 index, needs to be corrected id = "%s_%s" % (str(u.id), page_num + 1) body['page'] = page_num + 1 es.update(index="aaaarg", doc_type="page", id=id, body={'doc': body})