def get_all_dirs_under_index_path(self):
    '''
    Return the result of digipal.utils.get_all_files_under() for
    settings.SEARCH_INDEX_PATH, restricted by the filters returned by
    self.get_filtered_indexes().

    Per the original description these are the absolute paths of all the
    index folders, optionally filtered by the --if command line option.
    '''
    from digipal.utils import get_all_files_under

    index_root = settings.SEARCH_INDEX_PATH
    index_filters = self.get_filtered_indexes()
    return get_all_files_under(index_root, filters=index_filters)
def get_all_dirs_under_index_path(self):
    '''
    Return the result of digipal.utils.get_all_files_under() for
    settings.SEARCH_INDEX_PATH, restricted by the filters returned by
    self.get_filtered_indexes().

    Per the original description these are the absolute paths of all the
    index folders, optionally filtered by the --if command line option.
    '''
    from digipal.utils import get_all_files_under

    index_root = settings.SEARCH_INDEX_PATH
    index_filters = self.get_filtered_indexes()
    return get_all_files_under(index_root, filters=index_filters)
def get_index_info(self, path):
    '''
    Collect statistics about the Whoosh index stored under `path`.

    Returns a dictionary with the keys:
        size:     sum of os.path.getsize() over the files under path
        date:     greatest os.path.getmtime() among those files (0 if none)
        entries:  searcher.doc_count(), or '?' if the index was not opened
        segments: one {'id', 'entries'} dictionary per index segment
        fields:   one dictionary per schema field with 'name', 'type',
                  'range' and 'unique_values'
        values:   sorted distinct terms of the field named by
                  self.options['field'] (only if that field is in the schema)
        results:  output of self.whoosh_search() (only if
                  self.options['qs'] is set)
    '''
    from digipal.utils import get_all_files_under
    import whoosh
    from whoosh.index import open_dir

    ret = {'date': 0, 'size': 0, 'fields': [], 'entries': '?', 'segments': []}

    # basic filesystem info
    for file_path in get_all_files_under(path, file_types='f'):
        ret['size'] += os.path.getsize(file_path)
        ret['date'] = max(ret['date'], os.path.getmtime(file_path))

    # whoosh info; an empty index is reported with the defaults set above
    try:
        index = open_dir(path)
    except whoosh.index.EmptyIndexError:
        index = None

    query = self.options['qs']
    afield = self.options['field']

    if not index:
        return ret

    with index.searcher() as searcher:
        ret['entries'] = searcher.doc_count()

        # NOTE(review): _segments() is an underscore-prefixed method of the
        # index object; confirm it exists in the installed Whoosh version.
        ret['segments'].extend([
            {'id': segment.segid, 'entries': segment.doc_count()}
            for segment in index._segments()
        ])

        for name, field in index.schema.items():
            type_name = field.__class__.__name__
            values = list(searcher.field_terms(name))

            # Numeric date fields: only values strictly between -5000 and
            # 5000 take part in the range (presumably to drop sentinel
            # values - confirm).
            in_range = values
            if type_name == 'NUMERIC' and 'date' in name:
                in_range = [v for v in values if -5000 < v < 5000]
            # min()/max() need at least one value
            if not in_range:
                in_range = [0]

            ret['fields'].append({
                'name': name,
                'type': type_name,
                'range': [
                    repr(v)[0:12] for v in (min(in_range), max(in_range))
                ],
                'unique_values': len(values),
            })

            if name == afield:
                ret['values'] = sorted(set(values))

        # The search needs the searcher, so it runs before it is closed.
        if query:
            info = {}
            ret['results'] = self.whoosh_search(query, searcher, index, info)

    return ret
def md2cms(self):
    '''
    Feed the markdown documentation files to self.update_cms_page().

    self.update_cms_page() is first called for the 'doc' slug with
    draft=True. Then, for each file with the 'md' extension found under
    doc.get_doc_root_path('digipal') (filtered by self.options['filter']),
    the path is printed, the file is parsed with doc.get_doc_from_md() and,
    if that returns something, its title and its content wrapped in a
    <div class="mddoc"> are passed to self.update_cms_page() along with the
    'doc' slug. The id and slug of the returned page, if any, are printed.
    '''
    from digipal.views import doc

    doc_slug = 'doc'
    self.update_cms_page(doc_slug, draft=True)

    md_paths = utils.get_all_files_under(
        doc.get_doc_root_path('digipal'),
        file_types='f',
        filters=self.options['filter'],
        extensions='md',
        can_return_root=True
    )
    for path in md_paths:
        # single-argument parenthesised print: same output on Python 2
        print(path)
        info = doc.get_doc_from_md(utils.read_file(path))
        if not info:
            continue
        content = u'<div class="mddoc">%s</div>' % info['content']
        page = self.update_cms_page(info['title'], content, doc_slug)
        if page:
            print(' => # %s (%s)' % (page.id, page.slug))
def html2md(self):
    '''
    Convert the HTML files found under the path in self.args[1] to
    markdown files.

    For each file with an 'html' or 'htm' extension (filtered by
    self.options['filter']), doc.get_md_from_html() is called and its 'md'
    entry is written to <doc root>/<slugified title>.md, where the doc root
    is doc.get_doc_root_path('digipal'). Targets containing
    'confluence-workbox' are skipped. The source path, the target path and
    every entry of info['files'] are printed.

    Raises SystemExit with status 1 when self.args holds fewer than two
    items (i.e. the path is missing).
    '''
    if len(self.args) < 2:
        print('ERROR: missing path. Check help.')
        # The bare exit() helper is injected by the site module and, called
        # without argument, terminates with status 0. Use sys.exit() with a
        # non-zero status so that callers can detect the failure.
        import sys
        sys.exit(1)

    from digipal.views import doc
    from django.utils.text import slugify

    source_path = self.args[1]
    html_paths = utils.get_all_files_under(
        source_path,
        file_types='f',
        filters=self.options['filter'],
        extensions=['html', 'htm'],
        can_return_root=True
    )
    for path in html_paths:
        info = doc.get_md_from_html(path)
        target = os.path.join(
            doc.get_doc_root_path('digipal'),
            slugify(info['title'])
        ) + '.md'
        if 'confluence-workbox' in target:
            continue
        utils.write_file(target, info['md'])
        # single-argument parenthesised print: same output on Python 2
        print('%s\n => %s' % (path, target))
        for f in info['files']:
            print(' + %s' % f)
context['running'] = context['indexing'] and context['indexing']['progress'] < 1.0 now = datetime.now() if context['indexing'] and\ (not context['running'] and ((now - context['indexing']['updated']).total_seconds() > (60 * 10))): context['indexing'] = None # read the index stats for ct in content_types: info = {'date': 0, 'size': 0} context['indexes'][ct.key] = { 'object': ct, 'info': info, 'indexing': context['indexing']['indexes'].get(ct.key, None) if context['indexing'] else None, } for afile in get_all_files_under(ct.get_whoosh_index_path(), file_types='f'): info['size'] += os.path.getsize(afile) info['date'] = max(info['date'], os.path.getmtime(afile)) info['date'] = datetime.fromtimestamp(info['date']) info['size'] = int(info['size']) context['title'] = 'Search Indexer' template = 'search/search_index.html' if request.is_ajax(): template = 'search/search_index_fragment.html' ret = render_to_response( template, context, context_instance=RequestContext(request)) return ret
def search_index_view(request):
    """
    Render the 'Search Indexer' page.

    If the POST data has action == "reindex", the content types whose
    "select-<key>" field is set are passed to the "dpsearch index_facets"
    management command (via dputils.call_management_command) and the
    initial indexer state is placed in the context. Otherwise the state is
    read with SearchIndexer.read_state(). A state which is not running and
    was updated more than 10 minutes ago is discarded.

    For each content type the total size and the latest modification time
    of the files under its Whoosh index path are added to the context.
    AJAX requests are rendered with the fragment template.
    """
    # todo
    # DONE reindex selected indexes in background
    # . show when indexer is working
    # . lock form (if working)
    # . show last time indexer started (if working)
    # . ajaxify
    # . vue.js?
    from digipal.views.faceted_search import faceted_search
    from digipal.utils import get_all_files_under
    from datetime import datetime
    from digipal.views.faceted_search.search_indexer import SearchIndexer

    context = {"indexes": SortedDict()}

    indexer = SearchIndexer()
    content_types = faceted_search.get_types(True)

    # process request
    if request.POST.get("action", "") == "reindex":
        reindexes = [
            ct.key for ct in content_types
            if request.POST.get("select-%s" % ct.key)
        ]
        if reindexes:
            dputils.call_management_command(
                "dpsearch", "index_facets", **{"if": ",".join(reindexes)}
            )
            context["indexing"] = indexer.get_state_initial(reindexes)

    if "indexing" not in context:
        context["indexing"] = indexer.read_state()

    state = context["indexing"]
    running = state and state["progress"] < 1.0
    context["running"] = running

    # forget a state which is not running and is older than 10 minutes
    now = datetime.now()
    if state and not running:
        if (now - state["updated"]).total_seconds() > (60 * 10):
            state = context["indexing"] = None

    # read the index stats
    for ct in content_types:
        info = {"date": 0, "size": 0}
        context["indexes"][ct.key] = {
            "object": ct,
            "info": info,
            "indexing": state["indexes"].get(ct.key, None) if state else None,
        }

        total_size = 0
        latest_mtime = 0
        index_files = get_all_files_under(
            ct.get_whoosh_index_path(), file_types="f"
        )
        for afile in index_files:
            total_size += os.path.getsize(afile)
            latest_mtime = max(latest_mtime, os.path.getmtime(afile))

        info["date"] = datetime.fromtimestamp(latest_mtime)
        info["size"] = int(total_size)

    context["title"] = "Search Indexer"

    if request.is_ajax():
        template = "search/search_index_fragment.html"
    else:
        template = "search/search_index.html"

    return render_to_response(
        template, context, context_instance=RequestContext(request)
    )
def get_index_info(self, path):
    '''
    Collect statistics about the Whoosh index stored under `path`.

    Returns a dictionary with the keys:
        size:     sum of os.path.getsize() over the files under path
        date:     greatest os.path.getmtime() among those files (0 if none)
        entries:  searcher.doc_count(), or '?' if the index was not opened
        segments: one {'id', 'entries'} dictionary per index segment
        fields:   one dictionary per schema field with 'name', 'type',
                  'range' and 'unique_values'
        values:   sorted distinct terms of the field named by
                  self.options['field'] (only if that field is in the schema)
        results:  output of self.whoosh_search() (only if
                  self.options['qs'] is set)
    '''
    from digipal.utils import get_all_files_under
    import whoosh
    from whoosh.index import open_dir

    ret = {
        'date': 0,
        'size': 0,
        'fields': [],
        'entries': '?',
        'segments': []
    }

    # basic filesystem info
    for file_path in get_all_files_under(path, file_types='f'):
        ret['size'] += os.path.getsize(file_path)
        ret['date'] = max(ret['date'], os.path.getmtime(file_path))

    # whoosh info; an empty index is reported with the defaults set above
    try:
        index = open_dir(path)
    except whoosh.index.EmptyIndexError:
        index = None

    query = self.options['qs']
    afield = self.options['field']

    if not index:
        return ret

    with index.searcher() as searcher:
        ret['entries'] = searcher.doc_count()

        # NOTE(review): _segments() is an underscore-prefixed method of the
        # index object; confirm it exists in the installed Whoosh version.
        ret['segments'].extend([
            {'id': segment.segid, 'entries': segment.doc_count()}
            for segment in index._segments()
        ])

        for name, field in index.schema.items():
            type_name = field.__class__.__name__
            values = list(searcher.field_terms(name))

            # Numeric date fields: only values strictly between -5000 and
            # 5000 take part in the range (presumably to drop sentinel
            # values - confirm).
            in_range = values
            if type_name == 'NUMERIC' and 'date' in name:
                in_range = [v for v in values if -5000 < v < 5000]
            # min()/max() need at least one value
            if not in_range:
                in_range = [0]

            ret['fields'].append({
                'name': name,
                'type': type_name,
                'range': [
                    repr(v)[0:12] for v in (min(in_range), max(in_range))
                ],
                'unique_values': len(values),
            })

            if name == afield:
                ret['values'] = sorted(set(values))

        # The search needs the searcher, so it runs before it is closed.
        if query:
            info = {}
            ret['results'] = self.whoosh_search(query, searcher, index, info)

    return ret
context['indexing'] = None # read the index stats for ct in content_types: info = {'date': 0, 'size': 0} context['indexes'][ct.key] = { 'object': ct, 'info': info, 'indexing': context['indexing']['indexes'].get(ct.key, None) if context['indexing'] else None, } for afile in get_all_files_under(ct.get_whoosh_index_path(), file_types='f'): info['size'] += os.path.getsize(afile) info['date'] = max(info['date'], os.path.getmtime(afile)) info['date'] = datetime.fromtimestamp(info['date']) info['size'] = int(info['size']) context['title'] = 'Search Indexer' template = 'search/search_index.html' if request.is_ajax(): template = 'search/search_index_fragment.html' ret = render_to_response(template, context, context_instance=RequestContext(request))