def test_solr_interface(self, mocksunburnt, mockhttplib):
    # basic init with no options
    solr_interface()
    # httplib2.Http should be initialized with defaults (no args, no cert)
    # NOTE: mock's assert_called_with (not the no-op called_with) is required
    # for this check to actually run
    mockhttplib.Http.assert_called_with(ca_certs=None)
    mocksunburnt.SolrInterface.assert_called_with(
        settings.SOLR_SERVER_URL,
        schemadoc=settings.SOLR_SCHEMA,
        http_connection=mockhttplib.Http.return_value)

def test_solr_interface_cert(self, mocksunburnt, mockhttplib):
    # init with a ca cert
    settings.SOLR_CA_CERT_PATH = '/some/path/to/certs'
    solr_interface()
    # httplib should be initialized with ca_certs option
    mockhttplib.Http.assert_called_with(
        ca_certs=settings.SOLR_CA_CERT_PATH
        # proxy_info=mockhttplib.ProxyInfo.return_value
    )

def test_search_collections(self, mockpaginator, mocksolr_interface, mocksearch_libs):
    solr = solr_interface()
    search_url = reverse('search:keyword')
    mocksolr = mocksolr_interface.return_value
    mocksolr.Q = MagicMock(solr.Q)
    mocksolr.query.return_value = mocksolr.query
    for method in ['query', 'facet_by', 'sort_by', 'field_limit',
                   'exclude', 'filter']:
        getattr(mocksolr.query, method).return_value = mocksolr.query

    # create researcher IP for localhost so anonymous access will be
    # treated as anonymous researcher
    researchip = ResearcherIP(name='test client', ip_address='127.0.0.1')
    researchip.save()

    response = self.client.get(search_url, {'collection': '1000'})
    # check solr query args
    # - collection should trigger OR query against collection label and number fields
    mocksolr.Q.assert_any_call(collection_label='1000')
    mocksolr.Q.assert_any_call(collection_source_id='1000')
    # NOTE: not checking OR query directly because unclear how to replicate in mock

    self.assertContains(
        response,
        '<input class="form-control" id="id_collection" name="collection" '
        'placeholder="Search by collection name or number" type="text" '
        'value="%s">' % '1000',
        html=True,
        msg_prefix='collection search value should be displayed on result page via form')

    researchip.delete()

def archives(format=None):
    """Find Archives objects, to which CollectionObjects belong.

    :returns: list of :class:`CollectionObject`
    :rtype: list
    """
    # NOTE: formerly called top-level collections or Repository /
    # Owning Repository; should now be called archive and labeled
    # as such anywhere user-facing
    # TODO: search logic very similar to item_collections and
    # subcollections methods; consider refactoring search logic
    # into a common search method.
    if CollectionObject._archives is None:
        # find all objects with cmodel collection-1.1 and no parents
        # search solr for collection objects with NO parent collection id
        solr = solr_interface()
        # NOTE: not filtering on pidspace, since top-level objects are loaded
        # as fixtures and may not match the configured pidspace in a dev environment
        solrquery = solr.query(content_model=CollectionObject.COLLECTION_CONTENT_MODEL)
        collections = solrquery.exclude(archive_id__any=True) \
                               .sort_by('title_exact').execute()
        # store the solr response format
        CollectionObject._archives = collections

    if format == dict:
        return CollectionObject._archives

    # otherwise, initialize as instances of CollectionObject
    repo = Repository()
    return [repo.get_object(arch['pid'], type=CollectionObject)
            for arch in CollectionObject._archives]

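# A note on the caching above: archives() memoizes the raw solr response on
# the class itself (CollectionObject._archives), so repeated calls skip solr
# entirely.  A minimal sketch of invalidating that cache, e.g. after ingesting
# a new top-level archive object; the helper name is an illustrative
# assumption, not part of the original code:
def clear_archives_cache():
    # force the next archives() call to re-query solr
    CollectionObject._archives = None
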
def find_by_collection_number(num, parent=None):
    '''Find a CollectionObject in Fedora by collection number (or source id),
    optionally limited by parent collection (owning archive).

    :param num: collection number to search for (aka source id)
    :param parent: optional; archive that the collection must belong to
    :return: generator of any matching items, as instances of
        :class:`CollectionObject`
    '''
    solr = solr_interface()
    solrquery = solr.query(content_model=CollectionObject.COLLECTION_CONTENT_MODEL,
                           pid='%s:*' % settings.FEDORA_PIDSPACE,
                           source_id=int(num))
    # if parent is specified, restrict by archive id (parent should be a pid)
    if parent is not None:
        # remove prefix on parent, if present
        prefix = 'info:fedora/'
        if parent.startswith(prefix):
            parent = parent[len(prefix):]
        solrquery = solrquery.query(archive_id=parent)

    # by default, only returns 10; get everything
    # - solr response is a list of dictionary with collection info
    # use dictsort in template for sorting where appropriate
    collections = solrquery.paginate(start=0, rows=1000).execute()
    # return a generator of matching items, as instances of CollectionObject
    repo = Repository()
    for coll in collections:
        yield repo.get_object(coll['pid'], type=CollectionObject)

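# Minimal usage sketch for find_by_collection_number, assuming it is exposed
# as a static method of CollectionObject and that Django settings and the
# Fedora connection are configured (the helper name and archive pid argument
# here are illustrative assumptions).  Because the method is a generator,
# iterate over it (or wrap in list()) to actually run the query:
def print_matching_collections(num, archive_pid=None):
    for coll in CollectionObject.find_by_collection_number(num, parent=archive_pid):
        print coll.pid, coll.label
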
def _duplicate_exists(self, cleaned_data):
    """Determine if saving this form would create a duplicate collection.
    Specifically, verify that there is no other collection with the same
    collection (archive) and source_id present in solr.
    """
    collection = cleaned_data.get('collection')
    source_id = cleaned_data.get('source_id')

    solr = solr_interface()
    query = solr.query(
        content_model=CollectionObject.COLLECTION_CONTENT_MODEL,
        source_id=source_id,
        archive_id=collection)
    response = query.execute()

    # if there are no matches then this is definitely not a duplicate
    if response.result.numFound == 0:
        return False
    if response.result.numFound > 1:
        # if there's already more than one match then this is definitely
        # a duplicate
        return True
    # otherwise there's exactly one. if it's this object then this *is*
    # the collection with that archive/id.
    return (response[0]['pid'] != self.object_instance.pid)

def test_solr_interface_proxy(self, mocksunburnt, mockhttplib):
    # init with an http proxy set in env
    os.environ['HTTP_PROXY'] = 'http://localhost:3128/'
    solr_interface()
    # proxy info should be configured & passed to httplib2
    mockhttplib.ProxyInfo.assert_called_with(
        proxy_type=mockhttplib.socks.PROXY_TYPE_HTTP_NO_TUNNEL,
        proxy_host='localhost',
        proxy_port=3128)
    mockhttplib.Http.assert_called_with(
        proxy_info=mockhttplib.ProxyInfo.return_value,
        ca_certs=settings.SOLR_CA_CERT_PATH)

    # when solr url is https, no proxy should be set
    mockhttplib.reset_mock()
    settings.SOLR_SERVER_URL = 'https://test.solr/'
    solr_interface()
    mockhttplib.ProxyInfo.assert_not_called()
    # no args except default cert path
    mockhttplib.Http.assert_called_with(ca_certs=settings.SOLR_CA_CERT_PATH)

def search(request):
    '''Search for :class:`~keep.collection.models.CollectionObject`
    instances.
    '''
    form = CollectionSearch(request.GET, prefix='collection')
    context = {'search': form}
    if form.is_valid():
        # include all non-blank fields from the form as search terms
        search_opts = dict((key, val)
                           for key, val in form.cleaned_data.iteritems()
                           if val is not None and val != '')
        # but need to search by 0
        # restrict to currently configured pidspace and collection content model
        search_opts.update({
            'pid': '%s:*' % settings.FEDORA_PIDSPACE,
            'content_model': CollectionObject.COLLECTION_CONTENT_MODEL,
        })

        # collect non-empty, non-default search terms to display to user on results page
        search_info = {}
        for field, val in form.cleaned_data.iteritems():
            key = form.fields[field].label  # use form display label
            if key is None:     # if field label is not set, use field name as a fall-back
                key = field
            if val is not None and val != '':   # if search value is not empty, selectively add it
                if hasattr(val, 'lstrip'):
                    # solr strings can't start with wildcards
                    extra_solr_cleaned = val.lstrip('*?')
                    if val != extra_solr_cleaned:
                        if not extra_solr_cleaned:
                            messages.info(request,
                                'Ignoring search term "%s": Text fields can\'t start with wildcards.' % (val,))
                            del search_opts[field]
                            continue
                        messages.info(request,
                            'Searching for "%s" instead of "%s": Text fields can\'t start with wildcards.'
                            % (extra_solr_cleaned, val))
                        val = extra_solr_cleaned
                        search_opts[field] = val

                if field == 'archive_id':       # for archive, get info
                    search_info[key] = CollectionObject.find_by_pid(val)
                elif val != form.fields[field].initial:     # ignore default values
                    search_info[key] = val
        context['search_info'] = search_info

        solr = solr_interface()
        solrquery = solr.query(**search_opts).sort_by('source_id')
        # TODO: eventually, we'll need proper pagination here;
        # for now, set a large max to return everything
        context['results'] = solrquery.paginate(start=0, rows=1000).execute()

    # if the form was not valid, set the current instance of the form
    # as the sidebar form instance to display the error
    else:
        context['collection_search'] = form

    # render search results page; if there was an error, results will be
    # displayed as empty
    return TemplateResponse(request, 'collection/search.html', context)

def find_file_object(self, file_path):
    '''Find a file object by checksum in fedora based on a file path.
    Returns a file object if one matches the checksum for the file
    specified, or else None if no match is found.

    :returns: :class:`keep.arrangement.models.RushdieArrangementFile` or None
    '''
    file_md5 = md5sum(file_path)
    solr = solr_interface()
    q = solr.query(content_md5=file_md5).field_limit('pid')
    if len(q):
        return self.repo.get_object(q[0]['pid'], type=RushdieArrangementFile)

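# md5sum() above is assumed to be a chunked file-checksum helper roughly like
# the following (a hypothetical reimplementation for reference, not the
# project's own code):
import hashlib

def md5sum(path, blocksize=64 * 1024):
    '''Return the hex md5 checksum of a file, read in blocks.'''
    md5 = hashlib.md5()
    with open(path, 'rb') as f:
        # read in fixed-size chunks so large files don't load into memory
        for chunk in iter(lambda: f.read(blocksize), ''):
            md5.update(chunk)
    return md5.hexdigest()
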
def item_collection_query():
    """Solr query to find all collection objects in the configured
    Fedora pidspace that can contain items. Currently this includes
    all collections that belong to an archive.

    :returns: list of dict
    :rtype: list
    """
    # search solr for collection objects that belong to an archive
    # (i.e., have a parent archive id)
    solr = solr_interface()
    return solr.query(content_model=CollectionObject.COLLECTION_CONTENT_MODEL,
                      archive_id__any=True)

def collection_suggest(request):
    '''Suggest view for collections, for use with the `JQuery UI
    Autocomplete`_ widget.  Searches for collections on all of the terms
    passed in (as multiple keywords), similar to the way the combined
    search works.

    .. _JQuery UI Autocomplete: http://jqueryui.com/demos/autocomplete/

    :param request: the http request passed to the original view
        method (used to retrieve the search term)
    '''
    term = request.GET.get('term', '')

    suggestions = []

    if term:
        # If the search term doesn't end in space, add a wildcard to
        # the last word to allow for partial word matching.
        if term[-1] != ' ':
            term += '*'
        terms = search_terms(term)
        solr = solr_interface()
        # common query parameters and options
        base_query = solr.query() \
            .filter(content_model=CollectionObject.COLLECTION_CONTENT_MODEL) \
            .field_limit(['pid', 'source_id', 'title', 'archive_short_name',
                          'creator', 'archive_id']) \
            .sort_by('-score')

        q = base_query.query(terms)
        # NOTE: there seems to be a Lucene/Solr bug/quirk where adding
        # a wildcard at the end of a word causes Solr not to match the
        # exact word (even though docs indicate this should work).
        # As a work-around, if we added a * and got 0 results,
        # try the search again without the wildcard.
        if term[-1] == '*' and q.count() == 0:
            q = base_query.query(search_terms(term[:-1]))

        # exclude archival collections (top-level library)
        q = q.filter(archive_id__any=True)

        suggestions = [{'label': '%s %s' % (c.get('source_id', ''),
                                            c.get('title', '(no title)')),
                        'value': c['pid'],  # FIXME: do we need URI here?
                        'category': c.get('archive_short_name', ''),
                        'desc': c.get('creator', '')}
                       for c in q[:15]]

    return HttpResponse(json_serializer.encode(suggestions),
                        content_type='application/json')

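# For reference, the JSON returned by collection_suggest is a list shaped
# like the dictionaries built above; a representative (hypothetical) entry
# as consumed by the autocomplete widget:
#
#   [{"label": "1000 Sample Collection", "value": "emory:ba9gd",
#     "category": "MARBL", "desc": "Sample, Author"}]
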
def subcollections(self):
    """Find all sub-collections that are members of the current collection
    in the configured Fedora pidspace.

    :rtype: list of dict
    """
    solr = solr_interface()
    solrquery = solr.query(content_model=CollectionObject.COLLECTION_CONTENT_MODEL,
                           pid='%s:*' % settings.FEDORA_PIDSPACE,
                           archive_id=self.pid)
    # by default, only returns 10; get everything
    # - solr response is a list of dictionary with collection info
    # use dictsort in template for sorting where appropriate
    return solrquery.paginate(start=0, rows=1000).execute()

def find_by_pid(pid):
    '''Find a collection by pid and return a dictionary with collection
    information.'''
    # NOTE: this method added as a replacement for
    # get_cached_collection_dict that was used elsewhere
    # throughout the site (audio app, etc.)  It should probably be
    # consolidated with other find methods...
    if pid.startswith('info:fedora/'):  # allow passing in uri
        pid = pid[len('info:fedora/'):]
    solr = solr_interface()
    solrquery = solr.query(content_model=CollectionObject.COLLECTION_CONTENT_MODEL,
                           pid=pid)
    result = solrquery.execute()
    if len(result) == 1:
        return result[0]

def disk_images(self):
    self.stderr.write('Disk images')
    ### disk images
    # representative sample of aff and ad1
    # DO NOT include anything in these collections:
    # Trethewey (ghsdj), Rushdie (94k9k), Mackey (g1btw),
    # Clifton (94kf4), and Grennan (9k0st)
    solr = solr_interface()
    repo = Repository()
    q = solr.query(content_model=DiskImage.DISKIMAGE_CONTENT_MODEL) \
            .exclude(collection_id=self.collections['trethewey']) \
            .exclude(collection_id=self.collections['rushdie']) \
            .exclude(collection_id=self.collections['mackey']) \
            .exclude(collection_id=self.collections['clifton']) \
            .exclude(collection_id=self.collections['grennan']) \
            .field_limit('pid')

    if self.verbosity >= self.v_normal:
        self.stderr.write('Found %d disk images not in restricted collections'
                          % q.count())

    # currently there is no way to filter on format or size in either
    # solr or fedora risearch;
    # so, go through individually and group them by type,
    # then sort by size and pick the smallest ones
    diskimgs_by_type = defaultdict(list)
    for result in q:
        diskimg = repo.get_object(result['pid'], type=DiskImage)
        if not diskimg.exists:
            if self.verbosity >= self.v_normal:
                self.stderr.write('Referenced disk image %s does not exist or is inaccessible'
                                  % result['pid'])
            continue
        fmt = diskimg.provenance.content.object.format.name
        diskimgs_by_type[fmt].append(diskimg)

    for fmt, diskimages in diskimgs_by_type.iteritems():
        if self.verbosity >= self.v_normal:
            self.stderr.write('Selecting %s disk images' % fmt)
        # sort on binary file size so we sync the smallest ones
        diskimages = sorted(diskimages, key=lambda diskimg: diskimg.content.size)
        # use the first 10 of each type
        for d in diskimages[:10]:
            self.stdout.write(d.pid)

def simple_collections():
    """Find all simpleCollection objects in the configured Fedora
    pidspace that can contain items.

    :returns: list of dict
    :rtype: list
    """
    # search solr for simpleCollection objects
    solr = solr_interface()
    solrquery = solr.query(content_model=SimpleCollection.COLLECTION_CONTENT_MODEL,
                           type=REPO.SimpleCollection)
    # by default, only returns 10; get everything
    # - solr response is a list of dictionary with collection info
    # use dictsort and regroup in templates for sorting where appropriate
    return solrquery.paginate(start=0, rows=1000).execute()

def find_by_field(field, value, repo=None):
    '''
    Static method to find a single :class:`EmailMessage` by an indexed
    value. Looks for the item in Solr and returns an :class:`EmailMessage`
    instance initialized from the repository if a single match is found
    for the requested field and value.

    Raises :class:`django.core.exceptions.MultipleObjectsReturned` if
    more than one match is found; raises
    :class:`django.core.exceptions.ObjectDoesNotExist` if no matches
    are found in the Solr index.

    :param field: solr field to search
    :param value: value to search on in the specified field
    :param repo: optional :class:`eulfedora.server.Repository`
        to use an existing connection with specific credentials
    :returns: :class:`EmailMessage`
    '''
    solr = solr_interface()
    search_terms = {
        field: value,
        'content_model': ArrangementObject.ARRANGEMENT_CONTENT_MODEL
    }
    q = solr.query(**search_terms).field_limit('pid')

    # check that we found one and only one
    found = len(q)
    # borrowing custom django exceptions for not found / too many matches
    if found > 1:
        raise MultipleObjectsReturned('Found %d records with %s %s' %
                                      (found, field, value))
    if not found:
        raise ObjectDoesNotExist('No record found with %s %s' % (field, value))

    if repo is None:
        repo = Repository()

    return repo.get_object(q[0]['pid'], type=EmailMessage)

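# Minimal usage sketch for find_by_field, assuming it is exposed as a static
# method on EmailMessage (the helper name and wrapping behavior here are
# illustrative assumptions).  Since the method raises Django's standard
# lookup exceptions, callers typically guard both the not-found and
# ambiguous cases:
from django.core.exceptions import MultipleObjectsReturned, ObjectDoesNotExist

def email_for_checksum(md5_value):
    '''Return the EmailMessage with the given content checksum, or None if
    the checksum is missing from the index or matches multiple objects.'''
    try:
        return EmailMessage.find_by_field('content_md5', md5_value)
    except (ObjectDoesNotExist, MultipleObjectsReturned):
        return None
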
def by_arrangement_id(id, repo=None):
    '''
    Static method to find an :class:`ArrangementObject` by its local or
    arrangement id. Looks for the item in Solr and returns an
    :class:`ArrangementObject` instance initialized from the repository
    if a single match is found for the requested id.

    Raises :class:`django.core.exceptions.MultipleObjectsReturned` if
    more than one match is found; raises
    :class:`django.core.exceptions.ObjectDoesNotExist` if no matches
    are found in the Solr index.

    :param id: arrangement id or local id
    :param repo: optional :class:`eulfedora.server.Repository`
        to use an existing connection with specific credentials
    :returns: :class:`ArrangementObject`
    '''
    solr = solr_interface()
    q = solr.query(arrangement_id=id,
                   content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL) \
            .field_limit('pid')

    # check that we found one and only one
    found = len(q)
    # borrowing custom django exceptions for not found / too many matches
    if found > 1:
        raise MultipleObjectsReturned('Found %d records with arrangement id %s' %
                                      (found, id))
    if not found:
        raise ObjectDoesNotExist('No record found with arrangement id %s' % id)

    if repo is None:
        repo = Repository()

    return repo.get_object(q[0]['pid'], type=ArrangementObject)

def view_item(request, pid):
    '''
    Display information about a single object. Currently only supports
    :class:`eulcm.models.boda.EmailMessage` and
    :class:`eulcm.models.boda.Mailbox` objects.

    :param pid: The pid of the object to be displayed.
    '''
    repo = TypeInferringRepository(request=request)
    obj = repo.get_object(pid)
    context = {'obj': obj}
    if isinstance(obj, boda.EmailMessage):
        template_name = 'arrangement/email_view.html'
    elif isinstance(obj, boda.Mailbox):
        template_name = 'arrangement/mailbox_view.html'
        # use Solr to find paginated messages in this mailbox
        solr = solr_interface()
        q = solr.query(isPartOf=obj.uri)
        paginator = Paginator(q, 30)
        try:
            page = int(request.GET.get('page', '1'))
        except ValueError:
            page = 1
        try:
            results = paginator.page(page)
        except (EmptyPage, InvalidPage):
            results = paginator.page(paginator.num_pages)

        # calculate page links to show
        show_pages = pages_to_show(paginator, page)
        # add paginated messages to context
        context.update({
            'page': results,
            'show_pages': show_pages,
            'search_opts': request.GET.urlencode()
        })
    else:
        raise Http404

    return TemplateResponse(request, template_name, context)

def library_choices_by_user(self):
    # this method shouldn't be set if user isn't defined, but just in case
    if not self.user:
        return archive_choices()

    # NOTE: should be possible to query for archives directly,
    # but filtering on audio items requires two levels of joins,
    # and it's unclear how that actually works

    # use collection facet query to get list of archives
    q = CollectionObject.item_collection_query()
    q = q.facet_by('archive_id', sort='count', mincount=1) \
         .paginate(rows=0)

    # - depending on permissions, restrict to collections with researcher content
    if not self.user.has_perm('collection.view_collection') and \
            self.user.has_perm('collection.view_researcher_collection'):
        q = q.join('collection_id', 'pid', researcher_access=True)
        q = q.join('collection_id', 'pid', has_access_copy=True)

    facets = q.execute().facet_counts.facet_fields

    solr = solr_interface()
    archive_info = dict([(pid.replace('info:fedora/', ''), {'count': count})
                         for pid, count in facets['archive_id']])

    # construct a boolean pid query to match any archive pids
    # in order to look up titles and match them to pids
    pid_q = solr.Q()
    for pid in archive_info.keys():
        pid_q |= solr.Q(pid=pid)
    query = solr.query(pid_q) \
                .field_limit(['pid', 'title']) \
                .sort_by('title')

    # ignore any spurious results that don't have titles (bad data in prod?)
    choices = [(a['pid'], a['title']) for a in query if 'title' in a]
    choices.insert(0, ('', '---'))   # blank option at the beginning (default)
    return choices

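# Design note on the boolean pid query above: ORing sunburnt Q objects
# together in a loop is a common way to build an arbitrary disjunction.
# An equivalent, slightly more compact construction (same semantics, since
# solr.Q() is the starting value in both versions) would be:
#
#     import operator
#     pid_q = reduce(operator.or_,
#                    (solr.Q(pid=pid) for pid in archive_info), solr.Q())
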
def rushdie_files(self):
    self.stderr.write('Rushdie files')
    solr = solr_interface()
    ### individual rushdie files
    # select 100 individual rushdie files to simulate the way they
    # currently clutter up born-digital search in production
    q = solr.query(content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL,
                   collection_id=self.collections['rushdie']).field_limit('pid')
    self.stderr.write('Found %d Rushdie arrangement objects' % q.count())
    # over 6000 of these in production; pull a subset and randomize
    # to ensure diversity, get chunks from various points in the results
    pids = [r['pid'] for r in q[:100]]
    pids.extend([r['pid'] for r in q[1000:1100]])
    pids.extend([r['pid'] for r in q[2000:2100]])
    pids.extend([r['pid'] for r in q[3000:3100]])
    pids.extend([r['pid'] for r in q[4000:4100]])
    pids.extend([r['pid'] for r in q[5000:5100]])
    pids.extend([r['pid'] for r in q[6000:6100]])
    # then shuffle that and pick the first 100
    random.shuffle(pids)
    for p in pids[:100]:
        self.stdout.write(p)

def save(self, logMessage=None):
    # check for duplicate content before initial ingest
    if self._create and self.content_md5 is not None:
        solr = solr_interface()
        q = solr.query(content_md5=self.content_md5).field_limit(
            ['pid', 'content_model'])
        # if a duplicate is found, raise custom exception with info on the dupes
        if q.count():
            msg = 'Detected %s duplicate record%s' % \
                (q.count(), 's' if q.count() != 1 else '')
            results = list(q)
            pids = [r['pid'] for r in results]
            # dictionary of pid : list of cmodels
            pid_cmodels = dict([(r['pid'], r['content_model'])
                                for r in results])
            raise DuplicateContent(msg, pids, pid_cmodels)

    # update the ark label in pidman when there is a name conflict
    self.update_ark_label()

    return super(DigitalObject, self).save(logMessage)

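# Sketch of handling the duplicate-detection error raised by save() above,
# assuming DuplicateContent stores the pid list passed to its constructor
# as a .pids attribute; the helper name and log message are illustrative
# assumptions:
def safe_ingest(obj, log_message='ingesting new object'):
    '''Save an object, reporting rather than raising on duplicate content.'''
    try:
        return obj.save(log_message)
    except DuplicateContent as dc:
        print 'Duplicate content detected: %s' % dc
        for dup_pid in dc.pids:
            print '  matches existing object %s' % dup_pid
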
def test_search_bylibrary(self, mockpaginator, mocksolr_interface, mocksearch_libs):
    solr = solr_interface()
    search_url = reverse('search:keyword')
    mocksolr = mocksolr_interface.return_value
    mocksolr.Q = MagicMock(solr.Q)
    mocksolr.query.return_value = mocksolr.query
    for method in ['query', 'facet_by', 'sort_by', 'field_limit',
                   'exclude', 'filter', 'join']:
        getattr(mocksolr.query, method).return_value = mocksolr.query

    # create researcher IP for localhost so anonymous access will be
    # treated as anonymous researcher
    researchip = ResearcherIP(name='test client', ip_address='127.0.0.1')
    researchip.save()

    # NOTE: currently uses info:fedora/ pid format for select, so use that here
    libpid = 'info:fedora/%s' % settings.PID_ALIASES['marbl']
    marbl_name = 'Manuscript, Archives, and Rare Book Library'
    # set mock list of library choices to include our test value so form will be valid
    mocksearch_libs.return_value = [(libpid, marbl_name)]
    response = self.client.get(search_url, {'library': libpid})
    # check solr query args
    # - library should filter results via a join on the collection's archive id
    mocksolr.query.join.assert_called_with('pid', 'collection_id',
                                           archive_id=libpid)
    self.assertContains(
        response,
        '<option value="%s" selected="selected">%s</option>' % (libpid, marbl_name),
        html=True,
        msg_prefix='library filter should be selected on result page form')

def keyword_search(request):
    '''Combined keyword search across all :mod:`keep` repository items.
    '''
    searchform = KeywordSearch(request.GET)
    missing_label = '[null]'

    ctx = {'form': searchform}
    if searchform.is_valid():
        search_terms = searchform.cleaned_data['keyword']

        solr = solr_interface()
        # start with a default query to add filters & search terms
        # *first* filter to restrict to content models user has permission to view
        # q = filter_by_perms(solr.query(), request.user)
        q = solr.query()

        # optional date filter for fixity check
        fixity_check_mindate = searchform.cleaned_data.get('fixity_check_mindate', None)
        if fixity_check_mindate:
            today = date.today()
            q = q.query(last_fixity_check__range=(fixity_check_mindate, today))

        # use solr grouping queries to cluster original and migrated objects
        # if they appear in the same search result set
        q = q.group_by('original_pid', limit=5, sort='created desc',
                       format='simple')

        # separate out normal and fielded search terms in keyword search string
        # TODO: should this logic be shifted to form validation/cleaning?
        search_info = MultiValueDict()
        terms = []
        # add field-based search terms to query and search info for display
        for t in search_terms:
            field, val = t
            # add non-field terms to list of terms
            # - no field name
            if field is None:
                terms.append(val)
            # - unrecognized field name or incomplete term
            elif val is None or field not in searchform.allowed_fields:
                # just search on the text we were given
                if val is None:
                    term = '%s:' % field
                else:
                    if ' ' in val:   # quote the value to search as an exact phrase
                        val = '"%s"' % val
                    term = '%s:%s' % (field, val)
                terms.append(term)

            # field/value pair
            else:
                solr_field = searchform.allowed_fields[field]
                search_val = val

                # special case for searching for collection source id
                if field == 'coll' and search_val and search_val.isdigit():
                    solr_field = 'collection_source_id'

                # add wildcard to end of search dates
                # (indexed by YYYY-MM-DD; allow match on YYYY or YYYY-MM)
                if field == 'created':
                    search_val += '*'

                # add field/value search to the solr query
                q = q.query(**{solr_field: search_val})
                # add to search info for display to user
                field = 'collection' if field == 'coll' else field
                search_info.update({field: val})

        # search on all collected search terms
        q = q.query(*terms)

        # FIXME: there should be a way to exclude these by type
        # Exclude archival collection (Top-level library)
        for p in settings.PID_ALIASES.values():
            q = q.exclude(pid=p)

        # get a copy of current url options for pagination
        # and to generate links to remove active filters
        urlopts = request.GET.copy()

        # handle facets
        display_filters = []
        # - list of tuples: display name, link to remove the filter
        active_filters = dict((field, []) for field in
                              searchform.facet_field_names.iterkeys())
        # - dictionary of filters in use, for exclusion from displayed facets

        # filter the solr search based on any facets in the request
        for filter_val, facet_field in searchform.facet_field_names.iteritems():
            # For multi-valued fields (author, subject), we could have multiple
            # filters on the same field; treat all facet fields as lists.
            for val in request.GET.getlist(filter_val):
                # ignore any facet if the value is not set
                if not val:
                    continue
                # special case: search for items without a field
                if val == missing_label:
                    q = q.exclude(**{'%s__any' % facet_field: True})
                else:
                    # filter the current solr query
                    q = q.filter(**{facet_field: val})

                # add to list of active filters
                active_filters[filter_val].append(val)

                # add to list for user display & removal
                # - copy the urlopts and remove only the current value
                unfacet_urlopts = urlopts.copy()
                val_list = unfacet_urlopts.getlist(filter_val)
                val_list.remove(val)
                unfacet_urlopts.setlist(filter_val, val_list)
                # tuple of filter display value, url to remove it
                # - add details to label when the value doesn't make it obvious
                if filter_val in ['added by', 'modified by']:
                    label = '%s %s' % (filter_val, val)
                elif filter_val == 'fixity_check':
                    label = 'fixity check: %s' % ('valid' if val == 'pass' else 'invalid')
                elif val == missing_label:
                    label = '%s: null' % filter_val
                elif filter_val == 'access status':
                    # use access status abbreviation instead of numeric code
                    label = rights_access_terms_dict[val].abbreviation
                else:
                    label = val
                display_filters.append((label, unfacet_urlopts.urlencode()))

        # Update solr query to return values & counts for the
        # configured facet fields
        q = q.facet_by(searchform.facet_field_names.values(),
                       mincount=1, limit=15, sort='count', missing=True)
        # NOTE: missing=True displays count for items without any value
        # for the facet field (e.g., no access code set)

        # if there are any *keyword* terms, sort by relevance and display score
        # (for fielded search terms, items will either match or not, so relevance
        # is not as useful)
        if terms:
            # NOTE: possibly a change in sunburnt?
            # including score now requires specifying *all* fields that
            # should be returned
            q = q.sort_by('-score').field_limit([
                # common item information
                'object_type', 'content_model', 'pid', 'label',
                'title', 'creator', 'created', 'last_modified', 'added_by',
                # collection
                'archive_short_name', 'hasMember',
                # item
                'collection_id',
                # audio
                'part', 'collection_label', 'duration', 'has_access_copy',
                'access_copy_mimetype', 'access_copy_size', 'source_id',
                # arrangement/disk image
                'simpleCollection_label', 'rights', 'state',
                # migrated / original
                'original_pid', 'isDerivationOf', 'hasDerivation',
                # format and size, used for disk images display (at least)
                'content_size', 'content_format'
            ], score=True)
            ctx['show_relevance'] = True

        # then sort by most recently created
        # (primary sort when no search terms, secondary otherwise)
        q = q.sort_by('-created')

        # list of currently known types for display in results
        # FIXME: are these used anywhere?
        known_object_types = ['audio', 'collection', 'born-digital']

        # paginate the solr result set
        paginator = Paginator(q, 30)
        try:
            page = int(request.GET.get('page', '1'))
        except ValueError:
            page = 1
        try:
            results = paginator.page(page)
        except (EmptyPage, InvalidPage):
            results = paginator.page(paginator.num_pages)

        # calculate page links to show
        show_pages = pages_to_show(paginator, page)

        # convert the facets from the solr result for display to user
        facets = SortedDict()
        facet_fields = results.object_list.facet_counts.facet_fields
        for display_name, field in searchform.facet_field_names.iteritems():
            # do not display coll facet because it is redundant with the collection facet
            if display_name in ['coll', 'fixity_check']:
                continue
            if field in facet_fields and facet_fields[field]:
                show_facets = []
                # skip any display facet values that are already in effect
                for val in facet_fields[field]:
                    try:
                        if val[0] not in active_filters[display_name]:
                            show_facets.append(val)
                    except TypeError:
                        # when solr missing=True is turned on,
                        # last result is a count of items with no value
                        # for this field
                        if val != 0 and field in searchform.show_missing_facets \
                                and missing_label not in active_filters[display_name]:
                            show_facets.append((missing_label, val))

                if show_facets:
                    facets[display_name] = show_facets

        ctx.update({
            'page': results,
            'show_pages': show_pages,
            # 'known_types': known_object_types,
            'search_opts': request.GET.urlencode(),
            'search_terms': terms,
            'search_info': search_info,
            'url_params': urlopts.urlencode(),
            'facets': facets,
            'active_filters': display_filters,
        })

    return TemplateResponse(request, 'repoadmin/results.html', ctx)

def handle(self, **options):
    verbosity = int(options['verbosity'])
    errors = 0
    solr = solr_interface()
    collections = open('collections.txt', 'r')
    gb = 1024 * 1024 * 1024
    access_codes = [2, 3, 4, 5, 10, 11, 12, 13]
    formats = ['DV', 'MOV', 'MPG', 'AD1', 'AFF', 'DD', 'E01', 'IMG', 'ISO', 'TAR']

    with open('keep_collection_report.csv', 'wb') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(['title', 'collection_code', 'library_name', 'size',
                         'object_count', 'dv_count', 'mov_count', 'mpg_count',
                         'ad1_count', 'aff_count', 'dd_count', 'e01_count',
                         'img_count', 'iso_count', 'tar_count', 'wav_count',
                         'status_code_2', 'status_code_3', 'status_code_4',
                         'status_code_5', 'status_code_10', 'status_code_11',
                         'status_code_12', 'status_code_13'])

        for line in collections:
            title = line.strip()
            print title

            # determine the owning library from the most recent matching record
            library_name = ''
            for doc in solr.query().filter(title=title).sort_by('-created'):
                try:
                    library_name = doc['isMemberOfCollection']
                    library_name = library_name.split('/')[1]
                    library_name = library_name.split("'")[0]
                except (KeyError, IndexError):
                    library_name = ''

            # tally sizes, counts, access codes, and formats for the collection
            object_count = 0
            size = 0
            wav_count = 0
            collection_code = ''
            status_counts = dict((code, 0) for code in access_codes)
            format_counts = dict((fmt, 0) for fmt in formats)

            for doc in solr.query().filter(collection_label=title):
                object_count += 1
                object_type = doc.get('object_type', '')
                if object_type == 'audio':
                    wav_count += 1
                    try:
                        size += int(doc['access_copy_size'])
                    except (KeyError, TypeError, ValueError):
                        pass
                elif object_type in ('video', 'disk image'):
                    try:
                        size += int(doc['content_size'])
                    except (KeyError, TypeError, ValueError):
                        pass

                # collection code
                collection_code = doc.get('collection_source_id', '')

                # access code counting
                try:
                    access_code = int(doc['access_code'])
                    if access_code in status_counts:
                        status_counts[access_code] += 1
                except (KeyError, TypeError, ValueError):
                    pass

                # content format count
                content_format = doc.get('content_format', '')
                if content_format in format_counts:
                    format_counts[content_format] += 1

            size = float(size) / gb
            writer.writerow([title, collection_code, library_name, size,
                             object_count, format_counts['DV'],
                             format_counts['MOV'], format_counts['MPG'],
                             format_counts['AD1'], format_counts['AFF'],
                             format_counts['DD'], format_counts['E01'],
                             format_counts['IMG'], format_counts['ISO'],
                             format_counts['TAR'], wav_count]
                            + [status_counts[code] for code in access_codes])

def video(self):
    self.stderr.write('Video')
    ### video
    # need a representative sample of all mime types
    # representative sample of old dm and native Keep objects
    # (dm carries an access status of 11)
    # representative sample of different access codes
    # 5-10 collections represented
    # about 40 objects total (can be smallest size objects)

    # NOTE: there is currently no easy way to ensure we have
    # a representative sample of all mimetypes (master mimetypes are
    # not indexed, and there is too much content, so it would be too
    # slow to look in fedora).  Hopefully the diversity of codes and
    # old dm content will provide sufficient representation.
    solr = solr_interface()
    # desired minimum number of collections
    # (minimum since more may be added in order to find
    # representative objects by status)
    num_collections = 5
    # desired number of objects
    desired_total = 40

    pids = []
    collections = set()

    # find all video, and sort smallest first
    all_video = solr.query(content_model=Video.VIDEO_CONTENT_MODEL) \
                    .field_limit(['pid', 'collection_id']) \
                    .sort_by('access_copy_size')
    # master size is not indexed, but hopefully access copy
    # can serve as a proxy
    total_pids = all_video.count()
    if self.verbosity >= self.v_normal:
        self.stderr.write('Found %d total video objects' % all_video.count())

    facet_q = all_video.facet_by('collection_id', sort='count', mincount=1) \
                       .facet_by('access_code', sort='count', mincount=1) \
                       .paginate(rows=0)
    facets = facet_q.execute().facet_counts.facet_fields

    # pick the requested number of collections with the most items
    top_collections = [pid for (pid, count)
                       in facets['collection_id']][:num_collections]
    # restrict query to video in those collections
    collection_filter = solr.Q()
    for coll in top_collections:
        collection_filter |= solr.Q(collection_id=coll)
    q = all_video.filter(collection_filter)
    self.stderr.write('Found %d total video objects in %d largest collections'
                      % (q.count(), num_collections))
    # Nothing here ensures we get content from all of these
    # collections, but hopefully the diversity of status codes
    # will help provide a reasonable distribution.

    # figure out some representative percentage based on our desired total
    # - by far the most content is old dm (93%), so don't use that %
    # first facet is old dm (largest total); facet is label, count
    old_dm_code = facets['access_code'][0][0]
    old_dm_total = facets['access_code'][0][1]
    # get percentages based on the total *without* old dm
    for code, num in facets['access_code'][1:]:
        # determine number of pids to grab as a percentage
        # of half the desired number
        percent = float(num) / (total_pids - old_dm_total)
        # minimum of at least 1 per code
        num_pids = max(int(percent * (desired_total / 2)), 1)
        if self.verbosity >= self.v_normal:
            self.stderr.write('  Looking for %d pid(s) for access code %s'
                              % (num_pids, code))
        # first try to find within the requested collections
        pids_by_code = q.filter(access_code=code)
        # if no pids are found for this code in our collections,
        # look for them elsewhere
        if not pids_by_code.count():
            pids_by_code = all_video.filter(access_code=code)

        for r in pids_by_code[:num_pids]:
            pids.append(r['pid'])
            collections.add(r['collection_id'])

    # other codes will provide slightly more than half,
    # because we are rounding up; get the rest of the
    # requested objects from old dm
    remainder = desired_total - len(pids)
    for r in q.filter(access_code=old_dm_code)[:remainder]:
        pids.append(r['pid'])
        collections.add(r['collection_id'])

    if self.verbosity >= self.v_normal:
        self.stderr.write('Selected %d pids from %d collections'
                          % (len(pids), len(collections)))

    for p in pids:
        self.stdout.write(p)

def audio(self):
    self.stderr.write('Audio')
    ### audio
    # representative sample of all mime types
    # representative sample of old dm and native Keep objects
    # (dm carries an access status of 11)
    # representative sample of different access codes
    # 10 collections represented
    # (please include material from Dawson (94jz3))
    # NOTE: this is largely the same logic as for video
    solr = solr_interface()
    # desired number of collections
    # (could be adjusted some since more may be added in order to
    # find representative objects by status)
    num_collections = 10
    # desired number of objects
    desired_total = 100

    pids = []
    collections = set()

    # find all audio, and sort smallest first
    all_audio = solr.query(content_model=AudioObject.AUDIO_CONTENT_MODEL) \
                    .field_limit(['pid', 'collection_id']) \
                    .sort_by('access_copy_size')
    # master size is not indexed, but hopefully access copy
    # can serve as a proxy
    total_pids = all_audio.count()
    if self.verbosity >= self.v_normal:
        self.stderr.write('Found %d total audio objects' % all_audio.count())

    facet_q = all_audio.facet_by('collection_id', sort='count', mincount=1) \
                       .facet_by('access_code', sort='count', mincount=1) \
                       .paginate(rows=0)
    facets = facet_q.execute().facet_counts.facet_fields

    # pick the requested number of collections with the most items
    top_collections = [pid for (pid, count)
                       in facets['collection_id']][:num_collections]
    # restrict query to audio in those collections
    # OR in the dawson collection
    # (dawson is *probably* included in those, but explicitly include
    # since it was requested)
    collection_filter = solr.Q(collection_id=self.collections['dawson'])
    for coll in top_collections:
        collection_filter |= solr.Q(collection_id=coll)
    q = all_audio.filter(collection_filter)
    self.stderr.write('Found %d total audio objects in %d largest collections (including dawson)'
                      % (q.count(), num_collections))
    # Nothing here ensures we get content from all of these
    # collections, but hopefully the diversity of status codes
    # will help provide a reasonable distribution.

    # calculate and find a representative percentage of items
    # for each status based on the desired total
    for code, num in facets['access_code']:
        # determine number of pids to grab as a percentage
        # of the desired number
        percent = float(num) / total_pids
        # minimum of at least 1 per code
        num_pids = max(int(percent * desired_total), 1)
        if self.verbosity >= self.v_normal:
            self.stderr.write('  Looking for %d pid(s) for access code %s'
                              % (num_pids, code))
        # first try to find within the requested collections
        pids_by_code = q.filter(access_code=code)
        # if no pids are found for this code in our collections,
        # look for them elsewhere
        if not pids_by_code.count():
            pids_by_code = all_audio.filter(access_code=code)

        for r in pids_by_code[:num_pids]:
            pids.append(r['pid'])
            collections.add(r['collection_id'])

    if self.verbosity >= self.v_normal:
        self.stderr.write('Selected %d pids from %d collections'
                          % (len(pids), len(collections)))

    for p in pids:
        self.stdout.write(p)

def ingest_message(self, msg_data, mailbox, folder_order):
    # read content and redact IP addresses / email addresses
    msg_data = redact_email(msg_data)

    # generate email object from data
    email_msg = email.message_from_string(msg_data, _class=MacEncodedMessage)

    # check and warn if email has attachments
    attachments = self.email_attachments(email_msg)
    if attachments:
        print 'Warning! Email has attachments (not yet handled): %s' % \
            ','.join(attachments)

    # get current content type to preserve the original value,
    # and also to determine how to decode
    content_type = email_msg.get('Content-Type', '')
    orig_content_type = email_msg.get_content_type()
    orig_content_charset = email_msg.get_content_charset()

    # at least one email in this set has a charset of 'unknown-8bit',
    # but the \xa0 in the content indicates it is probably latin 1
    if 'charset=unknown-8bit' in content_type:
        latin1_charset = email.charset.Charset('latin_1')
        email_msg.set_charset(latin1_charset)
    # otherwise, if charset is not set, assume mac roman
    elif not email_msg.get_charset():
        # tell email that charset should be mac roman,
        # so it can decode special characters
        mac_charset = email.charset.Charset('mac_roman')
        email_msg.set_charset(mac_charset)

    # decode headers from mac roman charset
    # (some messages contain improperly formatted
    # accented characters in a from/to header)
    email_msg.decode_headers()

    # create a new object to populate with data
    msg_obj = self.repo.get_object(type=EmailMessagePidReuse)
    # generate cerp from mime message
    # - store folder order as message local id
    msg_obj.cerp.content = cerp.Message.from_email_message(email_msg,
                                                           local_id=folder_order)

    # The generated CERP may have modified mac roman charset headers
    # which were needed to convert instead of the original;
    # update the xml to store the original value, NOT the encoding
    # that was used to decode the content.
    if content_type:
        if msg_obj.cerp.content.single_body:
            msg_obj.cerp.content.single_body.content_type_list[0] = orig_content_type
            msg_obj.cerp.content.single_body.charset_list[0] = orig_content_charset
    else:
        if msg_obj.cerp.content.single_body:
            del msg_obj.cerp.content.single_body.content_type_list[0]
            del msg_obj.cerp.content.single_body.charset_list[0]

    # loop through headers to set/remove content type
    for h in msg_obj.cerp.content.headers:
        if h.name == 'Content-Type':
            if content_type:
                h.value = content_type
            else:
                h.value = None
                h.name = None
            break

    # construct an object label based on from/to/date/subject
    msg_from = email_msg['From']
    # NOTE: it would be nice to suppress redundant redaction email text here;
    # at least simplify label for rushdie, since that is what we'll see most
    if 'REDACTED: Salman Rushdie\'s email' in msg_from:
        msg_from = 'Salman Rushdie'
    label = u'Email from %s' % msg_from
    if email_msg.get('To', None):
        # FIXME: could have multiple recipients
        # we *should* be able to get split-out version from email.Message ...
        to = email_msg['To']
        label += u' to %s' % to

    # date/subject not always present, but add if they are
    if email_msg.get('Date', None):
        label += u' on %s' % email_msg['Date']
    if email_msg.get('Subject', None):
        label += u' %s' % email_msg['Subject']

    # set as object label and dc:title
    msg_obj.label = label
    msg_obj.dc.content.title = label

    # in verbose noact mode, print label so user can see what is being done
    if self.verbosity > self.v_normal and self.noact:
        print label

    # generate a pristine email Message for saving to fedora
    # (don't save modified charset, content type, etc.)
    msg_obj.mime_data.content = email.message_from_string(msg_data,
                                                          _class=MacEncodedMessage)
    # calculate an MD5 of the email content *as it will be serialized*
    md5 = hashlib.md5()
    md5.update(str(msg_obj.mime_data.content))
    email_md5 = md5.hexdigest()
    msg_obj.mime_data.checksum = email_md5

    # check if this email has already been ingested via checksum;
    # don't re-ingest if it is already in the repository
    solr = solr_interface()
    q = solr.query(content_md5=msg_obj.mime_data.checksum).field_limit('pid')
    if len(q):
        if self.verbosity >= self.v_normal:
            print 'Email message has already been ingested as %s; skipping' \
                % q[0]['pid']
        self.stats['previously_ingested'] += 1
        return

    # associate with current mailbox object
    msg_obj.mailbox = mailbox
    # belongs to same collection as its mailbox
    if mailbox.collection:
        msg_obj.collection = mailbox.collection
    # ingest items as accessioned/unprocessed
    msg_obj.arrangement_status = 'accessioned'
    # ingest with a default rights code of 10 "Undetermined" in rights DS
    msg_obj.rights.content.create_access_status()
    msg_obj.rights.content.access_status.code = '10'
    msg_obj.rights.content.access_status.text = rights_access_terms_dict['10'].text

    if not self.noact:
        try:
            msg_obj.save('ingesting email message from rushdie 5300c')
            if self.verbosity >= self.v_normal:
                print 'Ingested message %s : %s' % (msg_obj.pid, msg_obj.label)
            self.stats['ingested'] += 1
        except RequestFailed as rf:
            self.stats['ingest_error'] += 1
            print 'Error ingesting email message %s: %s' % (msg_obj.label, rf)

def dashboard(request):
    '''Admin dashboard page for staff users, with links to main
    functionality and date/month facets linking to searches for recently
    added or checksummed items.
    '''
    today = date.today()
    month_ago = today - timedelta(days=30)
    three_months = today - timedelta(days=31 * 3)
    solr = solr_interface()

    # search for all content added in the last month
    # and return just the facets for date created and collection name
    # - limit of 31 to ensure we get all dates in range
    facetq = solr.query().filter(created_date__range=(month_ago, today)) \
                 .facet_by('created_date', sort='index', limit=31, mincount=1) \
                 .facet_by('collection_label_facet', sort='count', limit=10,
                           mincount=1) \
                 .paginate(rows=0)
    # filter the facet query by user permissions
    # facetq = filter_by_perms(facetq, request.user)

    facets = facetq.execute().facet_counts.facet_fields
    # reverse order and convert to datetime.date for use with naturalday
    recent_items = []
    recent_dates = facets['created_date']
    recent_dates.reverse()
    # limit to just the 10 most recent dates
    for day, count in recent_dates[:10]:
        y, m, d = day.split('-')
        recent_items.append((date(int(y), int(m), int(d)), count))
    recent_collections = facets['collection_label_facet']

    # search for content added in the last few months
    # and return just the facets for year-month
    facetq = solr.query().filter(created_date__range=(three_months, today)) \
                 .facet_by('created_month', sort='index', mincount=1) \
                 .paginate(rows=0)
    # also filter this query by user perms
    # facetq = filter_by_perms(facetq, request.user)
    recent_month_facet = facetq.execute().facet_counts.facet_fields['created_month']
    recent_month_facet.reverse()
    recent_months = []
    for month, count in recent_month_facet:
        y, m = month.split('-')
        recent_months.append((date(int(y), int(m), 1), count))

    # search for fixity checks in the last 30 days
    facetq = solr.query().filter(last_fixity_check__range=(month_ago, today)) \
                 .facet_by('last_fixity_result', mincount=1) \
                 .paginate(rows=0)
    # facetq = filter_by_perms(facetq, request.user)
    facets = facetq.execute().facet_counts.facet_fields
    recent_fixity_checks = facets['last_fixity_result']

    return TemplateResponse(request, 'repoadmin/site_dashboard.html',
                            {'recent_items': recent_items,
                             'recent_months': recent_months,
                             'recent_collections': recent_collections,
                             'recent_fixity_checks': recent_fixity_checks,
                             'month_ago': month_ago,
                             'manual_url': settings.KEEP_MANUAL_URL,
                             'find_collection': FindCollection()})

def search(request):
    '''Search for :class:`~keep.audio.models.AudioObject` or
    :class:`~keep.arrangement.models.ArrangementObject` by pid, title,
    description, collection, date, rights, etc.'''
    # if NO search terms are specified, return an advanced search page
    if not request.GET:
        return TemplateResponse(request, 'common/advanced-search.html',
                                {'searchform': commonforms.ItemSearch(prefix='audio')})

    form = commonforms.ItemSearch(request.GET, prefix='audio')
    ctx_dict = {'searchform': form}
    if form.is_valid():
        solr = solr_interface()
        # solr search options from posted data
        search_opts = form.search_options()
        # search term/value display info for user based on posted data
        ctx_dict['search_info'] = form.search_info()

        # solr query to restrict this search to appropriate content models
        cm_query = solr.Q(solr.Q(content_model=ArrangementObject.ARRANGEMENT_CONTENT_MODEL)
                          | solr.Q(content_model=AudioObject.AUDIO_CONTENT_MODEL)
                          | solr.Q(content_model=Video.VIDEO_CONTENT_MODEL))
        # for now, sort by most recently created
        solrquery = solr.query(**search_opts).filter(cm_query).sort_by('-created')

        # if user requested specific display fields, handle output display and formatting
        if form.cleaned_data['display_fields']:
            fields = form.cleaned_data['display_fields']
            # pid and content model are always needed to construct html search results
            solr_fields = fields + ['pid', 'content_model']
            solrquery = solrquery.field_limit(solr_fields)

            class FieldList(list):
                # extended list object with pid and content model attributes
                def __init__(self, pid=None, content_model=None, values=[]):
                    super(FieldList, self).__init__(values)
                    if pid:
                        self.pid = pid
                    if content_model:
                        self.content_model = content_model
                    else:
                        self.content_model = []

            def field_list(**kwargs):
                # construct a custom solr result based on the requested field list
                l = FieldList(pid=kwargs.get('pid', None),
                              content_model=kwargs.get('content_model', None))
                for f in fields:
                    val = kwargs.get(f, '')
                    if solr.schema.fields[f].multi_valued:
                        val = '; '.join(val)
                    l.append(val)
                return l

            solrquery = solrquery.results_as(field_list)
            ctx_dict.update({
                'display_fields': fields,
                'display_labels': [commonforms.ItemSearch.display_field_opts[f]
                                   for f in fields]
            })

            # if CSV is requested with display_fields, return as csv before paginating
            if form.cleaned_data['output'] == 'csv':
                response = HttpResponse(content_type='text/csv')
                response['Content-Disposition'] = 'attachment; filename=Keep-report_%s.csv' \
                    % date.today()
                writer = unicodecsv.writer(response)
                # write out list of field labels
                writer.writerow(ctx_dict['display_labels'])
                # then append all matching values
                # FIXME: csv output for very large results is VERY slow
                # TODO: append rows in chunks of 50-100, to handle
                # large result sets better - maybe use paginator?
                writer.writerows(solrquery)
                return response

        paginator = Paginator(solrquery, 30)
        try:
            page = int(request.GET.get('page', '1'))
        except ValueError:
            page = 1
        try:
            results = paginator.page(page)
        except (EmptyPage, InvalidPage):
            results = paginator.page(paginator.num_pages)

        # calculate page links to show
        show_pages = pages_to_show(paginator, page)

        ctx_dict.update({
            'results': results.object_list,
            'page': results,
            'show_pages': show_pages,
            # pass search term query opts to view for pagination links
            'search_opts': request.GET.urlencode(),
        })

    return TemplateResponse(request, 'common/search.html', ctx_dict)

def handle(self, **options):
    verbosity = int(options['verbosity'])
    errors = 0
    solr = solr_interface()

    # solr fields to include in the report, in column order
    fields = ['ark_uri', 'object_type', 'pid', 'duration', 'content_model',
              'has_original', 'title', 'content_size', 'researcher_access',
              'label', 'content_format', 'state', 'collection_source_id',
              'type', 'original_pid', 'access_copy_mimetype', 'access_code',
              'collection_id', 'collection_label', 'isMemberOfCollection',
              'rights', 'created_year', 'has_access_copy', 'access_copy_size']

    with open('collection_report.csv', 'wb') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(fields)
        solrquery = solr.query().sort_by('-created')
        for doc in solrquery:
            row = []
            # default to empty string for any field missing from the solr doc
            for field in fields:
                item = doc.get(field, '')
                if isinstance(item, basestring):
                    item = item.encode('utf-8')
                row.append(item)
            writer.writerow(row)

def test_search_bydate(self, mockpaginator, mocksolr_interface, mocksearch_libs):
    solr = solr_interface()
    search_url = reverse('search:keyword')
    mocksolr = mocksolr_interface.return_value
    mocksolr.Q = MagicMock(solr.Q)
    # point every chainable query method back at the mock query,
    # so chained calls resolve to a single assertable object
    mocksolr.query.return_value = mocksolr.query
    for method in ['query', 'facet_by', 'sort_by', 'field_limit',
                   'exclude', 'filter']:
        getattr(mocksolr.query, method).return_value = mocksolr.query

    # create researcher IP for localhost so anonymous access will be
    # treated as anonymous researcher
    researchip = ResearcherIP(name='test client', ip_address='127.0.0.1')
    researchip.save()

    # start date only
    sdate = '1980'
    response = self.client.get(search_url, {'start_date': sdate})
    # check solr query args
    # - date should query dates created and issued explicitly
    mocksolr.Q.assert_any_call(date_created__gte=sdate)
    mocksolr.Q.assert_any_call(date_issued__gte=sdate)
    self.assertContains(
        response,
        '<input class="form-control" id="id_start_date" name="start_date" placeholder="Start year" type="tel" value="%s">' % sdate,
        html=True,
        msg_prefix='start date search value should be displayed on result page via form')

    # end date only
    edate = '2001'
    response = self.client.get(search_url, {'end_date': edate})
    # check solr query args
    # - date should query dates created and issued explicitly
    search_edate = '%s-12-31' % edate
    mocksolr.Q.assert_any_call(date_created__lte=search_edate)
    mocksolr.Q.assert_any_call(date_issued__lte=search_edate)
    self.assertContains(
        response,
        '<input class="form-control" id="id_end_date" name="end_date" placeholder="End year" type="tel" value="%s">' % edate,
        html=True,
        msg_prefix='end date search value should be displayed on result page via form')

    # start and end date together
    response = self.client.get(search_url, {'start_date': sdate,
                                            'end_date': edate})
    # check solr query args
    # - date should query dates created and issued explicitly
    mocksolr.Q.assert_any_call(date_created__range=(sdate, search_edate))
    mocksolr.Q.assert_any_call(date_issued__range=(sdate, search_edate))
    researchip.delete()
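# The mock wiring above works because every chainable sunburnt method is
# pointed back at the same mock object, so an arbitrary method chain
# collapses to one mock whose calls can be asserted. A standalone sketch of
# the pattern (names here are illustrative):
from mock import MagicMock

mockquery = MagicMock()
# every chainable method returns the same mock, so any chain collapses to it
for method in ['query', 'filter', 'sort_by']:
    getattr(mockquery, method).return_value = mockquery

result = mockquery.query(title='x').filter(state='A').sort_by('-created')
assert result is mockquery
mockquery.filter.assert_called_with(state='A')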
def audio(self):
    self.stderr.write('Audio')
    ### audio
    # representative sample of all mime types
    # representative sample of old dm and native Keep objects
    # (dm carries an access status of 11)
    # representative sample of different access codes
    # 10 collections represented
    # (please include material from Dawson (94jz3))
    # NOTE: this is largely the same logic as for video
    solr = solr_interface()
    # desired number of collections
    # (could be adjusted some since more may be added in order to
    # find representative objects by status)
    num_collections = 10
    # desired number of objects
    desired_total = 100
    pids = []
    collections = set()
    # find all audio, and sort smallest first
    all_audio = solr.query(content_model=AudioObject.AUDIO_CONTENT_MODEL) \
                    .field_limit(['pid', 'collection_id']) \
                    .sort_by('access_copy_size')
    # master size is not indexed, but hopefully access copy
    # can serve as a proxy
    total_pids = all_audio.count()
    if self.verbosity >= self.v_normal:
        self.stderr.write('Found %d total audio objects' % all_audio.count())

    facet_q = all_audio.facet_by('collection_id', sort='count', mincount=1) \
                       .facet_by('access_code', sort='count', mincount=1) \
                       .paginate(rows=0)
    facets = facet_q.execute().facet_counts.facet_fields
    # pick the requested number of collections with the most items
    top_collections = [pid for (pid, count)
                       in facets['collection_id']][:num_collections]
    # restrict query to audio in those collections
    # OR in the dawson collection
    # (dawson is *probably* included in those, but explicitly include
    # since it was requested)
    collection_filter = solr.Q(collection_id=self.collections['dawson'])
    for coll in top_collections:
        collection_filter |= solr.Q(collection_id=coll)
    q = all_audio.filter(collection_filter)
    self.stderr.write('Found %d total audio objects in %d largest collections (including dawson)'
                      % (q.count(), num_collections))

    # Nothing here ensures we get content from all of these
    # collections, but hopefully the diversity of status codes
    # will help provide a reasonable distribution.

    # calculate and find a representative percentage of items
    # for each status based on the desired total
    for code, num in facets['access_code']:
        # determine number of pids to grab as a percentage
        # of the desired number
        percent = float(num) / total_pids
        # minimum of at least 1 per code
        num_pids = max(int(percent * desired_total), 1)
        if self.verbosity >= self.v_normal:
            self.stderr.write(' Looking for %d pid(s) for access code %s'
                              % (num_pids, code))
        # first try to find within the requested collections
        pids_by_code = q.filter(access_code=code)
        # if no pids are found for this code in our collections,
        # look for them elsewhere
        if not pids_by_code.count():
            pids_by_code = all_audio.filter(access_code=code)
        for r in pids_by_code[:num_pids]:
            pids.append(r['pid'])
            collections.add(r['collection_id'])

    if self.verbosity >= self.v_normal:
        self.stderr.write('Selected %d pids from %d collections'
                          % (len(pids), len(collections)))
    for p in pids:
        self.stdout.write(p)
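# The per-code quota above is plain proportional allocation with a floor of
# one, so even a rare access code contributes at least one pid. A worked
# example with made-up counts:
total_pids = 5000
desired_total = 100
for code, num in [('2', 3500), ('4', 1350), ('8', 150)]:
    percent = float(num) / total_pids
    num_pids = max(int(percent * desired_total), 1)
    print code, num_pids  # '2' -> 70, '4' -> 27, '8' -> 3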
def video(self):
    self.stderr.write('Video')
    ### video
    # need a representative sample of all mime types
    # representative sample of old dm and native Keep objects
    # (dm carries an access status of 11)
    # representative sample of different access codes
    # 5-10 collections represented
    # about 40 objects total (can be smallest size objects)
    # NOTE: there is currently no easy way to ensure we have
    # a representative sample of all mimetypes (master mimetypes are
    # not indexed, and there is too much content, so it would be too
    # slow to look in fedora). Hopefully the diversity of codes and
    # old dm content will provide sufficient representation.
    solr = solr_interface()
    # desired minimum number of collections
    # (minimum since more may be added in order to find
    # representative objects by status)
    num_collections = 5
    # desired number of objects
    desired_total = 40
    pids = []
    collections = set()
    # find all video, and sort smallest first
    all_video = solr.query(content_model=Video.VIDEO_CONTENT_MODEL) \
                    .field_limit(['pid', 'collection_id']) \
                    .sort_by('access_copy_size')
    # master size is not indexed, but hopefully access copy
    # can serve as a proxy
    total_pids = all_video.count()
    if self.verbosity >= self.v_normal:
        self.stderr.write('Found %d total video objects' % all_video.count())

    facet_q = all_video.facet_by('collection_id', sort='count', mincount=1) \
                       .facet_by('access_code', sort='count', mincount=1) \
                       .paginate(rows=0)
    facets = facet_q.execute().facet_counts.facet_fields
    # pick the requested number of collections with the most items
    top_collections = [pid for (pid, count)
                       in facets['collection_id']][:num_collections]
    # restrict query to video in those collections
    collection_filter = solr.Q()
    for coll in top_collections:
        collection_filter |= solr.Q(collection_id=coll)
    q = all_video.filter(collection_filter)
    self.stderr.write('Found %d total video objects in %d largest collections'
                      % (q.count(), num_collections))

    # Nothing here ensures we get content from all of these
    # collections, but hopefully the diversity of status codes
    # will help provide a reasonable distribution.

    # figure out some representative percentage based on our desired total
    # - by far the most content is old dm (93%), so don't use that %
    # first facet is old dm (largest total); facet is label, count
    old_dm_code = facets['access_code'][0][0]
    old_dm_total = facets['access_code'][0][1]
    # get percentages based on the total *without* old dm
    for code, num in facets['access_code'][1:]:
        # determine number of pids to grab as a percentage
        # of half the desired number
        percent = float(num) / (total_pids - old_dm_total)
        # minimum of at least 1 per code
        num_pids = max(int(percent * (desired_total / 2)), 1)
        if self.verbosity >= self.v_normal:
            self.stderr.write(' Looking for %d pid(s) for access code %s'
                              % (num_pids, code))
        # first try to find within the requested collections
        pids_by_code = q.filter(access_code=code)
        # if no pids are found for this code in our collections,
        # look for them elsewhere
        if not pids_by_code.count():
            pids_by_code = all_video.filter(access_code=code)
        for r in pids_by_code[:num_pids]:
            pids.append(r['pid'])
            collections.add(r['collection_id'])

    # other codes will provide slightly more than half,
    # because we are rounding up; get the rest of the
    # requested objects from old dm
    remainder = desired_total - len(pids)
    for r in q.filter(access_code=old_dm_code)[:remainder]:
        pids.append(r['pid'])
        collections.add(r['collection_id'])

    if self.verbosity >= self.v_normal:
        self.stderr.write('Selected %d pids from %d collections'
                          % (len(pids), len(collections)))
    for p in pids:
        self.stdout.write(p)
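# Because each non-dm code is rounded up to at least one pid, the first pass
# can overshoot half of desired_total; the remainder is then topped up from
# old dm. A worked example with made-up facet counts (note the integer
# division of desired_total / 2 under python 2):
desired_total = 40
facets = [('11', 930), ('2', 64), ('4', 3), ('8', 2), ('10', 1)]  # old dm first
non_dm_total = sum(num for code, num in facets[1:])
picked = 0
for code, num in facets[1:]:
    picked += max(int(float(num) / non_dm_total * (desired_total / 2)), 1)
print picked                  # 21 pids from the non-dm codes
print desired_total - picked  # 19 more filled in from old dm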
def solr_items_query(self):
    'Solr query for all items in this collection'
    solr = solr_interface()
    # search for all items that belong to this collection
    return solr.query(collection_id=self.pid)
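# Since solr_items_query returns an unevaluated sunburnt query, callers can
# refine it before execution; a hypothetical usage sketch:
q = collection.solr_items_query()
print q.count()
for item in q.field_limit(['pid', 'title']).sort_by('title'):
    print item['pid'], item.get('title', '')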
def keyword_search_suggest(request):
    '''Suggest helper for keyword search.  If the search string ends with
    a recognized field name with an optional value, e.g. ``user:`` or
    ``user:A``, looks up existing values using Solr facets.  Returns a
    JSON response with the 15 most common matching terms in the requested
    field with the search term prefix, if any.  If the search string is
    empty or ends with a space, suggests available search fields with an
    explanation.

    .. Note::

        Due to the current implementation and the limitations of facet
        querying in Solr, the search term is case-sensitive and only
        matches at the beginning of the string.

    Return format is suitable for use with the `JQuery UI Autocomplete`_ widget.

    .. _JQuery UI Autocomplete: http://jqueryui.com/demos/autocomplete/

    :param request: the http request passed to the original view method
        (used to retrieve the search term)
    '''
    term = request.GET.get('term', '')
    suggestions = []

    # if term is empty or ends in a space, suggest available search fields
    if term == '' or term[-1] == ' ':
        suggestions = [
            {'label': field,
             'value': '%s%s' % (term, field),
             'category': 'Search Fields',
             'desc': desc}
            for field, desc in KeywordSearch.field_descriptions.iteritems()
        ]

    # otherwise, check if there is a field to look up values for
    else:
        term_prefix, sep, term_suffix = term.rpartition(' ')
        value_prefix = term_prefix + sep
        # parse the last search term
        try:
            # parse could error in some cases
            parsed_terms = parse_search_terms(term_suffix)
            field, prefix = parsed_terms[-1]
        except Exception:
            field, prefix = None, ''
        if prefix is None:
            prefix = ''

        # if field can be faceted, suggest terms
        if field in KeywordSearch.facet_fields.keys():
            facet_field = KeywordSearch.facet_fields[field]
            # date created is a special case
            if field == 'created':
                sort = 'index'
                category = 'Date Added'
                # if less than 4 characters, suggest year
                if len(prefix) < 4:
                    facet_field = 'created_year'
                    result_fmt = '%s'
                # between 4 and 7, suggest year-month
                elif len(prefix) < 7:
                    facet_field = 'created_month'
                    result_fmt = '%s'
                # suggest full dates
                else:
                    result_fmt = '%s '
            elif field in ['added_by', 'user']:
                # added_by or user
                sort = 'count'
                category = 'Users'
                result_fmt = '"%s" '
            # collection label
            elif field == 'coll':
                sort = 'count'
                category = 'Collection'
                result_fmt = '%s '
                # if the term is numeric, facet by source id instead
                if prefix and prefix.isdigit():
                    facet_field = 'collection_source_id'

            solr = solr_interface()
            facetq = solr.query().paginate(rows=0)
            # filter by current user permissions
            # facetq = filter_by_perms(facetq, request.user)

            # return the 15 most common terms in the requested facet field
            # with a specified prefix
            facetq = facetq.facet_by(facet_field, prefix=prefix,
                                     sort=sort, limit=15)
            facets = facetq.execute().facet_counts.facet_fields

            # generate a dictionary to return via json with label (facet value
            # + count), and actual value to use
            suggestions = [
                {'label': '%s (%d)' % (facet, count),
                 'value': '%s%s:' % (value_prefix, field) + result_fmt % facet,
                 'category': category}
                for facet, count in facets[facet_field]
            ]

    return HttpResponse(json_serializer.encode(suggestions),
                        content_type='application/json')
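# The heart of the suggest view is solr facet prefix matching. Stripped of
# the field-specific branching, the lookup can be exercised on its own; a
# minimal sketch, where the 'users_facet' field and the 'A' prefix are
# illustrative values, not fields defined by this codebase:
solr = solr_interface()
facetq = solr.query().paginate(rows=0) \
    .facet_by('users_facet', prefix='A', sort='count', limit=15)
facets = facetq.execute().facet_counts.facet_fields
for value, count in facets['users_facet']:
    print '%s (%d)' % (value, count)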
def search(request):
    form = SearchForm(request.GET, user=request.user)
    # form.filter_libraries_by_user(request.user)
    ctx = {'form': form}
    if form.is_valid():
        search_terms = form.cleaned_data['keyword']
        search_opts = form.cleaned_data
        # solr search field parses into a list of (field, search term) tuples;
        # this search doesn't support any field: searching yet, so just
        # assume all are keywords
        search_terms = [v for k, v in search_terms]

        solr = solr_interface()
        # NOTE: content types currently supported for researcher access
        cm_query = solr.Q(solr.Q(content_model=AudioObject.AUDIO_CONTENT_MODEL)
                          | solr.Q(content_model=Video.VIDEO_CONTENT_MODEL))
        # start with a default query to add filters & search terms
        q = solr.query().filter(cm_query)
        # filter the query by logged-in user permissions
        # includes restricting to researcher-accessible content when appropriate
        q = filter_by_perms(q, request.user)

        if search_terms:
            q = q.query(*search_terms)
            # NOTE: sunburnt now seems to require an explicit list of fields
            # when returning score
            q = q.sort_by('-score').field_limit(
                ['pid', 'title', 'collection_id', 'collection_source_id',
                 'collection_label', 'ark_uri', 'date_issued', 'date_created',
                 'part', 'duration', 'researcher_access', 'object_type'],
                score=True)
            # NOTE: do we want a secondary sort after score?
        else:
            q = q.sort_by('title_exact')

        # if a collection search term is specified, filter
        if 'collection' in search_opts and search_opts['collection']:
            collection = search_opts['collection']
            # search on *either* collection name or collection number
            q = q.query(solr.Q(collection_label=collection)
                        | solr.Q(collection_source_id=collection))

        # if a library is specified, filter by archive id on related collection
        if 'library' in search_opts and search_opts['library']:
            library = search_opts['library']
            # NOTE: requires a join query; items belong to collections, which
            # belong to libraries; join on pid->collection id in order to
            # filter on the archive id property of the associated collection object
            q = q.join('pid', 'collection_id', archive_id=library)

        # if a format search term is specified, filter
        if 'format' in search_opts and search_opts['format']:
            format = search_opts['format']
            # search on format by content model
            q = q.query(solr.Q(content_model=format))

        # date search
        if search_opts.get('start_date', None) or search_opts.get('end_date', None):
            sdate = search_opts.get('start_date', None)
            edate = search_opts.get('end_date', None)
            # NOTE: needs to handle date format variation (YYYY, YYYY-MM, etc)
            if sdate is not None:
                # ensure we search on 4-digit year
                sdate = '%04d' % int(sdate)
            # convert end date to end of year in order to catch any date variants
            # within that year; e.g. 2001-12-31 will always come after 2001-04, etc
            if edate is not None:
                edate = "%04d-12-31" % int(edate)

            # single date search: start and end date should be the same;
            # using same logic as range to match any dates within that year

            # if only one of start or end is specified, results in an open
            # range, i.e. anything after start date or anything before end date

            # if both values are set, use sunburnt range query
            if sdate is not None and edate is not None:
                created_q = solr.Q(date_created__range=(sdate, edate))
                issued_q = solr.Q(date_issued__range=(sdate, edate))
                # q = q.query(date__range=(sdate, edate))
            elif sdate is not None:
                # restrict by start date
                # YYYY sorts before any date in that year, e.g. "2001" <= "2001-11"
                # q = q.query(date__gte='%04d' % sdate)
                created_q = solr.Q(date_created__gte=sdate)
                issued_q = solr.Q(date_issued__gte=sdate)
            elif edate is not None:
                # restrict by end date
                # q = q.query(date__lte=str(edate))
                created_q = solr.Q(date_created__lte=edate)
                issued_q = solr.Q(date_issued__lte=edate)

            # NOTE: explicitly search on date created or date issued,
            # to avoid complications with other values in the generic date field
            q = q.query(created_q | issued_q)

        # paginate the solr result set
        paginator = Paginator(q, 30)
        try:
            page = int(request.GET.get('page', '1'))
        except ValueError:
            page = 1
        try:
            results = paginator.page(page)
        except (EmptyPage, InvalidPage):
            results = paginator.page(paginator.num_pages)

        # url parameters for pagination links
        url_params = request.GET.copy()
        if 'page' in url_params:
            del url_params['page']

        ctx.update({
            'results': results,
            'search_opts': request.GET.urlencode(),
            'search_terms': search_terms,
            'url_params': urlencode(url_params)
        })

    return TemplateResponse(request, 'search/results.html', ctx)
def list_archives(request, archive=None):
    '''List all top-level archive collections, with the total count of
    :class:`~keep.collection.models.CollectionObject` in each archive.

    .. Note::

        Archives must be configured in **PID_ALIASES** in Django settings
        in order to be listed here.

    .. Note::

        Within the code, top-level collections are referred to as
        "archives", but externally for users they should always be
        labeled as "Libraries."
    '''
    # if params are set, search for collection
    if 'archive' in request.GET and 'collection' in request.GET:
        form = FindCollection(request.GET, user=request.user)
        if form.is_valid():
            data = form.cleaned_data
            q = CollectionObject.item_collection_query()
            # submitted value is pid alias; look up pid for solr query
            archive_id = settings.PID_ALIASES[data['archive']]
            q = q.query(archive_id=archive_id,
                        source_id=data['collection'])
            # if exactly one result is found, redirect to the collection view
            if q.count() == 1:
                # give user some context for the redirect
                messages.info(request, 'One collection found for %s %s.'
                              % (data['archive'].upper(), data['collection']))
                return HttpResponseSeeOtherRedirect(
                    reverse('collection:view', kwargs={'pid': q[0]['pid']}))
            # otherwise, if multiple, redirect to a filtered view of the archive browse
            elif q.count():
                messages.info(request, '%d collections found for %s %s.'
                              % (q.count(), data['archive'].upper(),
                                 data['collection']))
                return HttpResponseSeeOtherRedirect('%s?%s' % (
                    reverse('collection:browse-archive',
                            kwargs={'archive': data['archive']}),
                    urlencode({'collection': data['collection']})))
            # if no matches, warn and return to archive display
            else:
                messages.warning(request, 'No collections found for %s %s.'
                                 % (data['archive'].upper(), data['collection']))
        # values submitted but form not valid
        else:
            # TODO: better error message?
            messages.warning(request,
                             'Collection search input was not valid; please try again.')

    q = CollectionObject.item_collection_query()
    q = q.facet_by('archive_id', sort='count', mincount=1) \
         .paginate(rows=0)

    # depending on permissions, restrict to collections with researcher audio
    if not request.user.has_perm('collection.view_collection') and \
            request.user.has_perm('collection.view_researcher_collection'):
        q = q.join('collection_id', 'pid', researcher_access=True)
        q = q.join('collection_id', 'pid', has_access_copy=True)

    facets = q.execute().facet_counts.facet_fields

    solr = solr_interface()
    archive_info = dict([(pid.replace('info:fedora/', ''), {'count': count})
                         for pid, count in facets['archive_id']])

    # construct a boolean pid query to match any archive pids,
    # in order to look up titles and match them to pids
    pid_q = solr.Q()
    for pid in archive_info.keys():
        pid_q |= solr.Q(pid=pid)
    query = solr.query(pid_q) \
                .field_limit(['pid', 'title']) \
                .sort_by('title')

    # pid aliases are keyed on the alias, but we need to look up by pid
    pid_aliases_by_pid = dict([(v, k) for k, v
                               in settings.PID_ALIASES.iteritems()])

    # add solr information and pid aliases to info dictionary
    # (result variable renamed so it does not shadow the solr query q above)
    for result in query:
        pid = result['pid']
        if pid not in archive_info:
            continue
        # duplicate pid inside the dict so the template gets a list of
        # dicts it can order with dictsort
        archive_info[pid]['pid'] = result['pid']
        archive_info[pid]['title'] = result['title']
        alias = pid_aliases_by_pid.get(pid, None)
        archive_info[pid]['alias'] = alias
        if alias is None:
            logger.warning('No pid alias found for archive %(pid)s (%(title)s)'
                           % result)

    # prune any referenced archives that aren't actually indexed in solr
    # (should only happen in dev/qa)
    for pid in archive_info.keys():
        if 'title' not in archive_info[pid] or archive_info[pid]['alias'] is None:
            del archive_info[pid]

    # NOTE: sending a list of values (dictionaries) to allow sorting in the template
    return TemplateResponse(request, 'collection/archives.html',
                            {'archives': archive_info.values(),
                             'find_collection': FindCollection(user=request.user)})
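# The pid_q loop above builds one boolean OR across all archive pids, a
# pattern several views here repeat. An equivalent, more compact formulation
# with reduce - a stylistic alternative, not what the code above does, and
# unlike the loop it assumes at least one pid is present:
import operator
pid_q = reduce(operator.or_, (solr.Q(pid=pid) for pid in archive_info))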