def check_bad_query_data(kwargs):
    # The query should raise a SolrError; reaching the else branch means
    # the bad input was accepted, so the check fails.
    solr_search = SolrSearch(interface)
    try:
        solr_search.query(**kwargs).params()
    except SolrError:
        pass
    else:
        assert False
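# Illustrative use of the check above (the field and value here are made
# up; the real bad_query_data fixture depends on the test schema): a value
# that cannot be coerced to a typed field should raise SolrError.
check_bad_query_data({"int_field": "not_an_int"})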
def search_dataset(self, request, **kwargs):
    """
    Perform a full-text search on only one dataset.

    TKTK -- implement field searches
    TKTK -- implement wildcard + boolean searches
    """
    self.method_check(request, allowed=['get'])
    self.is_authenticated(request)
    self.throttle_check(request)

    if 'pk' in kwargs:
        dataset_id = kwargs['pk']
    else:
        dataset_id = request.GET.get('id')

    d = Dataset.objects.get(id=dataset_id)

    limit = int(request.GET.get('limit', settings.PANDA_DEFAULT_SEARCH_ROWS))
    offset = int(request.GET.get('offset', 0))

    s = SolrSearch(self._solr())
    s = s.query(full_text=request.GET.get('q'))
    s = s.filter(dataset_id=dataset_id)
    s = s.paginate(offset, limit)
    s = s.execute()

    paginator = Paginator(request.GET, s, resource_uri=request.path_info)
    page = paginator.page()

    dataset_url = reverse('api_dispatch_detail', kwargs={
        'api_name': kwargs['api_name'],
        'resource_name': 'dataset',
        'pk': dataset_id
    })

    # Update with attributes from the dataset
    # (Resulting object matches a group from the search endpoint)
    page.update({
        'id': d.id,
        'name': d.name,
        'resource_uri': dataset_url,
        'row_count': d.row_count,
        'schema': d.schema
    })

    objects = []

    for obj in s.result.docs:
        bundle = self.build_bundle(obj=SolrObject(obj), request=request)
        bundle = self.full_dehydrate(bundle)
        objects.append(bundle)

    page['objects'] = objects

    self.log_throttled_access(request)

    return self.create_response(request, page)
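# Hypothetical request against this per-dataset endpoint; the URL shape is
# a guess based on the 'api_search_dataset' reverse() name used by the
# grouped search endpoint below, not taken from the project's urlconf:
#   GET /api/1.0/dataset/<pk>/search/?q=education&limit=10&offset=0
# The response mirrors one group from the cross-dataset search: dataset
# attributes at the top level, matching rows under 'objects'.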
def check_multiple_call_data(arg_kw_list, query_output, filter_output):
    solr_search = SolrSearch(interface)
    # Apply the same argument sets through query() on one search and
    # filter() on another, then compare the rendered params of each.
    q = solr_search.query()
    f = solr_search.query()
    for args, kwargs in arg_kw_list:
        q = q.query(*args, **kwargs)
        f = f.filter(*args, **kwargs)
    qp = q.params()
    fp = f.params()
    check_equal_with_debug(qp, query_output)
    check_equal_with_debug(fp, filter_output)
def test_wildcard_search_cleaned_up(self):
    from adhocracy.lib.search.query import add_wildcard_query
    search = SolrSearch(interface)
    query = add_wildcard_query(search, 'text', 'one** two*')
    self.assertEqual(
        query.params(),
        [('q', '(text:one OR text:one*) AND (text:two OR text:two*)')])
def test_wildcard_search_ignore_none(self):
    from adhocracy.lib.search.query import add_wildcard_query
    search = SolrSearch(interface)
    query = add_wildcard_query(search, 'text', None)
    self.assertEqual(
        query.params(),
        [('q', '*:*')])
def test_wildcard_search_added_to_search(self):
    from adhocracy.lib.search.query import add_wildcard_query
    search = SolrSearch(interface).query(text='passedin')
    query = add_wildcard_query(search, 'text', 'wild')
    self.assertEqual(
        query.params(),
        [('q', 'text:passedin AND (text:wild OR text:wild*)')])
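# The three tests above pin down add_wildcard_query's behavior: None
# leaves the query untouched, and each whitespace-separated term must
# match either exactly or as a prefix wildcard, with terms ANDed onto the
# running query. A minimal sketch consistent with those assertions (the
# real implementation lives in adhocracy.lib.search.query and may differ;
# this assumes sunburnt's Q() factory on search objects):
def add_wildcard_query_sketch(search, field, value):
    if value is None:
        return search
    for term in value.split():
        term = term.rstrip('*')
        # (exact term) OR (prefix wildcard), ANDed onto the running query
        q = search.Q(**{field: term}) | search.Q(**{field: term + '*'})
        search = search.query(q)
    return search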
def test_complex_boolean_queries():
    solr_search = SolrSearch(interface)
    for query, output in complex_boolean_queries:
        check_complex_boolean_query(solr_search, query, output)
def check_query_data(method, args, kwargs, output):
    solr_search = SolrSearch(interface)
    p = getattr(solr_search, method)(*args, **kwargs).params()
    check_equal_with_debug(p, output)
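# Hypothetical entries for the data-driven check above. The params()
# output format follows the assertions in the wildcard tests earlier, but
# these specific cases are illustrative, not the real fixture data:
example_query_data = [
    ("query", (), {"text": "hello"}, [("q", "text:hello")]),
    ("filter", (), {"text": "hello"}, [("fq", "text:hello"), ("q", "*:*")]),
]
for method, args, kwargs, output in example_query_data:
    check_query_data(method, args, kwargs, output)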
def get(rows, start, **kwargs):
    """
    Input
        id
        start_date
        end_date
        phrase
        rows - the number of records to get from solr
        start - where to start getting records in solr (offset)
        frame
        order
        states - list of 2 letter state abbreviations
    Output
        dict with keys: count, start, speeches, term_vectors, highlighting
    """
    solr_query = Speech.build_sunburnt_query(**kwargs).paginate(rows=rows, start=start)

    # Plain field orderings are passed straight to Solr; the pseudo-fields
    # below ("frame", "tfidf", etc.) are sorted via function queries instead.
    if kwargs.get('order') and kwargs.get('order') not in ["frame", "tfidf", "idf", "termFreq"]:
        solr_query = solr_query.sort_by(kwargs.get('order'))

    # solr_query = solr_query.terms('speaking').terms(tf=True)

    # Augment the sunburnt-built params with Solr function queries for
    # relevance scoring on the 'speaking' field.
    params = solr_query.params()
    dict_params = dict(params)
    dict_params['norm'] = 'norm(speaking)'
    dict_params['tf'] = 'tf(speaking, %s)' % kwargs.get('phrase')
    dict_params['idf'] = 'idf(speaking, %s)' % kwargs.get('phrase')
    dict_params['tfidf'] = 'mul($tf, $idf)'
    dict_params['termFreq'] = 'termfreq(speaking, %s)' % kwargs.get('phrase')
    dict_params['fl'] = "*, score, $norm, $termFreq, $tf, $idf, $tfidf"
    # Only keep documents whose $tfidf value is at least 8 (frange lower bound).
    dict_params['q'] += " AND {!frange l=8}$tfidf"

    if kwargs.get('order') is None or kwargs.get('order') == "tfidf":
        dict_params["sort"] = "$tfidf desc"

    if kwargs.get('frame') and kwargs.get('order') == "frame" and kwargs.get('analysis_id'):
        from app.models.analysis import Analysis

        frame_words = Frame.get(Frame.id == kwargs['frame']).word_string

        # analysis_obj = Analysis.get(Analysis.id == kwargs['analysis_id'])
        # key = "%s - %s" % (kwargs.get('start_date'), kwargs.get('end_date'))
        # vocabulary_proba = json.loads(analysis_obj.speech_windows)[key]
        # frame_vocabulary_proba = { word: (abs(exp(vocabulary_proba.get(word)[0]) - exp(vocabulary_proba.get(word)[1]))) if vocabulary_proba.get(word) != None else 0 for word in frame_words.split() }
        # dict_params['frameFreq'] = "mul(sum(" + ", ".join(map(lambda word: "mul(termfreq(speaking,\"%s\"), %f)" % (word, frame_vocabulary_proba[word]), frame_words.split())) + "), $norm)"

        # Uniform weight of 1 per frame word (the probability-weighted
        # version is commented out above).
        dict_params['frameFreq'] = "mul(sum(" + ", ".join(
            'mul(termfreq(speaking,"%s"), %f)' % (word, 1)
            for word in frame_words.split()) + "), $norm)"

        if dict_params.get('fl'):
            dict_params['fl'] += ", $frameFreq"
        else:
            dict_params['fl'] = '$frameFreq'

        dict_params["sort"] = "$frameFreq desc"

    params = dict_params.items()

    # Bypass sunburnt's execute() so the extra function-query params are
    # sent as-is, then reuse its response parsing.
    result = si.schema.parse_response(si.conn.select(params))
    q = SolrSearch(si)
    response = q.transform_result(result, q.result_constructor)

    speeches = response.result.docs
    highlighting = response.highlighting
    term_vectors = response.term_vectors
    current_count = response.result.numFound
    current_start = response.result.start

    # TODO: improve this
    if kwargs.get('frame') and kwargs.get('highlight'):
        frame = Frame.get(Frame.id == kwargs['frame'])
        for speech in speeches:
            # highlight_speech's return value is discarded here, so this
            # relies on it mutating the speech document in place.
            speech = Speech.highlight_speech(speech, frame)

    speeches_dict = {
        'count': current_count,
        'start': current_start,
        'speeches': speeches,
        'term_vectors': term_vectors,
        'highlighting': highlighting
    }

    return speeches_dict
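# Illustrative call of get() above, using kwarg names from its docstring;
# the phrase and order values here are made up:
results = get(rows=20, start=0, phrase='education', order='tfidf')
print results['count'], len(results['speeches'])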
#!/usr/bin/env python

from sunburnt import SolrInterface
from sunburnt.search import SolrSearch

solr = SolrInterface('http://localhost:8983/solr')
s = SolrSearch(solr)

print 'Testing basic query'
response = s.query(full_text='Education').execute()
print response.result

print 'Testing group query'
response = s.query(full_text='Education').group_by('dataset_id', limit=2, sort='+row').execute()

for k, g in response.result.groups.items():
    print k, g.docs
def search(self, request, **kwargs):
    """
    An endpoint for performing full-text searches.

    TKTK -- implement field searches
    TKTK -- implement wildcard + boolean searches
    """
    self.method_check(request, allowed=['get'])
    self.is_authenticated(request)
    self.throttle_check(request)

    limit = int(request.GET.get('limit', settings.PANDA_DEFAULT_SEARCH_GROUPS))
    offset = int(request.GET.get('offset', 0))

    s = SolrSearch(self._solr())
    s = s.query(full_text=request.GET.get('q'))
    s = s.group_by('dataset_id', limit=settings.PANDA_DEFAULT_SEARCH_ROWS_PER_GROUP, offset=0, sort='+row')
    s = s.paginate(offset, limit)
    s = s.execute()

    paginator = Paginator(request.GET, s, resource_uri=request.path_info)
    page = paginator.page()

    datasets = []

    for dataset_id, group in s.result.groups.items():
        dataset_url = reverse('api_dispatch_detail', kwargs={
            'api_name': kwargs['api_name'],
            'resource_name': 'dataset',
            'pk': dataset_id
        })
        dataset_search_url = reverse('api_search_dataset', kwargs={
            'api_name': kwargs['api_name'],
            'resource_name': 'dataset',
            'pk': dataset_id
        })

        d = Dataset.objects.get(id=dataset_id)

        dataset = {
            'id': d.id,
            'name': d.name,
            'resource_uri': dataset_url,
            'row_count': d.row_count,
            'schema': d.schema,
            'meta': {
                'limit': settings.PANDA_DEFAULT_SEARCH_ROWS_PER_GROUP,
                'next': None,
                'offset': 0,
                'previous': None,
                'total_count': group.numFound
            },
            'objects': []
        }

        if group.numFound > settings.PANDA_DEFAULT_SEARCH_ROWS_PER_GROUP:
            dataset['meta']['next'] = '?'.join([dataset_search_url, 'limit=%i&offset=%i' % (settings.PANDA_DEFAULT_SEARCH_ROWS, settings.PANDA_DEFAULT_SEARCH_ROWS)])

        for obj in group.docs:
            bundle = self.build_bundle(obj=SolrObject(obj), request=request)
            bundle = self.full_dehydrate(bundle)
            dataset['objects'].append(bundle)

        datasets.append(dataset)

    page['objects'] = datasets

    self.log_throttled_access(request)

    return self.create_response(request, page)
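# Hypothetical request against this endpoint (the URL prefix is a guess;
# actual routing depends on how the Tastypie Api is registered):
#   GET /api/1.0/data/search/?q=education&limit=5&offset=0
# Each element of the returned 'objects' list is one dataset group with
# its own nested 'meta' (for per-dataset paging) and its matching rows.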