def test_search_persistent(self):
    """Run the same query repeatedly over one persistent (keep-alive) GET connection."""
    client = Solr(os.getenv('SOLR_URL'), persistent=True, use_get=True)
    for _ in xrange(10):
        result = client.search(q='*:*')
        # Every round trip should succeed and return the same fixed corpus.
        self.assertEqual(result.status, 200)
        self.assertEqual(result.total_results, 4563722)
        self.assertEqual(len(result.documents), 10)
def sciencedata(self):
    """Collect per-collection metadata plus a dataset count from the LabCAS Solr cores.

    Queries the `collections` core for all collections, counts each collection's
    datasets via the `datasets` core, and returns a list of dicts (one per
    collection having both a name and an id), sorted by collection name.
    """
    sciencedata_prefix = "https://labcas-dev.jpl.nasa.gov/collections/collections/"
    results = []
    solr_collection = Solr(base_url='http://localhost:8983/solr/collections', version=4)
    solr_dataset = Solr(base_url='http://localhost:8983/solr/datasets', version=4)
    collection_query = {'q': '*:*'}
    collection_response = solr_collection.search(**collection_query)
    for obj in collection_response.documents:
        # Skip collections missing either a display name or an id.
        if obj.get("CollectionName") and obj.get("id"):
            dataset_query = {'q': '*:*', 'fq': "CollectionId='{}'".format(obj.get("id"))}
            dataset_response = solr_dataset.search(**dataset_query)
            datasetcount = self.countDatasets(dataset_response.documents)
            results.append(dict(
                collectionname=obj["CollectionName"],
                description=obj.get("CollectionDescription", "None"),
                url=sciencedata_prefix + obj["id"],
                leadpi=obj.get("LeadPI", ["None"]),
                organ=obj.get("OrganSite", ["No Organ info"]),
                discipline=obj.get("Discipline", ["None"]),
                protocol=obj.get("ProtocolId", ["None"]),
                qastate=obj.get("QAState", ["None"]),
                species=obj.get("Species", ["None"]),
                datasetcount=datasetcount,
            ))
    # bugfix/modernization: key-based sort replaces the Python-2-only
    # cmp-comparator form; same ascending order by collection name.
    results.sort(key=lambda entry: entry['collectionname'])
    return results
def generate_sitemap(args):
    """Build a sitemap of all discoverable Solr objects and write it to disk.

    :param args: parsed CLI args; ``args.output`` optionally overrides the
        default sitemap output path.
    """
    # init sitemap
    sm = Sitemap(changefreq='weekly')
    solr_handle = Solr('http://localhost:8080/solr4/fedobjs')
    query = {'q' : 'rels_isDiscoverable:True', 'fl' : 'id', 'start' : 0}
    # get solr cursor
    cursor = solr_handle.search_cursor(**query)
    # loop through and write to sitemap, 100 documents per fetch
    for chunk in cursor.fetch(100):
        for object_id in chunk.documents:
            # NOTE(review): with fl='id' each document is a dict, so this
            # interpolates the whole dict into the URL -- confirm whether
            # object_id['id'] was intended.
            urladd = "https://digital.library.wayne.edu/item/{object_id}".format(object_id=object_id)
            sm.add( urladd, lastmod="today" )
    # save to disk
    if args.output:
        filename = args.output
    else:
        filename = "/var/www/wsuls/digitalcollections/public/sitemaps/sitemap_https.xml"
    fhand = open(filename, "w")
    sm.write(fhand)
    fhand.close()
    # `stime` is presumably a module-level start timestamp defined outside this view.
    print("sitemap created at %s, total time elapsed %s" % (filename, (time.time()-stime) ))
class LogIndexer(object):
    """Indexes log-record dictionaries into Solr, normalising datetime values first."""

    def __init__(self, solrAddr):
        """Create the Solr client for the given base URL."""
        self.solr = Solr(solrAddr)

    def index(self, data):
        """Convert datetime values in `data` to Solr UTC strings, post it as one
        document, then commit. Best-effort: a failed post is reported, not raised."""
        for key, value in data.items():
            if isinstance(value, datetime.datetime):
                try:
                    # NOTE(review): `solr.core` refers to a module-level name,
                    # not `self.solr` -- confirm the intended helper module.
                    value = solr.core.utc_to_string(value)
                except Exception:
                    # bugfix: narrowed from a bare `except:`.
                    # Naive datetimes: assume Paris local time, then convert.
                    pst = tz.gettz('Europe/Paris')
                    value = value.replace(tzinfo=pst)
                    value = solr.core.utc_to_string(value)
                data[key] = value
        try:
            self.solr.update([data])
        except Exception:
            # bugfix: narrowed from a bare `except:`; keep the best-effort behavior.
            print("Erreur Index request: ")
        self.solr.commit()
        print("data indexed")
def query_solr():
    """Endpoint: turn the posted JSON body into a Solr query and return matching docs as JSON."""
    solr_query = parse_json(json.loads(request.data))
    client = Solr("http://52.76.188.127:8983/solr/clickstream_event_shard1_replica1/")
    result = client.search(q=solr_query)
    return json.dumps(result.documents)
def post_to_solr( self, solr_dict ):
    """ Posts solr_dict to solr.
        Returns the response status; raises if the post did not return 200. """
    root_url = os.environ.get('BELL_I_SOLR_ROOT')
    solr = Solr( root_url )
    # Our legacy solr requires the 'xml' input type (the default json post fails).
    update_response = solr.update( [solr_dict], 'xml', commit=True )
    status = update_response.status
    self.logger.info( 'in tasks.indexer.post_to_solr() [for custom-solr]; accession_number, %s; response_status, %s' % (solr_dict['accession_number_original'], status) )
    if status != 200:
        raise Exception( 'custom-solr post problem logged' )
    return status
def post_to_solr( self, solr_dict ):
    """ Posts solr_dict to solr.
        Called by update_custom_index_entry() """
    connection = Solr( self.CUSTOM_INDEX_SOLR_URL_ROOT )
    # Our legacy solr requires the 'xml' input type (the default json post fails).
    result = connection.update( [solr_dict], 'xml', commit=True )
    status = result.status
    self.logger.info( 'in tasks.indexer.CustomIndexUpdater.post_to_solr() [for custom-solr]; accession_number, %s; response_status, %s' % (solr_dict['accession_number_original'], status) )
    if status != 200:
        raise Exception( 'custom-solr post problem logged' )
    return status
class SolrUtils:
    """Thin wrapper around a mysolr connection for JSON indexing."""

    def __init__(self, url):
        # Keep the raw URL alongside the live connection.
        self.url = url
        self.conn = Solr(url)

    def addJSONDoc(self, doc):
        # Queue the document without committing; call commit() to flush.
        # NOTE(review): mysolr's update() usually takes a list of docs --
        # confirm callers pass a list here.
        self.conn.update(doc, 'json', commit=False)

    def commit(self):
        # Flush all pending updates to the index.
        self.conn.commit()
def delete_item( self, pid ):
    """ Deletes item from custom bell index.
        Called by one_offs.rebuild_custom_index(). """
    root_url = os.environ.get('BELL_I_SOLR_ROOT')
    self.logger.info( 'in tasks.indexer.delete_item() [for custom-solr]; SOLR_ROOT_URL, %s' % root_url )
    solr = Solr( root_url )
    # Remove every document whose pid matches exactly.
    delete_response = solr.delete_by_query( 'pid:"%s"' % pid, commit=True )
    status = delete_response.status
    self.logger.info( 'in tasks.indexer.delete_item() [for custom-solr]; pid, %s; response_status, %s' % (pid, status) )
    if status != 200:
        raise Exception( 'custom-solr delete problem logged' )
    return status
def solr_search(self, query):
    """Do the solr search and pass back results"""
    # Map each PDB id (upper-cased) to its first molecule-name description.
    solr = Solr(self.server, version=4)
    search_results = solr.search(**query)
    return {
        pdb.get('pdb_id').upper(): {'description': pdb.get('molecule_name')[0]}
        for pdb in search_results.documents
    }
def getSingleObjects(id_list, start): smCount = 1 tcount = 0 solr = Solr('http://localhost:8080/solr4/fedobjs') query = {'q' : 'rels_isDiscoverable:True', 'fl' : 'id', 'rows' : 50000, 'start' : 0} response = solr.search(**query) print "Num Results:",response.total_results for each in response.documents: # print "adding:",each['id'] id_list.append(each['id']) tcount+=1 print "Writing",tcount,"results..." writeSitemapXML(id_list, smCount)
def delete_target_custom_solr_pids( self ):
    """ Deletes each pid listed in the source json file from the custom solr
        index, recording each deletion's response status in the tracker. """
    ## load pids to be deleted
    with open( self.PIDS_TO_DELETE_SOURCE_DATA_JSON_PATH ) as f:
        deletion_pid_lst = json.loads( f.read() )
    ## run deletion loop, tracking along way
    # One client for the whole loop (it was being rebuilt per pid).
    solr = Solr( self.CUSTOM_INDEX_SOLR_URL_ROOT )
    for pid in deletion_pid_lst:
        response = solr.delete_by_query( 'pid:"%s"' % pid, commit=True )
        response_status = response.status
        self.update_tracker( pid, response_status )
        if not response_status == 200:
            # bugfix: the format kwarg was `resp` while the template expected
            # `{response}`, which raised KeyError instead of logging the failure.
            logger.error( 'custom-solr delete problem-response for pid `{pid}`: ```{response}```'.format(pid=pid, response=response_status) )
    return
def __init__(self, context, request):
    """Initialise counters and buffers, resolve the Solr URI from settings, and connect."""
    self.context = context
    self.request = request
    self.create_count = 0
    self.update_count = 0
    self.messages = []
    self.to_index = []
    uri = request.registry.settings.get('push.solr_uri', None)
    if uri is None:
        raise AttributeError(u'A push.solr_uri is required')
    # XXX: We are importing solr here to be able to mock it in the tests
    from mysolr import Solr
    self.solr = Solr(uri)
    self.shared = context.shared
def solr_search(self, query):
    """Do the solr search and pass back results"""
    output_dict = {}
    connection = Solr(self.server, version=4)
    response = connection.search(**query)
    # One entry per returned PDB record, keyed by upper-cased pdb_id.
    for record in response.documents:
        key = record.get('pdb_id').upper()
        output_dict[key] = {'description': record.get('molecule_name')[0]}
    return output_dict
def _readLabcasSolr(self, labcasurl, labcas_sourceurl_prefix):
    u'''Query every document from the LabCAS Solr core at ``labcasurl`` and
    return a dict mapping each document id to the document itself, with a
    ``sourceurl`` field added (prefix + id).'''
    connection = Solr(base_url=labcasurl, version=4)
    response = connection.search(q='*:*')
    documents_by_id = {}
    for document in response.documents:
        document['sourceurl'] = labcas_sourceurl_prefix + document.get("id")
        documents_by_id[document.get("id")] = document
    return documents_by_id
class VIVOService(object):
    """Typeahead-style name lookup against a VIVO Solr index."""

    def __init__(self):
        # Imported locally so the module can load without mysolr installed.
        from mysolr import Solr
        self.solr = Solr(get_env('SOLR_URL'))

    def get(self, query, class_type):
        """Search `acNameStemmed` for `query`, restricted to `class_type`;
        return up to 20 matches as dicts with uri/id/text keys."""
        #Will use acNameStemmed for now. Can construct a more intelligent query
        #later if necessary.
        params = {
            'q': u'acNameStemmed:{0} type:{1}'.format(query, class_type),
            'fl': 'URI,nameRaw,PREFERRED_TITLE',
            'rows': 20
        }
        response = self.solr.search(**params)
        #Massage the Solr response.
        results = []
        for doc in response.documents:
            results.append({
                'uri': doc['URI'],
                'id': doc['URI'],
                'text': "{} - {}".format(doc['nameRaw'][0], doc['PREFERRED_TITLE'][0]),
            })
        return results
def __init__(self, exit_on_error=True,
             solr_host=settings.SOLR_HOST,
             solr_port=settings.SOLR_PORT,
             solr_collection=settings.SOLR_COLLECTION):
    # Assemble a Solr connection string from host/port/collection settings
    # and open the connection; on failure either exit or leave
    # self.connection set to False, depending on exit_on_error.
    if 'http://' not in solr_host and 'https://' not in solr_host:
        # forgiving of configurations
        solr_host = 'http://' + solr_host
    self.session = requests.Session()
    # NOTE(review): `> 1` means a one-character collection name would not get
    # its leading slash -- confirm whether `len(solr_collection) > 0` was intended.
    if len(solr_collection) > 1:
        solr_collection = '/' + solr_collection
    if solr_port == 80:
        # Port 80 is implicit; omit it from the URL.
        solr_connection_string = solr_host \
            + '/solr' + solr_collection
    else:
        solr_connection_string = solr_host + ':' + str(solr_port) \
            + '/solr' + solr_collection
    try:
        # print(solr_connection_string)
        self.connection = Solr(solr_connection_string,
                               make_request=self.session,
                               version=4)
    except requests.ConnectionError:
        print('\nError: Could not connect to Solr at: '
              + solr_connection_string +
              '\nPlease verify your Solr instance and configuration.\n')
        if exit_on_error:
            sys.exit(1)
        else:
            # Callers must check for a falsy connection before use.
            self.connection = False
class call_number_app(object):
    """Client for the Aristotle Library Apps call-number app: looks up call
    numbers via the app's JSON interface and hydrates matching bib records
    from Solr."""

    def __init__(self, **kwargs):
        """ The `call_number_app` takes a number of optional parameters
        including an URL where the Aristotle Library Apps instance is
        currently running.

        :param url: URL of Aristotle Library Apps path to the call number app,
                    defaults to http://0.0.0.0/apps/call_number/json/.
        """
        # `get` with a default replaces the deprecated dict.has_key() check.
        self.call_number_url = kwargs.get("url", "http://0.0.0.0/apps/call_number/json/")
        self.solr = Solr(base_url=settings.SOLR_URL)

    def json_search(self, request):
        """ Performs a call number search using JSON interface to the call
        number app. Results are returned as JSON.

        :param request: Django request
        """
        call_number = request.REQUEST.get('q')
        if "number_type" in request.REQUEST:
            number_type = request.REQUEST.get('number_type')
        else:
            number_type = 'lccn'
        context = {'docs': None}
        json_search_url = os.path.join(self.call_number_url, 'term_search')
        json_search_url = "{0}?call_number={1}&slice-size={2}&type={3}".format(
            json_search_url,
            call_number.strip(),
            int(settings.ITEMS_PER_PAGE) - 3,
            number_type)
        # bugfix: the URL was fetched twice (once into an unused variable);
        # fetch it once and parse the response.
        results = json.load(urllib2.urlopen(json_search_url))
        if len(results.get("bib_numbers")) > 0:
            context['docs'] = []
            for bib_num in results.get("bib_numbers"):
                query = {"q": bib_num, "qt": "dismax", "fl": "*"}
                response = self.solr.search(**query)
                for doc in response.documents:
                    context["docs"].append(doc)
            # Iterate through and create record_urls
            for doc in context['docs']:
                doc['record_url'] = settings.CATALOG_RECORD_URL.format(doc['id'])
        context['current_sort'] = None
        context['sorts'] = [x[0] for x in settings.SORTS]
        context['start_number'] = 1
        # NOTE(review): min() of a dict and an int looks wrong -- was
        # min(len(results["bib_numbers"]), ITEMS_PER_PAGE) intended?
        context['end_number'] = min(results, settings.ITEMS_PER_PAGE)
        return context
def run(self): df = pd.read_csv(self.input().open('r'), sep='\t') df['id'] = df['url'] solr = Solr('SOLR_HOST') # Index 10 docs at a time start = 0 increment = 10 while len(df[start:start + increment]) > 0: sliced = df[start:start + increment] docs = [] for index, row in sliced.iterrows(): doc = json.loads(row.to_json()) docs.append(doc) solr.update(docs, 'json') if start % 1000 == 0: # Just to see that is working print start start += increment
def delete_items(context, request):
    """Delete the given items from the index
    """
    # Reject anything that is not one of the accepted feed content-types.
    if request.content_type not in ALLOWED_CONTENT:
        body_msg = (
            "The content-type of the request must be one of the "
            "following: %s"
        ) % ", ".join(ALLOWED_CONTENT)
        return HTTPBadRequest(body=body_msg)
    solr_uri = request.registry.settings.get('push.solr_uri', None)
    if solr_uri is None:
        raise AttributeError(u'A push.solr_uri is required')
    # XXX: We are importing solr here to be able to mock it in the tests
    from mysolr import Solr
    solr = Solr(solr_uri)
    feed = feedparser.parse(request.body)
    missing = []
    removed = 0
    for entry in feed.entries:
        uid = normalize_uid(entry['id'])
        logger.debug('Deleting %s' % uid)
        # Solr is always told to delete, even when the local object is already
        # gone -- but only known uids are removed locally and counted.
        if uid in context.shared:
            del context.shared[uid]
            solr.delete_by_key(uid)
            removed += 1
        else:
            missing.append(uid)
            solr.delete_by_key(uid)
    body_msg = "Removed %s items." % removed
    if missing:
        msg_str = " %s items could not be found for deletion: %s"
        args = (len(missing), ', '.join(missing))
        msg = msg_str % args
        logger.warn(msg)
        body_msg += msg
    return HTTPOk(body=body_msg)
def atomicUpdate(chunkFile, solrURL): session = requests.Session() solr = Solr(solrURL, make_request=session, version=4) bufferDocs = [] with open(chunkFile, 'r') as inF: for docID in inF: docID = docID.strip() delta_update = { "id": docID, "dataSource_s_md": {"set": "ice"} } ## Caution change this value bufferDocs.append(delta_update) x = solr.update(bufferDocs, commit=True) if x.raw_content['responseHeader']['status'] != 0: print "Solr Commit Failed !!!! Error Status code: ", x.raw_content['responseHeader']['status'] else: print "Awesome!! Solr Commit was a Success"
def update_deletions(context, request):
    """Receive a UID from the request vars and remove the associated
    object from the deleted feed.
    """
    uid = request.POST.get('uid')
    if not uid:
        # Nothing to do without a uid.
        return
    solr_uri = request.registry.settings.get('push.solr_uri', None)
    if solr_uri is None:
        raise AttributeError(u'A push.solr_uri is required')
    from mysolr import Solr
    connection = Solr(solr_uri)
    logger.debug('Remove deleted status')
    remove_deleted_status(uid, context.shared, connection)
    return HTTPOk(body="Item no longer marked as deleted")
def __init__(self, **kwargs):
    """ The `title_search_app` takes a number of optional parameters including
    an URL where the Aristotle Library Apps instance is currently running.

    :param url: URL of Aristotle Library Apps path to the title search app,
                defaults to http://0.0.0.0/apps/title_search/search.
    """
    # `get` with a default replaces the deprecated dict.has_key() check;
    # same behavior, one lookup.
    self.url = kwargs.get("url", "http://0.0.0.0/apps/title_search/search")
    self.solr = Solr(base_url=settings.SOLR_URL)
from mysolr import Solr # Default connection to localhost:8080 solr = Solr("http://localhost:8983/solr/barcore") # All solr params are supported! query = {'q' : '*:*', 'facet' : 'true', 'facet.field' : 'zip'} response = solr.search(**query) # do stuff with documents for document in response.documents: # modify field 'foo' document['rating'] = 2.0 # update index with modified documents solr.update(response.documents, commit=True)
def setUp(self):
    # Fresh client against the local test Solr for each test case.
    self.solr = Solr('http://localhost:8983/solr')
class QueryResultTestCase(unittest.TestCase):
    """Integration tests for mysolr query, update and delete operations
    against a local test Solr seeded with four documents."""

    def setUp(self):
        self.solr = Solr('http://localhost:8983/solr')

    def test_search(self):
        response = self.solr.search(q='*:*')
        self.assertEqual(response.status, 200)
        self.assertEqual(response.total_results, 4)
        self.assertEqual(len(response.documents), 4)

    def test_search_cursor(self):
        # Fetching one document at a time should yield four responses.
        cursor = self.solr.search_cursor(q='*:*')
        seen = 0
        for response in cursor.fetch(1):
            self.assertEqual(response.status, 200)
            seen += 1
        self.assertEqual(seen, 4)
        # Fetching four at a time should yield a single response.
        cursor = self.solr.search_cursor(q='*:*')
        seen = 0
        for response in cursor.fetch(4):
            self.assertEqual(response.status, 200)
            seen += 1
        self.assertEqual(seen, 1)

    def test_commit(self):
        self.assertEqual(self.solr.commit().status, 200)

    def test_optimize(self):
        self.assertEqual(self.solr.optimize().status, 200)

    def test_ping(self):
        self.assertEqual(self.solr.ping().status, 200)

    def test_is_up(self):
        self.assertEqual(self.solr.is_up(), True)

    def test_update_delete(self):
        # Baseline document count.
        response = self.solr.search(q='*:*')
        self.assertEqual(response.status, 200)
        total_results = response.total_results
        # Post one document using json.
        response = self.solr.update([{'id': 1}], input_type='json')
        self.assertEqual(response.status, 200)
        # Post another document using xml.
        response = self.solr.update([{'id': 2}], input_type='xml')
        self.assertEqual(response.status, 200)
        # Both posts should now be visible.
        response = self.solr.search(q='*:*')
        self.assertEqual(response.status, 200)
        self.assertEqual(response.total_results, total_results + 2)
        # Remove one by query and one by key, restoring the baseline.
        response = self.solr.delete_by_query('id:1')
        self.assertEqual(response.status, 200)
        response = self.solr.delete_by_key(2)
        self.assertEqual(response.status, 200)
        response = self.solr.search(q='*:*')
        self.assertEqual(response.status, 200)
        self.assertEqual(response.total_results, total_results)

    def tearDown(self):
        pass

    def test_query(self):
        pass
class UpdateItems(object):
    """Create a new SharedItem or update it if it already exists.

    This will find all the entries, then create / update them. Then do a
    batch index to Solr.
    """

    def __init__(self, context, request):
        # Counters and buffers used for the summary message and batch index.
        self.context = context
        self.request = request
        self.create_count = 0
        self.update_count = 0
        self.messages = []
        self.to_index = []
        solr_uri = request.registry.settings.get('push.solr_uri', None)
        if solr_uri is None:
            raise AttributeError(u'A push.solr_uri is required')
        # XXX: We are importing solr here to be able to mock it in the tests
        from mysolr import Solr
        self.solr = Solr(solr_uri)
        self.shared = context.shared

    def __call__(self):
        # If the request isn't an RSS feed, bail out
        if self.request.content_type not in ALLOWED_CONTENT:
            body_msg = (
                "The content-type of the request must be one of the "
                "following: %s"
            ) % ", ".join(ALLOWED_CONTENT)
            return HTTPBadRequest(body=body_msg)
        # Create / update
        self._process_items()
        # Index in Solr
        self._update_index()
        # Return a 200 with details on what happened in the body
        self.messages.append("%s items created." % self.create_count)
        self.messages.append("%s items updated." % self.update_count)
        return HTTPOk(body=" ".join(self.messages))

    def _process_items(self):
        """Get a list of new items to create and existing items that
        need to be updated.
        """
        shared_content = feedparser.parse(self.request.body)
        for item in shared_content.entries:
            uid = item['id']
            # Get the uid, minus the urn:syndication bit
            item['uid'] = uid = normalize_uid(uid)
            logger.info('Processing item %s' % uid)
            item['link'] = item.link
            item['feed_link'] = shared_content.feed.link
            # Known uids are updated in place; unknown ones are created.
            if uid in self.shared:
                self._update_item(item)
            else:
                self._create_item(item)

    def _create_item(self, entry):
        """Create new items in the feed
        """
        new_item = SharedItem()
        uid = entry['uid']
        logger.info('Creating item %s' % uid)
        new_item.update_from_entry(entry)
        # XXX: Should name and parent be necessary here? Shouldn't
        # the `add` method do that for us?
        new_item.__name__ = uid
        new_item.__parent__ = self.shared
        self.shared.add(uid, new_item)
        self.to_index.append(self.shared[uid])
        self.create_count += 1

    def _update_item(self, entry):
        """Update existing items in the db using their UID
        """
        uid = entry['uid']
        logger.info('Updating item %s' % uid)
        obj = self.shared[uid]
        # XXX: these aren't coming from the object. Why is that? Is
        # the `add` method on the folder not setting them?
        obj.__name__ = uid
        obj.__parent__ = self.shared
        # Items coming from a 'selected' or 'shared' feed lose any
        # previously-recorded deleted status before being updated.
        selected_or_shared = (
            'selected' in entry['feed_link'] or
            'shared' in entry['feed_link']
        )
        if selected_or_shared and hasattr(obj, 'deletion_type'):
            remove_deleted_status(uid, self.shared, self.solr)
        obj.update_from_entry(entry)
        self.to_index.append(obj)
        self.update_count += 1

    def _update_index(self):
        """Clean up the item dictionaries to contain only items that
        are valid and send them over to Solr for indexing.

        NOTE: Solr may error out on index if it receives a field it is
        not aware of. We should change this code to look up the Solr
        schema, and remove attributes that it doesn't know, like
        __name__ and __parent__ below.
        """
        logger.debug('Updating index for %s objects' % len(self.to_index))
        cleaned = []
        ignored_attrs = [
            '__name__',
            '__parent__',
            'deletion_type',
        ]
        for item in self.to_index:
            # Deep-copy so the stored object's __dict__ is never mutated.
            item_dict = copy.deepcopy(item.__dict__)
            if 'Modified' in item_dict:
                if hasattr(item_dict['Modified'], 'isoformat'):
                    mod_date = item_dict['Modified'].isoformat()
                else:
                    mod_date = item_dict['Modified']
                # Make sure the date is acceptable to Solr, strip off
                # the +00:00 and replace it with a Z
                item_dict['Modified'] = "%sZ" % mod_date[:-6]
            item_dict['uid'] = item_dict['__name__']
            # XXX: Need to look up the schema, then modify the dict
            # based on that.
            for attr in ignored_attrs:
                item_dict.pop(attr, '')
            cleaned.append(item_dict)
        # XXX: Need to handle Solr errors here
        response = self.solr.update(cleaned)
        return response
import sys

database = 'fashion_ip'
collection = 'docs'

# Make a connection to Mongo.
# NOTE(review): Connection / ConnectionFailure come from pymongo and are
# imported outside this chunk.
try:
    db_conn = Connection("localhost")
    # db_conn = Connection("emo2.trinity.duke.edu", 27017)
except ConnectionFailure:
    print "couldn't connect: be sure that Mongo is running on localhost:27017"
    sys.exit(1)
db = db_conn[database]

# Solr connection used for the term-vector queries sketched below.
solr = Solr('http://emo2.trinity.duke.edu:8080/solr/')

# query = {'q':'*:*','fl':'_id','tv.tf':'true','qt':'tvrh','rows':10,'start':0}
# response = solr.search(**query)
# tv = response.raw_response['termVectors']
# tv[0] == 'warnings'
# tv[1] == [...]
# tv[2] == 'doc-0'
# tv[3] == [...]
# tv[3][0] == 'uniqueKey'
# tv[3][1] == '4f406d8347b2301618000000'
# tv[3][2] == 'content'
# tv[3][3] == ['1', ['tf', 2], '151', ['tf', 1], '157', ['tf', 1], '182', ['tf', 1], '186', ['tf', 2], ...
# tv[4] == 'uniqueKeyFieldName'
# tv[5] == '_id'
def __init__(self, urls, config, version=4):
    # Open the Solr connection; `config` is accepted but unused here --
    # presumably consumed by callers or subclasses (TODO confirm).
    self.cursor = Solr(urls, version=version)
from pymongo.errors import ConnectionFailure # import solr from mysolr import Solr # Make a connection to Mongo. try: db_conn = Connection() # db_conn = Connection("emo2.trinity.duke.edu", 27017) except ConnectionFailure: print "couldn't connect: be sure that Mongo is running on localhost:27017" sys.exit(1) db = db_conn['fashion_ip'] # create a connection to a solr server solr = Solr('http://localhost:8080/solr') # DELETE ALL DOCS FIRST!! solr.delete_by_query(query='*:*', commit=True) total_docs = db.docs.find().count() count = 0 documents = [] for doc in db.docs.find({}, { '_id': True, 'year': True, 'court': True, 'court_level': True, 'url': True, 'name': True,
from mysolr import Solr
import requests
import localConfig

# set connection through requests
# A shared requests.Session gives mysolr HTTP keep-alive / pooling.
session = requests.Session()
solr_handle = Solr(localConfig.solr_URL, make_request=session)
from mysolr import Solr # Default connection to localhost:8080 solr = Solr("http://localhost:8983/solr/barcore") # All solr params are supported! query = {'q': '*:*', 'facet': 'true', 'facet.field': 'zip'} response = solr.search(**query) # do stuff with documents for document in response.documents: # modify field 'foo' document['rating'] = 2.0 # update index with modified documents solr.update(response.documents, commit=True)
#!/usr/bin/env python import sys import os import json from mysolr import Solr PDBE_SOLR_URL = "http://www.ebi.ac.uk/pdbe/search/pdb" solr = Solr(PDBE_SOLR_URL) PY3 = sys.version > '3' if PY3: import urllib.request as urllib2 else: import urllib2 SERVER_URL = "https://www.ebi.ac.uk/pdbe/api" def join_with_AND(query_params): '''convenience function to create query string with AND''' return " AND ".join(["%s:%s" % (k, v) for k, v in query_params.items()]) def execute_solr_query(query, query_fields): '''convenience function''' query["q"] = join_with_AND(query_fields) # add q response = solr.search(**query) documents = response.documents print("Found %d matching entities in %d entries." %
from flask import Flask, request, session, g, redirect, url_for, abort, render_template, flash import sqlite3 import pdb from mysolr import Solr import requests from contextlib import closing from flask.ext.sqlalchemy import SQLAlchemy #configuration must have the full path DATABASE = 'c:/Users/Alicia/PycharmProjects/WorldValues/worldvalues.db' DEBUG = True SECRET_KEY = 'development key' USERNAME = '******' PASSWORD = '******' solr = Solr('http://localhost:8983/solr/#/collection1') app = Flask(__name__) app.debug = True app.config.from_object(__name__) def connect_db(): return sqlite3.connect(app.config['DATABASE']) @app.before_request def before_request(): g.db = connect_db() @app.teardown_request
#User Guide #Connecting to Solr #Use mysolr.Solr object to connect to a Solr instance. from mysolr import Solr # Default connection. Connecting to http://localhost:8080/solr/ solr = Solr() # Custom connection solr = Solr('http://foo.bar:9090/solr/') # If the server is secured with HTTP basic authentication you can connect by using auth parameter. from mysolr import Solr solr = Solr(auth=('admin', 'admin')) #Further information about auth parameter in requests docs #Queriying to Solr #Making a query to Solr is very easy, just call search method with your query. from mysolr import Solr solr = Solr() # Search for all documents response = solr.search(q='*:*') # Get documents documents = response.documents #Besides, all available Solr query params are supported. So making a query using pagination would be as simple as
def query_solr(query):
    """Run `query` against the default local Solr instance and return the matching documents."""
    return Solr().search(q=query).documents
from mysolr import Solr
import requests

# set connection through requests
# A shared requests.Session provides connection pooling / keep-alive
# for all calls made through this Solr handle.
session = requests.Session()
solr_handle = Solr('http://localhost:8080/solr/search', make_request=session)
def setUp(self):
    # Solr endpoint comes from the environment so tests can target any instance.
    self.solr = Solr(os.getenv('SOLR_URL'))
class eBsolr:
    """Convenience wrapper around a mysolr connection: CRUD helpers,
    bulk-fetching queries, and facet utilities."""

    cursor = None  # mysolr.Solr connection, set in __init__

    def __init__(self, urls, config, version=4):
        # `config` is accepted for interface compatibility; not used here.
        self.cursor = Solr(urls, version=version)

    def update(self, documents, input_type='json', commit=False):
        """Index `documents`; now returns the Solr response, consistent with
        the delete helpers (previously the response was discarded)."""
        return self.cursor.update(documents, input_type, commit)

    def deleteById(self, tid, commit=False):
        """Delete a single document by unique key."""
        return self.cursor.delete_by_key(tid, commit=commit)

    def deleteByQuery(self, query, commit=False):
        """Delete every document matching `query`."""
        return self.cursor.delete_by_query(query=query, commit=commit)

    def deleteAll(self, commit=False):
        """Delete every document in the index."""
        return self.cursor.delete_by_query("*:*", commit=commit)

    def getResponse(self, search, fields=None, start=0, rows=None, sort=None, fq=None):
        """Run a search and return the raw mysolr response.

        When `rows` is None, up to _MAXROWS rows are requested first; if the
        index holds more, the query is re-issued sized to the full result
        count. Raises Exception on any Solr error status (>= 400).
        """
        query = {'q': search}
        if fields:
            if isinstance(fields, basestring):
                query['fl'] = fields
            else:
                query['fl'] = ",".join(fields)
        if sort:
            query['sort'] = sort
        if fq:
            query['fq'] = fq
        # Default to the module-wide row cap when the caller gave no limit.
        limit = rows
        if rows is None:
            limit = _MAXROWS
        query['start'] = start
        query['rows'] = limit
        response = self.cursor.search(**query)
        if int(response.status) >= 400:
            raise Exception('Error Solr {}: {}'.format(response.status, response.extract_errmessage()))
        if rows is None and response.total_results > limit:
            # More rows exist than the cap: re-fetch everything in one go.
            query['rows'] = response.total_results
            response = self.cursor.search(**query)
        return response

    def get_language_query(self, language):
        """Build an OR-ed `language:` filter from a ';'-separated language
        list, or return None when `language` is empty."""
        q_temp = None
        if language is not None and language != "":
            langArray = language.split(';')
            if len(langArray) > 0:
                lang = langArray[0]
                q_temp = "language:%s" % lang
                for lang in langArray[1:]:
                    q_temp = "%s OR language:%s" % (q_temp, lang)
        return q_temp

    def getDocs(self, search, fields=None, start=0, rows=None, sort=None, fq=None):
        """search: query syntax, e.g. "field:keys,field2:keys2"
        fields: fields to return (list), e.g. ['field', 'field2']
        start: start row
        rows: max / limit row
        sort: row ordering, e.g. "field asc, field2 desc" """
        # Get documents
        response = self.getResponse(search, fields, start, rows, sort, fq)
        return {"docs": response.documents, "count": response.total_results}

    def getFacetList(self, facets, facetField):
        """Map each requested facet field to its entry in
        facets['facet_fields']. `facetField` may be a list or a
        comma-separated string; empty names are skipped."""
        ff = {}
        if not isinstance(facetField, list):
            facetField = facetField.split(",")
        for facet in facetField:
            if facet:
                ff[facet] = facets['facet_fields'][facet]
        return ff

    def getFacetPivotGeneral(self, query, facetField, pivotField, limit=None, fq=None):
        """Fetch facet-pivot counts for facetField,pivotField via a raw HTTP
        GET against the select handler. Returns the pivot list, or None on
        any error."""
        try:
            # bugfix: a hand-built URL (with rows=1 and manual escaping) was
            # constructed here and then immediately overwritten; build the
            # request once via params instead.
            url = '{}select'.format(self.cursor.base_url)
            params = {'q': query, 'rows': 0, 'wt': 'json', 'indent': 'true',
                      'facet': 'true',
                      'facet.pivot': '{},{}'.format(facetField, pivotField)}
            if limit:
                params['facet.limit'] = limit
            if fq:
                params['fq'] = fq
            http_response = requests.get(url, params=params)
            return http_response.json()['facet_counts']['facet_pivot']['{0},{1}'.format(facetField, pivotField)]
        except Exception as e:
            # `as e` replaces the Python-2-only comma syntax; print(e) is
            # equivalent in both Python 2 and 3.
            print("Error parsing facet pivot...")
            print(e)
            return None
import time import socket import xml.parsers.expat #import sunburnt from mysolr import Solr from Resource.ResourceHelper import ResourceHelper from Resource.Resource import Resource from Util.PathTool import PathTool from Digester.FeedDictFactory import FeedDictFactory solrBase = "http://localhost:8983/solr/" updateUrl = solrBase + 'update/' solr = Solr(solrBase) _pt = PathTool.PathTool() _rh = ResourceHelper() feeds = _rh.getAllFeedPaths() for feed in feeds: try: feedDictFactory = FeedDictFactory() feedDict = feedDictFactory.getFeedDict(feed) if feedDict != None and feedDict != {}: feedDict['id'] = Resource(feed, 'feed').get_id() print(feedDict['id']) print("Indexing", feedDict) solr.update([feedDict], 'json', commit=True) print('Indexed.')
return self._stemmer.stem(word).lower() # Make a connection to Mongo. try: # db_conn = Connection("localhost", 27017) db_conn = Connection("emo2.trinity.duke.edu", 27017) except ConnectionFailure: print "couldn't connect: be sure that Mongo is running on localhost:27017" # sys.stdout.flush() sys.exit(1) db = db_conn['fashion_ip'] # Connection to Solr for faster full text searching solr = Solr('http://localhost:8080/solr') qstring = sys.argv[1] pir_re = re.compile(r'.* ' + qstring + '.*', re.IGNORECASE) porter = nltk.PorterStemmer() for year in range(1900, 2013): print '\nYEAR: ', year response = solr.search(q=qstring + ' year:' + str(year), fl='_id,score', rows=10000, start=0) documents = response.documents
class QueryResultTestCase(unittest.TestCase):
    """Round-trip integration tests for the mysolr client.

    Assumes a live Solr instance at http://localhost:8983/solr whose
    index holds exactly 4 documents before the suite runs.
    """

    def setUp(self):
        self.solr = Solr('http://localhost:8983/solr')

    def test_search(self):
        result = self.solr.search(q='*:*')
        self.assertEqual(result.status, 200)
        self.assertEqual(result.total_results, 4)
        self.assertEqual(len(result.documents), 4)

    def test_search_cursor(self):
        # Page size 1 over 4 documents -> exactly 4 fetches.
        pages = 0
        for page in self.solr.search_cursor(q='*:*').fetch(1):
            self.assertEqual(page.status, 200)
            pages += 1
        self.assertEqual(pages, 4)
        # Page size 4 over 4 documents -> a single fetch.
        pages = 0
        for page in self.solr.search_cursor(q='*:*').fetch(4):
            self.assertEqual(page.status, 200)
            pages += 1
        self.assertEqual(pages, 1)

    def test_commit(self):
        self.assertEqual(self.solr.commit().status, 200)

    def test_optimize(self):
        self.assertEqual(self.solr.optimize().status, 200)

    def test_ping(self):
        self.assertEqual(self.solr.ping().status, 200)

    def test_is_up(self):
        self.assertEqual(self.solr.is_up(), True)

    def test_update_delete(self):
        # Record the baseline document count.
        baseline = self.solr.search(q='*:*')
        self.assertEqual(baseline.status, 200)
        total_results = baseline.total_results

        # Add one document through the JSON handler...
        self.assertEqual(
            self.solr.update([{'id': 1}], input_type='json').status, 200)
        # ...and another through the XML handler.
        self.assertEqual(
            self.solr.update([{'id': 2}], input_type='xml').status, 200)

        # Both additions should now be visible.
        after_add = self.solr.search(q='*:*')
        self.assertEqual(after_add.status, 200)
        self.assertEqual(after_add.total_results, total_results + 2)

        # Remove them again: one by query, one by key.
        self.assertEqual(self.solr.delete_by_query('id:1').status, 200)
        self.assertEqual(self.solr.delete_by_key(2).status, 200)

        # Back to the baseline count.
        after_delete = self.solr.search(q='*:*')
        self.assertEqual(after_delete.status, 200)
        self.assertEqual(after_delete.total_results, total_results)

    def tearDown(self):
        pass

    def test_query(self):
        pass
def __init__(self, url):
    """Remember the Solr endpoint *url* and open a mysolr connection to it."""
    self.url = url        # raw endpoint URL, kept for later reference
    self.conn = Solr(url) # live mysolr.Solr handle used for all requests
def get_solr_count(query):
    """Return the total number of Solr hits for *query*.

    *query* is a dict of keyword arguments forwarded verbatim to
    mysolr's ``search`` (e.g. ``{'q': '*:*'}``); only the reported
    ``total_results`` is returned, not the documents themselves.
    """
    response = Solr(SOLR_SERVER).search(**query)
    return response.total_results
def __init__(self, solrAddr):
    '''
    Open a mysolr connection to the Solr server at *solrAddr*
    and keep it on the instance for later queries.
    '''
    self.solr = Solr(solrAddr)
class UpdateItems(object):
    """Create or update SharedItems from a pushed RSS feed.

    Parses the request body with feedparser, creates new SharedItems or
    updates existing ones (keyed by normalized uid), then batch-indexes
    everything touched into Solr and reports the counts in the response.
    """

    def __init__(self, context, request):
        self.context = context
        self.request = request
        self.create_count = 0   # items newly created this request
        self.update_count = 0   # existing items updated this request
        self.messages = []      # human-readable summary lines for the response
        self.to_index = []      # objects queued for the final Solr batch
        solr_uri = request.registry.settings.get('push.solr_uri', None)
        if solr_uri is None:
            raise AttributeError(u'A push.solr_uri is required')
        # XXX: We are importing solr here to be able to mock it in the tests
        from mysolr import Solr
        self.solr = Solr(solr_uri)
        self.shared = context.shared

    def __call__(self):
        """Process the pushed feed; returns HTTPOk with counts, or
        HTTPBadRequest when the content type is not an accepted feed type."""
        # If the request isn't an RSS feed, bail out
        if self.request.content_type not in ALLOWED_CONTENT:
            body_msg = (
                "The content-type of the request must be one of the "
                "following: %s"
            ) % ", ".join(ALLOWED_CONTENT)
            return HTTPBadRequest(body=body_msg)
        # Create / update
        self._process_items()
        # Index in Solr
        self._update_index()
        # Return a 200 with details on what happened in the body
        self.messages.append("%s items created." % self.create_count)
        self.messages.append("%s items updated." % self.update_count)
        return HTTPOk(body=" ".join(self.messages))

    def _process_items(self):
        """Walk the parsed feed entries and dispatch each to create/update
        depending on whether its uid is already in the shared folder."""
        shared_content = feedparser.parse(self.request.body)
        for item in shared_content.entries:
            uid = item['id']
            # Get the uid, minus the urn:syndication bit
            item['uid'] = uid = normalize_uid(uid)
            logger.info('Processing item %s' % uid)
            item['link'] = item.link
            item['feed_link'] = shared_content.feed.link
            if uid in self.shared:
                self._update_item(item)
            else:
                self._create_item(item)

    def _create_item(self, entry):
        """Create a new SharedItem from *entry* and queue it for indexing."""
        new_item = SharedItem()
        uid = entry['uid']
        logger.info('Creating item %s' % uid)
        new_item.update_from_entry(entry)
        # XXX: Should name and parent be necessary here? Shouldn't
        # the `add` method do that for us?
        new_item.__name__ = uid
        new_item.__parent__ = self.shared
        self.shared.add(uid, new_item)
        # Re-fetch through the folder so the indexed object is the stored one.
        self.to_index.append(self.shared[uid])
        self.create_count += 1

    def _update_item(self, entry):
        """Update the existing SharedItem for *entry* (matched by uid) and
        queue it for indexing."""
        uid = entry['uid']
        logger.info('Updating item %s' % uid)
        obj = self.shared[uid]
        # XXX: these aren't coming from the object. Why is that? Is
        # the `add` method on the folder not setting them?
        obj.__name__ = uid
        obj.__parent__ = self.shared
        selected_or_shared = (
            'selected' in entry['feed_link'] or
            'shared' in entry['feed_link']
        )
        # A previously deleted item reappearing in a selected/shared feed
        # has its deleted status cleared before the update is applied.
        if selected_or_shared and hasattr(obj, 'deletion_type'):
            remove_deleted_status(uid, self.shared, self.solr)
        obj.update_from_entry(entry)
        self.to_index.append(obj)
        self.update_count += 1

    def _update_index(self):
        """Massage the queued objects' dicts into Solr-friendly shape and
        send them in one batch update.

        NOTE: Solr may error out on index if it receives a field it is
        not aware of. We should change this code to look up the Solr
        schema, and remove attributes that it doesn't know, like
        __name__ and __parent__ below.
        """
        logger.debug('Updating index for %s objects' % len(self.to_index))
        cleaned = []
        ignored_attrs = [
            '__name__',
            '__parent__',
            'deletion_type',
        ]
        for item in self.to_index:
            item_dict = copy.deepcopy(item.__dict__)
            if 'Modified' in item_dict:
                if hasattr(item_dict['Modified'], 'isoformat'):
                    mod_date = item_dict['Modified'].isoformat()
                else:
                    mod_date = item_dict['Modified']
                # Make sure the date is acceptable to Solr, strip off
                # the +00:00 and replace it with a Z
                # NOTE(review): [:-6] assumes a "+00:00"-style offset suffix is
                # always present — confirm upstream dates are tz-aware.
                item_dict['Modified'] = "%sZ" % mod_date[:-6]
            if 'content' in item_dict:
                # NOTE(review): the comprehension variable shadows (and, under
                # Python 2 scoping, leaks over) the outer `item` loop variable;
                # harmless here since `item` isn't read again this iteration,
                # but worth renaming.
                items = [item['value'] for item in item_dict['content']]
                if items:
                    # XXX: use first content item, discard the rest
                    item_dict['content'] = items[0]
            # Must be captured before __name__ is popped just below.
            item_dict['uid'] = item_dict['__name__']
            # XXX: Need to look up the schema, then modify the dict
            # based on that.
            for attr in ignored_attrs:
                item_dict.pop(attr, '')
            cleaned.append(item_dict)
        # XXX: Need to handle Solr errors here
        response = self.solr.update(cleaned)
        return response
from pymongo.errors import ConnectionFailure # import solr from mysolr import Solr # Make a connection to Mongo. try: db_conn = Connection() # db_conn = Connection("emo2.trinity.duke.edu", 27017) except ConnectionFailure: print "couldn't connect: be sure that Mongo is running on localhost:27017" sys.exit(1) db = db_conn['fashion_ip'] # create a connection to a solr server solr = Solr('http://localhost:8080/solr') # DELETE ALL DOCS FIRST!! solr.delete_by_query(query='*:*', commit=True) total_docs = db.docs.find().count() count = 0 documents = [] for doc in db.docs.find({},{'_id':True,'year':True,'court':True,'court_level':True,'url':True,'name':True,'content':True,'tags':True,'subjects':True}): if count%100 == 0: print count # don't know how else to get solr to take IDs... doc['_id'] = str(doc['_id']) # include subject tag in list of strings if weigth greater than 0.01