class LogIndexer(object):
    ''' classdocs '''

    def __init__(self, solrAddr):
        ''' Constructor '''
        self.solr = Solr(solrAddr)

    def index(self, data):
        for key, value in data.items():
            if isinstance(value, datetime.datetime):
                try:
                    value = solr.core.utc_to_string(value)
                except Exception:
                    # Naive datetime: attach the Paris timezone before converting
                    pst = tz.gettz('Europe/Paris')
                    value = value.replace(tzinfo=pst)
                    value = solr.core.utc_to_string(value)
                data[key] = value
        try:
            self.solr.update([data])
        except Exception:
            print "Error in index request"
        self.solr.commit()
        print "data indexed"
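A hypothetical call against the LogIndexer above, assuming the class sits next to imports for datetime, dateutil.tz and mysolr; the core URL and the field names are placeholders:

import datetime

indexer = LogIndexer('http://localhost:8983/solr/logs')       # placeholder core URL
indexer.index({'id': 'log-1',                                  # placeholder log document
               'message_t': 'service started',
               'timestamp_dt': datetime.datetime.utcnow()})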
class SolrUtils:
    def __init__(self, url):
        self.url = url
        self.conn = Solr(url)

    def addJSONDoc(self, doc):
        self.conn.update(doc, 'json', commit=False)

    def commit(self):
        self.conn.commit()
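A minimal usage sketch for the SolrUtils wrapper above, assuming it is defined next to a `from mysolr import Solr` import; the core URL and field names are placeholders, and the document goes in a list because mysolr's update() expects a list of dicts:

utils = SolrUtils('http://localhost:8983/solr/mycore')     # placeholder core URL
utils.addJSONDoc([{'id': 'doc-1', 'title_t': 'hello'}])    # buffered add (commit=False)
utils.commit()                                             # make the add visible to searches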
def post_to_solr( self, solr_dict ):
    """ Posts solr_dict to solr. """
    SOLR_ROOT_URL = os.environ.get( 'BELL_I_SOLR_ROOT' )
    solr = Solr( SOLR_ROOT_URL )
    # 'xml' param converts the default json to xml for the post; required for our old version of solr
    response = solr.update( [solr_dict], 'xml', commit=True )
    response_status = response.status
    self.logger.info(
        'in tasks.indexer.post_to_solr() [for custom-solr]; accession_number, %s; response_status, %s' % (
            solr_dict['accession_number_original'], response_status ) )
    if not response_status == 200:
        raise Exception( 'custom-solr post problem logged' )
    return response_status
def post_to_solr( self, solr_dict ):
    """ Posts solr_dict to solr.
        Called by update_custom_index_entry() """
    solr = Solr( self.CUSTOM_INDEX_SOLR_URL_ROOT )
    # 'xml' param converts the default json to xml for the post; required for our old version of solr
    response = solr.update( [solr_dict], 'xml', commit=True )
    response_status = response.status
    self.logger.info(
        'in tasks.indexer.CustomIndexUpdater.post_to_solr() [for custom-solr]; accession_number, %s; response_status, %s' % (
            solr_dict['accession_number_original'], response_status ) )
    if not response_status == 200:
        raise Exception( 'custom-solr post problem logged' )
    return response_status
def run(self):
    df = pd.read_csv(self.input().open('r'), sep='\t')
    df['id'] = df['url']
    solr = Solr('SOLR_HOST')
    # Index 10 docs at a time
    start = 0
    increment = 10
    while len(df[start:start + increment]) > 0:
        sliced = df[start:start + increment]
        docs = []
        for index, row in sliced.iterrows():
            doc = json.loads(row.to_json())
            docs.append(doc)
        solr.update(docs, 'json')
        if start % 1000 == 0:
            # Just to see that it is working
            print start
        start += increment
def atomicUpdate(chunkFile, solrURL):
    session = requests.Session()
    solr = Solr(solrURL, make_request=session, version=4)
    bufferDocs = []
    with open(chunkFile, 'r') as inF:
        for docID in inF:
            docID = docID.strip()
            delta_update = {
                "id": docID,
                "dataSource_s_md": {"set": "ice"}  # Caution: change this value
            }
            bufferDocs.append(delta_update)
    x = solr.update(bufferDocs, commit=True)
    if x.raw_content['responseHeader']['status'] != 0:
        print "Solr Commit Failed !!!! Error Status code: ", x.raw_content['responseHeader']['status']
    else:
        print "Awesome!! Solr Commit was a Success"
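Each buffered document above uses Solr's atomic-update syntax (a {"set": value} instruction against an existing id), so only that single field is rewritten. A hypothetical invocation with placeholder paths:

# ids_chunk_000.txt is a hypothetical file containing one Solr document id per line
atomicUpdate('ids_chunk_000.txt', 'http://localhost:8983/solr/collection1/')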
class eBsolr:
    cursor = None

    def __init__(self, urls, config, version=4):
        self.cursor = Solr(urls, version=version)

    def update(self, documents, input_type='json', commit=False):
        self.cursor.update(documents, input_type, commit)

    def deleteById(self, tid, commit=False):
        return self.cursor.delete_by_key(tid, commit=commit)

    def deleteByQuery(self, query, commit=False):
        return self.cursor.delete_by_query(query=query, commit=commit)

    def deleteAll(self, commit=False):
        return self.cursor.delete_by_query("*:*", commit=commit)

    def getResponse(self, search, fields=None, start=0, rows=None, sort=None, fq=None):
        query = {'q': search}
        if fields:
            if isinstance(fields, basestring):
                query['fl'] = fields
            else:
                query['fl'] = ",".join(fields)
        if sort:
            query['sort'] = sort
        if fq:
            query['fq'] = fq
        # Default to 10000 rows
        limit = rows
        if rows is None:
            limit = _MAXROWS
        query['start'] = start
        query['rows'] = limit
        response = self.cursor.search(**query)
        if int(response.status) >= 400:
            raise Exception('Error Solr {}: {}'.format(response.status, response.extract_errmessage()))
        if rows is None and response.total_results > limit:
            # Re-query for all rows once the real total is known
            query['rows'] = response.total_results
            response = self.cursor.search(**query)
        return response

    def get_language_query(self, language):
        q_temp = None
        if language is not None and language != "":
            langArray = language.split(';')
            if len(langArray) > 0:
                lang = langArray[0]
                q_temp = "language:%s" % lang
                for lang in langArray[1:]:
                    q_temp = "%s OR language:%s" % (q_temp, lang)
        return q_temp

    def getDocs(self, search, fields=None, start=0, rows=None, sort=None, fq=None):
        """search: query syntax, e.g. "field:keys,field2:keys2"
        fields: fields to retrieve (list), e.g. ['field', 'field2']
        start: start row
        rows: max / limit rows
        sort: row ordering, e.g. "field asc, field2 desc" """
        # Get documents
        response = self.getResponse(search, fields, start, rows, sort, fq)
        return {"docs": response.documents, "count": response.total_results}

    def getFacetList(self, facets, facetField):
        ff = {}
        if not isinstance(facetField, list):
            facetField = facetField.split(",")
        for facet in facetField:
            if facet:
                ff[facet] = facets['facet_fields'][facet]
        return ff

    def getFacetPivotGeneral(self, query, facetField, pivotField, limit=None, fq=None):
        try:
            url = '{}select'.format(self.cursor.base_url)
            params = {'q': query, 'rows': 0, 'wt': 'json', 'indent': 'true',
                      'facet': 'true', 'facet.pivot': '{},{}'.format(facetField, pivotField)}
            if limit:
                params['facet.limit'] = limit
            if fq:
                params['fq'] = fq
            http_response = requests.get(url, params=params)
            return http_response.json()['facet_counts']['facet_pivot']['{0},{1}'.format(facetField, pivotField)]
        except Exception, e:
            print("Error parsing facet pivot...")
            print e
            return None
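A brief usage sketch for the eBsolr wrapper above; the core URL, the unused config argument, and the field names are placeholders, and rows is passed explicitly so the module-level _MAXROWS constant is not needed:

eb = eBsolr('http://localhost:8983/solr/collection1', config=None)   # placeholder URL / config
eb.update([{'id': 'doc-1', 'language': 'en'}], commit=True)          # index one placeholder document
result = eb.getDocs('language:en', fields=['id', 'language'], rows=10, sort='id asc')
print(result['count'])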
from mysolr import Solr

# Connection to a local Solr core
solr = Solr("http://localhost:8983/solr/barcore")

# All solr params are supported!
query = {'q': '*:*', 'facet': 'true', 'facet.field': 'zip'}
response = solr.search(**query)

# do stuff with documents
for document in response.documents:
    # modify field 'rating'
    document['rating'] = 2.0

# update index with modified documents
solr.update(response.documents, commit=True)
class QueryResultTestCase(unittest.TestCase):

    def setUp(self):
        self.solr = Solr('http://localhost:8983/solr')

    def test_search(self):
        response = self.solr.search(q='*:*')
        self.assertEqual(response.status, 200)
        self.assertEqual(response.total_results, 4)
        self.assertEqual(len(response.documents), 4)

    def test_search_cursor(self):
        cursor = self.solr.search_cursor(q='*:*')
        i = 0
        for response in cursor.fetch(1):
            self.assertEqual(response.status, 200)
            i += 1
        self.assertEqual(i, 4)

        cursor = self.solr.search_cursor(q='*:*')
        i = 0
        for response in cursor.fetch(4):
            self.assertEqual(response.status, 200)
            i += 1
        self.assertEqual(i, 1)

    def test_commit(self):
        response = self.solr.commit()
        self.assertEqual(response.status, 200)

    def test_optimize(self):
        response = self.solr.optimize()
        self.assertEqual(response.status, 200)

    def test_ping(self):
        response = self.solr.ping()
        self.assertEqual(response.status, 200)

    def test_is_up(self):
        response = self.solr.is_up()
        self.assertEqual(response, True)

    def test_update_delete(self):
        # Get total results
        response = self.solr.search(q='*:*')
        self.assertEqual(response.status, 200)
        total_results = response.total_results

        # Post one document using json
        documents = [{'id': 1}]
        response = self.solr.update(documents, input_type='json')
        self.assertEqual(response.status, 200)

        # Post another document using xml
        documents = [{'id': 2}]
        response = self.solr.update(documents, input_type='xml')
        self.assertEqual(response.status, 200)

        # Compare total results
        response = self.solr.search(q='*:*')
        self.assertEqual(response.status, 200)
        self.assertEqual(response.total_results, total_results + 2)

        # Now delete the two documents posted above
        query = 'id:1'
        key = 2
        response = self.solr.delete_by_query(query)
        self.assertEqual(response.status, 200)
        response = self.solr.delete_by_key(key)
        self.assertEqual(response.status, 200)

        response = self.solr.search(q='*:*')
        self.assertEqual(response.status, 200)
        self.assertEqual(response.total_results, total_results)

    def tearDown(self):
        pass

    def test_query(self):
        pass
import xml.parsers.expat

from mysolr import Solr
from Resource.ResourceHelper import ResourceHelper
from Resource.Resource import Resource
from Util.PathTool import PathTool
from Digester.FeedDictFactory import FeedDictFactory

solrBase = "http://localhost:8983/solr/"
updateUrl = solrBase + 'update/'
solr = Solr(solrBase)

_pt = PathTool.PathTool()
_rh = ResourceHelper()
feeds = _rh.getAllFeedPaths()
for feed in feeds:
    try:
        feedDictFactory = FeedDictFactory()
        feedDict = feedDictFactory.getFeedDict(feed)
        if feedDict != None and feedDict != {}:
            feedDict['id'] = Resource(feed, 'feed').get_id()
            print(feedDict['id'])
            print("Indexing", feedDict)
            solr.update([feedDict], 'json', commit=True)
            print('Indexed.')
    except (xml.parsers.expat.ExpatError, ValueError):
        print(("Failed:", feed))
print("done")
    },
    {'q': 'foo:bar'}
]
# using 10 threads
responses = solr.async_search(queries, size=10)

# See the installation section for further information about how to install this feature.

# Indexing documents
from mysolr import Solr

solr = Solr()

# Create documents
documents = [
    {'id': 1, 'field1': 'foo'},
    {'id': 2, 'field2': 'bar'}
]
# Index using json is faster!
solr.update(documents, 'json', commit=False)

# Manual commit
solr.commit()
class QueryResultTestCase(unittest.TestCase):

    def setUp(self):
        self.solr = Solr(os.getenv("SOLR_URL"))

    def test_search(self):
        response = self.solr.search(q="*:*")
        self.assertEqual(response.status, 200)
        self.assertEqual(response.total_results, 4)
        self.assertEqual(len(response.documents), 4)

    def test_search_cursor(self):
        cursor = self.solr.search_cursor(q="*:*")
        i = 0
        for response in cursor.fetch(1):
            self.assertEqual(response.status, 200)
            i += 1
        self.assertEqual(i, 4)

        cursor = self.solr.search_cursor(q="*:*")
        i = 0
        for response in cursor.fetch(4):
            self.assertEqual(response.status, 200)
            i += 1
        self.assertEqual(i, 1)

    def test_commit(self):
        response = self.solr.commit()
        self.assertEqual(response.status, 200)

    def test_optimize(self):
        response = self.solr.optimize()
        self.assertEqual(response.status, 200)

    def test_ping(self):
        response = self.solr.ping()
        self.assertEqual(response.status, 200)

    def test_is_up(self):
        response = self.solr.is_up()
        self.assertEqual(response, True)

    def test_update_delete(self):
        # Get total results
        response = self.solr.search(q="*:*")
        self.assertEqual(response.status, 200)
        total_results = response.total_results

        # Post one document using json
        documents = [{"id": 1}]
        response = self.solr.update(documents, input_type="json")
        self.assertEqual(response.status, 200)

        # Post another document using xml
        documents = [{"id": 2}]
        response = self.solr.update(documents, input_type="xml")
        self.assertEqual(response.status, 200)

        # Compare total results
        response = self.solr.search(q="*:*")
        self.assertEqual(response.status, 200)
        self.assertEqual(response.total_results, total_results + 2)

        # Now delete the two documents posted above
        query = "id:1"
        key = 2
        response = self.solr.delete_by_query(query)
        self.assertEqual(response.status, 200)
        response = self.solr.delete_by_key(key)
        self.assertEqual(response.status, 200)

        response = self.solr.search(q="*:*")
        self.assertEqual(response.status, 200)
        self.assertEqual(response.total_results, total_results)

    def tearDown(self):
        pass

    def test_query(self):
        pass
class UpdateItems(object):
    """Create a new SharedItem or update it if it already exists.

    This will find all the entries, then create / update them. Then do a
    batch index to Solr.
    """

    def __init__(self, context, request):
        self.context = context
        self.request = request
        self.create_count = 0
        self.update_count = 0
        self.messages = []
        self.to_index = []
        solr_uri = request.registry.settings.get('push.solr_uri', None)
        if solr_uri is None:
            raise AttributeError(u'A push.solr_uri is required')
        # XXX: We are importing solr here to be able to mock it in the tests
        from mysolr import Solr
        self.solr = Solr(solr_uri)
        self.shared = context.shared

    def __call__(self):
        # If the request isn't an RSS feed, bail out
        if self.request.content_type not in ALLOWED_CONTENT:
            body_msg = (
                "The content-type of the request must be one of the "
                "following: %s"
            ) % ", ".join(ALLOWED_CONTENT)
            return HTTPBadRequest(body=body_msg)
        # Create / update
        self._process_items()
        # Index in Solr
        self._update_index()
        # Return a 200 with details on what happened in the body
        self.messages.append("%s items created." % self.create_count)
        self.messages.append("%s items updated." % self.update_count)
        return HTTPOk(body=" ".join(self.messages))

    def _process_items(self):
        """Get a list of new items to create and existing items that need
        to be updated.
        """
        shared_content = feedparser.parse(self.request.body)
        for item in shared_content.entries:
            uid = item['id']
            # Get the uid, minus the urn:syndication bit
            item['uid'] = uid = normalize_uid(uid)
            logger.info('Processing item %s' % uid)
            item['link'] = item.link
            item['feed_link'] = shared_content.feed.link
            if uid in self.shared:
                self._update_item(item)
            else:
                self._create_item(item)

    def _create_item(self, entry):
        """Create new items in the feed
        """
        new_item = SharedItem()
        uid = entry['uid']
        logger.info('Creating item %s' % uid)
        new_item.update_from_entry(entry)
        # XXX: Should name and parent be necessary here? Shouldn't
        #      the `add` method do that for us?
        new_item.__name__ = uid
        new_item.__parent__ = self.shared
        self.shared.add(uid, new_item)
        self.to_index.append(self.shared[uid])
        self.create_count += 1

    def _update_item(self, entry):
        """Update existing items in the db using their UID
        """
        uid = entry['uid']
        logger.info('Updating item %s' % uid)
        obj = self.shared[uid]
        # XXX: these aren't coming from the object. Why is that? Is
        #      the `add` method on the folder not setting them?
        obj.__name__ = uid
        obj.__parent__ = self.shared
        selected_or_shared = (
            'selected' in entry['feed_link'] or
            'shared' in entry['feed_link']
        )
        if selected_or_shared and hasattr(obj, 'deletion_type'):
            remove_deleted_status(uid, self.shared, self.solr)
        obj.update_from_entry(entry)
        self.to_index.append(obj)
        self.update_count += 1

    def _update_index(self):
        """Clean up the item dictionaries to contain only items that are
        valid and send them over to Solr for indexing.

        NOTE: Solr may error out on index if it receives a field it is
        not aware of. We should change this code to look up the Solr
        schema, and remove attributes that it doesn't know, like
        __name__ and __parent__ below.
        """
        logger.debug('Updating index for %s objects' % len(self.to_index))
        cleaned = []
        ignored_attrs = [
            '__name__',
            '__parent__',
            'deletion_type',
        ]
        for item in self.to_index:
            item_dict = copy.deepcopy(item.__dict__)
            if 'Modified' in item_dict:
                if hasattr(item_dict['Modified'], 'isoformat'):
                    mod_date = item_dict['Modified'].isoformat()
                else:
                    mod_date = item_dict['Modified']
                # Make sure the date is acceptable to Solr, strip off
                # the +00:00 and replace it with a Z
                item_dict['Modified'] = "%sZ" % mod_date[:-6]
            if 'content' in item_dict:
                items = [item['value'] for item in item_dict['content']]
                if items:
                    # XXX: use first content item, discard the rest
                    item_dict['content'] = items[0]
            item_dict['uid'] = item_dict['__name__']
            # XXX: Need to look up the schema, then modify the dict
            #      based on that.
            for attr in ignored_attrs:
                item_dict.pop(attr, '')
            cleaned.append(item_dict)
        # XXX: Need to handle Solr errors here
        response = self.solr.update(cleaned)
        return response
        '_id': True,
        'year': True,
        'court': True,
        'court_level': True,
        'url': True,
        'name': True,
        'content': True,
        'tags': True,
        'subjects': True
        }):
    if count % 100 == 0:
        print count
    # don't know how else to get solr to take IDs...
    doc['_id'] = str(doc['_id'])
    # include subject tag in list of strings if weight is greater than 0.05
    if 'subjects' in doc:
        sub_tmp = [k for k, v in doc['subjects'].items() if v >= 0.05]
        doc['subjects'] = sub_tmp
    count += 1
    documents.append(doc)

# json indexing is supposed to be faster;
# at least with mysolr, posting them as one big list is much faster for 18300 docs:
# 3 minutes vs 1 min 53 sec
print "updating..."
solr.update(documents, 'json', commit=False)
print "committing..."
solr.commit()
print "done..."
def index():
    client = MongoClient(host=HOST, port=PORT)
    db = client['crawl']
    coll = db['web']
    server = Solr(SERVER)
    max_indexed_id = get_max_indexed_id(server)
    if not max_indexed_id:
        max_indexed_id = ObjectId('000000000000')
    else:
        max_indexed_id = ObjectId(max_indexed_id)
    sites = get_host_name()
    step = 100
    count = 0
    jdocs = []
    for row in coll.find({'_id': {'$gt': max_indexed_id}}).sort([('_id', 1)]):
        jdoc = {}
        jdoc['id'] = str(row['_id'])
        if len(jdocs) == 0:
            start = row['_id']
        jdoc['url'] = row['curi:url']
        jdoc['site'] = sites[get_url_domain(row['curi:url'])]
        jdoc['ip'] = row['curi:ip']
        if 'curi:processed_at' in row:
            jdoc['processed_at'] = row['curi:processed_at']
        if 'content_type' in row:
            jdoc['content_type'] = row['content_type']
        if 'content_length' in row:
            jdoc['content_length'] = row['content_length']
        if 'class_key' in row:
            jdoc['class_key'] = row['class_key']
        if 'host' in row:
            jdoc['host'] = row['host']
        if 'curi:request' in row:
            jdoc['request'] = row['curi:request']
        if 'content:headers' in row:
            jdoc['headers'] = row['content:headers']
        if 'text' in row:
            jdoc['text'] = row['text']
        if 'title' in row:
            jdoc['title'] = row['title']
        if 'parse:keywords' in row:
            jdoc['keywords'] = row['parse:keywords']
        if 'parse:content-encoding' in row:
            jdoc['content_encoding'] = row['parse:content-encoding']
        if 'content:raw_data' in row:
            jdoc['raw_data'] = row['content:raw_data']
        jdocs.append(jdoc)
        count = count + 1
        if len(jdocs) >= step:
            end = row['_id']
            xx = server.update(jdocs)
            server.commit()
            print xx
            jdocs = []
            print('commit %d documents. %s to %s' % (count, start, end))
    if len(jdocs) > 0:
        server.update(jdocs)
        server.commit()
            pass
        else:
            continue
        print page['name'].encode('utf-8')
        # print page['subname']
        print page['_id']
        item['id'] = page['_id']
        item['name'] = page['name']
        if page.has_key('subname'):
            item['subname'] = page['subname']
        # item['content'] = page['content']
        items.append(item)
    solr.update(items, 'json', commit=False)
    solr.commit()
    '''

    items = []
    for index in xrange(11, 16):
        game = games[index]
        item = {}
        item['bname'] = game['name']
        item['bsummary'] = game['des']
        item['id'] = game['url']
        tmp = ''
        for x in game['property']:
            tmp = tmp + x['title'] + ':'
            tmp = tmp + x['content'] + '\n'