def get_all_sound_ids_from_solr(limit=False):
    """Collect the ids of the documents indexed in Solr, one page at a time.

    Solr is queried with an empty query sorted by 'created asc', PAGE_SIZE
    documents per request, until every document Solr reports has been seen
    or at least ``limit`` ids have been gathered.

    Args:
        limit: stop paging once this many ids have been collected. A falsy
            value (the default) means "practically unlimited". Whole pages
            are appended, so the result can hold more than ``limit`` ids.

    Returns:
        A list with the ``id`` value of every retrieved document, in the
        order Solr returned them.

    Raises:
        Exception: any error raised while paging is re-raised wrapped in a
            plain ``Exception`` (contract kept from the original code).
    """
    logger.info("getting all sound ids from solr.")
    if not limit:
        limit = 99999999999999
    solr = Solr(settings.SOLR_URL)
    solr_ids = []
    solr_count = None  # unknown until the first response arrives
    PAGE_SIZE = 2000
    current_page = 1
    try:
        # Test for None first: comparing an int with None only works by
        # accident on Python 2 and raises TypeError on Python 3.
        while (solr_count is None or len(solr_ids) < solr_count) \
                and len(solr_ids) < limit:
            response = SolrResponseInterpreter(
                solr.select(
                    unicode(
                        search_prepare_query(
                            '', '',
                            search_prepare_sort('created asc',
                                                SEARCH_SORT_OPTIONS_WEB),
                            current_page, PAGE_SIZE,
                            include_facets=False))))
            page_ids = [element['id'] for element in response.docs]
            if not page_ids:
                # An empty page means there is nothing left to read, even if
                # num_found is still larger than what was collected (e.g. the
                # index shrank while paging). Without this the loop would
                # request empty pages forever.
                break
            solr_ids += page_ids
            solr_count = response.num_found
            current_page += 1
    except Exception as e:
        raise Exception(e)
    # The ids used to be collected and then dropped; hand them to the caller.
    return solr_ids
def get_all_sound_ids_from_solr(limit=False):
    """Return the ids of the documents indexed in Solr, fetched page by page.

    Solr is queried with an empty query sorted by "created asc", PAGE_SIZE
    documents per request, until every document Solr reports has been seen
    or at least ``limit`` ids have been gathered.

    Args:
        limit: stop paging once this many ids have been collected. A falsy
            value (the default) means "practically unlimited". Whole pages
            are appended, so the result can hold more than ``limit`` ids.

    Returns:
        A list with the ``id`` value of every retrieved document, in the
        order Solr returned them.

    Raises:
        Exception: any error raised while paging is re-raised wrapped in a
            plain ``Exception`` (contract kept from the original code).
    """
    logger.info("getting all sound ids from solr.")
    if not limit:
        limit = 99999999999999
    solr = Solr(settings.SOLR_URL)
    solr_ids = []
    solr_count = None  # unknown until the first response arrives
    PAGE_SIZE = 2000
    current_page = 1
    try:
        while len(solr_ids) < limit:
            # Test for None first: comparing an int with None only works by
            # accident on Python 2 and raises TypeError on Python 3.
            if solr_count is not None and len(solr_ids) >= solr_count:
                break
            response = SolrResponseInterpreter(
                solr.select(
                    unicode(
                        search_prepare_query(
                            "",
                            "",
                            search_prepare_sort(
                                "created asc", SEARCH_SORT_OPTIONS_WEB
                            ),
                            current_page,
                            PAGE_SIZE,
                            include_facets=False,
                        )
                    )
                )
            )
            page_ids = [element["id"] for element in response.docs]
            if not page_ids:
                # An empty page means there is nothing left to read, even if
                # num_found is still larger than what was collected. Without
                # this the loop would request empty pages forever.
                break
            solr_ids += page_ids
            solr_count = response.num_found
            current_page += 1
    except Exception as e:
        raise Exception(e)
    # The ids used to be collected and then dropped; hand them to the caller.
    return solr_ids
def clean(solr_address):
    """Delete documents from the Solr index found at ``solr_address``.

    Asks Solr for the ``id`` field of up to 50000 documents matching
    ``*:*``, prints the list of ids, and deletes them in a single committed
    request.
    """
    solr = Solr(solr_address)
    response = solr.select('*:*', fl='id', rows='50000')
    ids = [doc.get('id') for doc in response.results]
    # Single-argument call form: prints the same text as `print ids`.
    print(ids)
    solr.delete_many(ids, commit=True)
def check_if_sound_exists_in_solr(sound):
    """Tell whether Solr holds a document whose id equals ``sound.id``.

    Args:
        sound: object exposing an integer ``id`` attribute.

    Returns:
        True when the Solr response reports at least one match.
    """
    solr = Solr(settings.SOLR_URL)
    id_filter = 'id:%i' % sound.id
    sort = search_prepare_sort('created asc', SEARCH_SORT_OPTIONS_WEB)
    # The two trailing 1s are presumably page number and page size: only
    # the reported match count is used, never the documents themselves.
    query = search_prepare_query('', id_filter, sort, 1, 1)
    raw_response = solr.select(unicode(query))
    response = SolrResponseInterpreter(raw_response)
    return response.num_found > 0
def _delete_couch_id_from_solr(solr_db, couch_id):
    '''Delete the Solr document that was harvested from ``couch_id``.

    The Solr document is looked up through its ``harvest_id_s`` field and is
    removed only when exactly one match is found; any other count is just
    reported.

    Returns True when a document was deleted, False otherwise.
    '''
    resp = solr_db.select(q=''.join(('harvest_id_s:"', couch_id, '"')))
    if resp.numFound != 1:
        print("-----DELETION of {} - FOUND {} docs".format(
            couch_id, resp.numFound))
        return False
    sdoc = resp.results[0]
    print('====DELETING: {0} -- {1}'.format(couch_id, sdoc['id']))
    solr_db.delete(id=sdoc['id'])
    return True


def _push_couch_id_to_solr(db, solr_db, couch_id):
    '''Fetch ``couch_id`` from couchdb, map it to a Solr doc and push it.

    Every expected failure (missing/invalid fields, old collection, bad
    media, TypeError while mapping or pushing) is printed and makes the
    function return False so the caller can move on to the next row.

    Returns True when the document was pushed to Solr.
    '''
    doc = db.get(couch_id)
    try:
        doc = fill_in_title(doc)
        has_required_fields(doc)
    except (KeyError, ValueError) as e:
        # NOTE(review): `e.message` only exists on Python 2 exceptions.
        print(e.message)
        return False
    try:
        # The inner handlers are kept narrow on purpose: a ValueError from
        # the mapping or the push must still propagate, as before.
        try:
            solr_doc = map_couch_to_solr_doc(doc)
        except OldCollectionException:
            print('---- ERROR: OLD COLLECTION FOR:{}'.format(couch_id))
            return False
        try:
            check_nuxeo_media(solr_doc)
        except ValueError as e:
            print(e.message)
            return False
        push_doc_to_solr(solr_doc, solr_db=solr_db)
    except TypeError as e:
        print('TypeError for {0} : {1}'.format(couch_id, e))
        return False
    return True


def main(url_couchdb=None,
         dbname=None,
         url_solr=None,
         all_docs=False,
         since=None):
    '''Use the _changes feed with a "since" parameter to only catch new
    changes to docs. The _changes feed will only have the *last* event on
    a document and does not retain intermediate changes.
    Setting the "since" to 0 will result in getting a _changes record for
    each document, essentially dumping the db to solr

    Parameters:
        url_couchdb, dbname: passed to ``get_couchdb`` to open the database.
        url_solr: passed to ``Solr`` to open the index that gets updated.
        all_docs: when true, start from sequence '0' and do NOT store the
            new last sequence in the S3 cache afterwards.
        since: sequence to start from; when falsy the value cached in S3
            is used instead.

    Progress and a final summary are printed to stdout. Solr is committed
    once, after all rows have been processed.
    '''
    print('Solr update PID: {}'.format(os.getpid()))
    dt_start = datetime.datetime.now()
    print('Start time:{}'.format(dt_start))
    sys.stdout.flush()  # put pd
    s3_seq_cache = CouchdbLastSeq_S3()
    if not since:
        since = s3_seq_cache.last_seq
    if all_docs:
        since = '0'
    print('Attempt to connect to {0} - db:{1}'.format(url_couchdb, dbname))
    print('Getting changes since:{}'.format(since))
    sys.stdout.flush()  # put pd
    # The database used to be opened twice with identical arguments, the
    # first handle being thrown away; one connection is enough.
    db = get_couchdb(url=url_couchdb, dbname=dbname)
    changes = db.changes(since=since)
    previous_since = since
    last_since = int(
        changes['last_seq'])  # get new last_since for changes feed
    results = changes['results']
    n_up = n_design = n_delete = 0
    solr_db = Solr(url_solr)
    start_time = datetime.datetime.now()
    for row in results:
        cur_id = row['id']
        if '_design' in cur_id:
            n_design += 1
            print("Skip {0}".format(cur_id))
            continue
        if row.get('deleted', False):
            # need to get the solr doc for this couch
            if _delete_couch_id_from_solr(solr_db, cur_id):
                n_delete += 1
            continue
        if not _push_couch_id_to_solr(db, solr_db, cur_id):
            continue
        # Only documents actually pushed to Solr count as updated.
        n_up += 1
        if n_up % 1000 == 0:
            elapsed_time = datetime.datetime.now() - start_time
            print("Updated {} so far in {}".format(n_up, elapsed_time))
    solr_db.commit()
    if not all_docs:
        # Remember where this run stopped so the next one resumes there.
        s3_seq_cache.last_seq = last_since
    print("UPDATED {0} DOCUMENTS. DELETED:{1}".format(n_up, n_delete))
    print("PREVIOUS SINCE:{0}".format(previous_since))
    print("LAST SINCE:{0}".format(last_since))
    run_time = datetime.datetime.now() - dt_start
    print("RUN TIME:{}".format(run_time))
def check_if_sound_exists_in_solr(sound):
    """Report whether the given sound is present in the Solr index.

    Runs a query filtered on ``id:<sound.id>`` against the Solr instance at
    ``settings.SOLR_URL`` and checks the number of matches it reports.

    Args:
        sound: object exposing an integer ``id`` attribute.

    Returns:
        True if Solr reports one or more matching documents, else False.
    """
    solr = Solr(settings.SOLR_URL)
    query_text = unicode(
        search_prepare_query(
            '',
            'id:%i' % sound.id,
            search_prepare_sort('created asc', SEARCH_SORT_OPTIONS_WEB),
            1,
            1))
    match_count = SolrResponseInterpreter(solr.select(query_text)).num_found
    return match_count > 0