def delete_post_from_solr(post_id):
    logger.info("deleting post with id %d" % post_id)
    try:
        solr = Solr(settings.SOLR_FORUM_URL)
        solr.delete_by_id(post_id)
        solr.commit()
    except SolrException as e:
        logger.error('could not delete post with id %s (%s).' % (post_id, e))
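# A hypothetical way delete_post_from_solr() could be kept in sync with the
# database via Django signals (settings.SOLR_FORUM_URL suggests a Django
# project). The Post model and its import path are assumptions for
# illustration, not taken from the source.
from django.db.models.signals import post_delete
from django.dispatch import receiver

from forum.models import Post  # hypothetical import path


@receiver(post_delete, sender=Post)
def remove_deleted_post_from_index(sender, instance, **kwargs):
    delete_post_from_solr(instance.id)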
def add_posts_to_solr(posts):
    logger.info("adding multiple forum posts to solr index")
    solr = Solr(settings.SOLR_FORUM_URL, auto_commit=False)
    logger.info("creating XML")
    documents = map(convert_to_solr_document, posts)
    logger.info("posting to Solr")
    solr.add(documents)
    solr.commit()
    logger.info("optimizing solr index")
    # solr.optimize()
    logger.info("done")
def send_posts_to_solr(posts):
    logger.info("adding forum posts to solr index")
    logger.info("creating XML")
    documents = [convert_to_solr_document(p) for p in posts]
    try:
        logger.info("posting to Solr")
        solr = Solr(settings.SOLR_FORUM_URL)
        solr.add(documents)
        solr.commit()
    except SolrException as e:
        logger.error("failed to add posts to solr index, reason: %s" % str(e))
    logger.info("done")
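# Both functions above depend on a convert_to_solr_document() helper that is
# not shown here. A minimal sketch of such a mapper, assuming a forum Post
# object; the field names below are illustrative, not taken from the source.
def convert_to_solr_document(post):
    return {
        'id': post.id,
        'post_body': post.body,
        'post_author': post.author.username,
        'post_created': post.created,
        'thread_title': post.thread.title,
    }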
def main(config):
    cfg = cliconfig(config)
    session = SessionFactory(cfg['database']['url']).create()
    server = Solr(str(cfg['solr']['url']),
                  http_user=cfg['solr'].get('username'),
                  http_pass=cfg['solr'].get('password'))
    documents = []
    # "is not None" would be evaluated by Python, not SQLAlchemy, and always
    # yield True; use isnot() so the NULL check runs in SQL.
    q = session.query(Address).filter(Address.prefecture.isnot(None))
    q = q.order_by(Address.zipcode)
    for r in ifilter(lambda r: r, imap(transform, q)):
        documents.append(r)
        if len(documents) >= COMMIT_UNIT:
            server.add_many(documents)
            documents = []
    if len(documents) > 0:
        server.add_many(documents)
    server.commit()
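# main() above assumes a COMMIT_UNIT batch size and a transform() function
# that turns an Address row into a Solr document, or a falsy value so the
# ifilter() call skips it. A minimal sketch under those assumptions; the
# batch size and field names are illustrative, not from the source.
COMMIT_UNIT = 1000


def transform(address):
    if not address.zipcode:
        return None  # dropped by the ifilter() in main()
    return {
        'id': address.id,
        'zipcode': address.zipcode,
        'prefecture': address.prefecture,
        'city': address.city,
    }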
def sync_couch_collection_to_solr(collection_key):
    # This works from inside an environment with default URLs for couch & solr
    URL_SOLR = os.environ.get('URL_SOLR', None)
    collection_key = str(collection_key)  # Couch needs string keys
    v = CouchDBCollectionFilter(
        couchdb_obj=get_couchdb(),
        collection_key=collection_key)
    solr_db = Solr(URL_SOLR)
    updated_docs = []
    num_added = 0
    report = defaultdict(int)
    for r in v:
        try:
            fill_in_title(r.doc)
            has_required_fields(r.doc)
        except KeyError as e:
            report[e.dict_key] += 1
            print(e.message, file=sys.stderr)
            continue
        except ValueError as e:
            report[e.dict_key] += 1
            print(e.message, file=sys.stderr)
            continue
        solr_doc = map_couch_to_solr_doc(r.doc)
        # TODO: here is where to check if existing and compare collection vals
        try:
            check_nuxeo_media(solr_doc)
        except ValueError as e:
            print(e.message, file=sys.stderr)
            report[e.dict_key] += 1
            continue
        updated_docs.append(solr_doc)
        num_added += push_doc_to_solr(solr_doc, solr_db=solr_db)
    solr_db.commit()
    publish_to_harvesting(
        'Synced collection {} to solr'.format(collection_key),
        harvesting_report(
            collection_key, updated_docs, num_added, report))
    return updated_docs, report
def sync_couch_collection_to_solr(collection_key):
    # This works from inside an environment with default URLs for couch & solr
    delete_solr_collection(collection_key)
    URL_SOLR = os.environ.get('URL_SOLR', None)
    collection_key = str(collection_key)  # Couch needs string keys
    v = CouchDBCollectionFilter(couchdb_obj=get_couchdb(),
                                collection_key=collection_key)
    solr_db = Solr(URL_SOLR)
    updated_docs = []
    num_added = 0
    report = defaultdict(int)
    for r in v:
        try:
            fill_in_title(r.doc)
            has_required_fields(r.doc)
        except KeyError as e:
            report[e.dict_key] += 1
            print(e.message, file=sys.stderr)
            continue
        except ValueError as e:
            report[e.dict_key] += 1
            print(e.message, file=sys.stderr)
            continue
        solr_doc = map_couch_to_solr_doc(r.doc)
        # TODO: here is where to check if existing and compare collection vals
        try:
            check_nuxeo_media(solr_doc)
        except ValueError as e:
            print(e.message, file=sys.stderr)
            report[e.dict_key] += 1
            continue
        updated_docs.append(solr_doc)
        num_added += push_doc_to_solr(solr_doc, solr_db=solr_db)
    solr_db.commit()
    publish_to_harvesting(
        'Synced collection {} to solr'.format(collection_key),
        harvesting_report(collection_key, updated_docs, num_added, report))
    return updated_docs, report
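# fill_in_title() and has_required_fields() are assumed validators; the
# except blocks above expect the raised KeyError/ValueError to carry
# dict_key and message attributes so the report counters can group errors.
# A minimal sketch of that convention; the required field names below are
# illustrative, not taken from the source.
def has_required_fields(doc):
    for field in ('title', 'type', 'collection_url'):
        if field not in doc:
            err = KeyError('missing required field: {}'.format(field))
            err.dict_key = field
            err.message = 'Document {} missing {}'.format(
                doc.get('_id'), field)
            raise err
    return True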
def main(url_couchdb=None,
         dbname=None,
         url_solr=None,
         all_docs=False,
         since=None):
    '''Use the _changes feed with a "since" parameter to only catch new
    changes to docs. The _changes feed will only have the *last* event on
    a document and does not retain intermediate changes. Setting the
    "since" to 0 will result in getting a _changes record for each
    document, essentially dumping the db to solr.
    '''
    print('Solr update PID: {}'.format(os.getpid()))
    dt_start = datetime.datetime.now()
    print('Start time:{}'.format(dt_start))
    sys.stdout.flush()
    s3_seq_cache = CouchdbLastSeq_S3()
    if not since:
        since = s3_seq_cache.last_seq
    if all_docs:
        since = '0'
    print('Attempt to connect to {0} - db:{1}'.format(url_couchdb, dbname))
    print('Getting changes since:{}'.format(since))
    sys.stdout.flush()
    db = get_couchdb(url=url_couchdb, dbname=dbname)
    changes = db.changes(since=since)
    previous_since = since
    last_since = int(changes['last_seq'])  # new last_seq for the changes feed
    results = changes['results']
    n_up = n_design = n_delete = 0
    solr_db = Solr(url_solr)
    start_time = datetime.datetime.now()
    for row in results:
        cur_id = row['id']
        if '_design' in cur_id:
            n_design += 1
            print("Skip {0}".format(cur_id))
            continue
        if row.get('deleted', False):
            # need to get the solr doc for this couch doc
            resp = solr_db.select(q=''.join(('harvest_id_s:"', cur_id, '"')))
            if resp.numFound == 1:
                sdoc = resp.results[0]
                print('====DELETING: {0} -- {1}'.format(cur_id, sdoc['id']))
                solr_db.delete(id=sdoc['id'])
                n_delete += 1
            else:
                print("-----DELETION of {} - FOUND {} docs".format(
                    cur_id, resp.numFound))
        else:
            doc = db.get(cur_id)
            try:
                doc = fill_in_title(doc)
                has_required_fields(doc)
            except KeyError as e:
                print(e.message)
                continue
            except ValueError as e:
                print(e.message)
                continue
            try:
                try:
                    solr_doc = map_couch_to_solr_doc(doc)
                except OldCollectionException:
                    print('---- ERROR: OLD COLLECTION FOR:{}'.format(cur_id))
                    continue
                try:
                    check_nuxeo_media(solr_doc)
                except ValueError as e:
                    print(e.message)
                    continue
                solr_doc = push_doc_to_solr(solr_doc, solr_db=solr_db)
            except TypeError as e:
                print('TypeError for {0} : {1}'.format(cur_id, e))
                continue
            n_up += 1
            if n_up % 1000 == 0:
                elapsed_time = datetime.datetime.now() - start_time
                print("Updated {} so far in {}".format(n_up, elapsed_time))
    solr_db.commit()
    if not all_docs:
        s3_seq_cache.last_seq = last_since
    print("UPDATED {0} DOCUMENTS. DELETED:{1}".format(n_up, n_delete))
    print("PREVIOUS SINCE:{0}".format(previous_since))
    print("LAST SINCE:{0}".format(last_since))
    run_time = datetime.datetime.now() - dt_start
    print("RUN TIME:{}".format(run_time))
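# push_doc_to_solr() is shared by the three sync paths above but not shown.
# A minimal sketch of what it might look like, assuming the solrpy-style
# client used above (add() taking the document fields as keyword arguments)
# and a 1/0 return value that the callers sum into num_added; both are
# assumptions, not confirmed by the source.
def push_doc_to_solr(solr_doc, solr_db=None):
    '''Add one mapped document to solr; return 1 if added, else 0.'''
    try:
        solr_db.add(**solr_doc)
    except Exception as e:
        print('Error adding {} to solr: {}'.format(solr_doc.get('id'), e),
              file=sys.stderr)
        return 0
    return 1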
            u['phonenumbers'] = {'set': d['ner_phone_number_ts_md']}
            u['ner_phone_number_ts_md'] = {'set': None}
        else:
            print("Error: Skipped")
            continue
        yield u


def read_stream(filename):
    '''
    Reads json line stream
    :param filename: path to json line
    :return: doc stream
    '''
    with open(filename) as inf:
        for l in inf:
            yield json.loads(l)


if __name__ == '__main__':
    url = "http://127.0.0.1:8983/solr/imagecatdev"
    solr = Solr(url)
    docs = solr.query_iterator(
        "ner_phone_number_t_md:* OR ner_phone_number_ts_md:*",
        rows=1000,
        fl='id,ner_phone_number_t_md,ner_phone_number_ts_md',
        sort="indexedAt asc")
    updates = fix_phonenumbers(docs)
    count, success = solr.post_iterator(updates, False, buffer_size=1000)
    solr.commit()
    print(success)
    print(count)
print("Error: Skipped") continue yield u def read_stream(filename): ''' Reads json line stream :param filename: path to json line :return: doc stream ''' with open(filename) as inf: for l in inf: yield json.loads(l) if __name__ == '__main__': url = "http://127.0.0.1:8983/solr/imagecatdev" solr = Solr(url) docs = solr.query_iterator( "ner_phone_number_t_md:* OR ner_phone_number_ts_md:*", rows=1000, fl='id,ner_phone_number_t_md,ner_phone_number_ts_md', sort="indexedAt asc") updates = fix_phonenumbers(docs) count, success = solr.post_iterator(updates, False, buffer_size=1000) solr.commit() print(success) print(count)