def get_facet_query(solr_url, field, **kwargs):
    '''Run the shared facet query for ``field`` and return its facet dict.

    The module-level ``facet_query`` is copied first, so the shared
    template is not modified; only the copy gets ``facet.field`` set.
    Any extra keyword arguments are forwarded untouched to
    ``get_solr_json``.
    '''
    params = facet_query.copy()
    params['facet.field'] = field
    response = get_solr_json(solr_url=solr_url, query=params, **kwargs)
    return create_facet_dict(response, field)
def create_new_facet_values_sheet(facet, workbook, solr_url, api_key,
                                  solr_url_new, api_key_new):
    '''Add a worksheet listing values of ``facet`` that are new in the new index.

    The same facet query is sent to the production index (``solr_url`` /
    ``api_key``) and to the new index (``solr_url_new`` / ``api_key_new``).
    The two facet dicts are passed to ``compare_datasets`` and the
    ``not_in_prod`` result is written to a sheet named
    'New <facet> Values', one (value, count) pair per row.

    When there is at least one such value the sheet tab and the count
    cells are coloured red.
    '''
    query = {
        'facet': 'true',
        'facet.field': [facet, ],
        'rows': 0,
        'facet.limit': -1,  # give them all
        'facet.sort': 'count',
        'facet.mincount': 1,
    }
    production_json = get_solr_json(solr_url, query, api_key=api_key)
    production_facet_dict = create_facet_dict(production_json, facet)
    new_json = get_solr_json(solr_url_new, query, api_key=api_key_new)
    new_facet_dict = create_facet_dict(new_json, facet)

    # Only the two "missing" results are needed here; the rest are ignored.
    not_in_new, not_in_prod, _count_equal, _new_less, _new_more = \
        compare_datasets(production_facet_dict, new_facet_dict)
    print("{}: NOT IN PROD: {} NOT_IN_NEW: {}".format(
        facet, len(not_in_prod), len(not_in_new)))

    page = workbook.add_worksheet('New {} Values'.format(facet))
    header_format = workbook.add_format({'bold': True, })
    number_format = workbook.add_format()
    number_format.set_num_format('#,##0')

    # not_in_prod is a sized collection of (value, count) pairs, so test it
    # for emptiness. Comparing it with 0 raises TypeError on Python 3 and is
    # always true on Python 2, which turned every tab red.
    if not_in_prod:
        page.set_tab_color('red')
        number_format.set_bg_color('red')

    page.write(0, 0, 'New {} Values'.format(facet), header_format)
    page.write(0, 1, 'Counts', header_format)
    # width
    page.set_column(0, 1, 25, )

    # Data starts on the third row, leaving one blank row under the header.
    for row, (value, count) in enumerate(not_in_prod, start=2):
        page.write(row, 0, value)
        page.write(row, 1, count, number_format)
def create_new_facet_values_sheet(facet, workbook, solr_url, api_key,
                                  solr_url_new, api_key_new):
    '''Report the values of ``facet`` present in the new index but not in production.

    Both indexes are queried with an identical facet request (all values,
    sorted by count, minimum count 1, no documents returned). The facet
    dicts are handed to ``compare_datasets``; its ``not_in_prod`` result is
    written to a new worksheet called 'New <facet> Values' as rows of
    value and count.

    The worksheet tab and the count cells are turned red only when
    ``not_in_prod`` is non-empty.
    '''
    query = {
        'facet': 'true',
        'facet.field': [facet, ],
        'rows': 0,
        'facet.limit': -1,  # give them all
        'facet.sort': 'count',
        'facet.mincount': 1,
    }
    production_facet_dict = create_facet_dict(
        get_solr_json(solr_url, query, api_key=api_key), facet)
    new_facet_dict = create_facet_dict(
        get_solr_json(solr_url_new, query, api_key=api_key_new), facet)

    # The equal/less/more results are not used by this sheet.
    not_in_new, not_in_prod, _count_equal, _new_less, _new_more = \
        compare_datasets(production_facet_dict, new_facet_dict)
    print("{}: NOT IN PROD: {} NOT_IN_NEW: {}".format(
        facet, len(not_in_prod), len(not_in_new)))

    sheet_title = 'New {} Values'.format(facet)
    page = workbook.add_worksheet(sheet_title)
    header_format = workbook.add_format({'bold': True, })
    number_format = workbook.add_format()
    number_format.set_num_format('#,##0')

    # Emptiness test instead of `not_in_prod > 0`: ordering a sequence
    # against an int is a TypeError on Python 3 and unconditionally true on
    # Python 2, so the highlight was never actually conditional.
    if not_in_prod:
        page.set_tab_color('red')
        number_format.set_bg_color('red')

    page.write(0, 0, sheet_title, header_format)
    page.write(0, 1, 'Counts', header_format)
    # width
    page.set_column(0, 1, 25, )

    # Rows begin at index 2, one blank row below the header.
    for row, (value, count) in enumerate(not_in_prod, start=2):
        page.write(row, 0, value)
        page.write(row, 1, count, number_format)
def create_missing_report(field, workbook, header_format, add_query=None):
    '''Add a worksheet counting, per collection, documents lacking ``field``.

    ``add_query`` is an optional dict of extra query parameters merged over
    the base query (used for the filter query when reporting missing
    reference_image_md5).

    NOTE: ``solr_url``, ``api_key``, ``digest_user`` and ``digest_pswd`` are
    not parameters of this function; they are resolved from the enclosing
    (module) scope at call time.
    '''
    params = dict((
        ('q', '-{}:[* TO *]'.format(field)),
        ('rows', 0),
        ('wt', 'json'),
        ('facet', 'true'),
        ('facet.field', 'collection_url'),
    ))
    # A falsy add_query (None or empty) contributes nothing.
    params.update(add_query or {})

    solr_response = get_solr_json(
        solr_url,
        query=params,
        api_key=api_key,
        digest_user=digest_user,
        digest_pswd=digest_pswd)
    per_collection = create_facet_dict(solr_response, 'collection_url')
    create_missing_worksheet('missing {}'.format(field), per_collection,
                             workbook, header_format)
def main(solr_url='https://harvest-stg.cdlib.org/solr/dc-collection/query', outdir=None, api_key=None, digest_user=None, digest_pswd=None):
    '''Build the duplicate-md5 worksheet and the "missing field" worksheets.

    Facets the index on reference_image_md5, then for every md5 returned
    runs a second query to find which collection_url values it occurs in.
    The result is written to a worksheet (md5, duplicate count, then one
    cell per collection), followed by one "missing" report per field.
    '''
    print("USING SOLR:{}".format(solr_url))
    md5_field = 'reference_image_md5'
    credentials = {
        'api_key': api_key,
        'digest_user': digest_user,
        'digest_pswd': digest_pswd,
    }
    duplicates = get_facet_query(solr_url, md5_field, **credentials)

    # Replace each count with (count, {collection_url: n}) in place; only
    # values of existing keys are reassigned while iterating.
    for checksum, dup_count in duplicates.items():
        lookup = {
            'q': checksum,
            'rows': 0,
            'wt': 'json',
            'facet': 'true',
            'facet.field': 'collection_url'
        }
        found = get_solr_json(solr_url, query=lookup, **credentials)
        duplicates[checksum] = (dup_count,
                                create_facet_dict(found, 'collection_url'))

    workbook, header_format, number_format = create_report_workbook(outdir)
    sheet = workbook.add_worksheet(md5_field)

    # headers
    for col, heading in enumerate((md5_field, 'Number Dups', 'Collections')):
        sheet.write(0, col, heading, header_format)
    # width
    for first_col, last_col, width in ((0, 0, 50), (1, 1, 10), (2, 10, 50)):
        sheet.set_column(first_col, last_col, width)

    for row, (checksum, (dup_count, collections)) in enumerate(
            duplicates.items(), start=1):
        sheet.write(row, 0, checksum)
        sheet.write(row, 1, dup_count)
        # One cell per collection, starting in the third column.
        for col, (c_url, num) in enumerate(collections.items(), start=2):
            sheet.write(row, col, ' - '.join((c_url, str(num))))
    # end md5 page

    for missing_field in ('type_ss', 'repository_data', 'title_ss',
                          'url_item'):
        create_missing_report(missing_field, workbook, header_format)
    # Missing image checksums only matter for image-type records.
    create_missing_report(
        'reference_image_md5',
        workbook,
        header_format,
        add_query={'fq': 'type_ss:image'})
    create_missing_report('rights_ss', workbook, header_format)
    # NOTE(review): the workbook is not closed anywhere in this function as
    # seen here -- confirm it is finalized elsewhere.
parser = argparse.ArgumentParser() parser.add_argument('outdir', ) argv = parser.parse_args() config = configparser.SafeConfigParser() config.read('report.ini') solr_url = config.get('new-index', 'solrUrl') api_key = config.get('new-index', 'solrAuth') couchdb_url = config.get('couchdb', 'url') solr_collection_json = get_solr_json(solr_url, solr_collection_query, api_key=api_key) solr_collection_facet = create_facet_dict(solr_collection_json, 'collection_url') diffs = [] couch_less = [] for curl, count in solr_collection_facet.items(): cid = curl.rsplit('/', 2)[-2] url_couchdb_count = ''.join( ('{}/couchdb/ucldc/_design/', 'all_provider_docs/_view/', 'by_provider_name_count?', 'key="{}"')).format(couchdb_url, cid) resp = requests.get(url_couchdb_count, verify=False) couch_count = resp.json()['rows'][0]['value'] if count != couch_count: diffs.append((cid, count, couch_count)) if couch_count < count: couch_less.append((cid, count, couch_count)) print "{} SOLR:{} COUCH:{}".format(cid, count, couch_count) print "FOR {} COLLECTIONS, {} have different counts".format(
if __name__ == '__main__':
    # Compare, collection by collection, the document count in the new Solr
    # index against the count reported by a CouchDB view.
    parser = argparse.ArgumentParser()
    parser.add_argument('outdir',)
    argv = parser.parse_args()

    # SafeConfigParser was removed in Python 3.12; use it where it still
    # exists (identical behavior to before) and fall back to ConfigParser.
    config = (getattr(configparser, 'SafeConfigParser', None)
              or configparser.ConfigParser)()
    config.read('report.ini')
    solr_url = config.get('new-index', 'solrUrl')
    api_key = config.get('new-index', 'solrAuth')
    couchdb_url = config.get('couchdb', 'url')

    solr_collection_json = get_solr_json(solr_url, solr_collection_query,
                                         api_key=api_key)
    solr_collection_facet = create_facet_dict(solr_collection_json,
                                              'collection_url')
    diffs = []  # every collection whose counts differ
    couch_less = []  # subset where CouchDB has fewer docs than Solr
    for curl, count in solr_collection_facet.items():
        # The collection id is the second-to-last '/'-separated piece of
        # the collection URL.
        cid = curl.rsplit('/', 2)[-2]
        url_couchdb_count = ''.join(('{}/couchdb/ucldc/_design/',
                                     'all_provider_docs/_view/',
                                     'by_provider_name_count?',
                                     'key="{}"')).format(couchdb_url, cid)
        # NOTE(review): TLS certificate verification is disabled here.
        resp = requests.get(url_couchdb_count, verify=False)
        # An empty 'rows' list is treated as a count of 0; indexing rows[0]
        # directly raised IndexError and aborted the whole report.
        rows = resp.json()['rows']
        couch_count = rows[0]['value'] if rows else 0
        if count != couch_count:
            diffs.append((cid, count, couch_count))
            if couch_count < count:
                couch_less.append((cid, count, couch_count))
            # NOTE(review): the original indentation was lost; this line is
            # assumed to report mismatches only -- confirm.
            print("{} SOLR:{} COUCH:{}".format(cid, count, couch_count))
def main(argv=None):
    '''Build the workbook comparing the production index with the new index.

    ``argv`` is used as an already-parsed arguments object whose ``outdir``
    attribute is a one-element sequence; when it is None the command line
    is parsed instead. Solr endpoints and keys are read from the
    'calisphere' (production) and 'new-index' sections of report.ini.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'outdir',
        nargs=1, )
    if argv is None:
        argv = parser.parse_args()

    # SafeConfigParser was removed in Python 3.12. Use it where it still
    # exists so behavior is unchanged there; otherwise use ConfigParser.
    config_class = (getattr(configparser, 'SafeConfigParser', None)
                    or configparser.ConfigParser)
    config = config_class()
    config.read('report.ini')

    # get totals for reporting on first page
    query_t = {
        'facet': 'true',
        'facet.field': [
            'type_ss',
            'facet_decade',
        ],
        'facet.missing': 'on',
        'rows': 0,
        'facet.limit': -1,
    }
    solr_url = config.get('calisphere', 'solrUrl')
    api_key = config.get('calisphere', 'solrAuth')
    production_totals = get_solr_json(solr_url, query_t, api_key=api_key)
    num_prod_docs = get_total_docs(production_totals)
    production_type_ss_dict = create_facet_dict(production_totals, 'type_ss')

    solr_url_new = config.get('new-index', 'solrUrl')
    api_key_new = config.get('new-index', 'solrAuth')
    new_totals = get_solr_json(solr_url_new, query_t, api_key=api_key_new)
    num_new_docs = get_total_docs(new_totals)
    new_type_ss_dict = create_facet_dict(new_totals, 'type_ss')

    # get calisphere current index data
    production_json = get_solr_json(solr_url, base_query, api_key=api_key)
    production_facet_dict = create_facet_dict(production_json,
                                              'collection_url')
    new_json = get_solr_json(solr_url_new, base_query, api_key=api_key_new)
    new_facet_dict = create_facet_dict(new_json, 'collection_url')
    pp('OLD LEN:{} NEW LEN:{}'.format(
        len(production_facet_dict), len(new_facet_dict)))

    not_in_new, not_in_prod, count_equal, new_less, new_more = \
        compare_datasets(production_facet_dict, new_facet_dict)
    all_collections, ready_for_pub, not_ready_for_pub = \
        get_registry_collection_data()
    pp("READY FOR PUB:{} NOT READY:{}".format(
        len(ready_for_pub), len(not_ready_for_pub)))

    # Collections marked ready that are absent from the new index.
    missing_ready_for_pub = [
        c for c in ready_for_pub if c['url'] not in new_facet_dict
    ]
    # Collections not marked ready that nevertheless appear in the new index.
    not_ready_for_pub = [
        c for c in not_ready_for_pub if c['url'] in new_facet_dict
    ]

    pp('NOT IN NEW INDEX {}'.format(len(not_in_new)))
    pp('NOT IN PROD INDEX {}'.format(len(not_in_prod)))
    pp('COUNT EQUAL {}'.format(len(count_equal)))
    pp('NEW LESS {}'.format(len(new_less)))
    pp('NEW MORE {}'.format(len(new_more)))

    workbook = create_report_workbook(
        argv.outdir[0],
        not_in_new,
        not_in_prod,
        count_equal,
        new_less,
        new_more,
        num_found_prod=num_prod_docs,
        num_found_new=num_new_docs,
        type_ss_prod=production_type_ss_dict,
        type_ss_new=new_type_ss_dict,
        all_collections=all_collections,
        missing_ready_for_pub=missing_ready_for_pub,
        not_ready_for_pub=not_ready_for_pub)
    create_new_facet_values_sheet('coverage_ss', workbook, solr_url, api_key,
                                  solr_url_new, api_key_new)
    create_new_facet_values_sheet('facet_decade', workbook, solr_url, api_key,
                                  solr_url_new, api_key_new)
    create_new_facet_values_sheet('rights_ss', workbook, solr_url, api_key,
                                  solr_url_new, api_key_new)
    workbook.close()
def main(argv=None):
    '''Produce the production-vs-new index comparison workbook.

    When ``argv`` is None the command line is parsed (one positional
    ``outdir``); otherwise ``argv`` is used directly as the parsed
    arguments object and must expose ``outdir`` as a one-element sequence.
    Connection settings come from the 'calisphere' and 'new-index'
    sections of report.ini.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'outdir',
        nargs=1, )
    if argv is None:
        argv = parser.parse_args()

    # configparser.SafeConfigParser no longer exists on Python 3.12+.
    # Keep using it where it is available (no behavior change) and fall
    # back to ConfigParser elsewhere.
    config_class = (getattr(configparser, 'SafeConfigParser', None)
                    or configparser.ConfigParser)
    config = config_class()
    config.read('report.ini')

    # get totals for reporting on first page
    query_t = {
        'facet': 'true',
        'facet.field': [
            'type_ss',
            'facet_decade',
        ],
        'facet.missing': 'on',
        'rows': 0,
        'facet.limit': -1,
    }
    solr_url = config.get('calisphere', 'solrUrl')
    api_key = config.get('calisphere', 'solrAuth')
    production_totals = get_solr_json(solr_url, query_t, api_key=api_key)
    num_prod_docs = get_total_docs(production_totals)
    production_type_ss_dict = create_facet_dict(production_totals, 'type_ss')

    solr_url_new = config.get('new-index', 'solrUrl')
    api_key_new = config.get('new-index', 'solrAuth')
    new_totals = get_solr_json(solr_url_new, query_t, api_key=api_key_new)
    num_new_docs = get_total_docs(new_totals)
    new_type_ss_dict = create_facet_dict(new_totals, 'type_ss')

    # get calisphere current index data
    production_json = get_solr_json(solr_url, base_query, api_key=api_key)
    production_facet_dict = create_facet_dict(production_json,
                                              'collection_url')
    new_json = get_solr_json(solr_url_new, base_query, api_key=api_key_new)
    new_facet_dict = create_facet_dict(new_json, 'collection_url')
    pp('OLD LEN:{} NEW LEN:{}'.format(
        len(production_facet_dict), len(new_facet_dict)))

    not_in_new, not_in_prod, count_equal, new_less, new_more = \
        compare_datasets(production_facet_dict, new_facet_dict)
    all_collections, ready_for_pub, not_ready_for_pub = \
        get_registry_collection_data()
    pp("READY FOR PUB:{} NOT READY:{}".format(
        len(ready_for_pub), len(not_ready_for_pub)))

    # Ready-for-publication collections that the new index does not contain.
    missing_ready_for_pub = [
        c for c in ready_for_pub if c['url'] not in new_facet_dict
    ]
    # Not-ready collections that the new index does contain.
    not_ready_for_pub = [
        c for c in not_ready_for_pub if c['url'] in new_facet_dict
    ]

    pp('NOT IN NEW INDEX {}'.format(len(not_in_new)))
    pp('NOT IN PROD INDEX {}'.format(len(not_in_prod)))
    pp('COUNT EQUAL {}'.format(len(count_equal)))
    pp('NEW LESS {}'.format(len(new_less)))
    pp('NEW MORE {}'.format(len(new_more)))

    workbook = create_report_workbook(
        argv.outdir[0],
        not_in_new,
        not_in_prod,
        count_equal,
        new_less,
        new_more,
        num_found_prod=num_prod_docs,
        num_found_new=num_new_docs,
        type_ss_prod=production_type_ss_dict,
        type_ss_new=new_type_ss_dict,
        all_collections=all_collections,
        missing_ready_for_pub=missing_ready_for_pub,
        not_ready_for_pub=not_ready_for_pub)
    create_new_facet_values_sheet('coverage_ss', workbook, solr_url, api_key,
                                  solr_url_new, api_key_new)
    create_new_facet_values_sheet('facet_decade', workbook, solr_url, api_key,
                                  solr_url_new, api_key_new)
    create_new_facet_values_sheet('rights_ss', workbook, solr_url, api_key,
                                  solr_url_new, api_key_new)
    workbook.close()