def handle(self, *args, **options):
    """Load a search's CSV file into its Solr core, expanding dates, numbers, and coded values."""
    total = 0
    cycle = 0
    try:
        # Retrieve the Search and Field models from the database
        solr = SolrClient(settings.SOLR_SERVER_URL)
        try:
            self.search_target = Search.objects.get(search_id=options['search'])
            self.solr_core = self.search_target.solr_core_name
            self.all_fields = Field.objects.filter(search_id=self.search_target)
            if options['nothing_to_report']:
                self.search_fields = Field.objects.filter(
                    search_id=self.search_target, alt_format='ALL') | Field.objects.filter(
                    search_id=self.search_target, alt_format='NTR')
            else:
                self.search_fields = Field.objects.filter(
                    search_id=self.search_target, alt_format='ALL') | Field.objects.filter(
                    search_id=self.search_target, alt_format='')
            for search_field in self.search_fields:
                self.csv_fields[search_field.field_id] = search_field
                codes = Code.objects.filter(field_id=search_field)
                # Most csv_fields will not have codes, so the queryset will usually be empty
                if len(codes) > 0:
                    code_dict = {}
                    for code in codes:
                        code_dict[code.code_id.lower()] = code
                    self.field_codes[search_field.field_id] = code_dict
        except Search.DoesNotExist as x:
            self.logger.error('Search not found: "{0}"'.format(x))
            exit(-1)
        except Field.DoesNotExist as x1:
            self.logger.error('Fields not found for search: "{0}"'.format(x1))

        # Process the records in the CSV file one at a time
        with open(options['csv'], 'r', encoding='utf-8-sig', errors="ignore") as csv_file:
            csv_reader = csv.DictReader(csv_file, dialect='excel')
            solr_items = []
            for csv_record in csv_reader:

                # Clear out the Solr core on the first row
                if total == 0 and not options['nothing_to_report']:
                    solr.delete_doc_by_query(self.solr_core, "*:*")
                    print("Purging all records")
                elif total == 0 and options['nothing_to_report']:
                    solr.delete_doc_by_query(self.solr_core, "format:NTR")
                    solr.commit(self.solr_core, softCommit=True)
                    print("Purging NTR records")
                total += 1
                cycle += 1

                # Call plugins if they exist for this search type. This is where a developer can
                # introduce code to customize the data that is loaded into Solr for a particular search.
                search_type_plugin = 'search.plugins.{0}'.format(options['search'])
                if search_type_plugin in self.discovered_plugins:
                    include, filtered_record = self.discovered_plugins[search_type_plugin].filter_csv_record(
                        csv_record, self.search_target, self.csv_fields, self.field_codes,
                        'NTR' if options['nothing_to_report'] else '')
                    if not include:
                        continue
                    else:
                        csv_record = filtered_record

                # Create a dictionary for each record loaded into Solr
                solr_record = {'format': 'NTR' if options['nothing_to_report'] else 'DEFAULT'}
                for csv_field in csv_reader.fieldnames:
                    # Verify that it is a known field
                    if csv_field not in self.csv_fields and csv_field not in ('owner_org_title', 'owner_org'):
                        self.logger.error("CSV file contains unknown field: {0}".format(csv_field))
                        exit(-1)
                    if csv_field == 'owner_org_title':
                        continue

                    # Handle multi-valued fields here
                    if self.csv_fields[csv_field].solr_field_multivalued:
                        solr_record[csv_field] = csv_record[csv_field].split(',')
                        # Copy fields for reports cannot use multi-values, so directly populate
                        # the export fields with the original string
                        if self.csv_fields[csv_field].solr_field_export:
                            for extra_field in self.csv_fields[csv_field].solr_field_export.split(','):
                                solr_record[extra_field] = csv_record[csv_field]
                    else:
                        solr_record[csv_field] = csv_record[csv_field]

                    # Automatically expand out dates and numbers for use with the Solr export handler
                    if self.csv_fields[csv_field].solr_field_type == 'pdate':
                        try:
                            if csv_record[csv_field]:
                                csv_date = datetime.strptime(csv_record[csv_field], '%Y-%m-%d')
                                solr_record[csv_field + '_en'] = format_date(csv_date, locale='en')
                                solr_record[csv_field + '_fr'] = format_date(csv_date, locale='fr')
                                if self.csv_fields[csv_field].is_default_year:
                                    solr_record['year'] = csv_date.year
                                if self.csv_fields[csv_field].is_default_month:
                                    solr_record['month'] = csv_date.month
                            else:
                                solr_record[csv_field + '_en'] = ''
                                solr_record[csv_field + '_fr'] = ''
                        except ValueError as x2:
                            self.logger.error('Invalid date: "{0}"'.format(x2))
                            solr_record[csv_field] = ''
                            continue
                    elif self.csv_fields[csv_field].solr_field_type in ['pint', 'pfloat']:
                        if solr_record[csv_field]:
                            if solr_record[csv_field] == '.':
                                solr_record[csv_field] = "0"
                            csv_decimal = parse_decimal(solr_record[csv_field], locale='en_US')
                            if self.csv_fields[csv_field].solr_field_is_currency:
                                solr_record[csv_field + '_en'] = format_currency(csv_decimal, 'CAD', locale='en_CA')
                                solr_record[csv_field + '_fr'] = format_currency(csv_decimal, 'CAD', locale='fr_CA')
                            else:
                                solr_record[csv_field + '_en'] = format_decimal(csv_decimal, locale='en_CA')
                                solr_record[csv_field + '_fr'] = format_decimal(csv_decimal, locale='fr_CA')
                        else:
                            solr_record[csv_field + '_en'] = ''
                            solr_record[csv_field + '_fr'] = ''

                    # Look up the expanded code value from the codes dict of dicts
                    if csv_field in self.field_codes:
                        if csv_record[csv_field]:
                            if self.csv_fields[csv_field].solr_field_multivalued:
                                codes_en = []
                                codes_fr = []
                                for code_value in csv_record[csv_field].split(","):
                                    if code_value.lower() in self.field_codes[csv_field]:
                                        codes_en.append(self.field_codes[csv_field][code_value.lower()].label_en)
                                        codes_fr.append(self.field_codes[csv_field][code_value.lower()].label_fr)
                                    else:
                                        self.logger.info("Unknown code value: {0} for field: {1}".format(code_value, csv_field))
                                solr_record[csv_field + '_en'] = codes_en
                                solr_record[csv_field + '_fr'] = codes_fr
                            else:
                                if csv_record[csv_field].lower() in self.field_codes[csv_field]:
                                    solr_record[csv_field + '_en'] = self.field_codes[csv_field][csv_record[csv_field].lower()].label_en
                                    solr_record[csv_field + '_fr'] = self.field_codes[csv_field][csv_record[csv_field].lower()].label_fr
                                else:
                                    self.logger.info("Unknown code value: {0} for field: {1}".format(csv_record[csv_field], csv_field))

                solr_record = self.set_empty_fields(solr_record)

                # Set the Solr ID field. Regular records use the configured ID fields;
                # Nothing To Report records get an ID generated from the org and period.
                if not options['nothing_to_report']:
                    if self.search_target.id_fields:
                        id_values = []
                        for id_field in self.search_target.id_fields.split(","):
                            id_values.append(csv_record[id_field])
                        solr_record['id'] = ",".join(id_values)
                else:
                    if 'month' in solr_record:
                        solr_record['id'] = "{0}-{1}-{2}".format(
                            solr_record['owner_org'], solr_record['year'], solr_record['month'])
                    elif 'quarter' in solr_record:
                        solr_record['id'] = "{0}-{1}-{2}".format(
                            solr_record['owner_org'], solr_record['year'], solr_record['quarter'])

                # Call plugins if they exist for this search type. This is where a developer can
                # introduce code to customize the data that is loaded into Solr for a particular search.
                if search_type_plugin in self.discovered_plugins:
                    solr_record = self.discovered_plugins[search_type_plugin].load_csv_record(
                        csv_record, solr_record, self.search_target, self.csv_fields,
                        self.field_codes, 'NTR' if options['nothing_to_report'] else '')

                solr_items.append(solr_record)

                # Write to Solr whenever the cycle threshold is reached
                if cycle >= self.cycle_on:
                    # Try to connect to Solr up to 10 times
                    for countdown in reversed(range(10)):
                        try:
                            solr.index(self.solr_core, solr_items)
                            print("{0} rows processed".format(total))
                            cycle = 0
                            solr_items.clear()
                            break
                        except ConnectionError as cex:
                            if not countdown:
                                raise
                            print("Solr error: {0}. Waiting to try again ... {1}".format(cex, countdown))
                            time.sleep((10 - countdown) * 5)

            # Write any remaining records to Solr and commit
            if cycle > 0:
                # Try to connect to Solr up to 10 times
                for countdown in reversed(range(10)):
                    try:
                        solr.index(self.solr_core, solr_items)
                        print("{0} rows processed".format(total))
                        cycle = 0
                        solr_items.clear()
                        break
                    except ConnectionError as cex:
                        if not countdown:
                            raise
                        print("Solr error: {0}. Waiting to try again ... {1}".format(cex, countdown))
                        time.sleep((10 - countdown) * 5)

            solr.commit(self.solr_core, softCommit=True, waitSearcher=True)
            print("Total rows processed: {0}".format(total))
    except Exception as x:
        self.logger.error('Unexpected Error "{0}"'.format(x))
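# The two retry loops above are identical apart from the bookkeeping around them.
# A helper along these lines could replace both. This is an illustrative sketch,
# not part of the original command; the name _index_with_retry and the
# max_tries/base_delay parameters are assumptions.
def _index_with_retry(self, solr, solr_items, max_tries=10, base_delay=5):
    """Index a batch into Solr, retrying on ConnectionError with a growing delay."""
    for countdown in reversed(range(max_tries)):
        try:
            solr.index(self.solr_core, solr_items)
            return
        except ConnectionError as cex:
            if not countdown:
                raise  # out of retries; let handle() log the failure
            print("Solr error: {0}. Waiting to try again ... {1}".format(cex, countdown))
            time.sleep((max_tries - countdown) * base_delay)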
import gzip
import json
import logging
import os
import unittest
from time import sleep

from SolrClient import SolrClient
from SolrClient.exceptions import ConnectionError

# The test helpers below are assumed to live alongside this module;
# adjust the import paths to match the actual test layout.
from .test_config import test_config
from .RandomTestData import RandomTestData


class ClientTestIndexing(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.solr = SolrClient(test_config['SOLR_SERVER'][0],
                              devel=True,
                              auth=test_config['SOLR_CREDENTIALS'])
        cls.rand_docs = RandomTestData()
        cls.docs = cls.rand_docs.get_docs(50)
        for field in test_config['collections']['copy_fields']:
            try:
                cls.solr.schema.delete_copy_field(test_config['SOLR_COLLECTION'], field)
            except Exception:
                pass
        for field in test_config['collections']['fields']:
            try:
                cls.solr.schema.create_field(test_config['SOLR_COLLECTION'], field)
            except Exception:
                pass

    def setUp(self):
        self.delete_docs()
        self.commit()

    def delete_docs(self):
        self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'], '*')
        self.commit()

    def commit(self):
        self.solr.commit(test_config['SOLR_COLLECTION'], openSearcher=True)
        sleep(5)

    def test_delete_doc_by_id_with_space(self):
        self.delete_docs()
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps([{'id': 'potato potato', 'product_name': 'potato'}]))
        self.commit()
        self.assertTrue(
            len(self.solr.query(test_config['SOLR_COLLECTION'],
                                {'q': 'id:"potato potato"'}).docs) == 1)
        self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'], "potato potato")
        self.commit()
        self.assertTrue(
            len(self.solr.query(test_config['SOLR_COLLECTION'],
                                {'q': 'id:"potato potato"'}).docs) == 0)
        self.delete_docs()

    def test_delete_doc_by_query(self):
        self.delete_docs()
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps([{'id': 'potato potato', 'product_name': 'potato'}]))
        self.commit()
        self.assertTrue(
            len(self.solr.query(test_config['SOLR_COLLECTION'],
                                {'q': 'id:"potato potato"'}).docs) == 1)
        self.solr.delete_doc_by_query(test_config['SOLR_COLLECTION'], "product_name:potato")
        self.commit()
        self.assertTrue(
            len(self.solr.query(test_config['SOLR_COLLECTION'],
                                {'q': 'id:"potato potato"'}).docs) == 0)
        self.delete_docs()

    @unittest.skip("Skipping for now")
    def test_access_without_auth(self):
        if not test_config['SOLR_CREDENTIALS'][0]:
            return
        solr = SolrClient(test_config['SOLR_SERVER'], devel=True)
        with self.assertRaises(ConnectionError):
            solr.query('SolrClient_unittest', {'q': 'not_gonna_happen'})

    def test_indexing_json(self):
        self.docs = self.rand_docs.get_docs(53)
        self.solr.index_json(test_config['SOLR_COLLECTION'], json.dumps(self.docs))
        self.commit()
        sleep(5)
        for doc in self.docs:
            logging.debug("Checking {}".format(doc['id']))
            self.assertEqual(
                self.solr.query(test_config['SOLR_COLLECTION'],
                                {'q': 'id:{}'.format(doc['id'])}).get_num_found(), 1)
        self.delete_docs()
        self.commit()

    def test_indexing_conn_log(self):
        self.docs = self.rand_docs.get_docs(53)
        self.solr.index_json(test_config['SOLR_COLLECTION'], json.dumps(self.docs))
        self.commit()
        sleep(5)
        for doc in self.docs:
            logging.debug("Checking {}".format(doc['id']))
            self.assertEqual(
                self.solr.query(test_config['SOLR_COLLECTION'],
                                {'q': 'id:{}'.format(doc['id'])}).get_num_found(), 1)
        logging.info(self.solr.transport._action_log)
        self.delete_docs()
        self.commit()

    def test_index_json_file(self):
        self.docs = self.rand_docs.get_docs(55)
        with open('temp_file.json', 'w') as f:
            json.dump(self.docs, f)
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'], 'temp_file.json')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except OSError:
            pass

    def test_stream_file_gzip_file(self):
        self.docs = self.rand_docs.get_docs(60)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'], 'temp_file.json.gz')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except OSError:
            pass

    @unittest.skip("Don't test remote indexing in travis")
    def test_local_index_json_file(self):
        self.docs = self.rand_docs.get_docs(61)
        with open('temp_file.json', 'w') as f:
            json.dump(self.docs, f)
        r = self.solr.local_index(test_config['SOLR_COLLECTION'], 'temp_file.json')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except OSError:
            pass

    def test_paging_query_with_rows(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'], 'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'], {'q': '*:*'}, rows=50):
            self.assertTrue(len(res.docs) == 50)
            docs.extend(res.docs)
            queries += 1
        self.assertEqual(
            [x['id'] for x in sorted(docs, key=lambda x: x['id'])],
            [x['id'] for x in sorted(self.docs, key=lambda x: x['id'])])
        self.assertTrue(1000 / 50 == queries)
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except OSError:
            pass

    def test_paging_query(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'], 'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'], {'q': '*:*'}):
            self.assertTrue(len(res.docs) == 1000)
            docs.extend(res.docs)
            queries += 1
        self.assertTrue(queries == 1)
        self.assertEqual(
            [x['id'] for x in sorted(docs, key=lambda x: x['id'])],
            [x['id'] for x in sorted(self.docs, key=lambda x: x['id'])])
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except OSError:
            pass

    def test_paging_query_with_max(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'], 'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'], {'q': '*:*'},
                                          rows=50, max_start=502):
            self.assertTrue(len(res.docs) == 50)
            queries += 1
            docs.extend(res.docs)
        # Every retrieved doc should be one of the docs that was indexed
        ids = [x['id'] for x in self.docs]
        for item in docs:
            self.assertTrue(item['id'] in ids)
        self.assertEqual(11, queries)
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except OSError:
            pass

    def test_cursor_query(self):
        self.docs = self.rand_docs.get_docs(2000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'], 'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.cursor_query(test_config['SOLR_COLLECTION'],
                                          {'q': '*:*', 'rows': 100}):
            self.assertTrue(len(res.docs) == 100)
            queries += 1
            docs.extend(res.docs)
        # Every doc that was indexed should have been returned by the cursor
        ids = [x['id'] for x in docs]
        for item in self.docs:
            self.assertTrue(item['id'] in ids)
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except OSError:
            pass
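# A runner block for executing this suite directly is sketched below; it is an
# addition, assuming test_config points at a reachable test Solr instance.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    unittest.main()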