示例#1
0
    def handle(self, *args, **options):
        """Load a CSV file into the Solr core for the requested search.

        Looks up the ``Search``/``Field``/``Code`` models for
        ``options['search']``, purges the relevant Solr documents, then streams
        the rows of ``options['csv']`` into Solr in batches of
        ``self.cycle_on`` records.  When ``options['nothing_to_report']`` is
        set, only NTR-format documents are purged and loaded.
        """

        total = 0   # rows read from the CSV so far
        cycle = 0   # rows accumulated since the last write to Solr

        try:
            solr = SolrClient(settings.SOLR_SERVER_URL)

            # Retrieve the Search and Field models from the database
            try:
                self.search_target = Search.objects.get(
                    search_id=options['search'])
                self.solr_core = self.search_target.solr_core_name
                self.all_fields = Field.objects.filter(
                    search_id=self.search_target)
                if options['nothing_to_report']:
                    self.search_fields = Field.objects.filter(
                        search_id=self.search_target,
                        alt_format='ALL') | Field.objects.filter(
                            search_id=self.search_target, alt_format='NTR')
                else:
                    self.search_fields = Field.objects.filter(
                        search_id=self.search_target,
                        alt_format='ALL') | Field.objects.filter(
                            search_id=self.search_target, alt_format='')
                for search_field in self.search_fields:
                    self.csv_fields[search_field.field_id] = search_field

                    # Most csv_fields will not have codes, so the queryset is
                    # usually empty.
                    codes = Code.objects.filter(field_id=search_field)
                    if codes:
                        self.field_codes[search_field.field_id] = {
                            code.code_id.lower(): code for code in codes}

            except Search.DoesNotExist as x:
                self.logger.error('Search not found: "{0}"'.format(x))
                exit(-1)
            except Field.DoesNotExist as x1:
                # NOTE(review): unlike a missing Search, missing Fields do not
                # abort the load -- confirm this fall-through is intentional.
                self.logger.error(
                    'Fields not found for search: "{0}"'.format(x1))

            # Process the records in the CSV file one at a time
            with open(options['csv'],
                      'r',
                      encoding='utf-8-sig',
                      errors="ignore") as csv_file:
                csv_reader = csv.DictReader(csv_file, dialect='excel')
                solr_items = []
                for csv_record in csv_reader:

                    # Clear out the Solr core on the first row only.
                    if total == 0 and not options['nothing_to_report']:
                        solr.delete_doc_by_query(self.solr_core, "*:*")
                        print("Purging all records")
                    elif total == 0 and options['nothing_to_report']:
                        solr.delete_doc_by_query(self.solr_core, "format:NTR")
                        solr.commit(self.solr_core, softCommit=True)
                        print("Purging NTR records")
                    total += 1
                    cycle += 1

                    # Call plugins if they exist for this search type. This is
                    # where a developer can introduce code to customize the
                    # data that is loaded into Solr for a particular search.
                    search_type_plugin = 'search.plugins.{0}'.format(
                        options['search'])
                    if search_type_plugin in self.discovered_plugins:
                        include, filtered_record = self.discovered_plugins[
                            search_type_plugin].filter_csv_record(
                                csv_record, self.search_target,
                                self.csv_fields, self.field_codes,
                                'NTR' if options['nothing_to_report'] else '')
                        if not include:
                            continue
                        csv_record = filtered_record

                    # Build the dictionary for this Solr document.
                    solr_record = {
                        'format':
                        'NTR' if options['nothing_to_report'] else 'DEFAULT'
                    }
                    for csv_field in csv_reader.fieldnames:
                        # Verify that it is a known field
                        if csv_field not in self.csv_fields and csv_field not in (
                                'owner_org_title', 'owner_org'):
                            self.logger.error(
                                "CSV files contains unknown field: {0}".format(
                                    csv_field))
                            exit(-1)
                        if csv_field == 'owner_org_title':
                            continue
                        # NOTE(review): 'owner_org' passes the check above but
                        # is then looked up in self.csv_fields below -- this
                        # assumes owner_org is always a registered search
                        # field; confirm against the Field fixtures.

                        # Handle multi-valued fields here
                        if self.csv_fields[csv_field].solr_field_multivalued:
                            solr_record[csv_field] = csv_record[
                                csv_field].split(',')
                            # Copy fields for reports cannot use multi-values,
                            # so directly populate with the original string.
                            if self.csv_fields[csv_field].solr_field_export:
                                for extra_field in self.csv_fields[
                                        csv_field].solr_field_export.split(
                                            ','):
                                    solr_record[extra_field] = csv_record[
                                        csv_field]
                        else:
                            solr_record[csv_field] = csv_record[csv_field]

                        # Automatically expand out dates and numbers for use
                        # with the Solr export handler.
                        if self.csv_fields[
                                csv_field].solr_field_type == 'pdate':
                            try:
                                if csv_record[csv_field]:
                                    csv_date = datetime.strptime(
                                        csv_record[csv_field], '%Y-%m-%d')
                                    solr_record[csv_field + '_en'] = \
                                        format_date(csv_date, locale='en')
                                    solr_record[csv_field + '_fr'] = \
                                        format_date(csv_date, locale='fr')
                                    if self.csv_fields[
                                            csv_field].is_default_year:
                                        solr_record['year'] = csv_date.year
                                    if self.csv_fields[
                                            csv_field].is_default_month:
                                        solr_record['month'] = csv_date.month
                                else:
                                    solr_record[csv_field + '_en'] = ''
                                    solr_record[csv_field + '_fr'] = ''
                            except ValueError as x2:
                                self.logger.error(
                                    'Invalid date: "{0}"'.format(x2))
                                solr_record[csv_field] = ''
                                continue
                        elif self.csv_fields[csv_field].solr_field_type in (
                                'pint', 'pfloat'):
                            if solr_record[csv_field]:
                                # A bare "." is treated as zero.
                                if solr_record[csv_field] == '.':
                                    solr_record[csv_field] = "0"
                                csv_decimal = parse_decimal(
                                    solr_record[csv_field], locale='en_US')
                                if self.csv_fields[
                                        csv_field].solr_field_is_currency:
                                    solr_record[csv_field + '_en'] = \
                                        format_currency(csv_decimal, 'CAD',
                                                        locale='en_CA')
                                    solr_record[csv_field + '_fr'] = \
                                        format_currency(csv_decimal, 'CAD',
                                                        locale='fr_CA')
                                else:
                                    solr_record[csv_field + '_en'] = \
                                        format_decimal(csv_decimal,
                                                       locale='en_CA')
                                    solr_record[csv_field + '_fr'] = \
                                        format_decimal(csv_decimal,
                                                       locale='fr_CA')
                            else:
                                solr_record[csv_field + '_en'] = ''
                                solr_record[csv_field + '_fr'] = ''

                        # Look up the expanded code value from the codes
                        # dict-of-dicts; unknown codes are logged and skipped.
                        if csv_field in self.field_codes:
                            if csv_record[csv_field]:

                                if self.csv_fields[
                                        csv_field].solr_field_multivalued:
                                    codes_en = []
                                    codes_fr = []
                                    for code_value in csv_record[
                                            csv_field].split(","):
                                        if code_value.lower(
                                        ) in self.field_codes[csv_field]:
                                            codes_en.append(
                                                self.field_codes[csv_field]
                                                [code_value.lower()].label_en)
                                            codes_fr.append(
                                                self.field_codes[csv_field]
                                                [code_value.lower()].label_fr)
                                        else:
                                            self.logger.info(
                                                "Unknown code value: {0} for field: {1}"
                                                .format(code_value, csv_field))
                                    solr_record[csv_field + '_en'] = codes_en
                                    solr_record[csv_field + '_fr'] = codes_fr
                                else:
                                    if csv_record[csv_field].lower(
                                    ) in self.field_codes[csv_field]:
                                        solr_record[csv_field +
                                                    '_en'] = self.field_codes[
                                                        csv_field][csv_record[
                                                            csv_field].lower(
                                                            )].label_en
                                        solr_record[csv_field +
                                                    '_fr'] = self.field_codes[
                                                        csv_field][csv_record[
                                                            csv_field].lower(
                                                            )].label_fr
                                    else:
                                        self.logger.info(
                                            "Unknown code value: {0} for field: {1}"
                                            .format(csv_record[csv_field],
                                                    csv_field))

                    solr_record = self.set_empty_fields(solr_record)

                    # Set the Solr ID field (Nothing To Report records are excluded)
                    if not options['nothing_to_report']:
                        if self.search_target.id_fields:
                            id_values = []
                            for id_field in self.search_target.id_fields.split(
                                    ","):
                                id_values.append(csv_record[id_field])
                            solr_record['id'] = ",".join(id_values)
                    else:
                        # NTR ids are synthesized from org + reporting period.
                        if 'month' in solr_record:
                            solr_record['id'] = "{0}-{1}-{2}".format(
                                solr_record['owner_org'], solr_record['year'],
                                solr_record['month'])
                        elif 'quarter' in solr_record:
                            solr_record['id'] = "{0}-{1}-{2}".format(
                                solr_record['owner_org'], solr_record['year'],
                                solr_record['quarter'])

                    # Call plugins if they exist for this search type. This is
                    # where a developer can introduce code to customize the
                    # data that is loaded into Solr for a particular search.
                    if search_type_plugin in self.discovered_plugins:
                        solr_record = self.discovered_plugins[
                            search_type_plugin].load_csv_record(
                                csv_record, solr_record, self.search_target,
                                self.csv_fields, self.field_codes,
                                'NTR' if options['nothing_to_report'] else '')

                    solr_items.append(solr_record)

                    # Write to Solr whenever the cycle threshold is reached
                    if cycle >= self.cycle_on:
                        # try to connect to Solr up to 10 times
                        for countdown in reversed(range(10)):
                            try:
                                solr.index(self.solr_core, solr_items)
                                print("{0} rows processed".format(total))
                                cycle = 0
                                solr_items.clear()
                                break
                            except ConnectionError as cex:
                                if not countdown:
                                    raise
                                print(
                                    "Solr error: {0}. Waiting to try again ... {1}"
                                    .format(cex, countdown))
                                time.sleep((10 - countdown) * 5)

                # Write any remaining records to Solr and commit
                if cycle > 0:
                    # try to connect to Solr up to 10 times
                    for countdown in reversed(range(10)):
                        try:
                            solr.index(self.solr_core, solr_items)
                            # BUG FIX: the old code did
                            # `total += len(solr_items)` here, double-counting
                            # the final partial batch -- every row was already
                            # counted when it was read above.  Also print
                            # `total` for consistency with the in-loop flush.
                            print("{0} rows processed".format(total))
                            cycle = 0
                            solr_items.clear()
                            break
                        except ConnectionError as cex:
                            if not countdown:
                                raise
                            print(
                                "Solr error: {0}. Waiting to try again ... {1}"
                                .format(cex, countdown))
                            time.sleep((10 - countdown) * 5)

                solr.commit(self.solr_core, softCommit=True, waitSearcher=True)
                print("Total rows processed: {0}".format(total))

        except Exception as x:
            # Top-level boundary for the management command: log and fall
            # through rather than crash the process.
            self.logger.error('Unexpected Error "{0}"'.format(x))
示例#2
0
class ClientTestIndexing(unittest.TestCase):
    """Integration tests for SolrClient indexing against a live Solr server.

    Requires a reachable Solr instance described by ``test_config``; every
    test commits and waits for the searcher to reopen, so this suite is slow
    by design.
    """

    @classmethod
    def setUpClass(cls):
        """Create the shared client and (re)create the test schema fields."""
        cls.solr = SolrClient(test_config['SOLR_SERVER'][0],
                              devel=True,
                              auth=test_config['SOLR_CREDENTIALS'])
        cls.rand_docs = RandomTestData()
        cls.docs = cls.rand_docs.get_docs(50)

        # Schema setup is best-effort: fields may or may not already exist,
        # so failures here are deliberately ignored.
        for field in test_config['collections']['copy_fields']:
            try:
                cls.solr.schema.delete_copy_field(
                    test_config['SOLR_COLLECTION'], field)
            except Exception:
                pass

        for field in test_config['collections']['fields']:
            try:
                cls.solr.schema.create_field(test_config['SOLR_COLLECTION'],
                                             field)
            except Exception:
                pass

    @staticmethod
    def _remove_temp_files():
        """Best-effort removal of the temp JSON fixtures written by tests."""
        for path in ('temp_file.json.gz', 'temp_file.json'):
            try:
                os.remove(path)
            except OSError:
                pass

    def _num_found(self, query):
        """Return the number of docs matching *query* in the test collection."""
        return len(
            self.solr.query(test_config['SOLR_COLLECTION'], {
                'q': query
            }).docs)

    def setUp(self):
        self.delete_docs()
        self.commit()

    def delete_docs(self):
        self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'], '*')
        self.commit()

    def commit(self):
        # openSearcher so queries immediately see the committed docs; the
        # sleep gives the new searcher time to warm up.
        self.solr.commit(test_config['SOLR_COLLECTION'], openSearcher=True)
        sleep(5)

    def test_delete_doc_by_id_with_space(self):
        self.delete_docs()
        self.solr.index_json(
            test_config['SOLR_COLLECTION'],
            json.dumps([{
                'id': 'potato potato',
                'product_name': 'potato'
            }]))
        self.commit()
        self.assertEqual(1, self._num_found('id:"potato potato"'))
        self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'],
                                   "potato potato")
        self.commit()
        self.assertEqual(0, self._num_found('id:"potato potato"'))
        self.delete_docs()

    def test_delete_doc_by_query(self):
        self.delete_docs()
        self.solr.index_json(
            test_config['SOLR_COLLECTION'],
            json.dumps([{
                'id': 'potato potato',
                'product_name': 'potato'
            }]))
        self.commit()
        self.assertEqual(1, self._num_found('id:"potato potato"'))
        self.solr.delete_doc_by_query(test_config['SOLR_COLLECTION'],
                                      "product_name:potato")
        self.commit()
        self.assertEqual(0, self._num_found('id:"potato potato"'))
        self.delete_docs()

    @unittest.skip("Skipping for now")
    def test_access_without_auth(self):
        if not test_config['SOLR_CREDENTIALS'][0]:
            return
        solr = SolrClient(test_config['SOLR_SERVER'], devel=True)
        with self.assertRaises(ConnectionError):
            solr.query('SolrClient_unittest', {'q': 'not_gonna_happen'})

    def test_indexing_json(self):
        self.docs = self.rand_docs.get_docs(53)
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps(self.docs))
        self.commit()
        sleep(5)
        for doc in self.docs:
            logging.debug("Checking {}".format(doc['id']))
            self.assertEqual(
                self.solr.query(test_config['SOLR_COLLECTION'], {
                    'q': 'id:{}'.format(doc['id'])
                }).get_num_found(), 1)
        self.delete_docs()
        self.commit()

    def test_indexing_conn_log(self):
        self.docs = self.rand_docs.get_docs(53)
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps(self.docs))
        self.commit()
        sleep(5)
        for doc in self.docs:
            logging.debug("Checking {}".format(doc['id']))
            self.assertEqual(
                self.solr.query(test_config['SOLR_COLLECTION'], {
                    'q': 'id:{}'.format(doc['id'])
                }).get_num_found(), 1)
        logging.info(self.solr.transport._action_log)
        self.delete_docs()
        self.commit()

    def test_index_json_file(self):
        # BUG FIX: this test used to be silently shadowed by a second method
        # of the same name further down (now test_local_index_json_file), so
        # it never ran.
        self.docs = self.rand_docs.get_docs(55)
        with open('temp_file.json', 'w') as f:
            json.dump(self.docs, f)
        self.solr.stream_file(test_config['SOLR_COLLECTION'],
                              'temp_file.json')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))
        self.delete_docs()
        self.commit()
        self._remove_temp_files()

    def test_stream_file_gzip_file(self):
        self.docs = self.rand_docs.get_docs(60)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        self.solr.stream_file(test_config['SOLR_COLLECTION'],
                              'temp_file.json.gz')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))
        self.delete_docs()
        self.commit()
        self._remove_temp_files()

    @unittest.skip("Don't test remote indexing in travis")
    def test_local_index_json_file(self):
        # Renamed from test_index_json_file: the duplicate name shadowed the
        # stream_file test above.
        self.docs = self.rand_docs.get_docs(61)
        with open('temp_file.json', 'w') as f:
            json.dump(self.docs, f)
        self.solr.local_index(test_config['SOLR_COLLECTION'],
                              'temp_file.json')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))
        self.delete_docs()
        self.commit()
        self._remove_temp_files()

    def test_paging_query_with_rows(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        self.solr.stream_file(test_config['SOLR_COLLECTION'],
                              'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],
                                          {'q': '*:*'},
                                          rows=50):
            self.assertTrue(len(res.docs) == 50)
            docs.extend(res.docs)
            queries += 1
        self.assertEqual(
            [x['id'] for x in sorted(docs, key=lambda x: x['id'])],
            [x['id'] for x in sorted(self.docs, key=lambda x: x['id'])])
        self.assertEqual(1000 // 50, queries)
        self.delete_docs()
        self.commit()
        self._remove_temp_files()

    def test_paging_query(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        self.solr.stream_file(test_config['SOLR_COLLECTION'],
                              'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],
                                          {'q': '*:*'}):
            self.assertTrue(len(res.docs) == 1000)
            docs.extend(res.docs)
            queries += 1
        self.assertTrue(queries == 1)
        self.assertEqual(
            [x['id'] for x in sorted(docs, key=lambda x: x['id'])],
            [x['id'] for x in sorted(self.docs, key=lambda x: x['id'])])
        self.delete_docs()
        self.commit()
        self._remove_temp_files()

    def test_paging_query_with_max(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        self.solr.stream_file(test_config['SOLR_COLLECTION'],
                              'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],
                                          {'q': '*:*'},
                                          rows=50,
                                          max_start=502):
            self.assertTrue(len(res.docs) == 50)
            queries += 1
            docs.extend(res.docs)
        # NOTE(review): this ids-membership check is tautological (ids is
        # built from docs itself) -- it only verifies every doc has an 'id'.
        ids = [x['id'] for x in docs]
        for item in docs:
            self.assertTrue(item['id'] in ids)

        self.assertEqual(11, queries)
        self.delete_docs()
        self.commit()
        self._remove_temp_files()

    def test_cursor_query(self):
        self.docs = self.rand_docs.get_docs(2000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        self.solr.stream_file(test_config['SOLR_COLLECTION'],
                              'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []

        for res in self.solr.cursor_query(test_config['SOLR_COLLECTION'], {
                'q': '*:*',
                'rows': 100
        }):
            self.assertTrue(len(res.docs) == 100)
            queries += 1
            docs.extend(res.docs)

        # NOTE(review): tautological membership check, as above -- it only
        # verifies every doc has an 'id'.
        ids = [x['id'] for x in docs]
        for item in docs:
            self.assertTrue(item['id'] in ids)

        self.delete_docs()
        self.commit()
        self._remove_temp_files()