Example #1
class ClientTestIndexing(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.solr = SolrClient(test_config['SOLR_SERVER'][0], devel=True,
                              auth=test_config['SOLR_CREDENTIALS'])
        cls.rand_docs = RandomTestData()
        cls.docs = cls.rand_docs.get_docs(50)

        for field in test_config['collections']['copy_fields']:
            try:
                cls.solr.schema.delete_copy_field(test_config['SOLR_COLLECTION'], field)
            except Exception:
                pass

        for field in test_config['collections']['fields']:
            try:
                cls.solr.schema.create_field(test_config['SOLR_COLLECTION'], field)
            except Exception:
                pass

    def setUp(self):
        self.delete_docs()
        self.commit()

    def delete_docs(self):
        self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'], '*')
        self.commit()

    def commit(self):
        self.solr.commit(test_config['SOLR_COLLECTION'], openSearcher=True)
        # Give the newly opened searcher a moment before the next query.
        sleep(5)

    def test_delete_doc_by_id_with_space(self):
        self.delete_docs()
        self.solr.index_json(test_config['SOLR_COLLECTION'], json.dumps(
                        [{'id': 'potato potato', 'product_name': 'potato'}]))
        self.commit()
        self.assertEqual(
            len(self.solr.query(test_config['SOLR_COLLECTION'],
                                {'q': 'id:"potato potato"'}).docs), 1)
        self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'],
                                   "potato potato")
        self.commit()
        self.assertEqual(
            len(self.solr.query(test_config['SOLR_COLLECTION'],
                                {'q': 'id:"potato potato"'}).docs), 0)
        self.delete_docs()

    @unittest.skip("Skipping for now")
    def test_access_without_auth(self):
        if not test_config['SOLR_CREDENTIALS'][0]:
            return
        solr = SolrClient(test_config['SOLR_SERVER'], devel=True)
        with self.assertRaises(ConnectionError):
            solr.query('SolrClient_unittest', {'q': 'not_gonna_happen'})


    def test_indexing_json(self):
        self.docs = self.rand_docs.get_docs(53)
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps(self.docs))
        self.commit()
        sleep(5)
        for doc in self.docs:
            logging.debug("Checking {}".format(doc['id']))
            self.assertEqual(
                self.solr.query(test_config['SOLR_COLLECTION'],
                                {'q': 'id:{}'.format(doc['id'])}).get_num_found(), 1)
        self.delete_docs()
        self.commit()

    def test_indexing_conn_log(self):
        self.docs = self.rand_docs.get_docs(53)
        self.solr.index_json(test_config['SOLR_COLLECTION'], json.dumps(self.docs))
        self.commit()
        sleep(5)
        for doc in self.docs:
            logging.debug("Checking {}".format(doc['id']))
            self.assertEqual(
                self.solr.query(test_config['SOLR_COLLECTION'],
                                {'q': 'id:{}'.format(doc['id'])}).get_num_found(), 1)
        logging.info(self.solr.transport._action_log)
        self.delete_docs()
        self.commit()

    def test_index_json_file(self):
        self.docs = self.rand_docs.get_docs(55)
        with open('temp_file.json', 'w') as f:
            json.dump(self.docs, f)
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'], 'temp_file.json')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))
        self.delete_docs()
        self.commit()
        # Remove each temp file independently so a missing file does not
        # abort the rest of the cleanup.
        for path in ('temp_file.json', 'temp_file.json.gz'):
            try:
                os.remove(path)
            except OSError:
                pass


    def test_stream_file_gzip_file(self):
        self.docs = self.rand_docs.get_docs(60)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'], 'temp_file.json.gz')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))
        self.delete_docs()
        self.commit()
        for path in ('temp_file.json', 'temp_file.json.gz'):
            try:
                os.remove(path)
            except OSError:
                pass

    @unittest.skip("Don't test remote indexing in travis")
    def test_local_index_json_file(self):
        self.docs = self.rand_docs.get_docs(61)
        with open('temp_file.json', 'w') as f:
            json.dump(self.docs, f)
        r = self.solr.local_index(test_config['SOLR_COLLECTION'], 'temp_file.json')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))
        self.delete_docs()
        self.commit()
        for path in ('temp_file.json', 'temp_file.json.gz'):
            try:
                os.remove(path)
            except OSError:
                pass

    def test_paging_query_with_rows(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],
                                          {'q': '*:*'},
                                          rows=50):
            self.assertEqual(len(res.docs), 50)
            docs.extend(res.docs)
            queries += 1
        self.assertEqual(
            [x['id'] for x in sorted(docs, key=lambda x: x['id'])],
            [x['id'] for x in sorted(self.docs, key=lambda x: x['id'])]
            )
        self.assertEqual(queries, 1000 // 50)
        self.delete_docs()
        self.commit()
        for path in ('temp_file.json', 'temp_file.json.gz'):
            try:
                os.remove(path)
            except OSError:
                pass

    def test_paging_query(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],
                                          {'q': '*:*'}):
            self.assertEqual(len(res.docs), 1000)
            docs.extend(res.docs)
            queries += 1
        self.assertEqual(queries, 1)
        self.assertEqual(
            [x['id'] for x in sorted(docs, key=lambda x: x['id'])],
            [x['id'] for x in sorted(self.docs, key=lambda x: x['id'])]
            )
        self.delete_docs()
        self.commit()
        for path in ('temp_file.json', 'temp_file.json.gz'):
            try:
                os.remove(path)
            except OSError:
                pass

    def test_paging_query_with_max(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],
                                          {'q': '*:*'},
                                          rows=50, max_start=502):
            self.assertEqual(len(res.docs), 50)
            queries += 1
            docs.extend(res.docs)
        # Every returned doc must come from the indexed source set.
        source_ids = {x['id'] for x in self.docs}

        for item in docs:
            self.assertTrue(item['id'] in source_ids)

        self.assertEqual(11, queries)
        self.delete_docs()
        self.commit()
        for path in ('temp_file.json', 'temp_file.json.gz'):
            try:
                os.remove(path)
            except OSError:
                pass


    def test_cursor_query(self):
        self.docs = self.rand_docs.get_docs(2000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []

        for res in self.solr.cursor_query(test_config['SOLR_COLLECTION'],
                                          {'q': '*:*', 'rows': 100}):
            self.assertEqual(len(res.docs), 100)
            queries += 1
            docs.extend(res.docs)

        # Every returned doc must come from the indexed source set.
        source_ids = {x['id'] for x in self.docs}

        for item in docs:
            self.assertTrue(item['id'] in source_ids)

        self.delete_docs()
        self.commit()
        for path in ('temp_file.json', 'temp_file.json.gz'):
            try:
                os.remove(path)
            except OSError:
                pass
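
For orientation, the index/commit/query round trip these tests exercise comes down to a few calls. A minimal sketch, assuming a local Solr at http://localhost:8983/solr and a placeholder collection name (neither comes from the test config above):

import json
from SolrClient import SolrClient

solr = SolrClient('http://localhost:8983/solr')  # placeholder host
collection = 'SolrClient_unittest'               # placeholder collection

# Index one document, commit so a new searcher sees it, then query it back.
solr.index_json(collection, json.dumps([{'id': 'doc-1', 'product_name': 'potato'}]))
solr.commit(collection, openSearcher=True)
res = solr.query(collection, {'q': 'id:doc-1'})
print(res.get_num_found())  # expected: 1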
Example #2
class ClientTestIndexing(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.solr = SolrClient(test_config['SOLR_SERVER'][0],
                              devel=True,
                              auth=test_config['SOLR_CREDENTIALS'])
        cls.rand_docs = RandomTestData()
        cls.docs = cls.rand_docs.get_docs(50)

        for field in test_config['collections']['copy_fields']:
            try:
                cls.solr.schema.delete_copy_field(
                    test_config['SOLR_COLLECTION'], field)
            except Exception:
                pass

        for field in test_config['collections']['fields']:
            try:
                cls.solr.schema.create_field(test_config['SOLR_COLLECTION'],
                                             field)
            except Exception:
                pass

    def setUp(self):
        self.delete_docs()
        self.commit()

    def delete_docs(self):
        self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'], '*')
        self.commit()

    def commit(self):
        self.solr.commit(test_config['SOLR_COLLECTION'], openSearcher=True)
        # Give the newly opened searcher a moment before the next query.
        sleep(5)

    def test_delete_doc_by_id_with_space(self):
        self.delete_docs()
        self.solr.index_json(
            test_config['SOLR_COLLECTION'],
            json.dumps([{
                'id': 'potato potato',
                'product_name': 'potato'
            }]))
        self.commit()
        self.assertEqual(
            len(
                self.solr.query(test_config['SOLR_COLLECTION'], {
                    'q': 'id:"potato potato"'
                }).docs), 1)
        self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'],
                                   "potato potato")
        self.commit()
        self.assertEqual(
            len(
                self.solr.query(test_config['SOLR_COLLECTION'], {
                    'q': 'id:"potato potato"'
                }).docs), 0)
        self.delete_docs()

    def test_delete_doc_by_query(self):
        self.delete_docs()
        self.solr.index_json(
            test_config['SOLR_COLLECTION'],
            json.dumps([{
                'id': 'potato potato',
                'product_name': 'potato'
            }]))
        self.commit()
        self.assertEqual(
            len(
                self.solr.query(test_config['SOLR_COLLECTION'], {
                    'q': 'id:"potato potato"'
                }).docs), 1)
        self.solr.delete_doc_by_query(test_config['SOLR_COLLECTION'],
                                      "product_name:potato")
        self.commit()
        self.assertEqual(
            len(
                self.solr.query(test_config['SOLR_COLLECTION'], {
                    'q': 'id:"potato potato"'
                }).docs), 0)
        self.delete_docs()

    @unittest.skip("Skipping for now")
    def test_access_without_auth(self):
        if not test_config['SOLR_CREDENTIALS'][0]:
            return
        solr = SolrClient(test_config['SOLR_SERVER'], devel=True)
        with self.assertRaises(ConnectionError):
            solr.query('SolrClient_unittest', {'q': 'not_gonna_happen'})

    def test_indexing_json(self):
        self.docs = self.rand_docs.get_docs(53)
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps(self.docs))
        self.commit()
        sleep(5)
        for doc in self.docs:
            logging.debug("Checking {}".format(doc['id']))
            self.assertEqual(
                self.solr.query(test_config['SOLR_COLLECTION'], {
                    'q': 'id:{}'.format(doc['id'])
                }).get_num_found(), 1)
        self.delete_docs()
        self.commit()

    def test_indexing_conn_log(self):
        self.docs = self.rand_docs.get_docs(53)
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps(self.docs))
        self.commit()
        sleep(5)
        for doc in self.docs:
            logging.debug("Checking {}".format(doc['id']))
            self.assertEqual(
                self.solr.query(test_config['SOLR_COLLECTION'], {
                    'q': 'id:{}'.format(doc['id'])
                }).get_num_found(), 1)
        logging.info(self.solr.transport._action_log)
        self.delete_docs()
        self.commit()

    def test_index_json_file(self):
        self.docs = self.rand_docs.get_docs(55)
        with open('temp_file.json', 'w') as f:
            json.dump(self.docs, f)
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))
        self.delete_docs()
        self.commit()
        # Remove each temp file independently so a missing file does not
        # abort the rest of the cleanup.
        for path in ('temp_file.json', 'temp_file.json.gz'):
            try:
                os.remove(path)
            except OSError:
                pass

    def test_stream_file_gzip_file(self):
        self.docs = self.rand_docs.get_docs(60)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json.gz')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))
        self.delete_docs()
        self.commit()
        for path in ('temp_file.json', 'temp_file.json.gz'):
            try:
                os.remove(path)
            except OSError:
                pass

    @unittest.skip("Don't test remote indexing in travis")
    def test_local_index_json_file(self):
        self.docs = self.rand_docs.get_docs(61)
        with open('temp_file.json', 'w') as f:
            json.dump(self.docs, f)
        r = self.solr.local_index(test_config['SOLR_COLLECTION'],
                                  'temp_file.json')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))
        self.delete_docs()
        self.commit()
        for path in ('temp_file.json', 'temp_file.json.gz'):
            try:
                os.remove(path)
            except OSError:
                pass

    def test_paging_query_with_rows(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],
                                          {'q': '*:*'},
                                          rows=50):
            self.assertEqual(len(res.docs), 50)
            docs.extend(res.docs)
            queries += 1
        self.assertEqual(
            [x['id'] for x in sorted(docs, key=lambda x: x['id'])],
            [x['id'] for x in sorted(self.docs, key=lambda x: x['id'])])
        self.assertEqual(queries, 1000 // 50)
        self.delete_docs()
        self.commit()
        for path in ('temp_file.json', 'temp_file.json.gz'):
            try:
                os.remove(path)
            except OSError:
                pass

    def test_paging_query(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],
                                          {'q': '*:*'}):
            self.assertEqual(len(res.docs), 1000)
            docs.extend(res.docs)
            queries += 1
        self.assertEqual(queries, 1)
        self.assertEqual(
            [x['id'] for x in sorted(docs, key=lambda x: x['id'])],
            [x['id'] for x in sorted(self.docs, key=lambda x: x['id'])])
        self.delete_docs()
        self.commit()
        for path in ('temp_file.json', 'temp_file.json.gz'):
            try:
                os.remove(path)
            except OSError:
                pass

    def test_paging_query_with_max(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],
                                          {'q': '*:*'},
                                          rows=50,
                                          max_start=502):
            self.assertEqual(len(res.docs), 50)
            queries += 1
            docs.extend(res.docs)
        # Every returned doc must come from the indexed source set.
        source_ids = {x['id'] for x in self.docs}

        for item in docs:
            self.assertTrue(item['id'] in source_ids)

        self.assertEqual(11, queries)
        self.delete_docs()
        self.commit()
        for path in ('temp_file.json', 'temp_file.json.gz'):
            try:
                os.remove(path)
            except OSError:
                pass

    def test_cursor_query(self):
        self.docs = self.rand_docs.get_docs(2000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []

        for res in self.solr.cursor_query(test_config['SOLR_COLLECTION'], {
                'q': '*:*',
                'rows': 100
        }):
            self.assertEqual(len(res.docs), 100)
            queries += 1
            docs.extend(res.docs)

        # Every returned doc must come from the indexed source set.
        source_ids = {x['id'] for x in self.docs}

        for item in docs:
            self.assertTrue(item['id'] in source_ids)

        self.delete_docs()
        self.commit()
        for path in ('temp_file.json', 'temp_file.json.gz'):
            try:
                os.remove(path)
            except OSError:
                pass
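
The two iteration helpers above differ in mechanism: paging_query walks start/rows windows (hence the max_start cutoff), while cursor_query uses Solr's cursorMark deep paging, which stays cheap on large result sets. A minimal consumption sketch with a placeholder collection name; Solr's cursorMark requires the sort to include the unique key, passed here explicitly as in Example #3 below:

total = 0
for res in solr.cursor_query('SolrClient_unittest',
                             {'q': '*:*', 'rows': 100, 'sort': 'id asc'}):
    total += len(res.docs)  # each res is one page of the result set
print(total)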
Example #3
#recuperaHierarquiaAssuntos(dfJT_2017_2018)

####################################################################################################################################
# =============================================================================
# FULL CORPUS
# =============================================================================
dicionarioFinal = corpora.Dictionary()  # start with an empty dictionary
start_time = time.time()
listaProcessada = []
# Remove any stale output file left over from a previous run.
if os.path.exists('./Data/corpus/listaProcessadaFinal_' + nomeDataSet +
                  '_CorpusCompleto.csv'):
    os.remove('./Data/corpus/listaProcessadaFinal_' + nomeDataSet +
              '_CorpusCompleto.csv')
# Stream the full result set with a Solr cursor, 100 documents per batch.
for resCursor in solr.cursor_query(nomeCore, {
        'q': query,
        'rows': '100',
        'fl': 'tx_conteudo_documento',
        'sort': 'id asc'
}):
    # Tokenize each document in parallel, then fold the batch vocabulary
    # into the running dictionary.
    listaProcessada = Parallel(n_jobs=7)(
        delayed(processa_texto)(documento.get('tx_conteudo_documento'))
        for documento in resCursor.docs)
    dicionarioParcial = corpora.Dictionary(listaProcessada)
    dicionarioFinal.merge_with(dicionarioParcial)
    with open(
            './Data/corpus/listaProcessadaFinal_' + nomeDataSet +
            '_CorpusCompleto.csv', "a") as fp:
        wr = csv.writer(fp, dialect='excel')
        for row in listaProcessada:
            wr.writerow(row)
end_time = time.time() - start_time
print('Text processing time: ' + str(timedelta(seconds=end_time)))
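
With the loop finished, dicionarioFinal holds the vocabulary accumulated over every batch. A minimal sketch of the usual next step, applied here only to the last batch still held in listaProcessada (the full corpus would be re-read from the CSV written above):

# Convert token lists into bag-of-words vectors with the merged dictionary.
bow_corpus = [dicionarioFinal.doc2bow(tokens) for tokens in listaProcessada]
print(len(dicionarioFinal))  # accumulated vocabulary size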