def test_cannot_union_mixed_document_corpora_by_meta_data(self): a = track.DocumentCorpus( "test", documents=[ track.Documents( source_format=track.Documents.SOURCE_FORMAT_BULK, number_of_documents=5, target_index="logs-01"), ], meta_data={"with-metadata": False}) b = track.DocumentCorpus( "test", documents=[ track.Documents( source_format=track.Documents.SOURCE_FORMAT_BULK, number_of_documents=5, target_index="logs-02"), ], meta_data={"with-metadata": True}) with self.assertRaises(exceptions.RallyAssertionError) as ae: a.union(b) self.assertEqual( ae.exception.message, "Corpora meta-data differ: [{'with-metadata': False}] and [{'with-metadata': True}]." )
def test_cannot_union_mixed_document_corpora(self): a = track.DocumentCorpus("test", documents=[ track.Documents(source_format=track.Documents.SOURCE_FORMAT_BULK, number_of_documents=5, target_index="logs-01"), ]) b = track.DocumentCorpus("other", documents=[ track.Documents(source_format=track.Documents.SOURCE_FORMAT_BULK, number_of_documents=5, target_index="logs-02"), ]) with self.assertRaisesRegex(exceptions.RallyAssertionError, "Both document corpora must have the same name"): a.union(b)
def test_union_document_corpora_is_symmetric(self): a = track.DocumentCorpus("test", documents=[ track.Documents(source_format=track.Documents.SOURCE_FORMAT_BULK, number_of_documents=5, target_index="logs-01"), ]) b = track.DocumentCorpus("test", documents=[ track.Documents(source_format=track.Documents.SOURCE_FORMAT_BULK, number_of_documents=5, target_index="logs-02"), ]) self.assertEqual(b.union(a), a.union(b)) self.assertEqual(2, len(a.union(b).documents))
def test_filter_documents_by_format_and_indices(self): corpus = track.DocumentCorpus( "test", documents=[ track.Documents( source_format=track.Documents.SOURCE_FORMAT_BULK, number_of_documents=5, target_index="logs-01"), track.Documents( source_format=track.Documents.SOURCE_FORMAT_BULK, number_of_documents=6, target_index="logs-02"), track.Documents( source_format=track.Documents.SOURCE_FORMAT_BULK, number_of_documents=7, target_index="logs-03"), track.Documents( source_format=track.Documents.SOURCE_FORMAT_BULK, number_of_documents=8, target_index=None) ]) filtered_corpus = corpus.filter( source_format=track.Documents.SOURCE_FORMAT_BULK, target_indices=["logs-01", "logs-02"]) self.assertEqual("test", filtered_corpus.name) self.assertEqual(2, len(filtered_corpus.documents)) self.assertEqual("logs-01", filtered_corpus.documents[0].target_index) self.assertEqual("logs-02", filtered_corpus.documents[1].target_index)
def test_filter_documents_by_data_streams(self): corpus = track.DocumentCorpus( "test", documents=[ track.Documents( source_format=track.Documents.SOURCE_FORMAT_BULK, number_of_documents=5, target_data_stream="logs-01"), track.Documents(source_format="other", number_of_documents=6, target_data_stream="logs-02"), track.Documents( source_format=track.Documents.SOURCE_FORMAT_BULK, number_of_documents=7, target_data_stream="logs-03"), track.Documents(source_format=None, number_of_documents=8, target_data_stream=None) ]) filtered_corpus = corpus.filter(target_data_streams=["logs-02"]) self.assertEqual("test", filtered_corpus.name) self.assertEqual(1, len(filtered_corpus.documents)) self.assertEqual("logs-02", filtered_corpus.documents[0].target_data_stream)
def test_do_not_filter(self): corpus = track.DocumentCorpus( "test", documents=[ track.Documents( source_format=track.Documents.SOURCE_FORMAT_BULK, number_of_documents=5, target_index="logs-01"), track.Documents(source_format="other", number_of_documents=6, target_index="logs-02"), track.Documents( source_format=track.Documents.SOURCE_FORMAT_BULK, number_of_documents=7, target_index="logs-03"), track.Documents(source_format=None, number_of_documents=8, target_index=None) ], meta_data={"average-document-size-in-bytes": 12}) filtered_corpus = corpus.filter() self.assertEqual(corpus.name, filtered_corpus.name) self.assertListEqual(corpus.documents, filtered_corpus.documents) self.assertDictEqual(corpus.meta_data, filtered_corpus.meta_data)
def test_filter_documents_by_indices(self): corpus = track.DocumentCorpus( "test", documents=[ track.Documents( source_format=track.Documents.SOURCE_FORMAT_BULK, number_of_documents=5, target_index="logs-01"), track.Documents(source_format="other", number_of_documents=6, target_index="logs-02"), track.Documents( source_format=track.Documents.SOURCE_FORMAT_BULK, number_of_documents=7, target_index="logs-03"), track.Documents(source_format=None, number_of_documents=8, target_index=None), ], ) filtered_corpus = corpus.filter(target_indices=["logs-02"]) assert filtered_corpus.name == "test" assert len(filtered_corpus.documents) == 1 assert filtered_corpus.documents[0].target_index == "logs-02"
def test_union_document_corpus_is_reflexive(self): corpus = track.DocumentCorpus("test", documents=[ track.Documents(source_format=track.Documents.SOURCE_FORMAT_BULK, number_of_documents=5, target_index="logs-01"), track.Documents(source_format=track.Documents.SOURCE_FORMAT_BULK, number_of_documents=6, target_index="logs-02"), track.Documents(source_format=track.Documents.SOURCE_FORMAT_BULK, number_of_documents=7, target_index="logs-03"), track.Documents(source_format=track.Documents.SOURCE_FORMAT_BULK, number_of_documents=8, target_index=None) ]) self.assertTrue(corpus.union(corpus) is corpus)
def test_cannot_union_mixed_document_corpora_by_name(self): a = track.DocumentCorpus( "test", documents=[ track.Documents( source_format=track.Documents.SOURCE_FORMAT_BULK, number_of_documents=5, target_index="logs-01"), ]) b = track.DocumentCorpus( "other", documents=[ track.Documents( source_format=track.Documents.SOURCE_FORMAT_BULK, number_of_documents=5, target_index="logs-02"), ]) with self.assertRaises(exceptions.RallyAssertionError) as ae: a.union(b) self.assertEqual(ae.exception.message, "Corpora names differ: [test] and [other].")
def test_do_not_filter(self): corpus = track.DocumentCorpus("test", documents=[ track.Documents(source_format=track.Documents.SOURCE_FORMAT_BULK, number_of_documents=5, target_index="logs-01"), track.Documents(source_format="other", number_of_documents=6, target_index="logs-02"), track.Documents(source_format=track.Documents.SOURCE_FORMAT_BULK, number_of_documents=7, target_index="logs-03"), track.Documents(source_format=None, number_of_documents=8, target_index=None) ]) filtered_corpus = corpus.filter() self.assertEqual(corpus.name, filtered_corpus.name) self.assertListEqual(corpus.documents, filtered_corpus.documents)
def test_cannot_union_mixed_document_corpora_by_name(self): a = track.DocumentCorpus( "test", documents=[ track.Documents( source_format=track.Documents.SOURCE_FORMAT_BULK, number_of_documents=5, target_index="logs-01"), ], ) b = track.DocumentCorpus( "other", documents=[ track.Documents( source_format=track.Documents.SOURCE_FORMAT_BULK, number_of_documents=5, target_index="logs-02"), ], ) with pytest.raises(exceptions.RallyAssertionError) as exc: a.union(b) assert exc.value.message == "Corpora names differ: [test] and [other]."