def test_array_scalar_load2(self):
    # Test sub arrays with documents as elements: every document stores a
    # 2x2 nested list of sub-documents under 'x' (keys a/b in the first
    # inner list, c/d in the second).
    raw_docs = []
    for i in range(2, 4):
        son_doc = bson.SON([
            ('x', [
                [bson.SON([('a', i), ('b', i)]),
                 bson.SON([('a', -i), ('b', -i)])],
                [bson.SON([('c', i), ('d', i)]),
                 bson.SON([('c', -i), ('d', -i)])],
            ]),
        ])
        raw_docs.append(
            bson._dict_to_bson(son_doc, False, bson.DEFAULT_CODEC_OPTIONS))

    # dtype: field 'x' is a 2x2 array of (a, b) int32 records.
    pair_dtype = np.dtype(([('a', 'int32'), ('b', 'int32')], 2))
    grid_dtype = np.dtype((pair_dtype, 2))
    dtype = np.dtype([('x', grid_dtype)])

    # Reference array for this dtype. It is built but never compared,
    # because the conversion below is asserted to fail.
    rows = []
    for i in range(2, 4):
        rows.append([([(i, i), (-i, -i)],), ([(i, i), (-i, -i)],)])
    ndarray = np.array(rows, dtype)

    # The conversion must be rejected with this message.
    with self.assertRaisesPattern(bsonnumpy.error,
                                  r'unsupported BSON type: unknown'):
        bsonnumpy.sequence_to_ndarray(raw_docs, dtype, 2)
def test_incorrect_sub_dtype4(self):
    # Sub document not a document: 'q' first holds a scalar, then a list.
    # Both variants must produce the same error.
    expected_error = ("invalid document: expected subdoc from dtype,"
                      " got other type")
    for bad_q in (10, [10, 11, 12]):
        bad_doc = bson.SON([("x", bson.SON([("y", 0), ("z", 0)])),
                            ("q", bad_q)])
        encoded = bson._dict_to_bson(
            bad_doc, False, bson.DEFAULT_CODEC_OPTIONS)
        # Nine documents from the fixture followed by the bad one.
        bad_raw_docs = self.raw_docs[:9]
        bad_raw_docs.append(encoded)
        with self.assertRaisesPattern(bsonnumpy.error, expected_error):
            bsonnumpy.sequence_to_ndarray(bad_raw_docs, self.dtype_sub, 10)
def test_null(self):
    # A None value (encoded as BSON Null) must be reported as an
    # unsupported BSON type.
    data = bson._dict_to_bson({"x": None}, True,
                              bson.DEFAULT_CODEC_OPTIONS)
    void_dtype = np.dtype([('x', '<V10')])
    with self.assertRaisesPattern(bsonnumpy.error,
                                  r'unsupported BSON type: Null'):
        bsonnumpy.sequence_to_ndarray(iter([data]), void_dtype, 1)
def test_array_scalar_load4(self):
    # Test documents with multiple levels of sub documents: every document
    # stores a 2x2 nested list of sub-documents under 'x'.
    raw_docs = []
    for i in range(10):
        son_doc = bson.SON([
            ('x', [
                [bson.SON([('a', i), ('b', i)]),
                 bson.SON([('a', -i), ('b', -i)])],
                [bson.SON([('c', i), ('d', i)]),
                 bson.SON([('c', -i), ('d', -i)])],
            ]),
        ])
        raw_docs.append(
            bson._dict_to_bson(son_doc, False, bson.DEFAULT_CODEC_OPTIONS))

    # dtype: 'x' is a 2x2 array of records whose 'a' and 'b' fields are
    # themselves single-field ('q') records.
    leaf_dtype = np.dtype([('q', 'int32')])
    pair_dtype = np.dtype(([('a', leaf_dtype), ('b', leaf_dtype)], 2))
    grid_dtype = np.dtype((pair_dtype, 2))
    dtype = np.dtype([('x', grid_dtype)])

    # The conversion must be rejected with this message.
    with self.assertRaisesPattern(bsonnumpy.error,
                                  r'unsupported BSON type: unknown'):
        bsonnumpy.sequence_to_ndarray(raw_docs, dtype, 4)
def raw_bson_func(use_large):
    """Convert the collection selected by *use_large* from raw batches.

    The collection name and the dtype are looked up with ``use_large`` in
    the module-level ``collection_names`` and ``dtypes`` mappings. The
    resulting ndarray is discarded. If the collection object has no
    ``find_raw_batches`` attribute, a message is printed and nothing is
    converted.
    """
    coll = db[collection_names[use_large]]
    if hasattr(coll, 'find_raw_batches'):
        dtype = dtypes[use_large]
        bsonnumpy.sequence_to_ndarray(
            coll.find_raw_batches(), dtype, coll.count())
    else:
        print("Wrong PyMongo: no 'find_raw_batches' feature")
def bson_numpy_func(use_large):
    """Convert the collection selected by *use_large* document by document.

    The collection is opened with ``RawBSONDocument`` as its document
    class, and the ``raw`` attribute of each document from ``find()`` is
    fed to ``bsonnumpy.sequence_to_ndarray``. The resulting ndarray is
    discarded.
    """
    name = collection_names[use_large]
    raw_options = CodecOptions(document_class=RawBSONDocument)
    raw_coll = db.get_collection(name, codec_options=raw_options)

    cursor = raw_coll.find()
    dtype = dtypes[use_large]
    raw_documents = (doc.raw for doc in cursor)
    bsonnumpy.sequence_to_ndarray(raw_documents, dtype, raw_coll.count())
def _test_error(self, value, bson_type_name, codes):
    # Encode {'x': value} once, then check that converting it with each
    # dtype code in `codes` fails with "cannot convert <type> to dtype".
    data = bson._dict_to_bson({'x': value}, True,
                              bson.DEFAULT_CODEC_OPTIONS)
    expected = "cannot convert %s to dtype" % bson_type_name
    for code in codes:
        field_dtype = np.dtype([('x', code)])
        with self.assertRaisesPattern(bsonnumpy.error, expected):
            # A fresh iterator is needed for every attempt.
            bsonnumpy.sequence_to_ndarray(iter([data]), field_dtype, 1)
def test(self):
    # `value`, `dtype` and `type_name` are not defined in this block; they
    # come from the surrounding scope.
    data = bson._dict_to_bson(
        {"a": value},
        True,  # check_keys
        bson.DEFAULT_CODEC_OPTIONS)
    field_dtype = np.dtype([("a", dtype)])

    with self.assertRaises(bsonnumpy.error) as context:
        bsonnumpy.sequence_to_ndarray([data], field_dtype, 1)

    # The error message must name the unsupported BSON type.
    message = str(context.exception)
    self.assertIn("unsupported BSON type: %s" % type_name, message)
def test_incorrect_sub_dtype_array7(self):
    # Sub array too short: each inner list has two strings.
    x_rows = [['d' * 1, 'd' * 2], ['d' * 4, 'd' * 5]]
    y_rows = [['d' * 7, 'd' * 8], ['d' * 10, 'd' * 11]]
    bad_doc = bson.SON([("x", x_rows), ("y", y_rows)])
    encoded = bson._dict_to_bson(
        bad_doc, False, bson.DEFAULT_CODEC_OPTIONS)

    # Three documents from the fixture followed by the bad one.
    bad_raw_docs = self.raw_docs[:3]
    bad_raw_docs.append(encoded)

    with self.assertRaisesPattern(
            bsonnumpy.error,
            "invalid document: array is of incorrect length"):
        bsonnumpy.sequence_to_ndarray(bad_raw_docs, self.dtype, 4)
def test_incorrect_sub_dtype3(self):
    # Sub document missing key: the 'x' sub-document carries 'bad'
    # where the 'q' sub-document carries 'y'.
    x_sub = bson.SON([("bad", 0), ("z", 0)])
    q_sub = bson.SON([("y", 0), ("z", 0)])
    bad_doc = bson.SON([("x", x_sub), ("q", q_sub)])
    encoded = bson._dict_to_bson(
        bad_doc, False, bson.DEFAULT_CODEC_OPTIONS)

    # Nine documents from the fixture followed by the bad one.
    bad_raw_docs = self.raw_docs[:9]
    bad_raw_docs.append(encoded)

    with self.assertRaisesPattern(bsonnumpy.error,
                                  "document does not match dtype"):
        bsonnumpy.sequence_to_ndarray(bad_raw_docs, self.dtype_sub, 10)
def test_incorrect_sub_dtype_array1(self):
    # Top document missing key: the second field is named 'bad_key'.
    x_rows = [['d' * 1, 'd' * 2, 'd' * 3],
              ['d' * 4, 'd' * 5, 'd' * 6]]
    other_rows = [['d' * 7, 'd' * 7, 'd' * 9],
                  ['d' * 10, 'd' * 11, 'd' * 12]]
    bad_doc = bson.SON([("x", x_rows), ("bad_key", other_rows)])
    encoded = bson._dict_to_bson(
        bad_doc, False, bson.DEFAULT_CODEC_OPTIONS)

    # Three documents from the fixture followed by the bad one.
    bad_raw_docs = self.raw_docs[:3]
    bad_raw_docs.append(encoded)

    with self.assertRaisesPattern(bsonnumpy.error,
                                  "document does not match dtype"):
        bsonnumpy.sequence_to_ndarray(bad_raw_docs, self.dtype, 4)
def test_incorrect_sub_dtype_array2(self):
    # Top-level array not array: 'y' holds a plain string.
    x_rows = [['d' * 1, 'd' * 2, 'd' * 3],
              ['d' * 4, 'd' * 5, 'd' * 6]]
    bad_doc = bson.SON([("x", x_rows), ("y", 'not an array')])
    encoded = bson._dict_to_bson(
        bad_doc, False, bson.DEFAULT_CODEC_OPTIONS)

    # Three documents from the fixture followed by the bad one.
    bad_raw_docs = self.raw_docs[:3]
    bad_raw_docs.append(encoded)

    expected_error = (
        "invalid document: expected list from dtype, got other type")
    with self.assertRaisesPattern(bsonnumpy.error, expected_error):
        bsonnumpy.sequence_to_ndarray(bad_raw_docs, self.dtype, 4)
def raw_bson_func(use_large):
    """Convert the collection selected by *use_large* from raw batches.

    All batches from ``find(raw_batches=True)`` are collected into a list
    before the conversion; the resulting ndarray is discarded. If
    ``find`` rejects the ``raw_batches`` keyword, a message is printed
    and nothing is converted. Any other ``TypeError`` is re-raised.
    """
    coll = db[collection_names[use_large]]
    try:
        batches = list(coll.find(raw_batches=True))
    except TypeError as exc:
        if "unexpected keyword argument 'raw_batches'" not in str(exc):
            raise
        print("Wrong PyMongo: no 'raw_batches' feature")
        return

    bsonnumpy.sequence_to_ndarray(batches, dtypes[use_large], coll.count())
def test_dimensions_limit(self):
    # Make a deeply-nested dtype([('x', dtype([('x', dtype([('x', ...
    def wrap(inner):
        # One more level of nesting around `inner`.
        return np.dtype([('x', inner)])

    nested = np.dtype([('y', np.int32)])
    for _ in range(31):
        nested = wrap(nested)

    # The innermost dtype plus 31 wrappers is accepted: no error.
    bsonnumpy.sequence_to_ndarray([], nested, 0)

    # One more level is rejected.
    too_deep = wrap(nested)
    with self.assertRaisesPattern(bsonnumpy.error, r'exceeds 32 levels'):
        bsonnumpy.sequence_to_ndarray([], too_deep, 0)
def test_incorrect_dtype(self):
    # Dtype is named, but does not match documents
    mismatched = np.dtype([('a', np.int32), ('b', np.int32)])
    with self.assertRaisesPattern(bsonnumpy.error,
                                  r'document does not match dtype'):
        bsonnumpy.sequence_to_ndarray(self.bson_docs, mismatched, 10)

    # Dtype is not named (plain scalar), then dtype is a simple array.
    # Both must produce the same "field names" error.
    unnamed_pattern = (
        r'dtype must include field names,'
        r' like dtype\(\[\(\'fieldname\', numpy.int\)\]\)')
    for unnamed in (np.dtype(np.int32), np.dtype('(3,2)int32')):
        with self.assertRaisesPattern(bsonnumpy.error, unnamed_pattern):
            bsonnumpy.sequence_to_ndarray(self.bson_docs, unnamed, 10)

    # Dtype is null or empty
    with self.assertRaisesPattern(bsonnumpy.error,
                                  r'dtype must include field names'):
        bsonnumpy.sequence_to_ndarray(self.bson_docs, None, 1)
def test_incorrect_sub_dtype5(self):
    # Sub document extra key: conversion succeeds with dtypes that name
    # only some of the keys.

    # Only 'x' is requested, with both of its 'y' and 'z' fields.
    yz_dtype = np.dtype([('y', np.int32), ('z', np.int32)])
    only_x_dtype = np.dtype([('x', yz_dtype)])
    expected_x = np.array([((i, i), ) for i in range(10)],
                          dtype=only_x_dtype)
    converted = bsonnumpy.sequence_to_ndarray(
        self.raw_docs, only_x_dtype, 10)
    self.assertTrue(np.array_equal(expected_x, converted))

    # 'x' and 'q' are requested, each with only its 'y' field.
    y_dtype = np.dtype([('y', np.int32)])
    xq_dtype = np.dtype([('x', y_dtype), ('q', y_dtype)])
    expected_xq = np.array([((i, ), (i, )) for i in range(10)],
                           dtype=xq_dtype)
    converted = bsonnumpy.sequence_to_ndarray(self.raw_docs, xq_dtype, 10)
    self.assertTrue(np.array_equal(expected_xq, converted))
def test_incorrect_arguments(self):
    # Expects iterator, dtype, count
    iterator_required = r'sequence_to_ndarray requires an iterator'

    # (expected exception, message pattern, positional arguments)
    cases = [
        (TypeError, r'\binteger\b', (None, None, None)),
        (TypeError, iterator_required, (0, self.dtype, 0)),
        (bsonnumpy.error, r'dtype must include field names',
         (self.bson_docs, None, 10)),
        (TypeError, iterator_required, (self.dtype, self.dtype, 10)),
        (TypeError, r'function takes exactly 3 arguments \(4 given\)',
         (self.dtype, self.bson_docs, 10, 10)),
    ]
    for exc_type, pattern, args in cases:
        with self.assertRaisesPattern(exc_type, pattern):
            bsonnumpy.sequence_to_ndarray(*args)
def findAll(self, start, end):
    """Return an ndarray of the documents with ``start <= date < end``.

    Raw batches are read from ``self.colecao`` with ``limit(100)`` and
    ``batch_size(100)`` and converted using the module-level ``dtype``.
    The row count passed to the conversion is ``self.colecao.count()``
    for the same date filter.
    """
    # Read-only use of the module-level dtype; the declaration is kept so
    # that name resolution stays exactly as before.
    global dtype
    # NOTE(review): the count is not capped by the limit(100) applied to
    # the cursor -- confirm how sequence_to_ndarray handles a count larger
    # than the number of documents supplied.
    date_range = {"date": {"$gte": start, "$lt": end}}
    batches = self.colecao.find_raw_batches(date_range)
    batches = batches.limit(100).batch_size(100)
    return bsonnumpy.sequence_to_ndarray(
        batches, dtype, self.colecao.count(date_range))
def make_mixed_collection_test(self, docs, dtype):
    # Convert `docs` through raw batches of the collection returned by
    # get_cursor_sequence, then compare the ndarray with a regular find()
    # on bsonnumpy_test.coll.
    coll = self.get_cursor_sequence(docs)
    raw_batches = coll.find_raw_batches()
    ndarray = bsonnumpy.sequence_to_ndarray(raw_batches, dtype,
                                            coll.count())

    expected_dtype = np.dtype(dtype)
    reference_cursor = self.client.bsonnumpy_test.coll.find()
    self.compare_results(expected_dtype, reference_cursor, ndarray)
def test_datetime(self):
    # Datetimes read into an int64 field must equal the milliseconds
    # between the datetime and 1970-01-01, as computed by millis().
    epoch = datetime.datetime(1970, 1, 1)
    docs = [{"x": datetime.datetime(year, 1, 1)}
            for year in (1970, 1980, 1990)]
    dtype = np.dtype([('x', np.int64)])

    # Reset the collection so that it holds exactly these documents.
    coll = self.client.bsonnumpy_test.coll
    coll.delete_many({})
    coll.insert_many(docs)

    # Read the documents back as RawBSONDocument instances.
    raw_options = CodecOptions(document_class=RawBSONDocument)
    raw_coll = self.client.get_database(
        'bsonnumpy_test', codec_options=raw_options).coll
    cursor = raw_coll.find()
    ndarray = bsonnumpy.sequence_to_ndarray(
        (doc.raw for doc in cursor), dtype, raw_coll.count())

    for i, row in enumerate(ndarray):
        inserted = docs[i]["x"]
        self.assertEqual(millis(inserted - epoch), row["x"])
def test_deeply_nested_array(self):
    # arrays of length 1 are maintained when they are within another array
    #
    # Each case is (dtype fields, document items). The expected row is the
    # tuple of the document's values in field order.
    cases = [
        ([("a", "(3,2,1)int32"), ("b", "(3,2,1)int32")],
         [("a", [[[9], [9]], [[8], [8]], [[7], [7]]]),
          ("b", [[[6], [6]], [[5], [5]], [[4], [4]]])]),
        ([("a", "(3,1)int32"), ("b", "(3,1)int32"), ("c", "(3,1)int32")],
         [("a", [[9], [8], [7]]),
          ("b", [[6], [5], [4]]),
          ("c", [[3], [2], [1]])]),
        ([("a", "2int32")],
         [("a", [7, 7])]),
        ([("a", "(2,1,1,1)int32")],
         [("a", [[[[99]]], [[[88]]]])]),
    ]
    for fields, items in cases:
        dtype = np.dtype(fields)
        raw = bson._dict_to_bson(
            bson.SON(items), False, bson.DEFAULT_CODEC_OPTIONS)

        result = bsonnumpy.sequence_to_ndarray([raw], dtype, 1)

        self.assertEqual(dtype, result.dtype)
        expected_row = tuple(value for _, value in items)
        expected = np.array([expected_row], dtype)
        self.assertTrue(np.array_equal(result, expected))
def test_empty(self):
    # A raw batch whose middle document is empty must be rejected because
    # that document does not match the dtype.
    #
    # np.float64 replaces the `np.float` alias, which NumPy deprecated in
    # 1.20 and removed in 1.24 (AttributeError). The alias pointed to the
    # builtin float, so the resulting dtype is the same.
    dtype = np.dtype([('x', np.int32), ('y', np.float64)])
    batch = b''.join([
        bson.BSON.encode({"x": 1, "y": 1.1}),
        bson.BSON.encode({}),
        bson.BSON.encode({"x": 3, "y": 1.3}),
    ])
    with self.assertRaisesPattern(bsonnumpy.error,
                                  r'document does not match dtype'):
        bsonnumpy.sequence_to_ndarray([batch], dtype, 3)
def compare_seq_to_ndarray_result(self, np_type, document):
    # Convert the single `document` with dtype `np_type`, then check the
    # result's dtype and that every key of the document round-trips.
    data = bson._dict_to_bson(document, False, bson.DEFAULT_CODEC_OPTIONS)
    dtype = np.dtype(np_type)

    result = bsonnumpy.sequence_to_ndarray([data], dtype, 1)
    self.assertEqual(result.dtype, dtype)

    row = result[0]
    for key in document:
        actual = row[key]
        expected = document[key]
        failure = "Comparison failed for type %s: %s != %s" % (
            dtype, actual, expected)
        self.assertEqual(actual, expected, failure)
def findDiskUsage(self, start, end):
    """Return an ndarray for documents with ``start <= date < end``.

    The query projects ``disk_usage``. Raw batches are read from
    ``self.colecao`` with ``limit(100)`` and ``batch_size(100)`` and
    converted using the module-level ``dtype``. The row count passed to
    the conversion is ``self.colecao.count()`` for the same date filter.
    """
    # Read-only use of the module-level dtype; the declaration is kept so
    # that name resolution stays exactly as before.
    global dtype
    # NOTE(review): the count is not capped by the limit(100) applied to
    # the cursor -- confirm how sequence_to_ndarray handles a count larger
    # than the number of documents supplied.
    date_range = {"date": {"$gte": start, "$lt": end}}
    fields = {"disk_usage": True}
    batches = self.colecao.find_raw_batches(date_range, fields)
    batches = batches.limit(100).batch_size(100)
    return bsonnumpy.sequence_to_ndarray(
        batches, dtype, self.colecao.count(date_range))
def test_array_scalar_load3(self):
    # Test sub arrays with documents that have arrays: 'x' is a list of
    # two sub-documents whose 'a' and 'b' values are 4-element lists.
    raw_docs = []
    for i in range(10):
        positive = bson.SON([('a', [i, i, i, i]), ('b', [i, i, i, i])])
        negative = bson.SON([('a', [-i, -i, -i, -i]),
                             ('b', [-i, -i, -i, -i])])
        son_doc = bson.SON([('x', [positive, negative])])
        raw_docs.append(
            bson._dict_to_bson(son_doc, False, bson.DEFAULT_CODEC_OPTIONS))

    record_dtype = np.dtype(([('a', '4int32'), ('b', '4int32')], 2))
    dtype = np.dtype([('x', record_dtype)])

    # The conversion must be rejected with this message.
    with self.assertRaisesPattern(bsonnumpy.error,
                                  r'unsupported BSON type: Sub-document'):
        bsonnumpy.sequence_to_ndarray(raw_docs, dtype, 4)
def test_string_length(self):
    # The string "abc" read into void fields of width 1 to 4: shorter
    # fields hold a prefix, and the 4-byte field ends in a NUL byte.
    data = bson._dict_to_bson({"x": "abc"}, True,
                              bson.DEFAULT_CODEC_OPTIONS)
    expectations = [
        ("V1", b"a"),
        ("V2", b"ab"),
        ("V3", b"abc"),
        ("V4", b"abc\0"),
    ]
    for code, expected_bytes in expectations:
        # A fresh iterator is needed for every conversion.
        ndarray = bsonnumpy.sequence_to_ndarray(
            iter([data]), np.dtype([("x", code)]), 1)
        self.assertEqual(ndarray[0]["x"].tobytes(), expected_bytes)
def test_aggregate_raw_batches(self):
    # Raw batches from an aggregation that projects y = 2 * x must convert
    # to an int32 'y' column holding the doubled values.
    dtype = np.dtype([('y', np.int32)])
    source_docs = [{"x": i} for i in range(10)]
    doubled = [(2 * i, ) for i in range(10)]

    coll = self.get_cursor_sequence(source_docs)
    pipeline = [{'$project': {'y': {'$multiply': [2, '$x']}}}]
    batches = coll.aggregate_raw_batches(pipeline)
    ndarray = bsonnumpy.sequence_to_ndarray(batches, dtype, coll.count())

    self.assertEqual(dtype, ndarray.dtype)
    np.testing.assert_array_equal(ndarray, np.array(doubled, dtype))
def test_objectid(self):
    # ObjectIds read into a 12-byte void field must keep their binary
    # representation.
    docs = [{"x": bson.ObjectId()} for _ in range(10)]
    dtype = np.dtype([('x', '<V12')])

    # Reset the collection so that it holds exactly these documents.
    coll = self.client.bsonnumpy_test.coll
    coll.delete_many({})
    coll.insert_many(docs)

    cursor = self.client.bsonnumpy_test.coll.find_raw_batches()
    ndarray = bsonnumpy.sequence_to_ndarray(cursor, dtype, cursor.count())

    for i, row in enumerate(ndarray):
        object_id = docs[i]["x"]
        self.assertEqual(object_id.binary, row["x"].tobytes())
def test_incorrect_sub_dtype2(self):
    # Top document has extra key: the dtype names only 'y', so 'x' is not
    # part of the result.
    data = bson._dict_to_bson({"x": 12, "y": 13}, True,
                              bson.DEFAULT_CODEC_OPTIONS)

    # The builtin int replaces the `np.int` alias, which NumPy deprecated
    # in 1.20 and removed in 1.24 (AttributeError). The alias pointed to
    # the builtin int, so the resulting dtype is the same.
    ndarray = bsonnumpy.sequence_to_ndarray(
        [data], np.dtype([("y", int)]), 1)

    self.assertEqual(1, len(ndarray))
    self.assertEqual(13, ndarray[0]["y"])
    # The field left out of the dtype cannot be read back.
    with self.assertRaises(ValueError):
        ndarray[0]["x"]
def test_raw_batch(self):
    # Several BSON documents concatenated into one bytes object are
    # converted as one batch.
    #
    # np.float64 replaces the `np.float` alias, which NumPy deprecated in
    # 1.20 and removed in 1.24 (AttributeError). The alias pointed to the
    # builtin float, so the resulting dtype is the same.
    dtype = np.dtype([('x', np.int32), ('y', np.float64)])

    # A variety of lengths: the second document has an extra key that is
    # not part of the dtype.
    batch = b''.join([
        bson.BSON.encode({"x": 1, "y": 1.1}),
        bson.BSON.encode({"x": 2, "y": 1.2, "extra key": "foobar"}),
        bson.BSON.encode({"x": 3, "y": 1.3}),
    ])
    result = bsonnumpy.sequence_to_ndarray([batch], dtype, 3)
    ndarray = np.array([(1, 1.1), (2, 1.2), (3, 1.3)], dtype)
    np.testing.assert_array_equal(result, ndarray)

    dtype = np.dtype([('x', np.int32), ('y', np.float64), ('z', np.int32)])

    # A variety of orders: each document lists the same keys in a
    # different order.
    batch = b''.join([
        bson.BSON.encode(SON([("x", 1), ("y", 1.1), ("z", 4)])),
        bson.BSON.encode(SON([("x", 2), ("z", 5), ("y", 1.2)])),
        bson.BSON.encode(SON([("z", 6), ("x", 3), ("y", 1.3)])),
    ])
    result = bsonnumpy.sequence_to_ndarray([batch], dtype, 3)
    ndarray = np.array([(1, 1.1, 4), (2, 1.2, 5), (3, 1.3, 6)], dtype)
    np.testing.assert_array_equal(result, ndarray)