def split_input(cls, job_config):
    """Inherit docs.

    When the query filters do not call for property-range sharding, the
    parent implementation is used unchanged. Otherwise every shard is given
    one (property range, namespace range) pair:

    * explicit namespace: the property range is split ``shard_count`` ways
      and each piece is paired with a copy of the single-namespace range;
    * few namespaces: the property range is split and each piece is paired
      with a default ``NamespaceRange`` for the app;
    * many namespaces: the namespace space is split and each piece is
      paired with a copy of the whole property range.

    Returns:
      A list of readers, one per pair, or None when no namespace was
      specified and no namespace keys were found.
    """
    reader_params = job_config.input_reader_params
    num_shards = job_config.shard_count
    spec = cls._get_query_spec(reader_params)

    if not property_range.should_shard_by_property_range(spec.filters):
        return super(ModelDatastoreInputReader, cls).split_input(job_config)

    whole_p_range = property_range.PropertyRange(
        spec.filters, spec.model_class_path)
    p_ranges = whole_p_range.split(num_shards)

    if spec.ns:
        # The user pinned a namespace: every shard covers that one only.
        pinned_ns = namespace_range.NamespaceRange(
            namespace_start=spec.ns,
            namespace_end=spec.ns,
            _app=spec.app)
        ns_ranges = [copy.copy(pinned_ns) for _ in p_ranges]
    else:
        # Fetch one key more than the threshold so that "more than the
        # threshold" can be told apart from "exactly the threshold".
        ns_keys = namespace_range.get_namespace_keys(
            spec.app, cls.MAX_NAMESPACES_FOR_KEY_SHARD + 1)
        if not ns_keys:
            return None

        if len(ns_keys) > cls.MAX_NAMESPACES_FOR_KEY_SHARD:
            # Lots of namespaces: shard by namespace instead, and let each
            # shard scan the full property range.
            ns_ranges = namespace_range.NamespaceRange.split(
                n=num_shards,
                contiguous=False,
                can_query=lambda: True,
                _app=spec.app)
            p_ranges = [copy.copy(whole_p_range) for _ in ns_ranges]
        else:
            # No namespace given but there are only a few of them: keep
            # the property-range split.
            ns_ranges = [
                namespace_range.NamespaceRange(_app=spec.app)
                for _ in p_ranges
            ]

    assert len(p_ranges) == len(ns_ranges)

    # Build every iterator first, then wrap each one in a reader.
    make_iter = db_iters.RangeIteratorFactory.create_property_range_iterator
    iters = []
    for prop_piece, ns_piece in zip(p_ranges, ns_ranges):
        iters.append(make_iter(prop_piece, ns_piece, spec))

    return [cls(range_iter) for range_iter in iters]
def testSplitWithNoNamespacesInDatastoreWithContiguous(self):
    """A contiguous 10-way split yields one default range for the app.

    This test creates no namespaces before splitting.
    """
    expected = [namespace_range.NamespaceRange(_app=self.app_id)]
    actual = namespace_range.NamespaceRange.split(
        10,
        contiguous=True,
        can_query=lambda: True,
        _app=self.app_id)
    self.assertEqual(expected, actual)
def testSplitWithoutQueriesWithContiguous(self):
    """A contiguous 4-way split with can_query always False."""
    # (namespace_start, namespace_end) of each expected range, in order.
    bounds = [
        ('', 'abc'),
        ('ac', 'bb'),
        ('bba', 'caa'),
        ('cab', 'ccc'),
    ]
    expected = [
        namespace_range.NamespaceRange(
            namespace_start=start, namespace_end=end, _app=self.app_id)
        for start, end in bounds
    ]
    actual = namespace_range.NamespaceRange.split(
        4,
        contiguous=True,
        can_query=lambda: False,
        _app=self.app_id)
    self.assertEqual(expected, actual)
def testQueryPaging(self):
    """Iterating a NamespaceRange pages through namespace key queries.

    Three mocked queries are recorded, each run with limit=3. The first two
    return full pages and hand out a cursor for the next query. The last
    returns a single key and is not asked for a cursor. Iterating the range
    must produce 7 items in total.
    """
    self.mox.StubOutClassWithMocks(datastore, 'Query')
    ns_range = namespace_range.NamespaceRange(
        namespace_start='a', namespace_end='b', _app=self.app_id)
    ns_kind = '__namespace__'

    def ns_key(ns):
        return db.Key.from_path(ns_kind, ns)

    filters = {'__key__ >= ': ns_key('a'), '__key__ <= ': ns_key('b')}

    def expect_query(cursor):
        return datastore.Query(
            ns_kind,
            filters=filters,
            keys_only=True,
            cursor=cursor,
            _app=self.app_id)

    # (cursor passed to the query, namespaces returned, cursor handed back).
    # A handed-back cursor of None means GetCursor() is not expected.
    pages = [
        (None, ['a', 'aa', 'aaa'], 'c1'),
        ('c1', ['aab', 'ab', 'ac'], 'c2'),
        ('c2', ['b'], None),
    ]
    for start_cursor, names, next_cursor in pages:
        query = expect_query(start_cursor)
        query.Run(limit=3).AndReturn([ns_key(name) for name in names])
        if next_cursor is not None:
            query.GetCursor().AndReturn(next_cursor)

    self.mox.ReplayAll()
    self.assertEqual(7, len(list(ns_range)))
def testKeyRangesFromNSRange(self):
    """Key ranges built from the namespace range "0".."5".

    Entities are created in namespaces "1", "3" and "5"; one KeyRange per
    namespace is expected.
    """
    ns_names = ["1", "3", "5"]
    self.create_entities_in_multiple_ns(ns_names)

    span = namespace_range.NamespaceRange("0", "5", _app=self.app)
    kranges = key_ranges.KeyRangesFactory.create_from_ns_range(span)

    expected = [
        key_range.KeyRange(namespace=name, _app=self.app)
        for name in ("1", "3", "5")
    ]
    self._assertEqualsAndSerialize(expected, kranges)
def testSplitWithOnlyDefaultNamespaceWithContiguous(self):
    """A contiguous split after creating only the '' namespace.

    A single range from '' to 'ccc' is expected.
    """
    self.CreateInNamespace('')

    expected = [
        namespace_range.NamespaceRange(
            namespace_start='', namespace_end='ccc', _app=self.app_id),
    ]
    actual = namespace_range.NamespaceRange.split(
        10,
        contiguous=True,
        can_query=lambda: True,
        _app=self.app_id)
    self.assertEqual(expected, actual)
def _create_iter(self, entity_kind):
    """Build a property-range iterator for ``entity_kind``.

    The iterator uses ``self.filters`` and a namespace range running from
    the first to the last entry of ``self.namespaces``.

    Args:
      entity_kind: model class path; its short name becomes the query's
        entity kind.

    Returns:
      The iterator produced by
      ``RangeIteratorFactory.create_property_range_iterator``.
    """
    spec = model.QuerySpec(
        entity_kind=util.get_short_name(entity_kind),
        batch_size=10,
        filters=self.filters,
        model_class_path=entity_kind)
    prop_span = property_range.PropertyRange(self.filters, entity_kind)
    ns_span = namespace_range.NamespaceRange(
        self.namespaces[0], self.namespaces[-1])
    return db_iters.RangeIteratorFactory.create_property_range_iterator(
        prop_span, ns_span, spec)
def testSplitWithInfiniteQueriesLargerSplitThanNamespaces(self):
    """Split 6 namespaces into 10 ranges; expect one range per namespace."""
    names = ['a', 'aa', 'aab', 'b', 'bac', 'cca']
    for name in names:
        self.CreateInNamespace(name)

    # Each expected range starts and ends on the same namespace.
    expected = [
        namespace_range.NamespaceRange(
            namespace_start=name, namespace_end=name)
        for name in names
    ]
    actual = namespace_range.NamespaceRange.split(
        10, contiguous=False, can_query=lambda: True)
    self.assertEqual(expected, actual)
def testSplitWithInfiniteQueriesSmallerSplitThanNamespaces(self):
    """Split 6 namespaces into 3 ranges of 2 namespaces each."""
    # The data is contiguous (although contiguous splitting is not what is
    # being tested). Because the mid-point is rounded down, 'aa' and 'aba'
    # are skipped so that each range starts and ends exactly on a created
    # namespace and contains 2 items.
    for name in ('a', 'aaa', 'aab', 'aac', 'ab', 'abb'):
        self.CreateInNamespace(name)

    bounds = [('a', 'aaa'), ('aab', 'aac'), ('ab', 'abb')]
    expected = [
        namespace_range.NamespaceRange(
            namespace_start=start, namespace_end=end)
        for start, end in bounds
    ]
    actual = namespace_range.NamespaceRange.split(
        3, contiguous=False, can_query=lambda: True)
    self.assertEqual(expected, actual)
def _split_input_from_params(cls, app, namespaces, entity_kind_name, params,
                             shard_count):
    """Split input like the parent class, but never return no readers.

    Delegates to the parent implementation. If that yields nothing, a
    single reader over a default ``NamespaceRange`` is returned instead.

    Returns:
      The parent's readers when there are any, otherwise a one-element
      list holding the fallback reader.
    """
    readers = super(ConsistentKeyReader, cls)._split_input_from_params(
        app, namespaces, entity_kind_name, params, shard_count)
    if readers:
        return readers

    # At least one namespace range is always produced because:
    # a) there might be unapplied entities
    # b) it simplifies mapper code
    # NOTE(review): batch_size is given shard_count here, and the fallback
    # NamespaceRange is built without the app -- confirm both are intended.
    fallback = cls(
        entity_kind_name,
        key_ranges=None,
        ns_range=namespace_range.NamespaceRange(),
        batch_size=shard_count)
    return [fallback]
def testSplitWithInfiniteQueriesWithContiguous(self):
    """A contiguous 10-way split over 6 created namespaces."""
    for name in ('a', 'aa', 'aab', 'b', 'bac', 'cca'):
        self.CreateInNamespace(name)

    # (namespace_start, namespace_end) of each expected range, in order.
    bounds = [
        ('', 'a'),
        ('aa', 'aaa'),
        ('aab', 'acc'),
        ('b', 'bab'),
        ('bac', 'cc'),
        ('cca', 'ccc'),
    ]
    expected = [
        namespace_range.NamespaceRange(
            namespace_start=start, namespace_end=end)
        for start, end in bounds
    ]
    actual = namespace_range.NamespaceRange.split(
        10, contiguous=True, can_query=lambda: True)
    self.assertEqual(expected, actual)
def testFromJsonObjectWithApp(self):
    """from_json_object rebuilds a range, including its app, from a dict."""
    expected = namespace_range.NamespaceRange('a', 'b', _app='myapp')
    json_obj = dict(namespace_start='a', namespace_end='b', app='myapp')
    restored = namespace_range.NamespaceRange.from_json_object(json_obj)
    self.assertEqual(expected, restored)
def testToJsonObject(self):
    """to_json_object on an app-less range yields only start and end."""
    expected = dict(namespace_start='a', namespace_end='b')
    ns_range = namespace_range.NamespaceRange('a', 'b')
    self.assertEqual(expected, ns_range.to_json_object())