def testReadFromThreeShards(self):
  recordio = RecordIOShard.create("test", hi=("1",))
  recordio.insert(("0", STRING + "a"))
  recordio.commit()
  recordio = RecordIOShard.create("test", lo=("1",), hi=("3",))
  recordio.insert(("1", STRING + "b"))
  recordio.insert(("2", STRING + "c"))
  recordio.commit()
  recordio = RecordIOShard.create("test", lo=("3",))
  recordio.insert(("3", STRING + "d"))
  recordio.commit()
  reader = RecordIOReader("test")
  self.assertEqual([("0", "a"), ("1", "b"), ("2", "c"), ("3", "d")],
                   list(reader))
  self.assertEqual([("0", "a"), ("1", "b"), ("2", "c"), ("3", "d")],
                   list(reader.read(start_key="0")))
  self.assertEqual([("0", "a"), ("1", "b"), ("2", "c"), ("3", "d")],
                   list(reader.read(end_key="4")))
  self.assertEqual([("1", "b"), ("2", "c"), ("3", "d")],
                   list(reader.read(start_key="1")))
  self.assertEqual([("2", "c"), ("3", "d")],
                   list(reader.read(start_key="2")))
  self.assertEqual([("0", "a"), ("1", "b")],
                   list(reader.read(end_key="2")))
  self.assertEqual([("1", "b"), ("2", "c")],
                   list(reader.read(start_key="1", end_key="3")))
  self.assertEqual([("1", "b")],
                   list(reader.read(start_key="1", end_key="2")))
def all_names():
  """Yields the names of all existing RecordIOs.

  :return: Yields the RecordIO names.
  """
  for x in RecordIOShard.all(keys_only=True).filter("index =", True):
    yield RecordIOShard.get_name(x.name())
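# A minimal usage sketch (assumes this module's logging import): list every
# RecordIO that currently exists in the datastore.
for recordio_name in all_names():
  logging.info("Existing RecordIO: %s" % recordio_name)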
def write2MBAndReplace(self, compressed):
  test_string = test_helper.uncompressableString(2**21)
  updater = RecordIOWriter("test")
  updater.create(compressed=compressed)
  updater.insert("test", test_string)
  updater.commit_sync()
  output = []
  entries = 0
  shards_count = 0
  for recordio in RecordIOShard.all():
    self.assertTrue(len(recordio.data) >= 1000)
    shards_count += 1
    for entry in recordio:
      output += [entry[-1]]
      entries += 1
  self.assertTrue(shards_count > 1)
  self.assertTrue(entries > 3)
  self.assertEqual("".join(output), STRING + test_string, "read != write")
  updater.insert("test", "short")
  updater.commit_sync(retries=0)
  replaced_shards_count = 0
  for recordio in RecordIOShard.all():
    if replaced_shards_count == 0:
      self.assertEqual(1, len(recordio))
      for entry in recordio:
        self.assertEqual(STRING + "short", entry[-1])
    else:
      self.assertEqual(0, len(recordio))
      for entry in recordio:
        self.fail("shouldn't be iterable")
    replaced_shards_count += 1
    self.assertTrue(len(recordio.data) < 1000)
  self.assertTrue(replaced_shards_count > 0)
  self.assertTrue(replaced_shards_count <= shards_count)
def create(self, compressed=True, pre_split=None):
  """Creates a RecordIO in datastore. If the RecordIO exists, nothing happens.

  :param compressed: Boolean whether the data in the RecordIO should be
      gzipped.
  :param pre_split: An optional list of keys that should be used to
      pre-split the internal data shards. This only makes sense if you are
      going to write a lot of data, you already know the key range of the
      data, and you know roughly how many entries fit into one shard.
  :return: True, if the RecordIO didn't exist before.
  """
  self.db_search += 1
  if RecordIOShard.get_all_query(self.name, keys_only=True).get() is None:
    # Avoid the mutable default argument pitfall and don't mutate the
    # caller's list.
    pre_split = sorted(pre_split or [])
    self.db_put += 1
    split = [None] + [(x,) for x in pre_split] + [None]
    split = [(split[i], split[i + 1]) for i in xrange(len(split) - 1)]
    for lo, hi in split:
      index = None
      if lo is None:
        index = True
      RecordIOShard.get_or_insert(
          RecordIOShard.key_name(self.name, lo=lo, hi=hi),
          compressed=compressed, index=index)
    return True
  return False
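# A minimal usage sketch; the name "logs" and the split keys are made-up
# examples. Pre-splitting creates one shard per key interval up front, so a
# large initial write doesn't have to split shards on the fly.
writer = RecordIOWriter("logs")
created = writer.create(compressed=True, pre_split=["g", "n", "t"])
# created is False if a RecordIO named "logs" already existed.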
def testShardNamesForKeysMissingLo(self):
  recordio_hi = RecordIOShard.create("test", lo="1")
  recordio_hi.insert(("1", "b"))
  recordio_hi.insert(("2", "c"))
  recordio_hi.commit()
  self.assertEqual(
      {None: [("0",)]},
      self.getResult(RecordIOShard.get_shards_for_key_values(
          "test", [("0",)])))
def testKeyName(self):
  name = RecordIOShard.key_name("te|st", ("b|b",), ("d|d",))
  self.assertEqual("te%7Cst!0!647c64!0000000000!0000000001!0000000000" +
                   "!627c62!0000000000!0000000001!0000000000", name)
  recordio = RecordIOShard.create("te|st", ("b|b", 0, 1, 0), ("d|d",))
  self.assertEqual("te|st", recordio.name())
  self.assertEqual((("b|b", 0, 1, 0), ("d|d", 0, 1, 0)), recordio.lo_hi())
def testShardNamesForKeysEmpty(self):
  recordio = RecordIOShard.create("test")
  recordio.insert(("0", "a"))
  recordio.insert(("1", "b"))
  recordio.insert(("2", "c"))
  recordio.commit()
  self.assertEqual(
      {RecordIOShard.key_name("test"): [("",)]},
      self.getResult(RecordIOShard.get_shards_for_key_values(
          "test", [("",)])))
def commit_shard_(self, shard_name, key_values):
  """Adds key values to a shard and splits the shard if necessary.

  :param shard_name: The key name of the RecordIOShard.
  :param key_values: A list of key values to be added.
  :return: A (not_deleted, lo_shard_lo_hi, hi_shard_lo_hi) tuple, where
      not_deleted is the list of keys that still need to be deleted in
      other shards, and the lo_hi ranges are None unless the shard was
      split.
  """
  shard = RecordIOShard.get_by_key_name(shard_name)
  self.db_get += 1
  if shard is None:
    raise RecordIOShardDoesNotExistError(shard_name)
  for entry in key_values:
    shard.insert(entry)
  try:
    shard.commit()
    self.db_put += 1
    return shard.not_deleted(), None, None
  except (RecordIOShardTooBigError, RequestTooLargeError,
          ValueError, ArgumentError, BadRequestError):
    # The shard overflowed: replace it with two halves and commit both.
    shard.delete()
    lo_shard, hi_shard = shard.split()
    lo_shard.commit()
    hi_shard.commit()
    self.db_put += 2
    logging.debug("Split\n%s\n%s\n%s" % (shard.key().name(),
                                         lo_shard.key().name(),
                                         hi_shard.key().name()))
    return shard.not_deleted(), lo_shard.lo_hi(), hi_shard.lo_hi()
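# A hedged caller sketch ("commit_one_chunk", "writer", "shard_name" and
# "chunk" are illustrative names, not part of the library): on success
# commit_shard_ returns (not_deleted, None, None); if the shard was too big
# it is replaced by two halves whose (lo, hi) ranges are returned so later
# chunks can be re-routed, as commit_sync does further below.
def commit_one_chunk(writer, shard_name, chunk):
  not_deleted, lo_split, hi_split = writer.commit_shard_(shard_name, chunk)
  if lo_split and hi_split:
    # The shard split; route the next chunk via RecordIOShard.key_name
    # using whichever of the two returned (lo, hi) ranges matches.
    pass
  return not_deleted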
def testReadFromInexistingLoShards(self):
  recordio_hi = RecordIOShard.create("test", lo="1")
  recordio_hi.insert(("1", STRING + "b"))
  recordio_hi.insert(("2", STRING + "c"))
  recordio_hi.commit()
  reader = RecordIOReader("test")
  self.assertRaises(RecordIOShardDoesNotExistError, self.readAll, reader)
def testReadSplitEntries(self):
  recordio = RecordIOShard.create("test", compressed=False)
  recordio.insert(("a", STRING + "a"))
  recordio.insert(("b", 0, 1, 1, STRING + "b"))
  recordio.insert(("c", STRING + "c"))
  recordio.insert(("d", 0, 2, 1, STRING + "1"))
  recordio.insert(("d", 1, 2, 1, "2"))
  recordio.insert(("e", 0, 3, 1, STRING + "1"))
  recordio.insert(("e", 1, 3, 1, "2"))
  recordio.insert(("e", 2, 3, 1, "3"))
  recordio.insert(("f", STRING + "f"))
  recordio.insert(("g", 0, 2, 2, STRING + "1"))
  recordio.insert(("g", 1, 2, 1, "bad"))
  recordio.insert(("g", 1, 2, 2, "2"))
  recordio.insert(("g_missing_1", 0, 3, 1, STRING + "bad"))
  recordio.insert(("g_missing_1", 1, 3, 1, "bad"))
  recordio.insert(("g_missing_2", 1, 2, 1, "bad"))
  recordio.insert(("h", STRING + "h"))
  recordio.commit()
  reader = RecordIOReader("test")
  self.assertEqual([("a", "a"), ("b", "b"), ("c", "c"), ("d", "12"),
                    ("e", "123"), ("f", "f"), ("g", "12"), ("h", "h")],
                   list(reader.read()))
  self.assertEqual(["g_missing_1"], reader.get_not_read())
def insertGetAndOrder(self, compressed):
  recordio = RecordIOShard.create("test", compressed=compressed)
  recordio.insert(("a", "a"))
  test_strings = self.getStrings()
  assert len(test_strings) > 1
  random.shuffle(test_strings)
  for x in test_strings:
    recordio.insert((x, x))
  self.assertEqual(len(test_strings), len(recordio))
  for x in test_strings:
    recordio.insert((x, "".join(reversed(x))))
  self.assertEqual(len(test_strings), len(recordio))
  for i in range(0, len(test_strings), 500):
    x = test_strings[i]
    self.assertTrue(x in recordio)
    self.assertEqual(recordio[(x,)], "".join(reversed(x)))
  test_strings = self.getStrings()
  i = 0
  for key, value in recordio:
    self.assertEqual(test_strings[i], key)
    self.assertEqual("".join(reversed(test_strings[i])), value)
    i += 1
  assert "not_in" not in test_strings
  self.assertFalse("not_in" in recordio)
def testTaskQueue(self):
  writer = RecordIOWriter("test")
  writer.create(compressed=False)
  test_value = test_helper.uncompressableString(MAX_ENTRY_SIZE - 1)
  entries_to_write = MAX_BLOB_SIZE / MAX_ENTRY_SIZE + 1
  for i in range(entries_to_write):
    writer.insert(str(i), test_value)
  writer.commit_async()
  taskq = self.testbed.get_stub(testbed.TASKQUEUE_SERVICE_NAME)
  tasks = taskq.GetTasks("recordio-writer")
  for task in tasks:
    url = task["url"]
    args = urlparse.parse_qs(base64.b64decode(task["body"]))
    for x in args:
      args[x] = args[x][0]
    test_helper.requestGet(WriteHandler(), url, args)
  assert len([x for x in RecordIOShard.all()]) > 1
  reader = RecordIOReader("test")
  result = {}
  for key, value in reader:
    result[key] = value
  self.assertEqual(len(result), entries_to_write)
  for i in range(entries_to_write):
    self.assertEqual(result[str(i)], test_value, "Not equal")
def delete(self):
  """Deletes a RecordIO.

  Deleting while the RecordIO is being modified or while queued writes are
  still being applied may result in errors.
  """
  db.delete(RecordIOShard.get_all_query(self.name, keys_only=True))
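# A minimal usage sketch: drop all shards of a RecordIO ("obsolete_data" is
# a made-up name). Assumes no concurrent writers or queued writes, per the
# caveat in the docstring above.
writer = RecordIOWriter("obsolete_data")
writer.delete()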
def commit_to_queue_(self):
  """Adds all pending changes to the task queues for async commits.

  :return: Yields all shard names that need to be updated.
  """
  pull = taskqueue.Queue('recordio-queue')
  rpcs = []
  key_values_not_added = RecordIORecords()
  for shard_name, key_values in RecordIOShard.get_shards_for_key_values(
      self.name, self.updates):
    self.db_search += 1
    if shard_name is None:
      # No shard covers these keys yet; keep them for the next attempt.
      for entry in key_values:
        key_values_not_added.insert(entry)
    else:
      for key_values_chunk in get_chunks(key_values,
                                         MAX_TASKQUEUE_BATCH_SIZE):
        payload = marshal.dumps(key_values_chunk, MARSHAL_VERSION)
        rpc = pull.add_async(taskqueue.Task(payload=payload,
                                            method='PULL',
                                            tag=shard_name))
        rpcs.append((rpc, key_values_chunk, shard_name))
  for rpc, key_values, shard_name in rpcs:
    try:
      rpc.get_result()
      yield shard_name
    except:
      # The task could not be enqueued; re-schedule its entries.
      for entry in key_values:
        key_values_not_added.insert(entry)
  self.updates = key_values_not_added
  if len(self.updates):
    raise RecordIOWriterNotCompletedError(len(self.updates))
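# A hedged caller sketch: commit_to_queue_ is the internal half of
# commit_async, called directly here only to illustrate the contract. Every
# yielded shard name still needs a write task scheduled, and entries that
# could not be enqueued stay in self.updates and raise
# RecordIOWriterNotCompletedError.
writer = RecordIOWriter("my_recordio")
writer.insert("key", "value")
try:
  for shard_name in writer.commit_to_queue_():
    pass  # Schedule a "recordio-writer" task for shard_name here.
except RecordIOWriterNotCompletedError:
  pass  # Some entries were not enqueued; retry later.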
def testReadFromInexistingHiShards(self):
  recordio_lo = RecordIOShard.create("test", hi="1")
  recordio_lo.insert(("0", STRING + "a"))
  recordio_lo.commit()
  reader = RecordIOReader("test")
  self.assertRaises(RecordIOShardDoesNotExistError, self.readAll, reader)
def testGetAllQuery(self):
  RecordIOShard.create("test", hi=("a", "")).commit()
  RecordIOShard.create("test", lo=("a", ""), hi=("b", "")).commit()
  RecordIOShard.create("test", lo=("b", "")).commit()
  self.assertEqual(
      [(None, ("a", 0, 1, 0)),
       (("a", 0, 1, 0), ("b", 0, 1, 0)),
       (("b", 0, 1, 0), None)],
      [RecordIOShard.lo_hi_from_key(x.name())
       for x in RecordIOShard.get_all_query("test", keys_only=True)])
def testSplitEntriesSplit(self):
  recordio = RecordIOShard.create("test", compressed=False)
  recordio.insert(("b", 0, 3, 3, "bb"))
  recordio.insert(("b", 1, 3, 3, "bb"))
  recordio.insert(("b", 2, 3, 3, "bb"))
  lo_record, hi_record = recordio.split()
  self.assertEqual((None, ("b", 2, 3, 3)), lo_record.lo_hi())
  self.assertEqual((("b", 2, 3, 3), None), hi_record.lo_hi())
def testWriteDuringSplit(self):
  recordio = RecordIOShard.create("test", compressed=False)
  recordio.insert(("1", STRING + "1"))
  recordio.insert(("2", STRING + "2"))
  lo_shard, hi_shard = recordio.split()
  lo_shard.commit()
  updater = RecordIOWriter("test")
  updater.insert("3", "3")
  self.assertRaises(RecordIOShardDoesNotExistError,
                    updater.commit_shard_, hi_shard.key().name(),
                    updater.updates)
  self.assertRaises(RecordIOWriterNotCompletedError,
                    updater.commit_sync, 32, 0)
  hi_shard.commit()
  updater.insert("0", STRING + "0")
  updater.commit_sync()
  lo_shard, hi_shard = [x for x in RecordIOShard.all()]
  self.assertEqual([x[0] for x in lo_shard], ["0", "1"])
  self.assertEqual([x[0] for x in hi_shard], ["2", "3"])
def testCommitToQueue(self):
  updater = RecordIOWriter("test")
  updater.create()
  chunk_size = MAX_ENTRY_SIZE - 1
  entries_to_write = MAX_TASKQUEUE_BATCH_SIZE / MAX_ENTRY_SIZE + 1
  for i in xrange(entries_to_write):
    updater.insert(str("%09d" % i),
                   test_helper.uncompressableString(chunk_size))
  list(updater.commit_to_queue_())
  pull = taskqueue.Queue('recordio-queue')
  tasks = list(pull.lease_tasks(60, 100))
  self.assertEqual(len(tasks), 2)
  self.assertEqual(tasks[0].tag, RecordIOShard.key_name("test"))
  self.assertEqual(tasks[1].tag, RecordIOShard.key_name("test"))
  updates_0 = marshal.loads(tasks[0].payload)
  updates_1 = marshal.loads(tasks[1].payload)
  self.assertEqual([str("%09d" % x) for x in xrange(entries_to_write)],
                   [x[0] for x in updates_0] + [x[0] for x in updates_1])
  self.assertTrue(updates_0[0][1] ==
                  STRING + test_helper.uncompressableString(chunk_size))
def testReadStringMarshalPickle(self):
  recordio = RecordIOShard.create("test")
  recordio.insert(("string", STRING + "string"))
  marshalable = {"a": [1, 2, 3, u"asd"]}
  recordio.insert(("marshal", MARSHAL + marshal.dumps(marshalable)))
  pickleable = AnyClass()
  recordio.insert(("cpickle", CPICKLE + cPickle.dumps(pickleable)))
  recordio.commit()
  reader = RecordIOReader("test")
  self.assertEqual([("cpickle", pickleable),
                    ("marshal", marshalable),
                    ("string", "string")],
                   list(reader))
def testShardNamesForShorterKeys(self):
  RecordIOShard.create("test", hi=("a", "")).commit()
  RecordIOShard.create("test", lo=("a", "")).commit()
  self.assertEqual(
      {RecordIOShard.key_name("test", lo=("a", "")): [("aa",)]},
      self.getResult(RecordIOShard.get_shards_for_key_values(
          "test", [("aa",)])))
def read_entries_(self, start_key=None, end_key=None):
  """An internal helper function to read split entries.

  :param start_key: An entry tuple (no value needed).
  :param end_key: An entry tuple (no value needed). Exclusive.
  :return: Yields key, split_values.
  """
  # TODO(andrin): fetch a couple of shards instead of just one based on a
  # method argument.
  current_key = start_key
  if current_key is None:
    current_key = ("",)
  limit_shard_name = RecordIOShard.key_name(
      self.name, lo=start_key, hi=end_key).split(SPLIT_CHAR)
  while True:
    shard = RecordIOShard.get_shards_for_key_values(
        self.name, [current_key], keys_only=False).next()[0]
    self.db_search_and_get += 1
    if shard is None:
      raise RecordIOShardDoesNotExistError(self.name)
    hi = shard.lo_hi()[1]
    shard_name = shard.key().name().split(SPLIT_CHAR)
    if (shard_name[6:10] >= limit_shard_name[6:10] and
        (shard_name[2:5] < limit_shard_name[2:5] or
         limit_shard_name[2] == SPLIT_CHAR_AFTER)):
      # Read the whole shard.
      for entry in shard:
        yield entry
    else:
      # Read parts of the shard.
      for entry in shard.read(current_key, end_key):
        yield entry
    if hi is None:
      # Was the last shard.
      return
    current_key = hi
    if (end_key is not None and
        RecordIORecords.entry_comperator(current_key, end_key) >= 0):
      # Next shard is after end_key.
      return
def testCommitToQueueAndScheduleWrite(self):
  updater = RecordIOWriter("test")
  updater.create()
  updater.insert("a", "")
  updater.commit_async()
  taskq = self.testbed.get_stub(testbed.TASKQUEUE_SERVICE_NAME)
  tasks = taskq.GetTasks("recordio-writer")
  self.assertEqual(len(tasks), 1)
  self.assertEqual(tasks[0]["url"], "/recordio/write")
  self.assertEqual(base64.b64decode(tasks[0]["body"]),
                   "taskqueue=" + urllib.quote(
                       RecordIOShard.key_name("test")))
def writeOneShard(self, compressed):
  updater = RecordIOWriter("test")
  updater.create(compressed=compressed)
  updater.insert("1", "foo")
  updater.insert("2", "bar")
  updater.commit_sync()
  updater = RecordIOWriter("test")
  updater.insert("3", "win")
  updater.remove("2")
  updater.commit_sync()
  recordio = RecordIOShard.all().get()
  self.assertEqual(recordio.compressed, compressed)
  self.assertEqual([x for x in recordio],
                   [("1", STRING + "foo"), ("3", STRING + "win")])
def testShardNamesForKeysSplit(self):
  recordio = RecordIOShard.create("test")
  test_strings = [str(x) for x in range(10)]
  for x in test_strings:
    recordio.insert((x, test_helper.uncompressableString(2**16)))
  recordio.commit()
  self.assertEqual(
      {RecordIOShard.key_name("test"): [("0", ""), ("1", "")]},
      self.getResult(RecordIOShard.get_shards_for_key_values(
          "test", [("0", ""), ("1", "")])))
  recordio.delete()
  shard_0, shard_1 = recordio.split()
  shard_1, shard_2 = shard_1.split()
  shard_0.commit()
  shard_1.commit()
  shard_2.commit()
  self.assertEqual(
      {shard_0.key().name(): [("0", "0"), ("1", "1"), ("2", "2"),
                              ("3", "3"), ("4", "4")],
       shard_1.key().name(): [("5", "5"), ("6", "6"), ("7", "7")],
       shard_2.key().name(): [("8", "8"), ("9", "9")]},
      self.getResult(RecordIOShard.get_shards_for_key_values(
          "test", zip(test_strings, test_strings))))
def testCommitToQueueSplitEntries(self):
  chunk_size = MAX_ENTRY_SIZE + 1
  test_string = test_helper.uncompressableString(chunk_size)
  updater = RecordIOWriter("test")
  updater.create()
  updater.insert("test", test_string)
  list(updater.commit_to_queue_())
  pull = taskqueue.Queue('recordio-queue')
  tasks = list(pull.lease_tasks(60, 100))
  self.assertEqual(len(tasks), 1)
  self.assertEqual(tasks[0].tag, RecordIOShard.key_name("test"))
  updates = marshal.loads(tasks[0].payload)
  self.assertEqual([("test", 0, 2), ("test", 1, 2)],
                   [x[:-2] for x in updates])
  self.assertEqual(STRING + test_string,
                   "".join([x[-1] for x in updates]))
def testWriteStringMarshalPickle(self):
  updater = RecordIOWriter("test")
  updater.create()
  updater.insert("string", "string")
  marshalable = {"a": [1, 2, 3]}
  updater.insert("marshal", marshalable)

  class AnyClass():
    pass

  pickleable = AnyClass()
  updater.insert("cpickle", pickleable)
  updater.commit_sync()
  recordio = RecordIOShard.all().get()
  self.assertEqual([x for x in recordio],
                   [("cpickle", CPICKLE + cPickle.dumps(pickleable)),
                    ("marshal", MARSHAL + marshal.dumps(marshalable)),
                    ("string", STRING + "string")])
def testSplit(self):
  recordio = RecordIOShard.create("test")
  test_strings = ["c", "a", "b", "d", "e"]
  for x in test_strings:
    recordio.insert((x, test_helper.uncompressableString(ZIP_CHUNKS)))
  lo_record, hi_record = recordio.split()
  self.assertEqual(3, len(lo_record))
  self.assertEqual(2, len(hi_record))
  for x in test_strings:
    self.assertTrue(x in lo_record or x in hi_record)
  self.assertTrue(max(lo_record) < min(hi_record))
  self.assertEqual("test", lo_record.name())
  self.assertEqual((None, ("d", 0, 1, 0)), lo_record.lo_hi())
  self.assertEqual(["a", "b", "c"], [x[0] for x in lo_record])
  self.assertEqual("test", hi_record.name())
  self.assertEqual((("d", 0, 1, 0), None), hi_record.lo_hi())
  self.assertEqual(["d", "e"], [x[0] for x in hi_record])
def readFromnOneShard(self, compressed):
  recordio = RecordIOShard.create("test", compressed=compressed)
  recordio.insert(("0", STRING + "a"))
  recordio.insert(("1", STRING + "b"))
  recordio.insert(("2", STRING + "c"))
  recordio.commit()
  reader = RecordIOReader("test")
  self.assertEqual([("0", "a"), ("1", "b"), ("2", "c")], list(reader))
  self.assertEqual([("0", "a"), ("1", "b"), ("2", "c")],
                   list(reader.read(start_key="0")))
  self.assertEqual([("0", "a"), ("1", "b"), ("2", "c")],
                   list(reader.read(end_key="3")))
  self.assertEqual([("1", "b"), ("2", "c")],
                   list(reader.read(start_key="1")))
  self.assertEqual([("0", "a"), ("1", "b")],
                   list(reader.read(end_key="2")))
  self.assertEqual([("1", "b")],
                   list(reader.read(start_key="1", end_key="2")))
  self.assertTrue("0" in reader)
  self.assertFalse("3" in reader)
  self.assertEqual(reader["0"], "a")
def commit_batch(self, tag, batch):
  """Applies a batch of values to a RecordIO and deletes the taskqueue tasks.

  :param tag: The current tag we are working on.
  :param batch: A list of (taskqueue_task, key_value_list) tuples.
  :return: True on success.
  """
  if batch:
    done_tasks = []
    count = 0
    writer = RecordIOWriter(RecordIOShard.get_name(tag))
    for done_task, key_values in batch:
      done_tasks.append(done_task)
      for entry in key_values:
        writer.insert_entry_(entry)
        count += 1
    try:
      writer.commit_sync(retries=1)
      try:
        self.pull.delete_tasks(done_tasks)
      except taskqueue.BadTaskStateError:
        # Bulk delete failed; fall back to deleting tasks one by one.
        for task in done_tasks:
          if task.was_deleted:
            continue
          try:
            self.pull.delete_tasks(task)
          except taskqueue.BadTaskStateError:
            logging.debug("RecordIO Failed to free task %s on %s" %
                          (task.name, tag))
      logging.debug("RecordIO wrote %d entries to %s" %
                    (count, writer.name))
    except RecordIOWriterNotCompletedError:
      # The write failed; release the leases so the tasks get retried.
      logging.debug("RecordIO not completed on: %s" % tag)
      for task in done_tasks:
        self.pull.modify_task_lease(task, 0)
      return False
  return True
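# A hedged handler sketch ("lease_and_commit_one_tag" and "handler" are
# illustrative names, not part of the library; lease_tasks_by_tag is part
# of the App Engine taskqueue API): lease tasks that share one tag, i.e.
# one shard, and apply them as a single batch.
def lease_and_commit_one_tag(handler):
  # With no explicit tag, lease_tasks_by_tag uses the oldest task's tag.
  tasks = handler.pull.lease_tasks_by_tag(60, 100)
  if tasks:
    batch = [(task, marshal.loads(task.payload)) for task in tasks]
    return handler.commit_batch(tasks[0].tag, batch)
  return True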
def commit_sync(self, retries=32, retry_timeout=1):
  """Applies all changes synchronously to the RecordIO.

  :param retries: How many times a commit_sync should be retried in case
      of datastore collisions.
  :param retry_timeout: The number of seconds to wait before the next
      retry.
  """
  if not len(self.updates):
    return
  for attempt in range(retries + 1):
    shard_does_not_exist = RecordIORecords()
    for shard_name, key_values in RecordIOShard.get_shards_for_key_values(
        self.name, self.updates):
      self.db_search += 1
      if shard_name is None and key_values:
        logging.debug("RecordIO %s: No shard found for:\n%s -> %s" %
                      (self.name,
                       SPLIT_CHAR.join(
                           RecordIOShard.entry_key(key_values[0])),
                       key_values[0][:-1]))
        for entry in key_values:
          shard_does_not_exist.insert(entry)
      else:
        lo_just_split = None
        hi_just_split = None
        for key_values_chunk in get_chunks(key_values,
                                           MAX_WRITE_BATCH_SIZE):
          if lo_just_split and hi_just_split and key_values_chunk:
            # The previous chunk split the shard; decide which half the
            # next chunk belongs to.
            if RecordIORecords.in_range(key_values_chunk[0],
                                        lo=lo_just_split[0],
                                        hi=lo_just_split[1]):
              shard_name = RecordIOShard.key_name(self.name,
                                                  lo=lo_just_split[0],
                                                  hi=lo_just_split[1])
            elif RecordIORecords.in_range(key_values_chunk[0],
                                          lo=hi_just_split[0],
                                          hi=hi_just_split[1]):
              shard_name = RecordIOShard.key_name(self.name,
                                                  lo=hi_just_split[0],
                                                  hi=hi_just_split[1])
          not_deleted = None
          try:
            not_deleted, lo_just_split, hi_just_split = self.commit_shard_(
                shard_name, key_values_chunk)
          except RecordIOShardDoesNotExistError:
            logging.debug("Shard does not exist:\n" + shard_name)
            lo_just_split = None
            hi_just_split = None
            for entry in key_values_chunk:
              shard_does_not_exist.insert(entry)
          if not_deleted:
            for to_delete_shard_name, to_delete_key_values in (
                RecordIOShard.get_shards_for_key_values(
                    self.name, not_deleted)):
              self.db_search += 1
              try:
                self.commit_shard_(to_delete_shard_name,
                                   to_delete_key_values)
              except RecordIOShardDoesNotExistError:
                logging.debug("Shard does not exist:\n" +
                              to_delete_shard_name)
                for entry in to_delete_key_values:
                  shard_does_not_exist.insert(entry)
    self.updates = shard_does_not_exist
    if len(self.updates):
      if attempt == retries:
        raise RecordIOWriterNotCompletedError(len(self.updates))
      else:
        logging.debug("Commit attempt %d failed" % attempt)
        time.sleep(retry_timeout)
    else:
      return
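# A minimal usage sketch ("my_recordio" is a made-up name): write entries
# and apply them synchronously; entries that could not be committed after
# all retries surface as RecordIOWriterNotCompletedError.
writer = RecordIOWriter("my_recordio")
writer.create(compressed=False)
writer.insert("greeting", "hello world")
try:
  writer.commit_sync(retries=8, retry_timeout=1)
except RecordIOWriterNotCompletedError:
  logging.error("%d entries could not be written" % len(writer.updates))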
def testShardNamesForKeysNone(self):
  self.assertEqual(
      {None: [("0", ""), ("1", "")]},
      self.getResult(RecordIOShard.get_shards_for_key_values(
          "test", [("0", ""), ("1", "")])))