示例#1
0
  def create(self, compressed=True, pre_split=None):
    """Creates a RecordIO in datastore. If the RecordIO exists, nothing happens.

    :param compressed: Boolean if the data in the RecordIO should be gzipped.
    :param pre_split: An optional iterable of keys that should be used to
                      pre-split the internal data shards. This only makes
                      sense if you are going to write a lot of data and you
                      already know the key range of the data and roughly how
                      many entries fit into one shard. The argument itself is
                      never modified; a sorted copy is used.
    :return: True, if the RecordIO didn't exist before.
    """
    self.db_search += 1
    existing = RecordIOShard.get_all_query(self.name, keys_only=True).get()
    if existing is not None:
      return False
    self.db_put += 1
    # Sort a copy: sorting in place would reorder the caller's list (and, with
    # a mutable default argument, state shared between calls).
    boundaries = [None] + [(x,) for x in sorted(pre_split or [])] + [None]
    # Consecutive boundaries form the (lo, hi) range of one shard; None marks
    # an open end.
    for lo, hi in zip(boundaries[:-1], boundaries[1:]):
      # Only the shard without a lower bound gets index=True.
      index = True if lo is None else None
      RecordIOShard.get_or_insert(
          RecordIOShard.key_name(self.name, lo=lo, hi=hi),
          compressed=compressed, index=index)
    return True
示例#2
0
 def testShardNamesForShorterKeys(self):
   # Two shards meet at ("a", ""); the one-element key ("aa",) is expected to
   # be assigned to the shard whose lower bound is ("a", "").
   lower_shard = RecordIOShard.create("test", hi=("a", ""))
   lower_shard.commit()
   upper_shard = RecordIOShard.create("test", lo=("a", ""))
   upper_shard.commit()
   expected = {RecordIOShard.key_name("test", lo=("a", "")): [("aa",)]}
   lookup = RecordIOShard.get_shards_for_key_values("test", [("aa",)])
   self.assertEqual(expected, self.getResult(lookup))
示例#3
0
    def create(self, compressed=True, pre_split=None):
        """Creates a RecordIO in datastore. If the RecordIO exists, nothing happens.

        :param compressed: Boolean if the data in the RecordIO should be gzipped.
        :param pre_split: An optional iterable of keys that should be used to
                          pre-split the internal data shards. This only makes
                          sense if you are going to write a lot of data and
                          you already know the key range of the data and
                          roughly how many entries fit into one shard. The
                          argument itself is never modified; a sorted copy is
                          used.
        :return: True, if the RecordIO didn't exist before.
        """
        self.db_search += 1
        existing = RecordIOShard.get_all_query(self.name,
                                               keys_only=True).get()
        if existing is not None:
            return False
        self.db_put += 1
        # Sort a copy: sorting in place would reorder the caller's list (and,
        # with a mutable default argument, state shared between calls).
        boundaries = [None] + [(x, ) for x in sorted(pre_split or [])] + [None]
        # Consecutive boundaries form the (lo, hi) range of one shard; None
        # marks an open end.
        for lo, hi in zip(boundaries[:-1], boundaries[1:]):
            # Only the shard without a lower bound gets index=True.
            index = True if lo is None else None
            RecordIOShard.get_or_insert(
                RecordIOShard.key_name(self.name, lo=lo, hi=hi),
                compressed=compressed,
                index=index)
        return True
示例#4
0
 def testKeyName(self):
   # The '|' of the RecordIO name appears percent-encoded in the key name,
   # while name() of a created shard returns the original string again.
   expected_key_name = ("te%7Cst!0!647c64!0000000000!0000000001!0000000000"
                        "!627c62!0000000000!0000000001!0000000000")
   actual_key_name = RecordIOShard.key_name("te|st", ("b|b",), ("d|d",))
   self.assertEqual(expected_key_name, actual_key_name)
   shard = RecordIOShard.create("te|st", ("b|b", 0, 1, 0), ("d|d",))
   self.assertEqual("te|st", shard.name())
   # The one-element hi key is expected back from lo_hi() as a four-element
   # tuple.
   expected_bounds = (("b|b", 0, 1, 0), ("d|d", 0, 1, 0))
   self.assertEqual(expected_bounds, shard.lo_hi())
示例#5
0
 def testShardNamesForKeysEmpty(self):
   # With one shard created without bounds, the empty key ("",) is expected to
   # be assigned to that shard.
   shard = RecordIOShard.create("test")
   for entry in (("0", "a"), ("1", "b"), ("2", "c")):
     shard.insert(entry)
   shard.commit()
   expected = {RecordIOShard.key_name("test"): [("",)]}
   lookup = RecordIOShard.get_shards_for_key_values("test", [("",)])
   self.assertEqual(expected, self.getResult(lookup))
示例#6
0
 def testCommitToQueue(self):
   # Inserts MAX_TASKQUEUE_BATCH_SIZE / MAX_ENTRY_SIZE + 1 entries built with
   # size MAX_ENTRY_SIZE - 1 and expects them to arrive, in key order, spread
   # over exactly two pull-queue tasks tagged with the shard name.
   writer = RecordIOWriter("test")
   writer.create()
   value_size = MAX_ENTRY_SIZE - 1
   entry_count = MAX_TASKQUEUE_BATCH_SIZE / MAX_ENTRY_SIZE + 1
   keys = [str("%09d" % i) for i in xrange(entry_count)]
   for key in keys:
     writer.insert(key, test_helper.uncompressableString(value_size))
   # commit_to_queue_() is consumed completely so that all of its work runs.
   list(writer.commit_to_queue_())
   queue = taskqueue.Queue('recordio-queue')
   leased = list(queue.lease_tasks(60, 100))
   self.assertEqual(len(leased), 2)
   for task in leased:
     self.assertEqual(task.tag, RecordIOShard.key_name("test"))
   payloads = [marshal.loads(task.payload) for task in leased]
   queued_keys = [update[0] for payload in payloads for update in payload]
   self.assertEqual(keys, queued_keys)
   first_value = payloads[0][0][1]
   self.assertTrue(first_value ==
                   STRING + test_helper.uncompressableString(value_size))
示例#7
0
 def testCommitToQueueAndScheduleWrite(self):
   # After commit_async() exactly one task is expected in the
   # "recordio-writer" queue, targeting /recordio/write with the URL-quoted
   # shard name as its "taskqueue" parameter.
   writer = RecordIOWriter("test")
   writer.create()
   writer.insert("a", "")
   writer.commit_async()
   taskqueue_stub = self.testbed.get_stub(testbed.TASKQUEUE_SERVICE_NAME)

   scheduled = taskqueue_stub.GetTasks("recordio-writer")
   self.assertEqual(len(scheduled), 1)
   write_task = scheduled[0]
   self.assertEqual(write_task["url"], "/recordio/write")
   decoded_body = base64.b64decode(write_task["body"])
   expected_body = "taskqueue=" + urllib.quote(RecordIOShard.key_name("test"))
   self.assertEqual(decoded_body, expected_body)
示例#8
0
 def testCommitToQueueSplitEntries(self):
   # A value built with size MAX_ENTRY_SIZE + 1 is expected to be queued as
   # two pieces, ('test', 0, 2) and ('test', 1, 2), whose last elements
   # concatenate to STRING + the original value.
   oversized_value = test_helper.uncompressableString(MAX_ENTRY_SIZE + 1)
   writer = RecordIOWriter("test")
   writer.create()
   writer.insert("test", oversized_value)
   # commit_to_queue_() is consumed completely so that all of its work runs.
   list(writer.commit_to_queue_())
   queue = taskqueue.Queue('recordio-queue')
   leased = list(queue.lease_tasks(60, 100))
   self.assertEqual(len(leased), 1)
   only_task = leased[0]
   self.assertEqual(only_task.tag, RecordIOShard.key_name("test"))
   pieces = marshal.loads(only_task.payload)
   piece_headers = [piece[:-2] for piece in pieces]
   self.assertEqual([('test', 0, 2), ('test', 1, 2)], piece_headers)
   reassembled = "".join([piece[-1] for piece in pieces])
   self.assertEqual(STRING + oversized_value, reassembled)
示例#9
0
    def read_entries_(self, start_key=None, end_key=None):
        """Internal helper that walks the shards of a key range and yields
        their (possibly split) entries.

        :param start_key: An entry tuple (no value needed). None starts
                          reading at the key ("", ).
        :param end_key: An entry tuple (no value needed) Exclusive.
        :return: Yields key, split_values
        :raises RecordIOShardDoesNotExistError: If no shard is found for the
                                                current read position.
        """
        # TODO (andrin): fetch a couple of shards instead of just one based on
        #                method argument
        position = ("", ) if start_key is None else start_key
        # Key name of the requested range, split into its fields.
        range_fields = RecordIOShard.key_name(
            self.name, lo=start_key, hi=end_key).split(SPLIT_CHAR)
        while True:
            lookup = RecordIOShard.get_shards_for_key_values(
                self.name, [position], keys_only=False)
            shard = lookup.next()[0]
            self.db_search_and_get += 1
            if shard is None:
                raise RecordIOShardDoesNotExistError(self.name)
            upper_bound = shard.lo_hi()[1]
            shard_fields = shard.key().name().split(SPLIT_CHAR)
            # Presumably fields 6-9 encode the lower and fields 2-4 the upper
            # bound of a key name -- TODO confirm against
            # RecordIOShard.key_name.
            # NOTE(review): the two slices differ in width (4 vs. 3 fields);
            # confirm this is intentional.
            shard_inside_range = (
                shard_fields[6:10] >= range_fields[6:10]
                and (shard_fields[2:5] < range_fields[2:5]
                     or range_fields[2] == SPLIT_CHAR_AFTER))
            if shard_inside_range:
                # Read the whole shard
                entries = shard
            else:
                # Read parts of the shard
                entries = shard.read(position, end_key)
            for entry in entries:
                yield entry
            if upper_bound is None:
                # Was the last shard
                return
            # Continue with the shard that starts where this one ended.
            position = upper_bound
            if end_key is None:
                continue
            if RecordIORecords.entry_comperator(position, end_key) >= 0:
                # Next shard is after end_key
                return
示例#10
0
  def read_entries_(self, start_key=None, end_key=None):
    """Internal helper that walks the shards of a key range and yields their
    (possibly split) entries.

    :param start_key: An entry tuple (no value needed). None starts reading
                      at the key ("", ).
    :param end_key: An entry tuple (no value needed) Exclusive.
    :return: Yields key, split_values
    :raises RecordIOShardDoesNotExistError: If no shard is found for the
                                            current read position.
    """
    # TODO (andrin): fetch a couple of shards instead of just one based on
    #                method argument
    position = ("", ) if start_key is None else start_key
    # Key name of the requested range, split into its fields.
    range_fields = RecordIOShard.key_name(
        self.name, lo=start_key, hi=end_key).split(SPLIT_CHAR)
    while True:
      lookup = RecordIOShard.get_shards_for_key_values(
          self.name, [position], keys_only=False)
      shard = lookup.next()[0]
      self.db_search_and_get += 1
      if shard is None:
        raise RecordIOShardDoesNotExistError(self.name)
      upper_bound = shard.lo_hi()[1]
      shard_fields = shard.key().name().split(SPLIT_CHAR)
      # Presumably fields 6-9 encode the lower and fields 2-4 the upper bound
      # of a key name -- TODO confirm against RecordIOShard.key_name.
      # NOTE(review): the two slices differ in width (4 vs. 3 fields); confirm
      # this is intentional.
      shard_inside_range = (
          shard_fields[6:10] >= range_fields[6:10] and
          (shard_fields[2:5] < range_fields[2:5] or
           range_fields[2] == SPLIT_CHAR_AFTER))
      if shard_inside_range:
        # Read the whole shard
        entries = shard
      else:
        # Read parts of the shard
        entries = shard.read(position, end_key)
      for entry in entries:
        yield entry
      if upper_bound is None:
        # Was the last shard
        return
      # Continue with the shard that starts where this one ended.
      position = upper_bound
      if end_key is None:
        continue
      if RecordIORecords.entry_comperator(position, end_key) >= 0:
        # Next shard is after end_key
        return
示例#11
0
 def testShardNamesForKeysSplit(self):
   # Phase 1: with one unsplit shard, every looked-up key is expected to map
   # to that shard.
   shard = RecordIOShard.create("test")
   digits = [str(x) for x in range(10)]
   for digit in digits:
     shard.insert((digit, test_helper.uncompressableString(2**16)))
   shard.commit()
   expected_unsplit = {RecordIOShard.key_name("test"): [("0", ""), ("1", "")]}
   unsplit_lookup = RecordIOShard.get_shards_for_key_values(
       "test", [("0", ""), ("1", "")])
   self.assertEqual(expected_unsplit, self.getResult(unsplit_lookup))
   # Phase 2: replace the shard by the three shards produced by splitting it
   # twice and check how the ten keys are distributed over them.
   shard.delete()
   shard_0, remainder = shard.split()
   shard_1, shard_2 = remainder.split()
   for part in (shard_0, shard_1, shard_2):
     part.commit()
   expected_split = {
       shard_0.key().name(): [(d, d) for d in ('0', '1', '2', '3', '4')],
       shard_1.key().name(): [(d, d) for d in ('5', '6', '7')],
       shard_2.key().name(): [(d, d) for d in ('8', '9')],
   }
   split_lookup = RecordIOShard.get_shards_for_key_values(
       "test", zip(digits, digits))
   self.assertEqual(expected_split, self.getResult(split_lookup))
示例#12
0
    def commit_sync(self, retries=32, retry_timeout=1):
        """Applies all changes synchronously to the RecordIO.

        Each attempt looks up the shard of every pending update and commits
        the updates chunk by chunk. Entries whose shard could not be found
        are collected and become the pending updates of the next attempt.

        :param retries: How many times a commit_sync should be retried in
                        case of datastore collisions.
        :param retry_timeout: The amount of seconds to wait before the next
                              retry.
        :raises RecordIOWriterNotCompletedError: If entries are still pending
                                                 after the last attempt.
        """
        if not len(self.updates):
            return
        for attempt in range(retries + 1):
            # Entries that could not be written during this attempt.
            shard_does_not_exist = RecordIORecords()
            for shard_name, key_values in RecordIOShard.get_shards_for_key_values(
                    self.name, self.updates):
                self.db_search += 1
                if shard_name is None and key_values:
                    logging.debug(
                        "RecordIO %s: No shard found for:\n%s -> %s",
                        self.name,
                        SPLIT_CHAR.join(RecordIOShard.entry_key(
                            key_values[0])), key_values[0][:-1])
                    for entry in key_values:
                        shard_does_not_exist.insert(entry)
                    continue
                # Presumably the (lo, hi) ranges of the two shards created if
                # the previous chunk caused a split -- TODO confirm against
                # commit_shard_.
                lo_just_split = None
                hi_just_split = None
                for key_values_chunk in get_chunks(key_values,
                                                   MAX_WRITE_BATCH_SIZE):
                    if lo_just_split and hi_just_split and key_values_chunk:
                        # Redirect this chunk to whichever of the two ranges
                        # contains its first entry.
                        if RecordIORecords.in_range(key_values_chunk[0],
                                                    lo=lo_just_split[0],
                                                    hi=lo_just_split[1]):
                            shard_name = RecordIOShard.key_name(
                                self.name,
                                lo=lo_just_split[0],
                                hi=lo_just_split[1])
                        elif RecordIORecords.in_range(key_values_chunk[0],
                                                      lo=hi_just_split[0],
                                                      hi=hi_just_split[1]):
                            shard_name = RecordIOShard.key_name(
                                self.name,
                                lo=hi_just_split[0],
                                hi=hi_just_split[1])
                    not_deleted = None
                    try:
                        not_deleted, lo_just_split, hi_just_split = (
                            self.commit_shard_(shard_name, key_values_chunk))
                    except RecordIOShardDoesNotExistError:
                        logging.debug("Shard does not exist:\n%s", shard_name)
                        lo_just_split = None
                        hi_just_split = None
                        for entry in key_values_chunk:
                            shard_does_not_exist.insert(entry)
                    if not not_deleted:
                        continue
                    # Entries returned as not_deleted are committed to the
                    # shards that a fresh lookup assigns them to.
                    for to_delete_shard_name, to_delete_key_values in (
                            RecordIOShard.get_shards_for_key_values(
                                self.name, not_deleted)):
                        self.db_search += 1
                        try:
                            self.commit_shard_(to_delete_shard_name,
                                               to_delete_key_values)
                        except RecordIOShardDoesNotExistError:
                            # Log the shard that was actually attempted here,
                            # not the shard of the enclosing chunk.
                            logging.debug("Shard does not exist:\n%s",
                                          to_delete_shard_name)
                            for entry in to_delete_key_values:
                                shard_does_not_exist.insert(entry)
            self.updates = shard_does_not_exist
            if not len(self.updates):
                return
            if attempt == retries:
                raise RecordIOWriterNotCompletedError(len(self.updates))
            logging.debug("Commit attempt %d failed", attempt)
            time.sleep(retry_timeout)
示例#13
0
  def commit_sync(self, retries=32, retry_timeout=1):
    """Applies all changes synchronously to the RecordIO.

    Each attempt looks up the shard of every pending update and commits the
    updates chunk by chunk. Entries whose shard could not be found are
    collected and become the pending updates of the next attempt.

    :param retries: How many times a commit_sync should be retried in case of
                    datastore collisions.
    :param retry_timeout: The amount of seconds to wait before the next retry.
    :raises RecordIOWriterNotCompletedError: If entries are still pending
                                             after the last attempt.
    """
    if not len(self.updates):
      return
    for attempt in range(retries + 1):
      # Entries that could not be written during this attempt.
      shard_does_not_exist = RecordIORecords()
      for shard_name, key_values in RecordIOShard.get_shards_for_key_values(
          self.name, self.updates):
        self.db_search += 1
        if shard_name is None and key_values:
          logging.debug(
              "RecordIO %s: No shard found for:\n%s -> %s",
              self.name,
              SPLIT_CHAR.join(RecordIOShard.entry_key(key_values[0])),
              key_values[0][:-1])
          for entry in key_values:
            shard_does_not_exist.insert(entry)
          continue
        # Presumably the (lo, hi) ranges of the two shards created if the
        # previous chunk caused a split -- TODO confirm against commit_shard_.
        lo_just_split = None
        hi_just_split = None
        for key_values_chunk in get_chunks(key_values, MAX_WRITE_BATCH_SIZE):
          if lo_just_split and hi_just_split and key_values_chunk:
            # Redirect this chunk to whichever of the two ranges contains its
            # first entry.
            if RecordIORecords.in_range(key_values_chunk[0],
                                        lo=lo_just_split[0],
                                        hi=lo_just_split[1]):
              shard_name = RecordIOShard.key_name(self.name,
                                                  lo=lo_just_split[0],
                                                  hi=lo_just_split[1])
            elif RecordIORecords.in_range(key_values_chunk[0],
                                          lo=hi_just_split[0],
                                          hi=hi_just_split[1]):
              shard_name = RecordIOShard.key_name(self.name,
                                                  lo=hi_just_split[0],
                                                  hi=hi_just_split[1])
          not_deleted = None
          try:
            not_deleted, lo_just_split, hi_just_split = self.commit_shard_(
                shard_name, key_values_chunk)
          except RecordIOShardDoesNotExistError:
            logging.debug("Shard does not exist:\n%s", shard_name)
            lo_just_split = None
            hi_just_split = None
            for entry in key_values_chunk:
              shard_does_not_exist.insert(entry)
          if not not_deleted:
            continue
          # Entries returned as not_deleted are committed to the shards that a
          # fresh lookup assigns them to.
          for to_delete_shard_name, to_delete_key_values in (
              RecordIOShard.get_shards_for_key_values(self.name, not_deleted)):
            self.db_search += 1
            try:
              self.commit_shard_(to_delete_shard_name, to_delete_key_values)
            except RecordIOShardDoesNotExistError:
              # Log the shard that was actually attempted here, not the shard
              # of the enclosing chunk.
              logging.debug("Shard does not exist:\n%s", to_delete_shard_name)
              for entry in to_delete_key_values:
                shard_does_not_exist.insert(entry)
      self.updates = shard_does_not_exist
      if not len(self.updates):
        return
      if attempt == retries:
        raise RecordIOWriterNotCompletedError(len(self.updates))
      logging.debug("Commit attempt %d failed", attempt)
      time.sleep(retry_timeout)