Example No. 1
 def testGetData(self):
     data = [("a", "aa"), ("b", "bb")]
     records = RecordIORecords()
     records.insert(data[0])
     records.insert(data[1])
     records = RecordIORecords(records.get_data())
     self.assertEqual(list(records), data)
Example No. 2
  def commit_to_queue_(self):
    """Adds all pending changes to the task queues for async commits

    :return: Yields all shard names that need to be updated.
    """
    pull = taskqueue.Queue('recordio-queue')
    rpcs = []
    key_values_not_added = RecordIORecords()
    for shard_name, key_values in RecordIOShard.get_shards_for_key_values(
          self.name, self.updates):
      self.db_search += 1
      if shard_name == None:
        for entry in key_values:
          key_values_not_added.insert(entry)
      else:
        for key_values_chunk in get_chunks(key_values, MAX_TASKQUEUE_BATCH_SIZE):
          payload = marshal.dumps(key_values_chunk, MARSHAL_VERSION)
          rpc = pull.add_async(taskqueue.Task(payload=payload, method='PULL',
                                              tag=shard_name))
          rpcs.append((rpc, key_values_chunk, shard_name))
    
    for rpc, key_values, shard_name in rpcs:
      try:
        rpc.get_result()
        yield shard_name
      except:
        for entry in key_values:
          key_values_not_added.insert(entry)
    self.updates = key_values_not_added
    if len(self.updates):
      raise RecordIOWriterNotCompletedError(len(self.updates))
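The chunks enqueued above end up as pull-queue tasks tagged with their
shard name. The worker side is not among these examples; what follows is a
minimal sketch of how such tasks might be consumed with the standard
taskqueue lease API (apply_queued_writes and the lease parameters are
illustrative, not part of the library):

import marshal

from google.appengine.api import taskqueue

def apply_queued_writes(shard_name):
  # Lease up to 100 pending chunks for one shard; commit_to_queue_ tagged
  # each task with the shard name it belongs to.
  queue = taskqueue.Queue('recordio-queue')
  tasks = queue.lease_tasks_by_tag(60, 100, tag=shard_name)
  entries = []
  for task in tasks:
    entries += marshal.loads(task.payload)
  # ... apply `entries` to the shard here, then drop the leased tasks ...
  queue.delete_tasks(tasks)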
Example No. 3
 def init(self):
     """Initializes internal values."""
     if not hasattr(self, "records_"):
         if self.compressed:
             self.records_ = RecordIORecordsZipped(self.data)
         else:
             self.records_ = RecordIORecords(self.data)
         self.loHi_ = RecordIOShard.lo_hi_from_key(self.key().name())
Example No. 4
  def get_shards_for_key_values(name, records, keys_only=True):
    """Given a list of entries, returns the shards where they belong to

    :param name: The name of the RecordIO
    :param records: A list of entry tuples.
    :param keys_only: If only the keys should be returned.
    :return: Yields (shard name or shard, entries) tuples; the shard part
             is None for entries that no existing shard covers.
    """
    gen = RecordIOShard.iterate_records_(records)
    entry = None
    while True:
      if entry == None:
        try:
          entry = gen.next()
        except StopIteration:
          return
      key_before_name = RecordIOShard.key_name(name, hi=entry)
      key_before_name = key_before_name.split(SPLIT_CHAR)
      key_before_name[6] = SPLIT_CHAR_AFTER
      key_before_name = SPLIT_CHAR.join(key_before_name)
      if entry[0] == "":
        key_before_name = (key_before_name.split(SPLIT_CHAR)[0] +
                           SPLIT_CHAR + "0" + SPLIT_CHAR)
      key_before = db.Key.from_path(
          "RecordIOShard",
          key_before_name)
      shard_obj = RecordIOShard.get_all_query(name, keys_only=keys_only).filter(
          "__key__ >", key_before).get()
      if shard_obj == None:
        yield None, [entry] + list(gen)
        return
      shard_key = None
      key_result = shard_obj
      if keys_only:
        shard_key = shard_obj.name()
        key_result = shard_key
      else:
        shard_key = shard_obj.key().name()
      lo, hi = RecordIOShard.lo_hi_from_key(shard_key)
      result = []
      try:
        while entry and not RecordIORecords.in_range(entry, lo, hi):
          result.append(entry)
          entry = gen.next()
      except StopIteration:
        entry = None
        
      if result:
        yield None, result
      result = []
      try:
        while entry and RecordIORecords.in_range(entry, lo, hi):
          result.append(entry)
          entry = gen.next()
      except StopIteration:
        entry = None
      if result:
        yield key_result, result
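For illustration, a hypothetical consumption loop: the generator yields
(shard, entries) pairs, and the shard part is None for entries that no
existing shard covers (the RecordIO name and entry tuples are made up):

for shard, entries in RecordIOShard.get_shards_for_key_values(
    "logs", [("a", "aa"), ("b", "bb"), ("x", "xx")]):
  if shard is None:
    print "no shard covers %d entries" % len(entries)
  else:
    print "%s -> %d entries" % (shard, len(entries))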
Example No. 5
 def testInsertSplitDataSmallToBig(self):
     records = RecordIORecords()
     self.insertABC(records)
     records.insert(("b", 0, 3, 3, "bb"))
     records.insert(("b", 1, 3, 3, "bb"))
     records.insert(("b", 2, 3, 3, "bb"))
     self.assertEqual(
         [("a", "aa"), ("b", 0, 3, 3, "bb"), ("b", 1, 3, 3, "bb"), ("b", 2, 3, 3, "bb"), ("c", "cc")], list(records)
     )
Example No. 6
 def testSplit(self):
     records = RecordIORecords()
     self.insertABC(records)
     records.insert(("d", "dd"))
     records.insert(("e", "ee"))
     lo, hi, middle = records.split()
     lo = RecordIORecords(lo)
     hi = RecordIORecords(hi)
     self.assertEqual(middle, ("d", "dd"))
     self.assertEqual([("a", "aa"), ("b", "bb"), ("c", "cc")], list(lo))
     self.assertEqual([("d", "dd"), ("e", "ee")], list(hi))
Example No. 7
 def testDelete(self):
     records = RecordIORecords()
     self.insertABC(records)
     self.assertTrue(records.insert(("b", )))
     records.insert(("b", "bb"))
     self.assertEqual([("a", "aa"), ("b", "bb"), ("c", "cc")],
                      list(records))
     self.assertTrue(records.insert(("b", )))
     self.assertFalse(records.insert(("d", )))
     records = RecordIORecords(records.get_data())
     self.assertEqual([("a", "aa"), ("c", "cc")], list(records))
Example No. 8
 def testInsertNotDeleted(self):
     records = RecordIORecords()
     records.insert(("a", "aa"))
     records.insert(("b", 0, 3, 3, "bb"))
     records.insert(("b", 0, 2, 2, "bb"))
     other = RecordIORecords(records.get_data())
     self.assertEqual([("a", "aa"), ("b", 0, 2, 2, "bb")], list(other))
     self.assertEqual([('b', 1, 3, 3), ('b', 2, 3, 3)],
                      list(records.not_deleted()))
Example No. 9
  def zip_chunk_comperator(a, b):
    """Compares two zipped chunks.

    :param a: zipped chunk tuple (See class definition).
    :param b: zipped chunk tuple (See class definition).
    :return: -1, 0 or 1 (like entry_comperator).
    """
    a_lo, a_hi = a[:2]
    b_lo, b_hi = b[:2]
    if RecordIORecords.entry_comperator(a_hi, b_lo) == -1:
      return -1
    elif RecordIORecords.entry_comperator(a_lo, b_hi) == 1:
      return 1
    return 0
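A small semantics sketch: a chunk tuple starts with its lowest and highest
entry keys, and the comperator orders chunks by key range, treating any
overlap as equality. The class name RecordIORecordsZipped is inferred from
the other examples, and the chunk payloads are placeholders:

a = (("a",), ("f",), "<zipped>")  # chunk covering keys "a".."f"
b = (("g",), ("k",), "<zipped>")  # chunk covering keys "g".."k"
assert RecordIORecordsZipped.zip_chunk_comperator(a, b) == -1  # a below b
assert RecordIORecordsZipped.zip_chunk_comperator(b, a) == 1   # b above a
assert RecordIORecordsZipped.zip_chunk_comperator(a, a) == 0   # overlap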
Example No. 10
    def __init__(self, name):
        """Creates a RecordIOWriter

    :param name: The name of the RecordIO. The urllib quoted name is not
                 allowed to be longer than 64 characters.
    """
        if len(urllib.quote(name)) > MAX_KEY_LENGTH:
            raise ValueError(
                "Max urllib.quote(name) length is %d: len('%s') is %d" %
                (MAX_KEY_LENGTH, name, len(urllib.quote(name))))
        self.name = name
        self.updates = RecordIORecords()
        self.pending_worker_tasks = []
        self.db_search = 0
        self.db_get = 0
        self.db_put = 0
Example No. 11
  def insert(self, entry):
    """Inserts an entry tuple into the RecordIORecords.

    :param entry:  An entry tuple
    """
    pos = bisect_left(self.zipped_chunks_, (entry, entry),
                      comperator=self.zip_chunk_comperator)
    if ((pos < len(self.zipped_chunks_) and
        self.zip_chunk_comperator(self.zipped_chunks_[pos], (entry, entry)) == 0)
        or self.is_entry_deleted(entry)):
      RecordIORecords.insert(self, entry)
    else:
      self.zipped_chunks_.insert(pos,
          (entry[:-1], entry[:-1],
           zlib.compress(marshal.dumps([entry], MARSHAL_VERSION),
                             COMPRESSION_LEVEL_MIN)))
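The chunk layout this method maintains can be reproduced with the standard
library alone. A sketch of a single chunk, inferred from the code above
(the literals 2 and 1 stand in for MARSHAL_VERSION and
COMPRESSION_LEVEL_MIN, whose actual values are not shown here):

import marshal
import zlib

entries = [("a", "aa"), ("b", "bb")]
# (lo_key, hi_key, payload): the payload is the zlib-compressed marshal
# dump of a sorted list of entry tuples.
chunk = (entries[0][:-1], entries[-1][:-1],
         zlib.compress(marshal.dumps(entries, 2), 1))
assert marshal.loads(zlib.decompress(chunk[2])) == entries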
Example No. 12
 def testInRange(self):
     self.assertTrue(RecordIORecords.in_range(("a",)))
     self.assertTrue(RecordIORecords.in_range(("a",), lo=("a",)))
     self.assertTrue(RecordIORecords.in_range(("a",), hi=("b",)))
     self.assertTrue(RecordIORecords.in_range(("b",), lo=("a",), hi=("c",)))
     self.assertTrue(RecordIORecords.in_range(("a",), lo=("a",), hi=("b",)))
     self.assertFalse(RecordIORecords.in_range(("a",), lo=("b",)))
     self.assertFalse(RecordIORecords.in_range(("b",), hi=("b",)))
Example No. 13
 def testInsertGetAndRead(self):
     records = RecordIORecords()
     self.insertABC(records)
     self.assertEqual(len(records), 3)
     self.assertEqual([("a", "aa"), ("b", "bb"), ("c", "cc")],
                      list(records))
     records.insert(("b", "new"))
     self.assertEqual(len(records), 3)
     self.assertEqual(records["b"], ("b", "new"))
     self.assertTrue("a" in records)
     self.assertFalse("z" in records)
     records.insert(("b", "bb"))
     self.assertEqual([("a", "aa"), ("b", "bb"), ("c", "cc")],
                      list(records.read()))
     self.assertEqual([("a", "aa"), ("b", "bb"), ("c", "cc")],
                      list(records.read(("", ), ("d", ))))
     self.assertEqual([("b", "bb")], list(records.read(("b", ), ("c", ))))
Example No. 14
    def read_entries_(self, start_key=None, end_key=None):
        """An internal helper function to read split entries.

    :param start_key: An entry tuple (no value needed)
    :param end_key: An entry tuple (no value needed). Exclusive.
    :return: Yields key, split_values
    """
        # TODO (andrin): fetch a couple of shards instead of just one based on
        #                method argument
        current_key = start_key
        if current_key == None:
            current_key = ("", )
        limit_shard_name = RecordIOShard.key_name(self.name,
                                                  lo=start_key,
                                                  hi=end_key).split(SPLIT_CHAR)
        while True:
            shard = RecordIOShard.get_shards_for_key_values(
                self.name, [current_key], keys_only=False).next()[0]
            self.db_search_and_get += 1
            if shard == None:
                raise RecordIOShardDoesNotExistError(self.name)
            hi = shard.lo_hi()[1]
            shard_name = shard.key().name().split(SPLIT_CHAR)
            if (shard_name[6:10] >= limit_shard_name[6:10]
                    and (shard_name[2:5] < limit_shard_name[2:5]
                         or limit_shard_name[2] == SPLIT_CHAR_AFTER)):
                # Read the whole shard
                for entry in shard:
                    yield entry
            else:
                # Read parts of the shard
                for entry in shard.read(current_key, end_key):
                    yield entry
            if hi == None:
                # Was the last shard
                return
            current_key = hi
            if (end_key != None and RecordIORecords.entry_comperator(
                    current_key, end_key) >= 0):
                # Next shard is after end_key
                return
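A hypothetical read loop, assuming read_entries_ belongs to a reader class
called RecordIOReader (that class is not among these examples; the name
and keys are illustrative):

reader = RecordIOReader("logs")
for entry in reader.read_entries_(start_key=("a",), end_key=("n",)):
  print entry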
Example No. 15
class RecordIOShard(db.Model):
  """Holds the actual data of a RecordIO as sharded datastore entries."""

  # The data from a RecordIORecords or RecordIORecordsZipped
  data = db.BlobProperty()
  # Determines if it's a RecordIORecords or RecordIORecordsZipped.
  compressed = db.BooleanProperty(default=True, indexed=False)
  # The first shard is the index shard. Used for getting a list of all
  # RecordIO names.
  index = db.BooleanProperty()

  @staticmethod
  def entry_key(key):
    """Returns a list of escaped strings representing a RecordIO entry tuple.

    :param key: An entry tuple (no value needed)
    :return: tuple of escaped strings.
    """
    if len(key) >= 4:
      str_key = [INTEGER_FMT % i for i in key[1:4]]
      return (binascii.hexlify(key[0]), ) + tuple(str_key)
    return (binascii.hexlify(key[0]),
            INTEGER_FMT_0, INTEGER_FMT_1, INTEGER_FMT_0)

  @staticmethod
  def key_name(name, lo=None, hi=None):
    """Returns the datastore key name for a shard.

    :param name: The name of the RecordIO
    :param lo: The lo entry tuple.
    :param hi: The hi entry tuple.
    :return: String
    """
    if lo == None:
      lo = ("", INTEGER_FMT_0, INTEGER_FMT_1, INTEGER_FMT_0)
    else:
      lo = RecordIOShard.entry_key(lo)
    if hi == None:
      hi = (SPLIT_CHAR_AFTER, INTEGER_FMT_0, INTEGER_FMT_1, INTEGER_FMT_0)
    else:
      hi = RecordIOShard.entry_key(hi)
    return SPLIT_CHAR.join((urllib.quote(name), "0") + hi + lo)

  @staticmethod
  def create(name, lo=None, hi=None, compressed=True):
    """Creates a new RecordIOShard object (in memory).

    :param name: The name of the RecordIO
    :param lo: The lo entry tuple.
    :param hi: The hi entry tuple.
    :param compressed: Whether this RecordIOs data is zipped or not.
    :return: RecordIOShard
    """
    shard = RecordIOShard(key_name=RecordIOShard.key_name(name, lo, hi))
    shard.compressed = compressed
    if lo == None:
      shard.index = True
    return shard
  
  @staticmethod
  def get_name(key_name):
    """Returns the name of the RecordIO.

    :param key_name: A datastore key name
    :return: String
    """
    return urllib.unquote(key_name.split(SPLIT_CHAR, 1)[0])
  
  def name(self):
    """Returns the name of the RecordIO this shard belongs to.

    :return: String
    """
    return self.get_name(self.key().name())
  
  def init(self):
    """Initializes internal values."""
    if not hasattr(self, "records_"):
      if self.compressed:
        self.records_ = RecordIORecordsZipped(self.data)
      else:
        self.records_ = RecordIORecords(self.data)
      self.loHi_ = RecordIOShard.lo_hi_from_key(self.key().name())
  
  def commit(self):
    """Writes the data to datastore."""
    self.init()
    self.data = self.records_.get_data(max_size=MAX_BLOB_SIZE)
    if len(self.data) >= MAX_BLOB_SIZE:
      raise RecordIOShardTooBigError()
    self.put()
    
  def not_deleted(self):
    """Entries that need to be deleted in another shard.

    :return: list of keys
    """
    return self.records_.not_deleted()
  
  def __len__(self):
    """The amount of records in this RecordIO. Expensive if compressed.

    :return: int
    """
    self.init()
    return len(self.records_)
  
  def __getitem__(self, key):
    """Returns an the value of an item.

    :param key: An entry tuple (no value needed)
    :return: Object
    """
    return self.records_[key][-1]
  
  def __iter__(self):
    """Yields all entry tuples.

    :return: Entry tuples
    """
    self.init()
    for entry in self.records_:
      yield entry
  
  def __contains__(self, x):
    """Checks whether an entry tuple key is part of this RecordIOShard.

    :param x:  An entry tuple (no value needed)
    :return: Boolean
    """
    try:
      self[x]
      return True
    except:
      return False
  
  def insert(self, entry):
    """Inserts an entry tuple into the RecordIOShard.

    :param entry: An entry tuple
    """
    self.init()
    assert(self.records_.in_range(entry, self.loHi_[0], self.loHi_[1]))
    self.records_.insert(entry)

  def read(self, start_key, end_key):
    """Reads through the records from start_key to end_key (exclusive)

    :param start_key: An entry tuple (no value needed)
    :param end_key: An entry tuple (no value needed)
    :return: Yields all entry tuples within the range.
    """
    self.init()
    for entry in self.records_.read(start_key, end_key):
      yield entry
      
  @staticmethod
  def iterate_records_(records):
    """Iterates over all records.

    :param records: A generator
    :return: A generator
    """
    for x in records:
      yield x

  @staticmethod
  def get_all_query(name, keys_only):
    """Returns a datastore query that returns all shards of a RecordIO.

    :param name: Name of the RecordIO
    :param keys_only: If this should be a keys only query
    :return: A datastore query.
    """
    key_before = db.Key.from_path("RecordIOShard",
                                  urllib.quote(name) +
                                  SPLIT_CHAR + "0" + SPLIT_CHAR)
    key_after = db.Key.from_path("RecordIOShard",
                                 urllib.quote(name) +
                                 SPLIT_CHAR + "0" + SPLIT_CHAR +
                                 SPLIT_CHAR_AFTER +
                                 SPLIT_CHAR_AFTER)
    return RecordIOShard.all(keys_only=keys_only
        ).filter("__key__ >=", key_before).filter("__key__ <", key_after)

  @staticmethod
  def get_shards_for_key_values(name, records, keys_only=True):
    """Given a list of entries, returns the shards where they belong to

    :param name: The name of the RecordIO
    :param records: A list of entry tuples.
    :param keys_only: If only the keys should be returned.
    :return: Yields (shard name or shard, entries) tuples; the shard part
             is None for entries that no existing shard covers.
    """
    gen = RecordIOShard.iterate_records_(records)
    entry = None
    while True:
      if entry == None:
        try:
          entry = gen.next()
        except StopIteration:
          return
      key_before_name = RecordIOShard.key_name(name, hi=entry)
      key_before_name = key_before_name.split(SPLIT_CHAR)
      key_before_name[6] = SPLIT_CHAR_AFTER
      key_before_name = SPLIT_CHAR.join(key_before_name)
      if entry[0] == "":
        key_before_name = (key_before_name.split(SPLIT_CHAR)[0] +
                           SPLIT_CHAR + "0" + SPLIT_CHAR)
      key_before = db.Key.from_path(
          "RecordIOShard",
          key_before_name)
      shard_obj = RecordIOShard.get_all_query(name, keys_only=keys_only).filter(
          "__key__ >", key_before).get()
      if shard_obj == None:
        yield None, [entry] + list(gen)
        return
      shard_key = None
      key_result = shard_obj
      if keys_only:
        shard_key = shard_obj.name()
        key_result = shard_key
      else:
        shard_key = shard_obj.key().name()
      lo, hi = RecordIOShard.lo_hi_from_key(shard_key)
      result = []
      try:
        while entry and not RecordIORecords.in_range(entry, lo, hi):
          result.append(entry)
          entry = gen.next()
      except StopIteration:
        entry = None
        
      if result:
        yield None, result
      result = []
      try:
        while entry and RecordIORecords.in_range(entry, lo, hi):
          result.append(entry)
          entry = gen.next()
      except StopIteration:
        entry = None
      if result:
        yield key_result, result

  def split(self):
    """Splits a RecordIOShard into two smaller shards.

    :return: lo_shard, hi_shard
    """
    self.init()
    name = self.name()
    original_lo, original_hi = self.lo_hi()
    lo_data, hi_data, middle = self.records_.split()
    middle_key = middle[0:4]
    if len(middle_key) == 2:
      middle_key = middle[0:1]
    lo_shard = RecordIOShard.create(name, original_lo, middle_key)
    lo_shard.data = lo_data
    lo_shard.compressed = self.compressed
    hi_shard = RecordIOShard.create(name, middle_key, original_hi)
    hi_shard.data = hi_data
    hi_shard.compressed = self.compressed
    return lo_shard, hi_shard
  
  @staticmethod
  def lo_hi_from_key(key_name):
    """Given a datastore keyname, returns the lo, hi entry tuples.

    :param key_name: String
    :return: (lo, hi) entry tuples
    """
    lo = key_name.split(SPLIT_CHAR)[6:10]
    if lo[0]:
      lo = [binascii.unhexlify(lo[0])] + [int(x) for x in lo[1:]]
      lo = tuple(lo)
    else:
      lo = None
    hi = key_name.split(SPLIT_CHAR)[2:6]
    if hi[0] != SPLIT_CHAR_AFTER:
      hi = [binascii.unhexlify(hi[0])] + [int(x) for x in hi[1:]]
      hi = tuple(hi)
    else:
      hi = None
    return lo, hi
  
  def lo_hi(self):
    """Returns the lo, hi entry tuples of a shard.

    :return: (lo, hi) entry tuples
    """
    self.init()
    return self.loHi_
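A minimal in-memory sketch of the class above, assuming the
appengine-recordio module and the App Engine SDK are importable and that a
freshly created shard initializes with empty records ("logs" is a made-up
name; commit() additionally needs a datastore context):

shard = RecordIOShard.create("logs", compressed=False)
shard.insert(("a", "aa"))
shard.insert(("b", "bb"))
print list(shard)    # [('a', 'aa'), ('b', 'bb')]
print shard.lo_hi()  # (None, None): this shard spans the whole key range
shard.commit()       # serializes the records and puts the entity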
Example No. 16
 def testComperator(self):
     self.assertEqual(RecordIORecords.entry_comperator(("a",), ("b",)), -1)
     self.assertEqual(RecordIORecords.entry_comperator(("b",), ("b",)), 0)
     self.assertEqual(RecordIORecords.entry_comperator(("c",), ("b",)), 1)
     self.assertEqual(RecordIORecords.entry_comperator(("b", "bb"), ("b",)), 0)
     self.assertEqual(RecordIORecords.entry_comperator(("b", 0, 1, 1, "bb"), ("b",)), 0)
     self.assertEqual(RecordIORecords.entry_comperator(("b",), ("b", 0, 1, 1, "bb")), 0)
     self.assertEqual(RecordIORecords.entry_comperator(("b", 0, 1, 1, "bb"), ("b", "bb")), 0)
     self.assertEqual(RecordIORecords.entry_comperator(("b", "bb"), ("b", 0, 1, 1, "bb")), 0)
     self.assertEqual(RecordIORecords.entry_comperator(("b", 0, 1, 1, "bb"), ("b", 0, 1, 2, "bb")), 0)
     self.assertEqual(RecordIORecords.entry_comperator(("b", 1, 2, 1, "bb"), ("b", 1, 2, 2, "bb")), -1)
     self.assertEqual(RecordIORecords.entry_comperator(("b", 1, 2, 1, "bb"), ("b", 1, 2, 1, "bb")), 0)
     self.assertEqual(RecordIORecords.entry_comperator(("b", 1, 3, 1, "bb"), ("b", 1, 2, 1, "bb")), 1)
Example No. 17
 def testInsertSplitDataBigToSmall(self):
     records = RecordIORecords()
     records.insert(("a", "aa"))
     records.insert(("b", 0, 3, 3, "bb"))
     records.insert(("b", 1, 3, 3, "bb"))
     records.insert(("b", 2, 3, 3, "bb"))
     records.insert(("c", "cc"))
     records.insert(("b", "bb"))
     records = RecordIORecords(records.get_data())
     self.assertEqual([("a", "aa"), ("b", "bb"), ("c", "cc")], list(records))
Example No. 18
  def commit_sync(self, retries=32, retry_timeout=1):
    """Applies all changes synchronously to the RecordIO.

    :param retries: How many times a commit_sync should be retried in case of
                    datastore collisions.
    :param retry_timeout: The number of seconds to wait before the next retry.
    """
    if not len(self.updates):
      return
    for attempt in range(retries + 1):
      shard_does_not_exist = RecordIORecords()
      for shard_name, key_values in RecordIOShard.get_shards_for_key_values(
          self.name, self.updates):
        self.db_search += 1
        if shard_name == None and key_values:
          logging.debug("RecordIO %s: No shard found for:\n%s -> %s" %
              (self.name, 
               SPLIT_CHAR.join(RecordIOShard.entry_key(key_values[0])),
               key_values[0][:-1]))
          for entry in key_values:
            shard_does_not_exist.insert(entry)
        else:
          lo_just_split = None
          hi_just_split = None
          for key_values_chunk in get_chunks(key_values, MAX_WRITE_BATCH_SIZE):
            if lo_just_split and hi_just_split and key_values_chunk:
              if RecordIORecords.in_range(key_values_chunk[0],
                                          lo=lo_just_split[0],
                                          hi=lo_just_split[1]):
                shard_name = RecordIOShard.key_name(self.name,
                                                   lo=lo_just_split[0],
                                                   hi=lo_just_split[1])
              elif RecordIORecords.in_range(key_values_chunk[0],
                                            lo=hi_just_split[0],
                                            hi=hi_just_split[1]):
                shard_name = RecordIOShard.key_name(self.name,
                                                    lo=hi_just_split[0],
                                                    hi=hi_just_split[1])
            not_deleted = None
            try:
              not_deleted, lo_just_split, hi_just_split = self.commit_shard_(
                  shard_name, key_values_chunk)
            except RecordIOShardDoesNotExistError:
              logging.debug("Shard does not exist:\n" + shard_name)
              lo_just_split = None
              hi_just_split = None
              for entry in key_values_chunk:
                shard_does_not_exist.insert(entry)
            if not_deleted:
              for to_delete_shard_name, to_delete_key_values in (
                   RecordIOShard.get_shards_for_key_values(
                       self.name, not_deleted)):
                self.db_search += 1
                try:
                  self.commit_shard_(to_delete_shard_name, to_delete_key_values)
                except RecordIOShardDoesNotExistError:
                  logging.debug("Shard does not exist:\n" + shard_name)
                  for entry in to_delete_key_values:
                    shard_does_not_exist.insert(entry)
      self.updates = shard_does_not_exist
      if len(self.updates):
        if attempt == retries:
          raise RecordIOWriterNotCompletedError(len(self.updates))
        else:
          logging.debug("Commit attempt %d failed" % attempt)
          time.sleep(retry_timeout)
      else:
        return
Example No. 19
class RecordIOWriter():
  """This class allows you to write data to a RecordIO."""
  def __init__(self, name):
    """Creates a RecordIOWriter

    :param name: The name of the RecordIO. The urllib quoted name is not
                 allowed to be longer than 64 characters.
    """
    if len(urllib.quote(name)) > MAX_KEY_LENGTH:
      raise ValueError("Max urllib.quote(name) length is %d: len('%s') is %d" %
                       (MAX_KEY_LENGTH, name, len(urllib.quote(name))))
    self.name = name
    self.updates = RecordIORecords()
    self.pending_worker_tasks = []
    self.db_search = 0
    self.db_get = 0
    self.db_put = 0
  
  def create(self, compressed=True, pre_split=[]):
    """Creates a RecordIO in datastore. If the RecordIO exists, nothing happens

    :param compressed: Boolean if the data in the RecordIO should be gzipped.
    :param pre_split: An optional list of keys that should be used to
                      pre-split the internal data shards. This only makes
                      sense if you are going to write a lot of data and you
                      already know the key range of the data and roughly how
                      many entries fit into one shard.
    :return: True, if the RecordIO didn't exist before.
    """
    self.db_search += 1
    if RecordIOShard.get_all_query(self.name, keys_only=True).get() == None:
      pre_split.sort()
      self.db_put += 1
      split = [None] + [(x,) for x in pre_split] + [None]
      split = [(split[i], split[i+1]) for i in xrange(len(split) - 1)]
      for lo, hi in split:
        index = None
        if lo == None:
          index = True
        RecordIOShard.get_or_insert(RecordIOShard.key_name(self.name,
                                                          lo=lo, hi=hi),
                                    compressed=compressed, index=index)
      return True
    return False
  
  def delete(self):
    """Deletes a RecordIO.

    Modifying RecordIOs or applying queued writes may result in errors during
    deletions.
    """
    db.delete(RecordIOShard.get_all_query(self.name, keys_only=True))
  
  def insert(self, key, value):
    """Assigns a value to a given key.

    Overwrites existing values with the same key.

    :param key: Must be a string and must not be longer than 64 characters.
    :param value:  Values can be of any type that is picklable (anything you
                   can put in memcache). Values can have arbitrary size
                   (There is no size limit like normal Datastore entries have).
    """
    if isinstance(key, unicode):
      try:
        key = str(key)
      except:
        pass
    if not isinstance(key, str):
      raise ValueError("Key must be <type 'str'> got: %s" % type(key))
    typed_value = None
    if isinstance(value, str):
      typed_value = recordio_entry_types.STRING + value
    elif type(value) in MARSHALABLE_TYPES:
      try:
        typed_value = recordio_entry_types.MARSHAL + marshal.dumps(
                          value, MARSHAL_VERSION)
      except:
        pass
    if typed_value == None:
      typed_value = recordio_entry_types.CPICKLE + cPickle.dumps(value)
    if len(key) > MAX_KEY_LENGTH:
      raise ValueError("Max key length is %d: %d" %
                       (MAX_KEY_LENGTH, len(key)))
    if len(typed_value) > MAX_ENTRY_SIZE:
      entries = int(math.ceil(1.0 * len(typed_value) / MAX_ENTRY_SIZE))
      version = (hash(typed_value) + hash(str(time.time()))) % INTEGER_MAX
      for i in xrange(entries):
        self.insert_entry_((key, i, entries, version,
                           typed_value[i * MAX_ENTRY_SIZE:
                                       (i+1) * MAX_ENTRY_SIZE]))
    else:
      self.insert_entry_((key, typed_value))
  
  def remove(self, key):
    """Removes a value from the RecordIO

    :param key: A key of a previously inserted value. If this key does not
                exist, no exception is thrown.
    """
    self.updates.insert((key, ))
  
  def commit_sync(self, retries=32, retry_timeout=1):
    """Applies all changes synchronously to the RecordIO.

    :param retries: How many times a commit_sync should be retried in case of
                    datastore collisions.
    :param retry_timeout: The number of seconds to wait before the next retry.
    """
    if not len(self.updates):
      return
    for attempt in range(retries + 1):
      shard_does_not_exist = RecordIORecords()
      for shard_name, key_values in RecordIOShard.get_shards_for_key_values(
          self.name, self.updates):
        self.db_search += 1
        if shard_name == None and key_values:
          logging.debug("RecordIO %s: No shard found for:\n%s -> %s" %
              (self.name, 
               SPLIT_CHAR.join(RecordIOShard.entry_key(key_values[0])),
               key_values[0][:-1]))
          for entry in key_values:
            shard_does_not_exist.insert(entry)
        else:
          lo_just_split = None
          hi_just_split = None
          for key_values_chunk in get_chunks(key_values, MAX_WRITE_BATCH_SIZE):
            if lo_just_split and hi_just_split and key_values_chunk:
              if RecordIORecords.in_range(key_values_chunk[0],
                                          lo=lo_just_split[0],
                                          hi=lo_just_split[1]):
                shard_name = RecordIOShard.key_name(self.name,
                                                   lo=lo_just_split[0],
                                                   hi=lo_just_split[1])
              elif RecordIORecords.in_range(key_values_chunk[0],
                                            lo=hi_just_split[0],
                                            hi=hi_just_split[1]):
                shard_name = RecordIOShard.key_name(self.name,
                                                    lo=hi_just_split[0],
                                                    hi=hi_just_split[1])
            not_deleted = None
            try:
              not_deleted, lo_just_split, hi_just_split = self.commit_shard_(
                  shard_name, key_values_chunk)
            except RecordIOShardDoesNotExistError:
              logging.debug("Shard does not exist:\n" + shard_name)
              lo_just_split = None
              hi_just_split = None
              for entry in key_values_chunk:
                shard_does_not_exist.insert(entry)
            if not_deleted:
              for to_delete_shard_name, to_delete_key_values in (
                   RecordIOShard.get_shards_for_key_values(
                       self.name, not_deleted)):
                self.db_search += 1
                try:
                  self.commit_shard_(to_delete_shard_name, to_delete_key_values)
                except RecordIOShardDoesNotExistError:
                  logging.debug("Shard does not exist:\n" + shard_name)
                  for entry in to_delete_key_values:
                    shard_does_not_exist.insert(entry)
      self.updates = shard_does_not_exist
      if len(self.updates):
        if attempt == retries:
          raise RecordIOWriterNotCompletedError(len(self.updates))
        else:
          logging.debug("Commit attempt %d failed" % attempt)
          time.sleep(retry_timeout)
      else:
        return
  
  def commit_async(self, write_every_n_seconds=300):
    """Applies the changes asynchronously to the RecordIO.

    Automatically batches other pending writes to the same RecordIO (Cheaper
    and more efficient than synchronous commits).

    :param write_every_n_seconds: Applies the changes after this amount of
                                  seconds to the RecordIO.
    """
    seen = set([])
    raise_exception = False
    try:
      for tag in self.commit_to_queue_():
        if tag in seen:
          continue
        seen.add(tag)
        self.pending_worker_tasks.append(
            self.create_task_(tag, write_every_n_seconds))
    except RecordIOWriterNotCompletedError:
      raise_exception = True
    
    failed_add = []
    while self.pending_worker_tasks:
      batch = self.pending_worker_tasks[:100]
      self.pending_worker_tasks = self.pending_worker_tasks[100:]
      try:
        taskqueue.Queue('recordio-writer').add(batch)
      except (taskqueue.DuplicateTaskNameError, taskqueue.TombstonedTaskError,
              taskqueue.TaskAlreadyExistsError):
        pass
      except ValueError:
        failed_add += batch
    self.pending_worker_tasks = failed_add
  
    if raise_exception or self.pending_worker_tasks:
      raise RecordIOWriterNotCompletedError(len(self.updates))

  def db_stats(self):
    """Returns some datastore access statistics.

    :return: Dict
    """
    return { "search": self.db_search, "get": self.db_get, "put": self.db_put }

  def insert_entry_(self, entry):
    """Inserts a entry tuples to the internal queue.

    :param entry: An entry tuple.
    """
    self.updates.insert(entry)

  @staticmethod
  def create_task_(tag, write_every_n_seconds=300, in_past=False):
    """Creates the future taskqueue tasks to apply queued writes.

    :param tag: The shard to write.
    :param write_every_n_seconds: At what interval the shard should be updated.
    :param in_past: If the task should be scheduled in the past
    :return: taskqueue.Task
    """
    now = int(time.time())
    schedule = now - (now % write_every_n_seconds)
    schedule += hash(tag) % write_every_n_seconds
    if schedule < now and not in_past:
      schedule += write_every_n_seconds
    if schedule > now and in_past:
      schedule -= write_every_n_seconds
    task_name = "%d_%d_%d" % (hash(tag[:len(tag)/2]),
                              hash(tag[len(tag)/2:]),
                              schedule)
    
    params = {"taskqueue": tag}
    return taskqueue.Task(name=task_name,
                          url="/recordio/write",
                          params=params,
                          eta=datetime.datetime.fromtimestamp(
                              schedule + MAX_CLOCK_SKEW))

  def commit_to_queue_(self):
    """Adds all pending changes to the task queues for async commits

    :return: Yields all shard names that need to be updated.
    """
    pull = taskqueue.Queue('recordio-queue')
    rpcs = []
    key_values_not_added = RecordIORecords()
    for shard_name, key_values in RecordIOShard.get_shards_for_key_values(
          self.name, self.updates):
      self.db_search += 1
      if shard_name == None:
        for entry in key_values:
          key_values_not_added.insert(entry)
      else:
        for key_values_chunk in get_chunks(key_values, MAX_TASKQUEUE_BATCH_SIZE):
          payload = marshal.dumps(key_values_chunk, MARSHAL_VERSION)
          rpc = pull.add_async(taskqueue.Task(payload=payload, method='PULL',
                                              tag=shard_name))
          rpcs.append((rpc, key_values_chunk, shard_name))
    
    for rpc, key_values, shard_name in rpcs:
      try:
        rpc.get_result()
        yield shard_name
      except:
        for entry in key_values:
          key_values_not_added.insert(entry)
    self.updates = key_values_not_added
    if len(self.updates):
      raise RecordIOWriterNotCompletedError(len(self.updates))

  @db.transactional(xg=True)
  def commit_shard_(self, shard_name, key_values):
    """Adds key, values to a shard and splits it if necessary.

    :param shard_name: The key name of the RecordIOShard.
    :param key_values: A list of key values to be added
    :return: (keys that need to be deleted in other shards,
              lo_hi of the lo shard, lo_hi of the hi shard); the lo_hi
              values are None unless the shard was split.
    """
    shard = RecordIOShard.get_by_key_name(shard_name)
    self.db_get += 1
    if shard == None:
      raise RecordIOShardDoesNotExistError(shard_name)
    for entry in key_values:
      shard.insert(entry)
    try:
      shard.commit()
      self.db_put += 1
      return (shard.not_deleted(), None, None)
    except (RecordIOShardTooBigError,
            RequestTooLargeError, ValueError, ArgumentError, BadRequestError):
      shard.delete()
      lo_shard, hi_shard = shard.split()
      lo_shard.commit()
      hi_shard.commit()
      self.db_put += 2
      logging.debug("Split\n%s\n%s\n%s" % (shard.key().name(),
                                            lo_shard.key().name(),
                                            hi_shard.key().name()))
      shard_name = hi_shard.key().name()
      return shard.not_deleted(), (lo_shard.lo_hi()), (hi_shard.lo_hi())
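
A note on create_task_ above: the task name and ETA are derived deterministically from the tag and the current time interval, so repeated commits to the same shard within one interval produce the same task name and get de-duplicated by the task queue. A minimal standalone sketch of that scheduling math (next_slot is a hypothetical helper name, not part of the library):

import time

def next_slot(tag, write_every_n_seconds=300):
  # Round down to the start of the current interval ...
  now = int(time.time())
  schedule = now - (now % write_every_n_seconds)
  # ... then add a stable per-tag offset within that interval.
  schedule += hash(tag) % write_every_n_seconds
  if schedule < now:
    # This interval's slot already passed; schedule the next one.
    schedule += write_every_n_seconds
  return schedule
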
Exemplo n.º 35
0
 def testComperator(self):
     self.assertEqual(RecordIORecords.entry_comperator(("a", ), ("b", )),
                      -1)
     self.assertEqual(RecordIORecords.entry_comperator(("b", ), ("b", )), 0)
     self.assertEqual(RecordIORecords.entry_comperator(("c", ), ("b", )), 1)
     self.assertEqual(
         RecordIORecords.entry_comperator(("b", "bb"), ("b", )), 0)
     self.assertEqual(
         RecordIORecords.entry_comperator(("b", 0, 1, 1, "bb"), ("b", )), 0)
     self.assertEqual(
         RecordIORecords.entry_comperator(("b", ), ("b", 0, 1, 1, "bb")), 0)
     self.assertEqual(
         RecordIORecords.entry_comperator(("b", 0, 1, 1, "bb"),
                                          ("b", "bb")), 0)
     self.assertEqual(
         RecordIORecords.entry_comperator(("b", "bb"),
                                          ("b", 0, 1, 1, "bb")), 0)
     self.assertEqual(
         RecordIORecords.entry_comperator(("b", 0, 1, 1, "bb"),
                                          ("b", 0, 1, 2, "bb")), 0)
     self.assertEqual(
         RecordIORecords.entry_comperator(("b", 1, 2, 1, "bb"),
                                          ("b", 1, 2, 2, "bb")), -1)
     self.assertEqual(
         RecordIORecords.entry_comperator(("b", 1, 2, 1, "bb"),
                                          ("b", 1, 2, 1, "bb")), 0)
     self.assertEqual(
         RecordIORecords.entry_comperator(("b", 1, 3, 1, "bb"),
                                          ("b", 1, 2, 1, "bb")), 1)
Exemplo n.º 36
0
    def commit_sync(self, retries=32, retry_timeout=1):
        """Applies all changes synchronously to the RecordIO.

    :param retries: How many times a commit_sync should be retried in case of
                    datastore collisions.
    :param retry_timeout: The number of seconds to wait before the next retry.
    """
        if not len(self.updates):
            return
        for attempt in range(retries + 1):
            shard_does_not_exist = RecordIORecords()
            for shard_name, key_values in RecordIOShard.get_shards_for_key_values(
                    self.name, self.updates):
                self.db_search += 1
                if shard_name == None and key_values:
                    logging.debug(
                        "RecordIO %s: No shard found for:\n%s -> %s" %
                        (self.name,
                         SPLIT_CHAR.join(RecordIOShard.entry_key(
                             key_values[0])), key_values[0][:-1]))
                    for entry in key_values:
                        shard_does_not_exist.insert(entry)
                else:
                    lo_just_split = None
                    hi_just_split = None
                    for key_values_chunk in get_chunks(key_values,
                                                       MAX_WRITE_BATCH_SIZE):
                        if lo_just_split and hi_just_split and key_values_chunk:
                            if RecordIORecords.in_range(key_values_chunk[0],
                                                        lo=lo_just_split[0],
                                                        hi=lo_just_split[1]):
                                shard_name = RecordIOShard.key_name(
                                    self.name,
                                    lo=lo_just_split[0],
                                    hi=lo_just_split[1])
                            elif RecordIORecords.in_range(key_values_chunk[0],
                                                          lo=hi_just_split[0],
                                                          hi=hi_just_split[1]):
                                shard_name = RecordIOShard.key_name(
                                    self.name,
                                    lo=hi_just_split[0],
                                    hi=hi_just_split[1])
                        not_deleted = None
                        try:
                            not_deleted, lo_just_split, hi_just_split = self.commit_shard_(
                                shard_name, key_values_chunk)
                        except RecordIOShardDoesNotExistError:
                            logging.debug("Shard does not exist:\n" +
                                          shard_name)
                            lo_just_split = None
                            hi_just_split = None
                            for entry in key_values_chunk:
                                shard_does_not_exist.insert(entry)
                        if not_deleted:
                            for to_delete_shard_name, to_delete_key_values in (
                                    RecordIOShard.get_shards_for_key_values(
                                        self.name, not_deleted)):
                                self.db_search += 1
                                try:
                                    self.commit_shard_(to_delete_shard_name,
                                                       to_delete_key_values)
                                except RecordIOShardDoesNotExistError:
                                    logging.debug("Shard does not exist:\n" +
                                                  shard_name)
                                    for entry in to_delete_key_values:
                                        shard_does_not_exist.insert(entry)
            self.updates = shard_does_not_exist
            if len(self.updates):
                if attempt == retries:
                    raise RecordIOWriterNotCompletedError(len(self.updates))
                else:
                    logging.debug("Commit attempt %d failed" % attempt)
                    time.sleep(retry_timeout)
            else:
                return
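
commit_sync above relies on a get_chunks helper that is not shown in this listing; it is assumed to yield successive slices of at most MAX_WRITE_BATCH_SIZE entries from a plain list, roughly:

def get_chunks(items, chunk_size):
    # Yield successive slices of at most chunk_size items.
    for i in xrange(0, len(items), chunk_size):
        yield items[i:i + chunk_size]
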
Exemplo n.º 37
0
class RecordIOWriter():
    """This class allows you to write data to a RecordIO."""
    def __init__(self, name):
        """Creates a RecordIOWriter

    :param name: The name of the RecordIO. The urllib quoted name is not
                 allowed to be longer than 64 characters.
    """
        if len(urllib.quote(name)) > MAX_KEY_LENGTH:
            raise ValueError(
                "Max urllib.quote(name) length is %d: len('%s') is %d" %
                (MAX_KEY_LENGTH, name, len(urllib.quote(name))))
        self.name = name
        self.updates = RecordIORecords()
        self.pending_worker_tasks = []
        self.db_search = 0
        self.db_get = 0
        self.db_put = 0

    def create(self, compressed=True, pre_split=[]):
        """Creates a RecordIO in datastore. If the RecordIO exists, nothing happens

    :param compressed: Boolean if the data in the RecordIO should be gzipped.
    :param pre_split: An optional list of keys to that should be used to
                      pre-split the internal data shards. This is only makes
                      sense if you are going to write a lot of data and you
                      already know the key range of the data and roughly how
                      many entries fit into one shard.
    :return: True, if the RecordIO didn't exist before.
    """
        self.db_search += 1
        if RecordIOShard.get_all_query(self.name,
                                       keys_only=True).get() == None:
            pre_split = sorted(pre_split)  # don't mutate the caller's list (or the shared default)
            self.db_put += 1
            split = [None] + [(x, ) for x in pre_split] + [None]
            split = [(split[i], split[i + 1]) for i in xrange(len(split) - 1)]
            for lo, hi in split:
                index = None
                if lo == None:
                    index = True
                RecordIOShard.get_or_insert(RecordIOShard.key_name(self.name,
                                                                   lo=lo,
                                                                   hi=hi),
                                            compressed=compressed,
                                            index=index)
            return True
        return False

    def delete(self):
        """Deletes a RecordIO.

    Modifying the RecordIO or applying queued writes while it is being deleted
    may result in errors.
    """
        db.delete(RecordIOShard.get_all_query(self.name, keys_only=True))

    def insert(self, key, value):
        """Assigns a value to a given key.

    Overwrites existing values with the same key.

    :param key: Must be a string and must not be longer than 64 characters.
    :param value: Values can be of any type that is pickleable (anything you
                  can put in memcache). Values can have arbitrary size (there
                  is no size limit, unlike normal datastore entries).
    """
        if isinstance(key, unicode):
            try:
                key = str(key)
            except UnicodeError:  # non-ASCII keys stay unicode and fail the type check below
                pass
        if not isinstance(key, str):
            raise ValueError("Key must be <type 'str'> got: %s" % type(key))
        typed_value = None
        if isinstance(value, str):
            typed_value = recordio_entry_types.STRING + value
        elif type(value) in MARSHALABLE_TYPES:
            try:
                typed_value = recordio_entry_types.MARSHAL + marshal.dumps(
                    value, MARSHAL_VERSION)
            except Exception:  # not marshalable after all; fall back to cPickle below
                pass
        if typed_value == None:
            typed_value = recordio_entry_types.CPICKLE + cPickle.dumps(value)
        if len(key) > MAX_KEY_LENGTH:
            raise ValueError("Max key length is %d: %d" %
                             (MAX_KEY_LENGTH, len(key)))
        if len(typed_value) > MAX_ENTRY_SIZE:
            entries = int(math.ceil(1.0 * len(typed_value) / MAX_ENTRY_SIZE))
            version = (hash(typed_value) +
                       hash(str(time.time()))) % INTEGER_MAX
            for i in xrange(entries):
                self.insert_entry_(
                    (key, i, entries, version,
                     typed_value[i * MAX_ENTRY_SIZE:(i + 1) * MAX_ENTRY_SIZE]))
        else:
            self.insert_entry_((key, typed_value))

    def remove(self, key):
        """Removes a value from the RecordIO

    :param key: A key of a previously inserted value. If this key does not
                exist, no exception is thrown.
    """
        self.updates.insert((key, ))

    def commit_sync(self, retries=32, retry_timeout=1):
        """Applies all changes synchronously to the RecordIO.

    :param retries: How many times a commit_sync should be retried in case of
                    datastore collisions.
    :param retry_timeout: The number of seconds to wait before the next retry.
    """
        if not len(self.updates):
            return
        for attempt in range(retries + 1):
            shard_does_not_exist = RecordIORecords()
            for shard_name, key_values in RecordIOShard.get_shards_for_key_values(
                    self.name, self.updates):
                self.db_search += 1
                if shard_name == None and key_values:
                    logging.debug(
                        "RecordIO %s: No shard found for:\n%s -> %s" %
                        (self.name,
                         SPLIT_CHAR.join(RecordIOShard.entry_key(
                             key_values[0])), key_values[0][:-1]))
                    for entry in key_values:
                        shard_does_not_exist.insert(entry)
                else:
                    lo_just_split = None
                    hi_just_split = None
                    for key_values_chunk in get_chunks(key_values,
                                                       MAX_WRITE_BATCH_SIZE):
                        if lo_just_split and hi_just_split and key_values_chunk:
                            if RecordIORecords.in_range(key_values_chunk[0],
                                                        lo=lo_just_split[0],
                                                        hi=lo_just_split[1]):
                                shard_name = RecordIOShard.key_name(
                                    self.name,
                                    lo=lo_just_split[0],
                                    hi=lo_just_split[1])
                            elif RecordIORecords.in_range(key_values_chunk[0],
                                                          lo=hi_just_split[0],
                                                          hi=hi_just_split[1]):
                                shard_name = RecordIOShard.key_name(
                                    self.name,
                                    lo=hi_just_split[0],
                                    hi=hi_just_split[1])
                        not_deleted = None
                        try:
                            not_deleted, lo_just_split, hi_just_split = self.commit_shard_(
                                shard_name, key_values_chunk)
                        except RecordIOShardDoesNotExistError:
                            logging.debug("Shard does not exist:\n" +
                                          shard_name)
                            lo_just_split = None
                            hi_just_split = None
                            for entry in key_values_chunk:
                                shard_does_not_exist.insert(entry)
                        if not_deleted:
                            for to_delete_shard_name, to_delete_key_values in (
                                    RecordIOShard.get_shards_for_key_values(
                                        self.name, not_deleted)):
                                self.db_search += 1
                                try:
                                    self.commit_shard_(to_delete_shard_name,
                                                       to_delete_key_values)
                                except RecordIOShardDoesNotExistError:
                                    logging.debug("Shard does not exist:\n" +
                                                  shard_name)
                                    for entry in to_delete_key_values:
                                        shard_does_not_exist.insert(entry)
            self.updates = shard_does_not_exist
            if len(self.updates):
                if attempt == retries:
                    raise RecordIOWriterNotCompletedError(len(self.updates))
                else:
                    logging.debug("Commit attempt %d failed" % attempt)
                    time.sleep(retry_timeout)
            else:
                return

    def commit_async(self, write_every_n_seconds=300):
        """Applies the changes asynchronously to the RecordIO.

    Automatically batches other pending writes to the same RecordIO (cheaper
    and more efficient than synchronous commits).

    :param write_every_n_seconds: Applies the changes after this amount of
                                  seconds to the RecordIO.
    """
        seen = set([])
        raise_exception = False
        try:
            for tag in self.commit_to_queue_():
                if tag in seen:
                    continue
                seen.add(tag)
                self.pending_worker_tasks.append(
                    self.create_task_(tag, write_every_n_seconds))
        except RecordIOWriterNotCompletedError:
            raise_exception = True

        failed_add = []
        while self.pending_worker_tasks:
            batch = self.pending_worker_tasks[:100]
            self.pending_worker_tasks = self.pending_worker_tasks[100:]
            try:
                taskqueue.Queue('recordio-writer').add(batch)
            except (taskqueue.DuplicateTaskNameError,
                    taskqueue.TombstonedTaskError,
                    taskqueue.TaskAlreadyExistsError):
                pass
            except ValueError:
                failed_add += batch
        self.pending_worker_tasks = failed_add

        if raise_exception or self.pending_worker_tasks:
            raise RecordIOWriterNotCompletedError(len(self.updates))

    def db_stats(self):
        """Returns some datastore access statistics.

    :return: A dict with "search", "get" and "put" counts.
    """
        return {
            "search": self.db_search,
            "get": self.db_get,
            "put": self.db_put
        }

    def insert_entry_(self, entry):
        """Inserts a entry tuples to the internal queue.

    :param entry: An entry tuple.
    """
        self.updates.insert(entry)

    @staticmethod
    def create_task_(tag, write_every_n_seconds=300, in_past=False):
        """Creates the future taskqueue tasks to apply queued writes.

    :param tag: The shard to write.
    :param write_every_n_seconds: At what interval the shard should be updated.
    :param in_past: Whether the task should be scheduled in the past.
    :return: taskqueue.Task
    """
        now = int(time.time())
        schedule = now - (now % write_every_n_seconds)
        schedule += hash(tag) % write_every_n_seconds
        if schedule < now and not in_past:
            schedule += write_every_n_seconds
        if schedule > now and in_past:
            schedule -= write_every_n_seconds
        task_name = "%d_%d_%d" % (hash(
            tag[:len(tag) / 2]), hash(tag[len(tag) / 2:]), schedule)

        params = {"taskqueue": tag}
        return taskqueue.Task(
            name=task_name,
            url="/recordio/write",
            params=params,
            eta=datetime.datetime.fromtimestamp(schedule + MAX_CLOCK_SKEW))

    def commit_to_queue_(self):
        """Adds all pending changes to the task queues for async commits

    :return: Yields all shard names that need to be updated.
    """
        pull = taskqueue.Queue('recordio-queue')
        rpcs = []
        key_values_not_added = RecordIORecords()
        for shard_name, key_values in RecordIOShard.get_shards_for_key_values(
                self.name, self.updates):
            self.db_search += 1
            if shard_name == None:
                for entry in key_values:
                    key_values_not_added.insert(entry)
            else:
                for key_values_chunk in get_chunks(key_values,
                                                   MAX_TASKQUEUE_BATCH_SIZE):
                    payload = marshal.dumps(key_values_chunk, MARSHAL_VERSION)
                    rpc = pull.add_async(
                        taskqueue.Task(payload=payload,
                                       method='PULL',
                                       tag=shard_name))
                    rpcs.append((rpc, key_values_chunk, shard_name))

        for rpc, key_values, shard_name in rpcs:
            try:
                rpc.get_result()
                yield shard_name
            except Exception:  # task could not be added; keep its entries for retry
                for entry in key_values:
                    key_values_not_added.insert(entry)
        self.updates = key_values_not_added
        if len(self.updates):
            raise RecordIOWriterNotCompletedError(len(self.updates))

    @db.transactional(xg=True)
    def commit_shard_(self, shard_name, key_values):
        """Adds key, values to a shard and splits it if necessary.

    :param shard_name: The key name of the RecordIOShard.
    :param key_values: A list of key values to be added
    :return: list of keys that need to be deleted in other shards.
    """
        shard = RecordIOShard.get_by_key_name(shard_name)
        self.db_get += 1
        if shard == None:
            raise RecordIOShardDoesNotExistError(shard_name)
        for entry in key_values:
            shard.insert(entry)
        try:
            shard.commit()
            self.db_put += 1
            return (shard.not_deleted(), None, None)
        except (RecordIOShardTooBigError, RequestTooLargeError, ValueError,
                ArgumentError, BadRequestError):
            shard.delete()
            lo_shard, hi_shard = shard.split()
            lo_shard.commit()
            hi_shard.commit()
            self.db_put += 2
            logging.debug("Split\n%s\n%s\n%s" %
                          (shard.key().name(), lo_shard.key().name(),
                           hi_shard.key().name()))
            shard_name = hi_shard.key().name()
            return shard.not_deleted(), (lo_shard.lo_hi()), (hi_shard.lo_hi())
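
Putting the class together, a hypothetical end-to-end usage sketch (the name "mylog" and the sample values are made up for illustration):

writer = RecordIOWriter("mylog")
writer.create(compressed=True)               # no-op if it already exists
writer.insert("2012-01-01", {"visits": 42})  # any pickleable value
writer.remove("2011-12-31")                  # removals are queued updates too
writer.commit_sync()                         # apply now, with retries
# writer.commit_async()                      # or batch via the task queues
print writer.db_stats()
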
Exemplo n.º 38
0
 def testInsertSplitDataBiggerToBigToSmall(self):
     records = RecordIORecords()
     records.insert(("a", "aa"))
     records.insert(("b", 0, 3, 3, "bb"))
     records.insert(("b", 1, 3, 3, "bb"))
     records.insert(("b", 2, 3, 3, "bb"))
     records.insert(("c", "cc"))
     records.insert(("b", 0, 2, 2, "bb"))
     records.insert(("b", 1, 2, 2, "bb"))
     records.insert(("b", "bb"))
     records = RecordIORecords(records.get_data())
     self.assertEqual([("a", "aa"), ("b", "bb"), ("c", "cc")],
                      list(records))
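
The five-field tuples in this test are the format insert() produces for oversized values: (key, chunk index, chunk count, version, chunk data). A toy illustration of that splitting, assuming MAX_ENTRY_SIZE were 2 bytes (the real constant is far larger):

import math

key = "b"
typed_value = "bbbbb"  # pretend this exceeds MAX_ENTRY_SIZE
MAX_ENTRY_SIZE = 2     # toy value for illustration only
entries = int(math.ceil(1.0 * len(typed_value) / MAX_ENTRY_SIZE))
version = 7            # normally derived from hash(typed_value) and the time
for i in xrange(entries):
    chunk = typed_value[i * MAX_ENTRY_SIZE:(i + 1) * MAX_ENTRY_SIZE]
    print (key, i, entries, version, chunk)

This prints ("b", 0, 3, 7, "bb"), ("b", 1, 3, 7, "bb"), ("b", 2, 3, 7, "b"), matching the chunk pattern the test inserts and later supersedes with a plain ("b", "bb") entry.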