Example #1
    def __iter__(self):
        """Iterate over records in input files.

    self._offsets is always correctly updated so that stopping iterations
    doesn't skip records and doesn't read the same record twice.
    """
        ctx = context.get()
        mapper_spec = ctx.mapreduce_spec.mapper
        shard_number = ctx.shard_state.shard_number
        filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

        if len(filenames) != len(self._offsets):
            raise Exception("Files list and offsets do not match.")

        readers = []

        for (i, filename) in enumerate(filenames):
            offset = self._offsets[i]
            reader = records.RecordsReader(files.BufferedFile(filename))
            reader.seek(offset)
            readers.append((None, None, i, reader))

        current_result = None
        while readers:
            (key, value, index, reader) = readers[0]

            if key is not None:
                if current_result and key != current_result[0]:
                    yield current_result
                if not current_result or key != current_result[0]:
                    current_result = (key, [])
                current_result[1].append(value)

            try:
                self._offsets[index] = reader.tell()
                start_time = time.time()
                binary_record = reader.read()

                if context.get():
                    operation.counters.Increment(
                        input_readers.COUNTER_IO_READ_BYTES,
                        len(binary_record))(context.get())
                    operation.counters.Increment(
                        input_readers.COUNTER_IO_READ_MSEC,
                        int((time.time() - start_time) * 1000))(context.get())
                proto = file_service_pb.KeyValue()
                proto.ParseFromString(binary_record)

                heapq.heapreplace(readers,
                                  (proto.key(), proto.value(), index, reader))
            except EOFError:
                heapq.heappop(readers)

        if current_result:
            yield current_result
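The reader above is a k-way merge: every tuple in readers is (key, value, index, reader), and heapq.heapreplace keeps the entry with the smallest key at readers[0], so equal keys coming from different shard files are grouped into one (key, [values]) result. A minimal, self-contained sketch of the same merge idea over plain in-memory lists (merge_sorted_runs and the sample data are made up for illustration and are not part of the MapReduce library):

import heapq

def merge_sorted_runs(sorted_runs):
    """Yield (key, [values]) groups from several key-sorted runs of (key, value) pairs."""
    heap = []
    iters = [iter(run) for run in sorted_runs]
    for index, it in enumerate(iters):
        first = next(it, None)
        if first is not None:
            heapq.heappush(heap, (first[0], first[1], index))

    current_key, current_values = None, []
    while heap:
        key, value, index = heapq.heappop(heap)
        if current_key is not None and key != current_key:
            # A new key surfaced: the previous group is complete.
            yield current_key, current_values
            current_values = []
        current_key = key
        current_values.append(value)
        nxt = next(iters[index], None)
        if nxt is not None:
            heapq.heappush(heap, (nxt[0], nxt[1], index))
    if current_key is not None:
        yield current_key, current_values

# Two key-sorted runs, like the per-shard files the shuffler produces.
runs = [[("a", 1), ("b", 2)], [("a", 3), ("c", 4)]]
print(list(merge_sorted_runs(runs)))  # [('a', [1, 3]), ('b', [2]), ('c', [4])]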
Example #2
  def testProcessNamespace(self):
    """Test ProcessNamespace function."""
    namespace_manager.set_namespace("1")
    TestEntity().put()
    namespace_manager.set_namespace(None)

    namespaces_jobs = utils.RunMapForKinds(
        self.operation,
        [TestEntity.kind()],
        'Test job for %(kind)s%(namespace)s',
        '__main__.foo',
        self.reader_class_spec,
        {'test_param': 1})
    testutil.execute_all_tasks(self.taskqueue)

    m = mox.Mox()
    m.StubOutWithMock(context, "get", use_mock_anything=True)

    ctx = context.Context(
        model.MapreduceState.get_by_job_id(namespaces_jobs[0]).mapreduce_spec,
        None)
    context.get().AndReturn(ctx)
    context.get().AndReturn(ctx)

    m.ReplayAll()
    try:
      jobs = utils.ProcessNamespace('1')
      jobs.extend(utils.ProcessNamespace('1'))
      m.VerifyAll()
    finally:
      m.UnsetStubs()
    testutil.execute_all_tasks(self.taskqueue)

    self.assertEquals(1, len(jobs))
    job = jobs[0]
    state = model.MapreduceState.get_by_job_id(job)
    self.assertTrue(state)

    spec = state.mapreduce_spec
    self.assertTrue(spec)
    self.assertEquals("Test job for TestEntity in namespace 1", spec.name)
    mapper = spec.mapper
    self.assertTrue(mapper)
    self.assertEquals({'test_param': 1,
                       'entity_kind': TestEntity.kind(),
                       'namespaces': '1'},
                      mapper.params)
    self.assertEquals('__main__.foo', mapper.handler_spec)
    self.assertEquals(self.reader_class_spec, mapper.input_reader_spec)
Example #3
def process(comment):
  ctx = context.get()
  params = ctx.mapreduce_spec.mapper.params
  program_key = params['program_key']

  program = GCIProgram.get_by_key_name(program_key)

  if comment.parent().program.key() != program.key():
    yield operation.counters.Increment("prev_program_comment_not_converted")
    return

  if comment.title not in ACTION_TITLES:
    yield operation.counters.Increment("user_comment_not_converted")
    return

  comment_title = ACTION_TITLES[comment.title]

  changes = ACTION_TITLES[comment_title]
  # Task reopening is a special case which could have been performed
  # either by a mentor or by the automated system after the passing of
  # the deadline. So additional inference of the user has to be made.
  if comment_title == 'Task Reopened':
    if comment.created_by:
      user_info = ugettext('User-Mentor')
    else:
      user_info = ugettext('MelangeAutomatic')
    changes = [user_info] + changes

  comment.changes = changes

  yield operation.db.Put(comment)
  yield operation.counters.Increment("action_comment_converted")
Example #4
  def write(self, data):
    """Write data.

    Args:
      data: actual data yielded from handler. Type is writer-specific.
    """
    ctx = context.get()
    if len(data) != 2:
      logging.error("Got bad tuple of length %d (2-tuple expected): %s",
                    len(data), data)

    try:
      key = str(data[0])
      value = str(data[1])
    except TypeError:
      logging.error("Expecting a tuple, but got %s: %s",
                    data.__class__.__name__, data)

    file_index = key.__hash__() % len(self._filehandles)

    pool = self._pools[file_index]
    if pool is None:
      filehandle = self._filehandles[file_index]
      pool = output_writers.GCSRecordsPool(filehandle=filehandle, ctx=ctx)
      self._pools[file_index] = pool

    proto = file_service_pb.KeyValue()
    proto.set_key(key)
    proto.set_value(value)
    pool.append(proto.Encode())
Example #5
    def write(self, data):
        """Write data.

    Args:
      data: actual data yielded from handler. Type is writer-specific.
    """
        ctx = context.get()
        if len(data) != 2:
            logging.error("Got bad tuple of length %d (2-tuple expected): %s",
                          len(data), data)

        try:
            key = str(data[0])
            value = str(data[1])
        except TypeError:
            logging.error("Expecting a tuple, but got %s: %s",
                          data.__class__.__name__, data)

        file_index = key.__hash__() % len(self._filehandles)

        pool = self._pools[file_index]
        if pool is None:
            filehandle = self._filehandles[file_index]
            pool = output_writers.GCSRecordsPool(filehandle=filehandle,
                                                 ctx=ctx)
            self._pools[file_index] = pool

        proto = file_service_pb.KeyValue()
        proto.set_key(key)
        proto.set_value(value)
        pool.append(proto.Encode())
Example #6
def process(task):
    ctx = context.get()
    params = ctx.mapreduce_spec.mapper.params
    program_key = params["program_key"]

    try:
        program = GCIProgram.get_by_key_name(program_key)
    except db.BadValueError:
        yield operation.counters.Increment("program_key_is_empty_or_invalid")
        return

    def subscribe_to_task_txn(task_key, subscribe):
        task = GCITask.get(task_key)
        task.subscribers = list(set(task.subscribers + subscribe))
        task.put()
        return task

    if task.program.key() != program.key():
        yield operation.counters.Increment("old_program_task_not_updated")
        return

    mentors = db.get(task.mentors)
    entities = mentors + [task.created_by, task.modified_by]

    subscribe = [ent.key() for ent in entities if ent.automatic_task_subscription]

    result = db.run_in_transaction(subscribe_to_task_txn, task.key(), subscribe)

    if result:
        yield operation.counters.Increment("task_updated")
    else:
        yield operation.counters.Increment("task_not_updated")
Example #7
  def write(self, data):
    """Write data.

    Args:
      data: actual data yielded from handler. Type is writer-specific.
    """
    ctx = context.get()
    if ctx.get_pool("file_pool") is None:
      ctx.register_pool("file_pool", _FilePool(ctx=ctx))
    ctx.get_pool("file_pool").append(self._filename, str(data))
Example #8
  def initialize(self):
    if self.initialized:
      return
    mapper_params = context.get().mapreduce_spec.mapper.params
    kind_filter = mapper_params.get('kind_filter')
    self.kind_filter = set(kind_filter) if kind_filter else None
    original_app = mapper_params.get('original_app')
    if original_app and os.getenv('APPLICATION_ID') != original_app:
      self.app_id = os.getenv('APPLICATION_ID')
    self.initialized = True
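initialize() only consumes the 'kind_filter' and 'original_app' mapper parameters, so they have to be supplied when the job is started. A rough sketch of supplying them through mapreduce.control.start_map; the handler and reader specs below are placeholders, not names from the original code:

from mapreduce import control

job_id = control.start_map(
    name="Restore entities",
    handler_spec="main.restore_entity_map",      # placeholder handler spec
    reader_spec="main.BackupRecordsReader",      # placeholder reader spec
    mapper_parameters={
        "kind_filter": ["Greeting", "Account"],  # kinds initialize() should keep
        "original_app": "source-app-id",         # app id the data was exported from
    },
    shard_count=8)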
Example #9
    def write(self, data):
        """Write data.

    Args:
      data: actual data yielded from handler. Type is writer-specific.
    """
        ctx = context.get()
        if ctx.get_pool("file_pool") is None:
            ctx.register_pool("file_pool", _FilePool(ctx=ctx))
        ctx.get_pool("file_pool").append(self._filename, str(data))
Example #10
  def initialize(self):
    if self.initialized:
      return
    mapper_params = context.get().mapreduce_spec.mapper.params
    kind_filter = mapper_params.get('kind_filter')
    self.kind_filter = set(kind_filter) if kind_filter else None
    original_app = mapper_params.get('original_app')
    if original_app and os.getenv('APPLICATION_ID') != original_app:
      self.app_id = os.getenv('APPLICATION_ID')
    self.initialized = True
Example #11
    def __iter__(self):
        ctx = context.get()
        combiner = None

        if ctx:
            combiner_spec = ctx.mapreduce_spec.mapper.params.get(
                "combiner_spec")
            if combiner_spec:
                combiner = util.handler_for_name(combiner_spec)

        self.current_key = None
        self.current_values = None

        for binary_record in super(_ReducerReader, self).__iter__():
            proto = file_service_pb.KeyValues()
            proto.ParseFromString(binary_record)

            if self.current_key is None:
                self.current_key = proto.key()
                self.current_values = []
            else:
                assert proto.key() == self.current_key, (
                    "inconsistent key sequence. Expected %s but got %s" %
                    (self.current_key, proto.key()))

            if combiner:
                combiner_result = combiner(self.current_key,
                                           proto.value_list(),
                                           self.current_values)

                if not util.is_generator(combiner_result):
                    raise errors.BadCombinerOutputError(
                        "Combiner %s should yield values instead of returning them (%s)"
                        % (combiner, combiner_result))

                self.current_values = []
                for value in combiner_result:
                    if isinstance(value, operation.Operation):
                        value(ctx)
                    else:
                        self.current_values.append(value)
            else:
                self.current_values.extend(proto.value_list())

            if not proto.partial():
                key = self.current_key
                values = self.current_values

                self.current_key = None
                self.current_values = None
                yield (key, values)
            else:
                yield input_readers.ALLOW_CHECKPOINT
Example #12
    def write(self, data):
        """Write data.

    Args:
      data: actual data yielded from handler. Type is writer-specific.
    """
        ctx = context.get()
        if ctx.get_pool("records_pool") is None:
            ctx.register_pool(
                "records_pool",
                RecordsPool(self._filename, ctx=ctx, exclusive=True))
        ctx.get_pool("records_pool").append(str(data))
Example #13
  def write(self, data):
    """Write data to the GoogleCloudStorage file.

    Args:
      data: string containing the data to be written.
    """
    start_time = time.time()
    self._get_write_buffer().write(data)
    ctx = context.get()
    operation.counters.Increment(COUNTER_IO_WRITE_BYTES, len(data))(ctx)
    operation.counters.Increment(
        COUNTER_IO_WRITE_MSEC, int((time.time() - start_time) * 1000))(ctx)
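Note the trailing (ctx) call: operation.counters.Increment(...) builds an operation object, and calling it with a context applies it right away, which is the pattern writers and readers use. Inside a map handler the usual style is instead to yield the operation and let the framework apply it. A minimal sketch of both styles (count_entities_map and record_bytes_written are made-up names):

from mapreduce import context, operation

def count_entities_map(entity):
  # Typical mapper: yield the operation and the framework applies it to the
  # current context.
  yield operation.counters.Increment("entities_seen")

def record_bytes_written(num_bytes):
  # Outside a handler (e.g. inside an output writer), apply the operation
  # directly to the current context.
  ctx = context.get()
  if ctx:
    operation.counters.Increment("bytes_written", num_bytes)(ctx)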
Example #14
  def write(self, data):
    """Write data to the GoogleCloudStorage file.

    Args:
      data: string containing the data to be written.
    """
    start_time = time.time()
    self._streaming_buffer.write(data)
    ctx = context.get()
    operation.counters.Increment(COUNTER_IO_WRITE_BYTES, len(data))(ctx)
    operation.counters.Increment(
        COUNTER_IO_WRITE_MSEC, int((time.time() - start_time) * 1000))(ctx)
Example #15
  def __iter__(self):
    ctx = context.get()
    combiner = None

    if ctx:
      combiner_spec = ctx.mapreduce_spec.mapper.params.get("combiner_spec")
      if combiner_spec:
        combiner = util.handler_for_name(combiner_spec)

    self.current_key = None
    self.current_values = None

    for binary_record in super(_ReducerReader, self).__iter__():
      proto = file_service_pb.KeyValues()
      proto.ParseFromString(binary_record)

      if self.current_key is None:
        self.current_key = proto.key()
        self.current_values = []
      else:
        assert proto.key() == self.current_key, (
            "inconsistent key sequence. Expected %s but got %s" %
            (self.current_key, proto.key()))

      if combiner:
        combiner_result = combiner(
            self.current_key, proto.value_list(), self.current_values)

        if not util.is_generator(combiner_result):
          raise errors.BadCombinerOutputError(
              "Combiner %s should yield values instead of returning them (%s)" %
              (combiner, combiner_result))

        self.current_values = []
        for value in combiner_result:
          if isinstance(value, operation.Operation):
            value(ctx)
          else:
            self.current_values.append(value)
      else:
        self.current_values.extend(proto.value_list())

      if not proto.partial():
        key = self.current_key
        values = self.current_values

        self.current_key = None
        self.current_values = None
        yield (key, values)
      else:
        yield input_readers.ALLOW_CHECKPOINT
Example #16
  def write(self, data):
    """Write data.

    Args:
      data: actual data yielded from handler. Type is writer-specific.
    """
    ctx = context.get()
    if ctx.get_pool("records_pool") is None:
      ctx.register_pool("records_pool",


                        RecordsPool(self._filename, ctx=ctx, exclusive=True))
    ctx.get_pool("records_pool").append(str(data))
Example #17
  def __iter__(self):
    ctx = context.get()

    while self._count:
      self._count -= 1
      start_time = time.time()
      content = "".join(random.choice(string.ascii_lowercase)
                        for _ in range(self._string_length))
      if ctx:
        operation.counters.Increment(
            COUNTER_IO_READ_MSEC, int((time.time() - start_time) * 1000))(ctx)
        operation.counters.Increment(COUNTER_IO_READ_BYTES, len(content))(ctx)
      yield content
Example #18
def touch(key):
    # change entity
    app = key.namespace()
    kind = key.kind()
    id = key.id_or_name()

    ctx = context.get()
    params = ctx.mapreduce_spec.mapper.params
    matching_app = params["app_to_process"]

    if matching_app and matching_app != app:
        return

    metadata_entity = store._GetMetadataEntity(app)
    store.update_entity(app, kind, id, {}, metadata_entity, None, put_function=yield_put, rebuild_facets=True)
Example #19
def process(task):
  ctx = context.get()
  params = ctx.mapreduce_spec.mapper.params
  program_key = params['program_key']

  program = GCIProgram.get_by_key_name(program_key)

  if (task.program.key() == program.key() and
      (task.status == 'Unapproved' or task.status == 'Unpublished')):
    task.status = 'Open'
    yield operation.db.Put(task)

    yield operation.counters.Increment("task_updated")
    return

  yield operation.counters.Increment("task_not_updated")
Example #20
    def __iter__(self):
        ctx = context.get()

        while self._count:
            self._count -= 1
            start_time = time.time()
            content = "".join(
                random.choice(string.ascii_lowercase)
                for _ in range(self._string_length))
            if ctx:
                operation.counters.Increment(
                    COUNTER_IO_READ_MSEC, int(
                        (time.time() - start_time) * 1000))(ctx)
                operation.counters.Increment(COUNTER_IO_READ_BYTES,
                                             len(content))(ctx)
            yield content
Example #21
def ProcessNamespace(namespace):
  """Handler function for mapper over all namespaces.

  Starts mapper jobs specified by parameters over all passed kinds.

  Args:
    namespace: namespace to process.

  Returns:
    Started mapper job ids. Mapper framework ignores function value. Returning
    these for testing purposes only.
  """
  ctx = context.get()
  mapreduce_spec = ctx.mapreduce_spec
  params = mapreduce_spec.params
  operation = DatastoreAdminOperation.get(
      params[DatastoreAdminOperation.PARAM_DATASTORE_ADMIN_OPERATION])
  mapper_params = params['mapper_params']

  jobs = []
  for kind in params['kinds']:
    job_key_name = kind + "@" + namespace
    mapper_params['entity_kind'] = kind
    mapper_params['namespaces'] = namespace
    job_name = params['job_name'] % {
        'kind': kind,
        'namespace': ' in namespace ' + namespace
        }

    def tx():
      if db.get(db.Key.from_path(operation.kind(),
                                 operation.key().id_or_name(),
                                 DatastoreAdminOperationJob.kind(),
                                 job_key_name)):
        return None
      DatastoreAdminOperationJob(key_name=job_key_name, parent=operation).put()
      return StartMap(operation,
                      job_name,
                      params['handler_spec'],
                      params['reader_spec'],
                      mapper_params,
                      start_transaction=False)
    job = db.run_in_transaction(tx)
    if job:
      jobs.append(job)
  return jobs
Example #22
  def _read(self, entry):
    """Read entry content.

    Args:
      entry: zip file entry as zipfile.ZipInfo.
    Returns:
      Entry content as string.
    """
    start_time = time.time()
    content = self._zip.read(entry.filename)

    ctx = context.get()
    if ctx:
      operation.counters.Increment(COUNTER_IO_READ_BYTES, len(content))(ctx)
      operation.counters.Increment(
          COUNTER_IO_READ_MSEC, int((time.time() - start_time) * 1000))(ctx)

    return content
Example #23
def process(org_app):
  ctx = context.get()
  params = ctx.mapreduce_spec.mapper.params
  program_key = params['program_key']
  # TODO(SRabbelier): should have been a full url
  url = 'gci/profile/organization/%s' % program_key

  # TODO(SRabbelier): create a MapReduce/Task RequestData
  data = RequestData()
  data.program = GCIProgram.get_by_key_name(program_key)
  data.site = Site.get_by_key_name('site')

  if org_app.status == 'pre-accepted':
    org_app_logic.setStatus(data, org_app, 'accepted', url)
    yield operation.counters.Increment("proposals_accepted")
  elif org_app.status == 'pre-rejected':
    org_app_logic.setStatus(data, org_app, 'rejected', url)
    yield operation.counters.Increment("proposals_rejected")
  else:
    yield operation.counters.Increment("proposals_ignored")
Example #24
def _sort_records_map(records):
    """Map function sorting records.

  Converts records to KeyValue protos, sorts them by key and writes them
  into new GCS file. Creates _OutputFile entity to record resulting
  file name.

  Args:
    records: list of records which are serialized KeyValue protos.
  """
    ctx = context.get()
    l = len(records)
    key_records = [None] * l

    logging.debug("Parsing")
    for i in range(l):
        proto = file_service_pb.KeyValue()
        proto.ParseFromString(records[i])
        key_records[i] = (proto.key(), records[i])

    logging.debug("Sorting")
    key_records.sort(cmp=_compare_keys)

    logging.debug("Writing")
    mapper_spec = ctx.mapreduce_spec.mapper
    params = input_readers._get_params(mapper_spec)
    bucket_name = params.get("bucket_name")
    filename = (ctx.mapreduce_spec.name + "/" + ctx.mapreduce_id + "/output-" +
                ctx.shard_id + "-" + str(int(time.time())))
    full_filename = "/%s/%s" % (bucket_name, filename)
    filehandle = cloudstorage.open(full_filename, mode="w")
    with output_writers.GCSRecordsPool(filehandle, ctx=ctx) as pool:
        for key_record in key_records:
            pool.append(key_record[1])

    logging.debug("Finalizing")
    filehandle.close()

    entity = _OutputFile(key_name=full_filename,
                         parent=_OutputFile.get_root_key(ctx.mapreduce_id))
    entity.put()
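key_records.sort(cmp=_compare_keys) relies on the Python 2-only cmp= argument, which is fine for the Python 2.7 App Engine runtime these snippets target. Since key_records holds (proto_key, serialized_record) tuples, the same ordering can be written with key= (valid on Python 2 and 3), assuming _compare_keys simply orders records by the proto key, as the surrounding code suggests:

    # Equivalent sort without a comparison function: order by the extracted key.
    key_records.sort(key=lambda key_record: key_record[0])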
Example #25
def _sort_records(records):
  """Map function sorting records.

  Converts records to KeyValue protos, sorts them by key and writes them
  into new blobstore file. Creates _OutputFile entity to record resulting
  file name.

  Args:
    records: list of records which are serialized KeyValue protos.
  """
  ctx = context.get()
  l = len(records)
  proto_records = [None] * l

  logging.info("parsing")
  for i in range(l):
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(records[i])
    proto_records[i] = proto

  logging.info("sorting")
  proto_records.sort(cmp=_compare_keys)

  logging.info("writing")
  blob_file_name = (ctx.mapreduce_spec.name + "-" +
                    ctx.mapreduce_id + "-output")
  output_path = files.blobstore.create(
      _blobinfo_uploaded_filename=blob_file_name)
  with output_writers.RecordsPool(output_path, ctx=ctx) as pool:
    for proto in proto_records:
      pool.append(proto.Encode())

  logging.info("finalizing")
  files.finalize(output_path)
  output_path = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(output_path))

  entity = _OutputFile(key_name=output_path,
                       parent=_OutputFile.get_root_key(ctx.mapreduce_id))
  entity.put()
Example #26
def _sort_records(records):
  """Map function sorting records.

  Converts records to KeyValue protos, sorts them by key and writes them
  into new blobstore file. Creates _OutputFile entity to record resulting
  file name.

  Args:
    records: list of records which are serialized KeyValue protos.
  """
  ctx = context.get()
  l = len(records)
  proto_records = [None] * l

  logging.debug("Parsing")
  for i in range(l):
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(records[i])
    proto_records[i] = proto

  logging.debug("Sorting")
  proto_records.sort(cmp=_compare_keys)

  logging.debug("Writing")
  blob_file_name = (ctx.mapreduce_spec.name + "-" +
                    ctx.mapreduce_id + "-output")
  output_path = files.blobstore.create(
      _blobinfo_uploaded_filename=blob_file_name)
  with output_writers.RecordsPool(output_path, ctx=ctx) as pool:
    for proto in proto_records:
      pool.append(proto.Encode())

  logging.debug("Finalizing")
  files.finalize(output_path)
  time.sleep(1)
  output_path = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(output_path))

  entity = _OutputFile(key_name=output_path,
                       parent=_OutputFile.get_root_key(ctx.mapreduce_id))
  entity.put()
Example #27
def _sort_records_map(records):
  """Map function sorting records.

  Converts records to KeyValue protos, sorts them by key and writes them
  into new GCS file. Creates _OutputFile entity to record resulting
  file name.

  Args:
    records: list of records which are serialized KeyValue protos.
  """
  ctx = context.get()
  l = len(records)
  key_records = [None] * l

  logging.debug("Parsing")
  for i in range(l):
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(records[i])
    key_records[i] = (proto.key(), records[i])

  logging.debug("Sorting")
  key_records.sort(cmp=_compare_keys)

  logging.debug("Writing")
  mapper_spec = ctx.mapreduce_spec.mapper
  params = input_readers._get_params(mapper_spec)
  bucket_name = params.get("bucket_name")
  filename = (ctx.mapreduce_spec.name + "/" + ctx.mapreduce_id + "/output-" +
              ctx.shard_id + "-" + str(int(time.time())))
  full_filename = "/%s/%s" % (bucket_name, filename)
  filehandle = cloudstorage.open(full_filename, mode="w")
  with output_writers.GCSRecordsPool(filehandle, ctx=ctx) as pool:
    for key_record in key_records:
      pool.append(key_record[1])

  logging.debug("Finalizing")
  filehandle.close()

  entity = _OutputFile(key_name=full_filename,
                       parent=_OutputFile.get_root_key(ctx.mapreduce_id))
  entity.put()
Example #28
def touch(key):
    # change entity
    app = key.namespace()
    kind = key.kind()
    id = key.id_or_name()

    ctx = context.get()
    params = ctx.mapreduce_spec.mapper.params
    matching_app = params['app_to_process']

    if matching_app and matching_app != app:
        return

    metadata_entity = store._GetMetadataEntity(app)
    store.update_entity(app,
                        kind,
                        id, {},
                        metadata_entity,
                        None,
                        put_function=yield_put,
                        rebuild_facets=True)
Example #29
  def __iter__(self):
    """Iterate over records in file.

    Yields records as strings.
    """
    ctx = context.get()

    while self._reader:
      try:
        start_time = time.time()
        record = self._reader.read()
        if ctx:
          operation.counters.Increment(
              COUNTER_IO_READ_MSEC, int((time.time() - start_time) * 1000))(ctx)
          operation.counters.Increment(COUNTER_IO_READ_BYTES, len(record))(ctx)
        yield record
      except EOFError:
        self._filenames.pop(0)
        if not self._filenames:
          self._reader = None
        else:
          self._reader = records.RecordsReader(
              files.BufferedFile(self._filenames[0]))
Example #30
  def __iter__(self):
    """Iterate over records in input files.

    self._offsets is always correctly updated so that stopping iterations
    doesn't skip records and doesn't read the same record twice.
    """
    ctx = context.get()
    mapper_spec = ctx.mapreduce_spec.mapper
    shard_number = ctx.shard_state.shard_number
    filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

    if len(filenames) != len(self._offsets):
      raise Exception("Files list and offsets do not match.")


    readers = []


    for (i, filename) in enumerate(filenames):
      offset = self._offsets[i]
      reader = records.RecordsReader(files.BufferedFile(filename))
      reader.seek(offset)
      readers.append((None, None, i, reader))

    current_result = None
    while readers:
      (key, value, index, reader) = readers[0]

      if key is not None:
        if current_result and key != current_result[0]:
          yield current_result
        if not current_result or key != current_result[0]:
          current_result = (key, [])
        current_result[1].append(value)

      try:
        self._offsets[index] = reader.tell()
        start_time = time.time()
        binary_record = reader.read()

        if context.get():
          operation.counters.Increment(
              input_readers.COUNTER_IO_READ_BYTES,
              len(binary_record))(context.get())
          operation.counters.Increment(
              input_readers.COUNTER_IO_READ_MSEC,
              int((time.time() - start_time) * 1000))(context.get())
        proto = file_service_pb.KeyValue()
        proto.ParseFromString(binary_record)

        heapq.heapreplace(readers,
                          (proto.key(), proto.value(), index, reader))
      except EOFError:
        heapq.heappop(readers)

    if current_result:
      yield current_result
Example #31
def delete(entity):
    params = context.get().mapreduce_spec.mapper.params
    quiz_id = int(params['quiz_id'])
    if entity.quiz.key().id() == quiz_id:
        entity.is_archived = True
        yield operation.db.Delete(entity)
Example #32
def yield_put(entity):
    f = op.db.Put(entity)
    f(context.get())
Example #33
    def __iter__(self):
        """Iterate over records in input files.

    self._offsets is always correctly updated so that stopping iterations
    doesn't skip records and doesn't read the same record twice.

    Raises:
      Exception: when Files list and offsets do not match.

    Yields:
      The result.
    """
        ctx = context.get()
        mapper_spec = ctx.mapreduce_spec.mapper
        shard_number = ctx._shard_state.shard_number
        filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

        if len(filenames) != len(self._offsets):
            raise Exception("Files list and offsets do not match.")

        readers = []

        for (i, filename) in enumerate(filenames):
            offset = self._offsets[i]

            reader = records.RecordsReader(
                cloudstorage.open(filename,
                                  read_buffer_size=self.GCS_BUFFER_SIZE))
            reader.seek(offset)
            readers.append((None, None, i, reader))

        current_result = None
        current_count = 0
        current_size = 0
        while readers:
            (key, value, index, reader) = readers[0]

            if key is not None:
                current_count += 1
                current_size += len(value)

                should_yield = False
                if current_result:
                    if key != current_result[0]:
                        should_yield = True
                    elif (self._max_values_count != -1
                          and current_count >= self._max_values_count):
                        current_result[2] = True
                        should_yield = True
                    elif (self._max_values_size != -1
                          and current_size >= self._max_values_size):
                        current_result[2] = True
                        should_yield = True

                if should_yield:
                    yield current_result
                if not current_result or should_yield:
                    current_result = [key, [], False]
                    current_count = 0
                    current_size = 0
                current_result[1].append(value)

            try:
                self._offsets[index] = reader.tell()
                start_time = time.time()
                binary_record = reader.read()

                if context.get():
                    operation.counters.Increment(
                        input_readers.COUNTER_IO_READ_BYTES,
                        len(binary_record))(context.get())
                    operation.counters.Increment(
                        input_readers.COUNTER_IO_READ_MSEC,
                        int((time.time() - start_time) * 1000))(context.get())
                proto = file_service_pb.KeyValue()
                proto.ParseFromString(binary_record)

                heapq.heapreplace(readers,
                                  (proto.key(), proto.value(), index, reader))
            except EOFError:
                heapq.heappop(readers)

        if current_result:
            yield current_result
Example #34
def get_mapper_params():
    """Return current mapreduce mapper params. Easily stubbed out for testing."""
    return context.get().mapreduce_spec.mapper.params
Example #35
    def __init__(self):
        mapper_params = context.get().mapreduce_spec.mapper.params
        kind_filter = mapper_params.get('kind_filter')
        self.kind_filter = set(kind_filter) if kind_filter else None
Example #36
def get_mapper_params():
  """Return current mapreduce mapper params. Easily stubbed out for testing."""
  return context.get().mapreduce_spec.mapper.params
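The docstring's "easily stubbed out for testing" means handlers should call get_mapper_params() rather than reaching into context.get() themselves, so a test can swap out that single function. A rough sketch in the same mox style as Example #2; module_under_test stands for whatever module defines get_mapper_params and is not a name from the original code:

import mox

def run_with_stubbed_params(module_under_test):
  """Sketch: make get_mapper_params() return fixed params for one test run."""
  m = mox.Mox()
  m.StubOutWithMock(module_under_test, "get_mapper_params")
  module_under_test.get_mapper_params().AndReturn({'app_to_process': 'my-app'})
  m.ReplayAll()
  try:
    params = module_under_test.get_mapper_params()
    assert params['app_to_process'] == 'my-app'
    m.VerifyAll()
  finally:
    m.UnsetStubs()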
Example #37
  def __iter__(self):
    """Iterate over records in input files.

    self._offsets is always correctly updated so that stopping iterations
    doesn't skip records and doesn't read the same record twice.

    Raises:
      Exception: when Files list and offsets do not match.

    Yields:
      The result.
    """
    ctx = context.get()
    mapper_spec = ctx.mapreduce_spec.mapper
    shard_number = ctx._shard_state.shard_number
    filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

    if len(filenames) != len(self._offsets):
      raise Exception("Files list and offsets do not match.")


    readers = []


    for (i, filename) in enumerate(filenames):
      offset = self._offsets[i]


      reader = records.RecordsReader(
          cloudstorage.open(filename, read_buffer_size=self.GCS_BUFFER_SIZE))
      reader.seek(offset)
      readers.append((None, None, i, reader))

    current_result = None
    current_count = 0
    current_size = 0
    while readers:
      (key, value, index, reader) = readers[0]

      if key is not None:
        current_count += 1
        current_size += len(value)

        should_yield = False
        if current_result:
          if key != current_result[0]:
            should_yield = True
          elif (self._max_values_count != -1 and
                current_count >= self._max_values_count):
            current_result[2] = True
            should_yield = True
          elif (self._max_values_size != -1 and
                current_size >= self._max_values_size):
            current_result[2] = True
            should_yield = True

        if should_yield:
          yield current_result
        if not current_result or should_yield:
          current_result = [key, [], False]
          current_count = 0
          current_size = 0
        current_result[1].append(value)

      try:
        self._offsets[index] = reader.tell()
        start_time = time.time()
        binary_record = reader.read()

        if context.get():
          operation.counters.Increment(
              input_readers.COUNTER_IO_READ_BYTES,
              len(binary_record))(context.get())
          operation.counters.Increment(
              input_readers.COUNTER_IO_READ_MSEC,
              int((time.time() - start_time) * 1000))(context.get())
        proto = file_service_pb.KeyValue()
        proto.ParseFromString(binary_record)

        heapq.heapreplace(readers,
                          (proto.key(), proto.value(), index, reader))
      except EOFError:
        heapq.heappop(readers)

    if current_result:
      yield current_result
Example #38
def yield_put(entity):
    f = op.db.Put(entity)
    f(context.get())
Example #39
  def __init__(self):
    mapper_params = context.get().mapreduce_spec.mapper.params
    kind_filter = mapper_params.get('kind_filter')
    self.kind_filter = set(kind_filter) if kind_filter else None