Example #1
def Start():
  jsonTree = {}
  items = []

  for filename in os.listdir(backupFolder):
    if not filename.startswith("output-"): continue
    print("Reading from:" + filename)
    
    inPath = os.path.join(backupFolder, filename)
    raw = open(inPath, 'rb')
    reader = records.RecordsReader(raw)
    for recordIndex, record in enumerate(reader):
      entity_proto = entity_pb.EntityProto(contents=record)

      #collection = GetCollectionOfProtoEntity(entity_proto)
      collectionInJSONTree = GetCollectionInJSONTreeForProtoEntity(jsonTree, entity_proto)
      key = GetKeyOfProtoEntity(entity_proto)
      entity = GetValueOfProtoEntity(entity_proto)

      collectionInJSONTree[key] = entity
      items.append(entity) # also add to flat list, so we know the total item count

      print("Parsing document #" + str(len(items)))
      
  outPath = os.path.join(backupFolder, 'Data.json')
  out = open(outPath, 'w')
  out.write(json.dumps(jsonTree, default=JsonSerializeFunc, encoding='latin-1', indent=2))
  out.close()
  print("JSON file written to: " + outPath)
Example #2
def run():
    # Set your downloaded folder's path here (must be readable by dev_appserver)
    mypath = '/Users/lambert/Dropbox/dancedeets/data/datastore_backup_datastore_backup_2016_11_19_DBEvent/15700286559371541387849311E815D'
    # Set the class of the objects here
    cls = DBEvent
    # Set your app's name here
    appname = "dev~None"

    # Do the harlem shake
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

    for file in onlyfiles:
        i = 0
        try:
            raw = open(mypath + "/" + file, 'r')
            reader = records.RecordsReader(raw)
            to_put = list()
            for record in reader:
                entity_proto = entity_pb.EntityProto(contents=record)
                entity_proto.key_.app_ = appname
                obj = cls._from_pb(entity_proto)

                to_put.append(obj)
                i += 1
                if i % 100 == 0:
                    print "Saved %d %ss" % (i, '')  #entity.kind())
                    ndb.put_multi(to_put)  # use_memcache=False)
                    to_put = list()

            ndb.put_multi(to_put)  # use_memcache=False)
            to_put = list()
            print "Saved %d" % i

        except ProtocolBufferDecodeError:
            pass  # "All good": skip records/files that aren't entity protos
Example #3
def parse_backup_info_file(content):
  """Returns entities iterator from a backup_info file content."""
  reader = records.RecordsReader(cStringIO.StringIO(content))
  version = reader.read()
  if version != '1':
    raise IOError('Unsupported version')
  for record in reader:
    yield datastore.Entity.FromPb(record)
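
A small usage sketch for parse_backup_info_file; the filename is hypothetical, and the entities yielded are the backup's metadata records rather than your application data.

# Hypothetical usage of parse_backup_info_file (the path is an assumption).
with open('datastore_backup_2016_11_19.backup_info', 'rb') as f:
  for info_entity in parse_backup_info_file(f.read()):
    print info_entity.kind(), dict(info_entity)
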
Example #4
    def __iter__(self):
        """Iterate over records in input files.

    self._offsets is always correctly updated so that stopping iterations
    doesn't skip records and doesn't read the same record twice.
    """
        ctx = context.get()
        mapper_spec = ctx.mapreduce_spec.mapper
        shard_number = ctx.shard_state.shard_number
        filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

        if len(filenames) != len(self._offsets):
            raise Exception("Files list and offsets do not match.")

        readers = []

        for (i, filename) in enumerate(filenames):
            offset = self._offsets[i]
            reader = records.RecordsReader(files.BufferedFile(filename))
            reader.seek(offset)
            readers.append((None, None, i, reader))

        current_result = None
        while readers:
            (key, value, index, reader) = readers[0]

            if key is not None:
                if current_result and key != current_result[0]:
                    # New key encountered: emit the values accumulated for the previous key.
                    yield current_result
                if not current_result or key != current_result[0]:
                    current_result = (key, [])
                current_result[1].append(value)

            try:
                self._offsets[index] = reader.tell()
                start_time = time.time()
                binary_record = reader.read()

                if context.get():
                    operation.counters.Increment(
                        input_readers.COUNTER_IO_READ_BYTES,
                        len(binary_record))(context.get())
                    operation.counters.Increment(
                        input_readers.COUNTER_IO_READ_MSEC,
                        int((time.time() - start_time) * 1000))(context.get())
                proto = file_service_pb.KeyValue()
                proto.ParseFromString(binary_record)

                heapq.heapreplace(readers,
                                  (proto.key(), proto.value(), index, reader))
            except EOFError:
                heapq.heappop(readers)

        if current_result:
            yield current_result
Example #5
File: read.py Project: jemisa/poem
def iter_entity(folder):
    for d, _, files in os.walk(folder):
        for fn in files:
            path = os.path.join(d, fn)
            raw = open(path, 'r')
            reader = records.RecordsReader(raw)
            for record in reader:
                entity = record_to_dict(record)
                yield entity
            raw.close()
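
Example #5 (read.py from jemisa/poem) calls a record_to_dict helper that is not included; a plausible sketch, assuming the same EntityProto decoding the other examples use.

# Hypothetical record_to_dict used by iter_entity above; the body is an assumption.
from google.appengine.api import datastore
from google.appengine.datastore import entity_pb

def record_to_dict(record):
    entity_proto = entity_pb.EntityProto(contents=record)
    return dict(datastore.Entity.FromPb(entity_proto))
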
Example #6
def readfile(f, otype, outs):
    raw = open('{0}/kind_{1}/{2}'.format(DATA_DIRECTORY, otype, f), 'r')
    reader = records.RecordsReader(raw)
    last = ''
    for record in reader:
        entity_proto = entity_pb.EntityProto(contents=record)
        entity = datastore.Entity.FromPb(entity_proto)
        key = entity_proto.key()
        elems = key.path()
        the_type = elems.element_list()[-1].type()
        if the_type in objects:
            write_object(outs, the_type, entity)
            count[otype] += 1
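
Example #6 references module-level names that are not shown (DATA_DIRECTORY, objects, count, write_object) and expects outs to map kind names to per-kind writers. One possible setup; every path, kind name, and column choice here is an assumption.

# Hypothetical module-level setup for Example #6.
from collections import defaultdict

DATA_DIRECTORY = '/path/to/backup'   # assumed backup root containing kind_* folders
objects = set(['DBEvent'])           # kinds we actually want to export (assumed)
count = defaultdict(int)             # per-kind counters updated by readfile()

def write_object(outs, the_type, entity):
    # outs is assumed to map a kind name to something with a writerow() method.
    outs[the_type].writerow([entity.get(name) for name in sorted(entity.keys())])
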
Example #7
def run():
    # Set your downloaded folder's path here (must be readable by dev_appserver)
    mypath = '/local_target'
    # Set your app's name here
    appname = "dev~yourappnamehere"

    # Do the harlem shake
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    ec = datastore_pbs.get_entity_converter()

    for file in onlyfiles:
        i = 0
        try:
            raw = open(mypath + "/" + file, 'r')
            reader = records.RecordsReader(raw)
            to_put = list()
            for record in reader:
                entity_proto = entity_pb.EntityProto(contents=record)
                entity_proto.key_.app_ = appname
                entity = db.model_from_protobuf(entity_proto)
                a = db.model_from_protobuf(entity_proto)

                for pp in dir(a):
                    try:
                        ppp = getattr(a, "_" + pp)
                        if isinstance(ppp, db.Key):
                            # Point referenced keys at the local app id as well.
                            ppp._Key__reference.set_app(appname)
                    except AttributeError:
                        pass  # "It's okay": not every attribute has an underscore-prefixed twin

                to_put.append(a)
                i += 1
                if i % 100 == 0:
                    print "Saved %d %ss" % (i, entity.kind())
                    db.put(to_put)
                    to_put = list()

            db.put(to_put)
            to_put = list()
            print "Saved %d" % i

        except ProtocolBufferDecodeError:
            pass  # "All good": skip records that aren't entity protos
Example #8
def import_datastore_backup_file(filename):
    app = ndb.Key('Foo', 'Bar').app()  # any dummy key works; we only need the local app id
    kind = None
    i = 0
    raw = open(filename, 'r')
    reader = records.RecordsReader(raw)
    to_put = list()
    for record in reader:
        entity = protobuf_to_entity(record)
        _localize_key_properties(entity, app)
        kind = kind or entity.key.kind()
        to_put.append(entity)
        if len(to_put) == 100:
            ndb.put_multi(to_put)
            to_put = list()
            i += 100
    ndb.put_multi(to_put)
    i += len(to_put)
    logging.info('imported %d %s entities', i, kind)
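
Example #8 depends on two helpers, protobuf_to_entity and _localize_key_properties, that are not shown. A rough sketch, under the assumption that records are decoded with ndb's ModelAdapter and that keys just need to be re-homed on the local app id.

# Hypothetical helpers for Example #8; the key-rewriting details are assumptions.
from google.appengine.datastore import entity_pb
from google.appengine.ext import ndb

def protobuf_to_entity(record):
    # Decode one raw backup record into an ndb model instance
    # (ModelAdapter resolves the model class from the entity's kind).
    entity_proto = entity_pb.EntityProto(contents=record)
    return ndb.ModelAdapter().pb_to_entity(entity_proto)

def _localize_key_properties(entity, app):
    # Re-home the entity key and any KeyProperty values on the local app id.
    entity.key = ndb.Key(flat=entity.key.flat(), app=app)
    for name, value in entity.to_dict().items():
        if isinstance(value, ndb.Key):
            setattr(entity, name, ndb.Key(flat=value.flat(), app=app))
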
Example #9
def process(output_dir, time_capture, table_tuple, writeFn):
    root = table_tuple[0]
    table_name = table_tuple[1]
    filenames = table_tuple[2]
    header_list = None

    print 'Converting ' + table_name + ' to a CSV file...'
    time_capture.start()

    count = 0
    # open file for writing
    with open(join(output_dir, table_name + '.csv'), 'w') as csv_file:
        writer = csv.writer(csv_file)
        # read files, process, and write
        for filename in filenames:
            path = join(root, filename)
            with open(path, 'r') as raw_file:
                reader = records.RecordsReader(raw_file)
                for record in reader:
                    entity_proto = entity_pb.EntityProto(contents=record)
                    entity = datastore.Entity.FromPb(entity_proto)
                    if header_list is None:
                        header_list = parseHeaderFields(entity)
                        display_header_list = ['key']
                        display_header_list.extend(header_list)
                        writeFn(writer, entity, display_header_list)
                    csv_row = entity2csvRow(header_list, entity)
                    writeFn(writer, entity, csv_row)
                    count += 1
    time_capture.end(count)
    print '    ...converted {count:d} objects of type {obj_type} in {run_time:.2f} seconds | {ms_per_obj:.2f} ms/obj | total time = {total_time:.2f} seconds'.format(
        count=count,
        obj_type=table_name,
        run_time=time_capture.run_time,
        ms_per_obj=time_capture.ms_per_obj,
        total_time=time_capture.total_time)
    return time_capture
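
Example #9 relies on parseHeaderFields, entity2csvRow, and a writeFn callback that are not shown. Minimal sketches, assuming the header is simply the first entity's property names and writeFn just writes the prepared row.

# Hypothetical helpers for Example #9; column order and formatting are assumptions.
def parseHeaderFields(entity):
    # Use the first entity's property names, sorted, as the CSV header.
    return sorted(entity.keys())

def entity2csvRow(header_list, entity):
    # Key column first, then one column per header field (blank when missing).
    return [str(entity.key())] + [entity.get(name, '') for name in header_list]

def writeFn(writer, entity, row):
    # Simplest possible writer callback: ignore the entity, write the row.
    writer.writerow(row)
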
Example #10
# type : 0
# show_answer_when_incorrect : False


# Returns a formatted json object or string
def pp_json(json_thing, sort=True, indents=4):
    if type(json_thing) is str:
        return json.dumps(json.loads(json_thing),
                          sort_keys=sort,
                          indent=indents)
    else:
        return json.dumps(json_thing, sort_keys=sort, indent=indents)


raw = open(sys.argv[1], 'r')
reader = records.RecordsReader(raw)
questions = []
for record in reader:
    entity_proto = entity_pb.EntityProto(contents=record)
    questions.append(datastore.Entity.FromPb(entity_proto)['data'])

#print('***RAM*** entity = ' + str(questions) + '\n\n')

k = 0
question0 = json.loads(questions[0])
for prop in question0:
    print prop, ':', question0[prop]
print

csp_qs = []
k = 0
Example #11
    def __iter__(self):
        """Iterate over records in input files.

    self._offsets is always correctly updated so that stopping iterations
    doesn't skip records and doesn't read the same record twice.
    """
        ctx = context.get()
        mapper_spec = ctx.mapreduce_spec.mapper
        shard_number = ctx.shard_state.shard_number
        filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

        if len(filenames) != len(self._offsets):
            raise Exception("Files list and offsets do not match.")

        # Heap with (Key, Value, Index, reader) pairs.
        readers = []

        # Initialize heap
        for (i, filename) in enumerate(filenames):
            offset = self._offsets[i]
            reader = records.RecordsReader(files.BufferedFile(filename))
            reader.seek(offset)
            readers.append((None, None, i, reader))

        # Read records from heap and merge values with the same key.

        # current_result is yielded and consumed by _merge_map.
        # current_result = [key, values, is_partial]
        current_result = None
        current_count = 0
        current_size = 0
        while readers:
            (key, value, index, reader) = readers[0]

            if key is not None:
                current_count += 1
                current_size += len(value)

                should_yield = False
                if current_result:
                    if key != current_result[0]:
                        # New key encountered
                        should_yield = True
                    elif (self._max_values_count != -1
                          and current_count >= self._max_values_count):
                        # Maximum number of values encountered.
                        current_result[2] = True
                        should_yield = True
                    elif (self._max_values_size != -1
                          and current_size >= self._max_values_size):
                        # Maximum size of values encountered
                        current_result[2] = True
                        should_yield = True

                if should_yield:
                    # New key encountered or maximum count hit. Yield current key.
                    yield current_result
                if not current_result or should_yield:
                    current_result = [key, [], False]
                    current_count = 0
                    current_size = 0
                current_result[1].append(value)

            # Read next key/value from reader.
            try:
                self._offsets[index] = reader.tell()
                start_time = time.time()
                binary_record = reader.read()
                # update counters
                if context.get():
                    operation.counters.Increment(
                        input_readers.COUNTER_IO_READ_BYTES,
                        len(binary_record))(context.get())
                    operation.counters.Increment(
                        input_readers.COUNTER_IO_READ_MSEC,
                        int((time.time() - start_time) * 1000))(context.get())
                proto = file_service_pb.KeyValue()
                proto.ParseFromString(binary_record)
                # Put read data back into heap.
                heapq.heapreplace(readers,
                                  (proto.key(), proto.value(), index, reader))
            except EOFError:
                heapq.heappop(readers)

        # Yield leftovers.
        if current_result:
            yield current_result
Example #12
def load_datastore_backup(request):
    
    backup_info_files, backup_output_files = glob_output_files(FILE_PATH)
    
    html = '<body>'
    
    if not request.method == 'POST':
        
        files       = groupFiles( FILE_PATH )
        model_names = [filename.split(".")[1] for filename in backup_info_files]

        html += '<h2>Import datastore backup files into local dev server via  &rarr;datastore_handler.py</h2> \
                <h3>&bull; Using file path: &nbsp; {} <br/><br /> \
                &bull; Click on the "Start Export" at the bottom to start the process. \
                Depending on the export size, this process can take a very long time.</h3>'.format( FILE_PATH )

        html += '<h4>Model Names:</h4>{}<br /><br />'.format(model_names)
        html += '<h4>backup_info_files: </h4>{}<br /><br />'.format(backup_info_files)
        html += '<h4>backup_output_files: </h4>{}<br /><br />'.format(backup_output_files)
        
        # only allow one submit 
        html += '<script> \
                    function doSubmit() { \
                        var theButton = document.getElementById("submitButton"); \
                        var theForm = document.getElementById("theForm"); \
                        if (theButton.innerText == "Start Import") { \
                            theButton.innerText = "Processing..."; \
                            theForm.submit(); \
                        } \
                    } \
                </script> \
                <form id="theForm" action="" method="post"></form> \
                <center> \
                <button id="submitButton" style="padding:10px;text-align:center;" onclick="doSubmit();">Start Import</button> \
                </center>'
        
    else:   # POST
        
        html += '<h1>Import Finished!</h1> \
                <h3>Check below for errors:</h3>\
                <style>table, th, td {border:1px solid gray; border-collapse: collapse; text-align:center;}</style> \
                <table> \
                    <tr> \
                        <td><b>Output File</b></td> \
                        <td><b>Put() errors</b></td> \
                    </tr>' 

        # directly import (don't put double-underscores in paths!):
        for output_file_name in backup_output_files:
            model_class_name = output_file_name.split("__")[1].split("/")[0]

            # find the model classes dynamically, so don't have to manually import each:
            
            # might be an alias:
            if MODEL_ALIASES.has_key(model_class_name):
                model_class_name = MODEL_ALIASES[model_class_name]
        
            model_class = None
            for location in MODEL_LOCATIONS:
                model_class = locate(location + model_class_name)
                if model_class:
                    break
                
            if not model_class:
                html += '<tr><td colspan="2" style="color:red;"><b>Can\'t find this model!  :  {}</b></td></tr>'.format(model_class_name)
            else:
                raw = open(output_file_name, 'rb')
                reader = records.RecordsReader(raw)
            
                put_failure_count = 0
                
                for record in reader:
                    entity_proto = entity_pb.EntityProto(contents=record)
                    entity_proto.key_.set_app( APP_NAME )
                    
                    try:
                        entity = datastore.Entity.FromPb(entity_proto, default_kind=model_class)
                        datastore.Put(entity)
                        
                        # the above works across all: ndb, db, django
                        # more specific, for ndb:
                        # entity = ndb.ModelAdapter(model_class).pb_to_entity(entity_proto)
                        # entity.put()
                        # for db:
                        # entity = db.model_from_protobuf(entity_proto)
                        # entity.put()
                    except  Exception as e:
                        put_failure_count += 1
                        logging.error("Error: {}\n Entity: {}".format(e, entity_proto.value_list() ))
                        
                html += '<tr><td style="text-align:left;">{}</td><td><b>{}</b></td></tr>'.format(
                                                            output_file_name.replace(FILE_PATH, ""), 
                                                            put_failure_count
                                                        )
                
        html += '</table><br /><br /><h1>fin!</h1>'
        
    html += "</body>"
    
    return HttpResponse(html)
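
Examples #12 and #13 (this view and the CSV exporter that follows) share module-level configuration and a glob_output_files helper that are not shown. A sketch of what they might look like; every path, alias, and naming convention below is an assumption about how the backup was laid out.

# Hypothetical shared setup for Examples #12 and #13.
import glob
from os.path import join
from pydoc import locate   # resolves a dotted 'package.module.Class' string to the class

FILE_PATH       = '/path/to/backup/'      # assumed folder holding the downloaded backup
APP_NAME        = 'dev~yourappnamehere'   # assumed local dev_appserver application id
MODEL_ALIASES   = {}                      # e.g. {'OldKindName': 'CurrentModelName'}
MODEL_LOCATIONS = ['myapp.models.']       # dotted prefixes searched for model classes

def glob_output_files(file_path):
    # Backup metadata files vs. the per-kind output shards (layout is an assumption).
    backup_info_files = glob.glob(join(file_path, '*.backup_info'))
    backup_output_files = glob.glob(join(file_path, '*__*', 'output-*'))
    return backup_info_files, backup_output_files
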
Example #13
def export_as_csv(request):
    
    backup_info_files, backup_output_files = glob_output_files(FILE_PATH)
    
    html = '<body>'
    
    if not request.method == 'POST':
        
        files       = groupFiles( FILE_PATH )
        model_names = [filename.split(".")[1] for filename in backup_info_files]

        html += '<h2>Export datastore backup file as CSV using  &rarr;views_datastore_handler.py</h2> \
                <h3>&bull; Using file path: &nbsp; {} <br/><br /> \
                &bull; Select a model, then click on the "Export as CSV" at the bottom to convert.</h3>'.format( FILE_PATH )

        html += '<form id="ModelNameForm" action="" method="post">'
        
        for model_name in model_names:
            html += '<input type="radio" name="modelChoice" value="{}" style="margin-left:16px;"/> {} <br />'.format(model_name, model_name)
            
        # only allow one submit every 2.5 seconds
        html += '<script> \
                    function doSubmit() { \
                        var theButton = document.getElementById("submitButton"); \
                        var theForm = document.getElementById("ModelNameForm"); \
                        if (theButton.innerText == "Export as CSV") { \
                            theButton.innerText = "Processing..."; \
                            setTimeout(function(){ document.getElementById("submitButton").innerHTML = "Export as CSV"; }, 2500); \
                            theForm.submit(); \
                        } \
                    } \
                </script> \
                <br /><br /> \
                <button id="submitButton" style="padding:10px;text-align:center;" onclick="doSubmit();">Export as CSV</button> \
                </form>'

        html += '<br /><br /><h3>For debugging, here are the files available:</h3>'
        html += '<h4>backup_info_files: </h4>{}<br /><br />'.format(backup_info_files)
        html += '<h4>backup_output_files: </h4>{}<br /><br />'.format(backup_output_files)
            
        html += '</body>'
        
        return HttpResponse(html)
                
    else:   # POST
    
        model_name = request.POST.get("modelChoice")
        if not model_name:
            return HttpResponse('You forgot to choose a model', content_type='text/html')
        
        header     = []
        rows       = []
        got_header = False
        
        try:
            response = HttpResponse(content_type='text/csv')
            response['Content-Disposition'] = 'attachment; filename="{}.csv"'.format(model_name)
            writer   = csv.writer(response)
            
            
            for output_file_name in backup_output_files:
                model_class_name = output_file_name.split("__")[1].split("/")[0]
                if model_class_name == model_name:
            
                    # might be an alias:
                    if MODEL_ALIASES.has_key(model_class_name):
                        model_class_name = MODEL_ALIASES[model_class_name]
        
                    for location in MODEL_LOCATIONS:
                        model_class = locate(location + model_class_name)
                        if model_class:
                            break
                    
                    raw    = open(output_file_name, 'rb')
                    reader = records.RecordsReader(raw)
            
                    for record in reader:
                        entity_proto = entity_pb.EntityProto(contents=record)
                        entity_proto.key_.set_app( APP_NAME )
                
                        entity = datastore.Entity.FromPb(entity_proto, default_kind=model_class)
                        
                        row = []
                        for k,v in entity.items():
                            if not got_header:
                                header.append(k) 
                            row.append(v)
                        rows.append(row)
                        got_header = True
                    
            writer.writerow(header)
            for row in rows:
                writer.writerow(row)
                    
            return response
            
        except  Exception as e:
            return HttpResponse("Error: {}\n Output file: {}".format(e, output_file_name ))