def Start():
    jsonTree = {}
    items = []
    for filename in os.listdir(backupFolder):
        if not filename.startswith("output-"):
            continue
        print("Reading from: " + filename)
        inPath = os.path.join(backupFolder, filename)
        raw = open(inPath, 'rb')
        reader = records.RecordsReader(raw)
        for recordIndex, record in enumerate(reader):
            entity_proto = entity_pb.EntityProto(contents=record)
            #collection = GetCollectionOfProtoEntity(entity_proto)
            collectionInJSONTree = GetCollectionInJSONTreeForProtoEntity(jsonTree, entity_proto)
            key = GetKeyOfProtoEntity(entity_proto)
            entity = GetValueOfProtoEntity(entity_proto)
            collectionInJSONTree[key] = entity
            items.append(entity)  # also add to a flat list, so we know the total item count
            print("Parsing document #" + str(len(items)))

    outPath = os.path.join(backupFolder, 'Data.json')
    out = open(outPath, 'w')
    out.write(json.dumps(jsonTree, default=JsonSerializeFunc, encoding='latin-1', indent=2))
    out.close()
    print("JSON file written to: " + outPath)
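A minimal setup sketch for Start() above: the SDK imports are the usual ones for these scripts, while backupFolder and the Get*/JsonSerializeFunc helpers are names the snippet references but does not define, so the path here is a placeholder.

import os
import json

from google.appengine.api.files import records
from google.appengine.datastore import entity_pb

backupFolder = '/path/to/datastore/backup'  # hypothetical backup location

# Start() can then be invoked once the Get*/JsonSerializeFunc helpers are defined:
# Start()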
def run():
    # Set your downloaded folder's path here (must be readable by dev_appserver)
    mypath = '/Users/lambert/Dropbox/dancedeets/data/datastore_backup_datastore_backup_2016_11_19_DBEvent/15700286559371541387849311E815D'
    # Set the class of the objects here
    cls = DBEvent
    # Set your app's name here
    appname = "dev~None"
    # Do the harlem shake
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    for file in onlyfiles:
        i = 0
        try:
            raw = open(mypath + "/" + file, 'r')
            reader = records.RecordsReader(raw)
            to_put = list()
            for record in reader:
                entity_proto = entity_pb.EntityProto(contents=record)
                entity_proto.key_.app_ = appname
                obj = cls._from_pb(entity_proto)
                to_put.append(obj)
                i += 1
                if i % 100 == 0:
                    print "Saved %d %ss" % (i, '')  # entity.kind()
                    ndb.put_multi(to_put)  # use_memcache=False
                    to_put = list()
            ndb.put_multi(to_put)  # use_memcache=False
            to_put = list()
            print "Saved %d" % i
        except ProtocolBufferDecodeError:
            pass  # All good
def parse_backup_info_file(content):
    """Returns entities iterator from a backup_info file content."""
    reader = records.RecordsReader(cStringIO.StringIO(content))
    version = reader.read()
    if version != '1':
        raise IOError('Unsupported version')
    for record in reader:
        yield datastore.Entity.FromPb(record)
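A hedged usage sketch for parse_backup_info_file(): read a .backup_info file from disk and print the entities it yields. The file path is an assumption; cStringIO, records, and datastore are imported as in the function above.

info_path = '/path/to/backup.backup_info'  # hypothetical file
with open(info_path, 'rb') as f:
    for info_entity in parse_backup_info_file(f.read()):
        print info_entity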
def __iter__(self):
    """Iterate over records in input files.

    self._offsets is always correctly updated so that stopping iterations
    doesn't skip records and doesn't read the same record twice.
    """
    ctx = context.get()
    mapper_spec = ctx.mapreduce_spec.mapper
    shard_number = ctx.shard_state.shard_number
    filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

    if len(filenames) != len(self._offsets):
        raise Exception("Files list and offsets do not match.")

    # Heap of (key, value, index, reader) tuples, one per input file.
    readers = []
    for (i, filename) in enumerate(filenames):
        offset = self._offsets[i]
        reader = records.RecordsReader(files.BufferedFile(filename))
        reader.seek(offset)
        readers.append((None, None, i, reader))

    # Merge-read the files, grouping consecutive values that share a key.
    current_result = None
    while readers:
        (key, value, index, reader) = readers[0]

        if key is not None:
            if current_result and key != current_result[0]:
                yield current_result
            if not current_result or key != current_result[0]:
                current_result = (key, [])
            current_result[1].append(value)

        # Read the next key/value from this reader.
        try:
            self._offsets[index] = reader.tell()
            start_time = time.time()
            binary_record = reader.read()
            # Update I/O counters.
            if context.get():
                operation.counters.Increment(
                    input_readers.COUNTER_IO_READ_BYTES,
                    len(binary_record))(context.get())
                operation.counters.Increment(
                    input_readers.COUNTER_IO_READ_MSEC,
                    int((time.time() - start_time) * 1000))(context.get())
            proto = file_service_pb.KeyValue()
            proto.ParseFromString(binary_record)
            # Put the reader back on the heap, keyed by its next record.
            heapq.heapreplace(readers,
                              (proto.key(), proto.value(), index, reader))
        except EOFError:
            heapq.heappop(readers)

    # Yield the last accumulated group.
    if current_result:
        yield current_result
def iter_entity(folder):
    for d, _, files in os.walk(folder):
        for fn in files:
            path = os.path.join(d, fn)
            raw = open(path, 'r')
            reader = records.RecordsReader(raw)
            for record in reader:
                entity = record_to_dict(record)
                yield entity
            raw.close()
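A minimal driver for iter_entity(), counting everything under a folder; the path is hypothetical and record_to_dict() is the helper the generator already assumes.

total = 0
for entity in iter_entity('/path/to/backup/folder'):  # hypothetical folder
    total += 1
print 'Read %d entities' % total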
def readfile(f, otype, outs):
    raw = open('{0}/kind_{1}/{2}'.format(DATA_DIRECTORY, otype, f), 'r')
    reader = records.RecordsReader(raw)
    last = ''
    for record in reader:
        entity_proto = entity_pb.EntityProto(contents=record)
        entity = datastore.Entity.FromPb(entity_proto)
        key = entity_proto.key()
        elems = key.path()
        the_type = elems.element_list()[-1].type()
        if the_type in objects:
            write_object(outs, the_type, entity)
            count[otype] += 1
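A hypothetical call site for readfile(): DATA_DIRECTORY, objects, count, and outs are module globals the function assumes, and the kind name here is invented for illustration.

import os

otype = 'Question'  # hypothetical kind name
for f in os.listdir('{0}/kind_{1}'.format(DATA_DIRECTORY, otype)):
    readfile(f, otype, outs)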
def run():
    # Set your downloaded folder's path here (must be readable by dev_appserver)
    mypath = '/local_target'
    # Set your app's name here
    appname = "dev~yourappnamehere"
    # Do the harlem shake
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    ec = datastore_pbs.get_entity_converter()
    for file in onlyfiles:
        i = 0
        try:
            raw = open(mypath + "/" + file, 'r')
            reader = records.RecordsReader(raw)
            to_put = list()
            for record in reader:
                entity_proto = entity_pb.EntityProto(contents=record)
                entity_proto.key_.app_ = appname
                entity = db.model_from_protobuf(entity_proto)
                a = db.model_from_protobuf(entity_proto)
                # Rewrite any Key properties so they point at this app id.
                for pp in dir(a):
                    try:
                        ppp = getattr(a, "_" + pp)
                        if isinstance(ppp, db.Key):
                            ppp._Key__reference.set_app(appname)
                    except AttributeError:
                        pass  # It's okay
                to_put.append(a)
                i += 1
                if i % 100 == 0:
                    print "Saved %d %ss" % (i, entity.kind())
                    db.put(to_put)
                    to_put = list()
            db.put(to_put)
            to_put = list()
            print "Saved %d" % i
        except ProtocolBufferDecodeError:
            pass  # All good
def import_datastore_backup_file(filename):
    app = ndb.Key('Foo', 'Bar').app()
    kind = None
    i = 0
    raw = open(filename, 'r')
    reader = records.RecordsReader(raw)
    to_put = list()
    for record in reader:
        entity = protobuf_to_entity(record)
        _localize_key_properties(entity, app)
        kind = kind or entity.key.kind()
        to_put.append(entity)
        if len(to_put) == 100:
            ndb.put_multi(to_put)
            to_put = list()
            i += 100
    ndb.put_multi(to_put)
    i += len(to_put)
    logging.info('imported %d %s entities', i, kind)
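A sketch of driving import_datastore_backup_file() over every output-* shard in a backup folder; the directory path is an assumption, and protobuf_to_entity()/_localize_key_properties() remain the helpers the function above relies on.

import os

backup_dir = '/path/to/backup'  # hypothetical backup folder
for name in sorted(os.listdir(backup_dir)):
    if name.startswith('output-'):
        import_datastore_backup_file(os.path.join(backup_dir, name))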
def process(output_dir, time_capture, table_tuple, writeFn):
    root = table_tuple[0]
    table_name = table_tuple[1]
    filenames = table_tuple[2]
    header_list = None
    print 'Converting ' + table_name + ' to a CSV file...'
    time_capture.start()
    count = 0

    # open file for writing
    with open(join(output_dir, table_name + '.csv'), 'w') as write_file:
        write_file = csv.writer(write_file)
        # read files, process, and write
        for filename in filenames:
            path = join(root, filename)
            with open(path, 'r') as raw_file:
                reader = records.RecordsReader(raw_file)
                for record in reader:
                    entity_proto = entity_pb.EntityProto(contents=record)
                    entity = datastore.Entity.FromPb(entity_proto)
                    if header_list is None:
                        header_list = parseHeaderFields(entity)
                        display_header_list = ['key']
                        display_header_list.extend(header_list)
                        writeFn(write_file, entity, display_header_list)
                    csv_row = entity2csvRow(header_list, entity)
                    writeFn(write_file, entity, csv_row)
                    count += 1

    time_capture.end(count)
    print (' ...converted {count:d} objects of type {obj_type} in '
           '{run_time:.2f} seconds | {ms_per_obj:.2f} ms/obj | '
           'total time = {total_time:.2f} seconds').format(
        count=count, obj_type=table_name, run_time=time_capture.run_time,
        ms_per_obj=time_capture.ms_per_obj, total_time=time_capture.total_time)
    return time_capture
# type : 0
# show_answer_when_incorrect : False

# Returns a formatted json object or string
def pp_json(json_thing, sort=True, indents=4):
    if type(json_thing) is str:
        return json.dumps(json.loads(json_thing), sort_keys=sort, indent=indents)
    else:
        return json.dumps(json_thing, sort_keys=sort, indent=indents)

raw = open(sys.argv[1], 'r')
reader = records.RecordsReader(raw)

questions = []
for record in reader:
    entity_proto = entity_pb.EntityProto(contents=record)
    questions.append(datastore.Entity.FromPb(entity_proto)['data'])
    #print('***RAM*** entity = ' + str(questions) + '\n\n')

k = 0
question0 = json.loads(questions[0])
for prop in question0:
    print prop, ':', question0[prop]
print

csp_qs = []
k = 0
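For comparison, the pp_json() helper defined above can print the same parsed question as one formatted blob instead of the per-property loop:

print pp_json(question0)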
def __iter__(self):
    """Iterate over records in input files.

    self._offsets is always correctly updated so that stopping iterations
    doesn't skip records and doesn't read the same record twice.
    """
    ctx = context.get()
    mapper_spec = ctx.mapreduce_spec.mapper
    shard_number = ctx.shard_state.shard_number
    filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

    if len(filenames) != len(self._offsets):
        raise Exception("Files list and offsets do not match.")

    # Heap with (key, value, index, reader) tuples.
    readers = []

    # Initialize heap
    for (i, filename) in enumerate(filenames):
        offset = self._offsets[i]
        reader = records.RecordsReader(files.BufferedFile(filename))
        reader.seek(offset)
        readers.append((None, None, i, reader))

    # Read records from heap and merge values with the same key.
    # current_result is yielded and consumed by _merge_map.
    # current_result = [key, values, is_partial]
    current_result = None
    current_count = 0
    current_size = 0
    while readers:
        (key, value, index, reader) = readers[0]

        if key is not None:
            current_count += 1
            current_size += len(value)

            should_yield = False
            if current_result:
                if key != current_result[0]:
                    # New key encountered
                    should_yield = True
                elif (self._max_values_count != -1 and
                      current_count >= self._max_values_count):
                    # Maximum number of values encountered.
                    current_result[2] = True
                    should_yield = True
                elif (self._max_values_size != -1 and
                      current_size >= self._max_values_size):
                    # Maximum size of values encountered
                    current_result[2] = True
                    should_yield = True

            if should_yield:
                # New key encountered or maximum count hit. Yield current key.
                yield current_result
            if not current_result or should_yield:
                current_result = [key, [], False]
                current_count = 0
                current_size = 0
            current_result[1].append(value)

        # Read next key/value from reader.
        try:
            self._offsets[index] = reader.tell()
            start_time = time.time()
            binary_record = reader.read()
            # update counters
            if context.get():
                operation.counters.Increment(
                    input_readers.COUNTER_IO_READ_BYTES,
                    len(binary_record))(context.get())
                operation.counters.Increment(
                    input_readers.COUNTER_IO_READ_MSEC,
                    int((time.time() - start_time) * 1000))(context.get())
            proto = file_service_pb.KeyValue()
            proto.ParseFromString(binary_record)
            # Put read data back into heap.
            heapq.heapreplace(readers,
                              (proto.key(), proto.value(), index, reader))
        except EOFError:
            heapq.heappop(readers)

    # Yield leftovers.
    if current_result:
        yield current_result
def load_datastore_backup(request):
    backup_info_files, backup_output_files = glob_output_files(FILE_PATH)
    html = '<body>'

    if not request.method == 'POST':
        files = groupFiles(FILE_PATH)
        model_names = [filename.split(".")[1] for filename in backup_info_files]

        html += '<h2>Import datastore backup files into local dev server via →datastore_handler.py</h2> \
            <h3>• Using file path: {} <br/><br /> \
            • Click on the "Start Import" button at the bottom to start the process. \
            Depending on the backup size, this process can take a very long time.</h3>'.format(FILE_PATH)
        html += '<h4>Model Names:</h4>{}<br /><br />'.format(model_names)
        html += '<h4>backup_info_files: </h4>{}<br /><br />'.format(backup_info_files)
        html += '<h4>backup_output_files: </h4>{}<br /><br />'.format(backup_output_files)

        # only allow one submit
        html += '<script> \
            function doSubmit() { \
                var theButton = document.getElementById("submitButton"); \
                var theForm = document.getElementById("theForm"); \
                if (theButton.innerText == "Start Import") { \
                    theButton.innerText = "Processing..."; \
                    theForm.submit(); \
                } \
            } \
            </script> \
            <form id="theForm" action="" method="post"></form> \
            <center> \
            <button id="submitButton" style="padding:10px;text-align:center;" onclick="doSubmit();">Start Import</button> \
            </center>'
    else:  # POST
        html = '<h1>Import Finished!</h1> \
            <h3>Check below for errors:</h3> \
            <style>table, th, td {border:1px solid gray; border-collapse: collapse; text-align:center;}</style> \
            <table> \
            <tr> \
                <td><b>Output File</b></td> \
                <td><b>Put() errors</b></td> \
            </tr>'

        # directly import (don't put double-underscores in paths!):
        for output_file_name in backup_output_files:
            model_class_name = output_file_name.split("__")[1].split("/")[0]

            # find the model classes dynamically, so we don't have to manually
            # import each; the name might be an alias:
            if MODEL_ALIASES.has_key(model_class_name):
                model_class_name = MODEL_ALIASES[model_class_name]
            for location in MODEL_LOCATIONS:
                model_class = locate(location + model_class_name)
                if model_class:
                    break

            if not model_class:
                html += '<tr><td colspan="2" style="color:red;"><b>Can\'t find this model! : {}</b></td></tr>'.format(model_class_name)
            else:
                raw = open(output_file_name, 'rb')
                reader = records.RecordsReader(raw)
                put_failure_count = 0
                for record in reader:
                    entity_proto = entity_pb.EntityProto(contents=record)
                    entity_proto.key_.set_app(APP_NAME)
                    try:
                        entity = datastore.Entity.FromPb(entity_proto, default_kind=model_class)
                        datastore.Put(entity)
                        # the above works across all: ndb, db, django
                        # more specific, for ndb:
                        #   entity = ndb.ModelAdapter(model_class).pb_to_entity(entity_proto)
                        #   entity.put()
                        # for db:
                        #   entity = db.model_from_protobuf(entity_proto)
                        #   entity.put()
                    except Exception as e:
                        put_failure_count += 1
                        logging.error("Error: {}\n Entity: {}".format(e, entity_proto.value_list()))
                html += '<tr><td style="text-align:left;">{}</td><td><b>{}</b></td></tr>'.format(
                    output_file_name.replace(FILE_PATH, ""), put_failure_count)

        html += '</table><br /><br /><h1>fin!</h1>'

    html += "</body>"
    return HttpResponse(html)
def export_as_csv(request):
    backup_info_files, backup_output_files = glob_output_files(FILE_PATH)
    html = '<body>'

    if not request.method == 'POST':
        files = groupFiles(FILE_PATH)
        model_names = [filename.split(".")[1] for filename in backup_info_files]

        html += '<h2>Export datastore backup file as CSV using →views_datastore_handler.py</h2> \
            <h3>• Using file path: {} <br/><br /> \
            • Select a model, then click on the "Export as CSV" button at the bottom to convert.</h3>'.format(FILE_PATH)

        html += '<form id="ModelNameForm" action="" method="post">'
        for model_name in model_names:
            html += '<input type="radio" name="modelChoice" value="{}" style="margin-left:16px;"/> {} <br />'.format(model_name, model_name)

        # only allow one submit every 2.5 seconds
        html += '<script> \
            function doSubmit() { \
                var theButton = document.getElementById("submitButton"); \
                var theForm = document.getElementById("ModelNameForm"); \
                if (theButton.innerText == "Export as CSV") { \
                    theButton.innerText = "Processing..."; \
                    setTimeout(function(){ document.getElementById("submitButton").innerHTML = "Export as CSV"; }, 2500); \
                    theForm.submit(); \
                } \
            } \
            </script> \
            <br /><br /> \
            <button id="submitButton" style="padding:10px;text-align:center;" onclick="doSubmit();">Export as CSV</button> \
            </form>'

        html += '<br /><br /><h3>For debugging, here are the files available:</h3>'
        html += '<h4>backup_info_files: </h4>{}<br /><br />'.format(backup_info_files)
        html += '<h4>backup_output_files: </h4>{}<br /><br />'.format(backup_output_files)
        html += '</body>'
        return HttpResponse(html)
    else:  # POST
        model_name = request.POST.get("modelChoice")
        if not model_name:
            return HttpResponse('You forgot to choose a model', content_type='text/html')

        header = []
        rows = []
        got_header = False
        try:
            response = HttpResponse(content_type='text/csv')
            response['Content-Disposition'] = 'attachment; filename="{}.csv"'.format(model_name)
            writer = csv.writer(response)

            for output_file_name in backup_output_files:
                model_class_name = output_file_name.split("__")[1].split("/")[0]
                if model_class_name == model_name:
                    # the name might be an alias:
                    if MODEL_ALIASES.has_key(model_class_name):
                        model_class_name = MODEL_ALIASES[model_class_name]
                    for location in MODEL_LOCATIONS:
                        model_class = locate(location + model_class_name)
                        if model_class:
                            break

                    raw = open(output_file_name, 'rb')
                    reader = records.RecordsReader(raw)
                    for record in reader:
                        entity_proto = entity_pb.EntityProto(contents=record)
                        entity_proto.key_.set_app(APP_NAME)
                        entity = datastore.Entity.FromPb(entity_proto, default_kind=model_class)
                        row = []
                        for k, v in entity.items():
                            if not got_header:
                                header.append(k)
                            row.append(v)
                        rows.append(row)
                        got_header = True

            writer.writerow(header)
            for row in rows:
                writer.writerow(row)
            return response
        except Exception as e:
            return HttpResponse("Error: {}\n Output file: {}".format(e, output_file_name))