def main(self):
    """Copy the last ``n`` messages of a protocol buffer shard to a new shard.

    Command line: ``--n=number_to_print input_shard output_shard``.

    The input is counted first so the start of the tail is known, then it
    is streamed once and only messages at or past that position are
    written.  When ``n`` is at least the number of messages in the input,
    every message is copied.  Prints the usage text and exits unless
    exactly two positional arguments are supplied.
    """
    # NOTE(review): the trailing True presumably marks the option as
    # required -- confirm against parseCommandLine.
    options = [("n=", "The number of protocol buffers to save.", True)]
    command_line = "%s --n=number_to_print input_shard output_shard"
    options_hash, remainder = parseCommandLine(options, command_line=command_line)

    # print(...) with one argument behaves the same as the print statement.
    if len(remainder) != 2:
        print(usage(sys.argv, command_line, options))
        sys.exit()

    input_shard, output_shard = remainder

    total = countProtosInFile(input_shard)
    reader = ProtocolBufferFileReader(None, filename=input_shard, return_byte_string_only=True)
    writer = ProtocolBufferFileWriter(filename=output_shard, messages_are_byte_strings=True)

    tail_length = int(options_hash["n"])
    # Zero-based index of the first message to keep; never below zero.
    tail_start = max(total - tail_length, 0)

    for index, message in enumerate(reader):
        if index >= tail_start:
            writer.write(message)

    writer.close()
    reader.close()
def main(self):
    """Copy the first ``n`` messages of a protocol buffer shard to a new shard.

    Command line: ``--n=number_to_print input_shard output_shard``.

    Streams the input and stops as soon as more than ``n`` messages have
    been read, so at most ``n`` messages are written.  Prints the usage
    text and exits unless exactly two positional arguments are supplied.
    """
    # NOTE(review): the trailing True presumably marks the option as
    # required -- confirm against parseCommandLine.
    options = [('n=', 'The number of protocol buffers to save.', True)]
    command_line = '%s --n=number_to_print input_shard output_shard'
    options_hash, remainder = parseCommandLine(options, command_line=command_line)

    # print(...) with one argument behaves the same as the print statement.
    if len(remainder) != 2:
        print(usage(sys.argv, command_line, options))
        sys.exit()

    input_shard, output_shard = remainder

    reader = ProtocolBufferFileReader(None, filename=input_shard, return_byte_string_only=True)
    writer = ProtocolBufferFileWriter(filename=output_shard, messages_are_byte_strings=True)
    limit = int(options_hash['n'])

    # Positions are 1-based: the message at position limit + 1 is read
    # from the reader but never written.
    for position, message in enumerate(reader, 1):
        if position > limit:
            break
        writer.write(message)

    writer.close()
    reader.close()
def run(self):
    """Write a random sample of a shard's records to a new shard.

    Command line: ``--num_records=n input_shard output_shard``.

    The input is counted, ``n`` distinct record positions are drawn
    without replacement, and the input is then streamed once, writing the
    records at the chosen positions.  Selected records keep their input
    order.

    A request outside ``[0, total records]`` is clamped to that range (with
    a printed notice) instead of failing inside ``random.sample`` with a
    ValueError.  Prints the usage text and exits unless exactly two
    positional arguments are supplied.
    """
    # NOTE(review): the trailing True presumably marks the option as
    # required -- confirm against parseCommandLine.
    options = [
        ('num_records=', 'Number of records to select.', True),
    ]
    command_line = '%s --num_records=n input_shard output_shard'
    options_hash, remainder = parseCommandLine(options, command_line=command_line)

    # print(...) with one argument behaves the same as the print statement.
    if len(remainder) != 2:
        print(usage(sys.argv, command_line, options))
        sys.exit()

    requested = int(options_hash['num_records'])
    input_file = remainder[0]
    output_file = remainder[1]
    total_records = countProtosInFile(input_file)

    # random.sample raises ValueError for a negative size or one larger
    # than the population, so clamp the request to what the file can supply.
    num_records = min(max(requested, 0), total_records)
    if num_records != requested:
        print('Requested %d records; selecting %d instead.' % (requested, num_records))
    print('Selecting %d records from %d total records.' % (num_records, total_records))

    random.seed()
    # A set gives constant-time membership tests while streaming.
    records_to_use = set(random.sample(range(total_records), num_records))

    reader = ProtocolBufferFileReader(None, filename=input_file, return_byte_string_only=True)
    writer = ProtocolBufferFileWriter(filename=output_file, messages_are_byte_strings=True)
    for ii, message in enumerate(reader):
        if ii in records_to_use:
            writer.write(message)
    reader.close()
    writer.close()
def randomize(self, input_file, output_file):
    """Write the messages of ``input_file`` to ``output_file`` in random order.

    Every message is read into memory before shuffling, so the whole
    input must fit in memory.

    Args:
        input_file: Filename of the protocol buffer file to read.
        output_file: Filename of the protocol buffer file to write.
    """
    reader = ProtocolBufferFileReader(None, filename=input_file, return_byte_string_only=True)
    messages = list(reader)
    reader.close()

    random.shuffle(messages)

    # The writer is opened only after the input has been fully read.
    writer = ProtocolBufferFileWriter(filename=output_file, messages_are_byte_strings=True)
    for shuffled in messages:
        writer.write(shuffled)
    writer.close()
def main():
    """Print every message in a protocol buffer file.

    Reads two command-line arguments: ``sys.argv[1]`` is the proto name
    passed to ``resolveProtoObjectFromString`` and ``sys.argv[2]`` is the
    file to read.  The reader is closed even if iteration or printing
    fails.

    Raises:
        IndexError: If fewer than two command-line arguments are given.
    """
    proto_name = sys.argv[1]
    filename = sys.argv[2]

    # Named proto_object so the builtin `object` is not shadowed.
    proto_object = resolveProtoObjectFromString(proto_name)
    reader = ProtocolBufferFileReader(proto_object, filename=filename)
    try:
        for message in reader:
            # print(...) with one argument behaves the same as the print
            # statement.
            print(message)
    finally:
        reader.close()
def processFile(self, input, output):
    """Run ``processCommunication`` on every message and write it to ``output``.

    Each message read from ``input`` is passed to
    ``self.processCommunication`` and then written out.  A running
    document count is shown on stdout every 100 messages, and three
    summary totals are printed at the end.

    Args:
        input: Filename of the file of Communication messages to read.
        output: Filename of the file to write the messages to.
    """
    reader = ProtocolBufferFileReader(Communication, filename=input)
    writer = ProtocolBufferFileWriter(filename=output)

    # Stays 0 when the reader yields nothing; otherwise enumerate leaves
    # the 1-based count of the last message in it.
    doc_count = 0
    entity_total = 0
    named_total = 0

    for doc_count, communication in enumerate(reader, 1):
        # processCommunication returns a pair: (entities seen, entities set).
        seen, assigned = self.processCommunication(communication)
        named_total += assigned
        entity_total += seen
        writer.write(communication)

        if doc_count % 100 == 0:
            # The carriage return keeps the counter on a single line.
            sys.stdout.write(str(doc_count) + '\r')
            sys.stdout.flush()

    reader.close()
    writer.close()

    # print(...) with one argument behaves the same as the print statement.
    print('Processed %d communications.' % doc_count)
    print('Processed %d entities.' % entity_total)
    print('Assigned %d canonical names.' % named_total)
def run(self):
    """Split a protocol buffer file into shards, or only count its records.

    Command line: ``--output_shard_prefix=shard_prefix input_shard`` plus
    exactly one of ``--max_shard_size``, ``--max_records_per_shard`` or
    ``--count_records_only``.

    Shard filenames come from ``self.createShardFilename(prefix, index)``
    with indexes starting at 1.  A shard is closed once it reaches the
    configured limit; the limit is checked after each write, so with
    ``--max_shard_size`` a shard may exceed the limit by part of one
    message.

    The next shard is opened only when another record arrives.  An input
    whose last record exactly fills a shard therefore no longer produces
    (or counts) a trailing empty shard.  The first shard is still opened
    up front, so an empty input yields one empty shard as before.

    The reader and any open writer are closed even if processing fails.
    Prints the usage text and exits unless exactly one positional argument
    is supplied.
    """
    options = [
        ('max_shard_size=', 'The maximum size of each shard in bytes.'),
        ('max_records_per_shard=', 'The maximum number of records in each shard.'),
        ('count_records_only', 'Counts the number of protobufs in the file and exits.'),
        ('output_shard_prefix=',
         'REQUIRED (unless count_records_only): Creates shards starting with this file prefix.'),
    ]

    # print(...) with one argument behaves the same as the print statement.
    options_hash, remainder = parseCommandLine(options)
    if len(remainder) != 1:
        command_line = '%s --output_shard_prefix=shard_prefix input_shard'
        print(usage(sys.argv, command_line, options))
        sys.exit()

    num_options_specified = 0
    max_shard_size = None
    max_records_per_shard = None
    count_records_only = False
    if 'max_shard_size' in options_hash:
        max_shard_size = int(options_hash['max_shard_size'])
        num_options_specified += 1
    if 'max_records_per_shard' in options_hash:
        max_records_per_shard = int(options_hash['max_records_per_shard'])
        num_options_specified += 1
        print('Using %d records per shard.' % (max_records_per_shard))
    if 'count_records_only' in options_hash:
        print('Only counting records.')
        count_records_only = True
        num_options_specified += 1

    if num_options_specified != 1:
        print('Only one of the following options must be specified:')
        print('\t max_shard_size, max_records_per_shard, count_records_only')
        sys.exit()

    if not count_records_only and 'output_shard_prefix' not in options_hash:
        print('output_shard_prefix is a required option.')
        sys.exit()

    output_shard_prefix = None
    if 'output_shard_prefix' in options_hash:
        output_shard_prefix = options_hash['output_shard_prefix']

    def open_shard(index):
        # Open the writer for the shard with the given 1-based index.
        return ProtocolBufferFileWriter(
            filename=self.createShardFilename(output_shard_prefix, index),
            messages_are_byte_strings=True)

    input_file = remainder[0]
    reader = ProtocolBufferFileReader(None, filename=input_file, return_byte_string_only=True)

    total_num_messages = 0
    num_messages_written = 0
    bytes_written = 0
    num_files_written = 0
    writer = None
    try:
        if not count_records_only:
            # Open the first shard eagerly so an empty input still
            # produces one (empty) shard.
            num_files_written += 1
            writer = open_shard(num_files_written)

        for message in reader:
            total_num_messages += 1
            if count_records_only:
                continue

            if writer is None:
                # The previous shard filled up; start the next one now
                # that there is a record to put in it.
                num_files_written += 1
                writer = open_shard(num_files_written)

            writer.write(message)
            num_messages_written += 1
            bytes_written += len(message) + 4  # +4 for the message size prefix

            size_limit_hit = (max_shard_size is not None and
                              bytes_written >= max_shard_size)
            record_limit_hit = (max_records_per_shard is not None and
                                num_messages_written >= max_records_per_shard)
            if size_limit_hit or record_limit_hit:
                # Clear `writer` before closing so the finally clause
                # cannot close the same shard twice.
                full_shard, writer = writer, None
                full_shard.close()
                num_messages_written = 0
                bytes_written = 0
    finally:
        if writer is not None:
            writer.close()
        reader.close()

    print('Number of records in shard: %d' % total_num_messages)
    if num_files_written != 0:
        print('Number of files written: %d' % num_files_written)