# Shared imports for the snippets below. They assume the pure-Python
# `python-hadoop` package, whose hadoop.io module provides the Writable
# types and the SequenceFile reader/writer used throughout.
from __future__ import print_function

import glob
import os
import time
import warnings

import iris     # SciTools Iris, used by the NetCDF cube snippets
import segypy   # SEG-Y reader, used by importSGY
from hadoop.io import BytesWritable, IntWritable, SequenceFile, Text


def writeData(writer, filename, data):
    # Store one record: the file name as the key, the raw bytes as the value.
    key = Text()
    value = BytesWritable()
    key.set(filename)
    value.set(data)
    writer.append(key, value)
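# A minimal usage sketch for writeData, assuming the SequenceFile.createWriter
# signature used by the snippets below; the output path "pairs.seq" and the
# sample payload are illustrative only.
def _writeData_example():
    writer = SequenceFile.createWriter("pairs.seq", Text, BytesWritable)
    writeData(writer, "example.bin", b"\x00\x01\x02")
    writer.close()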
def convert_to_sequencefiles(cpp_encrypted_data):
    # Get all data files outputted by C++
    partition_pattern = os.path.join(cpp_encrypted_data, "data/cpp-part*")
    partition_files = glob.glob(partition_pattern)
    # Convert each partition to SequenceFile format
    for partition_file in partition_files:
        # FIXME: should we stream this so we don't load the entire 1 GB into memory?
        with open(partition_file, "rb") as partition:
            partition_data = partition.read()
        # FIXME: better way of generating the new file name.
        # This way has the limitation that the original path cannot contain `cpp-`.
        output_partition_file = partition_file.replace("cpp-", "")
        sequence_file_writer = SequenceFile.createWriter(
            output_partition_file, IntWritable, BytesWritable)
        key = IntWritable()
        value = BytesWritable()
        key.set(0)
        value.set(partition_data)
        sequence_file_writer.append(key, value)
        sequence_file_writer.close()
        # Remove temporary file generated by C++
        os.remove(partition_file)
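# One way to address the streaming FIXME above: a hypothetical per-partition
# variant that never holds the whole file in memory, emitting one record per
# fixed-size chunk keyed by chunk index. The 64 MB chunk size is an arbitrary
# assumption, and readers must reassemble the chunks in key order.
def convert_partition_streaming(partition_file, output_partition_file,
                                chunk_size=64 * 1024 * 1024):
    writer = SequenceFile.createWriter(
        output_partition_file, IntWritable, BytesWritable)
    key = IntWritable()
    value = BytesWritable()
    with open(partition_file, "rb") as partition:
        chunk_index = 0
        while True:
            chunk = partition.read(chunk_size)
            if not chunk:
                break
            key.set(chunk_index)
            value.set(chunk)
            writer.append(key, value)
            chunk_index += 1
    writer.close()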
def sequence(file_out, s3_files_in, make_key, tempvaluefile="/tmp/temp.nc"):
    """
    file_out: string file path to write to.
    s3_files_in: a list of string file paths to read from. Each input file is
        encoded as a separate (key, value) pair, with the key derived from the
        cube's metadata.
    make_key: a function which takes a cube and returns a uid string.
    """
    keys_done = []
    writer = SequenceFile.createWriter(file_out, Text, BytesWritable)
    for s3_file_in in s3_files_in:
        # get_s3_file downloads the object to tempvaluefile and returns the
        # local path.
        f = get_s3_file(s3_file_in, tempvaluefile)
        c = iris.load_cube(f)
        key_writer = Text()
        if str(c.metadata) in keys_done:
            warnings.warn("Key for file " + f + " already present - overwriting")
        key_writer.set(make_key(c))
        keys_done.append(str(c.metadata))
        value_writer = BytesWritable()
        with open(tempvaluefile, "rb") as value_file:
            print(s3_file_in)
            value_writer.set(value_file.read())
        writer.append(key_writer, value_writer)
    writer.close()
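# The read-side counterpart as a sketch, assuming hadoop.io's
# SequenceFile.Reader API (mirroring the writer above) and Java-style
# accessors on the Writables: iterate the (key, value) pairs that sequence()
# produced, restoring each value to a NetCDF file and reloading the cube.
def read_sequence(file_in, tempvaluefile="/tmp/temp.nc"):
    reader = SequenceFile.Reader(file_in)
    key = Text()
    value = BytesWritable()
    while reader.next(key, value):
        with open(tempvaluefile, "wb") as value_file:
            value_file.write(value.getBytes())
        cube = iris.load_cube(tempvaluefile)
        print(key.toString(), cube.summary(shorten=True))
    reader.close()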
def writeData(writer):
    key = BytesWritable()
    value = BytesWritable()
    # for i in xrange(1000):
    key.set("A")
    value.set("B")
    print('[%d] %s %s' % (writer.getLength(), key.toString(), value.toString()))
    writer.append(key, value)
def writeData(self, key, value):
    # Roll over to a new output file whenever the day or hour changes.
    datetime_now = time.localtime(time.time())
    if (datetime_now.tm_mday != self.file_time.tm_mday
            or datetime_now.tm_hour != self.file_time.tm_hour):
        self.writer = self.create_writer(self.writer)
    writer = self.writer
    writer_key = Text()
    writer_value = BytesWritable()
    writer_key.set(key)
    writer_value.set(value)
    # print('[%d] %s %s' % (writer.getLength(), writer_key.toString(), writer_value.toString()))
    writer.append(writer_key, writer_value)
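# create_writer is not shown in the original; this is a hypothetical sketch of
# what it might do: close the previous writer and open a new hour-stamped
# SequenceFile. The path template and the file_time attribute update are
# assumptions, not the original implementation.
def create_writer(self, old_writer):
    if old_writer is not None:
        old_writer.close()
    self.file_time = time.localtime(time.time())
    path = time.strftime("/data/events-%Y%m%d-%H.seq", self.file_time)
    return SequenceFile.createWriter(path, Text, BytesWritable)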
def write_seq_file(file_name, data_dict):
    writer = SequenceFile.createWriter(file_name, Text, BytesWritable)
    for key, value in data_dict.items():
        print(key, ", ", end="")
        key_writer = Text()
        key_writer.set(key)
        value_writer = BytesWritable()
        # Round-trip the cube through a temporary NetCDF file to get its bytes.
        iris.save(value, "temp.nc")
        with open("temp.nc", "rb") as f:
            value_writer.set(f.read())
        writer.append(key_writer, value_writer)
    writer.close()
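# A sketch of the same function using Python's tempfile module instead of the
# hard-coded "temp.nc", so concurrent runs in the same working directory don't
# clobber each other's scratch file; everything else is unchanged.
def write_seq_file_tempfile(file_name, data_dict):
    import tempfile
    writer = SequenceFile.createWriter(file_name, Text, BytesWritable)
    for key, value in data_dict.items():
        key_writer = Text()
        key_writer.set(key)
        value_writer = BytesWritable()
        # Reserve a unique scratch path, write the cube to it, read it back.
        with tempfile.NamedTemporaryFile(suffix=".nc", delete=False) as tmp:
            tmp_path = tmp.name
        try:
            iris.save(value, tmp_path)
            with open(tmp_path, "rb") as f:
                value_writer.set(f.read())
        finally:
            os.remove(tmp_path)
        writer.append(key_writer, value_writer)
    writer.close()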
def importSGY(sgyFilename, rddFilename):
    # os.remove(rddFilename)
    fp = open(sgyFilename, 'rb')
    writer = SequenceFile.createWriter(rddFilename, IntWritable, BytesWritable)
    SH = segypy.getSegyHeader(sgyFilename, 3600, segypy.endian)
    bps = segypy.getBytePerSample(SH)
    filesize = os.path.getsize(sgyFilename)
    samp_count = SH['ns']
    data_len = samp_count * bps
    trace_size = data_len + 240        # 240-byte trace header + samples
    ntraces = (filesize - 3600) // trace_size
    data = fp.read(3600)               # skip the 3600-byte SEG-Y file header
    for trace_num in range(ntraces):
        SegyTraceHeader = fp.read(240)
        SegyTraceData = fp.read(data_len)
        # FIXME: segypy.getValue is not the correct call here
        SegyTraceData = segypy.getValue(
            SegyTraceData, 0, 'float', segypy.endian, samp_count)
        writer.append(IntWritable(trace_num), BytesWritable(
            str(SegyTraceHeader) + str(SegyTraceData)))
    writer.close()
    fp.close()
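# The FIXME above can be side-stepped entirely if the goal is just to ship raw
# trace bytes into the SequenceFile: append the header and sample bytes exactly
# as read, and defer sample decoding (endianness, IBM-float vs. int formats
# per SH) to whatever consumes the file. A sketch of the simpler loop body:
def _append_raw_trace(writer, trace_num, trace_header, trace_data):
    # trace_header and trace_data are the raw byte strings read from the file.
    writer.append(IntWritable(trace_num),
                  BytesWritable(trace_header + trace_data))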