def __init__(self, dirname):
    self._data = SequenceFile.Reader(os.path.join(dirname, DATA_FILE_NAME))
    self._index = SequenceFile.Reader(os.path.join(dirname, INDEX_FILE_NAME))
    self._first_position = self._data.getPosition()
    self._positions = []
    self._keys = []

def __init__(self, dirname, key_class, value_class):
    os.mkdir(dirname)
    data_path = os.path.join(dirname, DATA_FILE_NAME)
    self._data = SequenceFile.createWriter(data_path, key_class, value_class)
    index_path = os.path.join(dirname, INDEX_FILE_NAME)
    self._index = SequenceFile.createBlockWriter(index_path, key_class, LongWritable)
    self._size = 0
    self._last_index_pos = -1
    self._last_index_nkeys = -4294967295

def convert_to_sequencefiles(cpp_encrypted_data):
    # Get all data files outputted by C++
    partition_pattern = os.path.join(cpp_encrypted_data, "data/cpp-part*")
    partition_files = glob.glob(partition_pattern)

    # Convert each partition to SequenceFile format
    for partition_file in partition_files:
        # FIXME: should we stream this so we don't load the entire 1 GB into memory?
        with open(partition_file, "rb") as partition:
            partition_data = partition.read()

        # FIXME: better way of generating the new file name.
        # This approach has the limitation that the original path cannot contain `cpp-`.
        output_partition_file = partition_file.replace("cpp-", "")

        sequence_file_writer = SequenceFile.createWriter(
            output_partition_file, IntWritable, BytesWritable)
        key = IntWritable()
        value = BytesWritable()
        key.set(0)
        value.set(partition_data)
        sequence_file_writer.append(key, value)
        sequence_file_writer.close()

        # Remove temporary file generated by C++
        os.remove(partition_file)

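# A minimal sketch of the streaming alternative raised in the FIXME above: read the
# partition in fixed-size chunks and append one record per chunk (keyed by chunk index)
# instead of loading the whole file into memory. This is an assumption, not part of the
# original code; it presumes downstream readers can reassemble a partition from multiple
# records, and the chunk size is an arbitrary illustrative value.
def convert_to_sequencefile_streaming(partition_file, output_partition_file,
                                      chunk_size=64 * 1024 * 1024):
    sequence_file_writer = SequenceFile.createWriter(
        output_partition_file, IntWritable, BytesWritable)
    key = IntWritable()
    value = BytesWritable()
    with open(partition_file, "rb") as partition:
        chunk_index = 0
        while True:
            chunk = partition.read(chunk_size)
            if not chunk:
                break
            key.set(chunk_index)
            value.set(chunk)
            sequence_file_writer.append(key, value)
            chunk_index += 1
    sequence_file_writer.close()
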
def sequence(file_out, s3_files_in, make_key, tempvaluefile="/tmp/temp.nc"):
    """
    file_out: string file path to write to.
    s3_files_in: a list of string file paths to read from. Each input file is
        encoded to a different (k, v) pair, with the key equal to the cube's
        metadata.
    make_key: a function which takes a cube and returns a uid string.
    """
    keys_done = []
    writer = SequenceFile.createWriter(file_out, Text, BytesWritable)
    for s3_file_in in s3_files_in:
        f = get_s3_file(s3_file_in, tempvaluefile)
        c = iris.load_cube(f)
        key_writer = Text()
        if str(c.metadata) in keys_done:
            warnings.warn("Key for file " + f + " already present - overwriting")
        key_writer.set(make_key(c))
        keys_done.append(str(c.metadata))
        value_writer = BytesWritable()
        with open(tempvaluefile, "rb") as f:
            print s3_file_in
            value_writer.set(f.read())
        writer.append(key_writer, value_writer)
    writer.close()

def main(argv=None):
    '''this is called if run from command line'''
    (prog, args) = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description='tsv2seq')
    # parser.add_argument()
    parser.add_argument("pathname")
    args = parser.parse_args(args)
    outputPathname = args.pathname + ".seq"
    writer = SequenceFile.createWriter(outputPathname, Text, Text)
    count = 0
    start = datetime.datetime.now()
    with open(args.pathname, 'r') as f:
        print f
        for line in f:
            try:
                (url, payload) = line.split('\t')
                key = Text()
                key.set(url)
                value = Text()
                # I'm not at all sure why we would want to decode, not encode here;
                # this is the only thing that worked
                value.set(Text.decode(json.dumps(payload)))
                writer.append(key, value)
                count += 1
            except ValueError as e:
                pass
    writer.close()
    end = datetime.datetime.now()
    delta = end - start
    print >> sys.stderr, "ELAPSED tsv2seq is %s" % elapsed(delta)
    return count

def test_text():
    writer = SequenceFile.createWriter('test_text.seq', LongWritable, Text,
                                       compression_type=CompressionType.BLOCK)
    write_text_data(writer)
    writer.close()

def desequence(seq_file, output_path, get_fname=lambda k, i: "file" + str(i) + ".nc"):
    """
    Takes a sequence file and writes out a separate NetCDF file for each value.

    seq_file: path to a seq file where the values are valid NetCDF binary blobs
    output_path: a string path to dump files to
    get_fname: a function which takes the key and an incremental integer, and
        returns a string to be used as the file name.
    """
    reader = SequenceFile.Reader(seq_file)
    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()
    key = key_class()
    value = value_class()
    position = reader.getPosition()
    i = 0
    while reader.next(key, value):
        with open(output_path + get_fname(key, i), "wb") as f:
            f.write(value.getBytes())
        i += 1
    reader.close()

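# A minimal usage sketch for desequence() above. The input path and the key-based naming
# scheme are illustrative assumptions, and keys are assumed to be Text-like objects
# exposing toString(); note the trailing slash, since output_path is concatenated directly.
desequence(
    "cubes.seq",    # hypothetical sequence file whose values are NetCDF blobs
    "/tmp/cubes/",  # output directory (must already exist)
    get_fname=lambda k, i: "%s_%04d.nc" % (k.toString(), i))
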
def sequence(file_out, s3_files_in, make_key, tempvaluefile="/tmp/temp.nc"):
    """
    file_out: string file path to write to.
    s3_files_in: a list of string file paths to read from. Each input file is
        encoded to a different (k, v) pair, with the key equal to the cube's
        metadata.
    make_key: a function which takes a cube and returns a uid string.
    """
    keys_done = []
    writer = SequenceFile.createWriter(file_out, Text, BytesWritable)
    for s3_file_in in s3_files_in:
        f = get_s3_file(s3_file_in, tempvaluefile)
        c = iris.load_cube(f)
        key_writer = Text()
        if str(c.metadata) in keys_done:
            warnings.warn("Key for file " + f + " already present - overwriting")
        key_writer.set(make_key(c))
        keys_done.append(str(c.metadata))
        value_writer = BytesWritable()
        with open(tempvaluefile, "rb") as f:
            print s3_file_in
            value_writer.set(f.read())
        writer.append(key_writer, value_writer)
    writer.close()

def init_pailfile_source(self, **kwargs):
    return PailfileSource(
        self.logger,
        self.loop,
        kwargs['gate'],
        SequenceFile.Reader(kwargs['input'][0].path),
    )

def main(argv=None):
    '''this is called if run from command line'''
    (prog, args) = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description='seq2tsv')
    # parser.add_argument()
    parser.add_argument("pathname")
    args = parser.parse_args(args)
    outputPathname = args.pathname + ".tsv"
    count = 0
    start = datetime.datetime.now()
    with open(outputPathname, 'w') as f:
        reader = SequenceFile.Reader(args.pathname)
        key_class = reader.getKeyClass()
        value_class = reader.getValueClass()
        key = key_class()
        value = value_class()
        # reader.sync(4042)
        position = reader.getPosition()
        while reader.next(key, value):
            # print '*' if reader.syncSeen() else ' ',
            print >> f, '%s\t%s' % (key.toString(), value.toString())
            count += 1
            position = reader.getPosition()
        reader.close()
    end = datetime.datetime.now()
    delta = end - start
    print >> sys.stderr, "ELAPSED seq2tsv is %s" % elapsed(delta)
    return count

def convert_from_sequencefiles(encrypted_data):
    partition_pattern = os.path.join(encrypted_data, "data/part-*")
    partition_files = glob.glob(partition_pattern)
    output_partition_files = []

    # Convert each partition from SequenceFile format to bytes
    for partition_file in partition_files:
        # Example taken from
        # https://github.com/matteobertozzi/Hadoop/blob/master/python-hadoop/examples/SequenceFileReader.py
        sequence_file_reader = SequenceFile.Reader(partition_file)
        key_class = sequence_file_reader.getKeyClass()
        value_class = sequence_file_reader.getValueClass()
        key = key_class()
        value = value_class()

        # FIXME: better way of generating intermediate file name
        output_partition_file = partition_file.replace("part-", "cpp-part-")

        # FIXME: Unclear if we need the below line
        # position = sequence_file_reader.getPosition()
        has_next = sequence_file_reader.next(key, value)
        if has_next:
            with open(output_partition_file, "wb") as partition:
                while has_next:
                    partition.write(value.toBytes())
                    has_next = sequence_file_reader.next(key, value)
                    # position = sequence_file_reader.getPosition()
            output_partition_files.append(output_partition_file)
        sequence_file_reader.close()
    return output_partition_files

def mergeFiles(seq_file_name, directory, suffix):
    writer = SequenceFile.createWriter(seq_file_name, Text, BytesWritable)
    for filename in os.listdir(directory):
        if filename.endswith(suffix):
            with open(os.path.join(directory, filename), 'rb') as f:
                data = f.read()
            writeData(writer, filename, data)
    writer.close()

def testWrite(filename):
    metadata = Metadata()
    metadata.set('Meta Key 0', 'Meta Value 0')
    metadata.set('Meta Key 1', 'Meta Value 1')
    writer = SequenceFile.createWriter(filename, LongWritable, LongWritable, metadata)
    writeData(writer)
    writer.close()

def test():
    writer = SequenceFile.createWriter('test.seq', LongWritable, LongWritable)
    writeData(writer)
    writer.close()

    writer = SequenceFile.createWriter('test-record.seq', LongWritable, LongWritable,
                                       compression_type=CompressionType.RECORD)
    writeData(writer)
    writer.close()

    writer = SequenceFile.createWriter('test-block.seq', LongWritable, LongWritable,
                                       compression_type=CompressionType.BLOCK)
    writeData(writer)
    writer.close()

def init_parameters(topic_num, word_num, hadoop_hdfs_root):
    '''
    Initialize parameters, alpha, lambda and eta
    '''
    # parameter initialized
    numpy.random.seed(100000001)

    # file setting
    parameter_target_filename = 'parameters_for_0.txt'
    writer = SequenceFile.createWriter(parameter_target_filename,
                                       TypedBytesWritable, TypedBytesWritable)

    # For alpha
    _alpha = numpy.zeros(topic_num) + 1. / topic_num
    output_key_a = TypedBytesWritable()
    output_value_a = TypedBytesWritable()
    output_key_a.set('new_alpha')
    output_value_a.set(_alpha.tostring())
    writer.append(output_key_a, output_value_a)

    # For lambda
    _lambda = 1 * numpy.random.gamma(100., 1. / 100., (topic_num, word_num))
    output_key_l = TypedBytesWritable()
    output_value_l = TypedBytesWritable()
    output_key_l.set('new_lambda')
    output_value_l.set(_lambda.tostring())
    writer.append(output_key_l, output_value_l)

    # For eta
    _eta = numpy.zeros(word_num) + 1. / topic_num
    output_key_e = TypedBytesWritable()
    output_value_e = TypedBytesWritable()
    output_key_e.set('new_eta')
    output_value_e.set(_eta.tostring())
    writer.append(output_key_e, output_value_e)

    writer.close()
    subprocess.call("hadoop dfs -copyFromLocal " + parameter_target_filename +
                    " " + hadoop_hdfs_root,
                    shell=True, stdout=file(os.devnull, "w"))
    os.remove(parameter_target_filename)
    return parameter_target_filename

def count_file(filename):
    reader = SequenceFile.Reader(filename)
    key = Text()
    value = NullWritable()
    count = 0
    while reader.next(key, value):
        count += 1
    reader.close()
    return count

def test_text():
    from hadoop.io.compress.ZlibCodec import ZlibCodec
    from hadoop.io.compress.GzipCodec import GzipCodec
    from hadoop.io.compress.BZip2Codec import BZip2Codec
    from hadoop.io.compress.LzoCodec import LzoCodec
    from hadoop.io.compress.SnappyCodec import SnappyCodec

    writer = SequenceFile.createWriter('resume_compressed.seq', Text, Text,
                                       compression_codec=SnappyCodec(),
                                       compression_type=CompressionType.BLOCK)
    write_text_data(writer)
    writer.close()

def write_seq_file(file_name, data_dict):
    writer = SequenceFile.createWriter(file_name, Text, BytesWritable)
    for key, value in data_dict.iteritems():
        print key, ", ",
        key_writer = Text()
        key_writer.set(key)
        value_writer = BytesWritable()
        iris.save(value, "temp.nc")
        with open("temp.nc", "rb") as f:
            value_writer.set(f.read())
        writer.append(key_writer, value_writer)
    writer.close()

def write_seq_file(file_name, data_dict):
    writer = SequenceFile.createWriter(file_name, Text, BytesWritable)
    for key, value in data_dict.iteritems():
        print key, ", ",
        key_writer = Text()
        key_writer.set(key)
        value_writer = BytesWritable()
        iris.save(value, "temp.nc")
        with open("temp.nc", "rb") as f:
            value_writer.set(f.read())
        writer.append(key_writer, value_writer)
    writer.close()

def init_parameters(topic_num, word_num, hadoop_hdfs_root):
    '''
    Initialize parameters, alpha, lambda and eta
    '''
    # parameter initialized
    numpy.random.seed(100000001)

    # file setting
    parameter_target_filename = 'parameters_for_0.txt'
    writer = SequenceFile.createWriter(parameter_target_filename,
                                       TypedBytesWritable, TypedBytesWritable)

    # For alpha
    _alpha = numpy.zeros(topic_num) + 1. / topic_num
    output_key_a = TypedBytesWritable()
    output_value_a = TypedBytesWritable()
    output_key_a.set('new_alpha')
    output_value_a.set(_alpha.tostring())
    writer.append(output_key_a, output_value_a)

    # For lambda
    _lambda = 1 * numpy.random.gamma(100., 1. / 100., (topic_num, word_num))
    output_key_l = TypedBytesWritable()
    output_value_l = TypedBytesWritable()
    output_key_l.set('new_lambda')
    output_value_l.set(_lambda.tostring())
    writer.append(output_key_l, output_value_l)

    # For eta
    _eta = numpy.zeros(word_num) + 1. / topic_num
    output_key_e = TypedBytesWritable()
    output_value_e = TypedBytesWritable()
    output_key_e.set('new_eta')
    output_value_e.set(_eta.tostring())
    writer.append(output_key_e, output_value_e)

    writer.close()
    subprocess.call("hadoop dfs -copyFromLocal " + parameter_target_filename +
                    " " + hadoop_hdfs_root,
                    shell=True, stdout=file(os.devnull, "w"))
    os.remove(parameter_target_filename)
    return parameter_target_filename

def __init__(self):
    self._word_num = int(self.params['word_num'])
    self._document_num = int(self.params['document_num'])
    self._minibatch_size = int(self.params['minibatch_size'])
    self._meanchangethresh = float(self.params['meanchangethresh'])
    self._topic_num = int(self.params['topic_num'])
    self._tau0 = float(self.params['tau0'])
    self._updatect = float(self.params['updatect'])
    self._kappa = float(self.params['kappa'])
    rhot = pow(self._tau0 + self._updatect, -self._kappa)
    self._rhot = rhot

    # Load parameter from distributed cache
    parameter_reader = SequenceFile.Reader('./_params')
    key_class = parameter_reader.getKeyClass()
    value_class = parameter_reader.getValueClass()
    key_instance = key_class()
    value_instance = value_class()
    while parameter_reader.next(key_instance, value_instance):
        key_instance_str = key_instance.toString()
        if 'new_alpha' == key_instance_str:
            # For alpha
            self._alpha = value_instance.toString()
            self._alpha = numpy.fromstring(self._alpha)
            self._alpha.shape = self._topic_num
        elif 'new_lambda' == key_instance_str:
            # For lambda
            self._lambda = value_instance.toString()
            self._lambda = numpy.fromstring(self._lambda)
            self._lambda.shape = (self._topic_num, self._word_num)
        elif 'new_eta' == key_instance_str:
            # For eta
            self._eta = value_instance.toString()
            self._eta = numpy.fromstring(self._eta)
            self._eta.shape = self._word_num
        else:
            # Error
            sys.stderr.write("Something wrong in parameter_reader\n")
            sys.exit(1)
    parameter_reader.close()

    self._Elogbeta = self.dirichlet_expectation(self._lambda)
    self._expElogbeta = numpy.exp(self._Elogbeta)

    # initialize sstats
    self.sstats = numpy.zeros((self._topic_num, self._word_num))
    self.gamma = numpy.zeros((self._minibatch_size, self._topic_num))

def make_text_null_seq(filename, reader):
    writer = SequenceFile.createWriter(filename, Text, NullWritable)
    key = Text()
    value = NullWritable()
    count = 0
    for x in reader:
        key.set(x)
        writer.append(key, value)
        count += 1
    writer.close()
    return count

def seqReader(path):
    reader = SequenceFile.Reader(path)
    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()
    key = key_class()
    value = value_class()
    position = reader.getPosition()
    while reader.next(key, value):
        position = reader.getPosition()
        name, d1, d2 = key.toString().split(".")
        nparr = np.array(value.toString().split(","), np.uint8).reshape(int(d1), int(d2))
        # img = cv2.imdecode(nparr, cv2.CV_LOAD_IMAGE_COLOR)
        print nparr.shape
    reader.close()

def seqReader(pathtpsaveimage):
    reader = SequenceFile.Reader(pathtpsaveimage)
    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()
    key = key_class()
    value = value_class()
    position = reader.getPosition()
    while reader.next(key, value):
        position = reader.getPosition()
        name, d1, d2, ext = key.toString().split(".")
        print len(value.getBytes())
        nparr = np.fromstring(value.getBytes(), np.uint8)
        img = cv2.imdecode(nparr, cv2.CV_LOAD_IMAGE_COLOR)
        print np.array(img).size
    reader.close()

def SequenceFileIterator(path):
    reader = SequenceFile.Reader(path)
    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()
    key = key_class()
    value = value_class()
    position = reader.getPosition()
    while reader.next(key, value):
        yield (position, key.toString(), value.toString())
        position = reader.getPosition()
    reader.close()

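# A minimal usage sketch for the SequenceFileIterator generator above; the file name is an
# illustrative assumption. The key/value Writable instances are reused across iterations,
# which is why the generator yields their string representations rather than the objects.
for position, key_str, value_str in SequenceFileIterator('test.seq'):
    print '[%6s] %6s %6s' % (position, key_str, value_str)
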
def exportSGY(rddFilename, sgyFilename):
    reader = SequenceFile.Reader(rddFilename)
    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()
    key = key_class()
    value = value_class()
    # reader.sync(4042)
    position = reader.getPosition()
    while reader.next(key, value):
        print('*' if reader.syncSeen() else ' ',
              '[%6s] %6s %6s' % (position, key.toString(), value.toString()))
        position = reader.getPosition()
    reader.close()

def hadoop_input_stream(stream, size, url, params):
    stream.seek(0, 2)
    size = stream.tell()
    stream.seek(0)
    reader = SequenceFile.Reader(stream, length=size)
    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()
    key = key_class()
    value = value_class()
    while reader.next(key, value):
        yield key, value
    reader.close()

def __init__(self, path, metadict=None):
    FileBase.__init__(self, path)
    self._tmppostfix = 'seqtmp'
    self._postfix = 'seq'
    self._raw_key, self._raw_value = Text(), Text()
    self._item_count = 0

    tmpdict = metadict or {}
    tmpdict['name'] = 'SequenceFileWriter'
    tmpdict['ver'] = '0.1'

    from hadoop.io.SequenceFile import Metadata
    meta = Metadata()
    for k, v in tmpdict.items():
        meta.set(k, v)

    self._writer = SequenceFile.createWriter(
        self._gen_tmp_path(), Text, Text, metadata=meta,
        compression_type=SequenceFile.CompressionType.BLOCK)
    assert self._writer, "Failed Create Writer File handler"

def seqReader(pathtpsaveimage):
    reader = SequenceFile.Reader(pathtpsaveimage)
    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()
    key = key_class()
    value = value_class()
    position = reader.getPosition()
    compression_codec = BZip2Codec()
    while reader.next(key, value):
        position = reader.getPosition()
        name, d1, d2, ext = key.toString().split(".")
        arr = compression_codec.decompress(value.getBytes())
        nparr = np.frombuffer(arr, np.uint8)
        try:
            img = cv2.imdecode(nparr, cv2.CV_LOAD_IMAGE_COLOR)
        except AttributeError:
            img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
        print name, img.shape
    reader.close()

def testRead(filename):
    reader = SequenceFile.Reader(filename)

    metadata = reader.getMetadata()
    for meta_key, meta_value in metadata:
        print 'METADATA:', meta_key, meta_value

    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()
    key = key_class()
    value = value_class()

    position = reader.getPosition()
    while reader.next(key, value):
        print '*' if reader.syncSeen() else ' ',
        print '[%6s] %6s %6s' % (position, key.toString(), value.toString())
        position = reader.getPosition()
    reader.close()

def main():
    inputfiles = sys.argv[1]
    call(['mkdir', os.path.join(options.tmpdir, 'tmp')])

    print "downloading inputfiles %s" % (inputfiles)
    check_call([
        'hadoop', 'fs', '-copyToLocal', inputfiles,
        os.path.join(options.tmpdir, 'tmp')
    ])

    order = {}
    values = []
    for fname in os.listdir(os.path.join(options.tmpdir, 'tmp')):
        reader = SequenceFile.Reader(os.path.join(options.tmpdir, 'tmp', fname))
        key_class = reader.getKeyClass()
        value_class = reader.getValueClass()
        key = key_class()
        value = value_class()
        while reader.next(key, value):
            order[int(key.get())] = value.get()
        reader.close()

    var = []
    for key, val in sorted(order.iteritems()):
        var.extend(val)
    var2 = np.array(var)

    print "reading templatefile %s" % (options.template)
    templatefile = ep.ExoFile(options.template, 'r')

    print "writing outputfile %s" % (options.output)
    newfile = ep.ExoFile(options.output, 'w')
    result = insert_vars(templatefile, newfile, (options.varname, ), (var2, ))

    print "removing inputfiles %s" % (inputfiles)
    check_call(['rm', '-r', os.path.join(options.tmpdir, 'tmp')])
    print "Done!"

def __init__(self):
    numpy.random.seed(100000001)
    self._word_num = int(self.params['word_num'])
    self._meanchangethresh = float(self.params['meanchangethresh'])
    self._topic_num = int(self.params['topic_num'])

    # Load parameter from distributed cache
    parameter_reader = SequenceFile.Reader('./_params')
    key_class = parameter_reader.getKeyClass()
    value_class = parameter_reader.getValueClass()
    key_instance = key_class()
    value_instance = value_class()
    while parameter_reader.next(key_instance, value_instance):
        key_instance_str = key_instance.toString()
        if 'new_alpha' == key_instance_str:
            # For alpha
            self._alpha = value_instance.toString()
            self._alpha = numpy.fromstring(self._alpha)
            self._alpha.shape = self._topic_num
        elif 'new_lambda' == key_instance_str:
            # For lambda
            self._lambda = value_instance.toString()
            self._lambda = numpy.fromstring(self._lambda)
            self._lambda.shape = (self._topic_num, self._word_num)
        elif 'new_eta' == key_instance_str:
            # For eta: loading not needed here, skip it
            continue
        else:
            # Error
            sys.stderr.write("Something wrong in parameter_reader\n")
            sys.exit(1)
    parameter_reader.close()

    self._Elogbeta = self.dirichlet_expectation(self._lambda)
    self._expElogbeta = numpy.exp(self._Elogbeta)

def importSGY(sgyFilename, rddFilename):
    # os.remove(rddFilename)
    fp = open(sgyFilename, 'rb')
    writer = SequenceFile.createWriter(rddFilename, IntWritable, BytesWritable)

    SH = segypy.getSegyHeader(sgyFilename, 3600, segypy.endian)
    bps = segypy.getBytePerSample(SH)

    filesize = os.path.getsize(sgyFilename)
    samp_count = SH['ns']
    data_len = samp_count * bps
    trace_size = data_len + 240
    ntraces = (filesize - 3600) / trace_size

    data = fp.read(3600)
    for trace_num in range(ntraces):
        SegyTraceHeader = fp.read(240)
        SegyTraceData = fp.read(data_len)
        # FIXME: segypy.getValue is not correct
        SegyTraceData = segypy.getValue(
            SegyTraceData, 0, 'float', segypy.endian, samp_count)
        writer.append(IntWritable(trace_num), BytesWritable(
            str(SegyTraceHeader) + str(SegyTraceData)))
    writer.close()
    fp.close()

# with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from hadoop.io.SequenceFile import CompressionType
from hadoop.io import BytesWritable
from hadoop.io import LongWritable
from hadoop.io import SequenceFile


def writeData(writer):
    key = BytesWritable()
    value = BytesWritable()
    # for i in xrange(1000):
    key.set("A")
    value.set("B")
    print '[%d] %s %s' % (writer.getLength(), key.toString(), value.toString())
    writer.append(key, value)


if __name__ == '__main__':
    writer = SequenceFile.createWriter('test-bytes.seq', BytesWritable, BytesWritable)
    writeData(writer)
    writer.close()

def compile_data(input_str, substitute, outputpath='', compression=False, test_file=False, p=None):
    temp = input_str.rpartition(os.sep)
    path_temp = temp[0]
    file_temp = temp[2]

    if outputpath != '':
        try:
            os.mkdir(outputpath)
        except:
            pass

    if p is not None:
        global pool
        pool = p

    ica_key, ica_val, raw_key, raw_val = Text(), Text(), Text(), Text()
    for i, v in enumerate(substitute):
        path_to_data = path_temp.replace('?', str(v))
        filename = file_temp.replace('?', str(v))
        eeg = get_eeg(path_to_data + os.sep, filename)
        if eeg != 1:
            raw_data, ica_act = read_full_float(eeg)
        else:
            continue
        if raw_data is None:
            continue

        print(filename + ': identifying outliers')
        artifact_indexes = find_artifact_indexes(eeg, ica_act)
        eeg['artifact_indexes'] = artifact_indexes
        f = open('..\\artifact_indexes', 'w')
        pickle.dump(artifact_indexes, f)
        f.close()
        eegstr = pickle.dumps(eeg, protocol=2)

        print(filename + ': compiling dataset into hadoop sequence file')
        if outputpath == '':
            outputpath = path_to_data

        # Enable compression if requested
        if compression:
            comp_type = SequenceFile.CompressionType.RECORD
        else:
            comp_type = SequenceFile.CompressionType.NONE

        writer = SequenceFile.createWriter(outputpath + os.sep + filename + '.seq',
                                           Text, Text, compression_type=comp_type)
        for i in range(raw_data.shape[1]):
            if test_file and i > 3:
                break
            this_raw = np.ascontiguousarray(raw_data[:, i], dtype=raw_data.dtype)
            this_ica = np.ascontiguousarray(ica_act[:, i], dtype=ica_act.dtype)
            ica_key.set(outputpath + os.sep + filename + '.ica.' + str(i + 1))
            raw_key.set(outputpath + os.sep + filename + '.raw.' + str(i + 1))
            ica_temp = pickle.dumps((this_ica, eegstr), protocol=2)
            raw_temp = pickle.dumps((this_raw, eegstr), protocol=2)
            ica = base64.b64encode(ica_temp)
            raw = base64.b64encode(raw_temp)
            ica_val.set(ica)
            raw_val.set(raw)
            writer.append(raw_key, raw_val)
            writer.append(ica_key, ica_val)
            print(filename + ': ' + str(i + 1))
        writer.close()
        print(filename + ': finished writing file')
    return 0

#!/usr/bin/env python
from hadoop.io import SequenceFile, Text
from hadoop.io.SequenceFile import CompressionType

import msgpack
import re
import itertools

writer = SequenceFile.createWriter('pg10.seq', Text, Text,
                                   compression_type=CompressionType.BLOCK)

key = Text()
value = Text()

with open('pg10.txt', 'r') as f:
    for pos, line in enumerate(iter(f.readline, '')):
        line = line.strip()

        pos = msgpack.packb(pos)
        line = msgpack.packb(line)

        key._bytes = pos
        key._length = len(pos)
        value._bytes = line
        value._length = len(line)

        writer.append(key, value)

writer.close()

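# A minimal sketch (an assumption, not part of the original script) of reading pg10.seq
# back and unpacking the msgpack-encoded pairs written above. It mirrors the writer's use
# of the private _bytes attribute on Text rather than assuming a public accessor.
reader = SequenceFile.Reader('pg10.seq')
key = Text()
value = Text()
while reader.next(key, value):
    pos = msgpack.unpackb(key._bytes)
    line = msgpack.unpackb(value._bytes)
    print pos, line
reader.close()
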
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys

from hadoop.io import SequenceFile

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('usage: SequenceFileReader <filename>')
    else:
        reader = SequenceFile.Reader(sys.argv[1])

        key_class = reader.getKeyClass()
        value_class = reader.getValueClass()

        key = key_class()
        value = value_class()

        # reader.sync(4042)
        position = reader.getPosition()
        while reader.next(key, value):
            print('*' if reader.syncSeen() else ' ', end=' ')
            print('[%6s] %6s %6s' % (position, key.toString(), value.toString()))
            position = reader.getPosition()

        reader.close()

def create_writer(self, writer=None):
    if writer:
        writer.close()
    seq_file = self.gernate_file()
    self.seq_filename = seq_file
    return SequenceFile.createWriter(seq_file, Text, BytesWritable,
                                     compression_type=CompressionType.RECORD)

# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from hadoop.io.SequenceFile import CompressionType
from hadoop.io import Text
from hadoop.io import SequenceFile


def writeData(writer):
    key = Text()
    value = Text()
    key.set('Key')
    value.set('Value')
    writer.append(key, value)


if __name__ == '__main__':
    writer = SequenceFile.createWriter('test.seq', Text, Text)
    writeData(writer)
    writer.close()

def convert(inputfile, steps, outdir, variables):
    f = ep.ExoFile(inputfile, 'r')
    total_time_steps = f.num_time_steps
    if total_time_steps < steps:
        print >> sys.stderr, 'The total time steps is', total_time_steps
        print >> sys.stderr, 'The partition step is', steps, '. No need to partition the file.'
        return False

    Vars = variables.split(',')

    # Get time data and coordinate (x, y, z) data
    time = f.cdf.variables["time_whole"]
    timedata = time.getValue()
    coordz = f.cdf.variables["coordz"]
    zdata = coordz.getValue()
    coordy = f.cdf.variables["coordy"]
    ydata = coordy.getValue()
    coordx = f.cdf.variables["coordx"]
    xdata = coordx.getValue()

    # To avoid PICKLE type in typedbytes files
    timedata2 = []
    for i, ele in enumerate(timedata):
        timedata2.append(float(ele))
    xdata2 = []
    for i, ele in enumerate(xdata):
        xdata2.append(float(ele))
    ydata2 = []
    for i, ele in enumerate(ydata):
        ydata2.append(float(ele))
    zdata2 = []
    for i, ele in enumerate(zdata):
        zdata2.append(float(ele))

    # Get variable data
    varnames = f.node_variable_names()
    vardata = []
    for i, var in enumerate(Vars):
        vdata = None
        vindex = None
        for vi, n in enumerate(varnames):
            if n == var.strip():
                # vtemp = vi
                vindex = vi
                break
        if vindex == None:
            print >> sys.stderr, 'The variable ', var.strip(), 'does not exist!'
            return False
        tmp = f.vars['vals_nod_var' + str(vindex + 1)]
        tmpdata = tmp.getValue()
        vardata.append((var.strip(), tmpdata))

    # Begin to partition
    basename = os.path.basename(inputfile)
    ind = basename.rfind('.')
    basename = basename[0:ind]

    indexkey = TypedBytesWritable()
    indexvalue = TypedBytesWritable()
    indexwriter = SequenceFile.createWriter(os.path.join(outdir, 'index.seq'),
                                            TypedBytesWritable, TypedBytesWritable,
                                            compression_type=CompressionType.RECORD)
    begin = 0
    i = 0
    while begin < total_time_steps:
        end = begin + steps - 1
        if end > total_time_steps - 1:
            end = total_time_steps - 1
        outputfilename = basename + '_part' + str(i) + '.seq'
        writer = SequenceFile.createWriter(os.path.join(outdir, outputfilename),
                                           TypedBytesWritable, TypedBytesWritable,
                                           compression_type=CompressionType.RECORD)
        key = TypedBytesWritable()
        value = TypedBytesWritable()
        key.set(-1)
        value.set(xdata2)
        writer.append(key, value)
        key.set(-2)
        value.set(ydata2)
        writer.append(key, value)
        key.set(-3)
        value.set(zdata2)
        writer.append(key, value)
        for j in xrange(begin, end + 1):
            key.set((j, timedata2[j]))
            valuedata = []
            for m, var in enumerate(vardata):
                name = var[0]
                data = var[1][j]
                data2 = []
                for m, ele in enumerate(data):
                    data2.append(float(ele))
                valuedata.append((name, data2))
            value.set(valuedata)
            writer.append(key, value)
        writer.close()
        indexkey.set(outputfilename)
        indexvalue.set(end - begin + 1)
        indexwriter.append(indexkey, indexvalue)
        begin = begin + steps
        i = i + 1

    indexkey.set('total')
    indexvalue.set(total_time_steps)
    indexwriter.append(indexkey, indexvalue)
    indexwriter.close()
    return True

#!/usr/bin/env python
import sys

# read the input files
for line in sys.stdin:
    print (1, line[5] or 'N/A')


#!/usr/bin/env python
import sys
from hadoop.io import LongWritable
from hadoop.io import SequenceFile

writer = SequenceFile.createWriter('reddit_posts.seq', LongWritable, LongWritable)

# read the input files
for line in sys.stdin:
    # use a try/except block to make sure an improperly formatted row does not blow up our program
    # remove leading and trailing whitespace
    # assume that the files are both comma delimited and only contain the columns described in assignment 4 part 1
    line = line.strip().split(",")
    _id, _text = line[0], line[5] or 'N/A'
    key = LongWritable()
    key.set(int(_id))
    value = LongWritable()

def convert(inputfile, steps, outdir, variables, normalized_timesteps):
    fset = 0
    fdir, fname = os.path.split(inputfile)
    fsetnum = ''
    for i, c in enumerate(fname):
        if c.isdigit():
            fsetnum += c
    if fsetnum != '':
        fset = int(fsetnum)

    f = ep.ExoFile(inputfile, 'r')
    Vars = variables.split(',')

    # Get time data and coordinate (x, y, z) data
    time = f.cdf.variables["time_whole"]
    timedata = time.getValue()
    coordz = f.cdf.variables["coordz"]
    zdata = coordz.getValue()
    coordy = f.cdf.variables["coordy"]
    ydata = coordy.getValue()
    coordx = f.cdf.variables["coordx"]
    xdata = coordx.getValue()

    # To avoid PICKLE type in typedbytes files
    timedata2 = []
    for i, ele in enumerate(timedata):
        timedata2.append(float(ele))
    xdata2 = []
    for i, ele in enumerate(xdata):
        xdata2.append(float(ele))
    ydata2 = []
    for i, ele in enumerate(ydata):
        ydata2.append(float(ele))
    zdata2 = []
    for i, ele in enumerate(zdata):
        zdata2.append(float(ele))

    # Note: the size of normalized_timesteps should not be greater than
    # num_time_steps in the exodus file.
    if normalized_timesteps is None:
        normalized_timesteps = timedata2
    total_time_steps = len(normalized_timesteps)

    # Get variable data
    varnames = f.node_variable_names()
    vardata = []
    for i, var in enumerate(Vars):
        vdata = None
        vindex = None
        for vi, n in enumerate(varnames):
            if n == var.strip():
                # vtemp = vi
                vindex = vi
                break
        if vindex == None:
            print >> sys.stderr, 'The variable ', var.strip(), 'does not exist!'
            return False
        tmp = f.vars['vals_nod_var' + str(vindex + 1)]
        tmpdata = tmp.getValue()
        vardata.append((var.strip(), tmpdata))

    # Begin to partition
    basename = os.path.basename(inputfile)
    ind = basename.rfind('.')
    basename = basename[0:ind]

    indexkey = TypedBytesWritable()
    indexvalue = TypedBytesWritable()
    indexwriter = SequenceFile.createWriter(os.path.join(outdir, 'index.seq'),
                                            TypedBytesWritable, TypedBytesWritable,
                                            compression_type=CompressionType.RECORD)
    begin = 0
    i = 0
    time_begin = 0
    while begin < total_time_steps:
        end = begin + steps - 1
        if end > total_time_steps - 1:
            end = total_time_steps - 1
        outputfilename = basename + '_part' + str(i) + '.seq'
        writer = SequenceFile.createWriter(os.path.join(outdir, outputfilename),
                                           TypedBytesWritable, TypedBytesWritable,
                                           compression_type=CompressionType.RECORD)
        key = TypedBytesWritable()
        value = TypedBytesWritable()
        key.set(-1)
        value.set(xdata2)
        writer.append(key, value)
        key.set(-2)
        value.set(ydata2)
        writer.append(key, value)
        key.set(-3)
        value.set(zdata2)
        writer.append(key, value)
        for j in xrange(begin, end + 1):
            key.set((fset, (j, normalized_timesteps[j])))
            valuedata = []
            for m, var in enumerate(vardata):
                name = var[0]
                data = var[1]
                for t in xrange(time_begin, len(timedata2)):
                    if normalized_timesteps[j] == timedata2[t]:
                        normalized_data = data[t]
                        time_begin = t
                        break
                    elif normalized_timesteps[j] < timedata2[t]:
                        normalized_data = linear_interpolate(normalized_timesteps[j],
                                                             timedata2[t - 1], data[t - 1],
                                                             timedata2[t], data[t])
                        break
                data2 = []
                for m, ele in enumerate(normalized_data):
                    data2.append(float(ele))
                valuedata.append((name, data2))
            value.set(valuedata)
            writer.append(key, value)
        writer.close()
        indexkey.set(outputfilename)
        indexvalue.set(end - begin + 1)
        indexwriter.append(indexkey, indexvalue)
        begin = begin + steps
        i = i + 1

    indexkey.set('total')
    indexvalue.set(total_time_steps)
    indexwriter.append(indexkey, indexvalue)
    indexwriter.close()
    return True