Example #1
 def __init__(self, dirname):
     self._data = SequenceFile.Reader(os.path.join(dirname, DATA_FILE_NAME))
     self._index = SequenceFile.Reader(
         os.path.join(dirname, INDEX_FILE_NAME))
     self._first_position = self._data.getPosition()
     self._positions = []
     self._keys = []
Example #2
    def __init__(self, dirname, key_class, value_class):
        os.mkdir(dirname)

        data_path = os.path.join(dirname, DATA_FILE_NAME)
        self._data = SequenceFile.createWriter(data_path, key_class,
                                               value_class)

        index_path = os.path.join(dirname, INDEX_FILE_NAME)
        self._index = SequenceFile.createBlockWriter(index_path, key_class,
                                                     LongWritable)

        self._size = 0
        self._last_index_pos = -1
        self._last_index_nkeys = -4294967295
Example #3
def convert_to_sequencefiles(cpp_encrypted_data):
    # Get all data files outputted by C++
    partition_pattern = os.path.join(cpp_encrypted_data, "data/cpp-part*")
    partition_files = glob.glob(partition_pattern)

    # Convert each partition to SequenceFile format
    for partition_file in partition_files:
        # FIXME: should we stream this so we don't load the entire 1 GB into memory?
        with open(partition_file, "rb") as partition:
            partition_data = partition.read()

        # FIXME: better way of generating new file name
        # This approach has the limitation that the original path cannot contain `cpp-`
        output_partition_file = partition_file.replace("cpp-", "")
        sequence_file_writer = SequenceFile.createWriter(
            output_partition_file, IntWritable, BytesWritable)

        key = IntWritable()
        value = BytesWritable()

        key.set(0)
        value.set(partition_data)

        sequence_file_writer.append(key, value)
        sequence_file_writer.close()

        # Remove temporary file generated by C++
        os.remove(partition_file)
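
One way to address the streaming FIXME above is sketched below; it is not part of the original example. It assumes downstream readers can tolerate a partition arriving as several records rather than one, and the chunk size and per-chunk integer keys are illustrative choices. It reuses only calls already shown on this page (SequenceFile.createWriter, IntWritable.set, BytesWritable.set, writer.append, writer.close).

def convert_partition_streaming(partition_file, output_partition_file,
                                chunk_size=64 * 1024 * 1024):
    # Hypothetical streaming variant of the loop body in Example #3: read the
    # C++ partition in fixed-size chunks and append one record per chunk, so
    # the whole file never has to sit in memory at once.
    writer = SequenceFile.createWriter(
        output_partition_file, IntWritable, BytesWritable)
    key = IntWritable()
    value = BytesWritable()
    chunk_index = 0
    with open(partition_file, "rb") as partition:
        while True:
            chunk = partition.read(chunk_size)
            if not chunk:
                break
            key.set(chunk_index)
            value.set(chunk)
            writer.append(key, value)
            chunk_index += 1
    writer.close()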
Example #4
def sequence(file_out, s3_files_in, make_key, tempvaluefile="/tmp/temp.nc"):
    """
    file_out: string file path to write to.
    s3_files_in: a list of string file paths to read from. Each input file is
    encoded as a separate (key, value) pair, with the key equal to the cube's
    metadata.
    make_key is a function which takes a cube and returns a uid string.
    """
    keys_done = []
    
    writer = SequenceFile.createWriter(file_out, Text, BytesWritable)
    for s3_file_in in s3_files_in:
        f = get_s3_file(s3_file_in, tempvaluefile)
        c = iris.load_cube(f)
        key_writer = Text()
        
        if (str(c.metadata) in keys_done):
            warnings.warn("Key for file "+f+" already present - overwriting")
        key_writer.set(make_key(c))
        keys_done.append(str(c.metadata))
        
        value_writer = BytesWritable()
        with open(tempvaluefile, "rb") as f:
            print s3_file_in
            value_writer.set(f.read())
            writer.append(key_writer, value_writer)
    writer.close()
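
A short usage sketch for sequence(); the bucket paths and the make_key choice are illustrative assumptions, and get_s3_file and iris must be importable as in the example above.

if __name__ == '__main__':
    # Pack two S3-hosted NetCDF files into one sequence file, keyed by the
    # cube metadata string.
    sequence('cubes.seq',
             ['s3://my-bucket/a.nc', 's3://my-bucket/b.nc'],
             make_key=lambda c: str(c.metadata))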
Example #5
def main(argv=None):
    '''this is called if run from command line'''
    (prog, args) = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description='tsv2seq')
    # parser.add_argument()
    parser.add_argument("pathname")
    args = parser.parse_args(args)
    
    outputPathname = args.pathname + ".seq"
    writer = SequenceFile.createWriter(outputPathname, Text, Text)
    count = 0
    start = datetime.datetime.now()
    with open(args.pathname, 'r') as f:
        print f
        for line in f:
            try:
                (url, payload) = line.split('\t')
                key = Text()
                key.set(url)
                value = Text()
                # I'm not at all sure why we would want to decode, not encode here
                # this is the only thing that worked
                value.set(Text.decode(json.dumps(payload)))
                writer.append(key, value)
                count += 1
            except ValueError as e:
                pass
    writer.close()
    end = datetime.datetime.now()
    delta = end - start
    print >> sys.stderr, "ELAPSED tsv2seq is %s" % elapsed(delta)
    return count
def test_text():
    writer = SequenceFile.createWriter('test_text.seq',
                                       LongWritable,
                                       Text,
                                       compression_type=CompressionType.BLOCK)
    write_text_data(writer)
    writer.close()
def test_text():
    writer = SequenceFile.createWriter('test_text.seq',
                                       LongWritable,
                                       Text,
                                       compression_type=CompressionType.BLOCK)
    write_text_data(writer)
    writer.close()
Example #8
def desequence(seq_file,
               output_path,
               get_fname=lambda k, i: "file" + str(i) + ".nc"):
    """
    Takes a sequence file and writes out a separate NetCDF file
    for each value.

    seq_file: path to a seq file where the values are valid NetCDF binary blobs
    output_path: a string path to dump files to
    get_fname: a function which takes the key and an incremental integer,
                    and returns a string to be used as the file name.

    """
    reader = SequenceFile.Reader(seq_file)

    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()

    key = key_class()
    value = value_class()

    position = reader.getPosition()
    i = 0
    while reader.next(key, value):
        with open(output_path + get_fname(key, i), "wb") as f:
            f.write(value.getBytes())
        i += 1
    reader.close()
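
A brief usage sketch for desequence; the file name, output directory, and key-based naming are assumptions, on the premise that the sequence file was written by the sequence() function in Example #4 (Text keys, NetCDF blobs as BytesWritable values).

if __name__ == '__main__':
    # Name each extracted NetCDF file after its Text key instead of the
    # default "file<i>.nc" counter.
    desequence('cubes.seq', '/tmp/cubes/',
               get_fname=lambda k, i: k.toString() + '.nc')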
Example #9
def sequence(file_out, s3_files_in, make_key, tempvaluefile="/tmp/temp.nc"):
    """
    file_out: string file path to write to.
    s3_files_in: a list of string file paths to read from. Each input file is
    encoded as a separate (key, value) pair, with the key equal to the cube's
    metadata.
    make_key is a function which takes a cube and returns a uid string.
    """
    keys_done = []

    writer = SequenceFile.createWriter(file_out, Text, BytesWritable)
    for s3_file_in in s3_files_in:
        f = get_s3_file(s3_file_in, tempvaluefile)
        c = iris.load_cube(f)
        key_writer = Text()

        if (str(c.metadata) in keys_done):
            warnings.warn("Key for file " + f +
                          " already present - overwriting")
        key_writer.set(make_key(c))
        keys_done.append(str(c.metadata))

        value_writer = BytesWritable()
        with open(tempvaluefile, "rb") as f:
            print s3_file_in
            value_writer.set(f.read())
            writer.append(key_writer, value_writer)
    writer.close()
Example #10
 def init_pailfile_source(self, **kwargs):
     return PailfileSource(
         self.logger,
         self.loop,
         kwargs['gate'],
         SequenceFile.Reader(kwargs['input'][0].path),
     )
Example #11
def main(argv=None):
    '''this is called if run from command line'''
    (prog, args) = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description='seq2tsv')
    # parser.add_argument()
    parser.add_argument("pathname")
    args = parser.parse_args(args)
    outputPathname = args.pathname + ".tsv"
    count = 0
    start = datetime.datetime.now()
    with open(outputPathname, 'w') as f:
        reader = SequenceFile.Reader(args.pathname)

        key_class = reader.getKeyClass()
        value_class = reader.getValueClass()

        key = key_class()
        value = value_class()

        # reader.sync(4042)
        position = reader.getPosition()
        while reader.next(key, value):
            # print '*' if reader.syncSeen() else ' ',
            print >> f, '%s\t%s' % (key.toString(), value.toString())
            position = reader.getPosition()

        reader.close()
    end = datetime.datetime.now()
    delta = end - start
    print >> sys.stderr, "ELAPSED seq2tsv is %s" % elapsed(delta)
    return count
Example #12
def convert_from_sequencefiles(encrypted_data):
    partition_pattern = os.path.join(encrypted_data, "data/part-*")
    partition_files = glob.glob(partition_pattern)

    output_partition_files = []

    # Convert each partition from SequenceFile format to bytes
    for partition_file in partition_files:
        # Example taken from
        # https://github.com/matteobertozzi/Hadoop/blob/master/python-hadoop/examples/SequenceFileReader.py
        sequence_file_reader = SequenceFile.Reader(partition_file)
        key_class = sequence_file_reader.getKeyClass()
        value_class = sequence_file_reader.getValueClass()

        key = key_class()
        value = value_class()

        # FIXME: better way of generating intermediate file name
        output_partition_file = partition_file.replace("part-", "cpp-part-")

        # FIXME: Unclear if we need the below line
        #  position = sequence_file_reader.getPosition()
        has_next = sequence_file_reader.next(key, value)
        if has_next:
            with open(output_partition_file, "wb") as partition:
                while has_next:
                    partition.write(value.toBytes())
                    has_next = sequence_file_reader.next(key, value)
                    #  position = sequence_file_reader.getPosition()

            output_partition_files.append(output_partition_file)

        sequence_file_reader.close()

    return output_partition_files
Example #13
def mergeFiles(seq_file_name, directory, suffix):
    writer = SequenceFile.createWriter(seq_file_name, Text, BytesWritable)
    for filename in os.listdir(directory):
        if filename.endswith(suffix):
            f = open(os.path.join(directory, filename), 'rb')
            data = f.read()
            writeData(writer, filename, data)
    writer.close()
Example #14
def testWrite(filename):
    metadata = Metadata()
    metadata.set('Meta Key 0', 'Meta Value 0')
    metadata.set('Meta Key 1', 'Meta Value 1')

    writer = SequenceFile.createWriter(filename, LongWritable, LongWritable, metadata)
    writeData(writer)
    writer.close()
def test():
    writer = SequenceFile.createWriter('test.seq', LongWritable, LongWritable)
    writeData(writer)
    writer.close()

    writer = SequenceFile.createWriter('test-record.seq',
                                       LongWritable,
                                       LongWritable,
                                       compression_type=CompressionType.RECORD)
    writeData(writer)
    writer.close()

    writer = SequenceFile.createWriter('test-block.seq',
                                       LongWritable,
                                       LongWritable,
                                       compression_type=CompressionType.BLOCK)
    writeData(writer)
    writer.close()
Example #16
def testWrite(filename):
    metadata = Metadata()
    metadata.set('Meta Key 0', 'Meta Value 0')
    metadata.set('Meta Key 1', 'Meta Value 1')

    writer = SequenceFile.createWriter(filename, LongWritable, LongWritable,
                                       metadata)
    writeData(writer)
    writer.close()
Example #17
def init_parameters(topic_num, word_num, hadoop_hdfs_root):
    '''
    Initialize parameters, alpha, lambda and eta
    '''
    # parameter initialized
    numpy.random.seed(100000001)

    # file setting
    parameter_target_filename = 'parameters_for_0.txt'

    writer = SequenceFile.createWriter(parameter_target_filename,
                                       TypedBytesWritable, TypedBytesWritable)

    # For alpha
    _alpha = numpy.zeros(topic_num) + 1. / topic_num

    output_key_a = TypedBytesWritable()
    output_value_a = TypedBytesWritable()

    output_key_a.set('new_alpha')
    output_value_a.set(_alpha.tostring())

    writer.append(output_key_a, output_value_a)

    # For lambda
    _lambda = 1 * numpy.random.gamma(100., 1. / 100., (topic_num, word_num))

    output_key_l = TypedBytesWritable()
    output_value_l = TypedBytesWritable()

    output_key_l.set('new_lambda')
    output_value_l.set(_lambda.tostring())

    writer.append(output_key_l, output_value_l)

    # For eta
    _eta = numpy.zeros(word_num) + 1. / topic_num

    output_key_e = TypedBytesWritable()
    output_value_e = TypedBytesWritable()

    output_key_e.set('new_eta')
    output_value_e.set(_eta.tostring())

    writer.append(output_key_e, output_value_e)

    writer.close()

    subprocess.call("hadoop dfs -copyFromLocal " + parameter_target_filename +
                    " " + hadoop_hdfs_root,
                    shell=True,
                    stdout=file(os.devnull, "w"))
    os.remove(parameter_target_filename)

    return parameter_target_filename
def test():
    writer = SequenceFile.createWriter('test.seq',
                                       LongWritable,
                                       LongWritable)
    writeData(writer)
    writer.close()

    writer = SequenceFile.createWriter('test-record.seq',
                                       LongWritable,
                                       LongWritable,
                                       compression_type=CompressionType.RECORD)
    writeData(writer)
    writer.close()

    writer = SequenceFile.createWriter('test-block.seq',
                                       LongWritable,
                                       LongWritable,
                                       compression_type=CompressionType.BLOCK)
    writeData(writer)
    writer.close()
def count_file(filename):
    reader = SequenceFile.Reader(filename)

    key = Text()
    value = NullWritable()

    count = 0
    while reader.next(key, value):
        count += 1

    reader.close()
    return count
def test_text():

    from hadoop.io.compress.ZlibCodec import ZlibCodec
    from hadoop.io.compress.GzipCodec import GzipCodec
    from hadoop.io.compress.BZip2Codec import BZip2Codec
    from hadoop.io.compress.LzoCodec import LzoCodec
    from hadoop.io.compress.SnappyCodec import SnappyCodec

    writer = SequenceFile.createWriter('resume_compressed.seq', Text, Text,
                                       compression_codec=SnappyCodec(),
                                       compression_type=CompressionType.BLOCK)
    write_text_data(writer)
    writer.close()
def write_seq_file(file_name, data_dict):
    writer = SequenceFile.createWriter(file_name, Text, BytesWritable)
    for key, value in data_dict.iteritems():
        print key, ", " ,
        key_writer = Text()
        key_writer.set(key)
        
        value_writer = BytesWritable()
        iris.save(value, "temp.nc")
        with open("temp.nc", "rb") as f:
            value_writer.set(f.read())
        writer.append(key_writer, value_writer)
    writer.close()
def write_seq_file(file_name, data_dict):
    writer = SequenceFile.createWriter(file_name, Text, BytesWritable)
    for key, value in data_dict.iteritems():
        print key, ", ",
        key_writer = Text()
        key_writer.set(key)

        value_writer = BytesWritable()
        iris.save(value, "temp.nc")
        with open("temp.nc", "rb") as f:
            value_writer.set(f.read())
        writer.append(key_writer, value_writer)
    writer.close()
Example #23
def init_parameters(topic_num, word_num, hadoop_hdfs_root):
    '''
    Initialize parameters, alpha, lambda and eta
    '''
    # parameter initialized    
    numpy.random.seed(100000001)
    
    # file setting
    parameter_target_filename = 'parameters_for_0.txt'
    
    writer = SequenceFile.createWriter(parameter_target_filename, TypedBytesWritable, TypedBytesWritable)
    
    # For alpha
    _alpha = numpy.zeros(topic_num) + 1./topic_num
    
    output_key_a = TypedBytesWritable()
    output_value_a = TypedBytesWritable()

    output_key_a.set('new_alpha')
    output_value_a.set(_alpha.tostring())

    writer.append(output_key_a, output_value_a)
    
    # For lambda
    _lambda = 1*numpy.random.gamma(100., 1./100., (topic_num, word_num))
    
    output_key_l = TypedBytesWritable()
    output_value_l = TypedBytesWritable()

    output_key_l.set('new_lambda')
    output_value_l.set(_lambda.tostring())

    writer.append(output_key_l, output_value_l)
    
    # For eta
    _eta = numpy.zeros(word_num) + 1./topic_num
    
    output_key_e = TypedBytesWritable()
    output_value_e = TypedBytesWritable()

    output_key_e.set('new_eta')
    output_value_e.set(_eta.tostring())

    writer.append(output_key_e, output_value_e)

    writer.close()
    
    subprocess.call("hadoop dfs -copyFromLocal " + parameter_target_filename + " " + hadoop_hdfs_root, shell=True, stdout=file(os.devnull, "w"))
    os.remove(parameter_target_filename)
    
    return parameter_target_filename
Example #24
    def __init__(self):
        self._word_num = int(self.params['word_num'])
        self._document_num = int(self.params['document_num'])
        self._minibatch_size = int(self.params['minibatch_size'])
        self._meanchangethresh = float(self.params['meanchangethresh'])
        self._topic_num = int(self.params['topic_num'])

        self._tau0 = float(self.params['tau0'])
        self._updatect = float(self.params['updatect'])
        self._kappa = float(self.params['kappa'])

        rhot = pow(self._tau0 + self._updatect, -self._kappa)
        self._rhot = rhot

        # Load parameter from distributed cache
        parameter_reader = SequenceFile.Reader('./_params')
        key_class = parameter_reader.getKeyClass()
        value_class = parameter_reader.getValueClass()
        key_instance = key_class()
        value_instance = value_class()

        while parameter_reader.next(key_instance, value_instance):
            key_instance_str = key_instance.toString()
            if 'new_alpha' == key_instance_str:
                # For alpha
                self._alpha = value_instance.toString()
                self._alpha = numpy.fromstring(self._alpha)
                self._alpha.shape = self._topic_num
            elif 'new_lambda' == key_instance_str:
                # For lambda
                self._lambda = value_instance.toString()
                self._lambda = numpy.fromstring(self._lambda)
                self._lambda.shape = (self._topic_num, self._word_num)
            elif 'new_eta' == key_instance_str:
                # For eta
                self._eta = value_instance.toString()
                self._eta = numpy.fromstring(self._eta)
                self._eta.shape = self._word_num
            else:
                # Error
                sys.stderr.write("Something wrong in parameter_reader\n")
                sys.exit(1)

        parameter_reader.close()

        self._Elogbeta = self.dirichlet_expectation(self._lambda)
        self._expElogbeta = numpy.exp(self._Elogbeta)

        # initialize sstats
        self.sstats = numpy.zeros((self._topic_num, self._word_num))
        self.gamma = numpy.zeros((self._minibatch_size, self._topic_num))
Example #25
def make_text_null_seq(filename, reader):
    writer = SequenceFile.createWriter(filename, Text, NullWritable)

    key = Text()
    value = NullWritable()

    count = 0
    for x in reader:
        key.set(x)
        writer.append(key, value)
        count += 1

    writer.close()
    return count
def make_text_null_seq(filename, reader):
    writer = SequenceFile.createWriter(filename, Text, NullWritable)

    key = Text()
    value = NullWritable()

    count = 0
    for x in reader:
        key.set(x)
        writer.append(key, value)
        count += 1

    writer.close()
    return count
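
make_text_null_seq writes Text keys with NullWritable placeholder values, which is exactly the layout the count_file example earlier on this page reads back. A tiny round-trip sketch, assuming both helpers live in the same module; the file name and word list are illustrative.

if __name__ == '__main__':
    # Write three keys, then count them by re-reading the file.
    words = ['alpha', 'beta', 'gamma']
    written = make_text_null_seq('words.seq', iter(words))
    print written, count_file('words.seq')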
Example #27
 def seqReader(path):
     reader = SequenceFile.Reader(path)
     key_class = reader.getKeyClass()
     value_class = reader.getValueClass()
     key = key_class()
     value = value_class()
     position = reader.getPosition()
     while reader.next(key, value):
         position = reader.getPosition()
         name, d1, d2 = key.toString().split(".")
         nparr = np.array(value.toString().split(","),
                          np.uint8).reshape(int(d1), int(d2))
         #img = cv2.imdecode(nparr, cv2.CV_LOAD_IMAGE_COLOR)
         print nparr.shape
     reader.close()
 def seqReader(pathtpsaveimage):
     reader = SequenceFile.Reader(self.path)
     key_class = reader.getKeyClass()
     value_class = reader.getValueClass()
     key = key_class()
     value = value_class()
     position = reader.getPosition()
     while reader.next(key, value):
         position = reader.getPosition()
         name, d1, d2, ext = key.toString().split(".")
         print len(value.getBytes())
         nparr = np.fromstring(value.getBytes(), np.uint8)
         img = cv2.imdecode(nparr, cv2.CV_LOAD_IMAGE_COLOR)
         print np.array(img).size
     reader.close()
def test_text():

    from hadoop.io.compress.ZlibCodec import ZlibCodec
    from hadoop.io.compress.GzipCodec import GzipCodec
    from hadoop.io.compress.BZip2Codec import BZip2Codec
    from hadoop.io.compress.LzoCodec import LzoCodec
    from hadoop.io.compress.SnappyCodec import SnappyCodec

    writer = SequenceFile.createWriter('resume_compressed.seq',
                                       Text,
                                       Text,
                                       compression_codec=SnappyCodec(),
                                       compression_type=CompressionType.BLOCK)
    write_text_data(writer)
    writer.close()
Example #30
def SequenceFileIterator(path):
    reader = SequenceFile.Reader(path)

    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()

    key = key_class()
    value = value_class()

    position = reader.getPosition()

    while reader.next(key, value):
        yield (position, key.toString(), value.toString())
        position = reader.getPosition()

    reader.close()    
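
A short usage sketch for the generator above; 'test.seq' is an assumption borrowed from the writer examples elsewhere on this page, and since the Reader is only closed once the generator is exhausted, the loop should be allowed to run to completion.

if __name__ == '__main__':
    # Lazily iterate over (position, key, value) tuples.
    for position, key_str, value_str in SequenceFileIterator('test.seq'):
        print '[%6s] %s\t%s' % (position, key_str, value_str)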
Example #31
def exportSGY(rddFilename, sgyFilename):
    reader = SequenceFile.Reader(rddFilename)

    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()

    key = key_class()
    value = value_class()

    # reader.sync(4042)
    position = reader.getPosition()
    while reader.next(key, value):
        print('*' if reader.syncSeen() else ' ',
              '[%6s] %6s %6s' % (position, key.toString(), value.toString()))
        position = reader.getPosition()

    reader.close()
Example #32
def hadoop_input_stream(stream, size, url, params):

    stream.seek(0, 2)
    size = stream.tell()
    stream.seek(0)
    reader = SequenceFile.Reader(stream, length=size)

    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()

    key = key_class()
    value = value_class()

    while reader.next(key, value):
        yield key, value

    reader.close()
Example #33
 def __init__(self, path, metadict = None):
   FileBase.__init__(self, path)
   self._tmppostfix = 'seqtmp'
   self._postfix = 'seq'
   self._raw_key, self._raw_value = Text(), Text()
   self._item_count = 0
   tmpdict = metadict or {}
   tmpdict['name'] = 'SequenceFileWriter'
   tmpdict['ver'] = '0.1'
   from hadoop.io.SequenceFile import Metadata
   meta = Metadata()
   for k, v in tmpdict.items():
     meta.set(k, v)
   self._writer = SequenceFile.createWriter(self._gen_tmp_path(), Text, Text,
       metadata = meta,
       compression_type = SequenceFile.CompressionType.BLOCK)
   assert self._writer, "Failed Create Writer File handler"
Example #34
 def seqReader(pathtpsaveimage):
     reader = SequenceFile.Reader(self.path)
     key_class = reader.getKeyClass()
     value_class = reader.getValueClass()
     key = key_class()
     value = value_class()
     position = reader.getPosition()
     compression_codec = BZip2Codec()
     while reader.next(key, value):
         position = reader.getPosition()
         name, d1, d2, ext = key.toString().split(".")
         arr = compression_codec.decompress(value.getBytes())
         nparr = np.frombuffer(arr, np.uint8)
         try:
             img = cv2.imdecode(nparr, cv2.CV_LOAD_IMAGE_COLOR)
         except AttributeError:
             img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
         print name, img.shape
     reader.close()
Example #35
def testRead(filename):
    reader = SequenceFile.Reader(filename)

    metadata = reader.getMetadata()
    for meta_key, meta_value in metadata:
        print 'METADATA:', meta_key, meta_value

    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()

    key = key_class()
    value = value_class()

    position = reader.getPosition()
    while reader.next(key, value):
        print '*' if reader.syncSeen() else ' ',
        print '[%6s] %6s %6s' % (position, key.toString(), value.toString())
        position = reader.getPosition()

    reader.close()
Example #36
def main():
    inputfiles = sys.argv[1]

    call(['mkdir', os.path.join(options.tmpdir, 'tmp')])
    print "downloading inputfiles  %s" % (inputfiles)
    check_call([
        'hadoop', 'fs', '-copyToLocal', inputfiles,
        os.path.join(options.tmpdir, 'tmp')
    ])

    order = {}
    values = []

    for fname in os.listdir(os.path.join(options.tmpdir, 'tmp')):
        reader = SequenceFile.Reader(os.path.join(options.tmpdir, 'tmp',
                                                  fname))
        key_class = reader.getKeyClass()
        value_class = reader.getValueClass()
        key = key_class()
        value = value_class()
        while reader.next(key, value):
            order[int(key.get())] = value.get()
        reader.close()

    var = []
    for key, val in sorted(order.iteritems()):
        var.extend(val)

    var2 = np.array(var)

    print "reading templatefile %s" % (options.template)
    templatefile = ep.ExoFile(options.template, 'r')
    print "writing outputfile %s" % (options.output)
    newfile = ep.ExoFile(options.output, 'w')

    result = insert_vars(templatefile, newfile, (options.varname, ), (var2, ))

    print "removing inputfiles  %s" % (inputfiles)
    check_call(['rm', '-r', os.path.join(options.tmpdir, 'tmp')])
    print "Done!"
Example #37
    def __init__(self):
        numpy.random.seed(100000001)

        self._word_num = int(self.params['word_num'])
        self._meanchangethresh = float(self.params['meanchangethresh'])
        self._topic_num = int(self.params['topic_num'])

        # Load parameter from distributed cache
        parameter_reader = SequenceFile.Reader('./_params')
        key_class = parameter_reader.getKeyClass()
        value_class = parameter_reader.getValueClass()
        key_instance = key_class()
        value_instance = value_class()

        while parameter_reader.next(key_instance, value_instance):
            key_instance_str = key_instance.toString()
            if 'new_alpha' == key_instance_str:
                # For alpha
                self._alpha = value_instance.toString()
                self._alpha = numpy.fromstring(self._alpha)
                self._alpha.shape = self._topic_num
            elif 'new_lambda' == key_instance_str:
                # For lambda
                self._lambda = value_instance.toString()
                self._lambda = numpy.fromstring(self._lambda)
                self._lambda.shape = (self._topic_num, self._word_num)
            elif 'new_eta' == key_instance_str:
                # For eta
                # loading useless
                continue
            else:
                # Error
                sys.stderr.write("Something wrong in parameter_reader\n")
                sys.exit(1)

        parameter_reader.close()

        self._Elogbeta = self.dirichlet_expectation(self._lambda)
        self._expElogbeta = numpy.exp(self._Elogbeta)
Example #38
def importSGY(sgyFilename, rddFilename):

    # os.remove(rddFilename)
    fp = open(sgyFilename, 'rb')
    writer = SequenceFile.createWriter(rddFilename, IntWritable, BytesWritable)

    SH = segypy.getSegyHeader(sgyFilename, 3600, segypy.endian)
    bps = segypy.getBytePerSample(SH)

    filesize = os.path.getsize(sgyFilename)
    samp_count = SH['ns']
    data_len = samp_count * bps
    trace_size = data_len + 240
    ntraces = (filesize - 3600) / trace_size

    data = fp.read(3600)
    for trace_num in range(ntraces):
        SegyTraceHeader = fp.read(240)
        SegyTraceData = fp.read(data_len)
        # FIXME: segypy.getValue is not correct
        SegyTraceData = segypy.getValue(
            SegyTraceData, 0, 'float', segypy.endian, samp_count)
        writer.append(IntWritable(trace_num), BytesWritable(
            str(SegyTraceHeader) + str(SegyTraceData)))
# with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from hadoop.io.SequenceFile import CompressionType
from hadoop.io import BytesWritable
from hadoop.io import LongWritable
from hadoop.io import SequenceFile

def writeData(writer):
    key = BytesWritable()
    value = BytesWritable()

    # for i in xrange(1000):
    key.set("A")
    value.set("B")
    print '[%d] %s %s' % (writer.getLength(), key.toString(), value.toString())
    writer.append(key, value)

if __name__ == '__main__':
    writer = SequenceFile.createWriter('test-bytes.seq', BytesWritable, BytesWritable)
    writeData(writer)
    writer.close()

Example #40
def compile_data(input_str, substitute, outputpath='', compression=False, test_file=False, p=None):
    temp = input_str.rpartition(os.sep)
    path_temp = temp[0]
    file_temp = temp[2]

    if outputpath is not '':
        try:
            os.mkdir(outputpath)
        except: pass

    if not p==None:
        global pool
        pool=p

    ica_key, ica_val, raw_key, raw_val = Text(), Text(), Text(), Text()

    for i, v in enumerate(substitute):

        path_to_data = path_temp.replace('?', str(v))
        filename = file_temp.replace('?', str(v))

        eeg = get_eeg(path_to_data + os.sep, filename)

        if eeg is not 1:
            raw_data, ica_act = read_full_float(eeg)
        else:
            continue
        if raw_data is None:
            continue

        print(filename + ': identifying outliers')
        artifact_indexes = find_artifact_indexes(eeg, ica_act)
        eeg['artifact_indexes'] = artifact_indexes;

        f=open('..\\artifact_indexes', 'w')
        pickle.dump(artifact_indexes,f)
        f.close()

        eegstr = pickle.dumps(eeg, protocol=2)

        print(filename + ': compiling dataset into hadoop sequence file')

        if outputpath is '':
            outputpath = path_to_data;

        #Enable compression if requested
        if compression:
            comp_type=SequenceFile.CompressionType.RECORD
        else:
            comp_type=SequenceFile.CompressionType.NONE

        writer = SequenceFile.createWriter(outputpath + os.sep + filename + '.seq', Text, Text, compression_type=comp_type)

        for i in range(raw_data.shape[1]):
            if test_file and i > 3:
                break

            this_raw = np.ascontiguousarray(raw_data[:,i], dtype=raw_data.dtype)
            this_ica = np.ascontiguousarray(ica_act[:,i], dtype=ica_act.dtype)

            ica_key.set(outputpath + os.sep + filename + '.ica.' + str(i+1))
            raw_key.set(outputpath + os.sep + filename + '.raw.' + str(i+1))

            ica_temp = pickle.dumps((this_ica, eegstr), protocol=2)
            raw_temp = pickle.dumps((this_raw, eegstr), protocol=2)

            ica = base64.b64encode(ica_temp)
            raw = base64.b64encode(raw_temp)

            ica_val.set(ica)
            raw_val.set(raw)

            writer.append(raw_key, raw_val)
            writer.append(ica_key, ica_val)

            print(filename + ': '+str(i+1))

        writer.close()
        print  filename + ': finished writing file'

    return 0
#!/usr/bin/env python

from hadoop.io import SequenceFile, Text
from hadoop.io.SequenceFile import CompressionType

import msgpack
import re
import itertools

writer = SequenceFile.createWriter ('pg10.seq', Text, Text, compression_type=CompressionType.BLOCK)

key = Text()
value = Text()

with open ('pg10.txt', 'r') as f:
    for pos, line in enumerate(iter (f.readline, '')):
        line = line.strip()

        pos = msgpack.packb (pos)
        line = msgpack.packb (line)

        key._bytes = pos
        key._length = len(pos)
        value._bytes = line
        value._length = len(line)

        writer.append (key, value)

writer.close()        
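
The script above pokes at the private key._bytes / key._length fields, presumably because the msgpack payload is not guaranteed to be valid UTF-8 text. A minimal alternative sketch, assuming the consumer can accept BytesWritable keys and values instead of Text (the output file name is illustrative), keeps the same structure but sticks to the public set() API:

#!/usr/bin/env python

from hadoop.io import SequenceFile, BytesWritable
from hadoop.io.SequenceFile import CompressionType

import msgpack

writer = SequenceFile.createWriter('pg10_bytes.seq', BytesWritable, BytesWritable,
                                   compression_type=CompressionType.BLOCK)

key = BytesWritable()
value = BytesWritable()

with open('pg10.txt', 'r') as f:
    for pos, line in enumerate(iter(f.readline, '')):
        # msgpack output is raw bytes, so BytesWritable.set() is enough here.
        key.set(msgpack.packb(pos))
        value.set(msgpack.packb(line.strip()))
        writer.append(key, value)

writer.close()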
Example #42
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys

from hadoop.io import SequenceFile

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('usage: SequenceFileReader <filename>')
    else:
        reader = SequenceFile.Reader(sys.argv[1])

        key_class = reader.getKeyClass()
        value_class = reader.getValueClass()

        key = key_class()
        value = value_class()

        #reader.sync(4042)
        position = reader.getPosition()
        while reader.next(key, value):
            print('*' if reader.syncSeen() else ' ', end=' ')
            print('[%6s] %6s %6s' %
                  (position, key.toString(), value.toString()))
            position = reader.getPosition()
Example #43
 def create_writer(self, writer=None):
     if writer:
         writer.close()
     seq_file = self.gernate_file()
     self.seq_filename = seq_file
     return SequenceFile.createWriter(seq_file, Text, BytesWritable, compression_type=CompressionType.RECORD)
Example #44
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from hadoop.io.SequenceFile import CompressionType
from hadoop.io import Text
from hadoop.io import SequenceFile

def writeData(writer):
    key = Text()
    value = Text()

    key.set('Key')
    value.set('Value')

    writer.append(key, value)

if __name__ == '__main__':
    writer = SequenceFile.createWriter('test.seq', Text, Text)
    writeData(writer)
    writer.close()
Example #45
def convert(inputfile, steps, outdir, variables):
    f = ep.ExoFile(inputfile,'r')
    total_time_steps = f.num_time_steps

    if total_time_steps < steps:
        print >> sys.stderr, 'The total time steps is', total_time_steps
        print >> sys.stderr, 'The partition step is', steps, '. No need to partition the file.'
        return False
        
    Vars = variables.split(',')
    
    # Get time data and coordinate (x,y,z) data
    time = f.cdf.variables["time_whole"]
    timedata = time.getValue()
    coordz = f.cdf.variables["coordz"]
    zdata = coordz.getValue()
    coordy = f.cdf.variables["coordy"]
    ydata = coordy.getValue()
    coordx = f.cdf.variables["coordx"]
    xdata = coordx.getValue()
    
    # To avoid PICKLE type in typedbytes files
    timedata2 = []
    for i, ele in enumerate(timedata):
        timedata2.append(float(ele))
    xdata2 = []
    for i, ele in enumerate(xdata):
        xdata2.append(float(ele))
    ydata2 = []
    for i, ele in enumerate(ydata):
        ydata2.append(float(ele))
    zdata2 = []
    for i, ele in enumerate(zdata):
        zdata2.append(float(ele))
    
    # Get variable data
    varnames = f.node_variable_names()
    vardata = []
    for i, var in enumerate(Vars):
        vindex = None
        for vi,n in enumerate(varnames):
            if n == var.strip():
                #vtemp = vi
                vindex = vi
                break
        if vindex == None:
            print  >> sys.stderr, 'The variable ', var.strip(), 'does not exist!'
            return False
        tmp = f.vars['vals_nod_var'+str(vindex+1)]
        tmpdata = tmp.getValue()
        vardata.append((var.strip(), tmpdata))
    
    # Begin to partition
    basename = os.path.basename(inputfile)
    ind = basename.rfind('.')
    basename = basename[0:ind]
    
    indexkey = TypedBytesWritable()
    indexvalue = TypedBytesWritable()
    indexwriter = SequenceFile.createWriter(os.path.join(outdir,'index.seq'), 
        TypedBytesWritable, TypedBytesWritable,compression_type=CompressionType.RECORD)
    
    begin = 0
    i = 0
    
    while begin < total_time_steps:
        end = begin + steps - 1
        if end > total_time_steps - 1:
            end = total_time_steps - 1
        outputfilename = basename + '_part'+ str(i) + '.seq'
        
        writer = SequenceFile.createWriter(os.path.join(outdir,outputfilename),
            TypedBytesWritable, TypedBytesWritable,compression_type=CompressionType.RECORD)
        key = TypedBytesWritable()
        value = TypedBytesWritable()
        key.set(-1)
        value.set(xdata2)
        writer.append(key,value)
        key.set(-2)
        value.set(ydata2)
        writer.append(key,value)
        key.set(-3)
        value.set(zdata2)
        writer.append(key,value)
        
        for j in xrange(begin, end+1):
            key.set((j,timedata2[j]))
            valuedata = []
            for m, var in enumerate(vardata):
                name = var[0]
                data = var[1][j]
                data2 = []
                for m, ele in enumerate(data):
                    data2.append(float(ele))
                valuedata.append((name,data2))
            value.set(valuedata)
            writer.append(key,value)
        writer.close()
        indexkey.set(outputfilename)
        indexvalue.set(end-begin+1)
        indexwriter.append(indexkey,indexvalue)
        begin = begin + steps
        i = i + 1
        
    indexkey.set('total')
    indexvalue.set(total_time_steps)
    indexwriter.append(indexkey,indexvalue)   
    indexwriter.close()
    
    return True
Example #46
#!/usr/bin/env python
import sys

#read the input files
for line in sys.stdin:
    print (1, line[5] or 'N/A')




#!/usr/bin/env python
import sys
from hadoop.io import LongWritable
from hadoop.io import SequenceFile

writer = SequenceFile.createWriter('reddit_posts.seq', LongWritable, LongWritable)

#read the input files
for line in sys.stdin:

    # use a try/except block to make sure an improperly formatted row does not blow up our program
    # remove leading and trailing whitespace
    # assume that the files are both comma delimited and only contain the columns described in assignment 4 part 1
    line = line.strip().split(",")

    _id, _text, = line[0], line[5] or 'N/A'

    key = LongWritable()
    key.set(int(_id))

    value = LongWritable()
Example #47
def convert(inputfile, steps, outdir, variables, normalized_timesteps):
    fset = 0
    
    fdir,fname = os.path.split(inputfile)
    fsetnum=''
    for i,c in enumerate(fname):
        if c.isdigit():
            fsetnum+=c
    if fsetnum != '':
        fset=int(fsetnum)
        
    f = ep.ExoFile(inputfile,'r')
        
    Vars = variables.split(',')
    
    # Get time data and coordinate (x,y,z) data
    time = f.cdf.variables["time_whole"]
    timedata = time.getValue()
    coordz = f.cdf.variables["coordz"]
    zdata = coordz.getValue()
    coordy = f.cdf.variables["coordy"]
    ydata = coordy.getValue()
    coordx = f.cdf.variables["coordx"]
    xdata = coordx.getValue()
    
    # To avoid PICKLE type in typedbytes files
    timedata2 = []
    for i, ele in enumerate(timedata):
        timedata2.append(float(ele))
    xdata2 = []
    for i, ele in enumerate(xdata):
        xdata2.append(float(ele))
    ydata2 = []
    for i, ele in enumerate(ydata):
        ydata2.append(float(ele))
    zdata2 = []
    for i, ele in enumerate(zdata):
        zdata2.append(float(ele))
    
    # Note: the size of normalized_timesteps should not be greater than 
    # num_time_steps in the exodus file.
    if normalized_timesteps is None:
        normalized_timesteps = timedata2
    
    total_time_steps = len(normalized_timesteps)
    
    # Get variable data
    varnames = f.node_variable_names()
    vardata = []
    for i, var in enumerate(Vars):
        vindex = None
        for vi,n in enumerate(varnames):
            if n == var.strip():
                #vtemp = vi
                vindex = vi
                break
        if vindex == None:
            print  >> sys.stderr, 'The variable ', var.strip(), 'does not exist!'
            return False
        tmp = f.vars['vals_nod_var'+str(vindex+1)]
        tmpdata = tmp.getValue()
        vardata.append((var.strip(), tmpdata))
    
    # Begin to partition
    basename = os.path.basename(inputfile)
    ind = basename.rfind('.')
    basename = basename[0:ind]
    
    indexkey = TypedBytesWritable()
    indexvalue = TypedBytesWritable()
    indexwriter = SequenceFile.createWriter(os.path.join(outdir,'index.seq'), 
        TypedBytesWritable, TypedBytesWritable,compression_type=CompressionType.RECORD)
    
    begin = 0
    i = 0
    
    time_begin = 0
    
    while begin < total_time_steps:
        end = begin + steps - 1
        if end > total_time_steps - 1:
            end = total_time_steps - 1
        outputfilename = basename + '_part'+ str(i) + '.seq'
        
        writer = SequenceFile.createWriter(os.path.join(outdir,outputfilename),
            TypedBytesWritable, TypedBytesWritable,compression_type=CompressionType.RECORD)
        key = TypedBytesWritable()
        value = TypedBytesWritable()
        key.set(-1)
        value.set(xdata2)
        writer.append(key,value)
        key.set(-2)
        value.set(ydata2)
        writer.append(key,value)
        key.set(-3)
        value.set(zdata2)
        writer.append(key,value)
        
        for j in xrange(begin, end+1):
            key.set((fset,(j,normalized_timesteps[j])))
            valuedata = []
            for m, var in enumerate(vardata):
                name = var[0]
                data = var[1]
                for t in xrange(time_begin, len(timedata2)):
                    if normalized_timesteps[j] == timedata2[t]:
                        normalized_data = data[t]
                        time_begin = t
                        break
                    elif normalized_timesteps[j] < timedata2[t]:
                        normalized_data =  linear_interpolate(normalized_timesteps[j], timedata2[t-1], data[t-1], timedata2[t], data[t])
                        break
                data2 = []
                for m, ele in enumerate(normalized_data):
                    data2.append(float(ele))
                valuedata.append((name,data2))
            value.set(valuedata)
            writer.append(key,value)
        writer.close()
        indexkey.set(outputfilename)
        indexvalue.set(end-begin+1)
        indexwriter.append(indexkey,indexvalue)
        begin = begin + steps
        i = i + 1
        
    indexkey.set('total')
    indexvalue.set(total_time_steps)
    indexwriter.append(indexkey,indexvalue)   
    indexwriter.close()
    
    return True