Example #1
 def __init__(self, dirname):
     self._data = SequenceFile.Reader(os.path.join(dirname, DATA_FILE_NAME))
     self._index = SequenceFile.Reader(
         os.path.join(dirname, INDEX_FILE_NAME))
     self._first_position = self._data.getPosition()
     self._positions = []
     self._keys = []
Example #2
def convert_from_sequencefiles(encrypted_data):
    partition_pattern = os.path.join(encrypted_data, "data/part-*")
    partition_files = glob.glob(partition_pattern)

    output_partition_files = []

    # Convert each partition from SequenceFile format to bytes
    for partition_file in partition_files:
        # Example taken from
        # https://github.com/matteobertozzi/Hadoop/blob/master/python-hadoop/examples/SequenceFileReader.py
        sequence_file_reader = SequenceFile.Reader(partition_file)
        key_class = sequence_file_reader.getKeyClass()
        value_class = sequence_file_reader.getValueClass()

        key = key_class()
        value = value_class()

        # FIXME: better way of generating intermediate file name
        output_partition_file = partition_file.replace("part-", "cpp-part-")

        # FIXME: Unclear if we need the below line
        #  position = sequence_file_reader.getPosition()
        has_next = sequence_file_reader.next(key, value)
        if has_next:
            with open(output_partition_file, "wb") as partition:
                while has_next:
                    partition.write(value.toBytes())
                    has_next = sequence_file_reader.next(key, value)
                    #  position = sequence_file_reader.getPosition()

            output_partition_files.append(output_partition_file)

        sequence_file_reader.close()

    return output_partition_files
Example #3
def desequence(seq_file,
               output_path,
               get_fname=lambda k, i: "file" + str(i) + ".nc"):
    """
    Takes a sequence file and writes out a separate NetCDF file
    for each value.

    seq_file: path to a seq file where the values are valid NetCDF binary blobs
    output_path: a string path to dump files to
    get_fname: a function which takes the key and an incremental integer,
                    and returns a string to be used as the file name.

    """
    reader = SequenceFile.Reader(seq_file)

    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()

    key = key_class()
    value = value_class()

    position = reader.getPosition()
    i = 0
    while reader.next(key, value):
        with open(output_path + get_fname(key, i), "wb") as f:
            f.write(value.getBytes())
        i += 1
    reader.close()
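A minimal usage sketch (the paths and file-name pattern below are hypothetical): desequence builds each target path by plain string concatenation, so output_path should point to an existing directory and end with a path separator.

# Hypothetical paths; the output directory must already exist and end with "/".
desequence("blobs.seq", "/tmp/netcdf_out/",
           get_fname=lambda k, i: "blob_%04d.nc" % i)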
Example #4
 def init_pailfile_source(self, **kwargs):
     return PailfileSource(
         self.logger,
         self.loop,
         kwargs['gate'],
         SequenceFile.Reader(kwargs['input'][0].path),
     )
Example #5
def main(argv=None):
    '''this is called if run from command line'''
    (prog, args) = interpretCmdLine()
    parser = argparse.ArgumentParser(prog, description='seq2tsv')
    # parser.add_argument()
    parser.add_argument("pathname")
    args = parser.parse_args(args)
    outputPathname = args.pathname + ".tsv"
    count = 0
    start = datetime.datetime.now()
    with open(outputPathname, 'w') as f:
        reader = SequenceFile.Reader(args.pathname)

        key_class = reader.getKeyClass()
        value_class = reader.getValueClass()

        key = key_class()
        value = value_class()

        # reader.sync(4042)
        position = reader.getPosition()
        while reader.next(key, value):
            # print '*' if reader.syncSeen() else ' ',
            print >> f, '%s\t%s' % (key.toString(), value.toString())
            count += 1
            position = reader.getPosition()

        reader.close()
    end = datetime.datetime.now()
    delta = end - start
    print >> sys.stderr, "ELAPSED seq2tsv is %s" % elapsed(delta)
    return count
Example #6
def count_file(filename):
    reader = SequenceFile.Reader(filename)

    key = Text()
    value = NullWritable()

    count = 0
    while reader.next(key, value):
        count += 1

    reader.close()
    return count
Example #7
    def __init__(self):
        self._word_num = int(self.params['word_num'])
        self._document_num = int(self.params['document_num'])
        self._minibatch_size = int(self.params['minibatch_size'])
        self._meanchangethresh = float(self.params['meanchangethresh'])
        self._topic_num = int(self.params['topic_num'])

        self._tau0 = float(self.params['tau0'])
        self._updatect = float(self.params['updatect'])
        self._kappa = float(self.params['kappa'])

        rhot = pow(self._tau0 + self._updatect, -self._kappa)
        self._rhot = rhot

        # Load parameter from distributed cache
        parameter_reader = SequenceFile.Reader('./_params')
        key_class = parameter_reader.getKeyClass()
        value_class = parameter_reader.getValueClass()
        key_instance = key_class()
        value_instance = value_class()

        while parameter_reader.next(key_instance, value_instance):
            key_instance_str = key_instance.toString()
            if 'new_alpha' == key_instance_str:
                # For alpha
                self._alpha = value_instance.toString()
                self._alpha = numpy.fromstring(self._alpha)
                self._alpha.shape = self._topic_num
            elif 'new_lambda' == key_instance_str:
                # For lambda
                self._lambda = value_instance.toString()
                self._lambda = numpy.fromstring(self._lambda)
                self._lambda.shape = (self._topic_num, self._word_num)
            elif 'new_eta' == key_instance_str:
                # For eta
                self._eta = value_instance.toString()
                self._eta = numpy.fromstring(self._eta)
                self._eta.shape = self._word_num
            else:
                # Error
                sys.stderr.write("Something wrong in parameter_reader\n")
                sys.exit(1)

        parameter_reader.close()

        self._Elogbeta = self.dirichlet_expectation(self._lambda)
        self._expElogbeta = numpy.exp(self._Elogbeta)

        # initialize sstats
        self.sstats = numpy.zeros((self._topic_num, self._word_num))
        self.gamma = numpy.zeros((self._minibatch_size, self._topic_num))
Example #8
 def seqReader(pathtpsaveimage):
     reader = SequenceFile.Reader(self.path)
     key_class = reader.getKeyClass()
     value_class = reader.getValueClass()
     key = key_class()
     value = value_class()
     position = reader.getPosition()
     while reader.next(key, value):
         position = reader.getPosition()
         name, d1, d2, ext = key.toString().split(".")
         print len(value.getBytes())
         nparr = np.fromstring(value.getBytes(), np.uint8)
         img = cv2.imdecode(nparr, cv2.CV_LOAD_IMAGE_COLOR)
         print np.array(img).size
     reader.close()
Example #9
 def seqReader(path):
     reader = SequenceFile.Reader(path)
     key_class = reader.getKeyClass()
     value_class = reader.getValueClass()
     key = key_class()
     value = value_class()
     position = reader.getPosition()
     while reader.next(key, value):
         position = reader.getPosition()
         name, d1, d2 = key.toString().split(".")
         nparr = np.array(value.toString().split(","),
                          np.uint8).reshape(int(d1), int(d2))
         #img = cv2.imdecode(nparr, cv2.CV_LOAD_IMAGE_COLOR)
         print nparr.shape
     reader.close()
Example #10
def SequenceFileIterator(path):
    reader = SequenceFile.Reader(path)

    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()

    key = key_class()
    value = value_class()

    position = reader.getPosition()

    while reader.next(key, value):
        yield (position, key.toString(), value.toString())
        position = reader.getPosition()

    reader.close()
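A minimal consumption sketch (hypothetical path): the generator lazily yields the byte offset at which each record starts together with the stringified key and value, so it can be drained with a plain for loop.

# Hypothetical path; iterate over (offset, key, value) tuples.
for position, key_str, value_str in SequenceFileIterator("data/part-00000"):
    print("%d\t%s\t%s" % (position, key_str, value_str))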
Example #11
def hadoop_input_stream(stream, size, url, params):

    stream.seek(0, 2)
    size = stream.tell()
    stream.seek(0)
    reader = SequenceFile.Reader(stream, length=size)

    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()

    key = key_class()
    value = value_class()

    while reader.next(key, value):
        yield key, value

    reader.close()
Example #12
def exportSGY(rddFilename, sgyFilename):
    reader = SequenceFile.Reader(rddFilename)

    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()

    key = key_class()
    value = value_class()

    # reader.sync(4042)
    position = reader.getPosition()
    while reader.next(key, value):
        print('*' if reader.syncSeen() else ' ',
              '[%6s] %6s %6s' % (position, key.toString(), value.toString()))
        position = reader.getPosition()

    reader.close()
Example #13
 def seqReader(pathtpsaveimage):
     reader = SequenceFile.Reader(self.path)
     key_class = reader.getKeyClass()
     value_class = reader.getValueClass()
     key = key_class()
     value = value_class()
     position = reader.getPosition()
     compression_codec = BZip2Codec()
     while reader.next(key, value):
         position = reader.getPosition()
         name, d1, d2, ext = key.toString().split(".")
         arr = compression_codec.decompress(value.getBytes())
         nparr = np.frombuffer(arr, np.uint8)
         try:
             img = cv2.imdecode(nparr, cv2.CV_LOAD_IMAGE_COLOR)
         except AttributeError:
             img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
         print name, img.shape
     reader.close()
Example #14
def testRead(filename):
    reader = SequenceFile.Reader(filename)

    metadata = reader.getMetadata()
    for meta_key, meta_value in metadata:
        print 'METADATA:', meta_key, meta_value

    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()

    key = key_class()
    value = value_class()

    position = reader.getPosition()
    while reader.next(key, value):
        print '*' if reader.syncSeen() else ' ',
        print '[%6s] %6s %6s' % (position, key.toString(), value.toString())
        position = reader.getPosition()

    reader.close()
Example #15
def main():
    inputfiles = sys.argv[1]

    call(['mkdir', os.path.join(options.tmpdir, 'tmp')])
    print "downloading inputfiles  %s" % (inputfiles)
    check_call([
        'hadoop', 'fs', '-copyToLocal', inputfiles,
        os.path.join(options.tmpdir, 'tmp')
    ])

    order = {}
    values = []

    for fname in os.listdir(os.path.join(options.tmpdir, 'tmp')):
        reader = SequenceFile.Reader(os.path.join(options.tmpdir, 'tmp',
                                                  fname))
        key_class = reader.getKeyClass()
        value_class = reader.getValueClass()
        key = key_class()
        value = value_class()
        while reader.next(key, value):
            order[int(key.get())] = value.get()
        reader.close()

    var = []
    for key, val in sorted(order.iteritems()):
        var.extend(val)

    var2 = np.array(var)

    print "reading templatefile %s" % (options.template)
    templatefile = ep.ExoFile(options.template, 'r')
    print "writing outputfile %s" % (options.output)
    newfile = ep.ExoFile(options.output, 'w')

    result = insert_vars(templatefile, newfile, (options.varname, ), (var2, ))

    print "removing inputfiles  %s" % (inputfiles)
    check_call(['rm', '-r', os.path.join(options.tmpdir, 'tmp')])
    print "Done!"
Example #16
    def __init__(self):
        numpy.random.seed(100000001)

        self._word_num = int(self.params['word_num'])
        self._meanchangethresh = float(self.params['meanchangethresh'])
        self._topic_num = int(self.params['topic_num'])

        # Load parameter from distributed cache
        parameter_reader = SequenceFile.Reader('./_params')
        key_class = parameter_reader.getKeyClass()
        value_class = parameter_reader.getValueClass()
        key_instance = key_class()
        value_instance = value_class()

        while parameter_reader.next(key_instance, value_instance):
            key_instance_str = key_instance.toString()
            if 'new_alpha' == key_instance_str:
                # For alpha
                self._alpha = value_instance.toString()
                self._alpha = numpy.fromstring(self._alpha)
                self._alpha.shape = self._topic_num
            elif 'new_lambda' == key_instance_str:
                # For lambda
                self._lambda = value_instance.toString()
                self._lambda = numpy.fromstring(self._lambda)
                self._lambda.shape = (self._topic_num, self._word_num)
            elif 'new_eta' == key_instance_str:
                # For eta
                # loading useless
                continue
            else:
                # Error
                sys.stderr.write("Something wrong in parameter_reader\n")
                sys.exit(1)

        parameter_reader.close()

        self._Elogbeta = self.dirichlet_expectation(self._lambda)
        self._expElogbeta = numpy.exp(self._Elogbeta)
Example #17
def loadDatainES(filename,
                 index,
                 doctype,
                 dataFileType,
                 hostname="localhost",
                 port=9200,
                 mappingFilePath=None):
    try:
        print "Connecting to " + hostname + " at port:" + str(port)
        # es = Elasticsearch([{'host': hostname, 'port': port}])
        es = Elasticsearch([
            'https://*****:*****@' + hostname + ":" + str(port)
        ],
                           show_ssl_warnings=False)

        if mappingFilePath:
            with open(mappingFilePath) as m:
                mapping = m.read()
                #print "Mapping file:" + mapping
                es.indices.create(index=index, body=mapping, ignore=400)

        if dataFileType == "1":
            with open(filename) as f:
                d = json.load(f)
                for wp in d:
                    res = es.index(index=index,
                                   doc_type=doctype,
                                   body=wp,
                                   id=wp["uri"])
                    print "indexing id: " + res["_id"] + " for uri: " + wp[
                        "uri"]
        elif dataFileType == "0":
            with open(filename) as f:
                lines = f.readlines()

                for line in lines:
                    if line.strip() != "":
                        jsonurlobj = json.loads(line.strip())
                        objkey = jsonurlobj['uri']
                        res = es.index(index=index,
                                       doc_type=doctype,
                                       body=line)
                        print "indexing id: " + res[
                            "_id"] + " for uri: " + objkey
        elif dataFileType == "2":
            reader = SequenceFile.Reader(filename)
            key_class = reader.getKeyClass()
            value_class = reader.getValueClass()

            key = key_class()
            value = value_class()

            position = reader.getPosition()
            counter = 0

            bulk_data = []

            while reader.next(key, value):
                if value.toString().strip() != "":
                    data_dict = {}
                    line = value.toString()
                    for i in range(len(line)):
                        data_dict[header[i]] = line[i]

                    op_dict = {
                        "index": {
                            "_index": index,
                            "_type": doctype,
                            "_id": data_dict["uri"]
                        }
                    }
                    bulk_data.append(op_dict)
                    bulk_data.append(data_dict)
                    counter += 1

                    #                  //res = es.index(index=index,doc_type=doctype,body=value.toString(),id=objkey)
                    # bulk index the data
                    if counter % 10000 == 0:
                        res = es.bulk(index=index,
                                      body=bulk_data,
                                      refresh=True)
                        bulk_data = []

                position = reader.getPosition()
            reader.close()

            print "Errors:" + str(i)
    except Exception, e:
        stderr.write('ERROR: %s\n' % str(e))
        pass
Example #18
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys

from hadoop.io import SequenceFile

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('usage: SequenceFileReader <filename>')
    else:
        reader = SequenceFile.Reader(sys.argv[1])

        key_class = reader.getKeyClass()
        value_class = reader.getValueClass()

        key = key_class()
        value = value_class()

        #reader.sync(4042)
        position = reader.getPosition()
        while reader.next(key, value):
            print('*' if reader.syncSeen() else ' ', end=' ')
            print('[%6s] %6s %6s' %
                  (position, key.toString(), value.toString()))
            position = reader.getPosition()
Example #19
redis_conn = redis.Redis(db=2)
ids = {line.rstrip().upper():True for line in open('selected_ids_20_2')}
file_list = ["full/" + x for x in filter(lambda x:"part-" in x, os.listdir("full"))]
cores = multiprocessing.cpu_count()
chunked_list = chunker(file_list, cores)

pids = []

for x in range(0, cores):
    pid = os.fork()
    if pid == 0:

        for ef in chunked_list[x]:
            print("Proc %s doing %s" % (x, ef))
            reader = SequenceFile.Reader(ef)
            kc = reader.getKeyClass()
            vc = reader.getValueClass()

            k,v = kc(), vc()

            while reader.next(k,v):
                ks = k.toString()
                if ks in ids:
                    print("    setting %s" % ks)
                    redis_conn.set(ks, extract_important(v.toString()))
                else:
                    print("Not setting %s" % ks)

        sys.exit(0)
    else:
Example #20
from hadoop.io import SequenceFile
import time
import json
import pickle

#setimgkeys=set()
setvisualkeys = pickle.load(open("setvisualkeys.p", "r"))
visualvaluesdict = dict.fromkeys(list(setvisualkeys))
for visualkey in setvisualkeys:
    visualvaluesdict[visualkey] = set()

for part in xrange(1, 21):
    filename = "./trial01/part-r-000" + "%02d" % part
    reader = SequenceFile.Reader(filename)
    key_class = reader.getKeyClass()
    value_class = reader.getValueClass()
    key = key_class()
    value = value_class()
    position = reader.getPosition()
    while reader.next(key, value):
        if not reader.syncSeen():
            thisKey = key.toString()
            thisValue = value.toString()
            tmpj = json.loads(thisValue)
            #print tmpj
            for visualkey in setvisualkeys:
                try:
                    #print list(tmpj['hasImagePart'].copy().keys())
                    visualvaluesdict[visualkey] = visualvaluesdict[
                        visualkey].union(
                            [tmpj['hasImagePart'][visualkey]['featureValue']])
Example #21
def test_hadoop_fs_destination_sequence_files(sdc_builder, sdc_executor,
                                              cluster):
    """Test Hadoop FS destination configuring File Type to Sequence File.
    We use sequence files with a EL expression in the sequence key file.
    We use SequenceFile module to read the generated file.
    Hadoop File is copied to local file system.
    """

    # Configure prefix, suffix and directory
    FILES_PREFIX, FILES_SUFFIX = 'tst', 'seq'
    hdfs_directory = f'/tmp/out/{get_random_string(string.ascii_letters, 10)}'

    # Get Pipeline Builder
    pipeline_builder = sdc_builder.get_pipeline_builder()

    # Create Dev Raw Data Stage
    raw_data = '\n'.join(json.dumps(product) for product in PRODUCT_DATA_FIX)
    logger.info('Pipeline will write to HDFS directory %s ...', hdfs_directory)
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='JSON',
                                       raw_data=raw_data,
                                       stop_after_first_batch=True)

    # Create Hadoop FS Destination
    hadoop_fs = pipeline_builder.add_stage('Hadoop FS', type='destination')
    hadoop_fs.set_attributes(
        data_format='JSON',
        directory_template=hdfs_directory,
        files_prefix=FILES_PREFIX,
        files_suffix=FILES_SUFFIX,
        file_type='SEQUENCE_FILE',
        compression_type='RECORD',
        sequence_file_key='${record:value(\'/sequenceKey\')}')

    # Trigger the destination file to be closed after writing all data.
    hadoop_fs.set_attributes(max_records_in_file=len(PRODUCT_DATA_FIX))

    dev_raw_data_source >> hadoop_fs

    # Build and Start Pipeline. After first batch it finishes.
    pipeline = pipeline_builder.build('Hadoop FS Destination Sequence Key'
                                      ).configure_for_environment(cluster)
    sdc_executor.add_pipeline(pipeline)
    sdc_executor.start_pipeline(pipeline).wait_for_finished(timeout_sec=10)

    try:

        # Check that just one file is in the directory
        hdfs_fs_files = cluster.hdfs.client.list(hdfs_directory)
        assert len(hdfs_fs_files) == 1
        # Check the prefix and suffix
        hdfs_fs_filename = hdfs_fs_files[0]
        assert hdfs_fs_filename.startswith(FILES_PREFIX)
        assert hdfs_fs_filename.endswith(FILES_SUFFIX)

        # Download the file from HDFS to Local File System
        cluster.hdfs.client.download(f'{hdfs_directory}/{hdfs_fs_filename}',
                                     f'/tmp/{hdfs_fs_filename}')

        # Read the sequence file
        reader = SequenceFile.Reader(f'/tmp/{hdfs_fs_filename}')
        key_class = reader.getKeyClass()
        value_class = reader.getValueClass()
        key = key_class()
        value = value_class()

        # Convert list of dict to list of bytes
        product_data_expected = [
            json.dumps(row, separators=(',', ':')).encode()
            for row in PRODUCT_DATA_FIX
        ]

        for i in range(2):
            # Read the information
            reader.next(key, value)

            # Check if name, price and release are in value
            assert product_data_expected[i] == value.toString()

        reader.close()

    finally:
        logger.info('Deleting Hadoop FS directory %s ...', hdfs_directory)
        cluster.hdfs.client.delete(hdfs_directory, recursive=True)