def main(argv):
    """
    Main method.

    This method performs the following tasks:
    1. Parse command line arguments
    2. Retrieve credentials and connect to Cloudant and WebHDFS
    3. Connect to the Cloudant `_changes` feed for checkpointed document consumption
    4. Process each change individually
    5. Upon an exception, store the latest checkpoint to a local file and exit
    """

    # add options into the parser
    parser = configureOptions()
    (options, args) = parser.parse_args()
    checkRequiredArguments(options, parser)
    print options

    # configurations
    last_seq = options.last_seq

    # get credentials
    perm_file = '%s/.clou' % os.environ['HOME']
    creds = get_creds(perm_file)

    # connect to source database
    s = Server('https://%s:%s@%s' % (creds['cloudant_user'],
                                     creds['cloudant_pwd'],
                                     options.uri))
    db = s[options.dbname]
    # print db.info()

    # connect to target hdfs cluster
    hdfs = PyWebHdfsClient(host=options.hdfs_host, port=options.hdfs_port,
                           user_name=creds['hdfs_user'])
    hdfs.make_dir(options.hdfs_path)

    # and here we consume the cloudant `_changes` feed
    counter = 0
    changestream = ChangesStream(db, include_docs=True, heartbeat=True,
                                 since=last_seq)
    for c in changestream:
        # print c
        try:
            if counter % 100 == 0:
                checkpoint(last_seq)
            seq = processChange(hdfs, c, options.hdfs_path)
            if seq:  # protect against the last line being blank
                last_seq = seq
            counter += 1
        except Exception:
            traceback.print_exc()
            checkpoint(last_seq)
            os._exit(1)

    checkpoint(last_seq)
class HdfsHandler:

    def __init__(self, hadoopHost, hadoopPort='50070', user='******'):
        # self.hdfs = PyWebHdfsClient(host='52.14.121.163', port='50070', user_name='hadoop')
        self.hdfs = PyWebHdfsClient(host=hadoopHost, port=hadoopPort, user_name=user)
        self.s3_client = boto3.client('s3')

    def copyToHDFS(self, src_path, hdfs_path):
        if hdfs_path.startswith("hdfs"):
            temp_path = hdfs_path.split("8020")
            self.new_hdfs_path = temp_path[1] + '/lib'
            print "New Path: %s" % self.new_hdfs_path

        jar_name = os.path.basename(src_path)
        print src_path
        fileContent = open(src_path, 'rb').read()

        # copies file to local for testing purposes
        # with open("E:/temp/java-0.0.2.jar", "wb") as jarfile:
        #     jarfile.write(fileContent)

        # create a new file on hdfs
        print('making new file at: {0}\n'.format(jar_name))
        result = self.hdfs.create_file(self.new_hdfs_path + "/" + jar_name,
                                       fileContent, overwrite=True)
        print "HDFS Copy Result: %s" % result
        return result

    def list_hdfs_dir(self, hdfs_path):
        print self.hdfs.list_dir(hdfs_path)
def load(self, job, task, fifo):
    self.job = job
    self.task = task
    self.fifo = fifo
    self.key = None
    self.script_proc = None
    self.decompress_obj = None
    self.pycurl_callback_exception = None

    if task.data['scheme'] == 's3':
        self.is_anonymous = (job.spec.source.aws_access_key is None or
                             job.spec.source.aws_secret_key is None)
        if self.is_anonymous:
            s3_conn = S3Connection(anon=True)
        else:
            s3_conn = S3Connection(job.spec.source.aws_access_key,
                                   job.spec.source.aws_secret_key)
        bucket = s3_conn.get_bucket(task.data['bucket'])
        try:
            self.key = bucket.get_key(task.data['key_name'])
        except S3ResponseError as e:
            raise WorkerException(
                "Received %s %s accessing `%s`, aborting" %
                (e.status, e.reason, task.data['key_name']))
    elif task.data['scheme'] == 'hdfs':
        fname = task.data['key_name']
        client = PyWebHdfsClient(job.spec.source.hdfs_host,
                                 job.spec.source.webhdfs_port,
                                 user_name=job.spec.source.hdfs_user)
        try:
            filesize = client.get_file_dir_status(fname)['FileStatus']['length']
        except pywebhdfs.errors.FileNotFound:
            raise WorkerException("File '%s' does not exist on HDFS" % fname)
        self.key = AttrDict({'name': fname, 'size': filesize})
    elif task.data['scheme'] == 'file':
        globber = glob2.Globber()
        fname = globber._normalize_string(task.data['key_name'])
        if not os.path.exists(fname):
            raise WorkerException(
                "File '%s' does not exist on this filesystem" % fname)
        elif not os.path.isfile(fname):
            raise WorkerException("File '%s' exists, but is not a file" % fname)
        self.key = AttrDict({'name': fname, 'size': os.path.getsize(fname)})
    else:
        raise WorkerException('Unsupported job with paths: %s' %
                              [str(p) for p in self.job.paths])

    if self.key is None:
        raise WorkerException(
            'Failed to find key associated with task ID %s' % task.task_id)

    self.metrics = DownloadMetrics(self.key.size)
def upload_file():
    """
    Upload file
    ---
    tags:
      - Files
    consumes: "multipart/form-data"
    parameters:
      - name: file
        in: formData
        required: true
        paramType: body
        dataType: file
        type: file
    responses:
      200:
        description: Return a successful message
      401:
        description: Unauthorized
      400:
        description: Bad Request
      500:
        description: Server Internal error
    """
    # hard-coded config information; you should improve it
    hdfs = PyWebHdfsClient(host='webhdfs', port='50070', user_name='thanhson1085')
    if request.method == 'POST':
        file = request.files['file']
        if file and allowed_file(file.filename):
            filename = secure_filename(str(time.time()) + file.filename)
            my_file = 'tmp/thanhson1085/data/' + filename
            hdfs.create_file(my_file, file)
            return jsonify({'success': 'true'})
    return jsonify({'success': 'false'})
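# A minimal client-side sketch for exercising the upload endpoint above.
# The URL and file name are hypothetical (the route decorator is not shown
# in the snippet); adjust them to wherever the Flask app is actually served.
import requests

with open('example.csv', 'rb') as f:
    resp = requests.post('http://localhost:5000/upload',
                         files={'file': ('example.csv', f)})
print(resp.json())  # expected: {"success": "true"} on success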
def ship_udf(ic, function, hdfs_path=None, udf_name=None, database=None,
             overwrite=False):
    # extract some information from the function
    if udf_name is None:
        udf_name = function.name
    symbol = function.llvm_func.name
    ir = function.llvm_module.to_bitcode()
    return_type = udf_to_impala_type[function.signature.return_type.name]
    arg_types = [udf_to_impala_type[arg.name]
                 for arg in function.signature.args[1:]]

    # ship the IR to the cluster
    hdfs_client = PyWebHdfsClient(host=ic._nn_host, port=ic._webhdfs_port,
                                  user_name=ic._hdfs_user)
    if hdfs_path is None:
        hdfs_path = os.path.join(ic._temp_dir, udf_name + '.ll')
    if not hdfs_path.endswith('.ll'):
        raise ValueError("The HDFS file name must end with .ll")
    hdfs_client.create_file(hdfs_path.lstrip('/'), ir, overwrite=overwrite)

    # register the function in Impala
    if database is None:
        database = ic._temp_db
    impala_name = '%s.%s(%s)' % (database, udf_name, ', '.join(arg_types))
    if overwrite:
        ic._cursor.execute("DROP FUNCTION IF EXISTS %s" % impala_name)
    register_query = ("CREATE FUNCTION %s RETURNS %s LOCATION '%s' "
                      "SYMBOL='%s'" % (impala_name, return_type, hdfs_path,
                                       symbol))
    ic._cursor.execute(register_query)
class WhenTestingDeleteOperation(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs/old_dir'
        self.response = MagicMock()

    def test_rename_throws_exception_for_not_ok(self):
        self.response.status_code = httplib.BAD_REQUEST
        self.requests.delete.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.delete_file_dir(self.path)

    def test_rename_returns_true(self):
        self.response.status_code = httplib.OK
        self.requests.delete.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.delete_file_dir(self.path)
            self.assertTrue(result)
def __init__(self, host, port, user, logger):
    self._hdfs = PyWebHdfsClient(
        host=host, port=port, user_name=user, timeout=None)
    global LOGGER
    LOGGER = logger
    LOGGER.debug('webhdfs = %s@%s:%s', user, host, port)
class WhenTestingDeleteXattrOperation(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.path = 'user/hdfs/old_dir'
        self.xattr = 'user.test'
        self.response = MagicMock()

    @patch.object(Session, 'put')
    def test_delete_xattr_throws_exception_for_not_ok(self, mock_put):
        self.response.status_code = http_client.BAD_REQUEST
        mock_put.return_value = self.response
        with self.assertRaises(errors.PyWebHdfsException):
            self.webhdfs.delete_xattr(self.path, self.xattr)

    @patch.object(Session, 'put')
    def test_delete_xattr_returns_true(self, mock_put):
        self.response.status_code = http_client.OK
        mock_put.return_value = self.response
        result = self.webhdfs.delete_xattr(self.path, self.xattr)
        self.assertTrue(result)
class WhenTestingOpenOperation(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs'
        self.file_data = u'010101'
        self.response = MagicMock()
        self.response.content = self.file_data

    def test_read_throws_exception_for_not_ok(self):
        self.response.status_code = http_client.BAD_REQUEST
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.read_file(self.path)

    def test_read_returns_file(self):
        self.response.status_code = http_client.OK
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.read_file(self.path)
            self.assertEqual(result, self.file_data)
class WhenTestingGetXattrOperation(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.path = 'user/hdfs/old_dir'
        self.xattr = 'user.test'
        self.response = MagicMock()
        self.file_status = {
            "XAttrs": [
                {
                    "name": self.xattr,
                    "value": "1"
                }
            ]
        }
        self.response.json = MagicMock(return_value=self.file_status)

    @patch.object(Session, 'get')
    def test_get_xattr_throws_exception_for_not_ok(self, mock_get):
        self.response.status_code = http_client.BAD_REQUEST
        mock_get.return_value = self.response
        with self.assertRaises(errors.PyWebHdfsException):
            self.webhdfs.get_xattr(self.path, self.xattr)

    @patch.object(Session, 'get')
    def test_get_xattr_returns_true(self, mock_get):
        self.response.status_code = http_client.OK
        mock_get.return_value = self.response
        result = self.webhdfs.get_xattr(self.path, self.xattr)
        for key in result:
            self.assertEqual(result[key], self.file_status[key])
class WhenTestingListXattrsOperation(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.path = 'user/hdfs/old_dir'
        self.response = MagicMock()
        self.file_status = {
            "XAttrNames": [
                "[\"XATTRNAME1\",\"XATTRNAME2\",\"XATTRNAME3\"]"
            ]
        }
        self.response.json = MagicMock(return_value=self.file_status)

    @patch.object(Session, 'get')
    def test_list_xattrs_throws_exception_for_not_ok(self, mock_get):
        self.response.status_code = http_client.BAD_REQUEST
        mock_get.return_value = self.response
        with self.assertRaises(errors.PyWebHdfsException):
            self.webhdfs.list_xattrs(self.path)

    @patch.object(Session, 'get')
    def test_list_xattrs_returns_true(self, mock_get):
        self.response.status_code = http_client.OK
        mock_get.return_value = self.response
        result = self.webhdfs.list_xattrs(self.path)
        for key in result:
            self.assertEqual(result[key], self.file_status[key])
class WhenTestingDeleteOperation(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs/old_dir'
        self.response = MagicMock()

    def test_rename_throws_exception_for_not_ok(self):
        self.response.status_code = http_client.BAD_REQUEST
        self.requests.delete.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.delete_file_dir(self.path)

    def test_rename_returns_true(self):
        self.response.status_code = http_client.OK
        self.requests.delete.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.delete_file_dir(self.path)
            self.assertTrue(result)
class WhenTestingGetFileChecksumOperation(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.path = 'user/hdfs/old_dir'
        self.response = MagicMock()
        self.file_checksum = {
            "FileChecksum": {
                "algorithm": "MD5-of-1MD5-of-512CRC32",
                "bytes": ("000002000000000000000000729a144ad5e9399f70c9bedd757"
                          "2e6bf00000000"),
                "length": 28
            }
        }
        self.response.json = MagicMock(return_value=self.file_checksum)

    @patch.object(Session, 'get')
    def test_get_status_throws_exception_for_not_ok(self, mock_get):
        self.response.status_code = http_client.BAD_REQUEST
        mock_get.return_value = self.response
        with self.assertRaises(errors.PyWebHdfsException):
            self.webhdfs.get_file_checksum(self.path)

    @patch.object(Session, 'get')
    def test_get_status_returns_true(self, mock_get):
        self.response.status_code = http_client.OK
        mock_get.return_value = self.response
        result = self.webhdfs.get_file_checksum(self.path)
        for key in result:
            self.assertEqual(result[key], self.file_checksum[key])
class WhenTestingRenameOperation(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.path = 'user/hdfs/old_dir'
        self.new_path = '/user/hdfs/new_dir'
        self.response = MagicMock()
        self.rename = {"boolean": True}
        self.response.json = MagicMock(return_value=self.rename)

    @patch.object(Session, 'put')
    def test_rename_throws_exception_for_not_ok(self, mock_put):
        self.response.status_code = http_client.BAD_REQUEST
        mock_put.return_value = self.response
        with self.assertRaises(errors.PyWebHdfsException):
            self.webhdfs.rename_file_dir(self.path, self.new_path)

    @patch.object(Session, 'put')
    def test_rename_returns_true(self, mock_put):
        self.response.status_code = http_client.OK
        mock_put.return_value = self.response
        result = self.webhdfs.rename_file_dir(self.path, self.new_path)
        self.assertEqual(result, {"boolean": True})
def setUp(self):
    self.host = 'hostname'
    self.port = '00000'
    self.user_name = 'username'
    self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                   user_name=self.user_name)
    self.response = MagicMock()
    self.requests = MagicMock(return_value=self.response)
    self.path = 'user/hdfs/old_dir'
    self.response = MagicMock()
    self.file_status = {
        "FileStatus": {
            "accessTime": 0,
            "blockSize": 0,
            "group": "supergroup",
            "length": 0,
            "modificationTime": 1320173277227,
            "owner": "webuser",
            "pathSuffix": "",
            "permission": "777",
            "replication": 0,
            "type": "DIRECTORY"
        }
    }
    self.response.json = MagicMock(return_value=self.file_status)
class WhenTestingOpenOperation(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.path = 'user/hdfs'
        self.file_data = u'010101'
        self.response = MagicMock()
        self.response.content = self.file_data

    @patch.object(Session, 'get')
    def test_read_throws_exception_for_not_ok(self, mock_get):
        self.response.status_code = http_client.BAD_REQUEST
        mock_get.return_value = self.response
        with self.assertRaises(errors.PyWebHdfsException):
            self.webhdfs.read_file(self.path)

    @patch.object(Session, 'get')
    def test_read_returns_file(self, mock_get):
        self.response.status_code = http_client.OK
        mock_get.return_value = self.response
        result = self.webhdfs.read_file(self.path)
        self.assertEqual(result, self.file_data)

    @patch.object(Session, 'get')
    def test_stream_returns_generator(self, mock_get):
        self.response.status_code = http_client.OK
        mock_get.return_value = self.response
        result = self.webhdfs.stream_file(self.path)
        self.assertIsInstance(result, types.GeneratorType)
def setUp(self):
    self.host = 'hostname'
    self.port = '00000'
    self.user_name = 'username'
    self.path = 'user/hdfs'
    self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                   user_name=self.user_name)
def to_hdfs(file_path, hdfs_path):
    hdfs = PyWebHdfsClient(host='hdfs-v1', port='50070', user_name='hdfs',
                           timeout=100)
    with open(file_path, 'rb') as f:
        hdfs.create_file(hdfs_path, f, overwrite=True)
class WhenTestingRenameOperation(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs/old_dir'
        self.new_path = '/user/hdfs/new_dir'
        self.response = MagicMock()
        self.rename = {"boolean": True}
        self.response.json = MagicMock(return_value=self.rename)

    def test_rename_throws_exception_for_not_ok(self):
        self.response.status_code = http_client.BAD_REQUEST
        self.requests.put.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.rename_file_dir(self.path, self.new_path)

    def test_rename_returns_true(self):
        self.response.status_code = http_client.OK
        self.requests.put.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.rename_file_dir(self.path, self.new_path)
            self.assertEqual(result, {"boolean": True})
def solarLog_call(epoch_time):
    conn = http.client.HTTPConnection("")
    r = requests.get("http://winsun.solarlog-web.ch/api?cid=" + pfadheimBaarCID
                     + "&locale=de_ch&username=277555406"
                     + "&password=5a03cdf0a3ff42de09bc85361d8a2f0f"
                     + "&function=dashboard&format=jsonh&solarlog=9112"
                     + "&tiles=Yield|true,Grafic|true,Env|true,Weather|true"
                     + "&ctime=" + epoch_time)
    logging.info("Response: " + str(r.status_code) + " " + r.reason)

    data = r.json()  # This will return the entire content.
    data['timestamp'] = epoch_time

    # Remove keys with a complex JSON structure
    del data['cur_production_per_wrid']
    del data['invEnergyType']
    # del data['decimalseperator']
    logging.debug(data)

    # write data to .json
    with open('/home/claude/repo/bda-solar/data/data_timestamp/pfadibaar_solarlog_'
              + epoch_time + '.json', 'w', encoding='utf-8') as outfile:
        json.dump(data, outfile, indent=4, ensure_ascii=False)

    # write the same data as .csv since it is easier to handle with hdfs
    with open('/home/claude/repo/bda-solar/data/data_timestamp/pfadibaar_solarlog_'
              + epoch_time + '.csv', 'w') as f:  # Just use 'w' mode in 3.x
        w = csv.DictWriter(f, data.keys(), dialect=csv.excel_tab)
        w.writeheader()
        w.writerow(data)

    # write the same data to hdfs
    hdfs = PyWebHdfsClient(host='193.246.208.147', port='50079', user_name='hdfs')
    # hdfs_path = 'user/hdfs/from_python'
    hdfs.create_file('user/hdfs/pfadibaar_solarlog.csv', '0100')
    # with open('pfadibaar_solarlog_' + epoch_time + '.csv') as file_data:
    #     hdfs.create_file(hdfs_path, data=file_data)

    conn.close()
class WhenTestingCreateUri(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.path = 'user/hdfs'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)

    def test_create_uri_no_kwargs(self):
        op = operations.CREATE
        uri = 'http://{host}:{port}/webhdfs/v1/' \
              '{path}?op={op}&user.name={user}' \
              .format(host=self.host, port=self.port, path=self.path,
                      op=op, user=self.user_name)
        result = self.webhdfs._create_uri(self.path, op)
        self.assertEqual(uri, result)

    def test_create_uri_with_kwargs(self):
        op = operations.CREATE
        mykey = 'mykey'
        myval = 'myval'
        uri = 'http://{host}:{port}/webhdfs/v1/' \
              '{path}?op={op}&{key}={val}' \
              '&user.name={user}' \
              .format(host=self.host, port=self.port, path=self.path,
                      op=op, key=mykey, val=myval, user=self.user_name)
        result = self.webhdfs._create_uri(self.path, op, mykey=myval)
        self.assertEqual(uri, result)
def setup_common_oozie_libs(name_node):
    webhdfs_port = '14000'
    webhdfs_user = '******'
    platform_dir = 'user/deployment/platform'
    lib_path_list = ['/opt/cloudera/parcels/CDH/lib/hbase/hbase-client.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-common.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-protocol.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-server.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/lib/htrace-core.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop-compat.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-it.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-prefix-tree.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/lib/zookeeper.jar',
                     '/opt/cloudera/parcels/CDH/lib/pig/piggybank.jar',
                     '/opt/cloudera/parcels/CDH/lib/spark/lib/spark-examples.jar']

    # Set up a connection with HDFS using the namenode.
    hdfs_client = PyWebHdfsClient(host=name_node, port=webhdfs_port,
                                  user_name=webhdfs_user, timeout=None)
    # Create the directory on the Hadoop file system (HDFS).
    hdfs_client.make_dir(platform_dir)

    # Create each file on HDFS and write its contents from the local FS.
    for path in lib_path_list:
        platform_file = '%s/%s' % (platform_dir, os.path.basename(path))
        logging.info('Copying source file: %s to HDFS path %s', path,
                     platform_file)
        with open(path) as file_data:
            hdfs_client.create_file(platform_file, file_data, overwrite=True)
class WhenTestingDeleteXattrOperation(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs/old_dir'
        self.xattr = 'user.test'
        self.response = MagicMock()

    def test_delete_xattr_throws_exception_for_not_ok(self):
        self.response.status_code = http_client.BAD_REQUEST
        self.requests.return_value = self.response
        with patch('requests.sessions.Session.put', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.delete_xattr(self.path, self.xattr)

    def test_delete_xattr_returns_true(self):
        self.response.status_code = http_client.OK
        self.requests.return_value = self.response
        with patch('requests.sessions.Session.put', self.requests):
            result = self.webhdfs.delete_xattr(self.path, self.xattr)
            self.assertTrue(result)
class WhenTestingOpenOperation(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs'
        self.file_data = '010101'
        self.response = MagicMock()
        self.response.text = self.file_data

    def test_read_throws_exception_for_not_ok(self):
        self.response.status_code = httplib.BAD_REQUEST
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.read_file(self.path)

    def test_read_returns_file(self):
        self.response.status_code = httplib.OK
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.read_file(self.path)
            self.assertEqual(result, self.file_data)
class WhenTestingListDirOperation(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.path = 'user/hdfs/old_dir'
        self.response = MagicMock()
        self.file_status = {
            "FileStatuses": {
                "FileStatus": [
                    {
                        "accessTime": 0,
                        "blockSize": 0,
                        "group": "supergroup",
                        "length": 24930,
                        "modificationTime": 1320173277227,
                        "owner": "webuser",
                        "pathSuffix": "a.patch",
                        "permission": "777",
                        "replication": 0,
                        "type": "FILE"
                    },
                    {
                        "accessTime": 0,
                        "blockSize": 0,
                        "group": "supergroup",
                        "length": 0,
                        "modificationTime": 1320173277227,
                        "owner": "webuser",
                        "pathSuffix": "",
                        "permission": "777",
                        "replication": 0,
                        "type": "DIRECTORY"
                    }
                ]
            }
        }
        self.response.json = MagicMock(return_value=self.file_status)

    @patch.object(Session, 'get')
    def test_get_status_throws_exception_for_not_ok(self, mock_get):
        self.response.status_code = http_client.BAD_REQUEST
        mock_get.return_value = self.response
        with self.assertRaises(errors.PyWebHdfsException):
            self.webhdfs.list_dir(self.path)

    @patch.object(Session, 'get')
    def test_get_status_returns_true(self, mock_get):
        self.response.status_code = http_client.OK
        mock_get.return_value = self.response
        result = self.webhdfs.list_dir(self.path)
        for key in result:
            self.assertEqual(result[key], self.file_status[key])
def downParts(fpath):
    """Download Spark output (the part-0??? series of files) from HDFS,
    aggregating them into a single local file while downloading."""
    from pywebhdfs.webhdfs import PyWebHdfsClient
    hdfs = PyWebHdfsClient(host='iasp76', port='12003', user_name='mci')
    flist = hdfs.list_dir(fpath)
    x = flist['FileStatuses']['FileStatus']
    _SUCCESS = False
    for f in x:
        if f['pathSuffix'] == '_SUCCESS':
            _SUCCESS = True
            break
    if not _SUCCESS:
        print("not complete yet!")
        return
    fnames = [f['pathSuffix'] for f in x if f['pathSuffix'].startswith('part-')]
    fnames1 = sorted(fnames)
    foutname = fpath[fpath.rfind('/') + 1:]
    l = len(fnames1)
    with open(foutname, "wb") as fo:
        for fname in fnames1:
            fpath1 = fpath + "/" + fname
            fo.write(hdfs.read_file(fpath1))
            print(" progress: ", fname, l)
class HDFS(NDArray):
    """
    HDFS backed store.

    Parameters
    ----------
    name : str
        Name of the directory used to store the text files
        (path to the directory, without a leading '/')
    model : Model
        If None, the model is taken from the 'with' context
    vars : list of variables
        Sampling values will be stored for these variables. If None,
        'model.unobserved_RVs' is used
    host : str
        The IP address or hostname of the HDFS namenode. By default, 'localhost'
    port : str
        The port number for WebHDFS on the namenode. By default, '50070'
    user_name : str
        WebHDFS user_name used for authentication. By default, None
    """

    def __init__(self, name, model=None, vars=None, host='localhost',
                 port='50070', user_name=None):
        self.hdfs = PyWebHdfsClient(host=host, port=port, user_name=user_name)
        try:
            self.hdfs.list_dir(name)
        except FileNotFound:
            self.hdfs.make_dir(name)
        super(HDFS, self).__init__(name, model, vars)

    def close(self):
        super(HDFS, self).close()
        _dump_trace(self.name, self)
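# A minimal usage sketch for the HDFS backend above. The model, trace
# directory, and namenode address are hypothetical, and it assumes PyMC3's
# sample() accepts a backend instance via `trace=`, as it does for the other
# NDArray-derived backends.
import pymc3 as pm

with pm.Model():
    mu = pm.Normal('mu', mu=0, sd=1)
    backend = HDFS('user/hdfs/traces/demo', host='namenode.example.com',
                   port='50070', user_name='hdfs')
    trace = pm.sample(500, trace=backend)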
def submit(self, bund, files=[]):
    hdfs = PyWebHdfsClient(host=os.environ["WEBHDFS_HOST"], port='14000',
                           user_name='oozie')
    for f in files:
        hdfs.create_file("{}/{}".format(bund.path, f.name), f.read())

    doc, tag, text = Doc().tagtext()
    with tag("configuration"):
        with tag("property"):
            with tag("name"):
                text("user.name")
            with tag("value"):
                text("oozie")
        with tag("property"):
            with tag("name"):
                text("oozie.bundle.application.path")
            with tag("value"):
                text("/" + bund.path + "/" + bund.name)
    configuration = doc.getvalue()

    response = post("{0}/oozie/v1/jobs".format(self.url),
                    data=configuration,
                    headers={'Content-Type': 'application/xml'})
    if response.status_code > 399:
        print response.headers["oozie-error-message"]
        print response.status_code
        print response.content
def saveToStore(path, meta):
    con = happybase.Connection(MasterHbase)
    con.open()
    metaTable = con.table('MetaTable')
    if meta['size'] < largeSize:
        # save to Hbase
        encTable = con.table('EncTable')
        with open(path, 'rb') as f:
            encTable.put(meta['rowkey'], {'enc:data': f.read()})
        metaTable.put(str(meta['rowkey']), {
            'pp:name': str(meta['filename']),
            'pp:checksum': str(meta['checksum']),
            'pp:size': str(meta['size']),
            'pp:often': str(meta['often']),
            'pp:des': str(meta['description'])
        })
        app.logger.debug('%s is saved to Hbase', meta['rowkey'])
    else:
        # save to HDFS
        hdfs = PyWebHdfsClient(host=Master, port='50070', timeout=None,
                               user_name='hduser')
        with open(path, 'rb') as f:
            hdfs.create_file(HDFSMainPath + meta['rowkey'], f)
        metaTable.put(str(meta['rowkey']), {
            'pp:name': str(meta['filename']),
            'pp:checksum': str(meta['checksum']),
            'pp:size': str(meta['size']),
            'pp:HDFSpath': str(HDFSMainPath + meta['rowkey']),
            'pp:often': str(meta['often']),
            'pp:des': str(meta['description'])
        })
        app.logger.debug('%s is saved to HDFS', meta['rowkey'])
    con.close()
def save_extracted_subgraph(elements, args: application_args):
    pair, subgraph, _ = elements
    path = args.get_folder_results_path()
    hdfs = PyWebHdfsClient(host=args.hdfs_host, port=args.hdfs_port)
    file = os.path.join(path, f"graph_{str(pair[0])}_{str(pair[1])}")
    pickled = pkl.dumps(subgraph)
    hdfs.create_file(file, pickled, overwrite=True)
def close(self):
    # drop the temp database
    self._cursor.execute('USE %s' % self._temp_db)
    self._cursor.execute('SHOW TABLES')
    temp_tables = [x[0] for x in self._cursor.fetchall()]
    for table in temp_tables:
        self._cursor.execute('DROP TABLE IF EXISTS %s.%s' %
                             (self._temp_db, table))
    self._cursor.execute('SHOW FUNCTIONS')
    temp_udfs = [x[1] for x in self._cursor.fetchall()]
    for udf in temp_udfs:
        self._cursor.execute('DROP FUNCTION IF EXISTS %s.%s' %
                             (self._temp_db, udf))
    self._cursor.execute('SHOW AGGREGATE FUNCTIONS')
    temp_udas = [x[1] for x in self._cursor.fetchall()]
    for uda in temp_udas:
        self._cursor.execute('DROP AGGREGATE FUNCTION IF EXISTS %s.%s' %
                             (self._temp_db, uda))
    self._cursor.execute('USE default')
    self._cursor.execute('DROP DATABASE IF EXISTS %s' % self._temp_db)

    # drop the temp dir in HDFS
    try:
        from requests.exceptions import ConnectionError
        from pywebhdfs.webhdfs import PyWebHdfsClient
        hdfs_client = PyWebHdfsClient(host=self._nn_host,
                                      port=self._webhdfs_port,
                                      user_name=self._hdfs_user)
        hdfs_client.delete_file_dir(self._temp_dir.lstrip('/'), recursive=True)
    except ImportError:
        import sys
        sys.stderr.write("Could not import requests or pywebhdfs. "
                         "You must delete the temporary directory manually: %s"
                         % self._temp_dir)
    except ConnectionError:
        import sys
        sys.stderr.write("Could not connect via pywebhdfs. "
                         "You must delete the temporary directory manually: %s"
                         % self._temp_dir)
def test_create_throws_exception_for_not_created(self, mock_put):
    webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                              user_name=self.user_name)
    self.init_response.status_code = http_client.TEMPORARY_REDIRECT
    self.response.status_code = http_client.BAD_REQUEST
    mock_put.side_effect = [self.init_response, self.response]
    with self.assertRaises(errors.PyWebHdfsException):
        webhdfs.create_file(self.path, self.file_data)
def get_file_contents(self, hdfs_path, user_name='trifacta',
                      httpfs_port='14000'):
    hdfs = PyWebHdfsClient(host=urlparse(self.trifacta_base_url).netloc,
                           port=httpfs_port, user_name=user_name)
    return hdfs.read_file(hdfs_path).decode('utf-8')
def from_hdfs(hdfs_path, file_path):
    hdfs = PyWebHdfsClient(host='hdfs-v1', port='50070', user_name='hdfs',
                           timeout=100)
    binary_file = hdfs.read_file(hdfs_path)
    with open(file_path, 'wb') as f:
        f.write(binary_file)
def run(self):
    if "agg" in self.arg:
        # reading from a file into memory to stream later
        with open(self.path, "rb") as f:
            self.data_holder['data'] = json.dumps(cPickle.load(f))
        # indicating that reading into memory is finished for this data
        self.data_holder["indicator"] = 'ready'
    elif "raw" in self.arg:
        from pywebhdfs.webhdfs import PyWebHdfsClient
        hdfs = PyWebHdfsClient(host='cshadoop.boisestate.edu', port='50070',
                               user_name='uacharya')
        file_path = ('user/uacharya/flow/' + str(self.arg['d']) +
                     '/node_' + str(self.arg['n']) + '/output.csv')
        # reading the csv file into memory
        self.data_holder['data'] = hdfs.read_file(file_path, buffersize=4096)
        self.data_holder["indicator"] = 'ready'
    elif "bitmap" in self.arg:
        # putting the line data into an object to stream
        with open(self.path + "/data.json", "rb") as f:
            self.data_holder['data'] = json.dumps(cPickle.load(f))
        # not loading images into memory if there are no images
        if self.data_holder['data'] == '""':
            # indicating that reading into memory is finished for this data
            self.data_holder['frames'] = (0, [])
            self.data_holder["indicator"] = 'ready'
            return
        # in case there is data to stream, read all the PNGs into a list
        content_length = 0  # total content length in bytes of all images to stream
        PNGS = []  # list to hold all the png data in memory
        for x in xrange(1, 31):
            buf_string = cStringIO.StringIO()
            Image.open(self.path + "/imgs/" + str(x) + ".png").save(
                buf_string, format="PNG", quality=100)
            content_length = content_length + (buf_string.tell() + 4)
            PNGS.append(struct.pack('>I', buf_string.tell()) +
                        buf_string.getvalue())
            buf_string.close()
        self.data_holder['frames'] = (content_length, PNGS)
        # indicating that reading into memory is finished for this data
        self.data_holder["indicator"] = 'ready'
    else:
        raise InvalidFormatError(
            "the type of format is not available to read in memory")
def setUp(self):
    self.host = 'hostname'
    self.port = '00000'
    self.user_name = 'username'
    self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                   user_name=self.user_name)
    self.response = MagicMock()
    self.requests = MagicMock(return_value=self.response)
    self.path = 'user/hdfs/old_dir'
def setUp(self):
    self.host = 'hostname'
    self.port = '00000'
    self.user_name = 'username'
    self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                   user_name=self.user_name)
    self.path = 'user/hdfs/old_dir'
    self.xattr = 'user.test'
    self.response = MagicMock()
def setUp(self):
    self.host = 'hostname'
    self.port = '00000'
    self.user_name = 'username'
    self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                   user_name=self.user_name)
    self.path = 'user/hdfs'
    self.file_data = u'010101'
    self.response = MagicMock()
    self.response.content = self.file_data
def test_create_returns_file_location(self, mock_put):
    webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                              user_name=self.user_name)
    self.init_response.status_code = http_client.TEMPORARY_REDIRECT
    self.response.status_code = http_client.CREATED
    mock_put.side_effect = [self.init_response, self.response]
    result = webhdfs.create_file(self.path, self.file_data)
    self.assertTrue(result)
    mock_put.assert_called_with(self.location,
                                headers=self.expected_headers,
                                data=self.file_data)
def __init__(self, remote=None, namenodes=None, **kwargs):
    self.remote = remote
    self.namenodes = namenodes or []
    PyWebHdfsClient.__init__(self, **kwargs)
    if self.namenodes and 'path_to_hosts' not in kwargs:
        self.path_to_hosts = [('.*', self.namenodes)]
    # Override base uri
    self.base_uri_pattern = kwargs.get(
        'base_uri_pattern', "http://{host}/webhdfs/v1/").format(host="{host}")
class WhenTestingCreateUri(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.path = 'user/hdfs'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)

    def test_create_uri_no_kwargs(self):
        op = operations.CREATE
        uri = 'http://{{host}}:{port}/webhdfs/v1/' \
              '{path}?op={op}&user.name={user}' \
              .format(port=self.port, path=self.path, op=op,
                      user=self.user_name)
        result = self.webhdfs._create_uri(self.path, op)
        self.assertEqual(uri, result)

    def test_create_uri_with_kwargs(self):
        op = operations.CREATE
        mykey = 'mykey'
        myval = 'myval'
        uri = 'http://{{host}}:{port}/webhdfs/v1/' \
              '{path}?op={op}&{key}={val}' \
              '&user.name={user}' \
              .format(port=self.port, path=self.path, op=op,
                      key=mykey, val=myval, user=self.user_name)
        result = self.webhdfs._create_uri(self.path, op, mykey=myval)
        self.assertEqual(uri, result)

    def test_create_uri_with_leading_slash(self):
        op = operations.CREATE
        uri_path_no_slash = self.webhdfs._create_uri(self.path, op)
        uri_path_with_slash = self.webhdfs._create_uri('/' + self.path, op)
        self.assertEqual(uri_path_no_slash, uri_path_with_slash)

    def test_create_uri_with_unicode_path(self):
        op = operations.CREATE
        mykey = 'mykey'
        myval = 'myval'
        path = u'die/Stra\xdfe'
        quoted_path = 'die/Stra%C3%9Fe'
        uri = 'http://{{host}}:{port}/webhdfs/v1/' \
              '{path}?op={op}&{key}={val}' \
              '&user.name={user}' \
              .format(port=self.port, path=quoted_path, op=op,
                      key=mykey, val=myval, user=self.user_name)
        result = self.webhdfs._create_uri(path, op, mykey=myval)
        self.assertEqual(uri, result)
class WhenTestingFileExistsOperation(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs/old_dir'
        self.response = MagicMock()
        self.file_status = {
            "FileStatus": {
                "accessTime": 0,
                "blockSize": 0,
                "group": "supergroup",
                "length": 0,
                "modificationTime": 1320173277227,
                "owner": "webuser",
                "pathSuffix": "",
                "permission": "777",
                "replication": 0,
                "type": "DIRECTORY"
            }
        }
        self.response.json = MagicMock(return_value=self.file_status)

    def test_exists_throws_exception_for_error(self):
        self.response.status_code = http_client.BAD_REQUEST
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.exists_file_dir(self.path)

    def test_exists_returns_true(self):
        self.response.status_code = http_client.OK
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            self.assertTrue(self.webhdfs.exists_file_dir(self.path))

    def test_exists_returns_false(self):
        self.response.status_code = http_client.NOT_FOUND
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            self.assertFalse(self.webhdfs.exists_file_dir(self.path))
class WhenTestingCreateOperation(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.location = 'redirect_uri'
        self.path = 'user/hdfs'
        self.file_data = '010101'
        self.init_response = MagicMock()
        self.init_response.headers = {'location': self.location}
        self.response = MagicMock()
        self.expected_headers = {'content-type': 'application/octet-stream'}

    def test_create_throws_exception_for_no_redirect(self):
        self.init_response.status_code = httplib.BAD_REQUEST
        self.response.status_code = httplib.CREATED
        self.requests.put.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.create_file(self.path, self.file_data)

    def test_create_throws_exception_for_not_created(self):
        self.init_response.status_code = httplib.TEMPORARY_REDIRECT
        self.response.status_code = httplib.BAD_REQUEST
        self.requests.put.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.create_file(self.path, self.file_data)

    def test_create_returns_file_location(self):
        self.init_response.status_code = httplib.TEMPORARY_REDIRECT
        self.response.status_code = httplib.CREATED
        self.put_method = MagicMock(
            side_effect=[self.init_response, self.response])
        self.requests.put = self.put_method
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.create_file(self.path, self.file_data)
            self.assertTrue(result)
        self.put_method.assert_called_with(self.location,
                                           headers=self.expected_headers,
                                           data=self.file_data)
def create_data_from_station_data(first, second):
    """This function creates the data by analyzing the two stations in comparison."""
    global hdfs   # global hdfs object
    global hbase  # global hbase object

    if hdfs is None:
        from pywebhdfs.webhdfs import PyWebHdfsClient
        hdfs = PyWebHdfsClient(host='cshadoop.boisestate.edu', port='50070',
                               user_name='uacharya')
    if hbase is None:
        import happybase
        hbase = happybase.ConnectionPool(size=1, host='cshadoop.boisestate.edu')

    date_for_comparision = first["Date"].strip()

    # creating a directory for each date
    try:
        hdfs.get_file_dir_status('user/uacharya/simulation/' + date_for_comparision)
    except Exception:
        # directory to hold the dataset in a csv file for each node in the
        # wall display, numbered 1 to 9
        for index in range(1, 10):
            content = ('Date,ID,Source,Destination,S_Lat,S_Lon,D_Lat,D_Lon,'
                       'Wind_Lat,Wind_Lon,Wind_Velocity\n')
            try:
                hdfs.create_file('user/uacharya/simulation/' + date_for_comparision
                                 + '/node' + str(index) + '/output.csv',
                                 content, replication=1)
            except Exception:
                continue

    dataset = {'node_1': [], 'node_2': [], 'node_3': [], 'node_4': [],
               'node_5': [], 'node_6': [], 'node_7': [], 'node_8': [],
               'node_9': []}

    for data in broadcast_variable.value:
        compare_data_between(date_for_comparision, first, data, dataset)

    # for key in dataset:
    #     if len(dataset[key]) != 0:
    #         content = "\n".join(dataset[key])
    #         content += "\n"
    #         while True:
    #             try:
    #                 hdfs.append_file('user/uacharya/simulation/' + date + '/'
    #                                  + key + '/output.csv', content,
    #                                  buffersize=4096)
    #                 break
    #             except Exception:
    #                 time.sleep(0.2)
    #                 continue

    dataset.clear()  # clearing the dictionary
    # append over here after all the global variables have been made
    return second
def __init__(self, name, model=None, vars=None, host='localhost',
             port='50070', user_name=None):
    self.hdfs = PyWebHdfsClient(host=host, port=port, user_name=user_name)
    try:
        self.hdfs.list_dir(name)
    except FileNotFound:
        self.hdfs.make_dir(name)
    super(HDFS, self).__init__(name, model, vars)
def update_raw_stage(output, delivery_tag):
    # context = zmq.Context()
    # confirm = context.socket(zmq.PUSH)
    # confirm.connect(confirm_host)

    hdfs = PyWebHdfsClient(host=webhdfs_host, port=webhdfs_port,
                           user_name=webhdfs_user)
    impala_conn = connect(host=impala_host, port=int(impala_port))
    cur = impala_conn.cursor()

    start_time = time.time()
    for k, v in output.iteritems():
        if (time.time() - start_time) / 60 > sink_minutes:
            sink_logger.warning('ETL process running longer than sink timeout: '
                                '{0} minutes'.format((time.time() - start_time) / 60))
        try:
            file_name = 'user/impala/test_log_1/raw_log_{0}.txt'.format(k)
            hdfs.append_file(file_name, '\n'.join(v))
            cur.execute('refresh test_log_{0}'.format(k))
        except hdfs_err.PyWebHdfsException:
            file_name = 'user/impala/test_log_1/raw_log_{0}.txt'.format(k)
            hdfs.create_file(file_name, '')
            hdfs.append_file(file_name, '\n'.join(v))
            cur.execute('refresh test_log_{0}'.format(k))

    # confirm.send(delivery_tag)
    sink_logger.info('ETL process finished in {0} minutes'.format(
        (time.time() - start_time) / 60))
    sink_logger.info('ETL process finished with {0} delivery_tag'.format(delivery_tag))
def setup_common_oozie_libs(name_node):
    webhdfs_port = '14000'
    webhdfs_user = '******'
    platform_dir = 'user/deployment/platform'
    lib_path_list = ['/usr/hdp/current/hbase-client/lib/hbase-client.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-common.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-protocol.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-server.jar',
                     '/usr/hdp/current/hbase-client/lib/htrace-core-3.1.0-incubating.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-hadoop-compat.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-it.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-prefix-tree.jar',
                     '/usr/hdp/current/hbase-client/lib/zookeeper.jar',
                     '/usr/hdp/current/pig-client/piggybank.jar',
                     '/usr/hdp/current/spark-client/lib/spark-examples.jar']

    # Set up a connection with HDFS using the namenode.
    hdfs_client = PyWebHdfsClient(host=name_node, port=webhdfs_port,
                                  user_name=webhdfs_user, timeout=None)
    # Create the directory on the Hadoop file system (HDFS).
    hdfs_client.make_dir(platform_dir)

    # Create each file on HDFS and write its contents from the local FS.
    for path in lib_path_list:
        platform_file = '%s/%s' % (platform_dir, os.path.basename(path))
        print 'Copying source file: %s to HDFS path %s' % (path, platform_file)
        with open(path) as file_data:
            try:
                hdfs_client.create_file(platform_file, file_data, overwrite=True)
            except PyWebHdfsException:
                print 'retrying HDFS copy command for %s' % platform_file
                time.sleep(5)
                hdfs_client.create_file(platform_file, file_data, overwrite=True)
def load(name, chains=None, model=None, host='localhost', port='50070',
         user_name=None):
    """
    Load text database.

    Parameters
    ----------
    name : str
        Path to the root directory of the text database in HDFS,
        without a leading '/'
    chains : list
        Chains to load. If None, all chains are loaded
    model : Model
        If None, the model is taken from the 'with' context
    host : str
        The IP address or hostname of the HDFS namenode. By default, 'localhost'
    port : str
        The port number for WebHDFS on the namenode. By default, '50070'
    user_name : str
        WebHDFS user_name used for authentication. By default, None

    Returns
    -------
    ndarray.Trace instance
    """
    hdfs = PyWebHdfsClient(host=host, port=port, user_name=user_name)
    chain_dirs = _get_chain_dirs(name, hdfs)
    if chains is None:
        chains = list(chain_dirs.keys())
    traces = []
    for chain in chains:
        chain_dir = chain_dirs[chain]
        dir_path = os.path.join(name, chain_dir)
        shape_file = os.path.join(dir_path, 'shapes.json')
        shapes = json.load(StringIO.StringIO(hdfs.read_file(shape_file)))
        samples = {}
        for varname, shape in shapes.items():
            var_file = os.path.join(dir_path, varname + '.txt')
            samples[varname] = np.loadtxt(
                StringIO.StringIO(str(hdfs.read_file(var_file)))).reshape(shape)
        trace = NDArray(model=model)
        trace.samples = samples
        trace.chain = chain
        traces.append(trace)
    return base.MultiTrace(traces)
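# A minimal usage sketch for load() above. The trace directory and namenode
# address are hypothetical; the directory must previously have been written
# by the HDFS backend defined earlier in this collection.
trace = load('user/hdfs/traces/demo',
             host='namenode.example.com', port='50070', user_name='hdfs')
print(trace.varnames)  # variable names recovered from shapes.json in each chain dir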
def from_pandas(ic, df, table=None, path=None, method='in_query',
                file_format='TEXTFILE', field_terminator='\t',
                line_terminator='\n', escape_char='\\',
                hdfs_host=None, webhdfs_port=50070, hdfs_user=None,
                overwrite=False):
    """Create a BDF by shipping an in-memory pandas `DataFrame` into Impala

    path is the dir, not the filename
    """
    # TODO: this is not atomic
    temp_table = _random_id('tmp_table_', 8)
    if table is None:
        table = "%s.%s" % (ic._temp_db, temp_table)
    if path is None:
        path = os.path.join(ic._temp_dir, temp_table)
    table_name = _to_TableName(table)
    if overwrite:
        ic._cursor.execute("DROP TABLE IF EXISTS %s" % table_name.to_sql())
    columns = list(df.columns)
    types = [_numpy_dtype_to_impala_PrimitiveType(ty) for ty in df.dtypes]
    schema = zip(columns, types)
    create_stmt = _create_table(table_name, schema, path=path,
                                file_format=file_format,
                                field_terminator=field_terminator,
                                line_terminator=line_terminator,
                                escape_char=escape_char)
    ic._cursor.execute(create_stmt)
    if method == 'in_query':
        query = "INSERT INTO %s VALUES " % table_name.to_sql()
        query += ', '.join(['(%s)' % ', '.join(map(_py_to_sql_string, row))
                            for row in df.values])
        ic._cursor.execute(query)
    elif method == 'webhdfs':
        if file_format != 'TEXTFILE':
            raise ValueError("only TEXTFILE format supported for webhdfs")
        if path is None:
            raise ValueError("must supply a path for EXTERNAL table for webhdfs")
        from pywebhdfs.webhdfs import PyWebHdfsClient
        hdfs_client = PyWebHdfsClient(host=hdfs_host, port=webhdfs_port,
                                      user_name=hdfs_user)
        raw_data = StringIO()
        df.to_csv(raw_data, sep=field_terminator,
                  line_terminator=line_terminator, quoting=csv.QUOTE_NONE,
                  escapechar=escape_char, header=False, index=False)
        hdfs_client.create_file(os.path.join(path, 'data.txt').lstrip('/'),
                                raw_data.getvalue(), overwrite=overwrite)
        raw_data.close()
    else:
        raise ValueError("method must be 'in_query' or 'webhdfs'; got %s"
                         % method)
    return from_sql_table(ic, table_name.to_sql())
class WhenTestingAppendOperation(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.location = 'redirect_uri'
        self.path = 'user/hdfs'
        self.file_data = '010101'
        self.init_response = MagicMock()
        self.init_response.headers = {'location': self.location}
        self.response = MagicMock()

    def test_append_throws_exception_for_no_redirect(self):
        self.init_response.status_code = httplib.BAD_REQUEST
        self.response.status_code = httplib.OK
        self.requests.post.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.append_file(self.path, self.file_data)

    def test_append_throws_exception_for_not_ok(self):
        self.init_response.status_code = httplib.TEMPORARY_REDIRECT
        self.response.status_code = httplib.BAD_REQUEST
        self.requests.post.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.append_file(self.path, self.file_data)

    def test_append_returns_true(self):
        self.init_response.status_code = httplib.TEMPORARY_REDIRECT
        self.response.status_code = httplib.OK
        self.requests.post.side_effect = [self.init_response, self.response]
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.append_file(self.path, self.file_data)
            self.assertTrue(result)
class WhenTestingGetContentSummaryOperation(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs/old_dir'
        self.response = MagicMock()
        self.file_status = {
            "ContentSummary": {
                "directoryCount": 2,
                "fileCount": 1,
                "length": 24930,
                "quota": -1,
                "spaceConsumed": 24930,
                "spaceQuota": -1
            }
        }
        self.response.json = MagicMock(return_value=self.file_status)

    def test_get_status_throws_exception_for_not_ok(self):
        self.response.status_code = http_client.BAD_REQUEST
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.get_content_summary(self.path)

    def test_get_status_returns_true(self):
        self.response.status_code = http_client.OK
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.get_content_summary(self.path)
        for key in result:
            self.assertEqual(result[key], self.file_status[key])
class Store(store.Store):
    """
    HDFS backed store.
    """

    def __init__(self):
        """ Connect to store """
        self._client = PyWebHdfsClient(host=store_host, port=store_port,
                                       user_name=store_user)

    def mkdir(self, path):
        self._client.make_dir(path)

    def read(self, path, open_handle):
        return StoreFile(self._client, path, "r", open_handle)

    def append(self, path, open_handle):
        return StoreFile(self._client, path, "a", open_handle)

    def write(self, path, open_handle):
        return StoreFile(self._client, path, "w", open_handle)

    def exists(self, path):
        try:
            dirinfo = self._client.list_dir(path)
            return True
        except errors.FileNotFound:
            return False

    def walk(self, path, visitor, recursive=False):
        """ Walk files in a path. Use recursive=True to include subdirs """
        dirinfo = self._client.list_dir(path)
        for status in dirinfo["FileStatuses"]["FileStatus"]:
            if recursive and status["type"] == "DIRECTORY":
                if len(path) > 0:
                    self.walk(path + "/" + status["pathSuffix"], visitor,
                              recursive)
                else:
                    self.walk(status["pathSuffix"], visitor, recursive)
            else:
                # WebHDFS reports modificationTime in milliseconds
                info = dict(name=status["pathSuffix"],
                            modify=datetime.fromtimestamp(
                                status["modificationTime"] / 1000.0),
                            size=status["length"])
                visitor(path, info)
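# A minimal usage sketch for Store.walk() above. store_host, store_port and
# store_user are assumed to be configured elsewhere in the module, and the
# walked path is hypothetical.
def print_entry(path, info):
    # the visitor receives the directory being walked plus a dict with the
    # name / modify / size keys built in walk()
    print("%s/%s\t%d bytes\t%s" % (path, info["name"], info["size"],
                                   info["modify"]))

store = Store()
store.walk("user/hdfs/reports", print_entry, recursive=True)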
def setUp(self):
    self.host = 'hostname'
    self.port = '00000'
    self.user_name = 'username'
    self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                   user_name=self.user_name)
    self.response = MagicMock()
    self.requests = MagicMock(return_value=self.response)
    self.path = 'user/hdfs/old_dir'
    self.response = MagicMock()