import os
import sys
import pickle as pk
from urllib.parse import urlparse  # Python 3; on Python 2: from urlparse import urlparse
from snakebite.client import Client

def do_read_labels(file_uri):
    o = urlparse(file_uri)
    t = '/tmp/image.dat.' + str(os.getpid())
    try:
        if o.scheme != 'hdfs':
            # Local file: load the pickled label sets directly
            with open(o.path, 'rb') as fd:
                labels_train, labels_test = pk.load(fd)
        else:
            # HDFS file: copy to a temporary local file first
            if os.path.exists(t):
                os.remove(t)
            client = Client(o.hostname, o.port)
            for f in client.copyToLocal([o.path], t):
                if f['result'] == True:
                    with open(t, 'rb') as fd:
                        labels_train, labels_test = pk.load(fd)
                    os.remove(t)
                else:
                    print('File ' + f['path'] + ' NOT copied because "' +
                          str(f['error']) + '", sorry!')
                    return None, None
    except Exception:
        print('Exception ' + str(sys.exc_info()[0]) + ' on file ' + file_uri)
        return None, None
    return labels_train, labels_test
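A hypothetical call; the namenode address and pickle path are made up for illustration:

# Hypothetical URI; any hdfs://host:port/path pointing at a pickled (train, test) pair works
labels_train, labels_test = do_read_labels('hdfs://10.0.40.19:9600/daloflow/labels.pkl')
if labels_train is None:
    print('could not read labels')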
import mimetypes
import os
import urllib

from django.http import HttpResponse
from snakebite.client import Client

def gethdfsfile(request):
    if request.method == "GET":
        file_id = request.GET["file_id"]
        file_name = request.GET["file_name"]
        file_name = urllib.unquote(file_name).encode("utf-8")
        # Temporary file
        tfile = "/tmp/{file_id}".format(file_id=file_id)
        client = Client('10.6.0.135', 9000)
        for x in client.copyToLocal(['/blocks/%s' % file_id], tfile):
            print x
        # Read in binary mode so non-text files survive intact
        with open(tfile, 'rb') as f:
            data = f.read()
        # Delete the temporary file
        os.remove(tfile)
        content_type = mimetypes.types_map[".%s" % file_name.split('.')[-1]]
        response = HttpResponse(data, content_type=content_type)
        response['Content-Disposition'] = 'attachment; filename="%s"' % file_name
        return response
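Note that mimetypes.types_map raises KeyError for an unrecognized extension; a more forgiving lookup, sketched here with the standard-library guess_type, avoids that:

import mimetypes

# guess_type returns (type, encoding); fall back to a generic binary type
content_type, _ = mimetypes.guess_type(file_name)
if content_type is None:
    content_type = 'application/octet-stream'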
import logging
import os
import subprocess

from kazoo.client import KazooClient
from snakebite.client import Client

logger = logging.getLogger(__name__)

def main(argv):
    hdfs_namenode = os.environ['HDFS_NAMENODE']
    model_on_hdfs = os.environ['MODEL_ON_HDFS']
    ip, port = hdfs_namenode.rsplit(':', 1)
    client = Client(ip, int(port), use_trash=False)

    # Copy the model from HDFS to the local root directory
    dst_dir = os.path.join('/')
    for x in client.copyToLocal([model_on_hdfs], dst_dir):
        print x

    zk_master = os.environ['ZK_MASTER']
    logger.info('job_name: {0}, task_index: {1}'.format(
        os.environ['JOB_NAME'], os.environ['TASK_INDEX']))
    logger.info('command: {0}'.format(os.environ['CMD']))

    zk = KazooClient(hosts=zk_master)
    zk.start()

    logger.info('job uid: {0}'.format(os.environ['UID']))
    job_zk_dir = '/' + os.environ['UID']
    members = zk.get_children(job_zk_dir + '/member/')
    members.sort()

    # Build the cluster definition from the members registered in ZooKeeper
    cluster_def = {}
    for member in members:
        host = zk.get(job_zk_dir + '/member/' + member)[0]
        if host != '':
            logger.info('{0} running on {1}'.format(member, host))
            job_type = member.split('_')[2]
            if job_type == 'ps':
                cluster_def.setdefault('ps', []).append(host)
            elif job_type == 'worker':
                cluster_def.setdefault('worker', []).append(host)
            else:
                logger.error('unknown type: {0}'.format(job_type))

    ps = ','.join(cluster_def['ps'])
    worker = ','.join(cluster_def['worker'])

    my_env = os.environ.copy()
    logger.info(my_env)
    my_env['PS'] = ps
    my_env['WORKER'] = worker

    cmd = [os.environ['CMD']]
    child = subprocess.Popen(cmd, shell=True, env=my_env)
    child.wait()
    zk.stop()
import json
from snakebite.client import Client

def get_json_object():
    client = Client('localhost', 9000)
    # The merged filename is hardcoded; you have to keep changing it for every analysis!
    for a in client.copyToLocal(['/user/flume/tweets/merged_20210102123709.json'],
                                '/home/manojkhatokar/Downloads/BGD/final_python_scripts/merged_data'):
        print(a)
    # with open('/home/manojkhatokar/Downloads/merged_20210101142527.json') as f:
    #     raw_data = f.read().splitlines()[-1]
    #     list_data = f'[{raw_data}]'
    #     json_data = json.loads(list_data)
    #     print(json_data)
    with open('/home/manojkhatokar/Downloads/BGD/final_python_scripts/merged_data/merged_20210102123709.json') as json_file:
        json_object = json.load(json_file)
    return json_object
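The hardcoded filename could be avoided by listing the HDFS directory and taking the newest merge file. A minimal sketch, assuming the /user/flume/tweets layout and the timestamped merged_* naming seen above:

def latest_merged_json(client, hdfs_dir='/user/flume/tweets'):
    # ls yields one dict per entry; keep only the timestamped merge files
    paths = [x['path'] for x in client.ls([hdfs_dir])
             if x['path'].split('/')[-1].startswith('merged_')]
    # The embedded timestamps sort lexicographically, so max() is the newest
    return max(paths) if paths else None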
import os
import shutil
import tempfile

import joblib  # or, in older scikit-learn: from sklearn.externals import joblib
from snakebite.client import Client

def getTrainedModel(hdfsServer, modelFile):
    hdfsPort = int(os.environ.get('HDFS_NAME_PORT', 8020))
    modelSavePath = "/user/" + os.getenv('LOGNAME') + "/data/model/" + modelFile + '/'

    # Load the saved model data
    hdfs_client = Client(hdfsServer, hdfsPort)
    filesInfo = hdfs_client.ls([modelSavePath])

    # Copy HDFS files to a local temp directory.
    # First clean up and recreate the temp folder.
    copyDir = tempfile.gettempdir() + "/" + modelFile
    shutil.rmtree(copyDir, ignore_errors=True)
    os.makedirs(copyDir)

    res = hdfs_client.copyToLocal([f['path'] for f in filesInfo], copyDir)
    for r in res:
        if not r['result']:
            print "Error: %s" % r

    modelFilePath = copyDir + '/' + modelFile
    print "Load model from %s" % modelFilePath
    return joblib.load(modelFilePath)
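A hypothetical call; the namenode host and model filename are placeholders:

# HDFS_NAME_PORT and LOGNAME are read from the environment inside the function
model = getTrainedModel('namenode.example.com', 'rf_model.pkl')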
import os
import socket

from snakebite.client import Client
from swiftclient.service import SwiftService

def getObjsBackend(objs, backend, config):
    if backend == 'hdfs':
        client = Client(socket.gethostname(), config['HADOOP_RPC_PORT'], use_trash=False)
        for obj in objs:
            try:
                # copyToLocal is lazy; consume the generator so the copy actually runs
                copy_gen = client.copyToLocal([obj[0]], obj[1])
                for copy_item in copy_gen:
                    pass
            except Exception as e:
                print(e)
    elif backend == 'swift':
        options = {'os_auth_url': os.environ['OS_AUTH_URL'],
                   'os_username': os.environ['OS_USERNAME'],
                   'os_password': os.environ['OS_PASSWORD'],
                   'os_tenant_id': os.environ['OS_TENANT_ID'],
                   'os_tenant_name': os.environ['OS_TENANT_NAME']}
        swiftService = SwiftService(options=options)
        for obj in objs:
            # Create the containers this application uses for Object Storage
            if obj[0] == 'sqlite.db':
                swiftService.post(container='containerFiles')
                swiftService.post(container='containerFeatures')
                swiftService.post(container='containerModules')
            out_file = obj[1]  # Get the output file location from the runner
            localoptions = {'out_file': out_file}
            objects = [obj[0]]
            swiftDownload = swiftService.download(container='containerModules',
                                                  objects=objects,
                                                  options=localoptions)
            for downloaded in swiftDownload:
                if "error" in downloaded:
                    raise RuntimeError(downloaded['error'])
                # print(downloaded)
    elif backend == 'nfs':
        # Every file is already in its respective local dir
        pass
from snakebite.client import Client

client = Client('localhost', 9000)
for f in client.copyToLocal(['/input/input.txt'], '/tmp'):
    print f
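Since each yielded item is a result dict, a variant that checks the outcome per file might look like this (a sketch reusing the same path; the dict keys match the results shown in the other examples):

from snakebite.client import Client

client = Client('localhost', 9000)
for f in client.copyToLocal(['/input/input.txt'], '/tmp'):
    if f['result']:
        print 'Copied %s' % f['path']
    else:
        print 'Failed on %s: %s' % (f['path'], f['error'])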
import os
import sys
import zlib

import numpy as np
from urllib.parse import urlparse  # Python 3; on Python 2: from urlparse import urlparse
from snakebite.client import Client

class DataGenerator(object):
    'Generates data for Keras'

    ''' Initialization function of the class '''
    def __init__(self, height=28, width=28, channels=1, batch_size=32,
                 cache_mode='', images_uri='/', shuffle=True):
        'Initialization'
        self.debug = False
        self.height = height
        self.width = width
        self.channels = channels
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.cache_mode = cache_mode
        self.images_uri = images_uri
        o = urlparse(self.images_uri)
        if o.scheme == 'hdfs':
            # images_uri example: 'hdfs://10.0.40.19:9600/daloflow/dataset32x32/'
            self.images_path = o.path
            self.client = Client(o.hostname, o.port)
        else:
            self.images_path = images_uri
            self.client = None

    ''' Set debug mode True/False '''
    def set_debug(self, debug_mode):
        'Do not show or show messages'
        self.debug = debug_mode
        if self.debug == True:
            print(' * Debug mode: ' + str(self.debug))
            print(' * Height:     ' + str(self.height))
            print(' * Width:      ' + str(self.width))
            print(' * Channels:   ' + str(self.channels))
            print(' * Batch_size: ' + str(self.batch_size))
            print(' * Shuffle:    ' + str(self.shuffle))
            print(' * Cache mode: ' + self.cache_mode)
            print(' * Image uri:  ' + self.images_uri)

    ''' Goes through the dataset and outputs one batch at a time. '''
    def generate(self, labels, list_IDs, yield_labels=True):
        'Generates batches of samples'
        # Infinite loop
        while 1:
            # Generate random order of exploration of dataset (to make each epoch different)
            indexes = self.__get_exploration_order(list_IDs)
            # Generate batches
            imax = int(len(indexes) / self.batch_size)  # number of batches
            for i in range(imax):
                # Find list of IDs for one batch
                list_IDs_temp = [list_IDs[k] for k in
                                 indexes[i * self.batch_size:(i + 1) * self.batch_size]]
                # Train, validation
                X, y = self.__data_generation(labels, list_IDs_temp, yield_labels)
                yield X, y

    '''
    Generates a random order of exploration for a given set of list_IDs.
    If activated, this feature will shuffle the order in which the examples
    are fed to the classifier so that batches between epochs do not look alike.
    Doing so will eventually make our model more robust.
    '''
    def __get_exploration_order(self, list_IDs):
        'Generates order of exploration'
        # Find exploration order
        indexes = np.arange(len(list_IDs))
        if self.shuffle == True:
            np.random.shuffle(indexes)
        return indexes

    ''' Get data: local '''
    def __get_data_local(self, image_file_name):
        'Get data from local file system path'
        pixels = None
        try:
            with open(image_file_name, 'rb') as image_file:
                pixels = np.frombuffer(zlib.decompress(image_file.read()),
                                       dtype=np.uint8).reshape(self.height,
                                                               self.width,
                                                               self.channels)
        except Exception:
            if self.debug == True:
                print('Exception ' + str(sys.exc_info()[0]) + ' on file ' + image_file_name)
        return pixels

    ''' Get data: remote '''
    def __get_data_remote(self, image_file_name):
        'Get data from HDFS'
        pixels = None
        if self.client is None:
            return pixels
        try:
            t = '/tmp/image.dat.' + str(os.getpid())
            if os.path.exists(t):
                os.remove(t)
            for f in self.client.copyToLocal([image_file_name], t):
                if f['result'] == True:
                    with open(t, 'rb') as image_file:
                        pixels = np.frombuffer(zlib.decompress(image_file.read()),
                                               dtype=np.uint8).reshape(self.height,
                                                                       self.width,
                                                                       self.channels)
                    os.remove(t)
                else:
                    print('File ' + f['path'] + ' NOT copied because "' +
                          str(f['error']) + '", sorry!')
        except Exception:
            if self.debug == True:
                print('Exception ' + str(sys.exc_info()[0]) + ' on file ' + image_file_name)
        return pixels

    ''' Get data: local or remote '''
    def __get_data(self, image_file_name):
        'Get data: local or remote'
        pixels = None
        #print(' * image file name: ' + image_file_name)
        if self.cache_mode == 'hdfs2local' or self.cache_mode == 'hdfs2local-full':
            pixels = self.__get_data_local(image_file_name)
        elif self.cache_mode == 'nocache':
            pixels = self.__get_data_remote(image_file_name)
        elif self.cache_mode == 'hdfs2local-partial':
            pixels = self.__get_data_local(image_file_name)
            if pixels is None:
                pixels = self.__get_data_remote(image_file_name)
        else:
            print('ERROR: unknown "' + self.cache_mode + '" cache mode')
        return pixels

    '''
    Outputs batches of data and only needs to know about the list of IDs
    included in batches as well as their corresponding labels.
    '''
    def __data_generation(self, labels, list_IDs_temp, yield_labels):
        'Generates data of batch_size samples'
        # X : (n_samples, v_size, v_size, v_size, n_channels)
        # Initialization
        X = np.empty((self.batch_size, self.height, self.width, self.channels),
                     dtype='float32')
        y = np.empty((self.batch_size), dtype='float32')
        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            # Decompress image into pixel NumPy tensor
            image_file_name = self.images_path + '/'.join(ID.split('/')[1:]) + '.tar.gz'
            # Read image
            pixels = self.__get_data(image_file_name)
            # Store volume
            #pixels = np.rollaxis(pixels, 0, 3)  # from 'channels_first' to 'channels_last'
            X[i, :, :, :] = pixels
            # Get y value
            y_value = labels[ID]
            y[i] = y_value
        # Return X and Y (train, validation)
        return X, y

    '''
    Please note that Keras only accepts labels written in a binary form
    (in a 6-label problem, the third label is written [0 0 1 0 0 0]),
    which is why we need the sparsify function to perform this task,
    should y be a list of numerical values.
    '''
    def sparsify1(self, y):
        'Returns labels in binary NumPy array'
        return np.array([[1 if y[i] == j else 0 for j in range(10)]
                         for i in range(y.shape[0])])
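A hypothetical way to drive the generator; the label mapping and IDs are made up, and the HDFS URI is the one from the class comment:

# IDs follow the '<prefix>/<name>' scheme that __data_generation splits on
labels = {'ds/img_000': 0.0, 'ds/img_001': 1.0}
list_IDs = list(labels.keys())

gen = DataGenerator(height=32, width=32, channels=3, batch_size=2,
                    cache_mode='nocache',
                    images_uri='hdfs://10.0.40.19:9600/daloflow/dataset32x32/')
X, y = next(gen.generate(labels, list_IDs))  # one (images, labels) batch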
from snakebite.client import Client
import json
import boto3
import os

## Set auth keys
with open('/home/n/opt/MindBender_BD/Misc/keys') as keys:
    s3_keys = json.load(keys)
access_key = s3_keys["s3_python_test"]["access_key"]
secret_access_key = s3_keys["s3_python_test"]["secret_access_key"]

s3 = boto3.client('s3',
                  aws_access_key_id=access_key,
                  aws_secret_access_key=secret_access_key)

## Connect to HDFS with Snakebite
client = Client('localhost', 9000)

## Move file locally (temporarily)
for f in client.copyToLocal(['/spark/data.json'], '/home/n/opt/MindBender_BD/Task-021/tmp'):
    print("Moved one file.")

## Upload temp file to S3
s3.upload_file('/home/n/opt/MindBender_BD/Task-021/tmp/data.json', 'mindbender0001', 'data.json')

## Delete tmp file
os.remove('/home/n/opt/MindBender_BD/Task-021/tmp/data.json')
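If the upload can fail, a try/finally keeps the temp file from lingering; a small variant of the last two steps above:

tmp_path = '/home/n/opt/MindBender_BD/Task-021/tmp/data.json'
try:
    s3.upload_file(tmp_path, 'mindbender0001', 'data.json')
finally:
    # Remove the temp copy whether or not the upload succeeded
    if os.path.exists(tmp_path):
        os.remove(tmp_path)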
from snakebite.client import Client

def copy():
    client = Client("study", 9000, use_trash=False)
    # copyToLocal returns a lazy generator; consume it so the copy actually runs
    for f in client.copyToLocal(["/data/gz"], "/root/data/", check_crc=False):
        print(f)
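Equivalently, when the per-file results are not needed, materializing the generator with list() also triggers the copy:

from snakebite.client import Client

client = Client("study", 9000, use_trash=False)
# list() consumes the lazy generator, so the copy actually happens
list(client.copyToLocal(["/data/gz"], "/root/data/", check_crc=False))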
from snakebite.client import Client

# Get these parameters from /etc/hadoop/conf/core-site.xml under fs.defaultFS.
# The port is the RPC port of the namenode.
client = Client('localhost', 8020)

# ls takes a list of paths!
# Many of the methods in Snakebite return generators.
for i in client.ls(['/user/cloudera/behrouz']):
    print i
print '*' * 40

# Creating directories:
# create two directories, behrouz and behrouz1/b1, on HDFS
for p in client.mkdir(['/behrouz', 'behrouz1/b1'], create_parent=True):
    print p
print '*' * 40

# Deleting files and directories:
# a recursive delete removes any subdirectories and files a directory contains
for p in client.delete(['/behrouz', 'behrouz1/b1'], recurse=True):
    print p
print '*' * 40

# Retrieving data from HDFS:
# copying files from HDFS to the local file system
for f in client.copyToLocal(['/user/cloudera/wordCount.out'], '/home/cloudera/'):
    print f
print '*' * 40

# Reading the contents of a file.
# The text method automatically decompresses and displays gzip and bzip2 files.
for l in client.text(['/user/cloudera/testfile.txt']):
    print l
#!/usr/local/bin/python
from snakebite.client import Client

client = Client('localhost', 9000)
for f in client.copyToLocal(['/user/cbohara/book.txt'], '/tmp'):
    print f
# `client` is the Snakebite Client created earlier.

# Create a couple of directories
for p in client.mkdir(['/student9_7/py_dir_01', '/student9_7/py_dir_02'], create_parent=True):
    print(p)
'''
{'path': '/student9_7/py_dir_01', 'result': True}
{'path': '/student9_7/py_dir_02', 'result': True}
'''

# Delete the `py_dir_01` directory
for p in client.delete(['/student9_7/py_dir_01'], recurse=True):
    print(p)
'''
{'path': '/student9_7/py_dir_01', 'result': True}
'''

# Look at what the `test` file contains
for t in client.text(['/student9_7/test']):
    print(t)
'''
test file for hdfs
'''

# Copy the `test` file from HDFS to the local home directory as `retrived_file_via_py`
for f in client.copyToLocal(['/student9_7/test'], 'retrived_file_via_py'):
    print(f)
'''
{'path': '/home/student9_7/retrived_file_via_py', 'source_path': '/student9_7/test', 'result': True, 'error': ''}
'''