import json
import os
import time

from hdfs import InsecureClient, util


class DataProcessor:
    def __init__(self, data_path=None):
        if data_path is None:
            self.data_path = r'./config/connect_info.json'
        else:
            assert isinstance(data_path, str)
            self.data_path = data_path
        if not os.path.exists(self.data_path):
            self.data_path = r'./connect_info.json'
        with open(self.data_path) as data_file:
            data = json.load(data_file)
        self.hdfs_client = InsecureClient(
            url='http://' + data['namenode_url'] + ':' + str(data['port']),
            user=data['user'],
            root=data['root_path'])
        self.img_dir = data['img_dir']
        if self.img_dir[-1] != '/':
            self.img_dir += '/'
        self.file_name = 1

    def InitImgDir(self):
        # Empty the image directory if it exists; create it otherwise.
        try:
            list_rslt = self.hdfs_client.list(self.img_dir)
            for name in list_rslt:
                self.hdfs_client.delete(self.img_dir + name)
        except util.HdfsError:
            self.hdfs_client.makedirs(self.img_dir)
            print("Mkdir ...")
        return True

    def DataProcess(self, data, append=False, file_name=None):
        assert isinstance(data, str)
        if file_name is None:
            file_name = self.img_dir + str(self.file_name)
        else:
            assert isinstance(file_name, str)
        print("start writing...")
        start = time.time()
        # overwrite and append are mutually exclusive in hdfs.Client.write,
        # so only overwrite when not appending.
        self.hdfs_client.write(file_name, data, overwrite=not append,
                               replication=1, append=append)
        delta = time.time() - start
        print("writing complete, time delta is " + str(delta))
        return True

    def Upload(self, remote_name, local_path):
        assert os.path.exists(local_path)
        remote_path = self.img_dir + remote_name
        self.hdfs_client.upload(remote_path, local_path, True)
        return True
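# Usage sketch (not from the original source): assumes a reachable WebHDFS
# endpoint and a connect_info.json shaped like the keys DataProcessor reads,
# e.g. {"namenode_url": "...", "port": 50070, "user": "...",
#       "root_path": "/", "img_dir": "/images/"}.
if __name__ == '__main__':
    processor = DataProcessor()       # falls back to ./config/connect_info.json
    processor.InitImgDir()            # empty or create the image directory
    processor.DataProcess('payload')  # writes to <img_dir>/1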
import json

from django.http import HttpResponse
from hdfs import InsecureClient


def delete_data(request):
    response_content = {}
    response = HttpResponse()
    try:
        proj_id = request.GET.get('proj_id')
        data_id = request.GET.get('data_id')
        user_id = request.GET.get('user_id')
        fetched = Datasets.objects.filter(
            proj_id=proj_id, data_id=data_id,
            user_id=user_id).values('hdfs_path')
        if len(fetched) == 0:
            raise Exception('Oops! No access!')
        hdfs_path = fetched[0]['hdfs_path']
        if hdfs_path:
            client = InsecureClient("http://hdfs.neurolearn.com:50070",
                                    user="******")
            client.delete(hdfs_path, recursive=True)
        Datasets.objects.filter(proj_id=proj_id, data_id=data_id,
                                user_id=user_id).delete()
        response_content['msg'] = 'success'
        response_content['error_num'] = 0
    except Exception as e:
        response_content['msg'] = str(e)
        response_content['error_num'] = 1
    response.write(json.dumps(response_content))
    return response
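# The Datasets model is not shown in this snippet; a minimal Django sketch
# consistent with the fields the view filters on (an assumption, the field
# types are guesses):
from django.db import models


class Datasets(models.Model):
    proj_id = models.IntegerField()
    data_id = models.IntegerField()
    user_id = models.IntegerField()
    hdfs_path = models.CharField(max_length=512, blank=True, null=True)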
import logging
from os.path import basename

from hdfs import InsecureClient
from hdfs.util import HdfsError


class SavedModelUploader(object):
    """Upload a saved model to the Hadoop file system."""

    def __init__(self, url, user, base_path=""):
        self._logger = logging.getLogger(self.__class__.__name__)
        self._url = url
        self._user = user
        self._base_path = base_path
        self._client = InsecureClient(url, user)
        if not self._exist(base_path):
            self._mkdir(base_path)

    def _exist(self, path):
        # content() returns None instead of raising when strict=False.
        return self._client.content(path, strict=False) is not None

    def _mkdir(self, path):
        self._client.makedirs(path)

    def _del(self, path):
        self._client.delete(path, recursive=True)

    def _upload(self, local_path, hdfs_path):
        self._client.upload(hdfs_path, local_path)

    def _logging_progress(self, local_path, nbytes):
        # The hdfs progress callback reports -1 once a file completes.
        if nbytes > 0:
            msg = "uploading: '{}' [{} bytes]".format(local_path, nbytes)
        else:
            msg = "uploading: '{}' [done]".format(local_path)
        self._logger.info(msg)

    def upload(self, local_model_path, overwrite=False):
        hdfs_model_path = self._base_path + '/' + basename(local_model_path)
        existed = self._exist(hdfs_model_path)
        if overwrite and existed:
            self._del(hdfs_model_path)
        elif not overwrite and existed:
            raise RuntimeError(
                "could not overwrite the model, already existed.")
        try:
            self._client.upload(self._base_path, local_model_path,
                                progress=self._logging_progress)
        except HdfsError as e:
            self._logger.error(e)
        self._logger.info("model upload done")
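# Usage sketch (assumption): the endpoint and paths below are placeholders.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    uploader = SavedModelUploader('http://namenode.example.com:50070',
                                  user='hdfs', base_path='/models')
    uploader.upload('/tmp/my_saved_model', overwrite=True)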
class HdfsDb(object):
    HOST = '192.168.71.156'
    PORT = 50070
    USER = '******'
    HOST_URI = 'http://{0}:{1}'.format(HOST, PORT)

    def __init__(self):
        self.client = InsecureClient(self.HOST_URI, user=self.USER)

    @check_dir_path
    def list_dir(self, dir_path=None):
        """
        List the root directory.
        :return:
        """
        dir_data = self.client.list(dir_path)
        return dir_data

    @check_dir_path
    def mk_dir(self, dir_path=None):
        self.client.makedirs(dir_path)

    def write_file(self, filename, data, dir_path=None):
        """
        Write a file, e.g.
        hd.write_file('test.json', {'name': 'zhexiao'}, dir_path='/data')
        :param filename:
        :param data:
        :param dir_path:
        :return:
        """
        file_path = '{0}/{1}'.format(dir_path, filename)
        self.client.write(file_path, str(data))

    @check_dir_path
    def read_file(self, filename, dir_path=None):
        """
        Read file data line by line, e.g.
        filedata = hd.read_file('README.txt', dir_path='/data')
        :param filename:
        :param dir_path:
        :return:
        """
        file_path = '{0}/{1}'.format(dir_path, filename)
        with self.client.read(file_path, encoding='utf-8') as reader:
            for line in reader:
                yield line

    @check_dir_path
    def delete(self, filename, dir_path=None):
        file_path = '{0}/{1}'.format(dir_path, filename)
        self.client.delete(file_path)
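# check_dir_path is referenced above but not defined in this snippet. A
# minimal sketch of what such a decorator might do (an assumption): default
# dir_path to the root directory when the caller omits it. It would need to
# be defined before the class that uses it.
import functools


def check_dir_path(func):
    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        # Fall back to '/' when no dir_path keyword is supplied.
        if kwargs.get('dir_path') is None:
            kwargs['dir_path'] = '/'
        return func(self, *args, **kwargs)
    return wrapper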
def delete_hdfs_file(remove_from_local_hdfs, schema, table_name):
    # Remove temporary HDFS files.
    if remove_from_local_hdfs:
        # Get the private IP to connect to HDFS.
        import socket
        private_ip = socket.gethostbyname(socket.gethostname())
        try:
            hdfs_client = InsecureClient(
                url="http://{}:8020".format(private_ip), user="******")
            hdfs_client.delete("/user/hadoop/{}/{}".format(schema, table_name),
                               recursive=True)
        except Exception as error:
            logging.error(error)
def upload_to_hdfs(input_dir, output_dir, chunk_size):
    # Locate files in the input directory.
    files = [
        os.path.abspath("{}/{}".format(input_dir, f))
        for f in listdir(input_dir) if isfile(join(input_dir, f))
    ]

    # Set up a temporary directory.
    tmp_dir = "{}/tmp".format(input_dir)
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.mkdir(tmp_dir)

    # Split files into chunk_size-MB chunks.
    for f in files:
        fs = FileSplit(file=f, splitsize=chunk_size * 1e6, output_dir=tmp_dir)
        fs.split(callback=split_callback)

    # Upload to HDFS.
    hdfs_client = InsecureClient(
        "http://{}:9870".format(settings.HDFS_HOST_VALUE),
        user=settings.HDFS_USER_VALUE)

    # Delete any existing output directory.
    if hdfs_client.content(output_dir, strict=False) is not None:
        hdfs_client.delete(output_dir, recursive=True)

    # Upload the chunks to a temporary HDFS directory, then rename it.
    hdfs_client.upload(hdfs_path="/tmp", local_path=tmp_dir,
                       n_threads=-1, overwrite=True)
    hdfs_client.rename("/tmp", output_dir)

    print(
        "{} files uploaded to hdfs host '{}{}' ({} file chunks total)".format(
            len(files),
            settings.HDFS_HOST_VALUE,
            output_dir,
            len(split_files),
        ))

    # Delete the local temporary files.
    shutil.rmtree(tmp_dir)
    return hdfs_file_paths
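# split_callback, split_files, and hdfs_file_paths are not defined in this
# snippet; a minimal sketch of the module-level state they imply (an
# assumption, including the (path, size) callback signature filesplit passes):
split_files = []
hdfs_file_paths = []


def split_callback(path, size):
    # Record each chunk that FileSplit produces so the summary can count them.
    split_files.append(path)
    hdfs_file_paths.append(path)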
class DataProcessor:
    def __init__(self, data_path=None):
        if data_path is None:
            self.data_path = r'./config/connect_info.json'
        else:
            assert isinstance(data_path, str)
            self.data_path = data_path
        if not os.path.exists(self.data_path):
            self.data_path = r'./connect_info.json'
        with open(self.data_path) as data_file:
            data = json.load(data_file)
        print("Data: ", data)
        self.hdfs_client = InsecureClient(
            url='http://' + data['namenode_url'] + ':' + str(data['port']),
            user=data['user'],
            root=data['root_path'])
        print("hdfs client: ", self.hdfs_client)
        self.img_dir = data['img_dir']
        print("img dir: ", self.img_dir)
        if self.img_dir[-1] != '/':
            self.img_dir += '/'
        self.file_name = 1

    def InitImgDir(self):
        try:
            list_rslt = self.hdfs_client.list(self.img_dir)
            for name in list_rslt:
                self.hdfs_client.delete(self.img_dir + name)
        except util.HdfsError:
            self.hdfs_client.makedirs(self.img_dir)
        return True

    def Upload(self, file_path, threads=2):
        print("FilePath: ", file_path)
        print("img_dir: ", self.img_dir[:-1])
        self.hdfs_client.upload(hdfs_path=self.img_dir[:-1],
                                local_path=file_path,
                                n_threads=threads,
                                overwrite=True)
        return 0
class HdfsWrapper:
    def __init__(self):
        self.client = None

    def connect_hdfs(self):
        self.client = InsecureClient(CONST.HDFS_URL, user=CONST.HDFS_USER)

    def mkdir_hdfs(self, path):
        # Check existence on HDFS (status returns None when strict=False and
        # the path is missing) rather than on the local filesystem.
        if self.client.status(path, strict=False) is None:
            self.client.makedirs(path)

    def list_hdfs(self, path):
        return self.client.list(path)

    def read_hdfs(self, hdfs_path):
        try:
            with self.client.read(hdfs_path) as reader:
                return reader.read()
        except Exception:
            log.error(traceback.format_exc())
            self.connect_hdfs()
            log.error('reconnect hdfs...')

    def write_hdfs(self, hdfs_path, data, overwrite=False):
        try:
            with self.client.write(hdfs_path, overwrite=overwrite) as writer:
                writer.write(data)
            return hdfs_path
        except Exception:
            log.error(traceback.format_exc())
            self.connect_hdfs()
            log.error('reconnect hdfs...')

    def delete_hdfs(self, hdfs_path, recursive=False):
        return self.client.delete(hdfs_path, recursive)
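# Usage sketch (assumption): CONST and log come from the surrounding project;
# a stand-in configuration might look like this.
class CONST:
    HDFS_URL = 'http://namenode.example.com:9870'
    HDFS_USER = 'hdfs'


wrapper = HdfsWrapper()
wrapper.connect_hdfs()
wrapper.write_hdfs('/tmp/demo.txt', b'hello', overwrite=True)
print(wrapper.read_hdfs('/tmp/demo.txt'))
wrapper.delete_hdfs('/tmp/demo.txt', recursive=False)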
class Storage:
    def __init__(self, protocol: str = 'webHDFS', *args, **kwargs):
        self.protocol, self.client = protocol.lower(), None
        if protocol.lower() == 'webHDFS'.lower():
            from hdfs import InsecureClient
            self.client = InsecureClient(*args, **kwargs)
        # Bind the protocol-specific implementations to the generic names.
        for f in 'upload download list status delete'.split():
            setattr(self, f, getattr(self, '%s_%s' % (f, protocol.lower())))

    def upload_webhdfs(self, local_path: str, remote_path: str, **kwargs):
        to_screen("upload %s -> %s" % (local_path, remote_path))
        return self.client.upload(local_path=local_path,
                                  hdfs_path=remote_path, **kwargs)

    def download_webhdfs(self, remote_path: str, local_path: str, **kwargs):
        mkdir_for(local_path)
        to_screen("download %s -> %s" % (remote_path, local_path))
        return self.client.download(local_path=local_path,
                                    hdfs_path=remote_path,
                                    overwrite=True, **kwargs)

    def list_webhdfs(self, remote_path: str, **kwargs):
        return self.client.list(hdfs_path=remote_path, **kwargs)

    def status_webhdfs(self, remote_path: str, **kwargs):
        return self.client.status(hdfs_path=remote_path, **kwargs)

    def delete_webhdfs(self, remote_path: str, **kwargs):
        return self.client.delete(hdfs_path=remote_path, **kwargs)
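# Usage sketch (assumption): exercises the dynamic dispatch set up in
# __init__; the URL is a placeholder and to_screen/mkdir_for come from the
# surrounding project.
storage = Storage('webHDFS', 'http://namenode.example.com:50070', user='hdfs')
storage.upload(local_path='model.bin', remote_path='/models/model.bin')
print(storage.list(remote_path='/models'))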
def init_context(context):
    key_count = defaultdict(int)
    setattr(context.user_data, "key_count", key_count)

    # Init HDFS from environment configuration.
    hdfs_host = os.environ.get("HDFS_HOST")
    hdfs_user = os.environ.get("HDFS_USER")
    output_path = os.environ.get("REDUCER_OUTPUT_FILENAME")
    hdfs_client = InsecureClient("http://{}:9870".format(hdfs_host),
                                 user=hdfs_user)

    # Delete any existing output file.
    if hdfs_client.content(output_path, strict=False) is not None:
        hdfs_client.delete(output_path)
    setattr(context.user_data, "hdfs_client", hdfs_client)
def delJPG_Newmodel(basepath):
    # Python 2 snippet (print statements, unicode type).
    now_time = datetime.datetime.now()
    now_date_str = now_time.strftime('%Y%m%d')
    now_date = datetime.datetime.strptime(now_date_str, '%Y%m%d')
    try:
        # client = Client('http://10.41.158.72:50070')
        # client = InsecureClient('http://10.41.158.106:50075', user='******')
        client = InsecureClient('http://10.41.158.65:50070', user='******')
        # path = "/P8AOI"
        # path1 = "C:/Users/z18073048/Desktop/bigdata/X1778-ANSI-BOT_20200813_TB1-F11-TRI-05@20200813094718-FPW03354EX3P49WBS.JPG"
        # client.upload(path, path1, cleanup=True)
        folderlist = client.list(basepath)
        newmodel = getnewmodel()
        for i in range(len(folderlist)):
            if isinstance(folderlist[i], unicode):
                folderlist[i] = folderlist[i].decode('string_escape')
            fname = folderlist[i]
            # if (fname == 'X1777' or fname == 'X1778' or fname == 'Errormodel'):
            if fname in newmodel:
                folderlist1 = client.list(basepath + '/' + fname)
                print 'newmodel:', folderlist[i], folderlist1
                # Use a separate index so the outer loop variable is not
                # shadowed, and check for unicode as in the loop above.
                for j in range(len(folderlist1)):
                    if isinstance(folderlist1[j], unicode):
                        folderlist1[j] = folderlist1[j].decode('string_escape')
                    date_flag = is_valid_date(folderlist1[j])
                    if date_flag == 'true':
                        folderItem = datetime.datetime.strptime(
                            folderlist1[j], '%Y%m%d')
                        # Delete folders older than one year.
                        if folderItem + datetime.timedelta(days=365) <= now_date:
                            paths = basepath + fname + '/' + folderlist1[j]
                            delHbase(folderlist1[j], client, paths)
                            deleteKudu(folderlist1[j], client, paths)
                            try:
                                client.delete(paths, recursive=True)
                                print paths + ' is deleted'
                            except Exception as e:
                                print e
    except Exception as e:
        print e
def orchestrationTraining():
    # Clear the working directories on HDFS.
    hdfs_cli = InsecureClient('http://192.168.1.4:9870', user='******')
    hdfs_cli.delete('/images', recursive=True)
    hdfs_cli.delete('/images_augmented', recursive=True)
    hdfs_cli.delete('/images_crop', recursive=True)
    hdfs_cli.delete('/images_norm', recursive=True)

    data = request.get_json()
    url = data["url_db"]
    classifiers = data["classifiers"]
    list_algo = []
    for algo in classifiers:
        if algo not in list_algo_deep and algo not in list_algo_ml:
            return algo + ' is an incorrect algo.'
        list_algo.append(algo)

    orch = Orchestration(url, list_algo)
    list_returns_trains = orch.run()

    # Assemble the per-algorithm results into a single JSON object.
    string_result = '{ "returns_trains": {'
    for i in range(len(list_returns_trains)):
        string_result += list_returns_trains[i]
        if i == len(list_returns_trains) - 1:
            string_result += '}}'
        else:
            string_result += ','
    return json.loads(string_result)
def launcher(self):
    """
    Send the remove-checkpoints task.
    """
    # Connect.
    client = InsecureClient('http://{ip}:{port}'.format(
        ip=self.namenode_ip, port=self.namenode_port), user=self.file_user)

    # Current timestamp in milliseconds, minus one hour.
    timenow = calendar.timegm(datetime.datetime.now().timetuple())
    unix_timestamp = int(timenow * 1000)
    onehour = 3600000
    todelete = int(unix_timestamp - onehour)

    message = None
    for directory in self.directories:
        # List each directory's entries along with their statuses.
        fnames = client.list(directory, status=True)
        for fname in fnames:
            ctime = fname[1]['modificationTime']
            if ctime <= todelete:
                dirtodelete = fname[1]['pathSuffix']
                client.delete('{directory}/{dirtodelete}'.format(
                    directory=directory, dirtodelete=dirtodelete),
                    recursive=True)
                l.info('Removing {dir} ...Removed!'.format(dir=dirtodelete))
                # list.append() returns None, so record the list itself.
                self.deleteddirs.append(dirtodelete)
                message = self.deleteddirs
            else:
                l.info('Nothing to remove into {directory}. Bye bye!'.format(
                    directory=directory))

    if message:
        stdout = message
    else:
        stdout = 'No directories were deleted.'
    return {'Deleted directories': stdout}
def download_file(path, test_case_number, task_number):
    try:
        client = InsecureClient(
            'http://' + HADOOP_HOST_NAME + ':' + HADOOP_NAMENODE_PORT_NUMBER,
            user=HADOOP_USER_NAME)
    except Exception:
        print("Error connecting to hdfs client")
        return
    try:
        client.download(
            HADOOP_OUTPUT_PATH + task_number + test_case_number + "/",
            os.path.join(path, test_case_number))
    except Exception as e:
        print(e)
        print("Error downloading output file from hdfs")
        return
    try:
        client.delete(HADOOP_OUTPUT_PATH + task_number + test_case_number,
                      recursive=True)
    except Exception:
        print("Error deleting hdfs output directory")
        return
def full_load(tables, cur):
    for table in tables:
        tableName = table
        ts = datetime.datetime.now().strftime('_%Y%m%d_%H%M%S')

        # Export the table to a CSV on the Greenplum host.
        query = ("COPY (SELECT * FROM " + tableName + ") TO '/tmp/" +
                 tableName + "_FL" + ts + ".csv'")
        cur.execute(query)

        # Fetch the CSV over SFTP.
        ssh = paramiko.SSHClient()
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        ssh.connect('172.16.6.89', username='******', password='******')
        ftp = ssh.open_sftp()
        ftp.get("/tmp/" + tableName + "_FL" + ts + ".csv",
                "Gp/" + tableName + "_FL" + ts + ".csv")
        ftp.close()

        # Connect to Hadoop and replace the table's directory.
        client = InsecureClient('http://172.16.4.144:50070', user='******')
        client.delete("/user/root/greenplum/source/" + tableName, True)
        client.makedirs("/user/root/greenplum/source/" + tableName, "0777")
        client.upload(
            "/user/root/greenplum/source/" + tableName + "/",
            "F:/Srilatha/Attunity-POC/Greenplum/Gp/" + tableName + "_FL" + ts + ".csv")

        # Record the load in the control table.
        sql = "INSERT INTO control_table(table_name) VALUES(%s);"
        cur.execute(sql, (tableName,))
        connection.commit()
def delete_directory(self, directory_url):
    web_hdfs_url = Environment().get_web_hdfs_url()
    session = SwSessionManager().get_session()
    user_name = session.get_username()
    client = InsecureClient(web_hdfs_url, user_name)
    try:
        directory_name_with_path = urllib3.util.parse_url(directory_url).path
        logger.log_info(
            "Deleting the directory {}".format(directory_name_with_path))
        response = client.delete(directory_name_with_path, recursive=True)
        if not response:
            raise ServiceError("Directory {0} doesn't exist".format(
                directory_name_with_path))
        return
    except Exception as e:
        raise ServiceError(
            "Deleting the folder from HDFS failed with the error: {0}".format(
                str(e)))
def orchestrationPrediction():
    # Clear the working directories on HDFS.
    hdfs_client = InsecureClient('http://192.168.1.4:9870', user='******')
    hdfs_client.delete('/image_test', recursive=True)
    hdfs_client.delete('/image_test_crop', recursive=True)
    hdfs_client.delete('/image_test_ready', recursive=True)

    if request.files['picture'] is None:
        return json.dumps(None)
    picture = request.files['picture']

    # Stream the uploaded picture straight into HDFS.
    with hdfs_client.write('/image_test/test.jpg') as writer:
        picture.save(writer)
    data = pd.DataFrame(['test.jpg'], columns=['Path'])
    with hdfs_client.write('/image_test/data.csv',
                           encoding='utf-8') as writer:
        data.to_csv(writer, index_label='index')

    classifiers = request.form.getlist('classifiers')
    list_algo = []
    for algo in classifiers[0].split(','):
        if algo not in list_algo_deep and algo not in list_algo_ml:
            return algo + ' is an incorrect algo.'
        list_algo.append(algo)

    orchPred = OrchestrationPrediction('test.jpg', list_algo)
    list_returns_predict = orchPred.run()

    data = {}
    data['returns_predictions'] = {}
    for res in list_returns_predict:
        key = list(res.keys())[0]
        data['returns_predictions'][key] = res[key]
    return json.dumps(data)
def hadoop_load(self):
    # Dump data from the local file system into HDFS; if a file already
    # exists, delete it and retry the upload.
    client_hdfs = InsecureClient('http://localhost:50070', user="******")
    for name in ('ethfinal68.csv', 'btcfinal68.csv', 'ltcfinal68.csv'):
        local_path = "/home/student/Pied Piper/" + name
        try:
            client_hdfs.upload('/', local_path)
        except Exception:
            client_hdfs.delete(hdfs_path='/' + name + '/', recursive=True)
            client_hdfs.upload('/', local_path)
# -*- coding: utf-8 -*-
#
# Copyright © 2018 white <*****@*****.**>
#
# Distributed under terms of the MIT license.
"""
https://hdfscli.readthedocs.io/en/latest/api.html#module-hdfs.client
"""
from hdfs import InsecureClient

hdfs_url = "http://192.168.30.125:50070"
hdfs_user = "******"

c = InsecureClient(hdfs_url, user=hdfs_user)
c.write("/test_write", data="string")
c.delete("/test_write")
c.makedirs("/new/path")  # created recursively, like mkdir -p

with c.read("f.txt", encoding="utf-8") as f:
    content = f.read()

c.write("/test.txt", "test string")
# Import the hdfs library.
from hdfs import InsecureClient

# Log in to the HDFS server.
client = InsecureClient('http://master32:50070', user='******')

# Print the contents of the HDFS root folder.
print(client.list('/'))

path = '/test/aaa.txt'

# Delete the file if it already exists.
if client.content(path, strict=False) is not None:
    client.delete(path)

print("START TO WRITE FILE")
# Write a text file to HDFS.
with client.write(path, encoding='utf-8') as writer:
    for i in range(10):
        writer.write("Hello World\n")
print("DONE")

print("START TO READ FILE")
# Read a text file from HDFS.
with client.read(path, chunk_size=8096) as reader:
    for chunk in reader:
        print(chunk)
# ==== Writing a DataFrame to HDFS =====
with client_hdfs.write('/user/hdfs/wiki/helloworld.csv',
                       encoding='utf-8') as writer:
    df.to_csv(writer)

# ====== Reading files ======
with client_hdfs.read('/user/hdfs/wiki/helloworld.csv',
                      encoding='utf-8') as reader:
    df = pd.read_csv(reader, index_col=0)

# ==== Getting a content summary ====
client_hdfs.content('hdfs_path')

# ==== Removing a directory or file in HDFS ====
client_hdfs.delete('hdfs_path', recursive=False, skip_trash=True)

# ==== Creating a directory ====
client_hdfs.makedirs('hdfs_path', permission=None)

# ==== Uploading a file into HDFS ====
client_hdfs.upload('hdfs_path', 'local_path', n_threads=1, temp_dir=None,
                   chunk_size=65536, progress=None, cleanup=True,
                   overwrite=True)

# Source: https://hdfscli.readthedocs.io/en/latest/api.html#module-hdfs.client
class HDFSStorage(Storage):
    """
    HDFS storage
    """

    def fix_slashes(self, path):
        sep = os.path.sep
        if path[0] != sep:
            path = sep + path
        if path[-1] != sep:
            path = path + sep
        return path

    def __init__(self, location=None, base_url=None):
        self.hdfs_hosts = settings.HDFS_STORAGE['hosts']
        self.hdfs_root = self.fix_slashes(settings.HDFS_STORAGE['root'])
        self.media_root = settings.MEDIA_ROOT
        self.media_url = self.fix_slashes(settings.MEDIA_URL)
        self.fetch_url = '%s/webhdfs/v1%s%%s?op=OPEN' % (
            self.hdfs_hosts.split(',')[0], self.hdfs_root)
        self.client = InsecureClient(self.hdfs_hosts)

    def _open(self, name, mode='rb'):
        local_path = os.path.join(settings.MEDIA_ROOT,
                                  name.replace('/', os.path.sep))
        if not os.path.exists(local_path):
            remote_path = self.path(name)
            local_dir = os.path.dirname(local_path)
            if not os.path.exists(local_dir):
                os.makedirs(local_dir)
            print(self.client.download(remote_path, local_path=local_path,
                                       overwrite=True,
                                       temp_dir=tempfile.gettempdir()))
        return File(open(local_path, mode))

    def _save(self, name, content):
        print("_save(%s, %s, %s)" % (self, name, content))
        local_path = content.name
        hdfs_path = self.path(name)  # os.path.basename(local_path))
        print(hdfs_path, local_path)
        self.client.write(hdfs_path, data=content, overwrite=True)
        return name

    def url(self, name):
        return self.fetch_url % name

    def delete(self, name):
        return self.client.delete(self.path(name))

    def listdir(self, path):
        file_list = []
        dir_list = []
        for name, status in self.client.list(self.path(path), status=True):
            if status['type'] == 'DIRECTORY':
                dir_list.append(name)
            elif status['type'] == 'FILE':
                file_list.append(name)
        return dir_list, file_list

    def size(self, name):
        return self.client.status(self.path(name))['length']

    def exists(self, name):
        try:
            return True if self.client.status(self.path(name)) else False
        except HdfsError:
            return False

    def path(self, name):
        return (self.hdfs_root + name).replace('\\', '/')
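# Settings sketch (assumption): the Django settings keys HDFSStorage reads;
# the values are placeholders. Note the hdfs client itself accepts a
# semicolon-separated URL list for HA namenodes.
HDFS_STORAGE = {
    'hosts': 'http://namenode.example.com:50070',
    'root': '/media/',
}
MEDIA_ROOT = '/var/www/media'
MEDIA_URL = '/media/'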
# Steps 5-9 of the activity; the client connection and the open local file
# are assumed to have been created in the earlier steps, e.g.:
# client = InsecureClient('http://localhost:50070', user='hadoop')
# f = open('./RandomText.txt')
import posixpath as psp

for line in f:
    temp = line
    # 5. Append the local file RandomText.txt to the end of the HDFS file:
    #    /activity1/data/NotSoRandomText.txt
    client.write(hdfs_path='/activity1/data/NotSoRandomText.txt',
                 data=temp, append=True)

# 6. List the disk space used by the directory /activity1/data/
diskSpaceUsed = client.content('/activity1/data/', strict=True)
print(diskSpaceUsed['spaceConsumed'])

# 7. Put the local file MoreRandomText.txt into HDFS as the path:
#    /activity1/data/MoreRandomText.txt
client.upload(hdfs_path='/activity1/data/', local_path='./MoreRandomText.txt')
print(client.list('/activity1/data'))

# 8. Recursively list the contents of the directory /activity1/
fnames = client.list('/activity1')
fpaths = [
    psp.join(dpath, fname)
    for dpath, _, fnames in client.walk('/activity1')
    for fname in fnames
]
print(fpaths)

# 9. Remove the directory /activity1/ and all files/directories underneath it
client.delete(hdfs_path='/activity1', recursive=True)
print(client.list('/'))
print('End')
def remove_in_hdfs(hdfs_path):
    client = InsecureClient('http://quickstart.cloudera:50070', user='******')
    client.delete(hdfs_path, recursive=True)
from hdfs import InsecureClient

# Reset all working directories used by the training and prediction pipelines.
hdfs_cli = InsecureClient('http://192.168.1.4:9870', user='******')
hdfs_cli.delete('/images', recursive=True)
hdfs_cli.delete('/images_augmented', recursive=True)
hdfs_cli.delete('/images_crop', recursive=True)
hdfs_cli.delete('/images_norm', recursive=True)
hdfs_cli.delete('/image_test', recursive=True)
hdfs_cli.delete('/image_test_crop', recursive=True)
hdfs_cli.delete('/image_test_ready', recursive=True)
hdfs_cli.delete('/algo_trained', recursive=True)
from hdfs import InsecureClient
import os

client = InsecureClient("http://localhost:9870", user='******')
client.delete("streamInput/area", True)
client.makedirs("streamInput/area")
# os.removedirs('file')
import os
import tempfile
from os.path import join, isfile
from urllib.parse import urlsplit, urlunparse, urljoin

from hdfs import InsecureClient


class HadoopFileSystem():
    def __init__(self, url, user):
        u = urlsplit(url)
        if u.scheme != 'http' and u.scheme != 'https':
            raise ValueError("Invalid name node address")
        self.url = urlunparse((u.scheme, u.netloc, '', '', '', ''))
        self.client = InsecureClient(self.url, user=user)
        self.localdir = u.path
        self.prefix = 'HDFS'

    def normalize_path(self, path):
        path = os.path.normpath(path)
        path = self.strip_prefix(path)
        while path and path[0] == os.sep:
            path = path[1:]
        return os.path.join(self.localdir, path)

    def strip_prefix(self, path):
        return path[len(self.prefix):] if path.startswith(self.prefix) else path

    def strip_root(self, path):
        path = self.strip_prefix(path)
        if path.startswith(self.url):
            path = path[len(self.url):]
        if not path.startswith(self.localdir):
            # Raising a string is invalid in Python 3; raise an exception.
            raise ValueError(
                'Invalid hdfs path. It must start with the root directory')
        return path[len(self.localdir):] if path.startswith(
            self.localdir) else path

    def create_folder(self, path):
        try:
            path = self.normalize_path(path)
            self.client.makedirs(path)
        except Exception:
            return None
        return path

    def remove(self, path):
        try:
            path = self.normalize_path(path)
            if self.client.status(path, False) is not None:
                self.client.delete(path, True)
        except Exception as e:
            print(e)

    def rename(self, oldpath, newpath):
        try:
            oldpath = self.normalize_path(oldpath)
            newpath = self.normalize_path(newpath)
            self.client.rename(oldpath, newpath)
        except Exception as e:
            print(e)

    def get_files(self, path):
        path = self.normalize_path(path)
        files = []
        for f in self.client.list(path):
            status = self.client.status(join(path, f), False)
            if status['type'] != "DIRECTORY":
                files.append(f)
        return files

    def get_folders(self, path):
        path = self.normalize_path(path)
        folders = []
        for f in self.client.list(path):
            status = self.client.status(join(path, f), False)
            if status['type'] == "DIRECTORY":
                folders.append(f)
        return folders

    def exists(self, path):
        path = self.normalize_path(path)
        status = self.client.status(path, False)
        return status is not None

    def isdir(self, path):
        path = self.normalize_path(path)
        status = self.client.status(path, False)
        return status['type'] == "DIRECTORY"

    def isfile(self, path):
        path = self.normalize_path(path)
        status = self.client.status(path, False)
        return status['type'] == "FILE"

    def read(self, path):
        path = self.normalize_path(path)
        with self.client.read(path) as reader:
            return reader.read().decode('utf-8')

    def write(self, path, content):
        path = self.normalize_path(path)
        self.client.write(path, content)

    def make_json(self, path):
        normalized_path = self.normalize_path(path)
        data_json = {
            'path': urljoin(self.url, normalized_path),
            'text': os.path.basename(path)
        }
        status = self.client.status(normalized_path, False)
        if status is not None:
            data_json['folder'] = status['type'] == "DIRECTORY"
            if status['type'] == "DIRECTORY":
                data_json['nodes'] = [
                    self.make_json(os.path.join(path, fn))
                    for fn in self.client.list(normalized_path)
                ]
        return data_json

    def save_upload(self, file, fullpath):
        localpath = os.path.join(tempfile.gettempdir(),
                                 os.path.basename(fullpath))
        if os.path.isfile(localpath):
            os.remove(localpath)
        try:
            file.save(localpath)
            if isfile(fullpath):
                fullpath = os.path.dirname(fullpath)
            self.client.upload(self.normalize_path(fullpath), localpath, True)
        except Exception:
            pass

    def download(self, path):
        path = self.normalize_path(path)
        status = self.client.status(path, False)
        if status is not None and status['type'] == "FILE":
            localpath = os.path.join(tempfile.gettempdir(),
                                     os.path.basename(path))
            return self.client.download(path, localpath, True)
        else:
            return None
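# Usage sketch (assumption): the URL embeds the HDFS working directory in its
# path, which __init__ stores as localdir; host and paths are placeholders.
fs = HadoopFileSystem('http://namenode.example.com:50070/user/demo', 'demo')
fs.create_folder('HDFS/incoming')
fs.write('HDFS/incoming/hello.txt', 'hello')
print(fs.read('HDFS/incoming/hello.txt'))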
class HDFSLibrary:
    """
    Test library for working with HDFS.
    """
    WEB_HDFS_URL = ""
    client = ""

    def __init__(self, namenode="localhost", port="50070"):
        self.WEB_HDFS_URL = 'http://' + namenode + ':' + str(port)
        print(namenode, ">>", port, ">>", self.WEB_HDFS_URL)
        self.client = InsecureClient(self.WEB_HDFS_URL)

    def check_hdfs_file_exists(self, file_path, stop=False):
        if self.client.status(file_path, strict=False) is None:
            if stop:
                print("ERROR: File does not exist: ", file_path)
                return "ERROR: File does not exist: ", file_path
                # exit(172)
            return False
        return True

    def get_hdfs_file_content(self, file_path):
        self.check_hdfs_file_exists(file_path, stop=True)
        data = ""
        with self.client.read(file_path) as reader:
            for line in reader:
                data += line
        return data

    def search_string_in_hdfs_file(self, file_path, text1,
                                   text2="aqwszx", text3="xzswqa"):
        ret = self.check_hdfs_file_exists(file_path, stop=True)
        found = "" if ret else ret
        with self.client.read(file_path) as reader:
            for line in reader:
                if line.find(text1) == -1 and line.find(text2) == -1 \
                        and line.find(text3) == -1:
                    continue
                found += line
        return found

    def hdfs_file_should_not_contain(self, file_path, text1,
                                     text2="aqwszx", text3="xzswqa"):
        self.check_hdfs_file_exists(file_path, stop=True)
        with self.client.read(file_path) as reader:
            for line in reader:
                if line.find(text1) != -1 or line.find(text2) != -1 \
                        or line.find(text3) != -1:
                    return False
        return True

    ########################
    #
    # BASIC FUNCTIONS:
    #
    ########################

    def get_hdfs_file_folder_content_summary(self, file_path):
        """
        Retrieve a file or folder content summary.
        :return: a file or folder content summary.
        """
        self.check_hdfs_file_exists(file_path, stop=True)
        return self.client.content(file_path)

    def get_hdfs_file_folder_status(self, file_path):
        """
        Retrieve a file or folder status.
        :return: a file or folder status.
        """
        self.check_hdfs_file_exists(file_path, stop=True)
        return self.client.status(file_path)

    def list_hdfs_directory(self, folder_path):
        """
        List all files inside a directory.
        :return: a file list.
        """
        self.check_hdfs_file_exists(folder_path, stop=True)
        return self.client.list(folder_path)

    def move_hdfs_file(self, old_path, new_path):
        """
        Rename ("move") a file.
        """
        self.check_hdfs_file_exists(old_path, stop=True)
        self.client.rename(old_path, new_path)

    def delete_hdfs_file(self, file_path):
        """
        Delete a file or folder recursively.
        :return: `True` if the deletion was successful, otherwise `False`.
        """
        self.check_hdfs_file_exists(file_path)
        return self.client.delete(file_path, recursive=True)

    def copy_to_local_hdfs_file(self, hdfs_path, local_path):
        """
        Copy a file or folder from HDFS to local.
        :return: local_path
        """
        self.check_hdfs_file_exists(hdfs_path)
        return self.client.download(hdfs_path, local_path,
                                    overwrite=True, n_threads=4)

    def copy_from_local_hdfs_file(self, local_path, hdfs_path):
        """
        Copy a file or folder from local to HDFS.
        :return: hdfs_path
        """
        return self.client.upload(hdfs_path, local_path,
                                  overwrite=True, n_threads=4)

    def get_hdfs_file_checksum(self, file_path):
        """
        Get the checksum value for a file.
        :return: checksum
        """
        self.check_hdfs_file_exists(file_path, stop=True)
        return self.client.checksum(file_path)

    def create_hdfs_dir(self, dir_path, perm=755):
        """
        Create a directory (or recursive directories) on HDFS.
        """
        self.client.makedirs(dir_path, permission=perm)
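# Usage sketch (assumption): plain-Python use of the keyword library with
# placeholder host, port, and paths.
lib = HDFSLibrary(namenode='namenode.example.com', port='50070')
lib.create_hdfs_dir('/tmp/hdfslib_demo')
lib.copy_from_local_hdfs_file('data.csv', '/tmp/hdfslib_demo/data.csv')
print(lib.list_hdfs_directory('/tmp/hdfslib_demo'))
print(lib.delete_hdfs_file('/tmp/hdfslib_demo'))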
import io

# For the data lake
from hdfs import InsecureClient
# For the data warehouse
from pyhive import hive

import pandas as pd

df_source = pd.read_csv(r'output/price.csv')

# Define the HDFS interface.
hdfs_interface = InsecureClient('http://localhost:50070')
hdfs_interface.list('/')

# Delete old data.
hdfs_interface.delete('/wqd7005/raw_price', recursive=True, skip_trash=True)

# Create HDFS directories to store the data.
hdfs_interface.makedirs('/wqd7005')
hdfs_interface.makedirs('/wqd7005/raw_price')
hdfs_interface.list('/wqd7005')

# Write data to the raw_price directory via a text buffer.
s_buf = io.StringIO()
# Save the data frame to the buffer (same as with a regular file).
df_source.to_csv(s_buf, index=False, header=False)
hdfs_interface.write('/wqd7005/raw_price/000000_0', data=s_buf.getvalue())
# Check whether there is data for a prediction.
client_hdfs = InsecureClient('http://awscdh6-ma.sap.local:9870', user='******')
hdfs_content = client_hdfs.list('/tmp/tbr/BARMER/XSA')
print(hdfs_content)
print()

if len(hdfs_content) > 0 and hdfs_content[0] == 'iris.csv':
    print('Starte Prediction')

    # Source of the R script.
    source_path = 'https://github.com/JimKnopfSun/BARMER_XSA.git'
    # Destination of the R script on XSA.
    target_path = '/usr/sap/HN2/home/testdir/'

    # Clear old script downloads on XSA.
    shutil.rmtree(path=target_path + "/BARMER_XSA",
                  ignore_errors=True, onerror=None)

    # Clone the R script to XSA.
    git_clone(source_path, target_path)

    # Run the R script.
    r = robjects.r
    _ = r.source(target_path + "/BARMER_XSA/sample.R")

    # Remove the data from HDFS.
    client_hdfs.delete("/tmp/tbr/BARMER/XSA/iris.csv")
class HadoopFileSystem(object):
    def __init__(self, *opts):
        self.client = InsecureClient(current_app.config['WEBHDFS_ADDR'],
                                     user=current_app.config['WEBHDFS_USER'])

    # def make_tree(self, datasourceid, client, path):
    #     tree = dict(name=(os.path.basename(path),
    #                       datasourceid + os.path.sep + path), children=[])
    #     try:
    #         lst = client.list(path, status=True)
    #     except:
    #         pass  # ignore errors
    #     else:
    #         for fsitem in lst:
    #             fn = os.path.join(path, fsitem[0])
    #             if fsitem[1]['type'] == "DIRECTORY":
    #                 tree['children'].append(
    #                     make_hdfs_tree(datasourceid, client, fn))
    #             else:
    #                 tree['children'].append(
    #                     {'name': (fsitem[0], datasourceid + os.path.sep + fn),
    #                      'children': []})
    #     return tree

    def make_json(self, datasourceid, base, relative_path):
        path = os.path.join(base, relative_path)
        data_json = {
            'datasource': datasourceid,
            'path': relative_path,
            'name': os.path.basename(relative_path)
        }
        status = self.client.status(path, False)
        if status is not None:
            if status['type'] == "DIRECTORY":
                data_json['type'] = DataType.Folder
                data_json['children'] = [
                    self.make_json(datasourceid, base,
                                   os.path.join(relative_path, fn))
                    for fn in self.client.list(path)
                ]
            else:
                data_json['type'] = DataType.File
        return data_json

    def makedirs(self, path):
        try:
            self.client.makedirs(path)
        except Exception:
            return None
        return path

    def delete(self, path):
        try:
            if self.client.status(path, False) is not None:
                self.client.delete(path, True)
        except Exception as e:
            print(e)

    def addfolder(self, path):
        # Find the first unused "New Folder (i)" name; the loop must continue
        # while the candidate name exists (status is not None), not while it
        # is free.
        i = 0
        while self.client.status(
                os.path.join(path, "New Folder ({0})".format(i)),
                False) is not None:
            i += 1
        return self.makedirs(os.path.join(path, "New Folder ({0})".format(i)))

    def rename(self, oldpath, newpath):
        try:
            self.client.rename(oldpath, newpath)
        except Exception as e:
            print(e)

    def saveUpload(self, file, fullpath):
        localpath = os.path.join(tempfile.gettempdir(),
                                 os.path.basename(fullpath))
        if os.path.isfile(localpath):
            os.remove(localpath)
        try:
            file.save(localpath)
            self.client.upload(os.path.dirname(fullpath), localpath, True)
        except Exception:
            pass

    def download(self, fullpath):
        status = self.client.status(fullpath, False)
        if status is not None and status['type'] == "FILE":
            localpath = os.path.join(tempfile.gettempdir(),
                                     os.path.basename(fullpath))
            return self.client.download(fullpath, localpath, True)
        else:
            return None