def close(self):
    """Tear down the session's temporary state.

    Drops every table, UDF and UDA in the temporary database, drops the
    database itself, then makes a best-effort attempt to remove the
    temporary HDFS directory over WebHDFS.  If the WebHDFS client cannot
    be imported or reached, a warning is written to stderr instead of
    raising, and the directory must be removed manually.
    """
    cursor = self._cursor
    temp_db = self._temp_db

    cursor.execute('USE %s' % temp_db)
    # (listing statement, column holding the object name, drop template)
    drop_plan = [
        ('SHOW TABLES', 0, 'DROP TABLE IF EXISTS %s.%s'),
        ('SHOW FUNCTIONS', 1, 'DROP FUNCTION IF EXISTS %s.%s'),
        ('SHOW AGGREGATE FUNCTIONS', 1, 'DROP AGGREGATE FUNCTION IF EXISTS %s.%s'),
    ]
    for list_stmt, name_col, drop_template in drop_plan:
        cursor.execute(list_stmt)
        names = [row[name_col] for row in cursor.fetchall()]
        for name in names:
            cursor.execute(drop_template % (temp_db, name))

    # leave the temp database before dropping it
    cursor.execute('USE default')
    cursor.execute('DROP DATABASE IF EXISTS %s' % temp_db)

    # best-effort removal of the temp dir in HDFS
    try:
        from requests.exceptions import ConnectionError
        from pywebhdfs.webhdfs import PyWebHdfsClient
        webhdfs = PyWebHdfsClient(host=self._nn_host,
                                  port=self._webhdfs_port,
                                  user_name=self._hdfs_user)
        # WebHDFS paths are given without the leading slash
        webhdfs.delete_file_dir(self._temp_dir.lstrip('/'), recursive=True)
    except ImportError:
        import sys
        sys.stderr.write("Could not import requests or pywebhdfs. "
                         "You must delete the temporary directory manually: %s" % self._temp_dir)
    except ConnectionError:
        import sys
        sys.stderr.write("Could not connect via pywebhdfs. "
                         "You must delete the temporary directory manually: %s" % self._temp_dir)
class WhenTestingDeleteOperation(unittest.TestCase):
    """Tests for PyWebHdfsClient.delete_file_dir HTTP handling."""

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        # Fix: the original assigned self.response = MagicMock() twice; the
        # second assignment created a fresh mock that was no longer the one
        # wired into self.requests' return_value.  A single mock is created
        # and shared instead.
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs/old_dir'

    def test_rename_throws_exception_for_not_ok(self):
        # a 400 from the namenode must surface as PyWebHdfsException
        self.response.status_code = http_client.BAD_REQUEST
        self.requests.delete.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.delete_file_dir(self.path)

    def test_rename_returns_true(self):
        # a 200 means the delete succeeded and the client reports truthy
        self.response.status_code = http_client.OK
        self.requests.delete.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.delete_file_dir(self.path)
        self.assertTrue(result)
class WhenTestingDeleteOperation(unittest.TestCase):
    """Tests for PyWebHdfsClient.delete_file_dir HTTP handling (httplib variant)."""

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        # Fix: the original assigned self.response = MagicMock() twice; the
        # second assignment created a fresh mock that was no longer the one
        # wired into self.requests' return_value.  A single mock is created
        # and shared instead.
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs/old_dir'

    def test_rename_throws_exception_for_not_ok(self):
        # a 400 from the namenode must surface as PyWebHdfsException
        self.response.status_code = httplib.BAD_REQUEST
        self.requests.delete.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.delete_file_dir(self.path)

    def test_rename_returns_true(self):
        # a 200 means the delete succeeded and the client reports truthy
        self.response.status_code = httplib.OK
        self.requests.delete.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.delete_file_dir(self.path)
        self.assertTrue(result)
def test_webhdfs_csv(self):
    """Round-trip a CSV through WebHDFS: mkdir, upload, recursive delete."""
    from pywebhdfs.webhdfs import PyWebHdfsClient

    client = PyWebHdfsClient(host='localhost', port='9870',
                             user_name='hadoop')
    client.make_dir("/temp")
    # stream the fixture file straight into HDFS, replacing any leftover copy
    with open("tests/data/data.csv") as source:
        client.create_file("/temp/data.csv", file_data=source, overwrite=True)
    # clean up everything the test created
    client.delete_file_dir("/temp", recursive=True)
def close(self):
    """Drop all temporary Impala/Hive objects and the temp HDFS directory.

    Cleanup order: tables, UDFs, UDAs inside the temp database, then the
    database itself, then a best-effort WebHDFS delete of the temp dir.
    Failures to import or reach WebHDFS are reported on stderr rather than
    raised, so callers are never blocked on HDFS cleanup.
    """
    # drop the temp database
    self._cursor.execute('USE %s' % self._temp_db)
    self._cursor.execute('SHOW TABLES')
    temp_tables = [x[0] for x in self._cursor.fetchall()]
    for table in temp_tables:
        self._cursor.execute('DROP TABLE IF EXISTS %s.%s' % (self._temp_db, table))
    # SHOW FUNCTIONS rows carry the name in column 1 (column 0 is the return type)
    self._cursor.execute('SHOW FUNCTIONS')
    temp_udfs = [x[1] for x in self._cursor.fetchall()]
    for udf in temp_udfs:
        self._cursor.execute('DROP FUNCTION IF EXISTS %s.%s' % (self._temp_db, udf))
    self._cursor.execute('SHOW AGGREGATE FUNCTIONS')
    temp_udas = [x[1] for x in self._cursor.fetchall()]
    for uda in temp_udas:
        self._cursor.execute('DROP AGGREGATE FUNCTION IF EXISTS %s.%s' % (self._temp_db, uda))
    # must leave the database before it can be dropped
    self._cursor.execute('USE default')
    self._cursor.execute('DROP DATABASE IF EXISTS %s' % self._temp_db)
    # drop the temp dir in HDFS (best effort; imports kept local so the
    # module works without requests/pywebhdfs installed)
    try:
        from requests.exceptions import ConnectionError
        from pywebhdfs.webhdfs import PyWebHdfsClient
        hdfs_client = PyWebHdfsClient(host=self._nn_host,
                                      port=self._webhdfs_port,
                                      user_name=self._hdfs_user)
        # WebHDFS expects the path without its leading slash
        hdfs_client.delete_file_dir(self._temp_dir.lstrip('/'), recursive=True)
    except ImportError:
        import sys
        sys.stderr.write("Could not import requests or pywebhdfs. "
                         "You must delete the temporary directory "
                         "manually: %s" % self._temp_dir)
    except ConnectionError:
        import sys
        sys.stderr.write("Could not connect via pywebhdfs. "
                         "You must delete the temporary directory "
                         "manually: %s" % self._temp_dir)
def write_data_to_hdfs(username, records):
    """Write `records` for `username` into HDFS, failing over between namenodes.

    Tries each of the first two configured namenodes in order; for each one it
    removes any stale job/result paths and writes the new job file.

    Returns:
        (True, None) on success;
        (False, details) on failure, where details maps "details_1" and
        "details_2" to the per-namenode error strings and "error" to a
        summary message (same keys as callers already expect).
    """
    global hdfs_namenodes
    to_return = {}
    file_path = "/jobs_to_do/" + username + ".txt"
    result_path = "/jobs_done/" + username
    logger.debug("Writing file " + file_path + " to HDFS")

    # Fix: the original duplicated the whole connect/erase/create sequence
    # once per namenode; the failover is now a single loop.  The diagnostic
    # keys details_1/details_2 are preserved for backward compatibility.
    for attempt, namenode in enumerate(hdfs_namenodes[:2], start=1):
        try:
            logger.debug("Trying to connect to " + namenode + " namenode")
            hdfs_client = PyWebHdfsClient(host=namenode, port='50070',
                                          user_name='xnet', timeout=100)
            logger.debug("Trying to erase " + file_path)
            logger.debug("Trying to erase " + result_path)
            hdfs_client.delete_file_dir(file_path, recursive=True)
            hdfs_client.delete_file_dir(result_path, recursive=True)
            hdfs_client.create_file(file_path, records.encode("utf-8"))
            return True, None
        except (ConnectionError, PyWebHdfsException) as ce:
            to_return["details_%d" % attempt] = str(ce)
            logger.debug("Failed connecting to " + namenode + " namenode")

    # every namenode failed
    to_return["error"] = "There was a problem while trying to connect to HDFS namenode."
    logger.debug(str(to_return))
    return False, to_return
#1 imports from pywebhdfs.webhdfs import PyWebHdfsClient #2 make connection with hadoop file system hdfs = PyWebHdfsClient(user_name="hdfs",port=50070,host="sandbox.hortonworks.com") hdfs.delete_file_dir('chapter5/LoanStats3d.csv',recursive=True) #4 recreate the chapters directory hdfs.make_dir('chapter5') #5 upload the csv file with open('./data/stored_csv.csv') as file_data: hdfs.create_file('chapter5/LoanStats3d.csv',file_data, overwrite=True) #6 print the status to see if this succeeded. print hdfs.get_file_dir_status('chapter5/LoanStats3d.csv')
class HDFS(object):
    """Thin wrapper around PyWebHdfsClient with path canonicalization.

    All remote paths are passed through the module-level canonicalize()
    helper before hitting WebHDFS.  Python 2 code: uses the `file` builtin.
    """

    def __init__(self, host, port, user):
        # timeout=None: requests will wait indefinitely on the namenode
        self._hdfs = PyWebHdfsClient(host=host, port=port, user_name=user, timeout=None)
        logging.debug('webhdfs = %s@%s:%s', user, host, port)

    def recursive_copy(self, local_path, remote_path, exclude=None, permission=755):
        """Mirror a local directory tree into HDFS.

        Directories or file names listed in `exclude` are skipped.
        NOTE(review): `permission=755` is the decimal int 755, not octal —
        pywebhdfs passes it through as the literal permission string.
        """
        if exclude is None:
            exclude = []
        c_path = canonicalize(remote_path)
        logging.debug('making %s', c_path)
        self._hdfs.make_dir(c_path, permission=permission)
        fs_g = os.walk(local_path)
        for dpath, dnames, fnames in fs_g:
            # relative_path is the walk position below local_path
            _, relative_path = dpath.split(local_path)
            for dname in dnames:
                if dname not in exclude:
                    c_path = canonicalize('%s/%s/%s' % (remote_path, relative_path, dname))
                    logging.debug('making %s', c_path)
                    self._hdfs.make_dir(c_path, permission=permission)
            for fname in fnames:
                if fname not in exclude:
                    # Python-2 `file` builtin; opened binary, closed after upload
                    data = file(
                        canonicalize('%s/%s/%s' % (local_path, relative_path, fname)), 'rb')
                    c_path = canonicalize('%s/%s/%s' % (remote_path, relative_path, fname))
                    logging.debug('creating %s', c_path)
                    self._hdfs.create_file(c_path, data, overwrite=True, permission=permission)
                    data.close()

    def make_dir(self, path, permission=755):
        """Create a directory (and parents) at the canonicalized path."""
        logging.debug('make_dir: %s', path)
        self._hdfs.make_dir(canonicalize(path), permission=permission)

    def create_file(self, data, remote_file_path, permission=755):
        """Write `data` (bytes) to a remote file, overwriting if present."""
        logging.debug('create_file: %s', remote_file_path)
        sio = BytesIO(data)
        self._hdfs.create_file(canonicalize(remote_file_path), sio, overwrite=True,
                               permission=permission)

    def append_file(self, data, remote_file_path):
        """Append `data` to an existing remote file."""
        logging.debug('append to: %s', remote_file_path)
        self._hdfs.append_file(canonicalize(remote_file_path), data)

    def stream_file_to_disk(self, remote_file_path, local_file_path):
        """Download a remote file in 10 MiB chunks to a local path.

        A short read (len(data) < chunk_size) marks the final chunk.
        """
        chunk_size = 10 * 1024 * 1024
        offset = 0
        with open(local_file_path, 'wb') as dest_file:
            data = self._hdfs.read_file(canonicalize(remote_file_path),
                                        offset=offset, length=chunk_size)
            while True:
                dest_file.write(data)
                if len(data) < chunk_size:
                    break
                offset += chunk_size
                data = self._hdfs.read_file(canonicalize(remote_file_path),
                                            offset=offset, length=chunk_size)

    def read_file(self, remote_file_path):
        """Return the full contents of a remote file (loads into memory)."""
        data = self._hdfs.read_file(canonicalize(remote_file_path))
        return data

    def remove(self, path, recursive=False):
        """Delete a remote file or directory tree."""
        logging.debug('remove: %s', path)
        self._hdfs.delete_file_dir(canonicalize(path), recursive)

    def file_exists(self, path):
        """Return True if the (non-canonicalized) path exists in HDFS.

        NOTE(review): bare except treats any error — including connection
        failures — as "does not exist"; consider narrowing to FileNotFound.
        """
        try:
            self._hdfs.get_file_dir_status(path)
            return True
        except:
            return False
#1 imports from pywebhdfs.webhdfs import PyWebHdfsClient #2 make connection with hadoop file system hdfs = PyWebHdfsClient(user_name="hdfs", port=50070, host="sandbox.hortonworks.com") hdfs.delete_file_dir('chapter5/LoanStats3d.csv', recursive=True) #4 recreate the chapters directory hdfs.make_dir('chapter5') #5 upload the csv file with open('./data/stored_csv.csv') as file_data: hdfs.create_file('chapter5/LoanStats3d.csv', file_data, overwrite=True) #6 print the status to see if this succeeded. print hdfs.get_file_dir_status('chapter5/LoanStats3d.csv')
class HadoopFileSystem(BaseFs.FileSystem):
    """HDFS implementation of the BaseFs.FileSystem interface.

    Every operation maps the pywebhdfs / requests exception families onto
    the project's Errors hierarchy (FileNotFound, Unauthorized,
    BadConnection, FsException), logging each at info level first.  When
    self.simulateOnly is set, mutating operations only print the action
    they would have performed.
    """

    def __init__(self, vcPath, simulateOnly=False, isVerbose=False, logger=None,
                 user=None, host=None, port=None):
        BaseFs.FileSystem.__init__(self, simulateOnly, isVerbose, logger)
        # explicit user/host/port args take precedence; Config fills the rest
        config = Config.Config()
        hdfsUser, hdfsHost, hdfsPort = config.getHadoopConfig(user, host, port)
        self.hdfs = PyWebHdfsClient(host=hdfsHost, port=hdfsPort, user_name=hdfsUser)
        # root path this instance is allowed to operate under (validate_hdfs_arg)
        self.vcPath = vcPath

    def make_fd(self, path, isSrc, dstDirMustExist):
        """Build a HadoopFileDescriptor for `path`, mapping errors to Errors.*."""
        fd = None
        try:
            fd = HadoopFileDescriptor(self, path, isSrc, dstDirMustExist)
        except pywebhdfs.errors.FileNotFound:
            self.logger.info("DESC: does not exist: " + path)
            raise Errors.FileNotFound("Path {0} does not exist".format(path))
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(path, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(path, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(
                path, e))
            raise Errors.BadConnection(
                "Connection error while looking for path: {0}, exc={1}".format(
                    path, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                path, e))
            raise Errors.FsException(
                "An exception happened while looking for path: {0}, exc={1}".
                format(path, e))
        return fd

    def exists_file_dir(self, fd):
        """Return whether fd.abspath exists in HDFS (errors mapped to Errors.*)."""
        try:
            return self.hdfs.exists_file_dir(fd.abspath)
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(
                    fd.abspath, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.BadConnection(
                "Connection error during HDFS exists test: {0}, exc={1}".
                format(fd.abspath, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.FsException(
                "An exception happened during HDFS exists test: {0}, exc={1}".
                format(fd.abspath, e))

    def delete_file_dir(self, fd, recursive=False, force=False):
        """Delete fd.abspath; recursive deletes prompt unless `force` is set.

        NOTE(review): `status` is only assigned when the delete actually
        runs and is never returned — callers get None either way.
        """
        if self.simulateOnly:
            print("SIMULATE -> remove file/dir: {0}, recursive={1}".format(
                fd.abspath, recursive))
        else:
            try:
                # recursive deletes require confirmation unless forced
                if not recursive or force or \
                        query_yes_no(question="Are you sure you want to delete folder recursively?",
                                     default="no"):
                    status = self.hdfs.delete_file_dir(fd.abspath,
                                                       recursive=recursive)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS delete directory: {0}, exc={1}"
                    .format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS delete directory: {0}, exc={1}"
                    .format(fd.abspath, e))

    def list_dir(self, fd):
        """Yield a HadoopFileDescriptor per entry of the directory fd.

        NOTE(review): this is a generator — the listing (and its error
        mapping) only runs once iteration starts, not at call time.
        """
        try:
            status = self.hdfs.list_dir(fd.abspath)
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(
                    fd.abspath, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.BadConnection(
                "Connection error while looking for path: {0}, exc={1}".format(
                    fd.abspath, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.FsException(
                "An exception happened while looking for path: {0}, exc={1}".
                format(fd.abspath, e))
        currentDir = status["FileStatuses"]["FileStatus"]
        for item in currentDir:
            yield HadoopFileDescriptor(self,
                                       fd.abspath,
                                       isSrc=True,
                                       needsDstDirCheck=False,
                                       fileJson=item)

    def make_dir(self, path):
        """Create a directory at `path` (no-op print in simulate mode)."""
        if self.simulateOnly:
            print("SIMULATE -> make dir: " + path)
        else:
            try:
                self.hdfs.make_dir(path)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    path, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(path, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    path, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS create directory: {0}, exc={1}"
                    .format(path, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    path, e))
                raise Errors.FsException(
                    "An exception happened during HDFS create directory: {0}, exc={1}"
                    .format(path, e))

    def open_file(self, fd, rwMode):
        # HDFS access goes through the descriptor; nothing to open here
        return fd

    def close_file(self, fd):
        # see open_file: no per-file handle to release
        pass

    def touch_file(self, fd):
        """Create an empty file at fd.abspath, overwriting any existing one."""
        if self.simulateOnly:
            print("SIMULATE -> touch file: " + fd.abspath)
        else:
            try:
                # NOTE(review): file_data is the int 0 here, not b"" —
                # confirm pywebhdfs serializes this as an empty body.
                self.hdfs.create_file(fd.abspath, 0, overwrite=True)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS create file: {0}, exc={1}".
                    format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS create file: {0}, exc={1}"
                    .format(fd.abspath, e))

    def truncate_file(self, fd, size):
        """Truncate fd.abspath to `size` bytes."""
        if self.simulateOnly:
            print("SIMULATE -> truncate file: {0}, size={1}".format(
                fd.abspath, size))
        else:
            try:
                self.hdfs.truncate_file(fd.abspath, size)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS truncate file: {0}, exc={1}".
                    format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS truncate file: {0}, exc={1}"
                    .format(fd.abspath, e))

    def try_concat_files(self, fd, chunkFdList):
        # Workaround for unordered concat bug in Hadoop 2.7.1 is to use one source at the time
        # https://issues.apache.org/jira/browse/HDFS-8891
        currIndex = 0
        concatStep = 20
        # split sources into groups of 20 and concat group by group;
        # stop at the first failure and report how many sources were merged
        chunkedList = [
            chunkFdList[pos:pos + concatStep]
            for pos in range(0, len(chunkFdList), concatStep)
        ]
        for sourceChunk in chunkedList:
            try:
                self.concat_files(fd, sourceChunk)
                currIndex += len(sourceChunk)
            except Errors.FsException as e:
                break
        return currIndex

    def concat_files(self, fd, chunkFdList):
        """Concatenate every path in chunkFdList onto fd.abspath."""
        strList = list()
        for chunkFd in chunkFdList:
            strList.append(chunkFd.abspath)
        if self.simulateOnly:
            print("SIMULATE -> concat file: {0}, sources={1}".format(
                fd.abspath, ",".join(strList)))
        else:
            try:
                self.hdfs.concat_files(fd.abspath, strList)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS concat file: {0}, exc={1}".
                    format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS concat file: {0}, exc={1}"
                    .format(fd.abspath, e))

    def read_data(self, fd, offset, size):
        """Read up to `size` bytes at `offset`; returns "" past end of file."""
        if offset >= fd.size:
            return ""
        else:
            try:
                contents = self.hdfs.read_file(fd.abspath,
                                               offset=offset,
                                               length=size)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS read file: {0}, exc={1}".
                    format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS read file: {0}, exc={1}"
                    .format(fd.abspath, e))
            return contents

    def append_data(self, fd, data):
        """Append `data` to fd.abspath."""
        if self.simulateOnly:
            print("SIMULATE -> write file data: " + fd.abspath)
        else:
            try:
                self.hdfs.append_file(fd.abspath, data)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS append file: {0}, exc={1}".
                    format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS append file: {0}, exc={1}"
                    .format(fd.abspath, e))

    def local_mv_file(self, src, dst):
        """Rename/move src.abspath to dst.abspath within HDFS."""
        if self.simulateOnly:
            print("SIMULATE -> local move file: {0} -> {1} ".format(
                src.abspath, dst.abspath))
        else:
            try:
                self.hdfs.rename_file_dir(src.abspath, dst.abspath)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    src.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        src.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    src.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS rename file: {0}, exc={1}".
                    format(src.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    src.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS rename file: {0}, exc={1}"
                    .format(src.abspath, e))

    def local_cp_file(self, src, dst):
        # This is an open issue in Hadoop community: https://issues.apache.org/jira/browse/HDFS-3370
        # Instead, we can do a symbolic link
        if self.simulateOnly:
            print("SIMULATE -> local copy file: {0} -> {1} ".format(
                src.abspath, dst.abspath))
        else:
            print(
                "Copy within HDFS is not supported due to lack of Hadoop support"
            )
            print(
                "Once symbolic links are enabled, this feature will be enabled"
            )
            sys.exit(1)
            # self.hdfs.create_sym_link(src.abspath, dst.abspath, createParent=True)

    def get_hdfs_file_dir_json(self, path):
        """Return the FileStatus JSON for `path`, or None if it does not exist."""
        try:
            status = self.hdfs.get_file_dir_status(path)
            return status["FileStatus"]
        except pywebhdfs.errors.FileNotFound:
            return None

    def validate_hdfs_arg(self, arg):
        """Exit with an error unless `arg` lies under the permitted root vcPath."""
        if not arg.startswith(self.vcPath):
            print("Error: You don't have permissions to the path: %s" % arg)
            print("Your path must be rooted under: %s" % self.vcPath)
            sys.exit(1)
</inputs> <outputs> <data name="filtin" label="${process_id}_filterin" tag="filterin" format="csv" type="file"/> <data name="filtout" label="${process_id}_filterout" tag="filterout" format="csv" type="file"/> <data name="summ" label="${process_id}_summary" tag="filtering_summary" format="csv" type="file"/> </outputs> <help> This tool filters IMGT Summary Data based on a combination of criteria. </help> </tool>""" # create a new client instance hdfs = PyWebHdfsClient(host='83.212.112.144', port='50070', user_name='root') # delete existing file print('delete current file\n'.format(example_file)) hdfs.delete_file_dir(example_file,recursive=False) # create a new directory for the example #print('making new HDFS directory at: {0}\n'.format(example_dir)) #hdfs.make_dir(example_dir) # get a dictionary of the directory's status #dir_status = hdfs.get_file_dir_status(example_dir) #print(dir_status) # create a new file on hdfs print('making new file at: {0}\n'.format(example_file)) hdfs.create_file(example_file, example_data) #file_status = hdfs.get_file_dir_status(example_file) #print(file_status)
class DMS:
    """Document management facade over HBase (metadata/content), HDFS (files)
    and Solr (search).  Python 2 code (print statements, dict.keys() indexing).

    Files are versioned by key "v<version>.<filename>"; content and extracted
    metadata live in the HBase 'file' and 'meta_data' column families, the
    raw bytes are also stored under self.hdfs_path in HDFS.
    """

    def __init__(self, debug=0):
        '''
        This function use to init a class. To show an error messages debug should be 1.
        :param : debug - 1, show an error or success message. 0 otherwise
        :return: Nothing.
        '''
        self.debug = debug
        pass

    def hbase_connection(self, host, port, table='dms'):
        '''
        This function use to establish a connection to hbase, for preparing to
        insert, remove, fetch data from hbase. We use starbase for connect to
        hbase via rest api.(See more: https://github.com/barseghyanartur/starbase)
        :param : host - hbase rest host
        :param : port - hbase rest running port
        :param : table - DMS table on hbase (default: 'dms')
        :return: Nothing.
        '''
        self.hbase = hbaseConnection(host=host, port=port)
        t = self.hbase.table(table)
        # create the table with its two column families on first use
        if (not t.exists()):
            t.create('meta_data', 'file')
        self.hbase_table = t

    def hdfs_connection(self, host, port, user_name, hdfs_path='/tmp/'):
        '''
        This function use to establish a connection to hdfs, for preparing to
        create, retrieve, update, delete file in hdfs. We use pywebhdfs in order
        to do this task via hdfs rest api.(See more: http://pythonhosted.org/pywebhdfs/)
        :param : host - hdfs rest host
        :param : port - hdfs rest running port
        :param : user_name - hdfs username (for authentication)
        :param : hdfs_path - location to store files. (default: '/tmp/')
        :return: Nothing.
        '''
        self.hdfs = PyWebHdfsClient(host=host, port=port, user_name=user_name)
        self.hdfs_path = hdfs_path

    def solr_connection(self, host, port, collection):
        '''
        This function use to establish a connection to solr, for query or
        search any text on a system.
        :param : host - solr's host
        :param : port - solr's running port
        :param : collection - solr's collection for searching
        '''
        # stored as the base URL used by search()
        self.solr = ''.join(['http://', host, ':', port, '/solr/', collection])

    def extract(self, file):
        '''
        This function use to extract meta data from a file. We use hachoir3
        library to extract them.
        (See more: http://hachoir3.readthedocs.org)
        :param : file - file for extract
        :return: meta data as dict for success, 0 if fail.
        '''
        try:
            filename, realname = unicodeFilename(file), file
            parser = createParser(filename, realname)
            meta_data = extractMetadata(parser)
            meta_data_text = meta_data.exportPlaintext()
            meta_list = dict()
            # line 0 is the section header; each following line is "- key: value"
            for i in range(1, len(meta_data_text)):
                meta_split = meta_data_text[i].split(":")
                column = meta_split[0].replace('- ', '')
                value = meta_split[1].lstrip()
                meta_list.update({column: value})
            return meta_list
        except:
            # NOTE(review): bare except hides parser errors; docstring says
            # "0 if fail" but None is returned.
            if self.debug:
                print "Something went wrong, meta data of", file, "could not extract."
            return None

    def upload(self, file):
        '''
        This function use to uplaod a file to hdfs and store meta data on hbase
        Meta data consist of 2 main parts: file's meta data and hdfs's file's
        meta data. This function will increase a file version if it is already
        store in hbase.
        :param : file - file's name
        :return: True if success otherwise False.
        '''
        version = 1
        key = ''.join(['v', str(version), '.', file])
        path = ''.join([self.hdfs_path, key])
        # Read a file
        try:
            f = open(file, 'r')
            file_content = f.read()
            f.close()
        except:
            # NOTE(review): on read failure file_content stays unbound; the
            # later try block then fails with NameError and reports
            # "Upload failed." instead of the real cause.
            print "Cannot read file:", file
        # Check file's version
        while self.hbase_table.fetch(key) != None:
            version = int(self.get_lastest_version(file)) + 1
            key = ''.join(['v', str(version), '.', file])
            path = ''.join([self.hdfs_path, key])
        # Try to upload file.
        try:
            self.hdfs.create_file(path, file_content)
            hdfs_meta = self.hdfs.get_file_dir_status(path)['FileStatus']
            file_meta = self.extract(file)
            t = self.hbase_table
            status = t.insert(key, {'file': {'content': file_content}})
            if status != 200:
                if self.debug:
                    print "Error inserting: file content"
            # save hbase meta data
            for i in range(0, len(file_meta.keys())):
                status = t.insert(
                    key, {
                        'meta_data': {
                            file_meta.keys()[i]:
                            file_meta[file_meta.keys()[i]]
                        }
                    })
                if status != 200:
                    if self.debug:
                        print "Error inserting:", file_meta.keys()[i]
            # save hdfs meta data
            for i in range(0, len(hdfs_meta.keys())):
                status = t.insert(
                    key, {
                        'meta_data': {
                            hdfs_meta.keys()[i]:
                            hdfs_meta[hdfs_meta.keys()[i]]
                        }
                    })
                if status != 200:
                    if self.debug:
                        print "Error inserting:", hdfs_meta.keys()[i]
            # save version
            status = t.insert(key, {'meta_data': {'version': version}})
            if status != 200:
                if self.debug:
                    print "Error inserting: version"
        except:
            if self.debug:
                print "Upload failed."
            return False
        if self.debug:
            print "[Uploaded]", file, "version:", version
        return True

    def download(self, file, version=None, download_dir=''):
        '''
        This function use to retrieve or download file from hdfs. Then save it
        as a new file named (v[version].[file] - For example, v1.mytext.txt).
        You can specify the directory of downloaded file. You can also specify
        file's version for downloading if not it will be version 1.
        :param : file - file's name
        :param : version - file's version (default: 1)
        :param : download_dir - download directory (default: '' or current
                 directory NOTE: it must end with '/' - For example, '../download/')
        :return: True if success otherwise false.
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v', str(version), '.', file])
        path = ''.join([self.hdfs_path, key])
        downloaded_file = ''.join([download_dir, key])
        try:
            f = open(downloaded_file, 'w')
            f.write(self.hdfs.read_file(path))
            f.close()
        except:
            if self.debug:
                print "Cannot download a file:", file
            return False
        if self.debug:
            print "[Downloaded]", key
        return True

    def update(self, file, version=None):
        '''
        This function use to update file to hdfs and data stored in hbase by
        overwrite that file on hdfs, and also insert new data to hbase too.
        You can specify a file's version in order to update it.
        :param : file - file's name
        :param : version - file's version
        :return: True if success otherwise False.
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v', str(version), '.', file])
        path = ''.join([self.hdfs_path, key])
        # Read a file
        try:
            f = open(file, 'r')
            file_content = f.read()
            f.close()
        except:
            print "Cannot read file:", file
        # Try to upload file.
        try:
            # NOTE(review): passes `file` (the name) instead of file_content —
            # the HDFS copy ends up containing the filename, not the data.
            self.hdfs.create_file(path, file, overwrite=True)
            hdfs_meta = self.hdfs.get_file_dir_status(path)['FileStatus']
            file_meta = self.extract(file)
            # NOTE(review): `t` is never assigned in this method (missing
            # t = self.hbase_table), so every call raises NameError and is
            # swallowed by the bare except below -> always "Update failed."
            status = t.insert(
                key, {'file': {
                    'content': file_content,
                    'name': file
                }})
            if status != 200:
                if self.debug:
                    print "Error inserting: file content"
            # save hbase meta data
            for i in range(0, len(file_meta.keys())):
                status = t.insert(
                    key, {
                        'meta_data': {
                            file_meta.keys()[i]:
                            file_meta[file_meta.keys()[i]]
                        }
                    })
                if status != 200:
                    if self.debug:
                        print "Error inserting:", file_meta.keys()[i]
            # save hdfs meta data
            for i in range(0, len(hdfs_meta.keys())):
                status = t.insert(
                    key, {
                        'meta_data': {
                            hdfs_meta.keys()[i]:
                            hdfs_meta[hdfs_meta.keys()[i]]
                        }
                    })
                if status != 200:
                    if self.debug:
                        print "Error inserting:", hdfs_meta.keys()[i]
            # save version
            status = t.insert(key, {'meta_data': {'version': version}})
            if status != 200:
                if self.debug:
                    print "Error inserting: version"
        except:
            if self.debug:
                print "Update failed."
            return False
        if self.debug:
            print "[Updated]", file, "version:", version
        return True

    def delete(self, file, version=None):
        '''
        This function use to delete file in hbase, and hdfs. You can specify
        file's version in order to delete it.
        :param : file - file's name
        :param : version - file's version
        :return: True if succes otherwise False.
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v', str(version), '.', file])
        path = ''.join([self.hdfs_path, key])
        # Check if file exists
        if self.hbase_table.fetch(key) == None:
            if self.debug:
                print "Cannot delete.", key, "is not exists."
            return False
        # Remove row on hbase
        t = self.hbase_table
        if t.remove(key) != 200:
            if self.debug:
                print "[HBASE] cannot remove a row key:", key
            return False
        # Delete file on hdfs
        if not self.hdfs.delete_file_dir(path):
            if self.debug:
                print "[HDFS] Cannot remove a file path:", path
            return False
        if self.debug:
            print "[Deleted]", file, "version:", version
        return True

    def get_file_meta_data(self, file, version=None):
        '''
        This function use to get all file's meta_data from hbase. You can
        specify a file's version.
        :param : file - file's name
        :param : version - file's version
        :return: meta data as dict for success, 0 if fail
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v', str(version), '.', file])
        if not self.hbase_table.fetch(key):
            if self.debug:
                print key, "is not exists"
            return False
        return self.hbase_table.fetch(key)['meta_data']

    def get_file_content(self, file, version=None):
        '''
        This function use to get all file's content from hbase. You can
        specify a file's version.
        :param : file - file's name
        :param : version - file's version
        :return: meta data as dict for success, 0 if fail
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v', str(version), '.', file])
        if not self.hbase_table.fetch(key):
            if self.debug:
                print key, "is not exists"
            return False
        return self.hbase_table.fetch(key)['file']

    def search(self, text):
        '''
        This function will search in xxxx via solr rest api.
        :param : text - text for searching
        :return: json response from solr, False for not found.
        '''
        # NOTE(review): `text` is interpolated into the URL unescaped;
        # spaces/special characters will break the query (needs urlencode).
        query = urlopen(''.join([self.solr, '/select?q=', text, '&wt=json']))
        response = simplejson.load(query)
        if response['response']['numFound'] == 0:
            if self.debug:
                print text, "not found!"
            return False
        return response

    def get_all_file(self):
        '''
        This function return all files that stored on Hbase in a list format.
        :param : Nothing.
        :return: fetch result as a list.
        '''
        # empty-regex RowFilter matches every row
        rf = '{"type": "RowFilter", "op": "EQUAL", "comparator": {"type": "RegexStringComparator", "value": ""}}'
        t = self.hbase_table
        result = t.fetch_all_rows(with_row_id=True, filter_string=rf)
        return list(result)

    def get_file_version(self, file):
        '''
        This function will fetch data from file name then return them.
        :param : file - file's name
        :return: file_list with version as a dict.
        '''
        rf = ''.join([
            '{"type": "RowFilter", "op": "EQUAL", "comparator": {"type": "RegexStringComparator", "value": "',
            file, '"}}'
        ])
        t = self.hbase_table
        result = t.fetch_all_rows(with_row_id=True, filter_string=rf)
        lsr = list(result)
        file_version = list()
        # row keys look like "v<version>.<filename>"; pull out <version>
        for i in range(0, len(lsr)):
            file_version.append(lsr[i].keys()[0].split('.')[0].split('v')[1])
        file_list = dict()
        file_list['name'] = file
        file_list['version'] = file_version
        return file_list

    def get_lastest_version(self, file):
        '''
        This function will return a lastest version number as integer.
        :param : file - file's name
        :return: version number as an integer.
        '''
        # NOTE(review): versions are strings, so .sort() is lexicographic —
        # "10" sorts before "9"; breaks past version 9.
        file_version = self.get_file_version(file)
        file_version['version'].sort()
        return file_version['version'][len(file_version['version']) - 1]

    def delete_all_version(self, file):
        '''
        This function will delete all file's version in an hbase and HDFS
        :param : file - file's name
        :return: True if success otherwise False
        '''
        # NOTE(review): this sorts a temporary list returned by
        # get_file_version() — the sort has no effect on the loop below.
        self.get_file_version(file)['version'].sort()
        for version in self.get_file_version(file)['version']:
            try:
                self.delete(file, version)
            except:
                return False
        return True

    def delete_all(self):
        '''
        This function will delete all the files on an hbase and hdfs.
        :param : Nothing
        :return: True if success otherwise False
        '''
        for full_file in self.get_all_file():
            file = full_file.keys()[0].split('.')[1]
            version = full_file.keys()[0].split('.')[0].split('v')[1]
            try:
                self.delete(file, version)
            except:
                return False
        return True
HADOOP_TEXTFILE = 'user/root/texts/' + str(ANET) + '/' + str( BNET) + '/' + TEXTFILE print "-======= site: " + str(url) + " =======-" try: soup = BeautifulSoup(html) except: print " soup exception" continue HFP = open(HTMLFILE, 'w') HFP.write(soup.encode('utf-8')) HFP.close() with open(HTMLFILE) as hfp: try: client.create_file(HADOOP_HTMLFILE, hfp) except: client.delete_file_dir(HADOOP_HTMLFILE) client.create_file(HADOOP_HTMLFILE, hfp) TFP = open(TEXTFILE, 'w') WRITEOUT = unicode(soup.get_text()) WORDLIST = re.sub(r'[^a-zA-Z0-9 ]', r' ', WRITEOUT) WORDLIST = WORDLIST.strip().split() TFP.write(WRITEOUT.encode('utf-8')) TFP.close() PAGETITLE = '' try: PAGETITLE = soup.title.string except: pass try: PAGETITLE = PAGETITLE.encode('utf-8')
line = line.values() line[0], line[5] = line[5], line[0] return ', '.join(line) if __name__ == '__main__': host = 'hdfs://localhost:9000' ticker_path = host + '/user/hadoop/tickers.txt' save_path = host + '/user/hadoop/stock' hdfs = PyWebHdfsClient(host='localhost', port='50070', user_name='hadoop') folder = hdfs.list_dir('user/hadoop/stock')['FileStatuses']['FileStatus'] files = sorted([dt.datetime.strptime(f['pathSuffix'].split('.')[0], '%Y-%m-%d').date() for f in folder]) end = dt.date.today().strftime('%Y-%m-%d') sc = SparkContext(appName='stock_data') if len(files) > 3: hdfs.delete_file_dir(join(save_path, files[0].strftime('%Y-%m-%d') + '.csv'), recursive=True) if len(files) == 0: start = '2014-01-01' stockData = sc.textFile(ticker_path).flatMap(lambda x: Share(x).get_historical(start, end)).map(formatLine) stockData.saveAsTextFile(join(save_path, end + '.csv')) else: start = (files[-1] + dt.timedelta(days=1)).strftime('%Y-%m-%d') histStockData = sc.textFile(join(save_path, files[-1].strftime('%Y-%m-%d') + '.csv')) stockData = sc.textFile(ticker_path).flatMap(lambda x: Share(x).get_historical(start, end)).map(formatLine) histStockData.union(stockData).saveAsTextFile(join(save_path, end + '.csv'))
HADOOP_HTMLFILE='user/root/crawls/'+str(ANET)+'/'+str(BNET)+'/'+HTMLFILE HADOOP_TEXTFILE='user/root/texts/'+str(ANET)+'/'+str(BNET)+'/'+TEXTFILE print "-======= site: "+str(url)+" =======-" try: soup = BeautifulSoup(html) except: print " soup exception" continue HFP=open(HTMLFILE,'w') HFP.write(soup.encode('utf-8')) HFP.close() with open(HTMLFILE) as hfp: try: client.create_file(HADOOP_HTMLFILE,hfp) except: client.delete_file_dir(HADOOP_HTMLFILE) client.create_file(HADOOP_HTMLFILE,hfp) TFP=open(TEXTFILE,'w') WRITEOUT=unicode(soup.get_text()) WORDLIST=re.sub(r'[^a-zA-Z0-9 ]',r' ',WRITEOUT) WORDLIST=WORDLIST.strip().split() TFP.write(WRITEOUT.encode('utf-8')) TFP.close() PAGETITLE='' try: PAGETITLE=soup.title.string except: pass try: PAGETITLE=PAGETITLE.encode('utf-8')
# checksum reflects file changes file_checksum = hdfs.get_file_checksum(example_file) print(file_checksum) # read in the data for the file print('reading data from file at: {0}\n'.format(example_file)) file_data = hdfs.read_file(example_file) print(file_data) # rename the example_dir print('renaming directory from {0} to {1}\n').format(example_dir, rename_dir) hdfs.rename_file_dir(example_dir, '/{0}'.format(rename_dir)) # list the contents of the new directory listdir_stats = hdfs.list_dir(rename_dir) print(listdir_stats) example_file = '{dir}/example.txt'.format(dir=rename_dir) # delete the example file print('deleting example file at: {0}'.format(example_file)) hdfs.delete_file_dir(example_file) # list the contents of the directory listdir_stats = hdfs.list_dir(rename_dir) print(listdir_stats) # delete the example directory print('deleting the example directory at: {0}'.format(rename_dir)) hdfs.delete_file_dir(rename_dir, recursive='true')
class DMS: def __init__(self, debug=0): ''' This function use to init a class. To show an error messages debug should be 1. :param : debug - 1, show an error or success message. 0 otherwise :return: Nothing. ''' self.debug = debug pass def hbase_connection(self, host, port, table='dms'): ''' This function use to establish a connection to hbase, for preparing to insert, remove, fetch data from hbase. We use starbase for connect to hbase via rest api.(See more: https://github.com/barseghyanartur/starbase) :param : host - hbase rest host :param : port - hbase rest running port :param : table - DMS table on hbase (default: 'dms') :return: Nothing. ''' self.hbase = hbaseConnection(host=host, port=port) t = self.hbase.table(table) if (not t.exists()): t.create('meta_data','file') self.hbase_table = t def hdfs_connection(self, host, port, user_name, hdfs_path='/tmp/'): ''' This function use to establish a connection to hdfs, for preparing to create, retrieve, update, delete file in hdfs. We use pywebhdfs in order to do this task via hdfs rest api.(See more: http://pythonhosted.org/pywebhdfs/) :param : host - hdfs rest host :param : port - hdfs rest running port :param : user_name - hdfs username (for authentication) :param : hdfs_path - location to store files. (default: '/tmp/') :return: Nothing. ''' self.hdfs = PyWebHdfsClient(host=host, port=port, user_name=user_name) self.hdfs_path = hdfs_path def solr_connection(self, host, port, collection): ''' This function use to establish a connection to solr, for query or search any text on a system. :param : host - solr's host :param : port - solr's running port :param : collection - solr's collection for searching ''' self.solr = ''.join(['http://',host,':',port,'/solr/',collection]) def extract(self, file): ''' This function use to extract meta data from a file. We use hachoir3 library to extract them. 
(See more: http://hachoir3.readthedocs.org) :param : file - file for extract :return: meta data as dict for success, 0 if fail. ''' try: filename, realname = unicodeFilename(file), file parser = createParser(filename, realname) meta_data = extractMetadata(parser) meta_data_text = meta_data.exportPlaintext() meta_list = dict() for i in range(1,len(meta_data_text)): meta_split = meta_data_text[i].split(":") column = meta_split[0].replace('- ','') value = meta_split[1].lstrip() meta_list.update({column:value}) return meta_list except: if self.debug: print "Something went wrong, meta data of",file,"could not extract." return None def upload(self, file): ''' This function use to uplaod a file to hdfs and store meta data on hbase Meta data consist of 2 main parts: file's meta data and hdfs's file's meta data. This function will increase a file version if it is already store in hbase. :param : file - file's name :return: True if success otherwise False. ''' version = 1 key = ''.join(['v',str(version),'.',file]) path = ''.join([self.hdfs_path,key]) # Read a file try: f = open(file,'r') file_content = f.read() f.close() except: print "Cannot read file:",file # Check file's version while self.hbase_table.fetch(key) != None: version = int(self.get_lastest_version(file)) + 1 key = ''.join(['v',str(version),'.',file]) path = ''.join([self.hdfs_path,key]) # Try to upload file. 
try: self.hdfs.create_file(path,file_content) hdfs_meta = self.hdfs.get_file_dir_status(path)['FileStatus'] file_meta = self.extract(file) t = self.hbase_table status = t.insert( key, { 'file': {'content': file_content} } ) if status != 200: if self.debug: print "Error inserting: file content" # save hbase meta data for i in range(0,len(file_meta.keys())): status = t.insert( key, { 'meta_data': {file_meta.keys()[i]: file_meta[file_meta.keys()[i]]} } ) if status != 200: if self.debug: print "Error inserting:", file_meta.keys()[i] # save hdfs meta data for i in range(0,len(hdfs_meta.keys())): status = t.insert( key, { 'meta_data': {hdfs_meta.keys()[i]: hdfs_meta[hdfs_meta.keys()[i]]} } ) if status != 200: if self.debug: print "Error inserting:", hdfs_meta.keys()[i] # save version status = t.insert( key, { 'meta_data': {'version': version} } ) if status != 200: if self.debug: print "Error inserting: version" except: if self.debug: print "Upload failed." return False if self.debug: print "[Uploaded]", file, "version:", version return True def download(self, file, version=None, download_dir=''): ''' This function use to retrieve or download file from hdfs. Then save it as a new file named (v[version].[file] - For example, v1.mytext.txt). You can specify the directory of downloaded file. You can also specify file's version for downloading if not it will be version 1. :param : file - file's name :param : version - file's version (default: 1) :param : download_dir - download directory (default: '' or current directory NOTE: it must end with '/' - For example, '../download/') :return: True if success otherwise false. 
''' if not version: version = self.get_lastest_version(file) key = ''.join(['v',str(version),'.',file]) path = ''.join([self.hdfs_path,key]) downloaded_file = ''.join([download_dir,key]) try: f = open(downloaded_file, 'w') f.write(self.hdfs.read_file(path)) f.close() except: if self.debug: print "Cannot download a file:", file return False if self.debug: print "[Downloaded]",key return True def update(self, file, version=None): ''' This function use to update file to hdfs and data stored in hbase by overwrite that file on hdfs, and also insert new data to hbase too. You can specify a file's version in order to update it. :param : file - file's name :param : version - file's version :return: True if success otherwise False. ''' if not version: version = self.get_lastest_version(file) key = ''.join(['v',str(version),'.',file]) path = ''.join([self.hdfs_path,key]) # Read a file try: f = open(file,'r') file_content = f.read() f.close() except: print "Cannot read file:",file # Try to upload file. try: self.hdfs.create_file(path,file,overwrite=True) hdfs_meta = self.hdfs.get_file_dir_status(path)['FileStatus'] file_meta = self.extract(file) status = t.insert( key, { 'file': {'content': file_content, 'name': file} } ) if status != 200: if self.debug: print "Error inserting: file content" # save hbase meta data for i in range(0,len(file_meta.keys())): status = t.insert( key, { 'meta_data': {file_meta.keys()[i]: file_meta[file_meta.keys()[i]]} } ) if status != 200: if self.debug: print "Error inserting:", file_meta.keys()[i] # save hdfs meta data for i in range(0,len(hdfs_meta.keys())): status = t.insert( key, { 'meta_data': {hdfs_meta.keys()[i]: hdfs_meta[hdfs_meta.keys()[i]]} } ) if status != 200: if self.debug: print "Error inserting:", hdfs_meta.keys()[i] # save version status = t.insert( key, { 'meta_data': {'version': version} } ) if status != 200: if self.debug: print "Error inserting: version" except: if self.debug: print "Update failed." 
return False if self.debug: print "[Updated]", file, "version:", version return True def delete(self, file, version=None): ''' This function use to delete file in hbase, and hdfs. You can specify file's version in order to delete it. :param : file - file's name :param : version - file's version :return: True if succes otherwise False. ''' if not version: version = self.get_lastest_version(file) key = ''.join(['v',str(version),'.',file]) path = ''.join([self.hdfs_path,key]) # Check if file exists if self.hbase_table.fetch(key) == None: if self.debug: print "Cannot delete.",key,"is not exists." return False # Remove row on hbase t = self.hbase_table if t.remove(key) != 200: if self.debug: print "[HBASE] cannot remove a row key:",key return False # Delete file on hdfs if not self.hdfs.delete_file_dir(path): if self.debug: print "[HDFS] Cannot remove a file path:",path return False if self.debug: print "[Deleted]", file, "version:", version return True def get_file_meta_data(self, file, version=None): ''' This function use to get all file's meta_data from hbase. You can specify a file's version. :param : file - file's name :param : version - file's version :return: meta data as dict for success, 0 if fail ''' if not version: version = self.get_lastest_version(file) key = ''.join(['v',str(version),'.',file]) if not self.hbase_table.fetch(key): if self.debug: print key,"is not exists" return False return self.hbase_table.fetch(key)['meta_data'] def get_file_content(self, file, version=None): ''' This function use to get all file's content from hbase. You can specify a file's version. 
:param : file - file's name :param : version - file's version :return: meta data as dict for success, 0 if fail ''' if not version: version = self.get_lastest_version(file) key = ''.join(['v',str(version),'.',file]) if not self.hbase_table.fetch(key): if self.debug: print key,"is not exists" return False return self.hbase_table.fetch(key)['file'] def search(self, text): ''' This function will search in xxxx via solr rest api. :param : text - text for searching :return: json response from solr, False for not found. ''' query = urlopen(''.join([self.solr,'/select?q=',text,'&wt=json'])) response = simplejson.load(query) if response['response']['numFound'] == 0: if self.debug: print text,"not found!" return False return response def get_all_file(self): ''' This function return all files that stored on Hbase in a list format. :param : Nothing. :return: fetch result as a list. ''' rf = '{"type": "RowFilter", "op": "EQUAL", "comparator": {"type": "RegexStringComparator", "value": ""}}' t = self.hbase_table result = t.fetch_all_rows(with_row_id=True, filter_string=rf) return list(result) def get_file_version(self, file): ''' This function will fetch data from file name then return them. :param : file - file's name :return: file_list with version as a dict. ''' rf = ''.join(['{"type": "RowFilter", "op": "EQUAL", "comparator": {"type": "RegexStringComparator", "value": "',file,'"}}']) t = self.hbase_table result = t.fetch_all_rows(with_row_id=True, filter_string=rf) lsr = list(result) file_version = list() for i in range(0,len(lsr)): file_version.append(lsr[i].keys()[0].split('.')[0].split('v')[1]) file_list = dict() file_list['name'] = file file_list['version'] = file_version return file_list def get_lastest_version(self, file): ''' This function will return a lastest version number as integer. :param : file - file's name :return: version number as an integer. 
''' file_version = self.get_file_version(file) file_version['version'].sort() return file_version['version'][len(file_version['version'])-1] def delete_all_version(self, file): ''' This function will delete all file's version in an hbase and HDFS :param : file - file's name :return: True if success otherwise False ''' self.get_file_version(file)['version'].sort() for version in self.get_file_version(file)['version']: try: self.delete(file,version) except: return False return True def delete_all(self): ''' This function will delete all the files on an hbase and hdfs. :param : Nothing :return: True if success otherwise False ''' for full_file in self.get_all_file(): file = full_file.keys()[0].split('.')[1] version = full_file.keys()[0].split('.')[0].split('v')[1] try: self.delete(file,version) except: return False return True
class HdfsApi:
    """Smoke-test helper around an HDFS namenode.

    Combines a pywebhdfs client (file create/read/delete) with raw HTTP
    calls to the namenode (fsck) to locate the datanode that holds the
    first block of a file, and MD5 helpers to verify round-trips.
    """

    def __init__(self, request_timeout=10, logger=logging,
                 active_nn_host='localhost', kerberos=False):
        # Timeout (seconds) applied to raw namenode requests.
        self.timeout = request_timeout
        # Schema/port come from the environment with sensible defaults.
        self.hdfs_schema = os.environ.get('HDFS_NAMENODE_SCHEMA', 'http')
        self.hdfs_host = active_nn_host
        self.hdfs_port = os.environ.get('HDFS_NAMENODE_PORT', 50070)
        if kerberos:
            # Preemptive SPNEGO auth for the webhdfs client.
            extra_opts = {
                'auth': HTTPKerberosAuth(
                    mutual_authentication=OPTIONAL,
                    sanitize_mutual_error_response=False,
                    force_preemptive=True)
            }
        else:
            extra_opts = {}
        self.webhdfs = PyWebHdfsClient(host=self.hdfs_host,
                                       port=self.hdfs_port,
                                       request_extra_opts=extra_opts)
        self.logger = logger

    def request_namenode(self, path, method='GET', headers=None, **kwargs):
        """Issue a raw HTTP request against the namenode.

        *path* may be a full http(s) URL or a path relative to the
        namenode root.  Returns the response or raises HdfsRequestError.
        """
        self.logger.info("Calling HDFS API ({0})".format(path))
        if headers is None:
            headers = dict()
        if path.startswith('http'):
            hdfs_url = path
        else:
            hdfs_url = '{0}://{1}:{2}/{3}'.format(self.hdfs_schema,
                                                  self.hdfs_host,
                                                  self.hdfs_port, path)
        self.logger.debug(hdfs_url)
        # NOTE(review): Kerberos auth is attached unconditionally here,
        # even when __init__ was called with kerberos=False -- confirm
        # whether that is intended.
        r = requests.request(method, hdfs_url, headers=headers,
                             timeout=self.timeout, verify=False,
                             auth=HTTPKerberosAuth(), **kwargs)
        return self._check_response_status(r)

    def request_webhdfs_status(self, path):
        """Return the WebHDFS FileStatus for *path*."""
        return self.webhdfs.get_file_dir_status(path)

    def _check_response_status(self, response):
        """Raise HdfsRequestError for HTTP status >= 400, else pass through."""
        self.logger.debug(response.text)
        if response.status_code >= 400:
            self.logger.error(
                "HdfsResponse returned with error status [{0}], response was: {1}"
                .format(response.status_code, response.text))
            raise HdfsRequestError(
                "HdfsResponse returned with error status [{0}]".format(
                    response.status_code))
        return response

    def get_block_info_for_file(self, file_path):
        # fsck with racks=1 includes per-block datanode locations.
        path = "fsck"
        params = {'files': 0, 'racks': 1, 'blocks': 0, 'path': file_path}
        response = self.request_namenode(path, params=params)
        return response

    @staticmethod
    def get_first_block_info(filename, block_info):
        """Return the fsck output line describing the first block.

        The line immediately after the filename line in fsck output
        describes block 0.  Raises HdfsRequestError if not found.
        """
        regex = r"^{0}.*\n(.*)\n".format(filename)
        info_of_first_block = re.findall(regex, block_info, re.MULTILINE)
        if len(info_of_first_block) < 1:
            raise HdfsRequestError(
                "No block information found for file {0} in {1}".format(
                    filename, block_info))
        return info_of_first_block[0]

    @staticmethod
    def get_location_of_first_block(block_info):
        """Extract the first "ip:" replica address from a block-info line."""
        # Lookbehind skips dash-prefixed tokens; lookahead requires the
        # ":port" suffix without capturing it.
        ip_regex = r"(?<!\-)(\d{2,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})(?=:)"
        block_locations = re.findall(ip_regex, block_info)
        if len(block_locations) < 1:
            raise HdfsRequestError(
                "No block location information found in {0}".format(
                    block_info))
        return block_locations[0]

    @staticmethod
    def get_host_by_ip(ip):
        """Reverse-resolve *ip* to a hostname via DNS."""
        host_info = socket.gethostbyaddr(ip)
        if len(host_info) < 1:
            raise HdfsRequestError(
                "Unable to get hostname form ip {0}".format(ip))
        return host_info[0]

    @staticmethod
    def calculate_md5(file, block_size=65536):
        """MD5 hex digest of a file object, read in chunks; rewinds it."""
        hash_builder = hashlib.md5()
        for block in iter(lambda: file.read(block_size), b""):
            hash_builder.update(block)
        md5 = hash_builder.hexdigest()
        # Leave the file positioned at the start for the next reader.
        file.seek(0)
        return md5

    @staticmethod
    def create_temp_file():
        return tempfile.NamedTemporaryFile(
            suffix='.temporary', prefix='hdfs-smoketest-api-')

    def create_temp_file_of_size(self, temp_file_size):
        """Temp file of temp_file_size MB (sparse except the final byte)."""
        tmp = self.create_temp_file()
        tmp.seek(temp_file_size * 1024 * 1024)
        tmp.write(b'1')
        tmp.seek(0)
        return tmp

    def copy_to_hdfs(self, remote_path, tmpfile):
        self.webhdfs.create_file(remote_path, file_data=tmpfile,
                                 overwrite=True)

    def create_hdfs_file_of_size_in_mb(self, path, size=300):
        """Create a size-MB file in HDFS; return the MD5 of what was sent."""
        with self.create_temp_file_of_size(size) as tmp_file:
            md5_of_tmp_file = self.calculate_md5(tmp_file)
            self.copy_to_hdfs(path, tmp_file)
            return md5_of_tmp_file

    def get_remote_file(self, path):
        return self.webhdfs.read_file(path)

    def write_remote_file_to_local_temp(self, remote_path):
        """Download a remote file into a local temp file, rewound to 0.

        NOTE(review): read_file appears to return the whole file in
        memory -- confirm size limits before using on large files.
        """
        local = self.create_temp_file()
        file = self.get_remote_file(remote_path)
        local.write(file)
        local.seek(0)
        return local

    def get_hdfsfile_and_calc_md5(self, path):
        """MD5 hex digest of a remote file's contents."""
        with self.write_remote_file_to_local_temp(path) as temp_file:
            return self.calculate_md5(temp_file)

    def cleanup_remote_file(self, path, recursive=False):
        self.webhdfs.delete_file_dir(path, recursive=recursive)

    def get_host_location_of_first_block(self, filename):
        """Hostname of the datanode holding the first block of *filename*."""
        file_block_info = self.get_block_info_for_file(filename)
        file_first_block_info = self.get_first_block_info(
            filename, file_block_info.text)
        file_block_ip = self.get_location_of_first_block(
            file_first_block_info)
        return self.get_host_by_ip(file_block_ip)
file_status = hdfs.get_file_dir_status(example_file) print file_status # read in the data for the file print('reading data from file at: {0}\n'.format(example_file)) file_data = hdfs.read_file(example_file) print file_data # rename the example_dir print('renaming directory from {0} to {1}\n').format(example_dir, rename_dir) hdfs.rename_file_dir(example_dir, '/{0}'.format(rename_dir)) # list the contents of the new directory listdir_stats = hdfs.list_dir(rename_dir) print listdir_stats example_file = '{dir}/example.txt'.format(dir=rename_dir) # delete the example file print('deleting example file at: {0}'.format(example_file)) hdfs.delete_file_dir(example_file) # list the contents of the directory listdir_stats = hdfs.list_dir(rename_dir) print listdir_stats # delete the example directory print('deleting the example directory at: {0}'.format(rename_dir)) hdfs.delete_file_dir(rename_dir, recursive='true')
class HDFS(object):
    '''
    Convenience wrapper around PyWebHdfsClient: recursive upload of a
    local tree, single-file create/append/read, chunked download, and
    delete.  All remote paths are passed through canonicalize().
    '''

    def __init__(self, host, port, user):
        self._hdfs = PyWebHdfsClient(
            host=host, port=port, user_name=user, timeout=None)
        logging.debug('webhdfs = %s@%s:%s', user, host, port)

    def recursive_copy(self, local_path, remote_path, exclude=None):
        '''
        Mirror the tree under local_path into remote_path, skipping any
        directory or file whose basename is in *exclude*.
        '''
        if exclude is None:
            exclude = []
        c_path = canonicalize(remote_path)
        logging.debug('making %s', c_path)
        self._hdfs.make_dir(c_path)
        fs_g = os.walk(local_path)
        for dpath, dnames, fnames in fs_g:
            # BUG FIX: split(local_path) without a maxsplit raises
            # "too many values to unpack" when the prefix text recurs
            # inside the walked path; split only on the first occurrence.
            _, relative_path = dpath.split(local_path, 1)
            for dname in dnames:
                if dname not in exclude:
                    c_path = canonicalize(
                        '%s/%s/%s' % (remote_path, relative_path, dname))
                    logging.debug('making %s', c_path)
                    self._hdfs.make_dir(c_path)
            for fname in fnames:
                if fname not in exclude:
                    # BUG FIX: use open() in a with-block instead of the
                    # Python-2-only file() builtin, so the handle is
                    # closed even if create_file raises.
                    local_file = canonicalize(
                        '%s/%s/%s' % (local_path, relative_path, fname))
                    c_path = canonicalize(
                        '%s/%s/%s' % (remote_path, relative_path, fname))
                    logging.debug('creating %s', c_path)
                    with open(local_file, 'rb') as data:
                        self._hdfs.create_file(c_path, data,
                                               overwrite=True)

    def make_dir(self, path):
        '''Create a remote directory (and parents).'''
        logging.debug('make_dir: %s', path)
        self._hdfs.make_dir(canonicalize(path))

    def create_file(self, data, remote_file_path):
        '''Write *data* (a byte string) to remote_file_path, overwriting.'''
        logging.debug('create_file: %s', remote_file_path)
        sio = StringIO.StringIO(data)
        self._hdfs.create_file(
            canonicalize(remote_file_path), sio, overwrite=True)

    def append_file(self, data, remote_file_path):
        '''Append *data* to an existing remote file.'''
        logging.debug('append to: %s', remote_file_path)
        self._hdfs.append_file(canonicalize(remote_file_path), data)

    def stream_file_to_disk(self, remote_file_path, local_file_path):
        '''Download a remote file to local_file_path in 10 MiB chunks.'''
        chunk_size = 10 * 1024 * 1024
        offset = 0
        with open(local_file_path, 'wb') as dest_file:
            data = self._hdfs.read_file(canonicalize(remote_file_path),
                                        offset=offset, length=chunk_size)
            while True:
                dest_file.write(data)
                # A short chunk means we just wrote the file's tail.
                if len(data) < chunk_size:
                    break
                offset += chunk_size
                data = self._hdfs.read_file(
                    canonicalize(remote_file_path),
                    offset=offset, length=chunk_size)

    def read_file(self, remote_file_path):
        '''Return the full contents of a remote file.'''
        data = self._hdfs.read_file(canonicalize(remote_file_path))
        return data

    def remove(self, path, recursive=False):
        '''Delete a remote file, or a directory when recursive=True.'''
        logging.debug('remove: %s', path)
        self._hdfs.delete_file_dir(canonicalize(path), recursive)