def create_data_from_station_data(first, second):
    """This function creates the data analyzing the two stations in comparison."""
    global hdfs   # global hdfs object
    global hbase  # global hbase object

    if hdfs is None:
        from pywebhdfs.webhdfs import PyWebHdfsClient
        hdfs = PyWebHdfsClient(host='cshadoop.boisestate.edu', port='50070',
                               user_name='uacharya')
    if hbase is None:
        import happybase
        hbase = happybase.ConnectionPool(size=1, host='cshadoop.boisestate.edu')

    date_for_comparision = first["Date"].strip()

    # creating directory for each date
    try:
        hdfs.get_file_dir_status('user/uacharya/simulation/' + date_for_comparision)
    except Exception:
        # directory to hold dataset in csv file for each node in wall display starting from 1 to 9
        for index in range(1, 10):
            content = 'Date,ID,Source,Destination,S_Lat,S_Lon,D_Lat,D_Lon,Wind_Lat,Wind_Lon,Wind_Velocity\n'
            try:
                hdfs.create_file('user/uacharya/simulation/' + date_for_comparision
                                 + '/node' + str(index) + '/output.csv',
                                 content, replication=1)
            except Exception:
                continue

    dataset = {'node_1': [], 'node_2': [], 'node_3': [], 'node_4': [], 'node_5': [],
               'node_6': [], 'node_7': [], 'node_8': [], 'node_9': []}

    for data in broadcast_variable.value:
        compare_data_between(date_for_comparision, first, data, dataset)

    # for key in dataset:
    #     if len(dataset[key]) != 0:
    #         content = "\n".join(dataset[key])
    #         content += "\n"
    #         while True:
    #             try:
    #                 hdfs.append_file('user/uacharya/simulation/' + date + '/' + key + '/output.csv',
    #                                  content, buffersize=4096)
    #                 break
    #             except Exception:
    #                 time.sleep(0.2)
    #                 continue

    dataset.clear()  # clearing the dictionary
    # append over here after all the global variable has been made
    return second

class HDFS(object):
    def __init__(self, host, port, user, logger):
        self._hdfs = PyWebHdfsClient(
            host=host, port=port, user_name=user, timeout=None)
        global LOGGER
        LOGGER = logger
        LOGGER.debug('webhdfs = %s@%s:%s', user, host, port)

    def file_exists(self, path):
        try:
            self._hdfs.get_file_dir_status(path)
            return True
        except:
            return False

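# --- Usage sketch (not part of the original snippet) ---
# A minimal, hedged example of how the HDFS wrapper above might be driven;
# the host/port/user values and the logger name are assumptions.
import logging

logging.basicConfig(level=logging.DEBUG)
hdfs_wrapper = HDFS(host='localhost', port='50070', user='hduser',
                    logger=logging.getLogger('hdfs'))
if hdfs_wrapper.file_exists('user/hduser/example_dir'):
    print('path already exists')
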
def load(self, job, task, fifo):
    self.job = job
    self.task = task
    self.fifo = fifo
    self.key = None
    self.script_proc = None
    self.decompress_obj = None
    self.pycurl_callback_exception = None

    if task.data['scheme'] == 's3':
        self.is_anonymous = job.spec.source.aws_access_key is None \
            or job.spec.source.aws_secret_key is None
        if self.is_anonymous:
            s3_conn = S3Connection(anon=True)
        else:
            s3_conn = S3Connection(job.spec.source.aws_access_key,
                                   job.spec.source.aws_secret_key)
        bucket = s3_conn.get_bucket(task.data['bucket'])
        try:
            self.key = bucket.get_key(task.data['key_name'])
        except S3ResponseError as e:
            raise WorkerException(
                "Received %s %s accessing `%s`, aborting" %
                (e.status, e.reason, task.data['key_name']))
    elif task.data['scheme'] == 'hdfs':
        fname = task.data['key_name']
        client = PyWebHdfsClient(job.spec.source.hdfs_host,
                                 job.spec.source.webhdfs_port,
                                 user_name=job.spec.source.hdfs_user)
        try:
            filesize = client.get_file_dir_status(fname)['FileStatus']['length']
        except pywebhdfs.errors.FileNotFound:
            raise WorkerException("File '%s' does not exist on HDFS" % fname)
        self.key = AttrDict({'name': fname, 'size': filesize})
    elif task.data['scheme'] == 'file':
        globber = glob2.Globber()
        fname = globber._normalize_string(task.data['key_name'])
        if not os.path.exists(fname):
            raise WorkerException("File '%s' does not exist on this filesystem" % fname)
        elif not os.path.isfile(fname):
            raise WorkerException("File '%s' exists, but is not a file" % fname)
        self.key = AttrDict({'name': fname, 'size': os.path.getsize(fname)})
    else:
        raise WorkerException('Unsupported job with paths: %s' %
                              [str(p) for p in self.job.paths])

    if self.key is None:
        raise WorkerException(
            'Failed to find key associated with task ID %s' % task.task_id)

    self.metrics = DownloadMetrics(self.key.size)

class WhenTestingGetFileStatusOperation(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs/old_dir'
        self.response = MagicMock()
        self.file_status = {
            "FileStatus": {
                "accessTime": 0,
                "blockSize": 0,
                "group": "supergroup",
                "length": 0,
                "modificationTime": 1320173277227,
                "owner": "webuser",
                "pathSuffix": "",
                "permission": "777",
                "replication": 0,
                "type": "DIRECTORY"
            }
        }
        self.response.json = MagicMock(return_value=self.file_status)

    def test_get_status_throws_exception_for_not_ok(self):
        self.response.status_code = httplib.BAD_REQUEST
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.get_file_dir_status(self.path)

    def test_get_status_returns_true(self):
        self.response.status_code = httplib.OK
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.get_file_dir_status(self.path)

        for key in result:
            self.assertEqual(result[key], self.file_status[key])

class WhenTestingGetFileStatusOperation(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs/old_dir'
        self.response = MagicMock()
        self.file_status = {
            "FileStatus": {
                "accessTime": 0,
                "blockSize": 0,
                "group": "supergroup",
                "length": 0,
                "modificationTime": 1320173277227,
                "owner": "webuser",
                "pathSuffix": "",
                "permission": "777",
                "replication": 0,
                "type": "DIRECTORY"
            }
        }
        self.response.json = MagicMock(return_value=self.file_status)

    def test_get_status_throws_exception_for_not_ok(self):
        self.response.status_code = http_client.BAD_REQUEST
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.get_file_dir_status(self.path)

    def test_get_status_returns_true(self):
        self.response.status_code = http_client.OK
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.get_file_dir_status(self.path)

        for key in result:
            self.assertEqual(result[key], self.file_status[key])

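# --- Test-runner sketch (not part of the original snippets) ---
# The two test cases above only differ in where the HTTP status constants come
# from (Python 2's `httplib` vs `six.moves.http_client`).  A hedged guess at
# the imports they rely on, plus the usual unittest entry point; the exact
# import locations are assumptions, not confirmed by the snippets themselves.
import unittest

from mock import MagicMock, patch       # assumption: the `mock` backport
from six.moves import http_client       # assumption: six is available
from pywebhdfs import errors
from pywebhdfs.webhdfs import PyWebHdfsClient

if __name__ == '__main__':
    unittest.main()
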
class DMS: def __init__(self, debug=0): ''' This function use to init a class. To show an error messages debug should be 1. :param : debug - 1, show an error or success message. 0 otherwise :return: Nothing. ''' self.debug = debug pass def hbase_connection(self, host, port, table='dms'): ''' This function use to establish a connection to hbase, for preparing to insert, remove, fetch data from hbase. We use starbase for connect to hbase via rest api.(See more: https://github.com/barseghyanartur/starbase) :param : host - hbase rest host :param : port - hbase rest running port :param : table - DMS table on hbase (default: 'dms') :return: Nothing. ''' self.hbase = hbaseConnection(host=host, port=port) t = self.hbase.table(table) if (not t.exists()): t.create('meta_data','file') self.hbase_table = t def hdfs_connection(self, host, port, user_name, hdfs_path='/tmp/'): ''' This function use to establish a connection to hdfs, for preparing to create, retrieve, update, delete file in hdfs. We use pywebhdfs in order to do this task via hdfs rest api.(See more: http://pythonhosted.org/pywebhdfs/) :param : host - hdfs rest host :param : port - hdfs rest running port :param : user_name - hdfs username (for authentication) :param : hdfs_path - location to store files. (default: '/tmp/') :return: Nothing. ''' self.hdfs = PyWebHdfsClient(host=host, port=port, user_name=user_name) self.hdfs_path = hdfs_path def solr_connection(self, host, port, collection): ''' This function use to establish a connection to solr, for query or search any text on a system. :param : host - solr's host :param : port - solr's running port :param : collection - solr's collection for searching ''' self.solr = ''.join(['http://',host,':',port,'/solr/',collection]) def extract(self, file): ''' This function use to extract meta data from a file. We use hachoir3 library to extract them. (See more: http://hachoir3.readthedocs.org) :param : file - file for extract :return: meta data as dict for success, 0 if fail. ''' try: filename, realname = unicodeFilename(file), file parser = createParser(filename, realname) meta_data = extractMetadata(parser) meta_data_text = meta_data.exportPlaintext() meta_list = dict() for i in range(1,len(meta_data_text)): meta_split = meta_data_text[i].split(":") column = meta_split[0].replace('- ','') value = meta_split[1].lstrip() meta_list.update({column:value}) return meta_list except: if self.debug: print "Something went wrong, meta data of",file,"could not extract." return None def upload(self, file): ''' This function use to uplaod a file to hdfs and store meta data on hbase Meta data consist of 2 main parts: file's meta data and hdfs's file's meta data. This function will increase a file version if it is already store in hbase. :param : file - file's name :return: True if success otherwise False. ''' version = 1 key = ''.join(['v',str(version),'.',file]) path = ''.join([self.hdfs_path,key]) # Read a file try: f = open(file,'r') file_content = f.read() f.close() except: print "Cannot read file:",file # Check file's version while self.hbase_table.fetch(key) != None: version = int(self.get_lastest_version(file)) + 1 key = ''.join(['v',str(version),'.',file]) path = ''.join([self.hdfs_path,key]) # Try to upload file. 
try: self.hdfs.create_file(path,file_content) hdfs_meta = self.hdfs.get_file_dir_status(path)['FileStatus'] file_meta = self.extract(file) t = self.hbase_table status = t.insert( key, { 'file': {'content': file_content} } ) if status != 200: if self.debug: print "Error inserting: file content" # save hbase meta data for i in range(0,len(file_meta.keys())): status = t.insert( key, { 'meta_data': {file_meta.keys()[i]: file_meta[file_meta.keys()[i]]} } ) if status != 200: if self.debug: print "Error inserting:", file_meta.keys()[i] # save hdfs meta data for i in range(0,len(hdfs_meta.keys())): status = t.insert( key, { 'meta_data': {hdfs_meta.keys()[i]: hdfs_meta[hdfs_meta.keys()[i]]} } ) if status != 200: if self.debug: print "Error inserting:", hdfs_meta.keys()[i] # save version status = t.insert( key, { 'meta_data': {'version': version} } ) if status != 200: if self.debug: print "Error inserting: version" except: if self.debug: print "Upload failed." return False if self.debug: print "[Uploaded]", file, "version:", version return True def download(self, file, version=None, download_dir=''): ''' This function use to retrieve or download file from hdfs. Then save it as a new file named (v[version].[file] - For example, v1.mytext.txt). You can specify the directory of downloaded file. You can also specify file's version for downloading if not it will be version 1. :param : file - file's name :param : version - file's version (default: 1) :param : download_dir - download directory (default: '' or current directory NOTE: it must end with '/' - For example, '../download/') :return: True if success otherwise false. ''' if not version: version = self.get_lastest_version(file) key = ''.join(['v',str(version),'.',file]) path = ''.join([self.hdfs_path,key]) downloaded_file = ''.join([download_dir,key]) try: f = open(downloaded_file, 'w') f.write(self.hdfs.read_file(path)) f.close() except: if self.debug: print "Cannot download a file:", file return False if self.debug: print "[Downloaded]",key return True def update(self, file, version=None): ''' This function use to update file to hdfs and data stored in hbase by overwrite that file on hdfs, and also insert new data to hbase too. You can specify a file's version in order to update it. :param : file - file's name :param : version - file's version :return: True if success otherwise False. ''' if not version: version = self.get_lastest_version(file) key = ''.join(['v',str(version),'.',file]) path = ''.join([self.hdfs_path,key]) # Read a file try: f = open(file,'r') file_content = f.read() f.close() except: print "Cannot read file:",file # Try to upload file. 
try: self.hdfs.create_file(path,file,overwrite=True) hdfs_meta = self.hdfs.get_file_dir_status(path)['FileStatus'] file_meta = self.extract(file) status = t.insert( key, { 'file': {'content': file_content, 'name': file} } ) if status != 200: if self.debug: print "Error inserting: file content" # save hbase meta data for i in range(0,len(file_meta.keys())): status = t.insert( key, { 'meta_data': {file_meta.keys()[i]: file_meta[file_meta.keys()[i]]} } ) if status != 200: if self.debug: print "Error inserting:", file_meta.keys()[i] # save hdfs meta data for i in range(0,len(hdfs_meta.keys())): status = t.insert( key, { 'meta_data': {hdfs_meta.keys()[i]: hdfs_meta[hdfs_meta.keys()[i]]} } ) if status != 200: if self.debug: print "Error inserting:", hdfs_meta.keys()[i] # save version status = t.insert( key, { 'meta_data': {'version': version} } ) if status != 200: if self.debug: print "Error inserting: version" except: if self.debug: print "Update failed." return False if self.debug: print "[Updated]", file, "version:", version return True def delete(self, file, version=None): ''' This function use to delete file in hbase, and hdfs. You can specify file's version in order to delete it. :param : file - file's name :param : version - file's version :return: True if succes otherwise False. ''' if not version: version = self.get_lastest_version(file) key = ''.join(['v',str(version),'.',file]) path = ''.join([self.hdfs_path,key]) # Check if file exists if self.hbase_table.fetch(key) == None: if self.debug: print "Cannot delete.",key,"is not exists." return False # Remove row on hbase t = self.hbase_table if t.remove(key) != 200: if self.debug: print "[HBASE] cannot remove a row key:",key return False # Delete file on hdfs if not self.hdfs.delete_file_dir(path): if self.debug: print "[HDFS] Cannot remove a file path:",path return False if self.debug: print "[Deleted]", file, "version:", version return True def get_file_meta_data(self, file, version=None): ''' This function use to get all file's meta_data from hbase. You can specify a file's version. :param : file - file's name :param : version - file's version :return: meta data as dict for success, 0 if fail ''' if not version: version = self.get_lastest_version(file) key = ''.join(['v',str(version),'.',file]) if not self.hbase_table.fetch(key): if self.debug: print key,"is not exists" return False return self.hbase_table.fetch(key)['meta_data'] def get_file_content(self, file, version=None): ''' This function use to get all file's content from hbase. You can specify a file's version. :param : file - file's name :param : version - file's version :return: meta data as dict for success, 0 if fail ''' if not version: version = self.get_lastest_version(file) key = ''.join(['v',str(version),'.',file]) if not self.hbase_table.fetch(key): if self.debug: print key,"is not exists" return False return self.hbase_table.fetch(key)['file'] def search(self, text): ''' This function will search in xxxx via solr rest api. :param : text - text for searching :return: json response from solr, False for not found. ''' query = urlopen(''.join([self.solr,'/select?q=',text,'&wt=json'])) response = simplejson.load(query) if response['response']['numFound'] == 0: if self.debug: print text,"not found!" return False return response def get_all_file(self): ''' This function return all files that stored on Hbase in a list format. :param : Nothing. :return: fetch result as a list. 
''' rf = '{"type": "RowFilter", "op": "EQUAL", "comparator": {"type": "RegexStringComparator", "value": ""}}' t = self.hbase_table result = t.fetch_all_rows(with_row_id=True, filter_string=rf) return list(result) def get_file_version(self, file): ''' This function will fetch data from file name then return them. :param : file - file's name :return: file_list with version as a dict. ''' rf = ''.join(['{"type": "RowFilter", "op": "EQUAL", "comparator": {"type": "RegexStringComparator", "value": "',file,'"}}']) t = self.hbase_table result = t.fetch_all_rows(with_row_id=True, filter_string=rf) lsr = list(result) file_version = list() for i in range(0,len(lsr)): file_version.append(lsr[i].keys()[0].split('.')[0].split('v')[1]) file_list = dict() file_list['name'] = file file_list['version'] = file_version return file_list def get_lastest_version(self, file): ''' This function will return a lastest version number as integer. :param : file - file's name :return: version number as an integer. ''' file_version = self.get_file_version(file) file_version['version'].sort() return file_version['version'][len(file_version['version'])-1] def delete_all_version(self, file): ''' This function will delete all file's version in an hbase and HDFS :param : file - file's name :return: True if success otherwise False ''' self.get_file_version(file)['version'].sort() for version in self.get_file_version(file)['version']: try: self.delete(file,version) except: return False return True def delete_all(self): ''' This function will delete all the files on an hbase and hdfs. :param : Nothing :return: True if success otherwise False ''' for full_file in self.get_all_file(): file = full_file.keys()[0].split('.')[1] version = full_file.keys()[0].split('.')[0].split('v')[1] try: self.delete(file,version) except: return False return True
class HdfsApi: def __init__(self, request_timeout=10, logger=logging, active_nn_host='localhost', kerberos=False): self.timeout = request_timeout self.hdfs_schema = os.environ.get('HDFS_NAMENODE_SCHEMA', 'http') self.hdfs_host = active_nn_host self.hdfs_port = os.environ.get('HDFS_NAMENODE_PORT', 50070) if kerberos: extra_opts = { 'auth': HTTPKerberosAuth(mutual_authentication=OPTIONAL, sanitize_mutual_error_response=False, force_preemptive=True) } else: extra_opts = {} self.webhdfs = PyWebHdfsClient(host=self.hdfs_host, port=self.hdfs_port, request_extra_opts=extra_opts) self.logger = logger def request_namenode(self, path, method='GET', headers=None, **kwargs): self.logger.info("Calling HDFS API ({0})".format(path)) if headers is None: headers = dict() if path.startswith('http'): hdfs_url = path else: hdfs_url = '{0}://{1}:{2}/{3}'.format(self.hdfs_schema, self.hdfs_host, self.hdfs_port, path) self.logger.debug(hdfs_url) r = requests.request(method, hdfs_url, headers=headers, timeout=self.timeout, verify=False, auth=HTTPKerberosAuth(), **kwargs) return self._check_response_status(r) def request_webhdfs_status(self, path): return self.webhdfs.get_file_dir_status(path) def _check_response_status(self, response): self.logger.debug(response.text) if response.status_code >= 400: self.logger.error( "HdfsResponse returned with error status [{0}], response was: {1}" .format(response.status_code, response.text)) raise HdfsRequestError( "HdfsResponse returned with error status [{0}]".format( response.status_code)) return response def get_block_info_for_file(self, file_path): path = "fsck" params = {'files': 0, 'racks': 1, 'blocks': 0, 'path': file_path} response = self.request_namenode(path, params=params) return response @staticmethod def get_first_block_info(filename, block_info): regex = r"^{0}.*\n(.*)\n".format(filename) info_of_first_block = re.findall(regex, block_info, re.MULTILINE) if len(info_of_first_block) < 1: raise HdfsRequestError( "No block information found for file {0} in {1}".format( filename, block_info)) return info_of_first_block[0] @staticmethod def get_location_of_first_block(block_info): ip_regex = r"(?<!\-)(\d{2,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})(?=:)" block_locations = re.findall(ip_regex, block_info) if len(block_locations) < 1: raise HdfsRequestError( "No block location information found in {0}".format( block_info)) return block_locations[0] @staticmethod def get_host_by_ip(ip): host_info = socket.gethostbyaddr(ip) if len(host_info) < 1: raise HdfsRequestError( "Unable to get hostname form ip {0}".format(ip)) return host_info[0] @staticmethod def calculate_md5(file, block_size=65536): hash_builder = hashlib.md5() for block in iter(lambda: file.read(block_size), b""): hash_builder.update(block) md5 = hash_builder.hexdigest() file.seek(0) return md5 @staticmethod def create_temp_file(): return tempfile.NamedTemporaryFile(suffix='.temporary', prefix='hdfs-smoketest-api-') def create_temp_file_of_size(self, temp_file_size): tmp = self.create_temp_file() tmp.seek(temp_file_size * 1024 * 1024) tmp.write(b'1') tmp.seek(0) return tmp def copy_to_hdfs(self, remote_path, tmpfile): self.webhdfs.create_file(remote_path, file_data=tmpfile, overwrite=True) def create_hdfs_file_of_size_in_mb(self, path, size=300): with self.create_temp_file_of_size(size) as tmp_file: md5_of_tmp_file = self.calculate_md5(tmp_file) self.copy_to_hdfs(path, tmp_file) return md5_of_tmp_file def get_remote_file(self, path): return self.webhdfs.read_file(path) def write_remote_file_to_local_temp(self, 
remote_path): local = self.create_temp_file() file = self.get_remote_file(remote_path) local.write(file) local.seek(0) return local def get_hdfsfile_and_calc_md5(self, path): with self.write_remote_file_to_local_temp(path) as temp_file: return self.calculate_md5(temp_file) def cleanup_remote_file(self, path, recursive=False): self.webhdfs.delete_file_dir(path, recursive=recursive) def get_host_location_of_first_block(self, filename): file_block_info = self.get_block_info_for_file(filename) file_first_block_info = self.get_first_block_info( filename, file_block_info.text) file_block_ip = self.get_location_of_first_block(file_first_block_info) return self.get_host_by_ip(file_block_ip)
#1 imports
from pywebhdfs.webhdfs import PyWebHdfsClient

#2 make connection with hadoop file system
hdfs = PyWebHdfsClient(user_name="hdfs", port=50070, host="sandbox.hortonworks.com")

#3 delete the old csv file if it is already present
hdfs.delete_file_dir('chapter5/LoanStats3d.csv', recursive=True)

#4 recreate the chapters directory
hdfs.make_dir('chapter5')

#5 upload the csv file
with open('./data/stored_csv.csv') as file_data:
    hdfs.create_file('chapter5/LoanStats3d.csv', file_data, overwrite=True)

#6 print the status to see if this succeeded.
print hdfs.get_file_dir_status('chapter5/LoanStats3d.csv')

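# --- Read-back sketch (not part of the original snippet) ---
# A hedged follow-up to the upload above: read the first bytes of the uploaded
# CSV back out of HDFS to eyeball that the content arrived intact.
print hdfs.read_file('chapter5/LoanStats3d.csv', offset=0, length=500)
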
example_dir = 'user/hdfs/example_dir'
example_file = '{dir}/example.txt'.format(dir=example_dir)
example_data = '01010101010101010101010101010101010101010101\n'
rename_dir = 'user/hdfs/example_rename'

# create a new client instance
hdfs = PyWebHdfsClient(host='localhost', port='50070', user_name='hduser')

# create a new directory for the example
print('making new HDFS directory at: {0}\n'.format(example_dir))
hdfs.make_dir(example_dir)

# get a dictionary of the directory's status
dir_status = hdfs.get_file_dir_status(example_dir)
print(dir_status)

# create a new file on hdfs
print('making new file at: {0}\n'.format(example_file))
hdfs.create_file(example_file, example_data)

file_status = hdfs.get_file_dir_status(example_file)
print(file_status)

# get the checksum for the file
file_checksum = hdfs.get_file_checksum(example_file)
print(file_checksum)

# append to the file created in previous step
print('appending to file at: {0}\n'.format(example_file))

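# --- Possible continuation (not part of the original snippet) ---
# The example above stops right before the append; a hedged sketch of how it
# typically continues with standard PyWebHdfsClient calls: append, read back,
# rename the directory, and clean up.  The exact ordering is an assumption.
hdfs.append_file(example_file, example_data)

print('reading file at: {0}\n'.format(example_file))
print(hdfs.read_file(example_file))

print('renaming directory to: {0}\n'.format(rename_dir))
hdfs.rename_file_dir(example_dir, '/{0}'.format(rename_dir))

print('deleting the example directory: {0}\n'.format(rename_dir))
hdfs.delete_file_dir(rename_dir, recursive=True)
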
def create_data_from_station_data(first, second):
    """This function creates the data analyzing the two stations in comparison."""
    global hdfs   # global hdfs object
    global hbase  # global hbase object

    if hdfs is None:
        from pywebhdfs.webhdfs import PyWebHdfsClient
        hdfs = PyWebHdfsClient(host='cshadoop.boisestate.edu', port='50070',
                               user_name='uacharya')
    if hbase is None:
        import happybase
        hbase = happybase.ConnectionPool(size=1, host='cshadoop.boisestate.edu')

    date_for_comparision = first["Date"].strip()

    # creating directory for each date
    try:
        hdfs.get_file_dir_status('user/uacharya/single_screen/' + date_for_comparision)
    except Exception:
        # directory to hold the dataset in a csv file for each node in the wall display
        content = 'Date,ID,Source,Destination,S_Lat,S_Lon,D_Lat,D_Lon,Wind_Lat,Wind_Lon,Wind_Velocity\n'
        try:
            hdfs.create_file('user/uacharya/single_screen/' + date_for_comparision
                             + '/data/output.csv', content, replication=1)
        except Exception:
            pass

    dataset = {'node_1': [], 'node_2': [], 'node_3': []}

    for data in broadcast_variable.value:
        compare_data_between(date_for_comparision, first, data, dataset)

    # for key in dataset:
    #     if len(dataset[key]) != 0:
    #         content = "\n".join(dataset[key])
    #         content += "\n"
    #         while True:
    #             try:
    #                 hdfs.append_file('user/uacharya/simulation/' + date + '/' + key + '/output.csv',
    #                                  content, buffersize=4096)
    #                 break
    #             except Exception:
    #                 time.sleep(0.2)
    #                 continue

    dataset.clear()  # clearing the dictionary
    # append over here after all the global variable has been made
    return second

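# --- Retry helper sketch (not part of the original snippets) ---
# Both variants above carry a commented-out loop that retries
# hdfs.append_file() until the WebHDFS append lease is free.  A hedged,
# stand-alone version of that idea; the helper name, the bounded attempt
# count, and the 0.2s backoff are assumptions.
import time


def append_with_retry(client, path, content, attempts=20, backoff=0.2):
    """Append `content` to `path`, retrying while the append keeps failing."""
    for _ in range(attempts):
        try:
            client.append_file(path, content, buffersize=4096)
            return True
        except Exception:
            time.sleep(backoff)
    return False
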
def upload_to_hdfs(self, local_file, table, index):
    '''
    upload file from local filesystem to hdfs
    '''
    hiveOper = hive_op.HiveOperation()
    local_dir = self._conf.get('local', 'data_dir')
    local_path = '{}{}/{}'.format(local_dir, index, local_file)

    host1 = self._conf.get('hdfs', 'name_node1')
    host2 = self._conf.get('hdfs', 'name_node2')
    user = self._conf.get('hdfs', 'user')
    port = self._conf.getint('hdfs', 'port')
    hdfs_base_path = self._conf.get('hdfs', 'upload_path')
    hdfs_dir_path = '{}{}'.format(hdfs_base_path, index)
    hdfs_path = '{}{}/{}'.format(hdfs_base_path, index, local_file)

    # implement HA manually
    try:
        hdfs_cli = PyWebHdfsClient(host=host1, port=port, user_name=user)
        hdfs_cli.list_dir('/')
    except Exception as e:
        logger.warn('open hdfs client failed error {}'.format(e))
        hdfs_cli = PyWebHdfsClient(host=host2, port=port, user_name=user)
        hdfs_cli.list_dir('/')

    if hdfs_cli is None:
        logger.error('no active host')
        return None

    try:
        hdfs_cli.get_file_dir_status(hdfs_path)
        # If the temporary file already exists on HDFS, the previous load into
        # Hive probably failed or the process was killed halfway through.
        # Load the data from the temporary file into Hive first, then continue.
        ret = hiveOper.load_hdfs_file_into_tmp_table(hdfs_path, table)
        if ret == -1:
            logger.error('load from hdfs to tmp table failed')

        logger.info('last time! {} load into tmp finished'.format(table))
        hiveOper.load_tmp_table_to_main(table)
        logger.info('last time! {} load tmp table to main finished'.format(table))
    # FileNotFoundException
    except Exception as e:
        # The file not existing is the normal case.
        logger.debug('no such file {}'.format(hdfs_path))

    retry_count = 0
    upload_finished = False
    while retry_count <= 10 and not upload_finished:
        with open(local_path) as f:
            logger.debug('''local path is {}, hdfs_cli is {}, file is {},
                         hdfs_path is {}'''.format(local_path, hdfs_cli, f, hdfs_path))
            # hdfs_cli.delete_file_dir(hdfs_path)
            # If the target directory does not exist yet, create it first.
            try:
                hdfs_cli.get_file_dir_status(hdfs_dir_path)
            except Exception as e:
                hdfs_cli.make_dir(hdfs_dir_path)
            try:
                hdfs_cli.create_file(hdfs_path, f)
                upload_finished = True
            except Exception as e:
                logger.warn('''create file on hdfs failed, local path is {},
                            hdfs path is {}, retry count {}, upload flag {}'''.format(
                    local_path, hdfs_path, retry_count, upload_finished))
                logger.warn('error is {}'.format(e))
                retry_count += 1

    if retry_count <= 10:
        return hdfs_path
    else:
        logger.error('''{} upload 10 times, still failed, retry count {},
                     upload_flag is {}'''.format(local_path, retry_count, upload_finished))
        return None

class HDFS(object):
    def __init__(self, host, port, user):
        self._hdfs = PyWebHdfsClient(host=host, port=port,
                                     user_name=user, timeout=None)
        logging.debug('webhdfs = %s@%s:%s', user, host, port)

    def recursive_copy(self, local_path, remote_path, exclude=None, permission=755):
        if exclude is None:
            exclude = []
        c_path = canonicalize(remote_path)
        logging.debug('making %s', c_path)
        self._hdfs.make_dir(c_path, permission=permission)
        fs_g = os.walk(local_path)
        for dpath, dnames, fnames in fs_g:
            _, relative_path = dpath.split(local_path)
            for dname in dnames:
                if dname not in exclude:
                    c_path = canonicalize('%s/%s/%s' % (remote_path, relative_path, dname))
                    logging.debug('making %s', c_path)
                    self._hdfs.make_dir(c_path, permission=permission)
            for fname in fnames:
                if fname not in exclude:
                    data = file(canonicalize('%s/%s/%s' % (local_path, relative_path, fname)), 'rb')
                    c_path = canonicalize('%s/%s/%s' % (remote_path, relative_path, fname))
                    logging.debug('creating %s', c_path)
                    self._hdfs.create_file(c_path, data, overwrite=True, permission=permission)
                    data.close()

    def make_dir(self, path, permission=755):
        logging.debug('make_dir: %s', path)
        self._hdfs.make_dir(canonicalize(path), permission=permission)

    def create_file(self, data, remote_file_path, permission=755):
        logging.debug('create_file: %s', remote_file_path)
        sio = BytesIO(data)
        self._hdfs.create_file(canonicalize(remote_file_path), sio,
                               overwrite=True, permission=permission)

    def append_file(self, data, remote_file_path):
        logging.debug('append to: %s', remote_file_path)
        self._hdfs.append_file(canonicalize(remote_file_path), data)

    def stream_file_to_disk(self, remote_file_path, local_file_path):
        chunk_size = 10 * 1024 * 1024
        offset = 0
        with open(local_file_path, 'wb') as dest_file:
            data = self._hdfs.read_file(canonicalize(remote_file_path),
                                        offset=offset, length=chunk_size)
            while True:
                dest_file.write(data)
                if len(data) < chunk_size:
                    break
                offset += chunk_size
                data = self._hdfs.read_file(canonicalize(remote_file_path),
                                            offset=offset, length=chunk_size)

    def read_file(self, remote_file_path):
        data = self._hdfs.read_file(canonicalize(remote_file_path))
        return data

    def remove(self, path, recursive=False):
        logging.debug('remove: %s', path)
        self._hdfs.delete_file_dir(canonicalize(path), recursive)

    def file_exists(self, path):
        try:
            self._hdfs.get_file_dir_status(path)
            return True
        except:
            return False

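# --- Usage sketch (not part of the original snippet) ---
# A hedged example of driving the wrapper above: push a local directory tree
# to HDFS and stream one file back down.  The paths, host, and user below are
# assumptions for illustration only.
fs = HDFS(host='localhost', port='50070', user='hduser')
fs.recursive_copy('/tmp/build/artifacts', 'user/hduser/artifacts',
                  exclude=['.git'])
fs.stream_file_to_disk('user/hduser/artifacts/report.bin', '/tmp/report.bin')
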
class HadoopFileSystem(BaseFs.FileSystem): def __init__(self, vcPath, simulateOnly=False, isVerbose=False, logger=None, user=None, host=None, port=None): BaseFs.FileSystem.__init__(self, simulateOnly, isVerbose, logger) config = Config.Config() hdfsUser, hdfsHost, hdfsPort = config.getHadoopConfig(user, host, port) self.hdfs = PyWebHdfsClient(host=hdfsHost, port=hdfsPort, user_name=hdfsUser) self.vcPath = vcPath def make_fd(self, path, isSrc, dstDirMustExist): fd = None try: fd = HadoopFileDescriptor(self, path, isSrc, dstDirMustExist) except pywebhdfs.errors.FileNotFound: self.logger.info("DESC: does not exist: " + path) raise Errors.FileNotFound("Path {0} does not exist".format(path)) except pywebhdfs.errors.Unauthorized as e: self.logger.info("Unauthorized for path {0}: {1}".format(path, e)) raise Errors.Unauthorized( "Unauthorized access to the path {0}: {1}".format(path, e)) except requests.exceptions.RequestException as e: self.logger.info("ConnectionError for path {0}: {1}".format( path, e)) raise Errors.BadConnection( "Connection error while looking for path: {0}, exc={1}".format( path, e)) except pywebhdfs.errors.PyWebHdfsException as e: self.logger.info("PyWebHdfsException for path {0}: {1}".format( path, e)) raise Errors.FsException( "An exception happened while looking for path: {0}, exc={1}". format(path, e)) return fd def exists_file_dir(self, fd): try: return self.hdfs.exists_file_dir(fd.abspath) except pywebhdfs.errors.Unauthorized as e: self.logger.info("Unauthorized for path {0}: {1}".format( fd.abspath, e)) raise Errors.Unauthorized( "Unauthorized access to the path {0}: {1}".format( fd.abspath, e)) except requests.exceptions.RequestException as e: self.logger.info("ConnectionError for path {0}: {1}".format( fd.abspath, e)) raise Errors.BadConnection( "Connection error during HDFS exists test: {0}, exc={1}". format(fd.abspath, e)) except pywebhdfs.errors.PyWebHdfsException as e: self.logger.info("PyWebHdfsException for path {0}: {1}".format( fd.abspath, e)) raise Errors.FsException( "An exception happened during HDFS exists test: {0}, exc={1}". 
format(fd.abspath, e)) def delete_file_dir(self, fd, recursive=False, force=False): if self.simulateOnly: print("SIMULATE -> remove file/dir: {0}, recursive={1}".format( fd.abspath, recursive)) else: try: if not recursive or force or \ query_yes_no(question="Are you sure you want to delete folder recursively?", default="no"): status = self.hdfs.delete_file_dir(fd.abspath, recursive=recursive) except pywebhdfs.errors.Unauthorized as e: self.logger.info("Unauthorized for path {0}: {1}".format( fd.abspath, e)) raise Errors.Unauthorized( "Unauthorized access to the path {0}: {1}".format( fd.abspath, e)) except requests.exceptions.RequestException as e: self.logger.info("ConnectionError for path {0}: {1}".format( fd.abspath, e)) raise Errors.BadConnection( "Connection error during HDFS delete directory: {0}, exc={1}" .format(fd.abspath, e)) except pywebhdfs.errors.PyWebHdfsException as e: self.logger.info("PyWebHdfsException for path {0}: {1}".format( fd.abspath, e)) raise Errors.FsException( "An exception happened during HDFS delete directory: {0}, exc={1}" .format(fd.abspath, e)) def list_dir(self, fd): try: status = self.hdfs.list_dir(fd.abspath) except pywebhdfs.errors.Unauthorized as e: self.logger.info("Unauthorized for path {0}: {1}".format( fd.abspath, e)) raise Errors.Unauthorized( "Unauthorized access to the path {0}: {1}".format( fd.abspath, e)) except requests.exceptions.RequestException as e: self.logger.info("ConnectionError for path {0}: {1}".format( fd.abspath, e)) raise Errors.BadConnection( "Connection error while looking for path: {0}, exc={1}".format( fd.abspath, e)) except pywebhdfs.errors.PyWebHdfsException as e: self.logger.info("PyWebHdfsException for path {0}: {1}".format( fd.abspath, e)) raise Errors.FsException( "An exception happened while looking for path: {0}, exc={1}". format(fd.abspath, e)) currentDir = status["FileStatuses"]["FileStatus"] for item in currentDir: yield HadoopFileDescriptor(self, fd.abspath, isSrc=True, needsDstDirCheck=False, fileJson=item) def make_dir(self, path): if self.simulateOnly: print("SIMULATE -> make dir: " + path) else: try: self.hdfs.make_dir(path) except pywebhdfs.errors.Unauthorized as e: self.logger.info("Unauthorized for path {0}: {1}".format( path, e)) raise Errors.Unauthorized( "Unauthorized access to the path {0}: {1}".format(path, e)) except requests.exceptions.RequestException as e: self.logger.info("ConnectionError for path {0}: {1}".format( path, e)) raise Errors.BadConnection( "Connection error during HDFS create directory: {0}, exc={1}" .format(path, e)) except pywebhdfs.errors.PyWebHdfsException as e: self.logger.info("PyWebHdfsException for path {0}: {1}".format( path, e)) raise Errors.FsException( "An exception happened during HDFS create directory: {0}, exc={1}" .format(path, e)) def open_file(self, fd, rwMode): return fd def close_file(self, fd): pass def touch_file(self, fd): if self.simulateOnly: print("SIMULATE -> touch file: " + fd.abspath) else: try: self.hdfs.create_file(fd.abspath, 0, overwrite=True) except pywebhdfs.errors.Unauthorized as e: self.logger.info("Unauthorized for path {0}: {1}".format( fd.abspath, e)) raise Errors.Unauthorized( "Unauthorized access to the path {0}: {1}".format( fd.abspath, e)) except requests.exceptions.RequestException as e: self.logger.info("ConnectionError for path {0}: {1}".format( fd.abspath, e)) raise Errors.BadConnection( "Connection error during HDFS create file: {0}, exc={1}". 
format(fd.abspath, e)) except pywebhdfs.errors.PyWebHdfsException as e: self.logger.info("PyWebHdfsException for path {0}: {1}".format( fd.abspath, e)) raise Errors.FsException( "An exception happened during HDFS create file: {0}, exc={1}" .format(fd.abspath, e)) def truncate_file(self, fd, size): if self.simulateOnly: print("SIMULATE -> truncate file: {0}, size={1}".format( fd.abspath, size)) else: try: self.hdfs.truncate_file(fd.abspath, size) except pywebhdfs.errors.Unauthorized as e: self.logger.info("Unauthorized for path {0}: {1}".format( fd.abspath, e)) raise Errors.Unauthorized( "Unauthorized access to the path {0}: {1}".format( fd.abspath, e)) except requests.exceptions.RequestException as e: self.logger.info("ConnectionError for path {0}: {1}".format( fd.abspath, e)) raise Errors.BadConnection( "Connection error during HDFS truncate file: {0}, exc={1}". format(fd.abspath, e)) except pywebhdfs.errors.PyWebHdfsException as e: self.logger.info("PyWebHdfsException for path {0}: {1}".format( fd.abspath, e)) raise Errors.FsException( "An exception happened during HDFS truncate file: {0}, exc={1}" .format(fd.abspath, e)) def try_concat_files(self, fd, chunkFdList): # Workaround for unordered concat bug in Hadoop 2.7.1 is to use one source at the time # https://issues.apache.org/jira/browse/HDFS-8891 currIndex = 0 concatStep = 20 chunkedList = [ chunkFdList[pos:pos + concatStep] for pos in range(0, len(chunkFdList), concatStep) ] for sourceChunk in chunkedList: try: self.concat_files(fd, sourceChunk) currIndex += len(sourceChunk) except Errors.FsException as e: break return currIndex def concat_files(self, fd, chunkFdList): strList = list() for chunkFd in chunkFdList: strList.append(chunkFd.abspath) if self.simulateOnly: print("SIMULATE -> concat file: {0}, sources={1}".format( fd.abspath, ",".join(strList))) else: try: self.hdfs.concat_files(fd.abspath, strList) except pywebhdfs.errors.Unauthorized as e: self.logger.info("Unauthorized for path {0}: {1}".format( fd.abspath, e)) raise Errors.Unauthorized( "Unauthorized access to the path {0}: {1}".format( fd.abspath, e)) except requests.exceptions.RequestException as e: self.logger.info("ConnectionError for path {0}: {1}".format( fd.abspath, e)) raise Errors.BadConnection( "Connection error during HDFS concat file: {0}, exc={1}". format(fd.abspath, e)) except pywebhdfs.errors.PyWebHdfsException as e: self.logger.info("PyWebHdfsException for path {0}: {1}".format( fd.abspath, e)) raise Errors.FsException( "An exception happened during HDFS concat file: {0}, exc={1}" .format(fd.abspath, e)) def read_data(self, fd, offset, size): if offset >= fd.size: return "" else: try: contents = self.hdfs.read_file(fd.abspath, offset=offset, length=size) except pywebhdfs.errors.Unauthorized as e: self.logger.info("Unauthorized for path {0}: {1}".format( fd.abspath, e)) raise Errors.Unauthorized( "Unauthorized access to the path {0}: {1}".format( fd.abspath, e)) except requests.exceptions.RequestException as e: self.logger.info("ConnectionError for path {0}: {1}".format( fd.abspath, e)) raise Errors.BadConnection( "Connection error during HDFS read file: {0}, exc={1}". 
format(fd.abspath, e)) except pywebhdfs.errors.PyWebHdfsException as e: self.logger.info("PyWebHdfsException for path {0}: {1}".format( fd.abspath, e)) raise Errors.FsException( "An exception happened during HDFS read file: {0}, exc={1}" .format(fd.abspath, e)) return contents def append_data(self, fd, data): if self.simulateOnly: print("SIMULATE -> write file data: " + fd.abspath) else: try: self.hdfs.append_file(fd.abspath, data) except pywebhdfs.errors.Unauthorized as e: self.logger.info("Unauthorized for path {0}: {1}".format( fd.abspath, e)) raise Errors.Unauthorized( "Unauthorized access to the path {0}: {1}".format( fd.abspath, e)) except requests.exceptions.RequestException as e: self.logger.info("ConnectionError for path {0}: {1}".format( fd.abspath, e)) raise Errors.BadConnection( "Connection error during HDFS append file: {0}, exc={1}". format(fd.abspath, e)) except pywebhdfs.errors.PyWebHdfsException as e: self.logger.info("PyWebHdfsException for path {0}: {1}".format( fd.abspath, e)) raise Errors.FsException( "An exception happened during HDFS append file: {0}, exc={1}" .format(fd.abspath, e)) def local_mv_file(self, src, dst): if self.simulateOnly: print("SIMULATE -> local move file: {0} -> {1} ".format( src.abspath, dst.abspath)) else: try: self.hdfs.rename_file_dir(src.abspath, dst.abspath) except pywebhdfs.errors.Unauthorized as e: self.logger.info("Unauthorized for path {0}: {1}".format( src.abspath, e)) raise Errors.Unauthorized( "Unauthorized access to the path {0}: {1}".format( src.abspath, e)) except requests.exceptions.RequestException as e: self.logger.info("ConnectionError for path {0}: {1}".format( src.abspath, e)) raise Errors.BadConnection( "Connection error during HDFS rename file: {0}, exc={1}". format(src.abspath, e)) except pywebhdfs.errors.PyWebHdfsException as e: self.logger.info("PyWebHdfsException for path {0}: {1}".format( src.abspath, e)) raise Errors.FsException( "An exception happened during HDFS rename file: {0}, exc={1}" .format(src.abspath, e)) def local_cp_file(self, src, dst): # This is an open issue in Hadoop community: https://issues.apache.org/jira/browse/HDFS-3370 # Instead, we can do a symbolic link if self.simulateOnly: print("SIMULATE -> local copy file: {0} -> {1} ".format( src.abspath, dst.abspath)) else: print( "Copy within HDFS is not supported due to lack of Hadoop support" ) print( "Once symbolic links are enabled, this feature will be enabled" ) sys.exit(1) # self.hdfs.create_sym_link(src.abspath, dst.abspath, createParent=True) def get_hdfs_file_dir_json(self, path): try: status = self.hdfs.get_file_dir_status(path) return status["FileStatus"] except pywebhdfs.errors.FileNotFound: return None def validate_hdfs_arg(self, arg): if not arg.startswith(self.vcPath): print("Error: You don't have permissions to the path: %s" % arg) print("Your path must be rooted under: %s" % self.vcPath) sys.exit(1)
from pywebhdfs.webhdfs import PyWebHdfsClient

hdfs = PyWebHdfsClient(host='localhost', port='50070', user_name='vagrant')

my_file = 'user/vagrant/hdfs-test/data.dat'

print 'Status of file: ', my_file
status = hdfs.get_file_dir_status(my_file)
print status

print 'Second 500 bytes of file: ', my_file
data = hdfs.read_file(my_file, offset=500, length=500)
print data

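# --- Chunked-read sketch (not part of the original snippet) ---
# The same offset/length pair can be used to walk a large file in fixed-size
# windows instead of pulling it in one call; the 500-byte chunk size is just
# an illustrative assumption, and the file length comes from FileStatus.
length = hdfs.get_file_dir_status(my_file)['FileStatus']['length']
offset = 0
chunk_size = 500
while offset < length:
    chunk = hdfs.read_file(my_file, offset=offset, length=chunk_size)
    # process the current window here
    offset += chunk_size
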
from pywebhdfs.webhdfs import PyWebHdfsClient
import logging
from pprint import pprint

logging.basicConfig(level=logging.DEBUG)
_LOG = logging.getLogger(__name__)

# host = your server address.
hdfs = PyWebHdfsClient(host='', port='50070',
                       user_name='hduser', timeout=4)  # your Namenode IP & username here

my_dir = '/user/hduser/sample'
fileFinal = my_dir + '/file.txt'

pprint(hdfs.list_dir(my_dir))

dir_status = hdfs.get_file_dir_status(my_dir)
print dir_status

print "Reading file from hadoop hdfs"
file_data = hdfs.read_file("user/hduser/sample/file.txt")
print file_data

_LOG = logging.getLogger(__name__)

example_dir = 'user/hdfs/example_dir'
example_file = '{dir}/example.txt'.format(dir=example_dir)
example_data = '01010101010101010101010101010101010101010101\n'
rename_dir = 'user/hdfs/example_rename'

# create a new client instance
hdfs = PyWebHdfsClient(host='localhost', port='50070', user_name='hduser')

# create a new directory for the example
print('making new HDFS directory at: {0}\n'.format(example_dir))
hdfs.make_dir(example_dir)

# get a dictionary of the directory's status
dir_status = hdfs.get_file_dir_status(example_dir)
print dir_status

# create a new file on hdfs
print('making new file at: {0}\n'.format(example_file))
hdfs.create_file(example_file, example_data)

file_status = hdfs.get_file_dir_status(example_file)
print file_status

# append to the file created in previous step
print('appending to file at: {0}\n'.format(example_file))
hdfs.append_file(example_file, example_data)

file_status = hdfs.get_file_dir_status(example_file)
print file_status

                      verify=False)
stringio = StringIO.StringIO(source.content)
unzipped = zipfile.ZipFile(stringio)

import pandas as pd
from pywebhdfs.webhdfs import PyWebHdfsClient

subselection_csv = pd.read_csv(unzipped.open('LoanStats3d.csv'), skiprows=1,
                               skipfooter=2, engine='python')
stored_csv = subselection_csv.to_csv('./stored_csv.csv')

hdfs = PyWebHdfsClient(user_name="hdfs", port=50070, host="sandbox")
hdfs.make_dir('chapter5')
with open('./stored_csv.csv') as file_data:
    hdfs.create_file('chapter5/LoanStats3d.csv', file_data, overwrite=True)
print(hdfs.get_file_dir_status('chapter5/LoanStats3d.csv'))

from pyspark.sql import HiveContext

# sc = SparkContext()
sqlContext = HiveContext(sc)
data = sc.textFile("/chapter5/LoanStats3d.csv")
parts = data.map(lambda r: r.split(','))
firstline = parts.first()
datalines = parts.filter(lambda x: x != firstline)


def cleans(row):
    row[7] = str(float(row[7][:-1]) / 100)
    return [s.encode('utf8').replace(r"_", " ").lower() for s in row]