def ship_udf(ic, function, hdfs_path=None, udf_name=None, database=None,
             overwrite=False):
    # extract some information from the function
    if udf_name is None:
        udf_name = function.name
    symbol = function.llvm_func.name
    ir = function.llvm_module.to_bitcode()
    return_type = udf_to_impala_type[function.signature.return_type.name]
    arg_types = [udf_to_impala_type[arg.name]
                 for arg in function.signature.args[1:]]

    # ship the IR to the cluster
    hdfs_client = PyWebHdfsClient(host=ic._nn_host, port=ic._webhdfs_port,
                                  user_name=ic._hdfs_user)
    if hdfs_path is None:
        hdfs_path = os.path.join(ic._temp_dir, udf_name + '.ll')
    if not hdfs_path.endswith('.ll'):
        raise ValueError("The HDFS file name must end with .ll")
    hdfs_client.create_file(hdfs_path.lstrip('/'), ir, overwrite=overwrite)

    # register the function in Impala
    if database is None:
        database = ic._temp_db
    impala_name = '%s.%s(%s)' % (database, udf_name, ', '.join(arg_types))
    if overwrite:
        ic._cursor.execute("DROP FUNCTION IF EXISTS %s" % impala_name)
    register_query = ("CREATE FUNCTION %s RETURNS %s "
                      "LOCATION '%s' SYMBOL='%s'") % (impala_name, return_type,
                                                      hdfs_path, symbol)
    ic._cursor.execute(register_query)
def setup_common_oozie_libs(name_node): webhdfs_port = '14000' webhdfs_user = '******' platform_dir = 'user/deployment/platform' lib_path_list = ['/usr/hdp/current/hbase-client/lib/hbase-client.jar', '/usr/hdp/current/hbase-client/lib/hbase-common.jar', '/usr/hdp/current/hbase-client/lib/hbase-protocol.jar', '/usr/hdp/current/hbase-client/lib/hbase-server.jar', '/usr/hdp/current/hbase-client/lib/htrace-core-3.1.0-incubating.jar', '/usr/hdp/current/hbase-client/lib/hbase-hadoop-compat.jar', '/usr/hdp/current/hbase-client/lib/hbase-it.jar', '/usr/hdp/current/hbase-client/lib/hbase-prefix-tree.jar', '/usr/hdp/current/hbase-client/lib/zookeeper.jar', '/usr/hdp/current/pig-client/piggybank.jar', '/usr/hdp/current/spark-client/lib/spark-examples.jar'] # Setup a connection with hdfs using namenode. hdfs_client = PyWebHdfsClient(host=name_node, port=webhdfs_port, user_name=webhdfs_user, timeout=None) # Create directory on hadoop file system (HDFS). hdfs_client.make_dir(platform_dir) # Creates a new file on HDFS and write contents from local FS. for path in lib_path_list: platform_file = '%s/%s' % (platform_dir, os.path.basename(path)) print 'Copying source file: %s to HDFS path %s' % (path, platform_file) with open(path) as file_data: try: hdfs_client.create_file(platform_file, file_data, overwrite=True) except PyWebHdfsException: print 'retrying HDFS copy command for %s' % platform_file time.sleep(5) hdfs_client.create_file(platform_file, file_data, overwrite=True)
def solarLog_call(epoch_time):
    r = requests.get("http://winsun.solarlog-web.ch/api?cid=" + pfadheimBaarCID
                     + "&locale=de_ch&username=277555406&password=5a03cdf0a3ff42de09bc85361d8a2f0f"
                     + "&function=dashboard&format=jsonh&solarlog=9112"
                     + "&tiles=Yield|true,Grafic|true,Env|true,Weather|true&ctime=" + epoch_time)
    logging.info("Response: " + str(r.status_code) + " " + r.reason)

    data = r.json()  # This will return the entire content.
    data['timestamp'] = epoch_time

    # Remove keys with a complex JSON structure
    del data['cur_production_per_wrid']
    del data['invEnergyType']
    #del data['decimalseperator']
    logging.debug(data)

    # write data to .json
    with open('/home/claude/repo/bda-solar/data/data_timestamp/pfadibaar_solarlog_' + epoch_time + '.json',
              'w', encoding='utf-8') as outfile:
        json.dump(data, outfile, indent=4, ensure_ascii=False)

    # write the same data as .csv since it is easier to handle with hdfs
    with open('/home/claude/repo/bda-solar/data/data_timestamp/pfadibaar_solarlog_' + epoch_time + '.csv',
              'w') as f:  # Just use 'w' mode in 3.x
        w = csv.DictWriter(f, data.keys(), dialect=csv.excel_tab)
        w.writeheader()
        w.writerow(data)

    # write the same data as .csv to HDFS since it is easier to handle
    hdfs = PyWebHdfsClient(host='193.246.208.147', port='50079', user_name='hdfs')
    #hdfs_path = 'user/hdfs/from_python'
    hdfs.create_file('user/hdfs/pfadibaar_solarlog.csv', '0100')
    #with open('pfadibaar_solarlog_' + epoch_time + '.csv') as file_data:
    #    hdfs.create_file(hdfs_path, data=file_data)
def upload_file():
    """
    Upload file
    ---
    tags:
      - Files
    consumes: "multipart/form-data"
    parameters:
      - name: file
        in: formData
        required: true
        paramType: body
        dataType: file
        type: file
    responses:
      200:
        description: Return a successful message
      401:
        description: Unauthorized
      400:
        description: Bad Request
      500:
        description: Server Internal error
    """
    # hard-coded config information. You should improve it.
    hdfs = PyWebHdfsClient(host='webhdfs', port='50070', user_name='thanhson1085')
    if request.method == 'POST':
        file = request.files['file']
        if file and allowed_file(file.filename):
            filename = secure_filename(str(time.time()) + file.filename)
            my_file = 'tmp/thanhson1085/data/' + filename
            hdfs.create_file(my_file, file)
            return jsonify({'success': 'true'})
    return jsonify({'success': 'false'})
def update_raw_stage(output, delivery_tag):
    #context = zmq.Context()
    #confirm = context.socket(zmq.PUSH)
    #confirm.connect(confirm_host)

    hdfs = PyWebHdfsClient(host=webhdfs_host, port=webhdfs_port, user_name=webhdfs_user)
    impala_conn = connect(host=impala_host, port=int(impala_port))
    cur = impala_conn.cursor()

    start_time = time.time()
    for k, v in output.iteritems():
        if (time.time() - start_time) / 60 > sink_minutes:
            sink_logger.warning('ETL process running longer than sink timeout: {0} minutes'
                                .format((time.time() - start_time) / 60))
        try:
            file_name = 'user/impala/test_log_1/raw_log_{0}.txt'.format(k)
            hdfs.append_file(file_name, '\n'.join(v))
            cur.execute('refresh test_log_{0}'.format(k))
        except hdfs_err.PyWebHdfsException:
            file_name = 'user/impala/test_log_1/raw_log_{0}.txt'.format(k)
            hdfs.create_file(file_name, '')
            hdfs.append_file(file_name, '\n'.join(v))
            cur.execute('refresh test_log_{0}'.format(k))

    #confirm.send(delivery_tag)
    sink_logger.info('ETL process finished for {0} minutes'.format((time.time() - start_time) / 60))
    sink_logger.info('ETL process finished with {0} delivery_tag'.format(delivery_tag))
def setup_common_oozie_libs(name_node): webhdfs_port = '14000' webhdfs_user = '******' platform_dir = 'user/deployment/platform' lib_path_list = ['/opt/cloudera/parcels/CDH/lib/hbase/hbase-client.jar', '/opt/cloudera/parcels/CDH/lib/hbase/hbase-common.jar', '/opt/cloudera/parcels/CDH/lib/hbase/hbase-protocol.jar', '/opt/cloudera/parcels/CDH/lib/hbase/hbase-server.jar', '/opt/cloudera/parcels/CDH/lib/hbase/lib/htrace-core.jar', '/opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop-compat.jar', '/opt/cloudera/parcels/CDH/lib/hbase/hbase-it.jar', '/opt/cloudera/parcels/CDH/lib/hbase/hbase-prefix-tree.jar', '/opt/cloudera/parcels/CDH/lib/hbase/lib/zookeeper.jar', '/opt/cloudera/parcels/CDH/lib/pig/piggybank.jar', '/opt/cloudera/parcels/CDH/lib/spark/lib/spark-examples.jar'] # Setup a connection with hdfs using namenode. hdfs_client = PyWebHdfsClient(host=name_node, port=webhdfs_port, user_name=webhdfs_user, timeout=None) # Create directory on hadoop file system (HDFS). hdfs_client.make_dir(platform_dir) # Creates a new file on HDFS and write contents from local FS. for path in lib_path_list: platform_file = '%s/%s' % (platform_dir, os.path.basename(path)) logging.info('Copying source file: %s to HDFS path %s', path, platform_file) with open(path) as file_data: hdfs_client.create_file(platform_file, file_data, overwrite=True)
def submit(self, bund, files=[]):
    hdfs = PyWebHdfsClient(host=os.environ["WEBHDFS_HOST"], port='14000', user_name='oozie')
    for f in files:
        hdfs.create_file("{}/{}".format(bund.path, f.name), f.read())

    doc, tag, text = Doc().tagtext()
    with tag("configuration"):
        with tag("property"):
            with tag("name"):
                text("user.name")
            with tag("value"):
                text("oozie")
        with tag("property"):
            with tag("name"):
                text("oozie.bundle.application.path")
            with tag("value"):
                text("/" + bund.path + "/" + bund.name)
    configuration = doc.getvalue()

    response = post("{0}/oozie/v1/jobs".format(self.url),
                    data=configuration,
                    headers={'Content-Type': 'application/xml'})

    if response.status_code > 399:
        print response.headers["oozie-error-message"]
        print response.status_code
        print response.content
def saveToStore(path,meta): con=happybase.Connection(MasterHbase) con.open() metaTable= con.table('MetaTable') if meta['size'] < largeSize: # save to Hbase encTable = con.table('EncTable') with open(path,'rb') as f: encTable.put(meta['rowkey'],{'enc:data': f.read()}) metaTable.put(str(meta['rowkey']),{ 'pp:name': str(meta['filename']), 'pp:checksum': str(meta['checksum']), 'pp:size': str(meta['size']), 'pp:often': str(meta['often']), 'pp:des': str(meta['description']) } ) app.logger.debug('%s is saved to Hbase',meta['rowkey']) else: # save to HDFS hdfs = PyWebHdfsClient(host=Master,port='50070', timeout=None,user_name='hduser') with open(path, 'rb') as f: hdfs.create_file(HDFSMainPath+meta['rowkey'], f) metaTable.put(str(meta['rowkey']),{ 'pp:name': str(meta['filename']), 'pp:checksum': str(meta['checksum']), 'pp:size': str(meta['size']), 'pp:HDFSpath': str(HDFSMainPath + meta['rowkey']), 'pp:often': str(meta['often']), 'pp:des': str(meta['description']) } ) app.logger.debug('%s is saved to HDFS',meta['rowkey']) con.close()
def ship_udf(ic, function, hdfs_path=None, udf_name=None, database=None,
             overwrite=False):
    # extract some information from the function
    if udf_name is None:
        udf_name = function.name
    symbol = function.llvm_func.name
    ir = function.llvm_module.to_bitcode()
    return_type = udf_to_impala_type[function.signature.return_type.name]
    arg_types = [udf_to_impala_type[arg.name]
                 for arg in function.signature.args[1:]]

    # ship the IR to the cluster
    hdfs_client = PyWebHdfsClient(host=ic._nn_host, port=ic._webhdfs_port,
                                  user_name=ic._hdfs_user)
    if hdfs_path is None:
        hdfs_path = os.path.join(ic._temp_dir, udf_name + '.ll')
    if not hdfs_path.endswith('.ll'):
        raise ValueError("The HDFS file name must end with .ll")
    hdfs_client.create_file(hdfs_path.lstrip('/'), ir, overwrite=overwrite)

    # register the function in Impala
    if database is None:
        database = ic._temp_db
    impala_name = '%s.%s(%s)' % (database, udf_name, ', '.join(arg_types))
    if overwrite:
        ic._cursor.execute("DROP FUNCTION IF EXISTS %s" % impala_name)
    register_query = "CREATE FUNCTION %s RETURNS %s LOCATION '%s' SYMBOL='%s'" % \
        (impala_name, return_type, hdfs_path, symbol)
    ic._cursor.execute(register_query)
def to_hdfs(file_path, hdfs_path):
    hdfs = PyWebHdfsClient(host='hdfs-v1', port='50070', user_name='hdfs', timeout=100)
    with open(file_path, 'rb') as f:
        hdfs.create_file(hdfs_path, f, overwrite=True)
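A minimal usage sketch for the helper above; the local path and the target HDFS path are illustrative assumptions, not values taken from the original snippet.

# Hypothetical call: copy a local CSV into an HDFS directory.
# Both paths are made-up examples; create_file expects the HDFS path
# relative to the WebHDFS root (no leading slash needed here).
to_hdfs('/tmp/events.csv', 'user/hdfs/events.csv')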
def setup_common_oozie_libs(name_node): webhdfs_port = '14000' webhdfs_user = '******' platform_dir = 'user/deployment/platform' lib_path_list = [ '/opt/cloudera/parcels/CDH/lib/hbase/hbase-client.jar', '/opt/cloudera/parcels/CDH/lib/hbase/hbase-common.jar', '/opt/cloudera/parcels/CDH/lib/hbase/hbase-protocol.jar', '/opt/cloudera/parcels/CDH/lib/hbase/hbase-server.jar', '/opt/cloudera/parcels/CDH/lib/hbase/lib/htrace-core.jar', '/opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop-compat.jar', '/opt/cloudera/parcels/CDH/lib/hbase/hbase-it.jar', '/opt/cloudera/parcels/CDH/lib/hbase/hbase-prefix-tree.jar', '/opt/cloudera/parcels/CDH/lib/hbase/lib/zookeeper.jar', '/opt/cloudera/parcels/CDH/lib/pig/piggybank.jar' ] # Setup a connection with hdfs using namenode. hdfs_client = PyWebHdfsClient(host=name_node, port=webhdfs_port, user_name=webhdfs_user, timeout=None) # Create directory on hadoop file system (HDFS). hdfs_client.make_dir(platform_dir) # Creates a new file on HDFS and write contents from local FS. for path in lib_path_list: platform_file = '%s/%s' % (platform_dir, os.path.basename(path)) logging.info('Copying source file: %s to HDFS path %s', path, platform_file) with open(path) as file_data: hdfs_client.create_file(platform_file, file_data, overwrite=True)
def save_extracted_subgraph(elements, args: application_args):
    pair, subgraph, _ = elements
    path = args.get_folder_results_path()
    hdfs = PyWebHdfsClient(host=args.hdfs_host, port=args.hdfs_port)
    file = os.path.join(path, f"graph_{str(pair[0])}_{str(pair[1])}")
    pickled = pkl.dumps(subgraph)
    hdfs.create_file(file, pickled, overwrite=True)
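A hedged counterpart sketch for reading such a pickled subgraph back; it assumes the same `application_args` object and file-naming convention as the snippet above, and it is not part of the original code.

def load_extracted_subgraph(pair, args: application_args):
    # Assumes the naming convention used when the subgraph was written.
    path = args.get_folder_results_path()
    hdfs = PyWebHdfsClient(host=args.hdfs_host, port=args.hdfs_port)
    file = os.path.join(path, f"graph_{str(pair[0])}_{str(pair[1])}")
    # read_file returns the raw bytes stored at the given HDFS path.
    return pkl.loads(hdfs.read_file(file))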
def test_create_throws_exception_for_not_created(self, mock_put):
    webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                              user_name=self.user_name)
    self.init_response.status_code = http_client.TEMPORARY_REDIRECT
    self.response.status_code = http_client.BAD_REQUEST
    mock_put.side_effect = [self.init_response, self.response]
    with self.assertRaises(errors.PyWebHdfsException):
        webhdfs.create_file(self.path, self.file_data)
def test_webhdfs_csv(self):
    from pywebhdfs.webhdfs import PyWebHdfsClient
    dfs = PyWebHdfsClient(host='localhost', port='9870', user_name='hadoop')
    dfs.make_dir("/temp")
    with open("tests/data/data.csv") as input_file:
        dfs.create_file("/temp/data.csv", file_data=input_file, overwrite=True)
    dfs.delete_file_dir("/temp", recursive=True)
def put_file_contents(self, hdfs_path, file_contents, user_name='trifacta',
                      httpfs_port='14000'):
    hdfs = PyWebHdfsClient(host=urlparse(self.trifacta_base_url).netloc,
                           port=httpfs_port, user_name=user_name)
    hdfs.create_file(hdfs_path, file_contents, overwrite=True)
    return True
class WhenTestingCreateOperation(unittest.TestCase): def setUp(self): self.host = 'hostname' self.port = '00000' self.user_name = 'username' self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port, user_name=self.user_name) self.response = MagicMock() self.requests = MagicMock(return_value=self.response) self.location = 'redirect_uri' self.path = 'user/hdfs' self.file_data = '010101' self.init_response = MagicMock() self.init_response.headers = {'location': self.location} self.response = MagicMock() self.expected_headers = {'content-type': 'application/octet-stream'} def test_create_throws_exception_for_no_redirect(self): self.init_response.status_code = http_client.BAD_REQUEST self.response.status_code = http_client.CREATED self.requests.put.side_effect = [self.init_response, self.response] with patch('pywebhdfs.webhdfs.requests', self.requests): with self.assertRaises(errors.PyWebHdfsException): self.webhdfs.create_file(self.path, self.file_data) def test_create_throws_exception_for_not_created(self): self.init_response.status_code = http_client.TEMPORARY_REDIRECT self.response.status_code = http_client.BAD_REQUEST self.requests.put.side_effect = [self.init_response, self.response] with patch('pywebhdfs.webhdfs.requests', self.requests): with self.assertRaises(errors.PyWebHdfsException): self.webhdfs.create_file(self.path, self.file_data) def test_create_returns_file_location(self): self.init_response.status_code = http_client.TEMPORARY_REDIRECT self.response.status_code = http_client.CREATED self.put_method = MagicMock( side_effect=[self.init_response, self.response]) self.requests.put = self.put_method with patch('pywebhdfs.webhdfs.requests', self.requests): result = self.webhdfs.create_file(self.path, self.file_data) self.assertTrue(result) self.put_method.assert_called_with(self.location, headers=self.expected_headers, data=self.file_data)
class WhenTestingCreateOperation(unittest.TestCase): def setUp(self): self.host = 'hostname' self.port = '00000' self.user_name = 'username' self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port, user_name=self.user_name) self.response = MagicMock() self.requests = MagicMock(return_value=self.response) self.location = 'redirect_uri' self.path = 'user/hdfs' self.file_data = '010101' self.init_response = MagicMock() self.init_response.headers = {'location': self.location} self.response = MagicMock() self.expected_headers = {'content-type': 'application/octet-stream'} def test_create_throws_exception_for_no_redirect(self): self.init_response.status_code = httplib.BAD_REQUEST self.response.status_code = httplib.CREATED self.requests.put.side_effect = [self.init_response, self.response] with patch('pywebhdfs.webhdfs.requests', self.requests): with self.assertRaises(errors.PyWebHdfsException): self.webhdfs.create_file(self.path, self.file_data) def test_create_throws_exception_for_not_created(self): self.init_response.status_code = httplib.TEMPORARY_REDIRECT self.response.status_code = httplib.BAD_REQUEST self.requests.put.side_effect = [self.init_response, self.response] with patch('pywebhdfs.webhdfs.requests', self.requests): with self.assertRaises(errors.PyWebHdfsException): self.webhdfs.create_file(self.path, self.file_data) def test_create_returns_file_location(self): self.init_response.status_code = httplib.TEMPORARY_REDIRECT self.response.status_code = httplib.CREATED self.put_method = MagicMock( side_effect=[self.init_response, self.response]) self.requests.put = self.put_method with patch('pywebhdfs.webhdfs.requests', self.requests): result = self.webhdfs.create_file(self.path, self.file_data) self.assertTrue(result) self.put_method.assert_called_with( self.location, headers=self.expected_headers, data=self.file_data)
def create_data_from_station_data(first, second): """this function creates the data analyzing the two stations in comparison""" global hdfs; #global hdfs object global hbase; #global hbase object if(hdfs is None): from pywebhdfs.webhdfs import PyWebHdfsClient; hdfs = PyWebHdfsClient(host='cshadoop.boisestate.edu',port='50070', user_name='uacharya'); if(hbase is None): import happybase; hbase = happybase.ConnectionPool(size=1,host='cshadoop.boisestate.edu'); date_for_comparision = first["Date"].strip(); # creating directory for each date try: hdfs.get_file_dir_status('user/uacharya/simulation/'+date_for_comparision); except Exception: # directory to hold dataset in csv file for reach node in wall display starting from 1 to 9 for index in range(1, 10): content = 'Date,ID,Source,Destination,S_Lat,S_Lon,D_Lat,D_Lon,Wind_Lat,Wind_Lon,Wind_Velocity\n'; try: hdfs.create_file('user/uacharya/simulation/'+date_for_comparision+'/node'+str(index)+'/output.csv',content,replication=1); except Exception: continue; dataset = {'node_1':[],'node_2':[],'node_3':[],'node_4':[],'node_5':[],'node_6':[],'node_7':[],'node_8':[],'node_9':[]}; for data in broadcast_variable.value: compare_data_between(date_for_comparision, first, data,dataset); # for key in dataset: # if(len(dataset[key])!=0): # content = "\n".join(dataset[key]); # content +="\n"; # while(True): # try: # hdfs.append_file('user/uacharya/simulation/'+date+'/'+key+'/output.csv',content,buffersize=4096); # break; # except Exception: # time.sleep(0.2); # continue; dataset.clear(); #clearing the dictionary # append over here after all the global variable has been made return second;
def sharedlib_install(name_node, webhdfs_port, authentic_user, platform_dir,
                      lib_path_list):
    # Setup a connection with hdfs using namenode.
    hdfs = PyWebHdfsClient(host=name_node, port=webhdfs_port,
                           user_name=authentic_user, timeout=None)
    # Create directory on hadoop file system (HDFS).
    hdfs.make_dir(platform_dir)
    # Creates a new file on HDFS and writes contents from local FS.
    for path in lib_path_list:
        platform_file = '%s/%s' % (platform_dir, os.path.basename(path))
        print >> sys.stdout, 'Copying source file: %s to HDFS path %s' % \
            (path, platform_file)
        with open(path) as file_data:
            hdfs.create_file(platform_file, file_data, overwrite=True)
class HdfsHandler: def __init__(self, hadoopHost, hadopPort='50070', user='******'): # self.hdfs = PyWebHdfsClient(host='52.14.121.163', port='50070', user_name='hadoop') self.hdfs = PyWebHdfsClient(host=hadoopHost, port=hadopPort, user_name=user) self.s3_client = boto3.client('s3') def copyToHDFS(self, src_path, hdfs_path): if hdfs_path.startswith("hdfs"): temp_path = hdfs_path.split("8020") self.new_hdfs_path = temp_path[1] + '/lib' print "New Path: %s" % self.new_hdfs_path # create a new client instance # print "New Path: %s" % self.new_hdfs_path[1] jar_name = os.path.basename(src_path) print src_path fileContent = open(src_path, 'rb').read() # copies file to local for testing purpose # with open("E:/temp/java-0.0.2.jar", "wb") as jarfile: # jarfile.write(fileContent) # create a new file on hdfs print('making new file at: {0}\n'.format(jar_name)) result = self.hdfs.create_file(self.new_hdfs_path + "/" + jar_name, fileContent, overwrite=True) print "HDFS Copy Result: %s" % result return result def list_hdfs_dir(self, hdfs_path): print self.hdfs.list_dir(hdfs_path)
def writeFileToHdfs(hostName, userName, writePath, dataframe, fileName):
    auth = HTTPKerberosAuth()
    hdfsClient = PyWebHdfsClient(host=hostName, port='50070', user_name=userName,
                                 request_extra_opts={'auth': auth})
    outputPath = writePath
    stringDF = dataframe.to_csv(columns=[
        "env_name", "db_name", "tab_name", "col_name", "data_type", "comment"
    ], index=False)
    hdfsClient.create_file(path=outputPath + fileName + ".csv",
                           file_data=stringDF, overwrite=True)
def from_pandas(ic, df, table=None, path=None, method='in_query', file_format='TEXTFILE', field_terminator='\t', line_terminator='\n', escape_char='\\', hdfs_host=None, webhdfs_port=50070, hdfs_user=None, overwrite=False): """Create a BDF by shipping an in-memory pandas `DataFrame` into Impala path is the dir, not the filename """ # TODO: this is not atomic temp_table = _random_id('tmp_table_', 8) if table is None: table = "%s.%s" % (ic._temp_db, temp_table) if path is None: path = os.path.join(ic._temp_dir, temp_table) table_name = _to_TableName(table) if overwrite: ic._cursor.execute("DROP TABLE IF EXISTS %s" % table_name.to_sql()) columns = list(df.columns) types = [_numpy_dtype_to_impala_PrimitiveType(ty) for ty in df.dtypes] schema = zip(columns, types) create_stmt = _create_table(table_name, schema, path=path, file_format=file_format, field_terminator=field_terminator, line_terminator=line_terminator, escape_char=escape_char) ic._cursor.execute(create_stmt) if method == 'in_query': query = "INSERT INTO %s VALUES " % table_name.to_sql() query += ', '.join(['(%s)' % ', '.join(map(_py_to_sql_string, row)) for row in df.values]) ic._cursor.execute(query) elif method == 'webhdfs': if file_format != 'TEXTFILE': raise ValueError("only TEXTFILE format supported for webhdfs") if path is None: raise ValueError("must supply a path for EXTERNAL table for webhdfs") from pywebhdfs.webhdfs import PyWebHdfsClient hdfs_client = PyWebHdfsClient(host=hdfs_host, port=webhdfs_port, user_name=hdfs_user) raw_data = StringIO() df.to_csv(raw_data, sep=field_terminator, line_terminator=line_terminator, quoting=csv.QUOTE_NONE, escapechar=escape_char, header=False, index=False) hdfs_client.create_file(os.path.join(path, 'data.txt').lstrip('/'), raw_data.getvalue(), overwrite=overwrite) raw_data.close() else: raise ValueError("method must be 'in_query' or 'webhdfs'; got %s" % method) return from_sql_table(ic, table_name.to_sql())
def test_create_returns_file_location(self, mock_put):
    webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                              user_name=self.user_name)
    self.init_response.status_code = http_client.TEMPORARY_REDIRECT
    self.response.status_code = http_client.CREATED
    mock_put.side_effect = [self.init_response, self.response]
    result = webhdfs.create_file(self.path, self.file_data)
    self.assertTrue(result)
    mock_put.assert_called_with(self.location, headers=self.expected_headers,
                                data=self.file_data)
def save_prediction_results(results, time, args: application_args):
    # get hdfs path
    path = args.get_folder_results_path()

    # save data on pod
    chained = list(itertools.chain.from_iterable(results))
    file = os.path.join(workdir, 'prediction_' + args.get_folder_results_name())
    np.savetxt(file, chained, fmt=['%d', '%d', '%1.2f'])

    # access it to read linewise
    predictions = ''
    with open(file, 'r') as f:
        for line in f:
            predictions += line.strip() + '\n'
    os.remove(file)

    # save results on hdfs
    hdfs = PyWebHdfsClient(host=args.hdfs_host, port=args.hdfs_port)
    file = os.path.join(path, "predictions")
    hdfs.create_file(file, predictions)
    file = os.path.join(path, "resulting_prediction_time")
    hdfs.create_file(file, str(time))
def write_data_to_hdfs(username, records): global hdfs_namenodes to_return = {} file_path = "/jobs_to_do/" + username + ".txt" result_path = "/jobs_done/" + username logger.debug("Writing file " + file_path + " to HDFS") try: logger.debug("Trying to connect to " + hdfs_namenodes[0] + " namenode") hdfs_client = PyWebHdfsClient(host=hdfs_namenodes[0], port='50070', user_name='xnet', timeout=100) logger.debug("Trying to erase " + file_path) logger.debug("Trying to erase " + result_path) hdfs_client.delete_file_dir(file_path, recursive=True) hdfs_client.delete_file_dir(result_path, recursive=True) hdfs_client.create_file(file_path, records.encode("utf-8")) except (ConnectionError, PyWebHdfsException) as ce: to_return["details_1"] = str(ce) logger.debug("Failed connecting to" + hdfs_namenodes[0] + " namenode") try: logger.debug("Trying to connect to " + hdfs_namenodes[1] + " namenode") hdfs_client = PyWebHdfsClient(host=hdfs_namenodes[1], port='50070', user_name='xnet', timeout=100) logger.debug("Trying to erase " + file_path) logger.debug("Trying to erase " + result_path) hdfs_client.delete_file_dir(file_path, recursive=True) hdfs_client.delete_file_dir(result_path, recursive=True) hdfs_client.create_file(file_path, records.encode("utf-8")) except (ConnectionError, PyWebHdfsException) as ce: to_return[ "error"] = "There was a problem while trying to connect to HDFS namenode." to_return["details_2"] = str(ce) logger.debug(str(to_return)) return False, to_return return True, None
def setup_common_oozie_libs(name_node): webhdfs_port = '14000' webhdfs_user = '******' platform_dir = 'user/deployment/platform' lib_path_list = [ '/usr/hdp/current/hbase-client/lib/hbase-client.jar', '/usr/hdp/current/hbase-client/lib/hbase-common.jar', '/usr/hdp/current/hbase-client/lib/hbase-protocol.jar', '/usr/hdp/current/hbase-client/lib/hbase-server.jar', '/usr/hdp/current/hbase-client/lib/htrace-core-3.1.0-incubating.jar', '/usr/hdp/current/hbase-client/lib/hbase-hadoop-compat.jar', '/usr/hdp/current/hbase-client/lib/hbase-it.jar', '/usr/hdp/current/hbase-client/lib/hbase-prefix-tree.jar', '/usr/hdp/current/hbase-client/lib/zookeeper.jar', '/usr/hdp/current/pig-client/piggybank.jar', '/usr/hdp/current/spark-client/lib/spark-examples.jar' ] # Setup a connection with hdfs using namenode. hdfs_client = PyWebHdfsClient(host=name_node, port=webhdfs_port, user_name=webhdfs_user, timeout=None) # Create directory on hadoop file system (HDFS). hdfs_client.make_dir(platform_dir) # Creates a new file on HDFS and write contents from local FS. for path in lib_path_list: platform_file = '%s/%s' % (platform_dir, os.path.basename(path)) print 'Copying source file: %s to HDFS path %s' % (path, platform_file) with open(path) as file_data: try: hdfs_client.create_file(platform_file, file_data, overwrite=True) except PyWebHdfsException: print 'retrying HDFS copy command for %s' % platform_file time.sleep(5) hdfs_client.create_file(platform_file, file_data, overwrite=True)
def save(self, workflow_name="workflow.xml"):
    hdfs = PyWebHdfsClient(host=os.environ["WEBHDFS_HOST"], port='14000',
                           user_name='oozie')
    workflow_path = "{0}/{1}/workflow.xml".format(self.path, self.name)
    hdfs.make_dir(self.path)
    hdfs.create_file(workflow_path, self.as_xml())
source = requests.get("https://resources.lendingclub.com/LoanStats3d.csv.zip", verify=False)
stringio = StringIO.StringIO(source.content)
unzipped = zipfile.ZipFile(stringio)

import pandas as pd
from pywebhdfs.webhdfs import PyWebHdfsClient

subselection_csv = pd.read_csv(unzipped.open('LoanStats3d.csv'), skiprows=1,
                               skipfooter=2, engine='python')
stored_csv = subselection_csv.to_csv('./stored_csv.csv')

hdfs = PyWebHdfsClient(user_name="hdfs", port=50070, host="sandbox")
hdfs.make_dir('chapter5')
with open('./stored_csv.csv') as file_data:
    hdfs.create_file('chapter5/LoanStats3d.csv', file_data, overwrite=True)
print(hdfs.get_file_dir_status('chapter5/LoanStats3d.csv'))

from pyspark.sql import HiveContext
# sc = SparkContext()
sqlContext = HiveContext(sc)
data = sc.textFile("/chapter5/LoanStats3d.csv")
parts = data.map(lambda r: r.split(','))
firstline = parts.first()
datalines = parts.filter(lambda x: x != firstline)

def cleans(row):
    row[7] = str(float(row[7][:-1]) / 100)
    return [s.encode('utf8').replace(r"_", " ").lower() for s in row]
HTMLFILE=str(line[1])+'.htm' TEXTFILE=str(line[1])+'.txt' HADOOP_HTMLFILE='user/root/crawls/'+str(ANET)+'/'+str(BNET)+'/'+HTMLFILE HADOOP_TEXTFILE='user/root/texts/'+str(ANET)+'/'+str(BNET)+'/'+TEXTFILE print "-======= site: "+str(url)+" =======-" try: soup = BeautifulSoup(html) except: print " soup exception" continue HFP=open(HTMLFILE,'w') HFP.write(soup.encode('utf-8')) HFP.close() with open(HTMLFILE) as hfp: try: client.create_file(HADOOP_HTMLFILE,hfp) except: client.delete_file_dir(HADOOP_HTMLFILE) client.create_file(HADOOP_HTMLFILE,hfp) TFP=open(TEXTFILE,'w') WRITEOUT=unicode(soup.get_text()) WORDLIST=re.sub(r'[^a-zA-Z0-9 ]',r' ',WRITEOUT) WORDLIST=WORDLIST.strip().split() TFP.write(WRITEOUT.encode('utf-8')) TFP.close() PAGETITLE='' try: PAGETITLE=soup.title.string except: pass
def save(self):
    hdfs = PyWebHdfsClient(host=os.environ["WEBHDFS_HOST"], port='14000',
                           user_name='oozie')
    coordinator_path = "{0}/{1}/coordinator.xml".format(self.path, self.name)
    hdfs.make_dir(self.path)
    hdfs.create_file(coordinator_path, self.as_xml())
class HDFS(object): def __init__(self, host, port, user): self._hdfs = PyWebHdfsClient( host=host, port=port, user_name=user, timeout=None) logging.debug('webhdfs = %s@%s:%s', user, host, port) def recursive_copy(self, local_path, remote_path, exclude=None): if exclude is None: exclude = [] c_path = canonicalize(remote_path) logging.debug('making %s', c_path) self._hdfs.make_dir(c_path) fs_g = os.walk(local_path) for dpath, dnames, fnames in fs_g: _, relative_path = dpath.split(local_path) for dname in dnames: if dname not in exclude: c_path = canonicalize( '%s/%s/%s' % (remote_path, relative_path, dname)) logging.debug('making %s', c_path) self._hdfs.make_dir(c_path) for fname in fnames: if fname not in exclude: data = file( canonicalize( '%s/%s/%s' % (local_path, relative_path, fname)), 'rb') c_path = canonicalize( '%s/%s/%s' % (remote_path, relative_path, fname)) logging.debug('creating %s', c_path) self._hdfs.create_file(c_path, data, overwrite=True) data.close() def make_dir(self, path): logging.debug('make_dir: %s', path) self._hdfs.make_dir(canonicalize(path)) def create_file(self, data, remote_file_path): logging.debug('create_file: %s', remote_file_path) sio = StringIO.StringIO(data) self._hdfs.create_file( canonicalize(remote_file_path), sio, overwrite=True) def append_file(self, data, remote_file_path): logging.debug('append to: %s', remote_file_path) self._hdfs.append_file(canonicalize(remote_file_path), data) def stream_file_to_disk(self, remote_file_path, local_file_path): chunk_size = 10*1024*1024 offset = 0 with open(local_file_path, 'wb') as dest_file: data = self._hdfs.read_file(canonicalize(remote_file_path), offset=offset, length=chunk_size) while True: dest_file.write(data) if len(data) < chunk_size: break offset += chunk_size data = self._hdfs.read_file(canonicalize(remote_file_path), offset=offset, length=chunk_size) def read_file(self, remote_file_path): data = self._hdfs.read_file(canonicalize(remote_file_path)) return data def remove(self, path, recursive=False): logging.debug('remove: %s', path) self._hdfs.delete_file_dir(canonicalize(path), recursive)
class HdfsApi: def __init__(self, request_timeout=10, logger=logging, active_nn_host='localhost', kerberos=False): self.timeout = request_timeout self.hdfs_schema = os.environ.get('HDFS_NAMENODE_SCHEMA', 'http') self.hdfs_host = active_nn_host self.hdfs_port = os.environ.get('HDFS_NAMENODE_PORT', 50070) if kerberos: extra_opts = { 'auth': HTTPKerberosAuth(mutual_authentication=OPTIONAL, sanitize_mutual_error_response=False, force_preemptive=True) } else: extra_opts = {} self.webhdfs = PyWebHdfsClient(host=self.hdfs_host, port=self.hdfs_port, request_extra_opts=extra_opts) self.logger = logger def request_namenode(self, path, method='GET', headers=None, **kwargs): self.logger.info("Calling HDFS API ({0})".format(path)) if headers is None: headers = dict() if path.startswith('http'): hdfs_url = path else: hdfs_url = '{0}://{1}:{2}/{3}'.format(self.hdfs_schema, self.hdfs_host, self.hdfs_port, path) self.logger.debug(hdfs_url) r = requests.request(method, hdfs_url, headers=headers, timeout=self.timeout, verify=False, auth=HTTPKerberosAuth(), **kwargs) return self._check_response_status(r) def request_webhdfs_status(self, path): return self.webhdfs.get_file_dir_status(path) def _check_response_status(self, response): self.logger.debug(response.text) if response.status_code >= 400: self.logger.error( "HdfsResponse returned with error status [{0}], response was: {1}" .format(response.status_code, response.text)) raise HdfsRequestError( "HdfsResponse returned with error status [{0}]".format( response.status_code)) return response def get_block_info_for_file(self, file_path): path = "fsck" params = {'files': 0, 'racks': 1, 'blocks': 0, 'path': file_path} response = self.request_namenode(path, params=params) return response @staticmethod def get_first_block_info(filename, block_info): regex = r"^{0}.*\n(.*)\n".format(filename) info_of_first_block = re.findall(regex, block_info, re.MULTILINE) if len(info_of_first_block) < 1: raise HdfsRequestError( "No block information found for file {0} in {1}".format( filename, block_info)) return info_of_first_block[0] @staticmethod def get_location_of_first_block(block_info): ip_regex = r"(?<!\-)(\d{2,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})(?=:)" block_locations = re.findall(ip_regex, block_info) if len(block_locations) < 1: raise HdfsRequestError( "No block location information found in {0}".format( block_info)) return block_locations[0] @staticmethod def get_host_by_ip(ip): host_info = socket.gethostbyaddr(ip) if len(host_info) < 1: raise HdfsRequestError( "Unable to get hostname form ip {0}".format(ip)) return host_info[0] @staticmethod def calculate_md5(file, block_size=65536): hash_builder = hashlib.md5() for block in iter(lambda: file.read(block_size), b""): hash_builder.update(block) md5 = hash_builder.hexdigest() file.seek(0) return md5 @staticmethod def create_temp_file(): return tempfile.NamedTemporaryFile(suffix='.temporary', prefix='hdfs-smoketest-api-') def create_temp_file_of_size(self, temp_file_size): tmp = self.create_temp_file() tmp.seek(temp_file_size * 1024 * 1024) tmp.write(b'1') tmp.seek(0) return tmp def copy_to_hdfs(self, remote_path, tmpfile): self.webhdfs.create_file(remote_path, file_data=tmpfile, overwrite=True) def create_hdfs_file_of_size_in_mb(self, path, size=300): with self.create_temp_file_of_size(size) as tmp_file: md5_of_tmp_file = self.calculate_md5(tmp_file) self.copy_to_hdfs(path, tmp_file) return md5_of_tmp_file def get_remote_file(self, path): return self.webhdfs.read_file(path) def write_remote_file_to_local_temp(self, 
remote_path): local = self.create_temp_file() file = self.get_remote_file(remote_path) local.write(file) local.seek(0) return local def get_hdfsfile_and_calc_md5(self, path): with self.write_remote_file_to_local_temp(path) as temp_file: return self.calculate_md5(temp_file) def cleanup_remote_file(self, path, recursive=False): self.webhdfs.delete_file_dir(path, recursive=recursive) def get_host_location_of_first_block(self, filename): file_block_info = self.get_block_info_for_file(filename) file_first_block_info = self.get_first_block_info( filename, file_block_info.text) file_block_ip = self.get_location_of_first_block(file_first_block_info) return self.get_host_by_ip(file_block_ip)
for function in ll.functions: try: symbol = function.name log("Loading types for function %s" % symbol) # skip the first argument, which is FunctionContext* arg_types = tuple([llvm2impala[arg.pointee.name] for arg in function.type.pointee.args[1:]]) functions.append((symbol, arg_types)) except (AttributeError, KeyError): # this process could fail for non-UDF helper functions...just ignore them, # because we're not going to be registering them anyway log("Had trouble with function %s; moving on..." % symbol) pass # transfer the LLVM module to HDFS hdfs_client = PyWebHdfsClient(host=args.nn_host, port=args.webhdfs_port, user_name=args.user) hdfs_client.create_file(args.hdfs_path.lstrip('/'), bc, overwrite=args.force) log("Transferred LLVM IR to HDFS at %s" % args.hdfs_path) # register the functions with impala conn = impala.dbapi.connect(host=args.impala_host, port=args.impala_port) cursor = conn.cursor(user=args.user) log("Connected to impalad: %s" % args.impala_host) if args.db: cursor.execute('USE %s' % args.db) cursor.execute("SHOW FUNCTIONS") registered_functions = cursor.fetchall() for (udf_name, return_type) in zip(args.name, args.return_type): log("Registering function %s" % udf_name) # find matching LLVM symbols to the current UDF name matches = [pair for pair in functions if udf_name in pair[0]] if len(matches) == 0:
while True: a_net = randint(1, 255) ROW = t.row(str(a_net)) if len(ROW) > 0: for key, value in ROW.items(): if value != str(-1): START = randint(1, 255) continue t.put(str(a_net), {'data:user': '******'}) print 'scanning the major ' + str(a_net) + '.0.0.0/8 subnet' for bnet in range(0, 256): if a_net == 10: continue elif a_net == 192 and bnet == 168: continue elif a_net == 172 and bnet == 16: continue elif a_net == 127: continue IPADDR = str(a_net) + '.' + str(bnet) + '.0.0/16' OFILE = str(a_net) + '-' + str(bnet) + '-p80.log' A = subprocess.Popen( ['masscan', '-p80', '-oG', OFILE, IPADDR, '--rate=2000']) A.wait() time.sleep(2) HADOOP_FILE_NAME = 'user/root/scans/' + str(a_net) + '/' + OFILE with open(OFILE) as ofp: hdfs.create_file(HADOOP_FILE_NAME, ofp) subprocess.Popen(['rm', OFILE]) t.put(str(a_net), {'data:user': '******'})
def from_pandas(ic, df, table=None, path=None, method='in_query', file_format='TEXTFILE', field_terminator='\t', line_terminator='\n', escape_char='\\', hdfs_host=None, webhdfs_port=50070, hdfs_user=None, overwrite=False): """Create a BDF by shipping an in-memory pandas `DataFrame` into Impala path is the dir, not the filename """ # TODO: this is not atomic temp_table = _random_id('tmp_table_', 8) if table is None: table = "%s.%s" % (ic._temp_db, temp_table) if path is None: path = os.path.join(ic._temp_dir, temp_table) table_name = _to_TableName(table) if overwrite: ic._cursor.execute("DROP TABLE IF EXISTS %s" % table_name.to_sql()) columns = list(df.columns) types = [_numpy_dtype_to_impala_PrimitiveType(ty) for ty in df.dtypes] schema = zip(columns, types) create_stmt = _create_table(table_name, schema, path=path, file_format=file_format, field_terminator=field_terminator, line_terminator=line_terminator, escape_char=escape_char) ic._cursor.execute(create_stmt) if method == 'in_query': query = "INSERT INTO %s VALUES " % table_name.to_sql() query += ', '.join([ '(%s)' % ', '.join(map(_py_to_sql_string, row)) for row in df.values ]) ic._cursor.execute(query) elif method == 'webhdfs': if file_format != 'TEXTFILE': raise ValueError("only TEXTFILE format supported for webhdfs") if path is None: raise ValueError( "must supply a path for EXTERNAL table for webhdfs") from pywebhdfs.webhdfs import PyWebHdfsClient hdfs_client = PyWebHdfsClient(host=hdfs_host, port=webhdfs_port, user_name=hdfs_user) raw_data = StringIO() df.to_csv(raw_data, sep=field_terminator, line_terminator=line_terminator, quoting=csv.QUOTE_NONE, escapechar=escape_char, header=False, index=False) hdfs_client.create_file(os.path.join(path, 'data.txt').lstrip('/'), raw_data.getvalue(), overwrite=overwrite) raw_data.close() else: raise ValueError("method must be 'in_query' or 'webhdfs'; got %s" % method) return from_sql_table(ic, table_name.to_sql())
    _FULLURL = url + link.get('href')
    if _FULLURL.endswith('.pdf') and _FULLURL.startswith("https://www.basf.com/documents/"):
        urls.append(_FULLURL)
        names.append(link.get('href').rsplit('/', 1)[-1])

names_urls = zip(names, urls)
for name, url in names_urls:
    if not os.path.isfile("files/" + name):
        rq = urllib.Request(url)
        try:
            res = urllib.urlopen(rq)
            pdf = open("files/" + name, 'wb')
            pdf.write(res.read())
            pdf.close()
            print("Download: " + url)
        except:
            continue
    else:
        print(url)
    try:
        with open('files/' + name, "rb") as file_data:  # UTF-8 Latin-1
            hdfs.create_file("/user/data/" + name, file_data)
        print("Upload to HDFS " + name)
    except:
        print("Upload to HDFS Failed")
def create_or_overwrite_file(self, path, f, hdfs_user='******',
                             request_extra_opts={}):
    hdfs = PyWebHdfsClient(host='10.1.94.54', port=14000, user_name=hdfs_user,
                           request_extra_opts=request_extra_opts)
    return hdfs.create_file(path, f, overwrite=True)
import ujson
from impala.dbapi import connect
import datetime
import re

hdfs_row = []
bad_str = """?NaDDi?\\"""
# hdfs_row.append('blablabla')
hdfs_row.append(re.sub("""(\n|\t|\r)""", '?', bad_str))
hdfs_row.append('blablabla')

#try:
hdfs = PyWebHdfsClient(host='al1.zmeke.com', port=50070, user_name='k.kraynov')
#data = 'test,test,test'
hdfs.create_file('user/k.kraynov/test/test.txt', 'blabla')
#hdfs.delete_file_dir('user/k.kraynov/test.txt')
#hdfs.append_file('user/k.kraynov/test2.txt', data+'\n')
#hdfs.append_file('etl/500.txt', 'test')
#hdfs.make_dir('etl/stage/log_{0}')
#conn = connect(host='al1.zmeke.com', port=21050)
#cur = conn.cursor()
#cur.execute('show tables in stage;')
#cur.execute('REFRESH analytics.test')
#result = cur.fetchall()
#for i in result:
#print hdfs.list_dir('user/k.kraynov/')
#dir = hdfs.list_dir('etl/stage/log_102/2')
#for dir_file in dir['FileStatuses']['FileStatus']:
#    print dir_file['pathSuffix']
#except
opendata3mDataMetada = getUrlFromOpendata3M(inputCSV)
jsonfile = open(pathToSaveDownloadedMeta, "w")
jsonfile.write(json.dumps(opendata3mDataMetada))
jsonfile.close()

"""Download File"""
nboffiledl = downloadOpendata3MFiles(opendata3mDataMetada, pathToSaveDownloadedData)

"""Insert files inside HDFS and store file"""
# connect to HDFS
hdfs = PyWebHdfsClient(host=namenode, port=namenodePort, user_name=hdfsuser)
for file in os.listdir(pathToSaveDownloadedData):
    if (str(file) != ".forgit"):
        try:
            # create_file takes the HDFS path first and the file data second;
            # upload the downloaded file's contents under its own name.
            with open(os.path.join(pathToSaveDownloadedData, file), 'rb') as file_data:
                hdfs.create_file(str(file), file_data)
        except Exception as e:
            print('Failed to upload in HDFS: ' + str(e))

"""Build and insert iso19139 xml to geonetwork"""
try:
    subprocess.call("/usr/bin/Rscript addServicesToGN.R")
except:
    print("R error due to OSM ? Try re-launched")
    subprocess.call("R -f addServicesToGN.R", shell=True)

print(str(nboffiledl) + " files downloaded in : " + pathToSaveDownloadedData)
print("AIDMOIt ingestion module ends")
class HDFS(object): def __init__(self, host, port, user): self._hdfs = PyWebHdfsClient(host=host, port=port, user_name=user, timeout=None) logging.debug('webhdfs = %s@%s:%s', user, host, port) def recursive_copy(self, local_path, remote_path, exclude=None, permission=755): if exclude is None: exclude = [] c_path = canonicalize(remote_path) logging.debug('making %s', c_path) self._hdfs.make_dir(c_path, permission=permission) fs_g = os.walk(local_path) for dpath, dnames, fnames in fs_g: _, relative_path = dpath.split(local_path) for dname in dnames: if dname not in exclude: c_path = canonicalize('%s/%s/%s' % (remote_path, relative_path, dname)) logging.debug('making %s', c_path) self._hdfs.make_dir(c_path, permission=permission) for fname in fnames: if fname not in exclude: data = file( canonicalize('%s/%s/%s' % (local_path, relative_path, fname)), 'rb') c_path = canonicalize('%s/%s/%s' % (remote_path, relative_path, fname)) logging.debug('creating %s', c_path) self._hdfs.create_file(c_path, data, overwrite=True, permission=permission) data.close() def make_dir(self, path, permission=755): logging.debug('make_dir: %s', path) self._hdfs.make_dir(canonicalize(path), permission=permission) def create_file(self, data, remote_file_path, permission=755): logging.debug('create_file: %s', remote_file_path) sio = BytesIO(data) self._hdfs.create_file(canonicalize(remote_file_path), sio, overwrite=True, permission=permission) def append_file(self, data, remote_file_path): logging.debug('append to: %s', remote_file_path) self._hdfs.append_file(canonicalize(remote_file_path), data) def stream_file_to_disk(self, remote_file_path, local_file_path): chunk_size = 10 * 1024 * 1024 offset = 0 with open(local_file_path, 'wb') as dest_file: data = self._hdfs.read_file(canonicalize(remote_file_path), offset=offset, length=chunk_size) while True: dest_file.write(data) if len(data) < chunk_size: break offset += chunk_size data = self._hdfs.read_file(canonicalize(remote_file_path), offset=offset, length=chunk_size) def read_file(self, remote_file_path): data = self._hdfs.read_file(canonicalize(remote_file_path)) return data def remove(self, path, recursive=False): logging.debug('remove: %s', path) self._hdfs.delete_file_dir(canonicalize(path), recursive) def file_exists(self, path): try: self._hdfs.get_file_dir_status(path) return True except: return False
t = conn.table('anet')
while True:
    a_net = randint(1, 255)
    ROW = t.row(str(a_net))
    if len(ROW) > 0:
        for key, value in ROW.items():
            if value != str(-1):
                START = randint(1, 255)
                continue
    t.put(str(a_net), {'data:user': '******'})
    print 'scanning the major ' + str(a_net) + '.0.0.0/8 subnet'
    for bnet in range(0, 256):
        if a_net == 10:
            continue
        elif a_net == 192 and bnet == 168:
            continue
        elif a_net == 172 and bnet == 16:
            continue
        elif a_net == 127:
            continue
        IPADDR = str(a_net) + '.' + str(bnet) + '.0.0/16'
        OFILE = str(a_net) + '-' + str(bnet) + '-p80.log'
        A = subprocess.Popen(['masscan', '-p80', '-oG', OFILE, IPADDR, '--rate=2000'])
        A.wait()
        time.sleep(2)
        HADOOP_FILE_NAME = 'user/root/scans/' + str(a_net) + '/' + OFILE
        with open(OFILE) as ofp:
            hdfs.create_file(HADOOP_FILE_NAME, ofp)
        subprocess.Popen(['rm', OFILE])
    t.put(str(a_net), {'data:user': '******'})
#1 imports
from pywebhdfs.webhdfs import PyWebHdfsClient

#2 make connection with hadoop file system
hdfs = PyWebHdfsClient(user_name="hdfs", port=50070, host="sandbox.hortonworks.com")
hdfs.delete_file_dir('chapter5/LoanStats3d.csv', recursive=True)

#4 recreate the chapters directory
hdfs.make_dir('chapter5')

#5 upload the csv file
with open('./data/stored_csv.csv') as file_data:
    hdfs.create_file('chapter5/LoanStats3d.csv', file_data, overwrite=True)

#6 print the status to see if this succeeded.
print hdfs.get_file_dir_status('chapter5/LoanStats3d.csv')
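As a hedged follow-up to the numbered walkthrough above (not part of the original snippet), the uploaded file can be read back through the same client to verify its contents; read_file is an existing PyWebHdfsClient method that returns the raw file data.

#7 optionally read the file back to verify the upload (sketch, assumes the client from step 2)
print hdfs.read_file('chapter5/LoanStats3d.csv')[:200]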
from pywebhdfs.webhdfs import PyWebHdfsClient

hdfs = PyWebHdfsClient(host='s12m.westeurope.cloudapp.azure.com', port='50070',
                       user_name='data', timeout=10)  # s12m.westeurope.cloudapp.azure.com

for root, dirs, files in os.walk("./ExxonMobil"):
    for filename in files:
        try:
            txtname = filename.split('.')[0] + '.txt'
            text = convert_pdf_to_txt_1("ExxonMobil/" + filename)
            text_file = open("text/" + txtname, 'w', encoding="utf-8")
            text_file.write(text)
            text_file.close()
            print(filename)
            try:
                with open('text/' + txtname, "rb") as file_data:  # UTF-8 Latin-1
                    hdfs.create_file("/user/data/txt_file/" + txtname, file_data)
                print("Upload to HDFS " + filename)
            except:
                print("Upload to HDFS Failed")
        except:
            continue

'''
filename = "BASF_Creating-Chemistry_07.pdf"
text = convert_pdf_to_txt_1("files/" + filename)
print(text)
text_file = open("text/" + filename.split('.')[0] + '.txt', 'w', encoding="utf-8")
text_file.write(text)
text_file.close()
# create a new client instance
hdfs = PyWebHdfsClient(host='localhost', port='50070', user_name='hduser')

# create a new directory for the example
print('making new HDFS directory at: {0}\n'.format(example_dir))
hdfs.make_dir(example_dir)

# get a dictionary of the directory's status
dir_status = hdfs.get_file_dir_status(example_dir)
print(dir_status)

# create a new file on hdfs
print('making new file at: {0}\n'.format(example_file))
hdfs.create_file(example_file, example_data)

file_status = hdfs.get_file_dir_status(example_file)
print(file_status)

# get the checksum for the file
file_checksum = hdfs.get_file_checksum(example_file)
print(file_checksum)

# append to the file created in previous step
print('appending to file at: {0}\n'.format(example_file))
hdfs.append_file(example_file, example_data)

file_status = hdfs.get_file_dir_status(example_file)
print(file_status)
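A short sketch continuing the walkthrough above (added here, not from the original): after creating and appending, the file can be read back and the example directory listed; read_file and list_dir are existing PyWebHdfsClient methods, and example_file and example_dir come from the snippet above.

# read back the combined contents written above
file_data = hdfs.read_file(example_file)
print(file_data)

# list the example directory to see the new file
dir_listing = hdfs.list_dir(example_dir)
print(dir_listing)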
def create_data_from_station_data(first, second): """this function creates the data analyzing the two stations in comparison""" global hdfs #global hdfs object global hbase #global hbase object if (hdfs is None): from pywebhdfs.webhdfs import PyWebHdfsClient hdfs = PyWebHdfsClient(host='cshadoop.boisestate.edu', port='50070', user_name='uacharya') if (hbase is None): import happybase hbase = happybase.ConnectionPool(size=1, host='cshadoop.boisestate.edu') date_for_comparision = first["Date"].strip() # creating directory for each date try: hdfs.get_file_dir_status('user/uacharya/single_screen/' + date_for_comparision) except Exception: # directory to hold dataset in csv file for reach node in wall display starting from 1 to 9 content = 'Date,ID,Source,Destination,S_Lat,S_Lon,D_Lat,D_Lon,Wind_Lat,Wind_Lon,Wind_Velocity\n' try: hdfs.create_file('user/uacharya/single_screen/' + date_for_comparision + '/data/output.csv', content, replication=1) except Exception: pass dataset = { 'node_1': [], 'node_2': [], 'node_3': [] } for data in broadcast_variable.value: compare_data_between(date_for_comparision, first, data, dataset) # for key in dataset: # if(len(dataset[key])!=0): # content = "\n".join(dataset[key]); # content +="\n"; # while(True): # try: # hdfs.append_file('user/uacharya/simulation/'+date+'/'+key+'/output.csv',content,buffersize=4096); # break; # except Exception: # time.sleep(0.2); # continue; dataset.clear() #clearing the dictionary # append over here after all the global variable has been made return second
class DMS: def __init__(self, debug=0): ''' This function use to init a class. To show an error messages debug should be 1. :param : debug - 1, show an error or success message. 0 otherwise :return: Nothing. ''' self.debug = debug pass def hbase_connection(self, host, port, table='dms'): ''' This function use to establish a connection to hbase, for preparing to insert, remove, fetch data from hbase. We use starbase for connect to hbase via rest api.(See more: https://github.com/barseghyanartur/starbase) :param : host - hbase rest host :param : port - hbase rest running port :param : table - DMS table on hbase (default: 'dms') :return: Nothing. ''' self.hbase = hbaseConnection(host=host, port=port) t = self.hbase.table(table) if (not t.exists()): t.create('meta_data','file') self.hbase_table = t def hdfs_connection(self, host, port, user_name, hdfs_path='/tmp/'): ''' This function use to establish a connection to hdfs, for preparing to create, retrieve, update, delete file in hdfs. We use pywebhdfs in order to do this task via hdfs rest api.(See more: http://pythonhosted.org/pywebhdfs/) :param : host - hdfs rest host :param : port - hdfs rest running port :param : user_name - hdfs username (for authentication) :param : hdfs_path - location to store files. (default: '/tmp/') :return: Nothing. ''' self.hdfs = PyWebHdfsClient(host=host, port=port, user_name=user_name) self.hdfs_path = hdfs_path def solr_connection(self, host, port, collection): ''' This function use to establish a connection to solr, for query or search any text on a system. :param : host - solr's host :param : port - solr's running port :param : collection - solr's collection for searching ''' self.solr = ''.join(['http://',host,':',port,'/solr/',collection]) def extract(self, file): ''' This function use to extract meta data from a file. We use hachoir3 library to extract them. (See more: http://hachoir3.readthedocs.org) :param : file - file for extract :return: meta data as dict for success, 0 if fail. ''' try: filename, realname = unicodeFilename(file), file parser = createParser(filename, realname) meta_data = extractMetadata(parser) meta_data_text = meta_data.exportPlaintext() meta_list = dict() for i in range(1,len(meta_data_text)): meta_split = meta_data_text[i].split(":") column = meta_split[0].replace('- ','') value = meta_split[1].lstrip() meta_list.update({column:value}) return meta_list except: if self.debug: print "Something went wrong, meta data of",file,"could not extract." return None def upload(self, file): ''' This function use to uplaod a file to hdfs and store meta data on hbase Meta data consist of 2 main parts: file's meta data and hdfs's file's meta data. This function will increase a file version if it is already store in hbase. :param : file - file's name :return: True if success otherwise False. ''' version = 1 key = ''.join(['v',str(version),'.',file]) path = ''.join([self.hdfs_path,key]) # Read a file try: f = open(file,'r') file_content = f.read() f.close() except: print "Cannot read file:",file # Check file's version while self.hbase_table.fetch(key) != None: version = int(self.get_lastest_version(file)) + 1 key = ''.join(['v',str(version),'.',file]) path = ''.join([self.hdfs_path,key]) # Try to upload file. 
class DMS:
    def __init__(self, debug=0):
        '''
        Initialise the class. Set debug to 1 to print error and success
        messages; 0 keeps the class silent.
        :param : debug - 1 to print error/success messages, 0 otherwise
        :return: Nothing.
        '''
        self.debug = debug

    def hbase_connection(self, host, port, table='dms'):
        '''
        Establish a connection to HBase so rows can be inserted, removed and
        fetched. Starbase is used to talk to HBase via its REST API.
        (See more: https://github.com/barseghyanartur/starbase)
        :param : host - HBase REST host
        :param : port - HBase REST port
        :param : table - DMS table on HBase (default: 'dms')
        :return: Nothing.
        '''
        self.hbase = hbaseConnection(host=host, port=port)
        t = self.hbase.table(table)
        if not t.exists():
            t.create('meta_data', 'file')
        self.hbase_table = t

    def hdfs_connection(self, host, port, user_name, hdfs_path='/tmp/'):
        '''
        Establish a connection to HDFS so files can be created, retrieved,
        updated and deleted. pywebhdfs is used to talk to the HDFS REST API.
        (See more: http://pythonhosted.org/pywebhdfs/)
        :param : host - HDFS REST host
        :param : port - HDFS REST port
        :param : user_name - HDFS user name (for authentication)
        :param : hdfs_path - location to store files (default: '/tmp/')
        :return: Nothing.
        '''
        self.hdfs = PyWebHdfsClient(host=host, port=port, user_name=user_name)
        self.hdfs_path = hdfs_path

    def solr_connection(self, host, port, collection):
        '''
        Establish a connection to Solr for searching text stored in the system.
        :param : host - Solr host
        :param : port - Solr port
        :param : collection - Solr collection used for searching
        :return: Nothing.
        '''
        self.solr = ''.join(['http://', host, ':', port, '/solr/', collection])

    def extract(self, file):
        '''
        Extract meta data from a file using the hachoir library.
        (See more: http://hachoir3.readthedocs.org)
        :param : file - file to extract meta data from
        :return: meta data as a dict on success, None on failure.
        '''
        try:
            filename, realname = unicodeFilename(file), file
            parser = createParser(filename, realname)
            meta_data = extractMetadata(parser)
            meta_data_text = meta_data.exportPlaintext()
            meta_list = dict()
            for i in range(1, len(meta_data_text)):
                meta_split = meta_data_text[i].split(":")
                column = meta_split[0].replace('- ', '')
                value = meta_split[1].lstrip()
                meta_list.update({column: value})
            return meta_list
        except:
            if self.debug:
                print "Could not extract meta data from", file
            return None

    def upload(self, file):
        '''
        Upload a file to HDFS and store its meta data in HBase.
        Meta data consists of two parts: the file's own meta data and the
        HDFS meta data of the stored copy. If the file is already stored,
        the version number is increased.
        :param : file - file's name
        :return: True on success, otherwise False.
        '''
        version = 1
        key = ''.join(['v', str(version), '.', file])
        path = ''.join([self.hdfs_path, key])

        # Read the file
        try:
            f = open(file, 'r')
            file_content = f.read()
            f.close()
        except:
            print "Cannot read file:", file
            return False

        # Determine the file's version
        while self.hbase_table.fetch(key) is not None:
            version = int(self.get_lastest_version(file)) + 1
            key = ''.join(['v', str(version), '.', file])
            path = ''.join([self.hdfs_path, key])

        # Try to upload the file
        try:
            self.hdfs.create_file(path, file_content)
            hdfs_meta = self.hdfs.get_file_dir_status(path)['FileStatus']
            file_meta = self.extract(file)
            t = self.hbase_table

            # save the file content
            status = t.insert(key, {'file': {'content': file_content}})
            if status != 200:
                if self.debug:
                    print "Error inserting: file content"

            # save the file's meta data
            for column in file_meta.keys():
                status = t.insert(key, {'meta_data': {column: file_meta[column]}})
                if status != 200:
                    if self.debug:
                        print "Error inserting:", column

            # save the HDFS meta data
            for column in hdfs_meta.keys():
                status = t.insert(key, {'meta_data': {column: hdfs_meta[column]}})
                if status != 200:
                    if self.debug:
                        print "Error inserting:", column

            # save the version
            status = t.insert(key, {'meta_data': {'version': version}})
            if status != 200:
                if self.debug:
                    print "Error inserting: version"
        except:
            if self.debug:
                print "Upload failed."
            return False
        if self.debug:
            print "[Uploaded]", file, "version:", version
        return True

    def download(self, file, version=None, download_dir=''):
        '''
        Retrieve a file from HDFS and save it locally as v[version].[file]
        (for example, v1.mytext.txt). The download directory and the file's
        version can both be specified; if no version is given, the latest
        version is downloaded.
        :param : file - file's name
        :param : version - file's version (default: latest version)
        :param : download_dir - download directory (default: '' or the current
                 directory; NOTE: it must end with '/', for example '../download/')
        :return: True on success, otherwise False.
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v', str(version), '.', file])
        path = ''.join([self.hdfs_path, key])
        downloaded_file = ''.join([download_dir, key])
        try:
            f = open(downloaded_file, 'w')
            f.write(self.hdfs.read_file(path))
            f.close()
        except:
            if self.debug:
                print "Cannot download a file:", file
            return False
        if self.debug:
            print "[Downloaded]", key
        return True

    def update(self, file, version=None):
        '''
        Update a file by overwriting it on HDFS and inserting the new data
        into HBase. The file's version can be specified; if no version is
        given, the latest version is updated.
        :param : file - file's name
        :param : version - file's version
        :return: True on success, otherwise False.
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v', str(version), '.', file])
        path = ''.join([self.hdfs_path, key])

        # Read the file
        try:
            f = open(file, 'r')
            file_content = f.read()
            f.close()
        except:
            print "Cannot read file:", file
            return False

        # Try to upload the file
        try:
            self.hdfs.create_file(path, file_content, overwrite=True)
            hdfs_meta = self.hdfs.get_file_dir_status(path)['FileStatus']
            file_meta = self.extract(file)
            t = self.hbase_table

            # save the file content and name
            status = t.insert(key, {'file': {'content': file_content,
                                             'name': file}})
            if status != 200:
                if self.debug:
                    print "Error inserting: file content"

            # save the file's meta data
            for column in file_meta.keys():
                status = t.insert(key, {'meta_data': {column: file_meta[column]}})
                if status != 200:
                    if self.debug:
                        print "Error inserting:", column

            # save the HDFS meta data
            for column in hdfs_meta.keys():
                status = t.insert(key, {'meta_data': {column: hdfs_meta[column]}})
                if status != 200:
                    if self.debug:
                        print "Error inserting:", column

            # save the version
            status = t.insert(key, {'meta_data': {'version': version}})
            if status != 200:
                if self.debug:
                    print "Error inserting: version"
        except:
            if self.debug:
                print "Update failed."
            return False
        if self.debug:
            print "[Updated]", file, "version:", version
        return True

    def delete(self, file, version=None):
        '''
        Delete a file from HBase and HDFS. The file's version can be
        specified; if no version is given, the latest version is deleted.
        :param : file - file's name
        :param : version - file's version
        :return: True on success, otherwise False.
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v', str(version), '.', file])
        path = ''.join([self.hdfs_path, key])

        # Check that the file exists
        if self.hbase_table.fetch(key) is None:
            if self.debug:
                print "Cannot delete.", key, "does not exist."
            return False

        # Remove the row from HBase
        t = self.hbase_table
        if t.remove(key) != 200:
            if self.debug:
                print "[HBASE] cannot remove a row key:", key
            return False

        # Delete the file from HDFS
        if not self.hdfs.delete_file_dir(path):
            if self.debug:
                print "[HDFS] Cannot remove a file path:", path
            return False
        if self.debug:
            print "[Deleted]", file, "version:", version
        return True

    def get_file_meta_data(self, file, version=None):
        '''
        Get all of a file's meta data from HBase. The file's version can be
        specified; if no version is given, the latest version is used.
        :param : file - file's name
        :param : version - file's version
        :return: meta data as a dict on success, False on failure.
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v', str(version), '.', file])
        row = self.hbase_table.fetch(key)
        if not row:
            if self.debug:
                print key, "does not exist"
            return False
        return row['meta_data']

    def get_file_content(self, file, version=None):
        '''
        Get a file's content from HBase. The file's version can be specified;
        if no version is given, the latest version is used.
        :param : file - file's name
        :param : version - file's version
        :return: file content as a dict on success, False on failure.
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v', str(version), '.', file])
        row = self.hbase_table.fetch(key)
        if not row:
            if self.debug:
                print key, "does not exist"
            return False
        return row['file']

    def search(self, text):
        '''
        Search the Solr collection via its REST API.
        :param : text - text to search for
        :return: JSON response from Solr, False if nothing was found.
        '''
        query = urlopen(''.join([self.solr, '/select?q=', text, '&wt=json']))
        response = simplejson.load(query)
        if response['response']['numFound'] == 0:
            if self.debug:
                print text, "not found!"
            return False
        return response

    def get_all_file(self):
        '''
        Return all files stored in HBase as a list.
        :param : Nothing.
        :return: fetch result as a list.
        '''
        rf = ('{"type": "RowFilter", "op": "EQUAL", '
              '"comparator": {"type": "RegexStringComparator", "value": ""}}')
        t = self.hbase_table
        result = t.fetch_all_rows(with_row_id=True, filter_string=rf)
        return list(result)

    def get_file_version(self, file):
        '''
        Fetch all stored versions of a file by name.
        :param : file - file's name
        :return: dict with the file's name and a list of its versions.
        '''
        rf = ''.join(['{"type": "RowFilter", "op": "EQUAL", '
                      '"comparator": {"type": "RegexStringComparator", '
                      '"value": "', file, '"}}'])
        t = self.hbase_table
        result = t.fetch_all_rows(with_row_id=True, filter_string=rf)
        lsr = list(result)
        file_version = list()
        for row in lsr:
            # row keys look like "v<version>.<file>"
            file_version.append(row.keys()[0].split('.')[0].lstrip('v'))
        file_list = dict()
        file_list['name'] = file
        file_list['version'] = file_version
        return file_list

    def get_lastest_version(self, file):
        '''
        Return the latest version number of a file as an integer.
        :param : file - file's name
        :return: version number as an integer.
        '''
        file_version = self.get_file_version(file)
        file_version['version'].sort(key=int)
        return int(file_version['version'][-1])

    def delete_all_version(self, file):
        '''
        Delete all versions of a file from HBase and HDFS.
        :param : file - file's name
        :return: True on success, otherwise False.
        '''
        for version in self.get_file_version(file)['version']:
            try:
                self.delete(file, version)
            except:
                return False
        return True

    def delete_all(self):
        '''
        Delete all files from HBase and HDFS.
        :param : Nothing
        :return: True on success, otherwise False.
        '''
        for full_file in self.get_all_file():
            key = full_file.keys()[0]
            version = key.split('.', 1)[0].lstrip('v')
            file = key.split('.', 1)[1]
            try:
                self.delete(file, version)
            except:
                return False
        return True
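The class above is driven through its three *_connection helpers before any file operation. A minimal usage sketch follows; the hosts, ports, Solr collection and file name are illustrative assumptions, not values from the original.

# a minimal sketch of driving the DMS class above; hosts, ports, the Solr
# collection and the file name are assumed values for illustration only
dms = DMS(debug=1)
dms.hbase_connection(host='localhost', port='8000', table='dms')        # HBase REST gateway
dms.hdfs_connection(host='localhost', port='50070', user_name='hduser',
                    hdfs_path='/tmp/dms/')                              # WebHDFS endpoint
dms.solr_connection(host='localhost', port='8983', collection='dms')    # Solr collection

if dms.upload('mytext.txt'):                     # stores v1.mytext.txt on HDFS and in HBase
    print dms.get_file_meta_data('mytext.txt')   # meta_data column family of the latest version
    dms.download('mytext.txt', download_dir='')  # writes v1.mytext.txt to the current directory
    print dms.search('mytext')                   # full-text search through Solr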
# example_dir, example_file and example_data are defined earlier in the
# original script; the values below are assumed for this excerpt
example_dir = 'user/hdfs/example_dir'
example_file = '{0}/example.txt'.format(example_dir)
example_data = '01010101010101010101010101010101\n'
rename_dir = 'user/hdfs/example_rename'

# create a new client instance
hdfs = PyWebHdfsClient(host='localhost', port='50070', user_name='hduser')

# create a new directory for the example
print('making new HDFS directory at: {0}\n'.format(example_dir))
hdfs.make_dir(example_dir)

# get a dictionary of the directory's status
dir_status = hdfs.get_file_dir_status(example_dir)
print dir_status

# create a new file on hdfs
print('making new file at: {0}\n'.format(example_file))
hdfs.create_file(example_file, example_data)

file_status = hdfs.get_file_dir_status(example_file)
print file_status

# append to the file created in the previous step
print('appending to file at: {0}\n'.format(example_file))
hdfs.append_file(example_file, example_data)

file_status = hdfs.get_file_dir_status(example_file)
print file_status

# read back the data in the file
print('reading data from file at: {0}\n'.format(example_file))
file_data = hdfs.read_file(example_file)
print file_data
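The excerpt above defines rename_dir but never uses it. If the surrounding script goes on to exercise the rename, listing and delete operations, it presumably looks something like the sketch below; the leading '/' on the rename destination and the recursive delete flag are assumptions based on the pywebhdfs client, not taken from the original.

# rename the example directory (destination given as an absolute path)
print('renaming directory from {0} to {1}\n'.format(example_dir, rename_dir))
hdfs.rename_file_dir(example_dir, '/{0}'.format(rename_dir))

# list the contents of the renamed directory
print hdfs.list_dir(rename_dir)

# clean up by removing the renamed directory and everything in it
print('removing the directory at: {0}\n'.format(rename_dir))
hdfs.delete_file_dir(rename_dir, recursive=True)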
def save(self):
    # connect to the WebHDFS/httpfs gateway (port 14000) as the oozie user
    hdfs = PyWebHdfsClient(host=os.environ["WEBHDFS_HOST"],
                           port='14000',
                           user_name='oozie')
    # write the bundle definition under the bundle's deployment directory
    deployment_path = "user/oozie/bundles/{0}".format(self.name)
    bundle_path = "{0}/bundle.xml".format(deployment_path)
    hdfs.create_file(bundle_path, self.as_xml())
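save() assumes it is a method of an object exposing a name attribute and an as_xml() serialiser. A minimal sketch of such an owner follows, under the assumption of a class named OozieBundle with an illustrative constructor and XML body; none of these names come from the original.

# a minimal sketch, assuming save() belongs to a bundle-like class;
# the class name, constructor and XML body are illustrative only
import os
from pywebhdfs.webhdfs import PyWebHdfsClient


class OozieBundle(object):
    def __init__(self, name, xml):
        self.name = name    # used to build user/oozie/bundles/<name>
        self._xml = xml

    def as_xml(self):
        # return the bundle definition as an XML string
        return self._xml

    def save(self):
        # same upload logic as the method above
        hdfs = PyWebHdfsClient(host=os.environ["WEBHDFS_HOST"],
                               port='14000', user_name='oozie')
        deployment_path = "user/oozie/bundles/{0}".format(self.name)
        hdfs.create_file("{0}/bundle.xml".format(deployment_path), self.as_xml())


# usage: requires WEBHDFS_HOST to point at a reachable httpfs/WebHDFS gateway
OozieBundle("daily-ingest", "<bundle-app/>").save()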