def main(argv):
    """
    Main method.

    This method performs the following tasks:
    1. Parse command line arguments
    2. Retrieve credentials and connect to Cloudant and WebHDFS
    3. Connect to the Cloudant `_changes` feed for checkpointed document consumption
    4. Process each change individually
    5. On exception, store the latest checkpoint to a local file and exit
    """
    # add options into the parser
    parser = configureOptions()
    (options, args) = parser.parse_args()
    checkRequiredArguments(options, parser)
    print options

    # configuration
    last_seq = options.last_seq

    # get credentials
    perm_file = '%s/.clou' % os.environ['HOME']
    creds = get_creds(perm_file)

    # connect to the source database
    s = Server('https://%s:%s@%s' % (creds['cloudant_user'],
                                     creds['cloudant_pwd'], options.uri))
    db = s[options.dbname]
    # print db.info()

    # connect to the target HDFS cluster
    hdfs = PyWebHdfsClient(host=options.hdfs_host, port=options.hdfs_port,
                           user_name=creds['hdfs_user'])
    hdfs.make_dir(options.hdfs_path)

    # and here we consume the Cloudant `_changes` feed
    counter = 0
    changestream = ChangesStream(db, include_docs=True, heartbeat=True,
                                 since=last_seq)
    for c in changestream:
        try:
            if counter % 100 == 0:
                checkpoint(last_seq)
            seq = processChange(hdfs, c, options.hdfs_path)
            if seq:  # protect against the last line being blank
                last_seq = seq
            counter += 1
        except Exception:
            traceback.print_exc()
            checkpoint(last_seq)
            os._exit(1)
    checkpoint(last_seq)
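# The loop above relies on a checkpoint() helper that this snippet never
# defines. Below is a minimal sketch of one plausible implementation,
# assuming the checkpoint is simply the last processed `_changes` sequence
# number persisted to a local file; the function body and the default file
# name are hypothetical, not from the original source.
def checkpoint(seq, path='.cloudant_last_seq'):
    with open(path, 'w') as f:
        f.write(str(seq))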
import os
import time

from pywebhdfs.errors import PyWebHdfsException
from pywebhdfs.webhdfs import PyWebHdfsClient


def setup_common_oozie_libs(name_node):
    webhdfs_port = '14000'
    webhdfs_user = '******'
    platform_dir = 'user/deployment/platform'
    lib_path_list = ['/usr/hdp/current/hbase-client/lib/hbase-client.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-common.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-protocol.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-server.jar',
                     '/usr/hdp/current/hbase-client/lib/htrace-core-3.1.0-incubating.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-hadoop-compat.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-it.jar',
                     '/usr/hdp/current/hbase-client/lib/hbase-prefix-tree.jar',
                     '/usr/hdp/current/hbase-client/lib/zookeeper.jar',
                     '/usr/hdp/current/pig-client/piggybank.jar',
                     '/usr/hdp/current/spark-client/lib/spark-examples.jar']

    # Set up a connection with HDFS using the namenode.
    hdfs_client = PyWebHdfsClient(host=name_node, port=webhdfs_port,
                                  user_name=webhdfs_user, timeout=None)
    # Create the directory on the Hadoop file system (HDFS).
    hdfs_client.make_dir(platform_dir)

    # Create each file on HDFS and write its contents from the local FS.
    for path in lib_path_list:
        platform_file = '%s/%s' % (platform_dir, os.path.basename(path))
        print 'Copying source file: %s to HDFS path %s' % (path, platform_file)
        with open(path) as file_data:
            try:
                hdfs_client.create_file(platform_file, file_data,
                                        overwrite=True)
            except PyWebHdfsException:
                print 'retrying HDFS copy command for %s' % platform_file
                time.sleep(5)
                # rewind: the failed attempt may have consumed the stream
                file_data.seek(0)
                hdfs_client.create_file(platform_file, file_data,
                                        overwrite=True)
class HDFS(NDArray):
    '''
    HDFS storage

    Parameters
    ----------
    name : str
        Name of the directory used to store text files (path to the
        directory without a leading '/')
    model : Model
        If None, the model is taken from the `with` context
    vars : list of variables
        Sampling values will be stored for these variables. If None,
        `model.unobserved_RVs` is used
    host : str
        The IP address or hostname of the HDFS namenode. By default,
        it is 'localhost'
    port : str
        The port number for WebHDFS on the namenode. By default, it
        is '50070'
    user_name : str
        WebHDFS user_name used for authentication. By default, it is
        None
    '''
    def __init__(self, name, model=None, vars=None, host='localhost',
                 port='50070', user_name=None):
        self.hdfs = PyWebHdfsClient(host=host, port=port,
                                    user_name=user_name)
        try:
            self.hdfs.list_dir(name)
        except FileNotFound:
            self.hdfs.make_dir(name)
        super(HDFS, self).__init__(name, model, vars)

    def close(self):
        super(HDFS, self).close()
        _dump_trace(self.name, self)
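# A usage sketch for the backend above (hypothetical, not from the original
# source). It assumes the PyMC3 convention of passing a backend instance via
# the `trace=` argument of pm.sample(), as with the built-in Text backend;
# the directory, host, and model are illustrative.
import pymc3 as pm

with pm.Model():
    mu = pm.Normal('mu', 0, 1)
    backend = HDFS('traces/demo', host='namenode', port='50070',
                   user_name='hdfs')
    trace = pm.sample(500, trace=backend)  # dumped to HDFS when the trace closes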
def setup_common_oozie_libs(name_node):
    webhdfs_port = '14000'
    webhdfs_user = '******'
    platform_dir = 'user/deployment/platform'
    lib_path_list = ['/opt/cloudera/parcels/CDH/lib/hbase/hbase-client.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-common.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-protocol.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-server.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/lib/htrace-core.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop-compat.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-it.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-prefix-tree.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/lib/zookeeper.jar',
                     '/opt/cloudera/parcels/CDH/lib/pig/piggybank.jar']

    # Set up a connection with HDFS using the namenode.
    hdfs_client = PyWebHdfsClient(host=name_node, port=webhdfs_port,
                                  user_name=webhdfs_user, timeout=None)
    # Create the directory on the Hadoop file system (HDFS).
    hdfs_client.make_dir(platform_dir)

    # Create each file on HDFS and write its contents from the local FS.
    for path in lib_path_list:
        platform_file = '%s/%s' % (platform_dir, os.path.basename(path))
        logging.info('Copying source file: %s to HDFS path %s', path,
                     platform_file)
        with open(path) as file_data:
            hdfs_client.create_file(platform_file, file_data, overwrite=True)
def setup_common_oozie_libs(name_node):
    webhdfs_port = '14000'
    webhdfs_user = '******'
    platform_dir = 'user/deployment/platform'
    lib_path_list = ['/opt/cloudera/parcels/CDH/lib/hbase/hbase-client.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-common.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-protocol.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-server.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/lib/htrace-core.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-hadoop-compat.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-it.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/hbase-prefix-tree.jar',
                     '/opt/cloudera/parcels/CDH/lib/hbase/lib/zookeeper.jar',
                     '/opt/cloudera/parcels/CDH/lib/pig/piggybank.jar',
                     '/opt/cloudera/parcels/CDH/lib/spark/lib/spark-examples.jar']

    # Set up a connection with HDFS using the namenode.
    hdfs_client = PyWebHdfsClient(host=name_node, port=webhdfs_port,
                                  user_name=webhdfs_user, timeout=None)
    # Create the directory on the Hadoop file system (HDFS).
    hdfs_client.make_dir(platform_dir)

    # Create each file on HDFS and write its contents from the local FS.
    for path in lib_path_list:
        platform_file = '%s/%s' % (platform_dir, os.path.basename(path))
        logging.info('Copying source file: %s to HDFS path %s', path,
                     platform_file)
        with open(path) as file_data:
            hdfs_client.create_file(platform_file, file_data, overwrite=True)
class WhenTestingMkdirOperation(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs'

    def test_mkdir_throws_exception_for_not_ok(self):
        self.response.status_code = http_client.BAD_REQUEST
        self.requests.put.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.make_dir(self.path)

    def test_mkdir_returns_true(self):
        self.response.status_code = http_client.OK
        self.requests.put.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.make_dir(self.path)
        self.assertTrue(result)
class WhenTestingMkdirOperation(unittest.TestCase):

    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs'

    def test_mkdir_throws_exception_for_not_ok(self):
        self.response.status_code = httplib.BAD_REQUEST
        self.requests.put.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.make_dir(self.path)

    def test_mkdir_returns_true(self):
        self.response.status_code = httplib.OK
        self.requests.put.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.make_dir(self.path)
        self.assertTrue(result)
def test_webhdfs_csv(self):
    from pywebhdfs.webhdfs import PyWebHdfsClient
    dfs = PyWebHdfsClient(host='localhost', port='9870', user_name='hadoop')
    dfs.make_dir("/temp")
    with open("tests/data/data.csv") as input_file:
        dfs.create_file("/temp/data.csv", file_data=input_file,
                        overwrite=True)
    dfs.delete_file_dir("/temp", recursive=True)
import os
import sys

from pywebhdfs.webhdfs import PyWebHdfsClient


def sharedlib_install(name_node, webhdfs_port, authentic_user, platform_dir,
                      lib_path_list):
    # Set up a connection with HDFS using the namenode.
    hdfs = PyWebHdfsClient(host=name_node, port=webhdfs_port,
                           user_name=authentic_user, timeout=None)
    # Create the directory on the Hadoop file system (HDFS).
    hdfs.make_dir(platform_dir)

    # Create each file on HDFS and write its contents from the local FS.
    for path in lib_path_list:
        platform_file = '%s/%s' % (platform_dir, os.path.basename(path))
        print >> sys.stdout, 'Copying source file: %s to HDFS path %s' % \
            (path, platform_file)
        with open(path) as file_data:
            hdfs.create_file(platform_file, file_data, overwrite=True)
class Store(store.Store):
    """ HDFS backed store. """

    def __init__(self):
        """ Connect to store """
        self._client = PyWebHdfsClient(host=store_host, port=store_port,
                                       user_name=store_user)

    def mkdir(self, path):
        self._client.make_dir(path)

    def read(self, path, open_handle):
        return StoreFile(self._client, path, "r", open_handle)

    def append(self, path, open_handle):
        return StoreFile(self._client, path, "a", open_handle)

    def write(self, path, open_handle):
        return StoreFile(self._client, path, "w", open_handle)

    def exists(self, path):
        try:
            self._client.list_dir(path)
            return True
        except errors.FileNotFound:
            return False

    def walk(self, path, visitor, recursive=False):
        """ Walk files in a path. Use recursive=True to include subdirs """
        dirinfo = self._client.list_dir(path)
        for status in dirinfo["FileStatuses"]["FileStatus"]:
            if recursive and status["type"] == "DIRECTORY":
                if len(path) > 0:
                    self.walk(path + "/" + status["pathSuffix"], visitor,
                              recursive)
                else:
                    self.walk(status["pathSuffix"], visitor, recursive)
            else:
                # WebHDFS reports modificationTime in milliseconds;
                # datetime.fromtimestamp expects seconds.
                info = dict(name=status["pathSuffix"],
                            modify=datetime.fromtimestamp(
                                status["modificationTime"] / 1000.0),
                            size=status["length"])
                visitor(path, info)
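# A sketch of the visitor protocol used by Store.walk() above (hypothetical,
# not from the original source): walk() calls visitor(path, info) once per
# file, where info carries name/modify/size. Assumes the module-level
# store_host/store_port/store_user globals are already configured; the path
# is illustrative.
def print_visitor(path, info):
    print('%s/%s  %d bytes, modified %s' % (path, info['name'],
                                            info['size'], info['modify']))

store = Store()
store.walk('user/hdfs/example_dir', print_visitor, recursive=True)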
#1 imports
from pywebhdfs.webhdfs import PyWebHdfsClient

#2 make connection with hadoop file system
hdfs = PyWebHdfsClient(user_name="hdfs", port=50070,
                       host="sandbox.hortonworks.com")

#3 remove the file if it exists from a previous run
hdfs.delete_file_dir('chapter5/LoanStats3d.csv', recursive=True)

#4 recreate the chapters directory
hdfs.make_dir('chapter5')

#5 upload the csv file
with open('./data/stored_csv.csv') as file_data:
    hdfs.create_file('chapter5/LoanStats3d.csv', file_data, overwrite=True)

#6 print the status to see if this succeeded
print hdfs.get_file_dir_status('chapter5/LoanStats3d.csv')
class HDFS(object):
    def __init__(self, host, port, user):
        self._hdfs = PyWebHdfsClient(host=host, port=port, user_name=user,
                                     timeout=None)
        logging.debug('webhdfs = %s@%s:%s', user, host, port)

    def recursive_copy(self, local_path, remote_path, exclude=None,
                       permission=755):
        if exclude is None:
            exclude = []
        c_path = canonicalize(remote_path)
        logging.debug('making %s', c_path)
        self._hdfs.make_dir(c_path, permission=permission)
        for dpath, dnames, fnames in os.walk(local_path):
            _, relative_path = dpath.split(local_path)
            for dname in dnames:
                if dname not in exclude:
                    c_path = canonicalize(
                        '%s/%s/%s' % (remote_path, relative_path, dname))
                    logging.debug('making %s', c_path)
                    self._hdfs.make_dir(c_path, permission=permission)
            for fname in fnames:
                if fname not in exclude:
                    local_file = canonicalize(
                        '%s/%s/%s' % (local_path, relative_path, fname))
                    c_path = canonicalize(
                        '%s/%s/%s' % (remote_path, relative_path, fname))
                    logging.debug('creating %s', c_path)
                    with open(local_file, 'rb') as data:
                        self._hdfs.create_file(c_path, data, overwrite=True,
                                               permission=permission)

    def make_dir(self, path, permission=755):
        logging.debug('make_dir: %s', path)
        self._hdfs.make_dir(canonicalize(path), permission=permission)

    def create_file(self, data, remote_file_path, permission=755):
        logging.debug('create_file: %s', remote_file_path)
        sio = BytesIO(data)
        self._hdfs.create_file(canonicalize(remote_file_path), sio,
                               overwrite=True, permission=permission)

    def append_file(self, data, remote_file_path):
        logging.debug('append to: %s', remote_file_path)
        self._hdfs.append_file(canonicalize(remote_file_path), data)

    def stream_file_to_disk(self, remote_file_path, local_file_path):
        chunk_size = 10 * 1024 * 1024
        offset = 0
        with open(local_file_path, 'wb') as dest_file:
            data = self._hdfs.read_file(canonicalize(remote_file_path),
                                        offset=offset, length=chunk_size)
            while True:
                dest_file.write(data)
                if len(data) < chunk_size:
                    break
                offset += chunk_size
                data = self._hdfs.read_file(canonicalize(remote_file_path),
                                            offset=offset, length=chunk_size)

    def read_file(self, remote_file_path):
        return self._hdfs.read_file(canonicalize(remote_file_path))

    def remove(self, path, recursive=False):
        logging.debug('remove: %s', path)
        self._hdfs.delete_file_dir(canonicalize(path), recursive)

    def file_exists(self, path):
        try:
            self._hdfs.get_file_dir_status(path)
            return True
        except Exception:
            return False
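# A usage sketch for the wrapper above (hypothetical names and paths, not
# from the original source). stream_file_to_disk() is the call to prefer for
# large files: it reads fixed 10 MB chunks, so memory stays bounded no matter
# how big the remote file is, unlike read_file(), which buffers everything.
fs = HDFS('namenode', '50070', 'hdfs')
fs.recursive_copy('./build', 'apps/demo', exclude=['.git'])
fs.create_file(b'hello\n', 'apps/demo/marker.txt')
fs.stream_file_to_disk('apps/demo/big.bin', '/tmp/big.bin')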
import StringIO
import zipfile

import requests

source = requests.get(
    "https://resources.lendingclub.com/LoanStats3d.csv.zip", verify=False)
stringio = StringIO.StringIO(source.content)
unzipped = zipfile.ZipFile(stringio)

import pandas as pd
from pywebhdfs.webhdfs import PyWebHdfsClient

subselection_csv = pd.read_csv(unzipped.open('LoanStats3d.csv'), skiprows=1,
                               skipfooter=2, engine='python')
stored_csv = subselection_csv.to_csv('./stored_csv.csv')

hdfs = PyWebHdfsClient(user_name="hdfs", port=50070, host="sandbox")
hdfs.make_dir('chapter5')
with open('./stored_csv.csv') as file_data:
    hdfs.create_file('chapter5/LoanStats3d.csv', file_data, overwrite=True)
print(hdfs.get_file_dir_status('chapter5/LoanStats3d.csv'))

from pyspark.sql import HiveContext

# sc = SparkContext()
sqlContext = HiveContext(sc)
data = sc.textFile("/chapter5/LoanStats3d.csv")
parts = data.map(lambda r: r.split(','))
firstline = parts.first()
datalines = parts.filter(lambda x: x != firstline)


def cleans(row):
class MulticlassLogisticRegressionModelTrainer(object):

    def __init__(self):
        pass

    def hdfsizePath(self, path):
        return self.hdfsServerUrl + path

    def start(self, q, parentEnv, configJsonStr):
        self.config = DictionaryAsNestedObjectSerializer(
            json.loads(configJsonStr))
        self.hdfs = PyWebHdfsClient(
            host=self.config.acm.servers.hdfs.host,
            port=self.config.acm.servers.hdfs.restPort,
            user_name=self.config.acm.servers.hdfs.fileOwner)
        self.hdfsServerUrl = "hdfs://" + self.config.acm.servers.hdfs.host + \
            ":" + str(self.config.acm.servers.hdfs.port)

        env_ = json.loads(parentEnv)
        py4jExists = False
        for key in env_.keys():
            os.environ[key] = env_[key]
            if "py4j-" in env_[key]:
                py4jExists = True

        ### set pyspark env variables ###
        # os.environ["SPARK_HOME"] = "/home/halil/programs/spark230"
        # if os.environ.get("PYTHONPATH") is None:
        #     os.environ["PYTHONPATH"] = os.path.join(
        #         os.environ["SPARK_HOME"], "python/")
        # if py4jExists == False:
        #     os.environ["PYTHONPATH"] = os.path.join(
        #         os.environ["SPARK_HOME"],
        #         "python/lib/py4j-0.10.6-src.zip") + ":" + os.environ["PYTHONPATH"]

        # set config
        trainDataFiles = self.hdfsizePath(
            self.config.acm.models.classification.data.hdfs.inputDir + "/*.csv")
        print(trainDataFiles)

        sc = SparkContext()
        sqlContext = SQLContext(sc)
        data = sqlContext.read.format('com.databricks.spark.csv') \
            .options(header='true', inferschema='true') \
            .load(trainDataFiles).limit(1000)
        print(data.columns)

        drop_list = ['Dates', 'DayOfWeek', 'PdDistrict', 'Resolution',
                     'Address', 'X', 'Y']
        data = data.select([column for column in data.columns
                            if column not in drop_list])
        data.show(5)
        data.printSchema()

        # by top 20 categories
        data.groupBy("Category") \
            .count() \
            .orderBy(col("count").desc()) \
            .show()
        # by top 20 descriptions
        data.groupBy("Descript") \
            .count() \
            .orderBy(col("count").desc()) \
            .show()

        # regular expression tokenizer
        regexTokenizer = RegexTokenizer(inputCol="Descript",
                                        outputCol="words", pattern="\\W")
        # stop words
        add_stopwords = ["http", "https", "amp", "rt", "t", "c", "the"]  # standard stop words
        stopwordsRemover = StopWordsRemover(
            inputCol="words", outputCol="filtered").setStopWords(add_stopwords)
        # bag of words count
        countVectors = CountVectorizer(inputCol="filtered",
                                       outputCol="features",
                                       vocabSize=10000, minDF=5)
        label_stringIdx = StringIndexer(inputCol="Category", outputCol="label")

        transformers = [regexTokenizer, stopwordsRemover, countVectors,
                        label_stringIdx]
        pipeline = Pipeline(stages=transformers)
        pipelineFit = pipeline.fit(data)
        dataset = pipelineFit.transform(data)
        dataset.show(5)

        # Randomly split data into training and test sets;
        # set seed for reproducibility.
        (trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
        print("Training Dataset Count: " + str(trainingData.count()))
        print("Test Dataset Count: " + str(testData.count()))

        # Build the model
        lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
        # Train model with Training Data
        lrModel = lr.fit(trainingData)

        savedModelsDir = self.hdfsizePath(
            self.config.acm.models.classification.data.hdfs.savedModels)
        savedModelsZipDir = self.hdfsizePath(
            self.config.acm.models.classification.data.hdfs.zipDir)
        modelSavePolicy = self.config.acm.models.classification.modelSavePolicy
        if modelSavePolicy == "mostRecentOne":
            time_ms = str(int(time.time() * 1000))
            # if not os.path.exists(outputDir):
            #     os.mkdir(outputDir)
            # if not os.path.exists(zipDir):
            #     os.mkdir(zipDir)
            self.hdfs.make_dir(
                self.config.acm.models.classification.data.hdfs.savedModels)
            self.hdfs.make_dir(
                self.config.acm.models.classification.data.hdfs.zipDir)

            newModelDirName = \
                self.config.acm.models.classification.data.hdfs.savedModels \
                + "/" + time_ms
            modelOutputPath = newModelDirName + "/model"
            pipelineOutputPath = newModelDirName + "/pipeline"
            self.hdfs.make_dir(newModelDirName)
            self.hdfs.make_dir(modelOutputPath)
            self.hdfs.make_dir(pipelineOutputPath)

            lrModel.write().overwrite().save(
                self.hdfsizePath(modelOutputPath))
            pipelineFit.write().overwrite().save(
                self.hdfsizePath(pipelineOutputPath))
class HDFS(object):
    def __init__(self, host, port, user):
        self._hdfs = PyWebHdfsClient(host=host, port=port, user_name=user,
                                     timeout=None)
        logging.debug('webhdfs = %s@%s:%s', user, host, port)

    def recursive_copy(self, local_path, remote_path, exclude=None):
        if exclude is None:
            exclude = []
        c_path = canonicalize(remote_path)
        logging.debug('making %s', c_path)
        self._hdfs.make_dir(c_path)
        for dpath, dnames, fnames in os.walk(local_path):
            _, relative_path = dpath.split(local_path)
            for dname in dnames:
                if dname not in exclude:
                    c_path = canonicalize(
                        '%s/%s/%s' % (remote_path, relative_path, dname))
                    logging.debug('making %s', c_path)
                    self._hdfs.make_dir(c_path)
            for fname in fnames:
                if fname not in exclude:
                    local_file = canonicalize(
                        '%s/%s/%s' % (local_path, relative_path, fname))
                    c_path = canonicalize(
                        '%s/%s/%s' % (remote_path, relative_path, fname))
                    logging.debug('creating %s', c_path)
                    with open(local_file, 'rb') as data:
                        self._hdfs.create_file(c_path, data, overwrite=True)

    def make_dir(self, path):
        logging.debug('make_dir: %s', path)
        self._hdfs.make_dir(canonicalize(path))

    def create_file(self, data, remote_file_path):
        logging.debug('create_file: %s', remote_file_path)
        sio = StringIO.StringIO(data)
        self._hdfs.create_file(canonicalize(remote_file_path), sio,
                               overwrite=True)

    def append_file(self, data, remote_file_path):
        logging.debug('append to: %s', remote_file_path)
        self._hdfs.append_file(canonicalize(remote_file_path), data)

    def stream_file_to_disk(self, remote_file_path, local_file_path):
        chunk_size = 10 * 1024 * 1024
        offset = 0
        with open(local_file_path, 'wb') as dest_file:
            data = self._hdfs.read_file(canonicalize(remote_file_path),
                                        offset=offset, length=chunk_size)
            while True:
                dest_file.write(data)
                if len(data) < chunk_size:
                    break
                offset += chunk_size
                data = self._hdfs.read_file(canonicalize(remote_file_path),
                                            offset=offset, length=chunk_size)

    def read_file(self, remote_file_path):
        return self._hdfs.read_file(canonicalize(remote_file_path))

    def remove(self, path, recursive=False):
        logging.debug('remove: %s', path)
        self._hdfs.delete_file_dir(canonicalize(path), recursive)
def hdfsPutTrainDataToDir(filePath, dir_):
    fname = os.path.basename(filePath)
    hdfsPut(filePath, dir_ + "/" + fname)


conf = SparkConf()
conf.setMaster("spark://acm-spark-master:7077")
conf.setAppName("batch-multiclass-text-classification")
sc = SparkContext(conf=conf)  # pass conf so the master and app name take effect
sqlContext = SQLContext(sc)

trainDataFile = "./data/sanfrancisco-crime/train.csv"
hdfsPath = "/acm/ml/clsf/data/test001"
modelsPath = hdfsPath + "/models"
hdfs.make_dir(hdfsPath)
hdfs.make_dir(modelsPath)
hdfsPutTrainDataToDir(trainDataFile, hdfsPath)

data = sqlContext.read.format('com.databricks.spark.csv').options(
    header='true', inferschema='true').load(
    "hdfs://namenode:9000/" + hdfsPath).limit(1000)
print(data.columns)

drop_list = ['Dates', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address',
             'X', 'Y']
data = data.select([column for column in data.columns
                    if column not in drop_list])
data.show(5)
import logging

from pywebhdfs.webhdfs import PyWebHdfsClient

_LOG = logging.getLogger(__name__)

example_dir = 'user/hdfs/example_dir'
example_file = '{dir}/example.txt'.format(dir=example_dir)
example_data = '01010101010101010101010101010101010101010101\n'
rename_dir = 'user/hdfs/example_rename'

# create a new client instance
hdfs = PyWebHdfsClient(host='localhost', port='50070', user_name='hduser')

# create a new directory for the example
print('making new HDFS directory at: {0}\n'.format(example_dir))
hdfs.make_dir(example_dir)

# get a dictionary of the directory's status
dir_status = hdfs.get_file_dir_status(example_dir)
print(dir_status)

# create a new file on hdfs
print('making new file at: {0}\n'.format(example_file))
hdfs.create_file(example_file, example_data)

file_status = hdfs.get_file_dir_status(example_file)
print(file_status)

# get the checksum for the file
file_checksum = hdfs.get_file_checksum(example_file)
print(file_checksum)
#!/usr/bin/env python
import os

from pywebhdfs.webhdfs import PyWebHdfsClient

hdfs = PyWebHdfsClient(host='namenode', port='50070', user_name='root')


def hdfsPut(local_path, hdfs_path):
    with open(local_path) as file_data:
        hdfs.create_file(hdfs_path, file_data=file_data, overwrite=True)


def hdfsPutTrainDataToDir(filePath, dir_):
    fname = os.path.basename(filePath)
    hdfsPath = dir_ + "/" + fname
    hdfsPut(filePath, hdfsPath)


trainDataFile = "./data/sanfrancisco-crime/train.csv"
hdfsPath = "/acm/ml/clsf/data/test001"
hdfs.make_dir(hdfsPath)
hdfsPutTrainDataToDir(trainDataFile, hdfsPath)
def upload_to_hdfs(self, local_file, table, index):
    ''' Upload a file from the local filesystem to HDFS. '''
    hiveOper = hive_op.HiveOperation()
    local_dir = self._conf.get('local', 'data_dir')
    local_path = '{}{}/{}'.format(local_dir, index, local_file)

    host1 = self._conf.get('hdfs', 'name_node1')
    host2 = self._conf.get('hdfs', 'name_node2')
    user = self._conf.get('hdfs', 'user')
    port = self._conf.getint('hdfs', 'port')
    hdfs_base_path = self._conf.get('hdfs', 'upload_path')
    hdfs_dir_path = '{}{}'.format(hdfs_base_path, index)
    hdfs_path = '{}{}/{}'.format(hdfs_base_path, index, local_file)

    # Implement HA manually: try the first namenode, fall back to the second.
    try:
        hdfs_cli = PyWebHdfsClient(host=host1, port=port, user_name=user)
        hdfs_cli.list_dir('/')
    except Exception as e:
        logger.warn('open hdfs client failed error {}'.format(e))
        hdfs_cli = PyWebHdfsClient(host=host2, port=port, user_name=user)
        hdfs_cli.list_dir('/')

    if hdfs_cli is None:
        logger.error('no active host')
        return None

    try:
        hdfs_cli.get_file_dir_status(hdfs_path)
        # If the temporary file already exists on HDFS, the previous load
        # into Hive probably failed, or the process was killed partway
        # through. Load the data from the temporary file into Hive first,
        # then continue with the next step.
        ret = hiveOper.load_hdfs_file_into_tmp_table(hdfs_path, table)
        if ret == -1:
            logger.error('load from hdfs to tmp table failed')
        logger.info('last time! {} load into tmp finished'.format(table))
        hiveOper.load_tmp_table_to_main(table)
        logger.info(
            'last time! {} load tmp table to main finished'.format(table))
    # FileNotFoundException
    except Exception as e:
        # The file not existing is the normal case.
        logger.debug('no such file {}'.format(hdfs_path))

    retry_count = 0
    upload_finished = False
    while retry_count <= 10 and not upload_finished:
        with open(local_path) as f:
            logger.debug('''local path is {}, hdfs_cli is {}, file is {},
                hdfs_path is {}'''.format(local_path, hdfs_cli, f, hdfs_path))
            # hdfs_cli.delete_file_dir(hdfs_path)
            # If the directory does not exist, create it first.
            try:
                hdfs_cli.get_file_dir_status(hdfs_dir_path)
            except Exception as e:
                hdfs_cli.make_dir(hdfs_dir_path)
            try:
                hdfs_cli.create_file(hdfs_path, f)
                upload_finished = True
            except Exception as e:
                logger.warn('''create file on hdfs failed, local path is {},
                    hdfs path is {}, retry count {}, upload flag {}'''.format(
                    local_path, hdfs_path, retry_count, upload_finished))
                logger.warn('error is {}'.format(e))
                retry_count += 1

    if retry_count <= 10:
        return hdfs_path
    else:
        logger.error('''{} upload 10 times, still failed, retry count {},
            upload_flag is {}'''.format(local_path, retry_count,
                                        upload_finished))
        return None
class HadoopFileSystem(BaseFs.FileSystem):
    def __init__(self, vcPath, simulateOnly=False, isVerbose=False,
                 logger=None, user=None, host=None, port=None):
        BaseFs.FileSystem.__init__(self, simulateOnly, isVerbose, logger)
        config = Config.Config()
        hdfsUser, hdfsHost, hdfsPort = config.getHadoopConfig(user, host, port)
        self.hdfs = PyWebHdfsClient(host=hdfsHost, port=hdfsPort,
                                    user_name=hdfsUser)
        self.vcPath = vcPath

    def make_fd(self, path, isSrc, dstDirMustExist):
        fd = None
        try:
            fd = HadoopFileDescriptor(self, path, isSrc, dstDirMustExist)
        except pywebhdfs.errors.FileNotFound:
            self.logger.info("DESC: does not exist: " + path)
            raise Errors.FileNotFound("Path {0} does not exist".format(path))
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(path, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(path, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(path, e))
            raise Errors.BadConnection(
                "Connection error while looking for path: {0}, exc={1}".format(path, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(path, e))
            raise Errors.FsException(
                "An exception happened while looking for path: {0}, exc={1}".format(path, e))
        return fd

    def exists_file_dir(self, fd):
        try:
            return self.hdfs.exists_file_dir(fd.abspath)
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
            raise Errors.BadConnection(
                "Connection error during HDFS exists test: {0}, exc={1}".format(fd.abspath, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
            raise Errors.FsException(
                "An exception happened during HDFS exists test: {0}, exc={1}".format(fd.abspath, e))

    def delete_file_dir(self, fd, recursive=False, force=False):
        if self.simulateOnly:
            print("SIMULATE -> remove file/dir: {0}, recursive={1}".format(
                fd.abspath, recursive))
        else:
            try:
                if not recursive or force or \
                        query_yes_no(question="Are you sure you want to delete folder recursively?",
                                     default="no"):
                    status = self.hdfs.delete_file_dir(fd.abspath,
                                                       recursive=recursive)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS delete directory: {0}, exc={1}".format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS delete directory: {0}, exc={1}".format(fd.abspath, e))

    def list_dir(self, fd):
        try:
            status = self.hdfs.list_dir(fd.abspath)
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
            raise Errors.BadConnection(
                "Connection error while looking for path: {0}, exc={1}".format(fd.abspath, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
            raise Errors.FsException(
                "An exception happened while looking for path: {0}, exc={1}".format(fd.abspath, e))
        currentDir = status["FileStatuses"]["FileStatus"]
        for item in currentDir:
            yield HadoopFileDescriptor(self, fd.abspath, isSrc=True,
                                       needsDstDirCheck=False, fileJson=item)

    def make_dir(self, path):
        if self.simulateOnly:
            print("SIMULATE -> make dir: " + path)
        else:
            try:
                self.hdfs.make_dir(path)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(path, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(path, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(path, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS create directory: {0}, exc={1}".format(path, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(path, e))
                raise Errors.FsException(
                    "An exception happened during HDFS create directory: {0}, exc={1}".format(path, e))

    def open_file(self, fd, rwMode):
        return fd

    def close_file(self, fd):
        pass

    def touch_file(self, fd):
        if self.simulateOnly:
            print("SIMULATE -> touch file: " + fd.abspath)
        else:
            try:
                self.hdfs.create_file(fd.abspath, 0, overwrite=True)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS create file: {0}, exc={1}".format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS create file: {0}, exc={1}".format(fd.abspath, e))

    def truncate_file(self, fd, size):
        if self.simulateOnly:
            print("SIMULATE -> truncate file: {0}, size={1}".format(
                fd.abspath, size))
        else:
            try:
                self.hdfs.truncate_file(fd.abspath, size)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS truncate file: {0}, exc={1}".format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS truncate file: {0}, exc={1}".format(fd.abspath, e))

    def try_concat_files(self, fd, chunkFdList):
        # Workaround for the unordered concat bug in Hadoop 2.7.1 is to use
        # one source at a time:
        # https://issues.apache.org/jira/browse/HDFS-8891
        currIndex = 0
        concatStep = 20
        chunkedList = [chunkFdList[pos:pos + concatStep]
                       for pos in range(0, len(chunkFdList), concatStep)]
        for sourceChunk in chunkedList:
            try:
                self.concat_files(fd, sourceChunk)
                currIndex += len(sourceChunk)
            except Errors.FsException as e:
                break
        return currIndex

    def concat_files(self, fd, chunkFdList):
        strList = list()
        for chunkFd in chunkFdList:
            strList.append(chunkFd.abspath)
        if self.simulateOnly:
            print("SIMULATE -> concat file: {0}, sources={1}".format(
                fd.abspath, ",".join(strList)))
        else:
            try:
                self.hdfs.concat_files(fd.abspath, strList)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS concat file: {0}, exc={1}".format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS concat file: {0}, exc={1}".format(fd.abspath, e))

    def read_data(self, fd, offset, size):
        if offset >= fd.size:
            return ""
        else:
            try:
                contents = self.hdfs.read_file(fd.abspath, offset=offset,
                                               length=size)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS read file: {0}, exc={1}".format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS read file: {0}, exc={1}".format(fd.abspath, e))
            return contents

    def append_data(self, fd, data):
        if self.simulateOnly:
            print("SIMULATE -> write file data: " + fd.abspath)
        else:
            try:
                self.hdfs.append_file(fd.abspath, data)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS append file: {0}, exc={1}".format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS append file: {0}, exc={1}".format(fd.abspath, e))

    def local_mv_file(self, src, dst):
        if self.simulateOnly:
            print("SIMULATE -> local move file: {0} -> {1} ".format(
                src.abspath, dst.abspath))
        else:
            try:
                self.hdfs.rename_file_dir(src.abspath, dst.abspath)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(src.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(src.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(src.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS rename file: {0}, exc={1}".format(src.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(src.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS rename file: {0}, exc={1}".format(src.abspath, e))

    def local_cp_file(self, src, dst):
        # This is an open issue in the Hadoop community:
        # https://issues.apache.org/jira/browse/HDFS-3370
        # Instead, we can do a symbolic link.
        if self.simulateOnly:
            print("SIMULATE -> local copy file: {0} -> {1} ".format(
                src.abspath, dst.abspath))
        else:
            print("Copy within HDFS is not supported due to lack of Hadoop support")
            print("Once symbolic links are enabled, this feature will be enabled")
            sys.exit(1)
            # self.hdfs.create_sym_link(src.abspath, dst.abspath,
            #                           createParent=True)

    def get_hdfs_file_dir_json(self, path):
        try:
            status = self.hdfs.get_file_dir_status(path)
            return status["FileStatus"]
        except pywebhdfs.errors.FileNotFound:
            return None

    def validate_hdfs_arg(self, arg):
        if not arg.startswith(self.vcPath):
            print("Error: You don't have permissions to the path: %s" % arg)
            print("Your path must be rooted under: %s" % self.vcPath)
            sys.exit(1)
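# A hypothetical usage sketch for the class above (not from the original
# source). Config, BaseFs, Errors, and HadoopFileDescriptor come from the
# surrounding project and are not shown in this excerpt; the vcPath and file
# path are illustrative.
fs = HadoopFileSystem('/user/project', simulateOnly=False, user='hdfs',
                      host='namenode', port='50070')
fs.validate_hdfs_arg('/user/project/data/part-00000')
fd = fs.make_fd('/user/project/data/part-00000', isSrc=True,
                dstDirMustExist=False)
first_kb = fs.read_data(fd, offset=0, size=1024)  # first KB of the file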
def save(self):
    hdfs = PyWebHdfsClient(host=os.environ["WEBHDFS_HOST"], port='14000',
                           user_name='oozie')
    coordinator_path = "{0}/{1}/coordinator.xml".format(self.path, self.name)
    hdfs.make_dir(self.path)
    hdfs.create_file(coordinator_path, self.as_xml())
import logging

from pywebhdfs.webhdfs import PyWebHdfsClient

logging.basicConfig(level=logging.DEBUG)
_LOG = logging.getLogger(__name__)

example_dir = 'user/hdfs/example_dir'
example_file = '{dir}/example.txt'.format(dir=example_dir)
example_data = '01010101010101010101010101010101010101010101\n'
rename_dir = 'user/hdfs/example_rename'

# create a new client instance
hdfs = PyWebHdfsClient(host='localhost', port='50070', user_name='hduser')

# create a new directory for the example
print('making new HDFS directory at: {0}\n'.format(example_dir))
hdfs.make_dir(example_dir)

# get a dictionary of the directory's status
dir_status = hdfs.get_file_dir_status(example_dir)
print(dir_status)

# create a new file on hdfs
print('making new file at: {0}\n'.format(example_file))
hdfs.create_file(example_file, example_data)

file_status = hdfs.get_file_dir_status(example_file)
print(file_status)

# append to the file created in the previous step
print('appending to file at: {0}\n'.format(example_file))
hdfs.append_file(example_file, example_data)
def save(self, workflow_name="workflow.xml"):
    hdfs = PyWebHdfsClient(host=os.environ["WEBHDFS_HOST"], port='14000',
                           user_name='oozie')
    # use the workflow_name parameter rather than a hardcoded file name
    workflow_path = "{0}/{1}/{2}".format(self.path, self.name, workflow_name)
    hdfs.make_dir(self.path)
    hdfs.create_file(workflow_path, self.as_xml())