from urlparse import urlparse
import StringIO

from boto.s3.bucket import Bucket
from snakebite.client import Client


# Method of a loader class that keeps a boto S3 connection on self.s3.
def fetch_content(self, uri):
    """Fetch the contents of a file addressed by an hdfs:// or s3:// URI."""
    p = urlparse(uri)
    if p.scheme == 'hdfs':
        # Stream the file line by line through snakebite and buffer it in memory.
        host, port = p.netloc.split(':')
        c = Client(host, int(port))
        content = StringIO.StringIO()
        for line in c.text([p.path]):
            content.write(line)
        return content.getvalue()
    if p.scheme == 's3':
        # Fetch the object from S3 via boto; the bucket name comes from the URI netloc.
        bucket = Bucket(self.s3, p.netloc)
        key = bucket.get_key(key_name=p.path[1:])
        return key.get_contents_as_string()
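# Illustration (hypothetical host and bucket names) of how urlparse splits the
# two URI schemes handled above, which is why the HDFS branch splits netloc
# into host/port and the S3 branch strips the leading '/' from the path:
from urlparse import urlparse  # Python 2, matching the snippet above

print urlparse('hdfs://namenode:8020/data/input.txt')
# ParseResult(scheme='hdfs', netloc='namenode:8020', path='/data/input.txt', ...)
print urlparse('s3://my-bucket/data/input.txt')
# ParseResult(scheme='s3', netloc='my-bucket', path='/data/input.txt', ...)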
class HDFSTextLoader(Unit, TriviallyDistributable):
    def __init__(self, workflow, **kwargs):
        super(HDFSTextLoader, self).__init__(workflow, **kwargs)
        self.file_name = kwargs["file"]
        self.chunk_lines_number = kwargs.get("chunk", 1000)
        client_kwargs = dict(kwargs)
        del client_kwargs["file"]
        if "chunk" in kwargs:
            del client_kwargs["chunk"]
        self.hdfs_client = Client(**client_kwargs)
        self.output = [""] * self.chunk_lines_number
        self.finished = Bool()

    def initialize(self):
        self.debug("Opened %s", self.hdfs_client.stat([self.file_name]))
        self._generator = self.hdfs_client.text([self.file_name])

    def run(self):
        assert not self.finished
        try:
            for i in range(self.chunk_lines_number):
                self.output[i] = next(self._generator)
        except StopIteration:
            self.finished <<= True
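# A standalone sketch (not part of the original unit) of the chunking pattern
# used in run() above, driven by a plain Python iterator instead of the HDFS
# client, to show how lines are consumed one fixed-size chunk at a time until
# the source is exhausted:
def read_in_chunks(line_iter, chunk_lines_number=1000):
    finished = False
    while not finished:
        chunk = []
        try:
            for _ in range(chunk_lines_number):
                chunk.append(next(line_iter))
        except StopIteration:
            finished = True
        if chunk:
            yield chunk

# e.g.: for chunk in read_in_chunks(iter(open('local.txt')), 3): process(chunk)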
import itertools
import os
import random
import time

from kafka import KafkaClient, SimpleProducer
from snakebite.client import Client

# Create kafka client
print "Create kafka client to: %s" % args.kafka
kafka = KafkaClient(args.kafka + ':9092')
producer = SimpleProducer(kafka)

# Read testing data from HDFS
hdfsServer = args.hdfs
hdfsPort = int(os.environ.get('HDFS_NAME_PORT', 8020))
hdfsHost = "hdfs://" + hdfsServer + ":" + str(hdfsPort)
topic = args.topic

print "Reading input from HDFS: server=%s, port=%d" % (hdfsServer, hdfsPort)
client = Client(hdfsServer, hdfsPort)
data_file = client.text(["/user/" + os.getenv('LOGNAME') + "/data/X_test.txt"]).next()
label_file = client.text(["/user/" + os.getenv('LOGNAME') + "/data/y_test.txt"]).next()

samples = data_file.splitlines()
labels = label_file.splitlines()
test_data = zip(samples, labels)
random.shuffle(test_data)  # Shuffle it


def getActivityName(a):
    a = int(a)
    if a in range(1, 7):
        return str(a)
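# A possible continuation (not part of the original excerpt): push each
# shuffled (sample, label) pair to the Kafka topic with the old kafka-python
# SimpleProducer API. The tab-separated message format is an assumption.
for sample, label in test_data:
    producer.send_messages(topic, sample + "\t" + label)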
from snakebite.client import Client

client = Client('localhost', 54310)
for l in client.text(['/input/input.txt']):
    print l
from snakebite.client import Client

client = Client('localhost', 9000)
for l in client.text(['/input/input.txt']):
    print l
import argparse
import os
import subprocess

import numpy as np
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from snakebite.client import Client

parser = argparse.ArgumentParser()
parser.add_argument("--hdfs", help="HDFS FS name", default='localhost')
parser.add_argument("--model", help="Name of model file", default='belt.model')
args = parser.parse_args()

hdfsServer = args.hdfs
hdfsPort = int(os.environ.get('HDFS_NAME_PORT', 8020))
hdfsHost = "hdfs://" + hdfsServer + ":" + str(hdfsPort)
modelSavePath = "/user/" + os.getenv('LOGNAME') + "/data/model/" + args.model + "/"
print "hdfs=%s, savePath=%s, hdfsHost=%s" % (hdfsServer, modelSavePath, hdfsHost)

# Read the training set straight out of HDFS via snakebite
hdfs_client = Client(hdfsServer, hdfsPort)
X_train_file = hdfs_client.text(["/user/" + os.getenv('LOGNAME') + "/data/X_train.txt"]).next()
y_train_file = hdfs_client.text(["/user/" + os.getenv('LOGNAME') + "/data/y_train.txt"]).next()
X_train = np.genfromtxt(str.splitlines(X_train_file))
y_train = np.genfromtxt(str.splitlines(y_train_file))

# Train the model and persist it locally with joblib
clf = LogisticRegression()
clf = clf.fit(X_train, y_train)
files = joblib.dump(clf, "belt.model")

# Copy the dumped files into HDFS (snakebite is read-only, so shell out to hdfs dfs)
subprocess.check_call(['hdfs', 'dfs', '-rm', '-r', '-f', modelSavePath], shell=False)
subprocess.check_call(['hdfs', 'dfs', '-mkdir', '-p', modelSavePath], shell=False)
for f in files:
    subprocess.check_call(['hdfs', 'dfs', '-put', os.getcwd() + '/' + f, modelSavePath + f], shell=False)
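# A hedged companion sketch (not part of the original): pull the persisted
# model directory back out of HDFS with `hdfs dfs -get` and reload it with
# joblib to check the round trip. The local directory name is made up for
# the example.
subprocess.check_call(['hdfs', 'dfs', '-get', modelSavePath, 'model_from_hdfs'], shell=False)
restored = joblib.load('model_from_hdfs/belt.model')
print "Restored model coefficients shape: %s" % (restored.coef_.shape,)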
import os
from StringIO import StringIO

from snakebite.client import Client

# provide the inter-process communication (RPC) port of the name node
INTERNET_PROCESS_COMMUNICATION_PORT = "..."
# provide the Name Node of Hadoop
NAME_NODE = "..."
# and get the client of HDFS
CLIENT_HDFS = Client(NAME_NODE, INTERNET_PROCESS_COMMUNICATION_PORT)


def read_hdfs_file(file_path_and_name):
    """Reads an hdfs file

    :param file_path_and_name: the path and the file to read
    """
    # 1. get the hdfs file contents as an in-memory file object
    for file_contents in CLIENT_HDFS.text([file_path_and_name]):
        file_unicode = file_contents.decode('unicode-escape')
        file_obj = StringIO(file_unicode)
        # 2. read and operate on it line by line
        for line in file_obj.readlines():
            # ...
            # do operations on the file
            pass
from snakebite.client import Client

# port is the RPC port of the namenode; get these parameters from
# /etc/hadoop/conf/core-site.xml under fs.defaultFS
client = Client('localhost', 8020)

# many of the methods in snakebite return generators
for i in client.ls(['/user/cloudera/behrouz']):  # takes a list of paths!
    print i
print '*' * 40

# creating directories:
# create two directories behrouz, behrouz1/b1 on HDFS
for p in client.mkdir(['/behrouz', 'behrouz1/b1'], create_parent=True):
    print p
print '*' * 40

# deleting files and directories: recursively deleting a directory removes
# any subdirectories and files it contains
for p in client.delete(['/behrouz', 'behrouz1/b1'], recurse=True):
    print p
print '*' * 40

# retrieving data from HDFS:
# copying files from HDFS to the local file system
for f in client.copyToLocal(['/user/cloudera/wordCount.out'], '/home/cloudera/'):
    print f
print '*' * 40

# reading the contents of a file
# the text method automatically decompresses and displays gzip and bzip2 files
for l in client.text(['/user/cloudera/testfile.txt']):
    print l
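# Instead of hard-coding the namenode host/port from core-site.xml, snakebite
# also provides an AutoConfigClient that picks them up from the Hadoop client
# configuration. A minimal sketch, assuming HADOOP_CONF_DIR / HADOOP_HOME
# point at a valid configuration:
from snakebite.client import AutoConfigClient

auto_client = AutoConfigClient()
for entry in auto_client.ls(['/']):
    print entry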
#!/usr/local/bin/python
from snakebite.client import Client

client = Client('localhost', 9000)
# text() automatically decompresses and displays gzip and bzip2 files
for line in client.text(['/user/cbohara/book.txt']):
    print line
from snakebite.client import Client client = Client("10.21.137.43", 8020, use_trash=False) for x in client.ls(['/']): print x for x in client.text(['/testBrian/ttt.txt']): print x # https://snakebite.readthedocs.io/en/latest/client.html # Spotify
from mrjob.job import MRJob
from snakebite.client import Client

client = Client('hadoop', 9000)
for l in client.text(['/input/word.txt']):
    print(l)

# class MRWordCount(MRJob):
#
#     def mapper(self, _, line):
#         for word in line.split():
#             yield(word, 1)
#
#     def reducer(self, word, counts):
#         yield(word, sum(counts))
#
# if __name__ == '__main__':
#     MRWordCount.run()
# Create a couple of directories
for p in client.mkdir(['/student9_7/py_dir_01', '/student9_7/py_dir_02'], create_parent=True):
    print(p)
'''
{'path': '/student9_7/py_dir_01', 'result': True}
{'path': '/student9_7/py_dir_02', 'result': True}
'''

# Delete the `py_dir_01` directory
for p in client.delete(['/student9_7/py_dir_01'], recurse=True):
    print(p)
'''
{'path': '/student9_7/py_dir_01', 'result': True}
'''

# Look at what the `test` file contains
for t in client.text(['/student9_7/test']):
    print(t)
'''
test file for hdfs
'''

# Copy the `test` file from HDFS to the local home directory as `retrived_file_via_py`
for f in client.copyToLocal(['/student9_7/test'], 'retrived_file_via_py'):
    print(f)
'''
{'path': '/home/student9_7/retrived_file_via_py', 'source_path': '/student9_7/test', 'result': True, 'error': ''}
'''