Example #1
    # Assumed imports for this excerpt (not shown in the original):
    # urlparse, StringIO, snakebite's Client, and boto's Bucket.
    def fetch_content(self, uri):
        p = urlparse(uri)

        if p.scheme == 'hdfs':
            # netloc holds "host:port"; snakebite needs them separately
            host, port = p.netloc.split(':')

            c = Client(host, int(port))

            content = StringIO.StringIO()

            for line in c.text([p.path]):
                content.write(line)

            return content.getvalue()

        if p.scheme == 's3':
            bucket = Bucket(self.s3, p.netloc)
            # strip the leading '/' from the path to get the S3 key name
            key = bucket.get_key(key_name=p.path[1:])

            return key.get_contents_as_string()
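# A hedged usage sketch (the URIs are placeholders, not from the original):
#   content = obj.fetch_content('hdfs://namenode:8020/data/part-00000')
#   content = obj.fetch_content('s3://some-bucket/data/part-00000')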
Example #2
class HDFSTextLoader(Unit, TriviallyDistributable):
    def __init__(self, workflow, **kwargs):
        super(HDFSTextLoader, self).__init__(workflow, **kwargs)
        self.file_name = kwargs["file"]
        self.chunk_lines_number = kwargs.get("chunk", 1000)
        # everything except "file" and "chunk" is forwarded to the
        # snakebite Client
        client_kwargs = dict(kwargs)
        del client_kwargs["file"]
        if "chunk" in kwargs:
            del client_kwargs["chunk"]
        self.hdfs_client = Client(**client_kwargs)
        self.output = [""] * self.chunk_lines_number
        self.finished = Bool()

    def initialize(self):
        self.debug("Opened %s", self.hdfs_client.stat([self.file_name]))
        self._generator = self.hdfs_client.text([self.file_name])

    def run(self):
        # fill the output buffer with the next chunk of lines; raise the
        # `finished` flag once the file is exhausted
        assert not self.finished
        try:
            for i in range(self.chunk_lines_number):
                self.output[i] = next(self._generator)
        except StopIteration:
            self.finished <<= True
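# A minimal standalone sketch of the same chunked-read pattern using only
# snakebite (host, port, path, and chunk size below are assumptions, not
# taken from the example above):
from snakebite.client import Client

client = Client('namenode', 8020)
lines = client.text(['/data/corpus.txt'])
chunk = []
try:
    while True:
        for _ in range(1000):
            chunk.append(next(lines))
        # a full chunk is ready here; process it, then start a new one
        chunk = []
except StopIteration:
    pass  # `chunk` now holds the final, partial chunk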
Example #3
File: hdfs_loader.py  Project: 2php/veles
Same HDFSTextLoader class as in Example #2.
Example #4
# Create Kafka client (args comes from argparse, as in Example #7 below)
print "Creating Kafka client for: %s" % args.kafka
kafka = KafkaClient(args.kafka + ':9092')
producer = SimpleProducer(kafka)

# Read testing data from hdfs
hdfsServer = args.hdfs
hdfsPort = int(os.environ.get('HDFS_NAME_PORT', 8020))
hdfsHost = "hdfs://" + hdfsServer + ":" + str(hdfsPort)

topic = args.topic

from snakebite.client import Client
print "Reading input from HDFS: server=%s, port=%d" % (hdfsServer, hdfsPort)
client = Client(hdfsServer, hdfsPort)
data_file = client.text(["/user/" + os.getenv('LOGNAME') + "/data/X_test.txt"]).next()
label_file = client.text(["/user/" + os.getenv('LOGNAME') + "/data/y_test.txt"]).next()

import random
import time
import itertools

samples = data_file.splitlines()
labels = label_file.splitlines()
test_data = zip(samples, labels)
random.shuffle(test_data)  # shuffle the (sample, label) pairs

def getActivityName(a):
    # activity labels 1..6 are valid; anything else yields None
    a = int(a)
    if a in range(1, 7):
        return str(a)
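# A hedged sketch of how the shuffled test set might be streamed into
# Kafka (not part of the original excerpt; the message format and the
# sleep interval are assumptions):
for sample, label in test_data:
    activity = getActivityName(label)
    if activity is None:
        continue  # skip labels outside the 1..6 range
    producer.send_messages(topic, "%s|%s" % (activity, sample))
    time.sleep(0.1)  # throttle the stream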
Example #5
from snakebite.client import Client

client = Client('localhost', 54310)
for l in client.text(['/input/input.txt']):
    print l
Example #6
from snakebite.client import Client

client = Client('localhost', 9000)
for l in client.text(['/input/input.txt']):
    print l
Example #7
import argparse
import os
import subprocess

import numpy as np
from sklearn.externals import joblib  # or simply: import joblib
from sklearn.linear_model import LogisticRegression
from snakebite.client import Client

parser = argparse.ArgumentParser()
parser.add_argument("--hdfs", help="HDFS FS name", default='localhost')
parser.add_argument("--model", help="Name of model file", default='belt.model')
args = parser.parse_args()


hdfsServer = args.hdfs
hdfsPort = int(os.environ.get('HDFS_NAME_PORT', 8020))
hdfsHost = "hdfs://" + hdfsServer + ":" + str(hdfsPort)
modelSavePath = "/user/" + os.getenv('LOGNAME') + "/data/model/" + args.model + "/"
print "hdfs=%s, savePath=%s, hdfsHost=%s" % (hdfsServer, modelSavePath, hdfsHost)

hdfs_client = Client(hdfsServer, hdfsPort)

X_train_file = hdfs_client.text(["/user/" + os.getenv('LOGNAME') + "/data/X_train.txt"]).next()
y_train_file = hdfs_client.text(["/user/" + os.getenv('LOGNAME') + "/data/y_train.txt"]).next()

# np.genfromtxt accepts any iterable of lines
X_train = np.genfromtxt(X_train_file.splitlines())
y_train = np.genfromtxt(y_train_file.splitlines())

clf = LogisticRegression()
clf = clf.fit(X_train, y_train)

files = joblib.dump(clf, args.model)  # name the dump after --model

subprocess.check_call(['hdfs', 'dfs', '-rm', '-r', '-f', modelSavePath], shell=False)
subprocess.check_call(['hdfs', 'dfs', '-mkdir', '-p', modelSavePath], shell=False)

for f in files:
    subprocess.check_call(['hdfs', 'dfs', '-put', os.getcwd() + '/' + f, modelSavePath + f], shell=False)
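# Optional check (an assumption, not in the original excerpt): list what
# actually landed in HDFS using the snakebite client created above.
for entry in hdfs_client.ls([modelSavePath]):
    print entry['path']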
Example #8
import os
from StringIO import StringIO  # Python 2; on Python 3 use io.StringIO

from snakebite.client import Client

# provide the NameNode IPC (RPC) port -- must be an int, e.g. 8020
NAME_NODE_IPC_PORT = "..."

# provide the hostname of the Hadoop NameNode
NAME_NODE = "..."

# and get the HDFS client
CLIENT_HDFS = Client(NAME_NODE, NAME_NODE_IPC_PORT)


def read_hdfs_file(file_path_and_name):
    """Reads an HDFS file.
    :param file_path_and_name: the path and name of the file to read
    """
    # 1. get the file contents as a file-like object
    #    (text() yields the contents of each path in the list)
    for file_contents in CLIENT_HDFS.text([file_path_and_name]):
        file_unicode = file_contents.decode('unicode-escape')
        return StringIO(file_unicode)


# 2. read and operate on top:
file_lines = read_hdfs_file("...").readlines()
for line in file_lines:
    # ...
    # do operations on the file
    pass
Example #9
from snakebite.client import Client
client = Client('localhost', 8020)  # the port is the RPC port of the NameNode
for i in client.ls(['/user/cloudera/behrouz']):  # ls() takes a list of paths
    print i
# get these parameters from /etc/hadoop/conf/core-site.xml, under fs.defaultFS
# many of the methods in snakebite return generators

#creating a directory:
#create two directories behrouz, behrouz1/b1 on HDFS:
print '*' * 40
for p in client.mkdir(['/behrouz', 'behrouz1/b1'], create_parent=True):
    print p
print '*' * 40
# deleting files and directories: delete() recursively removes any
# subdirectories and files a directory contains
for p in client.delete(['/behrouz', 'behrouz1/b1'], recurse=True):
    print p
print '*' * 40
# retrieving data from hdfs:
#copying files from HDFS to Local file system:
for f in client.copyToLocal(['/user/cloudera/wordCount.out'],
                            '/home/cloudera/'):
    print f
print '*' * 40
#######
#reading contents of a file
for l in client.text(['/user/cloudera/testfile.txt']):
    print l
# the text method automatically decompresses gzip and bzip2 files
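# A hedged sketch (the .gz path is a placeholder, not from the original):
# reading a compressed file looks exactly the same, since text()
# decompresses transparently.
for l in client.text(['/user/cloudera/testfile.txt.gz']):
    print l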
Example #10
#!/usr/local/bin/python
from snakebite.client import Client

client = Client('localhost', 9000)

# text() automatically decompresses gzip and bzip2 files
for line in client.text(['/user/cbohara/book.txt']):
    print line
Example #11
from snakebite.client import Client
client = Client("10.21.137.43", 8020, use_trash=False)
for x in client.ls(['/']):
    print x

for x in client.text(['/testBrian/ttt.txt']):
    print x

# client API docs: https://snakebite.readthedocs.io/en/latest/client.html
# snakebite is a Spotify project
Example #12
from mrjob.job import MRJob
from snakebite.client import Client

client = Client('hadoop', 9000)
for l in client.text(['/input/word.txt']):
    print(l)

# class MRWordCount(MRJob):
#
#    def mapper(self,  _, line):
#       for word in line.split():
#          yield(word, 1)
#
#    def reducer(self, word, counts):
#       yield(word, sum(counts))
#
# if __name__ == '__main__':
#    MRWordCount.run()
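# A hedged note (not in the original): the commented-out MRWordCount job
# would typically be run as, e.g.:
#   python word_count.py word.txt
# (the script and input file names here are assumptions)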
Example #13
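# Assumed setup (the original example does not show it; the host and port
# below are placeholders): a snakebite client pointed at the NameNode.
from snakebite.client import Client
client = Client('namenode', 8020)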
# Create a couple of directories
for p in client.mkdir(['/student9_7/py_dir_01', '/student9_7/py_dir_02'],
                      create_parent=True):
    print(p)
'''
{'path': '/student9_7/py_dir_01', 'result': True}
{'path': '/student9_7/py_dir_02', 'result': True}
'''

# Delete the `py_dir_01` directory
for p in client.delete(['/student9_7/py_dir_01'], recurse=True):
    print(p)
'''
{'path': '/student9_7/py_dir_01', 'result': True}
'''

# Look at what the `test` file contains
for t in client.text(['/student9_7/test']):
    print(t)
'''
test file for hdfs
'''

# Copy the `test` file from HDFS to the local home directory under the name `retrived_file_via_py`
for f in client.copyToLocal(['/student9_7/test'], 'retrived_file_via_py'):
    print(f)
'''
{'path': '/home/student9_7/retrived_file_via_py', 'source_path': '/student9_7/test', 'result': True, 'error': ''}
'''