Example #1
class WhenTestingOpenOperation(unittest.TestCase):
    def setUp(self):

        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host,
                                       port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs'
        self.file_data = u'010101'
        self.response = MagicMock()
        self.response.content = self.file_data

    def test_read_throws_exception_for_not_ok(self):

        self.response.status_code = http_client.BAD_REQUEST
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.read_file(self.path)

    def test_read_returns_file(self):

        self.response.status_code = http_client.OK
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.read_file(self.path)
        self.assertEqual(result, self.file_data)
Example #2
class WhenTestingOpenOperation(unittest.TestCase):
    def setUp(self):
        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.path = 'user/hdfs'
        self.file_data = u'010101'
        self.response = MagicMock()
        self.response.content = self.file_data

    @patch.object(Session, 'get')
    def test_read_throws_exception_for_not_ok(self, mock_get):
        self.response.status_code = http_client.BAD_REQUEST
        mock_get.return_value = self.response
        with self.assertRaises(errors.PyWebHdfsException):
            self.webhdfs.read_file(self.path)

    @patch.object(Session, 'get')
    def test_read_returns_file(self, mock_get):
        self.response.status_code = http_client.OK
        mock_get.return_value = self.response
        result = self.webhdfs.read_file(self.path)
        self.assertEqual(result, self.file_data)

    @patch.object(Session, 'get')
    def test_stream_returns_generator(self, mock_get):
        self.response.status_code = http_client.OK
        mock_get.return_value = self.response
        result = self.webhdfs.stream_file(self.path)
        self.assertIsInstance(result, types.GeneratorType)
Example #3
def read_result_from_hdfs(username):
    result = ""
    to_return = {}
    file_path = "/jobs_done/" + username + "/part-00000"
    logger.debug("Reading file " + file_path + " from HDFS")
    try:
        logger.debug("Trying to connect to " + hdfs_namenodes[0] + " namenode")
        hdfs_client = PyWebHdfsClient(host=hdfs_namenodes[0],
                                      port='50070',
                                      user_name='xnet',
                                      timeout=100)
        result = hdfs_client.read_file(file_path)
    except (ActiveHostNotFound, ConnectionError) as e:
        to_return["details_1"] = str(e)
        try:
            logger.debug("Trying to connect to " + hdfs_namenodes[1] +
                         " namenode")
            hdfs_client = PyWebHdfsClient(host=hdfs_namenodes[1],
                                          port='50070',
                                          user_name='xnet',
                                          timeout=100)
            result = hdfs_client.read_file(file_path)
        except (ActiveHostNotFound, ConnectionError) as e2:
            to_return[
                "error"] = "There was a problem while trying to read result from HDFS."
            to_return["details2"] = str(e2)
            logger.debug(str(to_return))
            return False, to_return

    return True, result
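
The nested try/except above hard-codes exactly two namenodes. Below is a minimal sketch, not taken from the original project, that generalizes the same failover idea to any list of namenodes; it reuses the port, timeout, and 'xnet' user from the snippet, but the function name and error keys are illustrative, and it catches the broader PyWebHdfsException instead of ActiveHostNotFound.

from requests.exceptions import ConnectionError
from pywebhdfs.webhdfs import PyWebHdfsClient
from pywebhdfs.errors import PyWebHdfsException


def read_from_any_namenode(hdfs_namenodes, file_path, user_name='xnet'):
    # Try each namenode in turn; return (True, data) or (False, error details).
    details = {}
    for index, host in enumerate(hdfs_namenodes, start=1):
        try:
            client = PyWebHdfsClient(host=host, port='50070',
                                     user_name=user_name, timeout=100)
            return True, client.read_file(file_path)
        except (PyWebHdfsException, ConnectionError) as exc:
            details['details_{0}'.format(index)] = str(exc)
    details["error"] = "There was a problem while trying to read result from HDFS."
    return False, details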
Example #4
class WhenTestingOpenOperation(unittest.TestCase):

    def setUp(self):

        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs'
        self.file_data = '010101'
        self.response = MagicMock()
        self.response.text = self.file_data

    def test_read_throws_exception_for_not_ok(self):

        self.response.status_code = httplib.BAD_REQUEST
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.read_file(self.path)

    def test_read_returns_file(self):

        self.response.status_code = httplib.OK
        self.requests.get.return_value = self.response
        with patch('pywebhdfs.webhdfs.requests', self.requests):
            result = self.webhdfs.read_file(self.path)
        self.assertEqual(result, self.file_data)
Example #5
File: hdfs.py Project: wmastersonV/pymc
def load(name,
         chains=None,
         model=None,
         host='localhost',
         port='50070',
         user_name=None):
    '''
	Load text database

	Parameters
	----------
	name : str
		Path to root directory in HDFS for text database without a leading '/'
	chains : list
		Chains to load. If None, all chains are loaded
	model : Model
		If None, the model is taken from the 'with' context
	host : str
		The IP address or hostname of the HDFS namenode. By default,
		it is 'localhost'
	port : str
		The port number for WebHDFS on the namenode. By default, it
		is '50070'
	user_name : str
		WebHDFS user_name used for authentication. By default, it is
		None

	Returns
	-------
	ndarray.Trace instance
	'''
    hdfs = PyWebHdfsClient(host=host, port=port, user_name=user_name)
    chain_dirs = _get_chain_dirs(name, hdfs)
    if chains is None:
        chains = list(chain_dirs.keys())
    traces = []
    for chain in chains:
        chain_dir = chain_dirs[chain]
        dir_path = os.path.join(name, chain_dir)
        shape_file = os.path.join(dir_path, 'shapes.json')
        shapes = json.load(StringIO.StringIO(hdfs.read_file(shape_file)))
        samples = {}
        for varname, shape in shapes.items():
            var_file = os.path.join(dir_path, varname + '.txt')
            samples[varname] = np.loadtxt(
                StringIO.StringIO(str(
                    hdfs.read_file(var_file)))).reshape(shape)
        trace = NDArray(model=model)
        trace.samples = samples
        trace.chain = chain
        traces.append(trace)
    return base.MultiTrace(traces)
Example #6
def downParts(fpath):
    'Download the part-0??? series of files that Spark wrote to HDFS, concatenating them into a single local file as they are downloaded.'
    from pywebhdfs.webhdfs import PyWebHdfsClient
    hdfs = PyWebHdfsClient(host='iasp76', port='12003', user_name='mci')
    flist = hdfs.list_dir(fpath)
    x = flist['FileStatuses']['FileStatus']
    _SUCCESS = False
    for f in x:
        if f['pathSuffix'] == '_SUCCESS':
            _SUCCESS = True
            break
    if not _SUCCESS:
        print("not complete yet!")
        return
    fnames = [
        f['pathSuffix'] for f in x if f['pathSuffix'].startswith('part-')
    ]
    fnames1 = sorted(fnames)
    foutname = fpath[fpath.rfind('/') + 1:]
    l = len(fnames1)
    with open(foutname, "wb") as fo:
        for fname in fnames1:
            fpath1 = fpath + "/" + fname
            fo.write(hdfs.read_file(fpath1))
            print(" progress: ", fname, l)
Example #7
    def get_file_contents(self,
                          hdfs_path,
                          user_name='trifacta',
                          httpfs_port='14000'):
        hdfs = PyWebHdfsClient(host=urlparse(self.trifacta_base_url).netloc,
                               port=httpfs_port,
                               user_name=user_name)
        return hdfs.read_file(hdfs_path).decode('utf-8')
Example #8
def from_hdfs(hdfs_path, file_path):
    hdfs = PyWebHdfsClient(host='hdfs-v1',
                           port='50070',
                           user_name='hdfs',
                           timeout=100)
    binary_file = hdfs.read_file(hdfs_path)
    with open(file_path, 'wb') as f:
        f.write(binary_file)
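
from_hdfs above only covers the download direction. As a rough counterpart under the same assumptions (same host, port, user name, and timeout), the sketch below pushes a local file back to HDFS with create_file, whose overwrite keyword appears in later examples; the function name is hypothetical.

from pywebhdfs.webhdfs import PyWebHdfsClient


def to_hdfs(file_path, hdfs_path):
    # Hypothetical reverse of from_hdfs: upload a local file to HDFS.
    hdfs = PyWebHdfsClient(host='hdfs-v1', port='50070',
                           user_name='hdfs', timeout=100)
    with open(file_path, 'rb') as f:
        hdfs.create_file(hdfs_path, f, overwrite=True)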
Example #9
    def run(self):
        if ("agg" in self.arg):
            #reading from a file to memory to stream later
            with open(self.path,"rb") as f:
                self.data_holder['data'] = json.dumps(cPickle.load(f));
            #indicating that reading in memory is finished for this data  
            self.data_holder["indicator"]='ready'; 
        
        elif("raw" in self.arg):
            from pywebhdfs.webhdfs import PyWebHdfsClient;
            hdfs = PyWebHdfsClient(host='cshadoop.boisestate.edu',port='50070', user_name='uacharya');
            
            file_path = 'user/uacharya/flow/'+str(self.arg['d'])+'/node_'+str(self.arg['n'])+'/output.csv'
            #reading the csv files in the memory
            self.data_holder['data']= hdfs.read_file(file_path,buffersize=4096) 
                
            self.data_holder["indicator"]='ready'; 
            
        elif("bitmap" in self.arg):
            #putting the line data into a object to stream
            with open(self.path+"/data.json","rb")as f:
                self.data_holder['data'] = json.dumps(cPickle.load(f));          
#             with open(self.path+"\\data.json","rb")as f:
#                 output = cPickle.load(f);  
            #not loading images into memory if there is none images
            if(self.data_holder['data']=='""'):
                #indicating that reading in memory is finished for this data  
                self.data_holder['frames']=(0,[]);
                self.data_holder["indicator"]='ready'; 
                return;
#             if(not output):
#                 self.data_holder['data']= msgpack.packb(output,use_bin_type=True);
#                 self.data_holder["indicator"]='ready'; 
#                 return;     
            #just in case there is some data to stream add all the PNGS to a list   
#             output['frames']=[];
            content_length =0; #calculate the content length in bytes of all images to stream in total
            PNGS=[]; #list to hold all the pngs data in memory
            #reading all the images to memory to stream
            for x in xrange(1,31):
                buf_string = cStringIO.StringIO();
                Image.open(self.path+"/imgs/"+str(x)+".png").save(buf_string, format="PNG", quality=100);
                content_length = content_length+(buf_string.tell()+4); 
                PNGS.append(struct.pack('>I',buf_string.tell())+buf_string.getvalue());
                buf_string.close();
#             for x in xrange(1,31):
#                 buf_string = cStringIO.StringIO();
#                 Image.open(self.path+"\\imgs\\"+str(x)+".png").save(buf_string, format="PNG", quality=100);
#                 output['frames'].append(buf_string.getvalue());
#                 buf_string.close();
                
            self.data_holder['frames']=(content_length,PNGS);
#             self.data_holder['data']=msgpack.packb(output,use_bin_type=True);
            #indicating that reading in memory is finished for this data  
            self.data_holder["indicator"]='ready'; 
                
        else:
            raise InvalidFormatError("the type of format is not available to read in memory");
Example #10
File: hdfs.py Project: bkanuka/pymc
def load(name, chains=None, model=None, host='localhost', port='50070', user_name=None):
	'''
	Load text database

	Parameters
	----------
	name : str
		Path to root directory in HDFS for text database without a leading '/'
	chains : list
		Chains to load. If None, all chains are loaded
	model : Model
		If None, the model is taken from the 'with' context
	host : str
		The IP address or hostname of the HDFS namenode. By default,
		it is 'localhost'
	port : str
		The port number for WebHDFS on the namenode. By default, it
		is '50070'
	user_name : str
		WebHDFS user_name used for authentication. By default, it is
		None

	Returns
	-------
	ndarray.Trace instance
	'''
	hdfs = PyWebHdfsClient(host=host, port=port, user_name=user_name)
	chain_dirs = _get_chain_dirs(name, hdfs)
	if chains is None:
		chains = list(chain_dirs.keys())
	traces = []
	for chain in chains:
		chain_dir = chain_dirs[chain]
		dir_path = os.path.join(name, chain_dir)
		shape_file = os.path.join(dir_path, 'shapes.json')
		shapes = json.load(StringIO.StringIO(hdfs.read_file(shape_file)))
		samples = {}
		for varname, shape in shapes.items():
			var_file = os.path.join(dir_path, varname + '.txt')
			samples[varname] = np.loadtxt(StringIO.StringIO(str(hdfs.read_file(var_file)))).reshape(shape)
		trace = NDArray(model=model)
		trace.samples = samples
		trace.chain = chain
		traces.append(trace)
	return base.MultiTrace(traces)
Example #11
class HdfsHandler(object):
    def __init__(self):
        self._HDFS = PyWebHdfsClient(host='10.81.1.160',
                                     port='50070',
                                     user_name='hdfs')

    def readFile(self, file):
        dirToRead = "%s/%s" % (LOG_ROOT_DIR, file)
        dataOut = self._HDFS.list_dir(dirToRead)
        fileToRead = "%s/%s" % (
            dirToRead, dataOut['FileStatuses']['FileStatus'][1]['pathSuffix'])
        return self._HDFS.read_file(fileToRead)
Example #12
class WhenTestingOpenOperation(unittest.TestCase):

    def setUp(self):

        self.host = 'hostname'
        self.port = '00000'
        self.user_name = 'username'
        self.webhdfs = PyWebHdfsClient(host=self.host, port=self.port,
                                       user_name=self.user_name)
        self.response = MagicMock()
        self.requests = MagicMock(return_value=self.response)
        self.path = 'user/hdfs'
        self.file_data = u'010101'
        self.response = MagicMock()
        self.response.content = self.file_data

    def test_read_throws_exception_for_not_ok(self):

        self.response.status_code = http_client.BAD_REQUEST
        self.requests.return_value = self.response
        with patch('requests.sessions.Session.get', self.requests):
            with self.assertRaises(errors.PyWebHdfsException):
                self.webhdfs.read_file(self.path)

    def test_read_returns_file(self):

        self.response.status_code = http_client.OK
        self.requests.return_value = self.response
        with patch('requests.sessions.Session.get', self.requests):
            result = self.webhdfs.read_file(self.path)
        self.assertEqual(result, self.file_data)

    def test_stream_returns_generator(self):

        self.response.status_code = http_client.OK
        self.requests.return_value = self.response
        with patch('requests.sessions.Session.get', self.requests):
            result = self.webhdfs.stream_file(self.path)
        self.assertIsInstance(result, types.GeneratorType)
Example #13
def getFromStore(meta,rowkey):
    if 'pp:HDFSpath' in meta.keys():
        # retrieve from HDFS
        hdfs = PyWebHdfsClient(host=Master,port='50070', timeout=None,user_name='hduser')
        file = hdfs.read_file(meta['pp:HDFSpath'])
        app.logger.debug(">> READ from HDFS %s",type(file))
        return file
    else:
        #retrieve from Hbase
        con=happybase.Connection(MasterHbase)
        con.open()
        enc_table = con.table('EncTable')
        row_enc = enc_table.row(rowkey)
        con.close()
        app.logger.debug(">> READ from Hbase %s",type(row_enc['enc:data']))
        return row_enc['enc:data']
Example #14
def getOrigin(fileid):
    '''

        hdfs applied
        flask.send_file -> hdfs.read_file

        :param fileid:
        :return:
    '''
    hdfs = PyWebHdfsClient(host='localhost', port='50070')
    origin = Origin.query.filter(Origin.originID == fileid).first()
    path = join(origin.originPath, origin.originName)
    path = path.replace(app.config['UPLOAD_FOLDER'], "")
    path = "/uploads" + str(path)
    # return send_file(path)
    return hdfs.read_file(path)
Example #15
def getCompare(fileid):
    '''

        hdfs applied
        flask.send_file -> hdfs.read_file

        :param fileid:
        :return:
    '''
    hdfs = PyWebHdfsClient(host='localhost', port='50070')
    compare = Compare.query.filter(Compare.compID == fileid).first()
    path = join(compare.compPath, compare.compName)
    path = path.replace(app.config['UPLOAD_FOLDER'], "")
    path = "/uploads" + str(path)
    # return send_file(path)
    return hdfs.read_file(path)
Example #16
from pywebhdfs.webhdfs import PyWebHdfsClient

hdfs = PyWebHdfsClient(host='localhost',port='50070',user_name='vagrant')
my_file = 'user/vagrant/hdfs-test/data.dat'

print 'Status of file: ', my_file
status = hdfs.get_file_dir_status(my_file)
print status

print 'Second 500 bytes of file: ',my_file
data = hdfs.read_file(my_file,offset=500,length=500)

print data
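
The offset/length form of read_file shown above also lends itself to chunked reads. Here is a minimal sketch, not taken from any of the listed projects, of a generator that walks a file in fixed-size pieces, mirroring the stream_file_to_disk helpers further down; the chunk size and connection parameters are placeholders.

from pywebhdfs.webhdfs import PyWebHdfsClient


def read_in_chunks(client, path, chunk_size=64 * 1024):
    # Yield the file at `path` piece by piece via read_file(offset, length).
    offset = 0
    while True:
        chunk = client.read_file(path, offset=offset, length=chunk_size)
        yield chunk
        if len(chunk) < chunk_size:
            break
        offset += chunk_size

# Assumed usage with the connection from the example above:
# hdfs = PyWebHdfsClient(host='localhost', port='50070', user_name='vagrant')
# with open('data.dat', 'wb') as out:
#     for chunk in read_in_chunks(hdfs, 'user/vagrant/hdfs-test/data.dat'):
#         out.write(chunk)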
Example #17
class HadoopFileSystem(BaseFs.FileSystem):
    def __init__(self,
                 vcPath,
                 simulateOnly=False,
                 isVerbose=False,
                 logger=None,
                 user=None,
                 host=None,
                 port=None):
        BaseFs.FileSystem.__init__(self, simulateOnly, isVerbose, logger)
        config = Config.Config()
        hdfsUser, hdfsHost, hdfsPort = config.getHadoopConfig(user, host, port)
        self.hdfs = PyWebHdfsClient(host=hdfsHost,
                                    port=hdfsPort,
                                    user_name=hdfsUser)
        self.vcPath = vcPath

    def make_fd(self, path, isSrc, dstDirMustExist):
        fd = None
        try:
            fd = HadoopFileDescriptor(self, path, isSrc, dstDirMustExist)
        except pywebhdfs.errors.FileNotFound:
            self.logger.info("DESC: does not exist: " + path)
            raise Errors.FileNotFound("Path {0} does not exist".format(path))
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(path, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(path, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(
                path, e))
            raise Errors.BadConnection(
                "Connection error while looking for path: {0}, exc={1}".format(
                    path, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                path, e))
            raise Errors.FsException(
                "An exception happened while looking for path: {0}, exc={1}".
                format(path, e))
        return fd

    def exists_file_dir(self, fd):
        try:
            return self.hdfs.exists_file_dir(fd.abspath)
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(
                    fd.abspath, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.BadConnection(
                "Connection error during HDFS exists test: {0}, exc={1}".
                format(fd.abspath, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.FsException(
                "An exception happened during HDFS exists test: {0}, exc={1}".
                format(fd.abspath, e))

    def delete_file_dir(self, fd, recursive=False, force=False):
        if self.simulateOnly:
            print("SIMULATE -> remove file/dir: {0}, recursive={1}".format(
                fd.abspath, recursive))
        else:
            try:
                if not recursive or force or \
                        query_yes_no(question="Are you sure you want to delete folder recursively?", default="no"):
                    status = self.hdfs.delete_file_dir(fd.abspath,
                                                       recursive=recursive)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS delete directory: {0}, exc={1}"
                    .format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS delete directory: {0}, exc={1}"
                    .format(fd.abspath, e))

    def list_dir(self, fd):
        try:
            status = self.hdfs.list_dir(fd.abspath)
        except pywebhdfs.errors.Unauthorized as e:
            self.logger.info("Unauthorized for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.Unauthorized(
                "Unauthorized access to the path {0}: {1}".format(
                    fd.abspath, e))
        except requests.exceptions.RequestException as e:
            self.logger.info("ConnectionError for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.BadConnection(
                "Connection error while looking for path: {0}, exc={1}".format(
                    fd.abspath, e))
        except pywebhdfs.errors.PyWebHdfsException as e:
            self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                fd.abspath, e))
            raise Errors.FsException(
                "An exception happened while looking for path: {0}, exc={1}".
                format(fd.abspath, e))
        currentDir = status["FileStatuses"]["FileStatus"]
        for item in currentDir:
            yield HadoopFileDescriptor(self,
                                       fd.abspath,
                                       isSrc=True,
                                       needsDstDirCheck=False,
                                       fileJson=item)

    def make_dir(self, path):
        if self.simulateOnly:
            print("SIMULATE -> make dir: " + path)
        else:
            try:
                self.hdfs.make_dir(path)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    path, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(path, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    path, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS create directory: {0}, exc={1}"
                    .format(path, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    path, e))
                raise Errors.FsException(
                    "An exception happened during HDFS create directory: {0}, exc={1}"
                    .format(path, e))

    def open_file(self, fd, rwMode):
        return fd

    def close_file(self, fd):
        pass

    def touch_file(self, fd):
        if self.simulateOnly:
            print("SIMULATE -> touch file: " + fd.abspath)
        else:
            try:
                self.hdfs.create_file(fd.abspath, 0, overwrite=True)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS create file: {0}, exc={1}".
                    format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS create file: {0}, exc={1}"
                    .format(fd.abspath, e))

    def truncate_file(self, fd, size):
        if self.simulateOnly:
            print("SIMULATE -> truncate file: {0}, size={1}".format(
                fd.abspath, size))
        else:
            try:
                self.hdfs.truncate_file(fd.abspath, size)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS truncate file: {0}, exc={1}".
                    format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS truncate file: {0}, exc={1}"
                    .format(fd.abspath, e))

    def try_concat_files(self, fd, chunkFdList):
        # Workaround for unordered concat bug in Hadoop 2.7.1 is to use one source at the time
        # https://issues.apache.org/jira/browse/HDFS-8891
        currIndex = 0
        concatStep = 20
        chunkedList = [
            chunkFdList[pos:pos + concatStep]
            for pos in range(0, len(chunkFdList), concatStep)
        ]
        for sourceChunk in chunkedList:
            try:
                self.concat_files(fd, sourceChunk)
                currIndex += len(sourceChunk)
            except Errors.FsException as e:
                break

        return currIndex

    def concat_files(self, fd, chunkFdList):
        strList = list()
        for chunkFd in chunkFdList:
            strList.append(chunkFd.abspath)

        if self.simulateOnly:
            print("SIMULATE -> concat file: {0}, sources={1}".format(
                fd.abspath, ",".join(strList)))
        else:
            try:
                self.hdfs.concat_files(fd.abspath, strList)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS concat file: {0}, exc={1}".
                    format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS concat file: {0}, exc={1}"
                    .format(fd.abspath, e))

    def read_data(self, fd, offset, size):
        if offset >= fd.size:
            return ""
        else:
            try:
                contents = self.hdfs.read_file(fd.abspath,
                                               offset=offset,
                                               length=size)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS read file: {0}, exc={1}".
                    format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS read file: {0}, exc={1}"
                    .format(fd.abspath, e))
            return contents

    def append_data(self, fd, data):
        if self.simulateOnly:
            print("SIMULATE -> write file data: " + fd.abspath)
        else:
            try:
                self.hdfs.append_file(fd.abspath, data)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        fd.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS append file: {0}, exc={1}".
                    format(fd.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    fd.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS append file: {0}, exc={1}"
                    .format(fd.abspath, e))

    def local_mv_file(self, src, dst):
        if self.simulateOnly:
            print("SIMULATE -> local move file: {0} -> {1} ".format(
                src.abspath, dst.abspath))
        else:
            try:
                self.hdfs.rename_file_dir(src.abspath, dst.abspath)
            except pywebhdfs.errors.Unauthorized as e:
                self.logger.info("Unauthorized for path {0}: {1}".format(
                    src.abspath, e))
                raise Errors.Unauthorized(
                    "Unauthorized access to the path {0}: {1}".format(
                        src.abspath, e))
            except requests.exceptions.RequestException as e:
                self.logger.info("ConnectionError for path {0}: {1}".format(
                    src.abspath, e))
                raise Errors.BadConnection(
                    "Connection error during HDFS rename file: {0}, exc={1}".
                    format(src.abspath, e))
            except pywebhdfs.errors.PyWebHdfsException as e:
                self.logger.info("PyWebHdfsException for path {0}: {1}".format(
                    src.abspath, e))
                raise Errors.FsException(
                    "An exception happened during HDFS rename file: {0}, exc={1}"
                    .format(src.abspath, e))

    def local_cp_file(self, src, dst):
        # This is an open issue in Hadoop community: https://issues.apache.org/jira/browse/HDFS-3370
        # Instead, we can do a symbolic link
        if self.simulateOnly:
            print("SIMULATE -> local copy file: {0} -> {1} ".format(
                src.abspath, dst.abspath))
        else:
            print(
                "Copy within HDFS is not supported due to lack of Hadoop support"
            )
            print(
                "Once symbolic links are enabled, this feature will be enabled"
            )
            sys.exit(1)
            # self.hdfs.create_sym_link(src.abspath, dst.abspath, createParent=True)

    def get_hdfs_file_dir_json(self, path):
        try:
            status = self.hdfs.get_file_dir_status(path)
            return status["FileStatus"]
        except pywebhdfs.errors.FileNotFound:
            return None

    def validate_hdfs_arg(self, arg):
        if not arg.startswith(self.vcPath):
            print("Error: You don't have permissions to the path: %s" % arg)
            print("Your path must be rooted under: %s" % self.vcPath)
            sys.exit(1)
Example #18
File: DMS.py Project: lukkiddd/DMSHadoop
class DMS:
    def __init__(self, debug=0):
        ''' This function use to init a class. To show an error messages debug
        should be 1.
        :param : debug - 1, show an error or success message. 0 otherwise
        :return: Nothing.
        '''
        self.debug = debug
        pass

    def hbase_connection(self, host, port, table='dms'):
        ''' This function use to establish a connection to hbase, for preparing to
        insert, remove, fetch data from hbase. We use starbase for connect to hbase
        via rest api.(See more: https://github.com/barseghyanartur/starbase)
        :param : host - hbase rest host
        :param : port - hbase rest running port
        :param : table - DMS table on hbase (default: 'dms')
        :return: Nothing.
        '''
        self.hbase = hbaseConnection(host=host, port=port)
        t = self.hbase.table(table)
        if (not t.exists()):
            t.create('meta_data','file')
        self.hbase_table = t

    def hdfs_connection(self, host, port, user_name, hdfs_path='/tmp/'):
        ''' This function use to establish a connection to hdfs, for preparing to
        create, retrieve, update, delete file in hdfs. We use pywebhdfs in order to
        do this task via hdfs rest api.(See more: http://pythonhosted.org/pywebhdfs/)
        :param : host - hdfs rest host
        :param : port - hdfs rest running port
        :param : user_name - hdfs username (for authentication)
        :param : hdfs_path - location to store files. (default: '/tmp/')
        :return: Nothing.
        '''
        self.hdfs = PyWebHdfsClient(host=host, port=port, user_name=user_name)
        self.hdfs_path = hdfs_path

    def solr_connection(self, host, port, collection):
        ''' This function use to establish a connection to solr, for query or
        search any text on a system.
        :param : host - solr's host
        :param : port - solr's running port
        :param : collection - solr's collection for searching
        '''
        self.solr = ''.join(['http://',host,':',port,'/solr/',collection])

    def extract(self, file):
        ''' This function use to extract meta data from a file. We use hachoir3 library
        to extract them. (See more: http://hachoir3.readthedocs.org)
        :param : file - file for extract
        :return: meta data as dict for success, 0 if fail.
        '''
        try:
            filename, realname = unicodeFilename(file), file
            parser = createParser(filename, realname)
            meta_data = extractMetadata(parser)
            meta_data_text = meta_data.exportPlaintext()
            meta_list = dict()
            for i in range(1, len(meta_data_text)):
                meta_split = meta_data_text[i].split(":")
                column = meta_split[0].replace('- ', '')
                value = meta_split[1].lstrip()
                meta_list.update({column: value})
            return meta_list
        except:
            if self.debug:
                print "Something went wrong, meta data of", file, "could not extract."
            return None


    def upload(self, file):
        ''' This function use to uplaod a file to hdfs and store meta data on hbase
        Meta data consist of 2 main parts: file's meta data and hdfs's file's meta data.
        This function will increase a file version if it is already store in hbase.
        :param : file - file's name
        :return: True if success otherwise False.
        '''
        version = 1
        key = ''.join(['v',str(version),'.',file])
        path = ''.join([self.hdfs_path,key])

        # Read a file
        try:
            f = open(file,'r')
            file_content = f.read()
            f.close()
        except:
            print "Cannot read file:",file

        # Check file's version
        while self.hbase_table.fetch(key) != None:
            version = int(self.get_lastest_version(file)) + 1
            key = ''.join(['v',str(version),'.',file])
            path = ''.join([self.hdfs_path,key])

        # Try to upload file.
        try:
            self.hdfs.create_file(path,file_content)
            hdfs_meta = self.hdfs.get_file_dir_status(path)['FileStatus']
            file_meta = self.extract(file)
            t = self.hbase_table
            status = t.insert(
                key,
                {
                    'file': {'content': file_content}
                }
            )
            if status != 200:
                if self.debug:
                    print "Error inserting: file content"
            # save hbase meta data
            for i in range(0,len(file_meta.keys())):
                status = t.insert(
                    key,
                    {
                        'meta_data': {file_meta.keys()[i]: file_meta[file_meta.keys()[i]]}
                    }
                )
                if status != 200:
                    if self.debug:
                        print "Error inserting:", file_meta.keys()[i]
            # save hdfs meta data
            for i in range(0,len(hdfs_meta.keys())):
                status = t.insert(
                    key,
                    {
                        'meta_data': {hdfs_meta.keys()[i]: hdfs_meta[hdfs_meta.keys()[i]]}
                    }
                )
                if status != 200:
                    if self.debug:
                        print "Error inserting:", hdfs_meta.keys()[i]
            # save version
            status = t.insert(
                key,
                {
                    'meta_data': {'version': version}
                }
            )
            if status != 200:
                if self.debug:
                    print "Error inserting: version"
        except:
            if self.debug:
                print "Upload failed."
            return False
        if self.debug:
            print "[Uploaded]", file, "version:", version
        return True

    def download(self, file, version=None, download_dir=''):
        ''' This function use to retrieve or download file from hdfs. Then save
        it as a new file named (v[version].[file] - For example, v1.mytext.txt).
        You can specify the directory of downloaded file. You can also specify
        file's version for downloading if not it will be version 1.
        :param : file - file's name
        :param : version - file's version (default: 1)
        :param : download_dir - download directory (default: '' or current directory
                 NOTE: it must end with '/' - For example, '../download/')
        :return: True if success otherwise false.
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v',str(version),'.',file])
        path = ''.join([self.hdfs_path,key])
        downloaded_file = ''.join([download_dir,key])
        try:
            f = open(downloaded_file, 'w')
            f.write(self.hdfs.read_file(path))
            f.close()
        except:
            if self.debug:
                print "Cannot download a file:", file
            return False
        if self.debug:
            print "[Downloaded]",key
        return True

    def update(self, file, version=None):
        ''' This function use to update file to hdfs and data stored in hbase by
        overwrite that file on hdfs, and also insert new data to hbase too. You can
        specify a file's version in order to update it.
        :param : file - file's name
        :param : version - file's version
        :return: True if success otherwise False.
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v',str(version),'.',file])
        path = ''.join([self.hdfs_path,key])

        # Read a file
        try:
            f = open(file,'r')
            file_content = f.read()
            f.close()
        except:
            print "Cannot read file:",file

        # Try to upload file.
        try:
            self.hdfs.create_file(path,file_content,overwrite=True)
            hdfs_meta = self.hdfs.get_file_dir_status(path)['FileStatus']
            file_meta = self.extract(file)
            t = self.hbase_table
            status = t.insert(
                key,
                {
                    'file': {'content': file_content,
                             'name': file}
                }
            )
            if status != 200:
                if self.debug:
                    print "Error inserting: file content"

            # save hbase meta data
            for i in range(0,len(file_meta.keys())):
                status = t.insert(
                    key,
                    {
                        'meta_data': {file_meta.keys()[i]: file_meta[file_meta.keys()[i]]}
                    }
                )
                if status != 200:
                    if self.debug:
                        print "Error inserting:", file_meta.keys()[i]
            # save hdfs meta data
            for i in range(0,len(hdfs_meta.keys())):
                status = t.insert(
                    key,
                    {
                        'meta_data': {hdfs_meta.keys()[i]: hdfs_meta[hdfs_meta.keys()[i]]}
                    }
                )
                if status != 200:
                    if self.debug:
                        print "Error inserting:", hdfs_meta.keys()[i]
            # save version
            status = t.insert(
                key,
                {
                    'meta_data': {'version': version}
                }
            )
            if status != 200:
                if self.debug:
                    print "Error inserting: version"
        except:
            if self.debug:
                print "Update failed."
            return False
        if self.debug:
            print "[Updated]", file, "version:", version
        return True

    def delete(self, file, version=None):
        ''' This function use to delete file in hbase, and hdfs. You can specify
        file's version in order to delete it.
        :param : file - file's name
        :param : version - file's version
        :return: True if succes otherwise False.
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v',str(version),'.',file])
        path = ''.join([self.hdfs_path,key])

        # Check if file exists
        if self.hbase_table.fetch(key) == None:
            if self.debug:
                print "Cannot delete.",key,"is not exists."
            return False

        # Remove row on hbase
        t = self.hbase_table
        if t.remove(key) != 200:
            if self.debug:
                print "[HBASE] cannot remove a row key:",key
            return False

        # Delete file on hdfs
        if not self.hdfs.delete_file_dir(path):
            if self.debug:
                print "[HDFS] Cannot remove a file path:",path
            return False
        if self.debug:
            print "[Deleted]", file, "version:", version
        return True

    def get_file_meta_data(self, file, version=None):
        ''' This function use to get all file's meta_data from hbase. You can
        specify a file's version.
        :param : file - file's name
        :param : version - file's version
        :return: meta data as dict for success, 0 if fail
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v',str(version),'.',file])
        if not self.hbase_table.fetch(key):
            if self.debug:
                print key,"is not exists"
            return False
        return self.hbase_table.fetch(key)['meta_data']

    def get_file_content(self, file, version=None):
        ''' This function use to get all file's content from hbase. You can
        specify a file's version.
        :param : file - file's name
        :param : version - file's version
        :return: meta data as dict for success, 0 if fail
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v',str(version),'.',file])
        if not self.hbase_table.fetch(key):
            if self.debug:
                print key,"is not exists"
            return False
        return self.hbase_table.fetch(key)['file']

    def search(self, text):
        ''' This function will search in xxxx via solr rest api.
        :param : text - text for searching
        :return: json response from solr, False for not found.
        '''
        query = urlopen(''.join([self.solr,'/select?q=',text,'&wt=json']))
        response = simplejson.load(query)
        if response['response']['numFound'] == 0:
            if self.debug:
                print text,"not found!"
            return False
        return response

    def get_all_file(self):
        ''' This function return all files that stored on Hbase in a list format.
        :param : Nothing.
        :return: fetch result as a list.
        '''
        rf = '{"type": "RowFilter", "op": "EQUAL", "comparator": {"type": "RegexStringComparator", "value": ""}}'
        t = self.hbase_table
        result = t.fetch_all_rows(with_row_id=True, filter_string=rf)
        return list(result)

    def get_file_version(self, file):
        ''' This function will fetch data from file name then return them.
        :param : file - file's name
        :return: file_list with version as a dict.
        '''
        rf = ''.join(['{"type": "RowFilter", "op": "EQUAL", "comparator": {"type": "RegexStringComparator", "value": "',file,'"}}'])
        t = self.hbase_table
        result = t.fetch_all_rows(with_row_id=True, filter_string=rf)
        lsr = list(result)
        file_version = list()
        for i in range(0,len(lsr)):
            file_version.append(lsr[i].keys()[0].split('.')[0].split('v')[1])
        file_list = dict()
        file_list['name'] = file
        file_list['version'] = file_version
        return file_list

    def get_lastest_version(self, file):
        ''' This function will return a lastest version number as integer.
        :param : file - file's name
        :return: version number as an integer.
        '''
        file_version = self.get_file_version(file)
        file_version['version'].sort()
        return file_version['version'][len(file_version['version'])-1]

    def delete_all_version(self, file):
        ''' This function will delete all file's version in an hbase and HDFS
        :param : file - file's name
        :return: True if success otherwise False
        '''
        self.get_file_version(file)['version'].sort()
        for version in self.get_file_version(file)['version']:
            try:
                self.delete(file,version)
            except:
                return False
        return True

    def delete_all(self):
        ''' This function will delete all the files on an hbase and hdfs.
        :param : Nothing
        :return: True if success otherwise False
        '''
        for full_file in self.get_all_file():
            file = full_file.keys()[0].split('.')[1]
            version = full_file.keys()[0].split('.')[0].split('v')[1]
            try:
                self.delete(file,version)
            except:
                return False
        return True
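
A hypothetical end-to-end use of the DMS class above, pieced together from its docstrings; the hosts, ports, and file name are placeholders rather than values from the lukkiddd/DMSHadoop project, and the print follows the class's Python 2 style.

dms = DMS(debug=1)
dms.hbase_connection(host='hbase-rest-host', port='8080', table='dms')
dms.hdfs_connection(host='namenode-host', port='50070', user_name='hduser',
                    hdfs_path='/tmp/')
dms.solr_connection(host='solr-host', port='8983', collection='dms')

dms.upload('mytext.txt')                      # stored as v1.mytext.txt
dms.download('mytext.txt', download_dir='')   # fetches the latest version
print dms.get_file_meta_data('mytext.txt')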
Example #19
print(file_checksum)

# append to the file created in previous step
print('appending to file at: {0}\n'.format(example_file))
hdfs.append_file(example_file, example_data)

file_status = hdfs.get_file_dir_status(example_file)
print(file_status)

# checksum reflects file changes
file_checksum = hdfs.get_file_checksum(example_file)
print(file_checksum)

# read in the data for the file
print('reading data from file at: {0}\n'.format(example_file))
file_data = hdfs.read_file(example_file)
print(file_data)

# rename the example_dir
print('renaming directory from {0} to {1}\n'.format(example_dir, rename_dir))
hdfs.rename_file_dir(example_dir, '/{0}'.format(rename_dir))

# list the contents of the new directory
listdir_stats = hdfs.list_dir(rename_dir)
print(listdir_stats)

example_file = '{dir}/example.txt'.format(dir=rename_dir)

# delete the example file
print('deleting example file at: {0}'.format(example_file))
hdfs.delete_file_dir(example_file)
Example #20
from pywebhdfs.webhdfs import PyWebHdfsClient
from requests_kerberos import HTTPKerberosAuth

# Create Client connection to Kerberos Hadoop Cluster
auth = HTTPKerberosAuth()
devHdfs = PyWebHdfsClient(host='devHost',
                          port='portNumber',
                          user_name='userName',
                          request_extra_opts={'auth': auth})
prdHdfs = PyWebHdfsClient(host='prodHost',
                          port='portNumber',
                          user_name='userName',
                          request_extra_opts={'auth': auth})

userDir = '/user/path'
userFileName = 'DemoFile1'
print(devHdfs.read_file(userDir + userFileName))

prdDir = '/projects/'
prdFileName = 'fileName.txt'
print(prdHdfs.read_file(prdDir + prdFileName))
Example #21
import json
import pyspark
import numpy as np
import pyspark.sql.functions as F
from joblib import load
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pywebhdfs.webhdfs import PyWebHdfsClient
from io import BytesIO

hdfs = PyWebHdfsClient(host='192.168.100.38',
                       port='50070',
                       user_name='vagrant')
bytes = BytesIO(hdfs.read_file('models/without-scaler/RandomForest.joblib'))
clf = load(bytes)


@F.udf(returnType=IntegerType())
def predict_udf(data):
    print(data)
    return 1
    # pred = clf.predict([data])
    # return int(pred[0])


spark = (SparkSession.builder.master('spark://192.168.100.38:7077').appName(
    'MalwareDetection'
).config("spark.driver.memory", "512m").config(
    "spark.jars.packages",
Example #22
class HDFS(object):
    def __init__(self, host, port, user):
        self._hdfs = PyWebHdfsClient(host=host,
                                     port=port,
                                     user_name=user,
                                     timeout=None)
        logging.debug('webhdfs = %s@%s:%s', user, host, port)

    def recursive_copy(self,
                       local_path,
                       remote_path,
                       exclude=None,
                       permission=755):

        if exclude is None:
            exclude = []

        c_path = canonicalize(remote_path)
        logging.debug('making %s', c_path)
        self._hdfs.make_dir(c_path, permission=permission)

        fs_g = os.walk(local_path)
        for dpath, dnames, fnames in fs_g:
            _, relative_path = dpath.split(local_path)
            for dname in dnames:
                if dname not in exclude:
                    c_path = canonicalize('%s/%s/%s' %
                                          (remote_path, relative_path, dname))
                    logging.debug('making %s', c_path)
                    self._hdfs.make_dir(c_path, permission=permission)

            for fname in fnames:
                if fname not in exclude:
                    data = file(
                        canonicalize('%s/%s/%s' %
                                     (local_path, relative_path, fname)), 'rb')
                    c_path = canonicalize('%s/%s/%s' %
                                          (remote_path, relative_path, fname))
                    logging.debug('creating %s', c_path)
                    self._hdfs.create_file(c_path,
                                           data,
                                           overwrite=True,
                                           permission=permission)
                    data.close()

    def make_dir(self, path, permission=755):

        logging.debug('make_dir: %s', path)

        self._hdfs.make_dir(canonicalize(path), permission=permission)

    def create_file(self, data, remote_file_path, permission=755):

        logging.debug('create_file: %s', remote_file_path)

        sio = BytesIO(data)

        self._hdfs.create_file(canonicalize(remote_file_path),
                               sio,
                               overwrite=True,
                               permission=permission)

    def append_file(self, data, remote_file_path):

        logging.debug('append to: %s', remote_file_path)

        self._hdfs.append_file(canonicalize(remote_file_path), data)

    def stream_file_to_disk(self, remote_file_path, local_file_path):
        chunk_size = 10 * 1024 * 1024
        offset = 0
        with open(local_file_path, 'wb') as dest_file:
            data = self._hdfs.read_file(canonicalize(remote_file_path),
                                        offset=offset,
                                        length=chunk_size)
            while True:
                dest_file.write(data)
                if len(data) < chunk_size:
                    break
                offset += chunk_size
                data = self._hdfs.read_file(canonicalize(remote_file_path),
                                            offset=offset,
                                            length=chunk_size)

    def read_file(self, remote_file_path):

        data = self._hdfs.read_file(canonicalize(remote_file_path))

        return data

    def remove(self, path, recursive=False):

        logging.debug('remove: %s', path)

        self._hdfs.delete_file_dir(canonicalize(path), recursive)

    def file_exists(self, path):

        try:
            self._hdfs.get_file_dir_status(path)
            return True
        except:
            return False
Example #23
import re
import pickle
THRIFTNODE='data2'
client=PyWebHdfsClient(host='namenode',port='50070',user_name='root')
conn=happybase.Connection(THRIFTNODE) 
crawls=conn.table('crawls')
MAXLOCALLINKCOUNT = 30
timeout = 5
socket.setdefaulttimeout(timeout)
DATESTRING=str(time.strftime('%Y%m%d'))
ANET=187
for BNET in range(5,10):
  SCANSITESFILE=str(ANET)+'-'+str(BNET)+'-p80.log'
  FNAME='user/root/scans/'+str(ANET)+'/'+SCANSITESFILE
  SSFP=open(SCANSITESFILE,'w')
  SSFP.write(client.read_file(FNAME))
  SSFP.close()
  try:
   ifp=open(SCANSITESFILE,'r')
  except:
    continue
  for line in ifp:
    line = line.strip().split()
    url = 'http://'+str(line[1])+'/'
    req = urllib2.Request(url)
    html = ''
    try:
      html = urllib2.urlopen(req)
    except:
      print ' url open exception on '+str(url)
      continue
Example #24
import re
import pickle
THRIFTNODE = 'data2'
client = PyWebHdfsClient(host='namenode', port='50070', user_name='root')
conn = happybase.Connection(THRIFTNODE)
crawls = conn.table('crawls')
MAXLOCALLINKCOUNT = 30
timeout = 5
socket.setdefaulttimeout(timeout)
DATESTRING = str(time.strftime('%Y%m%d'))
ANET = 187
for BNET in range(5, 10):
    SCANSITESFILE = str(ANET) + '-' + str(BNET) + '-p80.log'
    FNAME = 'user/root/scans/' + str(ANET) + '/' + SCANSITESFILE
    SSFP = open(SCANSITESFILE, 'w')
    SSFP.write(client.read_file(FNAME))
    SSFP.close()
    try:
        ifp = open(SCANSITESFILE, 'r')
    except:
        continue
    for line in ifp:
        line = line.strip().split()
        url = 'http://' + str(line[1]) + '/'
        req = urllib2.Request(url)
        html = ''
        try:
            html = urllib2.urlopen(req)
        except:
            print ' url open exception on ' + str(url)
            continue
Example #25
class HDFS(object):
    def __init__(self, host, port, user):
        self._hdfs = PyWebHdfsClient(
            host=host, port=port, user_name=user, timeout=None)
        logging.debug('webhdfs = %s@%s:%s', user, host, port)

    def recursive_copy(self, local_path, remote_path, exclude=None):

        if exclude is None:
            exclude = []

        c_path = canonicalize(remote_path)
        logging.debug('making %s', c_path)
        self._hdfs.make_dir(c_path)

        fs_g = os.walk(local_path)
        for dpath, dnames, fnames in fs_g:
            _, relative_path = dpath.split(local_path)
            for dname in dnames:
                if dname not in exclude:
                    c_path = canonicalize(
                        '%s/%s/%s' %
                        (remote_path, relative_path, dname))
                    logging.debug('making %s', c_path)
                    self._hdfs.make_dir(c_path)

            for fname in fnames:
                if fname not in exclude:
                    data = file(
                        canonicalize(
                            '%s/%s/%s' %
                            (local_path, relative_path, fname)), 'rb')
                    c_path = canonicalize(
                        '%s/%s/%s' %
                        (remote_path, relative_path, fname))
                    logging.debug('creating %s', c_path)
                    self._hdfs.create_file(c_path, data, overwrite=True)
                    data.close()

    def make_dir(self, path):

        logging.debug('make_dir: %s', path)

        self._hdfs.make_dir(canonicalize(path))

    def create_file(self, data, remote_file_path):

        logging.debug('create_file: %s', remote_file_path)

        sio = StringIO.StringIO(data)

        self._hdfs.create_file(
            canonicalize(remote_file_path),
            sio,
            overwrite=True)

    def append_file(self, data, remote_file_path):

        logging.debug('append to: %s', remote_file_path)

        self._hdfs.append_file(canonicalize(remote_file_path), data)


    def stream_file_to_disk(self, remote_file_path, local_file_path):
        chunk_size = 10*1024*1024
        offset = 0
        with open(local_file_path, 'wb') as dest_file:
            data = self._hdfs.read_file(canonicalize(remote_file_path), offset=offset, length=chunk_size)
            while True:
                dest_file.write(data)
                if len(data) < chunk_size:
                    break
                offset += chunk_size
                data = self._hdfs.read_file(canonicalize(remote_file_path), offset=offset, length=chunk_size)

    def read_file(self, remote_file_path):

        data = self._hdfs.read_file(canonicalize(remote_file_path))

        return data

    def remove(self, path, recursive=False):

        logging.debug('remove: %s', path)

        self._hdfs.delete_file_dir(canonicalize(path), recursive)
Example #26
class HdfsApi:
    def __init__(self,
                 request_timeout=10,
                 logger=logging,
                 active_nn_host='localhost',
                 kerberos=False):
        self.timeout = request_timeout
        self.hdfs_schema = os.environ.get('HDFS_NAMENODE_SCHEMA', 'http')
        self.hdfs_host = active_nn_host
        self.hdfs_port = os.environ.get('HDFS_NAMENODE_PORT', 50070)
        if kerberos:
            extra_opts = {
                'auth':
                HTTPKerberosAuth(mutual_authentication=OPTIONAL,
                                 sanitize_mutual_error_response=False,
                                 force_preemptive=True)
            }
        else:
            extra_opts = {}
        self.webhdfs = PyWebHdfsClient(host=self.hdfs_host,
                                       port=self.hdfs_port,
                                       request_extra_opts=extra_opts)
        self.logger = logger

    def request_namenode(self, path, method='GET', headers=None, **kwargs):
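        # Build the full namenode URL (unless an absolute http(s) URL was passed
        # in) and raise HdfsRequestError on any HTTP error status.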
        self.logger.info("Calling HDFS API ({0})".format(path))
        if headers is None:
            headers = dict()

        if path.startswith('http'):
            hdfs_url = path
        else:
            hdfs_url = '{0}://{1}:{2}/{3}'.format(self.hdfs_schema,
                                                  self.hdfs_host,
                                                  self.hdfs_port, path)
        self.logger.debug(hdfs_url)
        r = requests.request(method,
                             hdfs_url,
                             headers=headers,
                             timeout=self.timeout,
                             verify=False,
                             auth=HTTPKerberosAuth(),
                             **kwargs)
        return self._check_response_status(r)

    def request_webhdfs_status(self, path):
        return self.webhdfs.get_file_dir_status(path)

    def _check_response_status(self, response):
        self.logger.debug(response.text)
        if response.status_code >= 400:
            self.logger.error(
                "HdfsResponse returned with error status [{0}], response was: {1}"
                .format(response.status_code, response.text))
            raise HdfsRequestError(
                "HdfsResponse returned with error status [{0}]".format(
                    response.status_code))
        return response

    def get_block_info_for_file(self, file_path):
        path = "fsck"
        params = {'files': 0, 'racks': 1, 'blocks': 0, 'path': file_path}

        response = self.request_namenode(path, params=params)
        return response

    @staticmethod
    def get_first_block_info(filename, block_info):
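        # The regex captures the line that immediately follows the file's own line
        # in the fsck output, which describes the file's first block.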
        regex = r"^{0}.*\n(.*)\n".format(filename)
        info_of_first_block = re.findall(regex, block_info, re.MULTILINE)
        if len(info_of_first_block) < 1:
            raise HdfsRequestError(
                "No block information found for file {0} in {1}".format(
                    filename, block_info))
        return info_of_first_block[0]

    @staticmethod
    def get_location_of_first_block(block_info):
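        # Grab the first IPv4 address that is immediately followed by a port
        # separator (':') in the block's location string.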
        ip_regex = r"(?<!\-)(\d{2,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})(?=:)"
        block_locations = re.findall(ip_regex, block_info)
        if len(block_locations) < 1:
            raise HdfsRequestError(
                "No block location information found in {0}".format(
                    block_info))
        return block_locations[0]

    @staticmethod
    def get_host_by_ip(ip):
        host_info = socket.gethostbyaddr(ip)
        if len(host_info) < 1:
            raise HdfsRequestError(
                "Unable to get hostname form ip {0}".format(ip))
        return host_info[0]

    @staticmethod
    def calculate_md5(file, block_size=65536):
        hash_builder = hashlib.md5()
        for block in iter(lambda: file.read(block_size), b""):
            hash_builder.update(block)
        md5 = hash_builder.hexdigest()
        file.seek(0)
        return md5

    @staticmethod
    def create_temp_file():
        return tempfile.NamedTemporaryFile(suffix='.temporary',
                                           prefix='hdfs-smoketest-api-')

    def create_temp_file_of_size(self, temp_file_size):
        tmp = self.create_temp_file()
        tmp.seek(temp_file_size * 1024 * 1024)
        tmp.write(b'1')
        tmp.seek(0)

        return tmp

    def copy_to_hdfs(self, remote_path, tmpfile):
        self.webhdfs.create_file(remote_path,
                                 file_data=tmpfile,
                                 overwrite=True)

    def create_hdfs_file_of_size_in_mb(self, path, size=300):
        with self.create_temp_file_of_size(size) as tmp_file:
            md5_of_tmp_file = self.calculate_md5(tmp_file)
            self.copy_to_hdfs(path, tmp_file)

        return md5_of_tmp_file

    def get_remote_file(self, path):
        return self.webhdfs.read_file(path)

    def write_remote_file_to_local_temp(self, remote_path):
        local = self.create_temp_file()
        file = self.get_remote_file(remote_path)
        local.write(file)
        local.seek(0)
        return local

    def get_hdfsfile_and_calc_md5(self, path):
        with self.write_remote_file_to_local_temp(path) as temp_file:
            return self.calculate_md5(temp_file)

    def cleanup_remote_file(self, path, recursive=False):
        self.webhdfs.delete_file_dir(path, recursive=recursive)

    def get_host_location_of_first_block(self, filename):
        file_block_info = self.get_block_info_for_file(filename)
        file_first_block_info = self.get_first_block_info(
            filename, file_block_info.text)
        file_block_ip = self.get_location_of_first_block(file_first_block_info)
        return self.get_host_by_ip(file_block_ip)
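
# A minimal smoke-test sketch using the class above; the namenode host and the
# HDFS path are placeholders, and a running cluster is assumed.
api = HdfsApi(active_nn_host='namenode.example.com')
remote_path = '/tmp/hdfs-smoketest/example.dat'
md5_written = api.create_hdfs_file_of_size_in_mb(remote_path, size=10)
md5_read = api.get_hdfsfile_and_calc_md5(remote_path)
assert md5_written == md5_read, 'round-trip checksum mismatch'
print(api.get_host_location_of_first_block(remote_path))
api.cleanup_remote_file(remote_path, recursive=False)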
Example #27
# -*- coding: UTF8 -*-

from pywebhdfs.webhdfs import PyWebHdfsClient

my_file = '/di/logs/raw_logs/rings.event/day=20160429/2016042900rings.event.1461859205239'

hdfs = PyWebHdfsClient(host='10.160.241.61', port='14000', user_name='hdfs')
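# Note: port 14000 above is typically an HttpFS gateway rather than the
# namenode's default WebHDFS port (50070); the REST API is the same either way.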
hdfs.read_file(my_file)
Example #28
from pywebhdfs.webhdfs import PyWebHdfsClient
import logging
from pprint import pprint

logging.basicConfig(level=logging.DEBUG)
_LOG = logging.getLogger(__name__)


# host = your server address
hdfs = PyWebHdfsClient(host='', port='50070', user_name='hduser', timeout=4)  # your Namenode IP & username here
my_dir = '/user/hduser/sample'
fileFinal = my_dir + '/file.txt'
pprint(hdfs.list_dir(my_dir))


dir_status = hdfs.get_file_dir_status(my_dir)
print dir_status
print "Reading file from hadoop hdfs"
file_data = hdfs.read_file("user/hduser/sample/file.txt")

print file_data
Example #29
class DMS:
    def __init__(self, debug=0):
        ''' This function is used to initialize the class. To show error or
        success messages, set debug to 1.
        :param : debug - 1, show error or success messages. 0 otherwise
        :return: Nothing.
        '''
        self.debug = debug
        pass

    def hbase_connection(self, host, port, table='dms'):
        ''' This function is used to establish a connection to hbase, preparing to
        insert, remove and fetch data from hbase. We use starbase to connect to
        hbase via its rest api. (See more: https://github.com/barseghyanartur/starbase)
        :param : host - hbase rest host
        :param : port - hbase rest running port
        :param : table - DMS table on hbase (default: 'dms')
        :return: Nothing.
        '''
        self.hbase = hbaseConnection(host=host, port=port)
        t = self.hbase.table(table)
        if (not t.exists()):
            t.create('meta_data', 'file')
        self.hbase_table = t

    def hdfs_connection(self, host, port, user_name, hdfs_path='/tmp/'):
        ''' This function is used to establish a connection to hdfs, preparing to
        create, retrieve, update and delete files in hdfs. We use pywebhdfs to do
        this via the hdfs rest api. (See more: http://pythonhosted.org/pywebhdfs/)
        :param : host - hdfs rest host
        :param : port - hdfs rest running port
        :param : user_name - hdfs username (for authentication)
        :param : hdfs_path - location to store files. (default: '/tmp/')
        :return: Nothing.
        '''
        self.hdfs = PyWebHdfsClient(host=host, port=port, user_name=user_name)
        self.hdfs_path = hdfs_path

    def solr_connection(self, host, port, collection):
        ''' This function is used to establish a connection to solr, for querying
        or searching any text in the system.
        :param : host - solr's host
        :param : port - solr's running port
        :param : collection - solr's collection for searching
        '''
        self.solr = ''.join(['http://', host, ':', port, '/solr/', collection])

    def extract(self, file):
        ''' This function is used to extract meta data from a file, using the
        hachoir3 library. (See more: http://hachoir3.readthedocs.org)
        :param : file - file to extract from
        :return: meta data as a dict on success, None if it fails.
        '''
        try:
            filename, realname = unicodeFilename(file), file
            parser = createParser(filename, realname)
            meta_data = extractMetadata(parser)
            meta_data_text = meta_data.exportPlaintext()
            meta_list = dict()
            for i in range(1, len(meta_data_text)):
                meta_split = meta_data_text[i].split(":")
                column = meta_split[0].replace('- ', '')
                value = meta_split[1].lstrip()
                meta_list.update({column: value})
            return meta_list
        except:
            if self.debug:
                print "Something went wrong, meta data of", file, "could not extract."
            return None

    def upload(self, file):
        ''' This function is used to upload a file to hdfs and store its meta data
        in hbase. Meta data consists of 2 main parts: the file's own meta data and
        the hdfs file status. This function increases the file version if the file
        is already stored in hbase.
        :param : file - file's name
        :return: True if success otherwise False.
        '''
        version = 1
        key = ''.join(['v', str(version), '.', file])
        path = ''.join([self.hdfs_path, key])

        # Read a file
        try:
            f = open(file, 'r')
            file_content = f.read()
            f.close()
        except:
            print "Cannot read file:", file
            return False

        # Check file's version
        while self.hbase_table.fetch(key) is not None:
            version = int(self.get_lastest_version(file)) + 1
            key = ''.join(['v', str(version), '.', file])
            path = ''.join([self.hdfs_path, key])

        # Try to upload file.
        try:
            self.hdfs.create_file(path, file_content)
            hdfs_meta = self.hdfs.get_file_dir_status(path)['FileStatus']
            file_meta = self.extract(file)
            t = self.hbase_table
            status = t.insert(key, {'file': {'content': file_content}})
            if status != 200:
                if self.debug:
                    print "Error inserting: file content"
            # save hbase meta data
            for i in range(0, len(file_meta.keys())):
                status = t.insert(
                    key, {
                        'meta_data': {
                            file_meta.keys()[i]: file_meta[file_meta.keys()[i]]
                        }
                    })
                if status != 200:
                    if self.debug:
                        print "Error inserting:", file_meta.keys()[i]
            # save hdfs meta data
            for i in range(0, len(hdfs_meta.keys())):
                status = t.insert(
                    key, {
                        'meta_data': {
                            hdfs_meta.keys()[i]: hdfs_meta[hdfs_meta.keys()[i]]
                        }
                    })
                if status != 200:
                    if self.debug:
                        print "Error inserting:", hdfs_meta.keys()[i]
            # save version
            status = t.insert(key, {'meta_data': {'version': version}})
            if status != 200:
                if self.debug:
                    print "Error inserting: version"
        except:
            if self.debug:
                print "Upload failed."
            return False
        if self.debug:
            print "[Uploaded]", file, "version:", version
        return True

    def download(self, file, version=None, download_dir=''):
        ''' This function is used to retrieve or download a file from hdfs and save
        it as a new file named v[version].[file] - for example, v1.mytext.txt.
        You can specify the directory for the downloaded file and the file version
        to download; if no version is given, the latest version is used.
        :param : file - file's name
        :param : version - file's version (default: latest version)
        :param : download_dir - download directory (default: '' or current directory
                 NOTE: it must end with '/' - For example, '../download/')
        :return: True if success otherwise false.
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v', str(version), '.', file])
        path = ''.join([self.hdfs_path, key])
        downloaded_file = ''.join([download_dir, key])
        try:
            f = open(downloaded_file, 'w')
            f.write(self.hdfs.read_file(path))
            f.close()
        except:
            if self.debug:
                print "Cannot download a file:", file
            return False
        if self.debug:
            print "[Downloaded]", key
        return True

    def update(self, file, version=None):
        ''' This function is used to update a file on hdfs and the data stored in
        hbase by overwriting the file on hdfs and inserting new data into hbase.
        You can specify a file's version in order to update it.
        :param : file - file's name
        :param : version - file's version
        :return: True if success otherwise False.
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v', str(version), '.', file])
        path = ''.join([self.hdfs_path, key])

        # Read a file
        try:
            f = open(file, 'r')
            file_content = f.read()
            f.close()
        except:
            print "Cannot read file:", file
            return False

        # Try to upload file.
        try:
            self.hdfs.create_file(path, file_content, overwrite=True)
            hdfs_meta = self.hdfs.get_file_dir_status(path)['FileStatus']
            file_meta = self.extract(file)
            t = self.hbase_table
            status = t.insert(
                key, {'file': {
                    'content': file_content,
                    'name': file
                }})
            if status != 200:
                if self.debug:
                    print "Error inserting: file content"

            # save hbase meta data
            for i in range(0, len(file_meta.keys())):
                status = t.insert(
                    key, {
                        'meta_data': {
                            file_meta.keys()[i]: file_meta[file_meta.keys()[i]]
                        }
                    })
                if status != 200:
                    if self.debug:
                        print "Error inserting:", file_meta.keys()[i]
            # save hdfs meta data
            for i in range(0, len(hdfs_meta.keys())):
                status = t.insert(
                    key, {
                        'meta_data': {
                            hdfs_meta.keys()[i]: hdfs_meta[hdfs_meta.keys()[i]]
                        }
                    })
                if status != 200:
                    if self.debug:
                        print "Error inserting:", hdfs_meta.keys()[i]
            # save version
            status = t.insert(key, {'meta_data': {'version': version}})
            if status != 200:
                if self.debug:
                    print "Error inserting: version"
        except:
            if self.debug:
                print "Update failed."
            return False
        if self.debug:
            print "[Updated]", file, "version:", version
        return True

    def delete(self, file, version=None):
        ''' This function is used to delete a file from hbase and hdfs. You can
        specify the file's version in order to delete it.
        :param : file - file's name
        :param : version - file's version
        :return: True if success otherwise False.
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v', str(version), '.', file])
        path = ''.join([self.hdfs_path, key])

        # Check if file exists
        if self.hbase_table.fetch(key) is None:
            if self.debug:
                print "Cannot delete:", key, "does not exist."
            return False

        # Remove row on hbase
        t = self.hbase_table
        if t.remove(key) != 200:
            if self.debug:
                print "[HBASE] cannot remove a row key:", key
            return False

        # Delete file on hdfs
        if not self.hdfs.delete_file_dir(path):
            if self.debug:
                print "[HDFS] Cannot remove a file path:", path
            return False
        if self.debug:
            print "[Deleted]", file, "version:", version
        return True

    def get_file_meta_data(self, file, version=None):
        ''' This function is used to get all of a file's meta_data from hbase. You
        can specify a file's version.
        :param : file - file's name
        :param : version - file's version
        :return: meta data as a dict on success, False otherwise
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v', str(version), '.', file])
        if not self.hbase_table.fetch(key):
            if self.debug:
                print key, "does not exist"
            return False
        return self.hbase_table.fetch(key)['meta_data']

    def get_file_content(self, file, version=None):
        ''' This function is used to get a file's content from hbase. You can
        specify a file's version.
        :param : file - file's name
        :param : version - file's version
        :return: file content as a dict on success, False otherwise
        '''
        if not version:
            version = self.get_lastest_version(file)
        key = ''.join(['v', str(version), '.', file])
        if not self.hbase_table.fetch(key):
            if self.debug:
                print key, "does not exist"
            return False
        return self.hbase_table.fetch(key)['file']

    def search(self, text):
        ''' This function searches the indexed documents via the solr rest api.
        :param : text - text to search for
        :return: json response from solr, False if nothing was found.
        '''
        query = urlopen(''.join([self.solr, '/select?q=', text, '&wt=json']))
        response = simplejson.load(query)
        if response['response']['numFound'] == 0:
            if self.debug:
                print text, "not found!"
            return False
        return response

    def get_all_file(self):
        ''' This function returns all files stored in hbase as a list.
        :param : Nothing.
        :return: fetch result as a list.
        '''
        rf = '{"type": "RowFilter", "op": "EQUAL", "comparator": {"type": "RegexStringComparator", "value": ""}}'
        t = self.hbase_table
        result = t.fetch_all_rows(with_row_id=True, filter_string=rf)
        return list(result)

    def get_file_version(self, file):
        ''' This function fetches all stored versions of the given file name.
        :param : file - file's name
        :return: a dict with the file name and its list of versions.
        '''
        rf = ''.join([
            '{"type": "RowFilter", "op": "EQUAL", "comparator": {"type": "RegexStringComparator", "value": "',
            file, '"}}'
        ])
        t = self.hbase_table
        result = t.fetch_all_rows(with_row_id=True, filter_string=rf)
        lsr = list(result)
        file_version = list()
        for i in range(0, len(lsr)):
            file_version.append(lsr[i].keys()[0].split('.')[0].split('v')[1])
        file_list = dict()
        file_list['name'] = file
        file_list['version'] = file_version
        return file_list

    def get_lastest_version(self, file):
        ''' This function returns the latest version number as an integer.
        :param : file - file's name
        :return: version number as an integer.
        '''
        file_version = self.get_file_version(file)
        file_version['version'].sort(key=int)
        return int(file_version['version'][-1])

    def delete_all_version(self, file):
        ''' This function deletes all versions of a file from hbase and hdfs.
        :param : file - file's name
        :return: True if success otherwise False
        '''
        for version in sorted(self.get_file_version(file)['version'], key=int):
            try:
                self.delete(file, version)
            except:
                return False
        return True

    def delete_all(self):
        ''' This function deletes all files from hbase and hdfs.
        :param : Nothing
        :return: True if success otherwise False
        '''
        for full_file in self.get_all_file():
            key = full_file.keys()[0]
            version = key.split('.')[0].split('v')[1]
            file = key.split('.', 1)[1]
            try:
                self.delete(file, version)
            except:
                return False
        return True
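
# A minimal usage sketch of the DMS class above; the hosts, ports and file name
# are placeholders, and running HBase REST, HDFS and Solr services are assumed.
dms = DMS(debug=1)
dms.hbase_connection(host='hbase-rest.example.com', port='8080')
dms.hdfs_connection(host='namenode.example.com', port='50070', user_name='hdfs')
dms.solr_connection(host='solr.example.com', port='8983', collection='dms')
dms.upload('mytext.txt')      # stores content and meta data, bumping the version
dms.download('mytext.txt')    # writes v<latest>.mytext.txt to the current directory
print(dms.get_file_meta_data('mytext.txt'))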
Example #30
print('making new file at: {0}\n'.format(example_file))
hdfs.create_file(example_file, example_data)

file_status = hdfs.get_file_dir_status(example_file)
print file_status

# append to the file created in previous step
print('appending to file at: {0}\n'.format(example_file))
hdfs.append_file(example_file, example_data)

file_status = hdfs.get_file_dir_status(example_file)
print file_status

# read in the data for the file
print('reading data from file at: {0}\n'.format(example_file))
file_data = hdfs.read_file(example_file)
print file_data

# rename the example_dir
print('renaming directory from {0} to {1}\n'.format(example_dir, rename_dir))
hdfs.rename_file_dir(example_dir, '/{0}'.format(rename_dir))

# list the contents of the new directory
listdir_stats = hdfs.list_dir(rename_dir)
print listdir_stats

example_file = '{dir}/example.txt'.format(dir=rename_dir)

# delete the example file
print('deleting example file at: {0}'.format(example_file))
hdfs.delete_file_dir(example_file)
Example #31
from pywebhdfs.webhdfs import PyWebHdfsClient

hdfs = PyWebHdfsClient(host='localhost', port='50070', user_name='vagrant')
my_file = 'user/vagrant/hdfs-test/data.dat'

print 'Status of file: ', my_file
status = hdfs.get_file_dir_status(my_file)
print status

print 'Second 500 bytes of file: ', my_file
data = hdfs.read_file(my_file, offset=500, length=500)

print data