Exemplo n.º 1
0
def download(hdfs_path, local_path):
    """Download a file from HDFS to the local filesystem.

    :param hdfs_path: source path on HDFS.
    :param local_path: destination path on the local filesystem; if it
        already exists the download is skipped.
    """
    # Check for the local file first so we do not build an HDFS
    # connection at all when there is nothing to download
    # (the original connected before checking).
    if os.path.exists(local_path):
        print('文件已存在')
        return
    # Initialize the WebHDFS connection.
    client = Client('http://fisher.lazybone.xyz:9001', root='/')
    client.download(hdfs_path=hdfs_path, local_path=local_path)
Exemplo n.º 2
0
 def get(self, request):
     """Serve a previously ingested HDFS file back to the client as a CSV
     attachment.

     Looks up the original file name via ``DataSource``, downloads the HDFS
     file into MEDIA_ROOT/hdfs_download, and streams it in the response.
     """
     client = Client(HDFS_HOST)
     hdfs = request.GET.get('hdfs')
     file_name = DataSource.objects.get(format_filename=hdfs).file_name
     client.download('/datahoop/' + hdfs, settings.MEDIA_ROOT + 'hdfs_download')
     path = os.path.join(settings.MEDIA_ROOT, 'hdfs_download')
     file = open(os.path.join(path, hdfs), 'rb')
     # BUG FIX: the original built FileResponse(file) and then immediately
     # replaced it with an empty HttpResponse, so the downloaded bytes were
     # never sent. Attach the file and the content type to one response.
     response = FileResponse(file, content_type='application/vnd.ms-csv')
     response['Content-Disposition'] = 'attachment; filename={}.csv'.format(file_name.split('.')[0])
     return response
Exemplo n.º 3
0
# Smoke-test script for the python `hdfs` WebHDFS client.
from hdfs import Client

# 9870 is the Hadoop 3.x NameNode HTTP port.
client = Client("http://master:9870")
# client.makedirs("/abc/xyz")
x = client.list("/")
# With status=True each entry is a (name, FileStatus-dict) pair.
y = client.list("/", status=True)
print(y[1][0])
print(y[1][1]["accessTime"])

# Round-trip a file: upload from CWD, download to a Windows path.
client.upload("/abc", "HDFSDao.py")
client.download("/abc/HDFSDao.py", "d:/ttt.py")

print("end___")
Exemplo n.º 4
0
class ChatBotModel(object):
    """Retrieval chatbot backed by an inverted index stored on HDFS.

    The index file maps a word to postings of the form
    ``file_name:byte_offset:score`` separated by tabs; answers are read
    back from the corpus files on HDFS at the stored byte offset.
    """

    def __init__(self,
                 hadoop_url,
                 hdfs_index_file,
                 local_index_file,
                 corpus_dir,
                 unk_answer='',
                 max_answer_len=1024):
        self.hadoop_url = hadoop_url          # WebHDFS endpoint URL
        self.hdfs_index_file = hdfs_index_file
        self.local_index_file = local_index_file
        self.corpus_dir = corpus_dir          # HDFS directory holding corpus files
        self.max_answer_len = max_answer_len  # bytes to read per answer
        self.unk_answer = unk_answer          # fallback when nothing matches
        self.client = None
        self.inverted_index = {}

    def build_connection(self):
        """Create the WebHDFS client (no I/O happens until it is used)."""
        self.client = Client(self.hadoop_url)

    def fetch_index_file(self):
        """Copy the inverted-index file from HDFS to the local path."""
        self.client.download(hdfs_path=self.hdfs_index_file,
                             local_path=self.local_index_file,
                             overwrite=True)

    def load_inverted_index(self):
        """Parse the local index file into ``self.inverted_index``.

        Each line is ``word\\tfile:offset:score\\tfile:offset:score...``.
        """
        with open(self.local_index_file, 'r', encoding='utf-8') as f:
            for line in f:
                word, *postings = line.strip().split('\t')
                for posting in postings:
                    file_name, query_id, score = posting.split(':')
                    # setdefault collapses the original duplicated
                    # "create list then append" branches into one path.
                    self.inverted_index.setdefault(word, []).append(
                        [file_name, int(query_id),
                         float(score)])

    def prepare(self):
        """Connect, fetch the index from HDFS, and load it into memory."""
        self.build_connection()
        self.fetch_index_file()
        self.load_inverted_index()

    def read_corpus_answer(self, file_name, query_id):
        """Read one answer line from a corpus file at byte offset `query_id`.

        Returns None when the offset lies beyond the end of the file.
        """
        file_path = os.path.join(self.corpus_dir, file_name)
        file_status = self.client.status(file_path)
        if file_status['length'] <= query_id:
            return None
        with self.client.read(hdfs_path=file_path,
                              offset=query_id,
                              length=self.max_answer_len,
                              encoding='utf-8') as f:
            # Only the first line of the window is the answer.
            answer = f.read().strip().split('\n')[0]
            return answer

    def predict_answer(self, query):
        """Return the best-scoring corpus answer for `query`.

        Scores are accumulated per (file, offset) over all query words;
        falls back to ``self.unk_answer`` when nothing matches.
        """
        words = jieba.lcut_for_search(query)
        candidates = {}
        for word in words:
            # BUG FIX: the original reused the name `query` for the
            # (file_name, query_id) key, shadowing the method parameter.
            for file_name, query_id, score in self.inverted_index.get(word, []):
                key = (file_name, query_id)
                candidates[key] = candidates.get(key, 0) + score
        if not candidates:
            return self.unk_answer
        best_query = max(candidates.items(), key=lambda x: x[1])
        (best_file_name, best_query_id), best_score = best_query
        best_answer = self.read_corpus_answer(best_file_name, best_query_id)
        if best_answer is None:
            best_answer = self.unk_answer
        return best_answer
Exemplo n.º 5
0
class HDFSUtil:
    """Thin convenience wrapper around the python `hdfs` WebHDFS client."""

    def __init__(self, url):
        self._client = Client(url)

    def make_dir(self, hdfs_path):
        """Create a directory, recursively creating missing parents.

        :param hdfs_path: target HDFS directory path.
        """
        self._client.makedirs(hdfs_path)

    def delete_hdfs_file(self, hdfs_path):
        """Delete a single HDFS entry (a directory must be empty).

        :param hdfs_path: HDFS path to remove.
        """
        self._client.delete(hdfs_path)

    def delete_hdfs_dir(self, hdfs_dir):
        """Delete an HDFS file or directory, recursing into non-empty dirs.

        :param hdfs_dir: HDFS path to remove.
        """
        children = self.hdfs_dir_list(hdfs_dir)
        if children:
            # Empty the directory first, then remove the directory itself.
            for child in children:
                self.delete_hdfs_dir(hdfs_dir + '/' + child)
            print('Delete Dir: {0}'.format(hdfs_dir))
        else:
            # Either a plain file or an already-empty directory.
            print('Delete File: {0}'.format(hdfs_dir))
        self._client.delete(hdfs_dir)

    def upload_to_hdfs(self, local_path, hdfs_path):
        """Upload a local file/directory; missing remote dirs are created.

        :param local_path: source on the local filesystem.
        :param hdfs_path: destination on HDFS.
        """
        self._client.upload(hdfs_path, local_path, cleanup=True)

    def download_from_hdfs(self, hdfs_path, local_path):
        """Download an HDFS file/directory to the local filesystem.

        :param hdfs_path: source on HDFS.
        :param local_path: destination on the local filesystem.
        """
        self._client.download(hdfs_path, local_path, overwrite=True)

    def write_to_hdfs(self, hdfs_path, data, overwrite=False, append=True):
        """Write `data` to an HDFS file.

        Append mode: overwrite=False, append=True (the default).
        Rewrite mode: overwrite=True, append=False.
        The two flags are mutually exclusive.

        :param hdfs_path: target HDFS file.
        :param data: payload to write.
        :param overwrite: Boolean, replace existing content.
        :param append: Boolean, append to existing content.
        """
        if not self._client.content(hdfs_path, strict=False):
            print('File Not exist in HDFS')
        self._client.write(hdfs_path, data, overwrite=overwrite, append=append)

    def move_or_rename(self, hdfs_src_path, hdfs_dst_path):
        """Move or rename an HDFS entry.

        :param hdfs_src_path: current path.
        :param hdfs_dst_path: new path.
        """
        self._client.rename(hdfs_src_path, hdfs_dst_path)

    def hdfs_dir_list(self, hdfs_path):
        """List the entry names under `hdfs_path`.

        :param hdfs_path: HDFS directory to list.
        :return: list of file names, or None when the path is not a directory.
        """
        try:
            return self._client.list(hdfs_path, status=False)
        except HdfsError:
            return None
Exemplo n.º 6
0
if __name__ == '__main__':
    hdfs_ip = "192.168.146.133"
    hdfs_version = 3
    hdfs_root = "~/test"
    filepath = r"C:\Users\daqige\PycharmProjects\newLeetCode\convert.py"
    # BUG FIX: the Hadoop 2.x NameNode HTTP port is 50070, not 90070
    # (9870 is correct for Hadoop 3.x).
    hdfs_addr = "http://" + hdfs_ip + ":" + str(9870 if
                                                (hdfs_version == 3) else 50070)

    client = Client(hdfs_addr)

    # print(client.list("/"))
    # client.makedirs(hdfs_root)
    # client.upload(hdfs_root, filepath)
    # client.rename(hdfs_root + "/convert.py", hdfs_root + "/ubuntu.py")

    print("下载文件")
    client.download(hdfs_root + "/ubuntu.py", ".")
    print(os.listdir("."))

    print("删除文件")
    client.delete(hdfs_root + "/ubuntu.py")
    print(client.list(hdfs_root))
Exemplo n.º 7
0
# Example usage of the python `hdfs` Client. The names `client`, `file_name`,
# `file_name2`, `file_dir` and `loacl_file_name` (pre-existing misspelling)
# are defined elsewhere.
# Fetch the target's status (strict=False returns None instead of raising).
info = client.status(file_name, strict=False)
print(info)

# Write to the file (overwrite existing content).
client.write(file_name, data="hello hdfs !", overwrite=True)

# Write to the file (append to existing content).
client.write(file_name, data="hello way !", overwrite=False, append=True)

# Read the file content back as text.
with client.read(file_name, encoding='utf-8') as f:
    print(f.read())

# Download the file to the local path.
client.download(file_name, loacl_file_name, overwrite=True)

# Upload the local file to HDFS.
client.upload(file_name + '111', loacl_file_name, cleanup=True)

# Delete a file.
client.delete(file_name2)

# Rename/move the file.
client.rename(file_name, file_name2)

# List the entries under a directory.
files = client.list(file_dir, status=False)
for file in files:
    print(file)
Exemplo n.º 8
0
class RF_HDFS(object):
    """Keyword-library wrapper around the python `hdfs` WebHDFS client.

    Every operation translates `HdfsError` (and any other exception) into
    `HdfsLibraryError` so callers only deal with one exception type.
    """

    def __init__(self):
        self.client = None      # hdfs.Client, set by connect_and_login
        self.directory = None

    def connect_and_login(self, **kwargs):
        """Open an authenticated WebHDFS session and return the client.

        Recognized keyword arguments: host, port, kdc, user, password,
        root, proxy, timeout.

        :return: the connected `hdfs.Client`.
        :raises HdfsLibraryError: on any connection/authentication failure.
        """
        import requests

        # BUG FIX: `kdc` was only assigned when present in kwargs, so a call
        # without 'kdc' raised NameError below; default every option uniformly.
        host = kwargs.get('host')
        port = kwargs.get('port')
        kdc = kwargs.get('kdc')
        user = kwargs.get('user')
        password = kwargs.get('password')
        root = kwargs.get('root')
        proxy = kwargs.get('proxy')
        timeout = kwargs.get('timeout')

        self.session = requests.Session()
        # NOTE(review): pool_maxsize=0 looks unusual for a connection pool —
        # kept as-is; confirm the intended pool size.
        adapter = requests.adapters.HTTPAdapter(pool_maxsize=0)
        self.session.mount('http://',  adapter)
        self.session.mount('https://', adapter)
        self.session.headers.update({'Connection': 'Keep-Alive'})

        self.connectionStatus = False
        try:
            # BUG FIX: int(None) raised TypeError when no timeout was given;
            # only convert when a value was actually supplied.
            if timeout is not None:
                timeout = int(timeout)
            url = "http://" + host + ":" + str(port)

            hdfsLogin = WebHDFS(url, kdc)
            cookieStr = hdfsLogin.authenticate(user, password)
            if cookieStr is not None:
                # The cookie arrives as "name=value"; install it on the session.
                cookieList = cookieStr.split('=', 1)
                cookieDict = {cookieList[0]: cookieList[1]}
                requests.utils.add_dict_to_cookiejar(self.session.cookies, cookieDict)

            self.client = Client(url, root=root, proxy=proxy, timeout=timeout, session=self.session)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

        self.connectionStatus = True
        return self.client

    def checkConnectionStatus(self):
        """Return True when connect_and_login completed successfully."""
        return self.connectionStatus

    def list_dir(self, directory):
        """List `directory` (or the client root when None) with full status."""
        output = []
        try:
            if directory is not None:
                output = self.client.list(directory, status=True)
            else:
                output = self.client.list(self.client.root, status=True)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def list_names(self, directory):
        """List `directory` (or the client root when None), names only."""
        output = []
        try:
            if directory is not None:
                output = self.client.list(directory, status=False)
            else:
                output = self.client.list(self.client.root, status=False)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def upload(self, remote_path, local_path, overwrite=False, permission=None):
        """Upload `local_path` to `remote_path`; returns the remote path."""
        output = None
        try:
            output = self.client.upload(remote_path, local_path, overwrite, permission=permission)
        except HdfsError as hdfsError:
            # For some reason this exception includes the entire stack trace after
            # the error message, so split on '\n' and only return the first line.
            error = str(hdfsError).splitlines()[0]
            raise HdfsLibraryError(error)
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def download(self, remote_path, local_path, overwrite=False):
        """Download `remote_path` to `local_path`; returns the local path."""
        output = None
        try:
            output = self.client.download(remote_path, local_path, overwrite)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def mkdir(self, directory, permission):
        """Create `directory` (and parents) with the given permission."""
        try:
            # no return value
            self.client.makedirs(directory, permission=permission)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def rmdir(self, directory):
        """Recursively delete `directory`; raises if it does not exist."""
        try:
            # BUG FIX: the original passed `directory` as a second argument to
            # HdfsLibraryError instead of %-formatting it into the message.
            if not self.client.delete(directory, recursive=True):
                raise HdfsLibraryError("Directory does not exist: %r" % directory)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def rename(self, src_file, dst_file):
        """Rename/move `src_file` to `dst_file`."""
        try:
            # no return value
            self.client.rename(src_file, dst_file)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def delete(self, file):
        """Delete `file`; raises if it does not exist."""
        try:
            # BUG FIX: same unformatted %r message as rmdir.
            if not self.client.delete(file):
                raise HdfsLibraryError("File does not exist: %r" % file)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def set_time(self, file, mod_time):
        """Set the modification time of `file` (-1 leaves access time unchanged)."""
        try:
            # no return value
            self.client.set_times(file, -1, mod_time)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def set_owner(self, file, owner, group):
        """Set the owner and group of `file`."""
        try:
            # no return value
            self.client.set_owner(file, owner=owner, group=group)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def set_permission(self, file, permission):
        """Set the octal permission of `file`."""
        try:
            # no return value
            self.client.set_permission(file, permission=permission)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def set_acl(self, file, aclspec):
        """Apply the ACL spec string to `file`."""
        try:
            # no return value
            self.client.set_acl(file, aclspec=aclspec)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))

    def status(self, path):
        """Return the FileStatus dict for `path`."""
        output = ''
        try:
            output = self.client.status(path)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def checksum(self, path):
        """Return the HDFS checksum info for `path`."""
        output = ''
        try:
            output = self.client.checksum(path)
        except HdfsError as hdfsError:
            raise HdfsLibraryError(str(hdfsError))
        except Exception as exception:
            raise HdfsLibraryError(str(exception))
        return output

    def close(self):
        """Close the underlying HTTP session."""
        self.session.close()