Example #1
    def __push_fields(self, hdfs_host: str, fields: Dict[str, Field]):
        fs = HdfsClient(hdfs_host)
        fs.mkdirs('/'.join(self.fields_path.split('/')[:-1]))
        fs.delete(self.fields_path)
        dicted_fields = {k: self.field_to_dict(v) for k, v in fields.items()}
        fs.create(self.fields_path, json.dumps(dicted_fields))

        fs = HdfsClient(self.C3_HDFS_HOST, user_name=self.user_name)
        fs.mkdirs('/'.join(self.c3_fields_path.split('/')[:-1]))
        fs.delete(self.c3_fields_path)
        c3_dicted_fields = {}
        for k, value in dicted_fields.items():
            if value['use_vocab']:
                max_vocab_index = len(value['vocab']['itos'])
                value['max_vocab_index'] = max_vocab_index
                value['dtype'] = str(torch.int64)
                vocab = value['vocab']
                for tok in self.FIELDS_TOKEN_ATTRS:
                    if value[tok]:
                        value[tok] = vocab['stoi'][value[tok]]
                value.pop('vocab')
                value['use_vocab'] = False
            else:
                value['max_vocab_index'] = 1
            c3_dicted_fields[k] = value
        fs.create(self.c3_fields_path, json.dumps(c3_dicted_fields))
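The mkdirs/delete/create sequence above is the usual way to replace a file on HDFS with pyhdfs. A minimal standalone sketch of the same pattern, with a hypothetical host, path and payload (none of them come from the original class):

import json
import posixpath
from pyhdfs import HdfsClient

def replace_json_file(hdfs_host, hdfs_path, payload):
    """Write payload as JSON to hdfs_path, replacing any existing file."""
    fs = HdfsClient(hdfs_host)
    # ensure the parent directory exists (mkdirs is harmless if it already does)
    fs.mkdirs(posixpath.dirname(hdfs_path))
    # drop the stale copy, then write the new content as UTF-8 bytes
    if fs.exists(hdfs_path):
        fs.delete(hdfs_path)
    fs.create(hdfs_path, json.dumps(payload).encode('utf-8'))

replace_json_file('namenode:9870', '/tmp/example/fields.json', {'answer': 42})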
Example #2
class HDFSService:

    namenode_host = "localhost"
    namenode_port = "9870"
    root_folder = "/"
    chunk_size = 100000

    def __init__(self):
        self._client = HdfsClient(hosts=f"{self.namenode_host}:{self.namenode_port}", user_name="root")

    def get(self, hdfs_path: str):
        file_size = self.get_file_size(hdfs_path)
        for i in range(0, file_size, self.chunk_size):
            # length is the number of bytes to read starting at offset, not an end position
            file_response = self._client.open(hdfs_path, offset=i, length=self.chunk_size)
            yield file_response.read()
        
    def append(self, hdfs_path: str, data: bytes):
        self.create_if_not_exist(hdfs_path)
        self._client.append(hdfs_path, data)
    
    def create_if_not_exist(self, hdfs_path: str):
        if not self._client.exists(hdfs_path):
            self._client.create(hdfs_path, b"")

    def get_messages_number(self, hdfs_path: str):
        # number of chunks needed to cover the file (ceiling division)
        return -(-self.get_file_size(hdfs_path) // self.chunk_size)

    def get_file_size(self, hdfs_path):
        file_infos = self._client.get_content_summary(hdfs_path)
        return file_infos.length

    def test(self):
        pass
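A hedged usage sketch for the class above, assuming the default localhost:9870 WebHDFS endpoint is reachable and that /logs/app.log is a placeholder path:

service = HDFSService()

# append creates the file on first use, then adds to it
service.append("/logs/app.log", b"first line\n")
service.append("/logs/app.log", b"second line\n")

# get() streams the file back in chunk_size pieces; join them to get the full content
data = b"".join(service.get("/logs/app.log"))
print(data.decode("utf-8"))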
Example #3
def upload_txt_to_hdfs(arr):
    client = HdfsClient(hosts="localhost:50070", user_name="Alphalbj")
    name = "/words/words-" + datetime.datetime.now().strftime(
        '%Y-%m-%d-%H-%M-%S') + ".txt"
    content = ""
    for word in arr:
        content += word + " "
    client.create(name, content.encode('utf-8'))
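For reference, ' '.join(arr) would build the same content without the trailing space or the repeated string concatenation. A trivial usage sketch, assuming the namenode from the example is reachable:

upload_txt_to_hdfs(["hello", "hdfs", "from", "python"])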
Example #4
    def _setup_walk(self, client: HdfsClient) -> Callable[..., str]:
        def path(*args: str) -> str:
            return posixpath.join(TEST_DIR, *args)

        self._make_empty_dir(client)
        client.create(path("f1"), b"")
        client.mkdirs(path("a1", "b1"))
        client.create(path("a1", "b1", "f2"), b"")
        client.mkdirs(path("a1", "b2"))
        client.mkdirs(path("a2"))
        return path
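The helper above builds a small directory tree for exercising HdfsClient.walk, which mirrors os.walk. A hedged sketch of the traversal it enables (the host, user and /tmp/test_dir path are placeholders):

from pyhdfs import HdfsClient

client = HdfsClient(hosts="localhost:9870", user_name="hdfs")
# walk() yields (dirpath, dirnames, filenames) tuples, just like os.walk()
for dirpath, dirnames, filenames in client.walk("/tmp/test_dir"):
    print(dirpath, dirnames, filenames)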
Example #5
def save_img(path, color_pic):
    # Create the HDFS client connection
    client = HdfsClient(hosts="192.168.2.109", user_name="hadoop")
    # Read a local image (it could also be generated with the numpy module)
    #     mat = cv2.imread(r"C:\Users\HUAWEI\Pictures\1.png")
    color_pic = cv2.resize(
        color_pic, (color_pic.shape[1] // 1, color_pic.shape[0] // 1))
    # HDFS target path: remove any existing file, then write the PNG-encoded bytes
    if client.exists(path):
        client.delete(path)
    client.create(path, cv2.imencode('.png', color_pic)[1].tobytes())
Example #6
    def save_model(self, savemodel, model, appendix=None):
        if savemodel:
            c3_path = f'/user/{self.username}/fortuna/model/{self.trainfile}_{self.testnum}/model'
            fs = HdfsClient(self.C3_HDFS_HOST, user_name=self.username)
            if appendix:
                c3_path += f'_{appendix}'

            model_pickle = pickle.dumps(model.state_dict())
            try:
                fs.create(c3_path, model_pickle, overwrite=True)
            except Exception as e:
                print(e)
        else:
            file_name = 'data_out/model'
            if appendix:
                file_name += f'_{appendix}'
            torch.save({'model': model.state_dict(), 'task': type(self.task).__name__}, file_name)
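A hedged counterpart for reading such a pickled state dict back from HDFS; the host, user and path are placeholders standing in for the values the real class keeps on self, and torch must be importable so the tensors can be unpickled:

import pickle
from pyhdfs import HdfsClient

fs = HdfsClient("namenode:9870", user_name="alice")
raw = fs.open("/user/alice/fortuna/model/train_1/model").read()
state_dict = pickle.loads(raw)
# model.load_state_dict(state_dict)  # restore the weights into an already-constructed model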
Example #7
    def _make_dir_and_file(self, client: HdfsClient) -> None:
        self._make_empty_dir(client)
        client.create(TEST_FILE, FILE_CONTENTS)
Example #8
                logger.debug('doing file : %s' % f_fullname)
                f = client.open(f_fullname)
                try:
                    f_context = f.read().decode('gbk')
                except UnicodeDecodeError as e:
                    logger.error('decode error : %s' % f_fullname)
                    logger.error(e)
                    dir_error = os.path.join(dir_subdata, 'error_cleaning')
                    if not client.exists(dir_error):
                        client.mkdirs(dir_error)
                        logger.debug('mkdir dir for error files : %s' %
                                     dir_error)
                    # TODO: delete the error flag files once cleaning succeeds
                    fname_error = os.path.join(dir_error, fname)
                    if not client.exists(fname_error):
                        client.create(fname_error, b'')
                        logger.warning('create error flag file : %s' % fname_error)
                    continue
                finally:
                    f.close()

                list_datalines = f_context.split('\n')
                s_write_buffer = ''
                for line in list_datalines:
                    a = public.transform_line(constants, line)
                    if a:
                        a.append(s_guapairiqi)
                        s_write_buffer += '\t'.join(a)
                        s_write_buffer += '\n'
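The snippet above decodes GBK-encoded files and drops an empty flag file into an error directory when decoding fails. A compact, self-contained sketch of that decode-or-flag pattern (the client settings and paths are placeholders):

import posixpath
from typing import Optional
from pyhdfs import HdfsClient

def read_gbk_or_flag(client: HdfsClient, hdfs_path: str, error_dir: str) -> Optional[str]:
    """Return the decoded text, or create an empty marker file and return None."""
    f = client.open(hdfs_path)
    try:
        return f.read().decode('gbk')
    except UnicodeDecodeError:
        if not client.exists(error_dir):
            client.mkdirs(error_dir)
        marker = posixpath.join(error_dir, posixpath.basename(hdfs_path))
        if not client.exists(marker):
            client.create(marker, b'')
        return None
    finally:
        f.close()

client = HdfsClient(hosts='namenode:9870', user_name='hdfs')
text = read_gbk_or_flag(client, '/data/raw/part-0000.txt', '/data/error_cleaning')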
Example #9
class hdfs(object):
    # WebHDFS defaults to port 50070
    def __init__(self, cur_database_param):
        # super(HdfsClients, self).__init__()
        # self.quert_db_info = super(HdfsClients, self).getDBConfig()
        # self.hdfsHost=self.quert_db_info["host"]
        hdfsHost = cur_database_param['url']
        path = cur_database_param['dbname']
        self.hdfs = HdfsClient(hosts=hdfsHost)
        self.host = hdfsHost
        self.path = path

    def append(self, path, data):
        self.hdfs.append(path, data)

    def concat(self, target, sources):
        # delegate to the underlying client (calling self.concat here would recurse forever)
        self.hdfs.concat(target, sources)

    # self, taskJobId,tableName=None,jobTemplateFieldList=None
    def createTableByTaskJobId(self,
                               taskJobId,
                               tableName=None,
                               jobTemplateFieldList=None,
                               data=None):
        if tableName is None:
            taskJob = TaskJobDao.loadTaskById(taskJobId)
            tableName = taskJob.tableName
        path = self.path + '/' + tableName
        self.hdfs.create(path, data, replication=2)

    def hmkdirs(self, path):
        self.hdfs.mkdirs(path)

    def open(self, path):
        return self.hdfs.open(path=path)

    def delete(self, path):
        self.hdfs.delete(path=path)

    def listdir(self, rule):
        f = self.hdfs.listdir(rule)
        return f

    def insert(self, jobid, tablename, column_dict, paramMap=None):
        if tablename is None:
            taskJob = TaskJobDao.loadTaskById(jobid)
            tablename = taskJob.tableName
        path = self.path + '/' + tablename
        createTime = time.strftime('%Y-%m-%d %H:%M:%S',
                                   time.localtime(time.time()))
        task_job_id_sequenceValue = paramMap.get(
            "task_job_id_sequence") if paramMap is not None else None
        if task_job_id_sequenceValue is not None:
            column_dict.update(
                {"task_job_id_sequence": str(task_job_id_sequenceValue)})
        column_dict.update({
            "task_job_del_flag": "False",
            "task_job_create_time": createTime
        })
        # self.append(path, column_dict)
        if self.isTableExist(tablename):
            self.append(path, column_dict)
        else:
            self.createTableByTaskJobId(jobid, tablename, column_dict)
        # return column_dict

    def isTableExist(self, tablename):
        path = self.path + '/' + tablename
        exist = self.hdfs.exists(path)
        return exist

    def save_to_hdfs(self, jobid, path, data):
        if self.isTableExist(path):
            self.append(path, data)
        else:
            self.createTableByTaskJobId(jobid, path, data)

    def save_to_hdfs2(self, path, data):
        if self.hdfs.exists(path):
            self.hdfs.append(path, data)
        else:
            self.hdfs.create(path, data, replication=2)

    def execute(self, sqls="append", path=None, data=None):
        try:
            if isinstance(sqls, list) and len(sqls) > 0:
                for sql in sqls:
                    # method = eval(sql)
                    method = getattr(self, sql)
                    method(path, data)
            else:
                # method = eval(sqls)
                method = getattr(self, sqls)
                method(path, data)
        except Exception as e:
            logging.error("hdfs,execute," + str(e))
            raise
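A hedged usage sketch for the wrapper above; the connection parameters and paths are placeholders, and data is passed as bytes because that is what pyhdfs ultimately sends:

store = hdfs({'url': 'namenode:50070', 'dbname': '/crawl'})
store.hmkdirs('/crawl')
# save_to_hdfs2 creates the file on the first call and appends on later ones
store.save_to_hdfs2('/crawl/items.tsv', b'id\ttitle\n')
store.save_to_hdfs2('/crawl/items.tsv', b'1\thello\n')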
Example #10
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from pyhdfs import HdfsClient
'''
Connect to Hadoop's HDFS file system from Python and upload or download files.
'''

# Read a file from the HDFS file system

# HDFS namenode address
# client = HdfsClient(hosts='192.168.1.163:50070')
client = HdfsClient(hosts='192.168.1.156:50070')

print(client.listdir("/repo/"))

res = client.open('/repo/README.txt')
for r in res:
    line = str(r, encoding='utf-8')  # open() returns bytes; decode each line to str
    print(line)

client = HdfsClient(hosts='192.168.1.156:50070',
                    user_name='hadoop')  # only the hadoop user has write permission
str1 = 'hello world'
client.create('/py.txt', str1.encode('utf-8'))  # create a new file and write the string as UTF-8 bytes

# Upload a local file to HDFS

# client = HdfsClient(hosts='hacker:50070', user_name='root')
# Absolute path of the local file; the HDFS destination must not already exist
# client.copy_from_local('D:/PythonProjects/crawl_work/thread_crawl_work02', '/usr/hadoop/')
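The download direction works the same way through copy_to_local; the host, user and paths below are placeholders:

from pyhdfs import HdfsClient

client = HdfsClient(hosts='192.168.1.156:50070', user_name='hadoop')
# copy an HDFS file down to the local filesystem
client.copy_to_local('/repo/README.txt', './README.txt')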