Example #1
# Requires: pip install pyhdfs
from pyhdfs import HdfsClient


def update_csv():
    local = '/Users/constantine/PycharmProjects/test02/data.csv'
    tmpLocal = '/Users/constantine/PycharmProjects/test02/tmpdata.csv'
    remote = '/data/data.csv'
    host = '127.0.0.1:9870'  # NameNode WebHDFS address
    user_name = 'host'
    client = HdfsClient(hosts=host, user_name=user_name)
    if client.exists(remote):
        # Download the current remote file, then remove it from HDFS.
        client.copy_to_local(remote, tmpLocal)
        client.delete(remote)

        # Append the local rows to the downloaded copy ('a' keeps its contents).
        with open(local, 'r') as fRead, open(tmpLocal, 'a') as fWrite:
            fWrite.writelines(fRead.readlines())

        # Deduplicate the merged rows (note: set() does not preserve row order).
        with open(tmpLocal, 'r') as fRead:
            lines = fRead.read()
        lines = '\n'.join(list(set(lines.split('\n')))[1:])
        with open(tmpLocal, 'w') as fWrite:
            fWrite.write(lines)

        # Upload the merged, deduplicated file back to HDFS.
        client.copy_from_local(tmpLocal, remote)
    else:
        # No remote copy yet: upload the local file as-is.
        client.copy_from_local(local, remote)
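
A quick round-trip check of the result could look like the sketch below (the host, user and remote path simply mirror the values hard-coded in the example above; pyhdfs must be installed):

from pyhdfs import HdfsClient

client = HdfsClient(hosts='127.0.0.1:9870', user_name='host')
update_csv()
assert client.exists('/data/data.csv')                        # merged file is back on HDFS
print(client.open('/data/data.csv').read().decode('utf-8'))   # peek at the merged rows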
Example #2
    def __push_fields(self, hdfs_host: str, fields: Dict[str, Field]):
        # Serialize the field definitions to JSON and write them to hdfs_host.
        fs = HdfsClient(hdfs_host)
        fs.mkdirs('/'.join(self.fields_path.split('/')[:-1]))
        fs.delete(self.fields_path)
        dicted_fields = {k: self.field_to_dict(v) for k, v in fields.items()}
        fs.create(self.fields_path, json.dumps(dicted_fields))

        # Push a flattened copy to the C3 HDFS host: special tokens are mapped
        # to their vocabulary indices and the vocab itself is dropped.
        fs = HdfsClient(self.C3_HDFS_HOST, user_name=self.user_name)
        fs.mkdirs('/'.join(self.c3_fields_path.split('/')[:-1]))
        fs.delete(self.c3_fields_path)
        c3_dicted_fields = {}
        for k, value in dicted_fields.items():
            if value['use_vocab']:
                max_vocab_index = len(value['vocab']['itos'])
                value['max_vocab_index'] = max_vocab_index
                value['dtype'] = str(torch.int64)
                vocab = value['vocab']
                for tok in self.FIELDS_TOKEN_ATTRS:
                    if value[tok]:
                        value[tok] = vocab['stoi'][value[tok]]
                value.pop('vocab')
                value['use_vocab'] = False
            else:
                value['max_vocab_index'] = 1
            c3_dicted_fields[k] = value
        fs.create(self.c3_fields_path, json.dumps(c3_dicted_fields))
Example #3
def hdfs_clean(host, user_name, output_dir, experiment_id=None):
    '''clean up hdfs data'''
    hdfs_client = HdfsClient(hosts='{0}:80'.format(host),
                             user_name=user_name,
                             webhdfs_path='/webhdfs/api/v1',
                             timeout=5)
    if experiment_id:
        full_path = '/' + '/'.join(
            [user_name, 'nni', 'experiments', experiment_id])
    else:
        full_path = '/' + '/'.join([user_name, 'nni', 'experiments'])
    print_normal('removing folder {0} in hdfs'.format(full_path))
    hdfs_client.delete(full_path, recursive=True)
    if output_dir:
        pattern = re.compile(
            r'hdfs://(?P<host>([0-9]{1,3}\.){3}[0-9]{1,3})(:[0-9]{2,5})?(?P<baseDir>/.*)?'
        )
        match_result = pattern.match(output_dir)
        if match_result:
            output_host = match_result.group('host')
            output_dir = match_result.group('baseDir')
            #check if the host is valid
            if output_host != host:
                print_warning(
                    'The host in {0} is not consistent with {1}'.format(
                        output_dir, host))
            else:
                if experiment_id:
                    output_dir = output_dir + '/' + experiment_id
                print_normal('removing folder {0} in hdfs'.format(output_dir))
                hdfs_client.delete(output_dir, recursive=True)
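
A hedged invocation sketch follows; the host, user name, output directory and experiment id are placeholders rather than values from the original code, and print_normal/print_warning come from NNI's own command-line utilities:

# Delete /<user>/nni/experiments/<experiment_id> plus the matching hdfs:// output dir.
hdfs_clean(host='10.0.0.5',
           user_name='nniuser',
           output_dir='hdfs://10.0.0.5:9000/nniuser/nni/outputs',
           experiment_id='AB12cd34')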
Example #4
# Requires: pip install pyhdfs
from pyhdfs import HdfsClient


def Copy_From_Local(file):
    '''
    Upload a file to Hadoop (HDFS).
    '''
    h_file = ('/tmp/te/%s' % file)
    client = HdfsClient(hosts='localhost:50070')  # WebHDFS address; connect to HDFS
    if client.exists(h_file):
        # If the file already exists on HDFS, delete it before uploading.
        client.delete(h_file)
    client.copy_from_local(file, h_file)
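
For reference, a usage sketch under the same assumptions as the example (a WebHDFS endpoint on localhost:50070, which is the Hadoop 2.x default, and a local file named sample.txt in the working directory):

Copy_From_Local('sample.txt')   # uploads to /tmp/te/sample.txt, replacing any previous copy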
Example #5
# Requires: pip install pyhdfs opencv-python
import cv2
from pyhdfs import HdfsClient


def save_img(path, corlor_pic):
    # Create the HDFS client
    client = HdfsClient(hosts="192.168.2.109", user_name="hadoop")
    # The image can also be read from disk (or generated with numpy), e.g.:
    #     mat = cv2.imread(r"C:\Users\HUAWEI\Pictures\1.png")
    # Resize (the divisor 1 keeps the original size; increase it to downscale).
    corlor_pic = cv2.resize(
        corlor_pic, (corlor_pic.shape[1] // 1, corlor_pic.shape[0] // 1))
    # Write to the HDFS path, replacing any existing file.
    if client.exists(path):
        client.delete(path)
    client.create(path, cv2.imencode('.png', corlor_pic)[1].tobytes())
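
A companion sketch for reading the image back, assuming the same host, user and path as above (the function name load_img and the numpy/imdecode round-trip are illustrative additions, not part of the original code):

import numpy as np
import cv2
from pyhdfs import HdfsClient

def load_img(path):
    client = HdfsClient(hosts="192.168.2.109", user_name="hadoop")
    data = client.open(path).read()   # raw PNG bytes from HDFS
    # Decode the byte buffer back into a BGR image array
    return cv2.imdecode(np.frombuffer(data, np.uint8), cv2.IMREAD_COLOR)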
Example #6
class DeleteHdfsData():

    # Initialization
    def __init__(self):
        self.host = "172.27.133.18"
        self.port = "8020"
        self.userName = "******"
        self.remotePath = "/user/shiyouguandao"
        self.fs = HdfsClient(hosts=self.host, user_name=self.userName)
        self.ReadHdfsFile()

    # Scan the directory and log how many files were removed
    def ReadHdfsFile(self):
        count = self.HdfsFileList(self.remotePath)
        localtime = time.asctime(time.localtime(time.time()))
        log.info(localtime + "\tdeleted " + str(count) +
                 " .csv files under /user/shiyouguandao")

    # Delete a single file
    def DeleteHdfsFile(self, hdfsPath):
        self.fs.delete(hdfsPath, skip_trash=False)

    # Walk the directory tree and delete the matching files
    def HdfsFileList(self, path):
        count = 0
        for root, dirs, files in self.fs.walk(path):
            for file in files:
                is_csv = self.hdfsFileHandler(file)
                if is_csv:
                    self.DeleteHdfsFile(root + "/" + file)
                    count += 1
        return count

    # File filter: keep only .csv files older than one day
    def hdfsFileHandler(self, fileName):
        if fileName.endswith(".csv"):
            temp = 60 * 60 * 24  # one day in seconds
            index = fileName.rfind("_")
            # The 10 characters before the last '_' hold the date, e.g.
            # feature_ZSY-69_2019-09-24_23411.csv -> '2019-09-24'
            date_str = fileName[index - 10:index]
            current = int(time.time())
            fileTime = int(time.mktime(time.strptime(date_str, "%Y-%m-%d")))
            return (current - fileTime) >= temp
        return False
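
Because the scan is kicked off from __init__, running the cleanup is just a matter of instantiating the class (assuming the hard-coded host, user and /user/shiyouguandao path are reachable):

DeleteHdfsData()   # walks /user/shiyouguandao and deletes .csv files older than one day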
Example #7
def test_read_files_with_spaces(started_cluster):
    hdfs_api = started_cluster.hdfs_api

    fs = HdfsClient(hosts=started_cluster.hdfs_ip)
    dir = '/test_spaces'
    exists = fs.exists(dir)
    if exists:
        fs.delete(dir, recursive=True)
    fs.mkdirs(dir)

    hdfs_api.write_data(f"{dir}/test test test 1.txt", "1\n")
    hdfs_api.write_data(f"{dir}/test test test 2.txt", "2\n")
    hdfs_api.write_data(f"{dir}/test test test 3.txt", "3\n")

    node1.query(f"create table test (id UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{dir}/test*', 'TSV')")
    assert node1.query("select * from test order by id") == "1\n2\n3\n"
    fs.delete(dir, recursive=True)
Example #8
def drop_table(cluster):
    node = cluster.instances["node"]

    fs = HdfsClient(hosts=cluster.hdfs_ip)
    hdfs_objects = fs.listdir('/clickhouse')
    print('Number of hdfs objects to delete:', len(hdfs_objects), sep=' ')

    node.query("DROP TABLE IF EXISTS hdfs_test SYNC")

    try:
        wait_for_delete_hdfs_objects(cluster, 0)
    finally:
        hdfs_objects = fs.listdir('/clickhouse')
        if len(hdfs_objects) == 0:
            return
        print("Manually removing extra objects to prevent tests cascade failing: ", hdfs_objects)
        for path in hdfs_objects:
            # listdir() returns bare names, so rebuild the absolute path
            fs.delete('/clickhouse/' + path)
Example #9
def test_hdfsCluster(started_cluster):
    hdfs_api = started_cluster.hdfs_api
    fs = HdfsClient(hosts=started_cluster.hdfs_ip)
    dir = '/test_hdfsCluster'
    exists = fs.exists(dir)
    if exists:
        fs.delete(dir, recursive=True)
    fs.mkdirs(dir)
    hdfs_api.write_data("/test_hdfsCluster/file1", "1\n")
    hdfs_api.write_data("/test_hdfsCluster/file2", "2\n")
    hdfs_api.write_data("/test_hdfsCluster/file3", "3\n")

    actual = node1.query("select id, _file as file_name, _path as file_path from hdfs('hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') order by id")
    expected = "1\tfile1\thdfs://hdfs1:9000/test_hdfsCluster/file1\n2\tfile2\thdfs://hdfs1:9000/test_hdfsCluster/file2\n3\tfile3\thdfs://hdfs1:9000/test_hdfsCluster/file3\n"
    assert actual == expected

    actual = node1.query("select id, _file as file_name, _path as file_path from hdfsCluster('test_cluster_two_shards', 'hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') order by id")
    expected = "1\tfile1\thdfs://hdfs1:9000/test_hdfsCluster/file1\n2\tfile2\thdfs://hdfs1:9000/test_hdfsCluster/file2\n3\tfile3\thdfs://hdfs1:9000/test_hdfsCluster/file3\n"
    assert actual == expected
    fs.delete(dir, recursive=True)
Example #10
    def __push_preprocessed(self, c3_path: str, user_name: str, dataset: Dataset):
        # Flush a buffer of JSON lines to HDFS: create the file on the first
        # write, append on later writes.
        def push_to_hdfs(jstrs):
            if not fs.exists(c3_path):
                fs.create(c3_path, '\n'.join(jstrs) + '\n')
            else:
                fs.append(c3_path, '\n'.join(jstrs) + '\n')

        fs = HdfsClient(self.C3_HDFS_HOST, user_name=user_name)
        fs.mkdirs('/'.join(c3_path.split('/')[:-1]))
        fs.delete(c3_path)
        jstrs = []
        BUFSIZE = 2048
        for fxed_instance in tqdm(Iterator(dataset, batch_size=1), maxinterval=len(dataset.examples)):
            fxed_instance_dict = {name: getattr(fxed_instance, name).tolist()[0] for name in self.fields.keys()}
            jstrs.append(json.dumps(fxed_instance_dict))
            if len(jstrs) >= BUFSIZE:
                push_to_hdfs(jstrs)
                jstrs = []

        if jstrs:
            push_to_hdfs(jstrs)
Example #11
    def _make_empty_dir(self, client: HdfsClient) -> None:
        # Start from an empty directory: the first delete removes any leftovers,
        # the second must return False because nothing is left to delete.
        client.delete(TEST_DIR, recursive=True)
        assert not client.delete(TEST_DIR, recursive=True)
        assert client.mkdirs(TEST_DIR)
Example #12
class HDFSClientUtilityTest(unittest.TestCase):
    '''Unit test for hdfsClientUtility.py'''
    def setUp(self):
        self.hdfs_file_path = '../../.vscode/hdfsInfo.json'
        self.hdfs_config = None
        try:
            with open(self.hdfs_file_path, 'r') as file:
                self.hdfs_config = json.load(file)
        except Exception as exception:
            print(exception)

        self.hdfs_client = HdfsClient(hosts='{0}:{1}'.format(
            self.hdfs_config['host'], '50070'),
                                      user_name=self.hdfs_config['userName'])

    def get_random_name(self, length):
        return ''.join(
            random.sample(string.ascii_letters + string.digits, length))

    def test_copy_file_run(self):
        '''test copyFileToHdfs'''
        file_name = self.get_random_name(8)
        file_content = 'hello world!'

        with open('./{}'.format(file_name), 'w') as file:
            file.write(file_content)

        result = copyFileToHdfs(
            './{}'.format(file_name),
            '/{0}/{1}'.format(self.hdfs_config['userName'],
                              file_name), self.hdfs_client)
        self.assertTrue(result)

        file_list = self.hdfs_client.listdir('/{0}'.format(
            self.hdfs_config['userName']))
        self.assertIn(file_name, file_list)

        hdfs_file_name = self.get_random_name(8)
        self.hdfs_client.copy_to_local(
            '/{0}/{1}'.format(self.hdfs_config['userName'], file_name),
            './{}'.format(hdfs_file_name))
        self.assertTrue(os.path.exists('./{}'.format(hdfs_file_name)))

        with open('./{}'.format(hdfs_file_name), 'r') as file:
            content = file.readline()
            self.assertEqual(file_content, content)
        #clean up
        os.remove('./{}'.format(file_name))
        os.remove('./{}'.format(hdfs_file_name))
        self.hdfs_client.delete('/{0}/{1}'.format(self.hdfs_config['userName'],
                                                  file_name))

    def test_copy_directory_run(self):
        '''test copyDirectoryToHdfs'''
        directory_name = self.get_random_name(8)
        file_name_list = [self.get_random_name(8), self.get_random_name(8)]
        file_content = 'hello world!'

        os.makedirs('./{}'.format(directory_name))
        for file_name in file_name_list:
            with open('./{0}/{1}'.format(directory_name, file_name),
                      'w') as file:
                file.write(file_content)

        result = copyDirectoryToHdfs(
            './{}'.format(directory_name),
            '/{0}/{1}'.format(self.hdfs_config['userName'],
                              directory_name), self.hdfs_client)
        self.assertTrue(result)

        directory_list = self.hdfs_client.listdir('/{0}'.format(
            self.hdfs_config['userName']))
        self.assertIn(directory_name, directory_list)

        sub_file_list = self.hdfs_client.listdir('/{0}/{1}'.format(
            self.hdfs_config['userName'], directory_name))
        for file_name in file_name_list:
            self.assertIn(file_name, sub_file_list)
            #clean up
            self.hdfs_client.delete('/{0}/{1}/{2}'.format(
                self.hdfs_config['userName'], directory_name, file_name))
        self.hdfs_client.delete('/{0}/{1}'.format(self.hdfs_config['userName'],
                                                  directory_name))

        shutil.rmtree('./{}'.format(directory_name))
Example #13
File: HdfsUtil.py Project: ldw0810/Crawler
class hdfs(object):
    # Defaults to WebHDFS port 50070
    def __init__(self, cur_database_param):
        # super(HdfsClients, self).__init__()
        # self.quert_db_info = super(HdfsClients, self).getDBConfig()
        # self.hdfsHost=self.quert_db_info["host"]
        hdfsHost = cur_database_param['url']
        path = cur_database_param['dbname']
        self.hdfs = HdfsClient(hosts='{hdfs_host}'.format(hdfs_host=hdfsHost))
        self.host = hdfsHost
        self.path = path

    def append(self, path, data):
        self.hdfs.append(path, data)

    def concat(self, target, sources):
        self.hdfs.concat(target, sources)

    # self, taskJobId,tableName=None,jobTemplateFieldList=None
    def createTableByTaskJobId(self,
                               taskJobId,
                               tableName=None,
                               jobTemplateFieldList=None,
                               data=None):
        if tableName is None:
            taskJob = TaskJobDao.loadTaskById(taskJobId)
            tableName = taskJob.tableName
        path = self.path + '/' + tableName
        self.hdfs.create(path, data, replication=2)

    def hmkdirs(self, path):
        self.hdfs.mkdirs(path)

    def open(self, path):
        return self.hdfs.open(path=path)

    def delete(self, path):
        self.hdfs.delete(path=path)

    def listdir(self, rule):
        f = self.hdfs.listdir(rule)
        return f

    def insert(self, jobid, tablename, column_dict, paramMap=None):
        if tablename is None:
            taskJob = TaskJobDao.loadTaskById(jobid)
            tablename = taskJob.tableName
        path = self.path + '/' + tablename
        createTime = time.strftime('%Y-%m-%d %H:%M:%S',
                                   time.localtime(time.time()))
        task_job_id_sequenceValue = paramMap.get(
            "task_job_id_sequence") if paramMap is not None else None
        if task_job_id_sequenceValue is not None:
            column_dict.update(
                {"task_job_id_sequence": str(task_job_id_sequenceValue)})
        column_dict.update({
            "task_job_del_flag": "False",
            "task_job_create_time": createTime
        })
        # self.append(path, column_dict)
        if self.isTableExist(tablename):
            self.append(path, column_dict)
        else:
            self.createTableByTaskJobId(jobid, tablename, column_dict)
        # return column_dict

    def isTableExist(self, tablename):
        path = self.path + '/' + tablename
        exist = self.hdfs.exists(path)
        return exist

    def save_to_hdfs(self, jobid, path, data):
        if self.isTableExist(path):
            self.append(path, data)
        else:
            self.createTableByTaskJobId(jobid, path, data)

    def save_to_hdfs2(self, path, data):
        if self.hdfs.exists(path):
            self.hdfs.append(path, data)
        else:
            self.hdfs.create(path, data, replication=2)

    def execute(self, sqls="append", path=None, data=None):
        try:
            if isinstance(sqls, list) and len(sqls) > 0:
                for sql in sqls:
                    # method = eval(sql)
                    method = getattr(self, sql)
                    method(path, data)
            else:
                # method = eval(sqls)
                method = getattr(self, sqls)
                method(path, data)
        except Exception as e:
            logging.error("hdfs,execute," + str(e))
            raise Exception()
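
A hedged construction sketch showing the shape of cur_database_param the wrapper expects; the URL and dbname values below are placeholders, not taken from the original project:

store = hdfs({'url': 'localhost:50070',       # WebHDFS host:port
              'dbname': '/crawler/tables'})   # base HDFS directory used as the "database"
store.save_to_hdfs2('/crawler/tables/demo.csv', 'col1,col2\n')   # create on first write, append afterwards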