Example #1
def test_move_partition_to_another_disk(cluster):
    create_table(cluster, "hdfs_test")

    node = cluster.instances["node"]
    fs = HdfsClient(hosts=cluster.hdfs_ip)

    node.query("INSERT INTO hdfs_test VALUES {}".format(
        generate_values('2020-01-03', 4096)))
    node.query("INSERT INTO hdfs_test VALUES {}".format(
        generate_values('2020-01-04', 4096)))
    assert node.query(
        "SELECT count(*) FROM hdfs_test FORMAT Values") == "(8192)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(
        hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2

    node.query(
        "ALTER TABLE hdfs_test MOVE PARTITION '2020-01-04' TO DISK 'hdd'")
    assert node.query(
        "SELECT count(*) FROM hdfs_test FORMAT Values") == "(8192)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE

    node.query(
        "ALTER TABLE hdfs_test MOVE PARTITION '2020-01-04' TO DISK 'hdfs'")
    assert node.query(
        "SELECT count(*) FROM hdfs_test FORMAT Values") == "(8192)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(
        hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2
Example #2
def ProcAll(LocalDir, HdfsDir):
    NameNode = GolobalConfig['hdfs']['NameNode']
    UserName = GolobalConfig['hdfs']['UserName']
    client = HdfsClient(hosts=NameNode, user_name=UserName)
    if not client.exists(HdfsDir):
        client.mkdirs(HdfsDir)
    total = len(os.listdir(LocalDir))
    processed = 0
    failedList = list()
    FileSize = 0
    StartTime = time.time()
    for filename in os.listdir(LocalDir):
        srcFile = os.path.join(LocalDir, filename)
        dstFile = HdfsDir + '/' + filename
        if not ProcOne(client, srcFile, dstFile):
            failedList.append(srcFile)
        else:
            FileSize += os.path.getsize(srcFile)
        processed += 1
        print('%d/%d/%d, time cost: %.2f s' %
              (total, processed, len(failedList), time.time() - StartTime))
        print('%d B, %.2f MB/s \n' % (FileSize, FileSize / 1024 / 1024 /
                                      (time.time() - StartTime)))

    if failedList:
        print('failedList: %s' % repr(failedList))
        return False
    else:
        print('Good! No Error!')
        print('%d B, %.2f MB, %.2f GB, %.2f MB/s' %
              (FileSize, FileSize / 1024 / 1024, FileSize / 1024 / 1024 / 1024,
               FileSize / 1024 / 1024 / (time.time() - StartTime)))
        return True
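ProcAll relies on a ProcOne helper that is not part of the snippet. A minimal sketch of what such a helper could look like, assuming the same HdfsClient as above and the copy_from_local call seen in the other examples (the name, signature, and error handling are only an illustration):

def ProcOne(client, srcFile, dstFile):
    # hypothetical helper: upload one local file to HDFS and report success
    try:
        client.copy_from_local(srcFile, dstFile)
        return True
    except Exception as e:
        print('failed to upload %s -> %s: %s' % (srcFile, dstFile, e))
        return False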
Example #3
def read_hdfs(filename,
              host,
              split_ratio,
              delimiter=',',
              normalize=False,
              dtype=None,
              header=None,
              skiprows=None,
              index_col=False,
              output_label=True,
              randomize=False,
              return_as_dataframe=False,
              describe=False,
              label_vector=False):
    client = HdfsClient(hosts=host)
    return read_csv(filename=client.open(filename),
                    split_ratio=split_ratio,
                    delimiter=delimiter,
                    normalize=normalize,
                    dtype=dtype,
                    header=header,
                    skiprows=skiprows,
                    index_col=index_col,
                    output_label=output_label,
                    randomize=randomize,
                    return_as_dataframe=return_as_dataframe,
                    describe=describe,
                    label_vector=label_vector)
Example #4
 def test_invalid_construction(self) -> None:
     with self.assertRaises(ValueError):
         HdfsClient([])
     with self.assertRaises(ValueError):
         HdfsClient(retry_delay=-1)
     with self.assertRaises(ValueError):
         HdfsClient(max_tries=0)
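For contrast, a construction that would pass these validation checks might look like this sketch (host names and user are placeholders; the keyword arguments are the same ones exercised by the test):

# hypothetical, valid construction: non-empty hosts, max_tries >= 1,
# retry_delay >= 0
client = HdfsClient(hosts=['namenode1:50070', 'namenode2:50070'],
                    user_name='hdfs', max_tries=3, retry_delay=5)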
Example #5
 def __init__(self):
     self.host = "172.27.133.18"
     self.port = "8020"
     self.userName = "******"
     self.remotePath = "/user/shiyouguandao"
     self.fs = HdfsClient(hosts=self.host, user_name=self.userName)
     self.ReadHdfsFile()
Example #6
def get_hdfs_client(args):
    global _hdfs_client

    if _hdfs_client is not None:
        return _hdfs_client
    # backward compatibility
    hdfs_host = None

    if args.hdfs_host:
        hdfs_host = args.hdfs_host
    elif args.pai_hdfs_host:
        hdfs_host = args.pai_hdfs_host
    else:
        return None

    if hdfs_host is not None and args.nni_hdfs_exp_dir is not None:
        try:
            if args.webhdfs_path:
                _hdfs_client = HdfsClient(hosts='{0}:80'.format(hdfs_host),
                                          user_name=args.pai_user_name,
                                          webhdfs_path=args.webhdfs_path,
                                          timeout=5)
            else:
                # backward compatibility
                _hdfs_client = HdfsClient(hosts='{0}:{1}'.format(
                    hdfs_host, '50070'),
                                          user_name=args.pai_user_name,
                                          timeout=5)
        except Exception as e:
            nni_log(LogType.Error, 'Create HDFS client error: ' + str(e))
            raise e
    return _hdfs_client
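A hedged sketch of how get_hdfs_client might be invoked with a minimal argparse-style namespace; only the attribute names read above are real, every value is a placeholder:

from types import SimpleNamespace

args = SimpleNamespace(hdfs_host='10.0.0.1',            # placeholder namenode address
                       pai_hdfs_host=None,
                       nni_hdfs_exp_dir='/nni/experiments/exp1',
                       webhdfs_path=None,               # unset, so the 50070 fallback above is used
                       pai_user_name='nni')
client = get_hdfs_client(args)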
Example #7
def test_table_manipulations(cluster):
    create_table(cluster, "hdfs_test")

    node = cluster.instances["node"]
    fs = HdfsClient(hosts=cluster.hdfs_ip)

    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-03', 4096)))
    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-04', 4096)))

    node.query("RENAME TABLE hdfs_test TO hdfs_renamed")
    assert node.query("SELECT count(*) FROM hdfs_renamed FORMAT Values") == "(8192)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2

    node.query("RENAME TABLE hdfs_renamed TO hdfs_test")
    assert node.query("CHECK TABLE hdfs_test FORMAT Values") == "(1)"

    node.query("DETACH TABLE hdfs_test")
    node.query("ATTACH TABLE hdfs_test")
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(8192)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2

    node.query("TRUNCATE TABLE hdfs_test")
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(0)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD
Example #8
def cluster():
    try:
        cluster = ClickHouseCluster(__file__)
        cluster.add_instance(
            "node1",
            main_configs=["configs/config.d/storage_conf.xml"],
            macros={"replica": "node1"},
            with_zookeeper=True,
            with_hdfs=True,
        )
        cluster.add_instance(
            "node2",
            main_configs=["configs/config.d/storage_conf.xml"],
            macros={"replica": "node2"},
            with_zookeeper=True,
            with_hdfs=True,
        )
        logging.info("Starting cluster...")
        cluster.start()
        if cluster.instances["node1"].is_debug_build():
            # https://github.com/ClickHouse/ClickHouse/issues/27814
            pytest.skip(
                "libhdfs3 calls rand function which does not pass harmful check in debug build"
            )
        logging.info("Cluster started")

        fs = HdfsClient(hosts=cluster.hdfs_ip)
        fs.mkdirs("/clickhouse1")
        fs.mkdirs("/clickhouse2")
        logging.info("Created HDFS directory")

        yield cluster
    finally:
        cluster.shutdown()
Example #9
def test_simple_insert_select(cluster, min_rows_for_wide_part, files_per_part):
    create_table(cluster,
                 "hdfs_test",
                 additional_settings="min_rows_for_wide_part={}".format(
                     min_rows_for_wide_part))

    node = cluster.instances["node"]

    values1 = generate_values('2020-01-03', 4096)
    node.query("INSERT INTO hdfs_test VALUES {}".format(values1))
    assert node.query(
        "SELECT * FROM hdfs_test order by dt, id FORMAT Values") == values1

    fs = HdfsClient(hosts=cluster.hdfs_ip)

    hdfs_objects = fs.listdir('/clickhouse')
    print(hdfs_objects)
    assert len(hdfs_objects) == FILES_OVERHEAD + files_per_part

    values2 = generate_values('2020-01-04', 4096)
    node.query("INSERT INTO hdfs_test VALUES {}".format(values2))
    assert node.query("SELECT * FROM hdfs_test ORDER BY dt, id FORMAT Values"
                      ) == values1 + "," + values2

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + files_per_part * 2

    assert node.query(
        "SELECT count(*) FROM hdfs_test where id = 1 FORMAT Values") == "(2)"
Example #10
def upload_txt_to_hdfs(arr):
    client = HdfsClient(hosts="localhost:50070", user_name="Alphalbj")
    name = "/words/words-" + datetime.datetime.now().strftime(
        '%Y-%m-%d-%H-%M-%S') + ".txt"
    content = ""
    for word in arr:
        content += word + " "
    client.create(name, content.encode('utf-8'))
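A short usage sketch (the word list is invented); each call writes the words, space separated, into a new timestamped file under /words:

upload_txt_to_hdfs(['hello', 'hdfs', 'world'])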
Example #11
 def __init__(self, cur_database_param):
     # super(HdfsClients, self).__init__()
     # self.quert_db_info = super(HdfsClients, self).getDBConfig()
     # self.hdfsHost=self.quert_db_info["host"]
     hdfsHost = cur_database_param['url']
     path = cur_database_param['dbname']
     self.hdfs = HdfsClient(hosts='{hdfs_host}'.format(hdfs_host=hdfsHost))
     self.host = hdfsHost
     self.path = path
Example #12
    def load_fields_with_vocab(self, hdfs_host: str) -> Dict[str, Field]:
        fs = HdfsClient(hdfs_host)
        if fs.exists(self.fields_path):
            print(f'get fields from {hdfs_host}{self.fields_path}')
        else:
            raise Exception(f'there are no fields in {hdfs_host}{self.fields_path}')

        loaded_dict = json.loads(fs.open(self.fields_path).read())
        return {k: self.dict_to_field(v) for k, v in loaded_dict.items()}
Example #13
def wait_for_delete_hdfs_objects(cluster, expected, num_tries=30):
    fs = HdfsClient(hosts=cluster.hdfs_ip)
    while num_tries > 0:
        num_hdfs_objects = len(fs.listdir('/clickhouse'))
        if num_hdfs_objects == expected:
            break
        num_tries -= 1
        time.sleep(1)
    assert(len(fs.listdir('/clickhouse')) == expected)
Example #14
def Copy_To_Local(file):
    '''
    Download a file from Hadoop.
    '''
    client = HdfsClient(hosts='localhost:50070')  # connect to HDFS
    if os.path.exists(file):
        # if the file already exists locally, delete it first
        os.remove(file)
    client.copy_to_local(file, './')
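A usage sketch, assuming the client's copy_to_local accepts a directory as the destination; the path is the one Example #22 uploads:

Copy_To_Local('/p_price.txt')   # re-downloads /p_price.txt into the current directory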
Example #15
def wait_for_hdfs_objects(cluster, fp, expected, num_tries=30):
    fs = HdfsClient(hosts=cluster.hdfs_ip)
    while num_tries > 0:
        num_hdfs_objects = len(fs.listdir(fp))
        if num_hdfs_objects == expected:
            break
        num_tries -= 1
        time.sleep(1)
    assert len(fs.listdir(fp)) == expected
Example #16
 def __load_corpus_from_hdfs(self, hdfs_host: str) -> List:
     fs = HdfsClient(hdfs_host)
     with fs.open(self.corpus_path) as fp:
         corpus = list()
         for line in tqdm(fp.read().decode().split('\n')):
             if line:
                 d = json.loads(line)
                 corpus.append(d)
     return corpus
Example #17
 def load_model(self, train_dir, modelnum, appendix):
     print('~' * 100)
     c3_path = f'/user/{self.username}/fortuna/model/{train_dir}_{modelnum}/model_e{appendix}'
     print(c3_path)
     fs = HdfsClient(self.C3_HDFS_HOST, user_name=self.username)
     model_pickle = fs.open(c3_path)
     model_dict = pickle.load(model_pickle)
     self.model.load_state_dict(model_dict)
     acc_lst, total, prec, recall, f1score, f1s, rocauc = self.eval(self.test_iter, len(self.task.te_dataset))
     print('~' * 100)
Example #18
 def setUp(self):
     self.hdfs_file_path = '../../.vscode/hdfsInfo.json'
     self.hdfs_config = None
     try:
         with open(self.hdfs_file_path, 'r') as file:
             self.hdfs_config = json.load(file)
     except Exception as exception:
         print(exception)
     
     self.hdfs_client = HdfsClient(hosts='{0}:{1}'.format(self.hdfs_config['host'], '50070'), user_name=self.hdfs_config['userName'])
Example #19
 def load_fields_from_c3(self) -> Tuple[Dict[str, Field], Dict[str, int]]:
     fs = HdfsClient(self.C3_HDFS_HOST, user_name=self.user_name)
     if fs.exists(self.c3_fields_path):
         print(f'get fields from {self.C3_HDFS_HOST}{self.c3_fields_path}')
     else:
         raise Exception(f'there are no fields in {self.C3_HDFS_HOST}{self.c3_fields_path}')
     loaded_dict = json.loads(fs.open(self.c3_fields_path).read())
     print(loaded_dict)
     max_vocab_indexes = {k: v['max_vocab_index'] for k, v in loaded_dict.items()}
     return {k: self.dict_to_field(v) for k, v in loaded_dict.items()}, max_vocab_indexes
Example #20
    def _setup_walk(self, client: HdfsClient) -> Callable[..., str]:
        def path(*args: str) -> str:
            return posixpath.join(TEST_DIR, *args)

        self._make_empty_dir(client)
        client.create(path("f1"), b"")
        client.mkdirs(path("a1", "b1"))
        client.create(path("a1", "b1", "f2"), b"")
        client.mkdirs(path("a1", "b2"))
        client.mkdirs(path("a2"))
        return path
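The helper above builds a small tree for walk() tests. A hedged sketch of how that layout could be verified, assuming client.walk yields (directory, subdirectories, files) tuples like os.walk and that TEST_DIR is the module-level test directory (the method name is invented):

    def _check_walk_layout(self, client: HdfsClient) -> None:
        # hypothetical companion check for _setup_walk
        path = self._setup_walk(client)
        _root, dirs, files = next(client.walk(TEST_DIR))
        # the first tuple describes the top of the tree: two dirs, one file
        assert sorted(dirs) == ["a1", "a2"]
        assert files == ["f1"]
        assert client.exists(path("a1", "b1", "f2"))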
Example #21
 def load_matrix(self, filepath, shape=None):
     if os.environ['local'] == '1' and os.path.exists(filepath):
         return np.loadtxt(filepath, dtype=np.float64)
     else:
         hosts = os.environ['hosts']
         if len(hosts) == 0:
             hosts = 'master'
         client = HdfsClient(hosts=hosts)
         if client.exists(filepath):
             return np.frombuffer(
                 client.open(filepath).read()).reshape(shape)
     return False
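load_matrix has no matching writer in this snippet. A minimal sketch of one possible counterpart, assuming the same environment-variable convention and the same HdfsClient as above (the method name save_matrix is an assumption):

 def save_matrix(self, filepath, matrix):
     # hypothetical counterpart to load_matrix (name and placement assumed)
     if os.environ['local'] == '1':
         np.savetxt(filepath, matrix)
     else:
         hosts = os.environ['hosts'] or 'master'
         client = HdfsClient(hosts=hosts)
         # store raw float64 bytes so np.frombuffer(...).reshape(shape) can read them back
         client.create(filepath, np.asarray(matrix, dtype=np.float64).tobytes(),
                       overwrite=True)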
Example #22
def crawler(word, products_list=[]):
    """ 爬取一号店的商品数据 """
    word = urllib.parse.quote(word)

    url = 'https://search.yhd.com/c0-0/k{0}'.format(word)

    # Fetch the HTML source
    html_doc = requests.get(url).text

    # XPath selector object
    selector = html.fromstring(html_doc)

    # Product list
    ul_list = selector.xpath('//div[@id="itemSearchList"]/div')

    # Parse the data
    for li in ul_list:

        # Title
        title = li.xpath('div//p[@class="proName clearfix"]/a/@title')
        #print(title)

        # Link
        link = li.xpath('div//p[@class="proName clearfix"]/a/@href')
        #print(link)

        # Price
        price = li.xpath('div//p[@class="proPrice"]/em/@yhdprice')

        with open("p_price", "a", encoding="gbk") as f:
            for j in range(len(price)):
                f.write(price[j] + "\n")

        #print(price)

        if len(title) > 0 and len(link) > 0 and len(price) > 0:
            # print(title)
            # print(link)
            # print(price)
            # print('--------------------')

            products_list.append({
                'title': title[0],
                'price': price[0],
                'link': 'https:' + link[0],
                'referer': '1号店'
            })
    client = HdfsClient(hosts='222.27.166.209:50070', user_name='hadoop')
    client.copy_from_local('/home/hadoop/Downloads/PriceCompaer/p_price',
                           '/p_price.txt')
Example #23
    def _load_preprocessed(self) -> List[Example]:
        fs = HdfsClient(self.C3_HDFS_HOST, user_name=self.user_name)
        if fs.exists(self.c3_path):
            print(f'get preprocessed corpus from {self.C3_HDFS_HOST}{self.c3_path}')
        else:
            raise Exception(f'there are no preprocessed in {self.C3_HDFS_HOST}{self.c3_path}')

        preprocessed = []
        for line in fs.open(self.c3_path).read().decode().split('\n'):
            if line:
                ex = Example()
                for k, v in json.loads(line).items():
                    setattr(ex, k, v)
                preprocessed.append(ex)
        return preprocessed
Example #24
class DeleteHdfsData():

    # Initialization
    def __init__(self):
        self.host = "172.27.133.18"
        self.port = "8020"
        self.userName = "******"
        self.remotePath = "/user/shiyouguandao"
        self.fs = HdfsClient(hosts=self.host, user_name=self.userName)
        self.ReadHdfsFile()

    # Read files
    def ReadHdfsFile(self):
        count = self.HdfsFileList(self.remotePath)
        localtime = time.asctime(time.localtime(time.time()))
        log.info(localtime + "\tdeleted " + str(count) +
                 " csv files under /user/shiyouguandao ...")

    # Delete a file
    def DeleteHdfsFile(self, hdfsPath):
        self.fs.delete(hdfsPath, skip_trash=False)

    # Walk the directory tree and count deleted files
    def HdfsFileList(self, path):
        count = 0
        for root, dirs, files in self.fs.walk(path):
            for file in files:
                is_csv = self.hdfsFileHandler(file)
                if is_csv:
                    self.DeleteHdfsFile(path + "/" + file)
                    count += 1
        return count

    # File filter: keep only csv files older than one day
    def hdfsFileHandler(self, fileName):
        if fileName.endswith(".csv"):
            one_day = 60 * 60 * 24
            index = fileName.rfind("_")
            # the 10 characters before the last "_" are the date,
            # e.g. feature_ZSY-69_2019-09-24_23411.csv -> 2019-09-24
            date_str = fileName[index - 10:index]
            current = int(time.time())
            fileTime = int(time.mktime(time.strptime(date_str, "%Y-%m-%d")))
            if (current - fileTime) >= one_day:
                return True
            else:
                return False
        return False
Example #25
def cluster():
    try:
        cluster = ClickHouseCluster(__file__)
        cluster.add_instance("node", main_configs=["configs/config.d/storage_conf.xml"], with_hdfs=True)
        logging.info("Starting cluster...")
        cluster.start()
        logging.info("Cluster started")

        fs = HdfsClient(hosts=cluster.hdfs_ip)
        fs.mkdirs('/clickhouse')

        logging.info("Created HDFS directory")

        yield cluster
    finally:
        cluster.shutdown()
Example #26
def test_alter_table_columns(cluster):
    create_table(cluster, "hdfs_test")

    node = cluster.instances["node"]
    fs = HdfsClient(hosts=cluster.hdfs_ip)

    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-03', 4096)))
    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-03', 4096, -1)))

    node.query("ALTER TABLE hdfs_test ADD COLUMN col1 UInt64 DEFAULT 1")
    # To ensure parts have merged
    node.query("OPTIMIZE TABLE hdfs_test")

    assert node.query("SELECT sum(col1) FROM hdfs_test FORMAT Values") == "(8192)"
    assert node.query("SELECT sum(col1) FROM hdfs_test WHERE id > 0 FORMAT Values") == "(4096)"
    wait_for_delete_hdfs_objects(cluster, FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE + FILES_OVERHEAD_PER_COLUMN)

    node.query("ALTER TABLE hdfs_test MODIFY COLUMN col1 String", settings={"mutations_sync": 2})

    assert node.query("SELECT distinct(col1) FROM hdfs_test FORMAT Values") == "('1')"
    # and file with mutation
    wait_for_delete_hdfs_objects(cluster, FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE + FILES_OVERHEAD_PER_COLUMN + 1)

    node.query("ALTER TABLE hdfs_test DROP COLUMN col1", settings={"mutations_sync": 2})

    # and 2 files with mutations
    wait_for_delete_hdfs_objects(cluster, FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE + 2)
Example #27
def main_loop(args):
    '''main loop logic for trial keeper'''

    if not os.path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)

    stdout_file = open(STDOUT_FULL_PATH, 'a+')
    stderr_file = open(STDERR_FULL_PATH, 'a+')
    # Note: we don't pass env here, so the subprocess will inherit the current environment, which is the expected behavior
    process = Popen(args.trial_command,
                    shell=True,
                    stdout=stdout_file,
                    stderr=stderr_file)
    nni_log(
        LogType.Info,
        'Trial keeper spawns a subprocess (pid {0}) to run command: {1}'.
        format(process.pid, shlex.split(args.trial_command)))

    while True:
        retCode = process.poll()
        ## Read experiment metrics, to avoid missing metrics
        read_experiment_metrics(args.nnimanager_ip, args.nnimanager_port)

        if retCode is not None:
            nni_log(
                LogType.Info,
                'subprocess terminated. Exit code is {}. Quit'.format(retCode))
            if NNI_PLATFORM == 'pai':
                # Copy local directory to hdfs for OpenPAI
                nni_local_output_dir = os.environ['NNI_OUTPUT_DIR']
                try:
                    hdfs_client = HdfsClient(hosts='{0}:{1}'.format(
                        args.pai_hdfs_host, '50070'),
                                             user_name=args.pai_user_name,
                                             timeout=5)
                    if copyDirectoryToHdfs(nni_local_output_dir,
                                           args.pai_hdfs_output_dir,
                                           hdfs_client):
                        nni_log(
                            LogType.Info,
                            'copy directory from {0} to {1} success!'.format(
                                nni_local_output_dir,
                                args.pai_hdfs_output_dir))
                    else:
                        nni_log(
                            LogType.Info,
                            'copy directory from {0} to {1} failed!'.format(
                                nni_local_output_dir,
                                args.pai_hdfs_output_dir))
                except Exception as e:
                    nni_log(LogType.Error,
                            'HDFS copy directory got exception: ' + str(e))
                    raise e

            ## Exit with the return code of the subprocess (trial)
            exit(retCode)
            break

        time.sleep(2)
Example #28
def getAllFolderHaveData(client_: HdfsClient, path_: str):
    _folderPathList = []
    for _root, _dir, _files in client_.walk(path_, status=True):
        # only folders that actually contain files need to be copied
        if len(_files) > 0:
            print(_root)
            _folderPathList.append(_root)
    return utils.listUtils.joinListToStr(_folderPathList, "\n")
Example #29
    def save_model(self, savemodel, model, appendix=None):
        if savemodel:
            c3_path = f'/user/{self.username}/fortuna/model/{self.trainfile}_{self.testnum}/model'
            fs = HdfsClient(self.C3_HDFS_HOST, user_name=self.username)
            if appendix:
                c3_path += f'_{appendix}'

            model_pickle = pickle.dumps(model.state_dict())
            try:
                fs.create(c3_path, model_pickle, overwrite=True)
            except Exception as e:
                print(e)
        else:
            file_name = f'data_out/model'
            if appendix:
                file_name += f'_{appendix}'
            torch.save({'model': model.state_dict(), 'task': type(self.task).__name__}, file_name)
Example #30
class HDFSSErvice:

    namenode_host = "localhost"
    namenode_port = "9870"
    root_folder = "/"
    chunck_size = 100000

    def __init__(self):
        self._client = HdfsClient(hosts=self.namenode_host + ":" + self.namenode_port, user_name="root")

    def get(self, hdfs_path: str):
        file_size = self.get_file_size(hdfs_path)
        for i in range(0, file_size, self.chunck_size):
            file_response = self._client.open(hdfs_path, offset=i, length=self.chunck_size)
            yield file_response.read()
        
    def append(self, hdfs_path: str, data: bytes):
        self.create_if_not_exist(hdfs_path)
        self._client.append(hdfs_path, data)
    
    def create_if_not_exist(self, hdfs_path: str):
        if not self._client.exists(hdfs_path):
            self._client.create(hdfs_path, b"")

    def get_messages_number(self, hdfs_path: str):
        return int(self.get_file_size(hdfs_path) / self.chunck_size + 1)

    def get_file_size(self, hdfs_path):
        file_infos = self._client.get_content_summary(hdfs_path)
        return file_infos.length

    def test(self):
        pass
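A hedged usage sketch of the service above (path and payload are invented): append one record, then stream the file back chunk by chunk:

service = HDFSSErvice()
service.append('/queues/demo.log', b'hello hdfs\n')
for chunk in service.get('/queues/demo.log'):
    print('read %d bytes' % len(chunk))
print('chunks:', service.get_messages_number('/queues/demo.log'))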
Example #31
 def __init__(self, hdfs_host, hbase_host, hbase_port_no, table_name, repo_path):
     logging.info(
         'Open connection pool for hbase host:%s port:%d', hbase_host, hbase_port_no)
     # create connection pools
     try:
         self.conn_pool = happybase.ConnectionPool(DB_CONNECTION_POOL_SIZE, host=hbase_host,
                                                   port=hbase_port_no,
                                                   timeout=DB_CONNECTION_TIME_OUT)
     except TException as exception:
         logging.warn("Exception throw for HBase Connection pool creation{%s}",
                      exception.message)
     self.hbase_host = hbase_host
     self.hdfs_host = hdfs_host
     self.hbase_port_no = hbase_port_no
     self.table_name = table_name
     self.repo_path = repo_path
     self.master_dataset = list()
     self.client = HdfsClient(hosts=hdfs_host, user_name='hdfs')
Example #32
class HDBDataStore(object):
    """
    Singleton class to read and maintain datasets for Service API
    It's not a generic HBase dataset handler.
    """
    __metaclass__ = Singleton
    def __init__(self, hdfs_host, hbase_host, hbase_port_no, table_name, repo_path):
        logging.info(
            'Open connection pool for hbase host:%s port:%d', hbase_host, hbase_port_no)
        # create connection pools
        try:
            self.conn_pool = happybase.ConnectionPool(DB_CONNECTION_POOL_SIZE, host=hbase_host,
                                                      port=hbase_port_no,
                                                      timeout=DB_CONNECTION_TIME_OUT)
        except TException as exception:
            logging.warn("Exception throw for HBase Connection pool creation{%s}",
                         exception.message)
        self.hbase_host = hbase_host
        self.hdfs_host = hdfs_host
        self.hbase_port_no = hbase_port_no
        self.table_name = table_name
        self.repo_path = repo_path
        self.master_dataset = list()
        self.client = HdfsClient(hosts=hdfs_host, user_name='hdfs')

    def collect(self):
        """
        Collect datasets by reading from HDFS Repo and HBase repo
        :return:
        """
        hdfs_list = self.read_data_from_repo()
        hbase_list = self.retrieve_datasets_from_hbase()
        inter_list = list()
        # find intersection and keep hbase copy
        for hbase_entry, hdfs_entry in [(hbase_entry, hdfs_entry) for hbase_entry in hbase_list
                                        for hdfs_entry in hdfs_list]:
            if hbase_entry['id'] == hdfs_entry['id']:
                # remove entries in HDFS list that matches hbase
                inter_list.append(hbase_entry)
                hdfs_list.remove(hdfs_entry)
                hbase_list.remove(hbase_entry)
        # yes intersection
        if len(inter_list) > 0:
            logging.debug("The intersection list:%s is", inter_list)
            self.master_dataset = inter_list + hdfs_list
            if len(hbase_list) != 0:
                logging.warn(" Warning Untracked datasets of size %d", len(hbase_list))
                self.master_dataset = self.master_dataset + tag_for_integrity(hbase_list)
        else:
            # god knows what's happening
            self.master_dataset = tag_for_integrity(hbase_list) + hdfs_list

    def read_data_from_repo(self):
        """
        Read data from HDFS repo_path
        :return:
        """
        repo_path = self.repo_path
        hdfs_dataset = list()
        try:
            for root, dirs, _ in self.client.walk(repo_path, topdown=True, onerror=onerror):
                for entry in dirs:
                    m_source = re.match('^source=(?P<source>.*)', entry)
                    if m_source is None:
                        continue
                    elif m_source.group('source') == '':
                        logging.warn('An empty source is present, this is not allowed. Something was wrong during ingestion')
                        continue
                    else:
                        item = {DATASET.ID: m_source.group('source'),
                                DATASET.POLICY: POLICY.SIZE,
                                DATASET.PATH: os.path.join(root, entry), DATASET.MODE: 'keep'}
                        hdfs_dataset.append(item)
                break
        except HdfsException as exception:
            logging.warn("Error in walking HDFS File system %s", exception.message)
        return hdfs_dataset

    def retrieve_datasets_from_hbase(self):
        """
        Connect to hbase table and return list of hbase_dataset
        :return:
        """
        hbase_datasets = list()
        table_name = self.table_name
        try:
            with self.conn_pool.connection(DB_CONNECTION_TIME_OUT) as connection:
                if table_name not in connection.tables():
                    logging.info('creating hbase table %s', table_name)
                    connection.create_table(table_name, {'cf': dict()})

                table = connection.table(table_name)
                for _, data in table.scan(limit=1):
                    logging.debug('%s found', table_name)
        except TException as exception:
            logging.warn(" failed to read table from hbase error(%s):", exception.message)
            return hbase_datasets
        logging.debug('connecting to hbase to read hbase_dataset')
        for key, data in table.scan():
            item = {DATASET.ID: key, DATASET.PATH: data[DBSCHEMA.PATH],
                    DATASET.POLICY: data[DBSCHEMA.POLICY],
                    DATASET.MODE: data[DBSCHEMA.MODE]}
            if item[DATASET.POLICY] == POLICY.AGE:
                item[DATASET.MAX_AGE] = int(data[DBSCHEMA.RETENTION])
            elif item[DATASET.POLICY] == POLICY.SIZE:
                item[DATASET.MAX_SIZE] = int(data[DBSCHEMA.RETENTION])
            hbase_datasets.append(item)
        logging.info(hbase_datasets)
        return hbase_datasets

    def read_datasets(self):
        """
        Connect to hbase table and return list of datasets
        :return:
        """
        return self.master_dataset

    def read_partitions(self, data_path):
        """
        Read partition for a HDFS dataset
        :param data_path:
        :return:
        """
        data_parts = list()
        try:
            for entry in dirwalk(self.client, data_path):
                if entry not in data_parts:
                    data_parts.append(entry)
        except HdfsException as exception:
            logging.warn(
                "Error in walking HDFS File system for partitions errormsg:%s", exception.message)
        return data_parts

    def write_dataset(self, data):
        """
        Persist dataset entry into HBase Table
        :param data: api that needs update
        :return: None
        """
        try:
            logging.debug("Write dataset:{%s}", data)
            table_name = self.table_name
            with self.conn_pool.connection(DB_CONNECTION_TIME_OUT) as connection:
                table = connection.table(table_name)
                dataset = {DBSCHEMA.PATH: data[DATASET.PATH], DBSCHEMA.POLICY: data[DATASET.POLICY],
                           DBSCHEMA.MODE: data[DATASET.MODE]}
                if DATASET.RETENTION in data:
                    dataset[DBSCHEMA.RETENTION] = data[DATASET.RETENTION]
                logging.debug("calling put on table for %s", dataset)
                table.put(data[DATASET.ID], dataset)
        except TException as exception:
            logging.warn("Failed to write dataset into hbase,  error(%s):", exception.message)

    def delete_dataset(self, data):
        """
        Delete dataset entry from HBase.
        :param data: dataset instance
        :return: None
        """
        try:
            table_name = self.table_name
            with self.conn_pool.connection(DB_CONNECTION_TIME_OUT) as connection:
                table = connection.table(table_name)
                logging.debug("Deleting dataset from HBase:{%s}", data)
                table.delete(data['id'])
        except TException as exception:
            logging.warn("Failed to delete dataset in hbase,  error(%s):", exception.message)