def test_move_partition_to_another_disk(cluster):
    create_table(cluster, "hdfs_test")

    node = cluster.instances["node"]
    fs = HdfsClient(hosts=cluster.hdfs_ip)

    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-03', 4096)))
    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-04', 4096)))
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(8192)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2

    node.query("ALTER TABLE hdfs_test MOVE PARTITION '2020-01-04' TO DISK 'hdd'")
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(8192)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE

    node.query("ALTER TABLE hdfs_test MOVE PARTITION '2020-01-04' TO DISK 'hdfs'")
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(8192)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2
def ProcAll(LocalDir, HdfsDir):
    NameNode = GolobalConfig['hdfs']['NameNode']
    UserName = GolobalConfig['hdfs']['UserName']
    client = HdfsClient(hosts=NameNode, user_name=UserName)
    if not client.exists(HdfsDir):
        client.mkdirs(HdfsDir)

    total = len(os.listdir(LocalDir))
    processed = 0
    failedList = list()
    FileSize = 0
    StartTime = time.time()
    for filename in os.listdir(LocalDir):
        srcFile = os.path.join(LocalDir, filename)
        dstFile = HdfsDir + '/' + filename
        if not ProcOne(client, srcFile, dstFile):
            failedList.append(srcFile)
        else:
            FileSize += os.path.getsize(srcFile)
        processed += 1
        print('%d/%d/%d, time cost: %.2f s' % (total, processed, len(failedList), time.time() - StartTime))
        print('%d B, %.2f MB/s \n' % (FileSize, FileSize / 1024 / 1024 / (time.time() - StartTime)))

    if failedList:
        print('failedList: %s' % repr(failedList))
        return False
    else:
        print('Good! No Error!')
        print('%d B, %.2f MB, %.2f GB, %.2f MB/s' %
              (FileSize, FileSize / 1024 / 1024, FileSize / 1024 / 1024 / 1024,
               FileSize / 1024 / 1024 / (time.time() - StartTime)))
        return True
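# ProcOne() is called above but not shown in this snippet. A minimal sketch of what it might
# look like, assuming it uploads a single local file to HDFS and reports success or failure:
def ProcOne(client, srcFile, dstFile):
    try:
        # overwrite any stale copy so that re-running ProcAll is idempotent
        client.copy_from_local(srcFile, dstFile, overwrite=True)
        return True
    except Exception as e:
        print('upload %s -> %s failed: %s' % (srcFile, dstFile, e))
        return False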
def read_hdfs(filename, host, split_ratio, delimiter=',', normalize=False, dtype=None,
              header=None, skiprows=None, index_col=False, output_label=True,
              randomize=False, return_as_dataframe=False, describe=False,
              label_vector=False):
    client = HdfsClient(hosts=host)
    return read_csv(filename=client.open(filename),
                    split_ratio=split_ratio,
                    delimiter=delimiter,
                    normalize=normalize,
                    dtype=dtype,
                    header=header,
                    skiprows=skiprows,
                    index_col=index_col,
                    output_label=output_label,
                    randomize=randomize,
                    return_as_dataframe=return_as_dataframe,
                    describe=describe,
                    label_vector=label_vector)
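# Illustrative call of read_hdfs(); the namenode address, file path and parameters are
# placeholders, and the return value is whatever the surrounding project's read_csv() produces.
if __name__ == '__main__':
    data = read_hdfs('/data/iris.csv', host='namenode:50070', split_ratio=0.8,
                     header=0, normalize=True, randomize=True)
    print(type(data))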
def test_invalid_construction(self) -> None:
    with self.assertRaises(ValueError):
        HdfsClient([])
    with self.assertRaises(ValueError):
        HdfsClient(retry_delay=-1)
    with self.assertRaises(ValueError):
        HdfsClient(max_tries=0)
def get_hdfs_client(args):
    global _hdfs_client

    if _hdfs_client is not None:
        return _hdfs_client

    # backward compatibility
    hdfs_host = None
    if args.hdfs_host:
        hdfs_host = args.hdfs_host
    elif args.pai_hdfs_host:
        hdfs_host = args.pai_hdfs_host
    else:
        return None

    if hdfs_host is not None and args.nni_hdfs_exp_dir is not None:
        try:
            if args.webhdfs_path:
                _hdfs_client = HdfsClient(hosts='{0}:80'.format(hdfs_host),
                                          user_name=args.pai_user_name,
                                          webhdfs_path=args.webhdfs_path,
                                          timeout=5)
            else:
                # backward compatibility
                _hdfs_client = HdfsClient(hosts='{0}:{1}'.format(hdfs_host, '50070'),
                                          user_name=args.pai_user_name,
                                          timeout=5)
        except Exception as e:
            nni_log(LogType.Error, 'Create HDFS client error: ' + str(e))
            raise e
    return _hdfs_client
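# A sketch of the argument object get_hdfs_client() expects. The attribute names mirror the
# ones read above; the argparse flags and sample values are assumptions for illustration only.
import argparse

def build_hdfs_args(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--hdfs_host', default=None)
    parser.add_argument('--pai_hdfs_host', default=None)
    parser.add_argument('--pai_user_name', default=None)
    parser.add_argument('--webhdfs_path', default=None)
    parser.add_argument('--nni_hdfs_exp_dir', default=None)
    return parser.parse_args(argv)

# example:
# client = get_hdfs_client(build_hdfs_args(
#     ['--pai_hdfs_host', '10.0.0.1', '--pai_user_name', 'nni',
#      '--nni_hdfs_exp_dir', '/nni/experiments/exp1']))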
def test_table_manipulations(cluster):
    create_table(cluster, "hdfs_test")

    node = cluster.instances["node"]
    fs = HdfsClient(hosts=cluster.hdfs_ip)

    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-03', 4096)))
    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-04', 4096)))

    node.query("RENAME TABLE hdfs_test TO hdfs_renamed")
    assert node.query("SELECT count(*) FROM hdfs_renamed FORMAT Values") == "(8192)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2

    node.query("RENAME TABLE hdfs_renamed TO hdfs_test")
    assert node.query("CHECK TABLE hdfs_test FORMAT Values") == "(1)"

    node.query("DETACH TABLE hdfs_test")
    node.query("ATTACH TABLE hdfs_test")
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(8192)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2

    node.query("TRUNCATE TABLE hdfs_test")
    assert node.query("SELECT count(*) FROM hdfs_test FORMAT Values") == "(0)"

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD
def cluster():
    try:
        cluster = ClickHouseCluster(__file__)
        cluster.add_instance(
            "node1",
            main_configs=["configs/config.d/storage_conf.xml"],
            macros={"replica": "node1"},
            with_zookeeper=True,
            with_hdfs=True,
        )
        cluster.add_instance(
            "node2",
            main_configs=["configs/config.d/storage_conf.xml"],
            macros={"replica": "node2"},
            with_zookeeper=True,
            with_hdfs=True,
        )
        logging.info("Starting cluster...")
        cluster.start()
        if cluster.instances["node1"].is_debug_build():
            # https://github.com/ClickHouse/ClickHouse/issues/27814
            pytest.skip(
                "libhdfs3 calls rand function which does not pass harmful check in debug build"
            )
        logging.info("Cluster started")

        fs = HdfsClient(hosts=cluster.hdfs_ip)
        fs.mkdirs("/clickhouse1")
        fs.mkdirs("/clickhouse2")
        logging.info("Created HDFS directory")

        yield cluster
    finally:
        cluster.shutdown()
def test_simple_insert_select(cluster, min_rows_for_wide_part, files_per_part):
    create_table(cluster, "hdfs_test",
                 additional_settings="min_rows_for_wide_part={}".format(min_rows_for_wide_part))

    node = cluster.instances["node"]

    values1 = generate_values('2020-01-03', 4096)
    node.query("INSERT INTO hdfs_test VALUES {}".format(values1))
    assert node.query("SELECT * FROM hdfs_test order by dt, id FORMAT Values") == values1

    fs = HdfsClient(hosts=cluster.hdfs_ip)

    hdfs_objects = fs.listdir('/clickhouse')
    print(hdfs_objects)
    assert len(hdfs_objects) == FILES_OVERHEAD + files_per_part

    values2 = generate_values('2020-01-04', 4096)
    node.query("INSERT INTO hdfs_test VALUES {}".format(values2))
    assert node.query("SELECT * FROM hdfs_test ORDER BY dt, id FORMAT Values") == values1 + "," + values2

    hdfs_objects = fs.listdir('/clickhouse')
    assert len(hdfs_objects) == FILES_OVERHEAD + files_per_part * 2

    assert node.query("SELECT count(*) FROM hdfs_test where id = 1 FORMAT Values") == "(2)"
def upload_txt_to_hdfs(arr):
    client = HdfsClient(hosts="localhost:50070", user_name="Alphalbj")
    name = "/words/words-" + datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + ".txt"
    content = ""
    for word in arr:
        content += word + " "
    client.create(name, content.encode('utf-8'))
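# Usage sketch for upload_txt_to_hdfs(); assumes the same local namenode on port 50070 and
# reads back the most recently written file to verify the upload. The word list is illustrative.
if __name__ == '__main__':
    upload_txt_to_hdfs(["hello", "hdfs", "world"])
    client = HdfsClient(hosts="localhost:50070", user_name="Alphalbj")
    latest = sorted(client.listdir("/words"))[-1]
    print(client.open("/words/" + latest).read().decode('utf-8'))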
def __init__(self, cur_database_param):
    # super(HdfsClients, self).__init__()
    # self.quert_db_info = super(HdfsClients, self).getDBConfig()
    # self.hdfsHost=self.quert_db_info["host"]
    hdfsHost = cur_database_param['url']
    path = cur_database_param['dbname']
    self.hdfs = HdfsClient(hosts='{hdfs_host}'.format(hdfs_host=hdfsHost))
    self.host = hdfsHost
    self.path = path
def load_fields_with_vocab(self, hdfs_host: str) -> Dict[str, Field]:
    fs = HdfsClient(hdfs_host)
    if fs.exists(self.fields_path):
        print(f'get fields from {hdfs_host}{self.fields_path}')
    else:
        raise Exception(f'there are no fields in {hdfs_host}{self.fields_path}')
    loaded_dict = json.loads(fs.open(self.fields_path).read())
    return {k: self.dict_to_field(v) for k, v in loaded_dict.items()}
def wait_for_delete_hdfs_objects(cluster, expected, num_tries=30):
    fs = HdfsClient(hosts=cluster.hdfs_ip)
    while num_tries > 0:
        num_hdfs_objects = len(fs.listdir('/clickhouse'))
        if num_hdfs_objects == expected:
            break
        num_tries -= 1
        time.sleep(1)
    assert len(fs.listdir('/clickhouse')) == expected
def Copy_To_Local(file):
    '''Download a file from Hadoop.'''
    client = HdfsClient(hosts='localhost:50070')  # connect to HDFS
    if os.path.exists(file):
        os.remove(file)  # if the file already exists locally, delete it first
    client.copy_to_local(file, './')
def wait_for_hdfs_objects(cluster, fp, expected, num_tries=30):
    fs = HdfsClient(hosts=cluster.hdfs_ip)
    while num_tries > 0:
        num_hdfs_objects = len(fs.listdir(fp))
        if num_hdfs_objects == expected:
            break
        num_tries -= 1
        time.sleep(1)
    assert len(fs.listdir(fp)) == expected
def __load_corpus_from_hdfs(self, hdfs_host: str) -> List:
    fs = HdfsClient(hdfs_host)
    with fs.open(self.corpus_path) as fp:
        corpus = list()
        for line in tqdm(fp.read().decode().split('\n')):
            if line:
                d = json.loads(line)
                corpus.append(d)
    return corpus
def load_model(self, train_dir, modelnum, appendix):
    print('~' * 100)
    c3_path = f'/user/{self.username}/fortuna/model/{train_dir}_{modelnum}/model_e{appendix}'
    print(c3_path)
    fs = HdfsClient(self.C3_HDFS_HOST, user_name=self.username)
    model_pickle = fs.open(c3_path)
    model_dict = pickle.load(model_pickle)
    self.model.load_state_dict(model_dict)
    acc_lst, total, prec, recall, f1score, f1s, rocauc = self.eval(self.test_iter, len(self.task.te_dataset))
    print('~' * 100)
def setUp(self):
    self.hdfs_file_path = '../../.vscode/hdfsInfo.json'
    self.hdfs_config = None
    try:
        with open(self.hdfs_file_path, 'r') as file:
            self.hdfs_config = json.load(file)
    except Exception as exception:
        print(exception)

    self.hdfs_client = HdfsClient(hosts='{0}:{1}'.format(self.hdfs_config['host'], '50070'),
                                  user_name=self.hdfs_config['userName'])
def load_fields_from_c3(self) -> Tuple[Dict[str, Field], Dict[str, int]]:
    # note: this returns both the fields and the max vocab indexes (requires typing.Tuple)
    fs = HdfsClient(self.C3_HDFS_HOST, user_name=self.user_name)
    if fs.exists(self.c3_fields_path):
        print(f'get fields from {self.C3_HDFS_HOST}{self.c3_fields_path}')
    else:
        raise Exception(f'there are no fields in {self.C3_HDFS_HOST}{self.c3_fields_path}')
    loaded_dict = json.loads(fs.open(self.c3_fields_path).read())
    print(loaded_dict)
    max_vocab_indexes = {k: v['max_vocab_index'] for k, v in loaded_dict.items()}
    return {k: self.dict_to_field(v) for k, v in loaded_dict.items()}, max_vocab_indexes
def _setup_walk(self, client: HdfsClient) -> Callable[..., str]:
    def path(*args: str) -> str:
        return posixpath.join(TEST_DIR, *args)

    self._make_empty_dir(client)
    client.create(path("f1"), b"")
    client.mkdirs(path("a1", "b1"))
    client.create(path("a1", "b1", "f2"), b"")
    client.mkdirs(path("a1", "b2"))
    client.mkdirs(path("a2"))
    return path
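# A sketch of how a test might consume the layout built by _setup_walk(); the client setup
# and the exact assertions are illustrative, not part of the original test class.
def test_walk_layout(self) -> None:
    client = HdfsClient(hosts='localhost:50070')  # assumed test namenode
    path = self._setup_walk(client)
    listing = {root: (sorted(dirs), sorted(files)) for root, dirs, files in client.walk(path())}
    self.assertEqual(listing[path()], (["a1", "a2"], ["f1"]))
    self.assertEqual(listing[path("a1", "b1")], ([], ["f2"]))
    self.assertEqual(listing[path("a1", "b2")], ([], []))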
def load_matrix(self, filepath, shape=None):
    if os.environ['local'] == '1' and os.path.exists(filepath):
        # np.float was only an alias for the builtin float and was removed in NumPy 1.24
        return np.loadtxt(filepath, dtype=float)
    else:
        hosts = os.environ['hosts']
        if len(hosts) == 0:
            hosts = 'master'
        client = HdfsClient(hosts=hosts)
        if client.exists(filepath):
            # np.frombuffer replaces the deprecated np.fromstring for binary input
            return np.frombuffer(client.open(filepath).read()).reshape(shape)
        return False
def crawler(word, products_list=[]):
    """Crawl product data from Yihaodian (yhd.com)."""
    word = urllib.parse.quote(word)
    url = 'https://search.yhd.com/c0-0/k{0}'.format(word)
    # fetch the HTML source
    html_doc = requests.get(url).text
    # build an xpath-capable document
    selector = html.fromstring(html_doc)
    # product list
    ul_list = selector.xpath('//div[@id="itemSearchList"]/div')
    # parse the data
    for li in ul_list:
        # title
        title = li.xpath('div//p[@class="proName clearfix"]/a/@title')
        # link
        link = li.xpath('div//p[@class="proName clearfix"]/a/@href')
        # price
        price = li.xpath('div//p[@class="proPrice"]/em/@yhdprice')
        with open("p_price", "a", encoding="gbk") as f:
            for j in range(len(price)):
                f.write(price[j] + "\n")
        if len(title) > 0 and len(link) > 0 and len(price) > 0:
            products_list.append({
                'title': title[0],
                'price': price[0],
                'link': 'https:' + link[0],
                'referer': '1号店'
            })
    client = HdfsClient(hosts='222.27.166.209:50070', user_name='hadoop')
    client.copy_from_local('/home/hadoop/Downloads/PriceCompaer/p_price', '/p_price.txt')
def _load_preprocessed(self) -> List[Example]:
    fs = HdfsClient(self.C3_HDFS_HOST, user_name=self.user_name)
    if fs.exists(self.c3_path):
        print(f'get preprocessed corpus from {self.C3_HDFS_HOST}{self.c3_path}')
    else:
        raise Exception(f'there are no preprocessed in {self.C3_HDFS_HOST}{self.c3_path}')
    preprocessed = []
    for line in fs.open(self.c3_path).read().decode().split('\n'):
        if line:
            ex = Example()
            for k, v in json.loads(line).items():
                setattr(ex, k, v)
            preprocessed.append(ex)
    return preprocessed
class DeleteHdfsData():
    # initialization
    def __init__(self):
        self.host = "172.27.133.18"
        self.port = "8020"
        self.userName = "******"
        self.remotePath = "/user/shiyouguandao"
        self.fs = HdfsClient(self.host, self.userName)
        self.ReadHdfsFile()

    # read the directory and report how many files were removed
    def ReadHdfsFile(self):
        count = self.HdfsFileList(self.remotePath)
        localtime = time.asctime(time.localtime(time.time()))
        log.info(localtime + "\tdeleted " + str(count) + " csv files under /user/shiyouguandao...")

    # delete a single file
    def DeleteHdfsFile(self, hdfsPath):
        self.fs.delete(hdfsPath, skip_trash=False)

    # walk the directory tree and delete every file that matches the filter
    def HdfsFileList(self, path):
        count = 0
        for root, dirs, files in self.fs.walk(path):
            for file in files:
                is_csv = self.hdfsFileHandler(file)
                if is_csv:
                    self.DeleteHdfsFile(path + "/" + file)
                    count += 1
        return count

    # file filter: keep only .csv files older than one day
    def hdfsFileHandler(self, fileName):
        if fileName.endswith(".csv"):
            temp = 60 * 60 * 24  # one day, in seconds
            index = fileName.rfind("_")
            # the 10 characters right before the last "_" hold the date, e.g. "2019-09-24"
            date_str = fileName[index - len(fileName) - 10:index - len(fileName)]
            current = int(time.time())
            fileTime = int(time.mktime(time.strptime(date_str, "%Y-%m-%d")))
            if (current - fileTime) >= temp:
                return True
            else:
                return False
def cluster():
    try:
        cluster = ClickHouseCluster(__file__)
        cluster.add_instance("node",
                             main_configs=["configs/config.d/storage_conf.xml"],
                             with_hdfs=True)
        logging.info("Starting cluster...")
        cluster.start()
        logging.info("Cluster started")

        fs = HdfsClient(hosts=cluster.hdfs_ip)
        fs.mkdirs('/clickhouse')
        logging.info("Created HDFS directory")

        yield cluster
    finally:
        cluster.shutdown()
def test_alter_table_columns(cluster):
    create_table(cluster, "hdfs_test")

    node = cluster.instances["node"]
    fs = HdfsClient(hosts=cluster.hdfs_ip)

    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-03', 4096)))
    node.query("INSERT INTO hdfs_test VALUES {}".format(generate_values('2020-01-03', 4096, -1)))

    node.query("ALTER TABLE hdfs_test ADD COLUMN col1 UInt64 DEFAULT 1")
    # To ensure parts have merged
    node.query("OPTIMIZE TABLE hdfs_test")

    assert node.query("SELECT sum(col1) FROM hdfs_test FORMAT Values") == "(8192)"
    assert node.query("SELECT sum(col1) FROM hdfs_test WHERE id > 0 FORMAT Values") == "(4096)"
    wait_for_delete_hdfs_objects(cluster, FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE + FILES_OVERHEAD_PER_COLUMN)

    node.query("ALTER TABLE hdfs_test MODIFY COLUMN col1 String", settings={"mutations_sync": 2})

    assert node.query("SELECT distinct(col1) FROM hdfs_test FORMAT Values") == "('1')"
    # and file with mutation
    wait_for_delete_hdfs_objects(cluster, FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE + FILES_OVERHEAD_PER_COLUMN + 1)

    node.query("ALTER TABLE hdfs_test DROP COLUMN col1", settings={"mutations_sync": 2})

    # and 2 files with mutations
    wait_for_delete_hdfs_objects(cluster, FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE + 2)
def main_loop(args):
    '''main loop logic for trial keeper'''

    if not os.path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)

    stdout_file = open(STDOUT_FULL_PATH, 'a+')
    stderr_file = open(STDERR_FULL_PATH, 'a+')

    # Notice: we don't specify an env, which means the subprocess will inherit the current
    # environment, and that is the expected behavior.
    process = Popen(args.trial_command, shell=True, stdout=stdout_file, stderr=stderr_file)
    nni_log(LogType.Info, 'Trial keeper spawns a subprocess (pid {0}) to run command: {1}'.format(
        process.pid, shlex.split(args.trial_command)))

    while True:
        retCode = process.poll()
        # Read experiment metrics, to avoid missing metrics
        read_experiment_metrics(args.nnimanager_ip, args.nnimanager_port)

        if retCode is not None:
            nni_log(LogType.Info, 'subprocess terminated. Exit code is {}. Quit'.format(retCode))
            if NNI_PLATFORM == 'pai':
                # Copy local directory to HDFS for OpenPAI
                nni_local_output_dir = os.environ['NNI_OUTPUT_DIR']
                try:
                    hdfs_client = HdfsClient(hosts='{0}:{1}'.format(args.pai_hdfs_host, '50070'),
                                             user_name=args.pai_user_name, timeout=5)
                    if copyDirectoryToHdfs(nni_local_output_dir, args.pai_hdfs_output_dir, hdfs_client):
                        nni_log(LogType.Info, 'copy directory from {0} to {1} success!'.format(
                            nni_local_output_dir, args.pai_hdfs_output_dir))
                    else:
                        nni_log(LogType.Info, 'copy directory from {0} to {1} failed!'.format(
                            nni_local_output_dir, args.pai_hdfs_output_dir))
                except Exception as e:
                    nni_log(LogType.Error, 'HDFS copy directory got exception: ' + str(e))
                    raise e

            # Exit with the return code of the subprocess (trial)
            exit(retCode)
            break
        time.sleep(2)
def getAllFolderHaveData(client_: HdfsClient, path_: str):
    _folderPathList = []
    for _root, _dir, _files in client_.walk(path_, status=True):
        # only folders that actually contain files need to be copied
        if len(_files) > 0:
            print(_root)
            _folderPathList.append(_root)
    return utils.listUtils.joinListToStr(_folderPathList, "\n")
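# Usage sketch for getAllFolderHaveData(); the namenode address, user name and start path
# are placeholders for illustration.
if __name__ == '__main__':
    _client = HdfsClient(hosts='namenode:50070', user_name='hdfs')
    print(getAllFolderHaveData(_client, '/data'))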
def save_model(self, savemodel, model, appendix=None):
    if savemodel:
        c3_path = f'/user/{self.username}/fortuna/model/{self.trainfile}_{self.testnum}/model'
        fs = HdfsClient(self.C3_HDFS_HOST, user_name=self.username)
        if appendix:
            c3_path += f'_{appendix}'
        model_pickle = pickle.dumps(model.state_dict())
        try:
            fs.create(c3_path, model_pickle, overwrite=True)
        except Exception as e:
            print(e)
    else:
        file_name = f'data_out/model'
        if appendix:
            file_name += f'_{appendix}'
        torch.save({'model': model.state_dict(), 'task': type(self.task).__name__}, file_name)
class HDFSSErvice:
    namenode_host = "localhost"
    namenode_port = "9870"
    root_folder = "/"
    chunck_size = 100000

    def __init__(self):
        self._client = HdfsClient(hosts=self.namenode_host + ":" + self.namenode_port,
                                  user_name="root")

    def get(self, hdfs_path: str):
        file_size = self.get_file_size(hdfs_path)
        for i in range(0, file_size, self.chunck_size):
            # read one chunk at a time: `length` is a byte count, not an end offset
            file_response = self._client.open(hdfs_path, offset=i, length=self.chunck_size)
            yield file_response.read()

    def append(self, hdfs_path: str, data: bytes):
        self.create_if_not_exist(hdfs_path)
        self._client.append(hdfs_path, data)

    def create_if_not_exist(self, hdfs_path: str):
        if not self._client.exists(hdfs_path):
            self._client.create(hdfs_path, b"")

    def get_messages_number(self, hdfs_path: str):
        return int(self.get_file_size(hdfs_path) / self.chunck_size + 1)

    def get_file_size(self, hdfs_path):
        file_infos = self._client.get_content_summary(hdfs_path)
        return file_infos.length

    def test(self):
        pass
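# A minimal usage sketch for HDFSSErvice, assuming a namenode reachable at localhost:9870;
# the path and payload are illustrative only.
if __name__ == '__main__':
    service = HDFSSErvice()
    service.append('/queues/demo-topic', b'hello world\n')
    for chunk in service.get('/queues/demo-topic'):
        print(chunk.decode('utf-8', errors='replace'))
    print('chunks stored:', service.get_messages_number('/queues/demo-topic'))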
class HDBDataStore(object):
    """
    Singleton class to read and maintain datasets for Service API
    Its not a generic HBase dataset handler.
    """
    __metaclass__ = Singleton

    def __init__(self, hdfs_host, hbase_host, hbase_port_no, table_name, repo_path):
        logging.info('Open connection pool for hbase host:%s port:%d', hbase_host, hbase_port_no)
        # create connection pools
        try:
            self.conn_pool = happybase.ConnectionPool(DB_CONNECTION_POOL_SIZE,
                                                      host=hbase_host,
                                                      port=hbase_port_no,
                                                      timeout=DB_CONNECTION_TIME_OUT)
        except TException as exception:
            logging.warn("Exception throw for HBase Connection pool creation{%s}", exception.message)
        self.hbase_host = hbase_host
        self.hdfs_host = hdfs_host
        self.hbase_port_no = hbase_port_no
        self.table_name = table_name
        self.repo_path = repo_path
        self.master_dataset = list()
        self.client = HdfsClient(hosts=hdfs_host, user_name='hdfs')

    def collect(self):
        """
        Collect datasets by reading from HDFS Repo and HBase repo
        :return:
        """
        hdfs_list = self.read_data_from_repo()
        hbase_list = self.retrieve_datasets_from_hbase()
        inter_list = list()
        # find intersection and keep hbase copy
        for hbase_entry, hdfs_entry in [(hbase_entry, hdfs_entry)
                                        for hbase_entry in hbase_list
                                        for hdfs_entry in hdfs_list]:
            if hbase_entry['id'] == hdfs_entry['id']:
                # remove entries in HDFS list that matches hbase
                inter_list.append(hbase_entry)
                hdfs_list.remove(hdfs_entry)
                hbase_list.remove(hbase_entry)
        # yes intersection
        if len(inter_list) > 0:
            logging.debug("The intersection list:%s is", inter_list)
            self.master_dataset = inter_list + hdfs_list
            if len(hbase_list) != 0:
                logging.warn(" Warning Untracked datasets of size %d", len(hbase_list))
                self.master_dataset = self.master_dataset + tag_for_integrity(hbase_list)
        else:
            # god knows whats happening
            self.master_dataset = tag_for_integrity(hbase_list) + hdfs_list

    def read_data_from_repo(self):
        """
        Read data from HDFS repo_path
        :return:
        """
        repo_path = self.repo_path
        hdfs_dataset = list()
        try:
            for root, dirs, _ in self.client.walk(repo_path, topdown=True, onerror=onerror):
                for entry in dirs:
                    m_source = re.match('^source=(?P<source>.*)', entry)
                    if m_source is None:
                        continue
                    elif m_source.group('source') == '':
                        logging.warn('An empty source is present, this is not allowed. '
                                     'Something was wrong during ingestion')
                        continue
                    else:
                        item = {DATASET.ID: m_source.group('source'),
                                DATASET.POLICY: POLICY.SIZE,
                                DATASET.PATH: os.path.join(root, entry),
                                DATASET.MODE: 'keep'}
                        hdfs_dataset.append(item)
                break
        except HdfsException as exception:
            logging.warn("Error in walking HDFS File system %s", exception.message)
        return hdfs_dataset

    def retrieve_datasets_from_hbase(self):
        """
        Connect to hbase table and return list of hbase_dataset
        :return:
        """
        hbase_datasets = list()
        table_name = self.table_name
        try:
            with self.conn_pool.connection(DB_CONNECTION_TIME_OUT) as connection:
                if table_name not in connection.tables():
                    logging.info('creating hbase table %s', table_name)
                    connection.create_table(table_name, {'cf': dict()})
                table = connection.table(table_name)
                for _, data in table.scan(limit=1):
                    logging.debug('%s found', table_name)
        except TException as exception:
            logging.warn(" failed to read table from hbase error(%s):", exception.message)
            return hbase_datasets

        logging.debug('connecting to hbase to read hbase_dataset')
        for key, data in table.scan():
            item = {DATASET.ID: key,
                    DATASET.PATH: data[DBSCHEMA.PATH],
                    DATASET.POLICY: data[DBSCHEMA.POLICY],
                    DATASET.MODE: data[DBSCHEMA.MODE]}
            if item[DATASET.POLICY] == POLICY.AGE:
                item[DATASET.MAX_AGE] = int(data[DBSCHEMA.RETENTION])
            elif item[DATASET.POLICY] == POLICY.SIZE:
                item[DATASET.MAX_SIZE] = int(data[DBSCHEMA.RETENTION])
            hbase_datasets.append(item)
        logging.info(hbase_datasets)
        return hbase_datasets

    def read_datasets(self):
        """
        Return the list of datasets collected so far
        :return:
        """
        return self.master_dataset

    def read_partitions(self, data_path):
        """
        Read partitions for a HDFS dataset
        :param data_path:
        :return:
        """
        data_parts = list()
        try:
            for entry in dirwalk(self.client, data_path):
                if entry not in data_parts:
                    data_parts.append(entry)
        except HdfsException as exception:
            logging.warn("Error in walking HDFS File system for partitions errormsg:%s", exception.message)
        return data_parts

    def write_dataset(self, data):
        """
        Persist dataset entry into HBase Table
        :param data: dataset that needs update
        :return: None
        """
        try:
            logging.debug("Write dataset:{%s}", data)
            table_name = self.table_name
            with self.conn_pool.connection(DB_CONNECTION_TIME_OUT) as connection:
                table = connection.table(table_name)
                dataset = {DBSCHEMA.PATH: data[DATASET.PATH],
                           DBSCHEMA.POLICY: data[DATASET.POLICY],
                           DBSCHEMA.MODE: data[DATASET.MODE]}
                if DATASET.RETENTION in data:
                    dataset[DBSCHEMA.RETENTION] = data[DATASET.RETENTION]
                logging.debug("calling put on table for %s", dataset)
                table.put(data[DATASET.ID], dataset)
        except TException as exception:
            logging.warn("Failed to write dataset into hbase, error(%s):", exception.message)

    def delete_dataset(self, data):
        """
        Delete dataset entry from HBase.
        :param data: dataset instance
        :return: None
        """
        try:
            table_name = self.table_name
            with self.conn_pool.connection(DB_CONNECTION_TIME_OUT) as connection:
                table = connection.table(table_name)
                logging.debug("Deleting dataset from HBase:{%s}", data)
                table.delete(data['id'])
        except TException as exception:
            logging.warn("Failed to delete dataset in hbase, error(%s):", exception.message)