def client_init(user):
    session = requests.session()
    session.keep_alive = False
    client = pyhdfs.HdfsClient(hosts="192.168.7.150:9870", user_name=user,
                               requests_session=session)
    return client
def __init__(self):
    hosts = ",".join(["{}:{}".format(host, HDFS_PORT) for host in HDFS_HOST])
    self.client = pyhdfs.HdfsClient(hosts=hosts, user_name="szliu",
                                    max_tries=5, retry_delay=5)
def CCP():
    fs = pyhdfs.HdfsClient(hosts="127.0.0.1:50070", user_name="kali")
    f = fs.open("/user/kali/lab2/outputCCP/part-00000")
    arr = []
    for line in f:
        strin = ", ".join(line.decode()[0:-1].split("\t"))
        pairs = strin.split(", ")
        pairs[2] = int(pairs[2])
        arr.append(pairs)
    arr.sort(key=lambda x: x[2], reverse=True)
    # for item in arr:
    #     print(item)
    check_word = input("Write item's name: ")
    count = 0
    for i in range(len(arr)):
        if check_word.lower() == arr[i][0].lower():
            count += 1
            print(f"{count}:\t{arr[i][1]}: {arr[i][2]}")
        elif check_word.lower() == arr[i][1].lower():
            count += 1
            print(f"{count}:\t{arr[i][0]}: {arr[i][2]}")
        if count == 10:
            break
def SchemaExtractor(hdfs_host=HDFS_HOST, hdfs_port=HDFS_IPC_PORT, path=None):
    '''
    Infer the schema of a Parquet file from its metadata
    :param hdfs_host:
    :param hdfs_port:
    :param path: path to the parquet file
    :return: schema (String)
    '''
    sqlContext = SQLContext(sc)
    seperator = ':'
    hdfs_path = [hdfs_host, str(hdfs_port)]
    fs = pyhdfs.HdfsClient(hosts=seperator.join(hdfs_path), user_name='spark24')
    file_list = []
    for root, _, filenames in fs.walk(path):
        for filename in filenames:
            if filename.endswith(".parquet"):
                file_list.append(root + '/' + filename)
    single_parquet_path = file_list[0]
    df = sqlContext.read.parquet('hdfs://' + hdfs_host + single_parquet_path)
    schema = ''
    for name, dtype in df.dtypes:
        schema = schema + ' ' + str(name) + ' ' + str(dtype) + ','
    schema = schema[:-1]
    return schema
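# A minimal, hypothetical usage sketch for SchemaExtractor above: the host, port, and
# Parquet directory are placeholders, and `sc`, SQLContext, and the pyhdfs import are
# assumed to exist in the surrounding module.
def print_parquet_schema_example():
    # Walks the directory for part files, reads the first one, and prints a
    # comma-separated "name type" schema string such as " id bigint, name string".
    schema = SchemaExtractor(hdfs_host="namenode-1", hdfs_port=50070,
                             path="/warehouse/events_parquet")
    print(schema)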
def handleFileFromHdfs(fileName, rootFolder, jsonData={}, userName='******',
                       hdfsHost="spark-master0", nnPort="50070"):
    '''
    remove or rename the file from hdfs
    '''
    try:
        client = pyhdfs.HdfsClient(hosts="{0}:{1}".format(hdfsHost, nnPort))
    except Exception:
        logger.error("Exception: {0}, Traceback: {1}".format(
            sys.exc_info(), traceback.format_exc()))
        return False
    if not jsonData:
        '''
        check the generated file if exist
        '''
        FolderUri = "{0}/{1}/{2}".format(rootFolder, userName, fileName)
        if client.exists(FolderUri):
            return True
        return False
    if rootFolder.startswith('/tmp/users'):
        csvFolderUri = "{0}/{1}/csv/{2}".format(rootFolder, userName, fileName)
        parquetFolderUri = "{0}/{1}/parquet/{2}".format(
            rootFolder, userName, fileName)
        if jsonData['method'] == 'delete':
            deleteHdfsFile(client, csvFolderUri)
            deleteHdfsFile(client, parquetFolderUri)
            return True
        elif jsonData['method'] == 'rename':
            newname = jsonData['newname']
            csvRs = renameHdfsFile(client, csvFolderUri, newname)
            parquetRs = renameHdfsFile(client, parquetFolderUri, newname)
            if csvRs and parquetRs:
                return True
            return False
    elif rootFolder.startswith('/users'):
        username = jsonData['username'] if 'username' in jsonData.keys() else 'yzy'
        FolderUri = "{0}/{1}/{2}".format(rootFolder, userName, fileName)
        if jsonData['method'] == 'delete':
            checkOrDeleteView(fileName, username, delete=True)
            return deleteHdfsFile(client, FolderUri)
        elif jsonData['method'] == 'rename':
            newname = jsonData['newname']
            return renameHdfsFile(client, FolderUri, newname, username=username)
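# Hypothetical calls to handleFileFromHdfs above; the user and file names are
# placeholders. An empty jsonData means "check existence", otherwise jsonData['method']
# selects delete or rename.
def _handle_file_examples():
    exists = handleFileFromHdfs("report.csv", "/users", userName="demo")  # existence check
    handleFileFromHdfs("report", "/tmp/users", jsonData={"method": "delete"},
                       userName="demo")  # drop csv and parquet copies
    handleFileFromHdfs("report.csv", "/users",
                       jsonData={"method": "rename", "newname": "report_v2.csv",
                                 "username": "demo"},
                       userName="demo")  # rename in place
    return exists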
def seplunk_start(config_path):
    if not os.path.exists(config_path):
        LOGGER.error("config file path does not exist")
        sys.exit(1)
    config = get_configure(config_path)
    default_fs = config.get('fs.defaultFS', None)
    if not default_fs:
        # abort here; otherwise the split below would fail on None
        LOGGER.error("hdfs not found")
        sys.exit(1)
    host = default_fs.split(':')[1].strip("/")
    hdfs_client = pyhdfs.HdfsClient(host)
    if not os.path.exists(TMP_PATH):
        os.mkdir(TMP_PATH)
    db_path = os.path.join(TMP_PATH, DB_FLIE)
    # if os.path.exists(db_path):
    #     os.remove(db_path)
    conn = sql.get_conn(db_path)
    p_log_conn, p_monitor_conn = Pipe()
    p_log = Process(target=create_process_log,
                    args=(hdfs_client, config, conn, p_log_conn))
    p_log.start()
    p_monitor = Process(target=create_process_monitor,
                        args=(hdfs_client, config, conn, p_monitor_conn))
    p_monitor.start()
def hdfs_get_file(remote_path, filename, local_path, delete=False,
                  hdfs_path='10.20.37.175', port=9000):
    # HdfsClient expects a "host:port" hosts string rather than separate
    # positional host and port arguments
    hdfs_client = pyhdfs.HdfsClient(hosts="{}:{}".format(hdfs_path, port))
    try:
        hdfs_client.copy_to_local(remote_path + filename, local_path + filename)
    except Exception:
        time.sleep(2)
        hdfs_client.copy_to_local(remote_path + filename, local_path + filename)
    print("load completed!")
    if delete:
        files = hdfs_client.listdir(remote_path)
        while filename in files:
            try:
                hdfs_client.delete(remote_path + filename)
                files = hdfs_client.listdir(remote_path)
                time.sleep(2)
            except Exception:
                files = hdfs_client.listdir(remote_path)
    return local_path + filename
def _init_client(self):
    s = requests.session()
    s.keep_alive = False
    client = pyhdfs.HdfsClient(hosts=HDFS_HOSTS, user_name=self.user,
                               requests_session=s)
    return client
def up_hdfs(self, new_file):
    r"""
    Upload the file to Hadoop
    """
    # connect to HDFS
    client = pyhdfs.HdfsClient(hosts='hadoop2x-01:50070,hadoop2x-02:50070',
                               user_name='suh')
    now_time = datetime.datetime.now().strftime("%Y%m%d")
    year = time.strftime('%Y', time.localtime(time.time()))
    HdfsDir = r'/RawData/aiaa_meeting/big_json/%s/%s' % (year, now_time)
    if not client.exists(HdfsDir):
        client.mkdirs(HdfsDir)
    print('Before !%s' % client.listdir(HdfsDir))
    # upload the local file to the cluster
    local_path = new_file
    up_path = HdfsDir + '/%s.big_json' % (now_time)
    big_json_size = os.path.getsize(local_path)
    if big_json_size != 0:
        if client.exists(up_path):
            client.delete(up_path)
        client.copy_from_local(local_path, up_path)
        print('After !%s' % client.listdir(HdfsDir))
        msg = 'Successfully uploaded to %s' % HdfsDir
        self.msg2weixin(msg)
    else:
        print("Nothing new, skipping upload")
def get_a_file():
    fs = pyhdfs.HdfsClient(hosts='localhost:9870', user_name='hadoop')
    lines = list()
    with fs.open('/output/part-00000') as f:
        for line in f:
            lines.append(line.decode("utf-8").strip('\n'))
    return lines
def main(keywords: str, tag_type: str) -> None:
    """
    main function to get data from postgres and move it into hdfs

    Parameters
    ----------
    keywords : str
        tag string
    tag_type : str
        partition in hive

    See Also
    ----------
    get_data : get poi related data from POI table
    """
    res = get_data(keywords)
    with TemporaryDirectory() as dirname:
        # generate txt file
        res.to_csv(os.path.join(dirname, f'POI码表_{keywords}_每日扫描版.txt'),
                   header=None, index=None, sep=',', mode='w')
        # move txt file into hdfs
        hdfs_path = (f'/user/hive/warehouse/poi.db/code_aoi_geohash/'
                     f'tag_type={tag_type}/POI码表_{keywords}_每日扫描版.txt')
        hdfs = pyhdfs.HdfsClient(['10.244.16.101', '10.244.16.102'], user_name='hdfs')
        hdfs.copy_from_local(os.path.join(dirname, f'POI码表_{keywords}_每日扫描版.txt'),
                             hdfs_path, overwrite=True)
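# Hypothetical invocation of main() above; "park" and "poi" are placeholder values for
# the POI keyword and the Hive partition, and the POI table must already be reachable.
if __name__ == "__main__":
    main(keywords="park", tag_type="poi")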
def file_upload(self, host, user_name, local_path, hdfs_path):
    print("===== file upload start =====")
    fs = pyhdfs.HdfsClient(hosts=host, user_name=user_name)
    print(fs.get_active_namenode())
    print(fs.listdir('/'))
    fs.copy_from_local(local_path, hdfs_path)
    print("==== file upload finish =====")
def upload_file_to_hdfs(hdfs_path, src_path):
    """
    Upload files to HDFS, creating the target directory if it does not exist

    Arguments:
        hdfs_path {string} -- HDFS directory
        src_path {string} -- source directory

    Raises:
        ValueError -- raised when `hdfs_path` is not a Linux-style file path
        e -- exception from the `pyhdfs` module
    """
    if not hdfs_path.startswith("/"):
        raise ValueError(
            '`hdfs_path` must be a valid Hadoop path, e.g. /RawData/chaoxing/duxiu_ts/big_htm'
        )
    name_node = cf.get('hadoop', 'namenode')
    try:
        client = pyhdfs.HdfsClient(hosts=name_node)
        if not client.exists(hdfs_path):
            # create the directory on HDFS if it does not exist yet
            client.mkdirs(hdfs_path)
        for _, local_file in file_list(src_path):
            filename = os.path.basename(local_file)
            dst_file = hdfs_path + '/' + filename
            client.copy_from_local(local_file, dst_file, overwrite=True)
    except pyhdfs.HdfsException as e:
        raise e
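# Hedged usage sketch for upload_file_to_hdfs above; both directories are placeholders
# and the NameNode address comes from the same `cf` config the function reads.
def _upload_example():
    # Copies every file returned by file_list() into the HDFS directory, creating it
    # first if necessary and overwriting files that already exist.
    upload_file_to_hdfs("/RawData/example/big_htm", "/data/local/big_htm")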
def __init__(self, spark):
    self.spark = spark
    self.precision_span = 60
    self.timestack_span = 600
    self.HDFS_CLIENT = pyhdfs.HdfsClient(hosts=hdfs_host + ':9870', user_name='hdfs')
    self.HDFS_AUDIT = hdfs_audit_path
    self.HDFS_IM = hdfs_im_path
def client_init(hosts):
    session = requests.session()
    session.keep_alive = False
    client = pyhdfs.HdfsClient(hosts=hosts, user_name="hdfs",
                               requests_session=session)
    return client
def __init__(self, host: str = "localhost", port: int = 9870,
             user_name="root", **kwargs):
    super().__init__()
    self.host = host
    self.port = port
    self.user = user_name
    self.home = f"/user/{self.user:s}"
    self.fs = pyhdfs.HdfsClient(
        hosts=[f"{host:s}:{port:d}", "localhost:9870"],
        user_name=user_name, **kwargs)
def get_conn(hosts, user):
    """
    Get a client for the Hadoop distributed file system.

    Parameters:
        hosts: a string like 'host1:port1,host2:port2'; use ',' to separate servers
        user: the Hadoop user name
    """
    return pyhdfs.HdfsClient(hosts=hosts, user_name=user)
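# Minimal usage sketch for get_conn above; the NameNode addresses and user are
# placeholders. Several "host:port" pairs can be joined with ',' so pyhdfs can fail
# over between NameNodes.
client = get_conn("namenode-1:9870,namenode-2:9870", "hdfs")
print(client.listdir("/"))  # list the HDFS root as a quick connectivity check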
def up_hdfs(self):
    # connect to HDFS
    client = pyhdfs.HdfsClient(hosts='hadoop2x-01:50070,hadoop2x-02:50070',
                               user_name='suh')
    # cluster directory listing before uploading the local file
    print('Before !%s' % client.listdir('/RawData/asceproceedings/big_json/2019/20190812'))
    # upload the local file to the cluster
    client.copy_from_local(r'D:\code\proceedings\big_json\20190812_1.big_json',
                           '/RawData/asceproceedings/big_json/2019/20190812/20190812_1.big_json')
    # cluster directory listing after the upload
    print('After !%s' % client.listdir('/RawData/asceproceedings/big_json/2019/20190812'))
def savetohdfs(rdd):
    # the hosts string uses "host:port" form
    client = pyhdfs.HdfsClient(hosts="10.120.14.120:50070", user_name="cloudera")
    for r in rdd:
        for t in r:
            client.append(
                "/user/cloudera/model_deploy/output/utime.csv",
                "This patient needs {} seconds to detect from MRI\n".format(str(t)))
def __init__(self, spark):
    self.spark = spark
    self.HDFS_CLIENT = pyhdfs.HdfsClient(hosts=HDFS_HOST + ':9870', user_name='hdfs')
    self.HDFS_AUDIT = HDFS_AUDIT_COMPRESS_PATH
    self.HDFS_IM = HDFS_IM_COMPRESS_PATH
    self.probe_fields_change = [['start_time', 'probe_time']]
    self.wifi_fields_change = [['collect_time', 'probe_time'],
                               ['wifi_mac', 'probe_mac']]
def hdfs_init_total(remote_path, hdfs_path="10.20.37.175", port=9000):
    hdfs_client = pyhdfs.HdfsClient(hosts="{}:{}".format(hdfs_path, port))
    try:
        files = hdfs_client.listdir(remote_path)
        for k in files:
            hdfs_client.delete(remote_path + k)
        hdfs_client.delete(remote_path)
    except Exception:
        return
def judgeIcon(request, hdfsHost="spark-master0", nnPort="50070",
              csvUrl="/tmp/users", url="/users", userName="******"):
    '''
    Return the icon states (whether each icon is lit)
    '''
    jsonData = request.data
    logger.debug('jsondata: {0}'.format(jsonData))
    if request.method == 'POST':
        username = jsonData['username'] if 'username' in jsonData.keys() else 'yzy'
        second = 0
        third = 0
        fourth = 1
        if username in Singleton().dataPaltForm.keys():
            for key, value in Singleton().dataPaltForm[username].items():
                if judgeConn(value.con):
                    second = 1
                    break
        csvUrl = '{0}/{1}'.format(csvUrl, userName)
        url = '{0}/{1}'.format(url, userName)
        client = pyhdfs.HdfsClient(hosts="{0}:{1}".format(hdfsHost, nnPort))
        if client.exists(url):
            fileList = client.listdir(url)
            if fileList:
                second = 1
                third = 1
        if second == 0:
            if client.exists('{0}/{1}'.format(csvUrl, 'parquet')):
                parquetList = client.listdir('{0}/{1}'.format(csvUrl, 'parquet'))
                if parquetList:
                    second = 1
        folderList = DashboardFolderByUser.objects.filter(username=username)
        viewList = DashboardViewByUser.objects.filter(username=username)
        indexList = DashboardIndexByUser.objects.filter(username=username)
        logger.debug('folderList: {0}, viewList: {1}, indexList: {2}'.format(
            folderList, viewList, indexList))
        if (len(folderList) == 0 and len(viewList) == 0
                and len(indexList) == 0) or third == 0:
            fourth = 0
        context = {
            "status": "success",
            "results": {
                "constructview": second,
                "dashboardview": third,
                "statementview": fourth
            }
        }
        return JsonResponse(context)
def update_track_talbe(self, df_track_output):
    """Run the update: list the rows that need updating and append the new rows to the track table"""
    # Data of the old track table:
    # df_track_old[id,probe_time,probe_type,probe_data,probe_device_mac,netbar_wacode,longitude,latitude,device_type,create_time,datasource_id,datasource_table_name,base_person_id,flag]
    # Eventually the data read into df_track_old (the track table) is maintained in HDFS
    # and written back to both HDFS and PostgreSQL.
    client = pyhdfs.HdfsClient(hosts=self.PYHDFS_HOST, user_name='hdfs')
    if client.exists(self.HDFS_ST_TRACK_INFO_SUB):
        df_track_old = self.spark.read.parquet(self.HDFS_ST_TRACK_INFO)
    else:
        df_track_old = self.spark.read.jdbc(
            url=self.PGSQL_URL_analysis,
            table=self.PGSQL_TABLE_TRACK,
            properties=self.PGSQL_PROPERTIES).limit(0).withColumn(
                "flag", functions.lit(0))
    # df_track_old = self.spark.read.csv(self.HDFS_ST_TRACK, head=True)
    df_track_old = df_track_old[df_track_old["datasource_table_name"] == "gd_ele_fence"]
    df_track_old = df_track_old.withColumn(
        "device_type", df_track_old.device_type.astype("long"))
    # _df_track_old[netbar_wacode,probe_time,probe_data,probe_type,datasource_table_name,longitude,latitude,device_type,probe_device_mac,flag]
    _df_track_old = df_track_old.select("netbar_wacode", "probe_time", "probe_data",
                                        "probe_type", "datasource_table_name",
                                        "longitude", "latitude", "device_type",
                                        "probe_device_mac", "flag")  # , "last_update_time"
    # df_track_output[netbar_wacode,probe_time,probe_data,probe_type,create_time,datasource_table_name,longitude,latitude,device_type,probe_device_mac,flag]
    df_track_output = df_track_output.drop("create_time").drop("count")
    # # df_track_output[netbar_wacode,probe_time,probe_data,probe_type,datasource_table_name,longitude,latitude,device_type,probe_device_mac,flag]
    # # Remove the new rows from the old table, leaving the old rows that have been
    # # updated, to drive the UPDATE step.
    # # The rows that need updating also appear in the append set, so they have to be
    # # removed from it.
    # df_track_update = _df_track_old.subtract(df_track_output)
    # # df_track_update[netbar_wacode,probe_time,probe_data,probe_type,datasource_table_name,longitude,latitude,device_type,probe_device_mac,flag,create_time]
    # df_track_update = df_track_update.withColumn("create_time", functions.lit(int(time.time() * 1000)))
    # df_track_update = df_track_update.join(df_track_old.select("id", "probe_time", "probe_data", "probe_device_mac", "flag"), ["probe_time", "probe_data", "probe_device_mac", "flag"], "left")
    # df_track_update.select("id", "probe_time").show()
    # # Later, join against the old track data to obtain the id, then update time_on
    # # (i.e. time_end) by id.
    # subtract automatically de-duplicates the source DataFrame.
    # !!! How to handle this? Keep start and end in the DataFrame.
    # If the two DataFrames' schemas are not exactly identical (e.g. nullability),
    # subtract will not compute the difference.
    df_track_append = df_track_output.subtract(_df_track_old)
    # df_track_output.show()
    # _df_track_old.show()
    df_track_append = df_track_append.withColumn(
        "device_type", df_track_append.device_type.astype("long"))
    df_track_append = df_track_append.withColumn(
        "create_time", functions.lit(int(time.time() * 1000)))
    # This version still carries the flag column; drop it before inserting into the track table.
    # df_track_append.write.mode(saveMode='append').jdbc(url=self.PGSQL_URL_analysis, table="zhaoqing_duanzhou_db.track_copy2", properties=self.PGSQL_PROPERTIES)
    df_track_append.drop("flag").write.mode(saveMode='append').jdbc(
        url=self.PGSQL_URL_analysis, table=self.PGSQL_TABLE_TRACK,
        properties=self.PGSQL_PROPERTIES)
    df_track_append.write.parquet(self.HDFS_ST_TRACK_INFO, mode="append")
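# A small, self-contained sketch of the subtract-based append logic used above
# (illustrative only; the columns are placeholders). subtract() keeps rows of the new
# batch that do not already exist in the old track data and also removes duplicates,
# and both sides need matching schemas (including nullability) for the difference to
# behave as expected.
def _append_rows_example(spark):
    old = spark.createDataFrame([(1, "mac-a"), (2, "mac-b")],
                                ["probe_time", "probe_mac"])
    new = spark.createDataFrame([(2, "mac-b"), (3, "mac-c"), (3, "mac-c")],
                                ["probe_time", "probe_mac"])
    to_append = new.subtract(old)  # only (3, "mac-c") survives
    return to_append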
def __init__(self):
    conf = SparkConf().setAppName('deviceupdate')
    conf.set("spark.executor.memory", "3g")
    conf.set("spark.driver.memory", "3g")
    conf.set("spark.executor.cores", "3")
    self.spark = SparkSession.builder.config(conf=conf).getOrCreate()
    self.PGSQL_URL = f"jdbc:postgresql://{DST_DB_HOST}:{DST_DB_PORT}/{DST_DB_DATABASE}"
    self.PGSQL_DEVICE_TABLE = DST_DB_SCHEMA + ".gd_device"
    self.PGSQL_PLACE_TABLE = DST_DB_SCHEMA + ".gd_place"
    self.PGSQL_PROPERTIES = {'user': DST_DB_USER, 'password': DST_DB_PASSWORD}
    self.HDFS_CLIENT = pyhdfs.HdfsClient(hosts=HDFS_HOST + ':9870', user_name='hdfs')
def hdfs_test():
    fs = pyhdfs.HdfsClient(hosts="localhost:9000")
    f = fs.open('/common_step3/part-00000')
    for i in f:
        line = str(i, encoding='utf-8')
        # print(j)
        rules, confidence = line.split('\t')
        left_rules, right_rules = rules.split('->')
        print('{0}---->{1}---->{2}'.format(left_rules, right_rules,
                                           float(confidence.replace('\n', ''))))
def load_wordlist(filename):
    hdfs = pyhdfs.HdfsClient(hosts='hdfs-namenode:50070')
    words = {}
    f = hdfs.open(filename)
    text = f.read().decode('utf-8')
    text = text.split('\n')
    for line in text:
        words[line] = 1
    f.close()
    return words
def hdfs_set_file(local_file_path, remote_file_path, filename,
                  hdfs_path="10.20.37.175", port=9000):
    hdfs_client = pyhdfs.HdfsClient(hosts="{}:{}".format(hdfs_path, port))
    files = hdfs_client.listdir(remote_file_path)
    if filename in files:
        hdfs_client.delete(remote_file_path + filename)
    hdfs_client.copy_from_local(local_file_path + filename,
                                remote_file_path + filename)
    print("set Completed!")
def hdfs_get_file(remote_path, filename, local_path, delete=False,
                  hdfs_path='10.20.37.175', port=9000):
    hdfs_client = pyhdfs.HdfsClient(hosts="{}:{}".format(hdfs_path, port))
    hdfs_client.copy_to_local(remote_path + filename, local_path + filename)
    print("load completed!")
    if delete:
        hdfs_client.delete(remote_path + filename)
    return local_path + filename
def _save(self, name, content):
    '''Called when a file is saved'''
    # name: the name of the uploaded file
    # content: a File object containing the uploaded file's contents
    # create an hdfs_client object
    client = pyhdfs.HdfsClient(hosts=self.hosts, user_name=self.user_name)
    # upload the file only if it has not been uploaded yet
    if not client.exists(self.file_url + content.name):
        client.create(self.file_url + content.name, content)
    # otherwise the file is already there, so just return the file name
    filename = self.file_url + content.name
    return filename
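# Hedged sketch of how _save() above fits into a Django storage backend; the class name,
# default hosts, and file_url are hypothetical placeholders.
from django.core.files.storage import Storage

class HdfsStorage(Storage):
    def __init__(self, hosts="namenode:9870", user_name="hadoop", file_url="/media/"):
        self.hosts = hosts
        self.user_name = user_name
        self.file_url = file_url
    # The _save(name, content) method shown above would be defined here; Django calls it
    # when a FileField is saved and stores the returned name in the database.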
def hdfs_init_fold(remote_path, hdfs_path="10.20.37.175", port=9000):
    hdfs_client = pyhdfs.HdfsClient(hosts="{}:{}".format(hdfs_path, port))
    try:
        files = hdfs_client.listdir(remote_path)
    except Exception:
        hdfs_client.mkdirs(remote_path)
        return
    if files == []:
        return
    else:
        for k in files:
            hdfs_client.delete(remote_path + k)
        return