Example #1
def client_init(user):
    session = requests.session()
    session.keep_alive = False
    client = pyhdfs.HdfsClient(hosts="192.168.7.150:9870",
                               user_name=user,
                               requests_session=session)
    return client
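
A minimal usage sketch for the helper above, assuming the NameNode at 192.168.7.150:9870 is reachable; the user name and the scratch directory are placeholders.

client = client_init("hdfs")            # hypothetical user name
print(client.get_active_namenode())     # which NameNode actually answered
print(client.listdir("/"))              # top-level entries
if not client.exists("/tmp/demo"):      # placeholder scratch directory
    client.mkdirs("/tmp/demo")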
Example #2
 def __init__(self):
     hosts = ",".join(
         ["{}:{}".format(host, HDFS_PORT) for host in HDFS_HOST])
     self.client = pyhdfs.HdfsClient(hosts=hosts,
                                     user_name="szliu",
                                     max_tries=5,
                                     retry_delay=5)
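
The constructor above depends on two module-level settings that the snippet does not show; a hedged sketch of what they might look like (host names and port are purely illustrative).

# Hypothetical values; the real ones come from the project's configuration.
HDFS_HOST = ["namenode1.example.com", "namenode2.example.com"]  # HA NameNode hosts
HDFS_PORT = 9870  # WebHDFS port (50070 on Hadoop 2.x clusters)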
Example #3
def CCP():
    fs = pyhdfs.HdfsClient(hosts="127.0.0.1:50070", user_name="kali")
    f = fs.open("/user/kali/lab2/outputCCP/part-00000")
    arr = []
    for line in f:
        pairs = line.decode().rstrip("\n").split("\t")
        pairs[2] = int(pairs[2])
        arr.append(pairs)

    arr.sort(key=lambda x: x[2], reverse=True)

    # for item in arr:
    #     print(item)

    check_word = input("Write item's name: ")
    count = 0
    for i in range(len(arr)):
        if check_word.lower() == arr[i][0].lower():
            count += 1
            print(f"{count}:\t{arr[i][1]}:  {arr[i][2]}")
        elif check_word.lower() == arr[i][1].lower():
            count += 1
            print(f"{count}:\t{arr[i][0]}:  {arr[i][2]}")
        if count == 10:
            break
Example #4
def SchemaExtractor(hdfs_host=HDFS_HOST, hdfs_port=HDFS_IPC_PORT, path=None):
    ''' Infer the schema of a Parquet file from its metadata

    :param hdfs_host: HDFS NameNode host
    :param hdfs_port: NameNode port
    :param path: path to the parquet file
    :return: schema (String)
    '''
    sqlContext = SQLContext(sc)
    separator = ':'
    hdfs_path = [hdfs_host, str(hdfs_port)]
    fs = pyhdfs.HdfsClient(hosts=separator.join(hdfs_path),
                           user_name='spark24')

    file_list = []
    for root, _, filenames in fs.walk(path):
        for filename in filenames:
            if filename.endswith(".parquet"):
                file_list.append(root + '/' + filename)

    single_parquet_path = file_list[0]

    df = sqlContext.read.parquet('hdfs://' + hdfs_host + single_parquet_path)

    schema = ''
    for name, dtype in df.dtypes:
        schema = schema + ' ' + str(name) + ' ' + str(dtype) + ','
    schema = schema[:-1]
    return schema
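
The returned string is already in `name type, name type` form, so one plausible use (a sketch only; it assumes the surrounding Spark job provides `sc`, and the table name, host, port and paths are made up) is splicing it into a CREATE TABLE statement.

schema = SchemaExtractor(hdfs_host="namenode", hdfs_port=9870, path="/warehouse/events")
ddl = ("CREATE EXTERNAL TABLE IF NOT EXISTS events ({}) "
       "STORED AS PARQUET LOCATION '/warehouse/events'".format(schema))
print(ddl)  # e.g. pass to sqlContext.sql(ddl)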
Example #5
def handleFileFromHdfs(fileName,
                       rootFolder,
                       jsonData={},
                       userName='******',
                       hdfsHost="spark-master0",
                       nnPort="50070"):
    '''
    remove or rename the file from hdfs
    '''
    try:
        client = pyhdfs.HdfsClient(hosts="{0}:{1}".format(hdfsHost, nnPort))
    except Exception:
        logger.error("Exception: {0}, Traceback: {1}".format(
            sys.exc_info(), traceback.format_exc()))
        return False

    if not jsonData:
        # check whether the generated file exists
        FolderUri = "{0}/{1}/{2}".format(rootFolder, userName, fileName)
        return client.exists(FolderUri)

    if rootFolder.startswith('/tmp/users'):
        csvFolderUri = "{0}/{1}/csv/{2}".format(rootFolder, userName, fileName)
        parquetFolderUri = "{0}/{1}/parquet/{2}".format(
            rootFolder, userName, fileName)

        if jsonData['method'] == 'delete':
            deleteHdfsFile(client, csvFolderUri)
            deleteHdfsFile(client, parquetFolderUri)
            return True

        elif jsonData['method'] == 'rename':
            newname = jsonData['newname']
            csvRs = renameHdfsFile(client, csvFolderUri, newname)
            parquetRs = renameHdfsFile(client, parquetFolderUri, newname)

            if csvRs and parquetRs:
                return True
            return False

    elif rootFolder.startswith('/users'):
        username = jsonData.get('username', 'yzy')
        FolderUri = "{0}/{1}/{2}".format(rootFolder, userName, fileName)

        if jsonData['method'] == 'delete':

            checkOrDeleteView(fileName, username, delete=True)
            return deleteHdfsFile(client, FolderUri)

        elif jsonData['method'] == 'rename':
            newname = jsonData['newname']
            return renameHdfsFile(client,
                                  FolderUri,
                                  newname,
                                  username=username)
Example #6
def seplunk_start(config_path):
    if not os.path.exists(config_path):
        LOGGER.error("config file path does not exist")
        sys.exit(1)
    config = get_configure(config_path)
    default_fs = config.get('fs.defaultFS', None)
    if not default_fs:
        LOGGER.error("hdfs not found")
        sys.exit(1)
    host = default_fs.split(':')[1].strip("/")
    hdfs_client = pyhdfs.HdfsClient(host)
    if not os.path.exists(TMP_PATH):
        os.mkdir(TMP_PATH)
    db_path = os.path.join(TMP_PATH, DB_FLIE)
    # if os.path.exists(db_path):
    #    os.remove(db_path)
    conn = sql.get_conn(db_path)

    p_log_conn, p_monitor_conn = Pipe()
    p_log = Process(target=create_process_log,
                    args=(hdfs_client, config, conn, p_log_conn))
    p_log.start()

    p_monitor = Process(target=create_process_monitor,
                        args=(hdfs_client, config, conn, p_monitor_conn))
    p_monitor.start()
Example #7
def hdfs_get_file(remote_path,
                  filename,
                  local_path,
                  delete=False,
                  hdfs_path='10.20.37.175',
                  port=9000):
    hdfs_client = pyhdfs.HdfsClient(hosts="{}:{}".format(hdfs_path, port))
    try:
        hdfs_client.copy_to_local(remote_path + filename,
                                  local_path + filename)
    except Exception:
        time.sleep(2)
        hdfs_client.copy_to_local(remote_path + filename,
                                  local_path + filename)
    print("load completed!")
    if delete:
        files = hdfs_client.listdir(remote_path)
        while filename in files:
            try:
                hdfs_client.delete(remote_path + filename)
                files = hdfs_client.listdir(remote_path)
                time.sleep(2)
            except Exception:
                files = hdfs_client.listdir(remote_path)
    return local_path + filename
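
A usage sketch of the helper above (the NameNode address comes from the function defaults; the file and directory names are placeholders). Note that `remote_path` and `local_path` are concatenated directly with `filename`, so both need a trailing `/`.

local_file = hdfs_get_file("/data/exports/", "result.csv", "/tmp/", delete=False)
print("saved to", local_file)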
Example #8
 def _init_client(self):
     s = requests.session()
     s.keep_alive = False
     client = pyhdfs.HdfsClient(hosts=HDFS_HOSTS,
                                user_name=self.user,
                                requests_session=s)
     return client
 def up_hdfs(self, new_file):
     r"""
     Upload to Hadoop.
     """
     # connect to HDFS
     client = pyhdfs.HdfsClient(hosts='hadoop2x-01:50070,hadoop2x-02:50070', user_name='suh')
     now_time = datetime.datetime.now().strftime("%Y%m%d")
     year = time.strftime('%Y', time.localtime(time.time()))
     HdfsDir = r'/RawData/aiaa_meeting/big_json/%s/%s' % (year, now_time)
     if not client.exists(HdfsDir):
         client.mkdirs(HdfsDir)
     print('Before !%s' % client.listdir(HdfsDir))
     # upload the local file to the cluster
     local_path = new_file
     up_path = HdfsDir + '/%s.big_json' % (now_time)
     big_json_size = os.path.getsize(local_path)
     if big_json_size != 0:
         if client.exists(up_path):
             client.delete(up_path)
         client.copy_from_local(local_path, up_path)
         print('After !%s' % client.listdir(HdfsDir))
         msg = 'Successfully uploaded to %s' % HdfsDir
         self.msg2weixin(msg)
     else:
         print("No update, skipping upload")
Example #10
def get_a_file():
    fs = pyhdfs.HdfsClient(hosts='localhost:9870', user_name='hadoop')
    lines = list()
    with fs.open('/output/part-00000') as f:
        for line in f:
            lines.append(line.decode("utf-8").strip('\n'))
    return lines
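
The lines returned above are raw reducer records; a short sketch (assuming the usual key<TAB>value layout) of turning them into pairs.

pairs = []
for line in get_a_file():
    key, _, value = line.partition("\t")  # assumes tab-separated reducer output
    pairs.append((key, value))
print(pairs[:5])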
Example #11
def main(keywords: str, tag_type: str) -> None:
    """
    Main function: fetch data from Postgres and move it into HDFS

    Parameters
    ----------
    keywords : str
        tag string
    tag_type : str
        partition in hive

    See Also
    ----------
    get_data : get poi related data from POI table
    """
    res = get_data(keywords)

    with TemporaryDirectory() as dirname:
        # generate txt file
        res.to_csv(os.path.join(dirname, f'POI码表_{keywords}_每日扫描版.txt'),
                   header=None,
                   index=None,
                   sep=',',
                   mode='w')

        # move txt file into hdfs
        hdfs_path = f'/user/hive/warehouse/poi.db/code_aoi_geohash/tag_type={tag_type}/POI码表_{keywords}_每日扫描版.txt'
        hdfs = pyhdfs.HdfsClient(['10.244.16.101', '10.244.16.102'],
                                 user_name='hdfs')
        hdfs.copy_from_local(os.path.join(dirname,
                                          f'POI码表_{keywords}_每日扫描版.txt'),
                             hdfs_path,
                             overwrite=True)
Example #12
 def file_upload(self, host, user_name, local_path, hdfs_path):
     print("===== file upload start =====")
     fs = pyhdfs.HdfsClient(hosts=host, user_name=user_name)
     print(fs.get_active_namenode())
     print(fs.listdir('/'))
     fs.copy_from_local(local_path, hdfs_path)
     print("==== file upload finish =====")
def upload_file_to_hdfs(hdfs_path, src_path):
    """
    上传文件到HDFS文件系统,如果文件夹不存在就创建
    
    Arguments:
        hdfs_path {string} -- HDFS文件夹
        src_path {string} -- 源地址文件夹
    
    Raises:
        ValueError -- 当`hdfs_path`不是类似`Linux`文件路径就会抛出
        e -- `pyhdfs`模块的异常
    """

    if not hdfs_path.startswith("/"):
        raise ValueError(
            '属性`hdfs_path`必须为有效的hadoop平台路径,例如 /RawData/chaoxing/duxiu_ts/big_htm'
        )
    name_node = cf.get('hadoop', 'namenode')
    try:
        client = pyhdfs.HdfsClient(hosts=name_node)
        if not client.exists(hdfs_path):  # create the directory on HDFS if it does not exist
            client.mkdirs(hdfs_path)
        for _, local_file in file_list(src_path):
            filename = os.path.basename(local_file)
            dst_file = hdfs_path + '/' + filename
            client.copy_from_local(local_file, dst_file, overwrite=True)
    except pyhdfs.HdfsException as e:
        raise e
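
A short usage sketch under the same assumptions the function makes (a `cf` ConfigParser with a `hadoop`/`namenode` entry and a `file_list` helper yielding local files); the paths below are placeholders.

try:
    upload_file_to_hdfs("/RawData/demo/big_htm", "./export")  # placeholder paths
except ValueError as err:
    print("invalid HDFS path:", err)
except pyhdfs.HdfsException as err:
    print("upload failed:", err)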
Example #14
 def __init__(self, spark):
     self.spark = spark
     self.precision_span = 60
     self.timestack_span = 600
     self.HDFS_CLIENT = pyhdfs.HdfsClient(hosts=hdfs_host+':9870', user_name='hdfs')
     self.HDFS_AUDIT = hdfs_audit_path
     self.HDFS_IM = hdfs_im_path
Example #15
def client_init(hosts):
    session = requests.session()
    session.keep_alive = False
    client = pyhdfs.HdfsClient(hosts=hosts,
                               user_name="hdfs",
                               requests_session=session)
    return client
Example #16
 def __init__(self, host: str = "localhost", port: int = 9870, user_name="root", **kwargs):
     super().__init__()
     self.host = host
     self.port = port
     self.user = user_name
     self.home = f"/user/{self.user:s}"
     self.fs = pyhdfs.HdfsClient(
         hosts=[f"{host:s}:{port:d}", "localhost:9870"], user_name=user_name, **kwargs)
Example #17
def get_conn(hosts, user):
    """
    Get a client for the Hadoop distributed file system (HDFS).

    Parameters:
        'hosts' looks like 'host1:port1,host2:port2'; use ',' to separate servers
        'user' is the Hadoop user name
    """
    return pyhdfs.HdfsClient(hosts=hosts, user_name=user)
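
A hedged usage sketch of `get_conn`; the host names and the user are placeholders that only illustrate the `host1:port1,host2:port2` format described in the docstring.

client = get_conn("nn1.example.com:9870,nn2.example.com:9870", "hadoop")
print(client.get_active_namenode())
print(client.listdir("/"))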
 def up_hdfs(self):
     # connect to HDFS
     client = pyhdfs.HdfsClient(hosts='hadoop2x-01:50070,hadoop2x-02:50070', user_name='suh')
     # cluster directory before uploading the local file
     print('Before !%s' % client.listdir('/RawData/asceproceedings/big_json/2019/20190812'))
     # upload the local file to the cluster
     client.copy_from_local(r'D:\code\proceedings\big_json\20190812_1.big_json',
                            '/RawData/asceproceedings/big_json/2019/20190812/20190812_1.big_json')
     # cluster directory after uploading the local file
     print('After !%s' % client.listdir('/RawData/asceproceedings/big_json/2019/20190812'))
def savetohdfs(rdd):
    client = pyhdfs.HdfsClient(hosts="10.120.14.120,50070",
                               user_name="cloudera")
    for r in rdd:
        for t in r:
            client.append(
                "/user/cloudera/model_deploy/output/utime.csv",
                "This patient need {} seconds to detect from MRI\n".format(
                    str(t)))
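
`savetohdfs` is shaped like a `foreachPartition` callback (it receives an iterable whose elements are themselves iterated for timing values), so it is presumably wired up roughly as in this sketch; `predictions` is a hypothetical RDD.

# Each partition's iterator is handed to savetohdfs, which appends one line per timing value.
predictions.foreachPartition(savetohdfs)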
Example #20
 def __init__(self, spark):
     self.spark = spark
     self.HDFS_CLIENT = pyhdfs.HdfsClient(hosts=HDFS_HOST + ':9870',
                                          user_name='hdfs')
     self.HDFS_AUDIT = HDFS_AUDIT_COMPRESS_PATH
     self.HDFS_IM = HDFS_IM_COMPRESS_PATH
     self.probe_fields_change = [['start_time', 'probe_time']]
     self.wifi_fields_change = [['collect_time', 'probe_time'],
                                ['wifi_mac', 'probe_mac']]
Example #21
def hdfs_init_total(remote_path, hdfs_path="10.20.37.175", port=9000):
    hdfs_client = pyhdfs.HdfsClient(hosts="{}:{}".format(hdfs_path, port))
    try:
        files = hdfs_client.listdir(remote_path)
        for k in files:
            hdfs_client.delete(remote_path + k)
        hdfs_client.delete(remote_path)
    except Exception:
        return
Example #22
def judgeIcon(request,
              hdfsHost="spark-master0",
              nnPort="50070",
              csvUrl="/tmp/users",
              url="/users",
              userName="******"):
    '''
    Return the icon states (whether each one is lit)
    '''
    jsonData = request.data
    logger.debug('jsondata: {0}'.format(jsonData))
    if request.method == 'POST':
        username = jsonData.get('username', 'yzy')
        second = 0
        third = 0
        fourth = 1

        if username in Singleton().dataPaltForm.keys():
            for key, value in Singleton().dataPaltForm[username].items():
                if judgeConn(value.con):
                    second = 1
                    break
        csvUrl = '{0}/{1}'.format(csvUrl, userName)
        url = '{0}/{1}'.format(url, userName)
        client = pyhdfs.HdfsClient(hosts="{0}:{1}".format(hdfsHost, nnPort))
        if client.exists(url):
            fileList = client.listdir(url)
            if fileList:
                second = 1
                third = 1
        if second == 0:
            if client.exists('{0}/{1}'.format(csvUrl, 'parquet')):
                parquetList = client.listdir('{0}/{1}'.format(
                    csvUrl, 'parquet'))
                if parquetList:
                    second = 1

        folderList = DashboardFolderByUser.objects.filter(username=username)
        viewList = DashboardViewByUser.objects.filter(username=username)
        indexList = DashboardIndexByUser.objects.filter(username=username)
        logger.debug('folderList: {0}, viewList: {1}, indexList: {2}'.format(
            folderList, viewList, indexList))
        if (len(folderList) == 0 and len(viewList) == 0
                and len(indexList) == 0) or third == 0:
            fourth = 0

        context = {
            "status": "success",
            "results": {
                "constructview": second,
                "dashboardview": third,
                "statementview": fourth
            }
        }
        return JsonResponse(context)
Example #23
    def update_track_talbe(self, df_track_output):
        """执行更新操作,将需要更新的数据列出,将新的数据追加到track表中"""
        # 旧track表的数据
        # df_track_old[id,probe_time,probe_type,probe_data,probe_device_mac,netbar_wacode,longitude,latitude,device_type,create_time,datasource_id,datasource_table_name,base_person_id,flag]
        # 到时候df_track_old读取的数据(track表)维护在hdfs里,最后写回hdfs与pgsql
        client = pyhdfs.HdfsClient(hosts=self.PYHDFS_HOST, user_name='hdfs')
        if client.exists(self.HDFS_ST_TRACK_INFO_SUB):
            df_track_old = self.spark.read.parquet(self.HDFS_ST_TRACK_INFO)
        else:
            df_track_old = self.spark.read.jdbc(
                url=self.PGSQL_URL_analysis,
                table=self.PGSQL_TABLE_TRACK,
                properties=self.PGSQL_PROPERTIES).limit(0).withColumn(
                    "flag", functions.lit(0))
        # df_track_old = self.spark.read.csv(self.HDFS_ST_TRACK, head=True)
        df_track_old = df_track_old[df_track_old["datasource_table_name"] ==
                                    "gd_ele_fence"]
        df_track_old = df_track_old.withColumn(
            "device_type", df_track_old.device_type.astype("long"))
        # _df_track_old  [netbar_wacode,probe_time,probe_data,probe_type,datasource_table_name,longitude,latitude,device_type,probe_device_mac,flag]
        _df_track_old = df_track_old.select("netbar_wacode", "probe_time",
                                            "probe_data", "probe_type",
                                            "datasource_table_name",
                                            "longitude", "latitude",
                                            "device_type", "probe_device_mac",
                                            "flag")  # , "last_update_time"
        # df_track_output[netbar_wacode,probe_time,probe_data,probe_type,create_time,datasource_table_name,longitude,latitude,device_type,probe_device_mac,flag]
        df_track_output = df_track_output.drop("create_time").drop("count")
        # # df_track_output[netbar_wacode,probe_time,probe_data,probe_type,datasource_table_name,longitude,latitude,device_type,probe_device_mac,flag]
        # # subtract the new table's rows from the old table, leaving the old rows that have been updated, for the update step
        # # rows that need updating also appear in the append set, so they have to be removed from it
        # df_track_update = _df_track_old.subtract(df_track_output)
        # # df_track_update[netbar_wacode,probe_time,probe_data,probe_type,datasource_table_name,longitude,latitude,device_type,probe_device_mac,flag,create_time]
        # df_track_update = df_track_update.withColumn("create_time", functions.lit(int(time.time() * 1000)))
        # df_track_update = df_track_update.join(df_track_old.select("id", "probe_time", "probe_data", "probe_device_mac", "flag"), ["probe_time", "probe_data", "probe_device_mac", "flag"], "left")
        # df_track_update.select("id", "probe_time").show()
        # # later join with the old track table to obtain the id, then update time_on (i.e. time_end) by id

        # subtract automatically deduplicates the rows of the original df
        # !!! How to handle this? Keep start and end in the df
        # when the two DataFrame schemas are not exactly identical (e.g. nullable), subtract does not compute the difference
        df_track_append = df_track_output.subtract(_df_track_old)
        # df_track_output.show()
        # _df_track_old.show()
        df_track_append = df_track_append.withColumn(
            "device_type", df_track_append.device_type.astype("long"))
        df_track_append = df_track_append.withColumn(
            "create_time", functions.lit(int(time.time() * 1000)))
        # this version keeps the flag column; drop flag before inserting into the track table
        # df_track_append.write.mode(saveMode='append').jdbc(url=self.PGSQL_URL_analysis, table="zhaoqing_duanzhou_db.track_copy2", properties=self.PGSQL_PROPERTIES)
        df_track_append.drop("flag").write.mode(saveMode='append').jdbc(
            url=self.PGSQL_URL_analysis,
            table=self.PGSQL_TABLE_TRACK,
            properties=self.PGSQL_PROPERTIES)
        df_track_append.write.parquet(self.HDFS_ST_TRACK_INFO, mode="append")
Example #24
 def __init__(self):
     conf = SparkConf().setAppName('deviceupdate')
     conf.set("spark.executor.memory", "3g")
     conf.set("spark.driver.memory", "3g")
     conf.set("spark.executor.core", "3")
     self.spark = SparkSession.builder.config(conf=conf).getOrCreate()
     self.PGSQL_URL = f"jdbc:postgresql://{DST_DB_HOST}:{DST_DB_PORT}/{DST_DB_DATABASE}"
     self.PGSQL_DEVICE_TABLE = DST_DB_SCHEMA+".gd_device"
     self.PGSQL_PLACE_TABLE = DST_DB_SCHEMA+".gd_place"
     self.PGSQL_PROPERTIES = {'user': DST_DB_USER, 'password': DST_DB_PASSWORD}
     self.HDFS_CLIENT = pyhdfs.HdfsClient(hosts=HDFS_HOST+':9870', user_name='hdfs')
Example #25
def hdfs_test():
    fs = pyhdfs.HdfsClient("localhost", 9000)
    f = fs.open('/common_step3/part-00000')
    for i in f:
        line = str(i, encoding='utf-8')
        # print(line)
        rules, confidence = line.split('\t')
        left_rules, right_rules = rules.split('->')
        print('{0}---->{1}---->{2}'.format(left_rules, right_rules,
                                           float(confidence.replace('\n',
                                                                    ''))))
Example #26
def load_wordlist(filename):

    hdfs = pyhdfs.HdfsClient(hosts='hdfs-namenode:50070')
    words = {}
    f = hdfs.open(filename)
    text = f.read().decode('utf-8')
    text = text.split('\n')
    for line in text:
        words[line] = 1
    f.close()
    return words
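
A small usage sketch (the HDFS path is a placeholder); the returned dict is convenient for O(1) membership tests.

words = load_wordlist("/data/wordlists/positive.txt")  # hypothetical path on the same HDFS
print(len(words), "words loaded")
print("happy" in words)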
Example #27
def hdfs_set_file(local_file_path,
                  remote_file_path,
                  filename,
                  hdfs_path="10.20.37.175",
                  port=9000):
    hdfs_client = pyhdfs.HdfsClient(hosts="{}:{}".format(hdfs_path, port))
    files = hdfs_client.listdir(remote_file_path)
    if filename in files:
        hdfs_client.delete(remote_file_path + filename)
    hdfs_client.copy_from_local(local_file_path + filename,
                                remote_file_path + filename)
    print("set Completed!")
Example #28
def hdfs_get_file(remote_path,
                  filename,
                  local_path,
                  delete=False,
                  hdfs_path='10.20.37.175',
                  port=9000):
    hdfs_client = pyhdfs.HdfsClient(hosts="{}:{}".format(hdfs_path, port))
    hdfs_client.copy_to_local(remote_path + filename, local_path + filename)
    print("load completed!")
    if delete:
        hdfs_client.delete(remote_path + filename)
    return local_path + filename
Example #29
    def _save(self, name, content):
        '''Used when saving a file'''
        # name: the chosen name of the uploaded file
        # content: a File object containing the uploaded file's content

        # create an HdfsClient object
        client = pyhdfs.HdfsClient(hosts=self.hosts, user_name=self.user_name)
        # upload the file if it has not been uploaded yet
        if not client.exists(self.file_url + content.name):
            client.create(self.file_url + content.name, content)
        # otherwise the file is already uploaded, so just return the file name
        filename = self.file_url + content.name
        return filename
Example #30
def hdfs_init_fold(remote_path, hdfs_path="10.20.37.175", port=9000):
    hdfs_client = pyhdfs.HdfsClient(hosts="{}:{}".format(hdfs_path, port))
    try:
        files = hdfs_client.listdir(remote_path)
    except Exception:
        hdfs_client.mkdirs(remote_path)
        return
    if not files:
        return
    else:
        for k in files:
            hdfs_client.delete(remote_path + k)
        return