def getReleaseFromHdfs_batch(releaseNumber=10):
    '''copy releases from the server using hdfs'''
    client = hdfs.Client("http://10.141.221.82:50070", root='/home/fdse')
    releases = client.list("/java/releases")
    for r in releases[:releaseNumber]:
        client.download(u'/java/releases/' + r, os.path.join(path, 'releases'))
        print r, 'has been downloaded...'

def __init__(self, uri='http://hadoop-hd1:50070', user='******', root='/user/hive/warehouse'):
    self.client = hdfs.Client(uri)
    self.user = user
    self.root = root

def spark_df_to_local_txt(local_path, df: pyspark.sql.DataFrame = None, deli="\t", hdfs_dir_path=None):
    def row_to_str(row):
        return deli.join([str(xx) for xx in row.asDict().values()])

    # map the DataFrame to an RDD of delimited strings
    if hdfs_dir_path is None:
        str_rdd = df.rdd.map(row_to_str)
        rand_bits = random.getrandbits(64)
        hdfs_dir_name = "str_df_%016x" % rand_bits
        hdfs_dir_path = "/tmp/" + hdfs_dir_name
        # save to hdfs
        str_rdd.saveAsTextFile(hdfs_dir_path)
    else:
        hdfs_dir_name = os.path.basename(hdfs_dir_path)
        hdfs_dir_path = "/tmp/" + hdfs_dir_name

    # download from hdfs to a local temp directory
    local_tmp_dir = "./tmp/" + hdfs_dir_name
    c = hdfs.Client("http://soldier1:50070")
    c.download(hdfs_dir_path, local_tmp_dir)
    # delete the temporary hdfs directory
    c.delete(hdfs_dir_path, recursive=True)

    # concatenate the downloaded part files into one local file
    with open(local_path, 'wb') as outfile:
        for tmp_f in os.listdir(local_tmp_dir):
            fn = os.path.join(local_tmp_dir, tmp_f)
            with open(fn, 'rb') as readfile:
                shutil.copyfileobj(readfile, outfile)
    # remove the local temp directory
    shutil.rmtree(local_tmp_dir)

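# A minimal usage sketch for spark_df_to_local_txt, not part of the original
# snippet: it assumes a local SparkSession and that the hard-coded NameNode
# "http://soldier1:50070" above is reachable; the DataFrame and output path
# below are hypothetical examples.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("df_to_txt_demo").getOrCreate()
demo_df = spark.createDataFrame([(1, "alpha"), (2, "beta")], ["id", "name"])
spark_df_to_local_txt("./demo_out.txt", demo_df, deli="\t")
spark.stop()
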
def __init__(self, uri='http://two-stream-master-prod-02:50070', user='******', root='hdfs://nameservicestream/user/hive/warehouse'):
    self.client = hdfs.Client(uri)
    self.user = user
    self.root = root

def put_hdfs(shcontext, filename, hdfs_path):
    try:
        os.environ['NLS_LANG'] = 'AMERICAN_AMERICA.AL32UTF8'  # set the environment variable to avoid garbled characters
        sh_path = '/root/spoolsh/' + filename + '.sh'
        txt_local_path = '/root/spooldata/' + filename + '.txt'
        f = open(sh_path, 'wb+')
        f.write(shcontext)
        f.close()
        print 'spool script written'
        oscmd1 = 'chmod +x ' + sh_path
        (res_status1, res_output1) = commands.getstatusoutput(oscmd1)
        oscmd2 = 'sh ' + sh_path
        (res_status2, res_output2) = commands.getstatusoutput(oscmd2)
        print 'spooled data to the local disk'
        txt_hdfs_path = hdfs_path + filename + '.txt'
        client = hdfs.Client("http://192.10.86.31:50070", root="/", timeout=100, session=False)
        client.delete(hdfs_path, recursive=True)
        client.upload(txt_hdfs_path, txt_local_path)
        print 'uploaded data to hdfs'
        oscmd3 = 'rm -f ' + txt_local_path
        print 'deleting the local file'
        res3 = os.system(oscmd3)
        return 'success'
    except Exception, e:
        print e
        return e

def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=5000,
                   seed=None):
    if fake_data:

        def fake():
            return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype, seed=seed)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    HOST = "192.168.205.185"
    NAME_NODE_PORT = 50070
    client = hdfs.Client('http://{}:{}'.format(HOST, NAME_NODE_PORT))

    with client.read(train_dir + "/" + TRAIN_IMAGES) as f:
        train_images = extract_images(f)
    with client.read(train_dir + "/" + TRAIN_LABELS) as f:
        train_labels = extract_labels(f, one_hot=one_hot)
    with client.read(train_dir + "/" + TEST_IMAGES) as f:
        test_images = extract_images(f)
    with client.read(train_dir + "/" + TEST_LABELS) as f:
        test_labels = extract_labels(f, one_hot=one_hot)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    options = dict(dtype=dtype, reshape=reshape, seed=seed)

    train = DataSet(train_images, train_labels, **options)
    validation = DataSet(validation_images, validation_labels, **options)
    test = DataSet(test_images, test_labels, **options)

    return base.Datasets(train=train, validation=validation, test=test)

def client(self, host='', port='', timeout=None):
    host = host or HDFS_CONFIG['host']
    port = port or HDFS_CONFIG['webport']
    if self._client is None:
        try:
            self._client = hdfs.Client('http://%s:%s' % (host, port), timeout=timeout, session=rs)
        except Exception, e:
            output('hdfs Exception ' + str(e), logType='hdfs')
            raise

def get_storage_file_list(self):
    from ..settings import FLINK_SAVEPOINT_PATH_BACKEND_ADDRESS
    import hdfs
    storage_file_backed = hdfs.Client(FLINK_SAVEPOINT_PATH_BACKEND_ADDRESS, timeout=10)
    storage_file_list = self.get_storage_file_list_status(
        storage_file_backed, 'savepoints', self.get_savepoint_path())
    # storage_file_list = self.get_storage_file_list_status(FLINK_SAVEPOINT_PATH_BACKEND, 'checkponits',
    #                                                       self.get_checkponit_path())
    storage_file_list.sort(key=lambda x: x.modification_time, reverse=True)
    return storage_file_list

def table_to_hdfs(d_info):
    d_info = p_insert_log(d_info)
    localtempfile = str(d_info.get('localpath')) + str(d_info.get('source_proc')) + '.tmp'
    localfile = str(d_info.get('localpath')) + str(d_info.get('source_proc')) + '.lz4'
    if d_info.get('syn_strategy') == 1:
        hdfspath = str(d_info.get('target_proc'))
    else:
        hdfspath = str(d_info.get('target_proc')) + str(d_info.get('acctday')) + '/'
    try:
        d_info['source_dns'] = getdns(d_info.get('source_dbname').upper())
        source_db = create_engine(d_info.get('source_dns'))
        source_conn = source_db.connect()
        try:
            seldata = list(source_conn.execute(text(d_info.get('sql_select').format(**d_info))))
        except Exception, etp:
            logging.error(etp)
            raise Exception(etp)
        finally:
            source_conn.close()
        with open(localtempfile, 'wb+') as f:
            for row_data in seldata:
                f.write(''.join(row_data).encode('utf-8') + '\n')
        logging.info('data written locally, starting compression')
        oscmd = 'lz4 ' + localtempfile + ' ' + localfile
        (status, output) = commands.getstatusoutput(oscmd)
        if status == 0:
            try:
                d_info['target_dns'] = getdns(d_info.get('target_dbname').upper())
                client = hdfs.Client(d_info.get('target_dns'), root="/", timeout=100, session=False)
                if not client.status(hdfspath, strict=False):
                    client.makedirs(hdfspath, permission=777)
                print localfile, hdfspath
                client.upload(hdfspath, localfile)
                d_info['finish_flag'] = 'finish'
                d_info['retcode'] = 'success'
            except Exception, etp:
                logging.error(etp)
                raise Exception(etp)

def download_file(hdfs_location, file_local):
    """
    :param hdfs_location: HDFS path + file name
    :param file_local: local path + file name
    """
    # a try/finally block could improve readability here
    client = hdfs.Client('http://172.39.8.61:50070', root='/', timeout=10)
    try:
        with client.read(hdfs_location) as r:
            with open(file_local, 'wb') as f:
                f.write(r.read())
    except hdfs.util.HdfsError:
        client = hdfs.Client('http://172.39.8.62:50070', root='/', timeout=10)
        with client.read(hdfs_location) as r:
            with open(file_local, 'wb') as f:
                f.write(r.read())
    except IOError as msg:
        with open("err.log", "a") as f:
            f.write(str(msg))
        file_local = None
    return file_local

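# A hedged usage sketch for download_file above; the HDFS source path and the
# local target are hypothetical examples.
local_copy = download_file('/data/input/events.csv', '/tmp/events.csv')
if local_copy is None:
    print('download failed, details were appended to err.log')
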
def __init__(self):
    try:
        self.zk = KazooClient(hosts=self.zooQuorum)
        self.zk.start()
        if self.zk.exists(self.path):
            data = self.zk.get(self.path)
            ip = None
            if self.nameNodeA[0] in data[0]:
                ip = 'http://' + self.nameNodeA[1] + ':50070'
            elif self.nameNodeB[0] in data[0]:
                ip = 'http://' + self.nameNodeB[1] + ':50070'
            self.client = hdfs.Client(ip, root='/')
    except Exception as e:
        print('%s kazooClient __init__ ERROR! %s' % (datetime.datetime.now(), traceback.format_exc()))

def _init_connect(self):
    """Connect to the hdfs server."""
    # first collect the hdfs ips
    ips = sorted([ip for hostname, ip in cluster_ip.items()])
    port = self.port  # port='50070'
    conn = None
    for host in ips:
        # build the connection url
        url = "http://{host}:{port}".format(host=host, port=port)
        try:
            conn = hdfs.Client(url, root='/', timeout=100, session=False)
            conn.list('/')
            break
        except:
            pass
    # if we reach this point without a working client, every candidate address failed
    if conn.list('/'):
        return conn
    else:
        raise Exception('no reachable hdfs address found, please check the configuration')

def __init__(self):
    # connect to zookeeper
    self.zk = KazooClient(hosts=self.zoo_quorum_dev)
    self.zk.start()
    # check whether the zookeeper sync node exists
    if self.zk.exists(self.path_dev):
        # get the currently active namenode
        data = self.zk.get(self.path_dev)
        ip = None
        if self.name_node_a_dev[0] in data[0]:
            ip = 'http://' + self.name_node_a_dev[1] + ':50070'
        elif self.name_node_b_dev[0] in data[0]:
            ip = 'http://' + self.name_node_b_dev[1] + ':50070'
        if ip:
            # connect to the active hdfs namenode
            self.client = hdfs.Client(ip, root='/')
            if not self.client:
                print('failed to create the hdfs client!')
        else:
            print('failed to resolve the nameNode IP!')

def upload_file(hdfs_location, local):
    try:
        client = hdfs.Client('http://172.39.8.61:50070', root='/', timeout=10)
        base_dir = local.split('/').pop()  # last folder of the local path being uploaded
        for root, dirs, files in os.walk(local):
            new_dir = base_dir + root.split(base_dir).pop()  # strip the local path prefix
            for file in files:
                old_path = root + '/' + file  # original local file
                lpath = new_dir + '/' + file  # file path with the local prefix removed
                if not client.status(hdfs_location + '/' + lpath, strict=False):
                    # args: remote path, local path, whether to overwrite, number of worker threads
                    client.upload(hdfs_location + '/' + lpath, old_path, overwrite=False)
    except Exception as e:
        with open("err.log", "a") as f:
            f.write(str(e))
            f.write('\n')

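# A hedged usage sketch for upload_file above: mirror a local directory tree
# into HDFS and list what landed there. Both paths are hypothetical; the
# NameNode URL is the one hard-coded inside upload_file.
upload_file('/user/backup', '/home/worker/logs')
check_client = hdfs.Client('http://172.39.8.61:50070', root='/', timeout=10)
print(check_client.list('/user/backup'))
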
def connect_hdfs(cls):
    cls._client = hdfs.Client(**cls._parm_handler())
    cls.handle_dns()

    #dataframeoutput = dataframeoutput.drop('sum(EUCELL_DL_TPUT_NUM_KBITS)').drop('sum(DLPRBUSEDWITHDSPUC_FDUSERS)').drop('sum(DLPRBUSEDWITHDSPUC_FSUSERS)').drop('sum(EUCELL_DL_DRB_TPUT_NUM_KBITS)').drop('sum(EUCELL_DL_DRB_TPUT_DEN_SECS)')
    dataframeoutput = dataframeoutput.select(
        'DATE', 'MARKET', 'VENDOR', 'BAND', 'Cell Traffic (kbytes)',
        'Cell Used PRB', 'Cell Spectral Efficiency (bps/Hz)',
        'UE Traffic (kbytes)', 'UE Active Time (s)', 'UE Tput (kbps)',
        'Total cell count', 'Total Spectrum in MHz')
    dataframeoutput = dataframeoutput.coalesce(1)
    # take action here
    dataframeoutput.write.format('com.databricks.spark.csv').save(outputName)
    difference = dt.datetime.now() - start
    dataframeoutput.unpersist()
    sc.stop()
    return difference


if __name__ == "__main__":
    outDirectory = os.path.join(os.path.dirname(__file__), 'report/')
    hdfsFiles = hdfs.Client('http://hdfs1:50070').list(
        '/user/ec2-user/sample-data')  # Use namenode public ip http://namenode:50070
    print "start"
    print ALU_LTE_SPARK().run("hdfs", hdfsFiles, outDirectory)
    print "OK"
    exit()

#Submit job to yarn on top of your hdfs cluster
#spark-submit --master yarn --deploy-mode cluster --num-executors 3 --executor-cores 2 --executor-memory 2g --packages com.databricks:spark-csv_2.10:1.5.0 spark-1.6-hdfs-yarn-CDH5.py
#For hdfs, pls find output file at:
#hdfs://user/ec2-user/report/result_group_by_MARKET_ALU_2017_spark_hdfs.csv

import hdfs

client = hdfs.Client('http://localhost:9870', root='/')
print(client.list('/'))

with client.write('/streaming.txt', encoding='utf-8') as hdfs_file:
    for idx in range(10_000):
        hdfs_file.write(f'{idx} {idx ** 2} {idx ** 3}\n')

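# A hedged read-back sketch for the file written above; it reuses the same
# client and assumes /streaming.txt now exists on that NameNode.
with client.read('/streaming.txt', encoding='utf-8') as reader:
    for line_no, line in enumerate(reader):
        if line_no >= 5:  # preview only the first few lines
            break
        print(line.rstrip())
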
hiddenLayer3 = add_layer("layer3", hiddenLayer2, in_size=512, out_size=128, activation_function=tf.nn.relu)
prediction = add_layer("end", hiddenLayer3, in_size=128, out_size=3, activation_function=tf.nn.softmax)

# loss = tf.reduce_mean(tf.reduce_sum(y_lable - prediction))
loss = -tf.reduce_mean(y_lable * tf.log(tf.clip_by_value(prediction, 1e-10, 1.0)))
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

client = hdfs.Client(HADOOP_IP_PORT, root="/", timeout=500, session=False)
fileList = client.list(HADOOP_PATH)

epoch_times = 0
while True:
    epoch_times += 1
    for file_loop in fileList:  # each file holds one batch of data
        print('\n', file_loop)
        get_data(client, HADOOP_PATH + file_loop)
        # for i in range(2000):
        #

def upload_pfile(self, cur_dir):
    '''
    Build a dict that stores, for each crawler plugin, its attribute file, the
    original-document folder, and the txt folder.
    The key is the plugin name; the value is a list of [attribute file path,
    original-document folder (ori), txt folder (txt)].
    When the ori or txt folder does not exist, None is used as a placeholder.
    '''
    f_all_dict = {}
    print("mmmmmmmmmmmmmm", cur_dir)
    for f in os.listdir(cur_dir):
        if f.find('文献属性.xls') > 0 or f.find('文献属性.xlsx') > 0:
            filepro = cur_dir + f if cur_dir[-1] == '/' else cur_dir + '/' + f
            filepath = cur_dir + f[:f.find('文献属性')] + '_ori/' if cur_dir[-1] == '/' else cur_dir + '/' + f[:f.find('文献属性')] + '_ori/'
            filetxt = cur_dir + f[:f.find('文献属性')] + '_txt/' if cur_dir[-1] == '/' else cur_dir + '/' + f[:f.find('文献属性')] + '_txt/'
            f_all_dict.setdefault(f[:f.find('文献属性')], []).append(filepro) if os.path.exists(filepro) else f_all_dict.setdefault(f[:f.find('文献属性')], []).append(None)
            f_all_dict.setdefault(f[:f.find('文献属性')], []).append(filepath) if os.path.exists(filepath) else f_all_dict.setdefault(f[:f.find('文献属性')], []).append(None)
            f_all_dict.setdefault(f[:f.find('文献属性')], []).append(filetxt) if os.path.exists(filetxt) else f_all_dict.setdefault(f[:f.find('文献属性')], []).append(None)
    for f_key in f_all_dict.keys():
        if f_all_dict[f_key][0] is not None:
            book = xlrd.open_workbook(f_all_dict[f_key][0])
            sheet = book.sheet_by_index(0)
            ut = time.strftime('%Y%m%d%H%M%S', time.localtime(time.time()))
            ops = []
            '''
            Read the Excel sheet row by row; when the crawl tag starts with CRA,
            do not add the upload tag UPA.
            '''
            for r in range(1, sheet.nrows):
                if sheet.cell(r, 0).value[0:3] == "CRA":
                    # values = (sheet.cell(r, 0).value[3:], sheet.cell(r, 2).value, sheet.cell(r, 3).value,sheet.cell(r, 4).value,sheet.cell(r, 5).value,sheet.cell(r, 6).value,sheet.cell(r, 7).value,sheet.cell(r, 8).value,sheet.cell(r, 9).value,sheet.cell(r,10).value,ut)
                    values = (sheet.cell(r, 0).value[3:], sheet.cell(r, 3).value,
                              sheet.cell(r, 4).value, sheet.cell(r, 5).value,
                              sheet.cell(r, 2).value, sheet.cell(r, 7).value,
                              sheet.cell(r, 8).value, sheet.cell(r, 9).value,
                              sheet.cell(r, 10).value, ut, sheet.cell(r, 6).value)
                    # print("6666666666666", values)
                    ops.append(values)
                else:
                    tag = "UPA" + time.strftime('%Y%m%d%H%M%S', time.localtime(time.time()))
                    values = (tag[3:], str(sheet.cell(r, 3).value),
                              str(sheet.cell(r, 4).value), str(sheet.cell(r, 5).value),
                              tag + str(r).zfill(4) + str(sheet.cell(r, 2).value),
                              str(sheet.cell(r, 7).value), str(sheet.cell(r, 8).value),
                              str(sheet.cell(r, 9).value), str(sheet.cell(r, 10).value),
                              ut, str(sheet.cell(r, 6).value))
                    # print("555555555555", values)
                    ops.append(values)
        if f_all_dict[f_key][1] is not None:
            f_list = os.listdir(f_all_dict[f_key][1])
            f_list_doc = []
            '''Find all documents under the original-document folder and store them in f_list_doc.'''
            for f in f_list:
                if os.path.splitext(f)[1] in {'.caj', '.pdf', '.txt', '.doc', '.docx'}:
                    self.suffix = os.path.splitext(f)[1]
                    f_list_doc.append(f)
            temp_list = []
            # print("f_list_doc", len(f_list_doc)) 40
            for f in f_list_doc:
                for item in ops:
                    # print("item", item[10][0:3])
                    if item[4][0:3] == 'CRA':
                        if item[4] == os.path.splitext(f)[0]:
                            temp_list.append(item)
                    else:
                        if item[4][21:] == os.path.splitext(f)[0]:
                            temp_list.append(item)
            '''Files that exist and match the attribute records one-to-one.'''
            save_file = []
            for file in temp_list:
                fileUuid = (str(uuid.uuid1()).replace("-", ""), )
                if file[4][0:3] == 'CRA':
                    filepath = f_all_dict[f_key][1] + file[4] + '.' + file[8]
                else:
                    filepath = f_all_dict[f_key][1] + file[4][21:] + '.' + file[8]
                self.upload_filepath = filepath
                try:
                    b = open(filepath, "rb").read()
                    origin = (pymysql.Binary(b), )
                    newfile = fileUuid + ("1010", "ZH") + file + origin
                    save_file.append(newfile)
                except OSError:
                    if file[4][0:3] == 'CRA':
                        print('file %s not found' % file[4])
                    else:
                        print('file %s not found' % file[4][21:])
            cursor = self.conn.cursor()
            for n in range(0, len(save_file)):
                a = globalVar.get_st()
                print(a)
                if a == 1:
                    self.CrawProcess.emit(str("importing %s\n" % (save_file[n][7])))
                    try:
                        self.hdfs_ip = "http://192.168.1.107:50070"
                        self.inputpath = '/4516/upload'
                        self.client = hdfs.Client(self.hdfs_ip)
                        if self.configs['flag'] == True:
                            # cursor.executemany(
                            #     "insert into DOCUMENTS(UUID,CRA_DT,TITLE,AUTHOR,AURDEPT,KYWRD,ABSTRACT,JOURNAL,PUB_DT,URL,SUFFIX,UPLD_DT,CONTENT_ORI,SOURCE_CODE,LANG)values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,%s,%s, %s, %s)",
                            #     save_file[n:n+1])
                            # sql = "insert into DOCUMENTS(UUID,CRA_DT,TITLE,AUTHOR,AURDEPT,KYWRD,JOURNAL,PUB_DT,URL,SUFFIX,UPLD_DT,SOURCE_CODE,LANG,ABSTRACT,CONTENT_ORI)values(:1, to_date(:2,'yyyy-mm-dd hh24:mi:ss'), :3, :4, :5, :6, :7, :8, :9, :10, :11, :12, :13, :14, :15)"
                            # sql = "insert into DOCUMENTS(UUID,CRA_DT,TITLE,AUTHOR,AURDEPT,KYWRD,JOURNAL,PUB_DT,URL,SUFFIX,UPLD_DT,SOURCE_CODE,LANG,ABSTRACT)values(:1, to_date(:2,'yyyy-mm-dd hh24:mi:ss'), :3, :4, :5, :6, :7, :8, :9, :10, :11, :12, :13, :14)"
                            # txt import
                            sql = "insert into DOCUMENTS(UUID,SOURCE_CODE,LANG,CRA_DT,AUTHOR,KYWRD,AURDEPT,TITLE,JOURNAL,PUB_DT,URL,SUFFIX,UPLD_DT,ABSTRACT,CONTENT_ORI)values(:1, :2, :3, to_date(:4,'yyyy-mm-dd hh24:mi:ss'), :5, :6, :7, :8, :9, to_date(:10,'yyyy-mm-dd hh24:mi:ss'), :11, :12, to_date(:13,'yyyy-mm-dd hh24:mi:ss'), :14,:15)"
                            # pdf import
                            # sql = "insert into DOCUMENTS(UUID,SOURCE_CODE,LANG,CRA_DT,AUTHOR,KYWRD,AURDEPT,TITLE,JOURNAL,PUB_DT,URL,SUFFIX,UPLD_DT,ABSTRACT)values(:1, :2, :3, to_date(:4,'yyyy-mm-dd hh24:mi:ss'), :5, :6, :7, :8, :9, to_date(:10,'yyyy-mm-dd hh24:mi:ss'), :11, :12, to_date(:13,'yyyy-mm-dd hh24:mi:ss'), :14)"
                            # sql = "insert into DOCUMENTS(UUID,SOURCE_CODE,LANG,CRA_DT,AUTHOR,ABSTRACT,AURDEPT,KYWRD,JOURNAL,PUB_DT,URL,SUFFIX,UPLD_DT,TITLE)values('ce017578ec0411ea91d6a85e45b3a491', '1010', 'ZH', to_date('20200830203320','yyyy-mm-dd hh24:mi:ss'), '龙视要闻', '', '', '', '', to_date('20200830203320','yyyy-mm-dd hh24:mi:ss'), 'http://baijiahao.baidu.com/s?id=1676355714433432996', 'txt', to_date('20200830203320','yyyy-mm-dd hh24:mi:ss'), 'CRA202008302033200001美国最机密武器五年来首次现身,莫斯科:敢挑衅就摧毁')"
                            # sql = "insert into DOCUMENTS(UUID,SOURCE_CODE,LANG,CRA_DT,AUTHOR,ABSTRACT,AURDEPT,KYWRD,JOURNAL,PUB_DT,URL,SUFFIX,UPLD_DT,TITLE)values('fUiiiuid', '', '', '', '', '', '', '', '', '', '', '', '', '')"
                            # sql = "insert into DOCUMENTS(UUID,CRA_DT,TITLE,AUTHOR,AURDEPT,KYWRD)values('fUuid', to_date('2020-06-29 00:00:00','yyyy-mm-dd hh24:mi:ss'), 'hhhhhhhh', 'jjjjjjjjj', 'ooooo', 'ppppppp')"
                            # a = ('ce017578ec0411ea91d6a85e45b3a491', '1010', 'ZH', '20200830203320', '龙视要闻', '', '', '', '', '2020-08-29', 'http://baijiahao.baidu.com/s?id=1676355714433432996', 'txt', '20200901113956', 'CRA202008302033200001美国最机密武器五年来首次现身,莫斯科:敢挑衅就摧毁')
                            # upload to Oracle
                            cursor.executemany(
                                sql,
                                # # "insert into DOCUMENTS(UUID,CRA_DT,TITLE,AUTHOR,AURDEPT,KYWRD,JOURNAL,PUB_DT,URL,SUFFIX,UPLD_DT,SOURCE_CODE,LANG,ABSTRACT,CONTENT_ORI)values(:1, to_date(:2,'yyyy-mm-dd hh24:mi:ss'), :3, :4, :5, :6, :7, :8, :9, :10, :11, :12, :13, :14, :15)",
                                # # "insert into DOCUMENTS(UUID,CRA_DT,TITLE,AUTHOR,AURDEPT,KYWRD,ABSTRACT,JOURNAL,PUB_DT,URL,SUFFIX,UPLD_DT,CONTENT_ORI,SOURCE_CODE,LANG)values(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)",
                                # # "insert into DOCUMENTS(UUID,CRA_DT,TITLE)values(:1, :2, :3)",
                                save_file[n:n + 1])
                            # str(save_file[n:n+1]).replace('[','').replace(']',''))
                            # cursor.execute(sql)
                            try:
                                # upload to hdfs
                                t = self.upload_filepath.rindex('/')
                                self.client.upload(
                                    self.inputpath,
                                    self.upload_filepath[0:t + 1] + save_file[n][7] + self.suffix)
                            except Exception as e:
                                print("upload error!", e)
                        else:
                            break
                    except Exception as e:
                        print("1111111", e)
                else:
                    break
            self.CrawProcess.emit("import finished")
            cursor.close()
            self.conn.commit()
        if f_all_dict[f_key][2] is not None:
            self.upload_txt(f_all_dict[f_key][2])

    format='%(asctime)s - %(levelname)s: %(message)s')

cfg = configparser.ConfigParser()
cfg.read('config.conf')
cfg.sections()
hdfs_mvpath = cfg.get('path', 'hdfs_mvpath')
tmp_path = cfg.get('path', 'tmp_path')
oldhdfs_url = cfg.get('oldhdfs', 'url')
oldhdfs_root = cfg.get('oldhdfs', 'root')
newhdfs_url = cfg.get('newhdfs', 'url')
newhdfs_root = cfg.get('newhdfs', 'root')
dingrobot = cfg.get('alert', 'dingrobot')
notice = cfg.get('alert', 'notice')

oldhdfs = hdfs.Client(oldhdfs_url, root=oldhdfs_root, timeout=100, session=False)
# newhdfs = hdfs.Client(newhdfs_url, root=newhdfs_root, timeout=100, session=False)
#oldhdfs = InsecureClient(oldhdfs_url, user="******")
newhdfs = InsecureClient(newhdfs_url, user="******")

L = threading.Lock()
hdfs_mvpathlist = hdfs_mvpath.strip(',').split(',')


class Producer(threading.Thread):
    def __init__(self, name):
        threading.Thread.__init__(self)
        self.name = name


"""
Because of the GIL, prefer processes when you can, or solve it in C.
"""
hdfs_url = "http://192.168.100.120:50070"
hdfs_path = "/tmp/hive/hive"
last_day = 60 * 60 * 24 * 1000
# delete files from before the last 10 days
last_days = 9 * last_day
# get the current time (milliseconds)
current_time = round(time.time() * 1000)
# create the hdfs connection
client = hdfs.Client(hdfs_url, timeout=100, session=False)
# walk the target directory and collect the files under it
for first_depth in client.walk(hdfs_path, depth=1, status=True):
    for second_depth in first_depth:
        if type(second_depth) == list and len(second_depth) != 0:
            # the file entries, each stored as a (name, status dict) pair
            file_list = second_depth
            # keep each file name together with its timestamp
            list_files = []
            for file_tuple in file_list:
                # build a (file name, last modification time) tuple
                dict_tuple = (file_tuple[0], file_tuple[1]["modificationTime"])
                # append it to the list
                list_files.append(dict_tuple)

            raise Exception(e)
        finally:
            cursor.close()
        filepath = local_path + filename
        with open(filepath, 'wb+') as f:
            for row_data in res:
                f.write(''.join(row_data).encode('utf-8') + '\n')
        logger.info('finished writing local file %s' % filepath)
        try:
            logger.info('uploading data to hdfs')
            txt_hdfs_path = hdfs_path + filename
            client = hdfs.Client("http://192.10.86.31:50070", root="/", timeout=100, session=False)
            client.delete(hdfs_path, recursive=True)
            client.upload(txt_hdfs_path, filepath)
            logger.info('upload finished')
        except Exception, e:
            logger.error(e)
            raise Exception(e)
        oscmd = 'rm -f ' + filepath
        logger.info('deleting the local file')
        res = os.system(oscmd)
        (retcode, retinfo) = ('success', '')
    except Exception, e:
        (retcode, retinfo) = ('fail', e)

for line in f.readlines():
    tem = line[:-1].split(' ')
    if len(tem) < 2:
        break
    x = biao[tem[0]]
    G[x]["label"] = tem[1]
node_num = len(G.keys())
neg = [0 for i in range(node_num)]
for i, node in enumerate(G.keys()):
    neg[i] = negative[node] ** 0.75
s = sum(neg)
for i in range(1, node_num):
    neg[i] = neg[i - 1] + neg[i]
neg = [neg[i] / s for i in range(node_num)]
G['negative'] = neg
f.close()

f = open('graph.txt', 'w')
f.write(str(G))
f.close()

client = hdfs.Client("http://localhost:50070", timeout=100, session=False)
client.upload("/", "graph.txt", overwrite=True)
# with client.read('/graph.txt') as reader:
#     G = eval(reader.read())
#     a = list(G.keys())[:-1]
#     print(a)

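# A hedged illustration (not part of the original script) of how the cumulative
# table G['negative'] built above is typically consumed: draw a negative sample
# by binary-searching a uniform random number against the cumulative
# distribution. It assumes G still lists the graph nodes first and the
# 'negative' key last, as constructed above.
import bisect
import random

def sample_negative_node(G):
    nodes = [k for k in G.keys() if k != 'negative']
    r = random.random()
    idx = bisect.bisect_left(G['negative'], r)
    return nodes[min(idx, len(nodes) - 1)]
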
import hdfs
import pymongo
import json
import os
import time

# start mongodb
# sudo mongod --dbpath=/Users/h2p/Documents/Project/data/db

client = hdfs.Client('http://*:50070', root='/')
print('connecting to hdfs')
# client = hdfs.Client('http://*:50070', root='/')
# client = hdfs.Client('http://*:50070', root='/')

print('connecting to mongodb')
# myClient = pymongo.MongoClient(host='*', port=20000)
myClient = pymongo.MongoClient(host='127.0.0.1', port=27017)
mydb = myClient['CloudComputing']
mycol = mydb['UserInfo']

print('reading the ids of already transferred Mongo documents')
Mongo_json_OK = []
with open('Mongo_json_OK.txt', 'r', encoding='utf-8') as f:
    mongoId = f.readline().strip()
    while mongoId:
        Mongo_json_OK.append(mongoId)
        mongoId = f.readline().strip()

print('reading Mongo data')
count = len(Mongo_json_OK)
for item in mycol.find():

        df_fact.EUTRANCELLFDD == rdd_dimension.EUTRANCELLFDD,
        'left').select(df_fact.EUTRANCELLFDD, 'DATETIME', rdd_dimension.REGION,
                       rdd_dimension.MARKET, 'PMACTIVEUEDLSUM')
    outputDF.show()
    outputDF.write.format('com.databricks.spark.csv').save(
        '/Users/Joy4fun/Desktop/joined_' + filename)
    #output to hdfs
    #outputDF.write.format('com.databricks.spark.csv').save(outDirectory + filename)
    print 'Writing is done!'
    sc.stop()


if __name__ == "__main__":
    intDirectory = "hdfs://hdfs2:8020/user/ec2-user/sample_data_eri/"
    outDirectory = "hdfs://hdfs2:8020/user/ec2-user/joined_data/"
    hdfs_fact_Files = hdfs.Client('http://hdfs2:50070').list(
        '/user/ec2-user/sample_data_eri/')
    hdfs_dimension_File = "hdfs://hdfs2:8020/user/ec2-user/ERI_CELL_REGION_MARKET.csv"
    Spark_join_csv_in_hdfs("local", hdfs_fact_Files, hdfs_dimension_File,
                           hdfs_fact_Files, intDirectory, outDirectory)
    #print Spark_read_write_csv_to_hdfs("s3", localFiles, outDirectory)

#spark-submit --packages com.databricks:spark-csv_2.10:1.5.0 Spark-1.6-read-write-CSV-to-hdfs-yarn-CDH5.py
#pyspark --packages com.databricks:spark-csv_2.11:1.5.0
#If you don't want to give --packages option, please:
# sudo cp downloads/spark-csv_2.11-1.5.0.jar /Library/Python/2.7/site-packages/pyspark/jars/.

import hdfs
import pymongo
import json
import os
import time

client = hdfs.Client('http://172.19.240.199:50070', root='/')
myClient = pymongo.MongoClient(host='172.19.240.199', port=20000)
mydb = myClient['CloudComputing']
mycol = mydb['UserInfo']

Mongo_json_OK = []
with open('Mongo_json_OK.txt', 'r', encoding='utf-8') as f:
    mongoId = f.readline().strip()
    while mongoId:
        Mongo_json_OK.append(mongoId)
        mongoId = f.readline().strip()

count = len(Mongo_json_OK)
for item in mycol.find():
    item['_id'] = str(item['_id'])
    if item['_id'] not in Mongo_json_OK:
        filePath = './json/' + item['_id'] + '.json'
        with open(filePath, 'w', encoding='utf-8') as f:
            json.dump(item, f, ensure_ascii=False)
        client.upload('/streaminput/', filePath, overwrite=True)
        os.remove(filePath)
        Mongo_json_OK.append(item['_id'])
        with open('Mongo_json_OK.txt', 'a', encoding='utf-8') as f:

def main(args):
    server = tf.train.Server(cluster, job_name=FLAGS.job, task_index=FLAGS.task)
    client = hdfs.Client("http://10.76.3.92:50070", root='/', timeout=100)
    with client.read('/graph.txt') as reader:
        G = eval(reader.read())
    FLAGS.worker = len(cluster_dic['worker'])
    FLAGS.node_num = len(G.keys()) - 1
    # FLAGS.train_steps = FLAGS.node_num//FLAGS.batch_size*20
    FLAGS.train_steps = 4
    is_chief = (FLAGS.task == 0)
    if FLAGS.job == 'ps':
        server.join()

    with tf.device(tf.train.replica_device_setter(cluster=cluster)):
        global_step = tf.Variable(0, name='global_step', trainable=False)  # global training step counter
        emb_init = (np.random.randn(FLAGS.node_num, FLAGS.dim) / np.sqrt(FLAGS.node_num / 2)).astype('float32')
        emb = tf.Variable(emb_init, name='emb', trainable=True)  # create and initialize the embedding matrix
        L_con = 0
        L_ucon = 0

        # placeholders for the training data
        pos = (FLAGS.w - 2 * FLAGS.cs) * 2 * FLAGS.cs
        xc_0 = tf.placeholder(dtype=tf.int32, shape=(pos * FLAGS.batch_size))  # positive edge sources
        xc_1 = tf.placeholder(dtype=tf.int32, shape=(pos * FLAGS.batch_size))  # positive edge targets
        xuc_0 = tf.placeholder(dtype=tf.int32, shape=(pos * FLAGS.ns * FLAGS.batch_size))  # non-edge sources
        xuc_1 = tf.placeholder(dtype=tf.int32, shape=(pos * FLAGS.ns * FLAGS.batch_size))  # non-edge targets
        # placeholder for the test data
        val = tf.placeholder(dtype=tf.int32, shape=(2, FLAGS.test_size, 2))  # equal numbers of sampled positive and negative edges

        # map the edge index sequences onto the embedding matrix
        con_0_emb = tf.squeeze(tf.nn.embedding_lookup(emb, xc_0))  # (batch, dim)
        con_1_emb = tf.squeeze(tf.nn.embedding_lookup(emb, xc_1))  # (batch, dim)
        ucon_0_emb = tf.squeeze(tf.nn.embedding_lookup(emb, xuc_0))  # (batch, dim)
        ucon_1_emb = tf.squeeze(tf.nn.embedding_lookup(emb, xuc_1))  # (batch, dim)

        # edge similarity for both positive and negative samples
        con_v = tf.sigmoid(
            tf.squeeze(tf.reduce_sum(tf.einsum('ni,ni->ni', con_0_emb, con_1_emb), -1)))
        ucon_v = tf.sigmoid(
            tf.squeeze(tf.reduce_sum(tf.einsum('ni,ni->ni', ucon_0_emb, ucon_1_emb), -1)))

        # skip-gram loss
        L_con -= tf.reduce_sum(tf.log(con_v + 1e-15))  # connection
        L_ucon -= tf.reduce_sum(tf.log(1 - ucon_v + 1e-15))  # unconnection
        loss = (L_con + L_ucon)
        optimizer = tf.train.AdamOptimizer(FLAGS.lr)
        train_op = optimizer.minimize(loss, global_step=global_step)
        AUC = tf.py_func(link_prediction, [val, emb], tf.double, stateful=True)
        # init_op = tf.global_variables_initializer()  # parameter initialization

    with tf.train.MonitoredTrainingSession(
            master=server.target,
            is_chief=is_chief,
            # hooks=[tf.train.StopAtStepHook(last_step=FLAGS.train_steps),
            #        tf.train.NanTensorHook(loss)],
            # checkpoint_dir="./checkpoint_dir",
            save_checkpoint_steps=100) as sess:
        time_begin = time.time()
        print('Training begins @ %f' % time_begin)
        local_step = 0
        step = 0
        dval = testing_data(FLAGS, G)
        val_feed = {val: dval}
        while not sess.should_stop() and step <= FLAGS.train_steps:
            dxc_0, dxc_1, dxuc_0, dxuc_1 = traning_data(FLAGS, G, local_step)
            train_feed = {xc_0: dxc_0, xc_1: dxc_1, xuc_0: dxuc_0, xuc_1: dxuc_1}
            _, step, _loss = sess.run([train_op, global_step, loss], feed_dict=train_feed)
            local_step += 1
            now = time.time()
            print('%f: Worker %d: training step %d done (global step: %d/%d), and loss: %f'
                  % (now, FLAGS.task, local_step - 1, step, FLAGS.train_steps, _loss))
            if local_step % 10 == 0 and local_step != 0:
                auc = sess.run([AUC], feed_dict=val_feed)
                print("Link prediction AUC is %.2f" % auc[0])
            if step >= FLAGS.train_steps:
                break
        auc = sess.run([AUC], feed_dict=val_feed)
        print("Link prediction AUC is %.2f" % auc[0])
        time_end = time.time()
        print('Training ends @ %f' % time_end)
        train_time = time_end - time_begin
        print('Training elapsed time: %f s' % train_time)
        sleep_time = 0
        while sleep_time < 5:
            time.sleep(2)
            sleep_time += 1
            print("Waiting for other machines...")

import pyhdfs
import hdfs

if __name__ == '__main__':
    # client = pyhdfs.HdfsClient(hosts='127.0.0.1:50070', user_name='Administrator')
    # # client = client.get_active_namenode()
    # client.mkdirs('/hadoop')
    # print(client.listdir("/"))
    # client.copy_from_local('E:/data/test/pku98/199801.txt', '/dfs/1.txt')
    # print(client.listdir("/"))

    client = hdfs.Client(url='http://127.0.0.1:50070')
    client.makedirs('/test/hdfs', permission=777)
    client.upload('/test/hdfs', 'E:/data/test/pku98/199801.txt', overwrite=False)
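    # A hedged follow-up sketch: verify the upload above by listing the target
    # directory and copying the file back. The local destination is a
    # hypothetical example.
    print(client.list('/test/hdfs'))
    client.download('/test/hdfs/199801.txt', 'E:/data/test/pku98/199801_copy.txt', overwrite=True)
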
def __client(path):
    _, addr, file_path = path.split(':', 2)
    client = hdfs.Client('http:' + addr + ':50070')
    path = '/' + file_path.split('/', 1)[-1]
    return client, path

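# A hedged usage sketch for __client above, assuming input paths of the form
# 'hdfs://<host>:<port>/<path>'; the host and path below are hypothetical.
demo_client, demo_path = __client('hdfs://namenode-host:8020/warehouse/events/part-00000')
print(demo_path)  # -> '/warehouse/events/part-00000'
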
#!/usr/bin/env python3
# coding=utf-8
# date 2019-05-15 17:23:25
# author calllivecn <*****@*****.**>

import hdfs

cli = hdfs.Client("http://192.168.56.6:9870")

for l in cli.list("/", True):
    print(l)