def getReleaseFromHdfs_batch(releaseNumber=10):
    '''copy releases from the server using hdfs'''
    client = hdfs.Client("http://10.141.221.82:50070", root='/home/fdse')
    releases = client.list("/java/releases")
    for r in releases[:releaseNumber]:
        client.download(u'/java/releases/' + r, os.path.join(path, 'releases'))
        print r, 'has been downloaded...'

def __init__(self, uri='http://hadoop-hd1:50070', user='******', root='/user/hive/warehouse'):
    self.client = hdfs.Client(uri)
    self.user = user
    self.root = root

def spark_df_to_local_txt(local_path, df: pyspark.sql.DataFrame = None, deli="\t", hdfs_dir_path=None):
    def row_to_str(row):
        return deli.join([str(xx) for xx in row.asDict().values()])

    # map the DataFrame to an RDD of delimited strings
    if hdfs_dir_path is None:
        str_rdd = df.rdd.map(row_to_str)
        rand_bits = random.getrandbits(64)
        hdfs_dir_name = "str_df_%016x" % rand_bits
        hdfs_dir_path = "/tmp/" + hdfs_dir_name
        # save to hdfs
        str_rdd.saveAsTextFile(hdfs_dir_path)
    else:
        hdfs_dir_name = os.path.basename(hdfs_dir_path)
        hdfs_dir_path = "/tmp/" + hdfs_dir_name

    # download from hdfs to a local temp directory
    local_tmp_dir = "./tmp/" + hdfs_dir_name
    c = hdfs.Client("http://soldier1:50070")
    c.download(hdfs_dir_path, local_tmp_dir)
    # delete the temporary hdfs directory
    c.delete(hdfs_dir_path, recursive=True)

    # concatenate the downloaded part files into one local file
    with open(local_path, 'wb') as outfile:
        for tmp_f in os.listdir(local_tmp_dir):
            fn = os.path.join(local_tmp_dir, tmp_f)
            with open(fn, 'rb') as readfile:
                shutil.copyfileobj(readfile, outfile)
    # remove the local temp directory
    shutil.rmtree(local_tmp_dir)

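# A minimal usage sketch for spark_df_to_local_txt, not part of the original
# snippet: it assumes a local SparkSession and that the hard-coded NameNode
# "http://soldier1:50070" above is reachable; the DataFrame and output path
# below are hypothetical examples.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("df_to_txt_demo").getOrCreate()
demo_df = spark.createDataFrame([(1, "alpha"), (2, "beta")], ["id", "name"])
spark_df_to_local_txt("./demo_out.txt", demo_df, deli="\t")
spark.stop()
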
def __init__(self, uri='http://two-stream-master-prod-02:50070', user='******', root='hdfs://nameservicestream/user/hive/warehouse'):
    self.client = hdfs.Client(uri)
    self.user = user
    self.root = root

def put_hdfs(shcontext, filename, hdfs_path):
    try:
        os.environ['NLS_LANG'] = 'AMERICAN_AMERICA.AL32UTF8'  # set the environment variable to avoid garbled characters
        sh_path = '/root/spoolsh/' + filename + '.sh'
        txt_local_path = '/root/spooldata/' + filename + '.txt'
        f = open(sh_path, 'wb+')
        f.write(shcontext)
        f.close()
        print 'spool script written'
        oscmd1 = 'chmod +x ' + sh_path
        (res_status1, res_output1) = commands.getstatusoutput(oscmd1)
        oscmd2 = 'sh ' + sh_path
        (res_status2, res_output2) = commands.getstatusoutput(oscmd2)
        print 'spooled data to the local disk'
        txt_hdfs_path = hdfs_path + filename + '.txt'
        client = hdfs.Client("http://192.10.86.31:50070", root="/", timeout=100, session=False)
        client.delete(hdfs_path, recursive=True)
        client.upload(txt_hdfs_path, txt_local_path)
        print 'uploaded data to hdfs'
        oscmd3 = 'rm -f ' + txt_local_path
        print 'deleting the local file'
        res3 = os.system(oscmd3)
        return 'success'
    except Exception, e:
        print e
        return e

def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=5000,
                   seed=None):
    if fake_data:

        def fake():
            return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype, seed=seed)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    HOST = "192.168.205.185"
    NAME_NODE_PORT = 50070
    client = hdfs.Client('http://{}:{}'.format(HOST, NAME_NODE_PORT))

    with client.read(train_dir + "/" + TRAIN_IMAGES) as f:
        train_images = extract_images(f)
    with client.read(train_dir + "/" + TRAIN_LABELS) as f:
        train_labels = extract_labels(f, one_hot=one_hot)
    with client.read(train_dir + "/" + TEST_IMAGES) as f:
        test_images = extract_images(f)
    with client.read(train_dir + "/" + TEST_LABELS) as f:
        test_labels = extract_labels(f, one_hot=one_hot)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    options = dict(dtype=dtype, reshape=reshape, seed=seed)

    train = DataSet(train_images, train_labels, **options)
    validation = DataSet(validation_images, validation_labels, **options)
    test = DataSet(test_images, test_labels, **options)

    return base.Datasets(train=train, validation=validation, test=test)

def client(self, host='', port='', timeout=None):
    host = host or HDFS_CONFIG['host']
    port = port or HDFS_CONFIG['webport']
    if self._client is None:
        try:
            self._client = hdfs.Client('http://%s:%s' % (host, port), timeout=timeout, session=rs)
        except Exception, e:
            output('hdfs Exception ' + str(e), logType='hdfs')
            raise

def get_storage_file_list(self):
    from ..settings import FLINK_SAVEPOINT_PATH_BACKEND_ADDRESS
    import hdfs
    storage_file_backed = hdfs.Client(FLINK_SAVEPOINT_PATH_BACKEND_ADDRESS, timeout=10)
    storage_file_list = self.get_storage_file_list_status(
        storage_file_backed, 'savepoints', self.get_savepoint_path())
    # storage_file_list = self.get_storage_file_list_status(FLINK_SAVEPOINT_PATH_BACKEND, 'checkponits',
    #                                                       self.get_checkponit_path())
    storage_file_list.sort(key=lambda x: x.modification_time, reverse=True)
    return storage_file_list

def table_to_hdfs(d_info):
    d_info = p_insert_log(d_info)
    localtempfile = str(d_info.get('localpath')) + str(d_info.get('source_proc')) + '.tmp'
    localfile = str(d_info.get('localpath')) + str(d_info.get('source_proc')) + '.lz4'
    if d_info.get('syn_strategy') == 1:
        hdfspath = str(d_info.get('target_proc'))
    else:
        hdfspath = str(d_info.get('target_proc')) + str(d_info.get('acctday')) + '/'
    try:
        d_info['source_dns'] = getdns(d_info.get('source_dbname').upper())
        source_db = create_engine(d_info.get('source_dns'))
        source_conn = source_db.connect()
        try:
            seldata = list(source_conn.execute(text(d_info.get('sql_select').format(**d_info))))
        except Exception, etp:
            logging.error(etp)
            raise Exception(etp)
        finally:
            source_conn.close()
        with open(localtempfile, 'wb+') as f:
            for row_data in seldata:
                f.write(''.join(row_data).encode('utf-8') + '\n')
        logging.info('data written locally, starting compression')
        oscmd = 'lz4 ' + localtempfile + ' ' + localfile
        (status, output) = commands.getstatusoutput(oscmd)
        if status == 0:
            try:
                d_info['target_dns'] = getdns(d_info.get('target_dbname').upper())
                client = hdfs.Client(d_info.get('target_dns'), root="/", timeout=100, session=False)
                if not client.status(hdfspath, strict=False):
                    client.makedirs(hdfspath, permission=777)
                print localfile, hdfspath
                client.upload(hdfspath, localfile)
                d_info['finish_flag'] = 'finish'
                d_info['retcode'] = 'success'
            except Exception, etp:
                logging.error(etp)
                raise Exception(etp)

def download_file(hdfs_location, file_local):
    """
    :param hdfs_location: HDFS path + file name
    :param file_local: local path + file name
    """
    # a try/finally block could improve readability here
    client = hdfs.Client('http://172.39.8.61:50070', root='/', timeout=10)
    try:
        with client.read(hdfs_location) as r:
            with open(file_local, 'wb') as f:
                f.write(r.read())
    except hdfs.util.HdfsError:
        client = hdfs.Client('http://172.39.8.62:50070', root='/', timeout=10)
        with client.read(hdfs_location) as r:
            with open(file_local, 'wb') as f:
                f.write(r.read())
    except IOError as msg:
        with open("err.log", "a") as f:
            f.write(str(msg))
        file_local = None
    return file_local

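# A hedged usage sketch for download_file above; the HDFS source path and the
# local target are hypothetical examples.
local_copy = download_file('/data/input/events.csv', '/tmp/events.csv')
if local_copy is None:
    print('download failed, details were appended to err.log')
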
def __init__(self):
    try:
        self.zk = KazooClient(hosts=self.zooQuorum)
        self.zk.start()
        if self.zk.exists(self.path):
            data = self.zk.get(self.path)
            ip = None
            if self.nameNodeA[0] in data[0]:
                ip = 'http://' + self.nameNodeA[1] + ':50070'
            elif self.nameNodeB[0] in data[0]:
                ip = 'http://' + self.nameNodeB[1] + ':50070'
            self.client = hdfs.Client(ip, root='/')
    except Exception as e:
        print('%s kazooClient __init__ ERROR! %s' % (datetime.datetime.now(), traceback.format_exc()))

def _init_connect(self):
    """Connect to the hdfs server."""
    # first collect the hdfs ips
    ips = sorted([ip for hostname, ip in cluster_ip.items()])
    port = self.port  # port='50070'
    conn = None
    for host in ips:
        # build the connection url
        url = "http://{host}:{port}".format(host=host, port=port)
        try:
            conn = hdfs.Client(url, root='/', timeout=100, session=False)
            conn.list('/')
            break
        except:
            pass
    # if we reach this point without a working client, every candidate address failed
    if conn.list('/'):
        return conn
    else:
        raise Exception('no reachable hdfs address found, please check the configuration')

def __init__(self):
    # connect to zookeeper
    self.zk = KazooClient(hosts=self.zoo_quorum_dev)
    self.zk.start()
    # check whether the zookeeper sync node exists
    if self.zk.exists(self.path_dev):
        # get the currently active namenode
        data = self.zk.get(self.path_dev)
        ip = None
        if self.name_node_a_dev[0] in data[0]:
            ip = 'http://' + self.name_node_a_dev[1] + ':50070'
        elif self.name_node_b_dev[0] in data[0]:
            ip = 'http://' + self.name_node_b_dev[1] + ':50070'
        if ip:
            # connect to the active hdfs namenode
            self.client = hdfs.Client(ip, root='/')
            if not self.client:
                print('failed to create the hdfs client!')
        else:
            print('failed to resolve the nameNode IP!')

def upload_file(hdfs_location, local):
    try:
        client = hdfs.Client('http://172.39.8.61:50070', root='/', timeout=10)
        base_dir = local.split('/').pop()  # last folder of the local path being uploaded
        for root, dirs, files in os.walk(local):
            new_dir = base_dir + root.split(base_dir).pop()  # strip the local path prefix
            for file in files:
                old_path = root + '/' + file  # original local file
                lpath = new_dir + '/' + file  # file path with the local prefix removed
                if not client.status(hdfs_location + '/' + lpath, strict=False):
                    # args: remote path, local path, whether to overwrite, number of worker threads
                    client.upload(hdfs_location + '/' + lpath, old_path, overwrite=False)
    except Exception as e:
        with open("err.log", "a") as f:
            f.write(str(e))
            f.write('\n')

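# A hedged usage sketch for upload_file above: mirror a local directory tree
# into HDFS and list what landed there. Both paths are hypothetical; the
# NameNode URL is the one hard-coded inside upload_file.
upload_file('/user/backup', '/home/worker/logs')
check_client = hdfs.Client('http://172.39.8.61:50070', root='/', timeout=10)
print(check_client.list('/user/backup'))
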
def connect_hdfs(cls):
    cls._client = hdfs.Client(**cls._parm_handler())
    cls.handle_dns()

    #dataframeoutput = dataframeoutput.drop('sum(EUCELL_DL_TPUT_NUM_KBITS)').drop('sum(DLPRBUSEDWITHDSPUC_FDUSERS)').drop('sum(DLPRBUSEDWITHDSPUC_FSUSERS)').drop('sum(EUCELL_DL_DRB_TPUT_NUM_KBITS)').drop('sum(EUCELL_DL_DRB_TPUT_DEN_SECS)')
    dataframeoutput = dataframeoutput.select(
        'DATE', 'MARKET', 'VENDOR', 'BAND', 'Cell Traffic (kbytes)',
        'Cell Used PRB', 'Cell Spectral Efficiency (bps/Hz)',
        'UE Traffic (kbytes)', 'UE Active Time (s)', 'UE Tput (kbps)',
        'Total cell count', 'Total Spectrum in MHz')
    dataframeoutput = dataframeoutput.coalesce(1)
    # take action here
    dataframeoutput.write.format('com.databricks.spark.csv').save(outputName)
    difference = dt.datetime.now() - start
    dataframeoutput.unpersist()
    sc.stop()
    return difference


if __name__ == "__main__":
    outDirectory = os.path.join(os.path.dirname(__file__), 'report/')
    hdfsFiles = hdfs.Client('http://hdfs1:50070').list(
        '/user/ec2-user/sample-data')  # Use namenode public ip http://namenode:50070
    print "start"
    print ALU_LTE_SPARK().run("hdfs", hdfsFiles, outDirectory)
    print "OK"
    exit()

#Submit job to yarn on top of your hdfs cluster
#spark-submit --master yarn --deploy-mode cluster --num-executors 3 --executor-cores 2 --executor-memory 2g --packages com.databricks:spark-csv_2.10:1.5.0 spark-1.6-hdfs-yarn-CDH5.py
#For hdfs, pls find output file at:
#hdfs://user/ec2-user/report/result_group_by_MARKET_ALU_2017_spark_hdfs.csv

import hdfs

client = hdfs.Client('http://localhost:9870', root='/')
print(client.list('/'))

with client.write('/streaming.txt', encoding='utf-8') as hdfs_file:
    for idx in range(10_000):
        hdfs_file.write(f'{idx} {idx ** 2} {idx ** 3}\n')

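# A hedged read-back sketch for the file written above; it reuses the same
# client and assumes /streaming.txt now exists on that NameNode.
with client.read('/streaming.txt', encoding='utf-8') as reader:
    for line_no, line in enumerate(reader):
        if line_no >= 5:  # preview only the first few lines
            break
        print(line.rstrip())
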
hiddenLayer3 = add_layer("layer3", hiddenLayer2, in_size=512, out_size=128, activation_function=tf.nn.relu)
prediction = add_layer("end", hiddenLayer3, in_size=128, out_size=3, activation_function=tf.nn.softmax)

# loss = tf.reduce_mean(tf.reduce_sum(y_lable - prediction))
loss = -tf.reduce_mean(y_lable * tf.log(tf.clip_by_value(prediction, 1e-10, 1.0)))
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

client = hdfs.Client(HADOOP_IP_PORT, root="/", timeout=500, session=False)
fileList = client.list(HADOOP_PATH)

epoch_times = 0
while True:
    epoch_times += 1
    for file_loop in fileList:  # each file holds one batch of data
        print('\n', file_loop)
        get_data(client, HADOOP_PATH + file_loop)
        # for i in range(2000):
        #

def upload_pfile(self, cur_dir):
    '''
    Build a dict that stores, for each crawler plugin, its attribute file, the
    original-document folder, and the txt folder.
    The key is the plugin name; the value is a list of [attribute file path,
    original-document folder (ori), txt folder (txt)].
    When the ori or txt folder does not exist, None is used as a placeholder.
    '''
    f_all_dict = {}
    print("mmmmmmmmmmmmmm", cur_dir)
    for f in os.listdir(cur_dir):
        if f.find('文献属性.xls') > 0 or f.find('文献属性.xlsx') > 0:
            filepro = cur_dir + f if cur_dir[-1] == '/' else cur_dir + '/' + f
            filepath = cur_dir + f[:f.find('文献属性')] + '_ori/' if cur_dir[-1] == '/' else cur_dir + '/' + f[:f.find('文献属性')] + '_ori/'
            filetxt = cur_dir + f[:f.find('文献属性')] + '_txt/' if cur_dir[-1] == '/' else cur_dir + '/' + f[:f.find('文献属性')] + '_txt/'
            f_all_dict.setdefault(f[:f.find('文献属性')], []).append(filepro) if os.path.exists(filepro) else f_all_dict.setdefault(f[:f.find('文献属性')], []).append(None)
            f_all_dict.setdefault(f[:f.find('文献属性')], []).append(filepath) if os.path.exists(filepath) else f_all_dict.setdefault(f[:f.find('文献属性')], []).append(None)
            f_all_dict.setdefault(f[:f.find('文献属性')], []).append(filetxt) if os.path.exists(filetxt) else f_all_dict.setdefault(f[:f.find('文献属性')], []).append(None)
    for f_key in f_all_dict.keys():
        if f_all_dict[f_key][0] is not None:
            book = xlrd.open_workbook(f_all_dict[f_key][0])
            sheet = book.sheet_by_index(0)
            ut = time.strftime('%Y%m%d%H%M%S', time.localtime(time.time()))
            ops = []
            '''
            Read the Excel sheet row by row; when the crawl tag starts with CRA,
            do not add the upload tag UPA.
            '''
            for r in range(1, sheet.nrows):
                if sheet.cell(r, 0).value[0:3] == "CRA":
                    # values = (sheet.cell(r, 0).value[3:], sheet.cell(r, 2).value, sheet.cell(r, 3).value,sheet.cell(r, 4).value,sheet.cell(r, 5).value,sheet.cell(r, 6).value,sheet.cell(r, 7).value,sheet.cell(r, 8).value,sheet.cell(r, 9).value,sheet.cell(r,10).value,ut)
                    values = (sheet.cell(r, 0).value[3:], sheet.cell(r, 3).value,
                              sheet.cell(r, 4).value, sheet.cell(r, 5).value,
                              sheet.cell(r, 2).value, sheet.cell(r, 7).value,
                              sheet.cell(r, 8).value, sheet.cell(r, 9).value,
                              sheet.cell(r, 10).value, ut, sheet.cell(r, 6).value)
                    # print("6666666666666", values)
                    ops.append(values)
                else:
                    tag = "UPA" + time.strftime('%Y%m%d%H%M%S', time.localtime(time.time()))
                    values = (tag[3:], str(sheet.cell(r, 3).value),
                              str(sheet.cell(r, 4).value), str(sheet.cell(r, 5).value),
                              tag + str(r).zfill(4) + str(sheet.cell(r, 2).value),
                              str(sheet.cell(r, 7).value), str(sheet.cell(r, 8).value),
                              str(sheet.cell(r, 9).value), str(sheet.cell(r, 10).value),
                              ut, str(sheet.cell(r, 6).value))
                    # print("555555555555", values)
                    ops.append(values)
        if f_all_dict[f_key][1] is not None:
            f_list = os.listdir(f_all_dict[f_key][1])
            f_list_doc = []
            '''Find all documents under the original-document folder and store them in f_list_doc.'''
            for f in f_list:
                if os.path.splitext(f)[1] in {'.caj', '.pdf', '.txt', '.doc', '.docx'}:
                    self.suffix = os.path.splitext(f)[1]
                    f_list_doc.append(f)
            temp_list = []
            # print("f_list_doc", len(f_list_doc)) 40
            for f in f_list_doc:
                for item in ops:
                    # print("item", item[10][0:3])
                    if item[4][0:3] == 'CRA':
                        if item[4] == os.path.splitext(f)[0]:
                            temp_list.append(item)
                    else:
                        if item[4][21:] == os.path.splitext(f)[0]:
                            temp_list.append(item)
            '''Files that exist and match the attribute records one-to-one.'''
            save_file = []
            for file in temp_list:
                fileUuid = (str(uuid.uuid1()).replace("-", ""), )
                if file[4][0:3] == 'CRA':
                    filepath = f_all_dict[f_key][1] + file[4] + '.' + file[8]
                else:
                    filepath = f_all_dict[f_key][1] + file[4][21:] + '.' + file[8]
                self.upload_filepath = filepath
                try:
                    b = open(filepath, "rb").read()
                    origin = (pymysql.Binary(b), )
                    newfile = fileUuid + ("1010", "ZH") + file + origin
                    save_file.append(newfile)
                except OSError:
                    if file[4][0:3] == 'CRA':
                        print('file %s not found' % file[4])
                    else:
                        print('file %s not found' % file[4][21:])
            cursor = self.conn.cursor()
            for n in range(0, len(save_file)):
                a = globalVar.get_st()
                print(a)
                if a == 1:
                    self.CrawProcess.emit(str("importing %s\n" % (save_file[n][7])))
                    try:
                        self.hdfs_ip = "http://192.168.1.107:50070"
                        self.inputpath = '/4516/upload'
                        self.client = hdfs.Client(self.hdfs_ip)
                        if self.configs['flag'] == True:
                            # cursor.executemany(
                            #     "insert into DOCUMENTS(UUID,CRA_DT,TITLE,AUTHOR,AURDEPT,KYWRD,ABSTRACT,JOURNAL,PUB_DT,URL,SUFFIX,UPLD_DT,CONTENT_ORI,SOURCE_CODE,LANG)values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,%s,%s, %s, %s)",
                            #     save_file[n:n+1])
                            # sql = "insert into DOCUMENTS(UUID,CRA_DT,TITLE,AUTHOR,AURDEPT,KYWRD,JOURNAL,PUB_DT,URL,SUFFIX,UPLD_DT,SOURCE_CODE,LANG,ABSTRACT,CONTENT_ORI)values(:1, to_date(:2,'yyyy-mm-dd hh24:mi:ss'), :3, :4, :5, :6, :7, :8, :9, :10, :11, :12, :13, :14, :15)"
                            # sql = "insert into DOCUMENTS(UUID,CRA_DT,TITLE,AUTHOR,AURDEPT,KYWRD,JOURNAL,PUB_DT,URL,SUFFIX,UPLD_DT,SOURCE_CODE,LANG,ABSTRACT)values(:1, to_date(:2,'yyyy-mm-dd hh24:mi:ss'), :3, :4, :5, :6, :7, :8, :9, :10, :11, :12, :13, :14)"
                            # txt import
                            sql = "insert into DOCUMENTS(UUID,SOURCE_CODE,LANG,CRA_DT,AUTHOR,KYWRD,AURDEPT,TITLE,JOURNAL,PUB_DT,URL,SUFFIX,UPLD_DT,ABSTRACT,CONTENT_ORI)values(:1, :2, :3, to_date(:4,'yyyy-mm-dd hh24:mi:ss'), :5, :6, :7, :8, :9, to_date(:10,'yyyy-mm-dd hh24:mi:ss'), :11, :12, to_date(:13,'yyyy-mm-dd hh24:mi:ss'), :14,:15)"
                            # pdf import
                            # sql = "insert into DOCUMENTS(UUID,SOURCE_CODE,LANG,CRA_DT,AUTHOR,KYWRD,AURDEPT,TITLE,JOURNAL,PUB_DT,URL,SUFFIX,UPLD_DT,ABSTRACT)values(:1, :2, :3, to_date(:4,'yyyy-mm-dd hh24:mi:ss'), :5, :6, :7, :8, :9, to_date(:10,'yyyy-mm-dd hh24:mi:ss'), :11, :12, to_date(:13,'yyyy-mm-dd hh24:mi:ss'), :14)"
                            # sql = "insert into DOCUMENTS(UUID,SOURCE_CODE,LANG,CRA_DT,AUTHOR,ABSTRACT,AURDEPT,KYWRD,JOURNAL,PUB_DT,URL,SUFFIX,UPLD_DT,TITLE)values('ce017578ec0411ea91d6a85e45b3a491', '1010', 'ZH', to_date('20200830203320','yyyy-mm-dd hh24:mi:ss'), '龙视要闻', '', '', '', '', to_date('20200830203320','yyyy-mm-dd hh24:mi:ss'), 'http://baijiahao.baidu.com/s?id=1676355714433432996', 'txt', to_date('20200830203320','yyyy-mm-dd hh24:mi:ss'), 'CRA202008302033200001美国最机密武器五年来首次现身,莫斯科:敢挑衅就摧毁')"
                            # sql = "insert into DOCUMENTS(UUID,SOURCE_CODE,LANG,CRA_DT,AUTHOR,ABSTRACT,AURDEPT,KYWRD,JOURNAL,PUB_DT,URL,SUFFIX,UPLD_DT,TITLE)values('fUiiiuid', '', '', '', '', '', '', '', '', '', '', '', '', '')"
                            # sql = "insert into DOCUMENTS(UUID,CRA_DT,TITLE,AUTHOR,AURDEPT,KYWRD)values('fUuid', to_date('2020-06-29 00:00:00','yyyy-mm-dd hh24:mi:ss'), 'hhhhhhhh', 'jjjjjjjjj', 'ooooo', 'ppppppp')"
                            # a = ('ce017578ec0411ea91d6a85e45b3a491', '1010', 'ZH', '20200830203320', '龙视要闻', '', '', '', '', '2020-08-29', 'http://baijiahao.baidu.com/s?id=1676355714433432996', 'txt', '20200901113956', 'CRA202008302033200001美国最机密武器五年来首次现身,莫斯科:敢挑衅就摧毁')
                            # upload to Oracle
                            cursor.executemany(
                                sql,
                                # # "insert into DOCUMENTS(UUID,CRA_DT,TITLE,AUTHOR,AURDEPT,KYWRD,JOURNAL,PUB_DT,URL,SUFFIX,UPLD_DT,SOURCE_CODE,LANG,ABSTRACT,CONTENT_ORI)values(:1, to_date(:2,'yyyy-mm-dd hh24:mi:ss'), :3, :4, :5, :6, :7, :8, :9, :10, :11, :12, :13, :14, :15)",
                                # # "insert into DOCUMENTS(UUID,CRA_DT,TITLE,AUTHOR,AURDEPT,KYWRD,ABSTRACT,JOURNAL,PUB_DT,URL,SUFFIX,UPLD_DT,CONTENT_ORI,SOURCE_CODE,LANG)values(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)",
                                # # "insert into DOCUMENTS(UUID,CRA_DT,TITLE)values(:1, :2, :3)",
                                save_file[n:n + 1])
                            # str(save_file[n:n+1]).replace('[','').replace(']',''))
                            # cursor.execute(sql)
                            try:
                                # upload to hdfs
                                t = self.upload_filepath.rindex('/')
                                self.client.upload(
                                    self.inputpath,
                                    self.upload_filepath[0:t + 1] + save_file[n][7] + self.suffix)
                            except Exception as e:
                                print("upload error!", e)
                        else:
                            break
                    except Exception as e:
                        print("1111111", e)
                else:
                    break
            self.CrawProcess.emit("import finished")
            cursor.close()
            self.conn.commit()
        if f_all_dict[f_key][2] is not None:
            self.upload_txt(f_all_dict[f_key][2])

    format='%(asctime)s - %(levelname)s: %(message)s')

cfg = configparser.ConfigParser()
cfg.read('config.conf')
cfg.sections()
hdfs_mvpath = cfg.get('path', 'hdfs_mvpath')
tmp_path = cfg.get('path', 'tmp_path')
oldhdfs_url = cfg.get('oldhdfs', 'url')
oldhdfs_root = cfg.get('oldhdfs', 'root')
newhdfs_url = cfg.get('newhdfs', 'url')
newhdfs_root = cfg.get('newhdfs', 'root')
dingrobot = cfg.get('alert', 'dingrobot')
notice = cfg.get('alert', 'notice')

oldhdfs = hdfs.Client(oldhdfs_url, root=oldhdfs_root, timeout=100, session=False)
# newhdfs = hdfs.Client(newhdfs_url, root=newhdfs_root, timeout=100, session=False)
#oldhdfs = InsecureClient(oldhdfs_url, user="******")
newhdfs = InsecureClient(newhdfs_url, user="******")

L = threading.Lock()
hdfs_mvpathlist = hdfs_mvpath.strip(',').split(',')


class Producer(threading.Thread):
    def __init__(self, name):
        threading.Thread.__init__(self)
        self.name = name


"""
Because of the GIL, prefer processes when you can, or solve it in C.
"""
hdfs_url = "http://192.168.100.120:50070"
hdfs_path = "/tmp/hive/hive"
last_day = 60 * 60 * 24 * 1000
# delete files from before the last 10 days
last_days = 9 * last_day
# get the current time (milliseconds)
current_time = round(time.time() * 1000)
# create the hdfs connection
client = hdfs.Client(hdfs_url, timeout=100, session=False)
# walk the target directory and collect the files under it
for first_depth in client.walk(hdfs_path, depth=1, status=True):
    for second_depth in first_depth:
        if type(second_depth) == list and len(second_depth) != 0:
            # the file entries, each stored as a (name, status dict) pair
            file_list = second_depth
            # keep each file name together with its timestamp
            list_files = []
            for file_tuple in file_list:
                # build a (file name, last modification time) tuple
                dict_tuple = (file_tuple[0], file_tuple[1]["modificationTime"])
                # append it to the list
                list_files.append(dict_tuple)

            raise Exception(e)
        finally:
            cursor.close()
        filepath = local_path + filename
        with open(filepath, 'wb+') as f:
            for row_data in res:
                f.write(''.join(row_data).encode('utf-8') + '\n')
        logger.info('finished writing local file %s' % filepath)
        try:
            logger.info('uploading data to hdfs')
            txt_hdfs_path = hdfs_path + filename
            client = hdfs.Client("http://192.10.86.31:50070", root="/", timeout=100, session=False)
            client.delete(hdfs_path, recursive=True)
            client.upload(txt_hdfs_path, filepath)
            logger.info('upload finished')
        except Exception, e:
            logger.error(e)
            raise Exception(e)
        oscmd = 'rm -f ' + filepath
        logger.info('deleting the local file')
        res = os.system(oscmd)
        (retcode, retinfo) = ('success', '')
    except Exception, e:
        (retcode, retinfo) = ('fail', e)

for line in f.readlines():
    tem = line[:-1].split(' ')
    if len(tem) < 2:
        break
    x = biao[tem[0]]
    G[x]["label"] = tem[1]
node_num = len(G.keys())
neg = [0 for i in range(node_num)]
for i, node in enumerate(G.keys()):
    neg[i] = negative[node] ** 0.75
s = sum(neg)
for i in range(1, node_num):
    neg[i] = neg[i - 1] + neg[i]
neg = [neg[i] / s for i in range(node_num)]
G['negative'] = neg
f.close()

f = open('graph.txt', 'w')
f.write(str(G))
f.close()

client = hdfs.Client("http://localhost:50070", timeout=100, session=False)
client.upload("/", "graph.txt", overwrite=True)
# with client.read('/graph.txt') as reader:
#     G = eval(reader.read())
#     a = list(G.keys())[:-1]
#     print(a)

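# A hedged illustration (not part of the original script) of how the cumulative
# table G['negative'] built above is typically consumed: draw a negative sample
# by binary-searching a uniform random number against the cumulative
# distribution. It assumes G still lists the graph nodes first and the
# 'negative' key last, as constructed above.
import bisect
import random

def sample_negative_node(G):
    nodes = [k for k in G.keys() if k != 'negative']
    r = random.random()
    idx = bisect.bisect_left(G['negative'], r)
    return nodes[min(idx, len(nodes) - 1)]
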
import hdfs
import pymongo
import json
import os
import time

# start mongodb
# sudo mongod --dbpath=/Users/h2p/Documents/Project/data/db

client = hdfs.Client('http://*:50070', root='/')
print('connecting to hdfs')
# client = hdfs.Client('http://*:50070', root='/')
# client = hdfs.Client('http://*:50070', root='/')

print('connecting to mongodb')
# myClient = pymongo.MongoClient(host='*', port=20000)
myClient = pymongo.MongoClient(host='127.0.0.1', port=27017)
mydb = myClient['CloudComputing']
mycol = mydb['UserInfo']

print('reading the ids of already transferred Mongo documents')
Mongo_json_OK = []
with open('Mongo_json_OK.txt', 'r', encoding='utf-8') as f:
    mongoId = f.readline().strip()
    while mongoId:
        Mongo_json_OK.append(mongoId)
        mongoId = f.readline().strip()

print('reading Mongo data')
count = len(Mongo_json_OK)
for item in mycol.find():

        df_fact.EUTRANCELLFDD == rdd_dimension.EUTRANCELLFDD,
        'left').select(df_fact.EUTRANCELLFDD, 'DATETIME', rdd_dimension.REGION,
                       rdd_dimension.MARKET, 'PMACTIVEUEDLSUM')
    outputDF.show()
    outputDF.write.format('com.databricks.spark.csv').save(
        '/Users/Joy4fun/Desktop/joined_' + filename)
    #output to hdfs
    #outputDF.write.format('com.databricks.spark.csv').save(outDirectory + filename)
    print 'Writing is done!'
    sc.stop()


if __name__ == "__main__":
    intDirectory = "hdfs://hdfs2:8020/user/ec2-user/sample_data_eri/"
    outDirectory = "hdfs://hdfs2:8020/user/ec2-user/joined_data/"
    hdfs_fact_Files = hdfs.Client('http://hdfs2:50070').list(
        '/user/ec2-user/sample_data_eri/')
    hdfs_dimension_File = "hdfs://hdfs2:8020/user/ec2-user/ERI_CELL_REGION_MARKET.csv"
    Spark_join_csv_in_hdfs("local", hdfs_fact_Files, hdfs_dimension_File,
                           hdfs_fact_Files, intDirectory, outDirectory)
    #print Spark_read_write_csv_to_hdfs("s3", localFiles, outDirectory)

#spark-submit --packages com.databricks:spark-csv_2.10:1.5.0 Spark-1.6-read-write-CSV-to-hdfs-yarn-CDH5.py
#pyspark --packages com.databricks:spark-csv_2.11:1.5.0
#If you don't want to give --packages option, please:
# sudo cp downloads/spark-csv_2.11-1.5.0.jar /Library/Python/2.7/site-packages/pyspark/jars/.

import hdfs
import pymongo
import json
import os
import time

client = hdfs.Client('http://172.19.240.199:50070', root='/')
myClient = pymongo.MongoClient(host='172.19.240.199', port=20000)
mydb = myClient['CloudComputing']
mycol = mydb['UserInfo']

Mongo_json_OK = []
with open('Mongo_json_OK.txt', 'r', encoding='utf-8') as f:
    mongoId = f.readline().strip()
    while mongoId:
        Mongo_json_OK.append(mongoId)
        mongoId = f.readline().strip()

count = len(Mongo_json_OK)
for item in mycol.find():
    item['_id'] = str(item['_id'])
    if item['_id'] not in Mongo_json_OK:
        filePath = './json/' + item['_id'] + '.json'
        with open(filePath, 'w', encoding='utf-8') as f:
            json.dump(item, f, ensure_ascii=False)
        client.upload('/streaminput/', filePath, overwrite=True)
        os.remove(filePath)
        Mongo_json_OK.append(item['_id'])
        with open('Mongo_json_OK.txt', 'a', encoding='utf-8') as f:

def main(args):
    server = tf.train.Server(cluster, job_name=FLAGS.job, task_index=FLAGS.task)
    client = hdfs.Client("http://10.76.3.92:50070", root='/', timeout=100)
    with client.read('/graph.txt') as reader:
        G = eval(reader.read())
    FLAGS.worker = len(cluster_dic['worker'])
    FLAGS.node_num = len(G.keys()) - 1
    # FLAGS.train_steps = FLAGS.node_num//FLAGS.batch_size*20
    FLAGS.train_steps = 4
    is_chief = (FLAGS.task == 0)
    if FLAGS.job == 'ps':
        server.join()

    with tf.device(tf.train.replica_device_setter(cluster=cluster)):
        global_step = tf.Variable(0, name='global_step', trainable=False)  # global training step counter
        emb_init = (np.random.randn(FLAGS.node_num, FLAGS.dim) / np.sqrt(FLAGS.node_num / 2)).astype('float32')
        emb = tf.Variable(emb_init, name='emb', trainable=True)  # create and initialize the embedding matrix
        L_con = 0
        L_ucon = 0

        # placeholders for the training data
        pos = (FLAGS.w - 2 * FLAGS.cs) * 2 * FLAGS.cs
        xc_0 = tf.placeholder(dtype=tf.int32, shape=(pos * FLAGS.batch_size))  # positive edge sources
        xc_1 = tf.placeholder(dtype=tf.int32, shape=(pos * FLAGS.batch_size))  # positive edge targets
        xuc_0 = tf.placeholder(dtype=tf.int32, shape=(pos * FLAGS.ns * FLAGS.batch_size))  # non-edge sources
        xuc_1 = tf.placeholder(dtype=tf.int32, shape=(pos * FLAGS.ns * FLAGS.batch_size))  # non-edge targets
        # placeholder for the test data
        val = tf.placeholder(dtype=tf.int32, shape=(2, FLAGS.test_size, 2))  # equal numbers of sampled positive and negative edges

        # map the edge index sequences onto the embedding matrix
        con_0_emb = tf.squeeze(tf.nn.embedding_lookup(emb, xc_0))  # (batch, dim)
        con_1_emb = tf.squeeze(tf.nn.embedding_lookup(emb, xc_1))  # (batch, dim)
        ucon_0_emb = tf.squeeze(tf.nn.embedding_lookup(emb, xuc_0))  # (batch, dim)
        ucon_1_emb = tf.squeeze(tf.nn.embedding_lookup(emb, xuc_1))  # (batch, dim)

        # edge similarity for both positive and negative samples
        con_v = tf.sigmoid(
            tf.squeeze(tf.reduce_sum(tf.einsum('ni,ni->ni', con_0_emb, con_1_emb), -1)))
        ucon_v = tf.sigmoid(
            tf.squeeze(tf.reduce_sum(tf.einsum('ni,ni->ni', ucon_0_emb, ucon_1_emb), -1)))

        # skip-gram loss
        L_con -= tf.reduce_sum(tf.log(con_v + 1e-15))  # connection
        L_ucon -= tf.reduce_sum(tf.log(1 - ucon_v + 1e-15))  # unconnection
        loss = (L_con + L_ucon)
        optimizer = tf.train.AdamOptimizer(FLAGS.lr)
        train_op = optimizer.minimize(loss, global_step=global_step)
        AUC = tf.py_func(link_prediction, [val, emb], tf.double, stateful=True)
        # init_op = tf.global_variables_initializer()  # parameter initialization

    with tf.train.MonitoredTrainingSession(
            master=server.target,
            is_chief=is_chief,
            # hooks=[tf.train.StopAtStepHook(last_step=FLAGS.train_steps),
            #        tf.train.NanTensorHook(loss)],
            # checkpoint_dir="./checkpoint_dir",
            save_checkpoint_steps=100) as sess:
        time_begin = time.time()
        print('Training begins @ %f' % time_begin)
        local_step = 0
        step = 0
        dval = testing_data(FLAGS, G)
        val_feed = {val: dval}
        while not sess.should_stop() and step <= FLAGS.train_steps:
            dxc_0, dxc_1, dxuc_0, dxuc_1 = traning_data(FLAGS, G, local_step)
            train_feed = {xc_0: dxc_0, xc_1: dxc_1, xuc_0: dxuc_0, xuc_1: dxuc_1}
            _, step, _loss = sess.run([train_op, global_step, loss], feed_dict=train_feed)
            local_step += 1
            now = time.time()
            print('%f: Worker %d: training step %d done (global step: %d/%d), and loss: %f'
                  % (now, FLAGS.task, local_step - 1, step, FLAGS.train_steps, _loss))
            if local_step % 10 == 0 and local_step != 0:
                auc = sess.run([AUC], feed_dict=val_feed)
                print("Link prediction AUC is %.2f" % auc[0])
            if step >= FLAGS.train_steps:
                break
        auc = sess.run([AUC], feed_dict=val_feed)
        print("Link prediction AUC is %.2f" % auc[0])
        time_end = time.time()
        print('Training ends @ %f' % time_end)
        train_time = time_end - time_begin
        print('Training elapsed time: %f s' % train_time)
        sleep_time = 0
        while sleep_time < 5:
            time.sleep(2)
            sleep_time += 1
            print("Waiting for other machines...")

import pyhdfs
import hdfs

if __name__ == '__main__':
    # client = pyhdfs.HdfsClient(hosts='127.0.0.1:50070', user_name='Administrator')
    # # client = client.get_active_namenode()
    # client.mkdirs('/hadoop')
    # print(client.listdir("/"))
    # client.copy_from_local('E:/data/test/pku98/199801.txt', '/dfs/1.txt')
    # print(client.listdir("/"))

    client = hdfs.Client(url='http://127.0.0.1:50070')
    client.makedirs('/test/hdfs', permission=777)
    client.upload('/test/hdfs', 'E:/data/test/pku98/199801.txt', overwrite=False)
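    # A hedged follow-up sketch: verify the upload above by listing the target
    # directory and copying the file back. The local destination is a
    # hypothetical example.
    print(client.list('/test/hdfs'))
    client.download('/test/hdfs/199801.txt', 'E:/data/test/pku98/199801_copy.txt', overwrite=True)
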
def __client(path):
    _, addr, file_path = path.split(':', 2)
    client = hdfs.Client('http:' + addr + ':50070')
    path = '/' + file_path.split('/', 1)[-1]
    return client, path

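# A hedged usage sketch for __client above, assuming input paths of the form
# 'hdfs://<host>:<port>/<path>'; the host and path below are hypothetical.
demo_client, demo_path = __client('hdfs://namenode-host:8020/warehouse/events/part-00000')
print(demo_path)  # -> '/warehouse/events/part-00000'
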
#!/usr/bin/env python3
# coding=utf-8
# date 2019-05-15 17:23:25
# author calllivecn <*****@*****.**>

import hdfs

cli = hdfs.Client("http://192.168.56.6:9870")

for l in cli.list("/", True):
    print(l)