from hdfs import client


def get_hdfs_client(hdfs_host, user):
    """
    :return: client of hdfs
    """
    cli = client.InsecureClient(url=hdfs_host, user=user)
    return cli
def __get_hdfs_client(self):
    hdfs_host = "http://10.72.59.89:50070"
    user = "******"
    if self.is_kerberos:
        cli = KerberosClient(url=hdfs_host)
    else:
        cli = client.InsecureClient(url=hdfs_host, user=user)
    return cli
def get_hdfs_client():
    """
    :return: client of hdfs
    """
    hdfs_host = "http://10.72.59.89:50070"
    user = "******"
    cli = client.InsecureClient(url=hdfs_host, user=user)
    return cli
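A minimal usage sketch for the helper above. The host and user are the placeholder values from the snippet; "/tmp/demo" and "local.txt" are hypothetical paths used only for illustration.

if __name__ == "__main__":
    cli = get_hdfs_client()
    print(cli.list("/"))                  # list the HDFS root
    cli.makedirs("/tmp/demo")             # create a directory (hypothetical path)
    cli.upload("/tmp/demo", "local.txt")  # upload a hypothetical local file
    with cli.read("/tmp/demo/local.txt") as reader:
        print(reader.read())              # read the file back as bytes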
def upload_to_hdfs(localFileName, sparkDirName, params):
    clientHdfs = client.InsecureClient(params["hdfsHost"], user="******")
    # Remove any existing target with the same name before uploading.
    if sparkDirName.split('/')[-1] in clientHdfs.list(
            os.path.dirname(sparkDirName)):
        clientHdfs.delete(sparkDirName, recursive=True)
    clientHdfs.upload(sparkDirName, localFileName)
    logger.info("====\"{}\" upload to HDFS finished====".format(
        localFileName.split('/')[-1]))
    delete_before2_sparkData(sparkDirName, params)
def delete_before2_sparkData(fileName, params):
    clientHdfs = client.InsecureClient(params["hdfsHost"], user="******")
    # Build the name of the file generated two days before the current run.
    before2_dateStr1 = datetime.strftime(
        params["generateDate"] - timedelta(days=2), "%Y%m%d")
    pattern = re.compile(r'\d{8}')
    before2_fileName = re.sub(pattern, before2_dateStr1, fileName)
    # clientHdfs.list() returns base names, so compare against the base name.
    if os.path.basename(before2_fileName) in clientHdfs.list(
            os.path.dirname(fileName)):
        clientHdfs.delete(before2_fileName, recursive=True)
        logger.info("====\"{}\" delete finished ====".format(before2_fileName))
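A hedged sketch of driving the two helpers above. The params keys ("hdfsHost", "generateDate") are the ones the functions actually read; the endpoint, dates, and paths are hypothetical.

from datetime import datetime

params = {
    "hdfsHost": "http://10.72.59.89:50070",  # hypothetical WebHDFS endpoint
    "generateDate": datetime(2019, 7, 25),   # hypothetical generation date
}
# Uploads the local file to HDFS, then prunes the copy from two days earlier.
upload_to_hdfs("/data/local/spark_20190725.csv",
               "/crawler/spark_20190725.csv", params)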
from hdfs import client
from hdfs.ext.kerberos import KerberosClient


def get_hdfs_client(is_kerberos=False):
    """
    :return: client of hdfs
    """
    hdfs_host = "http://10.18.0.28:50070"
    user = "******"
    if is_kerberos:
        cli = KerberosClient(url=hdfs_host)
    else:
        cli = client.InsecureClient(url=hdfs_host, user=user)
    return cli
def run_load_model_predict(params, dfm_params):
    clientHdfs = client.InsecureClient(params['hdfsHost'], user="******")
    fileNames = clientHdfs.list(params['sparkDirName_onlineData'])
    # Drop Spark's _SUCCESS marker from the part-file listing.
    fileNames.remove('_SUCCESS')
    featureDict, _, _ = get_featureDict_info(params)
    dataParser = DataParser(params, featureDict)
    df_online = load_data(clientHdfs, params)
    df_online_index, df_online_value = dataParser.data_parser(df_online,
                                                              has_label=False)
    deep_model_predict = DeepFM_model_predict(params, dfm_params)
    deep_model_predict.write_result(params, df_online, df_online_index,
                                    df_online_value)
    deep_model_predict.sess.close()
    logger.info("====\"{}\" write finished====".format(
        params["localFileName_deepFM_result"].split("/")[-1]))
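A hedged invocation sketch. The keys below are the ones run_load_model_predict reads directly, with hypothetical values; get_featureDict_info, DataParser, and load_data may expect further keys, and dfm_params is assumed to be assembled as in the training entry point.

params = {
    "hdfsHost": "http://10.72.59.89:50070",                              # hypothetical
    "sparkDirName_onlineData": "/crawler/online_data_20190725",          # hypothetical
    "localFileName_deepFM_result": "/data/local/deepFM_result_20190725.csv",  # hypothetical
}
run_load_model_predict(params, dfm_params)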
# train_score, train_loss = dfm.evaluate(df_trainSample_index, df_trainSample_value, y_trainSample_label)
# train_scores.append(round(train_score, 5))
# train_losses.append(round(train_loss, 5))
# val_score, val_loss = dfm.evaluate(df_valSample_index, df_valSample_value, y_valSample_label)
# val_scores.append(round(val_score, 5))
# val_losses.append(round(val_loss, 5))
# if counter % 50 == 0:
#     logger.info("====dfm params: {}====".format(dfm_params))
#     logger.info("====epoch: {}-{}, train_scores: {}====".format(epoch + 1, counter, train_scores))
#     logger.info("====epoch: {}-{}, val_scores: {}====".format(epoch + 1, counter, val_scores))
#     logger.info("====epoch: {}-{}, train_losses: {}====".format(epoch + 1, counter, train_losses))
#     logger.info("====epoch: {}-{}, val_losses: {}====".format(epoch + 1, counter, val_losses))

if __name__ == '__main__':
    params = config.params
    clientHdfs = client.InsecureClient(params['hdfsHost'], user="******")
    # clientHdfs = ''
    featureDict, feature_size, field_size = get_featureDict_info(
        params['featureDict_fileName'])
    dfm_params = {
        "feature_size": feature_size,
        "field_size": field_size,
        "num_category": 1,
        "embedding_size": config.embedding_size,
        "dropout_fm": config.dropout_fm,
        "deep_layers": config.deep_layers,
        "dropout_deep": config.dropout_deep,
        "deep_layer_activation": tf.nn.relu,
        "epoch": config.epoches,
        "batch_size": config.batch_size,
        "init_learning_rate": config.init_learning_rate,
logger = log()
base_params = {
    "epoches": config.epoches,
    "sparkDirName_trainData": config.sparkDirName_trainData,
    "sparkDirName_trainSampleData": config.sparkDirName_trainSampleData,
    "sparkDirName_valSampleData": config.sparkDirName_valSampleData,
    "featureDict_fileName": config.featureDict_fileName,
    "DNN_model_fileName": config.DNN_model_fileName,
    "columnNames": config.columnNames,
    "numericCols": config.numericCols,
    "categoryCols": config.categoryCols,
    "dropFeatures": config.dropFeatures,
    "label": config.label,
    "hdfsHost": config.hdfsHost
}
clientHdfs = client.InsecureClient(base_params['hdfsHost'], user="******")
# clientHdfs = ''
df_trainSample = load_parquet_file(base_params, clientHdfs,
                                   trainSampleData=True)
df_valSample = load_parquet_file(base_params, clientHdfs,
                                 trainSampleData=False)
featureDict, cate_feature_size, cate_field_size = get_featureDict_info(
    base_params)
dcn_params = {
    "cate_feature_size": cate_feature_size,
    "cate_field_size": cate_field_size,
    "numeric_feature_size": len(config.numericCols),
    "embedding_size": config.embedding_size,
    "deep_layers": config.deep_layers,
# HDFS operations
import os
from hdfs import client

# url: ip:port of the WebHDFS endpoint
cli = client.InsecureClient("http://yh001:50070", user="******", root="/")


def upload(remote_dir="/pings/wordcount/", local_dir="D:/data/1/"):
    """Upload local files to HDFS."""
    cli.delete(remote_dir, True)
    cli.makedirs(remote_dir)
    for file in os.listdir(local_dir):
        cli.upload(remote_dir, local_dir + file)


if __name__ == "__main__":
    upload()
    print(cli.list("/"))
df = pipelineModel.transform(df)
for colName in stringIndexerCols:
    df = df.withColumn('stringIndexer_%s' % colName,
                       col('stringIndexer_%s' % colName).cast('double'))
df_shuffled = df.orderBy(rand())
df_train_DNN = df.filter(df.departDate < '2019-07-25').orderBy(rand())
df_val_DNN = df.filter(df.departDate >= '2019-07-25')
df_train_DNN_sample = df_train_DNN.sample(False, 0.0002)
df_val_DNN_sample = df_val_DNN.sample(False, 0.003)

todayStr1 = datetime.strftime(datetime.now(), '%Y%m%d')
todayStr2 = datetime.strftime(datetime.now(), '%Y-%m-%d')
featureData_fileName = 'domestic_shuffled_featureData_' + todayStr1 + '.csv'
DNN_trainData_fileName = 'domestic_DNN_trainData_' + todayStr1 + '.csv'
DNN_valData_fileName = 'domestic_DNN_valData_' + todayStr1 + '.csv'
DNN_trainSampleData_fileName = 'domestic_trainSampleData_' + todayStr1 + '.csv'
DNN_valSampleData_fileName = 'domestic_valSampleData_' + todayStr1 + '.csv'

clientHdfs = client.InsecureClient('http://10.0.4.217:9870', user="******")  # connect to HDFS
fileNames = [featureData_fileName, DNN_trainData_fileName, DNN_valData_fileName,
             DNN_trainSampleData_fileName, DNN_valSampleData_fileName]
df_subs = [df_shuffled, df_train_DNN, df_val_DNN, df_train_DNN_sample,
           df_val_DNN_sample]
for fileName, df_sub in zip(fileNames, df_subs):
    if fileName in clientHdfs.list('/crawler/'):
        clientHdfs.delete("/crawler/" + fileName, recursive=True)
    df_sub.write.format('csv').save("hdfs://10.0.4.217:8020/crawler/" + fileName)

dropFeatures = ['orgPrice', 'org', 'query_calday_ln', 'fn', 'query_festival',
                'dst', 'depart_calday_ln', 'futureMinPrice', 'fc', 'queryDate',
                'price', 'depart_festival', 'departDate', 'trend']
assembleFeatures = list(set(df.columns) - set(dropFeatures))
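For completeness, a hedged sketch of reading one of the CSV directories written above back into Spark. The path mirrors the save() call; "spark" is assumed to be an existing SparkSession, and inferSchema is an assumption since the write above emits plain, headerless CSV part files.

# Read the training split back; the path mirrors the save() call above.
df_train_back = (spark.read
                 .format('csv')
                 .option('inferSchema', 'true')  # assumption: no explicit schema available
                 .load("hdfs://10.0.4.217:8020/crawler/" + DNN_trainData_fileName))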