Example #1
from hdfs import client


def get_hdfs_client(hdfs_host, user):
    """Build an HDFS client for the given WebHDFS host and user.

    :return: an InsecureClient connected to hdfs_host
    """
    cli = client.InsecureClient(url=hdfs_host, user=user)
    return cli
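
A minimal usage sketch of the helper above (host, user, and path are hypothetical, not from the snippet); it relies on the same hdfs package and its write/read methods:

cli = get_hdfs_client("http://namenode:50070", user="hadoop")  # hypothetical host and user
# write a small text file to HDFS, then read it back
cli.write("/tmp/example.txt", data="hello hdfs", overwrite=True, encoding="utf-8")
with cli.read("/tmp/example.txt", encoding="utf-8") as reader:
    print(reader.read())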
Example #2
def __get_hdfs_client(self):
    hdfs_host = "http://10.72.59.89:50070"
    user = "******"
    if self.is_kerberos:
        cli = KerberosClient(url=hdfs_host)
    else:
        cli = client.InsecureClient(url=hdfs_host, user=user)
    return cli
Example #3
def get_hdfs_client():
    """
    :return: client of hdfs
    """
    hdfs_host = "http://10.72.59.89:50070"
    user = "******"

    cli = client.InsecureClient(url=hdfs_host, user=user)
    return cli
Example #4
def upload_to_hdfs(localFileName, sparkDirName, params):
    clientHdfs = client.InsecureClient(params["hdfsHost"], user="******")
    if sparkDirName.split('/')[-1] in clientHdfs.list(
            os.path.dirname(sparkDirName)):
        clientHdfs.delete(sparkDirName, recursive=True)
    clientHdfs.upload(sparkDirName, localFileName)
    logger.info("====\"{}\" upload to HDFS finished====".format(
        localFileName.split('/')[-1]))
    delete_before2_sparkData(sparkDirName, params)
Example #5
def delete_before2_sparkData(fileName, params):
    clientHdfs = client.InsecureClient(params["hdfsHost"], user="******")
    before2_dateStr1 = datetime.strftime(
        params["generateDate"] - timedelta(days=2), "%Y%m%d")
    pattern = re.compile(r'\d{8}')
    before2_fileName = re.sub(pattern, before2_dateStr1, fileName)
    # clientHdfs.list() returns base names, so compare against the basename
    if os.path.basename(before2_fileName) in clientHdfs.list(os.path.dirname(fileName)):
        clientHdfs.delete(before2_fileName, recursive=True)
        logger.info("====\"{}\" delete finished====".format(before2_fileName))
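
To make the rewrite above concrete, here is a small sketch (the file name is hypothetical): re.sub swaps the embedded 8-digit date for the date two days earlier.

import re
from datetime import datetime, timedelta

file_name = "/crawler/trainData_20190725.csv"  # hypothetical path
before2 = datetime.strftime(datetime(2019, 7, 25) - timedelta(days=2), "%Y%m%d")
print(re.sub(r"\d{8}", before2, file_name))  # /crawler/trainData_20190723.csv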
Example #6
def get_hdfs_client(is_kerberos=False):
    """Build an HDFS client, optionally authenticated via Kerberos.

    :return: client of hdfs
    """
    hdfs_host = "http://10.18.0.28:50070"
    user = "******"
    if is_kerberos:
        cli = KerberosClient(url=hdfs_host)
    else:
        cli = client.InsecureClient(url=hdfs_host, user=user)
    return cli
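
For the Kerberos branch, the client presumably comes from the hdfs package's Kerberos extension; a minimal sketch of the imports this kind of snippet relies on (the extension needs the extra dependency, e.g. pip install hdfs[kerberos]):

from hdfs import client                       # provides InsecureClient
from hdfs.ext.kerberos import KerberosClient  # requires the hdfs[kerberos] extra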
Example #7
def run_load_model_predict(params, dfm_params):
    clientHdfs = client.InsecureClient(params['hdfsHost'], user="******")
    fileNames = clientHdfs.list(params['sparkDirName_onlineData'])
    fileNames.remove('_SUCCESS')
    featureDict, _, _ = get_featureDict_info(params)
    dataParser = DataParser(params, featureDict)
    df_online = load_data(clientHdfs, params)
    df_online_index, df_online_value = dataParser.data_parser(df_online,
                                                              has_label=False)
    deep_model_predict = DeepFM_model_predict(params, dfm_params)
    deep_model_predict.write_result(params, df_online, df_online_index,
                                    df_online_value)
    deep_model_predict.sess.close()
    logger.info("====\"{}\" write finished====".format(
        params["localFileName_deepFM_result"].split("/")[-1]))
Example #8
#     train_score, train_loss = dfm.evaluate(df_trainSample_index, df_trainSample_value, y_trainSample_label)
#     train_scores.append(round(train_score, 5))
#     train_losses.append(round(train_loss, 5))
#     val_score, val_loss = dfm.evaluate(df_valSample_index, df_valSample_value, y_valSample_label)
#     val_scores.append(round(val_score, 5))
#     val_losses.append(round(val_loss, 5))
#     if counter % 50 == 0:
#         logger.info("====dfm params: {}====".format(dfm_params))
#         logger.info("====epoch: {}-{}, train_scores: {}====".format(epoch + 1, counter, train_scores))
#         logger.info("====epoch: {}-{}, val_scores: {}====".format(epoch + 1, counter, val_scores))
#         logger.info("====epoch: {}-{}, train_losses: {}====".format(epoch + 1, counter, train_losses))
#         logger.info("====epoch: {}-{}, val_losses: {}====".format(epoch + 1, counter, val_losses))

if __name__ == '__main__':
    params = config.params
    clientHdfs = client.InsecureClient(params['hdfsHost'], user="******")
    # clientHdfs = ''
    featureDict, feature_size, field_size = get_featureDict_info(
        params['featureDict_fileName'])
    dfm_params = {
        "feature_size": feature_size,
        "field_size": field_size,
        "num_category": 1,
        "embedding_size": config.embedding_size,
        "dropout_fm": config.dropout_fm,
        "deep_layers": config.deep_layers,
        "dropout_deep": config.dropout_deep,
        "deep_layer_activation": tf.nn.relu,
        "epoch": config.epoches,
        "batch_size": config.batch_size,
        "init_learning_rate": config.init_learning_rate,
Example #9
logger = log()
base_params = {
    "epoches": config.epoches,
    "sparkDirName_trainData": config.sparkDirName_trainData,
    "sparkDirName_trainSampleData": config.sparkDirName_trainData,
    "sparkDirName_valSampleData": config.sparkDirName_valSampleData,
    "featureDict_fileName": config.featureDict_fileName,
    "DNN_model_fileName": config.DNN_model_fileName,
    "columnNames": config.columnNames,
    "numericCols": config.numericCols,
    "categoryCols": config.categoryCols,
    "dropFeatures": config.dropFeatures,
    "label": config.label,
    "hdfsHost": config.hdfsHost
}
clientHdfs = client.InsecureClient(base_params['hdfsHost'], user="******")
# clientHdfs = ''
df_trainSample = load_parquet_file(base_params,
                                   clientHdfs,
                                   trainSampleData=True)
df_valSample = load_parquet_file(base_params,
                                 clientHdfs,
                                 trainSampleData=False)
featureDict, cate_feature_size, cate_field_size = get_featureDict_info(
    base_params)
dcn_params = {
    "cate_feature_size": cate_feature_size,
    "cate_field_size": cate_field_size,
    "numeric_feature_size": len(config.numericCols),
    "embedding_size": config.embedding_size,
    "deep_layers": config.deep_layers,
Example #10
# HDFS operations
import os
from hdfs import client

# url: "http://<host>:<port>"
client = client.InsecureClient("http://yh001:50070", user="******", root="/")


def upload(remote_dir="/pings/wordcount/", local_dir="D:/data/1/"):
    """Upload local files to HDFS."""
    client.delete(remote_dir, True)
    client.makedirs(remote_dir)
    for file in os.listdir(local_dir):
        client.upload(remote_dir, local_dir + file)


if __name__ == "__main__":
    upload()
    print(client.list("/"))
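
A download counterpart to the upload helper above could look like the following sketch (directory paths are hypothetical); it mirrors the same pattern with the library's download method and reuses os and the module-level client from the snippet:

def download(remote_dir="/pings/wordcount/", local_dir="D:/data/out/"):
    """Download every file under an HDFS directory to a local directory (sketch)."""
    os.makedirs(local_dir, exist_ok=True)
    for name in client.list(remote_dir):
        client.download(remote_dir + name, local_dir + name, overwrite=True)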
Example #11
df = pipelineModel.transform(df)
for colName in stringIndexerCols:
    df = df.withColumn('stringIndexer_%s' %colName, col('stringIndexer_%s' %colName).cast('double'))
df_shuffled = df.orderBy(rand())
df_train_DNN = df.filter(df.departDate < '2019-07-25').orderBy(rand())
df_val_DNN = df.filter(df.departDate >= '2019-07-25')
df_train_DNN_sample = df_train_DNN.sample(False, 0.0002)
df_val_DNN_sample = df_val_DNN.sample(False, 0.003)
todayStr1 = datetime.strftime(datetime.now(), '%Y%m%d')
todayStr2 = datetime.strftime(datetime.now(), '%Y-%m-%d')
featureData_fileName = 'domestic_shuffled_featureData_' + todayStr1 +'.csv'
DNN_trainData_fileName = 'domestic_DNN_trainData_' + todayStr1 + '.csv'
DNN_valData_fileName = 'domestic_DNN_valData_' + todayStr1 + '.csv'
DNN_trainSampleData_fileName = 'domestic_trainSampleData_' + todayStr1 + '.csv'
DNN_valSampleData_fileName = 'domestic_valSampleData_' + todayStr1 + '.csv'
clientHdfs = client.InsecureClient('http://10.0.4.217:9870', user="******")  # connect to HDFS
fileNames = [featureData_fileName, DNN_trainData_fileName, DNN_valData_fileName, DNN_trainSampleData_fileName, DNN_valSampleData_fileName]
df_subs = [df_shuffled, df_train_DNN, df_val_DNN, df_train_DNN_sample, df_val_DNN_sample]
for fileName, df_sub in zip(fileNames, df_subs):
    if fileName in clientHdfs.list('/crawler/'):
        clientHdfs.delete("/crawler/" + fileName, recursive=True)
    df_sub.write.format('csv').save("hdfs://10.0.4.217:8020/crawler/" + fileName)

dropFeatures = ['orgPrice', 'org', 'query_calday_ln', 'fn', 'query_festival', 'dst', 'depart_calday_ln', 'futureMinPrice', 'fc', 'queryDate', 'price', 'depart_festival', 'departDate', 'trend']
assembleFeatures = list(set(df.columns) - set(dropFeatures))
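
One common next step, shown here only as a hedged sketch (not taken from the snippet), is to feed assembleFeatures into a VectorAssembler to build the model input column:

from pyspark.ml.feature import VectorAssembler

# assemble the remaining columns into a single vector column named 'features'
assembler = VectorAssembler(inputCols=assembleFeatures, outputCol="features")
df_assembled = assembler.transform(df)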