Example #1
    def __init__(self, config, expiration=30000):
        def get_token(username, password, expiration):
            ###
            # input_type: str, str, int
            # input: the username of PAI, the password of PAI and the expiration time of the token
            # output_type: str
            # output: token
            # Get the token from rest server API
            ###
            token_ready = False
            loop_count = 0
            while not token_ready:
                time.sleep(loop_count)
                loop_count += 1
                http_object = self.http.request(
                    'POST',
                    self.rest_server_url + 'token',
                    headers={
                        'Content-Type': 'application/x-www-form-urlencoded',
                    },
                    body='username=' + username + '&password=' + password +
                         '&expiration=' + str(expiration))
                if http_object.status == 200:
                    token_ready = True
                    return json.loads(
                        http_object.data.decode('utf-8'))['token']
                else:
                    print(http_object.status, http_object.data)

        self.rest_server_url = config.rest_server_url  # rest server url
        self.http = urllib3.PoolManager()  # urllib3 http
        self.token = get_token(config.PAI_username, config.PAI_password,
                               expiration)  # rest Server token
        self.hdfs_client = Client(config.webhdfs_url)  # hdfs web url
Example #2
def get_data():
    client = Client("http://t3.dev:50070", "hadoop")
    # client = InsecureClient(url="http://t3.dev:50070", user="******", root="/")
    print(client.list("/huiqu/common/area.txt"))
    with client.read("/huiqu/common/area.txt/part-00000") as read:
        # print(read.read().decode('utf8'))
        return {"data": read.read().decode('utf8')}
Example #3
def download(hdfs_path, local_path):
    # initialize the HDFS connection
    client = Client('http://fisher.lazybone.xyz:9001', root='/')
    if os.path.exists(local_path):
        print('local file already exists')
        return
    client.download(hdfs_path=hdfs_path, local_path=local_path)
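The check in Example #3 only guards the local path; if the remote path might be missing, probing it with status(..., strict=False) before downloading is a common pattern in the hdfs library. A minimal sketch under that assumption, reusing the example's URL:

from hdfs import Client
import os

def download_if_present(hdfs_path, local_path):
    client = Client('http://fisher.lazybone.xyz:9001', root='/')
    if os.path.exists(local_path):
        print('local file already exists')
        return
    # with strict=False, status() returns None instead of raising when the path is absent
    if client.status(hdfs_path, strict=False) is None:
        print('remote path does not exist:', hdfs_path)
        return
    client.download(hdfs_path=hdfs_path, local_path=local_path)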
Example #4
def generate_job_and_outputtable(schedules_list):
    job_list = []
    outtable_list = []
    client = Client("http://emr2-header-1.ipa.aidigger.com:50070", timeout=30)
    for work_id, schedule_id, cron_type in schedules_list:
        schedule_url = "https://pony.aidigger.com/api/v1/schedules/{}".format(
            schedule_id)
        job_infos = requests.get(schedule_url).json()["data"]
        time.sleep(1)
        owner = job_infos["owner"]
        print("schedule_url: " + schedule_url)
        for job_info in job_infos["execute_DAG"]:
            try:
                if job_info.get("job_info") \
                    and job_info["job_info"]["configs"].get("command","") \
                    and job_info["job_info"]["configs"]["command"].startswith("data_pipeline") : #or job_info["job_info"]["configs"]["command"].startswith("data_connector")
                    config = job_info["job_info"]["configs"]
                    if "args" in config.keys():
                        config['args']["isstreaming"] = str(
                            config['args']["isStreaming"] if "isStreaming" in
                            config['args'].keys(
                            ) else config['args']["isstreaming"])
                        config['args'].get("spark_conf", {}).get(
                            "dependency", {}).pop("data_pipeline", None)
                        config['args'].pop("KafkaCheckpoint", None)
                    job_list.append(
                        (work_id, config["job_id"], job_info["name"],
                         job_info["job_info"]["configs"]["command"], "1G",
                         "0.3", owner, cron_type))
                    for output in config["output"]:
                        outtable_list.append(deepcopy(output))
                        dayu_fullnames = output["dayu_fullname"].split(":")
                        if not dayu_fullnames:
                            raise Exception("error!!")
                        if dayu_fullnames[0].lower() == "hive":
                            dayu_fullnames[1] = "dayu_temp"
                            output["dayu_full_name"] = ":".join(
                                dayu_fullnames) + "_k8spre"
                        elif dayu_fullnames[0].lower().startswith("oss"):
                            output["dayu_full_name"] = output[
                                "dayu_fullname"][:-1] + "_k8spre/"
                        elif dayu_fullnames[0].lower().startswith("kafka"):
                            output["dayu_full_name"] = output[
                                "dayu_fullname"] + "_k8spre"
                            output["dayu_full_name"] = output[
                                "dayu_full_name"].replace(".", "_")
                        else:
                            output["dayu_full_name"] = output[
                                "dayu_fullname"] + "_k8spre"
                        output.pop("dayu_id")
                    content = json.dumps(config).encode(encoding='utf-8')
                    client.write("/tmp/ting.wu/k8s_press/{}.json".format(
                        config["job_id"]),
                                 overwrite=True,
                                 data=content)
                    print("  hdfs: /tmp/ting.wu/k8s_press/{}.json".format(
                        config["job_id"]))
            except Exception as err:
                print(err)
    return job_list, outtable_list
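Example #4 expects schedules_list to be an iterable of (work_id, schedule_id, cron_type) tuples. A hypothetical call (the IDs and cron types below are made up):

# hypothetical input: (work_id, schedule_id, cron_type) triples
schedules = [
    (101, 2001, "daily"),
    (102, 2002, "hourly"),
]
jobs, out_tables = generate_job_and_outputtable(schedules)
print(len(jobs), "jobs,", len(out_tables), "output tables")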
Example #5
    def upload(name, file_path, config):
        env_prefix = config.get("prefix", None)
        hdfs_client = Client(url=config["hdfs"]["name_node"])
        hdfs_hosts = []
        hdfs_http_host = config["hdfs"]["name_node"]
        hdfs_hosts.append(hdfs_http_host.replace("http://", ""))
        hdfs_data_service_root = "/data_service"
        if env_prefix is not None:
            hdfs_data_service_root = "/{0}_data_service".format(env_prefix)

        hdfs_client.makedirs(hdfs_data_service_root)
        timestamp = int(round(time.time() * 1000))
        target_file_name = "{2}/{0}/{1}/{0}_{1}.py".format(
            name, str(timestamp), hdfs_data_service_root)
        hdfs_client.makedirs("{2}/{0}/{1}".format(name, str(timestamp),
                                                  hdfs_data_service_root))
        print("hdfs file name: {0}".format(target_file_name))
        hdfs_client.upload(target_file_name, file_path)
        zip_path = os.path.join(
            os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
            "joowing.zip")
        target_zp_file_name = "{2}/{0}/{1}/joowing.zip".format(
            name, str(timestamp), hdfs_data_service_root)
        # hdfs_client.upload(target_zp_file_name, zip_path)
        # return target_file_name, target_zp_file_name
        return target_file_name
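Example #5 reads the name node address from config["hdfs"]["name_node"] and an optional "prefix". A hedged usage sketch with a made-up config (the URL and prefix are placeholders, and upload is assumed to be callable as a static method):

# hypothetical config for Example #5
config = {
    "prefix": "dev",                                  # optional environment prefix
    "hdfs": {"name_node": "http://namenode:50070"},   # placeholder name node URL
}
target = upload("my_job", "/tmp/my_job.py", config)
print(target)  # something like /dev_data_service/my_job/<timestamp>/my_job_<timestamp>.py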
Example #6
    def reflect(self, reload_flag = False):
        if not self.ds_dict:
            si_app.logger.error('error: ds_dict must be provided.')
            return

        if reload_flag:
            si_app.delete_doc_from_index_by_datasource(ds_name=self.ds_dict['ds_name'])


        tclient = Client(self.ds_dict['ds_param']['hdfs_web_url'])    # self.ds_dict['ds_url']
        path_hdfs = self.ds_dict['ds_param']['root_path']             # self.ds_dict['table_group_name']

        if path_hdfs[-1] == '/':
            # drop the trailing '/' so more path components can be concatenated
            path_hdfs = path_hdfs[:-1]

        filelist = self.getHDFSFileInfo(tclient, path_hdfs)


        for fd in filelist:
            si_app.add_table_content_index(ds_name=self.ds_dict['ds_name'],
                                           table_id=fd['table_name'],
                                           table_info=json.dumps(fd),
                                           table_content_index=' '.join(fd[k] for k in fd))
Example #7
 def __init__(self, spark, config, generator):
     super(DataContext, self).__init__()
     self.spark = spark
     self.config = config
     self.generator = generator
     self.env_prefix = config.get("prefix", None)
     self.hdfs_client = Client(url=";".join(config["hdfs"]["name_node"]),
                               proxy="joowing")
Example #8
def main():
    client = Client("http://127.0.0.1:50070",
                    root="/",
                    timeout=100,
                    session=False)
    #client.makedirs("/news")
    client.upload("/input", "x.html")
    print(client.list("/"))
Example #9
    def from_settings(cls, settings):
        hdfs_master = settings['HDFS_MASTER']
        hdfs_address = settings['HDFS_ADDRESS']
        try:
            client = Client('http://' + str(hdfs_master) + ':' +
                            str(hdfs_address))
        except Exception as e:
            print(e)
            client = None

        return cls(client)
Example #10
File: views.py Project: BigGoby/DataHoop
 def get(self, request):
     client = Client(HDFS_HOST)
     hdfs = request.GET.get('hdfs')
     file_name = DataSource.objects.get(format_filename=hdfs).file_name
     client.download('/datahoop/' + hdfs, settings.MEDIA_ROOT + 'hdfs_download')
     path = os.path.join(settings.MEDIA_ROOT, 'hdfs_download')
     file = open(os.path.join(path, hdfs), 'rb')
     response = FileResponse(file, content_type='application/vnd.ms-csv')
     response['Content-Disposition'] = 'attachment; filename={}.csv'.format(file_name.split('.')[0])
     return response
Example #11
 def __init__(self, **kwargs):
     self.table_cols_map = {}  # column order per table: {table: (cols, col_default)}
     self.bizdate = bizdate  # business date: the date the spider was started
     self.buckets_map = {}  # buckets: {table: items}
     self.bucketsize = kwargs.get('BUCKETSIZE')
     self.client = Client(kwargs.get('HDFS_URLS'))
     self.dir = kwargs.get('HDFS_FOLDER')  # HDFS folder path
     self.delimiter = kwargs.get('HDFS_DELIMITER')  # column delimiter; defaults to Hive's default delimiter
     self.encoding = kwargs.get('HDFS_ENCODING')  # file encoding, default 'utf-8'
     self.hive_host = kwargs.get('HIVE_HOST')
     self.hive_port = kwargs.get('HIVE_PORT')
     self.hive_dbname = kwargs.get('HIVE_DBNAME')  # database name
     self.hive_auto_create = kwargs.get('HIVE_AUTO_CREATE',
                                        False)  # whether to auto-create Hive tables, default False
     self.client.makedirs(self.dir)
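Example #11 takes all of its settings as keyword arguments (Scrapy-style names). A sketch of the expected keys with placeholder values; the pipeline class name is not shown in the example, so SomeHdfsPipeline below is hypothetical:

pipeline = SomeHdfsPipeline(
    BUCKETSIZE=1000,
    HDFS_URLS="http://namenode:50070",   # placeholder
    HDFS_FOLDER="/user/spider/output",   # placeholder
    HDFS_DELIMITER="\x01",               # Hive's default field delimiter
    HDFS_ENCODING="utf-8",
    HIVE_HOST="hive-server",
    HIVE_PORT=10000,
    HIVE_DBNAME="default",
    HIVE_AUTO_CREATE=False,
)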
Example #12
File: views.py Project: BigGoby/DataHoop
    def get(self, request, *args, **kwargs):
        table_name = request.data.get('name')
        username = request.session['username']
        password = request.session['password']
        host = request.session['host']
        port = request.session['port']
        database_name = request.session['dbdatabase_name']
        obj = DataSource.objects
        conn = pymssql.connect(database=database_name, user=username, password=password, host=host, port=port)
        client = Client(HDFS_HOST)
        cur = conn.cursor()
        for table in table_name:
            global rels
            cur.execute("select name from syscolumns where id = object_id('%s');" % (table))
            rels = []
            rel = []
            rows = cur.fetchall()
            for row in rows:
                for item in row:
                    rel.append(item)
            rels.append(rel)
            # execute() runs the query, much like query functions in other languages
            cur.execute("SELECT * FROM  %s" % (table))
            # fetchall() returns the result set as a tuple of row tuples
            rows1 = cur.fetchall()
            # each element of the result set is one table record, represented as a tuple
            for row in rows1:
                rels.append(list(row))
            file_name = table + '.sql'
            format_name = uuid.uuid1()
            filepath = settings.MEDIA_ROOT + str(format_name)
            with open(filepath, 'wb+') as writer:
                for chunk in rels:
                    # each chunk is a list of values; serialize it as one text line
                    writer.write((str(chunk) + '\n').encode('utf-8'))

            client.upload("/datahoop", filepath)
            obj.create(file_name=file_name, format_name=format_name, user_id=1)
            os.remove(filepath)
        client.close()
        cur.close()
        conn.close()

        return HttpResponse(json.dumps(rels), content_type='application/json')
Example #13
File: views.py Project: BigGoby/DataHoop
    def delete(self, request, *args, **kwargs):

        file_id = request.data.get('file_id')
        where = DataSource.objects.get(id=file_id).where
        if where == 'hdfs':
            file = DataSource.objects.get(id=file_id)
            hdfs_name = DataSource.objects.get(id=file_id).format_filename
            client = Client(HDFS_HOST)
            client.delete('/datahoop/' + hdfs_name, recursive=True)
            file.delete()
        else:
            client = pymongo.MongoClient(settings.MONGO_DB_HOST, settings.MONGO_DB_PORT)
            db = client.datahoop.data
            data_obj = DataSource.objects.filter(id=file_id).first()
            obj_id = data_obj.obj_id
            data_obj.delete()
            db.remove({"_id": ObjectId(obj_id)})
            client.close()
        return HttpResponse(content_type='application/json')
Example #14
File: openpai.py Project: honkliu/March
    def __init__(self, config: dict = None, file: str = 'openpai.json'):
        """config should contain
            - rest_server_socket
            - hdfs_web_socket
            - user
            - password
        """
        if config is None:
            with open(file) as fn:
                config = json.load(fn)
        for key in [
                'rest_server_socket', 'hdfs_web_socket', 'user', 'password'
        ]:
            assert key in config, '%s is not defined for OpenPAI' % (key)
        for key in ['rest_server_socket', 'hdfs_web_socket']:
            assert config[key].startswith(
                'http://'), '%s should have http prefix' % (key)

        self.rest_server_socket = config['rest_server_socket']
        self.hdfs_client = Client(config['hdfs_web_socket'])
        self.config = config
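The docstring in Example #14 lists the required config keys, and the assertions require the two socket values to start with http://. A config dict that would pass them might look like this (hosts and credentials are placeholders, and the enclosing class name OpenPAI is assumed since only __init__ is shown):

# hypothetical config for Example #14
config = {
    "rest_server_socket": "http://pai-master:9186",
    "hdfs_web_socket": "http://pai-master:50070",
    "user": "demo",
    "password": "demo-password",
}
client = OpenPAI(config)  # class name assumed; only __init__ appears above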
Example #15
File: views.py Project: BigGoby/DataHoop
    def get(self, request, *args, **kwargs):
        # table_name = request.data.get('name')
        table_name = 'files_datasource'
        username = request.session['username']
        password = request.session['password']
        host = request.session['host']
        port = request.session['port']
        database_name = request.session['database_name']
        obj = DataSource.objects
        con = pymysql.connect(host=host, user=username, password=password, database=database_name)
        client = Client(HDFS_HOST)
        cur = con.cursor()
        # for i in table_name:
        sql = "select DISTINCT (COLUMN_NAME) from information_schema.COLUMNS where table_name = '%s'"
        cur.execute(sql % (table_name))
        rows = cur.fetchall()
        rels = []
        rel = []
        for i in rows:
            rel.append(i[0])
        rels.append(rel)  # execute() runs the query, much like query functions in other languages
        cur.execute("SELECT * FROM  %s" % (table_name))  # fetchall() returns the result set as a tuple of row tuples
        rows = cur.fetchall()  # each element of the result set is one table record, represented as a tuple
        for row in rows:
            rels.append(list(row))
        file_name = table_name + '.sql'
        format_name = uuid.uuid1()
        filepath = settings.MEDIA_ROOT + str(format_name)
        with open(filepath, 'wb+') as writer:
            for chunk in rels:
                # each chunk is a list of values; serialize it as one text line
                writer.write((str(chunk) + '\n').encode('utf-8'))

        client.upload("/datahoop", filepath)
        obj.create(file_name=file_name, format_name=format_name, user_id=1)
        os.remove(filepath)
        client.close()
        con.close()
        cur.close()

        return HttpResponse(json.dumps(rels), content_type='application/json')
Example #16
File: views.py Project: BigGoby/DataHoop
 def get(self, request):  # delete mydata
     file_id = request.GET.get('file_id')
     try:
         where = DataSource.objects.get(id=file_id).where
         print(DataSource.objects.get(id=file_id))
         print(where)
         format_filename = DataSource.objects.get(
             id=file_id).format_filename
         format_name_count = DataSource.objects.filter(
             format_filename=format_filename).count()
         if where == 'hdfs' and format_name_count == 1:
             file = DataSource.objects.get(id=file_id)
             hdfs_name = DataSource.objects.get(id=file_id).format_filename
             client = Client(HDFS_HOST)
             client.delete('/datahoop/' + hdfs_name, recursive=True)
             file.delete()
             item = Collect.objects.filter(file_id=file_id)
             if item:
                 item.delete()
         elif where == 'hdfs' and format_name_count > 1:
             file = DataSource.objects.get(id=file_id)
             file.delete()
             item = Collect.objects.filter(file_id=file_id)
             if item:
                 item.delete()
         else:
             client = pymongo.MongoClient(settings.MONGO_DB_URI)
             db = client.datahoop.data
             data_obj = DataSource.objects.filter(id=file_id).first()
             obj_id = data_obj.obj_id
             data_obj.delete()
             db.remove({"_id": ObjectId(obj_id)})
             client.close()
             item = Collect.objects.filter(file_id=file_id)
             if item:
                 item.delete()
         return JsonResponse({'status': True})
     except Exception:
         return JsonResponse({'status': False})
Example #17
File: job_manage.py Project: zmoon111/pai
    def __init__(self, config, expiration=30000):
        def get_token(username, password, expiration):
            ###
            # input_type: str, str, int
            # input: the username of PAI, the password of PAI and the expiration time of the token
            # output_type: str
            # output: token
            # Get the token from rest server API
            ###
            rest_server_url_without_namespace = '/'.join(
                self.rest_server_url.split('/')[:-3]) + '/'
            token_ready = False
            loop_count = 0
            while not token_ready:
                time.sleep(loop_count)
                loop_count += 1
                http_object = self.http.request(
                    'POST',
                    rest_server_url_without_namespace + 'token',
                    headers={
                        'Content-Type': 'application/json',
                    },
                    body=json.dumps({
                        'username': username,
                        'password': password,
                        'expiration': str(expiration)
                    }))
                if http_object.status == 200:
                    token_ready = True
                    return json.loads(
                        http_object.data.decode('utf-8'))['token']
                else:
                    print(http_object.status, http_object.data)

        self.rest_server_url = config.rest_server_url  # rest server url
        self.http = urllib3.PoolManager()  # urllib3 http
        self.token = get_token(config.PAI_username, config.PAI_password,
                               expiration)  # rest Server token
        self.hdfs_client = Client(config.webhdfs_url)  # hdfs web url
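Examples #1 and #17 both expect a config object exposing rest_server_url, PAI_username, PAI_password and webhdfs_url as attributes. A minimal stand-in for experimentation (all values are placeholders):

from types import SimpleNamespace

# hypothetical config object for Examples #1 and #17
config = SimpleNamespace(
    rest_server_url="http://pai-master:9186/api/v1/namespace/",  # placeholder
    PAI_username="demo",
    PAI_password="demo-password",
    webhdfs_url="http://pai-master:50070",                       # placeholder
)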
Example #18
    def conn(self):
        client = Client('http://192.168.0.107:11070')

        return client
Example #19
 def build_connection(self):
     self.client = Client(self.hadoop_url)
Example #20
        rowkey = md5(str(user_id) + str(visitTime))

        print(rowkey)

        mutations = [Mutation(column=self.columnFamily + ":ipaddr", value=ipaddr), \
                     Mutation(column=self.columnFamily + ":visitTime", value=visitTime), \
                     Mutation(column=self.columnFamily + ":user_id", value=user_id), \
                     Mutation(column=self.columnFamily + ":link", value=link)
                     ]
        self.client.mutateRow(self.tablename, rowkey, mutations)


if __name__ == "__main__":

    # set up the HBase connection
    hbase_writer = CreateTableAndImportData('user_log_info', 'cf_log')
    hbase_writer.createTable()

    # connect to HDFS
    client = Client(HDFSNN, timeout=200000)

    # get the list of log files
    logFiles = client.list(LOGPATH)

    # read each file line by line (decode to text so split(" ") yields str fields)
    for logfile in logFiles:
        with client.read(LOGPATH + logfile, encoding='utf-8') as fp:
            for line in fp:
                record = line.split(" ")
                hbase_writer.importData(record)
Example #21
 def __init__(self):
     self.client = Client("http://fantome:50070")
Example #22
 def get_client(self):
     client = Client(self.url)
     return client
Example #23
 def build_hdfs_client(self) -> Client:
     return Client(url=";".join(self.config["hdfs"]["name_node"]), proxy='joowing')
Example #24
 def __init__(self, config):
     super(NormalContext, self).__init__()
     self.config = config
     self.hdfs_client = Client(url=";".join(config["hdfs"]["name_node"]), proxy='joowing')
     self.env_prefix = config.get("prefix", None)
Example #25
from hdfs import Client

client = Client("http://master:9870")
#client.makedirs("/abc/xyz")
x = client.list("/")
y = client.list("/", status=True)
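In Example #25, list() with status=True returns (name, status) pairs rather than bare names, at least in recent HdfsCLI versions. A short sketch of iterating over that result:

# y is assumed to be a list of (name, status_dict) tuples because status=True was passed
for name, status in y:
    # the status dict typically carries WebHDFS FileStatus fields such as 'type' and 'length'
    print(name, status.get('type'), status.get('length'))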
Example #26
def train(train_path,
          test_path,
          output_path,
          target,
          train_split_ratio=0.33,
          penalty='l2',
          dual=False,
          tol=1e-4,
          C=1.0,
          random_state=None,
          multi_class='ovr'):
    # record the start time
    time_trains_start = time.strftime('%Y/%m/%d %H:%M:%S')
    start_time = time.time()

    # input file paths on HDFS
    train_FILENAME = train_path + "/data/Data.csv"  # HDFS file path
    test_FILENAME = test_path + "/data/Data.csv"  # HDFS file path
    client = Client(HDFS_HOSTS1)
    # read the training data from HDFS
    with client.read(train_FILENAME) as tr_fs:
        tr_content = tr_fs.read()
        tr_s = str(tr_content, 'utf-8')
    # write it to a local file and make sure it is flushed to disk
    tr_file = open("trainData.csv", "w")
    tr_file.write(tr_s)
    tr_file.flush()
    os.fsync(tr_file.fileno())
    tr_file.close()
    df_train = pd.read_csv("trainData.csv", header=0)
    print(df_train)

    # read the test data from HDFS
    with client.read(test_FILENAME) as te_fs:
        te_content = te_fs.read()
        te_s = str(te_content, 'utf-8')
    # write it to a local file and make sure it is flushed to disk
    te_file = open("testData.csv", "w")
    te_file.write(te_s)
    te_file.flush()
    os.fsync(te_file.fileno())
    te_file.close()
    df_test = pd.read_csv("testData.csv", header=0)
    print(df_test)

    min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
    test_data_num = df_test.shape[0]
    train_data_num = df_train.shape[0]

    # preprocess the prediction set
    df_test = min_max_scaler.fit_transform(df_test)
    df_test = np.array(df_test)

    # data processing and cleaning
    cols = [tmp_i for tmp_i in df_train.columns if tmp_i not in [target]]
    X = df_train[cols]

    X = np.array(X)
    X = min_max_scaler.fit_transform(X)
    Y = df_train[target]
    Y = np.array(Y)

    # split the training data
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=train_split_ratio)

    # train a logistic regression model with scikit-learn
    # (pass hyperparameters as keywords so they land on the intended arguments)
    clf = LogisticRegression(penalty=penalty,
                             dual=dual,
                             tol=tol,
                             C=C,
                             random_state=random_state,
                             multi_class=multi_class,
                             solver='liblinear')
    clf.fit(X_train, Y_train)

    # accuracy
    train_acc = clf.score(X_test, Y_test)
    print('score Scikit learn: ', train_acc)
    # precision
    train_precision_score = precision_score(Y_test, clf.predict(X_test))
    # recall
    train_recall_score = recall_score(Y_test, clf.predict(X_test))
    # F1_Score
    train_f1_score = f1_score(Y_test, clf.predict(X_test))
    # roc_auc_score
    train_roc_auc_score1 = roc_auc_score(Y_test, clf.predict(X_test))

    # predict on the test set with the trained model
    result = clf.predict(df_test)
    # print(result)

    # record the end time and compute the total training time
    train_end = time.time()
    train_seconds = train_end - start_time
    m, s = divmod(train_seconds, 60)
    h, m = divmod(m, 60)
    time_trains_all = "%02d:%02d:%02d" % (h, m, s)

    # ---------------------------------- save the training results ----------------------------------#
    ## save the model summary report
    # abstract_path = HDFS_HOSTS1 + output_path + '/abstract/data/'
    abstract_path = output_path + '/abstract/data/'
    f = open('abstract.csv', mode='w', newline='')
    fileheader = [
        'FrameWork', 'Version', 'model', 'accuracy', 'time_trains_start',
        'time_trains_all', 'test_data_num', 'train_data_num'
    ]
    w = csv.DictWriter(f, fileheader)
    w.writeheader()
    csv_dict = edict()
    csv_dict.FrameWork = 'Scikit-learn'
    csv_dict.Version = sklearn.__version__
    csv_dict.model = '%s' % LogisticRegression
    csv_dict.accuracy = str(train_acc)
    csv_dict.time_trains_start = time_trains_start
    csv_dict.time_trains_all = time_trains_all
    csv_dict.test_data_num = str(test_data_num)
    csv_dict.train_data_num = str(train_data_num)
    w.writerow(csv_dict)
    f.close()
    client.delete(abstract_path + 'abstract.csv')
    client.upload(abstract_path + 'abstract.csv', 'abstract.csv')
    # if len(client.list(abstract_path)):
    # 	client.delete(abstract_path + 'abstract.csv')
    # 	client.upload(abstract_path + 'abstract.csv', 'abstract.csv')
    # else:
    # 	client.upload(abstract_path + 'abstract.csv', 'abstract.csv')

    ## save the model version info csv
    version_path = output_path + '/msg/data/'
    f = open('msg.csv', mode='w', newline='')
    fileheader = [
        'accuracy', 'time_trains_start', 'time_trains_all', 'test_data_num',
        'train_data_num'
    ]
    w = csv.DictWriter(f, fileheader)
    w.writeheader()
    csv_dict = edict()
    csv_dict.accuracy = str(train_acc)
    csv_dict.time_trains_start = time_trains_start
    csv_dict.time_trains_all = time_trains_all
    csv_dict.test_data_num = str(test_data_num)
    csv_dict.train_data_num = str(train_data_num)
    w.writerow(csv_dict)
    f.close()
    client.delete(version_path + 'msg.csv')
    client.upload(version_path + 'msg.csv', 'msg.csv')

    ## save the training evaluation metrics report
    file_csv_path = output_path + '/evaluation/data/'
    f = open('evaluation.csv', mode='w', newline='')
    fileheader = [
        'accuracy', 'train_precision_score', 'train_recall_score',
        'train_f1_score', 'train_roc_auc_score1'
    ]
    w = csv.DictWriter(f, fileheader)
    w.writeheader()
    csv_dict = edict()
    csv_dict.accuracy = str(train_acc)
    csv_dict.train_precision_score = train_precision_score
    csv_dict.train_recall_score = train_recall_score
    csv_dict.train_f1_score = train_f1_score
    csv_dict.train_roc_auc_score1 = train_roc_auc_score1
    w.writerow(csv_dict)
    f.close()
    client.delete(file_csv_path + 'evaluation.csv')
    client.upload(file_csv_path + 'evaluation.csv', 'evaluation.csv')

    # save the prediction results for the test set
    file_csv_path = output_path + '/result/data/'

    # the dict keys become the csv column names
    dataframe = pd.DataFrame({target: result})
    # write the DataFrame to csv; index controls whether row labels are written (default True)
    dataframe.to_csv("result.csv", index=False, sep=',')

    client.delete(file_csv_path + 'result.csv')
    client.upload(file_csv_path + 'result.csv', 'result.csv')
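Example #26 deletes each report before re-uploading it. If the installed HdfsCLI forwards extra keyword arguments from upload() to write() (as recent versions do), the delete() + upload() pair can be collapsed into one call; this is an assumption about the library version, so keep the original pattern if unsure:

# sketch: one call instead of delete() + upload(), assuming overwrite is forwarded to write()
client.upload(file_csv_path + 'result.csv', 'result.csv', overwrite=True)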
Example #27
            print(rowkey)
            mutations = [Mutation(column=self.columnFamily + ":ipaddr", value=ipaddr), \
                         Mutation(column=self.columnFamily + ":visitTime", value=visitTime), \
                         Mutation(column=self.columnFamily + ":user_id", value=user_id), \
                         Mutation(column=self.columnFamily + ":link", value=link)
                         ]
            # submit multiple rows in one batch
            mutations_batch.append(
                BatchMutation(row=rowkey, mutations=mutations))
            if len(mutations_batch) % batch_size == 0:
                self.client.mutateRows(self.tablename, mutations_batch)
                mutations_batch = []


if __name__ == "__main__":

    # set up the HBase connection
    hbase_writer = CreateTableAndImportData('user_log_info', 'cf_log')
    hbase_writer.createTable()

    # connect to HDFS
    client = Client(HDFSNN)

    # get the list of log files
    logFiles = client.list(LOGPATH)

    # read each file and hand its handle to the importer
    for logfile in logFiles:
        with client.read(os.path.join(LOGPATH, logfile)) as deal_file_handle:
            hbase_writer.importData(deal_file_handle)
Example #28
import pandas as pd
import os
from hdfs import Client
# Current approach for reading HDFS files:
# 1. read the raw byte stream from HDFS
# 2. save the bytes locally as a .csv file
# 3. read the csv with pandas
# (a temp-file-free variant is sketched after this example)
HDFSHOST = "http://172.16.18.112:50070"
train_path = "/9a7e7ef5a78f4f8388deff28cc5c2115/dataSet/devdatasetdef19761"
test_path = "/9a7e7ef5a78f4f8388deff28cc5c2115/dataSet/devdatasetdef19762"
train_FILENAME = train_path + "/data/Data.csv"  # HDFS file path
test_FILENAME = test_path + "/data/Data.csv"  # HDFS file path
client = Client(HDFSHOST)
with client.read(train_FILENAME) as tr_s:
    tr_content = tr_s.read()
    tr_s = str(tr_content, 'utf-8')

# write it to a local file and make sure it is flushed to disk
tr_file = open("trainData.csv", "w")
tr_file.write(tr_s)
tr_file.flush()
os.fsync(tr_file.fileno())
tr_file.close()

# read the local csv file
df_train = pd.read_csv("trainData.csv", header=0)
print(df_train)

with client.read(test_FILENAME) as te_fs:
    te_content = te_fs.read()
    te_s = str(te_content, 'utf-8')
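The three-step approach described at the top of Example #28 (read bytes from HDFS, save a temporary .csv, read it with pandas) can usually be shortened by handing the byte stream to pandas directly. A minimal sketch reusing the client and paths defined above:

import io
import pandas as pd

# parse the HDFS file in memory without writing a temporary csv
with client.read(train_FILENAME) as reader:
    df_train = pd.read_csv(io.BytesIO(reader.read()), header=0)
print(df_train)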
Example #29
 def __init__(self, host, port=50070):
     self.url = "http://%s:%d" % (host, port)
     self.client = Client(url=self.url)
Example #30
 def __init__(self, url):
     self._client = Client(url)