Example #1
def structure(request, table_name):
    # Validate that table_name is not empty
    check = VALIDATE.not_null_validate(table_name, 'table_name')
    if check is not None:
        response = Response.fail(ERRORS.PARAMETER_VALUE_ERROR, check)
        return HttpResponse(response.to_json())

    result = py4j_common_hive_util('checkExist', table_name)
    if isinstance(result, HttpResponse):
        return result
    if not result:
        return HttpResponse(
            Response.fail(ERRORS.HIVE_TABLE_NOT_EXIST, None).to_json())

    result = py4j_common_hive_util('describeAndSample', table_name)
    result = list(result)
    result_trans = list()
    # Map database types to modeling types:
    # 1. Unsupported data types are marked ignore=True.
    # 2. For date types, record the smallest selectable granularity for
    #    the frontend in the date_format field.
    for field_desc in result:
        field = field_desc.getName()
        database_type_trans = field_desc.getType()
        ignore = True
        field_type = None
        date_format = None
        date_size = None
        if database_type_trans in DATABASE_MAPPING:
            ignore = False
            field_type = DATABASE_MAPPING[database_type_trans]

            sample_data = field_desc.getSampleData()
            if field_type == FIELDTYPE.FACTOR:
                if sample_data is not None:
                    sample_data = list(sample_data)
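                    # is_date presumably returns (is_date_flag, granularity)
                    # for the sampled values; the granularity becomes both
                    # date_format and date_size below.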
                    date_, size_ = is_date(sample_data)
                    if date_:
                        date_format = size_
                        date_size = size_
                        field_type = FIELDTYPE.DATE

            if database_type_trans == 'TIMESTAMP':
                date_format = 'second'
                date_size = 'second'
            elif database_type_trans == 'DATE':
                date_format = 'day'
                date_size = 'day'
        struct = StructureClass(field, field_type, database_type_trans,
                                date_format, date_size, ignore)
        result_trans.append(struct)
    # result_trans.sort(key=lambda x: x.field)
    response = Response.success(result_trans)
    return HttpResponse(response.to_json())
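
Every example delegates Hive access to py4j_common_hive_util and then checks
whether the return value is an HttpResponse, which implies the helper converts
Java-side failures into failure responses. A minimal sketch of that contract
(the gateway wiring and the HIVE_ERROR code are assumptions, not the project's
actual code):

# Hypothetical sketch: dispatch a method name to a py4j entry point and
# convert Java errors into an HttpResponse, matching how every caller
# tests `isinstance(result, HttpResponse)`. Response/ERRORS come from
# the project.
from django.http import HttpResponse
from py4j.java_gateway import JavaGateway
from py4j.protocol import Py4JJavaError

_gateway = JavaGateway()  # assumes a py4j gateway server is already running

def py4j_common_hive_util(func, *args):
    try:
        # e.g. func='checkExist' calls entry_point.checkExist(table_name)
        return getattr(_gateway.entry_point, func)(*args)
    except Py4JJavaError as e:
        # ERRORS.HIVE_ERROR is a placeholder; the real error code is unknown
        return HttpResponse(Response.fail(ERRORS.HIVE_ERROR, str(e)).to_json())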
Example #2
def view_table(request, project_id, component_id):
    robotx_task = Task.objects.filter(project_id=project_id,
                                      component_id=component_id)
    if len(robotx_task) == 0:
        return HttpResponse(Response.fail(ERRORS.ROBOTX_NOT_SUCCESS).to_json())
    robotx_task = robotx_task[0]
    assert isinstance(robotx_task, Task)
    if robotx_task.task_status != TASK_STATUS.SUCCEEDED:
        return HttpResponse(Response.fail(ERRORS.ROBOTX_NOT_SUCCESS).to_json())

    result_table = RobotXSpark.output_table(project_id, component_id)
    result = py4j_common_hive_util('viewTable', result_table, 10)
    if isinstance(result, HttpResponse):
        return result
    return HttpResponse(
        Response.success([
            dict(name=k, value=list(v)) for k, v in result.items()
        ]).to_json())
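
Response.success / Response.fail / to_json form the JSON envelope used by
every view. A plausible minimal implementation, assuming fields named code,
message, and data (the exact schema is an assumption):

# Hypothetical envelope; the field names (`code`, `message`, `data`) are
# assumptions, not the project's actual schema.
import json

class Response:
    def __init__(self, code, message=None, data=None):
        self.code = code
        self.message = message
        self.data = data

    @classmethod
    def success(cls, data=None):
        return cls(0, data=data)

    @classmethod
    def fail(cls, error_code, message=None):
        return cls(error_code, message=message)

    def to_json(self):
        # default=... lets plain objects such as StructureClass serialize
        return json.dumps(self.__dict__, ensure_ascii=False,
                          default=lambda o: o.__dict__)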
Example #3
def list_table(request, query_str: str = None):
    param = list()
    if query_str is None or query_str.strip() == '':
        # Empty parameter: call listTable to fetch all tables
        func = 'listTable'
    else:
        # Non-empty parameter: fuzzy-match table names
        func = 'queryTable'
        param.append(query_str.strip())

    result = py4j_common_hive_util(func, *param)
    if isinstance(result, HttpResponse):
        return result
    response = Response.success(list(result))
    return HttpResponse(response.to_json())
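
These functions take request plus path parameters, i.e. they are Django
views. A hypothetical urls.py wiring for list_table (the route paths are
assumptions; the real routes are not shown in the source):

# Hypothetical URL wiring for the two branches of list_table.
from django.urls import path
from . import views

urlpatterns = [
    path('hive/tables/', views.list_table),                  # all tables
    path('hive/tables/<str:query_str>/', views.list_table),  # fuzzy match
]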
Example #4
def preview(request, project_id, component_id):
    reader = HiveReader.objects.filter(project_id=project_id,
                                       component_id=component_id)
    if len(reader) == 0:
        return HttpResponse(
            Response.fail(ERRORS.HIVE_TABLE_NOT_EXIST).to_json())
    reader = reader[0]
    table_name = reader.table_name
    result = py4j_common_hive_util('viewTable', table_name, 10)
    if isinstance(result, HttpResponse):
        return result
    return HttpResponse(
        Response.success([
            dict(name=k, value=list(v)) for k, v in result.items()
        ]).to_json())
Example #5
def delete_table(request, filenames: List[FileNames]):
    for table in filenames:
        table = table.filename
        hdfs_path_dir = os.path.join(CLUSTER_DIRECTORY, "mydata", "%s" % table)
        # Drop the Hive table
        result = py4j_common_hive_util("dropTable", table)
        if isinstance(result, HttpResponse):
            return result
        # Delete the CSV file that was uploaded to HDFS
        delete_hdfs = py4j_common_hive_util("deleteHdfsFile", hdfs_path_dir)
        if isinstance(delete_hdfs, HttpResponse):
            return delete_hdfs
        # Delete the CSV files on the server
        mydata_directory = os.path.join(WORKING_DIRECTORY, "mydata")
        csv_saving_path = os.path.join(mydata_directory, "%s.csv" % table)
        new_csv_saving_path = os.path.join(mydata_directory, "%s_.csv" % table)
        if os.path.exists(csv_saving_path):
            os.remove(csv_saving_path)
        if os.path.exists(new_csv_saving_path):
            os.remove(new_csv_saving_path)
        # Delete the MySQL records
        MyData.objects.filter(file_name=table).delete()
        MyDataCsvInfo.objects.filter(file_name=table).delete()
    return HttpResponse(Response.success().to_json())
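
delete_table receives List[FileNames] and only reads the .filename attribute
of each entry; a minimal sketch of that parameter type (the real definition
is not shown in the source):

# Hypothetical parameter type: the only contract delete_table relies on
# is a `filename` attribute per entry.
from dataclasses import dataclass

@dataclass
class FileNames:
    filename: str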
Example #6
def check_hive_mysql(mysql_table_sum):
    """查询hive仓库的表,如果没有在mysql中,则认为是手工直接在hive创建的,并保存表名到mysql"""

    search_from_hive_table_list = py4j_common_hive_util('listTable')
    if isinstance(search_from_hive_table_list, HttpResponse):
        return search_from_hive_table_list
    search_from_mysql_table_list = []
    for row_obj in mysql_table_sum:
        search_from_mysql_table_list.append(row_obj.file_name)
        if row_obj.file_name not in search_from_hive_table_list and len(
                search_from_hive_table_list) > 0:
            MyData.objects.filter(file_name=row_obj.file_name).delete()
    for hive_table in search_from_hive_table_list:
        if hive_table not in search_from_mysql_table_list:
            MyData(file_name=hive_table).save()
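
check_hive_mysql is presumably invoked before listing datasets. A
hypothetical call site following the same view pattern (list_my_data and its
behavior are assumptions, not the project's actual code):

# Hypothetical caller: reconcile MySQL with the Hive warehouse before
# listing; a returned HttpResponse signals a py4j failure, mirroring the
# other views.
def list_my_data(request):
    error = check_hive_mysql(MyData.objects.all())
    if isinstance(error, HttpResponse):
        return error
    return HttpResponse(Response.success(
        [row.file_name for row in MyData.objects.all()]).to_json())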
Example #7
def view_table(request, project_id, component_id):
    feature_combine_task = Task.objects.filter(project_id=project_id,
                                               component_id=component_id)
    if len(feature_combine_task) == 0:
        return HttpResponse(
            Response.fail(ERRORS.FEATURE_COMBINE_NOT_SUCCESS).to_json())
    feature_combine_task = feature_combine_task[0]
    assert isinstance(feature_combine_task, Task)
    if feature_combine_task.task_status != TASK_STATUS.SUCCEEDED:
        return HttpResponse(
            Response.fail(ERRORS.FEATURE_COMBINE_NOT_SUCCESS).to_json())

    result_table = FeatureCombineComp.output_table(project_id, component_id)
    result = py4j_common_hive_util('viewTable', result_table, 10)
    if isinstance(result, HttpResponse):
        return result
    return HttpResponse(
        Response.success([
            dict(name=k, value=list(v)) for k, v in result.items()
        ]).to_json())
Example #8
def save_hive_reader(request, project_id, component_id, table_name, logic_name):
    # Validate the inputs:
    # - project_id must be numeric
    # - component_id must be the HiveReader prefix followed by digits
    # - table_name must exist in the warehouse
    # - logic_name must follow the naming rules
    component_id_validate = VALIDATE.component_id_validate(component_id, COMPONENTS.HIVE_READER)
    if component_id_validate is not None:
        return HttpResponse(component_id_validate.to_json())

    result = py4j_common_hive_util('checkExist', table_name)
    if isinstance(result, HttpResponse):
        return result
    if not result:
        return HttpResponse(Response.fail(ERRORS.HIVE_TABLE_NOT_EXIST, None).to_json())

    HiveReader.objects.filter(project_id=project_id, component_id=component_id).delete()
    HiveReader(project_id=project_id, component_id=component_id,
               table_name=table_name, logic_name=logic_name).save()
    return HttpResponse(Response.success(None).to_json())
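
The comments above describe component_id_validate as checking that
component_id is the component prefix followed by digits. A hedged sketch of
that check (the regex and the free-function form are assumptions; the return
convention mirrors how the caller tests for None):

# Hypothetical sketch: return None when valid, otherwise a failure
# Response, matching `if component_id_validate is not None` above.
import re

def component_id_validate(component_id, component_prefix):
    if re.fullmatch(r'%s\d+' % re.escape(component_prefix), component_id):
        return None
    return Response.fail(ERRORS.PARAMETER_VALUE_ERROR,
                         'illegal component_id: %s' % component_id)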
Example #9
def common_del(project_id, component_id, table):
    Task.objects.filter(project_id=project_id, component_id=component_id).delete()
    YarnResource.objects.filter(project_id=project_id, component_id=component_id).delete()
    # delete hdfs working directory
    cluster_working_dir = Component.cluster_working_directory(project_id, component_id)
    py4j_common_hive_util('cleanComponent', cluster_working_dir, table)
Example #10
def csv_into_hive(request, filename, username, field_types: List[FieldType]):
    # Read the CSV header row to obtain the field names
    csv_field_list = None
    field_num = None
    hive_filename = filename.strip()
    csv_saving_path = os.path.join(WORKING_DIRECTORY, "mydata",
                                   "%s.csv" % hive_filename)
    with open(csv_saving_path, 'r', encoding='utf-8') as f:
        load_hive_csv_reader = csv.reader(f)
        for row_num, row in enumerate(load_hive_csv_reader):
            csv_field_list = row
            field_num = len(row)
            break
    # Record the dataset's metadata at upload time
    file_size = get_file_size(csv_saving_path)
    creat_time = get_file_create_time(csv_saving_path)
    MyData(file_name=hive_filename,
           field_num=field_num,
           file_size=file_size,
           creat_time=creat_time,
           creat_user=username).save()

    # Save the field types chosen manually by the user
    db_field_types = []
    for field in field_types:
        db_field_types.append(field.to_db_type(hive_filename))
    # Replace any previously saved types
    MyDataType.objects.filter(file_name=hive_filename).delete()
    MyDataType.objects.bulk_create(db_field_types)

    field_list = []
    for field in csv_field_list:
        objcs = MyDataType.objects.filter(file_name=hive_filename, field=field)
        for objcs_field_type in objcs:
            field_type = objcs_field_type.field_type
            sample_field_type = objcs_field_type.sample_data
            if field_type == "numeric":
                field_type = "bigint"
            elif field_type == "factor":
                field_type = "string"
            elif field_type == "date":
                field_type = "date"
            else:
                field_type = "string"
            # If any sample value contains a decimal point, treat the
            # column as double
            for x in sample_field_type:
                if "." in x:
                    field_type = "Double"
            field_and_type = "`%s` %s" % (field, field_type)
            field_list.append(field_and_type)
    hdfs_path = os.path.join(CLUSTER_DIRECTORY, "mydata", "%s",
                             "%s.csv") % (hive_filename, hive_filename)
    hdfs_path_dir = os.path.join(CLUSTER_DIRECTORY, "mydata",
                                 "%s" % hive_filename)
    # print(hdfs_path)
    # print(hdfs_path_dir)
    new_csv_saving_path = os.path.join(WORKING_DIRECTORY, "mydata",
                                       "%s_.csv" % hive_filename)
    try:
        delete_csv_first_row(csv_saving_path, new_csv_saving_path)
        load_into_hive(hive_filename, new_csv_saving_path, hdfs_path,
                       hdfs_path_dir, field_list)
    except Exception as e:
        MyData.objects.filter(file_name=hive_filename).delete()
        # Drop the Hive table
        result = py4j_common_hive_util("dropTable", hive_filename)
        if isinstance(result, HttpResponse):
            return result
        # Delete the CSV file that was uploaded to HDFS
        delete_hdfs = py4j_common_hive_util("deleteHdfsFile", hdfs_path_dir)
        if isinstance(delete_hdfs, HttpResponse):
            return delete_hdfs
        # Delete the CSV files on the server
        mydata_directory = os.path.join(WORKING_DIRECTORY, "mydata")
        csv_saving_path = os.path.join(mydata_directory,
                                       "%s.csv" % hive_filename)
        new_csv_saving_path = os.path.join(mydata_directory,
                                           "%s_.csv" % hive_filename)
        if os.path.exists(csv_saving_path):
            os.remove(csv_saving_path)
        if os.path.exists(new_csv_saving_path):
            os.remove(new_csv_saving_path)
        #raise e
        response = Response.fail(ERRORS.CSV_INTO_ERROR, None)
        return HttpResponse(response.to_json())
    return HttpResponse(Response.success().to_json())
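
delete_csv_first_row strips the CSV header before the file is loaded, since
the Hive table declares its own column list via field_list. A minimal sketch
under that assumption (the real helper is not shown in the source):

# Minimal sketch, assuming the helper only drops the header row so the
# data file matches the columns passed to load_into_hive.
def delete_csv_first_row(src_path, dst_path):
    with open(src_path, 'r', encoding='utf-8') as src, \
            open(dst_path, 'w', encoding='utf-8') as dst:
        next(src, None)  # skip the header line
        for line in src:
            dst.write(line)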