def structure(request, table_name):
    # Validate that table_name is not empty
    check = VALIDATE.not_null_validate(table_name, 'table_name')
    if check is not None:
        response = Response.fail(ERRORS.PARAMETER_VALUE_ERROR, check)
        return HttpResponse(response.to_json())
    result = py4j_common_hive_util('checkExist', table_name)
    if isinstance(result, HttpResponse):
        return result
    if not result:
        return HttpResponse(
            Response.fail(ERRORS.HIVE_TABLE_NOT_EXIST, None).to_json())
    result = py4j_common_hive_util('describeAndSample', table_name)
    result = list(result)
    result_trans = list()
    # Map database types to modeling types:
    # 1. Unsupported data types are flagged with ignore=True.
    # 2. For date types, the minimum selectable granularity for the
    #    front end is recorded in the date_format field.
    for field_desc in result:
        field = field_desc.getName()
        database_type_trans = field_desc.getType()
        ignore = True
        field_type = None
        date_format = None
        date_size = None
        if database_type_trans in DATABASE_MAPPING:
            ignore = False
            field_type = DATABASE_MAPPING[database_type_trans]
            sample_data = field_desc.getSampleData()
            if field_type == FIELDTYPE.FACTOR:
                if sample_data is not None:
                    sample_data = list(sample_data)
                    date_, size_ = is_date(sample_data)
                    if date_:
                        date_format = size_
                        date_size = size_
                        field_type = FIELDTYPE.DATE
            if database_type_trans == 'TIMESTAMP':
                date_format = 'second'
                date_size = 'second'
            elif database_type_trans == 'DATE':
                date_format = 'day'
                date_size = 'day'
        struct = StructureClass(field, field_type, database_type_trans,
                                date_format, date_size, ignore)
        result_trans.append(struct)
    # result_trans.sort(key=lambda x: x.field)
    response = Response.success(result_trans)
    return HttpResponse(response.to_json())
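# The is_date() helper used above is defined elsewhere in the project. A
# minimal sketch follows, assuming it receives a list of sample strings and
# returns an (is_date, granularity) pair, where granularity is the finest
# unit every sample parses to; the candidate formats below are an assumption.
from datetime import datetime

def is_date(sample_data):
    """Guess whether all samples are dates and, if so, their granularity."""
    if not sample_data:
        return False, None
    candidate_formats = [('%Y-%m-%d %H:%M:%S', 'second'), ('%Y-%m-%d', 'day')]
    for fmt, granularity in candidate_formats:
        try:
            for sample in sample_data:
                datetime.strptime(str(sample).strip(), fmt)
            return True, granularity
        except (ValueError, TypeError):
            continue
    return False, None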
def view_table(request, project_id, component_id):
    robotx_task = Task.objects.filter(project_id=project_id,
                                      component_id=component_id)
    if len(robotx_task) == 0:
        return HttpResponse(Response.fail(ERRORS.ROBOTX_NOT_SUCCESS).to_json())
    robotx_task = robotx_task[0]
    assert isinstance(robotx_task, Task)
    if robotx_task.task_status != TASK_STATUS.SUCCEEDED:
        return HttpResponse(Response.fail(ERRORS.ROBOTX_NOT_SUCCESS).to_json())
    result_table = RobotXSpark.output_table(project_id, component_id)
    result = py4j_common_hive_util('viewTable', result_table, 10)
    if isinstance(result, HttpResponse):
        return result
    return HttpResponse(
        Response.success([
            dict(name=k, value=list(v)) for k, v in result.items()
        ]).to_json())
def list_table(request, query_str: str = None):
    param = list()
    if query_str is None or query_str.strip() == '':
        # Empty parameter: call listTable to fetch all tables
        func = 'listTable'
    else:
        # Non-empty parameter: fuzzy-match table names with queryTable
        func = 'queryTable'
        param.append(query_str.strip())
    result = py4j_common_hive_util(func, *param)
    if isinstance(result, HttpResponse):
        return result
    response = Response.success(list(result))
    return HttpResponse(response.to_json())
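# py4j_common_hive_util() is imported from a shared module and is how every
# view in this file reaches the JVM-side Hive helper. A minimal sketch of the
# assumed contract, based on how it is called here: invoke a helper method by
# name over Py4J and, on any failure, return a ready-made HttpResponse so that
# callers can short-circuit with isinstance(result, HttpResponse). The gateway
# setup and the HIVE_CONNECT_ERROR code are assumptions, not the project's
# actual implementation.
def py4j_common_hive_util(func_name, *args):
    from py4j.java_gateway import JavaGateway
    try:
        gateway = JavaGateway()          # assumes a Py4J gateway server is already running
        hive_util = gateway.entry_point  # assumes the entry point exposes the Hive helpers
        return getattr(hive_util, func_name)(*args)
    except Exception:
        return HttpResponse(
            Response.fail(ERRORS.HIVE_CONNECT_ERROR, None).to_json())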
def preview(request, project_id, component_id):
    reader = HiveReader.objects.filter(project_id=project_id,
                                       component_id=component_id)
    if len(reader) == 0:
        return HttpResponse(
            Response.fail(ERRORS.HIVE_TABLE_NOT_EXIST).to_json())
    reader = reader[0]
    table_name = reader.table_name
    result = py4j_common_hive_util('viewTable', table_name, 10)
    if isinstance(result, HttpResponse):
        return result
    return HttpResponse(
        Response.success([
            dict(name=k, value=list(v)) for k, v in result.items()
        ]).to_json())
def delete_table(request, filenames: List[FileNames]):
    for table in filenames:
        table = table.filename
        hdfs_path_dir = os.path.join(CLUSTER_DIRECTORY, "mydata", "%s" % table)
        # Drop the Hive table
        result = py4j_common_hive_util("dropTable", table)
        if isinstance(result, HttpResponse):
            return result
        # Delete the local CSV file that was uploaded to HDFS
        delete_hdfs = py4j_common_hive_util("deleteHdfsFile", hdfs_path_dir)
        if isinstance(delete_hdfs, HttpResponse):
            return delete_hdfs
        # Delete the CSV files kept on the server
        mydata_directory = os.path.join(WORKING_DIRECTORY, "mydata")
        csv_saving_path = os.path.join(mydata_directory, "%s.csv" % table)
        new_csv_saving_path = os.path.join(mydata_directory, "%s_.csv" % table)
        if os.path.exists(csv_saving_path):
            os.remove(csv_saving_path)
        if os.path.exists(new_csv_saving_path):
            os.remove(new_csv_saving_path)
        # Delete the MySQL records
        MyData.objects.filter(file_name=table).delete()
        MyDataCsvInfo.objects.filter(file_name=table).delete()
    return HttpResponse(Response.success().to_json())
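# FileNames is the request-body wrapper consumed by delete_table() above; it
# is defined elsewhere in the project. A minimal sketch, assuming it only
# needs to carry the table/file name parsed from the request payload:
class FileNames:
    def __init__(self, filename: str):
        self.filename = filename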
def check_hive_mysql(mysql_table_sum):
    """Compare the tables in the Hive warehouse with the records in MySQL.
    A Hive table with no MySQL record is assumed to have been created manually
    in Hive, so its name is saved to MySQL; MySQL records with no matching
    Hive table are deleted."""
    search_from_hive_table_list = py4j_common_hive_util('listTable')
    if isinstance(search_from_hive_table_list, HttpResponse):
        return search_from_hive_table_list
    search_from_mysql_table_list = []
    for row_obj in mysql_table_sum:
        search_from_mysql_table_list.append(row_obj.file_name)
        if row_obj.file_name not in search_from_hive_table_list and len(
                search_from_hive_table_list) > 0:
            MyData.objects.filter(file_name=row_obj.file_name).delete()
    for hive_table in search_from_hive_table_list:
        if hive_table not in search_from_mysql_table_list:
            MyData(file_name=hive_table).save()
def view_table(request, project_id, component_id):
    feature_combine_task = Task.objects.filter(project_id=project_id,
                                               component_id=component_id)
    if len(feature_combine_task) == 0:
        return HttpResponse(
            Response.fail(ERRORS.FEATURE_COMBINE_NOT_SUCCESS).to_json())
    feature_combine_task = feature_combine_task[0]
    assert isinstance(feature_combine_task, Task)
    if feature_combine_task.task_status != TASK_STATUS.SUCCEEDED:
        return HttpResponse(
            Response.fail(ERRORS.FEATURE_COMBINE_NOT_SUCCESS).to_json())
    result_table = FeatureCombineComp.output_table(project_id, component_id)
    result = py4j_common_hive_util('viewTable', result_table, 10)
    if isinstance(result, HttpResponse):
        return result
    return HttpResponse(
        Response.success([
            dict(name=k, value=list(v)) for k, v in result.items()
        ]).to_json())
def save_hive_reader(request, project_id, component_id, table_name,
                     logic_name):
    # Validate the parameters:
    #   - project_id must be numeric
    #   - component_id must start with "HiveReader" followed by a number
    #   - table_name must exist in the database
    #   - logic_name must follow the naming rules
    component_id_validate = VALIDATE.component_id_validate(
        component_id, COMPONENTS.HIVE_READER)
    if component_id_validate is not None:
        return HttpResponse(component_id_validate.to_json())
    result = py4j_common_hive_util('checkExist', table_name)
    if isinstance(result, HttpResponse):
        return result
    if not result:
        return HttpResponse(
            Response.fail(ERRORS.HIVE_TABLE_NOT_EXIST, None).to_json())
    HiveReader.objects.filter(project_id=project_id,
                              component_id=component_id).delete()
    HiveReader(project_id=project_id, component_id=component_id,
               table_name=table_name, logic_name=logic_name).save()
    return HttpResponse(Response.success(None).to_json())
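# VALIDATE.component_id_validate() comes from a shared validation module. A
# minimal sketch of the check save_hive_reader() relies on, assuming the rule
# is "<component prefix><digits>" (e.g. HiveReader12) and that a Response.fail
# is returned on mismatch; the exact error payload is an assumption.
import re

def component_id_validate(component_id, component_prefix):
    if re.fullmatch(r'%s\d+' % re.escape(component_prefix), str(component_id)):
        return None  # valid: no error response
    return Response.fail(ERRORS.PARAMETER_VALUE_ERROR,
                         'component_id %s is invalid' % component_id)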
def common_del(project_id, component_id, table):
    Task.objects.filter(project_id=project_id,
                        component_id=component_id).delete()
    YarnResource.objects.filter(project_id=project_id,
                                component_id=component_id).delete()
    # Delete the HDFS working directory
    cluster_working_dir = Component.cluster_working_directory(
        project_id, component_id)
    py4j_common_hive_util('cleanComponent', cluster_working_dir, table)
def csv_into_hive(request, filename, username, field_types: List[FieldType]):
    # Read the CSV file to get the field names
    csv_field_list = None
    field_num = None
    hive_filename = filename.strip()
    csv_saving_path = os.path.join(WORKING_DIRECTORY, "mydata",
                                   "%s.csv" % hive_filename)
    with open(csv_saving_path, 'r', encoding='utf-8') as f:
        load_hive_csv_reader = csv.reader(f)
        for row_num, row in enumerate(load_hive_csv_reader):
            csv_field_list = row
            field_num = len(row)
            break
    # Save the file metadata recorded at upload time
    file_size = get_file_size(csv_saving_path)
    creat_time = get_file_create_time(csv_saving_path)
    MyData(file_name=hive_filename, field_num=field_num, file_size=file_size,
           creat_time=creat_time, creat_user=username).save()
    # Field types selected manually by the user
    db_field_types = []
    for field in field_types:
        db_field_types.append(field.to_db_type(hive_filename))
    # Save the types
    MyDataType.objects.filter(file_name=hive_filename).delete()
    MyDataType.objects.bulk_create(db_field_types)
    field_list = []
    for field in csv_field_list:
        objcs = MyDataType.objects.filter(file_name=hive_filename, field=field)
        for objcs_field_type in objcs:
            field_type = objcs_field_type.field_type
            sample_field_type = objcs_field_type.sample_data
            if field_type == "numeric":
                field_type = "bigint"
            elif field_type == "factor":
                field_type = "string"
            elif field_type == "date":
                field_type = "date"
            else:
                field_type = "string"
            # If any sample value contains a decimal point, treat the column as Double
            for x in sample_field_type:
                if "." in x:
                    field_type = "Double"
            field_and_type = "`%s` %s" % (field, field_type)
            field_list.append(field_and_type)
    hdfs_path = os.path.join(CLUSTER_DIRECTORY, "mydata", "%s",
                             "%s.csv") % (hive_filename, hive_filename)
    hdfs_path_dir = os.path.join(CLUSTER_DIRECTORY, "mydata",
                                 "%s" % hive_filename)
    # print(hdfs_path)
    # print(hdfs_path_dir)
    new_csv_saving_path = os.path.join(WORKING_DIRECTORY, "mydata",
                                       "%s_.csv" % hive_filename)
    try:
        delete_csv_first_row(csv_saving_path, new_csv_saving_path)
        load_into_hive(hive_filename, new_csv_saving_path, hdfs_path,
                       hdfs_path_dir, field_list)
    except Exception as e:
        MyData.objects.filter(file_name=hive_filename).delete()
        # Drop the Hive table
        result = py4j_common_hive_util("dropTable", hive_filename)
        if isinstance(result, HttpResponse):
            return result
        # Delete the local CSV file that was uploaded to HDFS
        delete_hdfs = py4j_common_hive_util("deleteHdfsFile", hdfs_path_dir)
        if isinstance(delete_hdfs, HttpResponse):
            return delete_hdfs
        # Delete the CSV files kept on the server
        mydata_directory = os.path.join(WORKING_DIRECTORY, "mydata")
        csv_saving_path = os.path.join(mydata_directory,
                                       "%s.csv" % hive_filename)
        new_csv_saving_path = os.path.join(mydata_directory,
                                           "%s_.csv" % hive_filename)
        if os.path.exists(csv_saving_path):
            os.remove(csv_saving_path)
        if os.path.exists(new_csv_saving_path):
            os.remove(new_csv_saving_path)
        # raise e
        response = Response.fail(ERRORS.CSV_INTO_ERROR, None)
        return HttpResponse(response.to_json())
    return HttpResponse(Response.success().to_json())
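# delete_csv_first_row() and load_into_hive() used above are helpers defined
# elsewhere (the latter creates the Hive table from field_list and loads the
# HDFS file). A minimal sketch of delete_csv_first_row, assuming it copies the
# CSV to a new path with the header row stripped so the data file can be
# loaded into Hive without the column names; it reuses the csv module already
# imported by this file.
def delete_csv_first_row(src_path, dst_path):
    with open(src_path, 'r', encoding='utf-8') as src, \
            open(dst_path, 'w', encoding='utf-8', newline='') as dst:
        reader = csv.reader(src)
        writer = csv.writer(dst)
        for row_num, row in enumerate(reader):
            if row_num == 0:
                continue  # skip the header row
            writer.writerow(row)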