Example #1
# Assumed imports (PySpark 2.x); getProjectCurrentDataUrl and addProcessingFlow
# are project-local helpers assumed to be in scope.
import json
from pyspark.sql import SparkSession, utils
from pyspark.ml.feature import VectorAssembler, PCA


def pcaCore(requestStr):
    # Parse the request: JSON string -> dict, then extract the parameters
    requestDict = json.loads(requestStr)
    projectName = requestDict['projectName']
    columnNames = requestDict['columnNames']
    # Name of the new column; defaults to "dimensionality-reduction result"
    # unless the user specifies one
    try:
        newColumnName = requestDict['newColumnName']
    except KeyError:
        newColumnName = "dimensionality-reduction result"

    # Spark session
    spark = SparkSession \
        .builder \
        .master("local") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    # Resolve the project path and read the CSV
    urls = getProjectCurrentDataUrl(projectName)
    if urls == 'error':
        return 'error_projectUrl'  # error: invalid project name or project path
    fileUrl = urls['fileUrl']
    df = spark.read.csv(fileUrl, header=True, inferSchema=True)

    # Target dimensionality k defaults to 3 unless the user specifies one
    try:
        k = int(requestDict['k'])
    except (KeyError, ValueError, TypeError):
        k = 3

    # k must be smaller than the original dimensionality
    if k >= len(columnNames):
        return "error_targetDimensions"

    # Assemble the input columns into a vector; they must all be numeric
    vecAssembler = VectorAssembler(inputCols=columnNames, outputCol="features")
    try:
        df = vecAssembler.transform(df)
    except utils.IllegalArgumentException:
        return "error_numerical"

    # Configure the PCA model
    pca = PCA(k=k, inputCol="features", outputCol=newColumnName)

    # Fit and transform
    df = pca.fit(df).transform(df)

    df = df.drop("features")
    df.show()

    # Append a processing-flow record
    operateParameter = {}
    operateParameter['type'] = '12'
    operateParameter['operate'] = requestStr
    addProcessingFlow(projectName, "admin", operateParameter)

    return df
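
A minimal invocation sketch (hypothetical values; it assumes getProjectCurrentDataUrl can resolve the project to a real CSV, and that columnNames arrives as a JSON list, which is what VectorAssembler expects):

import json

requestStr = json.dumps({
    "projectName": "订单分析",               # hypothetical project
    "columnNames": ["单价", "数量", "折扣"],  # numeric feature columns
    "k": 2                                    # must be < len(columnNames)
}, ensure_ascii=False)

result = pcaCore(requestStr)
if isinstance(result, str):   # error codes come back as plain strings
    print("PCA failed:", result)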
Example #2
# Assumed imports (PySpark 2.x); getProjectCurrentDataUrl and addProcessingFlow
# are project-local helpers assumed to be in scope.
import json
from pyspark.sql import SparkSession, utils
from pyspark.ml.feature import VectorAssembler, VectorIndexer


def vectorIndexerCore(requestStr):
    # Parse the request: JSON string -> dict, then extract the parameters
    requestDict = json.loads(requestStr)
    projectName = requestDict['projectName']
    columnName = requestDict['columnName']
    # Exactly one input column is allowed
    if len(columnName.split(",")) != 1:
        return "error_columnInputNumSingle"
    # Name of the new column; defaults to columnName + " (vector index
    # transform)" unless the user specifies one
    try:
        newColumnName = requestDict['newColumnName']
    except KeyError:
        newColumnName = columnName + " (vector index transform)"

    # Spark session
    spark = SparkSession \
        .builder \
        .master("local") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    # Resolve the project path and read the CSV
    urls = getProjectCurrentDataUrl(projectName)
    if urls == 'error':
        return 'error_projectUrl'  # error: invalid project name or project path
    fileUrl = urls['fileUrl']
    df = spark.read.csv(fileUrl, header=True, inferSchema=True)

    # maxCategories (the categorical threshold) defaults to 20 unless the
    # user specifies one
    try:
        maxCategories = int(requestDict['maxCategories'])
    except (KeyError, ValueError, TypeError):
        maxCategories = 20

    # Assemble the input column into a vector; it must be numeric
    vecAssembler = VectorAssembler(inputCols=[columnName], outputCol="features")
    try:
        df = vecAssembler.transform(df)
    except utils.IllegalArgumentException:
        return "error_numerical"

    # Define the indexer (vector index transform model)
    indexer = VectorIndexer(maxCategories=maxCategories, inputCol="features", outputCol=newColumnName)

    # Fit and transform
    df = indexer.fit(df).transform(df)

    df = df.drop("features")
    df.show()

    # Append a processing-flow record
    operateParameter = {}
    operateParameter['type'] = '10'
    operateParameter['operate'] = requestStr
    addProcessingFlow(projectName, "admin", operateParameter)

    return df
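
To make the maxCategories rule concrete, here is a self-contained toy run (values made up): a feature with at most maxCategories distinct values is treated as categorical and re-encoded as 0-based category indices; otherwise it is left unchanged.

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.master("local").getOrCreate()
df = spark.createDataFrame(
    [(Vectors.dense([10.0]),), (Vectors.dense([20.0]),), (Vectors.dense([10.0]),)],
    ["features"])
# Two distinct values <= maxCategories=2, so the feature is declared
# categorical and its values are remapped to category indices.
indexer = VectorIndexer(maxCategories=2, inputCol="features", outputCol="indexed")
indexer.fit(df).transform(df).show()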
Example #3
# Assumed imports (PySpark 2.x); getProjectCurrentDataUrl and addProcessingFlow
# are project-local helpers assumed to be in scope.
import json
from pyspark.sql import SparkSession, utils
from pyspark.ml.feature import VectorAssembler, PolynomialExpansion


def polynomialExpansionCore(requestStr):
    # Parse the request: JSON string -> dict, then extract the parameters
    requestDict = json.loads(requestStr)
    projectName = requestDict['projectName']
    columnNamesStr = requestDict['columnNames']
    columnNames = columnNamesStr.split(",")
    # Name of the new column; defaults to "polynomial expansion (" +
    # columnNamesStr + ")" unless the user specifies one
    try:
        newColumnName = requestDict['newColumnName']
    except KeyError:
        newColumnName = "polynomial expansion (" + columnNamesStr + ")"

    # Spark session
    spark = SparkSession \
        .builder \
        .master("local") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    # Resolve the project path and read the CSV
    urls = getProjectCurrentDataUrl(projectName)
    if urls == 'error':
        return 'error_projectUrl'  # error: invalid project name or project path
    fileUrl = urls['fileUrl']
    df = spark.read.csv(fileUrl, header=True, inferSchema=True)

    # Assemble the input columns into a vector; they must all be numeric
    vecAssembler = VectorAssembler(inputCols=columnNames, outputCol="features")
    try:
        df = vecAssembler.transform(df)
    except utils.IllegalArgumentException:
        return "error_numerical"

    # Configure the polynomial-expansion transformer (default degree is 2)
    px = PolynomialExpansion(inputCol="features", outputCol=newColumnName)

    # Transform (PolynomialExpansion is a pure transformer; no fit step)
    df = px.transform(df)

    df = df.drop("features")
    df.show()

    # Append a processing-flow record
    operateParameter = {}
    operateParameter['type'] = '9'
    operateParameter['operate'] = requestStr
    addProcessingFlow(projectName, "admin", operateParameter)

    return df
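
A self-contained illustration of what the default degree-2 expansion produces (toy values): for an input vector (x, y), Spark emits (x, x², y, xy, y²).

from pyspark.sql import SparkSession
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.master("local").getOrCreate()
df = spark.createDataFrame([(Vectors.dense([2.0, 3.0]),)], ["features"])
px = PolynomialExpansion(degree=2, inputCol="features", outputCol="expanded")
# (2, 3) -> [2, 4, 3, 6, 9], i.e. x, x^2, y, x*y, y^2
px.transform(df).show(truncate=False)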
Example #4
# Assumed imports; OneHotEncoderEstimator is the Spark 2.3/2.4 name (renamed
# OneHotEncoder in Spark 3.x). getProjectCurrentDataUrl and addProcessingFlow
# are project-local helpers assumed to be in scope.
import json
from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoderEstimator


def oneHotEncoderCore(requestStr):
    # Parse the request: JSON string -> dict, then extract the parameters
    requestDict = json.loads(requestStr)
    projectName = requestDict['projectName']
    columnName = requestDict['columnName']
    # Exactly one input column is allowed
    if len(columnName.split(",")) != 1:
        return "error_columnInputNumSingle"
    # Name of the new column; defaults to columnName + " (one-hot encoded)"
    # unless the user specifies one
    try:
        newColumnName = requestDict['newColumnName']
    except KeyError:
        newColumnName = columnName + " (one-hot encoded)"

    # Spark session
    spark = SparkSession \
        .builder \
        .master("local") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    # Resolve the project path and read the CSV
    urls = getProjectCurrentDataUrl(projectName)
    if urls == 'error':
        return 'error_projectUrl'  # error: invalid project name or project path
    fileUrl = urls['fileUrl']
    df = spark.read.csv(fileUrl, header=True, inferSchema=True)

    # Configure the one-hot encoder
    ohe = OneHotEncoderEstimator(inputCols=[columnName], outputCols=[newColumnName])

    # Fit and transform; the input must hold non-negative integer category
    # indices, otherwise fitting/transforming fails
    try:
        df = ohe.fit(df).transform(df)
    except Exception:
        return "error_intOnly"

    df.show()

    # Append a processing-flow record
    operateParameter = {}
    operateParameter['type'] = '7'
    operateParameter['operate'] = requestStr
    addProcessingFlow(projectName, "admin", operateParameter)

    return df
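
A self-contained toy run (made-up data) showing the encoder's output; note that by default the last category is dropped (dropLast=True), so the vectors have one slot fewer than the number of categories:

from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoderEstimator

spark = SparkSession.builder.master("local").getOrCreate()
df = spark.createDataFrame([(0,), (1,), (2,), (1,)], ["cat"])
ohe = OneHotEncoderEstimator(inputCols=["cat"], outputCols=["cat_vec"])
# Emits sparse vectors such as (2,[0],[1.0]) for category 0.
ohe.fit(df).transform(df).show()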
Example #5
# Assumed imports; request/jsonify come from Flask, while save_dir,
# quantileDiscretizationCore, getProjectCurrentDataUrl and addProcessingFlow
# are module-level names assumed to be in scope.
import json
from flask import request, jsonify
from pyspark.sql import SparkSession


def quantileDiscretization():
    # Accepts request parameters such as:
    # {"projectName":"订单分析","columnName":"装运成本","newColumnName":"装运成本(分位数离散化)","numBuckets":10}
    # numBuckets (the number of bins) is optional and defaults to 5
    if request.method == 'GET':
        requestStr = request.args.get("requestStr")
    else:
        requestStr = request.form.get("requestStr")

    # Parse the request: JSON string -> dict, then extract the parameters
    requestDict = json.loads(requestStr)
    projectName = requestDict['projectName']

    # Spark session
    spark = SparkSession \
        .builder \
        .master("local") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    # Resolve the project path and read the CSV
    urls = getProjectCurrentDataUrl(projectName)
    if urls == 'error':
        return 'error_projectUrl'  # error: invalid project name or project path
    fileUrl = urls['fileUrl']
    df = spark.read.csv(fileUrl, header=True, inferSchema=True)

    # Run the core routine; it returns a Spark DataFrame on success or an
    # error-code string on failure
    df = quantileDiscretizationCore(requestStr, df)
    if df == "error_projectUrl":
        return "error: invalid project name or project path"
    elif df == "error_columnInputNumSingle":
        return "error: quantile discretization accepts exactly one column"
    elif df == "error_numerical":
        return "error: only numeric columns can be discretized; check the column name"

    df.show()
    # Persist the processed data (stored and returned via pandas)
    df_pandas = df.toPandas()
    df_pandas.to_csv(save_dir, header=True)
    # Append a processing-flow record
    operateParameter = {}
    operateParameter['type'] = '8'
    operateParameter['operate'] = requestStr
    addProcessingFlow(projectName, "admin", operateParameter)

    return jsonify({'length': df.count(), 'data': df_pandas.to_json(force_ascii=False)})
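
A hypothetical client call; the route decorator is not shown in this snippet, so the URL path below is an assumption:

import json
import requests

payload = {"requestStr": json.dumps(
    {"projectName": "订单分析", "columnName": "装运成本", "numBuckets": 10},
    ensure_ascii=False)}
# "/quantileDiscretization" is a guess at the route bound to this view.
resp = requests.get("http://localhost:5000/quantileDiscretization", params=payload)
print(resp.text)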
Example #6
# Assumed imports (PySpark 2.x); getProjectCurrentDataUrl and addProcessingFlow
# are project-local helpers assumed to be in scope.
import json
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer


def stringIndexerCore(requestStr):
    # Parse the request: JSON string -> dict, then extract the parameters
    requestDict = json.loads(requestStr)
    projectName = requestDict['projectName']
    columnName = requestDict['columnName']
    # Name of the new column; defaults to columnName + " (indexed by
    # descending frequency; 0 = most frequent)" unless the user specifies one
    try:
        newColumnName = requestDict['newColumnName']
    except KeyError:
        newColumnName = columnName + " (indexed by descending frequency; 0 = most frequent)"
    # Exactly one input column is allowed
    if len(columnName.split(",")) != 1:
        return "error_columnInputNumSingle"

    # Spark session
    spark = SparkSession \
        .builder \
        .master("local") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    # Resolve the project path and read the CSV
    urls = getProjectCurrentDataUrl(projectName)
    if urls == 'error':
        return 'error_projectUrl'  # error: invalid project name or project path
    fileUrl = urls['fileUrl']
    df = spark.read.csv(fileUrl, header=True, inferSchema=True)

    # Define the string-to-label indexer
    si = StringIndexer(inputCol=columnName, outputCol=newColumnName)

    # Fit and transform
    df = si.fit(df).transform(df)

    df.show()

    # Append a processing-flow record
    operateParameter = {}
    operateParameter['type'] = '16'
    operateParameter['operate'] = requestStr
    addProcessingFlow(projectName, "admin", operateParameter)

    return df
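
A self-contained toy run (made-up data) showing the frequency ordering: the most frequent value receives index 0.0.

from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer

spark = SparkSession.builder.master("local").getOrCreate()
df = spark.createDataFrame([("a",), ("b",), ("a",), ("c",)], ["city"])
si = StringIndexer(inputCol="city", outputCol="city_idx")
# "a" occurs twice so it gets 0.0; the rest follow by frequency (ties are
# resolved alphabetically in recent Spark versions).
si.fit(df).transform(df).show()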
Example #7
# Assumed imports (PySpark 2.x); getProjectCurrentDataUrl and addProcessingFlow
# are project-local helpers assumed to be in scope.
import json
from pyspark.sql import SparkSession, utils
from pyspark.ml.feature import VectorAssembler, ChiSqSelector


def chiSqSelectorCore(requestStr):
    # Parse the request: JSON string -> dict, then extract the parameters
    requestDict = json.loads(requestStr)
    projectName = requestDict['projectName']
    columnNamesStr = requestDict['columnNames']
    columnName_label = requestDict['columnName_label']
    # columnName_label must be a single column
    if len(columnName_label.split(",")) != 1:
        return "error_columnInputNumSingle"

    # Number of top features the chi-squared selection keeps;
    # numTopFeatures defaults to 1
    try:
        numTopFeatures = int(requestDict['numTopFeatures'])
    except (KeyError, ValueError, TypeError):
        numTopFeatures = 1

    columnNames = columnNamesStr.split(",")
    # More than one feature column is required
    if len(columnNames) < 2:
        return "error_columnInputNumMultiple"
    # Name of the new column; defaults to "chi-squared selection (top
    # numTopFeatures features w.r.t. columnName_label)" unless the user
    # specifies one
    try:
        newColumnName = requestDict['newColumnName']
    except KeyError:
        newColumnName = "chi-squared selection (top " + str(numTopFeatures) + " features w.r.t. [" + str(columnName_label) + "])"

    # Spark session
    spark = SparkSession \
        .builder \
        .master("local") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

    # Resolve the project path and read the CSV
    urls = getProjectCurrentDataUrl(projectName)
    if urls == 'error':
        return 'error_projectUrl'  # error: invalid project name or project path
    fileUrl = urls['fileUrl']
    df = spark.read.csv(fileUrl, header=True, inferSchema=True)

    # Assemble the input columns into a vector; they must all be numeric
    vecAssembler = VectorAssembler(inputCols=columnNames, outputCol="features")
    try:
        df = vecAssembler.transform(df)
    except utils.IllegalArgumentException:
        return "error_numerical"

    # Copy the label column into the default "label" slot
    df = df.withColumn("label", df[columnName_label])

    # Configure the chi-squared selector (reads "features"/"label" by default)
    selector = ChiSqSelector(numTopFeatures=numTopFeatures, outputCol=newColumnName)

    # Fit and transform; fails if the label column is not numeric
    try:
        df = selector.fit(df).transform(df)
    except utils.IllegalArgumentException:
        return "error_numerical"

    df = df.drop("features")
    df.show()

    # Append a processing-flow record
    operateParameter = {}
    operateParameter['type'] = '13'
    operateParameter['operate'] = requestStr
    addProcessingFlow(projectName, "admin", operateParameter)

    return df
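
A self-contained toy run (made-up data) of the same selector in isolation; ChiSqSelector treats feature and label values as categories when computing the chi-squared statistics:

from pyspark.sql import SparkSession
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.master("local").getOrCreate()
df = spark.createDataFrame(
    [(Vectors.dense([0.0, 1.0, 2.0]), 1.0),
     (Vectors.dense([0.0, 3.0, 4.0]), 0.0),
     (Vectors.dense([1.0, 1.0, 4.0]), 0.0)],
    ["features", "label"])
# Keep the single feature most associated with the label.
selector = ChiSqSelector(numTopFeatures=1, featuresCol="features",
                         labelCol="label", outputCol="selected")
selector.fit(df).transform(df).show()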