def randomForest(request):
    """Fill missing values of the *target* column via random-forest regression.

    Loads the uploaded CSV, fits a RandomForestRegressor on rows where the
    target is known (features from the ``ref`` columns), predicts the missing
    values, and returns the completed CSV response.
    """
    # Load the uploaded CSV and discard the temporary file.
    f = request.FILES.get("csv_file")
    filename = csv_util.upload(f)
    data_train = pd.read_csv(filename)
    os.remove(filename)
    # target: name of the column whose missing values should be completed.
    target = request.POST['target']
    # ref: comma-separated feature column names, e.g. "SibSp,Pclass,Fare,Parch".
    ref_str = request.POST['ref']
    ref = ref_str.split(',')
    ref.insert(0, target)  # put the target column first so it is column 0 below
    target_df = data_train[ref]
    # Split rows into known / unknown target values.
    # NOTE: DataFrame.as_matrix() was removed in pandas 1.0 — use .values.
    known_data = target_df[target_df[target].notnull()].values
    unknown_data = target_df[target_df[target].isnull()].values
    # y is the target value, X the feature matrix (remaining columns).
    y = known_data[:, 0]
    X = known_data[:, 1:]
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)
    # Predict the missing target values from their features.
    predictedAges = rfr.predict(unknown_data[:, 1:])
    # Write the predictions back into the rows that were missing.
    data_train.loc[(data_train[target].isnull()), target] = predictedAges

    return response_util.csv_info(data_train)
# Exemplo n.º 2
# 0
def scale_(request):
    """Apply a scale transformation (log2/log10/ln/abs/sqrt) to CSV columns."""
    if "POST" == request.method:
        # Load the uploaded CSV and discard the temporary file.
        uploaded = request.FILES.get("csv_file")
        path = csv_util.upload(uploaded)
        data_train = pd.read_csv(path)
        os.remove(path)
        # Comma-separated names of the columns to transform.
        columns = request.POST['target'].split(',')
        # Requested transformation, one of: log2, log10, ln, abs, sqrt.
        scale = request.POST['scale']
        # Dispatch table instead of an if/elif chain.
        transforms = {
            'log2': np.log2,
            'log10': np.log10,
            'ln': np.log,
            'abs': np.abs,
            'sqrt': np.sqrt,
        }
        func = transforms.get(scale)
        for col in columns:
            # Access the column first (as the original did) so an unknown
            # column raises before the unknown-method error is reported.
            series = data_train[col]
            if func is None:
                return response_util.wrong_info(
                    '输入的方法不包含在log2、log10、ln、abs、sqrt里')
            data_train[col] = func(series)
        return response_util.csv_info(data_train)
def setId(request):
    """Prepend a sequential integer ``id`` column (0..n-1) to the CSV."""
    if "POST" == request.method:
        # Load the uploaded CSV and discard the temporary file.
        f = request.FILES.get("csv_file")
        filename = csv_util.upload(f)
        data_train = pd.read_csv(filename)
        os.remove(filename)
        # Build 0..n-1 directly; the original used a manual append loop and
        # shadowed the builtin name ``list``.
        ids = list(range(len(data_train)))
        data_train.insert(0, 'id', ids)
        return response_util.csv_info(data_train)
def standard(request):
    """Standardize (zero mean, unit variance) the requested columns."""
    # Load the uploaded CSV and discard the temporary file.
    uploaded = request.FILES.get("csv_file")
    path = csv_util.upload(uploaded)
    data_train = pd.read_csv(path)
    os.remove(path)
    # Comma-separated names of the columns to standardize.
    columns = request.POST['target'].split(',')
    scaler = preprocessing.StandardScaler()
    for col in columns:
        # StandardScaler expects a 2-D array, hence the reshape.
        scaled = scaler.fit_transform(data_train[col].values.reshape(-1, 1))
        # Drop then re-add the column; this moves it to the end of the
        # frame, matching the original behaviour.
        data_train.drop([col], axis=1, inplace=True)
        data_train[col] = scaled
    return response_util.csv_info(data_train)
def dummy(request):
    """One-hot encode (factorize) the requested columns with pd.get_dummies."""
    if request.method == "POST":
        # Load the uploaded CSV and discard the temporary file.
        uploaded = request.FILES.get("csv_file")
        path = csv_util.upload(uploaded)
        data_train = pd.read_csv(path)
        os.remove(path)
        # Comma-separated names of the columns to factorize, e.g.
        # "cabin,sex,pclass".
        columns = request.POST['target'].split(',')
        for col in columns:
            # Replace the original column with its one-hot indicator columns.
            encoded = pd.get_dummies(data_train[col], prefix=col)
            data_train = data_train.join(encoded)
            data_train.drop([col], axis=1, inplace=True)
        return response_util.csv_info(data_train)
# Exemplo n.º 6
# 0
def soften(request):
    """Clip (smooth) outliers in a column, by percentile or by threshold."""
    if "POST" == request.method:
        # Load the uploaded CSV and discard the temporary file.
        f = request.FILES.get("csv_file")
        filename = csv_util.upload(f)
        data_train = pd.read_csv(filename)
        os.remove(filename)
        # Column to smooth.
        target = request.POST['target']
        # 'per' clips at percentiles of the data range; 'thresh' clips at
        # the given absolute min/max values.
        soften_method = request.POST['soften_method']
        # Lower/upper bounds (percentiles for 'per', raw values for 'thresh').
        min_value = int(request.POST['min'])
        max_value = int(request.POST['max'])
        values = data_train[target].values.tolist()
        if soften_method == 'per':
            # Percentile smoothing: bounds must form valid percentages.
            if min_value < 0:
                return response_util.wrong_info('百分位平滑,min值应不小于0')
            elif max_value > 100:
                return response_util.wrong_info('百分位平滑,max值应不大于100')
            else:
                lo_frac = min_value / 100
                hi_frac = max_value / 100
                data_min = min(values)
                data_max = max(values)
                span = data_max - data_min
                # Clip boundaries interpolated across the observed range.
                lower = data_min + lo_frac * span
                upper = data_min + hi_frac * span
                data_train[target] = [
                    lower if v < lower else (upper if v > upper else v)
                    for v in values
                ]
        elif soften_method == 'thresh':
            # Threshold smoothing: clip directly at the provided values.
            data_train[target] = [
                min_value if v < min_value else (max_value if v > max_value else v)
                for v in values
            ]
        else:
            return response_util.wrong_info('输入的方法不包含在per/thresh里')
        return response_util.csv_info(data_train)
def normalize(request):
    """Min-max normalize the requested columns into the [0, 1] range."""
    if "POST" == request.method:
        # Load the uploaded CSV and discard the temporary file.
        f = request.FILES.get("csv_file")
        filename = csv_util.upload(f)
        data_train = pd.read_csv(filename)
        os.remove(filename)
        # Comma-separated names of the columns to normalize.  (The original
        # read request.POST['target'] twice; the first read was dead code.)
        target = request.POST['target'].split(',')
        mm = preprocessing.MinMaxScaler()
        for each in target:
            # MinMaxScaler expects a 2-D array, hence the reshape.
            mm_data = mm.fit_transform(data_train[each].values.reshape(-1, 1))
            # Drop then re-add to preserve the original behaviour of moving
            # the column to the end of the frame.
            data_train.drop([each], axis=1, inplace=True)
            data_train[each] = mm_data
        return response_util.csv_info(data_train)
# Exemplo n.º 8
# 0
def discrete(request):
    """Discretize a numeric column into *num* bins.

    Supported methods: equal-width ('metric'), equal-frequency
    ('frequency'), and KMeans-based clustering ('cluster').
    """
    if "POST" == request.method:
        # Load the uploaded CSV and discard the temporary file.
        f = request.FILES.get("csv_file")
        filename = csv_util.upload(f)
        data_train = pd.read_csv(filename)
        os.remove(filename)
        # Column to discretize.
        target = request.POST['target']
        # Discretization method: 'frequency', 'metric' or 'cluster'.
        discrete_method = request.POST['discrete_method']
        # Number of bins.
        num = int(request.POST['num'])
        target_df = data_train[target]
        if discrete_method == 'metric':
            # Equal-width bins over the column's range.
            data_train[target] = pd.cut(target_df, num, labels=range(num))
        elif discrete_method == 'frequency':
            # Equal-frequency bins: take the num+1 quantile boundaries from
            # describe(); rows 4:4+num+1 are the percentile entries.
            w = [1.0 * i / num for i in range(num + 1)]
            w = target_df.describe(percentiles=w)[4:4 + num + 1]
            # Shrink the lowest edge slightly so the minimum value falls
            # inside the first (left-open) interval.
            w[0] = w[0] * (1 - 1e-10)
            data_train[target] = pd.cut(target_df, w, labels=range(num))
        elif discrete_method == 'cluster':
            # KMeans-based bins.  FIX: the ``n_jobs`` argument was deprecated
            # in scikit-learn 0.23 and removed in 1.0 — passing it crashes on
            # modern versions, so it is no longer supplied.
            kmodel = KMeans(n_clusters=num)
            kmodel.fit(target_df.values.reshape((len(target_df), 1)))
            # Sort cluster centers, then use midpoints between adjacent
            # centers as bin boundaries.  rolling(2).mean() makes the first
            # row NaN, so it is dropped with iloc[1:].
            c = pd.DataFrame(kmodel.cluster_centers_).sort_values(0)
            w = c.rolling(2).mean().iloc[1:]
            # Add outer boundaries: 0 at the low end, the column max on top.
            w = [0] + list(w[0]) + [target_df.max()]
            data_train[target] = pd.cut(
                target_df, w, labels=range(num))
        else:
            return response_util.wrong_info(
                '输入的方法不包含在frequency/metric/cluster里')
        return response_util.csv_info(data_train)