# Imports used by the view functions below. csv_util and response_util are
# project-local helper modules (CSV upload and response wrappers); the bare
# "import csv_util / response_util" form assumes they sit on the import path.
import json
import os

import numpy as np
import pandas as pd
from django.http import HttpResponse
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import (GradientBoostingClassifier,
                              RandomForestClassifier,
                              RandomForestRegressor)

import csv_util
import response_util


def randomForest(request):
    # Read the uploaded CSV into a DataFrame, then delete the temporary file.
    f = request.FILES.get("csv_file")
    filename = csv_util.upload(f)
    data_train = pd.read_csv(filename)
    os.remove(filename)
    # target is the name of the column whose missing values should be filled in.
    target = request.POST['target']
    # ref is the list of reference columns used to fit the model, passed as a
    # comma-separated string, e.g. "SibSp,Pclass,Fare,Parch".
    ref_str = request.POST['ref']
    ref = ref_str.split(',')
    ref.insert(0, target)  # put the target column name at the front of ref
    target_df = data_train[ref]  # keep only these columns
    # Split the rows into those where the target value is known and those where it is missing.
    known_data = target_df[target_df[target].notnull()].values
    unknown_data = target_df[target_df[target].isnull()].values
    # y is the target value, X are the feature values.
    y = known_data[:, 0]
    X = known_data[:, 1:]
    # Fit a RandomForestRegressor on the rows where the target is known.
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)
    # Predict the missing target values with the fitted model.
    predicted_values = rfr.predict(unknown_data[:, 1:])
    # Fill the missing values in the original data with the predictions.
    data_train.loc[(data_train[target].isnull()), target] = predicted_values

    return response_util.csv_info(data_train)
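
# Illustrative sketch (not part of the view code): how the imputation above behaves on a
# small synthetic DataFrame. The column names, values and the _demo_rf_imputation name are
# made up for this example.
def _demo_rf_imputation():
    df = pd.DataFrame({
        "Age": [22.0, 38.0, None, 35.0, None, 54.0],
        "Fare": [7.25, 71.28, 8.05, 53.1, 8.46, 51.86],
        "Pclass": [3, 1, 3, 1, 3, 1],
    })
    cols = ["Age", "Fare", "Pclass"]          # target column first, then the reference columns
    known = df[cols][df["Age"].notnull()].values
    unknown = df[cols][df["Age"].isnull()].values
    rfr = RandomForestRegressor(n_estimators=100, random_state=0)
    rfr.fit(known[:, 1:], known[:, 0])        # fit on the rows where the target is known
    df.loc[df["Age"].isnull(), "Age"] = rfr.predict(unknown[:, 1:])
    return df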
def format(request):
    if "POST" == request.method:
        # Read the uploaded CSV into a DataFrame, then delete the temporary file.
        f = request.FILES.get("csv_file")
        filename = csv_util.upload(f)
        data_train = pd.read_csv(filename)
        os.remove(filename)
        # label is the name of the column to use as the label.
        label = request.POST['label']
        y_train = data_train[label].values.tolist()
        # all_used says whether every column except the label column is needed.
        all_used = request.POST['all_used']
        if all_used == '0':
            # data_col lists the names of the columns that are needed.
            data_col_str = request.POST['data_col']
            data_col = data_col_str.split(',')
            x_train = data_train[data_col].values.tolist()
            result = {"X_train": x_train, "y_train": y_train}
            return HttpResponse(json.dumps(result),
                                content_type="application/json")
        else:
            x_train = data_train.drop(label, axis=1).values.tolist()
            result = {"X_train": x_train, "y_train": y_train}
            return HttpResponse(json.dumps(result),
                                content_type="application/json")
def PCA_(request):
    if "POST" == request.method:
        # Read the uploaded CSV into a DataFrame, then delete the temporary file.
        f = request.FILES.get("csv_file")
        filename = csv_util.upload(f)
        data_train = pd.read_csv(filename)
        os.remove(filename)
        # Names of the columns to run PCA on.
        target_str = request.POST['target']
        target = target_str.split(',')
        target_df = data_train[target]  # keep only these columns
        # Run the principal component analysis.
        feature_num = int(request.POST['num'])
        pca = PCA(n_components=feature_num)
        pca_data = pca.fit_transform(target_df)
        result = {
            "pca_result": pca_data.tolist(),
            "explained_variance_ratio_":
            pca.explained_variance_ratio_.tolist()
        }
        return HttpResponse(json.dumps(result),
                            content_type="application/json")
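
# Illustrative sketch (not part of the view code): PCA on a small synthetic matrix,
# mirroring what PCA_ returns for the uploaded columns. The numbers and the
# _demo_pca name are made up for this example.
def _demo_pca():
    X = np.array([[2.5, 2.4], [0.5, 0.7], [2.2, 2.9],
                  [1.9, 2.2], [3.1, 3.0], [2.3, 2.7]])
    pca = PCA(n_components=1)                  # keep a single principal component
    reduced = pca.fit_transform(X)             # shape (6, 1): projection onto that component
    # explained_variance_ratio_ gives the fraction of variance kept by each component.
    return reduced.tolist(), pca.explained_variance_ratio_.tolist()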
def scale_(request):
    if "POST" == request.method:
        # Read the uploaded CSV into a DataFrame, then delete the temporary file.
        f = request.FILES.get("csv_file")
        filename = csv_util.upload(f)
        data_train = pd.read_csv(filename)
        os.remove(filename)
        # Names of the columns to rescale.
        target_str = request.POST['target']
        target = target_str.split(',')
        # Scaling method: log2, log10, ln, abs or sqrt.
        scale = request.POST['scale']
        for each in target:
            target_df = data_train[each]
            if scale == 'log2':
                data_train[each] = np.log2(target_df)
            elif scale == 'log10':
                data_train[each] = np.log10(target_df)
            elif scale == 'ln':
                data_train[each] = np.log(target_df)
            elif scale == 'abs':
                data_train[each] = np.abs(target_df)
            elif scale == 'sqrt':
                data_train[each] = np.sqrt(target_df)
            else:
                return response_util.wrong_info(
                    'The scaling method must be one of log2, log10, ln, abs, sqrt')
        return response_util.csv_info(data_train)
def filter(request):
    if "POST" == request.method:
        # Read the uploaded CSV into a DataFrame, then delete the temporary file.
        f = request.FILES.get("csv_file")
        filename = csv_util.upload(f)
        data_train = pd.read_csv(filename)
        os.remove(filename)
        # Names of the feature columns whose importance is evaluated.
        target_str = request.POST['target']
        target = target_str.split(',')
        x_train = data_train[target]  # keep only these columns
        # Name of the label column.
        label = request.POST['label']
        # Number of features to keep after filtering.
        num = int(request.POST['num'])

        y_train = data_train[label]

        # Rank the features with a gradient boosting classifier (default settings).
        gbdt = GradientBoostingClassifier(learning_rate=0.1,
                                          n_estimators=100,
                                          max_depth=3,
                                          subsample=1.0)
        gbdt.fit(x_train, y_train)
        importances = gbdt.feature_importances_.tolist()
        # Keep the num columns with the largest importances.
        temp_importances = importances.copy()
        temp_cols = list(target)
        result_col = []
        result_importance = []
        for _ in range(num):
            index = temp_importances.index(max(temp_importances))
            result_col.append(temp_cols[index])
            result_importance.append(temp_importances[index])
            del temp_importances[index]
            del temp_cols[index]
        result_data = x_train[result_col]
        result = {
            "result_data": result_data.values.tolist(),
            "result_features": result_col,
            "result_importance": result_importance
        }
        return HttpResponse(json.dumps(result),
                            content_type="application/json")
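
# Illustrative sketch (not part of the view code): the "keep the num most important
# features" selection used in filter and importance_filter can also be written with
# numpy's argsort. The _demo_top_k_features name is made up for this example, e.g.
# _demo_top_k_features(gbdt.feature_importances_, target, num).
def _demo_top_k_features(importances, columns, num):
    order = np.argsort(importances)[::-1][:num]     # indices of the num largest importances
    result_col = [columns[i] for i in order]
    result_importance = [importances[i] for i in order]
    return result_col, result_importance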
def setId(request):
    if "POST" == request.method:
        # Read the uploaded CSV into a DataFrame, then delete the temporary file.
        f = request.FILES.get("csv_file")
        filename = csv_util.upload(f)
        data_train = pd.read_csv(filename)
        os.remove(filename)
        # Add an id column that numbers the rows from 0.
        ids = list(range(len(data_train)))
        data_train.insert(0, 'id', ids)
        return response_util.csv_info(data_train)
def standard(request):
    # Read the uploaded CSV into a DataFrame, then delete the temporary file.
    f = request.FILES.get("csv_file")
    filename = csv_util.upload(f)
    data_train = pd.read_csv(filename)
    os.remove(filename)
    # target lists the names of the columns to standardize.
    target_str = request.POST['target']
    target = target_str.split(',')
    scaler = preprocessing.StandardScaler()
    for each in target:
        standard_data = scaler.fit_transform(
            data_train[each].values.reshape(-1, 1))
        data_train.drop([each], axis=1, inplace=True)
        data_train[each] = standard_data.ravel()
    return response_util.csv_info(data_train)
def dummy(request):
    if "POST" == request.method:
        # Read the uploaded CSV into a DataFrame, then delete the temporary file.
        f = request.FILES.get("csv_file")
        filename = csv_util.upload(f)
        data_train = pd.read_csv(filename)
        os.remove(filename)
        # target_str lists the columns to one-hot encode, comma-separated, e.g. "cabin,sex,pclass".
        target_str = request.POST['target']
        target = target_str.split(',')
        for each in target:
            dummies = pd.get_dummies(data_train[each], prefix=each)
            data_train = pd.concat([data_train, dummies], axis=1)
            data_train.drop([each], axis=1, inplace=True)
        return response_util.csv_info(data_train)
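
# Illustrative sketch (not part of the view code): what pd.get_dummies does to a single
# column. The column names, values and the _demo_get_dummies name are made up.
def _demo_get_dummies():
    df = pd.DataFrame({"Sex": ["male", "female", "female"],
                       "Fare": [7.25, 71.28, 8.05]})
    dummies = pd.get_dummies(df["Sex"], prefix="Sex")   # columns Sex_female, Sex_male
    # Replace the original column with its indicator columns, as dummy() does.
    return pd.concat([df.drop(columns=["Sex"]), dummies], axis=1)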
def soften(request):
    if "POST" == request.method:
        # Read the uploaded CSV into a DataFrame, then delete the temporary file.
        f = request.FILES.get("csv_file")
        filename = csv_util.upload(f)
        data_train = pd.read_csv(filename)
        os.remove(filename)
        # Name of the column to smooth.
        target = request.POST['target']
        # Smoothing method: percentile 'per' or threshold 'thresh'.
        soften_method = request.POST['soften_method']
        # Lower and upper cut points (percentages of the data range, or min/max thresholds).
        min_value = int(request.POST['min'])
        max_value = int(request.POST['max'])
        # Pull out the target column as a list.
        target_df = data_train[target].values.tolist()
        if soften_method == 'per':
            # Percentile smoothing.
            if min_value < 0:
                return response_util.wrong_info(
                    'For percentile smoothing, min must not be less than 0')
            elif max_value > 100:
                return response_util.wrong_info(
                    'For percentile smoothing, max must not be greater than 100')
            else:
                min_value /= 100
                max_value /= 100
                # Smallest and largest values in the column.
                data_min = min(target_df)
                data_max = max(target_df)
                # Clip boundaries: points located min/max percent along the data range.
                min_num = data_min + min_value * (data_max - data_min)
                max_num = data_min + max_value * (data_max - data_min)
                for i in range(0, len(target_df)):
                    if target_df[i] < min_num:
                        target_df[i] = min_num
                    elif target_df[i] > max_num:
                        target_df[i] = max_num
                data_train[target] = target_df
        elif soften_method == 'thresh':
            # Threshold smoothing: clip directly to the given bounds.
            for i in range(0, len(target_df)):
                if target_df[i] < min_value:
                    target_df[i] = min_value
                elif target_df[i] > max_value:
                    target_df[i] = max_value
            data_train[target] = target_df
        else:
            return response_util.wrong_info(
                'The smoothing method must be one of per, thresh')
        return response_util.csv_info(data_train)
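
# Illustrative sketch (not part of the view code): both smoothing branches above are
# clipping operations, so the same result can be expressed with np.clip. The
# _demo_soften name is made up for this example.
def _demo_soften(values, soften_method, min_value, max_value):
    values = np.asarray(values, dtype=float)
    if soften_method == 'per':
        # Percentile smoothing: clip to points min/max percent along the data range.
        lo = values.min() + (min_value / 100.0) * (values.max() - values.min())
        hi = values.min() + (max_value / 100.0) * (values.max() - values.min())
    else:
        # Threshold smoothing: clip directly to the given bounds.
        lo, hi = min_value, max_value
    return np.clip(values, lo, hi)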
def importance_filter(request):
    if "POST" == request.method:
        # Read the uploaded CSV into a DataFrame, then delete the temporary file.
        f = request.FILES.get("csv_file")
        filename = csv_util.upload(f)
        data_train = pd.read_csv(filename)
        os.remove(filename)
        # Names of the feature columns whose importance is evaluated.
        target_str = request.POST['target']
        target = target_str.split(',')
        x_train = data_train[target]  # keep only these columns
        # Name of the label column.
        label = request.POST['label']
        y_train = data_train[label]
        # Number of columns to keep after filtering.
        num = int(request.POST['num'])

        # Rank the features with a random forest classifier.
        forest = RandomForestClassifier(n_estimators=10000,
                                        random_state=0,
                                        n_jobs=-1)
        forest.fit(x_train, y_train)
        importances = forest.feature_importances_.tolist()
        # Keep the num columns with the largest importances.
        temp_importances = importances.copy()
        temp_cols = list(target)
        result_col = []
        result_importance = []
        for _ in range(num):
            index = temp_importances.index(max(temp_importances))
            result_col.append(temp_cols[index])
            result_importance.append(temp_importances[index])
            del temp_importances[index]
            del temp_cols[index]
        result_data = x_train[result_col]
        result = {
            "result_data": result_data.values.tolist(),
            "result_features": result_col,
            "result_importance": result_importance
        }
        return HttpResponse(json.dumps(result),
                            content_type="application/json")
def normalize(request):
    if "POST" == request.method:
        # Read the uploaded CSV into a DataFrame, then delete the temporary file.
        f = request.FILES.get("csv_file")
        filename = csv_util.upload(f)
        data_train = pd.read_csv(filename)
        os.remove(filename)
        # Names of the columns to normalize to the [0, 1] range.
        target_str = request.POST['target']
        target = target_str.split(',')
        mm = preprocessing.MinMaxScaler()
        for each in target:
            mm_data = mm.fit_transform(
                data_train[each].values.reshape(-1, 1))
            data_train.drop([each], axis=1, inplace=True)
            data_train[each] = mm_data.ravel()
        return response_util.csv_info(data_train)
def discrete(request):
    if "POST" == request.method:
        # Read the uploaded CSV into a DataFrame, then delete the temporary file.
        f = request.FILES.get("csv_file")
        filename = csv_util.upload(f)
        data_train = pd.read_csv(filename)
        os.remove(filename)
        # Name of the column to discretize.
        target = request.POST['target']
        # Discretization method: equal-frequency 'frequency', equal-width 'metric', clustering 'cluster'.
        discrete_method = request.POST['discrete_method']
        # Number of intervals.
        num = int(request.POST['num'])
        target_df = data_train[target]
        if discrete_method == 'metric':
            # Equal-width discretization.
            data_train[target] = pd.cut(target_df, num, labels=range(num))
        elif discrete_method == 'frequency':
            # Equal-frequency discretization: use the num + 1 quantiles as bin edges.
            w = [1.0 * i / num for i in range(num + 1)]
            w = target_df.quantile(w).values
            # Lower the first edge by a tiny relative amount so the minimum value
            # falls inside the first bin (assumes non-negative data).
            w[0] = w[0] * (1 - 1e-10)
            data_train[target] = pd.cut(target_df, w, labels=range(num))
        elif discrete_method == 'cluster':
            # Clustering-based discretization.
            kmodel = KMeans(n_clusters=num)
            kmodel.fit(target_df.values.reshape((len(target_df), 1)))
            c = pd.DataFrame(kmodel.cluster_centers_).sort_values(0)
            # Take the midpoints between consecutive sorted cluster centres as bin edges:
            # rolling(2).mean() averages each centre with the previous one; the first row
            # of the result is NaN, so .iloc[1:] drops it.
            w = c.rolling(2).mean().iloc[1:]
            # Add the outer edges: 0 as the lower bound (assumes non-negative data)
            # and the column maximum as the upper bound.
            w = [0] + list(w[0]) + [target_df.max()]
            # cut assigns each value to the bin defined by the edges in w.
            data_train[target] = pd.cut(target_df, w, labels=range(num))
        else:
            return response_util.wrong_info(
                'The discretization method must be one of frequency, metric, cluster')
        return response_util.csv_info(data_train)
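
# Illustrative sketch (not part of the view code): the equal-width and equal-frequency
# branches above correspond to pandas' cut and qcut helpers. The toy series and the
# _demo_discretize name are made up for this example.
def _demo_discretize():
    s = pd.Series([1, 3, 5, 7, 9, 11, 13, 100])
    equal_width = pd.cut(s, 4, labels=range(4))     # 4 bins of equal width
    equal_freq = pd.qcut(s, 4, labels=range(4))     # 4 bins with roughly equal counts
    return equal_width.tolist(), equal_freq.tolist()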