Example #1

import os
import time
import os.path as P

# util, logger, and MissingParamException are assumed project-local helpers.

def handle_tornado_upload_file(http_handler, tornado_http_files,
                               upload_start_time):
    # 1. Check and read the upload parameter.
    uploaded_files = tornado_http_files.get("file")
    if not uploaded_files:
        raise MissingParamException("file")
    tornado_http_file = uploaded_files[0]

    file_name = tornado_http_file['filename']
    file_body = tornado_http_file['body']
    file_size = util.human_data_size(len(file_body))
    file_suffix = util.get_file_suffix(file_name)

    assert file_suffix in ['.csv', '.tsv'], \
        "File suffix must be one of [.csv, .tsv], got: %s" % file_suffix

    origin_file_name = util.make_dataset_name(
        util.cut_suffix(file_name)) + file_suffix  # normalized: URL-safe and readable as a disk path

    # 2. Write the upload body to a temporary local file.
    temporary_file_path = util.temporary_upload_file_path(origin_file_name)

    os.makedirs(P.dirname(temporary_file_path), exist_ok=True)

    logger.info(f"Opening {temporary_file_path} to store the uploaded file.")

    with open(temporary_file_path, 'wb') as f:
        f.write(file_body)
    logger.info(
        f"Upload finished at {temporary_file_path}, file size {file_size}.")

    upload_took = util.time_diff(time.time(), upload_start_time)

    # 3. response
    # relative_path = temporary_file_path[len(consts.PATH_DATA_ROOT)+1:]  # relative path, must not start with /
    response = {
        "path": util.relative_path(P.abspath(temporary_file_path)),
        "size": file_size,
        "took": upload_took,
    }
    http_handler.response_json(response)
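
A minimal usage sketch for the handler above: UploadHandler is hypothetical, self.request.files is Tornado's real upload dict, and response_json is assumed to be a helper the project mixes into its RequestHandler.

import time
import tornado.web

class UploadHandler(tornado.web.RequestHandler):  # hypothetical route handler
    def post(self):
        # Tornado fills self.request.files with a dict mapping each form field
        # name to a list of {'filename', 'body', 'content_type'} file dicts.
        handle_tornado_upload_file(self, self.request.files, time.time())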
#             logger.info(f"Encode label column {label_col} because type is {f.type}. ")
#             y = pd.Series(LabelEncoder().fit_transform(y), name=label_col)
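# Context assumed from the enclosing analysis job (not shown in this excerpt):
# df is the dataset as a pandas DataFrame, y the label Series, label_col its
# name, and dataset_stats carries per-feature type metadata.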

# 5. encode categorical features
pearson_corr_dict = {}
for f in dataset_stats.features:
    if f.type == FeatureType.Categorical:
        logger.info(f"Skip categorical feature {f.name} ")
        # lb = LabelEncoder()
        # encoded_series = pd.Series(lb.fit_transform(df[f.name]), name=f.name)
        # pearson_corr_dict[f.name] = y.corr(encoded_series, method='pearson')
        pearson_corr_dict[f.name] = None

    elif f.type in [FeatureType.Continuous, FeatureType.Datetime]:
        pearson_corr_dict[f.name] = y.corr(df[f.name], method='pearson')
    else:
        logger.info(
            f"Feature {f.name} has type {f.type}; skip correlation calculation.")
        pearson_corr_dict[f.name] = None  # text features are not supported

extension = {"corr": pearson_corr_dict, "label_col": label_col}

# 6. send back calc result
client.analyze_callback(portal=server_portal,
                        dataset_name=dataset_name,
                        analyze_job_name=job_name,
                        type=AnalyzeStep.Types.PatchCorrelation,
                        status=JobStep.Status.Succeed,
                        took=util.time_diff(time.time(), t),
                        extension=extension)
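
For reference, a self-contained sketch of the Pearson call used in step 5; the DataFrame and column names here are made up for illustration, and only pandas' Series.corr is the real API.

import pandas as pd

# Series.corr(other, method='pearson') is exactly the call used above.
df = pd.DataFrame({"feature_a": [1.0, 2.0, 3.0, 4.0],
                   "label": [2.0, 3.9, 6.1, 8.0]})
y = df["label"]
print(round(y.corr(df["feature_a"], method="pearson"), 3))  # ~1.0 for this near-linear data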