Example #1
def get_feature_pipelines(sort_by=FeaturePipeline.start_time,
                          response_type="json",
                          **kwargs):
    assert_response_type(response_type)
    q = session.query(FeaturePipeline)
    q = sort_and_filter(q, sort_by=sort_by, **kwargs)
    return build_response(q.all(), response_type)
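A hedged usage sketch of this pattern; sort_and_filter and build_response are project helpers, and the call below only assumes the signature shown above:

# Hypothetical call: returns the pipelines serialized by build_response.
pipelines_json = get_feature_pipelines(
    sort_by=FeaturePipeline.start_time,  # model column used as the sort key
    response_type="json",
)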
Example #2
def get_predictors(sort_by=Predictor.created_at,
                   response_type="json",
                   **kwargs):
    assert_response_type(response_type)
    q = session.query(Predictor)
    q = sort_and_filter(q, sort_by=sort_by, **kwargs)
    if response_type == 'object':
        return q.all()
    else:
        return build_response(q.all(), response_type)
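Compared to Example #1, this variant adds an 'object' branch that skips serialization and hands back the ORM instances themselves. A minimal sketch of the difference, assuming Predictor rows exist in the session:

predictors = get_predictors(response_type="object")   # list of Predictor instances
latest = max(predictors, key=lambda p: p.created_at)  # attribute access still works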
Example #3
File: data.py  Project: hpi-dhc/robotehr
    def get_features(self, train_config):
        numeric_feature_objs = session.query(Feature).filter_by(
            feature_pipeline=self,
            window_start=train_config.window_start_numeric,
            window_end=train_config.window_end_numeric,
            feature_type=train_config.feature_type_numeric).all()
        numeric_feature_dfs = [
            pd.read_csv(f.path) for f in numeric_feature_objs
        ]

        occurring_feature_objs = session.query(Feature).filter_by(
            feature_pipeline=self,
            window_start=train_config.window_start_occurring,
            window_end=train_config.window_end_occurring,
            feature_type=train_config.feature_type_occurring).all()
        occurring_feature_dfs = [
            pd.read_csv(f.path) for f in occurring_feature_objs
        ]

        return numeric_feature_dfs, occurring_feature_dfs
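get_features pairs each Feature row with the CSV its path column points to and returns two lists of DataFrames. A hedged usage sketch, where train_config is any object carrying the window and feature-type attributes read above:

numeric_dfs, occurring_dfs = feature_pipeline.get_features(train_config)
# Each entry is one feature table loaded from its Feature.path CSV.
print(len(numeric_dfs), len(occurring_dfs))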
Example #4
def get_training_results(
    pipeline_id=None,
    columns=DEFAULT_COLUMNS,
    metrics=DEFAULT_METRICS,
    response_type="json",
    **kwargs
):
    assert_response_type(response_type)
    pipeline_id = pipeline_id or request.args.get('pipeline', type=int)
    assert pipeline_id is not None
    q = session.query(
        TrainingResult
    ).join(
        TrainingConfiguration,
        TrainingResult.training_configuration_id == TrainingConfiguration.id
    ).filter(
        TrainingConfiguration.training_pipeline_id == pipeline_id
    ).with_entities(
        *columns,
        *metrics
    )
    q = sort_and_filter(q, **kwargs)
    results = [row._asdict() for row in q]
    return build_response(results, response_type)
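The with_entities(*columns, *metrics) call narrows the SELECT list, so iterating the query yields keyed Row tuples rather than ORM objects, which is what makes row._asdict() work. A self-contained sketch of just that pattern; the model and data here are illustrative, not from robotehr:

from sqlalchemy import Column, Float, Integer, String, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

class TrainingResult(Base):
    __tablename__ = "training_result"
    id = Column(Integer, primary_key=True)
    algorithm = Column(String)
    auc_roc = Column(Float)

engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()
session.add(TrainingResult(algorithm="xgboost", auc_roc=0.87))
session.commit()

# with_entities restricts the query to named columns; rows are keyed tuples.
q = session.query(TrainingResult).with_entities(
    TrainingResult.algorithm, TrainingResult.auc_roc)
print([row._asdict() for row in q])
# -> [{'algorithm': 'xgboost', 'auc_roc': 0.87}]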
Example #5
    @classmethod
    def load(cls, id):
        obj = session.query(cls).filter_by(id=id).first()
        return obj
Example #6
    @classmethod
    def load_by_config(cls, training_configuration, algorithm, sampler):
        obj = session.query(cls).filter_by(
            training_configuration_id=training_configuration.id,
            algorithm=algorithm,
            sampler=sampler).first()
        return obj
Example #7
    @classmethod
    def load(cls, id):
        obj = session.query(cls).filter_by(
            training_configuration_id=id).first()
        return obj
Example #8
    @classmethod
    def load_by_config(cls, training_pipeline_id, config):
        obj = session.query(cls).filter_by(
            training_pipeline_id=training_pipeline_id, **config).first()
        return obj
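Because filter_by takes keyword equality filters, the config dict's keys can be splatted straight into it, provided each key matches a column on the model. A hypothetical call; SomeModel and the keys shown are assumptions, not names from the project:

# Hypothetical: every key in config must be a column of SomeModel.
config = {"window_start": -365, "window_end": 0}
obj = SomeModel.load_by_config(training_pipeline_id=1, config=config)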
Example #9
def load_features_and_transform(training_configuration,
                                data_loader,
                                bin_size=30,
                                persist_data=True):
    """
    Load features from feature pipeline.
    Then apply the DataLoader for feature transformation.
    """
    target = training_configuration.target
    cohort = training_configuration.training_pipeline.cohort.get_fiber()
    onset_df = training_configuration.training_pipeline.onset_dataframe.get_df(
        target)
    feature_pipeline = training_configuration.training_pipeline.feature_pipeline

    numeric_feature_dfs = []
    if training_configuration.feature_type_numeric == 'numeric_binned':
        # Binned numeric features are stored one window per bin, so load each
        # bin_size-day slice separately and tag its columns with the interval.
        window_start = training_configuration.window_start_numeric
        window_end = training_configuration.window_end_numeric
        for w in range(window_start, window_end, bin_size):
            numeric_feature_objs = session.query(Feature).filter_by(
                feature_pipeline=feature_pipeline,
                feature_type=training_configuration.feature_type_numeric,
                window_start=w,
                window_end=w + bin_size).all()
            cur_numeric_dfs = [
                pd.read_csv(f.path) for f in numeric_feature_objs
            ]
            for df in cur_numeric_dfs:
                df.set_index(OCCURRENCE_INDEX, inplace=True)
                new_cols = [
                    get_name_for_interval(c, [w, w + bin_size])
                    for c in df.columns
                ]
                df.columns = new_cols
                df.reset_index(inplace=True)
            numeric_feature_dfs += cur_numeric_dfs
        _, occurring_feature_dfs = feature_pipeline.get_features(
            training_configuration)

    else:
        numeric_feature_dfs, occurring_feature_dfs = feature_pipeline.get_features(
            training_configuration)

    # Normalize the merge key's dtype so the joins below line up.
    cohort.occurrences.medical_record_number = cohort.occurrences.medical_record_number.astype(
        int)
    onset_df.medical_record_number = onset_df.medical_record_number.astype(int)

    if training_configuration.feature_type_numeric == "numeric_time_series":
        pivoted_dfs = []
        for df in numeric_feature_dfs:
            numeric_df = threshold_clip_time_series(
                df=df,
                cohort=cohort,
                threshold=training_configuration.threshold_numeric)
            pivoted_dfs.append(
                pivot_time_series(
                    cohort=cohort,
                    onset_df=onset_df,
                    df=numeric_df,
                ))
        numeric_df = merge_to_base(cohort.occurrences, pivoted_dfs)

    else:
        numeric_df = merge_to_base(cohort.occurrences, [
            x.filter(regex=(data_loader.column_selector +
                            "|medical_record_number|age_in_days"))
            for x in numeric_feature_dfs
        ])
        numeric_df = column_threshold_clip(
            df=numeric_df, threshold=training_configuration.threshold_numeric)

    occurring_df = merge_to_base(cohort.occurrences, [
        x.filter(regex=(data_loader.column_selector +
                        "|medical_record_number|age_in_days"))
        for x in occurring_feature_dfs
    ])
    occurring_df = column_threshold_clip(
        df=occurring_df, threshold=training_configuration.threshold_occurring)

    # Switch the merge key back to str before merging patient data.
    cohort.occurrences.medical_record_number = cohort.occurrences.medical_record_number.astype(
        str)
    numeric_df.medical_record_number = numeric_df.medical_record_number.astype(
        str)
    occurring_df.medical_record_number = occurring_df.medical_record_number.astype(
        str)
    onset_df.medical_record_number = onset_df.medical_record_number.astype(str)

    # Merge onset, numeric, and occurring features into the cohort data
    data = cohort.merge_patient_data(
        onset_df,
        numeric_df,
        occurring_df,
    )

    # Persist the assembled training data
    if persist_data:
        TrainingData.persist(training_configuration=training_configuration,
                             data=data)
    X, y = data_loader.transform(X=data.drop(columns=[target]), y=data[target])
    return X, y
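A hedged end-to-end sketch of how this might be invoked; the training_configuration and data_loader objects are assumed to come from elsewhere in the robotehr project:

# Hypothetical call; persist_data=False skips the TrainingData.persist write.
X, y = load_features_and_transform(
    training_configuration,
    data_loader,
    bin_size=30,
    persist_data=False,
)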
Example #10
def get_cohorts(sort_by=Cohort.created_at, response_type="json", **kwargs):
    assert_response_type(response_type)
    q = session.query(Cohort)
    q = sort_and_filter(q, sort_by=sort_by, **kwargs)
    return build_response(q.all(), response_type)