def get_feature_pipelines(sort_by=FeaturePipeline.start_time, response_type="json", **kwargs):
    """Return all feature pipelines, sorted and filtered by the given kwargs."""
    assert_response_type(response_type)
    q = session.query(FeaturePipeline)
    q = sort_and_filter(q, sort_by=sort_by, **kwargs)
    return build_response(q.all(), response_type)

def get_predictors(sort_by=Predictor.created_at, response_type="json", **kwargs):
    """Return all predictors, as raw ORM objects or as a built response."""
    assert_response_type(response_type)
    q = session.query(Predictor)
    q = sort_and_filter(q, sort_by=sort_by, **kwargs)
    if response_type == "object":
        return q.all()
    return build_response(q.all(), response_type)

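# Usage sketch for the list endpoints above (assumption, inferred from this
# module only: `sort_and_filter` forwards extra keyword arguments as filters).
def _example_list_predictors():
    # Serialized payload for an API consumer.
    payload = get_predictors(sort_by=Predictor.created_at, response_type="json")
    # Raw ORM objects for internal callers.
    predictors = get_predictors(response_type="object")
    return payload, predictors
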
def get_features(self, train_config):
    """Load the numeric and occurring feature frames matching train_config."""
    numeric_feature_objs = session.query(Feature).filter_by(
        feature_pipeline=self,
        window_start=train_config.window_start_numeric,
        window_end=train_config.window_end_numeric,
        feature_type=train_config.feature_type_numeric).all()
    numeric_feature_dfs = [pd.read_csv(f.path) for f in numeric_feature_objs]
    occurring_feature_objs = session.query(Feature).filter_by(
        feature_pipeline=self,
        window_start=train_config.window_start_occurring,
        window_end=train_config.window_end_occurring,
        feature_type=train_config.feature_type_occurring).all()
    occurring_feature_dfs = [pd.read_csv(f.path) for f in occurring_feature_objs]
    return numeric_feature_dfs, occurring_feature_dfs

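# Usage sketch (assumption: `get_features` above is a method of the
# FeaturePipeline model, as `self` and the `feature_pipeline=self` filter suggest).
def _example_get_features(feature_pipeline, train_config):
    numeric_dfs, occurring_dfs = feature_pipeline.get_features(train_config)
    # Each entry is one persisted CSV loaded into a pandas DataFrame.
    return len(numeric_dfs), len(occurring_dfs)
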
def get_training_results(
        pipeline_id=None,
        columns=DEFAULT_COLUMNS,
        metrics=DEFAULT_METRICS,
        response_type="json",
        **kwargs):
    """Return the selected columns and metrics for all results of a pipeline."""
    assert_response_type(response_type)
    # Fall back to the `pipeline` query parameter of the current request.
    pipeline_id = pipeline_id or request.args.get('pipeline', type=int)
    assert pipeline_id is not None
    q = session.query(
        TrainingResult
    ).join(
        TrainingConfiguration,
        TrainingResult.training_configuration_id == TrainingConfiguration.id
    ).filter(
        TrainingConfiguration.training_pipeline_id == pipeline_id
    ).with_entities(
        *columns, *metrics
    )
    q = sort_and_filter(q, **kwargs)
    results = [row._asdict() for row in q]
    return build_response(results, response_type)

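# Usage sketch (assumption: outside a Flask request context the fallback to
# `request.args` is unavailable, so the pipeline id is passed explicitly here).
def _example_training_results(pipeline_id):
    return get_training_results(
        pipeline_id=pipeline_id,
        columns=DEFAULT_COLUMNS,
        metrics=DEFAULT_METRICS,
        response_type="json",
    )
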
def load(cls, id):
    obj = session.query(cls).filter_by(id=id).first()
    return obj

def load_by_config(cls, training_configuration, algorithm, sampler):
    obj = session.query(cls).filter_by(
        training_configuration_id=training_configuration.id,
        algorithm=algorithm,
        sampler=sampler).first()
    return obj

def load(cls, id):
    obj = session.query(cls).filter_by(
        training_configuration_id=id).first()
    return obj

def load_by_config(cls, training_pipeline_id, config):
    obj = session.query(cls).filter_by(
        training_pipeline_id=training_pipeline_id,
        **config).first()
    return obj

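# Usage sketch for the loaders above (assumptions: they are bound as
# classmethods, and the `load_by_config` variants belong to
# TrainingConfiguration and TrainingResult respectively -- inferred from
# their filter columns, not confirmed elsewhere).
def _example_load_result(training_pipeline_id, config, algorithm, sampler):
    training_configuration = TrainingConfiguration.load_by_config(
        training_pipeline_id, config)
    return TrainingResult.load_by_config(
        training_configuration, algorithm, sampler)
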
def load_features_and_transform(training_configuration, data_loader,
                                bin_size=30, persist_data=True):
    """
    Load features from the feature pipeline, then apply the DataLoader
    for feature transformation.
    """
    target = training_configuration.target
    cohort = training_configuration.training_pipeline.cohort.get_fiber()
    onset_df = training_configuration.training_pipeline.onset_dataframe.get_df(
        target)
    feature_pipeline = training_configuration.training_pipeline.feature_pipeline

    numeric_feature_dfs = []
    if training_configuration.feature_type_numeric == 'numeric_binned':
        window = (training_configuration.window_start_numeric,
                  training_configuration.window_end_numeric)
        # Load the numeric features bin by bin across the configured window.
        for w in range(window[0], window[1], bin_size):
            numeric_feature_objs = session.query(Feature).filter_by(
                feature_pipeline=feature_pipeline,
                feature_type=training_configuration.feature_type_numeric,
                window_start=w,
                window_end=w + bin_size).all()
            cur_numeric_dfs = [pd.read_csv(f.path) for f in numeric_feature_objs]
            # Rename columns so each bin's features carry their interval.
            for df in cur_numeric_dfs:
                df.set_index(OCCURRENCE_INDEX, inplace=True)
                new_cols = [
                    get_name_for_interval(c, [w, w + bin_size])
                    for c in df.columns
                ]
                df.columns = new_cols
                df.reset_index(inplace=True)
            numeric_feature_dfs += cur_numeric_dfs
        _, occurring_feature_dfs = feature_pipeline.get_features(
            training_configuration)
    else:
        numeric_feature_dfs, occurring_feature_dfs = feature_pipeline.get_features(
            training_configuration)

    # Align key dtypes before merging.
    cohort.occurrences.medical_record_number = \
        cohort.occurrences.medical_record_number.astype(int)
    onset_df.medical_record_number = onset_df.medical_record_number.astype(int)

    if training_configuration.feature_type_numeric == "numeric_time_series":
        pivoted_dfs = []
        for df in numeric_feature_dfs:
            numeric_df = threshold_clip_time_series(
                df=df,
                cohort=cohort,
                threshold=training_configuration.threshold_numeric)
            pivoted_dfs.append(
                pivot_time_series(
                    cohort=cohort,
                    onset_df=onset_df,
                    df=numeric_df,
                ))
        numeric_df = merge_to_base(cohort.occurrences, pivoted_dfs)
    else:
        numeric_df = merge_to_base(cohort.occurrences, [
            x.filter(regex=(data_loader.column_selector
                            + "|medical_record_number|age_in_days"))
            for x in numeric_feature_dfs
        ])
        numeric_df = column_threshold_clip(
            df=numeric_df,
            threshold=training_configuration.threshold_numeric)

    occurring_df = merge_to_base(cohort.occurrences, [
        x.filter(regex=(data_loader.column_selector
                        + "|medical_record_number|age_in_days"))
        for x in occurring_feature_dfs
    ])
    occurring_df = column_threshold_clip(
        df=occurring_df,
        threshold=training_configuration.threshold_occurring)

    cohort.occurrences.medical_record_number = \
        cohort.occurrences.medical_record_number.astype(str)
    numeric_df.medical_record_number = \
        numeric_df.medical_record_number.astype(str)
    occurring_df.medical_record_number = \
        occurring_df.medical_record_number.astype(str)
    onset_df.medical_record_number = onset_df.medical_record_number.astype(str)

    # Merge to cohort data and use the user's data loader.
    data = cohort.merge_patient_data(
        onset_df,
        numeric_df,
        occurring_df,
    )

    # Persist training data.
    if persist_data:
        TrainingData.persist(training_configuration=training_configuration,
                             data=data)

    X, y = data_loader.transform(X=data.drop(columns=[target]), y=data[target])
    return X, y

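# End-to-end sketch (assumptions: `TrainingConfiguration.load` is the loader
# defined above, and `data_loader` is any object exposing a `column_selector`
# regex fragment and a `transform(X, y)` method, as used inside
# load_features_and_transform).
def _example_build_training_matrix(training_configuration_id, data_loader):
    training_configuration = TrainingConfiguration.load(training_configuration_id)
    X, y = load_features_and_transform(
        training_configuration,
        data_loader,
        bin_size=30,
        persist_data=False,  # skip TrainingData.persist for a dry run
    )
    return X, y
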
def get_cohorts(sort_by=Cohort.created_at, response_type="json", **kwargs):
    """Return all cohorts, sorted and filtered by the given kwargs."""
    assert_response_type(response_type)
    q = session.query(Cohort)
    q = sort_and_filter(q, sort_by=sort_by, **kwargs)
    return build_response(q.all(), response_type)