Пример #1
0
def get_testing_metrics(model, X, y, metrics, as_indexes, n_folds, X_test=None):
    y_pred = cross_val_predict(
        model,
        X,
        y,
        cv=StratifiedKFold(
            y,
            n_folds=n_folds,
            shuffle=True,
            random_state=RANDOM_STATE
        )
    )
    print "y_pred", y_pred
    model.fit(X, y)
    result = get_y_true_y_pred_based_metrics(y, y_pred, metrics)
    if FEATURES in metrics:
        result[FEATURES] = model.get_support(indices=True)
    if OBJECTS in metrics:
        if as_indexes:
            result[OBJECTS] = [get_data_keeper().get_object_name_by_index(index) for (index,) in X]
        else:
            result[OBJECTS] = list(X.index)
    if TEST_PREDICTIONS in metrics:
        result[TEST_PREDICTIONS] = X_test, model.predict(X_test)
    return result
def make_new_generator():
    start_time = time.time()
    X = get_data_keeper().get_common_x()

    print "matrix shape before:", X.shape  # Матрица X = наша матрица мутаций (snps)
    X[X != 1] = 0

    to_drop = (X.sum(axis=0) >= (X.shape[0] / 2)) | (
        X.sum(axis=0) < 3
    )  # Убираем столбцы (=позиции мутаций), где слищком малое число мутированных образцов
    to_drop = to_drop[to_drop].index
    X = X.drop(to_drop, axis=1)

    # save filtered SNPs matrix (add saving to temp directory)
    X.to_csv(RAW_X_BEFORE_SUBSET_GENERATION_PATH)

    print "matrix shape after:", X.shape  # оставили только слолбцы, где более 3-х мутированных образцов
    sys.stdout.flush()

    # реализовать, чтобы генерация подмножеств не запускалась, если этого не требуется
    generator = SubsetGenerator()

    generator.generate_and_set(X.as_matrix().astype(
        np.uint8))  # генерируем набор подмножеств (запускаем модуль на Си++)

    print "generating done, time from start spent:", time.time() - start_time

    # временный файл для хранения сгенерированных подмножеств: нужно чтобы это было в каталоге $HOME/gwas/tmp
    generator.store(POSSIBLE_COMPLEX_FEATURES_PATH)
    print "storing done, time from start spent:", time.time() - start_time
    return generator, X
Пример #3
0
def run_experiment(
        params,
        experiment_name,
        drug,
        max_evals=100,
        as_indexes=True
    ):
    """Run one cross-validated experiment for ``drug``, one process per fold.

    A ``MetamodelFactory`` is built from ``params`` and handed, together with
    each outer train/test split, to ``run_experiment_fold`` in its own
    ``Process``. The call blocks until every fold process has finished.

    Args:
        params: metamodel structure passed to ``MetamodelFactory``.
        experiment_name: base name; combined with ``drug`` through
            ``get_experiment_name_for_drug`` to name the results dumper.
        drug: drug identifier passed to the data keeper.
        max_evals: forwarded to ``MetamodelFactory``.
        as_indexes: forwarded to the metrics getter and to ``get_train_data``.
    """
    experiment_name_for_drug = get_experiment_name_for_drug(experiment_name, drug)
    results_dumper = ResultsDumper(experiment_name_for_drug)
    loss_getter = AccuracyLossGetter()

    inner_metrics_getter = MetricsGetter(
        metrics=ALL_METRICS,
        as_indexes=as_indexes,
        loss_func=loss_getter,
        n_folds=5,
    )

    model = MetamodelFactory(
        metamodel_structure=params,
        feature_selector=None,
        results_dumper=results_dumper,
        metrics_getter=inner_metrics_getter,
        max_evals=max_evals
    )

    X, y = get_data_keeper().get_train_data(drug, as_indexes=as_indexes)

    # Small data sets get more outer folds. This value used to be computed
    # and then ignored because the splitter below was hard-coded to 5.
    n_folds = 10 if len(y) < 50 else 5

    init_common()

    outer_folds = StratifiedKFold(
        y, n_folds=n_folds, shuffle=True, random_state=RANDOM_STATE)

    processes = []
    for i, (train_index, test_index) in enumerate(outer_folds):
        process = Process(
            target=run_experiment_fold,
            args=(model, X, y, train_index, test_index, i)
        )
        processes.append(process)
        process.start()

    # Wait for every fold before returning.
    for process in processes:
        process.join()
def make_new_generator():
    start_time = time.time()
    X = get_data_keeper().get_common_x()
    print "matrix shape before:", X.shape
    X[X!=1] = 0
    to_drop = (X.sum(axis=0) >= (X.shape[0] / 2)) | (X.sum(axis=0) < 3)
    to_drop = to_drop[to_drop].index
    X = X.drop(to_drop, axis=1)
    X.to_csv(RAW_X_BEFORE_SUBSET_GENERATION_PATH)
    print "matrix shape after:", X.shape
    sys.stdout.flush()
    generator = SubsetGenerator()
    generator.generate_and_set(X.as_matrix().astype(np.uint8))
    print "generating done, time from start spent:", time.time() - start_time
    generator.store(POSSIBLE_COMPLEX_FEATURES_PATH)
    print "storing done, time from start spent:", time.time() - start_time
    return generator, X
def run_experiment(
    params,
    experiment_name,
    drug,
):
    """Fit ``params`` on the training indexes of ``drug`` and dump its feature sets.

    Args:
        params: the model itself; it is fitted directly via
            ``fit(indexes=..., y=...)``.
        experiment_name: base name; combined with ``drug`` through
            ``get_experiment_name_for_drug`` to name the results dumper.
        drug: drug identifier passed to the data keeper.
    """
    dumper = ResultsDumper(get_experiment_name_for_drug(experiment_name, drug))
    dumper.set_subdir("0")

    train_indexes, labels = get_data_keeper().get_train_data(drug, as_indexes=True)

    init_common()

    # Here the "params" object is used as the model without any wrapping.
    params.fit(indexes=train_indexes, y=labels)

    # Persist the feature sets accumulated by the model's extender strategy.
    feature_sets = params.extender_strategy._result_feature_sets
    dumper.save_tuple(feature_sets)
Пример #6
0
def run_experiment(
    params,
    experiment_name,
    drug,
    max_evals=100,
    as_indexes=True):
    """Run a hyper-parameter search experiment for ``drug``, one process per fold.

    A ``HyperParameterSearcher`` is built from ``params`` and handed, together
    with each outer train/test split, to ``run_experiment_fold`` in its own
    ``Process``. The call blocks until every fold process has finished.

    Args:
        params: search space passed to ``HyperParameterSearcher``.
        experiment_name: base name; combined with ``drug`` through
            ``get_experiment_name_for_drug`` to name the results dumper.
        drug: drug identifier passed to the data keeper.
        max_evals: forwarded to ``HyperParameterSearcher``.
        as_indexes: forwarded to the metrics getter and to ``get_train_data``.
    """
    experiment_name_for_drug = get_experiment_name_for_drug(experiment_name, drug)
    results_dumper = ResultsDumper(experiment_name_for_drug)
    loss_getter = AccuracyLossGetter()
    inner_metrics_getter = MetricsGetter(
        metrics=ALL_METRICS,
        as_indexes=as_indexes,
        loss_func=loss_getter,
        n_folds=5,
    )
    model = HyperParameterSearcher(
        params=params,
        results_dumper=results_dumper,
        metrics_getter=inner_metrics_getter,
        max_evals=max_evals,
    )

    X, y = get_data_keeper().get_train_data(drug, as_indexes=as_indexes)

    # Small data sets get more outer folds. This value used to be computed
    # and then ignored because the splitter below was hard-coded to 5.
    n_folds = 10 if len(y) < 50 else 5

    init_common()

    outer_folds = StratifiedKFold(
        y, n_folds=n_folds, shuffle=True, random_state=RANDOM_STATE)

    processes = []
    for i, (train_index, test_index) in enumerate(outer_folds):
        process = Process(
            target=run_experiment_fold,
            args=(model, X, y, train_index, test_index, i),
        )
        processes.append(process)
        process.start()

    # Wait for every fold before returning.
    for process in processes:
        process.join()
Пример #7
0
            target=run_experiment_fold,
            args=(model, X, y, train_index, test_index, i)
        )
        processes.append(process)
        process.start()

    for process in processes:
        process.join()

########


def run_model(drug):
    """Run the 'model' experiment for ``drug`` with the linear model parameters.

    Returns whatever ``run_experiment`` returns.
    """
    experiment_kwargs = dict(
        params=get_linear_model_params(),
        experiment_name='model',
        drug=drug,
        as_indexes=True,  # alternative that has been tried: as_indexes=False
        max_evals=MAX_EVALS,
    )
    return run_experiment(**experiment_kwargs)


if __name__ == '__main__':
    # Other invocations that have been used with this script:
    #run_model(get_data_keeper().get_possible_second_level_drugs()[int(sys.argv[1])])
    #run_model(get_data_keeper().get_possible_first_level_drugs()[int(2)])
    second_level_drugs = get_data_keeper().get_possible_second_level_drugs()
    run_model(second_level_drugs[2])

Пример #8
0
        'feature':
        np.asarray(model_features[feature_indices])
    })

    df.to_csv(
        join(model.results_dumper.get_root_folder(),
             "final_model_features{}.csv".format(fold_index)))

    #model.results_dumper.plot_metrics_progress(metrics=PLOT_METRICS)
    print "Best hyperparams: ", model.get_hyperparams(deep=True)


# Run experiment
if __name__ == '__main__':

    drug_name = get_data_keeper().get_possible_second_level_drugs()[2]
    experiment_name_for_drug = get_experiment_name_for_drug(
        "simple_logreg_experiment_", drug_name)

    # create model
    my_model = SequentialModel(name='simpleLR')
    my_model.add(layer=get_linear_model())

    # create metamodel
    my_metamodel = SimpleFeaturesMetamodel()
    my_metamodel.configure_params(inner_model=my_model)

    my_metamodel.set_result_dumper(result_dumper=ResultsDumper(
        experiment_name=experiment_name_for_drug))

    my_metamodel.set_metrics_getter(metrics_getter=MetricsGetter(
                                  get_model_params, \
                                  get_complex_features_adder_wrapper, \
                                  get_nothing_doing_extender_strategy, \
                                  get_frn_params
from common import MAX_EVALS


def get_all_params():
    """Build the parameter structure for the FRN experiment.

    The model parameters are wrapped by ``get_frn_params`` and the result is
    placed as ``inner_model`` inside ``scope.get_complex_features_adder_wrapper``
    together with the nothing-doing extender strategy.
    """
    frn_wrapped = get_frn_params(get_model_params())
    return scope.get_complex_features_adder_wrapper(
        inner_model=frn_wrapped,
        extender_strategy=scope.get_nothing_doing_extender_strategy(),
    )


def run_frn_model(drug):
    """Run the 'frn_model' experiment for ``drug``.

    Returns whatever ``run_experiment`` returns.
    """
    experiment_kwargs = dict(
        params=get_all_params(),
        experiment_name='frn_model',
        drug=drug,
        as_indexes=True,
        max_evals=MAX_EVALS,
    )
    return run_experiment(**experiment_kwargs)


if __name__ == '__main__':
    # The first command-line argument is the position of the drug in the
    # list of second-level drugs.
    second_level_drugs = get_data_keeper().get_possible_second_level_drugs()
    drug_position = int(sys.argv[1])
    run_frn_model(second_level_drugs[drug_position])
Пример #10
0
    # df.columns = [str(result.inner_model), 'pos']
    df.to_csv(join('/home/roma/tb_gwas_experiments/experiments_results', "final_model_features{}.csv".format(fold_index)))

    #model.results_dumper.dump_final_result(model._result_model, model._result_metrics)

    ### calculate and plot model performance metrics
    ### read all files with models, read metrics for train data, calculate and plot metrics for test data

    model.results_dumper.plot_all_metrics()



# Run experiment
if __name__ == '__main__':

    drug_name = get_data_keeper().get_possible_second_level_drugs()[2]
    experiment_name_for_drug = get_experiment_name_for_drug("simple_logreg_model", drug_name)

    # create metamodels to test
    metamodelLR = SimpleMetamodel(experiment_name=experiment_name_for_drug)

    # load data  - in X it will return INDEXES of points for which y exists
    X, y = get_data_keeper().get_train_data(drug_name, as_indexes=False)

    n_folds = 5

    if len(y) < 50:
        n_folds = 10

    #init_common()
                                  get_model_params
from common import MAX_EVALS


def get_all_params():
    """Build the parameter structure for the feature-selector experiment.

    The model parameters are wrapped by ``get_feature_selector_params`` and
    the result is placed as ``inner_model`` inside
    ``scope.get_complex_features_adder_wrapper`` together with the
    nothing-doing extender strategy.
    """
    selector_wrapped = get_feature_selector_params(
        inner_model_params=get_model_params(),
    )
    return scope.get_complex_features_adder_wrapper(
        inner_model=selector_wrapped,
        extender_strategy=scope.get_nothing_doing_extender_strategy(),
    )


def run_selector_model(drug):
    """Run the 'selector_model' experiment for ``drug``.

    Returns whatever ``run_experiment`` returns.
    """
    selector_params = get_all_params()
    outcome = run_experiment(
        params=selector_params,
        experiment_name='selector_model',
        drug=drug,
        as_indexes=True,
        max_evals=MAX_EVALS,
    )
    return outcome


if __name__ == '__main__':
    # The first command-line argument is the position of the drug in the
    # list of second-level drugs. Fixed-index variant used previously:
    #run_selector_model(get_data_keeper().get_possible_second_level_drugs()[int(2)])
    second_level_drugs = get_data_keeper().get_possible_second_level_drugs()
    drug_position = int(sys.argv[1])
    run_selector_model(second_level_drugs[drug_position])
                                  get_model_params, \
                                  get_complex_features_adder_wrapper, \
                                  get_nothing_doing_extender_strategy, \
                                  get_frn_params, \
                                  get_boruta_feature_selector_params, \
                                  get_feature_selector_estimator_params
from common import MAX_EVALS


def get_all_params():
    """Build the parameter structure for the extender-robust experiment.

    The feature-selector estimator parameters are passed as
    ``inner_model_params`` to ``get_simple_feature_adder_wrapper_params``.
    """
    estimator_params = get_feature_selector_estimator_params()
    return get_simple_feature_adder_wrapper_params(
        inner_model_params=estimator_params,
    )


def run_extender_robust_model(drug):
    """Run the 'extender_robust_model' experiment for ``drug``.

    Returns whatever ``run_experiment`` returns.
    """
    experiment_kwargs = dict(
        params=get_all_params(),
        experiment_name='extender_robust_model',
        drug=drug,
        as_indexes=True,
        max_evals=MAX_EVALS,
    )
    return run_experiment(**experiment_kwargs)


if __name__ == '__main__':
    # The first command-line argument is the position of the drug in the
    # list of second-level drugs.
    second_level_drugs = get_data_keeper().get_possible_second_level_drugs()
    drug_position = int(sys.argv[1])
    run_extender_robust_model(second_level_drugs[drug_position])
Пример #13
0
                                  get_model_params, \
                                  get_complex_features_adder_wrapper, \
                                  get_nothing_doing_extender_strategy, \
                                  get_frn_params, \
                                  get_boruta_feature_selector_params, \
                                  get_feature_selector_estimator_params
from common import MAX_EVALS


def get_all_params():
    """Return the simple feature-adder wrapper parameters for this experiment.

    The wrapper receives the feature-selector estimator parameters as its
    ``inner_model_params``.
    """
    return get_simple_feature_adder_wrapper_params(
        inner_model_params=get_feature_selector_estimator_params(),
    )


def run_extender_robust_model(drug):
    """Launch the 'extender_robust_model' experiment for ``drug``.

    Returns whatever ``run_experiment`` returns.
    """
    robust_params = get_all_params()
    outcome = run_experiment(
        params=robust_params,
        experiment_name='extender_robust_model',
        drug=drug,
        as_indexes=True,
        max_evals=MAX_EVALS,
    )
    return outcome


if __name__ == '__main__':
    # Pick the drug by its position (first command-line argument) among the
    # second-level drugs.
    candidates = get_data_keeper().get_possible_second_level_drugs()
    chosen = candidates[int(sys.argv[1])]
    run_extender_robust_model(chosen)
Пример #14
0
def test_model_with_drug(model, drug, metrics, as_indexes, n_folds=10):
    """Load the training data for ``drug`` and evaluate ``model`` on it.

    Returns the result of ``get_testing_metrics`` for that data; no test set
    is passed along.
    """
    train_data = get_data_keeper().get_train_data(drug, as_indexes=as_indexes)
    features, labels = train_data
    return get_testing_metrics(
        model, features, labels, metrics, as_indexes, n_folds)
Пример #15
0
import sys
import time
from multiprocessing import Process
from data_keeper import get_data_keeper
from run_experiment import init_common
from run_model_experiment import run_model
from run_selector_model_experiment import run_selector_model
from run_frn_model_experiment import run_frn_model
from run_extender_selector_model_experiment import run_extender_selector_model
from run_extender_frn_model_experiment import run_extender_frn_model
from run_extender_robust_model_experiment import run_extender_robust_model

if __name__ == "__main__":
    #drug = get_data_keeper().get_possible_second_level_drugs()[int(sys.argv[1])]
    drug = get_data_keeper().get_possible_second_level_drugs()[2]

    start_time = time.time()

    init_common(
    )  # сгенерировать файл с комбинациями признаков (размер файла >4 Гб!!)

    processes = list()
    processes.append(Process(target=run_model, args=(drug, )))
    processes.append(Process(target=run_frn_model, args=(drug, )))
    processes.append(Process(target=run_selector_model, args=(drug, )))
    processes.append(Process(target=run_extender_frn_model, args=(drug, )))
    processes.append(Process(target=run_extender_selector_model,
                             args=(drug, )))
    processes.append(Process(target=run_extender_robust_model, args=(drug, )))
    for process in processes:
Пример #16
0
from hyperparameter_search import get_simple_feature_adder_wrapper_params,\
                                  get_feature_selector_params, \
                                  get_model_params
from common import MAX_EVALS


def get_all_params():
    """Build the parameter structure for the extender-selector experiment.

    Model parameters are wrapped by the feature-selector parameters, which in
    turn are wrapped by the simple feature-adder wrapper parameters.
    """
    # Choice between the parameters of the models: RF, XGB or logistic
    # regression.
    model_choice = get_model_params()
    # Choice between feature-selection models: chi-squared or the k most
    # important features of RF/XGB/logistic regression.
    selector_choice = get_feature_selector_params(
        inner_model_params=model_choice,
    )
    return get_simple_feature_adder_wrapper_params(
        inner_model_params=selector_choice,
    )


def run_extender_selector_model(drug):
    """Run the 'extender_selector_model' experiment for ``drug``.

    Returns whatever ``run_experiment`` returns.
    """
    experiment_kwargs = dict(
        params=get_all_params(),
        experiment_name='extender_selector_model',
        drug=drug,
        as_indexes=True,
        max_evals=MAX_EVALS,
    )
    return run_experiment(**experiment_kwargs)


if __name__ == '__main__':
    # The drug is fixed to position 1 in the list of second-level drugs.
    second_level_drugs = get_data_keeper().get_possible_second_level_drugs()
    run_extender_selector_model(second_level_drugs[1])
Пример #17
0
from data_keeper import get_data_keeper
from sklearn.model_selection import GridSearchCV

from testing import test_models_with_drugs

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV

# NOTE(review): the returned value is discarded; presumably this call is kept
# for a side effect inside the data keeper (or is a notebook leftover) —
# TODO confirm before removing.
get_data_keeper().get_possible_drugs()

import wrappers
from wrappers import GridSearchCVWrapper
from wrappers import XGBoostClassifierFeatureImportances as XGB
from wrappers import MatrixCleaningWrapper
from wrappers import SparseWrapper
from wrappers import ModelFeatureSelectionWrapper
from wrappers import ModelBasedFeatureImportanceGetter
from wrappers import AsMatrixWrapper

from frn import FeatureRelevanceNetworkWrapper

def get_complete_linear_model():
    """Assemble the wrapped logistic-regression model.

    A liblinear ``LogisticRegressionCV`` over ``Cs = 10**-4 .. 10**3`` is
    grid-searched over the ``l1``/``l2`` penalties, then wrapped in
    ``SparseWrapper`` and ``MatrixCleaningWrapper``.
    """
    c_grid = [10 ** exponent for exponent in xrange(-4, 4)]
    logreg_cv = LogisticRegressionCV(Cs=c_grid, solver='liblinear')
    penalty_grid = {'penalty': ['l1', 'l2']}
    penalty_search = GridSearchCV(logreg_cv, penalty_grid)
    return MatrixCleaningWrapper(SparseWrapper(penalty_search))

def get_complete_tree_based_model():
    """Assemble the wrapped tree-based model.

    An ``XGB`` classifier with 100 estimators is placed in a
    ``FeatureRelevanceNetworkWrapper`` whose feature importances come from a
    ``ModelBasedFeatureImportanceGetter`` over a default ``XGB``; the whole
    thing is wrapped in ``MatrixCleaningWrapper``.
    """
    # NOTE: a search grid over 'inner_model__inner_model__n_estimators' and
    # 'feature_selection_threshold_coef' used to be built here, but it was
    # never passed to any search wrapper, so the dead local has been removed.
    # No hyper-parameter search is performed by this function.
    importance_getter = ModelBasedFeatureImportanceGetter(XGB())
    frn_model = FeatureRelevanceNetworkWrapper(
        XGB(n_estimators=100), importance_getter)
    return MatrixCleaningWrapper(frn_model)
Пример #18
0
                                  get_feature_selector_params, \
                                  get_model_params, \
                                  get_complex_features_adder_wrapper, \
                                  get_nothing_doing_extender_strategy, \
                                  get_frn_params
from common import MAX_EVALS


def get_all_params():
    """Build the parameter structure for the extender-FRN experiment.

    The model parameters are wrapped by ``get_frn_params`` and the result is
    passed as ``inner_model_params`` to
    ``get_simple_feature_adder_wrapper_params``.
    """
    frn_wrapped = get_frn_params(get_model_params())
    return get_simple_feature_adder_wrapper_params(
        inner_model_params=frn_wrapped,
    )


def run_extender_frn_model(drug):
    """Run the 'extender_frn_model' experiment for ``drug``.

    Returns whatever ``run_experiment`` returns.
    """
    experiment_kwargs = dict(
        params=get_all_params(),
        experiment_name='extender_frn_model',
        drug=drug,
        as_indexes=True,
        max_evals=MAX_EVALS,
    )
    return run_experiment(**experiment_kwargs)


if __name__ == '__main__':
    # The drug is fixed to position 4 in the list of second-level drugs.
    second_level_drugs = get_data_keeper().get_possible_second_level_drugs()
    run_extender_frn_model(second_level_drugs[4])
Пример #19
0
                                  get_frn_params
from common import MAX_EVALS


def get_all_params():
    """Build the parameter structure for the plain 'model' experiment.

    The model parameters are placed directly as ``inner_model`` inside
    ``scope.get_complex_features_adder_wrapper`` together with the
    nothing-doing extender strategy.
    """
    model_choice = get_model_params()
    return scope.get_complex_features_adder_wrapper(
        inner_model=model_choice,
        extender_strategy=scope.get_nothing_doing_extender_strategy(),
    )


def run_model(drug):
    """Run the 'model' experiment for ``drug``.

    Returns whatever ``run_experiment`` returns.
    """
    model_params = get_all_params()
    outcome = run_experiment(
        params=model_params,
        experiment_name='model',
        drug=drug,
        as_indexes=True,  # alternative that has been tried: as_indexes=False
        max_evals=MAX_EVALS,
    )
    return outcome


if __name__ == '__main__':
    # Earlier invocations worked on second-level drugs instead:
    #run_model(get_data_keeper().get_possible_second_level_drugs()[int(sys.argv[1])])
    #run_model(get_data_keeper().get_possible_second_level_drugs()[int(4)])
    # The first command-line argument is the position of the drug in the
    # list of first-level drugs.
    first_level_drugs = get_data_keeper().get_possible_first_level_drugs()
    drug_position = int(sys.argv[1])
    run_model(first_level_drugs[drug_position])
Пример #20
0
import sys
import time
from multiprocessing import Process
from data_keeper import get_data_keeper
from run_experiment import init_common
from run_model_experiment import run_model
from run_selector_model_experiment import run_selector_model
from run_frn_model_experiment import run_frn_model
from run_extender_selector_model_experiment import run_extender_selector_model
from run_extender_frn_model_experiment import run_extender_frn_model
from run_boruta_model_experiment import run_boruta_model
from run_extender_robust_model_experiment import run_extender_robust_model


if __name__ == "__main__":
    drug = get_data_keeper().get_possible_second_level_drugs()[int(sys.argv[1])]
    init_common()
    processes = list()
    processes.append(Process(target=run_model, args=(drug,)))
    processes.append(Process(target=run_frn_model, args=(drug,)))
    processes.append(Process(target=run_selector_model, args=(drug,)))
    processes.append(Process(target=run_extender_frn_model, args=(drug,)))
    processes.append(Process(target=run_extender_selector_model, args=(drug,)))
    #processes.append(Process(target=run_boruta_model_experiment, args=(drug,)))
    processes.append(Process(target=run_extender_robust_model, args=(drug,)))
    for process in processes:
        process.start()
    for process in processes:
        process.join()
    print "done, ", time.time() - start_time