def model_predict_and_evaluate(self, dataset):
        """Train model with subset of the features and evalaute the performance.
    
    Args:
      - dataset: dataset with subset of temporal features
      
    Returns:
      - performance: performance with subset of temporal features
    """
        # Build model
        pred_class = prediction(self.model_parameters['model_type'],
                                self.model_parameters, self.task)
        # Train the model
        pred_class.fit(dataset)
        # Test the model
        test_y_hat = pred_class.predict(dataset)
        # Extract the labels
        _, _, test_y, _, _ = dataset.get_fold(fold=0, split='test')
        # Evaluate the performance
        temp_performance = Metrics([self.metric_name],
                                   self.metric_parameters).evaluate(
                                       test_y, test_y_hat)
        performance = np.mean(list(temp_performance.values())[0])

        return performance
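
# A minimal standalone sketch of the averaging step above, assuming that
# Metrics.evaluate returns a dict mapping each metric name to an array of
# per-step scores (the exact shape is an assumption, not confirmed here).
import numpy as np

temp_performance_demo = {'auc': np.array([0.81, 0.84, 0.79])}
performance_demo = np.mean(list(temp_performance_demo.values())[0])
print(performance_demo)  # 0.8133...
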
def expected_consistency_selection_ensemble(
        labels,
        class_num,
        target,
        mlset,
        nlset,
        cons_type='must',
        ensemble_methods=_default_ensemble_methods,
        ease_factor=1):
    selected_labels = _expected_consistency_selection(labels,
                                                      mlset,
                                                      nlset,
                                                      cons_type=cons_type,
                                                      ease_factor=ease_factor)
    retVals = []
    retVals.append(ease_factor)
    retVals.append(selected_labels.shape[0])
    print('[INFO] Selected Solutions:' + str(selected_labels.shape[0]))
    for method in ensemble_methods:
        ensemble_labels = _ensemble_method[method](selected_labels,
                                                   N_clusters_max=class_num)
        ensemble_nmi = Metrics.normalized_max_mutual_info_score(
            ensemble_labels, target)
        retVals.append(ensemble_nmi)
        print('[INFO] Ensemble Method:' + method)
        print('[INFO] Performance:' + str(ensemble_nmi))
    return retVals
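
# Hedged usage sketch (lib_labels, true_labels, ml and cl are hypothetical
# names for a label matrix, a ground-truth array, and constraint sets):
#
#   results = expected_consistency_selection_ensemble(
#       lib_labels, class_num=3, target=true_labels,
#       mlset=ml, nlset=cl, cons_type='must', ease_factor=0.9)
#   # results == [ease_factor, n_selected, nmi_of_method_1, nmi_of_method_2, ...]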
Example #3
def comparison_ensemble_methods(dataset_name, library_name, eval_method=None):
    """
    get the performance of comparison methods (ensemble)

    :param dataset_name:
    :param library_name:
    :param eval_method:
    :return:
    """
    filename = _default_eval_path + dataset_name + '_ensemble_eval_' + time.strftime(
        '%Y-%m-%d_%H_%M_%S', time.localtime(time.time())) + '.csv'
    lib = np.loadtxt(_default_library_path + dataset_name + '/' +
                     library_name + '.res',
                     delimiter=',')
    with open(filename, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([library_name])
        data, targets = exd.dataset[dataset_name]['data']()
        k = exd.dataset[dataset_name]['k']
        eval_methods = _default_ensemble_eval_methods if eval_method is None else eval_method
        print('[Ensemble Comparison]: Dataset: ' + str(dataset_name))
        print('[Ensemble Comparison]: Library: ' + str(library_name))
        print('[Ensemble Comparison]: Comparison Methods: ' + str(eval_methods))
        print('[Ensemble Comparison]: Real k is ' + str(k))
        for method in eval_methods:
            ensemble_label = _ensemble_methods[method](lib, N_clusters_max=k)
            performance = metrics.normalized_max_mutual_info_score(
                targets, ensemble_label)
            writer.writerow([method, str(performance)])
    return
Example #4
def plot_consistency(labels, pos, mlset, nlset, savepath, consistency_type='both'):
    """
    plot consistency distribution of given library

    Parameters
    ----------
    :param labels:
    :param pos:
    :param mlset:
    :param nlset:
    :param savepath:
    :param consistency_type:
    """
    texts = []
    colors = []
    plot_labels = [None] * (len(labels) - _ADDITIONAL_RANGE)
    markers = ['o'] * (len(labels) - _ADDITIONAL_RANGE)
    for label in labels[0:-_ADDITIONAL_RANGE]:
        cons = Metrics.consistency(label, mlset, nlset, cons_type=consistency_type)
        texts.append(cons)
    cNorm = colors2.Normalize(vmin=min(texts), vmax=max(texts))
    scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=plt.get_cmap('CMRmap'))
    plot_labels.extend(_ADDITIONAL_NAMES)
    for text in texts:
        colors.append(scalarMap.to_rgba(text))
    texts = list(map(_round_digits, texts))
    texts.append('')
    texts.extend(_ADDITIONAL_NAMES[1:])
    colors.extend(_ADDITIONAL_COLORS)
    markers.extend(_ADDITIONAL_MARKERS)
    title = consistency_type + ' Consistency, Max val = ' + str(max(texts[0:-_ADDITIONAL_RANGE])) +\
            ', Min val = ' + str(min(texts[0:-_ADDITIONAL_RANGE]))
    _plot_generalized_scatter(pos, colors, texts, markers, plot_labels, savepath, title=title)
    return
Example #5
def plot_nmi_max(labels, pos, savepath):
    """
    plot nmi_max distribution of given library

    Parameters
    ----------
    :param labels:
    :param pos:
    :param savepath:
    """
    texts = []
    colors = []
    plot_labels = [None] * (len(labels) - _ADDITIONAL_RANGE)
    markers = ['o'] * (len(labels) - _ADDITIONAL_RANGE)
    for label in labels[0:-1]:
        cons = Metrics.normalized_max_mutual_info_score(label, labels[-1])
        texts.append(cons)
    cNorm = colors2.Normalize(vmin=min(texts[0:-_ADDITIONAL_RANGE+1]), vmax=max(texts[0:-_ADDITIONAL_RANGE+1]))
    scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=plt.get_cmap('CMRmap'))
    plot_labels.extend(_ADDITIONAL_NAMES)
    for text in texts[0:-_ADDITIONAL_RANGE+1]:
        colors.append(scalarMap.to_rgba(text))
    texts = list(map(_round_digits, texts))
    texts.extend(_ADDITIONAL_NAMES[-1:])
    colors.extend(_ADDITIONAL_COLORS)
    markers.extend(_ADDITIONAL_MARKERS)
    title = 'NMI distribution, Max val = ' + str(max(texts[0:-_ADDITIONAL_RANGE])) +\
            ', Min val = ' + str(min(texts[0:-_ADDITIONAL_RANGE]))
    _plot_generalized_scatter(pos, colors, texts, markers, plot_labels, savepath, title=title)
    return
Example #6
def evaluate_library(name,
                     path,
                     class_num,
                     target,
                     evaluate_methods=_default_evaluate_methods):
    """
    do evaluation for a given library

    :param name: name of the library
    :param path: path where the library is
    :param class_num: #real_classes
    :param target: real class label
    :param evaluate_methods: consensus functions used for evaluation

    Return
    ------
    :return: score of all consensus functions in a list
    """
    labels = np.loadtxt(path + name, delimiter=',')
    if not name.endswith('_pure.res'):
        labels = labels[0:-5]
    scores = []
    for method in evaluate_methods:
        ensemble_label = _ensemble_method[method](labels,
                                                  N_clusters_max=class_num)
        scores.append(
            Metrics.normalized_max_mutual_info_score(target, ensemble_label))
    return scores
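
# Hedged usage sketch (the path and file name are illustrative, and
# true_labels is a hypothetical ground-truth array); scores come back in the
# same order as evaluate_methods:
#
#   scores = evaluate_library('wine_lib_pure.res', 'Results/wine/',
#                             class_num=3, target=true_labels)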
def do_propagation_ensemble(library_folder,
                            library_name,
                            class_num,
                            target,
                            constraint_file,
                            logger,
                            alphas,
                            have_zero=True,
                            ensemble_method=_default_ensemble_method):
    logger.debug(
        '==========================================================================================='
    )
    logger.debug('-----------------Propagation Ensemble for library:' +
                 str(library_name) + '----------------')
    logger.debug('-----------------Have zero type = ' + str(have_zero) +
                 '-----------------------------------')
    logger.debug('-----------------Constraint File name = ' + constraint_file +
                 '----------------------------')

    labels = np.loadtxt(library_folder + library_name + '.res', delimiter=',')
    labels = labels.astype(int)

    ml, cl = io_func.read_constraints(constraint_file)

    hyperedges = ce.build_hypergraph_adjacency(labels)
    hyperedges = hyperedges.transpose()

    coas_matrix = hyperedges.dot(hyperedges.transpose())
    coas_matrix = np.squeeze(np.asarray(coas_matrix.todense()))
    coas_matrix = coas_matrix.astype(np.float32)
    coas_matrix /= np.max(coas_matrix)

    print(coas_matrix)

    nmis = []
    for alpha in alphas:
        logger.debug(
            '-------------------------->>>>>> PARAM START <<<<<<<---------------------------------'
        )
        propagated_coas_matrix = propagation_on_coassociation_matrix(
            coas_matrix, ml, cl, alpha)
        cur_nmis = []
        for method in ensemble_method:
            ensemble_label = _ensemble_method[method](propagated_coas_matrix,
                                                      labels.shape[0],
                                                      class_num)
            ensemble_nmi = Metrics.normalized_max_mutual_info_score(
                ensemble_label, target)
            logger.debug(method + ' alpha=' + str(alpha) + ', NMI=' +
                         str(ensemble_nmi))
            cur_nmis.append(ensemble_nmi)
        nmis.append(cur_nmis)
        logger.debug(
            '------------------------->>>>>> END OF THIS PARAM <<<<<<------------------------------'
        )
    logger.debug(
        '==========================================================================================='
    )
    return nmis
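
# A self-contained toy version of the co-association step above, using plain
# numpy in place of ce.build_hypergraph_adjacency (whose internals are not
# shown here). Each cluster of each clustering becomes one binary hyperedge;
# entry (i, j) of the product counts how many clusterings put i and j together.
import numpy as np

labels_demo = np.array([[0, 0, 1, 1],    # clustering 1 over 4 instances
                        [0, 1, 1, 1]])   # clustering 2
hyperedges_demo = np.array([(row == c).astype(np.float32)
                            for row in labels_demo
                            for c in np.unique(row)])
coas_demo = hyperedges_demo.T.dot(hyperedges_demo)  # (n_instances, n_instances)
coas_demo /= np.max(coas_demo)                      # normalize as in the code above
print(coas_demo)
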
Example #8
def evalparser(path='./examples', report=False):
    """ Test the parsing performance

    :type path: string
    :param path: path to the evaluation data

    :type report: boolean
    :param report: whether to report (calculate) the f1 score
    """
    from os import listdir
    from os.path import join as joinpath
    # ----------------------------------------
    # Load the parsing model
    pm = ParsingModel()
    pm.loadmodel("parsing-model.pickle.gz")
    # ----------------------------------------
    # Evaluation
    met = Metrics(levels=['span','nuclearity','relation'])
    # ----------------------------------------
    # Read all files from the given path
    doclist = [joinpath(path, fname) for fname in listdir(path) if fname.endswith('.edus')]
    for fedus in doclist:
        # ----------------------------------------
        # Parsing
        fpos = fedus + ".pos"
        d_pos = get_d_pos(fpos)
        fdep = fedus + ".dep"
        d_dep = get_d_dep(fdep)
        pred_rst = parse(pm, fedus=fedus, d_pos=d_pos, d_dep=d_dep)
        # Get brackets from parsing results
        pred_brackets = pred_rst.bracketing()
        fbrackets = fedus.replace('edus', 'brackets')
        writebrackets(fbrackets, pred_brackets)
        # ----------------------------------------
        # Evaluate with gold RST tree
        if report:
            fdis = fedus.replace('edus', 'dis')
            gold_rst = RSTTree(fname=fdis)
            gold_rst.build()
            gold_brackets = gold_rst.bracketing()
            met.eval(gold_rst, pred_rst)
    if report:
        met.report()
Example #9
def evalparser(path='./examples',
               report=False,
               bcvocab=None,
               draw=True,
               withdp=False,
               fdpvocab=None,
               fprojmat=None):
    """ Test the parsing performance

    :type path: string
    :param path: path to the evaluation data

    :type report: boolean
    :param report: whether to report (calculate) the f1 score
    """
    # ----------------------------------------
    # Load the parsing model
    print('Load parsing model ...')
    pm = ParsingModel(withdp=withdp, fdpvocab=fdpvocab, fprojmat=fprojmat)
    pm.loadmodel("model/parsing-model.pickle.gz")
    # ----------------------------------------
    # Evaluation
    met = Metrics(levels=['span', 'nuclearity', 'relation'])
    # ----------------------------------------
    # Read all files from the given path
    existing_files = [
        ".".join(fname.split(".")[:-1]) for fname in listdir(path)
        if fname.endswith('.brackets')
    ]
    all_files = [
        ".".join(fname.split(".")[:-1]) for fname in listdir(path)
        if fname.endswith('.merge')
    ]
    todo_files = list(set(all_files) - set(existing_files))
    doclist = [joinpath(path, fname + '.merge') for fname in todo_files]
    print("TODO files len:")
    print(len(doclist))
    print(doclist[0])
    # declare the globals before assigning to them (assigning first is a
    # syntax error in Python 3)
    global global_pm, global_bv
    global_pm = pm
    global_bv = bcvocab
    eval_parser_unit(doclist[0])
    cnt = multiprocessing.cpu_count()

    pool = multiprocessing.Pool(processes=cnt)

    pool.map(eval_parser_unit, doclist)
    pool.close()
    pool.join()
    """
Example #10
def plot_normalized_consistency(labels, mlset, nlset, savepath, additional_values):
    """
    plot correlations between must and cannot consistency of given library

    Parameters
    ----------
    :param labels:
    :param mlset:
    :param nlset:
    :param savepath:
    :param additional_values:
    """
    texts = additional_values
    colors = []
    plot_labels = [None] * (len(labels) - _ADDITIONAL_RANGE)
    markers = ['o'] * (len(labels) - _ADDITIONAL_RANGE)
    cNorm = colors2.Normalize(vmin=min(texts), vmax=max(texts))
    scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=plt.get_cmap('CMRmap'))
    plot_labels.extend(_ADDITIONAL_NAMES)
    for text in texts:
        colors.append(scalarMap.to_rgba(text))
    title = 'Must-Cannot Correlation'
    must_consistencies = []
    cannot_consistencies = []
    for label in labels[0:-5]:
        must_cons = Metrics.consistency(label, mlset, nlset, cons_type='must')
        cannot_cons = Metrics.consistency(label, mlset, nlset, cons_type='cannot')
        must_consistencies.append(must_cons)
        cannot_consistencies.append(cannot_cons)
    scaler = preprocessing.MinMaxScaler()
    must_consistencies = scaler.fit_transform(np.array(must_consistencies).reshape(-1, 1))
    cannot_consistencies = scaler.fit_transform(np.array(cannot_consistencies).reshape(-1, 1))
    pos = np.hstack((np.array(must_consistencies), np.array(cannot_consistencies)))
    _plot_generalized_scatter(pos, colors, texts, markers, plot_labels, savepath, title=title,
                              xlabel='Must consistency', ylabel='Cannot consistency', legend_need=False)
    return
Example #11
def evalparser(path='./examples', report=False, 
               bcvocab=None, draw=True,
               withdp=False, fdpvocab=None, fprojmat=None):
    """ Test the parsing performance

    :type path: string
    :param path: path to the evaluation data

    :type report: boolean
    :param report: whether to report (calculate) the f1 score
    """
    # ----------------------------------------
    # Load the parsing model
    print('Load parsing model ...')
    pm = ParsingModel(withdp=withdp,
        fdpvocab=fdpvocab, fprojmat=fprojmat)
    pm.loadmodel("model/parsing-model.pickle.gz")
    # ----------------------------------------
    # Evaluation
    met = Metrics(levels=['span','nuclearity','relation'])
    # ----------------------------------------
    # Read all files from the given path
    doclist = [joinpath(path, fname) for fname in listdir(path) if fname.endswith('.merge')]
    for fmerge in doclist:
        # ----------------------------------------
        # Read *.merge file
        dr = DocReader()
        doc = dr.read(fmerge)
        # ----------------------------------------
        # Parsing
        pred_rst = pm.sr_parse(doc, bcvocab)
        if draw:
            strtree = pred_rst.parse()
            drawrst(strtree, fmerge.replace(".merge",".ps"))
        # Get brackets from parsing results
        pred_brackets = pred_rst.bracketing()
        fbrackets = fmerge.replace('.merge', '.brackets')
        # Write brackets into file
        writebrackets(fbrackets, pred_brackets)
        # ----------------------------------------
        # Evaluate with gold RST tree
        if report:
            fdis = fmerge.replace('.merge', '.dis')
            gold_rst = RSTTree(fdis, fmerge)
            gold_rst.build()
            gold_brackets = gold_rst.bracketing()
            met.eval(gold_rst, pred_rst)
    if report:
        met.report()
Example #12
def _expected_consistency_selection(labels,
                                    mlset,
                                    nlset,
                                    cons_type='',
                                    ease_factor=1):
    n_solutions = labels.shape[0]
    k_values = []
    cons = []
    final_idx = np.array([False] * n_solutions)
    for label in labels:
        cons.append(
            Metrics.consistency(label, mlset, nlset, cons_type=cons_type))
        k_values.append(len(np.unique(label)))
    cons = np.array(cons)
    k_values = np.array(k_values, dtype=int)
    possible_k = np.unique(k_values)
    for k in possible_k:
        mean_value = np.mean(cons[k_values == k])
        idx = np.logical_and(cons >= mean_value * ease_factor, k_values == k)
        final_idx = np.logical_or(final_idx, idx)
    return labels[final_idx]
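
# A self-contained toy run of the selection rule above: for each distinct k,
# keep the solutions whose consistency reaches mean * ease_factor (here 1.0).
# cons_demo and k_demo are made-up values for illustration.
import numpy as np

cons_demo = np.array([0.2, 0.8, 0.5, 0.9])   # consistency of 4 solutions
k_demo = np.array([2, 2, 3, 3])              # their cluster counts
keep = np.zeros(cons_demo.shape[0], dtype=bool)
for k in np.unique(k_demo):
    mean_value = np.mean(cons_demo[k_demo == k])
    keep |= (cons_demo >= mean_value * 1.0) & (k_demo == k)
print(keep)  # [False  True False  True] -> only the above-average solution per k
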
Example #13
def evalparser(path='./examples', report=False):
    """ Test the parsing performance

    :type path: string
    :param path: path to the evaluation data

    :type report: boolean
    :param report: whether to report (calculate) the f1 score
    """
    from os import listdir
    from os.path import join as joinpath
    # ----------------------------------------
    # Load the parsing model
    pm = ParsingModel()
    pm.loadmodel("parsing-model.pickle.gz")
    # ----------------------------------------
    # Evaluation
    met = Metrics(levels=['span', 'nuclearity', 'relation'])
    # ----------------------------------------
    # Read all files from the given path
    doclist = [
        joinpath(path, fname) for fname in listdir(path)
        if fname.endswith('.edus')
    ]
    for fedus in doclist:
        # ----------------------------------------
        # Parsing
        pred_rst = parse(pm, fedus=fedus)
        # Get brackets from parsing results
        # print(fedus)
        # Write the predicted tree to a file (closed via the context manager)
        with open("test.dis", "w") as fout:
            fout.write(str(pred_rst))
        # pred_brackets = pred_rst.bracketing()
        # fbrackets = fedus.replace('edus', 'brackets')
        # writebrackets(fbrackets, pred_brackets)
        # ----------------------------------------
        # Evaluate with gold RST tree
        if report:
            fdis = fedus.replace('edus', 'dis')
            gold_rst = RSTTree(fname=fdis)
            gold_rst.build()
            gold_brackets = gold_rst.bracketing()
            met.eval(gold_rst, pred_rst)
    if report:
        met.report()
Example #14
def plot_k_consistency_distribution(labels, mlset, nlset, savepath, pure=True, cons_type='must'):
    k_value = []
    if not pure:
        labels = labels[0:-5]
    for label in labels:
        cons = len(np.unique(label))
        k_value.append(cons)

    texts = [''] * len(labels)
    plot_labels = [None] * len(labels)
    markers = ['x'] * len(labels)
    colors = ['blue'] * len(labels)
    title = 'k-'+cons_type+' consistency Correlation'

    consistencies = []
    for label in labels:
        cons = Metrics.consistency(label, mlset, nlset, cons_type=cons_type)
        consistencies.append(cons)
    pos = np.hstack((np.array(k_value).reshape(-1, 1), np.array(consistencies).reshape(-1, 1)))
    print(pos.shape)

    _plot_generalized_scatter(pos, colors, texts, markers, plot_labels, savepath, title=title,
                              xlabel='k', ylabel='consistency', legend_need=False)
    return
Example #15
def main(args):
    '''Main function for AutoML in time-series predictions.
  
  Args:
    - data loading parameters:
      - data_names: mimic, ward, cf    
      
    - preprocess parameters: 
      - normalization: minmax, standard, None
      - one_hot_encoding: input features that need to be one-hot encoded
      - problem: 'one-shot' or 'online'
        - 'one-shot': one time prediction at the end of the time-series 
        - 'online': prediction at every time stamp of the time-series
      - max_seq_len: maximum sequence length after padding
      - label_name: the column name for the label(s)
      - treatment: the column name for treatments
      
    - imputation parameters: 
      - static_imputation_model: mean, median, mice, missforest, knn, gain
      - temporal_imputation_model: mean, median, linear, quadratic, cubic, spline, mrnn, tgain
            
    - feature selection parameters:
      - feature_selection_model: greedy-addition, greedy-deletion, recursive-addition, recursive-deletion, None
      - feature_number: selected feature number
      
    - predictor_parameters:
      - epochs: number of epochs
      - bo_itr: bayesian optimization iterations
      - static_mode: how to utilize static features (concatenate or None)
      - time_mode: how to utilize time information (concatenate or None)
      - task: classification or regression
      
    - metric_name: auc, apr, mae, mse
  '''
    #%% Step 0: Set basic parameters
    metric_sets = [args.metric_name]
    metric_parameters = {
        'problem': args.problem,
        'label_name': [args.label_name]
    }

    #%% Step 1: Upload Dataset
    # File names
    data_directory = '../datasets/data/' + args.data_name + '/' + args.data_name + '_'

    data_loader_training = CSVLoader(
        static_file=data_directory + 'static_train_data.csv.gz',
        temporal_file=data_directory + 'temporal_train_data_eav.csv.gz')

    data_loader_testing = CSVLoader(
        static_file=data_directory + 'static_test_data.csv.gz',
        temporal_file=data_directory + 'temporal_test_data_eav.csv.gz')

    dataset_training = data_loader_training.load()
    dataset_testing = data_loader_testing.load()

    print('Finish data loading.')

    #%% Step 2: Preprocess Dataset
    # (0) filter out negative values (Automatically)
    negative_filter = FilterNegative()
    # (1) one-hot encode categorical features
    onehot_encoder = OneHotEncoder(
        one_hot_encoding_features=[args.one_hot_encoding])
    # (2) Normalize features: 3 options (minmax, standard, none)
    normalizer = Normalizer(args.normalization)

    filter_pipeline = PipelineComposer(negative_filter, onehot_encoder,
                                       normalizer)

    dataset_training = filter_pipeline.fit_transform(dataset_training)
    dataset_testing = filter_pipeline.transform(dataset_testing)

    print('Finish preprocessing.')

    #%% Step 3: Define Problem
    problem_maker = ProblemMaker(problem=args.problem,
                                 label=[args.label_name],
                                 max_seq_len=args.max_seq_len,
                                 treatment=[args.treatment])

    dataset_training = problem_maker.fit_transform(dataset_training)
    dataset_testing = problem_maker.fit_transform(dataset_testing)

    print('Finish defining problem.')

    #%% Step 4: Impute Dataset
    static_imputation = Imputation(
        imputation_model_name=args.static_imputation_model, data_type='static')
    temporal_imputation = Imputation(
        imputation_model_name=args.temporal_imputation_model,
        data_type='temporal')

    imputation_pipeline = PipelineComposer(static_imputation,
                                           temporal_imputation)

    dataset_training = imputation_pipeline.fit_transform(dataset_training)
    dataset_testing = imputation_pipeline.transform(dataset_testing)

    print('Finish imputation.')

    #%% Step 5: Feature selection (4 options)
    static_feature_selection = \
    FeatureSelection(feature_selection_model_name = args.static_feature_selection_model,
                     feature_type = 'static', feature_number = args.static_feature_selection_number,
                     task = args.task, metric_name = args.metric_name,
                     metric_parameters = metric_parameters)

    temporal_feature_selection = \
    FeatureSelection(feature_selection_model_name = args.temporal_feature_selection_model,
                     feature_type = 'temporal', feature_number = args.temporal_feature_selection_number,
                     task = args.task, metric_name = args.metric_name,
                     metric_parameters = metric_parameters)

    feature_selection_pipeline = PipelineComposer(static_feature_selection,
                                                  temporal_feature_selection)

    dataset_training = feature_selection_pipeline.fit_transform(
        dataset_training)
    dataset_testing = feature_selection_pipeline.transform(dataset_testing)

    print('Finish feature selection.')

    #%% Step 6: Bayesian Optimization
    ## Model define

    model_parameters = {
        'projection_horizon': 5,
        'static_mode': 'concatenate',
        'time_mode': 'concatenate'
    }

    crn_model = CRN_Model(task=args.task)
    crn_model.set_params(**model_parameters)

    model_class = crn_model

    # train_validate split
    dataset_training.train_val_test_split(prob_val=0.2, prob_test=0.2)

    # Bayesian Optimization Start
    metric = BOMetric(metric='auc', fold=0, split='test')

    # Run BO for selected model class
    BO_model = AutoTS(dataset_training, model_class, metric)
    models, bo_score = BO_model.training_loop(num_iter=2)
    auto_ens_model = AutoEnsemble(models, bo_score)

    # Prediction
    assert not dataset_testing.is_validation_defined
    test_y_hat = auto_ens_model.predict(dataset_testing, test_split='test')
    test_y = dataset_testing.label

    print('Finish AutoML model training and testing.')

    #%% Step 7: Visualize Results
    idx = np.random.permutation(len(test_y_hat))[:2]

    # Evaluate predictor model
    result = Metrics(metric_sets,
                     metric_parameters).evaluate(test_y, test_y_hat)
    print('Finish predictor model evaluation.')

    # Visualize the output
    # (1) Performance
    print('Overall performance')
    print_performance(result, metric_sets, metric_parameters)
    # (2) Predictions
    print('Each prediction')
    print_prediction(test_y_hat[idx], metric_parameters)

    return
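
# Editor's sketch of the argument object this main() expects, assuming it is
# normally built with argparse. Field names are taken from the args.*
# references above; the column names ('death', 'ventilator', 'admission_type')
# and numeric values are illustrative placeholders, not values from the source.
from types import SimpleNamespace

args_demo = SimpleNamespace(
    data_name='mimic',                      # mimic, ward, cf
    normalization='minmax',                 # minmax, standard, None
    one_hot_encoding='admission_type',      # hypothetical categorical column
    problem='one-shot',                     # 'one-shot' or 'online'
    max_seq_len=24,
    label_name='death',                     # hypothetical label column
    treatment='ventilator',                 # hypothetical treatment column
    static_imputation_model='median',
    temporal_imputation_model='median',
    static_feature_selection_model=None,
    static_feature_selection_number=10,
    temporal_feature_selection_model=None,
    temporal_feature_selection_number=10,
    task='classification',
    metric_name='auc')
# main(args_demo)  # would run the full pipeline against ../datasets/data/mimic/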
Example #16
def main(args):
    '''Main function for individual treatment effect estimation.

  Args:
    - data loading parameters:
      - data_names: mimic, ward, cf, mimic_antibiotics

    - preprocess parameters:
      - normalization: minmax, standard, None
      - one_hot_encoding: input features that need to be one-hot encoded
      - problem: 'online'
        - 'online': prediction at every time stamp of the time-series
      - max_seq_len: maximum sequence length after padding
      - label_name: the column name for the label(s)
      - treatment: the column name for treatments

    - imputation parameters:
      - static_imputation_model: mean, median, mice, missforest, knn, gain
      - temporal_imputation_model: mean, median, linear, quadratic, cubic, spline, mrnn, tgain

    - feature selection parameters:
      - feature_selection_model: greedy-addition, greedy-deletion, recursive-addition, recursive-deletion, None
      - feature_number: selected feature number

    - treatment effects model parameters:
      - model_name: CRN, RMSN, GANITE
      Each model has different types of hyperparameters that need to be set.

        - Parameters needed for the Counterfactual Recurrent Network (CRN):
          - hyperparameters for encoder:
              - rnn_hidden_units: hidden dimensions in the LSTM unit
              - rnn_keep_prob: keep probability used for variational dropout in the LSTM unit
              - br_size: size of the balancing representation
              - fc_hidden_units: hidden dimensions of the fully connected layers used for treatment classifier and predictor
              - batch_size: number of samples in mini-batch
              - num_epochs: number of epochs
              - learning_rate: learning rate
              - max_alpha: alpha controls the trade-off between building treatment invariant representations (domain
                discrimination) and being able to predict outcomes (outcome prediction); during training, CRN uses an
                exponentially increasing schedule for alpha from 0 to max_alpha.
          - hyperparameters for decoder:
              - the decoder requires the same hyperparameters as the encoder with the exception of the rnn_hidden_units
                which is set to be equal to the br_size of the encoder

        - Parameters for Recurrent Marginal Structural Networks (RMSN):
            - hyperparameters for encoder:
                - dropout_rate: dropout probability used for variational dropout in the LSTM unit
                - rnn_hidden_units: hidden dimensions in the LSTM unit
                - batch_size: number of samples in mini-batch
                - num_epochs: number of epochs
                - learning_rate: learning rate
                - max_norm: max gradient norm used for gradient clipping during training
            - hyperparameters for decoder:
                - the decoder requires the same hyperparameters as the encoder.
            - model_dir: directory where the model is saved
            - model_name: name of the saved model

        - Parameters for GANITE:
          - batch_size: number of samples in mini-batch
          - alpha: parameter trading off between discriminator loss and supervised loss for the generator training
          - learning_rate: learning rate
          - hidden_units: hidden dimensions of the fully connected layers used in the networks
          - stack_dim: number of timesteps to stack

        All models have the following common parameters:
          - static_mode: how to utilize static features (concatenate or None)
          - time_mode: how to utilize time information (concatenate or None)
          - task: 'classification' or 'regression'


    - metric_name: auc, apr, mae, mse (used for factual prediction)
    - patient id: patient for which counterfactual trajectories are computed
    - timestep: timestep in patient trajectory for estimating counterfactuals
  '''
    # %% Step 0: Set basic parameters
    metric_sets = [args.metric_name]
    metric_parameters = {
        'problem': args.problem,
        'label_name': [args.label_name]
    }

    # %% Step 1: Upload Dataset
    # File names
    data_directory = '../datasets/data/' + args.data_name + '/' + args.data_name + '_'

    data_loader_training = CSVLoader(
        static_file=data_directory + 'static_train_data.csv.gz',
        temporal_file=data_directory + 'temporal_train_data_eav.csv.gz')

    data_loader_testing = CSVLoader(
        static_file=data_directory + 'static_test_data.csv.gz',
        temporal_file=data_directory + 'temporal_test_data_eav.csv.gz')

    dataset_training = data_loader_training.load()
    dataset_testing = data_loader_testing.load()

    print('Finish data loading.')

    # %% Step 2: Preprocess Dataset
    # (0) filter out negative values (Automatically)
    negative_filter = FilterNegative()
    # (1) one-hot encode categorical features
    onehot_encoder = OneHotEncoder(
        one_hot_encoding_features=[args.one_hot_encoding])
    # (2) Normalize features: 3 options (minmax, standard, none)
    normalizer = Normalizer(args.normalization)

    filter_pipeline = PipelineComposer(negative_filter, onehot_encoder,
                                       normalizer)

    dataset_training = filter_pipeline.fit_transform(dataset_training)
    dataset_testing = filter_pipeline.transform(dataset_testing)

    print('Finish preprocessing.')

    # %% Step 3: Define Problem
    problem_maker = ProblemMaker(problem=args.problem,
                                 label=[args.label_name],
                                 max_seq_len=args.max_seq_len,
                                 treatment=[args.treatment])

    dataset_training = problem_maker.fit_transform(dataset_training)
    dataset_testing = problem_maker.fit_transform(dataset_testing)

    print('Finish defining problem.')

    # %% Step 4: Impute Dataset
    static_imputation = Imputation(
        imputation_model_name=args.static_imputation_model, data_type='static')
    temporal_imputation = Imputation(
        imputation_model_name=args.temporal_imputation_model,
        data_type='temporal')

    imputation_pipeline = PipelineComposer(static_imputation,
                                           temporal_imputation)

    dataset_training = imputation_pipeline.fit_transform(dataset_training)
    dataset_testing = imputation_pipeline.transform(dataset_testing)

    print('Finish imputation.')

    # %% Step 5: Feature selection (4 options)
    static_feature_selection = \
      FeatureSelection(feature_selection_model_name=args.static_feature_selection_model,
                       feature_type='static', feature_number=args.static_feature_selection_number,
                       task=args.task, metric_name=args.metric_name,
                       metric_parameters=metric_parameters)

    temporal_feature_selection = \
      FeatureSelection(feature_selection_model_name=args.temporal_feature_selection_model,
                       feature_type='temporal', feature_number=args.temporal_feature_selection_number,
                       task=args.task, metric_name=args.metric_name,
                       metric_parameters=metric_parameters)

    feature_selection_pipeline = PipelineComposer(static_feature_selection,
                                                  temporal_feature_selection)

    dataset_training = feature_selection_pipeline.fit_transform(
        dataset_training)
    dataset_testing = feature_selection_pipeline.transform(dataset_testing)

    print('Finish feature selection.')

    # %% Step 6: Fit treatment effects (3 options)
    # Set the validation data for best model saving
    dataset_training.train_val_test_split(prob_val=0.2, prob_test=0.0)

    # Set the treatment effects model
    model_name = args.model_name

    # Set treatment effects model parameters
    if model_name == 'CRN':
        model_parameters = {
            'encoder_rnn_hidden_units': args.crn_encoder_rnn_hidden_units,
            'encoder_br_size': args.crn_encoder_br_size,
            'encoder_fc_hidden_units': args.crn_encoder_fc_hidden_units,
            'encoder_learning_rate': args.crn_encoder_learning_rate,
            'encoder_batch_size': args.crn_encoder_batch_size,
            'encoder_keep_prob': args.crn_encoder_keep_prob,
            'encoder_num_epochs': args.crn_encoder_num_epochs,
            'encoder_max_alpha': args.crn_encoder_max_alpha,
            'decoder_br_size': args.crn_decoder_br_size,
            'decoder_fc_hidden_units': args.crn_decoder_fc_hidden_units,
            'decoder_learning_rate': args.crn_decoder_learning_rate,
            'decoder_batch_size': args.crn_decoder_batch_size,
            'decoder_keep_prob': args.crn_decoder_keep_prob,
            'decoder_num_epochs': args.crn_decoder_num_epochs,
            'decoder_max_alpha': args.crn_decoder_max_alpha,
            'projection_horizon': args.projection_horizon,
            'static_mode': args.static_mode,
            'time_mode': args.time_mode
        }
        treatment_model = treatment_effects_model(model_name,
                                                  model_parameters,
                                                  task='classification')
        treatment_model.fit(dataset_training)

    elif model_name == 'RMSN':
        hyperparams_encoder_iptw = {
            'dropout_rate': args.rmsn_encoder_dropout_rate,
            'memory_multiplier': args.rmsn_encoder_memory_multiplier,
            'num_epochs': args.rmsn_encoder_num_epochs,
            'batch_size': args.rmsn_encoder_batch_size,
            'learning_rate': args.rmsn_encoder_learning_rate,
            'max_norm': args.rmsn_encoder_max_norm
        }

        hyperparams_decoder_iptw = {
            'dropout_rate': args.rmsn_decoder_dropout_rate,
            'memory_multiplier': args.rmsn_decoder_memory_multiplier,
            'num_epochs': args.rmsn_decoder_num_epochs,
            'batch_size': args.rmsn_decoder_batch_size,
            'learning_rate': args.rmsn_decoder_learning_rate,
            'max_norm': args.rmsn_decoder_max_norm
        }

        model_parameters = {
            'hyperparams_encoder_iptw': hyperparams_encoder_iptw,
            'hyperparams_decoder_iptw': hyperparams_decoder_iptw,
            'model_dir': args.rmsn_model_dir,
            'model_name': args.rmsn_model_name,
            'static_mode': args.static_mode,
            'time_mode': args.time_mode
        }

        treatment_model = treatment_effects_model(model_name,
                                                  model_parameters,
                                                  task='classification')
        treatment_model.fit(dataset_training,
                            projection_horizon=args.projection_horizon)

    elif model_name == 'GANITE':
        hyperparams = {
            'batch_size': args.ganite_batch_size,
            'alpha': args.ganite_alpha,
            'hidden_dims': args.ganite_hidden_dims,
            'learning_rate': args.ganite_learning_rate
        }

        model_parameters = {
            'hyperparams': hyperparams,
            'stack_dim': args.ganite_stack_dim,
            'static_mode': args.static_mode,
            'time_mode': args.time_mode
        }

        treatment_model = treatment_effects_model(model_name,
                                                  model_parameters,
                                                  task='classification')
        treatment_model.fit(dataset_training)

    test_y_hat = treatment_model.predict(dataset_testing)

    print('Finish treatment effects model training and testing.')

    # %% Step 7: Visualize Results

    # Evaluate predictor model
    result = Metrics(metric_sets,
                     metric_parameters).evaluate(dataset_testing.label,
                                                 test_y_hat)
    print('Finish predictor model evaluation.')

    # Visualize the output
    # (1) Performance on estimating factual outcomes
    print('Overall performance on estimating factual outcomes')
    print_performance(result, metric_sets, metric_parameters)

    # (2) Counterfactual trajectories
    print('Counterfactual trajectories')
    if model_name in ['CRN', 'RMSN']:
        # Predict and visualize counterfactuals for the sequence of treatments indicated by the user
        # through the treatment_options. The length of each sequence of treatments needs to be projection_horizon + 1.
        treatment_options = np.array([[[1], [1], [1], [1], [1], [0]],
                                      [[0], [0], [0], [0], [1], [1]]])
        history, counterfactual_traj = treatment_model.predict_counterfactual_trajectories(
            dataset=dataset_testing,
            patient_id=args.patient_id,
            timestep=args.timestep,
            treatment_options=treatment_options)

        print_counterfactual_predictions(
            patient_history=history,
            treatment_options=treatment_options,
            counterfactual_predictions=counterfactual_traj)

    return
Example #17
def do_new_weighted_ensemble_for_library(
        library_folder,
        library_name,
        class_num,
        target,
        constraint_file,
        logger,
        gammas,
        internals=None,
        cons_type='both',
        ensemble_method=_default_ensemble_method,
        scale=False):
    """

    :param library_folder:
    :param library_name:
    :param class_num:
    :param target:
    :param constraint_file:
    :param logger:
    :param gammas:
    :param cons_type:
    :param ensemble_method
    :return:
    """
    logger.debug(
        '==========================================================================================='
    )
    logger.debug('-----------------New ver Weighted Ensemble for library:' +
                 str(library_name) + '---------------')
    logger.debug('-----------------Weight type = ' + cons_type +
                 '-------------------------------------------')
    logger.debug('-----------------Scale type = ' + str(scale) +
                 '-------------------------------------------')
    logger.debug('-----------------Constraint File name = ' + constraint_file +
                 '----------------------------')

    labels = np.loadtxt(library_folder + library_name + '.res', delimiter=',')
    labels = labels.astype(int)

    # if the library is not pure, i.e., ensemble results and targets are also included,
    # then the last 5 rows should be removed (single kmeans, cspa, hgpa, mcla, real labels)
    if 'pure' not in library_name:
        labels = labels[0:-5]
    mlset, nlset = io_func.read_constraints(constraint_file)
    n_instances = labels.shape[1]
    if cons_type == 'both':
        n_constraints = len(mlset) + len(nlset)
    else:
        n_constraints = len(mlset)
    if internals is None:
        internals = _build_pesudo_internal(labels)

    # get cluster/clustering level weights
    # constraints in each cluster of all clusterings are also obtained to get g_gamma
    con_per_cluster = []
    constraints_num = []
    con_clustering = []
    cluster_time_sum = 0.0
    clustering_time_sum = 0.0
    for label in labels:
        t1 = time.perf_counter()  # time.clock() was removed in Python 3.8
        weight, cluster_cons_num = Metrics.consistency_per_cluster_efficient(
            label, mlset, nlset, cons_type=cons_type)
        con_per_cluster.append(weight)
        constraints_num.append(cluster_cons_num)
        t2 = time.perf_counter()
        cluster_time_sum += (t2 - t1)
    for label in labels:
        t1 = time.perf_counter()
        con_clustering.append(
            Metrics.consistency(label, mlset, nlset, cons_type=cons_type))
        t2 = time.perf_counter()
        clustering_time_sum += (t2 - t1)

    print('library size=' + str(labels.shape[0]))
    print('cluster avg=' + str(cluster_time_sum / labels.shape[0]))
    print('clustering avg=' + str(clustering_time_sum / labels.shape[0]))

    if scale:
        scaler = preprocessing.MinMaxScaler()
        # MinMaxScaler expects 2-D input; reshape, scale, then flatten back
        con_clustering = scaler.fit_transform(
            np.array(con_clustering).reshape(-1, 1)).ravel()

    nmis = []
    for gamma in gammas:
        logger.debug(
            '-------------------------->>>>>> PARAM START <<<<<<<---------------------------------'
        )
        cur_g_gamma = get_g_gamma(constraints_num, labels, n_constraints,
                                  n_instances, gamma)
        cur_nmis = []
        for method in ensemble_method:
            ensemble_labels = _ensemble_method[method](
                labels,
                N_clusters_max=class_num,
                weighted=True,
                clustering_weights=con_clustering,
                cluster_level_weights=con_per_cluster,
                alpha=cur_g_gamma,
                new_formula=True,
                internal=internals)
            # ensemble_labels = _ensemble_method[method](labels, N_clusters_max=class_num,
            #                                            weighted=True, clustering_weights=con_clustering,
            #                                            cluster_level_weights=con_per_cluster, alpha=cur_g_gamma,
            #                                            new_formula=True, internal=internals, ml=mlset, cl=nlset)
            ensemble_nmi = Metrics.normalized_max_mutual_info_score(
                ensemble_labels, target)
            logger.debug(method + ' gamma=' + str(gamma) + ', NMI=' +
                         str(ensemble_nmi))
            cur_nmis.append(ensemble_nmi)
        nmis.append(cur_nmis)
        logger.debug(
            '------------------------->>>>>> END OF THIS PARAM <<<<<<-------------------------------'
        )
    logger.debug(
        '==========================================================================================='
    )
    return nmis
Example #18
def comparison_methods(dataset_name,
                       constraints_files=None,
                       additional_postfix='',
                       eval_method=None):
    """
    get the performance of comparison methods.

    Parameters
    ----------
    :param dataset_name:
    :param constraints_files:
    :param additional_postfix:
    :param eval_method:
    """
    filename = _default_eval_path + dataset_name + '_' + time.strftime(
        '%Y-%m-%d_%H_%M_%S', time.localtime(time.time())) + '.csv'
    with open(filename, 'w', newline='') as f:
        writer = csv.writer(f)
        data, targets = exd.dataset[dataset_name]['data']()
        data = data.astype(np.double)
        k = exd.dataset[dataset_name]['k']
        km = cluster.KMeans(n_clusters=k)
        km.fit(data)
        writer.writerow([
            'KMeans',
            str(metrics.normalized_max_mutual_info_score(targets, km.labels_))
        ])
        eval_methods = _default_eval_methods if eval_method is None else eval_method
        if constraints_files is None:
            filenames = _get_default_constraints_files(
                dataset_name, _default_constraints_postfix, additional_postfix)
        else:
            filenames = _get_default_constraints_files(dataset_name,
                                                       constraints_files,
                                                       additional_postfix)
        for cons_file in filenames:
            ml, cl = io_func.read_constraints(_default_constraints_folder +
                                              cons_file + '.txt')
            for method in eval_methods:
                if method == 'Cop_KMeans':
                    result = _constrained_methods[method](data, k, ml, cl)
                    writer.writerow([
                        cons_file + '_Cop_KMeans',
                        str(
                            metrics.normalized_max_mutual_info_score(
                                targets, result))
                    ])
                elif method == 'E2CP':
                    e2cp = _constrained_methods[method](data=data,
                                                        ml=ml,
                                                        cl=cl,
                                                        n_clusters=k)
                    e2cp.fit_constrained()
                    result = e2cp.labels
                    writer.writerow([
                        cons_file + '_E2CP',
                        str(
                            metrics.normalized_max_mutual_info_score(
                                targets, result))
                    ])
    return
def main(args):
    '''Main function for time-series prediction.
  
  Args:
    - data loading parameters:
      - data_names: mimic, ward, cf    
      
    - preprocess parameters: 
      - normalization: minmax, standard, None
      - one_hot_encoding: input features that need to be one-hot encoded
      - problem: 'one-shot' or 'online'
        - 'one-shot': one time prediction at the end of the time-series 
        - 'online': prediction at every time stamp of the time-series
      - max_seq_len: maximum sequence length after padding
      - label_name: the column name for the label(s)
      - treatment: the column name for treatments
      
    - imputation parameters: 
      - static_imputation_model: mean, median, mice, missforest, knn, gain
      - temporal_imputation_model: mean, median, linear, quadratic, cubic, spline, mrnn, tgain
            
    - feature selection parameters:
      - feature_selection_model: greedy-addition, greedy-deletion, recursive-addition, recursive-deletion, None
      - feature_number: selected feature number
      
    - predictor_parameters:
      - model_name: rnn, gru, lstm, attention, tcn, transformer
      - model_parameters: network parameters such as number of layers
        - h_dim: hidden dimensions
        - n_layer: layer number
        - n_head: head number (only for transformer model)
        - batch_size: number of samples in mini-batch
        - epochs: number of epochs
        - learning_rate: learning rate
      - static_mode: how to utilize static features (concatenate or None)
      - time_mode: how to utilize time information (concatenate or None)
      - task: classification or regression
      
    - uncertainty_model_name: uncertainty estimation model name (ensemble)
    - interpretation_model_name: interpretation model name (tinvase)
    - metric_name: auc, apr, mae, mse
  '''
    #%% Step 0: Set basic parameters
    metric_sets = [args.metric_name]
    metric_parameters = {
        'problem': args.problem,
        'label_name': [args.label_name]
    }

    #%% Step 1: Upload Dataset
    # File names
    data_directory = '../datasets/data/' + args.data_name + '/' + args.data_name + '_'

    data_loader_training = CSVLoader(
        static_file=data_directory + 'static_train_data.csv.gz',
        temporal_file=data_directory + 'temporal_train_data_eav.csv.gz')

    data_loader_testing = CSVLoader(
        static_file=data_directory + 'static_test_data.csv.gz',
        temporal_file=data_directory + 'temporal_test_data_eav.csv.gz')

    dataset_training = data_loader_training.load()
    dataset_testing = data_loader_testing.load()

    print('Finish data loading.')

    #%% Step 2: Preprocess Dataset
    # (0) filter out negative values (Automatically)
    negative_filter = FilterNegative()
    # (1) one-hot encode categorical features
    onehot_encoder = OneHotEncoder(
        one_hot_encoding_features=[args.one_hot_encoding])
    # (2) Normalize features: 3 options (minmax, standard, none)
    normalizer = Normalizer(args.normalization)

    filter_pipeline = PipelineComposer(negative_filter, onehot_encoder,
                                       normalizer)

    dataset_training = filter_pipeline.fit_transform(dataset_training)
    dataset_testing = filter_pipeline.transform(dataset_testing)

    print('Finish preprocessing.')

    #%% Step 3: Define Problem
    problem_maker = ProblemMaker(problem=args.problem,
                                 label=[args.label_name],
                                 max_seq_len=args.max_seq_len,
                                 treatment=args.treatment)

    dataset_training = problem_maker.fit_transform(dataset_training)
    dataset_testing = problem_maker.fit_transform(dataset_testing)

    print('Finish defining problem.')

    #%% Step 4: Impute Dataset
    static_imputation = Imputation(
        imputation_model_name=args.static_imputation_model, data_type='static')
    temporal_imputation = Imputation(
        imputation_model_name=args.temporal_imputation_model,
        data_type='temporal')

    imputation_pipeline = PipelineComposer(static_imputation,
                                           temporal_imputation)

    dataset_training = imputation_pipeline.fit_transform(dataset_training)
    dataset_testing = imputation_pipeline.transform(dataset_testing)

    print('Finish imputation.')

    #%% Step 5: Feature selection (4 options)
    static_feature_selection = \
    FeatureSelection(feature_selection_model_name = args.static_feature_selection_model,
                     feature_type = 'static', feature_number = args.static_feature_selection_number,
                     task = args.task, metric_name = args.metric_name,
                     metric_parameters = metric_parameters)

    temporal_feature_selection = \
    FeatureSelection(feature_selection_model_name = args.temporal_feature_selection_model,
                     feature_type = 'temporal', feature_number = args.temporal_feature_selection_number,
                     task = args.task, metric_name = args.metric_name,
                     metric_parameters = metric_parameters)

    feature_selection_pipeline = PipelineComposer(static_feature_selection,
                                                  temporal_feature_selection)

    dataset_training = feature_selection_pipeline.fit_transform(
        dataset_training)
    dataset_testing = feature_selection_pipeline.transform(dataset_testing)

    print('Finish feature selection.')

    #%% Step 6: Fit and Predict (6 options)
    # Set predictor model parameters
    model_parameters = {
        'h_dim': args.h_dim,
        'n_layer': args.n_layer,
        'n_head': args.n_head,
        'batch_size': args.batch_size,
        'epoch': args.epochs,
        'model_type': args.model_name,
        'learning_rate': args.learning_rate,
        'static_mode': args.static_mode,
        'time_mode': args.time_mode,
        'verbose': True
    }

    # Set the validation data for best model saving
    dataset_training.train_val_test_split(prob_val=0.2, prob_test=0.0)

    pred_class = prediction(args.model_name, model_parameters, args.task)
    pred_class.fit(dataset_training)
    test_y_hat = pred_class.predict(dataset_testing)

    print('Finish predictor model training and testing.')

    #%% Step 7: Estimate Uncertainty (1 option)
    uncertainty_model = uncertainty(args.uncertainty_model_name,
                                    model_parameters, pred_class, args.task)
    uncertainty_model.fit(dataset_training)
    test_ci_hat = uncertainty_model.predict(dataset_testing)
    print('Finish uncertainty estimation')

    #%% Step 8: Interpret Predictions (1 option)
    interpretor = interpretation(args.interpretation_model_name,
                                 model_parameters, pred_class, args.task)
    interpretor.fit(dataset_training)
    test_s_hat = interpretor.predict(dataset_testing)
    print('Finish model interpretation')

    #%% Step 9: Visualize Results
    idx = np.random.permutation(len(test_y_hat))[:2]

    # Evaluate predictor model
    result = Metrics(metric_sets,
                     metric_parameters).evaluate(dataset_testing.label,
                                                 test_y_hat)
    print('Finish predictor model evaluation.')

    # Visualize the output
    # (1) Performance
    print('Overall performance')
    print_performance(result, metric_sets, metric_parameters)
    # (2) Predictions
    print('Each prediction')
    print_prediction(test_y_hat[idx], metric_parameters)
    # (3) Uncertainty
    print('Uncertainty estimations')
    print_uncertainty(test_y_hat[idx], test_ci_hat[idx], metric_parameters)
    # (4) Model interpretation
    print('Model interpretation')
    print_interpretation(test_s_hat[idx], dataset_training.feature_name,
                         metric_parameters, model_parameters)

    return
Example #20
def k_selection_ensemble(labels,
                         k_threshold,
                         logger,
                         weighted=False,
                         alpha=0,
                         mlset=None,
                         nlset=None,
                         ctype='both'):
    """
    do selection ensemble using k as the criterion;
    clusterings with k smaller than k_threshold will be removed

    :param labels:
    :param k_threshold:
    :param logger:
    :param weighted: weighted version or not
    :param alpha: balance factor that control the importance of clustering/cluster
                  consistency in weights (weighted version only)
    :param mlset: must-link set (weighted version only)
    :param nlset: cannot-link set (weighted version only)
    :param ctype: type of consistency (weighted version only)
    :return:
    """
    k_value = []
    class_num = len(np.unique(labels[-1]))
    # select those clusterings whose k is at least the threshold.
    for label in labels[0:-5]:
        k_value.append(len(np.unique(label)))
    k_value = np.array(k_value)
    idx = k_value.ravel() >= k_threshold
    selected_labels = labels[0:-5][idx]

    # weights
    con_per_cluster = []
    con_clustering = []
    if weighted:
        for label in selected_labels:
            con_per_cluster.append(
                Metrics.consistency_per_cluster(label,
                                                mlset,
                                                nlset,
                                                cons_type=ctype))
        for label in selected_labels:
            con_clustering.append(
                Metrics.consistency(label, mlset, nlset, cons_type=ctype))

    logger.debug('[K] Start consensus...shape=' + str(selected_labels.shape))
    logger.debug('[K] Average k is ' + str(np.mean(k_value[idx])))
    if weighted:
        logger.debug('[K] weighted consensus, alpha=' + str(alpha))

    label_CSPA = ce.cluster_ensembles_CSPAONLY(
        selected_labels,
        N_clusters_max=class_num,
        weighted=weighted,
        clustering_weights=con_clustering,
        cluster_level_weights=con_per_cluster,
        alpha=alpha)
    label_HGPA = ce.cluster_ensembles_HGPAONLY(
        selected_labels,
        N_clusters_max=class_num,
        weighted=weighted,
        clustering_weights=con_clustering,
        cluster_level_weights=con_per_cluster,
        alpha=alpha)
    label_MCLA = ce.cluster_ensembles_MCLAONLY(
        selected_labels,
        N_clusters_max=class_num,
        weighted=weighted,
        clustering_weights=con_clustering,
        cluster_level_weights=con_per_cluster,
        alpha=alpha)

    nmi_CSPA = Metrics.normalized_max_mutual_info_score(label_CSPA, labels[-1])
    nmi_HGPA = Metrics.normalized_max_mutual_info_score(label_HGPA, labels[-1])
    nmi_MCLA = Metrics.normalized_max_mutual_info_score(label_MCLA, labels[-1])
    logger.debug('CSPA performance:' + str(nmi_CSPA))
    logger.debug('HGPA performance:' + str(nmi_HGPA))
    logger.debug('MCLA performance:' + str(nmi_MCLA))
    logger.debug('--------------------------------------------')
    return
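
# Added usage sketch (not part of the original source): how k_selection_ensemble
# might be driven. The library file name is a placeholder; the .res layout
# (one clustering per row, last 5 rows being single k-means, CSPA, HGPA, MCLA
# and the real labels) follows the convention used elsewhere in this document.
import logging

import numpy as np

labels = np.loadtxt('Results/Iris/demo_library.res', delimiter=',').astype(int)
logger = logging.getLogger('k_selection_demo')
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.DEBUG)

# keep only clusterings with k >= 5, then run CSPA/HGPA/MCLA and log the NMIs
k_selection_ensemble(labels, 5, logger)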
Example #21
import constrained_methods.constrained_clustering as cc
import utils.load_dataset as ld
import utils.io_func as io
import time
import evaluation.Metrics as Metrics

data, target = ld.load_mnist_4000()
print data.shape
data = data.astype(float)
ml, cl = io.read_constraints('Constraints/MNIST4000_diff_n_1.txt')
t1 = time.clock()
e2cp = cc.E2CP(data=data, ml=ml, cl=cl, n_clusters=10)
e2cp.fit_constrained()
t2 = time.clock()  # t2 is taken after fit_constrained so t2 - t1 covers the clustering itself
print e2cp.labels
print Metrics.normalized_max_mutual_info_score(target, e2cp.labels)
print t2 - t1
Example #22
def consistency_selection_ensemble(labels,
                                   mlset,
                                   nlset,
                                   logger,
                                   must_threshold,
                                   cannot_threshold,
                                   normalized=True,
                                   weighted=False,
                                   weighted_type='both',
                                   alpha=1):
    """
    do selection ensemble using must/cannot consistency as criteria
    clusteing with k smaller than k_threshold will be removed

    :param labels:
    :param mlset:
    :param nlset:
    :param logger:
    :param must_threshold:
    :param cannot_threshold:
    :param normalized:
    :param weighted:
    :param weighted_type:
    :param alpha:
    :return:
    """
    class_num = len(np.unique(labels[-1]))
    must_consistencies = []
    cannot_consistencies = []
    clustering_weights = []
    cluster_level_weights = []
    k_value = []
    for label in labels[0:-5]:
        must_cons = Metrics.consistency(label, mlset, nlset, cons_type='must')
        cannot_cons = Metrics.consistency(label,
                                          mlset,
                                          nlset,
                                          cons_type='cannot')
        if weighted:
            clustering_weights.append(
                Metrics.consistency(label,
                                    mlset,
                                    nlset,
                                    cons_type=weighted_type))
            cluster_level_weights.append(
                Metrics.consistency_per_cluster(label,
                                                mlset,
                                                nlset,
                                                cons_type=weighted_type))
        must_consistencies.append(must_cons)
        cannot_consistencies.append(cannot_cons)
        k_value.append(len(np.unique(label)))
    must_consistencies = np.array(must_consistencies)
    cannot_consistencies = np.array(cannot_consistencies)
    if normalized:
        scaler = preprocessing.MinMaxScaler()
        must_consistencies = scaler.fit_transform(
            must_consistencies.reshape(-1, 1)).ravel()
        cannot_consistencies = scaler.fit_transform(
            cannot_consistencies.reshape(-1, 1)).ravel()
    # converting to arrays first makes the comparison valid in both branches
    idx = np.logical_and(must_consistencies >= must_threshold,
                         cannot_consistencies >= cannot_threshold)
    selected_labels = labels[0:-5][idx]
    k_value = np.array(k_value)[idx]
    logger.debug('[Consistency] Start consensus...shape=' +
                 str(selected_labels.shape))
    if selected_labels.shape[0] == 0:
        logger.debug('[Consistency] No clusterings are selected. Out.')
        return
    logger.debug('[Consistency] Average k is ' + str(np.mean(k_value)))
    label_CSPA = ce.cluster_ensembles_CSPAONLY(
        selected_labels,
        N_clusters_max=class_num,
        weighted=weighted,
        clustering_weights=clustering_weights,
        cluster_level_weights=cluster_level_weights,
        alpha=alpha)
    label_HGPA = ce.cluster_ensembles_HGPAONLY(
        selected_labels,
        N_clusters_max=class_num,
        weighted=weighted,
        clustering_weights=clustering_weights,
        cluster_level_weights=cluster_level_weights,
        alpha=alpha)
    label_MCLA = ce.cluster_ensembles_MCLAONLY(selected_labels,
                                               N_clusters_max=class_num)
    nmi_CSPA = Metrics.normalized_max_mutual_info_score(label_CSPA, labels[-1])
    nmi_HGPA = Metrics.normalized_max_mutual_info_score(label_HGPA, labels[-1])
    nmi_MCLA = Metrics.normalized_max_mutual_info_score(label_MCLA, labels[-1])
    logger.debug('CSPA performance:' + str(nmi_CSPA))
    logger.debug('HGPA performance:' + str(nmi_HGPA))
    logger.debug('MCLA performance:' + str(nmi_MCLA))
    return
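
# Added companion sketch: with `labels` and `logger` prepared as in the
# k_selection_ensemble sketch above, and constraints read via the repo's io
# helper (the constraint file name is a placeholder). With normalized=True the
# min-max scaled consistencies lie in [0, 1], so both thresholds are chosen
# in that range.
import utils.io_func as io_func

mlset, nlset = io_func.read_constraints('Constraints/Iris_constraints.txt')
consistency_selection_ensemble(labels, mlset, nlset, logger,
                               must_threshold=0.7, cannot_threshold=0.7)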
def main(args):
    """Main function for AutoML in time-series predictions.
  
  Args:
    - data loading parameters:
      - data_names: mimic, ward, cf    
      
    - preprocess parameters: 
      - normalization: minmax, standard, None
      - one_hot_encoding: input features that need to be one-hot encoded
      - problem: 'one-shot' or 'online'
        - 'one-shot': one-time prediction at the end of the time-series
        - 'online': prediction at every time stamp of the time-series
      - max_seq_len: maximum sequence length after padding
      - label_name: the column name for the label(s)
      - treatment: the column name for treatments
      
    - imputation parameters: 
      - static_imputation_model: mean, median, mice, missforest, knn, gain
      - temporal_imputation_model: mean, median, linear, quadratic, cubic, spline, mrnn, tgain
            
    - feature selection parameters:
      - feature_selection_model: greedy-addition, greedy-deletion, recursive-addition, recursive-deletion, None
      - feature_number: number of features to select
      
    - predictor_parameters:
      - epochs: number of epochs
      - bo_itr: number of Bayesian optimization iterations
      - static_mode: how to utilize static features (concatenate or None)
      - time_mode: how to utilize time information (concatenate or None)
      - task: classification or regression
      
    - metric_name: auc, apr, mae, mse
  """
    #%% Step 0: Set basic parameters
    metric_sets = [args.metric_name]
    metric_parameters = {
        "problem": args.problem,
        "label_name": [args.label_name]
    }

    #%% Step 1: Upload Dataset
    # File names
    data_directory = "../datasets/data/" + args.data_name + "/" + args.data_name + "_"

    data_loader_training = CSVLoader(
        static_file=data_directory + "static_train_data.csv.gz",
        temporal_file=data_directory + "temporal_train_data_eav.csv.gz",
    )

    data_loader_testing = CSVLoader(
        static_file=data_directory + "static_test_data.csv.gz",
        temporal_file=data_directory + "temporal_test_data_eav.csv.gz",
    )

    dataset_training = data_loader_training.load()
    dataset_testing = data_loader_testing.load()

    print("Finish data loading.")

    #%% Step 2: Preprocess Dataset
    # (0) filter out negative values (Automatically)
    negative_filter = FilterNegative()
    # (1) one-hot encode categorical features
    onehot_encoder = OneHotEncoder(
        one_hot_encoding_features=[args.one_hot_encoding])
    # (2) Normalize features: 3 options (minmax, standard, none)
    normalizer = Normalizer(args.normalization)

    filter_pipeline = PipelineComposer(negative_filter, onehot_encoder,
                                       normalizer)

    dataset_training = filter_pipeline.fit_transform(dataset_training)
    dataset_testing = filter_pipeline.transform(dataset_testing)

    print("Finish preprocessing.")

    #%% Step 3: Define Problem
    problem_maker = ProblemMaker(problem=args.problem,
                                 label=[args.label_name],
                                 max_seq_len=args.max_seq_len,
                                 treatment=args.treatment)

    dataset_training = problem_maker.fit_transform(dataset_training)
    dataset_testing = problem_maker.fit_transform(dataset_testing)

    print("Finish defining problem.")

    #%% Step 4: Impute Dataset
    static_imputation = Imputation(
        imputation_model_name=args.static_imputation_model, data_type="static")
    temporal_imputation = Imputation(
        imputation_model_name=args.temporal_imputation_model,
        data_type="temporal")

    imputation_pipeline = PipelineComposer(static_imputation,
                                           temporal_imputation)

    dataset_training = imputation_pipeline.fit_transform(dataset_training)
    dataset_testing = imputation_pipeline.transform(dataset_testing)

    print("Finish imputation.")

    #%% Step 5: Feature selection (4 options)
    static_feature_selection = FeatureSelection(
        feature_selection_model_name=args.static_feature_selection_model,
        feature_type="static",
        feature_number=args.static_feature_selection_number,
        task=args.task,
        metric_name=args.metric_name,
        metric_parameters=metric_parameters,
    )

    temporal_feature_selection = FeatureSelection(
        feature_selection_model_name=args.temporal_feature_selection_model,
        feature_type="temporal",
        feature_number=args.temporal_feature_selection_number,
        task=args.task,
        metric_name=args.metric_name,
        metric_parameters=metric_parameters,
    )

    feature_selection_pipeline = PipelineComposer(static_feature_selection,
                                                  temporal_feature_selection)

    dataset_training = feature_selection_pipeline.fit_transform(
        dataset_training)
    dataset_testing = feature_selection_pipeline.transform(dataset_testing)

    print("Finish feature selection.")

    #%% Step 6: Bayesian Optimization
    ## Model define
    # RNN model
    rnn_parameters = {
        "model_type": "lstm",
        "epoch": args.epochs,
        "static_mode": args.static_mode,
        "time_mode": args.time_mode,
        "verbose": False,
    }

    general_rnn = GeneralRNN(task=args.task)
    general_rnn.set_params(**rnn_parameters)

    # CNN model
    cnn_parameters = {
        "epoch": args.epochs,
        "static_mode": args.static_mode,
        "time_mode": args.time_mode,
        "verbose": False,
    }
    temp_cnn = TemporalCNN(task=args.task)
    temp_cnn.set_params(**cnn_parameters)

    # Transformer
    transformer = TransformerPredictor(task=args.task,
                                       epoch=args.epochs,
                                       static_mode=args.static_mode,
                                       time_mode=args.time_mode)

    # Attention model
    attn_parameters = {
        "model_type": "lstm",
        "epoch": args.epochs,
        "static_mode": args.static_mode,
        "time_mode": args.time_mode,
        "verbose": False,
    }
    attn = Attention(task=args.task)
    attn.set_params(**attn_parameters)

    # model_class_list = [general_rnn, attn, temp_cnn, transformer]
    model_class_list = [general_rnn, attn]

    # train_validate split
    dataset_training.train_val_test_split(prob_val=0.2, prob_test=0.1)

    # Bayesian Optimization Start
    metric = BOMetric(metric="auc", fold=0, split="test")

    ens_model_list = []

    # Run BO for each model class
    for m in model_class_list:
        BO_model = automl.model.AutoTS(dataset_training,
                                       m,
                                       metric,
                                       model_path="tmp/")
        models, bo_score = BO_model.training_loop(num_iter=args.bo_itr)
        auto_ens_model = AutoEnsemble(models, bo_score)
        ens_model_list.append(auto_ens_model)

    # Load all ensemble models
    for ens in ens_model_list:
        for m in ens.models:
            m.load_model(BO_model.model_path + "/" + m.model_id + ".h5")

    # Stacking algorithm
    stacking_ens_model = StackingEnsemble(ens_model_list)
    stacking_ens_model.fit(dataset_training, fold=0, train_split="val")

    # Prediction
    assert not dataset_testing.is_validation_defined
    test_y_hat = stacking_ens_model.predict(dataset_testing, test_split="test")
    test_y = dataset_testing.label

    print("Finish AutoML model training and testing.")

    #%% Step 7: Visualize Results
    idx = np.random.permutation(len(test_y_hat))[:2]

    # Evaluate predictor model
    result = Metrics(metric_sets,
                     metric_parameters).evaluate(test_y, test_y_hat)
    print("Finish predictor model evaluation.")

    # Visualize the output
    # (1) Performance
    print("Overall performance")
    print_performance(result, metric_sets, metric_parameters)
    # (2) Predictions
    print("Each prediction")
    print_prediction(test_y_hat[idx], metric_parameters)

    return
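
# Added sketch of the argument parser that main() above expects. The argument
# names are taken from the args.* attributes used in main(); the default
# values are illustrative assumptions only, not the original settings.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_name', default='mimic', type=str)
    parser.add_argument('--normalization', default='minmax', type=str)
    parser.add_argument('--one_hot_encoding', default='admission_type', type=str)
    parser.add_argument('--problem', default='one-shot', type=str)
    parser.add_argument('--max_seq_len', default=24, type=int)
    parser.add_argument('--label_name', default='death', type=str)
    parser.add_argument('--treatment', default=None, type=str)
    parser.add_argument('--static_imputation_model', default='median', type=str)
    parser.add_argument('--temporal_imputation_model', default='median', type=str)
    parser.add_argument('--static_feature_selection_model', default=None, type=str)
    parser.add_argument('--static_feature_selection_number', default=None, type=int)
    parser.add_argument('--temporal_feature_selection_model', default=None, type=str)
    parser.add_argument('--temporal_feature_selection_number', default=None, type=int)
    parser.add_argument('--epochs', default=20, type=int)
    parser.add_argument('--bo_itr', default=2, type=int)
    parser.add_argument('--static_mode', default='concatenate', type=str)
    parser.add_argument('--time_mode', default='concatenate', type=str)
    parser.add_argument('--task', default='classification', type=str)
    parser.add_argument('--metric_name', default='auc', type=str)
    main(parser.parse_args())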
Example #24
    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    st.subheader('Overall Prediction Performance')

    if select_pred_task == 'Classification':
        metric_sets = ['auc', 'apr']

    if select_pred_task == 'Regression':
        metric_sets = ['mse', 'mae']

    metric_parameters = {
        'problem': problem_type,
        'label_name': label_name,
    }

    metrics = Metrics(metric_sets, metric_parameters)

    result = metrics.evaluate(dataset_v_5.label, test_y_hat)

    if problem_type == 'one-shot':
        text = print_performance(
            result,
            metric_sets,
            metric_parameters,
        )

        st.text(text)

    if problem_type == 'online':
        figs = print_performance(
            result,
Example #25
def do_7th_weighted_ensemble_for_library(
        library_folder,
        library_name,
        class_num,
        target,
        constraint_file,
        logger,
        alphas,
        internals,
        cons_type='both',
        ensemble_method=_default_ensemble_method,
        scale=False):
    """

    :param library_folder:
    :param library_name:
    :param class_num:
    :param target:
    :param constraint_file:
    :param logger:
    :param alphas:
    :param cons_type:
    :param ensemble_method
    :return:
    """
    logger.debug(
        '==========================================================================================='
    )
    logger.debug('-----------------New Weighted Ensemble for library:' +
                 str(library_name) + '-------------------')
    logger.debug('-----------------Weight type = ' + cons_type +
                 '-------------------------------------------')
    logger.debug('-----------------Scale type = ' + str(scale) +
                 '-------------------------------------------')
    logger.debug('-----------------Constraint File name = ' + constraint_file +
                 '----------------------------')

    labels = np.loadtxt(library_folder + library_name + '.res', delimiter=',')
    labels = labels.astype(int)
    k_values = []
    expected_cons = {}

    # if the library is not pure, i.e., ensemble results and targets are also included,
    # then the last 5 rows should be removed (single k-means, CSPA, HGPA, MCLA, real labels)
    if 'pure' not in library_name:
        labels = labels[0:-5]
    mlset, nlset = io_func.read_constraints(constraint_file)

    # get cluster/clustering level weights
    con_per_cluster = []
    con_clustering = []
    for label in labels:
        con_per_cluster.append(
            Metrics.consistency_per_cluster(label,
                                            mlset,
                                            nlset,
                                            cons_type=cons_type))
    for label in labels:
        con_clustering.append(
            Metrics.consistency(label, mlset, nlset, cons_type=cons_type))
        k_values.append(len(np.unique(label)))
    k_values = np.array(k_values, dtype=int)
    possible_k = np.unique(k_values)
    cons = np.array(con_clustering)
    for k in possible_k:
        mean_value = np.mean(cons[k_values == k])
        if mean_value == 0:
            mean_value = 1
        expected_cons[k] = mean_value
    for i in range(0, labels.shape[0]):
        con_clustering[i] /= expected_cons[k_values[i]]
        con_clustering[i] *= internals[i]
    if scale:
        scaler = preprocessing.MinMaxScaler()
        # reshape to a column vector since MinMaxScaler expects 2-d input
        con_clustering = scaler.fit_transform(
            np.array(con_clustering).reshape(-1, 1)).ravel()

    nmis = []
    for alpha in alphas:
        logger.debug(
            '-------------------------->>>>>> PARAM START <<<<<<<---------------------------------'
        )
        cur_nmis = []
        for method in ensemble_method:
            ensemble_labels = _ensemble_method[method](
                labels,
                N_clusters_max=class_num,
                weighted=True,
                clustering_weights=con_clustering,
                cluster_level_weights=con_per_cluster,
                alpha=alpha)
            ensemble_nmi = Metrics.normalized_max_mutual_info_score(
                ensemble_labels, target)
            logger.debug(method + ' alpha=' + str(alpha) + ', NMI=' +
                         str(ensemble_nmi))
            cur_nmis.append(ensemble_nmi)
        nmis.append(cur_nmis)
        logger.debug(
            '------------------------->>>>>> END OF THIS PARAM <<<<<<-------------------------------'
        )
    logger.debug(
        '==========================================================================================='
    )
    return nmis
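
# Added toy walkthrough of the weighting scheme above: each clustering weight
# is divided by the mean ("expected") consistency among clusterings with the
# same k, then multiplied by its internal-index value. All numbers are made up.
import numpy as np

k_values = np.array([2, 2, 3])
con_clustering = [0.4, 0.6, 0.9]
internals = [1.0, 0.8, 1.2]
expected_cons = {k: np.mean(np.array(con_clustering)[k_values == k])
                 for k in np.unique(k_values)}
# expected_cons == {2: 0.5, 3: 0.9}
weights = [c / expected_cons[k] * w
           for c, k, w in zip(con_clustering, k_values, internals)]
# weights == [0.8, 0.96, 1.2]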
Example #26
# Imports needed to run this snippet (module paths follow the other examples
# in this document; the dataset-registry module name is an assumption).
import time

import numpy as np
from sklearn import cluster

import evaluation.Metrics as metrics
import utils.exp_datasets as ed  # assumed module exposing the `dataset` registry

# print '--------------------'
# for doo in data_selected.as_matrix():
#     print doo

d, t = ed.dataset['waveform']['data']()
# d, t = ed.dataset['Wap']['data'](sparse_type='csr')
# d, t = ed.dataset['k1b']['data']()
# d, t = ed.dataset['hitech']['data']()
# d, t = ed.dataset['re0']['data']()
print d.shape
print np.unique(t)
km = cluster.KMeans(n_clusters=3)
t1 = time.clock()
km.fit(d)
t2 = time.clock()
print metrics.normalized_max_mutual_info_score(t, km.labels_)
# metrics
print t2 - t1
# import member_generation.subspace as sub
# subd = sub.feature_sampling(d, 2000)
# print d.shape
# print subd.shape
# data_selected, data_unselected, \
# target_selected, target_unselected = train_test_split(d, t,
#                                                       train_size=500,
#                                                       random_state=154)
# print data_selected
# print data_unselected
# print target_selected
# print target_unselected
# print d
Example #27
    def run(self):
        """
        Runs the full pipeline as configured.
        :return: DataFrame of run parameters and evaluation metrics, one row per run.
        """

        # TODO try/catch to ensure proper shutdown even if error encountered

        params = self._get_params_for_run()
        result_rows = []

        # Check for valid configuration
        if self._test_docs is None and self._k_folds == 0:
            self._logger.error("Explicit test set or number of cross-validation folds must be specified.")
            metrics = Metrics()
            result_row = {**params, **metrics.get_scores_as_dict()}
            result_rows.append(result_row)
            return result_rows

        # Continue while there are configured parameter settings to evaluate
        while params is not None:

            # Get collection of training and test sets for current run
            data_sets = self._get_training_and_test_sets()
            for set_index, (training_docs, test_docs) in enumerate(data_sets):

                # Retrieve an encoder module trained with the specified configuration
                self._encoder = self._get_encoder(params)

                set_index += 1  # Only used for user output, so start index at 1

                num_sets = len(data_sets)
                if num_sets > 1:
                    self._logger.info("Training and evaluating fold {} of {}.".format(set_index, num_sets))

                start = time.time()
                self._train_and_evaluate(params, self._encoder, training_docs, test_docs)
                runtime = time.time() - start
                self._logger.info(
                    "Trained and evaluated fold {} of sequence model in {} seconds.".format(set_index, runtime))

                # Combine run parameters with evaluation results and store
                result_row = {**params, **self._evaluator.get_score_as_dict()}
                result_rows.append(result_row)

                # Check if model should be saved
                if self._is_model_saving_enabled():

                    operator = self._evaluator.get_operator()
                    current_score = self._evaluator.get_score()

                    best_model = self._get_best_model()
                    if best_model is not None:
                        (best_metric, _) = best_model
                        if not operator(best_metric, current_score):
                            self._set_best_model(current_score, (params, self._encoder, self._sequence_learner))
                    else:
                        # New model is the best one if no previous existed
                        self._set_best_model(current_score, (params, self._encoder, self._sequence_learner))

                # Invoke optimizer callback to report on results of this run
                if self._optimizer is not None:
                    self._optimizer.process_run_result(params=params,
                                                       score=self._evaluator.get_score_as_dict(),
                                                       encoder=self._encoder,
                                                       sequence_learner=self._sequence_learner)

            # Check if there are additional runs to execute
            if self._optimizer is not None:
                params = self._optimizer.get_next_params()
            else:
                params = None

        # Store best model, if configured
        if self._is_model_saving_enabled():
            path, name = self._get_model_save_path_and_name()
            try:
                self.save(path, name)
            except Exception:
                self._logger.error("Failed to save model, clearing Keras session and trying again.")
                self._sequence_learner.clear_session()
                self.save(path, name)

        # Clear Keras/Tensorflow models # TODO why a second time?
        if self._sequence_learner is not None:
            self._sequence_learner.clear_session()

        return pd.DataFrame(result_rows)
Example #28
def test_evaluate_model(report1):
    metrics = Metrics(report1.get_gold(), report1.get_pred())
    print(metrics.bleu_score())
    print(metrics.ds_score())
Example #29
import member_generation.library_generation as lg
import utils.load_dataset as ld
import utils.io_func as io
import utils.settings as settings
import ensemble.Cluster_Ensembles as ce
import evaluation.Metrics as metrics

# Load the dataset. Every dataset I have used so far is wrapped as a function
# in the utils.load_dataset module. Each call returns two values: the feature
# matrix of shape (#Instances * #Features) and the target labels.
# Some loaders have built-in options, e.g. 0-1 normalization; check the code.
name = 'Iris'
d, t = ld.load_iris()

# The functions that generate clustering member libraries are wrapped in
# member_generation.library_generation. The main entry point is
# generate_library, which supports random-subspace member generation as well
# as semi-supervised members (currently two semi-supervised clustering
# methods: E2CP and COP_KMEANS). It returns the name of the library; the
# library itself is stored under Results/[dataset name]/. See the function
# docstring for the parameters; it may be a bit messy, so ask if anything
# is unclear.
# P.S. With random-subspace generation, members are produced mainly by random
# sampling of instances or features; the underlying functions live in
# member_generation.subspace and are called by library_generation.
lib_name = lg.generate_library(d, t, name, 10, 3)

# Read the library back by name; this gives a (#members * #instances) matrix.
lib = io.read_matrix(settings.default_library_path + name + '/' + lib_name)

# Run the consensus; it returns the ensemble cluster labels.
ensemble_result = ce.cluster_ensembles_CSPAONLY(lib, N_clusters_max=3)

# Print the result and its NMI against the real labels.
print ensemble_result
print metrics.normalized_max_mutual_info_score(t, ensemble_result)
Example #30
def generate_library(data, target, dataset_name, n_members, class_num,
                     n_cluster_lower_bound=0, n_cluster_upper_bound=0,
                     feature_sampling=1.0, sample_sampling=0.7,
                     feature_sampling_lower_bound=0.05, sample_sampling_lower_bound=0.1,
                     f_stable_sample=True, s_stable_sample=True,
                     constraints_file=None, sampling_method='FSRSNC', verbose=True, path=_default_result_path,
                     metric='nid', manifold_type='MDS', subfolder=True,
                     generate_only=True):
    """
    generate a single library of ensemble member.

    Parameters
    ----------
    :param data: dataset in a ndarray
    :param target: target in a ndarray or list
    :param dataset_name: name of dataset
    :param n_members: number of ensemble members to generate
    :param class_num: number of real classes
    :param n_cluster_lower_bound: lower bound of k
    :param n_cluster_upper_bound: upper bound of k
    :param feature_sampling: fixed sampling rate of feature, or upper bound if not stable
    :param sample_sampling:  fixed sampling rate of instances, or upper bound if not stable
    :param feature_sampling_lower_bound: lower bound of sampling rate of feature, only available if not stable
    :param sample_sampling_lower_bound: lower bound of sampling rate of instance, only available if not stable
    :param f_stable_sample: stable feature sampling or not
    :param s_stable_sample: stable instance sampling or not
    :param constraints_file: name of the constraint file, required when a constrained method is used
    :param sampling_method: 'FSRSNC' and 'FSRSNN' supported
    :param verbose: print debug info.
    :param path: path to store the library
    :param metric: used for visualization only
    :param manifold_type: used for visualization only
    :param subfolder: save the library in a separate sub-folder or not.
    :param generate_only: only generate and save the member matrix, skipping
                          consensus, evaluation and visualization.

    Return
    ------
    :return: name of the library generated (the library itself will be stored as a file)
    """
    print('start generating library for dataset:' + dataset_name)

    # make sure that the path to store the library exists
    if not os.path.isdir(path):
        os.mkdir(path)
    if subfolder:
        savepath = path + dataset_name + '/'
        if not os.path.isdir(savepath):
            os.mkdir(savepath)
    else:
        savepath = path

    # we set the range of cluster number to [k, 10k] if not defined
    if n_cluster_lower_bound == 0 or n_cluster_upper_bound == 0:
        n_cluster_lower_bound = class_num
        n_cluster_upper_bound = class_num * 10

    # get the sampling method; raise an exception if it is unknown
    if sampling_method in _sampling_methods.keys():
        is_constrained = False
    elif sampling_method in _constrained_methods.keys():
        is_constrained = True
    else:
        raise ValueError('ensemble generation : Method should be set properly.')

    # read the constraints file if provided
    if constraints_file is not None:
        mlset, nlset = io_func.read_constraints(constraints_file)
    else:
        if is_constrained:
            raise Exception('ensemble generation : Constrained Member must be with a constraints file.')
        constraints_file = ''
        mlset = []
        nlset = []

    # lower bounds of the sampling rates (used only when the corresponding 'stable' flag is false)
    if feature_sampling_lower_bound > feature_sampling:
        feature_sampling_lower_bound = feature_sampling / 2
    if sample_sampling_lower_bound > sample_sampling:
        sample_sampling_lower_bound = sample_sampling / 2

    # there should be at least 2 clusters in the clustering
    if n_cluster_lower_bound < 2:
        n_cluster_lower_bound = 2
    if n_cluster_upper_bound < n_cluster_lower_bound:
        n_cluster_upper_bound = n_cluster_lower_bound

    # path and filename to write the file
    filename = _get_file_name(dataset_name, n_cluster_lower_bound, n_cluster_upper_bound, feature_sampling,
                              feature_sampling_lower_bound, sample_sampling, sample_sampling_lower_bound, n_members,
                              f_stable_sample, s_stable_sample, sampling_method, is_constraint_method=is_constrained,
                              constraint_file=constraints_file)

    # skip generation if a library with the same sampling rates and size already exists
    if os.path.isfile(savepath + filename + '.res'):
        print ('[Library Generation] : library already exists.')
        return filename+'.res'
    elif os.path.isfile(savepath + filename + '_pure.res'):
        print ('[Library Generation] : corresponding pure library already exists.')
        return filename+'_pure.res'

    tag = True  # True until the first member has been stacked into the matrix

    # matrix to store clustering results
    mat = np.empty(data.shape[0])

    # generate ensemble members
    for i in range(0, n_members):
        # determine k randomly
        cluster_num = np.random.randint(n_cluster_lower_bound, n_cluster_upper_bound + 1)
        random_state = np.random.randint(0, _INT_MAX - 1)

        cur_feature_sampling = feature_sampling
        cur_sample_sampling = sample_sampling
        if not f_stable_sample:
            cur_feature_sampling = rand.uniform(feature_sampling_lower_bound, feature_sampling)
        if not s_stable_sample:
            cur_sample_sampling = rand.uniform(sample_sampling_lower_bound, sample_sampling)

        print('For this base clustering, cluster number is ' + str(cluster_num))
        # generate ensemble member by given method
        if sampling_method == 'Cop_KMeans':
            result = _constrained_methods[sampling_method](data, cluster_num, mlset, nlset)
        elif sampling_method == 'E2CP':
            e2cp = _constrained_methods[sampling_method](data=data, ml=mlset, cl=nlset, n_clusters=cluster_num)
            e2cp.fit_constrained()
            result = e2cp.labels
        else:
            result = _sampling_methods[sampling_method](data, target, r_clusters=cluster_num,
                                                        r_state=random_state, fsr=cur_feature_sampling,
                                                        ssr=cur_sample_sampling)
        # print diversity (nmi_max against the real labels)
        diver = Metrics.normalized_max_mutual_info_score(result, target)
        if verbose:
            print ('Base clustering ' + str(i) + ': nmi_max against real labels = ' + str(diver))
        # stack the result into the matrix
        if tag:
            mat = np.array(result)
            mat = np.reshape(mat, (1, data.shape[0]))
            tag = False
        else:
            temp = np.array(result)
            temp = np.reshape(temp, (1, data.shape[0]))
            mat = np.vstack([mat, np.array(temp)])

    # change element type to int for consensus
    mat = mat.astype(int)

    if generate_only or is_constrained:
        np.savetxt(savepath + filename + '_pure' + '.res', mat, fmt='%d', delimiter=',')
        return filename+'_pure.res'

    # single k-means model, for comparison
    clf = cluster.KMeans(n_clusters=class_num)
    clf.fit(data)
    kmlabels = clf.labels_

    # do consensus
    labels_CSPA = ce.cluster_ensembles_CSPAONLY(mat, N_clusters_max=class_num)
    labels_HGPA = ce.cluster_ensembles_HGPAONLY(mat, N_clusters_max=class_num)
    labels_MCLA = ce.cluster_ensembles_MCLAONLY(mat, N_clusters_max=class_num)

    # put consensus results into the matrix
    mat = np.vstack([mat, np.reshape(kmlabels, (1, data.shape[0]))])
    mat = np.vstack([mat, np.reshape(labels_CSPA, (1, data.shape[0]))])
    mat = np.vstack([mat, np.reshape(labels_HGPA, (1, data.shape[0]))])
    mat = np.vstack([mat, np.reshape(labels_MCLA, (1, data.shape[0]))])

    # put real labels into the matrix
    temp = np.reshape(target, (1, data.shape[0]))
    mat = np.vstack([mat, np.array(temp)])

    print ('Dataset ' + dataset_name + ', consensus finished, saving...')

    # write results to external file, use %d to keep integer part only
    np.savetxt(savepath + filename + '.res', mat, fmt='%d', delimiter=',')

    # print labels and diversities (against the real labels)
    nmi_CSPA = Metrics.normalized_max_mutual_info_score(labels_CSPA, target)
    nmi_HGPA = Metrics.normalized_max_mutual_info_score(labels_HGPA, target)
    nmi_MCLA = Metrics.normalized_max_mutual_info_score(labels_MCLA, target)
    print ('consensus NMI (CSPA) =' + str(nmi_CSPA))
    print ('consensus NMI (HGPA) =' + str(nmi_HGPA))
    print ('consensus NMI (MCLA) =' + str(nmi_MCLA))

    kmnmi = Metrics.normalized_max_mutual_info_score(kmlabels, target)
    print ('single-model diversity (K-means) =' + str(kmnmi))
    # save performances
    perf = np.array([nmi_CSPA, nmi_HGPA, nmi_MCLA, kmnmi])
    np.savetxt(savepath + filename + '_performance.txt', perf, fmt='%.6f', delimiter=',')

    if metric == 'diversity':
        distance_matrix = Metrics.diversityMatrix(mat)
        np.savetxt(savepath + filename + '_diversity.txt', distance_matrix, delimiter=',')
    else:
        distance_matrix = Metrics.NIDMatrix(mat)
        np.savetxt(savepath + filename + '_nid.txt', distance_matrix, delimiter=',')

    if manifold_type == 'MDS':
        # transform distance matrix into 2-d or 3-d coordinates to visualize
        mds2d = manifold.MDS(n_components=2, max_iter=10000, eps=1e-12, dissimilarity='precomputed')
        mds3d = manifold.MDS(n_components=3, max_iter=10000, eps=1e-12, dissimilarity='precomputed')
        pos2d = mds2d.fit(distance_matrix).embedding_
        pos3d = mds3d.fit(distance_matrix).embedding_
        np.savetxt(savepath + filename + '_mds2d.txt', pos2d, fmt="%.6f", delimiter=',')
        np.savetxt(savepath + filename + '_mds3d.txt', pos3d, fmt="%.6f", delimiter=',')

        # draw odm, k distribution and nmi distribution
        cv.plot_ordered_distance_matrix(distance_matrix, savepath + filename + '_original_distance.png',
                                        savepath + filename + '_odm.png')
        cv.plot_k_distribution(mat, pos2d, savepath + filename+'_k_distribution.png')
        cv.plot_nmi_max(mat, pos2d, savepath + filename + '_nmimax_distribution.png')

        # consistencies are calculated while constraints file exists.
        if constraints_file != '':
            cv.plot_consistency(mat, pos2d, mlset, nlset, savepath + filename+'_consistency_both.png',
                                consistency_type='both')
            cv.plot_consistency(mat, pos2d, mlset, nlset, savepath + filename+'_consistency_must.png',
                                consistency_type='must')
            cv.plot_consistency(mat, pos2d, mlset, nlset, savepath + filename+'_consistency_cannot.png',
                                consistency_type='cannot')
            cv.plt_consistency_corelation_with_k(mat, mlset, nlset, savepath + filename+'_normalized.png')
    return
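
# Added usage sketch for a constrained library: E2CP members require a
# constraints file (the file name below is a placeholder), and constrained
# generation always saves the pure member matrix as a *_pure.res file.
import utils.load_dataset as ld

d, t = ld.load_iris()
lib_name = generate_library(d, t, 'Iris', 10, 3,
                            sampling_method='E2CP',
                            constraints_file='Constraints/Iris_constraints.txt')
print('saved library: ' + lib_name)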