Example #1
def fisher_complexity(column, trainX, trainY, sel_features, thisfeature):
    """
    Fisher criterion of one ECOC column.
    """
    feature_method_index = gol.get_val("feature_method_index")
    classes = np.array(gol.get_val("classes"))
    column = np.array(column)
    fsel_X = trainX[:, sel_features[feature_method_index[thisfeature]]]

    labels = dict()
    label_indexs = dict()
    label_indexs['1'] = np.array(np.where(column == 1))[0]
    label_indexs['-1'] = np.array(np.where(column == -1))[0]
    labels['1'] = classes[label_indexs['1']]
    bool_array = np.zeros(len(trainY), dtype=bool)  # array filled with False
    for lb in labels['1']:
        bool_array = bool_array | np.array(trainY == lb)
    part1_X = fsel_X[bool_array]
    labels['-1'] = classes[label_indexs['-1']]
    bool_array = np.zeros(len(trainY), dtype=bool)  # array filled with False
    for lb in labels['-1']:
        bool_array = bool_array | np.array(trainY == lb)
    part2_X = fsel_X[bool_array]

    miu1 = np.mean(np.sum(part1_X, axis=0))
    miu2 = np.mean(np.sum(part2_X, axis=0))
    sigma1 = np.var(np.sum(part1_X, axis=0))
    sigma2 = np.var(np.sum(part2_X, axis=0))

    if sigma1 + sigma2 == 0:
        return 0

    fisher_sum = (miu1 - miu2) * (miu1 - miu2) / (sigma1 + sigma2)
    return fisher_sum
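For intuition, the criterion reduces to the classic Fisher ratio: the squared gap between the two super-class means over the sum of their variances. A minimal, self-contained sketch with made-up numbers, independent of `gol` and the feature-selection machinery:

import numpy as np

def fisher_score(part1, part2):
    # Fisher ratio: (mu1 - mu2)^2 / (var1 + var2)
    miu1, miu2 = np.mean(part1), np.mean(part2)
    sigma1, sigma2 = np.var(part1), np.var(part2)
    if sigma1 + sigma2 == 0:
        return 0
    return (miu1 - miu2) ** 2 / (sigma1 + sigma2)

print fisher_score(np.array([1.0, 1.1, 0.9]), np.array([5.0, 5.2, 4.8]))  # large: well separated
print fisher_score(np.array([1.0, 5.0]), np.array([1.1, 4.9]))            # zero: identical means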
Example #2
def init_classes():
    trainFile = gol.get_val("trainFile")
    validationFile = gol.get_val("validationFile")
    testFile = gol.get_val("testFile")
    classes = DataLoader.loadClasses(trainFile, validationFile,
                                     testFile)  # a list
    gol.set_val("classes", classes)
Example #3
def init_feature():
    # features
    Train_X = gol.get_val("Train_X")
    Train_Y = gol.get_val("Train_Y")
    trainFile = gol.get_val("trainFile")
    # select half of the features (plus one), capped at 75
    feature_number = min(Train_X.shape[1] / 2 + 1, 75)
    fea = FeatureSelection.select_features(trainFile, Train_X, Train_Y,
                                           feature_number)
    sel_features_backup = fea[0]
    feature_F1 = fea[1]
    feature_F2 = fea[2]
    feature_F3 = fea[3]
    feature_F4 = fea[4]
    sel_features = [feature_F1, feature_F2, feature_F3, feature_F4]
    # map each feature-selection method name to its index
    feature_method_index = dict(
        (c, i) for i, c in enumerate(FeatureSelection.feature_method))
    gol.set_val("sel_features", sel_features)
    gol.set_val("feature_number", feature_number)
    gol.set_val("feature_method_index", feature_method_index)
    gol.set_val("feature_method", FeatureSelection.feature_method)
Example #4
def eval_func_information_gain(chromosome):
    """
    # Calculate the information gain
    # The data is all training set
    """
    Train_Y = gol.get_val("Train_Y")
    classes = gol.get_val("classes")
    EcocMatrix, features_used_list = TMConvertor.getMatrixDirectly_and_feature(
        chromosome)
    infor_gain = information_gain(Train_Y, classes, EcocMatrix)
    return np.mean(infor_gain)
Example #5
def eval_func_entropy(chromosome):
    """
    # Calculate the complexity named "means"
    # The data is all training set
    """
    Train_Y = gol.get_val("Train_Y")
    classes = gol.get_val("classes")
    EcocMatrix, features_used_list = TMConvertor.getMatrixDirectly_and_feature(
        chromosome)
    entropy = information_entropy(Train_Y, classes, EcocMatrix)
    return np.mean(entropy)
Example #6
def init_dataset():
    trainFile = gol.get_val("trainFile")
    testFile = gol.get_val("testFile")
    validationFile = gol.get_val("validationFile")
    Train_X, Train_Y, validation_X, validation_Y, Test_X, Test_Y = DataLoader.loadDataset(
        trainFile, validationFile, testFile)
    class_num = len(np.unique(Train_Y))
    length = len(Train_Y) + len(validation_Y) + len(Test_Y)
    gol.set_val("Train_X", Train_X)
    gol.set_val("Train_Y", Train_Y)
    gol.set_val("validation_X", validation_X)
    gol.set_val("validation_Y", validation_Y)
    gol.set_val("Test_X", Test_X)
    gol.set_val("Test_Y", Test_Y)
Example #7
    def __init__(self, estimator, random_state=None):
        self.classes = gol.get_val("classes")
        self.classes_ = None
        self.code_book_ = None
        self.estimator = estimator
        self.estimators_ = None
        self.estimator_type = None
        self.featuresNames = gol.get_val("feature_method")
        self.infos_evaluations = []
        self.n_jobs = gol.get_val("n_jobs")
        self.random_state = random_state
        self.storager = Storager(gol.get_val("root_path"),
                                 gol.get_val("dataName"), self.estimator)
        self.trainX = None
        self.weights = None
Example #8
File: Utils.py  Project: samuellees/gpecoc
def get_gene_from_bank(features, ecocmatrix, confusion_matrix, classes):
    # prepare
    genebank = gol.get_val("genebank")
    DETA = 0.01  # to avoid division by zero when calculating accuracy
    # calculate the errors
    errors = list()
    for i in xrange(len(classes)):
        errors.append(1 - float(confusion_matrix[i, i]) /
                      np.sum(confusion_matrix[i, :]))
    # select classes whose error is above the average
    avg_errors = np.mean(errors)
    hard_classes = np.where(np.array(errors) > avg_errors)[0]
    if len(hard_classes) == 0: return  # accuracy is already 1
    # score every gene and sort in descending order
    score_tuple = []
    for (fcolumn, est_accuracy, class_accuracies,
         used_frequence) in genebank.genes:
        scores = [(errors[i] - (1 - class_accuracies[i])) * abs(fcolumn[i + 1])
                  for i in hard_classes]
        score = sum(scores) / (sum(
            [abs(fcolumn[i + 1]) + DETA for i in hard_classes]))
        score_tuple.append((fcolumn, score))
    score_tuple = sorted(score_tuple, key=lambda s: s[1], reverse=True)
    # select most suitable column
    candidate = None
    for (fcolumn, score) in score_tuple:
        if not _check_duplicate(ecocmatrix, fcolumn):
            candidate = fcolumn
            break
    return candidate
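A toy run of the scoring rule with made-up numbers (index 0 of `fcolumn` holds the feature id, so `fcolumn[i + 1]` is the code for class i): a gene scores high when its per-class accuracy beats the current error on the hard classes and its column actually covers them.

import numpy as np

DETA = 0.01
errors = [0.5, 0.1, 0.4]            # current per-class error of the ECOC matrix
hard_classes = [0, 2]               # classes with above-average error
fcolumn = np.array([2, 1, 0, -1])   # feature id 2, then codes for classes 0..2
class_accuracies = [0.8, 0.9, 0.7]  # the gene's historical per-class accuracy

scores = [(errors[i] - (1 - class_accuracies[i])) * abs(fcolumn[i + 1])
          for i in hard_classes]
score = sum(scores) / sum(abs(fcolumn[i + 1]) + DETA for i in hard_classes)
print score  # ~0.198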
Example #9
    def predictFinal_withoutlocalimprovement(self, features_used_list,
                                             sel_features, train_X, train_Y,
                                             valid_X, valid_Y, test_X, test_Y):

        self.feature_name = features_used_list
        self.trainX = np.append(train_X, valid_X, axis=0)
        self.trainY = np.append(train_Y, valid_Y)
        self.sel_features = sel_features

        feature_method_index = gol.get_val("feature_method_index")

        self.fit(self.trainX, self.trainY, features_used_list, sel_features,
                 self.code_book_)
        check_is_fitted(self, 'estimators_')

        Y = []
        for i in xrange(len(self.estimators_)):
            pre = corrected_predict_binary(
                self.estimators_[i], test_X[:, sel_features[
                    feature_method_index[features_used_list[i]]]])
            Y.append(pre)
        Y = np.array(Y).T
        if self.estimator_type == 'decision_function':
            Y = _sigmoid_normalize(Y)
        pred = self.get_distances(Y, self.code_book_,
                                  Weighted=True).argmin(axis=1)
        self.conMatrix = confusion_matrix(test_Y, self.classes_[pred])
        score, accuracy = self.calculateFScore(self.classes_[pred], test_Y)
        return score, accuracy
Example #10
    def predict_withoutlocalimprovement(self, features_used_list, sel_features,
                                        Train_X, Train_Y, Valid_X, valid_Y):
        self.feature_name = features_used_list
        self.trainX = Train_X
        self.trainY = Train_Y
        self.sel_features = sel_features

        check_is_fitted(self, 'estimators_')
        feature_method_index = gol.get_val("feature_method_index")

        Y = []
        for i in xrange(len(self.estimators_)):
            self.storager.setfeaturecode(
                sel_features[feature_method_index[features_used_list[i]]],
                self.code_book_[:, i])
            pre = self.storager.load_prediction_valid()
            if pre is None:
                pre = corrected_predict_binary(
                    self.estimators_[i], Valid_X[:, sel_features[
                        feature_method_index[features_used_list[i]]]])
                self.storager.save_prediction_valid(pre)
            Y.append(pre)
        Y = np.array(Y).T

        if self.estimator_type == 'decision_function':
            Y = _sigmoid_normalize(Y)
        pred = self.get_distances(Y, self.code_book_).argmin(axis=1)
        self.conMatrix = confusion_matrix(valid_Y, self.classes_[pred])

        score, accuracy = self.calculateFScore(self.classes_[pred], valid_Y)
        return score, accuracy
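The decoding shared by these predict paths is standard ECOC: stack each binary classifier's outputs into one row per sample, then pick the class whose codeword is nearest. A minimal sketch with plain Euclidean distance (the class itself uses a weighted, corrected variant via get_distances):

import numpy as np
from sklearn.metrics import euclidean_distances

code_book = np.array([[ 1,  1],    # codeword of class 0
                      [ 1, -1],    # codeword of class 1
                      [-1, -1]])   # codeword of class 2
outputs = np.array([[ 0.9, -0.8],  # per-sample classifier outputs
                    [-1.0, -0.9]])
pred = euclidean_distances(outputs, code_book).argmin(axis=1)
print pred  # [1 2]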
Example #11
    def get_distances(self, output_y, code_book_, Weighted=False):
        # unless the weights are already fixed (Weighted=True), recompute
        # them from the validation set
        if not Weighted:
            valid_y = gol.get_val("validation_Y")
            self.weights = get_weights(output_y, code_book_, valid_y)
        return weighting_corrected_euclidean_distances(output_y, code_book_,
                                                       self.weights)
Example #12
    def fit(self, X, y, features_used_list, sel_features, code_book):

        _check_estimator(self.estimator)
        if hasattr(self.estimator, "decision_function"):
            self.estimator_type = 'decision_function'  # output in (-inf, +inf)
        else:
            self.estimator_type = 'predict_proba'  # output in [0, 1]

        self.classes_ = np.unique(np.sort(y))
        self.code_book_ = code_book
        self.trainX = X
        feature_method_index = gol.get_val("feature_method_index")
        classes_index = dict((c, i) for i, c in enumerate(self.classes_))
        extend_ecocmatrix = np.array(
            [self.code_book_[classes_index[y[i]]] for i in range(X.shape[0])],
            dtype=int)
        # try to restore estimators from cache
        self.estimators_ = list()
        for i in range(code_book.shape[1]):
            _column = self.code_book_[:, i]
            _features = feature_method_index[features_used_list[i]]
            self.storager.setfeaturecode(sel_features[_features], _column)
            est = self.storager.load_estimator_train()
            if est is None:
                # need training
                est = corrected_fit_binary(self.estimator,
                                           X[:, sel_features[_features]],
                                           extend_ecocmatrix[:, i])
                self.storager.save_estimator_train(est)
            self.estimators_.append(est)
        return self
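In isolation, `extend_ecocmatrix` gives every training sample the codeword of its class, so column i of the result is the ±1/0 target vector for the i-th binary classifier. A small hypothetical sketch:

import numpy as np

code_book = np.array([[ 1, -1],
                      [-1,  1],
                      [ 1,  1]])
classes_ = np.array(['a', 'b', 'c'])
y = np.array(['b', 'a', 'c', 'a'])
classes_index = dict((c, i) for i, c in enumerate(classes_))
extend_ecocmatrix = np.array([code_book[classes_index[label]] for label in y],
                             dtype=int)
print extend_ecocmatrix[:, 0]  # targets for the first classifier: [-1  1  1  1]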
Example #13
def eval_func_fscore(chromosome):
    """
    # calculate fscore
    """
    EcocMatrix, features_used_list = TMConvertor.getMatrixDirectly_and_feature(
        chromosome)
    Train_X = gol.get_val("Train_X")
    Train_Y = gol.get_val("Train_Y")
    validation_X = gol.get_val("validation_X")
    validation_Y = gol.get_val("validation_Y")
    sel_features = gol.get_val("sel_features")

    cc = CC(features_used_list, sel_features, EcocMatrix)
    fscore, accuracy, infos_evaluations = cc.TrainAndTest(
        Train_X, Train_Y, validation_X, validation_Y)
    chromosome.infos_evaluation = infos_evaluations
    return fscore, accuracy
Example #14
def logMiddleInfo_callback(gp_engine):
    Train_X = gol.get_val("Train_X")
    Train_Y = gol.get_val("Train_Y")
    validation_X = gol.get_val("validation_X")
    validation_Y = gol.get_val("validation_Y")
    Test_X = gol.get_val("Test_X")
    Test_Y = gol.get_val("Test_Y")
    sel_features = gol.get_val("sel_features")

    import sys
    from utils import delog
    sys.stdout.write("logMiddleInfo...")
    genid = gp_engine.getCurrentGeneration()

    best = gp_engine.bestIndividual()
    FinalMatrix, features_used_list = TMConvertor.getMatrixDirectly_and_feature(best)

    # result with local improvement
    cc = CC(features_used_list, sel_features, FinalMatrix)
    finalScore, finalAccuracy, infos_evaluations = cc.FinalTrainAndTest(Train_X, Train_Y, validation_X, validation_Y, Test_X, Test_Y)
    delog.logMiddle(genid, finalAccuracy, "AAAAA")
    delog.logMiddle(genid, finalScore, "AAAAAfscore")

    # result without local improvement
    cc = CC(features_used_list, sel_features, FinalMatrix)
    cc.TrainAndTest_withoutlocalimp(Train_X, Train_Y, validation_X, validation_Y)
    _finalScore, _finalAccuracy = cc.FinalTrainAndTest_withoutlocalimp(Train_X, Train_Y, validation_X, validation_Y, Test_X, Test_Y)
    delog.logMiddle(genid, _finalAccuracy, "BestAcc_no_impro")
    delog.logMiddle(genid, _finalScore, "BestFscore_no_impro")

    sys.stdout.write("over\n")
    sys.stdout.flush()
Example #15
def selfDefined_GTreeGPMutatorSubtree(genome, **args):
   """
      The self defined mutator of GTreeGP, Subtree Mutator
      This mutator will recreate random subtree of the tree using the grow algorithm.
   """
   classes = gol.get_val("classes")
   ga_engine = args["ga_engine"]
   max_depth = genome.getParam("max_depth", None)
   mutations = 0

   if max_depth is None:
      Util.raiseException("You must specify the max_depth genome parameter !", ValueError)
   if max_depth < 0:
      Util.raiseException("The max_depth must be >= 1, if you want to use GTreeGPMutatorSubtree crossover !",
                          ValueError)

   if Util.randomFlipCoin(args["pmut"]):
      Illegal = True
      while Illegal:
         new_genome = copy.deepcopy(genome)
         node = new_genome.getRandomNode()
         assert node is not None
         depth = new_genome.getNodeDepth(node)
         node_parent = node.getParent()
         mutations += 1
         root_subtree = GTreeNode.buildGTreeGPGrow(ga_engine, 0, max_depth - depth)
         if node_parent is None:
            new_genome.setRoot(root_subtree)
         else:
            root_subtree.setParent(node_parent)
            node_parent.replaceChild(node, root_subtree)
         new_genome.processNodes()

         # illegal ? 
         # Actually, case #1 and case #2 may not happen
         Illegal = False
         ecocMatrix, feature_list = TMConverter.getMatrixDirectly_and_feature(new_genome)

         # 1. too few columns
         if LC.tooLittleColumn(ecocMatrix):
            Illegal = True
         # 2. too many columns
         elif LC.tooMuchColumn(ecocMatrix):
            Illegal = True
         # 3. some class is missing from the terminal nodes
         else:
            labels = set(classes)
            for i in new_genome.nodes_list:
               if i.isLeaf():
                  labels = labels - set(i.getData())
            labels = list(labels)
            if len(labels) > 0:
               Illegal = True
      genome.setRoot(new_genome.getRoot())
      genome.processNodes()
   return int(mutations)
Example #16
def eval_func_eucdist(chromosome):
    """
    # calculate avg_euclidean_dist of a individual
    """
    EcocMatrix, features_used_list = TMConvertor.getMatrixDirectly_and_feature(
        chromosome)
    classes = gol.get_val("classes")
    num_class = len(classes)
    num_cols = EcocMatrix.shape[1]
    _dist = euclidean_distances(EcocMatrix, EcocMatrix) / np.sqrt(num_cols)
    dist = np.sum(_dist) / 2 / (num_class * (num_class - 1))
    return dist
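Numerically, with a hypothetical 3-class, 2-column matrix (row distances are normalized by the square root of the column count):

import numpy as np
from sklearn.metrics import euclidean_distances

EcocMatrix = np.array([[ 1, -1],
                       [-1,  1],
                       [ 1,  1]])
num_class, num_cols = EcocMatrix.shape
_dist = euclidean_distances(EcocMatrix, EcocMatrix) / np.sqrt(num_cols)
dist = np.sum(_dist) / 2 / (num_class * (num_class - 1))
print dist  # ~0.8047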
Example #17
File: Utils.py  Project: samuellees/gpecoc
def update_referred_times(referred_fcolumn):
    # prepare
    genebank = gol.get_val("genebank")
    # find
    # a column matches if it is identical, or if it shares the feature id
    # (index 0) and flips every sign (the same bipartition)
    for (fcolumn, est_accuracy, class_accuracies,
         used_frequence) in genebank.genes:
        if np.array_equal(referred_fcolumn, fcolumn) or (
                referred_fcolumn[0] == fcolumn[0]
                and np.array_equal(referred_fcolumn[1:] * -1, fcolumn[1:])):
            # update
            used_frequence[0] += 1
            break
Example #18
def eval_func_hamdist(chromosome):
    """
    # calculate hamdist of a individual
    """
    EcocMatrix, features_used_list = TMConvertor.getMatrixDirectly_and_feature(
        chromosome)
    classes = gol.get_val("classes")
    dist = 0
    for i in xrange(len(EcocMatrix)):
        for j in xrange(i + 1, len(EcocMatrix)):
            dist += distance.hamming(EcocMatrix[i], EcocMatrix[j])
    num = len(classes) * (len(classes) - 1) / 2
    dist /= num
    return dist
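Note that scipy's hamming returns the fraction of differing positions, so each pairwise term is already normalized by codeword length. A toy run on a hypothetical matrix:

import numpy as np
from scipy.spatial.distance import hamming

EcocMatrix = np.array([[ 1, -1],
                       [-1,  1],
                       [ 1,  1]])
dist = 0.0
for i in xrange(len(EcocMatrix)):
    for j in xrange(i + 1, len(EcocMatrix)):
        dist += hamming(EcocMatrix[i], EcocMatrix[j])
print dist / 3  # 3 unordered pairs of 3 classes -> ~0.667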
Example #19
File: Utils.py  Project: samuellees/gpecoc
def add_gene_from_matrix(features, ecocmatrix, output_y, valid_y, classes):
    # prepare
    ADD_PERCENT = 0.3  # the fraction of columns to be saved
    DETA = 0.01  # to avoid division by zero when calculating accuracy
    genebank = gol.get_val("genebank")
    output_y_bin = np.array(output_y)
    # binarize
    for ith_output in output_y_bin:
        ith_output[ith_output > 0] = 1
        ith_output[ith_output < 0] = -1
    # count correct and incorrect predictions of every base classifier for every class.
    # e.g. t_num[i][j] is how many times the ith classifier correctly recognized the jth class.
    t_num = np.zeros((ecocmatrix.shape[1], ecocmatrix.shape[0]))
    f_num = np.zeros((ecocmatrix.shape[1], ecocmatrix.shape[0]))
    classDict = dict((j, i) for i, j in enumerate(classes))
    for i in xrange(len(output_y_bin)):
        ith_output = output_y_bin[i]
        ith_codeword = ecocmatrix[classDict[valid_y[i]]]
        for j in xrange(len(ith_codeword)):
            if ith_codeword[j] == 0: continue
            if ith_codeword[j] == ith_output[j]:
                t_num[j][classDict[valid_y[i]]] += 1
            else:
                f_num[j][classDict[valid_y[i]]] += 1
    # calculate the accuracy of every base_classifier.
    ests_accuracy = [
        float(sum(t_num[i])) / (sum(t_num[i]) + sum(f_num[i]) + DETA)
        for i in xrange(ecocmatrix.shape[1])
    ]
    # calculate accuracy of every base_classifier for every class.
    est_class_accuracies = []
    for i in xrange(ecocmatrix.shape[1]):
        est_class_accuracies.append([
            (i, j, float(t_num[i][j]) / (t_num[i][j] + f_num[i][j] + DETA))
            for j in xrange(ecocmatrix.shape[0])
        ])
    # save n randomly chosen columns to the genebank
    import random
    n = np.ceil(ecocmatrix.shape[1] * ADD_PERCENT)
    for i in xrange(int(n)):
        est_index = random.randint(0, ecocmatrix.shape[1] - 1)
        est_accuracy = ests_accuracy[est_index]
        feature = features[est_index]
        col = ecocmatrix[:, est_index]
        fcolumn = np.hstack((feature, col))
        class_accuracies = [
            accuracy
            for (i_est, j_cls, accuracy) in est_class_accuracies[est_index]
        ]
        genebank.addgene(fcolumn, est_accuracy, class_accuracies, [0])
Example #20
def Operation_F1(a, b):
    features = gol.get_val("feature_method")
    a = list(a)
    b = list(b)
    for i in xrange(len(features)):
        if features[i] in a:
            a.remove(features[i])
        if features[i] in b:
            b.remove(features[i])
    sets = set(a) | set(b)
    function = "fclassify"
    result = list(sets)
    result.insert(0, function)
    return result
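Semantically, Operation_F1 drops any feature-method tags from its two child lists and unions the remaining class labels under an 'fclassify' head. A rough set-based equivalent (the real code removes at most one occurrence of each tag per argument; names here are hypothetical):

feature_method = ['F1', 'F2', 'F3', 'F4']  # stand-in for gol's "feature_method"
a = ['F2', 'c1', 'c2']                     # left subtree: a method tag plus labels
b = ['c2', 'c3']
merged = (set(a) - set(feature_method)) | (set(b) - set(feature_method))
print ['fclassify'] + list(merged)         # e.g. ['fclassify', 'c1', 'c2', 'c3']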
Example #21
def logResultEveryGen_callback(gp_engine):
    if gp_engine.getCurrentGeneration() == 0:
        print "=" * 65
        format_str = 'Gen' + ' ' * 12 + '%-8s  %-8s  %-8s %-10s   %-10s   %-10s'
        print(format_str % ('Max', 'Min', 'Avg', 'Best-Fscore', 'Best-Hamdist', 'Best-Accuracy'))
    np.set_printoptions(threshold='nan') 
    # do in every generation
    best = gp_engine.getPopulation().bestRaw()
    bestMatrix , feature_list = TMConvertor.getMatrixDirectly_and_feature(best)
    feature_method_index = gol.get_val("feature_method_index")
    feature_index_list = list(feature_method_index[method] for method in feature_list)
    bestMatrix = np.ndarray.tolist(bestMatrix)
    bestMatrix.insert(0, feature_index_list)  # first row: each column's feature index
    print np.array(bestMatrix)
Example #22
def init_config():
    dataName = gol.get_val("dataName")
    gol.set_val("n_jobs", Configs.n_jobs)
    gol.set_val("version", Configs.version)
    gol.set_val("testFile", "data/" + dataName + "_test.data")
    gol.set_val("trainFile", "data/" + dataName + "_train.data")
    gol.set_val("validationFile", "data/" + dataName + "_validation.data")
    gol.set_val("root_path", Configs.root_path)
    gol.set_val("growMethod", Configs.growMethod)
    gol.set_val("freq_stats", Configs.freq_stats)
    gol.set_val("generations", Configs.generations)
    gol.set_val("n_neighbors", Configs.n_neighbors)
    gol.set_val("mutationRate", Configs.mutationRate)
    gol.set_val("crossoverRate", Configs.crossoverRate)
    gol.set_val("populationSize", Configs.populationSize)
Example #23
def main_run():
    ##########################################
    # variables preparation
    ##########################################
    Initializator.init_gol()
    gol.set_val("aimFolder", Configs.aimFolder)
    gol.set_val("dataName", Configs.dataName)
    Initializator.init_all()
    classes = gol.get_val("classes")
    maxDeap = gol.get_val("maxDeap")
    growMethod = gol.get_val("growMethod")
    generations = gol.get_val("generations")
    crossoverRate = gol.get_val("crossoverRate")
    mutationRate = gol.get_val("mutationRate")
    populationSize = gol.get_val("populationSize")
    freq_Stats = gol.get_val("freq_stats")
    Train_X = gol.get_val("Train_X")
    Train_Y = gol.get_val("Train_Y")
    validation_X = gol.get_val("validation_X")
    validation_Y = gol.get_val("validation_Y")
    Test_X = gol.get_val("Test_X")
    Test_Y = gol.get_val("Test_Y")
    sel_features = gol.get_val("sel_features")
    ##########################################

    genome = GTree.GTreeGP()
    genome.setParams(max_depth=maxDeap, method=growMethod)
    genome.evaluator += EM.eval_func_fscore

    ga = GSimpleGA.GSimpleGA(genome)
    ga.setParams(gp_terminals=classes, gp_function_prefix="Operation")
    ga.setMinimax(Consts.minimaxType["maximize"])
    ga.setGenerations(generations)
    ga.setCrossoverRate(crossoverRate)
    ga.setMutationRate(mutationRate)
    ga.setPopulationSize(populationSize)
    ga.setElitismReplacement(1)
    #ga.stepCallback.set(CB.printIndividuals_callback)
    ga.stepCallback += CB.checkAncients_callback
    ga.stepCallback += CB.logResultEveryGen_callback
    ga.stepCallback += CB.delogPopulation_callback
    ga.stepCallback += CB.logMiddleInfo_callback
    ga.stepCallback += CB.debug_callback

    print "------------------------------------------------------"

    ga(freq_stats=freq_Stats)
    best = ga.bestIndividual()

    # change the display flag to show test labels and predicted labels
    FinalMatrix, features_used_list = TMConvertor.getMatrixDirectly_and_feature(
        best)
    cc = ConnectClassifier(features_used_list, sel_features, FinalMatrix)
    finalScore, finalAccuracy, infos_evaluations = cc.FinalTrainAndTest(
        Train_X, Train_Y, validation_X, validation_Y, Test_X, Test_Y)

    # euddist
    num_class = len(classes)
    num_cols = FinalMatrix.shape[1]
    _dist = euclidean_distances(FinalMatrix, FinalMatrix) / np.sqrt(num_cols)
    dist = np.sum(_dist) / 2 / (num_class * (num_class - 1))

    infos_evaluations.append("---------test------------")
    infos_evaluations.append("fscore: %f" % finalScore)
    infos_evaluations.append("accuracy: %f" % finalAccuracy)
    infos_evaluations.append("dist: %f" % dist)

    for text in infos_evaluations:
        print text
Example #24
def DIYGTreeGPMutatorSubtree(genome, **args):
   """ The mutator of GTreeGP, Subtree Mutator

   This mutator will recreate random subtree of the tree using the grow algorithm.
   
   .. versionadded:: 0.6
      The *GTreeGPMutatorSubtree* function
   """
   classes = gol.get_val("classes")
   
   ind = genome
   Illegal = True
   while Illegal:

      #mutator
      if args["pmut"] <= 0.0: return 0
      ga_engine = args["ga_engine"]
      max_depth = genome.getParam("max_depth", None)
      mutations = 0

      if max_depth is None:
         Util.raiseException("You must specify the max_depth genome parameter !", ValueError)
      if max_depth < 0:
         Util.raiseException("The max_depth must be >= 1, if you want to use GTreeGPMutatorSubtree crossover !", ValueError)

      node = genome.getRandomNode()
      assert node is not None
      if Util.randomFlipCoin(args["pmut"]):
         depth = genome.getNodeDepth(node)
         mutations += 1
         root_subtree = GTreeNode.buildGTreeGPGrow(ga_engine, 0, max_depth - depth)
         node_parent = node.getParent()
         if node_parent is None:
            genome.setRoot(root_subtree)
         else:
            root_subtree.setParent(node_parent)
            node_parent.replaceChild(node, root_subtree)
         genome.processNodes()

      # code_comp = genome.getCompiledCode()
      # collect the classes not yet covered by the terminal nodes
      labels = set(classes)
      nums = []
      for i in xrange(len(ind.nodes_list)):
         if ind.nodes_list[i].getType() == nodeType["TERMINAL"]:
            labels = labels - set(ind.nodes_list[i].getData())
            nums.append(i)
      labels = list(labels)
      #print labels
      if len(nums) >= len(classes):
         # substitute randomly
         while len(labels):
            for j in xrange(len(labels)):
               slice = random.sample(nums, 1) 
               ind.nodes_list[slice[0]].setData(labels[j])
            # recheck whether any class is still missing from the terminal nodes
            labels = set(classes)
            nums = []
            for i in xrange(len(ind.nodes_list)):
               if ind.nodes_list[i].getType() == nodeType["TERMINAL"]:
                  labels = labels - set(ind.nodes_list[i].getData())
                  nums.append(i)
            labels = list(labels)

      #illegal?
      Illegal = False
      ecocMatrix,feature_list = TMConverter.getMatrixDirectly_and_feature(ind)

      ### row checks ###
      # 1. two identical rows
      if LC.sameRows(ecocMatrix):
         Illegal = True
         continue
      # 2. a row of all zeros
      elif LC.zeroRow(ecocMatrix):
         Illegal = True
         continue

      # 3. too few columns
      if LC.tooLittleColumn(ecocMatrix):
         Illegal = True
   return int(mutations)
Example #25
def fisher_complexity_old(column, trainX, trainY, sel_features, thisfeature):
    """
    Fisher criterion, old version.
    """
    feature_method_index = gol.get_val("feature_method_index")
    fea_num = gol.get_val("feature_number")
    classes = gol.get_val("classes")

    miu1 = 0; miu2 = 0; sigma1 = 0; sigma2 = 0
    d1 = np.zeros(fea_num); d2 = np.zeros(fea_num)
    d3 = np.zeros((len(classes),fea_num))    # the sum of each feature for every class
    miu3 = [0]*len(classes) # the average of d3
    sigma3 = [0]*len(classes)
    fisher11 = 0; fisher12 = 0; fisher21 = 0; fisher22 = 0
    numleft = 0; numright = 0
    num = [list(trainY).count(classes[i]) for i in xrange(len(classes))]

    # which feature selection method?
    d = trainX[:, sel_features[feature_method_index[thisfeature]]]

    # per-class statistics
    for j in xrange(len(classes)):
        for m in xrange(len(trainY)):
            if (trainY[m] == classes[j]):
                d3[j,:] += d[m,:]
        if (column[j] == 1):
            numleft += num[j]
        elif (column[j] == -1):
            numright += num[j]
        miu3[j] = np.mean(d3[j])
        sigma3[j] = np.var(d3[j])


    for i in xrange(len(classes)):
        if (column[i] == 1):
            num[i] = float(num[i]) / numleft
            fisher12 += num[i] * sigma3[i]
        elif (column[i] == -1):
            num[i] = float(num[i]) / numright
            fisher22 += num[i] * sigma3[i]

    for i in xrange(len(classes)):
        for j in xrange(len(classes)):
            if (j > i):
                if (column[i] == 1 and column[j] == 1):
                    fisher11 += num[i] * num[j] * (miu3[i] - miu3[j]) * (miu3[i] - miu3[j])
                elif (column[i] == -1 and column[j] == -1):
                    fisher21 += num[i] * num[j] * (miu3[i] - miu3[j]) * (miu3[i] - miu3[j])

    # statistics of the two super-classes
    for j in xrange(len(classes)):
        if column[j] == 1:
            d1[:] += d3[j,:]
        elif column[j] == -1:
            d2[:] += d3[j,:]
    miu1 = np.mean(d1)
    miu2 = np.mean(d2)
    sigma1 = np.var(d1)
    sigma2 = np.var(d2)
    if sigma1 + sigma2 == 0 :   return 0
    fisher_sum = (miu1 - miu2) * (miu1 - miu2) / (sigma1 + sigma2)

    # combine the partial Fisher scores
    fisherSet = []
    fisherSet.append(fisher_sum)
    if (fisher11 != 0):
        fisher1 = float(fisher11 / fisher12)
        fisherSet.append(fisher1)
    if (fisher21 != 0):
        fisher2 = float(fisher21 / fisher22)
        fisherSet.append(fisher2)

    return np.mean(fisherSet)
Example #26
def _crossover_supplement_ecoc(raw_ind, coming_node):
    import numpy as np
    from utils import gol
    classes = gol.get_val("classes")
    leafs = raw_ind.getLeafs()
    datas = [_node_.getData() for _node_ in leafs]
    if len(classes) == len(np.unique(datas)): return

    # some class is missing
    import copy
    node_ind = copy.deepcopy(raw_ind)
    node_ind.setRoot(coming_node)
    node_ind.processNodes()
    leafs_coming = node_ind.getLeafs()

    leafs_old = copy.deepcopy(leafs)
    datas_old = [_node_.getData() for _node_ in leafs_old]
    for _node_ in leafs_coming:
        datas_old.remove(_node_.getData())

    class_lack = [cls for cls in classes if cls not in datas]

    # find redundant nodes; they will be replaced with the missing classes
    datas_temp_mid = []
    nodes_redundancy = []
    for _node_ in leafs_coming:
        if _node_.getData() in datas_old: nodes_redundancy.append(_node_)
        elif _node_.getData() in datas_temp_mid:
            nodes_redundancy.append(_node_)
        else:
            datas_temp_mid.append(_node_.getData())

    if len(nodes_redundancy) > 0:
        for i in xrange(len(nodes_redundancy)):
            nodes_redundancy[i].setData(class_lack[0])
            class_lack.remove(class_lack[0])
            if len(class_lack) <= 0: break

    # after replacing, some classes may still be missing
    if len(class_lack) > 0:
        import random
        from gp.GTreeNode import GTreeNodeGP
        max_depth = gol.get_val("maxDeap")
        nodes_grow_candidate = leafs_coming
        nodes_grow_candidate_new = nodes_grow_candidate

        # grow the tree to add the missing classes
        while len(class_lack) > 0:
            nodes_grow_candidate = nodes_grow_candidate_new
            nodes_grow_candidate_new = []
            for i in xrange(len(nodes_grow_candidate)):
                if raw_ind.getNodeDepth(nodes_grow_candidate[i]) < max_depth:
                    _newnode_old = GTreeNodeGP(
                        nodes_grow_candidate[i].getData(),
                        node_type=nodeType['TERMINAL'],
                        parent=nodes_grow_candidate[i])
                    _newnode_lack = GTreeNodeGP(class_lack[0],
                                                node_type=nodeType['TERMINAL'],
                                                parent=nodes_grow_candidate[i])
                    nodes_grow_candidate[i].addChild(_newnode_old)
                    nodes_grow_candidate[i].addChild(_newnode_lack)
                    nodes_grow_candidate[i].setType(nodeType['NONTERMINAL'])
                    nodes_grow_candidate[i].setData('Operation_F' +
                                                    str(random.randint(1, 4)))
                    class_lack.remove(class_lack[0])
                    if len(class_lack) <= 0: break
                    nodes_grow_candidate_new.append(_newnode_old)
                    nodes_grow_candidate_new.append(_newnode_lack)
            raw_ind.processNodes()
    raw_ind.processNodes()
Example #27
def checkAncients_callback(gp_engine):
    if gp_engine.getCurrentGeneration() != 0: return
    from utils import delog
    delog.decache("check first Gen...")

    begin = 0
    end = gol.get_val("populationSize")
    classes = gol.get_val("classes")
    population = gp_engine.getPopulation()
    for i in xrange(begin, end):
        genome = population[i]
        max_depth = genome.getParam("max_depth", None)

        # illegal?
        ecocMatrix, feature_list = TMConvertor.getMatrixDirectly_and_feature(genome)
        Illegal = False
        # 1. too few or too many columns
        if LCheckers.tooLittleColumn(ecocMatrix):
            Illegal = True
        elif LCheckers.tooMuchColumn(ecocMatrix):
            Illegal = True
        # 2. some class missing from the terminal nodes
        else:
            labels = set(classes)
            for node in genome.nodes_list:
                if node.isLeaf():
                    labels = labels - set(node.getData())
            labels = list(labels)
            if len(labels) > 0:
                Illegal = True

        if max_depth is None:
            Util.raiseException("You must specify the max_depth genome parameter !", ValueError)
        if max_depth < 0:
            Util.raiseException("The max_depth must be >= 1, if you want to use GTreeGPMutatorSubtree crossover !", ValueError)

        while Illegal:
            new_genome = copy.deepcopy(genome)
            node = new_genome.getRandomNode()
            assert node is not None
            depth = new_genome.getNodeDepth(node)
            node_parent = node.getParent()
            root_subtree = GTreeNode.buildGTreeGPGrow(gp_engine, 0, max_depth - depth)
            if node_parent is None:
                new_genome.setRoot(root_subtree)
            else:
                root_subtree.setParent(node_parent)
                node_parent.replaceChild(node, root_subtree)
            new_genome.processNodes()

            # illegal ? 
            # Actually, case #1 and case #2 may not happen
            Illegal = False
            ecocMatrix, feature_list = TMConvertor.getMatrixDirectly_and_feature(new_genome)

            # 1. too few columns
            if LCheckers.tooLittleColumn(ecocMatrix):
                Illegal = True
            # 2. too many columns
            elif LCheckers.tooMuchColumn(ecocMatrix):
                Illegal = True
            # 3. some class missing from the terminal nodes
            else:
                labels = set(classes)
                for node in new_genome.nodes_list:
                    if node.isLeaf():
                        labels = labels - set(node.getData())
                labels = list(labels)
                if len(labels) > 0:
                    Illegal = True

            # apply the mutation
            if not Illegal:
                genome.setRoot(new_genome.getRoot())
                genome.processNodes()

    #Update the scores of population
    delog.deprint_string( "over.")
    population.evaluate()
    population.sort()
Example #28
    def predictFinal(self, features_used_list, sel_features, train_X, train_Y,
                     valid_X, valid_Y, test_X, test_Y):
        self.feature_name = features_used_list
        self.trainY = train_Y
        self.sel_features = sel_features
        # prepare
        check_is_fitted(self, 'estimators_')
        feature_method_index = gol.get_val("feature_method_index")
        # try to restore output from cache
        output_y = []
        for i in xrange(len(self.estimators_)):
            _column = self.code_book_[:, i]
            _features = feature_method_index[features_used_list[i]]
            self.storager.setfeaturecode(sel_features[_features], _column)
            pre = self.storager.load_prediction_valid()
            if pre is None:
                pre = corrected_predict_binary(
                    self.estimators_[i], valid_X[:, sel_features[_features]])
                self.storager.save_prediction_valid(pre)
            output_y.append(pre)
        output_y = np.array(output_y).T
        if self.estimator_type == 'decision_function':
            output_y = _sigmoid_normalize(output_y)
        # get score and confusion matrix
        pred = self.get_distances(output_y, self.code_book_).argmin(axis=1)
        score, accuracy = self.calculateFScore(self.classes_[pred], valid_Y)
        self.conMatrix = confusion_matrix(valid_Y, self.classes_[pred])
        # log
        _message = "Performance without local improvement:"
        self.matrix_tracer(_message, score, accuracy)
        t_score, t_acc = self.predictFinal_withoutlocalimprovement(
            features_used_list, self.sel_features, train_X, train_Y, valid_X,
            valid_Y, test_X, test_Y)
        self.infos_evaluations.append("test-f-score:" + str(t_score))
        self.infos_evaluations.append("test-accuracy:" + str(t_acc))
        self.infos_evaluations.append(self.conMatrix)

        # adding column
        temp_features_name = dict(
            (c, i) for i, c in feature_method_index.items())
        add_counter = 0
        while True:
            features_digit = [
                feature_method_index[features_used_list[i]]
                for i in xrange(self.code_book_.shape[1])
            ]
            add_fcol = get_gene_from_bank(features_digit, self.code_book_,
                                          self.conMatrix, self.classes_)
            if add_fcol is not None:
                # prepare new_ecocmatrix and output_y
                new_ecocmatrix = np.hstack(
                    (self.code_book_, np.array([add_fcol[1:]]).transpose()))
                # rebuild output_y, since the outputs cannot be sigmoid-normalized one column at a time
                new_y = []
                for i in xrange(len(self.estimators_)):
                    _column = self.code_book_[:, i]
                    _features = feature_method_index[features_used_list[i]]
                    self.storager.setfeaturecode(sel_features[_features],
                                                 _column)
                    pre = self.storager.load_prediction_valid()
                    if pre is None:
                        pre = corrected_predict_binary(
                            self.estimators_[i],
                            valid_X[:, sel_features[_features]])
                        self.storager.save_prediction_valid(pre)
                    new_y.append(pre)
                # add new output of new column, need training
                new_estimator = self.fit_one(train_X, train_Y, sel_features,
                                             add_fcol)
                add_fcol_y = corrected_predict_binary(
                    new_estimator, valid_X[:, sel_features[add_fcol[0]]])
                new_y.append(add_fcol_y)
                new_y = np.array(new_y).T
                if self.estimator_type == 'decision_function':
                    new_y = _sigmoid_normalize(new_y)
                # calculate new accuracy and compare
                new_pred = self.get_distances(new_y,
                                              new_ecocmatrix).argmin(axis=1)
                new_score, new_accuracy = self.calculateFScore(
                    self.classes_[new_pred], valid_Y)
                # update if there is any improvement
                if new_accuracy > accuracy:
                    # if new_score >= score:
                    output_y = new_y
                    pred = new_pred
                    score = new_score
                    accuracy = new_accuracy
                    self.code_book_ = new_ecocmatrix
                    self.feature_name = np.hstack(
                        (features_used_list, temp_features_name[add_fcol[0]]))
                    features_used_list = self.feature_name
                    self.estimators_.append(new_estimator)
                    self.conMatrix = confusion_matrix(valid_Y,
                                                      self.classes_[pred])
                    # update used frequency
                    update_referred_times(add_fcol)
                    # update matrix tracer
                    add_counter += 1
                    _message = str(add_counter) + ": added one column:"
                    self.matrix_tracer(_message, score, accuracy)
                    t_score, t_acc = self.predictFinal_withoutlocalimprovement(
                        features_used_list, self.sel_features, train_X,
                        train_Y, valid_X, valid_Y, test_X, test_Y)
                    self.infos_evaluations.append("test-f-score:" + str(t_score))
                    self.infos_evaluations.append("test-accuracy:" + str(t_acc))
                    self.infos_evaluations.append(self.conMatrix)
                else:
                    # no improvement, stop.
                    break
            else:
                # genebank is empty, or no suitable column, stop.
                break
        ########
        # TEST #
        ########
        # retrain on train+valid (a different training set); try to restore estimators from cache.
        classes_index = dict((c, i) for i, c in enumerate(self.classes_))
        final_train_x = np.vstack((train_X, valid_X))
        final_train_y = np.hstack((train_Y, valid_Y))
        self.estimators_ = list()
        for i in range(self.code_book_.shape[1]):
            _column = self.code_book_[:, i]
            _features = feature_method_index[features_used_list[i]]
            self.storager.setfeaturecode(sel_features[_features], _column)
            est = self.storager.load_estimator_test()
            if est is None:
                # need training
                # j, not i: avoid shadowing the enclosing loop index
                extend_column = np.array([
                    _column[classes_index[final_train_y[j]]]
                    for j in xrange(final_train_x.shape[0])
                ], dtype=int)
                est = corrected_fit_binary(
                    self.estimator, final_train_x[:, sel_features[_features]],
                    extend_column)
                self.storager.save_estimator_test(est)
            self.estimators_.append(est)
        # predict on the test set; try to restore output from cache.
        output_y = []
        for i in xrange(len(self.estimators_)):
            _column = self.code_book_[:, i]
            _features = feature_method_index[features_used_list[i]]
            self.storager.setfeaturecode(sel_features[_features], _column)
            pre = self.storager.load_prediction_test()
            if pre is None:
                pre = corrected_predict_binary(
                    self.estimators_[i], test_X[:, sel_features[_features]])
                self.storager.save_prediction_test(pre)
            output_y.append(pre)
        output_y = np.array(output_y).T
        if self.estimator_type == 'decision_function':
            output_y = _sigmoid_normalize(output_y)
        # get score
        pred = self.get_distances(output_y, self.code_book_,
                                  Weighted=True).argmin(axis=1)
        score, accuracy = self.calculateFScore(self.classes_[pred], test_Y)
        return score, accuracy, self.infos_evaluations
Example #29
def debug_callback(gp_engine):
    # debugger hook: set a breakpoint here to inspect the genebank and generation
    genes = gol.get_val("genebank").genes
    genid = gp_engine.getCurrentGeneration()
Example #30
def printIndividuals_callback(gp_engine):
    import pydot
    global numnum
    New_Ind = GTree.GTreeGP()
    classes = gol.get_val("classes")
    numnum = numnum + 1
    begin = 0
    end = 20
    if gp_engine.getCurrentGeneration() != -1:
        population = gp_engine.getPopulation()
        graph = pydot.Dot(graph_type="digraph")
        n = 0
        filename = 'Tree' + str(numnum) + '.jpg'
        for i in xrange(begin, end):
            arrays = []
            ind = population[i]
            subg = pydot.Cluster("cluster_%d" % i, label="\"Ind. #%d - Score Raw/Fit.: %.4f/%.4f\"" % (i, ind.getRawScore(), ind.getFitnessScore()))
            count = n
            node_stack = []
            nodes_dict = {}
            tmp = None
            import __main__ as main_module

            # idx, not i: avoid shadowing the population index
            for idx in xrange(len(ind.nodes_list)):
                newnode = pydot.Node(str(count), style="filled")
                count += 1

                # node color
                if ind.nodes_list[idx].getType() == Consts.nodeType["TERMINAL"]:
                    newnode.set_color("lightblue2")
                else:
                    newnode.set_color("goldenrod2")

                # node content
                if ind.nodes_list[idx].getType() == Consts.nodeType["NONTERMINAL"]:
                    func = getattr(main_module, ind.nodes_list[idx].getData())

                    if hasattr(func, "shape"):
                        newnode.set_shape(func.shape)

                    if hasattr(func, "representation"):
                        newnode.set_label(func.representation)
                    else:
                        for j in xrange(0, len(classes)):
                            locals()[classes[j]] = classes[j]

                        New_Ind.setRoot(ind.nodes_list[idx])
                        array = eval(New_Ind.getCompiledCode())
                        newnode.set_label(str(array))
                    # if hasattr(func, "color"): newnode.set_color(func.color)
                else:
                    newnode.set_label(ind.nodes_list[idx].getData())

                nodes_dict.update({ind.nodes_list[idx]: newnode})
                graph.add_node(newnode)

            node_stack.append(ind.getRoot())
            while len(node_stack) > 0:
                tmp = node_stack.pop()

                parent = tmp.getParent()
                if parent is not None:
                    parent_node = nodes_dict[parent]
                    child_node = nodes_dict[tmp]

                    newedge = pydot.Edge(parent_node, child_node)
                    graph.add_edge(newedge)

                rev_childs = tmp.getChilds()[:]
                rev_childs.reverse()
                node_stack.extend(rev_childs)
            n = count
            graph.add_subgraph(subg)
        graph.write(filename, prog='dot', format="jpeg")