Example #1
def cv_scores_scales(X,y,clf,cv,amount,information):        
    X,y = shuffle_set(X,y)
    sc = 1
    scorings = [[]]
    score = [[]]
    predict = [[]]
    guessed = [[]]
    predicts = [[],[]]
    time = [0,0]
    X,y  = split(X,y,amount)
    for i in range(0,cv):
        cv_clf = copy(clf)
        if i == 0 or i == cv - 1:
            if i== 0 :
                X_train = X[0:len(X)-len(X)//cv]
                X_test = X[len(X)-len(X)//cv:len(X)]
                y_train = y[0:len(y)-len(y)//cv]
                y_test = y[len(y)-len(y)//cv:len(y)]
            else:
                X_train = X[len(X)//cv:len(X)]
                X_test = X[0:len(X)//cv]
                y_train = y[len(y)//cv:len(y)]
                y_test = y[0:len(y)//cv]                
        else:
            X_train = X[0:len(X)//cv*i]
            X_train.extend(X[len(X)//cv*(i+1):len(X)])
            X_test = X[len(X)//cv*i:len(X)//cv*(i+1)]
            y_train = y[0:len(y)//cv*i]
            y_train.extend(y[len(y)//cv*(i+1):len(y)])
            y_test = y[len(y)//cv*i:len(y)//cv*(i+1)]
        with stopwatch() as sw:
            _ = cv_clf.fit(X_train,y_train)
        time[0] = time[0] + sw.duration
        with stopwatch() as sw:
            predict[0] = cv_clf.predict(X_test)
        time[1] = time[1] + sw.duration
        for k in range(0,sc):
            score[k] = 0
        if information >=2:            
            for k in range(0,sc):
                guessed[k].append(distr_guessed(predict[k]))        
                    
        for k in range(0,sc):
            scorings[k].append(accuracy_score(y_test,predict[k]))
        if information >= 3:
            for k in range(0,sc):
                predicts[k].append(predict[k])
            predicts[sc].append(y_test)
        
    
    if information >= 3:
        return scorings,guessed,predicts,time
    
    elif information == 2:
        return scorings,guessed
    else:
        return scorings
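All of these snippets time their work with a stopwatch() context manager (sometimes utils.stopwatch or ut.stopwatch) that exposes a .duration attribute after the block exits, but the helper itself never appears in this listing. A minimal sketch of such a helper, assuming only the interface visible at the call sites (the labelled ut.stopwatch('main') variant presumably also prints its label):

import time

class stopwatch:
    # Assumed interface: `with stopwatch() as sw: ...` followed by
    # reading `sw.duration` (elapsed wall-clock seconds as a float).
    def __enter__(self):
        self.start = time.perf_counter()
        self.duration = None
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.duration = time.perf_counter() - self.start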
Example #2
def information_theoretic_metafeatures(X, y, categorical):
	utils.input_check(X, y, categorical)
	features = OrderedDict()

	classes, counts = np.unique(y, return_counts = True)
	features["ClassEntropy"] = scipy.stats.entropy(counts, base = 2)

	# Information theoretic meta-features below only apply to categorical values
	if sum(categorical) == 0:
		return OrderedDict.fromkeys(information_theoretic_metafeature_names(), -1)

	with utils.stopwatch() as sw:
		feature_entropies = [scipy.stats.entropy(column[0]) for column in X[:,np.where(categorical)].T]
		mean_feature_entropy = np.mean(feature_entropies)
		features["MeanFeatureEntropy"] = np.mean(mean_feature_entropy)

		mutual_informations = [sklearn.metrics.mutual_info_score(y, column[0]) for column in X[:, np.where(categorical)].T]
		mean_mutual_information = np.mean(mutual_informations)
		features["MeanMutualInformation"] = mean_mutual_information

		if mean_mutual_information == 0:
			features["NoiseToSignalRatio"] = 0
		else:
			features["NoiseToSignalRatio"] = (mean_feature_entropy - mean_mutual_information) / mean_mutual_information

	features["InformationFeatureTime"] = sw.duration
	return features
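A hypothetical call on a tiny all-categorical dataset, assuming the surrounding module (utils.input_check and the *_metafeature_names helpers) is importable:

import numpy as np

X = np.array([[0, 1], [0, 0], [1, 1], [1, 0], [0, 1], [1, 0]])
y = np.array([0, 0, 1, 1, 0, 1])
mf = information_theoretic_metafeatures(X, y, categorical=[True, True])
print(mf["ClassEntropy"])  # 1.0 bit: the binary target is perfectly balanced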
Example #3
def ga_main1(model, out='result', clear_directory=False):
    ''' GA test & plot
    '''
    if clear_directory and os.path.isdir(out):
        shutil.rmtree(out)

    pop_size = 500
    epoch = 100
    ksize = 5
    save_trigger = lambda i: i == epoch  # only on the last epoch

    model_env = {'nsga2': NSGA2_ENV, 'moead': MOEAD_ENV}[model]

    with model_env(pop_size=pop_size, ksize=ksize) as optimizer:
        with ut.stopwatch('main'):
            # start the GA
            # generate the initial population
            optimizer.init_population()

            # evolve
            for i in range(1, epoch + 1):
                optimizer.advance()
                print('epoch:',
                      i,
                      'popsize:',
                      len(optimizer.population),
                      end='\r')
                if save_trigger(i):
                    file = f'pop_size{pop_size}_epoch{i}_{ut.strnow("%Y%m%d_%H%M")}.pickle'
                    optimizer.save(file=os.path.join(out, file))

    ga_result11(model, out=out, show=False)
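A hypothetical invocation; the 'nsga2'/'moead' keys select the optimizer environment through the model_env lookup table above:

ga_main1('nsga2', out='result', clear_directory=True)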
Example #4
def ga_main1(method_name, out='result', clear_directory=False):
    ''' Run the GA
    '''
    if clear_directory and os.path.isdir(out):
        shutil.rmtree(out)

    popsize = 100
    epoch = 100
    ksize = 5

    # get the optimization settings environment
    with M.Optimize_ENV(method_name, popsize=popsize, ksize=ksize) as env:
        optimizer = env.optimizer
        creator = env.creator
        with ut.stopwatch('main'):
            # start the GA
            # generate the initial population
            population = optimizer.init_population(creator, popsize=popsize)
            history = [population]

            # evolve
            for i in range(1, epoch + 1):
                population = optimizer(population)
                history.append(population)

                print('epoch:', i, 'popsize:', len(population), end='\r')
                if i == epoch:
                    # write the model to a file
                    file = f'popsize{popsize}_epoch{i}_{ut.snow}.pkl'
                    file = os.path.join(out, file)
                    print('save:', file)
                    # optimizer.save(file=os.path.join(out, file))
                    ut.save(file, (env, optimizer, history))
            return env, optimizer, history
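Unlike Example #3, this variant returns its state, so a hypothetical caller (assuming 'nsga2' is a method name M.Optimize_ENV accepts) can inspect the run afterwards:

env, optimizer, history = ga_main1('nsga2', out='result')
print(len(history))  # epoch + 1 population snapshots, including the initial one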
Example #5
def simple_metafeatures(X, y, categorical):
    utils.input_check(X, y, categorical)
    features = OrderedDict()
    n = X.shape[0]
    p = X.shape[1]

    with utils.stopwatch() as sw:
        features["NumberOfInstances"] = n
        features["LogNumberOfInstances"] = np.log(n)
        features["NumberOfFeatures"] = p
        features["LogNumberOfFeatures"] = np.log(p)
        features["DatasetRatio"] = p / n
        features["LogDatasetRatio"] = np.log(p / n)
        features["InverseDatasetRatio"] = n / p
        features["LogInverseDatasetRatio"] = np.log(n / p)

        classes, counts = np.unique(y, return_counts=True)
        nrNominal = sum(categorical)
        nrNumeric = len(categorical) - sum(categorical)

        features["NumberOfClasses"] = classes.shape[0]
        features["NumberOfCategoricalFeatures"] = nrNominal
        features["NumberOfNumericFeatures"] = nrNumeric

        features["RatioNumericalToNominal"] = nrNumeric / nrNominal if nrNominal > 0 else 0
        features["RatioNominalToNumerical"] = nrNominal / nrNumeric if nrNumeric > 0 else 0

        class_probabilities = [count / n for count in counts]
        features["ClassProbabilityMin"] = np.min(class_probabilities)
        features["ClassProbabilityMax"] = np.max(class_probabilities)
        features["ClassProbabilityMean"] = np.mean(class_probabilities)
        features["ClassProbabilitySTD"] = np.std(class_probabilities)

        symbols_per_column = [np.unique(column).shape[0]
                              for column in X[:, np.where(categorical)].T]
        if len(symbols_per_column) > 0:
            features["SymbolsMin"] = np.min(symbols_per_column)
            features["SymbolsMax"] = np.max(symbols_per_column)
            features["SymbolsMean"] = np.mean(symbols_per_column)
            features["SymbolsSTD"] = np.std(symbols_per_column)
            features["SymbolsSum"] = np.sum(symbols_per_column)
        else:
            features["SymbolsMin"] = features["SymbolsMax"] = features[
                "SymbolsMean"] = features["SymbolsSTD"] = features[
                    "SymbolsSum"] = 0

    features["SimpleFeatureTime"] = sw.duration
    # Missing-value features are omitted for now, since only datasets without missing values were selected.

    return features
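A hypothetical call on a toy dataset with one numeric and one categorical column, again assuming the utils module is importable:

import numpy as np

X = np.array([[1.0, 0], [2.0, 1], [3.0, 0], [4.0, 1]])
y = np.array([0, 1, 0, 1])
mf = simple_metafeatures(X, y, categorical=[False, True])
print(mf["NumberOfInstances"], mf["NumberOfClasses"])  # 4 2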
Example #6
def optimizeRF(did, amount):
    X, y = read_did(did)
    X = add_copy_features(X, amount)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    clf = RandomForestClassifier()
    params = {'max_features': range(1, len(X[0])),
              'min_samples_split': range(2, 20)}
    randomRF = RandomizedSearchCV(clf, param_distributions=params,
                                  n_iter=40, n_jobs=3)
    with stopwatch() as sw:
        _ = randomRF.fit(X_train, y_train)
    duration = sw.duration
    estimator = randomRF.best_estimator_
    cv_score = list(cross_val_score(randomRF.best_estimator_, X, y, cv=10))
    cv_score.append(randomRF.score(X_test, y_test))
    return estimator, cv_score, duration
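read_did and add_copy_features are external helpers not shown in this listing; did looks like a dataset id and amount the number of duplicated features to append. A hypothetical call (the id 31 is made up):

best_rf, scores, fit_seconds = optimizeRF(did=31, amount=0)
print(np.mean(scores[:-1]), scores[-1])  # mean 10-fold CV score, then the held-out test score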
Example #7
def simple_metafeatures(X, y, categorical):
	utils.input_check(X, y, categorical)
	features = OrderedDict()
	n = X.shape[0]
	p = X.shape[1]

	with utils.stopwatch() as sw:
		features["NumberOfInstances"] = n
		features["LogNumberOfInstances"] = np.log(n)
		features["NumberOfFeatures"] = p
		features["LogNumberOfFeatures"] = np.log(p)
		features["DatasetRatio"] = p / n
		features["LogDatasetRatio"] = np.log(p / n)
		features["InverseDatasetRatio"] = n / p 
		features["LogInverseDatasetRatio"] = np.log(n / p)

		classes, counts = np.unique(y, return_counts = True)
		nrNominal = sum(categorical)
		nrNumeric = len(categorical)-sum(categorical)

		features["NumberOfClasses"] = classes.shape[0]
		features["NumberOfCategoricalFeatures"] = nrNominal
		features["NumberOfNumericFeatures"] = nrNumeric

		features["RatioNumericalToNominal"] = nrNumeric / nrNominal if nrNominal > 0 else 0
		features["RatioNominalToNumerical"] = nrNominal / nrNumeric if nrNumeric > 0 else 0

		class_probabilities = [ count / n for count in counts ]
		features["ClassProbabilityMin"] = np.min(class_probabilities)
		features["ClassProbabilityMax"] = np.max(class_probabilities)
		features["ClassProbabilityMean"] = np.mean(class_probabilities)
		features["ClassProbabilitySTD"] = np.std(class_probabilities)

		symbols_per_column = [ np.unique(column).shape[0] for column in X[:, np.where(categorical)].T]
		if len(symbols_per_column) > 0:
			features["SymbolsMin"] = np.min(symbols_per_column)
			features["SymbolsMax"] = np.max(symbols_per_column)
			features["SymbolsMean"] = np.mean(symbols_per_column)
			features["SymbolsSTD"] = np.std(symbols_per_column)
			features["SymbolsSum"] = np.sum(symbols_per_column)
		else:
			features["SymbolsMin"] = features["SymbolsMax"] = features["SymbolsMean"] = features["SymbolsSTD"] = features["SymbolsSum"] = 0

	features["SimpleFeatureTime"] = sw.duration
	# Missing-value features are omitted for now, since only datasets without missing values were selected.

	return features
Example #8
def cross_validate_classifier(clf, X, y, folds):
	"""
	Performs a cross-validation experiment for the classifier clf,
	reporting mean accuracy and time to run (as well as total time,
	but this was later left out in the study).
	"""
	accuracies = []
	times = []

	for train, test in folds:
		with stopwatch() as sw:
			clf.fit(X[train, :], y[train])
			accuracy = clf.score(X[test, :], y[test])
			accuracies.append(accuracy)
		times.append(sw.duration)

	return np.mean(accuracies), sum(times), np.mean(times)
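The folds argument is assumed to be an iterable of (train_indices, test_indices) pairs; sklearn's KFold produces exactly that shape:

from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier

folds = list(KFold(n_splits=10, shuffle=True).split(X))
mean_acc, total_time, mean_time = cross_validate_classifier(
    DecisionTreeClassifier(), X, y, folds)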
Example #9
def optimizeADA(did, amount):
    X, y = read_did(did)
    iters = 40
    X = add_copy_features(X, amount)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    clf = AdaBoostClassifier()
    # sample candidate learning rates uniformly from [0.1, 2.0)
    learns = [random() * 1.9 + 0.1 for _ in range(iters * 4)]
    params = {'learning_rate': learns}
    randomADA = RandomizedSearchCV(clf, param_distributions=params,
                                   n_iter=iters, n_jobs=3)
    with stopwatch() as sw:
        _ = randomADA.fit(X_train, y_train)
    duration = sw.duration
    estimator = randomADA.best_estimator_
    cv_score = list(cross_val_score(randomADA.best_estimator_, X, y, cv=10))
    cv_score.append(randomADA.score(X_test, y_test))
    return estimator, cv_score, duration
Example #10
def cross_validate_classifier(clf, X, y, folds):
    """
	Performs a cross-validation experiment for the classifier clf,
	reporting mean accuracy and time to run (as well as total time,
	but this was later left out in the study).
	"""

    accuracies = []
    times = []

    for train, test in folds:
        with stopwatch() as sw:
            clf.fit(X[train], y[train])
            accuracy = clf.score(X[test], y[test])
            accuracies.append(accuracy)
        times.append(sw.duration)

    return np.mean(accuracies), sum(times), np.mean(times)
Example #11
def statistical_metafeatures(X, y, categorical):
    utils.input_check(X, y, categorical)
    features = OrderedDict()

    numerical = [not cat for cat in categorical]

    # Statistical meta-features apply only to the numerical attributes; if there are none, we list them as -1.
    # We should see if there is a better way to deal with this, as -1 is a valid value for some of these features.
    if sum(numerical) == 0:
        return OrderedDict.fromkeys(statistical_metafeature_names(), -1)

    with utils.stopwatch() as sw:
        # Taking the kurtosis of kurtosis and skewness of kurtosis is suggested by Reif et al. in Meta2-features (2012)
        kurtosisses = [
            scipy.stats.kurtosis(column[0])
            for column in X[:, np.where(numerical)].T
        ]
        features["KurtosisMin"] = np.min(kurtosisses)
        features["KurtosisMax"] = np.max(kurtosisses)
        features["KurtosisMean"] = np.mean(kurtosisses)
        features["KurtosisSTD"] = np.std(kurtosisses)
        features["KurtosisKurtosis"] = scipy.stats.kurtosis(kurtosisses)
        features["KurtosisSkewness"] = scipy.stats.skew(kurtosisses)

        skewnesses = [scipy.stats.skew(column[0])
                      for column in X[:, np.where(numerical)].T]
        features["SkewnessMin"] = np.min(skewnesses)
        features["SkewnessMax"] = np.max(skewnesses)
        features["SkewnessMean"] = np.mean(skewnesses)
        features["SkewnessSTD"] = np.std(skewnesses)
        features["SkewnessKurtosis"] = scipy.stats.kurtosis(skewnesses)
        features["SkewnessSkewness"] = scipy.stats.skew(skewnesses)

        standard_deviations = [
            np.std(column[0]) for column in X[:, np.where(numerical)].T
        ]
        features["MeanSTDOfNumerical"] = np.mean(standard_deviations)
        features["STDSTDOfNumerical"] = np.std(standard_deviations)

    features["StatisticalFeatureTime"] = sw.duration

    return features
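A hypothetical call on purely numerical data; with standard-normal samples the kurtosis and skewness means should land near 0:

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
y = rng.integers(0, 2, size=100)
mf = statistical_metafeatures(X, y, categorical=[False, False, False])
print(mf["KurtosisMean"], mf["SkewnessMean"])  # both roughly 0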
Example #12
def ga_main2(out='result', clear_directory=False):
    ''' GA test & plot
    '''
    if clear_directory and os.path.isdir(out):
        shutil.rmtree(out)

    epoch = 250
    save_trigger = lambda i: i == epoch  # only on the last epoch
    optimal_front = get_optomal_front()
    stat = []

    with MOEAD_ENV() as optimizer:
        for rep in range(100):
            with ut.stopwatch(f'epoch{epoch+1}'):
                optimizer.create_initial_population()
                for i in range(1, epoch + 1):
                    optimizer.advance()
                    print('epoch:',
                          i,
                          'popsize:',
                          len(optimizer.population),
                          end='\r')

            last_population = optimizer.get_individuals()
            last_population.sort(key=lambda x: x.value)

            conv = convergence(last_population, optimal_front)
            div = diversity(last_population, optimal_front[0],
                            optimal_front[-1])
            stat.append((conv, div))

            print("Convergence: ", conv)
            print("Diversity: ", div)

    print('=' * 20, 'Average', '=' * 20)
    print("Convergence: ", np.mean([x[0] for x in stat]))
    print("Diversity: ", np.mean([x[1] for x in stat]))
Example #13
def ga_main1(method_name, out='result', clear_directory=False):
    ''' Run the GA
    '''
    if clear_directory and os.path.isdir(out):
        shutil.rmtree(out)

    popsize = 100
    epoch = 100
    ksize = 5
    save_trigger = lambda i: i == epoch  # only on the last epoch

    model_env = {'nsga2': om.NSGA2_ENV, 'moead': om.MOEAD_ENV}[method_name]

    with model_env(popsize=popsize, ksize=ksize) as env:
        optimizer = env.optimizer
        creator = env.creator
        with ut.stopwatch('main'):
            # start the GA
            # generate the initial population
            population = optimizer.init_population(creator, popsize=popsize)
            history = [population]

            # evolve
            for i in range(1, epoch + 1):
                # optimizer.advance()
                population = optimizer(population)
                history.append(population)

                print('epoch:', i, 'popsize:', len(population), end='\r')
                if save_trigger(i):
                    file = f'popsize{popsize}_epoch{i}_{ut.strnow("%Y%m%d_%H%M")}.pkl'
                    file = os.path.join(out, file)
                    print('save:', file)
                    # optimizer.save(file=os.path.join(out, file))
                    ut.save(file, (env, optimizer, history))
            return env, optimizer, history
Example #14
def statistical_metafeatures(X, y, categorical):
	utils.input_check(X, y, categorical)
	features = OrderedDict()

	numerical = [not cat for cat in categorical]

	# Statistical meta-features apply only to the numerical attributes; if there are none, we list them as -1.
	# We should see if there is a better way to deal with this, as -1 is a valid value for some of these features.
	if sum(numerical) == 0:
		return OrderedDict.fromkeys(statistical_metafeature_names(), -1)

	with utils.stopwatch() as sw:
		# Taking the kurtosis of kurtosis and skewness of kurtosis is suggested by Reif et al. in Meta2-features (2012)
		kurtosisses = [scipy.stats.kurtosis(column[0]) for column in X[:,np.where(numerical)].T]	
		features["KurtosisMin"] = np.min(kurtosisses)
		features["KurtosisMax"] = np.max(kurtosisses)
		features["KurtosisMean"] = np.mean(kurtosisses)
		features["KurtosisSTD"] = np.std(kurtosisses)
		features["KurtosisKurtosis"] = scipy.stats.kurtosis(kurtosisses)
		features["KurtosisSkewness"] = scipy.stats.skew(kurtosisses)

		skewnesses = [scipy.stats.skew(column[0]) for column in X[:,np.where(numerical)].T]
		features["SkewnessMin"] = np.min(skewnesses)
		features["SkewnessMax"] = np.max(skewnesses)
		features["SkewnessMean"] = np.mean(skewnesses)
		features["SkewnessSTD"] = np.std(skewnesses)
		features["SkewnessKurtosis"] = scipy.stats.kurtosis(skewnesses)
		features["SkewnessSkewness"] = scipy.stats.skew(skewnesses)

		standard_deviations = [np.std(column[0]) for column in X[:,np.where(numerical)].T]
		features["MeanSTDOfNumerical"] = np.mean(standard_deviations)
		features["STDSTDOfNumerical"] = np.std(standard_deviations)

	features["StatisticalFeatureTime"] = sw.duration	

	return features
Example #15
def information_theoretic_metafeatures(X, y, categorical):
    utils.input_check(X, y, categorical)
    features = OrderedDict()

    classes, counts = np.unique(y, return_counts=True)
    features["ClassEntropy"] = scipy.stats.entropy(counts, base=2)

    # Information theoretic meta-features below only apply to categorical values
    if sum(categorical) == 0:
        return OrderedDict.fromkeys(information_theoretic_metafeature_names(), -1)

    with utils.stopwatch() as sw:
        feature_entropies = [
            scipy.stats.entropy(column[0])
            for column in X[:, np.where(categorical)].T
        ]
        mean_feature_entropy = np.mean(feature_entropies)
        features["MeanFeatureEntropy"] = np.mean(mean_feature_entropy)

        mutual_informations = [
            sklearn.metrics.mutual_info_score(y, column[0])
            for column in X[:, np.where(categorical)].T
        ]
        mean_mutual_information = np.mean(mutual_informations)
        features["MeanMutualInformation"] = mean_mutual_information

        if mean_mutual_information == 0:
            features["NoiseToSignalRatio"] = 0
        else:
            features["NoiseToSignalRatio"] = (
                mean_feature_entropy -
                mean_mutual_information) / mean_mutual_information

    features["InformationFeatureTime"] = sw.duration
    return features
Example #16
def cv_scores_features1(X,y,clf,cv,amount,information):
    dur = 0         
    X,y = shuffle_set(X,y)
#    amount = round(len(X[0])*amount)
    sc = 2
    scorings = [[],[]]
    score = [[],[]]
    predict = [[],[]]
    guessed = [[],[]]
    predicts = [[],[],[]]
    time = [0,0,0,0]
    for i in range(0,cv):
        cv_clf = copy(clf)
        cv_clf2 = copy(clf)
        if i == 0 or i == cv - 1:
            if i== 0 :
                X_train = X[0:len(X)-len(X)//cv]
                X_test = X[len(X)-len(X)//cv:len(X)]
                y_train = y[0:len(y)-len(y)//cv]
                y_test = y[len(y)-len(y)//cv:len(y)]
                train_X = remove_features(X_train,amount)
                test_X = remove_features(X_test,amount)
            else:
                X_train = X[len(X)//cv:len(X)]
                X_test = X[0:len(X)//cv]
                y_train = y[len(y)//cv:len(y)]
                y_test = y[0:len(y)//cv]
                train_X = remove_features(X_train,amount)
                test_X = remove_features(X_test,amount)
        else:
            X_train = X[0:len(X)//cv*i]
            X_train.extend(X[len(X)//cv*(i+1):len(X)])
            X_test = X[len(X)//cv*i:len(X)//cv*(i+1)]
            y_train = y[0:len(y)//cv*i]
            y_train.extend(y[len(y)//cv*(i+1):len(y)])
            y_test = y[len(y)//cv*i:len(y)//cv*(i+1)]
            train_X = remove_features(X_train,amount)
            test_X = remove_features(X_test,amount)
        with stopwatch() as sw:
            _ = cv_clf.fit(X_train,y_train)
        time[0] = time[0] + sw.duration
        with stopwatch() as sw:
            predict[0] = cv_clf.predict(X_test)
        time[1] = time[1] + sw.duration
        for k in range(0,sc):
            score[k] = 0
        with stopwatch() as sw:    
            _ = cv_clf2.fit(train_X,y_train)
        time[2] = time[2] + sw.duration
        with stopwatch() as sw:
            predict[1] = cv_clf2.predict(test_X)
        time[3] = time[3] + sw.duration
        if information >=2:            
            for k in range(0,sc):
                guessed[k].append(distr_guessed(predict[k]))        
                    
        for k in range(0,sc):
            scorings[k].append(accuracy_score(y_test,predict[k]))
        if information >= 3:
            for k in range(0,sc):
                predicts[k].append(predict[k])
            predicts[sc].append(y_test)
        
    
    if information >= 3:
        return scorings,guessed,predicts,time
    
    elif information == 2:
        return scorings,guessed
    else:
        return scorings
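A hypothetical call; shuffle_set, remove_features and distr_guessed are external helpers, and information (1, 2 or 3) controls how much detail is returned:

from sklearn.tree import DecisionTreeClassifier

scorings, guessed = cv_scores_features1(X, y, DecisionTreeClassifier(),
                                        cv=10, amount=2, information=2)
# scorings[0]: per-fold accuracy on the full feature set
# scorings[1]: per-fold accuracy after remove_features drops `amount` features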
Example #17
				# Calculate meta-features
				log("simple-mf")
				simple_features = simple_metafeatures(X_s, y_s, categorical)
				log("stat-mf")
				statistical_features = statistical_metafeatures(X_s, y_s, categorical)
				log("info-mf")
				info_features = information_theoretic_metafeatures(X_s, y_s, categorical)
				log("landmark-mf")
				landmark_features = landmarker_metafeatures(X_s, y_s, categorical, folds)

				# Run baselearner experiments
				baselearner_results = OrderedDict()
				for baselearner in config.base_learners:
					log("base-learners: {}".format(baselearner.__name__))	
					with stopwatch() as sw:
						baselearner().fit(X_s, y_s)

					baselearner_results[baselearner.__name__] = sw.duration # result if type(result) is float else "E"

				with open(config.document_name, 'a') as fh:
					feature_list = [[did, i], simple_features.values(), statistical_features.values(), info_features.values(),
									 landmark_features.values(), subsample_features.values(),baselearner_results.values()]
					list_as_string = ",".join([str(item) for sublist in feature_list for item in sublist])
					fh.write(list_as_string + "\n")
				del X_s, y_s

		del X, y, categorical, dataset, task

	except Exception as err:
		log_traceback(config.logfile_name)