예제 #1
0
def main(dataset, output, epsilon, capacity, width, kernel_type):
    """Train a multiclass LibSVM classifier and serialize it to an HDF5 file.

    Args:
        dataset: Path handed to ``LibSVMFile`` to load the training data.
        output: Path of the file the trained classifier is serialized to.
        epsilon: Value passed to ``svm.set_epsilon``.
        capacity: First argument of the ``MulticlassLibSVM`` constructor.
        width: Second argument of the kernel factory (logged as the
            Gaussian width).
        kernel_type: Key into ``KERNELS`` selecting the kernel factory.

    Raises:
        KeyError: If ``kernel_type`` is not a key of ``KERNELS``. The error
            is logged before being re-raised.
    """
    LOGGER.info("SVM Multiclass classifier")
    LOGGER.info("Epsilon: %s", epsilon)
    LOGGER.info("Capacity: %s", capacity)
    LOGGER.info("Gaussian width: %s", width)

    # Get features
    feats, labels = get_features_and_labels(LibSVMFile(dataset))

    # Create kernel. Only the lookup is guarded, so a KeyError raised inside
    # the kernel constructor itself is not misreported as an unknown type.
    try:
        kernel_factory = KERNELS[kernel_type]
    except KeyError:
        LOGGER.error(
            "Kernel %s not available. try Gaussian or Linear" % kernel_type)
        # Stop here: continuing would fail later on the unbound `kernel`
        # name with an error that hides the real cause.
        raise
    kernel = kernel_factory(feats, width)

    # Initialize and train Multiclass SVM
    svm = MulticlassLibSVM(capacity, kernel, labels)
    svm.set_epsilon(epsilon)
    with track_execution():
        svm.train()

    # Serialize to file
    writable_file = SerializableHdf5File(output, 'w')
    with closing(writable_file):
        svm.save_serializable(writable_file)
    LOGGER.info("Serialized classifier saved in: '%s'", output)
def main(classifier, testset, output):
    """Load a serialized multiclass SVM, classify a test set, save the labels.

    Args:
        classifier: Path of the serialized classifier, opened for reading
            through ``SerializableHdf5File``.
        testset: Path handed to ``LibSVMFile`` to load the test data.
        output: Path of the text file receiving one integer label per line.
    """
    LOGGER.info("SVM Multiclass evaluation")

    # Restore the trained model from its serialized form.
    svm = MulticlassLibSVM()
    with closing(SerializableHdf5File(classifier, 'r')) as model_file:
        svm.load_serializable(model_file)

    # Only the features are needed for prediction; the labels are ignored.
    test_feats, _ = get_features_and_labels(LibSVMFile(testset))
    predictions = svm.apply(test_feats)

    with open(output, 'w') as out:
        for label in predictions.get_labels():
            line = "%s\n" % int(label)
            out.write(line)

    saved_message = "Predicted labels saved in: '%s'" % output
    LOGGER.info(saved_message)
예제 #3
0
def main(classifier, testset, output):
    """Predict labels for a test set with a previously serialized SVM.

    The classifier is deserialized from ``classifier``, applied to the
    features loaded from ``testset``, and each predicted label is written
    to ``output`` as an integer on its own line.
    """
    LOGGER.info("SVM Multiclass evaluation")

    svm = MulticlassLibSVM()
    hdf5_source = SerializableHdf5File(classifier, 'r')
    with closing(hdf5_source):
        svm.load_serializable(hdf5_source)

    loaded = get_features_and_labels(LibSVMFile(testset))
    test_feats, test_labels = loaded
    predicted_labels = svm.apply(test_feats)

    with open(output, 'w') as sink:
        # Labels are converted lazily, one at a time, as they are written.
        sink.writelines(
            "%s\n" % int(value) for value in predicted_labels.get_labels())

    LOGGER.info("Predicted labels saved in: '%s'" % output)
예제 #4
0
def main(actual, predicted):
    """Compare predicted labels against the true labels of a dataset.

    Logs the multiclass accuracy and prints the confusion matrix to stdout.

    Args:
        actual: Path handed to ``LibSVMFile``; supplies the true labels.
        predicted: Path of a text file with one numeric label per line.
            Whitespace-only lines are ignored.

    Raises:
        ValueError: If a non-blank line of ``predicted`` is not a number.
    """
    LOGGER.info("SVM Multiclass evaluator")

    # Load SVMLight dataset (only its labels are used here)
    _, labels = get_features_and_labels(LibSVMFile(actual))

    # Load predicted labels, skipping blank lines that float() would reject
    with open(predicted, 'r') as f:
        predicted_labels_arr = np.array(
            [float(line) for line in f if line.strip()])
    predicted_labels = MulticlassLabels(predicted_labels_arr)

    # Evaluate accuracy
    multiclass_measures = MulticlassAccuracy()
    LOGGER.info("Accuracy = %s" % multiclass_measures.evaluate(
        labels, predicted_labels))
    LOGGER.info("Confusion matrix:")
    res = multiclass_measures.get_confusion_matrix(labels, predicted_labels)
    # Function-call form works on Python 3 and prints the same on Python 2.
    print(res)
예제 #5
0
def train_models(num_splits, num_iter, metric):
    """Cross-validate each algorithm on each training file, then tune the best.

    Every ``*.pkl`` file under ``data/train`` is loaded and each algorithm is
    scored on it with k-fold cross-validation. The mean and standard deviation
    of the scores are written to a timestamped CSV in ``experiments/``. For
    every algorithm except naive Bayes, the training file with the best score
    is then passed to ``do_train_model`` and a learning curve is plotted.

    Args:
        num_splits: Number of folds for ``KFold``; also forwarded to
            ``do_train_model``.
        num_iter: Forwarded unchanged to ``do_train_model``.
        metric: Scoring name given to ``cross_val_score``; also used to name
            the ``mean_<metric>`` / ``std_<metric>`` result columns.

    Raises:
        FileNotFoundError: If no training file matches ``data/train/*.pkl``.
    """
    random_state = np.random.RandomState(1234)
    data_dir = os.path.join('data', 'train')
    train_files = os.path.join(data_dir, '*.pkl')
    algorithms = [{
        'name': 'naive-bayes',
        'acronym': 'NB'
    }, {
        'name': 'support-vector-machine',
        'acronym': 'SVM'
    }, {
        'name': 'logistic-regression',
        'acronym': 'LR'
    }, {
        'name': 'random-forest',
        'acronym': 'RF'
    }, {
        'name': 'gradient-boosting',
        'acronym': 'GB'
    }]
    mean_column = f'mean_{metric}'
    std_column = f'std_{metric}'

    # Train models
    print('Training models')
    kfold = KFold(n_splits=num_splits, shuffle=True, random_state=random_state)
    files = glob.glob(train_files)
    if not files:
        # Without any result rows the later "best model" lookup cannot work,
        # so fail early with a message that names the actual problem.
        raise FileNotFoundError(f'No training files match {train_files}')

    rows = []
    # The total is derived from the files actually iterated, so the progress
    # bar matches the number of update() calls.
    with tqdm(total=len(files) * len(algorithms), file=sys.stdout) as pbar:
        for train_file in files:
            with open(train_file, 'rb') as f:
                data = joblib.load(f)
            train_data = data['data']
            y_train = train_data['label'].values
            # Column 0 and columns 2+ are joined side by side into one
            # feature matrix.
            text_features = list(train_data.iloc[:, 0].values)
            extra_features = np.array(train_data.iloc[:, 2:].values)
            X_train = np.concatenate((text_features, extra_features), axis=1)
            for algorithm in algorithms:
                classifier = get_classifier(algorithm['acronym'], random_state)
                scores = cross_val_score(classifier,
                                         X_train,
                                         y=y_train,
                                         scoring=metric,
                                         cv=kfold,
                                         n_jobs=-1)
                rows.append({
                    'algorithm': algorithm['acronym'],
                    'train_data_file': train_file,
                    mean_column: round(scores.mean(), 2),
                    std_column: round(scores.std(), 2),
                })
                pbar.update(1)

    # Load results in a dataframe. The frame is built in one step because
    # DataFrame.append no longer exists in pandas 2.x.
    print('Loading experiment results in a dataframe')
    output_df = pd.DataFrame(
        rows,
        columns=['algorithm', 'train_data_file', mean_column, std_column])

    # Save dataframe
    experiment_dir = 'experiments'
    os.makedirs(experiment_dir, exist_ok=True)
    experiment_filename = 'e_{}.csv'.format(
        datetime.now().strftime('%d%m%Y_%H%M%S'))
    experiment_file_path = os.path.join(experiment_dir, experiment_filename)
    print(f'Saving experiment results in {experiment_file_path}')
    output_df.to_csv(experiment_file_path, index=False)

    # Train algorithms on data transformation that work best for each of them
    print('Doing hyperparametrization')
    with tqdm(total=len(algorithms), file=sys.stdout) as pbar:
        for algorithm in algorithms:
            if algorithm['acronym'] != 'NB':
                candidates = output_df[
                    output_df['algorithm'] == algorithm['acronym']]
                # NOTE(review): ties on the mean are broken in favour of the
                # HIGHER std, as in the original code - confirm this is
                # intended.
                best_row = candidates.sort_values(
                    by=[mean_column, std_column], ascending=False).head(1)
                train_data_file = best_row['train_data_file'].values[0]
                best_model = do_train_model(algorithm['acronym'],
                                            train_data_file, algorithm['name'],
                                            num_splits, num_iter, metric)
                features, labels = get_features_and_labels(train_data_file)
                plot_learning_curve(best_model,
                                    f"{algorithm['acronym']} learning curves",
                                    features,
                                    labels,
                                    metric,
                                    cv=kfold,
                                    shuffle=True,
                                    save_fig=True)
            # The bar advances for skipped algorithms too, so it reaches 100%.
            pbar.update(1)