예제 #1
0
from utils.DatasetGeneration import DeterministicGenerator
from classification.ClassificationTest import ClassificationTest
from classification.KNNClassification import KNNClassifier
# script for corresponding test case
# most test cases should be able to be executed without any further changes, if data is available

# Sample sizes to evaluate; the loop below performs one full run per entry.
SAMPLE_SIZE = [5000, 10000, 20000, 50000]
N_NEIGHBORS = 1
# change device to "cpu" if cuda not available
DEVICE = "cuda"
# NOTE: created here but never started or stopped in this script.
stopwatch = StopWatch()
for sample_size in SAMPLE_SIZE:
    # pregenerated embedding and labels
    # Both files are re-opened for every sample size. The `with` block keeps
    # them open for the whole iteration (exactly as long as the previous
    # explicit close() calls did) and also closes them if a step raises.
    with open("../tests/embedding7.json") as file_words, \
            open("../data/unique_labels.json") as file_labels:
        data_provider = DataProviderLight(file_words,
                                          file_labels,
                                          sample_size=sample_size)
        # embedding data, splitting up into train and test set
        processor = PregeneratedProcessor(data_provider)
        generator = DeterministicGenerator(data_provider, processor)
        dataset = generator.generate_dataset()
        print("Sample Size: " + str(sample_size))
        # creating classifier, overwriting parameters
        classifier = KNNClassifier(data_provider, dataset, DEVICE)
        classifier.n_neighbours = N_NEIGHBORS
        # train classifier and output progress
        classifier.train()
        # testing
        test = ClassificationTest(dataset, classifier)
        print("Präzision: " + str(test.test()) + "%")
예제 #2
0
from classification.ClassificationTrainer import ClassificationTrainer
# script for corresponding test case
# most test cases should be able to be executed without any further changes, if data is available

# Run configuration for the feed-forward classifier experiment.
SAMPLE_SIZE = 100000
EPOCHS = 150
HIDDEN_SIZE = 10000
BATCH_SIZE = 128
# change device to "cpu" if cuda not available
DEVICE = "cuda"
# NOTE: created here but never started or stopped in this script.
stopwatch = StopWatch()
# pregenerated embedding and labels
# The files were previously opened and never closed. The `with` block spans
# the complete pipeline, so the handles stay usable for every step that ran
# with them open before and are released afterwards, also on error.
with open("../tests/embedding8.json") as file_words, \
        open("../data/unique_labels.json") as file_labels:
    data_provider = DataProviderLight(file_words,
                                      file_labels,
                                      sample_size=SAMPLE_SIZE)
    # embedding data, splitting up into train and test set
    processor = PregeneratedProcessor(data_provider)
    generator = DeterministicGenerator(data_provider, processor)
    dataset = generator.generate_dataset()
    # creating classifier, overwriting parameters
    classifier = FFNClassifier(data_provider, dataset, DEVICE, BATCH_SIZE,
                               HIDDEN_SIZE)
    # train classifier and output progress
    trainer = ClassificationTrainer(classifier)
    trainer.enable_loss(10, True)
    trainer.enable_precision(10, True)
    trainer.enable_trainset_precision(10, True)
    trainer.train(EPOCHS)
    trainer.show()
예제 #3
0
# Sample sizes to evaluate; the loop below performs one full run per entry.
SAMPLE_SIZE = [2000, 5000, 10000, 20000, 50000]
FEATURES = 100
BATCH_SIZE = 256
EPOCHS = 5
# NOTE: not referenced anywhere in the visible part of this script.
N_NEIGHBORS = 5
# change device to "cpu" if cuda not available
DEVICE = "cuda"

for sample_size in SAMPLE_SIZE:
    # raw words and labels, plus the pre calculated weight matrix
    # The three files were previously opened on every iteration and never
    # closed; the `with` block releases them at the end of each iteration
    # and also when a step raises.
    with open("../data/unique_equations.json") as file_words, \
            open("../data/unique_labels.json") as file_labels, \
            open("../data/weights_0.json") as file_weights:
        data_provider = DataProviderLight(file_words,
                                          file_labels,
                                          sample_size=sample_size,
                                          file_weights=file_weights)
        # embedding data, splitting up into train and test set
        processor = VectorProcessor(data_provider)
        generator = DeterministicGenerator(data_provider, processor)
        stopwatch = StopWatch()

        # training the word2vec net
        word2vec = Word2Vec(data_provider, FEATURES, DEVICE)
        word2vec.train(EPOCHS, BATCH_SIZE)
        # extracting weights and injecting them into the data provider
        data_provider.weights = torch.tensor(word2vec.get_weights())
        # generate dataset
        dataset = generator.generate_dataset()
        # train knn classifier
        # NOTE(review): only the constructor call is visible here; no
        # train/test step follows in the script as shown.
        classifier = KNNClassifier(data_provider, dataset, DEVICE)
예제 #4
0
from utils.DataProviderLight import DataProviderLight
import matplotlib.pyplot as plt
# analyze some aspects of the data

# Open the equations file and the labels file and hand both handles to the
# data provider that the analysis functions below read from.
# NOTE(review): neither handle is closed in the visible code. Whether
# DataProviderLight still needs them after construction cannot be told from
# here - confirm before wrapping these opens in a `with` block.
file_words = open("../data/unique_equations.json")
file_labels = open("../data/unique_labels.json")
data_provider = DataProviderLight(file_words, file_labels)


def count_word_lengths():
    """Plot how many words of each length the data provider holds.

    Tallies ``len(word)`` over ``data_provider.words``, draws the tally as a
    scatter plot (length on the x axis, count on the y axis), shows the
    figure and finally prints ``done``.
    """
    tally = {}
    for entry in data_provider.words:
        size = len(entry)
        tally[size] = tally.get(size, 0) + 1
    # keys and values of a dict iterate in the same (insertion) order
    sizes = list(tally)
    counts = list(tally.values())
    figure, axes = plt.subplots()
    axes.scatter(sizes, counts, color="tab:blue", s=10)
    axes.set_xlabel("Länge der Formel")
    axes.set_ylabel("Anzahl der Formeln")
    plt.show()
    print("done")


def count_class_sizes():
    """Tally the entries of ``data_provider.labels`` into a local dict.

    NOTE(review): the function appears to be cut off after the increment
    branch. As shown, a label that is not yet a key is never inserted, so
    the dict starts empty and stays empty, and nothing is returned.
    """
    labels = {}
    for label in data_provider.labels:
        # only labels that already exist as keys are incremented
        if label in labels:
            labels[label] = labels[label] + 1
예제 #5
0
from classification.KNNClassification import KNNClassifier
from classification.ClassificationTest import ClassificationTest
# script for corresponding test case
# most test cases should be able to be executed without any further changes, if data is available

# Run configuration for the timed word2vec + KNN experiment.
FEATURES = 100
SAMPLE_SIZE = 10000
EPOCHS = 5
BATCH_SIZE = 32
# change device to "cpu" if cuda not available
DEVICE = "cuda"
stopwatch = StopWatch()
# equations, labels and a weights file
# The files were previously opened and never closed. The `with` block spans
# the complete pipeline, so the handles stay usable for every step that ran
# with them open before and are released afterwards, also on error.
with open("../data/unique_equations.json") as file_words, \
        open("../data/unique_labels.json") as file_labels, \
        open("../data/weights_0.json") as file_weights:
    data_provider = DataProviderLight(file_words,
                                      file_labels,
                                      sample_size=SAMPLE_SIZE,
                                      file_weights=file_weights)
    processor = VectorProcessor(data_provider)
    generator = DeterministicGenerator(data_provider, processor)

    w2v_epochs = Word2Vec(data_provider, FEATURES, DEVICE)
    # only the word2vec training call sits between start() and stop()
    stopwatch.start()
    w2v_epochs.train(EPOCHS, BATCH_SIZE)
    stopwatch.stop()
    # replace the provider's weights with the freshly trained ones before
    # the dataset is generated
    data_provider.weights = torch.tensor(w2v_epochs.get_weights())
    dataset = generator.generate_dataset()
    classifier = KNNClassifier(data_provider, dataset, DEVICE)
    classifier.n_neighbours = 5
    classifier.train()
    test = ClassificationTest(dataset, classifier)
    print("Präzision: " + str(test.test()) + "%")