from utils.DatasetGeneration import DeterministicGenerator
from classification.ClassificationTest import ClassificationTest
from classification.KNNClassification import KNNClassifier

# Script for the corresponding test case: a KNN classifier is trained and
# tested on a pregenerated embedding, once per entry of SAMPLE_SIZE.
# Most test cases should be able to be executed without any further changes,
# if data is available.
# NOTE(review): StopWatch, DataProviderLight and PregeneratedProcessor are used
# below but no import for them is visible here - confirm they are imported.
SAMPLE_SIZE = [5000, 10000, 20000, 50000]
N_NEIGHBORS = 1
# change device to "cpu" if cuda not available
DEVICE = "cuda"

stopwatch = StopWatch()

for sample_size in SAMPLE_SIZE:
    # Pregenerated embedding and labels. The handles stay open for the whole
    # iteration (as before) but are now released even if a step raises.
    with open("../tests/embedding7.json") as file_words, \
            open("../data/unique_labels.json") as file_labels:
        data_provider = DataProviderLight(file_words, file_labels, sample_size=sample_size)

        # embedding data, splitting up into train and test set
        processor = PregeneratedProcessor(data_provider)
        generator = DeterministicGenerator(data_provider, processor)
        dataset = generator.generate_dataset()

        print("Sample Size: " + str(sample_size))

        # creating classifier, overwriting parameters
        classifier = KNNClassifier(data_provider, dataset, DEVICE)
        classifier.n_neighbours = N_NEIGHBORS

        # train classifier and output progress
        classifier.train()

        # testing
        test = ClassificationTest(dataset, classifier)
        print("Präzision: " + str(test.test()) + "%")
from classification.ClassificationTrainer import ClassificationTrainer

# Script for the corresponding test case: an FFN classifier is trained on a
# pregenerated embedding through ClassificationTrainer.
# Most test cases should be able to be executed without any further changes,
# if data is available.
# NOTE(review): StopWatch, DataProviderLight, PregeneratedProcessor,
# DeterministicGenerator and FFNClassifier are used below but no import for
# them is visible here - confirm they are imported.
SAMPLE_SIZE = 100000
EPOCHS = 150
HIDDEN_SIZE = 10000
BATCH_SIZE = 128
# change device to "cpu" if cuda not available
DEVICE = "cuda"

stopwatch = StopWatch()

# Pregenerated embedding and labels. The original never closed these handles;
# the whole pipeline runs inside the `with` so they remain open for every use
# of the data provider and are closed deterministically afterwards.
with open("../tests/embedding8.json") as file_words, \
        open("../data/unique_labels.json") as file_labels:
    data_provider = DataProviderLight(file_words, file_labels, sample_size=SAMPLE_SIZE)

    # embedding data, splitting up into train and test set
    processor = PregeneratedProcessor(data_provider)
    generator = DeterministicGenerator(data_provider, processor)
    dataset = generator.generate_dataset()

    # creating classifier, overwriting parameters
    classifier = FFNClassifier(data_provider, dataset, DEVICE, BATCH_SIZE, HIDDEN_SIZE)

    # train classifier and output progress
    trainer = ClassificationTrainer(classifier)
    trainer.enable_loss(10, True)
    trainer.enable_precision(10, True)
    trainer.enable_trainset_precision(10, True)
    trainer.train(EPOCHS)
    trainer.show()
SAMPLE_SIZE = [2000, 5000, 10000, 20000, 50000]
FEATURES = 100
BATCH_SIZE = 256
EPOCHS = 5
N_NEIGHBORS = 5
# change device to "cpu" if cuda not available
DEVICE = "cuda"

# One full run per sample size: train word2vec, inject its weights into the
# data provider, generate the dataset and build a KNN classifier on it.
for sample_size in SAMPLE_SIZE:
    # raw words and labels
    # NOTE(review): none of the three handles opened here is closed in the
    # visible part of the loop body - confirm they are closed further down.
    file_words = open("../data/unique_equations.json")
    file_labels = open("../data/unique_labels.json")
    # pre calculated weight matrix
    file_weights = open("../data/weights_0.json")
    data_provider = DataProviderLight(file_words, file_labels, sample_size=sample_size, file_weights=file_weights)

    # embedding data, splitting up into train and test set
    processor = VectorProcessor(data_provider)
    generator = DeterministicGenerator(data_provider, processor)

    # a fresh stopwatch is created for every sample size
    stopwatch = StopWatch()

    # training the word2vec net
    word2vec = Word2Vec(data_provider, FEATURES, DEVICE)
    word2vec.train(EPOCHS, BATCH_SIZE)

    # extracting weights and injecting them into the data provider
    # (this overwrites the provider's `weights` attribute before the dataset
    # is generated)
    data_provider.weights = torch.tensor(word2vec.get_weights())

    # generate dataset
    dataset = generator.generate_dataset()

    # train knn classifier
    # NOTE(review): N_NEIGHBORS is not applied in the visible part of the
    # loop - presumably set on the classifier further down; confirm.
    classifier = KNNClassifier(data_provider, dataset, DEVICE)
from utils.DataProviderLight import DataProviderLight
import matplotlib.pyplot as plt

# analyze some aspects of the data
file_words = open("../data/unique_equations.json")
file_labels = open("../data/unique_labels.json")
data_provider = DataProviderLight(file_words, file_labels)


def count_word_lengths():
    """Plot, per word length, how many words of `data_provider` have it.

    Tallies `len(word)` over `data_provider.words`, shows the tallies as a
    scatter plot (length on x, count on y) and prints "done" afterwards.
    """
    tally = {}
    for word in data_provider.words:
        size = len(word)
        # first occurrence starts at 0, so every length ends up counted once
        # per word that has it
        tally[size] = tally.get(size, 0) + 1

    x = list(tally)
    y = list(tally.values())

    fig, ax = plt.subplots()
    ax.scatter(x, y, color="tab:blue", s=10)
    ax.set_xlabel("Länge der Formel")
    ax.set_ylabel("Anzahl der Formeln")
    plt.show()
    print("done")


# NOTE(review): the definition below is cut off in this excerpt; the visible
# part is reproduced unchanged and the remainder is not reconstructed.
def count_class_sizes():
    labels = {}
    for label in data_provider.labels:
        if label in labels:
            labels[label] = labels[label] + 1
from classification.KNNClassification import KNNClassifier
from classification.ClassificationTest import ClassificationTest

# Script for the corresponding test case: train a word2vec embedding, then
# train and test a KNN classifier on the resulting dataset.
# Most test cases should be able to be executed without any further changes,
# if data is available.
# NOTE(review): StopWatch, DataProviderLight, VectorProcessor,
# DeterministicGenerator, Word2Vec and torch are used below but no import for
# them is visible here - confirm they are imported.
FEATURES = 100
SAMPLE_SIZE = 10000
EPOCHS = 5
BATCH_SIZE = 32
# change device to "cpu" if cuda not available
DEVICE = "cuda"

stopwatch = StopWatch()

# Words, labels and a weights file; the embedding itself is trained below.
# The original never closed these handles; the whole pipeline runs inside the
# `with` so they remain open for every use of the data provider and are closed
# deterministically afterwards.
with open("../data/unique_equations.json") as file_words, \
        open("../data/unique_labels.json") as file_labels, \
        open("../data/weights_0.json") as file_weights:
    data_provider = DataProviderLight(file_words, file_labels, sample_size=SAMPLE_SIZE, file_weights=file_weights)

    processor = VectorProcessor(data_provider)
    generator = DeterministicGenerator(data_provider, processor)

    # only the word2vec training is enclosed by the stopwatch
    w2v_epochs = Word2Vec(data_provider, FEATURES, DEVICE)
    stopwatch.start()
    w2v_epochs.train(EPOCHS, BATCH_SIZE)
    stopwatch.stop()

    # inject the trained weights into the data provider before the dataset
    # is generated
    data_provider.weights = torch.tensor(w2v_epochs.get_weights())
    dataset = generator.generate_dataset()

    classifier = KNNClassifier(data_provider, dataset, DEVICE)
    classifier.n_neighbours = 5
    classifier.train()

    test = ClassificationTest(dataset, classifier)
    print("Präzision: " + str(test.test()) + "%")