Example #1
    def file_output(self, loss_avg, accuracy, epoch_number, time_passed):
        file_path = os.path.join(self.results_path, self.RESULTS_FILENAME)
        append_to_file(file_path,
                       str(self.outputs_counter) +
                       self.SEPARATOR + str(epoch_number) +
                       self.SEPARATOR + str(loss_avg) +
                       self.SEPARATOR + str(accuracy) +
                       self.SEPARATOR + str(time_passed) + '\n')
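Every example on this page goes through append_to_file from library.helpers.files.files_operations, but its body never appears in these excerpts. A minimal sketch, assuming the helper is a thin wrapper over open() in append mode (the real implementation may differ):

def append_to_file(file_path, text):
    # Open in append mode: the file is created on first use and
    # repeated calls accumulate output rather than overwriting it.
    with open(file_path, 'a') as output_file:
        output_file.write(text)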
Example #2
    def add_network_info(self):
        path = os.path.join(self.results_path, self.NETWORK_INFO_FILENAME)
        append_to_file(path,
                       'num_layers: ' + str(self.num_layers) + '\n' +
                       'hidden_size: ' + str(self.hidden_size) + '\n' +
                       'batch_size: ' + str(self.batch_size) + '\n' +
                       'timesteps: ' + str(self.timesteps) + '\n' +
                       'learning_rate: ' + str(self.learning_rate) + '\n' +
                       'num_epochs: ' + str(self.num_epochs) + '\n' +
                       'vocab_size: ' + str(self.vocab_size) + '\n' +
                       'authors_size: ' + str(self.authors_size) + '\n')
Example #3
    def get_average_cross_entropies(self):
        average_cross_entropies_batch_processor = OptimizedBatchProcessor(
            tensors_dir=self.training_tensors_path,
            batch_size=self.batch_size,
            authors_size=self.authors_size,
            timesteps=self.timesteps,
            language=self.language,
            vocab_size=self.vocab_size)
        average_cross_entropies_batch_processor.new_epoch()
        states = (torch.zeros(self.num_layers, self.batch_size,
                              self.hidden_size),
                  torch.zeros(self.num_layers, self.batch_size,
                              self.hidden_size))

        authors_with_average_loss = self.initialize_average_training_loss_struct()

        append_to_file('output.txt', 'after init\n')
        while average_cross_entropies_batch_processor.next_batch():
            batches, target, authors_order = average_cross_entropies_batch_processor.get_results()
            batches = batches.type(torch.FloatTensor)
            outputs, _ = self.model(batches, states)
            for head in range(self.authors_size):
                softmax = self.softmax(outputs[head])
                vector = self.loss(softmax, target)
                for counter, author in enumerate(authors_order):
                    authors_with_average_loss[author - 1]['sum'] += vector[counter].item()
                    authors_with_average_loss[author - 1]['counter'] += 1

        append_to_file('output.txt', 'after while\n')
        for author in authors_with_average_loss:
            author['sum'] /= author['counter']

        return authors_with_average_loss
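initialize_average_training_loss_struct is not shown in this excerpt. From the way its result is used (indexed with author - 1, holding 'sum' and 'counter' fields that are accumulated and then divided), it plausibly returns one accumulator per author. A hypothetical sketch:

    def initialize_average_training_loss_struct(self):
        # One accumulator per author; entry i belongs to author number i + 1.
        return [{'sum': 0.0, 'counter': 0} for _ in range(self.authors_size)]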
Example #4
import os
import re

from library.helpers.files.files_operations import check_if_file, TextFileLoader, append_to_file
from library.helpers.files.name_convention import check_name_convention, TEXT_NAME_CONVENTIONS, KNOWN_AUTHOR

output = []
path = '../data/old/english/authors'
output_file_path = 'en_train.txt'
for author in os.listdir(path):
    directory_path = os.path.join(path, author)
    sum_known = 0
    first_chars = ''
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        if check_if_file(file_path) and check_name_convention(filename, TEXT_NAME_CONVENTIONS):
            text_file_loader = TextFileLoader(file_path)
            if re.match(KNOWN_AUTHOR, filename):
                first_chars = first_chars + text_file_loader.text[:100]
                sum_known += len(text_file_loader.text)
    output.append((author, sum_known, first_chars.replace("\n", "").replace(" ", "").replace("\t", "")))

output = sorted(output, key=lambda tup: tup[1], reverse=True)

for tup in output:
    append_to_file(output_file_path,
                   str(tup[0]) + "  " + str(tup[1]) + "     " + str(tup[2]) + "\n")
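TextFileLoader is only used here for its text attribute. A minimal stand-in, assuming it simply reads the whole file into a string (the real helper may handle encodings or preprocessing differently):

class TextFileLoader:
    def __init__(self, file_path):
        # Load the entire file into memory as a single string.
        with open(file_path, 'r', encoding='utf-8') as input_file:
            self.text = input_file.read()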
Example #5
    def train(self):
        append_to_file('output.txt', '\nstart\n')
        self.time_start = time.time()
        counter = 0
        while True:
            batch_processor = BatchProcessor(
                tensors_dir=self.training_tensors_path,
                batch_size=self.batch_size,
                authors_size=self.authors_size,
                timesteps=self.timesteps,
                language=self.language,
                vocab_size=self.vocab_size)
            states = (torch.zeros(self.num_layers, self.batch_size,
                                  self.hidden_size),
                      torch.zeros(self.num_layers, self.batch_size,
                                  self.hidden_size))

            counter += 1
            batch_processor.new_epoch()
            while batch_processor.next_batch():
                batches, target, authors_order = batch_processor.get_results()
                batches = batches.type(torch.FloatTensor)
                outputs, _ = self.model(batches, states)
                heads_to_train = self.get_heads_for_training(authors_order)
                loss = 0
                for head in heads_to_train:
                    # mask out batch rows that do not belong to this head's author
                    mask = (torch.tensor(authors_order) == head + 1).float()

                    # softmax over this head's outputs
                    softmax = self.softmax(outputs[head])

                    # per-sample loss vector, one entry per batch row
                    vector = self.loss(softmax, target)

                    # zero out the entries that belong to other authors
                    vector = vector * mask

                    # average the remaining losses over this head's own rows
                    loss += torch.sum(vector) / torch.sum(mask)

                self.model.zero_grad()
                loss.backward()
                clip_grad_norm_(self.model.parameters(), 0.5)
                self.optimizer.step()

            # evaluate on the test split after every epoch
            self.get_accuracy(i=counter)
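The masking step inside the head loop is the core of this multi-head training scheme: each head's loss is averaged only over the batch rows that belong to that head's author. The same computation isolated as a toy example (all values below are made up for illustration):

import torch

# Per-sample losses for one head, e.g. from a criterion with reduction='none'
vector = torch.tensor([0.9, 2.1, 1.5, 0.3])
# Author numbers (1-based) of the four batch rows; we look at head index 1
authors_order = [2, 1, 2, 3]
head = 1

# 1.0 where the row belongs to author head + 1, else 0.0 -> [1, 0, 1, 0]
mask = (torch.tensor(authors_order) == head + 1).float()

# Mean loss over this head's own rows only: (0.9 + 1.5) / 2 = 1.2
head_loss = torch.sum(vector * mask) / torch.sum(mask)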
Example #6
    def get_accuracy(self, i):
        append_to_file('output.txt', 'get accuracy \n')
        batch_processor = BatchProcessor(tensors_dir=self.testing_tensors_path,
                                         batch_size=self.batch_size,
                                         authors_size=self.authors_size,
                                         timesteps=self.timesteps,
                                         language=self.language,
                                         vocab_size=self.vocab_size)

        batch_processor.new_epoch()
        states = (torch.zeros(self.num_layers, self.batch_size,
                              self.hidden_size),
                  torch.zeros(self.num_layers, self.batch_size,
                              self.hidden_size))

        testing_data_losses = self.initialize_testing_loss_struct()

        # average loss collected using training data
        average_cross_entropies = self.get_average_cross_entropies()

        append_to_file('output.txt', 'average_cross_entropies\n')

        append_to_file('output.txt', str(average_cross_entropies) + '\n\n\n')

        while batch_processor.next_batch():
            # here we start using evaluation data
            batches, target, authors_order = batch_processor.get_results()
            batches = batches.type(torch.FloatTensor)
            outputs, _ = self.model(batches, states)

            # iterate over all heads
            for head in range(self.authors_size):
                # cross-entropy vector holding one loss value for each
                # unknown author in this batch
                softmax = self.softmax(outputs[head])
                entropies_vector = self.loss(softmax, target)
                # collect the losses separately for each unknown author, per head
                for counter, author in enumerate(authors_order):
                    testing_data_losses[head][author]['sum'] += entropies_vector[counter].item()
                    testing_data_losses[head][author]['counter'] += 1

        # average the loss for each unknown author in each head; the
        # training-data averages collected above could be subtracted here
        # (see the commented-out line below)
        max_ = float('-inf')
        min_ = float('inf')
        append_to_file('output.txt', 'min max\n')
        for head in range(self.authors_size):
            for author in range(self.authors_size):
                average = (testing_data_losses[head][author + 1]['sum'] /
                           testing_data_losses[head][author + 1]['counter'])
                # testing_data_losses[head][author + 1]['sum'] = average - average_cross_entropies[head]['sum']
                testing_data_losses[head][author + 1]['sum'] = average
                if testing_data_losses[head][author + 1]['sum'] < min_:
                    min_ = testing_data_losses[head][author + 1]['sum']
                if testing_data_losses[head][author + 1]['sum'] > max_:
                    max_ = testing_data_losses[head][author + 1]['sum']

        # min-max normalize the averaged losses to [0, 1]
        diff = max_ - min_
        for head in range(self.authors_size):
            for author in range(self.authors_size):
                testing_data_losses[head][author + 1]['sum'] = (
                    testing_data_losses[head][author + 1]['sum'] - min_) / diff

        results = []

        # for each unknown author, pick the head with the lowest normalized loss
        for author in range(self.authors_size):
            min_value = float('inf')
            min_head = -1
            for head in range(self.authors_size):
                if testing_data_losses[head][author + 1]['sum'] < min_value:
                    min_head = head
                    min_value = testing_data_losses[head][author + 1]['sum']
            results.append({
                'head': min_head,
                'unknown_author_number': author + 1,
                'loss_diff': min_value
            })
        append_to_file('output.txt', str(i) + '\n')
        append_to_file('output.txt', str(results))

        # an author is a "hit" when the best-scoring head matches their number
        count = 0
        for elem in results:
            if elem['head'] + 1 == elem['unknown_author_number']:
                count += 1
        append_to_file('output.txt', '\n\nhits:' + str(count))
        # 79 is the hard-coded number of unknown authors in this evaluation set
        append_to_file('output.txt', '\n\naccuracy:' + str(count / 79))
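initialize_testing_loss_struct is also not shown. The struct is indexed as [head][author] with author numbers starting at 1, so a plausible sketch is one dict of accumulators per head:

    def initialize_testing_loss_struct(self):
        # testing_data_losses[head][author]: author keys run from 1 to authors_size
        return [{author + 1: {'sum': 0.0, 'counter': 0}
                 for author in range(self.authors_size)}
                for _ in range(self.authors_size)]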
Example #7
import os
import re

# create_directory and create_file are assumed to live alongside the other
# file helpers used on this page; their exact location is not shown here.
from library.helpers.files.files_operations import (check_if_file, TextFileLoader,
                                                    append_to_file, create_directory,
                                                    create_file)
from library.helpers.files.name_convention import check_name_convention, TEXT_NAME_CONVENTIONS, KNOWN_AUTHOR

path = '../data/old/english/authors'  # source corpus, as in Example #4
test_path = 'test'
train_path = 'train'

create_directory(test_path)
create_directory(train_path)

for author in os.listdir(path):
    directory_path = os.path.join(path, author)
    create_directory(os.path.join(test_path, author))
    create_directory(os.path.join(train_path, author))

    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        if check_if_file(file_path) and check_name_convention(filename, TEXT_NAME_CONVENTIONS):
            text_file_loader = TextFileLoader(file_path)
            text = text_file_loader.text
            if re.match(KNOWN_AUTHOR, filename):
                length = len(text_file_loader.text)
                middle = length / 2
                test_save_path = os.path.join(test_path, author)
                train_save_path = os.path.join(train_path, author)
                create_file('known01.txt', test_save_path)
                create_file('unknown.txt', test_save_path)
                create_file('known01.txt', train_save_path)
                create_file('unknown.txt', train_save_path)
                append_to_file(os.path.join(test_save_path, 'known01.txt'), text[int(middle):int(length)])
                append_to_file(os.path.join(train_save_path, 'known01.txt'), text[0:int(middle)])
                append_to_file(os.path.join(test_save_path, 'unknown.txt'), text[int(middle):int(length)])
                append_to_file(os.path.join(train_save_path, 'unknown.txt'), text[0:int(middle)])
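The bodies of create_directory and create_file never appear on this page. Hypothetical stand-ins consistent with how they are called (create_file takes the filename first, then the target directory) could look like:

import os

def create_directory(directory_path):
    # Create the directory if it does not exist yet.
    os.makedirs(directory_path, exist_ok=True)

def create_file(filename, directory_path):
    # Create (or truncate) an empty file inside directory_path, so the
    # append_to_file calls that follow start from a clean slate.
    open(os.path.join(directory_path, filename), 'w').close()

Note the split itself: each known text is cut in half, the first half going to train/ and the second to test/, and the same half is written to both known01.txt and unknown.txt within each split.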
Example #8
    def add_results_headline(self):
        path = os.path.join(self.results_path, self.RESULTS_FILENAME)
        append_to_file(path, self.HEADLINE)
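RESULTS_FILENAME, SEPARATOR, and HEADLINE are class constants that never appear in these excerpts, so any concrete values are hypothetical. A definition consistent with the columns written by file_output in Example #1 could be:

class ResultsWriter:
    # Hypothetical names and values; only the column order is implied by Example #1.
    RESULTS_FILENAME = 'results.txt'
    NETWORK_INFO_FILENAME = 'network_info.txt'
    SEPARATOR = '\t'
    HEADLINE = 'output_number\tepoch\tloss_avg\taccuracy\ttime_passed\n'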