reviews.append(
                    sentence2tensor(words, w2i, padding_idx, sent_length))
                if len(words) > sent_length:
                    lengths.append(sent_length)
                else:
                    lengths.append(len(words))
                labels.append(int(label))
                if count % 100000 == 0:
                    print('Encoded reviews: ', count)

    return reviews, labels, lengths


if __name__ == '__main__':
    # Paths to the pre-split review data sets.
    train_file = '../Data/train.csv'
    validation_file = '../Data/validation.csv'

    # Build the word-to-index mapping over both splits.
    w2i = pp.obtainW2i(train=train_file, validate=validation_file)
    print('Loaded vocabulary')
    w2i['<PAD>'] = 0  # reserve index 0 for the padding token

    # Preprocessing constants and model hyper-parameters.
    padding_idx = 0
    sent_length = 80
    embedding_size = 50
    vocab_size = len(w2i)  # measured after adding '<PAD>'
    # Translation table that strips all punctuation in one pass.
    translator = str.maketrans('', '', string.punctuation)

    # Randomly initialised embedding table; the padding row is kept at zero.
    embedding = nn.Embedding(vocab_size,
                             embedding_size,
                             padding_idx=padding_idx)

    print('Embeddings calculated')
# Example #2
# (stray "0" fragment from the paste — likely the tail of an `accuracy = 0` initializer)
        encoder_hidden = encoder.initHidden()

        input_length = sentence_tensor.size(0)
        for ei in range(input_length):
            output, encoder_hidden = encoder(sentence_tensor[ei],
                                             encoder_hidden)

        output = torch.round(output)
        if torch.equal(output, label_tensor):
            accuracy += 1

    return accuracy / len(test_sentences)


if __name__ == '__main__':

    # Build the vocabulary and word-to-index map from the two
    # gender-labelled review files.
    vocabulary, w2i, sentences_m, sentences_f = pp.obtainW2i(
        "../Data/sample_male", "../Data/sample_female")

    # Split the labelled sentences into train and test partitions.
    # NOTE(review): fixed the misspelled local `train_senetences`
    # -> `train_sentences` (it was used consistently, so behavior
    # is unchanged).
    train_sentences, train_labels, test_sentences, test_labels = pp.testTrainSplit(
        sentences_m, sentences_f)

    # Model hyper-parameters: one-hot-sized input over the vocabulary,
    # a small recurrent hidden state, and a single scalar output.
    hidden_size = 20
    input_size = len(w2i)
    output_size = 1

    encoder = Encoder(input_size, hidden_size, output_size)
    encoder = encoder.to(device)

    # The literal 3 is presumably the epoch count — confirm against
    # the batch_train signature.
    batch_train(encoder, train_sentences, train_labels, 3, w2i)

    # TODO(review): re-enable evaluation once `evaluate` is verified.
    #accuracy = evaluate(encoder,test_sentences, test_labels,w2i)
    #print(accuracy)