Exemplo n.º 1
0
def collect():
    """Run the data-transformation step over the configured dataset.

    Builds a Data_Collect transformer from the configured dataset directory
    and sentiment list, then retrieves the sentiment-adjusted output.
    """
    print('[+] Transforming Data...')
    collector = td.Data_Collect(p.configure()['dataset_dir'],
                                p.configure()['sentiments'])
    collector.retrieve(p.configure()['sentiment_adjusted'])

    return None
Exemplo n.º 2
0
def train_CNN():
    """Train the CNN sentiment classifier end to end.

    Loads the data splits, builds vocabularies and iterators, constructs and
    initializes the model, then runs the configured number of epochs while
    checkpointing to disk whenever validation loss improves.

    Returns:
        The CNN model object as it stands after the final epoch (note: this is
        not reloaded from the best checkpoint saved to disk).
    """
    print('[+] Load Data')
    text, label, train_data, valid_data, test_data = ld.load_data()
    print('[+] Build Vocabulary')
    text, label = ld.build_vocabulary(text, label, train_data)
    print('[+] Set Iterators')
    train_iter, valid_iter, test_iter = ld.fetch_iterators(
        train_data, valid_data, test_data)
    print('[i] Train Iterator Info: \n')
    print(f'[i] Length of Train Iter: {len(train_iter)}')

    cnn_model = build.set_NN(text, label)

    print(
        f'[i] The model has {build.count_parameters(cnn_model):,} trainable parameters'
    )

    # Copy pretrained word vectors into the model's embedding layer.
    cnn_model = build.embed_vectors(text, cnn_model)

    # Persist the TEXT field so inference code can rebuild the same vocabulary.
    print('[+] Save Text Data')
    with open('model/TEXT.Field', 'wb') as f:
        dill.dump(text, f)

    cnn_model, optimizer, criterion = build.fetch_loss_utils(cnn_model)

    # Track the best validation loss seen so far for checkpointing.
    best_valid_loss = float('inf')

    for epoch in range(p.configure()['EPOCHS']):

        start = time.time()

        train_loss, train_acc = build.train(cnn_model, train_iter, optimizer,
                                            criterion)
        valid_loss, valid_acc = build.evaluate(cnn_model, valid_iter,
                                               criterion)

        end = time.time()

        epoch_mins, epoch_secs = build.epoch_times(start, end)

        # Checkpoint only when validation improves; torch.save here pickles
        # the whole model object, not just a state_dict.
        if valid_loss < best_valid_loss:

            best_valid_loss = valid_loss

            torch.save(cnn_model, p.configure()['MODEL'])

        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(
            f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%'
        )
        print(
            f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%'
        )

    return cnn_model
Exemplo n.º 3
0
def build_vocabulary(TEXT, LABEL, train_data):
    """Build vocabularies for the text and label fields from the train split.

    Text vocabulary is capped at the configured MAX_SIZE, initialized with
    the configured GloVe vectors, and unknown tokens are drawn from a normal
    distribution. Returns the two fields with vocabularies attached.
    """
    max_size = p.configure()['MAX_SIZE']
    vectors = p.configure()['GLOVE_DIR']
    TEXT.build_vocab(train_data,
                     max_size=max_size,
                     vectors=vectors,
                     unk_init=torch.Tensor.normal_)
    print('[i] Text Vocabulary Built...')

    LABEL.build_vocab(train_data)
    print('[i] Label Vocabulary Built...')

    return TEXT, LABEL
Exemplo n.º 4
0
def set_NN(text):
    """Instantiate the CNN classifier from configured hyperparameters.

    Vocabulary size and padding index are derived from the supplied text
    field; all other hyperparameters come from the project configuration.
    """
    vocab_size = len(text.vocab)
    pad_index = text.vocab.stoi[text.pad_token]
    cnn_model = CNN(vocab_size,
                    P.configure()['embedding_dim'],
                    P.configure()['n_filters'],
                    P.configure()['filter_sizes'],
                    P.configure()['output_dim'],
                    P.configure()['dropout'],
                    pad_idx=pad_index)
    print(f'[+] Model Configured...\n \
          {cnn_model}')
    return cnn_model
Exemplo n.º 5
0
def embed_vectors(text, model):
    """Load pretrained vectors into the embedding layer and zero special rows.

    Copies the vocabulary's pretrained vectors into the model's embedding
    weights, then overwrites the <unk> and <pad> rows with zeros so they
    carry no pretrained signal. Returns the updated model.
    """
    model.embedding.weight.data.copy_(text.vocab.vectors)
    print('[+] Pretrained Vectors Set...')

    # Zero out the embedding rows for the unknown and padding tokens.
    for token in (text.unk_token, text.pad_token):
        row = text.vocab.stoi[token]
        model.embedding.weight.data[row] = torch.zeros(
            P.configure()['embedding_dim'])
    print('[+] Embedding Weights Set...')

    return model
Exemplo n.º 6
0
def train_model():
    """Run the full training pipeline and return the trained model and field.

    Prepares the data splits and iterators, builds and initializes the CNN,
    then trains for the configured number of epochs, checkpointing to disk
    whenever validation loss improves.

    Returns:
        (cnn_model, text): the model after the final epoch (not reloaded from
        the best checkpoint) and the text field used to encode inputs.
    """
    # prepare data
    text, label, train_data, valid_data, test_data = ds.fetch_data()
    text, label = ds.build_vocabulary(text, label, train_data)
    train_iter, valid_iter, test_iter = ds.fetch_iterators(
        train_data, valid_data, test_data)
    # build model and set parameters
    cnn_model = build.set_NN(text)
    print(
        f'The model has {build.count_parameters(cnn_model):,} trainable parameters'
    )
    # copy pretrained vectors into the embedding layer
    cnn_model = build.embed_vectors(text, cnn_model)
    cnn_model, optimizer, criterion = build.fetch_loss_utils(cnn_model)
    # 'save model' conditional
    best_valid_loss = float('inf')
    # training loop
    print('[i] Begin Training...')
    for epoch in range(P.configure()['EPOCHS']):

        start_time = time.time()

        train_loss, train_acc = build.train(cnn_model, train_iter, optimizer,
                                            criterion)
        valid_loss, valid_acc = build.evaluate(cnn_model, valid_iter,
                                               criterion)

        end_time = time.time()

        epoch_mins, epoch_secs = build.epoch_time(start_time, end_time)

        # Checkpoint only when validation improves; torch.save here pickles
        # the whole model object, not just a state_dict.
        if valid_loss < best_valid_loss:

            best_valid_loss = valid_loss
            torch.save(cnn_model, P.configure()['model'])

        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(
            f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%'
        )
        print(
            f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%'
        )

    print('[i] Training Finished...\n')

    return cnn_model, text
Exemplo n.º 7
0
def fetch():
    """Crawl Twitter for every configured sentiment keyword."""
    print('[+] Requesting Data from Twitter...')
    tw.Twitter_cli().sentiment_crawler(p.configure()['sentiments'])
    print('[i] Tweets Collected...')

    return None
Exemplo n.º 8
0
def fetch_iterators(train, valid, test):
    """Wrap the three dataset splits in BucketIterators on the active device.

    Batch size comes from the project configuration; the device is chosen by
    fetch_device(). Returns (train_iter, valid_iter, test_iter).
    """
    splits = (train, valid, test)
    iterators = data.BucketIterator.splits(
        splits,
        batch_size=P.configure()['batch_size'],
        device=fetch_device())
    print('[+] Dataloaders Set...')

    train_iter, valid_iter, test_iter = iterators
    return train_iter, valid_iter, test_iter
Exemplo n.º 9
0
def fetch_data():
    """Seed all RNGs and load the IMDB dataset as train/valid/test splits.

    Seeds Python, NumPy, and torch RNGs from the configured seed and forces
    deterministic cuDNN, then builds the torchtext fields and splits IMDB
    into train/validation/test.

    Returns:
        (text, label, train_data, valid_data, test_data): the two fields and
        the three dataset splits.
    """
    random.seed(P.configure()['seed'])
    np.random.seed(P.configure()['seed'])
    torch.manual_seed(P.configure()['seed'])
    torch.backends.cudnn.deterministic = True
    print('[+] Seeds Set...')

    text = data.Field(tokenize='spacy', batch_first=True)
    print('[+] Text Recieved...')
    label = data.LabelField(dtype=torch.float)
    print('[+] Label Recieved...')
    print('[+] Transforming...')
    train_data, test_data = datasets.IMDB.splits(text, label)
    print('[+] Train | Test Split Set...')
    # BUG FIX: random.seed() returns None, so the original
    # `random_state=random.seed(seed)` passed None and the split was not
    # reproducibly seeded. Pass the seeded RNG state instead, which is what
    # torchtext's split() expects for a deterministic shuffle.
    train_data, valid_data = train_data.split(
        random_state=random.getstate())
    print('[+] Train | Validation Split Set...')

    return text, label, train_data, valid_data, test_data
Exemplo n.º 10
0
def fetch_iterators(train, valid, test):
    """Build bucketed dataloaders for the train/valid/test splits.

    Examples are bucketed by their text attribute to minimize padding; the
    batch size comes from configuration and tensors are placed on the device
    returned by fetch_device().
    """
    iterators = data.BucketIterator.splits(
        (train, valid, test),
        sort_key=lambda example: example.text,
        batch_size=p.configure()['BATCH_SIZE'],
        device=fetch_device())

    print('[i] Data Loaders Set...')

    train_iter, valid_iter, test_iter = iterators
    return train_iter, valid_iter, test_iter
Exemplo n.º 11
0
def build_vocabulary(text, label, train_data):
    """Build text and label vocabularies from the training split.

    The text vocabulary is capped at the configured max size, initialized
    with glove.6B.100d vectors, and unknown tokens are drawn from a normal
    distribution. Returns the two fields with vocabularies attached.
    """
    vocab_cap = P.configure()['max_vocab_size']
    text.build_vocab(train_data,
                     max_size=vocab_cap,
                     vectors='glove.6B.100d',
                     unk_init=torch.Tensor.normal_)
    print('[+] Text Vocabulary Built...')

    label.build_vocab(train_data)
    print('[+] Label Vocabulary Built...')

    return text, label
Exemplo n.º 12
0
def load_data():
    """Seed torch, define the torchtext fields, and load the CSV splits.

    Reads train.csv / valid.csv / test.csv from the 'data' directory. The
    first CSV column is skipped (mapped to (None, None)); the remaining
    columns map to the text, label, and class fields.

    Returns:
        (TEXT, LABEL, train_data, valid_data, test_data).
    """
    torch.manual_seed(p.configure()['SEED'])
    torch.backends.cudnn.deterministic = True

    TEXT = data.Field(tokenize='spacy')
    LABEL = data.Field()
    CLASS = data.Field(sequential=False, use_vocab=False)

    # (None, None) drops the first CSV column entirely.
    column_fields = [(None, None), ('text', TEXT), ('label', LABEL),
                     ('cl', CLASS)]

    splits = data.TabularDataset.splits(path='data',
                                        train='train.csv',
                                        validation='valid.csv',
                                        test='test.csv',
                                        format='csv',
                                        fields=column_fields,
                                        skip_header=True)
    train_data, valid_data, test_data = splits

    return TEXT, LABEL, train_data, valid_data, test_data
Exemplo n.º 13
0
        epoch_mins, epoch_secs = build.epoch_time(start_time, end_time)

        if valid_loss < best_valid_loss:

            best_valid_loss = valid_loss
            torch.save(cnn_model, P.configure()['model'])

        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(
            f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%'
        )
        print(
            f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%'
        )

    print('[i] Training Finished...\n')

    return cnn_model, text


if __name__ == '__main__':

    # Train the model, then sanity-check it on one positive and one negative
    # example sentence from the configuration.
    cnn_model, text = train_model()

    print('[i] Evaluate Model\n')
    for sentence_key in ('positive_sentence', 'negative_sentence'):
        build.predict_sentiment(cnn_model, text, P.configure()[sentence_key])