def collect():
    """Transform the crawled raw data into the adjusted sentiment dataset.

    Reads the dataset directory and sentiment list from the project
    configuration and runs the Data_Collect retrieval step.

    Returns:
        None.
    """
    print('[+] Transforming Data...')
    # Hoisted: the original re-read p.configure() three times for one call.
    cfg = p.configure()
    td.Data_Collect(cfg['dataset_dir'], cfg['sentiments']).retrieve(
        cfg['sentiment_adjusted'])
    return None
def train_CNN():
    """Train the CNN sentiment classifier end to end.

    Loads data, builds vocabularies and iterators, configures the model,
    persists the TEXT field with dill, then runs the training loop and
    checkpoints the model whenever validation loss improves.

    Returns:
        The trained CNN model (state at the last epoch).
    """
    # Hoisted: config is loop-invariant; the original re-read it per use.
    cfg = p.configure()
    print('[+] Load Data')
    text, label, train_data, valid_data, test_data = ld.load_data()
    print('[+] Build Vocabulary')
    text, label = ld.build_vocabulary(text, label, train_data)
    print('[+] Set Iterators')
    train_iter, valid_iter, test_iter = ld.fetch_iterators(
        train_data, valid_data, test_data)
    print('[i] Train Iterator Info: \n')
    print(f'[i] Length of Train Iter: {len(train_iter)}')
    cnn_model = build.set_NN(text, label)
    print(
        f'[i] The model has {build.count_parameters(cnn_model):,} trainable parameters'
    )
    cnn_model = build.embed_vectors(text, cnn_model)
    print('[+] Save Text Data')
    # The TEXT field (vocab + tokenizer config) is needed again at
    # inference time, so it is serialized alongside the model.
    with open('model/TEXT.Field', 'wb') as f:
        dill.dump(text, f)
    cnn_model, optimizer, criterion = build.fetch_loss_utils(cnn_model)
    best_valid_loss = float('inf')
    for epoch in range(cfg['EPOCHS']):
        start = time.time()
        train_loss, train_acc = build.train(cnn_model, train_iter,
                                            optimizer, criterion)
        valid_loss, valid_acc = build.evaluate(cnn_model, valid_iter,
                                               criterion)
        end = time.time()
        # NOTE(review): this chunk calls build.epoch_times while a sibling
        # chunk calls build.epoch_time — confirm which helper name exists.
        epoch_mins, epoch_secs = build.epoch_times(start, end)
        # Checkpoint only on validation improvement (best-model saving).
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(cnn_model, cfg['MODEL'])
        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(
            f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%'
        )
        print(
            f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%'
        )
    return cnn_model
def build_vocabulary(TEXT, LABEL, train_data):
    """Build the TEXT and LABEL vocabularies from the training split.

    TEXT gets a size-capped vocab with pretrained vectors (path taken from
    config) and normally-initialized unknown tokens; LABEL gets a plain
    vocab from the observed labels.

    Returns:
        (TEXT, LABEL) with vocabularies attached.
    """
    # Hoisted: the original read p.configure() twice for one call.
    cfg = p.configure()
    TEXT.build_vocab(train_data,
                     max_size=cfg['MAX_SIZE'],
                     vectors=cfg['GLOVE_DIR'],
                     unk_init=torch.Tensor.normal_)
    print('[i] Text Vocabulary Built...')
    LABEL.build_vocab(train_data)
    print('[i] Label Vocabulary Built...')
    return TEXT, LABEL
def set_NN(text):
    """Instantiate the CNN from configured hyperparameters.

    The padding index is taken from the TEXT field's vocab so the
    embedding layer can ignore padding positions.

    Returns:
        The configured (untrained) CNN model.
    """
    # Hoisted: the original called P.configure() six times.
    cfg = P.configure()
    cnn_model = CNN(len(text.vocab),
                    cfg['embedding_dim'],
                    cfg['n_filters'],
                    cfg['filter_sizes'],
                    cfg['output_dim'],
                    cfg['dropout'],
                    pad_idx=text.vocab.stoi[text.pad_token])
    # Original used a backslash continuation inside the f-string, which
    # embedded stray whitespace in the log line; normalized here.
    print(f'[+] Model Configured...\n{cnn_model}')
    return cnn_model
def embed_vectors(text, model):
    """Load pretrained vectors into the embedding layer, zeroing UNK/PAD.

    Copies the vocab's pretrained vectors into ``model.embedding`` and
    overwrites the unknown- and padding-token rows with zeros so they
    carry no pretrained signal.

    Returns:
        The model with its embedding weights initialized.
    """
    pretrained = text.vocab.vectors
    model.embedding.weight.data.copy_(pretrained)
    print('[+] Pretrained Vectors Set...')
    UNK_IDX = text.vocab.stoi[text.unk_token]
    PAD_IDX = text.vocab.stoi[text.pad_token]
    # Hoisted: the original read P.configure()['embedding_dim'] twice.
    embedding_dim = P.configure()['embedding_dim']
    model.embedding.weight.data[UNK_IDX] = torch.zeros(embedding_dim)
    model.embedding.weight.data[PAD_IDX] = torch.zeros(embedding_dim)
    print('[+] Embedding Weights Set...')
    return model
def train_model():
    """Run the full IMDB training pipeline and return the model and field.

    Prepares data, vocab and iterators, builds and initializes the CNN,
    then trains for the configured number of epochs, checkpointing the
    model whenever validation loss improves.

    Returns:
        (cnn_model, text): the trained model and the TEXT field (needed
        later for tokenizing inference inputs).
    """
    # Hoisted: config is loop-invariant; the original re-read it per use,
    # including inside the epoch loop.
    cfg = P.configure()
    # prepare data
    text, label, train_data, valid_data, test_data = ds.fetch_data()
    text, label = ds.build_vocabulary(text, label, train_data)
    train_iter, valid_iter, test_iter = ds.fetch_iterators(
        train_data, valid_data, test_data)
    # build model and set parameters
    cnn_model = build.set_NN(text)
    print(
        f'The model has {build.count_parameters(cnn_model):,} trainable parameters'
    )
    cnn_model = build.embed_vectors(text, cnn_model)
    cnn_model, optimizer, criterion = build.fetch_loss_utils(cnn_model)
    # 'save model' conditional
    best_valid_loss = float('inf')
    # training loop
    print('[i] Begin Training...')
    for epoch in range(cfg['EPOCHS']):
        start_time = time.time()
        train_loss, train_acc = build.train(cnn_model, train_iter,
                                            optimizer, criterion)
        valid_loss, valid_acc = build.evaluate(cnn_model, valid_iter,
                                               criterion)
        end_time = time.time()
        epoch_mins, epoch_secs = build.epoch_time(start_time, end_time)
        # Checkpoint only when validation loss improves.
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(cnn_model, cfg['model'])
        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(
            f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%'
        )
        print(
            f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%'
        )
    print('[i] Training Finished...\n')
    return cnn_model, text
def fetch():
    """Crawl Twitter for tweets matching the configured sentiment terms.

    Returns:
        None.
    """
    print('[+] Requesting Data from Twitter...')
    cli = tw.Twitter_cli()
    cli.sentiment_crawler(p.configure()['sentiments'])
    print('[i] Tweets Collected...')
    return None
def fetch_iterators(train, valid, test):
    """Wrap the three dataset splits in bucketed dataloaders.

    Batch size comes from config; batches are placed on the device
    reported by fetch_device().

    Returns:
        (train_iter, valid_iter, test_iter) BucketIterators.
    """
    iterators = data.BucketIterator.splits(
        (train, valid, test),
        batch_size=P.configure()['batch_size'],
        device=fetch_device())
    print('[+] Dataloaders Set...')
    train_iter, valid_iter, test_iter = iterators
    return train_iter, valid_iter, test_iter
def fetch_data():
    """Seed all RNGs and load the IMDB train/valid/test splits.

    Seeds python, numpy and torch RNGs from the configured seed, builds
    the spacy-tokenized TEXT field and float LABEL field, then splits
    IMDB into train/validation/test.

    Returns:
        (text, label, train_data, valid_data, test_data).
    """
    # Hoisted: the original read P.configure()['seed'] four times.
    seed = P.configure()['seed']
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    print('[+] Seeds Set...')
    text = data.Field(tokenize='spacy', batch_first=True)
    print('[+] Text Recieved...')
    label = data.LabelField(dtype=torch.float)
    print('[+] Label Recieved...')
    print('[+] Transforming...')
    train_data, test_data = datasets.IMDB.splits(text, label)
    print('[+] Train | Test Split Set...')
    # NOTE(review): random.seed() returns None, so random_state=None is
    # effectively passed here; reproducibility comes from the global
    # seeding above — confirm this matches the intended behavior.
    train_data, valid_data = train_data.split(
        random_state=random.seed(seed))
    print('[+] Train | Validation Split Set...')
    return text, label, train_data, valid_data, test_data
def fetch_iterators(train, valid, test):
    """Create bucketed train/valid/test iterators sorted by text length.

    Returns:
        (train_iter, valid_iter, test_iter) BucketIterators on the
        device reported by fetch_device().
    """
    device = fetch_device()
    batch_size = p.configure()['BATCH_SIZE']
    train_iter, valid_iter, test_iter = data.BucketIterator.splits(
        (train, valid, test),
        sort_key=lambda example: example.text,
        batch_size=batch_size,
        device=device)
    print('[i] Data Loaders Set...')
    return train_iter, valid_iter, test_iter
def build_vocabulary(text, label, train_data):
    """Build vocabularies for the text and label fields.

    The text vocab is capped at the configured size, initialized from
    GloVe 6B-100d vectors, with unknown tokens drawn from a normal
    distribution; the label vocab is built directly from the data.

    Returns:
        (text, label) with vocabularies attached.
    """
    max_vocab = P.configure()['max_vocab_size']
    text.build_vocab(train_data,
                     max_size=max_vocab,
                     vectors='glove.6B.100d',
                     unk_init=torch.Tensor.normal_)
    print('[+] Text Vocabulary Built...')
    label.build_vocab(train_data)
    print('[+] Label Vocabulary Built...')
    return text, label
def load_data():
    """Seed torch and load the train/valid/test CSV splits.

    Builds spacy-tokenized TEXT, a LABEL field and a non-sequential
    CLASS field, then reads data/{train,valid,test}.csv as
    TabularDatasets (header skipped, first column dropped).

    Returns:
        (TEXT, LABEL, train_data, valid_data, test_data).
    """
    torch.manual_seed(p.configure()['SEED'])
    torch.backends.cudnn.deterministic = True
    TEXT = data.Field(tokenize='spacy')
    LABEL = data.Field()
    CLASS = data.Field(sequential=False, use_vocab=False)
    # (None, None) drops the CSV's leading index column.
    fields = [
        (None, None),
        ('text', TEXT),
        ('label', LABEL),
        ('cl', CLASS),
    ]
    splits = data.TabularDataset.splits(path='data',
                                        train='train.csv',
                                        validation='valid.csv',
                                        test='test.csv',
                                        format='csv',
                                        fields=fields,
                                        skip_header=True)
    train_data, valid_data, test_data = splits
    return TEXT, LABEL, train_data, valid_data, test_data
# NOTE(review): this chunk began with an orphaned duplicate of
# train_model()'s epilogue (epoch logging, checkpointing, a bare
# `return cnn_model, text`) with no enclosing `def` in view. A `return`
# at module level is a SyntaxError, and the same epilogue already exists
# inside train_model(), so the orphan is dropped and only the script
# entry point is kept.
if __name__ == '__main__':
    cnn_model, text = train_model()
    print('[i] Evaluate Model\n')
    # Smoke-test the trained model on the configured example sentences.
    build.predict_sentiment(cnn_model, text,
                            P.configure()['positive_sentence'])
    build.predict_sentiment(cnn_model, text,
                            P.configure()['negative_sentence'])