def prepare_data(self):
    # Load the train/test/validation splits as DataFrames.
    test, train, val = utils.load_test_train_val(self.data_num)

    train_texts = list(train.posts)

    # Build a custom GloVe vocabulary from every word in the training texts.
    glove = Glove()
    glove.create_custom_embedding(
        [word for text in train_texts for word in text.split()])

    self.train_tuple = utils.process_data(train, glove, self.max_words, self.max_posts)
    self.test_tuple = utils.process_data(test, glove, self.max_words, self.max_posts)
    self.val_tuple = utils.process_data(val, glove, self.max_words, self.max_posts)
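    # utils.process_data is project code; based on the standalone script in
    # Example #4 it is assumed to return (indices, labels, word_lengths,
    # post_lengths). Under that assumption, a hypothetical consumer would be:
    #
    #     indices, labels, wl, pl = self.train_tuple
    #     train_loader = utils.to_data_loader(indices, labels, wl, pl, batch_size)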
Example #2
def train_and_eval(thread_ids,
                   posts,
                   labels,
                   max_posts=20,
                   max_words=400,
                   frac=(0.8, 0.1, 0.1),  # tuple default avoids the mutable-default pitfall
                   seed=0,
                   batch_size=9,
                   embedding='glove',
                   max_epoch=500,
                   validate=False,
                   result_dir=None):

    # preliminary checks
    if not (len(thread_ids) == len(posts) == len(labels)):
        raise ValueError('thread_ids, posts, and labels must have the same length.')

    # Compare the sum with a tolerance: floating-point addition means
    # 0.8 + 0.1 + 0.1 may not equal 1.0 exactly.
    if len(frac) != 3 or abs(sum(frac) - 1) > 1e-6:
        raise ValueError('frac must contain three values that sum to 1.')

    if any(f <= 0 for f in frac):
        raise ValueError('Every element of frac must be positive.')

    if embedding not in ['glove']:
        raise ValueError('Invalid embedding; only "glove" is supported.')

    train_texts, train_labels, test_texts, test_labels, val_texts, val_labels = utils.filter_and_shuffle_data(
        thread_ids, posts, labels, max_words, max_posts, seed, frac)

    print(f'''----------
    Data Split Result:
    Train data = {len(train_texts)}
    Test data = {len(test_texts)}
    Val data = {len(val_texts)}
    ----------''')

    # From here on is GloVe-specific implementation (could be extracted into a function).
    print('Init embedding')
    glove = Glove()
    glove.create_custom_embedding(
        [item for sublist in train_texts for item in sublist])
    glove.add_to_embedding(['.', '!', '?'])
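    # NOTE: Glove here is a project-specific wrapper, not a library class.
    # From its usage in this function it is assumed to expose:
    #   word2idx / pad_token               - vocabulary lookup and pad token
    #   weights_matrix / emb_dim           - embedding matrix and its width
    #   sentence_to_indices(text, seq_len) - tokens -> fixed-length index list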

    print('Padding and packing data into data loader')
    for split in (train_texts, test_texts, val_texts):
        for thread in split:
            for j, post_text in enumerate(thread):
                thread[j] = glove.sentence_to_indices(post_text,
                                                      seq_len=max_words)

    # padding at the post level: fill short threads with all-pad posts
    post_padding = [glove.word2idx[glove.pad_token]] * max_words
    for split in (train_texts, test_texts, val_texts):
        for thread in split:
            if len(thread) < max_posts:
                thread.extend([post_padding] * (max_posts - len(thread)))

    # pad the label sequences to the same length with 0 (no intervention)
    for split in (train_labels, test_labels, val_labels):
        for thread_labels in split:
            if len(thread_labels) < max_posts:
                thread_labels.extend([0] * (max_posts - len(thread_labels)))
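    # Every thread is now a max_posts x max_words grid of token indices, so
    # each batch from the loaders has shape (batch, max_posts, max_words).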

    train_loader = utils.to_data_loader(batch_size, train_texts, train_labels)
    test_loader = utils.to_data_loader(batch_size, test_texts, test_labels)
    val_loader = utils.to_data_loader(batch_size, val_texts, val_labels)

    print('Creating model')
    emb_layer = create_emb_layer(
        torch.from_numpy(glove.weights_matrix).float().to(utils.get_device()))
    model = hLSTM(input_size=glove.emb_dim,
                  hidden_size=glove.emb_dim,
                  output_size=glove.emb_dim,
                  batch_size=batch_size,
                  num_layers=1,
                  bidirectional=False,
                  embedding=emb_layer,
                  drop_prob=0.5,
                  max_output=max_posts,
                  device=utils.get_device())
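    # create_emb_layer is project code. A minimal sketch of the common
    # pattern its name suggests (an nn.Embedding initialized from a
    # pretrained weight matrix, frozen by default) -- an assumption, not
    # the repo's actual definition:
    #
    #     def create_emb_layer(weights_matrix, trainable=False):
    #         emb = torch.nn.Embedding(*weights_matrix.shape)
    #         emb.weight = torch.nn.Parameter(weights_matrix)
    #         emb.weight.requires_grad = trainable
    #         return emb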

    labels = [label for sublist in train_labels for label in sublist]

    # Weight the loss inversely to class frequency: the rarer "intervention"
    # class (label 1) receives the larger weight.
    intervention_ratio = sum(1 for label in labels if label == 1) / len(labels)
    loss_fn = WeightedBCELoss(zero_weight=intervention_ratio,
                              one_weight=1 - intervention_ratio)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
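    # WeightedBCELoss is project code. A hypothetical sketch of the per-class
    # weighting its constructor suggests (not the repo's actual definition):
    #
    #     class WeightedBCELoss(torch.nn.Module):
    #         def __init__(self, zero_weight, one_weight):
    #             super().__init__()
    #             self.zero_weight = zero_weight
    #             self.one_weight = one_weight
    #
    #         def forward(self, output, target):
    #             loss = self.one_weight * target * torch.log(output) + \
    #                    self.zero_weight * (1 - target) * torch.log(1 - output)
    #             return torch.neg(torch.mean(loss))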

    writer = None
    if result_dir is not None:
        writer = SummaryWriter(f'runs/{result_dir}')

        if not os.path.exists(f'models/{result_dir}'):
            os.makedirs(f'models/{result_dir}')

    if not validate:
        val_loader = None

    print('Start training model')
    train_model(model, train_loader, max_epoch, loss_fn, optimizer, val_loader,
                writer)

    print('Evaluating model')
    f1, precision, recall = eval_model(model, test_loader, False)

    print(f'''
    Test results:
    F1 = {f1}
    Precision = {precision}
    Recall = {recall}
    ''')

    if result_dir is not None:
        print('Saving final model')
        torch.save(model.state_dict(), f'models/{result_dir}/final_model.pth')

    print('DONE :)))')
Example #3
def train_and_eval_crf(thread_ids, posts, labels, max_posts=20,
                       max_words=400, frac=(0.8, 0.1, 0.1), seed=0,
                       batch_size=9, embedding='glove', max_epoch=500,
                       validate=False, result_dir=None):

    # preliminary checks
    if not (len(thread_ids) == len(posts) == len(labels)):
        raise ValueError('thread_ids, posts, and labels must have the same length.')

    # Compare the sum with a tolerance: floating-point addition means
    # 0.8 + 0.1 + 0.1 may not equal 1.0 exactly.
    if len(frac) != 3 or abs(sum(frac) - 1) > 1e-6:
        raise ValueError('frac must contain three values that sum to 1.')

    if any(f <= 0 for f in frac):
        raise ValueError('Every element of frac must be positive.')

    if embedding not in ['glove']:
        raise ValueError('Invalid embedding; only "glove" is supported.')
    
    train_texts, train_labels, test_texts, test_labels, val_texts, val_labels = \
        utils.filter_and_shuffle_data(thread_ids, posts, labels, max_words,
                                      max_posts, seed, frac)

    # From here on is GloVe-specific implementation (could be extracted into a function).
    print('Init embedding')
    glove = Glove()
    glove.create_custom_embedding([item for sublist in train_texts for item in sublist])
    glove.add_to_embedding(['.', '!', '?'])

    print('Padding and packing data into data loader')
    for split in (train_texts, test_texts, val_texts):
        for thread in split:
            for j, post_text in enumerate(thread):
                thread[j] = glove.sentence_to_indices(post_text,
                                                      seq_len=max_words)

    # padding at the post level: fill short threads with all-pad posts
    post_padding = [glove.word2idx[glove.pad_token]] * max_words
    for split in (train_texts, test_texts, val_texts):
        for thread in split:
            if len(thread) < max_posts:
                thread.extend([post_padding] * (max_posts - len(thread)))
    
    # Build masks that mark real posts (1) vs padding posts (0) for the CRF.
    def build_mask(num_real_posts):
        mask = [1] * num_real_posts
        if num_real_posts < max_posts:
            mask.extend([0] * (max_posts - num_real_posts))
        return mask

    train_masks = [build_mask(len(thread_labels)) for thread_labels in train_labels]
    test_masks = [build_mask(len(thread_labels)) for thread_labels in test_labels]
    val_masks = [build_mask(len(thread_labels)) for thread_labels in val_labels]

    # pad the label sequences to the same length with 0 (no intervention)
    for split in [train_labels, test_labels, val_labels]:
        for thread_labels in split:
            if len(thread_labels) < max_posts:
                thread_labels.extend([0] * (max_posts - len(thread_labels)))
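    # Worked example (hypothetical values): a thread with 3 real posts and
    # max_posts = 5 gets mask [1, 1, 1, 0, 0], so the CRF ignores the two
    # padding posts when computing the sequence likelihood.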

    train_loader = utils.to_data_loader(batch_size, train_texts, train_labels, train_masks)
    test_loader = utils.to_data_loader(batch_size, test_texts, test_labels, test_masks)
    val_loader = utils.to_data_loader(batch_size, val_texts, val_labels, val_masks)

    print('Creating model')
    emb_layer = create_emb_layer(
        torch.from_numpy(glove.weights_matrix).float().to(utils.get_device()))
    model = hLSTM_CRF(num_tags=2,
                      input_size=glove.emb_dim,
                      hidden_size=glove.emb_dim,
                      output_size=glove.emb_dim,
                      batch_size=batch_size,
                      num_layers=1,
                      bidirectional=False,
                      embedding=emb_layer,
                      drop_prob=0.5,
                      max_output=max_posts,
                      device=utils.get_device())

    labels = [label for sublist in train_labels for label in sublist]

    # Class balance of the training labels (unused below; the CRF supplies
    # its own sequence loss via model.loss).
    intervention_ratio = sum(1 for label in labels if label == 1) / len(labels)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    writer = None
    if result_dir is not None:
        writer = SummaryWriter(f'runs/{result_dir}')

        if not os.path.exists(f'models/{result_dir}'):
            os.makedirs(f'models/{result_dir}')

    if not validate:
        val_loader = None

    print('Start training model')
    model.zero_grad()
    model.train()

    running_loss = 0.0
    for epoch in range(max_epoch):
        if (epoch + 1) % 20 == 0:
            print(f'Training model ({epoch + 1} / {max_epoch})')

        for i, (inputs, labels, masks) in enumerate(train_loader):
            inputs = inputs.to(utils.get_device())
            labels = labels.to(utils.get_device())
            masks = masks.to(utils.get_device())

            # reset gradients each step so they do not accumulate across
            # mini-batches
            optimizer.zero_grad()
            loss = model.loss(inputs, labels, masks)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
            optimizer.step()

            running_loss += loss.item()
            if i % 1000 == 999:  # every 1000 mini-batches
                if writer is not None:
                    writer.add_scalar('training loss',
                                      running_loss / 1000,
                                      epoch * len(train_loader) + i)
                running_loss = 0.0

                if val_loader is not None and writer is not None:
                    f1, _, _ = eval_model(model, val_loader)
                    writer.add_scalar('validation f1', f1,
                                      epoch * len(train_loader) + i)

    print('Evaluating model')
    f1, precision, recall = eval_model(model, test_loader, False)

    print(f'''
    Test results:
    F1 = {f1}
    Precision = {precision}
    Recall = {recall}
    ''')

    if result_dir is not None:
        print('Saving final model')
        torch.save(model.state_dict(), f'models/{result_dir}/final_model.pth')

    print('DONE :)))')
Example #4
INTERVENED_RATIO = 0.25  # hard-coded class ratio used to weight the BCE loss below
EPOCHS = 1
CLIP = 5  # gradient-clipping max norm
VAL_EVERY = 200  # how often (in steps) to run validation
TB_FOLDER = 'hlstm_data1_ep1'

test, train, val = utils.load_test_train_val(DATA_NUM)  # DataFrames
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
writer_v = SummaryWriter(f'runs/{TB_FOLDER}_val')
writer_t = SummaryWriter(f'runs/{TB_FOLDER}_train')

train_texts = list(train.posts)

print('Init GloVe embedding')
glove = Glove()
glove.create_custom_embedding([word for text in train_texts for word in text.split()])

print(f'Vocabulary size: {len(glove.word2idx)}')

print('Padding and packing data into data loader')
train_indices, train_labels, train_wl, train_pl = utils.process_data(train, glove, MAX_WORDS, MAX_POSTS)
test_indices, test_labels, test_wl, test_pl = utils.process_data(test, glove, MAX_WORDS, MAX_POSTS)
val_indices, val_labels, val_wl, val_pl = utils.process_data(val, glove, MAX_WORDS, MAX_POSTS)

train_loader = utils.to_data_loader(train_indices, train_labels, train_wl, train_pl, BATCH_SIZE)
test_loader = utils.to_data_loader(test_indices, test_labels, test_wl, test_pl, BATCH_SIZE)
val_loader = utils.to_data_loader(val_indices, val_labels, val_wl, val_pl, BATCH_SIZE)

print('Creating model')
embedding = create_emb_layer(torch.from_numpy(glove.weights_matrix).float().to(device))
criterion = WeightedBCELoss(zero_weight=INTERVENED_RATIO, one_weight=1 - INTERVENED_RATIO)