Example #1
def test(dim, args):
    import os
    from os.path import join

    import torch
    import numpy as np
    from features import ExtractWordEmbeddings
    from preprocess_data import batchify, padBatch
    from models.lstm import LSTMClassifier
    from sklearn.utils import shuffle
    from sklearn.metrics import roc_auc_score, recall_score, accuracy_score

    # hyperparameters
    is_cuda = True
    batch_size = 60
    embedding_dim = 300
    hidden_dim = args.hidden_dim
    weight_dir = 'weights/LSTM/%s' % dim
    weight_file = join(weight_dir, 'best-weights.pth')
    assert os.path.exists(
        weight_file), "The saved model weights file doesn't exist"

    # load the test split (loadDatasetForLSTM is assumed to be provided by the
    # surrounding module, as in the original project)
    X_t, y_t = loadDatasetForLSTM(dim, 'test')

    # load model and settings for training
    model = LSTMClassifier(embedding_dim=embedding_dim, hidden_dim=hidden_dim)

    state_dict = torch.load(weight_file)
    model.load_state_dict(state_dict)
    if is_cuda:
        model.cuda()

    em = ExtractWordEmbeddings(emb_type='glove')

    # evaluate on the test set
    y_scores = []
    X_t, y_t = shuffle(X_t, y_t)
    val_batches = batchify(X_t, y_t, batch_size)
    model.eval()
    with torch.no_grad():
        for X_b, y_b in val_batches:
            inputs = torch.tensor(
                padBatch([
                    em.obtain_vectors_from_sentence(sent, True) for sent in X_b
                ])).float()
            targets = torch.tensor(y_b, dtype=torch.float32)
            if is_cuda:
                inputs, targets = inputs.cuda(), targets.cuda()
            outputs = model(inputs).tolist()
            y_scores.extend(outputs)
    y_preds = (np.array(y_scores) >= 0.5).astype(int)
    auc = roc_auc_score(y_true=y_t, y_score=y_scores)
    rec = recall_score(y_true=y_t, y_pred=y_preds)
    acc = accuracy_score(y_true=y_t, y_pred=y_preds)
    print('AUC: ', round(auc, 2))
    print('REC: ', round(rec, 2))
    print('ACC: ', round(acc, 2))
    with open(join(weight_dir, 'scores.txt'), 'w') as f:
        f.write('AUC: %1.2f\n' % auc)
        f.write('REC: %1.2f\n' % rec)
        f.write('ACC: %1.2f\n' % acc)
    return
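The test() function above only reads args.hidden_dim from its args object, so it can be driven by a very small command-line wrapper. The sketch below is a hypothetical driver, not part of the original project; the flag names, dimension name, and default hidden size are illustrative assumptions.

# Hypothetical driver for test(); flag names and defaults are assumptions.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--dim', type=str, default='support',
                        help='Name of the dimension whose weights to load.')
    parser.add_argument('--hidden_dim', type=int, default=300,
                        help='Hidden size the LSTM was trained with.')
    args = parser.parse_args()
    test(args.dim, args)  # test() reads args.hidden_dim internally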
Example #2
def main():
    # Assumes module-level imports of argparse, random, numpy as np, torch,
    # torch.optim as optim, torch.utils.data as data, and the project helpers
    # (load_embedding, LSTMClassifier, encode_setence, get_padding_codes,
    # TextDataset, train_lstm, evaluate_lstm).
    parser = argparse.ArgumentParser()
    parser.add_argument('--phase', type=str, help='Train or test.')
    parser.add_argument('--embedding_file',
                        type=str,
                        help='Filename to save the trained word embeddings.')
    parser.add_argument('--model_path',
                        type=str,
                        help='The file of the lstm model.')
    parser.add_argument('--test_file',
                        type=str,
                        help='The file of the testing data.')
    parser.add_argument('--epochs',
                        type=int,
                        default=100,
                        help='The number of training epochs.')
    parser.add_argument('--batch_size',
                        type=int,
                        default=50,
                        help='The batch size of the training phase.')
    args = parser.parse_args()
    phase = args.phase
    embedding_file = args.embedding_file
    model_path = args.model_path

    embeddings, word2id, id2word = load_embedding(embedding_file)
    # Label indices map to Chinese category names: 0 games, 1 role-playing,
    # 2 MOBA, 3 sports, 4 Three Kingdoms, 5 war, 6 clothing, 7 T-shirt,
    # 8 marriage.
    id2label = dict({
        0: u'游戏',
        1: u'角色扮演',
        2: u'moba',
        3: u'运动',
        4: u'三国',
        5: u'战争',
        6: u'服饰',
        7: u'T恤',
        8: u'婚姻'
    })

    EMBEDDING_DIM = 100
    HIDDEN_DIM = 200
    LINEAR_HIDDEN_DIM = 100
    N_CLASSES = len(id2label)

    # Create the lstm model
    model = LSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, LINEAR_HIDDEN_DIM,
                           len(word2id.keys()), N_CLASSES, embeddings)
    optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                  model.parameters()),
                           lr=5e-4)
    print(model)

    if phase == 'train':
        print('Load the training data and prepare labels...')
        game_roleplay = 'data/train/1k_std_rollplay.word'
        game_moba = 'data/train/1k_std_moba.word'
        game_sport = 'data/train/1k_std_sport_game.word'
        sanguo_battle = 'data/train/1k_std_sanguo.word'
        cloth_shirt = 'data/train/1k_std_cloth.word'
        marriage = 'data/train/1k_std_marriage.word'
        sport = 'data/train/1k_std_sport.word'

        corpus2label = dict({
            'game_roleplay': (game_roleplay, [1, 1, 0, 0, 0, 0, 0, 0, 0]),
            'game_moba': (game_moba, [1, 0, 1, 0, 0, 0, 0, 0, 0]),
            'game_sport': (game_sport, [1, 0, 0, 1, 0, 0, 0, 0, 0]),
            'sanguo_battle': (sanguo_battle, [0, 0, 0, 0, 1, 1, 0, 0, 0]),
            'cloth_shirt': (cloth_shirt, [0, 0, 0, 0, 0, 0, 1, 1, 0]),
            'marriage': (marriage, [0, 0, 0, 0, 0, 0, 0, 0, 1]),
            'sport': (sport, [0, 0, 0, 1, 0, 0, 0, 0, 0])
        })

        corpus_data = []
        labels = []
        for file_name, label in corpus2label.values():
            print(file_name, label)
            tmp_codes, tmp_labels = encode_setence(file_name, word2id, label)
            corpus_data.extend(tmp_codes)
            labels.extend(tmp_labels)

        corpus_data, lengths = get_padding_codes(corpus_data)
        corpus_data = torch.tensor(np.array(corpus_data), dtype=torch.long)
        lengths = torch.tensor(np.array(lengths), dtype=torch.long)
        labels = torch.tensor(np.array(labels), dtype=torch.float)

        # Train and validate
        # labels = np.array(labels)
        train_size = int(corpus_data.shape[0] * 0.8)
        indices = list(range(corpus_data.shape[0]))
        random.shuffle(indices)
        train_indices = indices[0:train_size]
        validate_indices = indices[train_size:]

        train_data = corpus_data[train_indices, :]
        train_labels = labels[train_indices, :]
        train_lengths = lengths[train_indices]
        validate_data = corpus_data[validate_indices, :]
        validate_labels = labels[validate_indices, :]
        validate_lengths = lengths[validate_indices]

        # Move the tensors and the model to the GPU when CUDA is available.
        if torch.cuda.is_available():
            train_data = train_data.cuda()
            train_lengths = train_lengths.cuda()
            train_labels = train_labels.cuda()
            validate_data = validate_data.cuda()
            validate_labels = validate_labels.cuda()
            validate_lengths = validate_lengths.cuda()
            model.cuda()

        text_data = TextDataset(train_data, train_labels, train_lengths)
        train_dataloader = data.DataLoader(text_data,
                                           batch_size=args.batch_size,
                                           shuffle=True)

        print('Train the LSTM text classifier model...')
        train_lstm(model, model_path, optimizer, train_dataloader,
                   validate_data, validate_labels, validate_lengths,
                   args.epochs)

    if phase == 'test':
        test_file = args.test_file
        model.load_state_dict(torch.load(model_path))
        optimizer.zero_grad()
        test_data, labels = encode_setence(test_file, word2id, 1)
        padding_test_data, lengths = get_padding_codes(test_data)
        padding_test_data = torch.tensor(np.array(padding_test_data),
                                         dtype=torch.long)
        lengths = torch.tensor(np.array(lengths), dtype=torch.long)
        scores = evaluate_lstm(model, padding_test_data, lengths)
        scores = scores.data.cpu().numpy()

        # print the predicted labels and the combined score for each sentence
        for idx, score in enumerate(scores):
            sentence = [id2word[int(code)] for code in test_data[idx]]
            tmp_labels = [id2label[i] for i in np.where(score > 0.5)[0]]
            tmp_score = np.array(
                [float(score[i]) for i in np.where(score > 0.5)[0]])
            tmp_score = tmp_score.prod()
            print(idx)
            print(' '.join(sentence))
            print(' '.join(tmp_labels))
            print(tmp_score)
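The command-line interface of main() is fully determined by the argparse flags defined above, so the script could be launched as in the sketch below; the module name and file paths are illustrative assumptions, not values from the original project.

# Hypothetical entry point and invocations (script name and paths are assumptions):
#   python lstm_text_classifier.py --phase train \
#       --embedding_file embeddings/word2vec_100d.txt \
#       --model_path models/lstm_classifier.pt --epochs 100 --batch_size 50
#   python lstm_text_classifier.py --phase test \
#       --embedding_file embeddings/word2vec_100d.txt \
#       --model_path models/lstm_classifier.pt --test_file data/test/queries.word
if __name__ == '__main__':
    main()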
Example #3
    def __init__(self,
                 models_dir='./models/lstm_trained_models',
                 embeddings_dir='./embeddings',
                 is_cuda=False):
        """
		@param models_dir: the directory where the LSTM models are stored
		@param embeddings_dir: the directory where the embeddings are stored. The directory must contain the following subdirectories:
		                       word2vec/GoogleNews-vectors-negative300.wv
		                       fasttext/wiki-news-300d-1M-subword.wv
		                       glove/glove.42B.300d.wv
		@param is_cuda: to enable cuda
		"""
        self.is_cuda = is_cuda
        self.models_dir = models_dir
        self.embeddings_dir = embeddings_dir

        # load embeddings
        self.em_glove = ExtractWordEmbeddings('glove',
                                              emb_dir=self.embeddings_dir)
        self.em_word2vec = ExtractWordEmbeddings('word2vec',
                                                 emb_dir=self.embeddings_dir)
        self.em_fasttext = ExtractWordEmbeddings('fasttext',
                                                 emb_dir=self.embeddings_dir)
        self.dimensions_list = [
            'support', 'knowledge', 'conflict', 'power', 'similarity', 'fun',
            'status', 'trust', 'identity', 'romance'
        ]

        # load models
        self.dim2model = {}
        self.dim2embedding = {}

        for dim in self.dimensions_list:
            model = LSTMClassifier(embedding_dim=300, hidden_dim=300)
            if self.is_cuda:
                print(f'Torch version: {torch.__version__}')
                print(f'Torch CUDA available : {torch.cuda.is_available()}')
                if torch.cuda.is_available():
                    print(
                        f'Torch current device : {torch.cuda.current_device()}'
                    )
                    print(f'Torch device count : {torch.cuda.device_count()}')
                    print(
                        f'Torch device name : {torch.cuda.get_device_name(0)}')
                    model.cuda()
                else:
                    print(
                        'Cuda not available. Instantiated the TenDimensionsClassifier with CUDA=False'
                    )
                    self.is_cuda = False
            model.eval()
            for modelname in os.listdir(self.models_dir):
                if ('-best.lstm' in modelname) and (dim in modelname):
                    best_state = torch.load(join(self.models_dir, modelname),
                                            map_location='cpu')
                    model.load_state_dict(best_state)
                    if 'glove' in modelname:
                        em = self.em_glove
                    elif 'word2vec' in modelname:
                        em = self.em_word2vec
                    elif 'fasttext' in modelname:
                        em = self.em_fasttext
                    self.dim2model[dim] = model
                    self.dim2embedding[dim] = em
                    break
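After __init__ has filled dim2model and dim2embedding, scoring a sentence for a single dimension follows the same inference pattern as Example #1. The helper below is a hypothetical sketch rather than a method of the original class; it assumes padBatch from preprocess_data and the obtain_vectors_from_sentence method used in Example #1.

# Hypothetical helper (not part of the class): score one sentence on one dimension.
import torch
from preprocess_data import padBatch  # assumed to be the same helper as in Example #1

def score_sentence(clf, dim, sentence):
    model = clf.dim2model[dim]
    em = clf.dim2embedding[dim]
    # tokens -> word vectors -> padded batch of size 1
    vectors = em.obtain_vectors_from_sentence(sentence.split(), True)
    inputs = torch.tensor(padBatch([vectors])).float()
    if clf.is_cuda:
        inputs = inputs.cuda()
    with torch.no_grad():
        return float(model(inputs))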