Example #1
def get_embd(cfg, vocab):
    # for words common to dataset vocab and GLOVE, use GLOVE vectors
    # for other words in dataset vocab, use random normal vectors
    emb_file = os.path.join(cfg.input_dir(), 'sick_embed.pth')

    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
    else:
        # load glove embeddings and vocab
        glove_vocab, glove_emb = utils.load_word_vectors(
            os.path.join(cfg.glove_dir(), 'glove.840B.300d'))
        cfg.logger.debug('==> GLOVE vocabulary size: %d ' % glove_vocab.size())
        emb = torch.zeros(vocab.size(),
                          glove_emb.size(1),
                          dtype=torch.float,
                          device=cfg.device())
        emb.normal_(0, 0.05)
        # zero out the embeddings for padding and other special words if they are absent in vocab
        for idx, item in enumerate([
                Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD,
                Constants.EOS_WORD
        ]):
            emb[idx].zero_()
        for word in vocab.labelToIdx.keys():
            if glove_vocab.getIndex(word):
                emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(
                    word)]
        # save once, after the whole vocab has been processed
        torch.save(emb, emb_file)

    return emb
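Every example here depends on a load_word_vectors helper whose implementation is not shown. Below is a minimal sketch, under stated assumptions, of the GloVe-style variant used in example #1: it parses '<path>.txt' (a word followed by its float components on each line) into a small vocab object plus a torch.FloatTensor. The real treelstm utility also caches to .pth files and handles more edge cases; treat this only as an illustration of the interface the examples rely on.

import torch


class SimpleVocab:
    """Stand-in for the Vocab object the examples query via getIndex()/size()."""

    def __init__(self, words):
        self.labelToIdx = {w: i for i, w in enumerate(words)}

    def getIndex(self, word):
        # returns None for out-of-vocabulary words, so the truthiness checks
        # in the examples skip unknown words (and, as written, the word at index 0)
        return self.labelToIdx.get(word)

    def size(self):
        return len(self.labelToIdx)


def load_word_vectors(path, separator=' '):
    # assumed file layout: '<path>.txt' with one token and its vector per line
    words, vectors = [], []
    with open(path + '.txt', encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(separator)
            words.append(parts[0])
            vectors.append([float(x) for x in parts[1:]])
    return SimpleVocab(words), torch.tensor(vectors, dtype=torch.float)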
Example #2
def main():
    """
    Create a range of clusters and compare them
    """
    word_vectors, used_words, unused_words = load_word_vectors(FILENAME)
    start = time.time()
    n_clusters = range(1, 21)
    print("Using cluster sizes from {} to {}".format(min(n_clusters), max(n_clusters)))
    kmeans_clusters = [KMeans(n_clusters=n).fit(word_vectors) for n in n_clusters]
    centroids = [k.cluster_centers_ for k in kmeans_clusters]

    D_k = [cdist(word_vectors, cent, 'euclidean') for cent in centroids]
    cIdx = [np.argmin(D, axis=1) for D in D_k]
    dist = [np.min(D, axis=1) for D in D_k]
    avgWithinSS = [sum(d) / len(word_vectors) for d in dist]

    # Total within-cluster sum of squares
    wcss = [sum(d**2) for d in dist]
    tss = sum(pdist(word_vectors)**2)/len(word_vectors)
    bss = tss-wcss

    stop = time.time()
    print("Time taken for clustering: {} seconds.".format(stop - start))

    print("Plotting elbow curve")
    plot_elbow_curve(n_clusters=n_clusters, avgWithinSS=avgWithinSS, bss=bss, tss=tss)
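The plot_elbow_curve helper is not shown in this example. The sketch below is an assumption: it plots the average within-cluster sum of squares and the percentage of variance explained (bss / tss) against the number of clusters with matplotlib, matching the keyword arguments in the call above.

import matplotlib.pyplot as plt


def plot_elbow_curve(n_clusters, avgWithinSS, bss, tss):
    # hypothetical implementation; only the signature is taken from the call above
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
    ax1.plot(list(n_clusters), avgWithinSS, 'b*-')
    ax1.set_xlabel('Number of clusters')
    ax1.set_ylabel('Average within-cluster sum of squares')
    ax1.set_title('Elbow for KMeans clustering')
    ax2.plot(list(n_clusters), [b / tss * 100 for b in bss], 'r*-')
    ax2.set_xlabel('Number of clusters')
    ax2.set_ylabel('Percentage of variance explained')
    fig.tight_layout()
    plt.show()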
Example #3
def main():
    """
    Open csv
    """

    word_vectors, used_words, unused_words = load_word_vectors(FILENAME)

    start = time.time()
    n_clusters = 3

    print("Clustering")
    kmeans_clustering_predict = KMeans(n_clusters=n_clusters)
    idx = kmeans_clustering_predict.fit_predict(word_vectors)
    clustered_words = {}
    for index, cluster in enumerate(idx):
        key = used_words[index]
        if key not in clustered_words:
            clustered_words[key] = cluster
        else:
            raise Exception("Key {} already exists!".format(key))

    # reuse the labels from fit_predict above; refitting could produce a
    # different clustering than the one stored in clustered_words
    labels = kmeans_clustering_predict.labels_

    end = time.time()
    elapsed = end - start
    print("Time taken for clustering: {} seconds.".format(elapsed))
    print("Silhouette score: {}".format(
        metrics.silhouette_score(word_vectors, labels, metric='euclidean')))
    print("CH score: {}".format(
        metrics.calinski_harabaz_score(word_vectors, labels)))

    print("Saving word clusters to {}".format(CLUSTER_OUTPUT_FILE))
    with open(CLUSTER_OUTPUT_FILE, "w", encoding="utf-8") as f:
        writer = csv.writer(f,
                            delimiter=',',
                            quotechar='|',
                            quoting=csv.QUOTE_MINIMAL)
        for clustered_word, cluster in clustered_words.items():
            writer.writerow([clustered_word, cluster])
    with open("wordsNOTvec_output.txt", "w", encoding="utf8") as f:
        for unused_word in unused_words:
            f.write(unused_word + "\n")
    print("Saving word vectors to {}".format(VECTOR_OUTPUT_FILE))

    with open(VECTOR_OUTPUT_FILE, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f,
                            delimiter=',',
                            quotechar='|',
                            quoting=csv.QUOTE_MINIMAL)
        for i, word in enumerate(used_words):
            vectors = word_vectors[i]
            writer.writerow([
                word,
            ] + vectors.tolist())
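Examples #2, #3, and #5 call a different load_word_vectors(FILENAME) that returns (word_vectors, used_words, unused_words). Neither the loader nor the FILENAME layout is shown, so the sketch below is an assumption: a CSV where each row is a word optionally followed by its vector components, with vector-less words collected as "unused".

import csv

import numpy as np


def load_word_vectors(filename):
    # hypothetical CSV-based loader; both the file layout and the rule for
    # "unused" words are assumptions made for illustration only
    word_vectors, used_words, unused_words = [], [], []
    with open(filename, encoding='utf-8') as f:
        for row in csv.reader(f):
            word, values = row[0], row[1:]
            if values:
                used_words.append(word)
                word_vectors.append([float(x) for x in values])
            else:
                unused_words.append(word)
    return np.array(word_vectors), used_words, unused_words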
Example #4
def load_glove(glove_path, vocab_size, word_dict):
    # glove_path = "/Users/cyy7645/Desktop/treelstm.pytorch-master/data/glove/"
    glove_vocab, glove_emb = utils.load_word_vectors(
        os.path.join(glove_path, 'glove.840B.300d'))
    # logger.debug('==> GLOVE vocabulary size: %d ' % glove_vocab.size())
    # initialize the embedding matrix with random normal vectors
    emb = torch.zeros(vocab_size, 300, dtype=torch.float)
    emb.normal_(0, 0.05)
    # zero out the embeddings for padding and other special words if they are absent in vocab
    for idx, item in enumerate(['<PAD>']):
        emb[idx].zero_()
    # if a word from the dataset vocabulary appears in glove_vocab, replace its
    # row with the pre-trained GloVe vector
    for word in word_dict.keys():
        if glove_vocab.getIndex(word):
            emb[word_dict[word]] = glove_emb[glove_vocab.getIndex(
                word)]
    return emb
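A small usage sketch for load_glove follows. The word_dict layout ('<PAD>' at index 0, then regular words) and the GloVe directory path are placeholders for illustration, not values from the original code.

# usage sketch; './data/glove/' is a placeholder path and word_dict is made up
word_dict = {'<PAD>': 0, 'the': 1, 'cat': 2, 'sat': 3}
emb = load_glove('./data/glove/', vocab_size=len(word_dict), word_dict=word_dict)
print(emb.shape)  # expected: torch.Size([4, 300])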
Example #5
def main():
    """
    Open csv
    """
    args = parse_args()
    word_vectors, used_words, unused_words = load_word_vectors(args.input)

    with open(args.output, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f,
                            delimiter=',',
                            quotechar='|',
                            quoting=csv.QUOTE_MINIMAL)
        for i, word in enumerate(used_words):
            vectors = word_vectors[i]
            writer.writerow([
                word,
            ] + vectors.tolist())
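Example #5 also relies on a parse_args() helper that is not shown. A minimal sketch follows, assuming only --input and --output paths are needed; the option names are inferred from the attributes accessed above and are not taken from the original code.

import argparse


def parse_args():
    # hypothetical argument parser matching args.input / args.output above
    parser = argparse.ArgumentParser(
        description='Write word vectors out as a CSV file')
    parser.add_argument('--input', required=True,
                        help='path to the word-vector file read by load_word_vectors')
    parser.add_argument('--output', required=True,
                        help='path of the CSV file to write')
    return parser.parse_args()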
Example #6
def main():
    global args
    args = parse_args()
    vocab_file = os.path.join(args.dtree, 'snli_vocab_cased.txt')
    vocab = Vocab(filename=vocab_file)

    args.cuda = args.cuda and torch.cuda.is_available()
    device = torch.device("cuda:0" if args.cuda else "cpu")
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.benchmark = True

    if not os.path.exists(args.save):
        os.makedirs(args.save)

    l_train_file = os.path.join(args.dtree, args.premise_train)
    r_train_file = os.path.join(args.dtree, args.hypothesis_train)
    label_train_file = os.path.join(args.dtree, args.label_train)

    l_dev_file = os.path.join(args.dtree, args.premise_dev)
    r_dev_file = os.path.join(args.dtree, args.hypothesis_dev)
    label_dev_file = os.path.join(args.dtree, args.label_dev)

    l_test_file = os.path.join(args.dtree, args.premise_test)
    r_test_file = os.path.join(args.dtree, args.hypothesis_test)
    label_test_file = os.path.join(args.dtree, args.label_test)

    l_train_squence_file = os.path.join(args.ctree, args.premise_train)
    r_train_squence_file = os.path.join(args.ctree, args.hypothesis_train)

    l_dev_squence_file = os.path.join(args.ctree, args.premise_dev)
    r_dev_squence_file = os.path.join(args.ctree, args.hypothesis_dev)

    l_test_squence_file = os.path.join(args.ctree, args.premise_test)
    r_test_squence_file = os.path.join(args.ctree, args.hypothesis_test)

    print(l_train_file, l_dev_file, l_test_file)
    print(r_train_file, r_dev_file, r_test_file)
    print(label_train_file, label_dev_file, label_test_file)

    # load SICK dataset splits
    train_file = os.path.join(args.data, 'train.pth')
    if os.path.isfile(train_file):
        train_dataset = torch.load(train_file)
    else:
        train_dataset = NLIdataset(premise_tree=l_train_file,
                                   hypothesis_tree=r_train_file,
                                   premise_seq=l_train_squence_file,
                                   hypothesis_seq=r_train_squence_file,
                                   label=label_train_file,
                                   vocab=vocab,
                                   num_classes=3,
                                   args=args)
        torch.save(train_dataset, train_file)
    if args.savedev == 1:
        dev_file = os.path.join(args.data, 'dev.pth')
        if os.path.isfile(dev_file):
            dev_dataset = torch.load(dev_file)
        else:
            dev_dataset = NLIdataset(premise_tree=l_dev_file,
                                     hypothesis_tree=r_dev_file,
                                     premise_seq=l_dev_squence_file,
                                     hypothesis_seq=r_dev_squence_file,
                                     label=label_dev_file,
                                     vocab=vocab,
                                     num_classes=3,
                                     args=args)
            torch.save(dev_dataset, dev_file)

        test_file = os.path.join(args.data, 'test.pth')
        if os.path.isfile(test_file):
            test_dataset = torch.load(test_file)
        else:
            test_dataset = NLIdataset(premise_tree=l_test_file,
                                      hypothesis_tree=r_test_file,
                                      premise_seq=l_test_squence_file,
                                      hypothesis_seq=r_test_squence_file,
                                      label=label_test_file,
                                      vocab=vocab,
                                      num_classes=3,
                                      args=args)
            torch.save(test_dataset, test_file)
    else:
        dev_dataset = NLIdataset(premise_tree=l_dev_file,
                                 hypothesis_tree=r_dev_file,
                                 premise_seq=l_dev_squence_file,
                                 hypothesis_seq=r_dev_squence_file,
                                 label=label_dev_file,
                                 vocab=vocab,
                                 num_classes=3,
                                 args=args)
        test_dataset = NLIdataset(premise_tree=l_test_file,
                                  hypothesis_tree=r_test_file,
                                  premise_seq=l_test_squence_file,
                                  hypothesis_seq=r_test_squence_file,
                                  label=label_test_file,
                                  vocab=vocab,
                                  num_classes=3,
                                  args=args)

    train_data_loader = DataLoader(train_dataset,
                                   batch_size=args.batchsize,
                                   shuffle=False)
    dev_data_loader = DataLoader(dev_dataset,
                                 batch_size=args.batchsize,
                                 shuffle=False)
    test_data_loader = DataLoader(test_dataset,
                                  batch_size=args.batchsize,
                                  shuffle=False)

    # for data in train_data_loader:
    #     lsent, lgraph, rsent, rgraph, label = data
    #     print(label)
    #     break

    # # initialize model, criterion/loss_function, optimizer
    # model = TreeLSTMforNLI(
    #     vocab.size(),
    #     args.input_dim,
    #     args.mem_dim,
    #     args.hidden_dim,
    #     args.num_classes,
    #     args.sparse,
    #     args.freeze_embed)

    # for words common to dataset vocab and GLOVE, use GLOVE vectors
    # for other words in dataset vocab, use random normal vectors
    emb_file = os.path.join(args.data, 'snli_embed.pth')
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
    else:
        # load glove embeddings and vocab
        glove_vocab, glove_emb = utils.load_word_vectors(
            os.path.join(args.glove, 'glove.840B.300d'))
        emb = torch.zeros(vocab.size(),
                          glove_emb.size(1),
                          dtype=torch.float,
                          device=device)
        emb.normal_(0, 0.05)
        # zero out the embeddings for padding and other special words if they are absent in vocab
        for idx, item in enumerate(['_PAD_', '_UNK_', '_BOS_', '_EOS_']):
            emb[idx].zero_()
        for word in vocab.labelToIdx.keys():
            if glove_vocab.getIndex(word):
                emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(
                    word)]
        torch.save(emb, emb_file)
    # plug these into embedding matrix inside model
    # model.emb.weight.data.copy_(emb)
    model = ESIM(vocab.size(),
                 args.input_dim,
                 args.mem_dim,
                 embeddings=emb,
                 dropout=0.5,
                 num_classes=args.num_classes,
                 device=device,
                 freeze=args.freeze_embed).to(device)
    criterion = nn.CrossEntropyLoss()
    model.to(device), criterion.to(device)
    if args.optim == 'adam':
        optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                      model.parameters()),
                               lr=args.lr,
                               weight_decay=args.wd)
    elif args.optim == 'adagrad':
        optimizer = optim.Adagrad(filter(lambda p: p.requires_grad,
                                         model.parameters()),
                                  lr=args.lr,
                                  weight_decay=args.wd)
    elif args.optim == 'sgd':
        optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                                     model.parameters()),
                              lr=args.lr,
                              weight_decay=args.wd)

    trainer = Trainer(args, model, criterion, optimizer, device)

    best = -999.0
    best_loop = 0
    for epoch in range(args.epochs):
        train_loss = trainer.train(train_data_loader)
        train_loss, train_acc = trainer.test(train_data_loader)
        dev_loss, dev_acc = trainer.test(dev_data_loader)
        test_loss, test_acc = trainer.test(test_data_loader)

        print('==> Epoch {}, Train \tLoss: {}\tAcc: {}'.format(
            epoch, train_loss, train_acc))
        print('==> Epoch {}, Dev \tLoss: {}\tAcc: {}'.format(
            epoch, dev_loss, dev_acc))
        print('==> Epoch {}, Test \tLoss: {}\tAcc: {}'.format(
            epoch, test_loss, test_acc))

        if best < test_acc:
            best = test_acc
            best_loop = 0
            print('Got improvement, saving model. The best performance is %f' %
                  (best))
            checkpoint = {
                'model': trainer.model.state_dict(),
                'optim': trainer.optimizer,
                'acc': test_acc,
                'args': args,
                'epoch': epoch
            }
            print('==> New optimum found, checkpointing everything now...')
            torch.save(checkpoint,
                       '%s.pt' % os.path.join(args.save, args.expname))
        else:
            best_loop += 1
            if best_loop > args.patience:
                print('Early stop, best acc: %f' % (best))
                break
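Example #6 hands the pre-built embedding tensor to the ESIM constructor via embeddings=emb. ESIM's internals are not shown here; a common way for a model to adopt such a matrix is nn.Embedding.from_pretrained, sketched below as an assumption rather than the actual ESIM code.

import torch
import torch.nn as nn


class EmbeddingOnly(nn.Module):
    # illustrative module, not the ESIM model used above
    def __init__(self, embeddings, freeze=True):
        super().__init__()
        # copies the pre-built matrix into the layer and optionally freezes it
        self.emb = nn.Embedding.from_pretrained(embeddings, freeze=freeze)

    def forward(self, token_ids):
        return self.emb(token_ids)


vectors = torch.randn(10, 300)           # stand-in for the GloVe-initialised emb
module = EmbeddingOnly(vectors, freeze=True)
out = module(torch.tensor([[1, 2, 3]]))  # shape: (1, 3, 300)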
Example #7
def main():
    global args
    args = parse_args(type=1)
    print(args.name)
    print(args.model_name)

    if args.mem_dim == 0:
        if args.model_name == 'dependency':
            args.mem_dim = 168
        elif args.model_name == 'constituency':
            args.mem_dim = 150
        elif args.model_name == 'lstm':
            args.mem_dim = 168
        elif args.model_name == 'bilstm':
            args.mem_dim = 168

    if args.num_classes == 0:
        if args.fine_grain:
            args.num_classes = 5  # 0 1 2 3 4
        else:
            args.num_classes = 3  # 0 1 2 (1 neutral)
    elif args.num_classes == 2:
        # assert False # this will not work
        assert not args.fine_grain

    args.cuda = args.cuda and torch.cuda.is_available()
    # args.cuda = False
    print(args)
    # torch.manual_seed(args.seed)
    # if args.cuda:
    # torch.cuda.manual_seed(args.seed)

    train_dir = os.path.join(args.data, 'train/')
    dev_dir = os.path.join(args.data, 'dev/')
    test_dir = os.path.join(args.data, 'test/')

    # write unique words from all token files
    token_files = [
        os.path.join(split, 'sents.toks')
        for split in [train_dir, dev_dir, test_dir]
    ]
    #
    vocab_file = os.path.join(args.data, 'vocab-cased.txt')  # use vocab-cased
    if not os.path.isfile(vocab_file):
        build_vocab(token_files, vocab_file)
    # build_vocab(token_files, vocab_file) NO, DO NOT BUILD VOCAB,  USE OLD VOCAB

    # get vocab object from vocab file previously written
    vocab = Vocab(filename=vocab_file)
    # vocab.add(Constants.UNK)

    print('==> SST vocabulary size : %d ' % vocab.size())

    # Load SST dataset splits

    is_preprocessing_data = False  # flag so the program can exit after preprocessing data

    if args.model_name == 'dependency' or args.model_name == 'constituency':
        DatasetClass = SSTDataset
    elif args.model_name == 'lstm' or args.model_name == 'bilstm':
        DatasetClass = SeqSSTDataset

    # train
    train_file = os.path.join(args.data, 'sst_train.pth')
    if os.path.isfile(train_file):
        train_dataset = torch.load(train_file)
    else:
        train_dataset = DatasetClass(train_dir, vocab, args.num_classes,
                                     args.fine_grain, args.model_name)
        torch.save(train_dataset, train_file)
        is_preprocessing_data = True

    # dev
    dev_file = os.path.join(args.data, 'sst_dev.pth')
    if os.path.isfile(dev_file):
        dev_dataset = torch.load(dev_file)
    else:
        dev_dataset = DatasetClass(dev_dir, vocab, args.num_classes,
                                   args.fine_grain, args.model_name)
        torch.save(dev_dataset, dev_file)
        is_preprocessing_data = True

    # test
    test_file = os.path.join(args.data, 'sst_test.pth')
    if os.path.isfile(test_file):
        test_dataset = torch.load(test_file)
    else:
        test_dataset = DatasetClass(test_dir, vocab, args.num_classes,
                                    args.fine_grain, args.model_name)
        torch.save(test_dataset, test_file)
        is_preprocessing_data = True

    criterion = nn.NLLLoss()
    # initialize model, criterion/loss_function, optimizer
    if args.embedding == 'multi_channel':
        args.channel = 2
        embedding_model2 = nn.Embedding(vocab.size(), args.input_dim)
    else:
        args.channel = 1

    if args.model_name == 'dependency' or args.model_name == 'constituency':
        model = TreeLSTMSentiment(args.cuda, args.channel, args.input_dim,
                                  args.mem_dim, args.num_classes,
                                  args.model_name, criterion)
    elif args.model_name == 'lstm' or args.model_name == 'bilstm':
        model = LSTMSentiment(args.cuda,
                              args.channel,
                              args.input_dim,
                              args.mem_dim,
                              args.num_classes,
                              args.model_name,
                              criterion,
                              pooling=args.pooling)

    embedding_model = nn.Embedding(vocab.size(), args.input_dim)

    if args.cuda:
        embedding_model = embedding_model.cuda()
        if args.channel == 2:
            embedding_model2 = embedding_model2.cuda()

    if args.cuda:
        model.cuda(), criterion.cuda()

    # for words common to dataset vocab and GLOVE, use GLOVE vectors
    # for other words in dataset vocab, use random normal vectors
    emb_split_token = ' '
    if args.embedding == 'glove':
        emb_torch = 'sst_embed.pth'
        emb_vector = 'glove.840B.300d'
        emb_vector_path = os.path.join(args.glove, emb_vector)
        # assert os.path.isfile(emb_vector_path+'.txt')
    elif args.embedding == 'paragram':
        emb_torch = 'sst_embed_paragram.pth'
        emb_vector = 'paragram_300_sl999'
        emb_vector_path = os.path.join(args.paragram, emb_vector)
        assert os.path.isfile(emb_vector_path + '.txt')
    elif args.embedding == 'paragram_xxl':
        emb_torch = 'sst_embed_paragram_xxl.pth'
        emb_vector = 'paragram-phrase-XXL'
        emb_vector_path = os.path.join(args.paragram, emb_vector)
        assert os.path.isfile(emb_vector_path + '.txt')
    elif args.embedding == 'other':
        emb_torch = 'other.pth'
        emb_vector = args.embedding_other
        emb_vector_path = emb_vector
        emb_split_token = '\t'
        assert os.path.isfile(emb_vector_path + '.txt')
    elif args.embedding == 'multi_channel':
        emb_torch = 'sst_embed1.pth'
        emb_torch2 = 'sst_embed2.pth'
        emb_vector_path = args.embedding_other
        emb_vector_path2 = args.embedding_othert
        assert os.path.isfile(emb_vector_path + '.txt')
        assert os.path.isfile(emb_vector_path2 + '.txt'), emb_vector_path2
    else:
        assert False

    emb_file = os.path.join(args.data, emb_torch)
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
        print('load %s' % (emb_file))
    else:
        # load glove embeddings and vocab
        glove_vocab, glove_emb = load_word_vectors(emb_vector_path,
                                                   emb_split_token)
        print('==> Embedding vocabulary size: %d ' % glove_vocab.size())

        emb = torch.zeros(vocab.size(), glove_emb.size(1))

        for word in vocab.labelToIdx.keys():
            if glove_vocab.getIndex(word):
                emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(
                    word)]
            else:
                emb[vocab.getIndex(word)] = torch.Tensor(
                    emb[vocab.getIndex(word)].size()).normal_(-0.05, 0.05)
        # torch.save(emb, emb_file)
        glove_emb = None
        glove_vocab = None
        gc.collect()
        # add pretrain embedding
        # pretrain embedding would overwrite exist embedding from glove
        embed1_txt = os.path.join(args.state_dir, 'embed1')
        if os.path.isfile(embed1_txt + '.txt'):
            print('load %s' % (embed1_txt))
            glove_vocab, glove_emb = load_word_vectors(embed1_txt,
                                                       emb_split_token)
            print('==> embed1 vocabulary size: %d ' % glove_vocab.size())
            for word in vocab.labelToIdx.keys():
                if glove_vocab.getIndex(word):
                    emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(
                        word)]
                else:
                    emb[vocab.getIndex(word)] = torch.Tensor(
                        emb[vocab.getIndex(word)].size()).normal_(-0.05, 0.05)
        torch.save(emb, emb_file)  # saved word embedding matrix

        is_preprocessing_data = True  # flag to quit
        print('done creating emb, quit')

    if args.embedding == 'multi_channel':
        emb_file2 = os.path.join(args.data, emb_torch2)
        if os.path.isfile(emb_file2):
            emb2 = torch.load(emb_file2)
            print('load %s' % (emb_file2))
        else:
            # load glove embeddings and vocab
            glove_vocab, glove_emb = load_word_vectors(emb_vector_path2,
                                                       emb_split_token)
            print('==> Embedding vocabulary size: %d ' % glove_vocab.size())

            emb2 = torch.zeros(vocab.size(), glove_emb.size(1))

            for word in vocab.labelToIdx.keys():
                if glove_vocab.getIndex(word):
                    emb2[vocab.getIndex(word)] = glove_emb[
                        glove_vocab.getIndex(word)]
                else:
                    emb2[vocab.getIndex(word)] = torch.Tensor(
                        emb2[vocab.getIndex(word)].size()).normal_(
                            -0.05, 0.05)

            embed2_txt = os.path.join(args.state_dir, 'embed2')
            if os.path.isfile(embed2_txt + '.txt'):
                print('load %s' % (embed2_txt))
                glove_vocab, glove_emb = load_word_vectors(
                    embed2_txt, emb_split_token)
                print('==> embed2 vocabulary size: %d ' % glove_vocab.size())
                for word in vocab.labelToIdx.keys():
                    if glove_vocab.getIndex(word):
                        emb2[vocab.getIndex(word)] = glove_emb[
                            glove_vocab.getIndex(word)]
                    else:
                        emb2[vocab.getIndex(word)] = torch.Tensor(
                            emb2[vocab.getIndex(word)].size()).normal_(
                                -0.05, 0.05)
            torch.save(emb2, emb_file2)
            glove_emb = None
            glove_vocab = None
            gc.collect()
            is_preprocessing_data = True  # flag to quit
            print('done creating emb, quit')

    if is_preprocessing_data:
        print('quit program')
        quit()

    # plug these into embedding matrix inside model
    if args.cuda:
        emb = emb.cuda()
        if args.channel == 2:
            emb2 = emb2.cuda()
    embedding_model.state_dict()['weight'].copy_(emb)
    if args.channel == 2:
        embedding_model2.state_dict()['weight'].copy_(emb2)

    # load cnn, lstm state_dict here
    if args.state_dir != 'meow':  #TODO: here
        model.load_state_files(args.state_dir)

    if args.optim == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=args.wd)
    elif args.optim == 'adagrad':
        # optimizer   = optim.Adagrad(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd)
        optimizer = optim.Adagrad(model.parameters(),
                                  lr=args.lr,
                                  weight_decay=args.wd)
    elif args.optim == 'adadelta':
        optimizer = optim.Adadelta(model.parameters(),
                                   lr=args.lr,
                                   weight_decay=args.wd)
    elif args.optim == 'adam_combine':
        optimizer = optim.Adam([{
            'params': model.parameters(),
            'lr': args.lr,
            'weight_decay': args.wd
        }, {
            'params': embedding_model.parameters(),
            'lr': args.emblr,
            'weight_decay': args.embwd
        }])
        args.manually_emb = 0
    elif args.optim == 'adagrad_combine':
        optimizer = optim.Adagrad([{
            'params': model.parameters(),
            'lr': args.lr,
            'weight_decay': args.wd
        }, {
            'params': embedding_model.parameters(),
            'lr': args.emblr,
            'weight_decay': args.embwd
        }])
        args.manually_emb = 0
    elif args.optim == 'adam_combine_v2':
        model.embedding_model = embedding_model
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=args.wd)
        args.manually_emb = 0
    metrics = Metrics(args.num_classes)
    utils.count_param(model)

    # create trainer object for training and testing
    # if args.model_name == 'dependency' or args.model_name == 'constituency':
    #     trainer = SentimentTrainer(args, model, embedding_model, criterion, optimizer)
    # elif args.model_name == 'lstm' or args.model_name == 'bilstm':
    #     trainer = SentimentTrainer(args, model, embedding_model, criterion, optimizer)

    if args.channel == 1:
        # trainer = MultiChannelSentimentTrainer(args, model, [embedding_model], criterion, optimizer)
        trainer = SentimentTrainer(args, model, embedding_model, criterion,
                                   optimizer)
    else:
        trainer = MultiChannelSentimentTrainer(
            args, model, [embedding_model, embedding_model2], criterion,
            optimizer)

    trainer.set_initial_emb(emb)

    # trainer = SentimentTrainer(args, model, embedding_model ,criterion, optimizer)

    test_idx_dir = os.path.join(args.data, args.test_idx)
    test_idx = None
    if os.path.isfile(test_idx_dir):
        print('load test idx %s' % (args.test_idx))
        test_idx = np.load(test_idx_dir)

    mode = args.mode
    dev_loss, dev_pred, _ = trainer.test(
        dev_dataset)  # make sure thing go smooth before train
    dev_acc = metrics.sentiment_accuracy_score(dev_pred, dev_dataset.labels)
    print('==> Dev loss   : %f \t' % dev_loss, end="")
    print('before run dev percentage ', dev_acc)
    if mode == 'DEBUG':
        for epoch in range(args.epochs):
            # print a tree
            tree, sent, label = dev_dataset[3]
            utils.print_span(tree, sent, vocab)
            quit()

            dev_loss = trainer.train(dev_dataset)
            dev_loss, dev_pred, _ = trainer.test(dev_dataset)
            test_loss, test_pred, _ = trainer.test(test_dataset)

            dev_acc = metrics.sentiment_accuracy_score(dev_pred,
                                                       dev_dataset.labels)
            # test_acc = metrics.sentiment_accuracy_score(test_pred, test_dataset.labels)
            print('==> Dev loss   : %f \t' % dev_loss, end="")
            print('Epoch ', epoch, 'dev percentage ', dev_acc)
    elif mode == "PRINT_TREE":
        file_path = os.path.join('print_tree', args.name + '.npy')
        print_list = np.load(file_path)
        utils.print_trees_file_v2(args,
                                  vocab,
                                  test_dataset,
                                  print_list,
                                  name='tree')
        print('break')
        quit()
    elif mode == 'EVALUATE':
        print('EVALUATION')
        print('--Model information--')
        print(model)
        filename = args.name + '.pth'
        model = torch.load(os.path.join(args.saved, '_model_' + filename))
        embedding_model = torch.load(
            os.path.join(args.saved, '_embedding_' + filename))
        if args.channel == 1:
            trainer = SentimentTrainer(args, model, embedding_model, criterion,
                                       optimizer)
        elif args.channel == 2:
            embedding_model2 = torch.load(
                os.path.join(args.saved, '_embedding2_' + filename))
            trainer = MultiChannelSentimentTrainer(
                args, model, [embedding_model, embedding_model2], criterion,
                optimizer)

        test_loss, test_pred, subtree_metrics = trainer.test(test_dataset)
        test_acc = metrics.sentiment_accuracy_score(
            test_pred, test_dataset.labels, num_classes=args.num_classes)
        print(' |test percentage ' + str(test_acc))
        result_filename = os.path.join(args.logs, args.name) + 'result.txt'
        rwriter = open(result_filename, 'w')
        for i in range(test_pred.size()[0]):
            rwriter.write(
                str(int(test_pred[i])) + ' ' +
                str(int(test_dataset.labels[i])) + '\n')
        rwriter.close()
        result_link = log_util.up_gist(
            result_filename,
            args.name,
            __file__,
            client_id='ec3ce6baf7dad6b7cf2c',
            client_secret='82240b38a7e662c28b2ca682325d634c9059efb0')
        print(result_link)

        print_list = subtree_metrics.print_list
        utils.print_trees_file_all(args,
                                   vocab,
                                   test_dataset,
                                   print_list,
                                   name='Tree')
        print('____________________' + str(args.name) + '___________________')
    elif mode == "EXPERIMENT":
        print('--Model information--')
        print(model)
        # dev_loss, dev_pred = trainer.test(dev_dataset)
        # dev_acc = metrics.sentiment_accuracy_score(dev_pred, dev_dataset.labels, num_classes=args.num_classes)
        max_dev = 0
        max_dev_epoch = 0
        filename = args.name + '.pth'
        for epoch in range(args.epochs):
            train_loss_while_training = trainer.train(train_dataset)
            if epoch % 5 == 0:  # evaluating on the full train set is slow, so only do it every 5 epochs
                train_loss, train_pred, _ = trainer.test(train_dataset)
                train_acc = metrics.sentiment_accuracy_score(
                    train_pred,
                    train_dataset.labels,
                    num_classes=args.num_classes)
                print('Train acc %f ' % (train_acc))
            dev_loss, dev_pred, _ = trainer.test(dev_dataset)
            dev_acc = metrics.sentiment_accuracy_score(
                dev_pred, dev_dataset.labels, num_classes=args.num_classes)
            print('==> Train loss   : %f \t' % train_loss_while_training,
                  end="")
            print('Epoch ', epoch, 'dev percentage ', dev_acc)
            print('Epoch %d dev percentage %f ' % (epoch, dev_acc))

            if dev_acc > max_dev:
                print('update best dev acc %f ' % (dev_acc))
                max_dev = dev_acc
                max_dev_epoch = epoch
                utils.mkdir_p(args.saved)
                torch.save(model, os.path.join(args.saved,
                                               '_model_' + filename))
                torch.save(embedding_model,
                           os.path.join(args.saved, '_embedding_' + filename))
                if args.channel == 2:
                    torch.save(
                        embedding_model2,
                        os.path.join(args.saved, '_embedding2_' + filename))
            gc.collect()
        print('epoch ' + str(max_dev_epoch) + ' dev score of ' + str(max_dev))
        print('eva on test set ')

        model = torch.load(os.path.join(args.saved, '_model_' + filename))
        embedding_model = torch.load(
            os.path.join(args.saved, '_embedding_' + filename))
        if args.channel == 1:
            trainer = SentimentTrainer(args, model, embedding_model, criterion,
                                       optimizer)
        elif args.channel == 2:
            embedding_model2 = torch.load(
                os.path.join(args.saved, '_embedding2_' + filename))
            trainer = MultiChannelSentimentTrainer(
                args, model, [embedding_model, embedding_model2], criterion,
                optimizer)

        test_loss, test_pred, subtree_metrics = trainer.test(test_dataset)
        test_acc = metrics.sentiment_accuracy_score(
            test_pred, test_dataset.labels, num_classes=args.num_classes)
        print('Epoch with max dev:' + str(max_dev_epoch) +
              ' |test percentage ' + str(test_acc))
        print_list = subtree_metrics.print_list
        torch.save(print_list,
                   os.path.join(args.saved, args.name + 'printlist.pth'))
        utils.print_trees_file(args,
                               vocab,
                               test_dataset,
                               print_list,
                               name='tree')
        print('____________________' + str(args.name) + '___________________')
    else:
        for epoch in range(args.epochs):
            train_loss = trainer.train(train_dataset)
            train_loss, train_pred, _ = trainer.test(train_dataset)
            dev_loss, dev_pred, _ = trainer.test(dev_dataset)
            test_loss, test_pred, subtree_metrics = trainer.test(test_dataset)

            train_acc = metrics.sentiment_accuracy_score(
                train_pred, train_dataset.labels)
            dev_acc = metrics.sentiment_accuracy_score(dev_pred,
                                                       dev_dataset.labels)
            test_acc = metrics.sentiment_accuracy_score(
                test_pred, test_dataset.labels)
            print('==> Train loss   : %f \t' % train_loss, end="")
            print('Epoch ', epoch, 'train percentage ', train_acc)
            print('Epoch ', epoch, 'dev percentage ', dev_acc)
            print('Epoch ', epoch, 'test percentage ', test_acc)
            print_list = subtree_metrics.print_list
            torch.save(print_list,
                       os.path.join(args.saved, args.name + 'printlist.pth'))
            utils.print_trees_file(args,
                                   vocab,
                                   test_dataset,
                                   print_list,
                                   name='tree')
Example #8
def main():
    global args
    args = parse_args()
    args.input_dim, args.mem_dim = 300, 150
    args.hidden_dim, args.num_classes = 20, 2
    args.cuda = args.cuda and torch.cuda.is_available()
    if args.sparse and args.wd != 0:
        print('Sparsity and weight decay are incompatible, pick one!')
        exit()
    print(args)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    numpy.random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.benchmark = True
    if not os.path.exists(args.save):
        os.makedirs(args.save)

    train_dir = os.path.join(args.data, 'train/')
    dev_dir = os.path.join(args.data, 'dev/')
    test_dir = os.path.join(args.data, 'test/')

    # write unique words from all token files
    sick_vocab_file = os.path.join(args.data, 'sick.vocab')
    if not os.path.isfile(sick_vocab_file):
        token_files_a = [
            os.path.join(split, 'toks.a')
            for split in [train_dir, dev_dir, test_dir]
        ]
        token_files_b = [
            os.path.join(split, 'toks.b')
            for split in [train_dir, dev_dir, test_dir]
        ]
        token_files = token_files_a + token_files_b
        sick_vocab_file = os.path.join(args.data, 'sick.vocab')
        build_vocab(token_files, sick_vocab_file)

    # get vocab object from vocab file previously written
    vocab = Vocab(filename=sick_vocab_file,
                  data=[
                      Constants.PAD_WORD, Constants.UNK_WORD,
                      Constants.BOS_WORD, Constants.EOS_WORD
                  ])
    print('==> SICK vocabulary size : %d ' % vocab.size())

    # load SICK dataset splits
    train_file = os.path.join(args.data, 'sick_train.pth')
    if os.path.isfile(train_file):
        train_dataset = torch.load(train_file)
    else:
        train_dataset = SICKDataset(train_dir, vocab, args.num_classes)
        torch.save(train_dataset, train_file)
    print('==> Size of train data   : %d ' % len(train_dataset))
    dev_file = os.path.join(args.data, 'sick_dev.pth')
    if os.path.isfile(dev_file):
        dev_dataset = torch.load(dev_file)
    else:
        dev_dataset = SICKDataset(dev_dir, vocab, args.num_classes)
        torch.save(dev_dataset, dev_file)
    print('==> Size of dev data     : %d ' % len(dev_dataset))
    test_file = os.path.join(args.data, 'sick_test.pth')
    if os.path.isfile(test_file):
        test_dataset = torch.load(test_file)
    else:
        test_dataset = SICKDataset(test_dir, vocab, args.num_classes)
        torch.save(test_dataset, test_file)
    print('==> Size of test data    : %d ' % len(test_dataset))

    # initialize model, criterion/loss_function, optimizer
    model = SimilarityTreeLSTM(args.cuda, vocab.size(), args.input_dim,
                               args.mem_dim, args.hidden_dim, args.num_classes,
                               args.sparse)
    criterion = nn.KLDivLoss()
    if args.cuda:
        model.cuda(), criterion.cuda()
    if args.optim == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=args.wd)
    elif args.optim == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(),
                                  lr=args.lr,
                                  weight_decay=args.wd)
    elif args.optim == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=args.lr,
                              weight_decay=args.wd)
    metrics = Metrics(args.num_classes)

    # for words common to dataset vocab and GLOVE, use GLOVE vectors
    # for other words in dataset vocab, use random normal vectors
    emb_file = os.path.join(args.data, 'sick_embed.pth')
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
    else:
        # load glove embeddings and vocab
        glove_vocab, glove_emb = load_word_vectors(
            os.path.join(args.glove, 'glove.840B.300d'))
        print('==> GLOVE vocabulary size: %d ' % glove_vocab.size())
        emb = torch.Tensor(vocab.size(),
                           glove_emb.size(1)).normal_(-0.05, 0.05)
        # zero out the embeddings for padding and other special words if they are absent in vocab
        for idx, item in enumerate([
                Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD,
                Constants.EOS_WORD
        ]):
            emb[idx].zero_()
        for word in vocab.labelToIdx.keys():
            word_new = word.decode("utf8")
            idx_set = [
                glove_vocab.getIndex(token)
                for token in word_tokenize(word_new)
            ]
            idx_set = [id for id in idx_set if id is not None]

            if len(idx_set) != 0:
                idx_set = torch.LongTensor(idx_set)
                sum_emb = torch.sum(glove_emb.index_select(0, idx_set), 0)
            else:
                sum_emb = glove_emb[1] * 0


            # for token in word_tokenize(word_new):
            #     idx = glove_vocab.getIndex(token)
            #     if idx is not None:
            #         if sum_emb is None:
            #             sum_emb = glove_emb[idx]
            #         else:
            #             sum_emb += glove_emb[idx]

            emb[vocab.getIndex(word)] = sum_emb
        torch.save(emb, emb_file)
    # plug these into embedding matrix inside model
    if args.cuda:
        emb = emb.cuda()
    model.childsumtreelstm.emb.state_dict()['weight'].copy_(emb)

    # create trainer object for training and testing
    trainer = Trainer(args, model, criterion, optimizer)

    best = -float('inf')
    for epoch in range(args.epochs):
        train_loss = trainer.train(train_dataset)
        train_loss, train_pred = trainer.test(train_dataset)
        print(train_pred)
        dev_loss, dev_pred = trainer.test(dev_dataset)
        print(dev_pred)
        test_loss, test_pred = trainer.test(test_dataset)

        train_pearson = metrics.pearson(train_pred, train_dataset.labels)
        train_mse = metrics.accuracy(train_pred, train_dataset.labels)
        print('==> Train    Loss: {}\tPearson: {}\tL1: {}'.format(
            train_loss, train_pearson, train_mse))
        dev_pearson = metrics.pearson(dev_pred, dev_dataset.labels)
        dev_mse = metrics.accuracy(dev_pred, dev_dataset.labels)
        print('==> Dev      Loss: {}\tPearson: {}\tL1: {}'.format(
            dev_loss, dev_pearson, dev_mse))
        test_pearson = metrics.pearson(test_pred, test_dataset.labels)
        test_mse = metrics.accuracy(test_pred, test_dataset.labels)
        print('==> Test     Loss: {}\tPearson: {}\tL1: {}'.format(
            test_loss, test_pearson, test_mse))

        if best < test_pearson:
            best = test_pearson
            checkpoint = {
                'model': trainer.model.state_dict(),
                'optim': trainer.optimizer,
                'pearson': test_pearson,
                'mse': test_mse,
                'args': args,
                'epoch': epoch
            }
            print('==> New optimum found, checkpointing everything now...')
            torch.save(
                checkpoint,
                '%s.pt' % os.path.join(args.save, args.expname + '.pth'))
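Example #8 initialises each vocabulary entry by summing the GloVe vectors of its tokens, falling back to a zero vector when none are found. Below is an isolated toy illustration of that step; the two-dimensional vocabulary and vectors are made up for the example (the real code uses word_tokenize and 300-d GloVe vectors).

import torch

# made-up toy data for illustration only
glove_index = {'new': 0, 'york': 1}
glove_emb = torch.tensor([[1.0, 0.0],
                          [0.0, 2.0]])

phrase = 'new york'
idx_set = [glove_index.get(tok) for tok in phrase.split()]
idx_set = [i for i in idx_set if i is not None]
if idx_set:
    sum_emb = glove_emb.index_select(0, torch.tensor(idx_set)).sum(0)
else:
    sum_emb = torch.zeros(glove_emb.size(1))
print(sum_emb)  # tensor([1., 2.])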
Example #9
def main():
    global args
    args = parse_args()
    # global logger
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        "[%(asctime)s] %(levelname)s:%(name)s:%(message)s")
    # file logger
    fh = logging.FileHandler(os.path.join(args.save, args.expname) + '.log',
                             mode='w')
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    # console logger
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    # argument validation
    args.cuda = args.cuda and torch.cuda.is_available()
    if args.sparse and args.wd != 0:
        logger.error('Sparsity and weight decay are incompatible, pick one!')
        exit()
    logger.debug(args)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.benchmark = True
    if not os.path.exists(args.save):
        os.makedirs(args.save)

    train_dir = os.path.join(args.data, 'train/')
    dev_dir = os.path.join(args.data, 'dev/')
    test_dir = os.path.join(args.data, 'test/')

    # write unique words from all token files
    sick_vocab_file = os.path.join(args.data, 'sick.vocab')
    if not os.path.isfile(sick_vocab_file):
        token_files_a = [
            os.path.join(split, 'a.toks')
            for split in [train_dir, dev_dir, test_dir]
        ]
        token_files_b = [
            os.path.join(split, 'b.toks')
            for split in [train_dir, dev_dir, test_dir]
        ]
        token_files = token_files_a + token_files_b
        sick_vocab_file = os.path.join(args.data, 'sick.vocab')
        build_vocab(token_files, sick_vocab_file)

    # get vocab object from vocab file previously written
    vocab = Vocab(filename=sick_vocab_file,
                  data=[
                      Constants.PAD_WORD, Constants.UNK_WORD,
                      Constants.BOS_WORD, Constants.EOS_WORD
                  ])
    logger.debug('==> SICK vocabulary size : %d ' % vocab.size())

    # load SICK dataset splits
    train_file = os.path.join(args.data, 'sick_train.pth')
    if os.path.isfile(train_file):
        train_dataset = torch.load(train_file)
    else:
        train_dataset = SICKDataset(train_dir, vocab, args.num_classes)
        torch.save(train_dataset, train_file)
    logger.debug('==> Size of train data   : %d ' % len(train_dataset))
    dev_file = os.path.join(args.data, 'sick_dev.pth')
    if os.path.isfile(dev_file):
        dev_dataset = torch.load(dev_file)
    else:
        dev_dataset = SICKDataset(dev_dir, vocab, args.num_classes)
        torch.save(dev_dataset, dev_file)
    logger.debug('==> Size of dev data     : %d ' % len(dev_dataset))
    test_file = os.path.join(args.data, 'sick_test.pth')
    if os.path.isfile(test_file):
        test_dataset = torch.load(test_file)
    else:
        test_dataset = SICKDataset(test_dir, vocab, args.num_classes)
        torch.save(test_dataset, test_file)
    logger.debug('==> Size of test data    : %d ' % len(test_dataset))

    # initialize model, criterion/loss_function, optimizer
    model = SimilarityTreeLSTM(args.cuda, vocab.size(), args.input_dim,
                               args.mem_dim, args.hidden_dim, args.num_classes,
                               args.sparse)
    criterion = nn.KLDivLoss()
    if args.cuda:
        model.cuda(), criterion.cuda()
    if args.optim == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=args.wd)
    elif args.optim == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(),
                                  lr=args.lr,
                                  weight_decay=args.wd)
    elif args.optim == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=args.lr,
                              weight_decay=args.wd)
    metrics = Metrics(args.num_classes)

    # for words common to dataset vocab and GLOVE, use GLOVE vectors
    # for other words in dataset vocab, use random normal vectors
    emb_file = os.path.join(args.data, 'sick_embed.pth')
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
    else:
        # load glove embeddings and vocab
        glove_vocab, glove_emb = load_word_vectors(
            os.path.join(args.glove, 'glove.840B.300d'))
        logger.debug('==> GLOVE vocabulary size: %d ' % glove_vocab.size())
        emb = torch.Tensor(vocab.size(),
                           glove_emb.size(1)).normal_(-0.05, 0.05)
        # zero out the embeddings for padding and other special words if they are absent in vocab
        for idx, item in enumerate([
                Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD,
                Constants.EOS_WORD
        ]):
            emb[idx].zero_()
        for word in vocab.labelToIdx.keys():
            if glove_vocab.getIndex(word):
                emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(
                    word)]
        torch.save(emb, emb_file)
    # plug these into embedding matrix inside model
    if args.cuda:
        emb = emb.cuda()
    model.childsumtreelstm.emb.state_dict()['weight'].copy_(emb)

    # create trainer object for training and testing
    trainer = Trainer(args, model, criterion, optimizer)

    best = -float('inf')
    for epoch in range(args.epochs):
        train_loss = trainer.train(train_dataset)
        train_loss, train_pred = trainer.test(train_dataset)
        dev_loss, dev_pred = trainer.test(dev_dataset)
        test_loss, test_pred = trainer.test(test_dataset)

        train_pearson = metrics.pearson(train_pred, train_dataset.labels)
        train_mse = metrics.mse(train_pred, train_dataset.labels)
        logger.info(
            '==> Epoch {}, Train \tLoss: {}\tPearson: {}\tMSE: {}'.format(
                epoch, train_loss, train_pearson, train_mse))
        dev_pearson = metrics.pearson(dev_pred, dev_dataset.labels)
        dev_mse = metrics.mse(dev_pred, dev_dataset.labels)
        logger.info(
            '==> Epoch {}, Dev \tLoss: {}\tPearson: {}\tMSE: {}'.format(
                epoch, dev_loss, dev_pearson, dev_mse))
        test_pearson = metrics.pearson(test_pred, test_dataset.labels)
        test_mse = metrics.mse(test_pred, test_dataset.labels)
        logger.info(
            '==> Epoch {}, Test \tLoss: {}\tPearson: {}\tMSE: {}'.format(
                epoch, test_loss, test_pearson, test_mse))

        if best < test_pearson:
            best = test_pearson
            checkpoint = {
                'model': trainer.model.state_dict(),
                'optim': trainer.optimizer,
                'pearson': test_pearson,
                'mse': test_mse,
                'args': args,
                'epoch': epoch
            }
            logger.debug(
                '==> New optimum found, checkpointing everything now...')
            torch.save(checkpoint,
                       '%s.pt' % os.path.join(args.save, args.expname))
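Examples #8 through #10 score predictions with a Metrics object whose implementation is not shown. A minimal sketch of the pearson() and mse() methods it is assumed to provide follows; the real treelstm helper may differ.

import torch


class Metrics:
    # hypothetical helper matching the pearson()/mse() calls above
    def __init__(self, num_classes):
        self.num_classes = num_classes

    def pearson(self, predictions, labels):
        x = predictions - predictions.mean()
        y = labels - labels.mean()
        return (x * y).sum() / (x.norm(2) * y.norm(2))

    def mse(self, predictions, labels):
        return torch.mean((predictions - labels) ** 2)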
Example #10
def main():
    args = parse_args()
    print(args)
    args.cuda = args.cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)
    random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.benchmark = True
    if not os.path.exists(args.save):
        os.makedirs(args.save)

    train_dir = os.path.join(args.data, 'train/')
    dev_dir = os.path.join(args.data, 'dev/')
    test_dir = os.path.join(args.data, 'test/')

    # write unique words from all token files
    sick_vocab_file = os.path.join(args.data, 'sick.vocab')
    if not os.path.isfile(sick_vocab_file):
        token_files_a = [
            os.path.join(split, 'a.toks')
            for split in [train_dir, dev_dir, test_dir]
        ]
        token_files_b = [
            os.path.join(split, 'b.toks')
            for split in [train_dir, dev_dir, test_dir]
        ]
        token_files = token_files_a + token_files_b
        sick_vocab_file = os.path.join(args.data, 'sick.vocab')
        build_vocab(token_files, sick_vocab_file)

    # get vocab object from vocab file previously written
    vocab = Vocab(filename=sick_vocab_file,
                  data=[
                      Constants.PAD_WORD, Constants.UNK_WORD,
                      Constants.BOS_WORD, Constants.EOS_WORD
                  ])

    # load SICK dataset splits
    train_file = os.path.join(args.data, 'sick_train.pth')
    if os.path.isfile(train_file):
        train_dataset = torch.load(train_file)
    else:
        train_dataset = SICKDataset(train_dir, vocab, args.num_classes)
        torch.save(train_dataset, train_file)

    dev_file = os.path.join(args.data, 'sick_dev.pth')
    if os.path.isfile(dev_file):
        dev_dataset = torch.load(dev_file)
    else:
        dev_dataset = SICKDataset(dev_dir, vocab, args.num_classes)
        torch.save(dev_dataset, dev_file)

    test_file = os.path.join(args.data, 'sick_test.pth')
    if os.path.isfile(test_file):
        test_dataset = torch.load(test_file)
    else:
        test_dataset = SICKDataset(test_dir, vocab, args.num_classes)
        torch.save(test_dataset, test_file)

    # initialize model, criterion/loss_function, optimizer
    model = SimilarityTreeLSTM(args.cuda,
                               vocab.size(),
                               args.input_dim,
                               args.mem_dim,
                               args.hidden_dim1,
                               args.hidden_dim2,
                               args.hidden_dim3,
                               args.num_classes,
                               args.sparse,
                               args.att_hops,
                               args.att_units,
                               args.maxlen,
                               args.dropout1,
                               args.dropout2,
                               args.dropout3,
                               freeze_emb=True)

    criterion = nn.KLDivLoss()

    if args.cuda:
        model.cuda(), criterion.cuda()
    if args.optim == 'adam':
        optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                      model.parameters()),
                               lr=args.lr,
                               weight_decay=args.wd)
    elif args.optim == 'adagrad':
        optimizer = optim.Adagrad(filter(lambda p: p.requires_grad,
                                         model.parameters()),
                                  lr=args.lr,
                                  weight_decay=args.wd)
    elif args.optim == 'sgd':
        optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                                     model.parameters()),
                              lr=args.lr,
                              weight_decay=args.wd)
    elif args.optim == 'adadelta':
        optimizer = optim.Adadelta(filter(lambda p: p.requires_grad,
                                          model.parameters()),
                                   lr=args.lr,
                                   weight_decay=args.wd)
    elif args.optim == 'asgd':
        optimizer = optim.ASGD(filter(lambda p: p.requires_grad,
                                      model.parameters()),
                               lr=args.lr,
                               weight_decay=args.wd)

    metrics = Metrics(args.num_classes)
    # for words common to dataset vocab and GLOVE, use GLOVE vectors
    # for other words in dataset vocab, use random normal vectors
    emb_file = os.path.join(args.data, 'sick_embed.pth')
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
    else:
        # load glove embeddings and vocab
        glove_vocab, glove_emb = load_word_vectors(
            os.path.join(args.glove, 'glove.840B.300d'))
        emb = torch.Tensor(vocab.size(),
                           glove_emb.size(1)).normal_(-0.05, 0.05)
        # zero out the embeddings for padding and other special words if they are absent in vocab
        for idx, item in enumerate([
                Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD,
                Constants.EOS_WORD
        ]):
            emb[idx].zero_()
        for word in vocab.labelToIdx.keys():
            if glove_vocab.getIndex(word):
                emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(
                    word)]
        torch.save(emb, emb_file)
    # plug these into embedding matrix inside model
    if args.cuda:
        emb = emb.cuda()

    model.emb.weight.data.copy_(emb)

    # create trainer object for training and testing
    trainer = Trainer(args, model, criterion, optimizer)

    best = -float('inf')

    def adjust_learning_rate(optimizer, epoch):
        """Sets the learning rate to the initial LR decayed by 5 every 3 epochs"""
        lr = args.lr * (0.01**(epoch // 15))
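        # same schedule as torch.optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.01)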
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

    for epoch in range(args.epochs):
        adjust_learning_rate(optimizer, epoch)

        train_loss = trainer.train(train_dataset)
        dev_loss, dev_pred = trainer.test(dev_dataset, mode='test')
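        # note: the dev split is used for model selection here; the test_* names below hold dev-set metrics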

        test_pearson = metrics.pearson(dev_pred, dev_dataset.labels)
        test_mse = metrics.mse(dev_pred, dev_dataset.labels)

        if best < test_pearson:
            best = test_pearson
            checkpoint = {
                'model': trainer.model.state_dict(),
                'optim': trainer.optimizer,
                'pearson': test_pearson,
                'mse': test_mse,
                'args': args,
                'epoch': epoch,
                'vocab': vocab
            }

            torch.save(
                checkpoint, '%s.pt' % os.path.join(
                    args.save, args.expname + '_' + str(test_pearson)))

    # Evaluate
    trainer.model.load_state_dict(checkpoint['model'])
    # trainer.train(train_dataset)
    test_loss, test_pred = trainer.test(test_dataset, mode='test')
    test_pearson = metrics.pearson(test_pred, test_dataset.labels)
    test_mse = metrics.mse(test_pred, test_dataset.labels)
    # Final read out
    checkpoint = {
        'model': trainer.model.state_dict(),
        'optim': trainer.optimizer,
        'pearson': test_pearson,
        'mse': test_mse,
        'args': args,
        'vocab': vocab
    }
    torch.save(
        checkpoint, '%s.pt' %
        os.path.join(args.save, 'end_model_test' + str(test_pearson)))
Example #11
def main():
    global args
    args = parse_args(type=1)
    args.input_dim, args.mem_dim = 300, 168
    if args.fine_grain:
        args.num_classes = 5  # 0 1 2 3 4
    else:
        args.num_classes = 3  # 0 1 2 (1 neutral)
    args.cuda = args.cuda and torch.cuda.is_available()
    print(args)
    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    train_dir = os.path.join(args.data, 'train/')
    dev_dir = os.path.join(args.data, 'dev/')
    test_dir = os.path.join(args.data, 'test/')

    # write unique words from all token files
    token_files = [
        os.path.join(split, 'sents.toks')
        for split in [train_dir, dev_dir, test_dir]
    ]
    vocab_file = os.path.join(args.data, 'vocab.txt')
    build_vocab(token_files, vocab_file)

    # get vocab object from vocab file previously written
    vocab = Vocab(filename=vocab_file,
                  data=[
                      Constants.PAD_WORD, Constants.UNK_WORD,
                      Constants.BOS_WORD, Constants.EOS_WORD
                  ])
    print('==> SST vocabulary size : %d ' % vocab.size())

    # Load SST dataset splits

    is_preprocessing_data = False  # let program turn off after preprocess data

    # train
    train_file = os.path.join(args.data, 'sst_train.pth')
    if os.path.isfile(train_file):
        train_dataset = torch.load(train_file)
    else:
        train_dataset = SSTDataset(train_dir, vocab, args.num_classes,
                                   args.fine_grain)
        torch.save(train_dataset, train_file)
        is_preprocessing_data = True

    # dev
    dev_file = os.path.join(args.data, 'sst_dev.pth')
    if os.path.isfile(dev_file):
        dev_dataset = torch.load(dev_file)
    else:
        dev_dataset = SSTDataset(dev_dir, vocab, args.num_classes,
                                 args.fine_grain)
        torch.save(dev_dataset, dev_file)
        is_preprocessing_data = True

    # test
    test_file = os.path.join(args.data, 'sst_test.pth')
    if os.path.isfile(test_file):
        test_dataset = torch.load(test_file)
    else:
        test_dataset = SSTDataset(test_dir, vocab, args.num_classes,
                                  args.fine_grain)
        torch.save(test_dataset, test_file)
        is_preprocessing_data = True

    # initialize model, criterion/loss_function, optimizer
    model = TreeLSTMSentiment(args.cuda, vocab.size(), args.input_dim,
                              args.mem_dim, args.num_classes)
    criterion = nn.CrossEntropyLoss()
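    # CrossEntropyLoss combines log_softmax and NLLLoss, so the model should output raw logits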
    if args.cuda:
        model.cuda(), criterion.cuda()
    if args.optim == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=args.wd)
    elif args.optim == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(),
                                  lr=args.lr,
                                  weight_decay=args.wd)
    metrics = Metrics(args.num_classes)

    utils.count_param(model)

    # for words common to dataset vocab and GLOVE, use GLOVE vectors
    # for other words in dataset vocab, use random normal vectors
    emb_file = os.path.join(args.data, 'sst_embed.pth')
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
    else:
        # load glove embeddings and vocab
        glove_vocab, glove_emb = load_word_vectors(
            os.path.join(args.glove, 'glove.840B.300d'))
        print('==> GLOVE vocabulary size: %d ' % glove_vocab.size())
        emb = torch.Tensor(vocab.size(),
                           glove_emb.size(1)).normal_(-0.05, 0.05)
        # zero out the embeddings for padding and other special words if they are absent in vocab
        for idx, item in enumerate([
                Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD,
                Constants.EOS_WORD
        ]):
            emb[idx].zero_()
        for word in vocab.labelToIdx.keys():
            if glove_vocab.getIndex(word):
                emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(
                    word)]
        torch.save(emb, emb_file)
        is_preprocessing_data = True  # flag to quit
        print('done creating emb, quit')

    if is_preprocessing_data:
        print(
            'quit program due to memory leak during preprocess data, please rerun sentiment.py'
        )
        quit()

    # plug these into embedding matrix inside model
    if args.cuda:
        emb = emb.cuda()
    model.childsumtreelstm.emb.state_dict()['weight'].copy_(emb)

    # create trainer object for training and testing
    trainer = SentimentTrainer(args, model, criterion, optimizer)

    for epoch in range(args.epochs):
        train_loss = trainer.train(train_dataset)
        # train_loss, train_pred = trainer.test(dev_dataset)
        dev_loss, dev_pred = trainer.test(dev_dataset)
        test_loss, test_pred = trainer.test(test_dataset)
        # TODO: torch.Tensor(dev_dataset.labels) turn label into tensor # done
        dev_acc = metrics.sentiment_accuracy_score(dev_pred,
                                                   dev_dataset.labels)
        test_acc = metrics.sentiment_accuracy_score(test_pred,
                                                    test_dataset.labels)
        print('==> Train loss   : %f \t' % train_loss, end="")
        print('Epoch ', epoch, 'dev percentage ', dev_acc)
        print('Epoch ', epoch, 'test percentage ', test_acc)
Example #12
def main():
    global args
    args = parse_args(type=1)
    print(args.name)
    print(args.model_name)

    args.input_dim = 300

    if args.mem_dim == 0:
        if args.model_name == 'dependency':
            args.mem_dim = 168
        elif args.model_name == 'constituency':
            args.mem_dim = 150
        elif args.model_name == 'lstm':
            args.mem_dim = 168
        elif args.model_name == 'bilstm':
            args.mem_dim = 168

    if args.num_classes == 0:
        if args.fine_grain:
            args.num_classes = 5  # 0 1 2 3 4
        else:
            args.num_classes = 3  # 0 1 2 (1 neutral)
    elif args.num_classes == 2:
        # assert False # this will not work
        assert not args.fine_grain

    args.cuda = args.cuda and torch.cuda.is_available()
    # args.cuda = False
    print(args)
    # torch.manual_seed(args.seed)
    # if args.cuda:
    # torch.cuda.manual_seed(args.seed)

    train_dir = os.path.join(args.data, 'train/')
    dev_dir = os.path.join(args.data, 'dev/')
    test_dir = os.path.join(args.data, 'test/')

    # write unique words from all token files
    token_files = [
        os.path.join(split, 'sents.toks')
        for split in [train_dir, dev_dir, test_dir]
    ]
    vocab_file = os.path.join(args.data, 'vocab-cased.txt')  # use vocab-cased
    # build_vocab(token_files, vocab_file) NO, DO NOT BUILD VOCAB,  USE OLD VOCAB

    # get vocab object from vocab file previously written
    vocab = Vocab(filename=vocab_file)
    print('==> SST vocabulary size : %d ' % vocab.size())

    # Load SST dataset splits

    is_preprocessing_data = False  # let program turn off after preprocess data

    # train
    train_file = os.path.join(args.data, 'sst_train.pth')
    if os.path.isfile(train_file):
        train_dataset = torch.load(train_file)
    else:
        train_dataset = SSTDataset(train_dir, vocab, args.num_classes,
                                   args.fine_grain, args.model_name)
        torch.save(train_dataset, train_file)
        is_preprocessing_data = True

    # dev
    dev_file = os.path.join(args.data, 'sst_dev.pth')
    if os.path.isfile(dev_file):
        dev_dataset = torch.load(dev_file)
    else:
        dev_dataset = SSTDataset(dev_dir, vocab, args.num_classes,
                                 args.fine_grain, args.model_name)
        torch.save(dev_dataset, dev_file)
        is_preprocessing_data = True

    # test
    test_file = os.path.join(args.data, 'sst_test.pth')
    if os.path.isfile(test_file):
        test_dataset = torch.load(test_file)
    else:
        test_dataset = SSTDataset(test_dir, vocab, args.num_classes,
                                  args.fine_grain, args.model_name)
        torch.save(test_dataset, test_file)
        is_preprocessing_data = True

    criterion = nn.NLLLoss()
    # initialize model, criterion/loss_function, optimizer

    model = DMNWraper(args.cuda, args.input_dim, args.mem_dim, criterion,
                      args.train_subtrees, args.num_classes, args.embdrop)

    embedding_model = nn.Embedding(vocab.size(), args.input_dim)

    if args.cuda:
        embedding_model = embedding_model.cuda()

    if args.cuda:
        model.cuda(), criterion.cuda()

    # for words common to dataset vocab and GLOVE, use GLOVE vectors
    # for other words in dataset vocab, use random normal vectors
    if args.embedding == 'glove':
        emb_torch = 'sst_embed.pth'
        emb_vector = 'glove.840B.300d'
        emb_vector_path = os.path.join(args.glove, emb_vector)
        assert os.path.isfile(emb_vector_path + '.txt')
    elif args.embedding == 'paragram':
        emb_torch = 'sst_embed_paragram.pth'
        emb_vector = 'paragram_300_sl999'
        emb_vector_path = os.path.join(args.paragram, emb_vector)
        assert os.path.isfile(emb_vector_path + '.txt')
    elif args.embedding == 'paragram_xxl':
        emb_torch = 'sst_embed_paragram_xxl.pth'
        emb_vector = 'paragram-phrase-XXL'
        emb_vector_path = os.path.join(args.paragram, emb_vector)
        assert os.path.isfile(emb_vector_path + '.txt')
    else:
        assert False

    emb_file = os.path.join(args.data, emb_torch)
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
    else:

        # load glove embeddings and vocab
        glove_vocab, glove_emb = load_word_vectors(emb_vector_path)
        print('==> Embedding vocabulary size: %d ' % glove_vocab.size())

        emb = torch.zeros(vocab.size(), glove_emb.size(1))

        for word in vocab.labelToIdx.keys():
            if glove_vocab.getIndex(word):
                emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(
                    word)]
            else:
                emb[vocab.getIndex(word)] = torch.Tensor(
                    emb[vocab.getIndex(word)].size()).normal_(-0.05, 0.05)
        torch.save(emb, emb_file)
        is_preprocessing_data = True  # flag to quit
        print('done creating emb, quit')

    if is_preprocessing_data:
        print('quit program')
        quit()

    # plug these into embedding matrix inside model
    if args.cuda:
        emb = emb.cuda()
    embedding_model.state_dict()['weight'].copy_(emb)

    if args.optim == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=args.wd)
    elif args.optim == 'adagrad':
        # optimizer   = optim.Adagrad(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd)
        optimizer = optim.Adagrad(model.parameters(),
                                  lr=args.lr,
                                  weight_decay=args.wd)
    elif args.optim == 'adam_combine':
        optimizer = optim.Adam([{
            'params': model.parameters(),
            'lr': args.lr,
            'weight_decay': args.wd
        }, {
            'params': embedding_model.parameters(),
            'lr': args.emblr,
            'weight_decay': args.embwd
        }])
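        # separate parameter groups give the embedding its own learning rate and weight decay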
        args.manually_emb = 0
    elif args.optim == 'adagrad_combine':
        optimizer = optim.Adagrad([{
            'params': model.parameters(),
            'lr': args.lr,
            'weight_decay': args.wd
        }, {
            'params': embedding_model.parameters(),
            'lr': args.emblr,
            'weight_decay': args.embwd
        }])
        args.manually_emb = 0
    elif args.optim == 'adam_combine_v2':
        model.embedding_model = embedding_model
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=args.wd)
        args.manually_emb = 0
    metrics = Metrics(args.num_classes)
    utils.count_param(model)

    trainer = SentimentTrainer(args, model, embedding_model, criterion,
                               optimizer)

    trainer.set_initial_emb(emb)
    question_idx = vocab.labelToIdx['sentiment']
    question_idx = torch.Tensor([question_idx])
    trainer.set_question(question_idx)

    # trainer = SentimentTrainer(args, model, embedding_model ,criterion, optimizer)

    mode = args.mode
    if mode == 'DEBUG':
        for epoch in range(args.epochs):
            # print a tree
            tree, sent, label = dev_dataset[3]
            utils.print_span(tree, sent, vocab)
            quit()

            dev_loss = trainer.train(dev_dataset)
            dev_loss, dev_pred, _ = trainer.test(dev_dataset)
            test_loss, test_pred, _ = trainer.test(test_dataset)

            dev_acc = metrics.sentiment_accuracy_score(dev_pred,
                                                       dev_dataset.labels)
            test_acc = metrics.sentiment_accuracy_score(
                test_pred, test_dataset.labels)
            print('==> Dev loss   : %f \t' % dev_loss, end="")
            print('Epoch ', epoch, 'dev percentage ', dev_acc)
    elif mode == "PRINT_TREE":
        for i in range(0, 10):
            ttree, tsent, tlabel = dev_dataset[i]
            utils.print_tree(ttree, 0)
            print('_______________')
        print('break')
        quit()
    elif mode == 'EVALUATE':
        filename = args.name + '.pth'
        epoch = args.epochs
        model_name = str(epoch) + '_model_' + filename
        embedding_name = str(epoch) + '_embedding_' + filename
        model = torch.load(os.path.join(args.saved, model_name))
        embedding_model = torch.load(os.path.join(args.saved, embedding_name))

        trainer = SentimentTrainer(args, model, embedding_model, criterion,
                                   optimizer)
        trainer.set_question(question_idx)
        test_loss, test_pred, subtree_metrics = trainer.test(test_dataset)
        test_acc = metrics.sentiment_accuracy_score(
            test_pred, test_dataset.labels, num_classes=args.num_classes)
        print('Epoch with max dev:' + str(epoch) + ' |test percentage ' +
              str(test_acc))
        print('____________________' + str(args.name) + '___________________')
        print_list = subtree_metrics.print_list
        torch.save(print_list,
                   os.path.join(args.saved, args.name + 'printlist.pth'))
        utils.print_trees_file(args,
                               vocab,
                               test_dataset,
                               print_list,
                               name='tree')
    elif mode == "EXPERIMENT":
        # dev_loss, dev_pred = trainer.test(dev_dataset)
        # dev_acc = metrics.sentiment_accuracy_score(dev_pred, dev_dataset.labels, num_classes=args.num_classes)
        max_dev = 0
        max_dev_epoch = 0
        filename = args.name + '.pth'
        for epoch in range(args.epochs):
            # train_loss, train_pred, _ = trainer.test(train_dataset)
            train_loss_while_training = trainer.train(train_dataset)
            train_loss, train_pred, _ = trainer.test(train_dataset)
            dev_loss, dev_pred, _ = trainer.test(dev_dataset)
            dev_acc = metrics.sentiment_accuracy_score(
                dev_pred, dev_dataset.labels, num_classes=args.num_classes)
            train_acc = metrics.sentiment_accuracy_score(
                train_pred, train_dataset.labels, num_classes=args.num_classes)
            print('==> Train loss   : %f \t' % train_loss_while_training,
                  end="")
            print('Epoch ', epoch, 'dev percentage ', dev_acc)
            print('Epoch %d dev percentage %f ' % (epoch, dev_acc))
            print('Train acc %f ' % (train_acc))
            if dev_acc > max_dev:
                print('update best dev acc %f ' % (dev_acc))
                max_dev = dev_acc
                max_dev_epoch = epoch
                utils.mkdir_p(args.saved)
                torch.save(
                    model,
                    os.path.join(args.saved,
                                 str(epoch) + '_model_' + filename))
                torch.save(
                    embedding_model,
                    os.path.join(args.saved,
                                 str(epoch) + '_embedding_' + filename))
            gc.collect()
        print('epoch ' + str(max_dev_epoch) + ' dev score of ' + str(max_dev))
        print('eva on test set ')
        model = torch.load(
            os.path.join(args.saved,
                         str(max_dev_epoch) + '_model_' + filename))
        embedding_model = torch.load(
            os.path.join(args.saved,
                         str(max_dev_epoch) + '_embedding_' + filename))
        trainer = SentimentTrainer(args, model, embedding_model, criterion,
                                   optimizer)
        trainer.set_question(question_idx)
        test_loss, test_pred, _ = trainer.test(test_dataset)
        test_acc = metrics.sentiment_accuracy_score(
            test_pred, test_dataset.labels, num_classes=args.num_classes)
        print('Epoch with max dev:' + str(max_dev_epoch) +
              ' |test percentage ' + str(test_acc))
        print('____________________' + str(args.name) + '___________________')
    else:
        for epoch in range(args.epochs):
            train_loss = trainer.train(train_dataset)
            train_loss, train_pred, _ = trainer.test(train_dataset)
            dev_loss, dev_pred, _ = trainer.test(dev_dataset)
            test_loss, test_pred, _ = trainer.test(test_dataset)

            train_acc = metrics.sentiment_accuracy_score(
                train_pred, train_dataset.labels)
            dev_acc = metrics.sentiment_accuracy_score(dev_pred,
                                                       dev_dataset.labels)
            test_acc = metrics.sentiment_accuracy_score(
                test_pred, test_dataset.labels)
            print('==> Train loss   : %f \t' % train_loss, end="")
            print('Epoch ', epoch, 'train percentage ', train_acc)
            print('Epoch ', epoch, 'dev percentage ', dev_acc)
            print('Epoch ', epoch, 'test percentage ', test_acc)
Example #13
def prepare_to_train(data=None, glove=None):
    args = parse_args()
    if data is not None:
        args.data = data
    if glove is not None:
        args.glove = glove

    args.input_dim, args.mem_dim = 300, 150
    args.hidden_dim, args.num_classes = 50, 5
    args.cuda = args.cuda and torch.cuda.is_available()
    if args.sparse and args.wd != 0:
        print('Sparsity and weight decay are incompatible, pick one!')
        exit()
    print(args)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    numpy.random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.benchmark = True
    if not os.path.exists(args.save):
        os.makedirs(args.save)

    train_dir = os.path.join(args.data, 'train/')
    dev_dir = os.path.join(args.data, 'dev/')
    test_dir = os.path.join(args.data, 'test/')

    # write unique words from all token files
    sick_vocab_file = os.path.join(args.data, 'sick.vocab')
    if not os.path.isfile(sick_vocab_file):
        token_files_a = [
            os.path.join(split, 'a.toks')
            for split in [train_dir, dev_dir, test_dir]
        ]
        token_files_b = [
            os.path.join(split, 'b.toks')
            for split in [train_dir, dev_dir, test_dir]
        ]
        token_files = token_files_a + token_files_b
        sick_vocab_file = os.path.join(args.data, 'sick.vocab')
        build_vocab(token_files, sick_vocab_file)

    # get vocab object from vocab file previously written
    vocab = Vocab(filename=sick_vocab_file,
                  data=[
                      Constants.PAD_WORD, Constants.UNK_WORD,
                      Constants.BOS_WORD, Constants.EOS_WORD
                  ])
    print('==> SICK vocabulary size : %d ' % vocab.size())

    # load SICK dataset splits
    train_file = os.path.join(args.data, 'sick_train.pth')
    if os.path.isfile(train_file):
        train_dataset = torch.load(train_file)
    else:
        train_dataset = SICKDataset(train_dir, vocab, args.num_classes)
        torch.save(train_dataset, train_file)
    print('==> Size of train data   : %d ' % len(train_dataset))
    dev_file = os.path.join(args.data, 'sick_dev.pth')
    if os.path.isfile(dev_file):
        dev_dataset = torch.load(dev_file)
    else:
        dev_dataset = SICKDataset(dev_dir, vocab, args.num_classes)
        torch.save(dev_dataset, dev_file)
    print('==> Size of dev data     : %d ' % len(dev_dataset))
    test_file = os.path.join(args.data, 'sick_test.pth')
    if os.path.isfile(test_file):
        test_dataset = torch.load(test_file)
    else:
        test_dataset = SICKDataset(test_dir, vocab, args.num_classes)
        torch.save(test_dataset, test_file)
    print('==> Size of test data    : %d ' % len(test_dataset))

    # initialize model, criterion/loss_function, optimizer
    model = SimilarityTreeLSTM(args.cuda, vocab.size(), args.input_dim,
                               args.mem_dim, args.hidden_dim, args.num_classes,
                               args.sparse)
    criterion = nn.KLDivLoss()
    if args.cuda:
        model.cuda(), criterion.cuda()
    if args.optim == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=args.wd)
    elif args.optim == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(),
                                  lr=args.lr,
                                  weight_decay=args.wd)
    elif args.optim == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=args.lr,
                              weight_decay=args.wd)
    metrics = Metrics(args.num_classes)

    # for words common to dataset vocab and GLOVE, use GLOVE vectors
    # for other words in dataset vocab, use random normal vectors
    emb_file = os.path.join(args.data, 'sick_embed.pth')
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
    else:
        # load glove embeddings and vocab
        glove_vocab, glove_emb = load_word_vectors(
            os.path.join(args.glove, 'glove.840B.300d'))
        print('==> GLOVE vocabulary size: %d ' % glove_vocab.size())
        emb = torch.Tensor(vocab.size(),
                           glove_emb.size(1)).normal_(-0.05, 0.05)
        # zero out the embeddings for padding and other special words if they are absent in vocab
        for idx, item in enumerate([
                Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD,
                Constants.EOS_WORD
        ]):
            emb[idx].zero_()
        for word in vocab.labelToIdx.keys():
            if glove_vocab.get_index(word):
                emb[vocab.get_index(word)] = glove_emb[glove_vocab.get_index(
                    word)]
        torch.save(emb, emb_file)
    # plug these into embedding matrix inside model
    if args.cuda:
        emb = emb.cuda()
    model.childsumtreelstm.emb.state_dict()['weight'].copy_(emb)

    # create trainer object for training and testing
    #trainer = Trainer(args, model, criterion, optimizer)

    best = -float('inf')

    return (args, best, train_dataset, dev_dataset, test_dataset, metrics,
            optimizer, criterion, model)
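
A minimal usage sketch (not part of the original example): the tuple returned by prepare_to_train can be unpacked and fed to a Trainer as in the other examples; the data/glove paths below are placeholders.

args, best, train_dataset, dev_dataset, test_dataset, metrics, \
    optimizer, criterion, model = prepare_to_train(data='data/sick/', glove='data/glove/')
trainer = Trainer(args, model, criterion, optimizer)
for epoch in range(args.epochs):
    train_loss = trainer.train(train_dataset)
    dev_loss, dev_pred = trainer.test(dev_dataset)
    print('epoch %d train loss %f dev pearson %f' %
          (epoch, train_loss, metrics.pearson(dev_pred, dev_dataset.labels)))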
Example #14
def main():
    global args 
    args = parse_args()

    # global logger
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    
    formatter = logging.Formatter("[%(asctime)s] %(levelname)s:%(name)s:%(message)s")
    # file logger
    fh = logging.FileHandler(os.path.join(args.save, args.expname)+'.log', mode='w')
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    # console logger
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    if not torch.cuda.is_available() and args.cuda:
        args.cuda = False
        logger.info("CUDA is unavailable, convert to cpu mode")

    if args.sparse and args.wd != 0:
        logger.error('Sparsity and weight decay are incompatible, pick one!')
        exit()

    logger.debug(args)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.benchmark = True
    if not os.path.exists(args.save):
        os.makedirs(args.save)

    # set directory
    train_dir = os.path.join(args.data, 'train/')
    dev_dir = os.path.join(args.data, 'dev/')
    test_dir = os.path.join(args.data, 'test/')

    # load vocabulary
    vocab_path = os.path.join(args.data, "vocab.npy")
    vocab = Vocab(
        filename=vocab_path, 
        labels=[constants.PAD_WORD, constants.UNK_WORD, constants.BOS_WORD, constants.EOS_WORD]
    )
    logger.debug('==> vocabulary size : %d ' % len(vocab))

    # load train dataset
    train_file = os.path.join(train_dir, "ERdata.pt")
    if os.path.isfile(train_file):
        train_dataset = torch.load(train_file)
    else:
        train_dataset = ERDataset(train_dir, vocab, 2)
        torch.save(train_dataset, train_file)
    logger.debug('==> train data size: %d' % len(train_dataset))

    # load dev dataset
    dev_file = os.path.join(dev_dir, "ERdata.pt")
    if os.path.isfile(dev_file):
        dev_dataset = torch.load(dev_file)
    else:
        dev_dataset = ERDataset(dev_dir, vocab, 2)
        torch.save(dev_dataset, dev_file)
    logger.debug('==> dev data size: %d' % len(dev_dataset))

    # load test dataset   
    test_file = os.path.join(test_dir, "ERdata.pt")
    if os.path.isfile(test_file):
        test_dataset = torch.load(test_file)
    else:
        test_dataset = ERDataset(test_dir, vocab, 2)
        torch.save(test_dataset, test_file)
    logger.debug('==> test data size: %d' % len(test_dataset))

    # trainer: 
    # tree model
    model = TreeModel(
        len(vocab),
        args.input_dim,
        args.mem_dim,
        2,  # 0-1 prediction
        args.sparse,
        args.freeze_embed
    )

    # criterion
    criterion = nn.KLDivLoss()
    if args.cuda:
        model.cuda(), criterion.cuda()

    # optimizer
    if args.optim == 'adam':
        optimizer = optim.Adam(
            filter(lambda p: p.requires_grad, model.parameters()), 
            lr=args.lr, weight_decay=args.wd
        )
    elif args.optim == 'adagrad':
        optimizer = optim.Adagrad(
            filter(lambda p: p.requires_grad, model.parameters()), 
            lr=args.lr, weight_decay=args.wd
        )
    elif args.optim == 'sgd':
        optimizer = optim.SGD(
            filter(lambda p: p.requires_grad, model.parameters()), 
            lr=args.lr, weight_decay=args.wd
        )
    else:
        raise Exception("Unknown optimizer")

    # metrics
    metrics = Metrics(2)  # 0-1 prediction

    # embeddings
    sent_emb_path = os.path.join(args.data, "sent_emb.pt")
    raw_sent_emb_path = os.path.join(args.glove, 'glove.840B.300d.txt')

    sent_emb = load_word_vectors(sent_emb_path, vocab, raw_sent_emb_path)
    
    logger.debug('==> sentence embedding size: %d * %d' % (sent_emb.size()[0], sent_emb.size()[1]))
    if args.cuda:
        sent_emb = sent_emb.cuda()
    model.sent_emb.weight.data.copy_(sent_emb)

    trainer = Trainer(args, model, criterion, optimizer)

    # train and test
    best = float("-inf")
    for epoch in range(args.epochs):
        train_loss = trainer.train(train_dataset)

        train_loss, train_pred = trainer.test(train_dataset)
        dev_loss, dev_pred = trainer.test(dev_dataset)
        test_loss, test_pred = trainer.test(test_dataset)

        train_pearson = metrics.pearson(train_pred, train_dataset.labels)
        train_mse = metrics.mse(train_pred, train_dataset.labels)
        logger.info('==> Epoch {}, Train \tLoss: {}\tPearson: {}\tMSE: {}'.format(epoch, train_loss, train_pearson, train_mse))
        
        dev_pearson = metrics.pearson(dev_pred, dev_dataset.labels)
        dev_mse = metrics.mse(dev_pred, dev_dataset.labels)
        logger.info('==> Epoch {}, Dev \tLoss: {}\tPearson: {}\tMSE: {}'.format(epoch, dev_loss, dev_pearson, dev_mse))

        test_pearson = metrics.pearson(test_pred, test_dataset.labels)
        test_mse = metrics.mse(test_pred, test_dataset.labels)
        logger.info('==> Epoch {}, Test \tLoss: {}\tPearson: {}\tMSE: {}'.format(epoch, test_loss, test_pearson, test_mse))

        if best < dev_pearson:
            best = dev_pearson
            checkpoint = {
                'model': trainer.model.state_dict(), 
                'optim': trainer.optimizer,
                'pearson': dev_pearson, 'mse': dev_mse,
                'args': args, 'epoch': epoch
                }
            logger.debug('==> New optimum found, checkpointing everything now...')
            torch.save(checkpoint, '%s.pt' % os.path.join(args.save, args.expname))
Example #15
def main():
    global args
    args = parse_args()

    mkl.set_num_threads(1)
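    # limiting MKL to a single thread is typically done to avoid CPU oversubscription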

    args.cuda = args.cuda and torch.cuda.is_available()
    if args.sparse and args.wd != 0:
        print('Sparsity and weight decay are incompatible, pick one!')
        exit()
    print(args)
    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    train_dir = os.path.join(args.data, 'train/')
    dev_dir = os.path.join(args.data, 'dev/')
    test_dir = os.path.join(args.data, 'test/')

    # write unique words from all token files
    token_files_a = [
        os.path.join(split, 'a.toks')
        for split in [train_dir, dev_dir, test_dir]
    ]
    token_files_b = [
        os.path.join(split, 'b.toks')
        for split in [train_dir, dev_dir, test_dir]
    ]
    token_files = token_files_a + token_files_b
    sick_vocab_file = os.path.join(args.data, 'sick.vocab')
    build_vocab(token_files, sick_vocab_file)

    # get vocab object from vocab file previously written
    vocab = Vocab(filename=sick_vocab_file,
                  data=[
                      Constants.PAD_WORD, Constants.UNK_WORD,
                      Constants.BOS_WORD, Constants.EOS_WORD
                  ])
    print('==> SICK vocabulary size : %d ' % vocab.size())

    # load SICK dataset splits
    train_file = os.path.join(args.data, 'sick_train.pth')
    if os.path.isfile(train_file):
        train_dataset = torch.load(train_file)
    else:
        train_dataset = SICKDataset(train_dir, vocab, args.num_classes)
        torch.save(train_dataset, train_file)
    print('==> Size of train data   : %d ' % len(train_dataset))
    dev_file = os.path.join(args.data, 'sick_dev.pth')
    if os.path.isfile(dev_file):
        dev_dataset = torch.load(dev_file)
    else:
        dev_dataset = SICKDataset(dev_dir, vocab, args.num_classes)
        torch.save(dev_dataset, dev_file)
    print('==> Size of dev data     : %d ' % len(dev_dataset))
    test_file = os.path.join(args.data, 'sick_test.pth')
    if os.path.isfile(test_file):
        test_dataset = torch.load(test_file)
    else:
        test_dataset = SICKDataset(test_dir, vocab, args.num_classes)
        torch.save(test_dataset, test_file)
    print('==> Size of test data    : %d ' % len(test_dataset))

    # initialize model, criterion/loss_function, optimizer
    model = SimilarityTreeLSTM(args.encoder_type, args.cuda, vocab.size(),
                               args.input_dim, args.mem_dim, args.hidden_dim,
                               args.num_classes, args.sparse, args)
    criterion = nn.KLDivLoss()
    if args.cuda:
        model.cuda(), criterion.cuda()

    trainable_parameters = [
        param for param in model.parameters() if param.requires_grad
    ]
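    # same effect as filter(lambda p: p.requires_grad, model.parameters()) used in the other examples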

    if args.optim == 'adam':
        optimizer = optim.Adam(trainable_parameters,
                               lr=args.lr,
                               weight_decay=args.wd)
    elif args.optim == 'adagrad':
        optimizer = optim.Adagrad(trainable_parameters,
                                  lr=args.lr,
                                  weight_decay=args.wd)
    elif args.optim == 'sgd':
        optimizer = optim.SGD(trainable_parameters,
                              lr=args.lr,
                              weight_decay=args.wd)
    metrics = Metrics(args.num_classes)

    # for words common to dataset vocab and GLOVE, use GLOVE vectors
    # for other words in dataset vocab, use random normal vectors
    emb_file = os.path.join(args.data, 'sick_embed.pth')
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
    else:
        # load glove embeddings and vocab
        glove_vocab, glove_emb = load_word_vectors(
            os.path.join(args.glove, 'glove.840B.300d'))
        print('==> GLOVE vocabulary size: %d ' % glove_vocab.size())
        emb = torch.Tensor(vocab.size(),
                           glove_emb.size(1)).normal_(-0.05, 0.05)
        # zero out the embeddings for padding and other special words if they are absent in vocab
        for idx, item in enumerate([
                Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD,
                Constants.EOS_WORD
        ]):
            # TODO '<s>', '</s>' these tokens present in glove w2v but probably with different meaning.
            # though they are not currently used
            emb[idx].zero_()
        for word in vocab.labelToIdx.keys():
            if word in glove_vocab.labelToIdx.keys():
                emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(
                    word)]
        torch.save(emb, emb_file)
    # plug these into embedding matrix inside model
    if args.cuda:
        emb = emb.cuda()
    model.encoder.emb.state_dict()['weight'].copy_(emb)

    # create trainer object for training and testing
    trainer = Trainer(args, model, criterion, optimizer)

    metric_functions = [metrics.pearson, metrics.mse]

    for epoch in range(args.epochs):
        train_loss = trainer.train(train_dataset)
        train_loss, train_pred = trainer.test(train_dataset)
        dev_loss, dev_pred = trainer.test(dev_dataset)
        test_loss, test_pred = trainer.test(test_dataset)

        pearson_stats, mse_stats = get_median_and_confidence_interval(
            train_pred, train_dataset.labels, metric_functions)
        print_results("Train", train_loss, pearson_stats, mse_stats)

        pearson_stats, mse_stats = get_median_and_confidence_interval(
            dev_pred, dev_dataset.labels, metric_functions)
        print_results("Dev", dev_loss, pearson_stats, mse_stats)

        pearson_stats, mse_stats = get_median_and_confidence_interval(
            test_pred, test_dataset.labels, metric_functions)
        print_results("Test", test_loss, pearson_stats, mse_stats)
Example #16
def main(write_to):

    startTime = time.time()

    global args
    args = parse_args(type=1)
    args.input_dim = 300
    if args.model_name == 'dependency':
        args.mem_dim = 168
    elif args.model_name == 'constituency':
        args.mem_dim = 150
    if args.fine_grain:
        args.num_classes = 5  # 0 1 2 3 4
    else:
        args.num_classes = 3  # 0 1 2 (1 neutral)
    args.cuda = args.cuda and torch.cuda.is_available()
    # args.cuda = False
    print(args)
    # torch.manual_seed(args.seed)
    # if args.cuda:
    # torch.cuda.manual_seed(args.seed)

    #    train_dir = os.path.join(args.data,'train/')
    train_dir = os.path.join(
        args.data, 'dev/')  # Fei: wants to train on a smaller data set
    #    dev_dir = os.path.join(args.data,'dev/')
    #    test_dir = os.path.join(args.data,'test/')

    # write unique words from all token files
    token_files = [os.path.join(split, 'sents.toks') for split in [train_dir]]
    vocab_file = os.path.join(args.data, 'vocab-cased.txt')  # use vocab-cased
    # build_vocab(token_files, vocab_file) NO, DO NOT BUILD VOCAB,  USE OLD VOCAB

    #    vocab_file = os.path.join(args.data, 'vocab-cased-dev.txt')
    #    build_vocab(token_files, vocab_file)

    # get vocab object from vocab file previously written
    vocab = Vocab(filename=vocab_file)
    print('==> SST vocabulary size : %d ' % vocab.size())

    # Load SST dataset splits

    is_preprocessing_data = False  # let program turn off after preprocess data

    # train
    train_file = os.path.join(args.data, 'sst_train.pth')
    if os.path.isfile(train_file):
        train_dataset = torch.load(train_file)
    else:
        train_dataset = SSTDataset(train_dir, vocab, args.num_classes,
                                   args.fine_grain, args.model_name)
        torch.save(train_dataset, train_file)
        is_preprocessing_data = True

    # dev


#    dev_file = os.path.join(args.data,'sst_dev.pth')
#    if os.path.isfile(dev_file):
#        dev_dataset = torch.load(dev_file)
#    else:
#        dev_dataset = SSTDataset(dev_dir, vocab, args.num_classes, args.fine_grain, args.model_name)
#        torch.save(dev_dataset, dev_file)
#        is_preprocessing_data = True

# test
#    test_file = os.path.join(args.data,'sst_test.pth')
#    if os.path.isfile(test_file):
#        test_dataset = torch.load(test_file)
#    else:
#        test_dataset = SSTDataset(test_dir, vocab, args.num_classes, args.fine_grain, args.model_name)
#        torch.save(test_dataset, test_file)
#        is_preprocessing_data = True

    criterion = nn.NLLLoss()
    # initialize model, criterion/loss_function, optimizer
    model = TreeLSTMSentiment(args.cuda, vocab.size(), args.input_dim,
                              args.mem_dim, args.num_classes, args.model_name,
                              criterion)

    embedding_model = nn.Embedding(vocab.size(), args.input_dim)
    # Fei: don't optimize embedding
    embedding_model.weight.requires_grad = False

    if args.cuda:
        embedding_model = embedding_model.cuda()

    if args.cuda:
        model.cuda(), criterion.cuda()
    if args.optim == 'adam':
        optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                      model.parameters()),
                               lr=args.lr,
                               weight_decay=args.wd)
    elif args.optim == 'adagrad':
        # optimizer   = optim.Adagrad(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd)
        optimizer = optim.Adagrad(
            [{
                'params': filter(lambda p: p.requires_grad,
                                 model.parameters()),
                'lr': args.lr
            }  # Fei: filter non_trainable
             ],
            lr=args.lr,
            weight_decay=args.wd)
    metrics = Metrics(args.num_classes)

    utils.count_param(model)

    # for words common to dataset vocab and GLOVE, use GLOVE vectors
    # for other words in dataset vocab, use random normal vectors
    emb_file = os.path.join(args.data, 'sst_embed.pth')
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
    else:

        # load glove embeddings and vocab
        glove_vocab, glove_emb = load_word_vectors(
            os.path.join(args.glove, 'glove.840B.300d'))
        print('==> GLOVE vocabulary size: %d ' % glove_vocab.size())

        emb = torch.zeros(vocab.size(), glove_emb.size(1))

        for word in vocab.labelToIdx.keys():
            if glove_vocab.getIndex(word):
                emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(
                    word)]
            else:
                emb[vocab.getIndex(word)] = torch.Tensor(
                    emb[vocab.getIndex(word)].size()).normal_(-0.05, 0.05)
        torch.save(emb, emb_file)
        is_preprocessing_data = True  # flag to quit
        print('done creating emb, quit')

    if is_preprocessing_data:
        print('done preprocessing data, quit program to prevent memory leak')
        print('please run again')
        quit()

    # plug these into embedding matrix inside model
    if args.cuda:
        emb = emb.cuda()

    # model.childsumtreelstm.emb.state_dict()['weight'].copy_(emb)
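    # copy the pretrained vectors into the frozen embedding (requires_grad was set to False above)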
    embedding_model.state_dict()['weight'].copy_(emb)

    # create trainer object for training and testing
    trainer = SentimentTrainer(args, model, embedding_model, criterion,
                               optimizer)

    loopStart = time.time()
    #print('prepare time is %s ' % (loopStart - startTime))
    loss_save = []

    mode = 'EXPERIMENT'
    if mode == 'DEBUG':
        for epoch in range(args.epochs):
            dev_loss = trainer.train(dev_dataset)
            dev_loss, dev_pred = trainer.test(dev_dataset)
            test_loss, test_pred = trainer.test(test_dataset)

            dev_acc = metrics.sentiment_accuracy_score(dev_pred,
                                                       dev_dataset.labels)
            test_acc = metrics.sentiment_accuracy_score(
                test_pred, test_dataset.labels)
            print('==> Dev loss   : %f \t' % dev_loss, end="")
            print('Epoch ', epoch, 'dev percentage ', dev_acc)
    elif mode == "PRINT_TREE":
        for i in range(0, 10):
            ttree, tsent, tlabel = dev_dataset[i]
            utils.print_tree(ttree, 0)
            print('_______________')
        print('break')
        quit()
    elif mode == "EXPERIMENT":
        max_dev = 0
        max_dev_epoch = 0
        filename = args.name + '.pth'
        for epoch in range(args.epochs):
            train_loss = trainer.train(train_dataset)
            #dev_loss, dev_pred = trainer.test(dev_dataset)
            #dev_acc = metrics.sentiment_accuracy_score(dev_pred, dev_dataset.labels)
            print('==> Train loss   : %f \t' % train_loss, end="")
            loss_save.append(train_loss)
            #print('Epoch ', epoch, 'dev percentage ', dev_acc)
            #torch.save(model, args.saved + str(epoch) + '_model_' + filename)
            #torch.save(embedding_model, args.saved + str(epoch) + '_embedding_' + filename)
            #if dev_acc > max_dev:
            #    max_dev = dev_acc
            #    max_dev_epoch = epoch
            #gc.collect()

        print("done")
        #print('epoch ' + str(max_dev_epoch) + ' dev score of ' + str(max_dev))
        #print('eva on test set ')
        #model = torch.load(args.saved + str(max_dev_epoch) + '_model_' + filename)
        #embedding_model = torch.load(args.saved + str(max_dev_epoch) + '_embedding_' + filename)
        #trainer = SentimentTrainer(args, model, embedding_model, criterion, optimizer)
        #test_loss, test_pred = trainer.test(test_dataset)
        #test_acc = metrics.sentiment_accuracy_score(test_pred, test_dataset.labels)
        #print('Epoch with max dev:' + str(max_dev_epoch) + ' |test percentage ' + str(test_acc))
        #print('____________________' + str(args.name) + '___________________')
    else:
        for epoch in range(args.epochs):
            train_loss = trainer.train(train_dataset)
            train_loss, train_pred = trainer.test(train_dataset)
            dev_loss, dev_pred = trainer.test(dev_dataset)
            test_loss, test_pred = trainer.test(test_dataset)

            train_acc = metrics.sentiment_accuracy_score(
                train_pred, train_dataset.labels)
            dev_acc = metrics.sentiment_accuracy_score(dev_pred,
                                                       dev_dataset.labels)
            test_acc = metrics.sentiment_accuracy_score(
                test_pred, test_dataset.labels)
            print('==> Train loss   : %f \t' % train_loss, end="")
            print('Epoch ', epoch, 'train percentage ', train_acc)
            print('Epoch ', epoch, 'dev percentage ', dev_acc)
            print('Epoch ', epoch, 'test percentage ', test_acc)

    loopEnd = time.time()
    print('looptime is %s ' % (loopEnd - loopStart))

    prepareTime = loopStart - startTime
    loopTime = loopEnd - loopStart
    timePerEpoch = loopTime / args.epochs

    with open(write_to, "w") as f:
        f.write("unit: " + "1 epoch\n")
        for loss in loss_save:
            f.write(str(loss) + "\n")
        f.write("run time: " + str(prepareTime) + " " + str(timePerEpoch) +
                "\n")
Example #17
def main():
    global args
    args = parse_args()
    # global logger
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter("[%(asctime)s] %(levelname)s:%(name)s:%(message)s")
    # file logger
    fh = logging.FileHandler(os.path.join(args.save, args.expname)+'.log', mode='w')
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    # console logger
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    # argument validation
    args.cuda = args.cuda and torch.cuda.is_available()
    if args.sparse and args.wd != 0:
        logger.error('Sparsity and weight decay are incompatible, pick one!')
        exit()
    logger.debug(args)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.benchmark = True
    if not os.path.exists(args.save):
        os.makedirs(args.save)

    train_dir = os.path.join(args.data, 'train/')
    dev_dir = os.path.join(args.data, 'dev/')
    test_dir = os.path.join(args.data, 'test/')

    # write unique words from all token files
    sick_vocab_file = os.path.join(args.data, 'sick.vocab')
    if not os.path.isfile(sick_vocab_file):
        token_files_b = [os.path.join(split, 'b.toks') for split in [train_dir, dev_dir, test_dir]]
        token_files_a = [os.path.join(split, 'a.toks') for split in [train_dir, dev_dir, test_dir]]
        token_files = token_files_a + token_files_b
        sick_vocab_file = os.path.join(args.data, 'sick.vocab')
        build_vocab(token_files, sick_vocab_file)

    # get vocab object from vocab file previously written
    vocab = Vocab(filename=sick_vocab_file, data=[Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD])
    logger.debug('==> SICK vocabulary size : %d ' % vocab.size())

    # load SICK dataset splits
    train_file = os.path.join(args.data, 'sick_train.pth')
    if os.path.isfile(train_file):
        train_dataset = torch.load(train_file)
    else:
        train_dataset = SICKDataset(train_dir, vocab, args.num_classes)
        torch.save(train_dataset, train_file)
    logger.debug('==> Size of train data   : %d ' % len(train_dataset))
    dev_file = os.path.join(args.data, 'sick_dev.pth')
    if os.path.isfile(dev_file):
        dev_dataset = torch.load(dev_file)
    else:
        dev_dataset = SICKDataset(dev_dir, vocab, args.num_classes)
        torch.save(dev_dataset, dev_file)
    logger.debug('==> Size of dev data     : %d ' % len(dev_dataset))
    test_file = os.path.join(args.data, 'sick_test.pth')
    if os.path.isfile(test_file):
        test_dataset = torch.load(test_file)
    else:
        test_dataset = SICKDataset(test_dir, vocab, args.num_classes)
        torch.save(test_dataset, test_file)
    logger.debug('==> Size of test data    : %d ' % len(test_dataset))

    # initialize model, criterion/loss_function, optimizer
    model = SimilarityTreeLSTM(
                vocab.size(),
                args.input_dim,
                args.mem_dim,
                args.hidden_dim,
                args.num_classes,
                args.sparse,
                args.freeze_embed)
    criterion = nn.KLDivLoss()
    if args.cuda:
        model.cuda(), criterion.cuda()
    if args.optim == 'adam':
        optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd)
    elif args.optim == 'adagrad':
        optimizer = optim.Adagrad(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd)
    elif args.optim == 'sgd':
        optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd)
    metrics = Metrics(args.num_classes)

    # for words common to dataset vocab and GLOVE, use GLOVE vectors
    # for other words in dataset vocab, use random normal vectors
    emb_file = os.path.join(args.data, 'sick_embed.pth')
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
    else:
        # load glove embeddings and vocab
        glove_vocab, glove_emb = load_word_vectors(os.path.join(args.glove, 'glove.840B.300d'))
        logger.debug('==> GLOVE vocabulary size: %d ' % glove_vocab.size())
        emb = torch.Tensor(vocab.size(), glove_emb.size(1)).normal_(-0.05, 0.05)
        # zero out the embeddings for padding and other special words if they are absent in vocab
        for idx, item in enumerate([Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD]):
            emb[idx].zero_()
        for word in vocab.labelToIdx.keys():
            if glove_vocab.getIndex(word):
                emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(word)]
        torch.save(emb, emb_file)
    # plug these into embedding matrix inside model
    if args.cuda:
        emb = emb.cuda()
    model.emb.weight.data.copy_(emb)

    # create trainer object for training and testing
    trainer = Trainer(args, model, criterion, optimizer)

    best = -float('inf')
    for epoch in range(args.epochs):
        train_loss             = trainer.train(train_dataset)
        train_loss, train_pred = trainer.test(train_dataset)
        dev_loss, dev_pred     = trainer.test(dev_dataset)
        test_loss, test_pred   = trainer.test(test_dataset)

        train_pearson = metrics.pearson(train_pred, train_dataset.labels)
        train_mse = metrics.mse(train_pred, train_dataset.labels)
        logger.info('==> Epoch {}, Train \tLoss: {}\tPearson: {}\tMSE: {}'.format(epoch, train_loss, train_pearson, train_mse))
        dev_pearson = metrics.pearson(dev_pred, dev_dataset.labels)
        dev_mse = metrics.mse(dev_pred, dev_dataset.labels)
        logger.info('==> Epoch {}, Dev \tLoss: {}\tPearson: {}\tMSE: {}'.format(epoch, dev_loss, dev_pearson, dev_mse))
        test_pearson = metrics.pearson(test_pred, test_dataset.labels)
        test_mse = metrics.mse(test_pred, test_dataset.labels)
        logger.info('==> Epoch {}, Test \tLoss: {}\tPearson: {}\tMSE: {}'.format(epoch, test_loss, test_pearson, test_mse))

        if best < test_pearson:
            best = test_pearson
            checkpoint = {
                'model': trainer.model.state_dict(), 
                'optim': trainer.optimizer,
                'pearson': test_pearson, 'mse': test_mse,
                'args': args, 'epoch': epoch
                }
            logger.debug('==> New optimum found, checkpointing everything now...')
            torch.save(checkpoint, '%s.pt' % os.path.join(args.save, args.expname))
Example #18
    
    print('test score %.2f' % (cum_score / cum_samples))


if __name__ == '__main__':
    args = docopt(__doc__)
    
    textual_data_path = args['--textual-data-path']
    visual_data_path = args['--visual-data-path']
    batch_size = int(args['--batch-size'])
    delta = int(args['--delta'])
    K = int(args['--K'])
    threshold = float(args['--threshold'])

    word_embed_size = 50
    words, word_vectors = load_word_vectors('glove.6B.{}d.txt'.format(word_embed_size))
    vocab = Vocab(words)

    if args['tacos']:
        dataset = TACoS(textual_data_path=textual_data_path, visual_data_path=visual_data_path, 
                        K=K, delta=delta, threshold=threshold)
    elif args['acnet']:
        dataset = ActivityNet(textual_data_path=textual_data_path, visual_data_path=visual_data_path, 
                              K=K, delta=delta, threshold=threshold)
    
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print('use device: %s' % device, file=sys.stderr)
    
    print('loading the model from %s ...' % args['--model-path'])
    model = TGN.load(args['--model-path'])
    model.to(device)
Example #19


# +
### For changing embeddings
# -

# for words common to dataset vocab and GLOVE, use GLOVE vectors
# for other words in dataset vocab, use random normal vectors
emb_file = os.path.join(args.data, 'sick_embed.pth')
if os.path.isfile(emb_file):
    emb = torch.load(emb_file)
else:
    # load glove embeddings and vocab
    print("embedding")
    glove_vocab, glove_emb = utils.load_word_vectors(
        os.path.join(args.glove, 'glove.840B.300d')) #glove.840B.300d
    logger.debug('==> GLOVE vocabulary size: %d ' % glove_vocab.size())
    emb = torch.zeros(vocab.size(), glove_emb.size(1), dtype=torch.float, device=device)
    emb.normal_(0, 0.05)
    # zero out the embeddings for padding and other special words if they are absent in vocab
    for idx, item in enumerate([Constants.PAD_WORD, Constants.UNK_WORD,
                                Constants.BOS_WORD, Constants.EOS_WORD]):
        emb[idx].zero_()
    for word in vocab.labelToIdx.keys():
        if glove_vocab.getIndex(word):
            emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(word)]
    torch.save(emb, emb_file)
# plug these into embedding matrix inside model
model.emb.weight.data.copy_(emb)

# +
Example #20
def train():
    """
    Build and Train model by given params
    """

    # params
    # assigned after loading data
    max_seq_length = None
    exp_name = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    keep_prob = 0.5
    n_hidden = 64
    num_classes = 5
    learning_rate = 1e-3
    model_save_path = os.path.join(MODELS_BASE_DIR, exp_name + '.cpkt')
    train_iterations = 100000
    eval_iterations = None
    batch_size = 24
    word_vector_dim = 300

    # ************** Pre-Model **************
    # Load data
    data_params = data_loader.get_data_params(DATA_BASE_DIR)
    max_seq_length = data_params["max_seq_length"]
    X_train, X_eval, y_train, y_eval = data_loader.load_data(
        data_params, one_hot_labels=USE_ONE_HOT_LABELS)
    print("==> Loaded data")

    eval_iterations = math.ceil(float(X_eval.shape[0]) / batch_size)

    # Load GloVe embbeding vectors
    word_vectors = load_word_vectors(WORD_VECTORS_PATH)

    # Batch generators
    train_batch_generator = batch_generator_uniform_prob(
        (X_train, y_train), batch_size, num_classes)
    eval_batch_generator = batch_generator_uniform_prob(
        (X_eval, y_eval), batch_size, num_classes)

    # ************** Model **************
    # placeholders
    labels = tf.placeholder(tf.float32, [None, num_classes])
    input_data = tf.placeholder(tf.int32, [None, max_seq_length])
    input_data_lengths = tf.placeholder(tf.int32, batch_size)

    # data processing
    data = tf.Variable(tf.zeros([batch_size, max_seq_length, word_vector_dim]),
                       dtype=tf.float32)

    data = tf.nn.embedding_lookup(word_vectors, input_data)
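    # note: the tf.Variable created above is immediately shadowed by this lookup,
    # so only the embedding lookup result actually feeds the RNN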

    # lstm cell
    lstm_cell = tf.nn.rnn_cell.LSTMCell(n_hidden)
    if USE_DROPOUT:
        lstm_cell = tf.nn.rnn_cell.DropoutWrapper(cell=lstm_cell,
                                                  output_keep_prob=keep_prob)
    # Do we need the state tuple? We don't want the cell to be initialized with
    # the state carried over from the previous sentence.
    ## rnn_tuple_state = tf.nn.rnn_cell.LSTMStateTuple(init_state[0], init_state[1])

    if DYN_RNN_COPY_THROUGH_STATE:
        outputs, _ = tf.nn.dynamic_rnn(lstm_cell,
                                       data,
                                       dtype=tf.float32,
                                       sequence_length=input_data_lengths)
    else:
        outputs, _ = tf.nn.dynamic_rnn(lstm_cell, data, dtype=tf.float32)

    # output layer
    weight = tf.Variable(tf.truncated_normal([n_hidden, num_classes]))
    bias = tf.Variable(tf.constant(0.1, shape=[num_classes]))

    # Let's try this logic
    outputs = tf.transpose(
        outputs, [1, 0, 2])  # -> [max_seq_length, batch_size, n_hidden]
    last = tf.gather(outputs, int(outputs.get_shape()[0]) - 1)
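    # Note: when sequence_length is passed to dynamic_rnn, outputs beyond each
    # example's true length are zero vectors, so the final time step gathered
    # above can be all zeros for short sequences. A hedged alternative sketch
    # (not wired into the graph here) gathers the last *valid* step instead:
    #   idx = tf.stack([input_data_lengths - 1, tf.range(batch_size)], axis=1)
    #   last = tf.gather_nd(outputs, idx)  # outputs is time-major at this point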
    prediction = (tf.matmul(last, weight) + bias)

    correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(labels, 1))
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

    # Metrics
    # Should we reduce_mean?
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=prediction,
                                                labels=labels))
    optimizer = tf.train.AdamOptimizer(
        learning_rate=learning_rate).minimize(loss)

    # Summaries
    tf.summary.scalar('Loss', loss)
    tf.summary.scalar('Accuracy', accuracy)
    merged = tf.summary.merge_all()
    logdir = os.path.join(LOGS_BASE_DIR, exp_name, "")

    # ************** Train **************
    print("Run 'tensorboard --logdir={}' to checkout tensorboard logs.".format(
        os.path.abspath(logdir)))
    print("==> training")

    best_accuracy = -1

    # Train
    with tf.Session() as sess:
        train_writer = tf.summary.FileWriter(os.path.join(logdir, "train"))
        eval_writer = tf.summary.FileWriter(os.path.join(logdir, "evaluation"))

        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())

        # Python 3 range (on Python 2.7, use xrange instead)
        for iteration in tqdm.tqdm(range(train_iterations)):
            # shouldn't raise an exception here, but worth double-checking
            X, y = next(train_batch_generator)
            X_lengths = get_lengths(X, PADD_VAL)  # see the helper sketch after this example
            if DEBUG:
                print("X.shape = {}, X_lengths.shape = {}".format(
                    X.shape, X_lengths.shape))
                print("y.shape = {}".format(y.shape))
                print("type(X) = {}, type(X_lengths) = {}".format(
                    X.dtype, X_lengths.dtype))
                idx = 3
                print("X[:{0}], X_length[:{0}]".format(idx))
                print(X[:idx])
                print(X_lengths[:idx])

            sess.run([optimizer],
                     feed_dict={
                         input_data: X,
                         labels: y,
                         input_data_lengths: X_lengths
                     })

            # Write summary
            if (iteration % 30 == 0):
                _summary, = sess.run([merged],
                                     feed_dict={
                                         input_data: X,
                                         labels: y,
                                         input_data_lengths: X_lengths
                                     })
                train_writer.add_summary(_summary, iteration)

            # evaluate the network every 1,000 iterations
            if (iteration % 1000 == 0 and iteration != 0):
                total_accuracy = 0
                for eval_iteration in tqdm.tqdm(range(eval_iterations)):
                    X, y = next(eval_batch_generator)
                    X_lengths = get_lengths(X, PADD_VAL)
                    _accuracy, _summary = sess.run([accuracy, merged],
                                                   feed_dict={
                                                       input_data:
                                                       X,
                                                       labels:
                                                       y,
                                                       input_data_lengths:
                                                       X_lengths
                                                   })
                    total_accuracy += _accuracy

                average_accuracy = total_accuracy / eval_iterations
                print("accuracy = {}".format(average_accuracy))
                if average_accuracy > best_accuracy:
                    print("Best model!")

                    save_path = saver.save(sess,
                                           model_save_path,
                                           global_step=iteration)
                    print("saved to %s" % save_path)

                    best_accuracy = average_accuracy

        eval_writer.close()
        train_writer.close()
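
The helpers get_lengths and batch_generator_uniform_prob used above are not shown in
this example; the sketch below is one plausible implementation of each, assuming X is
a padded [batch, max_seq_length] integer matrix, PADD_VAL marks padding positions
(with padding only at the end of each row), and labels may be one-hot encoded.

import numpy as np

def get_lengths(X, pad_val):
    # number of non-padding tokens per row of a padded integer matrix
    return np.sum(X != pad_val, axis=1).astype(np.int32)

def batch_generator_uniform_prob(data, batch_size, num_classes):
    # endlessly yield batches in which every class is drawn with equal probability
    X, y = data
    class_ids = np.argmax(y, axis=1) if y.ndim > 1 else y
    by_class = [np.where(class_ids == c)[0] for c in range(num_classes)]
    while True:
        chosen = np.random.randint(num_classes, size=batch_size)
        idx = np.array([np.random.choice(by_class[c]) for c in chosen])
        yield X[idx], y[idx]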
Example #21
0
def main():
    global args
    args = parse_args(type=10)
    args.input_dim = 300
    args.hidden_dim = 50
    # args.input_dim, args.mem_dim = 300, 150
    # args.hidden_dim, args.num_classes = 50, 5
    if args.model_name == 'dependency':
        args.mem_dim = 150
    elif args.model_name == 'constituency':
        args.mem_dim = 142
    args.num_classes = 5

    args.cuda = args.cuda and torch.cuda.is_available()
    print(args)
    # torch.manual_seed(args.seed)
    # if args.cuda:
        # torch.cuda.manual_seed(args.seed)

    train_dir = os.path.join(args.data,'train/')
    dev_dir = os.path.join(args.data,'dev/')
    test_dir = os.path.join(args.data,'test/')

    # write unique words from all token files
    token_files_a = [os.path.join(split,'a.toks') for split in [train_dir,dev_dir,test_dir]]
    token_files_b = [os.path.join(split,'b.toks') for split in [train_dir,dev_dir,test_dir]]
    token_files = token_files_a+token_files_b
    sick_vocab_file = os.path.join(args.data,'vocab-cased.txt')
    # build_vocab(token_files, sick_vocab_file)

    # get vocab object from vocab file previously written
    vocab = Vocab(filename=sick_vocab_file, data=[Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD])
    print('==> SICK vocabulary size : %d ' % vocab.size())

    # load SICK dataset splits
    train_file = os.path.join(args.data,'sick_train.pth')
    if os.path.isfile(train_file):
        train_dataset = torch.load(train_file)
    else:
        train_dataset = SICKDataset(train_dir, vocab, args.num_classes)
        torch.save(train_dataset, train_file)
    print('==> Size of train data   : %d ' % len(train_dataset))
    dev_file = os.path.join(args.data,'sick_dev.pth')
    if os.path.isfile(dev_file):
        dev_dataset = torch.load(dev_file)
    else:
        dev_dataset = SICKDataset(dev_dir, vocab, args.num_classes)
        torch.save(dev_dataset, dev_file)
    print('==> Size of dev data     : %d ' % len(dev_dataset))
    test_file = os.path.join(args.data,'sick_test.pth')
    if os.path.isfile(test_file):
        test_dataset = torch.load(test_file)
    else:
        test_dataset = SICKDataset(test_dir, vocab, args.num_classes)
        torch.save(test_dataset, test_file)
    print('==> Size of test data    : %d ' % len(test_dataset))

    # initialize model, criterion/loss_function, optimizer
    model = SimilarityTreeLSTM(
                args.cuda, vocab.size(),
                args.input_dim, args.mem_dim,
                args.hidden_dim, args.num_classes
            )
    embedding_model = nn.Embedding(vocab.size(), args.input_dim)
    if args.cuda:
        embedding_model = embedding_model.cuda()

    criterion = nn.KLDivLoss()
    if args.cuda:
        model.cuda(), criterion.cuda()
    if args.optim=='adam':
        optimizer   = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd)
    elif args.optim=='adagrad':
        optimizer   = optim.Adagrad(model.parameters(), lr=args.lr, weight_decay=args.wd)
    metrics = Metrics(args.num_classes)

    # for words common to dataset vocab and GLOVE, use GLOVE vectors
    # for other words in dataset vocab, use random normal vectors
    emb_file = os.path.join(args.data, 'sick_embed.pth')
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
    else:
        # load glove embeddings and vocab
        glove_vocab, glove_emb = load_word_vectors(os.path.join(args.glove,'glove.840B.300d'))
        print('==> GLOVE vocabulary size: %d ' % glove_vocab.size())
        emb = torch.Tensor(vocab.size(),glove_emb.size(1)).normal_(-0.05,0.05)
        # zero out the embeddings for padding and other special words if they are absent in vocab
        for idx, item in enumerate([Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD]):
            emb[idx].zero_()
        for word in vocab.labelToIdx.keys():
            # assuming getIndex returns None for out-of-vocabulary words, compare against
            # None explicitly so a word stored at GLOVE index 0 is not skipped
            if glove_vocab.getIndex(word) is not None:
                emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(word)]
        torch.save(emb, emb_file)
    # plug these into embedding matrix inside model
    if args.cuda:
        emb = emb.cuda()
    embedding_model.state_dict()['weight'].copy_(emb)
    # model.childsumtreelstm.emb.state_dict()['weight'].copy_(emb)
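    # (equivalently, embedding_model.weight.data.copy_(emb) works in more recent PyTorch)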

    # create trainer object for training and testing
    trainer     = SimilarityTrainer(args, model, embedding_model, criterion, optimizer)

    for epoch in range(args.epochs):
        train_loss             = trainer.train(train_dataset)
        train_loss, train_pred = trainer.test(train_dataset)
        dev_loss, dev_pred     = trainer.test(dev_dataset)
        test_loss, test_pred   = trainer.test(test_dataset)

        print('==> Train loss   : %f \t' % train_loss, end="")
        print('Train Pearson    : %f \t' % metrics.pearson(train_pred,train_dataset.labels), end="")
        print('Train MSE        : %f \t' % metrics.mse(train_pred,train_dataset.labels), end="\n")
        print('==> Dev loss     : %f \t' % dev_loss, end="")
        print('Dev Pearson      : %f \t' % metrics.pearson(dev_pred,dev_dataset.labels), end="")
        print('Dev MSE          : %f \t' % metrics.mse(dev_pred,dev_dataset.labels), end="\n")
        print('==> Test loss    : %f \t' % test_loss, end="")
        print('Test Pearson     : %f \t' % metrics.pearson(test_pred,test_dataset.labels), end="")
        print('Test MSE         : %f \t' % metrics.mse(test_pred,test_dataset.labels), end="\n")