Example #1
def main():
    method_name = set_method_name()
    logfile = init_logfile(method_name, opt)
    pretrained = load_pretrained(opt)

    dataset = Dataset.load(dataset_name=opt.dataset, pickle_path=opt.pickle_path).show()
    word2index, out_of_vocabulary, unk_index, pad_index, devel_index, test_index = index_dataset(dataset, pretrained)

    # dataset split tr/val/test
    val_size = min(int(len(devel_index) * .2), 20000)
    train_index, val_index, ytr, yval = train_test_split(devel_index, dataset.devel_target, test_size=val_size, random_state=opt.seed, shuffle=True)
    yte = dataset.test_target

    vocabsize = len(word2index) + len(out_of_vocabulary)
    pretrained_embeddings, sup_range = embedding_matrix(dataset, pretrained, vocabsize, word2index, out_of_vocabulary, opt)

    model = init_Net(dataset.nC, vocabsize, pretrained_embeddings, sup_range, tocuda=True)
    optim = init_optimizer(model, lr=opt.lr)
    criterion = init_loss(dataset.classification_type)

    # train-validate
    tinit = time()
    create_if_not_exist(opt.checkpoint_dir)
    early_stop = EarlyStopping(model, patience=opt.patience, checkpoint=f'{opt.checkpoint_dir}/{opt.net}-{opt.dataset}')

    for epoch in range(1, opt.nepochs + 1):
        train(model, train_index, ytr, pad_index, tinit, logfile, criterion, optim, epoch, method_name)

        # validation
        macrof1 = test(model, val_index, yval, pad_index, dataset.classification_type, tinit, epoch, logfile, criterion, 'va')
        early_stop(macrof1, epoch)
        if opt.test_each>0:
            if (opt.plotmode and (epoch==1 or epoch%opt.test_each==0)) or (not opt.plotmode and epoch%opt.test_each==0 and epoch<opt.nepochs):
                test(model, test_index, yte, pad_index, dataset.classification_type, tinit, epoch, logfile, criterion, 'te')

        if early_stop.STOP:
            print('[early-stop]')
            if not opt.plotmode: # with plotmode activated, early-stop is ignored
                break

    # restores the best model according to the Mf1 of the validation set (only when plotmode==False)
    stoptime = early_stop.stop_time - tinit
    stopepoch = early_stop.best_epoch
    logfile.add_row(epoch=stopepoch, measure='early-stop', value=early_stop.best_score, timelapse=stoptime)

    if not opt.plotmode:
        print('performing final evaluation')
        model = early_stop.restore_checkpoint()

        if opt.val_epochs>0:
            print(f'last {opt.val_epochs} epochs on the validation set')
            for val_epoch in range(1, opt.val_epochs + 1):
                train(model, val_index, yval, pad_index, tinit, logfile, criterion, optim, epoch+val_epoch, method_name)

        # test
        print('Training complete: testing')
        test(model, test_index, yte, pad_index, dataset.classification_type, tinit, epoch, logfile, criterion, 'final-te')
Example #2
def fetch_ohsumed50k(data_path=None, subset='train', train_test_split=0.7):
    _dataname = 'ohsumed50k'
    if data_path is None:
        data_path = join(os.path.expanduser('~'), _dataname)
    create_if_not_exist(data_path)

    pickle_file = join(data_path, _dataname + '.' + subset + str(train_test_split) + '.pickle')
    if not os.path.exists(pickle_file):
        DOWNLOAD_URL = ('http://disi.unitn.it/moschitti/corpora/ohsumed-all-docs.tar.gz')
        archive_path = os.path.join(data_path, 'ohsumed-all-docs.tar.gz')
        download_file_if_not_exists(DOWNLOAD_URL, archive_path)
        untardir = 'ohsumed-all'
        if not os.path.exists(os.path.join(data_path, untardir)):
            print("untarring ohsumed...")
            with tarfile.open(archive_path, 'r:gz') as tar:
                tar.extractall(data_path)

        target_names = []
        doc_classes = dict()
        class_docs = dict()
        content = dict()
        doc_ids = set()
        for cat_id in os.listdir(join(data_path, untardir)):
            target_names.append(cat_id)
            class_docs[cat_id] = []
            for doc_id in os.listdir(join(data_path, untardir, cat_id)):
                doc_ids.add(doc_id)
                with open(join(data_path, untardir, cat_id, doc_id), 'r') as f:
                    text_content = f.read()
                if doc_id not in doc_classes:
                    doc_classes[doc_id] = []
                doc_classes[doc_id].append(cat_id)
                if doc_id not in content:
                    content[doc_id] = text_content
                class_docs[cat_id].append(doc_id)
        target_names.sort()
        print('Read %d different documents' % len(doc_ids))

        splitdata = dict({'train': [], 'test': []})
        for cat_id in target_names:
            free_docs = [d for d in class_docs[cat_id] if (d not in splitdata['train'] and d not in splitdata['test'])]
            if len(free_docs) > 0:
                split_point = int(math.floor(len(free_docs) * train_test_split))
                splitdata['train'].extend(free_docs[:split_point])
                splitdata['test'].extend(free_docs[split_point:])
        for split in ['train', 'test']:
            dataset = LabelledDocuments([], [], target_names)
            for doc_id in splitdata[split]:
                dataset.data.append(content[doc_id])
                dataset.target.append([target_names.index(cat_id) for cat_id in doc_classes[doc_id]])
            with open(join(data_path, _dataname + '.' + split + str(train_test_split) + '.pickle'), 'wb') as f:
                pickle.dump(dataset, f, protocol=pickle.HIGHEST_PROTOCOL)

    print(pickle_file)
    with open(pickle_file, 'rb') as f:
        return pickle.load(f)
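
A minimal usage sketch (not part of the original snippet), assuming fetch_ohsumed50k is importable as defined above; the returned object is the pickled LabelledDocuments, so its data, target, and target_names fields are populated as shown.

# Hypothetical usage: load both splits with the default 70/30 split and inspect them.
train_set = fetch_ohsumed50k(subset='train', train_test_split=0.7)
test_set = fetch_ohsumed50k(subset='test', train_test_split=0.7)
print(f'{len(train_set.data)} training docs, {len(test_set.data)} test docs')
print(f'categories: {train_set.target_names[:5]}')
print(f'label indices of the first training doc: {train_set.target[0]}')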
Example #3
def get_embedding_matrix_path(datasetname, dataset, pretrained, supervised,
                              random, vec_matrix_path):
    matrix_name = f'{datasetname}-pretrained{pretrained}-supervised{supervised}-random{random}.vec'
    matrix_path = f'{vec_matrix_path}/{matrix_name}'
    if not os.path.exists(matrix_path):
        vocabulary, matrix = embedding_matrix(dataset, pretrained, supervised,
                                              random)
        create_if_not_exist(vec_matrix_path)
        save_vectors(matrix_path, vocabulary, matrix)
        dims = matrix.shape[1]
    else:
        _, dims = load_metadata_from_vectors_file(matrix_path)
    return matrix_path, dims
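
A hedged call sketch (argument values are placeholders, not taken from the original source), mirroring how Example #4 invokes this helper: it returns the path of the cached .vec file plus its dimensionality, which can then be passed to fastText as pretrained vectors.

# Hypothetical usage; 'dataset' is assumed to be an already-loaded Dataset object.
matrix_path, dims = get_embedding_matrix_path('20newsgroups', dataset,
                                              pretrained=True, supervised=True,
                                              random=0, vec_matrix_path='../vectors')
print(f'embedding matrix cached at {matrix_path} (dim={dims})')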
Example #4
def main():

    # init the log-file
    method_name = 'fasttext'
    method_name += '-bigrams' if args.bigrams else '-unigrams'
    method_name += '-glove' if args.pretrained else ''
    method_name += '-rand' if args.pretrained and args.learnable > 0 else ''
    method_name += '-sup' if args.supervised else ''
    logfile = CSVLog(args.log_file, [
        'dataset', 'method', 'lr', 'learnable', 'nepochs', 'seed', 'measure',
        'value', 'timelapse'
    ],
                     autoflush=True)
    logfile.set_default('dataset', args.dataset)
    logfile.set_default('method', method_name)
    logfile.set_default('seed', args.seed)
    logfile.set_default('lr', args.lr)
    logfile.set_default('learnable', args.learnable)
    logfile.set_default('nepochs', args.nepochs)
    assert args.force or not logfile.already_calculated(), \
        f'results for dataset {args.dataset} method {method_name} and run {args.seed} already calculated'

    # load dataset
    dataset = Dataset.load(dataset_name=args.dataset,
                           pickle_path=args.pickle_path)

    matrix_path = None
    if args.pretrained or args.supervised:
        matrix_path, dims = get_embedding_matrix_path(args.dataset, dataset,
                                                      args.pretrained,
                                                      args.supervised,
                                                      args.learnable,
                                                      args.vec_matrix_path)

    analyzer = dataset.analyzer()
    devel = [
        ' '.join(analyzer(t))
        for t in tqdm(dataset.devel_raw, desc='indexing-devel')
    ]
    test = [
        ' '.join(analyzer(t))
        for t in tqdm(dataset.test_raw, desc='indexing-test')
    ]

    # dataset split tr/val/test
    val_size = min(int(len(devel) * .2), 20000)
    train, val, ytr, yva = train_test_split(devel,
                                            dataset.devel_target,
                                            test_size=val_size,
                                            random_state=args.seed,
                                            shuffle=True)
    yte = dataset.test_target
    print(f'tr={len(train)} va={len(val)} test={len(test)} docs')

    create_if_not_exist(args.dataset_dir)
    trainpath = get_input_file(train, ytr)

    loss = 'ova' if dataset.classification_type == 'multilabel' else 'softmax'
    ngrams = 2 if args.bigrams else 1
    tinit = time()
    if matrix_path is None:
        model = train_supervised(input=trainpath,
                                 epoch=args.nepochs,
                                 lr=args.lr,
                                 wordNgrams=ngrams,
                                 verbose=2,
                                 minCount=1,
                                 loss=loss,
                                 dim=args.learnable)
    else:
        model = train_supervised(input=trainpath,
                                 epoch=args.nepochs,
                                 lr=args.lr,
                                 wordNgrams=ngrams,
                                 verbose=2,
                                 minCount=1,
                                 loss=loss,
                                 pretrainedVectors=matrix_path,
                                 dim=dims)
    tend = time() - tinit

    predic_and_eval(model, val, yva, 'va', dataset.classification_type,
                    logfile, tend)
    predic_and_eval(model, test, yte, 'te', dataset.classification_type,
                    logfile, tend)
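
fastText's supervised mode reads one document per line with labels marked by the __label__ prefix, so get_input_file (not shown in the snippet) presumably serializes (train, ytr) into that format. Below is a hedged sketch of such a helper; its name and behavior are illustrative assumptions, not the original implementation.

# Hypothetical sketch; the real get_input_file is not shown in this snippet.
def write_fasttext_input(docs, labels, path, label_prefix='__label__'):
    with open(path, 'w') as f:
        for doc, labs in zip(docs, labels):
            # accept either a single label or a list of labels per document
            labs = labs if isinstance(labs, (list, tuple)) else [labs]
            tags = ' '.join(f'{label_prefix}{l}' for l in labs)
            f.write(f'{tags} {doc}\n')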
Example #5
def dump(self, path):
    create_if_not_exist(os.path.dirname(path))
    self.df.to_csv(path, sep='\t')
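
A one-line usage sketch, assuming summary is an instance of the (unnamed) enclosing class and that self.df holds a pandas DataFrame:

# Hypothetical usage; 'summary' stands in for an instance of the enclosing class.
summary.dump('../out/results.tsv')  # creates ../out/ if missing, writes self.df as a TSV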
Example #6
from time import time
from data.domain import pack_domains
from data.tasks import WebisCLS10_task_generator
from domain_adaptation.dci import DCI
from domain_adaptation.pivotselection import pivot_selection
import os, sys
from quantification.helpers import *
from util.file import create_if_not_exist
from os.path import join
# GridSearchCV and LinearSVC are used below; assumed to come from scikit-learn
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

dcf = 'cosine'
npivots = 450
dataset_home = '../'
vectors = '../vectors'

create_if_not_exist(vectors)


def __fit_predict(Xtr, ytr, Xte, svm):
    svm.fit(Xtr, ytr)
    return svm.predict(Xte)


def svm_fit_predict(Xs, ys, Xt, nfolds=10):
    print('Xtr=', Xs.shape, ys.mean())
    print('Xte=', Xt.shape)

    parameters = {'C': [10**i for i in range(-5, 5)]}
    # the snippet is truncated here; the completion below is an assumed sketch:
    # cross-validate C and delegate to __fit_predict
    svm = GridSearchCV(LinearSVC(),
                       parameters,
                       n_jobs=-1,
                       cv=nfolds)
    return __fit_predict(Xs, ys, Xt, svm)


Example #7
def main(opt):
    method_name = set_method_name()
    logfile = init_logfile(method_name, opt)

    dataset = Dataset.load(dataset_name=opt.dataset, pickle_path=opt.pickle_path).show()
    #dataset.devel_raw=dataset.devel_raw[:100]
    #dataset.devel_target = dataset.devel_target[:100]
    #dataset.devel_labelmatrix = dataset.devel_labelmatrix[:100]

    # tokenize and truncate to max_length
    bert = Token2BertEmbeddings('bert-base-uncased', max_length=opt.max_length, device=opt.device)
    tokenize_and_truncate(dataset, bert.tokenizer, opt.max_length)

    # dataset split tr/val/test
    (train_docs, ytr), (val_docs, yval), (test_docs, yte) = train_val_test(dataset)

    wce = None
    if opt.supervised:
        WCE, WCE_range, WCE_vocab = embedding_matrix(opt, dataset)
        wce = Token2WCEmbeddings(
            WCE, WCE_range, WCE_vocab, drop_embedding_prop=opt.sup_drop, device=opt.device, max_length=opt.max_length
        )

    model = init_Net(dataset.nC, bert, wce, opt.device)
    optim = init_optimizer(model, lr=opt.lr, weight_decay=opt.weight_decay)
    criterion = init_loss(dataset.classification_type)

    # train-validate
    tinit = time()
    create_if_not_exist(opt.checkpoint_dir)
    early_stop = EarlyStopping(model, patience=opt.patience,
                               checkpoint=f'{opt.checkpoint_dir}/{opt.net}-{opt.dataset}' if not opt.plotmode else None)

    train_batcher = Batcher(opt.batch_size, opt.max_epoch_length)
    for epoch in range(1, opt.nepochs + 1):
        train(model, train_docs, ytr, tinit, logfile, criterion, optim, epoch, method_name, train_batcher)

        # validation
        macrof1 = test(model, val_docs, yval, dataset.classification_type, tinit, epoch, logfile, criterion, 'va')
        early_stop(macrof1, epoch)
        if opt.test_each>0:
            if (opt.plotmode and (epoch==1 or epoch%opt.test_each==0)) or \
                    (not opt.plotmode and epoch%opt.test_each==0 and epoch<opt.nepochs):
                test(model, test_docs, yte, dataset.classification_type, tinit, epoch, logfile, criterion, 'te')

        if early_stop.STOP:
            print('[early-stop]')
            if not opt.plotmode: # with plotmode activated, early-stop is ignored
                break

    # restores the best model according to the Mf1 of the validation set (only when plotmode==False)
    stoptime = early_stop.stop_time - tinit
    stopepoch = early_stop.best_epoch
    logfile.add_row(epoch=stopepoch, measure='early-stop', value=early_stop.best_score, timelapse=stoptime)

    if not opt.plotmode:
        print('performing final evaluation')
        model = early_stop.restore_checkpoint()

        if opt.val_epochs>0:
            print(f'last {opt.val_epochs} epochs on the validation set')
            for val_epoch in range(1, opt.val_epochs + 1):
                train(model, val_docs, yval, tinit, logfile, criterion, optim, epoch+val_epoch, method_name)

        # test
        print('Training complete: testing')
        test(model, test_docs, yte, dataset.classification_type, tinit, epoch, logfile, criterion, 'final-te')