Example #1
 def test_rtree_index_xml_file(self):
     method_id = self.id().split('.')[-1]
     self.tldb.load_object_from_xml(
         'xml',
         root_path().joinpath('tests/io/in/core/lib/dewey_id/messages.xml'),
         max_n_children=2)
     with self.out_file[method_id].open(mode='w') as f:
         f.write(self.tldb.get_object('xml').ordered_str())
     self.file_compare_default()
Example #2
 def setUpClass(cls):
     super().setUpClass()
     cls.tldb = TLDB('local')
     input_file = (root_path() / 'tests' / 'io' / 'in' / 'cases' /
                   'simple_small' / 'A_B_D_table.dat')
     cls.tldb.load_table_object_from_csv('table',
                                         input_file,
                                         delimiter=' ',
                                         headers=['A', 'B', 'D'],
                                         max_n_children=2)
     cls.table = cls.tldb.get_object('table')
Example #3
 def test_rtree_index_from_folder(self):
     method_id = self.id().split('.')[-1]
     tldb = TLDB('simple_small')
     tldb.load_from_folder(root_path() / 'tests' / 'io' / 'in' / 'cases' /
                           'simple_small',
                           max_n_children=2)
     with self.out_file[method_id].open(mode='w') as f:
         for obj in tldb.all_objects_name:
             f.write(tldb.get_object(obj).ordered_str())
             f.write('-' * 20 + '\n')
     self.file_compare_default()
Example #4
 def test_rtree_index_csv_file(self):
     method_id = self.id().split('.')[-1]
     input_file = (root_path() / 'tests' / 'io' / 'in' / 'cases' /
                   'simple_small' / 'A_B_D_table.dat')
     self.tldb.load_table_object_from_csv('table',
                                          input_file,
                                          delimiter=' ',
                                          index_type='rtree',
                                          headers=['A', 'B', 'D'],
                                          max_n_children=2)
     with self.out_file[method_id].open(mode='w') as f:
         f.write(self.tldb.get_object('table').ordered_str())
     self.file_compare_default()
Example #5
def get_test_suites():
    """Discover a unittest suite for each leaf directory under the test folder."""
    checking_dirs = {root_path() / 'test'}
    suites_dir = set()
    while checking_dirs:
        checking_d = checking_dirs.pop()
        sub_dirs = {
            d
            for d in checking_d.iterdir()
            if d.is_dir() and d.stem != '__pycache__'
        }
        if not sub_dirs:
            suites_dir.add(checking_d)
        else:
            checking_dirs = checking_dirs.union(sub_dirs)
    test_suites = {}
    for d in suites_dir:
        tests = unittest.TestLoader().discover(d)
        if tests.countTestCases() > 0:
            parent = d.parent.stem
            test_suites[f"{parent}.{d.stem}"] = tests
    return test_suites
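A minimal usage sketch for get_test_suites (the runner settings here are illustrative, not from the source):

import unittest

suites = get_test_suites()
runner = unittest.TextTestRunner(verbosity=2)
for name, suite in suites.items():
    print(f"Running suite: {name}")
    runner.run(suite)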
Example #6
def get_languages():
    """
    Find languages in the dataset that has train, dev, and test set.
    If the languages has multiple dataset, chose the directory with the largest amount of tokens
    :return: array of tuples of lang and dir
    """
    lang_to_dir_path = root_path() / 'data' / 'lang_to_dir.pkl'
    if lang_to_dir_path.exists():
        with lang_to_dir_path.open(mode='rb') as f:
            return pickle.load(f)

    # find datasets with train, dev, and test split
    all_dir = [(dir.name.split('-')[0][3:], dir) for dir in data_path.iterdir()
               if len(list(dir.glob('*.conllu'))) > 2]
    languages = dict.fromkeys({t[0] for t in all_dir})
    for lang, dir in all_dir:
        if not languages[lang]:
            languages[lang] = []
        languages[lang].append(dir)
    # get directory with the most amount of tokens
    for lang in languages:
        list_dirs = languages[lang]
        if len(list_dirs) == 1:
            languages[lang] = LanguageDataset(lang, list_dirs[0])
        else:
            lang_stats = []
            for dir in list_dirs:
                with (dir / 'stats.xml').open() as f:
                    stats = xmltodict.parse(f.read())
                    lang_stats.append(int(stats['treebank']['size']['total']['tokens']))
            languages[lang] = LanguageDataset(lang, list_dirs[np.argmax(lang_stats)])
    with lang_to_dir_path.open(mode='wb') as f:
        pickle.dump(languages, f)
    return languages
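A short usage sketch for get_languages (assumption: LanguageDataset exposes the name and repo attributes that trainer() below relies on):

languages = get_languages()
for lang, dataset in languages.items():
    # each value is the LanguageDataset chosen for that language
    print(lang, dataset.repo)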
Example #7
    configs = {
        'n_epochs': args.n_epochs,
        'word_embed_dim': args.word_embed_dim,
        'char_embed_dim': args.char_embed_dim,
        'char_hidden_dim': args.char_hidden_dim,
        'word_hidden_dim': args.word_hidden_dim,
        'optimizer': args.optimizer,
        'lr': args.lr,
        'use_gpu': use_gpu,
        'save_model': args.save_model
    }

    if not args.folder:
        if args.language == 'all':
            for lang in languages:
                trainer(languages[lang], configs)
        else:
            if args.language not in languages:
                raise ValueError(f'language {args.language} not found')
            trainer(languages[args.language], configs)
    else:
        if args.language == 'all':
            raise ValueError(
                'Cannot train all languages with a designated folder. '
                'Please remove the --folder argument to train all languages')
        path = root_path() / 'data' / 'ud-treebanks-v2.3' / args.folder
        if not path.exists():
            raise ValueError('Folder not found')
        lang_dataset = LanguageDataset(args.language, path)
        trainer(lang_dataset, configs)
Example #8

if __name__ == '__main__':
    parser = ArgParser(description='test file-converter')
    parser.add_argument('test',
                        help="Test a/all test suite(s) or a specific test case inside the test folder",
                        type=str)
    parser.add_argument('--verbosity', choices=[1, 2], help="Test verbosity (default 2)", type=int, default=2)
    parser.add_argument('--interactive', help='Interactive testing with meld (default False)', action='store_true')
    parser.add_argument('--folder', help='Test folder (e.g: test/sub_test) (default test)', type=str, default='test')

    args = parser.parse_args()

    t_suites = get_suites(args.folder)

    DEFAULT_INPUT_FOLDER = root_path().joinpath(args.folder) / 'io' / 'in'
    DEFAULT_OUTPUT_FOLDER = root_path().joinpath(args.folder) / 'io' / 'out'
    DEFAULT_INPUT_FOLDER.mkdir(parents=True, exist_ok=True)
    DEFAULT_OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

    if args.interactive:
        result_class = MeldInteractiveTestResult
    else:
        result_class = unittest.TextTestResult
    runner = unittest.TextTestRunner(verbosity=args.verbosity, resultclass=result_class)

    final_res = False
    if args.test:
        if args.test == 'all':
            results = set()
            for s in t_suites:
Example #9
 def setUpClass(cls):
     cls.input_folder = root_path() / 'test' / 'io' / 'in'
     cls.output_folder = root_path() / 'test' / 'io' / 'out'
     cls.out_file = {}
     cls.exp_file = {}
     cls.in_file = {}
Example #10
def trainer(lang_data, configs):
    vocab = lang_data.vocab
    alphabet = lang_data.alphabet

    meta = lang_data.meta
    use_gpu = configs['use_gpu']
    model = CustomedBiLstm(alphabet_size=len(alphabet),
                           vocab_size=len(vocab),
                           word_embed_dim=configs['word_embed_dim'],
                           char_embed_dim=configs['char_embed_dim'],
                           char_hidden_dim=configs['char_hidden_dim'],
                           word_hidden_dim=configs['word_hidden_dim'],
                           n_tags=meta['n_tags'],
                           use_gpu=use_gpu)
    if use_gpu:
        model.cuda()

    loss_function = nn.NLLLoss()
    if configs['optimizer'] == 'Adam':
        optimizer = optim.Adam(model.parameters(), lr=configs['lr'])
    elif configs['optimizer'] == 'SGD':
        optimizer = optim.SGD(model.parameters(), lr=configs['lr'])
    else:
        raise ValueError(f"Unsupported optimizer: {configs['optimizer']}")

    n_try = 0
    log_dir = root_path() / 'src' / 'out' / 'log'
    log_path = log_dir / f"{lang_data.name}_{n_try}.log"
    while log_path.exists():
        n_try += 1
        log_path = log_dir / f"{lang_data.name}_{n_try}.log"
    logging.basicConfig(filename=str(log_path), level=logging.INFO)
    logging.getLogger('trainer')

    results = {
        'Language': lang_data.name,
        'Repo': lang_data.repo.stem,
        'Stats': {
            'n_tokens': meta['n_tokens'],
            'n_train': len(lang_data.train_split.tokens),
            'n_dev': len(lang_data.dev_split.tokens),
            'n_test': len(lang_data.test_split.tokens)
        },
        'Config': configs,
        'Model': str(model),
        'Time': [],
        'Performance': []
    }
    logging.info(f"Language: {lang_data.name} \n")
    logging.info(f"Repo: {lang_data.repo} \n")
    logging.info(f"Number of tokens: {meta['n_tokens']} \n")
    logging.info(f"Train size: {len(lang_data.train_split.tokens)}, "
                 f"Dev size: {len(lang_data.dev_split.tokens)}, "
                 f"Test size: {len(lang_data.test_split.tokens)}")
    logging.info(f"Model: {model} \n")
    logging.info(f"Config: {configs}\n")

    for epoch in range(configs['n_epochs']):
        logging.info(f"epoch: {epoch}\n")
        epoch_time = {'epoch': epoch + 1}
        epoch_perf = {'epoch': epoch + 1}
        start_epoch = timeit.default_timer()

        indices = np.arange(len(lang_data.train_split.tokens))
        shuffle(indices)
        train_tokens = [lang_data.train_split.tokens[idx] for idx in indices]
        train_tags = [lang_data.train_split.tags[idx] for idx in indices]

        total_loss = 0
        for idx in range(len(train_tokens)):
            tokens_tensor, char_tensor, tags_tensor = get_one_batch(
                train_tokens[idx], train_tags[idx], vocab, alphabet,
                meta['all_tags'])
            if use_gpu:
                tokens_tensor = tokens_tensor.cuda()
                char_tensor = char_tensor.cuda()
                tags_tensor = tags_tensor.cuda()
            model.zero_grad()  # reset gradients before each step
            log_probs = model(tokens_tensor, char_tensor)
            batch_loss = loss_function(log_probs, tags_tensor)
            batch_loss.backward()
            optimizer.step()
            total_loss += batch_loss.item()  # .item() avoids retaining the graph

        training_time = timeit.default_timer() - start_epoch
        logging.info('\t training the model took %.4f \n' % training_time)
        epoch_time['train'] = training_time

        train_acc, train_eval_time = f_timer(evaluate, lang_data.train_split,
                                             model, vocab, alphabet,
                                             meta['all_tags'], use_gpu)
        dev_acc, dev_eval_time = f_timer(evaluate, lang_data.dev_split, model,
                                         vocab, alphabet, meta['all_tags'],
                                         use_gpu)
        logging.info('\t evaluation train split took %.4f \n' %
                     train_eval_time)
        logging.info('\t evaluation dev took %.4f \n' % dev_eval_time)
        epoch_time['train_eval'] = train_eval_time
        epoch_time['dev_eval'] = dev_eval_time

        logging.info('\t one epoch took %.4f \n' %
                     (timeit.default_timer() - start_epoch))
        logging.info('\t loss: %.4f, train acc: %.3f, dev acc: %.3f \n' %
                     (total_loss, train_acc, dev_acc))
        epoch_perf['loss'] = ("%.4f" % total_loss)
        epoch_perf['train_acc'] = ("%.3f" % train_acc)
        epoch_perf['dev_acc'] = ("%.3f" % dev_acc)

        results['Time'].append(epoch_time)
        results['Performance'].append(epoch_perf)

    test_acc = evaluate(lang_data.test_split, model, vocab, alphabet,
                        meta['all_tags'], use_gpu)
    logging.info('test acc: %.3f%% \n' % test_acc)
    results['Accuracy'] = test_acc

    with (root_path() / 'src' / 'out' / 'test' /
          (lang_data.name + '.json')).open(mode='w') as f:
        json.dump(results, f, indent=4, sort_keys=True)

    if configs['save_model']:
        model_name = lang_data.name + '.model'
        torch.save(model, root_path() / 'src' / 'out' / 'cache' / model_name)
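f_timer is called above but not shown; here is a minimal sketch under the assumption that it simply times the wrapped call and returns the function's result together with the elapsed seconds:

import timeit

def f_timer(func, *args, **kwargs):
    # time a single call to func and return (result, elapsed_seconds)
    start = timeit.default_timer()
    result = func(*args, **kwargs)
    return result, timeit.default_timer() - start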
Example #11
from pathlib import Path
from typing import List

from config import root_path
from tldb.core.structure.entry import Entry
from tldb.core.structure.dewey_id import DeweyID

import numpy as np
import logging
import timeit

data_path = root_path() / 'test' / 'io' / 'in' / 'cases'


def get_index_highest_element(all_elements_name: List[str],
                              table_name: str) -> int:
    """
    Return the index of the highest-level element (in the XML query) of a table name.
    e.g. if the query is A->B then all_elements_name is ['A', 'B'], and
    get_index_highest_element(['A', 'B'], 'B_A') returns 1

    :param all_elements_name: list of element names in the XML query, in level order
    :param table_name: table name whose component elements are joined by '_'
    :return: index of the highest-level element
    """
    table_elements = table_name.split('_')
    index = []
    for element_name in table_elements:
        index.append(all_elements_name.index(element_name))
    return int(np.argmin(index))


def load_text_file(file_path: Path) -> List[str]:
Example #12
 def addSuccess(self, test):
     super().addSuccess(test)
     with (root_path() / 'test' / 'io' / 'out' / 'metrics.log').open(mode='a+') as f:
         f.write(f"{test.id()}, success, {test._elapsed:5f}\n")
Example #13
 def addFailure(self, test, err):
     super().addFailure(test, err)
     with (root_path() / 'test' / 'io' / 'out' / 'metrics.log').open(mode='a+') as f:
         f.write(f"{test.id()}, fail, {test._elapsed:5f}\n")
Example #14
                        help=f"Test verbosity (default 2)",
                        type=int,
                        default=2)
    parser.add_argument(
        '--meld',
        help='Use meld to compare out and exp file (default False)',
        action='store_true')

    args = parser.parse_args()

    if args.meld:
        result_class = TestResultCompareFileMeld
    else:
        result_class = TestResultLogMetrics

    out_dir = root_path() / 'test' / 'io' / 'out'
    out_dir.mkdir(parents=True, exist_ok=True)
    # TODO: ugly fix to remove metrics.log file
    metrics_log = out_dir / 'metrics.log'
    if metrics_log.is_file():
        metrics_log.unlink()

    runner = unittest.TextTestRunner(verbosity=args.verbosity,
                                     resultclass=result_class)

    result = False
    if args.test:
        if args.test == 'all':
            results = set()
            for s in t_suites:
                results.add(runner.run(t_suites[s]).wasSuccessful())
            result = all(results)
        else:
Example #15
import pickle

import conllu
import numpy as np
import xmltodict

from config import root_path
from src.util.nlp import build_vocab_from_sentences_tokens, build_alphabet_from_sentence_tokens

data_path = root_path() / 'data' / 'ud-treebanks-v2.3'


def get_languages():
    """
    Find languages in the dataset that has train, dev, and test set.
    If the languages has multiple dataset, chose the directory with the largest amount of tokens
    :return: array of tuples of lang and dir
    """
    lang_to_dir_path = root_path() / 'data' / 'lang_to_dir.pkl'
    if lang_to_dir_path.exists():
        with lang_to_dir_path.open(mode='rb') as f:
            return pickle.load(f)

    # find datasets with train, dev, and test split
    all_dir = [(dir.name.split('-')[0][3:], dir) for dir in data_path.iterdir()
               if len(list(dir.glob('*.conllu'))) > 2]
    languages = dict.fromkeys({t[0] for t in all_dir})
    for lang, dir in all_dir:
        if not languages[lang]: