def test_rtree_index_xml_file(self):
    """Load the ``messages.xml`` fixture and dump it for comparison.

    The XML file is loaded into the shared ``self.tldb`` under the name
    ``'xml'`` with ``max_n_children=2``. Its ``ordered_str()`` output is
    written to the output file registered for this test method, after
    which ``self.file_compare_default()`` is called (presumably to compare
    the output against the expected file).
    """
    # The last dotted component of the test id is the method name, which
    # is the key used in ``self.out_file``.
    case_name = self.id().rsplit('.', 1)[-1]

    xml_fixture = root_path().joinpath(
        'tests/io/in/core/lib/dewey_id/messages.xml')
    self.tldb.load_object_from_xml('xml', xml_fixture, max_n_children=2)

    with self.out_file[case_name].open(mode='w') as out_stream:
        xml_object = self.tldb.get_object('xml')
        out_stream.write(xml_object.ordered_str())

    self.file_compare_default()
def setUpClass(cls):
    """Create the shared ``TLDB`` instance and load the A/B/D table into it.

    After the parent class set-up runs, a ``TLDB('local')`` is stored on
    the class, the space-delimited ``A_B_D_table.dat`` file is loaded as
    the object ``'table'`` (columns ``A``, ``B``, ``D``;
    ``max_n_children=2``) and the loaded object is kept in ``cls.table``.
    """
    super().setUpClass()
    cls.tldb = TLDB('local')

    # Despite living under a "cases" folder, this path points to one file.
    table_file = root_path().joinpath(
        'tests', 'io', 'in', 'cases', 'simple_small', 'A_B_D_table.dat')
    cls.tldb.load_table_object_from_csv(
        'table',
        table_file,
        delimiter=' ',
        headers=['A', 'B', 'D'],
        max_n_children=2,
    )

    cls.table = cls.tldb.get_object('table')
def test_rtree_index_from_folder(self):
    """Load the whole ``simple_small`` case folder and dump every object.

    A fresh ``TLDB('simple_small')`` is populated from the case folder
    with ``max_n_children=2``. For each name in ``all_objects_name`` the
    object's ``ordered_str()`` is written to this test's output file,
    followed by a line of twenty dashes. ``self.file_compare_default()``
    is then called.
    """
    case_name = self.id().rsplit('.', 1)[-1]

    tldb = TLDB('simple_small')
    case_dir = root_path().joinpath(
        'tests', 'io', 'in', 'cases', 'simple_small')
    tldb.load_from_folder(case_dir, max_n_children=2)

    separator = '-' * 20 + '\n'
    with self.out_file[case_name].open(mode='w') as out_stream:
        for object_name in tldb.all_objects_name:
            out_stream.write(tldb.get_object(object_name).ordered_str())
            out_stream.write(separator)

    self.file_compare_default()
def test_rtree_index_csv_file(self):
    """Load the A/B/D table with an R-tree index and dump it for comparison.

    The space-delimited ``A_B_D_table.dat`` file is loaded into the
    shared ``self.tldb`` as ``'table'`` with ``index_type='rtree'`` and
    ``max_n_children=2``. The object's ``ordered_str()`` is written to
    this test's output file and ``self.file_compare_default()`` is called.
    """
    case_name = self.id().rsplit('.', 1)[-1]

    table_file = root_path().joinpath(
        'tests', 'io', 'in', 'cases', 'simple_small', 'A_B_D_table.dat')
    self.tldb.load_table_object_from_csv(
        'table',
        table_file,
        delimiter=' ',
        index_type='rtree',
        headers=['A', 'B', 'D'],
        max_n_children=2,
    )

    with self.out_file[case_name].open(mode='w') as out_stream:
        table_object = self.tldb.get_object('table')
        out_stream.write(table_object.ordered_str())

    self.file_compare_default()
def get_test_suites():
    """Discover one unittest suite per leaf directory under ``<root>/test``.

    The directory tree is walked from ``root_path() / 'test'``, ignoring
    ``__pycache__`` folders. Every directory without remaining
    sub-directories is handed to ``unittest`` discovery; suites that
    contain at least one test case are kept.

    :return: dict mapping ``"<parent dir stem>.<dir stem>"`` to the
        discovered test suite
    """
    pending = {root_path() / 'test'}
    leaf_dirs = set()

    while pending:
        current = pending.pop()
        children = set()
        for entry in current.iterdir():
            if entry.is_dir() and entry.stem != '__pycache__':
                children.add(entry)

        if children:
            pending = pending | children
        else:
            leaf_dirs.add(current)

    suites = {}
    for leaf in leaf_dirs:
        # A new loader per directory: a loader remembers the top-level
        # directory of its first discovery.
        discovered = unittest.TestLoader().discover(leaf)
        if discovered.countTestCases() <= 0:
            continue
        suites[f"{leaf.parent.stem}.{leaf.stem}"] = discovered

    return suites
def get_languages():
    """
    Map each language with a train, dev and test split to its dataset.

    A pickled result in ``<root>/data/lang_to_dir.pkl`` is returned
    directly when it exists. Otherwise every folder below ``data_path``
    holding more than two ``*.conllu`` files is grouped by language.
    When a language has several folders, the one whose ``stats.xml``
    reports the most tokens is chosen. The result is pickled to the
    cache file before being returned.

    :return: dict mapping the language name to its ``LanguageDataset``
    """
    cache_file = root_path() / 'data' / 'lang_to_dir.pkl'
    if cache_file.exists():
        with cache_file.open(mode='rb') as cache:
            return pickle.load(cache)

    # find datasets with train, dev, and test split
    candidates = []
    for treebank in data_path.iterdir():
        if len(list(treebank.glob('*.conllu'))) > 2:
            # the language is the folder name before the first '-',
            # with its first three characters dropped
            candidates.append((treebank.name.split('-')[0][3:], treebank))

    languages = dict.fromkeys(set(name for name, _ in candidates))
    for name, treebank in candidates:
        if not languages[name]:
            languages[name] = []
        languages[name].append(treebank)

    # get directory with the most amount of tokens
    for name in languages:
        treebanks = languages[name]
        if len(treebanks) == 1:
            chosen = treebanks[0]
        else:
            token_counts = []
            for treebank in treebanks:
                with (treebank / 'stats.xml').open() as stats_file:
                    stats = xmltodict.parse(stats_file.read())
                token_counts.append(
                    int(stats['treebank']['size']['total']['tokens']))
            chosen = treebanks[np.argmax(token_counts)]
        languages[name] = LanguageDataset(name, chosen)

    with cache_file.open(mode='wb') as cache:
        pickle.dump(languages, cache)
    return languages
# Hyper-parameters and switches handed to trainer(), taken from the
# command-line arguments (plus the externally determined use_gpu flag).
configs = {
    'n_epochs': args.n_epochs,
    'word_embed_dim': args.word_embed_dim,
    'char_embed_dim': args.char_embed_dim,
    'char_hidden_dim': args.char_hidden_dim,
    'word_hidden_dim': args.word_hidden_dim,
    'optimizer': args.optimizer,
    'lr': args.lr,
    'use_gpu': use_gpu,
    'save_model': args.save_model
}

if args.folder:
    # An explicit treebank folder only makes sense for a single language.
    if args.language == 'all':
        raise ValueError(
            'Cannot train all language with designated folder. '
            'Please remove --folder arguments to train all languages')
    path = root_path() / 'data' / 'ud-treebanks-v2.3' / args.folder
    if not path.exists():
        raise ValueError('Folder not found')
    lang_dataset = LanguageDataset(args.language, path)
    trainer(lang_dataset, configs)
elif args.language == 'all':
    # Train one model per known language.
    for lang in languages:
        trainer(languages[lang], configs)
else:
    if args.language not in languages:
        raise ValueError(f'language {args.language} not found')
    trainer(languages[args.language], configs)
if __name__ == '__main__': parser = ArgParser(description='test file-converter') parser.add_argument('test', help=f"Test a/all test suite(s) or a specific test cases inside the test folder", type=str) parser.add_argument('--verbosity', choices=[1, 2], help=f"Test verbosity (default 2)", type=int, default=2) parser.add_argument('--interactive', help='Interactive testing with meld (default False)', action='store_true') parser.add_argument('--folder', help='Test folder (e.g: test/sub_test) (default test)', type=str, default='test') args = parser.parse_args() t_suites = get_suites(args.folder) DEFAULT_INPUT_FOLDER = root_path().joinpath(args.folder) / 'io' / 'in' DEFAULT_OUTPUT_FOLDER = root_path().joinpath(args.folder) / 'io' / 'out' DEFAULT_INPUT_FOLDER.mkdir(parents=True, exist_ok=True) DEFAULT_OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True) if args.interactive: result_class = MeldInteractiveTestResult else: result_class = unittest.TextTestResult runner = unittest.TextTestRunner(verbosity=args.verbosity, resultclass=result_class) final_res = False if args.test: if args.test == 'all': results = set() for s in t_suites:
def setUpClass(cls):
    """Set the class-level I/O folders and empty per-test file registries.

    ``input_folder`` and ``output_folder`` point to ``test/io/in`` and
    ``test/io/out`` below the project root. ``out_file``, ``exp_file``
    and ``in_file`` each start as an independent empty dict.
    """
    test_io = ('test', 'io')
    cls.input_folder = root_path().joinpath(*test_io, 'in')
    cls.output_folder = root_path().joinpath(*test_io, 'out')

    # Each registry gets its own dict object.
    for registry in ('out_file', 'exp_file', 'in_file'):
        setattr(cls, registry, {})
def trainer(lang_data, configs):
    """Train, evaluate and optionally save a ``CustomedBiLstm`` tagger.

    The model is trained one sentence at a time for ``configs['n_epochs']``
    epochs. After each epoch it is evaluated on the train and dev splits,
    and after the last epoch on the test split. Progress is logged to
    ``src/out/log/<name>_<n>.log`` and a JSON summary is written to
    ``src/out/test/<name>.json``. When ``configs['save_model']`` is truthy
    the model is saved to ``src/out/cache/<name>.model``.

    :param lang_data: dataset object providing ``vocab``, ``alphabet``,
        ``meta`` (with ``n_tags``, ``n_tokens``, ``all_tags``), ``name``,
        ``repo`` and the ``train_split`` / ``dev_split`` / ``test_split``
        splits (each with ``tokens``; the train split also with ``tags``)
    :param configs: dict with the keys ``use_gpu``, ``word_embed_dim``,
        ``char_embed_dim``, ``char_hidden_dim``, ``word_hidden_dim``,
        ``optimizer`` (``'Adam'`` or ``'SGD'``), ``lr``, ``n_epochs`` and
        ``save_model``
    :raises ValueError: if ``configs['optimizer']`` is neither ``'Adam'``
        nor ``'SGD'``
    """
    vocab = lang_data.vocab
    alphabet = lang_data.alphabet
    meta = lang_data.meta
    use_gpu = configs['use_gpu']

    # Reject an unknown optimizer before any expensive work. Previously
    # this only surfaced as an unbound local at the first optimizer.step().
    optimizer_classes = {'Adam': optim.Adam, 'SGD': optim.SGD}
    optimizer_name = configs['optimizer']
    if optimizer_name not in optimizer_classes:
        raise ValueError(
            f"unsupported optimizer {optimizer_name!r}; "
            f"expected one of {sorted(optimizer_classes)}")

    model = CustomedBiLstm(alphabet_size=len(alphabet),
                           vocab_size=len(vocab),
                           word_embed_dim=configs['word_embed_dim'],
                           char_embed_dim=configs['char_embed_dim'],
                           char_hidden_dim=configs['char_hidden_dim'],
                           word_hidden_dim=configs['word_hidden_dim'],
                           n_tags=meta['n_tags'],
                           use_gpu=use_gpu)
    if use_gpu:
        model.cuda()

    loss_function = nn.NLLLoss()
    optimizer = optimizer_classes[optimizer_name](model.parameters(),
                                                  lr=configs['lr'])

    out_dir = root_path() / 'src' / 'out'
    log_dir = out_dir / 'log'
    result_dir = out_dir / 'test'
    cache_dir = out_dir / 'cache'
    # Make sure a fresh checkout does not fail on missing output folders.
    for directory in (log_dir, result_dir, cache_dir):
        directory.mkdir(parents=True, exist_ok=True)

    # Pick the first log file name that is not taken yet.
    n_try = 0
    log_path = log_dir / (lang_data.name + '_' + str(n_try) + '.log')
    while log_path.exists():
        n_try += 1
        log_path = log_dir / (lang_data.name + '_' + str(n_try) + '.log')

    # NOTE(review): logging.basicConfig does nothing once the root logger
    # already has handlers, so repeated trainer() calls in one process keep
    # writing to the first log file - confirm whether one log file per
    # language is intended.
    logging.basicConfig(filename=str(log_path), level=logging.INFO)

    n_train = len(lang_data.train_split.tokens)
    n_dev = len(lang_data.dev_split.tokens)
    n_test = len(lang_data.test_split.tokens)

    results = {
        'Language': lang_data.name,
        'Repo': lang_data.repo.stem,
        'Stats': {
            'n_tokens': meta['n_tokens'],
            'n_train': n_train,
            'n_dev': n_dev,
            'n_test': n_test
        },
        'Config': configs,
        'Model': str(model),
        'Time': [],
        'Performance': []
    }

    logging.info(f"Language: {lang_data.name} \n")
    logging.info(f"Repo: {lang_data.repo} \n")
    logging.info(f"Number of tokens: {meta['n_tokens']} \n")
    logging.info(f"Train size: {n_train}, "
                 f"Dev size: {n_dev}, "
                 f"Test size: {n_test}")
    logging.info(f"Model: {model} \n")
    logging.info(f"Config: {configs}\n")

    for epoch in range(configs['n_epochs']):
        logging.info(f"epoch: {epoch}\n")
        epoch_time = {'epoch': epoch + 1}
        epoch_perf = {'epoch': epoch + 1}
        start_epoch = timeit.default_timer()

        # Visit the training sentences in a new random order every epoch.
        indices = np.arange(n_train)
        shuffle(indices)
        train_tokens = [lang_data.train_split.tokens[idx] for idx in indices]
        train_tags = [lang_data.train_split.tags[idx] for idx in indices]

        total_loss = 0.0
        for sentence_tokens, sentence_tags in zip(train_tokens, train_tags):
            # Gradients accumulate across backward() calls, so they must be
            # cleared before every step, not once per epoch.
            model.zero_grad()

            tokens_tensor, char_tensor, tags_tensor = get_one_batch(
                sentence_tokens, sentence_tags, vocab, alphabet,
                meta['all_tags'])
            if use_gpu:
                tokens_tensor = tokens_tensor.cuda()
                char_tensor = char_tensor.cuda()
                tags_tensor = tags_tensor.cuda()

            log_probs = model(tokens_tensor, char_tensor)
            batch_loss = loss_function(log_probs, tags_tensor)
            batch_loss.backward()
            optimizer.step()

            # .item() detaches the value, so the autograd graph of every
            # sentence is not kept alive for the whole epoch.
            total_loss += batch_loss.item()

        training_time = timeit.default_timer() - start_epoch
        logging.info('\t training the model took %.4f \n' % training_time)
        epoch_time['train'] = training_time

        train_acc, train_eval_time = f_timer(evaluate, lang_data.train_split,
                                             model, vocab, alphabet,
                                             meta['all_tags'], use_gpu)
        dev_acc, test_eval_time = f_timer(evaluate, lang_data.dev_split,
                                          model, vocab, alphabet,
                                          meta['all_tags'], use_gpu)
        logging.info('\t evaluation train split took %.4f \n'
                     % train_eval_time)
        logging.info('\t evaluation dev took %.4f \n' % test_eval_time)
        # The 'test_eval' key holds the dev-split evaluation time; the key
        # name is kept because it is part of the JSON output format.
        epoch_time['train_eval'] = train_eval_time
        epoch_time['test_eval'] = test_eval_time

        logging.info('\t one epoch took %.4f \n'
                     % (timeit.default_timer() - start_epoch))
        logging.info('\t loss: %.4f, train acc: %.3f, dev acc: %.3f \n'
                     % (total_loss, train_acc, dev_acc))
        epoch_perf['loss'] = ("%.4f" % total_loss)
        epoch_perf['train_acc'] = ("%.3f" % train_acc)
        epoch_perf['dev_acc'] = ("%.3f" % dev_acc)

        results['Time'].append(epoch_time)
        results['Performance'].append(epoch_perf)

    test_acc = evaluate(lang_data.test_split, model, vocab, alphabet,
                        meta['all_tags'], use_gpu)
    logging.info('test acc: %.3f%% \n' % test_acc)
    results['Accuracy'] = test_acc

    with (result_dir / (lang_data.name + '.json')).open(mode='w') as f:
        json.dump(results, f, indent=4, sort_keys=True)

    if configs['save_model']:
        model_name = lang_data.name + '.model'
        torch.save(model, cache_dir / model_name)
from pathlib import Path from config import root_path from tldb.core.structure.entry import Entry from tldb.core.structure.dewey_id import DeweyID import numpy as np import logging import timeit data_path = root_path() / 'test' / 'io' / 'in' / 'cases' def get_index_highest_element(all_elements_name: [str], table_name: str) -> int: """Summary This function return the index of highest level element (in XML query) of a table name e.g: If query is A->B then all_elements_name would be ['A', 'B'] get_index_highest_element(['A, B'], 'B_A') returns 1 :param all_elements_name: list of elements in XML query by level order :param table_name: :return: index of highest element """ table_elements = table_name.split('_') index = [] for element_name in table_elements: index.append(all_elements_name.index(element_name)) return np.argmin(np.asarray(index)) def load_text_file(file_path: Path) -> [str]:
def addSuccess(self, test):
    """Record a passing test and append its timing to ``metrics.log``.

    After the parent class handles the success, one line of the form
    ``"<test id>, success, <elapsed>"`` is appended to
    ``test/io/out/metrics.log`` under the project root. ``test._elapsed``
    is expected to have been set on the test beforehand.
    """
    super().addSuccess(test)

    metrics_log = root_path().joinpath('test', 'io', 'out', 'metrics.log')
    with metrics_log.open(mode='a+') as log:
        log.write(f"{test.id()}, success, {test._elapsed:5f}\n")
def addFailure(self, test, err):
    """Record a failing test and append its timing to ``metrics.log``.

    After the parent class handles the failure, one line of the form
    ``"<test id>, fail, <elapsed>"`` is appended to
    ``test/io/out/metrics.log`` under the project root. ``test._elapsed``
    is expected to have been set on the test beforehand.
    """
    super().addFailure(test, err)

    metrics_log = root_path().joinpath('test', 'io', 'out', 'metrics.log')
    with metrics_log.open(mode='a+') as log:
        log.write(f"{test.id()}, fail, {test._elapsed:5f}\n")
help=f"Test verbosity (default 2)", type=int, default=2) parser.add_argument( '--meld', help='Use meld to compare out and exp file (default False)', action='store_true') args = parser.parse_args() if args.meld: result_class = TestResultCompareFileMeld else: result_class = TestResultLogMetrics (root_path() / 'test' / 'io' / 'out').mkdir(parents=True, exist_ok=True) # TODO: ugly fix to remove metrics.log file if (root_path() / 'test' / 'io' / 'out' / 'metrics.log').is_file(): (root_path() / 'test' / 'io' / 'out' / 'metrics.log').unlink() runner = unittest.TextTestRunner(verbosity=args.verbosity, resultclass=result_class) result = False if args.test: if args.test == 'all': results = set() for s in t_suites: results.add(runner.run(t_suites[s]).wasSuccessful()) result = all(results) else:
import pickle import conllu import numpy as np import xmltodict from config import root_path from src.util.nlp import build_vocab_from_sentences_tokens, build_alphabet_from_sentence_tokens data_path = root_path() / 'data' / 'ud-treebanks-v2.3' def get_languages(): """ Find languages in the dataset that has train, dev, and test set. If the languages has multiple dataset, chose the directory with the largest amount of tokens :return: array of tuples of lang and dir """ lang_to_dir_path = root_path() / 'data' / 'lang_to_dir.pkl' if lang_to_dir_path.exists(): with lang_to_dir_path.open(mode='rb') as f: return pickle.load(f) # find datasets with train, dev, and test split all_dir = [(dir.name.split('-')[0][3:], dir) for dir in data_path.iterdir() if len(list(dir.glob('*.conllu'))) > 2] languages = dict.fromkeys(list(set(t[0] for t in all_dir))) for t in all_dir: lang = t[0] dir = t[1] if not languages[lang]: