def test_save(self):
    """Checks that a catalogue round-trips through save() and load() unchanged."""
    catalogue = tacl.Catalogue()
    entries = (
        ('T2102-辯惑論序', 'label1'),
        ('T2102-通直郎庾黔婁答', 'label2'),
        ('T2102-遠法師答', 'label1'),
    )
    for work, label in entries:
        catalogue[work] = label
    with tempfile.TemporaryDirectory() as tmp_dir:
        catalogue_path = os.path.join(tmp_dir, 'catalogue.txt')
        catalogue.save(catalogue_path)
        reloaded = tacl.Catalogue()
        reloaded.load(catalogue_path)
        self.assertEqual(catalogue.items(), reloaded.items())
def text_in_corpus(args):
    """Writes per-text catalogues and a script of tacl commands to run.

    For each text listed in args.texts, saves a catalogue labelling that
    text 'A' and every other text 'REST', then records an intersect
    command and a follow-up report command. All commands are written to
    a single 'commands' file in the output directory.
    """
    texts = args.texts.read().strip().split()
    output_dir = os.path.abspath(args.output_dir)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # Every text starts as 'REST'; each loop iteration temporarily
    # relabels one text as 'A' before saving that iteration's catalogue.
    catalogue = tacl.Catalogue({text: 'REST' for text in texts})
    options = _copy_options(args)
    commands = []
    for text in texts:
        base_name = os.path.splitext(text)[0]
        catalogue_path = os.path.join(
            output_dir, '{}-catalogue.txt'.format(base_name))
        results_path = os.path.join(
            output_dir, '{}-results.csv'.format(base_name))
        reduced_path = os.path.join(
            output_dir, '{}-reduced.csv'.format(base_name))
        catalogue[text] = 'A'
        catalogue.save(catalogue_path)
        commands.append('tacl intersect{} {} {} {} > {}\n'.format(
            options, args.db, args.corpus, catalogue_path, results_path))
        commands.append('tacl report --reduce --remove REST {} > {}\n'.format(
            results_path, reduced_path))
        # Restore the label so the next iteration starts clean.
        catalogue[text] = 'REST'
    commands_path = os.path.join(output_dir, 'commands')
    with open(commands_path, 'w') as fh:
        fh.writelines(commands)
def test_add_ngrams_with_catalogue(self):
    """Checks the rows stored when add_ngrams is given a catalogue.

    Only works named in the catalogue (T1 and T5) should have their
    1-grams recorded.
    """
    catalogue = tacl.Catalogue({'T1': 'A', 'T5': 'B'})
    store = tacl.DataStore(':memory:')
    store.add_ngrams(self._corpus, 1, 1, catalogue)
    store._conn.row_factory = None
    actual_rows = store._conn.execute(
        'SELECT Text.work, Text.siglum, Text.checksum, Text.label, '
        'TextNGram.ngram, TextNGram.size, TextNGram.count '
        'FROM Text, TextNGram WHERE Text.id = TextNGram.text').fetchall()
    # (work, siglum, checksum, {ngram: count}) per witness; the stored
    # label is the empty string and every n-gram here has size 1.
    witness_counts = [
        ('T1', 'base', '705c89d665a5300516fe7314f84ebce0',
         {'t': 2, 'h': 1, 'e': 3, 'n': 2, 'w': 2}),
        ('T1', 'a', 'e898b184b8d4d3ab5fea9d79fd645135',
         {'t': 2, 'h': 1, 'e': 3, 'w': 2, 'n': 1}),
        ('T5', 'base', '1b42a11f5f647e53d20da8c8f57a9f02',
         {'w': 1, 'e': 1, 'l': 2}),
    ]
    expected_rows = set()
    for work, siglum, checksum, counts in witness_counts:
        for ngram, count in counts.items():
            expected_rows.add((work, siglum, checksum, '', ngram, 1, count))
    self.assertEqual(set(actual_rows), expected_rows)
def results(args, parser):
    """Processes a results file according to the command-line arguments.

    Reads results from stdin when args.results is '-', otherwise from the
    named file.  Each requested operation is applied to the results in a
    fixed order (the order below, not the order the options were given on
    the command line), and the final results are written as CSV to stdout.
    """
    # '-' means read results from standard input; re-wrap stdin so the
    # encoding and newline handling match the file-based branch.
    if args.results == '-':
        results_fh = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8',
                                      newline='')
    else:
        results_fh = open(args.results, 'r', encoding='utf-8', newline='')
    tokenizer = utils.get_tokenizer(args)
    results = tacl.Results(results_fh, tokenizer)
    if args.extend:
        corpus = tacl.Corpus(args.extend, tokenizer)
        results.extend(corpus)
    if args.bifurcated_extend:
        # Bifurcated extend needs a size limit; fail early with a usage
        # error rather than calling the method with a missing argument.
        if not args.bifurcated_extend_size:
            parser.error('The bifurcated extend option requires that the '
                         '--max-be-count option also be supplied')
        corpus = tacl.Corpus(args.bifurcated_extend, tokenizer)
        results.bifurcated_extend(corpus, args.bifurcated_extend_size)
    if args.reduce:
        results.reduce()
    if args.reciprocal:
        results.reciprocal_remove()
    if args.excise:
        results.excise(args.excise)
    if args.zero_fill:
        corpus = tacl.Corpus(args.zero_fill, tokenizer)
        results.zero_fill(corpus)
    if args.ngrams:
        # Keep only results whose n-gram appears in the supplied file
        # (whitespace-separated n-grams).
        with open(args.ngrams, encoding='utf-8') as fh:
            ngrams = fh.read().split()
        results.prune_by_ngram(ngrams)
    if args.min_works or args.max_works:
        results.prune_by_work_count(args.min_works, args.max_works)
    if args.min_size or args.max_size:
        results.prune_by_ngram_size(args.min_size, args.max_size)
    if args.min_count or args.max_count:
        results.prune_by_ngram_count(args.min_count, args.max_count)
    if args.min_count_work or args.max_count_work:
        results.prune_by_ngram_count_per_work(args.min_count_work,
                                              args.max_count_work)
    if args.remove:
        results.remove_label(args.remove)
    if args.sort:
        results.sort()
    # Run format-changing operations last.
    if args.add_label_count:
        results.add_label_count()
    if args.add_label_work_count:
        results.add_label_work_count()
    if args.group_by_ngram:
        catalogue = tacl.Catalogue()
        catalogue.load(args.group_by_ngram)
        results.group_by_ngram(catalogue.ordered_labels)
    if args.group_by_witness:
        results.group_by_witness()
    if args.collapse_witnesses:
        results.collapse_witnesses()
    results.csv(sys.stdout)
def test_generate(self):
    """Checks that generate() labels every file in the directory listing."""
    listdir = self._create_patch('os.listdir')
    listdir.return_value = [sentinel.filename1, sentinel.filename2]
    catalogue = tacl.Catalogue()
    catalogue.generate(sentinel.path, sentinel.label)
    listdir.assert_called_once_with(sentinel.path)
    # Both listed files carry the label; an unlisted file does not appear.
    for filename in (sentinel.filename1, sentinel.filename2):
        self.assertEqual(catalogue.get(filename), sentinel.label)
    self.assertIsNone(catalogue.get(sentinel.filename3))
def setUp(self):
    """Builds a corpus, catalogue and in-memory data store from test data."""
    pattern = tacl.constants.TOKENIZER_PATTERN_CBETA
    joiner = tacl.constants.TOKENIZER_JOINER_CBETA
    self._tokenizer = tacl.Tokenizer(pattern, joiner)
    self._data_dir = os.path.join(os.path.dirname(__file__), 'data')
    self._corpus = tacl.Corpus(os.path.join(self._data_dir, 'stripped'),
                               self._tokenizer)
    self._catalogue = tacl.Catalogue()
    self._catalogue.load(os.path.join(self._data_dir, 'catalogue.txt'))
    self._store = tacl.DataStore(':memory:')
    self._store.add_ngrams(self._corpus, 1, 3)
def search_texts(args, parser):
    """Searches texts for presence of n-grams."""
    store = get_data_store(args)
    corpus = get_corpus(args)
    catalogue = tacl.Catalogue()
    if args.catalogue:
        catalogue.load(args.catalogue)
    store.validate(corpus, catalogue)
    # One n-gram per line; strip surrounding whitespace from each.
    with open(args.ngrams, 'r', encoding='utf-8') as fh:
        ngrams = [line.strip() for line in fh]
    store.search(catalogue, ngrams, sys.stdout)
def test_generate(self):
    """Checks generated lifetime report output against expected files."""
    expected_dir = os.path.join(self._data_dir, 'expected')
    catalogue = tacl.Catalogue()
    catalogue.load(os.path.join(self._data_dir, 'catalogue.txt'))
    tokenizer = tacl.Tokenizer(*tacl.constants.TOKENIZERS['cbeta'])
    results = tacl.Results(os.path.join(self._data_dir, 'results.csv'),
                           tokenizer)
    with tempfile.TemporaryDirectory() as temp_dir:
        report = tacl.LifetimeReport()
        report.generate(temp_dir, catalogue, results, 'A')
        self._compare_results_dirs(temp_dir, expected_dir)
def _compare_results(self, max_works, expected_dir_name):
    """Runs a paternity test and compares its output with expected files."""
    expected_dir = os.path.join(self._data_dir, 'expected',
                                expected_dir_name)
    corpus = tacl.Corpus(self._corpus, self._tokenizer)
    catalogue = tacl.Catalogue()
    catalogue.load(self._catalogue)
    with tempfile.TemporaryDirectory() as temp_dir:
        store = tacl.DataStore(os.path.join(temp_dir, 'test.db'), False)
        store.add_ngrams(corpus, 1, 1)
        output_dir = os.path.join(temp_dir, 'output')
        test = paternity.PaternityTest(store, catalogue, self._tokenizer,
                                       'P', 'C', 'U', max_works, output_dir)
        test.process()
        self._compare_results_dirs(output_dir, expected_dir)
def _compare_results(self, corpus_dir, catalogue_name):
    """Compare all of the actual results files with the expected
    versions."""
    expected_dir = os.path.join(self._data_dir, 'expected')
    corpus = tacl.Corpus(os.path.join(self._data_dir, corpus_dir),
                         self._tokenizer)
    catalogue = tacl.Catalogue()
    catalogue.load(os.path.join(self._data_dir, catalogue_name))
    with tempfile.TemporaryDirectory() as temp_dir:
        store = tacl.DataStore(os.path.join(temp_dir, 'test.db'), False)
        store.add_ngrams(corpus, 1, 1)
        output_dir = os.path.join(temp_dir, 'output')
        reporter = lifetime.LifetimeReporter(store, catalogue,
                                             self._tokenizer, output_dir)
        reporter.process()
        self._compare_results_dirs(output_dir, expected_dir)
def intersect_all(self):
    """Runs an intersect for every pair of works in the catalogue.

    Results for each work go in a per-work subdirectory of the output
    directory; processed pairs are recorded via the tracking CSV writer.
    """
    os.makedirs(self._output_dir, exist_ok=True)
    works = sorted(self._catalogue.keys())
    with open(self._tracking_path, 'a', newline='') as fh:
        tracker = csv.writer(fh)
        for primary in works:
            work_dir = os.path.join(self._output_dir, primary)
            os.makedirs(work_dir, exist_ok=True)
            # Single-entry catalogue: the primary work labelled as itself.
            catalogue = tacl.Catalogue()
            catalogue[primary] = primary
            for secondary in works:
                self.process_pair(primary, secondary, catalogue, work_dir,
                                  tracker)
            # Delete output directory if empty; a non-empty directory
            # makes rmdir raise OSError, which we deliberately ignore.
            try:
                os.rmdir(work_dir)
            except OSError:
                pass
def test_tracking(self):
    """Tests that seen pairs are not regenerated."""
    labels = {'A': 'A', 'B': 'B', 'C': 'C', 'D': 'D'}
    catalogue = tacl.Catalogue(labels)
    seen = (('A', 'B'), ('A', 'C'), ('B', 'C'))
    self._compare_results('tracking', 1, 1, catalogue, seen)
def test_intersect_all_no_db(self):
    """Checks intersect-all output when no database is supplied."""
    labels = {'A': 'A', 'B': 'B', 'C': 'C', 'D': 'D'}
    catalogue = tacl.Catalogue(labels)
    self._compare_results('no-db', 1, 1, catalogue, None, None)
def test_intersect_all_extend(self):
    """Checks intersect-all output with extended n-gram sizes."""
    labels = {'A': 'A', 'B': 'B', 'C': 'C', 'D': 'D'}
    catalogue = tacl.Catalogue(labels)
    self._compare_results('extend', 2, 2, catalogue, None)
def test_intersect_all_catalogue(self):
    """Checks intersect-all output restricted to a three-work catalogue."""
    catalogue = tacl.Catalogue({'A': 'A', 'B': 'B', 'C': 'C'})
    self._compare_results('catalogue', 1, 1, catalogue, None)
def get_catalogue(args):
    """Returns a `tacl.Catalogue` loaded from the path in args.catalogue."""
    loaded = tacl.Catalogue()
    loaded.load(args.catalogue)
    return loaded