Пример #1
0
 def test_save(self):
     """Round-trips a catalogue through save() and load()."""
     entries = {
         'T2102-辯惑論序': 'label1',
         'T2102-通直郎庾黔婁答': 'label2',
         'T2102-遠法師答': 'label1',
     }
     catalogue = tacl.Catalogue()
     for work, label in entries.items():
         catalogue[work] = label
     with tempfile.TemporaryDirectory() as tmp_dir:
         path = os.path.join(tmp_dir, 'catalogue.txt')
         catalogue.save(path)
         loaded = tacl.Catalogue()
         loaded.load(path)
         self.assertEqual(catalogue.items(), loaded.items())
Пример #2
0
def text_in_corpus(args):
    """Write a script of tacl commands comparing each text with the rest
    of the corpus.

    For every text named in ``args.texts``, generates a catalogue file
    labelling that text 'A' and all other texts 'REST', plus the ``tacl
    intersect`` and ``tacl report`` commands that use it. All commands
    are written, one per line, to a file called ``commands`` in
    ``args.output_dir``.

    :param args: parsed command-line arguments; must provide ``texts``
        (an open file handle listing text names, whitespace-separated),
        ``output_dir``, ``db`` and ``corpus``
    """
    texts = args.texts.read().strip().split()
    output_dir = os.path.abspath(args.output_dir)
    # exist_ok avoids the race between a separate existence check and
    # the directory's creation.
    os.makedirs(output_dir, exist_ok=True)
    catalogue = tacl.Catalogue({text: 'REST' for text in texts})
    commands = []
    options = _copy_options(args)
    for text in texts:
        text_name = os.path.splitext(text)[0]
        catalogue_path = os.path.join(output_dir,
                                      '{}-catalogue.txt'.format(text_name))
        results_path = os.path.join(output_dir,
                                    '{}-results.csv'.format(text_name))
        reduced_path = os.path.join(output_dir,
                                    '{}-reduced.csv'.format(text_name))
        # Temporarily single out this text, save the catalogue, then
        # restore its label so the next iteration starts clean.
        catalogue[text] = 'A'
        catalogue.save(catalogue_path)
        query_command = 'tacl intersect{} {} {} {} > {}\n'.format(
            options, args.db, args.corpus, catalogue_path, results_path)
        report_command = 'tacl report --reduce --remove REST {} > {}\n'.format(
            results_path, reduced_path)
        commands.extend((query_command, report_command))
        catalogue[text] = 'REST'
    commands_path = os.path.join(output_dir, 'commands')
    with open(commands_path, 'w') as fh:
        fh.writelines(commands)
Пример #3
0
 def test_add_ngrams_with_catalogue(self):
     """Only works listed in the catalogue have their n-grams added."""
     catalogue = tacl.Catalogue({'T1': 'A', 'T5': 'B'})
     store = tacl.DataStore(':memory:')
     store.add_ngrams(self._corpus, 1, 1, catalogue)
     store._conn.row_factory = None
     actual_rows = store._conn.execute(
         'SELECT Text.work, Text.siglum, Text.checksum, Text.label, '
         'TextNGram.ngram, TextNGram.size, TextNGram.count '
         'FROM Text, TextNGram WHERE Text.id = TextNGram.text').fetchall()
     # Per-witness 1-gram counts; expanded into full rows below.
     witness_data = [
         ('T1', 'base', '705c89d665a5300516fe7314f84ebce0',
          [('t', 2), ('h', 1), ('e', 3), ('n', 2), ('w', 2)]),
         ('T1', 'a', 'e898b184b8d4d3ab5fea9d79fd645135',
          [('t', 2), ('h', 1), ('e', 3), ('w', 2), ('n', 1)]),
         ('T5', 'base', '1b42a11f5f647e53d20da8c8f57a9f02',
          [('w', 1), ('e', 1), ('l', 2)]),
     ]
     expected_rows = [
         (work, siglum, checksum, '', ngram, 1, count)
         for work, siglum, checksum, counts in witness_data
         for ngram, count in counts]
     self.assertEqual(set(actual_rows), set(expected_rows))
Пример #4
0
def results(args, parser):
    """Process a results file according to the command-line options and
    write the transformed results as CSV to standard output.

    Operations are applied in a fixed order; each is performed only when
    its corresponding option was supplied. Format-changing operations
    (label counts, grouping, witness collapsing) run last so the earlier
    pruning/sorting operations see the standard results format.

    :param args: parsed command-line arguments
    :param parser: argument parser, used to report option errors
    """
    # '-' means read the results from standard input.
    if args.results == '-':
        results_fh = io.TextIOWrapper(sys.stdin.buffer,
                                      encoding='utf-8',
                                      newline='')
    else:
        results_fh = open(args.results, 'r', encoding='utf-8', newline='')
    tokenizer = utils.get_tokenizer(args)
    results = tacl.Results(results_fh, tokenizer)
    if args.extend:
        corpus = tacl.Corpus(args.extend, tokenizer)
        results.extend(corpus)
    if args.bifurcated_extend:
        # Bifurcated extension is only meaningful with a size limit.
        if not args.bifurcated_extend_size:
            parser.error('The bifurcated extend option requires that the '
                         '--max-be-count option also be supplied')
        corpus = tacl.Corpus(args.bifurcated_extend, tokenizer)
        results.bifurcated_extend(corpus, args.bifurcated_extend_size)
    if args.reduce:
        results.reduce()
    if args.reciprocal:
        results.reciprocal_remove()
    if args.excise:
        results.excise(args.excise)
    if args.zero_fill:
        corpus = tacl.Corpus(args.zero_fill, tokenizer)
        results.zero_fill(corpus)
    if args.ngrams:
        # Keep only the n-grams listed (whitespace-separated) in the file.
        with open(args.ngrams, encoding='utf-8') as fh:
            ngrams = fh.read().split()
        results.prune_by_ngram(ngrams)
    if args.min_works or args.max_works:
        results.prune_by_work_count(args.min_works, args.max_works)
    if args.min_size or args.max_size:
        results.prune_by_ngram_size(args.min_size, args.max_size)
    if args.min_count or args.max_count:
        results.prune_by_ngram_count(args.min_count, args.max_count)
    if args.min_count_work or args.max_count_work:
        results.prune_by_ngram_count_per_work(args.min_count_work,
                                              args.max_count_work)
    if args.remove:
        results.remove_label(args.remove)
    if args.sort:
        results.sort()
    # Run format-changing operations last.
    if args.add_label_count:
        results.add_label_count()
    if args.add_label_work_count:
        results.add_label_work_count()
    if args.group_by_ngram:
        catalogue = tacl.Catalogue()
        catalogue.load(args.group_by_ngram)
        results.group_by_ngram(catalogue.ordered_labels)
    if args.group_by_witness:
        results.group_by_witness()
    if args.collapse_witnesses:
        results.collapse_witnesses()
    results.csv(sys.stdout)
Пример #5
0
 def test_generate(self):
     """generate() labels every filename found in the given directory."""
     listdir = self._create_patch('os.listdir')
     listdir.return_value = [sentinel.filename1, sentinel.filename2]
     catalogue = tacl.Catalogue()
     catalogue.generate(sentinel.path, sentinel.label)
     listdir.assert_called_once_with(sentinel.path)
     # Both listed filenames receive the label; anything else is absent.
     for filename in (sentinel.filename1, sentinel.filename2):
         self.assertEqual(catalogue.get(filename), sentinel.label)
     self.assertEqual(catalogue.get(sentinel.filename3), None)
Пример #6
0
 def setUp(self):
     """Builds the tokenizer, corpus, catalogue and data store fixtures."""
     data_dir = os.path.join(os.path.dirname(__file__), 'data')
     tokenizer = tacl.Tokenizer(tacl.constants.TOKENIZER_PATTERN_CBETA,
                                tacl.constants.TOKENIZER_JOINER_CBETA)
     self._tokenizer = tokenizer
     self._data_dir = data_dir
     self._corpus = tacl.Corpus(os.path.join(data_dir, 'stripped'),
                                tokenizer)
     self._catalogue = tacl.Catalogue()
     self._catalogue.load(os.path.join(data_dir, 'catalogue.txt'))
     self._store = tacl.DataStore(':memory:')
     self._store.add_ngrams(self._corpus, 1, 3)
Пример #7
0
def search_texts(args, parser):
    """Searches texts for presence of n-grams."""
    store = get_data_store(args)
    corpus = get_corpus(args)
    catalogue = tacl.Catalogue()
    if args.catalogue:
        catalogue.load(args.catalogue)
    store.validate(corpus, catalogue)
    # One n-gram per line in the supplied file.
    with open(args.ngrams, 'r', encoding='utf-8') as fh:
        ngrams = [line.strip() for line in fh]
    store.search(catalogue, ngrams, sys.stdout)
Пример #8
0
 def test_generate(self):
     """Compares LifetimeReport output against the expected files."""
     data_dir = self._data_dir
     expected_dir = os.path.join(data_dir, 'expected')
     catalogue = tacl.Catalogue()
     catalogue.load(os.path.join(data_dir, 'catalogue.txt'))
     tokenizer = tacl.Tokenizer(*tacl.constants.TOKENIZERS['cbeta'])
     results = tacl.Results(os.path.join(data_dir, 'results.csv'),
                            tokenizer)
     with tempfile.TemporaryDirectory() as temp_dir:
         report = tacl.LifetimeReport()
         report.generate(temp_dir, catalogue, results, 'A')
         self._compare_results_dirs(temp_dir, expected_dir)
Пример #9
0
 def _compare_results(self, max_works, expected_dir_name):
     """Runs a paternity test and compares its output directory with
     the expected results."""
     expected_dir = os.path.join(self._data_dir, 'expected',
                                 expected_dir_name)
     catalogue = tacl.Catalogue()
     catalogue.load(self._catalogue)
     corpus = tacl.Corpus(self._corpus, self._tokenizer)
     with tempfile.TemporaryDirectory() as temp_dir:
         store = tacl.DataStore(os.path.join(temp_dir, 'test.db'), False)
         store.add_ngrams(corpus, 1, 1)
         output_dir = os.path.join(temp_dir, 'output')
         test = paternity.PaternityTest(store, catalogue, self._tokenizer,
                                        'P', 'C', 'U', max_works,
                                        output_dir)
         test.process()
         self._compare_results_dirs(output_dir, expected_dir)
Пример #10
0
 def _compare_results(self, corpus_dir, catalogue_name):
     """Compare all of the actual results files with the expected
     versions."""
     data_dir = self._data_dir
     expected_dir = os.path.join(data_dir, 'expected')
     catalogue = tacl.Catalogue()
     catalogue.load(os.path.join(data_dir, catalogue_name))
     corpus = tacl.Corpus(os.path.join(data_dir, corpus_dir),
                          self._tokenizer)
     with tempfile.TemporaryDirectory() as temp_dir:
         store = tacl.DataStore(os.path.join(temp_dir, 'test.db'), False)
         store.add_ngrams(corpus, 1, 1)
         output_dir = os.path.join(temp_dir, 'output')
         reporter = lifetime.LifetimeReporter(store, catalogue,
                                              self._tokenizer, output_dir)
         reporter.process()
         self._compare_results_dirs(output_dir, expected_dir)
Пример #11
0
 def intersect_all(self):
     """Intersects each catalogued work with every other work, logging
     each processed pair to the tracking file."""
     os.makedirs(self._output_dir, exist_ok=True)
     works = sorted(self._catalogue.keys())
     with open(self._tracking_path, 'a', newline='') as tracking_fh:
         writer = csv.writer(tracking_fh)
         for work in works:
             work_output_dir = os.path.join(self._output_dir, work)
             os.makedirs(work_output_dir, exist_ok=True)
             catalogue = tacl.Catalogue()
             catalogue[work] = work
             for other in works:
                 self.process_pair(work, other, catalogue,
                                   work_output_dir, writer)
             # Remove the per-work directory if nothing was written into
             # it; rmdir fails harmlessly when it is non-empty.
             try:
                 os.rmdir(work_output_dir)
             except OSError:
                 pass
Пример #12
0
 def test_tracking(self):
     """Tests that seen pairs are not regenerated."""
     catalogue = tacl.Catalogue({work: work for work in 'ABCD'})
     seen_pairs = (('A', 'B'), ('A', 'C'), ('B', 'C'))
     self._compare_results('tracking', 1, 1, catalogue, seen_pairs)
Пример #13
0
 def test_intersect_all_no_db(self):
     """Tests intersect_all when no existing database is supplied."""
     catalogue = tacl.Catalogue({work: work for work in 'ABCD'})
     self._compare_results('no-db', 1, 1, catalogue, None, None)
Пример #14
0
 def test_intersect_all_extend(self):
     """Tests intersect_all with n-gram sizes that trigger extension."""
     catalogue = tacl.Catalogue({work: work for work in 'ABCD'})
     self._compare_results('extend', 2, 2, catalogue, None)
Пример #15
0
 def test_intersect_all_catalogue(self):
     """Tests intersect_all restricted to the catalogued works."""
     catalogue = tacl.Catalogue({work: work for work in 'ABC'})
     self._compare_results('catalogue', 1, 1, catalogue, None)
Пример #16
0
def get_catalogue(args):
    """Returns a `tacl.Catalogue` loaded from ``args.catalogue``."""
    loaded = tacl.Catalogue()
    loaded.load(args.catalogue)
    return loaded