Exemplo n.º 1
0
 def test_prune_by_work_count(self):
     """Tests pruning by the number of distinct works an n-gram
     occurs in, with a minimum, a maximum, and both together."""
     rows = (
         ['AB', '2', 'a', 'base', '4', 'A'],
         ['AB', '2', 'b', 'base', '7', 'A'],
         ['AB', '2', 'c', 'base', '1', 'B'],
         ['AB', '2', 'd', 'base', '3', 'B'],
         ['ABC', '3', 'a', 'base', '3', 'A'],
         ['ABC', '3', 'b', 'base', '5', 'A'],
         ['ABC', '3', 'c', 'base', '1', 'B'],
         ['BA', '2', 'a', 'base', '6', 'A'],
         ['B', '1', 'a', 'base', '5', 'A'],
         ['B', '1', 'b', 'base', '3', 'A'],
         ['B', '1', 'b', 'wit', '3', 'A'],
         ['B', '1', 'c', 'base', '0', 'B'],
     )
     fh = self._create_csv(rows)
     # Minimum only.
     results = tacl.Results(fh, self._tokenizer)
     results.prune_by_work_count(minimum=3)
     expected = [
         tacl.constants.QUERY_FIELDNAMES,
         ('AB', '2', 'a', 'base', '4', 'A'),
         ('AB', '2', 'b', 'base', '7', 'A'),
         ('AB', '2', 'c', 'base', '1', 'B'),
         ('AB', '2', 'd', 'base', '3', 'B'),
         ('ABC', '3', 'a', 'base', '3', 'A'),
         ('ABC', '3', 'b', 'base', '5', 'A'),
         ('ABC', '3', 'c', 'base', '1', 'B'),
     ]
     self.assertEqual(self._get_rows_from_results(results), expected)
     # Maximum only.
     fh.seek(0)
     results = tacl.Results(fh, self._tokenizer)
     results.prune_by_work_count(maximum=3)
     expected = [
         tacl.constants.QUERY_FIELDNAMES,
         ('ABC', '3', 'a', 'base', '3', 'A'),
         ('ABC', '3', 'b', 'base', '5', 'A'),
         ('ABC', '3', 'c', 'base', '1', 'B'),
         ('BA', '2', 'a', 'base', '6', 'A'),
         ('B', '1', 'a', 'base', '5', 'A'),
         ('B', '1', 'b', 'base', '3', 'A'),
         ('B', '1', 'b', 'wit', '3', 'A'),
         ('B', '1', 'c', 'base', '0', 'B'),
     ]
     self.assertEqual(self._get_rows_from_results(results), expected)
     # Minimum and maximum together.
     fh.seek(0)
     results = tacl.Results(fh, self._tokenizer)
     results.prune_by_work_count(minimum=2, maximum=3)
     expected = [
         tacl.constants.QUERY_FIELDNAMES,
         ('ABC', '3', 'a', 'base', '3', 'A'),
         ('ABC', '3', 'b', 'base', '5', 'A'),
         ('ABC', '3', 'c', 'base', '1', 'B'),
         ('B', '1', 'a', 'base', '5', 'A'),
         ('B', '1', 'b', 'base', '3', 'A'),
         ('B', '1', 'b', 'wit', '3', 'A'),
         ('B', '1', 'c', 'base', '0', 'B'),
     ]
     self.assertEqual(self._get_rows_from_results(results), expected)
Exemplo n.º 2
0
 def test_sort(self):
     """Tests that sorting orders results by size, n-gram, count and
     witness as expected."""
     rows = (
         ['AB', '2', 'a', 'base', '4', 'A'],
         ['AB', '2', 'a', 'wit', '3', 'A'],
         ['ABC', '3', 'a', 'base', '2', 'A'],
         ['ABD', '3', 'a', 'base', '1', 'B'],
         ['ABCD', '4', 'a', 'base', '2', 'B'],
         ['AB', '2', 'b', 'base', '2', 'AB'],
         ['AB', '2', 'b', 'a', '2', 'AB'],
         ['ABC', '3', 'b', 'base', '2', 'AB'],
         ['ABC', '3', 'c', 'base', '3', 'A'],
     )
     results = tacl.Results(self._create_csv(rows), self._tokenizer)
     results.sort()
     expected = [
         tacl.constants.QUERY_FIELDNAMES,
         ('ABCD', '4', 'a', 'base', '2', 'B'),
         ('ABC', '3', 'c', 'base', '3', 'A'),
         ('ABC', '3', 'a', 'base', '2', 'A'),
         ('ABC', '3', 'b', 'base', '2', 'AB'),
         ('ABD', '3', 'a', 'base', '1', 'B'),
         ('AB', '2', 'a', 'base', '4', 'A'),
         ('AB', '2', 'a', 'wit', '3', 'A'),
         ('AB', '2', 'b', 'a', '2', 'AB'),
         ('AB', '2', 'b', 'base', '2', 'AB'),
     ]
     self.assertEqual(self._get_rows_from_results(results), expected)
Exemplo n.º 3
0
 def _test_empty_results(self, cmd, fieldnames, *args, **kwargs):
     """Asserts that running the Results method `cmd` on empty results
     leaves only the header row `fieldnames`."""
     results = tacl.Results(self._create_csv([]), self._tokenizer)
     getattr(results, cmd)(*args, **kwargs)
     self.assertEqual(self._get_rows_from_results(results), [fieldnames])
Exemplo n.º 4
0
 def test_group_by_witness(self):
     """Tests that grouping by witness aggregates each witness's
     matching n-grams into a single row."""
     rows = (
         ['AB', '2', 'T1', 'wit1', '4', 'A'],
         ['AB', '2', 'T1', 'wit2', '3', 'A'],
         ['AB', '2', 'T2', 'wit1', '2', 'A'],
         ['ABC', '3', 'T1', 'wit1', '2', 'A'],
         ['ABC', '3', 'T1', 'wit2', '0', 'A'],
         ['AB', '2', 'T3', 'wit1', '2', 'B'],
         ['BC', '2', 'T1', 'wit1', '3', 'A'],
     )
     results = tacl.Results(self._create_csv(rows), self._tokenizer)
     results.group_by_witness()
     header = (tacl.constants.WORK_FIELDNAME,
               tacl.constants.SIGLUM_FIELDNAME,
               tacl.constants.LABEL_FIELDNAME,
               tacl.constants.NGRAMS_FIELDNAME,
               tacl.constants.NUMBER_FIELDNAME,
               tacl.constants.TOTAL_COUNT_FIELDNAME)
     expected = [
         header,
         ('T1', 'wit1', 'A', 'AB, ABC, BC', '3', '9'),
         ('T1', 'wit2', 'A', 'AB', '1', '3'),
         ('T2', 'wit1', 'A', 'AB', '1', '2'),
         ('T3', 'wit1', 'B', 'AB', '1', '2'),
     ]
     actual = self._get_rows_from_results(results)
     # Row order is not significant, so compare as sets.
     self.assertEqual(set(actual), set(expected))
Exemplo n.º 5
0
 def test_is_intersect_results(self):
     """Tests that _is_intersect_results distinguishes intersect
     results (every n-gram under every label) from diff results."""
     intersect_rows = (
         ['AB', '2', 'a', 'base', '7', 'A'],
         ['AB', '2', 'b', 'base', '2', 'B'],
         ['AB', '2', 'c', 'base', '5', 'C'],
     )
     results = tacl.Results(self._create_csv(intersect_rows),
                            self._tokenizer)
     self.assertTrue(results._is_intersect_results(results._matches))
     diff_rows = (
         ['AB', '2', 'a', 'base', '7', 'A'],
         ['AB', '2', 'a', 'other', '1', 'A'],
         ['AB', '2', 'b', 'base', '5', 'A'],
         ['BA', '2', 'c', 'base', '2', 'B'],
     )
     results = tacl.Results(self._create_csv(diff_rows), self._tokenizer)
     self.assertFalse(results._is_intersect_results(results._matches))
Exemplo n.º 6
0
 def test_collapse_witnesses(self):
     """Tests that witnesses of a work sharing the same count for an
     n-gram are collapsed into a single row listing their sigla."""
     rows = (
         ['AB', '2', 'a', 'base', '4', 'A'],
         ['AB', '2', 'a', 'wit 1', '4', 'A'],
         ['AB', '2', 'a', 'wit 2', '3', 'A'],
         ['AB', '2', 'b', 'base', '4', 'A'],
         ['AB', '2', 'b', 'wit 1', '3', 'A'],
         ['BC', '2', 'a', 'base', '4', 'A'],
         ['BC', '2', 'a', 'wit 1', '3', 'A'],
         ['BC', '2', 'a', 'wit 2', '3', 'A'],
     )
     results = tacl.Results(self._create_csv(rows), self._tokenizer)
     results.collapse_witnesses()
     header = (tacl.constants.NGRAM_FIELDNAME,
               tacl.constants.SIZE_FIELDNAME,
               tacl.constants.WORK_FIELDNAME,
               tacl.constants.SIGLA_FIELDNAME,
               tacl.constants.COUNT_FIELDNAME,
               tacl.constants.LABEL_FIELDNAME)
     expected = [
         header,
         ('AB', '2', 'a', 'base, wit 1', '4', 'A'),
         ('AB', '2', 'a', 'wit 2', '3', 'A'),
         ('AB', '2', 'b', 'base', '4', 'A'),
         ('AB', '2', 'b', 'wit 1', '3', 'A'),
         ('BC', '2', 'a', 'base', '4', 'A'),
         ('BC', '2', 'a', 'wit 1, wit 2', '3', 'A'),
     ]
     self.assertEqual(self._get_rows_from_results(results), expected)
Exemplo n.º 7
0
 def test_group_by_ngram(self):
     """Tests that grouping by n-gram produces one row per n-gram and
     label, with per-work count ranges, ordered by the given labels."""
     rows = (
         ['AB', '2', 'T1', 'wit1', '4', 'A'],
         ['AB', '2', 'T1', 'wit2', '3', 'A'],
         ['AB', '2', 'T2', 'wit1', '2', 'A'],
         ['ABC', '3', 'T1', 'wit1', '2', 'A'],
         ['ABC', '3', 'T1', 'wit2', '0', 'A'],
         ['AB', '2', 'T3', 'wit1', '2', 'B'],
         ['AB', '2', 'T4', 'wit1', '1', 'B'],
     )
     results = tacl.Results(self._create_csv(rows), self._tokenizer)
     results.group_by_ngram(['B', 'A'])
     header = (tacl.constants.NGRAM_FIELDNAME,
               tacl.constants.SIZE_FIELDNAME,
               tacl.constants.LABEL_FIELDNAME,
               tacl.constants.WORK_COUNTS_FIELDNAME)
     expected = [
         header,
         ('AB', '2', 'B', 'T3(2), T4(1)'),
         ('AB', '2', 'A', 'T1(3-4), T2(2)'),
         ('ABC', '3', 'A', 'T1(0-2)'),
     ]
     self.assertEqual(self._get_rows_from_results(results), expected)
Exemplo n.º 8
0
 def test_add_label_work_count(self):
     """Tests that a column is added giving, per n-gram and label, the
     number of works with a non-zero count."""
     rows = (
         ['AB', '2', 'a', 'base', '4', 'A'],
         ['AB', '2', 'a', 'wit1', '2', 'A'],
         ['AB', '2', 'b', 'base', '1', 'A'],
         ['AB', '2', 'c', 'base', '2', 'B'],
         ['BC', '2', 'a', 'base', '0', 'A'],
         ['BC', '2', 'a', 'wit1', '0', 'A'],
         ['CD', '2', 'a', 'base', '1', 'A'],
     )
     results = tacl.Results(self._create_csv(rows), self._tokenizer)
     results.add_label_work_count()
     header = tuple(tacl.constants.QUERY_FIELDNAMES) + (
         tacl.constants.LABEL_WORK_COUNT_FIELDNAME,)
     expected = [
         header,
         ('AB', '2', 'a', 'base', '4', 'A', '2'),
         ('AB', '2', 'a', 'wit1', '2', 'A', '2'),
         ('AB', '2', 'b', 'base', '1', 'A', '2'),
         ('AB', '2', 'c', 'base', '2', 'B', '1'),
         ('BC', '2', 'a', 'base', '0', 'A', '0'),
         ('BC', '2', 'a', 'wit1', '0', 'A', '0'),
         ('CD', '2', 'a', 'base', '1', 'A', '1'),
     ]
     self.assertEqual(self._get_rows_from_results(results), expected)
Exemplo n.º 9
0
def results(args, parser):
    """Processes a results file (or stdin) according to the supplied
    command-line options and writes the resulting CSV to stdout.

    Operations are applied in a fixed order; format-changing
    operations (label counts, grouping, collapsing) run last because
    they change the set of columns other operations rely on.

    :param args: parsed command-line arguments
    :param parser: argument parser, used to report option errors

    """
    if args.results == '-':
        results_fh = io.TextIOWrapper(sys.stdin.buffer,
                                      encoding='utf-8',
                                      newline='')
    else:
        results_fh = open(args.results, 'r', encoding='utf-8', newline='')
    try:
        tokenizer = utils.get_tokenizer(args)
        results = tacl.Results(results_fh, tokenizer)
        if args.extend:
            corpus = tacl.Corpus(args.extend, tokenizer)
            results.extend(corpus)
        if args.bifurcated_extend:
            # Bifurcated extend is meaningless without a maximum count.
            if not args.bifurcated_extend_size:
                parser.error('The bifurcated extend option requires that the '
                             '--max-be-count option also be supplied')
            corpus = tacl.Corpus(args.bifurcated_extend, tokenizer)
            results.bifurcated_extend(corpus, args.bifurcated_extend_size)
        if args.reduce:
            results.reduce()
        if args.reciprocal:
            results.reciprocal_remove()
        if args.excise:
            results.excise(args.excise)
        if args.zero_fill:
            corpus = tacl.Corpus(args.zero_fill, tokenizer)
            results.zero_fill(corpus)
        if args.ngrams:
            with open(args.ngrams, encoding='utf-8') as fh:
                ngrams = fh.read().split()
            results.prune_by_ngram(ngrams)
        if args.min_works or args.max_works:
            results.prune_by_work_count(args.min_works, args.max_works)
        if args.min_size or args.max_size:
            results.prune_by_ngram_size(args.min_size, args.max_size)
        if args.min_count or args.max_count:
            results.prune_by_ngram_count(args.min_count, args.max_count)
        if args.min_count_work or args.max_count_work:
            results.prune_by_ngram_count_per_work(args.min_count_work,
                                                  args.max_count_work)
        if args.remove:
            results.remove_label(args.remove)
        if args.sort:
            results.sort()
        # Run format-changing operations last.
        if args.add_label_count:
            results.add_label_count()
        if args.add_label_work_count:
            results.add_label_work_count()
        if args.group_by_ngram:
            catalogue = tacl.Catalogue()
            catalogue.load(args.group_by_ngram)
            results.group_by_ngram(catalogue.ordered_labels)
        if args.group_by_witness:
            results.group_by_witness()
        if args.collapse_witnesses:
            results.collapse_witnesses()
        results.csv(sys.stdout)
    finally:
        # Close the file handle we opened; leave stdin alone, since
        # closing the TextIOWrapper would close sys.stdin.buffer too.
        if args.results != '-':
            results_fh.close()
Exemplo n.º 10
0
def lifetime_report(args, parser):
    """Generates a lifetime report."""
    tokenizer = utils.get_tokenizer(args)
    catalogue = utils.get_catalogue(args)
    results = tacl.Results(args.results, tokenizer)
    out_dir = os.path.abspath(args.output)
    os.makedirs(out_dir, exist_ok=True)
    tacl.LifetimeReport().generate(out_dir, catalogue, results, args.label)
Exemplo n.º 11
0
 def test_prune_by_ngram_size(self):
     """Tests pruning by n-gram size with a minimum, a maximum, and
     both together."""
     rows = (
         ['AB', '2', 'a', 'base', '4', 'A'],
         ['ABC', '3', 'a', 'base', '2', 'A'],
         ['ABD', '3', 'a', 'wit', '1', 'A'],
         ['ABCD', '4', 'a', 'base', '2', 'A'],
         ['AB', '2', 'b', 'base', '2', 'A'],
         ['ABC', '3', 'b', 'wit', '2', 'A'],
     )
     fh = self._create_csv(rows)
     # Minimum only.
     results = tacl.Results(fh, self._tokenizer)
     results.prune_by_ngram_size(minimum=3)
     expected = [
         tacl.constants.QUERY_FIELDNAMES,
         ('ABC', '3', 'a', 'base', '2', 'A'),
         ('ABD', '3', 'a', 'wit', '1', 'A'),
         ('ABCD', '4', 'a', 'base', '2', 'A'),
         ('ABC', '3', 'b', 'wit', '2', 'A'),
     ]
     self.assertEqual(self._get_rows_from_results(results), expected)
     # Maximum only.
     fh.seek(0)
     results = tacl.Results(fh, self._tokenizer)
     results.prune_by_ngram_size(maximum=3)
     expected = [
         tacl.constants.QUERY_FIELDNAMES,
         ('AB', '2', 'a', 'base', '4', 'A'),
         ('ABC', '3', 'a', 'base', '2', 'A'),
         ('ABD', '3', 'a', 'wit', '1', 'A'),
         ('AB', '2', 'b', 'base', '2', 'A'),
         ('ABC', '3', 'b', 'wit', '2', 'A'),
     ]
     self.assertEqual(self._get_rows_from_results(results), expected)
     # Minimum and maximum together.
     fh.seek(0)
     results = tacl.Results(fh, self._tokenizer)
     results.prune_by_ngram_size(minimum=3, maximum=3)
     expected = [
         tacl.constants.QUERY_FIELDNAMES,
         ('ABC', '3', 'a', 'base', '2', 'A'),
         ('ABD', '3', 'a', 'wit', '1', 'A'),
         ('ABC', '3', 'b', 'wit', '2', 'A'),
     ]
     self.assertEqual(self._get_rows_from_results(results), expected)
Exemplo n.º 12
0
 def test_extend_no_extension(self):
     # Extend should return the original results when there are no
     # extensions to make.
     results_path = os.path.join(self._data_dir, 'extend-no-extensions.csv')
     corpus = tacl.Corpus(os.path.join(self._stripped_dir, 'cbeta'),
                          self._tokenizer)
     results = tacl.Results(results_path, self._tokenizer)
     results.extend(corpus)
     # The output must be identical to the input file.
     self.assertEqual(self._get_rows_from_results(results),
                      self._get_rows_from_file(results_path))
Exemplo n.º 13
0
 def test_generate(self):
     """Tests that a generated lifetime report matches the expected
     report files on disk."""
     expected_dir = os.path.join(self._data_dir, 'expected')
     catalogue = tacl.Catalogue()
     catalogue.load(os.path.join(self._data_dir, 'catalogue.txt'))
     tokenizer = tacl.Tokenizer(*tacl.constants.TOKENIZERS['cbeta'])
     results = tacl.Results(os.path.join(self._data_dir, 'results.csv'),
                            tokenizer)
     with tempfile.TemporaryDirectory() as temp_dir:
         tacl.LifetimeReport().generate(temp_dir, catalogue, results, 'A')
         self._compare_results_dirs(temp_dir, expected_dir)
Exemplo n.º 14
0
 def test_bifurcated_extend(self):
     self.maxDiff = None
     # This is a test of Results._bifurcated_extend, which does not
     # require any information other than the results themselves.
     rows = (
         ['AB', '2', 'a', 'base', '4', 'A', '7'],
         ['AB', '2', 'a', 'wit1', '5', 'A', '7'],
         ['AB', '2', 'b', 'base', '2', 'A', '7'],
         ['AB', '2', 'c', 'base', '4', 'B', '4'],
         ['ZAB', '3', 'a', 'base', '2', 'A', '3'],
         ['ZAB', '3', 'a', 'wit1', '2', 'A', '3'],
         ['ZAB', '3', 'b', 'base', '1', 'A', '3'],
         ['ABC', '3', 'a', 'base', '4', 'A', '5'],
         ['ABC', '3', 'a', 'wit1', '4', 'A', '5'],
         ['ABC', '3', 'b', 'base', '1', 'A', '5'],
         ['ABC', '3', 'c', 'base', '4', 'B', '4'],
         ['ZAB', '3', 'c', 'base', '2', 'B', '2'],
         ['XAB', '3', 'c', 'base', '2', 'B', '2'],
         ['ZABC', '4', 'a', 'base', '2', 'A', '2'],
         ['ZABC', '4', 'a', 'wit1', '2', 'A', '2'],
         ['ZABCD', '5', 'a', 'base', '1', 'A', '1'],
         ['ZABCD', '5', 'a', 'wit1', '1', 'A', '1'],
         ['ZABCDE', '6', 'a', 'base', '1', 'A', '1'],
     )
     header = tuple(tacl.constants.QUERY_FIELDNAMES) + (
         tacl.constants.LABEL_COUNT_FIELDNAME,)
     fh = self._create_csv(rows, fieldnames=header)
     results = tacl.Results(fh, self._tokenizer)
     results._bifurcated_extend()
     expected = [
         header,
         ('AB', '2', 'a', 'base', '4', 'A', '7'),
         ('AB', '2', 'a', 'wit1', '5', 'A', '7'),
         ('AB', '2', 'b', 'base', '2', 'A', '7'),
         ('ZAB', '3', 'a', 'base', '2', 'A', '3'),
         ('ABC', '3', 'a', 'base', '4', 'A', '5'),
         ('ZAB', '3', 'a', 'wit1', '2', 'A', '3'),
         ('ABC', '3', 'a', 'wit1', '4', 'A', '5'),
         ('ZAB', '3', 'b', 'base', '1', 'A', '3'),
         ('ABC', '3', 'b', 'base', '1', 'A', '5'),
         ('ABC', '3', 'c', 'base', '4', 'B', '4'),
         ('ZAB', '3', 'c', 'base', '2', 'B', '2'),
         ('XAB', '3', 'c', 'base', '2', 'B', '2'),
         ('ZABC', '4', 'a', 'base', '2', 'A', '2'),
         ('ZABC', '4', 'a', 'wit1', '2', 'A', '2'),
         ('ZABCD', '5', 'a', 'base', '1', 'A', '1'),
         ('ZABCD', '5', 'a', 'wit1', '1', 'A', '1'),
     ]
     self.assertEqual(self._get_rows_from_results(results), expected)
Exemplo n.º 15
0
 def test_zero_fill_no_duplicate_index_values(self):
     # Zero fill should not leave the matches with duplicate values
     # in the index, potentially raising a "cannot reindex from a
     # duplicate axis" ValueError when followed by another
     # operation.
     stripped_path = os.path.join(os.path.dirname(__file__), 'data',
                                  'stripped')
     corpus = tacl.Corpus(stripped_path, self._tokenizer)
     results = tacl.Results(
         os.path.join(self._data_dir, 'non-zero-fill-results.csv'),
         self._tokenizer)
     results.zero_fill(corpus)
     self.assertFalse(
         results._matches.index.has_duplicates,
         'Results._matches DataFrame is left with duplicate index values.')
Exemplo n.º 16
0
 def test_extend_no_duplicate_index_values(self):
     # Extend should not leave the matches with duplicate values in
     # the index, potentially raising a "cannot reindex from a
     # duplicate axis" ValueError when followed by another
     # operation.
     results_path = os.path.join(self._data_dir,
                                 'cbeta-non-extend-results.csv')
     corpus = tacl.Corpus(os.path.join(self._stripped_dir, 'cbeta'),
                          self._tokenizer)
     results = tacl.Results(results_path, self._tokenizer)
     results.extend(corpus)
     self.assertFalse(
         results._matches.index.has_duplicates,
         'Results._matches DataFrame is left with duplicate index values.')
Exemplo n.º 17
0
 def _test_no_duplicate_index_values(self, cmd, *args, **kwargs):
     # No Results method should leave the matches with duplicate
     # values in the index, potentially raising a "cannot reindex
     # from a duplicate axis" ValueError when followed by another
     # operation.
     rows = (
         ['AB', '2', 'a', 'base', '4', 'A'],
         ['AB', '2', 'a', 'wit1', '5', 'A'],
         ['AB', '2', 'b', 'base', '3', 'A'],
         ['AB', '2', 'b', 'wit1', '3', 'A'],
         ['AB', '2', 'c', 'base', '2', 'B'],
         ['BC', '2', 'a', 'base', '2', 'A'],
     )
     results = tacl.Results(self._create_csv(rows), self._tokenizer)
     getattr(results, cmd)(*args, **kwargs)
     self.assertFalse(results._matches.index.has_duplicates)
Exemplo n.º 18
0
 def test_remove_label_missing_label(self):
     """Test removing a label that doesn't exist in the results."""
     rows = (
         ['AB', '2', 'a', 'base', '4', 'A'],
         ['AB', '2', 'a', 'wit', '3', 'A'],
         ['ABC', '3', 'a', 'base', '2', 'A'],
     )
     results = tacl.Results(self._create_csv(rows), self._tokenizer)
     results.remove_label('C')
     # Nothing carries the label, so all rows survive.
     expected = [
         tacl.constants.QUERY_FIELDNAMES,
         ('AB', '2', 'a', 'base', '4', 'A'),
         ('AB', '2', 'a', 'wit', '3', 'A'),
         ('ABC', '3', 'a', 'base', '2', 'A'),
     ]
     self.assertEqual(self._get_rows_from_results(results), expected)
Exemplo n.º 19
0
    def _concatenate_results(self, result_filenames):
        """Returns a `tacl.Results` containing all of the results from the
        files specified in `result_filenames`.

        :param result_filenames: filenames of results to concatenate
        :type result_filenames: `list` of `str`
        :rtype: `tacl.Results`

        """
        frames = [pd.read_csv(name, encoding='utf-8', na_filter=False)
                  for name in result_filenames]
        combined = pd.concat(frames, ignore_index=True)
        # Rows differing only in label count as duplicates.
        dedup_fields = [field for field in tacl.constants.QUERY_FIELDNAMES
                        if field != tacl.constants.LABEL_FIELDNAME]
        combined.drop_duplicates(subset=dedup_fields, inplace=True)
        return tacl.Results(combined, self._tokenizer)
Exemplo n.º 20
0
 def _get_results(self, catalogue):
     """Returns extended and reduced intersection results for
     `catalogue`, using the configured store (or a temporary
     in-memory one)."""
     buffer = io.StringIO()
     if self._store is None:
         # No pre-built store: build one in memory for this query.
         store = tacl.DataStore(':memory:', True)
         store.add_ngrams(self._corpus, self._minimum, self._maximum,
                          catalogue)
     else:
         store = self._store
     self._logger.debug('Validating corpus/catalogue')
     store.validate(self._corpus, catalogue)
     self._logger.debug('Running intersection')
     store.intersection(catalogue, buffer)
     store = None
     buffer.seek(0)
     self._logger.debug('Generating results')
     results = tacl.Results(buffer, self._tokenizer)
     self._logger.debug('Extending results')
     results.extend(self._corpus)
     self._logger.debug('Reducing')
     results.reduce()
     return results
Exemplo n.º 21
0
    def _test_required_columns(self, cols, cmd, *args, **kwargs):
        """Tests that when `cmd` is run with `args` and `kwargs`, it raises a
        `MalformedResultsError when each of `cols` is not present in
        the results. Further tests that that exception is not raised
        when other columns are not present.

        This test is designed to test Results methods only.

        """
        rows = (
            ['AB', '2', 'T1', 'base', '4', 'A'],
            ['AB', '2', 'T1', 'a', '3', 'A'],
            ['AB', '2', 'T2', 'base', '2', 'A'],
            ['ABC', '3', 'T1', 'base', '2', 'A'],
            ['ABC', '3', 'T1', 'a', '0', 'A'],
            ['AB', '2', 'T3', 'base', '2', 'B'],
            ['BC', '2', 'T1', 'base', '3', 'A'],
        )
        for index, col in enumerate(tacl.constants.QUERY_FIELDNAMES):
            # Rename this one column so it appears to be missing.
            fieldnames = list(tacl.constants.QUERY_FIELDNAMES)
            fieldnames[index] = 'dummy'
            fh = self._create_csv(rows, fieldnames=fieldnames)
            results = tacl.Results(fh, self._tokenizer)
            operation = getattr(results, cmd)
            if col in cols:
                self.assertRaises(MalformedResultsError, operation,
                                  *args, **kwargs)
                continue
            try:
                operation(*args, **kwargs)
            except MalformedResultsError:
                self.fail(
                    'Results.{} improperly raises MalformedResultsError '
                    'when column "{}" not present in results'.format(
                        cmd, col))
            except KeyError as e:
                if str(e).strip('"\'') == col:
                    self.fail(
                        'Results.{} requires column "{}" but does not '
                        'specify this.'.format(cmd, col))
Exemplo n.º 22
0
 def test_excise(self):
     """Tests that excising an n-gram removes every n-gram containing
     it while leaving all other results untouched."""
     rows = (
         ['AB', '2', 'T1', 'wit1', '4', 'A'],
         ['AC', '2', 'T1', 'wit1', '3', 'A'],
         ['ABde', '3', 'T1', 'wit1', '1', 'A'],
         ['dDe', '3', 'T1', 'wit1', '2', 'A'],
         ['Dde', '3', 'T1', 'wit1', '1', 'A'],
         ['ABdeD', '4', 'T1', 'wit1', '1', 'A'],
         ['deAB', '3', 'T2', 'wit1', '2', 'B'],
         ['deAB', '3', 'T2', 'wit2', '2', 'B'],
     )
     results = tacl.Results(self._create_csv(rows), self._tokenizer)
     results.excise('de')
     expected = [
         tacl.constants.QUERY_FIELDNAMES,
         ('AB', '2', 'T1', 'wit1', '4', 'A'),
         ('AC', '2', 'T1', 'wit1', '3', 'A'),
         ('dDe', '3', 'T1', 'wit1', '2', 'A'),
     ]
     self.assertEqual(self._get_rows_from_results(results), expected)
Exemplo n.º 23
0
 def _perform_reduce(self, input_data, tokenizer):
     """Returns the rows left after reducing `input_data`."""
     results = tacl.Results(self._create_csv(input_data), tokenizer)
     results.reduce()
     return self._get_rows_from_results(results)
Exemplo n.º 24
0
 def test_reciprocal_remove(self):
     """Tests that reciprocal remove keeps only n-grams attested (with
     a non-zero count) under every label."""
     rows = (
         ['AB', '2', 'a', 'base', '5', 'A'],
         ['ABCDEF', '6', 'a', 'base', '7', 'A'],
         ['DEF', '3', 'a', 'base', '2', 'A'],
         ['GHIJ', '4', 'a', 'base', '3', 'A'],
         ['KLM', '3', 'b', 'base', '0', 'A'],
         ['ABCDEF', '6', 'b', 'base', '3', 'B'],
         ['GHIJ', '4', 'b', 'base', '2', 'B'],
         ['KLM', '3', 'b', 'base', '17', 'B'],
     )
     results = tacl.Results(self._create_csv(rows), self._tokenizer)
     results.reciprocal_remove()
     expected = [
         tacl.constants.QUERY_FIELDNAMES,
         ('ABCDEF', '6', 'a', 'base', '7', 'A'),
         ('GHIJ', '4', 'a', 'base', '3', 'A'),
         ('ABCDEF', '6', 'b', 'base', '3', 'B'),
         ('GHIJ', '4', 'b', 'base', '2', 'B'),
     ]
     actual = self._get_rows_from_results(results)
     self.assertEqual(set(actual), set(expected))
     # More than two labels, and more than one text per label.
     rows = (
         ['AB', '2', 'a', 'base', '5', 'A'],
         ['ABCDEF', '6', 'a', 'base', '7', 'A'],
         ['DEF', '3', 'a', 'base', '2', 'A'],
         ['AB', '2', 'b', 'base', '6', 'A'],
         ['GHIJ', '4', 'b', 'base', '3', 'A'],
         ['KLM', '3', 'b', 'base', '0', 'A'],
         ['ABCDEF', '6', 'c', 'base', '3', 'B'],
         ['KLM', '3', 'c', 'base', '17', 'B'],
         ['GHIJ', '4', 'd', 'base', '2', 'B'],
         ['KLM', '3', 'e', 'base', '3', 'C'],
         ['GHIJ', '4', 'f', 'base', '11', 'C'],
         ['ABCDEF', '6', 'g', 'base', '8', 'C'],
     )
     results = tacl.Results(self._create_csv(rows), self._tokenizer)
     results.reciprocal_remove()
     expected = [
         tacl.constants.QUERY_FIELDNAMES,
         ('ABCDEF', '6', 'a', 'base', '7', 'A'),
         ('GHIJ', '4', 'b', 'base', '3', 'A'),
         ('ABCDEF', '6', 'c', 'base', '3', 'B'),
         ('GHIJ', '4', 'd', 'base', '2', 'B'),
         ('GHIJ', '4', 'f', 'base', '11', 'C'),
         ('ABCDEF', '6', 'g', 'base', '8', 'C'),
     ]
     actual = self._get_rows_from_csv(results.csv(io.StringIO(newline='')))
     self.assertEqual(set(actual), set(expected))
     # Now with variants.
     rows = (
         ['AB', '2', 'a', 'base', '5', 'A'],
         ['ABCDEF', '6', 'a', 'wit1', '7', 'A'],
         ['DEF', '3', 'a', 'base', '2', 'A'],
         ['AB', '2', 'b', 'base', '6', 'A'],
         ['GHIJ', '4', 'b', 'base', '3', 'A'],
         ['KLM', '3', 'b', 'base', '0', 'A'],
         ['ABCDEF', '6', 'c', 'base', '3', 'B'],
         ['KLM', '3', 'c', 'base', '17', 'B'],
         ['GHIJ', '4', 'd', 'base', '2', 'B'],
         ['KLM', '3', 'e', 'base', '3', 'C'],
         ['GHIJ', '4', 'f', 'wit2', '11', 'C'],
         ['ABCDEF', '6', 'g', 'base', '8', 'C'],
     )
     results = tacl.Results(self._create_csv(rows), self._tokenizer)
     results.reciprocal_remove()
     expected = [
         tacl.constants.QUERY_FIELDNAMES,
         ('ABCDEF', '6', 'a', 'wit1', '7', 'A'),
         ('GHIJ', '4', 'b', 'base', '3', 'A'),
         ('ABCDEF', '6', 'c', 'base', '3', 'B'),
         ('GHIJ', '4', 'd', 'base', '2', 'B'),
         ('GHIJ', '4', 'f', 'wit2', '11', 'C'),
         ('ABCDEF', '6', 'g', 'base', '8', 'C'),
     ]
     actual = self._get_rows_from_csv(results.csv(io.StringIO(newline='')))
     self.assertEqual(set(actual), set(expected))