Example #1
 def test_has_ngrams(self):
     store = tacl.DataStore(':memory:')
     store._conn = MagicMock(spec_set=sqlite3.Connection)
     cursor = store._conn.execute.return_value
     store._conn.execute.return_value = cursor
     # Path one: there are n-grams.
     cursor.fetchone.return_value = True
     actual_result = store._has_ngrams(sentinel.text_id, sentinel.size)
     self.assertEqual(store._conn.mock_calls, [
         call.execute(tacl.constants.SELECT_HAS_NGRAMS_SQL,
                      [sentinel.text_id, sentinel.size]),
         call.execute().fetchone()
     ])
     self.assertEqual(actual_result, True)
     # Path two: there are no n-grams.
     store._conn.reset_mock()
     cursor.reset_mock()
     cursor.fetchone.return_value = None
     actual_result = store._has_ngrams(sentinel.text_id, sentinel.size)
     self.assertEqual(store._conn.mock_calls, [
         call.execute(tacl.constants.SELECT_HAS_NGRAMS_SQL,
                      [sentinel.text_id, sentinel.size]),
         call.execute().fetchone()
     ])
     self.assertEqual(actual_result, False)
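A minimal sketch of what _has_ngrams might look like, inferred only from the mock calls asserted above (the constant name comes from the test; the actual tacl implementation may differ):

    def _has_ngrams(self, text_id, size):
        # Sketch: self._conn is assumed to be an open sqlite3 connection and
        # tacl.constants.SELECT_HAS_NGRAMS_SQL the query used in the test.
        row = self._conn.execute(tacl.constants.SELECT_HAS_NGRAMS_SQL,
                                 [text_id, size]).fetchone()
        return row is not None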
Example #2
 def test_add_ngrams_with_catalogue(self):
     catalogue = tacl.Catalogue({'T1': 'A', 'T5': 'B'})
     store = tacl.DataStore(':memory:')
     store.add_ngrams(self._corpus, 1, 1, catalogue)
     store._conn.row_factory = None
     actual_rows = store._conn.execute(
         'SELECT Text.work, Text.siglum, Text.checksum, Text.label, '
         'TextNGram.ngram, TextNGram.size, TextNGram.count '
         'FROM Text, TextNGram WHERE Text.id = TextNGram.text').fetchall()
     expected_rows = [
         ('T1', 'base', '705c89d665a5300516fe7314f84ebce0', '', 't', 1, 2),
         ('T1', 'base', '705c89d665a5300516fe7314f84ebce0', '', 'h', 1, 1),
         ('T1', 'base', '705c89d665a5300516fe7314f84ebce0', '', 'e', 1, 3),
         ('T1', 'base', '705c89d665a5300516fe7314f84ebce0', '', 'n', 1, 2),
         ('T1', 'base', '705c89d665a5300516fe7314f84ebce0', '', 'w', 1, 2),
         ('T1', 'a', 'e898b184b8d4d3ab5fea9d79fd645135', '', 't', 1, 2),
         ('T1', 'a', 'e898b184b8d4d3ab5fea9d79fd645135', '', 'h', 1, 1),
         ('T1', 'a', 'e898b184b8d4d3ab5fea9d79fd645135', '', 'e', 1, 3),
         ('T1', 'a', 'e898b184b8d4d3ab5fea9d79fd645135', '', 'w', 1, 2),
         ('T1', 'a', 'e898b184b8d4d3ab5fea9d79fd645135', '', 'n', 1, 1),
         ('T5', 'base', '1b42a11f5f647e53d20da8c8f57a9f02', '', 'w', 1, 1),
         ('T5', 'base', '1b42a11f5f647e53d20da8c8f57a9f02', '', 'e', 1, 1),
         ('T5', 'base', '1b42a11f5f647e53d20da8c8f57a9f02', '', 'l', 1, 2),
     ]
     self.assertEqual(set(actual_rows), set(expected_rows))
Example #3
 def _compare_results(self,
                      expected_dir_name,
                      minimum,
                      maximum,
                      catalogue,
                      seen_pairs,
                      db_name='test.db'):
     expected_dir = os.path.join(self._data_dir, 'expected',
                                 expected_dir_name)
     corpus = tacl.Corpus(os.path.join(self._data_dir, 'corpus'),
                          self._tokenizer)
     with tempfile.TemporaryDirectory() as temp_dir:
         if db_name is None:
             data_store = None
         else:
             data_store = tacl.DataStore(os.path.join(temp_dir, db_name),
                                         False)
             data_store.add_ngrams(corpus, minimum, maximum)
         actual_dir = os.path.join(temp_dir, 'actual')
         tracker_path = os.path.join(actual_dir, 'tracker.csv')
         if seen_pairs:
             os.makedirs(actual_dir, exist_ok=True)
             with open(tracker_path, 'w') as fh:
                 fh.writelines(
                     ['{},{}\n'.format(a, b) for a, b in seen_pairs])
         pi = PairedIntersector(data_store, corpus, self._tokenizer,
                                catalogue, actual_dir, tracker_path, 1, 1)
         pi.intersect_all()
         self._compare_results_dirs(actual_dir, expected_dir)
Example #4
 def test_intersection_supplied_one_label(self):
     filenames = ['a.csv']
     labels = ['A']
     store = tacl.DataStore(':memory:')
     output_fh = MagicMock(name='fh')
     self.assertRaises(MalformedQueryError, store.intersection_supplied,
                       filenames, labels, output_fh)
Example #5
 def test_reduce_diff_size(self):
     # Consider a diff where the smallest gram for a witness is
     # larger than the smallest gram across all witnesses:
     #   abdef vs abcbdef
     store = tacl.DataStore(':memory:')
     tokenizer = tacl.Tokenizer(*tacl.constants.TOKENIZERS['cbeta'])
     input_data = (
         ['abd', '3', 'a', 'base', '1', 'A'],
         ['abde', '4', 'a', 'base', '1', 'A'],
         ['abdef', '5', 'a', 'base', '1', 'A'],
         ['bc', '2', 'b', 'base', '1', 'B'],
         ['cb', '2', 'b', 'base', '1', 'B'],
         ['abc', '3', 'b', 'base', '1', 'B'],
         ['bcb', '3', 'b', 'base', '1', 'B'],
         ['cbd', '3', 'b', 'base', '1', 'B'],
         ['abcb', '4', 'b', 'base', '1', 'B'],
         ['bcbd', '4', 'b', 'base', '1', 'B'],
         ['cbde', '4', 'b', 'base', '1', 'B'],
         ['abcbd', '5', 'b', 'base', '1', 'B'],
         ['bcbde', '5', 'b', 'base', '1', 'B'],
         ['cbdef', '5', 'b', 'base', '1', 'B'],
         ['abcbde', '6', 'b', 'base', '1', 'B'],
         ['bcbdef', '6', 'b', 'base', '1', 'B'],
         ['abcbdef', '7', 'b', 'base', '1', 'B'],
     )
     expected_rows = [
         tacl.constants.QUERY_FIELDNAMES,
         ('abd', '3', 'a', 'base', '1', 'A'),
         ('bc', '2', 'b', 'base', '1', 'B'),
         ('cb', '2', 'b', 'base', '1', 'B'),
         ('bcb', '3', 'b', 'base', '1', 'B')
     ]
     actual_rows = self._reduce_diff(store, input_data, tokenizer)
     self.assertEqual(set(actual_rows), set(expected_rows))
Example #6
 def test_add_ngrams_with_catalogue(self):
     add_indices = self._create_patch('tacl.DataStore._add_indices')
     add_text_ngrams = self._create_patch('tacl.DataStore._add_text_ngrams')
     analyse = self._create_patch('tacl.DataStore._analyse')
     initialise = self._create_patch('tacl.DataStore._initialise_database')
     text1 = MagicMock(spec_set=tacl.WitnessText)
     text1.get_names = MagicMock(name='get_names')
     text1.get_names.return_value = ['T1', 'wit1']
     text2 = MagicMock(spec_set=tacl.WitnessText)
     text2.get_names = MagicMock(name='get_names')
     text2.get_names.return_value = ['T1', 'wit2']
     corpus = MagicMock(spec_set=tacl.Corpus)
     corpus.get_witnesses = MagicMock(name='get_witnesses')
     corpus.get_witnesses.return_value = iter([text1, text2])
     store = tacl.DataStore(':memory:')
     catalogue = tacl.Catalogue({'T1': 'A'})
     store.add_ngrams(corpus, 2, 3, catalogue)
     initialise.assert_called_once_with(store)
     corpus.get_witnesses.assert_called_once_with(
         'T1', text_class=tacl.WitnessText)
     add_text_ngrams.assert_has_calls(
         [call(store, text1, 2, 3),
          call(store, text2, 2, 3)])
     add_indices.assert_called_once_with(store)
     analyse.assert_called_once_with(store)
Example #7
 def test_set_labels(self):
     catalogue = collections.OrderedDict([(sentinel.text1, sentinel.label1),
                                          (sentinel.text2, sentinel.label2),
                                          (sentinel.text3, sentinel.label1)
                                          ])
     store = tacl.DataStore(':memory:')
     store._conn = MagicMock(spec_set=sqlite3.Connection)
     cursor = store._conn.execute.return_value
     store._conn.execute.return_value = cursor
     cursor.fetchone.return_value = {'token_count': 10}
     actual_labels = store._set_labels(catalogue)
     expected_labels = {sentinel.label1: 20, sentinel.label2: 10}
     connection_calls = [
         call.execute(tacl.constants.UPDATE_LABELS_SQL, ['']),
         call.execute(tacl.constants.UPDATE_LABEL_SQL,
                      [sentinel.label1, sentinel.text1]),
         call.execute(tacl.constants.SELECT_TEXT_TOKEN_COUNT_SQL,
                      [sentinel.text1]),
         call.execute(tacl.constants.UPDATE_LABEL_SQL,
                      [sentinel.label2, sentinel.text2]),
         call.execute(tacl.constants.SELECT_TEXT_TOKEN_COUNT_SQL,
                      [sentinel.text2]),
         call.execute(tacl.constants.UPDATE_LABEL_SQL,
                      [sentinel.label1, sentinel.text3]),
         call.execute(tacl.constants.SELECT_TEXT_TOKEN_COUNT_SQL,
                      [sentinel.text3]),
     ]
     for connection_call in connection_calls:
         self.assertIn(connection_call, store._conn.mock_calls)
     self.assertEqual(actual_labels, expected_labels)
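The asserted calls suggest the shape of _set_labels: clear all labels, then label each catalogued text and accumulate its token count per label. A hedged sketch consistent with those assertions (not the actual tacl source, which may also commit or log):

    def _set_labels(self, catalogue):
        # Sketch inferred from the mock calls asserted above.
        self._conn.execute(tacl.constants.UPDATE_LABELS_SQL, [''])
        labels = {}
        for work, label in catalogue.items():
            self._conn.execute(tacl.constants.UPDATE_LABEL_SQL, [label, work])
            row = self._conn.execute(
                tacl.constants.SELECT_TEXT_TOKEN_COUNT_SQL, [work]).fetchone()
            labels[label] = labels.get(label, 0) + row['token_count']
        return labels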
Example #8
 def test_intersection(self):
     labels = [sentinel.label1, sentinel.label2]
     set_labels = self._create_patch('tacl.DataStore._set_labels')
     set_labels.return_value = {}
     sort_labels = self._create_patch('tacl.DataStore._sort_labels', False)
     sort_labels.return_value = labels
     get_placeholders = self._create_patch(
         'tacl.DataStore._get_placeholders', False)
     get_placeholders.return_value = sentinel.placeholders
     log_query_plan = self._create_patch('tacl.DataStore._log_query_plan',
                                         False)
     input_fh = MagicMock(name='fh')
     csv = self._create_patch('tacl.DataStore._csv', False)
     csv.return_value = input_fh
     catalogue = MagicMock(name='catalogue')
     store = tacl.DataStore(':memory:')
     store._conn = MagicMock(spec_set=sqlite3.Connection)
     cursor = store._conn.execute.return_value
     output_fh = store.intersection(catalogue, input_fh)
     set_labels.assert_called_once_with(store, catalogue)
     get_placeholders.assert_called_once_with(labels)
     log_query_plan.assert_called_once()
     sql = (
         'SELECT TextNGram.ngram, TextNGram.size, TextNGram.count, '
         'Text.name AS "text name", Text.siglum, Text.label '
         'FROM Text, TextNGram '
         'WHERE Text.label IN (sentinel.placeholders) '
         'AND Text.id = TextNGram.text '
         'AND TextNGram.ngram IN ('
         'SELECT TextNGram.ngram FROM Text, TextNGram '
         'WHERE Text.label = ? AND Text.id = TextNGram.text '
         'AND TextNGram.ngram IN ('
         'SELECT TextNGram.ngram FROM Text, TextNGram '
         'WHERE Text.label = ? AND Text.id = TextNGram.text))')
     self.assertEqual(store._conn.mock_calls,
                      [call.execute(sql, labels * 2)])
     csv.assert_called_once_with(cursor, tacl.constants.QUERY_FIELDNAMES,
                                 input_fh)
     self.assertEqual(input_fh, output_fh)
Example #9
 def test_diff_asymmetric(self):
     labels = {sentinel.label: 1, sentinel.prime_label: 1}
     set_labels = self._create_patch('tacl.DataStore._set_labels')
     set_labels.return_value = labels
     get_placeholders = self._create_patch(
         'tacl.DataStore._get_placeholders', False)
     get_placeholders.return_value = sentinel.placeholders
     log_query_plan = self._create_patch('tacl.DataStore._log_query_plan',
                                         False)
     input_fh = MagicMock(name='fh')
     csv = self._create_patch('tacl.DataStore._csv', False)
     csv.return_value = input_fh
     catalogue = MagicMock(name='catalogue')
     store = tacl.DataStore(':memory:')
     store._conn = MagicMock(spec_set=sqlite3.Connection)
     cursor = store._conn.execute.return_value
     output_fh = store.diff_asymmetric(catalogue, sentinel.prime_label,
                                       input_fh)
     set_labels.assert_called_once_with(store, catalogue)
     get_placeholders.assert_called_once_with([sentinel.label])
     log_query_plan.assert_called_once()
     sql = tacl.constants.SELECT_DIFF_ASYMMETRIC_SQL.format(
         sentinel.placeholders)
     self.assertEqual(store._conn.mock_calls, [
         call.execute(
             sql,
             [sentinel.prime_label, sentinel.prime_label, sentinel.label])
     ])
     csv.assert_called_once_with(cursor, tacl.constants.QUERY_FIELDNAMES,
                                 input_fh)
     self.assertEqual(input_fh, output_fh)
Example #10
 def test_validate_true(self):
     corpus = MagicMock(spec_set=tacl.Corpus)
     text = MagicMock(spec_set=tacl.WitnessText)
     text.get_checksum.return_value = sentinel.checksum
     text.get_names.return_value = (sentinel.name, sentinel.siglum)
     corpus.get_witnesses.return_value = (text, )
     catalogue = collections.OrderedDict([(sentinel.text1, sentinel.label1),
                                          (sentinel.text2, sentinel.label2),
                                          (sentinel.text3, sentinel.label1)
                                          ])
     store = tacl.DataStore(':memory:')
     store._conn = MagicMock(spec_set=sqlite3.Connection)
     cursor = store._conn.execute.return_value
     cursor.fetchone.return_value = {'checksum': sentinel.checksum}
     actual_result = store.validate(corpus, catalogue)
     corpus.get_witnesses.assert_has_calls(
         [call(sentinel.text1),
          call(sentinel.text2),
          call(sentinel.text3)])
     self.assertEqual(store._conn.mock_calls, [
         call.execute(tacl.constants.SELECT_TEXT_SQL,
                      [sentinel.name, sentinel.siglum]),
         call.execute().fetchone(),
         call.execute(tacl.constants.SELECT_TEXT_SQL,
                      [sentinel.name, sentinel.siglum]),
         call.execute().fetchone(),
         call.execute(tacl.constants.SELECT_TEXT_SQL,
                      [sentinel.name, sentinel.siglum]),
         call.execute().fetchone()
     ])
     self.assertEqual(actual_result, True)
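Only the all-valid path is exercised above; a hedged sketch of the checksum comparison it implies (the real validate may report mismatches rather than simply returning False):

    def validate(self, corpus, catalogue):
        # Sketch: compare each witness's checksum against the stored row.
        for work in catalogue:
            for witness in corpus.get_witnesses(work):
                name, siglum = witness.get_names()
                row = self._conn.execute(tacl.constants.SELECT_TEXT_SQL,
                                         [name, siglum]).fetchone()
                if row is None or row['checksum'] != witness.get_checksum():
                    return False
        return True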
Example #11
 def test_diff_asymmetric(self):
     labels = {sentinel.label: 1, sentinel.prime_label: 1}
     set_labels = self._create_patch('tacl.DataStore._set_labels')
     set_labels.return_value = labels
     get_placeholders = self._create_patch(
         'tacl.DataStore._get_placeholders', False)
     get_placeholders.return_value = sentinel.placeholders
     log_query_plan = self._create_patch('tacl.DataStore._log_query_plan',
                                         False)
     input_fh = MagicMock(name='fh')
     catalogue = MagicMock(name='catalogue')
     store = tacl.DataStore(':memory:')
     store._conn = MagicMock(spec_set=sqlite3.Connection)
     tokenizer = MagicMock(name='tokenizer')
     _diff = self._create_patch('tacl.DataStore._diff', False)
     _diff.return_value = input_fh
     output_fh = store.diff_asymmetric(catalogue, sentinel.prime_label,
                                       tokenizer, input_fh)
     set_labels.assert_called_once_with(store, catalogue)
     get_placeholders.assert_called_once_with([sentinel.label])
     self.assertTrue(log_query_plan.called)
     sql = tacl.constants.SELECT_DIFF_ASYMMETRIC_SQL.format(
         sentinel.placeholders)
     self.assertEqual(store._conn.mock_calls, [
         call.execute(
             sql,
             [sentinel.prime_label, sentinel.prime_label, sentinel.label])
     ])
     self.assertTrue(_diff.called)
     self.assertEqual(input_fh, output_fh)
Example #12
 def test_check_diff_result(self):
     # Test the various possibilities that
     # DataStore._reduce_diff_results must handle.
     store = tacl.DataStore(':memory:')
     tokenizer = tacl.Tokenizer(*tacl.constants.TOKENIZERS['cbeta'])
     tokenize = tokenizer.tokenize
     join = tokenizer.joiner.join
     row = pd.Series(['ABC', 3, 'a', 'base', 1, 'A'],
                     index=tacl.constants.QUERY_FIELDNAMES)
     # N-gram is not composed of any existing (n-1)-gram.
     matches = {'CD': 1}
     actual_row = store._check_diff_result(row, matches, tokenize, join)
     self.assertEqual(actual_row['count'], 1)
     # N-gram is composed entirely of existing (n-1)-grams.
     matches = {'AB': 1, 'BC': 1, 'CD': 1}
     actual_row = store._check_diff_result(row, matches, tokenize, join)
     self.assertEqual(actual_row['count'], 1)
     # N-gram is only partly composed of existing (n-1)-grams.
     matches = {'AB': 1, 'CD': 1}
     actual_row = store._check_diff_result(row, matches, tokenize, join)
     self.assertEqual(actual_row['count'], 0)
     matches = {'BC': 1, 'CD': 1}
     actual_row = store._check_diff_result(row, matches, tokenize, join)
     self.assertEqual(actual_row['count'], 0)
     # N-gram is composed of one or more (n-1)-grams with count 0.
     matches = {'AB': 0, 'BC': 1, 'CD': 1}
     actual_row = store._check_diff_result(row, matches, tokenize, join)
     self.assertEqual(actual_row['count'], 0)
     matches = {'AB': 1, 'BC': 0, 'CD': 1}
     actual_row = store._check_diff_result(row, matches, tokenize, join)
     self.assertEqual(actual_row['count'], 0)
     matches = {'AB': 0, 'BC': 0, 'CD': 1}
     actual_row = store._check_diff_result(row, matches, tokenize, join)
     self.assertEqual(actual_row['count'], 0)
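Read together, these assertions pin down a rule: keep the row's count only when none of its constituent (n-1)-grams appear in matches, or when all of them appear with non-zero counts; otherwise zero it. A hedged sketch of that rule (helper names follow the test, not the tacl source):

    def _check_diff_result(self, row, matches, tokenize, join):
        # Sketch: row is a pandas Series with 'ngram' and 'count' fields.
        tokens = tokenize(row['ngram'])
        sub_ngrams = [join(tokens[:-1]), join(tokens[1:])]
        present = [matches[ngram] for ngram in sub_ngrams if ngram in matches]
        if present and (len(present) < len(sub_ngrams) or 0 in present):
            row['count'] = 0
        return row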
Example #13
 def test_delete_text_ngrams(self):
     store = tacl.DataStore(':memory:')
     store._conn = MagicMock(spec_set=sqlite3.Connection)
     store._delete_text_ngrams(sentinel.text_id)
     store._conn.execute.assert_has_calls([
         call(tacl.constants.DELETE_TEXT_NGRAMS_SQL, [sentinel.text_id]),
         call(tacl.constants.DELETE_TEXT_HAS_NGRAMS_SQL,
              [sentinel.text_id])])
Example #14
 def test_diff_one_label(self):
     catalogue = {'T1': 'A', 'T2': 'A'}
     store = tacl.DataStore(':memory:')
     output_fh = MagicMock(name='fh')
     set_labels = self._create_patch('tacl.DataStore._set_labels')
     set_labels.return_value = {'A': 2}
     self.assertRaises(MalformedQueryError, store.diff, catalogue,
                       output_fh)
Example #15
 def test_diff_supplied_one_label(self):
     filenames = ['a.csv']
     labels = ['A']
     store = tacl.DataStore(':memory:')
     tokenizer = tacl.Tokenizer(*tacl.constants.TOKENIZERS['cbeta'])
     output_fh = MagicMock(name='fh')
     self.assertRaises(MalformedQueryError, store.diff_supplied, filenames,
                       labels, tokenizer, output_fh)
Example #16
 def test_diff_one_label(self):
     catalogue = {'T1': 'A', 'T2': 'A'}
     store = tacl.DataStore(':memory:')
     output_fh = MagicMock(name='fh')
     tokenizer = tacl.Tokenizer(*tacl.constants.TOKENIZERS['cbeta'])
     set_labels = self._create_patch('tacl.DataStore._set_labels')
     set_labels.return_value = {'A': 2}
     self.assertRaises(MalformedQueryError, store.diff, catalogue,
                       tokenizer, output_fh)
Example #17
 def test_analyse(self):
     store = tacl.DataStore(':memory:')
     store._conn = MagicMock(spec_set=sqlite3.Connection)
     store._analyse()
     store._conn.execute.assert_called_once_with(
         tacl.constants.ANALYSE_SQL.format(''))
     store._conn.reset_mock()
     store._analyse(sentinel.table)
     store._conn.execute.assert_called_once_with(
         tacl.constants.ANALYSE_SQL.format(sentinel.table))
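Both assertions are satisfied by a one-line method; a minimal sketch, assuming ANALYSE_SQL takes the table name as its only format field:

    def _analyse(self, table=''):
        # Sketch: analyse the whole database by default, or a named table.
        self._conn.execute(tacl.constants.ANALYSE_SQL.format(table))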
Example #18
 def test_add_temporary_ngrams_not_duplicate(self):
     """Tests that duplicates n-grams are added only once to the temporary
     table."""
     store = tacl.DataStore(':memory:')
     input_ngrams = ['A', 'A']
     store._add_temporary_ngrams(input_ngrams)
     cursor = store._conn.execute('SELECT * FROM InputNGram')
     expected_ngrams = ['A']
     actual_ngrams = [row['ngram'] for row in cursor.fetchall()]
     self.assertEqual(actual_ngrams, expected_ngrams)
Example #19
 def setUp(self):
     self._tokenizer = tacl.Tokenizer(tacl.constants.TOKENIZER_PATTERN_CBETA,
                                      tacl.constants.TOKENIZER_JOINER_CBETA)
     self._data_dir = os.path.join(os.path.dirname(__file__), 'data')
     self._corpus = tacl.Corpus(os.path.join(self._data_dir, 'stripped'),
                                self._tokenizer)
     self._catalogue = tacl.Catalogue()
     self._catalogue.load(os.path.join(self._data_dir, 'catalogue.txt'))
     self._store = tacl.DataStore(':memory:')
     self._store.add_ngrams(self._corpus, 1, 3)
Example #20
 def test_sort_labels(self):
     store = tacl.DataStore(':memory:')
     label_data = {
         sentinel.label1: 2,
         sentinel.label2: 3,
         sentinel.label3: 1
     }
     actual_labels = store._sort_labels(label_data)
     expected_labels = [sentinel.label2, sentinel.label1, sentinel.label3]
     self.assertEqual(actual_labels, expected_labels)
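The expected ordering implies sorting labels by descending token count; a minimal sketch of _sort_labels consistent with that (tie-breaking behaviour is not covered by the test):

    def _sort_labels(self, label_data):
        # Sketch: label_data maps each label to its total token count.
        return sorted(label_data, key=label_data.get, reverse=True)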
Example #21
 def test_add_temporary_ngrams_empty(self):
     """Tests that n-grams that are empty strings are not added to the
     temporary table."""
     store = tacl.DataStore(':memory:')
     input_ngrams = ['', 'A']
     store._add_temporary_ngrams(input_ngrams)
     cursor = store._conn.execute('SELECT * FROM InputNGram')
     expected_ngrams = ['A']
     actual_ngrams = [row['ngram'] for row in cursor.fetchall()]
     self.assertEqual(actual_ngrams, expected_ngrams)
Example #22
 def test_add_temporary_ngrams(self):
     store = tacl.DataStore(':memory:')
     store._conn = MagicMock(spec_set=sqlite3.Connection)
     store._add_temporary_ngrams(['A', 'B'])
     self.assertEqual(store._conn.mock_calls, [
         call.execute(tacl.constants.DROP_TEMPORARY_NGRAMS_TABLE_SQL),
         call.execute(tacl.constants.CREATE_TEMPORARY_NGRAMS_TABLE_SQL),
         call.executemany(tacl.constants.INSERT_TEMPORARY_NGRAM_SQL,
                          [('A', ), ('B', )])
     ])
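A hedged sketch of _add_temporary_ngrams matching the exact call sequence asserted here; the de-duplication and empty-string filtering exercised by the other temporary-ngram tests might equally be enforced by the table schema, but are shown in Python purely for illustration:

    def _add_temporary_ngrams(self, ngrams):
        # Sketch: rebuild the temporary table and insert unique, non-empty
        # n-grams, one tuple per executemany row.
        ngrams = [ngram for ngram in dict.fromkeys(ngrams) if ngram]
        self._conn.execute(tacl.constants.DROP_TEMPORARY_NGRAMS_TABLE_SQL)
        self._conn.execute(tacl.constants.CREATE_TEMPORARY_NGRAMS_TABLE_SQL)
        self._conn.executemany(tacl.constants.INSERT_TEMPORARY_NGRAM_SQL,
                               [(ngram,) for ngram in ngrams])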
Example #23
 def test_diff_asymmetric_invalid_label(self):
     # Tests that the right error is raised when the supplied label
     # is not present in the catalogue.
     catalogue = {'T1': 'A', 'T2': 'B'}
     prime_label = 'C'
     input_fh = MagicMock(name='fh')
     store = tacl.DataStore(':memory:')
     set_labels = self._create_patch('tacl.DataStore._set_labels')
     set_labels.return_value = {'A': 1, 'B': 1}
     self.assertRaises(MalformedQueryError, store.diff_asymmetric,
                       catalogue, prime_label, input_fh)
Example #24
 def test_intersection_one_label(self):
     labels = [sentinel.label1]
     set_labels = self._create_patch('tacl.DataStore._set_labels')
     set_labels.return_value = {}
     sort_labels = self._create_patch('tacl.DataStore._sort_labels', False)
     sort_labels.return_value = labels
     output_fh = MagicMock(name='fh')
     catalogue = MagicMock(name='catalogue')
     store = tacl.DataStore(':memory:')
     self.assertRaises(MalformedQueryError, store.intersection, catalogue,
                       output_fh)
Example #25
 def test_delete_text_ngrams(self):
     store = tacl.DataStore(':memory:')
     store._conn = MagicMock(spec_set=sqlite3.Connection)
     store._delete_text_ngrams(sentinel.text_id)
     expected_calls = [
         call.execute(tacl.constants.DELETE_TEXT_NGRAMS_SQL,
                      [sentinel.text_id]),
         call.execute(tacl.constants.DELETE_TEXT_HAS_NGRAMS_SQL,
                      [sentinel.text_id]),
         call.commit()
     ]
     self.assertEqual(store._conn.mock_calls, expected_calls)
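The expected_calls above fully determine the call sequence; a minimal sketch:

    def _delete_text_ngrams(self, text_id):
        # Sketch: remove the text's n-grams and bookkeeping rows, then commit.
        self._conn.execute(tacl.constants.DELETE_TEXT_NGRAMS_SQL, [text_id])
        self._conn.execute(tacl.constants.DELETE_TEXT_HAS_NGRAMS_SQL,
                           [text_id])
        self._conn.commit()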
Example #26
 def test_add_text_size_ngrams(self):
     store = tacl.DataStore(':memory:')
     store._conn = MagicMock(spec_set=sqlite3.Connection)
     size = 1
     ngrams = collections.OrderedDict([('a', 2), ('b', 1)])
     store._add_text_size_ngrams(sentinel.text_id, size, ngrams)
     store._conn.execute.assert_called_once_with(
         tacl.constants.INSERT_TEXT_HAS_NGRAM_SQL,
         [sentinel.text_id, size, len(ngrams)])
     store._conn.executemany.assert_called_once_with(
         tacl.constants.INSERT_NGRAM_SQL,
         [[sentinel.text_id, 'a', size, 2],
          [sentinel.text_id, 'b', size, 1]])
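A minimal sketch of _add_text_size_ngrams inferred from the two assertions above, assuming ngrams maps each n-gram to its count in the witness:

    def _add_text_size_ngrams(self, text_id, size, ngrams):
        # Sketch: record that the text has n-grams of this size, then bulk
        # insert the individual n-gram counts.
        self._conn.execute(tacl.constants.INSERT_TEXT_HAS_NGRAM_SQL,
                           [text_id, size, len(ngrams)])
        self._conn.executemany(
            tacl.constants.INSERT_NGRAM_SQL,
            [[text_id, ngram, size, count]
             for ngram, count in ngrams.items()])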
Example #27
 def test_update_text_record(self):
     store = tacl.DataStore(':memory:')
     store._conn = MagicMock(spec_set=sqlite3.Connection)
     text = MagicMock(spec_set=tacl.WitnessText)
     text.get_checksum.return_value = sentinel.checksum
     tokens = [sentinel.token]
     text.get_tokens.return_value = tokens
     store._update_text_record(text, sentinel.text_id)
     self.assertEqual(
         text.mock_calls,
         [call.get_checksum(), call.get_tokens()])
     store._conn.execute.assert_called_once_with(
         tacl.constants.UPDATE_TEXT_SQL,
         [sentinel.checksum,
          len(tokens), sentinel.text_id])
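A minimal sketch of _update_text_record consistent with the asserted call order (get_checksum before get_tokens) and the single UPDATE:

    def _update_text_record(self, text, text_id):
        # Sketch: refresh the stored checksum and token count for a text row.
        self._conn.execute(tacl.constants.UPDATE_TEXT_SQL,
                           [text.get_checksum(), len(text.get_tokens()),
                            text_id])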
Example #28
 def test_add_temporary_ngrams_twice(self):
     # Test that multiple calls to the method succeed.
     store = tacl.DataStore(':memory:')
     input_ngrams = ['禁律', '律藏也']
     store._add_temporary_ngrams(input_ngrams)
     cursor = store._conn.execute('SELECT * FROM InputNGram')
     expected_ngrams = set(input_ngrams)
     actual_ngrams = set([row['ngram'] for row in cursor.fetchall()])
     self.assertEqual(actual_ngrams, expected_ngrams)
     input_ngrams = ['每', '以示']
     store._add_temporary_ngrams(input_ngrams)
     cursor = store._conn.execute('SELECT * FROM InputNGram')
     expected_ngrams = set(input_ngrams)
     actual_ngrams = set([row['ngram'] for row in cursor.fetchall()])
     self.assertEqual(actual_ngrams, expected_ngrams)
Example #29
 def _compare_results(self, max_works, expected_dir_name):
     expected_dir = os.path.join(self._data_dir, 'expected',
                                 expected_dir_name)
     corpus = tacl.Corpus(self._corpus, self._tokenizer)
     catalogue = tacl.Catalogue()
     catalogue.load(self._catalogue)
     with tempfile.TemporaryDirectory() as temp_dir:
         data_store = tacl.DataStore(os.path.join(temp_dir, 'test.db'),
                                     False)
         data_store.add_ngrams(corpus, 1, 1)
         output_dir = os.path.join(temp_dir, 'output')
         test = paternity.PaternityTest(data_store, catalogue,
                                        self._tokenizer, 'P', 'C', 'U',
                                        max_works, output_dir)
         test.process()
         self._compare_results_dirs(output_dir, expected_dir)
Example #30
 def _compare_results(self, corpus_dir, catalogue_name):
     """Compare all of the actual results files with the expected
     versions."""
     expected_dir = os.path.join(self._data_dir, 'expected')
     corpus = tacl.Corpus(os.path.join(self._data_dir, corpus_dir),
                          self._tokenizer)
     catalogue = tacl.Catalogue()
     catalogue.load(os.path.join(self._data_dir, catalogue_name))
     with tempfile.TemporaryDirectory() as temp_dir:
         data_store = tacl.DataStore(os.path.join(temp_dir, 'test.db'),
                                     False)
         data_store.add_ngrams(corpus, 1, 1)
         output_dir = os.path.join(temp_dir, 'output')
         reporter = lifetime.LifetimeReporter(data_store, catalogue,
                                              self._tokenizer, output_dir)
         reporter.process()
         self._compare_results_dirs(output_dir, expected_dir)