def test_write_database(tmp_path, mini_testsuite, empty_alt_testsuite):
    """Write a full database, a subset of tables, then an alternate schema."""
    tmp_ts = tmp_path.joinpath('test_write_database')
    db = tsdb.Database(mini_testsuite)
    # writing the whole database creates the directory, the relations
    # file, and one file per table
    tsdb.write_database(db, str(tmp_ts))
    assert tmp_ts.is_dir()
    assert tmp_ts.joinpath('relations').is_file()
    assert tmp_ts.joinpath('item').is_file()
    assert tmp_ts.joinpath('parse').is_file()
    assert tmp_ts.joinpath('result').is_file()
    assert tmp_ts.joinpath('parse').read_text() == (
        '10@10@1\n'
        '20@20@0\n'
        '30@30@1\n')
    # restricting *names* writes only those tables; previously written
    # tables not in *names* are removed from the destination
    tsdb.write_database(db, str(tmp_ts), names=['item'])
    assert tmp_ts.joinpath('item').is_file()
    assert not tmp_ts.joinpath('parse').is_file()
    assert not tmp_ts.joinpath('result').is_file()
    # alt_schema drops i-wf field from mini_testsuite's schema
    alt_schema = tsdb.read_schema(empty_alt_testsuite)
    tsdb.write_database(db, str(tmp_ts), names=['item'], schema=alt_schema)
    alt_db = tsdb.Database(str(tmp_ts))
    # the source schema is unchanged; the written database uses the
    # 3-field alternate schema
    assert len(db.schema['item']) == 4
    assert len(alt_db.schema['item']) == 3
    assert tmp_ts.joinpath('item').read_text() == (
        '10@It rained.@1-feb-2018 15:00\n'
        '20@Rained.@01-02-18 15:00:00\n'
        '30@It snowed.@2018-2-1 (15:00:00)\n')
def customize_itsdb(grammar_path):
    """Write a [incr tsdb()] skeleton for the choices' sentences.

    The skeleton is created under ``<grammar_path>/tsdb/skeletons/matrix``
    using the schema found in the skeletons' ``Relations`` file.  Nothing
    is done when the choices contain no ``sentence`` section.
    """
    if 'sentence' not in ch:
        return

    today = datetime.datetime.today()
    author = 'Grammar Matrix Customization System'

    def make_item(sentence, item_id):
        # one item record per sentence; starred sentences are marked
        # ill-formed (i-wf = 0) and get no category
        starred = sentence.get('star', False)
        return {
            'i-id': str(item_id),
            'i-origin': 'unknown',
            'i-register': 'unknown',
            'i-format': 'none',
            'i-difficulty': '1',
            'i-category': '' if starred else 'S',
            'i-input': sentence['orth'],
            'i-wf': '0' if starred else '1',
            'i-length': str(len(sentence['orth'].split())),
            'i-author': author,
            'i-date': today,
        }

    skeleton_dir = os.path.join(grammar_path, 'tsdb', 'skeletons')
    matrix_dir = os.path.join(skeleton_dir, 'matrix')
    schema = tsdb.read_schema(os.path.join(skeleton_dir, 'Relations'))
    tsdb.initialize_database(matrix_dir, schema=schema)
    item_fields = schema['item']
    records = [tsdb.make_record(make_item(sentence, item_id), item_fields)
               for item_id, sentence in enumerate(ch['sentence'], 1)]
    tsdb.write(matrix_dir, 'item', records, item_fields)
def test_write(single_item_skeleton):
    """Exercise tsdb.write(): plain writes, appending, and gzipping."""
    # NOTE: renamed local from `dir` to `ts_dir` — `dir` shadows the builtin
    ts_dir = pathlib.Path(single_item_skeleton)
    fields = tsdb.read_schema(ts_dir)['item']
    path = ts_dir.joinpath('item')
    tsdb.write(ts_dir, 'item', [(0, 'The cat meows.')], fields)
    with tsdb.open(ts_dir, 'item') as fh:
        assert list(fh) == ['0@The cat meows.\n']
    # append=True adds records to the existing table
    tsdb.write(ts_dir, 'item', [(1, 'The wolf howls.')], fields, append=True)
    with tsdb.open(ts_dir, 'item') as fh:
        assert list(fh) == ['0@The cat meows.\n', '1@The wolf howls.\n']
    # cannot append and gzip at same time
    with pytest.raises(NotImplementedError):
        tsdb.write(ts_dir, 'item', [], fields, gzip=True, append=True)
    # gzip=True replaces the plain file with a .gz file
    tsdb.write(ts_dir, 'item', [(0, 'The cat meows.')], fields, gzip=True)
    assert not path.with_suffix('').exists()
    assert path.with_suffix('.gz').exists()
    # cannot append to existing gzipped file
    with pytest.raises(NotImplementedError):
        tsdb.write(ts_dir, 'item', [], fields, append=True)
    # writing without gzip replaces the .gz file with a plain one
    tsdb.write(ts_dir, 'item', [(0, 'The cat meows.')], fields)
    assert path.with_suffix('').exists()
    assert not path.with_suffix('.gz').exists()
    tsdb.write(ts_dir, 'item', [(0, 'The cat meows.')], fields, gzip=False)
    assert not path.with_suffix('.gz').exists()
    assert path.with_suffix('').exists()
    # empty tables are never gzipped, even when requested
    tsdb.write(ts_dir, 'item', [], fields, gzip=True)
    assert not path.with_suffix('.gz').exists()
    assert path.with_suffix('').exists()
def _interpret_selection(select, source):
    """Parse a TSQL *select* string against the schema at *source*.

    Returns a ``(column, relation, condition)`` triple, where *relation*
    is resolved from the projection, the query's relations, or — as a
    last resort — the first table in the schema containing *column*.
    """
    schema = tsdb.read_schema(source)
    query = tsql.inspect_query('select ' + select)
    projection = query['projection']
    if projection == '*' or len(projection) != 1:
        raise CommandError("select query must return a single column")
    relation, _, column = projection[0].rpartition('.')
    if not relation:
        # query could be 'i-input from item' instead of 'item.i-input'
        relations = query['relations']
        if len(relations) > 1:
            raise CommandError(
                "select query may specify no more than 1 relation")
        elif len(relations) == 1:
            relation = relations[0]
        else:
            # no relation specified anywhere; guess from the schema
            relation = next(
                (table for table, fields in schema.items()
                 if any(f.name == column for f in fields)),
                None)
    if relation not in schema:
        raise CommandError('invalid or missing relation in query')
    if all(f.name != column for f in schema[relation]):
        raise CommandError(f'invalid column in query: {column}')
    # the condition is everything after the first ' where ', if any
    _, sep, tail = select.partition(' where ')
    condition = tail if sep else ''
    return column, relation, condition
def test_make_record(empty_testsuite):
    """make_record() maps a column dict onto a relation's field order."""
    relations_path = pathlib.Path(empty_testsuite, 'relations')
    schema = tsdb.read_schema(relations_path)
    item_fields = schema['item']
    # known columns fill their positions regardless of dict order
    record = tsdb.make_record({'i-input': 'one', 'i-id': 100}, item_fields)
    assert record == (100, 'one')
    # missing columns become None
    assert tsdb.make_record({'i-id': 100}, item_fields) == (100, None)
    # columns not in the relation are ignored
    columns = {'i-id': 100, 'mrs': '[RELS: < > HCONS: < >]'}
    assert tsdb.make_record(columns, item_fields) == (100, None)
def test_issue_285(empty_testsuite):
    """A trailing carriage return in a field survives a write/read cycle."""
    fields = tsdb.read_schema(empty_testsuite)['item']
    tsdb.write(empty_testsuite, 'item', [(0, 'The cat meows.\r')], fields)
    # tsdb.open() returns an open file handle usable as a context manager
    fh = tsdb.open(empty_testsuite, 'item')
    assert not fh.closed
    with fh:
        # the \r is preserved before the record's terminating newline
        assert list(fh) == ['0@The cat meows.\r\n']
    # exiting the with-block closes the handle
    assert fh.closed
def test_split(empty_testsuite):
    """tsdb.split() decodes one raw line into a tuple of column values."""
    # an empty line yields a single empty (None) column
    assert tsdb.split('') == (None, )
    assert tsdb.split('one') == ('one', )
    # non-ASCII text passes through unchanged
    assert tsdb.split(u'あ') == (u'あ', )
    # '@' is the column separator; empty columns decode to None
    assert tsdb.split('one@two') == ('one', 'two')
    assert tsdb.split('one@@three') == ('one', None, 'three')
    # TSDB escapes are decoded: \s -> '@', \\ -> '\', \n -> newline
    assert (tsdb.split('one\\s@\\\\two\\nabc')
            == ('one@', '\\two\nabc'))
    # with *fields*, values are cast per the schema (e.g. :integer)
    rels = tsdb.read_schema(empty_testsuite)
    assert tsdb.split('10@one', fields=rels['item']) == (10, 'one')
def test_bad_date_issue_279(tmp_path, empty_alt_testsuite):
    """Unparseable date strings are written through verbatim."""
    tmp_ts = tmp_path.joinpath('test_bad_date_issue_279')
    tmp_ts.mkdir()
    schema = tsdb.read_schema(empty_alt_testsuite)
    fields = schema['item']
    tsdb.write_schema(tmp_ts, schema)
    # a datetime value is formatted into TSDB's day-month-year style
    tsdb.write(tmp_ts, 'item',
               [(0, 'The cat meows.', datetime(1999, 9, 8))],
               fields)
    db = tsdb.Database(tmp_ts)
    assert list(db['item']) == [('0', 'The cat meows.', '8-sep-1999')]
    # a string that doesn't parse as a date is stored as-is, not rejected
    tsdb.write(tmp_ts, 'item',
               [(0, 'The cat meows.', 'September 8, 1999')],
               fields)
    assert list(db['item']) == [('0', 'The cat meows.', 'September 8, 1999')]
def test_bad_date_issue_279b(tmp_path, empty_alt_testsuite):
    """Reading a bad date is fine uncast; casting it warns."""
    tmp_ts = tmp_path.joinpath('test_bad_date_issue_279b')
    tmp_ts.mkdir()
    schema = tsdb.read_schema(empty_alt_testsuite)
    fields = schema['item']
    tsdb.write_schema(tmp_ts, schema)
    tsdb.write(tmp_ts, 'item',
               [(0, 'The cat meows.', 'September 8, 1999')],
               fields)
    ts = itsdb.TestSuite(tmp_ts)
    # cast=False returns the raw stored string untouched
    assert list(ts['item'].select('i-date', cast=False)) == [
        ('September 8, 1999', )]
    # casting the unparseable :date value emits a TSDBWarning
    with pytest.warns(tsdb.TSDBWarning):
        ts['item'][0]['i-date']
def test_read_schema(empty_testsuite):
    """read_schema() maps each relation name to its list of Fields."""
    r = tsdb.read_schema(empty_testsuite)
    f = tsdb.Field
    # Field(name, datatype, flags); (':key',) marks key fields
    assert r == {
        'item': [f('i-id', ':integer', (':key',)),
                 f('i-input', ':string')],
        'fold': [f('fold-id', ':integer', (':key',))],
        'run': [f('run-id', ':integer', (':key',))],
        'parse': [f('parse-id', ':integer', (':key',)),
                  f('run-id', ':integer', (':key',)),
                  f('i-id', ':integer', (':key',))],
        'result': [f('parse-id', ':integer', (':key',)),
                   f('result-id', ':integer'),
                   f('mrs', ':string')]
    }
def _mkprof_cleanup(destination, skeleton, old_files):
    """Delete table files in *destination* that are obsolete or, for
    skeletons, empty.

    Both the plain and the ``.gz`` variant of every current or former
    table file are considered.
    """
    schema = tsdb.read_schema(destination)
    keep = set(schema)
    if skeleton:
        # skeletons retain only the tsdb-core tables
        keep &= set(tsdb.TSDB_CORE_FILES)
    for name in set(schema).union(old_files):
        base = destination.joinpath(name)
        for path in (base.with_suffix(''), base.with_suffix('.gz')):
            if path.is_file():
                if name not in keep or (skeleton
                                        and path.stat().st_size == 0):
                    path.unlink()
def __init__(self,
             path: util.PathLike = None,
             schema: tsdb.SchemaLike = None,
             encoding: str = 'utf-8') -> None:
    """
    Initialize a test suite at *path*, creating it if necessary.

    Args:
        path: directory of the test suite; when ``None``, a temporary
            directory is created and the test suite is "virtual"
        schema: a schema, or a path to a relations file to read one
            from; required when *path* has no relations file yet
        encoding: character encoding for table files
    Raises:
        ITSDBError: if a new test suite is created without *schema*
    """
    # Virtual test suites use a temporary directory
    if path is None:
        self._tempdir = tempfile.TemporaryDirectory()
        path = Path(self._tempdir.name)
    else:
        path = Path(path).expanduser()
        path.mkdir(exist_ok=True)  # can fail if path is a file
    # Ensure test suite directory has a relations file
    if not path.joinpath(tsdb.SCHEMA_FILENAME).is_file():
        if schema is None:
            raise ITSDBError(
                '*schema* argument is required for new test suites')
        elif isinstance(schema, (str, Path)):
            # a path-like *schema* names a relations file to load
            schema = tsdb.read_schema(schema)
        tsdb.write_schema(path, schema)
    super().__init__(path, autocast=False, encoding=encoding)
    # lazily-populated cache of Table objects, keyed by relation name
    self._data: Dict[str, Table] = {}
def empty_item_table(empty_testsuite):
    """Return an itsdb.Table over the empty test suite's item relation."""
    item_fields = tsdb.read_schema(empty_testsuite)['item']
    return itsdb.Table(empty_testsuite, 'item', item_fields)
def single_item_table(single_item_skeleton):
    """Return an itsdb.Table over the single-item skeleton's item relation."""
    item_fields = tsdb.read_schema(single_item_skeleton)['item']
    return itsdb.Table(single_item_skeleton, 'item', item_fields)
def mkprof(destination, source=None, schema=None, where=None, delimiter=None,
           refresh=False, skeleton=False, full=False, gzip=False,
           quiet=False):
    """
    Create [incr tsdb()] profiles or skeletons.

    Data for the testsuite may come from an existing testsuite or
    from a list of sentences. There are four main usage patterns:

    - `source="testsuite/"` -- read data from `testsuite/`
    - `source=None, refresh=True` -- read data from *destination*
    - `source=None, refresh=False` -- read sentences from stdin
    - `source="sents.txt"` -- read sentences from `sents.txt`

    The latter two require the *schema* parameter.

    Args:
        destination (str): path of the new testsuite
        source (str): path to a source testsuite or a file containing
            sentences; if not given and *refresh* is `False`, sentences
            are read from stdin
        schema (str): path to a relations file to use for the created
            testsuite; if `None` and *source* is a test suite, the
            schema of *source* is used
        where (str): TSQL condition to filter records by; ignored if
            *source* is not a testsuite
        delimiter (str): if given, split lines from *source* or stdin
            on the character *delimiter*; if *delimiter* is `"@"`,
            split using :func:`delphin.tsdb.split`; a header line with
            field names is required; ignored when the data source is
            not text lines
        refresh (bool): if `True`, rewrite the data at *destination*;
            implies *full* is `True`; ignored if *source* is not
            `None`, best combined with *schema* or *gzip* (default:
            `False`)
        skeleton (bool): if `True`, only write tsdb-core files
            (default: `False`)
        full (bool): if `True`, copy all data from the source
            testsuite; ignored if the data source is not a testsuite
            or if *skeleton* is `True` (default: `False`)
        gzip (bool): if `True`, non-empty tables will be compressed
            with gzip
        quiet (bool): if `True`, don't print summary information
    """
    destination = Path(destination).expanduser()
    if source is not None:
        source = Path(source).expanduser()
    if schema is not None:
        schema = tsdb.read_schema(schema)
    # names of tables present before the operation, used by cleanup to
    # remove files that are no longer part of the schema
    old_relation_files = []
    # dispatch on the four usage patterns described in the docstring;
    # the order of these tests matters (refresh only applies when
    # *source* is None)
    # work in-place on destination test suite
    if source is None and refresh:
        db = tsdb.Database(destination)
        old_relation_files = list(db.schema)
        tsdb.write_database(db, db.path, schema=schema, gzip=gzip)
    # input is sentences on stdin or a file of sentences
    elif source is None and not refresh:
        _mkprof_from_lines(
            destination, sys.stdin, schema, delimiter, gzip)
    elif source.is_file():
        with source.open() as fh:
            _mkprof_from_lines(
                destination, fh, schema, delimiter, gzip)
    # input is source testsuite
    elif source.is_dir():
        db = tsdb.Database(source)
        old_relation_files = list(db.schema)
        _mkprof_from_database(
            destination, db, schema, where, full, gzip)
    else:
        raise CommandError(f'invalid source for mkprof: {source!s}')
    _mkprof_cleanup(destination, skeleton, old_relation_files)
    if not quiet:
        _mkprof_summarize(destination, tsdb.read_schema(destination))