Пример #1
0
def test_write_database(tmp_path, mini_testsuite, empty_alt_testsuite):
    """Check full, name-restricted, and alternate-schema database writes."""
    target = tmp_path / 'test_write_database'
    db = tsdb.Database(mini_testsuite)

    # A plain write is expected to create the directory, the relations
    # file, and the item, parse, and result tables.
    tsdb.write_database(db, str(target))
    assert target.is_dir()
    for filename in ('relations', 'item', 'parse', 'result'):
        assert (target / filename).is_file()
    expected_parse = '10@10@1\n20@20@0\n30@30@1\n'
    assert (target / 'parse').read_text() == expected_parse

    # Restricting *names* to 'item' should leave no parse/result files.
    tsdb.write_database(db, str(target), names=['item'])
    assert (target / 'item').is_file()
    for filename in ('parse', 'result'):
        assert not (target / filename).is_file()

    # alt_schema drops i-wf field from mini_testsuite's schema
    alt_schema = tsdb.read_schema(empty_alt_testsuite)
    tsdb.write_database(db, str(target), names=['item'], schema=alt_schema)
    alt_db = tsdb.Database(str(target))
    assert len(db.schema['item']) == 4
    assert len(alt_db.schema['item']) == 3
    expected_item = (
        '10@It rained.@1-feb-2018 15:00\n'
        '20@Rained.@01-02-18 15:00:00\n'
        '30@It snowed.@2018-2-1 (15:00:00)\n')
    assert (target / 'item').read_text() == expected_item
Пример #2
0
def customize_itsdb(grammar_path):
    """
    Write the 'item' table of the matrix skeleton under *grammar_path*.

    One item record is built for each entry of ``ch['sentence']``
    (``ch`` comes from the enclosing module); nothing is done when
    ``ch`` has no 'sentence' key. The skeleton lives at
    ``<grammar_path>/tsdb/skeletons/matrix`` and its schema is read
    from ``<grammar_path>/tsdb/skeletons/Relations``.

    Args:
        grammar_path: path of the grammar directory
    """
    if 'sentence' not in ch:
        return

    today = datetime.datetime.today()
    author = 'Grammar Matrix Customization System'

    def build_item(sentence, index):
        # A sentence flagged with 'star' gets an empty category and
        # i-wf of '0'; all other sentences get 'S' and '1'.
        starred = sentence.get('star', False)
        text = sentence['orth']
        return {
            'i-id': str(index),
            'i-origin': 'unknown',
            'i-register': 'unknown',
            'i-format': 'none',
            'i-difficulty': '1',
            'i-category': '' if starred else 'S',
            'i-input': text,
            'i-wf': '0' if starred else '1',
            'i-length': str(len(text.split())),
            'i-author': author,
            'i-date': today
        }

    skeletons = os.path.join(grammar_path, 'tsdb', 'skeletons')
    matrix_skeleton = os.path.join(skeletons, 'matrix')
    schema = tsdb.read_schema(os.path.join(skeletons, 'Relations'))
    tsdb.initialize_database(matrix_skeleton, schema=schema)

    item_fields = schema['item']
    records = []
    # Item identifiers are numbered from 1.
    for index, sentence in enumerate(ch['sentence'], start=1):
        records.append(
            tsdb.make_record(build_item(sentence, index), item_fields))
    tsdb.write(matrix_skeleton, 'item', records, item_fields)
Пример #3
0
def test_write(single_item_skeleton):
    """Exercise tsdb.write() in overwrite, append, and gzip modes."""
    suite_dir = pathlib.Path(single_item_skeleton)
    fields = tsdb.read_schema(suite_dir)['item']
    item_path = suite_dir.joinpath('item')
    plain_path = item_path.with_suffix('')
    gzip_path = item_path.with_suffix('.gz')
    cat_row = (0, 'The cat meows.')
    wolf_row = (1, 'The wolf howls.')

    tsdb.write(suite_dir, 'item', [cat_row], fields)
    with tsdb.open(suite_dir, 'item') as fh:
        assert list(fh) == ['0@The cat meows.\n']

    tsdb.write(suite_dir, 'item', [wolf_row], fields, append=True)
    with tsdb.open(suite_dir, 'item') as fh:
        assert list(fh) == ['0@The cat meows.\n', '1@The wolf howls.\n']

    # cannot append and gzip at same time
    with pytest.raises(NotImplementedError):
        tsdb.write(suite_dir, 'item', [], fields, gzip=True, append=True)

    tsdb.write(suite_dir, 'item', [cat_row], fields, gzip=True)
    assert not plain_path.exists()
    assert gzip_path.exists()

    # cannot append to existing gzipped file
    with pytest.raises(NotImplementedError):
        tsdb.write(suite_dir, 'item', [], fields, append=True)

    # writing without gzip is expected to replace the gzipped file
    tsdb.write(suite_dir, 'item', [cat_row], fields)
    assert plain_path.exists()
    assert not gzip_path.exists()

    tsdb.write(suite_dir, 'item', [cat_row], fields, gzip=False)
    assert not gzip_path.exists()
    assert plain_path.exists()

    # an empty table is expected to stay uncompressed even with gzip=True
    tsdb.write(suite_dir, 'item', [], fields, gzip=True)
    assert not gzip_path.exists()
    assert plain_path.exists()
Пример #4
0
def _interpret_selection(select, source):
    """
    Resolve a single-column *select* query against the schema at *source*.

    Args:
        select: the query text following the ``select`` keyword
        source: location from which the schema is read
    Returns:
        A ``(column, relation, condition)`` triple, where *condition*
        is the text following the first ``' where '`` in *select*, or
        the empty string if there is none.
    Raises:
        CommandError: if the query does not project exactly one
            column, names more than one relation, or refers to a
            relation or column that is not in the schema
    """
    schema = tsdb.read_schema(source)
    queryobj = tsql.inspect_query('select ' + select)
    projection = queryobj['projection']
    if projection == '*' or len(projection) != 1:
        raise CommandError("select query must return a single column")

    relation, _, column = projection[0].rpartition('.')
    if not relation:
        # query could be 'i-input from item' instead of 'item.i-input'
        relations = queryobj['relations']
        if len(relations) > 1:
            raise CommandError(
                "select query may specify no more than 1 relation")
        if len(relations) == 1:
            relation = relations[0]
        else:
            # otherwise guess: take the first table defining the column
            relation = None
            for table in schema:
                if any(f.name == column for f in schema[table]):
                    relation = table
                    break

    if relation not in schema:
        raise CommandError('invalid or missing relation in query')
    if column not in [field.name for field in schema[relation]]:
        raise CommandError(f'invalid column in query: {column}')

    # partition() leaves the tail empty when ' where ' is absent
    _, _, condition = select.partition(' where ')
    return column, relation, condition
Пример #5
0
def test_make_record(empty_testsuite):
    """Check that make_record() orders, fills, and filters column values."""
    relations_path = pathlib.Path(empty_testsuite) / 'relations'
    item_fields = tsdb.read_schema(relations_path)['item']

    # values are placed in field order regardless of mapping order
    full = tsdb.make_record({'i-input': 'one', 'i-id': 100}, item_fields)
    assert full == (100, 'one')

    # a missing column is expected to become None
    partial = tsdb.make_record({'i-id': 100}, item_fields)
    assert partial == (100, None)

    # a key that is not a field of the relation is expected to be dropped
    extra = tsdb.make_record(
        {'i-id': 100, 'mrs': '[RELS: < > HCONS: < >]'}, item_fields)
    assert extra == (100, None)
Пример #6
0
def test_issue_285(empty_testsuite):
    """Check a trailing carriage return survives a write/read round trip."""
    item_fields = tsdb.read_schema(empty_testsuite)['item']
    rows = [(0, 'The cat meows.\r')]
    tsdb.write(empty_testsuite, 'item', rows, item_fields)

    handle = tsdb.open(empty_testsuite, 'item')
    assert not handle.closed
    with handle:
        lines = list(handle)
        assert lines == ['0@The cat meows.\r\n']
    # leaving the with-block is expected to close the handle
    assert handle.closed
Пример #7
0
def test_split(empty_testsuite):
    """Check tsdb.split() on plain, empty, escaped, and typed columns."""
    # (input line, expected tuple) pairs for splitting without fields
    untyped_cases = [
        ('', (None, )),
        ('one', ('one', )),
        ('あ', ('あ', )),
        ('one@two', ('one', 'two')),
        ('one@@three', ('one', None, 'three')),
        ('one\\s@\\\\two\\nabc', ('one@', '\\two\nabc')),
    ]
    for line, expected in untyped_cases:
        assert tsdb.split(line) == expected

    # with fields given, the integer column is expected to be cast
    rels = tsdb.read_schema(empty_testsuite)
    assert tsdb.split('10@one', fields=rels['item']) == (10, 'one')
Пример #8
0
def test_bad_date_issue_279(tmp_path, empty_alt_testsuite):
    """Check date values are written and read back as expected strings."""
    suite = tmp_path / 'test_bad_date_issue_279'
    suite.mkdir()
    schema = tsdb.read_schema(empty_alt_testsuite)
    item_fields = schema['item']
    tsdb.write_schema(suite, schema)

    # a datetime object is expected to be formatted on write
    proper_row = (0, 'The cat meows.', datetime(1999, 9, 8))
    tsdb.write(suite, 'item', [proper_row], item_fields)
    db = tsdb.Database(suite)
    assert list(db['item']) == [('0', 'The cat meows.', '8-sep-1999')]

    # a free-form date string is expected to be kept verbatim
    odd_row = (0, 'The cat meows.', 'September 8, 1999')
    tsdb.write(suite, 'item', [odd_row], item_fields)
    assert list(db['item']) == [('0', 'The cat meows.', 'September 8, 1999')]
Пример #9
0
def test_bad_date_issue_279b(tmp_path, empty_alt_testsuite):
    """Check a malformed date reads raw uncast and warns when cast."""
    suite = tmp_path / 'test_bad_date_issue_279b'
    suite.mkdir()
    schema = tsdb.read_schema(empty_alt_testsuite)
    tsdb.write_schema(suite, schema)
    bad_row = (0, 'The cat meows.', 'September 8, 1999')
    tsdb.write(suite, 'item', [bad_row], schema['item'])

    ts = itsdb.TestSuite(suite)
    # without casting, the raw string comes back unchanged
    uncast = list(ts['item'].select('i-date', cast=False))
    assert uncast == [('September 8, 1999', )]
    # accessing the column on a row is expected to emit a TSDBWarning
    with pytest.warns(tsdb.TSDBWarning):
        ts['item'][0]['i-date']
Пример #10
0
def test_read_schema(empty_testsuite):
    """Check read_schema() returns the expected relation-to-fields map."""

    def key(name):
        # an integer field flagged as a key
        return tsdb.Field(name, ':integer', (':key',))

    def string(name):
        return tsdb.Field(name, ':string')

    expected = {
        'item': [key('i-id'), string('i-input')],
        'fold': [key('fold-id')],
        'run': [key('run-id')],
        'parse': [key('parse-id'), key('run-id'), key('i-id')],
        'result': [key('parse-id'),
                   tsdb.Field('result-id', ':integer'),
                   string('mrs')],
    }
    schema = tsdb.read_schema(empty_testsuite)
    assert schema == expected
Пример #11
0
def _mkprof_cleanup(destination, skeleton, old_files):
    """
    Delete unwanted relation files from the test suite at *destination*.

    Both the plain and the ``.gz`` form of each relation file are
    considered. A file is deleted if its relation is not one to keep,
    or if *skeleton* is true and the file is empty. The relations to
    keep are those in the schema at *destination*, further limited to
    ``tsdb.TSDB_CORE_FILES`` when *skeleton* is true.

    Args:
        destination: path object of the test suite directory
        skeleton: if true, keep only core files and drop empty ones
        old_files: names of additional relation files to consider
    """
    schema = tsdb.read_schema(destination)
    to_keep = set(schema)
    if skeleton:
        to_keep = to_keep.intersection(tsdb.TSDB_CORE_FILES)

    for name in set(schema).union(old_files):
        base = destination.joinpath(name)
        # plain file first, then its gzipped counterpart
        for candidate in (base.with_suffix(''), base.with_suffix('.gz')):
            if not candidate.is_file():
                continue
            unwanted = name not in to_keep
            if unwanted or (skeleton and candidate.stat().st_size == 0):
                candidate.unlink()
Пример #12
0
    def __init__(self,
                 path: util.PathLike = None,
                 schema: tsdb.SchemaLike = None,
                 encoding: str = 'utf-8') -> None:
        """
        Open or create the test suite directory at *path*.

        Args:
            path: test suite directory; if `None`, a temporary
                directory is created and used instead
            schema: schema, or path to one, used to write the schema
                file when the directory does not already have one
            encoding: character encoding passed to the parent
                initializer
        Raises:
            ITSDBError: if the directory has no schema file and
                *schema* is `None`
        """
        if path is not None:
            path = Path(path).expanduser()
            path.mkdir(exist_ok=True)  # can fail if path is a file
        else:
            # Virtual test suites use a temporary directory
            self._tempdir = tempfile.TemporaryDirectory()
            path = Path(self._tempdir.name)

        # Ensure test suite directory has a relations file
        has_schema_file = path.joinpath(tsdb.SCHEMA_FILENAME).is_file()
        if not has_schema_file:
            if schema is None:
                raise ITSDBError(
                    '*schema* argument is required for new test suites')
            if isinstance(schema, (str, Path)):
                # a path-like schema is loaded before being written out
                schema = tsdb.read_schema(schema)
            tsdb.write_schema(path, schema)

        super().__init__(path, autocast=False, encoding=encoding)
        self._data: Dict[str, Table] = {}
Пример #13
0
def empty_item_table(empty_testsuite):
    """Return an itsdb.Table for the 'item' relation of *empty_testsuite*."""
    schema = tsdb.read_schema(empty_testsuite)
    return itsdb.Table(empty_testsuite, 'item', schema['item'])
Пример #14
0
def single_item_table(single_item_skeleton):
    """Return an itsdb.Table for 'item' in *single_item_skeleton*."""
    schema = tsdb.read_schema(single_item_skeleton)
    return itsdb.Table(single_item_skeleton, 'item', schema['item'])
Пример #15
0
def mkprof(destination, source=None, schema=None, where=None, delimiter=None,
           refresh=False, skeleton=False, full=False, gzip=False, quiet=False):
    """
    Build an [incr tsdb()] profile or skeleton at *destination*.

    The data may be taken from an existing testsuite or from a list of
    sentences, giving four usage patterns:

        - `source="testsuite/"` -- copy data from `testsuite/`
        - `source=None, refresh=True` -- rewrite *destination* in place
        - `source=None, refresh=False` -- take sentences from stdin
        - `source="sents.txt"` -- take sentences from `sents.txt`

    The two sentence-based patterns need the *schema* parameter.

    Args:
        destination (str): path of the new testsuite
        source (str): path to a source testsuite or to a file of
            sentences; when omitted and *refresh* is `False`, the
            sentences come from stdin
        schema (str): path to the relations file for the created
            testsuite; when `None` and *source* is a test suite, the
            schema of *source* is used
        where (str): TSQL condition for filtering records; ignored
            unless *source* is a testsuite
        delimiter (str): when given, lines from *source* or stdin are
            split on the character *delimiter*; a *delimiter* of
            `"@"` splits with :func:`delphin.tsdb.split`; a header
            line with field names is required; ignored when the data
            source is not text lines
        refresh (bool): if `True`, rewrite the data at *destination*;
            implies *full* is `True`; ignored if *source* is not
            `None`, best combined with *schema* or *gzip* (default:
            `False`)
        skeleton (bool): if `True`, write only tsdb-core files
            (default: `False`)
        full (bool): if `True`, copy all data from the source
            testsuite; ignored when the data source is not a
            testsuite or when *skeleton* is `True` (default: `False`)
        gzip (bool): if `True`, compress non-empty tables with gzip
        quiet (bool): if `True`, suppress the summary information
    Raises:
        CommandError: if *source* is given but is neither a file nor
            a directory
    """
    destination = Path(destination).expanduser()
    source = None if source is None else Path(source).expanduser()
    schema = None if schema is None else tsdb.read_schema(schema)
    old_relation_files = []

    if source is None:
        if refresh:
            # work in-place on destination test suite
            db = tsdb.Database(destination)
            old_relation_files = list(db.schema)
            tsdb.write_database(db, db.path, schema=schema, gzip=gzip)
        else:
            # input is sentences on stdin
            _mkprof_from_lines(
                destination, sys.stdin, schema, delimiter, gzip)
    elif source.is_file():
        # input is a file of sentences
        with source.open() as lines:
            _mkprof_from_lines(
                destination, lines, schema, delimiter, gzip)
    elif source.is_dir():
        # input is source testsuite
        db = tsdb.Database(source)
        old_relation_files = list(db.schema)
        _mkprof_from_database(
            destination, db, schema, where, full, gzip)
    else:
        raise CommandError(f'invalid source for mkprof: {source!s}')

    _mkprof_cleanup(destination, skeleton, old_relation_files)

    if quiet:
        return
    _mkprof_summarize(destination, tsdb.read_schema(destination))