Example #1
    def test_init3(self):  # with kw check=True
        bad_file = Path(test_data('bad_file.tsv'))
        assert_raises(ValueError, LexStat, bad_file.as_posix())
        ls = self._make_one(bad_file.as_posix(), check=True, apply_checks=True)
        assert hasattr(ls, 'errors')
        cleaned = bad_file.parent.joinpath(bad_file.name + '_cleaned.tsv')
        self.assertTrue(cleaned.exists())
        os.remove(cleaned.as_posix())
        assert_raises(ValueError, LexStat, {0: ['concept', 'language', 'ipa']})
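
All of the examples on this page revolve around Path.as_posix() (from pathlib, or the pathlib backports these projects use). As a reminder, a minimal sketch of what it guarantees:

from pathlib import Path

p = Path('data') / 'bad_file.tsv'
p.as_posix()  # 'data/bad_file.tsv', forward slashes on every platform
str(p)        # native separators, i.e. backslashes on Windows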
Example #2
def _download_sql_dump(rel, log):
    target = Path('glottolog-{0}.sql.gz'.format(rel['version']))
    log.info('retrieving {0}'.format(rel['sql_dump_url']))
    urlretrieve(rel['sql_dump_url'], target.as_posix())
    assert md5(target) == rel['sql_dump_md5']
    unpacked = target.with_suffix('')
    with gzip.open(target.as_posix()) as f, unpacked.open('wb') as u:
        shutil.copyfileobj(f, u)
    target.unlink()
    log.info('SQL dump for Glottolog release {0} written to {1}'.format(
        rel['version'], unpacked))
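
The md5() helper used above is not shown; a hypothetical implementation consistent with the comparison against rel['sql_dump_md5'] would hash the file and return the hex digest:

import hashlib

def md5(path, chunk_size=64 * 1024):
    # Hypothetical stand-in: hex MD5 digest of the file at `path`.
    h = hashlib.md5()
    with open(path.as_posix(), 'rb') as fp:
        for chunk in iter(lambda: fp.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()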
Example #3
    def create(self, dir_, content):
        """Write ``content`` to a file using ``dir_`` as file-system directory.

        :return: File-system path of the file that was created.
        """
        p = Path(dir_).joinpath(self.relpath)
        if not p.parent.exists():
            p.parent.mkdir(parents=True)
        with open(p.as_posix(), 'wb') as fp:
            if isinstance(content, text_type):
                content = content.encode('utf8')
            fp.write(content)
        return p.as_posix()
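
A usage sketch (the object and its relpath are hypothetical; text_type is the Python 2/3 alias for the unicode string type):

# Assuming obj.relpath == 'sub/data.txt':
path = obj.create('/tmp/out', 'hello')  # text is UTF-8 encoded before writing
assert path == '/tmp/out/sub/data.txt'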
Example #4
def lff2tree(tree=TREE, outdir=None, builddir=None, lffs=None):
    """
    - get mapping glottocode -> Languoid from old tree
    - assemble new directory tree
      - for each path component in lff/dff:
        - create new dir
        - copy info file from old tree (possibly updating the name) or
        - create info file
      - for each language/dialect in lff/dff:
        - create new dir
        - copy info file from old tree (possibly updating the name) or
        - create info file
    - rm old tree
    - copy new tree
    """
    # FIXME: instead of removing trees, we should just move the current one
    # from outdir to build, and then recreate in outdir.
    builddir = Path(builddir) if builddir else build_path('tree')
    old_tree = {l.id: l for l in walk_tree(tree)} if tree else {}
    out = Path(outdir or tree)
    if not out.parent.exists():
        out.parent.mkdir()

    if out.exists():
        if builddir.exists():
            try:
                rmtree(builddir)
            except:  # pragma: no cover
                pass
            if builddir.exists():  # pragma: no cover
                raise ValueError('please remove %s before proceeding' %
                                 builddir)
        # move the old tree out of the way
        shutil.move(out.as_posix(), builddir.as_posix())
    out.mkdir()

    lffs = lffs or {}
    languages = {}
    for lang in read_lff(Level.language, fp=lffs.get(Level.language)):
        languages[lang.id] = lang
        lang2tree(lang, lang.lineage, out, old_tree)

    for lang in read_lff(Level.dialect, fp=lffs.get(Level.dialect)):
        if not lang.lineage or lang.lineage[0][1] not in languages:
            raise ValueError('unattached dialect')  # pragma: no cover

        lang2tree(lang, languages[lang.lineage[0][1]].lineage + lang.lineage,
                  out, old_tree)
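
The "move the old tree out of the way" step is the heart of the function; isolated as a self-contained sketch (paths illustrative):

import shutil
from pathlib import Path
from shutil import rmtree

out, builddir = Path('tree'), Path('build/tree')
if out.exists():
    if builddir.exists():
        rmtree(builddir)                              # clear any stale backup
    shutil.move(out.as_posix(), builddir.as_posix())  # keep the old tree as a backup
out.mkdir(parents=True)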
Example #5
def download_tables(outdir=None):
    match = ZIP_NAME_PATTERN.search(urlopen(BASE_URL + 'download.asp').read())
    if not match:
        raise ValueError('no matching zip file name found')  # pragma: no cover
    target = Path(outdir or '.').joinpath(match.group('name'))
    urlretrieve(BASE_URL + match.group('name'), target.as_posix())
    return target
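
ZIP_NAME_PATTERN and BASE_URL are module-level constants not shown here. A hypothetical pattern compatible with the code above must expose a 'name' group holding the zip file name (note that under Python 3, urlopen(...).read() returns bytes and would need decoding before a text pattern can search it):

import re

ZIP_NAME_PATTERN = re.compile(r'(?P<name>[\w-]+\.zip)')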
Example #6
    def write(self, outdir='.', suffix='.csv', cited_sources_only=False, archive=False):
        outdir = Path(outdir)
        if not outdir.exists():
            raise ValueError(outdir.as_posix())

        close = False
        if archive:
            if isinstance(archive, Archive):
                container = archive
            else:
                container = Archive(outdir.joinpath(self.name + '.zip'), mode='w')
                close = True
        else:
            container = outdir

        fname = Path(outdir).joinpath(self.name + suffix)
        if fname.suffix in TAB_SUFFIXES:
            self.table.dialect.delimiter = '\t'

        with UnicodeWriter(
                None if isinstance(container, Archive) else fname,
                delimiter=self.table.dialect.delimiter) as writer:
            writer.writerow(self.fields)
            for row in self.rows:
                writer.writerow(row.to_list())

        if isinstance(container, Archive):
            container.write_text(writer.read(), fname.name)
        self.table.url = fname.name

        self.metadata.write(Dataset.filename(fname, 'metadata'), container)
        ids = self._cited_sources if cited_sources_only else None
        self.sources.write(Dataset.filename(fname, 'sources'), container, ids=ids)
        if close:
            container.close()
Example #7
def lff2tree(tree=TREE, outdir=None, builddir=None, lffs=None):
    """
    - get mapping glottocode -> Languoid from old tree
    - assemble new directory tree
      - for each path component in lff/dff:
        - create new dir
        - copy info file from old tree (possibly updating the name) or
        - create info file
      - for each language/dialect in lff/dff:
        - create new dir
        - copy info file from old tree (possibly updating the name) or
        - create info file
    - rm old tree
    - copy new tree
    """
    # FIXME: instead of removing trees, we should just move the current one
    # from outdir to build, and then recreate in outdir.
    builddir = Path(builddir) if builddir else build_path("tree")
    old_tree = {l.id: l for l in walk_tree(tree)} if tree else {}
    out = Path(outdir or tree)
    if not out.parent.exists():
        out.parent.mkdir()

    if out.exists():
        if builddir.exists():
            try:
                rmtree(builddir)
            except:  # pragma: no cover
                pass
            if builddir.exists():  # pragma: no cover
                raise ValueError("please remove %s before proceeding" % builddir)
        # move the old tree out of the way
        shutil.move(out.as_posix(), builddir.as_posix())
    out.mkdir()

    lffs = lffs or {}
    languages = {}
    for lang in read_lff(Level.language, fp=lffs.get(Level.language)):
        languages[lang.id] = lang
        lang2tree(lang, lang.lineage, out, old_tree)

    for lang in read_lff(Level.dialect, fp=lffs.get(Level.dialect)):
        if not lang.lineage or lang.lineage[0][1] not in languages:
            raise ValueError("unattached dialect")  # pragma: no cover

        lang2tree(lang, languages[lang.lineage[0][1]].lineage + lang.lineage, out, old_tree)
Example #8
class TemporaryPath(object):
    def __init__(self, suffix=''):
        fp = NamedTemporaryFile(suffix=suffix)
        self.name = Path(fp.name)
        fp.close()

    def __enter__(self):
        return self.name.as_posix()

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.name.exists():
            remove(self.name)
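
Usage sketch: the context manager hands out a path that is fresh but not yet created, which suits APIs that insist on creating the file themselves, and it removes whatever was written on exit:

with TemporaryPath(suffix='.csv') as fname:
    with open(fname, 'w') as fp:
        fp.write('a,b,c\n')
# the file is gone again here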
Example #9
def dependencies_graph(imps):
    deps = {(f1, f2): v for (v, f1, f2) in imps if v > 0.0}
    V = set(f for fs in deps for f in fs)  # iterating the dict replaces Py2's iterkeys()
    MSTs = [mst(deps, x) for x in V]       # G was an identical copy of deps; dropped
    H = max(MSTs, key=lambda g: sum(g.values()))  # avoids comparing dicts on ties (Py3)
    #W = dict([(y, 1.0-v) for ((x, y), v) in H.iteritems()])
    #sav(dot(H, V), 'grambank_mst.gv')
    path = Path(grambank.__file__).parent.joinpath('static', 'dependencies.gv')
    with open(path.as_posix(), 'w') as fp:
        fp.write(dot(H, V))

    return (H, V) #dot(H, V)
Example #10
    def test_extractor(self):
        config = self.make_cfg(
            [config_path(f).as_posix() for f in ("admin", "mk", "embed_data")])
        xml = beastling.beastxml.BeastXml(config)
        xmlfile = self.tmp.joinpath("beastling.xml")
        xml.write_file(xmlfile.as_posix())
        self.assertTrue(bool(self._extract(xmlfile)))

        config = self.make_cfg({
            'admin': {
                'basename': 'abcdefg'
            },
            'model': {
                'model': 'mk',
                'data': data_path('basic.csv').as_posix()
            }
        })
        xml = beastling.beastxml.BeastXml(config)
        xmlfile = self.tmp.joinpath("beastling.xml")
        xml.write_file(xmlfile.as_posix())
        beastling.extractor.extract(xmlfile)
        p = Path('abcdefg.conf')
        self.assertTrue(p.exists())
        cfg = INI(interpolation=None)
        cfg.read(p.as_posix())
        remove(p)
        self.assertEqual(cfg['admin']['basename'], 'abcdefg')
        self.assertEqual(cfg['model']['model'], 'mk')

        fname = self.tmp.joinpath('test.xml')
        datafile = self.tmp.joinpath('test.csv')
        self.assertFalse(datafile.exists())
        with fname.open('w', encoding='utf8') as fp:
            fp.write("""<?xml version="1.0" encoding="UTF-8"?>
<r>
  <!--%s
%s
[admin]
[model]
-->
  <!--%s:%s-->
</r>
""" % (beastling.extractor._generated_str,
            beastling.extractor._config_file_str,
            beastling.extractor._data_file_str, datafile.as_posix()))
        res = self._extract(fname)
        self.assertIn(datafile.name, ''.join(res))
Example #11
    def create(self, req, filename=None, verbose=True):
        p = self.abspath(req)
        if not p.parent.exists():  # pragma: no cover
            p.parent.mkdir()
        tmp = Path('%s.tmp' % p.as_posix())

        if self.rdf:
            # we do not create archives with a readme for rdf downloads, because each
            # RDF entity points to the dataset and the void description of the dataset
            # covers all relevant metadata.
            #
            # TODO: write test for the file name things!?
            #
            with closing(GzipFile(
                    filename=Path(tmp.stem).stem, fileobj=tmp.open('wb')
            )) as fp:
                self.before(req, fp)
                for i, item in enumerate(page_query(self.query(req), verbose=verbose)):
                    self.dump(req, fp, item, i)
                self.after(req, fp)
        else:
            with ZipFile(tmp.as_posix(), 'w', ZIP_DEFLATED) as zipfile:
                if not filename:
                    fp = self.get_stream()
                    self.before(req, fp)
                    for i, item in enumerate(
                            page_query(self.query(req), verbose=verbose)):
                        self.dump(req, fp, item, i)
                    self.after(req, fp)
                    zipfile.writestr(self.name, self.read_stream(fp))
                else:  # pragma: no cover
                    zipfile.write(filename, self.name)
                zipfile.writestr(
                    'README.txt',
                    README.format(
                        req.dataset.name,
                        '=' * (
                            len(req.dataset.name)
                            + len(' data download')),
                        req.dataset.license,
                        TxtCitation(None).render(req.dataset, req)).encode('utf8'))
        if p.exists():  # pragma: no cover
            remove(p)
        move(tmp, p)
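
One idiom above deserves spelling out: Path(tmp.stem).stem applies .stem twice, stripping first the '.tmp' and then the '.gz' suffix, so the gzip header records the intended member name:

from pathlib import Path

tmp = Path('dataset.n3.gz.tmp')
assert tmp.stem == 'dataset.n3.gz'
assert Path(tmp.stem).stem == 'dataset.n3'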
Example #12
    def test_extractor(self):
        config = self.make_cfg(
            [config_path(f).as_posix() for f in ("admin", "mk", "embed_data")])
        xml = beastling.beastxml.BeastXml(config)
        xmlfile = self.tmp.joinpath("beastling.xml")
        xml.write_file(xmlfile.as_posix())
        self.assertTrue(bool(self._extract(xmlfile)))

        config = self.make_cfg({
            'admin': {'basename': 'abcdefg'},
            'model': {
                'model': 'mk',
                'data': data_path('basic.csv').as_posix()}})
        xml = beastling.beastxml.BeastXml(config)
        xmlfile = self.tmp.joinpath("beastling.xml")
        xml.write_file(xmlfile.as_posix())
        beastling.extractor.extract(xmlfile)
        p = Path('abcdefg.conf')
        self.assertTrue(p.exists())
        cfg = INI(interpolation=None)
        cfg.read(p.as_posix())
        remove(p)
        self.assertEqual(cfg['admin']['basename'], 'abcdefg')
        self.assertEqual(cfg['model']['model'], 'mk')

        fname = self.tmp.joinpath('test.xml')
        datafile = self.tmp.joinpath('test.csv')
        self.assertFalse(datafile.exists())
        with fname.open('w', encoding='utf8') as fp:
            fp.write("""<?xml version="1.0" encoding="UTF-8"?>
<r>
  <!--%s
%s
[admin]
[model]
-->
  <!--%s:%s-->
</r>
""" % (beastling.extractor._generated_str,
       beastling.extractor._config_file_str,
       beastling.extractor._data_file_str,
       datafile.as_posix()))
        res = self._extract(fname)
        self.assertIn(datafile.name, ''.join(res))
Example #13
    def create(self, req, filename=None, verbose=True):
        p = self.abspath(req)
        if not p.parent.exists():  # pragma: no cover
            p.parent.mkdir()
        tmp = Path('%s.tmp' % p.as_posix())

        if self.rdf:
            # we do not create archives with a readme for rdf downloads, because each
            # RDF entity points to the dataset and the void description of the dataset
            # covers all relevant metadata.
            #
            # TODO: write test for the file name things!?
            #
            with closing(
                    GzipFile(filename=Path(tmp.stem).stem,
                             fileobj=tmp.open('wb'))) as fp:
                self.before(req, fp)
                for i, item in enumerate(
                        page_query(self.query(req), verbose=verbose)):
                    self.dump(req, fp, item, i)
                self.after(req, fp)
        else:
            with ZipFile(tmp.as_posix(), 'w', ZIP_DEFLATED) as zipfile:
                if not filename:
                    fp = self.get_stream()
                    self.before(req, fp)
                    for i, item in enumerate(
                            page_query(self.query(req), verbose=verbose)):
                        self.dump(req, fp, item, i)
                    self.after(req, fp)
                    zipfile.writestr(self.name, self.read_stream(fp))
                else:  # pragma: no cover
                    zipfile.write(filename, self.name)
                zipfile.writestr(
                    'README.txt',
                    README.format(
                        req.dataset.name,
                        '=' * (len(req.dataset.name) + len(' data download')),
                        req.dataset.license,
                        TxtCitation(None).render(req.dataset,
                                                 req)).encode('utf8'))
        if p.exists():  # pragma: no cover
            remove(p)
        move(tmp, p)
Example #14
def test_extractor(config_factory, tmppath, data_dir):
    config = config_factory("admin", "mk", "embed_data")
    xml = beastling.beastxml.BeastXml(config)
    xmlfile = str(tmppath / "beastling.xml")
    xml.write_file(xmlfile)
    assert bool(_extract(xmlfile))

    config = config_factory({
            'admin': {'basename': 'abcdefg'},
            'model model': {
                'model': 'mk',
                'data': str(data_dir / 'basic.csv')}})
    xml = beastling.beastxml.BeastXml(config)
    xmlfile = str(tmppath / "beastling.xml")
    xml.write_file(xmlfile)
    beastling.extractor.extract(xmlfile)
    p = Path('abcdefg.conf')
    assert p.exists()
    cfg = INI(interpolation=None)
    cfg.read(p.as_posix())
    remove(p)
    assert cfg['admin']['basename'] == 'abcdefg'
    assert cfg['model model']['model'] == 'mk'

    fname = tmppath / 'test.xml'
    datafile = tmppath / 'test.csv'
    assert not datafile.exists()
    with fname.open('w', encoding='utf8') as fp:
        fp.write("""<?xml version="1.0" encoding="UTF-8"?>
<r>
  <!--%s
%s
[admin]
[model model]
-->
  <!--%s:%s-->
</r>
""" % (beastling.extractor._generated_str,
       beastling.extractor._config_file_str,
       beastling.extractor._data_file_str,
       datafile.as_posix()))
    res = _extract(fname)
    assert datafile.name in ''.join(res)
Example #15
def _str_path(path, mkdir=False):
    """Get a file-system path as text_type, suitable for passing into io.open.

    Parameters
    ----------
    path : {text_type, Path}
        A fs path either as Path instance or as text_type.
    mkdir : bool (default=False)
        If True, create the directories within the path.

    Returns
    -------
    path : text_type
        The path as text_type.
    """
    res = Path(path_component(path))
    if mkdir and res.parent and not res.parent.exists():
        res.parent.mkdir(parents=True)
    return res.as_posix()
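
Usage sketch (the target path is illustrative):

import io

with io.open(_str_path('build/report/out.txt', mkdir=True), 'w', encoding='utf8') as fp:
    fp.write(u'done')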
Example #16
File: util.py Project: clld/wals3
def contribution_detail_html(context=None, request=None, **kw):
    if context.id == 's4':
        raise HTTPFound(request.route_url('genealogy'))

    p = Path(wals3.__file__).parent.joinpath(
        'static', 'descriptions', str(context.id), 'body.xhtml')
    c = codecs.open(p.as_posix(), encoding='utf8').read()

    adapter = get_adapter(IRepresentation, Feature(), request, ext='snippet.html')

    for feature in DBSession.query(Feature)\
            .filter(Feature.contribution_pk == context.pk)\
            .options(joinedload_all(Feature.domain, DomainElement.values)):
        table = soup(adapter.render(feature, request))
        values = '\n'.join('%s' % table.find(tag).extract()
                           for tag in ['thead', 'tbody'])
        c = c.replace('__values_%s__' % feature.id, values)

    return {'text': c.replace('http://wals.info', request.application_url)}
Example #17
def contribution_detail_html(context=None, request=None, **kw):
    if context.id == 's4':
        raise HTTPFound(request.route_url('genealogy'))

    p = Path(wals3.__file__).parent.joinpath('static', 'descriptions',
                                             str(context.id), 'body.xhtml')
    c = codecs.open(p.as_posix(), encoding='utf8').read()

    adapter = get_adapter(IRepresentation,
                          Feature(),
                          request,
                          ext='snippet.html')

    for feature in DBSession.query(Feature)\
            .filter(Feature.contribution_pk == context.pk)\
            .options(joinedload_all(Feature.domain, DomainElement.values)):
        table = soup(adapter.render(feature, request))
        values = '\n'.join('%s' % table.find(tag).extract()
                           for tag in ['thead', 'tbody'])
        c = c.replace('__values_%s__' % feature.id, values)

    return {'text': c.replace('http://wals.info', request.application_url)}
Example #18
def rewrite(fname, visitor, **kw):
    """Utility function to rewrite rows in tsv files.

    :param fname: Path of the dsv file to operate on.
    :param visitor: A callable that takes a line-number and a row as input and returns a \
    (modified) row or None to filter out the row.
    :param kw: Keyword parameters are passed through to csv.reader/csv.writer.
    """
    if not isinstance(fname, Path):
        assert isinstance(fname, string_types)
        fname = Path(fname)

    assert fname.is_file()
    tmp = fname.parent.joinpath('.tmp.' + fname.name)

    with UnicodeReader(fname, **kw) as reader_:
        with UnicodeWriter(tmp, **kw) as writer:
            for i, row in enumerate(reader_):
                row = visitor(i, row)
                if row is not None:
                    writer.writerow(row)
    shutil.move(tmp.as_posix(), fname.as_posix())
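
A hypothetical visitor that drops the header row and normalizes the first column shows the contract (return None to filter a row out, or return a row to keep it):

def visitor(i, row):
    if i == 0:
        return None       # filter out the header row
    row[0] = row[0].upper()
    return row

rewrite('forms.tsv', visitor, delimiter='\t')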
Example #19
    def from_ini(cls, ini, nodes=None):
        nodes = nodes or {}
        ini = Path(ini)
        directory = ini.parent
        cfg = INI(interpolation=None)
        cfg.read(ini.as_posix(), encoding='utf8')

        lineage = []
        for parent in directory.parents:
            id_ = parent.name
            assert id_ != directory.name
            if not Glottocode.pattern.match(id_):
                # we ignore leading non-languoid-dir path components.
                break

            if id_ not in nodes:
                l = Languoid.from_dir(parent, nodes=nodes)
                nodes[id_] = (l.name, l.id, l.level)
            lineage.append(nodes[id_])

        res = cls(cfg, list(reversed(lineage)), directory=directory)
        nodes[res.id] = (res.name, res.id, res.level)
        return res
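
The lineage loop depends on Path.parents yielding the nearest ancestor first and the root last, hence the reversed() at the end:

from pathlib import Path

ini = Path('tree/abcd1234/efgh1234/md.ini')
[p.name for p in ini.parent.parents]  # ['abcd1234', 'tree', '']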
Example #20
    def from_ini(cls, ini, nodes={}):
        if not isinstance(ini, Path):
            ini = Path(ini)

        directory = ini.parent
        cfg = INI()
        cfg.read(ini.as_posix(), encoding='utf8')

        lineage = []
        for parent in directory.parents:
            id_ = parent.name.split('.')[-1]
            assert id_ != directory.name.split('.')[-1]
            if not cls.id_pattern.match(id_):
                # we ignore leading non-languoid-dir path components.
                break

            if id_ not in nodes:
                l = Languoid.from_dir(parent, nodes=nodes)
                nodes[id_] = (l.name, l.id, l.level)
            lineage.append(nodes[id_])

        res = cls(cfg, list(reversed(lineage)))
        nodes[res.id] = (res.name, res.id, res.level)
        return res
Example #21
    def from_ini(cls, ini, nodes=None):
        if nodes is None:
            nodes = {}
        ini = Path(ini)
        directory = ini.parent
        cfg = INI(interpolation=None)
        cfg.read(ini.as_posix(), encoding='utf8')

        lineage = []
        for parent in directory.parents:
            id_ = parent.name
            assert id_ != directory.name
            if not Glottocode.pattern.match(id_):
                # we ignore leading non-languoid-dir path components.
                break

            if id_ not in nodes:
                l = Languoid.from_dir(parent, nodes=nodes)
                nodes[id_] = (l.name, l.id, l.level)
            lineage.append(nodes[id_])

        res = cls(cfg, list(reversed(lineage)), directory=directory)
        nodes[res.id] = (res.name, res.id, res.level)
        return res
Example #22
class Database(object):
    def __init__(self, fname):
        """
        A `Database` instance is initialized with a file path.

        :param fname: Path to a file in the file system where the db is to be stored.
        """
        self.fname = Path(fname)

    def drop(self):
        if self.fname.exists():
            remove(self.fname)

    def connection(self):
        return closing(sqlite3.connect(self.fname.as_posix()))

    def create(self, force=False, exists_ok=False):
        """
        Creates a db file with the core schema.

        :param force: If `True` an existing db file will be overwritten.
        """
        if self.fname and self.fname.exists():
            if force:
                self.drop()
            elif exists_ok:
                return
            else:
                raise ValueError('db file already exists, use force=True to overwrite')
        with self.connection() as db:
            db.execute(
                """\
CREATE TABLE dataset (
    ID TEXT PRIMARY KEY NOT NULL,
    name TEXT,
    version TEXT,
    metadata_json TEXT
)""")
            db.execute("""\
CREATE TABLE datasetmeta (
    dataset_ID TEXT ,
    key TEXT,
    value TEXT,
    PRIMARY KEY (dataset_ID, key),
    FOREIGN KEY(dataset_ID) REFERENCES dataset(ID)
)""")
            db.execute("""\
CREATE TABLE SourceTable (
    dataset_ID TEXT ,
    ID TEXT ,
    bibtex_type TEXT,
    {0}
    extra TEXT,
    PRIMARY KEY (dataset_ID, ID),
    FOREIGN KEY(dataset_ID) REFERENCES dataset(ID)
)""".format('\n    '.join('`{0}` TEXT,'.format(f) for f in BIBTEX_FIELDS)))

    def fetchone(self, sql, params=None, conn=None, verbose=False):
        return self._fetch(sql, 'fetchone', params, conn, verbose=verbose)

    def fetchall(self, sql, params=None, conn=None, verbose=False):
        return self._fetch(sql, 'fetchall', params, conn, verbose=verbose)

    def _fetch(self, sql, method, params, conn, verbose=False):
        sql = self.sql.get(sql, sql)

        def _do(conn, sql, method):
            cu = conn.cursor()
            if verbose:
                print(sql)
            cu.execute(sql, params or ())
            return getattr(cu, method)()

        if not conn:
            with self.connection() as conn:
                return _do(conn, sql, method)
        else:
            return _do(conn, sql, method)

    @property
    def tables(self):
        res = {r[0]: {} for r in self.fetchall(
            "SELECT name FROM sqlite_master WHERE type='table'")}
        for t in res:
            res[t] = {r[1]: r[2] for r in self.fetchall(
                "PRAGMA table_info({0})".format(t))}
        return res

    def unload(self, dataset_id):
        dataset_id = getattr(dataset_id, 'id', dataset_id)
        with self.connection() as db:
            for table in self.tables:
                if table != 'dataset':
                    db.execute(
                        "DELETE FROM {0} WHERE dataset_ID = ?".format(table),
                        (dataset_id,))
            db.execute("DELETE FROM dataset WHERE ID = ?", (dataset_id,))
            db.commit()

    def _create_table_if_not_exists(self, table):
        if table.name in self.tables:
            return False

        with self.connection() as conn:
            conn.execute(table.sql)
        return True

    def load(self, ds, verbose=False):
        """
        Load a CLDF dataset into the database.

        :param dataset:
        :return:
        """
        try:
            self.fetchone('select ID from dataset')
        except sqlite3.OperationalError:
            self.create(force=True)
        self.unload(ds)
        dataset = ds.cldf.wl
        tables, ref_tables = schema(dataset)

        # update the DB schema:
        for t in tables:
            if self._create_table_if_not_exists(t):
                continue
            db_cols = self.tables[t.name]
            for col in t.columns:
                if col.name not in db_cols:
                    with self.connection() as conn:
                        conn.execute(
                            "ALTER TABLE {0} ADD COLUMN `{1.name}` {1.db_type}".format(
                                t.name, col))
                else:
                    if db_cols[col.name] != col.db_type:
                        raise ValueError(
                            'column {0}:{1} {2} redefined with new type {3}'.format(
                                t.name, col.name, db_cols[col.name], col.db_type))

        for t in ref_tables.values():
            self._create_table_if_not_exists(t)

        self.update_schema()

        # then load the data:
        with self.connection() as db:
            db.execute('PRAGMA foreign_keys = ON;')
            insert(
                db,
                'dataset',
                'ID,name,version,metadata_json',
                (
                    ds.id,
                    '{0}'.format(dataset),
                    git_hash(ds.dir),
                    dumps(dataset.metadata_dict)))
            insert(
                db,
                'datasetmeta',
                'dataset_ID,key,value',
                *[(ds.id, k, '{0}'.format(v)) for k, v in dataset.properties.items()])

            # load sources:
            rows = []
            for src in dataset.sources.items():
                values = [ds.id, src.id, src.genre] + [src.get(k) for k in BIBTEX_FIELDS]
                values.append(
                    dumps({k: v for k, v in src.items() if k not in BIBTEX_FIELDS}))
                rows.append(tuple(values))
            insert(
                db,
                'SourceTable',
                ['dataset_ID', 'ID', 'bibtex_type'] + BIBTEX_FIELDS + ['extra'],
                *rows)

            # For regular tables, we extract and keep references to sources.
            refs = defaultdict(list)

            for t in tables:
                # We want to lookup columns by the name used in the CLDF dataset.
                cols = {col.cldf_name: col for col in t.columns}
                # But we also want to look up primary keys by the database column name.
                cols_by_name = {col.name: col for col in t.columns}

                ref_table = ref_tables.get(t.name)
                rows, keys = [], []
                try:
                    for row in dataset[t.name]:
                        keys, values = ['dataset_ID'], [ds.id]
                        for k, v in row.items():
                            if ref_table and k == ref_table.consumes:
                                col = cols_by_name[t.primary_key]
                                refs[ref_table.name].append((row[col.cldf_name], v))
                            else:
                                col = cols[k]
                                if isinstance(v, list):
                                    v = (col.separator or ';').join(
                                        nfilter(col.convert(vv) for vv in v))
                                else:
                                    v = col.convert(v)
                                keys.append("`{0}`".format(col.name))
                                values.append(v)
                        keys, values = self.update_row(t.name, keys, values)
                        rows.append(tuple(values))
                    insert(db, t.name, keys, *rows, **{'verbose': verbose})
                except FileNotFoundError:
                    if t.name != 'CognateTable':  # An empty CognateTable is allowed.
                        raise  # pragma: no cover

            # Now insert the references, i.e. the associations with sources:
            for tname, items in refs.items():
                rows = []
                for oid, sources in items:
                    for source in sources:
                        sid, context = Sources.parse(source)
                        rows.append([ds.id, oid, sid, context])
                oid_col = '{0}_ID'.format(tname.replace('Source', ''))
                insert(db, tname, ['dataset_ID', oid_col, 'Source_ID', 'Context'], *rows)
            db.commit()

    def update_schema(self):
        for tname, cname, type_ in [
            ('ParameterTable', 'Ontological_Category', 'TEXT'),
            ('ParameterTable', 'Semantic_Field', 'TEXT'),
            ('LanguageTable', 'Latitude', 'REAL'),
            ('LanguageTable', 'Longitude', 'REAL'),
        ]:
            if cname not in self.tables[tname]:
                with self.connection() as conn:
                    conn.execute("ALTER TABLE {0} ADD COLUMN `{1}` {2}".format(
                        tname, cname, type_))

    def update_row(self, table, keys, values):
        return keys, values

    def load_concepticon_data(self, concepticon):
        conceptsets = []
        for csid in self.fetchall("SELECT distinct concepticon_id FROM parametertable"):
            cs = concepticon.conceptsets.get(csid[0])
            if cs:
                conceptsets.append((
                    cs.gloss, cs.ontological_category, cs.semanticfield, cs.id))

        with self.connection() as db:
            db.executemany(
                """\
UPDATE parametertable
SET concepticon_gloss = ?, ontological_category = ?, semantic_field = ?
WHERE concepticon_id = ?""",
                conceptsets)
            db.commit()

    def load_glottolog_data(self, glottolog):
        langs = []
        languoids = {l.id: l for l in glottolog.languoids()}
        for gc in self.fetchall("SELECT distinct glottocode FROM languagetable"):
            lang = languoids.get(gc[0])
            if lang:
                langs.append((
                    lang.lineage[0][0] if lang.lineage else lang.name,
                    lang.macroareas[0].name if lang.macroareas else None,
                    lang.latitude,
                    lang.longitude,
                    lang.id))

        with self.connection() as db:
            db.executemany(
                """\
UPDATE languagetable
SET family = ?, macroarea = ?, latitude = ?, longitude = ?
WHERE glottocode = ?""",
                langs)
            db.commit()

    sql = {
        "conceptsets_by_dataset":
            "SELECT ds.id, count(distinct p.concepticon_id) "
            "FROM dataset as ds, parametertable as p "
            "WHERE ds.id = p.dataset_id GROUP BY ds.id",
        "families_by_dataset":
            "SELECT ds.id, count(distinct l.family) "
            "FROM dataset as ds, languagetable as l "
            "WHERE ds.id = l.dataset_id GROUP BY ds.id",
        "macroareas_by_dataset":
            "SELECT ds.id, group_concat(distinct l.macroarea) "
            "FROM dataset as ds, languagetable as l "
            "WHERE ds.id = l.dataset_id GROUP BY ds.id",
        "glottocodes_by_dataset":
            "SELECT ds.id, count(distinct l.glottocode) "
            "FROM dataset as ds, languagetable as l "
            "WHERE ds.id = l.dataset_id GROUP BY ds.id",
        "mapped_lexemes_by_dataset":
            "SELECT ds.id, count(distinct f.ID) "
            "FROM dataset as ds, formtable as f, languagetable as l, parametertable as p "
            "WHERE ds.id = f.dataset_id and f.Language_ID = l.ID and "
            "f.Parameter_ID = p.ID and l.glottocode is not null and "
            "p.concepticon_id is not null "
            "GROUP BY ds.id",
        "lexemes_by_dataset":
            "SELECT ds.id, count(f.ID) FROM dataset as ds, formtable as f "
            "WHERE ds.id = f.dataset_id GROUP BY ds.id",
    }
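
Usage sketch, assuming ds is a dataset wrapper as expected by load() (with .id, .dir and .cldf attributes) and glottolog is the corresponding API object:

db = Database('lexibank.sqlite')
db.create(exists_ok=True)
db.load(ds)
db.load_glottolog_data(glottolog)
db.fetchall("conceptsets_by_dataset")  # named queries resolve via Database.sql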
Example #23
def includeme(config):
    """Upgrading:

    - register utilities "by hand", after config.include('clld.web.app')
    - add routes by hand (and remove these from the **kw passed to Configurator)

    :param config:
    :return:
    """
    #
    # now we exploit the default package layout as created via the CLLD scaffold:
    #
    # note: the following exploits the import time side effect of modifying the webassets
    # environment!
    root_package = config.root_package.__name__
    maybe_import('%s.assets' % root_package)

    pkg_dir = Path(config.root_package.__file__).parent.resolve()

    json_renderer = JSON()
    json_renderer.add_adapter(datetime.datetime, lambda obj, req: obj.isoformat())
    json_renderer.add_adapter(datetime.date, lambda obj, req: obj.isoformat())
    config.add_renderer('json', json_renderer)

    jsonp_renderer = JSONP(param_name='callback')
    jsonp_renderer.add_adapter(datetime.datetime, lambda obj, req: obj.isoformat())
    jsonp_renderer.add_adapter(datetime.date, lambda obj, req: obj.isoformat())
    config.add_renderer('jsonp', jsonp_renderer)

    config.set_request_factory(ClldRequest)
    config.registry.registerUtility(CtxFactoryQuery(), interfaces.ICtxFactoryQuery)
    config.registry.registerUtility(OlacConfig(), interfaces.IOlacConfig)

    # initialize the db connection
    engine = engine_from_config(config.registry.settings, 'sqlalchemy.')
    DBSession.configure(bind=engine)
    Base.metadata.bind = engine

    config.add_settings({
        'pyramid.default_locale_name': 'en',
        'clld.pkg': root_package,
        'clld.parameters': {}})
    if 'clld.files' in config.registry.settings:
        # deployment-specific location of static data files
        abspath = Path(config.registry.settings['clld.files']).resolve()
        config.add_settings({'clld.files': abspath})
        config.add_static_view('files', abspath.as_posix())

    # event subscribers:
    config.add_subscriber(add_localizer, events.NewRequest)
    config.add_subscriber(init_map, events.ContextFound)
    config.add_subscriber(
        partial(add_renderer_globals, maybe_import('%s.util' % root_package)),
        events.BeforeRender)

    #
    # make it easy to register custom functionality
    #
    for name, func in {
        'register_datatable': partial(register_cls, interfaces.IDataTable),
        'register_map': partial(register_cls, interfaces.IMap),
        'register_menu': register_menu,
        'register_resource': register_resource,
        'register_adapter': register_adapter,
        'register_adapters': register_adapters,
        'register_download': register_download,
        'register_staticresource': register_staticresource,
        'add_route_and_view': add_route_and_view,
        'add_settings_from_file': add_settings_from_file,
        'add_301': add_301,
        'add_410': add_410,
        'add_page': add_page,
        'register_resource_routes_and_views': register_resource_routes_and_views,
    }.items():
        config.add_directive(name, func)

    #
    # routes and views
    #
    config.add_static_view('clld-static', 'clld:web/static')
    config.add_static_view('static', '%s:static' % root_package)

    config.add_route_and_view('_js', '/_js', js, http_cache=3600)

    # add some maintenance hatches
    config.add_route_and_view('_raise', '/_raise', _raise)
    config.add_route_and_view('_ping', '/_ping', _ping, renderer='json')

    # sitemap support:
    config.add_route_and_view('robots', '/robots.txt', robots)
    config.add_route_and_view('sitemapindex', '/sitemap.xml', sitemapindex)
    config.add_route_and_view('sitemap', '/sitemap.{rsc}.{n}.xml', sitemap)
    config.add_route('resourcemap', '/resourcemap.json')
    config.add_view(resourcemap, route_name='resourcemap', renderer='jsonp')
    config.add_route_and_view(
        'select_combination', '/_select_combination', select_combination)

    config.add_route_and_view('unapi', '/unapi', unapi)
    config.add_route_and_view('olac', '/olac', olac)

    config.add_settings_from_file(pkg_dir.joinpath('appconf.ini'))
    if not config.registry.settings.get('mako.directories'):
        config.add_settings({'mako.directories': ['clld:web/templates']})

    for rsc in RESOURCES:
        config.register_resource_routes_and_views(rsc)
        config.register_datatable(
            rsc.plural, getattr(datatables, rsc.plural.capitalize(), DataTable))
        register_resource_adapters(config, rsc)

    # maps
    config.register_map('languages', Map)
    config.register_map('language', LanguageMap)
    config.register_map('parameter', ParameterMap)
    config.register_map('combination', CombinationMap)

    config.include('clld.web.adapters')

    for icon in ICONS:
        config.registry.registerUtility(icon, interfaces.IIcon, name=icon.name)
    config.registry.registerUtility(ORDERED_ICONS, interfaces.IIconList)
    config.registry.registerUtility(MapMarker(), interfaces.IMapMarker)

    #
    # inspect default locations for views and templates:
    #
    home_comp = OrderedDict()
    for name, template in [
        ('introduction', False),
        ('about', False),
        ('terms', False),
        ('glossary', False),
        ('history', False),
        ('changes', False),
        ('credits', False),
        ('legal', True),
        ('download', True),
        ('contact', True),
        ('help', False),
    ]:
        home_comp[name] = template

    if pkg_dir.joinpath('templates').exists():
        for p in pkg_dir.joinpath('templates').iterdir():
            if p.stem in home_comp and p.suffix == '.mako':
                home_comp[p.stem] = True

    for name, template in home_comp.items():
        if template:
            config.add_page(name)

    config.add_settings({'home_comp': [k for k in home_comp.keys() if home_comp[k]]})

    if 'clld.favicon' not in config.registry.settings:
        favicon = {'clld.favicon': 'clld:web/static/images/favicon.ico'}
        # hard to test (in particular on travis) and without too much consequence
        # (and the consequences faced are easy to spot).
        if pkg_dir.joinpath('static', 'favicon.ico').exists():  # pragma: no cover
            favicon['clld.favicon'] = root_package + ':static/favicon.ico'
        config.add_settings(favicon)

    with open(abspath_from_asset_spec(
            config.registry.settings['clld.favicon']), mode='rb') as fp:
        fh = md5()
        fh.update(fp.read())
        config.add_settings({'clld.favicon_hash': fh.hexdigest()})

    translation_dirs = ['clld:locale']
    if pkg_dir.joinpath('locale').exists():
        translation_dirs.append('%s:locale' % root_package)  # pragma: no cover
    config.add_translation_dirs(*translation_dirs)

    if pkg_dir.joinpath('static/publisher_logo.png').exists():  # pragma: no cover
        config.add_settings(
            {'clld.publisher_logo': '%s:static/publisher_logo.png' % root_package})

    if asbool(config.registry.settings.get('clld.pacific_centered_maps')):
        geojson.pacific_centered()

    v = maybe_import('%s.views' % root_package)
    if v:
        config.scan(v)  # pragma: no cover

    menuitems = config.registry.settings.get(
        'clld.menuitems_list',
        ['contributions', 'parameters', 'languages', 'contributors'])
    config.register_menu(('dataset', dict(label='Home')), *menuitems)

    config.include('pyramid_mako')

    for name in ['adapters', 'datatables', 'maps']:
        mod = maybe_import('%s.%s' % (root_package, name))
        if mod and hasattr(mod, 'includeme'):
            config.include(mod)

    config.register_download(CldfDownload(common.Dataset, root_package))
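
includeme is Pyramid's hook for config.include; a minimal, hypothetical entry point showing how an app built on the scaffold would pull it in:

from pyramid.config import Configurator

def main(global_config, **settings):
    config = Configurator(settings=settings)
    config.include('clld.web.app')  # invokes the includeme above
    return config.make_wsgi_app()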
Example #24
def includeme(config):
    """Upgrading:

    - register utilities "by hand", after config.include('clld.web.app')
    - add routes by hand (and remove these from the **kw passed to Configurator)

    :param config:
    :return:
    """
    #
    # now we exploit the default package layout as created via the CLLD scaffold:
    #
    # note: the following exploits the import time side effect of modifying the webassets
    # environment!
    root_package = config.root_package.__name__
    maybe_import('%s.assets' % root_package)

    pkg_dir = Path(config.root_package.__file__).parent.resolve()

    json_renderer = JSON()
    json_renderer.add_adapter(datetime.datetime, lambda obj, req: obj.isoformat())
    json_renderer.add_adapter(datetime.date, lambda obj, req: obj.isoformat())
    config.add_renderer('json', json_renderer)

    jsonp_renderer = JSONP(param_name='callback')
    jsonp_renderer.add_adapter(datetime.datetime, lambda obj, req: obj.isoformat())
    jsonp_renderer.add_adapter(datetime.date, lambda obj, req: obj.isoformat())
    config.add_renderer('jsonp', jsonp_renderer)

    config.set_request_factory(ClldRequest)
    config.registry.registerUtility(CtxFactoryQuery(), interfaces.ICtxFactoryQuery)
    config.registry.registerUtility(OlacConfig(), interfaces.IOlacConfig)

    # initialize the db connection
    engine = engine_from_config(config.registry.settings, 'sqlalchemy.')
    DBSession.configure(bind=engine)
    Base.metadata.bind = engine

    config.add_settings({
        'pyramid.default_locale_name': 'en',
        'clld.pkg': root_package,
        'clld.parameters': {}})
    if 'clld.files' in config.registry.settings:
        # deployment-specific location of static data files
        abspath = Path(config.registry.settings['clld.files']).resolve()
        config.add_settings({'clld.files': abspath})
        config.add_static_view('files', abspath.as_posix())

    # event subscribers:
    config.add_subscriber(add_localizer, events.NewRequest)
    config.add_subscriber(init_map, events.ContextFound)
    config.add_subscriber(
        partial(add_renderer_globals, maybe_import('%s.util' % root_package)),
        events.BeforeRender)

    #
    # make it easy to register custom functionality
    #
    for name, func in {
        'register_datatable': partial(register_cls, interfaces.IDataTable),
        'register_map': partial(register_cls, interfaces.IMap),
        'register_menu': register_menu,
        'register_resource': register_resource,
        'register_adapter': register_adapter,
        'register_adapters': register_adapters,
        'register_download': register_download,
        'register_staticresource': register_staticresource,
        'add_route_and_view': add_route_and_view,
        'add_settings_from_file': add_settings_from_file,
        'add_301': add_301,
        'add_410': add_410,
        'register_resource_routes_and_views': register_resource_routes_and_views,
    }.items():
        config.add_directive(name, func)

    #
    # routes and views
    #
    config.add_static_view('clld-static', 'clld:web/static')
    config.add_static_view('static', '%s:static' % root_package)

    config.add_route_and_view('_js', '/_js', js, http_cache=3600)

    # add some maintenance hatches
    config.add_route_and_view('_raise', '/_raise', _raise)
    config.add_route_and_view('_ping', '/_ping', _ping, renderer='json')

    # sitemap support:
    config.add_route_and_view('robots', '/robots.txt', robots)
    config.add_route_and_view('sitemapindex', '/sitemap.xml', sitemapindex)
    config.add_route_and_view('sitemap', '/sitemap.{rsc}.{n}.xml', sitemap)
    config.add_route('resourcemap', '/resourcemap.json')
    config.add_view(resourcemap, route_name='resourcemap', renderer='jsonp')
    config.add_route_and_view(
        'select_combination', '/_select_combination', select_combination)

    config.add_route_and_view('unapi', '/unapi', unapi)
    config.add_route_and_view('olac', '/olac', olac)

    config.add_settings_from_file(pkg_dir.joinpath('appconf.ini'))
    if not config.registry.settings.get('mako.directories'):
        config.add_settings({'mako.directories': ['clld:web/templates']})

    for rsc in RESOURCES:
        config.register_resource_routes_and_views(rsc)
        config.register_datatable(
            rsc.plural, getattr(datatables, rsc.plural.capitalize(), DataTable))
        register_resource_adapters(config, rsc)

    # maps
    config.register_map('languages', Map)
    config.register_map('language', LanguageMap)
    config.register_map('parameter', ParameterMap)
    config.register_map('combination', CombinationMap)

    config.include('clld.web.adapters')

    for icon in ICONS:
        config.registry.registerUtility(icon, interfaces.IIcon, name=icon.name)
    config.registry.registerUtility(ORDERED_ICONS, interfaces.IIconList)
    config.registry.registerUtility(MapMarker(), interfaces.IMapMarker)

    #
    # inspect default locations for views and templates:
    #
    home_comp = OrderedDict()
    for name, template in [
        ('introduction', False),
        ('about', False),
        ('terms', False),
        ('glossary', False),
        ('history', False),
        ('changes', False),
        ('credits', False),
        ('legal', True),
        ('download', True),
        ('contact', True),
        ('help', False),
    ]:
        home_comp[name] = template

    if pkg_dir.joinpath('templates').exists():
        for p in pkg_dir.joinpath('templates').iterdir():
            if p.stem in home_comp and p.suffix == '.mako':
                home_comp[p.stem] = True

    views = maybe_import('%s.views' % root_package)
    for name, template in home_comp.items():
        if template:
            config.add_route_and_view(
                name,
                '/' + name,
                getattr(views, name, lambda r: {}),
                renderer=name + '.mako')

    config.add_settings({'home_comp': [k for k in home_comp.keys() if home_comp[k]]})

    if 'clld.favicon' not in config.registry.settings:
        favicon = {'clld.favicon': 'clld:web/static/images/favicon.ico'}
        # hard to test (in particular on travis) and without too much consequence
        # (and the consequences faced are easy to spot).
        if pkg_dir.joinpath('static', 'favicon.ico').exists():  # pragma: no cover
            favicon['clld.favicon'] = root_package + ':static/favicon.ico'
        config.add_settings(favicon)

    with open(abspath_from_asset_spec(
            config.registry.settings['clld.favicon']), mode='rb') as fp:
        fh = md5()
        fh.update(fp.read())
        config.add_settings({'clld.favicon_hash': fh.hexdigest()})

    translation_dirs = ['clld:locale']
    if pkg_dir.joinpath('locale').exists():
        translation_dirs.append('%s:locale' % root_package)  # pragma: no cover
    config.add_translation_dirs(*translation_dirs)

    if pkg_dir.joinpath('static/publisher_logo.png').exists():  # pragma: no cover
        config.add_settings(
            {'clld.publisher_logo': '%s:static/publisher_logo.png' % root_package})

    if asbool(config.registry.settings.get('clld.pacific_centered_maps')):
        geojson.pacific_centered()

    v = maybe_import('%s.views' % root_package)
    if v:
        config.scan(v)  # pragma: no cover

    menuitems = config.registry.settings.get(
        'clld.menuitems_list',
        ['contributions', 'parameters', 'languages', 'contributors'])
    config.register_menu(('dataset', dict(label='Home')), *menuitems)

    config.include('pyramid_mako')

    for name in ['adapters', 'datatables', 'maps']:
        mod = maybe_import('%s.%s' % (root_package, name))
        if mod and hasattr(mod, 'includeme'):
            config.include(mod)

    config.register_download(CldfDownload(common.Dataset, root_package))
Example #25
class Database(object):
    def __init__(self, fname):
        """
        A `Database` instance is initialized with a file path.

        :param fname: Path to a file in the file system where the db is to be stored.
        """
        self.fname = Path(fname)

    def drop(self):
        if self.fname.exists():
            remove(self.fname)

    def connection(self):
        return closing(sqlite3.connect(self.fname.as_posix()))

    def create(self, force=False):
        """
        Creates a db file with the core schema.

        :param force: If `True` an existing db file will be overwritten.
        """
        if self.fname and self.fname.exists():
            if force:
                self.drop()
            else:
                raise ValueError(
                    'db file already exists, use force=True to overwrite')
        with self.connection() as db:
            db.execute("""\
CREATE TABLE dataset (
    ID INTEGER PRIMARY KEY NOT NULL,
    name TEXT,
    module TEXT,
    metadata_json TEXT
)""")
            db.execute("""\
CREATE TABLE datasetmeta (
    dataset_ID INT ,
    key TEXT,
    value TEXT,
    FOREIGN KEY(dataset_ID) REFERENCES dataset(ID)
)""")
            db.execute("""\
CREATE TABLE SourceTable (
    dataset_ID INT ,
    ID TEXT PRIMARY KEY NOT NULL,
    bibtex_type TEXT,
    {0}
    extra TEXT,
    FOREIGN KEY(dataset_ID) REFERENCES dataset(ID)
)""".format('\n    '.join('"{0}" TEXT,'.format(f) for f in BIBTEX_FIELDS)))

    def fetchone(self, sql, conn=None):
        return self._fetch(sql, 'fetchone', conn)

    def fetchall(self, sql, conn=None):
        return self._fetch(sql, 'fetchall', conn)

    def _fetch(self, sql, method, conn):
        def _do(conn, sql, method):
            cu = conn.cursor()
            cu.execute(sql)
            return getattr(cu, method)()

        if not conn:
            with self.connection() as conn:
                return _do(conn, sql, method)
        else:
            return _do(conn, sql, method)

    def delete(self, dataset_id):
        with self.connection() as db:
            for row in db.execute(
                    "SELECT name FROM sqlite_master WHERE type='table'"):
                table = row[0]
                if table != 'dataset':
                    db.execute(
                        "DELETE FROM {0} WHERE dataset_ID = ?".format(table),
                        (dataset_id, ))
            db.execute("DELETE FROM dataset WHERE ID = ?", (dataset_id, ))
            db.commit()

    def _create_table_if_not_exists(self, table):
        if table.name in [
                r[0] for r in self.fetchall(
                    "SELECT name FROM sqlite_master WHERE type='table'")
        ]:
            return False

        with self.connection() as conn:
            conn.execute(table.sql)
        return True

    def load(self, dataset):
        """
        Load a CLDF dataset into the database.

        :param dataset:
        :return:
        """
        tables, ref_tables = schema(dataset)

        # update the DB schema:
        for t in tables:
            if self._create_table_if_not_exists(t):
                continue
            db_cols = {
                r[1]: r[2]
                for r in self.fetchall("PRAGMA table_info({0})".format(t.name))
            }
            for col in t.columns:
                if col.name not in db_cols:
                    with self.connection() as conn:
                        conn.execute(
                            "ALTER TABLE {0} ADD COLUMN \"{1.name}\" {1.db_type}"
                            .format(t.name, col))
                else:
                    if db_cols[col.name] != col.db_type:
                        raise ValueError(
                            'column {0}:{1} {2} redefined with new type {3}'.
                            format(t.name, col.name, db_cols[col.name],
                                   col.db_type))

        for t in ref_tables.values():
            self._create_table_if_not_exists(t)

        # then load the data:
        with self.connection() as db:
            db.execute('PRAGMA foreign_keys = ON;')
            pk = max([
                r[0] for r in self.fetchall("SELECT ID FROM dataset", conn=db)
            ] or [0]) + 1
            insert(db, 'dataset', 'ID,name,module,metadata_json',
                   (pk, '{0}'.format(dataset), dataset.module,
                    dumps(dataset.metadata_dict)))
            insert(
                db, 'datasetmeta', 'dataset_ID,key,value',
                *[(pk, k, '{0}'.format(v))
                  for k, v in dataset.properties.items()])

            # load sources:
            rows = []
            for src in dataset.sources.items():
                values = [pk, src.id, src.genre
                          ] + [src.get(k) for k in BIBTEX_FIELDS]
                values.append(
                    dumps({
                        k: v
                        for k, v in src.items() if k not in BIBTEX_FIELDS
                    }))
                rows.append(tuple(values))
            insert(db, 'SourceTable', ['dataset_ID', 'ID', 'bibtex_type'] +
                   BIBTEX_FIELDS + ['extra'], *rows)

            # For regular tables, we extract and keep references to sources.
            refs = defaultdict(list)

            for t in tables:
                cols = {col.name: col for col in t.columns}
                ref_table = ref_tables.get(t.name)
                rows, keys = [], []
                for row in dataset[t.name]:
                    keys, values = ['dataset_ID'], [pk]
                    for k, v in row.items():
                        if ref_table and k == ref_table.consumes:
                            refs[ref_table.name].append(
                                (row[t.primary_key], v))
                        else:
                            col = cols[k]
                            if isinstance(v, list):
                                v = (col.separator
                                     or ';').join(col.convert(vv) for vv in v)
                            else:
                                v = col.convert(v)
                            keys.append(k)
                            values.append(v)
                    rows.append(tuple(values))
                insert(db, t.name, keys, *rows)

            # Now insert the references, i.e. the associations with sources:
            for tname, items in refs.items():
                rows = []
                for oid, sources in items:
                    for source in sources:
                        sid, context = Sources.parse(source)
                        rows.append([pk, oid, sid, context])
                oid_col = '{0}_ID'.format(tname.replace('Source', ''))
                insert(db, tname, [
                    'dataset_ID', '{:}'.format(oid_col), 'Source_ID', 'Context'
                ], *rows)
            db.commit()
Example #26
    def create(self, req, filename=None, verbose=True):
        p = self.abspath(req)
        if not p.parent.exists():  # pragma: no cover
            p.parent.mkdir()
        tmp = Path('%s.tmp' % p)

        language_url_pattern = self.route_url_pattern(req, 'language')

        with ZipFile(tmp.as_posix(), 'w', ZIP_DEFLATED) as zipfile:
            tables = []
            for param in DBSession.query(Parameter).options(joinedload(Parameter.domain)):
                fname = '%s-%s.csv' % (req.dataset.id, param.id)
                zipfile.writestr(fname, self.get_values(param, language_url_pattern))
                tables.append({
                    '@type': 'Table',
                    'url': fname,
                    'notes': [
                        {
                            '@id': req.resource_url(param),
                            'dc:identifier': param.id,
                            'dc:title': param.name,
                            'dc:description': param.description or ''}] + [
                        {
                            '@type': 'DomainElement',
                            'name': de.name,
                            'description': de.description,
                            'numeric': de.number
                        } for de in param.domain
                    ],
                })

            md = CsvmJsonAdapter.csvm_basic_doc(req, tables=tables)
            md.update({
                '@type': 'TableGroup',
                'dc:language': list(self.get_languages(req, language_url_pattern)),
                'tableSchema': {
                    "columns": [
                        {
                            "name": "ID",
                            "datatype": "string",
                            "required": True
                        },
                        {
                            "name": "Language_ID",
                            "datatype": "string",
                            "required": True
                        },
                        {
                            "name": "Parameter_ID",
                            "datatype": "string",
                            "required": True
                        },
                        {
                            "name": "Contribution_ID",
                            "datatype": "string",
                            "required": True
                        },
                        {
                            "name": "Value",
                            "datatype": "string",
                            "required": True
                        },
                        {
                            "name": "Source",
                            "datatype": "string",
                        },
                        {
                            "name": "Comment",
                            "datatype": "string",
                        },
                    ],
                    "primaryKey": "ID",
                    'aboutUrl': self.route_url_pattern(req, 'value', '{ID}'),
                },
            })
            zipfile.writestr(
                '%s.csv-metadata.json' % req.dataset.id, json.dumps(md, indent=4))
            bib = Database([
                rec.bibtex() for rec in DBSession.query(Source).order_by(Source.name)])
            zipfile.writestr('%s.bib' % req.dataset.id, ('%s' % bib).encode('utf8'))
            zipfile.writestr(
                'README.txt',
                README.format(
                    req.dataset.name,
                    '=' * (
                        len(req.dataset.name)
                        + len(' data download')),
                    req.dataset.license,
                    TxtCitation(None).render(req.dataset, req)).encode('utf8'))
        if p.exists():  # pragma: no cover
            remove(p)
        move(tmp, p)