Example #1
    def from_bibfiles(cls, bibfiles, filepath, rebuild=False, page_size=32768):
        """If needed, (re)build the db from the bibfiles, hash, split/merge."""
        self = cls(filepath)

        if self.filepath.exists():
            if not rebuild and self.is_uptodate(bibfiles):
                return self
            path.remove(self.filepath)

        with self.engine.connect() as conn:
            if page_size is not None:
                conn.execute('PRAGMA page_size = %d' % page_size)
            Model.metadata.create_all(conn)

        with self.engine.connect() as conn:
            conn.execute('PRAGMA synchronous = OFF')
            conn.execute('PRAGMA journal_mode = MEMORY')
            conn = conn.execution_options(compiled_cache={})

            with conn.begin():
                import_bibfiles(conn, bibfiles)

            Entry.stats(conn)
            Value.fieldstats(conn)

            with conn.begin():
                generate_hashes(conn)

            Entry.hashstats(conn)
            Entry.hashidstats(conn)

            with conn.begin():
                assign_ids(conn)

        return self
Example #2
 def temp_download(self, url, fname, log=None):
     p = None
     try:
         p = self.download(url, fname, log=log)
         yield p
     finally:
         if p and p.exists():
             remove(p)
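
Example #2 is a generator with cleanup in a finally block, which is the shape of a function meant to be wrapped with contextlib.contextmanager (the decorator itself is not visible in the snippet). A minimal, self-contained sketch of the same yield-then-remove pattern, assuming nothing from the original class:

from contextlib import contextmanager
from pathlib import Path

@contextmanager
def temp_file(path):
    # Hand out a path for the duration of the with-block and make sure
    # the file is gone again afterwards, even if the block raises.
    p = Path(path)
    try:
        yield p
    finally:
        if p.exists():
            p.unlink()  # stand-in for clldutils.path.remove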
Example #3
    def test_remove(self):
        from clldutils.path import remove

        self.assertRaises(OSError, remove, self.tmp_path('nonexistingpath'))
        tmp = self.make_file('test')
        self.assertTrue(tmp.exists())
        remove(tmp)
        self.assertFalse(tmp.exists())
Example #4
 def get_data(self, lid):  # pragma: no cover
     fname = self.raw.download(URL % (self.SECTION, lid),
                               '%s.xml' % lid,
                               log=self.log)
     if fname.stat().st_size == 0:
         remove(fname)
         return False
     return True
Example #5
 def _clean(self, **kw):
     self.log.debug('removing CLDF directory %s' % self.cldf_dir)
     if self.cldf_dir.exists():
         for f in self.cldf_dir.iterdir():
             if f.is_file():
                 remove(f)
             else:
                 rmtree(f)
Example #6
def test_remove(tmppath):
    from clldutils.path import remove

    with pytest.raises(OSError):
        remove(tmppath / 'nonexistingpath')
    tmp = make_file(tmppath, name='test')
    assert tmp.exists()
    remove(tmp)
    assert not tmp.exists()
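
The two test variants above (Examples #3 and #6) pin down the contract of clldutils.path.remove: it deletes an existing file and raises OSError for a missing path. A minimal sketch of a function meeting that contract, offered as an illustration rather than the actual clldutils implementation:

from pathlib import Path

def remove(path):
    # Path.unlink() raises FileNotFoundError, a subclass of OSError,
    # for a missing path, so both assertions in the tests would hold.
    Path(path).unlink()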
Example #7
File: util.py Project: clld/clld
def safe_overwrite(fname):
    fname = Path(fname)
    if not fname.parent.exists():
        fname.parent.mkdir()
    assert fname.parent.exists()
    tmp = fname.parent
    while tmp.exists():
        tmp = fname.parent.joinpath('%s.%s' % (fname.name, random_string(6)))
    yield tmp
    if fname.exists():
        remove(fname)
    move(tmp, fname)
Example #8
def safe_overwrite(fname):
    fname = Path(fname)
    if not fname.parent.exists():
        fname.parent.mkdir()
    assert fname.parent.exists()
    tmp = fname.parent
    while tmp.exists():
        tmp = fname.parent.joinpath('%s.%s' % (fname.name, random_string(6)))
    yield tmp
    if fname.exists():
        remove(fname)
    move(tmp, fname)
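
safe_overwrite (Examples #7 and #8) yields a unique temporary path next to the target and only moves it over the target after the with-block has finished writing, so readers never see a half-written file. Assuming the function is decorated with contextlib.contextmanager in util.py (the decorator and the import path below are assumptions, not shown in the snippets), usage would look roughly like this:

from clld.util import safe_overwrite  # hypothetical import path

with safe_overwrite('downloads/data.json') as tmp:
    tmp.write_text('{"ok": true}', encoding='utf8')
# on exit, the temporary file replaces downloads/data.json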
Example #9
 def test_generate_extract(self):
     xml = self.tmp_path('test.xml')
     self._run_main('-v -o {0} {1}'.format(xml.as_posix(), config_path('basic')))
     self.assertTrue(xml.exists())
     # Overwriting existing files must be specified explicitly:
     self._run_main('-o {0} {1}'.format(
         xml.as_posix(), config_path('basic')), status=4)
     self._run_main('--overwrite -o {0} {1}'.format(
         xml.as_posix(), config_path('basic')), status=0)
     tcfg = Path('beastling_test.conf')
     self._run_main('--extract {0}'.format(xml.as_posix()))
     self.assertTrue(tcfg.exists())
     remove(tcfg)
Example #10
def dump(version, all_langs, identifiers):
    out = args.data_file('files', 'glottolog-{0}'.format(version))
    if out.exists():
        for p in out.iterdir():
            remove(p)
    else:
        out.mkdir()

    langs = all_langs[version].values()
    langs_by_pk = {l.pk: l for l in langs}

    children = {
        pk: list(c)
        for pk, c in groupby(sorted(langs, key=lambda l: l.fpk),
                             lambda l: l.fpk)
    }

    for lang in langs:
        ancestors, fpk = [], lang.fpk
        while fpk and fpk in langs_by_pk:
            ancestors.append(langs_by_pk[fpk])
            fpk = langs_by_pk[fpk].fpk

        versions = [
            '<strong><a href="http://glottolog.org/resource/languoid/id/{0}">[{0}] in current Glottolog</a></strong>'
            .format(lang.id)
        ]
        for v in sorted(all_langs.keys()):
            if v != version:
                if lang.id in all_langs[v]:
                    versions.append(all_langs[v][lang.id].cross_version_link)
        clf = [link_list(children.get(lang.pk, []))]
        clf.append(lang.text)
        clf.extend(a.link for a in ancestors)
        write_text(
            out.joinpath('{0}.html'.format(lang.id)),
            T.render_unicode(
                version=version,
                lang=lang,
                clf=reduce(wrap, clf) if not lang.replacements else '',
                versions=versions,
                identifiers=identifiers.get(lang.pk, []),
                replacements=[
                    all_langs[version][lid].link for lid in lang.replacements
                    if lid in all_langs[version]
                ],
                wrap=wrap,
                link_list=link_list,
            ))
Example #11
def recode(args):
    """Assign a new glottocode to an existing languoid.

    glottolog recode <code>
    """
    lang = args.repos.languoid(args.args[0])
    if not lang:
        raise ParserError('languoid not found')
    lang.id = Glottocode.from_name(lang.name)
    new_dir = lang.dir.parent.joinpath(lang.id)
    copytree(lang.dir, new_dir)
    lang.write_info(new_dir)
    remove(new_dir.joinpath('%s.ini' % args.args[0]))
    rmtree(lang.dir)
    print("%s -> %s" % (args.args[0], lang.id))
Example #12
def recode(args):
    """Assign a new glottocode to an existing languoid.

    glottolog recode <code>
    """
    lang = find_languoid(glottocode=args.args[0])
    if not lang:
        raise ParserError('languoid not found')
    lang.id = Glottocode.from_name(lang.name)
    new_dir = lang.dir.parent.joinpath(lang.id)
    copytree(lang.dir, new_dir)
    lang.write_info(new_dir)
    remove(new_dir.joinpath('%s.ini' % args.args[0]))
    rmtree(lang.dir)
    print("%s -> %s" % (args.args[0], lang.id))
Example #13
    def cmd_download(self, **kw):  # pragma: no cover
        assert self.SECTION in ['austronesian', 'mayan', 'utoaztecan']
        self.log.info('ABVD section set to %s' % self.SECTION)
        for fname in self.raw.iterdir():
            remove(fname)
        language_ids = [
            i for i in range(1, 2000)
            if i not in INVALID_LANGUAGE_IDS.get(self.SECTION, [])
        ]

        for lid in language_ids:
            if not self.get_data(lid):
                self.log.warn("No content for %s %d. Ending." %
                              (self.SECTION, lid))
                break
Example #14
 def test_generate_extract(self):
     xml = self.tmp_path('test.xml')
     self._run_main('-v -o {0} {1}'.format(xml.as_posix(),
                                           config_path('basic')))
     self.assertTrue(xml.exists())
     # Overwriting existing files must be specified explicitly:
     self._run_main('-o {0} {1}'.format(xml.as_posix(),
                                        config_path('basic')),
                    status=4)
     self._run_main('--overwrite -o {0} {1}'.format(xml.as_posix(),
                                                    config_path('basic')),
                    status=0)
     tcfg = Path('beastling_test.conf')
     self._run_main('--extract {0}'.format(xml.as_posix()))
     self.assertTrue(tcfg.exists())
     remove(tcfg)
Example #15
    def test_extractor(self):
        config = self.make_cfg(
            [config_path(f).as_posix() for f in ("admin", "mk", "embed_data")])
        xml = beastling.beastxml.BeastXml(config)
        xmlfile = self.tmp.joinpath("beastling.xml")
        xml.write_file(xmlfile.as_posix())
        self.assertTrue(bool(self._extract(xmlfile)))

        config = self.make_cfg({
            'admin': {
                'basename': 'abcdefg'
            },
            'model': {
                'model': 'mk',
                'data': data_path('basic.csv').as_posix()
            }
        })
        xml = beastling.beastxml.BeastXml(config)
        xmlfile = self.tmp.joinpath("beastling.xml")
        xml.write_file(xmlfile.as_posix())
        beastling.extractor.extract(xmlfile)
        p = Path('abcdefg.conf')
        self.assertTrue(p.exists())
        cfg = INI(interpolation=None)
        cfg.read(p.as_posix())
        remove(p)
        self.assertEqual(cfg['admin']['basename'], 'abcdefg')
        self.assertEqual(cfg['model']['model'], 'mk')

        fname = self.tmp.joinpath('test.xml')
        datafile = self.tmp.joinpath(('test.csv'))
        self.assertFalse(datafile.exists())
        with fname.open('w', encoding='utf8') as fp:
            fp.write("""<?xml version="1.0" encoding="UTF-8"?>
<r>
  <!--%s
%s
[admin]
[model]
-->
  <!--%s:%s-->
</r>
""" % (beastling.extractor._generated_str,
            beastling.extractor._config_file_str,
            beastling.extractor._data_file_str, datafile.as_posix()))
        res = self._extract(fname)
        self.assertIn(datafile.name, ''.join(res))
Example #16
    def test_Matrix(self):
        from wals3.adapters import Matrix

        p = Path(mktemp())
        assert not p.exists()

        class TestMatrix(Matrix):
            def abspath(self, req):
                return p

            def query(self, req):
                return Matrix.query(self, req).filter(Language.pk < 100)

        m = TestMatrix(Language, "wals3", description="Feature values CSV")
        m.create(self.env["request"], verbose=False)
        assert p.exists()
        remove(p)
Example #17
    def test_Matrix(self):
        from wals3.adapters import Matrix

        p = Path(mktemp())
        assert not p.exists()

        class TestMatrix(Matrix):
            def abspath(self, req):
                return p

            def query(self, req):
                return Matrix.query(self, req).filter(Language.pk < 100)

        m = TestMatrix(Language, 'wals3', description="Feature values CSV")
        m.create(self.env['request'], verbose=False)
        assert p.exists()
        remove(p)
Example #18
    def create(self, req, filename=None, verbose=True):
        p = self.abspath(req)
        if not p.parent.exists():  # pragma: no cover
            p.parent.mkdir()
        tmp = Path('%s.tmp' % p.as_posix())

        if self.rdf:
            # we do not create archives with a readme for rdf downloads, because each
            # RDF entity points to the dataset and the void description of the dataset
            # covers all relevant metadata.
            #
            # TODO: write test for the file name things!?
            #
            with closing(
                    GzipFile(filename=Path(tmp.stem).stem,
                             fileobj=tmp.open('wb'))) as fp:
                self.before(req, fp)
                for i, item in enumerate(
                        page_query(self.query(req), verbose=verbose)):
                    self.dump(req, fp, item, i)
                self.after(req, fp)
        else:
            with ZipFile(tmp.as_posix(), 'w', ZIP_DEFLATED) as zipfile:
                if not filename:
                    fp = self.get_stream()
                    self.before(req, fp)
                    for i, item in enumerate(
                            page_query(self.query(req), verbose=verbose)):
                        self.dump(req, fp, item, i)
                    self.after(req, fp)
                    zipfile.writestr(self.name, self.read_stream(fp))
                else:  # pragma: no cover
                    zipfile.write(filename, self.name)
                zipfile.writestr(
                    'README.txt',
                    README.format(
                        req.dataset.name,
                        '=' * (len(req.dataset.name) + len(' data download')),
                        req.dataset.license,
                        TxtCitation(None).render(req.dataset,
                                                 req)).encode('utf8'))
        if p.exists():  # pragma: no cover
            remove(p)
        move(tmp, p)
Example #19
    def test_extractor(self):
        config = self.make_cfg(
            [config_path(f).as_posix() for f in ("admin", "mk", "embed_data")])
        xml = beastling.beastxml.BeastXml(config)
        xmlfile = self.tmp.joinpath("beastling.xml")
        xml.write_file(xmlfile.as_posix())
        self.assertTrue(bool(self._extract(xmlfile)))

        config = self.make_cfg({
            'admin': {'basename': 'abcdefg'},
            'model': {
                'model': 'mk',
                'data': data_path('basic.csv').as_posix()}})
        xml = beastling.beastxml.BeastXml(config)
        xmlfile = self.tmp.joinpath("beastling.xml")
        xml.write_file(xmlfile.as_posix())
        beastling.extractor.extract(xmlfile)
        p = Path('abcdefg.conf')
        self.assertTrue(p.exists())
        cfg = INI(interpolation=None)
        cfg.read(p.as_posix())
        remove(p)
        self.assertEqual(cfg['admin']['basename'], 'abcdefg')
        self.assertEqual(cfg['model']['model'], 'mk')

        fname = self.tmp.joinpath('test.xml')
        datafile = self.tmp.joinpath(('test.csv'))
        self.assertFalse(datafile.exists())
        with fname.open('w', encoding='utf8') as fp:
            fp.write("""<?xml version="1.0" encoding="UTF-8"?>
<r>
  <!--%s
%s
[admin]
[model]
-->
  <!--%s:%s-->
</r>
""" % (beastling.extractor._generated_str,
       beastling.extractor._config_file_str,
       beastling.extractor._data_file_str,
       datafile.as_posix()))
        res = self._extract(fname)
        self.assertIn(datafile.name, ''.join(res))
Example #20
    def create(self, req, filename=None, verbose=True):
        p = self.abspath(req)
        if not p.parent.exists():  # pragma: no cover
            p.parent.mkdir()
        tmp = Path('%s.tmp' % p.as_posix())

        if self.rdf:
            # we do not create archives with a readme for rdf downloads, because each
            # RDF entity points to the dataset and the void description of the dataset
            # covers all relevant metadata.
            #
            # TODO: write test for the file name things!?
            #
            with closing(GzipFile(
                    filename=Path(tmp.stem).stem, fileobj=tmp.open('wb')
            )) as fp:
                self.before(req, fp)
                for i, item in enumerate(page_query(self.query(req), verbose=verbose)):
                    self.dump(req, fp, item, i)
                self.after(req, fp)
        else:
            with ZipFile(tmp.as_posix(), 'w', ZIP_DEFLATED) as zipfile:
                if not filename:
                    fp = self.get_stream()
                    self.before(req, fp)
                    for i, item in enumerate(
                            page_query(self.query(req), verbose=verbose)):
                        self.dump(req, fp, item, i)
                    self.after(req, fp)
                    zipfile.writestr(self.name, self.read_stream(fp))
                else:  # pragma: no cover
                    zipfile.write(filename, self.name)
                zipfile.writestr(
                    'README.txt',
                    README.format(
                        req.dataset.name,
                        '=' * (
                            len(req.dataset.name)
                            + len(' data download')),
                        req.dataset.license,
                        TxtCitation(None).render(req.dataset, req)).encode('utf8'))
        if p.exists():  # pragma: no cover
            remove(p)
        move(tmp, p)
Example #21
def dump(out, version, all_langs, identifiers):
    if out.exists():
        for p in out.iterdir():
            remove(p)
    else:
        out.mkdir()

    langs = all_langs[version].values()
    langs_by_pk = {l.pk: l for l in langs}

    children = {
        pk: list(c)
        for pk, c in groupby(sorted(langs, key=lambda l: l.fpk or 0), lambda l: l.fpk)}

    for lang in langs:
        ancestors, fpk = [], lang.fpk
        while fpk and fpk in langs_by_pk:
            ancestors.append(langs_by_pk[fpk])
            fpk = langs_by_pk[fpk].fpk

        versions = [
            '<strong><a href="http://glottolog.org/resource/languoid/id/{0}">[{0}] in current Glottolog</a></strong>'.format(lang.id)]
        for v in sorted(all_langs.keys()):
            if v != version:
                if lang.id in all_langs[v]:
                    versions.append(all_langs[v][lang.id].cross_version_link)
        clf = [link_list(children.get(lang.pk, []))]
        clf.append(lang.text)
        clf.extend(a.link for a in ancestors)
        write_text(
            out.joinpath('{0}.html'.format(lang.id)),
            T.render_unicode(
                version=version,
                lang=lang,
                clf=reduce(wrap, clf) if not lang.replacements else '',
                versions=versions,
                identifiers=identifiers.get(lang.pk, []),
                replacements=[all_langs[version][lid].link for lid in lang.replacements
                              if lid in all_langs[version]],
                wrap=wrap,
                link_list=link_list,
            )
        )
Example #22
def test_extractor(config_factory, tmppath, data_dir):
    config = config_factory("admin", "mk", "embed_data")
    xml = beastling.beastxml.BeastXml(config)
    xmlfile = str(tmppath / "beastling.xml")
    xml.write_file(xmlfile)
    assert bool(_extract(xmlfile))

    config = config_factory({
            'admin': {'basename': 'abcdefg'},
            'model model': {
                'model': 'mk',
                'data': str(data_dir / 'basic.csv')}})
    xml = beastling.beastxml.BeastXml(config)
    xmlfile = str(tmppath / "beastling.xml")
    xml.write_file(xmlfile)
    beastling.extractor.extract(xmlfile)
    p = Path('abcdefg.conf')
    assert p.exists()
    cfg = INI(interpolation=None)
    cfg.read(p.as_posix())
    remove(p)
    assert cfg['admin']['basename'] == 'abcdefg'
    assert cfg['model model']['model'] == 'mk'

    fname = tmppath / 'test.xml'
    datafile = tmppath / 'test.csv'
    assert not datafile.exists()
    with fname.open('w', encoding='utf8') as fp:
        fp.write("""<?xml version="1.0" encoding="UTF-8"?>
<r>
  <!--%s
%s
[admin]
[model model]
-->
  <!--%s:%s-->
</r>
""" % (beastling.extractor._generated_str,
       beastling.extractor._config_file_str,
       beastling.extractor._data_file_str,
       datafile.as_posix()))
    res = _extract(fname)
    assert datafile.name in ''.join(res)
Example #23
def test_Dataset_validate(tmpdir, mocker):
    ds = StructureDataset.in_dir(str(tmpdir / 'new'))
    ds.write(ValueTable=[])
    values = tmpdir / 'new' / 'values.csv'
    assert values.check()
    remove(str(values))
    log = mocker.Mock()
    assert not ds.validate(log=log)
    assert log.warn.called

    ds.write(ValueTable=[])
    assert ds.validate()

    ds['ValueTable'].tableSchema.columns = []
    with pytest.raises(ValueError):
        ds.validate()
    assert not ds.validate(log=mocker.Mock())
    ds.tablegroup.tables = []
    with pytest.raises(ValueError):
        ds.validate()

    ds = StructureDataset.in_dir(str(tmpdir / 'new'))
    ds.add_component('LanguageTable')
    ds.write(ValueTable=[], LanguageTable=[])
    assert ds.validate()

    # test violation of referential integrity:
    ds.write(ValueTable=[{'ID': '1', 'Value': '1', 'Language_ID': 'lid', 'Parameter_ID': 'pid'}], LanguageTable=[])
    assert not ds.validate(log=mocker.Mock())

    # test an invalid CLDF URL:
    ds['LanguageTable'].common_props['dc:conformsTo'] = 'http://cldf.clld.org/404'
    with pytest.raises(ValueError):
        ds.validate()

    ds = StructureDataset.in_dir(str(tmpdir / 'new'))
    ds['ValueTable'].get_column('Source').propertyUrl = URITemplate(
        'http://cldf.clld.org/404')
    ds.write(ValueTable=[])
    with pytest.raises(ValueError):
        ds.validate()
Example #24
    def from_bibfiles(cls, bibfiles, filename, rebuild=False):
        """If needed, (re)build the db from the bibfiles, hash, split/merge."""
        if filename.exists():
            if not rebuild:
                self = cls(filename, bibfiles)
                if self.is_uptodate():
                    return self
            remove(filename)

        self = cls(filename, bibfiles)
        with self.connect(async=True) as conn:
            create_tables(conn)
            with conn:
                import_bibfiles(conn, bibfiles)
            entrystats(conn)
            fieldstats(conn)

            with conn:
                generate_hashes(conn)
            hashstats(conn)
            hashidstats(conn)

            with conn:
                assign_ids(conn)
Example #25
 def cmd_download(self, **kw):
     zname = '{0}.zip'.format(self.id)
     self.raw.download(self.__cldf_url__, zname, log=self.log)
     archive = ZipFile(str(self.raw / zname))
     archive.extractall(str(self.raw))
     remove(self.raw / zname)
Example #26
 def __exit__(self, exc_type, exc_val, exc_tb):
     if self.name.exists():
         remove(self.name)
Example #27
File: util.py Project: clld/clld
def gbs_func(command, args, sources=None):  # pragma: no cover
    def words(s):
        return set(slug(s.strip(), remove_whitespace=False).split())

    log = args.log
    count = 0
    api_url = "https://www.googleapis.com/books/v1/volumes?"

    if command == 'cleanup':
        for fname in args.data_file('gbs').glob('*.json'):
            try:
                data = jsonlib.load(fname)
                if data.get('totalItems') == 0:
                    remove(fname)
            except ValueError:
                remove(fname)
        return

    if not sources:
        sources = DBSession.query(common.Source)\
            .order_by(common.Source.id)\
            .options(joinedload(common.Source.data))
    if callable(sources):
        sources = sources()

    for i, source in enumerate(page_query(sources, verbose=True, commit=True)):
        filepath = args.data_file('gbs', 'source%s.json' % source.id)

        if command == 'update':
            source.google_book_search_id = None
            source.update_jsondata(gbs={})

        if command in ['verify', 'update']:
            if filepath.exists():
                try:
                    data = jsonlib.load(filepath)
                except ValueError:
                    log.warn('no JSON object found in: %s' % filepath)
                    continue
                if not data['totalItems']:
                    continue
                item = data['items'][0]
            else:
                continue

        if command == 'verify':
            stitle = source.description or source.title or source.booktitle
            needs_check = False
            year = item['volumeInfo'].get('publishedDate', '').split('-')[0]
            if not year or year != slug(source.year or ''):
                needs_check = True
            twords = words(stitle)
            iwords = words(
                item['volumeInfo']['title'] + ' '
                + item['volumeInfo'].get('subtitle', ''))
            if twords == iwords \
                    or (len(iwords) > 2 and iwords.issubset(twords))\
                    or (len(twords) > 2 and twords.issubset(iwords)):
                needs_check = False
            if int(source.id) == 241:
                log.info('%s' % sorted(words(stitle)))
                log.info('%s' % sorted(iwords))
            if needs_check:
                log.info('------- %s -> %s' % (
                    source.id, item['volumeInfo'].get('industryIdentifiers')))
                log.info('%s %s' % (
                    item['volumeInfo']['title'], item['volumeInfo'].get('subtitle', '')))
                log.info(stitle)
                log.info(item['volumeInfo'].get('publishedDate'))
                log.info(source.year)
                log.info(item['volumeInfo'].get('authors'))
                log.info(source.author)
                log.info(item['volumeInfo'].get('publisher'))
                log.info(source.publisher)
                if not confirm('Are the records the same?'):
                    log.warn('---- removing ----')
                    jsonlib.dump({"totalItems": 0}, filepath)
        elif command == 'update':
            source.google_book_search_id = item['id']
            source.update_jsondata(gbs=item)
            count += 1
        elif command == 'download':
            if source.author and (source.title or source.booktitle):
                title = source.title or source.booktitle
                if filepath.exists():
                    continue
                q = [
                    'inauthor:' + quote_plus(source.author.encode('utf8')),
                    'intitle:' + quote_plus(title.encode('utf8')),
                ]
                if source.publisher:
                    q.append('inpublisher:' + quote_plus(
                        source.publisher.encode('utf8')))
                url = api_url + 'q=%s&key=%s' % ('+'.join(q), args.api_key)
                count += 1
                r = requests.get(url, headers={'accept': 'application/json'})
                log.info('%s - %s' % (r.status_code, url))
                if r.status_code == 200:
                    with open(as_posix(filepath), 'w') as fp:
                        fp.write(r.text.encode('utf8'))
                elif r.status_code == 403:
                    log.warn("limit reached")
                    break
    if command == 'update':
        log.info('assigned gbs ids for %s out of %s sources' % (count, i))
    elif command == 'download':
        log.info('queried gbs for %s sources' % count)
Example #28
 def drop(self):
     if self.fname.exists():
         remove(self.fname)
Example #29
def gbs_func(command, args, sources=None):  # pragma: no cover
    def words(s):
        return set(slug(s.strip(), remove_whitespace=False).split())

    log = args.log
    count = 0
    api_url = "https://www.googleapis.com/books/v1/volumes?"

    if command == 'cleanup':
        for fname in args.data_file('gbs').glob('*.json'):
            try:
                data = jsonlib.load(fname)
                if data.get('totalItems') == 0:
                    remove(fname)
            except ValueError:
                remove(fname)
        return

    if not sources:
        sources = DBSession.query(common.Source)\
            .order_by(common.Source.id)\
            .options(joinedload(common.Source.data))
    if callable(sources):
        sources = sources()

    for i, source in enumerate(page_query(sources, verbose=True, commit=True)):
        filepath = args.data_file('gbs', 'source%s.json' % source.id)

        if command == 'update':
            source.google_book_search_id = None
            source.update_jsondata(gbs={})

        if command in ['verify', 'update']:
            if filepath.exists():
                try:
                    data = jsonlib.load(filepath)
                except ValueError:
                    log.warn('no JSON object found in: %s' % filepath)
                    continue
                if not data['totalItems']:
                    continue
                item = data['items'][0]
            else:
                continue

        if command == 'verify':
            stitle = source.description or source.title or source.booktitle
            needs_check = False
            year = item['volumeInfo'].get('publishedDate', '').split('-')[0]
            if not year or year != slug(source.year or ''):
                needs_check = True
            twords = words(stitle)
            iwords = words(item['volumeInfo']['title'] + ' ' +
                           item['volumeInfo'].get('subtitle', ''))
            if twords == iwords \
                    or (len(iwords) > 2 and iwords.issubset(twords))\
                    or (len(twords) > 2 and twords.issubset(iwords)):
                needs_check = False
            if int(source.id) == 241:
                log.info('%s' % sorted(words(stitle)))
                log.info('%s' % sorted(iwords))
            if needs_check:
                log.info(
                    '------- %s -> %s' %
                    (source.id, item['volumeInfo'].get('industryIdentifiers')))
                log.info('%s %s' % (item['volumeInfo']['title'],
                                    item['volumeInfo'].get('subtitle', '')))
                log.info(stitle)
                log.info(item['volumeInfo'].get('publishedDate'))
                log.info(source.year)
                log.info(item['volumeInfo'].get('authors'))
                log.info(source.author)
                log.info(item['volumeInfo'].get('publisher'))
                log.info(source.publisher)
                if not confirm('Are the records the same?'):
                    log.warn('---- removing ----')
                    jsonlib.dump({"totalItems": 0}, filepath)
        elif command == 'update':
            source.google_book_search_id = item['id']
            source.update_jsondata(gbs=item)
            count += 1
        elif command == 'download':
            if source.author and (source.title or source.booktitle):
                title = source.title or source.booktitle
                if filepath.exists():
                    continue
                q = [
                    'inauthor:' + quote_plus(source.author.encode('utf8')),
                    'intitle:' + quote_plus(title.encode('utf8')),
                ]
                if source.publisher:
                    q.append('inpublisher:' +
                             quote_plus(source.publisher.encode('utf8')))
                url = api_url + 'q=%s&key=%s' % ('+'.join(q), args.api_key)
                count += 1
                r = requests.get(url, headers={'accept': 'application/json'})
                log.info('%s - %s' % (r.status_code, url))
                if r.status_code == 200:
                    with open(as_posix(filepath), 'w') as fp:
                        fp.write(r.text.encode('utf8'))
                elif r.status_code == 403:
                    log.warn("limit reached")
                    break
    if command == 'update':
        log.info('assigned gbs ids for %s out of %s sources' % (count, i))
    elif command == 'download':
        log.info('queried gbs for %s sources' % count)
Example #30
 def remove(self, fname):
     remove(self.joinpath(fname))
Example #31
 def clear(self):
     for key in self.keys():
         remove(self._path(key))
Example #32
 def __exit__(self, exc_type, exc_val, exc_tb):
     if self.name.exists():
         remove(self.name)
Example #33
 def __delitem__(self, key):
     remove(self._path(key))
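
Examples #30, #31 and #33 are fragments of one recurring pattern: a dict-like store that keeps each value in its own file under a directory and maps keys to paths via a _path() helper. A self-contained sketch of how such fragments typically fit together (the class below and its layout are hypothetical, not the original code):

from pathlib import Path

class FileCache:
    def __init__(self, directory):
        self.dir = Path(directory)
        self.dir.mkdir(parents=True, exist_ok=True)

    def _path(self, key):
        return self.dir / key

    def keys(self):
        return [p.name for p in self.dir.iterdir() if p.is_file()]

    def __delitem__(self, key):  # cf. Example #33
        self._path(key).unlink()

    def clear(self):  # cf. Example #31
        for key in self.keys():
            self._path(key).unlink()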
Example #34
def gbs_func(command, args, sources=None):  # pragma: no cover
    def words(s):
        return set(slug(s.strip(), remove_whitespace=False).split())

    log = args.log
    count = 0
    api_url = "https://www.googleapis.com/books/v1/volumes?"

    if command == "cleanup":
        for fname in args.data_file("gbs").glob("*.json"):
            try:
                data = jsonlib.load(fname)
                if data.get("totalItems") == 0:
                    remove(fname)
            except ValueError:
                remove(fname)
        return

    if not sources:
        sources = DBSession.query(common.Source).order_by(common.Source.id).options(joinedload(common.Source.data))
    if callable(sources):
        sources = sources()

    for i, source in enumerate(page_query(sources, verbose=True, commit=True)):
        filepath = args.data_file("gbs", "source%s.json" % source.id)

        if command == "update":
            source.google_book_search_id = None
            source.update_jsondata(gbs={})

        if command in ["verify", "update"]:
            if filepath.exists():
                try:
                    data = jsonlib.load(filepath)
                except ValueError:
                    log.warn("no JSON object found in: %s" % filepath)
                    continue
                if not data["totalItems"]:
                    continue
                item = data["items"][0]
            else:
                continue

        if command == "verify":
            stitle = source.description or source.title or source.booktitle
            needs_check = False
            year = item["volumeInfo"].get("publishedDate", "").split("-")[0]
            if not year or year != slug(source.year or ""):
                needs_check = True
            twords = words(stitle)
            iwords = words(item["volumeInfo"]["title"] + " " + item["volumeInfo"].get("subtitle", ""))
            if (
                twords == iwords
                or (len(iwords) > 2 and iwords.issubset(twords))
                or (len(twords) > 2 and twords.issubset(iwords))
            ):
                needs_check = False
            if int(source.id) == 241:
                log.info("%s" % sorted(words(stitle)))
                log.info("%s" % sorted(iwords))
            if needs_check:
                log.info("------- %s -> %s" % (source.id, item["volumeInfo"].get("industryIdentifiers")))
                log.info("%s %s" % (item["volumeInfo"]["title"], item["volumeInfo"].get("subtitle", "")))
                log.info(stitle)
                log.info(item["volumeInfo"].get("publishedDate"))
                log.info(source.year)
                log.info(item["volumeInfo"].get("authors"))
                log.info(source.author)
                log.info(item["volumeInfo"].get("publisher"))
                log.info(source.publisher)
                if not confirm("Are the records the same?"):
                    log.warn("---- removing ----")
                    jsonlib.dump({"totalItems": 0}, filepath)
        elif command == "update":
            source.google_book_search_id = item["id"]
            source.update_jsondata(gbs=item)
            count += 1
        elif command == "download":
            if source.author and (source.title or source.booktitle):
                title = source.title or source.booktitle
                if filepath.exists():
                    continue
                q = [
                    "inauthor:" + quote_plus(source.author.encode("utf8")),
                    "intitle:" + quote_plus(title.encode("utf8")),
                ]
                if source.publisher:
                    q.append("inpublisher:" + quote_plus(source.publisher.encode("utf8")))
                url = api_url + "q=%s&key=%s" % ("+".join(q), args.api_key)
                count += 1
                r = requests.get(url, headers={"accept": "application/json"})
                log.info("%s - %s" % (r.status_code, url))
                if r.status_code == 200:
                    with open(as_posix(filepath), "w") as fp:
                        fp.write(r.text.encode("utf8"))
                elif r.status_code == 403:
                    log.warn("limit reached")
                    break
    if command == "update":
        log.info("assigned gbs ids for %s out of %s sources" % (count, i))
    elif command == "download":
        log.info("queried gbs for %s sources" % count)
Example #35
    def create(self, req, filename=None, verbose=True):
        p = self.abspath(req)
        if not p.parent.exists():  # pragma: no cover
            p.parent.mkdir()
        tmp = Path('%s.tmp' % p)

        language_url_pattern = self.route_url_pattern(req, 'language')

        with ZipFile(tmp.as_posix(), 'w', ZIP_DEFLATED) as zipfile:
            tables = []
            for param in DBSession.query(Parameter).options(joinedload(Parameter.domain)):
                fname = '%s-%s.csv' % (req.dataset.id, param.id)
                zipfile.writestr(fname, self.get_values(param, language_url_pattern))
                tables.append({
                    '@type': 'Table',
                    'url': fname,
                    'notes': [
                        {
                            '@id': req.resource_url(param),
                            'dc:identifier': param.id,
                            'dc:title': param.name,
                            'dc:description': param.description or ''}] + [
                        {
                            '@type': 'DomainElement',
                            'name': de.name,
                            'description': de.description,
                            'numeric': de.number
                        } for de in param.domain
                    ],
                })

            md = CsvmJsonAdapter.csvm_basic_doc(req, tables=tables)
            md.update({
                '@type': 'TableGroup',
                'dc:language': list(self.get_languages(req, language_url_pattern)),
                'tableSchema': {
                    "columns": [
                        {
                            "name": "ID",
                            "datatype": "string",
                            "required": True
                        },
                        {
                            "name": "Language_ID",
                            "datatype": "string",
                            "required": True
                        },
                        {
                            "name": "Parameter_ID",
                            "datatype": "string",
                            "required": True
                        },
                        {
                            "name": "Contribution_ID",
                            "datatype": "string",
                            "required": True
                        },
                        {
                            "name": "Value",
                            "datatype": "string",
                            "required": True
                        },
                        {
                            "name": "Source",
                            "datatype": "string",
                        },
                        {
                            "name": "Comment",
                            "datatype": "string",
                        },
                    ],
                    "primaryKey": "ID",
                    'aboutUrl': self.route_url_pattern(req, 'value', '{ID}'),
                },
            })
            zipfile.writestr(
                '%s.csv-metadata.json' % req.dataset.id, json.dumps(md, indent=4))
            bib = Database([
                rec.bibtex() for rec in DBSession.query(Source).order_by(Source.name)])
            zipfile.writestr('%s.bib' % req.dataset.id, ('%s' % bib).encode('utf8'))
            zipfile.writestr(
                'README.txt',
                README.format(
                    req.dataset.name,
                    '=' * (
                        len(req.dataset.name)
                        + len(' data download')),
                    req.dataset.license,
                    TxtCitation(None).render(req.dataset, req)).encode('utf8'))
        if p.exists():  # pragma: no cover
            remove(p)
        move(tmp, p)