def from_bibfiles(cls, bibfiles, filepath, rebuild=False, page_size=32768):
    """If needed, (re)build the db from the bibfiles, hash, split/merge."""
    self = cls(filepath)
    if self.filepath.exists():
        if not rebuild and self.is_uptodate(bibfiles):
            return self
        path.remove(self.filepath)

    with self.engine.connect() as conn:
        if page_size is not None:
            conn.execute('PRAGMA page_size = %d' % page_size)
        Model.metadata.create_all(conn)

    with self.engine.connect() as conn:
        conn.execute('PRAGMA synchronous = OFF')
        conn.execute('PRAGMA journal_mode = MEMORY')
        conn = conn.execution_options(compiled_cache={})

        with conn.begin():
            import_bibfiles(conn, bibfiles)
        Entry.stats(conn)
        Value.fieldstats(conn)

        with conn.begin():
            generate_hashes(conn)
        Entry.hashstats(conn)
        Entry.hashidstats(conn)

        with conn.begin():
            assign_ids(conn)

    return self
def temp_download(self, url, fname, log=None):
    p = None
    try:
        p = self.download(url, fname, log=log)
        yield p
    finally:
        if p and p.exists():
            remove(p)
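# A minimal, hedged sketch of the download-then-clean-up pattern implemented by
# temp_download above, assuming (as the bare `yield` suggests) that the original
# module wraps it with contextlib.contextmanager.  The `fake_fetch` helper and
# the 'tmp.xml' file name are hypothetical stand-ins, not part of the source.
from contextlib import contextmanager

from clldutils.path import Path, remove


def fake_fetch(p):
    # stand-in for self.download(url, fname, log=log): just create the file
    p.write_text('<xml/>', encoding='utf8')
    return p


@contextmanager
def temp_file(fetch, fname):
    p = None
    try:
        p = fetch(Path(fname))
        yield p
    finally:
        # same cleanup contract as temp_download above
        if p and p.exists():
            remove(p)


with temp_file(fake_fetch, 'tmp.xml') as p:
    assert p.exists()
# the file has been removed once the block exits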
def test_remove(self):
    from clldutils.path import remove

    self.assertRaises(OSError, remove, self.tmp_path('nonexistingpath'))
    tmp = self.make_file('test')
    self.assertTrue(tmp.exists())
    remove(tmp)
    self.assertFalse(tmp.exists())
def get_data(self, lid):  # pragma: no cover
    fname = self.raw.download(
        URL % (self.SECTION, lid), '%s.xml' % lid, log=self.log)
    if fname.stat().st_size == 0:
        remove(fname)
        return False
    return True
def _clean(self, **kw):
    self.log.debug('removing CLDF directory %s' % self.cldf_dir)
    if self.cldf_dir.exists():
        for f in self.cldf_dir.iterdir():
            if f.is_file():
                remove(f)
            else:
                rmtree(f)
def test_remove(tmppath):
    from clldutils.path import remove

    with pytest.raises(OSError):
        remove(tmppath / 'nonexistingpath')
    tmp = make_file(tmppath, name='test')
    assert tmp.exists()
    remove(tmp)
    assert not tmp.exists()
def safe_overwrite(fname):
    fname = Path(fname)
    if not fname.parent.exists():
        fname.parent.mkdir()
    assert fname.parent.exists()
    tmp = fname.parent
    while tmp.exists():
        tmp = fname.parent.joinpath('%s.%s' % (fname.name, random_string(6)))
    yield tmp
    if fname.exists():
        remove(fname)
    move(tmp, fname)
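# A hedged usage sketch for safe_overwrite above.  Assumptions: the function is
# decorated with contextlib.contextmanager in its original module (implied by
# the bare `yield`), and Path/random_string/remove/move are imported there.
# 'summary.txt' is a made-up target file.
with safe_overwrite('summary.txt') as tmp:
    # write to the temporary sibling; the real file is only replaced afterwards
    tmp.write_text('n=42\n', encoding='utf8')
# any pre-existing 'summary.txt' was remove()d, then tmp was move()d into place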
def test_generate_extract(self):
    xml = self.tmp_path('test.xml')
    self._run_main('-v -o {0} {1}'.format(xml.as_posix(), config_path('basic')))
    self.assertTrue(xml.exists())
    # Overwriting existing files must be specified explicitly:
    self._run_main('-o {0} {1}'.format(
        xml.as_posix(), config_path('basic')), status=4)
    self._run_main('--overwrite -o {0} {1}'.format(
        xml.as_posix(), config_path('basic')), status=0)
    tcfg = Path('beastling_test.conf')
    self._run_main('--extract {0}'.format(xml.as_posix()))
    self.assertTrue(tcfg.exists())
    remove(tcfg)
def dump(version, all_langs, identifiers):
    out = args.data_file('files', 'glottolog-{0}'.format(version))
    if out.exists():
        for p in out.iterdir():
            remove(p)
    else:
        out.mkdir()
    langs = all_langs[version].values()
    langs_by_pk = {l.pk: l for l in langs}
    children = {
        pk: list(c)
        for pk, c in groupby(sorted(langs, key=lambda l: l.fpk), lambda l: l.fpk)
    }

    for lang in langs:
        ancestors, fpk = [], lang.fpk
        while fpk and fpk in langs_by_pk:
            ancestors.append(langs_by_pk[fpk])
            fpk = langs_by_pk[fpk].fpk

        versions = [
            '<strong><a href="http://glottolog.org/resource/languoid/id/{0}">[{0}] in current Glottolog</a></strong>'
            .format(lang.id)
        ]
        for v in sorted(all_langs.keys()):
            if v != version:
                if lang.id in all_langs[v]:
                    versions.append(all_langs[v][lang.id].cross_version_link)

        clf = [link_list(children.get(lang.pk, []))]
        clf.append(lang.text)
        clf.extend(a.link for a in ancestors)

        write_text(
            out.joinpath('{0}.html'.format(lang.id)),
            T.render_unicode(
                version=version,
                lang=lang,
                clf=reduce(wrap, clf) if not lang.replacements else '',
                versions=versions,
                identifiers=identifiers.get(lang.pk, []),
                replacements=[
                    all_langs[version][lid].link
                    for lid in lang.replacements if lid in all_langs[version]
                ],
                wrap=wrap,
                link_list=link_list,
            ))
def recode(args):
    """Assign a new glottocode to an existing languoid.

    glottolog recode <code>
    """
    lang = args.repos.languoid(args.args[0])
    if not lang:
        raise ParserError('languoid not found')
    lang.id = Glottocode.from_name(lang.name)
    new_dir = lang.dir.parent.joinpath(lang.id)
    copytree(lang.dir, new_dir)
    lang.write_info(new_dir)
    remove(new_dir.joinpath('%s.ini' % args.args[0]))
    rmtree(lang.dir)
    print("%s -> %s" % (args.args[0], lang.id))
def recode(args):
    """Assign a new glottocode to an existing languoid.

    glottolog recode <code>
    """
    lang = find_languoid(glottocode=args.args[0])
    if not lang:
        raise ParserError('languoid not found')
    lang.id = Glottocode.from_name(lang.name)
    new_dir = lang.dir.parent.joinpath(lang.id)
    copytree(lang.dir, new_dir)
    lang.write_info(new_dir)
    remove(new_dir.joinpath('%s.ini' % args.args[0]))
    rmtree(lang.dir)
    print("%s -> %s" % (args.args[0], lang.id))
def cmd_download(self, **kw):  # pragma: no cover
    assert self.SECTION in ['austronesian', 'mayan', 'utoaztecan']
    self.log.info('ABVD section set to %s' % self.SECTION)
    for fname in self.raw.iterdir():
        remove(fname)
    language_ids = [
        i for i in range(1, 2000)
        if i not in INVALID_LANGUAGE_IDS.get(self.SECTION, [])
    ]
    for lid in language_ids:
        if not self.get_data(lid):
            self.log.warn("No content for %s %d. Ending." % (self.SECTION, lid))
            break
def test_extractor(self):
    config = self.make_cfg(
        [config_path(f).as_posix() for f in ("admin", "mk", "embed_data")])
    xml = beastling.beastxml.BeastXml(config)
    xmlfile = self.tmp.joinpath("beastling.xml")
    xml.write_file(xmlfile.as_posix())
    self.assertTrue(bool(self._extract(xmlfile)))

    config = self.make_cfg({
        'admin': {'basename': 'abcdefg'},
        'model': {
            'model': 'mk',
            'data': data_path('basic.csv').as_posix()}})
    xml = beastling.beastxml.BeastXml(config)
    xmlfile = self.tmp.joinpath("beastling.xml")
    xml.write_file(xmlfile.as_posix())
    beastling.extractor.extract(xmlfile)
    p = Path('abcdefg.conf')
    self.assertTrue(p.exists())
    cfg = INI(interpolation=None)
    cfg.read(p.as_posix())
    remove(p)
    self.assertEqual(cfg['admin']['basename'], 'abcdefg')
    self.assertEqual(cfg['model']['model'], 'mk')

    fname = self.tmp.joinpath('test.xml')
    datafile = self.tmp.joinpath('test.csv')
    self.assertFalse(datafile.exists())
    with fname.open('w', encoding='utf8') as fp:
        fp.write("""<?xml version="1.0" encoding="UTF-8"?>
<r>
<!--%s
%s
[admin]
[model]
-->
<!--%s:%s-->
</r>
""" % (beastling.extractor._generated_str,
       beastling.extractor._config_file_str,
       beastling.extractor._data_file_str,
       datafile.as_posix()))
    res = self._extract(fname)
    self.assertIn(datafile.name, ''.join(res))
def test_Matrix(self):
    from wals3.adapters import Matrix

    p = Path(mktemp())
    assert not p.exists()

    class TestMatrix(Matrix):
        def abspath(self, req):
            return p

        def query(self, req):
            return Matrix.query(self, req).filter(Language.pk < 100)

    m = TestMatrix(Language, "wals3", description="Feature values CSV")
    m.create(self.env["request"], verbose=False)
    assert p.exists()
    remove(p)
def test_Matrix(self):
    from wals3.adapters import Matrix

    p = Path(mktemp())
    assert not p.exists()

    class TestMatrix(Matrix):
        def abspath(self, req):
            return p

        def query(self, req):
            return Matrix.query(self, req).filter(Language.pk < 100)

    m = TestMatrix(Language, 'wals3', description="Feature values CSV")
    m.create(self.env['request'], verbose=False)
    assert p.exists()
    remove(p)
def create(self, req, filename=None, verbose=True):
    p = self.abspath(req)
    if not p.parent.exists():  # pragma: no cover
        p.parent.mkdir()
    tmp = Path('%s.tmp' % p.as_posix())

    if self.rdf:
        # we do not create archives with a readme for rdf downloads, because each
        # RDF entity points to the dataset and the void description of the dataset
        # covers all relevant metadata.
        #
        # TODO: write test for the file name things!?
        #
        with closing(GzipFile(
                filename=Path(tmp.stem).stem, fileobj=tmp.open('wb'))) as fp:
            self.before(req, fp)
            for i, item in enumerate(page_query(self.query(req), verbose=verbose)):
                self.dump(req, fp, item, i)
            self.after(req, fp)
    else:
        with ZipFile(tmp.as_posix(), 'w', ZIP_DEFLATED) as zipfile:
            if not filename:
                fp = self.get_stream()
                self.before(req, fp)
                for i, item in enumerate(page_query(self.query(req), verbose=verbose)):
                    self.dump(req, fp, item, i)
                self.after(req, fp)
                zipfile.writestr(self.name, self.read_stream(fp))
            else:  # pragma: no cover
                zipfile.write(filename, self.name)
            zipfile.writestr(
                'README.txt',
                README.format(
                    req.dataset.name,
                    '=' * (len(req.dataset.name) + len(' data download')),
                    req.dataset.license,
                    TxtCitation(None).render(req.dataset, req)).encode('utf8'))
    if p.exists():  # pragma: no cover
        remove(p)
    move(tmp, p)
def dump(out, version, all_langs, identifiers):
    if out.exists():
        for p in out.iterdir():
            remove(p)
    else:
        out.mkdir()
    langs = all_langs[version].values()
    langs_by_pk = {l.pk: l for l in langs}
    children = {
        pk: list(c)
        for pk, c in groupby(sorted(langs, key=lambda l: l.fpk or 0), lambda l: l.fpk)}

    for lang in langs:
        ancestors, fpk = [], lang.fpk
        while fpk and fpk in langs_by_pk:
            ancestors.append(langs_by_pk[fpk])
            fpk = langs_by_pk[fpk].fpk

        versions = [
            '<strong><a href="http://glottolog.org/resource/languoid/id/{0}">[{0}] in current Glottolog</a></strong>'.format(lang.id)]
        for v in sorted(all_langs.keys()):
            if v != version:
                if lang.id in all_langs[v]:
                    versions.append(all_langs[v][lang.id].cross_version_link)

        clf = [link_list(children.get(lang.pk, []))]
        clf.append(lang.text)
        clf.extend(a.link for a in ancestors)

        write_text(
            out.joinpath('{0}.html'.format(lang.id)),
            T.render_unicode(
                version=version,
                lang=lang,
                clf=reduce(wrap, clf) if not lang.replacements else '',
                versions=versions,
                identifiers=identifiers.get(lang.pk, []),
                replacements=[all_langs[version][lid].link
                              for lid in lang.replacements if lid in all_langs[version]],
                wrap=wrap,
                link_list=link_list,
            )
        )
def test_extractor(config_factory, tmppath, data_dir):
    config = config_factory("admin", "mk", "embed_data")
    xml = beastling.beastxml.BeastXml(config)
    xmlfile = str(tmppath / "beastling.xml")
    xml.write_file(xmlfile)
    assert bool(_extract(xmlfile))

    config = config_factory({
        'admin': {'basename': 'abcdefg'},
        'model model': {
            'model': 'mk',
            'data': str(data_dir / 'basic.csv')}})
    xml = beastling.beastxml.BeastXml(config)
    xmlfile = str(tmppath / "beastling.xml")
    xml.write_file(xmlfile)
    beastling.extractor.extract(xmlfile)
    p = Path('abcdefg.conf')
    assert p.exists()
    cfg = INI(interpolation=None)
    cfg.read(p.as_posix())
    remove(p)
    assert cfg['admin']['basename'] == 'abcdefg'
    assert cfg['model model']['model'] == 'mk'

    fname = tmppath / 'test.xml'
    datafile = tmppath / 'test.csv'
    assert not datafile.exists()
    with fname.open('w', encoding='utf8') as fp:
        fp.write("""<?xml version="1.0" encoding="UTF-8"?>
<r>
<!--%s
%s
[admin]
[model model]
-->
<!--%s:%s-->
</r>
""" % (beastling.extractor._generated_str,
       beastling.extractor._config_file_str,
       beastling.extractor._data_file_str,
       datafile.as_posix()))
    res = _extract(fname)
    assert datafile.name in ''.join(res)
def test_Dataset_validate(tmpdir, mocker):
    ds = StructureDataset.in_dir(str(tmpdir / 'new'))
    ds.write(ValueTable=[])
    values = tmpdir / 'new' / 'values.csv'
    assert values.check()
    remove(str(values))
    log = mocker.Mock()
    assert not ds.validate(log=log)
    assert log.warn.called

    ds.write(ValueTable=[])
    assert ds.validate()

    ds['ValueTable'].tableSchema.columns = []
    with pytest.raises(ValueError):
        ds.validate()
    assert not ds.validate(log=mocker.Mock())
    ds.tablegroup.tables = []
    with pytest.raises(ValueError):
        ds.validate()

    ds = StructureDataset.in_dir(str(tmpdir / 'new'))
    ds.add_component('LanguageTable')
    ds.write(ValueTable=[], LanguageTable=[])
    assert ds.validate()

    # test violation of referential integrity:
    ds.write(
        ValueTable=[{'ID': '1', 'Value': '1', 'Language_ID': 'lid', 'Parameter_ID': 'pid'}],
        LanguageTable=[])
    assert not ds.validate(log=mocker.Mock())

    # test an invalid CLDF URL:
    ds['LanguageTable'].common_props['dc:conformsTo'] = 'http://cldf.clld.org/404'
    with pytest.raises(ValueError):
        ds.validate()

    ds = StructureDataset.in_dir(str(tmpdir / 'new'))
    ds['ValueTable'].get_column('Source').propertyUrl = URITemplate(
        'http://cldf.clld.org/404')
    ds.write(ValueTable=[])
    with pytest.raises(ValueError):
        ds.validate()
def from_bibfiles(cls, bibfiles, filename, rebuild=False):
    """If needed, (re)build the db from the bibfiles, hash, split/merge."""
    if filename.exists():
        if not rebuild:
            self = cls(filename, bibfiles)
            if self.is_uptodate():
                return self
        remove(filename)

    self = cls(filename, bibfiles)
    with self.connect(async=True) as conn:
        create_tables(conn)
        with conn:
            import_bibfiles(conn, bibfiles)
        entrystats(conn)
        fieldstats(conn)

        with conn:
            generate_hashes(conn)
        hashstats(conn)
        hashidstats(conn)

        with conn:
            assign_ids(conn)
def cmd_download(self, **kw):
    zname = '{0}.zip'.format(self.id)
    self.raw.download(self.__cldf_url__, zname, log=self.log)
    archive = ZipFile(str(self.raw / zname))
    archive.extractall(str(self.raw))
    remove(self.raw / zname)
def __exit__(self, exc_type, exc_val, exc_tb):
    if self.name.exists():
        remove(self.name)
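# A hedged sketch of the kind of class the __exit__ above belongs to: a handle
# whose backing file is deleted when the with-block ends.  The class name and
# the __init__/__enter__ bodies are assumptions; only the cleanup-on-exit
# behaviour is taken from the snippet.
from clldutils.path import Path, remove


class TemporaryPath(object):
    def __init__(self, name):
        self.name = Path(name)

    def __enter__(self):
        return self.name

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.name.exists():
            remove(self.name)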
def gbs_func(command, args, sources=None):  # pragma: no cover
    def words(s):
        return set(slug(s.strip(), remove_whitespace=False).split())

    log = args.log
    count = 0
    api_url = "https://www.googleapis.com/books/v1/volumes?"

    if command == 'cleanup':
        for fname in args.data_file('gbs').glob('*.json'):
            try:
                data = jsonlib.load(fname)
                if data.get('totalItems') == 0:
                    remove(fname)
            except ValueError:
                remove(fname)
        return

    if not sources:
        sources = DBSession.query(common.Source)\
            .order_by(common.Source.id)\
            .options(joinedload(common.Source.data))
    if callable(sources):
        sources = sources()

    for i, source in enumerate(page_query(sources, verbose=True, commit=True)):
        filepath = args.data_file('gbs', 'source%s.json' % source.id)

        if command == 'update':
            source.google_book_search_id = None
            source.update_jsondata(gbs={})

        if command in ['verify', 'update']:
            if filepath.exists():
                try:
                    data = jsonlib.load(filepath)
                except ValueError:
                    log.warn('no JSON object found in: %s' % filepath)
                    continue
                if not data['totalItems']:
                    continue
                item = data['items'][0]
            else:
                continue

        if command == 'verify':
            stitle = source.description or source.title or source.booktitle
            needs_check = False
            year = item['volumeInfo'].get('publishedDate', '').split('-')[0]
            if not year or year != slug(source.year or ''):
                needs_check = True
            twords = words(stitle)
            iwords = words(
                item['volumeInfo']['title'] + ' '
                + item['volumeInfo'].get('subtitle', ''))
            if twords == iwords \
                    or (len(iwords) > 2 and iwords.issubset(twords))\
                    or (len(twords) > 2 and twords.issubset(iwords)):
                needs_check = False
            if int(source.id) == 241:
                log.info('%s' % sorted(words(stitle)))
                log.info('%s' % sorted(iwords))
            if needs_check:
                log.info('------- %s -> %s' % (
                    source.id, item['volumeInfo'].get('industryIdentifiers')))
                log.info('%s %s' % (
                    item['volumeInfo']['title'], item['volumeInfo'].get('subtitle', '')))
                log.info(stitle)
                log.info(item['volumeInfo'].get('publishedDate'))
                log.info(source.year)
                log.info(item['volumeInfo'].get('authors'))
                log.info(source.author)
                log.info(item['volumeInfo'].get('publisher'))
                log.info(source.publisher)
                if not confirm('Are the records the same?'):
                    log.warn('---- removing ----')
                    jsonlib.dump({"totalItems": 0}, filepath)
        elif command == 'update':
            source.google_book_search_id = item['id']
            source.update_jsondata(gbs=item)
            count += 1
        elif command == 'download':
            if source.author and (source.title or source.booktitle):
                title = source.title or source.booktitle
                if filepath.exists():
                    continue
                q = [
                    'inauthor:' + quote_plus(source.author.encode('utf8')),
                    'intitle:' + quote_plus(title.encode('utf8')),
                ]
                if source.publisher:
                    q.append('inpublisher:' + quote_plus(
                        source.publisher.encode('utf8')))
                url = api_url + 'q=%s&key=%s' % ('+'.join(q), args.api_key)
                count += 1
                r = requests.get(url, headers={'accept': 'application/json'})
                log.info('%s - %s' % (r.status_code, url))
                if r.status_code == 200:
                    with open(as_posix(filepath), 'w') as fp:
                        fp.write(r.text.encode('utf8'))
                elif r.status_code == 403:
                    log.warn("limit reached")
                    break

    if command == 'update':
        log.info('assigned gbs ids for %s out of %s sources' % (count, i))
    elif command == 'download':
        log.info('queried gbs for %s sources' % count)
def drop(self):
    if self.fname.exists():
        remove(self.fname)
def remove(self, fname):
    remove(self.joinpath(fname))
def clear(self):
    for key in self.keys():
        remove(self._path(key))
def __delitem__(self, key):
    remove(self._path(key))
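# A hedged sketch of the dict-like, file-backed store that clear() and
# __delitem__ above appear to belong to: each key is persisted as one file
# under a directory, and deleting a key removes that file via
# clldutils.path.remove.  The class name, the _path() layout, the text
# encoding and the MutableMapping base are assumptions.
from collections.abc import MutableMapping

from clldutils.path import Path, remove


class FileCache(MutableMapping):
    def __init__(self, dir_):
        self.dir = Path(dir_)

    def _path(self, key):
        # one file per key, directly under the cache directory
        return self.dir.joinpath(key)

    def __getitem__(self, key):
        return self._path(key).read_text(encoding='utf8')

    def __setitem__(self, key, value):
        self._path(key).write_text(value, encoding='utf8')

    def __delitem__(self, key):
        remove(self._path(key))

    def __iter__(self):
        return (p.name for p in self.dir.iterdir())

    def __len__(self):
        return sum(1 for _ in self.dir.iterdir())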
def gbs_func(command, args, sources=None):  # pragma: no cover
    def words(s):
        return set(slug(s.strip(), remove_whitespace=False).split())

    log = args.log
    count = 0
    api_url = "https://www.googleapis.com/books/v1/volumes?"

    if command == "cleanup":
        for fname in args.data_file("gbs").glob("*.json"):
            try:
                data = jsonlib.load(fname)
                if data.get("totalItems") == 0:
                    remove(fname)
            except ValueError:
                remove(fname)
        return

    if not sources:
        sources = DBSession.query(common.Source).order_by(common.Source.id).options(joinedload(common.Source.data))
    if callable(sources):
        sources = sources()

    for i, source in enumerate(page_query(sources, verbose=True, commit=True)):
        filepath = args.data_file("gbs", "source%s.json" % source.id)

        if command == "update":
            source.google_book_search_id = None
            source.update_jsondata(gbs={})

        if command in ["verify", "update"]:
            if filepath.exists():
                try:
                    data = jsonlib.load(filepath)
                except ValueError:
                    log.warn("no JSON object found in: %s" % filepath)
                    continue
                if not data["totalItems"]:
                    continue
                item = data["items"][0]
            else:
                continue

        if command == "verify":
            stitle = source.description or source.title or source.booktitle
            needs_check = False
            year = item["volumeInfo"].get("publishedDate", "").split("-")[0]
            if not year or year != slug(source.year or ""):
                needs_check = True
            twords = words(stitle)
            iwords = words(item["volumeInfo"]["title"] + " " + item["volumeInfo"].get("subtitle", ""))
            if (
                twords == iwords
                or (len(iwords) > 2 and iwords.issubset(twords))
                or (len(twords) > 2 and twords.issubset(iwords))
            ):
                needs_check = False
            if int(source.id) == 241:
                log.info("%s" % sorted(words(stitle)))
                log.info("%s" % sorted(iwords))
            if needs_check:
                log.info("------- %s -> %s" % (source.id, item["volumeInfo"].get("industryIdentifiers")))
                log.info("%s %s" % (item["volumeInfo"]["title"], item["volumeInfo"].get("subtitle", "")))
                log.info(stitle)
                log.info(item["volumeInfo"].get("publishedDate"))
                log.info(source.year)
                log.info(item["volumeInfo"].get("authors"))
                log.info(source.author)
                log.info(item["volumeInfo"].get("publisher"))
                log.info(source.publisher)
                if not confirm("Are the records the same?"):
                    log.warn("---- removing ----")
                    jsonlib.dump({"totalItems": 0}, filepath)
        elif command == "update":
            source.google_book_search_id = item["id"]
            source.update_jsondata(gbs=item)
            count += 1
        elif command == "download":
            if source.author and (source.title or source.booktitle):
                title = source.title or source.booktitle
                if filepath.exists():
                    continue
                q = [
                    "inauthor:" + quote_plus(source.author.encode("utf8")),
                    "intitle:" + quote_plus(title.encode("utf8")),
                ]
                if source.publisher:
                    q.append("inpublisher:" + quote_plus(source.publisher.encode("utf8")))
                url = api_url + "q=%s&key=%s" % ("+".join(q), args.api_key)
                count += 1
                r = requests.get(url, headers={"accept": "application/json"})
                log.info("%s - %s" % (r.status_code, url))
                if r.status_code == 200:
                    with open(as_posix(filepath), "w") as fp:
                        fp.write(r.text.encode("utf8"))
                elif r.status_code == 403:
                    log.warn("limit reached")
                    break

    if command == "update":
        log.info("assigned gbs ids for %s out of %s sources" % (count, i))
    elif command == "download":
        log.info("queried gbs for %s sources" % count)
def create(self, req, filename=None, verbose=True):
    p = self.abspath(req)
    if not p.parent.exists():  # pragma: no cover
        p.parent.mkdir()
    tmp = Path('%s.tmp' % p)
    language_url_pattern = self.route_url_pattern(req, 'language')

    with ZipFile(tmp.as_posix(), 'w', ZIP_DEFLATED) as zipfile:
        tables = []
        for param in DBSession.query(Parameter).options(joinedload(Parameter.domain)):
            fname = '%s-%s.csv' % (req.dataset.id, param.id)
            zipfile.writestr(fname, self.get_values(param, language_url_pattern))
            tables.append({
                '@type': 'Table',
                'url': fname,
                'notes': [
                    {
                        '@id': req.resource_url(param),
                        'dc:identifier': param.id,
                        'dc:title': param.name,
                        'dc:description': param.description or ''}] + [
                    {
                        '@type': 'DomainElement',
                        'name': de.name,
                        'description': de.description,
                        'numeric': de.number
                    } for de in param.domain
                ],
            })

        md = CsvmJsonAdapter.csvm_basic_doc(req, tables=tables)
        md.update({
            '@type': 'TableGroup',
            'dc:language': list(self.get_languages(req, language_url_pattern)),
            'tableSchema': {
                "columns": [
                    {"name": "ID", "datatype": "string", "required": True},
                    {"name": "Language_ID", "datatype": "string", "required": True},
                    {"name": "Parameter_ID", "datatype": "string", "required": True},
                    {"name": "Contribution_ID", "datatype": "string", "required": True},
                    {"name": "Value", "datatype": "string", "required": True},
                    {"name": "Source", "datatype": "string"},
                    {"name": "Comment", "datatype": "string"},
                ],
                "primaryKey": "ID",
                'aboutUrl': self.route_url_pattern(req, 'value', '{ID}'),
            },
        })
        zipfile.writestr(
            '%s.csv-metadata.json' % req.dataset.id, json.dumps(md, indent=4))
        bib = Database([
            rec.bibtex() for rec in DBSession.query(Source).order_by(Source.name)])
        zipfile.writestr('%s.bib' % req.dataset.id, ('%s' % bib).encode('utf8'))
        zipfile.writestr(
            'README.txt',
            README.format(
                req.dataset.name,
                '=' * (len(req.dataset.name) + len(' data download')),
                req.dataset.license,
                TxtCitation(None).render(req.dataset, req)).encode('utf8'))
    if p.exists():  # pragma: no cover
        remove(p)
    move(tmp, p)