def download_and_unpack_zipfiles(url, dataset, *paths):
    """Download zipfiles and immediately unpack the content.

    The archive at `url` is fetched into a temporary directory, the members
    named in `paths` are extracted there, and each extracted file is copied
    into `dataset.raw`.
    """
    with TemporaryDirectory() as tmpdir:
        archive = tmpdir.joinpath('ds.zip')
        urlretrieve(url, archive.as_posix())
        with zipfile.ZipFile(archive.as_posix()) as zipf:
            for member in paths:
                zipf.extract(as_posix(member), path=tmpdir.as_posix())
                copy(tmpdir.joinpath(member), dataset.raw)
def download(dataset):
    """Download the ``NAME`` zip archive and convert its spreadsheet to CSV.

    The archive is fetched into a temporary directory, the single
    ``NAME + '.xlsx'`` member is extracted, and ``xls2csv`` writes the
    resulting CSV files to ``dataset.raw``.

    :param dataset: dataset object whose ``raw`` directory receives the CSV \
        output.
    """
    with TemporaryDirectory() as tmpdir:
        zippath = tmpdir.joinpath('gbd.zip')
        urlretrieve(
            'http://www.evolution.reading.ac.uk/Files/%s.zip' % NAME,
            zippath.as_posix())
        # `zipf` rather than `zip`: don't shadow the builtin.
        with ZipFile(zippath.as_posix()) as zipf:
            zipf.extract(NAME + '.xlsx', tmpdir.as_posix())
        xls2csv(tmpdir.joinpath(NAME + '.xlsx'), outdir=dataset.raw)
def iter_tables(zippath=None):
    """Yield a `Table` for each archive member matching `TABLE_NAME_PATTERN`.

    :param zippath: optional path to an already-downloaded archive; when \
        falsy, the archive is downloaded into a temporary directory first.
    """
    with TemporaryDirectory() as tmp:
        archive_path = zippath or download_tables(tmp)
        with ZipArchive(archive_path) as archive:
            for member in archive.namelist():
                m = TABLE_NAME_PATTERN.search(member)
                if m is None:
                    continue
                yield Table(m.group('name_and_date'), archive.read_text(member))
def download_and_unpack(self, url, *paths, **kw):
    """
    Download a zipfile and immediately unpack selected content.

    :param url: URL of the zip archive to download.
    :param paths: archive member names to extract; each extracted file is \
        copied into this directory.
    :param kw: supported keyword arguments: ``log``, passed through to \
        ``temp_download``.
    """
    # Note: this method returns None; the side effect is the copied files.
    with self.temp_download(url, 'ds.zip', log=kw.pop('log', None)) as zipp:
        with TemporaryDirectory() as tmpdir:
            with zipfile.ZipFile(zipp.as_posix()) as zipf:
                for path in paths:
                    zipf.extract(as_posix(path), path=tmpdir.as_posix())
                    copy(tmpdir.joinpath(path), self)
def download_and_unpack(self, url, *paths, **kw):
    """
    Download a zipfile and immediately unpack selected content.

    :param url: URL of the zip archive to download.
    :param paths: member names to extract; when empty, every member is \
        extracted.
    :param kw: supported keyword arguments: ``log``, passed through to \
        ``temp_download``.
    """
    log = kw.pop('log', None)
    with self.temp_download(url, 'ds.zip', log=log) as zipp:
        with TemporaryDirectory() as tmpdir:
            with zipfile.ZipFile(str(zipp)) as zipf:
                for info in zipf.infolist():
                    # Skip members not requested (an empty `paths` selects all).
                    if paths and info.filename not in paths:
                        continue
                    zipf.extract(info, path=str(tmpdir))
                    shutil.copy(str(tmpdir.joinpath(info.filename)), str(self))
def retrieve(self, item, cdstar_catalog, checksums, mediacatalog):
    """Fetch one staged image and register it with CDSTAR.

    - download (or copy, when the source is a local path)
    - compute checksum
    - upload to CDSTAR
    - add to cdstar.json

    :param item: staged-image record; its `metadata` must provide a \
        `source_url` for anything to happen.
    :param cdstar_catalog: catalog whose `create` uploads the file.
    :param checksums: collection of already-seen md5 checksums (membership \
        test only).
    :param mediacatalog: local media catalog keyed by checksum.
    :raises ValueError: when the file's checksum was already seen.
    :return: Image instance, or None when there is no source URL or the \
        download fails.
    """
    md = self.metadata(item) or {}
    source_url = md.pop('source_url', None)
    if not source_url:
        # Nothing to fetch for this item.
        return
    # We turn the Staged_images instance into a `dict`, which we will enrich and then
    # turn into an Images instance.
    item = dict(zip(item.fields(), item.csv_row()))
    with TemporaryDirectory() as tmp:
        if isinstance(source_url, Path):
            # Local file: copy it into the temp dir instead of downloading.
            fname = tmp.joinpath(source_url.name)
            copy(source_url, fname)
        else:
            # download the thing
            fname = self._download(source_url, tmp)
            if not fname:
                # Download failed; skip this item.
                return
        # The md5 checksum doubles as the canonical item id below.
        checksum = md5(fname)
        if checksum in checksums:
            raise ValueError('duplicate item {0} {1}'.format(item['id'], checksum))
        item.update(md)
        item['id'] = checksum
        item['collection'] = 'Tsammalex'
        img = Images.fromdict(item)
        if checksum not in mediacatalog.items:
            # now upload to CDSTAR
            _, _, obj = list(cdstar_catalog.create(fname, item))[0]
            mediacatalog.add(obj)
        return img
def test_TemporaryDirectory():
    """The directory exists inside the context and is removed on exit."""
    from clldutils.path import TemporaryDirectory

    with TemporaryDirectory() as tmpdir:
        assert tmpdir.exists()
    assert not tmpdir.exists()
def download(dataset):
    """Fetch the source spreadsheet from `URL` and convert it to CSV.

    The CSV output is written to `dataset.raw`.
    """
    with TemporaryDirectory() as tmpdir:
        target = tmpdir.joinpath('ds.xlsm')
        urlretrieve(URL, target.as_posix())
        xls2csv(target, outdir=dataset.raw)
def create(self, req, filename=None, verbose=True, outfile=None):
    """Export the database as a CLDF dataset and pack it into a zip archive.

    :param req: request object providing the registry (for `ICldfConfig`), \
        the current `req.dataset` and URL helpers.
    :param filename: not referenced in this body — presumably used by a \
        related method; TODO confirm.
    :param verbose: if true, print progress per exported model.
    :param outfile: optional output path; when falsy, `self.abspath(req)` \
        is used.
    """
    cldf_cfg = req.registry.getUtility(ICldfConfig)
    with TemporaryDirectory() as tmpd:
        # Build the configured CLDF dataset class inside a scratch directory;
        # the directory is zipped at the very end.
        cls = getattr(dataset, cldf_cfg.module)
        ds = cls.in_dir(tmpd)
        # Dublin Core / DCAT metadata taken from the current clld dataset.
        ds.properties['dc:bibliographicCitation'] = text_citation(
            req, req.dataset)
        ds.properties['dc:publisher'] = '%s, %s' % (
            req.dataset.publisher_name, req.dataset.publisher_place)
        ds.properties['dc:license'] = req.dataset.license
        ds.properties['dc:issued'] = req.dataset.published.isoformat()
        ds.properties['dc:title'] = req.dataset.name
        ds.properties['dc:creator'] = req.dataset.formatted_editors()
        ds.properties['dc:identifier'] = req.resource_url(req.dataset)
        ds.properties['dcat:accessURL'] = req.route_url('download')
        # Optional components are added only when the database has rows for them.
        if DBSession.query(Sentence).count():
            ds.add_component('ExampleTable')
        if DBSession.query(DomainElement).count():
            ds.add_component('CodeTable', {
                'name': 'Number',
                'datatype': 'integer'
            })
        ds.add_component('ParameterTable')
        ds.add_component('LanguageTable')
        ds.add_table('contributions.csv', 'ID', 'Name', 'Description', 'Contributors')
        # Link each row of the primary table to its contribution.
        ds.add_columns(
            ds.primary_table,
            Column.fromvalue({
                'name': 'Contribution_ID',
                'datatype': 'string',
                'valueUrl': url_template(req, 'contribution', 'contribution').uri,
            }))
        ds.add_foreign_key(ds.primary_table, 'Contribution_ID', 'contributions.csv', 'ID')
        ds['LanguageTable'].aboutUrl = url_template(req, 'language', 'ID')
        ds['ParameterTable'].aboutUrl = url_template(
            req, 'parameter', 'ID')
        ds[ds.primary_table].aboutUrl = url_template(req, 'value', 'ID')
        # Hook for app-specific schema additions.
        cldf_cfg.custom_schema(req, ds)
        for src in cldf_cfg.query(Source):
            ds.sources.add(cldf_cfg.convert(Source, src, req))
        fname = outfile or self.abspath(req)
        # End the current transaction; each model below is read in its own
        # short begin/abort pair (NOTE(review): presumably to bound session
        # state during the long export — confirm).
        transaction.abort()
        tabledata = defaultdict(list)
        for table, model in [
            ('ParameterTable', Parameter),
            ('CodeTable', DomainElement),
            ('LanguageTable', Language),
            ('ExampleTable', Sentence),
            ('contributions.csv', Contribution),
            (ds.primary_table, Value),
        ]:
            if verbose:
                print('exporting {0} ...'.format(model))
            transaction.begin()
            for item in cldf_cfg.query(model):
                tabledata[table].append(cldf_cfg.convert(model, item, req))
            transaction.abort()
            if verbose:
                print('... done')
        # Leave an open transaction for the write/validate phase.
        transaction.begin()
        ds.write(**cldf_cfg.custom_tabledata(req, tabledata))
        ds.validate()
        # Archive name is `fname` without its suffix; zip the scratch dir.
        shutil.make_archive(str(fname.parent / fname.stem), 'zip', str(tmpd))