def import_csv(dataset, url, args):
    """ Import the csv data into the dataset """
    csv_data_url, source_url = url
    source = Source(dataset, shell_account(), csv_data_url)

    # Analyse the csv data and add it to the source
    # If we don't analyse it we'll be left with a weird message
    source.analysis = analyze_csv(csv_data_url)

    # Check to see if the dataset already has this source
    for source_ in dataset.sources:
        if source_.url == csv_data_url:
            source = source_
            break

    db.session.add(source)
    db.session.commit()

    dataset.generate()
    importer = CSVImporter(source)
    importer.run(**vars(args))

    # Check if imported from the file system (source and data url differ)
    if csv_data_url != source_url:
        # If we did, then we must update the source url based on the
        # sources in the dataset model (so we need to fetch the source again
        # or else we'll add a new one)
        source = Source.by_id(source.id)
        source.url = source_url
        db.session.commit()
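# A minimal usage sketch for import_csv above, assuming it is driven from a
# CLI entry point. The argparse Namespace fields mirror the importer options
# used elsewhere in this collection (dry_run, max_lines, max_errors); the
# dataset name and URLs are illustrative assumptions, not documented values.
def run_import_example():
    from argparse import Namespace
    dataset = Dataset.by_name('cra')
    # (csv_data_url, source_url): identical unless loading from the local
    # file system, in which case the second element is the public URL.
    urls = ('file:///tmp/cra.csv', 'http://example.org/cra.csv')
    args = Namespace(dry_run=True, max_lines=1000, max_errors=1000)
    import_csv(dataset, urls, args)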
def test_dimensions_edit_mask_with_data(self):
    cra = Dataset.by_name('cra')
    src = Source(cra, self.user, 'file:///dev/null')
    src.analysis = {'columns': ['amount', 'etc']}
    db.session.add(src)
    db.session.commit()
    response = self.app.get(url(controller='editor',
                                action='dimensions_edit',
                                dataset='cra'),
                            extra_environ={'REMOTE_USER': '******'})
    assert 'cannot edit dimensions' in response.body
    assert '"amount"' not in response.body
    assert 'Update' not in response.body
def create(self): """ Adds a new dataset dynamically through a POST request """ # User must be authenticated so we should have a user object in # c.account, if not abort with error message if not c.account: abort(status_code=400, detail='user not authenticated') # Check if the params are there ('metadata', 'csv_file') if len(request.params) != 2: abort(status_code=400, detail='incorrect number of params') metadata = request.params['metadata'] \ if 'metadata' in request.params \ else abort(status_code=400, detail='metadata is missing') csv_file = request.params['csv_file'] \ if 'csv_file' in request.params \ else abort(status_code=400, detail='csv_file is missing') # We proceed with the dataset try: model = json.load(urllib2.urlopen(metadata)) except: abort(status_code=400, detail='JSON model could not be parsed') try: log.info("Validating model") model = validate_model(model) except Invalid as i: log.error("Errors occured during model validation:") for field, error in i.asdict().items(): log.error("%s: %s", field, error) abort(status_code=400, detail='Model is not well formed') dataset = Dataset.by_name(model['dataset']['name']) if dataset is None: dataset = Dataset(model) require.dataset.create() dataset.managers.append(c.account) dataset.private = True # Default value db.session.add(dataset) else: require.dataset.update(dataset) log.info("Dataset: %s", dataset.name) source = Source(dataset=dataset, creator=c.account, url=csv_file) log.info(source) for source_ in dataset.sources: if source_.url == csv_file: source = source_ break db.session.add(source) db.session.commit() # Send loading of source into celery queue load_source.delay(source.id) return to_jsonp(dataset_apply_links(dataset.as_dict()))
def get_run(dataset, source, id):
    dataset = get_dataset(dataset)
    source = obj_or_404(Source.by_id(source))
    if source.dataset != dataset:
        raise BadRequest("There was no source")
    run = obj_or_404(Run.by_id(id))
    if run.source != source:
        raise BadRequest("There is no run %s" % str(id))
    return dataset, source, run
def _get_run(self, dataset, source, id):
    self._get_dataset(dataset)
    require.dataset.update(c.dataset)
    c.source = Source.by_id(source)
    if c.source is None or c.source.dataset != c.dataset:
        abort(404, _("There is no source '%s'") % source)
    c.run = Run.by_id(id)
    if c.run is None or c.run.source != c.source:
        abort(404, _("There is no run '%s'") % id)
def get_run(dataset, source, id):
    dataset = get_dataset(dataset)
    require.dataset.update(dataset)
    source = obj_or_404(Source.by_id(source))
    if source.dataset != dataset:
        raise BadRequest("There was no source")
    run = obj_or_404(Run.by_id(id))
    if run.source != source:
        raise BadRequest("There is no run '" + str(id) + "'")
    return dataset, source, run
def test_view_source(self):
    url_ = 'http://banana.com/split.csv'
    source = Source(self.dataset, self.user, url_)
    db.session.add(source)
    db.session.commit()
    response = self.app.get(url(controller='source',
                                action='view',
                                dataset='cra',
                                id=source.id),
                            extra_environ={'REMOTE_USER': '******'})
    assert response.headers['Location'] == url_, response.headers
def check_column(source_id, columnkey, columnvalue):
    with flask_app.app_context():
        source = Source.by_id(source_id)
        sourcerefine = source.get_or_create_ORProject()

        # should cache this at some point
        sourcefile_export = sourcerefine.refineproj.export()

        # remove BOM from the source file
        s = sourcefile_export.read()
        u = s.decode("utf-8-sig")
        sourcefile = io.BytesIO()
        sourcefile.write(str(u))
        # rewind so the CSV reader starts from the beginning of the buffer
        sourcefile.seek(0)

        sourcefile_csv = csv.DictReader(sourcefile, delimiter="\t")
        arrayset = []
        for row in sourcefile_csv:
            print row[columnvalue]
            arrayset.append(row[columnvalue])
        sourcefile.close()

        returnval = {"errors": [], "message": "There was an unexpected error"}

        if columnkey == "country_level0":
            temp_geom_countries = db.session.query("country").from_statement(
                text("SELECT geometry__country_level0.label as country "
                     "FROM public.geometry__country_level0 ")).all()
            geom_countries = [y for x in temp_geom_countries for y in x]
            temp_geom_countries = None

            returnval['message'] = "The following countries were not found:"
            for country in arrayset:
                # there is probably a better method that takes advantage
                # of a sorted list
                if country not in geom_countries:
                    # log as error
                    returnval['errors'].append(country)

        elif columnkey == "time":
            returnval['message'] = "Could not parse the following dates:"
            for date_col in arrayset:
                try:
                    parse(date_col)
                except Exception as e:
                    returnval['errors'].append(date_col)

        elif columnkey == "indicatorvalue":
            returnval['message'] = "Could not parse the following values: "
            for val_col in arrayset:
                try:
                    float(val_col)
                except:
                    returnval['errors'].append(val_col)
def load_with_model_and_csv(self, metadata, csv_file, private):
    """ Load a dataset using a metadata model file and a csv file """
    if metadata is None:
        response.status = 400
        return to_jsonp({'errors': 'metadata is missing'})

    if csv_file is None:
        response.status = 400
        return to_jsonp({'errors': 'csv_file is missing'})

    # We proceed with the dataset
    try:
        model = json.load(urllib2.urlopen(metadata))
    except:
        response.status = 400
        return to_jsonp({'errors': 'JSON model could not be parsed'})

    try:
        log.info("Validating model")
        model = validate_model(model)
    except Invalid as i:
        log.error("Errors occurred during model validation:")
        for field, error in i.asdict().items():
            log.error("%s: %s", field, error)
        response.status = 400
        return to_jsonp({'errors': 'Model is not well formed'})

    dataset = Dataset.by_name(model['dataset']['name'])
    if dataset is None:
        dataset = Dataset(model)
        require.dataset.create()
        dataset.managers.append(c.account)
        dataset.private = private
        db.session.add(dataset)
    else:
        require.dataset.update(dataset)

    log.info("Dataset: %s", dataset.name)
    source = Source(dataset=dataset, creator=c.account, url=csv_file)
    log.info(source)

    for source_ in dataset.sources:
        if source_.url == csv_file:
            source = source_
            break

    db.session.add(source)
    db.session.commit()

    # Send loading of source into celery queue
    load_source.delay(source.id)

    return to_jsonp(dataset_apply_links(dataset.as_dict()))
def csvimport_fixture(name):
    model_fp = csvimport_fixture_file(name, 'model.json')
    mapping_fp = csvimport_fixture_file(name, 'mapping.json')
    model = json.load(model_fp)
    if mapping_fp:
        model['mapping'] = json.load(mapping_fp)
    dataset = Dataset(model)
    dataset.generate()
    db.session.add(dataset)
    data_path = csvimport_fixture_path(name, 'data.csv')
    user = make_account()
    source = Source(dataset, user, data_path)
    db.session.add(source)
    db.session.commit()
    return source
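# Hypothetical use of the csvimport_fixture helper above in a test; the
# fixture name 'cra' and the dry-run importer call are assumptions based on
# the other snippets in this collection, not a documented fixture.
def test_csvimport_fixture_loads():
    source = csvimport_fixture('cra')
    importer = CSVImporter(source)
    importer.run(dry_run=True, max_lines=100, max_errors=100)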
def analyze_source(source_id):
    from openspending.model import meta as db
    from openspending.model.source import Source
    from openspending.importer.analysis import analyze_csv

    source = Source.by_id(source_id)
    if not source:
        log.error("No such source: %s", source_id)
        return

    log.info("Analyzing: %s", source.url)
    source.analysis = analyze_csv(source.url)
    if 'error' in source.analysis:
        log.error(source.analysis.get('error'))
    else:
        log.info("Columns: %r", source.analysis.get('columns'))
    db.session.commit()
def check_column(source_id, columnkey, columnvalue):
    # with flask_app.app_context():
    source = Source.by_id(source_id)
    sourcerefine = source.get_or_create_ORProject()

    # should cache this at some point
    sourcefile_export = sourcerefine.refineproj.export()

    # remove BOM from the source file
    s = sourcefile_export.read()
    u = s.decode("utf-8-sig")
    sourcefile = io.BytesIO()
    sourcefile.write(str(u))
    # rewind so the CSV reader starts from the beginning of the buffer
    sourcefile.seek(0)

    sourcefile_csv = csv.DictReader(sourcefile, delimiter="\t")
    arrayset = []
    for row in sourcefile_csv:
        print row[columnvalue]
        arrayset.append(row[columnvalue])
    sourcefile.close()

    returnval = {"errors": [], "message": "There was an unexpected error"}

    if columnkey == "country_level0":
        temp_geom_countries = (
            db.session.query("country")
            .from_statement(
                text("SELECT geometry__country_level0.label as country "
                     "FROM public.geometry__country_level0 ")
            )
            .all()
        )
        geom_countries = [y for x in temp_geom_countries for y in x]
        temp_geom_countries = None

        returnval["message"] = "The following countries were not found:"
        for country in arrayset:
            # there is probably a better method that takes advantage
            # of a sorted list
            if country not in geom_countries:
                # log as error
                returnval["errors"].append(country)

    elif columnkey == "time":
        returnval["message"] = "Could not parse the following dates:"
        for date_col in arrayset:
            try:
                parse(date_col)
            except Exception as e:
                returnval["errors"].append(date_col)
def update(archive_dir, dataset=None):
    """ Download all sources into an archive directory. If the dataset
    parameter is provided only sources for that dataset will be fetched
    (otherwise all sources in the database will be fetched) """

    # Create archive directory if it doesn't exist
    if not os.path.isdir(archive_dir):
        os.makedirs(archive_dir)

    # If a dataset is provided we limit to only its sources (else we take all)
    sources = Source.all() if dataset is None else dataset.sources

    # Update each source
    for source in sources:
        update_source(archive_dir, source)
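# A sketch of driving the archive update above from a one-off script; the
# archive directory path and dataset name are illustrative assumptions.
def archive_sources_example():
    dataset = Dataset.by_name('cra')
    # Pass dataset=None instead to archive sources for every dataset.
    update('/var/lib/openspending/archive', dataset=dataset)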
def load_source(source_id, sample=False):
    from openspending.model.source import Source
    from openspending.importer import CSVImporter

    source = Source.by_id(source_id)
    if not source:
        log.error("No such source: %s", source_id)
        return

    if not source.loadable:
        log.error("Dataset has no mapping.")
        return

    source.dataset.generate()
    importer = CSVImporter(source)
    if sample:
        importer.run(dry_run=True, max_lines=1000, max_errors=1000)
    else:
        importer.run()
    index_dataset.delay(source.dataset.name)
def create(self, dataset):
    self._get_dataset(dataset)
    require.dataset.update(c.dataset)
    try:
        schema = source_schema()
        data = schema.deserialize(request.params)
        source = Source(c.dataset, c.account, data['url'])
        db.session.add(source)
        db.session.commit()
        analyze_source.apply_async(args=[source.id], countdown=2)
        h.flash_success(_("The source has been created."))
        redirect(h.url_for(controller='editor', action='index',
                           dataset=c.dataset.name))
    except Invalid as i:
        errors = i.asdict()
        errors = [(k[len('source.'):], v) for k, v in errors.items()]
        return self.new(dataset, dict(errors))
def load_source(source_id, sample=False):
    with flask_app.app_context():
        source = Source.by_id(source_id)
        if not source:
            return log.error("No such source: %s", source_id)
        if not source.dataset.mapping:
            return log.error("Dataset has no mapping.")

        # we should drop this first to make sure everything loads correctly
        source.model.drop()
        source.model.generate()

        importer = ORImporter(source)
        if sample:
            importer.run(dry_run=True, max_lines=1000, max_errors=1000)
        else:
            importer.run()
def load_source(source_id, sample=False):
    # with flask_app.app_context():
    source = Source.by_id(source_id)
    if not source:
        return log.error("No such source: %s", source_id)
    if not source.dataset.mapping:
        return log.error("Dataset has no mapping.")

    # we should drop this first to make sure everything loads correctly
    source.model.drop()
    source.model.generate()

    importer = ORImporter(source)
    if sample:
        importer.run(dry_run=True, max_lines=1000, max_errors=1000)
    else:
        importer.run()
def load_budgetdatapackage(source_id, sample=False):
    """ Same as the CSV importer except that it uses the BudgetDataPackage
    importer instead of the CSVImporter """
    from openspending.model.source import Source
    from openspending.importer import BudgetDataPackageImporter

    source = Source.by_id(source_id)
    if not source:
        log.error("No such source: %s", source_id)
        return

    if not source.loadable:
        log.error("Dataset has no mapping.")
        return

    source.dataset.generate()
    importer = BudgetDataPackageImporter(source)
    if sample:
        importer.run(dry_run=True, max_lines=1000, max_errors=1000)
    else:
        importer.run()
    index_dataset.delay(source.dataset.name)
def create_budget_data_package(url, user, private):
    try:
        bdpkg = BudgetDataPackage(url)
    except Exception as problem:
        # Lots of different types of problems can arise with a
        # BudgetDataPackage, but their message should be understandable
        # so we catch just any Exception and email its message to the user
        log.error("Failed to parse budget data package: {0}".format(
            problem.message))
        return []

    sources = []
    for (idx, resource) in enumerate(bdpkg.resources):
        dataset = Dataset.by_name(bdpkg.name)
        if dataset is None:
            # Get information from the descriptor file for the given
            # resource (at index idx)
            info = get_dataset_info_from_descriptor(bdpkg, idx)
            # Set the dataset name based on the previously computed one
            info['dataset']['name'] = bdpkg.name

            # Create the model from the resource schema
            model = create_model_from_schema(resource.schema)
            # Set the default value for the time to the fiscal year of the
            # resource, because it isn't included in the budget CSV so we
            # won't be able to load it along with the data.
            model['time']['default_value'] = resource.fiscalYear

            # Add the model as the mapping
            info['mapping'] = model

            # Create the dataset
            dataset = Dataset(info)
            dataset.managers.append(user)
            dataset.private = private
            db.session.add(dataset)
            db.session.commit()
        else:
            if not dataset.can_update(user):
                log.error(
                    "User {0} not permitted to update dataset {1}".format(
                        user.name, bdpkg.name))
                return []

        if 'url' in resource:
            resource_url = resource.url
        elif 'path' in resource:
            if 'base' in bdpkg:
                resource_url = urlparse.urljoin(bdpkg.base, resource.path)
            else:
                resource_url = urlparse.urljoin(url, resource.path)
        else:
            log.error('Url not found')
            return []

        # We do not re-add old sources so if we find the same source
        # we don't do anything, else we create the source and append it
        # to the source list
        for dataset_source in dataset.sources:
            if dataset_source.url == resource_url:
                break
        else:
            source = Source(dataset=dataset, creator=user,
                            url=resource_url)
            db.session.add(source)
            db.session.commit()
            sources.append(source)

    return sources
def create(self): """ Adds a new dataset dynamically through a POST request """ # User must be authenticated so we should have a user object in # c.account, if not abort with error message if not c.account: abort(status_code=400, detail='user not authenticated') # Parse the loading api parameters to get them into the right format parser = LoadingAPIParamParser(request.params) params, errors = parser.parse() if errors: response.status = 400 return to_jsonp({'errors': errors}) if params['metadata'] is None: response.status = 400 return to_jsonp({'errors': 'metadata is missing'}) if params['csv_file'] is None: response.status = 400 return to_jsonp({'errors': 'csv_file is missing'}) # We proceed with the dataset try: model = json.load(urllib2.urlopen(params['metadata'])) except: response.status = 400 return to_jsonp({'errors': 'JSON model could not be parsed'}) try: log.info("Validating model") model = validate_model(model) except Invalid as i: log.error("Errors occured during model validation:") for field, error in i.asdict().items(): log.error("%s: %s", field, error) response.status = 400 return to_jsonp({'errors': 'Model is not well formed'}) dataset = Dataset.by_name(model['dataset']['name']) if dataset is None: dataset = Dataset(model) require.dataset.create() dataset.managers.append(c.account) dataset.private = params['private'] db.session.add(dataset) else: require.dataset.update(dataset) log.info("Dataset: %s", dataset.name) source = Source(dataset=dataset, creator=c.account, url=params['csv_file']) log.info(source) for source_ in dataset.sources: if source_.url == params['csv_file']: source = source_ break db.session.add(source) db.session.commit() # Send loading of source into celery queue load_source.delay(source.id) return to_jsonp(dataset_apply_links(dataset.as_dict()))
def _get_source(self, dataset, id):
    self._get_dataset(dataset)
    c.source = Source.by_id(id)
    if c.source is None or c.source.dataset != c.dataset:
        abort(404, _("There is no source '%s'") % id)