def upload(self, id):
    """
    Controller action: accept an uploaded spreadsheet and load its
    contents into the central system for dataset *id*.

    On success, flashes a confirmation message and redirects back to the
    dataset read page.  On invalid input (no file, or bad spreadsheet
    data), re-renders the preview table with the error message.
    """
    package_type = self._get_package_type(id)
    geno = get_geno(package_type)
    lc = ckanapi.LocalCKAN(username=c.user)
    dataset = lc.action.package_show(id=id)
    try:
        # an empty string means the form was submitted with no file chosen
        if request.POST['xls_update'] == '':
            raise BadExcelData('You must provide a valid file')
        _process_upload_file(
            lc, dataset, request.POST['xls_update'].file, geno)
        h.flash_success(_(
            "Your file was successfully uploaded into the central system."
        ))
        redirect(h.url_for(controller='package', action='read', id=id))
    except BadExcelData as e:  # 'as' form: valid on py2.6+ and py3
        org = lc.action.organization_show(id=dataset['owner_org'])
        return self.preview_table(
            resource_name=dataset['resources'][0]['name'],
            owner_org=org['name'],
            errors=[e.message])
def recombinant_get_geno(dataset_type):
    """
    Get the dataset definition (geno) for the given dataset type.

    Returns None when no definition exists for that type: the
    RecombinantException is swallowed so template code can simply
    test the result for truthiness.
    """
    try:
        return get_geno(dataset_type)
    except RecombinantException:
        return  # unknown dataset type -> None
def recombinant_get_geno(dataset_type):
    """
    Get the dataset definition (geno) for the given dataset type.

    Returns None when no definition exists for that type: the
    RecombinantException is swallowed so callers can simply test
    the result for truthiness.
    """
    try:
        return get_geno(dataset_type)
    except RecombinantException:
        return  # unknown dataset type -> None
def schema_json(self, dataset_type):
    """
    Controller action: return the recombinant definition (geno) for
    dataset_type as a JSON document.

    Only fields marked visible_to_public (default True) are included.
    OrderedDicts are used throughout so key order in the emitted JSON
    follows the definition order.  404s when dataset_type has no
    recombinant definition.
    """
    try:
        geno = get_geno(dataset_type)
    except RecombinantException:
        abort(404, _('Recombinant dataset_type not found'))
    schema = OrderedDict()
    # top-level metadata copied straight from the geno when present
    for k in ['dataset_type', 'title', 'notes']:
        if k in geno:
            schema[k] = geno[k]
    schema['resources'] = []
    for chromo in geno['resources']:
        resource = OrderedDict()
        schema['resources'].append(resource)
        # datastore_id -> choices pairs, for fields with fixed choice lists
        choice_fields = dict(
            (f['datastore_id'], f['choices'])
            for f in recombinant_choice_fields(
                chromo['resource_name'], all_languages=True))
        for k in ['title', 'resource_name']:
            if k in chromo:
                resource[k] = chromo[k]
        resource['fields'] = []
        for field in chromo['fields']:
            if not field.get('visible_to_public', True):
                continue  # hidden fields are excluded from the public schema
            fld = OrderedDict()
            resource['fields'].append(fld)
            fld['id'] = field['datastore_id']
            for k in ['label', 'description', 'obligation', 'format_type']:
                if k in field:
                    fld[k] = field[k]
            if fld['id'] in choice_fields:
                choices = OrderedDict()
                fld['choices'] = choices
                for ck, cv in choice_fields[fld['id']]:
                    choices[ck] = cv
        resource['primary_key'] = chromo['datastore_primary_key']
        if 'examples' in chromo:
            # include only example values for fields defined in the chromo
            ex_record = chromo['examples']['record']
            example = OrderedDict()
            for field in chromo['fields']:
                if field['datastore_id'] in ex_record:
                    example[field['datastore_id']] = ex_record[
                        field['datastore_id']]
            resource['example_record'] = example
    # serve inline as a named .json download
    response.headers['Content-Type'] = 'application/json'
    response.headers['Content-Disposition'] = (
        'inline; filename="{0}.json"'.format(
            dataset_type))
    return json.dumps(schema, indent=2, ensure_ascii=False).encode('utf-8')
def preview_table(self, id, resource_id):
    """
    Render the resource edit/preview page for one resource of a
    recombinant dataset.

    404s when the dataset's type has no recombinant definition, or when
    resource_id does not belong to the dataset.
    """
    registry = ckanapi.LocalCKAN(username=c.user)
    dataset = registry.action.package_show(id=id)
    try:
        get_geno(dataset['type'])
    except RecombinantException:
        abort(404, _('Recombinant dataset_type not found'))
    matching = None
    for candidate in dataset['resources']:
        if candidate['id'] == resource_id:
            matching = candidate
            break
    if matching is None:
        abort(404, _('Resource not found'))
    return render('recombinant/resource_edit.html', extra_vars={
        'dataset': dataset,
        'resource': matching,
    })
def rebuild(command_name, csv_files=None, solr_url=None, strict=True):
    """
    Implement rebuild command

    :param csv_files: sequence of paths to .csv files for input
    :type csv_files: sequence of str

    :return: Nothing
    :rtype: None
    """
    # wipe the index, then repopulate either from the given csv files or
    # from the datastore contents of every organization
    clear_index(command_name, solr_url, False)
    conn = solr_connection(command_name, solr_url)
    lc = LocalCKAN()
    if csv_files:
        for csv_file in csv_files:
            print csv_file + ':'
            prev_org = None
            unmatched = None
            firstpart, filename = os.path.split(csv_file)
            assert filename.endswith('.csv')
            resource_name = filename[:-4]  # file name (minus .csv) selects the chromo
            chromo = get_chromo(resource_name)
            geno = get_geno(chromo['dataset_type'])
            for org_id, records in csv_data_batch(csv_file, chromo, strict=strict):
                records = [
                    dict((k, safe_for_solr(v)) for k, v in row_dict.items())
                    for row_dict in records
                ]
                # NOTE(review): prev_org is never reassigned, so this
                # resets unmatched on every batch — verify intended
                if org_id != prev_org:
                    unmatched = None
                try:
                    org_detail = lc.action.organization_show(id=org_id)
                except NotFound:
                    continue  # skip records for organizations that no longer exist
                print " {0:s} {1}".format(org_id, len(records))
                unmatched = _update_records(records, org_detail, conn, resource_name, unmatched)
    else:
        for org in lc.action.organization_list():
            count = 0
            org_detail = lc.action.organization_show(id=org)
            unmatched = None
            for resource_name, records in data_batch(org_detail['id'], lc, command_name):
                unmatched = _update_records(records, org_detail, conn, resource_name, unmatched)
                count += len(records)
            print org, count
    print "commit"
    conn.commit()
def data_dictionary(self, dataset_type):
    """
    Controller action: build the Excel data dictionary workbook for
    dataset_type and return its bytes as a spreadsheet response.

    404s when dataset_type has no recombinant definition.
    """
    try:
        definition = get_geno(dataset_type)
    except RecombinantException:
        abort(404, _('Recombinant dataset_type not found'))
    workbook = excel_data_dictionary(definition)
    response.headers['Content-Type'] = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    buf = StringIO()
    workbook.save(buf)
    return buf.getvalue()
def _rebuild(self, csv_files=None, solr_url=None, strict=True):
    """
    Implement rebuild command

    :param csv_files: sequence of paths to .csv files for input
    :type csv_files: sequence of str

    :return: Nothing
    :rtype: None
    """
    # wipe the ati index, then repopulate either from the given csv
    # files or from the datastore contents of every organization
    self._clear_index(solr_url, False)
    conn = solr_connection('ati', solr_url)
    lc = LocalCKAN()
    if csv_files:
        for csv_file in csv_files:
            print csv_file + ':'
            firstpart, filename = os.path.split(csv_file)
            assert filename.endswith('.csv')
            resource_name = filename[:-4]  # file name (minus .csv) selects the chromo
            chromo = get_chromo(resource_name)
            geno = get_geno(chromo['dataset_type'])
            # this command only rebuilds its own target dataset
            assert geno.get('target_dataset') == TARGET_DATASET
            for org_id, records in csv_data_batch(csv_file, chromo, strict=strict):
                records = [
                    dict((k, safe_for_solr(v)) for k, v in row_dict.items())
                    for row_dict in records
                ]
                try:
                    org_detail = lc.action.organization_show(id=org_id)
                except NotFound:
                    continue  # skip records for organizations that no longer exist
                print " {0:s} {1}".format(org_id, len(records))
                _update_records(records, org_detail, conn)
    else:
        for org_id in lc.action.organization_list():
            count = 0
            org_detail = lc.action.organization_show(id=org_id)
            for resource_name, records in data_batch(
                    org_detail['id'], lc, TARGET_DATASET):
                _update_records(records, org_detail, conn)
                count += len(records)
            print org_id, count
    print "commit"
    conn.commit()
def rebuild(command_name, csv_files=None, solr_url=None):
    """
    Implement rebuild command

    :param csv_files: sequence of paths to .csv files for input
    :type csv_files: sequence of str

    :return: Nothing
    :rtype: None
    """
    # wipe the index, then repopulate either from the given csv files or
    # from the datastore contents of every organization
    clear_index(command_name, solr_url, False)
    conn = solr_connection(command_name, solr_url)
    lc = LocalCKAN()
    if csv_files:
        for csv_file in csv_files:
            print csv_file + ':'
            prev_org = None
            unmatched = None
            firstpart, filename = os.path.split(csv_file)
            assert filename.endswith('.csv')
            resource_name = filename[:-4]  # file name (minus .csv) selects the chromo
            chromo = get_chromo(resource_name)
            geno = get_geno(chromo['dataset_type'])
            for org_id, records in csv_data_batch(csv_file, chromo):
                records = [dict((k, safe_for_solr(v)) for k, v in
                                row_dict.items()) for row_dict in records]
                # NOTE(review): prev_org is never reassigned, so this
                # resets unmatched on every batch — verify intended
                if org_id != prev_org:
                    unmatched = None
                try:
                    org_detail = lc.action.organization_show(id=org_id)
                except NotFound:
                    continue  # skip records for organizations that no longer exist
                print " {0:s} {1}".format(org_id, len(records))
                unmatched = _update_records(
                    records, org_detail, conn, resource_name, unmatched)
    else:
        for org in lc.action.organization_list():
            count = 0
            org_detail = lc.action.organization_show(id=org)
            unmatched = None
            for resource_name, records in data_batch(org_detail['id'], lc, command_name):
                unmatched = _update_records(
                    records, org_detail, conn, resource_name, unmatched)
                count += len(records)
            print org, count
    print "commit"
    conn.commit()
def excel_template(dataset_type, org):
    """
    return an openpyxl.Workbook object containing the sheet and header fields
    for passed dataset_type and org. Supports version 2 and 3 templates.
    """
    geno = get_geno(dataset_type)
    version = geno.get('template_version', 2)
    book = openpyxl.Workbook()
    sheet = book.active
    refs = []
    choice_ranges = []
    for rnum, chromo in enumerate(geno['resources'], 1):
        if version == 2:
            _populate_excel_sheet_v2(sheet, chromo, org, refs)
        elif version == 3:
            choice_ranges.append(_populate_excel_sheet(
                sheet, geno, chromo, org, refs, rnum))
        # each data sheet is protected but rows/columns stay resizable
        sheet.protection.enabled = True
        sheet.protection.formatRows = False
        sheet.protection.formatColumns = False
        sheet = book.create_sheet()
    # at this point `sheet` is a fresh sheet created by the last loop
    # iteration; it becomes the reference sheet
    if version == 2:
        # NOTE(review): relies on `chromo` leaking from the loop above
        # (last resource) — confirm that is what the v2 helper expects
        _populate_reference_sheet_v2(sheet, chromo, refs)
    elif version == 3:
        _populate_reference_sheet(sheet, geno, refs)
    sheet.title = 'reference'
    sheet.protection.enabled = True
    if version == 2:
        return book
    # version 3 only: hidden per-resource error ('e') and reference
    # ('r') sheets, numbered to match the resource order
    for i, (chromo, cranges) in enumerate(
            zip(geno['resources'], choice_ranges), 1):
        sheet = book.create_sheet()
        _populate_excel_e_sheet(sheet, chromo, cranges)
        sheet.title = 'e{i}'.format(i=i)
        sheet.protection.enabled = True
        sheet.sheet_state = 'hidden'
        sheet = book.create_sheet()
        _populate_excel_r_sheet(sheet, chromo)
        sheet.title = 'r{i}'.format(i=i)
        sheet.protection.enabled = True
        sheet.sheet_state = 'hidden'
    return book
def _rebuild(self, csv_files=None, solr_url=None):
    """
    Implement rebuild command

    :param csv_files: sequence of paths to .csv files for input
    :type csv_files: sequence of str

    :return: Nothing
    :rtype: None
    """
    # wipe the ati index, then repopulate either from the given csv
    # files or from the datastore contents of every organization
    self._clear_index(solr_url, False)
    conn = solr_connection('ati', solr_url)
    lc = LocalCKAN()
    if csv_files:
        for csv_file in csv_files:
            print csv_file + ':'
            firstpart, filename = os.path.split(csv_file)
            assert filename.endswith('.csv')
            resource_name = filename[:-4]  # file name (minus .csv) selects the chromo
            chromo = get_chromo(resource_name)
            geno = get_geno(chromo['dataset_type'])
            # this command only rebuilds its own target dataset
            assert geno.get('target_dataset') == TARGET_DATASET
            for org_id, records in csv_data_batch(csv_file, chromo):
                records = [dict((k, safe_for_solr(v)) for k, v in
                                row_dict.items()) for row_dict in records]
                try:
                    org_detail = lc.action.organization_show(id=org_id)
                except NotFound:
                    continue  # skip records for organizations that no longer exist
                print " {0:s} {1}".format(org_id, len(records))
                _update_records(records, org_detail, conn)
    else:
        for org_id in lc.action.organization_list():
            count = 0
            org_detail = lc.action.organization_show(id=org_id)
            for resource_name, records in data_batch(org_detail['id'], lc, TARGET_DATASET):
                _update_records(records, org_detail, conn)
                count += len(records)
            print org_id, count
    print "commit"
    conn.commit()
def data_batch(org_id, lc, target_dataset):
    """
    Generator of dataset dicts for organization with name org

    :param org_id: the id for the organization of interest
    :ptype org_id: str
    :param lc: local CKAN
    :ptype lc: obj
    :param target_dataset: name of target dataset (e.g., 'ati', 'pd', etc.)
    :ptype target_dataset: str

    :return generates batches of dataset dict records
    :rtype batch of dataset dict records
    """
    dataset_types = get_dataset_types()
    # find the first dataset type whose geno targets target_dataset
    for dataset_type in dataset_types:
        geno = get_geno(dataset_type)
        if geno.get('target_dataset') == target_dataset:
            break
    else:
        return  # no matching dataset type: nothing to yield
    # rows=2 is enough to detect the duplicate-dataset case below
    result = lc.action.package_search(
        q="type:{0:s} owner_org:{1:s}".format(dataset_type, org_id),
        rows=2)['results']
    if not result:
        return
    if len(result) != 1:
        # warn but carry on with the first dataset found
        # NOTE(review): message has no trailing newline — confirm intended
        sys.stderr.write('1 record expected for %s %s, found %d' %
                         (dataset_type, org_id, len(result)))
    dataset = result[0]
    # page through each resource's datastore records, BATCH_SIZE at a time
    for resource in dataset['resources']:
        offset = 0
        while True:
            rval = lc.action.datastore_search(
                resource_id=resource['id'],
                limit=BATCH_SIZE,
                offset=offset)
            records = rval['records']
            if not records:
                break
            offset += len(records)
            yield records
def _action_find_dataset(context, data_dict):
    '''
    Shared helper for actions that look up a dataset by its recombinant
    dataset type and owning organization (name or id).

    Returns (lc, geno, results) where results holds up to 2 matching
    packages.  Raises ValidationError when the dataset type has no
    recombinant definition.
    '''
    dataset_type = get_or_bust(data_dict, 'dataset_type')
    owner_org = get_or_bust(data_dict, 'owner_org')
    try:
        geno = get_geno(dataset_type)
    except RecombinantException:
        raise ValidationError(
            {'dataset_type': _("Recombinant dataset type not found")})
    lc = LocalCKAN(username=context['user'])
    # rows=2 so callers can detect a duplicate-dataset condition
    query = "type:%s organization:%s" % (dataset_type, owner_org)
    found = lc.action.package_search(q=query, rows=2)
    return lc, geno, found['results']
def _action_find_dataset(context, data_dict):
    '''
    Common lookup used by several actions: resolve the recombinant
    definition for data_dict['dataset_type'] and search for the
    matching package owned by data_dict['owner_org'].

    :returns: (LocalCKAN client, geno, list of up to 2 package dicts)
    :raises ValidationError: when the dataset type is not recombinant
    '''
    dataset_type = get_or_bust(data_dict, 'dataset_type')
    owner_org = get_or_bust(data_dict, 'owner_org')
    try:
        geno = get_geno(dataset_type)
    except RecombinantException:
        raise ValidationError(
            {'dataset_type': _("Recombinant dataset type not found")})
    ckan = LocalCKAN(username=context['user'])
    search_response = ckan.action.package_search(
        q="type:%s organization:%s" % (dataset_type, owner_org),
        rows=2)
    return ckan, geno, search_response['results']
def csv_data_batch(csv_path, target_dataset):
    """
    Generator of dataset records from csv file

    :param csv_path: file to parse

    :return: yields (org-id, records) tuples, each holding at most
        BATCH_SIZE record dicts for a single organization
    """
    records = []
    current_owner_org = None
    firstpart, filename = os.path.split(csv_path)
    assert filename.endswith(".csv")
    # the csv file name (minus extension) names the resource / chromo
    chromo = get_chromo(filename[:-4])
    geno = get_geno(chromo["dataset_type"])
    assert geno.get("target_dataset") == target_dataset
    with open(csv_path) as f:
        csv_in = DictReader(f)
        cols = csv_in.unicode_fieldnames
        # all chromo fields must appear, in order, before the final two
        # columns (presumably owner_org / owner_org_title — not checked)
        expected = [f["datastore_id"] for f in chromo["fields"]]
        assert cols[:-2] == expected, "column mismatch:\n{0}\n{1}".format(cols[:-2], expected)
        for row_dict in csv_in:
            owner_org = row_dict.pop("owner_org")
            owner_org_title = row_dict.pop("owner_org_title")  # dropped from the record
            if owner_org != current_owner_org:
                # flush the previous organization's partial batch
                if records:
                    yield (current_owner_org, records)
                records = []
                current_owner_org = owner_org
            row_dict = dict((k, safe_for_solr(v)) for k, v in row_dict.items())
            records.append(row_dict)
            if len(records) >= BATCH_SIZE:
                yield (current_owner_org, records)
                records = []
    # final partial batch
    if records:
        yield (current_owner_org, records)
def excel_template(dataset_type, org):
    """
    Build an openpyxl.Workbook for *dataset_type* and *org*: the active
    sheet receives the header fields for every resource definition, and
    a second sheet named 'reference' lists the field/key/value rows
    collected while populating the data sheet.
    """
    geno = get_geno(dataset_type)
    book = openpyxl.Workbook()
    data_sheet = book.active
    reference_rows = []
    for chromo in geno['resources']:
        _populate_excel_sheet(data_sheet, chromo, org, reference_rows)
    reference = book.create_sheet()
    reference.title = 'reference'
    reference.append([u'field', u'key', u'value'])
    for row in reference_rows:
        reference.append(row)
    return book
def excel_template(dataset_type, org):
    """
    Create the Excel template workbook for the given dataset_type and
    org.  Each resource definition contributes its header fields to the
    workbook's active sheet; the rows gathered during that pass are
    written to a second 'reference' sheet with a field/key/value header.
    """
    definition = get_geno(dataset_type)
    workbook = openpyxl.Workbook()
    main_sheet = workbook.active
    collected = []
    for resource_def in definition['resources']:
        _populate_excel_sheet(main_sheet, resource_def, org, collected)
    ref_sheet = workbook.create_sheet()
    ref_sheet.title = 'reference'
    ref_sheet.append([u'field', u'key', u'value'])
    for line in collected:
        ref_sheet.append(line)
    return workbook
def csv_data_batch(csv_path, target_dataset):
    """
    Generator of dataset records from csv file

    :param csv_path: file to parse

    :return: yields (resource_name, org-id, records) tuples, each
        holding at most BATCH_SIZE record dicts for one organization
    """
    records = []
    current_owner_org = None
    firstpart, filename = os.path.split(csv_path)
    assert filename.endswith('.csv')
    # the csv file name (minus extension) names the resource / chromo
    resource_name = filename[:-4]
    chromo = get_chromo(resource_name)
    geno = get_geno(chromo['dataset_type'])
    assert geno.get('target_dataset') == target_dataset
    with open(csv_path) as f:
        csv_in = DictReader(f)
        cols = csv_in.unicode_fieldnames
        # all chromo fields must appear, in order, before the final two
        # columns (presumably owner_org / owner_org_title — not checked)
        expected = [f['datastore_id'] for f in chromo['fields']]
        assert cols[:-2] == expected, 'column mismatch:\n{0}\n{1}'.format(
            cols[:-2], expected)
        for row_dict in csv_in:
            owner_org = row_dict.pop('owner_org')
            owner_org_title = row_dict.pop('owner_org_title')  # dropped from the record
            if owner_org != current_owner_org:
                # flush the previous organization's partial batch
                if records:
                    yield (resource_name, current_owner_org, records)
                records = []
                current_owner_org = owner_org
            row_dict = dict((k, safe_for_solr(v)) for k, v in row_dict.items())
            records.append(row_dict)
            if len(records) >= BATCH_SIZE:
                yield (resource_name, current_owner_org, records)
                records = []
    # final partial batch
    if records:
        yield (resource_name, current_owner_org, records)
def _show(self, dataset_type, org_name):
    """
    Display some information about the status of recombinant datasets

    Prints one section per dataset type (all types when dataset_type is
    falsy).  When a specific dataset_type is given, each package and its
    resources are listed in detail; a summary count follows either way.
    """
    orgs = [org_name] if org_name else self._get_orgs()
    types = [dataset_type] if dataset_type else get_dataset_types()
    for dtype in types:
        print u'{geno[title]} ({dtype})'.format(
            geno=get_geno(dtype), dtype=dtype).encode('utf-8')
        packages = self._get_packages(dtype, orgs)
        if dataset_type:
            # detailed per-package / per-resource listing
            for p in packages:
                print p['owner_org']
                if 'error' in p:
                    print ' *** {p[error]}'.format(p=p)
                elif not p['metadata_correct']:
                    print ' ! metadata needs to be updated'
                for r in p['resources']:
                    # trailing comma: py2 print without newline, so the
                    # row count / error continues on the same line
                    print ' - id:{r[id]} {r[name]}'.format(r=r),
                    if 'error' in r:
                        print ' *** {r[error]}'.format(r=r)
                    else:
                        print 'rows:{r[datastore_rows]}'.format(r=r)
                        if not r['datastore_correct']:
                            print ' ! datastore needs to be updated'
                        if not r['metadata_correct']:
                            print ' ! metadata needs to be updated'
        # summary for this dataset type
        if len(packages) != len(orgs):
            print (' > %d orgs but %d records found'
                   % (len(orgs), len(packages)))
        else:
            print (' > %d datasets found' % (len(packages),))
        need_update = sum(1 for p in packages if not p['all_correct'])
        if need_update:
            print (' --> %d need to be updated' % need_update)
def _show(self, dataset_type, org_name):
    """
    Display some information about the status of recombinant datasets

    Prints one section per dataset type (all types when dataset_type is
    falsy).  When a specific dataset_type is given, each package and its
    resources are listed in detail; a summary count follows either way.
    """
    orgs = [org_name] if org_name else self._get_orgs()
    types = [dataset_type] if dataset_type else get_dataset_types()
    for dtype in types:
        print u'{geno[title]} ({dtype})'.format(
            geno=get_geno(dtype), dtype=dtype).encode('utf-8')
        packages = self._get_packages(dtype, orgs)
        if dataset_type:
            # detailed per-package / per-resource listing
            for p in packages:
                print p['owner_org']
                if 'error' in p:
                    print ' *** {p[error]}'.format(p=p)
                elif not p['metadata_correct']:
                    print ' ! metadata needs to be updated'
                for r in p['resources']:
                    # trailing comma: py2 print without newline, so the
                    # row count / error continues on the same line
                    print ' - id:{r[id]} {r[name]}'.format(r=r),
                    if 'error' in r:
                        print ' *** {r[error]}'.format(r=r)
                    else:
                        print 'rows:{r[datastore_rows]}'.format(r=r)
                        if not r['datastore_correct']:
                            print ' ! datastore needs to be updated'
                        if not r['metadata_correct']:
                            print ' ! metadata needs to be updated'
        # summary for this dataset type
        if len(packages) != len(orgs):
            print(' > %d orgs but %d records found'
                  % (len(orgs), len(packages)))
        else:
            print(' > %d datasets found' % (len(packages), ))
        need_update = sum(1 for p in packages if not p['all_correct'])
        if need_update:
            print(' --> %d need to be updated' % need_update)
def upload(self, id):
    """
    Controller action: accept an uploaded spreadsheet and load its
    contents into the central system for dataset *id*.

    On success, flashes a confirmation message and redirects back to the
    dataset read page.  On invalid input (no file, or bad spreadsheet
    data), re-renders the preview table with the error message.
    """
    package_type = self._get_package_type(id)
    geno = get_geno(package_type)
    lc = ckanapi.LocalCKAN(username=c.user)
    dataset = lc.action.package_show(id=id)
    try:
        # an empty string means the form was submitted with no file chosen
        if request.POST['xls_update'] == '':
            raise BadExcelData('You must provide a valid file')
        _process_upload_file(lc, dataset,
                             request.POST['xls_update'].file, geno)
        h.flash_success(
            _("Your file was successfully uploaded into the central system."
            ))
        redirect(h.url_for(controller='package', action='read', id=id))
    except BadExcelData as e:  # 'as' form: valid on py2.6+ and py3
        org = lc.action.organization_show(id=dataset['owner_org'])
        return self.preview_table(
            resource_name=dataset['resources'][0]['name'],
            owner_org=org['name'],
            errors=[e.message])
def upload(self, id):
    """
    Controller action: accept an uploaded spreadsheet and load its
    contents into the central system for dataset *id*.

    On success, flashes a confirmation message and redirects back to the
    dataset read page.  On invalid input (no file, or bad spreadsheet
    data), re-renders the package edit template with the error message.
    """
    package_type = self._get_package_type(id)
    geno = get_geno(package_type)
    lc = ckanapi.LocalCKAN(username=c.user)
    dataset = lc.action.package_show(id=id)
    try:
        # an empty string means the form was submitted with no file chosen
        if request.POST['xls_update'] == '':
            raise BadExcelData('You must provide a valid file')
        _process_upload_file(
            lc, dataset, request.POST['xls_update'].file, geno)
        h.flash_success(_(
            "Your file was successfully uploaded into the central system."
        ))
        redirect(h.url_for(controller='package', action='read', id=id))
    except BadExcelData as e:  # 'as' form: valid on py2.6+ and py3
        x_vars = {'errors': [e.message], 'action': 'edit'}
        c.pkg_dict = dataset
        return render(self._edit_template(package_type), extra_vars=x_vars)
def _dataset_types(self, dataset_types):
    """
    Print each dataset type followed by its resource names,
    one type per line.

    NOTE(review): the dataset_types argument is unused here — the list
    comes from self._expand_dataset_types(); confirm that is intended.
    """
    for t in self._expand_dataset_types():
        print t + ': ' + ' '.join(
            c['resource_name'] for c in get_geno(t)['resources'])
def _dataset_types(self, dataset_types):
    """
    Print each dataset type followed by its resource names,
    one type per line.

    NOTE(review): the dataset_types argument is unused here — the list
    comes from self._expand_dataset_types(); confirm that is intended.
    """
    for t in self._expand_dataset_types():
        print t + ': ' + ' '.join(c['resource_name']
                                  for c in get_geno(t)['resources'])