def test_dataset_init_invalid(self):
    """Test Dataset.set() with invalid data."""
    pdv = Dataset()

    # Invalid data types must raise an AssertionError.
    for data in test_config["invalid_set_types"]:
        with pytest.raises(AssertionError):
            pdv.set(data)
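# A minimal sketch of the `test_config` mapping used above, assuming Dataset.set()
# rejects anything that is not a dict; the concrete invalid values in the real
# test configuration may differ.
test_config = {
    "invalid_set_types": [None, [], "string", 42],
}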
def test_dataset_set_dv_up(self, import_dataset_min_dict):
    """Test Dataset.set() with format=`dv_up`.

    Parameters
    ----------
    import_dataset_min_dict : dict
        Fixture, which returns a flat dataset dict().

    """
    ds = Dataset()
    data = import_dataset_min_dict
    ds.set(data)

    # dataset
    assert ds.license == 'CC0'
    assert ds.termsOfUse == 'CC0 Waiver'
    assert ds.termsOfAccess == 'Terms of Access'

    # citation
    assert ds.citation_displayName == 'Citation Metadata'
    assert ds.title == 'Replication Data for: Title'
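# A minimal sketch of what the `import_dataset_min_dict` fixture could return,
# derived from the assertions in test_dataset_set_dv_up() above. The real fixture
# in the test suite may carry additional keys.
@pytest.fixture
def import_dataset_min_dict():
    """Return a flat dataset dict() with minimal metadata (assumed values)."""
    return {
        "license": "CC0",
        "termsOfUse": "CC0 Waiver",
        "termsOfAccess": "Terms of Access",
        "citation_displayName": "Citation Metadata",
        "title": "Replication Data for: Title",
    }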
def create_testdata(config_file: str, force: bool) -> None:
    """Create test data defined in a config file.

    Creates a pre-defined set of test data on your instance. By default, the
    function uses the AUSSDA test data repository, which is not yet publicly
    available. If `PRODUCTION` is `true`, this function will not execute unless
    `--force` is passed to the call. This protects a production instance from
    unwanted changes.
    """
    # Init
    if config.PRODUCTION and not force:
        print(
            "Creating testdata on a PRODUCTION instance is not allowed. "
            "Use --force to override."
        )
        sys.exit()
    pid_idx = []
    users = read_json(config.USER_FILENAME)
    workflow = read_json(os.path.join(ROOT_DIR, config_file))

    # Dataverses
    for dv_conf in workflow["dataverses"]:
        dv_alias = None
        if "create" in dv_conf:
            api = NativeApi(
                config.BASE_URL, users[dv_conf["create"]["user-handle"]]["api-token"]
            )
            dv = Dataverse()
            dv_filename = os.path.join(ROOT_DIR, dv_conf["create"]["metadata-filename"])
            dv.from_json(read_file(dv_filename))
            if "update" in dv_conf["create"]:
                for key, val in dv_conf["create"]["update"].items():
                    kwargs = {key: val}
                    dv.set(kwargs)
            dv_alias = dv.get()["alias"]
            resp = api.create_dataverse(dv_conf["create"]["parent"], dv.json())
        if "publish" in dv_conf:
            api = NativeApi(
                config.BASE_URL, users[dv_conf["publish"]["user-handle"]]["api-token"]
            )
            if not dv_alias and "alias" in dv_conf["publish"]:
                dv_alias = dv_conf["publish"]["alias"]
            resp = api.publish_dataverse(dv_alias)

    # Datasets
    for ds_conf in workflow["datasets"]:
        pid = None
        if "create" in ds_conf:
            api = NativeApi(
                config.BASE_URL, users[ds_conf["create"]["user-handle"]]["api-token"]
            )
            ds = Dataset()
            ds_filename = os.path.join(ROOT_DIR, ds_conf["create"]["metadata-filename"])
            ds.from_json(read_file(ds_filename))
            if "update" in ds_conf["create"]:
                for key, val in ds_conf["create"]["update"].items():
                    kwargs = {key: val}
                    ds.set(kwargs)
            resp = api.create_dataset(dv_alias, ds.json())
            pid = resp.json()["data"]["persistentId"]
            pid_idx.append(pid)
        if "publish" in ds_conf:
            if not pid:
                print("ERROR: PID missing!")
                sys.exit()
            api = NativeApi(
                config.BASE_URL, users[ds_conf["publish"]["user-handle"]]["api-token"]
            )
            resp = api.publish_dataset(pid, release_type="major")

    # Datafiles
    for dataset_id, ds_datafiles in workflow["datafiles"].items():
        # Map the config's dataset id to the PID of the dataset created above.
        if int(dataset_id) == workflow["datasets"][int(dataset_id)]["id"]:
            pid = pid_idx[int(dataset_id)]
        else:
            print("ERROR: Dataset ID not matching.")
            sys.exit()
        for df_conf in ds_datafiles:
            if "upload" in df_conf:
                api = NativeApi(
                    config.BASE_URL,
                    users[df_conf["upload"]["user-handle"]]["api-token"],
                )
                metadata = read_json(df_conf["upload"]["metadata-filename"])
                df = Datafile()
                df.set(metadata)
                if "update" in df_conf["upload"]:
                    for key, val in df_conf["upload"]["update"].items():
                        kwargs = {key: val}
                        df.set(kwargs)
                df.set({"pid": pid})
                filename = df_conf["upload"]["filename"]
                resp = api.upload_datafile(pid, filename, df.json())
                # Tabular files get ingested by Dataverse and need more time.
                if filename.endswith((".sav", ".dta")):
                    sleep(30)
                else:
                    sleep(3)
            if "publish-dataset" in df_conf:
                api = NativeApi(
                    config.BASE_URL,
                    users[df_conf["publish-dataset"]["user-handle"]]["api-token"],
                )
                if df_conf["publish-dataset"]:
                    resp = api.publish_dataset(pid, release_type="major")
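# A minimal sketch of the workflow config JSON that create_testdata() reads, shown
# here as the equivalent Python dict. The keys mirror the lookups in the function
# above; the aliases, user handles, and filenames are made-up placeholders, not
# values from the real config file.
EXAMPLE_WORKFLOW = {
    "dataverses": [
        {
            "create": {
                "user-handle": "admin",
                "parent": "root",
                "metadata-filename": "data/dataverse.json",
                "update": {"alias": "test_dv"},
            },
            "publish": {"user-handle": "admin", "alias": "test_dv"},
        }
    ],
    "datasets": [
        {
            "id": 0,
            "create": {
                "user-handle": "admin",
                "metadata-filename": "data/dataset.json",
                "update": {"title": "Test Dataset"},
            },
            "publish": {"user-handle": "admin"},
        }
    ],
    "datafiles": {
        "0": [
            {
                "upload": {
                    "user-handle": "admin",
                    "metadata-filename": "data/datafile.json",
                    "filename": "data/datafile.csv",
                },
                "publish-dataset": {"user-handle": "admin"},
            }
        ]
    },
}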
class DataverseData:
    def __init__(self, REPO, validate=False):
        self.ext = PARSABLE_EXTENSIONS
        self.REPO = REPO
        self.mapping_dsid2pid = {}
        self.validate_df = validate
        self.g = Github(GITHUB_TOKEN)
        self.repo = self.g.get_repo(REPO)
        self.urls_found = {}
        self.ds_id = 0
        self.DEBUG = True

    def githubsearch(self, thisquery):
        repositories = self.g.search_repositories(query=thisquery, sort='updated')
        return repositories

    def search(self, thisquery):
        search_api = SearchApi(BASE_URL, API_TOKEN)
        return search_api.search(thisquery).json()['data']

    def if_exist(self, thisquery):
        self.exists = False
        repoquery = "authorName:%s" % (thisquery)
        try:
            for item in self.search(repoquery)['items'][0]['authors']:
                if item == thisquery:
                    self.exists = True
                    print(item)
        except Exception:
            self.exists = False
        if self.DEBUG:
            print(self.exists)
        return self.exists

    def datasync(self):
        native_api = NativeApi(BASE_URL, API_TOKEN)
        # Turn the md5 hash of the repo name into a 6-digit integer dataset id.
        self.ds_id = str(int(self.make_dataset_id(self.REPO).hexdigest(), 16))[:6]
        metadata = self.make_dataset_metadata(self.REPO)
        print(metadata)
        self.ds = Dataset()
        self.ds.set(metadata)
        self.ds.displayName = metadata['title']
        print(self.ds.get())
        if self.DEBUG:
            print("[datasync]")
            print(self.ds)
            print(self.ds_id)
            print(self.ds.displayName)
        self.create_dataset(native_api, self.ds, DV_ALIAS, self.ds_id, BASE_URL)
        if self.DEBUG:
            print(metadata)
        self.upload_files_to_dataverse(self.ds_id, self.urls_found)
        return True

    def extract_urls(self, content: str) -> list:
        matches = re.findall(r"(http[^\s'\"\\]+)", content)
        pattern = re.compile(r"([^/\w]+)$")
        return [pattern.sub("", match) for match in matches]

    def decode_github_content(self, content: str) -> str:
        return base64.b64decode(content).decode("utf-8")

    def make_dataset_id(self, repo_name):
        return hashlib.md5(repo_name.encode("utf-8"))

    def make_default_dataset(self, data, repo_name):
        ds_id = self.make_dataset_id(repo_name)
        data[ds_id] = {'metadata': self.make_dataset_metadata(repo_name)}
        return data

    def make_dataset_metadata(self, repo_name):
        metadata = {}
        repo = self.g.get_repo(repo_name)
        metadata['termsOfAccess'] = ''
        metadata['title'] = 'Automatic uploads from {} github repository'.format(repo_name)
        metadata['subtitle'] = 'Automatic uploads from {} github repository'.format(repo_name)
        metadata['author'] = [{
            "authorName": repo_name,
            "authorAffiliation": "CoronaWhy"
        }]
        metadata['dsDescription'] = [{
            'dsDescriptionValue': format(repo.get_topics())
        }]
        if len(metadata['dsDescription']) < 3:
            metadata['dsDescription'] = [{'dsDescriptionValue': 'coronavirus'}]
        metadata['subject'] = ['Medicine, Health and Life Sciences']
        metadata['keyword'] = repo.get_topics()
        metadata['datasetContact'] = [{
            'datasetContactName': 'https://github.com/{}'.format(repo_name),
            'datasetContactEmail': '*****@*****.**'
        }]
        return metadata

    def make_file_metadata(self, repo_name, file, url):
        metadata = {}
        metadata['description'] = file
        metadata['filename'] = url
        metadata['datafile_id'] = hashlib.md5(url.encode("utf-8"))
        metadata['dataset_id'] = hashlib.md5(repo_name.encode("utf-8"))
        return metadata

    def create_dataset(self, api, ds, dv_alias, ds_id, base_url):
        if self.DEBUG:
            print("\n\n[create_dataset]")
            print(ds.get())
            # print(ds.to_json())
        resp = ''
        try:
            resp = api.create_dataset(dv_alias, ds.json())
            pid = resp.json()['data']['persistentId']
        except Exception:
            # print(resp.content)
            return resp, self.mapping_dsid2pid
        self.mapping_dsid2pid[ds_id] = pid
        time.sleep(1)
        print('{0}/dataset.xhtml?persistentId={1}&version=DRAFT'.format(base_url, pid))
        return resp

    # Implementation adapted from
    # http://guides.dataverse.org/en/latest/api/native-api.html#id62
    def upload_datafile(self, server, api_key, p_id, repo_name, filename, repo_file, url, columns):
        dataverse_server = server
        persistentId = p_id
        files = {'file': (url.split('/')[-1], open(filename, 'rb'))}
        desc = "Data snapshot from %s" % url
        cat = [repo_name.split('/')[1]]
        for col in columns:
            cat.append(col)
        params = dict(description=desc, directoryLabel=repo_file, categories=cat)
        payload = dict(jsonData=json.dumps(params))
        url_persistent_id = '%s/api/datasets/:persistentId/add?persistentId=%s&key=%s' % (
            dataverse_server, persistentId, api_key)
        print('-' * 40)
        print('making request')
        r = requests.post(url_persistent_id, data=payload, files=files)
        print('-' * 40)
        try:
            print(r.json())
        except ValueError:
            print(r.content)
        print(r.status_code)
        return

    def collect_urls(self):
        contents = self.repo.get_contents("")
        DEBUG = False
        while contents:
            file_content = contents.pop(0)
            urlfullpath = "%s/%s/%s/%s" % (gitroot, self.REPO, gitblob, file_content.path)
            rawurl = "%s/%s/%s/%s" % (gituserroot, self.REPO, gitmaster, file_content.path)
            rawurl = rawurl.replace(' ', '%20')
            if file_content.type == "dir":
                contents.extend(self.repo.get_contents(file_content.path))
                continue
            if len(PARSABLE_EXTENSIONS) == 0 or file_content.name.split('.')[-1] in PARSABLE_EXTENSIONS:
                if DEBUG:
                    print("%s -> %s" % (urlfullpath, rawurl))
                self.urls_found[file_content.path] = rawurl
        print('Found {} URLs'.format(len(self.urls_found)))
        return self.urls_found

    def upload_files_to_dataverse(self, ds_id, urls_found):
        for file, url in urls_found.items():
            columns = []
            if file:
                print(url)
                try:
                    # Retrieve the file into a temp location; skip the URL if it cannot be fetched.
                    tmpfile = urllib.request.urlretrieve(url)
                except Exception:
                    continue
                try:
                    filename = 'file://{}'.format(tmpfile[0])
                    # TODO: try gzipped datasets as well
                    # if not re.findall(r'(gz$|np$|nt$)', filename):
                    #     pd.read_csv(filename)  # try reading it as csv, if it fails continue
                    print("%s -> %s" % (filename, url))
                    if self.validate_df:
                        if re.search(r"(xls|xlsx)", url):
                            df = pd.read_excel(filename)
                        elif re.search(r"json", url):
                            df = pd.read_json(filename)
                        else:
                            df = pd.read_csv(filename)
                        columns = list(df.columns)
                        if self.DEBUG:
                            print("Columns: %s" % df.columns)
                    metadata = self.make_file_metadata(REPO, file, url)
                    print('- uploading the following dataset {}'.format(url))
                except Exception:
                    continue
                self.upload_datafile(BASE_URL, API_TOKEN, self.ds_id, self.REPO,
                                     tmpfile[0], file, url, columns)
        return
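# A hedged usage sketch of the DataverseData class above. It assumes the
# module-level names the class relies on (GITHUB_TOKEN, BASE_URL, API_TOKEN,
# DV_ALIAS, PARSABLE_EXTENSIONS, gitroot, gituserroot, gitblob, gitmaster) are
# defined elsewhere in the module; the repository name is a placeholder.
if __name__ == "__main__":
    dv_data = DataverseData("CoronaWhy/some-repo", validate=True)
    # Collect raw-file URLs from the GitHub repository ...
    dv_data.collect_urls()
    # ... and, if no dataset authored by this repo exists yet, create one and
    # upload the collected files.
    if not dv_data.if_exist("CoronaWhy/some-repo"):
        dv_data.datasync()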