def _update_datasets(app):
    """
    Updates from conp-datasets
    """
    import os
    import fnmatch
    import json
    from datetime import datetime

    from app import db
    from app.models import Dataset as DBDataset
    from datalad import api
    from datalad.api import Dataset as DataladDataset

    datasetspath = app.config['DATA_PATH']

    d = DataladDataset(path=datasetspath + '/conp-dataset')
    if not d.is_installed():
        api.clone(
            source='https://github.com/CONP-PCNO/conp-dataset',
            path=datasetspath + '/conp-dataset')
        d = DataladDataset(path=datasetspath + '/conp-dataset')
        d.install(path='', recursive=True)

    try:
        d.update(path='', merge=True, recursive=True)
    except Exception as e:
        print("\033[91m")
        print("[ERROR ] An exception occurred in datalad update.")
        print(e.args)
        print("\033[0m")
        return

    print('[INFO ] conp-dataset update complete')
    print('[INFO ] Updating subdatasets')

    for ds in d.subdatasets():
        print('[INFO ] Updating ' + ds['gitmodule_url'])
        subdataset = DataladDataset(path=ds['path'])
        if not subdataset.is_installed():
            try:
                api.clone(source=ds['gitmodule_url'], path=ds['path'])
                subdataset = DataladDataset(path=ds['path'])
                subdataset.install(path='')
            except Exception as e:
                print("\033[91m")
                print("[ERROR ] An exception occurred in datalad install for "
                      + str(ds) + ".")
                print(e.args)
                print("\033[0m")
                continue

        # Locate the DATS.json descriptor in the subdataset directory
        dirs = os.listdir(ds['path'])
        descriptor = ''
        for file in dirs:
            if fnmatch.fnmatch(file.lower(), 'dats.json'):
                descriptor = file

        if descriptor == '':
            print("\033[91m")
            print("[ERROR ] DATS.json file can't be found in " + ds['path'] + ".")
            print("\033[0m")
            continue

        try:
            with open(os.path.join(ds['path'], descriptor), 'r') as f:
                dats = json.load(f)
        except Exception as e:
            print("\033[91m")
            print("[ERROR ] Descriptor file can't be read.")
            print(e.args)
            print("\033[0m")
            continue

        # Use the DATS.json data to fill the datasets table;
        # avoid duplication / REPLACE instead of insert
        dataset = DBDataset.query.filter_by(
            dataset_id=ds['gitmodule_name']).first()
        if dataset is None:
            dataset = DBDataset()
            dataset.dataset_id = ds['gitmodule_name']
            dataset.date_created = datetime.utcnow()

        dataset.date_updated = datetime.utcnow()
        dataset.fspath = ds['path']
        dataset.description = dats.get(
            'description', 'No description in DATS.json')
        dataset.name = dats.get(
            'title', os.path.basename(dataset.dataset_id))

        db.session.merge(dataset)
        db.session.commit()
        print('[INFO ] ' + ds['gitmodule_name'] + ' updated.')
def _update_datasets(app):
    """
    Updates from conp-datasets
    """
    import os
    import fnmatch
    import json
    import uuid
    from datetime import datetime
    from pathlib import Path

    import git
    from sqlalchemy import exc

    from app import db
    from app.models import ArkId
    from app.models import Dataset as DBDataset
    from app.models import DatasetAncestry as DBDatasetAncestry
    from datalad import api
    from datalad.api import Dataset as DataladDataset

    datasetsdir = Path(app.config['DATA_PATH']) / 'conp-dataset'
    datasetsdir.mkdir(parents=True, exist_ok=True)

    # Initialize the git repository object
    try:
        repo = git.Repo(datasetsdir)
    except git.exc.InvalidGitRepositoryError:
        repo = git.Repo.clone_from(
            'https://github.com/CONP-PCNO/conp-dataset',
            datasetsdir,
            branch='master')

    # Update to the latest commit
    origin = repo.remotes.origin
    origin.pull('master')
    repo.submodule_update(recursive=False, keep_going=True)

    d = DataladDataset(path=datasetsdir)
    if not d.is_installed():
        api.clone(
            source='https://github.com/CONP-PCNO/conp-dataset',
            path=datasetsdir)
        d = DataladDataset(path=datasetsdir)

    try:
        d.install(path='', recursive=True)
    except Exception as e:
        print("\033[91m")
        print("[ERROR ] An exception occurred in datalad update.")
        print(e.args)
        print("\033[0m")
        return

    print('[INFO ] conp-dataset update complete')
    print('[INFO ] Updating subdatasets')

    for ds in d.subdatasets():
        print('[INFO ] Updating ' + ds['gitmodule_url'])
        subdataset = DataladDataset(path=ds['path'])
        if not subdataset.is_installed():
            try:
                api.clone(source=ds['gitmodule_url'], path=ds['path'])
                subdataset = DataladDataset(path=ds['path'])
                subdataset.install(path='')
            except Exception as e:
                print("\033[91m")
                print("[ERROR ] An exception occurred in datalad install for "
                      + str(ds) + ".")
                print(e.args)
                print("\033[0m")
                continue

        # The following relates to the DATS.json files
        # of the projects directory in the conp-dataset repo.
        # Skip directories that aren't projects.
        patterns = [app.config['DATA_PATH'] + '/conp-dataset/projects/*']
        if not any(fnmatch.fnmatch(ds['path'], pattern) for pattern in patterns):
            continue

        # Locate the DATS.json descriptor in the project directory
        dirs = os.listdir(ds['path'])
        descriptor = ''
        for file in dirs:
            if fnmatch.fnmatch(file.lower(), 'dats.json'):
                descriptor = file

        if descriptor == '':
            print("\033[91m")
            print("[ERROR ] DATS.json file can't be found in " + ds['path'] + ".")
            print("\033[0m")
            continue

        try:
            with open(os.path.join(ds['path'], descriptor), 'r') as f:
                dats = json.load(f)
        except Exception as e:
            print("\033[91m")
            print("[ERROR ] Descriptor file can't be read.")
            print(e.args)
            print("\033[0m")
            continue

        # Use the DATS.json data to fill the datasets table;
        # avoid duplication / REPLACE instead of insert
        dataset = DBDataset.query.filter_by(
            dataset_id=ds['gitmodule_name']).first()

        # Pull the timestamp of the first commit in the git log
        # for the dataset creation date
        createDate = datetime.utcnow()
        try:
            createTimeStamp = os.popen(
                "git -C {} log --pretty=format:%ct --reverse | head -1".format(
                    ds['path'])).read()
            createDate = datetime.fromtimestamp(int(createTimeStamp))
        except Exception:
            print("[ERROR ] Create date couldn't be read.")

        # Pull the timestamp of the first merge of the submodule into
        # conp-dataset for the date the dataset was added to the portal
        firstMergeDate = datetime.utcnow()
        try:
            firstMergeTimeStamp = os.popen(
                "git -C {} log --pretty=format:%ct --reverse {} | head -1".format(
                    app.config['DATA_PATH'] + "/conp-dataset",
                    ds['path'])).read()
            firstMergeDate = datetime.fromtimestamp(int(firstMergeTimeStamp))
        except Exception:
            print("[ERROR ] First merge date of the submodule dataset could not be read.")

        # Last commit in the git log for the dataset update date
        updateDate = datetime.utcnow()
        try:
            updateTimeStamp = os.popen(
                "git -C {} log --pretty=format:%ct | head -1".format(
                    ds['path'])).read()
            updateDate = datetime.fromtimestamp(int(updateTimeStamp))
        except Exception:
            print("[ERROR ] Update date couldn't be read.")

        # Get the remote URL
        remoteUrl = None
        try:
            remoteUrl = os.popen(
                "git -C {} config --get remote.origin.url".format(
                    ds['path'])).read()
        except Exception:
            print("[ERROR ] Remote URL couldn't be read.")

        if dataset is None:
            dataset = DBDataset()
            dataset.dataset_id = ds['gitmodule_name']
            dataset.date_created = createDate
            dataset.date_added_to_portal = firstMergeDate

        if dataset.date_created != createDate:
            dataset.date_created = createDate

        # Check for dataset ancestry
        extraprops = dats.get('extraProperties', [])
        for prop in extraprops:
            if prop.get('category') == 'parent_dataset_id':
                for x in prop.get('values', []):
                    if x.get('value', None) is None:
                        continue
                    datasetAncestry = DBDatasetAncestry()
                    datasetAncestry.id = str(uuid.uuid4())
                    datasetAncestry.parent_dataset_id = 'projects/' + x.get('value', None)
                    datasetAncestry.child_dataset_id = dataset.dataset_id
                    try:
                        db.session.merge(datasetAncestry)
                        db.session.commit()
                    except exc.IntegrityError:
                        # We already have a record of this ancestry
                        db.session.rollback()

        if not dataset.date_added_to_portal:
            dataset.date_added_to_portal = firstMergeDate

        dataset.date_updated = updateDate
        dataset.fspath = ds['path']
        dataset.remoteUrl = remoteUrl
        dataset.description = dats.get(
            'description', 'No description in DATS.json')
        dataset.name = dats.get(
            'title', os.path.basename(dataset.dataset_id))

        db.session.merge(dataset)
        db.session.commit()

        # If the dataset does not have an ARK identifier yet, generate it.
        # ark_id_minter and save_ark_id_in_database are helpers defined
        # elsewhere in the application.
        dataset_with_ark_id_list = [
            row[0] for row in db.session.query(ArkId.dataset_id).all()
        ]
        if dataset.dataset_id not in dataset_with_ark_id_list:
            new_ark_id = ark_id_minter(app, 'dataset')
            save_ark_id_in_database(app, 'dataset', new_ark_id, dataset.dataset_id)

        print('[INFO ] ' + ds['gitmodule_name'] + ' updated.')
def _update_datasets(app):
    """
    Updates from conp-datasets
    """
    import os
    import fnmatch
    import json
    from datetime import datetime
    from pathlib import Path

    import git

    from app import db
    from app.models import Dataset as DBDataset
    from datalad import api
    from datalad.api import Dataset as DataladDataset

    datasetsdir = Path(app.config['DATA_PATH']) / 'conp-dataset'
    datasetsdir.mkdir(parents=True, exist_ok=True)

    # Initialize the git repository object
    try:
        repo = git.Repo(datasetsdir)
    except git.exc.InvalidGitRepositoryError:
        repo = git.Repo.clone_from(
            'https://github.com/CONP-PCNO/conp-dataset',
            datasetsdir,
            branch='master'
        )

    # Update to the latest commit
    origin = repo.remotes.origin
    origin.pull('master')
    repo.submodule_update(recursive=False, keep_going=True)

    d = DataladDataset(path=datasetsdir)
    if not d.is_installed():
        api.clone(
            source='https://github.com/CONP-PCNO/conp-dataset',
            path=datasetsdir
        )
        d = DataladDataset(path=datasetsdir)

    try:
        d.install(path='', recursive=True)
    except Exception as e:
        print("\033[91m")
        print("[ERROR ] An exception occurred in datalad update.")
        print(e.args)
        print("\033[0m")
        return

    print('[INFO ] conp-dataset update complete')
    print('[INFO ] Updating subdatasets')

    for ds in d.subdatasets():
        print('[INFO ] Updating ' + ds['gitmodule_url'])
        subdataset = DataladDataset(path=ds['path'])
        if not subdataset.is_installed():
            try:
                api.clone(
                    source=ds['gitmodule_url'],
                    path=ds['path']
                )
                subdataset = DataladDataset(path=ds['path'])
                subdataset.install(path='')
            except Exception as e:
                print("\033[91m")
                print("[ERROR ] An exception occurred in datalad install for "
                      + str(ds) + ".")
                print(e.args)
                print("\033[0m")
                continue

        # Locate the DATS.json descriptor in the subdataset directory
        dirs = os.listdir(ds['path'])
        descriptor = ''
        for file in dirs:
            if fnmatch.fnmatch(file.lower(), 'dats.json'):
                descriptor = file

        if descriptor == '':
            print("\033[91m")
            print("[ERROR ] DATS.json file can't be found in " + ds['path'] + ".")
            print("\033[0m")
            continue

        try:
            with open(os.path.join(ds['path'], descriptor), 'r') as f:
                dats = json.load(f)
        except Exception as e:
            print("\033[91m")
            print("[ERROR ] Descriptor file can't be read.")
            print(e.args)
            print("\033[0m")
            continue

        # Use the DATS.json data to fill the datasets table;
        # avoid duplication / REPLACE instead of insert
        dataset = DBDataset.query.filter_by(
            dataset_id=ds['gitmodule_name']).first()
        if dataset is None:
            dataset = DBDataset()
            dataset.dataset_id = ds['gitmodule_name']
            dataset.date_created = datetime.utcnow()

        dataset.date_updated = datetime.utcnow()
        dataset.fspath = ds['path']
        dataset.description = dats.get(
            'description', 'No description in DATS.json')
        dataset.name = dats.get(
            'title', os.path.basename(dataset.dataset_id))

        db.session.merge(dataset)
        db.session.commit()
        print('[INFO ] ' + ds['gitmodule_name'] + ' updated.')
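# Usage sketch (not part of the portal code above): _update_datasets expects the
# portal's configured Flask application object, since it reads
# app.config['DATA_PATH'] and writes through Flask-SQLAlchemy's db.session.
# The driver below is a hypothetical example; the function name, the loop, and
# the interval are assumptions, not taken from the code above.
import time


def run_periodic_update(app, interval_seconds=6 * 3600):
    """Hypothetical driver that refreshes the dataset tables on a fixed interval."""
    while True:
        # db.session needs an application context when used outside a request.
        with app.app_context():
            _update_datasets(app)
        time.sleep(interval_seconds)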