def init(self, username, reponame, force, backend=None):
    """
    Initialize a Git repo

    Parameters
    ----------
    username, reponame : Repo name is tuple (name, reponame)
    force: force initialization of the repo even if exists
    backend: backend that must be used for this (e.g. s3)

    Returns
    -------
    The newly created Repo object, already registered via self.add().

    Raises
    ------
    RepositoryExists if the server-side repo dir exists and force is False.
    Exception if the local clone dir exists and force is False.
    """
    # NOTE(review): 'key' is computed but never used below — confirm
    # self.key() has no side effects before removing this line.
    key = self.key(username, reponame)

    # In local filesystem-based server, add a repo
    server_repodir = self.server_rootdir(username, reponame, create=False)

    # Force cleanup if needed: refuse to clobber unless force is set,
    # then wipe and recreate the bare server directory.
    if os.path.exists(server_repodir) and not force:
        raise RepositoryExists()
    if os.path.exists(server_repodir):
        shutil.rmtree(server_repodir)
    os.makedirs(server_repodir)

    # Initialize the repo as a bare git repository
    with cd(server_repodir):
        git.init(".", "--bare")

    # Give the backend (e.g. s3) a chance to set up hooks/state
    if backend is not None:
        backend.init_repo(server_repodir)

    # Now clone the filesystem-based repo into the working location
    repodir = self.rootdir(username, reponame, create=False)

    # Prepare it if needed — same clobber policy as the server dir
    if os.path.exists(repodir) and not force:
        raise Exception("Local repo already exists")
    if os.path.exists(repodir):
        shutil.rmtree(repodir)
    os.makedirs(repodir)

    # Now clone... (clone into the parent so git creates repodir itself)
    with cd(os.path.dirname(repodir)):
        git.clone(server_repodir, '--no-hardlinks')

    # Remote URL is the plain filesystem path unless the backend
    # supplies its own (e.g. an s3:// URL).
    url = server_repodir
    if backend is not None:
        url = backend.url(username, reponame)

    repo = Repo(username, reponame)
    repo.manager = self
    repo.remoteurl = url
    repo.rootdir = self.rootdir(username, reponame)

    self.add(repo)
    return repo
def add_raw(self, repo, files):
    """
    Stage files in the repo's git index (`git add`).

    Parameters
    ----------
    repo: Repo object whose rootdir is the git working tree
    files: list of paths (relative to rootdir) to add

    Returns
    -------
    Output of the underlying add command, or None if it failed.
    (Previously the result was computed but never returned.)
    """
    result = None
    with cd(repo.rootdir):
        try:
            result = self._run(["add"] + files)
        except Exception:
            # Best-effort, as before — but no longer a bare except,
            # so KeyboardInterrupt/SystemExit are not swallowed.
            pass
    return result
def clone_repo(self, url, gitdir):
    """
    Sync an s3-backed git repo into a local directory.

    Parameters
    ----------
    url: s3:// URL of the remote repo
    gitdir: local directory to sync into (created if missing)

    Raises
    ------
    Exception if the URL fails validation.
    """
    if not self.url_is_valid(url):
        raise Exception("Invalid URL")

    # exist_ok replaces the old bare try/except around makedirs
    os.makedirs(gitdir, exist_ok=True)

    print("Syncing into local directory", gitdir)
    with cd(gitdir):
        # Two supported sync clients; both mirror the remote into '.'
        if self.client == 'aws':
            cmd = ["aws", "s3", "sync", '--delete', url + "/", "."]
        else:
            cmd = ["s3cmd", "-c", self.s3cfg, "sync", url + "/", "."]
        self.run(cmd)
        print("Sync'd dataset with s3")

        # Make sure that hook has correct permissions; a missing
        # post-receive hook means this is a fresh dir, so initialize it.
        hooksdir = os.path.join(gitdir, 'hooks')
        postrecv_filename = os.path.join(hooksdir, 'post-receive')
        if os.path.exists(postrecv_filename):
            self.make_hook_executable(postrecv_filename)
        else:
            self.init_repo(gitdir)
def _run_generic_command(self, repo, cmd):
    """
    Run a generic command within the repo. Assumes that you
    are in the repo's root directory.

    Parameters
    ----------
    repo: Repo object providing rootdir
    cmd: command passed through to self._run

    Returns
    -------
    dict with 'cmd', 'status' ('success'/'error') and 'message'.
    """
    result = None
    with cd(repo.rootdir):
        # Dont use sh. It is not collecting the stdout of all
        # child processes.
        try:
            # BUG FIX: the command itself must run inside the try;
            # previously it ran before it, so failures escaped and
            # the error dict below was dead code.
            output = self._run(cmd)
            result = {
                'cmd': cmd,
                'status': 'success',
                'message': output,
            }
        except Exception as e:
            result = {
                'cmd': cmd,
                'status': 'error',
                'message': str(e)
            }

    return result
def executable_commit(filename, what=['commit', 'username', 'useremail', 'date']):
    """
    Gather commit attributes of an executable file from `git log`.

    Parameters
    ----------
    filename: path to a file tracked by git
    what: which attributes to collect (keys of the mapping below);
          the default list is never mutated, so the mutable default
          is safe here.

    Returns
    -------
    dict mapping each requested attribute to its value from the
    most recent commit touching *filename*.

    Raises
    ------
    Exception if *what* contains an unknown attribute name.
    """
    # NOTE(review): 'username' maps to '******' (not a git format code);
    # this looks deliberately redacted upstream — left untouched.
    mapping = {
        'commit': '%H',
        'username': '******',
        'useremail': '%ce',
        'date': '%cd'
    }

    # BUG FIX: previously this indexed mapping[w] for *unknown* keys,
    # raising KeyError instead of reporting which names are invalid.
    missing = [w for w in what if w not in mapping]
    if len(missing) > 0:
        print("Cannot gather commit attributes of executable", missing)
        raise Exception("Invalid specification")

    codes = ",".join([mapping[w] for w in what if w in mapping])
    with cd(os.path.dirname(filename)):
        cmd = 'git log -n 1 --date=iso --pretty="%s" -- %s ' % (codes, filename)
        output = run(cmd)
        output = output.strip()
        output = output.split(",")
        # Positions in the output line up with the requested attributes
        return {what[i]: output[i] for i in range(len(what))}
def executable_repopath(filename):
    """Return the repo-relative path of *filename* using
    `git rev-parse --show-prefix` run from the file's directory."""
    dirname = os.path.dirname(filename)
    basename = os.path.basename(filename)
    with cd(dirname):
        prefix = run('git rev-parse --show-prefix').strip()
    return {'path': os.path.join(prefix, basename)}
def evaluate(self, repo, spec, args):
    """
    Check the integrity of the datapackage.json
    """
    status = []
    with cd(repo.rootdir):
        patterns = spec.get('files', ['*'])
        resource_files = repo.find_matching_files(patterns)
        disk_files = [
            f for f in glob2.glob("**/*")
            if os.path.isfile(f) and f != "datapackage.json"
        ]

        def record(target, state, message):
            # Every entry shares the validator identity; only the
            # verdict and message vary per file.
            status.append({
                'target': target,
                'rules': "",
                'validator': self.name,
                'description': self.description,
                'status': state,
                'message': message
            })

        # Union of files known to the package and files on disk,
        # in sorted order (matches the original allfiles.sort()).
        for f in sorted(set(resource_files + disk_files)):
            in_package = f in resource_files
            on_disk = f in disk_files
            if in_package and on_disk:
                coded_sha256 = repo.get_resource(f)['sha256']
                computed_sha256 = compute_sha256(f)
                if computed_sha256 != coded_sha256:
                    record(f, 'ERROR',
                           "Mismatch in checksum on disk and in datapackage.json")
                else:
                    record(f, 'OK', "")
            elif in_package:
                record(f, 'ERROR', "In datapackage.json but not in repo")
            else:
                record(f, 'ERROR', "In repo but not in datapackage.json")

    return status
def executable_filetype(filename):
    """Return the file(1) description of *filename* as {'filetype': ...}."""
    with cd(os.path.dirname(filename)):
        raw = run('/usr/bin/file ' + filename).strip()
        # file(1) prints "name: description" — keep everything after the
        # first colon. NOTE(review): a ':' inside the path itself would
        # truncate the description; acceptable for typical filenames.
        description = raw[raw.index(":") + 1:]
        return {'filetype': description}
def evaluate(self, repo, spec, force=False, args=[]):
    """
    Evaluate an SQL query, cache the results in server

    Parameters
    ----------
    repo: Repo object (rootdir holds the query files)
    spec: dict; 'files' lists the SQL files to execute
    force: re-run even if a cached result exists
    args: unused here (kept for interface compatibility; never mutated)

    Returns
    -------
    list of per-file result dicts (target/transformer/status/message).
    """
    files = spec.get('files', [])
    if len(files) == 0:
        # Nothing to do
        return []

    db = MySQLdb.connect(host=self.host, port=self.port, db=self.db,
                         user=self.username, passwd=self.password)
    cur = db.cursor()

    result = []
    with cd(repo.rootdir):
        for f in files:
            cachepath = repo.cache_path(self.name, f + '.data')
            if not force and repo.cache_check(cachepath):
                # Already cached; report and skip execution
                result.append({
                    'target': f,
                    'transformer': self.name,
                    'status': 'OK',
                    'message': 'Result already cached ({})'.format(cachepath['relative'])
                })
                continue

            # Run the query
            query = open(f).read()
            (info, schema, data) = self.execute(cur, query)

            # Save the results (three cache entries per query file)
            for output in [['info', info], ['schema', schema], ['data', data]]:
                cachepath = repo.cache_path(self.name, f + "." + output[0])
                repo.cache_write(cachepath, output[1])

            result.append({
                # BUG FIX: previously reported files[0] for every query;
                # each entry must name the file actually executed.
                'target': f,
                'transformer': self.name,
                'status': 'OK',
                'message': 'Executed the query'
            })

    return result
def delete(self, repo, args=[]):
    """
    Delete files from the repo
    """
    with cd(repo.rootdir):
        try:
            message = self._run(['rm'] + list(args))
        except Exception as e:
            return {'status': 'error', 'message': str(e)}
    return {'status': 'success', 'message': message}
def repo_origin(filename, what=['Push URL']):
    """
    Extract the requested fields from `git remote show origin`.

    Typical output parsed::

        * remote origin
        Fetch URL: git@gitserver:user/repo.git
        Push URL: git@gitserver:user/repo.git
        HEAD branch: master
        Remote branches: ...

    Returns a dict mapping each requested label to the text after ':'.
    """
    with cd(os.path.dirname(filename)):
        output = run("git remote show origin")

    response = {}
    # Skip the "* remote origin" header, then pick out every line
    # containing one of the requested labels.
    for line in output.split("\n")[1:]:
        for label in what:
            if label in line:
                response[label] = line[line.index(":") + 1:]
    return response
def delete(self, repo, args=[]):
    """
    Delete files from the repo
    """
    outcome = None
    with cd(repo.rootdir):
        try:
            # Build and run `rm <args...>` via the git wrapper
            outcome = {
                'status': 'success',
                'message': self._run(['rm'] + list(args))
            }
        except Exception as e:
            outcome = {
                'status': 'error',
                'message': str(e)
            }
    return outcome
def add_files(self, repo, files):
    """
    Add files to the repo: copy each into the working tree and stage it.

    Parameters
    ----------
    repo: Repo object to copy into
    files: list of dicts with 'relativepath' (target inside the repo)
           and 'localfullpath' (source on disk; None for URL entries)
    """
    rootdir = repo.rootdir
    for f in files:
        relativepath = f['relativepath']
        sourcepath = f['localfullpath']
        if sourcepath is None:
            # This can happen if the relative path is a URL
            continue

        # Prepare the target path; exist_ok replaces the old bare
        # try/except (the guard keeps makedirs('') from raising, which
        # the old bare except also swallowed).
        targetpath = os.path.join(rootdir, relativepath)
        targetdir = os.path.dirname(targetpath)
        if targetdir:
            os.makedirs(targetdir, exist_ok=True)

        print("Updating: {}".format(relativepath))
        shutil.copyfile(sourcepath, targetpath)
        with cd(repo.rootdir):
            self._run(['add', relativepath])
def clone(self, url, backend=None): """ Clone a URL Parameters ---------- url : URL of the repo. Supports s3://, git@, http:// """ # s3://bucket/git/username/repo.git username = self.username reponame = url.split("/")[-1] # with git reponame = reponame.replace(".git", "") key = (username, reponame) # In local filesystem-based server, add a repo server_repodir = self.server_rootdir(username, reponame, create=False) rootdir = self.rootdir(username, reponame, create=False) if backend is None: # Backend is standard git repo (https://, git@...) with cd(os.path.dirname(rootdir)): self._run(['clone', '--no-hardlinks', url]) else: # Backend is s3 # Sync if needed. if not os.path.exists(server_repodir): # s3 -> .dgit/git/pingali/hello.git -> .dgit/datasets/pingali/hello backend.clone_repo(url, server_repodir) # After sync clone, with cd(os.path.dirname(rootdir)): self._run(['clone', '--no-hardlinks', server_repodir]) # Insert the notes push if True: configfile = os.path.join(rootdir, '.git', 'config') content = open(configfile).read() original = "fetch = +refs/heads/*:refs/remotes/origin/*" replacement = """fetch = +refs/heads/*:refs/remotes/origin/*\n fetch = +refs/notes/*:refs/notes/*""" if "notes" not in content: content = content.replace(original, replacement) with open(configfile, 'w') as fd: fd.write(content) # Pull the notes if any as well.. with cd(rootdir): self._run(['pull', 'origin']) # Insert the object into the internal table we maintain... r = Repo(username, reponame) r.rootdir = rootdir r.remoteurl = url r.manager = self package = os.path.join(r.rootdir, 'datapackage.json') packagedata = open(package).read() r.package = json.JSONDecoder( object_pairs_hook=collections.OrderedDict).decode(packagedata) return self.add(r)
def evaluate(self, repo, spec, args):
    """
    Validate model-output files against regression quality rules
    (currently a minimum R-squared threshold, 'min-r2').

    Parameters
    ----------
    repo: Repo object (rootdir holds the model-output files)
    spec: dict with 'files' plus either 'rules-files' or inline 'rules'
    args: unused here (kept for interface compatibility)

    Returns
    -------
    list of per-file/per-rule status dicts.

    Raises
    ------
    InvalidParameters if quality checking is enabled but no rules exist.
    """
    status = []

    # Do we have anything to do at all?
    if len(spec['files']) == 0:
        return status

    with cd(repo.rootdir):
        # Rules come either from separate JSON files or inline in spec
        rules = None
        if 'rules-files' in spec and len(spec['rules-files']) > 0:
            rulesfiles = spec['rules-files']
            rules = dict([(f, json.loads(open(f).read()))
                          for f in rulesfiles])
        elif 'rules' in spec:
            rules = {
                'inline': spec['rules']
            }

        if rules is None or len(rules) == 0:
            print("Regression quality validation has been enabled but no rules file has been specified")
            print("Example: { 'min-r2': 0.25 }. Put this either in file or in dgit.json")
            raise InvalidParameters("Regression quality checking rules missing")

        files = dict([(f, open(f).read()) for f in spec['files']])

        # BUG FIX: the old pattern r"R-squared:\s+(\d.\d+)" left the
        # decimal point unescaped, so any character matched in that
        # position. Compiled once outside the loops.
        pattern = re.compile(r"R-squared:\s+(\d\.\d+)")

        for r in rules:
            if 'min-r2' not in rules[r]:
                continue
            minr2 = float(rules[r]['min-r2'])
            for f in files:
                match = pattern.search(files[f])
                if match is None:
                    status.append({
                        'target': f,
                        'validator': self.name,
                        'description': self.description,
                        'rules': r,
                        'status': "ERROR",
                        'message': "Invalid model output"
                    })
                else:
                    r2 = float(match.group(1))
                    if r2 > minr2:
                        status.append({
                            'target': f,
                            'validator': self.name,
                            'description': self.description,
                            'rules': r,
                            'status': "OK",
                            'message': "Acceptable R2"
                        })
                    else:
                        status.append({
                            'target': f,
                            'validator': self.name,
                            'description': self.description,
                            'rules': r,
                            'status': "ERROR",
                            'message': "R2 is too low"
                        })

    return status
def repo_remote_url(filename):
    """Return the configured origin URL of the repo containing *filename*."""
    dirname = os.path.dirname(filename)
    with cd(dirname):
        output = run("git config --get remote.origin.url")
    return {'remote.origin.url': output.strip()}
def clone(self, url, backend=None): """ Clone a URL Parameters ---------- url : URL of the repo. Supports s3://, git@, http:// """ # s3://bucket/git/username/repo.git username = self.username reponame = url.split("/")[-1] # with git reponame = reponame.replace(".git","") key = (username, reponame) # In local filesystem-based server, add a repo server_repodir = self.server_rootdir(username, reponame, create=False) rootdir = self.rootdir(username, reponame, create=False) if backend is None: # Backend is standard git repo (https://, git@...) with cd(os.path.dirname(rootdir)): self._run(['clone', '--no-hardlinks', url]) else: # Backend is s3 # Sync if needed. if not os.path.exists(server_repodir): # s3 -> .dgit/git/pingali/hello.git -> .dgit/datasets/pingali/hello backend.clone_repo(url, server_repodir) # After sync clone, with cd(os.path.dirname(rootdir)): self._run(['clone', '--no-hardlinks', server_repodir]) # Insert the notes push if True: configfile = os.path.join(rootdir, '.git', 'config') content = open(configfile).read() original = "fetch = +refs/heads/*:refs/remotes/origin/*" replacement ="""fetch = +refs/heads/*:refs/remotes/origin/*\n fetch = +refs/notes/*:refs/notes/*""" if "notes" not in content: content = content.replace(original, replacement) with open(configfile, 'w') as fd: fd.write(content) # Pull the notes if any as well.. with cd(rootdir): self._run(['pull','origin']) # Insert the object into the internal table we maintain... r = Repo(username, reponame) r.rootdir = rootdir r.remoteurl = url r.manager = self package = os.path.join(r.rootdir, 'datapackage.json') packagedata = open(package).read() r.package = json.JSONDecoder(object_pairs_hook=collections.OrderedDict).decode(packagedata) return self.add(r)
def evaluate(self, repo, spec, args):
    """
    Validate model-output files against regression quality rules
    (currently a minimum R-squared threshold, 'min-r2'), merging
    rules from all listed rules files.

    Parameters
    ----------
    repo: Repo object (rootdir holds the model-output files)
    spec: dict with 'files' plus either 'rules-files' or inline 'rules'
    args: unused here (kept for interface compatibility)

    Returns
    -------
    list of per-file/per-rule status dicts.

    Raises
    ------
    InvalidParameters if quality checking is enabled but no rules exist.
    """
    status = []

    # Do we have anything to do at all?
    if len(spec['files']) == 0:
        return status

    with cd(repo.rootdir):
        # Rules come either from separate JSON files (merged into one
        # dict) or inline in the spec
        rules = None
        if 'rules-files' in spec and len(spec['rules-files']) > 0:
            rulesfiles = spec['rules-files']
            rules = {}
            for f in rulesfiles:
                d = json.loads(open(f).read())
                rules.update(d)
        elif 'rules' in spec:
            rules = {'inline': spec['rules']}

        if rules is None or len(rules) == 0:
            print(
                "Regression quality validation has been enabled but no rules file has been specified"
            )
            print(
                "Example: { 'min-r2': 0.25 }. Put this either in file or in dgit.json"
            )
            raise InvalidParameters(
                "Regression quality checking rules missing")

        files = dict([(f, open(f).read()) for f in spec['files']])

        # BUG FIX: the old pattern r"R-squared:\s+(\d.\d+)" left the
        # decimal point unescaped, so any character matched in that
        # position. Compiled once outside the loops.
        pattern = re.compile(r"R-squared:\s+(\d\.\d+)")

        for r in rules:
            if 'min-r2' not in rules[r]:
                continue
            minr2 = float(rules[r]['min-r2'])
            for f in files:
                match = pattern.search(files[f])
                if match is None:
                    status.append({
                        'target': f,
                        'validator': self.name,
                        'description': self.description,
                        'rules': r,
                        'status': "ERROR",
                        'message': "Invalid model output"
                    })
                else:
                    r2 = float(match.group(1))
                    if r2 > minr2:
                        status.append({
                            'target': f,
                            'validator': self.name,
                            'description': self.description,
                            'rules': r,
                            'status': "OK",
                            'message': "Acceptable R2"
                        })
                    else:
                        status.append({
                            'target': f,
                            'validator': self.name,
                            'description': self.description,
                            'rules': r,
                            'status': "ERROR",
                            'message': "R2 is too low"
                        })

    return status