def _test_replica(replica, verbose=False):
    """Test whether a replica has the checksum it reports and whether it
    passes the gzip test."""
    with temp_dir() as tempdir:
        tempf = os.path.join(tempdir, 'temp.gz')
        if verbose:
            print_("Downloading and checking replica: " + replica)
        dm.backend._get(replica, tempf, verbose=verbose)
        remote_checksum = dm.checksum(replica)
        local_checksum = sh.adler32(tempf, _tty_out=False).strip()
        if local_checksum != remote_checksum:
            if verbose:
                print_(replica)
                print_("Local checksum %s is different from remote checksum %s."
                       % (local_checksum, remote_checksum))
            return False
        try:
            sh.gzip(tempf, test=True, _tty_out=False)
        except sh.ErrorReturnCode:
            if verbose:
                print_(replica)
                print_("Failed the gzip integrity test.")
            return False
        else:
            return True
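# A minimal companion sketch (an assumption, not part of the original module):
# if an external `adler32` command is unavailable, the same checksum can be
# computed with the standard library. Whether the result matches dm.checksum()
# depends on the remote format; zero-padded hex is assumed here.
import zlib

def local_adler32(path, chunk_size=1 << 20):
    value = 1  # adler-32 starts at 1 (RFC 1950)
    with open(path, "rb") as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b""):
            value = zlib.adler32(chunk, value)
    return "%08x" % (value & 0xFFFFFFFF)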
def backup(**kwargs):
    import sh
    bakdir = "/var/toughradius/databak"
    if not os.path.exists(bakdir):
        os.mkdir(bakdir)
    now = datetime.now()
    dbname = kwargs.pop('dbname', 'toughradius')
    ftphost = kwargs.pop('ftphost', '127.0.0.1')
    ftpport = kwargs.pop('ftpport', 21)
    ftpuser = kwargs.pop('ftpuser', '')
    ftppwd = kwargs.pop('ftppwd', '')
    backfile = '%s/%s-backup-%s.gz' % (bakdir, dbname, now.strftime("%Y%m%d"))
    sh.gzip(sh.mysqldump(u='root', B=dbname,
                         S="/var/toughradius/mysql/mysql.sock"),
            '-cf', _out=backfile)
    if '127.0.0.1' not in ftphost:
        ftp = FTP()
        ftp.set_debuglevel(2)
        ftp.connect(ftphost, ftpport)
        ftp.login(ftpuser, ftppwd)
        ftp.cwd('/')
        bufsize = 1024
        file_handler = open(backfile, 'rb')
        ftp.storbinary('STOR %s' % os.path.basename(backfile), file_handler, bufsize)
        ftp.set_debuglevel(0)
        file_handler.close()
        ftp.quit()
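# A streaming variant (a sketch under assumptions, not the project's code): the
# nested call above runs mysqldump to completion and hands its full output to
# gzip as stdin, which can be costly for large databases. sh's `_piped=True`
# streams the dump into gzip instead. Paths and credentials mirror the example.
def backup_streaming(dbname="toughradius",
                     backfile="/var/toughradius/databak/backup.gz"):
    import sh
    dump = sh.mysqldump(u="root", B=dbname,
                        S="/var/toughradius/mysql/mysql.sock", _piped=True)
    sh.gzip(dump, "-cf", _out=backfile)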
def lzw_filter_single(min_complexity, x):
    un_comp_len = len(str(x.seq))
    # Estimate compressibility: gzip the sequence and sum the lengths of the
    # compressed output chunks. Note: `imap` is Python 2's itertools.imap
    # (plain `map` on Python 3).
    comp_len = sum(imap(len, sh.gzip(f=True, _in=str(x.seq))))
    # sh.sh('lzw.sh', x) . . .
    complexity = comp_len / float(un_comp_len)
    # print complexity
    return complexity >= min_complexity
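# An in-process alternative (a sketch, not from the original pipeline): the same
# compression-ratio "complexity" can be computed with the standard library,
# avoiding one gzip subprocess per record. `record.seq` mirrors the
# Biopython-style `x.seq` attribute used above.
import zlib

def lzw_filter_single_zlib(min_complexity, record):
    seq = str(record.seq).encode()
    complexity = len(zlib.compress(seq)) / float(len(seq))
    return complexity >= min_complexity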
def update_cache(self):
    if not self.test_cache():
        rm(self.path, '-rf')
        mkdir('-p', self.path)
        index_file_url = '/'.join([self.repo_url.url.geturl(), 'Packages.gz'])
        index_file_path = os.path.join(self.path, self.index_file)
        print("Downloading index file '{0}' --> '{1}' ...".format(
            index_file_url, index_file_path))
        try:
            with pushd(self.path):
                wget(index_file_url, '-O', self.index_file + '.gz')
                gzip('-d', self.index_file + '.gz')
        except Exception as err:
            print(str(err))
            self.broken = True
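# `pushd` above comes from the surrounding project; a minimal sketch of such a
# context manager (an assumption about its behaviour) looks like this.
import contextlib
import os

@contextlib.contextmanager
def pushd(path):
    previous = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(previous)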
def get_files(usaf, wban):
    output = sh.grep("%s %s" % (usaf, wban), "isd-history.txt").strip().split(" ")
    end = int(output.pop()[0:4])
    start = int(output.pop()[0:4])
    sh.mkdir("-p", "%s-%s" % (usaf, wban))
    os.chdir("%s-%s" % (usaf, wban))
    for year in range(start, end + 1):
        fn = "%s-%s-%s.gz" % (usaf, wban, year)
        if not os.path.exists(fn):
            sh.wget("ftp://ftp.ncdc.noaa.gov/pub/data/noaa/%s/%s" % (year, fn))
        print(fn)
    output_fn = "%s-%s-data.csv" % (usaf, wban)
    h = open(output_fn, "w")
    sh.sort(
        sh.cut(
            sh.grep(
                sh.cut(sh.zcat(glob.glob("*.gz")),
                       "--output-delimiter=,", "-c16-27,88-92"),
                "-v", "\+9999"),
            "--output-delimiter=.", "-c1-17,18"),
        _out=h)
    sh.gzip(output_fn)
    sh.mv("%s.gz" % (output_fn), "..")
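# For readability, the nested sh calls above correspond roughly to this shell
# pipeline (shown only as an explanatory sketch):
#
#   zcat *.gz \
#     | cut --output-delimiter=, -c16-27,88-92 \
#     | grep -v '\+9999' \
#     | cut --output-delimiter=. -c1-17,18 \
#     | sort > USAF-WBAN-data.csv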
def upload_repo(token):
    url = request.form.get('url')
    type_ = request.form.get('type')
    ref = request.form.get('ref')

    s = storage
    user = s.find_user_by_token(token)
    logger.error("user %s uploading repo", user["username"])
    username = user["username"]

    if not url:
        return 'Empty url', 400

    if not type_:
        if url.startswith('git://') or url.endswith('.git'):
            type_ = 'git'
        else:
            return 'Cannot define type of repository by url. Please, specify type.', 400

    if type_ not in ['git', 'cvs', 'hg']:
        return 'Invalid cvs type', 400

    if "UPLOAD_FOLDER" in current_app.config:
        base_clone_path = current_app.config["UPLOAD_FOLDER"]
    else:
        base_clone_path = "/tmp"

    clone_path = "%s/%s" % (base_clone_path, os.path.basename(url))
    if os.path.exists(clone_path):
        sh.rm("-rf", clone_path)

    if type_ == 'git':
        ref = ref or "HEAD"
        sh.git("clone", url, clone_path)
        try:
            ref = sh.git("rev-parse", ref, _cwd=clone_path).strip()
        except sh.ErrorReturnCode as e:
            return 'Invalid reference. %s' % e, 400

    if not os.path.exists(clone_path + "/info.yaml"):
        return 'info.yaml is required', 400

    try:
        package_info = yaml.load(file(clone_path + '/info.yaml'))
        validate_info(package_info)
    except YAMLError:
        return 'Bad encoded info.yaml', 400
    except (ValueError, KeyError) as e:
        return str(e), 400

    try:
        depends_path = download_depends(package_info['depends'],
                                        package_info['type'], clone_path)
    except sh.ErrorReturnCode as e:
        return 'Unable to install dependencies. %s' % e, 503

    # remove info.yaml from tar.gz
    with open(clone_path + '/.gitattributes', 'w') as f:
        f.write('info.yaml export-ignore')

    try:
        logger.debug("Packing application to tar.gz")
        sh.git("archive", ref, "--worktree-attributes",
               format="tar", o="app.tar", _cwd=clone_path)
        if package_info["type"] == "nodejs":
            sh.tar("-uf", "app.tar", "node_modules", _cwd=clone_path)
        elif package_info["type"] == "python":
            sh.tar("-uf", "app.tar", "-C", clone_path + "/depends",
                   *depends_path, _cwd=clone_path)
        sh.gzip("app.tar", _cwd=clone_path)
        package_files = sh.tar('-tf', 'app.tar.gz', _cwd=clone_path)
        package_info['structure'] = [f.strip() for f in package_files]
    except sh.ErrorReturnCode as e:
        return 'Unable to pack application. %s' % e, 503

    try:
        for line in sh.git("log", "-5", date="short",
                           format="%h %ad %s [%an]", _cwd=clone_path):
            line = line.strip()
            # git log output is using ansi terminal codes which is messy for our purposes
            ansisequence = re.compile(r'\x1B\[[^A-Za-z]*[A-Za-z]')
            line = ansisequence.sub('', line)
            line = line.strip("\x1b=\r")
            line = line.strip("\x1b>")
            if not line:
                continue
            package_info.setdefault('changelog', []).append(line)
    except sh.ErrorReturnCode as e:
        return 'Unable to pack application. %s' % e, 503

    try:
        with open(clone_path + "/app.tar.gz") as app:
            package_info['url'] = url
            uuid = upload_app(app, package_info, ref, token)
            return "Application %s was successfully uploaded" % uuid
    except (KeyError, ValueError) as e:
        return str(e), 400

    return "Application was failed to upload", 400
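# Side note (an observation, not part of the original handler): `file(...)` and
# `yaml.load` without a Loader are Python 2 idioms; on Python 3 the info.yaml
# read above would typically be written as
#
#   with open(os.path.join(clone_path, "info.yaml")) as fh:
#       package_info = yaml.safe_load(fh)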
def transformExpressionMatrixTo10XMtx(inputPath, outputDir):
    """
    input: path or dataframe
    column: gene name
    index: barcode name (without the trailing -1)
    """
    try:
        sh.mkdir(outputDir)
    except:
        sh.rm("-rf", outputDir)
        sh.mkdir(outputDir)
    if isinstance(inputPath, str):
        expressionMtx = pd.read_table(
            inputPath,
            index_col=0,
        )
    else:
        expressionMtx = inputPath
        expressionMtx.rename_axis("index", inplace=True)
    expressionMtx = expressionMtx.loc[:, expressionMtx.sum(0) != 0]
    barcodes = pd.Series(expressionMtx.index + "-1")
    barcodes.to_csv(f"{outputDir}barcodes.tsv", header=None, index=None)
    feature = pd.DataFrame(expressionMtx.columns)
    feature[1] = feature.iloc[:, 0]
    feature[2] = "Gene Expression"
    feature.to_csv(f"{outputDir}features.tsv", sep="\t", header=None, index=None)
    indexMap = {
        i: k
        for i, k in zip(expressionMtx.index,
                        range(1, 1 + len(expressionMtx.index)))
    }
    featureMap = {
        i: k
        for i, k in zip(expressionMtx.columns,
                        range(1, 1 + len(expressionMtx.columns)))
    }
    expressionMtx.index = expressionMtx.index.map(indexMap)
    expressionMtx.columns = expressionMtx.columns.map(featureMap)
    expressionMtx = expressionMtx.astype(int)
    expressionMtx.reset_index(inplace=True)
    expressionMtx = expressionMtx.melt(id_vars="index")
    expressionMtx.columns = ["barcode", "feature", "count"]
    expressionMtx = expressionMtx.query("count != 0")
    expressionMtx = expressionMtx.reindex(["feature", "barcode", "count"], axis=1)
    expressionMtx.sort_values(["barcode", "feature"],
                              ascending=[True, False],
                              inplace=True)
    featureCounts, barcodeCounts, rowCounts = (
        max(expressionMtx["feature"]),
        max(expressionMtx["barcode"]),
        len(expressionMtx),
    )
    with open(f"{outputDir}matrix.mtx", "w") as fh:
        fh.write(
            f'%%MatrixMarket matrix coordinate integer general\n%metadata_json: {{"format_version": 2, "software_version": "X.X.0"}}\n{featureCounts} {barcodeCounts} {rowCounts}'
        )
        for line in expressionMtx.itertuples():
            fh.write(f"\n{line.feature} {line.barcode} {line.count}")
    sh.gzip(glob.glob(f"{outputDir}*"))
def _extract(self, file: str, target_dir: str):
    # Open in binary mode so the raw gzip bytes are piped to cpio unmodified.
    with open(file, 'rb') as f:
        sh.cpio(sh.gzip(d=True, k=True, _in=f), i=True, _cwd=target_dir)
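# Roughly equivalent shell sketch (for orientation only):
#   gzip -dk < FILE | (cd TARGET_DIR && cpio -i)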
def extractSeq(fastqDir, outDir, lmdbPath, threads, splitInput, cutoff):
    try:
        os.mkdir(outDir)
    except:
        logger.warning(f"{outDir} existed!!")
    if not splitInput:
        allR1Path = glob.glob(f"{fastqDir}*R1*")
        allR2Path = [x.replace("R1", "R2") for x in allR1Path]
    else:
        fastqTemp = outDir + "tempSplited/"
        try:
            sh.mkdir(fastqTemp)
        except:
            logger.warning(f"{fastqTemp} existed!!")

        allR1Path = glob.glob(f"{fastqDir}*_R1*")
        allR2Path = [x.replace("R1", "R2") for x in allR1Path]
        allSplitedPath = [
            fastqTemp + re.search(r"[\w\W]+?(?=_R1)", x.split("/")[-1])[0] + "/"
            for x in allR1Path
        ]
        if allR1Path[0].endswith(".gz"):
            formatGz = True
        else:
            formatGz = False

        splitedNum = threads // len(allSplitedPath)
        if splitedNum <= 1:
            allR1Path = glob.glob(f"{fastqDir}*R1*")
            allR2Path = [x.replace("R1", "R2") for x in allR1Path]
            if allR1Path[0].endswith(".gz"):
                logger.error("format gz, please uncompress it.")
                1 / 0  # deliberately raise to abort
        else:
            mPResults = []
            with multiP(threads // 2) as mP:
                for singleR1Path, singleR2Path, singleSplitedPath in zip(
                        allR1Path, allR2Path, allSplitedPath):
                    mPResults.append(
                        mP.submit(
                            sh.seqkit,
                            "split2",
                            "-f",
                            "-1",
                            singleR1Path,
                            "-2",
                            singleR2Path,
                            p=splitedNum,
                            O=singleSplitedPath,
                            j=2,
                        ))

            tempAllSplitedR1Path = glob.glob(f"{fastqTemp}*/*R1*")
            tempAllSplitedR2Path = [
                x.replace("R1", "R2") for x in tempAllSplitedR1Path
            ]
            sampleId = set([
                re.search(r"(?<=tempSplited/)[\w\W]+?(?=_L)", x)[0]
                for x in tempAllSplitedR1Path
            ])
            if len(sampleId) != 1:
                allSample = ", ".join(sampleId)
                logger.warning(f"MORE THAN ONE INPUT SAMPLES: {allSample}")
                sampleId = sampleId.pop()
                logger.warning(f"The prefix will change to {sampleId}")
            else:
                sampleId = sampleId.pop()

            i = 0
            formatGzUseThreadContents = []
            for tempSingleSplitedR1Path, tempSingleSplitedR2Path in zip(
                    tempAllSplitedR1Path, tempAllSplitedR2Path):
                i += 1
                if formatGz:
                    sh.mv(
                        tempSingleSplitedR1Path,
                        f"{fastqTemp}{sampleId}_L{i:03}_R1_001.fastq.gz",
                    )
                    sh.mv(
                        tempSingleSplitedR2Path,
                        f"{fastqTemp}{sampleId}_L{i:03}_R2_001.fastq.gz",
                    )
                    formatGzUseThreadContents.append(
                        sh.gzip(
                            "-d",
                            f"{fastqTemp}{sampleId}_L{i:03}_R1_001.fastq.gz",
                            _bg=True,
                        ))
                    formatGzUseThreadContents.append(
                        sh.gzip(
                            "-d",
                            f"{fastqTemp}{sampleId}_L{i:03}_R2_001.fastq.gz",
                            _bg=True,
                        ))
                else:
                    sh.mv(
                        tempSingleSplitedR1Path,
                        f"{fastqTemp}{sampleId}_L{i:03}_R1_001.fastq",
                    )
                    sh.mv(
                        tempSingleSplitedR2Path,
                        f"{fastqTemp}{sampleId}_L{i:03}_R2_001.fastq",
                    )
            if formatGz:
                [x.wait() for x in formatGzUseThreadContents]

            for singleTempDir in glob.glob(f"{fastqTemp}*/"):
                sh.rmdir(singleTempDir)
            allR1Path = glob.glob(f"{fastqTemp}*R1*")
            allR2Path = [x.replace("R1", "R2") for x in allR1Path]

    allSubProcess = []
    with multiP(threads) as mP:
        for singleR1Path, singleR2Path in zip(allR1Path, allR2Path):
            allSubProcess.append(
                mP.submit(
                    processOneFastq,
                    singleR1Path,
                    singleR2Path,
                    lmdbPath,
                    outDir,
                    cutoff,
                ))
    [x.result() for x in allSubProcess]
    if not splitInput:
        pass
    else:
        sh.rm("-rf", fastqTemp)
def gzip(self, content):
    return sh.gzip('--best', '--stdout', _tty_out=False, _in=content).stdout
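# Usage sketch (an assumption, not from the original class): the wrapper returns
# raw gzip bytes, which can be verified with a standard-library round trip.
import gzip as gzip_stdlib
import sh

def gzip_bytes(content):
    # Same call as the method above, outside of its class.
    return sh.gzip('--best', '--stdout', _tty_out=False, _in=content).stdout

assert gzip_stdlib.decompress(gzip_bytes(b"hello world\n")) == b"hello world\n"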
     '-o', 'PubkeyAuthentication=no',
     'debian@{}/gateway.log*'.format(args.gateway),
     '{}/'.format(tmpdirname))

print('Done copying, start parsing')

devices = {}

logs = glob.glob(tmpdirname + '/gateway.log*')
for log in logs:
    name, ext = os.path.splitext(log)
    print('Handling {}'.format(log))

    # Uncompress this log file if needed
    if ext == '.gz':
        print('Have to uncompress first')
        gzip('-d', log)
        log = log[:-3]

    # Open it to read all of the JSON blobs
    with open(log) as f:
        print('Opened {} and parsing JSON'.format(log))
        for l in f:
            try:
                blob = json.loads(l)

                # Check if we can identify this node
                if '_meta' in blob:
                    id = blob['_meta']['device_id']

                    # Have to create the data structures if this is the
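# An alternative sketch (an assumption, not the original script): the
# standard-library gzip module can read rotated .gz logs directly, without
# decompressing them on disk first.
import gzip as gzip_stdlib
import json

def iter_json_blobs(path):
    opener = gzip_stdlib.open if path.endswith('.gz') else open
    with opener(path, 'rt') as fh:
        for line in fh:
            try:
                yield json.loads(line)
            except ValueError:
                continue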
import fileinput
import struct
from sh import gzip    # sh needs to be installed with pip3

#=================
# file names are hard-wired - too lazy to make a better user interface
inputFileName = "indexTrain.html"
gzFileName = "indexTrain.html.gz"
outputFileName = "indexTrain.h"

#=================
# first compress the file
# keep the input file, force overwrite of an existing output file,
# and store the name of the file inside the GZ
gzip("-k", "-f", "-N", inputFileName)

#=================
# now convert the bytes in the GZIP to hex text
gzFile = open(gzFileName, 'rb')
outFile = open(outputFileName, 'w')

hexCount = 0
byteCount = 0

# first write header lines for the .h file as required by the C++ code that will use it
outFile.write(
    "#define index_ov2640_html_gz_len NNNN\nconst uint8_t index_ov2640_html_gz[] = {\n"
)

# deal with each byte
def convert_tar_to_targz(tar_file):
    lgr.debug('Converting tar to tar.gz...')
    sh.gzip(tar_file)
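# Usage note (a sketch under assumptions, not the project's API): plain
# `gzip FILE` replaces FILE with FILE.gz in place. Passing "-k" keeps the
# original tarball and "-f" overwrites a stale .gz if one already exists.
import sh

def convert_tar_to_targz_keep(tar_file):
    sh.gzip('-k', '-f', tar_file)
    return tar_file + '.gz'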