def recursiveCheckEquivalent(file1, file2, name):
    """Recursively compare the object at path `name` in two open HDF5 files.

    Logs a "Failure: ..." line for every attribute, key, shape or data
    mismatch found. Groups are recursed into; datasets are compared
    column-by-column when they have a compound dtype.
    """
    obj1 = file1[name]
    # Bug fix: previously read obj2 from file1, so every comparison below
    # compared file1 against itself and could never report a difference.
    obj2 = file2[name]
    # check attributes
    attr1 = obj1.attrs
    attr2 = obj2.attrs
    attrsName = "/".join([name, "attrs"])
    checkContents(attr1, attr2, attrsName)
    for key, value in attr1.items():
        if not attr2[key] == value:
            log("Failure: {} - file1={}, file2={}".format(
                "/".join([attrsName, key]), value, attr2[key]))
    # check subgroups / datasets
    if isGroup(obj1):
        checkContents(obj1, obj2)
        for key in obj1.keys():
            recursiveCheckEquivalent(file1, file2, "/".join([name, key]))
    else:
        if not obj1.shape == obj2.shape:
            log("Failure: {}.shape - file1={}, file2={}".format(
                name, obj1.shape, obj2.shape))
        if obj1.dtype.names is None:
            # just one column
            checkData(obj1, obj2, name)
        else:
            for col in obj1.dtype.names:
                checkData(obj1[col], obj2[col], ".".join([name, col]))
def execute(self):
    """Run a compress-then-revert round trip and verify file equivalence.

    Returns 0 when every reverted file matches its original (or when there
    is nothing to do), non-zero otherwise. Temporary output files are
    removed in all cases.
    """
    # Start pessimistic: any exception before the checks complete leaves
    # a non-zero exit code.
    exitcode=1
    if len(self.fileList) == 0:
        return 0
    try:
        # First pass: compress the input files.
        self.revert = False
        self.run()
        # Reset size counters so the revert pass reports its own sizes.
        self.preSize = 0
        self.postSize = 0
        # Second pass: revert the files produced by the first pass.
        self.revert = True
        self.fileList = self.getReversionFileList()
        self.prefix = None
        self.run()
        exitcode = 0
        # NOTE(review): self.originalFileList is assumed to be set elsewhere
        # (not visible in this chunk) and parallel to self.fileList — confirm.
        for i in range(len(self.fileList)):
            # checkEquivalent returns the number of mismatches found.
            exitcode += checkEquivalent(self.originalFileList[i], self.fileList[i])
    except Exception as e:
        log("ERROR: " + str(e))
    finally:
        # Always clean up the generated files, even on failure.
        for f in self.fileList:
            try:
                os.remove(f)
            except OSError:
                # file never created
                pass
    return exitcode
def checkEquivalent(fn1, fn2):
    """Open two HDF5 files and log any structural or data differences."""
    log("Checking equivalence of {} (file 1) and {} (file 2)...".format(
        fn1, fn2))
    with h5py.File(fn1, 'r') as f1, h5py.File(fn2, 'r') as f2:
        # Compare the top-level key sets, then descend into each root entry.
        checkContents(f1, f2)
        for entry in f1.values():
            recursiveCheckEquivalent(f1, f2, entry.name)
    log("Complete.")
def checkEquivalent(fn1, fn2):
    """Compare two HDF5 files and return the total number of mismatches."""
    log("Checking equivalence of {} (file 1) and {} (file 2)...".format(fn1, fn2))
    with h5py.File(fn1, 'r') as f1, h5py.File(fn2, 'r') as f2:
        # Top-level key check, then a recursive walk of every root entry.
        failures = checkContents(f1, f2)
        for entry in f1.values():
            failures += recursiveCheckEquivalent(f1, f2, entry.name)
    log("Complete with {} errors.".format(failures))
    return failures
def runRealtime(args):
    """Watch the input folders in real time until the user presses Ctrl-C."""
    from picopore.realtime import ReadsFolder
    folder = ReadsFolder(args)
    try:
        # Idle loop; all work happens in the watcher's event handlers.
        while True:
            sleep(1)
    except KeyboardInterrupt:
        log("\nExiting Picopore.")
        folder.stop()
def checkData(data1, data2, name):
    """Compare two datasets element-wise, logging each mismatch.

    Returns the number of mismatching positions (0 when the data agree),
    so callers can accumulate an error count.
    """
    failures = 0
    match = data1 == data2
    if isArray(match):
        # elementwise comparison produced an array; collapse to a scalar
        match = match.all()
    if not match:
        # Bug fix: the element filter compared data1[i] against the whole
        # of data2 (missing the [i] index), so the reported positions were
        # wrong for any array input.
        positions = [i for i in range(len(data1)) if not data1[i] == data2[i]]
        for pos in positions:
            log("Failure: {}[{}] - file1={}, file2={}".format(
                name, pos, data1[pos], data2[pos]))
            failures += 1
    return failures
def execute(self):
    """Start the folder watcher and block until interrupted; returns 0."""
    self.readsFolder.start()
    try:
        # Idle loop; the watcher does the work via filesystem events.
        while True:
            sleep(5)
    except KeyboardInterrupt:
        log("\nExiting Picopore.")
        self.readsFolder.stop()
    return 0
def stop(self):
    """Finish in-flight work (abortable with a second Ctrl-C), then stop watching."""
    log("Processing in-progress files. Press Ctrl-C again to abort.")
    try:
        self.runner.stop()
    except KeyboardInterrupt:
        # Second Ctrl-C: skip waiting for the runner.
        log("Aborted.")
    self.observer.stop()
    self.observer.join()
def _process_func(filename, func, prefix, print_every): if prefix is not None: newFilename = getPrefixedFilename(filename, prefix) copyfile(filename, newFilename) else: newFilename = filename result = func(newFilename) if result is not None and print_every > 0 and np.random.rand( ) < 1.0 / print_every: log('.', end='') return result
def checkRealtime(args):
    """Announce realtime mode and confirm with the user.

    Returns True to proceed (setting args.y when confirmed interactively),
    False when the user declines; falls through when realtime is not requested.
    """
    if not args.realtime:
        return None
    log("Performing real time {} compression. ".format(args.mode), end='')
    if args.y:
        # Already confirmed via the -y flag; just finish the log line.
        print('')
        return True
    if checkSure():
        args.y = True
        return True
    return False
def checkContents(obj1, obj2, name=None):
    """Cross-check the key sets of two group-like objects.

    Logs a "Failure: ..." line for every key present in one object but
    missing from the other, and returns the number of such failures.
    (Returning a count makes this usable as `exitcode = checkContents(...)`,
    as the equivalence checkers do; previously it returned None.)
    """
    # Attribute sets (h5py AttributeManager) have no .name, so callers
    # pass an explicit name for them.
    name = obj1.name if name is None else name
    keys1 = obj1.keys()
    keys2 = obj2.keys()
    failures = 0
    for key in keys1:
        if key not in keys2:
            log("Failure: {} missing from file 2".format("/".join(
                [obj2.name, key])))
            failures += 1
    for key in keys2:
        if key not in keys1:
            log("Failure: {} missing from file 1".format("/".join(
                [obj1.name, key])))
            failures += 1
    return failures
def compress(func, filename, group="all"):
    """Apply `func` to the open file, then h5repack it with the chosen filter.

    Returns the resulting file size in bytes. On any error, logs it, removes
    any leftover temporary file, and returns the (unchanged) file size.
    """
    tmp = "{}.tmp".format(filename)
    try:
        with h5py.File(filename, 'r+') as f:
            filtr = func(f, group)
        subprocess.call(["h5repack", "-f", filtr, filename, tmp])
        subprocess.call(["mv", tmp, filename])
        return os.path.getsize(filename)
    except Exception as e:
        log("ERROR: {} on file {}".format(str(e), filename))
        if os.path.isfile(tmp):
            # don't leave a stale partial repack behind
            os.remove(tmp)
        return os.path.getsize(filename)
def compress(func, filename, group="all", prefix=None):
    """Apply `func` to the file (or a prefixed copy) and h5repack it in place.

    When `prefix` is given, the original file is copied first and the copy
    is compressed instead. Errors are logged rather than raised.
    """
    if prefix is not None:
        newFilename = getPrefixedFilename(filename, prefix)
        copyfile(filename, newFilename)
    else:
        newFilename = filename
    try:
        with h5py.File(newFilename, 'r+') as f:
            filtr = func(f, group)
        subprocess.call(["h5repack", "-f", filtr, newFilename,
                         "{}.tmp".format(newFilename)])
        subprocess.call(["mv", "{}.tmp".format(newFilename), newFilename])
    except Exception as e:
        log(str(e))
        # Bug fix: a failed repack used to leave a stale .tmp file behind;
        # remove it so repeated runs don't accumulate junk.
        import os  # local import keeps this fix self-contained
        tmp = "{}.tmp".format(newFilename)
        if os.path.isfile(tmp):
            os.remove(tmp)
def postprocess(self, results):
    """Log before/after file sizes in aligned columns; returns the pre-size."""
    self.postSize = sum(results)
    # Label the columns according to the direction of the run.
    if self.revert:
        preStr, postStr = "Compressed size:", "Reverted size:"
    else:
        preStr, postStr = "Original size:", "Compressed size:"
    label_width = max(len(preStr), len(postStr)) + 1
    num_width = len(str(max(self.preSize, self.postSize)))
    for label, size in ((preStr, self.preSize), (postStr, self.postSize)):
        log("{}{}".format(label.ljust(label_width), str(size).rjust(num_width)))
    return self.preSize
def rename(filename, pattern, replacement):
    """Rewrite dataset paths matching `pattern` inside an HDF5 file.

    Returns 0 on success, 1 on error (which is logged, not raised).
    """
    try:
        with h5py.File(filename, 'r+') as f:
            matches = findDatasets(f, entry_point="/", keyword=pattern,
                                   match_child=True)
            for path in matches:
                newPath = re.sub(pattern, replacement, path)
                # hard-link the object under the new path, then drop the old one
                f[newPath] = f[path]
                del f[path]
                log("Renamed {} to {}".format(path, newPath))
        return 0
    except Exception as e:
        log("ERROR: {} on file {}".format(str(e), filename))
        return 1
def __init__(self, args):
    """Watch the input directories for new fast5 files and start a run.

    Only paths that are directories are scheduled for observation.
    """
    self.args = args
    self.event_handler = PatternMatchingEventHandler(
        patterns=["*.fast5"], ignore_patterns=[], ignore_directories=True)
    self.event_handler.on_created = self.on_created
    self.observer = Observer()
    observedPaths = []
    for path in args.input:
        if os.path.isdir(path):
            self.observer.schedule(self.event_handler, path, recursive=True)
            observedPaths.append(path)
    # Bug fix: report the directories actually scheduled for observation.
    # Previously this logged args.input (which may contain non-directories)
    # and observedPaths was collected but never used.
    log("Monitoring {} in real time. Press Ctrl+C to exit.".format(
        ", ".join(observedPaths)))
    self.observer.start()
    run(args.revert, args.mode, args.input, args.y, args.threads,
        args.group, args.prefix, args.fastq, args.summary)
def __init__(self, args):
    """Initialise the realtime runner: announce the mode, confirm, attach watcher."""
    super(PicoporeRealtimeRunner, self).__init__(args)
    # Only the description is needed here; the function itself is chosen later.
    _, name = chooseCompressFunc(self.revert, self.mode, self.fastq,
                                 self.summary, self.manual, realtime=True)
    log(name + "...", end='')
    if self.y:
        # pre-confirmed via -y; just terminate the log line
        log()
    else:
        if checkSure():
            self.y = True
        else:
            exit(1)
    self.readsFolder = ReadsFolder(self)
def chooseCompressFunc(revert, mode, fastq, summary, manual, realtime=False):
    """Select the (de)compression routine for the requested mode.

    Returns a tuple (callable, description). The callable is a
    `partial(compress, func)` ready to be applied to a filename. Exits the
    process when the mode cannot be serviced (raw revert, or unknown mode).
    """
    name = "Performing "
    if realtime:
        name += "real time "
    if revert:
        if mode == 'lossless':
            func = losslessDecompress
            name += "lossless decompression"
        elif mode == 'deep-lossless':
            func = deepLosslessDecompress
            name += "deep lossless decompression"
        else:
            log("Unable to revert raw files. Please use a basecaller instead.")
            exit(1)
    else:
        if mode == 'lossless':
            func = losslessCompress
            name += "lossless compression"
        elif mode == 'deep-lossless':
            func = deepLosslessCompress
            name += "deep lossless compression"
        elif mode == 'raw':
            name += "raw compression "
            if manual is not None:
                name += "with manual keyword " + manual
                keywords = [manual]
            else:
                # Bug fix: copy the module-level list. `keywords += ...`
                # below is in-place extension, so binding the global
                # directly mutated __raw_compress_keywords__ and made
                # repeated calls accumulate extra keywords.
                keywords = list(__raw_compress_keywords__)
                if fastq and summary:
                    name += "with FASTQ and summary"
                elif fastq:
                    keywords += __raw_compress_summary__
                    name += "with FASTQ and no summary"
                elif summary:
                    keywords += __raw_compress_fastq__
                    name += "with summary and no FASTQ"
                else:
                    keywords += __raw_compress_fastq_summary__
                    name += "with no summary and no FASTQ"
            func = partial(rawCompress, keywords=keywords)
    try:
        # `func` is unbound when no branch matched; the NameError below
        # turns that into a clean error message and exit.
        return partial(compress, func), name
    except NameError:
        log("No compression method selected")
        exit(1)
def rawCompress(f, group, keywords):
    """Delete keyword-matched datasets from an open fast5 file.

    Refuses to operate on files already deep-lossless compressed by Picopore.
    Returns the h5repack filter string to apply afterwards.
    """
    if "Picopore" in f:
        log("{} is compressed using picopore deep-lossless compression. Please use picpore --revert --mode deep-lossless before attempting raw compression."
            .format(f.filename))
    else:
        doomed = [p for kw in keywords
                  for p in findDatasets(f, group, keyword=kw)]
        for p in doomed:
            if p in f:
                del f[p]
        # Drop the Analyses group entirely once it has been emptied.
        try:
            if len(f["Analyses"].keys()) == 0:
                del f["Analyses"]
        except KeyError:
            # no analyses, no worries
            pass
    return "GZIP=9"
def run(self, postprocess=True):
    """Pick the worker function, confirm with the user, and process all files.

    Returns 0 when there is nothing to do; otherwise returns either the
    postprocessed result or the raw multiprocessor result, depending on
    `postprocess`. Exits when the user declines.
    """
    func, message = self.get_func()
    self.func = functools.partial(_process_func, func=func,
                                  prefix=self.prefix,
                                  print_every=self.print_every)
    targets = self.getFileList()
    if not targets:
        return 0
    log("{} on {} files... ".format(message, len(targets)))
    if not (self.y or checkSure()):
        log("User cancelled. Exiting.")
        exit(1)
    self.process(targets)
    if postprocess:
        return self.stop()
    return self.multiprocessor.wait()
def __init__(self, runner):
    """Observe the runner's input directories for created/moved fast5 files."""
    self.runner = runner
    handler = PatternMatchingEventHandler(
        patterns=["*.fast5"], ignore_patterns=[], ignore_directories=True)
    handler.on_created = self.on_created
    handler.on_moved = self.on_moved
    self.event_handler = handler
    self.observer = Observer()
    # Only directories can be observed; skip anything else silently.
    self.observedPaths = [p for p in self.runner.input if os.path.isdir(p)]
    for path in self.observedPaths:
        self.observer.schedule(self.event_handler, path, recursive=True)
    log("Monitoring {} in real time. Press Ctrl+C to exit.".format(
        ", ".join(self.observedPaths)))
def run(revert, mode, inp, y=False, threads=1, group="all", prefix=None,
        fastq=True, summary=False):
    """Top-level driver: find fast5 files under `inp` and (de)compress them.

    Confirms with the user unless `y` is set, processes serially or via a
    multiprocessing pool depending on `threads`, then logs before/after
    sizes. Returns 0 on completion; exits when the user cancels.
    """
    func, message = chooseCompressFunc(revert, mode, fastq, summary)
    fileList = recursiveFindFast5(inp)
    if len(fileList) == 0:
        return 0
    preSize = sum([os.path.getsize(f) for f in fileList])
    log("{} on {} files... ".format(message, len(fileList)))
    if y or checkSure():
        if threads <= 1:
            # serial path: no pool overhead for a single worker
            for f in fileList:
                compressWrapper([func, f, group, prefix])
        else:
            argList = [[func, f, group, prefix] for f in fileList]
            pool = Pool(threads)
            pool.map(compressWrapper, argList)
        # Label the size report according to the direction of the run.
        if revert:
            preStr, postStr = "Compressed size:", "Reverted size:"
        else:
            preStr, postStr = "Original size:", "Compressed size:"
        log("Complete.")
        # Output may live under a prefixed filename when `prefix` is set.
        postSize = sum([
            os.path.getsize(getPrefixedFilename(f, prefix)) for f in fileList
        ])
        # Align the two size lines into columns.
        str_len = max(len(preStr), len(postStr)) + 1
        num_len = len(str(max(preSize, postSize)))
        log("{}{}".format(preStr.ljust(str_len), str(preSize).rjust(num_len)))
        log("{}{}".format(postStr.ljust(str_len), str(postSize).rjust(num_len)))
        return 0
    else:
        log("User cancelled. Exiting.")
        exit(1)
def recursiveCheckEquivalent(file1, file2, name):
    """Recursively compare the object at path `name` in two open HDF5 files.

    Logs a "Failure: ..." line for every mismatch and returns the total
    number of failures found in this subtree.
    """
    obj1 = file1[name]
    # Bug fix: previously read obj2 from file1, so every comparison below
    # compared file1 against itself and could never report a difference.
    obj2 = file2[name]
    # check attributes
    attr1 = obj1.attrs
    attr2 = obj2.attrs
    attrsName = "/".join([name, "attrs"])
    exitcode = checkContents(attr1, attr2, attrsName)
    for key, value in attr1.items():
        try:
            if not attr2[key] == value:
                log("Failure: {} - file1={}, file2={}".format(
                    "/".join([attrsName, key]), value, attr2[key]))
                exitcode += 1
        except ValueError as e:
            # probably a numpy array: elementwise == yields an array whose
            # truth value is ambiguous, so compare with .all() instead
            if str(e) == "The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()" and not (attr2[key] == value).all():
                log("Failure: {} - file1={}, file2={}".format(
                    "/".join([attrsName, key]), value, attr2[key]))
                exitcode += 1
    # check subgroups / datasets
    if isGroup(obj1):
        exitcode += checkContents(obj1, obj2)
        for key in obj1.keys():
            exitcode += recursiveCheckEquivalent(file1, file2,
                                                 "/".join([name, key]))
    else:
        if not obj1.shape == obj2.shape:
            log("Failure: {}.shape - file1={}, file2={}".format(
                name, obj1.shape, obj2.shape))
            # Bug fix: a shape mismatch was logged but not counted,
            # so it did not affect the returned failure count.
            exitcode += 1
        if obj1.dtype.names is None:
            # just one column
            exitcode += checkData(obj1, obj2, name)
        else:
            for col in obj1.dtype.names:
                exitcode += checkData(obj1[col], obj2[col],
                                      ".".join([name, col]))
    return exitcode
def chooseCompressFunc(revert, mode, fastq, summary):
    """Select the (de)compression routine for the requested mode.

    Returns a tuple (callable, description). Exits the process when the
    mode cannot be serviced (raw revert, or no matching mode at all).
    """
    if revert:
        if mode == 'lossless':
            func = losslessDecompress
            name = "Performing lossless decompression"
        elif mode == 'deep-lossless':
            func = deepLosslessDecompress
            name = "Performing deep lossless decompression"
        else:
            # raw compression discards data; it cannot be reverted
            log("Unable to revert raw files. Please use a basecaller instead.")
            exit(1)
    else:
        if mode == 'lossless':
            func = losslessCompress
            name = "Performing lossless compression"
        elif mode == 'deep-lossless':
            func = deepLosslessCompress
            name = "Performing deep lossless compression"
        elif mode == 'raw':
            # raw mode: pick the variant by which outputs should be kept
            if fastq and summary:
                func = rawCompressFastqSummary
                name = "Performing raw compression with FASTQ and summary"
            elif fastq:
                func = rawCompressFastqNoSummary
                name = "Performing raw compression with FASTQ and no summary"
            elif summary:
                func = rawCompressSummaryNoFastq
                name = "Performing raw compression with summary and no FASTQ"
            else:
                func = rawCompressMinimal
                name = "Performing raw compression with no summary and no FASTQ"
    try:
        # `func`/`name` are unbound when no branch matched; the NameError
        # below turns that into a clean error message and exit.
        return func, name
    except NameError:
        log("No compression method selected")
        exit(1)
def stop(self):
    """Wait for the workers to finish, then hand their results to postprocess."""
    outcome = self.multiprocessor.join()
    log("Complete.")
    return self.postprocess(outcome)
def postprocess(self, results):
    """Log how many renames succeeded; returns the total processed count.

    `results` holds one per-file status code where 0 means success, so the
    success count is total minus the sum of failures.
    """
    succeeded = self.processed - sum(results)
    log("Successfully renamed {} of {} files.".format(succeeded,
                                                      self.processed))
    return self.processed
from __future__ import absolute_import
from subprocess import call, PIPE
from picopore.util import log

# Fail fast at import time if h5repack (from hdf5-tools) is not on PATH.
# `type` is a shell builtin, hence shell=True; exit status 0 means the
# command was found. Output is silenced via PIPE.
if not call("type h5repack", shell=True, stdout=PIPE, stderr=PIPE) == 0:
    log("h5repack (hdf5-tools) not installed. Aborting.")
    exit(1)