def __init__(
    self,
    fastx: Path,
    sketch: Path,
    prefix: str = 'sketchy',
    outdir: Path = Path('sketchy_out'),
    verbose: bool = False,
):
    PoreLogger.__init__(
        self, level=logging.INFO if verbose else logging.ERROR, name='Compute'
    )

    self.fastx = fastx
    self.sketch = sketch
    self.prefix = prefix
    self.outdir = outdir
    self.verbose = verbose

    self.logger.info(f'Sketchy wrapper v{__version__}')
    self.logger.info(f'Prefix: {prefix}')
    self.logger.info(f'Fastq file: {fastx.absolute()}')
    self.logger.info(f'Output directory: {outdir.absolute()}')

    self.outdir.mkdir(exist_ok=True, parents=True)
def __init__(
    self,
    index_file: Path = None,
    lineage_column: str = 'mlst',
    verbose: bool = True
):
    PoreLogger.__init__(
        self, level=logging.INFO if verbose else logging.ERROR
    )

    if index_file:
        self.index = pandas.read_csv(index_file, sep='\t', header=0)
        if 'idx' not in self.index.columns:
            self.logger.info('Adding Mash index column "idx" to genotype index')
            self.index.index = range(len(self.index))
            self.index.index.name = 'idx'
        else:
            self.logger.info('Using column "idx" as Mash index column in genotype index')
            self.index = self.index.set_index('idx')
    else:
        self.index = None

    self.lineage_column = lineage_column
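# A minimal sketch of the `idx` handling in the constructor above, run on an
# in-memory genotype table instead of a file on disk (column names and values
# here are illustrative, not part of this module):

import pandas

genotypes = pandas.DataFrame({'mlst': ['ST93', 'ST239'], 'sccmec': ['IV', 'III']})

if 'idx' not in genotypes.columns:
    # Mirror the constructor: assign sequential Mash indices 0..n-1
    genotypes.index = range(len(genotypes))
    genotypes.index.name = 'idx'
else:
    genotypes = genotypes.set_index('idx')

print(genotypes)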
def __init__(self, survey_directory: Path):
    PoreLogger.__init__(self)

    self.survey_data = SurveyData()
    self.logger.info(f'Parse survey directory: {survey_directory}')

    self.missing = '-'
    self.survey_data.read(survey_directory)
def __init__(self):
    PoreLogger.__init__(self)

    self.inter = pandas.DataFrame()     # MASH outputs, updated
    self.interim = pandas.DataFrame()   # MASH outputs, updated, with data
    self.lineage = Counter()            # Prime lineage counter
    self.genotype = dict()              # Genotype counters by lineage
    self.susceptibility = dict()        # Susceptibility counters by lineage
    self.continuous = list()

    self.start_time_regex = r'start_time=(.*)Z'
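# Tiny sketch of the lineage tally pattern set up above: a Counter keyed by
# lineage label, updated once per ranked Mash match (labels are made up):

from collections import Counter

lineage = Counter()
for match in ('ST93', 'ST93', 'ST239'):
    lineage.update([match])

print(lineage.most_common(1))  # -> [('ST93', 2)]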
def __init__(
    self,
    sssh: Path,
    index: Path,
    key: Path,
    stable: int = None,
    ssh: Path = None,
    verbose: bool = False
):
    PoreLogger.__init__(self, name="Evaluate")

    if verbose:
        self.logger.setLevel(level=logging.INFO)

    self.top_feature_values = 5
    self.read_limit = 1000
    self.preference_threshold = 0.6
    self.na_color = 'darkgray'
    self.stable = stable

    self.logger.info("Loading data for evaluations from Sketchy Rust")
    self.logger.info(f"Ranked sum of shared hashes: {sssh}")
    self.logger.info(f"Sum of shared hashes: {ssh}")
    self.logger.info(f"Genotype feature index: {index}")
    self.logger.info(f"Genotype feature key: {key}")

    self.feature_key = self.read_feature_key(file=key)  # key to headers and categories
    self.feature_index, self.feature_data = self.read_feature_index(file=index)

    self.ssh = self.read_ssh(file=ssh)
    self.sssh = self.read_sssh(file=sssh)

    self.features = self.feature_index.columns.tolist()

    if self.ssh is not None:
        # Merge ssh and feature index for heatmap
        self.ssh_features = self.ssh \
            .join(self.feature_data, how='inner') \
            .sort_values(['read', 'rank'])

        self.reads = len(self.ssh_features['read'].unique())
        self.ranks = len(self.ssh_features['rank'].unique())
def __init__(
    self,
    indir: Path = None,
    outdir: Path = None,
    limit: int = 1000,
    top: int = 10,
):
    self.top = top
    self.indir = indir
    self.limit = limit
    self.outdir = outdir

    if outdir:
        outdir.mkdir(parents=True, exist_ok=True)

    self.false_color = "#d9d9d9"
    self.logger = PoreLogger().logger

    self.reads: int = 0
    self.breakpoints = dict()
def link(iid: Path, directory, column, extension, outdir, symlink, bootstrap):
    """Link ID file to FASTA, e.g. from a filtered Pathfinder Survey"""

    if column is None:
        iids = pandas.read_csv(iid, index_col=0, header=0, dtype=str)
    else:
        iids = pandas.read_csv(iid, usecols=[column], header=0, sep='\t')
        iids.rename(columns={column: 'iid'}, inplace=True)

    if symlink:
        outdir.mkdir(exist_ok=True, parents=True)

    log = PoreLogger(level=logging.INFO).logger

    if bootstrap:
        isolates = random.choices(iids.iid, k=bootstrap)
    else:
        isolates = iids.iid

    for i in isolates:
        iid_path = directory / str(i + extension)
        if iid_path.exists():
            if symlink:
                sym_path = (outdir / str(i + extension)).absolute()
                log.info(f'Symlink: {iid_path.absolute()} to {sym_path}')
                os.symlink(str(iid_path.absolute()), str(sym_path))
            else:
                print(f"{iid_path}")
        else:
            log.debug(f'Could not find: {iid_path}')
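# Hedged sketch of the linking step in `link` above: symlink every FASTA from
# an ID list into an output directory. Paths, IDs, and the extension are
# illustrative assumptions, not values used by this module:

import os
from pathlib import Path

directory = Path('assemblies')   # assumed source directory of FASTA files
outdir = Path('linked')          # assumed output directory for symlinks
outdir.mkdir(parents=True, exist_ok=True)

for iid in ('isolate_a', 'isolate_b'):
    src = (directory / f'{iid}.fasta').absolute()
    if src.exists():
        os.symlink(str(src), str((outdir / src.name).absolute()))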
def fx_sort(fastx, output):
    """Sort reads by 'start_time' in the basecalled header from Guppy"""

    logger = PoreLogger(level=logging.INFO).logger

    logger.info('Creating index for random access to reads')
    fx, build_read = create_fastx_index(fastx=fastx)

    logger.info('Creating start time index for all reads')
    sim = SketchySimulator(fastx=fastx)
    run_index = sim.get_run_index()

    logger.info('Writing reads sorted by start time')
    with get_output_handle(fpath=output) as fout:
        for i, row in run_index.iterrows():
            read_str = build_read(
                fx[row['name']], comment=sim.create_header_comment(row)
            )
            fout.write(read_str + '\n')
    logger.info('Completed')

    # Remove the on-disk index created for random access to reads
    Path(str(fastx) + '.fxi').unlink()
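# Sketch of the start-time sorting idea behind `fx_sort`, assuming reads carry
# a Guppy-style 'start_time=...Z' field in the header comment (read names and
# timestamps below are made up):

import re
from datetime import datetime

reads = [
    ('read_b', 'ch=1 start_time=2020-01-01T10:05:00Z'),
    ('read_a', 'ch=2 start_time=2020-01-01T10:00:00Z'),
]

def start_time(comment: str) -> datetime:
    stamp = re.search(r'start_time=(.*)Z', comment).group(1)
    return datetime.fromisoformat(stamp)

for name, comment in sorted(reads, key=lambda r: start_time(r[1])):
    print(name)  # read_a, then read_b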
def __init__(
    self,
    sketch_path=Path.home() / '.sketchy',
    full: bool = False,
    verbose: bool = True
):
    ########################################
    # Public Google Cloud Storage Settings #
    ########################################

    self.bucket_name = 'sketchy-sketch'
    self.sketches = ['kpneumoniae', 'saureus']

    self.full = full
    self.pl = PoreLogger(logging.INFO if verbose else logging.ERROR)
    self.sketch_path = sketch_path
def __init__(self, sketch_path=Path.home() / '.sketchy' / 'db'):
    ########################################
    # Public Google Cloud Storage Settings #
    ########################################

    self.base_url = 'https://storage.googleapis.com/np-core-sketchy/'
    self.sketch_files = {
        'kleb': 'kleb.default.msh',
        'mrsa': 'mrsa.default.msh',
        'tb': 'tb.default.msh'
    }

    self.pl = PoreLogger()
    self.sketch_path = sketch_path
def merge(sketch, features, key, prefix, index_column, mash_column, verbose):
    """Merge sketch and feature data by common indices"""

    pl = PoreLogger(level=logging.INFO if verbose else logging.ERROR).logger

    pl.info(f'Extracting data from sketch: {sketch}')
    run_cmd(f'mash info -t {sketch} > {prefix}.mashinfo', shell=True)

    pl.info('Reading and converting data indices from sketch')
    converters = {'id': lambda x: Path(x).stem}
    mash_info = pandas.read_csv(
        f'{prefix}.mashinfo',
        sep='\t',
        header=None,
        skiprows=1,
        index_col=0,
        engine='c',
        usecols=[2],
        names=['id'],
        converters=converters,
    )

    pl.info('Assigning sequential indices to index column: `idx`')
    mash_info['idx'] = [i for i in range(len(mash_info))]
    mash_info['ids'] = mash_info.index.tolist()
    nsketch = len(mash_info)

    pl.info(f'Ordered merge on column {index_column} with feature file {features}')
    d = pandas.read_csv(features, sep='\t')
    ndata = len(d)

    mash_info = d.merge(
        mash_info, left_on=index_column, right_on=mash_column, how='inner'
    )
    pl.info('Merged data and sketch information')

    # Clean up suffixed duplicate columns from the merge
    if 'idx_y' in mash_info.columns:
        mash_info = mash_info.drop(columns='idx_x')
        mash_info = mash_info.rename(columns={'idx_y': 'idx'})
    if 'ids_y' in mash_info.columns:
        mash_info = mash_info.drop(columns='ids_y')
    if 'ids_x' in mash_info.columns:
        mash_info = mash_info.drop(columns='ids_x')

    mash_info = mash_info.sort_values('idx')
    mash_info.index = mash_info['idx']
    mash_info = mash_info.drop(columns='idx')

    if key is not None:
        key_table = pandas.read_csv(key, sep='\t', header=0)
        mash_info = mash_info.merge(key_table, left_on='ids', right_on='uuid')
        mash_info.drop(columns=['uuid', 'fasta'], inplace=True)
        mash_info.rename(columns={'id': 'key'}, inplace=True)

    pl.info(f'Writing merged feature index to: {prefix}.tsv')
    mash_info.to_csv(f'{prefix}.tsv', sep='\t', header=True, index=True)

    pl.info(f'Merged sketch data ({nsketch}) and feature data ({ndata})')
    pl.info(f'Final sketch and feature size is {len(mash_info)}')
    pl.info(f'Removed features not present in sketch: {ndata - len(mash_info)}')

    pl.info(f'Removing temporary file {prefix}.mashinfo')
    os.remove(f'{prefix}.mashinfo')
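# Minimal sketch of the inner merge and index ordering performed in `merge`
# above, with two toy frames standing in for the sketch and feature tables
# (all names and values are illustrative):

import pandas

sketch_info = pandas.DataFrame({'ids': ['s1', 's2', 's3'], 'idx': [0, 1, 2]})
features = pandas.DataFrame({'ids': ['s3', 's1'], 'mlst': ['ST80', 'ST93']})

merged = features.merge(sketch_info, on='ids', how='inner')
merged = merged.sort_values('idx').set_index('idx')

print(merged)  # rows present in both tables, ordered by sketch index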
def monitor(interval, terminate, early, prefix, log):
    """Monitor benchmarks during a Sketchy execution (Mash, Sketchy)"""

    # A bit hacky because Sketchy calls Mash in parallel, and there is
    # currently no return of a specific PID from Mash.
    # Essentially monitors ANY command that contains: mash, sketchy-rs
    # If multiple commands are running, resource usage is computed across all

    if interval < 0.1:
        raise ValueError('Interval (--interval, -i) must be >= 0.1')

    start_time = time.time()

    logfile = f'{prefix}.log' if log else None
    outfile = f'{prefix}.tsv'

    pore = PoreLogger(level=logging.INFO, file=logfile)

    data = []
    try:
        pore.logger.info('CPU\tMEM\tMash\t\tSketchy\t')
        pore.logger.info('---\t---\t----\t\t-------\t')
        while True:
            run_time = time.time() - start_time
            if early and run_time > early:
                pore.logger.info(f'Early termination after {early} seconds.')
                exit(0)

            sketchy_pids, mash_pids = check_pids()

            # Terminate if no matching processes remain
            if not sketchy_pids and not mash_pids:
                if terminate:
                    pore.logger.info('No more processes found. Exiting.')
                    summarize_data(data, outfile, pore.logger)
                    exit(0)
            else:
                try:
                    cpu1, rss1 = get_resource_use(mash_pids)
                    cpu2, rss2 = get_resource_use(sketchy_pids)

                    cpu_total = cpu1 + cpu2
                    rss_total = rss1 + rss2

                    pore.logger.info(
                        f'{round(cpu_total, 1)}\t{round(rss_total, 1)}\t'
                        f'{round(cpu1, 1)}\t{round(rss1, 1)}\t'
                        f'{round(cpu2, 1)}\t{round(rss2, 1)}'
                    )
                    data.append([
                        run_time, cpu_total, rss_total, cpu1, rss1, cpu2, rss2
                    ])
                except psutil.NoSuchProcess:
                    if terminate:
                        pore.logger.info('No more processes found. Exiting.')
                        summarize_data(data, outfile, pore.logger)
                        exit(0)

            time.sleep(interval)
    except KeyboardInterrupt:
        summarize_data(data, outfile, pore.logger)
        exit(0)
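# Hedged sketch of the resource polling used by `monitor`: sum CPU and RSS
# across all processes whose command line contains a target string. This is
# an illustration of the technique, not the actual `get_resource_use`:

import psutil

def poll_usage(target: str):
    cpu, rss = 0.0, 0.0
    for proc in psutil.process_iter(['cmdline']):
        try:
            if target in ' '.join(proc.info['cmdline'] or []):
                cpu += proc.cpu_percent(interval=0.1)
                rss += proc.memory_info().rss / 1e6  # resident set size, MB
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            continue
    return cpu, rss

print(poll_usage('mash'))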
def __init__(self):
    PoreLogger.__init__(self)
    self.data: pandas.DataFrame = pandas.DataFrame()
def __init__(self):
    PoreLogger.__init__(self)
    self.watcher = None  # active watcher
def predict(fastq, sketch, data, prefix, tmp, keep, cores, threads, reads, top, sketchy):
    """Lineage hashing from uncorrected nanopore reads (offline)"""

    if reads == 0:
        reads = None

    pl = PoreLogger()

    sketch_path = Path(sketch)
    tmp.mkdir(parents=True, exist_ok=True)

    if sketch in ('kleb', 'mrsa', 'tb'):
        sketch_path = sketchy / 'db' / f'{sketch}.default.msh'
        data = sketchy / 'data' / f'{sketch}.data.tsv'

    if not fastq.exists():
        click.echo(f'File {fastq} does not exist.')
        exit(1)
    if not sketch_path.exists():
        click.echo(f'Mash sketch {sketch_path} does not exist.')
        exit(1)

    if fastq.suffix == '.gz':
        # Decompress into the temporary directory; keep only the file name
        # so the output lands inside the temporary directory
        tmp_path = tmp / fastq.with_suffix('').name
        pl.logger.debug(f'Decompressing file {fastq} to {tmp_path}')
        with pysam.FastxFile(fastq) as fin, open(tmp_path, mode='w') as fout:
            for entry in fin:
                string_out = str(entry)
                if not string_out.endswith('\n'):
                    string_out += '\n'
                fout.write(string_out)
        fastq = tmp_path

    if reads is None:
        reads = get_total_reads(fastq)

    try:
        ms = MashScore()
        pl.logger.info('Compute min-wise shared hashes against sketch ...')
        _ = ms.run(
            fastq=fastq,
            nreads=reads,
            sketch=sketch_path,
            cores=cores,
            top=top,  # direct mode only
            mode='single',
            data=data,
            tmpdir=tmp,
            ncpu=threads,
        )

        se = SampleEvaluator(indir=tmp, limit=reads, top=top, sketch_data=data)
        se.top_ssh.to_csv(f'{prefix}.tsv', sep='\t')
    except KeyboardInterrupt:
        if not keep:
            shutil.rmtree(tmp)
        exit(0)
    else:
        if not keep:
            shutil.rmtree(tmp)
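# Sketch of the .gz handling in `predict`: pysam reads gzipped FASTQ
# transparently, so "decompression" amounts to re-writing entries as plain
# text (the file paths below are assumptions for illustration):

from pathlib import Path
import pysam

fastq = Path('reads.fastq.gz')
plain = Path('tmp') / fastq.with_suffix('').name

plain.parent.mkdir(parents=True, exist_ok=True)
with pysam.FastxFile(str(fastq)) as fin, open(plain, 'w') as fout:
    for entry in fin:
        fout.write(str(entry).rstrip('\n') + '\n')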