Example #1
    def __init__(
        self,
        fastx: Path,
        sketch: Path,
        prefix: str = 'sketchy',
        outdir: Path = Path('sketchy_out'),
        verbose: bool = False,
    ):

        PoreLogger.__init__(
            self,
            level=logging.INFO if verbose else logging.ERROR,
            name='Compute'
        )

        self.fastx = fastx
        self.sketch = sketch
        self.prefix = prefix
        self.outdir = outdir
        self.verbose = verbose

        self.logger.info(f'Sketchy wrapper v{__version__}')
        self.logger.info(f'Prefix: {prefix}')
        self.logger.info(f'Fastx file: {fastx.absolute()}')
        self.logger.info(f'Output directory: {outdir.absolute()}')

        self.outdir.mkdir(exist_ok=True, parents=True)
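
A minimal, self-contained sketch of the verbosity pattern used above, where `logging.INFO if verbose else logging.ERROR` gates the wrapper's log output; make_logger is a hypothetical stand-in for PoreLogger:

import logging

def make_logger(name: str, verbose: bool) -> logging.Logger:
    # INFO messages appear only when verbose=True; otherwise the logger
    # is raised to ERROR and the info() calls above become no-ops
    logger = logging.getLogger(name)
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO if verbose else logging.ERROR)
    return logger

make_logger('Compute', verbose=True).info('Prefix: sketchy')    # printed
make_logger('Quiet', verbose=False).info('suppressed message')  # silent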
Example #2
    def __init__(
        self,
        index_file: Path = None,
        lineage_column: str = 'mlst',
        verbose: bool = True
    ):

        PoreLogger.__init__(
            self, level=logging.INFO if verbose else logging.ERROR
        )

        if index_file:
            self.index = pandas.read_csv(
                index_file, sep='\t', header=0
            )
            if 'idx' not in self.index.columns:
                self.logger.info('Adding Mash index column "idx" to genotype index')
                self.index.index = range(len(self.index))
                self.index.index.name = 'idx'
            else:
                self.logger.info('Using column "idx" as Mash index column in genotype index')
                self.index = self.index.set_index('idx')
        else:
            self.index = None

        self.lineage_column = lineage_column
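
Since pandas.DataFrame.set_index returns a new frame rather than modifying in place, the corrected branch above reassigns self.index. A minimal sketch of both index paths on a tiny in-memory table (hypothetical data) instead of a real index file:

import io
import pandas

tsv = "mlst\tresistance\nST93\tS\nST239\tR\n"
index = pandas.read_csv(io.StringIO(tsv), sep='\t', header=0)

if 'idx' not in index.columns:
    index.index = range(len(index))  # assign sequential Mash indices
    index.index.name = 'idx'
else:
    index = index.set_index('idx')   # set_index returns a new frame

print(index)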
Example #3
    def __init__(self, survey_directory: Path):

        PoreLogger.__init__(self)

        self.survey_data = SurveyData()

        self.logger.info(f'Parse survey directory: {survey_directory}')

        self.missing = '-'

        self.survey_data.read(survey_directory)
Example #4
    def __init__(self):

        PoreLogger.__init__(self)

        self.inter = pandas.DataFrame()    # MASH outputs, updated
        self.interim = pandas.DataFrame()  # MASH outputs, updated, with data

        self.lineage = Counter()        # Prime lineage counter
        self.genotype = dict()          # Genotype counters by lineage
        self.susceptibility = dict()    # Susceptibility counters by lineage

        self.continuous = list()
        self.start_time_regex = r'start_time=(.*)Z'
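
The start_time_regex field implies read headers in the Guppy basecaller style. A short sketch (with a hypothetical header comment) of how that pattern extracts the timestamp:

import re

comment = 'runid=a1b2 read=42 ch=101 start_time=2020-01-31T04:21:33Z'
match = re.search(r'start_time=(.*)Z', comment)
if match:
    print(match.group(1))  # 2020-01-31T04:21:33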
Example #5
    def __init__(
        self,
        sssh: Path,
        index: Path,
        key: Path,
        stable: int = None,
        ssh: Path = None,
        verbose: bool = False
    ):

        PoreLogger.__init__(self, name="Evaluate")

        if verbose:
            self.logger.setLevel(level=logging.INFO)

        self.top_feature_values = 5
        self.read_limit = 1000
        self.preference_threshold = 0.6
        self.na_color = 'darkgray'

        self.stable = stable

        self.logger.info(f"Loading data for evaluations from Sketchy Rust")
        self.logger.info(f"Ranked sum of shared hashes: {sssh}")
        self.logger.info(f"Sum of shared hashes: {ssh}")
        self.logger.info(f"Genotype feature index: {index}")
        self.logger.info(f"Genotype feature key: {key}")

        self.feature_key = self.read_feature_key(file=key)  # key to headers and categories
        self.feature_index, self.feature_data = self.read_feature_index(file=index)

        self.ssh = self.read_ssh(file=ssh)
        self.sssh = self.read_sssh(file=sssh)

        self.features = self.feature_index.columns.tolist()

        if self.ssh is not None:
            # Merge ssh and feature index for heatmap
            self.ssh_features = self.ssh \
                .join(self.feature_data, how='inner') \
                .sort_values(['read', 'rank'])

            self.reads = len(
                self.ssh_features['read'].unique()
            )

            self.ranks = len(
                self.ssh_features['rank'].unique()
            )
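
A minimal sketch of the ssh/feature merge at the end of this example, with tiny hypothetical frames standing in for the Sketchy Rust outputs; the inner join keeps only rows whose Mash index carries feature annotations:

import pandas

ssh = pandas.DataFrame(
    {'read': [1, 1, 2], 'rank': [0, 1, 0], 'shared': [12, 9, 7]},
    index=[3, 5, 3],  # Mash genome indices
)
feature_data = pandas.DataFrame({'mlst': ['ST93', 'ST239']}, index=[3, 5])

ssh_features = ssh.join(feature_data, how='inner').sort_values(['read', 'rank'])
print(len(ssh_features['read'].unique()), len(ssh_features['rank'].unique()))  # 2 2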
Example #6
    def __init__(
        self,
        indir: Path = None,
        outdir: Path = None,
        limit: int = 1000,
        top: int = 10,
    ):

        self.top = top
        self.indir = indir
        self.limit = limit
        self.outdir = outdir

        if outdir:
            outdir.mkdir(parents=True, exist_ok=True)

        self.false_color = "#d9d9d9"
        self.logger = PoreLogger().logger
        self.reads: int = 0
        self.breakpoints = dict()
Example #7
def link(iid: Path, directory, column, extension, outdir, symlink, bootstrap):
    """ Link ID file to FASTA, e.g. from filtered Pathfinder Survey """

    if column is None:
        iids = pandas.read_csv(iid, index_col=0, header=0, dtype=str)
    else:
        iids = pandas.read_csv(iid, usecols=[column], header=0, sep='\t', dtype=str)
        iids.rename(columns={column: 'iid'}, inplace=True)

    if symlink:
        outdir.mkdir(exist_ok=True, parents=True)

    log = PoreLogger(level=logging.INFO).logger

    if bootstrap:
        isolates = random.choices(iids.iid.tolist(), k=bootstrap)
    else:
        isolates = iids.iid

    for i in isolates:
        iid_path = directory / str(i + extension)
        if iid_path.exists():
            if symlink:
                sym_path = (outdir / str(i + extension)).absolute()
                log.info(
                    f'Symlink: {iid_path.absolute()} to {sym_path.absolute()}')
                os.symlink(str(iid_path.absolute()), str(sym_path.absolute()))
            else:
                print(f"{iid_path}")
        else:
            log.debug(f'Could not find: {iid_path}')
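
The bootstrap branch samples isolate IDs with replacement, so k may exceed the population size. A two-line sketch with hypothetical IDs:

import random

iids = ['ERR01', 'ERR02', 'ERR03']
print(random.choices(iids, k=5))  # e.g. ['ERR02', 'ERR02', 'ERR01', 'ERR03', 'ERR02']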
Example #8
def fx_sort(fastx, output):
    """ Sort reads by 'start_time' in basecalled header from Guppy"""

    logger = PoreLogger(level=logging.INFO).logger

    logger.info('Creating index for random access to reads')
    fx, build_read = create_fastx_index(fastx=fastx)

    logger.info('Creating start time index for all reads')
    sim = SketchySimulator(fastx=fastx)
    run_index = sim.get_run_index()

    logger.info('Writing reads sorted by start time')
    with get_output_handle(fpath=output) as fout:
        for i, row in run_index.iterrows():
            read_str = build_read(fx[row['name']],
                                  comment=sim.create_header_comment(row))
            fout.write(read_str + '\n')

    logger.info('Completed')

    # Remove the temporary '.fxi' index created for random access
    Path(str(fastx) + '.fxi').unlink()
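
create_fastx_index, SketchySimulator, and get_output_handle are project helpers not shown here. The core idea, ordering reads by the ISO timestamp in their header comments, can be sketched independently with the standard library (hypothetical comments):

from datetime import datetime

comments = [
    'read=2 start_time=2020-01-31T04:25:00Z',
    'read=1 start_time=2020-01-31T04:21:33Z',
]

def start_time(comment: str) -> datetime:
    # Parse the ISO timestamp between 'start_time=' and the trailing 'Z'
    return datetime.fromisoformat(comment.split('start_time=')[1].rstrip('Z'))

for comment in sorted(comments, key=start_time):
    print(comment)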
Example #9
    def __init__(self,
                 sketch_path: Path = Path.home() / '.sketchy',
                 full: bool = False,
                 verbose: bool = True):

        ########################################
        # Public Google Cloud Storage Settings #
        ########################################

        self.bucket_name = 'sketchy-sketch'

        self.sketches = ['kpneumoniae', 'saureus']
        self.full = full

        self.pl = PoreLogger(logging.INFO if verbose else logging.ERROR)
        self.sketch_path = sketch_path
Example #10
    def __init__(self, sketch_path: Path = Path.home() / '.sketchy' / 'db'):

        ########################################
        # Public Google Cloud Storage Settings #
        ########################################

        self.base_url = 'https://storage.googleapis.com/np-core-sketchy/'

        self.sketch_files = {
            'kleb': 'kleb.default.msh',
            'mrsa': 'mrsa.default.msh',
            'tb': 'tb.default.msh'
        }

        self.pl = PoreLogger()
        self.sketch_path = sketch_path
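
Both settings classes above only store bucket locations; the download step itself is not shown. A hedged sketch of how one of the listed sketch files might be fetched with the standard library (an assumption, not necessarily the project's downloader):

from pathlib import Path
from urllib.request import urlretrieve

base_url = 'https://storage.googleapis.com/np-core-sketchy/'
sketch_path = Path.home() / '.sketchy' / 'db'
sketch_path.mkdir(parents=True, exist_ok=True)

fname = 'kleb.default.msh'
urlretrieve(base_url + fname, sketch_path / fname)  # fetch to local sketch path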
Example #11
def merge(sketch, features, key, prefix, index_column, mash_column, verbose):

    """ Merge sketch and feature data by common indices """

    pl = PoreLogger(level=logging.INFO if verbose else logging.ERROR).logger

    pl.info(f'Extracting data from sketch: {sketch}')
    run_cmd(f'mash info -t {sketch} > {prefix}.mashinfo', shell=True)

    pl.info(f'Reading and converting data indices from sketch')
    converters = {'id': lambda x: Path(x).stem}
    mash_info = pandas.read_csv(
        f'{prefix}.mashinfo',
        sep='\t',
        header=None,
        skiprows=1,
        index_col=0,
        engine='c',
        usecols=[2],
        names=['id'],
        converters=converters,
    )

    pl.info(f'Assigning sequential indices to index column: `idx`')
    mash_info['idx'] = range(len(mash_info))
    mash_info['ids'] = mash_info.index.tolist()

    nsketch = len(mash_info)

    pl.info(f'Ordered merge on column {index_column} with feature file {features}')
    d = pandas.read_csv(features, sep='\t')

    ndata = len(d)

    mash_info = d.merge(
        mash_info, left_on=index_column, right_on=mash_column, how='inner'
    )
    pl.info('Merged data and sketch information')
    if 'idx_y' in mash_info.columns:
        mash_info = mash_info.drop(columns="idx_x")
        mash_info = mash_info.rename(columns={'idx_y': 'idx'})

    if 'ids_y' in mash_info.columns:
        mash_info = mash_info.drop(columns=["ids_y"])

    if "ids_x" in mash_info.columns:
        mash_info = mash_info.drop(columns=["ids_x"])

    mash_info = mash_info.sort_values('idx')
    mash_info.index = mash_info['idx']
    mash_info = mash_info.drop(columns='idx')

    if key is not None:
        key_table = pandas.read_csv(
            key, sep='\t', header=0
        )
        mash_info = mash_info.merge(
            key_table, left_on='ids', right_on='uuid'
        )
        mash_info.drop(columns=['uuid', 'fasta'], inplace=True)
        mash_info.rename(columns={'id': 'key'}, inplace=True)

    pl.info(f'Writing merged feature index to: {prefix}.tsv')
    mash_info.to_csv(
        f'{prefix}.tsv',
        sep='\t',
        header=True,
        index=True,
    )

    pl.info(f'Merged sketch data ({nsketch}) and feature data ({ndata})')
    pl.info(f'Final sketch and feature size is {len(mash_info)}')
    pl.info(f'Removed features not present in sketch: {ndata - len(mash_info)}')
    pl.info(f'Removing temporary file {prefix}.mashinfo')
    os.remove(f'{prefix}.mashinfo')
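
Because the merge above uses how='inner', feature rows without a matching sketch entry are silently dropped, which is exactly what the corrected 'removed features' count measures. A tiny sketch with hypothetical frames:

import pandas

sketch = pandas.DataFrame({'ids': ['a', 'b'], 'idx': [0, 1]})
features = pandas.DataFrame({'id': ['a', 'b', 'c'], 'mlst': ['ST1', 'ST2', 'ST3']})

merged = features.merge(sketch, left_on='id', right_on='ids', how='inner')
print(len(features) - len(merged))  # 1 feature row removed (id 'c')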
Example #12
def monitor(interval, terminate, early, prefix, log):
    """ Monitor benchmarks during a Sketchy execution (Mash, Sketchy) """

    # This is somewhat hacky: Sketchy calls Mash in parallel and does not
    # report specific Mash PIDs, so this monitors ANY command containing
    # 'mash' or 'sketchy-rs'. If multiple such commands are running,
    # resource usage is summed across all of them.

    if interval < 0.1:
        raise ValueError('Interval (--interval, -i) must be >= 0.1')

    start_time = time.time()

    if log:
        logfile = f'{prefix}.log'
    else:
        logfile = None

    outfile = f'{prefix}.tsv'

    pore = PoreLogger(level=logging.INFO, file=logfile)

    data = []

    try:
        pore.logger.info('CPU\tMEM\tMash\t\tSketchy\t')
        pore.logger.info('---\t---\t----\t\t-------\t')
        while True:
            run_time = time.time() - start_time
            if early and run_time > early:
                pore.logger.info(
                    f'Early termination after {early} seconds.')
                exit(0)

            sketchy_pids, mash_pids = check_pids()

            # Early termination if no processes
            if not sketchy_pids and not mash_pids:
                if terminate:
                    pore.logger.info('No more processes found. Exiting.')
                    summarize_data(data, outfile, pore.logger)
                    exit(0)
            else:
                try:
                    cpu1, rss1 = get_resource_use(mash_pids)
                    cpu2, rss2 = get_resource_use(sketchy_pids)

                    cpu_total = cpu1 + cpu2
                    rss_total = rss1 + rss2

                    pore.logger.info(
                        f'{round(cpu_total, 1)}\t{round(rss_total, 1)}\t'
                        f'{round(cpu1, 1)}\t{round(rss1, 1)}\t'
                        f'{round(cpu2, 1)}\t{round(rss2, 1)}')

                    data.append([
                        run_time, cpu_total, rss_total, cpu1, rss1, cpu2, rss2
                    ])
                except psutil.NoSuchProcess:
                    if terminate:
                        pore.logger.info('No more processes found. Exiting.')
                        summarize_data(data, outfile, pore.logger)
                        exit(0)
                    time.sleep(interval)

            time.sleep(interval)

    except KeyboardInterrupt:
        summarize_data(data, outfile, pore.logger)
        exit(0)
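
check_pids and get_resource_use are helpers not shown in this example. A plausible sketch of get_resource_use with psutil (an assumption about the implementation, not the project's code):

import psutil

def get_resource_use(pids):
    # Sum CPU percent and resident memory (MB) across the given processes
    cpu, rss = 0.0, 0.0
    for pid in pids:
        proc = psutil.Process(pid)
        cpu += proc.cpu_percent(interval=0.1)
        rss += proc.memory_info().rss / 1e6
    return cpu, rss

pids = [p.pid for p in psutil.process_iter(['name'])
        if p.info['name'] and 'python' in p.info['name']]
print(get_resource_use(pids))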
Example #13
    def __init__(self):

        PoreLogger.__init__(self)

        self.data: pandas.DataFrame = pandas.DataFrame()
Example #14
    def __init__(self):

        PoreLogger.__init__(self)
        self.watcher = None  # active watcher
Example #15
def predict(fastq, sketch, data, prefix, tmp, keep, cores, threads, reads, top,
            sketchy):
    """ Lineage hashing from uncorrected nanopore reads (offline) """

    if reads == 0:
        reads = None

    pl = PoreLogger()
    sketch_path = Path(sketch)
    tmp.mkdir(parents=True, exist_ok=True)

    if sketch in ('kleb', 'mrsa', 'tb'):
        sketch_path = sketchy / 'db' / f'{sketch}.default.msh'
        data = sketchy / 'data' / f'{sketch}.data.tsv'

    if not fastq.exists():
        click.echo(f'File {fastq} does not exist.')
        exit(1)

    if not sketch_path.exists():
        click.echo(f'Mash sketch {sketch_path} does not exist.')
        exit(1)

    if fastq.suffix == '.gz':
        # Unpack into temporary directory
        tmp_path = tmp / fastq.with_suffix('').name
        pl.logger.debug(f'Decompressing file {fastq} to {tmp_path}')
        with pysam.FastxFile(fastq) as fin, \
                open(tmp_path, mode='w') as fout:
            for entry in fin:
                string_out = str(entry)
                if not string_out.endswith('\n'):
                    string_out += '\n'
                fout.write(string_out)

        fastq = tmp_path

    if reads is None:
        reads = get_total_reads(fastq)

    try:

        ms = MashScore()
        pl.logger.info('Compute min-wise shared hashes against sketch ...')

        _ = ms.run(
            fastq=fastq,
            nreads=reads,
            sketch=sketch_path,
            cores=cores,
            top=top,  # direct mode only
            mode='single',
            data=data,
            tmpdir=tmp,
            ncpu=threads,
        )

        se = SampleEvaluator(indir=tmp, limit=reads, top=top, sketch_data=data)

        se.top_ssh.to_csv(f'{prefix}.tsv', sep='\t')

    except KeyboardInterrupt:
        if not keep:
            shutil.rmtree(tmp)
        exit(0)
    else:
        if not keep:
            shutil.rmtree(tmp)
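
get_total_reads is another helper not shown here; a plausible sketch (an assumption) that counts records with pysam, which reads plain and gzipped FASTQ transparently:

import pysam

def get_total_reads(fastq) -> int:
    # Count records by iterating the file once
    with pysam.FastxFile(str(fastq)) as fin:
        return sum(1 for _ in fin)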