    def __init__(self, config, aligner=None):
        self.config = config
        self.genome = Genome(self.config)
        if aligner is not None:
            self.aligner = aligner
        else:
            assert self.genome.aligner is not None
            self.aligner = self.genome.aligner
        assert self.aligner is not None

    def run(self):
        for rec in self.input_function_generator:
            out = Genome.to_fasta_string(rec)

            self.aligner_input.write(out)
            self.aligner_input.flush()

        self.aligner_input.close()
Example #3
    def __init__(self, config):
        self.config = config
        self.summary_filename = self.config.data_10x.data_10x_cell_summary
        self.bam_filename = self.config.data_10x.data_10x_bam
        self.bai_filename = self.config.data_10x.data_10x_bai

        assert os.path.exists(self.summary_filename), self.summary_filename
        assert os.path.exists(self.bam_filename), self.bam_filename
        assert os.path.exists(self.bai_filename), self.bai_filename

        self.summary_df = pd.read_csv(self.summary_filename, sep=',')

        self.barcodes = {
            k: v
            for (k, v) in self.summary_df[['barcode', 'cell_id']].to_records(
                index=False)
        }
        self.genome = Genome(self.config)
        assert self.genome is not None
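
A standalone toy version of the barcode-to-cell_id mapping built above; the two-row DataFrame stands in for the 10x cell summary CSV:

import pandas as pd

summary_df = pd.DataFrame({
    'barcode': ['AAACCTGA-1', 'AAACGGGC-1'],
    'cell_id': [1, 2],
})
# same dict comprehension as in the constructor: barcode -> cell_id
barcodes = {
    k: v
    for (k, v) in summary_df[['barcode', 'cell_id']].to_records(index=False)
}
print(barcodes)  # {'AAACCTGA-1': 1, 'AAACGGGC-1': 2}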
Example #4
class BinsPipeline(object):
    def __init__(self, config):
        self.config = config
        self.genome = Genome(self.config)

    def calc_bins_gc_content(self, chroms, bins_df):

        result = []
        for chrom in chroms:
            chrom_df = bins_df[bins_df['bin.chrom'] == chrom]
            gc_df = chrom_df.copy()
            gc_df.reset_index(inplace=True, drop=True)

            gc_series = pd.Series(index=gc_df.index, dtype=float)
            chrom_seq = self.genome.load_chrom(chrom)

            for index, row in gc_df.iterrows():
                start = row['bin.start']
                end = row['bin.end']
                seq = chrom_seq.seq[start:end]
                counts = [seq.count(x) for x in ['G', 'C', 'A', 'T']]
                total_counts = sum(counts)
                if total_counts == 0:
                    gc = 0.0
                else:
                    gc = float(sum(counts[0:2])) / total_counts
                gc_series.iloc[index] = gc

            gc_df['gc.content'] = gc_series
            result.append(gc_df)
        assert len(result) > 0
        if len(result) == 1:
            return result[0]
        df = pd.concat(result)
        return df

    def bins_boundaries_generator(self, chroms, mappable_regions_df):
        chrom_sizes = self.genome.chrom_sizes()
        chrom_bins = self.genome.calc_chrom_bins()

        # if mappable_regions_df is None:
        #     mappable_regions_df = self.load_mappable_regions()

        for chrom in chroms:
            chrom_df = mappable_regions_df[mappable_regions_df.chrom == chrom]
            chrom_df = chrom_df.sort_values(
                by=['chrom', 'start_pos', 'end_pos'])

            params = BinParams.build(chrom_size=chrom_sizes[chrom],
                                     chrom_bin=chrom_bins[chrom])
            mappable_bin = None
            current_excess = 0
            bins_count = params.bins_count

            for row in chrom_df.to_dict(orient="records"):
                if mappable_bin is None:
                    mappable_bin = MappableBin.from_start(params, start_pos=0)
                    current_excess = mappable_bin.adapt_excess(current_excess)
                if not mappable_bin.check_extend(row):
                    next_bin = mappable_bin.split_extend(row)

                    bins_count -= 1
                    if bins_count == 0:
                        # last bin of the chromosome
                        mappable_bin.end_pos = chrom_sizes[chrom].size
                    yield mappable_bin
                    if next_bin.is_overfill():
                        current_excess, mappable_bins = \
                            next_bin.overfill_split(current_excess)

                        assert len(mappable_bins) > 1
                        for mb in mappable_bins[:-1]:
                            bins_count -= 1
                            yield mb
                        mappable_bin = mappable_bins[-1]
                    else:
                        mappable_bin = next_bin
                        current_excess = \
                            mappable_bin.adapt_excess(current_excess)
                # print("mappable_bin:", row, mappable_bin)
            mappable_bin = None

    def calc_bins_boundaries(self, chroms=None, regions_df=None):
        if chroms is None:
            chroms = self.genome.version.CHROMS
        bin_rows = []
        for mbin in self.bins_boundaries_generator(chroms, regions_df):
            # print("mbin:", mbin)
            bin_rows.append(
                (mbin.chrom, mbin.start_pos, mbin.start_abspos, mbin.end_pos,
                 mbin.end_pos - mbin.start_pos, mbin.bin_size))

        df = pd.DataFrame.from_records(bin_rows,
                                       columns=[
                                           'bin.chrom', 'bin.start',
                                           'bin.start.abspos', 'bin.end',
                                           'bin.length', 'mappable.positions'
                                       ])
        df.sort_values(by=['bin.start.abspos'], inplace=True)
        return df

    def load_mappable_regions(self, chrom=None):
        filename = self.config.mappable_regions_filename(chrom=chrom)

        df = pd.read_csv(filename,
                         names=['chrom', 'start_pos', 'end_pos'],
                         sep='\t')
        df = df.sort_values(by=['chrom', 'start_pos', 'end_pos'])
        assert len(df) > 0

        return df

    def run_once(self, chrom):
        print(
            colored(f"started calculating bins for chromosome {chrom}",
                    "green"))
        regions_df = self.load_mappable_regions(chrom=chrom)
        bins_df = self.calc_bins_boundaries([chrom], regions_df)
        df = self.calc_bins_gc_content([chrom], bins_df)
        outfilename = self.config.bins_boundaries_filename(chrom)
        print(
            colored(f"saving bins for chromosome {chrom} into {outfilename}",
                    "green"))
        df.to_csv(outfilename, sep='\t', index=False)
        return outfilename

    def concatenate_all_chroms(self):
        outfilename = self.config.bins_boundaries_filename()
        if os.path.exists(outfilename) and not self.config.force:
            print(
                colored(
                    "destination bins boundaries file already exists"
                    "use --force to overwrite", "red"))
            raise ValueError("destination file exists... use --force")

        if self.config.dry_run:
            return

        dataframes = []
        for chrom in self.genome.version.CHROMS:
            srcfilename = self.config.bins_boundaries_filename(chrom)
            df = pd.read_csv(srcfilename, sep='\t')
            dataframes.append(df)
        outdf = pd.concat(dataframes, ignore_index=True)
        outdf.sort_values(by=['bin.start.abspos', 'bin.start', 'bin.end'],
                          inplace=True)

        outdf.to_csv(outfilename, sep='\t', index=False)

    def run(self, dask_client):
        outfilename = self.config.bins_boundaries_filename()
        os.makedirs(os.path.dirname(outfilename), exist_ok=True)

        print(
            colored(
                "going to compute bin boundaries from mappable regions: {} "
                "into bins boundaries file {}".format(
                    self.config.mappable_regions_filename(), outfilename),
                "green"))
        if os.path.exists(outfilename) and not self.config.force:
            print(
                colored(
                    "output file {} already exists; "
                    "use --force to overwrite".format(outfilename), "red"))
            raise ValueError("output file already exists")

        if self.config.dry_run:
            return

        assert self.genome.chrom_sizes() is not None

        delayed_tasks = dask_client.map(self.run_once,
                                        self.genome.version.CHROMS)
        print(len(delayed_tasks), delayed_tasks)
        print(dask_client.scheduler_info())

        distributed.wait(delayed_tasks)
        for task in delayed_tasks:
            outfile = task.result()
            print(outfile, os.path.exists(outfile))

        self.concatenate_all_chroms()
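
A minimal, self-contained sketch of the GC-content step from calc_bins_gc_content above, on a toy sequence and bin table; as in the counts above, 'N' bases drop out of the denominator:

import pandas as pd

chrom_seq = 'ACGTGGCCAANNNNTTGGCCAA'
bins_df = pd.DataFrame({'bin.start': [0, 11], 'bin.end': [11, 22]})

def gc_content(seq):
    counts = [seq.count(x) for x in ['G', 'C', 'A', 'T']]
    total_counts = sum(counts)  # 'N' bases are not counted at all
    if total_counts == 0:
        return 0.0
    return float(sum(counts[0:2])) / total_counts

bins_df['gc.content'] = [
    gc_content(chrom_seq[row['bin.start']:row['bin.end']])
    for _, row in bins_df.iterrows()
]
print(bins_df)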
Example #5
class MappableRegionsPipeline(object):
    def __init__(self, config, aligner=None):
        self.config = config
        self.genome = Genome(self.config)
        if aligner is not None:
            self.aligner = aligner
        else:
            assert self.genome.aligner is not None
            self.aligner = self.genome.aligner
        assert self.aligner is not None

    def mappable_regions_check(self, chroms, mappable_regions_df):

        for chrom in chroms:
            chrom_df = mappable_regions_df[mappable_regions_df.chrom == chrom]
            chrom_df = chrom_df.sort_values(
                by=['chrom', 'start_pos', 'end_pos'])
            start_pos_count = len(chrom_df.start_pos.unique())
            if start_pos_count < len(chrom_df):
                LOG.error(
                    "chrom {} has duplicate mappable regions".format(chrom))

    def generate_reads(self, chroms, read_length):
        for chrom in chroms:
            seq_record = self.genome.load_chrom(chrom)
            for i in range(len(seq_record) - read_length + 1):
                seq = seq_record.seq[i:i + read_length]
                out_record = SeqRecord(seq,
                                       id="{}.{}".format(chrom, i + 1),
                                       description="generated_read")
                # if 'N' in seq:
                #     print('skipping: ', out_record)
                #     continue
                yield out_record

    def generate_mappable_regions(self,
                                  chroms,
                                  read_length,
                                  outfile=None,
                                  aligner_options=None):
        # avoid a mutable default argument for the options list
        if aligner_options is None:
            aligner_options = []
        if outfile is None:
            outfile = sys.stdout

        reads_generator = self.generate_reads(chroms, read_length)

        def aligner_output_process_function(line):
            outfile.write(str(line))
            outfile.write("\n")

        aligner_command = self.aligner.build_mappable_regions_command(
            options=aligner_options)
        print('aligner command:', ' '.join(aligner_command))

        with Popen(aligner_command, stdout=PIPE, stdin=PIPE) as proc:

            control_queue = queue.Queue()
            input_thread = InputGeneratorThread(control_queue, proc.stdin,
                                                reads_generator)
            output_thread = AlignerOutputProcessingThread(
                control_queue, proc.stdout, aligner_output_process_function)

            input_thread.start()
            output_thread.start()

            while True:
                try:
                    msg = control_queue.get(timeout=1)
                except queue.Empty:
                    print("timeout - queue empty")
                    continue
                if msg == 'out_done':
                    print("output done")
                    break
                if msg == 'in_done':
                    print('input done')
            input_thread.join()
            output_thread.join()

    def mappable_regions_chrom_filename(self, chrom):
        mname = "{}_{}".format(chrom,
                               self.config.mappable_regions.mappable_file)
        filename = os.path.join(self.config.mappable_regions.mappable_dir,
                                mname)
        return filename

    def mappable_regions_filename(self):
        mname = self.config.mappable_regions.mappable_file
        filename = os.path.join(self.config.mappable_regions.mappable_dir,
                                mname)
        return filename

    def run_once(self, chrom):
        outfilename = self.mappable_regions_chrom_filename(chrom)
        with open(outfilename, "w") as outfile:
            self.generate_mappable_regions([chrom],
                                           read_length=50,
                                           outfile=outfile)
        return outfilename

    def concatenate_all_chroms(self):
        dst = self.mappable_regions_filename()
        if os.path.exists(dst) and not self.config.force:
            print(
                colored(
                    "destination mappable regions file already exists"
                    "use --force to overwrite", "red"))
            raise ValueError("destination file exists... use --force")

        if not self.config.dry_run:
            with open(dst, 'wb') as output:
                for chrom in self.genome.version.CHROMS:
                    src = self.mappable_regions_chrom_filename(chrom)
                    print(
                        colored("appending {} to {}".format(src, dst),
                                "green"))
                    with open(src, 'rb') as src_file:
                        shutil.copyfileobj(src_file, output, 1024 * 1024 * 10)

    def run(self, dask_client):
        outfilename = self.mappable_regions_filename()
        print(
            colored(
                "going to generate mappable regions with length {} "
                "from genome {} into {}".format(
                    self.config.mappable_regions.mappable_read_length,
                    self.config.genome.genome_dir, outfilename), "green"))

        if os.path.exists(outfilename) and not self.config.force:
            print(
                colored(
                    "output file {} already exists; "
                    "use --force to overwrite".format(outfilename), "red"))
            raise ValueError("output file already exists")

        genome_index_filenames = self.aligner.genome_index_filenames
        if not os.path.exists(genome_index_filenames[0]):
            print(
                colored(
                    "genome index file {} not found".format(
                        genome_index_filenames), "red"))
            raise ValueError("genome index file not found")

        if self.config.dry_run:
            return

        os.makedirs(self.config.mappable_regions.mappable_dir, exist_ok=True)

        assert dask_client

        delayed_tasks = dask_client.map(self.run_once,
                                        self.genome.version.CHROMS)

        distributed.wait(delayed_tasks)

        for fut in delayed_tasks:
            print("fut done:", fut.done())
            print("fut exception:", fut.exception())
            print("fut traceback:", fut.traceback())
            print("fut result:", fut.result())

            # if fut.traceback() is not None:
            #     traceback.print_tb(fut.traceback())
            # if fut.exception() is None:
            #     print(fut.result())

        self.concatenate_all_chroms()
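
The producer/consumer plumbing in generate_mappable_regions can be exercised without a real aligner; in this self-contained sketch, cat stands in for the aligner process and two plain threads play the roles of InputGeneratorThread and AlignerOutputProcessingThread, posting the same 'in_done'/'out_done' messages:

import queue
import threading
from subprocess import Popen, PIPE

def feed(stdin, records, control):
    for rec in records:
        stdin.write(rec.encode())
    stdin.close()
    control.put('in_done')

def collect(stdout, process_line, control):
    for line in stdout:
        process_line(line.decode().rstrip())
    control.put('out_done')

with Popen(['cat'], stdin=PIPE, stdout=PIPE) as proc:
    control = queue.Queue()
    reads = ['>chr1.1\nACGT\n', '>chr1.2\nCGTA\n']
    t_in = threading.Thread(target=feed, args=(proc.stdin, reads, control))
    t_out = threading.Thread(target=collect, args=(proc.stdout, print, control))
    t_in.start()
    t_out.start()
    while control.get() != 'out_done':
        pass
    t_in.join()
    t_out.join()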
Example #6
    def __init__(self, config):
        self.config = config
        self.hg = Genome(config)
Example #7
def hg(tests_config):
    return Genome(tests_config)
Example #8
class GenomeIndexPipeline(object):

    def __init__(self, config):
        self.config = config
        # assert self.config.genome.version == 'hg19'
        self.genome = Genome(self.config)
        assert self.genome.aligner is not None

    def copy_chromes_files(self):
        self.config.check_nonempty_workdir(self.config.genome.genome_dir)

        for chrom in self.genome.version.CHROMS_ALL:
            if chrom == 'chrY':
                continue
            src = os.path.join(
                self.config.genome.genome_pristine_dir,
                "{}.fa".format(chrom)
            )
            dst = os.path.join(
                self.config.genome.genome_dir,
                "{}.fa".format(chrom)
            )
            print(colored(
                "copying chromosome {} from {} into "
                "working directory {}".format(
                    chrom, src, dst),
                "green"))
            if not self.config.dry_run:
                shutil.copy(src, dst)

    def mask_pars(self):
        dst = self.genome.chrom_filename('chrY')
        print(colored(
            "masking pseudoautosomal regions in chrY",
            "green")
        )
        if os.path.exists(dst) and not self.config.force:
            print(colored(
                "destination file for masked chrY already exists",
                "red"
            ))
            raise ValueError("dst file already exists")
        if not self.config.dry_run:
            masked = self.genome.mask_chrY_pars()
            self.genome.save_chrom(masked, 'chrY')

    def concatenate_all_chroms(self):
        dirname = self.config.genome.genome_dir
        dst = os.path.join(
            dirname,
            'genome.fa'
        )
        if os.path.exists(dst) and not self.config.force:
            print(colored(
                "destination genome file already exists"
                "use --force to overwrite", "red"))
            raise ValueError("destination file exists... use --force")

        if not self.config.dry_run:
            with open(dst, 'wb') as output:
                for chrom in self.genome.version.CHROMS_ALL:
                    src = self.genome.chrom_filename(chrom, pristine=False)
                    print(colored(
                        "appending {} to {}".format(src, dst),
                        "green"))
                    with open(src, 'rb') as src_file:
                        shutil.copyfileobj(src_file, output, 1024 * 1024 * 10)

    def build_aligner_index(self):
        print(colored(
            f"building genome index of {self.genome.sequence_filename} "
            f"into {self.genome.index_prefix}",
            "green"))
        command = " ".join(self.genome.aligner.build_index_command(
            self.genome.sequence_filename,
            self.genome.index_prefix
        ))
        print(colored(
            f"going to execute aligner genome index build: {command}",
            "green"))

        test_filename = self.genome.aligner.genome_index_filenames[0]
        print(colored(f"checking for index file: {test_filename}", "green"))
        if os.path.exists(test_filename) and not self.config.force:
            print(colored(
                "output genome index {} already exists".format(test_filename),
                "red"))
            raise ValueError("destination file already exists")

        if not self.config.dry_run:
            subprocess.check_call(command, shell=True)

    def run(self, **kwargs):
        self.copy_chromes_files()
        self.mask_pars()
        self.concatenate_all_chroms()
        self.build_aligner_index()
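
The chunked concatenation used by concatenate_all_chroms boils down to shutil.copyfileobj with a 10 MiB buffer; a self-contained sketch (the toy chr*.fa files are written first so it runs end to end):

import shutil

# create two toy chromosome FASTA files so the sketch is runnable
for name, body in [('chr1.fa', '>chr1\nACGT\n'), ('chr2.fa', '>chr2\nTTAA\n')]:
    with open(name, 'w') as f:
        f.write(body)

with open('genome.fa', 'wb') as output:
    for name in ['chr1.fa', 'chr2.fa']:
        with open(name, 'rb') as src_file:
            shutil.copyfileobj(src_file, output, 1024 * 1024 * 10)  # 10 MiB chunks

print(open('genome.fa').read())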
Example #9
    def __init__(self, config):
        self.config = config
        # assert self.config.genome.version == 'hg19'
        self.genome = Genome(self.config)
        assert self.genome.aligner is not None
Example #10
class GenomeIndexPipeline(object):

    def __init__(self, config):
        self.config = config
        # assert self.config.genome.version == 'hg19'
        self.hg = Genome(self.config)

    def copy_chromes_files(self):
        self.config.check_nonempty_workdir(
            self.config.abspath(self.config.genome.work_dir))

        for chrom in self.hg.version.CHROMS_ALL:
            if chrom == 'chrY':
                continue
            src = os.path.join(
                self.config.genome.data_dir,
                "{}.fa".format(chrom)
            )
            dst = os.path.join(
                self.config.genome.work_dir,
                "{}.fa".format(chrom)
            )
            print(colored(
                "copying chromosome {} from {} into "
                "working directory {}".format(
                    chrom, src, dst),
                "green"))
            if not self.config.dry_run:
                shutil.copy(src, dst)

    def mask_pars(self):
        dst = self.config.chrom_filename('chrY')
        print(colored(
            "masking pseudoautosomal regions in chrY",
            "green")
        )
        if os.path.exists(dst) and not self.config.force:
            print(colored(
                "destination file for masked chrY already exists",
                "red"
            ))
            raise ValueError("dst file already exists")
        if not self.config.dry_run:
            masked = self.hg.mask_chrY_pars()
            self.hg.save_chrom(masked, 'chrY')

    def concatenate_all_chroms(self):
        dirname = self.config.genome.work_dir
        dst = os.path.join(
            dirname,
            'genome.fa'
        )
        if os.path.exists(dst) and not self.config.force:
            print(colored(
                "destination genome file already exists"
                "use --force to overwrite", "red"))
            raise ValueError("destination file exists... use --force")

        if not self.config.dry_run:
            with open(dst, 'wb') as output:
                for chrom in self.hg.version.CHROMS_ALL:
                    src = self.config.chrom_filename(chrom, pristine=False)
                    print(colored(
                        "appending {} to {}".format(src, dst),
                        "green"))
                    with open(src, 'rb') as src_file:
                        shutil.copyfileobj(src_file, output, 1024 * 1024 * 10)

    def build_bowtie_index(self):
        src = os.path.join(
            self.config.genome.work_dir,
            'genome.fa'
        )
        dst = os.path.join(
            self.config.genome.work_dir,
            self.config.genome.index
        )
        print(colored(
            "building bowtie index of {} into {}".format(src, dst),
            "green"))
        command = "bowtie-build -f {} {}".format(src, dst)
        print(colored(
            "executing bowtie-build: {}".format(command),
            "green"))
        test_filename = "{}.1.bt2".format(dst)
        if os.path.exists(test_filename) and not self.config.force:
            print(colored(
                "output bowtie index {} already exists".format(test_filename),
                "red"))
            raise ValueError("destination file already exists")

        if not self.config.dry_run:
            subprocess.check_call(command, shell=True)

    def run(self):
        self.copy_chromes_files()
        self.mask_pars()
        self.concatenate_all_chroms()
        self.build_bowtie_index()
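
The index-build step above is plain string formatting around bowtie-build; a tiny sketch with hypothetical paths (no bowtie installation is needed just to build the command):

src = 'genome_work/genome.fa'        # hypothetical working-dir paths
dst = 'genome_work/genome_index'
command = 'bowtie-build -f {} {}'.format(src, dst)
print(command)   # bowtie-build -f genome_work/genome.fa genome_work/genome_index
test_filename = '{}.1.bt2'.format(dst)  # index file the snippet above checks for
print(test_filename)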
Example #11
    def __init__(self, config):
        self.config = config
        # assert self.config.genome.version == 'hg19'
        self.hg = Genome(self.config)
Example #12
    @staticmethod
    async def async_write_fasta(outfile, rec):
        out = Genome.to_fasta_string(rec)
        outfile.write(out)
        await outfile.drain()
Example #13
class MappableRegionsPipeline(object):
    def __init__(self, config):
        self.config = config
        self.hg = Genome(self.config)

    def mappable_regions_check(self, chroms, mappable_regions_df):
        # if mappable_regions_df is None:
        #     mappable_regions_df = self.load_mappable_regions()

        for chrom in chroms:
            chrom_df = mappable_regions_df[mappable_regions_df.chrom == chrom]
            chrom_df = chrom_df.sort_values(
                by=['chrom', 'start_pos', 'end_pos'])
            start_pos_count = len(chrom_df.start_pos.unique())
            if start_pos_count < len(chrom_df):
                LOG.error(
                    "chrom {} has duplicate mappable regions".format(chrom))

    def generate_reads(self, chroms, read_length):
        for chrom in chroms:
            seq_record = self.hg.load_chrom(chrom)
            for i in range(len(seq_record) - read_length + 1):
                out_record = SeqRecord(seq_record.seq[i:i + read_length],
                                       id="{}.{}".format(chrom, i + 1),
                                       description="generated_read")
                yield out_record

    async def async_start_bowtie(self, bowtie_opts=""):
        genomeindex = self.config.genome_index_filename()
        if bowtie_opts:
            command = [
                'bowtie',
                '-S',
                '-t',
                '-v',
                '0',
                '-m',
                '1',
                *bowtie_opts.split(' '),
                '-f',
                genomeindex,
                '-',
            ]
        else:
            command = [
                'bowtie',
                '-S',
                '-t',
                '-v',
                '0',
                '-m',
                '1',
                '-f',
                genomeindex,
                '-',
            ]
        print(
            colored("going to execute bowtie: {}".format(" ".join(command)),
                    "green"))
        create = asyncio.create_subprocess_exec(
            *command,
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE,
        )
        proc = await create
        return proc

    @staticmethod
    async def async_write_fasta(outfile, rec):
        out = Genome.to_fasta_string(rec)
        outfile.write(out)
        await outfile.drain()

    async def async_write_reads_generator(self, out, reads_generator):
        for rec in reads_generator:
            await self.async_write_fasta(out, rec)
        out.close()

    async def async_mappings_generator(self, reads_generator, bowtie):
        writer = asyncio.Task(
            self.async_write_reads_generator(bowtie.stdin, reads_generator))

        while True:
            line = await bowtie.stdout.readline()
            if not line:
                break
            yield line.decode()

        await bowtie.wait()
        await writer

    async def async_generate_mappings(self, chroms, read_length, outfile=None):
        if outfile is None:
            outfile = sys.stdout

        bowtie = await self.async_start_bowtie()
        reads_generator = self.generate_reads(chroms, read_length)
        async for mappings in self.async_mappings_generator(
                reads_generator, bowtie):
            outfile.write(mappings)

    async def async_generate_mappable_regions(self,
                                              chroms,
                                              read_length,
                                              outfile=None,
                                              bowtie_opts=""):

        bowtie = await self.async_start_bowtie(bowtie_opts=bowtie_opts)
        reads_generator = self.generate_reads(chroms, read_length)
        writer = asyncio.Task(
            self.async_write_reads_generator(bowtie.stdin, reads_generator))
        if outfile is None:
            outfile = sys.stdout
        async for mapping in self.async_mappable_regions_generator(
                bowtie.stdout):
            outfile.write(str(mapping))
            outfile.write('\n')
        await bowtie.wait()
        await writer

    async def async_mappable_regions_generator(self, infile):
        prev = None
        state = MappableState.OUT

        while True:
            line = await infile.readline()
            if not line:
                break
            line = line.decode()
            if line[0] == '@':
                # SAM header line, not an alignment
                continue

            mapping = Mapping.parse_sam(line)

            if state == MappableState.OUT:
                if mapping.flag == 0:
                    prev = MappableRegion(mapping)
                    state = MappableState.IN
            else:
                if mapping.flag == 0:
                    if mapping.chrom == prev.chrom:
                        prev.extend(mapping.start)
                    else:
                        yield prev
                        prev = MappableRegion(mapping)
                else:
                    yield prev
                    state = MappableState.OUT

        if state == MappableState.IN:
            yield prev

    def run_once(self, chrom):
        event_loop = asyncio.get_event_loop()

        # LOG.info('enabling debugging')
        # Enable debugging
        # event_loop.set_debug(True)

        outfilename = self.config.mappable_regions_filename(chrom)
        with open(outfilename, "w") as outfile:
            event_loop.run_until_complete(
                self.async_generate_mappable_regions(
                    [chrom],
                    self.config.mappable_regions.length,
                    outfile=outfile,
                    bowtie_opts=self.config.mappable_regions.bowtie_opts))

    def concatenate_all_chroms(self):
        dst = self.config.mappable_regions_filename()
        if os.path.exists(dst) and not self.config.force:
            print(
                colored(
                    "destination mappable regions file already exists"
                    "use --force to overwrite", "red"))
            raise ValueError("destination file exists... use --force")

        if not self.config.dry_run:
            with open(dst, 'wb') as output:
                for chrom in self.hg.version.CHROMS:
                    src = self.config.mappable_regions_filename(chrom)
                    print(
                        colored("appending {} to {}".format(src, dst),
                                "green"))
                    with open(src, 'rb') as src_file:
                        shutil.copyfileobj(src_file, output, 1024 * 1024 * 10)

    def run(self):
        outfilename = self.config.mappable_regions_filename()
        print(
            colored(
                "going to generate mappable regions with length {} "
                "from genome {} into {}".format(
                    self.config.mappable_regions.length,
                    self.config.genome.work_dir, outfilename), "green"))

        if os.path.exists(outfilename) and not self.config.force:
            print(
                colored(
                    "output file {} already exists; "
                    "use --force to overwrite".format(outfilename), "red"))
            raise ValueError("output file already exists")

        if not self.config.genome_index_filename_exists():
            print(
                colored(
                    "genome index file {} not found".format(
                        self.config.genome_index_filename()), "red"))
            raise ValueError("genome index file not found")

        if self.config.dry_run:
            return

        os.makedirs(self.config.mappable_regions.work_dir, exist_ok=True)

        pool = multiprocessing.Pool(processes=self.config.parallel)
        pool.map(self.run_once, self.hg.version.CHROMS)

        self.concatenate_all_chroms()
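
The asyncio half of this pipeline (async_start_bowtie feeding reads in while SAM lines are read back) follows a generic subprocess-pipe pattern; a self-contained sketch with cat standing in for bowtie:

import asyncio

async def main():
    proc = await asyncio.create_subprocess_exec(
        'cat',
        stdin=asyncio.subprocess.PIPE,
        stdout=asyncio.subprocess.PIPE,
    )

    async def write_reads():
        # feed FASTA records into the subprocess, as the writer task does above
        for rec in ['>chr1.1\nACGT\n', '>chr1.2\nCGTA\n']:
            proc.stdin.write(rec.encode())
            await proc.stdin.drain()
        proc.stdin.close()

    writer = asyncio.ensure_future(write_reads())
    while True:
        line = await proc.stdout.readline()
        if not line:
            break
        print(line.decode(), end='')
    await proc.wait()
    await writer

asyncio.run(main())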
Example #14
class VarbinPipeline(object):
    def __init__(self, config):
        self.config = config
        self.genome = Genome(config)

    def find_bin_index(self, abspos, bins):
        index = np.searchsorted(abspos, bins, side='right')

        index = index - 1
        return index

    def mapping_all_filenames(self):
        pattern = os.path.join(
            self.config.mapping.mapping_dir,
            "*{}".format(self.config.mapping.mapping_suffix))
        filenames = glob.glob(pattern)

        return filenames

    def find_bin_index_binsearch(self, bins, abspos):
        index_up = len(bins)
        index_down = 0
        index_mid = int((index_up - index_down) / 2.0)

        while True:
            if abspos >= int(bins[index_mid]):
                index_down = index_mid + 0
                index_mid = int((index_up - index_down) / 2.0) + index_mid
            else:
                index_up = index_mid + 0
                index_mid = int((index_up - index_down) / 2.0) + index_down

            if index_up - index_down < 2:
                break

        return index_down

    def varbin(self, filename):
        try:
            assert os.path.exists(filename), os.path.abspath(filename)

            infile = pysam.AlignmentFile(filename, 'rb')
            bins_df = self.genome.bins_boundaries()
            assert bins_df is not None
            chrom_sizes = self.genome.chrom_sizes()
            chroms = set(self.genome.version.CHROMS)

            count = 0
            dups = 0
            total_reads = 0

            prev_pos = 0
            bin_counts = defaultdict(int)

            bins = bins_df['bin.start.abspos'].values

            for seg in infile:
                total_reads += 1
                if seg.is_unmapped:
                    continue
                chrom = seg.reference_name
                if chrom not in chroms:
                    continue
                if seg.cigarstring != f'{seg.reference_length}M':
                    print("non exact mapping:", seg, seg.cigarstring)
                    continue

                abspos = chrom_sizes[chrom].abspos + seg.reference_start
                if prev_pos == abspos:
                    dups += 1
                    continue
                count += 1
                index = self.find_bin_index_binsearch(bins, abspos)

                bin_counts[index] += 1
                prev_pos = abspos

            result = []
            for index, row in bins_df.iterrows():
                bin_count = bin_counts[index]
                result.append([
                    row['bin.chrom'],
                    row['bin.start'],
                    row['bin.start.abspos'],
                    bin_count,
                ])
            df = pd.DataFrame.from_records(result,
                                           columns=[
                                               'chrom',
                                               'chrompos',
                                               'abspos',
                                               'bincount',
                                           ])

            df.sort_values(by=['abspos'], inplace=True)
            total_count = df.bincount.sum()
            total_reads_per_bin = float(total_count) / len(bins_df)
            df['ratio'] = df.bincount / total_reads_per_bin

            return df
        except Exception:
            traceback.print_exc()
            raise

    def run_once(self, mapping_filename):
        cellname = self.config.cellname(mapping_filename)
        outfile = self.config.varbin_filename(cellname)
        print(
            colored(
                "processing cell {}; reading from {}; writing to {}".format(
                    cellname, mapping_filename, outfile), "green"))

        if os.path.exists(outfile) and not self.config.force:
            print(
                colored(
                    "output file {} exists; add --force to overwrite".format(
                        outfile), "red"))
        else:
            if not self.config.dry_run:
                df = self.varbin(mapping_filename)
                df.to_csv(outfile, index=False, sep='\t')

    def run(self, dask_client):
        mapping_filenames = self.mapping_all_filenames()
        print(
            colored("processing files: {}".format(mapping_filenames), "green"))

        if self.config.dry_run:
            return

        assert dask_client
        os.makedirs(self.config.varbin.varbin_dir, exist_ok=True)

        delayed_tasks = dask_client.map(self.run_once, mapping_filenames)
        distributed.wait(delayed_tasks)
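
find_bin_index_binsearch above is a hand-rolled binary search over the sorted bin.start.abspos values; on toy data it agrees with numpy's searchsorted one-liner (sorted array first, probe value second):

import numpy as np

bins = np.array([0, 100, 250, 500])  # sorted bin start positions (abspos)
for abspos in [0, 99, 100, 499, 500]:
    index = np.searchsorted(bins, abspos, side='right') - 1
    print(abspos, '->', index)
# prints 0 -> 0, 99 -> 0, 100 -> 1, 499 -> 2, 500 -> 3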
Example #15
    def __init__(self, config):
        self.config = config
        self.genome = Genome(self.config)
Example #16
def tests_genome(tests_config):
    genome = Genome(tests_config)
    assert genome is not None
    assert genome.version.VERSION == 'hg19'
    return genome
Example #17
def hg():
    config = Config.load("tests/data/scpipe_tests.yml", use_config_dir=True)
    return Genome(config)
Example #18
class VarbinPipeline(object):
    def __init__(self, config):
        self.config = config
        self.hg = Genome(config)

    def find_bin_index(self, abspos, bins):
        index = np.searchsorted(abspos, bins, side='right')

        index = index - 1
        return index

    def find_bin_index_binsearch(self, bins, abspos):
        index_up = len(bins)
        index_down = 0
        index_mid = int((index_up - index_down) / 2.0)

        while True:
            if abspos >= int(bins[index_mid]):
                index_down = index_mid + 0
                index_mid = int((index_up - index_down) / 2.0) + index_mid
            else:
                index_up = index_mid + 0
                index_mid = int((index_up - index_down) / 2.0) + index_down

            if index_up - index_down < 2:
                break

        return index_down

    def varbin(self, filename):
        try:
            assert os.path.exists(filename), os.path.abspath(filename)

            infile = pysam.AlignmentFile(filename, 'rb')
            bins_df = self.hg.bins_boundaries()
            assert bins_df is not None
            chrom_sizes = self.hg.chrom_sizes()
            chroms = set(self.hg.version.CHROMS)

            count = 0
            dups = 0
            total_reads = 0

            prev_pos = 0
            bin_counts = defaultdict(int)

            bins = bins_df['bin.start.abspos'].values

            for seg in infile:
                total_reads += 1
                if seg.is_unmapped:
                    continue
                chrom = seg.reference_name
                if chrom not in chroms:
                    continue

                abspos = chrom_sizes[chrom].abspos + seg.reference_start
                if prev_pos == abspos:
                    dups += 1
                    continue
                count += 1
                index = self.find_bin_index_binsearch(bins, abspos)

                bin_counts[index] += 1
                prev_pos = abspos
        except Exception:
            traceback.print_exc()
            raise

        number_of_reads_per_bin = float(count) / len(bins_df)
        result = []
        for index, row in bins_df.iterrows():
            bin_count = bin_counts[index]
            ratio = float(bin_count) / number_of_reads_per_bin
            result.append([
                row['bin.chrom'], row['bin.start'], row['bin.start.abspos'],
                bin_count, ratio
            ])
        df = pd.DataFrame.from_records(result,
                                       columns=[
                                           'chrom',
                                           'chrompos',
                                           'abspos',
                                           'bincount',
                                           'ratio',
                                       ])
        df.sort_values(by=['abspos'], inplace=True)
        return df

    def run_once(self, mapping_filename):
        cellname = self.config.cellname(mapping_filename)
        outfile = self.config.varbin_filename(cellname)
        print(
            colored(
                "processing cell {}; reading from {}; writing to {}".format(
                    cellname, mapping_filename, outfile), "green"))

        if os.path.exists(outfile) and not self.config.force:
            print(
                colored(
                    "output file {} exists; add --force to overwrite".format(
                        outfile), "red"))
        else:
            if not self.config.dry_run:
                df = self.varbin(mapping_filename)
                df.to_csv(outfile, index=False, sep='\t')

    def run(self):
        mapping_filenames = self.config.mapping_filenames()
        print(
            colored("processing files: {}".format(mapping_filenames), "green"))

        pool = multiprocessing.Pool(processes=self.config.parallel)
        pool.map(self.run_once, mapping_filenames)
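
The fan-out in run() is the standard Pool.map pattern; a minimal sketch with a stand-in worker in place of run_once (the filenames are illustrative only):

import multiprocessing

def process_one(filename):
    # stand-in for run_once(mapping_filename)
    return filename.upper()

if __name__ == '__main__':
    with multiprocessing.Pool(processes=2) as pool:
        results = pool.map(process_one, ['cell_a.rmdup.bam', 'cell_b.rmdup.bam'])
    print(results)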