Example #1
# Imports reconstructed for this excerpt; Genome is a project-local helper
# whose import is omitted here.
import glob
import os
import traceback
from collections import defaultdict

import distributed
import numpy as np
import pandas as pd
import pysam
from termcolor import colored  # assumed source of colored()

class VarbinPipeline(object):
    def __init__(self, config):
        self.config = config
        self.genome = Genome(config)

    def find_bin_index(self, abspos, bins):
        # Vectorized twin of find_bin_index_binsearch: the sorted array of
        # bin starts goes first in np.searchsorted; the result is the
        # rightmost bin whose start is <= abspos.
        index = np.searchsorted(bins, abspos, side='right')
        return index - 1

    def mapping_all_filenames(self):
        pattern = os.path.join(
            self.config.mapping.mapping_dir,
            "*{}".format(self.config.mapping.mapping_suffix))
        filenames = glob.glob(pattern)

        return filenames

    def find_bin_index_binsearch(self, bins, abspos):
        # Binary search: return the index of the rightmost bin whose
        # start position is less than or equal to abspos.
        index_up = len(bins)
        index_down = 0
        index_mid = (index_up - index_down) // 2

        while True:
            if abspos >= int(bins[index_mid]):
                index_down = index_mid
                index_mid = index_mid + (index_up - index_down) // 2
            else:
                index_up = index_mid
                index_mid = index_down + (index_up - index_down) // 2

            if index_up - index_down < 2:
                break

        return index_down

    def varbin(self, filename):
        try:
            assert os.path.exists(filename), os.path.abspath(filename)

            infile = pysam.AlignmentFile(filename, 'rb')
            bins_df = self.genome.bins_boundaries()
            assert bins_df is not None
            chrom_sizes = self.genome.chrom_sizes()
            chroms = set(self.genome.version.CHROMS)

            count = 0
            dups = 0
            total_reads = 0

            prev_pos = 0
            bin_counts = defaultdict(int)

            bins = bins_df['bin.start.abspos'].values

            for seg in infile:
                total_reads += 1
                if seg.is_unmapped:
                    continue
                chrom = seg.reference_name
                if chrom not in chroms:
                    continue
                if seg.cigarstring != f'{seg.reference_length}M':
                    # skip reads that are not one exact, full-length match
                    print("non exact mapping:", seg, seg.cigarstring)
                    continue

                abspos = chrom_sizes[chrom].abspos + seg.reference_start
                if prev_pos == abspos:
                    dups += 1
                    continue
                count += 1
                index = self.find_bin_index_binsearch(bins, abspos)

                bin_counts[index] += 1
                prev_pos = abspos

            result = []
            for index, row in bins_df.iterrows():
                bin_count = bin_counts[index]
                result.append([
                    row['bin.chrom'],
                    row['bin.start'],
                    row['bin.start.abspos'],
                    bin_count,
                ])
            df = pd.DataFrame.from_records(result,
                                           columns=[
                                               'chrom',
                                               'chrompos',
                                               'abspos',
                                               'bincount',
                                           ])

            df.sort_values(by=['abspos'], inplace=True)
            total_count = df.bincount.sum()
            total_reads_per_bin = float(total_count) / len(bins_df)
            df['ratio'] = df.bincount / total_reads_per_bin

            return df
        except Exception:
            traceback.print_exc()
            raise  # re-raise with the original traceback

    def run_once(self, mapping_filename):
        cellname = self.config.cellname(mapping_filename)
        outfile = self.config.varbin_filename(cellname)
        print(
            colored(
                "processing cell {}; reading from {}; writing to {}".format(
                    cellname, mapping_filename, outfile), "green"))

        if os.path.exists(outfile) and not self.config.force:
            print(
                colored(
                    "output file {} exists; add --force to overwrite".format(
                        outfile), "red"))
        else:
            if not self.config.dry_run:
                df = self.varbin(mapping_filename)
                df.to_csv(outfile, index=False, sep='\t')

    def run(self, dask_client):
        mapping_filenames = self.mapping_all_filenames()
        print(
            colored("processing files: {}".format(mapping_filenames), "green"))

        if self.config.dry_run:
            return

        assert dask_client
        os.makedirs(self.config.varbin.varbin_dir, exist_ok=True)

        delayed_tasks = dask_client.map(self.run_once, mapping_filenames)
        distributed.wait(delayed_tasks)
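
A minimal usage sketch for this class (not part of the source): it assumes a config object exposing the attributes the pipeline reads (mapping.mapping_dir, mapping.mapping_suffix, varbin.varbin_dir, force, dry_run) and a reachable dask scheduler; load_config and the scheduler address are hypothetical.

from distributed import Client

config = load_config("varbin.yml")       # hypothetical config loader
client = Client("tcp://127.0.0.1:8786")  # address of a running dask scheduler

pipeline = VarbinPipeline(config)
pipeline.run(client)  # writes one tab-separated varbin table per mapping file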
Example #2
# Imports reconstructed for this excerpt; Genome, BinParams, and MappableBin
# are project-local helpers whose imports are omitted here.
import os

import distributed
import pandas as pd
from termcolor import colored  # assumed source of colored()

class BinsPipeline(object):
    def __init__(self, config):
        self.config = config
        self.genome = Genome(self.config)

    def calc_bins_gc_content(self, chroms, bins_df):
        result = []
        for chrom in chroms:
            chrom_df = bins_df[bins_df['bin.chrom'] == chrom]
            gc_df = chrom_df.copy()
            gc_df.reset_index(inplace=True, drop=True)

            gc_series = pd.Series(index=gc_df.index, dtype=float)
            chrom_seq = self.genome.load_chrom(chrom)

            for index, row in gc_df.iterrows():
                start = row['bin.start']
                end = row['bin.end']
                seq = chrom_seq.seq[start:end]
                counts = [seq.count(x) for x in ['G', 'C', 'A', 'T']]
                total_counts = sum(counts)
                if total_counts == 0:
                    gc = 0.0
                else:
                    # GC content: (G + C) / (G + C + A + T)
                    gc = float(sum(counts[0:2])) / total_counts
                gc_series.iloc[index] = gc

            gc_df['gc.content'] = gc_series
            result.append(gc_df)
        assert len(result) > 0
        if len(result) == 1:
            return result[0]
        df = pd.concat(result)
        return df

    def bins_boundaries_generator(self, chroms, mappable_regions_df):
        chrom_sizes = self.genome.chrom_sizes()
        chrom_bins = self.genome.calc_chrom_bins()

        for chrom in chroms:
            chrom_df = mappable_regions_df[mappable_regions_df.chrom == chrom]
            chrom_df = chrom_df.sort_values(
                by=['chrom', 'start_pos', 'end_pos'])

            params = BinParams.build(chrom_size=chrom_sizes[chrom],
                                     chrom_bin=chrom_bins[chrom])
            mappable_bin = None
            current_excess = 0
            bins_count = params.bins_count

            for row in chrom_df.to_dict(orient="records"):
                if mappable_bin is None:
                    mappable_bin = MappableBin.from_start(params, start_pos=0)
                    current_excess = mappable_bin.adapt_excess(current_excess)
                if not mappable_bin.check_extend(row):
                    next_bin = mappable_bin.split_extend(row)

                    bins_count -= 1
                    if bins_count == 0:
                        # last bin of the chromosome
                        mappable_bin.end_pos = chrom_sizes[chrom].size
                    yield mappable_bin
                    if next_bin.is_overfill():
                        current_excess, mappable_bins = \
                            next_bin.overfill_split(current_excess)

                        assert len(mappable_bins) > 1
                        for mb in mappable_bins[:-1]:
                            bins_count -= 1
                            yield mb
                        mappable_bin = mappable_bins[-1]
                    else:
                        mappable_bin = next_bin
                        current_excess = \
                            mappable_bin.adapt_excess(current_excess)
                # print("mappable_bin:", row, mappable_bin)
            mappable_bin = None

    def calc_bins_boundaries(self, chroms=None, regions_df=None):
        if chroms is None:
            chroms = self.genome.version.CHROMS
        bin_rows = []
        for mbin in self.bins_boundaries_generator(chroms, regions_df):
            # print("mbin:", mbin)
            bin_rows.append(
                (mbin.chrom, mbin.start_pos, mbin.start_abspos, mbin.end_pos,
                 mbin.end_pos - mbin.start_pos, mbin.bin_size))

        df = pd.DataFrame.from_records(bin_rows,
                                       columns=[
                                           'bin.chrom', 'bin.start',
                                           'bin.start.abspos', 'bin.end',
                                           'bin.length', 'mappable.positions'
                                       ])
        df.sort_values(by=['bin.start.abspos'], inplace=True)
        return df

    def load_mappable_regions(self, chrom=None):
        filename = self.config.mappable_regions_filename(chrom=chrom)

        df = pd.read_csv(filename,
                         names=['chrom', 'start_pos', 'end_pos'],
                         sep='\t')
        df = df.sort_values(by=['chrom', 'start_pos', 'end_pos'])
        assert len(df) > 0

        return df

    def run_once(self, chrom):
        print(
            colored(f"started calculating bins for chromosome {chrom}",
                    "green"))
        regions_df = self.load_mappable_regions(chrom=chrom)
        bins_df = self.calc_bins_boundaries([chrom], regions_df)
        df = self.calc_bins_gc_content([chrom], bins_df)
        outfilename = self.config.bins_boundaries_filename(chrom)
        print(
            colored(f"saving bins for chromosome {chrom} into {outfilename}",
                    "green"))
        df.to_csv(outfilename, sep='\t', index=False)
        return outfilename

    def concatenate_all_chroms(self):
        outfilename = self.config.bins_boundaries_filename()
        if os.path.exists(outfilename) and not self.config.force:
            print(
                colored(
                    "destination bins boundaries file already exists; "
                    "use --force to overwrite", "red"))
            raise ValueError("destination file exists... use --force")

        if self.config.dry_run:
            return

        dataframes = []
        for chrom in self.genome.version.CHROMS:
            srcfilename = self.config.bins_boundaries_filename(chrom)
            df = pd.read_csv(srcfilename, sep='\t')
            dataframes.append(df)
        outdf = pd.concat(dataframes, ignore_index=True)
        outdf.sort_values(by=['bin.start.abspos', 'bin.start', 'bin.end'],
                          inplace=True)

        outdf.to_csv(outfilename, sep='\t', index=False)

    def run(self, dask_client):
        outfilename = self.config.bins_boundaries_filename()
        os.makedirs(os.path.dirname(outfilename), exist_ok=True)

        print(
            colored(
                "going to compute bin boundaries from mappable regions: {} "
                "into bins boundaries file {}".format(
                    self.config.mappable_regions_filename(), outfilename),
                "green"))
        if os.path.exists(outfilename) and not self.config.force:
            print(
                colored(
                    "output file {} already exists; "
                    "use --force to overwrite".format(outfilename), "red"))
            raise ValueError("output file already exists")

        if self.config.dry_run:
            return

        assert self.genome.chrom_sizes() is not None

        delayed_tasks = dask_client.map(self.run_once,
                                        self.genome.version.CHROMS)
        print(len(delayed_tasks), delayed_tasks)
        print(dask_client.scheduler_info())

        distributed.wait(delayed_tasks)
        for task in delayed_tasks:
            outfile = task.result()
            print(outfile, os.path.exists(outfile))

        self.concatenate_all_chroms()
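
A minimal usage sketch (not part of the source), assuming a comparable config object and a dask client; load_config is hypothetical.

from distributed import Client

config = load_config("bins.yml")  # hypothetical config loader
client = Client()                 # starts a local dask cluster

pipeline = BinsPipeline(config)
pipeline.run(client)  # per-chromosome bin files, then one concatenated table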
Example #3
# Imports reconstructed for this excerpt; Genome is a project-local helper
# whose import is omitted here.
import multiprocessing
import os
import traceback
from collections import defaultdict

import numpy as np
import pandas as pd
import pysam
from termcolor import colored  # assumed source of colored()

class VarbinPipeline(object):
    def __init__(self, config):
        self.config = config
        self.hg = Genome(config)

    def find_bin_index(self, abspos, bins):
        # Vectorized twin of find_bin_index_binsearch: the sorted array of
        # bin starts goes first in np.searchsorted; the result is the
        # rightmost bin whose start is <= abspos.
        index = np.searchsorted(bins, abspos, side='right')
        return index - 1

    def find_bin_index_binsearch(self, bins, abspos):
        # Binary search: return the index of the rightmost bin whose
        # start position is less than or equal to abspos.
        index_up = len(bins)
        index_down = 0
        index_mid = (index_up - index_down) // 2

        while True:
            if abspos >= int(bins[index_mid]):
                index_down = index_mid
                index_mid = index_mid + (index_up - index_down) // 2
            else:
                index_up = index_mid
                index_mid = index_down + (index_up - index_down) // 2

            if index_up - index_down < 2:
                break

        return index_down

    def varbin(self, filename):
        try:
            assert os.path.exists(filename), os.path.abspath(filename)

            infile = pysam.AlignmentFile(filename, 'rb')
            bins_df = self.hg.bins_boundaries()
            assert bins_df is not None
            chrom_sizes = self.hg.chrom_sizes()
            chroms = set(self.hg.version.CHROMS)

            count = 0
            dups = 0
            total_reads = 0

            prev_pos = 0
            bin_counts = defaultdict(int)

            bins = bins_df['bin.start.abspos'].values

            for seg in infile:
                total_reads += 1
                if seg.is_unmapped:
                    continue
                chrom = seg.reference_name
                if chrom not in chroms:
                    continue

                abspos = chrom_sizes[chrom].abspos + seg.reference_start
                if prev_pos == abspos:
                    dups += 1
                    continue
                count += 1
                index = self.find_bin_index_binsearch(bins, abspos)

                bin_counts[index] += 1
                prev_pos = abspos
        except Exception:
            traceback.print_exc()
            raise  # re-raise: the aggregation below needs count and bins_df

        number_of_reads_per_bin = float(count) / len(bins_df)
        result = []
        for index, row in bins_df.iterrows():
            bin_count = bin_counts[index]
            ratio = float(bin_count) / number_of_reads_per_bin
            result.append([
                row['bin.chrom'], row['bin.start'], row['bin.start.abspos'],
                bin_count, ratio
            ])
        df = pd.DataFrame.from_records(result,
                                       columns=[
                                           'chrom',
                                           'chrompos',
                                           'abspos',
                                           'bincount',
                                           'ratio',
                                       ])
        df.sort_values(by=['abspos'], inplace=True)
        return df

    def run_once(self, mapping_filename):
        cellname = self.config.cellname(mapping_filename)
        outfile = self.config.varbin_filename(cellname)
        print(
            colored(
                "processing cell {}; reading from {}; writing to {}".format(
                    cellname, mapping_filename, outfile), "green"))

        if os.path.exists(outfile) and not self.config.force:
            print(
                colored(
                    "output file {} exists; add --force to overwrite".format(
                        outfile), "red"))
        else:
            if not self.config.dry_run:
                df = self.varbin(mapping_filename)
                df.to_csv(outfile, index=False, sep='\t')

    def run(self):
        mapping_filenames = self.config.mapping_filenames()
        print(
            colored("processing files: {}".format(mapping_filenames), "green"))

        with multiprocessing.Pool(processes=self.config.parallel) as pool:
            pool.map(self.run_once, mapping_filenames)
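
A minimal usage sketch for this multiprocessing variant (not part of the source); load_config is hypothetical. The __main__ guard matters because worker processes may re-import the module under the spawn start method.

if __name__ == "__main__":
    config = load_config("varbin.yml")  # hypothetical config loader
    VarbinPipeline(config).run()  # fans run_once out over config.parallel workers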