Example #1
import glob
import os
import traceback
from collections import defaultdict

import distributed
import numpy as np
import pandas as pd
import pysam
from termcolor import colored

# Genome is a project-local helper class, not shown in this example.


class VarbinPipeline(object):
    def __init__(self, config):
        self.config = config
        self.genome = Genome(config)

    def find_bin_index(self, abspos, bins):
        # Vectorized twin of find_bin_index_binsearch: np.searchsorted
        # gives the insertion point of abspos among the sorted bin
        # boundaries; stepping one back yields the containing bin.
        index = np.searchsorted(bins, abspos, side='right')
        return index - 1

    def mapping_all_filenames(self):
        pattern = os.path.join(
            self.config.mapping.mapping_dir,
            "*{}".format(self.config.mapping.mapping_suffix))
        filenames = glob.glob(pattern)

        return filenames

    def find_bin_index_binsearch(self, bins, abspos):
        # Binary search over the sorted array of absolute bin start
        # positions: returns the index of the last boundary <= abspos,
        # i.e. the bin containing that position.
        index_up = len(bins)
        index_down = 0

        while index_up - index_down >= 2:
            index_mid = index_down + (index_up - index_down) // 2
            if abspos >= int(bins[index_mid]):
                index_down = index_mid
            else:
                index_up = index_mid

        return index_down
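
    # A quick worked illustration with made-up boundary values (not from
    # the pipeline): for bins = [0, 100, 200, 300] the method returns the
    # index of the last boundary <= abspos:
    #   find_bin_index_binsearch(bins, 0)   -> 0
    #   find_bin_index_binsearch(bins, 99)  -> 0
    #   find_bin_index_binsearch(bins, 100) -> 1
    #   find_bin_index_binsearch(bins, 350) -> 3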

    def varbin(self, filename):
        try:
            assert os.path.exists(filename), os.path.abspath(filename)

            infile = pysam.AlignmentFile(filename, 'rb')
            bins_df = self.genome.bins_boundaries()
            assert bins_df is not None
            chrom_sizes = self.genome.chrom_sizes()
            chroms = set(self.genome.version.CHROMS)

            # Simple read statistics; only bin_counts feeds the output.
            count = 0
            dups = 0
            total_reads = 0

            prev_pos = 0
            bin_counts = defaultdict(int)

            bins = bins_df['bin.start.abspos'].values

            for seg in infile:
                total_reads += 1
                if seg.is_unmapped:
                    continue
                chrom = seg.reference_name
                if chrom not in chroms:
                    continue
                # Keep only reads that align end-to-end as pure matches;
                # anything with indels or clipping is reported and skipped.
                if seg.cigarstring != f'{seg.reference_length}M':
                    print("non exact mapping:", seg, seg.cigarstring)
                    continue

                abspos = chrom_sizes[chrom].abspos + seg.reference_start
                # Collapse duplicates: a read starting at the same absolute
                # position as the previous one is skipped (assumes a
                # coordinate-sorted BAM).
                if prev_pos == abspos:
                    dups += 1
                    continue
                count += 1
                index = self.find_bin_index_binsearch(bins, abspos)

                bin_counts[index] += 1
                prev_pos = abspos

            infile.close()

            result = []
            for index, row in bins_df.iterrows():
                bin_count = bin_counts[index]
                result.append([
                    row['bin.chrom'],
                    row['bin.start'],
                    row['bin.start.abspos'],
                    bin_count,
                ])
            df = pd.DataFrame.from_records(result,
                                           columns=[
                                               'chrom',
                                               'chrompos',
                                               'abspos',
                                               'bincount',
                                           ])

            df.sort_values(by=['abspos'], inplace=True)
            # Normalize each bin count by the mean reads-per-bin, so a
            # ratio of 1.0 marks a bin with genome-average coverage.
            total_count = df.bincount.sum()
            total_reads_per_bin = float(total_count) / len(bins_df)
            df['ratio'] = df.bincount / total_reads_per_bin

            return df
        except Exception:
            traceback.print_exc()
            raise

    def run_once(self, mapping_filename):
        cellname = self.config.cellname(mapping_filename)
        outfile = self.config.varbin_filename(cellname)
        print(
            colored(
                "processing cell {}; reading from {}; writing to {}".format(
                    cellname, mapping_filename, outfile), "green"))

        if os.path.exists(outfile) and not self.config.force:
            print(
                colored(
                    "output file {} exists; add --force to overwrite".format(
                        outfile), "red"))
        else:
            if not self.config.dry_run:
                df = self.varbin(mapping_filename)
                df.to_csv(outfile, index=False, sep='\t')

    def run(self, dask_client):
        mapping_filenames = self.mapping_all_filenames()
        print(
            colored("processing files: {}".format(mapping_filenames), "green"))

        if self.config.dry_run:
            return

        assert dask_client
        os.makedirs(self.config.varbin.varbin_dir, exist_ok=True)

        # dask_client.map returns futures, one per mapping file; wait
        # blocks until all of them complete.
        futures = dask_client.map(self.run_once, mapping_filenames)
        distributed.wait(futures)
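
How this example might be driven, as a minimal sketch: the config object, its
attributes (mapping.mapping_dir, varbin.varbin_dir, force, dry_run), and the
Genome helper are project-specific assumptions, and build_config() below is a
hypothetical stand-in for however the project builds its configuration; only
the dask Client usage is standard.

from distributed import Client

config = build_config()  # hypothetical project-specific factory
pipeline = VarbinPipeline(config)

# A local dask cluster; run() maps run_once over every mapping file
# and blocks until all workers finish.
client = Client()
try:
    pipeline.run(client)
finally:
    client.close()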
Example #2
import multiprocessing
import os
import traceback
from collections import defaultdict

import numpy as np
import pandas as pd
import pysam
from termcolor import colored

# Genome is a project-local helper class, not shown in this example.


class VarbinPipeline(object):
    def __init__(self, config):
        self.config = config
        self.hg = Genome(config)

    def find_bin_index(self, abspos, bins):
        # Vectorized twin of find_bin_index_binsearch: np.searchsorted
        # gives the insertion point of abspos among the sorted bin
        # boundaries; stepping one back yields the containing bin.
        index = np.searchsorted(bins, abspos, side='right')
        return index - 1

    def find_bin_index_binsearch(self, bins, abspos):
        # Binary search over the sorted array of absolute bin start
        # positions: returns the index of the last boundary <= abspos,
        # i.e. the bin containing that position.
        index_up = len(bins)
        index_down = 0

        while index_up - index_down >= 2:
            index_mid = index_down + (index_up - index_down) // 2
            if abspos >= int(bins[index_mid]):
                index_down = index_mid
            else:
                index_up = index_mid

        return index_down

    def varbin(self, filename):
        try:
            assert os.path.exists(filename), os.path.abspath(filename)

            infile = pysam.AlignmentFile(filename, 'rb')
            bins_df = self.hg.bins_boundaries()
            assert bins_df is not None
            chrom_sizes = self.hg.chrom_sizes()
            chroms = set(self.hg.version.CHROMS)

            count = 0
            dups = 0
            total_reads = 0

            prev_pos = 0
            bin_counts = defaultdict(int)

            bins = bins_df['bin.start.abspos'].values

            for seg in infile:
                total_reads += 1
                if seg.is_unmapped:
                    continue
                chrom = seg.reference_name
                if chrom not in chroms:
                    continue

                abspos = chrom_sizes[chrom].abspos + seg.reference_start
                if prev_pos == abspos:
                    dups += 1
                    continue
                count += 1
                index = self.find_bin_index_binsearch(bins, abspos)

                bin_counts[index] += 1
                prev_pos = abspos

            infile.close()
        except Exception:
            traceback.print_exc()
            raise

        # Normalize per-bin counts by the mean reads-per-bin: a ratio of
        # 1.0 marks a bin with genome-average coverage.
        number_of_reads_per_bin = float(count) / len(bins_df)
        result = []
        for index, row in bins_df.iterrows():
            bin_count = bin_counts[index]
            ratio = float(bin_count) / number_of_reads_per_bin
            result.append([
                row['bin.chrom'], row['bin.start'], row['bin.start.abspos'],
                bin_count, ratio
            ])
        df = pd.DataFrame.from_records(result,
                                       columns=[
                                           'chrom',
                                           'chrompos',
                                           'abspos',
                                           'bincount',
                                           'ratio',
                                       ])
        df.sort_values(by=['abspos'], inplace=True)
        return df

    def run_once(self, mapping_filename):
        cellname = self.config.cellname(mapping_filename)
        outfile = self.config.varbin_filename(cellname)
        print(
            colored(
                "processing cell {}; reading from {}; writing to {}".format(
                    cellname, mapping_filename, outfile), "green"))

        if os.path.exists(outfile) and not self.config.force:
            print(
                colored(
                    "output file {} exists; add --force to overwrite".format(
                        outfile), "red"))
        else:
            if not self.config.dry_run:
                df = self.varbin(mapping_filename)
                df.to_csv(outfile, index=False, sep='\t')

    def run(self):
        mapping_filenames = self.config.mapping_filenames()
        print(
            colored("processing files: {}".format(mapping_filenames), "green"))

        # Pool as a context manager ensures workers are cleaned up.
        with multiprocessing.Pool(processes=self.config.parallel) as pool:
            pool.map(self.run_once, mapping_filenames)
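
A minimal sketch of driving this multiprocessing variant; as above, config and
its attributes (parallel, mapping_filenames(), force, dry_run) are
project-specific assumptions, and build_config() is a hypothetical stand-in.

if __name__ == '__main__':
    # The __main__ guard matters: multiprocessing may re-import this
    # module in worker processes on spawn-based platforms.
    config = build_config()  # hypothetical project-specific factory
    pipeline = VarbinPipeline(config)
    pipeline.run()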