Exemplo n.º 1
0
    def from_bed(
            cls, bed, location, chrom_size_path, region_dim="region", sort_bed=True
    ):
        """
        Create an empty RegionDS from a bed file and save it at ``location``.

        Parameters
        ----------
        bed
            Either a path (``str`` or ``pathlib`` path) to a bed file, or an
            already loaded table (expected to be a pandas DataFrame) with
            3 columns (chrom, start, end) or 4 columns (chrom, start, end,
            name). A table passed in is copied, never modified.
        location
            Directory of the dataset; it is made absolute and created
            (including parents) if it does not exist.
        chrom_size_path
            Chromosome size file. Used as the ``g`` argument of
            ``BedTool.sort`` when sorting, and forwarded to the constructor.
        region_dim
            Name given to the region index. For 3-column input it is also the
            prefix of the generated region ids (``f"{region_dim}_{i}"``).
        sort_bed
            Only relevant when ``bed`` is a path: whether to sort the records
            with ``BedTool.sort(g=chrom_size_path)`` before loading them.

        Returns
        -------
        The new instance of ``cls``, returned after its ``save()`` was called.

        Raises
        ------
        ValueError
            If the bed table has neither 3 nor 4 columns.
        """
        if isinstance(bed, (str, pathlib.PurePath)):
            bed_tool = BedTool(str(bed))
            if sort_bed:
                # sort bed based on chrom_size_path
                bed_tool = bed_tool.sort(g=chrom_size_path)
            # Both the sorted and the unsorted path must end up as a table,
            # the code below relies on shape / index / columns.
            bed = bed_tool.to_dataframe()
        else:
            # Work on a copy so the caller's table is not modified below.
            bed = bed.copy()

        n_cols = bed.shape[1]
        if n_cols == 3:
            # No name column: derive region ids from the existing index.
            bed.index = bed.index.map(lambda i: f"{region_dim}_{i}")
        elif n_cols == 4:
            # The fourth column holds the region names and becomes the index.
            bed = bed.set_index(bed.columns[3])
        else:
            raise ValueError(
                "bed file need to be either 3 columns (chrom, start, end) "
                "or 4 columns (chrom, start, end, name)"
            )
        bed.index.name = region_dim
        bed.columns = ["chrom", "start", "end"]

        # One coordinate per bed column, named "<region_dim>_<column>".
        ds = xr.Dataset({})
        for column, values in bed.items():
            key = f"{region_dim}_{column}"
            ds.coords[key] = values
            if ds.coords[key].dtype == "object":
                # Store object-typed columns (e.g. chrom) as plain strings.
                ds.coords[key] = ds.coords[key].astype(str)

        location = pathlib.Path(location).absolute()
        location.mkdir(exist_ok=True, parents=True)
        region_ds = cls(
            ds,
            region_dim=region_dim,
            location=location,
            chrom_size_path=chrom_size_path,
        )
        region_ds.save()
        return region_ds
Exemplo n.º 2
0
    def preprocess(self,
                   chrlenPath,
                   genomePath,
                   w=100,
                   upStream=1000,
                   downStream=1000,
                   overlap=0.5,
                   method='mean',
                   col=4,
                   type='bed',
                   n_workers=4):
        """
        Build per-gene window profiles for every file in ``self.paths``.

        The genome is tiled into windows of width ``w``; each input file is
        mapped onto these windows and, for every gene, the values of the
        windows overlapping ``[start - upStream, start + downStream)`` are
        collected into one row. Rows of genes whose strand is not ``'+'`` are
        reversed.

        Parameters
        ----------
        chrlenPath
            Chromosome length file, passed as ``g`` to
            ``BedTool.window_maker``.
        genomePath
            Annotation file loaded through ``BedTool(...).to_dataframe()``;
            the columns ``feature``, ``seqname``, ``start``, ``strand`` and
            ``attributes`` are used (presumably a GFF file; confirm with
            callers). Only rows with ``feature == 'gene'`` are kept.
        w
            Window width.
        upStream, downStream
            Extent of the region around each gene ``start``; both must be
            multiples of ``w``.
        overlap
            Passed as ``F`` to ``BedTool.map``.
        method
            Passed as ``o`` to ``BedTool.map`` when ``type == 'sigbed'``.
        col
            Passed as ``c`` to ``BedTool.map`` when ``type == 'sigbed'``.
        type
            ``'bed'`` counts records per window; ``'sigbed'`` aggregates
            column ``col`` with ``method``.
        n_workers
            Number of worker threads, one job per input file.

        Returns
        -------
        None. Results are stored in ``self.raw`` (name -> DataFrame with rows
        containing missing values dropped). ``self.genes`` is filled with the
        gene ids of the first finished job if it is still ``None``.

        Raises
        ------
        ValueError
            If ``type`` is neither ``'bed'`` nor ``'sigbed'``.
        """
        assert upStream % w == 0 and downStream % w == 0, (
            'upStream and downStream must be multiples of w')
        if type not in ('bed', 'sigbed'):
            # Fail early instead of handing ``None`` to intersect in a thread.
            raise ValueError("type must be either 'bed' or 'sigbed'")

        def gene_id(attributes):
            # Text between the first '=' and the first ';'. Without a ';' the
            # remainder of the string is used instead of dropping its last
            # character.
            begin = attributes.find('=') + 1
            stop = attributes.find(';')
            return attributes[begin:] if stop == -1 else attributes[begin:stop]

        window = BedTool().window_maker(g=chrlenPath, w=w)
        genes = BedTool(genomePath).to_dataframe()
        # Copy the filtered slices so the column assignments below act on an
        # independent frame.
        genes = genes[genes['feature'] == 'gene'][[
            'seqname', 'start', 'strand', 'attributes'
        ]].copy()
        genes['attributes'] = genes['attributes'].apply(gene_id)
        genes['start'] = genes['start'] - upStream
        genes['end'] = genes['start'] + upStream + downStream
        genes = genes[['seqname', 'start', 'end', 'attributes', 'strand']]
        genes.columns = ['chrom', 'start', 'end', 'ID', 'strand']
        # Drop genes whose upstream region would start before position 0.
        genes = genes[genes.start >= 0].copy()
        # Lower-case the first character of the chromosome name (presumably to
        # match the naming in chrlenPath; confirm).
        genes['chrom'] = genes['chrom'].apply(lambda x: x[0].lower() + x[1:])
        atlas = BedTool.from_dataframe(genes[['chrom', 'start', 'end',
                                              'ID']]).sort()
        genes = genes.set_index(['ID'])

        def worker(atlas, window, path, genes, col, method, overlap, kind):
            signal = BedTool(path).sort()
            if kind == 'bed':
                mapped = window.map(signal, c=1, o='count', F=overlap)
            else:  # 'sigbed', validated above
                mapped = window.map(signal, o=method, c=col, F=overlap)
            joined = atlas.intersect(mapped, loj=True, wa=True,
                                     wb=True).to_dataframe()
            # Scalar key so get_group accepts plain gene ids.
            groups = joined.groupby('name')
            data = []
            for ind, strand in zip(genes.index, genes['strand']):
                # 'thickEnd' is the default name of the 8th column, presumably
                # the mapped window value (4 atlas + 4 window columns).
                row = groups.get_group(ind)['thickEnd'].tolist()
                data.append(row if strand == '+' else row[::-1])
            return pd.DataFrame(data, index=genes.index.tolist())

        self.raw = {}
        with futures.ThreadPoolExecutor(max_workers=n_workers) as executor:
            jobs = {}
            for i, path in enumerate(self.paths):
                job = executor.submit(worker, atlas, window, path, genes, col,
                                      method, overlap, type)
                jobs[job] = self.names[i]

            for job in futures.as_completed(jobs):
                self.raw[jobs[job]] = job.result().dropna()
                # NOTE(review): assumes self.genes was initialised elsewhere
                # (e.g. to None in __init__); the first finished job wins.
                if self.genes is None:
                    self.genes = self.raw[jobs[job]].index.tolist()