예제 #1
0
    def convert_bigwig(cls, bigwig, species, bigwig_cmd_path):
        """Convert a BigWig file into a per-window coverage array.

        Runs the external executable at ``bigwig_cmd_path`` with the BigWig
        file, the genome bin file for ``species`` and a temporary output
        path, then parses the tab-separated output into a numpy array.

        NOTE(review): the argument order and the columns read below look like
        UCSC ``bigWigAverageOverBed`` -- confirm against the caller.

        Args:
            bigwig: Path to the BigWig file to convert.
            species: Species key used to load the genome and locate its
                bin file.
            bigwig_cmd_path: Path to the executable that performs the
                conversion.

        Returns:
            A numpy array with one entry per genome window; windows that do
            not appear in the command output stay at 0.

        Raises:
            AssertionError: If the external command exits with a non-zero
                status; the message is the command's decoded stderr.
        """
        log = Log()

        genome = DataInterface.load_genome(species, cls.window_size)
        coverage_array = np.zeros(len(genome))

        log.append('Converting BigWig file to coverage array ...')

        if not os.path.exists(cls._get_genome_bin_path(species)):
            log.append('Writing bins ...')
            cls._write_genome_bins(species)

        # Create the temp file *before* entering the try block: if creation
        # fails there is nothing to clean up, and the finally clause must
        # never reference an unbound ``temp`` (which would mask the error).
        # delete=False because the external command writes to this path
        # after we have closed our own handle.
        temp = tempfile.NamedTemporaryFile('w', delete=False)
        temp.close()

        try:
            process = subprocess.run(
                [bigwig_cmd_path, bigwig, cls._get_genome_bin_path(species), temp.name],
                capture_output=True,
            )

            if process.returncode != 0:
                raise AssertionError(process.stderr.decode('utf-8'))

            with open(temp.name, 'r') as cmd_output:
                for line in cmd_output:
                    fields = line.strip().split('\t')
                    # Column 0 is used as the window index, column 4 as the
                    # coverage value; convert explicitly rather than relying
                    # on numpy to coerce the string.
                    coverage_array[int(fields[0])] = float(fields[4])

            return coverage_array
        finally:
            os.remove(temp.name)
예제 #2
0
    def convert_bigwig(cls, bigwig, species, log=None):
        """Convert a BigWig file into a per-window coverage array.

        Iterates over every window of the genome for ``species`` and stores
        the first value returned by ``stats`` for that window. Windows whose
        chromosome is not present in the BigWig file are left at 0, and NaN
        entries are replaced via ``np.nan_to_num`` before returning.

        NOTE(review): ``bw`` is presumably ``pyBigWig``; the ``open`` /
        ``chroms`` / ``stats`` / ``close`` calls match its API -- confirm.

        Args:
            bigwig: Path (or handle accepted by ``bw.open``) of the BigWig
                file.
            species: Species key used to load the genome.
            log: Optional logger; a new ``Log`` is created when omitted.

        Returns:
            A numpy array with one coverage value per genome window.
        """
        if log is None:
            log = Log()

        genome = DataInterface.load_genome(species, cls.window_size)
        coverage_array = np.zeros(len(genome))

        log.append('Converting BigWig file to coverage array ...')

        bar = LoadingBar('Progress', len(genome) // 1000 + 1, cold_start=True)

        # Open outside the try block: if opening fails there is no handle to
        # close, and the finally clause must not touch an unbound name
        # (which would replace the real error with an UnboundLocalError).
        coverage_bw = bw.open(bigwig)

        try:
            log.append(bar, update_line=True)

            # The set of chromosomes in the file does not change while we
            # iterate, so fetch it once instead of once per window.
            available_chroms = coverage_bw.chroms()

            for i, window in enumerate(genome.list_windows()):

                if window.chromosome in available_chroms:
                    mean_coverage = coverage_bw.stats(*window.to_tuple())[0]
                    coverage_array[i] = mean_coverage

                # Refresh the progress bar every 1000 windows.
                if i % 1000 == 0:
                    log.append(bar, update_line=True)

            return np.nan_to_num(coverage_array)

        finally:
            coverage_bw.close()
예제 #3
0
def main(species, motif_bed, window_size, gamma_threshold=0.95):
    """Bin motif hits into genome windows and keep the high-scoring windows.

    Reads a gzipped, tab-separated motif file with seven columns per line
    (chrom, start, end, factor, relscore, log_pval, strand), maps every hit
    to the genome windows it overlaps, and accumulates ``log_pval / 100``
    per window. A gamma distribution is fitted to a random sample of 10000
    per-window scores and windows at or above its ``gamma_threshold``
    quantile (and strictly positive) are returned.

    Args:
        species: Species key used to load the genome.
        motif_bed: Path to the gzipped motif hit file.
        window_size: Window size used to load the genome.
        gamma_threshold: Quantile of the fitted gamma distribution used as
            the minimum window score.

    Returns:
        A tuple ``(hit_indices, factor_name)`` where ``hit_indices`` are the
        indices of the retained windows and ``factor_name`` is the factor
        column of the first line (``None`` if the file has no lines).
    """
    genome = DataInterface.load_genome(species, window_size)
    log = Log(target=stderr)

    factor_name = None
    hit_rows = []
    hit_scores = []

    with gzip.open(motif_bed, 'rb') as handle:

        # The whole file is read up front so the progress bar knows its total.
        raw_lines = handle.readlines()
        n_lines = len(raw_lines)

        bar = LoadingBar('Binning {} motif hits'.format(str(n_lines)),
                         n_lines,
                         cold_start=True)

        for line_no, raw in enumerate(raw_lines):
            record = raw.decode('utf-8').strip().split('\t')
            chrom, start, end, factor, relscore, log_pval, strand = record

            # The factor name is taken from the first record only.
            if line_no == 0:
                factor_name = factor

            try:
                windows = genome.get_region_windows(Region(chrom, start, end))
                hit_rows.extend(windows)
                # Every overlapped window receives the same scaled score.
                hit_scores.extend([float(log_pval) / 100] * len(windows))
            except BadRegionError:
                # Hits that do not map onto the genome are ignored.
                pass

            log.append(bar, update_line=True)

    log.append('')
    log.append('Done')

    # Single-column sparse matrix: one row per genome window. The COO -> CSC
    # round trip presumably merges repeated window entries (scipy sums
    # duplicates on that conversion) -- verify if this matters downstream.
    column = sparse.csc_matrix((hit_scores, hit_rows, [0, len(hit_rows)]),
                               shape=(len(genome), 1))
    hits = column.tocoo().tocsc()

    dense_scores = np.array(hits.todense()).reshape(-1)
    sample_hit_scores = np.random.choice(dense_scores, size=10000)

    fitted = gamma(*gamma.fit(sample_hit_scores))
    min_bin_score = fitted.ppf(gamma_threshold)

    keep = (hits.data >= min_bin_score) & (hits.data > 0)

    return hits.indices[keep], factor_name
예제 #4
0
def main(species, window_size, path):
    """Return the unique genome window indices covered by a BED file.

    Each row parsed from ``path`` is turned into a ``Region``; the windows
    reported by the genome for every region are collected and de-duplicated.
    Regions for which the genome lookup raises ``BadRegionError`` are
    skipped.

    Args:
        species: Species key used to load the genome.
        window_size: Window size used to load the genome.
        path: Path to the BED file (parsed without a header).

    Returns:
        A list of distinct window indices; the order is whatever the
        intermediate ``set`` yields.
    """
    # Build every Region before loading the genome, as the original does.
    bed_regions = [Region(*fields)
                   for fields in parse_bedfile(path, header = False)]

    genome = DataInterface.load_genome(species, window_size)

    window_ids = []
    for bed_region in bed_regions:
        try:
            window_ids += genome.get_region_windows(bed_region)
        except BadRegionError:
            continue

    return list(set(window_ids))
예제 #5
0
def main(*, species, motif_bed, window_size, dataset_id, output):
    """Write the genome windows hit by high-scoring motifs to a TSV file.

    Reads a gzipped, tab-separated motif file with seven columns per line
    (chrom, start, end, factor, relscore, log_pval, strand). For every hit
    whose integer ``log_pval`` column is at least the cutoff, one line
    ``dataset_id <TAB> window <TAB> log_pval`` is written to ``output`` per
    overlapped genome window. Finally a summary line
    ``dataset_id <TAB> FACTOR <TAB> JASPAR`` is printed to stdout.

    Args:
        species: Species key used to load the genome.
        motif_bed: Path to the gzipped motif hit file.
        window_size: Window size used to load the genome.
        dataset_id: Identifier written as the first column of every line.
        output: Path of the TSV file to write.

    Raises:
        ValueError: If ``motif_bed`` contains no lines, so no factor name
            can be reported.
    """
    genome = DataInterface.load_genome(species, window_size)

    factor_name = None

    # Fixed cutoff applied to the log_pval column.
    # NOTE(review): an earlier comment said this is adjusted by file size,
    # but it is a constant here -- confirm whether scaling is still wanted.
    pval_cutoff = 430

    with open(output, 'w') as o, gzip.open(motif_bed, 'rb') as bed:

        for i, line in enumerate(bed):

            chrom, start, end, factor, relscore, log_pval, strand = line.decode('utf-8').strip().split('\t')

            # The factor name is taken from the first record only.
            if i == 0:
                factor_name = factor
                print('Binning {} motifs with pval cutoff of {} ...'.format(factor_name.upper(), str(pval_cutoff)), file = stderr)

            neg_log10_pval = int(log_pval)

            if neg_log10_pval < pval_cutoff:
                continue

            try:
                hit_windows = genome.get_region_windows(Region(chrom, start, end))

                for hit_window in hit_windows:
                    print(dataset_id, hit_window, neg_log10_pval, sep = '\t', file = o)

            except BadRegionError:
                # Hits that do not map onto the genome are ignored.
                pass

    # An empty input previously failed with an opaque AttributeError on
    # ``None.upper()``; report the actual problem instead.
    if factor_name is None:
        raise ValueError('No motif hits found in {}'.format(motif_bed))

    print(dataset_id, factor_name.upper(), 'JASPAR', sep = '\t')