Example #1
    def __yield_score_dfm(self, snp_dfm):
        snp_bed_obj = BedTool(snp_dfm.to_string(index=False,
                                                header=False,
                                                index_names=False),
                              from_string=True)

        for key, bed_fn in self.src_data_fn.items():
            rep_bed_fn = os.path.join(self.src_data_dir, bed_fn)
            rep_bed_obj = BedTool(rep_bed_fn)

            # Downstream scores
            closest_iu = snp_bed_obj.closest(rep_bed_obj, D='ref', iu=True)
            closest_iu_dfm = pd.read_table(StringIO(str(closest_iu)),
                                           header=None,
                                           names=[
                                               'snpChrom', 'snpChromStart',
                                               'snpChromEnd', 'snpName',
                                               'repChrom', 'repChromStart',
                                               'repChromEnd', 'repScore',
                                               'distance'
                                           ],
                                           usecols=['snpName', 'repScore'])
            closest_iu_dfm = closest_iu_dfm.rename(columns={
                'snpName': 'name',
                'repScore': 'iu_score'
            })

            # Upstream scores
            closest_id = snp_bed_obj.closest(rep_bed_obj, D='ref', id=True)
            closest_id_dfm = pd.read_table(StringIO(str(closest_id)),
                                           header=None,
                                           names=[
                                               'snpChrom', 'snpChromStart',
                                               'snpChromEnd', 'snpName',
                                               'repChrom', 'repChromStart',
                                               'repChromEnd', 'repScore',
                                               'distance'
                                           ],
                                           usecols=['snpName', 'repScore'])
            closest_id_dfm = closest_id_dfm.rename(columns={
                'snpName': 'name',
                'repScore': 'id_score'
            })

            # score_dfm = pd.concat([closest_iu_dfm, closest_id_dfm], axis=1)
            score_dfm = closest_iu_dfm.merge(closest_id_dfm, on='name')

            score_dfm = score_dfm.assign(
                avg_score=(score_dfm['iu_score'] + score_dfm['id_score']) / 2.0
            ).drop(['iu_score', 'id_score'], axis=1)
            score_dfm = score_dfm.rename(columns={'avg_score': key})

            yield score_dfm
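
In the generator above, iu=True keeps only the nearest downstream hit of closest -D ref and id=True only the nearest upstream hit; the two scores are then averaged per SNP. A minimal self-contained sketch of that flag pair on made-up coordinates (assumes pybedtools and a bedtools binary on PATH):

from io import StringIO

import pandas as pd
from pybedtools import BedTool

# One SNP flanked by two scored intervals (BED5: chrom start end name score).
snps = BedTool('chr1 100 101 rs1', from_string=True)
scores = BedTool('chr1 50 60 up 7\nchr1 150 160 down 3', from_string=True)

# iu=True ignores upstream features in B: reports the nearest downstream hit.
downstream = snps.closest(scores, D='ref', iu=True)
# id=True ignores downstream features in B: reports the nearest upstream hit.
upstream = snps.closest(scores, D='ref', id=True)

print(pd.read_table(StringIO(str(downstream)), header=None))
print(pd.read_table(StringIO(str(upstream)), header=None))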
Example #2
def find_closest_genes(peaks, annotation, annotationFeature, filteredoutput,
                       referencePoint, filename):
    """
    Find the closest gene using bedtools.closest
    """
    Peaks = BedTool(peaks)
    Annotation = BedTool(annotation)
    Peaks = Peaks.sort()
    sites = Annotation.sort()
    if annotationFeature:
        filteredAnnotation = __filter_annotation(filteredoutput,
                                                 annotationFeature, annotation,
                                                 referencePoint)
        sites = BedTool(filteredAnnotation).sort()
    elif referencePoint:
        filteredAnnotation = list()
        for feature in gffutils.DataIterator(annotation):
            filteredAnnotation.append(str(__get_reference_coordinate(feature,
                                          referencePoint)))
        sites = BedTool(filteredAnnotation).sort()
    mapped = Peaks.closest(sites, t="first")

    if filename:
        mapped.saveas(filename)

    return mapped
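
A possible invocation with placeholder file names; the private helpers __filter_annotation and __get_reference_coordinate are assumed to be defined in the same module and are only exercised when annotationFeature or referencePoint is set:

# Hypothetical call: plain closest-gene annotation, saved to a BED file.
mapped = find_closest_genes(peaks='peaks.bed',
                            annotation='annotation.gff',
                            annotationFeature=None,
                            filteredoutput=None,
                            referencePoint=None,
                            filename='peaks_closest_genes.bed')
mapped.head()  # print the first few annotated intervals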
Example #3
    def process(self):
        all_sites = pd.read_csv(self.sites_file, usecols=['chr', 'coordinate'])
        all_sites = get_winid.convert_chr_to_num(all_sites)
        chrs = np.sort(all_sites['chr'].unique())
        all_sites_closest = []
        for chr in chrs:
            print('processing sites on chr ' + str(chr))
            chr_file = self.data_dir + 'chr' + str(chr) + '.tsv'
            if not os.path.exists(chr_file):
                self.split_by_chr()
            chr_sites = all_sites.query('chr==@chr').copy()
            chr_sites['coordinate'] = chr_sites['coordinate'].astype('i8')
            chr_sites['end'] = chr_sites['coordinate'] + 1
            chr_sites = BedTool([tuple(x[1]) for x in chr_sites.iterrows()])
            chr_sites_closest = chr_sites.closest(chr_file,
                                                  d=True,
                                                  nonamecheck=True)
            for row in chr_sites_closest:
                all_sites_closest.extend(
                    [[row[0], row[1], row[6], row[7], row[8]]])
            del chr_sites_closest
            del chr_sites
            gc.collect()
        all_sites_closest = pd.DataFrame(all_sites_closest,
                                         columns=[
                                             'chr', 'coordinate',
                                             'eigen_phred', 'eigen_pc_phred',
                                             'distance_to_nearest_eigen'
                                         ])
        all_sites_closest = all_sites_closest.groupby(
            ['chr', 'coordinate']).apply(self.mean_max).reset_index()
        with pd.HDFStore(self.additional_feature_file, 'a') as h5s:
            h5s['Eigen'] = all_sites_closest
Example #4
def feat_dist(vf, af, name):
    print("inside feat_dist")
    v = BedTool(vf)
    a = BedTool(af)
    closest = v.closest(a, D="b")
    # closest -D b appends the signed distance as the last field of each hit.
    results = {r.name: int(r[len(r.fields) - 1]) for r in closest}
    print("exiting feat_dist")
    return Series(results, name=name)
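
Because feat_dist returns a name-indexed Series, distances against several annotation files can be collected into one matrix; the paths below are placeholders:

import pandas as pd

# Hypothetical usage: one distance column per annotation file,
# indexed by variant name.
annotations = {'tss': 'tss.bed', 'enhancer': 'enhancers.bed'}
dist_matrix = pd.concat(
    [feat_dist('variants.bed', path, name) for name, path in annotations.items()],
    axis=1)
print(dist_matrix.head())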
Example #5
def main():
    """
    Runs Python example from the manuscript
    """
    bedtools_dir = path.split(__file__)[0]
    snps = BedTool(path.join(bedtools_dir, '../test/data/snps.bed.gz'))
    genes = BedTool(path.join(bedtools_dir, '../test/data/hg19.gff'))

    intergenic_snps = (snps - genes)

    nearby = genes.closest(intergenic_snps, d=True, stream=True)

    for gene in nearby:
        if int(gene[-1]) < 5000:
            print(gene.name)
Example #7
def computeFromBed(vcfrecord, bedtoolObj, loc_simple, loc_complex):
    for alt in vcfrecord.ALT:
        #if not alt:
        #    logging.info("{} record may have an invalid VCF syntax to represent a deletion event.".format(vcfrecord))
        #    record_bed=BedTool('{} {} {}'.format(vcfrecord.CHROM,vcfrecord.POS-1,vcfrecord.POS + len(vcfrecord.REF)-0, from_string=True))
        if len(vcfrecord.REF) <= len(alt):  # SNP or insertion
            record_bed = BedTool('{} {} {}'.format(vcfrecord.CHROM,
                                                   vcfrecord.POS - 1,
                                                   vcfrecord.POS),
                                 from_string=True)
        else:  # deletion
            record_bed = BedTool('{} {} {}'.format(vcfrecord.CHROM,
                                                   vcfrecord.POS - 1,
                                                   vcfrecord.POS + len(vcfrecord.REF) - len(alt)),
                                 from_string=True)

        isec = record_bed.closest(bedtoolObj, D="b")
        simple = getLocation(str(isec).rstrip().split()[-1], loc_simple)
        complex = getLocation(str(isec).rstrip().split()[-1], loc_complex)
        updateglobalsLocation(complex)
        vcfrecord.INFO["LOC"] = simple
        vcfrecord.INFO["LOC_DETAIL"] = complex
        # NOTE: returns after annotating the first ALT allele only.
        return vcfrecord
Example #8
def annotatePAS(DB_file, pas_generator, chromosome, strand):
    if DB_file is not None:
        long_bed_str = ''
        if pas_generator is not None:
            pas_dict = generator_to_dict(pas_generator)
            _i = 1
            for _pos in pas_dict:
                _bl = '\t'.join(
                    str(e) for e in [
                        chromosome,
                        int(_pos) - 1, _pos, pas_dict[_pos], chromosome + ":" +
                        strand + ":" + str(_i), strand
                    ]) + '\n'
                long_bed_str += _bl
                _i += 1
            pas_bed = BedTool(long_bed_str, from_string=True)
            pas_bed = pas_bed.sort()
            anno_pas_bed = pas_bed.closest(DB_file, s=True, D='b')
            annotated_pas_out = []
            for _apb in anno_pas_bed:
                annotated_pas_out.append(
                    (_apb[0], _apb[1], _apb[2], _apb[3], _apb[4], _apb[5],
                     _apb[9], _apb[10], _apb[12]))
            return annotated_pas_out
        else:
            return
    else:
        if pas_generator is not None:
            pas_out = []
            pas_dict = generator_to_dict(pas_generator)
            _i = 1
            for _pos in pas_dict:
                pas_out.append(
                    (chromosome, int(_pos) - 1, _pos, pas_dict[_pos],
                     chromosome + ":" + strand + ":" + str(_i), strand))
                _i += 1
            return pas_out
        else:
            return
Example #9
def add_closest(aname, bname):
    a, b = BedTool(aname), BedTool(bname)

    afields = a.field_count()
    c = a.closest(b, d=True)
    get_name = gen_get_name(b, afields)

    dbed = open(BedTool._tmp(), "w")
    # keep the name and distance
    seen_by_line = collections.defaultdict(list)
    for feat in c:
        key = "\t".join(feat[:afields])
        seen_by_line[key].append([feat[-1], get_name(feat)])

    for key, dist_names in seen_by_line.items():
        if len(dist_names) > 0:
            assert len(set([d[0] for d in dist_names])) == 1
        names = ",".join(sorted(set(d[1] for d in dist_names)))
        new_line = "\t".join([key] + [names] + [dist_names[0][0]])
        dbed.write(new_line + "\n")
    dbed.close()
    d = BedTool(dbed.name)
    assert len(d) == len(a)
    return d
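
add_closest collapses ties so exactly one line per interval of a comes back: the names of all equally-near b features join into a comma-separated column, followed by their shared distance (the assert on identical distances guards this). A hedged usage sketch with placeholder paths, assuming gen_get_name from the same module:

# Hypothetical usage: annotate each interval of a.bed with the name(s)
# of its closest b.bed feature and the distance.
annotated = add_closest('a.bed', 'b.bed')
for interval in annotated:
    print(interval)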
Example #11
def main():
    tmpdir_obj = TemporaryDirectory(dir=project_temp_dir)
    tmpdir_path = Path(tmpdir_obj.name)

    gencode_dir = Path(
        "/icgc/dkfzlsdf/analysis/hs_ontogeny/databases/gene_annotations")

    gencode19_gtf_appris_principal = (
        "/icgc/dkfzlsdf/analysis/hs_ontogeny/databases/gene_annotations"
        "/gencode.vM19.annotation.appris-principal.no-prefix.gtf")
    gencode19_gtf = (
        "/icgc/dkfzlsdf/analysis/hs_ontogeny/databases/gene_annotations"
        "/gencode.vM19.annotation.no-prefix.gtf")

    all_tss_area_bed = gencode_dir.joinpath('all-tss_slop-100000-5000.bed')

    strand_dtype = CategoricalDtype(['+', '-'], ordered=True)

    # %% compute tss area intersects, ~ 30s
    gencode_df = pd.read_csv(gencode19_gtf,
                             sep='\t',
                             header=None,
                             comment='#',
                             names=[
                                 'feat_chrom', 'source', 'feature', 'Start',
                                 'End', 'score', 'feat_strand', 'frame',
                                 'attribute'
                             ],
                             dtype={
                                 'feat_chrom': str,
                                 'Start': 'i8',
                                 'End': 'i8',
                                 'feat_strand': strand_dtype
                             })

    tss_n_upstream = 100_000
    tss_n_downstream = 5000
    transcripts = gencode_df.query('feature == "transcript"').copy()
    on_plus_strand = transcripts['feat_strand'] == '+'
    transcripts['TSS'] = -1
    transcripts['feat_start'] = -1
    transcripts['feat_end'] = -1
    transcripts['feat_class'] = 'TSS_area'
    transcripts = expand_gtf_attributes(transcripts)

    # custom slop
    transcripts.loc[on_plus_strand, 'TSS'] = transcripts.loc[on_plus_strand,
                                                             'Start']
    transcripts.loc[~on_plus_strand, 'TSS'] = transcripts.loc[~on_plus_strand,
                                                              'End']
    transcripts.loc[on_plus_strand,
                    'feat_start'] = transcripts.loc[on_plus_strand,
                                                    'Start'] - tss_n_upstream
    transcripts.loc[on_plus_strand,
                    'feat_end'] = transcripts.loc[on_plus_strand,
                                                  'Start'] + tss_n_downstream
    transcripts.loc[~on_plus_strand,
                    'feat_start'] = transcripts.loc[~on_plus_strand,
                                                    'End'] - tss_n_downstream
    transcripts.loc[~on_plus_strand,
                    'feat_end'] = transcripts.loc[~on_plus_strand,
                                                  'End'] + tss_n_upstream
    transcripts = transcripts.sort_values(
        ['feat_chrom', 'feat_start', 'feat_end', 'TSS'])
    transcripts.loc[transcripts['feat_start'].lt(0), 'feat_start'] = 0
    transcripts_cols = [
        'feat_chrom', 'feat_start', 'feat_end', 'TSS', 'feat_strand',
        'feat_class', 'gene_name', 'gene_id', 'transcript_id',
        'appris_principal_score'
    ]
    transcripts[transcripts_cols].to_csv(all_tss_area_bed,
                                         sep='\t',
                                         header=False,
                                         index=False)

    all_tss_area_bt = BedTool(str(all_tss_area_bed))
    merged_dmrs_bt = BedTool(str(merged_dmrs_bed))
    tss_intersect_bt = merged_dmrs_bt.intersect(all_tss_area_bt,
                                                wa=True,
                                                wb=True)
    tss_intersect_df = pd.read_csv(
        tss_intersect_bt.fn,
        sep='\t',
        names=['Chromosome', 'Start', 'End', 'region_id'] + transcripts_cols)
    tss_intersect_df['perc_feature'] = np.nan
    tss_intersect_df['perc_region'] = np.nan
    tss_intersect_df['distance'] = -1e8
    tss_intersect_df['center'] = tss_intersect_df.eval(
        'Start + (End - Start)/2')
    tss_intersect_df['feat_center'] = np.nan
    tss_intersect_df['has_center'] = False
    tss_intersect_df['distance'] = tss_intersect_df.eval('center - TSS')
    assert tss_intersect_df['distance'].ne(-1e8).all()
    # tss_intersect_df.loc[tss_intersect_df.eval('Start <= TSS <= End'), 'distance'] = 0
    # tss_intersect_df.loc[tss_intersect_df.eval('End < TSS'), 'distance'] = tss_intersect_df.eval('End - TSS')
    # tss_intersect_df.loc[tss_intersect_df.eval('Start > TSS'), 'distance'] = tss_intersect_df.eval('Start - TSS')

    full_cols = [
        'Chromosome', 'Start', 'End', 'region_id', 'center', 'feat_class',
        'perc_feature', 'perc_region', 'distance', 'has_center', 'gene_name',
        'gene_id', 'transcript_id', 'appris_principal_score', 'feat_chrom',
        'feat_start', 'feat_end', 'feat_center', 'feat_strand'
    ]
    tss_intersect_df_full = tss_intersect_df[full_cols]

    # %% compute exon, intron overlap, ~45s
    transcript_parts = gencode_df.loc[
        ~gencode_df['feature'].isin(['gene', 'start_codon', 'stop_codon']), :]
    transcript_parts_fp = tmpdir_path.joinpath('transcript_parts.gtf')
    transcript_parts.to_csv(transcript_parts_fp,
                            sep='\t',
                            header=False,
                            index=False)

    transcript_parts_bt = BedTool(str(transcript_parts_fp))
    transcript_parts_anno = merged_dmrs_bt.intersect(transcript_parts_bt,
                                                     wa=True,
                                                     wb=True)
    transcript_parts_anno.head()

    transcript_parts_df = pd.read_csv(transcript_parts_anno.fn,
                                      sep='\t',
                                      header=None)
    transcript_parts_df.columns = [
        'Chromosome', 'Start', 'End', 'region_id'
    ] + [
        'feat_chrom', 'source', 'feat_class', 'feat_start', 'feat_end',
        'score', 'feat_strand', 'frame', 'attribute'
    ]
    start = transcript_parts_df.eval('Start - feat_start').where(
        lambda ser: ser.gt(0), 0)
    feat_size = transcript_parts_df.eval('feat_end - feat_start')
    end = transcript_parts_df.eval('End - feat_start').where(
        lambda ser: ser.lt(feat_size), feat_size)
    overlap_size = end - start
    region_size = transcript_parts_df.eval('End - Start')

    transcript_parts_df['center'] = transcript_parts_df.eval(
        'Start + (End - Start)/2')
    transcript_parts_df['feat_center'] = transcript_parts_df.eval(
        'feat_start + (feat_end - feat_start)/2')
    transcript_parts_df['distance'] = transcript_parts_df.eval(
        'center - feat_center')
    transcript_parts_df['has_center'] = transcript_parts_df['distance'].lt(
        feat_size / 2)

    transcript_parts_df['perc_feature'] = overlap_size / feat_size
    transcript_parts_df['perc_region'] = overlap_size / region_size
    transcript_parts_df['distance'] = np.nan

    transcript_parts_df = expand_gtf_attributes(transcript_parts_df)

    transcript_parts_df_full = transcript_parts_df[full_cols]

    # %% classify into proximal and distal cis regulatory regions
    promoter_anno = tss_intersect_df_full.copy()
    is_proximal_promoter = promoter_anno.eval('-5000 <= distance <= 1000')
    is_distant_cis_regulatory_domain = promoter_anno.eval(
        '-20000 <= distance < -5000')
    promoter_anno['feat_class'] = np.nan
    promoter_anno.loc[is_proximal_promoter, 'feat_class'] = 'Promoter'
    promoter_anno.loc[is_proximal_promoter, 'has_center'] = True
    promoter_anno.loc[is_distant_cis_regulatory_domain, 'feat_class'] = 'UCRD'
    promoter_anno.loc[is_distant_cis_regulatory_domain, 'has_center'] = True
    promoter_anno = promoter_anno.loc[~promoter_anno['feat_class'].isna(), :]

    # %% concatenate and type casts
    full_annos = (pd.concat([promoter_anno, transcript_parts_df_full],
                            axis=0).sort_values(['Chromosome', 'Start',
                                                 'End']))

    precedence = pd.Series([
        'start_codon',
        'stop_codon',
        'Promoter',
        'UTR',
        'exon',
        'CDS',
        'UCRD',
        'transcript',
    ])

    # %% Filter according to precedence
    def filter_annotations(group_df):
        highest_class = precedence.iloc[precedence.isin(
            group_df['feat_class']).idxmax()]
        class_df = group_df.query('feat_class == @highest_class').sort_values(
            ['appris_principal_score', 'perc_region'])
        # TODO: sort by appris score, then by overlap
        if highest_class == 'transcript':
            class_df['feat_class'] = 'intron'
        if class_df['gene_name'].nunique() == 1:
            return class_df.iloc[[0], :]
        else:
            return class_df.groupby('gene_name', as_index=False).nth(0)

    center_annos = full_annos.loc[full_annos['has_center'], :]

    # filter, takes ~
    # could maybe be sped up by removing more introns, perhaps with better intron annotation?
    t1 = time.time()
    filtered_annos = center_annos.groupby(
        'region_id', group_keys=False).apply(filter_annotations)
    print(time.time() - t1)
    # cores = 24
    # filtered_annos_l = Parallel(cores)(delayed(filter_annotations)(group_df) for unused_name, group_df in grouped)
    # filtered_annos = pd.concat(filtered_annos_l, axis=0)

    filtered_annos.to_pickle(
        results_dir.joinpath('filtered-annos_no-intergenic.p'))
    # filtered_annos = pd.read_pickle(results_dir.joinpath('filtered-annos_no-intergenic.p'))

    ids_annotated_regions = filtered_annos['region_id'].unique()
    merged_dmrs_df = pd.read_pickle(merged_dmrs_p)
    intergenic_regions = merged_dmrs_df.loc[
        ~merged_dmrs_df['region_id'].isin(ids_annotated_regions), :].copy()
    intergenic_regions['feat_class'] = 'intergenic'

    # error: chromosome dtypes are different
    all_regions_annotated = (pd.concat([filtered_annos, intergenic_regions],
                                       sort=False,
                                       axis=0))
    all_regions_annotated['Chromosome'] = all_regions_annotated[
        'Chromosome'].astype(str)
    all_regions_annotated.sort_values(['Chromosome', 'Start', 'End'],
                                      inplace=True)
    assert (
        all_regions_annotated['region_id'].unique() == np.arange(53231)).all()
    all_regions_annotated['region_id'].value_counts().value_counts()
    all_regions_annotated['feat_class'].value_counts()

    all_regions_annotated.to_pickle(gencode_anno_p)
    all_regions_annotated.to_csv(gencode_anno_tsv,
                                 sep='\t',
                                 header=True,
                                 index=False)

    filtered_annos['feat_class'].value_counts()

    gencode_df_w_attributes = expand_gtf_attributes(gencode_df)
    principal_transcripts = gencode_df_w_attributes.query(
        'appris_principal_score > 0 and feature == "transcript"').copy()
    # get TSS
    tss_on_plus_strand = principal_transcripts['feat_strand'].eq('+')
    principal_transcripts.loc[
        tss_on_plus_strand,
        'End'] = principal_transcripts.loc[tss_on_plus_strand, 'Start'] + 1
    principal_transcripts.loc[
        ~tss_on_plus_strand,
        'Start'] = principal_transcripts.loc[~tss_on_plus_strand, 'End'] - 1
    principal_transcripts = principal_transcripts.sort_values(
        ['feat_chrom', 'Start', 'End'])
    principal_transcripts_fp = tmpdir_path / 'principal-transcripts.gtf'
    principal_transcripts.iloc[:, 0:9].to_csv(principal_transcripts_fp,
                                              sep='\t',
                                              header=False,
                                              index=False)

    # pybedtools.featurefuncs.TSS has a bug

    gtf_princ_tss_bt = BedTool(str(principal_transcripts_fp))
    closest_tss_bt = merged_dmrs_bt.closest(gtf_princ_tss_bt,
                                            D='b',
                                            fu=True,
                                            t='first')
    closest_tss_df = pd.read_csv(closest_tss_bt.fn, sep='\t', header=None)
    distances = closest_tss_df.iloc[:, -1]
    distances = distances.loc[(distances < 100_000) & (distances > -100_000)]

    import matplotlib as mpl
    from matplotlib.axes import Axes  # for autocompletion in pycharm
    from matplotlib.figure import Figure  # for autocompletion in pycharm
    mpl.use('Agg')  # import before pyplot import!
    import matplotlib.pyplot as plt
    import matplotlib.gridspec as gridspec
    import seaborn as sns

    fig, ax = plt.subplots(1, 1)
    sns.distplot(distances.values, bins=1000, kde=False, ax=ax)
    fig.savefig(results_dir.joinpath('tss-distance-dist.png'))
    fig.savefig(results_dir.joinpath('tss-distance-dist.pdf'))
Example #12
def get_transition_matrix_cluster(h3k27ac, h3k4me3, h3k27me3, diff_dict,
                                  cluster_index):
    intervals = BedTool(
        "/srv/scratch/annashch/dmso/dp_gp/dpgp_diff_peaks_fold/" +
        str(cluster_index) + ".bed")
    intersection_h3k27ac = intervals.closest(h3k27ac, wao=True)
    intersection_h3k4me3 = intervals.closest(h3k4me3, wao=True)
    intersection_h3k27me3 = intervals.closest(h3k27me3, wao=True)
    results = dict()
    for i in range(len(intervals)):
        if i % 100 == 0:
            print(i)
        cur_interval = tuple(intersection_h3k27ac[i][0:3])
        results[cur_interval] = dict()
        results[cur_interval]['dmso'] = [0, 0, 0]  #h3k27ac, h3k4me3, h3k27me3
        results[cur_interval]['control'] = [0, 0, 0]
        #determine which of 6 possible mark combinations is represented for DMSO & Control
        if intersection_h3k27ac[i][3].startswith('c'):
            #is the h3k27ac mark differential or stable?
            overlap = tuple(intersection_h3k27ac[i][3:6])
            #is the peak up in dmso?
            if overlap in diff_dict['h3k27ac_up']:
                results[cur_interval]['dmso'][0] = 1
            #is the peak up in control?
            elif overlap in diff_dict['h3k27ac_down']:
                results[cur_interval]['control'][0] = 1
            else:
                results[cur_interval]['dmso'][0] = 1
                results[cur_interval]['control'][0] = 1
        if intersection_h3k4me3[i][3].startswith('c'):
            #is the h3k4me3 mark differential or stable?
            overlap = tuple(intersection_h3k4me3[i][3:6])
            #is the peak up in dmso?
            if overlap in diff_dict['h3k4me3_up']:
                results[cur_interval]['dmso'][1] = 1
            #is the peak up in control?
            elif overlap in diff_dict['h3k4me3_down']:
                results[cur_interval]['control'][1] = 1
            else:
                results[cur_interval]['dmso'][1] = 1
                results[cur_interval]['control'][1] = 1

        if intersection_h3k27me3[i][3].startswith('c'):
            #is the h3k27me3 mark differential or stable?
            overlap = tuple(intersection_h3k27me3[i][3:6])
            #is the peak up in dmso?
            if overlap in diff_dict['h3k27me3_up']:
                results[cur_interval]['dmso'][2] = 1
            #is the peak up in control?
            elif overlap in diff_dict['h3k27me3_down']:
                results[cur_interval]['control'][2] = 1
            else:
                results[cur_interval]['dmso'][2] = 1
                results[cur_interval]['control'][2] = 1
    print("completed interval labels")
    #aggregate results into matrix
    transition_mat = dict()
    start_states = set([])
    end_states = set([])
    for interval in results:
        start_state = tuple(results[interval]['control'])
        end_state = tuple(results[interval]['dmso'])
        start_states.add(start_state)
        end_states.add(end_state)
        if start_state not in transition_mat:
            transition_mat[start_state] = dict()
        if end_state not in transition_mat[start_state]:
            transition_mat[start_state][end_state] = 1
        else:
            transition_mat[start_state][end_state] += 1
    outf = open("chipseq_transition_matrix_" + str(cluster_index) + ".txt",
                'w')
    start_states = list(start_states)
    end_states = list(end_states)
    outf.write('\t' + '\t'.join([str(i) for i in end_states]) + '\n')
    for s in start_states:
        outf.write(str(s))
        for e in end_states:
            if e in transition_mat[s]:
                outf.write('\t' + str(transition_mat[s][e]))
            else:
                outf.write('\t0')
        outf.write('\n')
    outf.close()
Example #13
def proximal(path1,
             path2,
             window_min,
             window_max,
             upstream=False,
             downstream=False,
             bins=None):
    """
       Main function of pairwise_asymmetries.py.
       Uses the pybedtools closest function to find proximal coordinates, then
       calculates pairwise asymmetry via the orientation function for the
       proximal pairs. The flags passed to closest are documented at
       https://bedtools.readthedocs.io/en/latest/content/tools/closest.html
       If bins is not None, the counts are additionally returned as per-bin
       lists for binning.
    """
    # Finds the occurrences within the proximity limits and saves their pairwise orientation.
    DataL1 = BedTool(path1).sort()
    DataL2 = BedTool(path2).sort()
    if upstream and downstream:
        closest = DataL1.closest(DataL2, D='ref')
    elif upstream:
        closest = DataL1.closest(DataL2, D='ref', id=False, iu=True)
    elif downstream:
        closest = DataL1.closest(DataL2, D='ref', iu=False, id=True)
    else:
        closest = DataL1.closest(DataL2, D='ref')

    closest_df = closest.to_dataframe()
    Strand1_init = list(closest_df.iloc[:, 5])
    Strand2_init = list(closest_df.iloc[:, 11])
    Distance_init = list(closest_df.iloc[:, -1])
    Distance1_temp, Strand1, Strand2 = zip(*(
        (dist, strand1, strand2) for dist, strand1, strand2 in zip(
            Distance_init, Strand1_init, Strand2_init)
        if abs(dist) <= window_max and abs(dist) >= window_min and dist >= 0))
    Distance2_temp, Strand1_temp, Strand2_temp = zip(
        *((dist, strand2, strand1) for dist, strand1, strand2 in zip(
            Distance_init, Strand1_init, Strand2_init)
          if abs(dist) <= window_max and abs(dist) >= window_min and dist < 0))
    Distance = list(Distance1_temp) + list(Distance2_temp)
    Strand1 = list(Strand1) + list(Strand1_temp)
    Strand2 = list(Strand2) + list(Strand2_temp)
    p_p, m_m, p_m, m_p, same_strand, opposite_strand, convergent, divergent = orientation(
        Strand1, Strand2)

    # Calculate the distance distributions for all orientations
    Distances_orientations = get_distance_orientations(Distance, Strand1,
                                                       Strand2, window_min,
                                                       window_max)

    p_pL_bin = []
    m_mL_bin = []  # Same orientation
    p_mL_bin = []
    m_pL_bin = []  # Opposite orientation
    same_strandL_bin = []
    opposite_strandL_bin = []  # Combined same / opposite orientations
    convergentL_bin = []
    divergentL_bin = []

    Bins = []
    if bins is not None:
        # Performs the same analysis for each bin.
        Bins = binner(window_min, window_max, bins)
        for index, bin_i in enumerate(Bins):
            Strand1Bin = []
            Strand2Bin = []
            min_bin, max_bin = bin_i
            for k in range(len(Distance)):
                if Distance[k] >= min_bin and Distance[k] < max_bin:
                    Strand1Bin.append(Strand1[k])
                    Strand2Bin.append(Strand2[k])

            p_p_bin, m_m_bin, p_m_bin, m_p_bin, same_strand_bin, opposite_strand_bin, convergent_bin, divergent_bin = orientation(
                Strand1Bin, Strand2Bin)
            p_pL_bin.append(p_p_bin)
            m_mL_bin.append(m_m_bin)  # Same orientation, per bin
            p_mL_bin.append(p_m_bin)
            m_pL_bin.append(m_p_bin)  # Opposite orientation per bin
            same_strandL_bin.append(same_strand_bin)
            opposite_strandL_bin.append(opposite_strand_bin)
            convergentL_bin.append(convergent_bin)
            divergentL_bin.append(divergent_bin)

    return (Distances_orientations, p_p, m_m, p_m, m_p, same_strand,
            opposite_strand, convergent,
            divergent), (Bins, p_pL_bin, m_mL_bin, p_mL_bin, m_pL_bin,
                         same_strandL_bin, opposite_strandL_bin,
                         convergentL_bin, divergentL_bin)
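
A hedged usage sketch for proximal with placeholder paths and window sizes; orientation, binner, and get_distance_orientations are assumed to be defined alongside it:

# Hypothetical call: orientation asymmetries for pairs within 0-10 kb,
# plus the same counts split into 10 distance bins.
totals, per_bin = proximal('features_a.bed', 'features_b.bed',
                           window_min=0, window_max=10_000,
                           upstream=False, downstream=False, bins=10)
(distances_orientations, p_p, m_m, p_m, m_p,
 same_strand, opposite_strand, convergent, divergent) = totals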
Example #14
#!/usr/bin/python
"""
Example from the manuscript to print the names of genes that are <5000 bp away
from intergenic SNPs.  See sh_ms_example.sh for the shell script equivalent.
"""
from pybedtools import BedTool

snps = BedTool('../test/data/snps.bed.gz')
genes = BedTool('../test/data/hg19.gff')

intergenic_snps = (snps - genes)

nearby = genes.closest(intergenic_snps, d=True, stream=True)

for gene in nearby:
    if int(gene[-1]) < 5000:
        print(gene.name)
Example #15
grch38gff='/home/drew/Desktop/IPyNB-Variant-Analysis/data/cuffcmp.combined.gtf'
#snps = BedTool('snps.bed.gz')  # [1]
genes = BedTool(grch38gff)    # [1]


# In[ ]:


get_ipython().run_cell_magic('bash', '', 'ln -P /home/drew/Desktop/IPyNB-Variant-Analysis/data\nln -P /media/drew/easystore/ReferenceGenomes/GCA_000001405.15_GRCh38_no_alt_analysis_set/\nln -P /media/drew/easystore/ReferenceGenomes/GRCh38/')


# In[ ]:


intergenic_snps = snps.subtract(genes)                       # [2]
nearby = genes.closest(intergenic_snps, d=True, stream=True) # [2, 3]

for gene in nearby:             # [4]
    if int(gene[-1]) < 5000:    # [4]
        print(gene.name)        # [4]


# In[ ]:


get_ipython().run_cell_magic('bash', '', 'cd /media/drew/easystore/GoodCell-Resources/AnalysisBaseDir/GSA_Data/2018_07\npwd\nls -l */*vcf')


# In[ ]:

Example #16
def run(chipdir, refseq, filedir, DMSO, CA):
    TSS = (-200, 1000)

    a = BedTool(chipdir)
    b = a.closest(refseq, d=True)
    b.cut([9, 10, 11, 12, 13, 14, 21]).saveas(filedir + "/SRF_closest.bed")
    d = dict()
    with open(filedir + "/SRF_closest.bed") as F:
        for line in F:
            line = line.strip().split()
            chrom, start, stop = line[0:3]
            d[chrom + "\t" + start + "\t" + stop + "\t"] = "\t".join(line[3:])
    outfile = open(filedir + "/SRF_closest.rmdup.bed", "w")
    for key in d:
        if "." not in key.split():
            outfile.write(key + d[key] + "\n")
    outfile.close()
    # os.system("sort -k1,1 -k2,2n " + filedir + "/SRF_closest.rmdup.bed > " + filedir + "/SRF_closest.rmdup.sorted.bed")
    a = BedTool(filedir + "/SRF_closest.rmdup.bed")
    a.sort().saveas(filedir + "/SRF_closest.rmdup.sorted.bed")
    outfile = open(filedir + "/SRF.TSS.bed", "w")
    outfile2 = open(filedir + "/SRF.gene.bed", "w")
    with open(filedir + "/SRF_closest.rmdup.sorted.bed") as F:
        for line in F:
            chrom, start, stop, gene, number, strand, distance = line.strip().split()
            if int(stop) - int(start) > 2000 and int(distance) > 10000:
                if strand == "+":
                    outfile.write(chrom + "\t" + str(int(start) + TSS[0]) + "\t" + str(int(start) + TSS[1]) + "\n")
                    outfile2.write(chrom + "\t" + str(int(start) + TSS[1]) + "\t" + stop + "\n")
                else:
                    outfile.write(chrom + "\t" + str(int(stop) - TSS[1]) + "\t" + str(int(stop) - TSS[0]) + "\n")
                    outfile2.write(chrom + "\t" + start + "\t" + str(int(stop) - TSS[1]) + "\n")
    outfile.close()
    outfile2.close()
    a = BedTool(filedir + "/SRF.TSS.bed")
    a.sort().saveas(filedir + "/SRF.TSS.bed")
    a = BedTool(filedir + "/SRF.gene.bed")
    a.sort().saveas(filedir + "/SRF.gene.bed")

    TSS = filedir + "/SRF.TSS.bed"
    genes = filedir + "/SRF.gene.bed"

    os.system("bedtools map -a " + genes + " -b " + DMSO + " -c 4 -o sum > " + filedir + "/DMSO.genes.bed")
    os.system("bedtools map -a " + TSS + " -b " + DMSO + " -c 4 -o sum > " + filedir + "/DMSO.TSS.bed")
    os.system("bedtools map -a " + genes + " -b " + CA + " -c 4 -o sum > " + filedir + "/CA.genes.bed")
    os.system("bedtools map -a " + TSS + " -b " + CA + " -c 4 -o sum > " + filedir + "/CA.TSS.bed")

    TRx = list()
    TRy = list()
    expressionlist = list()

    with open(filedir + "/DMSO.genes.bed") as a, open(filedir + "/DMSO.TSS.bed") as b, open(
        filedir + "/CA.genes.bed"
    ) as c, open(filedir + "/CA.TSS.bed") as d:
        for line in a:
            bline = b.readline().strip().split()[-1]
            cline = c.readline().strip().split()[-1]
            dline = d.readline().strip().split()[-1]
            if line.strip().split()[-1] == ".":
                DMSOgene = 0.0
            else:
                DMSOgene = float(line.strip().split()[-1])
            if bline == ".":
                DMSOTSS = 0.0
            else:
                DMSOTSS = float(bline)
            if cline == ".":
                CAgene = 0.0
            else:
                CAgene = float(cline)
            if dline == ".":
                CATSS = 0.0
            else:
                CATSS = float(dline)
            if DMSOgene == 0.0:
                TRx.append(0.0)
            else:
                TRx.append((DMSOTSS / DMSOgene))
            if CAgene == 0.0:
                TRy.append(0.0)
            else:
                TRy.append((CATSS / CAgene))
            expressionlist.append((np.log2(DMSOgene) + np.log2(CAgene)) / 2.0)

    F6 = plt.figure()
    ax1 = F6.add_subplot(111)
    xy = np.vstack([TRx, TRy])
    z = gaussian_kde(xy)(xy)
    ax1.scatter(TRx, TRy, c=z, edgecolor="none")
    # ax1.scatter(TRx2,TRy2,c='red',edgecolor="",s=expressionlist2)
    ax1.set_title("Pausing Index")
    ax1.set_ylabel("CA")
    ax1.set_xlabel("DMSO")
    ax1.get_xaxis().tick_bottom()
    ax1.get_yaxis().tick_left()
    # ax1.plot([0,1/slope1],[intercept1,1],color = 'r')
    ax1.set_xlim([0, 20])
    ax1.set_ylim([0, 20])
    ax1.plot([0, 50.0], [0, 50.0], color="k")
    # ax1.text(8,18, "Pearson = " + str(pearsons)[0:5])
    # ax2 = F6.add_subplot(122)
    # ax2.plot(np.sort(cdf),np.linspace(0,1,len(cdf)))
    # ax2.plot(stats.norm.cdf(np.linspace(min(cdf),max(cdf)),0,np.var(cdf)),np.linspace(0,1,len(cdf)))
    plt.savefig(figuredir + "/PausingIndex.png")
Example #17
    peak_background = peak_background.sort()
    gene_background = gene_background.sort()
    return peak_background, gene_background


import pdb
for cluster in range(1, 7):
    #get the peaks & genes for the current cluster
    peak_bed = BedTool(str(cluster) + ".peaks.bed")
    gene_bed = BedTool(str(cluster) + ".genes.bed")
    #get the background
    peak_background, gene_background = get_background(cluster)
    #peak to gene closest, cur cluster
    peak_to_gene_foreground = [
        int(str(i).strip().split('\t')[-1])
        for i in peak_bed.closest(gene_bed, wao=True, d=True, t="first")
    ]
    #gene to peak closest, cur cluster
    gene_to_peak_foreground = [
        int(str(i).strip().split('\t')[-1])
        for i in gene_bed.closest(peak_bed, wao=True, d=True, t="first")
    ]
    #peak to gene closest, background
    peak_to_gene_background = [
        int(str(i).strip().split('\t')[-1])
        for i in peak_bed.closest(gene_background, wao=True, d=True, t="first")
    ]
    #gene to peak closest, background
    gene_to_peak_background = [
        int(str(i).strip().split('\t')[-1])
        for i in gene_bed.closest(peak_background, wao=True, d=True, t="first")
    ]
Example #18
            csv_writer.writerow(out)

print("\n\nFile(s) generated:\n\t", fileOut)


def chrom_format(gencode):
    return BedTool([list(j.replace('chr', '') for j in i) for i in gencode])


### ANNOTATE #####

if not delly_format and gencode and len(bedpe_list) > 0:
    bedpe_bed = BedTool(bedpe_list)
    if 'chr' not in bedpe_list[0][0]:
        gencode = chrom_format(gencode)
    bedpe_gencode = bedpe_bed.closest(gencode, d=True)
    bedpe_bed2 = BedTool([i[3:6] + i[:] for i in bedpe_gencode[:]])
    del bedpe_gencode
    bedpe_gencode = bedpe_bed2.closest(gencode, d=True)
    bedpe_list2 = [i[3:] for i in bedpe_gencode[:]]
    bedpe_header = header + [
        'chrom_gene1', 'start_gene1', 'end_gene1', 'name_gene1',
        'strand_gene1', 'dist_gene1', 'chrom_gene2', 'start_gene2',
        'end_gene2', 'name_gene2', 'strand_gene2', 'dist_gene2', 'fusion_gene'
    ]
    with open(fileOutAnno, 'w', newline='') as wout:
        bedpe_writer = csv.writer(wout, delimiter="\t")
        max_distance = 50000
        gene_list = list()
        for r in bedpe_list2:
            r = [i.replace('\r', '') for i in r]
Example #19
def run(DMSO, Nutlin1, Nutlin3, P53, figuredir, file2dir):
    D = BedTool(DMSO)
    N1 = BedTool(Nutlin1)
    N3 = BedTool(Nutlin3)
    P = BedTool(P53).cut([0, 1, 2])

    start = time.time()

    w1 = (D + P).saveas(file2dir + 'Wave1.bed')
    w1rand = BedTool([P[i] for i in rn.randint(0, len(P), len(w1))]).sort()
    w2 = (N1 + P - D).saveas(file2dir + 'Wave2.bed')
    w2rand = BedTool([P[i] for i in rn.randint(0, len(P), len(w2))]).sort()
    w3 = (N3 + P - N1 - D).saveas(file2dir + 'Wave3.bed')
    w3rand = BedTool([P[i] for i in rn.randint(0, len(P), len(w3))]).sort()

    # w1 = D+P
    # print rn.shuffle(list(P))[:len(w1)]
    # w1rand = BedTool(rn.shuffle(P)[:len(w1)]).sort()
    # w2 = N1+P-D
    # w2rand = BedTool(rn.shuffle(P)[:len(w2)]).sort()
    # w3 = N3+P-N1-D
    # w3rand = BedTool(rn.shuffle(P)[:len(w3)]).sort()

    end = time.time()

    print(end - start)

    a = w2.closest(w1, d=True)
    b = w2rand.closest(w1rand, d=True)
    c = w3.closest(w2, d=True)
    d = w3rand.closest(w2rand, d=True)

    w21 = list()
    w2r1r = list()
    w32 = list()
    w3r2r = list()

    for x in a:
        try:
            w21.append(math.log(float(x[-1]), 10))
        except ValueError:  # e.g. distance 0 -> log10 undefined
            w21.append(0)

    for x in b:
        try:
            w2r1r.append(math.log(float(x[-1]), 10))
        except ValueError:
            w2r1r.append(0)

    for x in c:
        try:
            w32.append(math.log(float(x[-1]), 10))
        except ValueError:
            w32.append(0)

    for x in d:
        try:
            w3r2r.append(math.log(float(x[-1]), 10))
        except ValueError:
            w3r2r.append(0)

    print(len(w21), len(w2r1r), len(w32), len(w3r2r))

    # w21 = [math.log(float(x[-1])) for x in a if float(x[-1]) != 0]
    # w2r1r = [math.log(float(x[-1])) for x in b if float(x[-1]) != 0]
    # w32 = [math.log(float(x[-1])) for x in c if float(x[-1]) != 0]
    # w3r2r = [math.log(float(x[-1])) for x in d if float(x[-1]) != 0]

    # print stats.ks_2samp(w21, w2r1r)
    # print stats.ks_2samp(w32, w3r2r)

    F = plt.figure()
    ax1 = F.add_subplot(221)
    ax1.set_title('Wave2 to Wave1 (pval: ' +
                  str(stats.ks_2samp(w21, w2r1r)[1]) + ')')
    ax1.set_ylabel('Count')
    ax1.set_xlabel('Log 10 Distance (bp)')
    ax1.hist(w21, bins=np.arange(0, 8 + 0.2, 0.2), alpha=0.5, color='green')
    # ax1.set_xlim([0,500000])
    # ax1.set_ylim([0,600])
    # ax1.hist(w21,bins=np.arange(0, 18 + 0.2, 0.2))
    # ax1.set_xscale('log')

    # ax2.F.add_subplot(222)
    # ax2.set_title('Wave2rand to Wave1rand')
    # ax2.set_ylabel('Count')
    # ax2.set_xlabel('Distance (bp)')
    ax1.hist(w2r1r, bins=np.arange(0, 8 + 0.2, 0.2), alpha=0.5, color='red')
    ax1.legend(['Observed', 'Expected'], loc='upper left')
    # ax2.set_xlim([0,500000])
    # ax2.set_ylim([0,600])
    # ax1.hist(w2r1r,bins=np.arange(0, 18 + 0.2, 0.2))
    # ax2.set_xscale('log')

    ax2 = F.add_subplot(222)
    ax2.set_title('Cumulative distribution function')
    ax2.set_ylabel('CDF')
    ax2.set_xlabel('Log 10 Distance (bp)')
    # Use the histogram function to bin the data
    counts, bin_edges = np.histogram(w21,
                                     bins=np.arange(0, 18 + 0.2, 0.2),
                                     density=True)
    counts_r, bin_edges_r = np.histogram(w2r1r,
                                         bins=np.arange(0, 18 + 0.2, 0.2),
                                         density=True)
    # Now find the cdf
    cdf = np.cumsum(counts)
    cdf_r = np.cumsum(counts_r)
    # And finally plot the cdf
    plt.plot(bin_edges[1:], cdf, color='green')
    plt.plot(bin_edges_r[1:], cdf_r, color='red')
    ax2.legend(['Observed', 'Expected'], loc='upper left')

    ax3 = F.add_subplot(223)
    ax3.set_title('Wave3 to Wave2 (pval: ' +
                  str(stats.ks_2samp(w32, w3r2r)[1]) + ')')
    ax3.set_ylabel('Count')
    ax3.set_xlabel('Log 10 Distance (bp)')
    ax3.hist(w32, bins=np.arange(0, 8 + 0.2, 0.2), alpha=0.5, color='green')
    # ax3.set_xlim([0,500000])
    # ax3.set_ylim([0,3500])
    # ax3.hist(w32,bins=np.arange(0, 18 + 0.2, 0.2))
    # ax3.set_xscale('log')

    ax4 = F.add_subplot(224)
    ax4.set_title('Cumulative distribution function')
    ax4.set_ylabel('CDF')
    ax4.set_xlabel('Log 10 Distance (bp)')
    # Use the histogram function to bin the data
    counts, bin_edges = np.histogram(w32,
                                     bins=np.arange(0, 18 + 0.2, 0.2),
                                     density=True)
    counts_r, bin_edges_r = np.histogram(w3r2r,
                                         bins=np.arange(0, 18 + 0.2, 0.2),
                                         density=True)
    # Now find the cdf
    cdf = np.cumsum(counts)
    cdf_r = np.cumsum(counts_r)
    # And finally plot the cdf
    plt.plot(bin_edges[1:], cdf, color='green')
    plt.plot(bin_edges_r[1:], cdf_r, color='red')
    ax4.legend(['Observed', 'Expected'], loc='upper left')

    # ax4 = F.add_subplot(224)
    # ax4.set_title('Wave3rand to Wave2rand')
    # ax4.set_ylabel('Count')
    # ax4.set_xlabel('Distance (bp)')
    ax3.hist(w3r2r, bins=np.arange(0, 8 + 0.2, 0.2), alpha=0.5, color='red')
    ax3.legend(['Observed', 'Expected'], loc='upper left')
    # ax4.set_xlim([0,500000])
    # ax4.set_ylim([0,3500])
    # ax3.hist(w3r2r,bins=np.arange(0, 18 + 0.2, 0.2))
    # ax4.set_xscale('log')

    plt.savefig(figuredir + 'Cluster_analysis.png', dpi=1200)