Пример #1
0
    def dmirt(self, cell_lines=('GM12878', 'K562')):
        """Compute GRO-cap histograms around DMIRT-predicted TSSs and store them.

        For each cell line the method

        1. reads ``miRNA, chromosome, start, end, strand`` from table
           ``upstream_pred_<cell_line>`` of
           ``<root>/prediction/pred_tss_20_cnn.db``;
        2. replaces every region by a window of +/-500 around its midpoint;
        3. runs ``histogram_gpu`` on those windows against table
           ``<cell_line>_hg19`` of ``<root>/database/GRO_cap.db``;
        4. reverses columns ``0..49`` of the result for minus-strand rows;
        5. writes the result to table ``DMIRT_GRO_<cell_line>`` of
           ``<root>/evaluation/eval_peaks.db``, replacing an existing table.

        :param cell_lines: iterable of cell-line names to process.
        """
        from contextlib import closing
        from histogram_cl import histogram_gpu

        fpath = os.path.join(self.root, 'prediction', 'pred_tss_20_cnn.db')
        gro_path = os.path.join(self.root, 'database', 'GRO_cap.db')
        dirname = os.path.join(self.root, 'evaluation')

        # ``with connection:`` only scopes a transaction in sqlite3; ``closing``
        # is what actually releases the database handle.
        with closing(sqlite3.connect(fpath)) as con:
            for cell_line in cell_lines:
                print(cell_line)
                # NOTE(review): the table name is formatted into the statement
                # because SQL identifiers cannot be bound as parameters; only
                # pass trusted cell-line names.
                df_ref = pd.read_sql("SELECT miRNA, chromosome, start, end, strand FROM "
                                     "(SELECT * FROM 'upstream_pred_{}')".format(cell_line), con)
                df_ref[['start', 'end']] = df_ref[['start', 'end']].astype(int)

                # Re-centre every region on its midpoint with a fixed 1000-wide window.
                tss = df_ref[['start', 'end']].mean(axis=1).astype(int)
                df_ref['start'] = tss - 500
                df_ref['end'] = tss + 500

                hgpu = histogram_gpu(XmlHandler.load_param("user_param.xml"))
                df = hgpu.run(df_ref[['chromosome', 'start', 'end', 'strand']], gro_path,
                              '_'.join([cell_line, 'hg19']))

                # Index-aligned copy of the miRNA labels onto the result.
                df['miRNA'] = df_ref['miRNA']

                # Mirror columns 0..49 for minus-strand rows (presumably the 50
                # histogram bins, so that every row reads in transcript
                # direction - confirm against histogram_gpu).
                nidx = df[df['strand'] == '-'].index
                hist_col = list(range(50))
                contents = df.loc[nidx, hist_col[::-1]]
                contents.columns = hist_col
                df.loc[nidx, hist_col] = contents

                # exist_ok avoids the check-then-create race of exists() + mkdir().
                os.makedirs(dirname, exist_ok=True)
                with closing(sqlite3.connect(os.path.join(dirname, 'eval_peaks.db'))) as out_con:
                    df.to_sql('DMIRT_GRO_{}'.format(cell_line), out_con, index=False, if_exists='replace')
Пример #2
0
    def cage_tag(self, paper='DMIRT', cell_lines=('GM12878', 'K562')):
        """Compute CAGE histograms around the TSSs reported by ``paper``.

        The TSS source depends on ``paper``:

        * ``'HUA'``  - rows of table ``cell_specific`` in
          ``<root>/database/Supplementary file4-alternative_TSS.db`` whose
          ``cell_lines`` column contains the cell line; TSS is column ``tss``
          and the miRNA label is column ``#MIR``.
        * ``'PRO'``  - table ``<cell_line>`` of ``<root>/database/PRO.db``;
          TSS is the midpoint of ``tss_start`` / ``tss_stop``.
        * anything else (including the default ``'DMIRT'``) - table
          ``upstream_pred_<cell_line>`` of
          ``<root>/prediction/pred_tss_20_cnn.db``; TSS is the midpoint of
          ``start`` / ``end``.

        A +/-500 window around each TSS is passed to ``histogram_gpu`` against
        table ``<cell_line>_hg19_ctss`` of ``<root>/database/hCAGE_ctss.db``.
        Columns ``0..49`` of the result are reversed for minus-strand rows and
        the frame is written to table ``<paper>_CAGE_<cell_line>`` of
        ``<root>/evaluation/eval_peaks.db`` (replacing an existing table).

        :param paper: which TSS source to evaluate (see above).
        :param cell_lines: iterable of cell-line names to process.
        """
        from contextlib import closing
        from histogram_cl import histogram_gpu

        cage_path = os.path.join(self.root, "database", "hCAGE_ctss.db")
        out_dir = os.path.join(self.root, 'evaluation')

        for cell_line in cell_lines:
            # ``closing`` releases each input database as soon as it has been
            # read; sqlite3's own context manager would only end a transaction.
            if paper == 'HUA':
                fpath = os.path.join(self.root, 'database', 'Supplementary file4-alternative_TSS.db')
                with closing(sqlite3.connect(fpath)) as con:
                    # The LIKE pattern is a value, so it is bound rather than
                    # formatted into the statement.
                    df_ref = pd.read_sql("SELECT * FROM 'cell_specific' WHERE cell_lines LIKE ?",
                                         con, params=('%{}%'.format(cell_line),))
                tss = df_ref['tss'].astype(int)
                mir_label = '#MIR'
            elif paper == 'PRO':
                fpath = os.path.join(self.root, 'database', 'PRO.db')
                with closing(sqlite3.connect(fpath)) as con:
                    # NOTE(review): table names cannot be bound as parameters;
                    # only pass trusted cell-line names.
                    df_ref = pd.read_sql("SELECT * FROM '{}'".format(cell_line), con)
                tss = df_ref[['tss_start', 'tss_stop']].mean(axis=1).astype(int)
                mir_label = 'miRNA'
            else:
                # 'DMIRT' and any unrecognised value share the DMIRT
                # predictions, exactly as the two identical branches did before.
                fpath = os.path.join(self.root, 'prediction', 'pred_tss_20_cnn.db')
                with closing(sqlite3.connect(fpath)) as con:
                    df_ref = pd.read_sql("SELECT miRNA, chromosome, start, end, strand FROM "
                                         "(SELECT * FROM 'upstream_pred_{}')".format(cell_line), con)
                df_ref[['start', 'end']] = df_ref[['start', 'end']].astype(int)
                tss = df_ref[['start', 'end']].mean(axis=1).astype(int)
                mir_label = 'miRNA'

            # Fixed 1000-wide window centred on the TSS.  (The previous second
            # re-centring pass recomputed the same midpoint and was a no-op.)
            df_ref['start'] = tss - 500
            df_ref['end'] = tss + 500

            hgpu = histogram_gpu(XmlHandler.load_param("user_param.xml"))
            rsc_tname = '{}_hg19_ctss'.format(cell_line)
            df = hgpu.run(df_ref[['chromosome', 'start', 'end', 'strand']], cage_path, rsc_tname)
            # Index-aligned copy of the source-specific miRNA label column.
            df['miRNA'] = df_ref[mir_label]

            # Mirror columns 0..49 for minus-strand rows (presumably the 50
            # histogram bins - confirm against histogram_gpu).
            nidx = df[df['strand'] == '-'].index
            hist_col = list(range(50))
            contents = df.loc[nidx, hist_col[::-1]]
            contents.columns = hist_col
            df.loc[nidx, hist_col] = contents

            # sqlite3.connect cannot create missing directories, so make sure
            # the output folder exists, as the sibling evaluation methods do.
            os.makedirs(out_dir, exist_ok=True)
            with closing(sqlite3.connect(os.path.join(out_dir, 'eval_peaks.db'))) as out_con:
                df.to_sql('{}_CAGE_{}'.format(paper, cell_line), out_con, index=False, if_exists='replace')
Пример #3
0
    def run(self):
        """Print the histogram obtained for every track of every tissue.

        The tissue -> {histone: table/file name} mapping is loaded from
        ``histogram_param.xml``.  For each tissue a reference frame is fetched
        with ``self.load_ref``; tissues for which it returns ``None`` are
        skipped.  Every track is then evaluated with ``self.get_hist`` against
        ``<root>/database/Histone ChIP-seq/<tissue>/histone.db`` and the result
        is printed.
        """
        # NOTE(review): plt, n_histones, hgpu and dfs are not used in the body
        # visible here; they are kept because the import, the attribute access
        # and the constructor call may have side effects.
        import matplotlib.pyplot as plt

        chip_root = os.path.join(self.root, 'database/Histone ChIP-seq')
        tracks_by_tissue = XmlHandler.load_param('histogram_param.xml')
        n_histones = len(self.histones)
        hgpu = histogram_gpu(fpath='histogram_param.xml')

        for tissue, fnames in tracks_by_tissue.items():
            df_ref = self.load_ref(tissue)
            if df_ref is not None:
                dfs = {}
                for histone, fname in fnames.items():
                    # All tracks of one tissue live in the same database file.
                    db_path = os.path.join(chip_root, tissue, 'histone.db')
                    print(self.get_hist(df_ref, db_path, fname))
Пример #4
0
    def hua(self, cell_lines=('GM12878', 'K562')):
        """Compute GRO-cap histograms around the HUA alternative TSSs.

        For each cell line, rows of table ``cell_specific`` in
        ``<root>/database/Supplementary file4-alternative_TSS.db`` whose
        ``cell_lines`` column contains the cell-line name are loaded, a
        +/-500 window around column ``tss`` is stored in ``start`` / ``end``,
        and the whole frame is passed to ``histogram_gpu`` against table
        ``<cell_line>_hg19`` of ``<root>/database/GRO_cap.db``.  The result is
        written to table ``HUA_GRO_<cell_line>`` of
        ``<root>/evaluation/eval_peaks.db`` (replacing an existing table).

        :param cell_lines: iterable of cell-line names to process.
        """
        from contextlib import closing
        from histogram_cl import histogram_gpu

        fpath = os.path.join(self.root, 'database', 'Supplementary file4-alternative_TSS.db')
        gro_path = os.path.join(self.root, 'database', 'GRO_cap.db')
        dirname = os.path.join(self.root, 'evaluation')

        # ``closing`` releases the handle; sqlite3's own context manager would
        # only end a transaction and leave the connection open.
        with closing(sqlite3.connect(fpath)) as con:
            for cell_line in cell_lines:
                # The LIKE pattern is a value, so bind it instead of formatting
                # it into the statement.
                df_ref = pd.read_sql("SELECT * FROM 'cell_specific' WHERE cell_lines LIKE ?",
                                     con, params=('%{}%'.format(cell_line),))
                tss = df_ref['tss'].astype(int)
                df_ref['start'] = tss - 500
                df_ref['end'] = tss + 500

                hgpu = histogram_gpu(XmlHandler.load_param("user_param.xml"))
                df = hgpu.run(df_ref, gro_path, '_'.join([cell_line, 'hg19']))

                # exist_ok avoids the check-then-create race of exists() + mkdir().
                os.makedirs(dirname, exist_ok=True)
                with closing(sqlite3.connect(os.path.join(dirname, 'eval_peaks.db'))) as out_con:
                    df.to_sql('HUA_GRO_{}'.format(cell_line), out_con, if_exists='replace', index=False)
Пример #5
0
    def pro(self, cell_lines=('GM12878', 'K562')):
        """Compute GRO-cap histograms around the PRO TSSs.

        For each cell line, table ``<cell_line>`` of ``<root>/database/PRO.db``
        is loaded, column ``tss`` is set to the integer midpoint of
        ``tss_start`` / ``tss_stop``, a +/-500 window around it is stored in
        ``start`` / ``end``, and the whole frame is passed to
        ``histogram_gpu`` against table ``<cell_line>_hg19`` of
        ``<root>/database/GRO_cap.db``.  The result is written to table
        ``PRO_GRO_<cell_line>`` of ``<root>/evaluation/eval_peaks.db``
        (replacing an existing table).

        :param cell_lines: iterable of cell-line names to process.
        """
        from contextlib import closing
        from histogram_cl import histogram_gpu

        fpath = os.path.join(self.root, 'database', 'PRO.db')
        gro_path = os.path.join(self.root, 'database', 'GRO_cap.db')
        dirname = os.path.join(self.root, 'evaluation')

        # ``closing`` releases the handle; sqlite3's own context manager would
        # only end a transaction and leave the connection open.
        with closing(sqlite3.connect(fpath)) as con:
            for cell_line in cell_lines:
                # NOTE(review): table names cannot be bound as parameters;
                # only pass trusted cell-line names.
                df_ref = pd.read_sql("SELECT * FROM '{}'".format(cell_line), con)
                # The midpoint is already cast to int here, so no further cast
                # is needed when deriving the window.
                df_ref['tss'] = df_ref[['tss_start', 'tss_stop']].mean(axis=1).astype(int)
                df_ref['start'] = df_ref['tss'] - 500
                df_ref['end'] = df_ref['tss'] + 500

                hgpu = histogram_gpu(XmlHandler.load_param("user_param.xml"))
                df = hgpu.run(df_ref, gro_path, '_'.join([cell_line, 'hg19']))

                # exist_ok avoids the check-then-create race of exists() + mkdir().
                os.makedirs(dirname, exist_ok=True)
                with closing(sqlite3.connect(os.path.join(dirname, 'eval_peaks.db'))) as out_con:
                    df.to_sql('PRO_GRO_{}'.format(cell_line), out_con, if_exists='replace', index=False)