yielded_score_dfm = list(self.__yield_score_dfm(snp_dfm)) # result = pd.concat(yielded_score_dfm, axis=1) result = reduce(lambda left, right: pd.merge(left, right, on='name'), yielded_score_dfm) return result def save_temp(self, _result): _result.to_csv(self.temp_dest, sep='\t', index=False, header=True) if __name__ == '__main__': rc_util = RepliChipUtil() snp_bed_fn = os.path.join(sys_tool.find_directory("fsu_repli_chip"), 'RSNP_50kb.bed') snp_dfm = pd.read_table(snp_bed_fn, header=None, names=['chrom', 'chromStart', 'chromEnd', 'name']) rc_util.src_data_dir = sys_tool.find_directory("fsu_repli_chip") rc_util.src_data_fn = dict( FsuBg02esRep1="wgEncodeFsuRepliChipBg02esWaveSignalRep1.bed", FsuBg02esRep2="wgEncodeFsuRepliChipBg02esWaveSignalRep2.bed", FsuGm06990Rep1="wgEncodeFsuRepliChipGm06990WaveSignalRep1.bed", FsuGm06990Rep2="wgEncodeFsuRepliChipGm06990WaveSignalRep2.bed", FsuH1hescRep1="wgEncodeFsuRepliChipH1hescWaveSignalRep1.bed", FsuH1hescRep2="wgEncodeFsuRepliChipH1hescWaveSignalRep2.bed", FsuH1hescRep3="wgEncodeFsuRepliChipH1hescWaveSignalRep3.bed", FsuH7esRep1="wgEncodeFsuRepliChipH7esWaveSignalRep1.bed",
# NOTE(review): the first statements are the tail of a method whose `def`
# line is outside this view (presumably `extract` -- confirm). The opening
# of the mapping/call closed by `})` is not visible either.
            'tfbsName': 'pwm'
        })

        # Left-join onto the input SNPs so every input row is kept; rows
        # without a match on ('name', 'chrom') get NaN in the joined columns.
        result = snp_dfm.merge(result, how='left', on=['name', 'chrom'],
                               copy=True)

        # Only the SNP name and its 'pwm' value are returned.
        return result.loc[:, ['name', 'pwm']]

    def save_temp(self, _result):
        """Write ``_result`` to ``self.temp_dest`` as a tab-separated file.

        The header row is written and the index is omitted.

        :param _result: object exposing a pandas-style ``to_csv``
            (presumably the DataFrame returned by ``extract`` -- confirm).
        """
        _result.to_csv(self.temp_dest, sep='\t', index=False, header=True)


if __name__ == '__main__':
    # Ad-hoc driver: run the Jaspar TFBS extraction on the RSNP BED file and
    # dump the result to a scratch TSV in the working directory.
    rsnp_bed_fn = os.path.join(find_directory('dhs'), "RSNP_50kb.bed")
    # The BED file has no header row; the four columns are named explicitly.
    rsnp_dfm = pd.read_table(rsnp_bed_fn, header=None,
                             names=['chrom', 'chromStart', 'chromEnd', 'name'])

    jaspar_tf_util = JasparTfbsUtil()
    jaspar_tf_util.src_data_dir = find_directory('Jaspar_TFBS')
    jaspar_tf_util.src_data_fn = 'jaspar_tfbs_ensembl_75_hg19.txt'

    result = jaspar_tf_util.extract(rsnp_dfm)
    result.to_csv("FOO_RSNP.tsv", sep='\t', header=True, index=False)

# NOTE(review): commented-out alternative driver; the visible text ends in
# the middle of it.
# if __name__ == '__main__':
#     rsnp_bed_fn = "{}/RSNP_50kb.bed".format(find_directory('dhs'))
#     rsnp_dfm = pd.read_table(rsnp_bed_fn, header=None,
# NOTE(review): the first statements are the tail of a method whose `def`
# line is outside this view (presumably `extract` -- confirm). The call
# that these closing arguments belong to starts before the visible text.
            index_names=False), from_string=True)

        # Materialise every frame produced by the private generator so the
        # frames can be folded together below.
        yielded_dfms = list(self.__yield_fitcons_dfm(snp_bed_obj))

        # Fold the frames pairwise with pd.merge on the 'name' column
        # (pd.merge defaults to an inner join).
        snp_fitcons = reduce(
            lambda left, right: pd.merge(left, right, on='name'), yielded_dfms)

        return snp_fitcons

    def save_temp(self, _result):
        """Write ``_result`` to ``self.temp_dest`` as a tab-separated file.

        The header row is written and the index is omitted.

        :param _result: object exposing a pandas-style ``to_csv``
            (presumably the DataFrame returned by ``extract`` -- confirm).
        """
        _result.to_csv(self.temp_dest, sep='\t', index=False, header=True)


if __name__ == '__main__':
    # Ad-hoc driver: configure the fitCons util and run it on the RSNP file.
    fitcons_util = FitconsUtil()
    fitcons_util.src_data_dir = find_directory("fitcons")
    # Keyword name -> source BED file name.
    fitcons_util.src_data_fn = dict(
        fitConsGm="fc-gm-0.bed",
        fitConsH1="fc-h1-0.bed",
        fitConsHu="fc-hu-0.bed",
        fitConsI6="fc-i6-0.bed",
    )

    # The BED file has no header row; the four columns are named explicitly.
    rsnp_dfm = pd.read_table(os.path.join(find_directory('fitcons'), "RSNP_50kb.bed"),
                             header=None,
                             names=['chrom', 'chromStart', 'chromEnd', 'name'])
    fitcons_util.temp_dest = 'foo_rsnp.txt'
    # The return value is discarded here.
    # NOTE(review): presumably `extract` persists via `save_temp` -- confirm.
    fitcons_util.extract(_input=rsnp_dfm)

    # NOTE(review): the visible text ends inside this call.
    csnp_dfm = pd.read_table(os.path.join(find_directory('fitcons'),
# find minimum non-NAN 'P_Val' for each SNP result = result.groupby('name').agg(min).reset_index() # some SNPs has no non-NAN 'P_Val'; mark their aggregated 'P_VAL' NAN result = snp_dfm.merge(result, how='left', on='name', copy=True) result.loc[:, 'P_Val'] = result.loc[:, 'P_Val'].apply( self.__transform_p_value) result = result.rename(columns={'P_Val': 'eqtlPvalue'}) return result.loc[:, ['name', 'eqtlPvalue']] def save_temp(self, _result): _result.to_csv(self.temp_dest, sep='\t', header=True, index=False) if __name__ == '__main__': rsnp_dfm = pd.read_table(os.path.join(find_directory('CADD'), "RSNP_50kb.bed"), header=None, names=['chrom', 'chromStart', 'chromEnd', 'name']) print(rsnp_dfm.shape) eqtl_util = EqtlUtil() eqtl_util.src_data_dir = find_directory('eqtl') # eqtl_util.src_data_fn = ['Stomach.portal.eqtl', 'Heart_Left_Ventricle.portal.eqtl'] eqtl_util.temp_dest = 'foo_rsnp.txt' result = eqtl_util.extract(rsnp_dfm) print(result)
# NOTE(review): the first statements are the tail of a method whose `def`
# line is outside this view (presumably `extract` -- confirm). The call
# that these closing arguments belong to starts before the visible text.
            on=['name', 'chrom'], copy=True)

        # Return only the name and the two DHS columns; missing values in
        # those columns are replaced with 0.
        return snp_dfm.loc[:, ["name", "masterDhsScore", "masterDhsCount" ]].fillna(0)

    def save_temp(self, _result):
        """Write selected columns of ``_result`` to ``self.temp_dest``.

        Output is tab-separated with a header row and no index; only the
        'name', 'masterDhsScore' and 'masterDhsCount' columns are written.

        :param _result: object exposing a pandas-style ``to_csv``
            (presumably the DataFrame returned by ``extract`` -- confirm).
        """
        _result.to_csv(self.temp_dest, sep='\t', header=True, index=False,
                       columns=["name", "masterDhsScore", "masterDhsCount"])


if __name__ == '__main__':
    # Ad-hoc driver: load the RSNP and CSNP BED files (no header row; the
    # four columns are named explicitly) and run the master-DHS extraction.
    rsnp_bed_fn = os.path.join(find_directory('dhs'), "RSNP_50kb.bed")
    rsnp_dfm = pd.read_table(rsnp_bed_fn, header=None,
                             names=['chrom', 'chromStart', 'chromEnd', 'name'])

    csnp_bed_fn = os.path.join(find_directory('dhs'), "CSNP_50kb.bed")
    csnp_dfm = pd.read_table(csnp_bed_fn, header=None,
                             names=['chrom', 'chromStart', 'chromEnd', 'name'])

    dhs_util = MasterDhsUtil()
    dhs_util.db_config_key = 'local_hg19'

    dhs_util.temp_dest = 'FOO_CSNP.tsv'
    # The return value is discarded here.
    # NOTE(review): presumably `extract` persists via `save_temp` -- confirm.
    dhs_util.extract(csnp_dfm)

    # NOTE(review): the visible text ends right after this assignment.
    dhs_util.temp_dest = 'FOO_RSNP.tsv'
# NOTE(review): flat run of statements whose enclosing scope is outside this
# view; `db_config_key` is defined elsewhere. Each stanza instantiates one
# utility object and sets its database key or its source directory/file.
coord_util = CoordUtil()
coord_util.db_config_key = db_config_key

tf_util = TfUtil(reproduce_osu17=True)
tf_util.db_config_key = db_config_key

# Jaspar TFBS utility is currently disabled (kept commented out).
# jaspar_tfbs_util = JasparTfbsUtil()
# jaspar_tfbs_util.src_data_dir = sys_tool.find_directory('Jaspar_TFBS')
# jaspar_tfbs_util.src_data_fn = 'jaspar_tfbs_ensembl_75_hg19.txt'

mst_dhs_util = MasterDhsUtil()
mst_dhs_util.db_config_key = db_config_key

uni_dhs_util = UniformDhsUtil()
uni_dhs_util.src_data_dir = sys_tool.find_directory("dhs")
uni_dhs_util.src_data_fn = "UniformDnaseIHS"

phastcons_util = PhastconsUtil()
phastcons_util.db_config_key = db_config_key

tss_dist_util = TssDistUtil()
tss_dist_util.db_config_key = db_config_key

# Only the source directory is set for the eQTL utility in the visible text.
eqtl_util = EqtlUtil()
eqtl_util.src_data_dir = sys_tool.find_directory('eqtl')

gerp_util = GerpUtil()
gerp_util.src_data_dir = sys_tool.find_directory("gerp")
gerp_util.src_data_fn = "All_hg19_RS.bw"
# NOTE(review): the first statements are the tail of a method whose `def`
# line is outside this view (presumably `extract` -- confirm). The dict
# literal closed below opens before the visible text; presumably it is the
# `names` mapping used right after it -- confirm.
            'CTCF': 'CTCF_REG',
            'E': 'ENH',
            'PF': 'TSS_FLANK',
            'R': 'REP',
            'T': 'TRAN',
            'TSS': 'TSS',
            'WE': 'WEAK_ENH'
        }

        # Build a frame indexed by the mapping's keys, transpose it so those
        # keys become columns, then rename the columns through `names`.
        gwava_dfm = pd.DataFrame(results, index=names.keys()).T.rename(columns=names)

        # Left-join on the SNP 'name' against gwava_dfm's index so every
        # input SNP row is kept.
        snp_dfm = snp_dfm.merge(gwava_dfm, how='left', left_on='name',
                                right_index=True, copy=True)

        # Missing values become 0; the BED coordinate columns are dropped.
        return snp_dfm.fillna(0).drop(['chrom', 'chromStart', 'chromEnd'], axis=1)

    def save_temp(self, _result):
        """Write ``_result`` to ``self.temp_dest`` as a tab-separated file.

        The header row is written and the index is omitted.

        :param _result: object exposing a pandas-style ``to_csv``
            (presumably the DataFrame returned by ``extract`` -- confirm).
        """
        _result.to_csv(self.temp_dest, sep='\t', index=False, header=True)


if __name__ == '__main__':
    # Ad-hoc driver: load the RSNP BED file and run the GWAVA extraction.
    snp_bed_fn = os.path.join(sys_tool.find_directory("fsu_repli_chip"),
                              'RSNP_50kb.bed')
    # The BED file has no header row; the four columns are named explicitly.
    snp_dfm = pd.read_table(snp_bed_fn, header=None,
                            names=['chrom', 'chromStart', 'chromEnd', 'name'])

    gwava_util = GwavaUtil()
    gwava_util.src_data_dir = sys_tool.find_directory('GWAVA')
    gwava_util.src_data_fn = 'segmentation.bed.gz'
    gwava_util.temp_dest = 'GWAVA_RSNP.txt'

    # The return value is discarded here.
    # NOTE(review): presumably `extract` persists via `save_temp` -- confirm.
    gwava_util.extract(_input=snp_dfm)