def fetch_gene_gtf(gtf_fname: str, gene_ids_fname: str):
    """LOADS wormbase_gene

    This function fetches and parses the canonical geneset GTF
    and yields a dictionary for each row.
    """
    gene_gtf = read_gtf_as_dataframe(gtf_fname)
    gene_ids = get_gene_ids(gene_ids_fname)

    # Rename seqname to chrom
    gene_gtf = gene_gtf.rename({'seqname': 'chrom'}, axis='columns')

    # Add locus, chrom_num, and pos (gene midpoint) columns
    gene_gtf = gene_gtf.assign(
        locus=[gene_ids.get(x) for x in gene_gtf.gene_id])
    gene_gtf = gene_gtf.assign(
        chrom_num=[CHROM_NUMERIC[x] for x in gene_gtf.chrom])
    gene_gtf = gene_gtf.assign(
        pos=(((gene_gtf.end - gene_gtf.start) / 2) + gene_gtf.start).map(int))

    # Normalize missing values in the frame and exon_number columns
    gene_gtf.frame = gene_gtf.frame.apply(lambda x: x if x != "." else None)
    gene_gtf.exon_number = gene_gtf.exon_number.apply(
        lambda x: x if x != "" else None)

    # Classify each gene as falling on a chromosome arm or in the center
    gene_gtf['arm_or_center'] = gene_gtf.apply(
        lambda row: arm_or_center(row['chrom'], row['pos']), axis=1)

    for row in gene_gtf.to_dict('records'):
        yield row
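A minimal usage sketch, not part of the original source: the file names are hypothetical placeholders, and each yielded record is a plain dict keyed by the GTF columns plus the derived ones added above.

# Hypothetical file names; fetch_gene_gtf yields one dict per GTF row.
for record in fetch_gene_gtf('canonical_geneset.gtf.gz', 'gene_ids.txt.gz'):
    print(record['gene_id'], record['locus'], record['arm_or_center'])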
def test_ensembl_gtf_gene_names():
    df = read_gtf_as_dataframe(ENSEMBL_GTF_PATH)
    gene_names = set(df["gene_name"])
    assert gene_names == EXPECTED_GENE_NAMES, \
        "Wrong gene names: %s, missing %s and unexpected %s" % (
            gene_names,
            EXPECTED_GENE_NAMES.difference(gene_names),
            gene_names.difference(EXPECTED_GENE_NAMES))
def _load_full_dataframe_from_gtf(self):
    """
    Parse this genome source's GTF file and load it as a Pandas DataFrame
    """
    logger.info("Reading GTF from %s", self.gtf_path)
    df = read_gtf_as_dataframe(
        self.gtf_path,
        column_converters={
            "seqname": normalize_chromosome,
            "strand": normalize_strand,
        },
        infer_biotype_column=True)

    features = set(df["feature"])
    column_names = set(df.keys())

    # older Ensembl releases don't have "gene" or "transcript"
    # features, so fill in those rows if they're missing
    if "gene" not in features:
        # if we have to reconstruct gene feature rows then
        # fill in values for 'gene_name' and 'gene_biotype'
        # but only if they're actually present in the GTF
        logger.info("Creating missing gene features...")
        df = create_missing_features(
            dataframe=df,
            unique_keys={"gene": "gene_id"},
            extra_columns={
                "gene": {
                    "gene_name",
                    "gene_biotype",
                }.intersection(column_names),
            },
            missing_value="")
        logger.info("Done.")

    if "transcript" not in features:
        logger.info("Creating missing transcript features...")
        df = create_missing_features(
            dataframe=df,
            unique_keys={"transcript": "transcript_id"},
            extra_columns={
                "transcript": {
                    "gene_id",
                    "gene_name",
                    "gene_biotype",
                    "transcript_name",
                    "transcript_biotype",
                    "protein_id",
                }.intersection(column_names),
            },
            missing_value="")
        logger.info("Done.")

    return df
def mouse_gene_intervals():
    df = read_gtf_as_dataframe(GENCODE_MM10_FILE)
    # each comparison must be parenthesized: '&' binds tighter than '=='
    # in Python, so the unparenthesized form raises a TypeError in pandas
    df = df[(df.feature == 'gene') & (df.feature_type == 'protein_coding')]
    print(len(df))
    trees = {chromosome_strand: IntervalTree()
             for chromosome_strand in product(MOUSE_CHROMOSOMES, ['+', '-'])}
    for _, row in df.iterrows():
        if row['end'] > row['start']:
            # end is included, start count at 0 instead of 1
            trees[row['seqname'] + row['strand']][
                row['start'] - 1:row['end']] = row['gene_id']
    logging.info('Built mouse gene interval tree with {} nodes'.format(
        sum(len(tree) for tree in trees.values())))
    return trees
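A short query sketch, assuming the `intervaltree` package's point-lookup API (`tree[point]` returns the set of overlapping Interval objects) and keys of the form 'chr1+'; the coordinate is made up.

# Hypothetical lookup: '+'-strand genes on chr1 covering a 0-based position.
# Each hit is an intervaltree.Interval whose .data holds the stored gene_id.
trees = mouse_gene_intervals()
for interval in trees['chr1+'][3000000]:
    print(interval.begin, interval.end, interval.data)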
def tr_gene_map(gtf):
    gtf_df = read_gtf_as_dataframe(gtf)
    tr_df = gtf_df[gtf_df.feature == 'transcript']
    tr_gene_map = tr_df.loc[:, ['transcript_id', 'gene_id']]
    tr_gene_map = tr_gene_map.set_index('transcript_id')
    return tr_gene_map
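A usage sketch with a hypothetical GTF path and transcript id: the returned frame is indexed by transcript_id, so gene lookup is a single .loc access.

# Illustrative only; path and transcript id are placeholders.
mapping = tr_gene_map('annotation.gtf')
gene = mapping.loc['ENST00000456328', 'gene_id']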
def test_read_refseq_gtf_as_dataframe(): gtf_df = read_gtf_as_dataframe(REFSEQ_GTF_PATH) _check_required_columns(gtf_df)
def read_gencode(genome=GENOME):
    '''
    Buffered gencode read with HAVANA/ENSEMBL merged.

    Swissprot IDs are merged and start-end indexing is adjusted.
    Returns the relevant columns of the gencode dataframe only.
    '''
    if genome == 'hg19':
        df = read_gtf_as_dataframe(GENCODE_HG19_FILE)
    elif genome == 'hg38':
        df = read_gtf_as_dataframe(GENCODE_HG38_FILE)
    elif genome == 'mm10':
        df = read_gtf_as_dataframe(GENCODE_MM10_FILE)
    else:
        raise ValueError('Unsupported genome: %s' % genome)

    df.exon_number = df.exon_number.apply(pd.to_numeric, errors='coerce')

    # strip version suffixes from identifiers
    df.protein_id = df.protein_id.map(lambda v: v[:v.find('.')])
    df.exon_id = df.exon_id.map(lambda v: v[:v.find('.')])
    df.gene_id = df.gene_id.map(lambda v: v[:v.find('.')])
    df.transcript_id = df.transcript_id.map(lambda v: v[:v.find('.')])

    # only take protein_coding genes/transcripts/exons
    df = df[
        (df['gene_type'] == 'protein_coding') &
        (df['feature'].isin(['gene', 'transcript', 'exon', 'UTR'])) &
        (df['seqname'].isin(CHROMOSOMES))]

    # drop all transcripts and exons that have no protein_id
    df.drop(df.index[(df.protein_id == '') &
                     (df.feature.isin(['exon', 'transcript', 'UTR']))],
            inplace=True)

    # only take exons and transcripts which contain a basic-tag
    non_basic_transcripts = \
        (df['feature'].isin(['transcript', 'exon', 'UTR'])) & \
        ~(df['tag'].str.contains('basic'))
    df.drop(df.index[non_basic_transcripts], inplace=True)

    # add swissprot id mappings
    protein_id_mapping = load_protein_mapping()
    protein_id_mapping = protein_id_mapping[
        protein_id_mapping.ID_NAME == 'Ensembl_PRO'][
        ['swissprot_id', 'protein_id']]
    df = df.merge(protein_id_mapping, how='left', on='protein_id')

    # delete ENSEMBL entries which come from both HAVANA and ENSEMBL
    mixed_ids = df[['gene_id', 'source']].drop_duplicates()
    counts = mixed_ids.gene_id.value_counts()
    duplicate_ids = counts.index[counts == 2]
    df.drop(df.index[df.gene_id.isin(duplicate_ids) &
                     (df.source == 'ENSEMBL')], inplace=True)

    # fix indexing
    df.start -= 1

    # drop alternative_3or5_UTR transcripts
    # df = df.drop(df.index[df.tag.str.contains('alternative_')])

    # drop all genes which have no transcripts
    valid_genes = df[df['feature'] == 'transcript'].gene_id.drop_duplicates()
    # double check that there are no orphan exons
    assert set(valid_genes) == \
        set(df[df['feature'] == 'exon'].gene_id.drop_duplicates())
    df.drop(df.index[~df.gene_id.isin(valid_genes)], inplace=True)

    # select best transcript
    df = df.groupby('gene_id').apply(_filter_best_transcript)
    df.reset_index(level=0, drop=True, inplace=True)

    return df[[
        'feature', 'gene_id', 'transcript_id', 'start', 'end', 'exon_id',
        'exon_number', 'gene_name', 'transcript_type', 'strand', 'gene_type',
        'tag', 'protein_id', 'swissprot_id', 'score', 'seqname', 'source']]
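A brief usage sketch, assuming the GENCODE file constants are configured: the returned frame mixes gene, transcript, exon and UTR rows, so downstream code typically filters on `feature` first.

# Illustrative only: split the merged frame back into feature levels.
gencode = read_gencode('hg38')
genes = gencode[gencode.feature == 'gene']
exons = gencode[gencode.feature == 'exon']
print(len(genes), 'genes,', len(exons), 'exons after filtering')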
def test_ensembl_gtf_columns():
    df = read_gtf_as_dataframe(ENSEMBL_GTF_PATH)
    features = set(df["feature"])
    eq_(features, EXPECTED_FEATURES)
def test_read_stringtie_gtf_as_dataframe():
    gtf_df = read_gtf_as_dataframe(B16_GTF_PATH)
    _check_required_columns(gtf_df)
    _check_string_cov_and_FPKM(gtf_df)
    fig = self.plt.figure()
    cmap = self.plt.cm.tab20
    color = iter(cmap(np.linspace(0, 1, len(data_map))))
    for label, data_arrays in data_map.items():
        self.plt.plot(data_arrays[0], data_arrays[1], marker,
                      label=label, color=next(color))
    if legend:
        self.plt.legend(loc=legend_loc)
    self._set_properties_and_close(fig, title, xlab, ylab)


if __name__ == '__main__':
    args = parser.parse_args()
    df = read_gtf_as_dataframe(args.gtf)
    df_trs = df[df["feature"] == "exon"]

    # nested mapping: gene_id -> transcript_id -> exon_number -> coverage
    et = defaultdict(lambda: defaultdict(dict))
    for row in df_trs.itertuples():
        et[row.gene_id][row.transcript_id][int(row.exon_number)] = \
            float(row.cov)

    etp = defaultdict(lambda: defaultdict(list))
    for gene, trs_info in sorted(et.items(), key=lambda x: x[0]):
        for trs, exon_info in trs_info.items():
            exon_numbers = np.array(list(exon_info.keys()), dtype=int)
            exon_cov = np.array(list(exon_info.values()), dtype=float)
            # exon_cov = exon_cov / np.sum(exon_cov)
            exon_cov = np.log(exon_cov + 1)
            etp[gene][trs].extend([exon_numbers, exon_cov])
def load_dataset(drop_locus=True):
    '''
    Load and prepare the achilles dataset to be processed by azimuth
    feature extraction

    :returns: Xdf, Y, gene_position, target_genes as in azimuth.load_dataset
    '''
    activity_scores = pd.read_csv(ACHILLES_GUIDE_ACTIVITY_SCORES_FILE,
                                  sep='\t')
    guide_map = pd.read_csv(ACHILLES_GUIDE_MAPPING, sep='\t')
    guide_map.dropna(inplace=True)
    guide_map.rename(index=str, columns={'Gene': 'Target'}, inplace=True)
    guide_map = guide_map.groupby('Guide').first()
    activity_scores.dropna(inplace=True)
    activity_scores.set_index('Guide', inplace=True)
    df = guide_map.join(activity_scores)

    # TODO why hg38 and not 37
    hg38 = read_gtf_as_dataframe(GENCODE_HG38_FILE)
    hg38 = hg38.loc[(hg38.feature == 'gene')]

    # remove duplicate gene names (chrX, chrY) by using first one
    # this might be inaccurate but shouldn't have a big impact.
    # It affects around 60 datapoints only (out of 70000)
    hg38 = hg38.groupby('gene_name').first().reset_index()

    # fix wrong gene names
    hg38['gene_id'] = hg38['gene_id'].apply(lambda v: v[:15])
    merged_mapping = GENE_ID_MAPPING.merge(
        hg38[['gene_id', 'gene_name']], how='inner', on='gene_id')
    df.Target = df.Target.apply(
        lambda gene: gene if (gene == hg38.gene_name).any()
        else _first_or_none(
            merged_mapping.loc[merged_mapping.symbol == gene].gene_name))
    df.dropna(inplace=True)

    contexts = df.apply(
        lambda row: _find_context(row.name, *row.Locus.split('_')[:2]),
        axis=1)
    df['30mer'] = [c[0] for c in contexts]
    df['Strand'] = [c[1] for c in contexts]
    df.dropna(inplace=True)

    Y = pd.DataFrame({'score_drug_gene_rank': df['Activity']},
                     index=df.index)

    # calculate percent peptide and 'Amino Acid Cut position'
    df_positions = df.merge(hg38[['gene_name', 'start', 'end']],
                            left_on='Target', right_on='gene_name')
    nt_cut_position = df_positions.Locus.map(lambda v: int(v.split('_')[1]))
    pp = (100.0 * (nt_cut_position - df_positions.start) /
          (df_positions.end - df_positions.start))
    # 'Amino Acid Cut position' is only a rough heuristic, not the exact value
    aacp = (pp / 100.0) * ((df_positions.end - df_positions.start) / 100)

    df.drop(['Activity'], axis=1, inplace=True)
    if drop_locus:
        df.drop(['Locus'], axis=1, inplace=True)

    gene_position = pd.DataFrame({
        'Percent Peptide': pp,
        'Amino Acid Cut position': aacp
    })
    gene_position.set_index(df.index, inplace=True)
    df.index.name = 'Sequence'
    df['drug'] = 'nodrug'
    target_genes = df['Target'].drop_duplicates()
    df.reset_index(inplace=True)
    df.set_index(['Sequence', 'Target', 'drug'], inplace=True)
    return df, Y, gene_position, target_genes
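A sketch of consuming the return values, mirroring the azimuth.load_dataset convention the docstring references; the printout is illustrative only.

# Xdf carries the features ('30mer', 'Strand', ...), Y the activity ranks,
# both indexed by (Sequence, Target, drug).
Xdf, Y, gene_position, target_genes = load_dataset()
print(Xdf.shape, Y.shape, len(target_genes))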
def read_hg38():
    print('read gencode')
    df = read_gtf_as_dataframe(GENCODE_HG38_FILE)
    df.gene_id = df.gene_id.apply(lambda gid: gid[:15])
    return df
def test_read_stringtie_gtf_as_dataframe_float_values():
    gtf_df = read_gtf_as_dataframe(
        B16_GTF_PATH,
        column_converters={"cov": float, "FPKM": float})
    _check_required_columns(gtf_df)
    _check_float_cov_and_FPKM(gtf_df)