def change_accessions(ids, input_format, output_format, species, tmp):
    """Translate accessions from one ID system to another via Ensembl BioMart.

    Typical chains are refseq -> ensembl -> entrez. The complete
    input->output mapping for a (species, input_format, output_format)
    triple is downloaded once and cached as a pickle inside ``tmp``; later
    calls load the cache instead of querying BioMart again.

    Args:
        ids: Iterable of accessions expressed in ``input_format``.
        input_format: One of 'enst', 'ensg', 'refseq', 'entrez', 'gs', 'ext'.
        output_format: Same choices as ``input_format``.
        species: 'mouse' or 'human'. Only consulted when no usable cache
            file exists (it is always part of the cache file name).
        tmp: Directory in which the cache file is read/written.

    Returns:
        ``ids`` unchanged (the same object) when both formats are equal.
        Otherwise a new list, one entry per input id, holding the mapped
        accession or '-' when the id has no mapping.

    Raises:
        ValueError: No cache is available and ``species`` is unsupported.
        KeyError: No cache is available and a format name is unknown.
    """
    if input_format == output_format:
        return ids

    mart_file = os.path.join(
        tmp,
        'biomart_%s_%s_%s.ipage.pickle' % (species, input_format, output_format))

    if os.path.isfile(mart_file) and os.stat(mart_file).st_size != 0:
        # NOTE(security): unpickling runs arbitrary code; `tmp` must be a
        # directory only trusted users can write to.
        with open(mart_file, 'rb') as f:
            input_to_output = pickle.load(f)
    else:
        dataset_names = {
            'mouse': 'mmusculus_gene_ensembl',
            'human': 'hsapiens_gene_ensembl',
        }
        if species not in dataset_names:
            # Previously this fell through and crashed later on an unbound
            # `dataset` variable.
            raise ValueError(
                "Unsupported species %r; expected 'mouse' or 'human'"
                % (species,))
        dataset = pybiomart.Dataset(name=dataset_names[species],
                                    host='http://www.ensembl.org')

        # BioMart attribute name(s) behind every supported format.
        mart_attributes = {
            'enst': ['ensembl_transcript_id'],
            'ensg': ['ensembl_gene_id'],
            'refseq': [
                'refseq_mrna', 'refseq_mrna_predicted', 'refseq_ncrna',
                'refseq_ncrna_predicted'
            ],
            'entrez': ['entrezgene_id'],
            'gs': ['entrezgene_accession'],
            'ext': ['external_gene_name']
        }

        output_attributes = mart_attributes[output_format]
        if output_format == 'refseq':
            # Only the first refseq attribute is used as a translation target.
            output_attributes = [output_attributes[0]]

        input_to_output = {}
        for mart in mart_attributes[input_format]:
            df1 = dataset.query(attributes=[mart] + output_attributes)
            # Keep only rows where both the source and target id are present.
            df1 = df1[df1.iloc[:, 0].notna()]
            df1 = df1[df1.iloc[:, 1].notna()].copy()
            if input_format == 'entrez' or output_format == 'entrez':
                # Render numeric Entrez ids without a decimal part
                # (e.g. 1234.0 -> '1234').
                df1['NCBI gene ID'] = df1['NCBI gene ID'].apply(
                    lambda x: '%.f' % x)
            # NOTE(review): 'gene_symbol' is not a key of mart_attributes
            # (gene symbols use 'gs'), so this branch cannot currently run.
            # Left untouched because enabling it would change the mapping;
            # confirm the intended behaviour.
            if input_format == 'gene_symbol' or output_format == 'gene_symbol':
                df1['NCBI gene accession'] = df1['NCBI gene accession'].apply(
                    lambda x: x.upper() if isinstance(x, str) else x)
            # Later attributes override earlier ones on duplicate keys.
            input_to_output.update(zip(df1.iloc[:, 0], df1.iloc[:, 1]))

        with open(mart_file, 'wb') as f:
            pickle.dump(input_to_output, f, pickle.HIGHEST_PROTOCOL)

    return [input_to_output.get(id_, '-') for id_ in ids]
def save_as_bed(ensembl_ids, output_location, output_name, header):
    '''
    Query Ensembl BioMart (GRCh37) for each Ensembl gene ID and write the
    results to a BED file.

    Every ID is queried individually for chromosome, start, end and gene
    name; each result is rendered as tab-separated text without header or
    index. The output file starts with ``header`` on its own line, followed
    by the rendered results in plain string sort order (so chromosome '10'
    sorts before '2').

    Args:
        ensembl_ids: iterable of Ensembl gene IDs; each element is used as
            the 'link_ensembl_gene_id' filter value of one query.
            NOTE(review): iterating a pandas DataFrame directly yields its
            column labels, so callers holding a one-column DataFrame should
            pass the column itself - confirm against callers.
        output_location: path prefix; concatenated as-is with
            ``output_name`` (no separator is inserted).
        output_name: file name without the '.bed' extension.
        header: first line of the file (panel information).

    NOTE: IDs are queried one by one as biomart cannot deal with lots of
    inputs (maximum unknown - batching could speed this up).
    '''
    # Load in pybiomart dataset - GRCh37
    dataset = pybiomart.Dataset(name='hsapiens_gene_ensembl',
                                host='http://grch37.ensembl.org')

    bed_attributes = (
        'chromosome_name', 'start_position', 'end_position',
        'external_gene_name'
    )

    # Query each ID one at a time; every entry is the TSV text of one query.
    bed_list = [
        dataset.query(
            attributes=list(bed_attributes),
            filters={'link_ensembl_gene_id': gene},
        ).to_csv(header=False, index=False, sep="\t")
        for gene in ensembl_ids
    ]

    # Save BED file; the context manager closes the handle even if a write
    # fails part-way through.
    out = str(output_location) + str(output_name) + '.bed'
    with open(out, 'w') as csv_out:
        csv_out.write(str(header) + '\n')
        csv_out.writelines(sorted(bed_list))
def fetch_ensembl_exons(build='37', chromosomes=None):
    """Fetch the ensembl exons

    Queries the 'hsapiens_gene_ensembl' dataset for exon and UTR
    coordinates, restricted to the requested chromosomes.

    Args:
        build(str): ['37', '38']. '37' uses the GRCh37 archive host; any
            other value uses the main Ensembl host.
        chromosomes(iterable(str)): chromosomes to fetch; defaults to
            CHROMOSOMES when omitted or empty.

    Returns:
        result(DataFrame): the query result, with columns named as
            returned by pybiomart's default query settings.
    """
    chromosomes = chromosomes or CHROMOSOMES
    LOG.info("Fetching ensembl exons build %s ...", build)

    if build == '37':
        url = 'http://grch37.ensembl.org'
    else:
        url = 'http://www.ensembl.org'

    dataset = pybiomart.Dataset(name='hsapiens_gene_ensembl', host=url)

    attributes = [
        'chromosome_name',
        'ensembl_gene_id',
        'ensembl_transcript_id',
        'ensembl_exon_id',
        'exon_chrom_start',
        'exon_chrom_end',
        '5_utr_start',
        '5_utr_end',
        '3_utr_start',
        '3_utr_end',
        'strand',
        'rank',
    ]
    filters = {
        'chromosome_name': chromosomes,
    }

    return dataset.query(attributes=attributes, filters=filters)
def _get_attributes(self, attributes=None, dataset_name='mmusculus_gene_ensembl'):
    """
    Get gene attributes and find principal transcripts.

    Called after ensembl_gene_id query. Dependent on pybiomart package.

    Queries BioMart for every gene in ``self.gene_list['ensembl_gene_id']``
    and writes the chosen transcript back into ``self.gene_list``:

    * lncRNA genes: the longest transcript.
    * protein coding genes: among the transcripts with the lowest APPRIS
      rank, the longest one. For these genes the 'appris_rank' column
      (holding the APPRIS annotation label of the chosen transcript) and
      the 'transcript_length' column are filled in as well.

    Args:
        attributes: BioMart attribute names to query. Defaults to a
            hard-coded mouse attribute list. The result must contain the
            columns 'Gene type', 'Gene stable ID',
            'Transcript length (including UTRs and CDS)',
            'Transcript stable ID version' and 'APPRIS annotation'.
        dataset_name: BioMart dataset to query. Defaults to mouse genes.

    Returns:
        None; ``self.gene_list`` is modified in place.
    """
    gene_col = 'Gene stable ID'
    length_col = 'Transcript length (including UTRs and CDS)'
    transcript_col = 'Transcript stable ID version'

    # Set the dataset. Default to mouse genes.
    dataset = bm.Dataset(name=dataset_name, host='http://www.ensembl.org')

    # Set the attributes and filters for query.
    # Some temporary hard-coding here.
    if attributes is None:
        attributes = ['mgi_symbol', 'ensembl_gene_id', 'ensembl_gene_id_version',
                      'ensembl_transcript_id', 'ensembl_transcript_id_version',
                      'transcript_appris', 'transcript_length',
                      'gene_biotype', 'transcript_count']
    filters = {'link_ensembl_gene_id': self.gene_list['ensembl_gene_id'].tolist()}

    # Retrieve information
    query_result = dataset.query(attributes=attributes, filters=filters)

    #####################################################
    ##### Find the transcript to use for each gene. #####
    #####################################################

    # Create a new column for the chosen transcript
    self.gene_list['ensembl_transcript_id_version'] = ''

    # For lncRNA, choose the longest transcript.
    lnc_qr = query_result[query_result['Gene type'] == 'lncRNA']
    lnc_ind = lnc_qr.groupby([gene_col])[length_col].idxmax()
    for _, row in query_result.loc[lnc_ind].iterrows():
        gene_mask = self.gene_list['ensembl_gene_id'] == row[gene_col]
        # .loc (not .at) because the row selector is a boolean mask;
        # .at only accepts a single scalar label.
        self.gene_list.loc[gene_mask, 'ensembl_transcript_id_version'] = row.loc[transcript_col]
        if self.verbose:
            print('\n%s' % self.gene_list[gene_mask])

    # For protein coding genes, select the one with smallest APPRIS annotation.
    # Work on a copy so that adding a column does not write into a slice of
    # query_result.
    prot_qr = query_result[query_result['Gene type'] == 'protein_coding'].copy()
    # presumably _appris2rank maps an APPRIS label to a number where lower
    # is better - confirm against its definition.
    prot_qr['appris_rank'] = prot_qr['APPRIS annotation'].apply(self._appris2rank)

    # lowest APPRIS rank: keep every transcript tied for the best rank of
    # its gene. (Using idxmin here would already reduce each gene to a
    # single row and turn the length tie-break below into a no-op.)
    best_rank = prot_qr.groupby([gene_col])['appris_rank'].transform('min')
    best_qr = prot_qr[prot_qr['appris_rank'] == best_rank]

    # longest among the best-ranked transcripts
    prot_ind = best_qr.groupby([gene_col])[length_col].idxmax()

    # Write the selected transcript
    for _, row in query_result.loc[prot_ind].iterrows():
        gene_mask = self.gene_list['ensembl_gene_id'] == row[gene_col]
        self.gene_list.loc[gene_mask, 'ensembl_transcript_id_version'] = row.loc[transcript_col]
        self.gene_list.loc[gene_mask, 'appris_rank'] = row.loc['APPRIS annotation']
        self.gene_list.loc[gene_mask, 'transcript_length'] = row.loc[length_col]
def _homology_map(from_org, to_org, host, cache=True):
    """Return the Ensembl gene homology table between two organisms.

    Queries the ``<from_org>_gene_ensembl`` dataset on ``host`` for the
    gene id column and the ``<to_org>_homolog_ensembl_gene`` column,
    renames both columns via ``_format_name`` and passes the frame through
    ``_convert_to_str`` before returning it.

    Args:
        from_org: organism prefix of the source dataset.
        to_org: organism prefix of the homolog column.
        host: BioMart host URL.
        cache: forwarded to pybiomart as ``use_cache``.
    """
    # The source ids always come from the plain gene-id attribute; the
    # target column is organism specific.
    query_columns = ['ensembl_gene_id', to_org + '_homolog_ensembl_gene']

    # Fetch the raw mapping from Ensembl.
    mart = pybiomart.Dataset(
        host=host,
        name=from_org + '_gene_ensembl',
        use_cache=cache,
    )
    homology = mart.query(attributes=query_columns)

    # Relabel the columns after the organisms that were requested.
    source_label = _format_name(from_org, 'ensembl')
    target_label = _format_name(to_org, 'ensembl')
    homology.columns = [source_label, target_label]

    return _convert_to_str(homology)
def _id_map(from_type, to_type, host, organism='hsapiens', cache=True):
    """Return a table mapping one gene id type to another for an organism.

    Each id type is first looked up in ``ID_ALIASES``; when no alias
    exists the type name itself is used as the BioMart attribute. The
    resulting two-column frame is renamed via ``_format_name`` and passed
    through ``_convert_to_str`` before being returned.

    Args:
        from_type: source id type (alias or BioMart attribute name).
        to_type: target id type (alias or BioMart attribute name).
        host: BioMart host URL.
        organism: organism prefix of the ``*_gene_ensembl`` dataset.
        cache: forwarded to pybiomart as ``use_cache``.
    """
    # Resolve aliases, falling back to the literal type name.
    query_columns = [
        ID_ALIASES.get(id_type, id_type) for id_type in (from_type, to_type)
    ]

    # Fetch the raw mapping from Ensembl.
    mart = pybiomart.Dataset(
        host=host,
        name=organism + '_gene_ensembl',
        use_cache=cache,
    )
    id_frame = mart.query(attributes=query_columns)

    # Relabel the columns after the requested (un-aliased) id types.
    source_label = _format_name(organism, from_type)
    target_label = _format_name(organism, to_type)
    id_frame.columns = [source_label, target_label]

    return _convert_to_str(id_frame)
def fetch_ensembl_transcripts(build='37', chromosomes=None):
    """Fetch the ensembl transcripts

    Queries the 'hsapiens_gene_ensembl' dataset for transcript coordinates
    and refseq identifiers on the requested chromosomes. Result columns
    carry the BioMart attribute names (``use_attr_names=True``).

    Args:
        build(str): ['37', '38']. '37' uses the GRCh37 archive host; any
            other value uses the main Ensembl host.
        chromosomes(iterable(str)): defaults to CHROMOSOMES when omitted
            or empty.

    Returns:
        result(DataFrame)
    """
    wanted_chromosomes = chromosomes or CHROMOSOMES
    LOG.info("Fetching ensembl transcripts build %s ...", build)

    host = 'http://grch37.ensembl.org' if build == '37' else 'http://www.ensembl.org'
    mart = pybiomart.Dataset(name='hsapiens_gene_ensembl', host=host)

    transcript_columns = [
        'chromosome_name',
        'ensembl_gene_id',
        'ensembl_transcript_id',
        'transcript_start',
        'transcript_end',
        'refseq_mrna',
        'refseq_mrna_predicted',
        'refseq_ncrna',
    ]

    return mart.query(
        attributes=transcript_columns,
        filters={'chromosome_name': wanted_chromosomes},
        use_attr_names=True,
    )
def fetch_ensembl_genes(build='37', chromosomes=None):
    """Fetch the ensembl genes

    Queries the 'hsapiens_gene_ensembl' dataset for gene coordinates and
    HGNC identifiers on the requested chromosomes. Result columns carry
    the BioMart attribute names (``use_attr_names=True``).

    Args:
        build(str): ['37', '38']. '37' uses the GRCh37 archive host; any
            other value uses the main Ensembl host.
        chromosomes(iterable(str)): chromosomes to fetch; defaults to
            CHROMOSOMES when omitted or empty.

    Returns:
        result(DataFrame)
    """
    chromosomes = chromosomes or CHROMOSOMES

    if build == '37':
        url = 'http://grch37.ensembl.org'
    else:
        url = 'http://www.ensembl.org'
    LOG.info("Fetching ensembl genes from %s", url)

    dataset = pybiomart.Dataset(name='hsapiens_gene_ensembl', host=url)

    attributes = [
        'chromosome_name',
        'start_position',
        'end_position',
        'ensembl_gene_id',
        'hgnc_symbol',
        'hgnc_id',
    ]
    filters = {
        'chromosome_name': chromosomes,
    }

    return dataset.query(
        attributes=attributes,
        filters=filters,
        use_attr_names=True,
    )
# 'chromosome_name': ['1','2'], #'1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y', 'transcript_biotype': 'protein_coding', # 'link_ensembl_gene_id': 'ENSG00000139618' } use_filters = filters # server = pybiomart.Server(host=server_url) # marts = server.list_marts() # print(marts) #server = pybiomart.Server(host='http://www.ensembl.org') #print(server.list_marts()) #sys.exit() dataset = pybiomart.Dataset(name='hsapiens_gene_ensembl', host=server_url) if int(release) <= 78: use_attributes = [] for attribute in sequence_attributes: if attribute in dataset.attributes: use_attributes.append(attribute) elif attribute == 'external_gene_name': use_attributes.append('external_gene_id') print('Replacing "external_gene_name" with "external_gene_id"') else: print('Removing Attribute: %s' % attribute) use_filters = {'biotype': filters['transcript_biotype']} #print(dataset.list_filters())
type=int, action="store", dest="seed", default=0, help="Use this integer seed for reproducibility." ) args = parser.parse_args() singularity run -B /ddn1 /ddn1/vol1/site_scratch/leuven/325/vsc32528/sif/vibsinglecellnf-pycistopic-0.1.img ipython args.sampleId = 'sample_test' args.fragments = '/ddn1/vol1/staging/leuven/stg_00002/lcb/cflerin/analysis/pbmc_atac/analysis/nextflow/test/out/fragments/VIB_1.sinto.fragments.tsv.gz' dataset = pbm.Dataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org') annot = dataset.query(attributes=['chromosome_name', 'transcription_start_site', 'strand', 'external_gene_name', 'transcript_biotype']) filter = annot['Chromosome/scaffold name'].str.contains('CHR|GL|JH|MT') annot = annot[~filter] annot['Chromosome/scaffold name'] = annot['Chromosome/scaffold name'].str.replace(r'(\b\S)', r'chr\1') annot.columns=['Chromosome', 'Start', 'Strand', 'Gene', 'Transcript_type'] annot = annot[annot.Transcript_type == 'protein_coding'] ################################################## fragments_dict = { args.sampleId: args.fragments } path_to_regions = {'Run_1':'/staging/leuven/stg_00002/lcb/dwmax/documents/aertslab/MLV/10x/exp/ih/20190425_NextSeq500_10x_scATAC/MLV__4aa2e0__Mouse_liver_ctrl/outs/peaks.bed', 'Run_2':'/staging/leuven/stg_00002/lcb/lcb_projects/MLV/cellranger_atac/NovaSeq6000_20200730/MLV__0d3236__liver_fresh_07_07_2020/outs/peaks.bed'}