def geneTranslation(var):
    """Write a CSV that pairs every variant region with a gene name.

    Each entry of ``var`` is split on ':'. The first three characters of the
    part before the colon are dropped (presumably a "chr" prefix -- confirm
    against the caller), and the part after it is parsed as a float and
    truncated to an int. The first gene reported at that locus is kept only
    when its biotype is 'protein_coding'; in every other case the literal
    string 'None' is recorded. The (gene, variant) rows are written to
    '../../data/genes/geneList.csv' under a "Genes","Variants" header.

    Arguments:
        var {list} -- List of chromosome regions
    """
    genome = pyensembl.Genome(
        reference_name='GRCh37',
        annotation_name='my_genome_features',
        gtf_path_or_url='../../data/datasets/Original/ensembl_v37.gtf')
    genome.index()

    rows = []
    progress = Bar('Processing', max=len(var), suffix='%(percent)d%%')
    for variant in var:
        fields = variant.split(':')
        contig = fields[0][3:]
        position = int(float(fields[1]))

        overlapping = genome.genes_at_locus(contig, position)
        # Only the first overlapping gene is inspected.
        if len(overlapping) != 0 and overlapping[0].biotype == 'protein_coding':
            label = overlapping[0].gene_name
        else:
            label = 'None'

        rows.append([label, variant])
        progress.next()
    progress.finish()

    print('>>> Writing txt...')
    with open('../../data/genes/geneList.csv', mode='w', newline='') as handle:
        writer = csv.writer(handle)
        writer.writerow(("Genes", "Variants"))
        writer.writerows(rows)
def load_ensembl_gene_ids(mouse_gtf=None):
    """Build the pyensembl genome object for mouse GRCm38.

    The GTF, cDNA FASTA and peptide FASTA locations are hard-coded Ensembl
    release-81 FTP URLs. ``index()`` is not called here.

    NOTE(review): ``mouse_gtf`` is accepted but never read by this function;
    confirm whether it was meant to override the GTF location.

    Returns:
        The ``pyensembl.Genome`` instance.
    """
    genome_options = {
        'reference_name': 'GRCm38',
        'gtf_path_or_url':
        'ftp://ftp.ensembl.org/pub/release-81/gtf/mus_musculus/Mus_musculus.GRCm38.81.gtf.gz',
        'transcript_fasta_path_or_url':
        'ftp://ftp.ensembl.org/pub/release-81/fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.all.fa.gz',
        'protein_fasta_path_or_url':
        'ftp://ftp.ensembl.org/pub/release-81/fasta/mus_musculus/pep/Mus_musculus.GRCm38.pep.all.fa.gz',
    }
    return pyensembl.Genome(**genome_options)
def main():
    """Annotate the counts and DGE tables, merge them and write a TSV.

    All inputs, outputs, the GTF path and the log file come from the
    ``snakemake`` object. Both tables are passed through ``annotate``,
    inner-joined on their indexes, sorted by the 'padj' column and written
    tab-separated to ``snakemake.output.table``.
    """
    # Everything written to stderr from here on goes to the Snakemake log.
    sys.stderr = open(snakemake.log[0], "w")

    genome = pyensembl.Genome(
        reference_name='GRCm38',
        annotation_name='mus_musculus',
        gtf_path_or_url=snakemake.params.gtf)
    genome.index()

    counts_table = annotate(snakemake.input.counts, genome)
    dge_table = annotate(snakemake.input.dge, genome)

    merged = dge_table.merge(
        counts_table, how='inner', left_index=True, right_index=True)
    ordered = merged.sort_values(by='padj')
    ordered.to_csv(snakemake.output.table, sep='\t')
def get_genome(reference_name, gtf, transcript_fasta=None, logging_args=None,
               annotation_name='ensembl', **kwargs):
    """ Open (and index, if necessary) a pyensembl genome for a reference.

    This is a thin wrapper around the pyensembl.Genome constructor followed
    by a call to its "index" method; see pyensembl for the details of the
    individual options.

    Parameters
    ----------
    reference_name: string
        The identifier for the reference

    gtf: string
        The path to the GTF annotation file

    transcript_fasta: string (optional)
        The path to the fasta file with transcript sequences. The fasta keys
        must match the transcript_id in the GTF file

    logging_args: argparse.Namespace
        pyensembl appears to change several logging levels while opening the
        annotation database (sometimes?). If the logging arguments are given,
        they are re-applied after the database has been opened.

    annotation_name, kwargs:
        Other options to pass to the pyensembl constructor

    Returns
    -------
    The indexed pyensembl.Genome object
    """
    genome = pyensembl.Genome(
        reference_name=reference_name,
        annotation_name=annotation_name,
        gtf_path_or_url=gtf,
        transcript_fasta_paths_or_urls=transcript_fasta,
        **kwargs)

    # index() builds the database on first use.
    genome.index()

    if logging_args is None:
        return genome

    # Re-apply the caller's logging configuration.
    logging_utils.update_logging(logging_args)
    return genome
def main(tsv_path):
    """
    Annotate a TSV file and save the result next to it.

    The table at ``tsv_path`` is passed to ``annotate`` together with a
    GRCm38 genome built from 'resources/genome.gtf'. The result is written
    tab-separated into the same directory as the input, with "_anno"
    inserted before the file extension (e.g. "x.tsv" -> "x_anno.tsv").

    :param tsv_path: Path for TSV to annotate
    """
    genome = pyensembl.Genome(
        reference_name='GRCm38',
        annotation_name='mus_musculus',
        gtf_path_or_url='resources/genome.gtf')
    genome.index()

    annotated = annotate(tsv_path, genome)

    folder, file_name = os.path.split(tsv_path)
    stem, suffix = os.path.splitext(file_name)
    out_path = os.path.join(folder, "{}_anno{}".format(stem, suffix))
    annotated.to_csv(out_path, sep='\t')
def main():
    """Map the gene names found in a MEME motif file to Ensembl identifiers.

    Command line: ``meme`` (input motif file), ``out`` (output file) and the
    optional ``--config``, ``--genome-name`` and ``--gtf`` options. When a
    yaml config file is given, its 'genome_name' and 'gtf' values replace the
    ones passed on the command line.

    Every line starting with "MOTIF" must consist of exactly three
    whitespace-separated fields (keyword, motif name, info). The gene name
    is extracted from the info field; if the field does not have the
    expected "(gene)_(species)_(score)" shape, the whole field is used as
    the gene name. If pyensembl raises ValueError for a gene name, the gene
    name itself is stored as the identifier. One output row is produced per
    (motif, Ensembl id) pair, with the columns 'motif_name', 'gene_name' and
    'ensembl_id'.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Given a meme motif file, extract the gene names and map "
        "them to ensembl identifiers using pyensembl. The pyensembl database "
        "information can be given either in a yaml config file or as command "
        "line options. The yaml config file values have precedence over the "
        "command line options.")

    parser.add_argument('meme', help="The meme file")
    parser.add_argument('out', help="The output file")

    parser.add_argument(
        '-c', '--config', help="The yaml config file. If "
        "given, this should include keys 'genome_name' and 'gtf'. Otherwise, "
        "they may be specified using the respective command line options.",
        default=None)

    parser.add_argument('-n', '--genome-name', help="The genome_parameter for "
                        "retrieving the pyensembl database", default=None)
    parser.add_argument('-g', '--gtf', help="The gtf file for pyensembl",
                        default=None)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # if the config file was given, use any values in it to replace those
    # passed on the command line
    if args.config is not None:
        msg = "Reading config file"
        logger.info(msg)

        # safe_load never instantiates arbitrary Python objects and, unlike
        # a bare yaml.load, does not need an explicit Loader argument. The
        # "with" block makes sure the file handle is closed.
        with open(args.config) as config_f:
            # An empty yaml document parses to None; treat it as "no keys".
            config = yaml.safe_load(config_f) or {}

        args.genome_name = config.get('genome_name', args.genome_name)
        args.gtf = config.get('gtf', args.gtf)

    msg = "genome_name: {}".format(args.genome_name)
    logger.debug(msg)

    msg = "gtf: {}".format(args.gtf)
    logger.debug(msg)

    msg = "Loading pyensembl database"
    logger.info(msg)

    ensembl = pyensembl.Genome(reference_name=args.genome_name,
                               annotation_name="ensembl",
                               gtf_path_or_url=args.gtf)

    # this will create the database if needed
    ensembl.index()

    msg = "Parsing motif gene names"
    logger.info(msg)

    # a line from CISBP looks like:
    #   MOTIF M002_0.6 (Ankhd1)_(Homo_sapiens)_(RBD_1.00)
    motif_re = re.compile(
        r"\((?P<gene_name>[^\)]+)\)_\((?P<species>[^\)]+)\)_"
        r"\((?P<rbd_score>[^\)]+)\)")

    all_motifs = []
    with open(args.meme) as meme_f:
        for line in meme_f:
            if not line.startswith("MOTIF"):
                continue

            (_, motif_name, info) = line.split()

            m = motif_re.match(info)
            if m is None:
                msg = ("Could not parse gene name. Guessing the entire "
                       "string is the gene name: '{}'.".format(info))
                logger.warning(msg)
                gene_name = info
            else:
                gene_name = m.group("gene_name")

            try:
                ensembl_ids = ensembl.gene_ids_of_gene_name(gene_name)
            except ValueError:
                msg = ("Could not find Ensembl identifier for gene_name: "
                       "'{}'".format(gene_name))
                logger.warning(msg)
                # fall back to the gene name so the motif is not dropped
                ensembl_ids = [gene_name]

            all_motifs.extend(
                {
                    "motif_name": motif_name,
                    "gene_name": gene_name,
                    "ensembl_id": ensembl_id
                }
                for ensembl_id in ensembl_ids)

    msg = "Joining motif gene names into large data frame"
    logger.info(msg)

    all_motifs_df = pd.DataFrame(all_motifs)

    msg = "Writing motifs to disk"
    logger.info(msg)

    utils.write_df(all_motifs_df, args.out, index=False)
from __future__ import print_function import sys import pyensembl myfile = open(sys.argv[1]) gtffile = sys.argv[2] outfile = open(sys.argv[1][:-4] + '_geneanno.sam', 'w') data = pyensembl.Genome(reference_name='GRCH37', annotation_name='my_genome_features', gtf_path_or_url=gtffile) data.index() with myfile: for line in myfile: if line[0] == '@': continue info = line.split('\t') chr = info[2] pos = int(info[3]) nameresult = data.gene_names_at_locus(contig=chr, position=pos) k = 20 if nameresult == []: nameresult = data.gene_names_at_locus(contig=chr, position=pos+k) if nameresult == []: nameresult = data.gene_names_at_locus(contig=chr, position=pos - k) genename = '' for item in nameresult: genename += item + ';' genename = genename[:-1] outfile.write(genename + '\t' + line)
peak[2], width=peak[1] - peak[0], color='b', edgecolor='none') axes[8].set_xlim([start_pos, end_pos]) axes[8].set_title('RelA Lipid A 2h') axes[8].set_yticks([0, 40]) axes[8].set_xticks([]) """ Plot location of Ccl3 and Ccl4 """ mouse_genome = pyensembl.Genome( reference_name='NCBIM37', gtf_path_or_url= 'ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz', transcript_fasta_path_or_url= 'ftp://ftp.ensembl.org/pub/release-67/fasta/mus_musculus/cdna/Mus_musculus.NCBIM37.67.cdna.all.fa.gz', protein_fasta_path_or_url= 'ftp://ftp.ensembl.org/pub/release-67/fasta/mus_musculus/pep/Mus_musculus.NCBIM37.67.pep.all.fa.gz' ) list_of_genes = ['Ccl3', 'Ccl4'] transcript_dict = {} gene_object_dict = {} for gene in list_of_genes: transcript_dict[gene] = mouse_genome.transcript_ids_of_gene_name(gene) for gene in list_of_genes: gene_object_dict[gene] = mouse_genome.genes_by_name(gene)[0] # Gene locations
def count_rsem_files(direc_name,
                     bammed_direc='aligned_star',
                     quant_direc='quant_rsem',
                     counted_direc='counted_rsem',
                     spikeids=None):
    """Summarise every RSEM '*.isoforms.results' file into an HDF5 store.

    For each matching file in ``<direc_name>/<quant_direc>`` the function
    reads the mapped/unmapped read counts from
    ``<direc_name>/<bammed_direc>/<sample>.transcript.sorted.bam`` (where
    ``<sample>`` is the file name up to its first '.'), loads the transcript
    and spike-in tables with ``load_sequence_counts_rsem``, sums
    'est_counts', 'fpkm' and 'tpm' over the transcripts of each gene, and
    stores four tables ('quality_control', 'transcripts', 'gene_counts',
    'spikeins') in ``<direc_name>/<counted_direc>/<file name>.h5``.

    Parameters
    ----------
    direc_name: string
        Directory that contains the three sub-directories below
    bammed_direc, quant_direc, counted_direc: string
        Names of the sub-directories holding the BAM files, the RSEM results
        and the output stores, respectively
    spikeids: list (optional)
        Passed through to ``load_sequence_counts_rsem``; defaults to an
        empty list

    Notes
    -----
    The gene -> transcript mapping comes from Ensembl release 81 (GRCm38).
    Transcripts that are missing from a sample's table contribute nothing to
    the gene's totals. Progress is printed to stdout (one line per gene
    while building the mapping, one line per directory entry afterwards).
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if spikeids is None:
        spikeids = []

    quant_path = os.path.join(direc_name, quant_direc)
    counted_path = os.path.join(direc_name, counted_direc)
    bammed_path = os.path.join(direc_name, bammed_direc)

    file_list = os.listdir(quant_path)

    mouse_genome = pyensembl.Genome(
        reference_name='GRCm38',
        gtf_path_or_url=
        'ftp://ftp.ensembl.org/pub/release-81/gtf/mus_musculus/Mus_musculus.GRCm38.81.gtf.gz',
        transcript_fasta_path_or_url=
        'ftp://ftp.ensembl.org/pub/release-81/fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.all.fa.gz',
        protein_fasta_path_or_url=
        'ftp://ftp.ensembl.org/pub/release-81/fasta/mus_musculus/pep/Mus_musculus.GRCm38.pep.all.fa.gz'
    )

    # The gene -> transcript ids mapping is the same for every sample, so it
    # is built once up front.
    gene_names = mouse_genome.gene_names()
    transcript_dict = {}
    for counter, gene in enumerate(gene_names, 1):
        transcript_dict[gene] = mouse_genome.transcript_ids_of_gene_name(gene)
        print(counter)

    for seq_file in file_list:
        is_rsem_result = fnmatch.fnmatch(seq_file, r'*.isoforms.results')
        print('{} {}'.format(seq_file, is_rsem_result))
        if not is_rsem_result:
            continue

        bam_file_sorted_by_loc = seq_file.split('.')[0] + '.transcript.sorted.bam'
        num_mapped, num_unmapped = bam_read_count(
            os.path.join(bammed_path, bam_file_sorted_by_loc))

        transcripts, spikeins = load_sequence_counts_rsem(
            rsem_file=os.path.join(quant_path, seq_file), spikeids=spikeids)

        quality_control = pd.DataFrame(
            {'num_mapped': num_mapped, 'num_unmapped': num_unmapped},
            index=[0])

        # Create a transcript table indexed by gene name - sum over all isoforms
        known_transcripts = transcripts.index
        counts_list = []
        fpkm_list = []
        tpm_list = []
        for gene in gene_names:
            counts = 0
            fpkm = 0
            tpm = 0
            for transcript in transcript_dict[gene]:
                if transcript in known_transcripts:
                    # One row lookup instead of one per column.
                    row = transcripts.loc[transcript]
                    counts += row['est_counts']
                    fpkm += row['fpkm']
                    tpm += row['tpm']
            counts_list.append(counts)
            fpkm_list.append(fpkm)
            tpm_list.append(tpm)

        gene_counts = pd.DataFrame({
            'gene_name': gene_names,
            'est_counts': counts_list,
            'fpkm': fpkm_list,
            'tpm': tpm_list
        })
        gene_counts.set_index('gene_name', inplace=True)

        # Store data frames in HDF5 format
        filename_save = os.path.join(counted_path, seq_file + '.h5')
        print('Saving ' + filename_save)
        store = pd.HDFStore(filename_save)
        try:
            store['quality_control'] = quality_control
            store['transcripts'] = transcripts
            store['gene_counts'] = gene_counts
            store['spikeins'] = spikeins
        finally:
            # Close the store even when one of the writes fails.
            store.close()
def count_hdf5_files(direc_name,
                     bammed_direc='bammed',
                     aligned_direc='aligned',
                     counted_direc='counted',
                     spikeids=None):
    """Summarise every '*.h5' quantification file into an HDF5 store.

    For each matching file in ``<direc_name>/<aligned_direc>`` the function
    reads the mapped/unmapped read counts from
    ``<direc_name>/<bammed_direc>/<sample>_sorted_by_location.bam`` (where
    ``<sample>`` is the file name up to its first '.'), loads the transcript
    and spike-in tables with ``load_sequence_counts_kallisto``, sums
    'est_counts', 'fpkm' and 'tpm' and averages 'eff_length' over the
    transcripts of each gene, and stores four tables ('quality_control',
    'transcripts', 'gene_counts', 'spikeins') under the same file name in
    ``<direc_name>/<counted_direc>``.

    Parameters
    ----------
    direc_name: string
        Directory that contains the three sub-directories below
    bammed_direc, aligned_direc, counted_direc: string
        Names of the sub-directories holding the BAM files, the input '.h5'
        files and the output stores, respectively
    spikeids: list (optional)
        Passed through to ``load_sequence_counts_kallisto``; defaults to an
        empty list

    Notes
    -----
    The transcript -> gene mapping is derived from the transcript index of
    the LAST matching file in the directory listing and reused for every
    sample, so all samples are expected to share that transcript index
    (a transcript missing from a sample raises KeyError). Nothing is done
    when the directory contains no '*.h5' file. The order of the genes in
    the 'gene_counts' table is not defined. Progress is printed to stdout.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if spikeids is None:
        spikeids = []

    bammed_path = os.path.join(direc_name, bammed_direc)
    aligned_path = os.path.join(direc_name, aligned_direc)
    counted_path = os.path.join(direc_name, counted_direc)

    file_list = os.listdir(aligned_path)
    h5_files = [name for name in file_list if fnmatch.fnmatch(name, r'*.h5')]
    if not h5_files:
        # No samples: there is no transcript index to build the mapping from.
        return

    mouse_genome = pyensembl.Genome(
        reference_name='GRCm38',
        gtf_path_or_url=
        'ftp://ftp.ensembl.org/pub/release-81/gtf/mus_musculus/Mus_musculus.GRCm38.81.gtf.gz',
        transcript_fasta_path_or_url=
        'ftp://ftp.ensembl.org/pub/release-81/fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.all.fa.gz',
        protein_fasta_path_or_url=
        'ftp://ftp.ensembl.org/pub/release-81/fasta/mus_musculus/pep/Mus_musculus.GRCm38.pep.all.fa.gz'
    )

    # Use the last matching file as the reference for the transcript index.
    transcripts, spikeins = load_sequence_counts_kallisto(
        h5file_name=os.path.join(aligned_path, h5_files[-1]),
        spikeids=spikeids)
    transcript_names = transcripts.index

    # Look every transcript up once and reuse the answer for both the list
    # of unique genes and the gene -> transcripts mapping.
    gene_names = []
    for counter, transcript in enumerate(transcript_names, 1):
        print(counter)
        gene_names.append(mouse_genome.gene_name_of_transcript_id(transcript))
    unique_gene_names = list(set(gene_names))

    transcript_dict = {gene: [] for gene in unique_gene_names}
    for transcript, gene_name in zip(transcript_names, gene_names):
        transcript_dict[gene_name].append(transcript)

    for seq_file in file_list:
        is_h5 = fnmatch.fnmatch(seq_file, r'*.h5')
        print('{} {}'.format(seq_file, is_h5))
        if not is_h5:
            continue

        bam_file_sorted_by_loc = seq_file.split('.')[0] + '_sorted_by_location.bam'
        num_mapped, num_unmapped = bam_read_count(
            os.path.join(bammed_path, bam_file_sorted_by_loc))

        transcripts, spikeins = load_sequence_counts_kallisto(
            h5file_name=os.path.join(aligned_path, seq_file),
            spikeids=spikeids)

        quality_control = pd.DataFrame(
            {'num_mapped': num_mapped, 'num_unmapped': num_unmapped},
            index=[0])

        # Create a transcript table indexed by gene name
        counts_list = []
        fpkm_list = []
        tpm_list = []
        length_list = []
        for gene in unique_gene_names:
            transcript_list = transcript_dict[gene]
            counts = 0
            fpkm = 0
            tpm = 0
            mean_eff_length = 0
            for transcript in transcript_list:
                # One row lookup instead of one per column.
                row = transcripts.loc[transcript]
                counts += row['est_counts']
                fpkm += row['fpkm']
                tpm += row['tpm']
                mean_eff_length += row['eff_length'] / len(transcript_list)
            counts_list.append(counts)
            fpkm_list.append(fpkm)
            tpm_list.append(tpm)
            length_list.append(mean_eff_length)

        gene_counts = pd.DataFrame({
            'gene_name': unique_gene_names,
            'est_counts': counts_list,
            'fpkm': fpkm_list,
            'tpm': tpm_list,
            'mean_eff_length': length_list
        })
        gene_counts.set_index('gene_name', inplace=True)

        # Store data frames in HDF5 format
        filename_save = os.path.join(counted_path, seq_file[:-3] + '.h5')
        print('Saving ' + filename_save)
        store = pd.HDFStore(filename_save)
        try:
            store['quality_control'] = quality_control
            store['transcripts'] = transcripts
            store['gene_counts'] = gene_counts
            store['spikeins'] = spikeins
        finally:
            # Close the store even when one of the writes fails.
            store.close()