def main():
    """Annotate each locus row of a delimited file with overlapping gene names.

    Reads the input file named by ``sys.argv[1]``, treating ``"\\r"`` as the
    record separator, and writes the result to the file named by
    ``sys.argv[2]``.  The first record is copied through unchanged as a
    header.  Every other record is passed to ``strip_values``; when that
    returns the string ``"nothing"`` the record is copied unchanged,
    otherwise the three returned values are used as contig, start position
    and end position for a gene-name lookup, and each gene name found is
    appended to the record after a comma.  A final "Generated on ..." note
    is written after the last record.
    """
    # Fetch before running the program, using the command:
    #   pyensembl install --release <list of Ensembl release numbers> --species <species-name>
    ensembl_num = 81
    gen_ref = EnsemblRelease(ensembl_num)

    # Context managers make sure both handles are closed even when a lookup
    # or a write raises part-way through the file.
    with open(sys.argv[2], "w") as output_file:  # output filename
        with open(sys.argv[1]) as source:
            input_file = source.read().split("\r")

        # str.split always yields at least one element, so the header exists
        # even for an empty input file.
        header, records = input_file[0], input_file[1:]
        output_file.write(header + "\r")

        for line in records:
            parameters = strip_values(line)
            if parameters == "nothing":
                output_file.write(line + "\r")
                continue
            gene_name = gen_ref.gene_names_at_locus(
                contig=parameters[0],
                position=parameters[1],
                end=parameters[2])
            output_file.write(",".join([line] + list(gene_name)) + "\r")

        output_file.write(
            "Generated on " + time.strftime("%m/%d/%Y")
            + " with Ensemble Release " + str(ensembl_num)
            + " and locus_serach.py v1.")
class ScrapeEnsembl():
    """Query an Ensembl release for gene and transcript details.

    ``query`` is stored with every ``"chr"`` substring removed, and
    ``hg_version`` is translated to an Ensembl release number through the
    ``genome`` table before the release object is created.
    """

    # Genome build name -> Ensembl release number.
    genome = {"hg19": 75, "hg38": 83}

    def __init__(self, query, hg_version):
        self.query = query.replace("chr", "")
        # convert to ensembl release
        self.hg_version = ScrapeEnsembl.genome.get(hg_version)
        # convert to ensembl release object
        self.hg = EnsemblRelease(self.hg_version)

    def get_gene_info(self):
        """Get the gene information at a given genomic position.

        Returns a ``(name, id, biotype, location)`` tuple for the first gene
        found at the queried position, a "No gene found ..." message string
        when nothing overlaps it, or ``None`` when the query does not look
        like ``<digits>:<digits>``.
        """
        # Only queries made of digits around ':' are treated as a position.
        has_separator = re.search(r"[-:]", self.query)
        if not has_separator or not self.query.replace(":", "").isdigit():
            return None

        pieces = self.query.split(":")
        chrom = int(pieces[0])
        pos = int(pieces[1])

        names_here = self.hg.gene_names_at_locus(contig=chrom, position=pos)
        if not names_here:
            return " ".join(("No gene found at", self.query,
                             "for genome version", str(self.hg_version)))

        gene = self.hg.genes_by_name(names_here[0])[0]
        # The location is pulled out of the object's string form: last
        # comma-separated field, closing bracket dropped, text after '='.
        last_field = str(gene).split(",")[-1]
        gene_location = last_field[:-1].split("=")[1]
        return (gene.name, gene.id, gene.biotype, gene_location)

    def get_canonical_transcript(self, gene_name):
        """Determine and return the canonical transcript of the given gene.

        The canonical transcript is taken to be the protein-coding
        transcript with the largest stop - start span; ``None`` is returned
        when the gene has no protein-coding transcript.
        """
        transcript_ids = self.hg.transcript_ids_of_gene_name(gene_name)
        records = [self.hg.transcript_by_id(tid) for tid in transcript_ids]

        coding = []
        for record in records:
            # Fields are read positionally from the record's string form.
            fields = re.split(r"[=,]", str(record))
            identifier = fields[1]
            biotype = fields[9]
            span = re.split(r"[:-]", fields[-1][:-1])
            length = int(span[2]) - int(span[1])
            if biotype == "protein_coding":
                coding.append((length, identifier, biotype))

        # the largest protein coding transcript wins
        if coding:
            return max(coding)[1]
        return None
class Ensembl(object):
    """Thin wrapper around an Ensembl release 75 handle for locus lookups."""

    def __init__(self):
        self.db = EnsemblRelease(75)

    def annotate_one_gene(self, location):
        """Return the gene names overlapping a ``contig:start-stop`` string."""
        contig, first, last = self.parse_location(location)
        return self.db.gene_names_at_locus(contig, first, last)

    @staticmethod
    def parse_location(loc):
        """Split ``contig:start-stop`` into ``(contig, int(start), int(stop))``.

        The string must contain exactly one ``-`` and exactly one ``:`` in
        the part before it; otherwise the unpacking raises ``ValueError``.
        """
        region, stop_text = loc.split('-')
        chrom, start_text = region.split(':')
        return chrom, int(start_text), int(stop_text)
# Exploratory summary of the low-coverage gene sets collected for the blood
# and saliva samples.  Single-argument print(...) calls behave the same under
# Python 2 and Python 3.
len(mygenes)
print(mygenes)
print(len(genes_20_blood.intersection(genes_20_saliva)))
print(len(genes_20_blood.union(genes_20_saliva)))
print(list(genes_20_blood) + list(genes_20_saliva))

# Venn diagram of the two gene sets, saved to a fixed path.
from matplotlib_venn import venn2
venn2([genes_20_blood, genes_20_saliva],
      set_labels=('genes_20_blood', 'genes_20_saliva'))
plt.savefig("/mnt/xfs1/home/asalomatov/Blood_vs_Saliva_genes_lt_20.png")
plt.close()

### annotate with genes
from pyensembl import EnsemblRelease
data = EnsemblRelease(75)
data.gene_names_at_locus(contig=1, position=100000)

# axis=1 is required so the lambda receives one row at a time; without it
# apply() hands over whole columns and x['chr'] is not a valid label.
df_exome['gene_name'] = df_exome.apply(
    lambda x: data.gene_names_at_locus(contig=x['chr'], position=x['start']),
    axis=1)

# Same lookup at the interval midpoint.  Floor division keeps the position
# an integer on every Python version (plain "/" already floors integers on
# Python 2, which this script was written for).
x = df_exome.apply(
    lambda x: data.gene_names_at_locus(
        contig=x['chr'], position=(x['start'] + x['end']) // 2),
    axis=1)
x

# Quick inspection of the coverage table.
df_exome.head()
df_exome.tail()
df_exome.shape
df_exome.bin.value_counts()
df_exome.isnull().sum()

# NOTE(review): my_plot is not created anywhere in this fragment and the
# labels look unrelated to the coverage data - confirm this is still wanted.
my_plot.set_xlabel("Customers")
my_plot.set_ylabel("Sales ($)")
x = np.random.normal(size=1000)
def map_the_read(infile, outfile):
    """Annotate mapped reads with gene names and write a labelled copy.

    ``infile`` is read as tab-separated text.  Its last line carries the
    total read count in the second column; every other line is one read
    whose second column is its frequency and whose third column holds the
    mapping as ``contig,position;contig,position``.

    Each read is written to ``outfile`` with its fields joined by spaces
    and a label appended (gene name, non-coding, chimeric, chrM or
    no match), followed by a final ``total reads number`` line.

    Returns:
        ``(seq_lib, num_chimeric, num_no_map, num_chrM, total_reads)`` where
        ``seq_lib`` maps ``(contig, gene)`` to a list of
        ``(read_number, frequency, position)`` tuples.
    """
    # rstrip("\n") instead of x[:-1]: a final line without a trailing
    # newline must not lose the last digit of the read count.
    with open(infile, "r") as handle:
        rows = [raw.rstrip("\n").split("\t") for raw in handle]
    total_reads = int(rows[-1][1])
    rows = rows[:-1]

    # import the ensembl gene annotation
    genome = EnsemblRelease(75)

    # parse the file
    seq_lib = {}
    num_chimeric = 0  # chimeric alignment
    # This counter must exist before the loop: it is incremented below and
    # always returned, so leaving it undefined raises UnboundLocalError.
    num_no_map = 0  # seqs without mapping info
    num_chrM = 0  # read that mapped to the mitochondria
    l_out = []

    for num, seq in enumerate(rows, 1):
        frequency = seq[1]
        joined = ' '.join(seq)
        try:
            lsub = [part.split(",") for part in seq[2].split(";")]
            first_contig = lsub[0][0]
            # NOTE(review): the literal below includes " * ", unlike the bare
            # "*" test that follows - confirm the contig really looks like this.
            if first_contig == " * chrM":
                num_chrM += 1
                l_out.append(joined + "chrM\n")
            elif first_contig == "*":
                num_no_map += 1
                l_out.append(joined + " * no match\n")
            elif first_contig == lsub[1][0]:
                contig = first_contig[3:]  # drop the leading "chr"
                seq_start = lsub[0][1]
                seq_end = lsub[1][1]
                # use the average position for annotation
                seq_position = (int(seq_start) + int(seq_end)) / 2
                gene_name = genome.gene_names_at_locus(
                    contig=contig, position=seq_position)
                if gene_name == []:
                    # the sequence maps to the genome but not to a gene
                    gene = "non-coding"
                    l_out.append(joined + " non-coding\n")
                else:
                    # seq that can be mapped to the gene
                    gene = gene_name[0]
                    l_out.append(joined + "%s\n" % gene)
                seq_lib.setdefault((contig, gene), []).append(
                    (num, frequency, seq_position))
            else:
                num_chimeric += 1
                l_out.append(joined + " * chimeric\n")
        except IndexError:
            # Missing mapping column or fewer than two alignments.
            num_no_map += 1
            l_out.append(joined + " * no match\n")

    # write the file out
    with open(outfile, "w") as out:
        out.writelines(l_out)
        out.write("total reads number: %d" % total_reads)

    return seq_lib, num_chimeric, num_no_map, num_chrM, total_reads
prot = df_descr['protocol'][df_descr.smpl_id.isin([smpl])].iloc[0] print prot # desc = '_'.join([fam, rel]) # print desc fname='/mnt/xfs1/scratch/asalomatov/BloodVsSaliva/output/'+smpl+'-D1.final-exome-cvrg.txt' if not os.path.isfile(fname): print fname continue df_exome = pd.read_csv(fname, sep="\t", header=None) df_exome = df_exome[df_exome[0] != 'all'] df_exome.columns = ['chr', 'start', 'end', 'bin', 'length', 'ex_length', 'perc'] df_exome['w'] = df_exome['perc']*df_exome['bin'] df_exome['chr_start'] = df_exome['chr'].astype(str)+'_'+df_exome['start'].astype(str) df_ex_sum = df_exome.groupby(['chr_start']).w.sum() c1 = df_ex_sum < 20 mygenes = set() for i, n in df_ex_sum[c1].iteritems(): cont, sta = i.split("_") g_list = data.gene_names_at_locus(contig=cont, position=int(sta)) for g in g_list: mygenes.add(g) if prot == 'blood': print 'adding to blood' genes_20_blood = genes_20_blood.union(mygenes) else: print 'adding to saliva' genes_20_saliva = genes_20_saliva.union(mygenes) print len(genes_20_blood), len(genes_20_saliva)
def gene_names_at_locus(*args, **kwargs):
    """Look up gene names at a locus in the configured Ensembl release.

    A release object is built from ``ENSEMBL_RELEASE_VERSION`` on every
    call, and all positional and keyword arguments are forwarded unchanged
    to its ``gene_names_at_locus`` method, whose result is returned.
    """
    return EnsemblRelease(ENSEMBL_RELEASE_VERSION).gene_names_at_locus(
        *args, **kwargs)
from pyensembl import EnsemblRelease # release 77 uses human reference genome GRCh38 data = EnsemblRelease(77) # will return ['HLA-A'] # get all exons associated with HLA-A # exon_ids = data .exon_ids_of_gene_name('HLA-A')ct.download() data.download() data.index() gene_names = data.gene_names_at_locus(contig=6, position=29945884) print(2)
# Collect every SNP id listed in the switch/error reports found under the
# current chromosome directory, then write one line per SNP with the names
# and ids of the genes at its position.
print(chr)

# Columns whose comma-separated SNP ids are gathered, in the order read.
snp_columns = ('overlapping_snps', 'snps_only_chunkVCF', 'snps_only_refVCF')

for f in glob.glob(chr+'/*.switchAndError.txt'):
    with open(f) as input_file:
        # The first line is the header; list_index turns it into a
        # name -> column position lookup.
        header_index = list_index(input_file.readline().strip().split('\t'))
        for line in input_file:
            line = line.strip().split('\t')
            for column in snp_columns:
                for snp in line[header_index[column]].split(','):
                    # skip blank entries
                    if snp.strip():
                        snps.add(snp)

with open('snp_gene_locations.txt','w') as out:
    number_of_snps = len(snps)
    for index, snp in enumerate(snps):
        # progress report every 5000 SNPs
        if index % 5000 == 0:
            print('%s/%s' % (index, number_of_snps))
        # The position is the second "_"-separated field of the SNP id.
        gene_names = data.gene_names_at_locus(
            contig=chr.replace("chr",""),
            position=int(snp.split('_')[1]))
        gene_ids = [
            gene_id
            for gene_name in gene_names
            for gene_id in data.gene_ids_of_gene_name(gene_name)
        ]
        out.write('\t'.join((snp, ','.join(gene_names), ','.join(gene_ids))) + '\n')
'X': [11740000],
'20': [],
'16': [],
'17': [],
'7': [47870000, 54800000, 55230000, 55240000, 55250000],
'22': [29070000],
'4': [60000, 191040000],
'21': [],
'12': [9730000, 22320000, 28560000],
'15': [35410000, 41860000],
'8': [3620000, 18770000, 70690000, 128750000, 132260000, 135090000]
}

# NOTE(review): the literal above starts outside this fragment; presumably it
# is intervals_dict (chromosome -> list of positions) - confirm.

# For every chromosome, collect the distinct gene names found in the 10,000
# positions leading up to each listed position.
for chromosome in intervals_dict:
    genes_dict.update({chromosome: []})
    for interval in intervals_dict[chromosome]:
        # The queried range is [interval - 10000, interval].
        genes = genome.gene_names_at_locus(contig=chromosome, position=interval - 10000, end=interval)
        if len(genes) > 0:
            for gene in genes:
                # keep each gene name once per chromosome, in first-seen order
                if str(gene) not in genes_dict[chromosome]:
                    genes_dict[chromosome].append(str(gene))

# Print the genes found, skipping chromosomes with none.
for key in genes_dict:
    if len(genes_dict[key]):
        print('---------------------------')
        print('Genes in chromosome', key)
        for item in genes_dict[key]:
            print(item)
# Build the 'Output' sheet: one row per input coordinate, holding the contig,
# the position and every gene name found at that locus.
outputdatasheet = outputfile.create_sheet(title='Output')
header = ["Chr ID", "Chr Pos", "Gene 1", "Gene 2", "Gene 3", "Gene 4"]
outputdatasheet.append(header)

# The annotation handle does not depend on the row, so it is created once
# here instead of once per row inside the loop.
data = EnsemblRelease(75)

for row in datasheet.iter_rows('A2:B%d' % rowcount):
    coord = [0, 0]
    # Parse cells within row (Make list with contig, Chrom. position)
    for cell in row:
        if cell.value:
            # Column A fills coord[0] and column B fills coord[1].
            coordidx = cell.column - 1
            coord[coordidx] = cell.value
        else:
            # The row is still looked up below, with 0 left in the empty slot.
            print('Error: Row', row, 'contains invalid coordinate.')
    # Look up Gene Name
    gene_name = data.gene_names_at_locus(contig=coord[0], position=coord[1])
    coord = coord + gene_name
    outputdatasheet.append(coord)

# Save the output file.
outputfile.save(outputfilename)
print()
print('Wrote data to file', outputfilename)
print()