예제 #1
0
def main():
	"""Append the gene names found at each locus to every record of a file.

	The input path is taken from ``sys.argv[1]`` and the output path from
	``sys.argv[2]``. Records are separated by carriage returns. The first
	record is copied through unchanged; every later record is handed to
	``strip_values`` and, unless that returns the string "nothing", the gene
	names reported by Ensembl for the returned (contig, position, end) are
	appended to the record as extra comma-separated fields. A footer with
	the generation date and Ensembl release is written last.
	"""
	ensembl_num = 81  # fetch before running program using command: pyensembl install --release <list of Ensembl release numbers> --species <species-name>
	gen_ref = EnsemblRelease(ensembl_num)

	# Read the whole input first and close it immediately; the output file is
	# only created once the input has been read successfully.
	# NOTE(review): under Python 3 text mode, lone carriage returns are
	# translated to "\n" on read, so this split would not separate records
	# there - confirm the target interpreter.
	with open(sys.argv[1]) as input_file:
		records = input_file.read().split("\r")

	# The context manager closes the output even if a lookup raises.
	with open(sys.argv[2], "w") as output_file:
		# str.split always yields at least one element: the header record.
		output_file.write(records[0] + "\r")
		for line in records[1:]:
			parameters = strip_values(line)
			if parameters != "nothing":
				gene_names = gen_ref.gene_names_at_locus(contig=parameters[0], position=parameters[1], end=parameters[2])
				line = ",".join([line] + list(gene_names))
			output_file.write(line + "\r")

		# Footer text is kept verbatim (including its spelling) in case
		# downstream tooling matches on it.
		output_file.write("Generated on " + time.strftime("%m/%d/%Y") + " with Ensemble Release " + str(ensembl_num) + " and locus_serach.py v1." )
예제 #2
0
class ScrapeEnsembl():
    ''' Look up gene and transcript details in a local Ensembl release.

    query      -- position string such as "chr17:7579472"; any "chr" text is
                  removed before use
    hg_version -- genome build name, one of the keys of ``genome``
    '''

    # Genome build name -> Ensembl release number used for that build.
    genome = {"hg19": 75, "hg38": 83}

    def __init__(self, query, hg_version):
        ''' Store the normalised query and open the matching Ensembl release.

        Raises ValueError when hg_version is not a key of ``genome``.
        '''
        self.query = query.replace("chr","")
        release = ScrapeEnsembl.genome.get(hg_version)
        if release is None:
            # Fail here with a clear message instead of handing None on to
            # EnsemblRelease.
            raise ValueError("Unknown genome version {!r}; expected one of {}".format(
                hg_version, sorted(ScrapeEnsembl.genome)))
        self.hg_version = release # ensembl release number
        self.hg = EnsemblRelease(release) # ensembl release object

    def get_gene_info(self):
        ''' Get the gene information at a given genomic position

        Only queries of the form "<digits>:<digits>" are handled; anything
        else (ranges, non-numeric chromosomes) returns None.

        Returns a tuple (name, id, biotype, "contig:start-end") for the first
        gene reported at the position, or a message string when no gene is
        found there.
        '''
        is_position = (re.search(r"[-:]", self.query)
                       and self.query.replace(":","").isdigit())
        if not is_position:
            return None

        parts = self.query.split(":")
        chrom = int(parts[0])
        pos = int(parts[1])
        gene_name = self.hg.gene_names_at_locus(contig=chrom, position=pos)
        if not gene_name:
            msg = " ".join(("No gene found at",self.query,"for genome version",
                            str(self.hg_version)))
            return msg

        gene = self.hg.genes_by_name(gene_name[0])[0]
        # Build the location from the locus attributes rather than slicing it
        # out of str(gene), whose layout is not a stable interface.
        gene_location = "{}:{}-{}".format(gene.contig, gene.start, gene.end)

        return (gene.name, gene.id, gene.biotype, gene_location)

    def get_canonical_transcript(self, gene_name):
        ''' Determine and return the canonical transcript of the given gene

        The canonical transcript is taken to be the protein coding transcript
        with the largest end - start span (ties broken by the greater
        transcript id). Returns its id, or None when the gene has no protein
        coding transcript.
        '''
        protein_coding_transcripts = []
        for transcript_id in self.hg.transcript_ids_of_gene_name(gene_name):
            transcript = self.hg.transcript_by_id(transcript_id)
            # Read biotype and coordinates from attributes instead of fixed
            # positions in the transcript's string form.
            if transcript.biotype == "protein_coding":
                size = transcript.end - transcript.start
                protein_coding_transcripts.append((size, transcript_id))

        if protein_coding_transcripts:
            return max(protein_coding_transcripts)[1]
        return None
예제 #3
0
class Ensembl(object):
    """Gene-name lookups against Ensembl release 75."""

    def __init__(self):
        self.db = EnsemblRelease(75)

    def annotate_one_gene(self, location):
        """Return the gene names overlapping a ``chrom:start-stop`` string."""
        return self.db.gene_names_at_locus(*self.parse_location(location))

    @staticmethod
    def parse_location(loc):
        """Split ``chrom:start-stop`` into ``(chrom, int(start), int(stop))``.

        Raises ValueError when the string does not contain exactly one "-"
        and exactly one ":" in the part before it, or when a coordinate is
        not an integer.
        """
        left, right = loc.split('-')
        chrom, begin = left.split(':')
        return chrom, int(begin), int(right)
예제 #4
0
# Inspect the collected gene sets. Single-argument print calls are written
# with parentheses so they behave the same under Python 2 and Python 3.
len(mygenes)
print(mygenes)
print(len(genes_20_blood.intersection(genes_20_saliva)))
print(len(genes_20_blood.union(genes_20_saliva)))
print(list(genes_20_blood) + list(genes_20_saliva))
from matplotlib_venn import venn2
# Venn diagram of the two gene sets, saved to disk.
venn2([genes_20_blood, genes_20_saliva], set_labels=('genes_20_blood', 'genes_20_saliva') )
plt.savefig("/mnt/xfs1/home/asalomatov/Blood_vs_Saliva_genes_lt_20.png")
plt.close()


### annotate with genes
from pyensembl import EnsemblRelease
data = EnsemblRelease(75)
data.gene_names_at_locus(contig=1, position=100000)
# axis=1 makes apply pass one row at a time; with the default axis the lambda
# receives whole columns and the x['chr'] / x['start'] lookups fail.
df_exome['gene_name'] = df_exome.apply(lambda x: data.gene_names_at_locus(contig=x['chr'], position=x['start']),
        axis=1)
# Same lookup at the interval midpoint; // keeps the position an integer on
# both Python 2 and Python 3.
x = df_exome.apply(lambda x: data.gene_names_at_locus(contig=x['chr'], position=(x['start'] + x['end'])//2),
        axis=1)
x

# Quick sanity checks on the annotated frame.
df_exome.head()
df_exome.tail()
df_exome.shape
df_exome.bin.value_counts()
df_exome.isnull().sum()

my_plot.set_xlabel("Customers")
my_plot.set_ylabel("Sales ($)")

x = np.random.normal(size=1000)
예제 #5
0
def map_the_read(infile, outfile):
    """Annotate mapped reads with gene names and write the annotated table.

    Every line of ``infile`` is tab-separated. The last line carries the
    total read count in its second column. Every other line is one read: its
    second column is stored as the read's frequency and its third column
    holds ";"-separated alignments, each a ","-separated record whose first
    field is the contig and whose second field is a position.

    One line per read is written to ``outfile`` (the read's columns joined
    by spaces plus an annotation suffix), followed by a
    "total reads number" footer.

    Args:
        infile: path of the tab-separated input table.
        outfile: path of the annotated output file (overwritten).

    Returns:
        Tuple ``(seq_lib, num_chimeric, num_no_map, num_chrM, total_reads)``
        where ``seq_lib`` maps ``(contig, gene)`` to a list of
        ``(read number, frequency, position)`` tuples.

    Raises:
        IndexError: if the file is empty or a line has fewer than two columns.
        ValueError: if the total read count or a position is not an integer.
    """
    with open(infile, "r") as handle:
        # Strip only the line terminator: slicing off the last character
        # unconditionally eats a digit of the total when the file does not
        # end with a newline.
        rows = [line.rstrip("\n").split("\t") for line in handle]
    total_reads = int(rows[-1][1])
    rows = rows[:-1]
    # import the ensembl gene annotation
    genome = EnsemblRelease(75)
    # parse the file
    seq_lib = {}
    num_chimeric = 0  # reads whose first two alignments name different contigs
    num_no_map = 0  # reads without usable mapping info
    num_chrM = 0  # reads whose first alignment field is " * chrM"
    l_out = []
    for num, seq in enumerate(rows, 1):
        frequency = seq[1]
        prefix = ' '.join(seq)
        try:
            lsub = [x.split(",") for x in seq[2].split(";")]
            first_contig = lsub[0][0]
            if first_contig == " * chrM":
                num_chrM += 1
                # NOTE(review): no separator before this suffix, unlike the
                # other annotations; kept as-is for output compatibility.
                l_out.append(prefix + "chrM\n")
            elif first_contig == "*":
                num_no_map += 1
                l_out.append(prefix + " * no match\n")
            elif first_contig != lsub[1][0]:
                num_chimeric += 1
                l_out.append(prefix + " * chimeric\n")
            else:
                # Drop the first three characters of the contig name
                # (presumably a "chr" prefix - confirm against the input).
                contig = first_contig[3:]
                seq_start = lsub[0][1]
                seq_end = lsub[1][1]
                # Use the average position for annotation; // keeps it an
                # integer on Python 3 as well as Python 2.
                seq_position = (int(seq_start) + int(seq_end)) // 2
                gene_name = genome.gene_names_at_locus(
                    contig=contig, position=seq_position)
                if not gene_name:
                    # No gene overlaps the position.
                    gene = "non-coding"
                    l_out.append(prefix + " non-coding\n")
                else:
                    # Annotate with the first gene reported at the position.
                    # NOTE(review): the gene name is appended without a
                    # separator; kept as-is for output compatibility.
                    gene = gene_name[0]
                    l_out.append(prefix + "%s\n" % gene)
                seq_lib.setdefault((contig, gene), []).append(
                    (num, frequency, seq_position))
        except IndexError:
            # Missing third column, fewer than two alignments, or an
            # alignment without a position field.
            num_no_map += 1
            l_out.append(prefix + " * no match\n")
    # write the file out
    with open(outfile, "w") as out:
        out.writelines(l_out)
        out.write("total reads number: %d" % total_reads)

    return seq_lib, num_chimeric, num_no_map, num_chrM, total_reads
예제 #6
0
    # Body of a per-sample loop (the loop header is outside this excerpt).
    # Protocol of the first df_descr row whose smpl_id matches this sample.
    prot = df_descr['protocol'][df_descr.smpl_id.isin([smpl])].iloc[0]
    print prot
   # desc = '_'.join([fam, rel])
   # print desc
    fname='/mnt/xfs1/scratch/asalomatov/BloodVsSaliva/output/'+smpl+'-D1.final-exome-cvrg.txt'
    # Skip samples whose coverage file does not exist, printing the missing path.
    if not os.path.isfile(fname):
        print fname
        continue
    df_exome = pd.read_csv(fname, sep="\t", header=None) 
    # Drop the rows whose first column is the literal 'all'.
    df_exome = df_exome[df_exome[0] != 'all']
    df_exome.columns = ['chr', 'start', 'end', 'bin', 'length', 'ex_length', 'perc']
    # Weight each bin by its percentage and sum per (chr, start) key
    # (presumably the mean coverage of that interval - TODO confirm).
    df_exome['w'] = df_exome['perc']*df_exome['bin']
    df_exome['chr_start'] = df_exome['chr'].astype(str)+'_'+df_exome['start'].astype(str)
    df_ex_sum = df_exome.groupby(['chr_start']).w.sum()
    # Keep only the keys whose summed weight is below 20.
    c1 = df_ex_sum < 20
    mygenes = set()
    for i, n in df_ex_sum[c1].iteritems():
        # NOTE(review): the two-value unpacking fails for contig names that
        # themselves contain "_".
        cont, sta = i.split("_")
        g_list = data.gene_names_at_locus(contig=cont, position=int(sta))
        for g in g_list:
            mygenes.add(g)
    # Merge this sample's genes into the set for its protocol; any protocol
    # other than 'blood' is counted as saliva.
    if prot == 'blood':
        print 'adding to blood'
        genes_20_blood = genes_20_blood.union(mygenes)
    else:
        print 'adding to saliva'
        genes_20_saliva = genes_20_saliva.union(mygenes)
    print len(genes_20_blood), len(genes_20_saliva)


예제 #7
0
def gene_names_at_locus(*args, **kwargs):
    """Run ``gene_names_at_locus`` on a fresh ``EnsemblRelease``.

    A new ``EnsemblRelease(ENSEMBL_RELEASE_VERSION)`` is built on every call;
    all positional and keyword arguments are forwarded unchanged and the
    result is returned as-is.
    """
    return EnsemblRelease(ENSEMBL_RELEASE_VERSION).gene_names_at_locus(*args, **kwargs)
예제 #8
0
파일: test.py 프로젝트: aquaflakes/tfTest
from pyensembl import EnsemblRelease

# release 77 uses human reference genome GRCh38
data = EnsemblRelease(77)

# Disabled example: get all exons associated with HLA-A
# exon_ids = data.exon_ids_of_gene_name('HLA-A')

# download() and index() run before the first query (presumably to fetch the
# release data and build the local index - see the pyensembl documentation).
data.download()
data.index()
# Gene names at position 29945884 on contig 6; will return ['HLA-A']
gene_names = data.gene_names_at_locus(contig=6, position=29945884)

print(2)
    print(chr)
    # Every *.switchAndError.txt table under this directory is tab-separated
    # with a header row; three of its columns hold comma-separated SNP ids.
    snp_columns = ('overlapping_snps', 'snps_only_chunkVCF', 'snps_only_refVCF')
    for f in glob.glob(chr+'/*.switchAndError.txt'):
        with open(f) as input_file:
            # list_index is defined elsewhere; it is indexed by column name
            # below (presumably it maps header names to column positions).
            header_index = list_index(input_file.readline().strip().split('\t'))
            for line in input_file:
                line = line.strip().split('\t')
                for column in snp_columns:
                    for snp in line[header_index[column]].split(','):
                        # Ignore empty / whitespace-only entries.
                        if snp.strip():
                            snps.add(snp)

# Write one tab-separated line per SNP: the SNP id, the comma-joined gene
# names found at its position, and the comma-joined ids of those genes.
with open('snp_gene_locations.txt','w') as out:
    number_of_snps = len(snps)
    for index, snp in enumerate(snps):
        # Progress report every 5000 SNPs.
        if not index % 5000:
            print(str(index)+'/'+str(number_of_snps))
        # NOTE(review): the contig comes from `chr`, which is set outside this
        # block, not from the SNP id itself - confirm every SNP in `snps`
        # lies on that contig.
        contig = chr.replace("chr","")
        # The second "_"-separated field of the SNP id is used as the position.
        position = int(snp.split('_')[1])
        gene_names = data.gene_names_at_locus(contig=contig, position=position)
        gene_ids = []
        for gene_name in gene_names:
            gene_ids.extend(data.gene_ids_of_gene_name(gene_name))
        out.write('\t'.join((snp, ','.join(gene_names), ','.join(gene_ids))) + '\n')
예제 #10
0
    'X': [11740000],
    '20': [],
    '16': [],
    '17': [],
    '7': [47870000, 54800000, 55230000, 55240000, 55250000],
    '22': [29070000],
    '4': [60000, 191040000],
    '21': [],
    '12': [9730000, 22320000, 28560000],
    '15': [35410000, 41860000],
    '8': [3620000, 18770000, 70690000, 128750000, 132260000, 135090000]
}

# For every chromosome, gather the distinct gene names (in first-seen order)
# overlapping the 10,000-base window that ends at each listed position.
for chromosome in intervals_dict:
    found = []
    genes_dict[chromosome] = found
    for interval in intervals_dict[chromosome]:
        genes = genome.gene_names_at_locus(contig=chromosome,
                                           position=interval - 10000,
                                           end=interval)
        for gene in genes:
            label = str(gene)
            if label not in found:
                found.append(label)

# Report the chromosomes that had at least one gene.
for key, gene_labels in genes_dict.items():
    if not gene_labels:
        continue
    print('---------------------------')
    print('Genes in chromosome', key)
    for item in gene_labels:
        print(item)
예제 #11
0
outputdatasheet = outputfile.create_sheet(title='Output')

header = ["Chr ID", "Chr Pos", "Gene 1", "Gene 2", "Gene 3", "Gene 4"]
outputdatasheet.append(header)

# Open the annotation release once; it does not depend on the row, so there
# is no reason to rebuild it for every spreadsheet line.
data = EnsemblRelease(75)

for row in datasheet.iter_rows('A2:B%d' % rowcount):
    coord = [0, 0]
    row_is_valid = True

    # Parse cells within row (make list with contig, chrom. position)
    for cell in row:
        if cell.value:
            # Column A fills slot 0 (contig), column B fills slot 1 (position).
            coord[cell.column - 1] = cell.value
        else:
            row_is_valid = False
            print('Error: Row', row, 'contains invalid coordinate.')

    # Look up gene names only for complete rows. A row already reported as
    # invalid still holds a placeholder 0, which is not a real coordinate
    # (pyensembl may reject contig 0 outright - TODO confirm); it is written
    # out without gene columns so the output stays aligned with the input.
    if row_is_valid:
        gene_name = data.gene_names_at_locus(contig=coord[0], position=coord[1])
    else:
        gene_name = []

    coord = coord + gene_name
    outputdatasheet.append(coord)

# Save the output file.
outputfile.save(outputfilename)
print()
print('Wrote data to file', outputfilename)
print()