def store(cursor, in_path, infile): inf = erropen (in_path+"/"+infile, "r") print "storing contents of ", in_path, " file ", infile ct = 0 start = time() for line in inf: ct += 1 if (not ct%1000): print " %5d %8.3f" % (ct, time()-start); start = time() fixed_fields = {} update_fields = {} line = line.rstrip() field = line.split("\t") if len(field) < 4: continue [human_stable_id, cognate_stable_id, species, common_name] = field fixed_fields ['ensembl_gene_id'] = human_stable_id fixed_fields ['species'] = species update_fields['cognate_gene_id'] = cognate_stable_id update_fields['common_name'] = common_name store_or_update (cursor, 'ortholog', fixed_fields, update_fields) inf.close()
def dump_paralogues(species_list, db_info): [local_db, ensembl_db_name, outdir] = db_info db = connect_to_mysql() cursor = db.cursor() for species in species_list: print print "############################" print species qry = "use " + ensembl_db_name[species] search_db(cursor, qry) outfile = "{0}/{1}_para_dump.txt".format(outdir, species) print outfile #continue of = erropen(outfile, "w") if not of: continue if (species == 'homo_sapiens'): gene_ids = get_gene_ids(cursor, biotype='protein_coding', is_known=1) else: gene_ids = get_gene_ids(cursor, biotype='protein_coding') para_table = 'paralogue' ct = 0 seen = [] for gene_id in gene_ids: ct += 1 if not ct % 100: print "\t", species, " ", ct, "out of", len(gene_ids) if gene_id in seen: continue stable_id = gene2stable(cursor, gene_id) paralogues = read_paralogues(cursor, gene_id) if (paralogues): # dump for para in paralogues: print >> of, stable_id, para seen += paralogues of.close() cursor.close() db.close()
def dump_paralogues(species_list, db_info): [local_db, ensembl_db_name, outdir] = db_info db = connect_to_mysql() cursor = db.cursor() for species in species_list: print print "############################" print species qry = "use " + ensembl_db_name[species] search_db(cursor, qry) outfile = "{0}/{1}_para_dump.txt".format(outdir, species) print outfile #continue of = erropen (outfile,"w") if not of: continue if (species=='homo_sapiens'): gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1) else: gene_ids = get_gene_ids (cursor, biotype='protein_coding') para_table = 'paralogue' ct = 0 seen = [] for gene_id in gene_ids: ct += 1 if not ct%100: print "\t", species, " ", ct, "out of", len(gene_ids) if gene_id in seen: continue stable_id = gene2stable(cursor, gene_id) paralogues = read_paralogues(cursor, gene_id) if ( paralogues): # dump for para in paralogues: print >> of, stable_id, para seen += paralogues of.close() cursor.close() db.close()
def store(cursor, in_path, infile): table = 'name_resolution' inf = erropen (in_path+"/"+infile, "r") print "storing contents of ", infile ct = 0 for line in inf: ct += 1 fixed_fields = {} update_fields = {} line = line.rstrip() fields = line.split("\t") #if not ct%100: print ct, fields[0] if 'ENSG' in fields[-1]: ensembl_gene_id = fields[-1] else: continue # check we are tracking that gene (for example, if it is pseudo, we are not) qry = "select count(1) from exon_homo_sapiens where ensembl_gene_id = '%s'" % ensembl_gene_id rows = search_db(cursor, qry) if not rows or not rows[0][0]: continue for field in fields[:-1]: if not field.replace (' ',''): continue fixed_fields ['synonym'] = field.replace("'", "").upper() fixed_fields ['stable_id'] = ensembl_gene_id store_or_update (cursor, table, fixed_fields, update_fields) inf.close()
def dump_orthos (species_list, db_info): [local_db, ensembl_db_name] = db_info db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() # find db ids adn common names for each species db [all_species, ensembl_db_name] = get_species (cursor) # in the afa headers use 'trivial' names for the species: cow, dog, pig, ... trivial_name = translate_to_trivial(cursor, all_species) out_path = cfg.get_path('afs_dumps') outfile = "{0}/orthologue_dump.txt".format(out_path) print outfile of = erropen (outfile,"w") species = 'homo_sapiens' switch_to_db (cursor, ensembl_db_name[species]) qry = "select * from orthologue" rows = search_db (cursor, qry) for row in rows: [pair_id, human_gene_id, cognate_gene_id, genome_db_id, source] = row species = genome_db_id2species (cursor, genome_db_id) switch_to_db (cursor, ensembl_db_name['homo_sapiens']) human_stable_id = gene2stable(cursor, human_gene_id) switch_to_db (cursor, ensembl_db_name[species]) cognate_stable_id = gene2stable(cursor, cognate_gene_id) print >>of, orthos_tabstring ([human_stable_id, cognate_stable_id, species, trivial_name[species]]) of.close() cursor.close() db .close()
def get_theme_ids(cursor, cfg, theme_name): resources = cfg.dir_path['resources'] fnm = resources + '/' + theme_name+'.txt' if not os.path.exists(fnm): print fnm, "not found" exit(1) if not os.path.getsize(fnm) > 0: print fnm, "empty" exit(1) inf = erropen(fnm, "r") gene_ids = [] for line in inf: line.rstrip() [stable_id, name] = line.split("\t") qry = "select gene_id, description from gene where stable_id='%s'"% stable_id rows = search_db (cursor, qry) if not rows: continue gene_ids.append(rows[0][0]) inf.close() return gene_ids
def store(cursor, infile): inf = erropen(infile, "r") total = 0 id_not_found = 0 for line in inf: line.rstrip() total += 1 if not total%1000: print "\t", total if ( len(line.split()) != 2 or not 'ENS' in line): continue [stable_id1, stable_id2] = line.split() fixed_fields = {} update_fields = {} fixed_fields['gene_id1'] = stable_id1 fixed_fields['gene_id2'] = stable_id2 store_or_update (cursor, 'paralog', fixed_fields, update_fields) print "done with ", infile, "total ", total inf.close ()
def store(cursor, infile): inf = erropen(infile, "r") total = 0 id_not_found = 0 for line in inf: line.rstrip() total += 1 if not total % 1000: print "\t", total if (len(line.split()) != 2 or not 'ENS' in line): continue [stable_id1, stable_id2] = line.split() fixed_fields = {} update_fields = {} fixed_fields['gene_id1'] = stable_id1 fixed_fields['gene_id2'] = stable_id2 store_or_update(cursor, 'paralog', fixed_fields, update_fields) print "done with ", infile, "total ", total inf.close()
def main():
    """Write per-gene background data for mutation-significance estimates.

    For every gene returned by get_gene_ids() for the human database
    (protein coding, known, reference only) the coding sequence is stitched
    together from the canonical coding exons, translated and compared with
    the canonical translation.  Genes that pass the comparison get a record
    in 'mut_significance_bg_data.txt' consisting of
      * the stable id and the description,
      * the list of CpG nucleotides with their codon and context,
      * a table of possible silent / nonsense / missense single-nucleotide
        changes per mutation category,
      * the canonical sequence as <amino_acid><position><codon> triples.
    Problems are reported to 'error.log'.

    NOTE(review): 'categories' and 'category_dict' are read as module-level
    names; presumably fill_category() populates them - confirm.
    """
    db = connect_to_mysql()
    acg = AlignmentCommandGenerator()
    cursor = db.cursor()
    # find db ids and common names for each species db
    [all_species, ensembl_db_name] = get_species(cursor)
    logf = erropen("error.log", "w")
    if not logf:
        exit(1)
    outf = erropen("mut_significance_bg_data.txt", "w")
    if not outf:
        exit(1)
    human_db = ensembl_db_name['homo_sapiens']
    switch_to_db(cursor, human_db)
    gene_ids = get_gene_ids(cursor, biotype='protein_coding', is_known=1, ref_only=True)
    # the categories of mutations for which we will be collecting statistics
    fill_category()

    # for each human gene
    for gene_id in gene_ids:
        switch_to_db(cursor, human_db)
        stable_id = gene2stable(cursor, gene_id)
        # find all canonical coding human exons
        # get_canonical_coding_exons also sorts exons by the start in the gene
        canonical_human_exons = get_canonical_coding_exons(cursor, gene_id, human_db)
        # bail out if there is a problem
        if not canonical_human_exons:
            continue

        full_reconstituted_cDNA = ""
        prev_codon_piece_plus_right_flank = ""
        for human_exon in canonical_human_exons:
            [exon_seq_id, pepseq, pepseq_transl_start, pepseq_transl_end,
             left_flank, right_flank, nucseq] = get_exon_seqs(
                 cursor, human_exon.exon_id, human_exon.is_known)
            # add the split codon
            phase = get_exon_phase(cursor, human_exon.exon_id, human_exon.is_known)
            left_flank_plus_codon_piece = left_flank + nucseq[:pepseq_transl_start]
            split_codon = ""
            if phase > 0 and prev_codon_piece_plus_right_flank and left_flank:
                offset = (3 - phase) % 3
                # hedge against the possibility that the translation starts
                # right at the start of the exon, but there is supposed to be a phase
                split_codon = (prev_codon_piece_plus_right_flank[:phase]
                               + left_flank_plus_codon_piece[-offset:])
            full_reconstituted_cDNA += split_codon + nucseq[pepseq_transl_start:pepseq_transl_end]
            prev_codon_piece_plus_right_flank = nucseq[pepseq_transl_end:] + right_flank

        mitochondrial = is_mitochondrial(cursor, gene_id)
        if mitochondrial:
            full_reconstituted_seq = Seq(full_reconstituted_cDNA).translate(
                table="Vertebrate Mitochondrial").tostring()
        else:
            full_reconstituted_seq = Seq(full_reconstituted_cDNA).translate().tostring()

        canonical = get_canonical_transl(acg, cursor, gene_id, 'homo_sapiens', strip_X=False)
        if not canonical or not full_reconstituted_seq:
            # nothing to compare; indexing below would otherwise abort the whole run
            print >> logf, "error", gene_id, stable_id, get_description(cursor, gene_id)
            print >> logf, "empty canonical or reconstituted sequence"
            continue
        if canonical[0] == 'X':
            # apparently a wrong transcript is annotated as canonical
            print >> logf, "warning", gene_id, stable_id, get_description(cursor, gene_id)
            print >> logf, "the deposited canonical sequence starts with X - is there an alternative (?)"
            canonical = canonical[1:]
        if full_reconstituted_seq[-1] == '*' and canonical[-1] != '*':
            canonical += '*'
        if (len(full_reconstituted_seq) != len(canonical)
                or full_reconstituted_seq != canonical):
            if (len(canonical) - len(full_reconstituted_seq) < 3
                    and full_reconstituted_seq in canonical):
                # tolerated: the reconstituted sequence is used as it is
                print >> logf, "warning", gene_id, stable_id, get_description(cursor, gene_id)
                print >> logf, "missing a couple of amino acids in beginning or in the end"
            else:
                print >> logf, "error", gene_id, stable_id, get_description(cursor, gene_id)
                print >> logf, "error reassembling, len(full_reconstituted_seq) != len(canonical) ", len(
                    full_reconstituted_seq), len(canonical)
                print >> logf, "canonical:"
                print >> logf, canonical
                print >> logf, "reconstituted:"
                print >> logf, full_reconstituted_seq
                continue

        # mark the CpG nucleotides: a C followed by G, or a G preceded by C
        cdna_length = len(full_reconstituted_cDNA)
        is_CpG = [False] * cdna_length
        for i in range(cdna_length):
            nt = full_reconstituted_cDNA[i]
            if nt == 'C':
                is_CpG[i] = i + 1 < cdna_length and full_reconstituted_cDNA[i + 1] == 'G'
            elif nt == 'G':
                is_CpG[i] = i > 0 and full_reconstituted_cDNA[i - 1] == 'C'

        # in each category (AT transt, AT transv, CG trans, CG transv, CpG trans,
        # CpG transv) count how many missense, nonsense and silent changes are possible
        codons = map(''.join, zip(*[iter(full_reconstituted_cDNA)] * 3))
        silent = {}
        missense = {}
        nonsense = {}
        for cg in categories:
            silent[cg] = 0
            missense[cg] = 0
            nonsense[cg] = 0
        for i in range(len(codons)):
            codon = codons[i]
            aa = full_reconstituted_seq[i]
            for j in range(3):
                nt_position = i * 3 + j
                nt = full_reconstituted_cDNA[nt_position]
                for new_nt in ['A', 'C', 'T', 'G']:
                    if new_nt == nt:
                        continue
                    mutated_codon = mutate(codon, j, new_nt)
                    if mitochondrial:
                        mutated_aa = Seq(mutated_codon).translate(
                            table="Vertebrate Mitochondrial").tostring()
                    else:
                        mutated_aa = Seq(mutated_codon).translate().tostring()
                    cg = category_dict[codon[j]][new_nt][is_CpG[nt_position]]
                    if not cg or cg not in categories:
                        print >> logf, "category problem in ", gene_id, stable_id, get_description(
                            cursor, gene_id)
                        print >> logf, codon, mutated_codon, j, codon[j], new_nt, is_CpG[nt_position], cg
                        print >> logf, i, j, nt_position, nt
                        print >> logf, aa, mutated_aa
                        continue
                    if mutated_aa == aa:
                        silent[cg] += 1
                    elif mutated_aa == "*":
                        nonsense[cg] += 1
                    else:
                        missense[cg] += 1

        print >> outf, stable_id, get_description(cursor, gene_id)
        print >> outf, "# CpG nucleotides (format: cdna_position|nucleotide|codon|context; )"
        print >> outf, "# ('context' contains one nucleotide before and one after the CpG nucleotide)"
        cpg_entries = []
        for i in range(cdna_length):
            if not is_CpG[i]:
                continue
            context = ""
            if i > 0:
                context += full_reconstituted_cDNA[i - 1]
            context += full_reconstituted_cDNA[i]
            if i < cdna_length - 1:
                context += full_reconstituted_cDNA[i + 1]
            cpg_entries.append("%d|%s|%s|%s;" % (i + 1, full_reconstituted_cDNA[i],
                                                 codons[i // 3], context))
        print >> outf, "".join(cpg_entries)

        print >> outf, "# mutations possible (in principle)"
        print >> outf, "# %10s %5s %5s %5s" % ("category", "silent", "nonsense", "missense")
        for cg in categories:
            print >> outf, "%10s %5d %5d %5d" % (cg, silent[cg], nonsense[cg], missense[cg])

        print >> outf, "# canonical sequence (format: <amino_acid><position_on_peptide_chain><codon>;):"
        print >> outf, "".join(["%s%d%s;" % (full_reconstituted_seq[i], i + 1, codons[i])
                                for i in range(len(codons))])
        print >> outf, stable_id, "done"

    # release everything that was opened above, not just the log
    outf.close()
    logf.close()
    cursor.close()
    db.close()
def multiple_exon_alnmt(gene_list, db_info): print "process pid: %d, length of gene list: %d" % ( get_process_id(), len(gene_list)) [local_db, ensembl_db_name] = db_info db = connect_to_mysql() cfg = ConfigurationReader() acg = AlignmentCommandGenerator() cursor = db.cursor() # find db ids adn common names for each species db [all_species, ensembl_db_name] = get_species (cursor) species = 'homo_sapiens' switch_to_db (cursor, ensembl_db_name[species]) gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1) # for each human gene gene_ct = 0 tot = 0 ok = 0 no_maps = 0 no_pepseq = 0 no_orthologues = 0 min_similarity = cfg.get_value('min_accptbl_exon_sim') #gene_list.reverse() for gene_id in gene_list: start = time() gene_ct += 1 if not gene_ct%10: print gene_ct, "genes out of", len(gene_list) switch_to_db (cursor, ensembl_db_name['homo_sapiens']) print gene_ct, len(gene_ids), gene_id, gene2stable(cursor, gene_id), get_description (cursor, gene_id) human_exons = filter (lambda e: e.is_known==1 and e.is_coding and e.covering_exon<0, gene2exon_list(cursor, gene_id)) human_exons.sort(key=lambda exon: exon.start_in_gene) ################################################################## for human_exon in human_exons: tot += 1 # find all orthologous exons the human exon maps to maps = get_maps(cursor, ensembl_db_name, human_exon.exon_id, human_exon.is_known) if verbose: print "\texon no.", tot, " id", human_exon.exon_id, if not maps: print " no maps" print human_exon print if not maps: no_maps += 1 continue # human sequence to fasta: seqname = "{0}:{1}:{2}".format('homo_sapiens', human_exon.exon_id, human_exon.is_known) switch_to_db (cursor, ensembl_db_name['homo_sapiens']) [exon_seq_id, pepseq, pepseq_transl_start, pepseq_transl_end, left_flank, right_flank, dna_seq] = get_exon_seqs (cursor, human_exon.exon_id, human_exon.is_known) if (not pepseq): if verbose and human_exon.is_coding and human_exon.covering_exon <0: # this should be a master exon print "no 
pep seq for", human_exon.exon_id, "coding ", human_exon.is_coding, print "canonical: ", human_exon.is_canonical print "length of dna ", len(dna_seq) no_pepseq += 1 continue # collect seq from all maps, and output them in fasta format hassw = False headers = [] sequences = {} exons_per_species = {} for map in maps: switch_to_db (cursor, ensembl_db_name[map.species_2]) if map.similarity < min_similarity: continue exon = map2exon(cursor, ensembl_db_name, map) pepseq = get_exon_pepseq (cursor,exon) if (not pepseq): continue if map.source == 'sw_sharp': exon_known_code = 2 hassw = True elif map.source == 'usearch': exon_known_code = 3 hassw = True else: exon_known_code = map.exon_known_2 seqname = "{0}:{1}:{2}".format(map.species_2, map.exon_id_2, exon_known_code) headers.append(seqname) sequences[seqname] = pepseq # for split exon concatenation (see below) if not map.species_2 in exons_per_species.keys(): exons_per_species[map.species_2] = [] exons_per_species[map.species_2].append ([ map.exon_id_2, exon_known_code]); if (len(headers) <=1 ): if verbose: print "single species in the alignment" no_orthologues += 1 continue # concatenate exons from the same gene - the alignment program might go wrong otherwise concatenated = concatenate_exons (cursor, ensembl_db_name, sequences, exons_per_species) fasta_fnm = "{0}/{1}.fa".format( cfg.dir_path['scratch'], human_exon.exon_id) output_fasta (fasta_fnm, sequences.keys(), sequences) # align afa_fnm = "{0}/{1}.afa".format( cfg.dir_path['scratch'], human_exon.exon_id) mafftcmd = acg.generate_mafft_command (fasta_fnm, afa_fnm) ret = commands.getoutput(mafftcmd) if (verbose): print 'almt to', afa_fnm # read in the alignment inf = erropen(afa_fnm, "r") aligned_seqs = {} for record in SeqIO.parse(inf, "fasta"): aligned_seqs[record.id] = str(record.seq) inf.close() # split back the concatenated exons if concatenated: split_concatenated_exons (aligned_seqs, concatenated) human_seq_seen = False for seq_name, sequence in 
aligned_seqs.iteritems(): # if this is one of the concatenated seqs, split them back to two ### store the alignment as bitstring # Generate the bitmap bs = Bits(bin='0b' + re.sub("[^0]","1", sequence.replace('-','0'))) # The returned value of tobytes() will be padded at the end # with between zero and seven 0 bits to make it byte aligned. # I will end up with something that looks like extra alignment gaps, that I'll have to return msa_bitmap = bs.tobytes() # Retrieve information on the cognate cognate_species, cognate_exon_id, cognate_exon_known = seq_name.split(':') if cognate_exon_known == '2': source = 'sw_sharp' elif cognate_exon_known == '3': source = 'usearch' else: source = 'ensembl' if (cognate_species == 'homo_sapiens'): human_seq_seen = True cognate_genome_db_id = species2genome_db_id(cursor, cognate_species) # moves the cursor switch_to_db(cursor, ensembl_db_name['homo_sapiens']) # so move it back to h**o sapiens # Write the bitmap to the database #if (cognate_species == 'homo_sapiens'): if verbose: # and (source=='sw_sharp' or source=='usearch'): print "storing" print human_exon.exon_id, human_exon.is_known print cognate_species, cognate_genome_db_id, cognate_exon_id, cognate_exon_known, source print sequence if not msa_bitmap: print "no msa_bitmap" continue store_or_update(cursor, "exon_map", {"cognate_genome_db_id":cognate_genome_db_id, "cognate_exon_id":cognate_exon_id ,"cognate_exon_known" :cognate_exon_known, "source": source, "exon_id" :human_exon.exon_id, "exon_known":human_exon.is_known}, {"msa_bitstring":MySQLdb.escape_string(msa_bitmap)}) ok += 1 commands.getoutput("rm "+afa_fnm+" "+fasta_fnm) if verbose: print " time: %8.3f\n" % (time()-start); print "tot: ", tot, "ok: ", ok print "no maps ", no_pepseq print "no pepseq ", no_pepseq print "no orthologues ", no_orthologues print
def store(cursor, table, in_path, infile, species): inf = erropen (in_path+"/"+infile, "r") if not inf: exit(1) print "storing contents of ", in_path, " file ", infile ct = 0 start = time() for line in inf: ct += 1 if (not ct%10000): print " %s %5d %8.3f" % (species, ct, time()-start) sys.stdout.flush() start = time() fixed_fields = {} update_fields = {} line = line.rstrip() field = line.split("\t") if len(field) < 18: print "number of fields smaller than expected" continue exon_id = int(field[0]) ensembl_gene_id = field[1] ensembl_exon_id = field[2] start_in_gene = int(field[3]) end_in_gene = int(field[4]) strand = int(field[5]) is_known = int(field[6]) is_coding = int(field[7]) is_canonical = int(field[8]) is_constitutive = int(field[9]) species = field[10] source = field[11] #if source == 'sw_sharp' or source=='usearch': # human_exon = field[12] # protein_seq = field[13] # here I have two fields showing where the peptide translation starts and where it ends # left_flank = field[16] # right_flank = field[17] # dna_seq = field[18] # fixed_fields['maps_to_human_exon_id'] = human_exon #else: protein_seq = field[12] # here I have two fields showing where the peptide translation starts and where it ends left_flank = field[15] right_flank = field[16] dna_seq = field[17] exon_key = ensembl_gene_id + "_" + str(exon_id) + "_" + str(is_known) fixed_fields ['exon_key'] = exon_key update_fields['ensembl_gene_id'] = ensembl_gene_id update_fields['ensembl_exon_id'] = ensembl_exon_id update_fields['start_in_gene'] = start_in_gene update_fields['end_in_gene'] = end_in_gene update_fields['strand'] = strand update_fields['is_known'] = is_known update_fields['is_coding'] = is_coding update_fields['is_canonical'] = is_canonical update_fields['is_constitutive'] = is_constitutive update_fields['species'] = species update_fields['source'] = source update_fields['protein_seq'] = protein_seq update_fields['left_flank'] = left_flank update_fields['right_flank'] = right_flank 
update_fields['dna_seq'] = dna_seq store_or_update (cursor, table, fixed_fields, update_fields) inf.close()
def multiple_exon_alnmt(species_list, db_info): [local_db, ensembl_db_name] = db_info verbose = False db = connect_to_mysql() cfg = ConfigurationReader() acg = AlignmentCommandGenerator() cursor = db.cursor() for species in species_list: print print "############################" print species switch_to_db (cursor, ensembl_db_name[species]) gene_ids = get_gene_ids (cursor, biotype='protein_coding') #gene_ids = get_theme_ids(cursor, cfg, 'wnt_pathway') if not gene_ids: print "no gene_ids" continue gene_ct = 0 tot = 0 ok = 0 no_maps = 0 no_pepseq = 0 no_paralogues = 0 for gene_id in gene_ids: if verbose: start = time() gene_ct += 1 if not gene_ct%100: print species, gene_ct, "genes out of", len(gene_ids) if verbose: print print gene_id, gene2stable(cursor, gene_id), get_description (cursor, gene_id) # get the paralogues - only the representative for the family will have this paralogues = get_paras (cursor, gene_id) if not paralogues: if verbose: print "\t not a template or no paralogues" continue if verbose: print "paralogues: ", paralogues # get _all_ exons template_exons = gene2exon_list(cursor, gene_id) if (not template_exons): if verbose: print 'no exons for ', gene_id continue # find all template exons we are tracking in the database for template_exon in template_exons: if verbose: print template_exon.exon_id maps = get_maps(cursor, ensembl_db_name, template_exon.exon_id, template_exon.is_known, species=species, table='para_exon_map') if not maps: no_maps += 1 continue # output to fasta: seqname = "{0}:{1}:{2}".format('template', template_exon.exon_id, template_exon.is_known) exon_seqs_info = get_exon_seqs (cursor, template_exon.exon_id, template_exon.is_known) if not exon_seqs_info: continue [exon_seq_id, pepseq, pepseq_transl_start, pepseq_transl_end, left_flank, right_flank, dna_seq] = exon_seqs_info if (not pepseq): if ( template_exon.is_coding and template_exon.covering_exon <0): # this should be a master exon print "no pep seq for", template_exon.exon_id, 
"coding ", template_exon.is_coding, print "canonical: ", template_exon.is_canonical print "length of dna ", len(dna_seq) no_pepseq += 1 continue tot += 1 sequences = {seqname:pepseq} headers = [seqname] for map in maps: exon = map2exon(cursor, ensembl_db_name, map, paralogue=True) pepseq = get_exon_pepseq (cursor,exon) if (not pepseq): continue seqname = "{0}:{1}:{2}".format('para', map.exon_id_2, map.exon_known_2) headers.append(seqname) sequences[seqname] = pepseq fasta_fnm = "{0}/{1}_{2}_{3}.fa".format( cfg.dir_path['scratch'], species, template_exon.exon_id, template_exon.is_known) output_fasta (fasta_fnm, headers, sequences) if (len(headers) <=1 ): print "single species in the alignment (?)" no_paralogues += 1 continue # align afa_fnm = "{0}/{1}_{2}_{3}.afa".format( cfg.dir_path['scratch'], species, template_exon.exon_id, template_exon.is_known) mafftcmd = acg.generate_mafft_command (fasta_fnm, afa_fnm) ret = commands.getoutput(mafftcmd) # read in the alignment inf = erropen(afa_fnm, "r") if not inf: print gene_id continue template_seq_seen = False for record in SeqIO.parse(inf, "fasta"): ### store the alignment as bitstring # Generate the bitmap bs = Bits(bin='0b' + re.sub("[^0]","1", str(record.seq).replace('-','0'))) msa_bitmap = bs.tobytes() # Retrieve information on the cognate label, cognate_exon_id, cognate_exon_known = record.id.split(':') if (label == 'template'): template_seq_seen = True # Write the bitmap to the database #print "updating: ", template_exon.exon_id store_or_update(cursor, "para_exon_map", {"cognate_exon_id" :cognate_exon_id, "cognate_exon_known" :cognate_exon_known, "exon_id" :template_exon.exon_id, "exon_known" :template_exon.is_known}, {"msa_bitstring":MySQLdb.escape_string(msa_bitmap)}) inf.close() ok += 1 commands.getoutput("rm "+afa_fnm+" "+fasta_fnm) if verbose: print " time: %8.3f\n" % (time()-start); outstr = species + " done \n" outstr += "tot: %d ok: %d \n" % (tot, ok) outstr += "no maps %d \n" % no_pepseq outstr += "no 
pepseq %d \n" % no_pepseq outstr += "no paralogues %d \n" % no_paralogues outstr += "\n" print outstr
def dump_exons(species_list, db_info): [local_db, ensembl_db_name] = db_info db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() out_path = "{0}/exons".format(cfg.get_path('afs_dumps')) if not os.path.exists(out_path): print out_path, "not found" exit(1) # exit on failed output dir check for species in species_list: #if (not species=='homo_sapiens'): # continue outfile = "{0}/{1}_exon_dump.txt".format(out_path, species) of = erropen(outfile, "w") if not of: continue switch_to_db(cursor, ensembl_db_name[species]) if (species == 'homo_sapiens'): gene_ids = get_gene_ids(cursor, biotype='protein_coding', is_known=1, ref_only=True) else: gene_ids = get_gene_ids(cursor, biotype='protein_coding') source = get_analysis_dict(cursor) ct = 0 for gene_id in gene_ids: ct += 1 if (not ct % 1000): print species, ct, len(gene_ids) # get _all_ exons exons = gene2exon_list(cursor, gene_id) if (not exons): print 'no exons for ', gene_id continue for exon in exons: if exon.covering_exon > 0: continue # exons seqs are its aa translation, left_flank, right_flank, and dna_seq exon_seqs = get_exon_seqs(cursor, exon.exon_id, exon.is_known) if (not exon_seqs): continue # human readable string describing the source of annotation for this exon if exon.is_known == 2: analysis = 'sw_sharp' elif exon.is_known == 3: analysis = 'usearch' else: analysis = source[exon.analysis_id] # the first field return by get_exon_seqs is the exon_seq_id, so get rid of it gene_stable_id = gene2stable(cursor, gene_id) if (exon.is_known == 1): exon_stable_id = exon2stable(cursor, exon.exon_id) elif (exon.is_known == 2): exon_stable_id = 'sw_sharp_' + str(exon.exon_id) elif (exon.is_known == 3): exon_stable_id = 'usearch_' + str(exon.exon_id) else: exon_stable_id = "anon" print >> of, exon_tabstring(exon, gene_stable_id, exon_stable_id, species, analysis, exon_seqs[1:]) of.close() print species, "done" cursor.close() db.close()
def main():
    """Write per-gene background data for mutation-significance estimates.

    For every gene returned by get_gene_ids() for the human database
    (protein coding, known, reference only) the coding sequence is stitched
    together from the canonical coding exons, translated and compared with
    the canonical translation.  Genes that pass the comparison get a record
    in 'mut_significance_bg_data.txt' consisting of
      * the stable id and the description,
      * the list of CpG nucleotides with their codon and context,
      * a table of possible silent / nonsense / missense single-nucleotide
        changes per mutation category,
      * the canonical sequence as <amino_acid><position><codon> triples.
    Problems are reported to 'error.log'.

    NOTE(review): 'categories' and 'category_dict' are read as module-level
    names; presumably fill_category() populates them - confirm.
    """
    db = connect_to_mysql()
    acg = AlignmentCommandGenerator()
    cursor = db.cursor()
    # find db ids and common names for each species db
    [all_species, ensembl_db_name] = get_species(cursor)
    logf = erropen("error.log", "w")
    if not logf:
        exit(1)
    outf = erropen("mut_significance_bg_data.txt", "w")
    if not outf:
        exit(1)
    human_db = ensembl_db_name['homo_sapiens']
    switch_to_db(cursor, human_db)
    gene_ids = get_gene_ids(cursor, biotype='protein_coding', is_known=1, ref_only=True)
    # the categories of mutations for which we will be collecting statistics
    fill_category()

    # for each human gene
    for gene_id in gene_ids:
        switch_to_db(cursor, human_db)
        stable_id = gene2stable(cursor, gene_id)
        # find all canonical coding human exons
        # get_canonical_coding_exons also sorts exons by the start in the gene
        canonical_human_exons = get_canonical_coding_exons(cursor, gene_id, human_db)
        # bail out if there is a problem
        if not canonical_human_exons:
            continue

        full_reconstituted_cDNA = ""
        prev_codon_piece_plus_right_flank = ""
        for human_exon in canonical_human_exons:
            [exon_seq_id, pepseq, pepseq_transl_start, pepseq_transl_end,
             left_flank, right_flank, nucseq] = get_exon_seqs(
                 cursor, human_exon.exon_id, human_exon.is_known)
            # add the split codon
            phase = get_exon_phase(cursor, human_exon.exon_id, human_exon.is_known)
            left_flank_plus_codon_piece = left_flank + nucseq[:pepseq_transl_start]
            split_codon = ""
            if phase > 0 and prev_codon_piece_plus_right_flank and left_flank:
                offset = (3 - phase) % 3
                # hedge against the possibility that the translation starts
                # right at the start of the exon, but there is supposed to be a phase
                split_codon = (prev_codon_piece_plus_right_flank[:phase]
                               + left_flank_plus_codon_piece[-offset:])
            full_reconstituted_cDNA += split_codon + nucseq[pepseq_transl_start:pepseq_transl_end]
            prev_codon_piece_plus_right_flank = nucseq[pepseq_transl_end:] + right_flank

        mitochondrial = is_mitochondrial(cursor, gene_id)
        if mitochondrial:
            full_reconstituted_seq = Seq(full_reconstituted_cDNA).translate(
                table="Vertebrate Mitochondrial").tostring()
        else:
            full_reconstituted_seq = Seq(full_reconstituted_cDNA).translate().tostring()

        canonical = get_canonical_transl(acg, cursor, gene_id, 'homo_sapiens', strip_X=False)
        if not canonical or not full_reconstituted_seq:
            # nothing to compare; indexing below would otherwise abort the whole run
            print >> logf, "error", gene_id, stable_id, get_description(cursor, gene_id)
            print >> logf, "empty canonical or reconstituted sequence"
            continue
        if canonical[0] == 'X':
            # apparently a wrong transcript is annotated as canonical
            print >> logf, "warning", gene_id, stable_id, get_description(cursor, gene_id)
            print >> logf, "the deposited canonical sequence starts with X - is there an alternative (?)"
            canonical = canonical[1:]
        if full_reconstituted_seq[-1] == '*' and canonical[-1] != '*':
            canonical += '*'
        if (len(full_reconstituted_seq) != len(canonical)
                or full_reconstituted_seq != canonical):
            if (len(canonical) - len(full_reconstituted_seq) < 3
                    and full_reconstituted_seq in canonical):
                # tolerated: the reconstituted sequence is used as it is
                print >> logf, "warning", gene_id, stable_id, get_description(cursor, gene_id)
                print >> logf, "missing a couple of amino acids in beginning or in the end"
            else:
                print >> logf, "error", gene_id, stable_id, get_description(cursor, gene_id)
                print >> logf, "error reassembling, len(full_reconstituted_seq) != len(canonical) ", len(
                    full_reconstituted_seq), len(canonical)
                print >> logf, "canonical:"
                print >> logf, canonical
                print >> logf, "reconstituted:"
                print >> logf, full_reconstituted_seq
                continue

        # mark the CpG nucleotides: a C followed by G, or a G preceded by C
        cdna_length = len(full_reconstituted_cDNA)
        is_CpG = [False] * cdna_length
        for i in range(cdna_length):
            nt = full_reconstituted_cDNA[i]
            if nt == 'C':
                is_CpG[i] = i + 1 < cdna_length and full_reconstituted_cDNA[i + 1] == 'G'
            elif nt == 'G':
                is_CpG[i] = i > 0 and full_reconstituted_cDNA[i - 1] == 'C'

        # in each category (AT transt, AT transv, CG trans, CG transv, CpG trans,
        # CpG transv) count how many missense, nonsense and silent changes are possible
        codons = map(''.join, zip(*[iter(full_reconstituted_cDNA)] * 3))
        silent = {}
        missense = {}
        nonsense = {}
        for cg in categories:
            silent[cg] = 0
            missense[cg] = 0
            nonsense[cg] = 0
        for i in range(len(codons)):
            codon = codons[i]
            aa = full_reconstituted_seq[i]
            for j in range(3):
                nt_position = i * 3 + j
                nt = full_reconstituted_cDNA[nt_position]
                for new_nt in ['A', 'C', 'T', 'G']:
                    if new_nt == nt:
                        continue
                    mutated_codon = mutate(codon, j, new_nt)
                    if mitochondrial:
                        mutated_aa = Seq(mutated_codon).translate(
                            table="Vertebrate Mitochondrial").tostring()
                    else:
                        mutated_aa = Seq(mutated_codon).translate().tostring()
                    cg = category_dict[codon[j]][new_nt][is_CpG[nt_position]]
                    if not cg or cg not in categories:
                        print >> logf, "category problem in ", gene_id, stable_id, get_description(
                            cursor, gene_id)
                        print >> logf, codon, mutated_codon, j, codon[j], new_nt, is_CpG[nt_position], cg
                        print >> logf, i, j, nt_position, nt
                        print >> logf, aa, mutated_aa
                        continue
                    if mutated_aa == aa:
                        silent[cg] += 1
                    elif mutated_aa == "*":
                        nonsense[cg] += 1
                    else:
                        missense[cg] += 1

        print >> outf, stable_id, get_description(cursor, gene_id)
        print >> outf, "# CpG nucleotides (format: cdna_position|nucleotide|codon|context; )"
        print >> outf, "# ('context' contains one nucleotide before and one after the CpG nucleotide)"
        cpg_entries = []
        for i in range(cdna_length):
            if not is_CpG[i]:
                continue
            context = ""
            if i > 0:
                context += full_reconstituted_cDNA[i - 1]
            context += full_reconstituted_cDNA[i]
            if i < cdna_length - 1:
                context += full_reconstituted_cDNA[i + 1]
            cpg_entries.append("%d|%s|%s|%s;" % (i + 1, full_reconstituted_cDNA[i],
                                                 codons[i // 3], context))
        print >> outf, "".join(cpg_entries)

        print >> outf, "# mutations possible (in principle)"
        print >> outf, "# %10s %5s %5s %5s" % ("category", "silent", "nonsense", "missense")
        for cg in categories:
            print >> outf, "%10s %5d %5d %5d" % (cg, silent[cg], nonsense[cg], missense[cg])

        print >> outf, "# canonical sequence (format: <amino_acid><position_on_peptide_chain><codon>;):"
        print >> outf, "".join(["%s%d%s;" % (full_reconstituted_seq[i], i + 1, codons[i])
                                for i in range(len(codons))])
        print >> outf, stable_id, "done"

    # release everything that was opened above, not just the log
    outf.close()
    logf.close()
    cursor.close()
    db.close()
def dump_exons (species_list, db_info): [local_db, ensembl_db_name] = db_info db = connect_to_mysql() cfg = ConfigurationReader() cursor = db.cursor() out_path = "{0}/exons".format(cfg.get_path('afs_dumps')) if not os.path.exists(out_path): print out_path, "not found" exit (1) # exit on failed output dir check for species in species_list: #if (not species=='homo_sapiens'): # continue outfile = "{0}/{1}_exon_dump.txt".format(out_path, species) of = erropen (outfile,"w") if not of: continue switch_to_db (cursor, ensembl_db_name[species]) if (species=='homo_sapiens'): gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1, ref_only=True) else: gene_ids = get_gene_ids (cursor, biotype='protein_coding') source = get_analysis_dict(cursor) ct = 0 for gene_id in gene_ids: ct += 1 if (not ct%1000): print species, ct, len(gene_ids) # get _all_ exons exons = gene2exon_list(cursor, gene_id) if (not exons): print 'no exons for ', gene_id continue for exon in exons: if exon.covering_exon > 0: continue # exons seqs are its aa translation, left_flank, right_flank, and dna_seq exon_seqs = get_exon_seqs(cursor, exon.exon_id, exon.is_known) if (not exon_seqs): continue # human readable string describing the source of annotation for this exon if exon.is_known==2: analysis = 'sw_sharp' elif exon.is_known==3: analysis = 'usearch' else: analysis = source[exon.analysis_id] # the first field return by get_exon_seqs is the exon_seq_id, so get rid of it gene_stable_id = gene2stable(cursor,gene_id) if ( exon.is_known == 1): exon_stable_id = exon2stable(cursor,exon.exon_id) elif ( exon.is_known == 2): exon_stable_id = 'sw_sharp_'+str(exon.exon_id) elif ( exon.is_known == 3): exon_stable_id = 'usearch_'+str(exon.exon_id) else: exon_stable_id = "anon" print >> of, exon_tabstring (exon, gene_stable_id, exon_stable_id, species, analysis, exon_seqs[1:]) of.close() print species, "done" cursor.close() db .close()