def create_SQ_header(database_v,species):
    '''
    Build the "@SQ" header lines from the ENSEMBL core database.

    :param database_v: database version (ENSEMBL release); not used for
                       "arabidopsis_thaliana", which is queried at release 30
    :param species: species name with underscores instead of spaces
    :return: list of "@SQ" strings holding name, length, coord_system version
             and species, one for every sequence region of the rank 1
             coordinate system whose name contains no underscore
    '''
    common_name = Species.getCommonName(species.replace('_', ' '))

    # create connection to ensembl database
    if species == "arabidopsis_thaliana":
        # arabidopsis is read from a separate host/port at a fixed release
        # NOTE(review): user="******" looks like a masked value - confirm the
        # intended account name before relying on this branch.
        eg_account = host.HostAccount(host="mysql-eg-publicsql.ebi.ac.uk",
                                      user="******", passwd="", port=4157)
        genome = Genome(Species=common_name, account=eg_account, Release=30)
    else:
        genome = Genome(Species=common_name, Release=database_v, account=None)

    # sequence regions joined with the coordinate system they belong to
    coord_system = genome.CoreDb.getTable('coord_system')
    seq_region = genome.CoreDb.getTable('seq_region')
    columns = [
        seq_region.c.name,
        seq_region.c.length,
        coord_system.c.version,
    ]
    joined = seq_region.join(
        coord_system,
        coord_system.c.coord_system_id == seq_region.c.coord_system_id)
    query = sql.select(columns, from_obj=[joined],
                       whereclause=coord_system.c.rank == 1)

    # names containing an underscore are left out of the header
    return [
        "@SQ\tSN:chr" + str(name) + "\tLN:" + str(length)
        + "\tAS:" + str(version) + "\tSP:" + str(species)
        for name, length, version in query.execute()
        if '_' not in name
    ]
def prepareAnnotationENSEMBL(psm_protein_id,mode,database_v,species,three_frame_translation):
    '''
    Look up the given identifiers in the ENSEMBL core database and pass the
    resulting records on to ensembl_construct_sequences.

    :param psm_protein_id: list of protein IDs (untagged); matched against
                           translation stable IDs in 'protein' mode and
                           against transcript stable IDs in 'transcript' mode
    :param mode: transcript or protein mode ('transcript' / 'protein')
    :param database_v: database version (ENSEMBL release); not used for
                       "arabidopsis_thaliana", which is queried at release 30
    :param species: species name with underscores instead of spaces
    :param three_frame_translation: handed unchanged to
                                    ensembl_construct_sequences
    :return: whatever ensembl_construct_sequences returns for the retrieved
             records (dictionary mapping proteins into ENSEMBL)
    :raises ValueError: if mode is neither 'protein' nor 'transcript'
    '''
    # Fail fast: previously an unknown mode only surfaced as an
    # UnboundLocalError after the database connection had been opened.
    if mode not in ('protein', 'transcript'):
        raise ValueError(
            "mode must be 'protein' or 'transcript', got %r" % (mode,))

    print('Commencing ENSEMBL data retrieval')

    # create connection to ensembl database
    genome_species = Species.getCommonName(species.replace('_', ' '))
    if species == "arabidopsis_thaliana":
        # arabidopsis is read from a separate host/port at a fixed release
        # NOTE(review): user="******" looks like a masked value - confirm the
        # intended account name before relying on this branch.
        account = host.HostAccount(host="mysql-eg-publicsql.ebi.ac.uk",
                                   user="******", passwd="", port=4157)
        ensembl = Genome(Species=genome_species, account=account, Release=30)
    else:
        ensembl = Genome(Species=genome_species, Release=database_v,
                         account=None)

    # convert IDs: translations joined with the transcript they belong to
    translation_table = ensembl.CoreDb.getTable('translation')
    transcript_table = ensembl.CoreDb.getTable('transcript')
    select_obj = [
        transcript_table.c.stable_id,
        translation_table.c.stable_id,
        transcript_table.c.transcript_id,
        translation_table.c.seq_start,
        translation_table.c.start_exon_id,
    ]
    from_obj = translation_table.join(
        transcript_table,
        transcript_table.c.transcript_id == translation_table.c.transcript_id)

    # The mode selects both the column that is filtered on and which selected
    # column (index into the row) keys the result dictionary.
    if mode == 'protein':
        key_index = 1
        id_column = translation_table.c.stable_id
    else:
        key_index = 0
        id_column = transcript_table.c.stable_id
    query = sql.select(select_obj, from_obj=[from_obj],
                       whereclause=id_column.in_(psm_protein_id))

    annotation = {}
    transcript_ids = []
    for row in query.execute():
        transcript_ids.append(row[2])
        annotation[row[key_index]] = {
            'transcript_id': row[0],
            'translation_id': row[1],
            'transcript_seq': '',
            'protein_seq': '',
            'chr': '',
            'strand': '',
            '5UTR_offset': row[3],
            # Holds translation.start_exon_id at this point.
            # NOTE(review): presumably turned into a rank downstream - confirm.
            'start_exon_rank': row[4],
        }

    return ensembl_construct_sequences(annotation, ensembl, transcript_ids,
                                       database_v, species,
                                       three_frame_translation, mode)