Пример #1
0
def translate_seq(sequence, strand):
    '''
    Translate a DNA sequence, taking the strand it lies on into account.

    :param sequence: DNA sequence
    :param strand: strand of the DNA sequence; when its string form is "1"
                   the sequence is translated as given, for any other value
                   the reverse complement is translated instead
    :return: translated DNA sequence (result of standard_code.translate)
    '''
    use_as_given = str(strand) == "1"
    template = sequence if use_as_given else reverse_complement(sequence)
    return standard_code.translate(template)
Пример #2
0
def ensembl_construct_sequences(psm_hash,ensembl,transcript_ids,database_v,species,three_frame_translation,mode,):
    '''
    :param psm_hash: dictionair with protein / ensembl information ( see prepareAnnotationENSEMBL)
    :param ensembl:ensembl genome
    :param transcript_ids: list of transcrip ids (converted from protein IDs)
    :param database_v: database version
    :param species: species name
    :return: dictionairy mapping proteins into ENSEMBL
    '''
    print "Commencing transcript and protein sequence retrieval"

    no_protein_seq=[]
    biomart_key_hash={}
    stable_transcript_ids=[]

    for key in psm_hash.keys():
        biomart_key_hash[psm_hash[key]['transcript_id']]=key
        stable_transcript_ids.append(psm_hash[key]['transcript_id'])

    # Retrieve cds,chr,transcript_id and strand from biomart
    biomart_result=proBAM_biomart.retrieve_data_from_biomart(database_v,species,stable_transcript_ids,three_frame_translation)
    for row in biomart_result:
        row=row.split("\t")
        try:
            psm_hash[biomart_key_hash[row[1]]]['transcript_seq']=row[0]
            psm_hash[biomart_key_hash[row[1]]]['protein_seq']=standard_code.translate(row[0])
            #TODO what to do with "special" ensembl chromosomes: currently leave them out => bam conversion
            #TODO considers these psms unmapped
            #if "_" in row[2]:
            #    print row[1],row[2]
            psm_hash[biomart_key_hash[row[1]]]['chr']=row[2]
            psm_hash[biomart_key_hash[row[1]]]['strand']=row[3]
            del row
        except IndexError:
            pass
    del biomart_result

    # get exons directly from core database
    temp_exon_hash=get_ensembl_exons(ensembl,transcript_ids,psm_hash,mode)
    exon_hash=temp_exon_hash[0]
    psm_hash=temp_exon_hash[1]
    del temp_exon_hash
    # retrieve protein sequences for transcript where the protein sequence could not be fetched automatically
    for key in no_protein_seq:
        psm_hash[key]['transcript_seq']=retrieve_protein_seq(psm_hash[key]['transcript_seq'],
                                                                   exon_hash[psm_hash[key]['transcript_id']],
                                                                   psm_hash[key]['5UTR_offset'],
                                                                   psm_hash[key]['start_exon_rank'])
        #translate till stop codon
        psm_hash[key]['protein_seq']=standard_code.translate(psm_hash[key]
                                                                   ['transcript_seq']).partition('*')[0]
    return [psm_hash,exon_hash]
Пример #3
0
def map_peptide_to_protein_3frame(peptide_seq, transcript_seq,
                                  allowed_mismatches, strand):
    '''
    Search a peptide in the translations of a transcript read at nucleotide
    offsets 0, 1 and 2.

    :param peptide_seq: peptide sequence (string)
    :param transcript_seq: transcript sequence (string)
    :param allowed_mismatches: number of allowed mismatches
    :param strand: chromosome strand (not used inside this function)
    :return: list [hits, pre_post_aa]; hits contains one
             [position, mismatches] pair per match, where position is
             (amino acid index * 3) + frame offset; pre_post_aa contains the
             residues before and after the last match found and stays
             ['', ''] when there is no match
    '''
    hits = []
    pre_post_aa = ['', '']
    pep_length = len(peptide_seq)
    frames = [
        standard_code.translate(transcript_seq),
        standard_code.translate(transcript_seq[1:]),
        standard_code.translate(transcript_seq[2:]),
    ]
    # frame_offset (0, 1, 2) is the number of nucleotides skipped before
    # translation started; it shifts the reported hit position
    for frame_offset, f in enumerate(frames):
        # NOTE(review): the range stops before the window that ends exactly
        # at the end of the frame, so that window is never compared --
        # confirm this is intended
        for i in range(0, (len(f) - pep_length)):
            # compute the distance once and reuse it for the hit record
            mismatches = hamming(peptide_seq, f[i:pep_length + i])
            if mismatches > allowed_mismatches:
                continue
            hits.append([(i * 3) + frame_offset, mismatches])

            # compute 2 preceding AA ("*" when the match starts the frame)
            if i == 1:
                pre_post_aa[0] = f[0]
            elif i >= 2:
                pre_post_aa[0] = f[(i - 2):i]
            else:
                pre_post_aa[0] = "*"

            # compute 2 following AA
            # NOTE(review): the conditions test i rather than the end of the
            # match; for a non-empty peptide the loop bound guarantees at
            # least one residue follows, so the "*" branch is not reached
            if (i + 1) == (len(f) - 1):
                pre_post_aa[1] = f[pep_length + i]
            elif (i + 2) <= (len(f) - 1):
                pre_post_aa[1] = f[(pep_length + i):(pep_length + i + 2)]
            else:
                pre_post_aa[1] = "*"
    return [hits, pre_post_aa]
Пример #4
0
def map_peptide_to_protein_3frame(peptide_seq,transcript_seq,allowed_mismatches,strand):
    '''
    Map a peptide onto the three translations of a transcript obtained by
    starting at nucleotide 0, 1 and 2.

    :param peptide_seq: peptide sequence (string)
    :param transcript_seq: transcript sequence (string)
    :param allowed_mismatches: number of allowed mismatches
    :param strand: chromosome strand (not used inside this function)
    :return: list [hits, pre_post_aa]; every hit is
             [(amino acid index * 3) + frame start, number of mismatches];
             pre_post_aa holds the flanking residues of the last hit found,
             or ['', ''] if no hit was found
    '''
    hits = []
    pre_post_aa = ['', '']
    pep_length = len(peptide_seq)
    translations = [
        standard_code.translate(transcript_seq),
        standard_code.translate(transcript_seq[1:]),
        standard_code.translate(transcript_seq[2:]),
    ]
    # frame_start is the nucleotide at which translation began (0, 1 or 2)
    # and adjusts the hit position accordingly
    for frame_start, protein in enumerate(translations):
        last_index = len(protein) - 1
        # NOTE(review): the window ending exactly at the end of the
        # translation is outside this range and is never tested -- confirm
        for i in range(0, (len(protein) - pep_length)):
            window_end = pep_length + i
            # distance is calculated a single time per window
            distance = hamming(peptide_seq, protein[i:window_end])
            if distance <= allowed_mismatches:
                hits.append([(i * 3) + frame_start, distance])

                # compute 2 preceding AA
                if i == 1:
                    pre_post_aa[0] = protein[0]
                elif i >= 2:
                    pre_post_aa[0] = protein[(i - 2):i]
                else:
                    # hit starts at the first residue
                    pre_post_aa[0] = "*"

                # compute 2 following AA
                # NOTE(review): these tests use i, not window_end; with a
                # non-empty peptide the loop bound leaves at least one
                # residue after the hit, so the "*" branch is not reached
                if (i + 1) == last_index:
                    pre_post_aa[1] = protein[window_end]
                elif (i + 2) <= last_index:
                    pre_post_aa[1] = protein[window_end:(window_end + 2)]
                else:
                    pre_post_aa[1] = "*"

    return [hits, pre_post_aa]
Пример #5
0
def ensembl_construct_sequences(
    psm_hash,
    mysql_db,
    transcript_ids,
    database_v,
    species,
    three_frame_translation,
    mode,
):
    '''
    :param psm_hash: dictionair with protein / ensembl information ( see prepareAnnotationENSEMBL)
    :param ensembl:ensembl genome
    :param transcript_ids: list of transcrip ids (converted from protein IDs)
    :param database_v: database version
    :param species: species name
    :return: dictionairy mapping proteins into ENSEMBL
    '''
    print "Commencing transcript and protein sequence retrieval"

    no_protein_seq = []
    biomart_key_hash = {}
    stable_transcript_ids = []

    for key in psm_hash.keys():
        biomart_key_hash[psm_hash[key]['transcript_id']] = key
        stable_transcript_ids.append(psm_hash[key]['transcript_id'])

    chunked_stable_transcript_id = chunkIt(stable_transcript_ids, 10)
    process = 0
    c = 0
    for chunk in chunked_stable_transcript_id:
        # Retrieve cds,chr,transcript_id and strand from biomart
        try:
            biomart_result = proBAM_biomart.retrieve_data_from_biomart(
                database_v, species, chunk, three_frame_translation)
        except AttributeError:
            time.sleep(60)
            print "BioMart connection timeout, reconnecting to BioMart"
            biomart_result = proBAM_biomart.retrieve_data_from_biomart(
                database_v, species, chunk, three_frame_translation)
        for row in biomart_result:
            row = row.split("\t")
            try:
                psm_hash[biomart_key_hash[row[1]]]['transcript_seq'] = row[0]
                psm_hash[biomart_key_hash[row[1]]]['shift'] = _calc_seq_shift_(
                    row[0])
                psm_hash[biomart_key_hash[
                    row[1]]]['protein_seq'] = standard_code.translate(row[0])
                psm_hash[biomart_key_hash[row[1]]]['chr'] = row[2]
                psm_hash[biomart_key_hash[row[1]]]['strand'] = row[3]
                del row
            except IndexError:
                pass
        del biomart_result
        if process < 100:
            process += 10
            print str(process) + "% ",
    print " "

    # get exons directly from core database
    temp_exon_hash = get_ensembl_exons(mysql_db, transcript_ids, psm_hash,
                                       mode)
    exon_hash = temp_exon_hash[0]
    psm_hash = temp_exon_hash[1]
    del temp_exon_hash
    # retrieve protein sequences for transcript where the protein sequence could not be fetched automatically
    for key in no_protein_seq:
        psm_hash[key]['transcript_seq'] = retrieve_protein_seq(
            psm_hash[key]['transcript_seq'],
            exon_hash[psm_hash[key]['transcript_id']],
            psm_hash[key]['5UTR_offset'], psm_hash[key]['start_exon_rank'])
        psm_hash[key]['shift'] = _calc_seq_shift_(
            psm_hash[key]['transcript_seq'])
        #translate till stop codon
        psm_hash[key]['protein_seq'] = standard_code.translate(
            psm_hash[key]['transcript_seq']).partition('*')[0]
    return [psm_hash, exon_hash]