def importCriticalExonLocations(species,array_type,ensembl_exon_db,force):
    ###ensembl_exon_db[(geneid,chr,strand)] = [[E5,exon_info]] #exon_info = (exon_start,exon_stop,exon_id,exon_annot)
    ###ensembl_probeset_db[geneid,chr,strand].append(probeset_data) #probeset_data = [start,stop,probeset_id,exon_class,transcript_cluster_id]
    gene_info_db = {}
    for (ens_geneid,chr,strand) in ensembl_exon_db: gene_info_db[ens_geneid] = chr,strand
    filename = 'AltDatabase/'+species+'/'+array_type+'/'+array_type+'_critical_exon_locations.txt'
    array_ensembl={}

    ###Get the most recent gene-symbol annotations (applicable with a new Ensembl build for the same genomic build)
    ensembl_symbol_db = getEnsemblAnnotations(species)
    primary_gene_annotation_file = 'AltDatabase/'+species +'/'+ array_type +'/'+ array_type+ '_gene_annotations.txt'
    update.verifyFile(primary_gene_annotation_file,array_type)
    array_gene_annotations = JunctionArray.importGeneric(primary_gene_annotation_file)
            
    for array_geneid in array_gene_annotations:    
        t = array_gene_annotations[array_geneid]; description=t[0];entrez=t[1];symbol=t[2]
        if symbol in ensembl_symbol_db and len(symbol)>0 and len(array_geneid)>0:
            ens_geneid = ensembl_symbol_db[symbol]
            if len(ens_geneid)>0: array_ensembl[array_geneid]= ens_geneid
          
    update.verifyFile(filename,array_type)  
    ensembl_probeset_db = importJunctionLocationData(filename,array_ensembl,gene_info_db,test)
       
    print len(ensembl_probeset_db), "Genes inlcuded in",array_type,"location database"
    return ensembl_probeset_db
예제 #2
0
def importCriticalJunctionSeq(filename, species, array_type):
    update.verifyFile(filename, array_type)  ### Will force download if missing
    fn = filepath(filename)
    probeset_seq_db = {}
    x = 0
    for line in open(fn, 'rU').xreadlines():
        data, newline = string.split(line, '\n')
        t = string.split(data, '\t')
        if x == 0: x = 1
        else:
            try:
                probeset, probeset_seq, junction_seq = t
            except Exception:
                try:
                    probeset, probeset_seq, junction_seq, null = t
                except Exception:
                    print filename, t
                    kill
            if array_type == 'RNASeq':
                ### Ensure the junction sequence is sufficient for searching
                left, right = string.split(probeset_seq, '|')
                if len(left) > 2 and len(right) > 2: null = []
                else: probeset_seq = ''
                if len(probeset_seq) < 8: probeset_seq = ''
            probeset_seq = string.replace(probeset_seq, '|', '')
            probeset_seq_db[probeset] = probeset_seq, junction_seq
            x += 1

    print len(probeset_seq_db), 'probesets with associated sequence'
    return probeset_seq_db
예제 #3
0
def importCriticalJunctionSeq(filename, species, array_type):
    update.verifyFile(filename, array_type)  ### Will force download if missing
    fn = filepath(filename)
    probeset_seq_db = {}
    x = 0
    for line in open(fn, "rU").xreadlines():
        data, newline = string.split(line, "\n")
        t = string.split(data, "\t")
        if x == 0:
            x = 1
        else:
            try:
                probeset, probeset_seq, junction_seq = t
            except Exception:
                try:
                    probeset, probeset_seq, junction_seq, null = t
                except Exception:
                    print filename, t
                    kill
            if array_type == "RNASeq":
                ### Ensure the junction sequence is sufficient for searching
                left, right = string.split(probeset_seq, "|")
                if len(left) > 2 and len(right) > 2:
                    null = []
                else:
                    probeset_seq = ""
                if len(probeset_seq) < 8:
                    probeset_seq = ""
            probeset_seq = string.replace(probeset_seq, "|", "")
            probeset_seq_db[probeset] = probeset_seq, junction_seq
            x += 1

    print len(probeset_seq_db), "probesets with associated sequence"
    return probeset_seq_db
예제 #4
0
def importCriticalExonLocations(species, array_type, ensembl_exon_db, force):
    ###ensembl_exon_db[(geneid,chr,strand)] = [[E5,exon_info]] #exon_info = (exon_start,exon_stop,exon_id,exon_annot)
    ###ensembl_probeset_db[geneid,chr,strand].append(probeset_data) #probeset_data = [start,stop,probeset_id,exon_class,transcript_cluster_id]
    gene_info_db = {}
    for (ens_geneid, chr, strand) in ensembl_exon_db:
        gene_info_db[ens_geneid] = chr, strand
    filename = 'AltDatabase/' + species + '/' + array_type + '/' + array_type + '_critical_exon_locations.txt'
    array_ensembl = {}

    ###Get the most recent gene-symbol annotations (applicable with a new Ensembl build for the same genomic build)
    ensembl_symbol_db = getEnsemblAnnotations(species)
    primary_gene_annotation_file = 'AltDatabase/' + species + '/' + array_type + '/' + array_type + '_gene_annotations.txt'
    update.verifyFile(primary_gene_annotation_file, array_type)
    array_gene_annotations = JunctionArray.importGeneric(
        primary_gene_annotation_file)

    for array_geneid in array_gene_annotations:
        t = array_gene_annotations[array_geneid]
        description = t[0]
        entrez = t[1]
        symbol = t[2]
        if symbol in ensembl_symbol_db and len(symbol) > 0 and len(
                array_geneid) > 0:
            ens_geneid = ensembl_symbol_db[symbol]
            if len(ens_geneid) > 0: array_ensembl[array_geneid] = ens_geneid

    update.verifyFile(filename, array_type)
    ensembl_probeset_db = importJunctionLocationData(filename, array_ensembl,
                                                     gene_info_db, test)

    print len(ensembl_probeset_db
              ), "Genes inlcuded in", array_type, "location database"
    return ensembl_probeset_db
예제 #5
0
def importSplicingAnnotationDatabaseAndSequence(species,array_type,biotype):
    array_ens_db={}
    if array_type == 'AltMouse':
        filename = 'AltDatabase/'+species+'/'+array_type+'/'+array_type+'-Ensembl_relationships.txt'
        update.verifyFile(filename,array_type) ### Will force download if missing
        fn=filepath(filename); x = 0
        for line in open(fn,'r').xreadlines():
            data, newline = string.split(line,'\n'); t = string.split(data,'\t')
            if x==0: x=1
            else: 
                array_gene,ens_gene = t
                try: array_ens_db[array_gene].append(ens_gene)
                except KeyError: array_ens_db[array_gene]=[ens_gene]

    filename = 'AltDatabase/'+species+'/'+array_type+'/'+array_type+'_critical-junction-seq.txt'         
    fn=filepath(filename); probeset_seq_db={}; x = 0
    for line in open(fn,'r').xreadlines():
        data, newline = string.split(line,'\n'); t = string.split(data,'\t')
        if x==0: x=1
        else: 
            probeset,probeset_seq,junction_seq = t; junction_seq=string.replace(junction_seq,'|','')
            probeset_seq_db[probeset] = probeset_seq,junction_seq
            
    ###Import reciprocol junctions, so we can compare these directly instead of hits to nulls and combine with sequence data
    ###This short-cuts what we did in two function in ExonSeqModule with exon level data
    filename = 'AltDatabase/'+species+'/'+array_type+'/'+array_type+'_junction-comparisons.txt'
    fn=filepath(filename); probeset_gene_seq_db={}; x = 0
    for line in open(fn,'r').xreadlines():
        data, newline = string.split(line,'\n'); t = string.split(data,'\t')
        if x==0: x=1
        else: 
            array_gene,probeset1,probeset2,critical_exons = t #; critical_exons = string.split(critical_exons,'|')
            probesets = [probeset1,probeset2]
            if array_type == 'junction' or array_type == 'RNASeq': array_ens_db[array_gene]=[array_gene]
            if array_gene in array_ens_db:
                ensembl_gene_ids = array_ens_db[array_gene]
                for probeset_id in probesets:
                    if probeset_id in probeset_seq_db:
                        probeset_seq,junction_seq = probeset_seq_db[probeset_id]
                        if biotype == 'gene':
                            for ensembl_gene_id in ensembl_gene_ids:
                                probe_data = ExonSeqModule.JunctionDataSimple(probeset_id,ensembl_gene_id,array_gene,probesets,critical_exons)
                                probe_data.SetExonSeq(probeset_seq)
                                probe_data.SetJunctionSeq(junction_seq)
                                try: probeset_gene_seq_db[ensembl_gene_id].append(probe_data)
                                except KeyError: probeset_gene_seq_db[ensembl_gene_id] = [probe_data]
                        else: ### Used for probeset annotations downstream of sequence alignment in LinkEST, analagous to exon_db for exon analyses
                            probe_data = ExonSeqModule.JunctionDataSimple(probeset_id,ensembl_gene_ids,array_gene,probesets,critical_exons)
                            probe_data.SetExonSeq(probeset_seq)
                            probe_data.SetJunctionSeq(junction_seq)                            
                            probeset_gene_seq_db[probeset_id] = probe_data                
    print len(probeset_gene_seq_db),"genes with probeset sequence associated"
    return probeset_gene_seq_db
예제 #6
0
def importSplicingAnnotationDatabaseAndSequence(species,array_type,biotype):
    array_ens_db={}
    if array_type == 'AltMouse':
        filename = 'AltDatabase/'+species+'/'+array_type+'/'+array_type+'-Ensembl_relationships.txt'
        update.verifyFile(filename,array_type) ### Will force download if missing
        fn=filepath(filename); x = 0
        for line in open(fn,'r').xreadlines():
            data, newline = string.split(line,'\n'); t = string.split(data,'\t')
            if x==0: x=1
            else: 
                array_gene,ens_gene = t
                try: array_ens_db[array_gene].append(ens_gene)
                except KeyError: array_ens_db[array_gene]=[ens_gene]

    filename = 'AltDatabase/'+species+'/'+array_type+'/'+array_type+'_critical-junction-seq.txt'         
    fn=filepath(filename); probeset_seq_db={}; x = 0
    for line in open(fn,'r').xreadlines():
        data, newline = string.split(line,'\n'); t = string.split(data,'\t')
        if x==0: x=1
        else: 
            probeset,probeset_seq,junction_seq = t; junction_seq=string.replace(junction_seq,'|','')
            probeset_seq_db[probeset] = probeset_seq,junction_seq
            
    ###Import reciprocol junctions, so we can compare these directly instead of hits to nulls and combine with sequence data
    ###This short-cuts what we did in two function in ExonSeqModule with exon level data
    filename = 'AltDatabase/'+species+'/'+array_type+'/'+array_type+'_junction-comparisons.txt'
    fn=filepath(filename); probeset_gene_seq_db={}; x = 0
    for line in open(fn,'r').xreadlines():
        data, newline = string.split(line,'\n'); t = string.split(data,'\t')
        if x==0: x=1
        else: 
            array_gene,probeset1,probeset2,critical_exons = t #; critical_exons = string.split(critical_exons,'|')
            probesets = [probeset1,probeset2]
            if array_type == 'junction' or array_type == 'RNASeq': array_ens_db[array_gene]=[array_gene]
            if array_gene in array_ens_db:
                ensembl_gene_ids = array_ens_db[array_gene]
                for probeset_id in probesets:
                    if probeset_id in probeset_seq_db:
                        probeset_seq,junction_seq = probeset_seq_db[probeset_id]
                        if biotype == 'gene':
                            for ensembl_gene_id in ensembl_gene_ids:
                                probe_data = ExonSeqModule.JunctionDataSimple(probeset_id,ensembl_gene_id,array_gene,probesets,critical_exons)
                                probe_data.SetExonSeq(probeset_seq)
                                probe_data.SetJunctionSeq(junction_seq)
                                try: probeset_gene_seq_db[ensembl_gene_id].append(probe_data)
                                except KeyError: probeset_gene_seq_db[ensembl_gene_id] = [probe_data]
                        else: ### Used for probeset annotations downstream of sequence alignment in LinkEST, analagous to exon_db for exon analyses
                            probe_data = ExonSeqModule.JunctionDataSimple(probeset_id,ensembl_gene_ids,array_gene,probesets,critical_exons)
                            probe_data.SetExonSeq(probeset_seq)
                            probe_data.SetJunctionSeq(junction_seq)                            
                            probeset_gene_seq_db[probeset_id] = probe_data                
    print len(probeset_gene_seq_db),"genes with probeset sequence associated"
    return probeset_gene_seq_db
예제 #7
0
def runExtractUniProt(species,species_full,uniprot_filename_url,trembl_filename_url,force):
    """Import Ensembl-UniProt relationships and UniProt annotations for a species, then export them.

    Module globals reset or set here: uniprot_ensembl_db, uniprot_db and secondary_to_primary_db
    (emptied), species_name (set to the species_full argument as passed in) and uniprot_fildir.

    species              -- species code used to build 'AltDatabase/uniprot/<species>/' paths
    species_full         -- full species name, stored in the species_name global
    uniprot_filename_url -- download URL of the UniProt file; its last path element minus '.gz' is the local file name
    trembl_filename_url  -- download URL of the TrEMBL file, handled the same way (TrEMBL import is currently disabled)
    force                -- 'no' imports the local UniProt file directly; otherwise the file is downloaded
                            unless update.verifyFile reports a count above 8

    Nothing is returned; the results are written by export() and exportEnsemblUniprot().
    """
    global uniprot_ensembl_db;uniprot_ensembl_db={}
    global uniprot_db;uniprot_db={}; global species_name; global uniprot_fildir
    global secondary_to_primary_db; secondary_to_primary_db={}
    import update; reload(update)

    species_name = species_full

    ### species_full is rebuilt from the UI species table here but is not read again in this function
    import UI; species_names = UI.getSpeciesInfo()
    species_full = species_names[species]
    species_full = string.replace(species_full,' ','_')

    uniprot_file = string.split(uniprot_filename_url,'/')[-1]; uniprot_file = string.replace(uniprot_file,'.gz','')
    trembl_file = string.split(trembl_filename_url,'/')[-1]; trembl_file = string.replace(trembl_file,'.gz','')
    uniprot_fildir = 'AltDatabase/uniprot/'+species+'/'
    uniprot_download_fildir = 'AltDatabase/uniprot/'
    uniprot_ens_file = species+'_Ensembl-UniProt.txt'; uniprot_ens_location = uniprot_fildir+uniprot_ens_file
    uniprot_location = uniprot_download_fildir+uniprot_file
    trembl_location = uniprot_download_fildir+trembl_file

    add_trembl_annotations = 'no' ### Currently we don't need these annotations
    try: importEnsemblUniprot(uniprot_ens_location)
    except IOError:
        try:
            ### Download the data from the AltAnalyze website (if there)
            update.downloadCurrentVersion(uniprot_ens_location,species,'txt')
            importEnsemblUniprot(uniprot_ens_location)
        except Exception: pass ### Best effort: continue without these relationships
    try:
        uniprot_ens_location_built = string.replace(uniprot_ens_location,'UniProt','Uniprot-SWISSPROT')
        uniprot_ens_location_built = string.replace(uniprot_ens_location_built,'uniprot','Uniprot-SWISSPROT')
        importEnsemblUniprot(uniprot_ens_location_built)
    except Exception: pass ### Best effort: the built file is optional

    ### Import UniProt annotations
    counts = update.verifyFile(uniprot_location,'counts')
    if force == 'no' or counts > 8: import_uniprot_db(uniprot_location)
    else:
        ### Directly download the data from UniProt
        gz_filepath, status = update.download(uniprot_filename_url,uniprot_download_fildir,'')

        if status == 'not-removed':
            try: os.remove(gz_filepath) ### Not sure why this works now and not before
            except OSError: pass ### The compressed file could not be removed; carry on with the import
        import_uniprot_db(uniprot_location)

    if add_trembl_annotations == 'yes':
        ### Import TreMBL annotations
        try:
            ### Raise directly to force the download branch; the previous code altered
            ### uniprot_location, which is not the file opened below, so no IOError was forced
            if force == 'yes': raise IOError('Forced TrEMBL download')
            import_uniprot_db(trembl_location)
        except IOError:
            ### Directly download the data from UniProt
            update.download(trembl_filename_url,uniprot_download_fildir,'')
            import_uniprot_db(trembl_location)
    export()
    exportEnsemblUniprot(uniprot_ens_location)
예제 #8
0
def runExtractUniProt(species,species_full,uniprot_filename_url,trembl_filename_url,force):
    """Load Ensembl-UniProt relationships and UniProt annotations for one species and export the result.

    Side effects on module globals: uniprot_ensembl_db, uniprot_db and secondary_to_primary_db are
    emptied, species_name receives the species_full argument as passed in, and uniprot_fildir is
    set to 'AltDatabase/uniprot/<species>/'.

    Arguments:
      species              -- species code used in the file paths
      species_full         -- full species name (copied to the species_name global)
      uniprot_filename_url -- URL of the UniProt download; the local file name is its last path element without '.gz'
      trembl_filename_url  -- URL of the TrEMBL download, named the same way (TrEMBL import is switched off below)
      force                -- with 'no' the local UniProt file is imported as is; otherwise it is downloaded first
                              unless update.verifyFile returns a count greater than 8

    Returns None; output is produced by export() and exportEnsemblUniprot().
    """
    global uniprot_ensembl_db;uniprot_ensembl_db={}
    global uniprot_db;uniprot_db={}; global species_name; global uniprot_fildir
    global secondary_to_primary_db; secondary_to_primary_db={}
    import update; reload(update)

    species_name = species_full

    ### The value assigned to species_full below is not used again within this function
    import UI; species_names = UI.getSpeciesInfo()
    species_full = species_names[species]
    species_full = string.replace(species_full,' ','_')

    uniprot_file = string.split(uniprot_filename_url,'/')[-1]; uniprot_file = string.replace(uniprot_file,'.gz','')
    trembl_file = string.split(trembl_filename_url,'/')[-1]; trembl_file = string.replace(trembl_file,'.gz','')
    uniprot_fildir = 'AltDatabase/uniprot/'+species+'/'
    uniprot_download_fildir = 'AltDatabase/uniprot/'
    uniprot_ens_file = species+'_Ensembl-UniProt.txt'; uniprot_ens_location = uniprot_fildir+uniprot_ens_file
    uniprot_location = uniprot_download_fildir+uniprot_file
    trembl_location = uniprot_download_fildir+trembl_file

    add_trembl_annotations = 'no' ### Currently we don't need these annotations
    try: importEnsemblUniprot(uniprot_ens_location)
    except IOError:
        try:
            ### Download the data from the AltAnalyze website (if there)
            update.downloadCurrentVersion(uniprot_ens_location,species,'txt')
            importEnsemblUniprot(uniprot_ens_location)
        except Exception: pass ### Deliberately ignored: the relationships file is optional at this point
    try:
        uniprot_ens_location_built = string.replace(uniprot_ens_location,'UniProt','Uniprot-SWISSPROT')
        uniprot_ens_location_built = string.replace(uniprot_ens_location_built,'uniprot','Uniprot-SWISSPROT')
        importEnsemblUniprot(uniprot_ens_location_built)
    except Exception: pass ### Deliberately ignored: the built file may not exist

    ### Import UniProt annotations
    counts = update.verifyFile(uniprot_location,'counts')
    if force == 'no' or counts > 8: import_uniprot_db(uniprot_location)
    else:
        ### Directly download the data from UniProt
        gz_filepath, status = update.download(uniprot_filename_url,uniprot_download_fildir,'')

        if status == 'not-removed':
            try: os.remove(gz_filepath) ### Not sure why this works now and not before
            except OSError: pass ### Leaving the compressed file behind does not block the import
        import_uniprot_db(uniprot_location)

    if add_trembl_annotations == 'yes':
        ### Import TreMBL annotations
        try:
            ### An explicit raise reaches the download branch; appending to uniprot_location
            ### (as done previously) had no effect on the trembl_location import below
            if force == 'yes': raise IOError('Forced TrEMBL download')
            import_uniprot_db(trembl_location)
        except IOError:
            ### Directly download the data from UniProt
            update.download(trembl_filename_url,uniprot_download_fildir,'')
            import_uniprot_db(trembl_location)
    export()
    exportEnsemblUniprot(uniprot_ens_location)
예제 #9
0
def grab_exon_level_feature_calls(species,array_type,genes_analyzed):
    arrayid_uniprot_file = 'AltDatabase/uniprot/'+species+'/'+'arrayid-uniprot.txt'    
    arrayid_ensembl_file = 'AltDatabase/'+species+'/'+array_type+'/'+array_type+'-Ensembl_relationships.txt'
    ensembl_ft_file = 'AltDatabase/ensembl/'+species+'/'+'DomainFile_All.txt'
    null,null,null,protein_coordinate_file = getEnsemblRelationshipDirs(species)

    global uniprot_arrayid_db; uniprot_arrayid_db = {}; global arrayid_uniprot_db; arrayid_uniprot_db = {}
    global ensembl_arrayid_db; ensembl_arrayid_db={}
    if array_type == 'AltMouse':
        update.verifyFile(arrayid_uniprot_file,array_type) ### Will force download if missing
        update.verifyFile(arrayid_ensembl_file,array_type) ### Will force download if missing
        import_arrayid_uniprot(arrayid_uniprot_file)
        import_arrayid_ensembl(arrayid_ensembl_file)
        ###Otherwise, these databases can be built on-the-fly in downstream methods, since Ensembl will be used as the array gene id
    else: ensembl_arrayid_db = genes_analyzed ###ensembl to ensembl for those being analyzed in the program
    ensembl_protein_seq_db,ensembl_ft_db,domain_gene_counts = import_ensembl_ft_data(species,ensembl_ft_file,ensembl_arrayid_db,array_type) ###Import function domain annotations for Ensembl proteins
    print 'Ensembl based domain feature genes:',len(ensembl_ft_db),len(domain_gene_counts)
    uniprot_protein_seq_db,uniprot_ft_db,domain_gene_counts = import_uniprot_ft_data(species,protein_coordinate_file,domain_gene_counts,ensembl_arrayid_db,array_type)  ###" " " " UniProt "
    print 'UniProt based domain feature genes:',len(uniprot_ft_db),len(domain_gene_counts)
    arrayid_ft_db = combineDatabases(uniprot_ft_db,ensembl_ft_db)  ###arrayid relating to classes of functional domain attributes and associated proteins (ensembl and uniprot)
    return arrayid_ft_db,domain_gene_counts
예제 #10
0
def importCriticalJunctionSeq(filename,species,array_type):
    update.verifyFile(filename,array_type) ### Will force download if missing
    fn=filepath(filename); probeset_seq_db={}; x = 0
    for line in open(fn,'rU').xreadlines():
        data, newline = string.split(line,'\n'); t = string.split(data,'\t')
        if x==0: x=1
        else: 
            try: probeset,probeset_seq,junction_seq = t
            except Exception:
                try:
                    probeset,probeset_seq,junction_seq, null = t
                except Exception: print filename,t;kill
            if array_type == 'RNASeq':
                ### Ensure the junction sequence is sufficient for searching
                left,right = string.split(probeset_seq,'|')
                if len(left)>2 and len(right)>2: null=[]
                else: probeset_seq = ''
                if len(probeset_seq) < 8: probeset_seq = ''
            probeset_seq=string.replace(probeset_seq,'|','')
            probeset_seq_db[probeset] = probeset_seq,junction_seq
            x+=1
            
    print len(probeset_seq_db),'probesets with associated sequence'
    return probeset_seq_db
예제 #11
0
def exportAltMouseExonSequence():
    """Export critical exon sequences and junction probeset sequences for the AltMouse array.

    Reads, from 'AltDatabase/Mm/AltMouse/':
      - 'AltMouse_junction-comparisons.txt' (gene, probeset1, probeset2, critical exons)
      - 'MASTER-probeset-transcript.txt' (probeset to exon annotations)
      - 'SEQUENCE-transcript-dbase.txt' (via import_existing_sequence_build)
      - 'probeset_sequence_reversed.txt' (probeset followed by its probe sequences)
    and writes:
      - 'AltMouse_critical-exon-seq.txt'
      - 'AltMouse_critical-junction-seq.txt'

    Takes no arguments and returns nothing; two counters are printed at the end (probe sequences
    found in the assembled junction sequence, and probe sequences not found).
    """
    probeset_exon_db = {}
    x = 0
    species = 'Mm'
    array_type = 'AltMouse'

    critical_exon_import_file = 'AltDatabase/Mm/AltMouse/AltMouse_junction-comparisons.txt'
    update.verifyFile(critical_exon_import_file, array_type)
    critical_exon_db = {}
    critical_probesets = {}
    fn = filepath(critical_exon_import_file)
    ### Every line of this file, including the first, is parsed as a data row
    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        gene, probeset1, probeset2, critical_exons = string.split(data, '\t')
        critical_exons = string.split(critical_exons, '|')
        ### (gene, exon) -> list of 'probeset1-probeset2' comparisons sharing that critical exon
        for exon in critical_exons:
            try:
                critical_exon_db[gene,
                                 exon].append(probeset1 + '-' + probeset2)
            except KeyError:
                critical_exon_db[gene, exon] = [probeset1 + '-' + probeset2]
            critical_probesets[probeset1] = []
            critical_probesets[probeset2] = []

    probeset_annotations_file = "AltDatabase/Mm/AltMouse/MASTER-probeset-transcript.txt"
    update.verifyFile(probeset_annotations_file, array_type)
    fn = filepath(probeset_annotations_file)
    for line in open(fn, 'rU').xreadlines():
        probeset_data = cleanUpLine(line)  #remove endline
        if x == 0: x = 1  ### skip the first (header) line
        else:
            probeset, affygene, exons, transcript_num, transcripts, probe_type_call, ensembl, block_exon_ids, block_structure, comparison_info = string.split(
                probeset_data, '\t')
            if probeset in critical_probesets:
                ### The last character of the exon string and of the gene ID is dropped before use
                exons = exons[:-1]
                exons = string.split(exons, '-')
                affygene = affygene[:-1]
                ### exons is a list here, so this tests for an element equal to '|'
                if '|' in exons:
                    print exons
                    ### NOTE(review): 'kill' is not defined in this function; presumably an intentional NameError to abort - confirm
                    kill
                probeset_exon_db[probeset, affygene] = exons

    exon_protein_sequence_file = "AltDatabase/Mm/AltMouse/SEQUENCE-transcript-dbase.txt"
    update.verifyFile(exon_protein_sequence_file, array_type)
    ### Only exon_sequence_database, indexed by (gene, exon_num), is used below
    transcript_cdna_sequence_dbase, transcript_associations, exon_sequence_database = import_existing_sequence_build(
        exon_protein_sequence_file)

    critical_exon_seq_export = 'AltDatabase/Mm/AltMouse/AltMouse_critical-exon-seq.txt'
    update.verifyFile(critical_exon_seq_export, array_type)
    fn = filepath(critical_exon_seq_export)
    data = open(fn, 'w')
    ### NOTE(review): each row written below holds gene:exon, the '|'-joined comparisons and the exon sequence - confirm the header labels match
    title = ['Affygene:exon', 'critical_exon-num', 'critical-probeset-comps']
    title = string.join(title, '\t') + '\n'
    data.write(title)
    for (gene, exon_num) in critical_exon_db:
        probeset_comp_list = critical_exon_db[(gene, exon_num)]
        probeset_comp_list = string.join(probeset_comp_list, '|')
        try:  ###Restrict export to previously exported critical exons (ExonAnnotate_module)
            exon_sequence_database[(gene, exon_num)]
            esd = exon_sequence_database[(gene, exon_num)]
            exon_seq = esd.ExonSeq()
            exon_data = string.join(
                [gene + ':' + exon_num, probeset_comp_list, exon_seq],
                '\t') + '\n'
            data.write(exon_data)
        except KeyError:
            ### Exons without a stored sequence are not exported
            null = []
    data.close()

    probeset_seq_file = 'AltDatabase/Mm/AltMouse/probeset_sequence_reversed.txt'
    update.verifyFile(probeset_seq_file, array_type)
    probeset_seq_db = {}
    x = 0
    fn = filepath(probeset_seq_file)
    for line in open(fn, 'rU').xreadlines():
        if x == 0: x = 1  ### skip the first (header) line
        else:
            data = cleanUpLine(line)
            t = string.split(data, '\t')
            ### First column is the probeset; all remaining columns are its probe sequences
            probeset = t[0]
            probeset_seq_list = t[1:]
            probeset_seq_db[probeset] = probeset_seq_list

    critical_junction_seq_export = 'AltDatabase/Mm/AltMouse/AltMouse_critical-junction-seq.txt'
    update.verifyFile(critical_junction_seq_export, array_type)
    fn = filepath(critical_junction_seq_export)
    data = open(fn, 'w')
    ### k counts probe sequences found in the assembled junction sequence, x those not found; l is never used
    x = 0
    k = 0
    l = 0
    title = ['probeset', 'probeset-seq', 'junction-seq']
    title = string.join(title, '\t') + '\n'
    data.write(title)
    for (probeset, gene) in probeset_exon_db:
        junction_seq = []
        y = 0  ### number of this probeset's exons that have a stored sequence
        positions = []
        ### A KeyError raised anywhere in this block (e.g. probeset without probe sequences) skips the probeset
        try:
            probeset_seq_list = probeset_seq_db[probeset]
            for exon_num in probeset_exon_db[(probeset, gene)]:
                try:  ###Restrict export to previously exported critical exons (ExonAnnotate_module)
                    exon_sequence_database[(gene, exon_num)]
                    esd = exon_sequence_database[(gene, exon_num)]
                    exon_seq = esd.ExonSeq()
                    strand = esd.Strand()
                    junction_seq.append(exon_seq)
                    y += 1
                    #exon_data = string.join([gene+':'+exon_num,probeset_comp_list,exon_seq],'\t')+'\n'
                    #data.write(exon_data)
                except KeyError:
                    null = []
            #if 'E5' in probeset_exon_db[(probeset,gene)]:
            if y > 0:
                ### strand holds the value from the last exon that had a stored sequence
                if strand == '-': junction_seq.reverse()
                junction_seq_str = string.join(junction_seq, '')
                junction_seq_str = string.upper(junction_seq_str)
                not_found = 0
                for probeset_seq in probeset_seq_list:
                    #probeset_seq = reverse_string(probeset_seq)
                    ### probeset_seq_rev is computed but not used afterwards
                    probeset_seq_rev = reverse_orientation(probeset_seq)
                    if probeset_seq in junction_seq_str:
                        ### Record (start offset, length) of each probe sequence located in the junction sequence
                        f = string.find(junction_seq_str, probeset_seq)
                        positions.append((f, len(probeset_seq)))
                        k += 1
                    else:
                        not_found = 1
                        x += 1
                if not_found == 1:
                    new_probeset_seq = probeset_seq_list[
                        0]  ###pick the first probe sequence found
                ### When any probe matched, the span from the first match start to the last match end replaces the choice above
                if len(positions) > 0:
                    positions.sort()
                    new_probeset_seq = junction_seq_str[
                        positions[0][0]:positions[-1][0] + positions[-1][1]]
                    #print new_probeset_seq,positions, probeset,probeset_exon_db[(probeset,gene)],probeset_seq_list,junction_seq;kill
                junction_seq = string.join(
                    junction_seq, '|')  ###indicate where the junction is
                probe_seq_data = string.join(
                    [probeset, new_probeset_seq, junction_seq], '\t') + '\n'
                data.write(probe_seq_data)
        except KeyError:
            null = []
    data.close()
    print k, x
예제 #12
0
def importJunctionAnnotationDatabaseAndSequence(species, array_type, biotype):
    """This function imports AffyGene-Ensembl relationships, junction probeset sequences, and recipricol junction comparisons.
    with data stored from this function, we can match probeset sequence to mRNAs and determine which combinations of probesets
    can be used as match-match or match-nulls."""

    array_ens_db = {}
    if array_type == 'AltMouse':
        ### Import AffyGene to Ensembl associations (e.g., AltMouse array)
        filename = 'AltDatabase/' + species + '/' + array_type + '/' + array_type + '-Ensembl_relationships.txt'
        update.verifyFile(filename,
                          array_type)  ### Will force download if missing
        fn = filepath(filename)
        x = 0
        for line in open(fn, 'rU').xreadlines():
            data, newline = string.split(line, '\n')
            t = string.split(data, '\t')
            if x == 0: x = 1
            else:
                array_gene, ens_gene = t
                try:
                    array_ens_db[array_gene].append(ens_gene)
                except KeyError:
                    array_ens_db[array_gene] = [ens_gene]
        print len(array_ens_db), 'Ensembl-AltMouse relationships imported.'

    if array_type == 'RNASeq' and coordinateBasedMatching == True:
        probeset_seq_db = {}
    else:
        filename = 'AltDatabase/' + species + '/' + array_type + '/' + array_type + '_critical-junction-seq.txt'
        probeset_seq_db = importCriticalJunctionSeq(filename, species,
                                                    array_type)

    ###Import reciprocol junctions, so we can compare these directly instead of hits to nulls and combine with sequence data
    ###This short-cuts what we did in two function in ExonModule with exon level data
    if array_type == 'AltMouse':
        filename = 'AltDatabase/' + species + '/' + array_type + '/' + array_type + '_junction-comparisons.txt'
        update.verifyFile(filename,
                          array_type)  ### Will force download if missing
    elif array_type == 'junction':
        filename = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_junction_comps_updated.txt'
    elif array_type == 'RNASeq':
        filename = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_junction_comps.txt'
    fn = filepath(filename)
    probeset_gene_seq_db = {}
    added_probesets = {}
    pairwise_probesets = {}
    x = 0
    for line in open(fn, 'rU').xreadlines():
        data, newline = string.split(line, '\n')
        t = string.split(data, '\t')
        if x == 0: x = 1
        else:
            if (array_type == 'junction' or array_type == 'RNASeq'):
                array_gene, critical_exons, excl_junction, incl_junction, probeset2, probeset1, data_source = t
                array_ens_db[array_gene] = [array_gene]
            elif array_type == 'AltMouse':
                array_gene, probeset1, probeset2, critical_exons = t  #; critical_exons = string.split(critical_exons,'|')
            probesets = [probeset1, probeset2]
            pairwise_probesets[probeset1, probeset2] = []
            if array_gene in array_ens_db:
                ensembl_gene_ids = array_ens_db[array_gene]
                for probeset_id in probesets:
                    if probeset_id in probeset_seq_db:
                        probeset_seq, junction_seq = probeset_seq_db[
                            probeset_id]
                        if biotype == 'gene':
                            for ensembl_gene_id in ensembl_gene_ids:
                                if probeset_id not in added_probesets:
                                    probe_data = JunctionDataSimple(
                                        probeset_id, array_gene)
                                    probe_data.SetExonSeq(probeset_seq)
                                    probe_data.SetJunctionSeq(junction_seq)
                                    try:
                                        probeset_gene_seq_db[
                                            ensembl_gene_id].append(probe_data)
                                    except KeyError:
                                        probeset_gene_seq_db[
                                            ensembl_gene_id] = [probe_data]
                                    added_probesets[probeset_id] = []
                    elif array_type == 'RNASeq' and coordinateBasedMatching == True:  ### Coordinate matching as opposed to sequence
                        if biotype == 'gene':
                            for ensembl_gene_id in ensembl_gene_ids:
                                if probeset_id not in added_probesets:
                                    probe_data = JunctionDataSimple(
                                        probeset_id, array_gene)
                                    try:
                                        probeset_gene_seq_db[
                                            ensembl_gene_id].append(probe_data)
                                    except KeyError:
                                        probeset_gene_seq_db[
                                            ensembl_gene_id] = [probe_data]
                                    added_probesets[probeset_id] = []
    print len(probeset_gene_seq_db), "genes with probeset sequence associated"
    return probeset_gene_seq_db, pairwise_probesets
예제 #13
0
def importJunctionAnnotationDatabaseAndSequence(species, array_type, biotype):
    """Import array-gene/Ensembl relationships, junction probeset sequences and
    reciprocal junction comparisons.

    With the data stored by this function, probeset sequence can be matched to
    mRNAs to determine which combinations of probesets can be used as
    match-match or match-nulls.

    Parameters:
        species: species code used to build the 'AltDatabase/<species>/...' paths.
        array_type: 'AltMouse', 'junction' or 'RNASeq'; selects which files are
            read and how each comparison row is unpacked.
        biotype: probesets are only stored when this equals 'gene'.

    Returns:
        A tuple (probeset_gene_seq_db, pairwise_probesets):
        probeset_gene_seq_db maps a gene ID to a list of JunctionDataSimple
        objects; pairwise_probesets maps every (probeset1, probeset2) pair
        read from the comparison file to an empty list.

    The first line of each input file is treated as a header. A missing
    newline on the last line is tolerated and blank lines are skipped.
    """
    array_ens_db = {}
    if array_type == "AltMouse":
        ### Import AffyGene to Ensembl associations (e.g., AltMouse array)
        filename = "AltDatabase/" + species + "/" + array_type + "/" + array_type + "-Ensembl_relationships.txt"
        update.verifyFile(filename, array_type)  ### Will force download if missing
        relationship_file = open(filepath(filename), "rU")
        try:
            for row_number, line in enumerate(relationship_file):
                data = line.rstrip("\n")
                if row_number == 0 or not data:
                    continue  ### header row or blank line
                array_gene, ens_gene = data.split("\t")
                array_ens_db.setdefault(array_gene, []).append(ens_gene)
        finally:
            relationship_file.close()
        print("%d Ensembl-AltMouse relationships imported." % len(array_ens_db))

    ### With coordinate matching no probeset sequences are loaded
    coordinate_matching = array_type == "RNASeq" and coordinateBasedMatching == True
    if coordinate_matching:
        probeset_seq_db = {}
    else:
        filename = "AltDatabase/" + species + "/" + array_type + "/" + array_type + "_critical-junction-seq.txt"
        probeset_seq_db = importCriticalJunctionSeq(filename, species, array_type)

    ###Import reciprocal junctions, so we can compare these directly instead of hits to nulls and combine with sequence data
    ###This short-cuts what we did in two function in ExonModule with exon level data
    ### NOTE(review): for any other array_type, filename still points at the
    ### critical-junction-seq file and the row parsing below binds no fields.
    if array_type == "AltMouse":
        filename = "AltDatabase/" + species + "/" + array_type + "/" + array_type + "_junction-comparisons.txt"
        update.verifyFile(filename, array_type)  ### Will force download if missing
    elif array_type == "junction":
        filename = "AltDatabase/" + species + "/" + array_type + "/" + species + "_junction_comps_updated.txt"
    elif array_type == "RNASeq":
        filename = "AltDatabase/" + species + "/" + array_type + "/" + species + "_junction_comps.txt"

    probeset_gene_seq_db = {}
    added_probesets = {}
    pairwise_probesets = {}
    comparison_file = open(filepath(filename), "rU")
    try:
        for row_number, line in enumerate(comparison_file):
            data = line.rstrip("\n")
            if row_number == 0 or not data:
                continue  ### header row or blank line
            t = data.split("\t")
            if array_type == "junction" or array_type == "RNASeq":
                array_gene, critical_exons, excl_junction, incl_junction, probeset2, probeset1, data_source = t
                ### For these platforms the gene ID in the file is used directly
                array_ens_db[array_gene] = [array_gene]
            elif array_type == "AltMouse":
                array_gene, probeset1, probeset2, critical_exons = t
            pairwise_probesets[probeset1, probeset2] = []
            if array_gene not in array_ens_db:
                continue
            ensembl_gene_ids = array_ens_db[array_gene]
            for probeset_id in (probeset1, probeset2):
                if probeset_id in probeset_seq_db:
                    probeset_seq, junction_seq = probeset_seq_db[probeset_id]
                    has_sequence = True
                elif coordinate_matching:
                    ### Coordinate matching as opposed to sequence
                    has_sequence = False
                else:
                    continue
                if biotype != "gene":
                    continue
                for ensembl_gene_id in ensembl_gene_ids:
                    ### Each probeset is stored once: after the first gene it is
                    ### marked in added_probesets and later genes are not given it.
                    if probeset_id in added_probesets:
                        break
                    probe_data = JunctionDataSimple(probeset_id, array_gene)
                    if has_sequence:
                        probe_data.SetExonSeq(probeset_seq)
                        probe_data.SetJunctionSeq(junction_seq)
                    probeset_gene_seq_db.setdefault(ensembl_gene_id, []).append(probe_data)
                    added_probesets[probeset_id] = []
    finally:
        comparison_file.close()
    print("%d genes with probeset sequence associated" % len(probeset_gene_seq_db))
    return probeset_gene_seq_db, pairwise_probesets
# Example #14
# 0
def exportAltMouseExonSequence():
    """Export critical-exon and critical-junction sequence files for the AltMouse array.

    Reads, under AltDatabase/Mm/AltMouse/:
        AltMouse_junction-comparisons.txt, MASTER-probeset-transcript.txt,
        SEQUENCE-transcript-dbase.txt and probeset_sequence_reversed.txt.
    Writes, in the same directory:
        AltMouse_critical-exon-seq.txt and AltMouse_critical-junction-seq.txt.

    Finally prints two counters: the number of probe sequences found within
    their assembled junction sequence and the number not found.

    A probeset whose entry in the probe sequence file lists no sequences is
    not written to the junction export, since there is nothing to report
    for it.

    Returns None.
    """
    array_type = 'AltMouse'

    ### Critical exons and the probeset pairs that measure them
    critical_exon_import_file = 'AltDatabase/Mm/AltMouse/AltMouse_junction-comparisons.txt'
    update.verifyFile(critical_exon_import_file,array_type)
    critical_exon_db={}; critical_probesets={}
    ### NOTE(review): the first line is parsed as data here, whereas
    ### importJunctionAnnotationDatabaseAndSequence skips the first line of this same file - confirm.
    comparison_file = open(filepath(critical_exon_import_file),'rU')
    try:
        for line in comparison_file:
            gene,probeset1,probeset2,critical_exons = cleanUpLine(line).split('\t')
            probeset_pair = probeset1+'-'+probeset2
            for exon in critical_exons.split('|'):
                critical_exon_db.setdefault((gene,exon),[]).append(probeset_pair)
            critical_probesets[probeset1]=[]; critical_probesets[probeset2]=[]
    finally:
        comparison_file.close()

    ### Exon composition of every critical probeset
    probeset_exon_db={}
    probeset_annotations_file = "AltDatabase/Mm/AltMouse/MASTER-probeset-transcript.txt"
    update.verifyFile(probeset_annotations_file,array_type)
    annotation_file = open(filepath(probeset_annotations_file),'rU')
    try:
        for row_number,line in enumerate(annotation_file):
            if row_number == 0: continue  ### header row
            probeset_data = cleanUpLine(line)  #remove endline
            probeset,affygene,exons,transcript_num,transcripts,probe_type_call,ensembl,block_exon_ids,block_structure,comparison_info = probeset_data.split('\t')
            if probeset not in critical_probesets: continue
            ### The last character of both fields is dropped (presumably a trailing delimiter - confirm)
            exon_ids = exons[:-1].split('-')
            affygene = affygene[:-1]
            if '|' in exon_ids:
                ### `kill` looks like a deliberate abort via an undefined name - confirm
                print(exon_ids); kill
            probeset_exon_db[probeset,affygene]=exon_ids
    finally:
        annotation_file.close()

    exon_protein_sequence_file = "AltDatabase/Mm/AltMouse/SEQUENCE-transcript-dbase.txt"
    update.verifyFile(exon_protein_sequence_file,array_type)
    transcript_cdna_sequence_dbase,transcript_associations,exon_sequence_database = import_existing_sequence_build(exon_protein_sequence_file)

    ### Export the sequence of each critical exon
    critical_exon_seq_export = 'AltDatabase/Mm/AltMouse/AltMouse_critical-exon-seq.txt'
    update.verifyFile(critical_exon_seq_export,array_type)
    exon_export = open(filepath(critical_exon_seq_export),'w')
    try:
        exon_export.write('\t'.join(['Affygene:exon','critical_exon-num','critical-probeset-comps'])+'\n')
        for (gene,exon_num) in critical_exon_db:
            ###Restrict export to previously exported critical exons (ExonAnnotate_module)
            try: esd = exon_sequence_database[(gene,exon_num)]
            except KeyError: continue
            probeset_comp_list = '|'.join(critical_exon_db[(gene,exon_num)])
            exon_export.write('\t'.join([gene+':'+exon_num,probeset_comp_list,esd.ExonSeq()])+'\n')
    finally:
        exon_export.close()

    ### Probe sequences for each probeset (every column after the first)
    probeset_seq_file = 'AltDatabase/Mm/AltMouse/probeset_sequence_reversed.txt'
    update.verifyFile(probeset_seq_file,array_type)
    probeset_seq_db={}
    sequence_file = open(filepath(probeset_seq_file),'rU')
    try:
        for row_number,line in enumerate(sequence_file):
            if row_number == 0: continue  ### header row
            t = cleanUpLine(line).split('\t')
            probeset_seq_db[t[0]] = t[1:]
    finally:
        sequence_file.close()

    ### Export the probeset sequence and the junction (exon-exon) sequence it targets
    critical_junction_seq_export = 'AltDatabase/Mm/AltMouse/AltMouse_critical-junction-seq.txt'
    update.verifyFile(critical_junction_seq_export,array_type)
    found_count=0; missing_count=0
    junction_export = open(filepath(critical_junction_seq_export),'w')
    try:
        junction_export.write('\t'.join(['probeset','probeset-seq','junction-seq'])+'\n')
        for (probeset,gene) in probeset_exon_db:
            try: probeset_seq_list = probeset_seq_db[probeset]
            except KeyError: continue  ### no probe sequences for this probeset

            junction_seq = []; strand = None
            for exon_num in probeset_exon_db[(probeset,gene)]:
                ###Restrict export to previously exported critical exons (ExonAnnotate_module)
                try: esd = exon_sequence_database[(gene,exon_num)]
                except KeyError: continue
                exon_seq = esd.ExonSeq(); strand = esd.Strand()
                junction_seq.append(exon_seq)
            if not junction_seq: continue

            ### The strand of the last exon found decides the exon order
            if strand == '-': junction_seq.reverse()
            junction_seq_str = ''.join(junction_seq).upper()

            ### Locate each probe within the assembled junction sequence
            positions=[]
            for probeset_seq in probeset_seq_list:
                if probeset_seq in junction_seq_str:
                    positions.append((junction_seq_str.find(probeset_seq),len(probeset_seq)))
                    found_count+=1
                else:
                    missing_count+=1

            if positions:
                ### Span from the first matched probe start to the end of the last matched probe
                positions.sort()
                first_start = positions[0][0]
                last_start,last_length = positions[-1]
                new_probeset_seq = junction_seq_str[first_start:last_start+last_length]
            elif probeset_seq_list:
                new_probeset_seq = probeset_seq_list[0] ###pick the first probe sequence found
            else:
                ### No probe sequences at all: skip rather than reuse a value
                ### left over from a previous probeset.
                continue

            ### '|' indicates where the junction is
            junction_export.write('\t'.join([probeset,new_probeset_seq,'|'.join(junction_seq)])+'\n')
    finally:
        junction_export.close()
    print("%s %s" % (found_count,missing_count))