def _indexExonRegions(exon_db):
    """Index exon annotation records by (gene ID, exon region ID).

    exon_db maps probeset IDs to annotation objects exposing GeneID() and
    ExonRegionID(). A region ID containing '|' is treated as a list of
    regions. Records annotated to exactly one region take precedence;
    records from multi-region annotations are only used for regions that
    no single-region record covers.

    Returns a dictionary keyed by (geneid, exon_region_id) tuples whose
    values are the annotation objects.
    """
    multiple_exon_regions={}; unique_exon_regions={}
    for probeset in exon_db:
        annotation = exon_db[probeset]
        geneid = annotation.GeneID()
        region_id = annotation.ExonRegionID()
        if '|' in region_id:
            for exonid in region_id.split('|'):
                multiple_exon_regions[geneid,exonid] = annotation
        else:
            unique_exon_regions[geneid,region_id] = annotation
    ### Add missing exons to unique (never overrides a single-region record)
    for uid in multiple_exon_regions:
        unique_exon_regions.setdefault(uid,multiple_exon_regions[uid])
    return unique_exon_regions

def _mapJunctionsToExonProbesets(junction_exon_db,unique_exon_regions):
    """Map each junction/RNASeq probeset ID to the Probeset() of a matching exon record.

    junction_exon_db maps probeset IDs to annotation objects exposing
    GeneID() and ExonRegionID(); unique_exon_regions is the index built by
    _indexExonRegions. Probesets without any matching region are omitted.
    """
    junction_to_exonids={}
    for probeset in junction_exon_db:
        annotation = junction_exon_db[probeset]
        geneid = annotation.GeneID()
        region_id = annotation.ExonRegionID()
        if '|' in region_id:
            ### Multi-region IDs are looked up verbatim; if several regions match, the last match wins
            candidate_ids = region_id.split('|')
        else:
            ### Single-region IDs are looked up with '.' replaced by '-'
            ### NOTE(review): only this branch normalizes '.' to '-' - confirm the multi-region branch does not need it too
            candidate_ids = [region_id.replace('.','-')]
        for exonid in candidate_ids:
            if (geneid,exonid) in unique_exon_regions:
                junction_to_exonids[probeset] = unique_exon_regions[geneid,exonid].Probeset()
    return junction_to_exonids

def annotateJunctionIDsAsExon(species,array_type):
    """Write a table linking junction (or RNASeq) probeset IDs to exon probeset IDs.

    Loads the array_type annotation file and the 'exon' annotation file
    for the species through ExonSeqModule.importSplicingAnnotationDatabase,
    matches entries on (gene ID, exon region ID) and writes the result to
    AltDatabase/<species>/<array_type>/<species>_<array_type>-exon_probesets.txt
    as two tab-delimited columns with a header row.

    species -- species code used to build the AltDatabase paths
    array_type -- platform name; for 'RNASeq' the '_Ensembl_exons.txt' file
                  is read instead of the filtered junction probeset file

    Returns None; the only result is the written file.
    """
    import ExonSeqModule
    probeset_annotations_file = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_Ensembl_junction_probesets-filtered.txt'
    if array_type == 'RNASeq':
        probeset_annotations_file = probeset_annotations_file.replace('junction_probesets-filtered','exons')
    junction_exon_db = ExonSeqModule.importSplicingAnnotationDatabase(probeset_annotations_file,array_type)
    probeset_annotations_file = 'AltDatabase/'+species+'/exon/'+species+'_Ensembl_probesets.txt'
    exon_db = ExonSeqModule.importSplicingAnnotationDatabase(probeset_annotations_file,array_type)

    ### Extract unique exon regions from Exon Array annotations
    unique_exon_regions = _indexExonRegions(exon_db)

    ### Extract unique exon regions from Junction Array annotation
    junction_to_exonids = _mapJunctionsToExonProbesets(junction_exon_db,unique_exon_regions)

    output_file = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_'+array_type+'-exon_probesets.txt'
    ### The with-block closes the file even if one of the writes fails
    with open(filepath(output_file),'w') as data:
        data.write(array_type+'_probeset\texon_probeset\n')
        for probeset in junction_to_exonids:
            data.write(probeset+'\t'+junction_to_exonids[probeset]+'\n')
### Example #2
### 0
def getParametersAndExecute(probeset_seq_file,array_type,species,data_type):
    """Import probeset sequences for the requested analysis mode and report the elapsed time.

    probeset_seq_file -- sequence file handed to importProbesetSeqeunces
                         (only used when data_type is 'critical-exons')
    array_type -- platform name; selects the annotation file ('RNASeq' reads
                  '<species>_Ensembl_exons.txt', anything else reads
                  '<species>_Ensembl_<array_type>_probesets.txt')
    species -- species code used to build the AltDatabase path
    data_type -- 'critical-exons' or 'junctions'

    Returns the object produced by importProbesetSeqeunces ('critical-exons')
    or importSplicingAnnotationDatabaseAndSequence ('junctions').

    Raises ValueError for any other data_type.
    """
    if data_type not in ('critical-exons','junctions'):
        ### Without this guard an unknown mode reached the final print and failed with an UnboundLocalError on time_diff
        raise ValueError("Unsupported data_type %r (expected 'critical-exons' or 'junctions')" % (data_type,))
    if data_type == 'critical-exons':
        if array_type == 'RNASeq': probeset_annotations_file = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_Ensembl_exons.txt'
        else: probeset_annotations_file = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_Ensembl_'+array_type+'_probesets.txt'
        ###Import probe-level associations (not included in the reported time)
        exon_db = ExonSeqModule.importSplicingAnnotationDatabase(probeset_annotations_file,array_type)
        start_time = time.time()
        probeset_seq_db = importProbesetSeqeunces(probeset_seq_file,exon_db,species)  ###Do this locally with a function that works on tab-delimited as opposed to fasta sequences (exon array)
    else:
        start_time = time.time(); biotype = 'gene' ### Indicates whether to store information at the level of genes or probesets
        probeset_seq_db = importSplicingAnnotationDatabaseAndSequence(species,array_type,biotype)
    time_diff = int(time.time()-start_time)
    ### Parenthesized single-argument form prints the same text under Python 2
    print("Analyses finished in %d seconds" % time_diff)
    return probeset_seq_db
### Example #3
### 0
def getParametersAndExecute(probeset_seq_file, array_type, species, data_type):
    """Load the probeset sequence database for one analysis mode and print the run time.

    probeset_seq_file -- sequence file passed to importProbesetSeqeunces
                         (ignored when data_type is 'junctions')
    array_type -- platform name used to pick the annotation file
    species -- species code used to build the AltDatabase path
    data_type -- either 'critical-exons' or 'junctions'

    Returns the database returned by importProbesetSeqeunces (for
    'critical-exons') or by importSplicingAnnotationDatabaseAndSequence
    (for 'junctions').

    Raises ValueError when data_type is not one of the two supported modes.
    """
    supported_data_types = ('critical-exons', 'junctions')
    if data_type not in supported_data_types:
        ### Fail early with a clear message; previously this case surfaced as an
        ### UnboundLocalError on time_diff at the final print
        raise ValueError(
            "Unsupported data_type %r (expected one of %s)" %
            (data_type, ', '.join(supported_data_types)))

    if data_type == 'critical-exons':
        annotation_dir = 'AltDatabase/' + species + '/' + array_type + '/'
        if array_type == 'RNASeq':
            probeset_annotations_file = annotation_dir + species + '_Ensembl_exons.txt'
        else:
            probeset_annotations_file = annotation_dir + species + '_Ensembl_' + array_type + '_probesets.txt'
        ###Import probe-level associations (the timer starts only after this import)
        exon_db = ExonSeqModule.importSplicingAnnotationDatabase(
            probeset_annotations_file, array_type)
        start_time = time.time()
        probeset_seq_db = importProbesetSeqeunces(
            probeset_seq_file, exon_db, species
        )  ###Do this locally with a function that works on tab-delimited as opposed to fasta sequences (exon array)
    else:
        start_time = time.time()
        biotype = 'gene'  ### Indicates whether to store information at the level of genes or probesets
        probeset_seq_db = importSplicingAnnotationDatabaseAndSequence(
            species, array_type, biotype)

    time_diff = int(time.time() - start_time)
    ### Parenthesized single-argument form prints the same text under Python 2
    print("Analyses finished in %d seconds" % time_diff)
    return probeset_seq_db