Exemplo n.º 1
0
def runProgram(Species,Array_type,mir_source,stringency,Force):
    global species; global array_type; global force
    process_microRNA_predictions = 'yes'

    species = Species; array_type = Array_type; force = Force
    
    import_dir = '/AltDatabase/'+species+'/'+array_type
    filedir = import_dir[1:]+'/'
    dir_list = read_directory(import_dir)  #send a sub_directory to a function to identify all files in a directory
    probeset_seq_file=''
    for input_file in dir_list:    #loop through each file in the directory to  results
        if 'critical-exon-seq_updated' in input_file: probeset_seq_file = filedir+input_file
        elif 'critical-exon-seq' in input_file: probeset_seq_file2 = filedir+input_file
    if len(probeset_seq_file)==0: probeset_seq_file=probeset_seq_file2
        
    data_type = 'critical-exons'
    try: splice_event_db = getParametersAndExecute(probeset_seq_file,array_type,species,data_type)
    except UnboundLocalError:
        probeset_seq_file = 'AltDatabase/'+species+'/'+array_type+'/'+array_type+'_critical-exon-seq_updated.txt'
        update.downloadCurrentVersion(probeset_seq_file,array_type,'txt')
        splice_event_db = getParametersAndExecute(probeset_seq_file,array_type,species,data_type)
        
    if process_microRNA_predictions == 'yes':
        print 'stringency:',stringency
        try:
            ensembl_mirna_db = ExonSeqModule.importmiRNATargetPredictionsAdvanced(species)
            ExonSeqModule.alignmiRNAData(array_type,mir_source,species,stringency,ensembl_mirna_db,splice_event_db)
        except Exception: pass
def annotateJunctionIDsAsExon(species,array_type):
    """Write a two-column table linking junction-array (or RNASeq) probesets to exon-array probesets.

    Probesets from the array_type annotation file are matched to probesets from
    the species exon-array annotation file through a shared (gene ID, exon
    region ID) key. The matches are written, tab-delimited with a header row,
    to AltDatabase/<species>/<array_type>/<species>_<array_type>-exon_probesets.txt.
    Returns None.
    """
    import ExonSeqModule
    probeset_annotations_file = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_Ensembl_junction_probesets-filtered.txt'
    if array_type == 'RNASeq':
        probeset_annotations_file = probeset_annotations_file.replace('junction_probesets-filtered','exons')
    junction_exon_db = ExonSeqModule.importSplicingAnnotationDatabase(probeset_annotations_file,array_type)
    probeset_annotations_file = 'AltDatabase/'+species+'/exon/'+species+'_Ensembl_probesets.txt'
    exon_db = ExonSeqModule.importSplicingAnnotationDatabase(probeset_annotations_file,array_type)

    ### Extract unique exon regions from Exon Array annotations
    multiple_exon_regions={}; unique_exon_regions={}
    for probeset in exon_db:
        exon_entry = exon_db[probeset]
        geneid = exon_entry.GeneID()
        region_id = exon_entry.ExonRegionID()
        if '|' in region_id:
            for exonid in region_id.split('|'): multiple_exon_regions[geneid,exonid] = exon_entry
        else:
            unique_exon_regions[geneid,region_id] = exon_entry
    ### Add missing exons to unique (entries from single-region probesets take precedence)
    for uid in multiple_exon_regions:
        unique_exon_regions.setdefault(uid,multiple_exon_regions[uid])

    ### Extract unique exon regions from Junction Array annotation
    ### (the hard-coded debug prints for one mouse gene that used to live in this loop were removed)
    junction_to_exonids={}
    for probeset in junction_exon_db:
        junction_entry = junction_exon_db[probeset]
        geneid = junction_entry.GeneID()
        region_id = junction_entry.ExonRegionID()
        if '|' in region_id:
            ### Several regions: the last region that has an exon-array match wins
            for exonid in region_id.split('|'):
                if (geneid,exonid) in unique_exon_regions:
                    junction_to_exonids[probeset] = unique_exon_regions[geneid,exonid].Probeset()
        else:
            ### Single region: the lookup key uses '-' where the junction annotation uses '.'
            region_key = (geneid,region_id.replace('.','-'))
            if region_key in unique_exon_regions:
                junction_to_exonids[probeset] = unique_exon_regions[region_key].Probeset()

    output_file = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_'+array_type+'-exon_probesets.txt'
    fn=filepath(output_file); data = open(fn,'w')
    try:
        data.write(array_type+'_probeset\texon_probeset\n')
        for probeset in junction_to_exonids:
            data.write(probeset+'\t'+junction_to_exonids[probeset]+'\n')
    finally:
        data.close() ### Close the handle even if a write fails
Exemplo n.º 3
0
def runProgram(Species, Array_type, mir_source, stringency, Force):
    global species
    global array_type
    global force
    process_microRNA_predictions = 'yes'

    species = Species
    array_type = Array_type
    force = Force

    import_dir = '/AltDatabase/' + species + '/' + array_type
    filedir = import_dir[1:] + '/'
    dir_list = read_directory(
        import_dir
    )  #send a sub_directory to a function to identify all files in a directory
    probeset_seq_file = ''
    for input_file in dir_list:  #loop through each file in the directory to  results
        if 'critical-exon-seq_updated' in input_file:
            probeset_seq_file = filedir + input_file
        elif 'critical-exon-seq' in input_file:
            probeset_seq_file2 = filedir + input_file
    if len(probeset_seq_file) == 0: probeset_seq_file = probeset_seq_file2

    data_type = 'critical-exons'
    try:
        splice_event_db = getParametersAndExecute(probeset_seq_file,
                                                  array_type, species,
                                                  data_type)
    except UnboundLocalError:
        probeset_seq_file = 'AltDatabase/' + species + '/' + array_type + '/' + array_type + '_critical-exon-seq_updated.txt'
        update.downloadCurrentVersion(probeset_seq_file, array_type, 'txt')
        splice_event_db = getParametersAndExecute(probeset_seq_file,
                                                  array_type, species,
                                                  data_type)

    if process_microRNA_predictions == 'yes':
        print 'stringency:', stringency
        try:
            ensembl_mirna_db = ExonSeqModule.importmiRNATargetPredictionsAdvanced(
                species)
            ExonSeqModule.alignmiRNAData(array_type, mir_source, species,
                                         stringency, ensembl_mirna_db,
                                         splice_event_db)
        except Exception:
            pass
Exemplo n.º 4
0
def getParametersAndExecute(probeset_seq_file,array_type,species,data_type):
    if data_type == 'critical-exons':
        if array_type == 'RNASeq': probeset_annotations_file = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_Ensembl_exons.txt'
        else: probeset_annotations_file = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_Ensembl_'+array_type+'_probesets.txt'
        ###Import probe-level associations
        exon_db = ExonSeqModule.importSplicingAnnotationDatabase(probeset_annotations_file,array_type)
        start_time = time.time()
        probeset_seq_db = importProbesetSeqeunces(probeset_seq_file,exon_db,species)  ###Do this locally with a function that works on tab-delimited as opposed to fasta sequences (exon array)
        end_time = time.time(); time_diff = int(end_time-start_time)
    elif data_type == 'junctions':
        start_time = time.time(); biotype = 'gene' ### Indicates whether to store information at the level of genes or probesets
        probeset_seq_db = importSplicingAnnotationDatabaseAndSequence(species,array_type,biotype)
        end_time = time.time(); time_diff = int(end_time-start_time)
    print "Analyses finished in %d seconds" % time_diff
    return probeset_seq_db
Exemplo n.º 5
0
def getParametersAndExecute(probeset_seq_file, array_type, species, data_type):
    if data_type == 'critical-exons':
        if array_type == 'RNASeq':
            probeset_annotations_file = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_Ensembl_exons.txt'
        else:
            probeset_annotations_file = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_Ensembl_' + array_type + '_probesets.txt'
        ###Import probe-level associations
        exon_db = ExonSeqModule.importSplicingAnnotationDatabase(
            probeset_annotations_file, array_type)
        start_time = time.time()
        probeset_seq_db = importProbesetSeqeunces(
            probeset_seq_file, exon_db, species
        )  ###Do this locally with a function that works on tab-delimited as opposed to fasta sequences (exon array)
        end_time = time.time()
        time_diff = int(end_time - start_time)
    elif data_type == 'junctions':
        start_time = time.time()
        biotype = 'gene'  ### Indicates whether to store information at the level of genes or probesets
        probeset_seq_db = importSplicingAnnotationDatabaseAndSequence(
            species, array_type, biotype)
        end_time = time.time()
        time_diff = int(end_time - start_time)
    print "Analyses finished in %d seconds" % time_diff
    return probeset_seq_db
Exemplo n.º 6
0
def executeParameters(species,array_type,force,genomic_build,update_uniprot,update_ensembl,update_probeset_to_ensembl,update_domain,update_miRs,update_all,update_miR_seq,ensembl_version):    
    """Run the database build steps selected by the 'yes'/'no' update flags for one species and array type.

    array_type may be given as 'type|subtype'; the part after '|' is kept as
    specific_array_type. update_all == 'yes' switches on the uniprot, ensembl,
    probeset_to_ensembl, domain and miRs flags (update_miR_seq is left as passed).
    The junction/RNASeq annotation steps, the miRNA file copy and the
    probeset-probe file verification near the end run regardless of the flags.
    Calls sys.exit() when the junction/RNASeq annotation steps raise IOError.
    Returns None.
    """
    if '|' in array_type: array_type, specific_array_type = string.split(array_type,'|') ### To distinguish between array sub-types, like the HJAY and hGlue
    else: specific_array_type = array_type
    
    ### update_all turns on every build step except the miRNA sequence refresh (update_miR_seq)
    if update_all == 'yes':
        update_uniprot='yes'; update_ensembl='yes'; update_probeset_to_ensembl='yes'; update_domain='yes'; update_miRs = 'yes'
        
    if update_ensembl == 'yes':
        import EnsemblSQL; reload(EnsemblSQL)

        """ Used to grab all essential Ensembl annotations previously obtained via BioMart"""        
        configType = 'Advanced'; analysisType = 'AltAnalyzeDBs'; externalDBName = ''
        EnsemblSQL.buildEnsemblRelationalTablesFromSQL(species,configType,analysisType,externalDBName,ensembl_version,force)
        
        """ Used to grab Ensembl-to-External gene associations"""
        configType = 'Basic'; analysisType = 'ExternalOnly'; externalDBName = 'Uniprot/SWISSPROT'
        EnsemblSQL.buildEnsemblRelationalTablesFromSQL(species,configType,analysisType,externalDBName,ensembl_version,force)
        
        """ Used to grab Ensembl full gene sequence plus promoter and 3'UTRs """
        if array_type == 'AltMouse' or array_type == 'junction' or array_type == 'RNASeq':
            EnsemblSQL.getFullGeneSequences(ensembl_version,species)
            
    if update_uniprot == 'yes':            
        ###Might need to delete the existing versions of downloaded databases or force download
        buildUniProtFunctAnnotations(species,force)
                
    ### Build probeset/exon annotations with the builder that matches the platform
    if update_probeset_to_ensembl == 'yes':
        if species == 'Mm' and array_type == 'AltMouse':
            buildAltMouseExonAnnotations(species,array_type,force,genomic_build)
        elif array_type == 'junction':
            buildJunctionExonAnnotations(species,array_type,specific_array_type,force,genomic_build)
        elif array_type == 'RNASeq':
            import RNASeq; test_status = 'no'; data_type = 'mRNA'
            RNASeq.getEnsemblAssociations(species,data_type,test_status,force)
        else: buildExonArrayExonAnnotations(species,array_type,force)

    if update_domain == 'yes':

        ### Get UCSC associations for all Ensembl linked genes (download databases if necessary)
        ### NOTE(review): this comment line originally ended with "if species == 'Mm' and array_type == 'AltMouse':" - possibly a condition that was commented out; confirm
        mRNA_Type = 'mrna'; run_from_scratch = 'yes'
        export_all_associations = 'yes' ### YES only for protein prediction analysis
        buildUCSCAnnoationFiles(species,mRNA_Type,export_all_associations,run_from_scratch,force)

        if (species == 'Mm' and array_type == 'AltMouse'):
            """Imports and re-exports array-Ensembl annotations"""
            import JunctionArray
            null = JunctionArray.importArrayAnnotations(species,array_type); null={}
        if (species == 'Mm' and array_type == 'AltMouse') or array_type == 'junction' or array_type == 'RNASeq':
            """Performs probeset sequence aligment to Ensembl and UCSC transcripts. To do: Need to setup download if files missing"""
            import mRNASeqAlign; analysis_type = 'reciprocal'
            mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,analysis_type,force)
       
        import IdentifyAltIsoforms; run_seqcomp = 'no'
        IdentifyAltIsoforms.runProgram(species,array_type,'null',force,run_seqcomp)
        import FeatureAlignment; import JunctionArray
        FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'null')
        
        ### mRNASeqAlign is only imported by the branch above; this condition is a subset of that one, so the name is bound here
        if array_type == 'junction' or array_type == 'RNASeq':
            ### For junction probeset sequences from mRNASeqAlign(), find and assess alternative proteins - export to the folder 'junction'
            mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,'single',force)
            IdentifyAltIsoforms.runProgram(species,array_type,'junction',force,run_seqcomp)
            FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'junction')
            ### For exon probesets (and junction exons) align and assess alternative proteins - export to the folder 'exon'
            IdentifyAltIsoforms.runProgram(species,array_type,'exon',force,run_seqcomp)
            # FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'exon') # not needed
            
            """ Repeat above with CoordinateBasedMatching = True """ 
            ### Perform coordinate based junction mapping to transcripts (requires certain sequence files built in IdentifyAltIsoforms)
            analysis_type = 'reciprocal'
            mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,analysis_type,force,CoordinateBasedMatching = True)
            IdentifyAltIsoforms.runProgram(species,array_type,'null',force,run_seqcomp)
            FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'null')
            mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,'single',force,CoordinateBasedMatching = True)
            IdentifyAltIsoforms.runProgram(species,array_type,'junction',force,run_seqcomp)
            FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'junction')
            IdentifyAltIsoforms.runProgram(species,array_type,'exon',force,run_seqcomp)
            if array_type == 'RNASeq':
                JunctionArray.combineExonJunctionAnnotations(species,array_type)
                
    if update_miRs == 'yes':
        if update_miR_seq == 'yes':
            import MatchMiRTargetPredictions; only_add_sequence_to_previous_results = 'no'
            MatchMiRTargetPredictions.runProgram(species,force,only_add_sequence_to_previous_results)

        ### The miRNA alignment is run twice, once per stringency ('strict' then 'lax')
        if array_type == 'exon' or array_type == 'gene':        
            import ExonSeqModule
            stringency = 'strict'; process_microRNA_predictions = 'yes'; mir_source = 'multiple'
            ExonSeqModule.runProgram(species,array_type,process_microRNA_predictions,mir_source,stringency)
            stringency = 'lax'
            ExonSeqModule.runProgram(species,array_type,process_microRNA_predictions,mir_source,stringency)
            ### NOTE(review): ExonArray is not imported inside this function; presumably a module-level import - confirm
            ExonArray.exportMetaProbesets(array_type,species) ### Export metaprobesets for this build
        else:
            import JunctionSeqModule
            stringency = 'strict'; mir_source = 'multiple'
            JunctionSeqModule.runProgram(species,array_type,mir_source,stringency,force)
            stringency = 'lax'
            JunctionSeqModule.runProgram(species,array_type,mir_source,stringency,force)

    ### The steps below are not guarded by any update flag
    if array_type == 'junction':
        try:
            import JunctionArray; import JunctionArrayEnsemblRules
            JunctionArray.filterForCriticalExons(species,array_type)
            JunctionArray.overRideExonEntriesWithJunctions(species,array_type)
            JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(species,array_type)
            ExonArray.exportMetaProbesets(array_type,species) ### Export metaprobesets for this build
        except IOError: print 'No built junction files to analyze';sys.exit()
    if array_type == 'RNASeq' and (species == 'Hs' or species == 'Mm' or species == 'Rn'):
        import JunctionArray; import JunctionArrayEnsemblRules
        try: JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(species,array_type)
        except IOError: print 'No Ensembl_exons.txt file to analyze';sys.exit()
    
    ### Best-effort copy of the combined miRNA gene-target file to the 'ensembl/<species>' location; any failure is ignored
    try:
        filename = 'AltDatabase/'+species+'/SequenceData/miRBS-combined_gene-targets.txt'; ef=filepath(filename)
        er = string.replace(ef,species+'/SequenceData/miRBS-combined_gene-targets.txt','ensembl/'+species+'/'+species+'_microRNA-Ensembl.txt')
        import shutil; shutil.copyfile(ef,er)
    except Exception: null=[]
    if array_type != 'RNASeq':
        ### Get the probeset-probe relationships from online - needed for FIRMA analysis
        filename = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_probeset-probes.txt'
        ### Sub-types whose name contains 'lue' (e.g. hGlue) are verified against the 'junction/hGlue' server folder
        if array_type == 'junction' and 'lue' in specific_array_type:
            server_folder = 'junction/hGlue'
            verifyFile(filename,server_folder) ### Will force download if missing
            verifyFile('AltDatabase/'+species+'/'+array_type+'/platform.txt',server_folder) ### Will force download if missing
        elif array_type != 'AltMouse': verifyFile(filename,array_type) ### Will force download if missing
        if (array_type == 'exon' or array_type == 'AltMouse') and species != 'Rn':
            try:
                ### Available for select exon-arrays and AltMouse
                probeset_to_remove_file = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_probes_to_remove.txt'
                verifyFile(probeset_to_remove_file,array_type)
            except Exception: null=[]
Exemplo n.º 7
0
def executeParameters(species, array_type, force, genomic_build,
                      update_uniprot, update_ensembl,
                      update_probeset_to_ensembl, update_domain, update_miRs,
                      update_all, update_miR_seq, ensembl_version):
    """Run the database build steps selected by the 'yes'/'no' update flags for one species and array type.

    array_type may be given as 'type|subtype'; the part after '|' is kept as
    specific_array_type. update_all == 'yes' switches on the uniprot, ensembl,
    probeset_to_ensembl, domain and miRs flags (update_miR_seq is left as passed).
    The junction/RNASeq annotation steps, the miRNA file copy and the
    probeset-probe file verification near the end run regardless of the flags.
    Calls sys.exit() when the junction/RNASeq annotation steps raise IOError.
    Returns None.
    """
    if '|' in array_type:
        array_type, specific_array_type = string.split(
            array_type, '|'
        )  ### To distinguish between array sub-types, like the HJAY and hGlue
    else:
        specific_array_type = array_type

    ### update_all turns on every build step except the miRNA sequence refresh (update_miR_seq)
    if update_all == 'yes':
        update_uniprot = 'yes'
        update_ensembl = 'yes'
        update_probeset_to_ensembl = 'yes'
        update_domain = 'yes'
        update_miRs = 'yes'

    if update_ensembl == 'yes':
        import EnsemblSQL
        reload(EnsemblSQL)
        """ Used to grab all essential Ensembl annotations previously obtained via BioMart"""
        configType = 'Advanced'
        analysisType = 'AltAnalyzeDBs'
        externalDBName = ''
        EnsemblSQL.buildEnsemblRelationalTablesFromSQL(species, configType,
                                                       analysisType,
                                                       externalDBName,
                                                       ensembl_version, force)
        """ Used to grab Ensembl-to-External gene associations"""
        configType = 'Basic'
        analysisType = 'ExternalOnly'
        externalDBName = 'Uniprot/SWISSPROT'
        EnsemblSQL.buildEnsemblRelationalTablesFromSQL(species, configType,
                                                       analysisType,
                                                       externalDBName,
                                                       ensembl_version, force)
        """ Used to grab Ensembl full gene sequence plus promoter and 3'UTRs """
        if array_type == 'AltMouse' or array_type == 'junction' or array_type == 'RNASeq':
            EnsemblSQL.getFullGeneSequences(ensembl_version, species)

    if update_uniprot == 'yes':
        ###Might need to delete the existing versions of downloaded databases or force download
        buildUniProtFunctAnnotations(species, force)

    ### Build probeset/exon annotations with the builder that matches the platform
    if update_probeset_to_ensembl == 'yes':
        if species == 'Mm' and array_type == 'AltMouse':
            buildAltMouseExonAnnotations(species, array_type, force,
                                         genomic_build)
        elif array_type == 'junction':
            buildJunctionExonAnnotations(species, array_type,
                                         specific_array_type, force,
                                         genomic_build)
        elif array_type == 'RNASeq':
            import RNASeq
            test_status = 'no'
            data_type = 'mRNA'
            RNASeq.getEnsemblAssociations(species, data_type, test_status,
                                          force)
        else:
            buildExonArrayExonAnnotations(species, array_type, force)

    if update_domain == 'yes':

        ### Get UCSC associations for all Ensembl linked genes (download databases if necessary)
        ### NOTE(review): this comment line originally ended with "if species == 'Mm' and array_type == 'AltMouse':" - possibly a condition that was commented out; confirm
        mRNA_Type = 'mrna'
        run_from_scratch = 'yes'
        export_all_associations = 'yes'  ### YES only for protein prediction analysis
        buildUCSCAnnoationFiles(species, mRNA_Type, export_all_associations,
                                run_from_scratch, force)

        if (species == 'Mm' and array_type == 'AltMouse'):
            """Imports and re-exports array-Ensembl annotations"""
            import JunctionArray
            null = JunctionArray.importArrayAnnotations(species, array_type)
            null = {}
        if (species == 'Mm' and array_type == 'AltMouse'
            ) or array_type == 'junction' or array_type == 'RNASeq':
            """Performs probeset sequence aligment to Ensembl and UCSC transcripts. To do: Need to setup download if files missing"""
            import mRNASeqAlign
            analysis_type = 'reciprocal'
            mRNASeqAlign.alignProbesetsToTranscripts(species, array_type,
                                                     analysis_type, force)

        import IdentifyAltIsoforms
        run_seqcomp = 'no'
        IdentifyAltIsoforms.runProgram(species, array_type, 'null', force,
                                       run_seqcomp)
        import FeatureAlignment
        FeatureAlignment.findDomainsByGenomeCoordinates(
            species, array_type, 'null')

        ### mRNASeqAlign is only imported by the branch above; this condition is a subset of that one, so the name is bound here
        if array_type == 'junction' or array_type == 'RNASeq':
            ### For junction probeset sequences from mRNASeqAlign(), find and assess alternative proteins - export to the folder 'junction'
            mRNASeqAlign.alignProbesetsToTranscripts(species, array_type,
                                                     'single', force)
            IdentifyAltIsoforms.runProgram(species, array_type, 'junction',
                                           force, run_seqcomp)
            FeatureAlignment.findDomainsByGenomeCoordinates(
                species, array_type, 'junction')
            ### This inner test repeats the enclosing condition, so it is always true at this point
            if array_type == 'junction' or array_type == 'RNASeq':
                ### For exon probesets (and junction exons) align and assess alternative proteins - export to the folder 'exon'
                IdentifyAltIsoforms.runProgram(species, array_type, 'exon',
                                               force, run_seqcomp)
                # FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'exon') # not needed
                if array_type == 'RNASeq':
                    import JunctionArray
                    JunctionArray.combineExonJunctionAnnotations(
                        species, array_type)

    if update_miRs == 'yes':
        if update_miR_seq == 'yes':
            import MatchMiRTargetPredictions
            only_add_sequence_to_previous_results = 'no'
            MatchMiRTargetPredictions.runProgram(
                species, force, only_add_sequence_to_previous_results)

        ### The miRNA alignment is run twice, once per stringency ('strict' then 'lax')
        if array_type == 'exon' or array_type == 'gene':
            import ExonSeqModule
            stringency = 'strict'
            process_microRNA_predictions = 'yes'
            mir_source = 'multiple'
            ExonSeqModule.runProgram(species, array_type,
                                     process_microRNA_predictions, mir_source,
                                     stringency)
            stringency = 'lax'
            ExonSeqModule.runProgram(species, array_type,
                                     process_microRNA_predictions, mir_source,
                                     stringency)
            ### NOTE(review): ExonArray is not imported inside this function; presumably a module-level import - confirm
            ExonArray.exportMetaProbesets(
                array_type, species)  ### Export metaprobesets for this build
        else:
            import JunctionSeqModule
            stringency = 'strict'
            mir_source = 'multiple'
            JunctionSeqModule.runProgram(species, array_type, mir_source,
                                         stringency, force)
            stringency = 'lax'
            JunctionSeqModule.runProgram(species, array_type, mir_source,
                                         stringency, force)

    ### The steps below are not guarded by any update flag
    if array_type == 'junction':
        try:
            import JunctionArray
            import JunctionArrayEnsemblRules
            JunctionArray.filterForCriticalExons(species, array_type)
            JunctionArray.overRideExonEntriesWithJunctions(species, array_type)
            JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(
                species, array_type)
            ExonArray.exportMetaProbesets(
                array_type, species)  ### Export metaprobesets for this build
        except IOError:
            print 'No built junction files to analyze'
            sys.exit()
    if array_type == 'RNASeq' and (species == 'Hs' or species == 'Mm'
                                   or species == 'Rn'):
        import JunctionArray
        import JunctionArrayEnsemblRules
        try:
            JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(
                species, array_type)
        except IOError:
            print 'No Ensembl_exons.txt file to analyze'
            sys.exit()

    ### Best-effort copy of the combined miRNA gene-target file to the 'ensembl/<species>' location; any failure is ignored
    try:
        filename = 'AltDatabase/' + species + '/SequenceData/miRBS-combined_gene-targets.txt'
        ef = filepath(filename)
        er = string.replace(
            ef, species + '/SequenceData/miRBS-combined_gene-targets.txt',
            'ensembl/' + species + '/' + species + '_microRNA-Ensembl.txt')
        import shutil
        shutil.copyfile(ef, er)
    except Exception:
        null = []
    if array_type != 'RNASeq':
        ### Get the probeset-probe relationships from online - needed for FIRMA analysis
        filename = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_probeset-probes.txt'
        ### Sub-types whose name contains 'lue' (e.g. hGlue) are verified against the 'junction/hGlue' server folder
        if array_type == 'junction' and 'lue' in specific_array_type:
            server_folder = 'junction/hGlue'
            verifyFile(filename,
                       server_folder)  ### Will force download if missing
            verifyFile('AltDatabase/' + species + '/' + array_type +
                       '/platform.txt',
                       server_folder)  ### Will force download if missing
        elif array_type != 'AltMouse':
            verifyFile(filename,
                       array_type)  ### Will force download if missing
        if (array_type == 'exon'
                or array_type == 'AltMouse') and species != 'Rn':
            try:
                ### Available for select exon-arrays and AltMouse
                probeset_to_remove_file = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_probes_to_remove.txt'
                verifyFile(probeset_to_remove_file, array_type)
            except Exception:
                null = []
Exemplo n.º 8
0
def importSplicingAnnotationDatabaseAndSequence(species, array_type, biotype):
    array_ens_db = {}
    if array_type == 'AltMouse':
        filename = 'AltDatabase/' + species + '/' + array_type + '/' + array_type + '-Ensembl_relationships.txt'
        update.verifyFile(filename,
                          array_type)  ### Will force download if missing
        fn = filepath(filename)
        x = 0
        for line in open(fn, 'r').xreadlines():
            data, newline = string.split(line, '\n')
            t = string.split(data, '\t')
            if x == 0: x = 1
            else:
                array_gene, ens_gene = t
                try:
                    array_ens_db[array_gene].append(ens_gene)
                except KeyError:
                    array_ens_db[array_gene] = [ens_gene]

    filename = 'AltDatabase/' + species + '/' + array_type + '/' + array_type + '_critical-junction-seq.txt'
    fn = filepath(filename)
    probeset_seq_db = {}
    x = 0
    for line in open(fn, 'r').xreadlines():
        data, newline = string.split(line, '\n')
        t = string.split(data, '\t')
        if x == 0: x = 1
        else:
            probeset, probeset_seq, junction_seq = t
            junction_seq = string.replace(junction_seq, '|', '')
            probeset_seq_db[probeset] = probeset_seq, junction_seq

    ###Import reciprocol junctions, so we can compare these directly instead of hits to nulls and combine with sequence data
    ###This short-cuts what we did in two function in ExonSeqModule with exon level data
    filename = 'AltDatabase/' + species + '/' + array_type + '/' + array_type + '_junction-comparisons.txt'
    fn = filepath(filename)
    probeset_gene_seq_db = {}
    x = 0
    for line in open(fn, 'r').xreadlines():
        data, newline = string.split(line, '\n')
        t = string.split(data, '\t')
        if x == 0: x = 1
        else:
            array_gene, probeset1, probeset2, critical_exons = t  #; critical_exons = string.split(critical_exons,'|')
            probesets = [probeset1, probeset2]
            if array_type == 'junction' or array_type == 'RNASeq':
                array_ens_db[array_gene] = [array_gene]
            if array_gene in array_ens_db:
                ensembl_gene_ids = array_ens_db[array_gene]
                for probeset_id in probesets:
                    if probeset_id in probeset_seq_db:
                        probeset_seq, junction_seq = probeset_seq_db[
                            probeset_id]
                        if biotype == 'gene':
                            for ensembl_gene_id in ensembl_gene_ids:
                                probe_data = ExonSeqModule.JunctionDataSimple(
                                    probeset_id, ensembl_gene_id, array_gene,
                                    probesets, critical_exons)
                                probe_data.SetExonSeq(probeset_seq)
                                probe_data.SetJunctionSeq(junction_seq)
                                try:
                                    probeset_gene_seq_db[
                                        ensembl_gene_id].append(probe_data)
                                except KeyError:
                                    probeset_gene_seq_db[ensembl_gene_id] = [
                                        probe_data
                                    ]
                        else:  ### Used for probeset annotations downstream of sequence alignment in LinkEST, analagous to exon_db for exon analyses
                            probe_data = ExonSeqModule.JunctionDataSimple(
                                probeset_id, ensembl_gene_ids, array_gene,
                                probesets, critical_exons)
                            probe_data.SetExonSeq(probeset_seq)
                            probe_data.SetJunctionSeq(junction_seq)
                            probeset_gene_seq_db[probeset_id] = probe_data
    print len(probeset_gene_seq_db), "genes with probeset sequence associated"
    return probeset_gene_seq_db