# Example #1
0
def _iterTabDelimitedRows(fn):
    """Yield the tab-split fields of each data row in the text file *fn*.

    The first line is always treated as a column header and skipped.
    Trailing line terminators ('\\n' or '\\r\\n') are removed before
    splitting, so a final line without a newline and CRLF-terminated files
    are both handled. Lines that are empty after stripping are skipped.
    The file is closed once iteration finishes or fails.
    """
    with open(fn, 'r') as handle:
        header_skipped = False
        for line in handle:
            if not header_skipped:
                header_skipped = True
                continue
            data = line.rstrip('\r\n')
            if not data:
                continue
            yield data.split('\t')


def importSplicingAnnotationDatabaseAndSequence(species, array_type, biotype):
    """Build a lookup of junction probeset annotations with their sequences.

    Reads up to three tab-delimited files (each with one header line) from
    'AltDatabase/<species>/<array_type>/':

    * '<array_type>-Ensembl_relationships.txt' (only when array_type is
      'AltMouse'): array gene -> list of Ensembl gene IDs.
    * '<array_type>_critical-junction-seq.txt': probeset -> (probeset
      sequence, junction sequence with every '|' removed).
    * '<array_type>_junction-comparisons.txt': array gene, the two
      probesets being compared, and their critical exons.

    Parameters:
        species: species code used as a directory name in the file paths.
        array_type: platform name used in the paths. 'AltMouse' triggers
            loading of the Ensembl relationship file; for 'junction' and
            'RNASeq' every array gene is mapped to itself.
        biotype: when equal to 'gene', the result is keyed by Ensembl gene
            ID and each value is a list of JunctionDataSimple objects.
            For any other value the result is keyed by probeset ID and each
            value is a single JunctionDataSimple object that is given the
            full list of gene IDs.

    Returns:
        dict as described under ``biotype``. Probesets without an entry in
        the sequence file, and array genes without a known gene mapping, are
        left out.

    Raises:
        ValueError: if a non-empty data row does not have the expected
            number of columns (2, 3 and 4 for the three files respectively).
        IOError: if one of the files cannot be opened.
    """
    db_dir = 'AltDatabase/' + species + '/' + array_type + '/'

    array_ens_db = {}
    if array_type == 'AltMouse':
        filename = db_dir + array_type + '-Ensembl_relationships.txt'
        update.verifyFile(filename, array_type)  ### Will force download if missing
        for array_gene, ens_gene in _iterTabDelimitedRows(filepath(filename)):
            array_ens_db.setdefault(array_gene, []).append(ens_gene)

    filename = db_dir + array_type + '_critical-junction-seq.txt'
    probeset_seq_db = {}
    for probeset, probeset_seq, junction_seq in _iterTabDelimitedRows(filepath(filename)):
        ### The '|' characters are dropped from the stored junction sequence
        ### (presumably they mark the junction position -- confirm against the file format)
        probeset_seq_db[probeset] = probeset_seq, junction_seq.replace('|', '')

    ###Import reciprocal junctions, so we can compare these directly instead of hits to nulls and combine with sequence data
    ###This short-cuts what we did in two functions in ExonSeqModule with exon level data
    filename = db_dir + array_type + '_junction-comparisons.txt'
    probeset_gene_seq_db = {}
    for array_gene, probeset1, probeset2, critical_exons in _iterTabDelimitedRows(filepath(filename)):
        probesets = [probeset1, probeset2]
        if array_type == 'junction' or array_type == 'RNASeq':
            ### For these platforms the gene ID in the file is used directly as the gene ID
            array_ens_db[array_gene] = [array_gene]
        if array_gene not in array_ens_db:
            continue
        ensembl_gene_ids = array_ens_db[array_gene]
        for probeset_id in probesets:
            if probeset_id not in probeset_seq_db:
                continue
            probeset_seq, junction_seq = probeset_seq_db[probeset_id]
            if biotype == 'gene':
                ### One annotation object per associated gene, grouped by gene
                for ensembl_gene_id in ensembl_gene_ids:
                    probe_data = ExonSeqModule.JunctionDataSimple(
                        probeset_id, ensembl_gene_id, array_gene,
                        probesets, critical_exons)
                    probe_data.SetExonSeq(probeset_seq)
                    probe_data.SetJunctionSeq(junction_seq)
                    probeset_gene_seq_db.setdefault(ensembl_gene_id, []).append(probe_data)
            else:  ### Used for probeset annotations downstream of sequence alignment in LinkEST, analogous to exon_db for exon analyses
                probe_data = ExonSeqModule.JunctionDataSimple(
                    probeset_id, ensembl_gene_ids, array_gene,
                    probesets, critical_exons)
                probe_data.SetExonSeq(probeset_seq)
                probe_data.SetJunctionSeq(junction_seq)
                probeset_gene_seq_db[probeset_id] = probe_data
    ### Single-argument form prints the same text under Python 2 and Python 3
    print("%d genes with probeset sequence associated" % len(probeset_gene_seq_db))
    return probeset_gene_seq_db